diff --git a/cmake/configure.cmake b/cmake/configure.cmake index ce1857582bd3e8ab3077158384beaae36a83a4b2..e9852f00b1835adec31373f58ac538f9685251e0 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -62,8 +62,26 @@ if(NOT CMAKE_CROSSCOMPILING) endif() if(WIN32) - # windows stupid compile option for all targets. + # windows header option for all targets. add_definitions(-D_XKEYCHECK_H) + # Use symbols instead of absolute paths to reduce the cmake link command length. + SET(CMAKE_C_USE_RESPONSE_FILE_FOR_LIBRARIES 1) + SET(CMAKE_CXX_USE_RESPONSE_FILE_FOR_LIBRARIES 1) + SET(CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1) + SET(CMAKE_CXX_USE_RESPONSE_FILE_FOR_OBJECTS 1) + SET(CMAKE_C_USE_RESPONSE_FILE_FOR_INCLUDES 1) + SET(CMAKE_CXX_USE_RESPONSE_FILE_FOR_INCLUDES 1) + SET(CMAKE_C_RESPONSE_FILE_LINK_FLAG "@") + SET(CMAKE_CXX_RESPONSE_FILE_LINK_FLAG "@") + + # Specify the program to use when building static libraries + SET(CMAKE_C_CREATE_STATIC_LIBRARY " lib ") + SET(CMAKE_CXX_CREATE_STATIC_LIBRARY " lib ") + + # set definition for the dll export + if (NOT MSVC) + message(FATAL_ERROR "The Windows build only supports MSVC, which is required by NVIDIA's nvcc compiler.") + endif(NOT MSVC) endif(WIN32) if(NOT WITH_GOLANG) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index a67512578147fc7223714dbc4cd124b831fb4775..5bf82b4ddf10a646ca540ac4ee2cfd3d3bc6cf58 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -110,6 +110,20 @@ function(find_fluid_modules TARGET_NAME) endif() endfunction(find_fluid_modules) +# Find all third_party modules; they are used by the paddle static library +# to reduce the dependencies when building the inference libs. +set_property(GLOBAL PROPERTY FLUID_THIRD_PARTY) +function(find_fluid_thirdparties TARGET_NAME) + get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE) + string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path ${__target_path}) + string(FIND "${__target_path}" "third_party" pos) + if(pos GREATER 1) + get_property(fluid_ GLOBAL PROPERTY FLUID_THIRD_PARTY) + set(fluid_third_partys ${fluid_third_partys} ${TARGET_NAME}) + set_property(GLOBAL PROPERTY FLUID_THIRD_PARTY "${fluid_third_partys}") + endif() +endfunction(find_fluid_thirdparties) + function(merge_static_libs TARGET_NAME) set(libs ${ARGN}) list(REMOVE_DUPLICATES libs) @@ -204,18 +218,13 @@ function(merge_static_libs TARGET_NAME) foreach(lib ${libs}) # Get the file names of the libraries to be merged - #if(NOT $ MATCHES "lib.*\\.lib") - # message("library" ${lib}) - # set(libfiles ${libfiles} lib$) - #else() set(libfiles ${libfiles} $) - #endif() endforeach() - - # windows cmd return error in clean env. - # COMMAND del "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/${TARGET_NAME}.lib" + # msvc will put the library in the "/Release/xxxlib" directory by default + # COMMAND cmake -E remove "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/${TARGET_NAME}.lib" add_custom_command(TARGET ${TARGET_NAME} POST_BUILD - COMMAND lib /OUT:${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.lib ${libfiles} + COMMAND cmake -E make_directory "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}" + COMMAND lib /OUT:${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/lib${TARGET_NAME}.lib ${libfiles} ) endif(WIN32) endfunction(merge_static_libs) diff --git a/doc/README.md b/doc/README.md index 77aa2a5322057d582435e83f38476833d1f73c48..998a39f10699af6d1a391f177a5cf03c9ae170fd 100644 --- a/doc/README.md +++ b/doc/README.md @@ -2,6 +2,6 @@ Thanks for reading PaddlePaddle documentation.
-Since **September 17th, 2018**, the **0.15.0 and develop** documentation source has been moved to [Fluiddoc Repo](https://github.com/PaddlePaddle/Paddle) and updated in Fluiddoc Repo. +Since **September 17th, 2018**, the **0.15.0 and develop** documentation source has been moved to [FluidDoc Repo](https://github.com/PaddlePaddle/FluidDoc) and updated there. -Please turn to Fluiddoc Repo for the latest documentation. +Please turn to FluidDoc Repo for the latest documentation. diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 58835b580408a4b6de5b23d1612c0d36eb81f7f3..d3583cf894991624f537a4073f14aacc470aadd0 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -22,9 +22,6 @@ paddle.fluid.Operator.rename_input ArgSpec(args=['self', 'old_name', 'new_name'] paddle.fluid.Operator.rename_output ArgSpec(args=['self', 'old_name', 'new_name'], varargs=None, keywords=None, defaults=None) paddle.fluid.Operator.set_attr ArgSpec(args=['self', 'name', 'val'], varargs=None, keywords=None, defaults=None) paddle.fluid.Operator.to_string ArgSpec(args=['self', 'throw_on_error'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Parameter.__init__ ArgSpec(args=['self', 'block', 'shape', 'dtype'], varargs=None, keywords='kwargs', defaults=None) -paddle.fluid.Parameter.astype ArgSpec(args=['self', 'dtype'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Parameter.to_string ArgSpec(args=['self', 'throw_on_error', 'with_details'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.default_startup_program ArgSpec(args=[], varargs=None, keywords=None, defaults=None) paddle.fluid.default_main_program ArgSpec(args=[], varargs=None, keywords=None, defaults=None) paddle.fluid.program_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) @@ -35,29 +32,16 @@ paddle.fluid.Executor.close ArgSpec(args=['self'], varargs=None, keywords=None, paddle.fluid.Executor.run ArgSpec(args=['self', 'program', 'feed', 'fetch_list', 'feed_var_name', 'fetch_var_name', 'scope', 'return_numpy', 'use_program_cache'], varargs=None, keywords=None, defaults=(None, None, None, 'feed', 'fetch', None, True, False)) paddle.fluid.global_scope ArgSpec(args=[], varargs=None, keywords=None, defaults=None) paddle.fluid.scope_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) -paddle.fluid.Trainer.__init__ ArgSpec(args=['self', 'train_func', 'optimizer_func', 'param_path', 'place', 'parallel', 'checkpoint_config'], varargs=None, keywords=None, defaults=(None, None, False, None)) -paddle.fluid.Trainer.save_inference_model ArgSpec(args=['self', 'param_path', 'feeded_var_names', 'target_var_indexes'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Trainer.save_params ArgSpec(args=['self', 'param_path'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Trainer.stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Trainer.test ArgSpec(args=['self', 'reader', 'feed_order'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Trainer.train ArgSpec(args=['self', 'num_epochs', 'event_handler', 'reader', 'feed_order'], varargs=None, keywords=None, defaults=(None, None)) -paddle.fluid.BeginEpochEvent.__init__ ArgSpec(args=['self', 'epoch_id'], varargs=None, keywords=None, defaults=None) -paddle.fluid.EndEpochEvent.__init__ ArgSpec(args=['self', 'epoch_id'], varargs=None, keywords=None, defaults=None) -paddle.fluid.BeginStepEvent.__init__ ArgSpec(args=['self', 'epoch_id', 'step_id'], varargs=None, keywords=None, 
defaults=None) -paddle.fluid.EndStepEvent.__init__ ArgSpec(args=['self', 'epoch_id', 'step_id', 'metrics'], varargs=None, keywords=None, defaults=None) -paddle.fluid.CheckpointConfig.__init__ ArgSpec(args=['self', 'checkpoint_dir', 'max_num_checkpoints', 'epoch_interval', 'step_interval'], varargs=None, keywords=None, defaults=(None, 3, 1, 10)) -paddle.fluid.Inferencer.__init__ ArgSpec(args=['self', 'infer_func', 'param_path', 'place', 'parallel'], varargs=None, keywords=None, defaults=(None, False)) -paddle.fluid.Inferencer.infer ArgSpec(args=['self', 'inputs', 'return_numpy'], varargs=None, keywords=None, defaults=(True,)) paddle.fluid.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) paddle.fluid.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) paddle.fluid.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.DistributeTranspiler.get_trainer_program ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,)) -paddle.fluid.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None)) +paddle.fluid.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program', 'current_endpoint'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None, '127.0.0.1:6174')) paddle.fluid.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0)) paddle.fluid.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.DistributeTranspilerConfig.__init__ -paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id', 'scope'], varargs=None, keywords='kwargs', defaults=(None, None, None, None, None, 1, 0, None)) +paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id', 'scope'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 1, 0, None)) paddle.fluid.ParallelExecutor.run ArgSpec(args=['self', 'fetch_list', 'feed', 'feed_dict', 'return_numpy'], varargs=None, keywords=None, defaults=(None, None, True)) paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ExecutionStrategy) -> None paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.GradientScaleStrategy, arg0: int) -> None @@ -73,7 +57,6 @@ paddle.fluid.io.load_params ArgSpec(args=['executor', 'dirname', 'main_program', paddle.fluid.io.load_persistables ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.io.save_inference_model ArgSpec(args=['dirname', 'feeded_var_names', 'target_vars', 'executor', 'main_program', 'model_filename', 'params_filename', 
'export_for_deployment'], varargs=None, keywords=None, defaults=(None, None, None, True)) paddle.fluid.io.load_inference_model ArgSpec(args=['dirname', 'executor', 'model_filename', 'params_filename', 'pserver_endpoints'], varargs=None, keywords=None, defaults=(None, None, None)) -paddle.fluid.io.get_inference_program ArgSpec(args=['target_vars', 'main_program'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.initializer.ConstantInitializer.__init__ ArgSpec(args=['self', 'value', 'force_cpu'], varargs=None, keywords=None, defaults=(0.0, False)) paddle.fluid.initializer.UniformInitializer.__init__ ArgSpec(args=['self', 'low', 'high', 'seed'], varargs=None, keywords=None, defaults=(-1.0, 1.0, 0)) paddle.fluid.initializer.NormalInitializer.__init__ ArgSpec(args=['self', 'loc', 'scale', 'seed'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0)) @@ -161,7 +144,16 @@ paddle.fluid.layers.relu ArgSpec(args=['x', 'name'], varargs=None, keywords=None paddle.fluid.layers.log ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.crop ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.layers.rank_loss ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.elu ArgSpec(args=['x', 'alpha', 'name'], varargs=None, keywords=None, defaults=(1.0, None)) +paddle.fluid.layers.relu6 ArgSpec(args=['x', 'threshold', 'name'], varargs=None, keywords=None, defaults=(6.0, None)) +paddle.fluid.layers.pow ArgSpec(args=['x', 'factor', 'name'], varargs=None, keywords=None, defaults=(1.0, None)) +paddle.fluid.layers.stanh ArgSpec(args=['x', 'scale_a', 'scale_b', 'name'], varargs=None, keywords=None, defaults=(0.6666666666666666, 1.7159, None)) +paddle.fluid.layers.hard_sigmoid ArgSpec(args=['x', 'slope', 'offset', 'name'], varargs=None, keywords=None, defaults=(0.2, 0.5, None)) +paddle.fluid.layers.swish ArgSpec(args=['x', 'beta', 'name'], varargs=None, keywords=None, defaults=(1.0, None)) paddle.fluid.layers.prelu ArgSpec(args=['x', 'mode', 'param_attr', 'name'], varargs=None, keywords=None, defaults=(None, None)) +paddle.fluid.layers.brelu ArgSpec(args=['x', 't_min', 't_max', 'name'], varargs=None, keywords=None, defaults=(0.0, 24.0, None)) +paddle.fluid.layers.leaky_relu ArgSpec(args=['x', 'alpha', 'name'], varargs=None, keywords=None, defaults=(0.02, None)) +paddle.fluid.layers.soft_relu ArgSpec(args=['x', 'threshold', 'name'], varargs=None, keywords=None, defaults=(40.0, None)) paddle.fluid.layers.flatten ArgSpec(args=['x', 'axis', 'name'], varargs=None, keywords=None, defaults=(1, None)) paddle.fluid.layers.sequence_mask ArgSpec(args=['x', 'maxlen', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, 'int64', None)) paddle.fluid.layers.stack ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=(0,)) @@ -170,6 +162,14 @@ paddle.fluid.layers.unstack ArgSpec(args=['x', 'axis', 'num'], varargs=None, key paddle.fluid.layers.sequence_enumerate ArgSpec(args=['input', 'win_size', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0, None)) paddle.fluid.layers.expand ArgSpec(args=['x', 'expand_times', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.sequence_concat ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.scale ArgSpec(args=['x', 'scale', 'bias', 'bias_after_scale', 'out', 'act', 'name'], varargs=None, keywords=None, 
defaults=(1.0, 0.0, True, None, None, None)) +paddle.fluid.layers.elementwise_add ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None)) +paddle.fluid.layers.elementwise_div ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None)) +paddle.fluid.layers.elementwise_sub ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None)) +paddle.fluid.layers.elementwise_mul ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None)) +paddle.fluid.layers.elementwise_max ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None)) +paddle.fluid.layers.elementwise_min ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None)) +paddle.fluid.layers.elementwise_pow ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None)) paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)) paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)) paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None) @@ -234,15 +234,7 @@ paddle.fluid.layers.Print ArgSpec(args=['input', 'first_n', 'message', 'summariz paddle.fluid.layers.is_empty ArgSpec(args=['x', 'cond'], varargs=None, keywords='ignored', defaults=(None,)) paddle.fluid.layers.mean ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.mul ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.scale ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.elementwise_add ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.elementwise_div ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.elementwise_sub ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.elementwise_mul ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.elementwise_max ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.elementwise_min ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.elementwise_pow ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.clip ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.clip_by_norm ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.logical_and ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) @@ -257,32 +249,23 @@ paddle.fluid.layers.sum ArgSpec(args=[], varargs='args', keywords='kwargs', defa paddle.fluid.layers.slice 
ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.shape ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.maxout ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.sigmoid ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.logsigmoid ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.exp ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.tanh ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.tanh_shrink ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.softshrink ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.sqrt ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.abs ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.ceil ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.floor ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.cos ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.sin ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.round ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.reciprocal ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.square ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.softplus ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.softsign ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.brelu ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.leaky_relu ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.soft_relu ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.elu ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.relu6 ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.pow ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.stanh ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.hard_sigmoid ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.swish ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.sigmoid ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.logsigmoid ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.exp ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.tanh ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.tanh_shrink ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.softshrink ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.sqrt ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.abs ArgSpec(args=['x', 'name'], varargs=None, keywords=None, 
defaults=(None,)) +paddle.fluid.layers.ceil ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.floor ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.cos ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.sin ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.round ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.reciprocal ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.square ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.softplus ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.softsign ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.uniform_random ArgSpec(args=['shape', 'dtype', 'min', 'max', 'seed'], varargs=None, keywords=None, defaults=(None, None, None, None)) paddle.fluid.layers.hard_shrink ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.cumsum ArgSpec(args=['x', 'axis', 'exclusive', 'reverse'], varargs=None, keywords=None, defaults=(None, None, None)) @@ -337,7 +320,7 @@ paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=[' paddle.fluid.transpiler.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,)) -paddle.fluid.transpiler.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None)) +paddle.fluid.transpiler.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program', 'current_endpoint'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None, '127.0.0.1:6174')) paddle.fluid.transpiler.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0)) paddle.fluid.transpiler.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.transpiler.HashName.__init__ ArgSpec(args=['self', 'pserver_endpoints'], varargs=None, keywords=None, defaults=None) @@ -351,6 +334,7 @@ paddle.fluid.nets.simple_img_conv_pool ArgSpec(args=['input', 'num_filters', 'fi paddle.fluid.nets.sequence_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'param_attr', 'act', 'pool_type'], varargs=None, keywords=None, defaults=(None, 'sigmoid', 'max')) paddle.fluid.nets.glu ArgSpec(args=['input', 'dim'], varargs=None, keywords=None, defaults=(-1,)) paddle.fluid.nets.scaled_dot_product_attention ArgSpec(args=['queries', 'keys', 'values', 'num_heads', 'dropout_rate'], varargs=None, keywords=None, defaults=(1, 0.0)) +paddle.fluid.nets.img_conv_group ArgSpec(args=['input', 'conv_num_filter', 'pool_size', 'conv_padding', 
'conv_filter_size', 'conv_act', 'param_attr', 'conv_with_batchnorm', 'conv_batchnorm_drop_rate', 'pool_stride', 'pool_type', 'use_cudnn', 'use_mkldnn'], varargs=None, keywords=None, defaults=(1, 3, None, None, False, 0.0, 1, 'max', True, False)) paddle.fluid.optimizer.SGDOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'regularization', 'name'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.optimizer.SGDOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.MomentumOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'momentum', 'use_nesterov', 'regularization', 'name'], varargs=None, keywords=None, defaults=(False, None, None)) @@ -390,7 +374,7 @@ paddle.fluid.CPUPlace.__init__ __init__(self: paddle.fluid.core.CPUPlace) -> Non paddle.fluid.CUDAPlace.__init__ __init__(self: paddle.fluid.core.CUDAPlace, arg0: int) -> None paddle.fluid.CUDAPinnedPlace.__init__ __init__(self: paddle.fluid.core.CUDAPinnedPlace) -> None paddle.fluid.ParamAttr.__init__ ArgSpec(args=['self', 'name', 'initializer', 'learning_rate', 'regularizer', 'trainable', 'gradient_clip', 'do_model_average'], varargs=None, keywords=None, defaults=(None, None, 1.0, None, True, None, False)) -paddle.fluid.WeightNormParamAttr.__init__ ArgSpec(args=['self', 'dim'], varargs=None, keywords='kwargs', defaults=(None,)) +paddle.fluid.WeightNormParamAttr.__init__ ArgSpec(args=['self', 'dim', 'name', 'initializer', 'learning_rate', 'regularizer', 'trainable', 'gradient_clip', 'do_model_average'], varargs=None, keywords=None, defaults=(None, None, None, 1.0, None, True, None, False)) paddle.fluid.DataFeeder.__init__ ArgSpec(args=['self', 'feed_list', 'place', 'program'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.DataFeeder.decorate_reader ArgSpec(args=['self', 'reader', 'multi_devices', 'num_places', 'drop_last'], varargs=None, keywords=None, defaults=(None, True)) paddle.fluid.DataFeeder.feed ArgSpec(args=['self', 'iterable'], varargs=None, keywords=None, defaults=None) diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt index ee1f655e25dedb8846bb26275072fd9f6c1f123e..519a00fb073b08f6c88de8186de187476b548fd3 100644 --- a/paddle/fluid/CMakeLists.txt +++ b/paddle/fluid/CMakeLists.txt @@ -13,3 +13,5 @@ if(WITH_INFERENCE) # NOTE: please add subdirectory inference at last. add_subdirectory(inference) endif() + +add_subdirectory(train) diff --git a/paddle/fluid/framework/details/cow_ptr.h b/paddle/fluid/framework/details/cow_ptr.h index 4fb015b0ffe27e6fa91d4eaf373fca4feca66361..21f75957be5f33f3dfc09c41fa9a1e1ca590f99e 100644 --- a/paddle/fluid/framework/details/cow_ptr.h +++ b/paddle/fluid/framework/details/cow_ptr.h @@ -20,41 +20,79 @@ namespace paddle { namespace framework { namespace details { -template -class COWPtr { +// Change it to thread safe flags if needed. 
+class ThreadUnsafeOwnershipFlags { public: - typedef std::shared_ptr RefPtr; + explicit ThreadUnsafeOwnershipFlags(bool flag) : flag_(flag) {} - private: - RefPtr m_sp; + ThreadUnsafeOwnershipFlags(const ThreadUnsafeOwnershipFlags& other) = delete; + ThreadUnsafeOwnershipFlags& operator=( + const ThreadUnsafeOwnershipFlags& other) = delete; + ThreadUnsafeOwnershipFlags(ThreadUnsafeOwnershipFlags&& other) = default; - void detach() { - T* tmp = m_sp.get(); - if (!(tmp == nullptr || m_sp.unique())) { - m_sp = RefPtr(new T(*tmp)); + void SetOwnership(bool flag) { flag_ = flag; } + + // Invoke the callback if it is not owned. + template + void AcquireOwnershipOnce(Callback acquire) { + if (!flag_) { + acquire(); + flag_ = true; } } - public: - COWPtr() : m_sp(nullptr) {} - explicit COWPtr(T* t) : m_sp(t) {} - explicit COWPtr(const RefPtr& refptr) : m_sp(refptr) {} + private: + bool flag_; +}; - const T& Data() const { return operator*(); } +// Copy-On-Write pointer. +// It will hold a T* pointer, and only copy once when `MutableData` is invoked. +// +// The template parameter OwnershipFlags should have: +// * a constructor takes a bool. True if own. +// * SetOwnership(bool flag). +// * AcquireOwnershipOnce(Callback). It will invoke the callback if it is not +// owned. +// +// https://en.wikipedia.org/wiki/Copy-on-write +template +class COWPtr { + public: + // Ctor from raw pointer. + explicit COWPtr(T* ptr) : payload_(ptr), ownership_{true} {} - T* MutableData() { return operator->(); } + // Move methods. Steal ownership from origin + COWPtr(COWPtr&& other) + : payload_(other.payload_), ownership_{std::move(other.ownership_)} {} + COWPtr& operator=(COWPtr&& origin) = default; - const T& operator*() const { return *m_sp; } - T& operator*() { - detach(); - return *m_sp; + // Copy methods. Not own payload + COWPtr(const COWPtr& other) : payload_(other.payload_), ownership_{false} {} + COWPtr& operator=(const COWPtr& other) { + payload_ = other.payload_; + ownership_.SetOwnership(false); + return *this; } - const T* operator->() const { return m_sp.operator->(); } - T* operator->() { - detach(); - return m_sp.operator->(); + + // Access read only data. + const T& Data() const { return *payload_; } + + // Access mutable data. If the data is not owned, the data will be copied + // before. + T* MutableData() { + ownership_.AcquireOwnershipOnce( + [this] { payload_.reset(new T(*payload_)); }); + return payload_.get(); } + + private: + // Actual data pointer. + std::shared_ptr payload_; + + // Ownership flag. 
+ OwnershipFlags ownership_; }; + } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/cow_ptr_test.cc b/paddle/fluid/framework/details/cow_ptr_test.cc index 5b055d7cb4d127dc20f2cf70869134f24a93d429..d2142af277c0b356d83941b3baab1947cce31dac 100644 --- a/paddle/fluid/framework/details/cow_ptr_test.cc +++ b/paddle/fluid/framework/details/cow_ptr_test.cc @@ -30,14 +30,6 @@ TEST(COWPtr, all) { ASSERT_EQ(ptr2.Data(), 10); } -TEST(COWPtr, change_old) { - COWPtr ptr(new int{0}); - COWPtr ptr2 = ptr; - *ptr.MutableData() = 10; - ASSERT_EQ(ptr2.Data(), 0); - ASSERT_EQ(ptr.Data(), 10); -} - } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index 8f319116ab80b75c624f35b0e1315e7362e88d9a..134fcee826715672a6e021e9bf694bb771ebb830 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -210,43 +210,6 @@ std::vector MultiDevSSAGraphBuilder::FindDistTrainRecvVars( return recv_vars; } -bool MultiDevSSAGraphBuilder::IsDistTrainOp( - ir::Node *node, const std::vector &send_vars, - const std::vector &recv_vars) const { - if (send_vars.size() == 0 || recv_vars.size() == 0) { - return false; - } - - /** - * Check any of opvars contains `.block` and in sendvars - */ - auto checker = [](const std::vector &opvars, - const std::vector &rpc_vars) -> bool { - for (auto &var : opvars) { - // a variable name with the suffix `.block` means it's a splited - // variable by (DistributeTranspiler) - // [python/paddle/fluid/transpiler/distribute_transpiler.py] - if (var.find(".block") != std::string::npos && - std::find(rpc_vars.begin(), rpc_vars.end(), var) != rpc_vars.end()) { - return true; - } - } - return false; - }; - - std::vector input_var_names; - std::vector output_var_names; - for (ir::Node *input : node->inputs) { - input_var_names.push_back(input->Name()); - } - for (ir::Node *output : node->outputs) { - output_var_names.push_back(output->Name()); - } - - return checker(output_var_names, send_vars) || - checker(input_var_names, recv_vars); -} - size_t MultiDevSSAGraphBuilder::GetAppropriateDeviceID( const std::vector &var_names) const { int64_t numel_sum = 0; @@ -370,7 +333,9 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( } } is_dist_train = true; - } else if (IsDistTrainOp(node, send_vars, recv_vars)) { + } else if (boost::get(node->Op()->GetAttr( + OpProtoAndCheckerMaker::OpRoleAttrName())) == + static_cast(OpRole::kDist)) { int op_dev_id = CreateDistTrainOp(&result, node); if (node->Op()->Type() == "concat") { auto origin_param_name = node->Op()->OutputArgumentNames()[0]; @@ -736,6 +701,7 @@ int MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result, .emplace(varname, op_dev_id); } } else { + LOG(ERROR) << "got unexpected dist op: " << node->Op()->Type(); PADDLE_THROW( "the distribute training related op should be in [split_byref, " "concat]."); diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h index 47aaa80f4d66a48b729d0638badcab885a50585c..cdf9f13cde608b546d17a1e53e0f6acea9e12566 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.h +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h @@ -51,12 +51,6 @@ class MultiDevSSAGraphBuilder : public ir::Pass { int CreateRPCOp(ir::Graph *result, 
ir::Node *node) const; int CreateDistTrainOp(ir::Graph *result, ir::Node *node) const; - /** - * Is this operator as the end-point operator before/after send operator. - */ - bool IsDistTrainOp(ir::Node *node, const std::vector &send_vars, - const std::vector &recv_vars) const; - std::vector FindDistTrainSendVars( const std::vector &nodes) const; diff --git a/paddle/fluid/framework/details/reference_count_op_handle.h b/paddle/fluid/framework/details/reference_count_op_handle.h index 71db8d952f4c205b875ad254dc19c0c1f74e61b3..fc479a4c4a1e7d5c824d3c202e0cccf743dd52c9 100644 --- a/paddle/fluid/framework/details/reference_count_op_handle.h +++ b/paddle/fluid/framework/details/reference_count_op_handle.h @@ -22,6 +22,7 @@ #include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/garbage_collector.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/tensor.h" namespace paddle { @@ -46,17 +47,15 @@ class ReferenceCountOpHandle : public OpHandleBase { const std::vector &var_names, GarbageCollector *gc, AtomicReferenceCountMap *ref_cnts) - : OpHandleBase(node), - scope_(scope), - var_names_(var_names), - gc_(gc), - ref_cnts_(ref_cnts) { + : OpHandleBase(node), scope_(scope), gc_(gc), ref_cnts_(ref_cnts) { dev_ctx_ = static_cast( platform::DeviceContextPool::Instance().Get(place)); if (IsStreamGarabageCollector()) { PADDLE_ENFORCE(cudaSetDevice(place.device)); PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); } + + for (auto &name : var_names) AddVar(name); } ~ReferenceCountOpHandle() { @@ -69,19 +68,35 @@ class ReferenceCountOpHandle : public OpHandleBase { std::string Name() const override { return "reference_count"; } + void AddVar(const std::string &name) { + auto it = var_names_.find(name); + if (it != var_names_.end()) + ++(it->second); + else + var_names_[name] = 1; + } + protected: void RunImpl() override { auto *exec_scope = scope_->FindVar(kLocalExecScopeName)->Get(); - std::vector tensors; - for (auto &name : var_names_) { + std::vector tensors; + for (auto &pair : var_names_) { + auto &name = pair.first; auto it = ref_cnts_->find(name); if (it == ref_cnts_->end()) continue; auto *var = exec_scope->FindVar(name); - if (var == nullptr || !var->IsType()) continue; - - if (it->second.fetch_sub(1) <= 1) { - tensors.emplace_back(var->GetMutable()); + if (var == nullptr) continue; + + if (var->IsType()) { + if (it->second.fetch_sub(pair.second) <= pair.second) { + tensors.emplace_back(var->GetMutable()); + } + } else if (var->IsType()) { + if (it->second.fetch_sub(pair.second) <= pair.second) { + tensors.emplace_back( + var->GetMutable()->mutable_value()); + } } } @@ -91,7 +106,7 @@ class ReferenceCountOpHandle : public OpHandleBase { } private: - void ClearTensors(const std::vector &tensors) { + void ClearTensors(const std::vector &tensors) { auto *gc = dynamic_cast *>(gc_); if (gc != nullptr) { auto compute_stream = dev_ctx_->stream(); @@ -112,7 +127,7 @@ class ReferenceCountOpHandle : public OpHandleBase { const Scope *scope_; platform::CUDADeviceContext *dev_ctx_; - std::vector var_names_; + std::unordered_map var_names_; GarbageCollector *gc_; // not own AtomicReferenceCountMap *ref_cnts_; // not own cudaEvent_t event_; diff --git a/paddle/fluid/framework/details/reference_count_pass.cc b/paddle/fluid/framework/details/reference_count_pass.cc index 344754d5a1e119c04cae08ad50126924b5824315..b1ce551ce73de33bcede187c72feebad6e2fa1a5 100644 --- 
a/paddle/fluid/framework/details/reference_count_pass.cc +++ b/paddle/fluid/framework/details/reference_count_pass.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include #include #include @@ -23,6 +24,25 @@ namespace paddle { namespace framework { namespace details { +static ComputationOpHandle *FindNextComputationOpHandle(VarHandle *var_in) { + std::queue queue; + queue.push(var_in); + do { + auto *var = queue.front(); + queue.pop(); + for (auto *op : var->PendingOps()) { + auto *compute_op = dynamic_cast(op); + if (compute_op != nullptr && compute_op->GetPlace() == var_in->place_) { + return compute_op; + } + for (auto *out_var : op->Outputs()) { + queue.push(out_var); + } + } + } while (!queue.empty()); + return nullptr; +} + std::unique_ptr ReferenceCountPass::ApplyImpl( std::unique_ptr graph) const { auto &ref_cnts = Get(kGlobalReferenceCount); @@ -34,6 +54,9 @@ std::unique_ptr ReferenceCountPass::ApplyImpl( // Step 2: Find all variables in non-computation ops which refers to variables // in computation ops std::unordered_set names; + std::unordered_map> + compute_ref_cnt_map; + auto get_ref_cnts_from_compute_op = [&]( const std::unique_ptr &op, const std::vector &vars) { @@ -54,15 +77,18 @@ std::unique_ptr ReferenceCountPass::ApplyImpl( VarDesc *var_desc = var_handle->Node()->Var(); auto var_name = var_handle->Node()->Name(); - // This is wierd but there is really some variables without var_desc + // This is weird but there is really some variables without var_desc // in computation_op if (var_desc == nullptr) { if (compute_op->Node()->Op()->Block()->FindVar(var_name) == nullptr) continue; } else { - if (var_desc->Persistable() || - var_desc->Proto()->type().type() != proto::VarType::LOD_TENSOR) + if (var_desc->Persistable()) continue; + auto var_type = var_desc->Proto()->type().type(); + if (var_type != proto::VarType::LOD_TENSOR && + var_type != proto::VarType::SELECTED_ROWS) { continue; + } } // compute op only runs in one device @@ -93,12 +119,33 @@ std::unique_ptr ReferenceCountPass::ApplyImpl( if (ref_cnts.count(place.device) && ref_cnts[place.device]->count(var_name)) { ++(*ref_cnts[place.device])[var_name]; + + auto *next_compute_op = FindNextComputationOpHandle(var_handle); + if (next_compute_op != nullptr) { + if (compute_ref_cnt_map.count(next_compute_op)) { + compute_ref_cnt_map[next_compute_op]->AddVar(var_name); + VLOG(5) << "Add reference count of " << var_name << " to Operator " + << next_compute_op->Name(); + } else { + // Create new reference_count_op_handle + ir::Node *ref_cnt_node = graph->CreateEmptyNode( + "reference_count", ir::Node::Type::kOperation); + auto *ref_cnt_handle = new ReferenceCountOpHandle( + ref_cnt_node, next_compute_op->GetScope(), place, {var_name}, + gcs[place.device].get(), cur_ref_cnts[place.device].get()); + if (next_compute_op->Outputs().empty()) { + auto *dep_var = new DummyVarHandle(graph->CreateControlDepVar()); + next_compute_op->AddOutput(dep_var); + graph->Get(kGraphDepVars).emplace(dep_var); + } + ref_cnt_handle->AddInput(next_compute_op->Outputs().front()); + compute_ref_cnt_map[next_compute_op].reset(ref_cnt_handle); + } + } } } }; - std::unordered_map - compute_ref_cnt_map; auto &all_ops = graph->Get(kGraphOps); for (auto &op : all_ops) { auto in_var_names = get_ref_cnts_from_compute_op(op, op->Inputs()); @@ -113,11 +160,13 @@ std::unique_ptr ReferenceCountPass::ApplyImpl( auto *ref_cnt_handle = new ReferenceCountOpHandle( ref_cnt_node, 
compute_op->GetScope(), place, in_var_names, gcs[place.device].get(), cur_ref_cnts[place.device].get()); - auto *dep_var = new DummyVarHandle(graph->CreateControlDepVar()); - compute_op->AddOutput(dep_var); - ref_cnt_handle->AddInput(dep_var); - graph->Get(kGraphDepVars).emplace(dep_var); - compute_ref_cnt_map[compute_op] = ref_cnt_handle; + if (compute_op->Outputs().empty()) { + auto *dep_var = new DummyVarHandle(graph->CreateControlDepVar()); + compute_op->AddOutput(dep_var); + graph->Get(kGraphDepVars).emplace(dep_var); + } + ref_cnt_handle->AddInput(compute_op->Outputs().front()); + compute_ref_cnt_map[compute_op].reset(ref_cnt_handle); } for (auto &op : all_ops) { @@ -131,7 +180,11 @@ std::unique_ptr ReferenceCountPass::ApplyImpl( new_all_ops.emplace_back(std::move(op)); auto it = compute_ref_cnt_map.find(new_all_ops.back().get()); if (it != compute_ref_cnt_map.end()) { - new_all_ops.emplace_back(it->second); + // Add LeafNode to ReferenceCountOpHandle + auto *dummy_leaf = new DummyVarHandle(graph->CreateControlDepVar()); + graph->Get(kGraphDepVars).emplace(dummy_leaf); + it->second->AddOutput(dummy_leaf); + new_all_ops.emplace_back(std::move(it->second)); } } diff --git a/paddle/fluid/framework/eigen.h b/paddle/fluid/framework/eigen.h index 4ea1df655df005ba7585fb67fb0a3c3411a76418..2b265a773fe967f5b2ab38ce795b0f599d859c2a 100644 --- a/paddle/fluid/framework/eigen.h +++ b/paddle/fluid/framework/eigen.h @@ -13,6 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +// logging.h and windows.h conflict +#define GLOG_NO_ABBREVIATED_SEVERITIES +// solve static linking error in windows +// https://github.com/google/glog/issues/301 +#define GOOGLE_GLOG_DLL_DECL #include "paddle/fluid/framework/tensor.h" #include "unsupported/Eigen/CXX11/Tensor" @@ -46,11 +51,13 @@ struct EigenTensor { using ConstType = Eigen::TensorMap>; - static Type From(Tensor& tensor, DDim dims) { + static Type From(Tensor& tensor, DDim dims) { // NOLINT return Type(tensor.data(), EigenDim::From(dims)); } - static Type From(Tensor& tensor) { return From(tensor, tensor.dims_); } + static Type From(Tensor& tensor) { // NOLINT + return From(tensor, tensor.dims_); + } // NOLINT static ConstType From(const Tensor& tensor, DDim dims) { return ConstType(tensor.data(), EigenDim::From(dims)); @@ -64,7 +71,8 @@ struct EigenTensor { template struct EigenMatrix : public EigenTensor { - static typename EigenMatrix::Type Reshape(Tensor& tensor, int num_col_dims) { + static typename EigenMatrix::Type Reshape(Tensor& tensor, // NOLINT + int num_col_dims) { int rank = tensor.dims_.size(); PADDLE_ENFORCE(num_col_dims > 0 && num_col_dims < rank, "`num_col_dims` must be between (0, rank_of_tensor)."); @@ -86,11 +94,12 @@ template struct EigenVector : public EigenTensor { // Flatten reshapes a Tensor into an EigenVector. 
- static typename EigenVector::Type Flatten(Tensor& tensor) { + static typename EigenVector::Type Flatten(Tensor& tensor) { // NOLINT return EigenVector::From(tensor, {product(tensor.dims_)}); } - static typename EigenVector::ConstType Flatten(const Tensor& tensor) { + static typename EigenVector::ConstType Flatten( + const Tensor& tensor) { // NOLINT return EigenVector::From(tensor, {product(tensor.dims_)}); } }; @@ -104,7 +113,7 @@ struct EigenScalar { using ConstType = Eigen::TensorMap< Eigen::TensorFixedSize, MajorType, IndexType>>; - static Type From(Tensor& tensor) { return Type(tensor.data()); } + static Type From(Tensor& tensor) { return Type(tensor.data()); } // NOLINT static ConstType From(const Tensor& tensor) { return ConstType(tensor.data()); diff --git a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc index 09c5ec59d66445bdbd5349447b125be89cb2efdf..d7df6389cfd595324e284e0da10f65213ccee80f 100644 --- a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc @@ -26,8 +26,6 @@ std::unique_ptr ConvReLUFusePass::ApplyImpl( PADDLE_ENFORCE(graph.get()); FusePassBase::Init("conv_relu_mkldnn_fuse", graph.get()); - std::unordered_set nodes2delete; - GraphPatternDetector gpd; auto* conv_input = gpd.mutable_pattern() ->NewNode("conv_relu_mkldnn_fuse/conv_input") @@ -42,36 +40,20 @@ std::unique_ptr ConvReLUFusePass::ApplyImpl( Graph* g) { VLOG(4) << "handle ConvReLU fuse"; GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight, - conv_relu_pattern); // Filter - GET_IR_NODE_FROM_SUBGRAPH(conv_bias, conv_bias, conv_relu_pattern); // Bias - GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, conv_relu_pattern); // tmp + conv_relu_pattern); // Filter + GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, conv_relu_pattern); // tmp GET_IR_NODE_FROM_SUBGRAPH(conv, conv, conv_relu_pattern); // CONV op GET_IR_NODE_FROM_SUBGRAPH(relu_out, relu_out, conv_relu_pattern); // Out GET_IR_NODE_FROM_SUBGRAPH(relu, relu, conv_relu_pattern); // ReLU op - // Create an ConvReLU Node. - OpDesc desc; - std::string conv_relu_i_in = subgraph.at(conv_input)->Name(); - std::string conv_relu_w_in = conv_weight->Name(); - std::string conv_relu_b_in = conv_bias->Name(); - std::string conv_relu_out = relu_out->Name(); - desc.SetInput("Input", std::vector({conv_relu_i_in})); - desc.SetInput("Filter", std::vector({conv_relu_w_in})); - desc.SetInput("Bias", std::vector({conv_relu_b_in})); - desc.SetOutput("Output", std::vector({conv_relu_out})); - desc.SetType("conv2d"); - for (auto& attr : conv->Op()->GetAttrMap()) { - desc.SetAttr(attr.first, attr.second); - } - desc.SetAttr("fuse_relu", true); - auto conv_relu_node = g->CreateOpNode(&desc); // OpDesc will be copied. - GraphSafeRemoveNodes(graph.get(), {conv, relu, conv_out}); + // Transform Conv node into ConvReLU node. 
+ OpDesc* desc = conv->Op(); + desc->SetOutput("Output", std::vector({relu_out->Name()})); + desc->SetAttr("fuse_relu", true); + GraphSafeRemoveNodes(graph.get(), {relu, conv_out}); PADDLE_ENFORCE(subgraph.count(conv_input)); - IR_NODE_LINK_TO(subgraph.at(conv_input), conv_relu_node); - IR_NODE_LINK_TO(conv_weight, conv_relu_node); - IR_NODE_LINK_TO(conv_bias, conv_relu_node); - IR_NODE_LINK_TO(conv_relu_node, relu_out); + IR_NODE_LINK_TO(conv, relu_out); found_conv_relu_count++; }; diff --git a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc index 82b5fa1886098ca3b19c147c307d3f2fc3ba03d6..9dd780ec89ab991d6d99cb66fa2a9b683be2b9ca 100644 --- a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc @@ -85,16 +85,13 @@ TEST(ConvReLUFusePass, basic) { for (auto* node : graph->Nodes()) { if (node->IsOp() && node->Op()->Type() == "conv2d") { - if (node->Op()->HasAttr("use_mkldnn")) { - bool use_mkldnn = boost::get(node->Op()->GetAttr("use_mkldnn")); - if (use_mkldnn) { - if (node->Op()->HasAttr("fuse_relu")) { - bool fuse_relu = boost::get(node->Op()->GetAttr("fuse_relu")); - if (fuse_relu) { - ++conv_relu_count; - } - } - } + auto* op = node->Op(); + ASSERT_TRUE(op->HasAttr("use_mkldnn")); + EXPECT_TRUE(boost::get(op->GetAttr("use_mkldnn"))); + ASSERT_TRUE(op->HasAttr("fuse_relu")); + bool fuse_relu = boost::get(op->GetAttr("fuse_relu")); + if (fuse_relu) { + ++conv_relu_count; } } } diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index ef5113819696238d4e06b826bf43064b0f368dea..6d2c51b0e9bed8461f6491b84a36a3bf6663a138 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -638,11 +638,6 @@ PDNode *patterns::ConvReLU::operator()( ->AsInput() ->assert_is_persistable_var() ->assert_is_op_input("conv2d", "Filter"); - // Bias - auto *conv_bias_var = pattern->NewNode(conv_bias_repr()) - ->AsInput() - ->assert_is_persistable_var() - ->assert_is_op_input("conv2d", "Bias"); // intermediate variable, will be removed in the IR after fuse. 
auto *conv_out_var = pattern->NewNode(conv_out_repr()) ->AsIntermediate() @@ -653,8 +648,7 @@ PDNode *patterns::ConvReLU::operator()( ->AsOutput() ->assert_is_op_output("relu"); - conv_op->LinksFrom({conv_input, conv_weight_var, conv_bias_var}) - .LinksTo({conv_out_var}); + conv_op->LinksFrom({conv_input, conv_weight_var}).LinksTo({conv_out_var}); relu_op->LinksFrom({conv_out_var}).LinksTo({relu_out_var}); return relu_out_var; } diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 46950ed877cda7cd83ead4e9aa9a3aaae5d5ecfa..69b486c29d8bd1102a8372f5041051c25ce19359 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -379,7 +379,7 @@ struct PatternBase { // op: conv + relu // named nodes: // conv_input, conv_weight, -// conv_bias, conv_out, conv, +// conv_out, conv, // relu_out, relu struct ConvReLU : public PatternBase { ConvReLU(PDPattern* pattern, const std::string& name_scope) @@ -392,7 +392,6 @@ struct ConvReLU : public PatternBase { PATTERN_DECL_NODE(relu); // declare variable node's name PATTERN_DECL_NODE(conv_weight); - PATTERN_DECL_NODE(conv_bias); PATTERN_DECL_NODE(conv_out); PATTERN_DECL_NODE(relu_out); }; diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h index ba2c41eb8968e30bc8a801f4d8d239da8c522de9..7836ecb1272a07a79a70c9cb040335f9a42e5684 100644 --- a/paddle/fluid/framework/mixed_vector.h +++ b/paddle/fluid/framework/mixed_vector.h @@ -17,12 +17,10 @@ #include #include #include -#include #include -#include "paddle/fluid/framework/details/cow_ptr.h" + #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/memory/memcpy.h" #include "glog/logging.h" @@ -30,401 +28,206 @@ namespace paddle { namespace framework { #if defined(PADDLE_WITH_CUDA) -namespace details { -struct CUDABuffer { - void *data_{nullptr}; - size_t size_{0}; - platform::CUDAPlace place_; - - CUDABuffer() {} - CUDABuffer(platform::Place place, size_t size) - : size_(size), place_(boost::get(place)) { - data_ = memory::Alloc(place_, size); - } - - ~CUDABuffer() { ClearMemory(); } - - CUDABuffer(const CUDABuffer &o) = delete; - CUDABuffer &operator=(const CUDABuffer &o) = delete; - - void Resize(platform::Place place, size_t size) { - ClearMemory(); - place_ = boost::get(place); - data_ = memory::Alloc(place_, size); - size_ = size; - } - - void Swap(CUDABuffer &o) { - std::swap(data_, o.data_); - std::swap(place_, o.place_); - std::swap(size_, o.size_); - } - - private: - void ClearMemory() const { - if (data_) { - memory::Free(place_, data_); - } - } -}; -} // namespace details - // Vector implements the std::vector interface, and can get Data or // MutableData from any place. The data will be synced implicitly inside. 
template class Vector { public: using value_type = T; - using iterator = typename std::vector::iterator; - using const_iterator = typename std::vector::const_iterator; - - private: - // The actual class to implement vector logic - class VectorData { - public: - VectorData() : flag_(kDataInCPU) {} - VectorData(size_t count, const T &value) - : cpu_(count, value), flag_(kDataInCPU) {} - VectorData(std::initializer_list init) : cpu_(init), flag_(kDataInCPU) {} - template - explicit VectorData(const std::vector &dat) - : cpu_(dat), flag_(kDataInCPU) {} - - VectorData(const VectorData &o) { - o.ImmutableCPU(); - cpu_ = o.cpu_; - flag_ = kDataInCPU; - } - - VectorData &operator=(const VectorData &o) { - o.ImmutableCPU(); - cpu_ = o.cpu_; - flag_ = kDataInCPU; - details::CUDABuffer null; - gpu_.Swap(null); - return *this; - } - - T &operator[](size_t i) { - MutableCPU(); - return cpu_[i]; - } - - const T &operator[](size_t i) const { - ImmutableCPU(); - return cpu_[i]; - } - - size_t size() const { return cpu_.size(); } - - iterator begin() { - MutableCPU(); - return cpu_.begin(); - } - - iterator end() { - MutableCPU(); - return cpu_.end(); - } - - T &front() { - MutableCPU(); - return cpu_.front(); - } - - T &back() { - MutableCPU(); - return cpu_.back(); - } - - const_iterator begin() const { - ImmutableCPU(); - return cpu_.begin(); - } - - const_iterator end() const { - ImmutableCPU(); - return cpu_.end(); - } - - const T &back() const { - ImmutableCPU(); - return cpu_.back(); - } - - T *data() { return &(*this)[0]; } - - const T *data() const { return &(*this)[0]; } - - const T &front() const { - ImmutableCPU(); - return cpu_.front(); - } - - // assign this from iterator. - // NOTE: the iterator must support `end-begin` - template - void assign(Iter begin, Iter end) { - MutableCPU(); - cpu_.assign(begin, end); - } - - // push_back. If the previous capacity is not enough, the memory will - // double. - void push_back(T elem) { - MutableCPU(); - cpu_.push_back(elem); - } - - // extend a vector by iterator. - // NOTE: the iterator must support end-begin - template - void Extend(It begin, It end) { - MutableCPU(); - auto out_it = std::back_inserter>(this->cpu_); - std::copy(begin, end, out_it); - } - - // resize the vector - void resize(size_t size) { - MutableCPU(); - cpu_.resize(size); - } - - // get cuda ptr. immutable - const T *CUDAData(platform::Place place) const { - PADDLE_ENFORCE(platform::is_gpu_place(place), - "CUDA Data must on CUDA place"); - ImmutableCUDA(place); - return reinterpret_cast(gpu_.data_); - } - - // get cuda ptr. mutable - T *CUDAMutableData(platform::Place place) { - const T *ptr = CUDAData(place); - flag_ = kDirty | kDataInCUDA; - return const_cast(ptr); - } - - // clear - void clear() { - cpu_.clear(); - flag_ = kDirty | kDataInCPU; - } - - size_t capacity() const { return cpu_.capacity(); } - - // reserve data - void reserve(size_t size) { cpu_.reserve(size); } - - // implicit cast operator. Vector can be cast to std::vector implicitly. - operator std::vector() const { - ImmutableCPU(); - return cpu_; - } - - bool operator==(const VectorData &other) const { - ImmutableCPU(); - other.ImmutableCPU(); - return cpu_ == other.cpu_; - } - - private: - enum DataFlag { - kDataInCPU = 0x01, - kDataInCUDA = 0x02, - // kDirty means the data has been changed in one device. 
- kDirty = 0x10 - }; - - void CopyToCPU() const { - // COPY GPU Data To CPU - void *src = gpu_.data_; - void *dst = cpu_.data(); - memory::Copy(platform::CPUPlace(), dst, gpu_.place_, src, gpu_.size_, - nullptr); - } - - void MutableCPU() { - if (IsInCUDA() && IsDirty()) { - CopyToCPU(); - } - flag_ = kDirty | kDataInCPU; - } - - void ImmutableCUDA(platform::Place place) const { - if (IsDirty()) { - if (IsInCPU()) { - CopyCPUDataToCUDA(place); - UnsetFlag(kDirty); - SetFlag(kDataInCUDA); - } else if (IsInCUDA() && - !(boost::get(place) == gpu_.place_)) { - CopyCUDADataToAnotherPlace(place); - // Still dirty - } else { - // Dirty && DataInCUDA && Device is same - // Do nothing - } - } else { - if (!IsInCUDA()) { - // Even data is not dirty. However, data is not in CUDA. Copy data. - CopyCPUDataToCUDA(place); - SetFlag(kDataInCUDA); - } else if (!(boost::get(place) == gpu_.place_)) { - CopyCUDADataToAnotherPlace(place); - } else { - // Not Dirty && DataInCUDA && Device is same - // Do nothing. - } - } - } - void CopyCUDADataToAnotherPlace(const platform::Place &place) const { - details::CUDABuffer tmp(place, gpu_.size_); - const void *src = gpu_.data_; - void *dst = tmp.data_; - - memory::Copy(tmp.place_, dst, gpu_.place_, src, gpu_.size_, nullptr); - gpu_.Swap(tmp); - } - void CopyCPUDataToCUDA(const platform::Place &place) const { - void *src = cpu_.data(); - gpu_.Resize(place, cpu_.size() * sizeof(T)); - void *dst = gpu_.data_; - auto stream = static_cast( - platform::DeviceContextPool::Instance().Get(place)) - ->stream(); - memory::Copy(gpu_.place_, dst, platform::CPUPlace(), src, gpu_.size_, - stream); - } - - void ImmutableCPU() const { - if (IsDirty() && !IsInCPU()) { // If data has been changed in CUDA, or - // CPU has no data. - CopyToCPU(); - UnsetFlag(kDirty); - } - SetFlag(kDataInCPU); - } - - void UnsetFlag(int flag) const { flag_ &= ~flag; } - void SetFlag(int flag) const { flag_ |= flag; } - - bool IsDirty() const { return flag_ & kDirty; } - - bool IsInCUDA() const { return flag_ & kDataInCUDA; } - bool IsInCPU() const { return flag_ & kDataInCPU; } - - mutable std::vector cpu_; - mutable details::CUDABuffer gpu_; - mutable int flag_; - }; - - public: // Default ctor. Create empty Vector - Vector() : m_(new VectorData()) {} + Vector() { InitEmpty(); } // Fill vector with value. The vector size is `count`. - explicit Vector(size_t count, const T &value = T()) - : m_(new VectorData(count, value)) {} + explicit Vector(size_t count, const T &value = T()) { + InitEmpty(); + if (count != 0) { + resize(count); + T *ptr = begin(); + for (size_t i = 0; i < count; ++i) { + ptr[i] = value; + } + } + } // Ctor with init_list - Vector(std::initializer_list init) : m_(new VectorData(init)) {} + Vector(std::initializer_list init) { + if (init.size() == 0) { + InitEmpty(); + } else { + InitByIter(init.size(), init.begin(), init.end()); + } + } // implicit cast from std::vector. 
template - Vector(const std::vector &dat) : m_(new VectorData(dat)) { // NOLINT + Vector(const std::vector &dat) { // NOLINT + if (dat.size() == 0) { + InitEmpty(); + } else { + InitByIter(dat.size(), dat.begin(), dat.end()); + } } // Copy ctor - Vector(const Vector &other) { m_ = other.m_; } + Vector(const Vector &other) { this->operator=(other); } // Copy operator Vector &operator=(const Vector &other) { - m_ = other.m_; + if (other.size() != 0) { + this->InitByIter(other.size(), other.begin(), other.end()); + } else { + InitEmpty(); + } return *this; } // Move ctor - Vector(Vector &&other) { m_ = std::move(other.m_); } + Vector(Vector &&other) { + this->size_ = other.size_; + this->flag_ = other.flag_; + if (other.cuda_vec_.memory_size()) { + this->cuda_vec_.ShareDataWith(other.cuda_vec_); + } + if (other.cpu_vec_.memory_size()) { + this->cpu_vec_.ShareDataWith(other.cpu_vec_); + } + } // CPU data access method. Mutable. - T &operator[](size_t i) { return (*m_)[i]; } + T &operator[](size_t i) { + MutableCPU(); + return const_cast(cpu_vec_.data())[i]; + } // CPU data access method. Immutable. - const T &operator[](size_t i) const { return (*m_)[i]; } + const T &operator[](size_t i) const { + ImmutableCPU(); + return cpu_vec_.data()[i]; + } // std::vector iterator methods. Based on CPU data access method - size_t size() const { return m_->size(); } + size_t size() const { return size_; } - iterator begin() { return m_->begin(); } + T *begin() { return capacity() == 0 ? &EmptyDummy() : &this->operator[](0); } - iterator end() { return m_->end(); } + T *end() { + return capacity() == 0 ? &EmptyDummy() : &this->operator[](size()); + } - T &front() { return m_->front(); } + T &front() { return *begin(); } - T &back() { return m_->back(); } + T &back() { + auto it = end(); + --it; + return *it; + } - const_iterator begin() const { return m_->begin(); } + const T *begin() const { + return capacity() == 0 ? &EmptyDummy() : &this->operator[](0); + } - const_iterator end() const { return m_->end(); } + const T *end() const { + return capacity() == 0 ? &EmptyDummy() : &this->operator[](size()); + } - const_iterator cbegin() const { return begin(); } + const T *cbegin() const { return begin(); } - const_iterator cend() const { return end(); } + const T *cend() const { return end(); } - const T &back() const { return m_->back(); } + const T &back() const { + auto it = end(); + --it; + return *it; + } - T *data() { return m_->data(); } + T *data() { return begin(); } - const T *data() const { return m_->data(); } + const T *data() const { return begin(); } - const T &front() const { return m_->front(); } + const T &front() const { return *begin(); } // end of std::vector iterator methods // assign this from iterator. // NOTE: the iterator must support `end-begin` template void assign(Iter begin, Iter end) { - m_->assign(begin, end); + InitByIter(end - begin, begin, end); } // push_back. If the previous capacity is not enough, the memory will // double. - void push_back(T elem) { m_->push_back(elem); } + void push_back(T elem) { + if (size_ + 1 > capacity()) { + reserve((size_ + 1) << 1); + } + *end() = elem; + ++size_; + } // extend a vector by iterator. 
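An aside on the copy/move semantics introduced above: with the COW pointer gone, copy construction and copy assignment now materialize an independent CPU copy via InitByIter, while the move constructor only shares the underlying tensors through ShareDataWith. A minimal standalone sketch of that distinction, using simplified stand-in types rather than the framework's real classes:

```cpp
#include <cassert>
#include <memory>
#include <utility>
#include <vector>

// Simplified stand-in, not Paddle's class: copies are deep, moves share the
// underlying storage, mirroring the InitByIter vs. ShareDataWith paths above.
struct MiniVector {
  std::shared_ptr<std::vector<int>> buf = std::make_shared<std::vector<int>>();

  MiniVector() = default;
  // Copy: materialize an independent buffer (the InitByIter path).
  MiniVector(const MiniVector &o)
      : buf(std::make_shared<std::vector<int>>(*o.buf)) {}
  // Move: take over the existing storage (the ShareDataWith path).
  MiniVector(MiniVector &&o) noexcept : buf(std::move(o.buf)) {}

  void push_back(int v) { buf->push_back(v); }
};

int main() {
  MiniVector a;
  a.push_back(1);

  MiniVector copied(a);  // deep copy: later writes to `a` are not visible here
  a.push_back(2);
  assert(copied.buf->size() == 1);

  MiniVector moved(std::move(a));  // shares what `a` owned; `a` is left empty
  assert(moved.buf->size() == 2);
  return 0;
}
```

The practical consequence is that copying a Vector after this change always pays for a full CPU copy, which is the trade-off accepted for dropping the COW machinery.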
// NOTE: the iterator must support end-begin template void Extend(It begin, It end) { - m_->Extend(begin, end); + size_t pre_size = size_; + resize(pre_size + (end - begin)); + T *ptr = this->begin() + pre_size; + for (; begin < end; ++begin, ++ptr) { + *ptr = *begin; + } } // resize the vector void resize(size_t size) { - if (m_.Data().size() != size) { - m_->resize(size); + if (size + 1 <= capacity()) { + size_ = size; + } else { + MutableCPU(); + Tensor cpu_tensor; + platform::Place cpu = platform::CPUPlace(); + T *ptr = cpu_tensor.mutable_data( + framework::make_ddim({static_cast(size)}), cpu); + const T *old_ptr = + cpu_vec_.memory_size() == 0 ? nullptr : cpu_vec_.data(); + if (old_ptr != nullptr) { + std::copy(old_ptr, old_ptr + size_, ptr); + } + size_ = size; + cpu_vec_.ShareDataWith(cpu_tensor); } } // get cuda ptr. immutable const T *CUDAData(platform::Place place) const { - return m_.Data().CUDAData(place); + PADDLE_ENFORCE(platform::is_gpu_place(place), + "CUDA Data must on CUDA place"); + ImmutableCUDA(place); + return cuda_vec_.data(); } // get cuda ptr. mutable T *CUDAMutableData(platform::Place place) { - return m_->CUDAMutableData(place); + const T *ptr = CUDAData(place); + flag_ = kDirty | kDataInCUDA; + return const_cast(ptr); } // clear - void clear() { m_->clear(); } + void clear() { + size_ = 0; + flag_ = kDirty | kDataInCPU; + } - size_t capacity() const { return m_->capacity(); } + size_t capacity() const { + return cpu_vec_.memory_size() / SizeOfType(typeid(T)); + } // reserve data - void reserve(size_t size) { m_->reserve(size); } + void reserve(size_t size) { + size_t pre_size = size_; + resize(size); + resize(pre_size); + } // the unify method to access CPU or CUDA data. immutable. const T *Data(platform::Place place) const { @@ -445,7 +248,12 @@ class Vector { } // implicit cast operator. Vector can be cast to std::vector implicitly. - operator std::vector() const { return *m_; } + operator std::vector() const { + std::vector result; + result.resize(size()); + std::copy(begin(), end(), result.begin()); + return result; + } bool operator==(const Vector &other) const { if (size() != other.size()) return false; @@ -459,11 +267,118 @@ class Vector { return true; } - const void *Handle() const { return &m_.Data(); } - private: - // Vector is an COW object. - details::COWPtr m_; + void InitEmpty() { + size_ = 0; + flag_ = kDataInCPU; + } + + template + void InitByIter(size_t size, Iter begin, Iter end) { + platform::Place cpu = platform::CPUPlace(); + T *ptr = this->cpu_vec_.template mutable_data( + framework::make_ddim({static_cast(size)}), cpu); + for (size_t i = 0; i < size; ++i) { + *ptr++ = *begin++; + } + flag_ = kDataInCPU | kDirty; + size_ = size; + } + + enum DataFlag { + kDataInCPU = 0x01, + kDataInCUDA = 0x02, + // kDirty means the data has been changed in one device. 
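Since these flag values carry most of the synchronization logic, a rough standalone model of the protocol may help: kDataInCPU / kDataInCUDA record which side currently holds a valid copy, and kDirty marks that the latest write has not been propagated yet. The sketch below is illustrative only; device copies are stubbed out with prints and multi-device handling is ignored.

```cpp
#include <cstdio>

// Illustrative model of the flag protocol, not the framework code.
enum DataFlag { kDataInCPU = 0x01, kDataInCUDA = 0x02, kDirty = 0x10 };

struct MirrorState {
  int flag = kDataInCPU;

  bool IsDirty() const { return flag & kDirty; }
  bool IsInCPU() const { return flag & kDataInCPU; }
  bool IsInCUDA() const { return flag & kDataInCUDA; }

  // About to write through the CPU pointer: CPU becomes the only trusted copy.
  void MutableCPU() {
    if (IsInCUDA() && IsDirty()) std::puts("copy GPU -> CPU");
    flag = kDirty | kDataInCPU;
  }

  // Need an up-to-date, read-only device view.
  void ImmutableCUDA() {
    if (IsDirty() && IsInCPU()) {  // host was modified: refresh the device copy
      std::puts("copy CPU -> GPU");
      flag &= ~kDirty;
      flag |= kDataInCUDA;
    } else if (!IsInCUDA()) {      // clean, but never copied to the device yet
      std::puts("copy CPU -> GPU");
      flag |= kDataInCUDA;
    }                              // otherwise the device copy is already usable
  }

  // Need an up-to-date, read-only host view.
  void ImmutableCPU() {
    if (IsDirty() && !IsInCPU()) {  // the device holds the newest data
      std::puts("copy GPU -> CPU");
      flag &= ~kDirty;
    }
    flag |= kDataInCPU;
  }
};

int main() {
  MirrorState s;
  s.MutableCPU();     // host write: the CPU copy is now the dirty master
  s.ImmutableCUDA();  // first device read triggers a CPU -> GPU copy
  s.ImmutableCUDA();  // second read is free: data is clean and already on device
  return 0;
}
```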
+ kDirty = 0x10 + }; + + void CopyToCPU() const { + // COPY GPU Data To CPU + TensorCopy(cuda_vec_, platform::CPUPlace(), &cpu_vec_); + WaitPlace(cuda_vec_.place()); + } + + void MutableCPU() { + if (IsInCUDA() && IsDirty()) { + CopyToCPU(); + } + flag_ = kDirty | kDataInCPU; + } + + void ImmutableCUDA(platform::Place place) const { + if (IsDirty()) { + if (IsInCPU()) { + TensorCopy(cpu_vec_, boost::get(place), + &cuda_vec_); + WaitPlace(place); + UnsetFlag(kDirty); + SetFlag(kDataInCUDA); + } else if (IsInCUDA() && !(place == cuda_vec_.place())) { + framework::Tensor tmp; + TensorCopy(cuda_vec_, boost::get(place), &tmp); + WaitPlace(cuda_vec_.place()); + cuda_vec_.ShareDataWith(tmp); + // Still dirty + } else { + // Dirty && DataInCUDA && Device is same + // Do nothing + } + } else { + if (!IsInCUDA()) { + // Even data is not dirty. However, data is not in CUDA. Copy data. + TensorCopy(cpu_vec_, boost::get(place), + &cuda_vec_); + WaitPlace(place); + SetFlag(kDataInCUDA); + } else if (!(place == cuda_vec_.place())) { + framework::Tensor tmp; + WaitPlace(cuda_vec_.place()); + TensorCopy(cuda_vec_, boost::get(place), &tmp); + WaitPlace(cuda_vec_.place()); + WaitPlace(place); + cuda_vec_.ShareDataWith(tmp); + } else { + // Not Dirty && DataInCUDA && Device is same + // Do nothing. + } + } + } + + void ImmutableCPU() const { + if (IsDirty() && + !IsInCPU()) { // If data has been changed in CUDA, or CPU has no data. + CopyToCPU(); + UnsetFlag(kDirty); + } + SetFlag(kDataInCPU); + } + + void UnsetFlag(int flag) const { flag_ &= ~flag; } + void SetFlag(int flag) const { flag_ |= flag; } + + bool IsDirty() const { return flag_ & kDirty; } + + bool IsInCUDA() const { return flag_ & kDataInCUDA; } + + bool IsInCPU() const { return flag_ & kDataInCPU; } + + static void WaitPlace(const platform::Place place) { + if (platform::is_gpu_place(place)) { + platform::DeviceContextPool::Instance() + .Get(boost::get(place)) + ->Wait(); + } + } + + static T &EmptyDummy() { + static T dummy = T(); + return dummy; + } + + mutable int flag_; + mutable Tensor cpu_vec_; + mutable Tensor cuda_vec_; + size_t size_; }; #else // PADDLE_WITH_CUDA diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index 86f6147cf7ac1e82ac2904bbcdcf9697422560ce..17f942571d0141537e992be9ab73847d2a794698 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -54,6 +54,10 @@ class CompileTimeInferShapeContext : public InferShapeContext { size_t j = 0) const override { PADDLE_ENFORCE_LT(i, Inputs(in).size()); PADDLE_ENFORCE_LT(j, Outputs(out).size()); + PADDLE_ENFORCE(Inputs(in)[i] != framework::kEmptyVarName, + "The %s[%d] is @EMPTY@", in, i); + PADDLE_ENFORCE(Outputs(out)[j] != framework::kEmptyVarName, + "The %s[%d] is @EMPTY@", out, j); auto *in_var = block_.FindVarRecursive(Inputs(in)[i]); auto *out_var = block_.FindVarRecursive(Outputs(out)[j]); if (in_var->GetType() != proto::VarType::LOD_TENSOR) { @@ -63,6 +67,7 @@ class CompileTimeInferShapeContext : public InferShapeContext { PADDLE_ENFORCE_EQ(in_var->GetType(), proto::VarType::LOD_TENSOR, "The %d-th output of Output(%s) must be LoDTensor.", j, out); + out_var->SetLoDLevel(in_var->GetLoDLevel()); } diff --git a/paddle/fluid/framework/op_info.h b/paddle/fluid/framework/op_info.h index 19e5c2c73eac74dee030a4f7820531800f737e4e..06cf4a0f9f33af67343437baeb9623a35ddad183 100644 --- a/paddle/fluid/framework/op_info.h +++ b/paddle/fluid/framework/op_info.h @@ -38,27 +38,31 @@ struct OpInfo { OpAttrChecker* checker_{nullptr}; 
   InferVarTypeFN infer_var_type_;
   InferShapeFN infer_shape_;
+  std::string op_type_;
 
   bool HasOpProtoAndChecker() const {
     return proto_ != nullptr && checker_ != nullptr;
   }
 
   const proto::OpProto& Proto() const {
-    PADDLE_ENFORCE_NOT_NULL(proto_, "Operator Proto has not been registered");
+    PADDLE_ENFORCE_NOT_NULL(proto_, "Operator %s Proto has not been registered",
+                            op_type_);
     PADDLE_ENFORCE(proto_->IsInitialized(),
-                   "Operator Proto must be initialized in op info");
+                   "Operator %s Proto must be initialized in op info",
+                   op_type_);
     return *proto_;
   }
 
   const OpCreator& Creator() const {
-    PADDLE_ENFORCE_NOT_NULL(creator_,
-                            "Operator Creator has not been registered");
+    PADDLE_ENFORCE_NOT_NULL(
+        creator_, "Operator %s Creator has not been registered", op_type_);
     return creator_;
   }
 
   const GradOpMakerFN& GradOpMaker() const {
     PADDLE_ENFORCE_NOT_NULL(grad_op_maker_,
-                            "Operator GradOpMaker has not been registered.");
+                            "Operator %s GradOpMaker has not been registered.",
+                            op_type_);
     return grad_op_maker_;
   }
 
@@ -73,8 +77,9 @@ class OpInfoMap {
     return map_.find(op_type) != map_.end();
   }
 
-  void Insert(const std::string& type, const OpInfo& info) {
+  void Insert(const std::string& type, OpInfo info) {
     PADDLE_ENFORCE(!Has(type), "Operator %s has been registered", type);
+    info.op_type_ = type;
     map_.insert({type, info});
   }
 
diff --git a/paddle/fluid/framework/op_proto_maker.cc b/paddle/fluid/framework/op_proto_maker.cc
index 4fa047bf3ee3d06ac4aec5d2cc6a355965836d42..2663c9be41a834523fb896b490e7e75df256de05 100644
--- a/paddle/fluid/framework/op_proto_maker.cc
+++ b/paddle/fluid/framework/op_proto_maker.cc
@@ -120,6 +120,7 @@ void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto,
       {static_cast(OpRole::kForward), static_cast(OpRole::kBackward),
        static_cast(OpRole::kOptimize), static_cast(OpRole::kRPC),
+       static_cast(OpRole::kDist), static_cast(OpRole::kLRSched),
        static_cast(OpRole::kLoss) | static_cast(OpRole::kForward),
        static_cast(OpRole::kLoss) | static_cast(OpRole::kBackward),
@@ -131,7 +132,9 @@ void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto,
   AddAttr(OpNamescopeAttrName(), "Operator name with namesope.")
       .SetDefault("");
-
+  AddAttr>(OpCreationCallstackAttrName(),
+                  "Callstack for Op Creation.")
+      .SetDefault({});
   Validate();
 }
 
diff --git a/paddle/fluid/framework/op_proto_maker.h b/paddle/fluid/framework/op_proto_maker.h
index 18827385ad659922230ff68709a2926a8c9013ac..f13196959705bad473a6f7b3ef88f8faa8abe2b8 100644
--- a/paddle/fluid/framework/op_proto_maker.h
+++ b/paddle/fluid/framework/op_proto_maker.h
@@ -26,7 +26,13 @@ enum class OpRole {
   kForward = 0x0000,
   kBackward = 0x0001,
   kOptimize = 0x0002,
+  // RPC role is for send/recv related op
   kRPC = 0x0003,
+  // Dist role is for split_byref/split_selected_rows/concat
+  // used for distributed training.
+  kDist = 0x0004,
+  // Tag all learning rate scheduler operators.
+  kLRSched = 0x0005,
 
   kLoss = 0x0100,
   // The default value of op's role.
This should be only used for unittests and @@ -40,6 +46,7 @@ class OpProtoAndCheckerMaker { static const char *OpRoleAttrName() { return "op_role"; } static const char *OpRoleVarAttrName() { return "op_role_var"; } static const char *OpNamescopeAttrName() { return "op_namescope"; } + static const char *OpCreationCallstackAttrName() { return "op_callstack"; } void operator()(proto::OpProto *proto, OpAttrChecker *attr_checker); diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index e7dfa608b48f89a2155e43c7e63e31154675cd38..ef2eb334a4e7f3f482ba6d62d3f325f109c69302 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -23,6 +23,11 @@ limitations under the License. */ #include #include +#if defined(_WIN32) +#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h +#define GOOGLE_GLOG_DLL_DECL +#endif + #include "glog/logging.h" // For VLOG() #include "paddle/fluid/framework/attribute.h" #include "paddle/fluid/framework/details/op_registry.h" @@ -241,22 +246,20 @@ struct OpKernelRegistrarFunctorEx #include - #include - +#include +#include +#include #include "paddle/fluid/framework/data_transform.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/shape_inference.h" #include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/platform/profiler.h" @@ -137,19 +142,48 @@ static LoD GetLoD(const Scope& scope, const std::string& name) { } void OperatorBase::Run(const Scope& scope, const platform::Place& place) { - VLOG(4) << place << " " << DebugStringEx(&scope); - if (platform::is_gpu_place(place)) { + try { + if (VLOG_IS_ON(4)) { + VLOG(4) << place << " " << DebugStringEx(&scope); + } + if (platform::is_gpu_place(place)) { #ifndef PADDLE_WITH_CUDA - PADDLE_THROW("Cannot run operator on place %s", place); + PADDLE_THROW("Cannot run operator on place %s", place); #else - auto dev_id = boost::get(place).device; - platform::SetDeviceId(dev_id); + auto dev_id = boost::get(place).device; + platform::SetDeviceId(dev_id); #endif + } + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + platform::RecordEvent record_event(Type(), pool.Get(place)); + RunImpl(scope, place); + if (VLOG_IS_ON(3)) { + VLOG(3) << place << " " << DebugStringEx(&scope); + } + } catch (platform::EnforceNotMet exception) { + if (Attrs().count("sub_block") != 0) { + throw exception; + } + + auto& callstack = Attr>( + OpProtoAndCheckerMaker::OpCreationCallstackAttrName()); + + if (callstack.empty()) { + throw exception; + } + std::ostringstream sout; + sout << "Invoke operator " << Type() << " error.\n"; + sout << "Python Callstacks: \n"; + for (auto& line : callstack) { + sout << line; + } + sout << "C++ Callstacks: \n"; + sout << exception.err_str_; + exception.err_str_ = sout.str(); + throw exception; + } catch (...) 
{ + std::rethrow_exception(std::current_exception()); } - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - platform::RecordEvent record_event(Type(), pool.Get(place)); - RunImpl(scope, place); - VLOG(3) << place << " " << DebugStringEx(&scope); } bool OperatorBase::HasInputs(const std::string& name) const { @@ -177,7 +211,7 @@ const std::vector& OperatorBase::Inputs( } bool OperatorBase::HasOutputs(const std::string& name) const { - if (outputs_.find(name) != outputs_.end()) { + if (outputs_.end() != outputs_.find(name)) { return true; } else { return false; diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 1040eb882baea624e972faf4af3094119df72308..626b50edfd39424473be33e9f8baec5970471477 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -20,6 +20,8 @@ limitations under the License. */ #include #include #include +#define GLOG_NO_ABBREVIATED_SEVERITIES +#define GOOGLE_GLOG_DLL_DECL #include "glog/logging.h" // For VLOG #include "paddle/fluid/framework/attribute.h" diff --git a/paddle/fluid/framework/shape_inference.cc b/paddle/fluid/framework/shape_inference.cc index ddff2c7c261746ac9986e79cff3da7e0a9654adc..89eb00ff65598eff5f4ba541df107e8da04e1a89 100644 --- a/paddle/fluid/framework/shape_inference.cc +++ b/paddle/fluid/framework/shape_inference.cc @@ -46,6 +46,16 @@ std::vector InferShapeContext::GetReaderDims( return this->GetRepeatedDims(arg_names[0]); } +void InferShapeContext::ShareLoDs(const std::string &in, + const std::string &out) const { + PADDLE_ENFORCE_EQ(Inputs(in).size(), Outputs(out).size(), + "The number of arguments in %s and %s is not equal.", in, + out); + for (size_t i = 0; i < in.size(); ++i) { + ShareLoD(in, out, i, i); + } +} + DDim InferShapeContext::GetInputsElementDim(const std::string &name, int idx) const { const std::vector &names = Inputs(name); diff --git a/paddle/fluid/framework/shape_inference.h b/paddle/fluid/framework/shape_inference.h index 5f497cafa0f75f7c23d550ef767d55274de7c900..fd220d961af85dd55fe2031409180823d8f178fc 100644 --- a/paddle/fluid/framework/shape_inference.h +++ b/paddle/fluid/framework/shape_inference.h @@ -56,6 +56,8 @@ class InferShapeContext { virtual const std::vector &Outputs( const std::string &name) const = 0; + void ShareLoDs(const std::string &in, const std::string &out) const; + virtual void ShareLoD(const std::string &in, const std::string &out, size_t i = 0, size_t j = 0) const = 0; diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 684e0ce0e292d852d4601ebd1ccd920382e42c8b..1032aadcbda4f1b05841e08e1abe7c737c3aeb9c 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -71,15 +71,15 @@ bool AnalysisPredictor::Init( inference_program_ = paddle::inference::Load( executor_.get(), scope_.get(), config_.prog_file, config_.param_file); } else { - LOG(ERROR) << "fail to load inference model."; + LOG(ERROR) << "fail to load inference model from " << config_.model_dir; return false; } OptimizeInferenceProgram(); - ctx_ = executor_->Prepare(*inference_program_, 0); if (config_._use_mkldnn) { executor_->EnableMKLDNN(*inference_program_); } + ctx_ = executor_->Prepare(*inference_program_, 0); VLOG(5) << "to create variables"; PADDLE_ENFORCE(scope_.get()); @@ -109,8 +109,9 @@ void AnalysisPredictor::OptimizeInferenceProgram() { } argument_.origin_program_desc.reset( new 
ProgramDesc(*inference_program_->Proto())); - PADDLE_ENFORCE(config_.ir_mode == AnalysisConfig::IrPassMode::kExclude, - "Only kExclude is supported yet."); + PADDLE_ENFORCE( + config_.ir_mode == contrib::AnalysisConfig::IrPassMode::kExclude, + "Only kExclude is supported yet."); Analyzer().DisableIrPasses(config_.ir_passes).Run(&argument_); CHECK(argument_.transformed_program_desc); @@ -126,8 +127,9 @@ void AnalysisPredictor::OptimizeInferenceProgram() { } template <> -std::unique_ptr CreatePaddlePredictor< - AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig& config) { +std::unique_ptr +CreatePaddlePredictor( + const contrib::AnalysisConfig& config) { VLOG(3) << "create AnalysisConfig"; if (config.use_gpu) { // 1. GPU memeroy @@ -154,4 +156,11 @@ std::unique_ptr CreatePaddlePredictor< return predictor; } +template <> +std::unique_ptr CreatePaddlePredictor( + const contrib::AnalysisConfig& config) { + return CreatePaddlePredictor(config); +} + } // namespace paddle diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index e53925366e9214cd60422efe56884751297c15e5..aa00e8be5c28c2e3bfe74fa0bff2c72210bd106e 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -30,7 +30,7 @@ using framework::proto::ProgramDesc; */ class AnalysisPredictor : public NativePaddlePredictor { public: - explicit AnalysisPredictor(const AnalysisConfig& config) + explicit AnalysisPredictor(const contrib::AnalysisConfig& config) : NativePaddlePredictor(config), config_(config) {} bool Init(const std::shared_ptr& parent_scope); @@ -46,7 +46,7 @@ class AnalysisPredictor : public NativePaddlePredictor { Argument& analysis_argument() { return argument_; } private: - AnalysisConfig config_; + contrib::AnalysisConfig config_; Argument argument_; }; diff --git a/paddle/fluid/inference/api/api_anakin_engine.cc b/paddle/fluid/inference/api/api_anakin_engine.cc index 43b31269d2bd82c06e284e3599a3763da693a2af..2c4894fd887f2f509dc7ab88c367cea5c1aed99a 100644 --- a/paddle/fluid/inference/api/api_anakin_engine.cc +++ b/paddle/fluid/inference/api/api_anakin_engine.cc @@ -31,21 +31,24 @@ namespace paddle { +using paddle::contrib::AnakinConfig; + template PaddleInferenceAnakinPredictor::PaddleInferenceAnakinPredictor( - const AnakinConfig &config) { + const contrib::AnakinConfig &config) { CHECK(Init(config)); } template <> PaddleInferenceAnakinPredictor::PaddleInferenceAnakinPredictor( - const AnakinConfig &config) { + const contrib::AnakinConfig &config) { omp_set_dynamic(0); omp_set_num_threads(1); mkl_set_num_threads(1); CHECK(Init(config)); } template -bool PaddleInferenceAnakinPredictor::Init(const AnakinConfig &config) { +bool PaddleInferenceAnakinPredictor::Init( + const contrib::AnakinConfig &config) { if (!(graph_.load(config.model_file))) { VLOG(3) << "fail to load graph from " << config.model_file; return false; @@ -200,10 +203,11 @@ template class PaddleInferenceAnakinPredictor; // A factory to help create difference predictor. 
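For orientation, the factory change above means a caller can now select the analysis engine purely by the config type, without spelling out a PaddleEngineKind. The following usage sketch is hypothetical: the model path, feed name, and shape are invented, and it assumes the contrib::AnalysisConfig fields that appear elsewhere in this patch:

```cpp
#include <cstdint>
#include <vector>

#include "paddle/fluid/inference/paddle_inference_api.h"

int main() {
  paddle::contrib::AnalysisConfig config;
  config.model_dir = "./word2vec.inference.model";  // placeholder path
  config.use_gpu = false;
  config.device = 0;
  config.specify_input_name = true;
  config.enable_ir_optim = true;

  // The config type alone selects the analysis predictor.
  auto predictor = paddle::CreatePaddlePredictor(config);

  paddle::PaddleTensor input;
  input.name = "firstw";  // placeholder feed name
  input.shape = {4, 1};
  input.dtype = paddle::PaddleDType::INT64;
  std::vector<int64_t> ids = {1, 2, 3, 4};
  input.data = paddle::PaddleBuf(ids.data(), ids.size() * sizeof(int64_t));

  std::vector<paddle::PaddleTensor> outputs;
  predictor->Run({input}, &outputs);
  return outputs.empty() ? 1 : 0;
}
```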
template <> -std::unique_ptr CreatePaddlePredictor< - AnakinConfig, PaddleEngineKind::kAnakin>(const AnakinConfig &config) { +std::unique_ptr +CreatePaddlePredictor( + const contrib::AnakinConfig &config) { VLOG(3) << "Anakin Predictor create."; - if (config.target_type == AnakinConfig::NVGPU) { + if (config.target_type == contrib::AnakinConfig::NVGPU) { #ifdef PADDLE_WITH_CUDA VLOG(3) << "Anakin Predictor create on [ NVIDIA GPU ]."; std::unique_ptr x( @@ -213,7 +217,7 @@ std::unique_ptr CreatePaddlePredictor< LOG(ERROR) << "AnakinConfig::NVGPU could not used in ONLY-CPU environment"; return nullptr; #endif - } else if (config.target_type == AnakinConfig::X86) { + } else if (config.target_type == contrib::AnakinConfig::X86) { VLOG(3) << "Anakin Predictor create on [ Intel X86 ]."; std::unique_ptr x( new PaddleInferenceAnakinPredictor(config)); diff --git a/paddle/fluid/inference/api/api_anakin_engine.h b/paddle/fluid/inference/api/api_anakin_engine.h index dd08661880d8cc3a9f4401e9af91a3d10e6579b6..04536ea3a53bbbc9293d92e69a23567e4bfd84c0 100644 --- a/paddle/fluid/inference/api/api_anakin_engine.h +++ b/paddle/fluid/inference/api/api_anakin_engine.h @@ -29,6 +29,8 @@ limitations under the License. */ namespace paddle { +using contrib::AnakinConfig; + template class PaddleInferenceAnakinPredictor : public PaddlePredictor { public: diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index 2e9e10139fa7008a46c3782960dfd44d3228cc26..dca4386b21b4a064c21b52218682321258f368c4 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -22,6 +22,7 @@ limitations under the License. */ #include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/inference/api/api_impl.h" +#include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/timer.h" #include "paddle/fluid/platform/profiler.h" @@ -101,14 +102,11 @@ bool NativePaddlePredictor::Init( inference_program_ = paddle::inference::Load( executor_.get(), scope_.get(), config_.prog_file, config_.param_file); } else { - LOG(ERROR) << "fail to load inference model."; + LOG(ERROR) << "fail to load inference model from " << config_.model_dir; return false; } ctx_ = executor_->Prepare(*inference_program_, 0); - if (config_._use_mkldnn) { - executor_->EnableMKLDNN(*inference_program_); - } executor_->CreateVariables(*inference_program_, sub_scope_ ? sub_scope_ : scope_.get(), 0); @@ -218,57 +216,20 @@ bool NativePaddlePredictor::SetFeed(const std::vector &inputs, template void NativePaddlePredictor::GetFetchOne(const framework::LoDTensor &fetch, PaddleTensor *output) { - std::vector shape; - auto dims_i = fetch.dims(); - auto lod = fetch.lod(); - const T *output_ptr = fetch.data(); - auto num = fetch.numel(); - std::vector data; - if (0 == lod.size()) { - std::copy(output_ptr, output_ptr + num, std::back_inserter(data)); - for (int j = 0; j < dims_i.size(); ++j) { - shape.push_back(dims_i[j]); - } - } else { - // for batch detection - // image[0] -> output[0] shape {145, 6} - // image[1] -> output[1] shape {176, 6} - // then, - // the batch output shape {321, 6} - // the lod {{0, 145, 321}} - // so we should append output[0] to {176, 6} - size_t max_dim = 0; - for (size_t j = 1; j < lod[0].size(); j++) { - max_dim = std::max(max_dim, lod[0][j] - lod[0][j - 1]); - } - size_t common_dim = lod[0].back() == 0 ? 
0 : num / lod[0].back(); - if (max_dim > 0) { - data.resize((lod[0].size() - 1) * max_dim * common_dim, 0); - } - for (size_t j = 1; j < lod[0].size(); j++) { - size_t start = lod[0][j - 1] * common_dim; - size_t end = lod[0][j] * common_dim; - if (end > start) { - std::copy(output_ptr + start, output_ptr + end, - data.begin() + (j - 1) * max_dim * common_dim); - } - } - shape.push_back(lod[0].size() - 1); - shape.push_back(max_dim); - for (int j = 1; j < dims_i.size(); ++j) { - shape.push_back(dims_i[j]); - } - } - - output->shape = shape; - auto &buffer = output->data; - if (buffer.empty() || buffer.length() < sizeof(T) * data.size()) { - buffer.Resize(sizeof(T) * data.size()); - } - std::memcpy(buffer.data(), data.data(), sizeof(T) * data.size()); - // copy LoD - for (const auto &level : fetch.lod()) { - output->lod.emplace_back(level); + // set shape. + auto shape = framework::vectorize(fetch.dims()); + output->shape.assign(shape.begin(), shape.end()); + // set data. + const T *data = fetch.data(); + int num_elems = inference::VecReduceToInt(shape); + output->data.Resize(num_elems * sizeof(T)); + // The fetched tensor output by fetch op, should always in CPU memory, so just + // copy. + memcpy(output->data.data(), data, num_elems * sizeof(T)); + // set lod + output->lod.clear(); + for (auto &level : fetch.lod()) { + output->lod.emplace_back(level.begin(), level.end()); } } @@ -330,4 +291,10 @@ std::unique_ptr CreatePaddlePredictor< #endif } +template <> +std::unique_ptr CreatePaddlePredictor( + const NativeConfig &config) { + return CreatePaddlePredictor(config); +} + } // namespace paddle diff --git a/paddle/fluid/inference/api/api_impl.h b/paddle/fluid/inference/api/api_impl.h index ec801c58857e716241d28404510530e551ed25aa..6386d601262b3dac0e957fae991d23768b52f2c0 100644 --- a/paddle/fluid/inference/api/api_impl.h +++ b/paddle/fluid/inference/api/api_impl.h @@ -14,6 +14,12 @@ #pragma once +// logging.h and windows.h conflict +#define GLOG_NO_ABBREVIATED_SEVERITIES +// solve static linking error in windows +// https://github.com/google/glog/issues/301 +#define GOOGLE_GLOG_DLL_DECL + #include #include #include diff --git a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc index d9d6e139b8735c8f07c52f63c70b6b9805e03642..6c7e63971b2d93f58e219dbd93637c8d389deb7c 100644 --- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc +++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc @@ -25,10 +25,11 @@ using inference::analysis::Argument; using inference::Singleton; using inference::analysis::Analyzer; using framework::proto::ProgramDesc; +using paddle::contrib::MixedRTConfig; class TensorRTSubgraphPredictor : public NativePaddlePredictor { public: - explicit TensorRTSubgraphPredictor(const TensorRTConfig& config) + explicit TensorRTSubgraphPredictor(const MixedRTConfig& config) : NativePaddlePredictor(config), config_(config) {} bool Init(const std::shared_ptr& parent_scope) { @@ -115,13 +116,13 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor { } private: - TensorRTConfig config_; + MixedRTConfig config_; }; template <> std::unique_ptr -CreatePaddlePredictor( - const TensorRTConfig& config) { +CreatePaddlePredictor( + const MixedRTConfig& config) { VLOG(3) << "create TensorRTSubgraphPredictor"; if (config.use_gpu) { // 1. 
GPU memeroy @@ -150,6 +151,13 @@ CreatePaddlePredictor( return std::move(predictor); } +template <> +std::unique_ptr CreatePaddlePredictor( + const MixedRTConfig& config) { + return CreatePaddlePredictor(config); +} + } // namespace paddle USE_TRT_CONVERTER(elementwise_add_weight); diff --git a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc index 9e7425eddd2df07ffe897f908aad360abe42117a..fc6310e90b0257bc84742fb617a00f5778bb1866 100644 --- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc +++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc @@ -20,6 +20,8 @@ namespace paddle { +using contrib::MixedRTConfig; + DEFINE_string(dirname, "", "Directory of the inference model."); void CompareTensorRTWithFluid(bool enable_tensorrt) { @@ -32,7 +34,7 @@ void CompareTensorRTWithFluid(bool enable_tensorrt) { config0.fraction_of_gpu_memory = 0.3; config0.device = 0; - TensorRTConfig config1; + MixedRTConfig config1; config1.model_dir = FLAGS_dirname + "word2vec.inference.model"; config1.use_gpu = true; config1.fraction_of_gpu_memory = 0.3; @@ -42,7 +44,7 @@ void CompareTensorRTWithFluid(bool enable_tensorrt) { auto predictor0 = CreatePaddlePredictor(config0); auto predictor1 = - CreatePaddlePredictor(config1); for (int batch_id = 0; batch_id < 1; batch_id++) { diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt index afb46a7139f6ab8e6b3697fdc56fe1c78a05cd64..d4e6bb3e4a4ceb361ccd35121d0ecf84a764243e 100644 --- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt +++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt @@ -1,13 +1,32 @@ cmake_minimum_required(VERSION 3.0) - project(cpp_inference_demo CXX C) +option(WITH_MKL "Compile demo with MKL/OpenBlas support, default use MKL." ON) +option(WITH_GPU "Compile demo with GPU/CPU, default use CPU." OFF) +option(WITH_STATIC_LIB "Compile demo with static/shared library, default use static." ON) + +macro(safe_set_static_flag) + foreach(flag_var + CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE + CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO) + if(${flag_var} MATCHES "/MD") + string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}") + endif(${flag_var} MATCHES "/MD") + endforeach(flag_var) +endmacro() -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") if (WIN32) -set(CMAKE_STATIC_LIBRARY_PREFIX "lib") + if (WITH_STATIC_LIB) + safe_set_static_flag() + add_definitions(-DSTATIC_LIB) + set(CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS} "/w") + set(CMAKE_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE} "/w") + endif() + set(CMAKE_STATIC_LIBRARY_PREFIX "lib") else() -set(CMAKE_STATIC_LIBRARY_PREFIX "") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") + set(CMAKE_STATIC_LIBRARY_PREFIX "") endif() +message("flags" ${CMAKE_CXX_FLAGS}) if(NOT DEFINED PADDLE_LIB) message(FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/path/paddle/lib") @@ -16,14 +35,18 @@ if(NOT DEFINED DEMO_NAME) message(FATAL_ERROR "please set DEMO_NAME with -DDEMO_NAME=demo_name") endif() -option(WITH_MKL "Compile demo with MKL/OpenBlas support, default use MKL." ON) -option(WITH_GPU "Compile demo with GPU/CPU, default use CPU." OFF) -option(WITH_STATIC_LIB "Compile demo with static/shared library, default use static." 
ON)
 
 if(WITH_GPU)
-  set(CUDA_LIB "/usr/local/cuda/lib64/" CACHE STRING "CUDA Library")
+  if(NOT WIN32)
+    set(CUDA_LIB "/usr/local/cuda/lib64/" CACHE STRING "CUDA Library")
+  else()
+    if(CUDA_LIB STREQUAL "")
+      set(CUDA_LIB "C:\\Program\ Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v8.0\\lib\\x64")
+    endif()
+  endif(NOT WIN32)
 endif()
 
+include_directories("D:/Paddle/")
 include_directories("${PADDLE_LIB}")
 include_directories("${PADDLE_LIB}/third_party/install/protobuf/include")
 include_directories("${PADDLE_LIB}/third_party/install/glog/include")
@@ -83,10 +106,18 @@ set(DEPS ${DEPS}
     ${MATH_LIB} ${MKLDNN_LIB}
     ${CMAKE_STATIC_LIBRARY_PREFIX}glog ${CMAKE_STATIC_LIBRARY_PREFIX}gflags ${CMAKE_STATIC_LIBRARY_PREFIX}protobuf
     ${EXTERNAL_LIB})
+# NOTE(dzhwinter) shlwapi is deprecated.
+set(DEPS ${DEPS} libcmt shlwapi)
 endif(NOT WIN32)
 
 if(WITH_GPU)
-  set(DEPS ${DEPS} ${CUDA_LIB}/libcudart${CMAKE_SHARED_LIBRARY_SUFFIX})
+  if(NOT WIN32)
+    set(DEPS ${DEPS} ${CUDA_LIB}/libcudart${CMAKE_SHARED_LIBRARY_SUFFIX})
+  else()
+    set(DEPS ${DEPS} ${CUDA_LIB}/cudart${CMAKE_STATIC_LIBRARY_SUFFIX} )
+    set(DEPS ${DEPS} ${CUDA_LIB}/cublas${CMAKE_STATIC_LIBRARY_SUFFIX} )
+    set(DEPS ${DEPS} ${CUDA_LIB}/cudnn${CMAKE_STATIC_LIBRARY_SUFFIX} )
+  endif()
 endif()
 
 target_link_libraries(${DEMO_NAME} ${DEPS})
diff --git a/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc b/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc
index 03ac79e9edf0d7ce6e167c3d34af5ba84bbc0e72..360f924810a570422db5a00b13939813fa73e2fa 100644
--- a/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc
+++ b/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc
@@ -18,6 +18,8 @@ limitations under the License. */
 #include
 #include
+
+#include
 #include
 #include  //NOLINT
 #include "paddle/fluid/inference/paddle_inference_api.h"
@@ -67,7 +69,8 @@ void Main(bool use_gpu) {
       0.000932706};
   const size_t num_elements = outputs.front().data.length() / sizeof(float);
   // The outputs' buffers are in CPU memory.
-  for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
+  for (size_t i = 0; i < std::min(static_cast(5), num_elements);
+       i++) {
     PADDLE_ENFORCE(static_cast(outputs.front().data.data())[i],
                    result[i]);
   }
@@ -113,7 +116,8 @@ void MainThreads(int num_threads, bool use_gpu) {
       const size_t num_elements = outputs.front().data.length() / sizeof(float);
       // The outputs' buffers are in CPU memory.
-      for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
+      for (size_t i = 0; i < std::min(static_cast(5), num_elements);
+           i++) {
         PADDLE_ENFORCE(static_cast(outputs.front().data.data())[i],
                        result[i]);
       }
diff --git a/paddle/fluid/inference/api/demo_ci/windows_inference.md b/paddle/fluid/inference/api/demo_ci/windows_inference.md
new file mode 100644
index 0000000000000000000000000000000000000000..44b2586ad6d33ce7cbd2bb3080acc96b5e27f660
--- /dev/null
+++ b/paddle/fluid/inference/api/demo_ci/windows_inference.md
@@ -0,0 +1,19 @@
+# Windows inference
+This document describes Windows inference. Currently only static compilation is provided: it builds paddle_fluid.lib, which contains all third-party dependencies except openblas.dll.
+
+1. Download the latest paddle_fluid.lib and openblas.dll, and put them in the same directory.
+
+2. Prepare a pre-trained model, for example one of the models in models; a model can be saved with the save_inference_model interface. Put the model files into this directory.
+
+3. Go to the Paddle/paddle/fluid/inference/api/demo_ci directory, create a build directory, and use cmake to generate a VS2015 solution.
+Here PADDLE_LIB is the folder that contains the paddle_fluid.lib mentioned above, and CUDA_LIB is the directory of the x64 CUDA system libraries.
+```shell
+  cmake .. -G "Visual Studio 14 2015 Win64" -DWITH_GPU=ON -DWITH_MKL=OFF -DWITH_STATIC_LIB=ON -DCMAKE_BUILD_TYPE=Release -DDEMO_NAME=inference_icnet -DPADDLE_LIB=D:\to_the_paddle_fluid.lib -DCUDA_LIB=D:\tools\v8.0\lib\x64
+```
+Then open the generated project in VS2015, make sure static linking ("/MT") is used, and build the exe. Put openblas.dll into the directory of the exe.
+
+4. This exe is the build artifact of the project and can be run directly.
+
+## FAQ
+1. cmake has to be downloaded manually and added to the system PATH.
+2. Paths must not contain spaces; for example, a CUDA_LIB path under Program Files (x86) may fail. Copy CUDA to a new location in that case.
diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h
index 8e359a67738c0df180933421b45f15b39fd0e78c..1fec2f96da0f9d978a3537b2d78e4ce5ef628c81 100644
--- a/paddle/fluid/inference/api/helper.h
+++ b/paddle/fluid/inference/api/helper.h
@@ -74,13 +74,17 @@ template <>
 std::string to_string>>(
     const std::vector>> &vec);
+template
+int VecReduceToInt(const std::vector &v) {
+  return std::accumulate(v.begin(), v.end(), 1, [](T a, T b) { return a * b; });
+}
+
 template
 static void TensorAssignData(PaddleTensor *tensor,
                              const std::vector> &data) {
   // Assign buffer
-  int dim = std::accumulate(tensor->shape.begin(), tensor->shape.end(), 1,
-                            [](int a, int b) { return a * b; });
-  tensor->data.Resize(sizeof(T) * dim);
+  int num_elems = VecReduceToInt(tensor->shape);
+  tensor->data.Resize(sizeof(T) * num_elems);
   int c = 0;
   for (const auto &f : data) {
     for (T v : f) {
@@ -89,7 +93,7 @@ static void TensorAssignData(PaddleTensor *tensor,
   }
 }
 
-std::string DescribeTensor(const PaddleTensor &tensor) {
+static std::string DescribeTensor(const PaddleTensor &tensor) {
   std::stringstream os;
   os << "Tensor [" << tensor.name << "]\n";
   os << " - type: ";
@@ -113,8 +117,7 @@ std::string DescribeTensor(const PaddleTensor &tensor) {
   os << "\n";
   os << " - data: ";
-  int dim = std::accumulate(tensor.shape.begin(), tensor.shape.end(), 1,
-                            [](int a, int b) { return a * b; });
+  int dim = VecReduceToInt(tensor.shape);
   for (int i = 0; i < dim; i++) {
     os << static_cast(tensor.data.data())[i] << " ";
   }
@@ -122,8 +125,8 @@
   return os.str();
 }
 
-void PrintTime(int batch_size, int repeat, int num_threads, int tid,
-               double latency, int epoch = 1) {
+static void PrintTime(int batch_size, int repeat, int num_threads, int tid,
+                      double latency, int epoch = 1) {
   LOG(INFO) << "====== batch_size: " << batch_size << ", repeat: " << repeat
             << ", threads: " << num_threads << ", thread id: " << tid
             << ", latency: " << latency << "ms ======";
diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h
index 55a07ca705f9fafa9ea223a867300bd14e10c364..2b4e5ed73704041e18bdbce32338405f3601e082 100644
--- a/paddle/fluid/inference/api/paddle_inference_api.h
+++ b/paddle/fluid/inference/api/paddle_inference_api.h
@@ -28,34 +28,61 @@ limitations under the License. */
 
 namespace paddle {
 
+// Data type.
 enum PaddleDType {
   FLOAT32,
   INT64,
+  // TODO(Superjomn) support more data types if needed.
 };
 
+/*
+ * Memory management for PaddleTensor.
+ * The PaddleBuf holds a buffer for data input or output. The memory can be
+ * allocated by user or by PaddleBuf itself, but in any case, the PaddleBuf
+ * should be reused for better performance.
+ *
+ * For user allocated memory, the following API can be used:
+ * - PaddleBuf(void* data, size_t length) to set an external memory by
+ * specifying
+ *   the memory address and length.
+ * - Reset(void* data, size_t length) to reset the PaddleBuf with an external
+ * memory.
+ * ATTENTION, for user allocated memory, deallocation should be done by users + * externally after the program finished. The PaddleBuf won't do any allocation + * or deallocation. + * + * To have the PaddleBuf allocate and manage the memory: + * - PaddleBuf(size_t length) will allocate a memory of size `length`. + * - Resize(size_t length) resize the memory to no less than `length`, ATTENTION + * if the allocated memory is larger than `length`, nothing will done. + */ class PaddleBuf { public: - PaddleBuf() = default; - PaddleBuf(PaddleBuf&& other); - // Copy only available when memory is managed externally. - explicit PaddleBuf(const PaddleBuf&); - PaddleBuf& operator=(const PaddleBuf&); - PaddleBuf& operator=(PaddleBuf&&); - // Do not own the memory. - PaddleBuf(void* data, size_t length) - : data_(data), length_(length), memory_owned_{false} {} - // Own memory. + // PaddleBuf allocate memory internally, and manage it. explicit PaddleBuf(size_t length) : data_(new char[length]), length_(length), memory_owned_(true) {} - // Resize to `length` bytes. + // Set external memory, the PaddleBuf won't manage it. + PaddleBuf(void* data, size_t length) + : data_(data), length_(length), memory_owned_{false} {} + // Copy only available when memory is managed externally. + explicit PaddleBuf(const PaddleBuf&); + + // Resize the memory. void Resize(size_t length); - // Reset to external memory. + // Reset to external memory, with address and length set. void Reset(void* data, size_t length); + // Tell whether the buffer is empty. bool empty() const { return length_ == 0; } + // Get the memory address. void* data() const { return data_; } + // Get the memory length. size_t length() const { return length_; } ~PaddleBuf() { Free(); } + PaddleBuf& operator=(const PaddleBuf&); + PaddleBuf& operator=(PaddleBuf&&); + PaddleBuf() = default; + PaddleBuf(PaddleBuf&& other); private: void Free(); @@ -64,6 +91,7 @@ class PaddleBuf { bool memory_owned_{true}; }; +// Basic input and output data structure for PaddlePredictor. struct PaddleTensor { PaddleTensor() = default; std::string name; // variable name. @@ -73,19 +101,8 @@ struct PaddleTensor { std::vector> lod; // Tensor+LoD equals LoDTensor }; -enum class PaddleEngineKind { - kNative = 0, // Use the native Fluid facility. - kAnakin, // Use Anakin for inference. - kAutoMixedTensorRT, // Automatically mix Fluid with TensorRT. - kAnalysis - // TODO(Superjomn) support following engines latter. - // kTensorRT, // Use TensorRT for inference. - // kAutoMixedAnakin, // Automatically mix Fluid with Anakin. -}; - /* - * A simple Inference API for Paddle. Currently this API can be used by - * non-sequence scenerios. + * A simple Inference API for Paddle. */ class PaddlePredictor { public: @@ -120,26 +137,53 @@ struct NativeConfig : public PaddlePredictor::Config { // GPU related fields. bool use_gpu{false}; int device{0}; - float fraction_of_gpu_memory{-1.f}; // Negative to notify initialization. - // NOTE: NOT use it, just for the internal test, will discard later - bool _use_mkldnn{false}; - // Specify the variable's name of each input. - bool specify_input_name{false}; + float fraction_of_gpu_memory{-1.f}; // Change to a float in (0,1] if needed. + // Specify the exact path of program and parameter files. std::string prog_file; std::string param_file; + + // Specify the variable's name of each input if input tensors don't follow the + // `feeds` and `fetches` of the phase `save_inference_model`. 
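To make the ownership rules documented for PaddleBuf above concrete, here is a small hedged sketch of the two modes it describes, self-allocated versus externally owned memory; the buffer sizes are arbitrary:

```cpp
#include <cstdlib>

#include "paddle/fluid/inference/paddle_inference_api.h"

int main() {
  // Mode 1: let PaddleBuf allocate and own the memory.
  paddle::PaddleBuf owned(1024);  // allocates 1024 bytes internally
  owned.Resize(4096);             // grows to at least 4096 bytes

  // Mode 2: wrap user-allocated memory; PaddleBuf never frees it.
  void* external = std::malloc(256);
  paddle::PaddleBuf wrapped(external, 256);
  wrapped.Reset(external, 128);   // re-point to (part of) the same block

  bool ok = !owned.empty() && wrapped.length() == 128;
  std::free(external);            // the caller remains responsible for freeing
  return ok ? 0 : 1;
}
```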
+ bool specify_input_name{false}; }; -// Configurations for Anakin engine. -struct AnakinConfig : public PaddlePredictor::Config { - enum TargetType { NVGPU = 0, X86 }; - int device; - std::string model_file; - int max_batch_size{-1}; - TargetType target_type; +// A factory to help create different predictors. +// +// Usage: +// +// NativeConfig config; +// ... // change the configs. +// auto native_predictor = CreatePaddlePredictor(config); +// +// FOR EXTENSION DEVELOPER: +// Different predictors are designated by config type. Similar configs can be +// merged, but there shouldn't be a huge config containing different fields for +// more than one kind of predictors. +template +std::unique_ptr CreatePaddlePredictor(const ConfigT& config); + +// NOTE The following APIs are too trivial, we will discard it in the following +// versions. +enum class PaddleEngineKind { + kNative = 0, // Use the native Fluid facility. + kAutoMixedTensorRT, // Automatically mix Fluid with TensorRT. + kAnalysis, // More optimization. + kAnakin // Use Anakin for inference, not mature yet. }; -struct TensorRTConfig : public NativeConfig { +template +std::unique_ptr CreatePaddlePredictor(const ConfigT& config); + +// == +// +// ----------------------------------------------------------------------------------- +// NOTE: The following APIs are not mature yet, we are still working on them. + +namespace contrib { + +// Accelerate GPU computation with TensorRT engine. +struct MixedRTConfig : public NativeConfig { // Determine whether a subgraph will be executed by TRT. int min_subgraph_size{1}; // While TensorRT allows an engine optimized for a given max batch size @@ -154,7 +198,6 @@ struct TensorRTConfig : public NativeConfig { // NOTE WIP, not stable yet. struct AnalysisConfig : public NativeConfig { - // enum class IrPassMode { kSystem, // Use system default passes, not customize. kInclude, // Specify the passes in `ir_passes`. @@ -165,18 +208,21 @@ struct AnalysisConfig : public NativeConfig { IrPassMode ir_mode{IrPassMode::kExclude}; // attention lstm fuse works only on some specific models, disable as default. std::vector ir_passes{"attention_lstm_fuse_pass"}; + + // NOTE this is just for internal development, please not use it. + bool _use_mkldnn{false}; }; -// A factory to help create different predictors. -// -// FOR EXTENSION DEVELOPER: -// Different predictors are designated by config type and engine kind. Similar -// configs can be merged, but there shouldn't be a huge config containing -// different fields for more than one kind of predictors. -// -// Similarly, each engine kind should map to a unique predictor implementation. -template -std::unique_ptr CreatePaddlePredictor(const ConfigT& config); +// Configurations for Anakin engine. 
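A brief illustration of how the contrib configs above might be filled in; the model path, the extra IR pass name, and the numeric values are placeholders, and only fields that appear elsewhere in this patch are used:

```cpp
#include <string>
#include <vector>

#include "paddle/fluid/inference/paddle_inference_api.h"

int main() {
  // Exclude one more IR pass on top of the default exclusion list.
  paddle::contrib::AnalysisConfig analysis;
  analysis.model_dir = "./my_model";  // placeholder path
  analysis.ir_mode = paddle::contrib::AnalysisConfig::IrPassMode::kExclude;
  analysis.ir_passes.push_back("fc_fuse_pass");  // pass name is illustrative

  // Mixed Fluid + TensorRT execution keeps the NativeConfig fields and adds
  // subgraph controls such as min_subgraph_size.
  paddle::contrib::MixedRTConfig trt;
  trt.model_dir = "./my_model";
  trt.use_gpu = true;
  trt.fraction_of_gpu_memory = 0.3f;
  trt.min_subgraph_size = 3;

  auto analysis_predictor = paddle::CreatePaddlePredictor(analysis);
  auto trt_predictor = paddle::CreatePaddlePredictor(trt);
  return (analysis_predictor && trt_predictor) ? 0 : 1;
}
```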
+struct AnakinConfig : public PaddlePredictor::Config { + enum TargetType { NVGPU = 0, X86 }; + int device; + std::string model_file; + int max_batch_size{-1}; + TargetType target_type; +}; + +} // namespace contrib int PaddleDtypeSize(PaddleDType dtype); diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 508ef1ce40aa0882a0f39a85f97511fd9ea2a8a5..d7ab2ac980af2cf3bd9d95bfdbfa1887ef9a64d7 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -58,6 +58,11 @@ set(TEXT_CLASSIFICATION_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/text_classifi download_model_and_data(${TEXT_CLASSIFICATION_INSTALL_DIR} "text-classification-Senta.tar.gz" "text_classification_data.txt.tar.gz") inference_analysis_api_test(test_analyzer_text_classification ${TEXT_CLASSIFICATION_INSTALL_DIR} analyzer_text_classification_tester.cc) +# seq_conv1 +set(SEQ_CONV1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/seq_conv1") +download_model_and_data(${SEQ_CONV1_INSTALL_DIR} "seq_conv1_model.tar.gz" "seq_conv1_data.txt.tar.gz") +inference_analysis_api_test(test_analyzer_seq_conv1 ${SEQ_CONV1_INSTALL_DIR} analyzer_seq_conv1_tester.cc) + # ocr set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr") if (NOT EXISTS ${OCR_INSTALL_DIR}) diff --git a/paddle/fluid/inference/tests/api/anakin_mobilenet_tester.cc b/paddle/fluid/inference/tests/api/anakin_mobilenet_tester.cc index 62e820b68c79a47d963bb174663bfc8c4ac22de3..cf97f064beddb6ede1d4716f323b4c5b46cb266d 100644 --- a/paddle/fluid/inference/tests/api/anakin_mobilenet_tester.cc +++ b/paddle/fluid/inference/tests/api/anakin_mobilenet_tester.cc @@ -22,10 +22,10 @@ DEFINE_string(model, "", "Directory of the inference model(mobile_v2)."); namespace paddle { -AnakinConfig GetConfig() { - AnakinConfig config; +contrib::AnakinConfig GetConfig() { + contrib::AnakinConfig config; // using AnakinConfig::X86 if you need to use cpu to do inference - config.target_type = AnakinConfig::NVGPU; + config.target_type = contrib::AnakinConfig::NVGPU; config.model_file = FLAGS_model; config.device = 0; config.max_batch_size = 1; @@ -33,9 +33,10 @@ AnakinConfig GetConfig() { } TEST(inference, anakin) { - AnakinConfig config = GetConfig(); + auto config = GetConfig(); auto predictor = - CreatePaddlePredictor(config); + CreatePaddlePredictor( + config); float data[1 * 3 * 224 * 224] = {1.0f}; PaddleTensor tensor; diff --git a/paddle/fluid/inference/tests/api/anakin_rnn1_tester.cc b/paddle/fluid/inference/tests/api/anakin_rnn1_tester.cc index 98c74aaa562dce6618ccde8f11f4344eefd59ef2..82bc83988de688e46613e160b66943c89c4a0391 100644 --- a/paddle/fluid/inference/tests/api/anakin_rnn1_tester.cc +++ b/paddle/fluid/inference/tests/api/anakin_rnn1_tester.cc @@ -97,10 +97,10 @@ void Data::get_batch_data( namespace paddle { -AnakinConfig GetConfig() { - AnakinConfig config; +contrib::AnakinConfig GetConfig() { + contrib::AnakinConfig config; // using AnakinConfig::X86 if you need to use cpu to do inference - config.target_type = AnakinConfig::X86; + config.target_type = contrib::AnakinConfig::X86; config.model_file = FLAGS_model; config.device = 0; config.max_batch_size = 1000; // the max number of token @@ -121,9 +121,10 @@ void set_tensor(std::string name, std::vector shape, } void single_test() { - AnakinConfig config = GetConfig(); + auto config = GetConfig(); auto predictor = - CreatePaddlePredictor(config); + CreatePaddlePredictor( + config); int max_batch_size = 1000; std::string 
feature_file = FLAGS_datapath; diff --git a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc index 8cf230a51d05c3a141f7cfd4e30bf30f064f0989..59020545cd609961487cafc4a08c20951a02c8ce 100644 --- a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc @@ -95,7 +95,7 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, } } -void SetConfig(AnalysisConfig *cfg) { +void SetConfig(contrib::AnalysisConfig *cfg) { cfg->prog_file = FLAGS_infer_model + "/__model__"; cfg->param_file = FLAGS_infer_model + "/param"; cfg->use_gpu = false; @@ -117,7 +117,7 @@ void SetInput(std::vector> *inputs) { // Easy for profiling independently. TEST(Analyzer_Chinese_ner, profile) { - AnalysisConfig cfg; + contrib::AnalysisConfig cfg; SetConfig(&cfg); std::vector outputs; @@ -141,7 +141,7 @@ TEST(Analyzer_Chinese_ner, profile) { // Check the fuse status TEST(Analyzer_Chinese_ner, fuse_statis) { - AnalysisConfig cfg; + contrib::AnalysisConfig cfg; SetConfig(&cfg); int num_ops; @@ -155,7 +155,7 @@ TEST(Analyzer_Chinese_ner, fuse_statis) { // Compare result of NativeConfig and AnalysisConfig TEST(Analyzer_Chinese_ner, compare) { - AnalysisConfig cfg; + contrib::AnalysisConfig cfg; SetConfig(&cfg); std::vector> input_slots_all; diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc index 14bdf76efc71b326bd130858ea246be81c9bd45c..3bf5383d8f35347c767d6caee83e0dcc5fb0a446 100644 --- a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc @@ -149,7 +149,7 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, } } -void SetConfig(AnalysisConfig *cfg) { +void SetConfig(contrib::AnalysisConfig *cfg) { cfg->prog_file = FLAGS_infer_model + "/__model__"; cfg->param_file = FLAGS_infer_model + "/param"; cfg->use_gpu = false; @@ -172,7 +172,7 @@ void SetInput(std::vector> *inputs) { // Easy for profiling independently. TEST(Analyzer_rnn1, profile) { - AnalysisConfig cfg; + contrib::AnalysisConfig cfg; SetConfig(&cfg); std::vector outputs; @@ -183,7 +183,7 @@ TEST(Analyzer_rnn1, profile) { // Check the fuse status TEST(Analyzer_rnn1, fuse_statis) { - AnalysisConfig cfg; + contrib::AnalysisConfig cfg; SetConfig(&cfg); int num_ops; @@ -198,7 +198,7 @@ TEST(Analyzer_rnn1, fuse_statis) { // Compare result of NativeConfig and AnalysisConfig TEST(Analyzer_rnn1, compare) { - AnalysisConfig cfg; + contrib::AnalysisConfig cfg; SetConfig(&cfg); std::vector> input_slots_all; @@ -208,7 +208,7 @@ TEST(Analyzer_rnn1, compare) { // Test Multi-Thread. TEST(Analyzer_rnn1, multi_thread) { - AnalysisConfig cfg; + contrib::AnalysisConfig cfg; SetConfig(&cfg); std::vector outputs; diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..2f71ed46ffc9fd5f853f5b5b46de1446d28b9e69 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc @@ -0,0 +1,199 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { + +struct DataRecord { + std::vector> title1_all, title2_all, title3_all, l1_all; + std::vector> title1, title2, title3, l1; + std::vector title1_lod, title2_lod, title3_lod, l1_lod; + size_t batch_iter{0}; + size_t batch_size{1}; + size_t num_samples; // total number of samples + DataRecord() = default; + explicit DataRecord(const std::string &path, int batch_size = 1) + : batch_size(batch_size) { + Load(path); + } + DataRecord NextBatch() { + DataRecord data; + size_t batch_end = batch_iter + batch_size; + // NOTE skip the final batch, if no enough data is provided. + if (batch_end <= title1_all.size()) { + data.title1_all.assign(title1_all.begin() + batch_iter, + title1_all.begin() + batch_end); + data.title2_all.assign(title2_all.begin() + batch_iter, + title2_all.begin() + batch_end); + data.title3_all.assign(title3_all.begin() + batch_iter, + title3_all.begin() + batch_end); + data.l1_all.assign(l1_all.begin() + batch_iter, + l1_all.begin() + batch_end); + // Prepare LoDs + data.title1_lod.push_back(0); + data.title2_lod.push_back(0); + data.title3_lod.push_back(0); + data.l1_lod.push_back(0); + CHECK(!data.title1_all.empty()); + CHECK(!data.title2_all.empty()); + CHECK(!data.title3_all.empty()); + CHECK(!data.l1_all.empty()); + CHECK_EQ(data.title1_all.size(), data.title2_all.size()); + CHECK_EQ(data.title1_all.size(), data.title3_all.size()); + CHECK_EQ(data.title1_all.size(), data.l1_all.size()); + for (size_t j = 0; j < data.title1_all.size(); j++) { + data.title1.push_back(data.title1_all[j]); + data.title2.push_back(data.title2_all[j]); + data.title3.push_back(data.title3_all[j]); + data.l1.push_back(data.l1_all[j]); + // calculate lod + data.title1_lod.push_back(data.title1_lod.back() + + data.title1_all[j].size()); + data.title2_lod.push_back(data.title2_lod.back() + + data.title2_all[j].size()); + data.title3_lod.push_back(data.title3_lod.back() + + data.title3_all[j].size()); + data.l1_lod.push_back(data.l1_lod.back() + data.l1_all[j].size()); + } + } + batch_iter += batch_size; + return data; + } + void Load(const std::string &path) { + std::ifstream file(path); + std::string line; + int num_lines = 0; + while (std::getline(file, line)) { + num_lines++; + std::vector data; + split(line, '\t', &data); + // load title1 data + std::vector title1_data; + split_to_int64(data[0], ' ', &title1_data); + // load title2 data + std::vector title2_data; + split_to_int64(data[1], ' ', &title2_data); + // load title3 data + std::vector title3_data; + split_to_int64(data[2], ' ', &title3_data); + // load l1 data + std::vector l1_data; + split_to_int64(data[3], ' ', &l1_data); + title1_all.push_back(std::move(title1_data)); + title2_all.push_back(std::move(title2_data)); + title3_all.push_back(std::move(title3_data)); + l1_all.push_back(std::move(l1_data)); + } + num_samples = num_lines; + } +}; + +void PrepareInputs(std::vector *input_slots, DataRecord *data, + int batch_size) { + PaddleTensor title1_tensor, title2_tensor, title3_tensor, l1_tensor; + 
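The LoD vectors assembled in DataRecord::NextBatch above are cumulative offsets over the flattened batch. A tiny self-contained illustration of that convention (toy data, not the tester's real input):

```cpp
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

// Toy illustration: a level-0 LoD is the running sum of sequence lengths, so
// three sequences of lengths 3, 5 and 2 become {0, 3, 8, 10}.
int main() {
  std::vector<std::vector<int64_t>> batch = {{7, 8, 9}, {1, 2, 3, 4, 5}, {6, 6}};
  std::vector<size_t> lod = {0};
  size_t total = 0;
  for (const auto &seq : batch) {
    total += seq.size();
    lod.push_back(lod.back() + seq.size());  // cumulative offset, as in NextBatch
  }
  assert((lod == std::vector<size_t>{0, 3, 8, 10}));
  // The matching flattened tensor then has shape {total, 1}, i.e. {10, 1} here.
  assert(total == 10);
  return 0;
}
```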
title1_tensor.name = "title1"; + title2_tensor.name = "title2"; + title3_tensor.name = "title3"; + l1_tensor.name = "l1"; + auto one_batch = data->NextBatch(); + int title1_size = one_batch.title1_lod[one_batch.title1_lod.size() - 1]; + title1_tensor.shape.assign({title1_size, 1}); + title1_tensor.lod.assign({one_batch.title1_lod}); + int title2_size = one_batch.title2_lod[one_batch.title2_lod.size() - 1]; + title2_tensor.shape.assign({title2_size, 1}); + title2_tensor.lod.assign({one_batch.title2_lod}); + int title3_size = one_batch.title3_lod[one_batch.title3_lod.size() - 1]; + title3_tensor.shape.assign({title3_size, 1}); + title3_tensor.lod.assign({one_batch.title3_lod}); + int l1_size = one_batch.l1_lod[one_batch.l1_lod.size() - 1]; + l1_tensor.shape.assign({l1_size, 1}); + l1_tensor.lod.assign({one_batch.l1_lod}); + + // assign data + TensorAssignData(&title1_tensor, one_batch.title1); + TensorAssignData(&title2_tensor, one_batch.title2); + TensorAssignData(&title3_tensor, one_batch.title3); + TensorAssignData(&l1_tensor, one_batch.l1); + // Set inputs. + input_slots->assign({title1_tensor, title2_tensor, title3_tensor, l1_tensor}); + for (auto &tensor : *input_slots) { + tensor.dtype = PaddleDType::INT64; + } +} + +void SetConfig(AnalysisConfig *cfg) { + cfg->model_dir = FLAGS_infer_model; + cfg->use_gpu = false; + cfg->device = 0; + cfg->specify_input_name = true; + cfg->enable_ir_optim = true; +} + +void SetInput(std::vector> *inputs) { + DataRecord data(FLAGS_infer_data, FLAGS_batch_size); + std::vector input_slots; + int epoch = FLAGS_test_all_data ? data.num_samples / FLAGS_batch_size : 1; + LOG(INFO) << "number of samples: " << epoch * FLAGS_batch_size; + for (int bid = 0; bid < epoch; ++bid) { + PrepareInputs(&input_slots, &data, FLAGS_batch_size); + (*inputs).emplace_back(input_slots); + } +} + +// Easy for profiling independently. +TEST(Analyzer_seq_conv1, profile) { + AnalysisConfig cfg; + SetConfig(&cfg); + std::vector outputs; + + std::vector> input_slots_all; + SetInput(&input_slots_all); + TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads); + + if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) { + // the first inference result + PADDLE_ENFORCE_EQ(outputs.size(), 1UL); + size_t size = GetSize(outputs[0]); + PADDLE_ENFORCE_GT(size, 0); + float *result = static_cast(outputs[0].data.data()); + // output is probability, which is in (0, 1). 
+ for (size_t i = 0; i < size; i++) { + EXPECT_GT(result[i], 0); + EXPECT_LT(result[i], 1); + } + } +} + +// Check the fuse status +TEST(Analyzer_seq_conv1, fuse_statis) { + AnalysisConfig cfg; + SetConfig(&cfg); + int num_ops; + auto fuse_statis = GetFuseStatis(cfg, &num_ops); +} + +// Compare result of NativeConfig and AnalysisConfig +TEST(Analyzer_seq_conv1, compare) { + AnalysisConfig cfg; + SetConfig(&cfg); + + std::vector> input_slots_all; + SetInput(&input_slots_all); + CompareNativeAndAnalysis(cfg, input_slots_all); +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index 384a40a3f992d1a9734e3189b422be0ce6adb938..9fcb5129d268a7730c11e5910077ad233050484e 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -38,6 +38,8 @@ DEFINE_bool(use_analysis, true, namespace paddle { namespace inference { +using contrib::AnalysisConfig; + void CompareResult(const std::vector &outputs, const std::vector &ref_outputs) { EXPECT_GT(outputs.size(), 0UL); @@ -45,11 +47,8 @@ void CompareResult(const std::vector &outputs, for (size_t i = 0; i < outputs.size(); i++) { auto &out = outputs[i]; auto &ref_out = ref_outputs[i]; - size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1, - [](int a, int b) { return a * b; }); - size_t ref_size = - std::accumulate(ref_out.shape.begin(), ref_out.shape.end(), 1, - [](int a, int b) { return a * b; }); + size_t size = VecReduceToInt(out.shape); + size_t ref_size = VecReduceToInt(ref_out.shape); EXPECT_GT(size, 0); EXPECT_EQ(size, ref_size); EXPECT_EQ(out.dtype, ref_out.dtype); @@ -74,25 +73,22 @@ void CompareResult(const std::vector &outputs, } } -std::unique_ptr GetPrediction(AnalysisConfig config, - bool use_analysis = true) { +std::unique_ptr CreateTestPredictor( + const AnalysisConfig &config, bool use_analysis = true) { if (use_analysis) { - return CreatePaddlePredictor( - config); + return CreatePaddlePredictor(config); } else { return CreatePaddlePredictor( config); } } -size_t GetSize(const PaddleTensor &out) { - return std::accumulate(out.shape.begin(), out.shape.end(), 1, - [](int a, int b) { return a * b; }); -} +size_t GetSize(const PaddleTensor &out) { return VecReduceToInt(out.shape); } std::unordered_map GetFuseStatis(AnalysisConfig config, int *num_ops) { - auto predictor = GetPrediction(config); + auto predictor = CreateTestPredictor(config); AnalysisPredictor *analysis_predictor = dynamic_cast(predictor.get()); auto &fuse_statis = analysis_predictor->analysis_argument() @@ -113,11 +109,12 @@ std::unordered_map GetFuseStatis(AnalysisConfig config, } void TestOneThreadPrediction( - AnalysisConfig config, const std::vector> inputs, + const AnalysisConfig &config, + const std::vector> &inputs, std::vector *outputs, bool use_analysis = true) { int batch_size = FLAGS_batch_size; int num_times = FLAGS_repeat; - auto predictor = GetPrediction(config, use_analysis); + auto predictor = CreateTestPredictor(config, use_analysis); Timer timer; timer.tic(); for (int i = 0; i < num_times; i++) { @@ -130,7 +127,8 @@ void TestOneThreadPrediction( } void TestMultiThreadPrediction( - AnalysisConfig config, const std::vector> inputs, + const AnalysisConfig &config, + const std::vector> &inputs, std::vector *outputs, int num_threads, bool use_analysis = true) { int batch_size = FLAGS_batch_size; @@ -140,7 +138,7 @@ void TestMultiThreadPrediction( // TODO(yanchunwei): Bug here, 
the analyzer phase can't be parallelled // because AttentionLSTM's hard code nodeid will be damanged. for (int tid = 0; tid < num_threads; ++tid) { - predictors.emplace_back(GetPrediction(config, use_analysis)); + predictors.emplace_back(CreateTestPredictor(config, use_analysis)); } for (int tid = 0; tid < num_threads; ++tid) { threads.emplace_back([&, tid]() { @@ -164,8 +162,8 @@ void TestMultiThreadPrediction( } } -void TestPrediction(AnalysisConfig config, - const std::vector> inputs, +void TestPrediction(const AnalysisConfig &config, + const std::vector> &inputs, std::vector *outputs, int num_threads, bool use_analysis = FLAGS_use_analysis) { LOG(INFO) << "use_analysis: " << use_analysis; @@ -178,8 +176,8 @@ void TestPrediction(AnalysisConfig config, } void CompareNativeAndAnalysis( - AnalysisConfig config, - const std::vector> inputs) { + const AnalysisConfig &config, + const std::vector> &inputs) { std::vector native_outputs, analysis_outputs; TestOneThreadPrediction(config, inputs, &native_outputs, false); TestOneThreadPrediction(config, inputs, &analysis_outputs, true); diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 286b03d7b7d11a50f33f0190c1a5b9097ed0f4a2..c091476d6d132db17a656d5c8dee65e3a88d9ac2 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/fluid/operators/activation_op.h" #include #include "paddle/fluid/operators/mkldnn_activation_op.h" +#include "paddle/fluid/platform/port.h" namespace paddle { namespace operators { @@ -105,105 +106,105 @@ class ActivationOpGrad : public framework::OperatorWithKernel { } }; -__attribute__((unused)) constexpr char SigmoidDoc[] = R"DOC( +UNUSED constexpr char SigmoidDoc[] = R"DOC( Sigmoid Activation Operator $$out = \frac{1}{1 + e^{-x}}$$ )DOC"; -__attribute__((unused)) constexpr char LogSigmoidDoc[] = R"DOC( +UNUSED constexpr char LogSigmoidDoc[] = R"DOC( Logsigmoid Activation Operator $$out = \\log \\frac{1}{1 + e^{-x}}$$ )DOC"; -__attribute__((unused)) constexpr char ExpDoc[] = R"DOC( +UNUSED constexpr char ExpDoc[] = R"DOC( Exp Activation Operator. $out = e^x$ )DOC"; -__attribute__((unused)) constexpr char ReluDoc[] = R"DOC( +UNUSED constexpr char ReluDoc[] = R"DOC( Relu Activation Operator. $out = \max(x, 0)$ )DOC"; -__attribute__((unused)) constexpr char TanhDoc[] = R"DOC( +UNUSED constexpr char TanhDoc[] = R"DOC( Tanh Activation Operator. $$out = \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$ )DOC"; -__attribute__((unused)) constexpr char TanhShrinkDoc[] = R"DOC( +UNUSED constexpr char TanhShrinkDoc[] = R"DOC( TanhShrink Activation Operator. $$out = x - \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$ )DOC"; -__attribute__((unused)) constexpr char SqrtDoc[] = R"DOC( +UNUSED constexpr char SqrtDoc[] = R"DOC( Sqrt Activation Operator. $out = \sqrt{x}$ )DOC"; -__attribute__((unused)) constexpr char AbsDoc[] = R"DOC( +UNUSED constexpr char AbsDoc[] = R"DOC( Abs Activation Operator. $out = |x|$ )DOC"; -__attribute__((unused)) constexpr char CeilDoc[] = R"DOC( +UNUSED constexpr char CeilDoc[] = R"DOC( Ceil Activation Operator. $out = ceil(x)$ )DOC"; -__attribute__((unused)) constexpr char FloorDoc[] = R"DOC( +UNUSED constexpr char FloorDoc[] = R"DOC( Floor Activation Operator. $out = floor(x)$ )DOC"; -__attribute__((unused)) constexpr char CosDoc[] = R"DOC( +UNUSED constexpr char CosDoc[] = R"DOC( Cosine Activation Operator. 
$out = cos(x)$ )DOC"; -__attribute__((unused)) constexpr char SinDoc[] = R"DOC( +UNUSED constexpr char SinDoc[] = R"DOC( Sine Activation Operator. $out = sin(x)$ )DOC"; -__attribute__((unused)) constexpr char RoundDoc[] = R"DOC( +UNUSED constexpr char RoundDoc[] = R"DOC( Round Activation Operator. $out = [x]$ )DOC"; -__attribute__((unused)) constexpr char ReciprocalDoc[] = R"DOC( +UNUSED constexpr char ReciprocalDoc[] = R"DOC( Reciprocal Activation Operator. $$out = \\frac{1}{x}$$ )DOC"; -__attribute__((unused)) constexpr char LogDoc[] = R"DOC( +UNUSED constexpr char LogDoc[] = R"DOC( Log Activation Operator. $out = \ln(x)$ @@ -212,21 +213,21 @@ Natural logarithm of x. )DOC"; -__attribute__((unused)) constexpr char SquareDoc[] = R"DOC( +UNUSED constexpr char SquareDoc[] = R"DOC( Square Activation Operator. $out = x^2$ )DOC"; -__attribute__((unused)) constexpr char SoftplusDoc[] = R"DOC( +UNUSED constexpr char SoftplusDoc[] = R"DOC( Softplus Activation Operator. $out = \ln(1 + e^{x})$ )DOC"; -__attribute__((unused)) constexpr char SoftsignDoc[] = R"DOC( +UNUSED constexpr char SoftsignDoc[] = R"DOC( Softsign Activation Operator. $$out = \frac{x}{1 + |x|}$$ diff --git a/paddle/fluid/operators/adam_op.h b/paddle/fluid/operators/adam_op.h index 5b27068c9e805146b8bce03f4f676ef0d4d16c53..4cb1f3a80e95bdda79e6451dc3cc87e899b11779 100644 --- a/paddle/fluid/operators/adam_op.h +++ b/paddle/fluid/operators/adam_op.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include // for sqrt in CPU and CUDA #include +#include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/detail/safe_ref.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" @@ -306,26 +307,43 @@ class AdamOpKernel : public framework::OpKernel { VLOG(3) << "grad row size is 0!!"; return; } - // merge duplicated rows if any. - // The rows of grad_merge have been sorted inside MergeAdd functor - scatter::MergeAdd merge_func; - auto& grad_merge = *(ctx.scope() - .NewScope() - .Var("sparse_adam_grad_merge") - ->GetMutable()); - merge_func(ctx.template device_context(), grad, - &grad_merge); + + std::vector cpu_rows(grad.rows().begin(), grad.rows().end()); + bool is_strict_sorted = true; + for (size_t i = 1; i < cpu_rows.size(); ++i) { + if (cpu_rows[i - 1] >= cpu_rows[i]) { + is_strict_sorted = false; + break; + } + } + + const framework::SelectedRows* grad_merge_ptr; + if (is_strict_sorted) { + grad_merge_ptr = &grad; + } else { + // merge duplicated rows if any. + // The rows of grad_merge have been sorted inside MergeAdd functor + scatter::MergeAdd merge_func; + auto* grad_merge_var = const_cast(ctx.scope()) + .Var() + ->GetMutable(); + merge_func(ctx.template device_context(), grad, + grad_merge_var); + grad_merge_ptr = grad_merge_var; + } + + auto& grad_merge = *grad_merge_ptr; auto& grad_tensor = grad_merge.value(); const T* grad_data = grad_tensor.template data(); - int64_t* rows = nullptr; -// When compiled without CUDA, the CUDAMutableData() interface should not be + const int64_t* rows = nullptr; +// When compiled without CUDA, the CUDAData() interface should not be // provided. 
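The adam_op.h hunk above decides whether MergeAdd can be skipped by checking that the sparse gradient's row indices are strictly increasing, i.e. already sorted and free of duplicates. A standalone sketch of that predicate (IsStrictlySorted is a hypothetical helper mirroring the is_strict_sorted loop, not a Paddle API):

    #include <cstdint>
    #include <iostream>
    #include <vector>

    // True when rows are strictly increasing, so a merge pass would be a no-op.
    bool IsStrictlySorted(const std::vector<int64_t> &rows) {
      for (size_t i = 1; i < rows.size(); ++i) {
        if (rows[i - 1] >= rows[i]) return false;
      }
      return true;
    }

    int main() {
      std::cout << IsStrictlySorted({1, 3, 7}) << '\n';     // 1 -> use grad directly
      std::cout << IsStrictlySorted({1, 3, 3, 7}) << '\n';  // 0 -> merge duplicate rows first
      return 0;
    }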
#if defined(PADDLE_WITH_CUDA) if (platform::is_gpu_place(ctx.GetPlace())) { - rows = grad_merge.mutable_rows()->CUDAMutableData(ctx.GetPlace()); + rows = grad_merge.rows().CUDAData(ctx.GetPlace()); } else { #endif - rows = grad_merge.mutable_rows()->data(); + rows = grad_merge.rows().data(); #if defined(PADDLE_WITH_CUDA) } diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc index bc58612f9d3a2b433f362787135b6bb23b203f63..57817da71adfd80faad29a48b05ba2f326de6c07 100644 --- a/paddle/fluid/operators/concat_op.cc +++ b/paddle/fluid/operators/concat_op.cc @@ -94,8 +94,20 @@ class ConcatOpGrad : public framework::OperatorWithKernel { : OperatorWithKernel(type, inputs, outputs, attrs) {} void InferShape(framework::InferShapeContext *ctx) const override { - ctx->SetOutputsDim(framework::GradVarName("X"), ctx->GetInputsDim("X")); - ctx->ShareLoD("X", framework::GradVarName("X")); + auto in_x = "X"; + auto out_x_g_n = framework::GradVarName(in_x); + ctx->SetOutputsDim(out_x_g_n, ctx->GetInputsDim(in_x)); + auto &in_names = ctx->Inputs(in_x); + auto &out_names = ctx->Outputs(out_x_g_n); + PADDLE_ENFORCE_EQ( + in_names.size(), out_names.size(), + "The number of arguments in %s[%d] and %s[%d] is not equal.", in_x, + in_names.size(), out_x_g_n, out_names.size()); + for (size_t i = 0; i < in_names.size(); ++i) { + if (out_names[i] != framework::kEmptyVarName) { + ctx->ShareLoD(in_x, out_x_g_n, i, i); + } + } } }; diff --git a/paddle/fluid/operators/detection/roi_perspective_transform_op.cc b/paddle/fluid/operators/detection/roi_perspective_transform_op.cc index b98190d40a2afa684cfd29cc52fc29fac851cca7..4cc980b41b34894f9d915d4b325887548091c0eb 100644 --- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cc +++ b/paddle/fluid/operators/detection/roi_perspective_transform_op.cc @@ -23,8 +23,6 @@ namespace operators { using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; -static constexpr int kROISize = 4; - template bool GT_E(T a, T b) { return (a > b) || fabs(a - b) < 1e-4; diff --git a/paddle/fluid/operators/detection_map_op.h b/paddle/fluid/operators/detection_map_op.h index dd5d138a1e979826d59c4731920379b030e3b492..dd1ab85fd8d0c8170afcd9dd2a49ee55c41dc8be 100644 --- a/paddle/fluid/operators/detection_map_op.h +++ b/paddle/fluid/operators/detection_map_op.h @@ -76,8 +76,8 @@ class DetectionMAPOpKernel : public framework::OpKernel { auto ap_type = GetAPType(ctx.Attr("ap_type")); int class_num = ctx.Attr("class_num"); - auto& label_lod = in_label->lod(); - auto& detect_lod = in_detect->lod(); + auto label_lod = in_label->lod(); + auto detect_lod = in_detect->lod(); PADDLE_ENFORCE_EQ(label_lod.size(), 1UL, "Only support one level sequence now."); PADDLE_ENFORCE_EQ(label_lod[0].size(), detect_lod[0].size(), @@ -166,11 +166,11 @@ class DetectionMAPOpKernel : public framework::OpKernel { auto labels = framework::EigenTensor::From(input_label); auto detect = framework::EigenTensor::From(input_detect); - auto& label_lod = input_label.lod(); - auto& detect_lod = input_detect.lod(); + auto label_lod = input_label.lod(); + auto detect_lod = input_detect.lod(); int batch_size = label_lod[0].size() - 1; - auto& label_index = label_lod[0]; + auto label_index = label_lod[0]; for (int n = 0; n < batch_size; ++n) { std::map> boxes; @@ -274,6 +274,7 @@ class DetectionMAPOpKernel : public framework::OpKernel { output_true_pos->set_lod(true_pos_lod); output_false_pos->set_lod(false_pos_lod); + return; } void GetInputPos(const framework::Tensor& 
input_pos_count, @@ -291,7 +292,7 @@ class DetectionMAPOpKernel : public framework::OpKernel { auto SetData = [](const framework::LoDTensor& pos_tensor, std::map>>& pos) { const T* pos_data = pos_tensor.data(); - auto& pos_data_lod = pos_tensor.lod()[0]; + auto pos_data_lod = pos_tensor.lod()[0]; for (size_t i = 0; i < pos_data_lod.size() - 1; ++i) { for (size_t j = pos_data_lod[i]; j < pos_data_lod[i + 1]; ++j) { T score = pos_data[j * 2]; @@ -316,23 +317,20 @@ class DetectionMAPOpKernel : public framework::OpKernel { std::map>>* false_pos) const { int batch_size = gt_boxes.size(); for (int n = 0; n < batch_size; ++n) { - auto& image_gt_boxes = gt_boxes[n]; - for (auto& image_gt_box : image_gt_boxes) { + auto image_gt_boxes = gt_boxes[n]; + for (auto it = image_gt_boxes.begin(); it != image_gt_boxes.end(); ++it) { size_t count = 0; - auto& labeled_bboxes = image_gt_box.second; + auto labeled_bboxes = it->second; if (evaluate_difficult) { count = labeled_bboxes.size(); } else { - for (auto& box : labeled_bboxes) { - if (!box.is_difficult) { - ++count; - } - } + for (size_t i = 0; i < labeled_bboxes.size(); ++i) + if (!(labeled_bboxes[i].is_difficult)) ++count; } if (count == 0) { continue; } - int label = image_gt_box.first; + int label = it->first; if (label_pos_count->find(label) == label_pos_count->end()) { (*label_pos_count)[label] = count; } else { diff --git a/paddle/fluid/operators/distributed/variable_response.cc b/paddle/fluid/operators/distributed/variable_response.cc index 1617cc1b95216b118cf2c2122dbe8b6c106554c3..c4854d50b6371064003a10e18efc9e5f160d9a42 100644 --- a/paddle/fluid/operators/distributed/variable_response.cc +++ b/paddle/fluid/operators/distributed/variable_response.cc @@ -92,9 +92,14 @@ bool VariableResponse::CopyLodTensorData( ::google::protobuf::io::CodedInputStream* input, const platform::DeviceContext& ctx, const framework::DDim& dims, int length) { + auto server_var = GetVar(); + if (!server_var) { + LOG(ERROR) << "recved var should not on current server: " + << meta_.varname(); + return false; + } auto* tensor = GetVar()->GetMutable(); tensor->Resize(dims); - framework::LoD lod; for (int i = 0; i < meta_.lod_level(); ++i) { framework::Vector v; @@ -107,7 +112,6 @@ bool VariableResponse::CopyLodTensorData( void* tensor_data = tensor->mutable_data(ctx.GetPlace(), ToTypeIndex(meta_.data_type())); - if (!ReadRaw(input, ctx, tensor->place(), tensor_data, length)) { return false; } diff --git a/paddle/fluid/operators/extract_rows_op.cc b/paddle/fluid/operators/extract_rows_op.cc index 3acae3bcdf4a509ab6e7e19f21c4b2ec4d72b7d7..9a297d03cfb041e584159a5fc5ba214f8ac404b4 100644 --- a/paddle/fluid/operators/extract_rows_op.cc +++ b/paddle/fluid/operators/extract_rows_op.cc @@ -50,7 +50,7 @@ class ExtractRowsOp : public framework::OperatorBase { auto &in = scope.FindVar(Input("X"))->Get(); auto out = scope.FindVar(Output("Out"))->GetMutable(); - auto &in_rows = in.rows(); + auto in_rows = in.rows(); auto out_dim = framework::make_ddim( std::vector{static_cast(in_rows.size()), 1}); auto dst_ptr = out->mutable_data(out_dim, in.place()); diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu index ae51a53a7197950338ef773d63103fa13bf0a5f5..b27880c232a51d32777569cf9ac67656ce02f232 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cu +++ b/paddle/fluid/operators/math/selected_rows_functor.cu @@ -60,9 +60,11 @@ struct SelectedRowsAdd { auto out_place = context.GetPlace(); 
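For context on the SelectedRows functors touched in this hunk: a SelectedRows value pairs a list of row indices with a dense value block, and "add to tensor" is a scatter-add over those rows. A minimal CPU sketch under that assumption; SparseRows and AddToDense are hypothetical stand-ins, not the Paddle types:

    #include <cstdint>
    #include <iostream>
    #include <vector>

    // value row i belongs to dense row rows[i]; value is row-major, row_numel wide.
    struct SparseRows {
      std::vector<int64_t> rows;
      std::vector<float> value;
      int64_t row_numel;
    };

    // Scatter-add into a dense tensor of shape [height, row_numel].
    void AddToDense(const SparseRows &in, std::vector<float> *dense) {
      for (size_t i = 0; i < in.rows.size(); ++i) {
        for (int64_t j = 0; j < in.row_numel; ++j) {
          (*dense)[in.rows[i] * in.row_numel + j] += in.value[i * in.row_numel + j];
        }
      }
    }

    int main() {
      SparseRows grad{{0, 2}, {1.f, 1.f, 2.f, 2.f}, 2};
      std::vector<float> dense(3 * 2, 0.f);  // height = 3
      AddToDense(grad, &dense);
      for (float v : dense) std::cout << v << ' ';  // 1 1 0 0 2 2
      std::cout << '\n';
      return 0;
    }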
PADDLE_ENFORCE(platform::is_gpu_place(out_place)); - memory::Copy(boost::get(out_place), out_data, - boost::get(in1_place), in1_data, - in1_value.numel() * sizeof(T), context.stream()); + memory::Copy( + boost::get(out_place), out_data, + boost::get(in1_place), in1_data, + in1_value.numel() * sizeof(T), + reinterpret_cast(context).stream()); auto* in2_data = in2_value.data(); memory::Copy(boost::get(out_place), @@ -107,7 +109,7 @@ struct SelectedRowsAddTensor { PADDLE_ENFORCE_EQ(in1_height, out_dims[0]); auto& in1_value = input1.value(); - framework::Vector in1_rows(input1.rows()); + auto& in1_rows = input1.rows(); int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); PADDLE_ENFORCE_EQ(in1_row_numel, input2.numel() / in1_height); @@ -146,7 +148,7 @@ struct SelectedRowsAddTo { auto in1_height = input1.height(); PADDLE_ENFORCE_EQ(in1_height, input2->height()); - auto& in1_rows = input1.rows(); + framework::Vector in1_rows(input1.rows()); auto& in2_rows = *(input2->mutable_rows()); auto& in1_value = input1.value(); @@ -206,7 +208,7 @@ struct SelectedRowsAddToTensor { PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); auto& in1_value = input1.value(); - framework::Vector in1_rows(input1.rows()); + auto& in1_rows = input1.rows(); int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); diff --git a/paddle/fluid/operators/math/selected_rows_functor_test.cu b/paddle/fluid/operators/math/selected_rows_functor_test.cu index e89b27855bdeba3a5189feff94eb063ddfb9b9b8..5fc50aba25d8e69480a17f0f80877b0d03e17276 100644 --- a/paddle/fluid/operators/math/selected_rows_functor_test.cu +++ b/paddle/fluid/operators/math/selected_rows_functor_test.cu @@ -20,7 +20,9 @@ limitations under the License. */ TEST(selected_rows_functor, gpu_add) { paddle::platform::CUDAPlace gpu_place(0); paddle::platform::CPUPlace cpu_place; - paddle::platform::CUDADeviceContext ctx(gpu_place); + paddle::platform::CUDADeviceContext& ctx = + *reinterpret_cast( + paddle::platform::DeviceContextPool::Instance().Get(gpu_place)); paddle::operators::math::SetConstant functor; @@ -132,7 +134,9 @@ TEST(selected_rows_functor, gpu_add) { TEST(selected_rows_functor, gpu_add_to) { paddle::platform::CUDAPlace gpu_place(0); paddle::platform::CPUPlace cpu_place; - paddle::platform::CUDADeviceContext ctx(gpu_place); + paddle::platform::CUDADeviceContext& ctx = + *reinterpret_cast( + paddle::platform::DeviceContextPool::Instance().Get(gpu_place)); paddle::operators::math::SetConstant functor; diff --git a/paddle/fluid/operators/pool_mkldnn_op.cc b/paddle/fluid/operators/pool_mkldnn_op.cc index 5341187d1ce9400ac34750ab691608e76158ae0d..56cef91e29cc7da27384c27a7ec63e90cfadfc3b 100644 --- a/paddle/fluid/operators/pool_mkldnn_op.cc +++ b/paddle/fluid/operators/pool_mkldnn_op.cc @@ -46,6 +46,25 @@ static std::string gethash(const memory::dims& input_dims, dims2str(paddings) + pooling_type + suffix; } +static inline int ComputeCeiledOutput(int input_size, int kernel_size, + int padding, int stride) { + return (input_size - kernel_size + 2 * padding) / stride + 1; +} + +static inline void CorrectOutputSize( + const std::vector& src_tz, const std::vector& dst_tz, + const std::vector& kernel_size, const std::vector& paddings, + const std::vector& strides, + std::vector& right_bot_padding) { // NOLINT + for (size_t i = 0; i < right_bot_padding.size(); i++) { + int desired_size = ComputeCeiledOutput(src_tz[i + 2], kernel_size[i], + paddings[i], strides[i]); + if (desired_size != dst_tz[i + 2]) { 
+ right_bot_padding[i] += strides[i]; + } + } +} + template class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { public: @@ -103,6 +122,13 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { auto pool_p = std::static_pointer_cast(dev_ctx.GetBlob(key_pool_p)); if (pool_p == nullptr) { + const std::vector& padding_left_top(paddings); + std::vector padding_right_bottom(paddings); + bool ceil_mode = ctx.Attr("ceil_mode"); + if (ceil_mode) { + CorrectOutputSize(src_tz, dst_tz, ksize, paddings, strides, + padding_right_bottom); + } auto src_md = platform::MKLDNNMemDesc( src_tz, platform::MKLDNNGetDataType(), input_format); @@ -114,8 +140,9 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { mkldnn::memory::format::any); std::shared_ptr pool_pd = - CreatePrimitiveDesc(src_md, dst_md, strides, paddings, ksize, - pooling_type, mkldnn_engine); + CreatePrimitiveDesc(src_md, dst_md, strides, padding_left_top, + padding_right_bottom, ksize, pooling_type, + mkldnn_engine, ceil_mode); // save pool_pd into global device context to be referred in backward path dev_ctx.SetBlob(key_pool_pd, pool_pd); @@ -171,14 +198,16 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { private: std::unique_ptr CreatePrimitiveDesc( const mkldnn::memory::desc& src, const mkldnn::memory::desc& dst, - const std::vector& stride, const std::vector& padding, - const std::vector& kernel, const std::string& pooling_type, - const mkldnn::engine& engine) const { + const std::vector& stride, const std::vector& padding_left_top, + const std::vector& padding_right_bot, const std::vector& kernel, + const std::string& pooling_type, const mkldnn::engine& engine, + bool ceil_mode) const { auto pool_desc = mkldnn::pooling_forward::desc( mkldnn::prop_kind::forward, pooling_type == "max" ? mkldnn::algorithm::pooling_max : mkldnn::algorithm::pooling_avg, - src, dst, stride, kernel, padding, padding, mkldnn::padding_kind::zero); + src, dst, stride, kernel, padding_left_top, padding_right_bot, + mkldnn::padding_kind::zero); auto p_pool_pd = new mkldnn::pooling_forward::primitive_desc(pool_desc, engine); diff --git a/paddle/fluid/operators/read_op.cc b/paddle/fluid/operators/read_op.cc index a0d640b2020958af53a4405ae886eadb2a1e117e..326c58ee1c09d6f745e6c8abfb92030d11d8c1c6 100644 --- a/paddle/fluid/operators/read_op.cc +++ b/paddle/fluid/operators/read_op.cc @@ -45,10 +45,12 @@ class ReadInferVarType : public framework::VarTypeInference { framework::VarDesc* reader = block->FindVarRecursive(reader_name); auto dtypes = reader->GetDataTypes(); PADDLE_ENFORCE_EQ(dtypes.size(), out_names.size()); + auto lod_levels = reader->GetLoDLevels(); for (size_t i = 0; i < dtypes.size(); ++i) { framework::VarDesc& out = block->FindRecursiveOrCreateVar(out_names[i]); out.SetType(framework::proto::VarType::LOD_TENSOR); out.SetDataType(dtypes[i]); + out.SetLoDLevel(lod_levels[i]); } } }; diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc index 13be6c65be58314a75124106eb09b1300305baf0..bf4df4f600c14050b636b7ee6d7b6973b57adb94 100644 --- a/paddle/fluid/operators/scale_op.cc +++ b/paddle/fluid/operators/scale_op.cc @@ -46,9 +46,15 @@ class ScaleOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( **Scale operator** -Multiply the input tensor with a float scalar to scale the input tensor. +Apply scaling and bias addition to the input tensor. 
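The ComputeCeiledOutput/CorrectOutputSize helpers added to pool_mkldnn_op.cc above handle ceil_mode by growing the right/bottom padding by one stride whenever the floor-division output size falls short of the ceil-mode output size. A standalone arithmetic sketch of that trick (PoolOutSize is a hypothetical helper for illustration):

    #include <iostream>

    // Pooling output size with asymmetric padding, floor semantics:
    // out = (in - k + pad_l + pad_r) / stride + 1.
    int PoolOutSize(int in, int k, int pad_l, int pad_r, int stride) {
      return (in - k + pad_l + pad_r) / stride + 1;
    }

    int main() {
      int in = 6, k = 3, stride = 2, pad = 0;
      int floor_out = PoolOutSize(in, k, pad, pad, stride);          // 2
      int ceil_out = (in - k + 2 * pad + stride - 1) / stride + 1;   // 3 (ceil mode)
      // Growing the right/bottom padding by one stride makes the floor
      // formula reach the ceil-mode size.
      int fixed_out = PoolOutSize(in, k, pad, pad + stride, stride);  // 3
      std::cout << floor_out << ' ' << ceil_out << ' ' << fixed_out << '\n';
      return 0;
    }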
-$$Out = scale*X$$ +if bias_after_scale=True: + +$$Out = scale*X + bias$$ + +else: + +$$Out = scale*(X + bias)$$ )DOC"); AddAttr("scale", "The scaling factor of the scale operator.") .SetDefault(1.0); diff --git a/paddle/fluid/operators/sequence_slice_op.h b/paddle/fluid/operators/sequence_slice_op.h index b5ea6ff49bbb29571f9a6ef6358ef881acd9be9e..03b59d71cc0ca2eddd1d9912e7ca25348507ba03 100644 --- a/paddle/fluid/operators/sequence_slice_op.h +++ b/paddle/fluid/operators/sequence_slice_op.h @@ -75,11 +75,11 @@ class SequenceSliceOpKernel : public framework::OpKernel { } for (size_t i = 0; i < n; ++i) { - PADDLE_ENFORCE_LT(0, offset_data[i], + PADDLE_ENFORCE_LE(0, offset_data[i], "The offset[%d] must greater than zero.", i); PADDLE_ENFORCE_LT(0, length_data[i], "The length[%d] must greater than zero.", i); - PADDLE_ENFORCE_LT(lod[0][i] + offset_data[i] + length_data[i], + PADDLE_ENFORCE_LE(lod[0][i] + offset_data[i] + length_data[i], lod[0][i + 1], "The target tensor's length overflow."); } diff --git a/paddle/fluid/operators/sgd_op.cu b/paddle/fluid/operators/sgd_op.cu index 4722be7a666d3e8f3c25c9499f88ddda835f60e3..9527e7ba300e10a6af1a0dd4b312c0323115256e 100644 --- a/paddle/fluid/operators/sgd_op.cu +++ b/paddle/fluid/operators/sgd_op.cu @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#define EIGEN_USE_GPU +#include #include "paddle/fluid/operators/sgd_op.h" #include "paddle/fluid/platform/cuda_primitives.h" @@ -33,22 +33,21 @@ __global__ void SGDKernel(const T* g, const T* p, const T* learning_rate, } } -template +template __global__ void SparseSGDFunctorKernel(const T* selected_rows, const int64_t* rows, const T* learning_rate, T* tensor_out, - int64_t row_numel) { - const int ty = blockIdx.y; - int tid = threadIdx.x; - - selected_rows += ty * row_numel; - tensor_out += rows[ty] * row_numel; - - for (int index = tid; index < row_numel; index += block_size) { - // Since index in rows of SelectedRows can be duplicate, we have to use - // Atomic Operation to avoid concurrent write error. - paddle::platform::CudaAtomicAdd( - tensor_out + index, -1.0 * learning_rate[0] * selected_rows[index]); + int64_t row_numel, int64_t limit) { + for (int64_t i = blockIdx.x; i < limit; i += gridDim.x) { + const T* selected_rows_ptr = selected_rows + i * row_numel; + T* tensor_out_ptr = tensor_out + rows[i] * row_numel; + for (int64_t index = threadIdx.x; index < row_numel; index += blockDim.x) { + // Since index in rows of SelectedRows can be duplicate, we have to use + // Atomic Operation to avoid concurrent write error. 
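Stepping back to the scale_op.cc docstring change earlier in this section: the two bias modes reduce to a one-line scalar rule. A minimal sketch with a hypothetical ScaleOp helper (the real operator works elementwise on tensors):

    #include <iostream>

    // bias_after_scale=true:  out = scale * x + bias
    // bias_after_scale=false: out = scale * (x + bias)
    float ScaleOp(float x, float scale, float bias, bool bias_after_scale) {
      return bias_after_scale ? scale * x + bias : scale * (x + bias);
    }

    int main() {
      std::cout << ScaleOp(2.f, 3.f, 1.f, true) << '\n';   // 3*2 + 1   = 7
      std::cout << ScaleOp(2.f, 3.f, 1.f, false) << '\n';  // 3*(2 + 1) = 9
      return 0;
    }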
+ paddle::platform::CudaAtomicAdd( + tensor_out_ptr + index, + -1.0 * learning_rate[0] * selected_rows_ptr[index]); + } } } } // namespace @@ -97,13 +96,15 @@ class SGDOpCUDAKernel : public framework::OpKernel { auto* in_data = in_value.data(); auto* out_data = param_out->data(); - const int block_size = 256; - dim3 threads(block_size, 1); - dim3 grid(1, in_rows.size()); - SparseSGDFunctorKernel< - T, 256><<>>( + const int kThreadsPerBlock = 256; + int thread_x = kThreadsPerBlock; + int max_threads = ctx.cuda_device_context().GetMaxPhysicalThreadCount(); + int max_blocks = std::max(max_threads / kThreadsPerBlock, 1); + + SparseSGDFunctorKernel<<>>( in_data, in_rows.CUDAData(ctx.GetPlace()), learning_rate->data(), - out_data, in_row_numel); + out_data, in_row_numel, in_rows.size()); } else { PADDLE_THROW("Unsupported Variable Type of Grad"); diff --git a/paddle/fluid/operators/shrink_rnn_memory_op.cc b/paddle/fluid/operators/shrink_rnn_memory_op.cc index 29d2fb989754f5621222768a279a1c898ea1c355..e008e130e34f60a78bf44e211c42c4b7786d1721 100644 --- a/paddle/fluid/operators/shrink_rnn_memory_op.cc +++ b/paddle/fluid/operators/shrink_rnn_memory_op.cc @@ -52,16 +52,26 @@ class ShrinkRNNMemoryOp : public ArrayOp { size_t height = dst_num_rows; // do shrink for the top level LoD + if (x_tensor.lod().size() > 0 && x_tensor.lod()[0].size() > static_cast(dst_num_rows)) { - auto lod_offset = framework::GetSubLoDAndAbsoluteOffset(x_tensor.lod(), 0, - dst_num_rows, 0); - height = lod_offset.second.second; - auto out_lod = out_tensor.mutable_lod(); - framework::AppendLoD(out_lod, lod_offset.first); + if (x_tensor.lod().size() > 1) { // MultiLevel LoD + auto lod_offset = framework::GetSubLoDAndAbsoluteOffset( + x_tensor.lod(), 0, dst_num_rows, 0); + height = lod_offset.second.second; + auto out_lod = out_tensor.mutable_lod(); + framework::AppendLoD(out_lod, lod_offset.first); + } else { + // Shrink LoD + auto lod_item = x_tensor.lod()[0]; + lod_item.resize(dst_num_rows + 1); + out_tensor.set_lod({lod_item}); + const auto &const_lod_item = lod_item; + height = const_lod_item.back(); + } } - if (dst_num_rows != 0) { + if (height != 0) { out_tensor.mutable_data(place, x_tensor.type()); auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); framework::TensorCopy(x_tensor.Slice(0, height), place, *dev_ctx, @@ -134,8 +144,11 @@ class ShrinkRNNMemoryGradOp : public ArrayOp { } else { auto &dout_tensor = dout_var->Get(); auto height = dout_tensor.dims()[0]; - auto slice = dx_tensor.Slice(0, static_cast(height)); - framework::TensorCopy(dout_tensor, dout_tensor.place(), dev_ctx, &slice); + if (height != 0) { + auto slice = dx_tensor.Slice(0, static_cast(height)); + framework::TensorCopy(dout_tensor, dout_tensor.place(), dev_ctx, + &slice); + } if (dx_tensor.dims()[0] > height) { auto rest_tensor = dx_tensor.Slice( static_cast(height), static_cast(dx_tensor.dims()[0])); diff --git a/paddle/fluid/operators/sum_op.h b/paddle/fluid/operators/sum_op.h index 2c4c2411259e9b7bfa223e4d65823ce4610596d0..6dffe527c1072ee97fcde1725bfc1a47ed1ad74a 100644 --- a/paddle/fluid/operators/sum_op.h +++ b/paddle/fluid/operators/sum_op.h @@ -123,6 +123,7 @@ class SumKernel : public framework::OpKernel { out_value->Resize(framework::make_ddim(in_dim)); out_value->mutable_data(context.GetPlace()); + // if all the input sparse vars are empty, no need to // merge these vars. 
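The single-level branch added to ShrinkRNNMemoryOp above shrinks a level-0 LoD by truncating the offset vector after the first dst_num_rows sequences and taking the new back() as the height. A standalone sketch of that step (ShrinkLoD is a hypothetical helper, not the operator's code):

    #include <cstddef>
    #include <iostream>
    #include <vector>

    // Keep the first n sequences of a level-0 LoD; the new height is the last
    // retained offset.
    std::vector<size_t> ShrinkLoD(std::vector<size_t> lod, size_t n, size_t *height) {
      lod.resize(n + 1);
      *height = lod.back();
      return lod;
    }

    int main() {
      std::vector<size_t> lod = {0, 3, 5, 6};  // three sequences
      size_t height = 0;
      std::vector<size_t> shrunk = ShrinkLoD(lod, 2, &height);
      for (size_t v : shrunk) std::cout << v << ' ';  // 0 3 5
      std::cout << "height=" << height << '\n';       // height=5
      return 0;
    }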
if (first_dim == 0UL) { diff --git a/paddle/fluid/operators/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt_engine_op.h index 79e75ea9a035b654f0bb7026d3a491bebe0b23c4..69173ff5178d32634f9ab291b7d709a3f91cb368 100644 --- a/paddle/fluid/operators/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt_engine_op.h @@ -36,7 +36,7 @@ namespace operators { using FluidDT = framework::proto::VarType_Type; using TRT_DT = nvinfer1::DataType; -namespace { +namespace { // NOLINT TRT_DT FluidDataType2TRT(FluidDT type) { switch (type) { diff --git a/paddle/fluid/operators/top_k_op.cc b/paddle/fluid/operators/top_k_op.cc index 4a8ac441cfaf642fde58ee30865a22e83c065498..92a0697e27ba0da66fa3b0f5380e7bd52575640d 100644 --- a/paddle/fluid/operators/top_k_op.cc +++ b/paddle/fluid/operators/top_k_op.cc @@ -30,6 +30,8 @@ class TopkOp : public framework::OperatorWithKernel { "Output(Indices) of TopkOp should not be null."); auto input_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE_EQ(input_dims.size(), 2, + "Rank of TopK op's input must be 2."); const int k = static_cast(ctx->Attrs().Get("k")); PADDLE_ENFORCE_GE(k, 1, "k must >= 1"); diff --git a/paddle/fluid/platform/cudnn_helper_test.cc b/paddle/fluid/platform/cudnn_helper_test.cc index 517df6863499f20d7b66d15ef114a689700be5b2..28edfd2e50237c887dbeb7ac73e1f990ce239a9c 100644 --- a/paddle/fluid/platform/cudnn_helper_test.cc +++ b/paddle/fluid/platform/cudnn_helper_test.cc @@ -12,6 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#define GLOG_NO_ABBREVIATED_SEVERITIES +#define GOOGLE_GLOG_DLL_DECL + #include "paddle/fluid/platform/cudnn_helper.h" #include diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index dfc079e986e93c7f02f17b299e5d6293edbedd05..1b283fc9725fb8d01da913312844d0faea29daf6 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -201,6 +201,7 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) compute_capability = GetCUDAComputeCapability(place_.device); multi_process = GetCUDAMultiProcessors(place_.device); max_threads_per_mp = GetCUDAMaxThreadsPerMultiProcessor(place_.device); + grid_max_dims_ = GpuMaxGridDim(place_.device); PADDLE_ENFORCE(cudaStreamCreate(&stream_)); eigen_stream_.reset(new EigenCudaStreamDevice()); eigen_stream_->Reinitialize(&stream_, place); @@ -239,6 +240,10 @@ int CUDADeviceContext::GetMaxPhysicalThreadCount() const { return multi_process * max_threads_per_mp; } +std::tuple CUDADeviceContext::GetMaxGridDims() const { + return grid_max_dims_; +} + Eigen::GpuDevice* CUDADeviceContext::eigen_device() const { return eigen_device_.get(); } diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 79539195157d74d4d757edee5e008cbb76c93ee2..da32b0dad4b8cfe75bf82f59ec58db8136b899f2 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -13,6 +13,7 @@ limitations under the License. */ #include #include // NOLINT #include +#include #include #include @@ -91,6 +92,8 @@ class CUDADeviceContext : public DeviceContext { /*! \brief Return the max physical thread count in the device context */ int GetMaxPhysicalThreadCount() const; + std::tuple GetMaxGridDims() const; + /*! \brief Return eigen device in the device context. 
*/ Eigen::GpuDevice* eigen_device() const; @@ -135,6 +138,8 @@ class CUDADeviceContext : public DeviceContext { cudaStream_t stream_; cublasHandle_t cublas_handle_; + std::tuple grid_max_dims_; + int compute_capability; int multi_process; int max_threads_per_mp; diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 61a653d9313daff96d39c08e80f17d7e33acceb1..f04395a8ac00f33501008aa12f22773ddda9b138 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -21,6 +21,7 @@ limitations under the License. */ #if defined(_WIN32) #define NOMINMAX // msvc max/min macro conflict with std::min/max #define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h +#define GOOGLE_GLOG_DLL_DECL #endif #ifdef PADDLE_WITH_CUDA @@ -47,7 +48,7 @@ limitations under the License. */ #include "paddle/fluid/platform/dynload/cublas.h" #include "paddle/fluid/platform/dynload/cudnn.h" #include "paddle/fluid/platform/dynload/curand.h" -#if !defined(__APPLE__) and !defined(_WIN32) +#if !defined(__APPLE__) && !defined(_WIN32) #include "paddle/fluid/platform/dynload/nccl.h" #endif // __APPLE__ #endif // PADDLE_WITH_CUDA @@ -216,7 +217,7 @@ inline typename std::enable_if::type throw_on_error( #endif } -#if !defined(__APPLE__) and !defined(_WIN32) +#if !defined(__APPLE__) && !defined(_WIN32) template inline typename std::enable_if::type throw_on_error( ncclResult_t stat, const Args&... args) { @@ -260,14 +261,8 @@ inline void throw_on_error(T e) { } \ } while (false) -#define PADDLE_THROW_EOF() \ - do { \ - throw ::paddle::platform::EOFException("There is no next data.", __FILE__, \ - __LINE__); \ - } while (false) - #else -#define PADDLE_ENFORCE(...) ::paddle::platform::throw_on_error(__VA_ARGS__) +#define PADDLE_ENFORCE(...) ::paddle::platform::throw_on_error(__VA_ARGS__); #endif // REPLACE_ENFORCE_GLOG #else // !_WIN32 @@ -281,6 +276,12 @@ inline void throw_on_error(T e) { #define PADDLE_ENFORCE(x, ...) x #endif // !_WIN32 +#define PADDLE_THROW_EOF() \ + do { \ + throw ::paddle::platform::EOFException("There is no next data.", __FILE__, \ + __LINE__); \ + } while (false) + /* * Some enforce helpers here, usage: * int a = 1; @@ -294,7 +295,7 @@ inline void throw_on_error(T e) { * extra messages is also supported, for example: * PADDLE_ENFORCE(a, b, "some simple enforce failed between %d numbers", 2) */ - +#if !defined(_WIN32) #define PADDLE_ENFORCE_EQ(__VAL0, __VAL1, ...) \ __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, ==, !=, __VA_ARGS__) #define PADDLE_ENFORCE_NE(__VAL0, __VAL1, ...) \ @@ -307,6 +308,7 @@ inline void throw_on_error(T e) { __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <, >=, __VA_ARGS__) #define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) \ __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <=, >, __VA_ARGS__) + #define PADDLE_ENFORCE_NOT_NULL(__VAL, ...) \ do { \ if (UNLIKELY(nullptr == (__VAL))) { \ @@ -326,6 +328,27 @@ inline void throw_on_error(T e) { paddle::string::Sprintf("" __VA_ARGS__)); \ } \ } while (0) +#else +#define PADDLE_ENFORCE_EQ(__VAL0, __VAL1, ...) ((__VAL0) == (__VAL1)) +#define PADDLE_ENFORCE_NE(__VAL0, __VAL1, ...) ((__VAL0) != (__VAL1)) +#define PADDLE_ENFORCE_GT(__VAL0, __VAL1, ...) ((__VAL0) > (__VAL1)) +#define PADDLE_ENFORCE_GE(__VAL0, __VAL1, ...) ((__VAL0) >= (__VAL1)) +#define PADDLE_ENFORCE_LT(__VAL0, __VAL1, ...) ((__VAL0) < (__VAL1)) +#define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) ((__VAL0) <= (__VAL1)) + +#define __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, __CMP, __INV_CMP, ...) 
\ + do { \ + if (!((__VAL0)__CMP(__VAL1))) { \ + PADDLE_THROW("Windows disable the enforce. Enforce failed."); \ + } \ + } while (0) +#define PADDLE_ENFORCE_NOT_NULL(__VAL1, ...) \ + do { \ + if (nullptr == (__VAL1)) { \ + PADDLE_THROW("Windows disable the enforce. Enforce failed"); \ + } \ + } while (0) +#endif // !_WIN32 } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/for_range.h b/paddle/fluid/platform/for_range.h index c153e80fe42aecb33d3aa97874d2881bce9029be..2806d726d2b1ac6b717a9041af19e7ee62be6883 100644 --- a/paddle/fluid/platform/for_range.h +++ b/paddle/fluid/platform/for_range.h @@ -48,35 +48,54 @@ __global__ static void ForRangeElemwiseOpGridIsOne(Function func) { } template -__global__ static void ForRangeElemwiseOp(Function func, int limit) { +__global__ static void ForRangeElemwiseOp(Function func, size_t limit) { size_t idx = static_cast(blockIdx.x * blockDim.x + threadIdx.x); if (idx < limit) { func(idx); } } +template +__global__ static void ForRangeElemwiseOpGridLarge(Function func, size_t limit, + int grid_dim) { + size_t idx = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + while (idx < limit) { + func(idx); + idx += grid_dim; + } +} + template <> struct ForRange { ForRange(const CUDADeviceContext& dev_ctx, size_t limit) - : dev_ctx_(dev_ctx), limit_(static_cast(limit)) {} + : dev_ctx_(dev_ctx), limit_(limit) {} template inline void operator()(Function func) const { constexpr int num_threads = 1024; int block_size = limit_ <= num_threads ? limit_ : num_threads; - int grid_size = (limit_ + num_threads - 1) / num_threads; - - if (grid_size == 1) { - ForRangeElemwiseOpGridIsOne<<<1, block_size, 0, dev_ctx_.stream()>>>( - func); + size_t grid_size = (limit_ + num_threads - 1) / num_threads; + + int max_grid_dim = std::get<0>(dev_ctx_.GetMaxGridDims()); + + if (grid_size < max_grid_dim) { + int grid_size_int = static_cast(grid_size); + if (grid_size == 1) { + ForRangeElemwiseOpGridIsOne<<<1, block_size, 0, dev_ctx_.stream()>>>( + func); + } else { + ForRangeElemwiseOp<<>>( + func, limit_); + } } else { - ForRangeElemwiseOp<<>>( - func, limit_); + ForRangeElemwiseOpGridLarge<<>>(func, limit_, + max_grid_dim); } } const CUDADeviceContext& dev_ctx_; - int limit_; + size_t limit_; }; #endif diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc index 126636d879213b1c8f242db8fbdf6a358a1d2da9..b88523728407803a1ea9d343dc2d33c6a38d5de9 100644 --- a/paddle/fluid/platform/gpu_info.cc +++ b/paddle/fluid/platform/gpu_info.cc @@ -152,5 +152,22 @@ void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream) { PADDLE_ENFORCE(cudaMemsetAsync(dst, value, count, stream), "cudaMemsetAsync failed in paddle::platform::GpuMemsetAsync"); } + +std::tuple GpuMaxGridDim(int id) { + std::tuple result; + PADDLE_ENFORCE( + cudaDeviceGetAttribute(&std::get<0>(result), cudaDevAttrMaxBlockDimX, id), + "cudaDeviceGetAttribute failed in " + "cudaDevAttrMaxBlockDim"); + PADDLE_ENFORCE( + cudaDeviceGetAttribute(&std::get<1>(result), cudaDevAttrMaxBlockDimY, id), + "cudaDeviceGetAttribute failed in " + "cudaDevAttrMaxBlockDim"); + PADDLE_ENFORCE( + cudaDeviceGetAttribute(&std::get<2>(result), cudaDevAttrMaxBlockDimZ, id), + "cudaDeviceGetAttribute failed in " + "cudaDevAttrMaxBlockDim"); + return result; +} } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/gpu_info.h b/paddle/fluid/platform/gpu_info.h index f4640d3eaa2165c35e8e14690d83e9e7e7168c0b..b748c6e8a519d27acd211f815a210c7a74ff32c8 100644 
--- a/paddle/fluid/platform/gpu_info.h +++ b/paddle/fluid/platform/gpu_info.h @@ -19,6 +19,7 @@ limitations under the License. */ #include #include #include +#include namespace paddle { namespace platform { @@ -72,6 +73,8 @@ void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src, //! Set memory dst with value count size asynchronously void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream); +std::tuple GpuMaxGridDim(int id); + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/init.h b/paddle/fluid/platform/init.h index 0e30594672927253cc8083dcb88bb867d63ec729..992ca5e6f6a966a331616a698e3bebd2eee129d5 100644 --- a/paddle/fluid/platform/init.h +++ b/paddle/fluid/platform/init.h @@ -16,6 +16,9 @@ limitations under the License. */ #include #include +#define GLOG_NO_ABBREVIATED_SEVERITIES +#define GOOGLE_GLOG_DLL_DECL + #include "gflags/gflags.h" #include "glog/logging.h" diff --git a/paddle/fluid/pybind/const_value.cc b/paddle/fluid/pybind/const_value.cc index f577068d1f39a3083a54f106d006f9982304411e..882e6332e8174b59eb6e19e788c8cced808d552c 100644 --- a/paddle/fluid/pybind/const_value.cc +++ b/paddle/fluid/pybind/const_value.cc @@ -36,7 +36,9 @@ void BindConstValue(pybind11::module* m) { .value("Backward", framework::OpRole::kBackward) .value("Optimize", framework::OpRole::kOptimize) .value("Loss", framework::OpRole::kLoss) - .value("RPC", framework::OpRole::kRPC); + .value("RPC", framework::OpRole::kRPC) + .value("Dist", framework::OpRole::kDist) + .value("LRSched", framework::OpRole::kLRSched); op_proto_and_checker_maker.def( "kOpRoleAttrName", framework::OpProtoAndCheckerMaker::OpRoleAttrName); @@ -46,6 +48,9 @@ void BindConstValue(pybind11::module* m) { op_proto_and_checker_maker.def( "kOpNameScopeAttrName", framework::OpProtoAndCheckerMaker::OpNamescopeAttrName); + op_proto_and_checker_maker.def( + "kOpCreationCallstackAttrName", + framework::OpProtoAndCheckerMaker::OpCreationCallstackAttrName); } } // namespace pybind diff --git a/paddle/fluid/train/CMakeLists.txt b/paddle/fluid/train/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..6cd9cbe379874e5ab7e40c1349e0483ff45bb63a --- /dev/null +++ b/paddle/fluid/train/CMakeLists.txt @@ -0,0 +1,30 @@ +function(train_test TARGET_NAME) + set(options "") + set(oneValueArgs "") + set(multiValueArgs ARGS) + cmake_parse_arguments(train_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests) + set(arg_list "") + if(train_test_ARGS) + foreach(arg ${train_test_ARGS}) + list(APPEND arg_list "_${arg}") + endforeach() + else() + list(APPEND arg_list "_") + endif() + foreach(arg ${arg_list}) + string(REGEX REPLACE "^_$" "" arg "${arg}") + cc_test(test_train_${TARGET_NAME}${arg} + SRCS test_train_${TARGET_NAME}.cc + DEPS paddle_fluid_origin + ARGS --dirname=${PYTHON_TESTS_DIR}/book/${TARGET_NAME}${arg}.train.model/) + set_tests_properties(test_train_${TARGET_NAME}${arg} + PROPERTIES DEPENDS test_${TARGET_NAME}) + endforeach() +endfunction(train_test) + + +if(WITH_TESTING) + train_test(recognize_digits ARGS mlp conv) +endif() diff --git a/paddle/fluid/train/test_train_recognize_digits.cc b/paddle/fluid/train/test_train_recognize_digits.cc new file mode 100644 index 0000000000000000000000000000000000000000..e8731dd51ad698e53b7f10cc781c52134f2d17a8 --- /dev/null +++ b/paddle/fluid/train/test_train_recognize_digits.cc @@ -0,0 +1,89 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. 
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include "gflags/gflags.h" +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/inference/io.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/init.h" +#include "paddle/fluid/platform/place.h" + +DEFINE_string(dirname, "", "Directory of the train model."); + +namespace paddle { + +void Train() { + CHECK(!FLAGS_dirname.empty()); + framework::InitDevices(false); + const auto cpu_place = platform::CPUPlace(); + framework::Executor executor(cpu_place); + framework::Scope scope; + + auto train_program = inference::Load( + &executor, &scope, FLAGS_dirname + "__model_combined__.main_program", + FLAGS_dirname + "__params_combined__"); + + std::string loss_name = ""; + for (auto op_desc : train_program->Block(0).AllOps()) { + if (op_desc->Type() == "mean") { + loss_name = op_desc->Output("Out")[0]; + break; + } + } + + PADDLE_ENFORCE_NE(loss_name, "", "loss not found"); + + // prepare data + auto x_var = scope.Var("img"); + auto x_tensor = x_var->GetMutable(); + x_tensor->Resize({64, 1, 28, 28}); + + auto x_data = x_tensor->mutable_data(cpu_place); + for (int i = 0; i < 64 * 28 * 28; ++i) { + x_data[i] = 1.0; + } + + auto y_var = scope.Var("label"); + auto y_tensor = y_var->GetMutable(); + y_tensor->Resize({64, 1}); + auto y_data = y_tensor->mutable_data(cpu_place); + for (int i = 0; i < 64 * 1; ++i) { + y_data[i] = static_cast(1); + } + + auto loss_var = scope.Var(loss_name); + float first_loss = 0.0; + float last_loss = 0.0; + for (int i = 0; i < 100; ++i) { + executor.Run(*train_program.get(), &scope, 0, false, true); + if (i == 0) { + first_loss = loss_var->Get().data()[0]; + } else if (i == 99) { + last_loss = loss_var->Get().data()[0]; + } + } + EXPECT_LT(last_loss, first_loss); +} + +TEST(train, recognize_digits) { Train(); } + +} // namespace paddle diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 6a8467869196929d1ec92a57649cac60eee15a87..5829f1d0e7bcac395981fb34cadca3c41f0232e9 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -157,6 +157,7 @@ function cmake_gen() { -DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR} -DWITH_ANAKIN=${WITH_ANAKIN:-OFF} -DPY_VERSION=${PY_VERSION:-2.7} + -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-/paddle/build} ======================================== EOF # Disable UNITTEST_USE_VIRTUALENV in docker because @@ -188,7 +189,8 @@ EOF -DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON} \ -DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR} \ -DWITH_ANAKIN=${WITH_ANAKIN:-OFF} \ - -DPY_VERSION=${PY_VERSION:-2.7} + -DPY_VERSION=${PY_VERSION:-2.7} \ + -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-/paddle/build} } @@ -371,7 +373,7 @@ EOF ctest --output-on-failure # make install should also be 
test when unittest make install -j `nproc` - pip install /usr/local/opt/paddle/share/wheels/*.whl + pip install ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]] ; then paddle version fi diff --git a/python/paddle/dataset/wmt14.py b/python/paddle/dataset/wmt14.py index f8c1a33574e642b21feb6843d115b7f4205ef250..adc0c1aac80cbdb0b0c04535fc39b6a172d23eec 100644 --- a/python/paddle/dataset/wmt14.py +++ b/python/paddle/dataset/wmt14.py @@ -89,7 +89,8 @@ def reader_creator(tar_file, file_name, dict_size): ] for name in names: for line in f.extractfile(name): - line_split = line.strip().split(six.b('\t')) + line = cpt.to_text(line) + line_split = line.strip().split('\t') if len(line_split) != 2: continue src_seq = line_split[0] # one source sequence diff --git a/python/paddle/dataset/wmt16.py b/python/paddle/dataset/wmt16.py index f30dcd518ea6c0c685d027ede3ad6e0a1cb0c82c..9c02e0f41b04e113251e0fda72ca8abd976ab6f7 100644 --- a/python/paddle/dataset/wmt16.py +++ b/python/paddle/dataset/wmt16.py @@ -64,7 +64,8 @@ def __build_dict(tar_file, dict_size, save_path, lang): word_dict = defaultdict(int) with tarfile.open(tar_file, mode="r") as f: for line in f.extractfile("wmt16/train"): - line_split = line.strip().split(six.b("\t")) + line = cpt.to_text(line) + line_split = line.strip().split("\t") if len(line_split) != 2: continue sen = line_split[0] if lang == "en" else line_split[1] for w in sen.split(): @@ -123,7 +124,8 @@ def reader_creator(tar_file, file_name, src_dict_size, trg_dict_size, src_lang): with tarfile.open(tar_file, mode="r") as f: for line in f.extractfile(file_name): - line_split = line.strip().split(six.b("\t")) + line = cpt.to_text(line) + line_split = line.strip().split("\t") if len(line_split) != 2: continue src_words = line_split[src_col].split() diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 1ca2ac2ddc7daef3f4c0ea2004a62258ae4610ac..7bbdf7de89cc932e0023952e3c8e102f92b06855 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -19,17 +19,8 @@ from .framework import * # import all class inside executor into fluid module from . import executor from .executor import * - from . import trainer -from .trainer import Trainer -from .trainer import BeginEpochEvent -from .trainer import EndEpochEvent -from .trainer import BeginStepEvent -from .trainer import EndStepEvent -from .trainer import CheckpointConfig - from . import inferencer -from .inferencer import Inferencer from . import io from . import evaluator @@ -46,7 +37,7 @@ from . import transpiler from .param_attr import ParamAttr, WeightNormParamAttr from .data_feeder import DataFeeder from .core import LoDTensor, LoDTensorArray, CPUPlace, CUDAPlace, CUDAPinnedPlace, Scope -from .transpiler import DistributeTranspiler, InferenceTranspiler, \ +from .transpiler import DistributeTranspiler, \ memory_optimize, release_memory, DistributeTranspilerConfig from .lod_tensor import create_lod_tensor, create_random_int_lodtensor from . 
import clip diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py index 79904cec93d1732f9f3f25115869c63385bd6276..32b8f1189fd65ba1e8da5aeaf316fc0ae05af552 100644 --- a/python/paddle/fluid/clip.py +++ b/python/paddle/fluid/clip.py @@ -280,7 +280,7 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr): group_scale_name = self.group_name + "_scale" if group_scale_name not in self.context: group_norm_var = layers.sums(input=self.context[self.group_name]) - layers.sqrt(x=group_norm_var, out=group_norm_var) + group_norm_var = layers.sqrt(x=group_norm_var) clip_var = self.context[self.group_name + "_clip"] group_scale_var = layers.elementwise_div( x=clip_var, diff --git a/python/paddle/fluid/contrib/inferencer.py b/python/paddle/fluid/contrib/inferencer.py new file mode 100644 index 0000000000000000000000000000000000000000..b8d5f4ffeadca0a7b103682f175d50dc46fa258a --- /dev/null +++ b/python/paddle/fluid/contrib/inferencer.py @@ -0,0 +1,112 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import contextlib + +from .. import core + +from .. import executor +from .. import framework +from .. import io +from .. import parallel_executor +from .. import unique_name +from .trainer import check_and_get_place + +__all__ = ['Inferencer', ] + + +class Inferencer(object): + """ + Inferencer High Level API. + + Args: + infer_func (Python func): Infer function that will return predict Variable + param_path (str): The path where the inference model is saved by fluid.io.save_params + place (Place): place to do the inference + parallel (bool): use parallel_executor to run the inference, it will use multi CPU/GPU. + + Examples: + .. 
code-block:: python + + def inference_program(): + x = fluid.layers.data(name='x', shape=[13], dtype='float32') + y_predict = fluid.layers.fc(input=x, size=1, act=None) + return y_predict + + place = fluid.CPUPlace() + inferencer = fluid.Inferencer( + infer_func=inference_program, param_path="/tmp/model", place=place) + + """ + + def __init__(self, infer_func, param_path, place=None, parallel=False): + self.param_path = param_path + self.scope = core.Scope() + self.parallel = parallel + self.place = check_and_get_place(place) + + self.inference_program = framework.Program() + with framework.program_guard(self.inference_program): + with unique_name.guard(): + self.predict_var = infer_func() + + with self._prog_and_scope_guard(): + # load params from param_path into scope + io.load_params(executor.Executor(self.place), param_path) + + if parallel: + with self._prog_and_scope_guard(): + self.exe = parallel_executor.ParallelExecutor( + use_cuda=isinstance(self.place, core.CUDAPlace), + loss_name=self.predict_var.name) + else: + self.exe = executor.Executor(self.place) + + self.inference_program = self.inference_program.clone(for_test=True) + + def infer(self, inputs, return_numpy=True): + """ + Do Inference for Inputs + + Args: + inputs (map): a map of {"input_name": input_var} that will be feed into the inference program + return_numpy (bool): transform return value into numpy or not + + Returns: + Tensor or Numpy: the predict value of the inference model for the inputs + + Examples: + .. code-block:: python + + tensor_x = numpy.random.uniform(0, 10, [batch_size, 13]).astype("float32") + results = inferencer.infer({'x': tensor_x}) + """ + if not isinstance(inputs, dict): + raise ValueError( + "inputs should be a map of {'input_name': input_var}") + + with self._prog_and_scope_guard(): + results = self.exe.run(feed=inputs, + fetch_list=[self.predict_var.name], + return_numpy=return_numpy) + + return results + + @contextlib.contextmanager + def _prog_and_scope_guard(self): + with framework.program_guard(main_program=self.inference_program): + with executor.scope_guard(self.scope): + yield diff --git a/python/paddle/fluid/contrib/trainer.py b/python/paddle/fluid/contrib/trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..8569e486f91786b5562e84dcdccf6d91da0612cc --- /dev/null +++ b/python/paddle/fluid/contrib/trainer.py @@ -0,0 +1,1258 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import contextlib +import os +import errno +import shutil +import six +import time + +from .. import core +from .. import data_feeder +from .. import executor +from .. import framework +from .. import io +# optimizer is same as the parameter of Trainer.__init__. Rename it to opt_module +from .. import optimizer as opt_module +from .. 
import parallel_executor +from ..transpiler import distribute_transpiler + +__all__ = [ + 'Trainer', 'BeginEpochEvent', 'EndEpochEvent', 'BeginStepEvent', + 'EndStepEvent', 'CheckpointConfig' +] + + +class BeginEpochEvent(object): + """ + The begin of a training epoch. + + Args: + epoch_id(int): The current epoch ID. + """ + + def __init__(self, epoch_id): + self.epoch = epoch_id + + +class EndEpochEvent(object): + """ + The end of a training epoch. + + Args: + epoch_id(int): The current epoch ID. + """ + + def __init__(self, epoch_id): + self.epoch = epoch_id + + +class BeginStepEvent(object): + """ + The begin of a training epoch. + + Args: + epoch_id(int): The current epoch ID. + step_id(int): The current step ID. + """ + + def __init__(self, epoch_id, step_id): + self.epoch = epoch_id + self.step = step_id + self.fetch_metrics = True + """ + If fetch_metrics is true, the metrics will be fetched at the + EndStepEvent. Default is True. + """ + + +class EndStepEvent(object): + """ + The end of a training step. + + Args: + epoch_id(int): The current epoch ID. + step_id(int): The current step ID. + metrics(list): A list of fetched tensor. The order of this list is same + as the :code:`train_func` returns. + """ + + def __init__(self, epoch_id, step_id, metrics): + self.epoch = epoch_id + self.step = step_id + self.metrics = metrics + + +class CheckpointConfig(object): + """ + Parameter object for :code:`save_checkpoint` and + :code:`fluid.Trainer`. Used to configuration how to save checkpoint. + + Args: + checkpoint_dir(str): Directory path to save check point. Default is the + current directory. + + max_num_checkpoints(int): The max number of local check points. + epoch_interval(int): Every number of epoch to save check point. + step_interval(int): Every number of step to save check point. + + Examples: + >>> config = fluid.CheckpointConfig("./checkpoints") + >>> trainer = fluid.Trainer(train_func=train_program, + >>> place=place, + >>> optimizer_func=optimizer_func, + >>> checkpoint_config=config) + >>> trainer.train(...) + """ + + def __init__(self, + checkpoint_dir=None, + max_num_checkpoints=3, + epoch_interval=1, + step_interval=10): + + assert epoch_interval >= 1 + assert step_interval >= 1 + + self.checkpoint_dir = checkpoint_dir \ + if checkpoint_dir is not None else os.getcwd() + self.max_num_checkpoints = max_num_checkpoints + self.epoch_interval = epoch_interval + self.step_interval = step_interval + self.epoch_id = 0 + self.step_id = 0 + self.load_serial = None + self.pserver_id = None + self.lookup_table_name = None + + +def check_and_get_place(place): + """ + Check the type of place or get the default place + Args: + place(None|core.CUDAPlace|core.CPUPlace): the place that trainer will be executed on. + + Raises: + TypeError if the type mismatched. + + Returns: + the original place if it is not None. + if fluid is compiled with CUDA, returns CUDAPlace(0) by default. + Otherwise returns CPUPlace by default. + """ + if place is None: + if core.is_compiled_with_cuda(): + return core.CUDAPlace(0) + else: + return core.CPUPlace() + else: + if not isinstance(place, core.CUDAPlace) and not isinstance( + place, core.CPUPlace): + raise TypeError("Place should be either CUDAPlace or CPUPlace") + return place + + +class Trainer(object): + """ + A trainer wraps MultiGPU/MultiNode training loops and can be used to train a + simple neural network easily. + + This API takes a :code:`train_func`. A :code:`train_func` is a function that + return loss as it first return value. 
The reset value can be fetched by + EndStepEvent.metrics + + This API also takes a :code:`optimizer_func` that will return an optimizer + instance. + + For example, to train a MLP for MNIST dataset, the sample program is + + >>> import paddle.fluid as fluid + >>> + >>> def mlp(image, layer_sizes=[200, 100], activation="relu", num_classes=10): + >>> hidden = image + >>> for layer_size in layer_sizes: + >>> hidden = fluid.layers.fc(input=hidden, size=layer_size, act=activation) + >>> return fluid.layers.fc(input=hidden, size=num_classes, act="softmax") + >>> + >>> def train_mnist_mlp(): + >>> img = fluid.layers.data(name='image', shape=[784]) + >>> label = fluid.layers.data(name='label', shape=[1], dtype='int64') + >>> prediction = mlp(img) + >>> return fluid.layers.mean(fluid.layers.cross_entropy(prediction, label)) + >>> + >>> def optimizer(): + >>> return fluid.optimizer.Adam() + >>> + >>> trainer = Trainer(train_func=train_mnist_mlp, + >>> optimizer_func=optimizer, + >>> place=fluid.CUDAPlace(0), + >>> parallel=True) + >>> + >>> def train_callback(event): + >>> if isinstance(event, fluid.EndStepEvent): + >>> print "Epoch ID", event.epoch, "Step ID",\ + >>> event.step, "AvgLoss", event.metrics[0] + >>> elif isinstance(event, fluid.EndEpochEvent): + >>> trainer.save_params("./model_{0}".format(event.epoch)) + >>> + >>> trainer.train(num_epochs=100, event_handler=train_callback) + + For more example, please see :ref:`api_guide_high_level_api`. + + + Args: + train_func(callable): A function which will return loss. The loss must be + a scalar tensor. + optimizer_func(callable): A function that returns an Optimizer object. + place(CUDAPlace|CPUPlace): The device place of this trainer. If + :code:`parallel=True,` all CUDA Places will be used if :code:`place` + is a :code:`CUDAPlace`. + parallel(bool): True if use multiple devices. + checkpoint_config(CheckpointConfig): Configuration about how to save + checkpoints. + """ + + def __init__(self, + train_func, + optimizer_func, + param_path=None, + place=None, + parallel=False, + checkpoint_config=None): + self.__stop = False + self.parallel = parallel + + # config for checkpoint + # only chief worker will save variables + self.trainer_id = 0 + self.checkpoint_cfg = checkpoint_config + if self.checkpoint_cfg: + assert isinstance(self.checkpoint_cfg, CheckpointConfig) + serial = _get_latest_checkpoint_serial( + self.checkpoint_cfg.checkpoint_dir) + self.checkpoint_cfg.load_serial = serial if serial >= 0 else None + + self.scope = core.Scope() + + # 1. we need to generate a framework.Program by calling + # program_func. Reference: fluid.program_guard in + # test_word2vec.py + + self.startup_program = framework.Program() + self.train_program = framework.Program() + + with framework.program_guard(self.train_program, self.startup_program): + program_func_outs = train_func() + self.train_func_outputs = program_func_outs if isinstance( + program_func_outs, list) else [program_func_outs] + self.test_program = self.train_program.clone(for_test=True) + + # The first element of program_func_outs is loss. + loss = self.train_func_outputs[0] + + optimizer = optimizer_func() + if not isinstance(optimizer, opt_module.Optimizer): + raise TypeError( + "The optimizer should be an instance of Optimizer") + optimize_ops, params_grads = optimizer.minimize(loss) + + self.place = check_and_get_place(place) + + self._dist_transpile_if_necessary(optimize_ops, params_grads) + + # 2. 
move the default_main_program to self.program and run the + # default_startup program on an empty core.Scope() + # Run startup program + with self._prog_and_scope_guard(): + exe = executor.Executor(place) + exe.run(self.startup_program) + + if self.checkpoint_cfg and self.checkpoint_cfg.load_serial is not None: + self._load_checkpoint() + + if param_path and os.path.isdir(param_path): + with self._prog_and_scope_guard(): + # load params from param_path into scope + io.load_persistables( + executor=exe, + dirname=param_path, + main_program=self.startup_program) + + def _transpile_nccl2_dist(self): + # PADDLE_TRAINER_IPS + if "PADDLE_TRAINER_IPS" not in os.environ: + self.nccl_id_var = None + else: + self.trainer_id = int(os.getenv("PADDLE_TRAINER_ID")) + port = os.getenv("PADDLE_PSERVER_PORT") + worker_ips = os.getenv("PADDLE_TRAINER_IPS") + worker_endpoints = [] + for ip in worker_ips.split(","): + worker_endpoints.append(':'.join([ip, port])) + self.num_trainers = len(worker_endpoints) + current_endpoint = os.getenv("PADDLE_CURRENT_IP") + ":" + port + worker_endpoints.remove(current_endpoint) + # TODO(wuyi): use self.nccl_id_var, self.num_trainers and self.trainer_id + # in ParallelExecutor to start + # distributed training using NCCL2 + self.nccl_id_var = self.startup_program.global_block().create_var( + name="NCCLID", persistable=True, type=core.VarDesc.VarType.RAW) + self.startup_program.global_block().append_op( + type="gen_nccl_id", + inputs={}, + outputs={"NCCLID": self.nccl_id_var}, + attrs={ + "endpoint": current_endpoint, + "endpoint_list": worker_endpoints, + "trainer_id": self.trainer_id + }) + + def _dist_transpile_if_necessary(self, optimize_ops, params_grads): + self._transpile_nccl2_dist() + if self.nccl_id_var != None: + return + + if "PADDLE_TRAINING_ROLE" not in os.environ: + return + + # the port of all pservers, needed by both trainer and pserver + port = os.getenv("PADDLE_PSERVER_PORT", "6174") + # comma separated ips of all pservers, needed by trainer and + # pserver + pserver_ips = os.getenv("PADDLE_PSERVER_IPS", "") + eplist = [] + for ip in pserver_ips.split(","): + eplist.append(':'.join([ip, port])) + pserver_endpoints = ",".join(eplist) + # total number of workers/trainers in the job, needed by + # trainer and pserver + trainers = int(os.getenv("PADDLE_TRAINERS")) + # the IP of the local machine, needed by pserver only + current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port + # the unique trainer id, starting from 0, needed by trainer + # only + self.trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0")) + + # the role, should be either PSERVER or TRAINER + training_role = os.getenv("PADDLE_TRAINING_ROLE") + with self._prog_and_scope_guard(): + t = distribute_transpiler.DistributeTranspiler() + t.transpile( + self.trainer_id, pservers=pserver_endpoints, trainers=trainers) + if training_role == "PSERVER": + if self.checkpoint_cfg: + pserver_id = eplist.index(current_endpoint) + self.checkpoint_cfg.pserver_id = pserver_id + if t.has_distributed_lookup_table: + self.checkpoint_cfg.lookup_table_name = t.table_name + + self.train_program = t.get_pserver_program(current_endpoint) + self.startup_program = t.get_startup_program(current_endpoint, + self.train_program) + elif training_role == "TRAINER": + self.train_program = t.get_trainer_program() + else: + raise ValueError( + 'TRAINING_ROLE environment variable must be either TRAINER or PSERVER' + ) + + def stop(self): + """ + stop training + """ + self.__stop = True + + def train(self, num_epochs, 
event_handler, reader=None, feed_order=None): + """ + Start the train loop to train the model. + + Args: + num_epochs(int): The number of epoch. An epoch will process all data in reader + event_handler(callable): The event handler. A function with type (ev:Event)->void + reader(callable): A reader creator object. See also + :ref:`api_guide_python_reader` . + feed_order(list): Feeding order of reader. None will following the defining + order in program + + Returns: + None + """ + training_role = os.getenv("PADDLE_TRAINING_ROLE", "") + if training_role == "PSERVER": + with self._prog_and_scope_guard(): + exe = executor.Executor(self.place) + exe.run() + return + if self.parallel: + self._train_by_parallel_executor(num_epochs, event_handler, reader, + feed_order) + else: + self._train_by_executor(num_epochs, event_handler, reader, + feed_order) + + def test(self, reader, feed_order): + """ + Test the model on given test data + + Args: + reader(callable): The reader that yields test data. + feed_order(list): Feeding order of reader. None will following the + defining order in program + """ + + return self._test_by_executor(reader, feed_order, + self.train_func_outputs) + + def save_params(self, param_path): + """ + Save all parameters into :code:`param_path`. + + Args: + param_path(str): The path to save parameters. + + Returns: + None + """ + with self._prog_and_scope_guard(): + exe = executor.Executor(self.place) + io.save_persistables(exe, dirname=param_path) + + def save_inference_model(self, param_path, feeded_var_names, + target_var_indexes): + """ + Save model for cpp inference into :code:`param_path`. + + Args: + param_path(str): The path to save parameters. + feeded_var_names(list(str)): The name of the vars that you + need to feed in before run program. + target_var_indexes(list(int)): the index of target var that + you need to return in trainer.train_func. + Returns: + None + """ + with self._prog_and_scope_guard(): + exe = executor.Executor(self.place) + target_vars = [ + self.train_func_outputs[index] for index in target_var_indexes + ] + io.save_inference_model(param_path, feeded_var_names, target_vars, + exe) + + @contextlib.contextmanager + def _prog_and_scope_guard(self): + with framework.program_guard( + main_program=self.train_program, + startup_program=self.startup_program): + with executor.scope_guard(self.scope): + yield + + def _train_by_executor(self, num_epochs, event_handler, reader, feed_order): + """ + Train by Executor and single device. 
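        A ``DataFeeder`` is built from ``feed_order``, the ``reader`` is decorated
        for single-device feeding, and the actual loop is delegated to
        ``_train_by_any_executor``.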
+ + Args: + num_epochs: + event_handler: + reader: + feed_order: + + Returns: + + """ + with self._prog_and_scope_guard(): + feed_var_list = build_feed_var_list(self.train_program, feed_order) + feeder = data_feeder.DataFeeder( + feed_list=feed_var_list, place=self.place) + exe = executor.Executor(self.place) + reader = feeder.decorate_reader(reader, multi_devices=False) + self._train_by_any_executor(event_handler, exe, num_epochs, reader) + + def _train_by_any_executor(self, event_handler, exe, num_epochs, reader): + if self.checkpoint_cfg: + epochs = [ + epoch_id for epoch_id in range(num_epochs) + if epoch_id >= self.checkpoint_cfg.epoch_id + ] + else: + epochs = [epoch_id for epoch_id in range(num_epochs)] + + for epoch_id in epochs: + event_handler(BeginEpochEvent(epoch_id)) + for step_id, data in enumerate(reader()): + if self.__stop: + if self.checkpoint_cfg: + self._clean_checkpoint() + return + + if self.checkpoint_cfg and self.checkpoint_cfg.load_serial \ + and self.checkpoint_cfg.step_id >= step_id and self.checkpoint_cfg.epoch_id == epoch_id: + continue + + begin_event = BeginStepEvent(epoch_id, step_id) + event_handler(begin_event) + if begin_event.fetch_metrics: + metrics = exe.run(feed=data, + fetch_list=[ + var.name + for var in self.train_func_outputs + ]) + else: + metrics = exe.run(feed=data, fetch_list=[]) + + if self.checkpoint_cfg: + self._save_checkpoint(epoch_id, step_id) + event_handler(EndStepEvent(epoch_id, step_id, metrics)) + event_handler(EndEpochEvent(epoch_id)) + if self.checkpoint_cfg: + self._clean_checkpoint() + + def _test_by_executor(self, reader, feed_order, fetch_list): + with executor.scope_guard(self.scope): + feed_var_list = build_feed_var_list(self.test_program, feed_order) + feeder = data_feeder.DataFeeder( + feed_list=feed_var_list, place=self.place) + exe = executor.Executor(self.place) + accumulated = len(fetch_list) * [0] + count = 0 + for data in reader(): + outs = exe.run(program=self.test_program, + feed=feeder.feed(data), + fetch_list=fetch_list) + accumulated = [x[0] + x[1][0] for x in zip(accumulated, outs)] + count += 1 + + return [x / count for x in accumulated] + + def _train_by_parallel_executor(self, num_epochs, event_handler, reader, + feed_order): + with self._prog_and_scope_guard(): + pe = self._get_or_create_parallel_executor() + feed_var_list = build_feed_var_list(self.train_program, feed_order) + feeder = data_feeder.DataFeeder( + feed_list=feed_var_list, place=self.place) + reader = feeder.decorate_reader(reader, multi_devices=True) + self._train_by_any_executor(event_handler, pe, num_epochs, reader) + + def _get_parallel_executor(self): + return getattr(self, 'parallel_executor', None) + + def _get_or_create_parallel_executor(self): + if self._get_parallel_executor() is None: + self.parallel_executor = parallel_executor.ParallelExecutor( + use_cuda=isinstance(self.place, core.CUDAPlace), + loss_name=self.train_func_outputs[0].name) + return self._get_parallel_executor() + + def _clean_checkpoint(self): + assert self.checkpoint_cfg + clean_checkpoint(checkpoint_dir=self.checkpoint_cfg.checkpoint_dir) + + def _get_checkpoint_load_args(self): + """ + epoch_id and step_id are runtime arguments, they are not variables, will load them independently. + """ + return ["epoch_id", "step_id"] + + def _get_checkpoint_save_args(self, epoch_id, step_id): + """ + epoch_id and step_id are runtime arguments, they are not variables, will save them independently. 
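        For example, the returned dict has the form ``{"epoch_id": 2, "step_id": 10}``.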
+ """ + trainer_args = {} + trainer_args["epoch_id"] = epoch_id + trainer_args["step_id"] = step_id + return trainer_args + + def _save_checkpoint(self, epoch_id, step_id): + assert self.checkpoint_cfg + + if epoch_id % self.checkpoint_cfg.epoch_interval == 0 \ + and step_id % self.checkpoint_cfg.step_interval == 0: + exe = executor.Executor(self.place) + save_checkpoint( + executor=exe, + checkpoint_dir=self.checkpoint_cfg.checkpoint_dir, + trainer_id=self.trainer_id, + trainer_args=self._get_checkpoint_save_args(epoch_id, step_id), + main_program=self.train_program, + max_num_checkpoints=self.checkpoint_cfg.max_num_checkpoints) + + def _load_checkpoint(self): + with self._prog_and_scope_guard(): + exe = executor.Executor(self.place) + load_checkpoint( + executor=exe, + checkpoint_dir=self.checkpoint_cfg.checkpoint_dir, + main_program=self.startup_program) + + if not self.checkpoint_cfg.pserver_id: + load_trainer_args = self._get_checkpoint_load_args() + trainer_args = load_checkpoint( + executor=exe, + checkpoint_dir=self.checkpoint_cfg.checkpoint_dir, + main_program=self.startup_program, + role_id=self.trainer_id, + is_trainer=True, + load_trainer_args=load_trainer_args) + + if len(trainer_args) != 2: + raise ValueError( + "the return trainer_args length do not equal _get_checkpoint_load_args" + ) + self.checkpoint_cfg.epoch_id = int(trainer_args[0]) + self.checkpoint_cfg.step_id = int(trainer_args[1]) + else: + if self.checkpoint_cfg.lookup_table_name: + load_checkpoint( + executor=exe, + checkpoint_dir=self.checkpoint_cfg.checkpoint_dir, + main_program=self.startup_program, + role_id=self.checkpoint_cfg.pserver_id, + is_trainer=False, + load_trainer_args=None, + load_lookup_table=self.checkpoint_cfg.lookup_table_name) + + +def build_feed_var_list(program, feed_order): + if not isinstance(program, framework.Program): + raise TypeError("The 'program' should be an object of Program") + + if isinstance(feed_order, list): + feed_var_list = [ + program.global_block().var(var_name) for var_name in feed_order + ] + else: + if not isinstance(feed_order, dict): + raise TypeError( + "The 'feed_order' should be either None, list or dict.") + if not sorted(feed_order.values()) == list(range(len(feed_order))): + raise ValueError( + "The values of 'feed_order' should be a permutation of [0, len(feed_order))" + ) + sorted_pair_list = sorted( + six.iteritems(feed_order), key=lambda item: item[1]) + feed_var_list = [ + program.global_block().var(pair[0]) for pair in sorted_pair_list + ] + return feed_var_list + + +# move Checkpoint APIs from io.py to trainer.py, make all of them are private. +SUCCESS_MARK_FILENAME = "_SUCCESS" +CHECKPOINT_PREFIX = "checkpoint" +MODEL_DIR = "__model__" +LOOKUP_TABLE_DIR = "__lookup_table__" +TRAINER_PREFIX = "trainer" +CHECKPOINT_SEPARATOR = "_" + + +def save_checkpoint(executor, + checkpoint_dir, + trainer_id, + main_program, + trainer_args=None, + max_num_checkpoints=3, + lookup_table=None, + pserver_endpoints=None): + """ + This function filters out all checkpoint variables from the give + main_program and then saves these variables to the `checkpoint_dir` + directory. + + In the training precess, we generally save a checkpoint in each + iteration. So there might be a lot of checkpoints in the + `checkpoint_dir`. To avoid them taking too much disk space, the + `max_num_checkpoints` are introduced to limit the total number of + checkpoints. If the number of existing checkpints is greater than + the `max_num_checkpoints`, oldest ones will be scroll deleted. 
+ + A variable is a checkpoint variable and will be saved if it meets + all following conditions: + 1. It's persistable. + 2. It's type is not FEED_MINIBATCH nor FETCH_LIST nor RAW. + 3. It's name contains no "@GRAD" nor ".trainer_" nor ".block". + + Args: + executor(Executor): The executor to run for save checkpoint. + checkpoint_dir(str): The folder where to save checkpoints. + trainer_id(int): currect trainer id, if id is equal to 0, the trainer + is chief. + trainer_args(dict|None): Current training arguments. Such as 'epoch_id' + and 'step_id'. + Defaut: None + main_program(Program): The program whose checkpoint variables will + be saved. + max_num_checkpoints(int): The max number of total number of existing + checkpoints. + Default: 3 + lookup_table(string|None): the lookup table name, when use distribute + lookup table, we can get lookup table name by DistributeTranspiler. + table_name + pserver_endpoints(list|None): the parameter server ip:port list. + when use distribute lookup table, we can get pserver_endpoints by + distribute arguments. + + Returns: + None + + Raises: + ValueError: If `checkpoint_dir` is None. + AssertionError: If `trainer_args` is not a dict. + + Examples: + .. code-block:: python + + exe = fluid.Executor(fluid.CPUPlace()) + path = "./checkpoints" + prog = fluid.default_main_program() + trainer_args = {"epoch_id": 200, + "step_id": 20} # just an example + table_name = "share_w" + ps_endpoints = ["127.0.0.1:6000","127.0.0.1:6001"] + + save_checkpoint(executor=exe, + checkpoint_dir=path, + trainer_id=0, + trainer_args=trainer_args, + main_program=prog, + max_num_checkpoints=3, + lookup_table=table_name, + pserver_endpoints = ps_endpoints) + """ + if checkpoint_dir is None: + raise ValueError("'checkpoint_dir' should not be None") + + if main_program is None: + raise ValueError('main_program should not be None.') + + if trainer_args: + assert isinstance(trainer_args, dict) + + is_chief = trainer_id == 0 + + _make_chekcpoint_dirs(checkpoint_dir) + serial = _get_latest_checkpoint_serial(checkpoint_dir) + 1 + cur_dir = _get_serial_dir(checkpoint_dir, serial) + + _save_trainer_args(cur_dir, trainer_id, trainer_args) + + if is_chief: + _save_persist_vars_without_grad(executor, cur_dir, main_program) + + if is_chief and lookup_table and pserver_endpoints: + _save_pserver_vars_by_notify(executor, cur_dir, lookup_table, + pserver_endpoints) + + _scroll_delete(checkpoint_dir, max_num_checkpoints) + + +def load_checkpoint(executor, + checkpoint_dir, + main_program, + role_id=0, + is_trainer=True, + load_trainer_args=None, + load_lookup_table=None): + """ + This function filters out all checkpoint variables from the give + main_program and then try to load these variables from the + `checkpoint_dir` directory. + + In the training precess, we generally save a checkpoint in each + iteration. So there are more than one checkpoint in the + `checkpoint_dir` (each checkpoint has its own sub folder), use + `serial` to specify which serial of checkpoint you would like to + load. + + A variable is a checkpoint variable and will be loaded if it meets + all following conditions: + 1. It's persistable. + 2. It's type is not FEED_MINIBATCH nor FETCH_LIST nor RAW. + 3. It's name contains no "@GRAD" nor ".trainer_" nor ".block". + + Args: + executor(Executor): The executor to run for loading checkpoint. + checkpoint_dir(str): The folder where all checkpoints are. + serial(int): The serial of checkpoint you would like to load. 
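            (in this version the serial is determined automatically from the latest
            checkpoint found in ``checkpoint_dir`` via ``_get_latest_checkpoint_serial``
            and is no longer a parameter of this function)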
+ main_program(Program): The program whose checkpoint variables will + be loaded. + role_id(int): the trainer id or the parameter server id. + is_trainer(bool): trainer is True and parameter server is False. + load_trainer_args(list|None): list about load trainer args. + load_lookup_table(str|None): the lookup table name + + Returns: + None + + Raises: + ValueError: If `checkpoint_dir` is None. + ValueError: If `main_program` is None. + + Examples: + .. code-block:: python + + exe = fluid.Executor(fluid.CPUPlace()) + path = "./checkpoints" + prog = fluid.default_main_program() + load_checkpoint(executor=exe, checkpoint_dir=path, + serial=9, main_program=prog) + + # In this example, `load_checkpoint` function + # will first filters out all checkpoint variables in the default + # main program, and then try to load these variables form the + # folder "./checkpoints/checkpoint_9/__model__". + """ + + if checkpoint_dir is None: + raise ValueError("'checkpoint_dir' should not be None") + + serial = _get_latest_checkpoint_serial(checkpoint_dir) + + # there are nothing need to be loaded + if serial is None or serial < 0: + return + + if main_program is None: + raise ValueError('main_program should not be None.') + + if is_trainer and load_trainer_args is None: + cur_dir = _get_serial_dir(checkpoint_dir, serial) + _load_persist_vars_without_grad(executor, cur_dir, main_program, True) + return + + if is_trainer and load_trainer_args: + return _load_trainer_args(checkpoint_dir, serial, role_id, + load_trainer_args) + + if not is_trainer and load_lookup_table: + _load_lookup_table_vars(executor, checkpoint_dir, main_program, role_id, + load_lookup_table) + + +def clean_checkpoint(checkpoint_dir, delete_dir=False): + """ + clean the checkpoint dir, when the train exits normally, + the trainer will call clean_checkpoint to delete checkpoint directory saved before. + delete_dir only works when the directory is empty, otherwise, OSError is raised. + + : param checkpoint_dir + : param delete_dir + """ + + if checkpoint_dir is None: + raise ValueError("'checkpoint_dir' should not be None") + _scroll_delete(checkpoint_dir, max_num_checkpoints=0) + + if delete_dir and not os.listdir(checkpoint_dir): + os.rmdir(checkpoint_dir) + + +def _load_persist_vars_without_grad(executor, + dirname, + program, + has_model_dir=False): + """ + This function filters out all checkpoint variables from the give + program and then trys to load these variables from the given directory. + + A variable is a checkpoint variable if it meets all following + conditions: + 1. It's persistable. + 2. It's type is not FEED_MINIBATCH nor FETCH_LIST nor RAW. + 3. It's name contains no "@GRAD" nor ".trainer_" nor ".block". + + Args: + executor(Executor): The executor to run for loading variables. + dirname(str): The directory path. + program(Program): The program whose checkpoint variables will + be loaded. + has_model_dir(bool): if True, the function loads variables + from a sub directory named '__model__'. + Default: False + + Returns: + None + + Examples: + .. code-block:: python + + exe = fluid.Executor(fluid.CPUPlace()) + param_path = "./my_paddle_model" + prog = fluid.default_main_program() + _load_persist_vars_without_grad(executor=exe, + dirname=param_path, program=prog, has_model_dir=True) + + # In this example, `_load_persist_vars_without_grad` function + # will first filters out all checkpoint variables in the default + # main program, and then trys to load these variables form the + # folder "./my_paddle_model/__model__". 
+ """ + + if has_model_dir: + dirname = _get_model_dir(dirname) + + io.load_vars( + executor, + dirname=dirname, + main_program=program, + predicate=_is_checkpoint_var, + filename=None) + + +def _load_lookup_table_vars(executor, dirname, program, pserver_id, table_name): + """ + The parameter server will load lookup table's local file in + selectedrows variable. + + Args: + executor(Executor): The executor to run for loading persistable variables + dirname(str): The directory path + main_program(Program): Find the variable named table_name in main_program + pserver_id(int): the serial number in pserver_endpoints list + table_name(str): lookup table name + + Returns: + None + + Examples: + .. code-block:: python + + exe = fluid.Executor(fluid.CPUPlace()) + dirname = "./checkpoints/checkpoint_9/" + prog = fluid.default_main_program() + pserver_id = 1 + table_name = "share_w" + _load_lookup_table_vars(executor=exe, + dirname=dirname, program=prog, pserver_id=pserver_id, + table_name=table_name) + """ + + for var in program.list_vars(): + if var.name == table_name: + lookup_table_var = var + break + + assert lookup_table_var is not None + + lookup_table_dir = os.path.join(dirname, LOOKUP_TABLE_DIR) + table_file = table_name + CHECKPOINT_SEPARATOR + str(pserver_id) + + load_prog = framework.Program() + load_block = load_prog.global_block() + + load_block.append_op( + type='load', + inputs={}, + outputs={'Out': [lookup_table_var]}, + attrs={'file_path': os.path.join(lookup_table_dir, table_file)}) + + executor.run(load_prog) + + +def _save_persist_vars_without_grad(executor, dirname, program): + """ + This function filters out all checkpoint variables from the give + program and then save these variables to a sub-folder '__model__' of + the given directory. + + A variable is a checkpoint variable if it meets all following + conditions: + 1. It's persistable. + 2. It's type is not FEED_MINIBATCH nor FETCH_LIST nor RAW. + 3. It's name contains no "@GRAD" nor ".trainer_" nor ".block". + + Args: + executor(Executor): The executor to run for saving variables. + dirname(str): The directory path. + program(Program): The program whose checkpoint variables will + be saved. + + Returns: + None + + Examples: + .. code-block:: python + + exe = fluid.Executor(fluid.CPUPlace()) + param_path = "./my_paddle_model" + prog = fluid.default_main_program() + _save_persist_vars_without_grad(executor=exe, + dirname=param_path, program=prog) + + # In this example, `_save_persist_vars_without_grad` function + # will first filters out all checkpoint variables in the default + # main program, and then saves these variables to the folder + # "./my_paddle_model/__model__". + """ + cur_dir = _get_model_dir(dirname) + io.save_vars( + executor, + dirname=cur_dir, + main_program=program, + vars=None, + predicate=_is_checkpoint_var, + filename=None) + _write_success(cur_dir) + + +def _save_pserver_vars_by_notify(executor, dirname, lookup_table, + ps_endpoint_list): + """ + This function will send checkpoint notify message from Trainer 0 + to all the pservers. + The checkpoint notify message contains lookup table name, + the absolute path on pserver to save lookup_table. + + Args: + executor(Executor): The executor to run for send checkpoint notify. + dirname(str): The folder where to save checkpoints. + lookup_table(string): the lookup table name, when use distribute + lookup table, we can get lookup table name by DistributeTranspiler. + table_name + ps_endpoint_list(list): the parameter server ip:port list. 
+ when use distribute lookup table, we can get ps_endpoint_list by + distribute arguments. + Return: + None + + Examples: + .. code-block:: python + + exe = fluid.Executor(fluid.CPUPlace()) + param_path = "./my_paddle_model" + prog = fluid.default_main_program() + table_name = "share_w" + ps_endpoints = ["127.0.0.1:6000","127.0.0.1:6001"] + + _save_pserver_vars_by_notify(executor=exe, + dirname=param_path, lookup_table=table_name, + ps_endpoint_list=ps_endpoints) + """ + cur_dir = _get_lookuptable_dir(dirname) + + checkpoint_notify_program = framework.Program() + checkpoint_notify_block = checkpoint_notify_program.global_block() + + attrs = {} + attrs['epmap'] = ps_endpoint_list + attrs['dir'] = cur_dir + attrs['lookup_table'] = lookup_table + + checkpoint_notify_block.append_op( + type='checkpoint_notify', inputs={}, outputs={}, attrs=attrs) + executor.run(checkpoint_notify_program) + + +def _save_trainer_args(dirname, trainer_id, trainer_args): + assert isinstance(trainer_args, dict) + + cur_dir = _get_trainer_dir(dirname, trainer_id) + + for name, value in six.iteritems(trainer_args): + args_file = os.path.join(cur_dir, name) + with open(args_file, 'w') as f: + f.write(str(value)) + _write_success(cur_dir) + + +def _load_trainer_args(checkpoint_dir, serial, trainer_id, trainer_args): + """ + trainer will load some args from it's independent directory, + such as epoch_id and step_id. + + Args: + checkpoint_dir(str): The folder where all checkpoints are. + serial(int): The serial of checkpoint you would like to load. + trainer_id(int): current trainer id. + trainer_args(list): list about load trainer args + Return: + None + + Examples: + .. code-block:: python + + param_path = "./checkpoint/" + serial = 7 + trainer_id = 2 + trainer_args = ["epoch_id", "step_id"] + + _load_trainer_args(checkpoint_dir=param_path, serial=serial, + trainer_id=trainer_id, trainer_args=trainer_args) + """ + assert isinstance(trainer_args, list) + + cur_dir = _get_serial_dir(checkpoint_dir, serial) + cur_dir = _get_trainer_dir(cur_dir, trainer_id) + + ret_values = [] + + for arg in trainer_args: + cur_file = os.path.join(cur_dir, arg) + with open(cur_file, 'r') as f: + contents = f.read() + ret_values.append(contents.strip()) + return ret_values + + +def _is_checkpoint_var(var): + """ + the checkpoint will not save or load all the variables. + var type is FEED_MINIBATCH/FETCH_LIST/RAW or var name ends with @GRAD are discarded. + + : param var(Variable) + """ + if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \ + var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \ + var.desc.type() == core.VarDesc.VarType.RAW: + return False + # @GRAD are named for gradient variables, checkpoint will not save it. + if "@GRAD" in var.name: + return False + # .trainer_ are named for distribute train variables, checkpoint will not save it. + if ".trainer_" in var.name: + return False + + # .block is named for distribute train variables, checkpoint will not save it. + if ".block" in var.name: + return False + + return var.persistable + + +def _make_chekcpoint_dirs(dirs): + """ + _make_chekcpoint_dirs will makdir local directory directly, when the directory is exist, it will igore it. 
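    If the path exists but is a regular file, an ``OSError`` is raised instead.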
+ """ + assert dirs is not None + + if os.path.isfile(dirs): + raise OSError(errno.ENOTDIR, "dirs path shoule be a Directory.", dirs) + + if not os.path.isdir(dirs): + try: + os.makedirs(dirs) + except OSError as err: + if err.errno != errno.EEXIST: + raise err + + +def _get_dir_serial(dirname): + _, serial = dirname.split(CHECKPOINT_SEPARATOR) + + try: + serial_num = int(serial) + except ValueError: + serial_num = -1 + return serial_num + + +def _get_serial_dir(dirname, serial): + serial_folder = CHECKPOINT_PREFIX + CHECKPOINT_SEPARATOR + str(serial) + serial_dir = os.path.join(dirname, serial_folder) + _make_chekcpoint_dirs(serial_dir) + + return serial_dir + + +def _get_model_dir(dirname): + model_dir = os.path.join(dirname, MODEL_DIR) + _make_chekcpoint_dirs(model_dir) + return model_dir + + +def _get_lookuptable_dir(dirname): + lookuptable_dir = os.path.join(dirname, LOOKUP_TABLE_DIR) + _make_chekcpoint_dirs(lookuptable_dir) + return lookuptable_dir + + +def _get_trainer_dir(dirname, trainer_id): + trainer_folder = TRAINER_PREFIX + CHECKPOINT_SEPARATOR + str(trainer_id) + trainer_dir = os.path.join(dirname, trainer_folder) + _make_chekcpoint_dirs(trainer_dir) + return trainer_dir + + +def _scroll_delete(dirname, max_num_checkpoints=3): + dirs = os.listdir(dirname) + serial_map = {} + for serial in dirs: + serial_num = _get_dir_serial(serial) + serial_map[serial_num] = serial + + if len(list(serial_map.keys())) <= max_num_checkpoints: + return + + serials = list(serial_map.keys()) + serials.sort(reverse=True) + serials = serials[max_num_checkpoints:] + for serial in serials: + cur_dir = _get_serial_dir(dirname, serial) + try: + shutil.rmtree(cur_dir) + except OSError as err: + if err.errno != errno.ENOENT: + raise err + + +def _write_success(dirname): + """ + write an empty file named "_SUCCESS" in checkpoint dir, indicate this checkpoint is correct. + + : param dirname + """ + success_file = os.path.join(dirname, SUCCESS_MARK_FILENAME) + with open(success_file, 'a') as f: + now = time.ctime() + f.write(now) + + +def _get_latest_checkpoint_serial(checkpoint_dir): + """ + get the latest file in checkpoint directory, the _SUCCESS file must exist in the directory + + : param checkpoint_dir + """ + if not checkpoint_dir: + return -1 + + def has_success(checkpoint_dir, cur_dir): + """ + is _SUCCESS in this dir + """ + + serial = _get_dir_serial(cur_dir) + if serial == -1 or not os.path.isdir( + os.path.join(checkpoint_dir, cur_dir)): + return -1 + + success_path = os.path.join( + _get_serial_dir(checkpoint_dir, serial), MODEL_DIR, + SUCCESS_MARK_FILENAME) + if os.path.isfile(success_path): + return serial + + if not os.path.isdir(checkpoint_dir): + return -1 + + current_dir = -1 + dirs = os.listdir(checkpoint_dir) + for cur_dir in dirs: + success_num = has_success(checkpoint_dir, cur_dir) + if success_num > current_dir: + current_dir = success_num + return current_dir diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 0abbb6815123f8ba65b637b3f3accef91fe66ef8..bced5fd1d9c617ab614212c811e86422d65a2e56 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -18,6 +18,7 @@ import collections import contextlib import re import six +import traceback import numpy as np @@ -34,11 +35,12 @@ except ImportError as e: except Exception as e: raise e from . 
import unique_name +import os +PADDLE_ON_MODEL_CE = os.environ.get('PADDLE_ON_MODEL_CE', None) is not None __all__ = [ 'Program', 'Operator', - 'Parameter', 'default_startup_program', 'default_main_program', 'program_guard', @@ -489,7 +491,9 @@ class OpProtoHolder(object): def generated_op_attr_names(): return { core.op_proto_and_checker_maker.kOpRoleAttrName(), - core.op_proto_and_checker_maker.kOpRoleVarAttrName() + core.op_proto_and_checker_maker.kOpRoleVarAttrName(), + core.op_proto_and_checker_maker.kOpNameScopeAttrName(), + core.op_proto_and_checker_maker.kOpCreationCallstackAttrName() } @@ -572,6 +576,11 @@ class Operator(object): if role_var_name in op_attrs and len(op_attrs[role_var_name]) == 0: del op_attrs[role_var_name] + if not PADDLE_ON_MODEL_CE: + callstack_var_name = op_maker.kOpCreationCallstackAttrName() + op_attrs[callstack_var_name] = list( + reversed(traceback.format_stack()))[1:] + if len(self.desc.type()) != 0: return if type is None: @@ -1509,6 +1518,30 @@ class Program(object): self._op_role_var = [] self._current_role = OpRole.Forward + @contextlib.contextmanager + def _lr_schedule_guard(self): + """ + A with guard to set :code:`LRSched` :code:`OpRole` and + :code:`OpRoleVar` automatically. The :code:`OpRoleVar` is + set to the target learning rate. + + Notes: This is a very low level API. Users should not use it directly. + + + Examples: + + >>> p, g = backward(...) + >>> with program.lr_schedule_guard(): + >>> lr = lr * decay + """ + OpRole = core.op_proto_and_checker_maker.OpRole + self._current_role = OpRole.LRSched + # TODO(typhoonzero): how to set target learning rate var + self._op_role_var = [] + yield + self._op_role_var = [] + self._current_role = OpRole.Forward + def __str__(self): """ Get the protobuf debug string of this Program. diff --git a/python/paddle/fluid/inferencer.py b/python/paddle/fluid/inferencer.py index a9b94a20720615dbfca97749463f27dbc88ac64f..7bdd430f985bd0b3818f6ef305ce2d7d8976106b 100644 --- a/python/paddle/fluid/inferencer.py +++ b/python/paddle/fluid/inferencer.py @@ -12,101 +12,5 @@ # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import print_function - -import contextlib - -from . import core - -from . import executor -from . import framework -from . import io -from . import parallel_executor -from . import unique_name -from .trainer import check_and_get_place - -__all__ = ['Inferencer', ] - - -class Inferencer(object): - """ - Inferencer High Level API. - - Args: - infer_func (Python func): Infer function that will return predict Variable - param_path (str): The path where the inference model is saved by fluid.io.save_params - place (Place): place to do the inference - parallel (bool): use parallel_executor to run the inference, it will use multi CPU/GPU. - - Examples: - .. 
code-block:: python - - def inference_program(): - x = fluid.layers.data(name='x', shape=[13], dtype='float32') - y_predict = fluid.layers.fc(input=x, size=1, act=None) - return y_predict - - place = fluid.CPUPlace() - inferencer = fluid.Inferencer( - infer_func=inference_program, param_path="/tmp/model", place=place) - - """ - - def __init__(self, infer_func, param_path, place=None, parallel=False): - self.param_path = param_path - self.scope = core.Scope() - self.parallel = parallel - self.place = check_and_get_place(place) - - self.inference_program = framework.Program() - with framework.program_guard(self.inference_program): - with unique_name.guard(): - self.predict_var = infer_func() - - with self._prog_and_scope_guard(): - # load params from param_path into scope - io.load_params(executor.Executor(self.place), param_path) - - if parallel: - with self._prog_and_scope_guard(): - self.exe = parallel_executor.ParallelExecutor( - use_cuda=isinstance(self.place, core.CUDAPlace), - loss_name=self.predict_var.name) - else: - self.exe = executor.Executor(self.place) - - self.inference_program = self.inference_program.clone(for_test=True) - - def infer(self, inputs, return_numpy=True): - """ - Do Inference for Inputs - - Args: - inputs (map): a map of {"input_name": input_var} that will be feed into the inference program - return_numpy (bool): transform return value into numpy or not - - Returns: - Tensor or Numpy: the predict value of the inference model for the inputs - - Examples: - .. code-block:: python - - tensor_x = numpy.random.uniform(0, 10, [batch_size, 13]).astype("float32") - results = inferencer.infer({'x': tensor_x}) - """ - if not isinstance(inputs, dict): - raise ValueError( - "inputs should be a map of {'input_name': input_var}") - - with self._prog_and_scope_guard(): - results = self.exe.run(feed=inputs, - fetch_list=[self.predict_var.name], - return_numpy=return_numpy) - - return results - - @contextlib.contextmanager - def _prog_and_scope_guard(self): - with framework.program_guard(main_program=self.inference_program): - with executor.scope_guard(self.scope): - yield +# NOTE: inferencer is moved into fluid.contrib.inferencer. +__all__ = [] diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index 7a7a0078a557c47492a4396897aafabe6c9c5dcb..a26b8df5a240be8340597b9627866c323fa98a2d 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -74,7 +74,7 @@ class Initializer(object): directly, but need to use one of its implementations. """ - def __init_(self): + def __init__(self): pass def __call__(self, param, block): @@ -293,7 +293,7 @@ class TruncatedNormalInitializer(Initializer): assert loc is not None assert scale is not None assert seed is not None - super(NormalInitializer, self).__init__() + super(TruncatedNormalInitializer, self).__init__() self._mean = loc self._std_dev = scale self._seed = seed diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 78bb8a1a0a64631cbe2adc11b1494ceed6d14908..604f3eacd75beff306915b224b30c369dd3a486f 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -27,8 +27,7 @@ from . 
import core __all__ = [ 'save_vars', 'save_params', 'save_persistables', 'load_vars', 'load_params', - 'load_persistables', 'save_inference_model', 'load_inference_model', - 'get_inference_program' + 'load_persistables', 'save_inference_model', 'load_inference_model' ] @@ -504,23 +503,6 @@ def load_persistables(executor, dirname, main_program=None, filename=None): filename=filename) -def get_inference_program(target_vars, main_program=None): - if main_program is None: - main_program = default_main_program() - if not isinstance(target_vars, list): - target_vars = [target_vars] - vars = [] - for var in target_vars: - if isinstance(var, Evaluator): - vars.extend(var.states) - vars.extend(var.metrics) - else: - vars.append(var) - pruned_program = main_program._prune(targets=vars) - inference_program = pruned_program._inference_optimize() - return inference_program - - def prepend_feed_ops(inference_program, feed_target_names, feed_holder_name='feed'): @@ -618,7 +600,7 @@ def save_inference_model(dirname, """ if isinstance(feeded_var_names, six.string_types): feeded_var_names = [feeded_var_names] - else: + elif export_for_deployment: if len(feeded_var_names) > 0: # TODO(paddle-dev): polish these code blocks if not (bool(feeded_var_names) and all( @@ -628,61 +610,60 @@ def save_inference_model(dirname, if isinstance(target_vars, Variable): target_vars = [target_vars] - else: + elif export_for_deployment: if not (bool(target_vars) and all( isinstance(var, Variable) for var in target_vars)): raise ValueError("'target_vars' should be a list of Variable.") if main_program is None: main_program = default_main_program() - copy_program = main_program.clone() + + # if there is lookup table, the trainer 0 will notify all pserver to save. + if main_program._is_distributed and main_program._is_chief and main_program._distributed_lookup_table: + lookup_table_filename = os.path.join(dirname, "__lookup_table__") + _save_lookup_tables_by_notify(executor, lookup_table_filename, + main_program._distributed_lookup_table, + main_program._endpoints) if not os.path.isdir(dirname): os.makedirs(dirname) + if model_filename is not None: + model_basename = os.path.basename(model_filename) + else: + model_basename = "__model__" + model_basename = os.path.join(dirname, model_basename) # When export_for_deployment is true, we modify the program online so that # it can only be loaded for inference directly. If it's false, the whole # original program and related meta are saved so that future usage can be # more flexible. 
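    # In short: when export_for_deployment is True the program is cloned, pruned to
    # the given target_vars, wrapped with feed/fetch ops and written to model_basename
    # (by default "<dirname>/__model__"); otherwise the whole program desc is
    # serialized to "<model_basename>.main_program" for more flexible later use.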
if export_for_deployment: - global_block = copy_program.global_block() + main_program = main_program.clone() + global_block = main_program.global_block() for i, op in enumerate(global_block.ops): op.desc.set_is_target(False) if op.type == "feed" or op.type == "fetch": global_block._remove_op(i) - copy_program.desc.flush() + main_program.desc.flush() - pruned_program = copy_program._prune(targets=target_vars) - saved_program = pruned_program._inference_optimize(prune_read_op=True) + main_program = main_program._prune(targets=target_vars) + main_program = main_program._inference_optimize(prune_read_op=True) fetch_var_names = [v.name for v in target_vars] - prepend_feed_ops(saved_program, feeded_var_names) - append_fetch_ops(saved_program, fetch_var_names) + prepend_feed_ops(main_program, feeded_var_names) + append_fetch_ops(main_program, fetch_var_names) + + with open(model_basename, "wb") as f: + f.write(main_program.desc.serialize_to_string()) else: # TODO(panyx0718): Save more information so that it can also be used # for training and more flexible post-processing. - saved_program = copy_program - - if model_filename is not None: - model_filename = os.path.basename(model_filename) - else: - model_filename = "__model__" - model_filename = os.path.join(dirname, model_filename) + with open(model_basename + ".main_program", "wb") as f: + f.write(main_program.desc.serialize_to_string()) if params_filename is not None: params_filename = os.path.basename(params_filename) - - with open(model_filename, "wb") as f: - f.write(saved_program.desc.serialize_to_string()) - - save_persistables(executor, dirname, saved_program, params_filename) - - # if there is lookup table, the trainer 0 will notify all pserver to save. - if main_program._is_distributed and main_program._is_chief and main_program._distributed_lookup_table: - lookup_table_filename = os.path.join(dirname, "__lookup_table__") - _save_lookup_tables_by_notify(executor, lookup_table_filename, - main_program._distributed_lookup_table, - main_program._endpoints) + save_persistables(executor, dirname, main_program, params_filename) def load_inference_model(dirname, diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index d56fa76300e7054ef71a7729483a579fa35f1dac..75c29b12724d53783b9748d6df066c52bd232482 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -311,6 +311,7 @@ def _copy_reader_var_(block, var): new_var = block.create_var(name=var.name, type=core.VarDesc.VarType.READER) new_var.desc.set_shapes(var.desc.shapes()) new_var.desc.set_dtypes(var.desc.dtypes()) + new_var.desc.set_lod_levels(var.desc.lod_levels()) new_var.persistable = True return new_var @@ -632,6 +633,7 @@ def py_reader(capacity, }) startup_var.desc.set_dtypes(dtypes) + startup_var.desc.set_lod_levels(lod_levels) startup_var.persistable = True main_prog_var = _copy_reader_var_(default_main_program().current_block(), diff --git a/python/paddle/fluid/layers/layer_function_generator.py b/python/paddle/fluid/layers/layer_function_generator.py index 8963d74de014d69c590276d5ff7080111f614230..8c11921d9bde0920f33368837302d39f36f45556 100644 --- a/python/paddle/fluid/layers/layer_function_generator.py +++ b/python/paddle/fluid/layers/layer_function_generator.py @@ -23,7 +23,10 @@ from ..proto import framework_pb2 from ..framework import OpProtoHolder, Variable from ..layer_helper import LayerHelper -__all__ = ['deprecated', 'generate_layer_fn', 'autodoc', 'templatedoc'] +__all__ = [ + 'deprecated', 'generate_layer_fn', 
'generate_layer_fn_noattr', 'autodoc', + 'templatedoc' +] def _convert_(name): @@ -58,7 +61,7 @@ def escape_math(text): _two_dollar_pattern_.sub(r"!!\1!!", text))) -def _generate_doc_string_(op_proto): +def _generate_doc_string_(op_proto, additional_args_lines=None): """ Generate docstring by OpProto @@ -98,6 +101,13 @@ def _generate_doc_string_(op_proto): buf.write(escape_math(each_attr.comment)) buf.write('\n') + if additional_args_lines is not None: + for line in additional_args_lines: + line = line.strip() + buf.write(' ') + buf.write(line) + buf.write('\n') + if len(op_proto.outputs) != 0: buf.write('\nReturns:\n') buf.write(' ') @@ -205,6 +215,29 @@ def generate_layer_fn(op_type): return func +def generate_layer_fn_noattr(op_type): + """Register the Python layer for an Operator without Attribute. + + Args: + op_type: The name of the operator to be created. + + This function takes in the operator type (sigmoid, exp , tanh etc) and + creates the operator functionality. + + """ + op_proto = OpProtoHolder.instance().get_op_proto(op_type) + + def func(x, name=None): + helper = LayerHelper(op_type, **locals()) + output = helper.create_tmp_variable(dtype=x.dtype) + helper.append_op(type=op_type, inputs={"X": x}, outputs={"Out": output}) + return output + + func.__name__ = op_type + func.__doc__ = _generate_doc_string_(op_proto) + return func + + def deprecated(func_or_class): """ Deprecated warning decorator. It will result a warning message. diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py index be368007dd7061ba7fc97414dbadfce00d158776..dfd801a098d6451dbdb20d9ba44187d1e3f8a91a 100644 --- a/python/paddle/fluid/layers/learning_rate_scheduler.py +++ b/python/paddle/fluid/layers/learning_rate_scheduler.py @@ -27,7 +27,7 @@ from . import nn from . import ops from . import tensor from ..initializer import init_on_cpu -from ..framework import default_main_program, Parameter +from ..framework import default_main_program, Parameter, unique_name __all__ = [ 'exponential_decay', 'natural_exp_decay', 'inverse_time_decay', @@ -63,11 +63,12 @@ def noam_decay(d_model, warmup_steps): Returns: The decayed learning rate. 
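    As a sketch of the value computed below (``step`` is the global step counter):

    .. code-block:: python

        # noam schedule, matching the ops appended below
        lr = d_model ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)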
""" - global_step = _decay_step_counter(1) + with default_main_program()._lr_schedule_guard(): + global_step = _decay_step_counter(1) - a = global_step**-0.5 - b = (warmup_steps**-1.5) * global_step - lr_value = (d_model**-0.5) * ops.elementwise_min(a, b) + a = global_step**-0.5 + b = (warmup_steps**-1.5) * global_step + lr_value = (d_model**-0.5) * nn.elementwise_min(a, b) return lr_value @@ -108,14 +109,15 @@ def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False): sgd_optimizer.minimize(avg_cost) """ - global_step = _decay_step_counter() + with default_main_program()._lr_schedule_guard(): + global_step = _decay_step_counter() - div_res = global_step / decay_steps - if staircase: - div_res = ops.floor(div_res) - decayed_lr = learning_rate * (decay_rate**div_res) + div_res = global_step / decay_steps + if staircase: + div_res = ops.floor(div_res) + decayed_lr = learning_rate * (decay_rate**div_res) - return decayed_lr + return decayed_lr def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False): @@ -136,14 +138,15 @@ def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False): Returns: The decayed learning rate """ - global_step = _decay_step_counter() + with default_main_program()._lr_schedule_guard(): + global_step = _decay_step_counter() - div_res = global_step / decay_steps - if staircase: - div_res = ops.floor(div_res) - decayed_lr = learning_rate * ops.exp(-1 * decay_rate * div_res) + div_res = global_step / decay_steps + if staircase: + div_res = ops.floor(div_res) + decayed_lr = learning_rate * ops.exp(-1 * decay_rate * div_res) - return decayed_lr + return decayed_lr def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False): @@ -181,15 +184,16 @@ def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False): staircase=True)) sgd_optimizer.minimize(avg_cost) """ - global_step = _decay_step_counter() + with default_main_program()._lr_schedule_guard(): + global_step = _decay_step_counter() - div_res = global_step / decay_steps - if staircase: - div_res = ops.floor(div_res) + div_res = global_step / decay_steps + if staircase: + div_res = ops.floor(div_res) - decayed_lr = learning_rate / (1 + decay_rate * div_res) + decayed_lr = learning_rate / (1 + decay_rate * div_res) - return decayed_lr + return decayed_lr def polynomial_decay(learning_rate, @@ -220,25 +224,28 @@ def polynomial_decay(learning_rate, Returns: Variable: The decayed learning rate """ - global_step = _decay_step_counter() - - if cycle: - div_res = ops.ceil(global_step / decay_steps) - zero_var = tensor.fill_constant(shape=[1], dtype='float32', value=0.0) - one_var = tensor.fill_constant(shape=[1], dtype='float32', value=1.0) - - with control_flow.Switch() as switch: - with switch.case(global_step == zero_var): - tensor.assign(input=one_var, output=div_res) - decay_steps = decay_steps * div_res - else: - decay_steps_var = tensor.fill_constant( - shape=[1], dtype='float32', value=float(decay_steps)) - global_step = ops.elementwise_min(x=global_step, y=decay_steps_var) + with default_main_program()._lr_schedule_guard(): + global_step = _decay_step_counter() + + if cycle: + div_res = ops.ceil(global_step / decay_steps) + zero_var = tensor.fill_constant( + shape=[1], dtype='float32', value=0.0) + one_var = tensor.fill_constant( + shape=[1], dtype='float32', value=1.0) + + with control_flow.Switch() as switch: + with switch.case(global_step == zero_var): + tensor.assign(input=one_var, output=div_res) + decay_steps = 
decay_steps * div_res + else: + decay_steps_var = tensor.fill_constant( + shape=[1], dtype='float32', value=float(decay_steps)) + global_step = nn.elementwise_min(x=global_step, y=decay_steps_var) - decayed_lr = (learning_rate - end_learning_rate) * \ - ((1 - global_step / decay_steps) ** power) + end_learning_rate - return decayed_lr + decayed_lr = (learning_rate - end_learning_rate) * \ + ((1 - global_step / decay_steps) ** power) + end_learning_rate + return decayed_lr def piecewise_decay(boundaries, values): @@ -266,34 +273,36 @@ def piecewise_decay(boundaries, values): """ + with default_main_program()._lr_schedule_guard(): + if len(values) - len(boundaries) != 1: + raise ValueError("len(values) - len(boundaries) should be 1") - if len(values) - len(boundaries) != 1: - raise ValueError("len(values) - len(boundaries) should be 1") - - global_step = _decay_step_counter() + global_step = _decay_step_counter() - lr = tensor.create_global_var( - shape=[1], - value=0.0, - dtype='float32', - persistable=True, - name="learning_rate") + lr = tensor.create_global_var( + shape=[1], + value=0.0, + dtype='float32', + persistable=True, + name="learning_rate") - with control_flow.Switch() as switch: - for i in range(len(boundaries)): - boundary_val = tensor.fill_constant( + with control_flow.Switch() as switch: + for i in range(len(boundaries)): + boundary_val = tensor.fill_constant( + shape=[1], + dtype='float32', + value=float(boundaries[i]), + force_cpu=True) + value_var = tensor.fill_constant( + shape=[1], dtype='float32', value=float(values[i])) + with switch.case(global_step < boundary_val): + tensor.assign(value_var, lr) + last_value_var = tensor.fill_constant( shape=[1], dtype='float32', - value=float(boundaries[i]), - force_cpu=True) - value_var = tensor.fill_constant( - shape=[1], dtype='float32', value=float(values[i])) - with switch.case(global_step < boundary_val): - tensor.assign(value_var, lr) - last_value_var = tensor.fill_constant( - shape=[1], dtype='float32', value=float(values[len(values) - 1])) - with switch.default(): - tensor.assign(last_value_var, lr) + value=float(values[len(values) - 1])) + with switch.default(): + tensor.assign(last_value_var, lr) return lr diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index f896cfa04b3e0d89daaa1bd7fd893b5892a09a4e..2cb61a9cd25c744710ab7ac9ea591902740f78da 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -20,9 +20,9 @@ from __future__ import print_function import numpy as np from ..layer_helper import LayerHelper from ..initializer import Normal, Constant -from ..framework import Variable +from ..framework import Variable, OpProtoHolder from ..param_attr import ParamAttr -from .layer_function_generator import autodoc, templatedoc +from .layer_function_generator import autodoc, templatedoc, _generate_doc_string_ from .tensor import concat from . import utils from .. 
import unique_name @@ -107,7 +107,16 @@ __all__ = [ 'log', 'crop', 'rank_loss', + 'elu', + 'relu6', + 'pow', + 'stanh', + 'hard_sigmoid', + 'swish', 'prelu', + 'brelu', + 'leaky_relu', + 'soft_relu', 'flatten', 'sequence_mask', 'stack', @@ -116,6 +125,14 @@ __all__ = [ 'sequence_enumerate', 'expand', 'sequence_concat', + 'scale', + 'elementwise_add', + 'elementwise_div', + 'elementwise_sub', + 'elementwise_mul', + 'elementwise_max', + 'elementwise_min', + 'elementwise_pow', ] @@ -3605,7 +3622,7 @@ def matmul(x, y, transpose_x=False, transpose_y=False, alpha=1.0, name=None): attrs={ 'transpose_X': transpose_x, 'transpose_Y': transpose_y, - 'alpha': alpha, + 'alpha': float(alpha), }) return out @@ -5895,6 +5912,148 @@ def pad2d(input, return out +@templatedoc() +def elu(x, alpha=1.0, name=None): + """ + ${comment} + Args: + x(${x_type}): ${x_comment} + alpha(${alpha_type}|1.0): ${alpha_comment} + name(str|None): A name for this layer(optional). If set None, the layer + will be named automatically. + + Returns: + output(${out_type}): ${out_comment} + """ + helper = LayerHelper('elu', **locals()) + out = helper.create_tmp_variable(dtype=x.dtype) + helper.append_op( + type='elu', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'alpha': alpha}) + return out + + +@templatedoc() +def relu6(x, threshold=6.0, name=None): + """ + ${comment} + Args: + x(${x_type}): ${x_comment} + threshold(${threshold_type}|6.0): ${threshold_comment} + name(str|None): A name for this layer(optional). If set None, the layer + will be named automatically. + + Returns: + output(${out_type}): ${out_comment} + """ + helper = LayerHelper('relu6', **locals()) + out = helper.create_tmp_variable(dtype=x.dtype) + helper.append_op( + type='relu6', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'threshold': threshold}) + return out + + +@templatedoc() +def pow(x, factor=1.0, name=None): + """ + ${comment} + Args: + x(${x_type}): ${x_comment} + factor(${factor_type}|1.0): ${factor_comment} + name(str|None): A name for this layer(optional). If set None, the layer + will be named automatically. + + Returns: + output(${out_type}): ${out_comment} + """ + helper = LayerHelper('pow', **locals()) + out = helper.create_tmp_variable(dtype=x.dtype) + helper.append_op( + type='pow', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'factor': factor}) + return out + + +@templatedoc() +def stanh(x, scale_a=2.0 / 3.0, scale_b=1.7159, name=None): + """ + ${comment} + Args: + x(${x_type}): ${x_comment} + scale_a(${scale_a_type}|2.0 / 3.0): ${scale_a_comment} + scale_b(${scale_b_type}|1.7159): ${scale_b_comment} + name(str|None): A name for this layer(optional). If set None, the layer + will be named automatically. + + Returns: + output(${out_type}): ${out_comment} + """ + helper = LayerHelper('stanh', **locals()) + out = helper.create_tmp_variable(dtype=x.dtype) + helper.append_op( + type='stanh', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'scale_a': scale_a, + 'scale_b': scale_b}) + return out + + +@templatedoc() +def hard_sigmoid(x, slope=0.2, offset=0.5, name=None): + """ + ${comment} + Args: + x(${x_type}): ${x_comment} + slope(${slope_type}|0.2): ${slope_comment} + offset(${offset_type}|0.5): ${offset_comment} + name(str|None): A name for this layer(optional). If set None, the layer + will be named automatically. 
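    Note:
        ``hard_sigmoid`` is the usual piecewise linear approximation of sigmoid; as a
        sketch, ``out = max(0, min(1, slope * x + offset))`` (the exact semantics come
        from the underlying ``hard_sigmoid`` operator).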
+ + Returns: + output(${out_type}): ${out_comment} + """ + helper = LayerHelper('hard_sigmoid', **locals()) + out = helper.create_tmp_variable(dtype=x.dtype) + helper.append_op( + type='hard_sigmoid', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'slope': slope, + 'offset': offset}) + return out + + +@templatedoc() +def swish(x, beta=1.0, name=None): + """ + ${comment} + Args: + x(${x_type}): ${x_comment} + beta(${beta_type}|1.0): ${beta_comment} + name(str|None): A name for this layer(optional). If set None, the layer + will be named automatically. + + Returns: + output(${out_type}): ${out_comment} + """ + helper = LayerHelper('swish', **locals()) + out = helper.create_tmp_variable(dtype=x.dtype) + helper.append_op( + type='swish', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'slope': beta}) + return out + + def prelu(x, mode, param_attr=None, name=None): """ Equation: @@ -5948,6 +6107,74 @@ def prelu(x, mode, param_attr=None, name=None): return out +@templatedoc() +def brelu(x, t_min=0.0, t_max=24.0, name=None): + """ + ${comment} + Args: + x(${x_type}): ${x_comment} + t_min(${t_min_type}|0.0): ${t_min_comment} + t_max(${t_max_type}|24.0): ${t_max_comment} + name(str|None): A name for this layer(optional). If set None, the layer + will be named automatically. + Returns: + output(${out_type}): ${out_comment} + """ + helper = LayerHelper('brelu', **locals()) + out = helper.create_tmp_variable(dtype=x.dtype) + helper.append_op( + type='brelu', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'t_min': t_min, + 't_max': t_max}) + return out + + +@templatedoc() +def leaky_relu(x, alpha=0.02, name=None): + """ + ${comment} + Args: + x(${x_type}): ${x_comment} + alpha(${alpha_type}|0.02): ${alpha_comment} + name(str|None): A name for this layer(optional). If set None, the layer + will be named automatically. + Returns: + output(${out_type}): ${out_comment} + """ + helper = LayerHelper('leaky_relu', **locals()) + out = helper.create_tmp_variable(dtype=x.dtype) + helper.append_op( + type='leaky_relu', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'alpha': alpha}) + return out + + +@templatedoc() +def soft_relu(x, threshold=40.0, name=None): + """ + ${comment} + Args: + x(${x_type}): ${x_comment} + threshold(${threshold_type}|40.0): ${threshold_comment} + name(str|None): A name for this layer(optional). If set None, the layer + will be named automatically. 
+ Returns: + output(${out_type}): ${out_comment} + """ + helper = LayerHelper('soft_relu', **locals()) + out = helper.create_tmp_variable(dtype=x.dtype) + helper.append_op( + type='soft_relu', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'threshold': threshold}) + return out + + def flatten(x, axis=1, name=None): """ **Flatten layer** @@ -6234,3 +6461,158 @@ def expand(x, expand_times, name=None): outputs={'Out': out}, attrs={'expand_times': expand_times}) return out + + +def _elementwise_op(helper): + op_type = helper.layer_type + x = helper.kwargs.get('x', None) + y = helper.kwargs.get('y', None) + assert x is not None, 'x cannot be None in {}'.format(op_type) + assert y is not None, 'y cannot be None in {}'.format(op_type) + axis = helper.kwargs.get('axis', -1) + use_mkldnn = helper.kwargs.get('use_mkldnn', False) + out = helper.kwargs.get('out', None) + if out is None: + name = helper.kwargs.get('name', None) + if name is None: + out = helper.create_tmp_variable(dtype=x.dtype) + else: + out = helper.create_variable( + name=name, dtype=x.dtype, persistable=False) + + helper.append_op( + type=op_type, + inputs={'X': x, + 'Y': y}, + outputs={'Out': out}, + attrs={'axis': axis, + 'use_mkldnn': use_mkldnn}) + return helper.append_activation(out) + + +@templatedoc() +def scale(x, + scale=1.0, + bias=0.0, + bias_after_scale=True, + out=None, + act=None, + name=None): + """ + ${comment} + + Args: + x(${x_type}): ${x_comment} + scale(${scale_type}): ${scale_comment} + bias(${bias_type}): ${bias_comment} + bias_after_scale(${bias_after_scale_type}): ${bias_after_scale_comment} + out(Tensor): Output tensor. + act(basestring|None): Activation applied to the output. + name(basestring|None): Name of the output. + + Returns: + out(${out_type}): ${out_comment} + """ + + helper = LayerHelper('scale', **locals()) + if out is None: + if name is None: + out = helper.create_tmp_variable(dtype=x.dtype) + else: + out = helper.create_variable( + name=name, dtype=x.dtype, persistable=False) + + helper.append_op( + type='scale', + inputs={'X': x}, + outputs={'Out': out}, + attrs={ + 'scale': float(scale), + 'bias': float(bias), + 'bias_after_scale': bias_after_scale + }) + return helper.append_activation(out) + + +def elementwise_add(x, + y, + out=None, + axis=-1, + use_mkldnn=False, + act=None, + name=None): + return _elementwise_op(LayerHelper('elementwise_add', **locals())) + + +def elementwise_div(x, + y, + out=None, + axis=-1, + use_mkldnn=False, + act=None, + name=None): + return _elementwise_op(LayerHelper('elementwise_div', **locals())) + + +def elementwise_sub(x, + y, + out=None, + axis=-1, + use_mkldnn=False, + act=None, + name=None): + return _elementwise_op(LayerHelper('elementwise_sub', **locals())) + + +def elementwise_mul(x, + y, + out=None, + axis=-1, + use_mkldnn=False, + act=None, + name=None): + return _elementwise_op(LayerHelper('elementwise_mul', **locals())) + + +def elementwise_max(x, + y, + out=None, + axis=-1, + use_mkldnn=False, + act=None, + name=None): + return _elementwise_op(LayerHelper('elementwise_max', **locals())) + + +def elementwise_min(x, + y, + out=None, + axis=-1, + use_mkldnn=False, + act=None, + name=None): + return _elementwise_op(LayerHelper('elementwise_min', **locals())) + + +def elementwise_pow(x, + y, + out=None, + axis=-1, + use_mkldnn=False, + act=None, + name=None): + return _elementwise_op(LayerHelper('elementwise_pow', **locals())) + + +for func in [ + elementwise_add, elementwise_div, elementwise_sub, elementwise_mul, + elementwise_max, 
elementwise_min, elementwise_pow +]: + op_proto = OpProtoHolder.instance().get_op_proto(func.__name__) + func.__doc__ = _generate_doc_string_( + op_proto, + additional_args_lines=[ + "out (Tensor): The output tensor of elementwise op.", + "act (basestring|None): Activation applied to the output.", + "name (basestring|None): Name of the output." + ]) diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py index 129252653dc139b7405626e6fd410704a4ad06d9..7867bfe00e25711643eab1ab8d0141dbbad3da52 100644 --- a/python/paddle/fluid/layers/ops.py +++ b/python/paddle/fluid/layers/ops.py @@ -13,9 +13,9 @@ # limitations under the License. from __future__ import print_function -from .layer_function_generator import generate_layer_fn +from .layer_function_generator import generate_layer_fn, generate_layer_fn_noattr -__activations__ = [ +__activations_noattr__ = [ 'sigmoid', 'logsigmoid', 'exp', @@ -33,29 +33,12 @@ __activations__ = [ 'square', 'softplus', 'softsign', - 'brelu', - 'leaky_relu', - 'soft_relu', - 'elu', - 'relu6', - 'pow', - 'stanh', - 'hard_sigmoid', - 'swish', ] __all__ = [ 'mean', 'mul', - 'scale', 'sigmoid_cross_entropy_with_logits', - 'elementwise_add', - 'elementwise_div', - 'elementwise_sub', - 'elementwise_mul', - 'elementwise_max', - 'elementwise_min', - 'elementwise_pow', 'clip', 'clip_by_norm', 'logical_and', @@ -70,11 +53,21 @@ __all__ = [ 'slice', 'shape', 'maxout', -] + __activations__ +] for _OP in set(__all__): globals()[_OP] = generate_layer_fn(_OP) +# It is a hot fix in some unittest using: +# fluid.layers.scale(x=x, scale=10.0, out=out_var) +# e.g.: test_program_code.py, test_dist_train.py +globals()['_scale'] = generate_layer_fn('scale') + +__all__ += __activations_noattr__ + +for _OP in set(__activations_noattr__): + globals()[_OP] = generate_layer_fn_noattr(_OP) + __all__ += ["uniform_random"] _uniform_random_ = generate_layer_fn('uniform_random') diff --git a/python/paddle/fluid/nets.py b/python/paddle/fluid/nets.py index 051fe84364639ca6028326c0cb02b204a02531af..06513801dd8b34d366f9632f6943c8046872c31b 100644 --- a/python/paddle/fluid/nets.py +++ b/python/paddle/fluid/nets.py @@ -21,6 +21,7 @@ __all__ = [ "sequence_conv_pool", "glu", "scaled_dot_product_attention", + "img_conv_group", ] diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index 44af29d3390e35129d0ee65b31eacad6b28a9d60..57d272cbfb948840679e80e8db40379c57603113 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -74,28 +74,7 @@ class ParallelExecutor(object): build_strategy=None, num_trainers=1, trainer_id=0, - scope=None, - **kwargs): - if len(kwargs) != 0: - err_msg = "" - for key in kwargs: - if key in dir(ExecutionStrategy): - err_msg += \ - "Setting {0} by constructor is deprecated. Use " \ - "strategy=ExecutionStrategy(); strategy.{0}=xxx; " \ - "pe=ParallelExecutor(exec_strategy=strategy) " \ - "instead.\n ".format(key) - elif key in dir(BuildStrategy): - err_msg += \ - "Setting {0} by constructor is deprecated. Use " \ - "strategy=BuildStrategy(); See help(" \ - "paddle.fluid.ParallelExecutor.BuildStrategy) \n".format( - key) - else: - err_msg += "Setting {0} by constructor is deprecated. 
Use strategy.\n".format( - key) - raise ValueError(err_msg) - + scope=None): self._places = [] self._act_places = [] if use_cuda: diff --git a/python/paddle/fluid/param_attr.py b/python/paddle/fluid/param_attr.py index f0be794327f51cbbc4202b8b7b401b712b6d66a3..a51607bfdb1dde3d25f490770cc2ba368ceb27ff 100644 --- a/python/paddle/fluid/param_attr.py +++ b/python/paddle/fluid/param_attr.py @@ -185,7 +185,17 @@ class WeightNormParamAttr(ParamAttr): Args: dim(list): The parameter's name. Default None. - kwargs: Any field in ParamAttr. Default None. + name(str): The parameter's name. Default None. + initializer(Initializer): The method to initial this parameter. Default None. + learning_rate(float): The parameter's learning rate. The learning rate when + optimize is :math:`global\_lr * parameter\_lr * scheduler\_factor`. + Default 1.0. + regularizer(WeightDecayRegularizer): Regularization factor. Default None. + trainable(bool): Whether this parameter is trainable. Default True. + gradient_clip(BaseGradientClipAttr): The method to clip this parameter's + gradient. Default None. + do_model_average(bool): Whether this parameter should do model average. + Default False. Examples: .. code-block:: python @@ -204,6 +214,21 @@ class WeightNormParamAttr(ParamAttr): # these paramters for inference. params_with_weight_norm = [] - def __init__(self, dim=None, **kwargs): - super(WeightNormParamAttr, self).__init__(**kwargs) + def __init__(self, + dim=None, + name=None, + initializer=None, + learning_rate=1.0, + regularizer=None, + trainable=True, + gradient_clip=None, + do_model_average=False): + super(WeightNormParamAttr, self).__init__( + name=name, + initializer=initializer, + learning_rate=learning_rate, + regularizer=regularizer, + trainable=trainable, + gradient_clip=gradient_clip, + do_model_average=do_model_average) self.dim = dim diff --git a/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py b/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py index e1368a3392a9cab3e82eff0a73eb225a52aa03bf..87f3b7502e26d3e6a437985f99d7897b060e101e 100644 --- a/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py +++ b/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py @@ -16,6 +16,16 @@ from __future__ import print_function import paddle import paddle.fluid as fluid +import sys +try: + from paddle.fluid.contrib.trainer import * + from paddle.fluid.contrib.inferencer import * +except ImportError: + print( + "In the fluid 1.0, the trainer and inferencer are moving to paddle.fluid.contrib", + file=sys.stderr) + from paddle.fluid.trainer import * + from paddle.fluid.inferencer import * import contextlib import numpy import unittest @@ -57,11 +67,11 @@ def optimizer_func(): def train(use_cuda, train_program, params_dirname, inference_model_dirname): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - trainer = fluid.Trainer( + trainer = Trainer( train_func=train_program, place=place, optimizer_func=optimizer_func) def event_handler(event): - if isinstance(event, fluid.EndStepEvent): + if isinstance(event, EndStepEvent): if event.step == 10: test_metrics = trainer.test( reader=test_reader, feed_order=['x', 'y']) @@ -91,7 +101,7 @@ def infer(use_cuda, inference_program, params_dirname=None): return place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - inferencer = fluid.Inferencer( + inferencer = Inferencer( infer_func=inference_program, param_path=params_dirname, place=place) batch_size = 10 diff --git 
a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py index de276755bb1eb2746cc780575a40357255223809..d744a00242422defb360590b193e07c6f811dcb9 100644 --- a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py +++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py @@ -14,11 +14,22 @@ from __future__ import print_function +import sys + import paddle import paddle.fluid as fluid + +try: + from paddle.fluid.contrib.trainer import * + from paddle.fluid.contrib.inferencer import * +except ImportError: + print( + "In the fluid 1.0, the trainer and inferencer are moving to paddle.fluid.contrib", + file=sys.stderr) + from paddle.fluid.trainer import * + from paddle.fluid.inferencer import * import paddle.fluid.core as core import numpy -import six import os import cifar10_small_test_set @@ -106,7 +117,7 @@ def train(use_cuda, train_program, parallel, params_dirname): paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE, drop_last=False) def event_handler(event): - if isinstance(event, fluid.EndStepEvent): + if isinstance(event, EndStepEvent): avg_cost, accuracy = trainer.test( reader=test_reader, feed_order=['pixel', 'label']) @@ -118,7 +129,7 @@ def train(use_cuda, train_program, parallel, params_dirname): return place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - trainer = fluid.Trainer( + trainer = Trainer( train_func=train_program, optimizer_func=optimizer_func, place=place, @@ -133,7 +144,7 @@ def train(use_cuda, train_program, parallel, params_dirname): def infer(use_cuda, inference_program, parallel, params_dirname=None): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - inferencer = fluid.Inferencer( + inferencer = Inferencer( infer_func=inference_program, param_path=params_dirname, place=place, diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py index dd547f3448ae55c07b6c09f9de4ac08d8ec5ee88..82294d4b26fe64e6cddc81f9ba3480caf5b51620 100644 --- a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py +++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py @@ -14,11 +14,22 @@ from __future__ import print_function +import sys + import paddle import paddle.fluid as fluid + +try: + from paddle.fluid.contrib.trainer import * + from paddle.fluid.contrib.inferencer import * +except ImportError: + print( + "In the fluid 1.0, the trainer and inferencer are moving to paddle.fluid.contrib", + file=sys.stderr) + from paddle.fluid.trainer import * + from paddle.fluid.inferencer import * import paddle.fluid.core as core import numpy -import six import os import cifar10_small_test_set @@ -83,7 +94,7 @@ def train(use_cuda, train_program, parallel, params_dirname): paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE, drop_last=False) def event_handler(event): - if isinstance(event, fluid.EndStepEvent): + if isinstance(event, EndStepEvent): avg_cost, accuracy = trainer.test( reader=test_reader, feed_order=['pixel', 'label']) @@ -95,7 +106,7 @@ def train(use_cuda, train_program, parallel, params_dirname): return place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - 
trainer = fluid.Trainer( + trainer = Trainer( train_func=train_program, place=place, optimizer_func=optimizer_func, @@ -110,7 +121,7 @@ def train(use_cuda, train_program, parallel, params_dirname): def infer(use_cuda, inference_program, parallel, params_dirname=None): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - inferencer = fluid.Inferencer( + inferencer = Inferencer( infer_func=inference_program, param_path=params_dirname, place=place, diff --git a/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py b/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py index ec4e1c768c7f2a2421ac409a2eecc0100c086a6a..9e155a59145db88dab27576a4a67a5d450bcfc9d 100755 --- a/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py +++ b/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py @@ -16,6 +16,16 @@ from __future__ import print_function import paddle import paddle.fluid as fluid +import sys +try: + from paddle.fluid.contrib.trainer import * + from paddle.fluid.contrib.inferencer import * +except ImportError: + print( + "In the fluid 1.0, the trainer and inferencer are moving to paddle.fluid.contrib", + file=sys.stderr) + from paddle.fluid.trainer import * + from paddle.fluid.inferencer import * import numpy as np WORD_DICT, VERB_DICT, LABEL_DICT = paddle.dataset.conll05.get_dict() @@ -149,7 +159,7 @@ def optimize_func(): def train(use_cuda, train_program, params_dirname): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - trainer = fluid.Trainer( + trainer = Trainer( train_func=train_program, place=place, optimizer_func=optimize_func) feed_order = [ @@ -164,7 +174,7 @@ def train(use_cuda, train_program, params_dirname): # place) def event_handler(event): - if isinstance(event, fluid.EndEpochEvent): + if isinstance(event, EndEpochEvent): test_reader = paddle.batch( paddle.dataset.conll05.test(), batch_size=BATCH_SIZE) avg_cost_set = trainer.test( @@ -184,7 +194,7 @@ def train(use_cuda, train_program, params_dirname): if math.isnan(float(avg_cost)): sys.exit("got NaN loss, training failed.") - elif isinstance(event, fluid.EndStepEvent): + elif isinstance(event, EndStepEvent): print("Step {0}, Epoch {1} Metrics {2}".format( event.step, event.epoch, list(map(np.array, event.metrics)))) if event.step == 1: # Run 2 iterations to speed CI @@ -204,7 +214,7 @@ def train(use_cuda, train_program, params_dirname): def infer(use_cuda, inference_program, params_dirname): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - inferencer = fluid.Inferencer( + inferencer = Inferencer( inference_program, param_path=params_dirname, place=place) # Setup input by creating LoDTensor to represent sequence of words. diff --git a/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py b/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py index 560f1189581f631dc6a3470cf8f22f902ca26f26..b597dcf801dc5ad4b5957875634018cfdcd0b83b 100644 --- a/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py +++ b/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py @@ -13,17 +13,28 @@ # limitations under the License. 
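As a usage note for the layer changes earlier in this patch (python/paddle/fluid/layers/nn.py): the activations and element-wise operators now defined there take explicit keyword attributes, and the element-wise wrappers can fuse an activation onto their output. A minimal sketch, assuming the patched nn.py; the variable names are illustrative only:

    import paddle.fluid as fluid

    x = fluid.layers.data(name='x', shape=[16], dtype='float32')
    y = fluid.layers.data(name='y', shape=[16], dtype='float32')

    # Activations relocated into fluid.layers.nn with explicit attributes.
    a = fluid.layers.leaky_relu(x, alpha=0.1)
    b = fluid.layers.brelu(y, t_min=1.0, t_max=20.0)

    # Element-wise ops moved from ops.py into nn.py; `act` fuses an
    # activation onto the result, as _elementwise_op shows above.
    z = fluid.layers.elementwise_add(a, b, act='relu')
    out = fluid.layers.scale(z, scale=10.0, bias=0.0)
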
from __future__ import print_function + import contextlib +import sys import numpy as np import paddle import paddle.fluid as fluid + +try: + from paddle.fluid.contrib.trainer import * + from paddle.fluid.contrib.inferencer import * +except ImportError: + print( + "In the fluid 1.0, the trainer and inferencer are moving to paddle.fluid.contrib", + file=sys.stderr) + from paddle.fluid.trainer import * + from paddle.fluid.inferencer import * import paddle.fluid.framework as framework import paddle.fluid.layers as pd from paddle.fluid.executor import Executor from functools import partial import unittest -import os dict_size = 30000 source_dict_dim = target_dict_dim = dict_size @@ -198,12 +209,12 @@ def train(use_cuda, is_sparse, is_local=True): ] def event_handler(event): - if isinstance(event, fluid.EndStepEvent): + if isinstance(event, EndStepEvent): print('pass_id=' + str(event.epoch) + ' batch=' + str(event.step)) if event.step == 10: trainer.stop() - trainer = fluid.Trainer( + trainer = Trainer( train_func=partial(train_program, is_sparse), place=place, optimizer_func=optimizer_func) diff --git a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py index 973308498bec3cddde2ef651751ad5d0c9f84503..ce183883e3bddd8633dd9c393ee358ba6210ea61 100644 --- a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py +++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py @@ -14,14 +14,22 @@ from __future__ import print_function -import argparse +import sys + import paddle.fluid as fluid + +try: + from paddle.fluid.contrib.trainer import * + from paddle.fluid.contrib.inferencer import * +except ImportError: + print( + "In the fluid 1.0, the trainer and inferencer are moving to paddle.fluid.contrib", + file=sys.stderr) + from paddle.fluid.trainer import * + from paddle.fluid.inferencer import * import paddle.fluid.core as core import paddle -import six -import sys import numpy -import unittest import math import sys import os @@ -68,14 +76,14 @@ def optimizer_func(): def train(use_cuda, train_program, parallel, params_dirname): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - trainer = fluid.Trainer( + trainer = Trainer( train_func=train_program, place=place, optimizer_func=optimizer_func, parallel=parallel) def event_handler(event): - if isinstance(event, fluid.EndEpochEvent): + if isinstance(event, EndEpochEvent): test_reader = paddle.batch( paddle.dataset.mnist.test(), batch_size=BATCH_SIZE) avg_cost, acc = trainer.test( @@ -91,7 +99,7 @@ def train(use_cuda, train_program, parallel, params_dirname): event.epoch + 1, avg_cost, acc)) if math.isnan(avg_cost): sys.exit("got NaN loss, training failed.") - elif isinstance(event, fluid.EndStepEvent): + elif isinstance(event, EndStepEvent): print( ("Step {0}, Epoch {1} Metrics {2}".format( event.step, event.epoch, @@ -112,7 +120,7 @@ def train(use_cuda, train_program, parallel, params_dirname): def infer(use_cuda, inference_program, parallel, params_dirname=None): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - inferencer = fluid.Inferencer( + inferencer = Inferencer( infer_func=inference_program, param_path=params_dirname, place=place, diff --git a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py 
b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py index cb4aeb430e1a9662a183084c0cdacc41c5a8ec11..45a5ff34af00f2dbe69bd4f08a50626d6ca814f8 100644 --- a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py +++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py @@ -14,14 +14,22 @@ from __future__ import print_function -import argparse +import sys + import paddle.fluid as fluid + +try: + from paddle.fluid.contrib.trainer import * + from paddle.fluid.contrib.inferencer import * +except ImportError: + print( + "In the fluid 1.0, the trainer and inferencer are moving to paddle.fluid.contrib", + file=sys.stderr) + from paddle.fluid.trainer import * + from paddle.fluid.inferencer import * import paddle.fluid.core as core import paddle -import six -import sys import numpy -import unittest import math import sys import os @@ -55,14 +63,14 @@ def optimizer_func(): def train(use_cuda, train_program, params_dirname, parallel): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - trainer = fluid.Trainer( + trainer = Trainer( train_func=train_program, place=place, optimizer_func=optimizer_func, parallel=parallel) def event_handler(event): - if isinstance(event, fluid.EndEpochEvent): + if isinstance(event, EndEpochEvent): test_reader = paddle.batch( paddle.dataset.mnist.test(), batch_size=BATCH_SIZE) avg_cost, acc = trainer.test( @@ -94,7 +102,7 @@ def train(use_cuda, train_program, params_dirname, parallel): def infer(use_cuda, inference_program, parallel, params_dirname=None): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - inferencer = fluid.Inferencer( + inferencer = Inferencer( infer_func=inference_program, param_path=params_dirname, place=place, diff --git a/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py b/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py index 9e2767783bb6748cfc8f95567627068d7532a8c8..82193737967b2bebdd17cef8752eeb9cec2e85ce 100644 --- a/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py +++ b/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py @@ -19,6 +19,16 @@ import sys import numpy as np import paddle import paddle.fluid as fluid +import sys +try: + from paddle.fluid.contrib.trainer import * + from paddle.fluid.contrib.inferencer import * +except ImportError: + print( + "In the fluid 1.0, the trainer and inferencer are moving to paddle.fluid.contrib", + file=sys.stderr) + from paddle.fluid.trainer import * + from paddle.fluid.inferencer import * import paddle.fluid.layers as layers import paddle.fluid.nets as nets @@ -164,7 +174,7 @@ def optimizer_func(): def train(use_cuda, train_program, params_dirname): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - trainer = fluid.Trainer( + trainer = Trainer( train_func=train_program, place=place, optimizer_func=optimizer_func) feed_order = [ @@ -173,7 +183,7 @@ def train(use_cuda, train_program, params_dirname): ] def event_handler(event): - if isinstance(event, fluid.EndStepEvent): + if isinstance(event, EndStepEvent): test_reader = paddle.batch( paddle.dataset.movielens.test(), batch_size=BATCH_SIZE) avg_cost_set = trainer.test( @@ -208,7 +218,7 @@ def train(use_cuda, train_program, params_dirname): def infer(use_cuda, inference_program, params_dirname): place = 
fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - inferencer = fluid.Inferencer( + inferencer = Inferencer( inference_program, param_path=params_dirname, place=place) # Use the first data from paddle.dataset.movielens.test() as input. diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py index 097c2a468fca558106aba2f24c332256189d9076..14719774b9d90c2e96d8f6134469502241a5f1f2 100644 --- a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py +++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py @@ -16,6 +16,16 @@ from __future__ import print_function import paddle import paddle.fluid as fluid +import sys +try: + from paddle.fluid.contrib.trainer import * + from paddle.fluid.contrib.inferencer import * +except ImportError: + print( + "In the fluid 1.0, the trainer and inferencer are moving to paddle.fluid.contrib", + file=sys.stderr) + from paddle.fluid.trainer import * + from paddle.fluid.inferencer import * from functools import partial import numpy as np @@ -72,13 +82,13 @@ def train(use_cuda, train_program, params_dirname): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() word_dict = paddle.dataset.imdb.word_dict() - trainer = fluid.Trainer( + trainer = Trainer( train_func=partial(train_program, word_dict), place=place, optimizer_func=optimizer_func) def event_handler(event): - if isinstance(event, fluid.EndEpochEvent): + if isinstance(event, EndEpochEvent): test_reader = paddle.batch( paddle.dataset.imdb.test(word_dict), batch_size=BATCH_SIZE) avg_cost, acc = trainer.test( @@ -96,7 +106,7 @@ def train(use_cuda, train_program, params_dirname): event.epoch + 1, avg_cost, acc)) if math.isnan(avg_cost): sys.exit("got NaN loss, training failed.") - elif isinstance(event, fluid.EndStepEvent): + elif isinstance(event, EndStepEvent): print("Step {0}, Epoch {1} Metrics {2}".format( event.step, event.epoch, list(map(np.array, event.metrics)))) if event.step == 1: # Run 2 iterations to speed CI @@ -119,7 +129,7 @@ def infer(use_cuda, inference_program, params_dirname=None): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() word_dict = paddle.dataset.imdb.word_dict() - inferencer = fluid.Inferencer( + inferencer = Inferencer( infer_func=partial(inference_program, word_dict), param_path=params_dirname, place=place) diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py index 5f74cd142590abb93f8846bc831a9f5e3dd2f311..62fbba6fe1a62da6a93d50abc074bf5d794cf458 100644 --- a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py +++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py @@ -16,6 +16,16 @@ from __future__ import print_function import paddle import paddle.fluid as fluid +import sys +try: + from paddle.fluid.contrib.trainer import * + from paddle.fluid.contrib.inferencer import * +except ImportError: + print( + "In the fluid 1.0, the trainer and inferencer are moving to paddle.fluid.contrib", + file=sys.stderr) + from paddle.fluid.trainer import * + from paddle.fluid.inferencer import * from functools import partial import 
numpy as np @@ -87,13 +97,13 @@ def train(use_cuda, train_program, params_dirname): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() word_dict = paddle.dataset.imdb.word_dict() - trainer = fluid.Trainer( + trainer = Trainer( train_func=partial(train_program, word_dict), place=place, optimizer_func=optimizer_func) def event_handler(event): - if isinstance(event, fluid.EndEpochEvent): + if isinstance(event, EndEpochEvent): test_reader = paddle.batch( paddle.dataset.imdb.test(word_dict), batch_size=BATCH_SIZE) avg_cost, acc = trainer.test( @@ -111,7 +121,7 @@ def train(use_cuda, train_program, params_dirname): event.epoch + 1, avg_cost, acc)) if math.isnan(avg_cost): sys.exit("got NaN loss, training failed.") - elif isinstance(event, fluid.EndStepEvent): + elif isinstance(event, EndStepEvent): print("Step {0}, Epoch {1} Metrics {2}".format( event.step, event.epoch, list(map(np.array, event.metrics)))) if event.step == 1: # Run 2 iterations to speed CI @@ -134,7 +144,7 @@ def infer(use_cuda, inference_program, params_dirname=None): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() word_dict = paddle.dataset.imdb.word_dict() - inferencer = fluid.Inferencer( + inferencer = Inferencer( infer_func=partial(inference_program, word_dict), param_path=params_dirname, place=place) diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py index 284a6ca168636377699c287236c491352566909b..7523ad3fef17f61b1bde1fc687761cc6b86c3d9e 100644 --- a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py +++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py @@ -16,6 +16,16 @@ from __future__ import print_function import paddle import paddle.fluid as fluid +import sys +try: + from paddle.fluid.contrib.trainer import * + from paddle.fluid.contrib.inferencer import * +except ImportError: + print( + "In the fluid 1.0, the trainer and inferencer are moving to paddle.fluid.contrib", + file=sys.stderr) + from paddle.fluid.trainer import * + from paddle.fluid.inferencer import * from functools import partial import numpy as np @@ -79,13 +89,13 @@ def train(use_cuda, train_program, params_dirname): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() word_dict = paddle.dataset.imdb.word_dict() - trainer = fluid.Trainer( + trainer = Trainer( train_func=partial(train_program, word_dict), place=place, optimizer_func=optimizer_func) def event_handler(event): - if isinstance(event, fluid.EndEpochEvent): + if isinstance(event, EndEpochEvent): test_reader = paddle.batch( paddle.dataset.imdb.test(word_dict), batch_size=BATCH_SIZE, @@ -105,7 +115,7 @@ def train(use_cuda, train_program, params_dirname): event.epoch + 1, avg_cost, acc)) if math.isnan(avg_cost): sys.exit("got NaN loss, training failed.") - elif isinstance(event, fluid.EndStepEvent): + elif isinstance(event, EndStepEvent): print("Step {0}, Epoch {1} Metrics {2}".format( event.step, event.epoch, list(map(np.array, event.metrics)))) if event.step == 1: # Run 2 iterations to speed CI @@ -129,7 +139,7 @@ def infer(use_cuda, inference_program, params_dirname=None): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() word_dict = paddle.dataset.imdb.word_dict() - inferencer = fluid.Inferencer( + inferencer = Inferencer( 
infer_func=partial(inference_program, word_dict), param_path=params_dirname, place=place) diff --git a/python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py b/python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py index 1c7cf3199a07c3f65d967eda70a481b1bd1b1638..e4c0cc5429d3fe891034161d90fadfa9dd078b0b 100644 --- a/python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py +++ b/python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py @@ -16,6 +16,16 @@ from __future__ import print_function import paddle import paddle.fluid as fluid +import sys +try: + from paddle.fluid.contrib.trainer import * + from paddle.fluid.contrib.inferencer import * +except ImportError: + print( + "In the fluid 1.0, the trainer and inferencer are moving to paddle.fluid.contrib", + file=sys.stderr) + from paddle.fluid.trainer import * + from paddle.fluid.inferencer import * import numpy as np import math import sys @@ -95,7 +105,7 @@ def train(use_cuda, train_program, params_dirname): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() def event_handler(event): - if isinstance(event, fluid.EndStepEvent): + if isinstance(event, EndStepEvent): outs = trainer.test( reader=test_reader, feed_order=['firstw', 'secondw', 'thirdw', 'forthw', 'nextw']) @@ -109,7 +119,7 @@ def train(use_cuda, train_program, params_dirname): if math.isnan(avg_cost): sys.exit("got NaN loss, training failed.") - trainer = fluid.Trainer( + trainer = Trainer( train_func=train_program, optimizer_func=optimizer_func, place=place) trainer.train( @@ -121,7 +131,7 @@ def train(use_cuda, train_program, params_dirname): def infer(use_cuda, inference_program, params_dirname=None): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - inferencer = fluid.Inferencer( + inferencer = Inferencer( infer_func=inference_program, param_path=params_dirname, place=place) # Setup inputs by creating 4 LoDTensors representing 4 words. 
Here each word diff --git a/python/paddle/fluid/tests/book/test_recognize_digits.py b/python/paddle/fluid/tests/book/test_recognize_digits.py index 4b4f3e403776625fb5ca2f9b03d14ee7efe23d53..4a70976a4837c668a5e0ba6d49b598d046a8ec5d 100644 --- a/python/paddle/fluid/tests/book/test_recognize_digits.py +++ b/python/paddle/fluid/tests/book/test_recognize_digits.py @@ -67,6 +67,7 @@ def train(nn_type, use_cuda, parallel, save_dirname=None, + save_full_dirname=None, model_filename=None, params_filename=None, is_local=True): @@ -143,6 +144,13 @@ def train(nn_type, exe, model_filename=model_filename, params_filename=params_filename) + if save_full_dirname is not None: + fluid.io.save_inference_model( + save_full_dirname, [], [], + exe, + model_filename=model_filename, + params_filename=params_filename, + export_for_deployment=False) return else: print( @@ -214,10 +222,12 @@ def infer(use_cuda, def main(use_cuda, parallel, nn_type, combine): save_dirname = None + save_full_dirname = None model_filename = None params_filename = None if not use_cuda and not parallel: save_dirname = "recognize_digits_" + nn_type + ".inference.model" + save_full_dirname = "recognize_digits_" + nn_type + ".train.model" if combine == True: model_filename = "__model_combined__" params_filename = "__params_combined__" @@ -228,6 +238,7 @@ def main(use_cuda, parallel, nn_type, combine): use_cuda=use_cuda, parallel=parallel, save_dirname=save_dirname, + save_full_dirname=save_full_dirname, model_filename=model_filename, params_filename=params_filename) infer( diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index f53fe6d69d0855c8ba88eac8059708b690d2475b..d02c890209e65bdceb5da23ba5b9c7c0356174b8 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -80,7 +80,8 @@ if(WITH_DISTRIBUTE) py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext SERIAL) endif(NOT APPLE) py_test_modules(test_dist_transpiler MODULES test_dist_transpiler) - py_test_modules(test_dist_transformer MODULES test_dist_transformer SERIAL) + #FIXME(gongwb): random fails. + #py_test_modules(test_dist_transformer MODULES test_dist_transformer SERIAL) endif() py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL) py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL) diff --git a/python/paddle/fluid/tests/unittests/dist_transformer.py b/python/paddle/fluid/tests/unittests/dist_transformer.py index 3ec79f8ef6e6f70f1365eaa32352c284d294a1ea..175bd130e5a8324227953eeeb769474e78f94fd2 100644 --- a/python/paddle/fluid/tests/unittests/dist_transformer.py +++ b/python/paddle/fluid/tests/unittests/dist_transformer.py @@ -437,13 +437,8 @@ def split_data(data, num_part): ] -def test_context(train_progm, avg_cost, train_exe, dev_count, data_input_names, +def test_context(test_program, avg_cost, train_exe, dev_count, data_input_names, sum_cost, token_num): - # Context to do validation. 
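One note on the test_recognize_digits.py change above: besides the pruned inference model, the test now also writes a full, still-trainable copy of the program by passing empty feed/fetch lists together with export_for_deployment=False. A minimal sketch of that call pattern, with an illustrative toy network and directory names:

    import paddle.fluid as fluid

    img = fluid.layers.data(name='img', shape=[784], dtype='float32')
    prediction = fluid.layers.fc(input=img, size=10, act='softmax')

    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(fluid.default_startup_program())

    # Pruned, deployment-ready inference program (the existing behaviour).
    fluid.io.save_inference_model("recognize_digits.inference.model",
                                  ['img'], [prediction], exe)

    # Full, non-pruned program saved alongside it; export_for_deployment=False
    # skips the inference-only pruning so the saved program can keep training.
    fluid.io.save_inference_model("recognize_digits.train.model", [], [],
                                  exe, export_for_deployment=False)
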
- test_program = train_progm.clone() - with fluid.program_guard(test_program): - test_program = fluid.io.get_inference_program([avg_cost]) - val_data = DataReader( src_vocab_fpath=TrainTaskConfig.src_vocab_fpath, trg_vocab_fpath=TrainTaskConfig.trg_vocab_fpath, @@ -505,7 +500,7 @@ def test_context(train_progm, avg_cost, train_exe, dev_count, data_input_names, def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler, - token_num, predict): + token_num, predict, test_program): # Initialize the parameters. if TrainTaskConfig.ckpt_path: lr_scheduler.current_steps = TrainTaskConfig.start_step @@ -554,7 +549,7 @@ def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler, -1] + label_data_input_fields if TrainTaskConfig.val_file_pattern is not None: - test = test_context(train_progm, avg_cost, train_exe, dev_count, + test = test_context(test_program, avg_cost, train_exe, dev_count, data_input_names, sum_cost, token_num) # the best cross-entropy value with label smoothing @@ -1647,6 +1642,8 @@ def get_model(is_dist, is_async): local_lr_scheduler = LearningRateScheduler(ModelHyperParams.d_model, TrainTaskConfig.warmup_steps, TrainTaskConfig.learning_rate) + # Context to do validation. + test_program = fluid.default_main_program().clone(for_test=True) if not is_dist: optimizer = fluid.optimizer.Adam( @@ -1671,7 +1668,7 @@ def get_model(is_dist, is_async): epsilon=TrainTaskConfig.eps) optimizer.minimize(sum_cost) - return sum_cost, avg_cost, predict, token_num, local_lr_scheduler + return sum_cost, avg_cost, predict, token_num, local_lr_scheduler, test_program def update_args(): @@ -1705,7 +1702,7 @@ class DistTransformer2x2(TestDistRunnerBase): def run_trainer(self, use_cuda, args): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() TrainTaskConfig.use_gpu = use_cuda - sum_cost, avg_cost, predict, token_num, local_lr_scheduler = get_model( + sum_cost, avg_cost, predict, token_num, local_lr_scheduler, test_program = get_model( args.is_dist, not args.sync_mode) if args.is_dist: @@ -1726,7 +1723,7 @@ class DistTransformer2x2(TestDistRunnerBase): TrainTaskConfig.local = not args.is_dist train_loop(startup_exe, trainer_prog, 1, sum_cost, avg_cost, - local_lr_scheduler, token_num, predict) + local_lr_scheduler, token_num, predict, test_program) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index e97643cddef22465436051a41ef4b825e9634d23..b5549c507ed753f4504afd655be59b444164e6f3 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -345,7 +345,7 @@ class OpTest(unittest.TestCase): actual_t, expect_t, atol=atol, equal_nan=equal_nan), "Output (" + out_name + ") has diff at " + str(place) + "\nExpect " + str(expect_t) + "\n" + "But Got" + - str(actual_t) + " in class " + self.__class__.__name__) + str(actual_t)) if isinstance(expect, tuple): self.assertListEqual(actual.recursive_sequence_lengths(), expect[1], "Output (" + out_name + diff --git a/python/paddle/fluid/tests/unittests/test_detection_map_op.py b/python/paddle/fluid/tests/unittests/test_detection_map_op.py index 0c5343a97d5ef0f97fc6b144dfc82174eacb8573..f6eb8f2c6d8b94f92e24ff789c91efb53a645a46 100644 --- a/python/paddle/fluid/tests/unittests/test_detection_map_op.py +++ b/python/paddle/fluid/tests/unittests/test_detection_map_op.py @@ -20,7 +20,6 @@ import six import sys import collections import math -import paddle.fluid as fluid from op_test import 
OpTest @@ -33,7 +32,7 @@ class TestDetectionMAPOp(OpTest): self.detect = np.array(self.detect).astype('float32') self.mAP = np.array(self.mAP).astype('float32') - if len(self.class_pos_count) > 0: + if (len(self.class_pos_count) > 0): self.class_pos_count = np.array(self.class_pos_count).astype( 'int32') self.true_pos = np.array(self.true_pos).astype('float32') @@ -274,7 +273,7 @@ class TestDetectionMAPOp11Point(TestDetectionMAPOp): class TestDetectionMAPOpMultiBatch(TestDetectionMAPOp): def init_test_case(self): super(TestDetectionMAPOpMultiBatch, self).init_test_case() - self.class_pos_count = [0, 2, 1, 0] + self.class_pos_count = [0, 2, 1] self.true_pos_lod = [[0, 3, 2]] self.true_pos = [[0.7, 1.], [0.3, 0.], [0.2, 1.], [0.8, 0.], [0.1, 1.]] self.false_pos_lod = [[0, 3, 2]] diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist.py b/python/paddle/fluid/tests/unittests/test_dist_mnist.py index 59a137c18c9435ef5c5772d0cc08f197c1d86603..09b1c546e49bd02bf336f31885bf4c7339cc5a2c 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_dist_mnist.py @@ -22,7 +22,7 @@ class TestDistMnist2x2(TestDistBase): self._sync_mode = True self._use_reduce = False - def test_se_resnext(self): + def test_dist_train(self): self.check_with_place("dist_mnist.py", delta=1e-7) @@ -31,7 +31,7 @@ class TestDistMnist2x2WithMemopt(TestDistBase): self._sync_mode = True self._mem_opt = True - def test_se_resnext(self): + def test_dist_train(self): self.check_with_place("dist_mnist.py", delta=1e-7) @@ -40,7 +40,7 @@ class TestDistMnistAsync(TestDistBase): self._sync_mode = False self._use_reduce = False - def test_se_resnext(self): + def test_dist_train(self): self.check_with_place("dist_mnist.py", delta=200) diff --git a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py index c0e9fa38e7d1eadd89eff9a8ba4442f888b8120e..c2b089694ea2f329e67ad6c50def26caa454720e 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py @@ -21,15 +21,25 @@ class TestDistSeResneXt2x2(TestDistBase): def _setup_config(self): self._sync_mode = True - def test_se_resnext(self): + def test_dist_train(self): self.check_with_place("dist_se_resnext.py", delta=1e-7) +# TODO(typhoonzero): fix this test +# class TestDistseResnXt2x2WithMemopt(TestDistBase): +# def _setup_config(self): +# self._sync_mode = True +# self._mem_opt = True + +# def test_dist_train(self): +# self.check_with_place("dist_se_resnext.py", delta=1e-7) + + class TestDistSeResneXt2x2Async(TestDistBase): def _setup_config(self): self._sync_mode = False - def test_se_resnext(self): + def test_dist_train(self): self.check_with_place("dist_se_resnext.py", delta=100) diff --git a/python/paddle/fluid/tests/unittests/test_dist_train.py b/python/paddle/fluid/tests/unittests/test_dist_train.py index 083525ccf54d389b60c4aaa9f8c6223f07c773cd..d0875d9ea442d0e88dfd958e5948b26225416df2 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_train.py +++ b/python/paddle/fluid/tests/unittests/test_dist_train.py @@ -27,6 +27,7 @@ import paddle.fluid.layers as layers from paddle.fluid.layers.io import ListenAndServ from paddle.fluid.layers.io import Recv from paddle.fluid.layers.io import Send +import paddle.fluid.layers.ops as ops from paddle.fluid import core @@ -89,7 +90,7 @@ class TestSendOp(unittest.TestCase): name="X", append_batch_size=False) 
fluid.initializer.Constant(value=1.0)(x, main.global_block()) - layers.scale(x=x, scale=10.0, out=out_var) + ops._scale(x=x, scale=10.0, out=out_var) self.server_exe = fluid.Executor(place) self.server_exe.run(main) diff --git a/python/paddle/fluid/tests/unittests/test_dist_transformer.py b/python/paddle/fluid/tests/unittests/test_dist_transformer.py index 47083ca7e954c85bb42fcc88639f3e757283cbea..47e8dfaf03ceb27a74f5e48d662d2b534d2d152b 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transformer.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transformer.py @@ -59,7 +59,7 @@ class TestDistTransformer2x2Sync(TestDistBase): def _setup_config(self): self._sync_mode = True - def test_transformer(self): + def test_dist_train(self): download_files() self.check_with_place("dist_transformer.py", delta=1e-5) @@ -68,7 +68,7 @@ class TestDistTransformer2x2Async(TestDistBase): def _setup_config(self): self._sync_mode = False - def test_transformer(self): + def test_dist_train(self): download_files() self.check_with_place("dist_transformer.py", delta=1.0) diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index a198b25520f97ce23b9c1ebb9cd82fc458222d73..ecde407e6d85ea1bfc0181b4b60e095ea496fb1a 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -659,5 +659,28 @@ class TestLoadSliceVar(TranspilerTest): pserver2._slice_vars_and_attrs[idx][2].shape)) +class TestNCCL2Transpile(TranspilerTest): + def test_nccl2_transpile(self): + if fluid.core.is_compiled_with_cuda(): #test nccl2 only with cuda + main = fluid.Program() + startup = fluid.Program() + with fluid.program_guard(main, startup): + self.net_conf() + + config = fluid.DistributeTranspilerConfig() + config.mode = "nccl2" + t = fluid.DistributeTranspiler(config=config) + t.transpile( + 0, + trainers="127.0.0.1:6174,127.0.0.1:6175", + current_endpoint="127.0.0.1:6174", + startup_program=startup) + print([op.type for op in startup.global_block().ops]) + self.assertEqual(startup.global_block().ops[-1].type, "gen_nccl_id") + self.assertIsNotNone(startup.global_block().vars.get("NCCLID")) + else: + pass + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_word2vec.py b/python/paddle/fluid/tests/unittests/test_dist_word2vec.py index 9a3e92e8d775a37e0c24ee1bcc5435628d61bb91..33b39b262b95b0013e3696c3f15a288a2e801ce1 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_word2vec.py +++ b/python/paddle/fluid/tests/unittests/test_dist_word2vec.py @@ -17,19 +17,28 @@ import unittest from test_dist_base import TestDistBase -class TestDistSeResneXt2x2(TestDistBase): +class TestDistW2V2x2(TestDistBase): def _setup_config(self): self._sync_mode = True - def test_se_resnext(self): + def test_dist_train(self): self.check_with_place("dist_word2vec.py", delta=1e-4) -class TestDistSeResneXt2x2Async(TestDistBase): +class TestDistW2V2x2WithMemOpt(TestDistBase): + def _setup_config(self): + self._sync_mode = True + self._mem_opt = True + + def test_dist_train(self): + self.check_with_place("dist_word2vec.py", delta=1e-4) + + +class TestDistW2V2x2Async(TestDistBase): def _setup_config(self): self._sync_mode = False - def test_se_resnext(self): + def test_dist_train(self): self.check_with_place("dist_word2vec.py", delta=1) diff --git a/python/paddle/fluid/tests/unittests/test_layers.py 
b/python/paddle/fluid/tests/unittests/test_layers.py index 6855a0e2c0437429fe35b1dd97188716945a743b..f474cdae2054531d44724e0e3e0e58a35fb8ddcd 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -573,6 +573,158 @@ class TestBook(unittest.TestCase): self.assertIsNotNone(out) print(str(program)) + def test_brelu(self): + program = Program() + with program_guard(program): + input = layers.data(name="input", shape=[16], dtype="float32") + out = layers.brelu(input, t_min=1.0, t_max=20.0, name='brelu') + self.assertIsNotNone(out) + print(str(program)) + + def test_leaky_relu(self): + program = Program() + with program_guard(program): + input = layers.data(name="input", shape=[16], dtype="float32") + out = layers.leaky_relu(input, alpha=0.1, name='leaky_relu') + self.assertIsNotNone(out) + print(str(program)) + + def test_soft_relu(self): + program = Program() + with program_guard(program): + input = layers.data(name="input", shape=[16], dtype="float32") + out = layers.soft_relu(input, threshold=30.0, name='soft_relu') + self.assertIsNotNone(out) + print(str(program)) + + def test_sigmoid(self): + program = Program() + with program_guard(program): + input = layers.data(name="input", shape=[16], dtype="float32") + out = layers.sigmoid(input, name='sigmoid') + self.assertIsNotNone(out) + print(str(program)) + + def test_logsigmoid(self): + program = Program() + with program_guard(program): + input = layers.data(name="input", shape=[16], dtype="float32") + out = layers.logsigmoid(input, name='logsigmoid') + self.assertIsNotNone(out) + print(str(program)) + + def test_exp(self): + program = Program() + with program_guard(program): + input = layers.data(name="input", shape=[16], dtype="float32") + out = layers.exp(input, name='exp') + self.assertIsNotNone(out) + print(str(program)) + + def test_tanh(self): + program = Program() + with program_guard(program): + input = layers.data(name="input", shape=[16], dtype="float32") + out = layers.tanh(input, name='tanh') + self.assertIsNotNone(out) + print(str(program)) + + def test_tanh_shrink(self): + program = Program() + with program_guard(program): + input = layers.data(name="input", shape=[16], dtype="float32") + out = layers.tanh_shrink(input, name='tanh_shrink') + self.assertIsNotNone(out) + print(str(program)) + + def test_sqrt(self): + program = Program() + with program_guard(program): + input = layers.data(name="input", shape=[16], dtype="float32") + out = layers.sqrt(input, name='sqrt') + self.assertIsNotNone(out) + print(str(program)) + + def test_abs(self): + program = Program() + with program_guard(program): + input = layers.data(name="input", shape=[16], dtype="float32") + out = layers.abs(input, name='abs') + self.assertIsNotNone(out) + print(str(program)) + + def test_ceil(self): + program = Program() + with program_guard(program): + input = layers.data(name="input", shape=[16], dtype="float32") + out = layers.ceil(input, name='ceil') + self.assertIsNotNone(out) + print(str(program)) + + def test_floor(self): + program = Program() + with program_guard(program): + input = layers.data(name="input", shape=[16], dtype="float32") + out = layers.floor(input, name='floor') + self.assertIsNotNone(out) + print(str(program)) + + def test_cos(self): + program = Program() + with program_guard(program): + input = layers.data(name="input", shape=[16], dtype="float32") + out = layers.cos(input, name='cos') + self.assertIsNotNone(out) + print(str(program)) + + def test_sin(self): + program 
= Program() + with program_guard(program): + input = layers.data(name="input", shape=[16], dtype="float32") + out = layers.sin(input, name='sin') + self.assertIsNotNone(out) + print(str(program)) + + def test_round(self): + program = Program() + with program_guard(program): + input = layers.data(name="input", shape=[16], dtype="float32") + out = layers.round(input, name='round') + self.assertIsNotNone(out) + print(str(program)) + + def test_reciprocal(self): + program = Program() + with program_guard(program): + input = layers.data(name="input", shape=[16], dtype="float32") + out = layers.reciprocal(input, name='reciprocal') + self.assertIsNotNone(out) + print(str(program)) + + def test_square(self): + program = Program() + with program_guard(program): + input = layers.data(name="input", shape=[16], dtype="float32") + out = layers.square(input, name='square') + self.assertIsNotNone(out) + print(str(program)) + + def test_softplus(self): + program = Program() + with program_guard(program): + input = layers.data(name="input", shape=[16], dtype="float32") + out = layers.softplus(input, name='softplus') + self.assertIsNotNone(out) + print(str(program)) + + def test_softsign(self): + program = Program() + with program_guard(program): + input = layers.data(name="input", shape=[16], dtype="float32") + out = layers.softsign(input, name='softsign') + self.assertIsNotNone(out) + print(str(program)) + def test_roi_perspective_transform(self): program = Program() with program_guard(program): @@ -606,6 +758,14 @@ class TestBook(unittest.TestCase): out = layers.expand(x, [1, 2]) print(str(program)) + def test_softshrink(self): + program = Program() + with program_guard(program): + input = layers.data(name="input", shape=[16], dtype="float32") + out = layers.softshrink(input, name='softshrink') + self.assertIsNotNone(out) + print(str(program)) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_operator_desc.py b/python/paddle/fluid/tests/unittests/test_operator_desc.py index 4153394c1da776d0a41e1415a09fa7d6f4b14d6d..37b9a9188ab44df81029ae6d9925ae21c1929cff 100644 --- a/python/paddle/fluid/tests/unittests/test_operator_desc.py +++ b/python/paddle/fluid/tests/unittests/test_operator_desc.py @@ -69,7 +69,7 @@ class TestOperator(unittest.TestCase): set(mul_op.attr_names), set([ "x_num_col_dims", "y_num_col_dims", "op_role", "op_role_var", - "op_namescope" + "op_namescope", "op_callstack" ])) self.assertEqual(mul_op.has_attr("x_num_col_dims"), True) self.assertEqual(mul_op.attr_type("x_num_col_dims"), core.AttrType.INT) diff --git a/python/paddle/fluid/tests/unittests/test_program_code.py b/python/paddle/fluid/tests/unittests/test_program_code.py index e9c2b928617dce3904ca119896ca81454256e82e..27b22ba9392b63c0ccd7904ff03d737b977cc9fc 100644 --- a/python/paddle/fluid/tests/unittests/test_program_code.py +++ b/python/paddle/fluid/tests/unittests/test_program_code.py @@ -25,6 +25,7 @@ import paddle.fluid.layers as layers from paddle.fluid.layers.io import ListenAndServ from paddle.fluid.layers.io import Recv from paddle.fluid.layers.io import Send +import paddle.fluid.layers.ops as ops from paddle.fluid.transpiler.details import program_to_code @@ -52,7 +53,7 @@ class TestProgram2Code(unittest.TestCase): name="X", append_batch_size=False) fluid.initializer.Constant(value=1.0)(x, main.global_block()) - layers.scale(x=x, scale=10.0, out=out_var) + ops._scale(x=x, scale=10.0, out=out_var) program_to_code(main) diff --git a/python/paddle/fluid/trainer.py 
b/python/paddle/fluid/trainer.py index 30cdfe4ad2c9892184862b70ff49417ce5a08516..b495b6699b5d02ca8c466c984820be5c497d626e 100644 --- a/python/paddle/fluid/trainer.py +++ b/python/paddle/fluid/trainer.py @@ -12,1247 +12,5 @@ # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import print_function - -import contextlib -import os -import errno -import shutil -import six -import time - -from . import core -from . import data_feeder -from . import executor -from . import framework -from . import io -# optimizer is same as the parameter of Trainer.__init__. Rename it to opt_module -from . import optimizer as opt_module -from . import parallel_executor -from .transpiler import distribute_transpiler - -__all__ = [ - 'Trainer', 'BeginEpochEvent', 'EndEpochEvent', 'BeginStepEvent', - 'EndStepEvent', 'CheckpointConfig' -] - - -class BeginEpochEvent(object): - """ - The begin of a training epoch. - - Args: - epoch_id(int): The current epoch ID. - """ - - def __init__(self, epoch_id): - self.epoch = epoch_id - - -class EndEpochEvent(object): - """ - The end of a training epoch. - - Args: - epoch_id(int): The current epoch ID. - """ - - def __init__(self, epoch_id): - self.epoch = epoch_id - - -class BeginStepEvent(object): - """ - The begin of a training epoch. - - Args: - epoch_id(int): The current epoch ID. - step_id(int): The current step ID. - """ - - def __init__(self, epoch_id, step_id): - self.epoch = epoch_id - self.step = step_id - self.fetch_metrics = True - """ - If fetch_metrics is true, the metrics will be fetched at the - EndStepEvent. Default is True. - """ - - -class EndStepEvent(object): - """ - The end of a training step. - - Args: - epoch_id(int): The current epoch ID. - step_id(int): The current step ID. - metrics(list): A list of fetched tensor. The order of this list is same - as the :code:`train_func` returns. - """ - - def __init__(self, epoch_id, step_id, metrics): - self.epoch = epoch_id - self.step = step_id - self.metrics = metrics - - -class CheckpointConfig(object): - """ - Parameter object for :code:`save_checkpoint` and - :code:`fluid.Trainer`. Used to configuration how to save checkpoint. - - Args: - checkpoint_dir(str): Directory path to save check point. Default is the - current directory. - - max_num_checkpoints(int): The max number of local check points. - epoch_interval(int): Every number of epoch to save check point. - step_interval(int): Every number of step to save check point. - - Examples: - >>> config = fluid.CheckpointConfig("./checkpoints") - >>> trainer = fluid.Trainer(train_func=train_program, - >>> place=place, - >>> optimizer_func=optimizer_func, - >>> checkpoint_config=config) - >>> trainer.train(...) - """ - - def __init__(self, - checkpoint_dir=None, - max_num_checkpoints=3, - epoch_interval=1, - step_interval=10): - - assert epoch_interval >= 1 - assert step_interval >= 1 - - self.checkpoint_dir = checkpoint_dir \ - if checkpoint_dir is not None else os.getcwd() - self.max_num_checkpoints = max_num_checkpoints - self.epoch_interval = epoch_interval - self.step_interval = step_interval - self.epoch_id = 0 - self.step_id = 0 - self.load_serial = None - self.pserver_id = None - self.lookup_table_name = None - - -def check_and_get_place(place): - """ - Check the type of place or get the default place - Args: - place(None|core.CUDAPlace|core.CPUPlace): the place that trainer will be executed on. - - Raises: - TypeError if the type mismatched. 
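For context on the TestNCCL2Transpile case added above, a minimal sketch of the nccl2 transpile mode it exercises; the toy network and the endpoint strings are placeholders mirroring that test:

    import paddle.fluid as fluid

    main, startup = fluid.Program(), fluid.Program()
    with fluid.program_guard(main, startup):
        x = fluid.layers.data(name='x', shape=[13], dtype='float32')
        loss = fluid.layers.mean(fluid.layers.fc(input=x, size=1))
        fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)

    config = fluid.DistributeTranspilerConfig()
    config.mode = "nccl2"
    t = fluid.DistributeTranspiler(config=config)
    t.transpile(0,  # trainer_id
                trainers="127.0.0.1:6174,127.0.0.1:6175",
                current_endpoint="127.0.0.1:6174",
                startup_program=startup)

    # After transpiling, the startup program ends with a gen_nccl_id op and
    # defines an "NCCLID" variable, the two properties the new test asserts.
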
- - Returns: - the original place if it is not None. - if fluid is compiled with CUDA, returns CUDAPlace(0) by default. - Otherwise returns CPUPlace by default. - """ - if place is None: - if core.is_compiled_with_cuda(): - return core.CUDAPlace(0) - else: - return core.CPUPlace() - else: - if not isinstance(place, core.CUDAPlace) and not isinstance( - place, core.CPUPlace): - raise TypeError("Place should be either CUDAPlace or CPUPlace") - return place - - -class Trainer(object): - """ - A trainer wraps MultiGPU/MultiNode training loops and can be used to train a - simple neural network easily. - - This API takes a :code:`train_func`. A :code:`train_func` is a function that - return loss as it first return value. The reset value can be fetched by - EndStepEvent.metrics - - This API also takes a :code:`optimizer_func` that will return an optimizer - instance. - - For example, to train a MLP for MNIST dataset, the sample program is - - >>> import paddle.fluid as fluid - >>> - >>> def mlp(image, layer_sizes=[200, 100], activation="relu", num_classes=10): - >>> hidden = image - >>> for layer_size in layer_sizes: - >>> hidden = fluid.layers.fc(input=hidden, size=layer_size, act=activation) - >>> return fluid.layers.fc(input=hidden, size=num_classes, act="softmax") - >>> - >>> def train_mnist_mlp(): - >>> img = fluid.layers.data(name='image', shape=[784]) - >>> label = fluid.layers.data(name='label', shape=[1], dtype='int64') - >>> prediction = mlp(img) - >>> return fluid.layers.mean(fluid.layers.cross_entropy(prediction, label)) - >>> - >>> def optimizer(): - >>> return fluid.optimizer.Adam() - >>> - >>> trainer = Trainer(train_func=train_mnist_mlp, - >>> optimizer_func=optimizer, - >>> place=fluid.CUDAPlace(0), - >>> parallel=True) - >>> - >>> def train_callback(event): - >>> if isinstance(event, fluid.EndStepEvent): - >>> print "Epoch ID", event.epoch, "Step ID",\ - >>> event.step, "AvgLoss", event.metrics[0] - >>> elif isinstance(event, fluid.EndEpochEvent): - >>> trainer.save_params("./model_{0}".format(event.epoch)) - >>> - >>> trainer.train(num_epochs=100, event_handler=train_callback) - - For more example, please see :ref:`api_guide_high_level_api`. - - - Args: - train_func(callable): A function which will return loss. The loss must be - a scalar tensor. - optimizer_func(callable): A function that returns an Optimizer object. - place(CUDAPlace|CPUPlace): The device place of this trainer. If - :code:`parallel=True,` all CUDA Places will be used if :code:`place` - is a :code:`CUDAPlace`. - parallel(bool): True if use multiple devices. - checkpoint_config(CheckpointConfig): Configuration about how to save - checkpoints. - """ - - def __init__(self, - train_func, - optimizer_func, - param_path=None, - place=None, - parallel=False, - checkpoint_config=None): - self.__stop = False - self.parallel = parallel - - # config for checkpoint - # only chief worker will save variables - self.trainer_id = 0 - self.checkpoint_cfg = checkpoint_config - if self.checkpoint_cfg: - assert isinstance(self.checkpoint_cfg, CheckpointConfig) - serial = _get_latest_checkpoint_serial( - self.checkpoint_cfg.checkpoint_dir) - self.checkpoint_cfg.load_serial = serial if serial >= 0 else None - - self.scope = core.Scope() - - # 1. we need to generate a framework.Program by calling - # program_func. 
Reference: fluid.program_guard in - # test_word2vec.py - - self.startup_program = framework.Program() - self.train_program = framework.Program() - - with framework.program_guard(self.train_program, self.startup_program): - program_func_outs = train_func() - self.train_func_outputs = program_func_outs if isinstance( - program_func_outs, list) else [program_func_outs] - self.test_program = self.train_program.clone(for_test=True) - - # The first element of program_func_outs is loss. - loss = self.train_func_outputs[0] - - optimizer = optimizer_func() - if not isinstance(optimizer, opt_module.Optimizer): - raise TypeError( - "The optimizer should be an instance of Optimizer") - optimize_ops, params_grads = optimizer.minimize(loss) - - self.place = check_and_get_place(place) - - self._dist_transpile_if_necessary(optimize_ops, params_grads) - - # 2. move the default_main_program to self.program and run the - # default_startup program on an empty core.Scope() - # Run startup program - with self._prog_and_scope_guard(): - exe = executor.Executor(place) - exe.run(self.startup_program) - - if self.checkpoint_cfg and self.checkpoint_cfg.load_serial is not None: - self._load_checkpoint() - - if param_path and os.path.isdir(param_path): - with self._prog_and_scope_guard(): - # load params from param_path into scope - io.load_persistables( - executor=exe, - dirname=param_path, - main_program=self.startup_program) - - def _transpile_nccl2_dist(self): - # PADDLE_TRAINER_IPS - if "PADDLE_TRAINER_IPS" not in os.environ: - self.nccl_id_var = None - else: - self.trainer_id = int(os.getenv("PADDLE_TRAINER_ID")) - port = os.getenv("PADDLE_PSERVER_PORT") - worker_ips = os.getenv("PADDLE_TRAINER_IPS") - worker_endpoints = [] - for ip in worker_ips.split(","): - worker_endpoints.append(':'.join([ip, port])) - self.num_trainers = len(worker_endpoints) - current_endpoint = os.getenv("PADDLE_CURRENT_IP") + ":" + port - worker_endpoints.remove(current_endpoint) - # TODO(wuyi): use self.nccl_id_var, self.num_trainers and self.trainer_id - # in ParallelExecutor to start - # distributed training using NCCL2 - self.nccl_id_var = self.startup_program.global_block().create_var( - name="NCCLID", persistable=True, type=core.VarDesc.VarType.RAW) - self.startup_program.global_block().append_op( - type="gen_nccl_id", - inputs={}, - outputs={"NCCLID": self.nccl_id_var}, - attrs={ - "endpoint": current_endpoint, - "endpoint_list": worker_endpoints, - "trainer_id": self.trainer_id - }) - - def _dist_transpile_if_necessary(self, optimize_ops, params_grads): - self._transpile_nccl2_dist() - if self.nccl_id_var != None: - return - - if "PADDLE_TRAINING_ROLE" not in os.environ: - return - - # the port of all pservers, needed by both trainer and pserver - port = os.getenv("PADDLE_PSERVER_PORT", "6174") - # comma separated ips of all pservers, needed by trainer and - # pserver - pserver_ips = os.getenv("PADDLE_PSERVER_IPS", "") - eplist = [] - for ip in pserver_ips.split(","): - eplist.append(':'.join([ip, port])) - pserver_endpoints = ",".join(eplist) - # total number of workers/trainers in the job, needed by - # trainer and pserver - trainers = int(os.getenv("PADDLE_TRAINERS")) - # the IP of the local machine, needed by pserver only - current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port - # the unique trainer id, starting from 0, needed by trainer - # only - self.trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0")) - - # the role, should be either PSERVER or TRAINER - training_role = os.getenv("PADDLE_TRAINING_ROLE") - 
with self._prog_and_scope_guard(): - t = distribute_transpiler.DistributeTranspiler() - t.transpile( - self.trainer_id, pservers=pserver_endpoints, trainers=trainers) - if training_role == "PSERVER": - if self.checkpoint_cfg: - pserver_id = eplist.index(current_endpoint) - self.checkpoint_cfg.pserver_id = pserver_id - if t.has_distributed_lookup_table: - self.checkpoint_cfg.lookup_table_name = t.table_name - - self.train_program = t.get_pserver_program(current_endpoint) - self.startup_program = t.get_startup_program(current_endpoint, - self.train_program) - elif training_role == "TRAINER": - self.train_program = t.get_trainer_program() - else: - raise ValueError( - 'TRAINING_ROLE environment variable must be either TRAINER or PSERVER' - ) - - def stop(self): - """ - stop training - """ - self.__stop = True - - def train(self, num_epochs, event_handler, reader=None, feed_order=None): - """ - Start the train loop to train the model. - - Args: - num_epochs(int): The number of epoch. An epoch will process all data in reader - event_handler(callable): The event handler. A function with type (ev:Event)->void - reader(callable): A reader creator object. See also - :ref:`api_guide_python_reader` . - feed_order(list): Feeding order of reader. None will following the defining - order in program - - Returns: - None - """ - training_role = os.getenv("PADDLE_TRAINING_ROLE", "") - if training_role == "PSERVER": - with self._prog_and_scope_guard(): - exe = executor.Executor(self.place) - exe.run() - return - if self.parallel: - self._train_by_parallel_executor(num_epochs, event_handler, reader, - feed_order) - else: - self._train_by_executor(num_epochs, event_handler, reader, - feed_order) - - def test(self, reader, feed_order): - """ - Test the model on given test data - - Args: - reader(callable): The reader that yields test data. - feed_order(list): Feeding order of reader. None will following the - defining order in program - """ - - return self._test_by_executor(reader, feed_order, - self.train_func_outputs) - - def save_params(self, param_path): - """ - Save all parameters into :code:`param_path`. - - Args: - param_path(str): The path to save parameters. - - Returns: - None - """ - with self._prog_and_scope_guard(): - exe = executor.Executor(self.place) - io.save_persistables(exe, dirname=param_path) - - def save_inference_model(self, param_path, feeded_var_names, - target_var_indexes): - """ - Save model for cpp inference into :code:`param_path`. - - Args: - param_path(str): The path to save parameters. - feeded_var_names(list(str)): The name of the vars that you - need to feed in before run program. - target_var_indexes(list(int)): the index of target var that - you need to return in trainer.train_func. - Returns: - None - """ - with self._prog_and_scope_guard(): - exe = executor.Executor(self.place) - target_vars = [ - self.train_func_outputs[index] for index in target_var_indexes - ] - io.save_inference_model(param_path, feeded_var_names, target_vars, - exe) - - @contextlib.contextmanager - def _prog_and_scope_guard(self): - with framework.program_guard( - main_program=self.train_program, - startup_program=self.startup_program): - with executor.scope_guard(self.scope): - yield - - def _train_by_executor(self, num_epochs, event_handler, reader, feed_order): - """ - Train by Executor and single device. 
- - Args: - num_epochs: - event_handler: - reader: - feed_order: - - Returns: - - """ - with self._prog_and_scope_guard(): - feed_var_list = build_feed_var_list(self.train_program, feed_order) - feeder = data_feeder.DataFeeder( - feed_list=feed_var_list, place=self.place) - exe = executor.Executor(self.place) - reader = feeder.decorate_reader(reader, multi_devices=False) - self._train_by_any_executor(event_handler, exe, num_epochs, reader) - - def _train_by_any_executor(self, event_handler, exe, num_epochs, reader): - if self.checkpoint_cfg: - epochs = [ - epoch_id for epoch_id in range(num_epochs) - if epoch_id >= self.checkpoint_cfg.epoch_id - ] - else: - epochs = [epoch_id for epoch_id in range(num_epochs)] - - for epoch_id in epochs: - event_handler(BeginEpochEvent(epoch_id)) - for step_id, data in enumerate(reader()): - if self.__stop: - if self.checkpoint_cfg: - self._clean_checkpoint() - return - - if self.checkpoint_cfg and self.checkpoint_cfg.load_serial \ - and self.checkpoint_cfg.step_id >= step_id and self.checkpoint_cfg.epoch_id == epoch_id: - continue - - begin_event = BeginStepEvent(epoch_id, step_id) - event_handler(begin_event) - if begin_event.fetch_metrics: - metrics = exe.run(feed=data, - fetch_list=[ - var.name - for var in self.train_func_outputs - ]) - else: - metrics = exe.run(feed=data, fetch_list=[]) - - if self.checkpoint_cfg: - self._save_checkpoint(epoch_id, step_id) - event_handler(EndStepEvent(epoch_id, step_id, metrics)) - event_handler(EndEpochEvent(epoch_id)) - if self.checkpoint_cfg: - self._clean_checkpoint() - - def _test_by_executor(self, reader, feed_order, fetch_list): - with executor.scope_guard(self.scope): - feed_var_list = build_feed_var_list(self.test_program, feed_order) - feeder = data_feeder.DataFeeder( - feed_list=feed_var_list, place=self.place) - exe = executor.Executor(self.place) - accumulated = len(fetch_list) * [0] - count = 0 - for data in reader(): - outs = exe.run(program=self.test_program, - feed=feeder.feed(data), - fetch_list=fetch_list) - accumulated = [x[0] + x[1][0] for x in zip(accumulated, outs)] - count += 1 - - return [x / count for x in accumulated] - - def _train_by_parallel_executor(self, num_epochs, event_handler, reader, - feed_order): - with self._prog_and_scope_guard(): - pe = self._get_or_create_parallel_executor() - feed_var_list = build_feed_var_list(self.train_program, feed_order) - feeder = data_feeder.DataFeeder( - feed_list=feed_var_list, place=self.place) - reader = feeder.decorate_reader(reader, multi_devices=True) - self._train_by_any_executor(event_handler, pe, num_epochs, reader) - - def _get_parallel_executor(self): - return getattr(self, 'parallel_executor', None) - - def _get_or_create_parallel_executor(self): - if self._get_parallel_executor() is None: - self.parallel_executor = parallel_executor.ParallelExecutor( - use_cuda=isinstance(self.place, core.CUDAPlace), - loss_name=self.train_func_outputs[0].name) - return self._get_parallel_executor() - - def _clean_checkpoint(self): - assert self.checkpoint_cfg - clean_checkpoint(checkpoint_dir=self.checkpoint_cfg.checkpoint_dir) - - def _get_checkpoint_load_args(self): - """ - epoch_id and step_id are runtime arguments, they are not variables, will load them independently. - """ - return ["epoch_id", "step_id"] - - def _get_checkpoint_save_args(self, epoch_id, step_id): - """ - epoch_id and step_id are runtime arguments, they are not variables, will save them independently. 
- """ - trainer_args = {} - trainer_args["epoch_id"] = epoch_id - trainer_args["step_id"] = step_id - return trainer_args - - def _save_checkpoint(self, epoch_id, step_id): - assert self.checkpoint_cfg - - if epoch_id % self.checkpoint_cfg.epoch_interval == 0 \ - and step_id % self.checkpoint_cfg.step_interval == 0: - exe = executor.Executor(self.place) - save_checkpoint( - executor=exe, - checkpoint_dir=self.checkpoint_cfg.checkpoint_dir, - trainer_id=self.trainer_id, - trainer_args=self._get_checkpoint_save_args(epoch_id, step_id), - main_program=self.train_program, - max_num_checkpoints=self.checkpoint_cfg.max_num_checkpoints) - - def _load_checkpoint(self): - with self._prog_and_scope_guard(): - exe = executor.Executor(self.place) - load_checkpoint( - executor=exe, - checkpoint_dir=self.checkpoint_cfg.checkpoint_dir, - main_program=self.startup_program) - - if not self.checkpoint_cfg.pserver_id: - load_trainer_args = self._get_checkpoint_load_args() - trainer_args = load_checkpoint( - executor=exe, - checkpoint_dir=self.checkpoint_cfg.checkpoint_dir, - main_program=self.startup_program, - role_id=self.trainer_id, - is_trainer=True, - load_trainer_args=load_trainer_args) - - if len(trainer_args) != 2: - raise ValueError( - "the return trainer_args length do not equal _get_checkpoint_load_args" - ) - self.checkpoint_cfg.epoch_id = int(trainer_args[0]) - self.checkpoint_cfg.step_id = int(trainer_args[1]) - else: - if self.checkpoint_cfg.lookup_table_name: - load_checkpoint( - executor=exe, - checkpoint_dir=self.checkpoint_cfg.checkpoint_dir, - main_program=self.startup_program, - role_id=self.checkpoint_cfg.pserver_id, - is_trainer=False, - load_trainer_args=None, - load_lookup_table=self.checkpoint_cfg.lookup_table_name) - - -def build_feed_var_list(program, feed_order): - if not isinstance(program, framework.Program): - raise TypeError("The 'program' should be an object of Program") - - if isinstance(feed_order, list): - feed_var_list = [ - program.global_block().var(var_name) for var_name in feed_order - ] - else: - if not isinstance(feed_order, dict): - raise TypeError( - "The 'feed_order' should be either None, list or dict.") - if not sorted(feed_order.values()) == list(range(len(feed_order))): - raise ValueError( - "The values of 'feed_order' should be a permutation of [0, len(feed_order))" - ) - sorted_pair_list = sorted( - six.iteritems(feed_order), key=lambda item: item[1]) - feed_var_list = [ - program.global_block().var(pair[0]) for pair in sorted_pair_list - ] - return feed_var_list - - -# move Checkpoint APIs from io.py to trainer.py, make all of them are private. -SUCCESS_MARK_FILENAME = "_SUCCESS" -CHECKPOINT_PREFIX = "checkpoint" -MODEL_DIR = "__model__" -LOOKUP_TABLE_DIR = "__lookup_table__" -TRAINER_PREFIX = "trainer" -CHECKPOINT_SEPARATOR = "_" - - -def save_checkpoint(executor, - checkpoint_dir, - trainer_id, - main_program, - trainer_args=None, - max_num_checkpoints=3, - lookup_table=None, - pserver_endpoints=None): - """ - This function filters out all checkpoint variables from the give - main_program and then saves these variables to the `checkpoint_dir` - directory. - - In the training precess, we generally save a checkpoint in each - iteration. So there might be a lot of checkpoints in the - `checkpoint_dir`. To avoid them taking too much disk space, the - `max_num_checkpoints` are introduced to limit the total number of - checkpoints. If the number of existing checkpints is greater than - the `max_num_checkpoints`, oldest ones will be scroll deleted. 
- - A variable is a checkpoint variable and will be saved if it meets - all following conditions: - 1. It's persistable. - 2. It's type is not FEED_MINIBATCH nor FETCH_LIST nor RAW. - 3. It's name contains no "@GRAD" nor ".trainer_" nor ".block". - - Args: - executor(Executor): The executor to run for save checkpoint. - checkpoint_dir(str): The folder where to save checkpoints. - trainer_id(int): currect trainer id, if id is equal to 0, the trainer - is chief. - trainer_args(dict|None): Current training arguments. Such as 'epoch_id' - and 'step_id'. - Defaut: None - main_program(Program): The program whose checkpoint variables will - be saved. - max_num_checkpoints(int): The max number of total number of existing - checkpoints. - Default: 3 - lookup_table(string|None): the lookup table name, when use distribute - lookup table, we can get lookup table name by DistributeTranspiler. - table_name - pserver_endpoints(list|None): the parameter server ip:port list. - when use distribute lookup table, we can get pserver_endpoints by - distribute arguments. - - Returns: - None - - Raises: - ValueError: If `checkpoint_dir` is None. - AssertionError: If `trainer_args` is not a dict. - - Examples: - .. code-block:: python - - exe = fluid.Executor(fluid.CPUPlace()) - path = "./checkpoints" - prog = fluid.default_main_program() - trainer_args = {"epoch_id": 200, - "step_id": 20} # just an example - table_name = "share_w" - ps_endpoints = ["127.0.0.1:6000","127.0.0.1:6001"] - - save_checkpoint(executor=exe, - checkpoint_dir=path, - trainer_id=0, - trainer_args=trainer_args, - main_program=prog, - max_num_checkpoints=3, - lookup_table=table_name, - pserver_endpoints = ps_endpoints) - """ - if checkpoint_dir is None: - raise ValueError("'checkpoint_dir' should not be None") - - if main_program is None: - raise ValueError('main_program should not be None.') - - if trainer_args: - assert isinstance(trainer_args, dict) - - is_chief = trainer_id == 0 - - _make_chekcpoint_dirs(checkpoint_dir) - serial = _get_latest_checkpoint_serial(checkpoint_dir) + 1 - cur_dir = _get_serial_dir(checkpoint_dir, serial) - - _save_trainer_args(cur_dir, trainer_id, trainer_args) - - if is_chief: - _save_persist_vars_without_grad(executor, cur_dir, main_program) - - if is_chief and lookup_table and pserver_endpoints: - _save_pserver_vars_by_notify(executor, cur_dir, lookup_table, - pserver_endpoints) - - _scroll_delete(checkpoint_dir, max_num_checkpoints) - - -def load_checkpoint(executor, - checkpoint_dir, - main_program, - role_id=0, - is_trainer=True, - load_trainer_args=None, - load_lookup_table=None): - """ - This function filters out all checkpoint variables from the give - main_program and then try to load these variables from the - `checkpoint_dir` directory. - - In the training precess, we generally save a checkpoint in each - iteration. So there are more than one checkpoint in the - `checkpoint_dir` (each checkpoint has its own sub folder), use - `serial` to specify which serial of checkpoint you would like to - load. - - A variable is a checkpoint variable and will be loaded if it meets - all following conditions: - 1. It's persistable. - 2. It's type is not FEED_MINIBATCH nor FETCH_LIST nor RAW. - 3. It's name contains no "@GRAD" nor ".trainer_" nor ".block". - - Args: - executor(Executor): The executor to run for loading checkpoint. - checkpoint_dir(str): The folder where all checkpoints are. - serial(int): The serial of checkpoint you would like to load. 
- main_program(Program): The program whose checkpoint variables will - be loaded. - role_id(int): the trainer id or the parameter server id. - is_trainer(bool): trainer is True and parameter server is False. - load_trainer_args(list|None): list about load trainer args. - load_lookup_table(str|None): the lookup table name - - Returns: - None - - Raises: - ValueError: If `checkpoint_dir` is None. - ValueError: If `main_program` is None. - - Examples: - .. code-block:: python - - exe = fluid.Executor(fluid.CPUPlace()) - path = "./checkpoints" - prog = fluid.default_main_program() - load_checkpoint(executor=exe, checkpoint_dir=path, - serial=9, main_program=prog) - - # In this example, `load_checkpoint` function - # will first filters out all checkpoint variables in the default - # main program, and then try to load these variables form the - # folder "./checkpoints/checkpoint_9/__model__". - """ - - if checkpoint_dir is None: - raise ValueError("'checkpoint_dir' should not be None") - - serial = _get_latest_checkpoint_serial(checkpoint_dir) - - # there are nothing need to be loaded - if serial is None or serial < 0: - return - - if main_program is None: - raise ValueError('main_program should not be None.') - - if is_trainer and load_trainer_args is None: - cur_dir = _get_serial_dir(checkpoint_dir, serial) - _load_persist_vars_without_grad(executor, cur_dir, main_program, True) - return - - if is_trainer and load_trainer_args: - return _load_trainer_args(checkpoint_dir, serial, role_id, - load_trainer_args) - - if not is_trainer and load_lookup_table: - _load_lookup_table_vars(executor, checkpoint_dir, main_program, role_id, - load_lookup_table) - - -def clean_checkpoint(checkpoint_dir, delete_dir=False): - """ - clean the checkpoint dir, when the train exits normally, - the trainer will call clean_checkpoint to delete checkpoint directory saved before. - delete_dir only works when the directory is empty, otherwise, OSError is raised. - - : param checkpoint_dir - : param delete_dir - """ - - if checkpoint_dir is None: - raise ValueError("'checkpoint_dir' should not be None") - _scroll_delete(checkpoint_dir, max_num_checkpoints=0) - - if delete_dir and not os.listdir(checkpoint_dir): - os.rmdir(checkpoint_dir) - - -def _load_persist_vars_without_grad(executor, - dirname, - program, - has_model_dir=False): - """ - This function filters out all checkpoint variables from the give - program and then trys to load these variables from the given directory. - - A variable is a checkpoint variable if it meets all following - conditions: - 1. It's persistable. - 2. It's type is not FEED_MINIBATCH nor FETCH_LIST nor RAW. - 3. It's name contains no "@GRAD" nor ".trainer_" nor ".block". - - Args: - executor(Executor): The executor to run for loading variables. - dirname(str): The directory path. - program(Program): The program whose checkpoint variables will - be loaded. - has_model_dir(bool): if True, the function loads variables - from a sub directory named '__model__'. - Default: False - - Returns: - None - - Examples: - .. code-block:: python - - exe = fluid.Executor(fluid.CPUPlace()) - param_path = "./my_paddle_model" - prog = fluid.default_main_program() - _load_persist_vars_without_grad(executor=exe, - dirname=param_path, program=prog, has_model_dir=True) - - # In this example, `_load_persist_vars_without_grad` function - # will first filters out all checkpoint variables in the default - # main program, and then trys to load these variables form the - # folder "./my_paddle_model/__model__". 
- """ - - if has_model_dir: - dirname = _get_model_dir(dirname) - - io.load_vars( - executor, - dirname=dirname, - main_program=program, - predicate=_is_checkpoint_var, - filename=None) - - -def _load_lookup_table_vars(executor, dirname, program, pserver_id, table_name): - """ - The parameter server will load lookup table's local file in - selectedrows variable. - - Args: - executor(Executor): The executor to run for loading persistable variables - dirname(str): The directory path - main_program(Program): Find the variable named table_name in main_program - pserver_id(int): the serial number in pserver_endpoints list - table_name(str): lookup table name - - Returns: - None - - Examples: - .. code-block:: python - - exe = fluid.Executor(fluid.CPUPlace()) - dirname = "./checkpoints/checkpoint_9/" - prog = fluid.default_main_program() - pserver_id = 1 - table_name = "share_w" - _load_lookup_table_vars(executor=exe, - dirname=dirname, program=prog, pserver_id=pserver_id, - table_name=table_name) - """ - - for var in program.list_vars(): - if var.name == table_name: - lookup_table_var = var - break - - assert lookup_table_var is not None - - lookup_table_dir = os.path.join(dirname, LOOKUP_TABLE_DIR) - table_file = table_name + CHECKPOINT_SEPARATOR + str(pserver_id) - - load_prog = framework.Program() - load_block = load_prog.global_block() - - load_block.append_op( - type='load', - inputs={}, - outputs={'Out': [lookup_table_var]}, - attrs={'file_path': os.path.join(lookup_table_dir, table_file)}) - - executor.run(load_prog) - - -def _save_persist_vars_without_grad(executor, dirname, program): - """ - This function filters out all checkpoint variables from the give - program and then save these variables to a sub-folder '__model__' of - the given directory. - - A variable is a checkpoint variable if it meets all following - conditions: - 1. It's persistable. - 2. It's type is not FEED_MINIBATCH nor FETCH_LIST nor RAW. - 3. It's name contains no "@GRAD" nor ".trainer_" nor ".block". - - Args: - executor(Executor): The executor to run for saving variables. - dirname(str): The directory path. - program(Program): The program whose checkpoint variables will - be saved. - - Returns: - None - - Examples: - .. code-block:: python - - exe = fluid.Executor(fluid.CPUPlace()) - param_path = "./my_paddle_model" - prog = fluid.default_main_program() - _save_persist_vars_without_grad(executor=exe, - dirname=param_path, program=prog) - - # In this example, `_save_persist_vars_without_grad` function - # will first filters out all checkpoint variables in the default - # main program, and then saves these variables to the folder - # "./my_paddle_model/__model__". - """ - cur_dir = _get_model_dir(dirname) - io.save_vars( - executor, - dirname=cur_dir, - main_program=program, - vars=None, - predicate=_is_checkpoint_var, - filename=None) - _write_success(cur_dir) - - -def _save_pserver_vars_by_notify(executor, dirname, lookup_table, - ps_endpoint_list): - """ - This function will send checkpoint notify message from Trainer 0 - to all the pservers. - The checkpoint notify message contains lookup table name, - the absolute path on pserver to save lookup_table. - - Args: - executor(Executor): The executor to run for send checkpoint notify. - dirname(str): The folder where to save checkpoints. - lookup_table(string): the lookup table name, when use distribute - lookup table, we can get lookup table name by DistributeTranspiler. - table_name - ps_endpoint_list(list): the parameter server ip:port list. 
- when use distribute lookup table, we can get ps_endpoint_list by - distribute arguments. - Return: - None - - Examples: - .. code-block:: python - - exe = fluid.Executor(fluid.CPUPlace()) - param_path = "./my_paddle_model" - prog = fluid.default_main_program() - table_name = "share_w" - ps_endpoints = ["127.0.0.1:6000","127.0.0.1:6001"] - - _save_pserver_vars_by_notify(executor=exe, - dirname=param_path, lookup_table=table_name, - ps_endpoint_list=ps_endpoints) - """ - cur_dir = _get_lookuptable_dir(dirname) - - checkpoint_notify_program = framework.Program() - checkpoint_notify_block = checkpoint_notify_program.global_block() - - attrs = {} - attrs['epmap'] = ps_endpoint_list - attrs['dir'] = cur_dir - attrs['lookup_table'] = lookup_table - - checkpoint_notify_block.append_op( - type='checkpoint_notify', inputs={}, outputs={}, attrs=attrs) - executor.run(checkpoint_notify_program) - - -def _save_trainer_args(dirname, trainer_id, trainer_args): - assert isinstance(trainer_args, dict) - - cur_dir = _get_trainer_dir(dirname, trainer_id) - - for name, value in six.iteritems(trainer_args): - args_file = os.path.join(cur_dir, name) - with open(args_file, 'w') as f: - f.write(str(value)) - _write_success(cur_dir) - - -def _load_trainer_args(checkpoint_dir, serial, trainer_id, trainer_args): - """ - trainer will load some args from it's independent directory, - such as epoch_id and step_id. - - Args: - checkpoint_dir(str): The folder where all checkpoints are. - serial(int): The serial of checkpoint you would like to load. - trainer_id(int): current trainer id. - trainer_args(list): list about load trainer args - Return: - None - - Examples: - .. code-block:: python - - param_path = "./checkpoint/" - serial = 7 - trainer_id = 2 - trainer_args = ["epoch_id", "step_id"] - - _load_trainer_args(checkpoint_dir=param_path, serial=serial, - trainer_id=trainer_id, trainer_args=trainer_args) - """ - assert isinstance(trainer_args, list) - - cur_dir = _get_serial_dir(checkpoint_dir, serial) - cur_dir = _get_trainer_dir(cur_dir, trainer_id) - - ret_values = [] - - for arg in trainer_args: - cur_file = os.path.join(cur_dir, arg) - with open(cur_file, 'r') as f: - contents = f.read() - ret_values.append(contents.strip()) - return ret_values - - -def _is_checkpoint_var(var): - """ - the checkpoint will not save or load all the variables. - var type is FEED_MINIBATCH/FETCH_LIST/RAW or var name ends with @GRAD are discarded. - - : param var(Variable) - """ - if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \ - var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \ - var.desc.type() == core.VarDesc.VarType.RAW: - return False - # @GRAD are named for gradient variables, checkpoint will not save it. - if "@GRAD" in var.name: - return False - # .trainer_ are named for distribute train variables, checkpoint will not save it. - if ".trainer_" in var.name: - return False - - # .block is named for distribute train variables, checkpoint will not save it. - if ".block" in var.name: - return False - - return var.persistable - - -def _make_chekcpoint_dirs(dirs): - """ - _make_chekcpoint_dirs will makdir local directory directly, when the directory is exist, it will igore it. 
- """ - assert dirs is not None - - if os.path.isfile(dirs): - raise OSError(errno.ENOTDIR, "dirs path shoule be a Directory.", dirs) - - if not os.path.isdir(dirs): - try: - os.makedirs(dirs) - except OSError as err: - if err.errno != errno.EEXIST: - raise err - - -def _get_dir_serial(dirname): - _, serial = dirname.split(CHECKPOINT_SEPARATOR) - - try: - serial_num = int(serial) - except ValueError: - serial_num = -1 - return serial_num - - -def _get_serial_dir(dirname, serial): - serial_folder = CHECKPOINT_PREFIX + CHECKPOINT_SEPARATOR + str(serial) - serial_dir = os.path.join(dirname, serial_folder) - _make_chekcpoint_dirs(serial_dir) - - return serial_dir - - -def _get_model_dir(dirname): - model_dir = os.path.join(dirname, MODEL_DIR) - _make_chekcpoint_dirs(model_dir) - return model_dir - - -def _get_lookuptable_dir(dirname): - lookuptable_dir = os.path.join(dirname, LOOKUP_TABLE_DIR) - _make_chekcpoint_dirs(lookuptable_dir) - return lookuptable_dir - - -def _get_trainer_dir(dirname, trainer_id): - trainer_folder = TRAINER_PREFIX + CHECKPOINT_SEPARATOR + str(trainer_id) - trainer_dir = os.path.join(dirname, trainer_folder) - _make_chekcpoint_dirs(trainer_dir) - return trainer_dir - - -def _scroll_delete(dirname, max_num_checkpoints=3): - dirs = os.listdir(dirname) - serial_map = {} - for serial in dirs: - serial_num = _get_dir_serial(serial) - serial_map[serial_num] = serial - - if len(list(serial_map.keys())) <= max_num_checkpoints: - return - - serials = list(serial_map.keys()) - serials.sort(reverse=True) - serials = serials[max_num_checkpoints:] - for serial in serials: - cur_dir = _get_serial_dir(dirname, serial) - try: - shutil.rmtree(cur_dir) - except OSError as err: - if err.errno != errno.ENOENT: - raise err - - -def _write_success(dirname): - """ - write an empty file named "_SUCCESS" in checkpoint dir, indicate this checkpoint is correct. - - : param dirname - """ - success_file = os.path.join(dirname, SUCCESS_MARK_FILENAME) - with open(success_file, 'a') as f: - now = time.ctime() - f.write(now) - - -def _get_latest_checkpoint_serial(checkpoint_dir): - """ - get the latest file in checkpoint directory, the _SUCCESS file must exist in the directory - - : param checkpoint_dir - """ - if not checkpoint_dir: - return -1 - - def has_success(checkpoint_dir, cur_dir): - """ - is _SUCCESS in this dir - """ - - serial = _get_dir_serial(cur_dir) - if serial == -1 or not os.path.isdir( - os.path.join(checkpoint_dir, cur_dir)): - return -1 - - success_path = os.path.join( - _get_serial_dir(checkpoint_dir, serial), MODEL_DIR, - SUCCESS_MARK_FILENAME) - if os.path.isfile(success_path): - return serial - - if not os.path.isdir(checkpoint_dir): - return -1 - - current_dir = -1 - dirs = os.listdir(checkpoint_dir) - for cur_dir in dirs: - success_num = has_success(checkpoint_dir, cur_dir) - if success_num > current_dir: - current_dir = success_num - return current_dir +# NOTE: Trainer is moved into fluid.contrib.trainer. 
+__all__ = [] diff --git a/python/paddle/fluid/transpiler/details/program_utils.py b/python/paddle/fluid/transpiler/details/program_utils.py index 200175cfe87e24a53e1e229e41d1ff2a25fd66ec..59899e7e9ab98f661699d5ac0645c92bd23a1512 100644 --- a/python/paddle/fluid/transpiler/details/program_utils.py +++ b/python/paddle/fluid/transpiler/details/program_utils.py @@ -21,13 +21,12 @@ import paddle def delete_ops(block, ops): - try: - start = list(block.ops).index(ops[0]) - end = list(block.ops).index(ops[-1]) - [block._remove_op(start) for _ in six.moves.range(end - start + 1)] - except Exception as e: - raise e - block.program._sync_with_cpp() + for op in ops: + try: + idx = list(block.ops).index(op) + block._remove_op(idx) + except Exception as e: + print(e) def find_op_by_input_arg(block, arg_name): @@ -37,10 +36,18 @@ def find_op_by_input_arg(block, arg_name): return -1 -def find_op_by_output_arg(block, arg_name): - for index, op in enumerate(block.ops): - if arg_name in op.output_arg_names: - return index +def find_op_by_output_arg(block, arg_name, reverse=False): + if reverse: + pos = len(block.ops) - 1 + while pos >= 0: + op = block.ops[pos] + if arg_name in op.output_arg_names: + return pos + pos -= 1 + else: + for index, op in enumerate(block.ops): + if arg_name in op.output_arg_names: + return index return -1 diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index f58f1883a407a3123856e19b5ec8fc01862466a7..43071def7a906e585909e50e4c0c52c56d981cde 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -50,6 +50,15 @@ OP_ROLE_VAR_ATTR_NAME = core.op_proto_and_checker_maker.kOpRoleVarAttrName() RPC_OP_ROLE_ATTR_NAME = op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName( ) RPC_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.RPC +DIST_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.Dist +LR_SCHED_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.LRSched + +PRINT_LOG = False + + +def log(*args): + if PRINT_LOG: + print(args) class VarBlock: @@ -127,6 +136,9 @@ class DistributeTranspilerConfig(object): slice_var_up = True split_method = None min_block_size = 8192 + # supported modes: pserver, nccl2 + mode = "pserver" + print_log = False class DistributeTranspiler(object): @@ -134,27 +146,30 @@ class DistributeTranspiler(object): **DistributeTranspiler** Convert the fluid program to distributed data-parallelism programs. + Supports two modes: pserver mode and nccl2 mode. + + In pserver mode, the main_program will be transformed to use a remote + parameter server to do parameter optimization. And the optimization + graph will be put into a parameter server program. - The main_program will be transformed to use a remote parameter server - to do parameter optimization. And the optimization graph will be put - into a parameter server program. + In nccl2 mode, the transpiler will append a NCCL_ID broadcasting + op in startup_program to share the NCCL_ID across the job nodes. + After transpile_nccl2 called, you ***must*** pass trainer_id and + num_trainers argument to ParallelExecutor to enable NCCL2 distributed + mode. Examples: .. code-block:: python - # Define your model before these codes. 
- port = os.getenv("PADDLE_PSERVER_PORT", "6174") - pserver_ips = os.getenv("PADDLE_PSERVER_IPS", "") - eplist = [] - for ip in pserver_ips.split(","): - eplist.append(':'.join([ip, port])) - pserver_endpoints = ",".join(eplist) - trainers = int(os.getenv("PADDLE_TRAINERS")) - current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port - trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0")) + # for pserver mode + pserver_endpoints = "192.168.0.1:6174,192.168.0.2:6174" + trainer_endpoints = "192.168.0.1:6174,192.168.0.2:6174" + current_endpoint = "192.168.0.1:6174" + trainer_id = 0 + trainers = 4 role = os.getenv("PADDLE_TRAINING_ROLE") - t = distribute_transpiler.DistributeTranspiler() + t = fluid.DistributeTranspiler() t.transpile( trainer_id, pservers=pserver_endpoints, trainers=trainers) if role == "PSERVER": @@ -163,6 +178,18 @@ class DistributeTranspiler(object): pserver_program) elif role == "TRAINER": trainer_program = t.get_trainer_program() + + # for nccl2 mode + config = fluid.DistributeTranspilerConfig() + config.mode = "nccl2" + t = fluid.DistributeTranspiler(config=config) + t.transpile(trainer_id, trainers=trainer_endpoints, current_endpoint=current_endpoint) + exe = fluid.ParallelExecutor( + use_cuda, + loss_name=loss_var.name, + num_trainers=len(trainer_endpoints.split(",")), + trainer_id=trainer_id + ) """ def __init__(self, config=None): @@ -174,16 +201,47 @@ class DistributeTranspiler(object): if self.config.split_method is None: self.config.split_method = RoundRobin + global PRINT_LOG + if self.config.print_log: + PRINT_LOG = True assert (self.config.min_block_size >= 8192) assert (self.config.split_method.__bases__[0] == PSDispatcher) + def _transpile_nccl2(self, + trainer_id, + trainers, + current_endpoint, + startup_program=None): + if not startup_program: + startup_program = default_startup_program() + if trainer_id >= 0: + worker_endpoints = trainers.split(",") + # send NCCL_ID to others or recv from trainer 0 + worker_endpoints.remove(current_endpoint) + + nccl_id_var = startup_program.global_block().create_var( + name="NCCLID", persistable=True, type=core.VarDesc.VarType.RAW) + startup_program.global_block().append_op( + type="gen_nccl_id", + inputs={}, + outputs={"NCCLID": nccl_id_var}, + attrs={ + "endpoint": current_endpoint, + "endpoint_list": worker_endpoints, + "trainer_id": trainer_id + }) + return nccl_id_var + else: + raise ValueError("must set trainer_id >= 0") + def transpile(self, trainer_id, program=None, pservers="127.0.0.1:6174", trainers=1, sync_mode=True, - startup_program=None): + startup_program=None, + current_endpoint="127.0.0.1:6174"): """ Run the transpiler. @@ -194,10 +252,15 @@ class DistributeTranspiler(object): default is fluid.default_main_program(). pservers (str): comma separated ip:port string for the pserver list. - trainers (int): number of trainers in the distributed job. + trainers (int|str): in pserver mode this is the number of + trainers, in nccl2 mode this is a string of trainer + endpoints. sync_mode (bool): Do sync training or not, default is True. startup_program (Program|None): startup_program to transpile, default is fluid.default_main_program(). + current_endpoint (str): the current endpoint; must be passed when + transpiling in nccl2 distributed mode. In pserver mode + this argument is not used.
""" if program is None: program = default_main_program() @@ -207,6 +270,15 @@ class DistributeTranspiler(object): self.startup_program = startup_program self.origin_startup_program = self.startup_program.clone() + if self.config.mode == "nccl2": + assert (isinstance(trainers, str)) + self._transpile_nccl2( + trainer_id, + trainers, + current_endpoint, + startup_program=startup_program) + return + self.trainer_num = trainers self.sync_mode = sync_mode self.trainer_id = trainer_id @@ -257,12 +329,12 @@ class DistributeTranspiler(object): splited_grad_varname = grad_varname if len(splited_vars) == 1: splited_grad_varname = splited_vars[0].name - index = find_op_by_output_arg(program.global_block(), - splited_grad_varname) + index = find_op_by_output_arg( + program.global_block(), splited_grad_varname, reverse=True) elif len(splited_vars) > 1: orig_var = program.global_block().vars[splited_grad_varname] - index = find_op_by_output_arg(program.global_block(), - splited_grad_varname) + index = find_op_by_output_arg( + program.global_block(), splited_grad_varname, reverse=True) self._insert_split_op(program, orig_var, index, splited_vars) index += 1 else: @@ -301,7 +373,7 @@ class DistributeTranspiler(object): self.grad_name_to_send_dummy_out[ self.table_name] = program.global_block().create_var( name=framework.generate_control_dev_var_name()) - input_deps = self.grad_name_to_send_dummy_out.values() + input_deps = list(self.grad_name_to_send_dummy_out.values()) program.global_block().append_op( type="send_barrier", @@ -377,7 +449,10 @@ class DistributeTranspiler(object): type="concat", inputs={"X": splited_var}, outputs={"Out": [orig_param]}, - attrs={"axis": 0}) + attrs={ + "axis": 0, + RPC_OP_ROLE_ATTR_NAME: DIST_OP_ROLE_ATTR_VALUE + }) self._get_trainer_startup_program(recv_vars=recv_vars, eplist=eplist) @@ -496,9 +571,9 @@ class DistributeTranspiler(object): # NOTE: assume blocks of the same variable is not distributed # on the same pserver, only change param/grad varnames for # trainers to fetch. - sys.stderr.write("get_pserver_program() is deprecated, call\ - get_pserver_programs() to get pserver main and startup\ - in a single call.") + sys.stderr.write("get_pserver_program() is deprecated, call \ +get_pserver_programs() to get pserver main and startup \ +in a single call.") # step1 pserver_program = Program() pserver_program.random_seed = self.origin_program.random_seed @@ -615,22 +690,31 @@ class DistributeTranspiler(object): for idx, opt_op in enumerate(opt_op_on_pserver): per_opt_block = pserver_program._create_block(pre_block_idx) optimize_blocks.append(per_opt_block) + optimize_target_param_name = opt_op.attr(OP_ROLE_VAR_ATTR_NAME)[0] # append grad merging ops before clip and weight decay - # cases may like: - # L2Decay op -> clip op -> optimize + # e.g. merge grad -> L2Decay op -> clip op -> optimize + merged_var = None for _, op in enumerate(self.optimize_ops): - # find the origin @GRAD var before clipping - grad_varname_for_block = __op_have_grad_input__(op) - if ufind.is_connected(op, opt_op) and grad_varname_for_block: + # find the origin grad var before clipping/L2Decay, + # merged_var should be the input var name of L2Decaybuil + grad_varname_for_block = op.attr(OP_ROLE_VAR_ATTR_NAME)[1] + if op.attr(OP_ROLE_VAR_ATTR_NAME)[ + 0] == optimize_target_param_name: merged_var = self._append_pserver_grad_merge_ops( per_opt_block, grad_varname_for_block, endpoint, grad_to_block_id, self.origin_program) - break # append optimize op once then append other ops. 
- for _, op in enumerate(self.optimize_ops): - # optimizer is connected to itself - if ufind.is_connected(op, opt_op) and op not in global_ops: - __append_optimize_op__(op, per_opt_block, grad_to_block_id, - merged_var, lr_ops) + if merged_var: + break # append optimize op once then append other ops. + if merged_var: + for _, op in enumerate(self.optimize_ops): + # optimizer is connected to itself + if op.attr(OP_ROLE_VAR_ATTR_NAME)[0] == optimize_target_param_name and \ + op not in global_ops: + log("append opt op: ", op.type, op.input_arg_names, + merged_var) + __append_optimize_op__(op, per_opt_block, + grad_to_block_id, merged_var, + lr_ops) # dedup grad to ids list grad_to_block_id = list(set(grad_to_block_id)) @@ -726,17 +810,17 @@ class DistributeTranspiler(object): Returns: Program: parameter server side startup program. """ - sys.stderr.write("get_startup_program() is deprecated, call\ - get_pserver_programs() to get pserver main and startup\ - in a single call.") + sys.stderr.write("get_startup_program() is deprecated, call \ +get_pserver_programs() to get pserver main and startup \ +in a single call.") if pserver_program != None: - sys.stderr.write("passing pserver_program to get_startup_program()\ - is deprecated, you can use new API get_pserver_programs() to\ - get both pserver main program and startup program.") + sys.stderr.write("passing pserver_program to get_startup_program() \ +is deprecated, you can use new API get_pserver_programs() to \ +get both pserver main program and startup program.") if startup_program != None: - sys.stderr.write("passing startup_program to get_startup_program()\ - is deprecated, use fluid.program_guard() or pass this argument\ - to transpile() call.") + sys.stderr.write("passing startup_program to get_startup_program() \ +is deprecated, use fluid.program_guard() or pass this argument \ +to transpile() call.") s_prog = Program() orig_s_prog = self.startup_program @@ -1057,7 +1141,7 @@ class DistributeTranspiler(object): if self.sync_mode else [] }, attrs={ - "sync_mode": False, + "sync_mode": self.sync_mode, "epmap": pserver_endpoints, RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE, OP_ROLE_VAR_ATTR_NAME: [ @@ -1302,7 +1386,10 @@ class DistributeTranspiler(object): type="split_selected_rows", inputs={"X": orig_var}, outputs={"Out": splited_vars}, - attrs={"height_sections": height_sections}) + attrs={ + "height_sections": height_sections, + RPC_OP_ROLE_ATTR_NAME: DIST_OP_ROLE_ATTR_VALUE + }) elif orig_var.type == core.VarDesc.VarType.LOD_TENSOR: sections = [] for v in splited_vars: @@ -1312,8 +1399,10 @@ class DistributeTranspiler(object): type="split_byref", inputs={"X": orig_var}, outputs={"Out": splited_vars}, - attrs={"sections": sections} # assume split evenly - ) + attrs={ + "sections": sections, + RPC_OP_ROLE_ATTR_NAME: DIST_OP_ROLE_ATTR_VALUE + }) else: AssertionError("Variable type should be in set " "[LOD_TENSOR, SELECTED_ROWS]") @@ -1381,15 +1470,15 @@ class DistributeTranspiler(object): if not grad_block: # do not append this op if current endpoint # is not dealing with this grad block - return + return None orig_varname, block_name, trainer_name = self._get_varname_parts( grad_block.name) if block_name: merged_var_name = '.'.join([orig_varname, block_name]) else: merged_var_name = orig_varname - merged_var = \ - pserver_block.vars[merged_var_name] + + merged_var = pserver_block.vars[merged_var_name] grad_to_block_id.append(merged_var.name + ":" + str(optimize_block.idx)) if self.sync_mode and self.trainer_num > 1: vars2merge = [] @@ 
-1473,7 +1562,6 @@ class DistributeTranspiler(object): outputs = self._get_output_map_from_op( self.origin_program.global_block().vars, opt_op) outputs["ParamOut"] = new_inputs["Param"] - optimize_block.append_op( type=opt_op.type, inputs=new_inputs, @@ -1618,6 +1706,16 @@ class DistributeTranspiler(object): return iomap def _get_lr_ops(self): + lr_ops = [] + block = self.origin_program.global_block() + for op in block.ops: + if int(op.attr(RPC_OP_ROLE_ATTR_NAME)) == int( + LR_SCHED_OP_ROLE_ATTR_VALUE): + lr_ops.append(op) + log("append lr op: ", op.type) + return lr_ops + + def _get_lr_ops_deprecated(self): lr_ops = [] # find learning rate variables by optimize op lr_vars = set() @@ -1670,20 +1768,21 @@ class DistributeTranspiler(object): block = self.origin_program.global_block() opt_ops = [] params_grads = [] + # tmp set to dedup + optimize_params = set() origin_var_dict = self.origin_program.global_block().vars for op in block.ops: if self._is_opt_role_op(op): opt_ops.append(op) - # HACK(wuyi): if we find grad vars from input of optimize - # ops, we may get the output of clip op. Use syntax "@GRAD" - # and op_role_var to get the pair. - for input_name in op.input_arg_names: - if input_name.find("@GRAD") != -1 and \ - op.attr(RPC_OP_ROLE_ATTR_NAME): - param_name = op.attr(OP_ROLE_VAR_ATTR_NAME)[0] + if op.attr(OP_ROLE_VAR_ATTR_NAME): + param_name = op.attr(OP_ROLE_VAR_ATTR_NAME)[0] + grad_name = op.attr(OP_ROLE_VAR_ATTR_NAME)[1] + if not param_name in optimize_params: + optimize_params.add(param_name) + log("adding param_grad pair: ", param_name, grad_name) params_grads.append([ origin_var_dict[param_name], - origin_var_dict[input_name] + origin_var_dict[grad_name] ]) else: pass diff --git a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py index d4517059a4b033eec20ef6903894426ccbd597d7..d5aa54d752305b188d292f95f05cd70d27702c35 100755 --- a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py +++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py @@ -14,10 +14,10 @@ from __future__ import print_function -from collections import defaultdict +from collections import defaultdict, OrderedDict, Callable from .. import core from ... import compat as cpt -from ..framework import Program, default_main_program, Parameter +from ..framework import Program, default_main_program, Parameter, Variable from ..backward import _rename_arg_ from functools import reduce from six.moves import range @@ -113,8 +113,10 @@ class ControlFlowGraph(object): def _fill_pool(self, i, is_forward): block_desc = self._ops[i].block() in_diff, _ = self._get_diff(self._live_in[i], self._live_out[i]) + # NOTE: must sort the in_diff set for cases that get different cache var. + # FIXME(typhoonzero): maybe use a "sorted set" is better than this. can_optimize = [ - x for x in in_diff + x for x in sorted(list(in_diff)) if self._check_var_validity(block_desc, x, is_forward) ] if can_optimize: @@ -220,8 +222,9 @@ class ControlFlowGraph(object): block_desc = op.block() is_forward = i < self._forward_num if self.pool: + # NOTE: must sort the in_diff set for cases that get different cache var. 
defs_can_optimize = [ - x for x in self._defs[i] + x for x in sorted(list(self._defs[i])) if self._check_var_validity(block_desc, x, is_forward) ] out_pair = [ @@ -271,6 +274,8 @@ class ControlFlowGraph(object): self._program.block(block_desc.id).var(cpt.to_text( x)).desc = self._find_var(block_desc, cache_var, is_forward) + self._program.block(block_desc.id).vars[cpt.to_text(x)] = \ + Variable(self._program.block(block_desc.id), name=cpt.to_text(x)) self._update_graph(x, cache_var, begin_idx=i) break self._fill_pool(i, is_forward)
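The nccl2 branch added to DistributeTranspiler only injects a gen_nccl_id broadcast op into the startup program; training itself is still driven through ParallelExecutor. A minimal sketch of how the new mode can be wired up on one node, assuming a two-trainer job with placeholder endpoints and a toy regression loss; every concrete name and address below is illustrative rather than taken from this diff.

.. code-block:: python

    import paddle.fluid as fluid

    # toy model so the sketch is self-contained; any loss variable works
    x = fluid.layers.data(name="x", shape=[13], dtype="float32")
    y = fluid.layers.data(name="y", shape=[1], dtype="float32")
    pred = fluid.layers.fc(input=x, size=1)
    loss_var = fluid.layers.mean(
        fluid.layers.square_error_cost(input=pred, label=y))
    fluid.optimizer.SGD(learning_rate=0.01).minimize(loss_var)

    # placeholder job description: two trainers, no parameter servers
    trainer_endpoints = "192.168.0.1:6174,192.168.0.2:6174"
    current_endpoint = "192.168.0.1:6174"   # endpoint of this node
    trainer_id = 0                          # index of this node

    config = fluid.DistributeTranspilerConfig()
    config.mode = "nccl2"
    t = fluid.DistributeTranspiler(config=config)
    # in nccl2 mode `trainers` is the comma-separated endpoint string; the
    # transpiler only appends a gen_nccl_id op to the startup program
    t.transpile(
        trainer_id,
        trainers=trainer_endpoints,
        current_endpoint=current_endpoint)

    # trainer_id and num_trainers must still be handed to ParallelExecutor
    # so that it actually starts in NCCL2 distributed mode
    exe = fluid.ParallelExecutor(
        use_cuda=True,
        loss_name=loss_var.name,
        num_trainers=len(trainer_endpoints.split(",")),
        trainer_id=trainer_id)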
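The sorted(list(...)) changes in memory_optimization_transpiler.py are about determinism: the reuse pool was previously iterated as a plain set, whose iteration order is not guaranteed to be stable, so two runs over the same graph could pick different cache variables. A small illustrative sketch of the failure mode and the fix, with made-up variable names standing in for the pool scan in ControlFlowGraph.

.. code-block:: python

    # iteration order of a set of names is not guaranteed to be stable, so a
    # "first candidate that fits" scan can return different variables on
    # different runs; sorting the candidates first makes the choice repeatable
    pool = {"fc_0.tmp_1", "conv2d_3.tmp_0", "relu_2.tmp_0"}

    def pick_first(candidates):
        # stand-in for the pool scan: return the first candidate that passes
        # the validity check (here every candidate passes)
        for name in candidates:
            return name
        return None

    unstable = pick_first(pool)          # depends on set ordering
    stable = pick_first(sorted(pool))    # always "conv2d_3.tmp_0"
    print(unstable, stable)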