提交 1560eb4a 编写于 作者: Z ZongwuYang

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into my-cool-stuff

......@@ -214,6 +214,7 @@ if (NOT WIN32)
# there is no official support of warpctc, nccl, cupti in windows
include(external/warpctc) # download, build, install warpctc
include(cupti)
include(external/gzstream)
endif (NOT WIN32)
if(WITH_DISTRIBUTE)
......
......@@ -43,6 +43,8 @@ RUN wget -q https://www.python.org/ftp/python/3.7.0/Python-3.7.0.tgz && \
CFLAGS="-Wformat" ./configure --prefix=/usr/local/ --enable-shared > /dev/null && \
make -j8 > /dev/null && make altinstall > /dev/null
RUN rm -r /root/python_build
RUN apt-get update && \
apt-get install -y --allow-downgrades patchelf \
python3 python3-dev python3-pip \
......
......@@ -199,8 +199,11 @@ elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel")
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE})
endif()
else(NOT WIN32)
list(APPEND CUDA_NVCC_FLAGS "--compiler-options;/bigobj")
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
list(APPEND CUDA_NVCC_FLAGS "-g -G")
list(APPEND CUDA_NVCC_FLAGS "-g -G")
# match the cl's _ITERATOR_DEBUG_LEVEL
list(APPEND CUDA_NVCC_FLAGS "-D_DEBUG")
elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
list(APPEND CUDA_NVCC_FLAGS "-O3 -DNDEBUG")
else()
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
IF(MOBILE_INFERENCE)
return()
ENDIF()
include (ExternalProject)
# NOTE: gzstream is needed when linking with ctr reader.
SET(GZSTREAM_SOURCES_DIR ${THIRD_PARTY_PATH}/gzstream)
SET(GZSTREAM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/gzstream)
SET(GZSTREAM_INCLUDE_DIR "${GZSTREAM_INSTALL_DIR}/include/" CACHE PATH "gzstream include directory." FORCE)
ExternalProject_Add(
extern_gzstream
DEPENDS zlib
GIT_REPOSITORY "https://github.com/jacquesqiao/gzstream.git"
GIT_TAG ""
PREFIX ${GZSTREAM_SOURCES_DIR}
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_IN_SOURCE 1
BUILD_COMMAND make EXTERN_CPPFLAGS="-I${THIRD_PARTY_PATH}/install/zlib/include" EXTERM_LDFLAGS="-L${THIRD_PARTY_PATH}/install/zlib/lib" -j8
INSTALL_COMMAND mkdir -p ${GZSTREAM_INSTALL_DIR}/lib/ && mkdir -p ${GZSTREAM_INSTALL_DIR}/include/
&& cp ${GZSTREAM_SOURCES_DIR}/src/extern_gzstream/libgzstream.a ${GZSTREAM_INSTALL_DIR}/lib
&& cp -r ${GZSTREAM_SOURCES_DIR}/src/extern_gzstream/gzstream.h ${GZSTREAM_INSTALL_DIR}/include
)
ADD_LIBRARY(gzstream STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET gzstream PROPERTY IMPORTED_LOCATION
"${GZSTREAM_INSTALL_DIR}/lib/libgzstream.a")
include_directories(${GZSTREAM_INCLUDE_DIR})
ADD_DEPENDENCIES(gzstream extern_gzstream zlib)
......@@ -32,6 +32,8 @@ IF(NOT ${WITH_NGRAPH})
return()
ENDIF()
INCLUDE(GNUInstallDirs)
INCLUDE(ExternalProject)
SET(NGRAPH_PROJECT "extern_ngraph")
......@@ -40,10 +42,14 @@ SET(NGRAPH_GIT_TAG "f9fd9d4cc318dc59dd4b68448e7fbb5f67a28bd0")
SET(NGRAPH_SOURCES_DIR ${THIRD_PARTY_PATH}/ngraph)
SET(NGRAPH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/ngraph)
SET(NGRAPH_INC_DIR ${NGRAPH_INSTALL_DIR}/include)
SET(NGRAPH_LIB_DIR ${NGRAPH_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR})
SET(NGRAPH_SHARED_LIB_NAME libngraph.so.${NGRAPH_VERSION})
SET(NGRAPH_CPU_LIB_NAME libcpu_backend.so)
SET(NGRAPH_TBB_LIB_NAME libtbb.so.2)
SET(NGRAPH_GIT_REPO "https://github.com/NervanaSystems/ngraph.git")
SET(NGRAPH_SHARED_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_SHARED_LIB_NAME})
SET(NGRAPH_CPU_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_CPU_LIB_NAME})
SET(NGRAPH_TBB_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_TBB_LIB_NAME})
ExternalProject_Add(
${NGRAPH_PROJECT}
......@@ -63,18 +69,6 @@ ExternalProject_Add(
CMAKE_ARGS -DMKLDNN_LIB_DIR=${MKLDNN_INSTALL_DIR}/lib
)
if(UNIX AND NOT APPLE)
include(GNUInstallDirs)
SET(NGRAPH_LIB_DIR ${NGRAPH_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR})
else()
SET(NGRAPH_LIB_DIR ${NGRAPH_INSTALL_DIR}/lib)
endif()
MESSAGE(STATUS "nGraph lib will be installed at: ${NGRAPH_LIB_DIR}")
SET(NGRAPH_SHARED_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_SHARED_LIB_NAME})
SET(NGRAPH_CPU_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_CPU_LIB_NAME})
SET(NGRAPH_TBB_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_TBB_LIB_NAME})
# Workaround for nGraph expecting mklml to be in mkldnn install directory.
ExternalProject_Add_Step(
${NGRAPH_PROJECT}
......
......@@ -26,7 +26,7 @@ ExternalProject_Add(
extern_pybind
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/pybind/pybind11.git"
GIT_TAG "v2.1.1"
GIT_TAG "v2.2.4"
PREFIX ${PYBIND_SOURCE_DIR}
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
......
......@@ -349,10 +349,17 @@ function(cc_test TARGET_NAME)
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS ARGS)
cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
if(WIN32)
list(APPEND win32_deps shlwapi)
if("${cc_test_DEPS};" MATCHES "python;")
list(REMOVE_ITEM cc_test_DEPS python)
list(APPEND win32_deps ${PYTHON_LIBRARIES})
endif()
endif(WIN32)
add_executable(${TARGET_NAME} ${cc_test_SRCS})
target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
if(WIN32)
target_link_libraries(${TARGET_NAME} shlwapi)
target_link_libraries(${TARGET_NAME} ${win32_deps})
endif(WIN32)
add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
add_test(NAME ${TARGET_NAME}
......@@ -683,7 +690,7 @@ function(py_test TARGET_NAME)
set(multiValueArgs SRCS DEPS ARGS ENVS)
cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_test(NAME ${TARGET_NAME}
COMMAND env FLAGS_init_allocated_mem=true FLAGS_cudnn_deterministic=true
COMMAND ${CMAKE_COMMAND} -E env FLAGS_init_allocated_mem=true FLAGS_cudnn_deterministic=true
FLAGS_cpu_deterministic=true
PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
......
......@@ -129,6 +129,15 @@ if (WITH_MKLDNN)
)
endif ()
if (WITH_NGRAPH)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/ngraph")
copy(ngraph_lib
SRCS ${NGRAPH_INC_DIR} ${NGRAPH_LIB_DIR}
DSTS ${dst_dir} ${dst_dir}
DEPS ngraph
)
endif ()
if (NOT WIN32)
if (NOT MOBILE_INFERENCE AND NOT RPI)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappy")
......@@ -186,8 +195,7 @@ set(module "inference")
copy(inference_lib DEPS ${inference_deps}
SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*
${src_dir}/${module}/api/paddle_*.h
${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}
DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}
)
set(module "platform")
......
......@@ -26,12 +26,19 @@ paddle.fluid.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], vara
paddle.fluid.DistributeTranspilerConfig.__init__
paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id', 'scope'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 1, 0, None))
paddle.fluid.ParallelExecutor.run ArgSpec(args=['self', 'fetch_list', 'feed', 'feed_dict', 'return_numpy'], varargs=None, keywords=None, defaults=(None, None, True))
paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ExecutionStrategy) -> None
paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.GradientScaleStrategy, arg0: int) -> None
paddle.fluid.BuildStrategy.ReduceStrategy.__init__ __init__(self: paddle.fluid.core.ReduceStrategy, arg0: int) -> None
paddle.fluid.BuildStrategy.__init__ __init__(self: paddle.fluid.core.BuildStrategy) -> None
paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.ExecutionStrategy) -> None
paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy.GradientScaleStrategy, arg0: int) -> None
paddle.fluid.BuildStrategy.ReduceStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy.ReduceStrategy, arg0: int) -> None
paddle.fluid.BuildStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy) -> None
paddle.fluid.create_lod_tensor ArgSpec(args=['data', 'recursive_seq_lens', 'place'], varargs=None, keywords=None, defaults=None)
paddle.fluid.create_random_int_lodtensor ArgSpec(args=['recursive_seq_lens', 'base_shape', 'place', 'low', 'high'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DataFeedDesc.__init__ ArgSpec(args=['self', 'proto_file'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DataFeedDesc.desc ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DataFeedDesc.set_batch_size ArgSpec(args=['self', 'batch_size'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DataFeedDesc.set_dense_slots ArgSpec(args=['self', 'dense_slots_name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DataFeedDesc.set_use_slots ArgSpec(args=['self', 'use_slots_name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.AsyncExecutor.__init__ ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.AsyncExecutor.run ArgSpec(args=['self', 'program', 'data_feed', 'filelist', 'thread_num', 'fetch', 'debug'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.io.save_vars ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.io.save_params ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.io.save_persistables ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None))
......@@ -69,7 +76,7 @@ paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'use_cudnn', 'name']
paddle.fluid.layers.softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(True, None))
paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True))
paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True))
paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False))
paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False))
paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
paddle.fluid.layers.conv3d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
......@@ -97,8 +104,8 @@ paddle.fluid.layers.warpctc ArgSpec(args=['input', 'label', 'blank', 'norm_by_ti
paddle.fluid.layers.sequence_reshape ArgSpec(args=['input', 'new_dim'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.transpose ArgSpec(args=['x', 'perm', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.im2sequence ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None))
paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name', 'sampler', 'custom_dist', 'seed'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 'uniform', None, 0))
paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name', 'sampler', 'custom_dist', 'seed', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 'uniform', None, 0, False))
paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name', 'path_table', 'path_code', 'is_custom', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, False, False))
paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'name'], varargs=None, keywords=None, defaults=(0, None))
paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None)
......@@ -175,7 +182,7 @@ paddle.fluid.layers.clip ArgSpec(args=['x', 'min', 'max', 'name'], varargs=None,
paddle.fluid.layers.clip_by_norm ArgSpec(args=['x', 'max_norm', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.mean ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.mul ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dims', 'name'], varargs=None, keywords=None, defaults=(1, 1, None))
paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'ignore_index', 'name'], varargs=None, keywords=None, defaults=(-100, None))
paddle.fluid.layers.maxout ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.space_to_depth ArgSpec(args=['x', 'blocksize', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.affine_grid ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,))
......@@ -187,6 +194,7 @@ paddle.fluid.layers.grid_sampler ArgSpec(args=['x', 'grid', 'name'], varargs=Non
paddle.fluid.layers.log_loss ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None))
paddle.fluid.layers.add_position_encoding ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.bilinear_tensor_product ArgSpec(args=['x', 'y', 'size', 'act', 'name', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1))
paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None)
......@@ -291,6 +299,7 @@ paddle.fluid.layers.generate_proposals ArgSpec(args=['scores', 'bbox_deltas', 'i
paddle.fluid.layers.iou_similarity ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.box_coder ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name'], varargs=None, keywords=None, defaults=('encode_center_size', True, None))
paddle.fluid.layers.polygon_box_transform ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.yolov3_loss ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'class_num', 'ignore_thresh', 'loss_weight_xy', 'loss_weight_wh', 'loss_weight_conf_target', 'loss_weight_conf_notarget', 'loss_weight_class', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None))
paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None))
paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk', 'slide_steps'], varargs=None, keywords=None, defaults=('ROC', 4095, 1, 1))
paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
......
......@@ -34,6 +34,7 @@ add_subdirectory(ir)
add_subdirectory(details)
# ddim lib
proto_library(framework_proto SRCS framework.proto)
proto_library(async_executor_proto SRCS data_feed.proto)
cc_library(ddim SRCS ddim.cc DEPS eigen3 boost)
cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
......@@ -116,14 +117,9 @@ cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context)
if (NOT WIN32)
cc_library(transfer_scope_cache SRCS transfer_scope_cache.cc DEPS scope framework_proto device_context)
cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog
shape_inference data_transform lod_tensor profiler transfer_scope_cache)
else()
cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog
shape_inference data_transform lod_tensor)
endif(NOT WIN32)
cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context)
......@@ -131,8 +127,9 @@ cc_library(version SRCS version.cc)
cc_test(version_test SRCS version_test.cc DEPS version)
cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version)
cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto)
if(NOT WIN32)
cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph)
cc_library(ngraph_operator SRCS ngraph_operator.cc DEPS ngraph_bridge operator op_info device_context tensor scope glog
shape_inference data_transform lod_tensor profiler)
endif(NOT WIN32)
......@@ -140,7 +137,7 @@ endif(NOT WIN32)
cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
py_proto_compile(framework_py_proto SRCS framework.proto)
py_proto_compile(framework_py_proto SRCS framework.proto data_feed.proto)
# Generate an empty __init__.py to make framework_py_proto as a valid python module.
add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
add_dependencies(framework_py_proto framework_py_proto_init)
......@@ -162,18 +159,19 @@ endif(NOT WIN32)
cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)
cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog)
cc_library(variable_helper SRCS variable_helper.cc DEPS lod_tensor)
cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass)
cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper)
if(WITH_DISTRIBUTE)
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass)
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass variable_helper)
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
else()
if(NOT WIN32)
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph_operator)
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph_operator variable_helper)
else(NOT WIN32)
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass)
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper)
endif(NOT WIN32)
cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
endif()
......@@ -181,8 +179,11 @@ endif()
cc_library(parallel_executor SRCS parallel_executor.cc DEPS
threaded_ssa_graph_executor scope_buffered_ssa_graph_executor
graph build_strategy
fast_threaded_ssa_graph_executor)
fast_threaded_ssa_graph_executor variable_helper)
cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper)
cc_test(data_feed_test SRCS data_feed_test.cc DEPS async_executor)
cc_library(prune SRCS prune.cc DEPS framework_proto)
cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/async_executor.h"
#include "google/protobuf/io/zero_copy_stream_impl.h"
#include "google/protobuf/message.h"
#include "google/protobuf/text_format.h"
#include "gflags/gflags.h"
#include "paddle/fluid/framework/data_feed_factory.h"
#include "paddle/fluid/framework/executor_thread_worker.h"
#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/lod_rank_table.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/inference/io.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/pybind/pybind.h"
namespace paddle {
namespace framework {
AsyncExecutor::AsyncExecutor(Scope* scope, const platform::Place& place)
: root_scope_(scope), place_(place) {}
void AsyncExecutor::CreateThreads(
ExecutorThreadWorker* worker, const ProgramDesc& main_program,
const std::shared_ptr<DataFeed>& reader,
const std::vector<std::string>& fetch_var_names, Scope* root_scope,
const int thread_index, const bool debug) {
worker->SetThreadId(thread_index);
worker->SetDebug(debug);
worker->SetRootScope(root_scope);
worker->CreateThreadResource(main_program, place_);
worker->SetDataFeed(reader);
worker->SetFetchVarNames(fetch_var_names);
worker->BindingDataFeedMemory();
}
void PrepareReaders(std::vector<std::shared_ptr<DataFeed>>& readers, // NOLINT
const int thread_num, const DataFeedDesc& data_feed_desc,
const std::vector<std::string>& filelist) {
readers.resize(thread_num);
for (size_t i = 0; i < readers.size(); ++i) {
readers[i] = DataFeedFactory::CreateDataFeed(data_feed_desc.name());
readers[i]->Init(data_feed_desc); // set batch_size and queue_size here
}
readers[0]->SetFileList(filelist);
}
void AsyncExecutor::RunFromFile(const ProgramDesc& main_program,
const std::string& data_feed_desc_str,
const std::vector<std::string>& filelist,
const int thread_num,
const std::vector<std::string>& fetch_var_names,
const bool debug) {
std::vector<std::thread> threads;
auto& block = main_program.Block(0);
for (auto var_name : fetch_var_names) {
auto var_desc = block.FindVar(var_name);
auto shapes = var_desc->GetShape();
PADDLE_ENFORCE(shapes[shapes.size() - 1] == 1,
"var %s: Fetched var has wrong shape, "
"only variables with the last dimension size 1 supported",
var_name);
}
DataFeedDesc data_feed_desc;
google::protobuf::TextFormat::ParseFromString(data_feed_desc_str,
&data_feed_desc);
int actual_thread_num = thread_num;
int file_cnt = filelist.size();
PADDLE_ENFORCE(file_cnt > 0, "File list cannot be empty");
if (actual_thread_num > file_cnt) {
VLOG(1) << "Thread num = " << thread_num << ", file num = " << file_cnt
<< ". Changing thread_num = " << file_cnt;
actual_thread_num = file_cnt;
}
/*
readerDesc: protobuf description for reader initlization
argument: class_name, batch_size, use_slot, queue_size, buffer_size,
padding_index
reader:
1) each thread has a reader, reader will read input data and
put it into input queue
2) each reader has a Next() iterface, that can fetch an instance
from the input queue
*/
// todo: should be factory method for creating datafeed
std::vector<std::shared_ptr<DataFeed>> readers;
PrepareReaders(readers, actual_thread_num, data_feed_desc, filelist);
std::vector<std::shared_ptr<ExecutorThreadWorker>> workers;
workers.resize(actual_thread_num);
for (auto& worker : workers) {
worker.reset(new ExecutorThreadWorker);
}
// prepare thread resource here
for (int thidx = 0; thidx < actual_thread_num; ++thidx) {
CreateThreads(workers[thidx].get(), main_program, readers[thidx],
fetch_var_names, root_scope_, thidx, debug);
}
// start executing ops in multiple threads
for (int thidx = 0; thidx < actual_thread_num; ++thidx) {
threads.push_back(
std::thread(&ExecutorThreadWorker::TrainFiles, workers[thidx].get()));
}
for (auto& th : threads) {
th.join();
}
root_scope_->DropKids();
return;
}
} // einit_modelnd namespace framework
} // end namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <map>
#include <memory>
#include <mutex> // NOLINT
#include <set>
#include <string>
#include <thread> // NOLINT
#include <typeinfo>
#include <vector>
#include "paddle/fluid/framework/data_feed.pb.h"
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/executor_thread_worker.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
namespace paddle {
namespace framework {
class AsyncExecutor {
public:
AsyncExecutor(Scope* scope, const platform::Place& place);
virtual ~AsyncExecutor() {}
void RunFromFile(const ProgramDesc& main_program,
const std::string& data_feed_desc_str,
const std::vector<std::string>& filelist,
const int thread_num,
const std::vector<std::string>& fetch_names,
const bool debug = false);
private:
void CreateThreads(ExecutorThreadWorker* worker,
const ProgramDesc& main_program,
const std::shared_ptr<DataFeed>& reader,
const std::vector<std::string>& fetch_var_names,
Scope* root_scope, const int thread_index,
const bool debug);
public:
Scope* root_scope_;
platform::Place place_;
};
} // namespace framework
} // namespace paddle
......@@ -18,8 +18,8 @@ namespace framework {
void TransDataDevice(const Tensor &in, const platform::Place &dst_place,
Tensor *out) {
VLOG(30) << "DeviceTransform in, src_place " << in.place()
<< " dst_place: " << dst_place;
VLOG(3) << "DeviceTransform in, src_place " << in.place()
<< " dst_place: " << dst_place;
PADDLE_ENFORCE_NE(
in.place().which(), dst_place.which(),
......
......@@ -49,10 +49,10 @@ class TestOpWithKernel : public OperatorWithKernel {
OpKernelType GetExpectedKernelType(
const ExecutionContext& ctx) const override {
if (Attr<bool>("use_gpu")) {
VLOG(30) << "force use gpu kernel";
VLOG(3) << "force use gpu kernel";
return OpKernelType(proto::VarType::FP32, platform::CUDAPlace(0));
} else {
VLOG(30) << "use default kernel";
VLOG(3) << "use default kernel";
return OpKernelType(proto::VarType::FP32,
ctx.Input<Tensor>("input")->place());
}
......@@ -148,7 +148,7 @@ TEST(Operator, CPUtoGPU) {
// get output
auto* output2 = scope.Var("OUT2");
gpu_op->Run(scope, cuda_place);
VLOG(30) << "after gpu_op run";
VLOG(3) << "after gpu_op run";
// auto* output2_ptr = output2->Get<LoDTensor>().data<float>();
paddle::platform::DeviceContextPool& pool =
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "google/protobuf/io/zero_copy_stream_impl.h"
#include "google/protobuf/message.h"
#include "google/protobuf/text_format.h"
#include "gflags/gflags.h"
#include "paddle/fluid/framework/data_feed.h"
#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/feed_fetch_type.h"
namespace paddle {
namespace framework {
std::vector<std::string> DataFeed::filelist_;
size_t DataFeed::file_idx_;
std::mutex DataFeed::mutex_for_pick_file_;
bool DataFeed::finish_set_filelist_;
void DataFeed::AddFeedVar(Variable* var, const std::string& name) {
CheckInit();
for (size_t i = 0; i < use_slots_.size(); ++i) {
if (name == use_slots_[i]) {
if (use_slots_is_dense_[i]) {
feed_vec_[i] = MixTensor(var->GetMutable<Tensor>());
} else {
feed_vec_[i] = MixTensor(var->GetMutable<LoDTensor>());
}
}
}
}
bool DataFeed::SetFileList(const std::vector<std::string>& files) {
std::unique_lock<std::mutex> lock(mutex_for_pick_file_);
CheckInit();
if (finish_set_filelist_) {
VLOG(3) << "info: you have set the filelist.";
return false;
}
PADDLE_ENFORCE(files.size(), "You have set an empty filelist.");
filelist_.assign(files.begin(), files.end());
file_idx_ = 0;
finish_set_filelist_ = true;
return true;
}
void DataFeed::SetBatchSize(int batch_size) {
PADDLE_ENFORCE(batch_size > 0, "Illegal batch size: %d.", batch_size);
default_batch_size_ = batch_size;
}
bool DataFeed::PickOneFile(std::string* filename) {
std::unique_lock<std::mutex> lock(mutex_for_pick_file_);
if (file_idx_ == filelist_.size()) {
return false;
}
*filename = filelist_[file_idx_++];
return true;
}
void DataFeed::CheckInit() {
PADDLE_ENFORCE(finish_init_, "Initialization did not succeed.");
}
void DataFeed::CheckSetFileList() {
PADDLE_ENFORCE(finish_set_filelist_, "Set filelist did not succeed.");
}
void DataFeed::CheckStart() {
PADDLE_ENFORCE(finish_start_, "Datafeed has not started running yet.");
}
template <typename T>
void PrivateQueueDataFeed<T>::SetQueueSize(int queue_size) {
PADDLE_ENFORCE(queue_size > 0, "Illegal queue size: %d.", queue_size);
queue_size_ = queue_size;
queue_ = std::unique_ptr<paddle::operators::reader::BlockingQueue<T>>(
new paddle::operators::reader::BlockingQueue<T>(queue_size_));
}
template <typename T>
bool PrivateQueueDataFeed<T>::Start() {
CheckSetFileList();
read_thread_ = std::thread(&PrivateQueueDataFeed::ReadThread, this);
read_thread_.detach();
finish_start_ = true;
return true;
}
template <typename T>
void PrivateQueueDataFeed<T>::ReadThread() {
std::string filename;
while (PickOneFile(&filename)) {
file_.open(filename.c_str()); // is_text_feed
PADDLE_ENFORCE(file_.good(), "Open file<%s> fail.", filename.c_str());
T instance;
while (ParseOneInstance(&instance)) {
queue_->Send(instance);
}
file_.close();
}
queue_->Close();
}
template <typename T>
int PrivateQueueDataFeed<T>::Next() {
CheckStart();
int index = 0;
T instance;
T ins_vec;
while (index < default_batch_size_) {
if (!queue_->Receive(&instance)) {
break;
}
AddInstanceToInsVec(&ins_vec, instance, index++);
}
batch_size_ = index;
if (batch_size_ != 0) {
PutToFeedVec(ins_vec);
}
return batch_size_;
}
#ifdef _WIN32
template class PrivateQueueDataFeed<std::vector<MultiSlotType>>;
#endif
void MultiSlotDataFeed::Init(
const paddle::framework::DataFeedDesc& data_feed_desc) {
finish_init_ = false;
finish_set_filelist_ = false;
finish_start_ = false;
PADDLE_ENFORCE(data_feed_desc.has_multi_slot_desc(),
"Multi_slot_desc has not been set.");
paddle::framework::MultiSlotDesc multi_slot_desc =
data_feed_desc.multi_slot_desc();
SetBatchSize(data_feed_desc.batch_size());
SetQueueSize(data_feed_desc.batch_size());
size_t all_slot_num = multi_slot_desc.slots_size();
all_slots_.resize(all_slot_num);
all_slots_type_.resize(all_slot_num);
use_slots_index_.resize(all_slot_num);
use_slots_.clear();
use_slots_is_dense_.clear();
for (size_t i = 0; i < all_slot_num; ++i) {
const auto& slot = multi_slot_desc.slots(i);
all_slots_[i] = slot.name();
all_slots_type_[i] = slot.type();
use_slots_index_[i] = slot.is_used() ? use_slots_.size() : -1;
if (slot.is_used()) {
use_slots_.push_back(all_slots_[i]);
use_slots_is_dense_.push_back(slot.is_dense());
}
}
feed_vec_.resize(use_slots_.size());
finish_init_ = true;
}
bool MultiSlotDataFeed::CheckFile(const char* filename) {
CheckInit(); // get info of slots
std::ifstream fin(filename);
if (!fin.good()) {
VLOG(1) << "error: open file<" << filename << "> fail";
return false;
}
std::string line;
int instance_cout = 0;
std::string all_slots_alias = "";
for (const auto& alias : all_slots_) {
all_slots_alias += alias + " ";
}
std::string use_slots_alias = "";
for (const auto& alias : use_slots_) {
use_slots_alias += alias + " ";
}
VLOG(3) << "total slots num: " << all_slots_.size();
VLOG(3) << "total slots alias: " << all_slots_alias;
VLOG(3) << "used slots num: " << use_slots_.size();
VLOG(3) << "used slots alias: " << use_slots_alias;
while (getline(fin, line)) {
++instance_cout;
const char* str = line.c_str();
char* endptr = const_cast<char*>(str);
int len = line.length();
for (size_t i = 0; i < all_slots_.size(); ++i) {
int num = strtol(endptr, &endptr, 10);
if (num < 0) {
VLOG(0) << "error: the number of ids is a negative number: " << num;
VLOG(0) << "please check line<" << instance_cout << "> in file<"
<< filename << ">";
return false;
} else if (num == 0) {
VLOG(0)
<< "error: the number of ids can not be zero, you need "
"padding it in data generator; or if there is something wrong"
" with the data, please check if the data contains unresolvable "
"characters.";
VLOG(0) << "please check line<" << instance_cout << "> in file<"
<< filename << ">";
return false;
} else if (errno == ERANGE || num > INT_MAX) {
VLOG(0) << "error: the number of ids greater than INT_MAX";
VLOG(0) << "please check line<" << instance_cout << "> in file<"
<< filename << ">";
return false;
}
if (all_slots_type_[i] == "float") {
for (int i = 0; i < num; ++i) {
strtof(endptr, &endptr);
if (errno == ERANGE) {
VLOG(0) << "error: the value is out of the range of "
"representable values for float";
VLOG(0) << "please check line<" << instance_cout << "> in file<"
<< filename << ">";
return false;
}
if (i + 1 != num && endptr - str == len) {
VLOG(0) << "error: there is a wrong with the number of ids.";
VLOG(0) << "please check line<" << instance_cout << "> in file<"
<< filename << ">";
return false;
}
}
} else if (all_slots_type_[i] == "uint64") {
for (int i = 0; i < num; ++i) {
strtoull(endptr, &endptr, 10);
if (errno == ERANGE) {
VLOG(0) << "error: the value is out of the range of "
"representable values for uint64_t";
VLOG(0) << "please check line<" << instance_cout << "> in file<"
<< filename << ">";
return false;
}
if (i + 1 != num && endptr - str == len) {
VLOG(0) << "error: there is a wrong with the number of ids.";
VLOG(0) << "please check line<" << instance_cout << "> in file<"
<< filename << ">";
return false;
}
}
} else {
VLOG(0) << "error: this type<" << all_slots_type_[i]
<< "> is not supported";
return false;
}
}
// It may be added '\t' character to the end of the output of reduce
// task when processes data by Hadoop(when the output of the reduce
// task of Hadoop has only one field, it will add a '\t' at the end
// of the line by default, and you can use this option to avoid it:
// `-D mapred.textoutputformat.ignoreseparator=true`), which does
// not affect the correctness of the data. Therefore, it should be
// judged that the data is not normal when the end of each line of
// data contains characters which are not spaces.
while (endptr - str != len) {
if (!isspace(*(endptr++))) {
VLOG(0)
<< "error: there is some extra characters at the end of the line.";
VLOG(0) << "please check line<" << instance_cout << "> in file<"
<< filename << ">";
return false;
}
}
}
VLOG(3) << "instances cout: " << instance_cout;
VLOG(3) << "The file format is correct";
return true;
}
bool MultiSlotDataFeed::ParseOneInstance(std::vector<MultiSlotType>* instance) {
std::string line;
if (getline(file_, line)) {
int use_slots_num = use_slots_.size();
instance->resize(use_slots_num);
// parse line
const char* str = line.c_str();
char* endptr = const_cast<char*>(str);
int pos = 0;
for (size_t i = 0; i < use_slots_index_.size(); ++i) {
int idx = use_slots_index_[i];
int num = strtol(&str[pos], &endptr, 10);
PADDLE_ENFORCE(
num,
"The number of ids can not be zero, you need padding "
"it in data generator; or if there is something wrong with "
"the data, please check if the data contains unresolvable "
"characters.\nplease check this error line: %s",
str);
if (idx != -1) {
(*instance)[idx].Init(all_slots_type_[i]);
if ((*instance)[idx].GetType()[0] == 'f') { // float
for (int j = 0; j < num; ++j) {
float feasign = strtof(endptr, &endptr);
(*instance)[idx].AddValue(feasign);
}
} else if ((*instance)[idx].GetType()[0] == 'u') { // uint64
for (int j = 0; j < num; ++j) {
uint64_t feasign = (uint64_t)strtoull(endptr, &endptr, 10);
(*instance)[idx].AddValue(feasign);
}
}
pos = endptr - str;
} else {
for (int j = 0; j <= num; ++j) {
pos = line.find_first_of(' ', pos + 1);
}
}
}
} else {
return false;
}
return true;
}
void MultiSlotDataFeed::AddInstanceToInsVec(
std::vector<MultiSlotType>* ins_vec,
const std::vector<MultiSlotType>& instance, int index) {
if (index == 0) {
ins_vec->resize(instance.size());
for (size_t i = 0; i < instance.size(); ++i) {
(*ins_vec)[i].Init(instance[i].GetType());
(*ins_vec)[i].InitOffset();
}
}
for (size_t i = 0; i < instance.size(); ++i) {
(*ins_vec)[i].AddIns(instance[i]);
}
}
void MultiSlotDataFeed::PutToFeedVec(
const std::vector<MultiSlotType>& ins_vec) {
for (size_t i = 0; i < use_slots_.size(); ++i) {
const auto& type = ins_vec[i].GetType();
const auto& offset = ins_vec[i].GetOffset();
int total_instance = static_cast<int>(offset.back());
if (type[0] == 'f') { // float
const auto& feasign = ins_vec[i].GetFloatData();
if (feed_vec_[i].IsDense()) {
int size_in_each_batch = total_instance / batch_size_;
float* tensor_ptr = feed_vec_[i].GetTensor()->mutable_data<float>(
{batch_size_, size_in_each_batch}, platform::CPUPlace());
memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(float));
} else {
float* tensor_ptr = feed_vec_[i].GetLoDTensor()->mutable_data<float>(
{total_instance, 1}, platform::CPUPlace());
memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(float));
LoD data_lod{offset};
feed_vec_[i].GetLoDTensor()->set_lod(data_lod);
}
} else if (type[0] == 'u') { // uint64
// no uint64_t type in paddlepaddle
const auto& feasign = ins_vec[i].GetUint64Data();
if (feed_vec_[i].IsDense()) {
int size_in_each_batch = total_instance / batch_size_;
int64_t* tensor_ptr = feed_vec_[i].GetTensor()->mutable_data<int64_t>(
{batch_size_, size_in_each_batch}, platform::CPUPlace());
memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(int64_t));
} else {
int64_t* tensor_ptr =
feed_vec_[i].GetLoDTensor()->mutable_data<int64_t>(
{total_instance, 1}, platform::CPUPlace());
memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(int64_t));
LoD data_lod{offset};
feed_vec_[i].GetLoDTensor()->set_lod(data_lod);
}
}
}
}
} // namespace framework
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <fstream>
#include <memory>
#include <mutex> // NOLINT
#include <string>
#include <thread> // NOLINT
#include <vector>
#include "paddle/fluid/framework/data_feed.pb.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/operators/reader/blocking_queue.h"
namespace paddle {
namespace framework {
// Pack Tensor type and LoDTensor type into MixTensor type, in order
// to record either Tensor or LoDTensor information at the same time.
class MixTensor {
public:
MixTensor() {}
explicit MixTensor(LoDTensor* lodtensor) {
is_dense_ = false;
lodtensor_ = lodtensor;
}
explicit MixTensor(Tensor* tensor) {
is_dense_ = true;
tensor_ = tensor;
}
bool IsDense() { return is_dense_; }
LoDTensor* GetLoDTensor() {
PADDLE_ENFORCE(!is_dense_, "Let a dense var return a LoDTensor ptr.");
return lodtensor_;
}
Tensor* GetTensor() {
PADDLE_ENFORCE(is_dense_, "Let a sparse var return a Tensor ptr.");
return tensor_;
}
private:
bool is_dense_;
LoDTensor* lodtensor_;
Tensor* tensor_;
};
// DataFeed is the base virtual class for all ohther DataFeeds.
// It is used to read files and parse the data for subsequent trainer.
// Example:
// DataFeed* reader =
// paddle::framework::DataFeedFactory::CreateDataFeed(data_feed_name);
// reader->Init(data_feed_desc); // data_feed_desc is a protobuf object
// reader->SetFileList(filelist);
// const std::vector<std::string> & use_slot_alias =
// reader->GetUseSlotAlias();
// for (auto name: use_slot_alias){ // for binding memory
// reader->AddFeedVar(scope->Var(name), name);
// }
// reader->Start();
// while (reader->Next()) {
// // trainer do something
// }
class DataFeed {
public:
DataFeed() {}
virtual ~DataFeed() {}
virtual void Init(const paddle::framework::DataFeedDesc& data_feed_desc) = 0;
virtual bool CheckFile(const char* filename) {
PADDLE_THROW("This function(CheckFile) is not implemented.");
}
// Set filelist for DataFeed.
// Pay attention that it must init all readers before call this function.
// Otherwise, Init() function will init finish_set_filelist_ flag.
virtual bool SetFileList(const std::vector<std::string>& files);
virtual bool Start() = 0;
// The trainer calls the Next() function, and the DataFeed will load a new
// batch to the feed_vec. The return value of this function is the batch
// size of the current batch.
virtual int Next() = 0;
// Get all slots' alias which defined in protofile
virtual const std::vector<std::string>& GetAllSlotAlias() {
return all_slots_;
}
// Get used slots' alias which defined in protofile
virtual const std::vector<std::string>& GetUseSlotAlias() {
return use_slots_;
}
// This function is used for binding feed_vec memory
virtual void AddFeedVar(Variable* var, const std::string& name);
protected:
// The following three functions are used to check if it is executed in this
// order:
// Init() -> SetFileList() -> Start() -> Next()
virtual void CheckInit();
virtual void CheckSetFileList();
virtual void CheckStart();
virtual void SetBatchSize(
int batch); // batch size will be set in Init() function
// This function is used to pick one file from the global filelist(thread
// safe).
virtual bool PickOneFile(std::string* filename);
static std::vector<std::string> filelist_;
static size_t file_idx_;
static std::mutex mutex_for_pick_file_;
// the alias of used slots, and its order is determined by
// data_feed_desc(proto object)
std::vector<std::string> use_slots_;
std::vector<bool> use_slots_is_dense_;
// the alias of all slots, and its order is determined by data_feed_desc(proto
// object)
std::vector<std::string> all_slots_;
std::vector<std::string> all_slots_type_;
std::vector<int>
use_slots_index_; // -1: not used; >=0: the index of use_slots_
// The data read by DataFeed will be stored here
std::vector<MixTensor> feed_vec_;
// the batch size defined by user
int default_batch_size_;
// current batch size
int batch_size_;
bool finish_init_;
static bool finish_set_filelist_;
bool finish_start_;
};
// PrivateQueueDataFeed is the base virtual class for ohther DataFeeds.
// It use a read-thread to read file and parse data to a private-queue
// (thread level), and get data from this queue when trainer call Next().
template <typename T>
class PrivateQueueDataFeed : public DataFeed {
public:
PrivateQueueDataFeed() {}
virtual ~PrivateQueueDataFeed() {}
virtual void Init(const paddle::framework::DataFeedDesc& data_feed_desc) = 0;
virtual bool Start();
virtual int Next();
protected:
// The thread implementation function for reading file and parse.
virtual void ReadThread();
// This function is used to set private-queue size, and the most
// efficient when the queue size is close to the batch size.
virtual void SetQueueSize(int queue_size);
// The reading and parsing method called in the ReadThread.
virtual bool ParseOneInstance(T* instance) = 0;
// This function is used to put instance to vec_ins
virtual void AddInstanceToInsVec(T* vec_ins, const T& instance,
int index) = 0;
// This function is used to put ins_vec to feed_vec
virtual void PutToFeedVec(const T& ins_vec) = 0;
// The thread for read files
std::thread read_thread_;
// using ifstream one line and one line parse is faster
// than using fread one buffer and one buffer parse.
// for a 601M real data:
// ifstream one line and one line parse: 6034 ms
// fread one buffer and one buffer parse: 7097 ms
std::ifstream file_;
size_t queue_size_;
// The queue for store parsed data
std::unique_ptr<paddle::operators::reader::BlockingQueue<T>> queue_;
};
// This class define the data type of instance(ins_vec) in MultiSlotDataFeed
class MultiSlotType {
public:
MultiSlotType() {}
~MultiSlotType() {}
void Init(const std::string& type) {
CheckType(type);
if (type_[0] == 'f') {
float_feasign_.clear();
} else if (type_[0] == 'u') {
uint64_feasign_.clear();
}
type_ = type;
}
void InitOffset() {
offset_.resize(1);
// LoDTensor' lod is counted from 0, the size of lod
// is one size larger than the size of data.
offset_[0] = 0;
}
const std::vector<size_t>& GetOffset() const { return offset_; }
void AddValue(const float v) {
CheckFloat();
float_feasign_.push_back(v);
}
void AddValue(const uint64_t v) {
CheckUint64();
uint64_feasign_.push_back(v);
}
void AddIns(const MultiSlotType& ins) {
if (ins.GetType()[0] == 'f') { // float
CheckFloat();
auto& vec = ins.GetFloatData();
offset_.push_back(offset_.back() + vec.size());
float_feasign_.insert(float_feasign_.end(), vec.begin(), vec.end());
} else if (ins.GetType()[0] == 'u') { // uint64
CheckUint64();
auto& vec = ins.GetUint64Data();
offset_.push_back(offset_.back() + vec.size());
uint64_feasign_.insert(uint64_feasign_.end(), vec.begin(), vec.end());
}
}
const std::vector<float>& GetFloatData() const { return float_feasign_; }
const std::vector<uint64_t>& GetUint64Data() const { return uint64_feasign_; }
const std::string& GetType() const { return type_; }
private:
void CheckType(const std::string& type) const {
PADDLE_ENFORCE((type == "uint64") || (type == "float"),
"There is no this type<%s>.", type);
}
void CheckFloat() const {
PADDLE_ENFORCE(type_[0] == 'f', "Add %s value to float slot.", type_);
}
void CheckUint64() const {
PADDLE_ENFORCE(type_[0] == 'u', "Add %s value to uint64 slot.", type_);
}
std::vector<float> float_feasign_;
std::vector<uint64_t> uint64_feasign_;
std::string type_;
std::vector<size_t> offset_;
};
// This DataFeed is used to feed multi-slot type data.
// The format of multi-slot type data:
// [n feasign_0 feasign_1 ... feasign_n]*
class MultiSlotDataFeed
: public PrivateQueueDataFeed<std::vector<MultiSlotType>> {
public:
MultiSlotDataFeed() {}
virtual ~MultiSlotDataFeed() {}
virtual void Init(const paddle::framework::DataFeedDesc& data_feed_desc);
virtual bool CheckFile(const char* filename);
protected:
virtual void AddInstanceToInsVec(std::vector<MultiSlotType>* vec_ins,
const std::vector<MultiSlotType>& instance,
int index);
virtual bool ParseOneInstance(std::vector<MultiSlotType>* instance);
virtual void PutToFeedVec(const std::vector<MultiSlotType>& ins_vec);
};
} // namespace framework
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
syntax = "proto2";
package paddle.framework;
message Slot {
required string name = 1;
required string type = 2;
optional bool is_dense = 3 [ default = false ];
optional bool is_used = 4 [ default = false ];
}
message MultiSlotDesc { repeated Slot slots = 1; }
message DataFeedDesc {
optional string name = 1;
optional int32 batch_size = 2 [ default = 32 ];
optional MultiSlotDesc multi_slot_desc = 3;
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/data_feed_factory.h"
#include <memory>
#include <string>
#include <unordered_map>
#include "paddle/fluid/framework/data_feed.h"
namespace paddle {
namespace framework {
typedef std::shared_ptr<DataFeed> (*Createdata_feedFunction)();
typedef std::unordered_map<std::string, Createdata_feedFunction> data_feedMap;
data_feedMap g_data_feed_map;
#define REGISTER_DATAFEED_CLASS(data_feed_class) \
namespace { \
std::shared_ptr<DataFeed> Creator_##data_feed_class() { \
return std::shared_ptr<DataFeed>(new data_feed_class); \
} \
class __Registerer_##data_feed_class { \
public: \
__Registerer_##data_feed_class() { \
g_data_feed_map[#data_feed_class] = &Creator_##data_feed_class; \
} \
}; \
__Registerer_##data_feed_class g_registerer_##data_feed_class; \
} // namespace
std::string DataFeedFactory::DataFeedTypeList() {
std::string data_feed_types;
for (auto iter = g_data_feed_map.begin(); iter != g_data_feed_map.end();
++iter) {
if (iter != g_data_feed_map.begin()) {
data_feed_types += ", ";
}
data_feed_types += iter->first;
}
return data_feed_types;
}
std::shared_ptr<DataFeed> DataFeedFactory::CreateDataFeed(
std::string data_feed_class) {
if (g_data_feed_map.count(data_feed_class) < 1) {
exit(-1);
}
return g_data_feed_map[data_feed_class]();
}
REGISTER_DATAFEED_CLASS(MultiSlotDataFeed);
} // namespace framework
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include "paddle/fluid/framework/data_feed.h"
namespace paddle {
namespace framework {
class DataFeedFactory {
public:
static std::string DataFeedTypeList();
static std::shared_ptr<DataFeed> CreateDataFeed(std::string data_feed_class);
};
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/data_feed.h"
#include <fcntl.h>
#include <chrono> // NOLINT
#include <fstream>
#include <iostream>
#include <map>
#include <mutex> // NOLINT
#include <set>
#include <thread> // NOLINT
#include <utility>
#include <vector>
#include "google/protobuf/io/zero_copy_stream_impl.h"
#include "google/protobuf/text_format.h"
#include "gtest/gtest.h"
#include "paddle/fluid/framework/data_feed_factory.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
paddle::framework::DataFeedDesc load_datafeed_param_from_file(
const char* filename) {
paddle::framework::DataFeedDesc data_feed_desc;
int file_descriptor = open(filename, O_RDONLY);
PADDLE_ENFORCE(file_descriptor != -1, "Can not open %s.", filename);
google::protobuf::io::FileInputStream fileInput(file_descriptor);
google::protobuf::TextFormat::Parse(&fileInput, &data_feed_desc);
close(file_descriptor);
return data_feed_desc;
}
const std::vector<std::string> load_filelist_from_file(const char* filename) {
std::vector<std::string> filelist;
std::ifstream fin(filename);
PADDLE_ENFORCE(fin.good(), "Can not open %s.", filename);
std::string line;
while (getline(fin, line)) {
filelist.push_back(line);
}
fin.close();
return filelist;
}
void GenerateFileForTest(const char* protofile, const char* filelist) {
std::ofstream w_protofile(protofile);
w_protofile << "name: \"MultiSlotDataFeed\"\n"
"batch_size: 2\n"
"multi_slot_desc {\n"
" slots {\n"
" name: \"uint64_sparse_slot\"\n"
" type: \"uint64\"\n"
" is_dense: false\n"
" is_used: true\n"
" }\n"
" slots {\n"
" name: \"float_sparse_slot\"\n"
" type: \"float\"\n"
" is_dense: false\n"
" is_used: true\n"
" }\n"
" slots {\n"
" name: \"uint64_dense_slot\"\n"
" type: \"uint64\"\n"
" is_dense: true\n"
" is_used: true\n"
" }\n"
" slots {\n"
" name: \"float_dense_slot\"\n"
" type: \"float\"\n"
" is_dense: true\n"
" is_used: true\n"
" }\n"
" slots {\n"
" name: \"not_used_slot\"\n"
" type: \"uint64\"\n"
" is_dense: false\n"
" is_used: false\n"
" }\n"
"}";
w_protofile.close();
std::ofstream w_filelist(filelist);
int total_file = 4;
for (int i = 0; i < total_file; ++i) {
std::string filename = "TestMultiSlotDataFeed.data." + std::to_string(i);
w_filelist << filename;
if (i + 1 != total_file) {
w_filelist << std::endl;
}
std::ofstream w_datafile(filename.c_str());
w_datafile << "3 3978 620 82 1 1926.08 1 1926 1 6.02 1 1996\n"
"2 1300 2983353 1 985.211 1 8 1 0.618 1 12\n"
"1 19260827 2 3.14 2.718 1 27 1 2.236 1 28\n";
w_datafile.close();
}
w_filelist.close();
}
class MultiTypeSet {
public:
MultiTypeSet() {
uint64_set_.clear();
float_set_.clear();
}
~MultiTypeSet() {}
void AddValue(uint64_t v) { uint64_set_.insert(v); }
void AddValue(float v) { float_set_.insert(v); }
const std::set<uint64_t>& GetUint64Set() const { return uint64_set_; }
const std::set<float>& GetFloatSet() const { return float_set_; }
private:
std::set<uint64_t> uint64_set_;
std::set<float> float_set_;
};
void GetElemSetFromReader(std::vector<MultiTypeSet>* reader_elem_set,
const paddle::framework::DataFeedDesc& data_feed_desc,
const std::vector<std::string>& filelist,
const int thread_num) {
int used_slot_num = 0;
for (auto i = 0; i < data_feed_desc.multi_slot_desc().slots_size(); ++i) {
if (data_feed_desc.multi_slot_desc().slots(i).is_used()) {
++used_slot_num;
}
}
reader_elem_set->resize(used_slot_num);
std::vector<std::thread> threads;
std::vector<std::shared_ptr<paddle::framework::DataFeed>> readers;
readers.resize(thread_num);
for (int i = 0; i < thread_num; ++i) {
readers[i] = paddle::framework::DataFeedFactory::CreateDataFeed(
data_feed_desc.name());
readers[i]->Init(data_feed_desc);
}
readers[0]->SetFileList(filelist);
std::mutex mu;
for (int idx = 0; idx < thread_num; ++idx) {
threads.emplace_back(std::thread([&, idx] {
std::unique_ptr<paddle::framework::Scope> scope(
new paddle::framework::Scope());
const auto& multi_slot_desc = data_feed_desc.multi_slot_desc();
std::map<std::string, const paddle::framework::LoDTensor*>
lodtensor_targets;
std::map<std::string, const paddle::framework::Tensor*> tensor_targets;
for (int i = 0; i < multi_slot_desc.slots_size(); ++i) {
const auto& slot = multi_slot_desc.slots(i);
if (slot.is_used()) {
const auto& name = slot.name();
readers[idx]->AddFeedVar(scope->Var(name), name);
if (slot.is_dense()) {
tensor_targets[name] =
&scope->FindVar(name)->Get<paddle::framework::Tensor>();
} else {
lodtensor_targets[name] =
&scope->FindVar(name)->Get<paddle::framework::LoDTensor>();
}
}
}
readers[idx]->Start();
while (readers[idx]->Next()) {
int index = 0;
for (int k = 0; k < multi_slot_desc.slots_size(); ++k) {
const auto& slot = multi_slot_desc.slots(k);
if (!slot.is_used()) {
continue;
}
if (slot.is_dense()) { // dense branch
const paddle::framework::Tensor* tens = tensor_targets[slot.name()];
if (slot.type() == "uint64") {
const int64_t* data = tens->data<int64_t>();
int batch_size = tens->dims()[0];
int dim = tens->dims()[1];
for (int i = 0; i < batch_size; ++i) {
for (int j = 0; j < dim; ++j) {
std::lock_guard<std::mutex> lock(mu);
(*reader_elem_set)[index].AddValue(
(uint64_t)data[i * dim + j]);
}
}
} else if (slot.type() == "float") {
const float* data = tens->data<float>();
int batch_size = tens->dims()[0];
int dim = tens->dims()[1];
for (int i = 0; i < batch_size; ++i) {
for (int j = 0; j < dim; ++j) {
std::lock_guard<std::mutex> lock(mu);
(*reader_elem_set)[index].AddValue(data[i * dim + j]);
}
}
} else {
PADDLE_THROW("Error type in proto file.");
}
} else { // sparse branch
const paddle::framework::LoDTensor* tens =
lodtensor_targets[slot.name()];
if (slot.type() == "uint64") {
const int64_t* data = tens->data<int64_t>();
for (size_t i = 0; i < tens->NumElements(); ++i) {
std::pair<size_t, size_t> element = tens->lod_element(0, i);
for (size_t j = element.first; j < element.second; ++j) {
std::lock_guard<std::mutex> lock(mu);
(*reader_elem_set)[index].AddValue((uint64_t)data[j]);
}
}
} else if (slot.type() == "float") {
const float* data = tens->data<float>();
for (size_t i = 0; i < tens->NumElements(); ++i) {
std::pair<size_t, size_t> element = tens->lod_element(0, i);
for (size_t j = element.first; j < element.second; ++j) {
std::lock_guard<std::mutex> lock(mu);
(*reader_elem_set)[index].AddValue(data[j]);
}
}
} else {
PADDLE_THROW("Error type in proto file.");
}
} // end sparse branch
++index;
} // end slots loop
} // end while Next()
})); // end anonymous function
}
for (auto& th : threads) {
th.join();
}
}
void CheckIsUnorderedSame(const std::vector<MultiTypeSet>& s1,
const std::vector<MultiTypeSet>& s2) {
EXPECT_EQ(s1.size(), s2.size());
for (size_t i = 0; i < s1.size(); ++i) {
// check for uint64
const std::set<uint64_t>& uint64_s1 = s1[i].GetUint64Set();
const std::set<uint64_t>& uint64_s2 = s2[i].GetUint64Set();
EXPECT_EQ(uint64_s1.size(), uint64_s2.size());
auto uint64_it1 = uint64_s1.begin();
auto uint64_it2 = uint64_s2.begin();
while (uint64_it1 != uint64_s1.end()) {
EXPECT_EQ(*uint64_it1, *uint64_it2);
++uint64_it1;
++uint64_it2;
}
// check for float
const std::set<float>& float_s1 = s1[i].GetFloatSet();
const std::set<float>& float_s2 = s2[i].GetFloatSet();
EXPECT_EQ(float_s1.size(), float_s2.size());
auto float_it1 = float_s1.begin();
auto float_it2 = float_s2.begin();
while (float_it1 != float_s1.end()) {
EXPECT_EQ(*float_it1, *float_it2);
++float_it1;
++float_it2;
}
}
}
void GetElemSetFromFile(std::vector<MultiTypeSet>* file_elem_set,
const paddle::framework::DataFeedDesc& data_feed_desc,
const std::vector<std::string>& filelist) {
int used_slot_num = 0;
for (auto i = 0; i < data_feed_desc.multi_slot_desc().slots_size(); ++i) {
if (data_feed_desc.multi_slot_desc().slots(i).is_used()) {
++used_slot_num;
}
}
file_elem_set->resize(used_slot_num);
for (const auto& file : filelist) {
std::ifstream fin(file.c_str());
PADDLE_ENFORCE(fin.good(), "Can not open %s.", file.c_str());
while (1) {
bool end_flag = false;
int index = 0;
for (auto i = 0; i < data_feed_desc.multi_slot_desc().slots_size(); ++i) {
int num;
if (fin >> num) {
auto slot = data_feed_desc.multi_slot_desc().slots(i);
auto type = slot.type();
if (type == "uint64") {
while (num--) {
uint64_t feasign;
fin >> feasign;
if (slot.is_used()) {
(*file_elem_set)[index].AddValue(feasign);
}
}
} else if (type == "float") {
while (num--) {
float feasign;
fin >> feasign;
if (slot.is_used()) {
(*file_elem_set)[index].AddValue(feasign);
}
}
} else {
PADDLE_THROW("Error type in proto file.");
}
if (slot.is_used()) {
++index;
}
} else {
end_flag = true;
break;
}
}
if (end_flag) {
break;
}
}
fin.close();
}
}
TEST(DataFeed, MultiSlotUnitTest) {
const char* protofile = "data_feed_desc.prototxt";
const char* filelist_name = "filelist.txt";
GenerateFileForTest(protofile, filelist_name);
const std::vector<std::string> filelist =
load_filelist_from_file(filelist_name);
paddle::framework::DataFeedDesc data_feed_desc =
load_datafeed_param_from_file(protofile);
std::vector<MultiTypeSet> reader_elem_set;
std::vector<MultiTypeSet> file_elem_set;
GetElemSetFromReader(&reader_elem_set, data_feed_desc, filelist, 4);
GetElemSetFromFile(&file_elem_set, data_feed_desc, filelist);
CheckIsUnorderedSame(reader_elem_set, file_elem_set);
}
......@@ -39,11 +39,12 @@ if (WITH_GPU)
endif()
cc_library(sequential_execution_pass SRCS sequential_execution_pass.cc DEPS graph graph_helper pass)
cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_helper pass)
cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle
scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle)
set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass)
set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass)
if (WITH_GPU)
list(APPEND SSA_GRAPH_EXECUTOR_DEPS reference_count_pass)
endif()
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/details/all_reduce_deps_pass.h"
#include "paddle/fluid/framework/details/all_reduce_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/details/op_graph_view.h"
#include "paddle/fluid/framework/details/var_handle.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/op_proto_maker.h"
namespace paddle {
namespace framework {
namespace details {
static constexpr char kAllOpDescs[] = "all_op_descs";
VarHandle* GetValidInput(const OpHandleBase* a) {
for (auto p : a->Inputs()) {
VarHandle* b = dynamic_cast<VarHandle*>(p);
if (b) {
return b;
}
}
return nullptr;
}
std::unique_ptr<ir::Graph> AllReduceDepsPass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
auto graph_ops = ir::FilterByNodeWrapper<OpHandleBase>(*graph);
// get vars order
int order = 0;
std::unordered_map<std::string, int> vars;
// TODO(gongwb): use graph topology sort to find the order of operators.
// Note that must assert topology sort is stable
auto& ops = Get<const std::vector<OpDesc*>>(kAllOpDescs);
for (auto* op_desc : ops) {
auto outputs = op_desc->Outputs();
for (auto& o_it : outputs) {
for (auto& v : o_it.second) { // values
vars[v] = order;
}
}
order++;
}
std::vector<OpHandleBase*> dist_ops;
// get allreduce ops.
for (auto& op : graph_ops) {
// FIXME(gongwb):add broad cast.
if (op->Name() == "all_reduce" || op->Name() == "reduce") {
dist_ops.push_back(op);
}
}
VLOG(10) << "dist_ops size:" << dist_ops.size() << std::endl;
std::sort(dist_ops.begin(), dist_ops.end(), [&](OpHandleBase* op1,
OpHandleBase* op2) {
VarHandle* i0 = dynamic_cast<VarHandle*>(GetValidInput(op1));
VarHandle* i1 = dynamic_cast<VarHandle*>(GetValidInput(op2));
PADDLE_ENFORCE(i0 != nullptr && i1 != nullptr, "%s convert to %s error",
op1->DebugString(), op2->DebugString());
auto l_it = vars.find(i0->name_);
auto r_it = vars.find(i1->name_);
if (l_it->second < r_it->second) return true;
if (l_it->second == r_it->second) {
return i0->name_ < i1->name_;
}
return false;
});
// add dependency.
auto& sorted_ops = dist_ops;
for (size_t i = 1; i < sorted_ops.size(); ++i) {
auto* dep_var = new DummyVarHandle(graph->CreateControlDepVar());
auto* pre_op = sorted_ops[i - 1];
auto* op = sorted_ops[i];
pre_op->AddOutput(dep_var);
op->AddInput(dep_var);
graph->Get<GraphDepVars>(kGraphDepVars).emplace(dep_var);
VLOG(10) << "add all_reduce sequential dependencies between " << pre_op
<< " and " << op;
VLOG(10) << "pre_op:" << pre_op->DebugString()
<< ", op:" << op->DebugString();
}
return graph;
}
} // namespace details
} // namespace framework
} // namespace paddle
REGISTER_PASS(all_reduce_deps_pass,
paddle::framework::details::AllReduceDepsPass)
.RequirePassAttr(paddle::framework::details::kAllOpDescs);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
namespace framework {
namespace details {
// TODO(gongwb): overlap allreduce with backward computation.
class AllReduceDepsPass : public ir::Pass {
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
};
} // namespace details
} // namespace framework
} // namespace paddle
......@@ -23,7 +23,7 @@ namespace paddle {
namespace framework {
namespace details {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
......@@ -74,7 +74,7 @@ void AllReduceOpHandle::RunImpl() {
}
if (platform::is_gpu_place(lod_tensors[0]->place())) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr.");
int dtype = -1;
size_t numel = 0;
......
......@@ -20,7 +20,7 @@
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
#include "paddle/fluid/platform/nccl_helper.h"
#endif
......@@ -29,7 +29,7 @@ namespace framework {
namespace details {
struct AllReduceOpHandle : public OpHandleBase {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
AllReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
const platform::NCCLContextMap *ctxs);
......@@ -49,7 +49,7 @@ struct AllReduceOpHandle : public OpHandleBase {
private:
std::vector<Scope *> local_scopes_;
std::vector<platform::Place> places_;
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
const platform::NCCLContextMap *nccl_ctxs_;
#endif
};
......
......@@ -60,7 +60,7 @@ void BroadcastOpHandle::BroadcastOneVar(
PADDLE_ENFORCE_NOT_NULL(in_var);
Tensor &in_tensor = VariableVisitor::GetMutableTensor(in_var);
if (UNLIKELY(!in_tensor.IsInitialized())) {
VLOG(30) << "in var " << in_var_handle.name_ << "not inited, return!";
VLOG(3) << "in var " << in_var_handle.name_ << "not inited, return!";
return;
}
......@@ -82,7 +82,7 @@ void BroadcastOpHandle::BroadcastOneVar(
});
}
} else {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
VarHandle *out_handle = nullptr;
int root_id = boost::get<platform::CUDAPlace>(in_tensor.place()).device;
std::vector<std::function<void()>> broadcast_calls;
......
......@@ -24,7 +24,7 @@
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/platform/device_context.h"
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
#include "paddle/fluid/platform/nccl_helper.h"
#endif
......@@ -34,7 +34,7 @@ namespace details {
struct BroadcastOpHandle : public OpHandleBase {
public:
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
BroadcastOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
const platform::NCCLContextMap *nccl_ctxs)
......@@ -68,7 +68,7 @@ struct BroadcastOpHandle : public OpHandleBase {
std::vector<Scope *> local_scopes_;
std::vector<platform::Place> places_;
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
const platform::NCCLContextMap *nccl_ctxs_;
#endif
......
......@@ -42,7 +42,7 @@ struct TestBroadcastOpHandle {
std::vector<std::unique_ptr<ir::Node>> nodes_;
std::vector<p::Place> place_list_;
bool use_gpu_;
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
#endif
......@@ -50,7 +50,7 @@ struct TestBroadcastOpHandle {
for (size_t j = 0; j < ctxs_.size(); ++j) {
ctxs_[j]->Wait();
}
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
if (nccl_ctxs_) {
nccl_ctxs_->WaitAll();
}
......@@ -60,7 +60,7 @@ struct TestBroadcastOpHandle {
void InitCtxOnGpu(bool use_gpu) {
use_gpu_ = use_gpu;
if (use_gpu_) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
int count = p::GetCUDADeviceCount();
if (count <= 1) {
LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA "
......@@ -84,7 +84,7 @@ struct TestBroadcastOpHandle {
place_list_.push_back(p);
ctxs_.emplace_back(new p::CPUDeviceContext(p));
}
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
nccl_ctxs_.reset(nullptr);
#endif
}
......@@ -106,14 +106,14 @@ struct TestBroadcastOpHandle {
nodes_.emplace_back(
ir::CreateNodeForTest("node0", ir::Node::Type::kOperation));
if (use_gpu_) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
op_handle_ = new BroadcastOpHandle(nodes_.back().get(), local_scopes_,
place_list_, nccl_ctxs_.get());
#else
PADDLE_THROW("CUDA is not support.");
#endif
} else {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
op_handle_ = new BroadcastOpHandle(nodes_.back().get(), local_scopes_,
place_list_, nccl_ctxs_.get());
#else
......
......@@ -16,6 +16,7 @@ limitations under the License. */
#include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h"
#include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h"
#include "paddle/fluid/framework/details/reduce_op_handle.h"
#include "paddle/fluid/framework/details/sequential_execution_pass.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
......@@ -24,6 +25,10 @@ namespace paddle {
namespace framework {
namespace details {
static inline bool SeqOnlyAllReduceOps(const BuildStrategy &strategy) {
return (!strategy.enable_sequential_execution_ && strategy.num_trainers_ > 1);
}
class ParallelExecutorPassBuilder : public ir::PassBuilder {
public:
explicit ParallelExecutorPassBuilder(const BuildStrategy &strategy)
......@@ -70,6 +75,10 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
// Verify that the graph is correct for multi-device executor.
AppendPass("multi_devices_check_pass");
if (SeqOnlyAllReduceOps(strategy)) {
AppendPass("all_reduce_deps_pass");
}
if (strategy_.remove_unnecessary_lock_) {
AppendPass("modify_op_lock_and_record_event_pass");
}
......@@ -96,7 +105,7 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
const std::string &loss_var_name,
const std::unordered_set<std::string> &param_names,
const std::vector<Scope *> &local_scopes,
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
const bool use_cuda, platform::NCCLContextMap *nccl_ctxs) const {
#else
const bool use_cuda) const {
......@@ -118,12 +127,23 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
pass->Erase("local_scopes");
pass->SetNotOwned<const std::vector<Scope *>>("local_scopes",
&local_scopes);
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr;
pass->Erase("nccl_ctxs");
pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx);
#endif
} else if (pass->Type() == "sequential_execution_pass") {
VLOG(1) << "set enable_sequential_execution:"
<< enable_sequential_execution_;
pass->Erase(kAllOpDescs);
pass->Set<const std::vector<OpDesc *>>(
kAllOpDescs,
new std::vector<OpDesc *>(main_program.Block(0).AllOps()));
} else if (pass->Type() == "all_reduce_deps_pass") {
VLOG(1) << "SeqOnlyAllReduceOps:" << SeqOnlyAllReduceOps(*this)
<< ", num_trainers:" << num_trainers_;
pass->Erase(kAllOpDescs);
pass->Set<const std::vector<OpDesc *>>(
kAllOpDescs,
......@@ -144,4 +164,5 @@ USE_PASS(multi_devices_pass);
USE_PASS(multi_devices_check_pass);
USE_PASS(multi_devices_print_pass);
USE_PASS(sequential_execution_pass);
USE_PASS(all_reduce_deps_pass);
USE_PASS(modify_op_lock_and_record_event_pass);
......@@ -23,7 +23,7 @@
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
#include "paddle/fluid/platform/nccl_helper.h"
#endif
......@@ -73,6 +73,7 @@ struct BuildStrategy {
bool fuse_broadcast_op_{false};
int num_trainers_{1};
bool remove_unnecessary_lock_{false};
// NOTE:
......@@ -98,7 +99,7 @@ struct BuildStrategy {
const std::string &loss_var_name,
const std::unordered_set<std::string> &param_names,
const std::vector<Scope *> &local_scopes,
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
const bool use_cuda, platform::NCCLContextMap *nccl_ctxs) const;
#else
const bool use_cuda) const;
......
......@@ -20,7 +20,7 @@ namespace paddle {
namespace framework {
namespace details {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
DataBalanceOpHandle::DataBalanceOpHandle(
ir::Node *node, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
......
......@@ -19,7 +19,7 @@
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
#include "paddle/fluid/platform/nccl_helper.h"
#endif
......@@ -29,7 +29,7 @@ namespace details {
struct DataBalanceOpHandle : public OpHandleBase {
public:
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
DataBalanceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
const platform::NCCLContextMap *ctxs);
......
......@@ -25,7 +25,7 @@
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/platform/device_context.h"
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
#include "paddle/fluid/platform/nccl_helper.h"
#endif
......@@ -35,7 +35,7 @@ namespace details {
struct FusedBroadcastOpHandle : public BroadcastOpHandle {
public:
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
FusedBroadcastOpHandle(ir::Node *node,
const std::vector<Scope *> local_scopes,
const std::vector<platform::Place> &places,
......
......@@ -44,14 +44,14 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
nodes_.emplace_back(
ir::CreateNodeForTest("fused_broadcast", ir::Node::Type::kOperation));
if (use_gpu_) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
op_handle_ = new FusedBroadcastOpHandle(
nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get());
#else
PADDLE_THROW("CUDA is not supported.");
#endif
} else {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
op_handle_ = new FusedBroadcastOpHandle(
nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get());
#else
......
......@@ -45,8 +45,8 @@ std::unique_ptr<ir::Graph> ModifyOpLockAndRecordEventPass::ApplyImpl(
IsLockAndRecordEventFreeComputationOpHandle(compute_op, graph_view);
compute_op->SetLockAndRecordEventFree(is_lock_and_record_event_free);
if (is_lock_and_record_event_free) {
VLOG(100) << "Set is_lock_and_record_event_free be true in op "
<< compute_op->DebugString();
VLOG(10) << "Set is_lock_and_record_event_free be true in op "
<< compute_op->DebugString();
}
}
return ir_graph;
......
......@@ -142,7 +142,7 @@ void MultiDevSSAGraphBuilder::Init() const {
places_ = Get<const std::vector<platform::Place>>(kPlaces);
local_scopes_ = Get<const std::vector<Scope *>>(kLocalScopes);
strategy_ = Get<const BuildStrategy>(kStrategy);
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
nccl_ctxs_ = &Get<platform::NCCLContextMap>("nccl_ctxs");
#endif
......@@ -399,7 +399,7 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
for (size_t i = 0; i < backward_vars.size(); i += 2) {
auto &p_name = backward_vars[i];
auto &g_name = backward_vars[i + 1];
VLOG(100) << "Bcast " << g_name << " for parameter " << p_name;
VLOG(10) << "Bcast " << g_name << " for parameter " << p_name;
switch (strategy_.reduce_) {
case BuildStrategy::ReduceStrategy::kReduce:
......@@ -431,7 +431,7 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
}
}
bool use_gpu = false;
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
use_gpu = nccl_ctxs_ != nullptr;
#endif
......@@ -478,7 +478,7 @@ bool MultiDevSSAGraphBuilder::IsSparseGradient(const std::string &og) const {
void MultiDevSSAGraphBuilder::SetCommunicationContext(
OpHandleBase *op_handle, const platform::Place &p) const {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
if (nccl_ctxs_ == nullptr) {
op_handle->SetDeviceContext(p,
platform::DeviceContextPool::Instance().Get(p));
......@@ -492,7 +492,7 @@ void MultiDevSSAGraphBuilder::SetCommunicationContext(
void MultiDevSSAGraphBuilder::CreateBroadcastOp(ir::Graph *result,
const std::string &p_name,
size_t src_dev_id) const {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
auto *op_handle = new BroadcastOpHandle(
result->CreateEmptyNode("broadcast", ir::Node::Type::kOperation),
local_scopes_, places_, nccl_ctxs_);
......@@ -522,7 +522,7 @@ void MultiDevSSAGraphBuilder::CreateBroadcastOp(ir::Graph *result,
void MultiDevSSAGraphBuilder::CreateFusedBroadcastOp(
ir::Graph *result,
const std::vector<std::unordered_set<std::string>> &bcast_varnames) const {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
auto *op_handle = new FusedBroadcastOpHandle(
result->CreateEmptyNode("fused_broadcast", ir::Node::Type::kOperation),
local_scopes_, places_, nccl_ctxs_);
......@@ -568,7 +568,7 @@ void MultiDevSSAGraphBuilder::CreateComputationalOp(ir::Graph *result,
void MultiDevSSAGraphBuilder::InsertAllReduceOp(ir::Graph *result,
const std::string &og) const {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
result->Get<GraphOps>(kGraphOps).emplace_back(new AllReduceOpHandle(
result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation),
local_scopes_, places_, nccl_ctxs_));
......@@ -597,7 +597,7 @@ void MultiDevSSAGraphBuilder::InsertAllReduceOp(ir::Graph *result,
void MultiDevSSAGraphBuilder::InsertDataBalanceOp(
ir::Graph *result, const std::vector<std::string> &datas) const {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
result->Get<GraphOps>(kGraphOps).emplace_back(new DataBalanceOpHandle(
result->CreateEmptyNode("data_balance", ir::Node::Type::kOperation),
local_scopes_, places_, nccl_ctxs_));
......@@ -694,7 +694,7 @@ void MultiDevSSAGraphBuilder::CreateComputationalOps(ir::Graph *result,
VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result,
const std::string &og,
int dst_dev_id) const {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
result->Get<GraphOps>(kGraphOps).emplace_back(new ReduceOpHandle(
result->CreateEmptyNode("reduce", ir::Node::Type::kOperation),
local_scopes_, places_, nccl_ctxs_));
......@@ -809,8 +809,8 @@ int MultiDevSSAGraphBuilder::CreateRPCOp(
node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName()));
PADDLE_ENFORCE_EQ(send_param_grad.size(), 2U);
op_dev_id = GetAppropriateDeviceID({send_param_grad[1]});
VLOG(100) << "send grad " << input_var_names[0] << " origin "
<< send_param_grad[1] << " place: " << op_dev_id;
VLOG(10) << "send grad " << input_var_names[0] << " origin "
<< send_param_grad[1] << " place: " << op_dev_id;
for (auto &varname : input_var_names) {
sharded_var_device->emplace(varname, op_dev_id);
}
......@@ -826,9 +826,9 @@ int MultiDevSSAGraphBuilder::CreateRPCOp(
if (recv_param_grad.size() == 2U) {
op_dev_id =
GetVarDeviceID(*result, recv_param_grad[1], *sharded_var_device);
VLOG(100) << "recv param " << recv_param_grad[0]
<< " get grad place: " << recv_param_grad[1]
<< " place: " << op_dev_id;
VLOG(10) << "recv param " << recv_param_grad[0]
<< " get grad place: " << recv_param_grad[1]
<< " place: " << op_dev_id;
} else {
op_dev_id = GetAppropriateDeviceID(output_var_names);
}
......@@ -862,7 +862,7 @@ int MultiDevSSAGraphBuilder::CreateRPCOp(
if (node->Op()->Type() == "fetch_barrier") {
outvar_dev_id =
GetVarDeviceID(*result, output->Name(), *sharded_var_device);
PADDLE_ENFORCE_NE(outvar_dev_id, -1);
PADDLE_ENFORCE_NE(outvar_dev_id, -1, "output name %s", output->Name());
}
p = places_[outvar_dev_id];
ir::Node *new_node = nullptr;
......
......@@ -40,7 +40,7 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
size_t device_id) const;
void Init() const;
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
mutable platform::NCCLContextMap *nccl_ctxs_;
#endif
......
......@@ -125,7 +125,7 @@ void ReduceOpHandle::RunImpl() {
}
});
} else if (paddle::platform::is_gpu_place(lod_tensors[0]->place())) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
auto pre_in = pre_in_var->Get<framework::LoDTensor>();
VariableVisitor::ShareDimsAndLoD(*pre_in_var, out_var);
VariableVisitor::GetMutableTensor(out_var).mutable_data(
......
......@@ -23,7 +23,7 @@
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/platform/device_context.h"
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
#include "paddle/fluid/platform/nccl_helper.h"
#endif
......@@ -35,7 +35,7 @@ struct ReduceOpHandle : public OpHandleBase {
std::vector<Scope *> local_scopes_;
std::vector<platform::Place> places_;
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
const platform::NCCLContextMap *nccl_ctxs_;
ReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
......
......@@ -35,7 +35,7 @@ struct TestReduceOpHandle {
std::vector<p::Place> gpu_list_;
std::vector<std::unique_ptr<p::DeviceContext>> ctxs_;
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
#endif
......@@ -43,7 +43,7 @@ struct TestReduceOpHandle {
for (size_t j = 0; j < ctxs_.size(); ++j) {
ctxs_[j]->Wait();
}
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
if (nccl_ctxs_) {
nccl_ctxs_->WaitAll();
}
......@@ -53,7 +53,7 @@ struct TestReduceOpHandle {
void InitCtxOnGpu(bool use_gpu) {
use_gpu_ = use_gpu;
if (use_gpu) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
int count = p::GetCUDADeviceCount();
if (count <= 1) {
LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA "
......@@ -77,7 +77,7 @@ struct TestReduceOpHandle {
gpu_list_.push_back(p);
ctxs_.emplace_back(new p::CPUDeviceContext(p));
}
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
nccl_ctxs_.reset(nullptr);
#endif
}
......@@ -99,14 +99,14 @@ struct TestReduceOpHandle {
nodes.emplace_back(new ir::Node("node"));
if (use_gpu_) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
op_handle_.reset(new ReduceOpHandle(nodes.back().get(), local_scopes_,
gpu_list_, nccl_ctxs_.get()));
#else
PADDLE_THROW("CUDA is not support.");
#endif
} else {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
op_handle_.reset(new ReduceOpHandle(nodes.back().get(), local_scopes_,
gpu_list_, nccl_ctxs_.get()));
#else
......
......@@ -140,8 +140,8 @@ std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
if (next_compute_op != nullptr) {
if (compute_ref_cnt_map.count(next_compute_op)) {
compute_ref_cnt_map[next_compute_op]->AddVar(var_name);
VLOG(50) << "Add reference count of " << var_name << " to Operator "
<< next_compute_op->Name();
VLOG(5) << "Add reference count of " << var_name << " to Operator "
<< next_compute_op->Name();
} else {
// Create new reference_count_op_handle
ir::Node *ref_cnt_node = graph->CreateEmptyNode(
......
......@@ -51,7 +51,7 @@ void ScaleLossGradOpHandle::RunImpl() {
->stream();
memory::Copy(boost::get<platform::CUDAPlace>(place_), tmp,
platform::CPUPlace(), &coeff_, sizeof(float), stream);
VLOG(100) << place_ << "RUN Scale loss grad op";
VLOG(10) << place_ << "RUN Scale loss grad op";
});
#endif
}
......
......@@ -16,7 +16,7 @@
#include <stdexcept>
#include <string>
#include <vector>
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/platform/profiler.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/framework/details/reference_count_op_handle.h"
......
......@@ -94,8 +94,8 @@ std::unique_ptr<ir::Graph> SequentialExecutionPass::ApplyImpl(
op_node_list[i - 1]->outputs.push_back(dep_var);
dep_var->outputs.push_back(op_node_list[i]);
dep_var->inputs.push_back(op_node_list[i - 1]);
VLOG(100) << "Add dependencies between " << op_node_list[i - 1]->Name()
<< " and " << op_node_list[i]->Name();
VLOG(10) << "Add dependencies between " << op_node_list[i - 1]->Name()
<< " and " << op_node_list[i]->Name();
}
return graph;
}
......
......@@ -210,16 +210,16 @@ void ThreadedSSAGraphExecutor::RunOp(
details::OpHandleBase *op) {
auto op_run = [ready_var_q, op, this] {
try {
if (VLOG_IS_ON(100)) {
VLOG(100) << op << " " << op->Name() << " : " << op->DebugString();
if (VLOG_IS_ON(10)) {
VLOG(10) << op << " " << op->Name() << " : " << op->DebugString();
}
if (LIKELY(!strategy_.dry_run_)) {
op->Run(strategy_.use_cuda_);
}
VLOG(100) << op << " " << op->Name() << " Done ";
VLOG(10) << op << " " << op->Name() << " Done ";
running_ops_--;
ready_var_q->Extend(op->Outputs());
VLOG(100) << op << " " << op->Name() << "Signal posted";
VLOG(10) << op << " " << op->Name() << "Signal posted";
} catch (...) {
exception_holder_.Catch(std::current_exception());
}
......
......@@ -21,6 +21,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/transfer_scope_cache.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/operators/detail/macros.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h"
......@@ -46,7 +47,7 @@ ExecutorPrepareContext::ExecutorPrepareContext(
}
ExecutorPrepareContext::~ExecutorPrepareContext() {
VLOG(50) << "destroy ExecutorPrepareContext";
VLOG(5) << "destroy ExecutorPrepareContext";
}
template <typename RefCntMap>
......@@ -63,7 +64,7 @@ static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op,
if ((it->second)-- == 1) {
auto* var = scope.FindVar(name);
if (var != nullptr) {
VLOG(100) << "Erase tensor \'" << name << "\'";
VLOG(10) << "Erase tensor \'" << name << "\'";
if (var->IsType<LoDTensor>()) {
erase_tensors.insert(var->GetMutable<LoDTensor>());
} else if (var->IsType<SelectedRows>()) {
......@@ -114,36 +115,6 @@ void Executor::Close() {
#endif
}
void InitializeVariable(Variable* var, proto::VarType::Type var_type) {
if (var_type == proto::VarType::LOD_TENSOR) {
var->GetMutable<LoDTensor>();
} else if (var_type == proto::VarType::SELECTED_ROWS) {
var->GetMutable<SelectedRows>();
} else if (var_type == proto::VarType::FEED_MINIBATCH) {
var->GetMutable<FeedFetchList>();
} else if (var_type == proto::VarType::FETCH_LIST) {
var->GetMutable<FeedFetchList>();
} else if (var_type == proto::VarType::STEP_SCOPES) {
var->GetMutable<std::vector<framework::Scope*>>();
} else if (var_type == proto::VarType::LOD_RANK_TABLE) {
var->GetMutable<LoDRankTable>();
} else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) {
var->GetMutable<LoDTensorArray>();
} else if (var_type == proto::VarType::PLACE_LIST) {
var->GetMutable<platform::PlaceList>();
} else if (var_type == proto::VarType::READER) {
var->GetMutable<ReaderHolder>();
} else if (var_type == proto::VarType::RAW) {
// GetMutable will be called in operator
} else {
PADDLE_THROW(
"Variable type %d is not in "
"[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, "
"LOD_RANK_TABLE, PLACE_LIST, READER, RAW]",
var_type);
}
}
void Executor::CreateVariables(const ProgramDesc& pdesc, Scope* scope,
int block_id) {
auto& global_block = pdesc.Block(block_id);
......@@ -162,21 +133,21 @@ void Executor::CreateVariables(const ProgramDesc& pdesc, Scope* scope,
if (var->Persistable()) {
auto* ptr = const_cast<Scope*>(ancestor_scope)->Var(var->Name());
InitializeVariable(ptr, var->GetType());
VLOG(30) << "Create Variable " << var->Name()
<< " global, which pointer is " << ptr;
VLOG(3) << "Create Variable " << var->Name()
<< " global, which pointer is " << ptr;
} else {
auto* ptr = scope->Var(var->Name());
InitializeVariable(ptr, var->GetType());
VLOG(30) << "Create Variable " << var->Name()
<< " locally, which pointer is " << ptr;
VLOG(3) << "Create Variable " << var->Name()
<< " locally, which pointer is " << ptr;
}
}
} else {
for (auto& var : global_block.AllVars()) {
auto* ptr = scope->Var(var->Name());
InitializeVariable(ptr, var->GetType());
VLOG(30) << "Create variable " << var->Name() << ", which pointer is "
<< ptr;
VLOG(3) << "Create variable " << var->Name() << ", which pointer is "
<< ptr;
}
}
}
......@@ -307,7 +278,7 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
int i = 0;
for (auto& feed_target : (*feed_targets)) {
std::string var_name = feed_target.first;
VLOG(30) << "feed target's name: " << var_name;
VLOG(3) << "feed target's name: " << var_name;
// prepend feed op
auto* op = global_block->PrependOp();
......@@ -330,7 +301,7 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
int i = 0;
for (auto& fetch_target : (*fetch_targets)) {
std::string var_name = fetch_target.first;
VLOG(30) << "fetch target's name: " << var_name;
VLOG(3) << "fetch target's name: " << var_name;
// append fetch op
auto* op = global_block->AppendOp();
......@@ -482,7 +453,7 @@ void Executor::RunPreparedContext(
void Executor::EnableMKLDNN(const ProgramDesc& program) {
#ifdef PADDLE_WITH_MKLDNN
VLOG(30) << "use_mkldnn=True";
VLOG(3) << "use_mkldnn=True";
for (size_t bid = 0; bid < program.Size(); ++bid) {
auto* block = const_cast<ProgramDesc&>(program).MutableBlock(bid);
for (auto* op : block->AllOps()) {
......
......@@ -26,7 +26,6 @@ limitations under the License. */
namespace paddle {
namespace framework {
extern void InitializeVariable(Variable* var, proto::VarType::Type var_type);
template <typename T>
std::unordered_map<std::string, T> GetNonPersistableReferenceCount(
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/executor_thread_worker.h"
#include "google/protobuf/io/zero_copy_stream_impl.h"
#include "google/protobuf/message.h"
#include "google/protobuf/text_format.h"
#include "gflags/gflags.h"
#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/lod_rank_table.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/inference/io.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/pybind/pybind.h"
namespace paddle {
namespace framework {
void ExecutorThreadWorker::CreateThreadOperators(const ProgramDesc& program) {
auto& block = program.Block(0);
op_names_.clear();
for (auto& op_desc : block.AllOps()) {
std::unique_ptr<OperatorBase> local_op = OpRegistry::CreateOp(*op_desc);
op_names_.push_back(op_desc->Type());
OperatorBase* local_op_ptr = local_op.release();
ops_.push_back(local_op_ptr);
continue;
}
}
void ExecutorThreadWorker::CreateThreadResource(
const framework::ProgramDesc& program,
const paddle::platform::Place& place) {
CreateThreadScope(program);
CreateThreadOperators(program);
SetMainProgram(program);
SetPlace(place);
}
void ExecutorThreadWorker::CreateThreadScope(const ProgramDesc& program) {
auto& block = program.Block(0);
PADDLE_ENFORCE_NOT_NULL(
root_scope_, "root_scope should be set before creating thread scope");
thread_scope_ = &root_scope_->NewScope();
for (auto& var : block.AllVars()) {
if (var->Persistable()) {
auto* ptr = root_scope_->Var(var->Name());
InitializeVariable(ptr, var->GetType());
} else {
auto* ptr = thread_scope_->Var(var->Name());
InitializeVariable(ptr, var->GetType());
}
}
}
void ExecutorThreadWorker::SetDataFeed(
const std::shared_ptr<DataFeed>& datafeed) {
thread_reader_ = datafeed;
}
void ExecutorThreadWorker::BindingDataFeedMemory() {
const std::vector<std::string>& input_feed =
thread_reader_->GetUseSlotAlias();
for (auto name : input_feed) {
thread_reader_->AddFeedVar(thread_scope_->Var(name), name);
}
}
void ExecutorThreadWorker::SetFetchVarNames(
const std::vector<std::string>& fetch_var_names) {
fetch_var_names_.clear();
fetch_var_names_.insert(fetch_var_names_.end(), fetch_var_names.begin(),
fetch_var_names.end());
}
void ExecutorThreadWorker::SetDevice() {
#if defined _WIN32 || defined __APPLE__
return;
#else
static unsigned concurrency_cap = std::thread::hardware_concurrency();
int thread_id = this->thread_id_;
if (thread_id < concurrency_cap) {
unsigned proc = thread_id;
cpu_set_t mask;
CPU_ZERO(&mask);
CPU_SET(proc, &mask);
if (-1 == sched_setaffinity(0, sizeof(mask), &mask)) {
VLOG(1) << "WARNING: Failed to set thread affinity for thread "
<< thread_id;
} else {
CPU_ZERO(&mask);
if ((0 != sched_getaffinity(0, sizeof(mask), &mask)) ||
(CPU_ISSET(proc, &mask) == 0)) {
VLOG(3) << "WARNING: Failed to set thread affinity for thread "
<< thread_id;
}
}
} else {
VLOG(1) << "WARNING: Failed to set thread affinity for thread "
<< thread_id;
}
#endif
}
template <typename T>
void print_lod_tensor(std::string var_name, const LoDTensor& lod_tensor) {
auto inspect = lod_tensor.data<T>();
auto element_num = lod_tensor.numel();
std::ostringstream sstream;
sstream << var_name << " (element num " << element_num << "): [";
sstream << inspect[0];
for (int j = 1; j < element_num; ++j) {
sstream << " " << inspect[j];
}
sstream << "]";
std::cout << sstream.str() << std::endl;
}
void print_fetch_var(Scope* scope, std::string var_name) {
const LoDTensor& tensor = scope->FindVar(var_name)->Get<LoDTensor>();
if (std::type_index(tensor.type()) ==
std::type_index(typeid(platform::float16))) {
print_lod_tensor<platform::float16>(var_name, tensor);
} else if (std::type_index(tensor.type()) == std::type_index(typeid(float))) {
print_lod_tensor<float>(var_name, tensor);
} else if (std::type_index(tensor.type()) ==
std::type_index(typeid(double))) {
print_lod_tensor<double>(var_name, tensor);
} else if (std::type_index(tensor.type()) == std::type_index(typeid(int))) {
print_lod_tensor<int>(var_name, tensor);
} else if (std::type_index(tensor.type()) ==
std::type_index(typeid(int64_t))) {
print_lod_tensor<int64_t>(var_name, tensor);
} else if (std::type_index(tensor.type()) == std::type_index(typeid(bool))) {
print_lod_tensor<bool>(var_name, tensor);
} else if (std::type_index(tensor.type()) ==
std::type_index(typeid(uint8_t))) {
print_lod_tensor<uint8_t>(var_name, tensor);
} else if (std::type_index(tensor.type()) ==
std::type_index(typeid(int16_t))) {
print_lod_tensor<int16_t>(var_name, tensor);
} else if (std::type_index(tensor.type()) ==
std::type_index(typeid(int8_t))) {
print_lod_tensor<int8_t>(var_name, tensor);
} else {
VLOG(1) << "print_fetch_var: unrecognized data type:"
<< tensor.type().name();
}
return;
}
void ExecutorThreadWorker::TrainFiles() {
// todo: configurable
SetDevice();
int fetch_var_num = fetch_var_names_.size();
fetch_values_.clear();
fetch_values_.resize(fetch_var_num);
thread_reader_->Start();
int cur_batch;
int batch_cnt = 0;
while ((cur_batch = thread_reader_->Next()) > 0) {
// executor run here
for (auto& op : ops_) {
op->Run(*thread_scope_, place_);
}
++batch_cnt;
thread_scope_->DropKids();
if (debug_ == false || thread_id_ != 0) {
continue;
}
for (int i = 0; i < fetch_var_num; ++i) {
print_fetch_var(thread_scope_, fetch_var_names_[i]);
} // end for (int i = 0...)
} // end while ()
}
void ExecutorThreadWorker::SetThreadId(int tid) { thread_id_ = tid; }
void ExecutorThreadWorker::SetPlace(const platform::Place& place) {
place_ = place;
}
void ExecutorThreadWorker::SetMainProgram(
const ProgramDesc& main_program_desc) {
main_program_.reset(new ProgramDesc(main_program_desc));
}
void ExecutorThreadWorker::SetRootScope(Scope* g_scope) {
root_scope_ = g_scope;
}
} // einit_modelnd namespace framework
} // end namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <map>
#include <memory>
#include <mutex> // NOLINT
#include <set>
#include <string>
#include <thread> // NOLINT
#include <vector>
#include "paddle/fluid/framework/data_feed.h"
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
namespace paddle {
namespace framework {
void CreateTensor(Variable* var, proto::VarType::Type var_type);
class ExecutorThreadWorker {
public:
ExecutorThreadWorker()
: thread_id_(-1), root_scope_(NULL), thread_scope_(NULL), debug_(false) {}
~ExecutorThreadWorker() {}
void CreateThreadResource(const framework::ProgramDesc& program,
const paddle::platform::Place& place);
void SetThreadId(int tid);
void SetDebug(const bool debug) { debug_ = debug; }
void SetRootScope(Scope* g_scope);
// set cpu device in this function
// cpu binding is used by default
void SetDevice();
// since we read data into memory that can not be accessed by program
// we need to bind memory of data with corresponding variables in program
// this function should be called after data feed is set
void BindingDataFeedMemory();
// set data feed declared in executor
void SetDataFeed(const std::shared_ptr<DataFeed>& datafeed);
// A multi-thread training function
void TrainFiles();
// set fetch variable names from python interface assigned by users
void SetFetchVarNames(const std::vector<std::string>& fetch_var_names);
private:
void CreateThreadScope(const framework::ProgramDesc& program);
void CreateThreadOperators(const framework::ProgramDesc& program);
void SetMainProgram(const ProgramDesc& main_program_desc);
void SetPlace(const paddle::platform::Place& place);
protected:
// thread index
std::shared_ptr<DataFeed> thread_reader_; // shared queue, thread buffer
int thread_id_;
// operator name
std::vector<std::string> op_names_;
// thread level, local operators for forward and backward
std::vector<OperatorBase*> ops_;
// main program for training
std::unique_ptr<framework::ProgramDesc> main_program_;
// execution place
platform::Place place_;
// root scope for model parameters
Scope* root_scope_;
// a thread scope, father scope is global score which is shared
Scope* thread_scope_;
private:
std::vector<std::string> fetch_var_names_;
std::vector<std::vector<float>> fetch_values_;
bool debug_;
};
} // namespace framework
} // namespace paddle
......@@ -25,7 +25,7 @@ void SetFeedVariable(Scope* scope, const LoDTensor& input,
const std::string& var_name, size_t index) {
// If var_name Variable is not found in GlobalScope, a new variable will
// be created.
VLOG(30) << "SetFeedVariable name=" << var_name << " index=" << index;
VLOG(3) << "SetFeedVariable name=" << var_name << " index=" << index;
Variable* g_feed_value = scope->Var(var_name);
auto& feed_inputs = *(g_feed_value->GetMutable<FeedFetchList>());
if (index >= feed_inputs.size()) {
......@@ -47,8 +47,8 @@ LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name,
typeid(FeedFetchList).name());
auto& fetch_outputs = *g_fetch_value->GetMutable<FeedFetchList>();
auto& tensor = fetch_outputs[index];
VLOG(30) << "Fetch " << var_name << " with index " << index
<< " shape= " << tensor.dims();
VLOG(3) << "Fetch " << var_name << " with index " << index
<< " shape= " << tensor.dims();
PADDLE_ENFORCE_LT(index, fetch_outputs.size());
return tensor;
}
......
......@@ -147,19 +147,19 @@ void PrepareParameters(Graph* graph, const Param& param) {
scope->Var(param.LSTMX)->GetMutable<LoDTensor>();
scope->Var(param.LSTMOUT)->GetMutable<LoDTensor>();
#define GATE_W(name__) \
auto* W_##name__##_w0 = scope->FindVar(#name__ ".w_0"); \
auto* W_##name__##_w1 = scope->FindVar(#name__ ".w_1"); \
auto* W_##name__##_b0 = scope->FindVar(#name__ ".b_0"); \
CHECK_P3(W_##name__##_w0, W_##name__##_w1, W_##name__##_b0); \
VLOG(40) << #name__ "_w0" \
<< " shape: " << W_##name__##_w0->Get<LoDTensor>().dims(); \
VLOG(40) << #name__ "_w1" \
<< " shape: " << W_##name__##_w1->Get<LoDTensor>().dims(); \
VLOG(40) << #name__ "_b0" \
<< " shape: " << W_##name__##_b0->Get<LoDTensor>().dims(); \
auto& W_##name__##_w0_t = W_##name__##_w0->Get<LoDTensor>(); \
auto& W_##name__##_w1_t = W_##name__##_w1->Get<LoDTensor>(); \
#define GATE_W(name__) \
auto* W_##name__##_w0 = scope->FindVar(#name__ ".w_0"); \
auto* W_##name__##_w1 = scope->FindVar(#name__ ".w_1"); \
auto* W_##name__##_b0 = scope->FindVar(#name__ ".b_0"); \
CHECK_P3(W_##name__##_w0, W_##name__##_w1, W_##name__##_b0); \
VLOG(4) << #name__ "_w0" \
<< " shape: " << W_##name__##_w0->Get<LoDTensor>().dims(); \
VLOG(4) << #name__ "_w1" \
<< " shape: " << W_##name__##_w1->Get<LoDTensor>().dims(); \
VLOG(4) << #name__ "_b0" \
<< " shape: " << W_##name__##_b0->Get<LoDTensor>().dims(); \
auto& W_##name__##_w0_t = W_##name__##_w0->Get<LoDTensor>(); \
auto& W_##name__##_w1_t = W_##name__##_w1->Get<LoDTensor>(); \
auto& W_##name__##_b0_t = W_##name__##_b0->Get<LoDTensor>();
GATE_W(forget);
......@@ -208,7 +208,7 @@ void PrepareLSTMWeight(const LoDTensor& W_forget_w0,
int D = W_forget_w0.dims()[0];
int M = W_forget_w1.dims()[0];
out->Resize(make_ddim({D + M, 4 * D}));
VLOG(30) << "LSTMWeight resized to " << out->dims();
VLOG(3) << "LSTMWeight resized to " << out->dims();
float* out_data = out->mutable_data<float>(platform::CPUPlace());
std::array<const float*, 4> tensors{
......
......@@ -57,7 +57,7 @@ std::unique_ptr<ir::Graph> ConvBiasFusePass::ApplyImpl(
int found_conv_bias_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
VLOG(40) << "handle ConvBias fuse";
VLOG(4) << "handle ConvBias fuse";
GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight,
conv_bias_pattern); // Filter
GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, conv_bias_pattern); // tmp
......@@ -74,7 +74,7 @@ std::unique_ptr<ir::Graph> ConvBiasFusePass::ApplyImpl(
// check if fuse can be done and if MKL-DNN should be used
FuseOptions fuse_option = FindFuseOption(*conv, *eltwise);
if (fuse_option == DO_NOT_FUSE || fuse_option == FUSE_NATIVE) {
VLOG(30) << "do not perform conv+bias fuse";
VLOG(3) << "do not perform conv+bias fuse";
return;
}
......
......@@ -121,7 +121,7 @@ std::unique_ptr<ir::Graph> ConvBNFusePass::ApplyImpl(
int found_conv_bn_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
VLOG(40) << "handle ConvBN fuse";
VLOG(4) << "handle ConvBN fuse";
// conv, batch_norm,
// conv_weight, conv_out,
......@@ -133,7 +133,7 @@ std::unique_ptr<ir::Graph> ConvBNFusePass::ApplyImpl(
// check if fuse can be done and if MKL-DNN should be used
FuseOptions fuse_option = FindFuseOption(*conv, *batch_norm);
if (fuse_option == DO_NOT_FUSE) {
VLOG(30) << "do not perform conv+bn fuse";
VLOG(3) << "do not perform conv+bn fuse";
return;
}
......@@ -241,7 +241,7 @@ std::unique_ptr<ir::Graph> ConvEltwiseAddBNFusePass::ApplyImpl(
int found_conv_bn_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
VLOG(40) << "handle ConvBN fuse";
VLOG(4) << "handle ConvBN fuse";
// conv, batch_norm,
// conv_weight, conv_out,
......
......@@ -38,7 +38,7 @@ std::unique_ptr<ir::Graph> ConvReLUFusePass::ApplyImpl(
int found_conv_relu_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
VLOG(40) << "handle ConvReLU fuse";
VLOG(4) << "handle ConvReLU fuse";
GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight,
conv_relu_pattern); // Filter
GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, conv_relu_pattern); // tmp
......@@ -48,7 +48,7 @@ std::unique_ptr<ir::Graph> ConvReLUFusePass::ApplyImpl(
FuseOptions fuse_option = FindFuseOption(*conv, *relu);
if (fuse_option == DO_NOT_FUSE) {
VLOG(30) << "do not perform conv+relu fuse";
VLOG(3) << "do not perform conv+relu fuse";
return;
}
......
......@@ -39,7 +39,7 @@ std::unique_ptr<ir::Graph> DepthwiseConvMKLDNNPass::ApplyImpl(
int found_depthwise_conv_mkldnn_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
VLOG(30) << "handle DepthwiseConvMKLDNN fuse";
VLOG(3) << "handle DepthwiseConvMKLDNN fuse";
GET_NODE(depthwise_conv, (*pattern));
depthwise_conv->Op()->SetType("conv2d");
found_depthwise_conv_mkldnn_count++;
......
......@@ -39,7 +39,7 @@ std::unique_ptr<ir::Graph> FCFusePass::ApplyImpl(
int found_fc_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
VLOG(40) << "handle FC fuse";
VLOG(4) << "handle FC fuse";
GET_IR_NODE_FROM_SUBGRAPH(w, w, fc_pattern);
GET_IR_NODE_FROM_SUBGRAPH(fc_bias, bias, fc_pattern);
GET_IR_NODE_FROM_SUBGRAPH(fc_out, Out, fc_pattern);
......
......@@ -61,7 +61,7 @@ std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseElewiseAddAct(
auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
Graph *g) {
VLOG(40) << "handle FuseElewiseAddAct fuse";
VLOG(4) << "handle FuseElewiseAddAct fuse";
GET_IR_NODE_FROM_SUBGRAPH(ele_y, ele_y, elewise_add_act_pattern);
GET_IR_NODE_FROM_SUBGRAPH(ele_out, elewise_add_out,
elewise_add_act_pattern);
......@@ -77,10 +77,10 @@ std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseElewiseAddAct(
Node *elewise_add_act_node = CreateFuseElewiseAddActNode(
g, act, ele_add, ele_x_n, ele_y_n, ele_out_n, act_out_n);
VLOG(40) << "\n\t " << ele_x_n << " and " << ele_y_n << " -> "
<< ele_add->Name() << " -> " << ele_out_n << "\n"
<< "\t " << ele_out_n << " -> " << act->Name() << " -> "
<< act_out_n;
VLOG(4) << "\n\t " << ele_x_n << " and " << ele_y_n << " -> "
<< ele_add->Name() << " -> " << ele_out_n << "\n"
<< "\t " << ele_out_n << " -> " << act->Name() << " -> "
<< act_out_n;
ReLinkNodes(g, ele_out, ele_add, act, elewise_add_act_node);
found_elewise_add_act_count++;
......@@ -113,7 +113,7 @@ std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseActElewiseAdd(
auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
Graph *g) {
VLOG(40) << "handle FuseElewiseAddAct fuse";
VLOG(4) << "handle FuseElewiseAddAct fuse";
GET_IR_NODE_FROM_SUBGRAPH(act_out, act_out, act_elewise_add_pattern);
GET_IR_NODE_FROM_SUBGRAPH(ele_x, ele_x, act_elewise_add_pattern);
GET_IR_NODE_FROM_SUBGRAPH(ele_out, elewise_add_out,
......@@ -129,9 +129,9 @@ std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseActElewiseAdd(
Node *elewise_add_act_node = CreateFuseElewiseAddActNode(
g, ele_add, act, elewise_add_x_n, act_i_n, act_o_n, elewise_add_out_n);
VLOG(40) << "\n\t " << act_i_n << " -> " << act->Name() << " -> " << act_o_n
<< "\n\t " << act_o_n << " and " << elewise_add_x_n << " -> "
<< ele_add->Name() << " -> " << elewise_add_out_n;
VLOG(4) << "\n\t " << act_i_n << " -> " << act->Name() << " -> " << act_o_n
<< "\n\t " << act_o_n << " and " << elewise_add_x_n << " -> "
<< ele_add->Name() << " -> " << elewise_add_out_n;
ReLinkNodes(g, act_out, act, ele_add, elewise_add_act_node);
found_elewise_add_act_count++;
......@@ -165,7 +165,7 @@ std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseElewiseAddActInplaceGrad(
auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
Graph *g) {
VLOG(40) << "handle FuseElewiseAddActGrad1 fuse";
VLOG(4) << "handle FuseElewiseAddActGrad1 fuse";
GET_IR_NODE_FROM_SUBGRAPH(act_out, act_out, elewise_add_act_grad_pattern);
GET_IR_NODE_FROM_SUBGRAPH(act_grad, act_grad, elewise_add_act_grad_pattern);
GET_IR_NODE_FROM_SUBGRAPH(d_itermediate_out, d_itermediate_out,
......@@ -208,10 +208,10 @@ std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseElewiseAddActInplaceGrad(
auto fused_node = g->CreateOpNode(&desc);
VLOG(40) << "\n\t " << d_act_out_n << " and " << act_out_n << " -> "
<< act_grad->Name() << " -> " << d_itermediate_out_n << "\n\t "
<< d_itermediate_out_n << " and " << act_out_n << " -> "
<< ele_add_grad->Name() << " -> " << d_itermediate_out_n;
VLOG(4) << "\n\t " << d_act_out_n << " and " << act_out_n << " -> "
<< act_grad->Name() << " -> " << d_itermediate_out_n << "\n\t "
<< d_itermediate_out_n << " and " << act_out_n << " -> "
<< ele_add_grad->Name() << " -> " << d_itermediate_out_n;
ReLinkNodes(g, d_itermediate_out, act_grad, ele_add_grad, fused_node);
found_elewise_add_act_count++;
......
......@@ -90,7 +90,7 @@ Graph::Graph(const ProgramDesc &program) : program_(program) {
std::map<std::string, std::vector<ir::Node *>> Graph::InitFromProgram(
const ProgramDesc &program) {
VLOG(30) << "block in program:" << program_.Size();
VLOG(3) << "block in program:" << program_.Size();
std::unordered_map<std::string, VarDesc *> all_vars;
// var nodes for each var name, will have multiple versions in SSA
std::map<std::string, std::vector<ir::Node *>> var_nodes;
......@@ -158,7 +158,7 @@ void Graph::ResolveHazard(
auto it_old = versions.rbegin();
++it_old;
for (; it_old != versions.rend(); it_new = it_old, ++it_old) {
VLOG(30) << "deal with var: " << (*it_new)->Name();
VLOG(3) << "deal with var: " << (*it_new)->Name();
ir::Node *write_op =
(*it_new)->inputs.empty() ? nullptr : (*it_new)->inputs[0];
const auto &read_ops = (*it_old)->outputs;
......
......@@ -89,7 +89,7 @@ class Graph {
attr_name);
attrs_[attr_name] = attr;
attr_dels_[attr_name] = [attr, attr_name]() {
VLOG(30) << "deleting " << attr_name;
VLOG(3) << "deleting " << attr_name;
delete attr;
};
}
......
......@@ -40,9 +40,8 @@ void SortHelper(
}
}
VLOG(30) << "topology sort insert: " << node->Name()
<< reinterpret_cast<void *>(node) << " input "
<< node->inputs.size();
VLOG(3) << "topology sort insert: " << node->Name()
<< reinterpret_cast<void *>(node) << " input " << node->inputs.size();
ret->push_back(node);
}
......@@ -111,9 +110,9 @@ std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
for (auto &var : n->inputs) {
for (auto &adj_n : var->inputs) {
PADDLE_ENFORCE(adj_n->NodeType() == ir::Node::Type::kOperation);
VLOG(40) << "adj " << adj_n->Name() << reinterpret_cast<void *>(adj_n)
<< " -> " << n->Name() << reinterpret_cast<void *>(n)
<< " via " << var->Name() << reinterpret_cast<void *>(var);
VLOG(4) << "adj " << adj_n->Name() << reinterpret_cast<void *>(adj_n)
<< " -> " << n->Name() << reinterpret_cast<void *>(n)
<< " via " << var->Name() << reinterpret_cast<void *>(var);
adj_list[n].insert(adj_n);
}
}
......
......@@ -92,19 +92,19 @@ void GraphPatternDetector::operator()(Graph *graph,
PrettyLogEndl(Style::detail(), "--- detect %d subgraphs", subgraphs.size());
int id = 0;
for (auto &g : subgraphs) {
VLOG(30) << "optimizing #" << id++ << " subgraph";
VLOG(3) << "optimizing #" << id++ << " subgraph";
handler(g, graph);
}
}
bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph &graph) {
VLOG(30) << "mark pdnodes in graph";
VLOG(3) << "mark pdnodes in graph";
if (graph.Nodes().empty()) return false;
for (auto &node : GraphTraits::DFS(graph)) {
for (const auto &pdnode : pattern_.nodes()) {
if (pdnode->Tell(&node)) {
VLOG(40) << "pdnode " << pdnode->name() << " marked";
VLOG(4) << "pdnode " << pdnode->name() << " marked";
pdnodes2nodes_[pdnode.get()].insert(&node);
}
}
......@@ -112,7 +112,7 @@ bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph &graph) {
// Check to early stop if some PDNode can't find matched Node.
for (auto &pdnode : pattern_.nodes()) {
if (!pdnodes2nodes_.count(pdnode.get())) {
VLOG(40) << pdnode->name() << " can't find matched Node, early stop";
VLOG(4) << pdnode->name() << " can't find matched Node, early stop";
// return false;
}
}
......@@ -121,7 +121,7 @@ bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph &graph) {
GetMarkedNodes(const_cast<Graph *>(&graph)).insert(n);
}
}
VLOG(30) << pdnodes2nodes_.size() << " nodes marked";
VLOG(3) << pdnodes2nodes_.size() << " nodes marked";
return !pdnodes2nodes_.empty();
}
......@@ -215,7 +215,7 @@ GraphPatternDetector::DetectPatterns() {
// Extend a PDNode to subgraphs by deducing the connection relations defined
// in edges of PDNodes.
for (const auto &edge : pattern_.edges()) {
VLOG(40) << "check " << edge.first->name() << " -> " << edge.second->name();
VLOG(4) << "check " << edge.first->name() << " -> " << edge.second->name();
// TODO(Superjomn) Fix bug here, the groups might be duplicate here.
// Each role has two PDNodes, which indicates two roles.
// Detect two Nodes that can match these two roles and they are connected.
......@@ -226,7 +226,7 @@ GraphPatternDetector::DetectPatterns() {
// source -> target
for (Node *source : pdnodes2nodes_[edge.first]) {
for (Node *target : pdnodes2nodes_[edge.second]) {
VLOG(80) << "check " << source->id() << " -- " << target->id();
VLOG(8) << "check " << source->id() << " -- " << target->id();
// TODO(Superjomn) add some prune strategies.
for (const auto &group : pre_groups) {
if (IsNodesLink(source, target)) {
......@@ -243,13 +243,12 @@ GraphPatternDetector::DetectPatterns() {
}
}
}
VLOG(30) << "step " << step << " get records: " << cur_groups.size();
VLOG(3) << "step " << step << " get records: " << cur_groups.size();
for (auto &group : cur_groups) {
for (auto &item : group.roles) {
VLOG(40) << "node " << item.second->id() << " as "
<< item.first->name();
VLOG(4) << "node " << item.second->id() << " as " << item.first->name();
}
VLOG(40) << "=========================================================";
VLOG(4) << "=========================================================";
}
}
......
......@@ -41,7 +41,7 @@ std::string FormatName(const Node* node) {
std::unique_ptr<ir::Graph> GraphVizPass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
const std::string graph_viz_path = Get<std::string>(kGraphVizPath);
VLOG(30) << "draw IR graph viz to " << graph_viz_path;
VLOG(3) << "draw IR graph viz to " << graph_viz_path;
std::unique_ptr<std::ostream> fout(new std::ofstream(graph_viz_path));
PADDLE_ENFORCE(fout->good());
std::ostream& sout = *fout;
......
......@@ -38,7 +38,7 @@ std::unique_ptr<ir::Graph> IsTestPass::ApplyImpl(
for (const Node* n : graph->Nodes()) {
if (n->IsOp()) {
auto* op = n->Op();
if (op->HasAttr("is_test")) {
if (n->RuntimeHasAttr("is_test")) {
op->SetAttr("is_test", true);
} else if (std::find(begin(op_list), end(op_list), op->Type()) !=
end(op_list)) {
......
......@@ -15,7 +15,10 @@
#include "paddle/fluid/framework/ir/is_test_pass.h"
#include <gtest/gtest.h>
#ifdef _WIN32
#undef FALSE
#undef TRUE
#endif
namespace paddle {
namespace framework {
namespace ir {
......@@ -101,9 +104,9 @@ TEST(IsTestPass, basic) {
auto* op = node->Op();
auto op_name = boost::get<std::string>(op->GetAttr("name"));
if (op_name == "conv3") {
ASSERT_FALSE(op->HasAttr("is_test"));
ASSERT_FALSE(node->RuntimeHasAttr("is_test"));
} else {
ASSERT_TRUE(op->HasAttr("is_test"));
ASSERT_TRUE(node->RuntimeHasAttr("is_test"));
EXPECT_TRUE(boost::get<bool>(op->GetAttr("is_test")));
}
}
......
......@@ -20,9 +20,9 @@ namespace ir {
std::unique_ptr<ir::Graph> MKLDNNPlacementPass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
VLOG(30) << "Aplies MKL-DNN placement strategy.";
VLOG(3) << "Aplies MKL-DNN placement strategy.";
for (const Node* n : graph->Nodes()) {
if (n->IsOp() && n->Op()->HasAttr("use_mkldnn")) {
if (n->IsOp() && n->RuntimeHasAttr("use_mkldnn")) {
n->Op()->SetAttr("use_mkldnn", true);
}
}
......
......@@ -62,7 +62,7 @@ VarDesc UpdateGradVarDesc(
string::Sprintf("%s.repeat.%d", var_desc->Name(), repeat);
VarDesc repeated_var = CopyVarDesc(var_desc);
repeated_var.SetName(new_gname);
VLOG(30) << "update " << var_desc->Name() << " to repeat " << repeat;
VLOG(3) << "update " << var_desc->Name() << " to repeat " << repeat;
return repeated_var;
}
return *var_desc;
......@@ -78,7 +78,7 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl(
std::vector<ir::Node*> nodes = TopologySortOperations(*graph);
auto origin_nodes = graph->ReleaseNodes();
VLOG(30) << "origin nodes count: " << origin_nodes.size();
VLOG(3) << "origin nodes count: " << origin_nodes.size();
ir::Graph& result = *graph;
// 1. record op nodes of different roles
......@@ -137,8 +137,8 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl(
"%s.repeat.%d", repeated_op.Input("Variance")[0], i);
bn_vars_need_rename.insert(repeated_op.Input("Mean")[0]);
bn_vars_need_rename.insert(repeated_op.Input("Variance")[0]);
VLOG(30) << "renaming " << repeated_op.Input("Mean")[0] << " to "
<< new_mean_name;
VLOG(3) << "renaming " << repeated_op.Input("Mean")[0] << " to "
<< new_mean_name;
repeated_op.RenameInput(repeated_op.Input("Mean")[0], new_mean_name);
repeated_op.RenameInput(repeated_op.Input("Variance")[0], new_var_name);
repeated_op.RenameOutput(repeated_op.Output("MeanOut")[0],
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/ir/node.h"
#include "paddle/fluid/framework/op_info.h"
namespace paddle {
namespace framework {
......@@ -24,10 +25,33 @@ constexpr char Node::kControlDepVarName[];
const char Node::kControlDepVarName[] = "__control_var";
#endif
std::unique_ptr<Node> CreateNodeForTest(const std::string& name,
std::unique_ptr<Node> CreateNodeForTest(const std::string &name,
Node::Type type) {
return std::unique_ptr<Node>(new Node(name, type));
}
bool Node::RuntimeHasAttr(const std::string &name) const {
if (Op()->HasAttr(name)) {
return true;
} else {
auto &op_info = OpInfoMap::Instance();
auto op_type = Op()->Type();
if (op_info.Has(op_type)) {
auto op_info_ptr = op_info.Get(op_type);
if (op_info_ptr.HasOpProtoAndChecker()) {
const proto::OpProto &proto = op_info_ptr.Proto();
for (int i = 0; i != proto.attrs_size(); ++i) {
const proto::OpProto::Attr &attr = proto.attrs(i);
if (attr.name() == name) {
return true;
}
}
}
}
}
return false;
}
} // namespace ir
} // namespace framework
} // namespace paddle
......@@ -108,6 +108,18 @@ class Node {
Name().find(ir::Node::kControlDepVarName) != std::string::npos;
}
// RuntimeHasAttr is different with HasAttr now.
// 1. For Op()->HasAttr(), it judges whether a stored program_desc_ has attr,
// thus, if stored program_desc_ are old which don't have an attr, a new
// library which adds the attr already will fail on this function.
// Details:
// https://github.com/PaddlePaddle/Paddle/pull/14608#issuecomment-442309087
// 2. For Op()->RuntimeHasAttr, it judges the attr in runtime to avoid above
// problem.
// TODO(luotao): Maybe we should enhance HasAttr later, instead of adding
// RuntimeHasAttr.
bool RuntimeHasAttr(const std::string& name) const;
std::vector<Node*> inputs;
std::vector<Node*> outputs;
......
......@@ -76,7 +76,7 @@ class Pass {
attr_name);
attrs_[attr_name] = attr;
attr_dels_[attr_name] = [attr, attr_name]() {
VLOG(30) << "deleting " << attr_name;
VLOG(3) << "deleting " << attr_name;
delete attr;
};
}
......
......@@ -196,7 +196,7 @@ std::unique_ptr<ir::Graph> SeqConcatFcFusePass::ApplyImpl(
detector(graph.get(), [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* graph) {
VLOG(40) << "get one concat pattern";
VLOG(4) << "get one concat pattern";
// fc
GET_NODE(fc_w, detector.pattern());
GET_NODE(fc_bias, detector.pattern());
......
......@@ -60,7 +60,7 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope) {
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
VLOG(40) << "handle SeqConv EltAdd Relu fuse";
VLOG(4) << "handle SeqConv EltAdd Relu fuse";
GET_IR_NODE_FROM_SUBGRAPH(seqconv, seqconv, fuse_pattern);
GET_IR_NODE_FROM_SUBGRAPH(seqconv_weight, seqconv_weight, fuse_pattern);
GET_IR_NODE_FROM_SUBGRAPH(seqconv_out, seqconv_out, fuse_pattern);
......
......@@ -31,7 +31,7 @@ void LoDRankTable::Reset(const LoD& lod, size_t level) {
TableItem item;
item.index = i;
item.length = vec[i + 1] - vec[i];
VLOG(100) << "Add item to rank table " << item.index << " " << item.length;
VLOG(10) << "Add item to rank table " << item.index << " " << item.length;
items_.emplace_back(item);
}
// NOTE(yuyang18):
......
......@@ -26,10 +26,8 @@ limitations under the License. */
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/memory/memory.h"
#if !defined(_WIN32)
#include "paddle/fluid/recordio/scanner.h"
#include "paddle/fluid/recordio/writer.h"
#endif // _WIN32
namespace paddle {
namespace framework {
......@@ -305,7 +303,6 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor,
TensorFromStream(is, static_cast<Tensor *>(tensor), dev_ctx);
}
#if !defined(_WIN32)
void WriteToRecordIO(recordio::Writer *writer,
const std::vector<LoDTensor> &tensor,
const platform::DeviceContext &dev_ctx) {
......@@ -335,19 +332,7 @@ bool ReadFromRecordIO(recordio::Scanner *scanner,
return true;
}
#else
class Writer {};
class Scanner {};
void WriteToRecordIO(recordio::Writer *writer,
const std::vector<LoDTensor> &tensor,
const platform::DeviceContext &dev_ctx) {}
bool ReadFromRecordIO(recordio::Scanner *scanner,
const platform::DeviceContext &dev_ctx,
std::vector<LoDTensor> *result_ptr) {
PADDLE_ENFORCE("windows didn't supported recordio!.");
return true;
}
#endif // _WIN32
std::vector<LoDTensor> LoDTensor::SplitLoDTensor(
const std::vector<platform::Place> places) const {
check_memory_size();
......
......@@ -274,7 +274,6 @@ TEST(LoD, ConvertToOffsetBasedLoD) {
EXPECT_EQ(offset_lod, expected);
}
#if !defined(_WIN32)
template <typename T>
static void TestRecordIO() {
LoDTensor tensor;
......@@ -321,7 +320,6 @@ TEST(LoDTensor, RecordIO) {
TestRecordIO<float>();
TestRecordIO<double>();
}
#endif // !defined(_WIN32)
} // namespace framework
} // namespace paddle
......@@ -51,7 +51,7 @@ TEST(mixed_vector, InitWithCount) {
TEST(mixed_vector, ForEach) {
vec<int> tmp;
for (auto& v : tmp) {
VLOG(30) << v;
VLOG(3) << v;
}
}
......
......@@ -21,42 +21,11 @@
#include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/string/pretty_log.h"
namespace paddle {
namespace framework {
// These code can be shared with Executor.
static void InitializeVariable(Variable *var, proto::VarType::Type var_type) {
if (var_type == proto::VarType::LOD_TENSOR) {
var->GetMutable<LoDTensor>();
} else if (var_type == proto::VarType::SELECTED_ROWS) {
var->GetMutable<SelectedRows>();
} else if (var_type == proto::VarType::FEED_MINIBATCH) {
var->GetMutable<FeedFetchList>();
} else if (var_type == proto::VarType::FETCH_LIST) {
var->GetMutable<FeedFetchList>();
} else if (var_type == proto::VarType::STEP_SCOPES) {
var->GetMutable<std::vector<framework::Scope *>>();
} else if (var_type == proto::VarType::LOD_RANK_TABLE) {
var->GetMutable<LoDRankTable>();
} else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) {
var->GetMutable<LoDTensorArray>();
} else if (var_type == proto::VarType::PLACE_LIST) {
var->GetMutable<platform::PlaceList>();
} else if (var_type == proto::VarType::READER) {
var->GetMutable<ReaderHolder>();
} else if (var_type == proto::VarType::RAW) {
// GetMutable will be called in operator
} else {
PADDLE_THROW(
"Variable type %d is not in "
"[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, "
"LOD_RANK_TABLE, PLACE_LIST, READER, CHANNEL, RAW]",
var_type);
}
}
void NaiveExecutor::Prepare(Scope *scope, const ProgramDesc &program_desc,
int block_id, bool with_feed_fetch_ops) {
if (!scope) {
......
......@@ -15,23 +15,105 @@ limitations under the License. */
#ifdef PADDLE_WITH_NGRAPH
#include <algorithm>
#include <functional>
#include <vector>
#include "paddle/fluid/framework/ngraph_bridge.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/enforce.h"
#include "ngraph/ngraph.hpp"
namespace paddle {
namespace framework {
static std::shared_ptr<ngraph::Node> GetNode(
const std::shared_ptr<OperatorBase>& op, const std::string prm,
const VariableNameMap& var_map,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
auto& var_names = var_map.at(prm);
PADDLE_ENFORCE_EQ(var_names.size(), 1,
"op %s prm %s expects one associated var", op->Type(), prm);
if (ngb_node_map->find(var_names[0]) != ngb_node_map->end()) {
return (*ngb_node_map)[var_names[0]];
} else {
return nullptr;
}
}
static std::shared_ptr<ngraph::Node> GetInputNode(
const std::shared_ptr<OperatorBase>& op, const std::string prm,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
return GetNode(op, prm, op->Inputs(), ngb_node_map);
}
static std::shared_ptr<ngraph::Node> GetOutputNode(
const std::shared_ptr<OperatorBase>& op, const std::string prm,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
return GetNode(op, prm, op->Outputs(), ngb_node_map);
}
static void SetOutputNode(
const std::shared_ptr<OperatorBase>& op, const std::string prm,
std::shared_ptr<ngraph::Node> node,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
auto& var_names = op->Outputs().at(prm);
if (var_names.size() == 1) {
(*ngb_node_map)[var_names[0]] = node;
} else if (var_names.size() == 0) {
(*ngb_node_map)[""] = node;
} else {
PADDLE_THROW("prm %s has more than 1 var_names.", prm);
}
}
static bool HasOutput(const std::shared_ptr<OperatorBase>& op,
const std::string prm) {
auto& outputs = op->Outputs();
if (outputs.find(prm) == outputs.end()) return false;
return outputs.at(prm).size() > 0;
}
template <typename T>
static void BuildBinaryNode(
const std::shared_ptr<OperatorBase>& op,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
auto x = GetInputNode(op, "X", ngb_node_map);
auto y = GetInputNode(op, "Y", ngb_node_map);
auto out = std::make_shared<T>(x, y);
SetOutputNode(op, "Out", out, ngb_node_map);
}
template <typename T>
static void BuildUnaryNode(
const std::shared_ptr<OperatorBase>& op,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
auto input = GetInputNode(op, "X", ngb_node_map);
auto out = std::make_shared<T>(input);
SetOutputNode(op, "Out", out, ngb_node_map);
}
std::map<std::string,
std::function<void(const std::shared_ptr<OperatorBase>&,
std::shared_ptr<std::unordered_map<
std::string, std::shared_ptr<ngraph::Node>>>)>>
NgraphBridge::NG_NODE_MAP = {};
NgraphBridge::NG_NODE_MAP = {{"relu", BuildUnaryNode<ngraph::op::Relu>},
{"tanh", BuildUnaryNode<ngraph::op::Tanh>}};
void NgraphBridge::build_graph(const std::shared_ptr<OperatorBase>& op) {
void NgraphBridge::BuildNgNode(const std::shared_ptr<OperatorBase>& op) {
auto& op_type = op->Type();
NG_NODE_MAP[op_type](op, ngb_node_map);
NG_NODE_MAP[op_type](op, ngb_node_map_);
}
} // namespace framework
......
......@@ -20,16 +20,14 @@ limitations under the License. */
#include <map>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/enforce.h"
#include "ngraph/ngraph.hpp"
#include "ngraph/node.hpp"
namespace paddle {
namespace framework {
class OperatorBase;
class NgraphBridge {
public:
static std::map<
......@@ -43,14 +41,14 @@ class NgraphBridge {
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
var_node_map)
: ngb_node_map(var_node_map) {}
: ngb_node_map_(var_node_map) {}
void build_graph(const std::shared_ptr<OperatorBase>& op);
void BuildNgNode(const std::shared_ptr<OperatorBase>& op);
private:
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map;
ngb_node_map_;
};
} // namespace framework
......
......@@ -19,14 +19,29 @@ limitations under the License. */
#include <map>
#include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/ngraph_bridge.h"
#include "paddle/fluid/framework/ngraph_operator.h"
#include "paddle/fluid/framework/shape_inference.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/var_desc.h"
#include "paddle/fluid/framework/var_type.h"
#include "ngraph/ngraph.hpp"
namespace paddle {
namespace framework {
static ngraph::Shape Ddim2Shape(const DDim& dims) {
ngraph::Shape sp;
for (int i = 0; i < dims.size(); ++i) {
int k = dims[i];
k = k == 0 ? 1 : k;
sp.push_back(k);
}
return sp;
}
static std::map<proto::VarType::Type, ngraph::element::Type> pd2ng_type_map = {
{proto::VarType::FP32, ngraph::element::f32},
{proto::VarType::FP64, ngraph::element::f64},
......@@ -42,6 +57,7 @@ typedef enum { /* nGraph support state on ops */
PARTIAL_TEST /* Support partial list of ops for test */
} op_state;
// perform graph build through bridge and execute computation
class NgraphOperator {
public:
explicit NgraphOperator(const Scope& scope, const platform::Place& place,
......@@ -59,13 +75,23 @@ class NgraphOperator {
persistables_(persist),
fetches_(fetches),
post_op_inputs_(post_op_inputs),
ng_op_state_(ng_op_state) {}
ng_op_state_(ng_op_state) {
var_in_node_map_ = std::make_shared<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>();
var_node_map_ = std::make_shared<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>();
BuildNgIO();
GetNgFunction();
}
void Run(const Scope& scope, const platform::Place& place) const;
private:
static std::unordered_map<std::string, std::shared_ptr<ngraph::Function>>
func_cache;
func_cache_;
const Scope& scope_;
const platform::Place& place_;
std::vector<std::shared_ptr<OperatorBase>> fused_ops_;
......@@ -74,6 +100,35 @@ class NgraphOperator {
std::unordered_set<std::string> fetches_;
std::unordered_set<std::string> post_op_inputs_;
op_state ng_op_state_;
// ngraph backend eg. CPU
static std::shared_ptr<ngraph::runtime::Backend> backend_;
// ngraph function to call and execute
std::shared_ptr<ngraph::Function> ngraph_function_;
// var_name of inputs
std::vector<std::string> var_in_;
// var_name of outputs from fetch in order
std::vector<std::string> var_out_;
// map input vars to nodes
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
var_in_node_map_;
// map each var name with a ngraph node
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
var_node_map_;
// cache key to check if function is cached
std::shared_ptr<std::string> GetCacheKey();
// get ngraph input and define ngraph input parameters
void GetNgInputShape(std::shared_ptr<OperatorBase> op);
// Call ngraph bridge to map ops
void BuildNgNodes();
// get the ngraph input and output var list
void BuildNgIO();
// build ngraph function call
void BuildNgFunction();
// Check cache for ngraph function or otherwise build the function
void GetNgFunction();
};
std::vector<std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator>>
......@@ -86,7 +141,7 @@ FusedOperator::FusedOpIntervals(
}
size_t size = ops->size();
size_t left = 0;
while (left < size && ops.at(left)->Type() != kFeedOpType) {
while (left < size && ops->at(left)->Type() != kFeedOpType) {
++left;
}
if (left == size) {
......@@ -116,7 +171,7 @@ FusedOperator::FusedOpIntervals(
size_t start = pivot, end = start;
while (pivot < right &&
(paddle::framework::NgraphBridge::NG_NODE_MAP.find(
ops.at(pivot)->Type()) !=
ops->at(pivot)->Type()) !=
paddle::framework::NgraphBridge::NG_NODE_MAP.end())) {
++pivot;
++end;
......@@ -136,7 +191,9 @@ FusedOperator::FusedOperator(
std::vector<std::unique_ptr<OperatorBase>>::iterator end,
const std::string& type, const VariableNameMap& inputs,
const VariableNameMap& outputs, const AttributeMap& attrs)
: OperatorBase(type, inputs, outputs, attrs), pdesc(prog), block(block_id) {
: OperatorBase(type, inputs, outputs, attrs),
pdesc_(prog),
block_(block_id) {
for (std::vector<std::unique_ptr<OperatorBase>>::iterator it = start;
it != end; ++it) {
fused_ops_.push_back(std::move(*it));
......@@ -152,7 +209,7 @@ FusedOperator::FusedOperator(
}
if ((*(start - 1))->Type() == kFeedOpType && (*end)->Type() == kFetchOpType) {
is_complete = true;
is_full_ = true;
}
Process();
......@@ -205,7 +262,7 @@ void FusedOperator::RunImpl(const Scope& scope,
}
}
if (is_full) {
if (is_full_) {
ng_op_state = ng_op_state == PARTIAL_TEST ? FULL_TEST : FULL_TRAIN;
}
......@@ -215,6 +272,280 @@ void FusedOperator::RunImpl(const Scope& scope,
ngraph_op.Run(scope, place);
}
std::unordered_map<std::string, std::shared_ptr<ngraph::Function>>
NgraphOperator::func_cache_ = {};
std::shared_ptr<ngraph::runtime::Backend> NgraphOperator::backend_ =
ngraph::runtime::Backend::create("CPU");
void NgraphOperator::GetNgInputShape(std::shared_ptr<OperatorBase> op) {
op->RuntimeInferShape(scope_, place_);
for (auto& var_name_item : op->Inputs()) {
for (auto& var_name : var_name_item.second) {
auto* var = scope_.FindVar(var_name);
if (var && var->IsType<LoDTensor>()) {
auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var);
auto sp = Ddim2Shape(tensor_pd->dims());
if (std::find(var_in_.begin(), var_in_.end(), var_name) !=
var_in_.end()) {
if (var_node_map_->find(var_name) == var_node_map_->end()) {
auto ng_type = var_type_map_.at(var_name);
auto prm =
std::make_shared<ngraph::op::Parameter>(ng_type, sp, true);
(*var_node_map_)[var_name] = prm;
(*var_in_node_map_)[var_name] = prm;
}
}
}
}
}
}
void NgraphOperator::BuildNgNodes() {
for (auto& var_name : var_out_) {
if (var_node_map_->find(var_name) == var_node_map_->end()) {
auto* var = scope_.FindVar(var_name);
if (var && var->IsType<LoDTensor>()) {
auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var);
auto& ddim = tensor_pd->dims();
auto ng_shape = Ddim2Shape(ddim);
auto ng_type = var_type_map_.at(var_name);
auto prm =
std::make_shared<ngraph::op::Parameter>(ng_type, ng_shape, true);
(*var_node_map_)[var_name] = prm;
}
}
}
paddle::framework::NgraphBridge ngb(var_node_map_);
for (auto& op : fused_ops_) {
ngb.BuildNgNode(op);
}
}
void NgraphOperator::BuildNgIO() {
std::unordered_set<std::string> inputs;
std::unordered_set<std::string> outputs;
for (auto& op : fused_ops_) {
for (auto& var_name_item : op->Inputs()) {
for (auto& var_name : var_name_item.second) {
inputs.insert(var_name);
const bool is_output = outputs.find(var_name) != outputs.end();
if (!is_output &&
std::find(var_in_.begin(), var_in_.end(), var_name) ==
var_in_.end()) {
// fill var_in here to keep lhs and rhs order
var_in_.push_back(var_name);
}
}
}
if (op->Type() != "fill_constant") {
GetNgInputShape(op);
}
for (auto& var_name_item : op->Outputs()) {
PADDLE_ENFORCE_LE(var_name_item.second.size(), 1,
"op %s has more than 1 output - Not handling yet",
op->Type());
for (auto& var_name : var_name_item.second) {
outputs.insert(var_name);
}
}
}
// var_out.clear();
for (auto& op : fused_ops_) {
for (auto& var_name_item : op->Outputs()) {
PADDLE_ENFORCE_LE(var_name_item.second.size(), 1,
"op %s has more than 1 output - Not handling yet",
op->Type());
for (auto& var_name : var_name_item.second) {
switch (ng_op_state_) {
case PARTIAL_TEST:
if (post_op_inputs_.find(var_name) != post_op_inputs_.end() ||
fetches_.find(var_name) != fetches_.end()) {
var_out_.push_back(var_name);
}
break;
case FULL_TEST:
if (fetches_.find(var_name) != fetches_.end()) {
var_out_.push_back(var_name);
}
break;
case PARTIAL_TRAIN:
if (fetches_.find(var_name) != fetches_.end() ||
post_op_inputs_.find(var_name) != post_op_inputs_.end() ||
persistables_.find(var_name) != persistables_.end()) {
var_out_.push_back(var_name);
}
break;
case FULL_TRAIN:
if (fetches_.find(var_name) != fetches_.end() ||
persistables_.find(var_name) != persistables_.end()) {
var_out_.push_back(var_name);
}
break;
default:
var_out_.push_back(var_name);
}
}
}
}
}
void NgraphOperator::BuildNgFunction() {
BuildNgNodes();
ngraph_function_ = nullptr;
ngraph::NodeVector func_outputs;
ngraph::op::ParameterVector func_inputs;
for (auto& vo : var_out_) {
func_outputs.push_back(var_node_map_->at(vo));
}
for (auto& vi : var_in_) {
std::shared_ptr<ngraph::op::Parameter> prm =
std::dynamic_pointer_cast<ngraph::op::Parameter>(
var_in_node_map_->at(vi));
func_inputs.push_back(prm);
}
ngraph_function_ =
std::make_shared<ngraph::Function>(func_outputs, func_inputs);
}
std::shared_ptr<std::string> NgraphOperator::GetCacheKey() {
auto cache_key = std::make_shared<std::string>("");
*cache_key += std::to_string(fused_ops_.size());
for (auto& op : fused_ops_) {
*cache_key += op->Type();
}
for (auto& var_name : var_in_) {
auto shape = var_node_map_->at(var_name)->get_shape();
*cache_key += var_name;
*cache_key += var_type_map_.at(var_name).c_type_string();
for (size_t i = 0; i < shape.size(); ++i) {
*cache_key += std::to_string(shape.at(i));
}
}
for (auto& var_name : var_out_) {
auto* var = scope_.FindVar(var_name);
if (var && var->IsType<LoDTensor>()) {
auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var);
auto& ddim = tensor_pd->dims();
for (int i = 0; i < ddim.size(); ++i) {
*cache_key += std::to_string(ddim[i]);
}
}
}
return cache_key;
}
void NgraphOperator::GetNgFunction() {
bool cache_on = true;
if (cache_on) {
std::string cache_key_val = *GetCacheKey();
if (func_cache_.find(cache_key_val) != func_cache_.end()) {
ngraph_function_ = func_cache_.at(cache_key_val);
} else {
BuildNgFunction();
func_cache_[cache_key_val] = ngraph_function_;
}
} else {
BuildNgFunction();
}
}
void NgraphOperator::Run(const Scope& scope,
const platform::Place& place) const {
std::vector<std::shared_ptr<ngraph::runtime::Tensor>> t_in;
std::vector<std::shared_ptr<ngraph::runtime::Tensor>> t_out;
for (size_t i = 0; i < var_in_.size(); ++i) {
auto vi = var_in_.at(i);
auto sp = var_node_map_->at(vi)->get_shape();
std::shared_ptr<ngraph::runtime::Tensor> ti;
auto* var = scope.FindVar(vi);
if (var && var->IsType<LoDTensor>()) {
auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var);
PADDLE_ENFORCE(sp == Ddim2Shape(tensor_pd->dims()),
"Ensure ngraph tensor layout align with paddle tensor");
if (tensor_pd->type().hash_code() ==
typeid(float).hash_code()) { // NOLINT
const float* arr = tensor_pd->data<float>();
ti = backend_->create_tensor(ngraph::element::f32, sp,
const_cast<float*>(arr));
} else if (tensor_pd->type().hash_code() ==
typeid(int).hash_code()) { // NOLINT
const int* arr = tensor_pd->data<int>();
ti = backend_->create_tensor(ngraph::element::i32, sp,
const_cast<int*>(arr));
} else if (tensor_pd->type().hash_code() == typeid(int64_t).hash_code()) {
const int64_t* arr = tensor_pd->data<int64_t>();
ti = backend_->create_tensor(ngraph::element::i64, sp,
const_cast<int64_t*>(arr));
} else if (tensor_pd->type().hash_code() ==
typeid(double).hash_code()) { // NOLINT
const double* arr = tensor_pd->data<double>();
ti = backend_->create_tensor(ngraph::element::f64, sp,
const_cast<double*>(arr));
} else if (tensor_pd->type().hash_code() ==
typeid(bool).hash_code()) { // NOLINT
const bool* arr = tensor_pd->data<bool>();
ti = backend_->create_tensor(ngraph::element::boolean, sp,
const_cast<bool*>(arr));
} else {
PADDLE_THROW("Data type not handling for var %s", vi);
}
} else {
PADDLE_THROW("Cannot find var or tensor with var name %s", vi);
}
bool is_test = (ng_op_state_ == PARTIAL_TEST || ng_op_state_ == FULL_TEST)
? true
: false;
bool is_persistable =
(persistables_.find(vi) != persistables_.end()) ? true : false;
if (is_test && is_persistable) {
ti->set_stale(false);
}
t_in.push_back(ti);
}
for (size_t i = 0; i < var_out_.size(); ++i) {
auto var_name = var_out_[i];
auto* var = scope.FindVar(var_name);
std::shared_ptr<ngraph::runtime::Tensor> to;
if (var && var->IsType<LoDTensor>()) {
auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var);
auto dd = tensor_pd->dims();
ngraph::Shape sp = Ddim2Shape(dd);
auto ng_type = var_type_map_.at(var_name);
if (ng_type == ngraph::element::f32) {
auto pd_arr = tensor_pd->mutable_data<float>(place);
to = backend_->create_tensor(ngraph::element::f32, sp, pd_arr);
} else if (ng_type == ngraph::element::i64) {
auto pd_arr = tensor_pd->mutable_data<int64_t>(place);
to = backend_->create_tensor(ngraph::element::i64, sp, pd_arr);
} else if (ng_type == ngraph::element::f64) {
auto pd_arr = tensor_pd->mutable_data<double>(place);
to = backend_->create_tensor(ngraph::element::f64, sp, pd_arr);
} else if (ng_type == ngraph::element::boolean) {
auto pd_arr = tensor_pd->mutable_data<bool>(place);
to = backend_->create_tensor(ngraph::element::boolean, sp, pd_arr);
} else {
PADDLE_THROW("Data type not handled in for var %s", var_name);
}
t_out.push_back(to);
} else {
PADDLE_THROW("Cannot find var or tensor with var name %s", var_name);
}
}
backend_->call(ngraph_function_, t_out, t_in);
} // NgraphOperator::RunImpl
} // namespace framework
} // namespace paddle
#endif
......@@ -17,24 +17,19 @@ limitations under the License. */
#ifdef PADDLE_WITH_NGRAPH
#include <algorithm>
#include <atomic>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/attribute.h"
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/ngraph_bridge.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/op_kernel_type.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/variant.h"
#include "ngraph/ngraph.hpp"
#include "ngraph/type/element_type.hpp"
namespace paddle {
namespace framework {
......
......@@ -81,13 +81,35 @@ class CompileTimeInferShapeContext : public InferShapeContext {
"The %s[%d] is @EMPTY@", out, j);
auto *in_var = block_.FindVarRecursive(Inputs(in)[i]);
auto *out_var = block_.FindVarRecursive(Outputs(out)[j]);
if (in_var->GetType() != proto::VarType::LOD_TENSOR) {
VLOG(30) << "input " << in << " is not LodTensor";
if (in_var->GetType() != proto::VarType::LOD_TENSOR &&
in_var->GetType() != proto::VarType::LOD_TENSOR_ARRAY) {
VLOG(3) << "input " << in << " is not LodTensor or LodTensorArray.";
return;
}
out_var->SetLoDLevel(in_var->GetLoDLevel());
}
void DecreaseLoDLevel(const std::string &in, const std::string &out,
size_t i = 0, size_t j = 0) const override {
PADDLE_ENFORCE_LT(i, Inputs(in).size());
PADDLE_ENFORCE_LT(j, Outputs(out).size());
PADDLE_ENFORCE(Inputs(in)[i] != framework::kEmptyVarName,
"The %s[%d] is @EMPTY@", in, i);
PADDLE_ENFORCE(Outputs(out)[j] != framework::kEmptyVarName,
"The %s[%d] is @EMPTY@", out, j);
auto *in_var = block_.FindVarRecursive(Inputs(in)[i]);
auto *out_var = block_.FindVarRecursive(Outputs(out)[j]);
PADDLE_ENFORCE(out_var->GetType() == proto::VarType::LOD_TENSOR_ARRAY ||
out_var->GetType() == proto::VarType::LOD_TENSOR,
"The input %s should be LodTensorArray or LodTensor.",
out_var->Name());
PADDLE_ENFORCE(in_var->GetType() == proto::VarType::LOD_TENSOR,
"The input %s should be LodTensor.", in_var->Name());
if (in_var->GetLoDLevel() > 0) {
out_var->SetLoDLevel(in_var->GetLoDLevel() - 1);
}
}
bool IsRuntime() const override;
protected:
......@@ -241,38 +263,38 @@ void OpDesc::SetAttr(const std::string &name, const Attribute &v) {
const proto::OpProto::Attr &attr = GetProtoAttr(name);
switch (attr.type()) {
case proto::AttrType::BOOLEANS: {
VLOG(110) << "SetAttr: " << Type() << ", " << name
<< " from INTS to BOOLEANS";
VLOG(11) << "SetAttr: " << Type() << ", " << name
<< " from INTS to BOOLEANS";
this->attrs_[name] = std::vector<bool>();
break;
}
case proto::AttrType::INTS: {
VLOG(110) << "SetAttr: " << Type() << ", " << name
<< " from INTS to INTS";
VLOG(11) << "SetAttr: " << Type() << ", " << name
<< " from INTS to INTS";
this->attrs_[name] = std::vector<int>();
break;
}
case proto::AttrType::LONGS: {
VLOG(110) << "SetAttr: " << Type() << ", " << name
<< " from LONGS to LONGS";
VLOG(11) << "SetAttr: " << Type() << ", " << name
<< " from LONGS to LONGS";
this->attrs_[name] = std::vector<int64_t>();
break;
}
case proto::AttrType::FLOATS: {
VLOG(110) << "SetAttr: " << Type() << ", " << name
<< " from INTS to FLOATS";
VLOG(11) << "SetAttr: " << Type() << ", " << name
<< " from INTS to FLOATS";
this->attrs_[name] = std::vector<float>();
break;
}
case proto::AttrType::STRINGS: {
VLOG(110) << "SetAttr: " << Type() << ", " << name
<< " from INTS to STRINGS";
VLOG(11) << "SetAttr: " << Type() << ", " << name
<< " from INTS to STRINGS";
this->attrs_[name] = std::vector<std::string>();
break;
}
case proto::AttrType::BLOCKS: {
VLOG(110) << "SetAttr: " << Type() << ", " << name
<< " from INTS to BLOCKS";
VLOG(11) << "SetAttr: " << Type() << ", " << name
<< " from INTS to BLOCKS";
this->SetBlocksAttr(name, std::vector<BlockDesc *>());
return;
}
......@@ -505,13 +527,13 @@ void OpDesc::CheckAttrs() {
}
void OpDesc::InferShape(const BlockDesc &block) const {
VLOG(30) << "CompileTime infer shape on " << Type();
VLOG(3) << "CompileTime infer shape on " << Type();
InitInferShapeFuncs();
auto &infer_shape = OpInfoMap::Instance().Get(this->Type()).infer_shape_;
PADDLE_ENFORCE(static_cast<bool>(infer_shape),
"%s's infer_shape has not been registered", this->Type());
CompileTimeInferShapeContext ctx(*this, block);
if (VLOG_IS_ON(100)) {
if (VLOG_IS_ON(10)) {
std::ostringstream sout;
auto inames = this->InputArgumentNames();
sout << " From [";
......@@ -522,7 +544,7 @@ void OpDesc::InferShape(const BlockDesc &block) const {
std::copy(onames.begin(), onames.end(),
std::ostream_iterator<std::string>(sout, ", "));
sout << "]";
VLOG(100) << sout.str();
VLOG(10) << sout.str();
}
infer_shape(&ctx);
}
......@@ -613,7 +635,7 @@ DDim CompileTimeInferShapeContext::GetDim(const std::string &name) const {
auto shape = var->GetShape();
res = shape.empty() ? make_ddim({0UL}) : make_ddim(shape);
} catch (...) {
VLOG(50) << "GetDim of variable " << name << " error";
VLOG(5) << "GetDim of variable " << name << " error";
std::rethrow_exception(std::current_exception());
}
return res;
......@@ -630,7 +652,7 @@ std::vector<DDim> CompileTimeInferShapeContext::GetRepeatedDims(
res.push_back(s.empty() ? make_ddim({0UL}) : make_ddim(s));
}
} catch (...) {
VLOG(50) << "GetRepeatedDim of variable " << name << " error.";
VLOG(5) << "GetRepeatedDim of variable " << name << " error.";
std::rethrow_exception(std::current_exception());
}
return res;
......
......@@ -46,9 +46,9 @@ static VariableNameMap ConvertOpDescVarsToVarNameMap(
std::unique_ptr<OperatorBase> OpRegistry::CreateOp(
const proto::OpDesc& op_desc) {
VLOG(10) << "CreateOp directly from OpDesc is deprecated. It should only be"
"used in unit tests. Use CreateOp(const OpDesc& op_desc) "
"instead.";
VLOG(1) << "CreateOp directly from OpDesc is deprecated. It should only be"
"used in unit tests. Use CreateOp(const OpDesc& op_desc) "
"instead.";
VariableNameMap inputs = ConvertOpDescVarsToVarNameMap(op_desc.inputs());
VariableNameMap outputs = ConvertOpDescVarsToVarNameMap(op_desc.outputs());
AttributeMap attrs;
......
......@@ -139,7 +139,7 @@ static LoD GetLoD(const Scope& scope, const std::string& name) {
}
void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
VLOG(40) << place << " " << DebugStringEx(&scope);
VLOG(4) << place << " " << DebugStringEx(&scope);
if (platform::is_gpu_place(place)) {
#ifndef PADDLE_WITH_CUDA
PADDLE_THROW("Cannot run operator on place %s", place);
......@@ -149,20 +149,17 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
#endif
}
// The profile has a process-wide mutex, results in serious performance issue
// in concurrency scenerio. Here use an `if` to fix this issue.
// Please not remove the `if`, ask @Superjomn if there are any concern.
#ifndef _WIN32
// The profile has a process-wide mutex, results in serious performance issue
// in concurrency scenerio. Here use an `if` to fix this issue.
// Please not remove the `if`, ask @Superjomn if there are any concern.
if (platform::IsProfileEnabled()) {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
platform::RecordEvent record_event(Type(), pool.Get(place));
RunImpl(scope, place);
} else
#endif
{
} else {
RunImpl(scope, place);
}
VLOG(30) << place << " " << DebugStringEx(&scope);
VLOG(3) << place << " " << DebugStringEx(&scope);
}
bool OperatorBase::HasInputs(const std::string& name) const {
......@@ -626,6 +623,11 @@ class RuntimeInferShapeContext : public InferShapeContext {
out_tensor->set_layout(in_tensor.layout());
}
void DecreaseLoDLevel(const std::string& in, const std::string& out,
size_t i = 0, size_t j = 0) const override {
PADDLE_THROW("DecreaseLoDLevel is only used in compile time.");
}
bool IsRuntime() const override { return true; }
protected:
......@@ -693,6 +695,12 @@ static void CheckTensorNANOrInf(const std::string& name,
"Tensor %s contains NAN", name);
}
void OperatorWithKernel::RuntimeInferShape(const Scope& scope,
const platform::Place& place) const {
RuntimeInferShapeContext infer_shape_ctx(*this, scope);
this->InferShape(&infer_shape_ctx);
}
void OperatorWithKernel::RunImpl(const Scope& scope,
const platform::Place& place) const {
RuntimeInferShapeContext infer_shape_ctx(*this, scope);
......@@ -719,14 +727,14 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
auto expected_kernel_key =
this->GetExpectedKernelType(ExecutionContext(*this, scope, *dev_ctx));
VLOG(30) << "expected_kernel_key:" << expected_kernel_key;
VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
auto kernel_iter = kernels.find(expected_kernel_key);
#ifdef PADDLE_WITH_MKLDNN
// workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set
if (kernel_iter == kernels.end() &&
expected_kernel_key.library_type_ == LibraryType::kMKLDNN) {
VLOG(30) << "missing MKLDNN kernel: fallbacking to PLAIN one";
VLOG(3) << "missing MKLDNN kernel: fallbacking to PLAIN one";
expected_kernel_key.library_type_ = LibraryType::kPlain;
expected_kernel_key.data_layout_ = DataLayout::kAnyLayout;
kernel_iter = kernels.find(expected_kernel_key);
......@@ -778,8 +786,7 @@ void OperatorWithKernel::TransferInplaceVarsBack(
const Scope& scope, const std::vector<std::string>& inplace_vars,
const Scope& transfer_scope) const {
for (auto& var_name : inplace_vars) {
VLOG(30) << "share inplace var " + var_name +
" back to it's original scope";
VLOG(3) << "share inplace var " + var_name + " back to it's original scope";
auto* original_tensor =
GetMutableLoDTensorOrSelectedRowsValueFromVar(scope.FindVar(var_name));
auto* var = transfer_scope.FindVar(var_name);
......@@ -820,8 +827,8 @@ Scope* OperatorWithKernel::TryTransferData(
transfered_inplace_vars->emplace_back(var_name);
}
VLOG(30) << "Transform Variable " << var_name << " from "
<< kernel_type_for_var << " to " << expected_kernel_key;
VLOG(3) << "Transform Variable " << var_name << " from "
<< kernel_type_for_var << " to " << expected_kernel_key;
// In the inference scenerio, the scopes will be reused across the
// batches, so the `new_scope` here will result in GPU memroy explosion
......
......@@ -71,7 +71,7 @@ class OperatorBase;
class ExecutionContext;
/**
* OperatorBase has the basic element that Net will call to do computation.
* OperatorBase has the basic elements that Net will call to do computation.
* Only CreateOperator from OpRegistry will new Operator directly. User
* should always construct a proto message OpDesc and call
* OpRegistry::CreateOp(op_desc) to get an Operator instance.
......@@ -128,6 +128,8 @@ class OperatorBase {
virtual std::vector<std::string> OutputVars(bool has_intermediate) const;
void SetIsCalledByExecutor(bool x) { run_by_executor_ = x; }
virtual void RuntimeInferShape(const Scope& scope,
const platform::Place& place) const {}
protected:
std::string type_;
......@@ -348,6 +350,9 @@ class OperatorWithKernel : public OperatorBase {
OpInfoMap::Instance().Get(Type()).infer_shape_(ctx);
}
void RuntimeInferShape(const Scope& scope,
const platform::Place& place) const override;
protected:
virtual OpKernelType GetExpectedKernelType(const ExecutionContext& ctx) const;
virtual OpKernelType GetKernelTypeForVar(
......
......@@ -20,7 +20,7 @@ limitations under the License. */
#include "paddle/fluid/framework/ir/graph.h"
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
#include "paddle/fluid/platform/nccl_helper.h"
#endif
......@@ -54,7 +54,7 @@ class ParallelExecutorPrivate {
Scope *global_scope_; // not owned
std::unique_ptr<details::SSAGraphExecutor> executor_;
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
#endif
bool own_local_scope_;
......@@ -104,7 +104,7 @@ ParallelExecutor::ParallelExecutor(
if (member_->use_cuda_) {
// Bcast Parameters to all GPUs
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
auto *nccl_id_var = scope->FindVar(NCCL_ID_VARNAME);
ncclUniqueId *nccl_id = nullptr;
if (nccl_id_var != nullptr) {
......@@ -124,7 +124,7 @@ ParallelExecutor::ParallelExecutor(
// Step 2. Convert main_program to SSA form and dependency graph. Also, insert
// ncclOp
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
std::unique_ptr<ir::Graph> graph = build_strategy.Apply(
main_program, member_->places_, loss_var_name, params,
member_->local_scopes_, member_->use_cuda_, member_->nccl_ctxs_.get());
......@@ -208,12 +208,12 @@ void ParallelExecutor::BCastParamsToDevices(
auto &main_tensor = main_var->Get<LoDTensor>();
if (!main_tensor.IsInitialized()) {
VLOG(30) << "one in var not inited, return!";
VLOG(3) << "one in var not inited, return!";
continue;
}
auto &dims = main_tensor.dims();
if (paddle::platform::is_gpu_place(main_tensor.place())) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
std::vector<void *> buffers;
size_t numel = main_tensor.numel();
ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type());
......
......@@ -162,7 +162,7 @@ Variable* Scope::VarInternal(const std::string& name) {
v = new Variable();
vars_[name].reset(v);
VLOG(30) << "Create variable " << name;
VLOG(3) << "Create variable " << name;
v->name_ = &(vars_.find(name)->first);
return v;
}
......
......@@ -206,7 +206,7 @@ void SelectedRows::Get(const framework::Tensor& ids, framework::Tensor* value,
PADDLE_ENFORCE(value->IsInitialized(),
"The value tensor should be initialized.");
if (ids.numel() == 0) {
VLOG(30) << "keys is empty, please check data!";
VLOG(3) << "keys is empty, please check data!";
} else {
int64_t value_width = value_->numel() / value_->dims()[0];
PADDLE_ENFORCE_EQ(value_width, value->numel() / value->dims()[0],
......
......@@ -120,8 +120,22 @@ class SelectedRows {
*/
int64_t AutoGrownIndex(int64_t key, bool auto_grown, bool is_test = false);
void SyncIndex();
/*
* @brief Get the index of the key from id_to_index_ map.
*/
inline int64_t GetIndexFromId(int64_t key) {
auto iter = id_to_index_.find(key);
if (iter == id_to_index_.end()) {
return -1;
} else {
return iter->second;
}
}
void SyncIndex();
/*
* @brief Get complete Dims before
*/
DDim GetCompleteDims() const {
std::vector<int64_t> dims = vectorize(value_->dims());
dims[0] = height_;
......@@ -133,9 +147,10 @@ class SelectedRows {
// SelectedRows are simply concated when adding together. Until a
// SelectedRows add a Tensor, will the duplicate rows be handled.
Vector<int64_t> rows_;
std::unordered_map<int64_t, int64_t> id_to_index_;
std::unordered_map<int64_t, int64_t>
id_to_index_; // should not be used when rows_ has duplicate member
std::unique_ptr<Tensor> value_{nullptr};
int64_t height_;
int64_t height_; // height indicates the underline tensor's height
std::unique_ptr<RWLock> rwlock_{nullptr};
};
......
......@@ -62,6 +62,9 @@ class InferShapeContext {
virtual void ShareLoD(const std::string &in, const std::string &out,
size_t i = 0, size_t j = 0) const = 0;
virtual void DecreaseLoDLevel(const std::string &in, const std::string &out,
size_t i = 0, size_t j = 0) const = 0;
virtual bool IsRuntime() const = 0;
std::vector<InferShapeVarPtr> GetInputVarPtrs(const std::string &name);
......
......@@ -22,8 +22,8 @@ namespace framework {
void TensorCopy(const Tensor& src, const platform::Place& dst_place,
const platform::DeviceContext& ctx, Tensor* dst) {
VLOG(30) << "TensorCopy " << src.dims() << " from " << src.place() << " to "
<< dst_place;
VLOG(3) << "TensorCopy " << src.dims() << " from " << src.place() << " to "
<< dst_place;
src.check_memory_size();
dst->Resize(src.dims());
......@@ -37,8 +37,8 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
if (src_ptr == dst_ptr) {
VLOG(30) << "Skip copy the same data async from " << src_place << " to "
<< dst_place;
VLOG(3) << "Skip copy the same data async from " << src_place << " to "
<< dst_place;
return;
}
memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
......@@ -77,8 +77,8 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
if (platform::is_same_place(src_place, dst_place)) {
if (src_ptr == dst_ptr) {
VLOG(30) << "Skip copy the same data async from " << src_place << " to "
<< dst_place;
VLOG(3) << "Skip copy the same data async from " << src_place << " to "
<< dst_place;
return;
}
memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
......@@ -114,8 +114,8 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
Tensor* dst) {
VLOG(30) << "TensorCopySync " << src.dims() << " from " << src.place()
<< " to " << dst_place;
VLOG(3) << "TensorCopySync " << src.dims() << " from " << src.place()
<< " to " << dst_place;
src.check_memory_size();
dst->Resize(src.dims());
dst->set_layout(src.layout());
......@@ -125,8 +125,8 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
auto size = src.numel() * SizeOfType(src.type());
if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
if (src_ptr == dst_ptr) {
VLOG(30) << "Skip copy the same data from " << src_place << " to "
<< dst_place;
VLOG(3) << "Skip copy the same data from " << src_place << " to "
<< dst_place;
return;
}
memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
......@@ -146,8 +146,8 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
} else if (platform::is_gpu_place(src_place) &&
platform::is_gpu_place(dst_place)) {
if (src_ptr == dst_ptr && platform::is_same_place(src_place, dst_place)) {
VLOG(30) << "Skip copy the same data from " << src_place << " to "
<< dst_place;
VLOG(3) << "Skip copy the same data from " << src_place << " to "
<< dst_place;
return;
}
auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
......
......@@ -39,7 +39,7 @@ void ThreadPool::Init() {
int num_threads = std::thread::hardware_concurrency();
if (FLAGS_dist_threadpool_size > 0) {
num_threads = FLAGS_dist_threadpool_size;
VLOG(10) << "set dist_threadpool_size to " << num_threads;
VLOG(1) << "set dist_threadpool_size to " << num_threads;
}
PADDLE_ENFORCE_GT(num_threads, 0);
threadpool_.reset(new ThreadPool(num_threads));
......
......@@ -17,28 +17,16 @@
namespace paddle {
namespace framework {
// Holds all the transfer scope across the process.
std::unordered_map<size_t, Scope*>& global_transfer_data_cache() {
typedef std::unordered_map<size_t, Scope*> map_t;
thread_local std::unique_ptr<map_t> x(new map_t);
thread_local auto* x = new std::unordered_map<size_t, Scope*>;
return *x;
}
// Holds all the transfer scope for this thread.
std::unordered_set<Scope*>& global_transfer_scope_cache() {
typedef std::unordered_set<Scope*> set_t;
thread_local std::unique_ptr<set_t> x(new set_t);
thread_local auto* x = new std::unordered_set<Scope*>;
return *x;
}
// Try to create a transfer scope. If one cached scope has match the
// requirement, just return that one.
// Inputs:
// @type0: the source kernel type.
// @type1: the target kernel type.
// @scope: the execution scope of this op.
// Returns: A scope used to hold the transfer data across the different kernel
// type.
Scope* TryCreateTransferScope(OpKernelType type0, OpKernelType type1,
const Scope* scope) {
Scope* new_scope{nullptr};
......@@ -58,5 +46,27 @@ Scope* TryCreateTransferScope(OpKernelType type0, OpKernelType type1,
return new_scope;
}
void RemoveKidsFromTransferScopeCache(Scope* scope) {
auto it = global_transfer_scope_cache().find(scope);
if (it != global_transfer_scope_cache().end()) {
global_transfer_scope_cache().erase(it);
}
for (auto* s : scope->kids()) {
auto it = global_transfer_scope_cache().find(s);
if (it != global_transfer_scope_cache().end()) {
global_transfer_scope_cache().erase(it);
}
}
// remove global transfer data cache
auto& cache = global_transfer_data_cache();
for (auto it = cache.begin(); it != cache.end();) {
if (it->second == scope)
it = cache.erase(it);
else
it++;
}
}
} // namespace framework
} // namespace paddle
......@@ -61,10 +61,10 @@ size_t VarDesc::GetTensorDescNum() const {
void VarDesc::SetShapes(
const std::vector<std::vector<int64_t>> &multiple_dims) {
if (multiple_dims.size() != GetTensorDescNum()) {
VLOG(30) << "WARNING: The number of given shapes(" << multiple_dims.size()
<< ") doesn't match the existing tensor number("
<< GetTensorDescNum()
<< "). The Reader is going to be reinitialized.";
VLOG(3) << "WARNING: The number of given shapes(" << multiple_dims.size()
<< ") doesn't match the existing tensor number("
<< GetTensorDescNum()
<< "). The Reader is going to be reinitialized.";
SetTensorDescNum(multiple_dims.size());
}
std::vector<proto::VarType::TensorDesc *> tensors = mutable_tensor_descs();
......@@ -94,11 +94,11 @@ void VarDesc::SetDataType(proto::VarType::Type data_type) {
void VarDesc::SetDataTypes(
const std::vector<proto::VarType::Type> &multiple_data_type) {
if (multiple_data_type.size() != GetTensorDescNum()) {
VLOG(30) << "WARNING: The number of given data types("
<< multiple_data_type.size()
<< ") doesn't match the existing tensor number("
<< GetTensorDescNum()
<< "). The Reader is going to be reinitialized.";
VLOG(3) << "WARNING: The number of given data types("
<< multiple_data_type.size()
<< ") doesn't match the existing tensor number("
<< GetTensorDescNum()
<< "). The Reader is going to be reinitialized.";
SetTensorDescNum(multiple_data_type.size());
}
std::vector<proto::VarType::TensorDesc *> tensor_descs =
......@@ -139,11 +139,11 @@ void VarDesc::SetLoDLevel(int32_t lod_level) {
void VarDesc::SetLoDLevels(const std::vector<int32_t> &multiple_lod_level) {
if (multiple_lod_level.size() != GetTensorDescNum()) {
VLOG(30) << "WARNING: The number of given lod_levels("
<< multiple_lod_level.size()
<< ") doesn't match the existing tensor number("
<< GetTensorDescNum()
<< "). The Reader is going to be reinitialized.";
VLOG(3) << "WARNING: The number of given lod_levels("
<< multiple_lod_level.size()
<< ") doesn't match the existing tensor number("
<< GetTensorDescNum()
<< "). The Reader is going to be reinitialized.";
SetTensorDescNum(multiple_lod_level.size());
}
switch (desc_.type().type()) {
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/variable_helper.h"
#include <vector>
#include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/lod_rank_table.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace framework {
void InitializeVariable(Variable* var, proto::VarType::Type var_type) {
if (var_type == proto::VarType::LOD_TENSOR) {
var->GetMutable<LoDTensor>();
} else if (var_type == proto::VarType::SELECTED_ROWS) {
var->GetMutable<SelectedRows>();
} else if (var_type == proto::VarType::FEED_MINIBATCH) {
var->GetMutable<FeedFetchList>();
} else if (var_type == proto::VarType::FETCH_LIST) {
var->GetMutable<FeedFetchList>();
} else if (var_type == proto::VarType::STEP_SCOPES) {
var->GetMutable<std::vector<framework::Scope*>>();
} else if (var_type == proto::VarType::LOD_RANK_TABLE) {
var->GetMutable<LoDRankTable>();
} else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) {
var->GetMutable<LoDTensorArray>();
} else if (var_type == proto::VarType::PLACE_LIST) {
var->GetMutable<platform::PlaceList>();
} else if (var_type == proto::VarType::READER) {
var->GetMutable<ReaderHolder>();
} else if (var_type == proto::VarType::RAW) {
// GetMutable will be called in operator
} else {
PADDLE_THROW(
"Variable type %d is not in "
"[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, "
"LOD_RANK_TABLE, PLACE_LIST, READER, RAW]",
var_type);
}
}
} // namespace framework
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/variable.h"
namespace paddle {
namespace framework {
void InitializeVariable(Variable *var, proto::VarType::Type var_type);
}
}
......@@ -35,4 +35,5 @@ function(inference_analysis_test TARGET)
endif()
endfunction(inference_analysis_test)
inference_analysis_test(test_analyzer SRCS analyzer_tester.cc EXTRA_DEPS reset_tensor_array paddle_inference_api)
inference_analysis_test(test_analyzer SRCS analyzer_tester.cc
EXTRA_DEPS reset_tensor_array paddle_inference_api)
......@@ -46,8 +46,6 @@ class AnalysisPass {
protected:
// User should implement these.
virtual void RunImpl(Argument* argument) = 0;
Argument* argument_{nullptr};
};
} // namespace analysis
......
......@@ -19,6 +19,7 @@
#include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/platform/port.h"
namespace paddle {
namespace inference {
......@@ -75,7 +76,8 @@ void TestWord2vecPrediction(const std::string& model_path) {
0.000932706};
const size_t num_elements = outputs.front().data.length() / sizeof(float);
// The outputs' buffers are in CPU memory.
for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
for (size_t i = 0; i < std::min(static_cast<size_t>(5UL), num_elements);
i++) {
LOG(INFO) << "data: "
<< static_cast<float*>(outputs.front().data.data())[i];
PADDLE_ENFORCE(static_cast<float*>(outputs.front().data.data())[i],
......
cc_library(ir_graph_build_pass SRCS ir_graph_build_pass.cc DEPS analysis_pass argument ir_pass_manager)
cc_library(ir_analysis_pass SRCS ir_analysis_pass.cc DEPS analysis_pass argument ir_pass_manager)
cc_library(analysis_passes SRCS passes.cc DEPS ir_graph_build_pass ir_analysis_pass)
cc_library(ir_params_sync_among_devices_pass SRCS ir_params_sync_among_devices_pass.cc DEPS analysis_pass argument ir_pass_manager)
cc_library(analysis_passes SRCS passes.cc DEPS ir_graph_build_pass ir_analysis_pass ir_params_sync_among_devices_pass)
set(analysis_deps ${analysis_deps}
ir_graph_build_pass
......
......@@ -61,6 +61,7 @@ void IrAnalysisComposePass::InitTensorRTAttrs(Argument *argument) {
void IrAnalysisComposePass::ApplyIrPasses(Argument *argument) {
std::vector<std::string> passes({
"ir_graph_build_pass", "ir_analysis_pass",
"ir_params_sync_among_devices_pass",
});
for (const auto &pass : passes) {
VLOG(2) << "Run pass " << pass;
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册