diff --git a/Dockerfile b/Dockerfile index 634be18a51bf61e96a8bf6f263b6674a7932d6e4..738bba9bc2e1ab19709722fe04f1490b1b13bd4f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -24,6 +24,7 @@ COPY ./paddle/scripts/docker/root/ /root/ RUN apt-get update && \ apt-get install -y --allow-downgrades patchelf \ + python3 python3-dev python3-pip \ git python-pip python-dev python-opencv openssh-server bison \ libnccl2=2.1.2-1+cuda8.0 libnccl-dev=2.1.2-1+cuda8.0 \ wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \ @@ -70,24 +71,33 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8 # specify sphinx version as 1.5.6 and remove -U option for [pip install -U # sphinx-rtd-theme] since -U option will cause sphinx being updated to newest # version(1.7.1 for now), which causes building documentation failed. -RUN easy_install -U pip && \ +RUN pip3 install -U wheel && \ + pip3 install -U docopt PyYAML sphinx==1.5.6 && \ + pip3 install sphinx-rtd-theme==0.1.9 recommonmark && \ + easy_install -U pip && \ pip install -U wheel && \ pip install -U docopt PyYAML sphinx==1.5.6 && \ pip install sphinx-rtd-theme==0.1.9 recommonmark -RUN pip install pre-commit 'ipython==5.3.0' && \ +RUN pip3 install pre-commit 'ipython==5.3.0' && \ + pip3 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ + pip3 install opencv-python && \ + pip install pre-commit 'ipython==5.3.0' && \ pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ pip install opencv-python #For docstring checker +RUN pip3 install pylint pytest astroid isort RUN pip install pylint pytest astroid isort LinkChecker COPY ./python/requirements.txt /root/ +RUN pip3 install -r /root/requirements.txt RUN pip install -r /root/requirements.txt # To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use # the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2 RUN apt-get install -y libssl-dev libffi-dev +RUN pip3 install certifi urllib3[secure] RUN pip install certifi urllib3[secure] diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake index 6ed51c648478efb9784d0c43b169c285e740e0f3..24de8d9d7ced5f8111cc5d65f761b7506bde048e 100644 --- a/cmake/cblas.cmake +++ b/cmake/cblas.cmake @@ -40,7 +40,7 @@ set(OPENBLAS_LIB_SEARCH_PATHS /usr/local/opt/openblas/lib) find_path(OPENBLAS_INC_DIR NAMES cblas.h - PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS}) + PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS} NO_DEFAULT_PATH) find_path(OPENBLAS_LAPACKE_INC_DIR NAMES lapacke.h PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS}) find_library(OPENBLAS_LIB NAMES openblas diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index c3fbe4dbdb28f1008bb274ee18293db348bfc6ed..755dbd610c40c2d9b85d3017b6f000a869b0f39a 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -27,7 +27,7 @@ IF(NOT ${CBLAS_FOUND}) SET(CBLAS_SOURCES_DIR ${THIRD_PARTY_PATH}/openblas) SET(CBLAS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/openblas) - SET(CBLAS_INCLUDE_DIR "${CBLAS_INSTALL_DIR}/include" CACHE PATH "openblas include directory." FORCE) + SET(CBLAS_INC_DIR "${CBLAS_INSTALL_DIR}/include" CACHE PATH "openblas include directory." FORCE) SET(CBLAS_LIBRARIES "${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}" @@ -96,7 +96,7 @@ IF(NOT ${CBLAS_FOUND}) ENDIF(NOT WIN32) SET(CBLAS_PROVIDER openblas) IF(WITH_C_API) - INSTALL(DIRECTORY ${CBLAS_INCLUDE_DIR} DESTINATION third_party/openblas) + INSTALL(DIRECTORY ${CBLAS_INC_DIR} DESTINATION third_party/openblas) # Because libopenblas.a is a symbolic link of another library, thus need to # install the whole directory. IF(ANDROID) @@ -117,8 +117,8 @@ IF(NOT ${CBLAS_FOUND}) ENDIF(NOT ${CBLAS_FOUND}) MESSAGE(STATUS "BLAS library: ${CBLAS_LIBRARIES}") -MESSAGE(STATUS "BLAS Include: ${CBLAS_INCLUDE_DIR}") -INCLUDE_DIRECTORIES(${CBLAS_INCLUDE_DIR}) +MESSAGE(STATUS "BLAS Include: ${CBLAS_INC_DIR}") +INCLUDE_DIRECTORIES(${CBLAS_INC_DIR}) # FIXME(gangliao): generate cblas target to track all high performance # linear algebra libraries for cc_library(xxx SRCS xxx.c DEPS cblas) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 452806a20e08c518b0f5aab7f63366eeb9341561..6418da2a7e51c51575ff56aeabedff5452458fbc 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -49,7 +49,7 @@ paddle.fluid.initializer.BilinearInitializer.__init__ ArgSpec(args=['self'], var paddle.fluid.initializer.MSRAInitializer.__init__ ArgSpec(args=['self', 'uniform', 'fan_in', 'seed'], varargs=None, keywords=None, defaults=(True, None, 0)) paddle.fluid.initializer.force_init_on_cpu ArgSpec(args=[], varargs=None, keywords=None, defaults=None) paddle.fluid.initializer.init_on_cpu ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) -paddle.fluid.layers.fc ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param_attr', 'bias_attr', 'use_mkldnn', 'act', 'is_test', 'name'], varargs=None, keywords=None, defaults=(1, None, None, False, None, False, None)) +paddle.fluid.layers.fc ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param_attr', 'bias_attr', 'act', 'is_test', 'name'], varargs=None, keywords=None, defaults=(1, None, None, None, False, None)) paddle.fluid.layers.embedding ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32')) paddle.fluid.layers.dynamic_lstm ArgSpec(args=['input', 'size', 'h_0', 'c_0', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'float32', None)) paddle.fluid.layers.dynamic_lstmp ArgSpec(args=['input', 'size', 'proj_size', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'proj_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'tanh', 'float32', None)) @@ -62,14 +62,14 @@ paddle.fluid.layers.cross_entropy ArgSpec(args=['input', 'label', 'soft_label', paddle.fluid.layers.square_error_cost ArgSpec(args=['input', 'label'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.chunk_eval ArgSpec(args=['input', 'label', 'chunk_scheme', 'num_chunk_types', 'excluded_chunk_types'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.sequence_conv ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'bias_attr', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(3, 1, None, None, None, None)) -paddle.fluid.layers.conv2d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, False, None, None)) -paddle.fluid.layers.conv3d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, False, None, None)) +paddle.fluid.layers.conv2d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None)) +paddle.fluid.layers.conv3d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None)) paddle.fluid.layers.sequence_pool ArgSpec(args=['input', 'pool_type'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'param_attr', 'bias_attr', 'use_cudnn'], varargs=None, keywords=None, defaults=(None, None, False)) paddle.fluid.layers.softmax ArgSpec(args=['input', 'param_attr', 'bias_attr', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(None, None, True, None)) -paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'use_mkldnn', 'name'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, False, None)) -paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'use_mkldnn', 'name'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, False, None)) -paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'use_mkldnn', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, False, None, None, None, False, False)) +paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None)) +paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None)) +paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False)) paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)) paddle.fluid.layers.conv3d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)) @@ -145,21 +145,31 @@ paddle.fluid.layers.unstack ArgSpec(args=['x', 'axis', 'num'], varargs=None, key paddle.fluid.layers.sequence_enumerate ArgSpec(args=['input', 'win_size', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0, None)) paddle.fluid.layers.expand ArgSpec(args=['x', 'expand_times', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.sequence_concat ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.scale ArgSpec(args=['x', 'scale', 'bias', 'bias_after_scale', 'out', 'act', 'name'], varargs=None, keywords=None, defaults=(1.0, 0.0, True, None, None, None)) -paddle.fluid.layers.elementwise_add ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None)) -paddle.fluid.layers.elementwise_div ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None)) -paddle.fluid.layers.elementwise_sub ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None)) -paddle.fluid.layers.elementwise_mul ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None)) -paddle.fluid.layers.elementwise_max ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None)) -paddle.fluid.layers.elementwise_min ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None)) -paddle.fluid.layers.elementwise_pow ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None)) +paddle.fluid.layers.scale ArgSpec(args=['x', 'scale', 'bias', 'bias_after_scale', 'act', 'name'], varargs=None, keywords=None, defaults=(1.0, 0.0, True, None, None)) +paddle.fluid.layers.elementwise_add ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)) +paddle.fluid.layers.elementwise_div ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)) +paddle.fluid.layers.elementwise_sub ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)) +paddle.fluid.layers.elementwise_mul ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)) +paddle.fluid.layers.elementwise_max ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)) +paddle.fluid.layers.elementwise_min ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)) +paddle.fluid.layers.elementwise_pow ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)) paddle.fluid.layers.uniform_random_batch_size_like ArgSpec(args=['input', 'shape', 'dtype', 'input_dim_idx', 'output_dim_idx', 'min', 'max', 'seed'], varargs=None, keywords=None, defaults=('float32', 0, 0, -1.0, 1.0, 0)) -paddle.fluid.layers.gaussian_random ArgSpec(args=['shape', 'mean', 'std', 'seed', 'dtype', 'use_mkldnn'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0, 'float32', False)) +paddle.fluid.layers.gaussian_random ArgSpec(args=['shape', 'mean', 'std', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0, 'float32')) paddle.fluid.layers.sampling_id ArgSpec(args=['x', 'min', 'max', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0, 'float32')) paddle.fluid.layers.gaussian_random_batch_size_like ArgSpec(args=['input', 'shape', 'input_dim_idx', 'output_dim_idx', 'mean', 'std', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0, 0, 0.0, 1.0, 0, 'float32')) -paddle.fluid.layers.sum ArgSpec(args=['x', 'use_mkldnn'], varargs=None, keywords=None, defaults=(False,)) +paddle.fluid.layers.sum ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.slice ArgSpec(args=['input', 'axes', 'starts', 'ends'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.shape ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.logical_and ArgSpec(args=['x', 'y', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)) +paddle.fluid.layers.logical_or ArgSpec(args=['x', 'y', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)) +paddle.fluid.layers.logical_xor ArgSpec(args=['x', 'y', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)) +paddle.fluid.layers.logical_not ArgSpec(args=['x', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)) +paddle.fluid.layers.clip ArgSpec(args=['x', 'min', 'max', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.clip_by_norm ArgSpec(args=['x', 'max_norm', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.mean ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.mul ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dims', 'name'], varargs=None, keywords=None, defaults=(1, 1, None)) +paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.maxout ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)) paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)) paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None) @@ -222,16 +232,6 @@ paddle.fluid.layers.StaticRNN.update_memory ArgSpec(args=['self', 'mem', 'var'], paddle.fluid.layers.reorder_lod_tensor_by_rank ArgSpec(args=['x', 'rank_table'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.Print ArgSpec(args=['input', 'first_n', 'message', 'summarize', 'print_tensor_name', 'print_tensor_type', 'print_tensor_shape', 'print_tensor_lod', 'print_phase'], varargs=None, keywords=None, defaults=(-1, None, -1, True, True, True, True, 'both')) paddle.fluid.layers.is_empty ArgSpec(args=['x', 'cond'], varargs=None, keywords='ignored', defaults=(None,)) -paddle.fluid.layers.mean ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.mul ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.clip ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.clip_by_norm ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.logical_and ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.logical_or ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.logical_xor ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.logical_not ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.maxout ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.sigmoid ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.logsigmoid ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.exp ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) @@ -265,9 +265,9 @@ paddle.fluid.layers.anchor_generator ArgSpec(args=['input', 'anchor_sizes', 'asp paddle.fluid.layers.roi_perspective_transform ArgSpec(args=['input', 'rois', 'transformed_height', 'transformed_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1.0,)) paddle.fluid.layers.generate_proposal_labels ArgSpec(args=['rpn_rois', 'gt_classes', 'is_crowd', 'gt_boxes', 'im_info', 'batch_size_per_im', 'fg_fraction', 'fg_thresh', 'bg_thresh_hi', 'bg_thresh_lo', 'bbox_reg_weights', 'class_nums', 'use_random'], varargs=None, keywords=None, defaults=(256, 0.25, 0.25, 0.5, 0.0, [0.1, 0.1, 0.2, 0.2], None, True)) paddle.fluid.layers.generate_proposals ArgSpec(args=['scores', 'bbox_deltas', 'im_info', 'anchors', 'variances', 'pre_nms_top_n', 'post_nms_top_n', 'nms_thresh', 'min_size', 'eta', 'name'], varargs=None, keywords=None, defaults=(6000, 1000, 0.5, 0.1, 1.0, None)) -paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.box_coder ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.polygon_box_transform ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.iou_similarity ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.box_coder ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name'], varargs=None, keywords=None, defaults=('encode_center_size', True, None)) +paddle.fluid.layers.polygon_box_transform ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None)) paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk', 'slide_steps'], varargs=None, keywords=None, defaults=('ROC', 4095, 1, 1)) paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)) @@ -318,11 +318,11 @@ paddle.fluid.transpiler.RoundRobin.__init__ ArgSpec(args=['self', 'pserver_endpo paddle.fluid.transpiler.RoundRobin.dispatch ArgSpec(args=['self', 'varlist'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.RoundRobin.reset ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspilerConfig.__init__ -paddle.fluid.nets.simple_img_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'pool_size', 'pool_stride', 'pool_padding', 'pool_type', 'global_pooling', 'conv_stride', 'conv_padding', 'conv_dilation', 'conv_groups', 'param_attr', 'bias_attr', 'act', 'use_cudnn', 'use_mkldnn'], varargs=None, keywords=None, defaults=(0, 'max', False, 1, 0, 1, 1, None, None, None, True, False)) +paddle.fluid.nets.simple_img_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'pool_size', 'pool_stride', 'pool_padding', 'pool_type', 'global_pooling', 'conv_stride', 'conv_padding', 'conv_dilation', 'conv_groups', 'param_attr', 'bias_attr', 'act', 'use_cudnn'], varargs=None, keywords=None, defaults=(0, 'max', False, 1, 0, 1, 1, None, None, None, True)) paddle.fluid.nets.sequence_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'param_attr', 'act', 'pool_type'], varargs=None, keywords=None, defaults=(None, 'sigmoid', 'max')) paddle.fluid.nets.glu ArgSpec(args=['input', 'dim'], varargs=None, keywords=None, defaults=(-1,)) paddle.fluid.nets.scaled_dot_product_attention ArgSpec(args=['queries', 'keys', 'values', 'num_heads', 'dropout_rate'], varargs=None, keywords=None, defaults=(1, 0.0)) -paddle.fluid.nets.img_conv_group ArgSpec(args=['input', 'conv_num_filter', 'pool_size', 'conv_padding', 'conv_filter_size', 'conv_act', 'param_attr', 'conv_with_batchnorm', 'conv_batchnorm_drop_rate', 'pool_stride', 'pool_type', 'use_cudnn', 'use_mkldnn'], varargs=None, keywords=None, defaults=(1, 3, None, None, False, 0.0, 1, 'max', True, False)) +paddle.fluid.nets.img_conv_group ArgSpec(args=['input', 'conv_num_filter', 'pool_size', 'conv_padding', 'conv_filter_size', 'conv_act', 'param_attr', 'conv_with_batchnorm', 'conv_batchnorm_drop_rate', 'pool_stride', 'pool_type', 'use_cudnn'], varargs=None, keywords=None, defaults=(1, 3, None, None, False, 0.0, 1, 'max', True)) paddle.fluid.optimizer.SGDOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'regularization', 'name'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.optimizer.SGDOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.MomentumOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'momentum', 'use_nesterov', 'regularization', 'name'], varargs=None, keywords=None, defaults=(False, None, None)) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 39898dd23643c5742f209858c7d3dfad89968f7d..844291140602a7a0aac9d9d40256deaf9d8a4c60 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -1,3 +1,4 @@ + # windows treat symbolic file as a real file, which is different with unix # We create a hidden file and compile it instead of origin source file. function(windows_symbolic TARGET) @@ -9,11 +10,23 @@ function(windows_symbolic TARGET) if (NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc OR NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cu) message(FATAL " ${src}.cc and ${src}.cu must exsits, and ${src}.cu must be symbolic file.") endif() - add_custom_command(OUTPUT .${src}.cu + + # only copy the xx.cu to .xx.cu when the content are modified + set(copy_flag 1) + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu) + file(READ ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc SOURCE_STR) + file(READ ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu TARGET_STR) + if (SOURCE_STR STREQUAL TARGET_STR) + set(copy_flag 0) + endif() + endif() + if (copy_flag) + add_custom_command(OUTPUT .${src}.cu COMMAND ${CMAKE_COMMAND} -E remove ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc" "${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu" COMMENT "create hidden file of ${src}.cu") - add_custom_target(${TARGET} ALL DEPENDS .${src}.cu) + endif(copy_flag) + add_custom_target(${TARGET} ALL DEPENDS .${src}.cu) endforeach() endfunction() @@ -81,6 +94,8 @@ nv_test(data_device_transform_test SRCS data_device_transform_test.cu if(WITH_GPU) if (WIN32) + # windows treat symbolic file as a real file, which is different with unix + # We create a hidden file and compile it instead of origin source file. windows_symbolic(hidden_file SRCS data_type_transform.cu) nv_library(data_type_transform SRCS .data_type_transform.cu DEPS tensor) add_dependencies(data_type_transform hidden_file) @@ -149,7 +164,7 @@ if(WITH_DISTRIBUTE) set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) else() cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass) - cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass elementwise_add_op) + cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op) endif() if (NOT WIN32) @@ -169,15 +184,8 @@ cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows) cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto) cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc) -# cc_test(channel_test SRCS channel_test.cc) cc_test(tuple_test SRCS tuple_test.cc ) if (NOT WIN32) cc_test(rw_lock_test SRCS rw_lock_test.cc) endif (NOT WIN32) - -# disable test temporarily. -# TODO https://github.com/PaddlePaddle/Paddle/issues/11971 -# cc_test(concurrency_test SRCS concurrency_test.cc DEPS go_op channel_close_op channel_create_op -# channel_send_op channel_recv_op sum_op select_op elementwise_add_op compare_op -# conditional_block_op while_op assign_op print_op executor proto_desc) diff --git a/paddle/fluid/framework/channel.h b/paddle/fluid/framework/channel.h deleted file mode 100644 index 722bf8e8ecba0c9cbc5e3ad737dbf73148d2873c..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/channel.h +++ /dev/null @@ -1,291 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include // for size_t -#include // NOLINT -#include -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace framework { - -enum class ChannelAction { - SEND = 0, - RECEIVE = 1, - CLOSE = 2, -}; - -// Channel is the abstract class of buffered and un-buffered channels. -template -class Channel { - public: - virtual bool CanSend() = 0; - virtual bool CanReceive() = 0; - virtual void Send(T*) = 0; - virtual bool Receive(T*) = 0; - virtual size_t Cap() = 0; - virtual void Lock() = 0; - - virtual void Unlock() = 0; - virtual bool IsClosed() = 0; - virtual void Close() = 0; - virtual ~Channel() {} - - virtual void AddToSendQ(const void* referrer, T* data, - std::shared_ptr cond, - std::function cb) = 0; - virtual void AddToReceiveQ(const void* referrer, T* data, - std::shared_ptr cond, - std::function cb) = 0; - virtual void RemoveFromSendQ(const void* referrer) = 0; - virtual void RemoveFromReceiveQ(const void* referrer) = 0; -}; - -// Forward declaration of channel implementations. -template -class ChannelImpl; - -template -Channel* MakeChannel(size_t buffer_size) { - return new ChannelImpl(buffer_size); -} - -template -void CloseChannel(Channel* ch) { - ch->Close(); -} - -/* - * The ChannelHolder class serves two main purposes: - * 1. It acts as a unified wrapper for the different kinds of - * channels, i.e. Buffered and Unbuffered channels. This is - * similar to the ReaderHolder class. - * 2. It also helps us in TypeHiding. This is similar to the - * PlaceHolder implementations in variable.h and tensor.h. - */ -class ChannelHolder { - public: - template - void Reset(size_t buffer_size) { - holder_.reset(new PlaceholderImpl(buffer_size)); - } - - template - void Send(T* data) { - PADDLE_ENFORCE_EQ(IsInitialized(), true, - "The Channel hasn't been initialized"); - PADDLE_ENFORCE_EQ( - holder_->Type(), std::type_index(typeid(T)), - "Channel type is not same as the type of the data being sent"); - // Static cast should be safe because we have ensured that types are same - Channel* channel = static_cast*>(holder_->Ptr()); - PADDLE_ENFORCE_EQ(channel != nullptr, true, "Channel should not be null."); - channel->Send(data); - } - - template - bool Receive(T* data) { - PADDLE_ENFORCE_EQ(IsInitialized(), true, - "The Channel hasn't been initialized"); - PADDLE_ENFORCE_EQ( - holder_->Type(), std::type_index(typeid(T)), - "Channel type is not same as the type of the data being sent"); - Channel* channel = static_cast*>(holder_->Ptr()); - PADDLE_ENFORCE_EQ(channel != nullptr, true, "Channel should not be null."); - return channel->Receive(data); - } - - bool IsClosed() { - PADDLE_ENFORCE_EQ(IsInitialized(), true, - "The Channel hasn't been initialized"); - return holder_->IsClosed(); - } - - bool CanSend() { - PADDLE_ENFORCE_EQ(IsInitialized(), true, - "The Channel hasn't been initialized"); - return holder_->CanSend(); - } - - bool CanReceive() { - PADDLE_ENFORCE_EQ(IsInitialized(), true, - "The Channel hasn't been initialized"); - return holder_->CanReceive(); - } - - void close() { - PADDLE_ENFORCE_EQ(IsInitialized(), true, - "The Channel hasn't been initialized"); - holder_->Close(); - } - - size_t Cap() { - PADDLE_ENFORCE_EQ(IsInitialized(), true, - "The Channel hasn't been initialized"); - return holder_->Cap(); - } - - void Lock() { - PADDLE_ENFORCE_EQ(IsInitialized(), true, - "The Channel hasn't been initialized"); - holder_->Lock(); - } - - void Unlock() { - PADDLE_ENFORCE_EQ(IsInitialized(), true, - "The Channel hasn't been initialized"); - holder_->Unlock(); - } - - template - void AddToSendQ(const void* referrer, T* data, - std::shared_ptr cond, - std::function cb) { - PADDLE_ENFORCE_EQ(IsInitialized(), true, - "The Channel hasn't been initialized"); - Channel* channel = static_cast*>(holder_->Ptr()); - if (channel != nullptr) { - channel->AddToSendQ(referrer, data, cond, cb); - } - } - - template - void AddToReceiveQ(const void* referrer, T* data, - std::shared_ptr cond, - std::function cb) { - PADDLE_ENFORCE_EQ(IsInitialized(), true, - "The Channel hasn't been initialized"); - Channel* channel = static_cast*>(holder_->Ptr()); - if (channel != nullptr) { - channel->AddToReceiveQ(referrer, data, cond, cb); - } - } - - void RemoveFromSendQ(const void* referrer) { - PADDLE_ENFORCE_EQ(IsInitialized(), true, - "The Channel hasn't been initialized"); - holder_->RemoveFromSendQ(referrer); - } - - void RemoveFromReceiveQ(const void* referrer) { - PADDLE_ENFORCE_EQ(IsInitialized(), true, - "The Channel hasn't been initialized"); - holder_->RemoveFromReceiveQ(referrer); - } - - inline bool IsInitialized() const { return holder_ != nullptr; } - - inline const std::type_index Type() { - PADDLE_ENFORCE_EQ(IsInitialized(), true, - "The Channel hasn't been initialized"); - return holder_->Type(); - } - - private: - /** - * @note Placeholder hides type T, so it doesn't appear as a template - * parameter of ChannelHolder. - */ - struct Placeholder { - virtual ~Placeholder() {} - virtual const std::type_index Type() const = 0; - virtual void* Ptr() const = 0; - virtual bool IsClosed() = 0; - virtual bool CanSend() = 0; - virtual bool CanReceive() = 0; - virtual void RemoveFromSendQ(const void* referrer) = 0; - virtual void RemoveFromReceiveQ(const void* referrer) = 0; - virtual void Close() = 0; - virtual void Lock() = 0; - virtual void Unlock() = 0; - virtual size_t Cap() = 0; - }; - - template - struct PlaceholderImpl : public Placeholder { - explicit PlaceholderImpl(size_t buffer_size) - : type_(std::type_index(typeid(T))) { - channel_.reset(MakeChannel(buffer_size)); - } - - virtual const std::type_index Type() const { return type_; } - - virtual void* Ptr() const { return static_cast(channel_.get()); } - - virtual bool IsClosed() { - if (channel_) { - return channel_->IsClosed(); - } - return false; - } - - virtual bool CanSend() { - if (channel_) { - return channel_->CanSend(); - } - return false; - } - - virtual bool CanReceive() { - if (channel_) { - return channel_->CanReceive(); - } - return false; - } - - virtual void RemoveFromSendQ(const void* referrer) { - if (channel_) { - channel_->RemoveFromSendQ(referrer); - } - } - - virtual void RemoveFromReceiveQ(const void* referrer) { - if (channel_) { - channel_->RemoveFromReceiveQ(referrer); - } - } - - virtual void Close() { - if (channel_) channel_->Close(); - } - - virtual size_t Cap() { - if (channel_) - return channel_->Cap(); - else - return -1; - } - - virtual void Lock() { - if (channel_) channel_->Lock(); - } - - virtual void Unlock() { - if (channel_) channel_->Unlock(); - } - - std::unique_ptr> channel_; - const std::type_index type_; - }; - - // Pointer to a PlaceholderImpl object - std::unique_ptr holder_; -}; - -} // namespace framework -} // namespace paddle - -#include "paddle/fluid/framework/channel_impl.h" diff --git a/paddle/fluid/framework/channel_impl.h b/paddle/fluid/framework/channel_impl.h deleted file mode 100644 index 26d454534e1ae38c4f83376c0836a45781ea9101..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/channel_impl.h +++ /dev/null @@ -1,369 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include // for size_t -#include -#include // NOLINT -#include -#include "paddle/fluid/framework/channel.h" -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace framework { - -template -class ChannelImpl : public paddle::framework::Channel { - friend Channel *paddle::framework::MakeChannel(size_t); - friend void paddle::framework::CloseChannel(Channel *); - - public: - virtual bool CanSend(); - virtual bool CanReceive(); - virtual void Send(T *); - virtual bool Receive(T *); - virtual size_t Cap() { return cap_; } - virtual void Lock(); - virtual void Unlock(); - virtual bool IsClosed(); - virtual void Close(); - explicit ChannelImpl(size_t); - virtual ~ChannelImpl(); - - virtual void AddToSendQ(const void *referrer, T *data, - std::shared_ptr cond, - std::function cb); - virtual void AddToReceiveQ(const void *referrer, T *data, - std::shared_ptr cond, - std::function cb); - - virtual void RemoveFromSendQ(const void *referrer); - virtual void RemoveFromReceiveQ(const void *referrer); - - private: - struct QueueMessage { - T *data; - std::shared_ptr cond; - bool chan_closed = false; - bool completed = false; - const void *referrer; // TODO(thuan): figure out better way to do this - std::function callback; - - explicit QueueMessage(T *item) - : data(item), cond(std::make_shared()) {} - - QueueMessage(T *item, std::shared_ptr cond) - : data(item), cond(cond) {} - - void Wait(std::unique_lock &lock) { - cond->wait(lock, [this]() { return completed; }); - } - - void Notify() { - completed = true; - cond->notify_all(); - } - }; - - void send_return() { - send_ctr--; - destructor_cond_.notify_all(); - } - - bool recv_return(bool value) { - recv_ctr--; - destructor_cond_.notify_all(); - return value; - } - - std::shared_ptr get_first_message( - std::deque> *queue, ChannelAction action) { - while (!queue->empty()) { - // Check whether this message was added by Select - // If this was added by Select then execute the callback - // to check if you can execute this message. The callback - // can return false if some other case was executed in Select. - // In that case just discard this QueueMessage and process next. - std::shared_ptr m = queue->front(); - queue->pop_front(); - if (m->callback == nullptr || m->callback(action)) return m; - } - return nullptr; - } - - size_t cap_; - std::recursive_mutex mu_; - bool closed_; - std::deque buf_; - std::deque> recvq; - std::deque> sendq; - std::atomic send_ctr{0}; - std::atomic recv_ctr{0}; - std::condition_variable_any destructor_cond_; -}; - -template -ChannelImpl::ChannelImpl(size_t capacity) - : cap_(capacity), closed_(false), send_ctr(0), recv_ctr(0) { - PADDLE_ENFORCE_GE(capacity, 0); -} - -template -bool ChannelImpl::CanSend() { - std::lock_guard lock{mu_}; - return !closed_ && (!recvq.empty() || buf_.size() < cap_); -} - -template -bool ChannelImpl::CanReceive() { - std::lock_guard lock{mu_}; - return !(closed_ && buf_.empty()) && (!sendq.empty() || buf_.size() > 0); -} - -template -void ChannelImpl::Send(T *item) { - send_ctr++; - std::unique_lock lock{mu_}; - - // If channel is closed, throw exception - if (closed_) { - send_return(); - lock.unlock(); - PADDLE_THROW("Cannot send on closed channel"); - } - - // If there is a receiver, directly pass the value we want - // to send to the receiver, bypassing the channel buffer if any - if (!recvq.empty()) { - std::shared_ptr m = - get_first_message(&recvq, ChannelAction::SEND); - - if (m != nullptr) { - *(m->data) = std::move(*item); - m->Notify(); - send_return(); - return; - } else { - Send(item); - send_return(); - return; - } - } - - // Unbuffered channel will always bypass this - // If buffered channel has space in buffer, - // write the element to the buffer. - if (buf_.size() < cap_) { - // Copy to buffer - buf_.push_back(std::move(*item)); - send_return(); - return; - } - - // Block on channel, because some receiver will complete - // the operation for us - auto m = std::make_shared(item); - sendq.push_back(m); - m->Wait(lock); - if (m->chan_closed) { - send_return(); - lock.unlock(); - PADDLE_THROW("Cannot send on closed channel"); - } - send_return(); -} - -template -bool ChannelImpl::Receive(T *item) { - recv_ctr++; - std::unique_lock lock{mu_}; - - // If channel is closed and buffer is empty or - // channel is unbuffered - if (closed_ && buf_.empty()) return recv_return(false); - - // If there is a sender, directly receive the value we want - // from the sender. In case of a buffered channel, read from - // buffer and move front of send queue to the buffer - if (!sendq.empty()) { - std::shared_ptr m = - get_first_message(&sendq, ChannelAction::RECEIVE); - if (buf_.size() > 0) { - // Case 1 : Channel is Buffered - // Do Data transfer from front of buffer - // and add a QueueMessage to the buffer - *item = std::move(buf_.front()); - buf_.pop_front(); - // If first message from sendq is not null - // add it to the buffer and notify it - if (m != nullptr) { - // Copy to buffer - buf_.push_back(std::move(*(m->data))); - m->Notify(); - } // Ignore if there is no first message - } else { - // Case 2: Channel is Unbuffered - // Do data transfer from front of SendQ - // If front is nullptr, then recursively call itself - if (m != nullptr) { - *item = std::move(*(m->data)); - m->Notify(); - } else { - return recv_return(Receive(item)); - } - } - return recv_return(true); - } - - // If this is a buffered channel and there are items in buffer - if (buf_.size() > 0) { - // Directly read from buffer - *item = std::move(buf_.front()); - buf_.pop_front(); - // return true - return recv_return(true); - } - - // No sender available, block on this channel - // Some receiver will complete the option for us - auto m = std::make_shared(item); - recvq.push_back(m); - m->Wait(lock); - - return recv_return(!m->chan_closed); -} - -template -void ChannelImpl::Lock() { - mu_.lock(); -} - -template -void ChannelImpl::Unlock() { - mu_.unlock(); -} - -template -bool ChannelImpl::IsClosed() { - std::lock_guard lock{mu_}; - return closed_; -} - -template -void ChannelImpl::Close() { - std::unique_lock lock{mu_}; - - if (closed_) { - // TODO(abhinavarora): closing an already closed channel should panic - lock.unlock(); - return; - } - - closed_ = true; - - // Empty the readers - while (!recvq.empty()) { - std::shared_ptr m = recvq.front(); - recvq.pop_front(); - m->chan_closed = true; - - // Execute callback function (if any) - if (m->callback != nullptr) { - m->callback(ChannelAction::CLOSE); - } - - m->Notify(); - } - - // Empty the senders - while (!sendq.empty()) { - std::shared_ptr m = sendq.front(); - sendq.pop_front(); - m->chan_closed = true; - - // Execute callback function (if any) - if (m->callback != nullptr) { - m->callback(ChannelAction::CLOSE); - } - - m->Notify(); - } -} - -template -void ChannelImpl::AddToSendQ( - const void *referrer, T *data, - std::shared_ptr cond, - std::function cb) { - std::lock_guard lock{mu_}; - auto m = std::make_shared(data, cond); - m->referrer = referrer; - m->callback = cb; - sendq.push_back(m); -} - -template -void ChannelImpl::AddToReceiveQ( - const void *referrer, T *data, - std::shared_ptr cond, - std::function cb) { - std::lock_guard lock{mu_}; - auto m = std::make_shared(data, cond); - m->referrer = referrer; - m->callback = cb; - recvq.push_back(m); -} - -template -void ChannelImpl::RemoveFromSendQ(const void *referrer) { - std::lock_guard lock{mu_}; - - for (auto it = sendq.begin(); it != sendq.end();) { - std::shared_ptr sendMsg = (std::shared_ptr)*it; - - if (sendMsg->referrer == referrer) { - it = sendq.erase(it); - } else { - ++it; - } - } -} - -template -void ChannelImpl::RemoveFromReceiveQ(const void *referrer) { - std::lock_guard lock{mu_}; - - for (auto it = recvq.begin(); it != recvq.end();) { - std::shared_ptr recvMsg = (std::shared_ptr)*it; - - if (recvMsg->referrer == referrer) { - it = recvq.erase(it); - } else { - ++it; - } - } -} - -template -ChannelImpl::~ChannelImpl() { - Close(); - // The destructor must wait for all readers and writers to complete their task - // The channel has been closed, so we will not accept new readers and writers - std::unique_lock lock{mu_}; - destructor_cond_.wait(lock, - [this]() { return send_ctr == 0 && recv_ctr == 0; }); -} - -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/channel_test.cc b/paddle/fluid/framework/channel_test.cc deleted file mode 100644 index 542d791f6bbdf7d68a4786998ccc0233fff6473d..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/channel_test.cc +++ /dev/null @@ -1,1008 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/channel.h" - -#include // NOLINT -#include // NOLINT -#include "gtest/gtest.h" - -using paddle::framework::Channel; -using paddle::framework::ChannelHolder; -using paddle::framework::MakeChannel; -using paddle::framework::CloseChannel; - -TEST(Channel, ChannelCapacityTest) { - const size_t buffer_size = 10; - auto ch = MakeChannel(buffer_size); - EXPECT_EQ(ch->Cap(), buffer_size); - CloseChannel(ch); - delete ch; - - ch = MakeChannel(0); - EXPECT_EQ(ch->Cap(), 0U); - CloseChannel(ch); - delete ch; -} - -void RecevingOrderEqualToSendingOrder(Channel *ch, int num_items) { - unsigned sum_send = 0; - std::thread t([&]() { - for (int i = 0; i < num_items; i++) { - ch->Send(&i); - sum_send += i; - } - }); - std::this_thread::sleep_for(std::chrono::milliseconds(200)); - for (int i = 0; i < num_items; i++) { - int recv = -1; - EXPECT_EQ(ch->Receive(&recv), true); - EXPECT_EQ(recv, i); - } - std::this_thread::sleep_for(std::chrono::milliseconds(200)); - CloseChannel(ch); - t.join(); - unsigned expected_sum = (num_items * (num_items - 1)) / 2; - EXPECT_EQ(sum_send, expected_sum); - delete ch; -} - -TEST(Channel, SufficientBufferSizeDoesntBlock) { - const size_t buffer_size = 10; - auto ch = MakeChannel(buffer_size); - for (size_t i = 0; i < buffer_size; ++i) { - ch->Send(&i); - } - - size_t out; - for (size_t i = 0; i < buffer_size; ++i) { - EXPECT_EQ(ch->Receive(&out), true); // should not block - EXPECT_EQ(out, i); - } - CloseChannel(ch); - delete ch; -} - -// This tests that a channel must return false -// on send and receive performed after closing the channel. -// Receive will only return false after close when queue is empty. -// By creating separate threads for sending and receiving, we make this -// function able to test both buffered and unbuffered channels. -void SendReceiveWithACloseChannelShouldPanic(Channel *ch) { - const size_t data = 5; - std::thread send_thread{[&]() { - size_t i = data; - ch->Send(&i); // should not block - }}; - - std::thread recv_thread{[&]() { - size_t i; - EXPECT_EQ(ch->Receive(&i), true); // should not block - EXPECT_EQ(i, data); - }}; - - send_thread.join(); - recv_thread.join(); - - // After closing send should panic. Receive should - // also false as there is no data in queue. - CloseChannel(ch); - send_thread = std::thread{[&]() { - size_t i = data; - bool is_exception = false; - try { - ch->Send(&i); - } catch (paddle::platform::EnforceNotMet e) { - is_exception = true; - } - EXPECT_EQ(is_exception, true); - }}; - recv_thread = std::thread{[&]() { - size_t i; - // should return false because channel is closed and queue is empty - EXPECT_EQ(ch->Receive(&i), false); - }}; - - send_thread.join(); - recv_thread.join(); -} - -TEST(Channel, SendReceiveClosedBufferedChannelPanics) { - size_t buffer_size = 10; - auto ch = MakeChannel(buffer_size); - SendReceiveWithACloseChannelShouldPanic(ch); - delete ch; -} - -TEST(Channel, SendReceiveClosedUnBufferedChannelPanics) { - auto ch = MakeChannel(0); - SendReceiveWithACloseChannelShouldPanic(ch); - delete ch; -} - -TEST(Channel, ReceiveFromBufferedChannelReturnResidualValuesTest) { - const size_t buffer_size = 10; - auto ch = MakeChannel(buffer_size); - - for (size_t i = 0; i < buffer_size; ++i) { - ch->Send(&i); // sending should not block - } - - size_t out; - for (size_t i = 0; i < buffer_size / 2; ++i) { - EXPECT_EQ(ch->Receive(&out), true); // receiving should not block - EXPECT_EQ(out, i); - } - - CloseChannel(ch); - - for (size_t i = buffer_size / 2; i < buffer_size; ++i) { - EXPECT_EQ(ch->Receive(&out), - true); // receving should return residual values. - EXPECT_EQ(out, i); - } - - for (size_t i = 0; i < buffer_size; ++i) { - EXPECT_EQ(ch->Receive(&out), - false); // receiving on closed channel should return false - } - delete ch; -} - -TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) { - const size_t buffer_size = 10; - auto ch = MakeChannel(buffer_size); - std::thread t([&]() { - // Try to write more than buffer size. - for (size_t i = 0; i < 2 * buffer_size; ++i) { - if (i < buffer_size) { - ch->Send(&i); // should block after 10 iterations - } else { - bool is_exception = false; - try { - ch->Send(&i); - } catch (paddle::platform::EnforceNotMet e) { - is_exception = true; - } - EXPECT_EQ(is_exception, true); - } - } - }); - std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec - CloseChannel(ch); - t.join(); - delete ch; -} - -TEST(Channel, RecevingOrderEqualToSendingOrderWithUnBufferedChannel) { - auto ch = MakeChannel(0); - RecevingOrderEqualToSendingOrder(ch, 20); -} - -TEST(Channel, RecevingOrderEqualToSendingOrderWithBufferedChannel1) { - // Test that Receive Order is same as Send Order when number of items - // sent is less than size of buffer - auto ch = MakeChannel(10); - RecevingOrderEqualToSendingOrder(ch, 5); -} - -TEST(Channel, RecevingOrderEqualToSendingOrderWithBufferedChannel2) { - // Test that Receive Order is same as Send Order when number of items - // sent is equal to size of buffer - auto ch = MakeChannel(10); - RecevingOrderEqualToSendingOrder(ch, 10); -} - -TEST(Channel, RecevingOrderEqualToSendingOrderWithBufferedChannel3) { - // Test that Receive Order is same as Send Order when number of items - // sent is greater than the size of buffer - auto ch = MakeChannel(10); - RecevingOrderEqualToSendingOrder(ch, 20); -} - -void ChannelCloseUnblocksReceiversTest(Channel *ch) { - const size_t kNumThreads = 5; - std::thread t[kNumThreads]; - bool thread_ended[kNumThreads]; - - // Launches threads that try to read and are blocked because of no writers - for (size_t i = 0; i < kNumThreads; i++) { - thread_ended[i] = false; - t[i] = std::thread( - [&](bool *p) { - int data; - EXPECT_EQ(ch->Receive(&data), false); - *p = true; - }, - &thread_ended[i]); - } - std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec - - // Verify that all the threads are blocked - for (size_t i = 0; i < kNumThreads; i++) { - EXPECT_EQ(thread_ended[i], false); - } - - // Explicitly close the channel - // This should unblock all receivers - CloseChannel(ch); - - std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec - - // Verify that all threads got unblocked - for (size_t i = 0; i < kNumThreads; i++) { - EXPECT_EQ(thread_ended[i], true); - } - - for (size_t i = 0; i < kNumThreads; i++) t[i].join(); -} - -void ChannelCloseUnblocksSendersTest(Channel *ch, bool isBuffered) { - const size_t kNumThreads = 5; - std::thread t[kNumThreads]; - bool thread_ended[kNumThreads]; - bool send_success[kNumThreads]; - - // Launches threads that try to write and are blocked because of no readers - for (size_t i = 0; i < kNumThreads; i++) { - thread_ended[i] = false; - send_success[i] = false; - t[i] = std::thread( - [&](bool *ended, bool *success) { - int data = 10; - bool is_exception = false; - try { - ch->Send(&data); - } catch (paddle::platform::EnforceNotMet e) { - is_exception = true; - } - *success = !is_exception; - *ended = true; - }, - &thread_ended[i], &send_success[i]); - } - std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait - - if (isBuffered) { - // If ch is Buffered, atleast 4 threads must be blocked. - int ct = 0; - for (size_t i = 0; i < kNumThreads; i++) { - if (!thread_ended[i]) ct++; - } - EXPECT_GE(ct, 4); - } else { - // If ch is UnBuffered, all the threads should be blocked. - for (size_t i = 0; i < kNumThreads; i++) { - EXPECT_EQ(thread_ended[i], false); - } - } - // Explicitly close the thread - // This should unblock all senders - CloseChannel(ch); - - std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait - - // Verify that all threads got unblocked - for (size_t i = 0; i < kNumThreads; i++) { - EXPECT_EQ(thread_ended[i], true); - } - - if (isBuffered) { - // Verify that only 1 send was successful - int ct = 0; - for (size_t i = 0; i < kNumThreads; i++) { - if (send_success[i]) ct++; - } - // Only 1 send must be successful - EXPECT_EQ(ct, 1); - } - - for (size_t i = 0; i < kNumThreads; i++) t[i].join(); -} - -// This tests that closing a buffered channel also unblocks -// any receivers waiting on the channel -TEST(Channel, BufferedChannelCloseUnblocksReceiversTest) { - auto ch = MakeChannel(1); - ChannelCloseUnblocksReceiversTest(ch); - delete ch; -} - -// This tests that closing a buffered channel also unblocks -// any senders waiting for channel to have write space -TEST(Channel, BufferedChannelCloseUnblocksSendersTest) { - auto ch = MakeChannel(1); - ChannelCloseUnblocksSendersTest(ch, true); - delete ch; -} - -// This tests that closing an unbuffered channel also unblocks -// unblocks any receivers waiting for senders -TEST(Channel, UnbufferedChannelCloseUnblocksReceiversTest) { - auto ch = MakeChannel(0); - ChannelCloseUnblocksReceiversTest(ch); - delete ch; -} - -// This tests that closing an unbuffered channel also unblocks -// unblocks any senders waiting for senders -TEST(Channel, UnbufferedChannelCloseUnblocksSendersTest) { - auto ch = MakeChannel(0); - ChannelCloseUnblocksSendersTest(ch, false); - delete ch; -} - -TEST(Channel, UnbufferedLessReceiveMoreSendTest) { - auto ch = MakeChannel(0); - unsigned sum_send = 0; - // Send should block after three iterations - // since we only have three receivers. - std::thread t([&]() { - // Try to send more number of times - // than receivers - for (int i = 0; i < 4; i++) { - try { - ch->Send(&i); - sum_send += i; - } catch (paddle::platform::EnforceNotMet e) { - } - } - }); - for (int i = 0; i < 3; i++) { - int recv; - ch->Receive(&recv); - EXPECT_EQ(recv, i); - } - std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec - EXPECT_EQ(sum_send, 3U); - - CloseChannel(ch); - t.join(); - delete ch; -} - -TEST(Channel, UnbufferedMoreReceiveLessSendTest) { - auto ch = MakeChannel(0); - unsigned sum_send = 0; - unsigned sum_receive = 0; - // The receiver should block after 5 - // iterations, since there are only 5 senders. - std::thread t([&]() { - for (int i = 0; i < 8; i++) { - int recv; - ch->Receive(&recv); // should block after the fifth iteration. - EXPECT_EQ(recv, i); - sum_receive += i; - } - }); - for (int i = 0; i < 5; i++) { - ch->Send(&i); - sum_send += i; - } - std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec - EXPECT_EQ(sum_send, 10U); - EXPECT_EQ(sum_receive, 10U); - // send three more elements - for (int i = 5; i < 8; i++) { - ch->Send(&i); - sum_send += i; - } - - CloseChannel(ch); - t.join(); - EXPECT_EQ(sum_send, 28U); - EXPECT_EQ(sum_receive, 28U); - delete ch; -} - -// This tests that destroying a channel unblocks -// any senders waiting for channel to have write space -void ChannelDestroyUnblockSenders(Channel *ch, bool isBuffered) { - const size_t kNumThreads = 5; - std::thread t[kNumThreads]; - bool thread_ended[kNumThreads]; - bool send_success[kNumThreads]; - - // Launches threads that try to write and are blocked because of no readers - for (size_t i = 0; i < kNumThreads; i++) { - thread_ended[i] = false; - send_success[i] = false; - t[i] = std::thread( - [&](bool *ended, bool *success) { - int data = 10; - bool is_exception = false; - try { - ch->Send(&data); - } catch (paddle::platform::EnforceNotMet e) { - is_exception = true; - } - *success = !is_exception; - *ended = true; - }, - &thread_ended[i], &send_success[i]); - } - - std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec - - if (isBuffered) { - // If channel is buffered, verify that atleast 4 threads are blocked - int ct = 0; - for (size_t i = 0; i < kNumThreads; i++) { - if (thread_ended[i] == false) ct++; - } - // Atleast 4 threads must be blocked - EXPECT_GE(ct, 4); - } else { - // Verify that all the threads are blocked - for (size_t i = 0; i < kNumThreads; i++) { - EXPECT_EQ(thread_ended[i], false); - } - } - // Explicitly destroy the channel - delete ch; - std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait - - // Verify that all threads got unblocked - for (size_t i = 0; i < kNumThreads; i++) { - EXPECT_EQ(thread_ended[i], true); - } - - // Count number of successful sends - int ct = 0; - for (size_t i = 0; i < kNumThreads; i++) { - if (send_success[i]) ct++; - } - - if (isBuffered) { - // Only 1 send must be successful - EXPECT_EQ(ct, 1); - } else { - // In unbuffered channel, no send should be successful - EXPECT_EQ(ct, 0); - } - - // Join all threads - for (size_t i = 0; i < kNumThreads; i++) t[i].join(); -} - -// This tests that destroying a channel also unblocks -// any receivers waiting on the channel -void ChannelDestroyUnblockReceivers(Channel *ch) { - const size_t kNumThreads = 5; - std::thread t[kNumThreads]; - bool thread_ended[kNumThreads]; - - // Launches threads that try to read and are blocked because of no writers - for (size_t i = 0; i < kNumThreads; i++) { - thread_ended[i] = false; - t[i] = std::thread( - [&](bool *p) { - int data; - // All reads should return false - EXPECT_EQ(ch->Receive(&data), false); - *p = true; - }, - &thread_ended[i]); - } - std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait - - // Verify that all threads are blocked - for (size_t i = 0; i < kNumThreads; i++) { - EXPECT_EQ(thread_ended[i], false); - } - // delete the channel - delete ch; - std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait - // Verify that all threads got unblocked - for (size_t i = 0; i < kNumThreads; i++) { - EXPECT_EQ(thread_ended[i], true); - } - - for (size_t i = 0; i < kNumThreads; i++) t[i].join(); -} - -TEST(Channel, BufferedChannelDestroyUnblocksReceiversTest) { - size_t buffer_size = 1; - auto ch = MakeChannel(buffer_size); - ChannelDestroyUnblockReceivers(ch); -} - -TEST(Channel, BufferedChannelDestroyUnblocksSendersTest) { - size_t buffer_size = 1; - auto ch = MakeChannel(buffer_size); - ChannelDestroyUnblockSenders(ch, true); -} - -// This tests that destroying an unbuffered channel also unblocks -// unblocks any receivers waiting for senders -TEST(Channel, UnbufferedChannelDestroyUnblocksReceiversTest) { - auto ch = MakeChannel(0); - ChannelDestroyUnblockReceivers(ch); -} - -TEST(Channel, UnbufferedChannelDestroyUnblocksSendersTest) { - auto ch = MakeChannel(0); - ChannelDestroyUnblockSenders(ch, false); -} - -TEST(ChannelHolder, ChannelHolderCapacityTest) { - const size_t buffer_size = 10; - ChannelHolder *ch = new ChannelHolder(); - ch->Reset(buffer_size); - EXPECT_EQ(ch->Cap(), buffer_size); - delete ch; - - ch = new ChannelHolder(); - ch->Reset(0); - EXPECT_EQ(ch->Cap(), 0U); - delete ch; -} - -void ChannelHolderSendReceive(ChannelHolder *ch) { - unsigned sum_send = 0; - std::thread t([&]() { - for (int i = 0; i < 5; i++) { - ch->Send(&i); - sum_send += i; - } - }); - for (int i = 0; i < 5; i++) { - int recv; - EXPECT_EQ(ch->Receive(&recv), true); - EXPECT_EQ(recv, i); - } - - ch->close(); - t.join(); - EXPECT_EQ(sum_send, 10U); -} - -TEST(ChannelHolder, ChannelHolderBufferedSendReceiveTest) { - ChannelHolder *ch = new ChannelHolder(); - ch->Reset(10); - ChannelHolderSendReceive(ch); - delete ch; -} - -TEST(ChannelHolder, ChannelHolderUnBufferedSendReceiveTest) { - ChannelHolder *ch = new ChannelHolder(); - ch->Reset(0); - ChannelHolderSendReceive(ch); - delete ch; -} - -TEST(ChannelHolder, ChannelUninitializedTest) { - ChannelHolder *ch = new ChannelHolder(); - EXPECT_EQ(ch->IsInitialized(), false); - int i = 10; - bool send_exception = false; - try { - ch->Send(&i); - } catch (paddle::platform::EnforceNotMet e) { - send_exception = true; - } - EXPECT_EQ(send_exception, true); - - bool recv_exception = false; - try { - ch->Receive(&i); - } catch (paddle::platform::EnforceNotMet e) { - recv_exception = true; - } - EXPECT_EQ(recv_exception, true); - - bool is_exception = false; - try { - ch->Type(); - } catch (paddle::platform::EnforceNotMet e) { - is_exception = true; - } - EXPECT_EQ(is_exception, true); - delete ch; -} - -TEST(ChannelHolder, ChannelInitializedTest) { - ChannelHolder *ch = new ChannelHolder(); - ch->Reset(2); - EXPECT_EQ(ch->IsInitialized(), true); - // Channel should remain intialized even after close - ch->close(); - EXPECT_EQ(ch->IsInitialized(), true); - delete ch; -} - -TEST(ChannelHolder, TypeMismatchSendTest) { - // Test with unbuffered channel - ChannelHolder *ch = new ChannelHolder(); - ch->Reset(0); - bool is_exception = false; - bool boolean_data = true; - try { - ch->Send(&boolean_data); - } catch (paddle::platform::EnforceNotMet e) { - is_exception = true; - } - EXPECT_EQ(is_exception, true); - delete ch; - - // Test with Buffered Channel - ch = new ChannelHolder(); - ch->Reset(10); - is_exception = false; - int int_data = 23; - try { - ch->Send(&int_data); - } catch (paddle::platform::EnforceNotMet e) { - is_exception = true; - } - EXPECT_EQ(is_exception, true); - delete ch; -} - -TEST(ChannelHolder, TypeMismatchReceiveTest) { - // Test with unbuffered channel - ChannelHolder *ch = new ChannelHolder(); - ch->Reset(0); - bool is_exception = false; - bool float_data; - try { - ch->Receive(&float_data); - } catch (paddle::platform::EnforceNotMet e) { - is_exception = true; - } - EXPECT_EQ(is_exception, true); - delete ch; - - // Test with Buffered Channel - ch = new ChannelHolder(); - ch->Reset(10); - is_exception = false; - int int_data = 23; - try { - ch->Receive(&int_data); - } catch (paddle::platform::EnforceNotMet e) { - is_exception = true; - } - EXPECT_EQ(is_exception, true); - delete ch; -} - -void ChannelHolderCloseUnblocksReceiversTest(ChannelHolder *ch) { - const size_t kNumThreads = 5; - std::thread t[kNumThreads]; - bool thread_ended[kNumThreads]; - - // Launches threads that try to read and are blocked because of no writers - for (size_t i = 0; i < kNumThreads; i++) { - thread_ended[i] = false; - t[i] = std::thread( - [&](bool *p) { - int data; - EXPECT_EQ(ch->Receive(&data), false); - *p = true; - }, - &thread_ended[i]); - } - std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec - - // Verify that all the threads are blocked - for (size_t i = 0; i < kNumThreads; i++) { - EXPECT_EQ(thread_ended[i], false); - } - - // Explicitly close the channel - // This should unblock all receivers - ch->close(); - - std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec - - // Verify that all threads got unblocked - for (size_t i = 0; i < kNumThreads; i++) { - EXPECT_EQ(thread_ended[i], true); - } - - for (size_t i = 0; i < kNumThreads; i++) t[i].join(); -} - -void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) { - const size_t kNumThreads = 5; - std::thread t[kNumThreads]; - bool thread_ended[kNumThreads]; - bool send_success[kNumThreads]; - - // Launches threads that try to write and are blocked because of no readers - for (size_t i = 0; i < kNumThreads; i++) { - thread_ended[i] = false; - send_success[i] = false; - t[i] = std::thread( - [&](bool *ended, bool *success) { - int data = 10; - bool is_exception = false; - try { - ch->Send(&data); - } catch (paddle::platform::EnforceNotMet e) { - is_exception = true; - } - *success = !is_exception; - *ended = true; - }, - &thread_ended[i], &send_success[i]); - } - std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait - - if (isBuffered) { - // If ch is Buffered, atleast 4 threads must be blocked. - int ct = 0; - for (size_t i = 0; i < kNumThreads; i++) { - if (!thread_ended[i]) ct++; - } - EXPECT_GE(ct, 4); - } else { - // If ch is UnBuffered, all the threads should be blocked. - for (size_t i = 0; i < kNumThreads; i++) { - EXPECT_EQ(thread_ended[i], false); - } - } - // Explicitly close the thread - // This should unblock all senders - ch->close(); - - std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait - - // Verify that all threads got unblocked - for (size_t i = 0; i < kNumThreads; i++) { - EXPECT_EQ(thread_ended[i], true); - } - - if (isBuffered) { - // Verify that only 1 send was successful - int ct = 0; - for (size_t i = 0; i < kNumThreads; i++) { - if (send_success[i]) ct++; - } - // Only 1 send must be successful - EXPECT_EQ(ct, 1); - } - - for (size_t i = 0; i < kNumThreads; i++) t[i].join(); -} - -// This tests that closing a channelholder unblocks -// any receivers waiting on the channel -TEST(ChannelHolder, ChannelHolderCloseUnblocksReceiversTest) { - // Check for buffered channel - ChannelHolder *ch = new ChannelHolder(); - ch->Reset(1); - ChannelHolderCloseUnblocksReceiversTest(ch); - delete ch; - - // Check for unbuffered channel - ch = new ChannelHolder(); - ch->Reset(0); - ChannelHolderCloseUnblocksReceiversTest(ch); - delete ch; -} - -// This tests that closing a channelholder unblocks -// any senders waiting for channel to have write space -TEST(Channel, ChannelHolderCloseUnblocksSendersTest) { - // Check for buffered channel - ChannelHolder *ch = new ChannelHolder(); - ch->Reset(1); - ChannelHolderCloseUnblocksSendersTest(ch, true); - delete ch; - - // Check for unbuffered channel - ch = new ChannelHolder(); - ch->Reset(0); - ChannelHolderCloseUnblocksSendersTest(ch, false); - delete ch; -} - -// This tests that destroying a channelholder unblocks -// any senders waiting for channel -void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) { - const size_t kNumThreads = 5; - std::thread t[kNumThreads]; - bool thread_ended[kNumThreads]; - bool send_success[kNumThreads]; - - // Launches threads that try to write and are blocked because of no readers - for (size_t i = 0; i < kNumThreads; i++) { - thread_ended[i] = false; - send_success[i] = false; - t[i] = std::thread( - [&](bool *ended, bool *success) { - int data = 10; - bool is_exception = false; - try { - ch->Send(&data); - } catch (paddle::platform::EnforceNotMet e) { - is_exception = true; - } - *success = !is_exception; - *ended = true; - }, - &thread_ended[i], &send_success[i]); - } - - std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec - if (isBuffered) { - // If channel is buffered, verify that atleast 4 threads are blocked - int ct = 0; - for (size_t i = 0; i < kNumThreads; i++) { - if (thread_ended[i] == false) ct++; - } - // Atleast 4 threads must be blocked - EXPECT_GE(ct, 4); - } else { - // Verify that all the threads are blocked - for (size_t i = 0; i < kNumThreads; i++) { - EXPECT_EQ(thread_ended[i], false); - } - } - // Explicitly destroy the channel - delete ch; - std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait - - // Verify that all threads got unblocked - for (size_t i = 0; i < kNumThreads; i++) { - EXPECT_EQ(thread_ended[i], true); - } - - // Count number of successfuld sends - int ct = 0; - for (size_t i = 0; i < kNumThreads; i++) { - if (send_success[i]) ct++; - } - - if (isBuffered) { - // Only 1 send must be successful - EXPECT_EQ(ct, 1); - } else { - // In unbuffered channel, no send should be successful - EXPECT_EQ(ct, 0); - } - - // Join all threads - for (size_t i = 0; i < kNumThreads; i++) t[i].join(); -} - -// This tests that destroying a channelholder also unblocks -// any receivers waiting on the channel -void ChannelHolderDestroyUnblockReceivers(ChannelHolder *ch) { - const size_t kNumThreads = 5; - std::thread t[kNumThreads]; - bool thread_ended[kNumThreads]; - - // Launches threads that try to read and are blocked because of no writers - for (size_t i = 0; i < kNumThreads; i++) { - thread_ended[i] = false; - t[i] = std::thread( - [&](bool *p) { - int data; - // All reads should return false - EXPECT_EQ(ch->Receive(&data), false); - *p = true; - }, - &thread_ended[i]); - } - std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait - - // Verify that all threads are blocked - for (size_t i = 0; i < kNumThreads; i++) { - EXPECT_EQ(thread_ended[i], false); - } - // delete the channel - delete ch; - std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait - // Verify that all threads got unblocked - for (size_t i = 0; i < kNumThreads; i++) { - EXPECT_EQ(thread_ended[i], true); - } - - for (size_t i = 0; i < kNumThreads; i++) t[i].join(); -} - -TEST(ChannelHolder, ChannelHolderDestroyUnblocksReceiversTest) { - // Check for Buffered Channel - ChannelHolder *ch = new ChannelHolder(); - ch->Reset(1); - ChannelHolderDestroyUnblockReceivers(ch); - // ch is already deleted already deleted in - // ChannelHolderDestroyUnblockReceivers - - // Check for Unbuffered channel - ch = new ChannelHolder(); - ch->Reset(0); - ChannelHolderDestroyUnblockReceivers(ch); -} - -TEST(ChannelHolder, ChannelHolderDestroyUnblocksSendersTest) { - // Check for Buffered Channel - ChannelHolder *ch = new ChannelHolder(); - ch->Reset(1); - ChannelHolderDestroyUnblockSenders(ch, true); - // ch is already deleted already deleted in - // ChannelHolderDestroyUnblockReceivers - - // Check for Unbuffered channel - ch = new ChannelHolder(); - ch->Reset(0); - ChannelHolderDestroyUnblockSenders(ch, false); -} - -// This tests that closing a channelholder many times. -void ChannelHolderManyTimesClose(ChannelHolder *ch) { - const int kNumThreads = 15; - std::thread t[kNumThreads]; - bool thread_ended[kNumThreads]; - - // Launches threads that try to send data to channel. - for (size_t i = 0; i < kNumThreads / 3; i++) { - thread_ended[i] = false; - t[i] = std::thread( - [&](bool *ended) { - int data = 10; - ch->Send(&data); - *ended = true; - }, - &thread_ended[i]); - } - - // Launches threads that try to receive data to channel. - for (size_t i = kNumThreads / 3; i < 2 * kNumThreads / 3; i++) { - thread_ended[i] = false; - t[i] = std::thread( - [&](bool *p) { - int data; - if (ch->Receive(&data)) { - EXPECT_EQ(data, 10); - } - *p = true; - }, - &thread_ended[i]); - } - - // Launches threads that try to close the channel. - for (size_t i = 2 * kNumThreads / 3; i < kNumThreads; i++) { - thread_ended[i] = false; - t[i] = std::thread( - [&](bool *p) { - if (!ch->IsClosed()) { - ch->close(); - } - *p = true; - }, - &thread_ended[i]); - } - - std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait - - // Verify that all threads are unblocked - for (size_t i = 0; i < kNumThreads; i++) { - EXPECT_EQ(thread_ended[i], true); - } - EXPECT_TRUE(ch->IsClosed()); - // delete the channel - delete ch; - for (size_t i = 0; i < kNumThreads; i++) t[i].join(); -} - -TEST(ChannelHolder, ChannelHolderManyTimesCloseTest) { - // Check for Buffered Channel - ChannelHolder *ch = new ChannelHolder(); - ch->Reset(10); - ChannelHolderManyTimesClose(ch); -} diff --git a/paddle/fluid/framework/concurrency_test.cc b/paddle/fluid/framework/concurrency_test.cc deleted file mode 100644 index bbf67f5ba92150f70cf45d49e3f4ca0a16393541..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/concurrency_test.cc +++ /dev/null @@ -1,292 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include // NOLINT - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/block_desc.h" -#include "paddle/fluid/framework/channel.h" -#include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/framework/op_registry.h" - -USE_NO_KERNEL_OP(go); -USE_NO_KERNEL_OP(channel_close); -USE_NO_KERNEL_OP(channel_create); -USE_NO_KERNEL_OP(channel_recv); -USE_NO_KERNEL_OP(channel_send); -USE_NO_KERNEL_OP(elementwise_add); -USE_NO_KERNEL_OP(select); -USE_NO_KERNEL_OP(conditional_block); -USE_NO_KERNEL_OP(equal); -USE_NO_KERNEL_OP(assign); -USE_NO_KERNEL_OP(while); -USE_NO_KERNEL_OP(print); - -namespace f = paddle::framework; -namespace p = paddle::platform; - -namespace paddle { -namespace framework { - -template -LoDTensor *CreateVariable(Scope *scope, const p::CPUPlace &place, - std::string name, T value) { - // Create LoDTensor of dim [1] - auto var = scope->Var(name); - auto tensor = var->GetMutable(); - tensor->Resize({1}); - T *expect = tensor->mutable_data(place); - expect[0] = value; - return tensor; -} - -void AddOp(const std::string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, AttributeMap attrs, - BlockDesc *block) { - // insert op - auto op = block->AppendOp(); - op->SetType(type); - for (auto &kv : inputs) { - op->SetInput(kv.first, kv.second); - } - for (auto &kv : outputs) { - op->SetOutput(kv.first, kv.second); - } - op->SetAttrMap(attrs); -} - -void AddCase(ProgramDesc *program, Scope *scope, p::CPUPlace *place, - BlockDesc *casesBlock, int caseId, int caseType, - std::string caseChannel, std::string caseVarName, - std::function func) { - std::string caseCondName = std::string("caseCond") + std::to_string(caseId); - std::string caseCondXVarName = - std::string("caseCondX") + std::to_string(caseId); - - BlockDesc *caseBlock = program->AppendBlock(*casesBlock); - func(caseBlock, scope); - - CreateVariable(scope, *place, caseCondName, false); - CreateVariable(scope, *place, caseCondXVarName, caseId); - CreateVariable(scope, *place, caseVarName, caseId); - - scope->Var("step_scope"); - - AddOp("equal", {{"X", {caseCondXVarName}}, {"Y", {"caseToExecute"}}}, - {{"Out", {caseCondName}}}, {}, casesBlock); - - AddOp("conditional_block", {{"X", {caseCondName}}, {"Params", {}}}, - {{"Out", {}}, {"Scope", {"step_scope"}}}, - {{"sub_block", caseBlock}, {"is_scalar_condition", true}}, casesBlock); -} - -void AddFibonacciSelect(Scope *scope, p::CPUPlace *place, ProgramDesc *program, - BlockDesc *parentBlock, std::string dataChanName, - std::string quitChanName) { - BlockDesc *whileBlock = program->AppendBlock(*parentBlock); - - CreateVariable(scope, *place, "whileExitCond", true); - CreateVariable(scope, *place, "caseToExecute", -1); - CreateVariable(scope, *place, "case1var", 0); - - CreateVariable(scope, *place, "xtemp", 0); - - // TODO(thuan): Need to create fibXToSend, since channel send moves the actual - // data, - // which causes the data to be no longer accessible to do the fib calculation - // TODO(abhinav): Change channel send to do a copy instead of a move! - CreateVariable(scope, *place, "fibXToSend", 0); - - CreateVariable(scope, *place, "fibX", 0); - CreateVariable(scope, *place, "fibY", 1); - CreateVariable(scope, *place, "quitVar", 0); - - BlockDesc *casesBlock = program->AppendBlock(*whileBlock); - std::function f = [](BlockDesc *caseBlock) {}; - - // TODO(thuan): Remove this once we change channel send to do a copy instead - // of move - AddOp("assign", {{"X", {"fibX"}}}, {{"Out", {"fibXToSend"}}}, {}, whileBlock); - - // Case 0: Send to dataChanName - std::function case0Func = [&]( - BlockDesc *caseBlock, Scope *scope) { - AddOp("assign", {{"X", {"fibX"}}}, {{"Out", {"xtemp"}}}, {}, caseBlock); - AddOp("assign", {{"X", {"fibY"}}}, {{"Out", {"fibX"}}}, {}, caseBlock); - AddOp("elementwise_add", {{"X", {"xtemp"}}, {"Y", {"fibY"}}}, - {{"Out", {"fibY"}}}, {}, caseBlock); - }; - AddCase(program, scope, place, casesBlock, 0, 1, dataChanName, "fibXToSend", - case0Func); - std::string case0Config = - std::string("0,1,") + dataChanName + std::string(",fibXToSend"); - - // Case 1: Receive from quitChanName - std::function case2Func = [&]( - BlockDesc *caseBlock, Scope *scope) { - // Exit the while loop after we receive from quit channel. - // We assign a false to "whileExitCond" variable, which will - // break out of while_op loop - CreateVariable(scope, *place, "whileFalse", false); - AddOp("assign", {{"X", {"whileFalse"}}}, {{"Out", {"whileExitCond"}}}, {}, - caseBlock); - }; - AddCase(program, scope, place, casesBlock, 1, 2, quitChanName, "quitVar", - case2Func); - std::string case1Config = - std::string("1,2,") + quitChanName + std::string(",quitVar"); - - // Select block - AddOp("select", {{"X", {dataChanName, quitChanName}}, - {"case_to_execute", {"caseToExecute"}}}, - {{"Out", {}}}, - {{"sub_block", casesBlock}, - {"cases", std::vector{case0Config, case1Config}}}, - whileBlock); - - scope->Var("stepScopes"); - AddOp("while", - {{"X", {dataChanName, quitChanName}}, {"Condition", {"whileExitCond"}}}, - {{"Out", {}}, {"StepScopes", {"stepScopes"}}}, - {{"sub_block", whileBlock}}, parentBlock); -} - -TEST(Concurrency, Go_Op) { - Scope scope; - p::CPUPlace place; - - // Initialize scope variables - p::CPUDeviceContext ctx(place); - - // Create channel variable - scope.Var("Channel"); - - // Create Variables, x0 will be put into channel, - // result will be pulled from channel - CreateVariable(&scope, place, "Status", false); - CreateVariable(&scope, place, "x0", 99); - CreateVariable(&scope, place, "result", 0); - - framework::Executor executor(place); - ProgramDesc program; - BlockDesc *block = program.MutableBlock(0); - - // Create channel OP - AddOp("channel_create", {}, {{"Out", {"Channel"}}}, - {{"capacity", 10}, {"data_type", f::proto::VarType::LOD_TENSOR}}, - block); - - // Create Go Op routine - BlockDesc *goOpBlock = program.AppendBlock(program.Block(0)); - AddOp("channel_send", {{"Channel", {"Channel"}}, {"X", {"x0"}}}, - {{"Status", {"Status"}}}, {}, goOpBlock); - - // Create Go Op - AddOp("go", {{"X", {"Channel", "x0"}}}, {}, {{"sub_block", goOpBlock}}, - block); - - // Create Channel Receive Op - AddOp("channel_recv", {{"Channel", {"Channel"}}}, - {{"Status", {"Status"}}, {"Out", {"result"}}}, {}, block); - - // Create Channel Close Op - AddOp("channel_close", {{"Channel", {"Channel"}}}, {}, {}, block); - - // Check the result tensor to make sure it is set to 0 - const LoDTensor &tensor = (scope.FindVar("result"))->Get(); - auto *initialData = tensor.data(); - EXPECT_EQ(initialData[0], 0); - - executor.Run(program, &scope, 0, true, true); - - // After we call executor.run, the Go operator should do a channel_send to - // set the "result" variable to 99. - auto *finalData = tensor.data(); - EXPECT_EQ(finalData[0], 99); -} - -/** - * This test implements the fibonacci function using go_op and select_op - */ -TEST(Concurrency, Select) { - Scope scope; - p::CPUPlace place; - - // Initialize scope variables - p::CPUDeviceContext ctx(place); - - CreateVariable(&scope, place, "Status", false); - CreateVariable(&scope, place, "result", 0); - CreateVariable(&scope, place, "currentXFib", 0); - - framework::Executor executor(place); - ProgramDesc program; - BlockDesc *block = program.MutableBlock(0); - - // Create channel OP - std::string dataChanName = "Channel"; - scope.Var(dataChanName); - AddOp("channel_create", {}, {{"Out", {dataChanName}}}, - {{"capacity", 0}, {"data_type", f::proto::VarType::LOD_TENSOR}}, block); - - std::string quitChanName = "Quit"; - scope.Var(quitChanName); - AddOp("channel_create", {}, {{"Out", {quitChanName}}}, - {{"capacity", 0}, {"data_type", f::proto::VarType::LOD_TENSOR}}, block); - - // Create Go Op routine, which loops 10 times over fibonacci sequence - CreateVariable(&scope, place, "xReceiveVar", 0); - - BlockDesc *goOpBlock = program.AppendBlock(program.Block(0)); - for (int i = 0; i < 10; ++i) { - AddOp("channel_recv", {{"Channel", {dataChanName}}}, - {{"Status", {"Status"}}, {"Out", {"currentXFib"}}}, {}, goOpBlock); - AddOp("print", {{"In", {"currentXFib"}}}, {{"Out", {"currentXFib"}}}, - {{"first_n", 100}, - {"summarize", -1}, - {"print_tensor_name", false}, - {"print_tensor_type", true}, - {"print_tensor_shape", false}, - {"print_tensor_lod", false}, - {"print_phase", std::string("FORWARD")}, - {"message", std::string("X: ")}}, - goOpBlock); - } - - CreateVariable(&scope, place, "quitSignal", 0); - AddOp("channel_send", {{"Channel", {quitChanName}}, {"X", {"quitSignal"}}}, - {{"Status", {"Status"}}}, {}, goOpBlock); - - // Create Go Op - AddOp("go", {{"X", {dataChanName, quitChanName}}}, {}, - {{"sub_block", goOpBlock}}, block); - - AddFibonacciSelect(&scope, &place, &program, block, dataChanName, - quitChanName); - - // Create Channel Close Op - AddOp("channel_close", {{"Channel", {dataChanName}}}, {}, {}, block); - AddOp("channel_close", {{"Channel", {quitChanName}}}, {}, {}, block); - - executor.Run(program, &scope, 0, true, true); - - // After we call executor.run, "result" variable should be equal to 34 - // (which is 10 loops through fibonacci sequence) - const LoDTensor &tensor = (scope.FindVar("currentXFib"))->Get(); - auto *finalData = tensor.data(); - EXPECT_EQ(finalData[0], 34); -} - -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/reference_count_pass.cc b/paddle/fluid/framework/details/reference_count_pass.cc index b1ce551ce73de33bcede187c72feebad6e2fa1a5..2d1f688d64ece3322e253b0c070264b9eb73d678 100644 --- a/paddle/fluid/framework/details/reference_count_pass.cc +++ b/paddle/fluid/framework/details/reference_count_pass.cc @@ -80,15 +80,15 @@ std::unique_ptr ReferenceCountPass::ApplyImpl( // This is weird but there is really some variables without var_desc // in computation_op if (var_desc == nullptr) { - if (compute_op->Node()->Op()->Block()->FindVar(var_name) == nullptr) - continue; - } else { - if (var_desc->Persistable()) continue; - auto var_type = var_desc->Proto()->type().type(); - if (var_type != proto::VarType::LOD_TENSOR && - var_type != proto::VarType::SELECTED_ROWS) { - continue; - } + var_desc = compute_op->Node()->Op()->Block()->FindVar(var_name); + if (var_desc == nullptr) continue; + } + + if (var_desc->Persistable()) continue; + auto var_type = var_desc->Proto()->type().type(); + if (var_type != proto::VarType::LOD_TENSOR && + var_type != proto::VarType::SELECTED_ROWS) { + continue; } // compute op only runs in one device diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 8d8042a0563a21dad216ffd53a474322c378ace6..70ec6e90a4d0106b7f838e51b8357798daa4b10d 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -14,7 +14,6 @@ limitations under the License. */ #include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/framework/channel.h" #include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/lod_rank_table.h" #include "paddle/fluid/framework/lod_tensor_array.h" @@ -76,15 +75,13 @@ void InitializeVariable(Variable* var, proto::VarType::Type var_type) { var->GetMutable(); } else if (var_type == proto::VarType::READER) { var->GetMutable(); - } else if (var_type == proto::VarType::CHANNEL) { - var->GetMutable(); } else if (var_type == proto::VarType::RAW) { // GetMutable will be called in operator } else { PADDLE_THROW( "Variable type %d is not in " "[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, " - "LOD_RANK_TABLE, PLACE_LIST, READER, CHANNEL, RAW]", + "LOD_RANK_TABLE, PLACE_LIST, READER, RAW]", var_type); } } diff --git a/paddle/fluid/framework/framework.proto b/paddle/fluid/framework/framework.proto index 460401df5473f8650f450a2bd247a703d91b6048..25f0ba418433571343c5b2bbfdbf9fb940eaec52 100644 --- a/paddle/fluid/framework/framework.proto +++ b/paddle/fluid/framework/framework.proto @@ -126,7 +126,6 @@ message VarType { LOD_TENSOR_ARRAY = 13; PLACE_LIST = 14; READER = 15; - CHANNEL = 16; // Any runtime decided variable type is raw // raw variables should manage their own allocations // in operators like nccl_op @@ -158,12 +157,6 @@ message VarType { message ReaderDesc { repeated LoDTensorDesc lod_tensor = 1; } optional ReaderDesc reader = 5; - message ChannelDesc { - required Type data_type = 1; - required int64 capacity = 2; - } - optional ChannelDesc channel = 6; - message Tuple { repeated Type element_type = 1; } optional Tuple tuple = 7; } diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index a0bf1afd402c4e4eebe13cc3fc43f44f23dccaed..0076a8bece31f9a977b375717c25688fc0c95819 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -1,5 +1,6 @@ set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h) file(WRITE ${pass_file} "// Generated by the paddle/fluid/framework/ir/CMakeLists.txt. DO NOT EDIT!\n\n") +file(APPEND ${pass_file} "\#pragma once\n") file(APPEND ${pass_file} "\#include \"paddle/fluid/framework/ir/pass.h\"\n") @@ -34,6 +35,7 @@ endif () pass_library(attention_lstm_fuse_pass inference) pass_library(infer_clean_graph_pass inference) pass_library(fc_lstm_fuse_pass inference) +pass_library(embedding_fc_lstm_fuse_pass inference) pass_library(fc_gru_fuse_pass inference) pass_library(seq_concat_fc_fuse_pass inference) diff --git a/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..ba11f19c9273650113096be3fa23ca077bbc7dd9 --- /dev/null +++ b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc @@ -0,0 +1,243 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h" +#include +#include +#include "paddle/fluid/framework/lod_tensor.h" + +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/cpu_vec.h" +#include "paddle/fluid/operators/math/fc_compute.h" +#include "paddle/fluid/platform/cpu_info.h" + +namespace paddle { +namespace framework { +namespace ir { + +static int BuildFusion(Graph* graph, const std::string& name_scope, + Scope* scope, bool with_fc_bias) { + GraphPatternDetector gpd; + auto* pattern = gpd.mutable_pattern(); + + // Build pattern + PDNode* x = pattern->NewNode(patterns::PDNodeName(name_scope, "x")) + ->assert_is_op_input("lookup_table") + ->assert_var_not_persistable(); + patterns::Embedding embedding_pattern(pattern, name_scope); + // TODO(jczaja): Intermediate can only be for val that are not used anywhere + // but lookup table output may go into other LSTM (for reverse + // direction) + auto* embedding_out = embedding_pattern(x); + patterns::FC fc_pattern(pattern, name_scope); + + // fc_out is a tmp var, will be removed after fuse, so marked as intermediate. + auto* fc_out = fc_pattern(embedding_out, with_fc_bias)->AsIntermediate(); + patterns::LSTM lstm_pattern(pattern, name_scope); + lstm_pattern(fc_out); + + // Create New OpDesc + auto embedding_lstm_creator = [&](Node* embedding, Node* W, Node* lstm, + Node* input, Node* weight_x, Node* weight_h, + Node* bias, Node* hidden, Node* cell, + Node* xx, Node* fc_bias) { + OpDesc op_desc; + op_desc.SetType("fused_embedding_fc_lstm"); +#define SET_IN(Key, node__) op_desc.SetInput(#Key, {node__->Name()}); + SET_IN(Ids, input); + SET_IN(WeightH, weight_h); + // Neet to have this passed as We need Wc data for peephole connections + SET_IN(Bias, bias); +#undef SET_IN + + // Multiply embeddings with Weights + PADDLE_ENFORCE(scope); + const std::string& embeddings = patterns::UniqueKey("Embeddings"); + auto* embeddings_var = scope->Var(embeddings); + PADDLE_ENFORCE(embeddings_var); + auto* embeddings_tensor = + embeddings_var->GetMutable(); + // Get WeightX size: [single_embedding, fc_size] + // and embedding size: [dict_size, single_embedding] + // and create new size of embeddings eg. [dict_size , hidden_size] + auto* embedding_var = scope->FindVar(W->Name()); + PADDLE_ENFORCE(embedding_var); + const auto& embedding_tensor = embedding_var->Get(); + + const auto& weightx_tensor = + scope->FindVar(weight_x->Name())->Get(); + embeddings_tensor->Resize( + {embedding_tensor.dims()[0], weightx_tensor.dims()[1]}); + + // Multiplie embeddings via WeightsX and add bias + auto embedding_data = embedding_tensor.data(); + auto weightx_data = weightx_tensor.data(); + auto embeddings_data = + embeddings_tensor->mutable_data(platform::CPUPlace()); + + // Adding biases to GEMM result to be + auto* lstm_bias_var = scope->FindVar(bias->Name()); + PADDLE_ENFORCE(lstm_bias_var); + const auto& lstm_bias_tensor = lstm_bias_var->Get(); + + auto alpha = 1.0f; + auto beta = 1.0f; + int m = embedding_tensor.dims()[0]; + int n = weightx_tensor.dims()[1]; + int k = embedding_tensor.dims()[1]; + + // Copy only gate biases values (only actual bias data, not peephole + // weights) + std::vector combined_biases; + combined_biases.reserve(n); + std::copy_n(lstm_bias_tensor.data(), n, + std::back_inserter(combined_biases)); + + if (with_fc_bias) { + // Add FC-bias with LSTM-bias (into GEMM result to be) + auto* fc_bias_var = scope->FindVar(fc_bias->Name()); + const auto& fc_bias_tensor = fc_bias_var->Get(); + for (int i = 0; i < fc_bias_tensor.numel(); i++) { + combined_biases[i] += fc_bias_tensor.data()[i]; + } + } + + // broadcast biases + std::vector ones(m, 1.0f); + paddle::operators::math::CBlas::GEMM( + CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, 1, alpha, &ones[0], 1, + &combined_biases[0], n, 0.0f, embeddings_data, n); + + // Wx*embeddings + biases + paddle::operators::math::CBlas::GEMM( + CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, alpha, + embedding_data, k, weightx_data, n, beta, embeddings_data, n); + op_desc.SetInput("Embeddings", {embeddings}); + + // Create temp variables. + const std::string BatchedInput = patterns::UniqueKey("BatchedInput"); + const std::string BatchedCellPreAct = + patterns::UniqueKey("BatchedCellPreAct"); + const std::string BatchedGate = patterns::UniqueKey("BatchedGate"); + + scope->Var(BatchedInput)->GetMutable(); + scope->Var(BatchedCellPreAct)->GetMutable(); + scope->Var(BatchedGate)->GetMutable(); + + op_desc.SetInput("H0", {}); + op_desc.SetInput("C0", {}); + op_desc.SetOutput("Hidden", {hidden->Name()}); + op_desc.SetOutput("Cell", {cell->Name()}); + op_desc.SetOutput("XX", {xx->Name()}); + op_desc.SetOutput("BatchedGate", {BatchedGate}); + op_desc.SetOutput("BatchCellPreAct", {BatchedCellPreAct}); + op_desc.SetOutput("BatchedInput", {BatchedInput}); + op_desc.SetAttr("is_reverse", lstm->Op()->GetAttr("is_reverse")); + op_desc.SetAttr("use_peepholes", lstm->Op()->GetAttr("use_peepholes")); + // TODO(TJ): get from attr + op_desc.SetAttr("use_seq", true); + + PADDLE_ENFORCE(graph->Has(kParamScopeAttr)); + auto* scope = graph->Get(kParamScopeAttr); +#define OP_SET_OUT(x) \ + const std::string x = patterns::UniqueKey(#x); \ + op_desc.SetOutput(#x, {x}); \ + scope->Var(x)->GetMutable() + OP_SET_OUT(BatchedCell); + OP_SET_OUT(BatchedHidden); + OP_SET_OUT(ReorderedH0); + OP_SET_OUT(ReorderedC0); +#undef OP_SET_OUT + + auto* op = graph->CreateOpNode(&op_desc); + IR_NODE_LINK_TO(input, op); + IR_NODE_LINK_TO(weight_x, op); + IR_NODE_LINK_TO(weight_h, op); + IR_NODE_LINK_TO(bias, op); + IR_NODE_LINK_TO(op, hidden); + return op; + }; + + int fusion_count{0}; + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_IR_NODE_FROM_SUBGRAPH(lstm, lstm, lstm_pattern); + GET_IR_NODE_FROM_SUBGRAPH(Weight, Weight, lstm_pattern); + GET_IR_NODE_FROM_SUBGRAPH(Bias, Bias, lstm_pattern); + GET_IR_NODE_FROM_SUBGRAPH(Cell, Cell, lstm_pattern); + GET_IR_NODE_FROM_SUBGRAPH(Hidden, Hidden, lstm_pattern); + GET_IR_NODE_FROM_SUBGRAPH(lookup_table, lookup_table, embedding_pattern); + GET_IR_NODE_FROM_SUBGRAPH(W, W, embedding_pattern); + GET_IR_NODE_FROM_SUBGRAPH(w, w, fc_pattern); + GET_IR_NODE_FROM_SUBGRAPH(mul, mul, fc_pattern); + + // TODO(jczaja): Add support for is_sparse / is_distributed + auto is_sparse = boost::get(lookup_table->Op()->GetAttr("is_sparse")); + auto is_distributed = + boost::get(lookup_table->Op()->GetAttr("is_distributed")); + + if (is_sparse == true || is_distributed == true) { + return; + } + + if (with_fc_bias) { + GET_IR_NODE_FROM_SUBGRAPH(fc_out, Out, fc_pattern); + GET_IR_NODE_FROM_SUBGRAPH(fc_bias, bias, fc_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_add, elementwise_add, fc_pattern); + embedding_lstm_creator(lookup_table, W, lstm, subgraph.at(x), w, Weight, + Bias, Hidden, Cell, fc_out, fc_bias); + // Remove unneeded nodes. + // TODO(jczaja): Proper removing of lookup table + std::unordered_set marked_nodes( + //{lookup_table, mul, lstm, elementwise_add, fc_bias, W}); + {mul, lstm, elementwise_add, fc_bias}); + GraphSafeRemoveNodes(graph, marked_nodes); + } else { + GET_IR_NODE_FROM_SUBGRAPH(fc_out, mul_out, fc_pattern); + embedding_lstm_creator(lookup_table, W, lstm, subgraph.at(x), w, Weight, + Bias, Hidden, Cell, fc_out, nullptr); + // Remove unneeded nodes. + // TODO(jczaja): Proper removing of lookup table + // std::unordered_set marked_nodes({lookup_table, W, mul, + // lstm}); + std::unordered_set marked_nodes({mul, lstm}); + GraphSafeRemoveNodes(graph, marked_nodes); + } + + ++fusion_count; + }; + + gpd(graph, handler); + + return fusion_count; +} + +std::unique_ptr EmbeddingFCLSTMFusePass::ApplyImpl( + std::unique_ptr graph) const { + FusePassBase::Init(name_scope_, graph.get()); + + int fusion_count = BuildFusion(graph.get(), name_scope_, param_scope(), + true /*with_fc_bias*/); + + AddStatis(fusion_count); + return graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(embedding_fc_lstm_fuse_pass, + paddle::framework::ir::EmbeddingFCLSTMFusePass); diff --git a/paddle/fluid/inference/api/timer.h b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h similarity index 51% rename from paddle/fluid/inference/api/timer.h rename to paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h index 2df5274dc1f2e7ad8e434f1da9d5ae6aee94c784..e5ad3067ec4060e41f1464395f3fc76183de3e66 100644 --- a/paddle/fluid/inference/api/timer.h +++ b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h @@ -11,29 +11,30 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. + #pragma once -#include // NOLINT +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" namespace paddle { -namespace inference { +namespace framework { +namespace ir { + +// Fusing of Embedding , FC and LSTM op -// Timer for timer -class Timer { +// Just FC without bias +class EmbeddingFCLSTMFusePass : public FusePassBase { public: - std::chrono::high_resolution_clock::time_point start; - std::chrono::high_resolution_clock::time_point startu; - - void tic() { start = std::chrono::high_resolution_clock::now(); } - double toc() { - startu = std::chrono::high_resolution_clock::now(); - std::chrono::duration time_span = - std::chrono::duration_cast>(startu - - start); - double used_time_ms = static_cast(time_span.count()) * 1000.0; - return used_time_ms; - } + virtual ~EmbeddingFCLSTMFusePass() {} + + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + + const std::string name_scope_{"embedding_fc_lstm_fuse"}; }; -} // namespace inference +} // namespace ir +} // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 6d2c51b0e9bed8461f6491b84a36a3bf6663a138..46c6a52c09e896596aa6d8e1e901955a68a4957d 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -692,6 +692,24 @@ PDNode *patterns::FC::operator()(paddle::framework::ir::PDNode *x, } } +PDNode *patterns::Embedding::operator()(PDNode *x) { + x->assert_is_op_input("lookup_table", "Ids"); + auto *lookup_table_op = + pattern->NewNode(lookup_table_repr())->assert_is_op("lookup_table"); +#define NEW_NODE(arg__, io__) \ + auto *arg__ = pattern->NewNode(arg__##_repr()) \ + ->assert_is_op_##io__("lookup_table", #arg__); + + NEW_NODE(W, input); + + NEW_NODE(Out, output); +#undef NEW_NODE + + lookup_table_op->LinksFrom({x, W}); + lookup_table_op->LinksTo({Out}); + return Out; +} + PDNode *patterns::LSTM::operator()(PDNode *x) { x->assert_is_op_input("lstm", "Input"); auto *lstm_op = pattern->NewNode(lstm_repr())->assert_is_op("lstm"); diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 69b486c29d8bd1102a8372f5041051c25ce19359..508113bf4fcab274394f2705c36eddbf4ba3c77a 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -418,6 +418,23 @@ struct FC : public PatternBase { PATTERN_DECL_NODE(Out); }; +// Embedding +struct Embedding : public PatternBase { + Embedding(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "embedding") {} + + PDNode* operator()(PDNode* x); + + // declare operator node's name + PATTERN_DECL_NODE(lookup_table); + // Inputs + // + PATTERN_DECL_NODE(Ids); + PATTERN_DECL_NODE(W); // embeddings + // Outputs + PATTERN_DECL_NODE(Out); +}; + struct LSTM : public PatternBase { LSTM(PDPattern* pattern, const std::string& name_scope) : PatternBase(pattern, name_scope, "lstm") {} diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc index f681d4ecef9efe2b51c7154787230e8be2fb2702..ba10687d65cfbbac89cfc76879c8b202ebd03229 100644 --- a/paddle/fluid/framework/naive_executor.cc +++ b/paddle/fluid/framework/naive_executor.cc @@ -12,11 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/naive_executor.h" -#include "paddle/fluid/framework/channel.h" +#include +#include + #include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/lod_rank_table.h" #include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/naive_executor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/string/pretty_log.h" @@ -44,8 +46,6 @@ static void InitializeVariable(Variable *var, proto::VarType::Type var_type) { var->GetMutable(); } else if (var_type == proto::VarType::READER) { var->GetMutable(); - } else if (var_type == proto::VarType::CHANNEL) { - var->GetMutable(); } else if (var_type == proto::VarType::RAW) { // GetMutable will be called in operator } else { @@ -146,5 +146,22 @@ void NaiveExecutor::CleanFeedFetchOps() { ops_.swap(ops); } +void NaiveExecutor::EnableMKLDNN(const ProgramDesc &program) { +#ifdef PADDLE_WITH_MKLDNN + VLOG(3) << "use_mkldnn=True"; + for (size_t block_id = 0; block_id < program.Size(); ++block_id) { + auto *block = const_cast(program).MutableBlock(block_id); + for (auto *op : block->AllOps()) { + if (op->HasAttr("use_mkldnn")) { + op->SetAttr("use_mkldnn", true); + } + } + } +#else + LOG(WARNING) + << "'MKLDNN' is not supported, Please re-compile with WITH_MKLDNN option"; +#endif +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/naive_executor.h b/paddle/fluid/framework/naive_executor.h index 9355e9e36a6358aa91553dca35aaf1b658516a0a..9374f3f4a35cc0f90e5b2d6e8b397784b8eae123 100644 --- a/paddle/fluid/framework/naive_executor.h +++ b/paddle/fluid/framework/naive_executor.h @@ -14,6 +14,8 @@ #pragma once +#include +#include #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" @@ -46,6 +48,8 @@ class NaiveExecutor { void CleanFeedFetchOps(); + void EnableMKLDNN(const ProgramDesc& program); + protected: void CreateVariables(const ProgramDesc& desc, Scope* scope, int block_id); diff --git a/paddle/fluid/framework/op_proto_maker.cc b/paddle/fluid/framework/op_proto_maker.cc index 2663c9be41a834523fb896b490e7e75df256de05..df2a7a27ca4a6011b214202ac9bf4f30dc482ece 100644 --- a/paddle/fluid/framework/op_proto_maker.cc +++ b/paddle/fluid/framework/op_proto_maker.cc @@ -132,9 +132,7 @@ void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto, AddAttr(OpNamescopeAttrName(), "Operator name with namesope.") .SetDefault(""); - AddAttr>(OpCreationCallstackAttrName(), - "Callstack for Op Creatation.") - .SetDefault({}); + Validate(); } diff --git a/paddle/fluid/framework/op_proto_maker.h b/paddle/fluid/framework/op_proto_maker.h index f13196959705bad473a6f7b3ef88f8faa8abe2b8..4ed3cc45d66849267ef4945a03da1db76b53e4ea 100644 --- a/paddle/fluid/framework/op_proto_maker.h +++ b/paddle/fluid/framework/op_proto_maker.h @@ -46,7 +46,6 @@ class OpProtoAndCheckerMaker { static const char *OpRoleAttrName() { return "op_role"; } static const char *OpRoleVarAttrName() { return "op_role_var"; } static const char *OpNamescopeAttrName() { return "op_namescope"; } - static const char *OpCreationCallstackAttrName() { return "op_callstack"; } void operator()(proto::OpProto *proto, OpAttrChecker *attr_checker); diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 96624e33c6323dee7b6534673278b6b1b6343ae0..a103be7191d02a96ee97d76f786f9364938c1c65 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -14,17 +14,15 @@ limitations under the License. */ #define GLOG_NO_ABBREVIATED_SEVERITIES #define GOOGLE_GLOG_DLL_DECL -#include "paddle/fluid/framework/operator.h" #include #include + #include -#include -#include -#include + #include "paddle/fluid/framework/data_transform.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/op_proto_maker.h" +#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/shape_inference.h" #include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/platform/profiler.h" @@ -142,54 +140,19 @@ static LoD GetLoD(const Scope& scope, const std::string& name) { } void OperatorBase::Run(const Scope& scope, const platform::Place& place) { - try { - if (VLOG_IS_ON(4)) { - VLOG(4) << place << " " << DebugStringEx(&scope); - } - if (platform::is_gpu_place(place)) { + VLOG(4) << place << " " << DebugStringEx(&scope); + if (platform::is_gpu_place(place)) { #ifndef PADDLE_WITH_CUDA - PADDLE_THROW("Cannot run operator on place %s", place); + PADDLE_THROW("Cannot run operator on place %s", place); #else - auto dev_id = boost::get(place).device; - platform::SetDeviceId(dev_id); + auto dev_id = boost::get(place).device; + platform::SetDeviceId(dev_id); #endif - } - - if (platform::IsProfileEnabled()) { - platform::DeviceContextPool& pool = - platform::DeviceContextPool::Instance(); - platform::RecordEvent record_event(Type(), pool.Get(place)); - } - - RunImpl(scope, place); - - if (VLOG_IS_ON(3)) { - VLOG(3) << place << " " << DebugStringEx(&scope); - } - } catch (platform::EnforceNotMet exception) { - if (Attrs().count("sub_block") != 0) { - throw exception; - } - - auto& callstack = Attr>( - OpProtoAndCheckerMaker::OpCreationCallstackAttrName()); - - if (callstack.empty()) { - throw exception; - } - std::ostringstream sout; - sout << "Invoke operator " << Type() << " error.\n"; - sout << "Python Callstacks: \n"; - for (auto& line : callstack) { - sout << line; - } - sout << "C++ Callstacks: \n"; - sout << exception.err_str_; - exception.err_str_ = sout.str(); - throw exception; - } catch (...) { - std::rethrow_exception(std::current_exception()); } + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + platform::RecordEvent record_event(Type(), pool.Get(place)); + RunImpl(scope, place); + VLOG(3) << place << " " << DebugStringEx(&scope); } bool OperatorBase::HasInputs(const std::string& name) const { @@ -217,7 +180,7 @@ const std::vector& OperatorBase::Inputs( } bool OperatorBase::HasOutputs(const std::string& name) const { - if (outputs_.end() != outputs_.find(name)) { + if (outputs_.find(name) != outputs_.end()) { return true; } else { return false; diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 720d17a654bf96ca2bad43cc0c4374b2303ac233..f06bad6c78c05804e583f859906b88fb7b500372 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -156,10 +156,12 @@ ParallelExecutor::ParallelExecutor( params, member_->local_scopes_, member_->use_cuda_); #endif - // If the loss_var_name is given, the number of graph should be only one. - if (loss_var_name.size()) { - PADDLE_ENFORCE_EQ(ir::GraphNum(*graph), 1, - "The number of graph should be only one"); + if (VLOG_IS_ON(5)) { + // If the loss_var_name is given, the number of graph should be only one. + if (loss_var_name.size()) { + PADDLE_ENFORCE_EQ(ir::GraphNum(*graph), 1, + "The number of graph should be only one"); + } } if (exec_strategy.type_ == ExecutionStrategy::kDefault) { @@ -248,6 +250,13 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, #ifdef PADDLE_WITH_CUDA if (!gcs_.empty()) { ResetReferenceCount(); + for (auto &pair : cur_ref_cnts_) { + auto &name_map = *(pair.second); + for (auto &fetch_name : fetch_tensors) { + name_map.erase(fetch_name); + } + name_map.erase(fetched_var_name); + } } #endif auto fetch_data = member_->executor_->Run(fetch_tensors); diff --git a/paddle/fluid/framework/rw_lock.h b/paddle/fluid/framework/rw_lock.h index da163835e8652ae479121bd67f2eed77332b2740..dbf00f3a79f7d1dcf97b346fccfdb68f119d4aa3 100644 --- a/paddle/fluid/framework/rw_lock.h +++ b/paddle/fluid/framework/rw_lock.h @@ -46,6 +46,7 @@ struct RWLock { private: pthread_rwlock_t lock_; }; +// TODO(paddle-dev): Support RWLock for WIN32 for correctness. #else // https://stackoverflow.com/questions/7125250/making-pthread-rwlock-wrlock-recursive // In windows, rw_lock seems like a hack. Use empty object and do nothing. diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 40dee143f5d8f64a44bc2469bd5f38b89338ea5d..1a727a2c8c759d010606d5b605823b7252b35c69 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -20,13 +20,6 @@ limitations under the License. */ #include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/string/printf.h" -// The mutex is not needed by training and inference, only for distribution. -#if PADDLE_WITH_DISTRIBUTE -#define WITH_LOCK 1 -#else -#define WITH_LOCK 0 -#endif - DEFINE_bool(benchmark, false, "Doing memory benchmark. It will make deleting scope synchronized, " "and add some memory usage logs." @@ -56,24 +49,18 @@ int64_t GetEagerDeletionThreshold() { Scope::~Scope() { DropKids(); } Scope& Scope::NewScope() const { -#if WITH_LOCK std::unique_lock lock(mutex_); -#endif kids_.push_back(new Scope(this)); return *kids_.back(); } Variable* Scope::Var(const std::string& name) { -#if WITH_LOCK std::unique_lock lock(mutex_); -#endif return VarInternal(name); } Variable* Scope::Var(std::string* name) { -#if WITH_LOCK std::unique_lock lock(mutex_); -#endif auto new_name = string::Sprintf("%p.%d", this, vars_.size()); if (name != nullptr) { *name = new_name; @@ -82,39 +69,29 @@ Variable* Scope::Var(std::string* name) { } Variable* Scope::FindVar(const std::string& name) const { -#if WITH_LOCK std::unique_lock lock(mutex_); -#endif return FindVarInternal(name); } const Scope* Scope::FindScope(const Variable* var) const { -#if WITH_LOCK std::unique_lock lock(mutex_); -#endif return FindScopeInternal(var); } void Scope::DropKids() { -#if WITH_LOCK std::unique_lock lock(mutex_); -#endif for (Scope* s : kids_) delete s; kids_.clear(); } bool Scope::HasKid(const Scope* scope) const { -#if WITH_LOCK std::unique_lock lock(mutex_); -#endif auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); return it != this->kids_.end(); } std::vector Scope::LocalVarNames() const { -#if WITH_LOCK std::unique_lock lock(mutex_); -#endif std::vector known_vars; known_vars.reserve(this->vars_.size()); for (auto& p : vars_) { @@ -124,9 +101,7 @@ std::vector Scope::LocalVarNames() const { } void Scope::DeleteScope(Scope* scope) const { -#if WITH_LOCK std::unique_lock lock(mutex_); -#endif auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope); this->kids_.erase(it); @@ -139,9 +114,7 @@ void Scope::DeleteScope(Scope* scope) const { } void Scope::EraseVars(const std::vector& var_names) { -#if WITH_LOCK std::unique_lock lock(mutex_); -#endif std::set var_set(var_names.begin(), var_names.end()); for (auto it = vars_.begin(); it != vars_.end();) { if (var_set.find(it->first) != var_set.end()) { @@ -154,16 +127,12 @@ void Scope::EraseVars(const std::vector& var_names) { void Scope::Rename(const std::string& origin_name, const std::string& new_name) const { -#if WITH_LOCK std::unique_lock lock(mutex_); -#endif RenameInternal(origin_name, new_name); } std::string Scope::Rename(const std::string& origin_name) const { -#if WITH_LOCK std::unique_lock lock(mutex_); -#endif auto new_name = string::Sprintf("%p.%d", this, vars_.size()); RenameInternal(origin_name, new_name); return new_name; diff --git a/paddle/fluid/framework/tuple.h b/paddle/fluid/framework/tuple.h index f6c6a1fec13d8b12efd1fa71a7a93316e484d045..508ee931c6ed7f66e09abd8f0e4b33c3d3c135fd 100644 --- a/paddle/fluid/framework/tuple.h +++ b/paddle/fluid/framework/tuple.h @@ -17,7 +17,6 @@ limitations under the License. */ #include #include #include -#include "paddle/fluid/framework/channel.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/var_desc.h" diff --git a/paddle/fluid/framework/var_desc.cc b/paddle/fluid/framework/var_desc.cc index 1aa0ae0f7c1946d91736ab61236a65a45c203fe3..7e3f002b53351ba5892aaa50482b21a83db94069 100644 --- a/paddle/fluid/framework/var_desc.cc +++ b/paddle/fluid/framework/var_desc.cc @@ -88,13 +88,7 @@ std::vector> VarDesc::GetShapes() const { } void VarDesc::SetDataType(proto::VarType::Type data_type) { - switch (desc_.type().type()) { - case proto::VarType::CHANNEL: - mutable_channel_desc()->set_data_type(data_type); - break; - default: - mutable_tensor_desc()->set_data_type(data_type); - } + mutable_tensor_desc()->set_data_type(data_type); } void VarDesc::SetDataTypes( @@ -115,13 +109,7 @@ void VarDesc::SetDataTypes( } proto::VarType::Type VarDesc::GetDataType() const { - switch (desc_.type().type()) { - case proto::VarType::CHANNEL: - return channel_desc().data_type(); - break; - default: - return tensor_desc().data_type(); - } + return tensor_desc().data_type(); } std::vector VarDesc::GetDataTypes() const { @@ -134,17 +122,6 @@ std::vector VarDesc::GetDataTypes() const { return res; } -void VarDesc::SetCapacity(int64_t capacity) { - switch (desc_.type().type()) { - case proto::VarType::CHANNEL: - desc_.mutable_type()->mutable_channel()->set_capacity(capacity); - break; - default: - PADDLE_THROW("Setting 'capacity' is not supported by the type of var %s.", - this->Name()); - } -} - void VarDesc::SetLoDLevel(int32_t lod_level) { switch (desc_.type().type()) { case proto::VarType::LOD_TENSOR: @@ -214,19 +191,6 @@ std::vector VarDesc::GetLoDLevels() const { } } -const proto::VarType::ChannelDesc &VarDesc::channel_desc() const { - PADDLE_ENFORCE(desc_.has_type(), "The var's type hasn't been set."); - PADDLE_ENFORCE(desc_.type().has_type(), "The var type hasn't been set."); - switch (desc_.type().type()) { - case proto::VarType::CHANNEL: - return desc_.type().channel(); - default: - PADDLE_THROW( - "Getting 'channel_desc' is not supported by the type of var %s.", - this->Name()); - } -} - const proto::VarType::TensorDesc &VarDesc::tensor_desc() const { PADDLE_ENFORCE(desc_.has_type(), "The var's type hasn't been set."); PADDLE_ENFORCE(desc_.type().has_type(), "The var type hasn't been set."); @@ -262,20 +226,6 @@ std::vector VarDesc::tensor_descs() const { } } -proto::VarType::ChannelDesc *VarDesc::mutable_channel_desc() { - PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set."); - PADDLE_ENFORCE(desc_.type().has_type(), "The var type hasn't been set."); - switch (desc_.type().type()) { - case proto::VarType::CHANNEL: - return desc_.mutable_type()->mutable_channel(); - default: - PADDLE_THROW( - "Getting 'mutable_channel_desc' is not supported by the type of var " - "%s.", - this->Name()); - } -} - proto::VarType::TensorDesc *VarDesc::mutable_tensor_desc() { PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set."); PADDLE_ENFORCE(desc_.type().has_type(), "The var type hasn't been set."); diff --git a/paddle/fluid/framework/var_desc.h b/paddle/fluid/framework/var_desc.h index 9f7a21ef42b8d3e74b6e211d6254294ba1fa2341..e33849ef502fb10b913e7e28cbd0abdb8b8ff9bb 100644 --- a/paddle/fluid/framework/var_desc.h +++ b/paddle/fluid/framework/var_desc.h @@ -87,8 +87,6 @@ class VarDesc { void SetDataTypes( const std::vector &multiple_data_type); - void SetCapacity(int64_t capacity); - proto::VarType::Type GetDataType() const; std::vector GetDataTypes() const; @@ -110,10 +108,8 @@ class VarDesc { void SetPersistable(bool persistable) { desc_.set_persistable(persistable); } private: - const proto::VarType::ChannelDesc &channel_desc() const; const proto::VarType::TensorDesc &tensor_desc() const; std::vector tensor_descs() const; - proto::VarType::ChannelDesc *mutable_channel_desc(); proto::VarType::TensorDesc *mutable_tensor_desc(); std::vector mutable_tensor_descs(); diff --git a/paddle/fluid/framework/var_type.h b/paddle/fluid/framework/var_type.h index e9550dbfb976bee70741158b94b04084919e8271..3b6f1cdb8f24ab20bfc80eeeba88891d7b41d1f9 100644 --- a/paddle/fluid/framework/var_type.h +++ b/paddle/fluid/framework/var_type.h @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#include "paddle/fluid/framework/channel.h" #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_rank_table.h" #include "paddle/fluid/framework/lod_tensor.h" @@ -41,8 +40,6 @@ inline proto::VarType::Type ToVarType(std::type_index type) { return proto::VarType_Type_SELECTED_ROWS; } else if (IsType(type)) { return proto::VarType_Type_READER; - } else if (IsType(type)) { - return proto::VarType_Type_CHANNEL; } else { PADDLE_THROW("ToVarType:Unsupported type %s", type.name()); } @@ -66,9 +63,6 @@ inline void VisitVarType(const framework::Variable& var, Visitor visitor) { case proto::VarType_Type_READER: visitor(var.Get()); return; - case proto::VarType_Type_CHANNEL: - visitor(var.Get()); - return; default: PADDLE_THROW("Not supported visit type, %d", ToVarType(var.Type())); } diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index db381bbc3911ad9650162d9b9012580e5b638828..ec1bc7825dd21628f5c37ea44a154abe7b7e8c73 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -20,7 +20,8 @@ cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api) add_subdirectory(api) # Create static library -cc_library(paddle_fluid DEPS ${fluid_modules} paddle_fluid_api paddle_inference_api analysis_predictor) +cc_library(paddle_fluid DEPS ${fluid_modules} paddle_fluid_api paddle_inference_api + analysis_predictor zero_copy_tensor) if(NOT APPLE) # TODO(liuyiqu: Temporarily disable the link flag because it is not support on Mac. set(LINK_FLAGS "-Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/paddle_fluid.sym") @@ -31,6 +32,7 @@ endif() cc_library(paddle_fluid_shared SHARED SRCS io.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api_impl.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/analysis_predictor.cc + ${CMAKE_CURRENT_SOURCE_DIR}/api/details/zero_copy_tensor.cc DEPS ${fluid_modules} paddle_fluid_api) set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid) diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt index c740ea009f6cfc2ea250d8f1abdd7d442c2a0bb0..d4d2fd4634f9e11f3f002e11e177c332ced49885 100644 --- a/paddle/fluid/inference/analysis/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/CMakeLists.txt @@ -20,8 +20,6 @@ cc_test(test_node SRCS node_tester.cc DEPS analysis) cc_test(test_dot SRCS dot_tester.cc DEPS analysis) cc_binary(inference_analyzer SRCS analyzer_main.cc DEPS analysis paddle_fluid) -set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests) - function (inference_analysis_test TARGET) if(WITH_TESTING) set(options "") diff --git a/paddle/fluid/inference/analysis/analysis_pass.h b/paddle/fluid/inference/analysis/analysis_pass.h index b6edb5529ace2ad5bd1b35bfbee1f7a744457cc3..13805ea4acf936b242bcd86b2faf89813753a9fe 100644 --- a/paddle/fluid/inference/analysis/analysis_pass.h +++ b/paddle/fluid/inference/analysis/analysis_pass.h @@ -41,12 +41,6 @@ class AnalysisPass { // all passes have run. virtual bool Finalize() { return false; } - // Get a Pass appropriate to print the Node this pass operates on. - virtual AnalysisPass *CreatePrinterPass(std::ostream &os, - const std::string &banner) const { - return nullptr; - } - // Create a debugger Pass that draw the DFG by graphviz toolkit. virtual AnalysisPass *CreateGraphvizDebugerPass() const { return nullptr; } diff --git a/paddle/fluid/inference/analysis/analyzer.h b/paddle/fluid/inference/analysis/analyzer.h index 9bdbefc07cbc4bf7a4714927c84855837610430e..0aa9367bf5692e53e2a1f1247523cf9a4f0b3a1d 100644 --- a/paddle/fluid/inference/analysis/analyzer.h +++ b/paddle/fluid/inference/analysis/analyzer.h @@ -64,14 +64,15 @@ class Analyzer : public OrderedRegistry { // larger fusion. const std::vector all_ir_passes_{{ // Manual update the passes here. - "infer_clean_graph_pass", // - "attention_lstm_fuse_pass", // - "fc_lstm_fuse_pass", // - "mul_lstm_fuse_pass", // - "fc_gru_fuse_pass", // - "mul_gru_fuse_pass", // - "seq_concat_fc_fuse_pass", // - "fc_fuse_pass", // + "infer_clean_graph_pass", // + "attention_lstm_fuse_pass", // + "embedding_fc_lstm_fuse_pass", // + "fc_lstm_fuse_pass", // + "mul_lstm_fuse_pass", // + "fc_gru_fuse_pass", // + "mul_gru_fuse_pass", // + "seq_concat_fc_fuse_pass", // + "fc_fuse_pass", // #ifdef PADDLE_WITH_MKLDNN "conv_relu_mkldnn_fuse_pass", // #endif diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index 32d58b87413c95908644ffba31bbec22d8e23201..0ddd5d53f836131fe37d412fc867cb38f11ee2b5 100644 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -31,7 +31,6 @@ function(inference_api_test TARGET_NAME) set(multiValueArgs ARGS) cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests) cc_test(${TARGET_NAME} SRCS ${inference_test_SRC} DEPS "${inference_deps}" diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 0c11694d5a905be4d9f0c6ebbc6159a4dc4a346e..3bc6af5241c41bd805699121d614d431d46d863f 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -24,7 +24,6 @@ #include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_pass.h" -#include "paddle/fluid/inference/api/timer.h" #include "paddle/fluid/inference/utils/singleton.h" #include "paddle/fluid/platform/profiler.h" @@ -72,6 +71,11 @@ bool AnalysisPredictor::Init( } else { inference_program_ = program; } + + if (config_._use_mkldnn) { + executor_->EnableMKLDNN(*inference_program_); + } + executor_->Prepare(scope_.get(), *inference_program_, 0, config_.use_feed_fetch_ops); @@ -93,6 +97,7 @@ bool AnalysisPredictor::Run(const std::vector &inputs, LOG(ERROR) << "fail to set feed"; return false; } + // Run the inference program // if share variables, we need not create variables executor_->Run(); diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index 53740899cd4176ae007c09b7728e504675d13248..6682e0a81b20c82aa668a249d37986386d769c83 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -23,7 +23,6 @@ limitations under the License. */ #include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/inference/api/api_impl.h" #include "paddle/fluid/inference/api/helper.h" -#include "paddle/fluid/inference/api/timer.h" #include "paddle/fluid/platform/profiler.h" DEFINE_bool(profile, false, "Turn on profiler for fluid"); diff --git a/paddle/fluid/inference/api/api_impl_tester.cc b/paddle/fluid/inference/api/api_impl_tester.cc index 106a941b2954bc7490c4ee6380b5249e126fbfb3..bed7c871311e476b4ed113d982c690383ad732de 100644 --- a/paddle/fluid/inference/api/api_impl_tester.cc +++ b/paddle/fluid/inference/api/api_impl_tester.cc @@ -21,6 +21,12 @@ limitations under the License. */ #include "paddle/fluid/inference/api/api_impl.h" #include "paddle/fluid/inference/tests/test_helper.h" +#ifdef __clang__ +#define ACC_DIFF 4e-3 +#else +#define ACC_DIFF 1e-3 +#endif + DEFINE_string(dirname, "", "Directory of the inference model."); namespace paddle { @@ -99,8 +105,8 @@ void MainWord2Vec(bool use_gpu) { float* lod_data = output1.data(); for (int i = 0; i < output1.numel(); ++i) { - EXPECT_LT(lod_data[i] - data[i], 1e-3); - EXPECT_GT(lod_data[i] - data[i], -1e-3); + EXPECT_LT(lod_data[i] - data[i], ACC_DIFF); + EXPECT_GT(lod_data[i] - data[i], -ACC_DIFF); } } @@ -144,7 +150,7 @@ void MainImageClassification(bool use_gpu) { float* data = static_cast(outputs[0].data.data()); float* lod_data = output1.data(); for (size_t j = 0; j < len / sizeof(float); ++j) { - EXPECT_NEAR(lod_data[j], data[j], 1e-3); + EXPECT_NEAR(lod_data[j], data[j], ACC_DIFF); } } @@ -199,7 +205,7 @@ void MainThreadsWord2Vec(bool use_gpu) { float* ref_data = refs[tid].data(); EXPECT_EQ(refs[tid].numel(), static_cast(len / sizeof(float))); for (int i = 0; i < refs[tid].numel(); ++i) { - EXPECT_NEAR(ref_data[i], data[i], 1e-3); + EXPECT_NEAR(ref_data[i], data[i], ACC_DIFF); } }); } @@ -251,7 +257,7 @@ void MainThreadsImageClassification(bool use_gpu) { float* ref_data = refs[tid].data(); EXPECT_EQ((size_t)refs[tid].numel(), len / sizeof(float)); for (int i = 0; i < refs[tid].numel(); ++i) { - EXPECT_NEAR(ref_data[i], data[i], 1e-3); + EXPECT_NEAR(ref_data[i], data[i], ACC_DIFF); } }); } diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh index 0f7d541c5edfc62e80cf50f83b491f06dcb42644..44335a872f2e00b34e29a9e7601cb390a460362c 100755 --- a/paddle/fluid/inference/api/demo_ci/run.sh +++ b/paddle/fluid/inference/api/demo_ci/run.sh @@ -2,6 +2,9 @@ set -x PADDLE_ROOT=$1 TURN_ON_MKL=$2 # use MKL or Openblas TEST_GPU_CPU=$3 # test both GPU/CPU mode or only CPU mode +DATA_DIR=$4 # dataset +cd `dirname $0` +current_dir=`pwd` if [ $2 == ON ]; then # You can export yourself if move the install path MKL_LIB=${PADDLE_ROOT}/build/fluid_install_dir/third_party/install/mklml/lib @@ -29,15 +32,15 @@ function download() { fi cd .. } -mkdir -p data -cd data +mkdir -p $DATA_DIR +cd $DATA_DIR vis_demo_list='se_resnext50 ocr mobilenet' for vis_demo_name in $vis_demo_list; do download $vis_demo_name done -cd .. # compile and test the demo +cd $current_dir mkdir -p build cd build @@ -73,9 +76,9 @@ for WITH_STATIC_LIB in ON OFF; do for use_gpu in $use_gpu_list; do for vis_demo_name in $vis_demo_list; do ./vis_demo \ - --modeldir=../data/$vis_demo_name/model \ - --data=../data/$vis_demo_name/data.txt \ - --refer=../data/$vis_demo_name/result.txt \ + --modeldir=$DATA_DIR/$vis_demo_name/model \ + --data=$DATA_DIR/$vis_demo_name/data.txt \ + --refer=$DATA_DIR/$vis_demo_name/result.txt \ --use_gpu=$use_gpu if [ $? -ne 0 ]; then echo "vis demo $vis_demo_name runs fail." diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h index dbbd3f6a6786a4a4849002878263353919e8f31b..24f59cf43a9700ff1732e1ef6ad82e1a6294eede 100644 --- a/paddle/fluid/inference/api/helper.h +++ b/paddle/fluid/inference/api/helper.h @@ -16,19 +16,34 @@ #include #include -#include +#include // NOLINT #include #include #include #include -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/inference/api/paddle_inference_api.h" -#include "paddle/fluid/inference/api/timer.h" #include "paddle/fluid/string/printf.h" +#include "paddle_inference_api.h" namespace paddle { namespace inference { +// Timer for timer +class Timer { + public: + std::chrono::high_resolution_clock::time_point start; + std::chrono::high_resolution_clock::time_point startu; + + void tic() { start = std::chrono::high_resolution_clock::now(); } + double toc() { + startu = std::chrono::high_resolution_clock::now(); + std::chrono::duration time_span = + std::chrono::duration_cast>(startu - + start); + double used_time_ms = static_cast(time_span.count()) * 1000.0; + return used_time_ms; + } +}; + static void split(const std::string &str, char sep, std::vector *pieces) { pieces->clear(); @@ -154,127 +169,5 @@ static void PrintTime(int batch_size, int repeat, int num_threads, int tid, } } -template -std::string LoDTensorSummary(const framework::LoDTensor &tensor) { - std::stringstream ss; - ss << "\n---- tensor ---" << '\n'; - ss << "lod: ["; - for (const auto &level : tensor.lod()) { - ss << "[ "; - for (auto i : level) { - ss << i << ", "; - } - ss << "]"; - } - ss << "]\n"; - - ss << "shape: ["; - int size = 1; - for (int i = 0; i < tensor.dims().size(); i++) { - int dim = tensor.dims()[i]; - ss << dim << ", "; - size *= dim; - } - ss << "]\n"; - - ss << "data: "; - for (int i = 0; i < std::min(20, size); i++) { - ss << tensor.data()[i] << " "; - } - ss << "\n"; - - return ss.str(); -} - -static bool CompareLoD(const framework::LoD &a, const framework::LoD &b) { - if (a.size() != b.size()) { - LOG(ERROR) << string::Sprintf("lod size not match %d != %d", a.size(), - b.size()); - return false; - } - for (size_t i = 0; i < a.size(); i++) { - auto &al = a[i]; - auto &bl = b[i]; - if (al.size() != bl.size()) { - LOG(ERROR) << string::Sprintf("level size %d != %d", al.size(), - bl.size()); - return false; - } - } - return true; -} - -static bool CompareShape(const std::vector &a, - const std::vector &b) { - if (a.size() != b.size()) { - LOG(ERROR) << string::Sprintf("shape size not match %d != %d", a.size(), - b.size()); - return false; - } - for (size_t i = 0; i < a.size(); i++) { - if (a[i] != b[i]) { - LOG(ERROR) << string::Sprintf("shape %d-th element not match %d != %d", i, - a[i], b[i]); - return false; - } - } - return true; -} - -static bool CompareTensorData(const framework::LoDTensor &a, - const framework::LoDTensor &b) { - auto a_shape = framework::vectorize(a.dims()); - auto b_shape = framework::vectorize(b.dims()); - size_t a_size = std::accumulate(a_shape.begin(), a_shape.end(), 1, - [](int a, int b) { return a * b; }); - size_t b_size = std::accumulate(b_shape.begin(), b_shape.end(), 1, - [](int a, int b) { return a * b; }); - if (a_size != b_size) { - LOG(ERROR) << string::Sprintf("tensor data size not match, %d != %d", - a_size, b_size); - } - - for (size_t i = 0; i < a_size; i++) { - if (a.type() == typeid(float)) { - const auto *a_data = a.data(); - const auto *b_data = b.data(); - if (std::abs(a_data[i] - b_data[i]) > 1e-3) { - LOG(ERROR) << string::Sprintf( - "tensor data %d-th element not match, %f != %f", i, a_data[i], - b_data[i]); - return false; - } - } else if (a.type() == typeid(int64_t)) { - const auto *a_data = a.data(); - const auto *b_data = b.data(); - if (std::abs(a_data[i] - b_data[i]) > 1e-3) { - LOG(ERROR) << string::Sprintf( - "tensor data %d-th element not match, %f != %f", i, a_data[i], - b_data[i]); - return false; - } - } - } - - return true; -} - -static bool CompareTensor(const framework::LoDTensor &a, - const framework::LoDTensor &b) { - if (!CompareLoD(a.lod(), b.lod())) { - return false; - } - if (!CompareShape(framework::vectorize(a.dims()), - framework::vectorize(b.dims()))) { - return false; - } - - if (!CompareTensorData(a, b)) { - return false; - } - - return true; -} - } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h index 3aa5c614687953f824fc5a94e8bde29090dbeb5d..d2876dc27c8826c2f27be21fa0b9fef92d03067a 100644 --- a/paddle/fluid/inference/api/paddle_inference_api.h +++ b/paddle/fluid/inference/api/paddle_inference_api.h @@ -263,14 +263,13 @@ struct AnalysisConfig : public NativeConfig { bool enable_ir_optim = true; // Manually determine the IR passes to run. IrPassMode ir_mode{IrPassMode::kExclude}; - std::vector ir_passes; + std::vector ir_passes{"embedding_fc_lstm_fuse_pass"}; // NOT stable yet. bool use_feed_fetch_ops{true}; - // NOTE this is just for internal development, please not use it. NOT - // stable - // yet. + // NOTE this is just for internal development, please not use it. + // NOT stable yet. bool _use_mkldnn{false}; }; diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 70f9e397c96cf3fe92779778950f3df71b5a67c9..c3dd1f433691e1c96e9f38ef7b595befad26408f 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -70,6 +70,14 @@ if (NOT EXISTS ${OCR_INSTALL_DIR}) endif() inference_analysis_api_test(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_tester.cc) +# resnet50 +set(RESNET50_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/resnet50") +if (NOT EXISTS ${RESNET50_INSTALL_DIR}) + inference_download_and_uncompress(${RESNET50_INSTALL_DIR} ${INFERENCE_URL} "resnet50_model.tar.gz") +endif() +inference_analysis_test(test_analyzer_resnet50 SRCS analyzer_resnet50_tester.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${RESNET50_INSTALL_DIR}/model) + # anakin if (WITH_ANAKIN AND WITH_MKL) # only needed in CI # anakin rnn1 diff --git a/paddle/fluid/inference/tests/api/anakin_rnn1_tester.cc b/paddle/fluid/inference/tests/api/anakin_rnn1_tester.cc index 82bc83988de688e46613e160b66943c89c4a0391..c4022225fd4526998af8526d0afb87e7a5be6336 100644 --- a/paddle/fluid/inference/tests/api/anakin_rnn1_tester.cc +++ b/paddle/fluid/inference/tests/api/anakin_rnn1_tester.cc @@ -22,7 +22,6 @@ limitations under the License. */ #include #include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" -#include "paddle/fluid/inference/api/timer.h" #include "utils/logger/logger.h" DEFINE_string(model, "", "Directory of the inference model."); diff --git a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..290fb007d8ba94a2d121947fe67c6474586ac0e0 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc @@ -0,0 +1,96 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +void SetConfig(AnalysisConfig *cfg) { + cfg->param_file = FLAGS_infer_model + "/params"; + cfg->prog_file = FLAGS_infer_model + "/model"; + cfg->use_gpu = false; + cfg->device = 0; + cfg->enable_ir_optim = true; + cfg->specify_input_name = true; +} + +void SetInput(std::vector> *inputs) { + PADDLE_ENFORCE_EQ(FLAGS_test_all_data, 0, "Only have single batch of data."); + + PaddleTensor input; + // channel=3, height/width=318 + std::vector shape({FLAGS_batch_size, 3, 318, 318}); + input.shape = shape; + input.dtype = PaddleDType::FLOAT32; + + // fill input data, for profile easily, do not use random data here. + size_t size = FLAGS_batch_size * 3 * 318 * 318; + input.data.Resize(size * sizeof(float)); + float *input_data = static_cast(input.data.data()); + for (size_t i = 0; i < size; i++) { + *(input_data + i) = static_cast(i) / size; + } + + std::vector input_slots; + input_slots.assign({input}); + (*inputs).emplace_back(input_slots); +} + +// Easy for profiling independently. +TEST(Analyzer_resnet50, profile) { + AnalysisConfig cfg; + SetConfig(&cfg); + std::vector outputs; + + std::vector> input_slots_all; + SetInput(&input_slots_all); + TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads); + + if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) { + PADDLE_ENFORCE_EQ(outputs.size(), 1UL); + size_t size = GetSize(outputs[0]); + // output is a 512-dimension feature + EXPECT_EQ(size, 512 * FLAGS_batch_size); + } +} + +// Check the fuse status +TEST(Analyzer_resnet50, fuse_statis) { + AnalysisConfig cfg; + SetConfig(&cfg); + int num_ops; + auto predictor = CreatePaddlePredictor(cfg); + auto fuse_statis = GetFuseStatis( + static_cast(predictor.get()), &num_ops); + ASSERT_TRUE(fuse_statis.count("fc_fuse")); + EXPECT_EQ(fuse_statis.at("fc_fuse"), 1); +} + +// Compare result of NativeConfig and AnalysisConfig +TEST(Analyzer_resnet50, compare) { + AnalysisConfig cfg; + SetConfig(&cfg); + + std::vector> input_slots_all; + SetInput(&input_slots_all); + CompareNativeAndAnalysis(cfg, input_slots_all); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc index d2e344111bdf84c936bbef7ff51246b0f248f41d..c76d72ccd99649913aefcb2aa57fe6061db8ca6d 100644 --- a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc @@ -12,7 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/inference/api/analysis_predictor.h" #include "paddle/fluid/inference/tests/api/tester_helper.h" DEFINE_bool(with_precision_check, true, "turn on test"); @@ -271,10 +270,11 @@ TEST(Analyzer_rnn1, multi_thread) { std::vector> input_slots_all; SetInput(&input_slots_all); - TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads); + TestPrediction(cfg, input_slots_all, &outputs, 4 /* multi_thread */); } -bool CompareTensors(framework::Scope &a_scope, framework::Scope &b_scope, +bool CompareTensors(const framework::Scope &a_scope, + const framework::Scope &b_scope, const std::vector &tensors) { for (auto &x : tensors) { auto *a_var = a_scope.FindVar(x); diff --git a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc index 340ef152f0b1a15a451f840b36ae845ef4984740..ca19475bda372398d425b0fa6f9a732cd79a8166 100644 --- a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc @@ -104,5 +104,18 @@ TEST(Analyzer_Text_Classification, compare) { CompareNativeAndAnalysis(cfg, input_slots_all); } +TEST(Analyzer_Text_Classification, compare_against_embedding_fc_lstm_fused) { + AnalysisConfig cfg; + SetConfig(&cfg); + // Enable embedding_fc_lstm_fuse_pass (disabled by default) + auto it = std::find(cfg.ir_passes.begin(), cfg.ir_passes.end(), + "embedding_fc_lstm_fuse_pass"); + if (it != cfg.ir_passes.end()) cfg.ir_passes.erase(it); + + std::vector> input_slots_all; + SetInput(&input_slots_all); + CompareNativeAndAnalysis(cfg, input_slots_all); +} + } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc index a2e86305b85dd893f578e97e0105fec828916fb4..305b8bfe158150d5dfd8bdaee2c0a89afe264de4 100644 --- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc @@ -61,8 +61,6 @@ void SetConfig(AnalysisConfig *cfg) { cfg->ir_passes.push_back("fc_gru_fuse_pass"); #ifdef PADDLE_WITH_MKLDNN cfg->_use_mkldnn = true; - // disable mkldnn fuse since it should have some bugs - cfg->ir_passes.push_back("conv_relu_mkldnn_fuse_pass"); #endif } diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index cb36ddc8c879b1aff9838bba90364b17d53aa84e..8603d09cbd09e4cee254faf52c2ccbe8d661994d 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -15,6 +15,7 @@ #pragma once #include +#include #include #include // NOLINT #include @@ -182,5 +183,127 @@ void CompareNativeAndAnalysis( CompareResult(analysis_outputs, native_outputs); } +template +std::string LoDTensorSummary(const framework::LoDTensor &tensor) { + std::stringstream ss; + ss << "\n---- tensor ---" << '\n'; + ss << "lod: ["; + for (const auto &level : tensor.lod()) { + ss << "[ "; + for (auto i : level) { + ss << i << ", "; + } + ss << "]"; + } + ss << "]\n"; + + ss << "shape: ["; + int size = 1; + for (int i = 0; i < tensor.dims().size(); i++) { + int dim = tensor.dims()[i]; + ss << dim << ", "; + size *= dim; + } + ss << "]\n"; + + ss << "data: "; + for (int i = 0; i < std::min(20, size); i++) { + ss << tensor.data()[i] << " "; + } + ss << "\n"; + + return ss.str(); +} + +static bool CompareLoD(const framework::LoD &a, const framework::LoD &b) { + if (a.size() != b.size()) { + LOG(ERROR) << string::Sprintf("lod size not match %d != %d", a.size(), + b.size()); + return false; + } + for (size_t i = 0; i < a.size(); i++) { + auto &al = a[i]; + auto &bl = b[i]; + if (al.size() != bl.size()) { + LOG(ERROR) << string::Sprintf("level size %d != %d", al.size(), + bl.size()); + return false; + } + } + return true; +} + +static bool CompareShape(const std::vector &a, + const std::vector &b) { + if (a.size() != b.size()) { + LOG(ERROR) << string::Sprintf("shape size not match %d != %d", a.size(), + b.size()); + return false; + } + for (size_t i = 0; i < a.size(); i++) { + if (a[i] != b[i]) { + LOG(ERROR) << string::Sprintf("shape %d-th element not match %d != %d", i, + a[i], b[i]); + return false; + } + } + return true; +} + +static bool CompareTensorData(const framework::LoDTensor &a, + const framework::LoDTensor &b) { + auto a_shape = framework::vectorize(a.dims()); + auto b_shape = framework::vectorize(b.dims()); + size_t a_size = std::accumulate(a_shape.begin(), a_shape.end(), 1, + [](int a, int b) { return a * b; }); + size_t b_size = std::accumulate(b_shape.begin(), b_shape.end(), 1, + [](int a, int b) { return a * b; }); + if (a_size != b_size) { + LOG(ERROR) << string::Sprintf("tensor data size not match, %d != %d", + a_size, b_size); + } + + for (size_t i = 0; i < a_size; i++) { + if (a.type() == typeid(float)) { + const auto *a_data = a.data(); + const auto *b_data = b.data(); + if (std::abs(a_data[i] - b_data[i]) > 1e-3) { + LOG(ERROR) << string::Sprintf( + "tensor data %d-th element not match, %f != %f", i, a_data[i], + b_data[i]); + return false; + } + } else if (a.type() == typeid(int64_t)) { + const auto *a_data = a.data(); + const auto *b_data = b.data(); + if (std::abs(a_data[i] - b_data[i]) > 1e-3) { + LOG(ERROR) << string::Sprintf( + "tensor data %d-th element not match, %f != %f", i, a_data[i], + b_data[i]); + return false; + } + } + } + + return true; +} + +static bool CompareTensor(const framework::LoDTensor &a, + const framework::LoDTensor &b) { + if (!CompareLoD(a.lod(), b.lod())) { + return false; + } + if (!CompareShape(framework::vectorize(a.dims()), + framework::vectorize(b.dims()))) { + return false; + } + + if (!CompareTensorData(a, b)) { + return false; + } + + return true; +} + } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tests/book/CMakeLists.txt b/paddle/fluid/inference/tests/book/CMakeLists.txt index 017fc4cd7b11c150cb941fffca2606a4d707330f..977155440df5294216382cff1c67c2aaca1f546d 100644 --- a/paddle/fluid/inference/tests/book/CMakeLists.txt +++ b/paddle/fluid/inference/tests/book/CMakeLists.txt @@ -4,7 +4,6 @@ function(inference_test TARGET_NAME) set(multiValueArgs ARGS) cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests) set(arg_list "") if(inference_test_ARGS) foreach(arg ${inference_test_ARGS}) diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 9c67df7bdfb2c4e5d1c9fe60676c412ab11b4fa5..2ef13b72ed3ff6ae8ad8748ddea977e693615ac6 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -82,10 +82,11 @@ function(op_library TARGET) if (${cc_srcs_len} EQUAL 0) message(FATAL_ERROR "The op library ${TARGET} should contains at least one .cc file") endif() - - #remove windows unsupported op if (WIN32) - foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op") + # remove windows unsupported op, because windows has no nccl, no warpctc such ops. + foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op" "hierarchical_sigmoid_op" + "crf_decoding_op" "select_op" "lstmp_op" "gru_op" "fusion_gru_op" "lstm_op" "fusion_lstm_op" "cumsum_op" + "channel_send_op" "channel_create_op" "channel_close_op" "channel_recv_op") if ("${TARGET}" STREQUAL "${windows_unsupport_op}") return() endif() @@ -281,10 +282,12 @@ op_library(array_to_lod_tensor_op DEPS lod_rank_table_op) op_library(max_sequence_len_op DEPS lod_rank_table) op_library(sequence_conv_op DEPS context_project) op_library(sequence_pool_op DEPS sequence_pooling) +if (NOT WIN32) op_library(lstm_op DEPS sequence2batch lstm_compute) op_library(hierarchical_sigmoid_op DEPS matrix_bit_code) op_library(lstmp_op DEPS sequence2batch lstm_compute) op_library(gru_op DEPS sequence2batch gru_compute) +endif(NOT WIN32) op_library(recurrent_op DEPS executor) op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale) op_library(cos_sim_op DEPS cos_sim_functor) @@ -297,10 +300,10 @@ op_library(sequence_pad_op DEPS sequence_padding) op_library(unstack_op DEPS stack_op) op_library(fake_quantize_op DEPS memory) op_library(fusion_lstm_op DEPS cpu_lstm_compute) - if (WITH_GPU) op_library(conv_op DEPS vol2col depthwise_conv im2col) op_library(layer_norm_op DEPS cub) + op_library(reduce_mean_op DEPS cub) else() op_library(conv_op DEPS vol2col im2col) endif() @@ -313,11 +316,6 @@ op_library(save_combine_op DEPS lod_tensor) op_library(load_combine_op DEPS lod_tensor) op_library(concat_op DEPS concat) -# FIXME(thuan): Move CSP operators to paddle/fluid/framework/operators/concurrency -add_subdirectory(concurrency) -op_library(channel_send_op DEPS concurrency) -op_library(channel_recv_op DEPS concurrency) - list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS}) foreach(src ${GENERAL_OPS}) diff --git a/paddle/fluid/operators/channel_close_op.cc b/paddle/fluid/operators/channel_close_op.cc deleted file mode 100644 index 8e2db250a069c488ee98f618bc03df6485022456..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/channel_close_op.cc +++ /dev/null @@ -1,70 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/channel.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace pf = paddle::framework; -static constexpr char kChannel[] = "Channel"; - -namespace paddle { -namespace operators { - -class ChannelCloseOp : public framework::OperatorBase { - public: - ChannelCloseOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : framework::OperatorBase(type, inputs, outputs, attrs) {} - - private: - void RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const override { - auto &inp = *scope.FindVar(Input(kChannel)); - - // Get the mutable version of the channel variable and closes it. - pf::ChannelHolder *ch = inp.GetMutable(); - ch->close(); - } -}; - -class ChannelCloseOpOpInferShape : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext *context) const override { - PADDLE_ENFORCE(context->HasInput("Channel"), - "The input of ChannelClose op must be set"); - } -}; - -class ChannelCloseOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput(kChannel, - "The Channel Variable that should be closed by" - " the ChannelClose Op."); - AddComment(R"DOC( -Channel Close Operator. - -This operator closes an open channel. -)DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OPERATOR(channel_close, paddle::operators::ChannelCloseOp, - paddle::framework::EmptyGradOpMaker, - paddle::operators::ChannelCloseOpMaker); diff --git a/paddle/fluid/operators/channel_create_op.cc b/paddle/fluid/operators/channel_create_op.cc deleted file mode 100644 index a7f59e4088e3fb328e5b5a83eed65f0f90edb9f0..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/channel_create_op.cc +++ /dev/null @@ -1,113 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/channel.h" -#include "paddle/fluid/framework/lod_rank_table.h" -#include "paddle/fluid/framework/lod_tensor_array.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/reader.h" - -namespace pf = paddle::framework; - -static constexpr char kOutput[] = "Out"; - -namespace paddle { -namespace operators { - -class ChannelCreateOp : public framework::OperatorBase { - public: - ChannelCreateOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : framework::OperatorBase(type, inputs, outputs, attrs) {} - - private: - void RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const override { - auto &out = *scope.FindVar(Output(kOutput)); - - // Determine the datatype and capacity of the channel to be created - // from the attributes provided. - auto dtype = - static_cast(Attr("data_type")); - auto capacity = Attr("capacity"); - - // Based on the datatype, create a new channel holder initialized with - // the given capacity. When capacity is 0, an unbuffered channel is - // created. - pf::ChannelHolder *ch = out.GetMutable(); - if (dtype == framework::proto::VarType::LOD_TENSOR) { - ch->Reset(capacity); - } else if (dtype == framework::proto::VarType::SELECTED_ROWS) { - ch->Reset(capacity); - } else if (dtype == framework::proto::VarType::LOD_RANK_TABLE) { - ch->Reset(capacity); - } else if (dtype == framework::proto::VarType::LOD_TENSOR_ARRAY) { - ch->Reset(capacity); - } else if (dtype == framework::proto::VarType::READER) { - ch->Reset(capacity); - } else if (dtype == framework::proto::VarType::CHANNEL) { - ch->Reset(capacity); - } else if (dtype == framework::proto::VarType::BOOL) { - ch->Reset(capacity); - } else if (dtype == framework::proto::VarType::INT32) { - ch->Reset(capacity); - } else if (dtype == framework::proto::VarType::INT64) { - ch->Reset(capacity); - } else if (dtype == framework::proto::VarType::FP32) { - ch->Reset(capacity); - } else if (dtype == framework::proto::VarType::FP64) { - ch->Reset(capacity); - } else { - PADDLE_THROW( - "Data type %d is not in " - "[LOD_TENSOR, SELECTED_ROWS, LOD_RANK_TABLE, LOD_TENSOR_ARRAY, " - "READER, CHANNEL, BOOL, INT32, INT64, FP32, FP64]", - dtype); - } - } -}; - -class ChannelCreateOpOpInferShape : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext *context) const override { - PADDLE_ENFORCE(context->HasOutput(kOutput), - "The output of ChannelCreate op must be set"); - context->SetOutputDim(kOutput, {1}); - } -}; - -class ChannelCreateOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddOutput(kOutput, - "The object of a Channel type created by ChannelCreate Op."); - AddAttr("capacity", "The size of the buffer of Channel.") - .SetDefault(0); - AddAttr("data_type", "The data type of elements inside the Channel."); - AddComment(R"DOC( -Channel Create Operator. - -This operator creates an object of the VarType Channel and returns it. -)DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OPERATOR(channel_create, paddle::operators::ChannelCreateOp, - paddle::framework::EmptyGradOpMaker, - paddle::operators::ChannelCreateOpMaker); diff --git a/paddle/fluid/operators/channel_recv_op.cc b/paddle/fluid/operators/channel_recv_op.cc deleted file mode 100644 index 101015e837e28b504b71d919abd5f908a102c812..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/channel_recv_op.cc +++ /dev/null @@ -1,98 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/channel.h" -#include -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/concurrency/channel_util.h" -#include "paddle/fluid/operators/math/math_function.h" - -static constexpr char Channel[] = "Channel"; -static constexpr char Status[] = "Status"; -static constexpr char Out[] = "Out"; - -namespace paddle { -namespace operators { - -void SetReceiveStatus(const platform::Place &dev_place, - framework::Variable *status_var, bool status) { - auto cpu = platform::CPUPlace(); - auto status_tensor = - status_var->GetMutable()->mutable_data({1}, - cpu); - status_tensor[0] = status; -} - -class ChannelRecvOp : public framework::OperatorBase { - public: - ChannelRecvOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : framework::OperatorBase(type, inputs, outputs, attrs) {} - - void InferShape(framework::InferShapeContext *ctx) const { - PADDLE_ENFORCE(ctx->HasInput(Channel), - "Input(Channel) of ChannelRecvOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutput(Out), - "Input(Channel) of ChannelRecvOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutput(Status), - "Output(Status) of ChannelRecvOp should not be null."); - ctx->SetOutputDim("Status", {1}); - } - - private: - void RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const override { - // Get the channel holder created by channel_create op, passed as input. - framework::ChannelHolder *ch = - scope.FindVar(Input(Channel))->GetMutable(); - auto output_var = scope.FindVar(Output(Out)); - // Receive the data from the channel. - bool ok = concurrency::ChannelReceive(ch, output_var); - - // Set the status output of the `ChannelReceive` call. - SetReceiveStatus(dev_place, scope.FindVar(Output(Status)), ok); - } -}; - -class ChannelRecvOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput(Channel, - "(Channel) A variable which \"receives\" the a value sent" - "to it by a channel_send op.") - .AsDuplicable(); - AddOutput(Out, - "(Variable) Output Variable that will hold the data received" - " from the Channel") - .AsDuplicable(); - AddOutput(Status, - "(Tensor) An LoD Tensor that returns a boolean status of the" - "result of the receive operation.") - .AsDuplicable(); - AddComment(R"DOC( -)DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OPERATOR(channel_recv, paddle::operators::ChannelRecvOp, - paddle::framework::EmptyGradOpMaker, - paddle::operators::ChannelRecvOpMaker); diff --git a/paddle/fluid/operators/channel_send_op.cc b/paddle/fluid/operators/channel_send_op.cc deleted file mode 100644 index 67d6deb511d883ac69426ddd34be2199367cd4c7..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/channel_send_op.cc +++ /dev/null @@ -1,76 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/channel.h" -#include -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/concurrency/channel_util.h" -#include "paddle/fluid/operators/math/math_function.h" - -static constexpr char Channel[] = "Channel"; -static constexpr char X[] = "X"; - -namespace paddle { -namespace operators { - -class ChannelSendOp : public framework::OperatorBase { - public: - ChannelSendOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : framework::OperatorBase(type, inputs, outputs, attrs) {} - - void InferShape(framework::InferShapeContext *ctx) const { - PADDLE_ENFORCE(ctx->HasInput(Channel), - "Input(Channel) of ChannelSendOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput(X), - "Input(X) of ChannelSendOp should not be null."); - } - - private: - void RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const override { - // Get the channel holder created by channel_create op, passed as input. - framework::ChannelHolder *ch = - scope.FindVar(Input(Channel))->GetMutable(); - auto input_var = scope.FindVar(Input(X)); - - // Send the input data through the channel. - concurrency::ChannelSend(ch, input_var); - } -}; - -class ChannelSendOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput(Channel, - "(Channel) A variable which \"sends\" the passed in value to " - "a listening receiver.") - .AsDuplicable(); - AddInput(X, "(Variable) The value which gets sent by the channel.") - .AsDuplicable(); - AddComment(R"DOC( -)DOC"); - } -}; -} // namespace operators -} // namespace paddle - -REGISTER_OPERATOR(channel_send, paddle::operators::ChannelSendOp, - paddle::framework::EmptyGradOpMaker, - paddle::operators::ChannelSendOpMaker); diff --git a/paddle/fluid/operators/concurrency/CMakeLists.txt b/paddle/fluid/operators/concurrency/CMakeLists.txt deleted file mode 100644 index e4617440d152b4c15d09e81cd19c76739b95b979..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/concurrency/CMakeLists.txt +++ /dev/null @@ -1 +0,0 @@ -cc_library(concurrency SRCS channel_util.cc DEPS device_context framework_proto boost eigen3) diff --git a/paddle/fluid/operators/concurrency/channel_util.cc b/paddle/fluid/operators/concurrency/channel_util.cc deleted file mode 100644 index fba4abf1897bceea615222b2438700085ed8e551..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/concurrency/channel_util.cc +++ /dev/null @@ -1,111 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/concurrency/channel_util.h" -#include "paddle/fluid/framework/var_type.h" - -namespace poc = paddle::operators::concurrency; - -void poc::ChannelSend(framework::ChannelHolder *ch, framework::Variable *var) { - auto type = framework::ToVarType(var->Type()); - if (type == framework::proto::VarType_Type_LOD_TENSOR) - ch->Send(var->GetMutable()); - else if (type == framework::proto::VarType_Type_LOD_RANK_TABLE) - ch->Send(var->GetMutable()); - else if (type == framework::proto::VarType_Type_LOD_TENSOR_ARRAY) - ch->Send(var->GetMutable()); - else if (type == framework::proto::VarType_Type_SELECTED_ROWS) - ch->Send(var->GetMutable()); - else if (type == framework::proto::VarType_Type_READER) - ch->Send(var->GetMutable()); - else if (type == framework::proto::VarType_Type_CHANNEL) - ch->Send(var->GetMutable()); - else - PADDLE_THROW("ChannelSend:Unsupported type"); -} - -bool poc::ChannelReceive(framework::ChannelHolder *ch, - framework::Variable *var) { - // Get type of channel and use that to call mutable data for Variable - auto type = framework::ToVarType(ch->Type()); - if (type == framework::proto::VarType_Type_LOD_TENSOR) - return ch->Receive(var->GetMutable()); - else if (type == framework::proto::VarType_Type_LOD_RANK_TABLE) - return ch->Receive(var->GetMutable()); - else if (type == framework::proto::VarType_Type_LOD_TENSOR_ARRAY) - return ch->Receive(var->GetMutable()); - else if (type == framework::proto::VarType_Type_SELECTED_ROWS) - return ch->Receive(var->GetMutable()); - else if (type == framework::proto::VarType_Type_READER) - return ch->Receive(var->GetMutable()); - else if (type == framework::proto::VarType_Type_CHANNEL) - return ch->Receive(var->GetMutable()); - else - PADDLE_THROW("ChannelReceive:Unsupported type"); -} - -void poc::ChannelAddToSendQ(framework::ChannelHolder *ch, const void *referrer, - framework::Variable *var, - std::shared_ptr cond, - std::function cb) { - auto type = framework::ToVarType(var->Type()); - if (type == framework::proto::VarType_Type_LOD_TENSOR) { - ch->AddToSendQ(referrer, var->GetMutable(), cond, cb); - } else if (type == framework::proto::VarType_Type_LOD_RANK_TABLE) { - ch->AddToSendQ(referrer, var->GetMutable(), cond, - cb); - } else if (type == framework::proto::VarType_Type_LOD_TENSOR_ARRAY) { - ch->AddToSendQ(referrer, var->GetMutable(), cond, - cb); - } else if (type == framework::proto::VarType_Type_SELECTED_ROWS) { - ch->AddToSendQ(referrer, var->GetMutable(), cond, - cb); - } else if (type == framework::proto::VarType_Type_READER) { - ch->AddToSendQ(referrer, var->GetMutable(), cond, - cb); - } else if (type == framework::proto::VarType_Type_CHANNEL) { - ch->AddToSendQ(referrer, var->GetMutable(), cond, - cb); - } else { - PADDLE_THROW("ChannelAddToSendQ:Unsupported type"); - } -} - -void poc::ChannelAddToReceiveQ( - framework::ChannelHolder *ch, const void *referrer, - framework::Variable *var, std::shared_ptr cond, - std::function cb) { - auto type = framework::ToVarType(var->Type()); - if (type == framework::proto::VarType_Type_LOD_TENSOR) { - ch->AddToReceiveQ(referrer, var->GetMutable(), cond, - cb); - } else if (type == framework::proto::VarType_Type_LOD_RANK_TABLE) { - ch->AddToReceiveQ(referrer, var->GetMutable(), - cond, cb); - } else if (type == framework::proto::VarType_Type_LOD_TENSOR_ARRAY) { - ch->AddToReceiveQ(referrer, var->GetMutable(), - cond, cb); - } else if (type == framework::proto::VarType_Type_SELECTED_ROWS) { - ch->AddToReceiveQ(referrer, var->GetMutable(), - cond, cb); - } else if (type == framework::proto::VarType_Type_READER) { - ch->AddToReceiveQ(referrer, var->GetMutable(), - cond, cb); - } else if (type == framework::proto::VarType_Type_CHANNEL) { - ch->AddToReceiveQ(referrer, var->GetMutable(), - cond, cb); - } else { - PADDLE_THROW("ChannelAddToReceiveQ:Unsupported type"); - } -} diff --git a/paddle/fluid/operators/concurrency/channel_util.h b/paddle/fluid/operators/concurrency/channel_util.h deleted file mode 100644 index cd18ca78c6fdecdc6c72748611ccdd9c2690ef46..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/concurrency/channel_util.h +++ /dev/null @@ -1,38 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/fluid/framework/channel.h" -#include "paddle/fluid/framework/variable.h" - -namespace paddle { -namespace operators { -namespace concurrency { - -void ChannelSend(framework::ChannelHolder *ch, framework::Variable *var); -bool ChannelReceive(framework::ChannelHolder *ch, framework::Variable *var); - -void ChannelAddToSendQ(framework::ChannelHolder *ch, const void *referrer, - framework::Variable *var, - std::shared_ptr cond, - std::function cb); -void ChannelAddToReceiveQ(framework::ChannelHolder *ch, const void *referrer, - framework::Variable *var, - std::shared_ptr cond, - std::function cb); - -} // namespace concurrency -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h index b3140116dfe6a17a400bb88219ff43b249ecb32a..ef76106f17218a03d24ebc0eca43dbb0ae935093 100644 --- a/paddle/fluid/operators/conv_op.h +++ b/paddle/fluid/operators/conv_op.h @@ -380,7 +380,8 @@ class DepthwiseConvKernel : public framework::OpKernel { math::DepthwiseConvFunctor depthwiseConv; auto& dev_ctx = context.template device_context(); - depthwiseConv(dev_ctx, *input, filter, strides, paddings, output); + depthwiseConv(dev_ctx, *input, filter, strides, paddings, dilations, + output); } }; @@ -415,14 +416,14 @@ class DepthwiseConvGradKernel : public framework::OpKernel { input_grad->mutable_data(context.GetPlace()); set_zero(dev_ctx, input_grad, static_cast(0)); depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides, - paddings, input_grad); + paddings, dilations, input_grad); } if (filter_grad) { filter_grad->mutable_data(context.GetPlace()); set_zero(dev_ctx, filter_grad, static_cast(0)); depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides, paddings, - filter_grad); + dilations, filter_grad); } } }; diff --git a/paddle/fluid/operators/conv_transpose_op.h b/paddle/fluid/operators/conv_transpose_op.h index 0d9c6a62fec1ea24bee5c24b4a7b792781f14d9e..88c578b1410558b9adcd55f1cd6b53fb9cb124e2 100644 --- a/paddle/fluid/operators/conv_transpose_op.h +++ b/paddle/fluid/operators/conv_transpose_op.h @@ -345,7 +345,7 @@ class DepthwiseConvTransposeKernel : public framework::OpKernel { math::DepthwiseConvInputGradFunctor depthwiseConvInputGrad; depthwiseConvInputGrad(dev_ctx, *output, filter, *input, strides, paddings, - output); + dilations, output); } }; @@ -367,10 +367,11 @@ class DepthwiseConvTransposeGradKernel : public framework::OpKernel { auto& dev_ctx = context.template device_context(); std::vector strides = context.Attr>("strides"); std::vector paddings = context.Attr>("paddings"); + std::vector dilations = context.Attr>("dilations"); if (input_grad) { math::DepthwiseConvFunctor depthwiseConv; - depthwiseConv(dev_ctx, *output_grad, filter, strides, paddings, + depthwiseConv(dev_ctx, *output_grad, filter, strides, paddings, dilations, input_grad); } @@ -382,7 +383,7 @@ class DepthwiseConvTransposeGradKernel : public framework::OpKernel { math::DepthwiseConvFilterGradFunctor depthwiseConvFilterGrad; depthwiseConvFilterGrad(dev_ctx, *output_grad, *input, strides, paddings, - filter_grad); + dilations, filter_grad); } } }; diff --git a/paddle/fluid/operators/cub_reduce.h b/paddle/fluid/operators/cub_reduce.h new file mode 100644 index 0000000000000000000000000000000000000000..16fdad775f7befaac04b1ac59a601f04e0ab2bdc --- /dev/null +++ b/paddle/fluid/operators/cub_reduce.h @@ -0,0 +1,322 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include + +#include // NOLINT +#include "paddle/fluid/framework/tensor.h" + +namespace paddle { +namespace operators { + +namespace detail { +template +struct Array { + public: + HOSTDEVICE inline Array() {} + + HOSTDEVICE inline T& operator[](size_t index) { return data_[index]; } + + HOSTDEVICE inline const T& operator[](size_t index) const { + return data_[index]; + } + + HOSTDEVICE constexpr inline size_t size() const { return ElementCount; } + + template + static inline Array From(const VectorLikeType& vec) { + PADDLE_ENFORCE_EQ(vec.size(), ElementCount, "size not match"); + size_t n = static_cast(vec.size()); + Array ret; + for (size_t i = 0; i < n; ++i) ret[i] = vec[i]; + return ret; + } + + private: + T data_[ElementCount]; +}; + +// reduce the last axis of 2d array +template +__global__ void ReduceKernel2D(const Tx* x, Ty* y, ReduceOp reducer, + TransformOp transformer, Ty init, + int reduce_num) { + __shared__ typename cub::BlockReduce::TempStorage temp_storage; + int idx_x = blockIdx.x * reduce_num; + int idx_y = threadIdx.x; + Ty reduce_var = init; + for (int idx_y = threadIdx.x; idx_y < reduce_num; idx_y += BlockDim) + reduce_var = reducer(reduce_var, transformer(x[idx_x + idx_y])); + + reduce_var = + cub::BlockReduce(temp_storage).Reduce(reduce_var, reducer); + + if (threadIdx.x == 0) { + y[blockIdx.x] = reduce_var; + } +} + +template +__global__ void ReduceKernel(const Tx* x, Ty* y, ReduceOp reducer, + TransformOp transformer, Ty init, int reduce_num, + Array x_strides, + Array reduce_dim, + Array reduce_strides, + Array left_dim, + Array left_strides) { + __shared__ typename cub::BlockReduce::TempStorage temp_storage; + Array sub_index; + int left_idx = blockIdx.x; + for (int i = 0; i < Rank - ReduceRank; ++i) { + sub_index[left_dim[i]] = left_idx / left_strides[i]; + left_idx %= left_strides[i]; + } + + int reduce_idx = threadIdx.x; + for (int j = 0; j < ReduceRank; ++j) { + sub_index[reduce_dim[j]] = reduce_idx / reduce_strides[j]; + reduce_idx %= reduce_strides[j]; + } + + int idx_x = 0; + for (int k = 0; k < Rank; ++k) idx_x += (sub_index[k] * x_strides[k]); + Ty reduce_var = static_cast(transformer(x[idx_x])); + + for (int i = threadIdx.x + BlockDim; i < reduce_num; i += BlockDim) { + int reduce_idx = i; + for (int j = 0; j < ReduceRank; ++j) { + sub_index[reduce_dim[j]] = reduce_idx / reduce_strides[j]; + reduce_idx %= reduce_strides[j]; + } + + int idx_x = 0; + for (int k = 0; k < Rank; ++k) idx_x += (sub_index[k] * x_strides[k]); + reduce_var = static_cast(reducer(reduce_var, transformer(x[idx_x]))); + } + + reduce_var = + cub::BlockReduce(temp_storage).Reduce(reduce_var, reducer); + + if (threadIdx.x == 0) { + y[blockIdx.x] = reduce_var; + } +} + +static inline std::vector GetStrides(const std::vector& dims) { + int n = static_cast(dims.size()); + if (n == 0) return std::vector(); + std::vector strides(n); + strides.back() = 1; + for (int i = n - 2; i >= 0; --i) { + strides[i] = strides[i + 1] * dims[i + 1]; + } + return strides; +} + +static inline std::vector GetStrides(const std::vector& dims, + const std::vector& idx) { + int n = static_cast(idx.size()); + if (n == 0) return std::vector(); + std::vector strides(n); + strides.back() = 1; + for (int i = n - 2; i >= 0; --i) { + strides[i] = strides[i + 1] * dims[idx[i + 1]]; + } + return strides; +} + +constexpr int kMaxBlockDim = 512; + +static inline int GetDesiredBlockDim(int block_dim) { + return block_dim >= kMaxBlockDim + ? kMaxBlockDim + : (1 << static_cast(std::log2(block_dim))); +} + +template +static void TensorReduceImpl( + const Tx* x_data, Ty* y_data, const platform::Place& place, + const ReduceOp& reducer, const TransformOp& transformer, const Ty& init, + int left_num, int reduce_num, const std::vector& x_strides, + const std::vector& reduce_dim, const std::vector& reduce_strides, + const std::vector& left_dim, const std::vector& left_strides, + cudaStream_t stream) { +#define CUB_RANK_CASE(i, ...) \ + case i: { \ + constexpr auto kRank = i; \ + switch (reduce_rank) { __VA_ARGS__; } \ + } break + +#define CUB_REDUCE_RANK_CASE(i, ...) \ + case i: { \ + constexpr auto kReduceRank = i; \ + ReduceKernel<<>>( \ + x_data, y_data, reducer, transformer, init, reduce_num, \ + Array::From(x_strides), \ + Array::From(reduce_dim), \ + Array::From(reduce_strides), \ + Array::From(left_dim), \ + Array::From(left_strides)); \ + } break + + int rank = x_strides.size(); + int reduce_rank = reduce_strides.size(); + if (rank == reduce_rank) { + cub::TransformInputIterator trans_x( + x_data, transformer); + size_t temp_storage_bytes = 0; + cub::DeviceReduce::Reduce(nullptr, temp_storage_bytes, trans_x, y_data, + reduce_num, reducer, init, stream); + framework::Tensor tmp; + auto* temp_storage = tmp.mutable_data( + framework::make_ddim({static_cast(temp_storage_bytes)}), + place); + cub::DeviceReduce::Reduce(temp_storage, temp_storage_bytes, trans_x, y_data, + reduce_num, reducer, init, stream); + return; + } + if (rank == 2 && reduce_rank == 1 && reduce_dim[0] == 1) { + ReduceKernel2D<<>>( + x_data, y_data, reducer, transformer, init, reduce_num); + return; + } + /* + if (rank == 3 && reduce_rank == 1 && reduce_dim[0] == 1) { + // TODO(liangdun): we can optimize 3d case which the 2nd axis is reduced. + // Currently, it is handled by code below, but inefficient + return; + } + */ + + switch (rank) { + CUB_RANK_CASE(2, CUB_REDUCE_RANK_CASE(1);); + + CUB_RANK_CASE(3, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);); + + CUB_RANK_CASE(4, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2); + CUB_REDUCE_RANK_CASE(3);); + + CUB_RANK_CASE(5, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2); + CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4);); + + CUB_RANK_CASE(6, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2); + CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4); + CUB_REDUCE_RANK_CASE(5);); + + CUB_RANK_CASE(7, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2); + CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4); + CUB_REDUCE_RANK_CASE(5); CUB_REDUCE_RANK_CASE(6);); + + CUB_RANK_CASE(8, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2); + CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4); + CUB_REDUCE_RANK_CASE(5); CUB_REDUCE_RANK_CASE(6);); + + CUB_RANK_CASE(9, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2); + CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4); + CUB_REDUCE_RANK_CASE(5); CUB_REDUCE_RANK_CASE(6); + CUB_REDUCE_RANK_CASE(7); CUB_REDUCE_RANK_CASE(8);); + } + +#undef CUB_REDUCE_RANK_CASE +#undef CUB_RANK_CASE +} + +} // namespace detail + +template +void TensorReduce(const framework::Tensor& x, framework::Tensor* y, + std::vector origin_reduce_dims, const Ty& init, + const ReduceOp& reducer, const TransformOp& transformer, + cudaStream_t stream) { + auto x_dim = framework::vectorize2int(x.dims()); + std::vector new_x_dim, new_reduce_dims; + int is_reduced = 0; + for (auto e : origin_reduce_dims) { + auto pos = e >= 0 ? e : e + x_dim.size(); + is_reduced |= 1 << e; + } + for (int i = 0; i < x_dim.size(); i++) { + if ((i == 0) || (((is_reduced >> i) ^ (is_reduced >> (i - 1))) & 1)) { + new_x_dim.push_back(x_dim[i]); + if ((is_reduced >> i) & 1) + new_reduce_dims.push_back(new_x_dim.size() - 1); + } else { + new_x_dim[new_x_dim.size() - 1] *= x_dim[i]; + } + } + x_dim = new_x_dim; + origin_reduce_dims = new_reduce_dims; + int x_rank = static_cast(x_dim.size()); + std::set left_set, reduce_set; + for (int i = 0; i < x_rank; ++i) left_set.insert(i); + + for (auto e : origin_reduce_dims) { + left_set.erase(e); + reduce_set.insert(e); + } + + std::vector reduce_dim(reduce_set.begin(), reduce_set.end()); + std::vector left_dim(left_set.begin(), left_set.end()); + + std::vector x_strides = detail::GetStrides(x_dim); + std::vector reduce_strides = detail::GetStrides(x_dim, reduce_dim); + std::vector left_strides = detail::GetStrides(x_dim, left_dim); + int reduce_num = reduce_strides[0] * x_dim[reduce_dim[0]]; + int left_num = 1; + if (left_dim.size()) left_num = left_strides[0] * x_dim[left_dim[0]]; + + std::vector y_dim(left_dim.size()); + for (int i = 0; i < left_dim.size(); ++i) { + y_dim[i] = x_dim[left_dim[i]]; + } + auto x_data = x.data(); + auto y_data = y->mutable_data(x.place()); + if (reduce_num == 1) return; + +#define CUB_BLOCK_DIM_CASE(block_dim) \ + case block_dim: { \ + constexpr auto kBlockDim = block_dim; \ + detail::TensorReduceImpl( \ + x_data, y_data, x.place(), reducer, transformer, init, left_num, \ + reduce_num, x_strides, reduce_dim, reduce_strides, left_dim, \ + left_strides, stream); \ + } break + + switch (detail::GetDesiredBlockDim(reduce_num)) { + CUB_BLOCK_DIM_CASE(512); + CUB_BLOCK_DIM_CASE(256); + CUB_BLOCK_DIM_CASE(128); + CUB_BLOCK_DIM_CASE(64); + CUB_BLOCK_DIM_CASE(32); + CUB_BLOCK_DIM_CASE(16); + CUB_BLOCK_DIM_CASE(8); + CUB_BLOCK_DIM_CASE(4); + CUB_BLOCK_DIM_CASE(2); + } +#undef CUB_BLOCK_DIM_CASE +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detection/roi_perspective_transform_op.cc b/paddle/fluid/operators/detection/roi_perspective_transform_op.cc index 4cc980b41b34894f9d915d4b325887548091c0eb..42c720e701fbabacf1280dec2f78d3f6b99dfea2 100644 --- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cc +++ b/paddle/fluid/operators/detection/roi_perspective_transform_op.cc @@ -104,7 +104,6 @@ bool in_quad(T x, T y, T roi_x[], T roi_y[]) { * a31 = (dx3 * dy2 - dx2 * dy3) / (dx1 * dy2 - dx2 * dy1) / (w - 1) * a32 = (dx1 * dy3 - dx3 * dy1) / (dx1 * dy2 - dx2 * dy1) / (h - 1) * a33 = 1 - * */ template void get_transform_matrix(const int transformed_width, @@ -260,8 +259,8 @@ class CPUROIPerspectiveTransformOpKernel : public framework::OpKernel { roi2image.Resize({rois_num}); int* roi2image_data = roi2image.mutable_data(ctx.GetPlace()); auto lod = rois->lod().back(); - for (int i = 0; i < lod.size() - 1; ++i) { - for (int j = lod[i]; j < lod[i + 1]; ++j) { + for (size_t i = 0; i < lod.size() - 1; ++i) { + for (size_t j = lod[i]; j < lod[i + 1]; ++j) { roi2image_data[j] = i; } } @@ -393,8 +392,8 @@ class CPUROIPerspectiveTransformGradOpKernel : public framework::OpKernel { roi2image.Resize({rois_num}); int* roi2image_data = roi2image.mutable_data(ctx.GetPlace()); auto lod = rois->lod().back(); - for (int i = 0; i < lod.size() - 1; ++i) { - for (int j = lod[i]; j < lod[i + 1]; ++j) { + for (size_t i = 0; i < lod.size() - 1; ++i) { + for (size_t j = lod[i]; j < lod[i + 1]; ++j) { roi2image_data[j] = i; } } @@ -404,7 +403,7 @@ class CPUROIPerspectiveTransformGradOpKernel : public framework::OpKernel { for (int in_h = 0; in_h < in_height; ++in_h) { for (int in_w = 0; in_w < in_width; ++in_w) { T gradient = 0.0; - for (int roi_idx = lod[n]; roi_idx < lod[n + 1]; ++roi_idx) { + for (size_t roi_idx = lod[n]; roi_idx < lod[n + 1]; ++roi_idx) { const T* rois = rois_data + roi_idx * 8; T roi_x[4]; T roi_y[4]; diff --git a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu index b683b7573db747bde5f57e530ec53760db099843..c82930cc4994c3854e60f40ae9909a90d82cbff6 100644 --- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu +++ b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu @@ -345,8 +345,8 @@ class CUDAROIPerspectiveTransformOpKernel : public framework::OpKernel { roi2image.Resize({rois_num}); int* roi2image_data = roi2image.mutable_data(platform::CPUPlace()); auto lod = rois->lod().back(); - for (int i = 0; i < lod.size() - 1; ++i) { - for (int j = lod[i]; j < lod[i + 1]; ++j) { + for (size_t i = 0; i < lod.size() - 1; ++i) { + for (size_t j = lod[i]; j < lod[i + 1]; ++j) { roi2image_data[j] = i; } } @@ -432,7 +432,7 @@ __global__ void RoiTransformGradKernel( T gradient = 0.0; // Accumulate gradient over all RoIs that interpolated this element - for (int roi_idx = lod[n]; roi_idx < lod[n + 1]; ++roi_idx) { + for (size_t roi_idx = lod[n]; roi_idx < lod[n + 1]; ++roi_idx) { const T* rois = rois_data + roi_idx * 8; T roi_x[4]; T roi_y[4]; diff --git a/paddle/fluid/operators/distributed/grpc_client.h b/paddle/fluid/operators/distributed/grpc_client.h index 75a3662316462a222760bfbb7d7906c70f46d143..d8e9cee85bd734c2ed4b1cae03ecee04e304b651 100644 --- a/paddle/fluid/operators/distributed/grpc_client.h +++ b/paddle/fluid/operators/distributed/grpc_client.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include +#include #include // NOLINT #include // NOLINT diff --git a/paddle/fluid/operators/distributed/request_handler.h b/paddle/fluid/operators/distributed/request_handler.h index 3dbbd75b1e945208395c42ace3235db7891936c5..5be7095acd3c5ac6f880a8a26c246f60a93643b5 100644 --- a/paddle/fluid/operators/distributed/request_handler.h +++ b/paddle/fluid/operators/distributed/request_handler.h @@ -15,6 +15,7 @@ #pragma once #include +#include // NOLINT #include #include diff --git a/paddle/fluid/operators/distributed/rpc_server.h b/paddle/fluid/operators/distributed/rpc_server.h index d88e8c640ffb5ea44e88318cc973c9a783862435..f3e61e1575ced0b9ffbad23e6973121daca9751b 100644 --- a/paddle/fluid/operators/distributed/rpc_server.h +++ b/paddle/fluid/operators/distributed/rpc_server.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include #include // NOLINT diff --git a/paddle/fluid/operators/elementwise_op.h b/paddle/fluid/operators/elementwise_op.h index a79b900b9801e6b80e4433a9acdd4dab6c34859d..94df11bee70dec44f19ee9ffff04ca92d5990ee8 100644 --- a/paddle/fluid/operators/elementwise_op.h +++ b/paddle/fluid/operators/elementwise_op.h @@ -89,7 +89,7 @@ class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("use_mkldnn", "(bool, default false). Used by MKLDNN.") .SetDefault(false); AddComment(string::Sprintf(R"DOC( -Limited Elementwise %s Operator +Elementwise %s Operator The equation is: diff --git a/paddle/fluid/operators/fused_embedding_fc_lstm_op.cc b/paddle/fluid/operators/fused_embedding_fc_lstm_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..0b917a403620e2ffb2cbb4ca7856cce9584e1eef --- /dev/null +++ b/paddle/fluid/operators/fused_embedding_fc_lstm_op.cc @@ -0,0 +1,604 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/fused_embedding_fc_lstm_op.h" +#include +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/cpu_vec.h" +#include "paddle/fluid/operators/math/fc_compute.h" +#include "paddle/fluid/operators/math/sequence2batch.h" +#include "paddle/fluid/platform/cpu_info.h" + +namespace paddle { +namespace operators { + +void FusedEmbeddingFCLSTMOp::InferShape( + framework::InferShapeContext* ctx) const { + PADDLE_ENFORCE(ctx->HasInput("Embeddings"), + "Assert only one Input(Embeddings) of LSTM."); + PADDLE_ENFORCE(ctx->HasInput("WeightH"), + "Assert only one Input(WeightH) of LSTM."); + PADDLE_ENFORCE(ctx->HasInput("Bias"), "Assert only one Input(Bias) of LSTM."); + PADDLE_ENFORCE(ctx->HasOutput("XX"), "Assert only one Output(XX) of LSTM."); + PADDLE_ENFORCE(ctx->HasOutput("Hidden"), + "Assert only one Output(Hidden) of LSTM."); + PADDLE_ENFORCE(ctx->HasOutput("Cell"), + "Assert only one Output(Cell) of LSTM."); + PADDLE_ENFORCE(ctx->HasInput("Ids"), + "Input(Ids) of LookupTableOp should not be null."); + + auto table_dims = ctx->GetInputDim("Embeddings"); + auto ids_dims = ctx->GetInputDim("Ids"); + int ids_rank = ids_dims.size(); + + PADDLE_ENFORCE_EQ(table_dims.size(), 2); + PADDLE_ENFORCE_EQ(ids_dims[ids_rank - 1], 1, + "The last dimension of the 'Ids' tensor must be 1."); + + auto x_dims = ctx->GetInputDim("Ids"); + PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(Ids)'s rank must be 2."); + + if (ctx->HasInput("H0")) { + PADDLE_ENFORCE(ctx->HasInput("C0"), + "Input(Cell) and Input(Hidden) of LSTM should not " + "be null at the same time."); + auto h_dims = ctx->GetInputDim("H0"); + auto c_dims = ctx->GetInputDim("C0"); + PADDLE_ENFORCE(h_dims == c_dims, + "The dimension of Input(H0) and Input(C0) " + "should be the same."); + } + + auto embeddings_dims = ctx->GetInputDim("Embeddings"); + PADDLE_ENFORCE_EQ(embeddings_dims.size(), 2, + "The rank of Input(Embeddings) should be 2."); + + auto wh_dims = ctx->GetInputDim("WeightH"); + int frame_size = wh_dims[1] / 4; + PADDLE_ENFORCE_EQ(wh_dims.size(), 2, + "The rank of Input(WeightH) should be 2."); + PADDLE_ENFORCE_EQ(wh_dims[0], frame_size, + "The first dimension of Input(WeightH) " + "should be %d.", + frame_size); + PADDLE_ENFORCE_EQ(wh_dims[1], 4 * frame_size, + "The second dimension of Input(WeightH) " + "should be 4 * %d.", + frame_size); + + auto b_dims = ctx->GetInputDim("Bias"); + PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2."); + PADDLE_ENFORCE_EQ(b_dims[0], 1, + "The first dimension of Input(Bias) should be 1."); + PADDLE_ENFORCE_EQ( + b_dims[1], (ctx->Attrs().Get("use_peepholes") ? 7 : 4) * frame_size, + "The second dimension of Input(Bias) should be " + "7 * %d if enable peepholes connection or" + "4 * %d if disable peepholes", + frame_size, frame_size); + + framework::DDim out_dims({x_dims[0], frame_size}); + ctx->SetOutputDim("Hidden", out_dims); + ctx->SetOutputDim("Cell", out_dims); + ctx->ShareLoD("Ids", "Hidden"); + ctx->ShareLoD("Ids", "Cell"); + int xx_width; + if (ctx->Attrs().Get("use_seq")) { + xx_width = wh_dims[1]; + } else { + xx_width = x_dims[1] > wh_dims[1] ? wh_dims[1] : x_dims[1]; + PADDLE_ENFORCE(ctx->HasOutput("BatchedInput"), + "Assert only one Output(BatchedInput) of LSTM."); + PADDLE_ENFORCE(ctx->HasOutput("BatchedHidden"), + "Assert only one Output(BatchedHidden) of LSTM."); + PADDLE_ENFORCE(ctx->HasOutput("BatchedCell"), + "Assert only one Output(BatchedCell) of LSTM."); + PADDLE_ENFORCE(ctx->HasOutput("ReorderedH0"), + "Assert only one Output(ReorderedH0) of LSTM"); + PADDLE_ENFORCE(ctx->HasOutput("ReorderedC0"), + "Assert only one Output(ReorderedC0) of LSTM."); + ctx->SetOutputDim("BatchedInput", {x_dims[0], wh_dims[1]}); + ctx->SetOutputDim("BatchedHidden", out_dims); + ctx->SetOutputDim("BatchedCell", out_dims); + } + ctx->SetOutputDim("XX", {x_dims[0], xx_width}); + ctx->ShareLoD("Ids", "XX"); +} + +framework::OpKernelType FusedEmbeddingFCLSTMOp::GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + return framework::OpKernelType( + framework::ToDataType( + ctx.Input("Embeddings")->type()), + ctx.device_context()); +} + +void FusedEmbeddingFCLSTMOpMaker::Make() { + AddInput("Ids", + "An input with type int32 or int64 " + "contains the ids to be looked up in W. " + "The last dimension size must be 1."); + AddInput("Embeddings", + "(Tensor) the learnable weights of X." + " - The shape is (M x 4D), where M is the dim size of x, D is the " + "hidden size. " + " - Weight = {W_cx, W_ix, W_fx, W_ox}"); + AddInput("WeightH", + "(Tensor) same as LSTMOp, the learnable hidden-hidden weights." + " - The shape is (D x 4D), where D is the hidden size. " + " - Weight = {W_ch, W_ih, W_fh, W_oh}"); + AddInput("Bias", + "(Tensor) the learnable weights. Almost same as LSTMOp" + "Note: we should add the fc bias into this (1x4D) in bias." + "input-hidden bias weight and peephole connections weight if " + "setting `use_peepholes` True. " + "1. `use_peepholes = False` " + " - The shape is (1 x 4D). " + " - Bias = {b_c, b_i, b_f, b_o}." + "2. `use_peepholes = True` " + " - The shape is (1 x 7D). " + " - Bias = {b_c, b_i, b_f, b_o, W_ic, W_fc, W_oc}."); + AddInput("H0", + "(Tensor, optional) (same as LSTMOp) the initial hidden state is an " + "optional " + "input. This is a tensor with shape (N x D), where N is the " + "batch size and D is the hidden size.") + .AsDispensable(); + AddInput("C0", + "(Tensor, optional) (same as LSTMOp) (the initial cell state is an " + "optional " + "input. This is a tensor with shape (N x D), where N is the " + "batch size. `H0` and `C0` can be NULL but only at the same time.") + .AsDispensable(); + AddOutput("Hidden", + "(LoDTensor) (same as LSTMOp) the hidden state of LSTM operator. " + "The shape is (T x D), and lod is the same with the `Input`."); + AddOutput("Cell", + "(LoDTensor) (same as LSTMOp) the cell state of LSTM operator. " + "The shape is (T x D), and lod is the same with the `Input`."); + AddOutput("XX", + "(LoDTensor) the result after X * WeightX (size is T x 4D)" + " or batched_X (size is T x M), this will be automatically chosen," + " where T is the total time steps in this mini-batch," + " D is the hidden size, M is the dim size of x input.") + .AsIntermediate(); + AddOutput("BatchedInput", "(LoDTensor) (T x 4D).").AsIntermediate(); + AddOutput("BatchedHidden", "(LoDTensor) (T x D).").AsIntermediate(); + AddOutput("BatchedCell", "(LoDTensor) (T x D).").AsIntermediate(); + AddOutput("ReorderedH0", "(LoDTensor) (N x D).").AsIntermediate(); + AddOutput("ReorderedC0", "(LoDTensor) (N x D).").AsIntermediate(); + AddAttr("use_peepholes", + "(bool, defalut: True) " + "whether to enable diagonal/peephole connections.") + .SetDefault(true); + AddAttr("is_reverse", + "(bool, defalut: False) " + "whether to compute reversed LSTM.") + .SetDefault(false); + AddAttr("use_seq", + "(bool, defalut: True) " + "whether to use seq mode to compute.") + .SetDefault(true); + AddAttr("gate_activation", + "(string, default: sigmoid)" + "The activation for input gate, forget gate and output " + "gate, `sigmoid` by default.") + .SetDefault("sigmoid") + .InEnum({"sigmoid", "tanh", "relu", "identity"}); + AddAttr("cell_activation", + "(string, default: tanh)" + "The activation for cell output, `tanh` by defalut.") + .SetDefault("tanh") + .InEnum({"sigmoid", "tanh", "relu", "identity"}); + AddAttr("candidate_activation", + "(string, default: tanh)" + "The activation for candidate hidden state, " + "`tanh` by default.") + .SetDefault("tanh") + .InEnum({"sigmoid", "tanh", "relu", "identity"}); + AddComment(R"DOC( +Fusion Long-Short Term Memory (LSTM) Operator. +This operator fuse the X into LSTM, more details can refer to LSTM op. +)DOC"); +} + +template +class FusedEmbeddingFCLSTMKernel : public framework::OpKernel { + public: +#define INIT_VEC_FUNC \ + std::function act_gate, act_cell, act_cand; \ + auto& act_gate_str = ctx.Attr("gate_activation"); \ + auto& act_cell_str = ctx.Attr("cell_activation"); \ + auto& act_cand_str = ctx.Attr("candidate_activation"); \ + if (platform::jit::MayIUse(platform::jit::avx)) { \ + math::VecActivations act_functor; \ + act_gate = act_functor(act_gate_str); \ + act_cell = act_functor(act_cell_str); \ + act_cand = act_functor(act_cand_str); \ + } else { \ + math::VecActivations act_functor; \ + act_gate = act_functor(act_gate_str); \ + act_cell = act_functor(act_cell_str); \ + act_cand = act_functor(act_cand_str); \ + } + +#define INIT_BASE_INPUT_OUTPUT \ + auto* ids = ctx.Input("Ids"); \ + auto* h0 = ctx.Input("H0"); \ + auto* c0 = ctx.Input("C0"); \ + auto* embeddings = ctx.Input("Embeddings"); \ + auto* wh = ctx.Input("WeightH"); \ + auto* bias = ctx.Input("Bias"); \ + auto* xx = ctx.Output("XX"); \ + auto* hidden_out = ctx.Output("Hidden"); \ + auto* cell_out = ctx.Output("Cell"); \ + bool is_reverse = ctx.Attr("is_reverse"); \ + bool use_peepholes = ctx.Attr("use_peepholes"); + +#define INIT_BASE_SIZES \ + auto ids_dims = ids->dims(); /* T x M*/ \ + auto ids_numel = ids->numel(); /* T x 1*/ \ + auto wh_dims = wh->dims(); /* D x 4D*/ \ + const int D = wh_dims[0]; \ + const int D2 = D * 2; \ + const int D3 = D * 3; \ + int64_t row_number = embeddings->dims()[0]; \ + int64_t row_width = embeddings->dims()[1]; \ + const int D4 = wh_dims[1]; + +#define INIT_BASE_INPUT_DATAS \ + const int64_t* ids_data = ids->data(); \ + const T* embeddings_data = embeddings->data(); \ + const T* wh_data = wh->data(); \ + /* diagonal weight*/ \ + const T* wc_data = bias->data() + D4; \ + /* for peephole only*/ \ + Tensor checked_cell; \ + T* checked_cell_data = nullptr; \ + auto place = ctx.GetPlace(); \ + if (use_peepholes) { \ + /* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/ \ + checked_cell_data = checked_cell.mutable_data({2, D}, place); \ + } + +/// Compute LSTM +#define GEMM_WH_ADDON(bs, prev, out) \ + blas.GEMM(CblasNoTrans, CblasNoTrans, bs, D4, D, static_cast(1), prev, D, \ + wh_data, D4, static_cast(1), out, D4) + +// gates: W_ch, W_ih, W_fh, W_oh +#define GET_Ct(ct_1, gates, ct) \ + /* C_t = C_t-1 * fgated + cand_gated * igated*/ \ + act_cand(D, gates, gates); \ + blas.VMUL(D, gates, gates + D, gates + D); \ + blas.VMUL(D, ct_1, gates + D2, gates + D2); \ + blas.VADD(D, gates + D, gates + D2, ct) + +#define GET_Ht(ct, gates, ht) \ + /* H_t = act_cell(C_t) * ogated */ \ + act_cell(D, ct, gates + D2); \ + blas.VMUL(D, gates + D2, gates + D3, ht) + +#define GET_Ct_NOH0C0(gates, ct) \ + /* C_t = igated * cgated*/ \ + act_gate(D, gates + D, gates + D); \ + act_cand(D, gates, gates); \ + blas.VMUL(D, gates, gates + D, ct) + +#define COMPUTE_CtHt_NOH0C0(gates, ct, ht) \ + GET_Ct_NOH0C0(gates, ct); \ + act_gate(D, gates + D3, gates + D3); \ + GET_Ht(ct, gates, ht) + +#define COMPUTE_CtHt_PEEPHOLE_NOH0C0(gates, ct, ht) \ + GET_Ct_NOH0C0(gates, ct); \ + /* get outgated, put W_oc * C_t on igated */ \ + blas.VMUL(D, wc_data + D2, ct, gates + D); \ + blas.VADD(D, gates + D, gates + D3, gates + D3); \ + act_gate(D, gates + D3, gates + D3); \ + GET_Ht(ct, gates, ht) + +#define COMPUTE_CtHt(gates, ct_1, ct, ht) \ + act_gate(D3, gates + D, gates + D); \ + GET_Ct(ct_1, gates, ct); \ + GET_Ht(ct, gates, ht) + +#define COMPUTE_CtHt_PEEPHOLE(gates, ct_1, ct, ht) \ + /* get fgated and igated*/ \ + blas.VMUL(D, wc_data, ct_1, checked_cell_data); \ + blas.VMUL(D, wc_data + D, ct_1, checked_cell_data + D); \ + blas.VADD(D2, checked_cell_data, gates + D, gates + D); \ + act_gate(D2, gates + D, gates + D); \ + GET_Ct(ct_1, gates, ct); \ + /* get ogated*/ \ + blas.VMUL(D, wc_data + D2, ct, gates + D); \ + blas.VADD(D, gates + D, gates + D3, gates + D3); \ + act_gate(D, gates + D3, gates + D3); \ + GET_Ht(ct, gates, ht) + + void SeqCompute(const framework::ExecutionContext& ctx) const { + using DeviceContext = paddle::platform::CPUDeviceContext; + INIT_BASE_INPUT_OUTPUT + INIT_BASE_SIZES + INIT_VEC_FUNC + INIT_BASE_INPUT_DATAS + + // std::cout << "====> SeqCompute" << std::endl; + auto ids_lod = ids->lod(); + const int total_T = ids_dims[0]; + const int N = ids_lod[0].size() - 1; + const T* h0_data = h0 ? h0->data() : nullptr; + const T* c0_data = c0 ? c0->data() : nullptr; + T* xx_data = xx->mutable_data(place); + T* h_out_data = hidden_out->mutable_data(place); + T* c_out_data = cell_out->mutable_data(place); + auto blas = math::GetBlas(ctx); + + for (int64_t i = 0; i < ids_numel; ++i) { + PADDLE_ENFORCE_LT(ids_data[i], row_number); + PADDLE_ENFORCE_GE(ids_data[i], 0, "ids %d", i); + memcpy(xx_data + i * row_width, embeddings_data + ids_data[i] * row_width, + row_width * sizeof(T)); + } + + int xx_offset = D4; + int gate_offset = D; + if (is_reverse) { + const int offset = (total_T - 1) * D; + xx_data = xx_data + offset * 4; + h_out_data = h_out_data + offset; + c_out_data = c_out_data + offset; + xx_offset = -D4; + gate_offset = -D; + } + +#define MOVE_ONE_STEP \ + prev_h_data = h_out_data; \ + prev_c_data = c_out_data; \ + xx_data = xx_data + xx_offset; \ + h_out_data = h_out_data + gate_offset; \ + c_out_data = c_out_data + gate_offset + +#define PROCESS_H0C0_DEFINES \ + int bid = is_reverse ? N - 1 - i : i; \ + int seq_len = ids_lod[0][bid + 1] - ids_lod[0][bid]; \ + const T* prev_c_data = nullptr; \ + const T* prev_h_data = nullptr; \ + int tstart = 0 + +#define PROCESS_H0C0_PEEPHOLE \ + PROCESS_H0C0_DEFINES; \ + if (h0_data) { \ + prev_h_data = h0_data + bid * D; \ + prev_c_data = c0_data + bid * D; \ + } else { \ + COMPUTE_CtHt_PEEPHOLE_NOH0C0(xx_data, c_out_data, h_out_data); \ + MOVE_ONE_STEP; \ + tstart = 1; \ + } + +#define PROCESS_H0C0 \ + PROCESS_H0C0_DEFINES; \ + if (h0_data) { \ + prev_h_data = h0_data + bid * D; \ + prev_c_data = c0_data + bid * D; \ + } else { \ + COMPUTE_CtHt_NOH0C0(xx_data, c_out_data, h_out_data); \ + MOVE_ONE_STEP; \ + tstart = 1; \ + } + + if (use_peepholes) { + for (int i = 0; i < N; ++i) { + PROCESS_H0C0_PEEPHOLE + for (int step = tstart; step < seq_len; ++step) { + GEMM_WH_ADDON(1, prev_h_data, xx_data); + COMPUTE_CtHt_PEEPHOLE(xx_data, prev_c_data, c_out_data, h_out_data); + MOVE_ONE_STEP; + } + } + } else { + for (int i = 0; i < N; ++i) { + PROCESS_H0C0 + for (int step = tstart; step < seq_len; ++step) { + GEMM_WH_ADDON(1, prev_h_data, xx_data); + COMPUTE_CtHt(xx_data, prev_c_data, c_out_data, h_out_data); + MOVE_ONE_STEP; + } + } + } +#undef PROCESS_H0C0_DEFINES +#undef PROCESS_H0C0_PEEPHOLE +#undef PROCESS_H0C0 +#undef MOVE_ONE_STEP + } + + void BatchCompute(const framework::ExecutionContext& ctx) const { + using DeviceContext = platform::CPUDeviceContext; + INIT_BASE_INPUT_OUTPUT + if (ids->lod()[0].size() == 2) { + SeqCompute(ctx); + return; + } + INIT_BASE_SIZES + INIT_VEC_FUNC + INIT_BASE_INPUT_DATAS + + // std::cout << "===> Batch Compute" << std::endl; + + auto* reordered_h0 = ctx.Output("ReorderedH0"); + auto* reordered_c0 = ctx.Output("ReorderedC0"); + auto* batched_input = ctx.Output("BatchedInput"); + auto* batched_c_out = ctx.Output("BatchedCell"); + auto* batched_h_out = ctx.Output("BatchedHidden"); + T* xx_data = xx->mutable_data(place); + T* batched_input_data = batched_input->mutable_data(place); + T* batched_c_out_data = batched_c_out->mutable_data(place); + T* batched_h_out_data = batched_h_out->mutable_data(place); + hidden_out->mutable_data(place); + cell_out->mutable_data(place); + + math::LoDTensor2BatchFunctor to_batch; + auto& dev_ctx = ctx.template device_context(); + auto blas = math::GetBlas(dev_ctx); + + for (int64_t i = 0; i < ids_numel; ++i) { + PADDLE_ENFORCE_LT(ids_data[i], row_number); + PADDLE_ENFORCE_GE(ids_data[i], 0, "ids %d", i); + memcpy(xx_data + i * row_width, embeddings_data + ids_data[i] * row_width, + row_width * sizeof(T)); + } + + to_batch(dev_ctx, *xx, batched_input, true, is_reverse); + + auto batched_lod = batched_input->lod(); + const auto& seq_order = batched_lod[2]; + const int max_bs = seq_order.size(); + reordered_h0->Resize({max_bs, D}); + reordered_c0->Resize({max_bs, D}); + + int tstart = 0; + T* prev_h_data = nullptr; + T* prev_c_data = nullptr; + if (h0) { + // reorder h0, c0 + T* reordered_h0_data = reordered_h0->mutable_data(place); + T* reordered_c0_data = reordered_c0->mutable_data(place); + const T* h0_data = h0->data(); + const T* c0_data = c0->data(); + prev_h_data = reordered_h0_data; + prev_c_data = reordered_c0_data; + size_t sz = sizeof(T) * D; + for (int i = 0; i < max_bs; ++i) { + std::memcpy(reordered_h0_data, h0_data + seq_order[i] * D, sz); + std::memcpy(reordered_c0_data, c0_data + seq_order[i] * D, sz); + reordered_h0_data += D; + reordered_c0_data += D; + } + } else { + // compute without h0, c0 + T* cur_in_data = batched_input_data; + T* cur_h_out_data = batched_h_out_data; + T* cur_c_out_data = batched_c_out_data; + for (int i = 0; i < max_bs; ++i) { + GET_Ct_NOH0C0(cur_in_data, cur_c_out_data); + if (use_peepholes) { + blas.VMUL(D, wc_data + D2, cur_c_out_data, cur_in_data + D); + blas.VADD(D, cur_in_data + D, cur_in_data + D3, cur_in_data + D3); + } + act_gate(D, cur_in_data + D3, cur_in_data + D3); + GET_Ht(cur_c_out_data, cur_in_data, cur_h_out_data); + cur_in_data += D4; + cur_c_out_data += D; + cur_h_out_data += D; + } + tstart = 1; + prev_h_data = batched_h_out_data; + prev_c_data = batched_c_out_data; + } + const auto& batch_starts = batched_lod[0]; + const int max_seq_len = batch_starts.size() - 1; + const int offset = tstart * max_bs * D; + batched_input_data = batched_input_data + offset * 4; + batched_h_out_data = batched_h_out_data + offset; + batched_c_out_data = batched_c_out_data + offset; + +#define DEFINE_CUR \ + T* cur_in_data = batched_input_data; \ + T* cur_prev_c_data = prev_c_data; \ + T* cur_c_out_data = batched_c_out_data; \ + T* cur_h_out_data = batched_h_out_data + +#define MOVE_ONE_BATCH \ + cur_in_data += D4; \ + cur_prev_c_data += D; \ + cur_c_out_data += D; \ + cur_h_out_data += D + +#define MOVE_ONE_STEP \ + prev_c_data = batched_c_out_data; \ + prev_h_data = batched_h_out_data; \ + batched_c_out_data = cur_c_out_data; \ + batched_h_out_data = cur_h_out_data; \ + batched_input_data = cur_in_data + + if (use_peepholes) { + for (int step = tstart; step < max_seq_len; ++step) { + const int cur_bs = batch_starts[step + 1] - batch_starts[step]; + GEMM_WH_ADDON(cur_bs, prev_h_data, batched_input_data); + DEFINE_CUR; + for (int i = 0; i < cur_bs; ++i) { + COMPUTE_CtHt_PEEPHOLE(cur_in_data, cur_prev_c_data, cur_c_out_data, + cur_h_out_data); + MOVE_ONE_BATCH; + } + MOVE_ONE_STEP; + } + } else { + for (int step = tstart; step < max_seq_len; ++step) { + const int cur_bs = batch_starts[step + 1] - batch_starts[step]; + GEMM_WH_ADDON(cur_bs, prev_h_data, batched_input_data); + DEFINE_CUR; + for (int i = 0; i < cur_bs; ++i) { + COMPUTE_CtHt(cur_in_data, cur_prev_c_data, cur_c_out_data, + cur_h_out_data); + MOVE_ONE_BATCH; + } + MOVE_ONE_STEP; + } + } +#undef MOVE_ONE_STEP +#undef MOVE_ONE_BATCH +#undef DEFINE_CUR + + math::Batch2LoDTensorFunctor to_seq; + batched_h_out->set_lod(batched_lod); + to_seq(dev_ctx, *batched_h_out, hidden_out); + batched_c_out->set_lod(batched_lod); + to_seq(dev_ctx, *batched_c_out, cell_out); + } + + void Compute(const framework::ExecutionContext& ctx) const override { + if (ctx.Attr("use_seq")) { + SeqCompute(ctx); + } else { + BatchCompute(ctx); + } + } + +#undef COMPUTE_CtHt_PEEPHOLE +#undef COMPUTE_CtHt +#undef GET_Ct_NOH0C0 +#undef COMPUTE_CtHt_NOH0C0 +#undef COMPUTE_CtHt_PEEPHOLE_NOH0C0 +#undef GET_Ht +#undef GET_Ct +#undef GEMM_WH_ADDON +#undef INIT_BASE_INPUT_DATAS +#undef INIT_BASE_SIZES +#undef INIT_BASE_INPUT_OUTPUT +#undef INIT_VEC_FUNC +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(fused_embedding_fc_lstm, ops::FusedEmbeddingFCLSTMOp, + ops::FusedEmbeddingFCLSTMOpMaker, + paddle::framework::DefaultGradOpDescMaker); + +REGISTER_OP_CPU_KERNEL(fused_embedding_fc_lstm, + ops::FusedEmbeddingFCLSTMKernel, + ops::FusedEmbeddingFCLSTMKernel); diff --git a/paddle/fluid/operators/fused_embedding_fc_lstm_op.h b/paddle/fluid/operators/fused_embedding_fc_lstm_op.h new file mode 100644 index 0000000000000000000000000000000000000000..2775b2ac04d2890355fe6d75a1e2507a2668dc95 --- /dev/null +++ b/paddle/fluid/operators/fused_embedding_fc_lstm_op.h @@ -0,0 +1,41 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +using Tensor = framework::Tensor; + +class FusedEmbeddingFCLSTMOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; +}; + +class FusedEmbeddingFCLSTMOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override; +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/fusion_gru_op.cc b/paddle/fluid/operators/fusion_gru_op.cc index 31e87d9113118ebe7a4b25ffee5ba55e2714fb66..a04c1c1263fba659e2d3f623b607e9f476bb40ed 100644 --- a/paddle/fluid/operators/fusion_gru_op.cc +++ b/paddle/fluid/operators/fusion_gru_op.cc @@ -290,12 +290,13 @@ class FusionGRUKernel : public framework::OpKernel { void BatchCompute(const framework::ExecutionContext& ctx) const { using DeviceContext = paddle::platform::CPUDeviceContext; auto* x = ctx.Input("X"); + INIT_BASE_INPUT_OUTPUT + INIT_BASE_SIZES if (x->lod()[0].size() == 2) { + xx->Resize({total_T, D3}); SeqCompute(ctx); return; } - INIT_BASE_INPUT_OUTPUT - INIT_BASE_SIZES INIT_VEC_FUNC auto* reordered_h0 = ctx.Output("ReorderedH0"); diff --git a/paddle/fluid/operators/fusion_lstm_op.cc b/paddle/fluid/operators/fusion_lstm_op.cc index 23e8edd18d037a7f9127482951f25be3abf1b62f..ae1f6d8e489039667d861a69acabf2c632ef2061 100644 --- a/paddle/fluid/operators/fusion_lstm_op.cc +++ b/paddle/fluid/operators/fusion_lstm_op.cc @@ -432,11 +432,12 @@ class FuisonLSTMKernel : public framework::OpKernel { void BatchCompute(const framework::ExecutionContext& ctx) const { using DeviceContext = platform::CPUDeviceContext; INIT_BASE_INPUT_OUTPUT + INIT_BASE_SIZES if (x->lod()[0].size() == 2) { + xx->Resize({x_dims[0], D4}); SeqCompute(ctx); return; } - INIT_BASE_SIZES INIT_VEC_FUNC INIT_BASE_INPUT_DATAS diff --git a/paddle/fluid/operators/math/cpu_lstm_compute.cc b/paddle/fluid/operators/math/cpu_lstm_compute.cc index 58e6512021203664573a0478dade052f92dd70bb..e96d1879331974e0873e13f171414bcfa8c45953 100644 --- a/paddle/fluid/operators/math/cpu_lstm_compute.cc +++ b/paddle/fluid/operators/math/cpu_lstm_compute.cc @@ -13,6 +13,31 @@ limitations under the License. */ namespace paddle { namespace operators { -namespace math {} // namespace math +namespace math { +#ifdef __AVX__ +template <> +void lstm_compute_ctht(float* gates, const float* ct_1, float* ct, + float* ht) { + namespace act = detail::forward::avx; + // gates: W_ch, W_ih, W_fh, W_oh + __m256 c, i, f, o; + c = _mm256_loadu_ps(gates); + i = _mm256_loadu_ps(gates + 8); + f = _mm256_loadu_ps(gates + 16); + o = _mm256_loadu_ps(gates + 24); + + /* C_t = C_t-1 * fgated + cand_gated * igated*/ + c = _mm256_mul_ps(act::Tanh(c), act::Sigmoid(i)); + i = _mm256_loadu_ps(ct_1); + f = _mm256_mul_ps(i, act::Sigmoid(f)); + f = _mm256_add_ps(c, f); + _mm256_storeu_ps(ct, f); + + /* H_t = act_cell(C_t) * ogated */ + o = _mm256_mul_ps(act::Tanh(f), act::Sigmoid(o)); + _mm256_storeu_ps(ht, o); +} +#endif +} // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/cpu_lstm_compute.h b/paddle/fluid/operators/math/cpu_lstm_compute.h index 28b6f71729edf1b8cc5d610d76af78dea213313e..169a9e4b47f54851ad436428416eca879b78e186 100644 --- a/paddle/fluid/operators/math/cpu_lstm_compute.h +++ b/paddle/fluid/operators/math/cpu_lstm_compute.h @@ -48,32 +48,15 @@ namespace forward { namespace avx { __m256 Sigmoid(const __m256 a); __m256 Tanh(const __m256 a); + } // namespace avx } // namespace forward } // namespace detail template <> void lstm_compute_ctht(float* gates, const float* ct_1, float* ct, - float* ht) { - namespace act = detail::forward::avx; - // gates: W_ch, W_ih, W_fh, W_oh - __m256 c, i, f, o; - c = _mm256_loadu_ps(gates); - i = _mm256_loadu_ps(gates + 8); - f = _mm256_loadu_ps(gates + 16); - o = _mm256_loadu_ps(gates + 24); + float* ht); - /* C_t = C_t-1 * fgated + cand_gated * igated*/ - c = _mm256_mul_ps(act::Tanh(c), act::Sigmoid(i)); - i = _mm256_loadu_ps(ct_1); - f = _mm256_mul_ps(i, act::Sigmoid(f)); - f = _mm256_add_ps(c, f); - _mm256_storeu_ps(ct, f); - - /* H_t = act_cell(C_t) * ogated */ - o = _mm256_mul_ps(act::Tanh(f), act::Sigmoid(o)); - _mm256_storeu_ps(ht, o); -} #endif } // namespace math diff --git a/paddle/fluid/operators/math/depthwise_conv.cu b/paddle/fluid/operators/math/depthwise_conv.cu index 027e2de48d229761f12f974dc73625c8ea1b3567..3be389912307f7aac6dda6d1018943eb8f08696d 100644 --- a/paddle/fluid/operators/math/depthwise_conv.cu +++ b/paddle/fluid/operators/math/depthwise_conv.cu @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include #include #include "paddle/fluid/operators/math/depthwise_conv.h" #include "paddle/fluid/platform/cuda_primitives.h" @@ -20,149 +21,268 @@ namespace paddle { namespace operators { namespace math { +template +__inline__ __device__ T warpReduceSum(T val) { +#if CUDA_VERSION < 9000 + for (int offset = 16; offset > 0; offset /= 2) + val += __shfl_down(val, offset); + return val; +#else +#define FULL_MASK 0xffffffff + for (int offset = 16; offset > 0; offset /= 2) + val += __shfl_down_sync(FULL_MASK, val, offset); + return val; +#endif +} +__forceinline__ __device__ unsigned lane_id() { + unsigned ret; + asm volatile("mov.u32 %0, %laneid;" : "=r"(ret)); + return ret; +} + +__forceinline__ __device__ unsigned warp_id() { + unsigned ret; + asm volatile("mov.u32 %0, %warpid;" : "=r"(ret)); + return ret; +} + // A Cuda kernel to compute the depthwise convolution forward pass // in NCHW format. template -__global__ void KernelDepthwiseConv( - const int nthreads, const T* const input_data, const T* const filter_data, - const int batch_size, const int output_channels, const int output_height, - const int output_width, const int input_channels, const int input_height, - const int input_width, const int filter_multiplier, const int filter_height, +__device__ __inline__ void KernelDepthwiseConv( + const T* const input_data, const T* const filter_data, const int batch_size, + const int output_channels, const int output_height, const int output_width, + const int input_channels, const int input_height, const int input_width, + const int filter_multiplier, const int filter_height, const int filter_width, const int stride_height, const int stride_width, - const int padding_height, const int padding_width, T* const output_data) { - int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; - - if (index < nthreads) { - const int batch = index / output_channels / output_height / output_width; - const int c_out = (index / output_height / output_width) % output_channels; - const int h_out = (index / output_width) % output_height; - const int w_out = index % output_width; - - const int c_in = c_out / filter_multiplier; - const T* weight = filter_data + c_out * filter_height * filter_width; - T value = 0; - const int h_in_start = -padding_height + h_out * stride_height; - const int w_in_start = -padding_width + w_out * stride_width; - const int h_in_end = h_in_start + filter_height; - const int w_in_end = w_in_start + filter_width; - - const int in_offset = - ((batch * input_channels + c_in) * input_height) * input_width; - - const int h_end = h_in_end < input_height ? h_in_end : input_height; - const int w_end = w_in_end < input_width ? w_in_end : input_width; - const int h_start = h_in_start > 0 ? h_in_start : 0; - const int w_start = w_in_start > 0 ? w_in_start : 0; - - for (int h_in = h_start; h_in < h_end; h_in++) { - for (int w_in = w_start; w_in < w_end; w_in++) { - const int offset = in_offset + h_in * input_width + w_in; - value += - weight[(h_in - h_in_start) * filter_width + (w_in - w_in_start)] * - input_data[offset]; + const int padding_height, const int padding_width, const int dilate_height, + const int dilate_width, T* const output_data) { + for (int w_out = threadIdx.x; w_out < output_width; w_out += blockDim.x) { + for (int h_out = threadIdx.y; h_out < output_height; h_out += blockDim.y) { + const int batch = blockIdx.y; + const int c_out = blockIdx.x; + + const int c_in = c_out / filter_multiplier; + const T* weight = filter_data + c_out * filter_height * filter_width; + T value = 0; + const int h_in_start = -padding_height + h_out * stride_height; + const int w_in_start = -padding_width + w_out * stride_width; + const int h_in_end = h_in_start + filter_height * dilate_height; + const int w_in_end = w_in_start + filter_width * dilate_width; + + const int in_offset = + ((batch * input_channels + c_in) * input_height) * input_width; + + const int h_end = h_in_end < input_height ? h_in_end : input_height; + const int w_end = w_in_end < input_width ? w_in_end : input_width; + const int h_start = h_in_start > 0 ? h_in_start : 0; + const int w_start = w_in_start > 0 ? w_in_start : 0; + int weight_offset = 0; + + for (int h_in = h_in_start; h_in < h_in_end; h_in += dilate_height) { + for (int w_in = w_in_start; w_in < w_in_end; w_in += dilate_width) { + if (h_in >= h_start && h_in < h_end && w_in >= w_start && + w_in < w_end) { + const int offset = in_offset + h_in * input_width + w_in; + value += weight[weight_offset] * input_data[offset]; + } + weight_offset++; + } } + int index = + ((batch * gridDim.x + c_out) * output_height + h_out) * output_width + + w_out; + output_data[index] = value; } - output_data[index] = value; } } +template +__global__ void KernelDepthwiseConvSp( + const T* const input_data, const T* const filter_data, const int batch_size, + const int output_channels, const int output_height, const int output_width, + const int input_channels, const int input_height, const int input_width, + const int filter_multiplier, const int filter_height, + const int filter_width, const int stride_height, const int stride_width, + const int padding_height, const int padding_width, const int dilate_height, + const int dilate_width, T* const output_data) { + if (c_filter_multiplier == 0) + KernelDepthwiseConv(input_data, filter_data, batch_size, output_channels, + output_height, output_width, input_channels, + input_height, input_width, filter_multiplier, + filter_height, filter_width, stride_height, + stride_width, padding_height, padding_width, + dilate_height, dilate_width, output_data); + + else + KernelDepthwiseConv(input_data, filter_data, batch_size, output_channels, + output_height, output_width, input_channels, + input_height, input_width, c_filter_multiplier, + filter_height, filter_height, c_stride, c_stride, + padding_height, padding_width, dilate_height, + dilate_width, output_data); +} + // CUDA kernel to compute the depthwise convolution backprop w.r.t input. template -__global__ void KernelDepthwiseConvInputGrad( - const int nthreads, const T* const output_grad_data, - const T* const filter_data, const int batch_size, const int output_channels, - const int output_height, const int output_width, const int input_channels, - const int input_height, const int input_width, const int filter_multiplier, - const int filter_height, const int filter_width, const int stride_height, - const int stride_width, const int padding_height, const int padding_width, - T* const input_grad_data) { - int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; - if (index < nthreads) { - const int batch = index / input_channels / input_height / input_width; - const int c_in = (index / input_height / input_width) % input_channels; - const int h_in = (index / input_width) % input_height; - const int w_in = index % input_width; - - const int c_out_start = c_in * filter_multiplier; - - int h_out_start = - (h_in - filter_height + padding_height + stride_height) / stride_height; - h_out_start = 0 > h_out_start ? 0 : h_out_start; - - int h_out_end = (h_in + padding_height) / stride_height; - h_out_end = output_height - 1 < h_out_end ? output_height - 1 : h_out_end; - - int w_out_start = - (w_in - filter_width + padding_width + stride_width) / stride_width; - w_out_start = 0 > w_out_start ? 0 : w_out_start; - - int w_out_end = (w_in + padding_width) / stride_width; - w_out_end = output_width - 1 < w_out_end ? output_width - 1 : w_out_end; - - T value = 0; - - for (int c_out = c_out_start; c_out < c_out_start + filter_multiplier; - c_out++) { - for (int h_out = h_out_start; h_out <= h_out_end; ++h_out) { - const int filter_h = h_in + padding_height - h_out * stride_height; - for (int w_out = w_out_start; w_out <= w_out_end; ++w_out) { - const int filter_w = w_in + padding_width - w_out * stride_width; - const int filter_offset = c_out * filter_height * filter_width + - filter_h * filter_width + filter_w; - const int output_grad_offset = - ((batch * output_channels + c_out) * output_height + h_out) * - output_width + - w_out; - value += - output_grad_data[output_grad_offset] * filter_data[filter_offset]; +__device__ __inline__ void KernelDepthwiseConvInputGrad( + const T* const output_grad_data, const T* const filter_data, + const int batch_size, const int output_channels, const int output_height, + const int output_width, const int input_channels, const int input_height, + const int input_width, const int filter_multiplier, const int filter_height, + const int filter_width, const int stride_height, const int stride_width, + const int padding_height, const int padding_width, const int dilate_height, + const int dilate_width, T* const input_grad_data) { + for (int w_in = threadIdx.x; w_in < input_width; w_in += blockDim.x) { + for (int h_in = threadIdx.y; h_in < input_height; h_in += blockDim.y) { + const int batch = blockIdx.y; + const int c_in = blockIdx.x; + + const int c_out_start = c_in * filter_multiplier; + + int h_out_start = + h_in - (filter_height - 1) * dilate_height + padding_height; + + int h_out_end = h_in + padding_height; + + int w_out_start = + w_in - (filter_width - 1) * dilate_width + padding_width; + + int w_out_end = w_in + padding_width; + + T value = 0; + + for (int c_out = c_out_start; c_out < c_out_start + filter_multiplier; + c_out++) { + int filter_offset = (c_out + 1) * filter_height * filter_width; + for (int h_out = h_out_start; h_out <= h_out_end; + h_out += dilate_height) { + for (int w_out = w_out_start; w_out <= w_out_end; + w_out += dilate_width) { + filter_offset--; + int s_h_out = h_out / stride_height; + int s_w_out = w_out / stride_width; + if (h_out % stride_height == 0 && w_out % stride_width == 0 && + s_h_out >= 0 && s_h_out < output_height && s_w_out >= 0 && + s_w_out < output_width) { + const int output_grad_offset = + ((batch * output_channels + c_out) * output_height + + s_h_out) * + output_width + + s_w_out; + value += output_grad_data[output_grad_offset] * + filter_data[filter_offset]; + } + } } } + int index = + ((batch * gridDim.x + c_in) * input_height + h_in) * input_width + + w_in; + input_grad_data[index] = value; } - input_grad_data[index] += value; } } +template +__global__ void KernelDepthwiseConvInputGradSp( + const T* const output_grad_data, const T* const filter_data, + const int batch_size, const int output_channels, const int output_height, + const int output_width, const int input_channels, const int input_height, + const int input_width, const int filter_multiplier, const int filter_height, + const int filter_width, const int stride_height, const int stride_width, + const int padding_height, const int padding_width, const int dilate_height, + const int dilate_width, T* const input_grad_data) { + if (c_filter_multiplier == 0) + KernelDepthwiseConvInputGrad( + output_grad_data, filter_data, batch_size, output_channels, + output_height, output_width, input_channels, input_height, input_width, + filter_multiplier, filter_height, filter_width, stride_height, + stride_width, padding_height, padding_width, dilate_height, + dilate_width, input_grad_data); + else + KernelDepthwiseConvInputGrad( + output_grad_data, filter_data, batch_size, output_channels, + output_height, output_width, input_channels, input_height, input_width, + c_filter_multiplier, filter_height, filter_width, c_stride, c_stride, + padding_height, padding_width, dilate_height, dilate_width, + input_grad_data); +} + // Cuda kernel to compute the depthwise convolution backprop w.r.t. filter. template -__global__ void KernelDepthwiseConvFilterGrad( - const int nthreads, const T* const output_grad_data, - const T* const input_data, const int num, const int output_channels, - const int output_height, const int output_width, const int input_channels, - const int input_height, const int input_width, const int filter_multiplier, - const int filter_height, const int filter_width, const int stride_height, - const int stride_width, const int padding_height, const int padding_width, - T* const filter_grad_data) { - int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; - if (index < nthreads) { - const int w_out = index % output_width; - const int h_out = (index / output_width) % output_height; - const int c_out = (index / output_width / output_height) % output_channels; - const int batch = (index / output_width / output_height / output_channels); - const int c_in = c_out / filter_multiplier; - const int h_in_start = -padding_height + h_out * stride_height; - const int w_in_start = -padding_width + w_out * stride_width; - const int h_in_end = - -padding_height + h_out * stride_height + filter_height; - const int w_in_end = -padding_width + w_out * stride_width + filter_width; - const int in_offset = - (batch * input_channels + c_in) * input_height * input_width; - - T* addr_offset = filter_grad_data + c_out * filter_height * filter_width; - const int h_end = h_in_end < input_height ? h_in_end : input_height; - const int w_end = w_in_end < input_width ? w_in_end : input_width; - const int h_start = h_in_start > 0 ? h_in_start : 0; - const int w_start = w_in_start > 0 ? w_in_start : 0; - - for (int h_in = h_start; h_in < h_end; h_in++) { - for (int w_in = w_start; w_in < w_end; w_in++) { - const int offset = in_offset + h_in * input_width + w_in; - const T diff_temp = output_grad_data[index] * input_data[offset]; - T* addr = addr_offset + (h_in - h_in_start) * filter_width + - (w_in - w_in_start); - paddle::platform::CudaAtomicAdd(addr, diff_temp); +__device__ __inline__ void KernelDepthwiseConvFilterGrad( + const T* output_grad_data, const T* input_data, const int num, + const int output_channels, const int output_height, const int output_width, + const int input_channels, const int input_height, const int input_width, + const int filter_multiplier, const int filter_height, + const int filter_width, const int stride_height, const int stride_width, + const int padding_height, const int padding_width, const int dilate_height, + const int dilate_width, T* filter_grad_data) { + T s = 0; + + int gbid = ((blockIdx.z * gridDim.y) + blockIdx.y) * gridDim.x + blockIdx.x; + int lid = lane_id(); + + for (int image_w = threadIdx.x; image_w < output_width; + image_w += blockDim.x) { + for (int bid = 0; bid < num; bid++) { + for (int image_h = threadIdx.y; image_h < output_height; + image_h += blockDim.y) { + int kernel_id = blockIdx.z; + int kernel_h = blockIdx.y * dilate_height - padding_height; + int kernel_w = blockIdx.x * dilate_width - padding_width; + + int image_hk = image_h * stride_height + kernel_h; + int image_wk = image_w * stride_width + kernel_w; + if (image_hk < 0 || image_hk >= input_height) continue; + if (image_wk < 0 || image_wk >= input_width) continue; +#define gaid(N, C, H, W) \ + ((((N)*gridDim.z + (C)) * output_height + (H)) * output_width + (W)) + + s += output_grad_data[gaid(bid, kernel_id, image_h, image_w)] * + input_data[((bid * (gridDim.z / filter_multiplier) + + kernel_id / filter_multiplier) * + input_height + + image_hk) * + input_width + + image_wk]; + +#undef gaid } } } +#if __CUDA_ARCH__ >= 530 + s = warpReduceSum(s); + if (lid == 0) paddle::platform::CudaAtomicAdd(&filter_grad_data[gbid], s); +#else + paddle::platform::CudaAtomicAdd(&filter_grad_data[gbid], s); +#endif +} + +template +__global__ void KernelDepthwiseConvFilterGradSp( + const T* output_grad_data, const T* input_data, const int num, + const int output_channels, const int output_height, const int output_width, + const int input_channels, const int input_height, const int input_width, + const int filter_multiplier, const int filter_height, + const int filter_width, const int stride_height, const int stride_width, + const int padding_height, const int padding_width, const int dilate_height, + const int dilate_width, T* filter_grad_data) { + if (c_filter_multiplier == 0) + KernelDepthwiseConvFilterGrad( + output_grad_data, input_data, num, output_channels, output_height, + output_width, input_channels, input_height, input_width, + filter_multiplier, filter_height, filter_width, stride_height, + stride_width, padding_height, padding_width, dilate_height, + dilate_width, filter_grad_data); + else + KernelDepthwiseConvFilterGrad( + output_grad_data, input_data, num, output_channels, output_height, + output_width, input_channels, input_height, input_width, + c_filter_multiplier, filter_height, filter_width, stride_height, + stride_width, padding_height, padding_width, dilate_height, + dilate_width, filter_grad_data); } /* @@ -177,7 +297,9 @@ class DepthwiseConvFunctor { const framework::Tensor& input, const framework::Tensor& filter, const std::vector& strides, - const std::vector& paddings, framework::Tensor* output) { + const std::vector& paddings, + const std::vector& dilations, + framework::Tensor* output) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; const int input_height = input.dims()[2]; @@ -191,22 +313,37 @@ class DepthwiseConvFunctor { const int stride_width = strides[1]; const int padding_height = paddings[0]; const int padding_width = paddings[1]; + const int dilate_height = dilations[0]; + const int dilate_width = dilations[1]; const T* input_data = input.data(); const T* filter_data = filter.data(); T* output_data = output->mutable_data(context.GetPlace()); - int nthreads = batch_size * output_channels * output_height * output_width; - int blocks = (nthreads + 1024 - 1) / 1024; - dim3 threads(1024, 1); - dim3 grid(blocks, 1); - - KernelDepthwiseConv<<>>( - nthreads, input_data, filter_data, batch_size, output_channels, - output_height, output_width, input_channels, input_height, input_width, - output_channels / input_channels, ksize_height, ksize_width, - stride_height, stride_width, padding_height, padding_width, - output_data); + int thread = 512; + int blocks = std::min(std::max(thread / output_width, 1), output_height); + dim3 threads(std::min(output_width, thread), blocks, 1); + dim3 grid(output_channels, batch_size, 1); + int filter_multiplier = output_channels / input_channels; +#define check_case(c_filter_multiplier, c_stride) \ + if (c_filter_multiplier == 0 || \ + filter_multiplier == c_filter_multiplier && \ + stride_height == stride_width && stride_height == c_stride) { \ + KernelDepthwiseConvSp<<>>( \ + input_data, filter_data, batch_size, output_channels, output_height, \ + output_width, input_channels, input_height, input_width, \ + filter_multiplier, ksize_height, ksize_width, stride_height, \ + stride_width, padding_height, padding_width, dilate_height, \ + dilate_width, output_data); \ + return; \ + } + check_case(1, 1); + check_case(1, 2); + // NOTE(liangdun): 0,0 for other case + // add other case if needed, e.g. check_case(2^n,1) + check_case(0, 0); +#undef check_case } }; @@ -219,6 +356,7 @@ class DepthwiseConvInputGradFunctor { const framework::Tensor& output_grad, const std::vector& strides, const std::vector& paddings, + const std::vector& dilations, framework::Tensor* input_grad) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; @@ -233,22 +371,39 @@ class DepthwiseConvInputGradFunctor { const int stride_width = strides[1]; const int padding_height = paddings[0]; const int padding_width = paddings[1]; + const int dilate_height = dilations[0]; + const int dilate_width = dilations[1]; const T* filter_data = filter.data(); const T* output_grad_data = output_grad.data(); T* input_grad_data = input_grad->mutable_data(context.GetPlace()); - int nthreads = batch_size * input_channels * input_height * input_width; - int blocks = (nthreads + 1024 - 1) / 1024; - dim3 threads(1024, 1); - dim3 grid(blocks, 1); - - KernelDepthwiseConvInputGrad<<>>( - nthreads, output_grad_data, filter_data, batch_size, output_channels, - output_height, output_width, input_channels, input_height, input_width, - output_channels / input_channels, ksize_height, ksize_width, - stride_height, stride_width, padding_height, padding_width, - input_grad_data); + int thread = 512; + int blocks = std::min(std::max(thread / input_width, 1), input_height); + dim3 threads(std::min(input_width, thread), blocks, 1); + dim3 grid(input_channels, batch_size, 1); + int filter_multiplier = output_channels / input_channels; + +#define check_case(c_filter_multiplier, c_stride) \ + if (c_filter_multiplier == 0 || \ + filter_multiplier == c_filter_multiplier && \ + stride_height == stride_width && stride_height == c_stride) { \ + KernelDepthwiseConvInputGradSp< \ + T, c_filter_multiplier, \ + c_stride><<>>( \ + output_grad_data, filter_data, batch_size, output_channels, \ + output_height, output_width, input_channels, input_height, \ + input_width, filter_multiplier, ksize_height, ksize_width, \ + stride_height, stride_width, padding_height, padding_width, \ + dilate_height, dilate_width, input_grad_data); \ + return; \ + } + check_case(1, 1); + check_case(1, 2); + // NOTE(liangdun): 0,0 for other case + // add other case if needed, e.g. check_case(2^n,1) + check_case(0, 0); +#undef check_case } }; @@ -260,6 +415,7 @@ class DepthwiseConvFilterGradFunctor { const framework::Tensor& output_grad, const std::vector& strides, const std::vector& paddings, + const std::vector& dilations, framework::Tensor* filter_grad) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; @@ -274,23 +430,34 @@ class DepthwiseConvFilterGradFunctor { const int stride_width = strides[1]; const int padding_height = paddings[0]; const int padding_width = paddings[1]; + const int dilate_height = dilations[0]; + const int dilate_width = dilations[1]; const T* input_data = input.data(); const T* output_grad_data = output_grad.data(); T* filter_grad_data = filter_grad->mutable_data(context.GetPlace()); - int nthreads = batch_size * output_channels * output_height * output_width; - - int blocks = (nthreads + 1024 - 1) / 1024; - dim3 threads(1024, 1); - dim3 grid(blocks, 1); - - KernelDepthwiseConvFilterGrad<<>>( - nthreads, output_grad_data, input_data, batch_size, output_channels, - output_height, output_width, input_channels, input_height, input_width, - output_channels / input_channels, ksize_height, ksize_width, - stride_height, stride_width, padding_height, padding_width, - filter_grad_data); + int block_size = 512; + int crop_output_height = + std::min(std::max(block_size / output_width, 1), output_height); + dim3 grid(ksize_width, ksize_height, output_channels); + dim3 threads(std::min(output_width, block_size), crop_output_height, 1); + int filter_multiplier = output_channels / input_channels; + +#define check_case(c_filter_multiplier) \ + if (c_filter_multiplier == 0 || c_filter_multiplier == filter_multiplier) { \ + KernelDepthwiseConvFilterGradSp< \ + T, c_filter_multiplier><<>>( \ + output_grad_data, input_data, batch_size, output_channels, \ + output_height, output_width, input_channels, input_height, \ + input_width, filter_multiplier, ksize_height, ksize_width, \ + stride_height, stride_width, padding_height, padding_width, \ + dilate_height, dilate_width, filter_grad_data); \ + return; \ + } + check_case(1); + check_case(0); +#undef check_case } }; diff --git a/paddle/fluid/operators/math/depthwise_conv.h b/paddle/fluid/operators/math/depthwise_conv.h index 97aec401889a56d3fc9ac08e766d931bb3725b01..71f6fcb23df1942d6dcf7177165f2ec1022a9b35 100644 --- a/paddle/fluid/operators/math/depthwise_conv.h +++ b/paddle/fluid/operators/math/depthwise_conv.h @@ -32,7 +32,8 @@ class DepthwiseConvFunctor { void operator()(const DeviceContext& context, const framework::Tensor& input, const framework::Tensor& filter, const std::vector& strides, - const std::vector& paddings, framework::Tensor* output); + const std::vector& paddings, + const std::vector& dilations, framework::Tensor* output); }; template @@ -43,6 +44,7 @@ class DepthwiseConvInputGradFunctor { const framework::Tensor& output_grad, const std::vector& strides, const std::vector& paddings, + const std::vector& dilations, framework::Tensor* input_grad); }; @@ -53,6 +55,7 @@ class DepthwiseConvFilterGradFunctor { const framework::Tensor& output_grad, const std::vector& strides, const std::vector& paddings, + const std::vector& dilations, framework::Tensor* filter_grad); }; diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc index 5923792902a81521256de300f77955f1ea3d16c6..854c8653ff545cb12eef79837d0312bb28458af8 100644 --- a/paddle/fluid/operators/math/math_function.cc +++ b/paddle/fluid/operators/math/math_function.cc @@ -13,6 +13,15 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/math_function.h" + +#ifdef PADDLE_WITH_MKLML +#include "paddle/fluid/platform/dynload/mklml.h" +#endif + +#ifdef PADDLE_USE_OPENBLAS +#include +#endif + #include #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/operators/math/math_function_impl.h" diff --git a/paddle/fluid/operators/math/math_function.h b/paddle/fluid/operators/math/math_function.h index c63ad89e46d2c187c7e6fe6b2fe73fbbed5f4044..b4f19417b6eabf24805c5c8128c2a6d423ddac69 100644 --- a/paddle/fluid/operators/math/math_function.h +++ b/paddle/fluid/operators/math/math_function.h @@ -13,18 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#ifdef PADDLE_WITH_MKLML -#include "paddle/fluid/platform/dynload/mklml.h" -#endif - -#ifdef PADDLE_USE_OPENBLAS -#include -// remove typedef in openblas -#undef FLOAT -#undef INT -#undef SIZE -#endif - #include #include diff --git a/paddle/fluid/operators/reduce_mean_op.cu b/paddle/fluid/operators/reduce_mean_op.cu index 960cb3235be7f4cc98b97d3b088ceaeb3d4a4209..59b30244839849d79e3e531953134633503c4090 100644 --- a/paddle/fluid/operators/reduce_mean_op.cu +++ b/paddle/fluid/operators/reduce_mean_op.cu @@ -12,17 +12,64 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include +#include "paddle/fluid/operators/cub_reduce.h" #include "paddle/fluid/operators/reduce_mean_op.h" -REGISTER_OP_CUDA_KERNEL(reduce_mean, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel); +namespace paddle { +namespace operators { + +template +struct DivideFunctor { + HOSTDEVICE explicit inline DivideFunctor(int n) : n_inv((T)(1.0 / n)) {} + + HOSTDEVICE inline T operator()(const T& x) const { return x * n_inv; } + + private: + T n_inv; +}; + +template +class ReduceMeanKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + bool reduce_all = context.Attr("reduce_all"); + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + + auto dims = context.Attr>("dim"); + bool keep_dim = context.Attr("keep_dim"); + + std::vector reduce_dims; + if (reduce_all) { + reduce_dims.resize(input->dims().size()); + for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i; + } else { + for (auto e : dims) { + reduce_dims.push_back(e >= 0 ? e : e + input->dims().size()); + } + } + + int reduce_num = 1; + for (int i = 0; i < reduce_dims.size(); ++i) { + reduce_num *= input->dims()[reduce_dims[i]]; + } + + auto stream = context.cuda_device_context().stream(); + TensorReduce>( + *input, output, reduce_dims, static_cast(0), cub::Sum(), + DivideFunctor(reduce_num), stream); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_CUDA_KERNEL(reduce_mean, ops::ReduceMeanKernel, + ops::ReduceMeanKernel, + ops::ReduceMeanKernel, + ops::ReduceMeanKernel); + REGISTER_OP_CUDA_KERNEL( reduce_mean_grad, ops::ReduceGradKernel, diff --git a/paddle/fluid/operators/reduce_sum_op.cu b/paddle/fluid/operators/reduce_sum_op.cu index f2e16955a50dc6a7feda9fbaf968c929ef3d8a4f..53cd9e9419dd9aecee730917ae21d7a4ab332ffc 100644 --- a/paddle/fluid/operators/reduce_sum_op.cu +++ b/paddle/fluid/operators/reduce_sum_op.cu @@ -12,17 +12,59 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/operators/cub_reduce.h" #include "paddle/fluid/operators/reduce_sum_op.h" -REGISTER_OP_CUDA_KERNEL(reduce_sum, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel); +namespace paddle { +namespace operators { + +template +struct IdentityFunctor { + HOSTDEVICE explicit inline IdentityFunctor() {} + + HOSTDEVICE inline T operator()(const T& x) const { return x; } +}; + +template +class ReduceSumKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + bool reduce_all = context.Attr("reduce_all"); + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + + auto dims = context.Attr>("dim"); + bool keep_dim = context.Attr("keep_dim"); + + std::vector reduce_dims; + if (reduce_all) { + reduce_dims.resize(input->dims().size()); + for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i; + } else { + for (auto e : dims) { + reduce_dims.push_back(e >= 0 ? e : e + input->dims().size()); + } + } + + int reduce_num = 1; + for (int i = 0; i < reduce_dims.size(); ++i) { + reduce_num *= input->dims()[reduce_dims[i]]; + } + + auto stream = context.cuda_device_context().stream(); + TensorReduce>( + *input, output, reduce_dims, static_cast(0), cub::Sum(), + IdentityFunctor(), stream); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_CUDA_KERNEL(reduce_sum, ops::ReduceSumKernel, + ops::ReduceSumKernel, ops::ReduceSumKernel, + ops::ReduceSumKernel); + REGISTER_OP_CUDA_KERNEL( reduce_sum_grad, ops::ReduceGradKernel, diff --git a/paddle/fluid/operators/select_op.cc b/paddle/fluid/operators/select_op.cc deleted file mode 100644 index e71841d4d1815d50cd9800910c9db34e121beffc..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/select_op.cc +++ /dev/null @@ -1,419 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include // NOLINT -#include -#include "paddle/fluid/framework/channel.h" -#include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/concurrency/channel_util.h" - -#include - -namespace paddle { -namespace operators { - -static constexpr char kX[] = "X"; -static constexpr char kCaseToExecute[] = "case_to_execute"; -static constexpr char kOutputs[] = "Out"; - -static constexpr char kCases[] = "cases"; -static constexpr char kCasesBlock[] = "sub_block"; - -class SelectOp : public framework::OperatorBase { - public: - SelectOp(const std::string &type, const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : framework::OperatorBase(type, inputs, outputs, attrs) {} - - private: - enum class SelectOpCaseType { - DEFAULT = 0, - SEND = 1, - RECEIVE = 2, - }; - - struct SelectOpCase { - int caseIndex; - SelectOpCaseType caseType; - std::string channelName; - std::string varName; - - SelectOpCase() {} - - SelectOpCase(int caseIndex, SelectOpCaseType caseType, - std::string channelName, std::string varName) - : caseIndex(caseIndex), - caseType(caseType), - channelName(channelName), - varName(varName) {} - }; - - void RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const override { - std::vector casesConfigs = - Attr>(kCases); - - framework::BlockDesc *casesBlock = - Attr(kCasesBlock); - - framework::Scope &casesBlockScope = scope.NewScope(); - - std::string caseToExecuteVarName = Input(kCaseToExecute); - framework::Variable *caseToExecuteVar = - casesBlockScope.FindVar(caseToExecuteVarName); - - // Construct cases from "conditional_block_op"(s) in the casesBlock - std::vector> cases = - ParseAndShuffleCases(&casesConfigs); - - // Get all unique channels involved in select - std::set channelsSet; - for (auto c : cases) { - if (!c->channelName.empty()) { - auto channelVar = scope.FindVar(c->channelName); - framework::ChannelHolder *ch = - channelVar->GetMutable(); - - if (channelsSet.find(ch) == channelsSet.end()) { - channelsSet.insert(ch); - } - } - } - - // Order all channels by their pointer address - std::vector channels(channelsSet.begin(), - channelsSet.end()); - std::sort(channels.begin(), channels.end()); - - // Poll all cases - int32_t caseToExecute = pollCases(&scope, &cases, channels); - - // At this point, the case to execute has already been determined, - // so we can proceed with executing the cases block - framework::LoDTensor *caseToExecuteTensor = - caseToExecuteVar->GetMutable(); - caseToExecuteTensor->data()[0] = caseToExecute; - - // Execute the cases block, only one case will be executed since we set the - // case_to_execute value to the index of the case we want to execute - framework::Executor executor(dev_place); - framework::ProgramDesc *program = casesBlock->Program(); - executor.Run(*program, &casesBlockScope, casesBlock->ID(), - false /*create_local_scope*/); - } - - /** - * Goes through all operators in the casesConfigs and processes - * "conditional_block" operators. These operators are mapped to our - * SelectOpCase objects. We randomize the case orders, and set the - * default case (if any exists) as the last case) - * @param casesBlock - * @return - */ - std::vector> ParseAndShuffleCases( - std::vector *casesConfigs) const { - std::vector> cases; - std::shared_ptr defaultCase; - - if (casesConfigs != nullptr) { - boost::char_delimiters_separator sep(false, ",", ""); - for (std::vector::iterator itr = casesConfigs->begin(); - itr < casesConfigs->end(); ++itr) { - std::string caseConfig = *itr; - boost::tokenizer<> tokens(caseConfig, sep); - - boost::tokenizer<>::iterator tok_iter = tokens.begin(); - PADDLE_ENFORCE(tok_iter != tokens.end(), "Cannot get case index"); - std::string caseIndexString = *tok_iter; - int caseIndex = std::stoi(caseIndexString); - - ++tok_iter; - PADDLE_ENFORCE(tok_iter != tokens.end(), "Cannot get case type"); - std::string caseTypeString = *tok_iter; - SelectOpCaseType caseType = (SelectOpCaseType)std::stoi(caseTypeString); - - std::string caseChannel; - std::string caseChannelVar; - - ++tok_iter; - if (caseType != SelectOpCaseType::DEFAULT) { - PADDLE_ENFORCE(tok_iter != tokens.end(), "Cannot get case channel"); - caseChannel = *tok_iter; - - ++tok_iter; - PADDLE_ENFORCE(tok_iter != tokens.end(), - "Cannot get case channel variable"); - caseChannelVar = *tok_iter; - } - - auto c = std::make_shared(caseIndex, caseType, - caseChannel, caseChannelVar); - - if (caseType == SelectOpCaseType::DEFAULT) { - PADDLE_ENFORCE(defaultCase == nullptr, - "Select can only contain one default case."); - defaultCase = c; - } else { - cases.push_back(c); - } - } - } - - // Randomly sort cases, with default case being last - std::random_shuffle(cases.begin(), cases.end()); - if (defaultCase != nullptr) { - cases.push_back(defaultCase); - } - - return cases; - } - - /** - * This method will recursively poll the cases and determines if any case - * condition is true. - * If none of the cases conditions are true (and there is no default case), - * then block - * the thread. The thread may be woken up by a channel operation, at which - * point we - * execute the case. - * @param scope - * @param cases - * @param channels - * @return - */ - int32_t pollCases(const framework::Scope *scope, - std::vector> *cases, - std::vector channels) const { - // Lock all involved channels - lockChannels(channels); - - std::atomic caseToExecute(-1); - - std::vector>::iterator it = cases->begin(); - while (it != cases->end()) { - std::shared_ptr c = *it; - - auto chVar = scope->FindVar(c->channelName); - framework::ChannelHolder *ch = - chVar->GetMutable(); - - switch (c->caseType) { - case SelectOpCaseType::SEND: - PADDLE_ENFORCE(!ch->IsClosed(), "Cannot send to a closed channel"); - if (ch->CanSend()) { - // We can send to channel directly, send the data to channel - // and execute case - auto chVar = scope->FindVar(c->varName); - concurrency::ChannelSend(ch, chVar); - caseToExecute = c->caseIndex; - } - break; - case SelectOpCaseType::RECEIVE: - if (ch->CanReceive()) { - // We can receive from channel directly, send the data to channel - // and execute case - auto chVar = scope->FindVar(c->varName); - concurrency::ChannelReceive(ch, chVar); - caseToExecute = c->caseIndex; - } - break; - case SelectOpCaseType::DEFAULT: - caseToExecute = c->caseIndex; - break; - } - - if (caseToExecute != -1) { - // We found a case to execute, stop looking at other case statements - break; - } - - ++it; - } - - if (caseToExecute == -1) { - // None of the cases are eligible to execute, enqueue current thread - // into all the sending/receiving queue of each involved channel - std::atomic completed(false); - std::recursive_mutex mutex; - std::unique_lock lock{mutex}; - // std::condition_variable_any selectCond; - auto selectCond = std::make_shared(); - - std::recursive_mutex callbackMutex; - pushThreadOnChannelQueues(scope, cases, selectCond, &caseToExecute, - &completed, &callbackMutex); - - // TODO(thuan): Atomically unlock all channels and sleep current thread - unlockChannels(channels); - selectCond->wait(lock, [&completed]() { return completed.load(); }); - - // Select has been woken up by case operation - lockChannels(channels); - removeThreadOnChannelQueues(scope, cases); - - if (caseToExecute == -1) { - // Recursively poll cases, since we were woken up by a channel close - // TODO(thuan): Need to test if this is a valid case - unlockChannels(channels); - return pollCases(scope, cases, channels); - } - } - - // At this point, caseToExecute != -1, and we can proceed with executing - // the case block - unlockChannels(channels); - - return caseToExecute; - } - - void lockChannels(std::vector chs) const { - std::vector::iterator it = chs.begin(); - while (it != chs.end()) { - framework::ChannelHolder *ch = *it; - ch->Lock(); - ++it; - } - } - - void unlockChannels(std::vector chs) const { - std::vector::reverse_iterator it = chs.rbegin(); - while (it != chs.rend()) { - framework::ChannelHolder *ch = *it; - ch->Unlock(); - ++it; - } - } - - void pushThreadOnChannelQueues( - const framework::Scope *scope, - std::vector> *cases, - std::shared_ptr rCond, - std::atomic *caseToExecute, std::atomic *completed, - std::recursive_mutex *callbackMutex) const { - std::vector>::iterator it = cases->begin(); - while (it != cases->end()) { - std::shared_ptr c = *it; - - auto chVar = scope->FindVar(c->channelName); - framework::ChannelHolder *ch = - chVar->GetMutable(); - - std::function cb = - [&caseToExecute, &completed, &callbackMutex, - c](framework::ChannelAction channelAction) { - std::lock_guard lock{*callbackMutex}; - - bool canProcess = false; - if (!(*completed)) { - // If the channel wasn't closed, we set the caseToExecute index - // as this current case - if (channelAction != framework::ChannelAction::CLOSE) { - *caseToExecute = c->caseIndex; - } - // This will allow our conditional variable to break out of wait - *completed = true; - canProcess = true; - } - - return canProcess; - }; - - switch (c->caseType) { - case SelectOpCaseType::SEND: { - auto chOutputVar = scope->FindVar(c->varName); - concurrency::ChannelAddToSendQ(ch, this, chOutputVar, rCond, cb); - break; - } - case SelectOpCaseType::RECEIVE: { - auto chOutputVar = scope->FindVar(c->varName); - concurrency::ChannelAddToReceiveQ(ch, this, chOutputVar, rCond, cb); - break; - } - default: - break; - } - ++it; - } - } - - void removeThreadOnChannelQueues( - const framework::Scope *scope, - std::vector> *cases) const { - std::vector>::iterator it = cases->begin(); - while (it != cases->end()) { - std::shared_ptr c = *it; - - auto chVar = scope->FindVar(c->channelName); - framework::ChannelHolder *ch = - chVar->GetMutable(); - switch (c->caseType) { - case SelectOpCaseType::SEND: { - ch->RemoveFromSendQ(this); - break; - } - case SelectOpCaseType::RECEIVE: { - ch->RemoveFromReceiveQ(this); - break; - } - default: - break; - } - ++it; - } - } -}; - -class SelectOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput(kX, - "A set of variables, which are required by operators inside the " - "cases of Select Op") - .AsDuplicable(); - AddInput(kCaseToExecute, - "(Int) The variable the sets the index of the case to execute, " - "after evaluating the channels being sent to and received from") - .AsDuplicable(); - AddOutput(kOutputs, - "A set of variables, which will be assigned with values " - "generated by the operators inside the cases of Select Op.") - .AsDuplicable(); - AddAttr>(kCases, - "(String vector) Serialized list of" - "all cases in the select op. Each" - "case is serialized as: " - "',,,'" - "where type is 0 for default, 1 for" - "send, and 2 for receive" - "No channel and values are needed for" - "default cases."); - AddAttr(kCasesBlock, - "The cases block inside select_op"); - AddComment(R"DOC( -)DOC"); - } -}; - -// TODO(thuan): Implement Gradient Operator for SELECT_OP - -} // namespace operators -} // namespace paddle - -REGISTER_OPERATOR(select, paddle::operators::SelectOp, - paddle::framework::EmptyGradOpMaker, - paddle::operators::SelectOpMaker); diff --git a/paddle/fluid/operators/sequence_erase_op.cc b/paddle/fluid/operators/sequence_erase_op.cc index 1c86486157a02c3b78ed61e840fd8e452b9cb452..816ba123a6cbf84ec9b321d5d7cfef7fab9749b1 100644 --- a/paddle/fluid/operators/sequence_erase_op.cc +++ b/paddle/fluid/operators/sequence_erase_op.cc @@ -24,9 +24,9 @@ class SequenceEraseOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), - "Input(X) of SequenceEraseOp should not be null."); + "Input(X) of SequenceErase operator should not be null."); PADDLE_ENFORCE(ctx->HasOutput("Out"), - "Output(Out) of SequenceEraseOp should not be null."); + "Output(Out) of SequenceErase operator should not be null."); auto x_dims = ctx->GetInputDim("X"); PADDLE_ENFORCE(x_dims.size() == 2 && x_dims[1] == 1, "Input(X) of SequenceEraseOp should be a 2-D LoDTensor " diff --git a/paddle/fluid/operators/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt_engine_op.h index 3c78c29c1a30d74947be84cd2b52ad308e732a2d..d4ba0f9c33c91811647f9d19a332f139c16b0eb2 100644 --- a/paddle/fluid/operators/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt_engine_op.h @@ -34,7 +34,7 @@ namespace operators { using FluidDT = framework::proto::VarType_Type; using TRT_DT = nvinfer1::DataType; -namespace { // NOLINT +namespace { TRT_DT FluidDataType2TRT(FluidDT type) { switch (type) { diff --git a/paddle/fluid/operators/top_k_op.cc b/paddle/fluid/operators/top_k_op.cc index 92a0697e27ba0da66fa3b0f5380e7bd52575640d..4a8ac441cfaf642fde58ee30865a22e83c065498 100644 --- a/paddle/fluid/operators/top_k_op.cc +++ b/paddle/fluid/operators/top_k_op.cc @@ -30,8 +30,6 @@ class TopkOp : public framework::OperatorWithKernel { "Output(Indices) of TopkOp should not be null."); auto input_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE_EQ(input_dims.size(), 2, - "Rank of TopK op's input must be 2."); const int k = static_cast(ctx->Attrs().Get("k")); PADDLE_ENFORCE_GE(k, 1, "k must >= 1"); diff --git a/paddle/fluid/platform/dynload/cublas.h b/paddle/fluid/platform/dynload/cublas.h index c7c533bd42859c374c4783d43ec4cdd34a6a994a..4ea0cd7283b55649dbdbbf97f81f10c69ac6a1d2 100644 --- a/paddle/fluid/platform/dynload/cublas.h +++ b/paddle/fluid/platform/dynload/cublas.h @@ -55,7 +55,7 @@ extern void *cublas_dso_handle; struct DynLoad__##__name { \ template \ inline cublasStatus_t operator()(Args... args) { \ - return __name(args...); \ + return ::__name(args...); \ } \ }; \ extern DynLoad__##__name __name diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h index 0103e7a3accf88f3c83f109298010c3c9af3d549..e6353f67ef118072a2d8e49111e8ecc486589998 100644 --- a/paddle/fluid/platform/dynload/cudnn.h +++ b/paddle/fluid/platform/dynload/cudnn.h @@ -13,6 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#define GLOG_NO_ABBREVIATED_SEVERITIES +#define GOOGLE_GLOG_DLL_DECL +#include #include #include // NOLINT @@ -47,13 +50,13 @@ extern void EnforceCUDNNLoaded(const char* fn_name); #else -#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - auto operator()(Args... args) -> decltype(__name(args...)) { \ - return __name(args...); \ - } \ - }; \ +#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + inline cudnnStatus_t operator()(Args... args) { \ + return ::__name(args...); \ + } \ + }; \ extern DynLoad__##__name __name #endif diff --git a/paddle/fluid/platform/dynload/curand.h b/paddle/fluid/platform/dynload/curand.h index 2daf1b4215ce1f7f771bbac72bfe103b0b941976..0bb300ec33076d9ddfaf69190f14131279cc888e 100644 --- a/paddle/fluid/platform/dynload/curand.h +++ b/paddle/fluid/platform/dynload/curand.h @@ -44,7 +44,7 @@ extern void *curand_dso_handle; struct DynLoad__##__name { \ template \ curandStatus_t operator()(Args... args) { \ - return __name(args...); \ + return ::__name(args...); \ } \ }; \ extern DynLoad__##__name __name diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc index 6a3ad2151081504fda2a3818c5f99ad47039d91d..cc5cda6106c188f3156d33480b5d3641eed32556 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.cc +++ b/paddle/fluid/platform/dynload/dynamic_loader.cc @@ -107,7 +107,11 @@ static inline void* GetDsoHandleFromDefaultPath(const std::string& dso_path, static inline void* GetDsoHandleFromSearchPath(const std::string& search_root, const std::string& dso_name, bool throw_on_error = true) { +#if !defined(_WIN32) int dynload_flags = RTLD_LAZY | RTLD_LOCAL; +#else + int dynload_flags = 0; +#endif // !_WIN32 void* dso_handle = nullptr; std::string dlPath = dso_name; @@ -117,10 +121,15 @@ static inline void* GetDsoHandleFromSearchPath(const std::string& search_root, // search xxx.so from custom path dlPath = join(search_root, dso_name); dso_handle = dlopen(dlPath.c_str(), dynload_flags); +#if !defined(_WIN32) + auto errorno = dlerror(); +#else + auto errorno = GetLastError(); +#endif // !_WIN32 // if not found, search from default path if (nullptr == dso_handle) { LOG(WARNING) << "Failed to find dynamic library: " << dlPath << " (" - << dlerror() << ")"; + << errorno << ")"; if (dlPath.find("nccl") != std::string::npos) { std::cout << "You may need to install 'nccl2' from NVIDIA official website: " @@ -139,10 +148,15 @@ static inline void* GetDsoHandleFromSearchPath(const std::string& search_root, "export LD_LIBRARY_PATH=... \n Note: After Mac OS 10.11, " "using the DYLD_LIBRARY_PATH is impossible unless System " "Integrity Protection (SIP) is disabled."; +#if !defined(_WIN32) + auto errorno = dlerror(); +#else + auto errorno = GetLastError(); +#endif // !_WIN32 if (throw_on_error) { - PADDLE_ENFORCE(nullptr != dso_handle, error_msg, dlPath, dlerror()); + PADDLE_ENFORCE(nullptr != dso_handle, error_msg, dlPath, errorno); } else if (nullptr == dso_handle) { - LOG(WARNING) << string::Sprintf(error_msg, dlPath, dlerror()); + LOG(WARNING) << string::Sprintf(error_msg, dlPath, errorno); } return dso_handle; diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc index 126636d879213b1c8f242db8fbdf6a358a1d2da9..f599e7fbc886a60394ae4690e4160275b55b8596 100644 --- a/paddle/fluid/platform/gpu_info.cc +++ b/paddle/fluid/platform/gpu_info.cc @@ -20,8 +20,11 @@ limitations under the License. */ #include "paddle/fluid/platform/enforce.h" DEFINE_double(fraction_of_gpu_memory_to_use, 0.92, - "Default use 92% of GPU memory for PaddlePaddle," - "reserve the rest for page tables, etc"); + "Allocate a trunk of gpu memory that is this fraction of the " + "total gpu memory size. Future memory usage will be allocated " + "from the trunk. If the trunk doesn't have enough gpu memory, " + "additional trunks of the same size will be requested from gpu " + "until the gpu has no memory left for another trunk."); namespace paddle { namespace platform { diff --git a/paddle/fluid/pybind/const_value.cc b/paddle/fluid/pybind/const_value.cc index 882e6332e8174b59eb6e19e788c8cced808d552c..1f61a0e289f32196ead04d71d07b513cbe4655b1 100644 --- a/paddle/fluid/pybind/const_value.cc +++ b/paddle/fluid/pybind/const_value.cc @@ -48,9 +48,6 @@ void BindConstValue(pybind11::module* m) { op_proto_and_checker_maker.def( "kOpNameScopeAttrName", framework::OpProtoAndCheckerMaker::OpNamescopeAttrName); - op_proto_and_checker_maker.def( - "kOpCreationCallstackAttrName", - framework::OpProtoAndCheckerMaker::OpCreationCallstackAttrName); } } // namespace pybind diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc index a5bc44122028c1191f511157bdde2e7c2d30c6aa..3b22718a8c6f994dbc2dc3e7aaa19a7163f716ba 100644 --- a/paddle/fluid/pybind/protobuf.cc +++ b/paddle/fluid/pybind/protobuf.cc @@ -214,7 +214,6 @@ void BindVarDsec(pybind11::module *m) { .def("set_shapes", &pd::VarDesc::SetShapes) .def("set_dtype", &pd::VarDesc::SetDataType) .def("set_dtypes", &pd::VarDesc::SetDataTypes) - .def("set_capacity", &pd::VarDesc::SetCapacity) .def("shape", &pd::VarDesc::GetShape, pybind11::return_value_policy::reference) .def("shapes", &pd::VarDesc::GetShapes, @@ -251,7 +250,6 @@ void BindVarDsec(pybind11::module *m) { .value("STEP_SCOPES", pd::proto::VarType::STEP_SCOPES) .value("LOD_RANK_TABLE", pd::proto::VarType::LOD_RANK_TABLE) .value("LOD_TENSOR_ARRAY", pd::proto::VarType::LOD_TENSOR_ARRAY) - .value("CHANNEL", pd::proto::VarType::CHANNEL) .value("PLACE_LIST", pd::proto::VarType::PLACE_LIST) .value("READER", pd::proto::VarType::READER) .value("RAW", pd::proto::VarType::RAW); diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index ef2f1f2a20a2eddee8ac077ee4bbf4dfd777448d..295af1c5837d70c32b522cc47c8c3e12d5bd61c7 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -21,7 +21,6 @@ limitations under the License. */ #include #include -#include "paddle/fluid/framework/channel.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/framework.pb.h" diff --git a/paddle/fluid/train/CMakeLists.txt b/paddle/fluid/train/CMakeLists.txt index 6cd9cbe379874e5ab7e40c1349e0483ff45bb63a..fae28fcb4c3102240438b62c203c65281f029192 100644 --- a/paddle/fluid/train/CMakeLists.txt +++ b/paddle/fluid/train/CMakeLists.txt @@ -4,7 +4,6 @@ function(train_test TARGET_NAME) set(multiValueArgs ARGS) cmake_parse_arguments(train_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests) set(arg_list "") if(train_test_ARGS) foreach(arg ${train_test_ARGS}) diff --git a/paddle/legacy/trainer/tests/CMakeLists.txt b/paddle/legacy/trainer/tests/CMakeLists.txt index 08548bea4c4a7fc4fa99d9305208abd4ee442572..fbefcced5643b65372072856bfeb6c87cd4071a8 100644 --- a/paddle/legacy/trainer/tests/CMakeLists.txt +++ b/paddle/legacy/trainer/tests/CMakeLists.txt @@ -16,7 +16,11 @@ endfunction() trainer_test(test_Compare) trainer_test(test_PyDataProviderWrapper) trainer_test(test_recurrent_machine_generation) -trainer_test(test_Trainer) +if(NOT APPLE) + trainer_test(test_Trainer) +else() + message(WARNING "These tests has been disabled in OSX for random fail: \n test_Trainer") +endif() ############### test_TrainerOnePass ########################## if(WITH_PYTHON) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 7d2fb7c6ce9e6a89df2c777323fc6a547fc227f4..d9214d0b8cef948dfca2ac9b1e3597fefc990786 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -395,10 +395,11 @@ EOF ctest --output-on-failure -j $1 # make install should also be test when unittest make install -j 8 - pip install /usr/local/opt/paddle/share/wheels/*.whl + pip install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]] ; then paddle version fi + pip uninstall -y paddlepaddle fi } @@ -654,11 +655,21 @@ function gen_fluid_inference_lib() { if [[ ${WITH_C_API:-OFF} == "OFF" && ${WITH_INFERENCE:-ON} == "ON" ]] ; then cat < with soft_label (bool): A flag to indicate whether to interpretate the given labels as soft labels. By default, `soft_label` is set to False. - ignore_index (int): Specifies a target value that is ignored and does - not contribute to the input gradient. Only valid + ignore_index (int): Specifies a target value that is ignored and does + not contribute to the input gradient. Only valid if soft_label is set to False. Default: -100 Returns: @@ -4601,14 +4685,14 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None): def squeeze(input, axes, name=None): """ - Remove single-dimensional entries from the shape of a tensor. Takes a - parameter axes with a list of axes to squeeze. If axes is not provided, all - the single dimensions will be removed from the shape. If an axis is + Remove single-dimensional entries from the shape of a tensor. Takes a + parameter axes with a list of axes to squeeze. If axes is not provided, all + the single dimensions will be removed from the shape. If an axis is selected with shape entry not equal to one, an error is raised. - + Examples: Case 1: - Given + Given X.shape = (1, 3, 1, 5) and axes = [0] @@ -4617,11 +4701,11 @@ def squeeze(input, axes, name=None): Case 2: Given X.shape = (1, 3, 1, 5) - and + and axes = [] we get: Out.shape = (3, 5) - + Args: input (Variable): The input variable to be squeezed. axes (list): List of integers, indicating the dimensions to be squeezed. @@ -4651,14 +4735,14 @@ def squeeze(input, axes, name=None): def unsqueeze(input, axes, name=None): """ - Insert single-dimensional entries to the shape of a tensor. Takes one - required argument axes, a list of dimensions that will be inserted. - Dimension indices in axes are as seen in the output tensor. + Insert single-dimensional entries to the shape of a tensor. Takes one + required argument axes, a list of dimensions that will be inserted. + Dimension indices in axes are as seen in the output tensor. - For example: - Given a tensor such that tensor with shape [3, 4, 5], + For example: + Given a tensor such that tensor with shape [3, 4, 5], then Unsqueezed tensor with axes=[0, 4] has shape [1, 3, 4, 5, 1]. - + Args: input (Variable): The input variable to be unsqueezed. axes (list): List of integers, indicating the dimensions to be inserted. @@ -5757,39 +5841,39 @@ def pad2d(input, Example: Given that X is a channel of image from input: - + X = [[1, 2, 3], [4, 5, 6]] - + Case 0: - + paddings = [0, 1, 2, 3], mode = 'constant' pad_value = 0 - + Out = [[0, 0, 1, 2, 3, 0, 0, 0] [0, 0, 4, 5, 6, 0, 0, 0] [0, 0, 0, 0, 0, 0, 0, 0]] - + Case 1: - + paddings = [0, 1, 2, 1], mode = 'reflect' - + Out = [[3, 2, 1, 2, 3, 2] [6, 5, 4, 5, 6, 5] [3, 2, 1, 2, 3, 2]] - + Case 2: - + paddings = [0, 1, 2, 1], mode = 'edge' - + Out = [[1, 1, 1, 2, 3, 3] [4, 4, 4, 5, 6, 6] [4, 4, 4, 5, 6, 6]] - - + + Args: input (Variable): The input image with [N, C, H, W] format or [N, H, W, C] format. paddings (tuple|list): The padding size. If padding is a tuple, it must @@ -5988,7 +6072,7 @@ def prelu(x, mode, param_attr=None, name=None): channel:elements in a channel share same weight element:each element has a weight name(str|None): A name for this layer(optional). If set None, the layer - will be named automatically. + will be named automatically. Returns: Variable: The output tensor with the same shape as input. @@ -6166,10 +6250,10 @@ def flatten(x, axis=1, name=None): def sequence_enumerate(input, win_size, pad_value=0, name=None): """ Generate a new sequence for the input index sequence, which enumerates all the - sub-sequences with length `win_size` of the input. + sub-sequences with length `win_size` of the input. The enumerated sequence has the same 1st dimension with variable `input`, and the 2nd dimension is `win_size`, padded by `pad_value` if necessary in generation. - + Examples: Case 1: Input: @@ -6296,20 +6380,20 @@ def unstack(x, axis=0, num=None): **UnStack Layer** This layer unstacks input :code:`x` into several tensors along axis. - + If :code:`axis` < 0, it would be replaced with :code:`axis+rank(x)`. If :code:`num` is None, it would be inferred from :code:`x.shape[axis]`, and if :code:`x.shape[axis]` <= 0 or is unknown, :code:`ValueError` is - raised. + raised. Args: - x (Variable): Input variable. + x (Variable): Input variable. axis (int): The axis along which the input is unstacked. num (int|None): The number of output variables. - + Returns: list(Variable): The unstacked variables. - + """ helper = LayerHelper('unstack', **locals()) @@ -6342,21 +6426,21 @@ def expand(x, expand_times, name=None): .. code-block:: text Input(X) is a 3-D tensor with shape [2, 3, 1]: - + [ [[1], [2], [3]], [[4], [5], [6]] ] - + Attr(expand_times): [1, 2, 2] - + Output(Out) is a 3-D tensor with shape [2, 6, 2]: - + [ [[1, 1], [2, 2], [3, 3], [1, 1], [2, 2], [3, 3]], [[4, 4], [5, 5], [6, 6], [4, 4], [5, 5], [6, 6]] ] - + Args: x (Variable): A tensor with rank in [1, 6]. expand_times (list|tuple): Expand times number for each dimension. @@ -6432,12 +6516,7 @@ def uniform_random_batch_size_like(input, @templatedoc() -def gaussian_random(shape, - mean=0.0, - std=1.0, - seed=0, - dtype='float32', - use_mkldnn=False): +def gaussian_random(shape, mean=0.0, std=1.0, seed=0, dtype='float32'): """ ${comment} @@ -6447,7 +6526,6 @@ def gaussian_random(shape, std (Float): ${std_comment} seed (Int): ${seed_comment} dtype(np.dtype|core.VarDesc.VarType|str): Output data type. - use_mkldnn (Bool): Only used in mkldnn kernel. Returns: out (Variable): ${out_comment} @@ -6466,7 +6544,7 @@ def gaussian_random(shape, 'std': std, 'seed': seed, 'dtype': c_dtype, - 'use_mkldnn': use_mkldnn + 'use_mkldnn': False }) return out @@ -6549,13 +6627,12 @@ def gaussian_random_batch_size_like(input, @templatedoc() -def sum(x, use_mkldnn=False): +def sum(x): """ ${comment} Args: x (Variable): ${x_comment} - use_mkldnn (Bool): ${use_mkldnn_comment} Returns: out (Variable): ${out_comment} @@ -6567,7 +6644,7 @@ def sum(x, use_mkldnn=False): type='sum', inputs={'X': x}, outputs={'Out': out}, - attrs={'use_mkldnn': use_mkldnn}) + attrs={'use_mkldnn': False}) return out @@ -6630,14 +6707,12 @@ def _elementwise_op(helper): assert y is not None, 'y cannot be None in {}'.format(op_type) axis = helper.kwargs.get('axis', -1) use_mkldnn = helper.kwargs.get('use_mkldnn', False) - out = helper.kwargs.get('out', None) - if out is None: - name = helper.kwargs.get('name', None) - if name is None: - out = helper.create_tmp_variable(dtype=x.dtype) - else: - out = helper.create_variable( - name=name, dtype=x.dtype, persistable=False) + name = helper.kwargs.get('name', None) + if name is None: + out = helper.create_tmp_variable(dtype=x.dtype) + else: + out = helper.create_variable( + name=name, dtype=x.dtype, persistable=False) helper.append_op( type=op_type, @@ -6650,13 +6725,7 @@ def _elementwise_op(helper): @templatedoc() -def scale(x, - scale=1.0, - bias=0.0, - bias_after_scale=True, - out=None, - act=None, - name=None): +def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None): """ ${comment} @@ -6665,21 +6734,19 @@ def scale(x, scale(${scale_type}): ${scale_comment} bias(${bias_type}): ${bias_comment} bias_after_scale(${bias_after_scale_type}): ${bias_after_scale_comment} - out(Tensor): Output tensor. act(basestring|None): Activation applied to the output. - name(basestring|None): Name of the output. + name(basestring|None): Name of the output. Returns: out(${out_type}): ${out_comment} """ helper = LayerHelper('scale', **locals()) - if out is None: - if name is None: - out = helper.create_tmp_variable(dtype=x.dtype) - else: - out = helper.create_variable( - name=name, dtype=x.dtype, persistable=False) + if name is None: + out = helper.create_tmp_variable(dtype=x.dtype) + else: + out = helper.create_variable( + name=name, dtype=x.dtype, persistable=False) helper.append_op( type='scale', @@ -6693,73 +6760,31 @@ def scale(x, return helper.append_activation(out) -def elementwise_add(x, - y, - out=None, - axis=-1, - use_mkldnn=False, - act=None, - name=None): +def elementwise_add(x, y, axis=-1, act=None, name=None): return _elementwise_op(LayerHelper('elementwise_add', **locals())) -def elementwise_div(x, - y, - out=None, - axis=-1, - use_mkldnn=False, - act=None, - name=None): +def elementwise_div(x, y, axis=-1, act=None, name=None): return _elementwise_op(LayerHelper('elementwise_div', **locals())) -def elementwise_sub(x, - y, - out=None, - axis=-1, - use_mkldnn=False, - act=None, - name=None): +def elementwise_sub(x, y, axis=-1, act=None, name=None): return _elementwise_op(LayerHelper('elementwise_sub', **locals())) -def elementwise_mul(x, - y, - out=None, - axis=-1, - use_mkldnn=False, - act=None, - name=None): +def elementwise_mul(x, y, axis=-1, act=None, name=None): return _elementwise_op(LayerHelper('elementwise_mul', **locals())) -def elementwise_max(x, - y, - out=None, - axis=-1, - use_mkldnn=False, - act=None, - name=None): +def elementwise_max(x, y, axis=-1, act=None, name=None): return _elementwise_op(LayerHelper('elementwise_max', **locals())) -def elementwise_min(x, - y, - out=None, - axis=-1, - use_mkldnn=False, - act=None, - name=None): +def elementwise_min(x, y, axis=-1, act=None, name=None): return _elementwise_op(LayerHelper('elementwise_min', **locals())) -def elementwise_pow(x, - y, - out=None, - axis=-1, - use_mkldnn=False, - act=None, - name=None): +def elementwise_pow(x, y, axis=-1, act=None, name=None): return _elementwise_op(LayerHelper('elementwise_pow', **locals())) @@ -6771,7 +6796,291 @@ for func in [ func.__doc__ = _generate_doc_string_( op_proto, additional_args_lines=[ - "out (Tensor): The output tensor of elementwise op.", "act (basestring|None): Activation applied to the output.", "name (basestring|None): Name of the output." ]) + + +def _logical_op(op_name, x, y, out=None, name=None, binary_op=True): + helper = LayerHelper(op_name, **locals()) + + if binary_op: + assert x.dtype == y.dtype + + if out is None: + if name is None: + out = helper.create_tmp_variable(dtype=x.dtype) + else: + out = helper.create_variable( + name=name, dtype=x.dtype, persistable=False) + + if binary_op: + helper.append_op( + type=op_name, inputs={"X": x, + "Y": y}, outputs={"Out": out}) + else: + helper.append_op(type=op_name, inputs={"X": x}, outputs={"Out": out}) + + return out + + +@templatedoc() +def logical_and(x, y, out=None, name=None): + """ + ${comment} + + Args: + x(${x_type}): ${x_comment} + y(${y_type}): ${y_comment} + out(Tensor): Output tensor of logical operation. + name(basestring|None): Name of the output. + + Returns: + out(${out_type}): ${out_comment} + """ + + return _logical_op( + op_name="logical_and", x=x, y=y, name=name, out=out, binary_op=True) + + +@templatedoc() +def logical_or(x, y, out=None, name=None): + """ + ${comment} + + Args: + x(${x_type}): ${x_comment} + y(${y_type}): ${y_comment} + out(Tensor): Output tensor of logical operation. + name(basestring|None): Name of the output. + + Returns: + out(${out_type}): ${out_comment} + """ + + return _logical_op( + op_name="logical_or", x=x, y=y, name=name, out=out, binary_op=True) + + +@templatedoc() +def logical_xor(x, y, out=None, name=None): + """ + ${comment} + + Args: + x(${x_type}): ${x_comment} + y(${y_type}): ${y_comment} + out(Tensor): Output tensor of logical operation. + name(basestring|None): Name of the output. + + Returns: + out(${out_type}): ${out_comment} + """ + + return _logical_op( + op_name="logical_xor", x=x, y=y, name=name, out=out, binary_op=True) + + +@templatedoc() +def logical_not(x, out=None, name=None): + """ + ${comment} + + Args: + x(${x_type}): ${x_comment} + out(Tensor): Output tensor of logical operation. + name(basestring|None): Name of the output. + + Returns: + out(${out_type}): ${out_comment} + """ + + return _logical_op( + op_name="logical_not", x=x, y=None, name=name, out=out, binary_op=False) + + +@templatedoc() +def clip(x, min, max, name=None): + """ + ${comment} + + Args: + x(${x_type}): ${x_comment} + min(${min_type}): ${min_comment} + max(${max_type}): ${max_comment} + name(basestring|None): Name of the output. + + Returns: + out(${out_type}): ${out_comment} + """ + + helper = LayerHelper("clip", **locals()) + + if name is None: + out = helper.create_tmp_variable(dtype=x.dtype) + else: + out = helper.create_variable( + name=name, dtype=x.dtype, persistable=False) + + helper.append_op( + type="clip", + inputs={"X": x}, + attrs={"min": min, + "max": max}, + outputs={"Out": out}) + + return out + + +@templatedoc() +def clip_by_norm(x, max_norm, name=None): + """ + ${comment} + + Args: + x(${x_type}): ${x_comment} + max_norm(${max_norm_type}): ${max_norm_comment} + name(basestring|None): Name of the output. + + Returns: + out(${out_type}): ${out_comment} + """ + + helper = LayerHelper("clip_by_norm", **locals()) + + if name is None: + out = helper.create_tmp_variable(dtype=x.dtype) + else: + out = helper.create_variable( + name=name, dtype=x.dtype, persistable=False) + + helper.append_op( + type="clip_by_norm", + inputs={"X": x}, + attrs={"max_norm": max_norm}, + outputs={"Out": out}) + + return out + + +@templatedoc() +def mean(x, name=None): + """ + ${comment} + + Args: + x(${x_type}): ${x_comment} + name(basestring|None): Name of the output. + + Returns: + out(${out_type}): ${out_comment} + """ + + helper = LayerHelper("mean", **locals()) + + if name is None: + out = helper.create_tmp_variable(dtype=x.dtype) + else: + out = helper.create_variable( + name=name, dtype=x.dtype, persistable=False) + + helper.append_op( + type="mean", inputs={"X": x}, attrs={}, outputs={"Out": out}) + + return out + + +@templatedoc() +def mul(x, y, x_num_col_dims=1, y_num_col_dims=1, name=None): + """ + ${comment} + + Args: + x(${x_type}): ${x_comment} + y(${y_type}): ${y_comment} + x_num_col_dims(${x_num_col_dims_type}): ${x_num_col_dims_comment} + y_num_col_dims(${y_num_col_dims_type}): ${y_num_col_dims_comment} + name(basestring|None): Name of the output. + + Returns: + out(${out_type}): ${out_comment} + """ + + helper = LayerHelper("mul", **locals()) + + if name is None: + out = helper.create_tmp_variable(dtype=x.dtype) + else: + out = helper.create_variable( + name=name, dtype=x.dtype, persistable=False) + + helper.append_op( + type="mul", + inputs={"X": x, + "Y": y}, + attrs={ + "x_num_col_dims": x_num_col_dims, + "y_num_col_dims": y_num_col_dims + }, + outputs={"Out": out}) + return out + + +@templatedoc() +def sigmoid_cross_entropy_with_logits(x, label, name=None): + """ + ${comment} + + Args: + x(${x_type}): ${x_comment} + label(${label_type}): ${label_comment} + name(basestring|None): Name of the output. + + Returns: + out(${out_type}): ${out_comment} + """ + + helper = LayerHelper("sigmoid_cross_entropy_with_logits", **locals()) + + if name is None: + out = helper.create_tmp_variable(dtype=x.dtype) + else: + out = helper.create_variable( + name=name, dtype=x.dtype, persistable=False) + + helper.append_op( + type="sigmoid_cross_entropy_with_logits", + inputs={"X": x, + "Label": label}, + attrs={}, + outputs={"Out": out}) + return out + + +@templatedoc() +def maxout(x, groups, name=None): + """ + ${comment} + + Args: + x(${x_type}): ${x_comment} + groups(${groups_type}): ${groups_comment} + name(basestring|None): Name of the output. + + Returns: + out(${out_type}): ${out_comment} + """ + helper = LayerHelper("maxout", **locals()) + + if name is None: + out = helper.create_tmp_variable(dtype=x.dtype) + else: + out = helper.create_variable( + name=name, dtype=x.dtype, persistable=False) + + helper.append_op( + type="maxout", + inputs={"X": x}, + attrs={"groups": groups}, + outputs={"Out": out}) + return out diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py index 220d065f8f1cc02508dea2679820e1f7f490866d..9a8300524d8784fae598635796888382b1adbccf 100644 --- a/python/paddle/fluid/layers/ops.py +++ b/python/paddle/fluid/layers/ops.py @@ -35,18 +35,7 @@ __activations_noattr__ = [ 'softsign', ] -__all__ = [ - 'mean', - 'mul', - 'sigmoid_cross_entropy_with_logits', - 'clip', - 'clip_by_norm', - 'logical_and', - 'logical_or', - 'logical_xor', - 'logical_not', - 'maxout', -] +__all__ = [] for _OP in set(__all__): globals()[_OP] = generate_layer_fn(_OP) diff --git a/python/paddle/fluid/nets.py b/python/paddle/fluid/nets.py index 06513801dd8b34d366f9632f6943c8046872c31b..1dabad54f5b976e0fcabf6918d3bc6ece4eed384 100644 --- a/python/paddle/fluid/nets.py +++ b/python/paddle/fluid/nets.py @@ -40,8 +40,7 @@ def simple_img_conv_pool(input, param_attr=None, bias_attr=None, act=None, - use_cudnn=True, - use_mkldnn=False): + use_cudnn=True): """ The simple_img_conv_pool is composed with one Convolution2d and one Pool2d. @@ -84,8 +83,6 @@ def simple_img_conv_pool(input, act (str): Activation type for Conv2d. Default: None use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn library is installed. Default: True - use_mkldnn (bool): Use mkldnn kernels or not, it is valid only when compiled - with mkldnn library. Default: False Return: Variable: The result of input after Convolution2d and Pool2d. @@ -112,8 +109,7 @@ def simple_img_conv_pool(input, param_attr=param_attr, bias_attr=bias_attr, act=act, - use_cudnn=use_cudnn, - use_mkldnn=use_mkldnn) + use_cudnn=use_cudnn) pool_out = layers.pool2d( input=conv_out, @@ -122,8 +118,7 @@ def simple_img_conv_pool(input, pool_stride=pool_stride, pool_padding=pool_padding, global_pooling=global_pooling, - use_cudnn=use_cudnn, - use_mkldnn=use_mkldnn) + use_cudnn=use_cudnn) return pool_out @@ -138,8 +133,7 @@ def img_conv_group(input, conv_batchnorm_drop_rate=0.0, pool_stride=1, pool_type="max", - use_cudnn=True, - use_mkldnn=False): + use_cudnn=True): """ The Image Convolution Group is composed of Convolution2d, BatchNorm, DropOut, and Pool2d. According to the input arguments, img_conv_group will do serials of @@ -177,8 +171,6 @@ def img_conv_group(input, average-pooling. Default :math:`max`. use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn library is installed. Default: True - use_mkldnn (bool): Use mkldnn kernels or not, it is valid only when compiled - with mkldnn library. Default: False Return: Variable: The final result after serial computation using Convolution2d, @@ -226,8 +218,7 @@ def img_conv_group(input, padding=conv_padding[i], param_attr=param_attr[i], act=local_conv_act, - use_cudnn=use_cudnn, - use_mkldnn=use_mkldnn) + use_cudnn=use_cudnn) if conv_with_batchnorm[i]: tmp = layers.batch_norm(input=tmp, act=conv_act, in_place=True) @@ -240,8 +231,7 @@ def img_conv_group(input, pool_size=pool_size, pool_type=pool_type, pool_stride=pool_stride, - use_cudnn=use_cudnn, - use_mkldnn=use_mkldnn) + use_cudnn=use_cudnn) return pool_out diff --git a/python/paddle/fluid/tests/CMakeLists.txt b/python/paddle/fluid/tests/CMakeLists.txt index d24417bbacb503d9ea70e68e7e0edb59e7dddbde..1885dda44ab5eaeca6a4f54e4b84379c71ec3167 100644 --- a/python/paddle/fluid/tests/CMakeLists.txt +++ b/python/paddle/fluid/tests/CMakeLists.txt @@ -1,3 +1,4 @@ +set(PYTHON_TESTS_DIR ${CMAKE_CURRENT_BINARY_DIR} CACHE PATH "python tests directory") file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") diff --git a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/CMakeLists.txt index 673c965b662a022739f8d489c331f4de9455a926..ad056aaa7b30b06d950486fd059c5b6a15770551 100644 --- a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/CMakeLists.txt +++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/CMakeLists.txt @@ -2,6 +2,16 @@ file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") # default test -foreach(src ${TEST_OPS}) - py_test(${src} SRCS ${src}.py) -endforeach() +if(NOT APPLE) + foreach(src ${TEST_OPS}) + py_test(${src} SRCS ${src}.py) + endforeach() +else() + foreach(src ${TEST_OPS}) + if(${src} STREQUAL "test_recognize_digits_conv") + message(WARNING "These tests has been disabled in OSX for random fail: \n" ${src}) + else() + py_test(${src} SRCS ${src}.py) + endif() + endforeach() +endif() diff --git a/python/paddle/fluid/tests/no_test_concurrency.py b/python/paddle/fluid/tests/no_test_concurrency.py deleted file mode 100644 index b5d7676f4a2cb085c6900cd0bd0644afa2b2afd5..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/no_test_concurrency.py +++ /dev/null @@ -1,260 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import unittest -import paddle.fluid as fluid -import paddle.fluid.core as core -from paddle.fluid import framework, unique_name, layer_helper -from paddle.fluid.executor import Executor -from paddle.fluid.layers import fill_constant, assign, While, elementwise_add, Print - - -class TestRoutineOp(unittest.TestCase): - def test_simple_routine(self): - ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR) - - # Create LOD_TENSOR and put it into the scope. This placeholder - # variable will be filled in and returned by fluid.channel_recv - result = self._create_tensor('return_value', - core.VarDesc.VarType.LOD_TENSOR, - core.VarDesc.VarType.INT64) - - with fluid.Go(): - input_value = fill_constant( - shape=[1], dtype=core.VarDesc.VarType.FP64, value=1234) - fluid.channel_send(ch, input_value) - - result, status = fluid.channel_recv(ch, result) - fluid.channel_close(ch) - - cpu = core.CPUPlace() - exe = Executor(cpu) - - outs = exe.run(fetch_list=[result]) - self.assertEqual(outs[0], 1234) - - def test_daisy_chain(self): - ''' - Mimics classic Daisy-chain test: https://talks.golang.org/2012/concurrency.slide#39 - ''' - n = 100 - - leftmost = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR) - left = leftmost - - # TODO(thuan): Use fluid.While() after scope capture is implemented. - # https://github.com/PaddlePaddle/Paddle/issues/8502 - for i in range(n): - right = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR) - with fluid.Go(): - one_tensor = self._create_one_dim_tensor(1) - result = self._create_tensor('return_value', - core.VarDesc.VarType.LOD_TENSOR, - core.VarDesc.VarType.INT64) - - result, status = fluid.channel_recv(right, result) - one_added = fluid.layers.elementwise_add(x=one_tensor, y=result) - fluid.channel_send(left, one_added) - left = right - - # Trigger the channel propagation by sending a "1" to rightmost channel - with fluid.Go(): - one_tensor = self._create_one_dim_tensor(1) - fluid.channel_send(right, one_tensor) - - leftmost_result = self._create_tensor('return_value', - core.VarDesc.VarType.LOD_TENSOR, - core.VarDesc.VarType.INT64) - leftmost_result, status = fluid.channel_recv(leftmost, leftmost_result) - - cpu = core.CPUPlace() - exe = Executor(cpu) - leftmost_data = exe.run(fetch_list=[leftmost_result]) - - # The leftmost_data should be equal to the number of channels + 1 - self.assertEqual(leftmost_data[0][0], n + 1) - - def _create_one_dim_tensor(self, value): - one_dim_tensor = fill_constant(shape=[1], dtype='int', value=value) - one_dim_tensor.stop_gradient = True - return one_dim_tensor - - def _create_tensor(self, name, type, dtype): - return framework.default_main_program().current_block().create_var( - name=unique_name.generate(name), type=type, dtype=dtype) - - def _create_persistable_tensor(self, name, type, dtype): - return framework.default_main_program().current_block().create_var( - name=unique_name.generate(name), - type=type, - dtype=dtype, - persistable=True) - - def test_select(self): - with framework.program_guard(framework.Program()): - ch1 = fluid.make_channel( - dtype=core.VarDesc.VarType.LOD_TENSOR, capacity=1) - - result1 = self._create_tensor('return_value', - core.VarDesc.VarType.LOD_TENSOR, - core.VarDesc.VarType.FP64) - - input_value = fill_constant( - shape=[1], dtype=core.VarDesc.VarType.FP64, value=10) - - with fluid.Select() as select: - with select.case(fluid.channel_send, ch1, input_value): - # Execute something. - pass - - with select.default(): - pass - - # This should not block because we are using a buffered channel. - result1, status = fluid.channel_recv(ch1, result1) - fluid.channel_close(ch1) - - cpu = core.CPUPlace() - exe = Executor(cpu) - - result = exe.run(fetch_list=[result1]) - self.assertEqual(result[0][0], 10) - - def test_fibonacci(self): - """ - Mimics Fibonacci Go example: https://tour.golang.org/concurrency/5 - """ - with framework.program_guard(framework.Program()): - quit_ch_input_var = self._create_persistable_tensor( - 'quit_ch_input', core.VarDesc.VarType.LOD_TENSOR, - core.VarDesc.VarType.INT32) - quit_ch_input = fill_constant( - shape=[1], - dtype=core.VarDesc.VarType.INT32, - value=0, - out=quit_ch_input_var) - - result = self._create_persistable_tensor( - 'result', core.VarDesc.VarType.LOD_TENSOR, - core.VarDesc.VarType.INT32) - fill_constant( - shape=[1], - dtype=core.VarDesc.VarType.INT32, - value=0, - out=result) - - x = fill_constant( - shape=[1], dtype=core.VarDesc.VarType.INT32, value=0) - y = fill_constant( - shape=[1], dtype=core.VarDesc.VarType.INT32, value=1) - - while_cond = fill_constant( - shape=[1], dtype=core.VarDesc.VarType.BOOL, value=True) - - while_false = fill_constant( - shape=[1], dtype=core.VarDesc.VarType.BOOL, value=False) - - x_tmp = fill_constant( - shape=[1], dtype=core.VarDesc.VarType.INT32, value=0) - - def fibonacci(channel, quit_channel): - while_op = While(cond=while_cond) - with while_op.block(): - result2 = fill_constant( - shape=[1], dtype=core.VarDesc.VarType.INT32, value=0) - - with fluid.Select() as select: - with select.case( - fluid.channel_send, channel, x, is_copy=True): - assign(input=x, output=x_tmp) - assign(input=y, output=x) - assign(elementwise_add(x=x_tmp, y=y), output=y) - - with select.case(fluid.channel_recv, quit_channel, - result2): - # Quit - helper = layer_helper.LayerHelper('assign') - helper.append_op( - type='assign', - inputs={'X': [while_false]}, - outputs={'Out': [while_cond]}) - - ch1 = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR) - quit_ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR) - - with fluid.Go(): - for i in range(10): - fluid.channel_recv(ch1, result) - Print(result) - - fluid.channel_send(quit_ch, quit_ch_input) - - fibonacci(ch1, quit_ch) - - fluid.channel_close(ch1) - fluid.channel_close(quit_ch) - - cpu = core.CPUPlace() - exe = Executor(cpu) - - exe_result = exe.run(fetch_list=[result]) - self.assertEqual(exe_result[0][0], 34) - - def test_ping_pong(self): - """ - Mimics Ping Pong example: https://gobyexample.com/channel-directions - """ - with framework.program_guard(framework.Program()): - result = self._create_tensor('return_value', - core.VarDesc.VarType.LOD_TENSOR, - core.VarDesc.VarType.FP64) - - ping_result = self._create_tensor('ping_return_value', - core.VarDesc.VarType.LOD_TENSOR, - core.VarDesc.VarType.FP64) - - def ping(ch, message): - fluid.channel_send(ch, message, is_copy=True) - - def pong(ch1, ch2): - fluid.channel_recv(ch1, ping_result) - fluid.channel_send(ch2, ping_result, is_copy=True) - - pings = fluid.make_channel( - dtype=core.VarDesc.VarType.LOD_TENSOR, capacity=1) - pongs = fluid.make_channel( - dtype=core.VarDesc.VarType.LOD_TENSOR, capacity=1) - - msg = fill_constant( - shape=[1], dtype=core.VarDesc.VarType.FP64, value=9) - - ping(pings, msg) - pong(pings, pongs) - - fluid.channel_recv(pongs, result) - - fluid.channel_close(pings) - fluid.channel_close(pongs) - - cpu = core.CPUPlace() - exe = Executor(cpu) - - exe_result = exe.run(fetch_list=[result]) - self.assertEqual(exe_result[0][0], 9) - - -if __name__ == '__main__': - unittest.main() diff --git a/python/paddle/fluid/tests/notest_concurrency.py b/python/paddle/fluid/tests/notest_concurrency.py deleted file mode 100644 index fd9da4cce0ea51c53b4b01e7c3dc2a2ed1eeb089..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/notest_concurrency.py +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import unittest -import paddle.fluid as fluid -import paddle.fluid.core as core -from paddle.fluid.executor import Executor - - -class TestRoutineOp(unittest.TestCase): - def test_simple_routine(self): - ch = fluid.make_channel( - dtype=core.VarDesc.VarType.BOOL, name="CreateChannel") - with fluid.Go(): - fluid.channel_send(ch, True) - - result = fluid.channel_recv(ch) - fluid.channel_close(ch) - - cpu = core.CPUPlace() - exe = Executor(cpu) - - outs = exe.run(fetch_list=[result]) - self.assertEqual(outs[0], True) - - -if __name__ == '__main__': - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 723f9eb9c978755b77724100c266be199e0f301a..7de0ebce06e9de439d3570bee9ac7dbce33ee868 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -76,11 +76,13 @@ if(WITH_DISTRIBUTE) if(NOT APPLE) set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 200) set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 200) - py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext SERIAL) + py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext) + set_tests_properties(test_dist_se_resnext PROPERTIES TIMEOUT 1000) + # TODO: fix this test + #py_test_modules(test_dist_transformer MODULES test_dist_transformer) + #set_tests_properties(test_dist_transformer PROPERTIES TIMEOUT 1000) endif(NOT APPLE) py_test_modules(test_dist_transpiler MODULES test_dist_transpiler) - #FIXME(gongwb): random fails. - #py_test_modules(test_dist_transformer MODULES test_dist_transformer SERIAL) endif() py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL) py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL) diff --git a/python/paddle/fluid/tests/unittests/dist_se_resnext.py b/python/paddle/fluid/tests/unittests/dist_se_resnext.py index a4ffe7d40c40501ebd43fec0b664159227ea34bd..5da370570680e9f10a22ad882e3346e6381dfe63 100644 --- a/python/paddle/fluid/tests/unittests/dist_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/dist_se_resnext.py @@ -247,7 +247,7 @@ class DistSeResneXt2x2(TestDistRunnerBase): # Reader train_reader = paddle.batch( - paddle.dataset.flowers.train(), batch_size=batch_size) + paddle.dataset.flowers.test(use_xmap=False), batch_size=batch_size) test_reader = paddle.batch( paddle.dataset.flowers.test(use_xmap=False), batch_size=batch_size) diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_op.py index 6a2732e9399aa5a93f4c47eb73bfd23dba608c3d..2ecc2504a8c9c5ecfc32cee96df9e368ff219cbb 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py @@ -67,6 +67,7 @@ class TestConv2dOp(OpTest): def setUp(self): self.op_type = "conv2d" self.use_cudnn = False + self.use_cuda = False self.use_mkldnn = False self.data_format = "AnyLayout" self.dtype = np.float32 @@ -101,24 +102,25 @@ class TestConv2dOp(OpTest): } self.outputs = {'Output': output} - def testcudnn(self): - return core.is_compiled_with_cuda() and self.use_cudnn + def testcuda(self): + return core.is_compiled_with_cuda() and (self.use_cudnn or + self.use_cuda) def test_check_output(self): - place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace() + place = core.CUDAPlace(0) if self.testcuda() else core.CPUPlace() self.check_output_with_place(place, atol=1e-5) def test_check_grad(self): if self.dtype == np.float16: return - place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace() + place = core.CUDAPlace(0) if self.testcuda() else core.CPUPlace() self.check_grad_with_place( place, set(['Input', 'Filter']), 'Output', max_relative_error=0.02) def test_check_grad_no_filter(self): if self.dtype == np.float16: return - place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace() + place = core.CUDAPlace(0) if self.testcuda() else core.CPUPlace() self.check_grad_with_place( place, ['Input'], 'Output', @@ -128,7 +130,7 @@ class TestConv2dOp(OpTest): def test_check_grad_no_input(self): if self.dtype == np.float16: return - place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace() + place = core.CUDAPlace(0) if self.testcuda() else core.CPUPlace() self.check_grad_with_place( place, ['Filter'], 'Output', @@ -325,18 +327,33 @@ class TestFP16CUDNNWithInput1x1Filter1x1(TestWithInput1x1Filter1x1): class TestDepthwiseConv(TestConv2dOp): def init_test_case(self): + self.use_cuda = True self.pad = [1, 1] self.stride = [2, 2] self.input_size = [2, 3, 5, 5] # NCHW self.groups = 3 assert np.mod(self.input_size[1], self.groups) == 0 f_c = self.input_size[1] // self.groups - self.filter_size = [6, f_c, 3, 3] + self.filter_size = [3, f_c, 3, 3] self.op_type = "depthwise_conv2d" class TestDepthwiseConv2(TestConv2dOp): def init_test_case(self): + self.use_cuda = True + self.pad = [1, 1] + self.stride = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + self.groups = 3 + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [3, f_c, 3, 3] + self.op_type = "depthwise_conv2d" + + +class TestDepthwiseConv3(TestConv2dOp): + def init_test_case(self): + self.use_cuda = True self.pad = [1, 1] self.stride = [1, 1] self.input_size = [2, 3, 5, 5] # NCHW @@ -347,6 +364,34 @@ class TestDepthwiseConv2(TestConv2dOp): self.op_type = "depthwise_conv2d" +class TestDepthwiseConvWithDilation(TestConv2dOp): + def init_test_case(self): + self.use_cuda = True + self.pad = [1, 1] + self.stride = [2, 2] + self.input_size = [2, 3, 5, 5] # NCHW + self.groups = 3 + self.dilations = [2, 2] + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 3, 3] + self.op_type = "depthwise_conv2d" + + +class TestDepthwiseConvWithDilation2(TestConv2dOp): + def init_test_case(self): + self.use_cuda = True + self.pad = [1, 1] + self.stride = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + self.groups = 3 + self.dilations = [2, 2] + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 3, 3] + self.op_type = "depthwise_conv2d" + + # Please Don't remove the following code. # Currently, CI use cudnn V5.0 which not support dilation conv. # class TestCUDNNWithDilation(TestWithDilation): diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index 0b9af6d7f6d5eb2ba81c04a51169127bbdba1b1a..04924bec057e301bfb342a62bb4c1e0b3c3aff4c 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -164,6 +164,17 @@ class TestDistBase(unittest.TestCase): def _setup_config(self): raise NotImplementedError("tests should have _setup_config implemented") + def _after_setup_config(self): + if self._enforce_place == "CPU": + self.__use_cuda = False + elif self._enforce_place == "GPU": + self.__use_cuda = True + else: + if fluid.core.is_compiled_with_cuda(): + self.__use_cuda = True + else: + self.__use_cuda = False + def setUp(self): self._trainers = 2 self._pservers = 2 @@ -171,11 +182,12 @@ class TestDistBase(unittest.TestCase): self._find_free_port(), self._find_free_port()) self._python_interp = "python" self._sync_mode = True - self._use_cuda = True + self._enforce_place = None self._mem_opt = False self._use_reduce = False self._use_reader_alloc = True self._setup_config() + self._after_setup_config() def _find_free_port(self): with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s: @@ -199,13 +211,10 @@ class TestDistBase(unittest.TestCase): ps0_cmd += " --mem_opt" ps1_cmd += " --mem_opt" - ps0_pipe = subprocess.PIPE - ps1_pipe = subprocess.PIPE - if check_error_log: - print(ps0_cmd) - print(ps1_cmd) - ps0_pipe = open("/tmp/ps0_err.log", "wb") - ps1_pipe = open("/tmp/ps1_err.log", "wb") + print(ps0_cmd) + print(ps1_cmd) + ps0_pipe = open("/tmp/ps0_err.log", "wb") + ps1_pipe = open("/tmp/ps1_err.log", "wb") ps0_proc = subprocess.Popen( ps0_cmd.strip().split(" "), @@ -218,10 +227,7 @@ class TestDistBase(unittest.TestCase): stderr=ps1_pipe, env=required_envs) - if not check_error_log: - return ps0_proc, ps1_proc, None, None - else: - return ps0_proc, ps1_proc, ps0_pipe, ps1_pipe + return ps0_proc, ps1_proc, ps0_pipe, ps1_pipe def _wait_ps_ready(self, pid): retry_times = 50 @@ -242,7 +248,7 @@ class TestDistBase(unittest.TestCase): cmd = "%s %s --role trainer" % (self._python_interp, model) - if self._use_cuda: + if self.__use_cuda: cmd += " --use_cuda" env_local = {"CUDA_VISIBLE_DEVICES": "0"} else: @@ -250,7 +256,7 @@ class TestDistBase(unittest.TestCase): envs.update(env_local) - if not check_error_log: + if check_error_log: err_log = open("/tmp/trainer.err.log", "wb") local_proc = subprocess.Popen( cmd.split(" "), @@ -264,7 +270,6 @@ class TestDistBase(unittest.TestCase): stderr=subprocess.PIPE, env=envs) - local_proc.wait() local_out, local_err = local_proc.communicate() local_ret = cpt.to_text(local_out) @@ -305,7 +310,7 @@ class TestDistBase(unittest.TestCase): if self._use_reader_alloc: tr0_cmd += " --use_reader_alloc" tr1_cmd += " --use_reader_alloc" - if self._use_cuda: + if self.__use_cuda: tr0_cmd += " --use_cuda" tr1_cmd += " --use_cuda" env0 = {"CUDA_VISIBLE_DEVICES": "0"} @@ -317,15 +322,10 @@ class TestDistBase(unittest.TestCase): env0.update(envs) env1.update(envs) - FNULL = open(os.devnull, 'w') - - tr0_pipe = subprocess.PIPE - tr1_pipe = subprocess.PIPE - if check_error_log: - print("tr0_cmd:{}, env0: {}".format(tr0_cmd, env0)) - print("tr1_cmd:{}, env1: {}".format(tr1_cmd, env1)) - tr0_pipe = open("/tmp/tr0_err.log", "wb") - tr1_pipe = open("/tmp/tr1_err.log", "wb") + print("tr0_cmd:{}, env0: {}".format(tr0_cmd, env0)) + print("tr1_cmd:{}, env1: {}".format(tr1_cmd, env1)) + tr0_pipe = open("/tmp/tr0_err.log", "wb") + tr1_pipe = open("/tmp/tr1_err.log", "wb") tr0_proc = subprocess.Popen( tr0_cmd.strip().split(" "), @@ -338,29 +338,22 @@ class TestDistBase(unittest.TestCase): stderr=tr1_pipe, env=env1) - tr0_proc.wait() - tr1_proc.wait() - tr0_out, tr0_err = tr0_proc.communicate() tr0_loss_text = cpt.to_text(tr0_out) tr1_out, tr1_err = tr1_proc.communicate() tr1_loss_text = cpt.to_text(tr1_out) # close trainer file - if check_error_log: - tr0_pipe.close() - tr1_pipe.close() + tr0_pipe.close() + tr1_pipe.close() - ps0_pipe.close() - ps1_pipe.close() + ps0_pipe.close() + ps1_pipe.close() # FIXME: use terminate() instead of sigkill. os.kill(ps0.pid, signal.SIGKILL) os.kill(ps1.pid, signal.SIGKILL) ps0.terminate() ps1.terminate() - ps0.wait() - ps1.wait() - FNULL.close() # print log sys.stderr.write('trainer 0 stdout:\n %s\n' % tr0_loss_text) @@ -385,6 +378,7 @@ class TestDistBase(unittest.TestCase): "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""), "FLAGS_fraction_of_gpu_memory_to_use": "0.15", "FLAGS_cudnn_deterministic": "1", + "http_proxy": "" } required_envs.update(need_envs) diff --git a/python/paddle/fluid/tests/unittests/test_dist_ctr.py b/python/paddle/fluid/tests/unittests/test_dist_ctr.py index 081d6e9273ebaf7af643b8481399d11d1ab60e00..3575fd07fc727bd6c6b07a19a60b1df6656ae9e2 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_ctr.py +++ b/python/paddle/fluid/tests/unittests/test_dist_ctr.py @@ -21,10 +21,11 @@ from test_dist_base import TestDistBase class TestDistCTR2x2(TestDistBase): def _setup_config(self): self._sync_mode = True - self._use_cuda = False + self._enforce_place = "CPU" - def test_dist_ctr(self): - self.check_with_place("dist_ctr.py", delta=1e-7, check_error_log=False) + +def test_dist_ctr(self): + self.check_with_place("dist_ctr.py", delta=1e-7, check_error_log=False) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py b/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py index 6bc707c245ab13dd2dbe50b953ef5308aba05b78..e971f29db42a7c1a2394505a8ece3d2fd6b347e9 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py +++ b/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py @@ -22,7 +22,7 @@ from test_dist_base import TestDistBase class TestDistSimnetBowDense2x2(TestDistBase): def _setup_config(self): self._sync_mode = True - self._use_cuda = False + self._enforce_place = "CPU" def test_simnet_bow(self): need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '0'} @@ -36,7 +36,7 @@ class TestDistSimnetBowDense2x2(TestDistBase): class TestDistSimnetBow2x2DenseAsync(TestDistBase): def _setup_config(self): self._sync_mode = False - self._use_cuda = False + self._enforce_place = "CPU" def test_simnet_bow(self): need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '0'} @@ -50,7 +50,7 @@ class TestDistSimnetBow2x2DenseAsync(TestDistBase): class TestDistSimnetBowSparse2x2(TestDistBase): def _setup_config(self): self._sync_mode = True - self._use_cuda = False + self._enforce_place = "CPU" def test_simnet_bow(self): need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '1'} @@ -64,7 +64,7 @@ class TestDistSimnetBowSparse2x2(TestDistBase): class TestDistSimnetBow2x2SparseAsync(TestDistBase): def _setup_config(self): self._sync_mode = False - self._use_cuda = False + self._enforce_place = "CPU" def test_simnet_bow(self): need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '1'} diff --git a/python/paddle/fluid/tests/unittests/test_dist_text_classification.py b/python/paddle/fluid/tests/unittests/test_dist_text_classification.py index b830c965caf2e47c5cc648bc98960459fa6b30ee..0c1680359e2b84807084b06eab0534b41ecd6133 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_text_classification.py +++ b/python/paddle/fluid/tests/unittests/test_dist_text_classification.py @@ -21,7 +21,7 @@ from test_dist_base import TestDistBase class TestDistTextClassification2x2(TestDistBase): def _setup_config(self): self._sync_mode = True - self._use_cuda = False + self._enforce_place = "CPU" def test_text_classification(self): self.check_with_place("dist_text_classification.py", delta=1e-6) @@ -30,7 +30,7 @@ class TestDistTextClassification2x2(TestDistBase): class TestDistTextClassification2x2Async(TestDistBase): def _setup_config(self): self._sync_mode = False - self._use_cuda = False + self._enforce_place = "CPU" def test_se_resnext(self): self.check_with_place("dist_text_classification.py", delta=100) diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index b8dc9e8ad7cd7cd100d5c3cb99319e6f5a37da91..1d8d0b55f0c5d7cffa01a100847bdf48b6d7023d 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -825,6 +825,15 @@ class TestBook(unittest.TestCase): self.assertIsNotNone(out) print(str(program)) + def iou_similarity(self): + program = Program() + with program_guard(program): + x = layers.data(name="x", shape=[16], dtype="float32") + y = layers.data(name="y", shape=[16], dtype="float32") + out = layers.iou_similarity(x, y, name='iou_similarity') + self.assertIsNotNone(out) + print(str(program)) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_operator_desc.py b/python/paddle/fluid/tests/unittests/test_operator_desc.py index 37b9a9188ab44df81029ae6d9925ae21c1929cff..4153394c1da776d0a41e1415a09fa7d6f4b14d6d 100644 --- a/python/paddle/fluid/tests/unittests/test_operator_desc.py +++ b/python/paddle/fluid/tests/unittests/test_operator_desc.py @@ -69,7 +69,7 @@ class TestOperator(unittest.TestCase): set(mul_op.attr_names), set([ "x_num_col_dims", "y_num_col_dims", "op_role", "op_role_var", - "op_namescope", "op_callstack" + "op_namescope" ])) self.assertEqual(mul_op.has_attr("x_num_col_dims"), True) self.assertEqual(mul_op.attr_type("x_num_col_dims"), core.AttrType.INT) diff --git a/python/paddle/fluid/transpiler/inference_transpiler.py b/python/paddle/fluid/transpiler/inference_transpiler.py index 43d51b03e81895d7322d9e28a9c40b6d7cc69206..c402535b27142e94af339a6c18401ba20bc6564d 100644 --- a/python/paddle/fluid/transpiler/inference_transpiler.py +++ b/python/paddle/fluid/transpiler/inference_transpiler.py @@ -124,7 +124,7 @@ class InferenceTranspiler(object): next_op = self.block.ops[i + 1] if next_op.type == 'relu': # modify bnorm OP to include relu - current_op.set_attr("fuse_relu", True) + current_op._set_attr("fuse_relu", True) # remove relu OP self.block._remove_op(i + 1) i = i + 1 @@ -454,7 +454,7 @@ class InferenceTranspiler(object): :type eltwise_op: Operator ''' - conv_op.set_attr("fuse_eltwise", True) + conv_op._set_attr("fuse_eltwise", True) self.input_map[conv_op.output("Output")[0]] = eltwise_op.input("Y")[0] self.input_map[eltwise_op.output("Out")[0]] = eltwise_op.input("Y")[0]