“33133d26d6e769d6b18c297fba6e067f65193c89”上不存在“source/dnode/vnode/git@gitcode.net:taosdata/tdengine.git”
提交 91756a5a 编写于 作者: Q qiaolongfei

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into optimize-opyreader

...@@ -24,6 +24,7 @@ COPY ./paddle/scripts/docker/root/ /root/ ...@@ -24,6 +24,7 @@ COPY ./paddle/scripts/docker/root/ /root/
RUN apt-get update && \ RUN apt-get update && \
apt-get install -y --allow-downgrades patchelf \ apt-get install -y --allow-downgrades patchelf \
python3 python3-dev python3-pip \
git python-pip python-dev python-opencv openssh-server bison \ git python-pip python-dev python-opencv openssh-server bison \
libnccl2=2.1.2-1+cuda8.0 libnccl-dev=2.1.2-1+cuda8.0 \ libnccl2=2.1.2-1+cuda8.0 libnccl-dev=2.1.2-1+cuda8.0 \
wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \ wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
...@@ -70,24 +71,33 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8 ...@@ -70,24 +71,33 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8
# specify sphinx version as 1.5.6 and remove -U option for [pip install -U # specify sphinx version as 1.5.6 and remove -U option for [pip install -U
# sphinx-rtd-theme] since -U option will cause sphinx being updated to newest # sphinx-rtd-theme] since -U option will cause sphinx being updated to newest
# version(1.7.1 for now), which causes building documentation failed. # version(1.7.1 for now), which causes building documentation failed.
RUN easy_install -U pip && \ RUN pip3 install -U wheel && \
pip3 install -U docopt PyYAML sphinx==1.5.6 && \
pip3 install sphinx-rtd-theme==0.1.9 recommonmark && \
easy_install -U pip && \
pip install -U wheel && \ pip install -U wheel && \
pip install -U docopt PyYAML sphinx==1.5.6 && \ pip install -U docopt PyYAML sphinx==1.5.6 && \
pip install sphinx-rtd-theme==0.1.9 recommonmark pip install sphinx-rtd-theme==0.1.9 recommonmark
RUN pip install pre-commit 'ipython==5.3.0' && \ RUN pip3 install pre-commit 'ipython==5.3.0' && \
pip3 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
pip3 install opencv-python && \
pip install pre-commit 'ipython==5.3.0' && \
pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
pip install opencv-python pip install opencv-python
#For docstring checker #For docstring checker
RUN pip3 install pylint pytest astroid isort
RUN pip install pylint pytest astroid isort LinkChecker RUN pip install pylint pytest astroid isort LinkChecker
COPY ./python/requirements.txt /root/ COPY ./python/requirements.txt /root/
RUN pip3 install -r /root/requirements.txt
RUN pip install -r /root/requirements.txt RUN pip install -r /root/requirements.txt
# To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use # To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use
# the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2 # the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2
RUN apt-get install -y libssl-dev libffi-dev RUN apt-get install -y libssl-dev libffi-dev
RUN pip3 install certifi urllib3[secure]
RUN pip install certifi urllib3[secure] RUN pip install certifi urllib3[secure]
......
...@@ -52,6 +52,7 @@ ExternalProject_Add( ...@@ -52,6 +52,7 @@ ExternalProject_Add(
PREFIX ${ANAKIN_SOURCE_DIR} PREFIX ${ANAKIN_SOURCE_DIR}
UPDATE_COMMAND "" UPDATE_COMMAND ""
CMAKE_ARGS ${CMAKE_ARGS_PREFIX} CMAKE_ARGS ${CMAKE_ARGS_PREFIX}
-DUSE_LOGGER=YES
-DUSE_X86_PLACE=YES -DUSE_X86_PLACE=YES
-DBUILD_WITH_UNIT_TEST=NO -DBUILD_WITH_UNIT_TEST=NO
-DPROTOBUF_ROOT=${THIRD_PARTY_PATH}/install/protobuf -DPROTOBUF_ROOT=${THIRD_PARTY_PATH}/install/protobuf
......
...@@ -21,7 +21,7 @@ paddle.fluid.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'en ...@@ -21,7 +21,7 @@ paddle.fluid.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'en
paddle.fluid.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.DistributeTranspiler.get_trainer_program ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,)) paddle.fluid.DistributeTranspiler.get_trainer_program ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,))
paddle.fluid.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program', 'current_endpoint'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None, '127.0.0.1:6174')) paddle.fluid.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program', 'current_endpoint'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None, '127.0.0.1:6174'))
paddle.fluid.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0)) paddle.fluid.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level', 'skip_grads'], varargs=None, keywords=None, defaults=(None, False, 0, False))
paddle.fluid.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.DistributeTranspilerConfig.__init__ paddle.fluid.DistributeTranspilerConfig.__init__
paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id', 'scope'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 1, 0, None)) paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id', 'scope'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 1, 0, None))
...@@ -49,7 +49,7 @@ paddle.fluid.initializer.BilinearInitializer.__init__ ArgSpec(args=['self'], var ...@@ -49,7 +49,7 @@ paddle.fluid.initializer.BilinearInitializer.__init__ ArgSpec(args=['self'], var
paddle.fluid.initializer.MSRAInitializer.__init__ ArgSpec(args=['self', 'uniform', 'fan_in', 'seed'], varargs=None, keywords=None, defaults=(True, None, 0)) paddle.fluid.initializer.MSRAInitializer.__init__ ArgSpec(args=['self', 'uniform', 'fan_in', 'seed'], varargs=None, keywords=None, defaults=(True, None, 0))
paddle.fluid.initializer.force_init_on_cpu ArgSpec(args=[], varargs=None, keywords=None, defaults=None) paddle.fluid.initializer.force_init_on_cpu ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
paddle.fluid.initializer.init_on_cpu ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) paddle.fluid.initializer.init_on_cpu ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
paddle.fluid.layers.fc ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param_attr', 'bias_attr', 'use_mkldnn', 'act', 'is_test', 'name'], varargs=None, keywords=None, defaults=(1, None, None, False, None, False, None)) paddle.fluid.layers.fc ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param_attr', 'bias_attr', 'act', 'is_test', 'name'], varargs=None, keywords=None, defaults=(1, None, None, None, False, None))
paddle.fluid.layers.embedding ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32')) paddle.fluid.layers.embedding ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32'))
paddle.fluid.layers.dynamic_lstm ArgSpec(args=['input', 'size', 'h_0', 'c_0', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'float32', None)) paddle.fluid.layers.dynamic_lstm ArgSpec(args=['input', 'size', 'h_0', 'c_0', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'float32', None))
paddle.fluid.layers.dynamic_lstmp ArgSpec(args=['input', 'size', 'proj_size', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'proj_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'tanh', 'float32', None)) paddle.fluid.layers.dynamic_lstmp ArgSpec(args=['input', 'size', 'proj_size', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'proj_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'tanh', 'float32', None))
...@@ -62,14 +62,14 @@ paddle.fluid.layers.cross_entropy ArgSpec(args=['input', 'label', 'soft_label', ...@@ -62,14 +62,14 @@ paddle.fluid.layers.cross_entropy ArgSpec(args=['input', 'label', 'soft_label',
paddle.fluid.layers.square_error_cost ArgSpec(args=['input', 'label'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.square_error_cost ArgSpec(args=['input', 'label'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.chunk_eval ArgSpec(args=['input', 'label', 'chunk_scheme', 'num_chunk_types', 'excluded_chunk_types'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.chunk_eval ArgSpec(args=['input', 'label', 'chunk_scheme', 'num_chunk_types', 'excluded_chunk_types'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.sequence_conv ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'bias_attr', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(3, 1, None, None, None, None)) paddle.fluid.layers.sequence_conv ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'bias_attr', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(3, 1, None, None, None, None))
paddle.fluid.layers.conv2d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, False, None, None)) paddle.fluid.layers.conv2d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None))
paddle.fluid.layers.conv3d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, False, None, None)) paddle.fluid.layers.conv3d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None))
paddle.fluid.layers.sequence_pool ArgSpec(args=['input', 'pool_type'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.sequence_pool ArgSpec(args=['input', 'pool_type'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'param_attr', 'bias_attr', 'use_cudnn'], varargs=None, keywords=None, defaults=(None, None, False)) paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'param_attr', 'bias_attr', 'use_cudnn'], varargs=None, keywords=None, defaults=(None, None, False))
paddle.fluid.layers.softmax ArgSpec(args=['input', 'param_attr', 'bias_attr', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(None, None, True, None)) paddle.fluid.layers.softmax ArgSpec(args=['input', 'param_attr', 'bias_attr', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(None, None, True, None))
paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'use_mkldnn', 'name'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, False, None)) paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None))
paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'use_mkldnn', 'name'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, False, None)) paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None))
paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'use_mkldnn', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, False, None, None, None, False, False)) paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False))
paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)) paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
paddle.fluid.layers.conv3d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)) paddle.fluid.layers.conv3d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
...@@ -145,21 +145,31 @@ paddle.fluid.layers.unstack ArgSpec(args=['x', 'axis', 'num'], varargs=None, key ...@@ -145,21 +145,31 @@ paddle.fluid.layers.unstack ArgSpec(args=['x', 'axis', 'num'], varargs=None, key
paddle.fluid.layers.sequence_enumerate ArgSpec(args=['input', 'win_size', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0, None)) paddle.fluid.layers.sequence_enumerate ArgSpec(args=['input', 'win_size', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0, None))
paddle.fluid.layers.expand ArgSpec(args=['x', 'expand_times', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.expand ArgSpec(args=['x', 'expand_times', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.sequence_concat ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.sequence_concat ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.scale ArgSpec(args=['x', 'scale', 'bias', 'bias_after_scale', 'out', 'act', 'name'], varargs=None, keywords=None, defaults=(1.0, 0.0, True, None, None, None)) paddle.fluid.layers.scale ArgSpec(args=['x', 'scale', 'bias', 'bias_after_scale', 'act', 'name'], varargs=None, keywords=None, defaults=(1.0, 0.0, True, None, None))
paddle.fluid.layers.elementwise_add ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None)) paddle.fluid.layers.elementwise_add ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None))
paddle.fluid.layers.elementwise_div ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None)) paddle.fluid.layers.elementwise_div ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None))
paddle.fluid.layers.elementwise_sub ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None)) paddle.fluid.layers.elementwise_sub ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None))
paddle.fluid.layers.elementwise_mul ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None)) paddle.fluid.layers.elementwise_mul ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None))
paddle.fluid.layers.elementwise_max ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None)) paddle.fluid.layers.elementwise_max ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None))
paddle.fluid.layers.elementwise_min ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None)) paddle.fluid.layers.elementwise_min ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None))
paddle.fluid.layers.elementwise_pow ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None)) paddle.fluid.layers.elementwise_pow ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None))
paddle.fluid.layers.uniform_random_batch_size_like ArgSpec(args=['input', 'shape', 'dtype', 'input_dim_idx', 'output_dim_idx', 'min', 'max', 'seed'], varargs=None, keywords=None, defaults=('float32', 0, 0, -1.0, 1.0, 0)) paddle.fluid.layers.uniform_random_batch_size_like ArgSpec(args=['input', 'shape', 'dtype', 'input_dim_idx', 'output_dim_idx', 'min', 'max', 'seed'], varargs=None, keywords=None, defaults=('float32', 0, 0, -1.0, 1.0, 0))
paddle.fluid.layers.gaussian_random ArgSpec(args=['shape', 'mean', 'std', 'seed', 'dtype', 'use_mkldnn'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0, 'float32', False)) paddle.fluid.layers.gaussian_random ArgSpec(args=['shape', 'mean', 'std', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0, 'float32'))
paddle.fluid.layers.sampling_id ArgSpec(args=['x', 'min', 'max', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0, 'float32')) paddle.fluid.layers.sampling_id ArgSpec(args=['x', 'min', 'max', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0, 'float32'))
paddle.fluid.layers.gaussian_random_batch_size_like ArgSpec(args=['input', 'shape', 'input_dim_idx', 'output_dim_idx', 'mean', 'std', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0, 0, 0.0, 1.0, 0, 'float32')) paddle.fluid.layers.gaussian_random_batch_size_like ArgSpec(args=['input', 'shape', 'input_dim_idx', 'output_dim_idx', 'mean', 'std', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0, 0, 0.0, 1.0, 0, 'float32'))
paddle.fluid.layers.sum ArgSpec(args=['x', 'use_mkldnn'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.layers.sum ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.slice ArgSpec(args=['input', 'axes', 'starts', 'ends'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.slice ArgSpec(args=['input', 'axes', 'starts', 'ends'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.shape ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.shape ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.logical_and ArgSpec(args=['x', 'y', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.layers.logical_or ArgSpec(args=['x', 'y', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.layers.logical_xor ArgSpec(args=['x', 'y', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.layers.logical_not ArgSpec(args=['x', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.layers.clip ArgSpec(args=['x', 'min', 'max', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.clip_by_norm ArgSpec(args=['x', 'max_norm', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.mean ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.mul ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dims', 'name'], varargs=None, keywords=None, defaults=(1, 1, None))
paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.maxout ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)) paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)) paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None)
...@@ -223,16 +233,6 @@ paddle.fluid.layers.StaticRNN.update_memory ArgSpec(args=['self', 'mem', 'var'], ...@@ -223,16 +233,6 @@ paddle.fluid.layers.StaticRNN.update_memory ArgSpec(args=['self', 'mem', 'var'],
paddle.fluid.layers.reorder_lod_tensor_by_rank ArgSpec(args=['x', 'rank_table'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.reorder_lod_tensor_by_rank ArgSpec(args=['x', 'rank_table'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.Print ArgSpec(args=['input', 'first_n', 'message', 'summarize', 'print_tensor_name', 'print_tensor_type', 'print_tensor_shape', 'print_tensor_lod', 'print_phase'], varargs=None, keywords=None, defaults=(-1, None, -1, True, True, True, True, 'both')) paddle.fluid.layers.Print ArgSpec(args=['input', 'first_n', 'message', 'summarize', 'print_tensor_name', 'print_tensor_type', 'print_tensor_shape', 'print_tensor_lod', 'print_phase'], varargs=None, keywords=None, defaults=(-1, None, -1, True, True, True, True, 'both'))
paddle.fluid.layers.is_empty ArgSpec(args=['x', 'cond'], varargs=None, keywords='ignored', defaults=(None,)) paddle.fluid.layers.is_empty ArgSpec(args=['x', 'cond'], varargs=None, keywords='ignored', defaults=(None,))
paddle.fluid.layers.mean ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.mul ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.clip ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.clip_by_norm ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.logical_and ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.logical_or ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.logical_xor ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.logical_not ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.maxout ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.sigmoid ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.sigmoid ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.logsigmoid ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.logsigmoid ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.exp ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.exp ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
...@@ -266,9 +266,9 @@ paddle.fluid.layers.anchor_generator ArgSpec(args=['input', 'anchor_sizes', 'asp ...@@ -266,9 +266,9 @@ paddle.fluid.layers.anchor_generator ArgSpec(args=['input', 'anchor_sizes', 'asp
paddle.fluid.layers.roi_perspective_transform ArgSpec(args=['input', 'rois', 'transformed_height', 'transformed_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1.0,)) paddle.fluid.layers.roi_perspective_transform ArgSpec(args=['input', 'rois', 'transformed_height', 'transformed_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1.0,))
paddle.fluid.layers.generate_proposal_labels ArgSpec(args=['rpn_rois', 'gt_classes', 'is_crowd', 'gt_boxes', 'im_info', 'batch_size_per_im', 'fg_fraction', 'fg_thresh', 'bg_thresh_hi', 'bg_thresh_lo', 'bbox_reg_weights', 'class_nums', 'use_random'], varargs=None, keywords=None, defaults=(256, 0.25, 0.25, 0.5, 0.0, [0.1, 0.1, 0.2, 0.2], None, True)) paddle.fluid.layers.generate_proposal_labels ArgSpec(args=['rpn_rois', 'gt_classes', 'is_crowd', 'gt_boxes', 'im_info', 'batch_size_per_im', 'fg_fraction', 'fg_thresh', 'bg_thresh_hi', 'bg_thresh_lo', 'bbox_reg_weights', 'class_nums', 'use_random'], varargs=None, keywords=None, defaults=(256, 0.25, 0.25, 0.5, 0.0, [0.1, 0.1, 0.2, 0.2], None, True))
paddle.fluid.layers.generate_proposals ArgSpec(args=['scores', 'bbox_deltas', 'im_info', 'anchors', 'variances', 'pre_nms_top_n', 'post_nms_top_n', 'nms_thresh', 'min_size', 'eta', 'name'], varargs=None, keywords=None, defaults=(6000, 1000, 0.5, 0.1, 1.0, None)) paddle.fluid.layers.generate_proposals ArgSpec(args=['scores', 'bbox_deltas', 'im_info', 'anchors', 'variances', 'pre_nms_top_n', 'post_nms_top_n', 'nms_thresh', 'min_size', 'eta', 'name'], varargs=None, keywords=None, defaults=(6000, 1000, 0.5, 0.1, 1.0, None))
paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.iou_similarity ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.box_coder ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.box_coder ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name'], varargs=None, keywords=None, defaults=('encode_center_size', True, None))
paddle.fluid.layers.polygon_box_transform ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.polygon_box_transform ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None)) paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None))
paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk', 'slide_steps'], varargs=None, keywords=None, defaults=('ROC', 4095, 1, 1)) paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk', 'slide_steps'], varargs=None, keywords=None, defaults=('ROC', 4095, 1, 1))
paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
...@@ -300,13 +300,17 @@ paddle.fluid.contrib.BeamSearchDecoder.read_array ArgSpec(args=['self', 'init', ...@@ -300,13 +300,17 @@ paddle.fluid.contrib.BeamSearchDecoder.read_array ArgSpec(args=['self', 'init',
paddle.fluid.contrib.BeamSearchDecoder.update_array ArgSpec(args=['self', 'array', 'value'], varargs=None, keywords=None, defaults=None) paddle.fluid.contrib.BeamSearchDecoder.update_array ArgSpec(args=['self', 'array', 'value'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.memory_usage ArgSpec(args=['program', 'batch_size'], varargs=None, keywords=None, defaults=None) paddle.fluid.contrib.memory_usage ArgSpec(args=['program', 'batch_size'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.op_freq_statistic ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None) paddle.fluid.contrib.op_freq_statistic ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.QuantizeTranspiler.__init__ ArgSpec(args=['self', 'weight_bits', 'activation_bits', 'activation_quantize_type', 'weight_quantize_type', 'window_size'], varargs=None, keywords=None, defaults=(8, 8, 'abs_max', 'abs_max', 10000))
paddle.fluid.contrib.QuantizeTranspiler.convert_to_int8 ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.contrib.QuantizeTranspiler.freeze_program ArgSpec(args=['self', 'program', 'place', 'fuse_bn', 'scope'], varargs=None, keywords=None, defaults=(False, None))
paddle.fluid.contrib.QuantizeTranspiler.training_transpile ArgSpec(args=['self', 'program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.transpiler.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,)) paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,))
paddle.fluid.transpiler.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program', 'current_endpoint'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None, '127.0.0.1:6174')) paddle.fluid.transpiler.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program', 'current_endpoint'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None, '127.0.0.1:6174'))
paddle.fluid.transpiler.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0)) paddle.fluid.transpiler.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level', 'skip_grads'], varargs=None, keywords=None, defaults=(None, False, 0, False))
paddle.fluid.transpiler.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.transpiler.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.transpiler.HashName.__init__ ArgSpec(args=['self', 'pserver_endpoints'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.HashName.__init__ ArgSpec(args=['self', 'pserver_endpoints'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.HashName.dispatch ArgSpec(args=['self', 'varlist'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.HashName.dispatch ArgSpec(args=['self', 'varlist'], varargs=None, keywords=None, defaults=None)
...@@ -315,11 +319,11 @@ paddle.fluid.transpiler.RoundRobin.__init__ ArgSpec(args=['self', 'pserver_endpo ...@@ -315,11 +319,11 @@ paddle.fluid.transpiler.RoundRobin.__init__ ArgSpec(args=['self', 'pserver_endpo
paddle.fluid.transpiler.RoundRobin.dispatch ArgSpec(args=['self', 'varlist'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.RoundRobin.dispatch ArgSpec(args=['self', 'varlist'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.RoundRobin.reset ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.RoundRobin.reset ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspilerConfig.__init__ paddle.fluid.transpiler.DistributeTranspilerConfig.__init__
paddle.fluid.nets.simple_img_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'pool_size', 'pool_stride', 'pool_padding', 'pool_type', 'global_pooling', 'conv_stride', 'conv_padding', 'conv_dilation', 'conv_groups', 'param_attr', 'bias_attr', 'act', 'use_cudnn', 'use_mkldnn'], varargs=None, keywords=None, defaults=(0, 'max', False, 1, 0, 1, 1, None, None, None, True, False)) paddle.fluid.nets.simple_img_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'pool_size', 'pool_stride', 'pool_padding', 'pool_type', 'global_pooling', 'conv_stride', 'conv_padding', 'conv_dilation', 'conv_groups', 'param_attr', 'bias_attr', 'act', 'use_cudnn'], varargs=None, keywords=None, defaults=(0, 'max', False, 1, 0, 1, 1, None, None, None, True))
paddle.fluid.nets.sequence_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'param_attr', 'act', 'pool_type'], varargs=None, keywords=None, defaults=(None, 'sigmoid', 'max')) paddle.fluid.nets.sequence_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'param_attr', 'act', 'pool_type'], varargs=None, keywords=None, defaults=(None, 'sigmoid', 'max'))
paddle.fluid.nets.glu ArgSpec(args=['input', 'dim'], varargs=None, keywords=None, defaults=(-1,)) paddle.fluid.nets.glu ArgSpec(args=['input', 'dim'], varargs=None, keywords=None, defaults=(-1,))
paddle.fluid.nets.scaled_dot_product_attention ArgSpec(args=['queries', 'keys', 'values', 'num_heads', 'dropout_rate'], varargs=None, keywords=None, defaults=(1, 0.0)) paddle.fluid.nets.scaled_dot_product_attention ArgSpec(args=['queries', 'keys', 'values', 'num_heads', 'dropout_rate'], varargs=None, keywords=None, defaults=(1, 0.0))
paddle.fluid.nets.img_conv_group ArgSpec(args=['input', 'conv_num_filter', 'pool_size', 'conv_padding', 'conv_filter_size', 'conv_act', 'param_attr', 'conv_with_batchnorm', 'conv_batchnorm_drop_rate', 'pool_stride', 'pool_type', 'use_cudnn', 'use_mkldnn'], varargs=None, keywords=None, defaults=(1, 3, None, None, False, 0.0, 1, 'max', True, False)) paddle.fluid.nets.img_conv_group ArgSpec(args=['input', 'conv_num_filter', 'pool_size', 'conv_padding', 'conv_filter_size', 'conv_act', 'param_attr', 'conv_with_batchnorm', 'conv_batchnorm_drop_rate', 'pool_stride', 'pool_type', 'use_cudnn'], varargs=None, keywords=None, defaults=(1, 3, None, None, False, 0.0, 1, 'max', True))
paddle.fluid.optimizer.SGDOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'regularization', 'name'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.optimizer.SGDOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'regularization', 'name'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.optimizer.SGDOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.SGDOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.MomentumOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'momentum', 'use_nesterov', 'regularization', 'name'], varargs=None, keywords=None, defaults=(False, None, None)) paddle.fluid.optimizer.MomentumOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'momentum', 'use_nesterov', 'regularization', 'name'], varargs=None, keywords=None, defaults=(False, None, None))
......
...@@ -56,9 +56,9 @@ else() ...@@ -56,9 +56,9 @@ else()
cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor) cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor)
endif() endif()
if (NOT WIN32) if (NOT WIN32)
cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio version) cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio version)
else() else()
cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto version) cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto version)
endif (NOT WIN32) endif (NOT WIN32)
cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory) cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory)
...@@ -141,20 +141,22 @@ cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor) ...@@ -141,20 +141,22 @@ cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)
cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog) cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog)
cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass)
if(WITH_DISTRIBUTE) if(WITH_DISTRIBUTE)
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass) cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass)
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
else() else()
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass) cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass)
cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass elementwise_add_op)
endif() endif()
if (NOT WIN32) if (NOT WIN32)
cc_library(parallel_executor SRCS parallel_executor.cc DEPS cc_library(parallel_executor SRCS parallel_executor.cc DEPS
threaded_ssa_graph_executor scope_buffered_ssa_graph_executor threaded_ssa_graph_executor scope_buffered_ssa_graph_executor
graph graph_viz_pass multi_devices_graph_pass graph build_strategy
multi_devices_graph_print_pass multi_devices_graph_check_pass fast_threaded_ssa_graph_executor)
fast_threaded_ssa_graph_executor fuse_elewise_add_act_pass)
endif() # NOT WIN32 endif() # NOT WIN32
cc_library(prune SRCS prune.cc DEPS framework_proto) cc_library(prune SRCS prune.cc DEPS framework_proto)
...@@ -167,15 +169,8 @@ cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows) ...@@ -167,15 +169,8 @@ cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows)
cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto) cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto)
cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc) cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)
# cc_test(channel_test SRCS channel_test.cc)
cc_test(tuple_test SRCS tuple_test.cc ) cc_test(tuple_test SRCS tuple_test.cc )
if (NOT WIN32) if (NOT WIN32)
cc_test(rw_lock_test SRCS rw_lock_test.cc) cc_test(rw_lock_test SRCS rw_lock_test.cc)
endif (NOT WIN32) endif (NOT WIN32)
# disable test temporarily.
# TODO https://github.com/PaddlePaddle/Paddle/issues/11971
# cc_test(concurrency_test SRCS concurrency_test.cc DEPS go_op channel_close_op channel_create_op
# channel_send_op channel_recv_op sum_op select_op elementwise_add_op compare_op
# conditional_block_op while_op assign_op print_op executor proto_desc)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stddef.h> // for size_t
#include <condition_variable> // NOLINT
#include <typeindex>
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace framework {
enum class ChannelAction {
SEND = 0,
RECEIVE = 1,
CLOSE = 2,
};
// Channel is the abstract class of buffered and un-buffered channels.
template <typename T>
class Channel {
public:
virtual bool CanSend() = 0;
virtual bool CanReceive() = 0;
virtual void Send(T*) = 0;
virtual bool Receive(T*) = 0;
virtual size_t Cap() = 0;
virtual void Lock() = 0;
virtual void Unlock() = 0;
virtual bool IsClosed() = 0;
virtual void Close() = 0;
virtual ~Channel() {}
virtual void AddToSendQ(const void* referrer, T* data,
std::shared_ptr<std::condition_variable_any> cond,
std::function<bool(ChannelAction)> cb) = 0;
virtual void AddToReceiveQ(const void* referrer, T* data,
std::shared_ptr<std::condition_variable_any> cond,
std::function<bool(ChannelAction)> cb) = 0;
virtual void RemoveFromSendQ(const void* referrer) = 0;
virtual void RemoveFromReceiveQ(const void* referrer) = 0;
};
// Forward declaration of channel implementations.
template <typename T>
class ChannelImpl;
template <typename T>
Channel<T>* MakeChannel(size_t buffer_size) {
return new ChannelImpl<T>(buffer_size);
}
template <typename T>
void CloseChannel(Channel<T>* ch) {
ch->Close();
}
/*
* The ChannelHolder class serves two main purposes:
* 1. It acts as a unified wrapper for the different kinds of
* channels, i.e. Buffered and Unbuffered channels. This is
* similar to the ReaderHolder class.
* 2. It also helps us in TypeHiding. This is similar to the
* PlaceHolder implementations in variable.h and tensor.h.
*/
class ChannelHolder {
public:
template <typename T>
void Reset(size_t buffer_size) {
holder_.reset(new PlaceholderImpl<T>(buffer_size));
}
template <typename T>
void Send(T* data) {
PADDLE_ENFORCE_EQ(IsInitialized(), true,
"The Channel hasn't been initialized");
PADDLE_ENFORCE_EQ(
holder_->Type(), std::type_index(typeid(T)),
"Channel type is not same as the type of the data being sent");
// Static cast should be safe because we have ensured that types are same
Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
PADDLE_ENFORCE_EQ(channel != nullptr, true, "Channel should not be null.");
channel->Send(data);
}
template <typename T>
bool Receive(T* data) {
PADDLE_ENFORCE_EQ(IsInitialized(), true,
"The Channel hasn't been initialized");
PADDLE_ENFORCE_EQ(
holder_->Type(), std::type_index(typeid(T)),
"Channel type is not same as the type of the data being sent");
Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
PADDLE_ENFORCE_EQ(channel != nullptr, true, "Channel should not be null.");
return channel->Receive(data);
}
bool IsClosed() {
PADDLE_ENFORCE_EQ(IsInitialized(), true,
"The Channel hasn't been initialized");
return holder_->IsClosed();
}
bool CanSend() {
PADDLE_ENFORCE_EQ(IsInitialized(), true,
"The Channel hasn't been initialized");
return holder_->CanSend();
}
bool CanReceive() {
PADDLE_ENFORCE_EQ(IsInitialized(), true,
"The Channel hasn't been initialized");
return holder_->CanReceive();
}
void close() {
PADDLE_ENFORCE_EQ(IsInitialized(), true,
"The Channel hasn't been initialized");
holder_->Close();
}
size_t Cap() {
PADDLE_ENFORCE_EQ(IsInitialized(), true,
"The Channel hasn't been initialized");
return holder_->Cap();
}
void Lock() {
PADDLE_ENFORCE_EQ(IsInitialized(), true,
"The Channel hasn't been initialized");
holder_->Lock();
}
void Unlock() {
PADDLE_ENFORCE_EQ(IsInitialized(), true,
"The Channel hasn't been initialized");
holder_->Unlock();
}
template <typename T>
void AddToSendQ(const void* referrer, T* data,
std::shared_ptr<std::condition_variable_any> cond,
std::function<bool(ChannelAction)> cb) {
PADDLE_ENFORCE_EQ(IsInitialized(), true,
"The Channel hasn't been initialized");
Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
if (channel != nullptr) {
channel->AddToSendQ(referrer, data, cond, cb);
}
}
template <typename T>
void AddToReceiveQ(const void* referrer, T* data,
std::shared_ptr<std::condition_variable_any> cond,
std::function<bool(ChannelAction)> cb) {
PADDLE_ENFORCE_EQ(IsInitialized(), true,
"The Channel hasn't been initialized");
Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
if (channel != nullptr) {
channel->AddToReceiveQ(referrer, data, cond, cb);
}
}
void RemoveFromSendQ(const void* referrer) {
PADDLE_ENFORCE_EQ(IsInitialized(), true,
"The Channel hasn't been initialized");
holder_->RemoveFromSendQ(referrer);
}
void RemoveFromReceiveQ(const void* referrer) {
PADDLE_ENFORCE_EQ(IsInitialized(), true,
"The Channel hasn't been initialized");
holder_->RemoveFromReceiveQ(referrer);
}
inline bool IsInitialized() const { return holder_ != nullptr; }
inline const std::type_index Type() {
PADDLE_ENFORCE_EQ(IsInitialized(), true,
"The Channel hasn't been initialized");
return holder_->Type();
}
private:
/**
* @note Placeholder hides type T, so it doesn't appear as a template
* parameter of ChannelHolder.
*/
struct Placeholder {
virtual ~Placeholder() {}
virtual const std::type_index Type() const = 0;
virtual void* Ptr() const = 0;
virtual bool IsClosed() = 0;
virtual bool CanSend() = 0;
virtual bool CanReceive() = 0;
virtual void RemoveFromSendQ(const void* referrer) = 0;
virtual void RemoveFromReceiveQ(const void* referrer) = 0;
virtual void Close() = 0;
virtual void Lock() = 0;
virtual void Unlock() = 0;
virtual size_t Cap() = 0;
};
template <typename T>
struct PlaceholderImpl : public Placeholder {
explicit PlaceholderImpl(size_t buffer_size)
: type_(std::type_index(typeid(T))) {
channel_.reset(MakeChannel<T>(buffer_size));
}
virtual const std::type_index Type() const { return type_; }
virtual void* Ptr() const { return static_cast<void*>(channel_.get()); }
virtual bool IsClosed() {
if (channel_) {
return channel_->IsClosed();
}
return false;
}
virtual bool CanSend() {
if (channel_) {
return channel_->CanSend();
}
return false;
}
virtual bool CanReceive() {
if (channel_) {
return channel_->CanReceive();
}
return false;
}
virtual void RemoveFromSendQ(const void* referrer) {
if (channel_) {
channel_->RemoveFromSendQ(referrer);
}
}
virtual void RemoveFromReceiveQ(const void* referrer) {
if (channel_) {
channel_->RemoveFromReceiveQ(referrer);
}
}
virtual void Close() {
if (channel_) channel_->Close();
}
virtual size_t Cap() {
if (channel_)
return channel_->Cap();
else
return -1;
}
virtual void Lock() {
if (channel_) channel_->Lock();
}
virtual void Unlock() {
if (channel_) channel_->Unlock();
}
std::unique_ptr<Channel<T>> channel_;
const std::type_index type_;
};
// Pointer to a PlaceholderImpl object
std::unique_ptr<Placeholder> holder_;
};
} // namespace framework
} // namespace paddle
#include "paddle/fluid/framework/channel_impl.h"
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stddef.h> // for size_t
#include <atomic>
#include <condition_variable> // NOLINT
#include <deque>
#include "paddle/fluid/framework/channel.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace framework {
template <typename T>
class ChannelImpl : public paddle::framework::Channel<T> {
friend Channel<T> *paddle::framework::MakeChannel<T>(size_t);
friend void paddle::framework::CloseChannel<T>(Channel<T> *);
public:
virtual bool CanSend();
virtual bool CanReceive();
virtual void Send(T *);
virtual bool Receive(T *);
virtual size_t Cap() { return cap_; }
virtual void Lock();
virtual void Unlock();
virtual bool IsClosed();
virtual void Close();
explicit ChannelImpl(size_t);
virtual ~ChannelImpl();
virtual void AddToSendQ(const void *referrer, T *data,
std::shared_ptr<std::condition_variable_any> cond,
std::function<bool(ChannelAction)> cb);
virtual void AddToReceiveQ(const void *referrer, T *data,
std::shared_ptr<std::condition_variable_any> cond,
std::function<bool(ChannelAction)> cb);
virtual void RemoveFromSendQ(const void *referrer);
virtual void RemoveFromReceiveQ(const void *referrer);
private:
struct QueueMessage {
T *data;
std::shared_ptr<std::condition_variable_any> cond;
bool chan_closed = false;
bool completed = false;
const void *referrer; // TODO(thuan): figure out better way to do this
std::function<bool(ChannelAction)> callback;
explicit QueueMessage(T *item)
: data(item), cond(std::make_shared<std::condition_variable_any>()) {}
QueueMessage(T *item, std::shared_ptr<std::condition_variable_any> cond)
: data(item), cond(cond) {}
void Wait(std::unique_lock<std::recursive_mutex> &lock) {
cond->wait(lock, [this]() { return completed; });
}
void Notify() {
completed = true;
cond->notify_all();
}
};
void send_return() {
send_ctr--;
destructor_cond_.notify_all();
}
bool recv_return(bool value) {
recv_ctr--;
destructor_cond_.notify_all();
return value;
}
std::shared_ptr<QueueMessage> get_first_message(
std::deque<std::shared_ptr<QueueMessage>> *queue, ChannelAction action) {
while (!queue->empty()) {
// Check whether this message was added by Select
// If this was added by Select then execute the callback
// to check if you can execute this message. The callback
// can return false if some other case was executed in Select.
// In that case just discard this QueueMessage and process next.
std::shared_ptr<QueueMessage> m = queue->front();
queue->pop_front();
if (m->callback == nullptr || m->callback(action)) return m;
}
return nullptr;
}
size_t cap_;
std::recursive_mutex mu_;
bool closed_;
std::deque<T> buf_;
std::deque<std::shared_ptr<QueueMessage>> recvq;
std::deque<std::shared_ptr<QueueMessage>> sendq;
std::atomic<unsigned> send_ctr{0};
std::atomic<unsigned> recv_ctr{0};
std::condition_variable_any destructor_cond_;
};
template <typename T>
ChannelImpl<T>::ChannelImpl(size_t capacity)
: cap_(capacity), closed_(false), send_ctr(0), recv_ctr(0) {
PADDLE_ENFORCE_GE(capacity, 0);
}
template <typename T>
bool ChannelImpl<T>::CanSend() {
std::lock_guard<std::recursive_mutex> lock{mu_};
return !closed_ && (!recvq.empty() || buf_.size() < cap_);
}
template <typename T>
bool ChannelImpl<T>::CanReceive() {
std::lock_guard<std::recursive_mutex> lock{mu_};
return !(closed_ && buf_.empty()) && (!sendq.empty() || buf_.size() > 0);
}
template <typename T>
void ChannelImpl<T>::Send(T *item) {
send_ctr++;
std::unique_lock<std::recursive_mutex> lock{mu_};
// If channel is closed, throw exception
if (closed_) {
send_return();
lock.unlock();
PADDLE_THROW("Cannot send on closed channel");
}
// If there is a receiver, directly pass the value we want
// to send to the receiver, bypassing the channel buffer if any
if (!recvq.empty()) {
std::shared_ptr<QueueMessage> m =
get_first_message(&recvq, ChannelAction::SEND);
if (m != nullptr) {
*(m->data) = std::move(*item);
m->Notify();
send_return();
return;
} else {
Send(item);
send_return();
return;
}
}
// Unbuffered channel will always bypass this
// If buffered channel has space in buffer,
// write the element to the buffer.
if (buf_.size() < cap_) {
// Copy to buffer
buf_.push_back(std::move(*item));
send_return();
return;
}
// Block on channel, because some receiver will complete
// the operation for us
auto m = std::make_shared<QueueMessage>(item);
sendq.push_back(m);
m->Wait(lock);
if (m->chan_closed) {
send_return();
lock.unlock();
PADDLE_THROW("Cannot send on closed channel");
}
send_return();
}
template <typename T>
bool ChannelImpl<T>::Receive(T *item) {
recv_ctr++;
std::unique_lock<std::recursive_mutex> lock{mu_};
// If channel is closed and buffer is empty or
// channel is unbuffered
if (closed_ && buf_.empty()) return recv_return(false);
// If there is a sender, directly receive the value we want
// from the sender. In case of a buffered channel, read from
// buffer and move front of send queue to the buffer
if (!sendq.empty()) {
std::shared_ptr<QueueMessage> m =
get_first_message(&sendq, ChannelAction::RECEIVE);
if (buf_.size() > 0) {
// Case 1 : Channel is Buffered
// Do Data transfer from front of buffer
// and add a QueueMessage to the buffer
*item = std::move(buf_.front());
buf_.pop_front();
// If first message from sendq is not null
// add it to the buffer and notify it
if (m != nullptr) {
// Copy to buffer
buf_.push_back(std::move(*(m->data)));
m->Notify();
} // Ignore if there is no first message
} else {
// Case 2: Channel is Unbuffered
// Do data transfer from front of SendQ
// If front is nullptr, then recursively call itself
if (m != nullptr) {
*item = std::move(*(m->data));
m->Notify();
} else {
return recv_return(Receive(item));
}
}
return recv_return(true);
}
// If this is a buffered channel and there are items in buffer
if (buf_.size() > 0) {
// Directly read from buffer
*item = std::move(buf_.front());
buf_.pop_front();
// return true
return recv_return(true);
}
// No sender available, block on this channel
// Some receiver will complete the option for us
auto m = std::make_shared<QueueMessage>(item);
recvq.push_back(m);
m->Wait(lock);
return recv_return(!m->chan_closed);
}
template <typename T>
void ChannelImpl<T>::Lock() {
mu_.lock();
}
template <typename T>
void ChannelImpl<T>::Unlock() {
mu_.unlock();
}
template <typename T>
bool ChannelImpl<T>::IsClosed() {
std::lock_guard<std::recursive_mutex> lock{mu_};
return closed_;
}
template <typename T>
void ChannelImpl<T>::Close() {
std::unique_lock<std::recursive_mutex> lock{mu_};
if (closed_) {
// TODO(abhinavarora): closing an already closed channel should panic
lock.unlock();
return;
}
closed_ = true;
// Empty the readers
while (!recvq.empty()) {
std::shared_ptr<QueueMessage> m = recvq.front();
recvq.pop_front();
m->chan_closed = true;
// Execute callback function (if any)
if (m->callback != nullptr) {
m->callback(ChannelAction::CLOSE);
}
m->Notify();
}
// Empty the senders
while (!sendq.empty()) {
std::shared_ptr<QueueMessage> m = sendq.front();
sendq.pop_front();
m->chan_closed = true;
// Execute callback function (if any)
if (m->callback != nullptr) {
m->callback(ChannelAction::CLOSE);
}
m->Notify();
}
}
template <typename T>
void ChannelImpl<T>::AddToSendQ(
const void *referrer, T *data,
std::shared_ptr<std::condition_variable_any> cond,
std::function<bool(ChannelAction)> cb) {
std::lock_guard<std::recursive_mutex> lock{mu_};
auto m = std::make_shared<QueueMessage>(data, cond);
m->referrer = referrer;
m->callback = cb;
sendq.push_back(m);
}
template <typename T>
void ChannelImpl<T>::AddToReceiveQ(
const void *referrer, T *data,
std::shared_ptr<std::condition_variable_any> cond,
std::function<bool(ChannelAction)> cb) {
std::lock_guard<std::recursive_mutex> lock{mu_};
auto m = std::make_shared<QueueMessage>(data, cond);
m->referrer = referrer;
m->callback = cb;
recvq.push_back(m);
}
template <typename T>
void ChannelImpl<T>::RemoveFromSendQ(const void *referrer) {
std::lock_guard<std::recursive_mutex> lock{mu_};
for (auto it = sendq.begin(); it != sendq.end();) {
std::shared_ptr<QueueMessage> sendMsg = (std::shared_ptr<QueueMessage>)*it;
if (sendMsg->referrer == referrer) {
it = sendq.erase(it);
} else {
++it;
}
}
}
template <typename T>
void ChannelImpl<T>::RemoveFromReceiveQ(const void *referrer) {
std::lock_guard<std::recursive_mutex> lock{mu_};
for (auto it = recvq.begin(); it != recvq.end();) {
std::shared_ptr<QueueMessage> recvMsg = (std::shared_ptr<QueueMessage>)*it;
if (recvMsg->referrer == referrer) {
it = recvq.erase(it);
} else {
++it;
}
}
}
template <typename T>
ChannelImpl<T>::~ChannelImpl() {
Close();
// The destructor must wait for all readers and writers to complete their task
// The channel has been closed, so we will not accept new readers and writers
std::unique_lock<std::recursive_mutex> lock{mu_};
destructor_cond_.wait(lock,
[this]() { return send_ctr == 0 && recv_ctr == 0; });
}
} // namespace framework
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/channel.h"
#include <chrono> // NOLINT
#include <thread> // NOLINT
#include "gtest/gtest.h"
using paddle::framework::Channel;
using paddle::framework::ChannelHolder;
using paddle::framework::MakeChannel;
using paddle::framework::CloseChannel;
TEST(Channel, ChannelCapacityTest) {
const size_t buffer_size = 10;
auto ch = MakeChannel<size_t>(buffer_size);
EXPECT_EQ(ch->Cap(), buffer_size);
CloseChannel(ch);
delete ch;
ch = MakeChannel<size_t>(0);
EXPECT_EQ(ch->Cap(), 0U);
CloseChannel(ch);
delete ch;
}
void RecevingOrderEqualToSendingOrder(Channel<int> *ch, int num_items) {
unsigned sum_send = 0;
std::thread t([&]() {
for (int i = 0; i < num_items; i++) {
ch->Send(&i);
sum_send += i;
}
});
std::this_thread::sleep_for(std::chrono::milliseconds(200));
for (int i = 0; i < num_items; i++) {
int recv = -1;
EXPECT_EQ(ch->Receive(&recv), true);
EXPECT_EQ(recv, i);
}
std::this_thread::sleep_for(std::chrono::milliseconds(200));
CloseChannel(ch);
t.join();
unsigned expected_sum = (num_items * (num_items - 1)) / 2;
EXPECT_EQ(sum_send, expected_sum);
delete ch;
}
TEST(Channel, SufficientBufferSizeDoesntBlock) {
const size_t buffer_size = 10;
auto ch = MakeChannel<size_t>(buffer_size);
for (size_t i = 0; i < buffer_size; ++i) {
ch->Send(&i);
}
size_t out;
for (size_t i = 0; i < buffer_size; ++i) {
EXPECT_EQ(ch->Receive(&out), true); // should not block
EXPECT_EQ(out, i);
}
CloseChannel(ch);
delete ch;
}
// This tests that a channel must return false
// on send and receive performed after closing the channel.
// Receive will only return false after close when queue is empty.
// By creating separate threads for sending and receiving, we make this
// function able to test both buffered and unbuffered channels.
void SendReceiveWithACloseChannelShouldPanic(Channel<size_t> *ch) {
const size_t data = 5;
std::thread send_thread{[&]() {
size_t i = data;
ch->Send(&i); // should not block
}};
std::thread recv_thread{[&]() {
size_t i;
EXPECT_EQ(ch->Receive(&i), true); // should not block
EXPECT_EQ(i, data);
}};
send_thread.join();
recv_thread.join();
// After closing send should panic. Receive should
// also false as there is no data in queue.
CloseChannel(ch);
send_thread = std::thread{[&]() {
size_t i = data;
bool is_exception = false;
try {
ch->Send(&i);
} catch (paddle::platform::EnforceNotMet e) {
is_exception = true;
}
EXPECT_EQ(is_exception, true);
}};
recv_thread = std::thread{[&]() {
size_t i;
// should return false because channel is closed and queue is empty
EXPECT_EQ(ch->Receive(&i), false);
}};
send_thread.join();
recv_thread.join();
}
TEST(Channel, SendReceiveClosedBufferedChannelPanics) {
size_t buffer_size = 10;
auto ch = MakeChannel<size_t>(buffer_size);
SendReceiveWithACloseChannelShouldPanic(ch);
delete ch;
}
TEST(Channel, SendReceiveClosedUnBufferedChannelPanics) {
auto ch = MakeChannel<size_t>(0);
SendReceiveWithACloseChannelShouldPanic(ch);
delete ch;
}
TEST(Channel, ReceiveFromBufferedChannelReturnResidualValuesTest) {
const size_t buffer_size = 10;
auto ch = MakeChannel<size_t>(buffer_size);
for (size_t i = 0; i < buffer_size; ++i) {
ch->Send(&i); // sending should not block
}
size_t out;
for (size_t i = 0; i < buffer_size / 2; ++i) {
EXPECT_EQ(ch->Receive(&out), true); // receiving should not block
EXPECT_EQ(out, i);
}
CloseChannel(ch);
for (size_t i = buffer_size / 2; i < buffer_size; ++i) {
EXPECT_EQ(ch->Receive(&out),
true); // receving should return residual values.
EXPECT_EQ(out, i);
}
for (size_t i = 0; i < buffer_size; ++i) {
EXPECT_EQ(ch->Receive(&out),
false); // receiving on closed channel should return false
}
delete ch;
}
TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) {
const size_t buffer_size = 10;
auto ch = MakeChannel<size_t>(buffer_size);
std::thread t([&]() {
// Try to write more than buffer size.
for (size_t i = 0; i < 2 * buffer_size; ++i) {
if (i < buffer_size) {
ch->Send(&i); // should block after 10 iterations
} else {
bool is_exception = false;
try {
ch->Send(&i);
} catch (paddle::platform::EnforceNotMet e) {
is_exception = true;
}
EXPECT_EQ(is_exception, true);
}
}
});
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec
CloseChannel(ch);
t.join();
delete ch;
}
TEST(Channel, RecevingOrderEqualToSendingOrderWithUnBufferedChannel) {
auto ch = MakeChannel<int>(0);
RecevingOrderEqualToSendingOrder(ch, 20);
}
TEST(Channel, RecevingOrderEqualToSendingOrderWithBufferedChannel1) {
// Test that Receive Order is same as Send Order when number of items
// sent is less than size of buffer
auto ch = MakeChannel<int>(10);
RecevingOrderEqualToSendingOrder(ch, 5);
}
TEST(Channel, RecevingOrderEqualToSendingOrderWithBufferedChannel2) {
// Test that Receive Order is same as Send Order when number of items
// sent is equal to size of buffer
auto ch = MakeChannel<int>(10);
RecevingOrderEqualToSendingOrder(ch, 10);
}
TEST(Channel, RecevingOrderEqualToSendingOrderWithBufferedChannel3) {
// Test that Receive Order is same as Send Order when number of items
// sent is greater than the size of buffer
auto ch = MakeChannel<int>(10);
RecevingOrderEqualToSendingOrder(ch, 20);
}
void ChannelCloseUnblocksReceiversTest(Channel<int> *ch) {
const size_t kNumThreads = 5;
std::thread t[kNumThreads];
bool thread_ended[kNumThreads];
// Launches threads that try to read and are blocked because of no writers
for (size_t i = 0; i < kNumThreads; i++) {
thread_ended[i] = false;
t[i] = std::thread(
[&](bool *p) {
int data;
EXPECT_EQ(ch->Receive(&data), false);
*p = true;
},
&thread_ended[i]);
}
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec
// Verify that all the threads are blocked
for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
// Explicitly close the channel
// This should unblock all receivers
CloseChannel(ch);
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec
// Verify that all threads got unblocked
for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
for (size_t i = 0; i < kNumThreads; i++) t[i].join();
}
void ChannelCloseUnblocksSendersTest(Channel<int> *ch, bool isBuffered) {
const size_t kNumThreads = 5;
std::thread t[kNumThreads];
bool thread_ended[kNumThreads];
bool send_success[kNumThreads];
// Launches threads that try to write and are blocked because of no readers
for (size_t i = 0; i < kNumThreads; i++) {
thread_ended[i] = false;
send_success[i] = false;
t[i] = std::thread(
[&](bool *ended, bool *success) {
int data = 10;
bool is_exception = false;
try {
ch->Send(&data);
} catch (paddle::platform::EnforceNotMet e) {
is_exception = true;
}
*success = !is_exception;
*ended = true;
},
&thread_ended[i], &send_success[i]);
}
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
if (isBuffered) {
// If ch is Buffered, atleast 4 threads must be blocked.
int ct = 0;
for (size_t i = 0; i < kNumThreads; i++) {
if (!thread_ended[i]) ct++;
}
EXPECT_GE(ct, 4);
} else {
// If ch is UnBuffered, all the threads should be blocked.
for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
}
// Explicitly close the thread
// This should unblock all senders
CloseChannel(ch);
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
// Verify that all threads got unblocked
for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
if (isBuffered) {
// Verify that only 1 send was successful
int ct = 0;
for (size_t i = 0; i < kNumThreads; i++) {
if (send_success[i]) ct++;
}
// Only 1 send must be successful
EXPECT_EQ(ct, 1);
}
for (size_t i = 0; i < kNumThreads; i++) t[i].join();
}
// This tests that closing a buffered channel also unblocks
// any receivers waiting on the channel
TEST(Channel, BufferedChannelCloseUnblocksReceiversTest) {
auto ch = MakeChannel<int>(1);
ChannelCloseUnblocksReceiversTest(ch);
delete ch;
}
// This tests that closing a buffered channel also unblocks
// any senders waiting for channel to have write space
TEST(Channel, BufferedChannelCloseUnblocksSendersTest) {
auto ch = MakeChannel<int>(1);
ChannelCloseUnblocksSendersTest(ch, true);
delete ch;
}
// This tests that closing an unbuffered channel also unblocks
// unblocks any receivers waiting for senders
TEST(Channel, UnbufferedChannelCloseUnblocksReceiversTest) {
auto ch = MakeChannel<int>(0);
ChannelCloseUnblocksReceiversTest(ch);
delete ch;
}
// This tests that closing an unbuffered channel also unblocks
// unblocks any senders waiting for senders
TEST(Channel, UnbufferedChannelCloseUnblocksSendersTest) {
auto ch = MakeChannel<int>(0);
ChannelCloseUnblocksSendersTest(ch, false);
delete ch;
}
TEST(Channel, UnbufferedLessReceiveMoreSendTest) {
auto ch = MakeChannel<int>(0);
unsigned sum_send = 0;
// Send should block after three iterations
// since we only have three receivers.
std::thread t([&]() {
// Try to send more number of times
// than receivers
for (int i = 0; i < 4; i++) {
try {
ch->Send(&i);
sum_send += i;
} catch (paddle::platform::EnforceNotMet e) {
}
}
});
for (int i = 0; i < 3; i++) {
int recv;
ch->Receive(&recv);
EXPECT_EQ(recv, i);
}
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec
EXPECT_EQ(sum_send, 3U);
CloseChannel(ch);
t.join();
delete ch;
}
TEST(Channel, UnbufferedMoreReceiveLessSendTest) {
auto ch = MakeChannel<int>(0);
unsigned sum_send = 0;
unsigned sum_receive = 0;
// The receiver should block after 5
// iterations, since there are only 5 senders.
std::thread t([&]() {
for (int i = 0; i < 8; i++) {
int recv;
ch->Receive(&recv); // should block after the fifth iteration.
EXPECT_EQ(recv, i);
sum_receive += i;
}
});
for (int i = 0; i < 5; i++) {
ch->Send(&i);
sum_send += i;
}
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec
EXPECT_EQ(sum_send, 10U);
EXPECT_EQ(sum_receive, 10U);
// send three more elements
for (int i = 5; i < 8; i++) {
ch->Send(&i);
sum_send += i;
}
CloseChannel(ch);
t.join();
EXPECT_EQ(sum_send, 28U);
EXPECT_EQ(sum_receive, 28U);
delete ch;
}
// This tests that destroying a channel unblocks
// any senders waiting for channel to have write space
void ChannelDestroyUnblockSenders(Channel<int> *ch, bool isBuffered) {
const size_t kNumThreads = 5;
std::thread t[kNumThreads];
bool thread_ended[kNumThreads];
bool send_success[kNumThreads];
// Launches threads that try to write and are blocked because of no readers
for (size_t i = 0; i < kNumThreads; i++) {
thread_ended[i] = false;
send_success[i] = false;
t[i] = std::thread(
[&](bool *ended, bool *success) {
int data = 10;
bool is_exception = false;
try {
ch->Send(&data);
} catch (paddle::platform::EnforceNotMet e) {
is_exception = true;
}
*success = !is_exception;
*ended = true;
},
&thread_ended[i], &send_success[i]);
}
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec
if (isBuffered) {
// If channel is buffered, verify that atleast 4 threads are blocked
int ct = 0;
for (size_t i = 0; i < kNumThreads; i++) {
if (thread_ended[i] == false) ct++;
}
// Atleast 4 threads must be blocked
EXPECT_GE(ct, 4);
} else {
// Verify that all the threads are blocked
for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
}
// Explicitly destroy the channel
delete ch;
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
// Verify that all threads got unblocked
for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
// Count number of successful sends
int ct = 0;
for (size_t i = 0; i < kNumThreads; i++) {
if (send_success[i]) ct++;
}
if (isBuffered) {
// Only 1 send must be successful
EXPECT_EQ(ct, 1);
} else {
// In unbuffered channel, no send should be successful
EXPECT_EQ(ct, 0);
}
// Join all threads
for (size_t i = 0; i < kNumThreads; i++) t[i].join();
}
// This tests that destroying a channel also unblocks
// any receivers waiting on the channel
void ChannelDestroyUnblockReceivers(Channel<int> *ch) {
const size_t kNumThreads = 5;
std::thread t[kNumThreads];
bool thread_ended[kNumThreads];
// Launches threads that try to read and are blocked because of no writers
for (size_t i = 0; i < kNumThreads; i++) {
thread_ended[i] = false;
t[i] = std::thread(
[&](bool *p) {
int data;
// All reads should return false
EXPECT_EQ(ch->Receive(&data), false);
*p = true;
},
&thread_ended[i]);
}
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait
// Verify that all threads are blocked
for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
// delete the channel
delete ch;
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
// Verify that all threads got unblocked
for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
for (size_t i = 0; i < kNumThreads; i++) t[i].join();
}
TEST(Channel, BufferedChannelDestroyUnblocksReceiversTest) {
size_t buffer_size = 1;
auto ch = MakeChannel<int>(buffer_size);
ChannelDestroyUnblockReceivers(ch);
}
TEST(Channel, BufferedChannelDestroyUnblocksSendersTest) {
size_t buffer_size = 1;
auto ch = MakeChannel<int>(buffer_size);
ChannelDestroyUnblockSenders(ch, true);
}
// This tests that destroying an unbuffered channel also unblocks
// unblocks any receivers waiting for senders
TEST(Channel, UnbufferedChannelDestroyUnblocksReceiversTest) {
auto ch = MakeChannel<int>(0);
ChannelDestroyUnblockReceivers(ch);
}
TEST(Channel, UnbufferedChannelDestroyUnblocksSendersTest) {
auto ch = MakeChannel<int>(0);
ChannelDestroyUnblockSenders(ch, false);
}
TEST(ChannelHolder, ChannelHolderCapacityTest) {
const size_t buffer_size = 10;
ChannelHolder *ch = new ChannelHolder();
ch->Reset<int>(buffer_size);
EXPECT_EQ(ch->Cap(), buffer_size);
delete ch;
ch = new ChannelHolder();
ch->Reset<int>(0);
EXPECT_EQ(ch->Cap(), 0U);
delete ch;
}
void ChannelHolderSendReceive(ChannelHolder *ch) {
unsigned sum_send = 0;
std::thread t([&]() {
for (int i = 0; i < 5; i++) {
ch->Send(&i);
sum_send += i;
}
});
for (int i = 0; i < 5; i++) {
int recv;
EXPECT_EQ(ch->Receive(&recv), true);
EXPECT_EQ(recv, i);
}
ch->close();
t.join();
EXPECT_EQ(sum_send, 10U);
}
TEST(ChannelHolder, ChannelHolderBufferedSendReceiveTest) {
ChannelHolder *ch = new ChannelHolder();
ch->Reset<int>(10);
ChannelHolderSendReceive(ch);
delete ch;
}
TEST(ChannelHolder, ChannelHolderUnBufferedSendReceiveTest) {
ChannelHolder *ch = new ChannelHolder();
ch->Reset<int>(0);
ChannelHolderSendReceive(ch);
delete ch;
}
TEST(ChannelHolder, ChannelUninitializedTest) {
ChannelHolder *ch = new ChannelHolder();
EXPECT_EQ(ch->IsInitialized(), false);
int i = 10;
bool send_exception = false;
try {
ch->Send(&i);
} catch (paddle::platform::EnforceNotMet e) {
send_exception = true;
}
EXPECT_EQ(send_exception, true);
bool recv_exception = false;
try {
ch->Receive(&i);
} catch (paddle::platform::EnforceNotMet e) {
recv_exception = true;
}
EXPECT_EQ(recv_exception, true);
bool is_exception = false;
try {
ch->Type();
} catch (paddle::platform::EnforceNotMet e) {
is_exception = true;
}
EXPECT_EQ(is_exception, true);
delete ch;
}
TEST(ChannelHolder, ChannelInitializedTest) {
ChannelHolder *ch = new ChannelHolder();
ch->Reset<int>(2);
EXPECT_EQ(ch->IsInitialized(), true);
// Channel should remain intialized even after close
ch->close();
EXPECT_EQ(ch->IsInitialized(), true);
delete ch;
}
TEST(ChannelHolder, TypeMismatchSendTest) {
// Test with unbuffered channel
ChannelHolder *ch = new ChannelHolder();
ch->Reset<int>(0);
bool is_exception = false;
bool boolean_data = true;
try {
ch->Send(&boolean_data);
} catch (paddle::platform::EnforceNotMet e) {
is_exception = true;
}
EXPECT_EQ(is_exception, true);
delete ch;
// Test with Buffered Channel
ch = new ChannelHolder();
ch->Reset<float>(10);
is_exception = false;
int int_data = 23;
try {
ch->Send(&int_data);
} catch (paddle::platform::EnforceNotMet e) {
is_exception = true;
}
EXPECT_EQ(is_exception, true);
delete ch;
}
TEST(ChannelHolder, TypeMismatchReceiveTest) {
// Test with unbuffered channel
ChannelHolder *ch = new ChannelHolder();
ch->Reset<int>(0);
bool is_exception = false;
bool float_data;
try {
ch->Receive(&float_data);
} catch (paddle::platform::EnforceNotMet e) {
is_exception = true;
}
EXPECT_EQ(is_exception, true);
delete ch;
// Test with Buffered Channel
ch = new ChannelHolder();
ch->Reset<float>(10);
is_exception = false;
int int_data = 23;
try {
ch->Receive(&int_data);
} catch (paddle::platform::EnforceNotMet e) {
is_exception = true;
}
EXPECT_EQ(is_exception, true);
delete ch;
}
void ChannelHolderCloseUnblocksReceiversTest(ChannelHolder *ch) {
const size_t kNumThreads = 5;
std::thread t[kNumThreads];
bool thread_ended[kNumThreads];
// Launches threads that try to read and are blocked because of no writers
for (size_t i = 0; i < kNumThreads; i++) {
thread_ended[i] = false;
t[i] = std::thread(
[&](bool *p) {
int data;
EXPECT_EQ(ch->Receive(&data), false);
*p = true;
},
&thread_ended[i]);
}
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec
// Verify that all the threads are blocked
for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
// Explicitly close the channel
// This should unblock all receivers
ch->close();
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec
// Verify that all threads got unblocked
for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
for (size_t i = 0; i < kNumThreads; i++) t[i].join();
}
void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) {
const size_t kNumThreads = 5;
std::thread t[kNumThreads];
bool thread_ended[kNumThreads];
bool send_success[kNumThreads];
// Launches threads that try to write and are blocked because of no readers
for (size_t i = 0; i < kNumThreads; i++) {
thread_ended[i] = false;
send_success[i] = false;
t[i] = std::thread(
[&](bool *ended, bool *success) {
int data = 10;
bool is_exception = false;
try {
ch->Send(&data);
} catch (paddle::platform::EnforceNotMet e) {
is_exception = true;
}
*success = !is_exception;
*ended = true;
},
&thread_ended[i], &send_success[i]);
}
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
if (isBuffered) {
// If ch is Buffered, atleast 4 threads must be blocked.
int ct = 0;
for (size_t i = 0; i < kNumThreads; i++) {
if (!thread_ended[i]) ct++;
}
EXPECT_GE(ct, 4);
} else {
// If ch is UnBuffered, all the threads should be blocked.
for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
}
// Explicitly close the thread
// This should unblock all senders
ch->close();
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
// Verify that all threads got unblocked
for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
if (isBuffered) {
// Verify that only 1 send was successful
int ct = 0;
for (size_t i = 0; i < kNumThreads; i++) {
if (send_success[i]) ct++;
}
// Only 1 send must be successful
EXPECT_EQ(ct, 1);
}
for (size_t i = 0; i < kNumThreads; i++) t[i].join();
}
// This tests that closing a channelholder unblocks
// any receivers waiting on the channel
TEST(ChannelHolder, ChannelHolderCloseUnblocksReceiversTest) {
// Check for buffered channel
ChannelHolder *ch = new ChannelHolder();
ch->Reset<int>(1);
ChannelHolderCloseUnblocksReceiversTest(ch);
delete ch;
// Check for unbuffered channel
ch = new ChannelHolder();
ch->Reset<int>(0);
ChannelHolderCloseUnblocksReceiversTest(ch);
delete ch;
}
// This tests that closing a channelholder unblocks
// any senders waiting for channel to have write space
TEST(Channel, ChannelHolderCloseUnblocksSendersTest) {
// Check for buffered channel
ChannelHolder *ch = new ChannelHolder();
ch->Reset<int>(1);
ChannelHolderCloseUnblocksSendersTest(ch, true);
delete ch;
// Check for unbuffered channel
ch = new ChannelHolder();
ch->Reset<int>(0);
ChannelHolderCloseUnblocksSendersTest(ch, false);
delete ch;
}
// This tests that destroying a channelholder unblocks
// any senders waiting for channel
void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) {
const size_t kNumThreads = 5;
std::thread t[kNumThreads];
bool thread_ended[kNumThreads];
bool send_success[kNumThreads];
// Launches threads that try to write and are blocked because of no readers
for (size_t i = 0; i < kNumThreads; i++) {
thread_ended[i] = false;
send_success[i] = false;
t[i] = std::thread(
[&](bool *ended, bool *success) {
int data = 10;
bool is_exception = false;
try {
ch->Send(&data);
} catch (paddle::platform::EnforceNotMet e) {
is_exception = true;
}
*success = !is_exception;
*ended = true;
},
&thread_ended[i], &send_success[i]);
}
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec
if (isBuffered) {
// If channel is buffered, verify that atleast 4 threads are blocked
int ct = 0;
for (size_t i = 0; i < kNumThreads; i++) {
if (thread_ended[i] == false) ct++;
}
// Atleast 4 threads must be blocked
EXPECT_GE(ct, 4);
} else {
// Verify that all the threads are blocked
for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
}
// Explicitly destroy the channel
delete ch;
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
// Verify that all threads got unblocked
for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
// Count number of successfuld sends
int ct = 0;
for (size_t i = 0; i < kNumThreads; i++) {
if (send_success[i]) ct++;
}
if (isBuffered) {
// Only 1 send must be successful
EXPECT_EQ(ct, 1);
} else {
// In unbuffered channel, no send should be successful
EXPECT_EQ(ct, 0);
}
// Join all threads
for (size_t i = 0; i < kNumThreads; i++) t[i].join();
}
// This tests that destroying a channelholder also unblocks
// any receivers waiting on the channel
void ChannelHolderDestroyUnblockReceivers(ChannelHolder *ch) {
const size_t kNumThreads = 5;
std::thread t[kNumThreads];
bool thread_ended[kNumThreads];
// Launches threads that try to read and are blocked because of no writers
for (size_t i = 0; i < kNumThreads; i++) {
thread_ended[i] = false;
t[i] = std::thread(
[&](bool *p) {
int data;
// All reads should return false
EXPECT_EQ(ch->Receive(&data), false);
*p = true;
},
&thread_ended[i]);
}
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
// Verify that all threads are blocked
for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
// delete the channel
delete ch;
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
// Verify that all threads got unblocked
for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
for (size_t i = 0; i < kNumThreads; i++) t[i].join();
}
TEST(ChannelHolder, ChannelHolderDestroyUnblocksReceiversTest) {
// Check for Buffered Channel
ChannelHolder *ch = new ChannelHolder();
ch->Reset<int>(1);
ChannelHolderDestroyUnblockReceivers(ch);
// ch is already deleted already deleted in
// ChannelHolderDestroyUnblockReceivers
// Check for Unbuffered channel
ch = new ChannelHolder();
ch->Reset<int>(0);
ChannelHolderDestroyUnblockReceivers(ch);
}
TEST(ChannelHolder, ChannelHolderDestroyUnblocksSendersTest) {
// Check for Buffered Channel
ChannelHolder *ch = new ChannelHolder();
ch->Reset<int>(1);
ChannelHolderDestroyUnblockSenders(ch, true);
// ch is already deleted already deleted in
// ChannelHolderDestroyUnblockReceivers
// Check for Unbuffered channel
ch = new ChannelHolder();
ch->Reset<int>(0);
ChannelHolderDestroyUnblockSenders(ch, false);
}
// This tests that closing a channelholder many times.
void ChannelHolderManyTimesClose(ChannelHolder *ch) {
const int kNumThreads = 15;
std::thread t[kNumThreads];
bool thread_ended[kNumThreads];
// Launches threads that try to send data to channel.
for (size_t i = 0; i < kNumThreads / 3; i++) {
thread_ended[i] = false;
t[i] = std::thread(
[&](bool *ended) {
int data = 10;
ch->Send(&data);
*ended = true;
},
&thread_ended[i]);
}
// Launches threads that try to receive data to channel.
for (size_t i = kNumThreads / 3; i < 2 * kNumThreads / 3; i++) {
thread_ended[i] = false;
t[i] = std::thread(
[&](bool *p) {
int data;
if (ch->Receive(&data)) {
EXPECT_EQ(data, 10);
}
*p = true;
},
&thread_ended[i]);
}
// Launches threads that try to close the channel.
for (size_t i = 2 * kNumThreads / 3; i < kNumThreads; i++) {
thread_ended[i] = false;
t[i] = std::thread(
[&](bool *p) {
if (!ch->IsClosed()) {
ch->close();
}
*p = true;
},
&thread_ended[i]);
}
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait
// Verify that all threads are unblocked
for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
EXPECT_TRUE(ch->IsClosed());
// delete the channel
delete ch;
for (size_t i = 0; i < kNumThreads; i++) t[i].join();
}
TEST(ChannelHolder, ChannelHolderManyTimesCloseTest) {
// Check for Buffered Channel
ChannelHolder *ch = new ChannelHolder();
ch->Reset<int>(10);
ChannelHolderManyTimesClose(ch);
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <thread> // NOLINT
#include "gtest/gtest.h"
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/channel.h"
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/op_registry.h"
USE_NO_KERNEL_OP(go);
USE_NO_KERNEL_OP(channel_close);
USE_NO_KERNEL_OP(channel_create);
USE_NO_KERNEL_OP(channel_recv);
USE_NO_KERNEL_OP(channel_send);
USE_NO_KERNEL_OP(elementwise_add);
USE_NO_KERNEL_OP(select);
USE_NO_KERNEL_OP(conditional_block);
USE_NO_KERNEL_OP(equal);
USE_NO_KERNEL_OP(assign);
USE_NO_KERNEL_OP(while);
USE_NO_KERNEL_OP(print);
namespace f = paddle::framework;
namespace p = paddle::platform;
namespace paddle {
namespace framework {
template <typename T>
LoDTensor *CreateVariable(Scope *scope, const p::CPUPlace &place,
std::string name, T value) {
// Create LoDTensor<int> of dim [1]
auto var = scope->Var(name);
auto tensor = var->GetMutable<LoDTensor>();
tensor->Resize({1});
T *expect = tensor->mutable_data<T>(place);
expect[0] = value;
return tensor;
}
void AddOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, AttributeMap attrs,
BlockDesc *block) {
// insert op
auto op = block->AppendOp();
op->SetType(type);
for (auto &kv : inputs) {
op->SetInput(kv.first, kv.second);
}
for (auto &kv : outputs) {
op->SetOutput(kv.first, kv.second);
}
op->SetAttrMap(attrs);
}
void AddCase(ProgramDesc *program, Scope *scope, p::CPUPlace *place,
BlockDesc *casesBlock, int caseId, int caseType,
std::string caseChannel, std::string caseVarName,
std::function<void(BlockDesc *, Scope *)> func) {
std::string caseCondName = std::string("caseCond") + std::to_string(caseId);
std::string caseCondXVarName =
std::string("caseCondX") + std::to_string(caseId);
BlockDesc *caseBlock = program->AppendBlock(*casesBlock);
func(caseBlock, scope);
CreateVariable(scope, *place, caseCondName, false);
CreateVariable(scope, *place, caseCondXVarName, caseId);
CreateVariable(scope, *place, caseVarName, caseId);
scope->Var("step_scope");
AddOp("equal", {{"X", {caseCondXVarName}}, {"Y", {"caseToExecute"}}},
{{"Out", {caseCondName}}}, {}, casesBlock);
AddOp("conditional_block", {{"X", {caseCondName}}, {"Params", {}}},
{{"Out", {}}, {"Scope", {"step_scope"}}},
{{"sub_block", caseBlock}, {"is_scalar_condition", true}}, casesBlock);
}
void AddFibonacciSelect(Scope *scope, p::CPUPlace *place, ProgramDesc *program,
BlockDesc *parentBlock, std::string dataChanName,
std::string quitChanName) {
BlockDesc *whileBlock = program->AppendBlock(*parentBlock);
CreateVariable(scope, *place, "whileExitCond", true);
CreateVariable(scope, *place, "caseToExecute", -1);
CreateVariable(scope, *place, "case1var", 0);
CreateVariable(scope, *place, "xtemp", 0);
// TODO(thuan): Need to create fibXToSend, since channel send moves the actual
// data,
// which causes the data to be no longer accessible to do the fib calculation
// TODO(abhinav): Change channel send to do a copy instead of a move!
CreateVariable(scope, *place, "fibXToSend", 0);
CreateVariable(scope, *place, "fibX", 0);
CreateVariable(scope, *place, "fibY", 1);
CreateVariable(scope, *place, "quitVar", 0);
BlockDesc *casesBlock = program->AppendBlock(*whileBlock);
std::function<void(BlockDesc * caseBlock)> f = [](BlockDesc *caseBlock) {};
// TODO(thuan): Remove this once we change channel send to do a copy instead
// of move
AddOp("assign", {{"X", {"fibX"}}}, {{"Out", {"fibXToSend"}}}, {}, whileBlock);
// Case 0: Send to dataChanName
std::function<void(BlockDesc * caseBlock, Scope * scope)> case0Func = [&](
BlockDesc *caseBlock, Scope *scope) {
AddOp("assign", {{"X", {"fibX"}}}, {{"Out", {"xtemp"}}}, {}, caseBlock);
AddOp("assign", {{"X", {"fibY"}}}, {{"Out", {"fibX"}}}, {}, caseBlock);
AddOp("elementwise_add", {{"X", {"xtemp"}}, {"Y", {"fibY"}}},
{{"Out", {"fibY"}}}, {}, caseBlock);
};
AddCase(program, scope, place, casesBlock, 0, 1, dataChanName, "fibXToSend",
case0Func);
std::string case0Config =
std::string("0,1,") + dataChanName + std::string(",fibXToSend");
// Case 1: Receive from quitChanName
std::function<void(BlockDesc * caseBlock, Scope * scope)> case2Func = [&](
BlockDesc *caseBlock, Scope *scope) {
// Exit the while loop after we receive from quit channel.
// We assign a false to "whileExitCond" variable, which will
// break out of while_op loop
CreateVariable(scope, *place, "whileFalse", false);
AddOp("assign", {{"X", {"whileFalse"}}}, {{"Out", {"whileExitCond"}}}, {},
caseBlock);
};
AddCase(program, scope, place, casesBlock, 1, 2, quitChanName, "quitVar",
case2Func);
std::string case1Config =
std::string("1,2,") + quitChanName + std::string(",quitVar");
// Select block
AddOp("select", {{"X", {dataChanName, quitChanName}},
{"case_to_execute", {"caseToExecute"}}},
{{"Out", {}}},
{{"sub_block", casesBlock},
{"cases", std::vector<std::string>{case0Config, case1Config}}},
whileBlock);
scope->Var("stepScopes");
AddOp("while",
{{"X", {dataChanName, quitChanName}}, {"Condition", {"whileExitCond"}}},
{{"Out", {}}, {"StepScopes", {"stepScopes"}}},
{{"sub_block", whileBlock}}, parentBlock);
}
TEST(Concurrency, Go_Op) {
Scope scope;
p::CPUPlace place;
// Initialize scope variables
p::CPUDeviceContext ctx(place);
// Create channel variable
scope.Var("Channel");
// Create Variables, x0 will be put into channel,
// result will be pulled from channel
CreateVariable(&scope, place, "Status", false);
CreateVariable(&scope, place, "x0", 99);
CreateVariable(&scope, place, "result", 0);
framework::Executor executor(place);
ProgramDesc program;
BlockDesc *block = program.MutableBlock(0);
// Create channel OP
AddOp("channel_create", {}, {{"Out", {"Channel"}}},
{{"capacity", 10}, {"data_type", f::proto::VarType::LOD_TENSOR}},
block);
// Create Go Op routine
BlockDesc *goOpBlock = program.AppendBlock(program.Block(0));
AddOp("channel_send", {{"Channel", {"Channel"}}, {"X", {"x0"}}},
{{"Status", {"Status"}}}, {}, goOpBlock);
// Create Go Op
AddOp("go", {{"X", {"Channel", "x0"}}}, {}, {{"sub_block", goOpBlock}},
block);
// Create Channel Receive Op
AddOp("channel_recv", {{"Channel", {"Channel"}}},
{{"Status", {"Status"}}, {"Out", {"result"}}}, {}, block);
// Create Channel Close Op
AddOp("channel_close", {{"Channel", {"Channel"}}}, {}, {}, block);
// Check the result tensor to make sure it is set to 0
const LoDTensor &tensor = (scope.FindVar("result"))->Get<LoDTensor>();
auto *initialData = tensor.data<int>();
EXPECT_EQ(initialData[0], 0);
executor.Run(program, &scope, 0, true, true);
// After we call executor.run, the Go operator should do a channel_send to
// set the "result" variable to 99.
auto *finalData = tensor.data<int>();
EXPECT_EQ(finalData[0], 99);
}
/**
* This test implements the fibonacci function using go_op and select_op
*/
TEST(Concurrency, Select) {
Scope scope;
p::CPUPlace place;
// Initialize scope variables
p::CPUDeviceContext ctx(place);
CreateVariable(&scope, place, "Status", false);
CreateVariable(&scope, place, "result", 0);
CreateVariable(&scope, place, "currentXFib", 0);
framework::Executor executor(place);
ProgramDesc program;
BlockDesc *block = program.MutableBlock(0);
// Create channel OP
std::string dataChanName = "Channel";
scope.Var(dataChanName);
AddOp("channel_create", {}, {{"Out", {dataChanName}}},
{{"capacity", 0}, {"data_type", f::proto::VarType::LOD_TENSOR}}, block);
std::string quitChanName = "Quit";
scope.Var(quitChanName);
AddOp("channel_create", {}, {{"Out", {quitChanName}}},
{{"capacity", 0}, {"data_type", f::proto::VarType::LOD_TENSOR}}, block);
// Create Go Op routine, which loops 10 times over fibonacci sequence
CreateVariable(&scope, place, "xReceiveVar", 0);
BlockDesc *goOpBlock = program.AppendBlock(program.Block(0));
for (int i = 0; i < 10; ++i) {
AddOp("channel_recv", {{"Channel", {dataChanName}}},
{{"Status", {"Status"}}, {"Out", {"currentXFib"}}}, {}, goOpBlock);
AddOp("print", {{"In", {"currentXFib"}}}, {{"Out", {"currentXFib"}}},
{{"first_n", 100},
{"summarize", -1},
{"print_tensor_name", false},
{"print_tensor_type", true},
{"print_tensor_shape", false},
{"print_tensor_lod", false},
{"print_phase", std::string("FORWARD")},
{"message", std::string("X: ")}},
goOpBlock);
}
CreateVariable(&scope, place, "quitSignal", 0);
AddOp("channel_send", {{"Channel", {quitChanName}}, {"X", {"quitSignal"}}},
{{"Status", {"Status"}}}, {}, goOpBlock);
// Create Go Op
AddOp("go", {{"X", {dataChanName, quitChanName}}}, {},
{{"sub_block", goOpBlock}}, block);
AddFibonacciSelect(&scope, &place, &program, block, dataChanName,
quitChanName);
// Create Channel Close Op
AddOp("channel_close", {{"Channel", {dataChanName}}}, {}, {}, block);
AddOp("channel_close", {{"Channel", {quitChanName}}}, {}, {}, block);
executor.Run(program, &scope, 0, true, true);
// After we call executor.run, "result" variable should be equal to 34
// (which is 10 loops through fibonacci sequence)
const LoDTensor &tensor = (scope.FindVar("currentXFib"))->Get<LoDTensor>();
auto *finalData = tensor.data<int>();
EXPECT_EQ(finalData[0], 34);
}
} // namespace framework
} // namespace paddle
...@@ -54,3 +54,8 @@ cc_library(scope_buffered_ssa_graph_executor SRCS scope_buffered_ssa_graph_execu ...@@ -54,3 +54,8 @@ cc_library(scope_buffered_ssa_graph_executor SRCS scope_buffered_ssa_graph_execu
# device_context reduce_op_handle ) # device_context reduce_op_handle )
cc_library(fast_threaded_ssa_graph_executor SRCS fast_threaded_ssa_graph_executor.cc cc_library(fast_threaded_ssa_graph_executor SRCS fast_threaded_ssa_graph_executor.cc
DEPS fetch_op_handle ssa_graph_executor scope simple_threadpool device_context) DEPS fetch_op_handle ssa_graph_executor scope simple_threadpool device_context)
cc_library(build_strategy SRCS build_strategy.cc DEPS
graph_viz_pass multi_devices_graph_pass
multi_devices_graph_print_pass multi_devices_graph_check_pass
fuse_elewise_add_act_pass)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/details/build_strategy.h"
#include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h"
#include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
namespace paddle {
namespace framework {
namespace details {
class ParallelExecutorPassBuilder : public ir::PassBuilder {
public:
explicit ParallelExecutorPassBuilder(const BuildStrategy &strategy)
: ir::PassBuilder(), strategy_(strategy) {
// Add a graph viz pass to record a graph.
if (!strategy_.debug_graphviz_path_.empty()) {
auto viz_pass = AppendPass("graph_viz_pass");
const std::string graph_path = string::Sprintf(
"%s%s", strategy_.debug_graphviz_path_.c_str(), "_original_graph");
viz_pass->Set<std::string>("graph_viz_path", new std::string(graph_path));
}
// Add op fusion.
if (strategy.fuse_elewise_add_act_ops_) {
auto fuse_elewise_add_act_pass = AppendPass("fuse_elewise_add_act_pass");
// Add a graph viz pass to record a graph.
if (!strategy.debug_graphviz_path_.empty()) {
auto viz_pass = AppendPass("graph_viz_pass");
const std::string graph_path = string::Sprintf(
"%s%s", strategy.debug_graphviz_path_.c_str(), "_fused_graph");
viz_pass->Set<std::string>("graph_viz_path",
new std::string(graph_path));
}
}
// Convert graph to run on multi-devices.
auto multi_devices_pass = AppendPass("multi_devices_pass");
multi_devices_pass->SetNotOwned<const BuildStrategy>("strategy",
&strategy_);
// Add a graph print pass to record a graph with device info.
if (!strategy_.debug_graphviz_path_.empty()) {
auto multi_devices_print_pass = AppendPass("multi_devices_print_pass");
multi_devices_print_pass->SetNotOwned<const std::string>(
"debug_graphviz_path", &strategy_.debug_graphviz_path_);
multi_devices_print_pass->Set<details::GraphvizSSAGraphPrinter>(
"graph_printer", new details::GraphvizSSAGraphPrinter);
}
// Verify that the graph is correct for multi-device executor.
AppendPass("multi_devices_check_pass");
}
private:
BuildStrategy strategy_;
};
std::shared_ptr<ir::PassBuilder> BuildStrategy::CreatePassesFromStrategy()
const {
pass_builder_.reset(new ParallelExecutorPassBuilder(*this));
return pass_builder_;
}
std::unique_ptr<ir::Graph> BuildStrategy::Apply(
const ProgramDesc &main_program, const std::vector<platform::Place> &places,
const std::string &loss_var_name,
const std::unordered_set<std::string> &param_names,
const std::vector<Scope *> &local_scopes,
#ifdef PADDLE_WITH_CUDA
const bool use_cuda, platform::NCCLContextMap *nccl_ctxs) const {
#else
const bool use_cuda) const {
#endif
// Create a default one if not initialized by user.
if (!pass_builder_) {
CreatePassesFromStrategy();
}
std::unique_ptr<ir::Graph> graph(new ir::Graph(main_program));
for (std::shared_ptr<ir::Pass> &pass : pass_builder_->AllPasses()) {
if (pass->Type() == "multi_devices_pass") {
pass->Erase("places");
pass->SetNotOwned<const std::vector<platform::Place>>("places", &places);
pass->Erase("loss_var_name");
pass->SetNotOwned<const std::string>("loss_var_name", &loss_var_name);
pass->Erase("params");
pass->SetNotOwned<const std::unordered_set<std::string>>("params",
&param_names);
pass->Erase("local_scopes");
pass->SetNotOwned<const std::vector<Scope *>>("local_scopes",
&local_scopes);
#ifdef PADDLE_WITH_CUDA
platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr;
pass->Erase("nccl_ctxs");
pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx);
#endif
}
graph = pass->Apply(std::move(graph));
}
return graph;
}
} // namespace details
} // namespace framework
} // namespace paddle
USE_PASS(fuse_elewise_add_act_pass);
USE_PASS(graph_viz_pass);
USE_PASS(multi_devices_pass);
USE_PASS(multi_devices_check_pass);
USE_PASS(multi_devices_print_pass);
...@@ -15,6 +15,17 @@ ...@@ -15,6 +15,17 @@
#pragma once #pragma once
#include <string> #include <string>
#include <vector>
#include "paddle/fluid/framework/ir/pass_builder.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/nccl_helper.h"
#endif
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -57,6 +68,30 @@ struct BuildStrategy { ...@@ -57,6 +68,30 @@ struct BuildStrategy {
bool fuse_elewise_add_act_ops_{false}; bool fuse_elewise_add_act_ops_{false};
bool enable_data_balance_{false}; bool enable_data_balance_{false};
// User normally doesn't need to call this API.
// The PassBuilder allows for more customized insert, remove of passes
// from python side.
// A new PassBuilder is created based on configs defined above and
// passes are owned by the PassBuilder.
std::shared_ptr<ir::PassBuilder> CreatePassesFromStrategy() const;
// Apply the passes built by the pass_builder_. The passes will be
// applied to the Program and output an ir::Graph.
std::unique_ptr<ir::Graph> Apply(
const ProgramDesc &main_program,
const std::vector<platform::Place> &places,
const std::string &loss_var_name,
const std::unordered_set<std::string> &param_names,
const std::vector<Scope *> &local_scopes,
#ifdef PADDLE_WITH_CUDA
const bool use_cuda, platform::NCCLContextMap *nccl_ctxs) const;
#else
const bool use_cuda) const;
#endif
private:
mutable std::shared_ptr<ir::PassBuilder> pass_builder_;
}; };
} // namespace details } // namespace details
......
...@@ -80,15 +80,15 @@ std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl( ...@@ -80,15 +80,15 @@ std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
// This is weird but there is really some variables without var_desc // This is weird but there is really some variables without var_desc
// in computation_op // in computation_op
if (var_desc == nullptr) { if (var_desc == nullptr) {
if (compute_op->Node()->Op()->Block()->FindVar(var_name) == nullptr) var_desc = compute_op->Node()->Op()->Block()->FindVar(var_name);
continue; if (var_desc == nullptr) continue;
} else { }
if (var_desc->Persistable()) continue;
auto var_type = var_desc->Proto()->type().type(); if (var_desc->Persistable()) continue;
if (var_type != proto::VarType::LOD_TENSOR && auto var_type = var_desc->Proto()->type().type();
var_type != proto::VarType::SELECTED_ROWS) { if (var_type != proto::VarType::LOD_TENSOR &&
continue; var_type != proto::VarType::SELECTED_ROWS) {
} continue;
} }
// compute op only runs in one device // compute op only runs in one device
......
...@@ -14,7 +14,6 @@ limitations under the License. */ ...@@ -14,7 +14,6 @@ limitations under the License. */
#include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/channel.h"
#include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/lod_rank_table.h" #include "paddle/fluid/framework/lod_rank_table.h"
#include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/lod_tensor_array.h"
...@@ -76,15 +75,13 @@ void InitializeVariable(Variable* var, proto::VarType::Type var_type) { ...@@ -76,15 +75,13 @@ void InitializeVariable(Variable* var, proto::VarType::Type var_type) {
var->GetMutable<platform::PlaceList>(); var->GetMutable<platform::PlaceList>();
} else if (var_type == proto::VarType::READER) { } else if (var_type == proto::VarType::READER) {
var->GetMutable<ReaderHolder>(); var->GetMutable<ReaderHolder>();
} else if (var_type == proto::VarType::CHANNEL) {
var->GetMutable<ChannelHolder>();
} else if (var_type == proto::VarType::RAW) { } else if (var_type == proto::VarType::RAW) {
// GetMutable will be called in operator // GetMutable will be called in operator
} else { } else {
PADDLE_THROW( PADDLE_THROW(
"Variable type %d is not in " "Variable type %d is not in "
"[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, " "[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, "
"LOD_RANK_TABLE, PLACE_LIST, READER, CHANNEL, RAW]", "LOD_RANK_TABLE, PLACE_LIST, READER, RAW]",
var_type); var_type);
} }
} }
......
...@@ -126,7 +126,6 @@ message VarType { ...@@ -126,7 +126,6 @@ message VarType {
LOD_TENSOR_ARRAY = 13; LOD_TENSOR_ARRAY = 13;
PLACE_LIST = 14; PLACE_LIST = 14;
READER = 15; READER = 15;
CHANNEL = 16;
// Any runtime decided variable type is raw // Any runtime decided variable type is raw
// raw variables should manage their own allocations // raw variables should manage their own allocations
// in operators like nccl_op // in operators like nccl_op
...@@ -158,12 +157,6 @@ message VarType { ...@@ -158,12 +157,6 @@ message VarType {
message ReaderDesc { repeated LoDTensorDesc lod_tensor = 1; } message ReaderDesc { repeated LoDTensorDesc lod_tensor = 1; }
optional ReaderDesc reader = 5; optional ReaderDesc reader = 5;
message ChannelDesc {
required Type data_type = 1;
required int64 capacity = 2;
}
optional ChannelDesc channel = 6;
message Tuple { repeated Type element_type = 1; } message Tuple { repeated Type element_type = 1; }
optional Tuple tuple = 7; optional Tuple tuple = 7;
} }
......
set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h) set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h)
file(WRITE ${pass_file} "// Generated by the paddle/fluid/framework/ir/CMakeLists.txt. DO NOT EDIT!\n\n") file(WRITE ${pass_file} "// Generated by the paddle/fluid/framework/ir/CMakeLists.txt. DO NOT EDIT!\n\n")
file(APPEND ${pass_file} "\#pragma once\n")
file(APPEND ${pass_file} "\#include \"paddle/fluid/framework/ir/pass.h\"\n") file(APPEND ${pass_file} "\#include \"paddle/fluid/framework/ir/pass.h\"\n")
...@@ -28,12 +29,13 @@ cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS graph grap ...@@ -28,12 +29,13 @@ cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS graph grap
pass_library(graph_to_program_pass base) pass_library(graph_to_program_pass base)
pass_library(graph_viz_pass base) pass_library(graph_viz_pass base)
pass_library(fc_fuse_pass inference) pass_library(fc_fuse_pass inference)
if(WITH_MKLDNN) if (WITH_MKLDNN)
pass_library(conv_relu_mkldnn_fuse_pass inference) pass_library(conv_relu_mkldnn_fuse_pass inference)
endif() endif ()
pass_library(attention_lstm_fuse_pass inference) pass_library(attention_lstm_fuse_pass inference)
pass_library(infer_clean_graph_pass inference) pass_library(infer_clean_graph_pass inference)
pass_library(fc_lstm_fuse_pass inference) pass_library(fc_lstm_fuse_pass inference)
pass_library(embedding_fc_lstm_fuse_pass inference)
pass_library(fc_gru_fuse_pass inference) pass_library(fc_gru_fuse_pass inference)
pass_library(seq_concat_fc_fuse_pass inference) pass_library(seq_concat_fc_fuse_pass inference)
...@@ -41,12 +43,14 @@ cc_library(fuse_elewise_add_act_pass SRCS fuse_elewise_add_act_pass.cc DEPS pass ...@@ -41,12 +43,14 @@ cc_library(fuse_elewise_add_act_pass SRCS fuse_elewise_add_act_pass.cc DEPS pass
set(GLOB_PASS_LIB ${PASS_LIBRARY} CACHE INTERNAL "Global PASS library") set(GLOB_PASS_LIB ${PASS_LIBRARY} CACHE INTERNAL "Global PASS library")
cc_library(pass_builder SRCS pass_builder.cc DEPS pass)
cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper) cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper)
cc_test(graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry) cc_test(graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry)
cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry) cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry)
cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass) cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass)
cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector) cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector)
cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto) cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto)
if(WITH_MKLDNN) if (WITH_MKLDNN)
cc_test(test_conv_relu_mkldnn_fuse_pass SRCS conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass) cc_test(test_conv_relu_mkldnn_fuse_pass SRCS conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass)
endif() endif ()
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h"
#include <algorithm>
#include <string>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/cpu_vec.h"
#include "paddle/fluid/operators/math/fc_compute.h"
#include "paddle/fluid/platform/cpu_info.h"
namespace paddle {
namespace framework {
namespace ir {
static int BuildFusion(Graph* graph, const std::string& name_scope,
Scope* scope, bool with_fc_bias) {
GraphPatternDetector gpd;
auto* pattern = gpd.mutable_pattern();
// Build pattern
PDNode* x = pattern->NewNode(patterns::PDNodeName(name_scope, "x"))
->assert_is_op_input("lookup_table")
->assert_var_not_persistable();
patterns::Embedding embedding_pattern(pattern, name_scope);
// TODO(jczaja): Intermediate can only be for val that are not used anywhere
// but lookup table output may go into other LSTM (for reverse
// direction)
auto* embedding_out = embedding_pattern(x);
patterns::FC fc_pattern(pattern, name_scope);
// fc_out is a tmp var, will be removed after fuse, so marked as intermediate.
auto* fc_out = fc_pattern(embedding_out, with_fc_bias)->AsIntermediate();
patterns::LSTM lstm_pattern(pattern, name_scope);
lstm_pattern(fc_out);
// Create New OpDesc
auto embedding_lstm_creator = [&](Node* embedding, Node* W, Node* lstm,
Node* input, Node* weight_x, Node* weight_h,
Node* bias, Node* hidden, Node* cell,
Node* xx, Node* fc_bias) {
OpDesc op_desc;
op_desc.SetType("fused_embedding_fc_lstm");
#define SET_IN(Key, node__) op_desc.SetInput(#Key, {node__->Name()});
SET_IN(Ids, input);
SET_IN(WeightH, weight_h);
// Neet to have this passed as We need Wc data for peephole connections
SET_IN(Bias, bias);
#undef SET_IN
// Multiply embeddings with Weights
PADDLE_ENFORCE(scope);
const std::string& embeddings = patterns::UniqueKey("Embeddings");
auto* embeddings_var = scope->Var(embeddings);
PADDLE_ENFORCE(embeddings_var);
auto* embeddings_tensor =
embeddings_var->GetMutable<framework::LoDTensor>();
// Get WeightX size: [single_embedding, fc_size]
// and embedding size: [dict_size, single_embedding]
// and create new size of embeddings eg. [dict_size , hidden_size]
auto* embedding_var = scope->FindVar(W->Name());
PADDLE_ENFORCE(embedding_var);
const auto& embedding_tensor = embedding_var->Get<framework::LoDTensor>();
const auto& weightx_tensor =
scope->FindVar(weight_x->Name())->Get<framework::LoDTensor>();
embeddings_tensor->Resize(
{embedding_tensor.dims()[0], weightx_tensor.dims()[1]});
// Multiplie embeddings via WeightsX and add bias
auto embedding_data = embedding_tensor.data<float>();
auto weightx_data = weightx_tensor.data<float>();
auto embeddings_data =
embeddings_tensor->mutable_data<float>(platform::CPUPlace());
// Adding biases to GEMM result to be
auto* lstm_bias_var = scope->FindVar(bias->Name());
PADDLE_ENFORCE(lstm_bias_var);
const auto& lstm_bias_tensor = lstm_bias_var->Get<framework::LoDTensor>();
auto alpha = 1.0f;
auto beta = 1.0f;
int m = embedding_tensor.dims()[0];
int n = weightx_tensor.dims()[1];
int k = embedding_tensor.dims()[1];
// Copy only gate biases values (only actual bias data, not peephole
// weights)
std::vector<float> combined_biases;
combined_biases.reserve(n);
std::copy_n(lstm_bias_tensor.data<float>(), n,
std::back_inserter(combined_biases));
if (with_fc_bias) {
// Add FC-bias with LSTM-bias (into GEMM result to be)
auto* fc_bias_var = scope->FindVar(fc_bias->Name());
const auto& fc_bias_tensor = fc_bias_var->Get<framework::LoDTensor>();
for (int i = 0; i < fc_bias_tensor.numel(); i++) {
combined_biases[i] += fc_bias_tensor.data<float>()[i];
}
}
// broadcast biases
std::vector<float> ones(m, 1.0f);
paddle::operators::math::CBlas<float>::GEMM(
CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, 1, alpha, &ones[0], 1,
&combined_biases[0], n, 0.0f, embeddings_data, n);
// Wx*embeddings + biases
paddle::operators::math::CBlas<float>::GEMM(
CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, alpha,
embedding_data, k, weightx_data, n, beta, embeddings_data, n);
op_desc.SetInput("Embeddings", {embeddings});
// Create temp variables.
const std::string BatchedInput = patterns::UniqueKey("BatchedInput");
const std::string BatchedCellPreAct =
patterns::UniqueKey("BatchedCellPreAct");
const std::string BatchedGate = patterns::UniqueKey("BatchedGate");
scope->Var(BatchedInput)->GetMutable<framework::LoDTensor>();
scope->Var(BatchedCellPreAct)->GetMutable<framework::LoDTensor>();
scope->Var(BatchedGate)->GetMutable<framework::LoDTensor>();
op_desc.SetInput("H0", {});
op_desc.SetInput("C0", {});
op_desc.SetOutput("Hidden", {hidden->Name()});
op_desc.SetOutput("Cell", {cell->Name()});
op_desc.SetOutput("XX", {xx->Name()});
op_desc.SetOutput("BatchedGate", {BatchedGate});
op_desc.SetOutput("BatchCellPreAct", {BatchedCellPreAct});
op_desc.SetOutput("BatchedInput", {BatchedInput});
op_desc.SetAttr("is_reverse", lstm->Op()->GetAttr("is_reverse"));
op_desc.SetAttr("use_peepholes", lstm->Op()->GetAttr("use_peepholes"));
// TODO(TJ): get from attr
op_desc.SetAttr("use_seq", true);
PADDLE_ENFORCE(graph->Has(kParamScopeAttr));
auto* scope = graph->Get<Scope*>(kParamScopeAttr);
#define OP_SET_OUT(x) \
const std::string x = patterns::UniqueKey(#x); \
op_desc.SetOutput(#x, {x}); \
scope->Var(x)->GetMutable<LoDTensor>()
OP_SET_OUT(BatchedCell);
OP_SET_OUT(BatchedHidden);
OP_SET_OUT(ReorderedH0);
OP_SET_OUT(ReorderedC0);
#undef OP_SET_OUT
auto* op = graph->CreateOpNode(&op_desc);
IR_NODE_LINK_TO(input, op);
IR_NODE_LINK_TO(weight_x, op);
IR_NODE_LINK_TO(weight_h, op);
IR_NODE_LINK_TO(bias, op);
IR_NODE_LINK_TO(op, hidden);
return op;
};
int fusion_count{0};
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
GET_IR_NODE_FROM_SUBGRAPH(lstm, lstm, lstm_pattern);
GET_IR_NODE_FROM_SUBGRAPH(Weight, Weight, lstm_pattern);
GET_IR_NODE_FROM_SUBGRAPH(Bias, Bias, lstm_pattern);
GET_IR_NODE_FROM_SUBGRAPH(Cell, Cell, lstm_pattern);
GET_IR_NODE_FROM_SUBGRAPH(Hidden, Hidden, lstm_pattern);
GET_IR_NODE_FROM_SUBGRAPH(lookup_table, lookup_table, embedding_pattern);
GET_IR_NODE_FROM_SUBGRAPH(W, W, embedding_pattern);
GET_IR_NODE_FROM_SUBGRAPH(w, w, fc_pattern);
GET_IR_NODE_FROM_SUBGRAPH(mul, mul, fc_pattern);
// TODO(jczaja): Add support for is_sparse / is_distributed
auto is_sparse = boost::get<bool>(lookup_table->Op()->GetAttr("is_sparse"));
auto is_distributed =
boost::get<bool>(lookup_table->Op()->GetAttr("is_distributed"));
if (is_sparse == true || is_distributed == true) {
return;
}
if (with_fc_bias) {
GET_IR_NODE_FROM_SUBGRAPH(fc_out, Out, fc_pattern);
GET_IR_NODE_FROM_SUBGRAPH(fc_bias, bias, fc_pattern);
GET_IR_NODE_FROM_SUBGRAPH(elementwise_add, elementwise_add, fc_pattern);
embedding_lstm_creator(lookup_table, W, lstm, subgraph.at(x), w, Weight,
Bias, Hidden, Cell, fc_out, fc_bias);
// Remove unneeded nodes.
// TODO(jczaja): Proper removing of lookup table
std::unordered_set<const Node*> marked_nodes(
//{lookup_table, mul, lstm, elementwise_add, fc_bias, W});
{mul, lstm, elementwise_add, fc_bias});
GraphSafeRemoveNodes(graph, marked_nodes);
} else {
GET_IR_NODE_FROM_SUBGRAPH(fc_out, mul_out, fc_pattern);
embedding_lstm_creator(lookup_table, W, lstm, subgraph.at(x), w, Weight,
Bias, Hidden, Cell, fc_out, nullptr);
// Remove unneeded nodes.
// TODO(jczaja): Proper removing of lookup table
// std::unordered_set<const Node*> marked_nodes({lookup_table, W, mul,
// lstm});
std::unordered_set<const Node*> marked_nodes({mul, lstm});
GraphSafeRemoveNodes(graph, marked_nodes);
}
++fusion_count;
};
gpd(graph, handler);
return fusion_count;
}
std::unique_ptr<ir::Graph> EmbeddingFCLSTMFusePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
FusePassBase::Init(name_scope_, graph.get());
int fusion_count = BuildFusion(graph.get(), name_scope_, param_scope(),
true /*with_fc_bias*/);
AddStatis(fusion_count);
return graph;
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(embedding_fc_lstm_fuse_pass,
paddle::framework::ir::EmbeddingFCLSTMFusePass);
...@@ -11,29 +11,30 @@ ...@@ -11,29 +11,30 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#pragma once #pragma once
#include <chrono> // NOLINT #include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
namespace paddle { namespace paddle {
namespace inference { namespace framework {
namespace ir {
// Fusing of Embedding , FC and LSTM op
// Timer for timer // Just FC without bias
class Timer { class EmbeddingFCLSTMFusePass : public FusePassBase {
public: public:
std::chrono::high_resolution_clock::time_point start; virtual ~EmbeddingFCLSTMFusePass() {}
std::chrono::high_resolution_clock::time_point startu;
protected:
void tic() { start = std::chrono::high_resolution_clock::now(); } std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
double toc() {
startu = std::chrono::high_resolution_clock::now(); const std::string name_scope_{"embedding_fc_lstm_fuse"};
std::chrono::duration<double> time_span =
std::chrono::duration_cast<std::chrono::duration<double>>(startu -
start);
double used_time_ms = static_cast<double>(time_span.count()) * 1000.0;
return used_time_ms;
}
}; };
} // namespace inference } // namespace ir
} // namespace framework
} // namespace paddle } // namespace paddle
...@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/framework/ir/graph_helper.h"
#include <algorithm> #include <algorithm>
#include <deque>
#include <unordered_set> #include <unordered_set>
#include "paddle/fluid/framework/ir/graph_helper.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace ir { namespace ir {
...@@ -113,6 +113,74 @@ std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList( ...@@ -113,6 +113,74 @@ std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
return adj_list; return adj_list;
} }
size_t GraphNum(const Graph &graph) {
std::unordered_set<ir::Node *> nodes = graph.Nodes();
std::unordered_set<ir::Node *> visited_nodes;
visited_nodes.reserve(nodes.size());
std::deque<ir::Node *> q_nodes;
std::vector<std::unordered_set<ir::Node *>> graph_nodes;
std::unordered_set<ir::Node *> g_nodes;
size_t graph_count = 0;
auto traverse_nodes = [&visited_nodes,
&q_nodes](const std::vector<ir::Node *> &nodes) {
std::copy_if(
nodes.begin(), nodes.end(), std::back_inserter(q_nodes),
[&visited_nodes](Node *node) { return !visited_nodes.count(node); });
};
while (visited_nodes.size() != nodes.size()) {
if (!q_nodes.empty()) {
auto cur_node = q_nodes.front();
q_nodes.pop_front();
visited_nodes.insert(cur_node);
g_nodes.insert(cur_node);
traverse_nodes(cur_node->inputs);
traverse_nodes(cur_node->outputs);
} else {
++graph_count;
if (g_nodes.size()) {
graph_nodes.emplace_back(g_nodes);
}
g_nodes.clear();
for (auto &n : nodes) {
if (visited_nodes.count(n) == 0) {
q_nodes.push_back(n);
break;
}
}
}
}
if (g_nodes.size()) {
graph_nodes.emplace_back(g_nodes);
}
if (VLOG_IS_ON(10)) {
VLOG(10) << "graph_num: " << graph_nodes.size();
for (auto &g_n : graph_nodes) {
VLOG(10) << "graph_nodes: " << g_n.size();
if (g_n.size() < 10) {
std::stringstream out;
for (auto &node : g_n) {
out << "\nNode: " << node->Name() << " in [";
for (auto &n : node->inputs) {
out << n->Name() << ", ";
}
out << "], out[";
for (auto &n : node->outputs) {
out << n->Name() << ", ";
}
out << "]";
}
VLOG(10) << out.str();
}
}
}
return graph_count;
}
} // namespace ir } // namespace ir
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -27,6 +27,8 @@ namespace ir { ...@@ -27,6 +27,8 @@ namespace ir {
// Test if the graph contains circle. // Test if the graph contains circle.
bool HasCircle(const Graph &graph); bool HasCircle(const Graph &graph);
size_t GraphNum(const Graph &graph);
// Topology Sort the operations in the graph from inputs to outputs. // Topology Sort the operations in the graph from inputs to outputs.
// `graph` cannot contain circle. // `graph` cannot contain circle.
std::vector<ir::Node *> TopologySortOperations(const Graph &graph); std::vector<ir::Node *> TopologySortOperations(const Graph &graph);
......
...@@ -120,6 +120,97 @@ TEST(GraphHelperTest, Basic) { ...@@ -120,6 +120,97 @@ TEST(GraphHelperTest, Basic) {
ASSERT_EQ(node_map.at("op2"), 1UL); ASSERT_EQ(node_map.at("op2"), 1UL);
ASSERT_TRUE(node_map.at("op3") < node_map.at("op5")); ASSERT_TRUE(node_map.at("op3") < node_map.at("op5"));
} }
void BuildZeroGraph(Graph* g) {}
void BuildOneGraph(Graph* g) {
ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation);
ir::Node* o2 = g->CreateEmptyNode("op2", Node::Type::kOperation);
ir::Node* o3 = g->CreateEmptyNode("op3", Node::Type::kOperation);
ir::Node* o4 = g->CreateEmptyNode("op4", Node::Type::kOperation);
ir::Node* o5 = g->CreateEmptyNode("op5", Node::Type::kOperation);
ir::Node* v1 = g->CreateEmptyNode("var1", Node::Type::kVariable);
ir::Node* v2 = g->CreateEmptyNode("var2", Node::Type::kVariable);
ir::Node* v3 = g->CreateEmptyNode("var3", Node::Type::kVariable);
ir::Node* v4 = g->CreateEmptyNode("var4", Node::Type::kVariable);
// o1->v1->o2
o1->outputs.push_back(v1);
o2->inputs.push_back(v1);
v1->inputs.push_back(o1);
v1->outputs.push_back(o2);
// o2->v2->o3
// o2->v2->o4
o2->outputs.push_back(v2);
o3->inputs.push_back(v2);
o4->inputs.push_back(v2);
v2->inputs.push_back(o2);
v2->outputs.push_back(o3);
v2->outputs.push_back(o4);
// o2->v3->o5
o2->outputs.push_back(v3);
o5->inputs.push_back(v3);
v3->inputs.push_back(o2);
v3->outputs.push_back(o5);
// o3-v4->o5
o3->outputs.push_back(v4);
o5->inputs.push_back(v4);
v4->inputs.push_back(o3);
v4->outputs.push_back(o5);
}
void BuildTwoGraphs(Graph* g) {
ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation);
ir::Node* o2 = g->CreateEmptyNode("op2", Node::Type::kOperation);
ir::Node* o3 = g->CreateEmptyNode("op3", Node::Type::kOperation);
ir::Node* o4 = g->CreateEmptyNode("op4", Node::Type::kOperation);
ir::Node* o5 = g->CreateEmptyNode("op5", Node::Type::kOperation);
ir::Node* v1 = g->CreateEmptyNode("var1", Node::Type::kVariable);
ir::Node* v2 = g->CreateEmptyNode("var2", Node::Type::kVariable);
ir::Node* v3 = g->CreateEmptyNode("var3", Node::Type::kVariable);
ir::Node* v4 = g->CreateEmptyNode("var4", Node::Type::kVariable);
// o1->v1->o2
o1->outputs.push_back(v1);
o2->inputs.push_back(v1);
v1->inputs.push_back(o1);
v1->outputs.push_back(o2);
// o2->v2->o3
// o2->v2->o4
o2->outputs.push_back(v2);
o3->inputs.push_back(v2);
o4->inputs.push_back(v2);
v2->inputs.push_back(o2);
v2->outputs.push_back(o3);
v2->outputs.push_back(o4);
// o2->v3->o5
// o2->outputs.push_back(v3);
o5->inputs.push_back(v3);
// v3->inputs.push_back(o2);
v3->outputs.push_back(o5);
// o3-v4->o5
o3->outputs.push_back(v4);
// o5->inputs.push_back(v4);
v4->inputs.push_back(o3);
// v4->outputs.push_back(o5);
}
TEST(GraphHelperTest, GraphNum) {
ProgramDesc prog;
Graph g(prog);
BuildZeroGraph(&g);
ASSERT_EQ(GraphNum(g), 0);
Graph g2(prog);
BuildOneGraph(&g2);
ASSERT_EQ(GraphNum(g2), 1);
Graph g3(prog);
BuildTwoGraphs(&g3);
ASSERT_EQ(GraphNum(g3), 2);
}
} // namespace ir } // namespace ir
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -692,6 +692,24 @@ PDNode *patterns::FC::operator()(paddle::framework::ir::PDNode *x, ...@@ -692,6 +692,24 @@ PDNode *patterns::FC::operator()(paddle::framework::ir::PDNode *x,
} }
} }
PDNode *patterns::Embedding::operator()(PDNode *x) {
x->assert_is_op_input("lookup_table", "Ids");
auto *lookup_table_op =
pattern->NewNode(lookup_table_repr())->assert_is_op("lookup_table");
#define NEW_NODE(arg__, io__) \
auto *arg__ = pattern->NewNode(arg__##_repr()) \
->assert_is_op_##io__("lookup_table", #arg__);
NEW_NODE(W, input);
NEW_NODE(Out, output);
#undef NEW_NODE
lookup_table_op->LinksFrom({x, W});
lookup_table_op->LinksTo({Out});
return Out;
}
PDNode *patterns::LSTM::operator()(PDNode *x) { PDNode *patterns::LSTM::operator()(PDNode *x) {
x->assert_is_op_input("lstm", "Input"); x->assert_is_op_input("lstm", "Input");
auto *lstm_op = pattern->NewNode(lstm_repr())->assert_is_op("lstm"); auto *lstm_op = pattern->NewNode(lstm_repr())->assert_is_op("lstm");
......
...@@ -418,6 +418,23 @@ struct FC : public PatternBase { ...@@ -418,6 +418,23 @@ struct FC : public PatternBase {
PATTERN_DECL_NODE(Out); PATTERN_DECL_NODE(Out);
}; };
// Embedding
struct Embedding : public PatternBase {
Embedding(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "embedding") {}
PDNode* operator()(PDNode* x);
// declare operator node's name
PATTERN_DECL_NODE(lookup_table);
// Inputs
//
PATTERN_DECL_NODE(Ids);
PATTERN_DECL_NODE(W); // embeddings
// Outputs
PATTERN_DECL_NODE(Out);
};
struct LSTM : public PatternBase { struct LSTM : public PatternBase {
LSTM(PDPattern* pattern, const std::string& name_scope) LSTM(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "lstm") {} : PatternBase(pattern, name_scope, "lstm") {}
......
...@@ -19,7 +19,6 @@ namespace paddle { ...@@ -19,7 +19,6 @@ namespace paddle {
namespace framework { namespace framework {
namespace ir { namespace ir {
std::unique_ptr<Graph> Pass::Apply(std::unique_ptr<Graph> graph) const { std::unique_ptr<Graph> Pass::Apply(std::unique_ptr<Graph> graph) const {
PADDLE_ENFORCE(!applied_, "Pass can only Apply() once.");
PADDLE_ENFORCE(graph.get(), "graph passed to Pass::Apply() cannot be empty."); PADDLE_ENFORCE(graph.get(), "graph passed to Pass::Apply() cannot be empty.");
for (const std::string& attr : required_pass_attrs_) { for (const std::string& attr : required_pass_attrs_) {
PADDLE_ENFORCE(attrs_.find(attr) != attrs_.end(), PADDLE_ENFORCE(attrs_.find(attr) != attrs_.end(),
......
...@@ -42,6 +42,8 @@ class Pass { ...@@ -42,6 +42,8 @@ class Pass {
attr_dels_.clear(); attr_dels_.clear();
} }
std::string Type() const { return type_; }
std::unique_ptr<Graph> Apply(std::unique_ptr<Graph> graph) const; std::unique_ptr<Graph> Apply(std::unique_ptr<Graph> graph) const;
// Get a reference to the attributed previously set. // Get a reference to the attributed previously set.
...@@ -52,6 +54,21 @@ class Pass { ...@@ -52,6 +54,21 @@ class Pass {
return *boost::any_cast<AttrType *>(attrs_.at(attr_name)); return *boost::any_cast<AttrType *>(attrs_.at(attr_name));
} }
bool Has(const std::string &attr_name) const {
return attrs_.find(attr_name) != attrs_.end();
}
void Erase(const std::string &attr_name) {
if (!Has(attr_name)) {
return;
}
if (attr_dels_.find(attr_name) != attr_dels_.end()) {
attr_dels_[attr_name]();
attr_dels_.erase(attr_name);
}
attrs_.erase(attr_name);
}
// Set a pointer to the attribute. Pass takes ownership of the attribute. // Set a pointer to the attribute. Pass takes ownership of the attribute.
template <typename AttrType> template <typename AttrType>
void Set(const std::string &attr_name, AttrType *attr) { void Set(const std::string &attr_name, AttrType *attr) {
...@@ -68,13 +85,15 @@ class Pass { ...@@ -68,13 +85,15 @@ class Pass {
// should delete the attribute. // should delete the attribute.
template <typename AttrType> template <typename AttrType>
void SetNotOwned(const std::string &attr_name, AttrType *attr) { void SetNotOwned(const std::string &attr_name, AttrType *attr) {
PADDLE_ENFORCE(attrs_.count(attr_name) == 0); PADDLE_ENFORCE(attrs_.count(attr_name) == 0, "%s already set in the pass",
attr_name);
attrs_[attr_name] = attr; attrs_[attr_name] = attr;
} }
protected: protected:
virtual std::unique_ptr<Graph> ApplyImpl( virtual std::unique_ptr<Graph> ApplyImpl(std::unique_ptr<Graph> graph) const {
std::unique_ptr<Graph> graph) const = 0; LOG(FATAL) << "Calling virtual Pass not implemented.";
}
private: private:
template <typename PassType> template <typename PassType>
...@@ -89,7 +108,10 @@ class Pass { ...@@ -89,7 +108,10 @@ class Pass {
required_graph_attrs_.insert(attrs.begin(), attrs.end()); required_graph_attrs_.insert(attrs.begin(), attrs.end());
} }
void RegisterType(const std::string &type) { type_ = type; }
mutable bool applied_{false}; mutable bool applied_{false};
std::string type_;
std::unordered_set<std::string> required_pass_attrs_; std::unordered_set<std::string> required_pass_attrs_;
std::unordered_set<std::string> required_graph_attrs_; std::unordered_set<std::string> required_graph_attrs_;
std::map<std::string, boost::any> attrs_; std::map<std::string, boost::any> attrs_;
...@@ -143,10 +165,11 @@ struct PassRegistrar : public Registrar { ...@@ -143,10 +165,11 @@ struct PassRegistrar : public Registrar {
PADDLE_ENFORCE(!PassRegistry::Instance().Has(pass_type), PADDLE_ENFORCE(!PassRegistry::Instance().Has(pass_type),
"'%s' is registered more than once.", pass_type); "'%s' is registered more than once.", pass_type);
PassRegistry::Instance().Insert( PassRegistry::Instance().Insert(
pass_type, [this]() -> std::unique_ptr<Pass> { pass_type, [this, pass_type]() -> std::unique_ptr<Pass> {
std::unique_ptr<Pass> pass(new PassType()); std::unique_ptr<Pass> pass(new PassType());
pass->RegisterRequiredPassAttrs(this->required_pass_attrs_); pass->RegisterRequiredPassAttrs(this->required_pass_attrs_);
pass->RegisterRequiredGraphAttrs(this->required_graph_attrs_); pass->RegisterRequiredGraphAttrs(this->required_graph_attrs_);
pass->RegisterType(pass_type);
return pass; return pass;
}); });
} }
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/ir/pass_builder.h"
namespace paddle {
namespace framework {
namespace ir {
std::shared_ptr<Pass> PassBuilder::AppendPass(const std::string& pass_type) {
auto pass = ir::PassRegistry::Instance().Get(pass_type);
passes_.emplace_back(pass.release());
return passes_.back();
}
void PassBuilder::RemovePass(size_t idx) {
PADDLE_ENFORCE(passes_.size() > idx);
passes_.erase(passes_.begin() + idx);
}
std::shared_ptr<Pass> PassBuilder::InsertPass(size_t idx,
const std::string& pass_type) {
PADDLE_ENFORCE(passes_.size() >= idx);
std::shared_ptr<Pass> pass(
ir::PassRegistry::Instance().Get(pass_type).release());
passes_.insert(passes_.begin() + idx, std::move(pass));
return passes_[idx];
}
} // namespace ir
} // namespace framework
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
namespace framework {
namespace ir {
class PassBuilder {
public:
PassBuilder() {}
virtual ~PassBuilder() {}
// Append a new pass to the end.
std::shared_ptr<Pass> AppendPass(const std::string& pass_type);
// Insert a new pass after `idx`.
std::shared_ptr<Pass> InsertPass(size_t idx, const std::string& pass_type);
// Remove a new pass at `idx`.
void RemovePass(size_t idx);
// Returns a list of all passes.
std::vector<std::shared_ptr<Pass>> AllPasses() const { return passes_; }
protected:
std::vector<std::shared_ptr<Pass>> passes_;
};
} // namespace ir
} // namespace framework
} // namespace paddle
...@@ -82,12 +82,10 @@ TEST(PassTest, TestPassAttrCheck) { ...@@ -82,12 +82,10 @@ TEST(PassTest, TestPassAttrCheck) {
ASSERT_EQ(graph->Get<int>("copy_test_pass_attr"), 2); ASSERT_EQ(graph->Get<int>("copy_test_pass_attr"), 2);
ASSERT_EQ(graph->Get<int>("copy_test_graph_attr"), 2); ASSERT_EQ(graph->Get<int>("copy_test_graph_attr"), 2);
try { // Allow apply more than once.
graph = pass->Apply(std::move(graph)); graph.reset(new Graph(prog));
} catch (paddle::platform::EnforceNotMet e) { graph->Set<int>("test_graph_attr", new int);
exception = std::string(e.what()); graph = pass->Apply(std::move(graph));
}
ASSERT_TRUE(exception.find("Pass can only Apply() once") != exception.npos);
pass = PassRegistry::Instance().Get("test_pass"); pass = PassRegistry::Instance().Get("test_pass");
pass->SetNotOwned<int>("test_pass_attr", &val); pass->SetNotOwned<int>("test_pass_attr", &val);
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string>
#include <vector>
#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/lod_rank_table.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/string/pretty_log.h"
namespace paddle {
namespace framework {
// These code can be shared with Executor.
static void InitializeVariable(Variable *var, proto::VarType::Type var_type) {
if (var_type == proto::VarType::LOD_TENSOR) {
var->GetMutable<LoDTensor>();
} else if (var_type == proto::VarType::SELECTED_ROWS) {
var->GetMutable<SelectedRows>();
} else if (var_type == proto::VarType::FEED_MINIBATCH) {
var->GetMutable<FeedFetchList>();
} else if (var_type == proto::VarType::FETCH_LIST) {
var->GetMutable<FeedFetchList>();
} else if (var_type == proto::VarType::STEP_SCOPES) {
var->GetMutable<std::vector<framework::Scope>>();
} else if (var_type == proto::VarType::LOD_RANK_TABLE) {
var->GetMutable<LoDRankTable>();
} else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) {
var->GetMutable<LoDTensorArray>();
} else if (var_type == proto::VarType::PLACE_LIST) {
var->GetMutable<platform::PlaceList>();
} else if (var_type == proto::VarType::READER) {
var->GetMutable<ReaderHolder>();
} else if (var_type == proto::VarType::RAW) {
// GetMutable will be called in operator
} else {
PADDLE_THROW(
"Variable type %d is not in "
"[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, "
"LOD_RANK_TABLE, PLACE_LIST, READER, CHANNEL, RAW]",
var_type);
}
}
void NaiveExecutor::Prepare(Scope *parent_scope,
const ProgramDesc &program_desc, int block_id,
bool with_feed_fetch_ops) {
if (!parent_scope) {
scope_ = new framework::Scope;
} else {
scope_ = &parent_scope->NewScope();
}
CreateVariables(program_desc, scope_, block_id);
CreateOps(program_desc, block_id, with_feed_fetch_ops);
}
void NaiveExecutor::Run() {
for (auto &op : ops_) {
VLOG(4) << "run " << op->Type();
op->Run(*scope_, place_);
}
}
void NaiveExecutor::CreateVariables(const ProgramDesc &desc, Scope *scope,
int block_id) {
PADDLE_ENFORCE(scope);
auto &global_block = desc.Block(block_id);
const Scope *ancestor_scope = scope;
while (ancestor_scope->parent()) {
ancestor_scope = ancestor_scope->parent();
}
if (ancestor_scope != scope) {
for (auto &var : global_block.AllVars()) {
if (var->Name() == framework::kEmptyVarName) {
continue;
}
// Create persistable vars in ancestor scope.
if (var->Persistable()) {
auto *ptr = const_cast<Scope *>(ancestor_scope)->Var(var->Name());
InitializeVariable(ptr, var->GetType());
VLOG(3) << "Create Variable " << var->Name()
<< " global, which pointer is " << ptr;
} else { // Create temporary variables in local scope.
auto *ptr = scope->Var(var->Name());
InitializeVariable(ptr, var->GetType());
VLOG(3) << "Create Variable " << var->Name()
<< " locally, which pointer is " << ptr;
}
}
} else {
for (auto &var : global_block.AllVars()) {
auto *ptr = scope->Var(var->Name());
InitializeVariable(ptr, var->GetType());
VLOG(3) << "Create variable " << var->Name() << ", which pointer is "
<< ptr;
}
}
}
void NaiveExecutor::CreateOps(const ProgramDesc &desc, int block_id,
bool with_feed_fetch_ops) {
for (const auto &op_desc : desc.Block(block_id).AllOps()) {
if (!with_feed_fetch_ops &&
(op_desc->Type() == "feed" || op_desc->Type() == "fetch")) {
string::PrettyLogEndl(string::Style::detail(), "--- skip [%s], %s -> %s",
op_desc->Input("X")[0], op_desc->Type(),
op_desc->Output("Out")[0]);
continue;
}
ops_.emplace_back(OpRegistry::CreateOp(*op_desc));
}
}
LoDTensor *NaiveExecutor::FindTensor(const std::string &name) {
PADDLE_ENFORCE(scope_, "Need to init scope first");
auto *var = scope_->FindVar(name);
PADDLE_ENFORCE(var, "No variable [%s] in the scope");
auto *tensor = const_cast<LoDTensor *>(&var->Get<LoDTensor>());
return tensor;
}
void NaiveExecutor::CleanFeedFetchOps() {
std::vector<std::unique_ptr<OperatorBase>> ops;
for (auto &op : ops_) {
if (op->Type() != "feed" && op->Type() != "fetch") {
ops.emplace_back(std::move(op));
}
}
ops_.swap(ops);
}
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace framework {
/*
* Simple, intuitive and effective. Only single thread is supported, and
* currently designed for inference.
*/
class NaiveExecutor {
public:
explicit NaiveExecutor(const platform::Place& place) : place_(place) {}
// Create child scope.
// Create variables.
// @with_feed_fetch_ops: whether to work with the feed and fetch operators.
void Prepare(Scope* parent_scope, const ProgramDesc& program_desc,
int block_id, bool with_feed_fetch_ops);
// Run all the operators.
void Run();
// Get an tensor to operating directly, without the need for feed_ops.
LoDTensor* FindTensor(const std::string& name);
Scope* scope() { return scope_; }
void CleanFeedFetchOps();
protected:
void CreateVariables(const ProgramDesc& desc, Scope* scope, int block_id);
void CreateOps(const ProgramDesc& desc, int block_id,
bool with_feed_fetch_ops);
private:
const platform::Place place_;
// Catch the required resource to avoid recreate.
std::vector<std::unique_ptr<OperatorBase>> ops_;
Scope* scope_;
};
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/naive_executor.h"
#include <gtest/gtest.h>
#include <algorithm>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/program_desc.h"
namespace paddle {
namespace framework {
TEST(NaiveExecutor, Basic) {
ProgramDesc program;
auto* main_block = program.MutableBlock(0);
auto* a = main_block->Var("a"); // input
auto* b = main_block->Var("b"); // input
auto* c = main_block->Var("c"); // input
a->SetType(proto::VarType::LOD_TENSOR);
b->SetType(proto::VarType::LOD_TENSOR);
c->SetType(proto::VarType::LOD_TENSOR);
auto* add = main_block->AppendOp();
add->SetType("elementwise_add");
add->SetInput("X", {"a"});
add->SetInput("Y", {"b"});
add->SetOutput("Out", {"c"});
auto place = platform::CPUPlace();
NaiveExecutor exe(place);
exe.Prepare(nullptr, program, 0, false /*with feed fetch ops*/);
auto* a_tensor = exe.FindTensor("a");
auto* b_tensor = exe.FindTensor("b");
auto* c_tensor = exe.FindTensor("c");
a_tensor->Resize({1, 4});
b_tensor->Resize({1, 4});
c_tensor->Resize({1, 4});
b_tensor->mutable_data<float>(place);
a_tensor->mutable_data<float>(place);
float a_arr[] = {0, 1, 2, 3};
float b_arr[] = {0.0, .1, .2, .3};
std::copy_n(a_arr, 4, a_tensor->mutable_data<float>(place));
std::copy_n(b_arr, 4, b_tensor->mutable_data<float>(place));
exe.Run();
auto* c_data = c_tensor->mutable_data<float>(place);
for (int i = 0; i < 4; i++) {
EXPECT_NEAR(c_data[i], 1.1 * i, 1e-3);
}
}
} // namespace framework
} // namespace paddle
USE_OP(elementwise_add);
...@@ -132,9 +132,7 @@ void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto, ...@@ -132,9 +132,7 @@ void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto,
AddAttr<std::string>(OpNamescopeAttrName(), "Operator name with namesope.") AddAttr<std::string>(OpNamescopeAttrName(), "Operator name with namesope.")
.SetDefault(""); .SetDefault("");
AddAttr<std::vector<std::string>>(OpCreationCallstackAttrName(),
"Callstack for Op Creatation.")
.SetDefault({});
Validate(); Validate();
} }
......
...@@ -46,7 +46,6 @@ class OpProtoAndCheckerMaker { ...@@ -46,7 +46,6 @@ class OpProtoAndCheckerMaker {
static const char *OpRoleAttrName() { return "op_role"; } static const char *OpRoleAttrName() { return "op_role"; }
static const char *OpRoleVarAttrName() { return "op_role_var"; } static const char *OpRoleVarAttrName() { return "op_role_var"; }
static const char *OpNamescopeAttrName() { return "op_namescope"; } static const char *OpNamescopeAttrName() { return "op_namescope"; }
static const char *OpCreationCallstackAttrName() { return "op_callstack"; }
void operator()(proto::OpProto *proto, OpAttrChecker *attr_checker); void operator()(proto::OpProto *proto, OpAttrChecker *attr_checker);
......
...@@ -14,17 +14,15 @@ limitations under the License. */ ...@@ -14,17 +14,15 @@ limitations under the License. */
#define GLOG_NO_ABBREVIATED_SEVERITIES #define GLOG_NO_ABBREVIATED_SEVERITIES
#define GOOGLE_GLOG_DLL_DECL #define GOOGLE_GLOG_DLL_DECL
#include "paddle/fluid/framework/operator.h"
#include <gflags/gflags.h> #include <gflags/gflags.h>
#include <glog/logging.h> #include <glog/logging.h>
#include <algorithm> #include <algorithm>
#include <sstream>
#include <string>
#include <vector>
#include "paddle/fluid/framework/data_transform.h" #include "paddle/fluid/framework/data_transform.h"
#include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/shape_inference.h" #include "paddle/fluid/framework/shape_inference.h"
#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
...@@ -142,48 +140,19 @@ static LoD GetLoD(const Scope& scope, const std::string& name) { ...@@ -142,48 +140,19 @@ static LoD GetLoD(const Scope& scope, const std::string& name) {
} }
void OperatorBase::Run(const Scope& scope, const platform::Place& place) { void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
try { VLOG(4) << place << " " << DebugStringEx(&scope);
if (VLOG_IS_ON(4)) { if (platform::is_gpu_place(place)) {
VLOG(4) << place << " " << DebugStringEx(&scope);
}
if (platform::is_gpu_place(place)) {
#ifndef PADDLE_WITH_CUDA #ifndef PADDLE_WITH_CUDA
PADDLE_THROW("Cannot run operator on place %s", place); PADDLE_THROW("Cannot run operator on place %s", place);
#else #else
auto dev_id = boost::get<platform::CUDAPlace>(place).device; auto dev_id = boost::get<platform::CUDAPlace>(place).device;
platform::SetDeviceId(dev_id); platform::SetDeviceId(dev_id);
#endif #endif
}
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
platform::RecordEvent record_event(Type(), pool.Get(place));
RunImpl(scope, place);
if (VLOG_IS_ON(3)) {
VLOG(3) << place << " " << DebugStringEx(&scope);
}
} catch (platform::EnforceNotMet exception) {
if (Attrs().count("sub_block") != 0) {
throw exception;
}
auto& callstack = Attr<std::vector<std::string>>(
OpProtoAndCheckerMaker::OpCreationCallstackAttrName());
if (callstack.empty()) {
throw exception;
}
std::ostringstream sout;
sout << "Invoke operator " << Type() << " error.\n";
sout << "Python Callstacks: \n";
for (auto& line : callstack) {
sout << line;
}
sout << "C++ Callstacks: \n";
sout << exception.err_str_;
exception.err_str_ = sout.str();
throw exception;
} catch (...) {
std::rethrow_exception(std::current_exception());
} }
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
platform::RecordEvent record_event(Type(), pool.Get(place));
RunImpl(scope, place);
VLOG(3) << place << " " << DebugStringEx(&scope);
} }
bool OperatorBase::HasInputs(const std::string& name) const { bool OperatorBase::HasInputs(const std::string& name) const {
...@@ -211,7 +180,7 @@ const std::vector<std::string>& OperatorBase::Inputs( ...@@ -211,7 +180,7 @@ const std::vector<std::string>& OperatorBase::Inputs(
} }
bool OperatorBase::HasOutputs(const std::string& name) const { bool OperatorBase::HasOutputs(const std::string& name) const {
if (outputs_.end() != outputs_.find(name)) { if (outputs_.find(name) != outputs_.end()) {
return true; return true;
} else { } else {
return false; return false;
......
...@@ -13,21 +13,19 @@ See the License for the specific language governing permissions and ...@@ -13,21 +13,19 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/framework/parallel_executor.h" #include "paddle/fluid/framework/parallel_executor.h"
#include <string> #include <string>
#include <tuple> #include <tuple>
#include <vector> #include <vector>
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/nccl_helper.h" #include "paddle/fluid/platform/nccl_helper.h"
#endif #endif
#include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h" #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
#include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h" #include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h"
#include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h" #include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
...@@ -35,80 +33,6 @@ limitations under the License. */ ...@@ -35,80 +33,6 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
std::unique_ptr<ir::Graph> ApplyParallelExecutorPass(
const ProgramDesc &main_program, const std::vector<platform::Place> &places,
const std::string &loss_var_name,
const std::unordered_set<std::string> &param_names,
const std::vector<Scope *> &local_scopes, const bool use_cuda,
#ifdef PADDLE_WITH_CUDA
const BuildStrategy &strategy, platform::NCCLContextMap *nccl_ctxs) {
#else
const BuildStrategy &strategy) {
#endif
// Convert the program to graph.
std::unique_ptr<ir::Graph> graph(new ir::Graph(main_program));
// Apply a graph viz pass to record a graph.
if (!strategy.debug_graphviz_path_.empty()) {
auto viz_pass = ir::PassRegistry::Instance().Get("graph_viz_pass");
const std::string graph_path = string::Sprintf(
"%s%s", strategy.debug_graphviz_path_.c_str(), "_original_graph");
viz_pass->Set<std::string>("graph_viz_path", new std::string(graph_path));
graph = viz_pass->Apply(std::move(graph));
}
// Apply op fusion.
if (strategy.fuse_elewise_add_act_ops_) {
auto fuse_elewise_add_act_pass =
ir::PassRegistry::Instance().Get("fuse_elewise_add_act_pass");
graph = fuse_elewise_add_act_pass->Apply(std::move(graph));
// Apply a graph viz pass to record a graph.
if (!strategy.debug_graphviz_path_.empty()) {
auto viz_pass = ir::PassRegistry::Instance().Get("graph_viz_pass");
const std::string graph_path = string::Sprintf(
"%s%s", strategy.debug_graphviz_path_.c_str(), "_fused_graph");
viz_pass->Set<std::string>("graph_viz_path", new std::string(graph_path));
graph = viz_pass->Apply(std::move(graph));
}
}
// Convert graph to run on multi-devices.
auto multi_devices_pass =
ir::PassRegistry::Instance().Get("multi_devices_pass");
multi_devices_pass->SetNotOwned<const std::vector<platform::Place>>("places",
&places);
multi_devices_pass->SetNotOwned<const std::string>("loss_var_name",
&loss_var_name);
multi_devices_pass->SetNotOwned<const std::unordered_set<std::string>>(
"params", &param_names);
multi_devices_pass->SetNotOwned<const std::vector<Scope *>>("local_scopes",
&local_scopes);
multi_devices_pass->SetNotOwned<const BuildStrategy>("strategy", &strategy);
#ifdef PADDLE_WITH_CUDA
platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr;
multi_devices_pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx);
#endif
graph = multi_devices_pass->Apply(std::move(graph));
// Apply a graph print pass to record a graph with device info.
if (!strategy.debug_graphviz_path_.empty()) {
auto multi_devices_print_pass =
ir::PassRegistry::Instance().Get("multi_devices_print_pass");
multi_devices_print_pass->SetNotOwned<const std::string>(
"debug_graphviz_path", &strategy.debug_graphviz_path_);
multi_devices_print_pass->Set<details::GraphvizSSAGraphPrinter>(
"graph_printer", new details::GraphvizSSAGraphPrinter);
graph = multi_devices_print_pass->Apply(std::move(graph));
}
// Verify that the graph is correct for multi-device executor.
auto multi_devices_check_pass =
ir::PassRegistry::Instance().Get("multi_devices_check_pass");
graph = multi_devices_check_pass->Apply(std::move(graph));
return graph;
}
class ParallelExecutorPrivate { class ParallelExecutorPrivate {
public: public:
explicit ParallelExecutorPrivate(const std::vector<platform::Place> &places) explicit ParallelExecutorPrivate(const std::vector<platform::Place> &places)
...@@ -199,10 +123,9 @@ ParallelExecutor::ParallelExecutor( ...@@ -199,10 +123,9 @@ ParallelExecutor::ParallelExecutor(
// Step 3. Convert main_program to SSA form and dependency graph. Also, insert // Step 3. Convert main_program to SSA form and dependency graph. Also, insert
// ncclOp // ncclOp
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
std::unique_ptr<ir::Graph> graph = ApplyParallelExecutorPass( std::unique_ptr<ir::Graph> graph = build_strategy.Apply(
main_program, member_->places_, loss_var_name, params, main_program, member_->places_, loss_var_name, params,
member_->local_scopes_, member_->use_cuda_, build_strategy, member_->local_scopes_, member_->use_cuda_, member_->nccl_ctxs_.get());
member_->nccl_ctxs_.get());
auto max_memory_size = GetEagerDeletionThreshold(); auto max_memory_size = GetEagerDeletionThreshold();
if (max_memory_size >= 0) { if (max_memory_size >= 0) {
...@@ -228,11 +151,19 @@ ParallelExecutor::ParallelExecutor( ...@@ -228,11 +151,19 @@ ParallelExecutor::ParallelExecutor(
} }
} }
#else #else
std::unique_ptr<ir::Graph> graph = ApplyParallelExecutorPass( std::unique_ptr<ir::Graph> graph =
main_program, member_->places_, loss_var_name, params, build_strategy.Apply(main_program, member_->places_, loss_var_name,
member_->local_scopes_, member_->use_cuda_, build_strategy); params, member_->local_scopes_, member_->use_cuda_);
#endif #endif
if (VLOG_IS_ON(5)) {
// If the loss_var_name is given, the number of graph should be only one.
if (loss_var_name.size()) {
PADDLE_ENFORCE_EQ(ir::GraphNum(*graph), 1,
"The number of graph should be only one");
}
}
if (exec_strategy.type_ == ExecutionStrategy::kDefault) { if (exec_strategy.type_ == ExecutionStrategy::kDefault) {
member_->executor_.reset(new details::ThreadedSSAGraphExecutor( member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
exec_strategy, member_->local_scopes_, places, std::move(graph))); exec_strategy, member_->local_scopes_, places, std::move(graph)));
...@@ -319,6 +250,13 @@ void ParallelExecutor::Run(const std::vector<std::string> &fetch_tensors, ...@@ -319,6 +250,13 @@ void ParallelExecutor::Run(const std::vector<std::string> &fetch_tensors,
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
if (!gcs_.empty()) { if (!gcs_.empty()) {
ResetReferenceCount(); ResetReferenceCount();
for (auto &pair : cur_ref_cnts_) {
auto &name_map = *(pair.second);
for (auto &fetch_name : fetch_tensors) {
name_map.erase(fetch_name);
}
name_map.erase(fetched_var_name);
}
} }
#endif #endif
auto fetch_data = member_->executor_->Run(fetch_tensors); auto fetch_data = member_->executor_->Run(fetch_tensors);
...@@ -373,12 +311,6 @@ ParallelExecutor::~ParallelExecutor() { ...@@ -373,12 +311,6 @@ ParallelExecutor::~ParallelExecutor() {
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
USE_PASS(fuse_elewise_add_act_pass);
USE_PASS(graph_viz_pass);
USE_PASS(multi_devices_pass);
USE_PASS(multi_devices_check_pass);
USE_PASS(multi_devices_print_pass);
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
USE_PASS(reference_count_pass); USE_PASS(reference_count_pass);
#endif #endif
...@@ -14,14 +14,14 @@ limitations under the License. */ ...@@ -14,14 +14,14 @@ limitations under the License. */
#pragma once #pragma once
#include <paddle/fluid/framework/details/build_strategy.h>
#include <atomic> #include <atomic>
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include <unordered_set> #include <unordered_set>
#include <vector> #include <vector>
#include "paddle/fluid/framework/details/build_strategy.h"
#include "paddle/fluid/framework/details/execution_strategy.h" #include "paddle/fluid/framework/details/execution_strategy.h"
#include "paddle/fluid/framework/details/multi_devices_graph_pass.h"
#include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/program_desc.h"
......
...@@ -17,7 +17,6 @@ limitations under the License. */ ...@@ -17,7 +17,6 @@ limitations under the License. */
#include <stdexcept> #include <stdexcept>
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/channel.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/var_desc.h" #include "paddle/fluid/framework/var_desc.h"
......
...@@ -88,13 +88,7 @@ std::vector<std::vector<int64_t>> VarDesc::GetShapes() const { ...@@ -88,13 +88,7 @@ std::vector<std::vector<int64_t>> VarDesc::GetShapes() const {
} }
void VarDesc::SetDataType(proto::VarType::Type data_type) { void VarDesc::SetDataType(proto::VarType::Type data_type) {
switch (desc_.type().type()) { mutable_tensor_desc()->set_data_type(data_type);
case proto::VarType::CHANNEL:
mutable_channel_desc()->set_data_type(data_type);
break;
default:
mutable_tensor_desc()->set_data_type(data_type);
}
} }
void VarDesc::SetDataTypes( void VarDesc::SetDataTypes(
...@@ -115,13 +109,7 @@ void VarDesc::SetDataTypes( ...@@ -115,13 +109,7 @@ void VarDesc::SetDataTypes(
} }
proto::VarType::Type VarDesc::GetDataType() const { proto::VarType::Type VarDesc::GetDataType() const {
switch (desc_.type().type()) { return tensor_desc().data_type();
case proto::VarType::CHANNEL:
return channel_desc().data_type();
break;
default:
return tensor_desc().data_type();
}
} }
std::vector<proto::VarType::Type> VarDesc::GetDataTypes() const { std::vector<proto::VarType::Type> VarDesc::GetDataTypes() const {
...@@ -134,17 +122,6 @@ std::vector<proto::VarType::Type> VarDesc::GetDataTypes() const { ...@@ -134,17 +122,6 @@ std::vector<proto::VarType::Type> VarDesc::GetDataTypes() const {
return res; return res;
} }
void VarDesc::SetCapacity(int64_t capacity) {
switch (desc_.type().type()) {
case proto::VarType::CHANNEL:
desc_.mutable_type()->mutable_channel()->set_capacity(capacity);
break;
default:
PADDLE_THROW("Setting 'capacity' is not supported by the type of var %s.",
this->Name());
}
}
void VarDesc::SetLoDLevel(int32_t lod_level) { void VarDesc::SetLoDLevel(int32_t lod_level) {
switch (desc_.type().type()) { switch (desc_.type().type()) {
case proto::VarType::LOD_TENSOR: case proto::VarType::LOD_TENSOR:
...@@ -214,19 +191,6 @@ std::vector<int32_t> VarDesc::GetLoDLevels() const { ...@@ -214,19 +191,6 @@ std::vector<int32_t> VarDesc::GetLoDLevels() const {
} }
} }
const proto::VarType::ChannelDesc &VarDesc::channel_desc() const {
PADDLE_ENFORCE(desc_.has_type(), "The var's type hasn't been set.");
PADDLE_ENFORCE(desc_.type().has_type(), "The var type hasn't been set.");
switch (desc_.type().type()) {
case proto::VarType::CHANNEL:
return desc_.type().channel();
default:
PADDLE_THROW(
"Getting 'channel_desc' is not supported by the type of var %s.",
this->Name());
}
}
const proto::VarType::TensorDesc &VarDesc::tensor_desc() const { const proto::VarType::TensorDesc &VarDesc::tensor_desc() const {
PADDLE_ENFORCE(desc_.has_type(), "The var's type hasn't been set."); PADDLE_ENFORCE(desc_.has_type(), "The var's type hasn't been set.");
PADDLE_ENFORCE(desc_.type().has_type(), "The var type hasn't been set."); PADDLE_ENFORCE(desc_.type().has_type(), "The var type hasn't been set.");
...@@ -262,20 +226,6 @@ std::vector<proto::VarType::TensorDesc> VarDesc::tensor_descs() const { ...@@ -262,20 +226,6 @@ std::vector<proto::VarType::TensorDesc> VarDesc::tensor_descs() const {
} }
} }
proto::VarType::ChannelDesc *VarDesc::mutable_channel_desc() {
PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set.");
PADDLE_ENFORCE(desc_.type().has_type(), "The var type hasn't been set.");
switch (desc_.type().type()) {
case proto::VarType::CHANNEL:
return desc_.mutable_type()->mutable_channel();
default:
PADDLE_THROW(
"Getting 'mutable_channel_desc' is not supported by the type of var "
"%s.",
this->Name());
}
}
proto::VarType::TensorDesc *VarDesc::mutable_tensor_desc() { proto::VarType::TensorDesc *VarDesc::mutable_tensor_desc() {
PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set."); PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set.");
PADDLE_ENFORCE(desc_.type().has_type(), "The var type hasn't been set."); PADDLE_ENFORCE(desc_.type().has_type(), "The var type hasn't been set.");
......
...@@ -87,8 +87,6 @@ class VarDesc { ...@@ -87,8 +87,6 @@ class VarDesc {
void SetDataTypes( void SetDataTypes(
const std::vector<proto::VarType::Type> &multiple_data_type); const std::vector<proto::VarType::Type> &multiple_data_type);
void SetCapacity(int64_t capacity);
proto::VarType::Type GetDataType() const; proto::VarType::Type GetDataType() const;
std::vector<proto::VarType::Type> GetDataTypes() const; std::vector<proto::VarType::Type> GetDataTypes() const;
...@@ -110,10 +108,8 @@ class VarDesc { ...@@ -110,10 +108,8 @@ class VarDesc {
void SetPersistable(bool persistable) { desc_.set_persistable(persistable); } void SetPersistable(bool persistable) { desc_.set_persistable(persistable); }
private: private:
const proto::VarType::ChannelDesc &channel_desc() const;
const proto::VarType::TensorDesc &tensor_desc() const; const proto::VarType::TensorDesc &tensor_desc() const;
std::vector<proto::VarType::TensorDesc> tensor_descs() const; std::vector<proto::VarType::TensorDesc> tensor_descs() const;
proto::VarType::ChannelDesc *mutable_channel_desc();
proto::VarType::TensorDesc *mutable_tensor_desc(); proto::VarType::TensorDesc *mutable_tensor_desc();
std::vector<proto::VarType::TensorDesc *> mutable_tensor_descs(); std::vector<proto::VarType::TensorDesc *> mutable_tensor_descs();
......
...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include "paddle/fluid/framework/channel.h"
#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/lod_rank_table.h" #include "paddle/fluid/framework/lod_rank_table.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
...@@ -41,8 +40,6 @@ inline proto::VarType::Type ToVarType(std::type_index type) { ...@@ -41,8 +40,6 @@ inline proto::VarType::Type ToVarType(std::type_index type) {
return proto::VarType_Type_SELECTED_ROWS; return proto::VarType_Type_SELECTED_ROWS;
} else if (IsType<ReaderHolder>(type)) { } else if (IsType<ReaderHolder>(type)) {
return proto::VarType_Type_READER; return proto::VarType_Type_READER;
} else if (IsType<ChannelHolder>(type)) {
return proto::VarType_Type_CHANNEL;
} else { } else {
PADDLE_THROW("ToVarType:Unsupported type %s", type.name()); PADDLE_THROW("ToVarType:Unsupported type %s", type.name());
} }
...@@ -66,9 +63,6 @@ inline void VisitVarType(const framework::Variable& var, Visitor visitor) { ...@@ -66,9 +63,6 @@ inline void VisitVarType(const framework::Variable& var, Visitor visitor) {
case proto::VarType_Type_READER: case proto::VarType_Type_READER:
visitor(var.Get<ReaderHolder>()); visitor(var.Get<ReaderHolder>());
return; return;
case proto::VarType_Type_CHANNEL:
visitor(var.Get<ChannelHolder>());
return;
default: default:
PADDLE_THROW("Not supported visit type, %d", ToVarType(var.Type())); PADDLE_THROW("Not supported visit type, %d", ToVarType(var.Type()));
} }
......
...@@ -20,7 +20,8 @@ cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api) ...@@ -20,7 +20,8 @@ cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api)
add_subdirectory(api) add_subdirectory(api)
# Create static library # Create static library
cc_library(paddle_fluid DEPS ${fluid_modules} paddle_fluid_api paddle_inference_api analysis_predictor) cc_library(paddle_fluid DEPS ${fluid_modules} paddle_fluid_api paddle_inference_api
analysis_predictor zero_copy_tensor)
if(NOT APPLE) if(NOT APPLE)
# TODO(liuyiqu: Temporarily disable the link flag because it is not support on Mac. # TODO(liuyiqu: Temporarily disable the link flag because it is not support on Mac.
set(LINK_FLAGS "-Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/paddle_fluid.sym") set(LINK_FLAGS "-Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/paddle_fluid.sym")
...@@ -31,6 +32,7 @@ endif() ...@@ -31,6 +32,7 @@ endif()
cc_library(paddle_fluid_shared SHARED cc_library(paddle_fluid_shared SHARED
SRCS io.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api_impl.cc SRCS io.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api_impl.cc
${CMAKE_CURRENT_SOURCE_DIR}/api/analysis_predictor.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/analysis_predictor.cc
${CMAKE_CURRENT_SOURCE_DIR}/api/details/zero_copy_tensor.cc
DEPS ${fluid_modules} paddle_fluid_api) DEPS ${fluid_modules} paddle_fluid_api)
set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid) set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid)
...@@ -53,7 +55,7 @@ if(NOT APPLE) ...@@ -53,7 +55,7 @@ if(NOT APPLE)
endif() endif()
if(WITH_TESTING) if(WITH_TESTING)
# tests/book depends the models that generated by python/paddle/fluid/tests/book # tests/book depends the models that generated by python/paddle/fluid/tests/book
add_subdirectory(tests/book) add_subdirectory(tests/book)
if(WITH_INFERENCE_API_TEST) if(WITH_INFERENCE_API_TEST)
add_subdirectory(tests/api) add_subdirectory(tests/api)
......
cc_library(ir_pass_manager SRCS ir_pass_manager.cc DEPS graph pass) cc_library(ir_pass_manager SRCS ir_pass_manager.cc DEPS graph pass)
set(analysis_deps set(analysis_deps
framework_proto proto_desc ir_pass_manager graph pass paddle_fluid_api executor pretty_log) framework_proto proto_desc ir_pass_manager graph pass paddle_fluid_api executor pretty_log)
cc_library(analysis SRCS pass_manager.cc node.cc data_flow_graph.cc graph_traits.cc subgraph_splitter.cc cc_library(analysis SRCS pass_manager.cc node.cc data_flow_graph.cc graph_traits.cc subgraph_splitter.cc
analyzer.cc analyzer.cc
......
...@@ -41,12 +41,6 @@ class AnalysisPass { ...@@ -41,12 +41,6 @@ class AnalysisPass {
// all passes have run. // all passes have run.
virtual bool Finalize() { return false; } virtual bool Finalize() { return false; }
// Get a Pass appropriate to print the Node this pass operates on.
virtual AnalysisPass *CreatePrinterPass(std::ostream &os,
const std::string &banner) const {
return nullptr;
}
// Create a debugger Pass that draw the DFG by graphviz toolkit. // Create a debugger Pass that draw the DFG by graphviz toolkit.
virtual AnalysisPass *CreateGraphvizDebugerPass() const { return nullptr; } virtual AnalysisPass *CreateGraphvizDebugerPass() const { return nullptr; }
......
...@@ -64,14 +64,15 @@ class Analyzer : public OrderedRegistry<PassManager> { ...@@ -64,14 +64,15 @@ class Analyzer : public OrderedRegistry<PassManager> {
// larger fusion. // larger fusion.
const std::vector<std::string> all_ir_passes_{{ const std::vector<std::string> all_ir_passes_{{
// Manual update the passes here. // Manual update the passes here.
"infer_clean_graph_pass", // "infer_clean_graph_pass", //
"attention_lstm_fuse_pass", // "attention_lstm_fuse_pass", //
"fc_lstm_fuse_pass", // "embedding_fc_lstm_fuse_pass", //
"mul_lstm_fuse_pass", // "fc_lstm_fuse_pass", //
"fc_gru_fuse_pass", // "mul_lstm_fuse_pass", //
"mul_gru_fuse_pass", // "fc_gru_fuse_pass", //
"seq_concat_fc_fuse_pass", // "mul_gru_fuse_pass", //
"fc_fuse_pass", // "seq_concat_fc_fuse_pass", //
"fc_fuse_pass", //
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
"conv_relu_mkldnn_fuse_pass", // "conv_relu_mkldnn_fuse_pass", //
#endif #endif
......
...@@ -18,10 +18,10 @@ if(APPLE) ...@@ -18,10 +18,10 @@ if(APPLE)
endif(APPLE) endif(APPLE)
set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager ${GLOB_PASS_LIB}) set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager naive_executor ${GLOB_PASS_LIB})
if(WITH_GPU AND TENSORRT_FOUND) if(WITH_GPU AND TENSORRT_FOUND)
set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine) set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine analysis_predictor)
endif() endif()
function(inference_api_test TARGET_NAME) function(inference_api_test TARGET_NAME)
...@@ -43,8 +43,10 @@ function(inference_api_test TARGET_NAME) ...@@ -43,8 +43,10 @@ function(inference_api_test TARGET_NAME)
endif(WITH_TESTING) endif(WITH_TESTING)
endfunction(inference_api_test) endfunction(inference_api_test)
cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor) cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope)
cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis) cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor)
cc_library(zero_copy_tensor SRCS details/zero_copy_tensor.cc DEPS paddle_inference_api)
cc_library(zero_copy_tensor_dummy SRCS details/zero_copy_tensor_dummy.cc DEPS paddle_inference_api)
cc_test(test_paddle_inference_api cc_test(test_paddle_inference_api
SRCS api_tester.cc SRCS api_tester.cc
DEPS paddle_inference_api) DEPS paddle_inference_api)
...@@ -52,18 +54,22 @@ cc_test(test_paddle_inference_api ...@@ -52,18 +54,22 @@ cc_test(test_paddle_inference_api
inference_api_test(test_api_impl SRC api_impl_tester.cc inference_api_test(test_api_impl SRC api_impl_tester.cc
ARGS test_word2vec test_image_classification) ARGS test_word2vec test_image_classification)
set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests)
cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor ${inference_deps} paddle_inference_api
ARGS --dirname=${PYTHON_TESTS_DIR}/book)
if(WITH_GPU AND TENSORRT_FOUND) if(WITH_GPU AND TENSORRT_FOUND)
cc_library(paddle_inference_tensorrt_subgraph_engine cc_library(paddle_inference_tensorrt_subgraph_engine
SRCS api_tensorrt_subgraph_engine.cc SRCS api_tensorrt_subgraph_engine.cc
DEPS paddle_inference_api analysis tensorrt_engine paddle_inference_api paddle_fluid_api tensorrt_converter) DEPS paddle_inference_api analysis tensorrt_engine paddle_inference_api paddle_fluid_api tensorrt_converter zero_copy_tensor_dummy)
inference_api_test(test_api_tensorrt_subgraph_engine SRC api_tensorrt_subgraph_engine_tester.cc ARGS test_word2vec) inference_api_test(test_api_tensorrt_subgraph_engine SRC api_tensorrt_subgraph_engine_tester.cc ARGS test_word2vec)
endif() endif()
if (WITH_ANAKIN AND WITH_MKL) # only needed in CI if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
# compile the libinference_anakin_api.a and anakin.so. # compile the libinference_anakin_api.a and anakin.so.
cc_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber mklml) cc_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber mklml scope zero_copy_tensor_dummy)
cc_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber) cc_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber scope)
function(anakin_target target_name) function(anakin_target target_name)
target_compile_options(${target_name} BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) target_compile_options(${target_name} BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
endfunction() endfunction()
......
...@@ -16,9 +16,12 @@ ...@@ -16,9 +16,12 @@
#include <memory> #include <memory>
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h" #include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/utils/singleton.h" #include "paddle/fluid/inference/utils/singleton.h"
...@@ -28,8 +31,11 @@ DECLARE_bool(profile); ...@@ -28,8 +31,11 @@ DECLARE_bool(profile);
namespace paddle { namespace paddle {
using contrib::AnalysisConfig;
bool AnalysisPredictor::Init( bool AnalysisPredictor::Init(
const std::shared_ptr<framework::Scope>& parent_scope) { const std::shared_ptr<framework::Scope> &parent_scope,
const std::shared_ptr<framework::ProgramDesc> &program) {
VLOG(3) << "Predictor::init()"; VLOG(3) << "Predictor::init()";
#if !defined(_WIN32) #if !defined(_WIN32)
if (FLAGS_profile) { if (FLAGS_profile) {
...@@ -43,7 +49,8 @@ bool AnalysisPredictor::Init( ...@@ -43,7 +49,8 @@ bool AnalysisPredictor::Init(
if (config_.use_gpu) { if (config_.use_gpu) {
place_ = paddle::platform::CUDAPlace(config_.device); place_ = paddle::platform::CUDAPlace(config_.device);
LOG(WARNING) << "ir optimize only supports CPU currently"; LOG(WARNING) << "ir optimize only supports CPU currently, enable_ir_optim "
"is turned false.";
config_.enable_ir_optim = false; config_.enable_ir_optim = false;
} else { } else {
place_ = paddle::platform::CPUPlace(); place_ = paddle::platform::CPUPlace();
...@@ -56,37 +63,134 @@ bool AnalysisPredictor::Init( ...@@ -56,37 +63,134 @@ bool AnalysisPredictor::Init(
scope_.reset(new paddle::framework::Scope()); scope_.reset(new paddle::framework::Scope());
} }
executor_.reset(new paddle::framework::Executor(place_)); executor_.reset(new paddle::framework::NaiveExecutor(place_));
// Initialize the inference program if (!program) {
if (!config_.model_dir.empty()) { if (!LoadProgramDesc()) return false;
// Parameters are saved in separate files sited in OptimizeInferenceProgram();
// the specified `dirname`.
inference_program_ = paddle::inference::Load(executor_.get(), scope_.get(),
config_.model_dir);
} else if (!config_.prog_file.empty() && !config_.param_file.empty()) {
// All parameters are saved in a single file.
// The file names should be consistent with that used
// in Python API `fluid.io.save_inference_model`.
inference_program_ = paddle::inference::Load(
executor_.get(), scope_.get(), config_.prog_file, config_.param_file);
} else { } else {
LOG(ERROR) << "fail to load inference model from " << config_.model_dir; inference_program_ = program;
}
executor_->Prepare(scope_.get(), *inference_program_, 0,
config_.use_feed_fetch_ops);
// Get the feed_target_names and fetch_target_names
PrepareFeedFetch();
return true;
}
bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
std::vector<PaddleTensor> *output_data,
int batch_size) {
VLOG(3) << "Predictor::predict";
inference::Timer timer;
timer.tic();
// set feed variable
std::vector<framework::LoDTensor> feeds;
framework::Scope *scope = sub_scope_ ? sub_scope_ : scope_.get();
if (!SetFeed(inputs, scope)) {
LOG(ERROR) << "fail to set feed";
return false; return false;
} }
// Run the inference program
// if share variables, we need not create variables
executor_->Run();
OptimizeInferenceProgram(); // get fetch variable
if (config_._use_mkldnn) { if (!GetFetch(output_data, scope)) {
executor_->EnableMKLDNN(*inference_program_); LOG(ERROR) << "fail to get fetches";
return false;
} }
ctx_ = executor_->Prepare(*inference_program_, 0); VLOG(3) << "predict cost: " << timer.toc() << "ms";
return true;
}
VLOG(5) << "to create variables"; bool AnalysisPredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
PADDLE_ENFORCE(scope_.get()); framework::Scope *scope) {
executor_->CreateVariables(*inference_program_, VLOG(3) << "Predictor::set_feed";
sub_scope_ ? sub_scope_ : scope_.get(), 0); if (inputs.size() != feeds_.size()) {
// Get the feed_target_names and fetch_target_names LOG(ERROR) << "wrong feed input size, need " << feeds_.size() << " but get "
PrepareFeedFetch(); << inputs.size();
return false;
}
// Cache the inputs memory for better concurrency performance.
feed_tensors_.resize(inputs.size());
for (size_t i = 0; i < inputs.size(); ++i) {
auto &input = feed_tensors_[i];
framework::DDim ddim = framework::make_ddim(inputs[i].shape);
void *input_ptr;
if (inputs[i].dtype == PaddleDType::INT64) {
input_ptr = input.mutable_data<int64_t>(ddim, platform::CPUPlace());
} else if (inputs[i].dtype == PaddleDType::FLOAT32) {
input_ptr = input.mutable_data<float>(ddim, platform::CPUPlace());
} else {
LOG(ERROR) << "unsupported feed type " << inputs[i].dtype;
return false;
}
// TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),
inputs[i].data.length());
// TODO(Superjomn) Low performance, need optimization for heavy LoD copy.
framework::LoD lod;
for (auto &level : inputs[i].lod) {
lod.emplace_back(level);
}
input.set_lod(lod);
int idx = -1;
if (config_.specify_input_name) {
idx = feed_names_[inputs[i].name];
} else {
idx = boost::get<int>(feeds_[i]->GetAttr("col"));
}
framework::SetFeedVariable(scope, input, "feed", idx);
}
return true;
}
template <typename T>
void AnalysisPredictor::GetFetchOne(const framework::LoDTensor &fetch,
PaddleTensor *output) {
// set shape.
auto shape = framework::vectorize(fetch.dims());
output->shape.assign(shape.begin(), shape.end());
// set data.
const T *data = fetch.data<T>();
int num_elems = inference::VecReduceToInt(shape);
output->data.Resize(num_elems * sizeof(T));
// The fetched tensor output by fetch op, should always in CPU memory, so just
// copy.
memcpy(output->data.data(), data, num_elems * sizeof(T));
// set lod
output->lod.clear();
for (auto &level : fetch.lod()) {
output->lod.emplace_back(level.begin(), level.end());
}
}
bool AnalysisPredictor::GetFetch(std::vector<PaddleTensor> *outputs,
framework::Scope *scope) {
VLOG(3) << "Predictor::get_fetch";
outputs->resize(fetchs_.size());
for (size_t i = 0; i < fetchs_.size(); ++i) {
int idx = boost::get<int>(fetchs_[i]->GetAttr("col"));
PADDLE_ENFORCE((size_t)idx == i);
framework::LoDTensor &fetch =
framework::GetFetchVariable(*scope, "fetch", idx);
auto type = fetch.type();
auto output = &(outputs->at(i));
if (type == typeid(float)) {
GetFetchOne<float>(fetch, output);
output->dtype = PaddleDType::FLOAT32;
} else if (type == typeid(int64_t)) {
GetFetchOne<int64_t>(fetch, output);
output->dtype = PaddleDType::INT64;
} else {
LOG(ERROR) << "unknown type, only support float32 and int64 now.";
}
}
return true; return true;
} }
...@@ -107,6 +211,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() { ...@@ -107,6 +211,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
new std::string(config_.prog_file)); new std::string(config_.prog_file));
argument_.fluid_model_param_path.reset(new std::string(config_.param_file)); argument_.fluid_model_param_path.reset(new std::string(config_.param_file));
} }
argument_.origin_program_desc.reset( argument_.origin_program_desc.reset(
new ProgramDesc(*inference_program_->Proto())); new ProgramDesc(*inference_program_->Proto()));
PADDLE_ENFORCE( PADDLE_ENFORCE(
...@@ -127,9 +232,8 @@ void AnalysisPredictor::OptimizeInferenceProgram() { ...@@ -127,9 +232,8 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
} }
template <> template <>
std::unique_ptr<PaddlePredictor> std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
CreatePaddlePredictor<contrib::AnalysisConfig, PaddleEngineKind::kAnalysis>( AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig &config) {
const contrib::AnalysisConfig& config) {
VLOG(3) << "create AnalysisConfig"; VLOG(3) << "create AnalysisConfig";
if (config.use_gpu) { if (config.use_gpu) {
// 1. GPU memeroy // 1. GPU memeroy
...@@ -150,15 +254,90 @@ CreatePaddlePredictor<contrib::AnalysisConfig, PaddleEngineKind::kAnalysis>( ...@@ -150,15 +254,90 @@ CreatePaddlePredictor<contrib::AnalysisConfig, PaddleEngineKind::kAnalysis>(
} }
std::unique_ptr<PaddlePredictor> predictor(new AnalysisPredictor(config)); std::unique_ptr<PaddlePredictor> predictor(new AnalysisPredictor(config));
if (!dynamic_cast<AnalysisPredictor*>(predictor.get())->Init(nullptr)) { if (!dynamic_cast<AnalysisPredictor *>(predictor.get())->Init(nullptr)) {
return nullptr; return nullptr;
} }
return predictor; return predictor;
} }
void AnalysisPredictor::PrepareFeedFetch() {
for (auto *op : inference_program_->Block(0).AllOps()) {
if (op->Type() == "feed") {
int idx = boost::get<int>(op->GetAttr("col"));
if (feeds_.size() <= static_cast<size_t>(idx)) {
feeds_.resize(idx + 1);
}
feeds_[idx] = op;
feed_names_[op->Output("Out")[0]] = idx;
} else if (op->Type() == "fetch") {
int idx = boost::get<int>(op->GetAttr("col"));
if (fetchs_.size() <= static_cast<size_t>(idx)) {
fetchs_.resize(idx + 1);
}
fetchs_[idx] = op;
}
}
}
std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetInputTensor(
const std::string &name) {
PADDLE_ENFORCE(executor_->scope()->FindVar(name), "no name called %s", name);
std::unique_ptr<ZeroCopyTensor> res(
new ZeroCopyTensor(static_cast<void *>(executor_->scope())));
res->input_or_output_ = true;
res->SetName(name);
return res;
}
std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(
const std::string &name) {
PADDLE_ENFORCE(executor_->scope()->FindVar(name), "no name called %s", name);
std::unique_ptr<ZeroCopyTensor> res(
new ZeroCopyTensor(static_cast<void *>(executor_->scope())));
res->input_or_output_ = false;
res->SetName(name);
return res;
}
bool AnalysisPredictor::ZeroCopyRun() {
executor_->Run();
return true;
}
bool AnalysisPredictor::LoadProgramDesc() {
// Initialize the inference program
std::unique_ptr<framework::Executor> tmp_exe(
new framework::Executor(platform::CPUPlace()));
if (!config_.model_dir.empty()) {
// Parameters are saved in separate files sited in
// the specified `dirname`.
inference_program_ = paddle::inference::Load(
static_cast<framework::Executor *>(tmp_exe.get()), scope_.get(),
config_.model_dir);
} else if (!config_.prog_file.empty() && !config_.param_file.empty()) {
// All parameters are saved in a single file.
// The file names should be consistent with that used
// in Python API `fluid.io.save_inference_model`.
inference_program_ = paddle::inference::Load(
static_cast<framework::Executor *>(tmp_exe.get()), scope_.get(),
config_.prog_file, config_.param_file);
} else {
LOG(ERROR) << string::Sprintf(
"not valid model path '%s' or program path '%s'.", config_.model_dir,
config_.param_file);
return false;
}
return true;
}
std::unique_ptr<PaddlePredictor> AnalysisPredictor::Clone() {
auto *x = new AnalysisPredictor(config_);
x->Init(scope_, inference_program_);
return std::unique_ptr<PaddlePredictor>(x);
}
template <> template <>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<contrib::AnalysisConfig>( std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<contrib::AnalysisConfig>(
const contrib::AnalysisConfig& config) { const contrib::AnalysisConfig &config) {
return CreatePaddlePredictor<contrib::AnalysisConfig, return CreatePaddlePredictor<contrib::AnalysisConfig,
PaddleEngineKind::kAnalysis>(config); PaddleEngineKind::kAnalysis>(config);
} }
......
...@@ -12,42 +12,81 @@ ...@@ -12,42 +12,81 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#pragma once
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/api/api_impl.h" #include "paddle/fluid/inference/api/api_impl.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/string/printf.h"
namespace paddle { namespace paddle {
using inference::analysis::Argument; using inference::analysis::Argument;
using inference::analysis::Analyzer; using inference::analysis::Analyzer;
using framework::proto::ProgramDesc; using framework::proto::ProgramDesc;
using framework::NaiveExecutor;
using contrib::AnalysisConfig;
/* This predictor is based on the original native predictor with IR and Analysis /* This predictor is based on the original native predictor with IR and Analysis
* support. It will optimize IR and Parameters in the runtime. * support. It will optimize IR and Parameters in the runtime.
* TODO(Superjomn) Replace the Navive predictor? * TODO(Superjomn) Replace the Navive predictor?
*/ */
class AnalysisPredictor : public NativePaddlePredictor { class AnalysisPredictor : public PaddlePredictor {
public: public:
explicit AnalysisPredictor(const contrib::AnalysisConfig& config) explicit AnalysisPredictor(const AnalysisConfig &config) : config_(config) {}
: NativePaddlePredictor(config), config_(config) {}
bool Init(const std::shared_ptr<framework::Scope>& parent_scope); bool Init(const std::shared_ptr<framework::Scope> &parent_scope,
const std::shared_ptr<framework::ProgramDesc> &program = nullptr);
bool Run(const std::vector<PaddleTensor>& inputs, bool Run(const std::vector<PaddleTensor> &inputs,
std::vector<PaddleTensor>* output_data, std::vector<PaddleTensor> *output_data,
int batch_size = -1) override { int batch_size = -1) override;
return NativePaddlePredictor::Run(inputs, output_data, batch_size);
} std::unique_ptr<ZeroCopyTensor> GetInputTensor(
const std::string &name) override;
std::unique_ptr<ZeroCopyTensor> GetOutputTensor(
const std::string &name) override;
bool ZeroCopyRun() override;
void PrepareFeedFetch();
void OptimizeInferenceProgram(); void OptimizeInferenceProgram();
Argument& analysis_argument() { return argument_; } Argument &analysis_argument() { return argument_; }
std::unique_ptr<PaddlePredictor> Clone() override;
framework::Scope *scope() { return executor_->scope(); }
framework::ProgramDesc &program() { return *inference_program_; }
protected:
bool LoadProgramDesc();
bool SetFeed(const std::vector<PaddleTensor> &input_datas,
framework::Scope *scope);
bool GetFetch(std::vector<PaddleTensor> *output_data,
framework::Scope *scope);
template <typename T>
void GetFetchOne(const framework::LoDTensor &fetchs,
PaddleTensor *output_data);
private: private:
contrib::AnalysisConfig config_; contrib::AnalysisConfig config_;
Argument argument_; Argument argument_;
std::unique_ptr<NaiveExecutor> executor_;
platform::Place place_;
std::shared_ptr<framework::Scope> scope_;
framework::Scope *sub_scope_{nullptr};
std::shared_ptr<framework::ProgramDesc> inference_program_;
std::vector<framework::OpDesc *> feeds_;
std::map<std::string, size_t> feed_names_;
std::vector<framework::OpDesc *> fetchs_;
// Memory buffer for feed inputs. The temporary LoDTensor will cause serious
// concurrency problems, so cache them.
std::vector<framework::LoDTensor> feed_tensors_;
}; };
} // namespace paddle } // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <glog/logging.h>
#include <gtest/gtest.h>
#include "paddle/fluid/inference/api/paddle_inference_api.h"
DEFINE_string(dirname, "", "dirname to tests.");
namespace paddle {
namespace inference {
using contrib::AnalysisConfig;
TEST(AnalysisPredictor, ZeroCopy) {
AnalysisConfig config;
config.model_dir = FLAGS_dirname + "/word2vec.inference.model";
config.use_feed_fetch_ops = false;
auto predictor =
CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
config);
auto w0 = predictor->GetInputTensor("firstw");
auto w1 = predictor->GetInputTensor("secondw");
auto w2 = predictor->GetInputTensor("thirdw");
auto w3 = predictor->GetInputTensor("forthw");
w0->Reshape({4, 1});
w1->Reshape({4, 1});
w2->Reshape({4, 1});
w3->Reshape({4, 1});
auto* w0_data = w0->mutable_data<int64_t>(PaddlePlace::kCPU);
auto* w1_data = w1->mutable_data<int64_t>(PaddlePlace::kCPU);
auto* w2_data = w2->mutable_data<int64_t>(PaddlePlace::kCPU);
auto* w3_data = w3->mutable_data<int64_t>(PaddlePlace::kCPU);
for (int i = 0; i < 4; i++) {
w0_data[i] = i;
w1_data[i] = i;
w2_data[i] = i;
w3_data[i] = i;
}
predictor->ZeroCopyRun();
auto out = predictor->GetOutputTensor("fc_1.tmp_2");
PaddlePlace place;
int size = 0;
auto* out_data = out->data<float>(&place, &size);
LOG(INFO) << "output size: " << size / sizeof(float);
LOG(INFO) << "output_data: " << out_data;
}
} // namespace inference
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. // Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); //
you may not use this file except in compliance with the License. // Licensed under the Apache License, Version 2.0 (the "License");
You may obtain a copy of the License at // you may not use this file except in compliance with the License.
http://www.apache.org/licenses/LICENSE-2.0 // You may obtain a copy of the License at
Unless required by applicable law or agreed to in writing, software //
distributed under the License is distributed on an "AS IS" BASIS, // http://www.apache.org/licenses/LICENSE-2.0
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
See the License for the specific language governing permissions and // Unless required by applicable law or agreed to in writing, software
limitations under the License. */ // distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle_inference_api.h"
namespace paddle { namespace paddle {
...@@ -26,7 +32,7 @@ int PaddleDtypeSize(PaddleDType dtype) { ...@@ -26,7 +32,7 @@ int PaddleDtypeSize(PaddleDType dtype) {
} }
} }
PaddleBuf::PaddleBuf(PaddleBuf&& other) PaddleBuf::PaddleBuf(PaddleBuf &&other)
: data_(other.data_), : data_(other.data_),
length_(other.length_), length_(other.length_),
memory_owned_(other.memory_owned_) { memory_owned_(other.memory_owned_) {
...@@ -35,9 +41,9 @@ PaddleBuf::PaddleBuf(PaddleBuf&& other) ...@@ -35,9 +41,9 @@ PaddleBuf::PaddleBuf(PaddleBuf&& other)
other.length_ = 0; other.length_ = 0;
} }
PaddleBuf::PaddleBuf(const PaddleBuf& other) { *this = other; } PaddleBuf::PaddleBuf(const PaddleBuf &other) { *this = other; }
PaddleBuf& PaddleBuf::operator=(const PaddleBuf& other) { PaddleBuf &PaddleBuf::operator=(const PaddleBuf &other) {
if (!other.memory_owned_) { if (!other.memory_owned_) {
data_ = other.data_; data_ = other.data_;
length_ = other.length_; length_ = other.length_;
...@@ -51,7 +57,7 @@ PaddleBuf& PaddleBuf::operator=(const PaddleBuf& other) { ...@@ -51,7 +57,7 @@ PaddleBuf& PaddleBuf::operator=(const PaddleBuf& other) {
return *this; return *this;
} }
PaddleBuf& PaddleBuf::operator=(PaddleBuf&& other) { PaddleBuf &PaddleBuf::operator=(PaddleBuf &&other) {
// only the buffer with external memory can be copied // only the buffer with external memory can be copied
data_ = other.data_; data_ = other.data_;
length_ = other.length_; length_ = other.length_;
...@@ -75,7 +81,7 @@ void PaddleBuf::Resize(size_t length) { ...@@ -75,7 +81,7 @@ void PaddleBuf::Resize(size_t length) {
} }
} }
void PaddleBuf::Reset(void* data, size_t length) { void PaddleBuf::Reset(void *data, size_t length) {
Free(); Free();
memory_owned_ = false; memory_owned_ = false;
data_ = data; data_ = data;
...@@ -85,7 +91,7 @@ void PaddleBuf::Reset(void* data, size_t length) { ...@@ -85,7 +91,7 @@ void PaddleBuf::Reset(void* data, size_t length) {
void PaddleBuf::Free() { void PaddleBuf::Free() {
if (memory_owned_ && data_) { if (memory_owned_ && data_) {
PADDLE_ENFORCE_GT(length_, 0); PADDLE_ENFORCE_GT(length_, 0);
free(static_cast<char*>(data_)); free(static_cast<char *>(data_));
data_ = nullptr; data_ = nullptr;
length_ = 0; length_ = 0;
} }
......
...@@ -23,7 +23,6 @@ limitations under the License. */ ...@@ -23,7 +23,6 @@ limitations under the License. */
#include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/inference/api/api_impl.h" #include "paddle/fluid/inference/api/api_impl.h"
#include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/timer.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
DEFINE_bool(profile, false, "Turn on profiler for fluid"); DEFINE_bool(profile, false, "Turn on profiler for fluid");
...@@ -145,7 +144,7 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs, ...@@ -145,7 +144,7 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
VLOG(4) << "Run prepared context"; VLOG(4) << "Run prepared context";
executor_->RunPreparedContext(ctx_.get(), scope, executor_->RunPreparedContext(ctx_.get(), scope,
false, /* don't create local scope each time*/ false, /* don't create local scope each time*/
false /* don't create variable eatch time */); false /* don't create variable each time */);
VLOG(4) << "Finish prepared context"; VLOG(4) << "Finish prepared context";
// get fetch variable // get fetch variable
if (!GetFetch(output_data, scope)) { if (!GetFetch(output_data, scope)) {
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
You may obtain a copy of the License at You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
...@@ -30,6 +30,8 @@ ...@@ -30,6 +30,8 @@
#include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/io.h" #include "paddle/fluid/inference/io.h"
#include "paddle/fluid/platform/init.h" #include "paddle/fluid/platform/init.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
...@@ -52,6 +54,8 @@ class NativePaddlePredictor : public PaddlePredictor { ...@@ -52,6 +54,8 @@ class NativePaddlePredictor : public PaddlePredictor {
~NativePaddlePredictor() override; ~NativePaddlePredictor() override;
framework::Scope *scope() { return sub_scope_ ? sub_scope_ : scope_.get(); }
protected: protected:
bool SetFeed(const std::vector<PaddleTensor> &input_datas, bool SetFeed(const std::vector<PaddleTensor> &input_datas,
framework::Scope *scope); framework::Scope *scope);
......
...@@ -21,6 +21,12 @@ limitations under the License. */ ...@@ -21,6 +21,12 @@ limitations under the License. */
#include "paddle/fluid/inference/api/api_impl.h" #include "paddle/fluid/inference/api/api_impl.h"
#include "paddle/fluid/inference/tests/test_helper.h" #include "paddle/fluid/inference/tests/test_helper.h"
#ifdef __clang__
#define ACC_DIFF 4e-3
#else
#define ACC_DIFF 1e-3
#endif
DEFINE_string(dirname, "", "Directory of the inference model."); DEFINE_string(dirname, "", "Directory of the inference model.");
namespace paddle { namespace paddle {
...@@ -43,7 +49,7 @@ PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) { ...@@ -43,7 +49,7 @@ PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) {
NativeConfig GetConfig() { NativeConfig GetConfig() {
NativeConfig config; NativeConfig config;
config.model_dir = FLAGS_dirname + "word2vec.inference.model"; config.model_dir = FLAGS_dirname + "/word2vec.inference.model";
LOG(INFO) << "dirname " << config.model_dir; LOG(INFO) << "dirname " << config.model_dir;
config.fraction_of_gpu_memory = 0.15; config.fraction_of_gpu_memory = 0.15;
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
...@@ -99,8 +105,8 @@ void MainWord2Vec(bool use_gpu) { ...@@ -99,8 +105,8 @@ void MainWord2Vec(bool use_gpu) {
float* lod_data = output1.data<float>(); float* lod_data = output1.data<float>();
for (int i = 0; i < output1.numel(); ++i) { for (int i = 0; i < output1.numel(); ++i) {
EXPECT_LT(lod_data[i] - data[i], 1e-3); EXPECT_LT(lod_data[i] - data[i], ACC_DIFF);
EXPECT_GT(lod_data[i] - data[i], -1e-3); EXPECT_GT(lod_data[i] - data[i], -ACC_DIFF);
} }
} }
...@@ -110,7 +116,7 @@ void MainImageClassification(bool use_gpu) { ...@@ -110,7 +116,7 @@ void MainImageClassification(bool use_gpu) {
NativeConfig config = GetConfig(); NativeConfig config = GetConfig();
config.use_gpu = use_gpu; config.use_gpu = use_gpu;
config.model_dir = config.model_dir =
FLAGS_dirname + "image_classification_resnet.inference.model"; FLAGS_dirname + "/image_classification_resnet.inference.model";
const bool is_combined = false; const bool is_combined = false;
std::vector<std::vector<int64_t>> feed_target_shapes = std::vector<std::vector<int64_t>> feed_target_shapes =
...@@ -144,7 +150,7 @@ void MainImageClassification(bool use_gpu) { ...@@ -144,7 +150,7 @@ void MainImageClassification(bool use_gpu) {
float* data = static_cast<float*>(outputs[0].data.data()); float* data = static_cast<float*>(outputs[0].data.data());
float* lod_data = output1.data<float>(); float* lod_data = output1.data<float>();
for (size_t j = 0; j < len / sizeof(float); ++j) { for (size_t j = 0; j < len / sizeof(float); ++j) {
EXPECT_NEAR(lod_data[j], data[j], 1e-3); EXPECT_NEAR(lod_data[j], data[j], ACC_DIFF);
} }
} }
...@@ -199,7 +205,7 @@ void MainThreadsWord2Vec(bool use_gpu) { ...@@ -199,7 +205,7 @@ void MainThreadsWord2Vec(bool use_gpu) {
float* ref_data = refs[tid].data<float>(); float* ref_data = refs[tid].data<float>();
EXPECT_EQ(refs[tid].numel(), static_cast<int64_t>(len / sizeof(float))); EXPECT_EQ(refs[tid].numel(), static_cast<int64_t>(len / sizeof(float)));
for (int i = 0; i < refs[tid].numel(); ++i) { for (int i = 0; i < refs[tid].numel(); ++i) {
EXPECT_NEAR(ref_data[i], data[i], 1e-3); EXPECT_NEAR(ref_data[i], data[i], ACC_DIFF);
} }
}); });
} }
...@@ -214,7 +220,7 @@ void MainThreadsImageClassification(bool use_gpu) { ...@@ -214,7 +220,7 @@ void MainThreadsImageClassification(bool use_gpu) {
NativeConfig config = GetConfig(); NativeConfig config = GetConfig();
config.use_gpu = use_gpu; config.use_gpu = use_gpu;
config.model_dir = config.model_dir =
FLAGS_dirname + "image_classification_resnet.inference.model"; FLAGS_dirname + "/image_classification_resnet.inference.model";
auto main_predictor = CreatePaddlePredictor<NativeConfig>(config); auto main_predictor = CreatePaddlePredictor<NativeConfig>(config);
std::vector<framework::LoDTensor> jobs(num_jobs); std::vector<framework::LoDTensor> jobs(num_jobs);
...@@ -251,7 +257,7 @@ void MainThreadsImageClassification(bool use_gpu) { ...@@ -251,7 +257,7 @@ void MainThreadsImageClassification(bool use_gpu) {
float* ref_data = refs[tid].data<float>(); float* ref_data = refs[tid].data<float>();
EXPECT_EQ((size_t)refs[tid].numel(), len / sizeof(float)); EXPECT_EQ((size_t)refs[tid].numel(), len / sizeof(float));
for (int i = 0; i < refs[tid].numel(); ++i) { for (int i = 0; i < refs[tid].numel(); ++i) {
EXPECT_NEAR(ref_data[i], data[i], 1e-3); EXPECT_NEAR(ref_data[i], data[i], ACC_DIFF);
} }
}); });
} }
......
...@@ -2,6 +2,9 @@ set -x ...@@ -2,6 +2,9 @@ set -x
PADDLE_ROOT=$1 PADDLE_ROOT=$1
TURN_ON_MKL=$2 # use MKL or Openblas TURN_ON_MKL=$2 # use MKL or Openblas
TEST_GPU_CPU=$3 # test both GPU/CPU mode or only CPU mode TEST_GPU_CPU=$3 # test both GPU/CPU mode or only CPU mode
DATA_DIR=$4 # dataset
cd `dirname $0`
current_dir=`pwd`
if [ $2 == ON ]; then if [ $2 == ON ]; then
# You can export yourself if move the install path # You can export yourself if move the install path
MKL_LIB=${PADDLE_ROOT}/build/fluid_install_dir/third_party/install/mklml/lib MKL_LIB=${PADDLE_ROOT}/build/fluid_install_dir/third_party/install/mklml/lib
...@@ -29,15 +32,15 @@ function download() { ...@@ -29,15 +32,15 @@ function download() {
fi fi
cd .. cd ..
} }
mkdir -p data mkdir -p $DATA_DIR
cd data cd $DATA_DIR
vis_demo_list='se_resnext50 ocr mobilenet' vis_demo_list='se_resnext50 ocr mobilenet'
for vis_demo_name in $vis_demo_list; do for vis_demo_name in $vis_demo_list; do
download $vis_demo_name download $vis_demo_name
done done
cd ..
# compile and test the demo # compile and test the demo
cd $current_dir
mkdir -p build mkdir -p build
cd build cd build
...@@ -73,9 +76,9 @@ for WITH_STATIC_LIB in ON OFF; do ...@@ -73,9 +76,9 @@ for WITH_STATIC_LIB in ON OFF; do
for use_gpu in $use_gpu_list; do for use_gpu in $use_gpu_list; do
for vis_demo_name in $vis_demo_list; do for vis_demo_name in $vis_demo_list; do
./vis_demo \ ./vis_demo \
--modeldir=../data/$vis_demo_name/model \ --modeldir=$DATA_DIR/$vis_demo_name/model \
--data=../data/$vis_demo_name/data.txt \ --data=$DATA_DIR/$vis_demo_name/data.txt \
--refer=../data/$vis_demo_name/result.txt \ --refer=$DATA_DIR/$vis_demo_name/result.txt \
--use_gpu=$use_gpu --use_gpu=$use_gpu
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
echo "vis demo $vis_demo_name runs fail." echo "vis demo $vis_demo_name runs fail."
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
void ZeroCopyTensor::Reshape(const std::vector<int> &shape) {
PADDLE_ENFORCE(!name_.empty(),
"Need to SetName first, so that the corresponding tensor can "
"be retrieved.");
PADDLE_ENFORCE(input_or_output_,
"Can't reshape the output tensor, it is readonly");
PADDLE_ENFORCE(scope_);
auto *scope = static_cast<framework::Scope *>(scope_);
auto *var = scope->FindVar(name_);
PADDLE_ENFORCE(var, "No tensor called [%s] in the runtime scope", name_);
auto *tensor = var->GetMutable<framework::LoDTensor>();
tensor->Resize(framework::make_ddim(shape));
}
template <typename T>
T *ZeroCopyTensor::mutable_data(PaddlePlace place) {
auto *tensor = static_cast<framework::LoDTensor *>(FindTensor());
switch (static_cast<int>(place)) {
case static_cast<int>(PaddlePlace::kCPU): {
return tensor->mutable_data<T>(platform::CPUPlace());
}
case static_cast<int>(PaddlePlace::kGPU): {
return tensor->mutable_data<T>(platform::CUDAPlace());
}
default:
PADDLE_THROW("Unsupported place: %d", static_cast<int>(place));
break;
}
return nullptr;
}
template <typename T>
T *ZeroCopyTensor::data(PaddlePlace *place, int *size) {
auto *tensor = static_cast<framework::LoDTensor *>(FindTensor());
auto *res = tensor->data<T>();
if (platform::is_cpu_place(tensor->place())) {
*place = PaddlePlace::kCPU;
} else if (platform::is_gpu_place(tensor->place())) {
*place = PaddlePlace::kGPU;
} else {
*place = PaddlePlace::kUNK;
}
*size = tensor->numel();
return res;
}
template float *ZeroCopyTensor::data<float>(PaddlePlace *place, int *size);
template int64_t *ZeroCopyTensor::data<int64_t>(PaddlePlace *place, int *size);
template float *ZeroCopyTensor::mutable_data<float>(PaddlePlace place);
template int64_t *ZeroCopyTensor::mutable_data<int64_t>(PaddlePlace place);
void *ZeroCopyTensor::FindTensor() const {
PADDLE_ENFORCE(!name_.empty(),
"Need to SetName first, so that the corresponding tensor can "
"be retrieved.");
PADDLE_ENFORCE(scope_);
auto *scope = static_cast<framework::Scope *>(scope_);
auto *var = scope->FindVar(name_);
PADDLE_ENFORCE(var, "No tensor called [%s] in the runtime scope", name_);
auto *tensor = var->GetMutable<framework::LoDTensor>();
return tensor;
}
std::vector<int64_t> ZeroCopyTensor::shape() {
auto *tensor = static_cast<framework::LoDTensor *>(FindTensor());
PADDLE_ENFORCE(tensor, "not found tensor called %s in the scope", name_);
return framework::vectorize(tensor->dims());
}
void ZeroCopyTensor::SetLoD(const std::vector<std::vector<size_t>> &x) {
auto *tensor = static_cast<framework::LoDTensor *>(FindTensor());
framework::LoD lod;
for (auto &level : x) {
lod.emplace_back(level);
}
tensor->set_lod(lod);
}
std::vector<std::vector<size_t>> ZeroCopyTensor::lod() const {
std::vector<std::vector<size_t>> res;
auto *tensor = static_cast<framework::LoDTensor *>(FindTensor());
for (auto &level : tensor->lod()) {
res.emplace_back(level);
}
return res;
}
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/api/paddle_inference_api.h"
namespace paddle {
void ZeroCopyTensor::Reshape(const std::vector<int> &shape) {}
template <typename T>
T *ZeroCopyTensor::mutable_data(PaddlePlace place) {
return nullptr;
}
template <typename T>
T *ZeroCopyTensor::data(PaddlePlace *place, int *size) {
return nullptr;
}
template float *ZeroCopyTensor::data<float>(PaddlePlace *place, int *size);
template int64_t *ZeroCopyTensor::data<int64_t>(PaddlePlace *place, int *size);
template float *ZeroCopyTensor::mutable_data(PaddlePlace place);
template int64_t *ZeroCopyTensor::mutable_data(PaddlePlace place);
void *ZeroCopyTensor::FindTensor() const { return nullptr; }
std::vector<int64_t> ZeroCopyTensor::shape() { return {}; }
void ZeroCopyTensor::SetLoD(const std::vector<std::vector<size_t>> &x) {}
std::vector<std::vector<size_t>> ZeroCopyTensor::lod() const {
return std::vector<std::vector<size_t>>();
}
} // namespace paddle
...@@ -16,17 +16,34 @@ ...@@ -16,17 +16,34 @@
#include <glog/logging.h> #include <glog/logging.h>
#include <sys/time.h> #include <sys/time.h>
#include <algorithm> #include <chrono> // NOLINT
#include <numeric> #include <numeric>
#include <sstream> #include <sstream>
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/string/printf.h"
#include "paddle/fluid/inference/api/timer.h" #include "paddle_inference_api.h"
namespace paddle { namespace paddle {
namespace inference { namespace inference {
// Timer for timer
class Timer {
public:
std::chrono::high_resolution_clock::time_point start;
std::chrono::high_resolution_clock::time_point startu;
void tic() { start = std::chrono::high_resolution_clock::now(); }
double toc() {
startu = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> time_span =
std::chrono::duration_cast<std::chrono::duration<double>>(startu -
start);
double used_time_ms = static_cast<double>(time_span.count()) * 1000.0;
return used_time_ms;
}
};
static void split(const std::string &str, char sep, static void split(const std::string &str, char sep,
std::vector<std::string> *pieces) { std::vector<std::string> *pieces) {
pieces->clear(); pieces->clear();
...@@ -93,6 +110,20 @@ static void TensorAssignData(PaddleTensor *tensor, ...@@ -93,6 +110,20 @@ static void TensorAssignData(PaddleTensor *tensor,
} }
} }
template <typename T>
static int ZeroCopyTensorAssignData(ZeroCopyTensor *tensor,
const std::vector<std::vector<T>> &data) {
int size{0};
auto *ptr = tensor->mutable_data<T>(PaddlePlace::kCPU);
int c = 0;
for (const auto &f : data) {
for (T v : f) {
ptr[c++] = v;
}
}
return size;
}
static std::string DescribeTensor(const PaddleTensor &tensor) { static std::string DescribeTensor(const PaddleTensor &tensor) {
std::stringstream os; std::stringstream os;
os << "Tensor [" << tensor.name << "]\n"; os << "Tensor [" << tensor.name << "]\n";
......
...@@ -101,6 +101,40 @@ struct PaddleTensor { ...@@ -101,6 +101,40 @@ struct PaddleTensor {
std::vector<std::vector<size_t>> lod; // Tensor+LoD equals LoDTensor std::vector<std::vector<size_t>> lod; // Tensor+LoD equals LoDTensor
}; };
enum class PaddlePlace { kUNK = -1, kCPU, kGPU };
// Tensor without copy, currently only supports AnalysisPredictor.
class ZeroCopyTensor {
public:
void Reshape(const std::vector<int>& shape);
// Get the memory in CPU or GPU with specific data type, should Reshape first
// to tell the data size.
// Once can directly call this data to feed the data.
// This is for write the input tensor.
template <typename T>
T* mutable_data(PaddlePlace place);
// Get the memory directly, will return the place and memory size by pointer.
// This is for reading the output tensor.
template <typename T>
T* data(PaddlePlace* place, int* size);
std::vector<int64_t> shape();
void SetLoD(const std::vector<std::vector<size_t>>& x);
std::vector<std::vector<size_t>> lod() const;
protected:
ZeroCopyTensor(void* scope) : scope_{scope} {}
void SetName(const std::string& name) { name_ = name; }
void* FindTensor() const;
private:
std::string name_;
bool input_or_output_;
friend class AnalysisPredictor;
void* scope_{nullptr};
};
/* /*
* A simple Inference API for Paddle. * A simple Inference API for Paddle.
*/ */
...@@ -120,6 +154,19 @@ class PaddlePredictor { ...@@ -120,6 +154,19 @@ class PaddlePredictor {
std::vector<PaddleTensor>* output_data, std::vector<PaddleTensor>* output_data,
int batch_size = -1) = 0; int batch_size = -1) = 0;
// Zero copy input and output optimization.
// Get the input or output tensors, and operate on their memory directly,
// without copy.
virtual std::unique_ptr<ZeroCopyTensor> GetInputTensor(
const std::string& name) {
return nullptr;
}
virtual std::unique_ptr<ZeroCopyTensor> GetOutputTensor(
const std::string& name) {
return nullptr;
}
virtual bool ZeroCopyRun() { return false; }
// Clone a predictor that share the model weights, the Cloned predictor should // Clone a predictor that share the model weights, the Cloned predictor should
// be thread-safe. // be thread-safe.
virtual std::unique_ptr<PaddlePredictor> Clone() = 0; virtual std::unique_ptr<PaddlePredictor> Clone() = 0;
...@@ -216,9 +263,13 @@ struct AnalysisConfig : public NativeConfig { ...@@ -216,9 +263,13 @@ struct AnalysisConfig : public NativeConfig {
bool enable_ir_optim = true; bool enable_ir_optim = true;
// Manually determine the IR passes to run. // Manually determine the IR passes to run.
IrPassMode ir_mode{IrPassMode::kExclude}; IrPassMode ir_mode{IrPassMode::kExclude};
std::vector<std::string> ir_passes; std::vector<std::string> ir_passes{"embedding_fc_lstm_fuse_pass"};
// NOT stable yet.
bool use_feed_fetch_ops{true};
// NOTE this is just for internal development, please not use it. // NOTE this is just for internal development, please not use it.
// NOT stable yet.
bool _use_mkldnn{false}; bool _use_mkldnn{false};
}; };
......
...@@ -22,7 +22,6 @@ limitations under the License. */ ...@@ -22,7 +22,6 @@ limitations under the License. */
#include <vector> #include <vector>
#include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/timer.h"
#include "utils/logger/logger.h" #include "utils/logger/logger.h"
DEFINE_string(model, "", "Directory of the inference model."); DEFINE_string(model, "", "Directory of the inference model.");
......
...@@ -18,6 +18,8 @@ namespace paddle { ...@@ -18,6 +18,8 @@ namespace paddle {
namespace inference { namespace inference {
namespace analysis { namespace analysis {
using contrib::AnalysisConfig;
struct DataRecord { struct DataRecord {
std::vector<int64_t> data; std::vector<int64_t> data;
std::vector<size_t> lod; std::vector<size_t> lod;
...@@ -78,6 +80,7 @@ struct DataRecord { ...@@ -78,6 +80,7 @@ struct DataRecord {
} }
} }
} }
DataRecord NextBatch() { DataRecord NextBatch() {
DataRecord data; DataRecord data;
data.data = batched_datas[batch_iter]; data.data = batched_datas[batch_iter];
...@@ -155,7 +158,9 @@ TEST(Analyzer_LAC, fuse_statis) { ...@@ -155,7 +158,9 @@ TEST(Analyzer_LAC, fuse_statis) {
SetConfig(&cfg); SetConfig(&cfg);
int num_ops; int num_ops;
auto fuse_statis = GetFuseStatis(cfg, &num_ops); auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
auto fuse_statis = GetFuseStatis(
static_cast<AnalysisPredictor *>(predictor.get()), &num_ops);
ASSERT_TRUE(fuse_statis.count("fc_fuse")); ASSERT_TRUE(fuse_statis.count("fc_fuse"));
ASSERT_TRUE(fuse_statis.count("fc_gru_fuse")); ASSERT_TRUE(fuse_statis.count("fc_gru_fuse"));
EXPECT_EQ(fuse_statis.at("fc_fuse"), 1); EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
......
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
namespace paddle { namespace paddle {
namespace inference { namespace inference {
using contrib::AnalysisConfig;
struct DataRecord { struct DataRecord {
std::vector<std::vector<int64_t>> word_data_all, mention_data_all; std::vector<std::vector<int64_t>> word_data_all, mention_data_all;
...@@ -145,7 +146,9 @@ TEST(Analyzer_Chinese_ner, fuse_statis) { ...@@ -145,7 +146,9 @@ TEST(Analyzer_Chinese_ner, fuse_statis) {
SetConfig(&cfg); SetConfig(&cfg);
int num_ops; int num_ops;
auto fuse_statis = GetFuseStatis(cfg, &num_ops); auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
auto fuse_statis = GetFuseStatis(
static_cast<AnalysisPredictor *>(predictor.get()), &num_ops);
ASSERT_TRUE(fuse_statis.count("fc_fuse")); ASSERT_TRUE(fuse_statis.count("fc_fuse"));
ASSERT_TRUE(fuse_statis.count("fc_gru_fuse")); ASSERT_TRUE(fuse_statis.count("fc_gru_fuse"));
EXPECT_EQ(fuse_statis.at("fc_fuse"), 1); EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
......
...@@ -14,10 +14,13 @@ ...@@ -14,10 +14,13 @@
#include "paddle/fluid/inference/tests/api/tester_helper.h" #include "paddle/fluid/inference/tests/api/tester_helper.h"
DEFINE_bool(with_precision_check, true, "turn on test");
namespace paddle { namespace paddle {
namespace inference { namespace inference {
using namespace framework; // NOLINT using namespace framework; // NOLINT
using namespace contrib; // NOLINT
struct DataRecord { struct DataRecord {
std::vector<std::vector<std::vector<float>>> link_step_data_all; std::vector<std::vector<std::vector<float>>> link_step_data_all;
...@@ -29,10 +32,12 @@ struct DataRecord { ...@@ -29,10 +32,12 @@ struct DataRecord {
size_t batch_iter{0}; size_t batch_iter{0};
size_t batch_size{1}; size_t batch_size{1};
DataRecord() = default; DataRecord() = default;
explicit DataRecord(const std::string &path, int batch_size = 1) explicit DataRecord(const std::string &path, int batch_size = 1)
: batch_size(batch_size) { : batch_size(batch_size) {
Load(path); Load(path);
} }
DataRecord NextBatch() { DataRecord NextBatch() {
DataRecord data; DataRecord data;
size_t batch_end = batch_iter + batch_size; size_t batch_end = batch_iter + batch_size;
...@@ -101,6 +106,7 @@ struct DataRecord { ...@@ -101,6 +106,7 @@ struct DataRecord {
num_samples = num_lines; num_samples = num_lines;
} }
}; };
void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data, void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
int batch_size) { int batch_size) {
PaddleTensor lod_attention_tensor, init_zero_tensor, lod_tensor_tensor, PaddleTensor lod_attention_tensor, init_zero_tensor, lod_tensor_tensor,
...@@ -149,7 +155,55 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data, ...@@ -149,7 +155,55 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
} }
} }
void SetConfig(contrib::AnalysisConfig *cfg) { void PrepareZeroCopyInputs(ZeroCopyTensor *lod_attention_tensor,
ZeroCopyTensor *cell_init_tensor,
ZeroCopyTensor *data_tensor,
ZeroCopyTensor *hidden_init_tensor,
ZeroCopyTensor *week_tensor,
ZeroCopyTensor *minute_tensor,
DataRecord *data_record, int batch_size) {
auto one_batch = data_record->NextBatch();
std::vector<int> rnn_link_data_shape(
{static_cast<int>(one_batch.rnn_link_data.size()),
static_cast<int>(one_batch.rnn_link_data.front().size())});
lod_attention_tensor->Reshape({1, 2});
lod_attention_tensor->SetLoD({one_batch.lod1, one_batch.lod2});
cell_init_tensor->Reshape({batch_size, 15});
cell_init_tensor->SetLoD({one_batch.lod3});
hidden_init_tensor->Reshape({batch_size, 15});
hidden_init_tensor->SetLoD({one_batch.lod3});
data_tensor->Reshape(rnn_link_data_shape);
data_tensor->SetLoD({one_batch.lod1});
week_tensor->Reshape(
{static_cast<int>(one_batch.rnn_week_datas.size()),
static_cast<int>(one_batch.rnn_week_datas.front().size())});
week_tensor->SetLoD({one_batch.lod3});
minute_tensor->Reshape(
{static_cast<int>(one_batch.rnn_minute_datas.size()),
static_cast<int>(one_batch.rnn_minute_datas.front().size())});
minute_tensor->SetLoD({one_batch.lod3});
// assign data
float arr0[] = {0, 0};
std::vector<float> zeros(batch_size * 15, 0);
std::copy_n(arr0, 2,
lod_attention_tensor->mutable_data<float>(PaddlePlace::kCPU));
std::copy_n(arr0, 2, data_tensor->mutable_data<float>(PaddlePlace::kCPU));
std::copy_n(zeros.begin(), zeros.size(),
cell_init_tensor->mutable_data<float>(PaddlePlace::kCPU));
std::copy_n(zeros.begin(), zeros.size(),
hidden_init_tensor->mutable_data<float>(PaddlePlace::kCPU));
ZeroCopyTensorAssignData(data_tensor, one_batch.rnn_link_data);
ZeroCopyTensorAssignData(week_tensor, one_batch.rnn_week_datas);
ZeroCopyTensorAssignData(minute_tensor, one_batch.rnn_minute_datas);
}
void SetConfig(AnalysisConfig *cfg) {
cfg->prog_file = FLAGS_infer_model + "/__model__"; cfg->prog_file = FLAGS_infer_model + "/__model__";
cfg->param_file = FLAGS_infer_model + "/param"; cfg->param_file = FLAGS_infer_model + "/param";
cfg->use_gpu = false; cfg->use_gpu = false;
...@@ -187,7 +241,9 @@ TEST(Analyzer_rnn1, fuse_statis) { ...@@ -187,7 +241,9 @@ TEST(Analyzer_rnn1, fuse_statis) {
SetConfig(&cfg); SetConfig(&cfg);
int num_ops; int num_ops;
auto fuse_statis = GetFuseStatis(cfg, &num_ops); auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
auto fuse_statis = GetFuseStatis(
static_cast<AnalysisPredictor *>(predictor.get()), &num_ops);
ASSERT_TRUE(fuse_statis.count("fc_fuse")); ASSERT_TRUE(fuse_statis.count("fc_fuse"));
EXPECT_EQ(fuse_statis.at("fc_fuse"), 1); EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
EXPECT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2); // bi-directional LSTM EXPECT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2); // bi-directional LSTM
...@@ -214,7 +270,229 @@ TEST(Analyzer_rnn1, multi_thread) { ...@@ -214,7 +270,229 @@ TEST(Analyzer_rnn1, multi_thread) {
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); SetInput(&input_slots_all);
TestPrediction(cfg, input_slots_all, &outputs, 4 /* num_threads */); TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
}
bool CompareTensors(framework::Scope &a_scope, framework::Scope &b_scope,
const std::vector<std::string> &tensors) {
for (auto &x : tensors) {
auto *a_var = a_scope.FindVar(x);
auto *b_var = b_scope.FindVar(x);
if (a_var && b_var) {
if (a_var->Type() == typeid(framework::LoDTensor) ||
a_var->Type() == typeid(framework::Tensor)) {
LOG(INFO) << "comparing tensor " << x;
auto &a_t = a_var->Get<framework::LoDTensor>();
auto &b_t = b_var->Get<framework::LoDTensor>();
if (!inference::CompareTensor(a_t, b_t)) {
LOG(ERROR) << string::Sprintf("tensor %s not match in two scopes", x);
}
} else {
LOG(INFO) << "skip no tensor " << x;
}
} else {
LOG(INFO) << "skip tensor " << x;
}
}
return true;
}
// Validate that the AnalysisPredictor + ZeroCopyTensor really works by testing
// on the complex RNN1 model.
TEST(Analyzer_rnn1, ZeroCopy) {
AnalysisConfig config;
SetConfig(&config);
config.use_feed_fetch_ops = false;
PaddlePlace place;
int output_size{0};
auto predictor =
CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
config);
config.use_feed_fetch_ops = true;
auto native_predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
config.use_feed_fetch_ops = true; // the analysis predictor needs feed/fetch.
auto analysis_predictor =
CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
config);
#define NEW_TENSOR(name__) \
auto name__##_tensor = predictor->GetInputTensor(#name__);
NEW_TENSOR(data_lod_attention);
NEW_TENSOR(cell_init);
NEW_TENSOR(data);
NEW_TENSOR(week);
NEW_TENSOR(minute);
NEW_TENSOR(hidden_init);
// Prepare data for AnalysisPredictor
DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
PrepareZeroCopyInputs(data_lod_attention_tensor.get(), cell_init_tensor.get(),
data_tensor.get(), hidden_init_tensor.get(),
week_tensor.get(), minute_tensor.get(), &data,
FLAGS_batch_size);
// Prepare data for NativePredictor
std::vector<std::vector<PaddleTensor>> native_inputs;
SetInput(&native_inputs);
std::vector<PaddleTensor> native_outputs;
std::vector<PaddleTensor> analysis_outputs;
auto output_tensor = predictor->GetOutputTensor("final_output.tmp_1");
// Run analysis predictor
int num_ops;
auto fuse_statis = GetFuseStatis(predictor.get(), &num_ops);
ASSERT_TRUE(fuse_statis.count("fc_fuse"));
ASSERT_EQ(fuse_statis.at("fc_fuse"), 1);
ASSERT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2); // bi-directional LSTM
ASSERT_EQ(fuse_statis.at("seq_concat_fc_fuse"), 1);
ASSERT_EQ(num_ops,
13); // After graph optimization, only 13 operators exists.
Timer timer;
double total_time{0};
double native_total_time{0};
double analysis_total_time{0.};
for (int i = 0; i < FLAGS_repeat; i++) {
timer.tic();
predictor->ZeroCopyRun();
total_time += timer.toc();
}
auto *output_data = output_tensor->data<float>(&place, &output_size);
ASSERT_GT(output_size, 0); // more than one output!
for (int i = 0; i < FLAGS_repeat; i++) {
// Run native predictor.
timer.tic();
ASSERT_TRUE(native_predictor->Run(native_inputs.front(), &native_outputs));
native_total_time += timer.toc();
}
for (int i = 0; i < FLAGS_repeat; i++) {
timer.tic();
ASSERT_TRUE(
analysis_predictor->Run(native_inputs.front(), &analysis_outputs));
analysis_total_time += timer.toc();
}
if (!FLAGS_with_precision_check) {
return;
}
int native_output_size = VecReduceToInt(native_outputs.front().shape);
EXPECT_EQ(native_output_size, output_size);
// Compare tensors between analysis and zerocopy
auto *p0 = static_cast<AnalysisPredictor *>(predictor.get());
auto *p1 = static_cast<AnalysisPredictor *>(analysis_predictor.get());
auto *p2 = static_cast<NativePaddlePredictor *>(native_predictor.get());
std::vector<std::string> tensor_names;
for (auto &var_desc : p0->program().Block(0).AllVars()) {
tensor_names.push_back(var_desc->Name());
}
LOG(INFO) << "Comparing tensors";
ASSERT_TRUE(
CompareTensors(*p0->scope(), *p1->scope(), {"final_output.tmp_1"}));
ASSERT_TRUE(
CompareTensors(*p0->scope(), *p2->scope(), {"final_output.tmp_1"}));
LOG(INFO) << "output1 " << inference::LoDTensorSummary<float>(
p0->scope()
->FindVar("final_output.tmp_1")
->Get<framework::LoDTensor>());
LOG(INFO) << "output2 " << inference::LoDTensorSummary<float>(
p1->scope()
->FindVar("final_output.tmp_1")
->Get<framework::LoDTensor>());
LOG(INFO) << "output3 " << inference::LoDTensorSummary<float>(
p2->scope()
->FindVar("final_output.tmp_1")
->Get<framework::LoDTensor>());
for (int i = 0; i < output_size; i++) {
LOG(INFO) << output_data[i] << " "
<< static_cast<float *>(native_outputs.front().data.data())[i]
<< " "
<< static_cast<float *>(analysis_outputs.front().data.data())[i];
EXPECT_NEAR(output_data[i],
static_cast<float *>(native_outputs.front().data.data())[i],
1e-3);
}
LOG(INFO) << "batch_size: " << FLAGS_batch_size;
LOG(INFO) << "zero average time: "
<< total_time / (FLAGS_repeat * FLAGS_batch_size);
LOG(INFO) << "analysis average time: "
<< analysis_total_time / (FLAGS_repeat * FLAGS_batch_size);
LOG(INFO) << "native average time: "
<< native_total_time / (FLAGS_repeat * FLAGS_batch_size);
}
TEST(Analyzer_rnn1, ZeroCopyMultiThread) {
AnalysisConfig config;
SetConfig(&config);
config.use_feed_fetch_ops = false;
#define NEW_TENSOR(name__) \
auto name__##_tensor = predictor->GetInputTensor(#name__);
auto base_predictor = CreatePaddlePredictor<AnalysisConfig>(config);
double total_time_of_threads{0};
std::vector<std::thread> threads;
std::vector<std::unique_ptr<PaddlePredictor>> predictors;
for (int tid = 0; tid < FLAGS_num_threads; tid++) {
predictors.emplace_back(CreatePaddlePredictor<AnalysisConfig>(config));
}
for (int tid = 0; tid < FLAGS_num_threads; tid++) {
threads.emplace_back([config, &total_time_of_threads, &predictors, tid] {
// auto predictor = base_predictor->Clone();
auto &predictor = predictors[tid];
NEW_TENSOR(data_lod_attention);
NEW_TENSOR(cell_init);
NEW_TENSOR(data);
NEW_TENSOR(week);
NEW_TENSOR(minute);
NEW_TENSOR(hidden_init);
// Prepare data for AnalysisPredictor
DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
Timer timer;
double total_time{0};
for (int i = 0; i < FLAGS_repeat; i++) {
PrepareZeroCopyInputs(data_lod_attention_tensor.get(),
cell_init_tensor.get(), data_tensor.get(),
hidden_init_tensor.get(), week_tensor.get(),
minute_tensor.get(), &data, FLAGS_batch_size);
timer.tic();
predictor->ZeroCopyRun();
total_time += timer.toc();
}
total_time_of_threads += total_time;
LOG(INFO) << "thread time: " << total_time / FLAGS_repeat;
});
}
for (auto &t : threads) {
t.join();
}
LOG(INFO) << "average time: "
<< total_time_of_threads / FLAGS_num_threads / FLAGS_repeat;
} }
} // namespace inference } // namespace inference
......
...@@ -182,7 +182,8 @@ TEST(Analyzer_seq_conv1, fuse_statis) { ...@@ -182,7 +182,8 @@ TEST(Analyzer_seq_conv1, fuse_statis) {
AnalysisConfig cfg; AnalysisConfig cfg;
SetConfig(&cfg); SetConfig(&cfg);
int num_ops; int num_ops;
auto fuse_statis = GetFuseStatis(cfg, &num_ops); auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
GetFuseStatis(predictor.get(), &num_ops);
} }
// Compare result of NativeConfig and AnalysisConfig // Compare result of NativeConfig and AnalysisConfig
......
...@@ -104,5 +104,18 @@ TEST(Analyzer_Text_Classification, compare) { ...@@ -104,5 +104,18 @@ TEST(Analyzer_Text_Classification, compare) {
CompareNativeAndAnalysis(cfg, input_slots_all); CompareNativeAndAnalysis(cfg, input_slots_all);
} }
TEST(Analyzer_Text_Classification, compare_against_embedding_fc_lstm_fused) {
AnalysisConfig cfg;
SetConfig(&cfg);
// Enable embedding_fc_lstm_fuse_pass (disabled by default)
auto it = std::find(cfg.ir_passes.begin(), cfg.ir_passes.end(),
"embedding_fc_lstm_fuse_pass");
if (it != cfg.ir_passes.end()) cfg.ir_passes.erase(it);
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
CompareNativeAndAnalysis(cfg, input_slots_all);
}
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
...@@ -19,6 +19,7 @@ limitations under the License. */ ...@@ -19,6 +19,7 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace inference { namespace inference {
namespace analysis { namespace analysis {
using contrib::AnalysisConfig;
struct Record { struct Record {
std::vector<float> data; std::vector<float> data;
...@@ -114,7 +115,8 @@ TEST(Analyzer_vis, fuse_statis) { ...@@ -114,7 +115,8 @@ TEST(Analyzer_vis, fuse_statis) {
AnalysisConfig cfg; AnalysisConfig cfg;
SetConfig(&cfg); SetConfig(&cfg);
int num_ops; int num_ops;
GetFuseStatis(cfg, &num_ops); auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
GetFuseStatis(predictor.get(), &num_ops);
} }
// Compare result of NativeConfig and AnalysisConfig // Compare result of NativeConfig and AnalysisConfig
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#pragma once #pragma once
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <algorithm>
#include <string> #include <string>
#include <thread> // NOLINT #include <thread> // NOLINT
#include <vector> #include <vector>
...@@ -86,11 +87,9 @@ std::unique_ptr<PaddlePredictor> CreateTestPredictor( ...@@ -86,11 +87,9 @@ std::unique_ptr<PaddlePredictor> CreateTestPredictor(
size_t GetSize(const PaddleTensor &out) { return VecReduceToInt(out.shape); } size_t GetSize(const PaddleTensor &out) { return VecReduceToInt(out.shape); }
std::unordered_map<std::string, int> GetFuseStatis(AnalysisConfig config, std::unordered_map<std::string, int> GetFuseStatis(PaddlePredictor *predictor,
int *num_ops) { int *num_ops) {
auto predictor = CreateTestPredictor(config); auto *analysis_predictor = static_cast<AnalysisPredictor *>(predictor);
AnalysisPredictor *analysis_predictor =
dynamic_cast<AnalysisPredictor *>(predictor.get());
auto &fuse_statis = analysis_predictor->analysis_argument() auto &fuse_statis = analysis_predictor->analysis_argument()
.Get<std::unordered_map<std::string, int>>( .Get<std::unordered_map<std::string, int>>(
framework::ir::kFuseStatisAttr); framework::ir::kFuseStatisAttr);
...@@ -184,5 +183,127 @@ void CompareNativeAndAnalysis( ...@@ -184,5 +183,127 @@ void CompareNativeAndAnalysis(
CompareResult(analysis_outputs, native_outputs); CompareResult(analysis_outputs, native_outputs);
} }
template <typename T>
std::string LoDTensorSummary(const framework::LoDTensor &tensor) {
std::stringstream ss;
ss << "\n---- tensor ---" << '\n';
ss << "lod: [";
for (const auto &level : tensor.lod()) {
ss << "[ ";
for (auto i : level) {
ss << i << ", ";
}
ss << "]";
}
ss << "]\n";
ss << "shape: [";
int size = 1;
for (int i = 0; i < tensor.dims().size(); i++) {
int dim = tensor.dims()[i];
ss << dim << ", ";
size *= dim;
}
ss << "]\n";
ss << "data: ";
for (int i = 0; i < std::min(20, size); i++) {
ss << tensor.data<T>()[i] << " ";
}
ss << "\n";
return ss.str();
}
static bool CompareLoD(const framework::LoD &a, const framework::LoD &b) {
if (a.size() != b.size()) {
LOG(ERROR) << string::Sprintf("lod size not match %d != %d", a.size(),
b.size());
return false;
}
for (size_t i = 0; i < a.size(); i++) {
auto &al = a[i];
auto &bl = b[i];
if (al.size() != bl.size()) {
LOG(ERROR) << string::Sprintf("level size %d != %d", al.size(),
bl.size());
return false;
}
}
return true;
}
static bool CompareShape(const std::vector<int64_t> &a,
const std::vector<int64_t> &b) {
if (a.size() != b.size()) {
LOG(ERROR) << string::Sprintf("shape size not match %d != %d", a.size(),
b.size());
return false;
}
for (size_t i = 0; i < a.size(); i++) {
if (a[i] != b[i]) {
LOG(ERROR) << string::Sprintf("shape %d-th element not match %d != %d", i,
a[i], b[i]);
return false;
}
}
return true;
}
static bool CompareTensorData(const framework::LoDTensor &a,
const framework::LoDTensor &b) {
auto a_shape = framework::vectorize(a.dims());
auto b_shape = framework::vectorize(b.dims());
size_t a_size = std::accumulate(a_shape.begin(), a_shape.end(), 1,
[](int a, int b) { return a * b; });
size_t b_size = std::accumulate(b_shape.begin(), b_shape.end(), 1,
[](int a, int b) { return a * b; });
if (a_size != b_size) {
LOG(ERROR) << string::Sprintf("tensor data size not match, %d != %d",
a_size, b_size);
}
for (size_t i = 0; i < a_size; i++) {
if (a.type() == typeid(float)) {
const auto *a_data = a.data<float>();
const auto *b_data = b.data<float>();
if (std::abs(a_data[i] - b_data[i]) > 1e-3) {
LOG(ERROR) << string::Sprintf(
"tensor data %d-th element not match, %f != %f", i, a_data[i],
b_data[i]);
return false;
}
} else if (a.type() == typeid(int64_t)) {
const auto *a_data = a.data<int64_t>();
const auto *b_data = b.data<int64_t>();
if (std::abs(a_data[i] - b_data[i]) > 1e-3) {
LOG(ERROR) << string::Sprintf(
"tensor data %d-th element not match, %f != %f", i, a_data[i],
b_data[i]);
return false;
}
}
}
return true;
}
static bool CompareTensor(const framework::LoDTensor &a,
const framework::LoDTensor &b) {
if (!CompareLoD(a.lod(), b.lod())) {
return false;
}
if (!CompareShape(framework::vectorize(a.dims()),
framework::vectorize(b.dims()))) {
return false;
}
if (!CompareTensorData(a, b)) {
return false;
}
return true;
}
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
...@@ -36,6 +36,8 @@ namespace memory { ...@@ -36,6 +36,8 @@ namespace memory {
using BuddyAllocator = detail::BuddyAllocator; using BuddyAllocator = detail::BuddyAllocator;
BuddyAllocator* GetCPUBuddyAllocator() { BuddyAllocator* GetCPUBuddyAllocator() {
// We tried thread_local for inference::RNN1 model, but that not works much
// for multi-thread test.
static std::once_flag init_flag; static std::once_flag init_flag;
static detail::BuddyAllocator* a = nullptr; static detail::BuddyAllocator* a = nullptr;
...@@ -48,6 +50,25 @@ BuddyAllocator* GetCPUBuddyAllocator() { ...@@ -48,6 +50,25 @@ BuddyAllocator* GetCPUBuddyAllocator() {
return a; return a;
} }
// We compared the NaiveAllocator with BuddyAllocator in CPU memory allocation,
// seems they are almost the same overhead.
struct NaiveAllocator {
void* Alloc(size_t size) { return malloc(size); }
void Free(void* p) {
PADDLE_ENFORCE(p);
free(p);
}
static NaiveAllocator* Instance() {
static NaiveAllocator x;
return &x;
}
private:
std::mutex lock_;
};
template <> template <>
void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size) { void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size) {
VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
......
...@@ -82,10 +82,11 @@ function(op_library TARGET) ...@@ -82,10 +82,11 @@ function(op_library TARGET)
if (${cc_srcs_len} EQUAL 0) if (${cc_srcs_len} EQUAL 0)
message(FATAL_ERROR "The op library ${TARGET} should contains at least one .cc file") message(FATAL_ERROR "The op library ${TARGET} should contains at least one .cc file")
endif() endif()
#remove windows unsupported op
if (WIN32) if (WIN32)
foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op") # remove windows unsupported op, because windows has no nccl, no warpctc such ops.
foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op" "hierarchical_sigmoid_op"
"crf_decoding_op" "select_op" "lstmp_op" "gru_op" "fusion_gru_op" "lstm_op" "fusion_lstm_op" "cumsum_op"
"channel_send_op" "channel_create_op" "channel_close_op" "channel_recv_op")
if ("${TARGET}" STREQUAL "${windows_unsupport_op}") if ("${TARGET}" STREQUAL "${windows_unsupport_op}")
return() return()
endif() endif()
...@@ -281,10 +282,12 @@ op_library(array_to_lod_tensor_op DEPS lod_rank_table_op) ...@@ -281,10 +282,12 @@ op_library(array_to_lod_tensor_op DEPS lod_rank_table_op)
op_library(max_sequence_len_op DEPS lod_rank_table) op_library(max_sequence_len_op DEPS lod_rank_table)
op_library(sequence_conv_op DEPS context_project) op_library(sequence_conv_op DEPS context_project)
op_library(sequence_pool_op DEPS sequence_pooling) op_library(sequence_pool_op DEPS sequence_pooling)
if (NOT WIN32)
op_library(lstm_op DEPS sequence2batch lstm_compute) op_library(lstm_op DEPS sequence2batch lstm_compute)
op_library(hierarchical_sigmoid_op DEPS matrix_bit_code) op_library(hierarchical_sigmoid_op DEPS matrix_bit_code)
op_library(lstmp_op DEPS sequence2batch lstm_compute) op_library(lstmp_op DEPS sequence2batch lstm_compute)
op_library(gru_op DEPS sequence2batch gru_compute) op_library(gru_op DEPS sequence2batch gru_compute)
endif(NOT WIN32)
op_library(recurrent_op DEPS executor) op_library(recurrent_op DEPS executor)
op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale) op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
op_library(cos_sim_op DEPS cos_sim_functor) op_library(cos_sim_op DEPS cos_sim_functor)
...@@ -297,10 +300,10 @@ op_library(sequence_pad_op DEPS sequence_padding) ...@@ -297,10 +300,10 @@ op_library(sequence_pad_op DEPS sequence_padding)
op_library(unstack_op DEPS stack_op) op_library(unstack_op DEPS stack_op)
op_library(fake_quantize_op DEPS memory) op_library(fake_quantize_op DEPS memory)
op_library(fusion_lstm_op DEPS cpu_lstm_compute) op_library(fusion_lstm_op DEPS cpu_lstm_compute)
if (WITH_GPU) if (WITH_GPU)
op_library(conv_op DEPS vol2col depthwise_conv im2col) op_library(conv_op DEPS vol2col depthwise_conv im2col)
op_library(layer_norm_op DEPS cub) op_library(layer_norm_op DEPS cub)
op_library(reduce_mean_op DEPS cub)
else() else()
op_library(conv_op DEPS vol2col im2col) op_library(conv_op DEPS vol2col im2col)
endif() endif()
...@@ -313,11 +316,6 @@ op_library(save_combine_op DEPS lod_tensor) ...@@ -313,11 +316,6 @@ op_library(save_combine_op DEPS lod_tensor)
op_library(load_combine_op DEPS lod_tensor) op_library(load_combine_op DEPS lod_tensor)
op_library(concat_op DEPS concat) op_library(concat_op DEPS concat)
# FIXME(thuan): Move CSP operators to paddle/fluid/framework/operators/concurrency
add_subdirectory(concurrency)
op_library(channel_send_op DEPS concurrency)
op_library(channel_recv_op DEPS concurrency)
list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS}) list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
foreach(src ${GENERAL_OPS}) foreach(src ${GENERAL_OPS})
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/channel.h"
#include "paddle/fluid/framework/op_registry.h"
namespace pf = paddle::framework;
static constexpr char kChannel[] = "Channel";
namespace paddle {
namespace operators {
class ChannelCloseOp : public framework::OperatorBase {
public:
ChannelCloseOp(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: framework::OperatorBase(type, inputs, outputs, attrs) {}
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &dev_place) const override {
auto &inp = *scope.FindVar(Input(kChannel));
// Get the mutable version of the channel variable and closes it.
pf::ChannelHolder *ch = inp.GetMutable<framework::ChannelHolder>();
ch->close();
}
};
class ChannelCloseOpOpInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext *context) const override {
PADDLE_ENFORCE(context->HasInput("Channel"),
"The input of ChannelClose op must be set");
}
};
class ChannelCloseOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput(kChannel,
"The Channel Variable that should be closed by"
" the ChannelClose Op.");
AddComment(R"DOC(
Channel Close Operator.
This operator closes an open channel.
)DOC");
}
};
} // namespace operators
} // namespace paddle
REGISTER_OPERATOR(channel_close, paddle::operators::ChannelCloseOp,
paddle::framework::EmptyGradOpMaker,
paddle::operators::ChannelCloseOpMaker);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/channel.h"
#include "paddle/fluid/framework/lod_rank_table.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/reader.h"
namespace pf = paddle::framework;
static constexpr char kOutput[] = "Out";
namespace paddle {
namespace operators {
class ChannelCreateOp : public framework::OperatorBase {
public:
ChannelCreateOp(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: framework::OperatorBase(type, inputs, outputs, attrs) {}
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &dev_place) const override {
auto &out = *scope.FindVar(Output(kOutput));
// Determine the datatype and capacity of the channel to be created
// from the attributes provided.
auto dtype =
static_cast<framework::proto::VarType::Type>(Attr<int>("data_type"));
auto capacity = Attr<int>("capacity");
// Based on the datatype, create a new channel holder initialized with
// the given capacity. When capacity is 0, an unbuffered channel is
// created.
pf::ChannelHolder *ch = out.GetMutable<framework::ChannelHolder>();
if (dtype == framework::proto::VarType::LOD_TENSOR) {
ch->Reset<pf::LoDTensor>(capacity);
} else if (dtype == framework::proto::VarType::SELECTED_ROWS) {
ch->Reset<pf::SelectedRows>(capacity);
} else if (dtype == framework::proto::VarType::LOD_RANK_TABLE) {
ch->Reset<pf::LoDRankTable>(capacity);
} else if (dtype == framework::proto::VarType::LOD_TENSOR_ARRAY) {
ch->Reset<pf::LoDTensorArray>(capacity);
} else if (dtype == framework::proto::VarType::READER) {
ch->Reset<pf::ReaderHolder>(capacity);
} else if (dtype == framework::proto::VarType::CHANNEL) {
ch->Reset<pf::ChannelHolder>(capacity);
} else if (dtype == framework::proto::VarType::BOOL) {
ch->Reset<bool>(capacity);
} else if (dtype == framework::proto::VarType::INT32) {
ch->Reset<int>(capacity);
} else if (dtype == framework::proto::VarType::INT64) {
ch->Reset<int64_t>(capacity);
} else if (dtype == framework::proto::VarType::FP32) {
ch->Reset<float>(capacity);
} else if (dtype == framework::proto::VarType::FP64) {
ch->Reset<double>(capacity);
} else {
PADDLE_THROW(
"Data type %d is not in "
"[LOD_TENSOR, SELECTED_ROWS, LOD_RANK_TABLE, LOD_TENSOR_ARRAY, "
"READER, CHANNEL, BOOL, INT32, INT64, FP32, FP64]",
dtype);
}
}
};
class ChannelCreateOpOpInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext *context) const override {
PADDLE_ENFORCE(context->HasOutput(kOutput),
"The output of ChannelCreate op must be set");
context->SetOutputDim(kOutput, {1});
}
};
class ChannelCreateOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddOutput(kOutput,
"The object of a Channel type created by ChannelCreate Op.");
AddAttr<int>("capacity", "The size of the buffer of Channel.")
.SetDefault(0);
AddAttr<int>("data_type", "The data type of elements inside the Channel.");
AddComment(R"DOC(
Channel Create Operator.
This operator creates an object of the VarType Channel and returns it.
)DOC");
}
};
} // namespace operators
} // namespace paddle
REGISTER_OPERATOR(channel_create, paddle::operators::ChannelCreateOp,
paddle::framework::EmptyGradOpMaker,
paddle::operators::ChannelCreateOpMaker);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/channel.h"
#include <paddle/fluid/framework/lod_rank_table.h>
#include <paddle/fluid/framework/lod_tensor_array.h>
#include <paddle/fluid/framework/reader.h>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/concurrency/channel_util.h"
#include "paddle/fluid/operators/math/math_function.h"
static constexpr char Channel[] = "Channel";
static constexpr char Status[] = "Status";
static constexpr char Out[] = "Out";
namespace paddle {
namespace operators {
void SetReceiveStatus(const platform::Place &dev_place,
framework::Variable *status_var, bool status) {
auto cpu = platform::CPUPlace();
auto status_tensor =
status_var->GetMutable<framework::LoDTensor>()->mutable_data<bool>({1},
cpu);
status_tensor[0] = status;
}
class ChannelRecvOp : public framework::OperatorBase {
public:
ChannelRecvOp(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: framework::OperatorBase(type, inputs, outputs, attrs) {}
void InferShape(framework::InferShapeContext *ctx) const {
PADDLE_ENFORCE(ctx->HasInput(Channel),
"Input(Channel) of ChannelRecvOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput(Out),
"Input(Channel) of ChannelRecvOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput(Status),
"Output(Status) of ChannelRecvOp should not be null.");
ctx->SetOutputDim("Status", {1});
}
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &dev_place) const override {
// Get the channel holder created by channel_create op, passed as input.
framework::ChannelHolder *ch =
scope.FindVar(Input(Channel))->GetMutable<framework::ChannelHolder>();
auto output_var = scope.FindVar(Output(Out));
// Receive the data from the channel.
bool ok = concurrency::ChannelReceive(ch, output_var);
// Set the status output of the `ChannelReceive` call.
SetReceiveStatus(dev_place, scope.FindVar(Output(Status)), ok);
}
};
class ChannelRecvOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput(Channel,
"(Channel) A variable which \"receives\" the a value sent"
"to it by a channel_send op.")
.AsDuplicable();
AddOutput(Out,
"(Variable) Output Variable that will hold the data received"
" from the Channel")
.AsDuplicable();
AddOutput(Status,
"(Tensor) An LoD Tensor that returns a boolean status of the"
"result of the receive operation.")
.AsDuplicable();
AddComment(R"DOC(
)DOC");
}
};
} // namespace operators
} // namespace paddle
REGISTER_OPERATOR(channel_recv, paddle::operators::ChannelRecvOp,
paddle::framework::EmptyGradOpMaker,
paddle::operators::ChannelRecvOpMaker);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/channel.h"
#include <paddle/fluid/framework/lod_rank_table.h>
#include <paddle/fluid/framework/lod_tensor_array.h>
#include <paddle/fluid/framework/reader.h>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/concurrency/channel_util.h"
#include "paddle/fluid/operators/math/math_function.h"
static constexpr char Channel[] = "Channel";
static constexpr char X[] = "X";
namespace paddle {
namespace operators {
class ChannelSendOp : public framework::OperatorBase {
public:
ChannelSendOp(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: framework::OperatorBase(type, inputs, outputs, attrs) {}
void InferShape(framework::InferShapeContext *ctx) const {
PADDLE_ENFORCE(ctx->HasInput(Channel),
"Input(Channel) of ChannelSendOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput(X),
"Input(X) of ChannelSendOp should not be null.");
}
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &dev_place) const override {
// Get the channel holder created by channel_create op, passed as input.
framework::ChannelHolder *ch =
scope.FindVar(Input(Channel))->GetMutable<framework::ChannelHolder>();
auto input_var = scope.FindVar(Input(X));
// Send the input data through the channel.
concurrency::ChannelSend(ch, input_var);
}
};
class ChannelSendOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput(Channel,
"(Channel) A variable which \"sends\" the passed in value to "
"a listening receiver.")
.AsDuplicable();
AddInput(X, "(Variable) The value which gets sent by the channel.")
.AsDuplicable();
AddComment(R"DOC(
)DOC");
}
};
} // namespace operators
} // namespace paddle
REGISTER_OPERATOR(channel_send, paddle::operators::ChannelSendOp,
paddle::framework::EmptyGradOpMaker,
paddle::operators::ChannelSendOpMaker);
cc_library(concurrency SRCS channel_util.cc DEPS device_context framework_proto boost eigen3)
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/concurrency/channel_util.h"
#include "paddle/fluid/framework/var_type.h"
namespace poc = paddle::operators::concurrency;
void poc::ChannelSend(framework::ChannelHolder *ch, framework::Variable *var) {
auto type = framework::ToVarType(var->Type());
if (type == framework::proto::VarType_Type_LOD_TENSOR)
ch->Send(var->GetMutable<framework::LoDTensor>());
else if (type == framework::proto::VarType_Type_LOD_RANK_TABLE)
ch->Send(var->GetMutable<framework::LoDRankTable>());
else if (type == framework::proto::VarType_Type_LOD_TENSOR_ARRAY)
ch->Send(var->GetMutable<framework::LoDTensorArray>());
else if (type == framework::proto::VarType_Type_SELECTED_ROWS)
ch->Send(var->GetMutable<framework::SelectedRows>());
else if (type == framework::proto::VarType_Type_READER)
ch->Send(var->GetMutable<framework::ReaderHolder>());
else if (type == framework::proto::VarType_Type_CHANNEL)
ch->Send(var->GetMutable<framework::ChannelHolder>());
else
PADDLE_THROW("ChannelSend:Unsupported type");
}
bool poc::ChannelReceive(framework::ChannelHolder *ch,
framework::Variable *var) {
// Get type of channel and use that to call mutable data for Variable
auto type = framework::ToVarType(ch->Type());
if (type == framework::proto::VarType_Type_LOD_TENSOR)
return ch->Receive(var->GetMutable<framework::LoDTensor>());
else if (type == framework::proto::VarType_Type_LOD_RANK_TABLE)
return ch->Receive(var->GetMutable<framework::LoDRankTable>());
else if (type == framework::proto::VarType_Type_LOD_TENSOR_ARRAY)
return ch->Receive(var->GetMutable<framework::LoDTensorArray>());
else if (type == framework::proto::VarType_Type_SELECTED_ROWS)
return ch->Receive(var->GetMutable<framework::SelectedRows>());
else if (type == framework::proto::VarType_Type_READER)
return ch->Receive(var->GetMutable<framework::ReaderHolder>());
else if (type == framework::proto::VarType_Type_CHANNEL)
return ch->Receive(var->GetMutable<framework::ChannelHolder>());
else
PADDLE_THROW("ChannelReceive:Unsupported type");
}
void poc::ChannelAddToSendQ(framework::ChannelHolder *ch, const void *referrer,
framework::Variable *var,
std::shared_ptr<std::condition_variable_any> cond,
std::function<bool(framework::ChannelAction)> cb) {
auto type = framework::ToVarType(var->Type());
if (type == framework::proto::VarType_Type_LOD_TENSOR) {
ch->AddToSendQ(referrer, var->GetMutable<framework::LoDTensor>(), cond, cb);
} else if (type == framework::proto::VarType_Type_LOD_RANK_TABLE) {
ch->AddToSendQ(referrer, var->GetMutable<framework::LoDRankTable>(), cond,
cb);
} else if (type == framework::proto::VarType_Type_LOD_TENSOR_ARRAY) {
ch->AddToSendQ(referrer, var->GetMutable<framework::LoDTensorArray>(), cond,
cb);
} else if (type == framework::proto::VarType_Type_SELECTED_ROWS) {
ch->AddToSendQ(referrer, var->GetMutable<framework::SelectedRows>(), cond,
cb);
} else if (type == framework::proto::VarType_Type_READER) {
ch->AddToSendQ(referrer, var->GetMutable<framework::ReaderHolder>(), cond,
cb);
} else if (type == framework::proto::VarType_Type_CHANNEL) {
ch->AddToSendQ(referrer, var->GetMutable<framework::ChannelHolder>(), cond,
cb);
} else {
PADDLE_THROW("ChannelAddToSendQ:Unsupported type");
}
}
void poc::ChannelAddToReceiveQ(
framework::ChannelHolder *ch, const void *referrer,
framework::Variable *var, std::shared_ptr<std::condition_variable_any> cond,
std::function<bool(framework::ChannelAction)> cb) {
auto type = framework::ToVarType(var->Type());
if (type == framework::proto::VarType_Type_LOD_TENSOR) {
ch->AddToReceiveQ(referrer, var->GetMutable<framework::LoDTensor>(), cond,
cb);
} else if (type == framework::proto::VarType_Type_LOD_RANK_TABLE) {
ch->AddToReceiveQ(referrer, var->GetMutable<framework::LoDRankTable>(),
cond, cb);
} else if (type == framework::proto::VarType_Type_LOD_TENSOR_ARRAY) {
ch->AddToReceiveQ(referrer, var->GetMutable<framework::LoDTensorArray>(),
cond, cb);
} else if (type == framework::proto::VarType_Type_SELECTED_ROWS) {
ch->AddToReceiveQ(referrer, var->GetMutable<framework::SelectedRows>(),
cond, cb);
} else if (type == framework::proto::VarType_Type_READER) {
ch->AddToReceiveQ(referrer, var->GetMutable<framework::ReaderHolder>(),
cond, cb);
} else if (type == framework::proto::VarType_Type_CHANNEL) {
ch->AddToReceiveQ(referrer, var->GetMutable<framework::ChannelHolder>(),
cond, cb);
} else {
PADDLE_THROW("ChannelAddToReceiveQ:Unsupported type");
}
}
...@@ -380,7 +380,8 @@ class DepthwiseConvKernel : public framework::OpKernel<T> { ...@@ -380,7 +380,8 @@ class DepthwiseConvKernel : public framework::OpKernel<T> {
math::DepthwiseConvFunctor<DeviceContext, T> depthwiseConv; math::DepthwiseConvFunctor<DeviceContext, T> depthwiseConv;
auto& dev_ctx = context.template device_context<DeviceContext>(); auto& dev_ctx = context.template device_context<DeviceContext>();
depthwiseConv(dev_ctx, *input, filter, strides, paddings, output); depthwiseConv(dev_ctx, *input, filter, strides, paddings, dilations,
output);
} }
}; };
...@@ -415,14 +416,14 @@ class DepthwiseConvGradKernel : public framework::OpKernel<T> { ...@@ -415,14 +416,14 @@ class DepthwiseConvGradKernel : public framework::OpKernel<T> {
input_grad->mutable_data<T>(context.GetPlace()); input_grad->mutable_data<T>(context.GetPlace());
set_zero(dev_ctx, input_grad, static_cast<T>(0)); set_zero(dev_ctx, input_grad, static_cast<T>(0));
depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides, depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides,
paddings, input_grad); paddings, dilations, input_grad);
} }
if (filter_grad) { if (filter_grad) {
filter_grad->mutable_data<T>(context.GetPlace()); filter_grad->mutable_data<T>(context.GetPlace());
set_zero(dev_ctx, filter_grad, static_cast<T>(0)); set_zero(dev_ctx, filter_grad, static_cast<T>(0));
depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides, paddings, depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides, paddings,
filter_grad); dilations, filter_grad);
} }
} }
}; };
......
...@@ -345,7 +345,7 @@ class DepthwiseConvTransposeKernel : public framework::OpKernel<T> { ...@@ -345,7 +345,7 @@ class DepthwiseConvTransposeKernel : public framework::OpKernel<T> {
math::DepthwiseConvInputGradFunctor<DeviceContext, T> math::DepthwiseConvInputGradFunctor<DeviceContext, T>
depthwiseConvInputGrad; depthwiseConvInputGrad;
depthwiseConvInputGrad(dev_ctx, *output, filter, *input, strides, paddings, depthwiseConvInputGrad(dev_ctx, *output, filter, *input, strides, paddings,
output); dilations, output);
} }
}; };
...@@ -367,10 +367,11 @@ class DepthwiseConvTransposeGradKernel : public framework::OpKernel<T> { ...@@ -367,10 +367,11 @@ class DepthwiseConvTransposeGradKernel : public framework::OpKernel<T> {
auto& dev_ctx = context.template device_context<DeviceContext>(); auto& dev_ctx = context.template device_context<DeviceContext>();
std::vector<int> strides = context.Attr<std::vector<int>>("strides"); std::vector<int> strides = context.Attr<std::vector<int>>("strides");
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings"); std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
if (input_grad) { if (input_grad) {
math::DepthwiseConvFunctor<DeviceContext, T> depthwiseConv; math::DepthwiseConvFunctor<DeviceContext, T> depthwiseConv;
depthwiseConv(dev_ctx, *output_grad, filter, strides, paddings, depthwiseConv(dev_ctx, *output_grad, filter, strides, paddings, dilations,
input_grad); input_grad);
} }
...@@ -382,7 +383,7 @@ class DepthwiseConvTransposeGradKernel : public framework::OpKernel<T> { ...@@ -382,7 +383,7 @@ class DepthwiseConvTransposeGradKernel : public framework::OpKernel<T> {
math::DepthwiseConvFilterGradFunctor<DeviceContext, T> math::DepthwiseConvFilterGradFunctor<DeviceContext, T>
depthwiseConvFilterGrad; depthwiseConvFilterGrad;
depthwiseConvFilterGrad(dev_ctx, *output_grad, *input, strides, paddings, depthwiseConvFilterGrad(dev_ctx, *output_grad, *input, strides, paddings,
filter_grad); dilations, filter_grad);
} }
} }
}; };
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <cmath>
#include <numeric>
#include <set>
#include <vector>
#include <cub/cub.cuh> // NOLINT
#include "paddle/fluid/framework/tensor.h"
namespace paddle {
namespace operators {
namespace detail {
template <typename T, size_t ElementCount>
struct Array {
public:
HOSTDEVICE inline Array() {}
HOSTDEVICE inline T& operator[](size_t index) { return data_[index]; }
HOSTDEVICE inline const T& operator[](size_t index) const {
return data_[index];
}
HOSTDEVICE constexpr inline size_t size() const { return ElementCount; }
template <typename VectorLikeType>
static inline Array<T, ElementCount> From(const VectorLikeType& vec) {
PADDLE_ENFORCE_EQ(vec.size(), ElementCount, "size not match");
size_t n = static_cast<size_t>(vec.size());
Array<T, ElementCount> ret;
for (size_t i = 0; i < n; ++i) ret[i] = vec[i];
return ret;
}
private:
T data_[ElementCount];
};
// reduce the last axis of 2d array
template <typename Tx, typename Ty, typename ReduceOp, typename TransformOp,
int BlockDim>
__global__ void ReduceKernel2D(const Tx* x, Ty* y, ReduceOp reducer,
TransformOp transformer, Ty init,
int reduce_num) {
__shared__ typename cub::BlockReduce<Ty, BlockDim>::TempStorage temp_storage;
int idx_x = blockIdx.x * reduce_num;
int idx_y = threadIdx.x;
Ty reduce_var = init;
for (int idx_y = threadIdx.x; idx_y < reduce_num; idx_y += BlockDim)
reduce_var = reducer(reduce_var, transformer(x[idx_x + idx_y]));
reduce_var =
cub::BlockReduce<Ty, BlockDim>(temp_storage).Reduce(reduce_var, reducer);
if (threadIdx.x == 0) {
y[blockIdx.x] = reduce_var;
}
}
template <typename Tx, typename Ty, typename ReduceOp, typename TransformOp,
int BlockDim, int Rank, int ReduceRank>
__global__ void ReduceKernel(const Tx* x, Ty* y, ReduceOp reducer,
TransformOp transformer, Ty init, int reduce_num,
Array<int, Rank> x_strides,
Array<int, ReduceRank> reduce_dim,
Array<int, ReduceRank> reduce_strides,
Array<int, Rank - ReduceRank> left_dim,
Array<int, Rank - ReduceRank> left_strides) {
__shared__ typename cub::BlockReduce<Ty, BlockDim>::TempStorage temp_storage;
Array<int, Rank> sub_index;
int left_idx = blockIdx.x;
for (int i = 0; i < Rank - ReduceRank; ++i) {
sub_index[left_dim[i]] = left_idx / left_strides[i];
left_idx %= left_strides[i];
}
int reduce_idx = threadIdx.x;
for (int j = 0; j < ReduceRank; ++j) {
sub_index[reduce_dim[j]] = reduce_idx / reduce_strides[j];
reduce_idx %= reduce_strides[j];
}
int idx_x = 0;
for (int k = 0; k < Rank; ++k) idx_x += (sub_index[k] * x_strides[k]);
Ty reduce_var = static_cast<Ty>(transformer(x[idx_x]));
for (int i = threadIdx.x + BlockDim; i < reduce_num; i += BlockDim) {
int reduce_idx = i;
for (int j = 0; j < ReduceRank; ++j) {
sub_index[reduce_dim[j]] = reduce_idx / reduce_strides[j];
reduce_idx %= reduce_strides[j];
}
int idx_x = 0;
for (int k = 0; k < Rank; ++k) idx_x += (sub_index[k] * x_strides[k]);
reduce_var = static_cast<Ty>(reducer(reduce_var, transformer(x[idx_x])));
}
reduce_var =
cub::BlockReduce<Ty, BlockDim>(temp_storage).Reduce(reduce_var, reducer);
if (threadIdx.x == 0) {
y[blockIdx.x] = reduce_var;
}
}
static inline std::vector<int> GetStrides(const std::vector<int>& dims) {
int n = static_cast<int>(dims.size());
if (n == 0) return std::vector<int>();
std::vector<int> strides(n);
strides.back() = 1;
for (int i = n - 2; i >= 0; --i) {
strides[i] = strides[i + 1] * dims[i + 1];
}
return strides;
}
static inline std::vector<int> GetStrides(const std::vector<int>& dims,
const std::vector<int>& idx) {
int n = static_cast<int>(idx.size());
if (n == 0) return std::vector<int>();
std::vector<int> strides(n);
strides.back() = 1;
for (int i = n - 2; i >= 0; --i) {
strides[i] = strides[i + 1] * dims[idx[i + 1]];
}
return strides;
}
constexpr int kMaxBlockDim = 512;
static inline int GetDesiredBlockDim(int block_dim) {
return block_dim >= kMaxBlockDim
? kMaxBlockDim
: (1 << static_cast<int>(std::log2(block_dim)));
}
template <typename Tx, typename Ty, int BlockDim, typename ReduceOp,
typename TransformOp>
static void TensorReduceImpl(
const Tx* x_data, Ty* y_data, const platform::Place& place,
const ReduceOp& reducer, const TransformOp& transformer, const Ty& init,
int left_num, int reduce_num, const std::vector<int>& x_strides,
const std::vector<int>& reduce_dim, const std::vector<int>& reduce_strides,
const std::vector<int>& left_dim, const std::vector<int>& left_strides,
cudaStream_t stream) {
#define CUB_RANK_CASE(i, ...) \
case i: { \
constexpr auto kRank = i; \
switch (reduce_rank) { __VA_ARGS__; } \
} break
#define CUB_REDUCE_RANK_CASE(i, ...) \
case i: { \
constexpr auto kReduceRank = i; \
ReduceKernel<Tx, Ty, ReduceOp, TransformOp, BlockDim, kRank, \
kReduceRank><<<left_num, BlockDim, 0, stream>>>( \
x_data, y_data, reducer, transformer, init, reduce_num, \
Array<int, kRank>::From(x_strides), \
Array<int, kReduceRank>::From(reduce_dim), \
Array<int, kReduceRank>::From(reduce_strides), \
Array<int, kRank - kReduceRank>::From(left_dim), \
Array<int, kRank - kReduceRank>::From(left_strides)); \
} break
int rank = x_strides.size();
int reduce_rank = reduce_strides.size();
if (rank == reduce_rank) {
cub::TransformInputIterator<Ty, TransformOp, const Tx*> trans_x(
x_data, transformer);
size_t temp_storage_bytes = 0;
cub::DeviceReduce::Reduce(nullptr, temp_storage_bytes, trans_x, y_data,
reduce_num, reducer, init, stream);
framework::Tensor tmp;
auto* temp_storage = tmp.mutable_data<uint8_t>(
framework::make_ddim({static_cast<int64_t>(temp_storage_bytes)}),
place);
cub::DeviceReduce::Reduce(temp_storage, temp_storage_bytes, trans_x, y_data,
reduce_num, reducer, init, stream);
return;
}
if (rank == 2 && reduce_rank == 1 && reduce_dim[0] == 1) {
ReduceKernel2D<Tx, Ty, ReduceOp, TransformOp,
BlockDim><<<left_num, BlockDim, 0, stream>>>(
x_data, y_data, reducer, transformer, init, reduce_num);
return;
}
/*
if (rank == 3 && reduce_rank == 1 && reduce_dim[0] == 1) {
// TODO(liangdun): we can optimize 3d case which the 2nd axis is reduced.
// Currently, it is handled by code below, but inefficient
return;
}
*/
switch (rank) {
CUB_RANK_CASE(2, CUB_REDUCE_RANK_CASE(1););
CUB_RANK_CASE(3, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2););
CUB_RANK_CASE(4, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
CUB_REDUCE_RANK_CASE(3););
CUB_RANK_CASE(5, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4););
CUB_RANK_CASE(6, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4);
CUB_REDUCE_RANK_CASE(5););
CUB_RANK_CASE(7, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4);
CUB_REDUCE_RANK_CASE(5); CUB_REDUCE_RANK_CASE(6););
CUB_RANK_CASE(8, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4);
CUB_REDUCE_RANK_CASE(5); CUB_REDUCE_RANK_CASE(6););
CUB_RANK_CASE(9, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4);
CUB_REDUCE_RANK_CASE(5); CUB_REDUCE_RANK_CASE(6);
CUB_REDUCE_RANK_CASE(7); CUB_REDUCE_RANK_CASE(8););
}
#undef CUB_REDUCE_RANK_CASE
#undef CUB_RANK_CASE
}
} // namespace detail
template <typename Tx, typename Ty, typename ReduceOp, typename TransformOp>
void TensorReduce(const framework::Tensor& x, framework::Tensor* y,
std::vector<int> origin_reduce_dims, const Ty& init,
const ReduceOp& reducer, const TransformOp& transformer,
cudaStream_t stream) {
auto x_dim = framework::vectorize2int(x.dims());
std::vector<int> new_x_dim, new_reduce_dims;
int is_reduced = 0;
for (auto e : origin_reduce_dims) {
auto pos = e >= 0 ? e : e + x_dim.size();
is_reduced |= 1 << e;
}
for (int i = 0; i < x_dim.size(); i++) {
if ((i == 0) || (((is_reduced >> i) ^ (is_reduced >> (i - 1))) & 1)) {
new_x_dim.push_back(x_dim[i]);
if ((is_reduced >> i) & 1)
new_reduce_dims.push_back(new_x_dim.size() - 1);
} else {
new_x_dim[new_x_dim.size() - 1] *= x_dim[i];
}
}
x_dim = new_x_dim;
origin_reduce_dims = new_reduce_dims;
int x_rank = static_cast<int>(x_dim.size());
std::set<int> left_set, reduce_set;
for (int i = 0; i < x_rank; ++i) left_set.insert(i);
for (auto e : origin_reduce_dims) {
left_set.erase(e);
reduce_set.insert(e);
}
std::vector<int> reduce_dim(reduce_set.begin(), reduce_set.end());
std::vector<int> left_dim(left_set.begin(), left_set.end());
std::vector<int> x_strides = detail::GetStrides(x_dim);
std::vector<int> reduce_strides = detail::GetStrides(x_dim, reduce_dim);
std::vector<int> left_strides = detail::GetStrides(x_dim, left_dim);
int reduce_num = reduce_strides[0] * x_dim[reduce_dim[0]];
int left_num = 1;
if (left_dim.size()) left_num = left_strides[0] * x_dim[left_dim[0]];
std::vector<int> y_dim(left_dim.size());
for (int i = 0; i < left_dim.size(); ++i) {
y_dim[i] = x_dim[left_dim[i]];
}
auto x_data = x.data<Tx>();
auto y_data = y->mutable_data<Ty>(x.place());
if (reduce_num == 1) return;
#define CUB_BLOCK_DIM_CASE(block_dim) \
case block_dim: { \
constexpr auto kBlockDim = block_dim; \
detail::TensorReduceImpl<Tx, Ty, block_dim, ReduceOp, TransformOp>( \
x_data, y_data, x.place(), reducer, transformer, init, left_num, \
reduce_num, x_strides, reduce_dim, reduce_strides, left_dim, \
left_strides, stream); \
} break
switch (detail::GetDesiredBlockDim(reduce_num)) {
CUB_BLOCK_DIM_CASE(512);
CUB_BLOCK_DIM_CASE(256);
CUB_BLOCK_DIM_CASE(128);
CUB_BLOCK_DIM_CASE(64);
CUB_BLOCK_DIM_CASE(32);
CUB_BLOCK_DIM_CASE(16);
CUB_BLOCK_DIM_CASE(8);
CUB_BLOCK_DIM_CASE(4);
CUB_BLOCK_DIM_CASE(2);
}
#undef CUB_BLOCK_DIM_CASE
}
} // namespace operators
} // namespace paddle
...@@ -104,7 +104,6 @@ bool in_quad(T x, T y, T roi_x[], T roi_y[]) { ...@@ -104,7 +104,6 @@ bool in_quad(T x, T y, T roi_x[], T roi_y[]) {
* a31 = (dx3 * dy2 - dx2 * dy3) / (dx1 * dy2 - dx2 * dy1) / (w - 1) * a31 = (dx3 * dy2 - dx2 * dy3) / (dx1 * dy2 - dx2 * dy1) / (w - 1)
* a32 = (dx1 * dy3 - dx3 * dy1) / (dx1 * dy2 - dx2 * dy1) / (h - 1) * a32 = (dx1 * dy3 - dx3 * dy1) / (dx1 * dy2 - dx2 * dy1) / (h - 1)
* a33 = 1 * a33 = 1
*
*/ */
template <typename T> template <typename T>
void get_transform_matrix(const int transformed_width, void get_transform_matrix(const int transformed_width,
...@@ -260,8 +259,8 @@ class CPUROIPerspectiveTransformOpKernel : public framework::OpKernel<T> { ...@@ -260,8 +259,8 @@ class CPUROIPerspectiveTransformOpKernel : public framework::OpKernel<T> {
roi2image.Resize({rois_num}); roi2image.Resize({rois_num});
int* roi2image_data = roi2image.mutable_data<int>(ctx.GetPlace()); int* roi2image_data = roi2image.mutable_data<int>(ctx.GetPlace());
auto lod = rois->lod().back(); auto lod = rois->lod().back();
for (int i = 0; i < lod.size() - 1; ++i) { for (size_t i = 0; i < lod.size() - 1; ++i) {
for (int j = lod[i]; j < lod[i + 1]; ++j) { for (size_t j = lod[i]; j < lod[i + 1]; ++j) {
roi2image_data[j] = i; roi2image_data[j] = i;
} }
} }
...@@ -393,8 +392,8 @@ class CPUROIPerspectiveTransformGradOpKernel : public framework::OpKernel<T> { ...@@ -393,8 +392,8 @@ class CPUROIPerspectiveTransformGradOpKernel : public framework::OpKernel<T> {
roi2image.Resize({rois_num}); roi2image.Resize({rois_num});
int* roi2image_data = roi2image.mutable_data<int>(ctx.GetPlace()); int* roi2image_data = roi2image.mutable_data<int>(ctx.GetPlace());
auto lod = rois->lod().back(); auto lod = rois->lod().back();
for (int i = 0; i < lod.size() - 1; ++i) { for (size_t i = 0; i < lod.size() - 1; ++i) {
for (int j = lod[i]; j < lod[i + 1]; ++j) { for (size_t j = lod[i]; j < lod[i + 1]; ++j) {
roi2image_data[j] = i; roi2image_data[j] = i;
} }
} }
...@@ -404,7 +403,7 @@ class CPUROIPerspectiveTransformGradOpKernel : public framework::OpKernel<T> { ...@@ -404,7 +403,7 @@ class CPUROIPerspectiveTransformGradOpKernel : public framework::OpKernel<T> {
for (int in_h = 0; in_h < in_height; ++in_h) { for (int in_h = 0; in_h < in_height; ++in_h) {
for (int in_w = 0; in_w < in_width; ++in_w) { for (int in_w = 0; in_w < in_width; ++in_w) {
T gradient = 0.0; T gradient = 0.0;
for (int roi_idx = lod[n]; roi_idx < lod[n + 1]; ++roi_idx) { for (size_t roi_idx = lod[n]; roi_idx < lod[n + 1]; ++roi_idx) {
const T* rois = rois_data + roi_idx * 8; const T* rois = rois_data + roi_idx * 8;
T roi_x[4]; T roi_x[4];
T roi_y[4]; T roi_y[4];
......
...@@ -345,8 +345,8 @@ class CUDAROIPerspectiveTransformOpKernel : public framework::OpKernel<T> { ...@@ -345,8 +345,8 @@ class CUDAROIPerspectiveTransformOpKernel : public framework::OpKernel<T> {
roi2image.Resize({rois_num}); roi2image.Resize({rois_num});
int* roi2image_data = roi2image.mutable_data<int>(platform::CPUPlace()); int* roi2image_data = roi2image.mutable_data<int>(platform::CPUPlace());
auto lod = rois->lod().back(); auto lod = rois->lod().back();
for (int i = 0; i < lod.size() - 1; ++i) { for (size_t i = 0; i < lod.size() - 1; ++i) {
for (int j = lod[i]; j < lod[i + 1]; ++j) { for (size_t j = lod[i]; j < lod[i + 1]; ++j) {
roi2image_data[j] = i; roi2image_data[j] = i;
} }
} }
...@@ -432,7 +432,7 @@ __global__ void RoiTransformGradKernel( ...@@ -432,7 +432,7 @@ __global__ void RoiTransformGradKernel(
T gradient = 0.0; T gradient = 0.0;
// Accumulate gradient over all RoIs that interpolated this element // Accumulate gradient over all RoIs that interpolated this element
for (int roi_idx = lod[n]; roi_idx < lod[n + 1]; ++roi_idx) { for (size_t roi_idx = lod[n]; roi_idx < lod[n + 1]; ++roi_idx) {
const T* rois = rois_data + roi_idx * 8; const T* rois = rois_data + roi_idx * 8;
T roi_x[4]; T roi_x[4];
T roi_y[4]; T roi_y[4];
......
...@@ -15,6 +15,7 @@ limitations under the License. */ ...@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once #pragma once
#include <time.h> #include <time.h>
#include <atomic>
#include <chrono> // NOLINT #include <chrono> // NOLINT
#include <condition_variable> // NOLINT #include <condition_variable> // NOLINT
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#pragma once #pragma once
#include <time.h> #include <time.h>
#include <condition_variable> // NOLINT
#include <functional> #include <functional>
#include <string> #include <string>
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#pragma once #pragma once
#include <atomic>
#include <set> #include <set>
#include <string> #include <string>
#include <thread> // NOLINT #include <thread> // NOLINT
......
...@@ -89,7 +89,7 @@ class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -89,7 +89,7 @@ class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<bool>("use_mkldnn", "(bool, default false). Used by MKLDNN.") AddAttr<bool>("use_mkldnn", "(bool, default false). Used by MKLDNN.")
.SetDefault(false); .SetDefault(false);
AddComment(string::Sprintf(R"DOC( AddComment(string::Sprintf(R"DOC(
Limited Elementwise %s Operator Elementwise %s Operator
The equation is: The equation is:
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/fused_embedding_fc_lstm_op.h"
#include <string>
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/cpu_vec.h"
#include "paddle/fluid/operators/math/fc_compute.h"
#include "paddle/fluid/operators/math/sequence2batch.h"
#include "paddle/fluid/platform/cpu_info.h"
namespace paddle {
namespace operators {
void FusedEmbeddingFCLSTMOp::InferShape(
framework::InferShapeContext* ctx) const {
PADDLE_ENFORCE(ctx->HasInput("Embeddings"),
"Assert only one Input(Embeddings) of LSTM.");
PADDLE_ENFORCE(ctx->HasInput("WeightH"),
"Assert only one Input(WeightH) of LSTM.");
PADDLE_ENFORCE(ctx->HasInput("Bias"), "Assert only one Input(Bias) of LSTM.");
PADDLE_ENFORCE(ctx->HasOutput("XX"), "Assert only one Output(XX) of LSTM.");
PADDLE_ENFORCE(ctx->HasOutput("Hidden"),
"Assert only one Output(Hidden) of LSTM.");
PADDLE_ENFORCE(ctx->HasOutput("Cell"),
"Assert only one Output(Cell) of LSTM.");
PADDLE_ENFORCE(ctx->HasInput("Ids"),
"Input(Ids) of LookupTableOp should not be null.");
auto table_dims = ctx->GetInputDim("Embeddings");
auto ids_dims = ctx->GetInputDim("Ids");
int ids_rank = ids_dims.size();
PADDLE_ENFORCE_EQ(table_dims.size(), 2);
PADDLE_ENFORCE_EQ(ids_dims[ids_rank - 1], 1,
"The last dimension of the 'Ids' tensor must be 1.");
auto x_dims = ctx->GetInputDim("Ids");
PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(Ids)'s rank must be 2.");
if (ctx->HasInput("H0")) {
PADDLE_ENFORCE(ctx->HasInput("C0"),
"Input(Cell) and Input(Hidden) of LSTM should not "
"be null at the same time.");
auto h_dims = ctx->GetInputDim("H0");
auto c_dims = ctx->GetInputDim("C0");
PADDLE_ENFORCE(h_dims == c_dims,
"The dimension of Input(H0) and Input(C0) "
"should be the same.");
}
auto embeddings_dims = ctx->GetInputDim("Embeddings");
PADDLE_ENFORCE_EQ(embeddings_dims.size(), 2,
"The rank of Input(Embeddings) should be 2.");
auto wh_dims = ctx->GetInputDim("WeightH");
int frame_size = wh_dims[1] / 4;
PADDLE_ENFORCE_EQ(wh_dims.size(), 2,
"The rank of Input(WeightH) should be 2.");
PADDLE_ENFORCE_EQ(wh_dims[0], frame_size,
"The first dimension of Input(WeightH) "
"should be %d.",
frame_size);
PADDLE_ENFORCE_EQ(wh_dims[1], 4 * frame_size,
"The second dimension of Input(WeightH) "
"should be 4 * %d.",
frame_size);
auto b_dims = ctx->GetInputDim("Bias");
PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2.");
PADDLE_ENFORCE_EQ(b_dims[0], 1,
"The first dimension of Input(Bias) should be 1.");
PADDLE_ENFORCE_EQ(
b_dims[1], (ctx->Attrs().Get<bool>("use_peepholes") ? 7 : 4) * frame_size,
"The second dimension of Input(Bias) should be "
"7 * %d if enable peepholes connection or"
"4 * %d if disable peepholes",
frame_size, frame_size);
framework::DDim out_dims({x_dims[0], frame_size});
ctx->SetOutputDim("Hidden", out_dims);
ctx->SetOutputDim("Cell", out_dims);
ctx->ShareLoD("Ids", "Hidden");
ctx->ShareLoD("Ids", "Cell");
int xx_width;
if (ctx->Attrs().Get<bool>("use_seq")) {
xx_width = wh_dims[1];
} else {
xx_width = x_dims[1] > wh_dims[1] ? wh_dims[1] : x_dims[1];
PADDLE_ENFORCE(ctx->HasOutput("BatchedInput"),
"Assert only one Output(BatchedInput) of LSTM.");
PADDLE_ENFORCE(ctx->HasOutput("BatchedHidden"),
"Assert only one Output(BatchedHidden) of LSTM.");
PADDLE_ENFORCE(ctx->HasOutput("BatchedCell"),
"Assert only one Output(BatchedCell) of LSTM.");
PADDLE_ENFORCE(ctx->HasOutput("ReorderedH0"),
"Assert only one Output(ReorderedH0) of LSTM");
PADDLE_ENFORCE(ctx->HasOutput("ReorderedC0"),
"Assert only one Output(ReorderedC0) of LSTM.");
ctx->SetOutputDim("BatchedInput", {x_dims[0], wh_dims[1]});
ctx->SetOutputDim("BatchedHidden", out_dims);
ctx->SetOutputDim("BatchedCell", out_dims);
}
ctx->SetOutputDim("XX", {x_dims[0], xx_width});
ctx->ShareLoD("Ids", "XX");
}
framework::OpKernelType FusedEmbeddingFCLSTMOp::GetExpectedKernelType(
const framework::ExecutionContext& ctx) const {
return framework::OpKernelType(
framework::ToDataType(
ctx.Input<framework::LoDTensor>("Embeddings")->type()),
ctx.device_context());
}
void FusedEmbeddingFCLSTMOpMaker::Make() {
AddInput("Ids",
"An input with type int32 or int64 "
"contains the ids to be looked up in W. "
"The last dimension size must be 1.");
AddInput("Embeddings",
"(Tensor) the learnable weights of X."
" - The shape is (M x 4D), where M is the dim size of x, D is the "
"hidden size. "
" - Weight = {W_cx, W_ix, W_fx, W_ox}");
AddInput("WeightH",
"(Tensor) same as LSTMOp, the learnable hidden-hidden weights."
" - The shape is (D x 4D), where D is the hidden size. "
" - Weight = {W_ch, W_ih, W_fh, W_oh}");
AddInput("Bias",
"(Tensor) the learnable weights. Almost same as LSTMOp"
"Note: we should add the fc bias into this (1x4D) in bias."
"input-hidden bias weight and peephole connections weight if "
"setting `use_peepholes` True. "
"1. `use_peepholes = False` "
" - The shape is (1 x 4D). "
" - Bias = {b_c, b_i, b_f, b_o}."
"2. `use_peepholes = True` "
" - The shape is (1 x 7D). "
" - Bias = {b_c, b_i, b_f, b_o, W_ic, W_fc, W_oc}.");
AddInput("H0",
"(Tensor, optional) (same as LSTMOp) the initial hidden state is an "
"optional "
"input. This is a tensor with shape (N x D), where N is the "
"batch size and D is the hidden size.")
.AsDispensable();
AddInput("C0",
"(Tensor, optional) (same as LSTMOp) (the initial cell state is an "
"optional "
"input. This is a tensor with shape (N x D), where N is the "
"batch size. `H0` and `C0` can be NULL but only at the same time.")
.AsDispensable();
AddOutput("Hidden",
"(LoDTensor) (same as LSTMOp) the hidden state of LSTM operator. "
"The shape is (T x D), and lod is the same with the `Input`.");
AddOutput("Cell",
"(LoDTensor) (same as LSTMOp) the cell state of LSTM operator. "
"The shape is (T x D), and lod is the same with the `Input`.");
AddOutput("XX",
"(LoDTensor) the result after X * WeightX (size is T x 4D)"
" or batched_X (size is T x M), this will be automatically chosen,"
" where T is the total time steps in this mini-batch,"
" D is the hidden size, M is the dim size of x input.")
.AsIntermediate();
AddOutput("BatchedInput", "(LoDTensor) (T x 4D).").AsIntermediate();
AddOutput("BatchedHidden", "(LoDTensor) (T x D).").AsIntermediate();
AddOutput("BatchedCell", "(LoDTensor) (T x D).").AsIntermediate();
AddOutput("ReorderedH0", "(LoDTensor) (N x D).").AsIntermediate();
AddOutput("ReorderedC0", "(LoDTensor) (N x D).").AsIntermediate();
AddAttr<bool>("use_peepholes",
"(bool, defalut: True) "
"whether to enable diagonal/peephole connections.")
.SetDefault(true);
AddAttr<bool>("is_reverse",
"(bool, defalut: False) "
"whether to compute reversed LSTM.")
.SetDefault(false);
AddAttr<bool>("use_seq",
"(bool, defalut: True) "
"whether to use seq mode to compute.")
.SetDefault(true);
AddAttr<std::string>("gate_activation",
"(string, default: sigmoid)"
"The activation for input gate, forget gate and output "
"gate, `sigmoid` by default.")
.SetDefault("sigmoid")
.InEnum({"sigmoid", "tanh", "relu", "identity"});
AddAttr<std::string>("cell_activation",
"(string, default: tanh)"
"The activation for cell output, `tanh` by defalut.")
.SetDefault("tanh")
.InEnum({"sigmoid", "tanh", "relu", "identity"});
AddAttr<std::string>("candidate_activation",
"(string, default: tanh)"
"The activation for candidate hidden state, "
"`tanh` by default.")
.SetDefault("tanh")
.InEnum({"sigmoid", "tanh", "relu", "identity"});
AddComment(R"DOC(
Fusion Long-Short Term Memory (LSTM) Operator.
This operator fuse the X into LSTM, more details can refer to LSTM op.
)DOC");
}
template <typename T>
class FusedEmbeddingFCLSTMKernel : public framework::OpKernel<T> {
public:
#define INIT_VEC_FUNC \
std::function<void(const int, const T *, T *)> act_gate, act_cell, act_cand; \
auto& act_gate_str = ctx.Attr<std::string>("gate_activation"); \
auto& act_cell_str = ctx.Attr<std::string>("cell_activation"); \
auto& act_cand_str = ctx.Attr<std::string>("candidate_activation"); \
if (platform::jit::MayIUse(platform::jit::avx)) { \
math::VecActivations<T, platform::jit::avx> act_functor; \
act_gate = act_functor(act_gate_str); \
act_cell = act_functor(act_cell_str); \
act_cand = act_functor(act_cand_str); \
} else { \
math::VecActivations<T, platform::jit::isa_any> act_functor; \
act_gate = act_functor(act_gate_str); \
act_cell = act_functor(act_cell_str); \
act_cand = act_functor(act_cand_str); \
}
#define INIT_BASE_INPUT_OUTPUT \
auto* ids = ctx.Input<LoDTensor>("Ids"); \
auto* h0 = ctx.Input<Tensor>("H0"); \
auto* c0 = ctx.Input<Tensor>("C0"); \
auto* embeddings = ctx.Input<Tensor>("Embeddings"); \
auto* wh = ctx.Input<Tensor>("WeightH"); \
auto* bias = ctx.Input<Tensor>("Bias"); \
auto* xx = ctx.Output<LoDTensor>("XX"); \
auto* hidden_out = ctx.Output<LoDTensor>("Hidden"); \
auto* cell_out = ctx.Output<LoDTensor>("Cell"); \
bool is_reverse = ctx.Attr<bool>("is_reverse"); \
bool use_peepholes = ctx.Attr<bool>("use_peepholes");
#define INIT_BASE_SIZES \
auto ids_dims = ids->dims(); /* T x M*/ \
auto ids_numel = ids->numel(); /* T x 1*/ \
auto wh_dims = wh->dims(); /* D x 4D*/ \
const int D = wh_dims[0]; \
const int D2 = D * 2; \
const int D3 = D * 3; \
int64_t row_number = embeddings->dims()[0]; \
int64_t row_width = embeddings->dims()[1]; \
const int D4 = wh_dims[1];
#define INIT_BASE_INPUT_DATAS \
const int64_t* ids_data = ids->data<int64_t>(); \
const T* embeddings_data = embeddings->data<T>(); \
const T* wh_data = wh->data<T>(); \
/* diagonal weight*/ \
const T* wc_data = bias->data<T>() + D4; \
/* for peephole only*/ \
Tensor checked_cell; \
T* checked_cell_data = nullptr; \
auto place = ctx.GetPlace(); \
if (use_peepholes) { \
/* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/ \
checked_cell_data = checked_cell.mutable_data<T>({2, D}, place); \
}
/// Compute LSTM
#define GEMM_WH_ADDON(bs, prev, out) \
blas.GEMM(CblasNoTrans, CblasNoTrans, bs, D4, D, static_cast<T>(1), prev, D, \
wh_data, D4, static_cast<T>(1), out, D4)
// gates: W_ch, W_ih, W_fh, W_oh
#define GET_Ct(ct_1, gates, ct) \
/* C_t = C_t-1 * fgated + cand_gated * igated*/ \
act_cand(D, gates, gates); \
blas.VMUL(D, gates, gates + D, gates + D); \
blas.VMUL(D, ct_1, gates + D2, gates + D2); \
blas.VADD(D, gates + D, gates + D2, ct)
#define GET_Ht(ct, gates, ht) \
/* H_t = act_cell(C_t) * ogated */ \
act_cell(D, ct, gates + D2); \
blas.VMUL(D, gates + D2, gates + D3, ht)
#define GET_Ct_NOH0C0(gates, ct) \
/* C_t = igated * cgated*/ \
act_gate(D, gates + D, gates + D); \
act_cand(D, gates, gates); \
blas.VMUL(D, gates, gates + D, ct)
#define COMPUTE_CtHt_NOH0C0(gates, ct, ht) \
GET_Ct_NOH0C0(gates, ct); \
act_gate(D, gates + D3, gates + D3); \
GET_Ht(ct, gates, ht)
#define COMPUTE_CtHt_PEEPHOLE_NOH0C0(gates, ct, ht) \
GET_Ct_NOH0C0(gates, ct); \
/* get outgated, put W_oc * C_t on igated */ \
blas.VMUL(D, wc_data + D2, ct, gates + D); \
blas.VADD(D, gates + D, gates + D3, gates + D3); \
act_gate(D, gates + D3, gates + D3); \
GET_Ht(ct, gates, ht)
#define COMPUTE_CtHt(gates, ct_1, ct, ht) \
act_gate(D3, gates + D, gates + D); \
GET_Ct(ct_1, gates, ct); \
GET_Ht(ct, gates, ht)
#define COMPUTE_CtHt_PEEPHOLE(gates, ct_1, ct, ht) \
/* get fgated and igated*/ \
blas.VMUL(D, wc_data, ct_1, checked_cell_data); \
blas.VMUL(D, wc_data + D, ct_1, checked_cell_data + D); \
blas.VADD(D2, checked_cell_data, gates + D, gates + D); \
act_gate(D2, gates + D, gates + D); \
GET_Ct(ct_1, gates, ct); \
/* get ogated*/ \
blas.VMUL(D, wc_data + D2, ct, gates + D); \
blas.VADD(D, gates + D, gates + D3, gates + D3); \
act_gate(D, gates + D3, gates + D3); \
GET_Ht(ct, gates, ht)
void SeqCompute(const framework::ExecutionContext& ctx) const {
using DeviceContext = paddle::platform::CPUDeviceContext;
INIT_BASE_INPUT_OUTPUT
INIT_BASE_SIZES
INIT_VEC_FUNC
INIT_BASE_INPUT_DATAS
// std::cout << "====> SeqCompute" << std::endl;
auto ids_lod = ids->lod();
const int total_T = ids_dims[0];
const int N = ids_lod[0].size() - 1;
const T* h0_data = h0 ? h0->data<T>() : nullptr;
const T* c0_data = c0 ? c0->data<T>() : nullptr;
T* xx_data = xx->mutable_data<T>(place);
T* h_out_data = hidden_out->mutable_data<T>(place);
T* c_out_data = cell_out->mutable_data<T>(place);
auto blas = math::GetBlas<DeviceContext, T>(ctx);
for (int64_t i = 0; i < ids_numel; ++i) {
PADDLE_ENFORCE_LT(ids_data[i], row_number);
PADDLE_ENFORCE_GE(ids_data[i], 0, "ids %d", i);
memcpy(xx_data + i * row_width, embeddings_data + ids_data[i] * row_width,
row_width * sizeof(T));
}
int xx_offset = D4;
int gate_offset = D;
if (is_reverse) {
const int offset = (total_T - 1) * D;
xx_data = xx_data + offset * 4;
h_out_data = h_out_data + offset;
c_out_data = c_out_data + offset;
xx_offset = -D4;
gate_offset = -D;
}
#define MOVE_ONE_STEP \
prev_h_data = h_out_data; \
prev_c_data = c_out_data; \
xx_data = xx_data + xx_offset; \
h_out_data = h_out_data + gate_offset; \
c_out_data = c_out_data + gate_offset
#define PROCESS_H0C0_DEFINES \
int bid = is_reverse ? N - 1 - i : i; \
int seq_len = ids_lod[0][bid + 1] - ids_lod[0][bid]; \
const T* prev_c_data = nullptr; \
const T* prev_h_data = nullptr; \
int tstart = 0
#define PROCESS_H0C0_PEEPHOLE \
PROCESS_H0C0_DEFINES; \
if (h0_data) { \
prev_h_data = h0_data + bid * D; \
prev_c_data = c0_data + bid * D; \
} else { \
COMPUTE_CtHt_PEEPHOLE_NOH0C0(xx_data, c_out_data, h_out_data); \
MOVE_ONE_STEP; \
tstart = 1; \
}
#define PROCESS_H0C0 \
PROCESS_H0C0_DEFINES; \
if (h0_data) { \
prev_h_data = h0_data + bid * D; \
prev_c_data = c0_data + bid * D; \
} else { \
COMPUTE_CtHt_NOH0C0(xx_data, c_out_data, h_out_data); \
MOVE_ONE_STEP; \
tstart = 1; \
}
if (use_peepholes) {
for (int i = 0; i < N; ++i) {
PROCESS_H0C0_PEEPHOLE
for (int step = tstart; step < seq_len; ++step) {
GEMM_WH_ADDON(1, prev_h_data, xx_data);
COMPUTE_CtHt_PEEPHOLE(xx_data, prev_c_data, c_out_data, h_out_data);
MOVE_ONE_STEP;
}
}
} else {
for (int i = 0; i < N; ++i) {
PROCESS_H0C0
for (int step = tstart; step < seq_len; ++step) {
GEMM_WH_ADDON(1, prev_h_data, xx_data);
COMPUTE_CtHt(xx_data, prev_c_data, c_out_data, h_out_data);
MOVE_ONE_STEP;
}
}
}
#undef PROCESS_H0C0_DEFINES
#undef PROCESS_H0C0_PEEPHOLE
#undef PROCESS_H0C0
#undef MOVE_ONE_STEP
}
void BatchCompute(const framework::ExecutionContext& ctx) const {
using DeviceContext = platform::CPUDeviceContext;
INIT_BASE_INPUT_OUTPUT
if (ids->lod()[0].size() == 2) {
SeqCompute(ctx);
return;
}
INIT_BASE_SIZES
INIT_VEC_FUNC
INIT_BASE_INPUT_DATAS
// std::cout << "===> Batch Compute" << std::endl;
auto* reordered_h0 = ctx.Output<Tensor>("ReorderedH0");
auto* reordered_c0 = ctx.Output<Tensor>("ReorderedC0");
auto* batched_input = ctx.Output<LoDTensor>("BatchedInput");
auto* batched_c_out = ctx.Output<LoDTensor>("BatchedCell");
auto* batched_h_out = ctx.Output<LoDTensor>("BatchedHidden");
T* xx_data = xx->mutable_data<T>(place);
T* batched_input_data = batched_input->mutable_data<T>(place);
T* batched_c_out_data = batched_c_out->mutable_data<T>(place);
T* batched_h_out_data = batched_h_out->mutable_data<T>(place);
hidden_out->mutable_data<T>(place);
cell_out->mutable_data<T>(place);
math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
auto& dev_ctx = ctx.template device_context<DeviceContext>();
auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
for (int64_t i = 0; i < ids_numel; ++i) {
PADDLE_ENFORCE_LT(ids_data[i], row_number);
PADDLE_ENFORCE_GE(ids_data[i], 0, "ids %d", i);
memcpy(xx_data + i * row_width, embeddings_data + ids_data[i] * row_width,
row_width * sizeof(T));
}
to_batch(dev_ctx, *xx, batched_input, true, is_reverse);
auto batched_lod = batched_input->lod();
const auto& seq_order = batched_lod[2];
const int max_bs = seq_order.size();
reordered_h0->Resize({max_bs, D});
reordered_c0->Resize({max_bs, D});
int tstart = 0;
T* prev_h_data = nullptr;
T* prev_c_data = nullptr;
if (h0) {
// reorder h0, c0
T* reordered_h0_data = reordered_h0->mutable_data<T>(place);
T* reordered_c0_data = reordered_c0->mutable_data<T>(place);
const T* h0_data = h0->data<T>();
const T* c0_data = c0->data<T>();
prev_h_data = reordered_h0_data;
prev_c_data = reordered_c0_data;
size_t sz = sizeof(T) * D;
for (int i = 0; i < max_bs; ++i) {
std::memcpy(reordered_h0_data, h0_data + seq_order[i] * D, sz);
std::memcpy(reordered_c0_data, c0_data + seq_order[i] * D, sz);
reordered_h0_data += D;
reordered_c0_data += D;
}
} else {
// compute without h0, c0
T* cur_in_data = batched_input_data;
T* cur_h_out_data = batched_h_out_data;
T* cur_c_out_data = batched_c_out_data;
for (int i = 0; i < max_bs; ++i) {
GET_Ct_NOH0C0(cur_in_data, cur_c_out_data);
if (use_peepholes) {
blas.VMUL(D, wc_data + D2, cur_c_out_data, cur_in_data + D);
blas.VADD(D, cur_in_data + D, cur_in_data + D3, cur_in_data + D3);
}
act_gate(D, cur_in_data + D3, cur_in_data + D3);
GET_Ht(cur_c_out_data, cur_in_data, cur_h_out_data);
cur_in_data += D4;
cur_c_out_data += D;
cur_h_out_data += D;
}
tstart = 1;
prev_h_data = batched_h_out_data;
prev_c_data = batched_c_out_data;
}
const auto& batch_starts = batched_lod[0];
const int max_seq_len = batch_starts.size() - 1;
const int offset = tstart * max_bs * D;
batched_input_data = batched_input_data + offset * 4;
batched_h_out_data = batched_h_out_data + offset;
batched_c_out_data = batched_c_out_data + offset;
#define DEFINE_CUR \
T* cur_in_data = batched_input_data; \
T* cur_prev_c_data = prev_c_data; \
T* cur_c_out_data = batched_c_out_data; \
T* cur_h_out_data = batched_h_out_data
#define MOVE_ONE_BATCH \
cur_in_data += D4; \
cur_prev_c_data += D; \
cur_c_out_data += D; \
cur_h_out_data += D
#define MOVE_ONE_STEP \
prev_c_data = batched_c_out_data; \
prev_h_data = batched_h_out_data; \
batched_c_out_data = cur_c_out_data; \
batched_h_out_data = cur_h_out_data; \
batched_input_data = cur_in_data
if (use_peepholes) {
for (int step = tstart; step < max_seq_len; ++step) {
const int cur_bs = batch_starts[step + 1] - batch_starts[step];
GEMM_WH_ADDON(cur_bs, prev_h_data, batched_input_data);
DEFINE_CUR;
for (int i = 0; i < cur_bs; ++i) {
COMPUTE_CtHt_PEEPHOLE(cur_in_data, cur_prev_c_data, cur_c_out_data,
cur_h_out_data);
MOVE_ONE_BATCH;
}
MOVE_ONE_STEP;
}
} else {
for (int step = tstart; step < max_seq_len; ++step) {
const int cur_bs = batch_starts[step + 1] - batch_starts[step];
GEMM_WH_ADDON(cur_bs, prev_h_data, batched_input_data);
DEFINE_CUR;
for (int i = 0; i < cur_bs; ++i) {
COMPUTE_CtHt(cur_in_data, cur_prev_c_data, cur_c_out_data,
cur_h_out_data);
MOVE_ONE_BATCH;
}
MOVE_ONE_STEP;
}
}
#undef MOVE_ONE_STEP
#undef MOVE_ONE_BATCH
#undef DEFINE_CUR
math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
batched_h_out->set_lod(batched_lod);
to_seq(dev_ctx, *batched_h_out, hidden_out);
batched_c_out->set_lod(batched_lod);
to_seq(dev_ctx, *batched_c_out, cell_out);
}
void Compute(const framework::ExecutionContext& ctx) const override {
if (ctx.Attr<bool>("use_seq")) {
SeqCompute(ctx);
} else {
BatchCompute(ctx);
}
}
#undef COMPUTE_CtHt_PEEPHOLE
#undef COMPUTE_CtHt
#undef GET_Ct_NOH0C0
#undef COMPUTE_CtHt_NOH0C0
#undef COMPUTE_CtHt_PEEPHOLE_NOH0C0
#undef GET_Ht
#undef GET_Ct
#undef GEMM_WH_ADDON
#undef INIT_BASE_INPUT_DATAS
#undef INIT_BASE_SIZES
#undef INIT_BASE_INPUT_OUTPUT
#undef INIT_VEC_FUNC
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(fused_embedding_fc_lstm, ops::FusedEmbeddingFCLSTMOp,
ops::FusedEmbeddingFCLSTMOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OP_CPU_KERNEL(fused_embedding_fc_lstm,
ops::FusedEmbeddingFCLSTMKernel<float>,
ops::FusedEmbeddingFCLSTMKernel<double>);
...@@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License"); ...@@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
You may obtain a copy of the License at You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, distributed under the License is distributed on an "AS IS" BASIS,
...@@ -13,26 +13,29 @@ See the License for the specific language governing permissions and ...@@ -13,26 +13,29 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/channel.h"
#include "paddle/fluid/framework/variable.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace concurrency {
void ChannelSend(framework::ChannelHolder *ch, framework::Variable *var); using LoDTensor = framework::LoDTensor;
bool ChannelReceive(framework::ChannelHolder *ch, framework::Variable *var); using Tensor = framework::Tensor;
class FusedEmbeddingFCLSTMOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override;
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override;
};
void ChannelAddToSendQ(framework::ChannelHolder *ch, const void *referrer, class FusedEmbeddingFCLSTMOpMaker : public framework::OpProtoAndCheckerMaker {
framework::Variable *var, public:
std::shared_ptr<std::condition_variable_any> cond, void Make() override;
std::function<bool(framework::ChannelAction)> cb); };
void ChannelAddToReceiveQ(framework::ChannelHolder *ch, const void *referrer,
framework::Variable *var,
std::shared_ptr<std::condition_variable_any> cond,
std::function<bool(framework::ChannelAction)> cb);
} // namespace concurrency
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -290,12 +290,13 @@ class FusionGRUKernel : public framework::OpKernel<T> { ...@@ -290,12 +290,13 @@ class FusionGRUKernel : public framework::OpKernel<T> {
void BatchCompute(const framework::ExecutionContext& ctx) const { void BatchCompute(const framework::ExecutionContext& ctx) const {
using DeviceContext = paddle::platform::CPUDeviceContext; using DeviceContext = paddle::platform::CPUDeviceContext;
auto* x = ctx.Input<LoDTensor>("X"); auto* x = ctx.Input<LoDTensor>("X");
INIT_BASE_INPUT_OUTPUT
INIT_BASE_SIZES
if (x->lod()[0].size() == 2) { if (x->lod()[0].size() == 2) {
xx->Resize({total_T, D3});
SeqCompute(ctx); SeqCompute(ctx);
return; return;
} }
INIT_BASE_INPUT_OUTPUT
INIT_BASE_SIZES
INIT_VEC_FUNC INIT_VEC_FUNC
auto* reordered_h0 = ctx.Output<Tensor>("ReorderedH0"); auto* reordered_h0 = ctx.Output<Tensor>("ReorderedH0");
......
...@@ -432,11 +432,12 @@ class FuisonLSTMKernel : public framework::OpKernel<T> { ...@@ -432,11 +432,12 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
void BatchCompute(const framework::ExecutionContext& ctx) const { void BatchCompute(const framework::ExecutionContext& ctx) const {
using DeviceContext = platform::CPUDeviceContext; using DeviceContext = platform::CPUDeviceContext;
INIT_BASE_INPUT_OUTPUT INIT_BASE_INPUT_OUTPUT
INIT_BASE_SIZES
if (x->lod()[0].size() == 2) { if (x->lod()[0].size() == 2) {
xx->Resize({x_dims[0], D4});
SeqCompute(ctx); SeqCompute(ctx);
return; return;
} }
INIT_BASE_SIZES
INIT_VEC_FUNC INIT_VEC_FUNC
INIT_BASE_INPUT_DATAS INIT_BASE_INPUT_DATAS
......
...@@ -13,6 +13,31 @@ limitations under the License. */ ...@@ -13,6 +13,31 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace math {} // namespace math namespace math {
#ifdef __AVX__
template <>
void lstm_compute_ctht<float>(float* gates, const float* ct_1, float* ct,
float* ht) {
namespace act = detail::forward::avx;
// gates: W_ch, W_ih, W_fh, W_oh
__m256 c, i, f, o;
c = _mm256_loadu_ps(gates);
i = _mm256_loadu_ps(gates + 8);
f = _mm256_loadu_ps(gates + 16);
o = _mm256_loadu_ps(gates + 24);
/* C_t = C_t-1 * fgated + cand_gated * igated*/
c = _mm256_mul_ps(act::Tanh(c), act::Sigmoid(i));
i = _mm256_loadu_ps(ct_1);
f = _mm256_mul_ps(i, act::Sigmoid(f));
f = _mm256_add_ps(c, f);
_mm256_storeu_ps(ct, f);
/* H_t = act_cell(C_t) * ogated */
o = _mm256_mul_ps(act::Tanh(f), act::Sigmoid(o));
_mm256_storeu_ps(ht, o);
}
#endif
} // namespace math
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -48,32 +48,15 @@ namespace forward { ...@@ -48,32 +48,15 @@ namespace forward {
namespace avx { namespace avx {
__m256 Sigmoid(const __m256 a); __m256 Sigmoid(const __m256 a);
__m256 Tanh(const __m256 a); __m256 Tanh(const __m256 a);
} // namespace avx } // namespace avx
} // namespace forward } // namespace forward
} // namespace detail } // namespace detail
template <> template <>
void lstm_compute_ctht<float>(float* gates, const float* ct_1, float* ct, void lstm_compute_ctht<float>(float* gates, const float* ct_1, float* ct,
float* ht) { float* ht);
namespace act = detail::forward::avx;
// gates: W_ch, W_ih, W_fh, W_oh
__m256 c, i, f, o;
c = _mm256_loadu_ps(gates);
i = _mm256_loadu_ps(gates + 8);
f = _mm256_loadu_ps(gates + 16);
o = _mm256_loadu_ps(gates + 24);
/* C_t = C_t-1 * fgated + cand_gated * igated*/
c = _mm256_mul_ps(act::Tanh(c), act::Sigmoid(i));
i = _mm256_loadu_ps(ct_1);
f = _mm256_mul_ps(i, act::Sigmoid(f));
f = _mm256_add_ps(c, f);
_mm256_storeu_ps(ct, f);
/* H_t = act_cell(C_t) * ogated */
o = _mm256_mul_ps(act::Tanh(f), act::Sigmoid(o));
_mm256_storeu_ps(ht, o);
}
#endif #endif
} // namespace math } // namespace math
......
...@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <algorithm>
#include <vector> #include <vector>
#include "paddle/fluid/operators/math/depthwise_conv.h" #include "paddle/fluid/operators/math/depthwise_conv.h"
#include "paddle/fluid/platform/cuda_primitives.h" #include "paddle/fluid/platform/cuda_primitives.h"
...@@ -20,149 +21,268 @@ namespace paddle { ...@@ -20,149 +21,268 @@ namespace paddle {
namespace operators { namespace operators {
namespace math { namespace math {
template <typename T>
__inline__ __device__ T warpReduceSum(T val) {
#if CUDA_VERSION < 9000
for (int offset = 16; offset > 0; offset /= 2)
val += __shfl_down(val, offset);
return val;
#else
#define FULL_MASK 0xffffffff
for (int offset = 16; offset > 0; offset /= 2)
val += __shfl_down_sync(FULL_MASK, val, offset);
return val;
#endif
}
__forceinline__ __device__ unsigned lane_id() {
unsigned ret;
asm volatile("mov.u32 %0, %laneid;" : "=r"(ret));
return ret;
}
__forceinline__ __device__ unsigned warp_id() {
unsigned ret;
asm volatile("mov.u32 %0, %warpid;" : "=r"(ret));
return ret;
}
// A Cuda kernel to compute the depthwise convolution forward pass // A Cuda kernel to compute the depthwise convolution forward pass
// in NCHW format. // in NCHW format.
template <typename T> template <typename T>
__global__ void KernelDepthwiseConv( __device__ __inline__ void KernelDepthwiseConv(
const int nthreads, const T* const input_data, const T* const filter_data, const T* const input_data, const T* const filter_data, const int batch_size,
const int batch_size, const int output_channels, const int output_height, const int output_channels, const int output_height, const int output_width,
const int output_width, const int input_channels, const int input_height, const int input_channels, const int input_height, const int input_width,
const int input_width, const int filter_multiplier, const int filter_height, const int filter_multiplier, const int filter_height,
const int filter_width, const int stride_height, const int stride_width, const int filter_width, const int stride_height, const int stride_width,
const int padding_height, const int padding_width, T* const output_data) { const int padding_height, const int padding_width, const int dilate_height,
int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; const int dilate_width, T* const output_data) {
for (int w_out = threadIdx.x; w_out < output_width; w_out += blockDim.x) {
if (index < nthreads) { for (int h_out = threadIdx.y; h_out < output_height; h_out += blockDim.y) {
const int batch = index / output_channels / output_height / output_width; const int batch = blockIdx.y;
const int c_out = (index / output_height / output_width) % output_channels; const int c_out = blockIdx.x;
const int h_out = (index / output_width) % output_height;
const int w_out = index % output_width; const int c_in = c_out / filter_multiplier;
const T* weight = filter_data + c_out * filter_height * filter_width;
const int c_in = c_out / filter_multiplier; T value = 0;
const T* weight = filter_data + c_out * filter_height * filter_width; const int h_in_start = -padding_height + h_out * stride_height;
T value = 0; const int w_in_start = -padding_width + w_out * stride_width;
const int h_in_start = -padding_height + h_out * stride_height; const int h_in_end = h_in_start + filter_height * dilate_height;
const int w_in_start = -padding_width + w_out * stride_width; const int w_in_end = w_in_start + filter_width * dilate_width;
const int h_in_end = h_in_start + filter_height;
const int w_in_end = w_in_start + filter_width; const int in_offset =
((batch * input_channels + c_in) * input_height) * input_width;
const int in_offset =
((batch * input_channels + c_in) * input_height) * input_width; const int h_end = h_in_end < input_height ? h_in_end : input_height;
const int w_end = w_in_end < input_width ? w_in_end : input_width;
const int h_end = h_in_end < input_height ? h_in_end : input_height; const int h_start = h_in_start > 0 ? h_in_start : 0;
const int w_end = w_in_end < input_width ? w_in_end : input_width; const int w_start = w_in_start > 0 ? w_in_start : 0;
const int h_start = h_in_start > 0 ? h_in_start : 0; int weight_offset = 0;
const int w_start = w_in_start > 0 ? w_in_start : 0;
for (int h_in = h_in_start; h_in < h_in_end; h_in += dilate_height) {
for (int h_in = h_start; h_in < h_end; h_in++) { for (int w_in = w_in_start; w_in < w_in_end; w_in += dilate_width) {
for (int w_in = w_start; w_in < w_end; w_in++) { if (h_in >= h_start && h_in < h_end && w_in >= w_start &&
const int offset = in_offset + h_in * input_width + w_in; w_in < w_end) {
value += const int offset = in_offset + h_in * input_width + w_in;
weight[(h_in - h_in_start) * filter_width + (w_in - w_in_start)] * value += weight[weight_offset] * input_data[offset];
input_data[offset]; }
weight_offset++;
}
} }
int index =
((batch * gridDim.x + c_out) * output_height + h_out) * output_width +
w_out;
output_data[index] = value;
} }
output_data[index] = value;
} }
} }
template <typename T, int c_filter_multiplier, int c_stride>
__global__ void KernelDepthwiseConvSp(
const T* const input_data, const T* const filter_data, const int batch_size,
const int output_channels, const int output_height, const int output_width,
const int input_channels, const int input_height, const int input_width,
const int filter_multiplier, const int filter_height,
const int filter_width, const int stride_height, const int stride_width,
const int padding_height, const int padding_width, const int dilate_height,
const int dilate_width, T* const output_data) {
if (c_filter_multiplier == 0)
KernelDepthwiseConv<T>(input_data, filter_data, batch_size, output_channels,
output_height, output_width, input_channels,
input_height, input_width, filter_multiplier,
filter_height, filter_width, stride_height,
stride_width, padding_height, padding_width,
dilate_height, dilate_width, output_data);
else
KernelDepthwiseConv<T>(input_data, filter_data, batch_size, output_channels,
output_height, output_width, input_channels,
input_height, input_width, c_filter_multiplier,
filter_height, filter_height, c_stride, c_stride,
padding_height, padding_width, dilate_height,
dilate_width, output_data);
}
// CUDA kernel to compute the depthwise convolution backprop w.r.t input. // CUDA kernel to compute the depthwise convolution backprop w.r.t input.
template <typename T> template <typename T>
__global__ void KernelDepthwiseConvInputGrad( __device__ __inline__ void KernelDepthwiseConvInputGrad(
const int nthreads, const T* const output_grad_data, const T* const output_grad_data, const T* const filter_data,
const T* const filter_data, const int batch_size, const int output_channels, const int batch_size, const int output_channels, const int output_height,
const int output_height, const int output_width, const int input_channels, const int output_width, const int input_channels, const int input_height,
const int input_height, const int input_width, const int filter_multiplier, const int input_width, const int filter_multiplier, const int filter_height,
const int filter_height, const int filter_width, const int stride_height, const int filter_width, const int stride_height, const int stride_width,
const int stride_width, const int padding_height, const int padding_width, const int padding_height, const int padding_width, const int dilate_height,
T* const input_grad_data) { const int dilate_width, T* const input_grad_data) {
int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; for (int w_in = threadIdx.x; w_in < input_width; w_in += blockDim.x) {
if (index < nthreads) { for (int h_in = threadIdx.y; h_in < input_height; h_in += blockDim.y) {
const int batch = index / input_channels / input_height / input_width; const int batch = blockIdx.y;
const int c_in = (index / input_height / input_width) % input_channels; const int c_in = blockIdx.x;
const int h_in = (index / input_width) % input_height;
const int w_in = index % input_width; const int c_out_start = c_in * filter_multiplier;
const int c_out_start = c_in * filter_multiplier; int h_out_start =
h_in - (filter_height - 1) * dilate_height + padding_height;
int h_out_start =
(h_in - filter_height + padding_height + stride_height) / stride_height; int h_out_end = h_in + padding_height;
h_out_start = 0 > h_out_start ? 0 : h_out_start;
int w_out_start =
int h_out_end = (h_in + padding_height) / stride_height; w_in - (filter_width - 1) * dilate_width + padding_width;
h_out_end = output_height - 1 < h_out_end ? output_height - 1 : h_out_end;
int w_out_end = w_in + padding_width;
int w_out_start =
(w_in - filter_width + padding_width + stride_width) / stride_width; T value = 0;
w_out_start = 0 > w_out_start ? 0 : w_out_start;
for (int c_out = c_out_start; c_out < c_out_start + filter_multiplier;
int w_out_end = (w_in + padding_width) / stride_width; c_out++) {
w_out_end = output_width - 1 < w_out_end ? output_width - 1 : w_out_end; int filter_offset = (c_out + 1) * filter_height * filter_width;
for (int h_out = h_out_start; h_out <= h_out_end;
T value = 0; h_out += dilate_height) {
for (int w_out = w_out_start; w_out <= w_out_end;
for (int c_out = c_out_start; c_out < c_out_start + filter_multiplier; w_out += dilate_width) {
c_out++) { filter_offset--;
for (int h_out = h_out_start; h_out <= h_out_end; ++h_out) { int s_h_out = h_out / stride_height;
const int filter_h = h_in + padding_height - h_out * stride_height; int s_w_out = w_out / stride_width;
for (int w_out = w_out_start; w_out <= w_out_end; ++w_out) { if (h_out % stride_height == 0 && w_out % stride_width == 0 &&
const int filter_w = w_in + padding_width - w_out * stride_width; s_h_out >= 0 && s_h_out < output_height && s_w_out >= 0 &&
const int filter_offset = c_out * filter_height * filter_width + s_w_out < output_width) {
filter_h * filter_width + filter_w; const int output_grad_offset =
const int output_grad_offset = ((batch * output_channels + c_out) * output_height +
((batch * output_channels + c_out) * output_height + h_out) * s_h_out) *
output_width + output_width +
w_out; s_w_out;
value += value += output_grad_data[output_grad_offset] *
output_grad_data[output_grad_offset] * filter_data[filter_offset]; filter_data[filter_offset];
}
}
} }
} }
int index =
((batch * gridDim.x + c_in) * input_height + h_in) * input_width +
w_in;
input_grad_data[index] = value;
} }
input_grad_data[index] += value;
} }
} }
template <typename T, int c_filter_multiplier, int c_stride>
__global__ void KernelDepthwiseConvInputGradSp(
const T* const output_grad_data, const T* const filter_data,
const int batch_size, const int output_channels, const int output_height,
const int output_width, const int input_channels, const int input_height,
const int input_width, const int filter_multiplier, const int filter_height,
const int filter_width, const int stride_height, const int stride_width,
const int padding_height, const int padding_width, const int dilate_height,
const int dilate_width, T* const input_grad_data) {
if (c_filter_multiplier == 0)
KernelDepthwiseConvInputGrad<T>(
output_grad_data, filter_data, batch_size, output_channels,
output_height, output_width, input_channels, input_height, input_width,
filter_multiplier, filter_height, filter_width, stride_height,
stride_width, padding_height, padding_width, dilate_height,
dilate_width, input_grad_data);
else
KernelDepthwiseConvInputGrad<T>(
output_grad_data, filter_data, batch_size, output_channels,
output_height, output_width, input_channels, input_height, input_width,
c_filter_multiplier, filter_height, filter_width, c_stride, c_stride,
padding_height, padding_width, dilate_height, dilate_width,
input_grad_data);
}
// Cuda kernel to compute the depthwise convolution backprop w.r.t. filter. // Cuda kernel to compute the depthwise convolution backprop w.r.t. filter.
template <typename T> template <typename T>
__global__ void KernelDepthwiseConvFilterGrad( __device__ __inline__ void KernelDepthwiseConvFilterGrad(
const int nthreads, const T* const output_grad_data, const T* output_grad_data, const T* input_data, const int num,
const T* const input_data, const int num, const int output_channels, const int output_channels, const int output_height, const int output_width,
const int output_height, const int output_width, const int input_channels, const int input_channels, const int input_height, const int input_width,
const int input_height, const int input_width, const int filter_multiplier, const int filter_multiplier, const int filter_height,
const int filter_height, const int filter_width, const int stride_height, const int filter_width, const int stride_height, const int stride_width,
const int stride_width, const int padding_height, const int padding_width, const int padding_height, const int padding_width, const int dilate_height,
T* const filter_grad_data) { const int dilate_width, T* filter_grad_data) {
int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; T s = 0;
if (index < nthreads) {
const int w_out = index % output_width; int gbid = ((blockIdx.z * gridDim.y) + blockIdx.y) * gridDim.x + blockIdx.x;
const int h_out = (index / output_width) % output_height; int lid = lane_id();
const int c_out = (index / output_width / output_height) % output_channels;
const int batch = (index / output_width / output_height / output_channels); for (int image_w = threadIdx.x; image_w < output_width;
const int c_in = c_out / filter_multiplier; image_w += blockDim.x) {
const int h_in_start = -padding_height + h_out * stride_height; for (int bid = 0; bid < num; bid++) {
const int w_in_start = -padding_width + w_out * stride_width; for (int image_h = threadIdx.y; image_h < output_height;
const int h_in_end = image_h += blockDim.y) {
-padding_height + h_out * stride_height + filter_height; int kernel_id = blockIdx.z;
const int w_in_end = -padding_width + w_out * stride_width + filter_width; int kernel_h = blockIdx.y * dilate_height - padding_height;
const int in_offset = int kernel_w = blockIdx.x * dilate_width - padding_width;
(batch * input_channels + c_in) * input_height * input_width;
int image_hk = image_h * stride_height + kernel_h;
T* addr_offset = filter_grad_data + c_out * filter_height * filter_width; int image_wk = image_w * stride_width + kernel_w;
const int h_end = h_in_end < input_height ? h_in_end : input_height; if (image_hk < 0 || image_hk >= input_height) continue;
const int w_end = w_in_end < input_width ? w_in_end : input_width; if (image_wk < 0 || image_wk >= input_width) continue;
const int h_start = h_in_start > 0 ? h_in_start : 0; #define gaid(N, C, H, W) \
const int w_start = w_in_start > 0 ? w_in_start : 0; ((((N)*gridDim.z + (C)) * output_height + (H)) * output_width + (W))
for (int h_in = h_start; h_in < h_end; h_in++) { s += output_grad_data[gaid(bid, kernel_id, image_h, image_w)] *
for (int w_in = w_start; w_in < w_end; w_in++) { input_data[((bid * (gridDim.z / filter_multiplier) +
const int offset = in_offset + h_in * input_width + w_in; kernel_id / filter_multiplier) *
const T diff_temp = output_grad_data[index] * input_data[offset]; input_height +
T* addr = addr_offset + (h_in - h_in_start) * filter_width + image_hk) *
(w_in - w_in_start); input_width +
paddle::platform::CudaAtomicAdd(addr, diff_temp); image_wk];
#undef gaid
} }
} }
} }
#if __CUDA_ARCH__ >= 530
s = warpReduceSum<T>(s);
if (lid == 0) paddle::platform::CudaAtomicAdd(&filter_grad_data[gbid], s);
#else
paddle::platform::CudaAtomicAdd(&filter_grad_data[gbid], s);
#endif
}
template <typename T, int c_filter_multiplier>
__global__ void KernelDepthwiseConvFilterGradSp(
const T* output_grad_data, const T* input_data, const int num,
const int output_channels, const int output_height, const int output_width,
const int input_channels, const int input_height, const int input_width,
const int filter_multiplier, const int filter_height,
const int filter_width, const int stride_height, const int stride_width,
const int padding_height, const int padding_width, const int dilate_height,
const int dilate_width, T* filter_grad_data) {
if (c_filter_multiplier == 0)
KernelDepthwiseConvFilterGrad<T>(
output_grad_data, input_data, num, output_channels, output_height,
output_width, input_channels, input_height, input_width,
filter_multiplier, filter_height, filter_width, stride_height,
stride_width, padding_height, padding_width, dilate_height,
dilate_width, filter_grad_data);
else
KernelDepthwiseConvFilterGrad<T>(
output_grad_data, input_data, num, output_channels, output_height,
output_width, input_channels, input_height, input_width,
c_filter_multiplier, filter_height, filter_width, stride_height,
stride_width, padding_height, padding_width, dilate_height,
dilate_width, filter_grad_data);
} }
/* /*
...@@ -177,7 +297,9 @@ class DepthwiseConvFunctor<platform::CUDADeviceContext, T> { ...@@ -177,7 +297,9 @@ class DepthwiseConvFunctor<platform::CUDADeviceContext, T> {
const framework::Tensor& input, const framework::Tensor& input,
const framework::Tensor& filter, const framework::Tensor& filter,
const std::vector<int>& strides, const std::vector<int>& strides,
const std::vector<int>& paddings, framework::Tensor* output) { const std::vector<int>& paddings,
const std::vector<int>& dilations,
framework::Tensor* output) {
const int batch_size = input.dims()[0]; const int batch_size = input.dims()[0];
const int input_channels = input.dims()[1]; const int input_channels = input.dims()[1];
const int input_height = input.dims()[2]; const int input_height = input.dims()[2];
...@@ -191,22 +313,37 @@ class DepthwiseConvFunctor<platform::CUDADeviceContext, T> { ...@@ -191,22 +313,37 @@ class DepthwiseConvFunctor<platform::CUDADeviceContext, T> {
const int stride_width = strides[1]; const int stride_width = strides[1];
const int padding_height = paddings[0]; const int padding_height = paddings[0];
const int padding_width = paddings[1]; const int padding_width = paddings[1];
const int dilate_height = dilations[0];
const int dilate_width = dilations[1];
const T* input_data = input.data<T>(); const T* input_data = input.data<T>();
const T* filter_data = filter.data<T>(); const T* filter_data = filter.data<T>();
T* output_data = output->mutable_data<T>(context.GetPlace()); T* output_data = output->mutable_data<T>(context.GetPlace());
int nthreads = batch_size * output_channels * output_height * output_width; int thread = 512;
int blocks = (nthreads + 1024 - 1) / 1024; int blocks = std::min(std::max(thread / output_width, 1), output_height);
dim3 threads(1024, 1); dim3 threads(std::min(output_width, thread), blocks, 1);
dim3 grid(blocks, 1); dim3 grid(output_channels, batch_size, 1);
int filter_multiplier = output_channels / input_channels;
KernelDepthwiseConv<T><<<grid, threads, 0, context.stream()>>>( #define check_case(c_filter_multiplier, c_stride) \
nthreads, input_data, filter_data, batch_size, output_channels, if (c_filter_multiplier == 0 || \
output_height, output_width, input_channels, input_height, input_width, filter_multiplier == c_filter_multiplier && \
output_channels / input_channels, ksize_height, ksize_width, stride_height == stride_width && stride_height == c_stride) { \
stride_height, stride_width, padding_height, padding_width, KernelDepthwiseConvSp<T, c_filter_multiplier, \
output_data); c_stride><<<grid, threads, 0, context.stream()>>>( \
input_data, filter_data, batch_size, output_channels, output_height, \
output_width, input_channels, input_height, input_width, \
filter_multiplier, ksize_height, ksize_width, stride_height, \
stride_width, padding_height, padding_width, dilate_height, \
dilate_width, output_data); \
return; \
}
check_case(1, 1);
check_case(1, 2);
// NOTE(liangdun): 0,0 for other case
// add other case if needed, e.g. check_case(2^n,1)
check_case(0, 0);
#undef check_case
} }
}; };
...@@ -219,6 +356,7 @@ class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext, T> { ...@@ -219,6 +356,7 @@ class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext, T> {
const framework::Tensor& output_grad, const framework::Tensor& output_grad,
const std::vector<int>& strides, const std::vector<int>& strides,
const std::vector<int>& paddings, const std::vector<int>& paddings,
const std::vector<int>& dilations,
framework::Tensor* input_grad) { framework::Tensor* input_grad) {
const int batch_size = input.dims()[0]; const int batch_size = input.dims()[0];
const int input_channels = input.dims()[1]; const int input_channels = input.dims()[1];
...@@ -233,22 +371,39 @@ class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext, T> { ...@@ -233,22 +371,39 @@ class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext, T> {
const int stride_width = strides[1]; const int stride_width = strides[1];
const int padding_height = paddings[0]; const int padding_height = paddings[0];
const int padding_width = paddings[1]; const int padding_width = paddings[1];
const int dilate_height = dilations[0];
const int dilate_width = dilations[1];
const T* filter_data = filter.data<T>(); const T* filter_data = filter.data<T>();
const T* output_grad_data = output_grad.data<T>(); const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace()); T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
int nthreads = batch_size * input_channels * input_height * input_width; int thread = 512;
int blocks = (nthreads + 1024 - 1) / 1024; int blocks = std::min(std::max(thread / input_width, 1), input_height);
dim3 threads(1024, 1); dim3 threads(std::min(input_width, thread), blocks, 1);
dim3 grid(blocks, 1); dim3 grid(input_channels, batch_size, 1);
int filter_multiplier = output_channels / input_channels;
KernelDepthwiseConvInputGrad<T><<<grid, threads, 0, context.stream()>>>(
nthreads, output_grad_data, filter_data, batch_size, output_channels, #define check_case(c_filter_multiplier, c_stride) \
output_height, output_width, input_channels, input_height, input_width, if (c_filter_multiplier == 0 || \
output_channels / input_channels, ksize_height, ksize_width, filter_multiplier == c_filter_multiplier && \
stride_height, stride_width, padding_height, padding_width, stride_height == stride_width && stride_height == c_stride) { \
input_grad_data); KernelDepthwiseConvInputGradSp< \
T, c_filter_multiplier, \
c_stride><<<grid, threads, 0, context.stream()>>>( \
output_grad_data, filter_data, batch_size, output_channels, \
output_height, output_width, input_channels, input_height, \
input_width, filter_multiplier, ksize_height, ksize_width, \
stride_height, stride_width, padding_height, padding_width, \
dilate_height, dilate_width, input_grad_data); \
return; \
}
check_case(1, 1);
check_case(1, 2);
// NOTE(liangdun): 0,0 for other case
// add other case if needed, e.g. check_case(2^n,1)
check_case(0, 0);
#undef check_case
} }
}; };
...@@ -260,6 +415,7 @@ class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext, T> { ...@@ -260,6 +415,7 @@ class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext, T> {
const framework::Tensor& output_grad, const framework::Tensor& output_grad,
const std::vector<int>& strides, const std::vector<int>& strides,
const std::vector<int>& paddings, const std::vector<int>& paddings,
const std::vector<int>& dilations,
framework::Tensor* filter_grad) { framework::Tensor* filter_grad) {
const int batch_size = input.dims()[0]; const int batch_size = input.dims()[0];
const int input_channels = input.dims()[1]; const int input_channels = input.dims()[1];
...@@ -274,23 +430,34 @@ class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext, T> { ...@@ -274,23 +430,34 @@ class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext, T> {
const int stride_width = strides[1]; const int stride_width = strides[1];
const int padding_height = paddings[0]; const int padding_height = paddings[0];
const int padding_width = paddings[1]; const int padding_width = paddings[1];
const int dilate_height = dilations[0];
const int dilate_width = dilations[1];
const T* input_data = input.data<T>(); const T* input_data = input.data<T>();
const T* output_grad_data = output_grad.data<T>(); const T* output_grad_data = output_grad.data<T>();
T* filter_grad_data = filter_grad->mutable_data<T>(context.GetPlace()); T* filter_grad_data = filter_grad->mutable_data<T>(context.GetPlace());
int nthreads = batch_size * output_channels * output_height * output_width; int block_size = 512;
int crop_output_height =
int blocks = (nthreads + 1024 - 1) / 1024; std::min(std::max(block_size / output_width, 1), output_height);
dim3 threads(1024, 1); dim3 grid(ksize_width, ksize_height, output_channels);
dim3 grid(blocks, 1); dim3 threads(std::min(output_width, block_size), crop_output_height, 1);
int filter_multiplier = output_channels / input_channels;
KernelDepthwiseConvFilterGrad<T><<<grid, threads, 0, context.stream()>>>(
nthreads, output_grad_data, input_data, batch_size, output_channels, #define check_case(c_filter_multiplier) \
output_height, output_width, input_channels, input_height, input_width, if (c_filter_multiplier == 0 || c_filter_multiplier == filter_multiplier) { \
output_channels / input_channels, ksize_height, ksize_width, KernelDepthwiseConvFilterGradSp< \
stride_height, stride_width, padding_height, padding_width, T, c_filter_multiplier><<<grid, threads, 0, context.stream()>>>( \
filter_grad_data); output_grad_data, input_data, batch_size, output_channels, \
output_height, output_width, input_channels, input_height, \
input_width, filter_multiplier, ksize_height, ksize_width, \
stride_height, stride_width, padding_height, padding_width, \
dilate_height, dilate_width, filter_grad_data); \
return; \
}
check_case(1);
check_case(0);
#undef check_case
} }
}; };
......
...@@ -32,7 +32,8 @@ class DepthwiseConvFunctor { ...@@ -32,7 +32,8 @@ class DepthwiseConvFunctor {
void operator()(const DeviceContext& context, const framework::Tensor& input, void operator()(const DeviceContext& context, const framework::Tensor& input,
const framework::Tensor& filter, const framework::Tensor& filter,
const std::vector<int>& strides, const std::vector<int>& strides,
const std::vector<int>& paddings, framework::Tensor* output); const std::vector<int>& paddings,
const std::vector<int>& dilations, framework::Tensor* output);
}; };
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
...@@ -43,6 +44,7 @@ class DepthwiseConvInputGradFunctor { ...@@ -43,6 +44,7 @@ class DepthwiseConvInputGradFunctor {
const framework::Tensor& output_grad, const framework::Tensor& output_grad,
const std::vector<int>& strides, const std::vector<int>& strides,
const std::vector<int>& paddings, const std::vector<int>& paddings,
const std::vector<int>& dilations,
framework::Tensor* input_grad); framework::Tensor* input_grad);
}; };
...@@ -53,6 +55,7 @@ class DepthwiseConvFilterGradFunctor { ...@@ -53,6 +55,7 @@ class DepthwiseConvFilterGradFunctor {
const framework::Tensor& output_grad, const framework::Tensor& output_grad,
const std::vector<int>& strides, const std::vector<int>& strides,
const std::vector<int>& paddings, const std::vector<int>& paddings,
const std::vector<int>& dilations,
framework::Tensor* filter_grad); framework::Tensor* filter_grad);
}; };
......
...@@ -13,6 +13,15 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,15 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#ifdef PADDLE_WITH_MKLML
#include "paddle/fluid/platform/dynload/mklml.h"
#endif
#ifdef PADDLE_USE_OPENBLAS
#include <cblas.h>
#endif
#include <vector> #include <vector>
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/operators/math/math_function_impl.h" #include "paddle/fluid/operators/math/math_function_impl.h"
......
...@@ -13,18 +13,6 @@ See the License for the specific language governing permissions and ...@@ -13,18 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#ifdef PADDLE_WITH_MKLML
#include "paddle/fluid/platform/dynload/mklml.h"
#endif
#ifdef PADDLE_USE_OPENBLAS
#include <cblas.h>
// remove typedef in openblas
#undef FLOAT
#undef INT
#undef SIZE
#endif
#include <cmath> #include <cmath>
#include <vector> #include <vector>
......
...@@ -12,17 +12,64 @@ ...@@ -12,17 +12,64 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include <vector>
#include "paddle/fluid/operators/cub_reduce.h"
#include "paddle/fluid/operators/reduce_mean_op.h" #include "paddle/fluid/operators/reduce_mean_op.h"
REGISTER_OP_CUDA_KERNEL(reduce_mean, namespace paddle {
ops::ReduceKernel<paddle::platform::CUDADeviceContext, namespace operators {
float, ops::MeanFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext, template <typename T>
double, ops::MeanFunctor>, struct DivideFunctor {
ops::ReduceKernel<paddle::platform::CUDADeviceContext, HOSTDEVICE explicit inline DivideFunctor(int n) : n_inv((T)(1.0 / n)) {}
int, ops::MeanFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext, HOSTDEVICE inline T operator()(const T& x) const { return x * n_inv; }
int64_t, ops::MeanFunctor>);
private:
T n_inv;
};
template <typename T>
class ReduceMeanKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
bool reduce_all = context.Attr<bool>("reduce_all");
auto* input = context.Input<Tensor>("X");
auto* output = context.Output<Tensor>("Out");
auto dims = context.Attr<std::vector<int>>("dim");
bool keep_dim = context.Attr<bool>("keep_dim");
std::vector<int> reduce_dims;
if (reduce_all) {
reduce_dims.resize(input->dims().size());
for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i;
} else {
for (auto e : dims) {
reduce_dims.push_back(e >= 0 ? e : e + input->dims().size());
}
}
int reduce_num = 1;
for (int i = 0; i < reduce_dims.size(); ++i) {
reduce_num *= input->dims()[reduce_dims[i]];
}
auto stream = context.cuda_device_context().stream();
TensorReduce<T, T, cub::Sum, DivideFunctor<T>>(
*input, output, reduce_dims, static_cast<T>(0), cub::Sum(),
DivideFunctor<T>(reduce_num), stream);
}
};
} // namespace operators
} // namespace paddle
REGISTER_OP_CUDA_KERNEL(reduce_mean, ops::ReduceMeanKernel<float>,
ops::ReduceMeanKernel<double>,
ops::ReduceMeanKernel<int>,
ops::ReduceMeanKernel<int64_t>);
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
reduce_mean_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, reduce_mean_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
float, ops::MeanGradFunctor>, float, ops::MeanGradFunctor>,
......
...@@ -12,17 +12,59 @@ ...@@ -12,17 +12,59 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/operators/cub_reduce.h"
#include "paddle/fluid/operators/reduce_sum_op.h" #include "paddle/fluid/operators/reduce_sum_op.h"
REGISTER_OP_CUDA_KERNEL(reduce_sum, namespace paddle {
ops::ReduceKernel<paddle::platform::CUDADeviceContext, namespace operators {
float, ops::SumFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext, template <typename T>
double, ops::SumFunctor>, struct IdentityFunctor {
ops::ReduceKernel<paddle::platform::CUDADeviceContext, HOSTDEVICE explicit inline IdentityFunctor() {}
int, ops::SumFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext, HOSTDEVICE inline T operator()(const T& x) const { return x; }
int64_t, ops::SumFunctor>); };
template <typename T>
class ReduceSumKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
bool reduce_all = context.Attr<bool>("reduce_all");
auto* input = context.Input<Tensor>("X");
auto* output = context.Output<Tensor>("Out");
auto dims = context.Attr<std::vector<int>>("dim");
bool keep_dim = context.Attr<bool>("keep_dim");
std::vector<int> reduce_dims;
if (reduce_all) {
reduce_dims.resize(input->dims().size());
for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i;
} else {
for (auto e : dims) {
reduce_dims.push_back(e >= 0 ? e : e + input->dims().size());
}
}
int reduce_num = 1;
for (int i = 0; i < reduce_dims.size(); ++i) {
reduce_num *= input->dims()[reduce_dims[i]];
}
auto stream = context.cuda_device_context().stream();
TensorReduce<T, T, cub::Sum, IdentityFunctor<T>>(
*input, output, reduce_dims, static_cast<T>(0), cub::Sum(),
IdentityFunctor<T>(), stream);
}
};
} // namespace operators
} // namespace paddle
REGISTER_OP_CUDA_KERNEL(reduce_sum, ops::ReduceSumKernel<float>,
ops::ReduceSumKernel<double>, ops::ReduceSumKernel<int>,
ops::ReduceSumKernel<int64_t>);
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
reduce_sum_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, reduce_sum_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
float, ops::SumGradFunctor>, float, ops::SumGradFunctor>,
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include <thread> // NOLINT
#include <vector>
#include "paddle/fluid/framework/channel.h"
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/concurrency/channel_util.h"
#include <boost/tokenizer.hpp>
namespace paddle {
namespace operators {
static constexpr char kX[] = "X";
static constexpr char kCaseToExecute[] = "case_to_execute";
static constexpr char kOutputs[] = "Out";
static constexpr char kCases[] = "cases";
static constexpr char kCasesBlock[] = "sub_block";
class SelectOp : public framework::OperatorBase {
public:
SelectOp(const std::string &type, const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: framework::OperatorBase(type, inputs, outputs, attrs) {}
private:
enum class SelectOpCaseType {
DEFAULT = 0,
SEND = 1,
RECEIVE = 2,
};
struct SelectOpCase {
int caseIndex;
SelectOpCaseType caseType;
std::string channelName;
std::string varName;
SelectOpCase() {}
SelectOpCase(int caseIndex, SelectOpCaseType caseType,
std::string channelName, std::string varName)
: caseIndex(caseIndex),
caseType(caseType),
channelName(channelName),
varName(varName) {}
};
void RunImpl(const framework::Scope &scope,
const platform::Place &dev_place) const override {
std::vector<std::string> casesConfigs =
Attr<std::vector<std::string>>(kCases);
framework::BlockDesc *casesBlock =
Attr<framework::BlockDesc *>(kCasesBlock);
framework::Scope &casesBlockScope = scope.NewScope();
std::string caseToExecuteVarName = Input(kCaseToExecute);
framework::Variable *caseToExecuteVar =
casesBlockScope.FindVar(caseToExecuteVarName);
// Construct cases from "conditional_block_op"(s) in the casesBlock
std::vector<std::shared_ptr<SelectOpCase>> cases =
ParseAndShuffleCases(&casesConfigs);
// Get all unique channels involved in select
std::set<framework::ChannelHolder *> channelsSet;
for (auto c : cases) {
if (!c->channelName.empty()) {
auto channelVar = scope.FindVar(c->channelName);
framework::ChannelHolder *ch =
channelVar->GetMutable<framework::ChannelHolder>();
if (channelsSet.find(ch) == channelsSet.end()) {
channelsSet.insert(ch);
}
}
}
// Order all channels by their pointer address
std::vector<framework::ChannelHolder *> channels(channelsSet.begin(),
channelsSet.end());
std::sort(channels.begin(), channels.end());
// Poll all cases
int32_t caseToExecute = pollCases(&scope, &cases, channels);
// At this point, the case to execute has already been determined,
// so we can proceed with executing the cases block
framework::LoDTensor *caseToExecuteTensor =
caseToExecuteVar->GetMutable<framework::LoDTensor>();
caseToExecuteTensor->data<int32_t>()[0] = caseToExecute;
// Execute the cases block, only one case will be executed since we set the
// case_to_execute value to the index of the case we want to execute
framework::Executor executor(dev_place);
framework::ProgramDesc *program = casesBlock->Program();
executor.Run(*program, &casesBlockScope, casesBlock->ID(),
false /*create_local_scope*/);
}
/**
* Goes through all operators in the casesConfigs and processes
* "conditional_block" operators. These operators are mapped to our
* SelectOpCase objects. We randomize the case orders, and set the
* default case (if any exists) as the last case)
* @param casesBlock
* @return
*/
std::vector<std::shared_ptr<SelectOpCase>> ParseAndShuffleCases(
std::vector<std::string> *casesConfigs) const {
std::vector<std::shared_ptr<SelectOpCase>> cases;
std::shared_ptr<SelectOpCase> defaultCase;
if (casesConfigs != nullptr) {
boost::char_delimiters_separator<char> sep(false, ",", "");
for (std::vector<std::string>::iterator itr = casesConfigs->begin();
itr < casesConfigs->end(); ++itr) {
std::string caseConfig = *itr;
boost::tokenizer<> tokens(caseConfig, sep);
boost::tokenizer<>::iterator tok_iter = tokens.begin();
PADDLE_ENFORCE(tok_iter != tokens.end(), "Cannot get case index");
std::string caseIndexString = *tok_iter;
int caseIndex = std::stoi(caseIndexString);
++tok_iter;
PADDLE_ENFORCE(tok_iter != tokens.end(), "Cannot get case type");
std::string caseTypeString = *tok_iter;
SelectOpCaseType caseType = (SelectOpCaseType)std::stoi(caseTypeString);
std::string caseChannel;
std::string caseChannelVar;
++tok_iter;
if (caseType != SelectOpCaseType::DEFAULT) {
PADDLE_ENFORCE(tok_iter != tokens.end(), "Cannot get case channel");
caseChannel = *tok_iter;
++tok_iter;
PADDLE_ENFORCE(tok_iter != tokens.end(),
"Cannot get case channel variable");
caseChannelVar = *tok_iter;
}
auto c = std::make_shared<SelectOpCase>(caseIndex, caseType,
caseChannel, caseChannelVar);
if (caseType == SelectOpCaseType::DEFAULT) {
PADDLE_ENFORCE(defaultCase == nullptr,
"Select can only contain one default case.");
defaultCase = c;
} else {
cases.push_back(c);
}
}
}
// Randomly sort cases, with default case being last
std::random_shuffle(cases.begin(), cases.end());
if (defaultCase != nullptr) {
cases.push_back(defaultCase);
}
return cases;
}
/**
* This method will recursively poll the cases and determines if any case
* condition is true.
* If none of the cases conditions are true (and there is no default case),
* then block
* the thread. The thread may be woken up by a channel operation, at which
* point we
* execute the case.
* @param scope
* @param cases
* @param channels
* @return
*/
int32_t pollCases(const framework::Scope *scope,
std::vector<std::shared_ptr<SelectOpCase>> *cases,
std::vector<framework::ChannelHolder *> channels) const {
// Lock all involved channels
lockChannels(channels);
std::atomic<int> caseToExecute(-1);
std::vector<std::shared_ptr<SelectOpCase>>::iterator it = cases->begin();
while (it != cases->end()) {
std::shared_ptr<SelectOpCase> c = *it;
auto chVar = scope->FindVar(c->channelName);
framework::ChannelHolder *ch =
chVar->GetMutable<framework::ChannelHolder>();
switch (c->caseType) {
case SelectOpCaseType::SEND:
PADDLE_ENFORCE(!ch->IsClosed(), "Cannot send to a closed channel");
if (ch->CanSend()) {
// We can send to channel directly, send the data to channel
// and execute case
auto chVar = scope->FindVar(c->varName);
concurrency::ChannelSend(ch, chVar);
caseToExecute = c->caseIndex;
}
break;
case SelectOpCaseType::RECEIVE:
if (ch->CanReceive()) {
// We can receive from channel directly, send the data to channel
// and execute case
auto chVar = scope->FindVar(c->varName);
concurrency::ChannelReceive(ch, chVar);
caseToExecute = c->caseIndex;
}
break;
case SelectOpCaseType::DEFAULT:
caseToExecute = c->caseIndex;
break;
}
if (caseToExecute != -1) {
// We found a case to execute, stop looking at other case statements
break;
}
++it;
}
if (caseToExecute == -1) {
// None of the cases are eligible to execute, enqueue current thread
// into all the sending/receiving queue of each involved channel
std::atomic<bool> completed(false);
std::recursive_mutex mutex;
std::unique_lock<std::recursive_mutex> lock{mutex};
// std::condition_variable_any selectCond;
auto selectCond = std::make_shared<std::condition_variable_any>();
std::recursive_mutex callbackMutex;
pushThreadOnChannelQueues(scope, cases, selectCond, &caseToExecute,
&completed, &callbackMutex);
// TODO(thuan): Atomically unlock all channels and sleep current thread
unlockChannels(channels);
selectCond->wait(lock, [&completed]() { return completed.load(); });
// Select has been woken up by case operation
lockChannels(channels);
removeThreadOnChannelQueues(scope, cases);
if (caseToExecute == -1) {
// Recursively poll cases, since we were woken up by a channel close
// TODO(thuan): Need to test if this is a valid case
unlockChannels(channels);
return pollCases(scope, cases, channels);
}
}
// At this point, caseToExecute != -1, and we can proceed with executing
// the case block
unlockChannels(channels);
return caseToExecute;
}
void lockChannels(std::vector<framework::ChannelHolder *> chs) const {
std::vector<framework::ChannelHolder *>::iterator it = chs.begin();
while (it != chs.end()) {
framework::ChannelHolder *ch = *it;
ch->Lock();
++it;
}
}
void unlockChannels(std::vector<framework::ChannelHolder *> chs) const {
std::vector<framework::ChannelHolder *>::reverse_iterator it = chs.rbegin();
while (it != chs.rend()) {
framework::ChannelHolder *ch = *it;
ch->Unlock();
++it;
}
}
void pushThreadOnChannelQueues(
const framework::Scope *scope,
std::vector<std::shared_ptr<SelectOpCase>> *cases,
std::shared_ptr<std::condition_variable_any> rCond,
std::atomic<int> *caseToExecute, std::atomic<bool> *completed,
std::recursive_mutex *callbackMutex) const {
std::vector<std::shared_ptr<SelectOpCase>>::iterator it = cases->begin();
while (it != cases->end()) {
std::shared_ptr<SelectOpCase> c = *it;
auto chVar = scope->FindVar(c->channelName);
framework::ChannelHolder *ch =
chVar->GetMutable<framework::ChannelHolder>();
std::function<bool(framework::ChannelAction channelAction)> cb =
[&caseToExecute, &completed, &callbackMutex,
c](framework::ChannelAction channelAction) {
std::lock_guard<std::recursive_mutex> lock{*callbackMutex};
bool canProcess = false;
if (!(*completed)) {
// If the channel wasn't closed, we set the caseToExecute index
// as this current case
if (channelAction != framework::ChannelAction::CLOSE) {
*caseToExecute = c->caseIndex;
}
// This will allow our conditional variable to break out of wait
*completed = true;
canProcess = true;
}
return canProcess;
};
switch (c->caseType) {
case SelectOpCaseType::SEND: {
auto chOutputVar = scope->FindVar(c->varName);
concurrency::ChannelAddToSendQ(ch, this, chOutputVar, rCond, cb);
break;
}
case SelectOpCaseType::RECEIVE: {
auto chOutputVar = scope->FindVar(c->varName);
concurrency::ChannelAddToReceiveQ(ch, this, chOutputVar, rCond, cb);
break;
}
default:
break;
}
++it;
}
}
void removeThreadOnChannelQueues(
const framework::Scope *scope,
std::vector<std::shared_ptr<SelectOpCase>> *cases) const {
std::vector<std::shared_ptr<SelectOpCase>>::iterator it = cases->begin();
while (it != cases->end()) {
std::shared_ptr<SelectOpCase> c = *it;
auto chVar = scope->FindVar(c->channelName);
framework::ChannelHolder *ch =
chVar->GetMutable<framework::ChannelHolder>();
switch (c->caseType) {
case SelectOpCaseType::SEND: {
ch->RemoveFromSendQ(this);
break;
}
case SelectOpCaseType::RECEIVE: {
ch->RemoveFromReceiveQ(this);
break;
}
default:
break;
}
++it;
}
}
};
class SelectOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput(kX,
"A set of variables, which are required by operators inside the "
"cases of Select Op")
.AsDuplicable();
AddInput(kCaseToExecute,
"(Int) The variable the sets the index of the case to execute, "
"after evaluating the channels being sent to and received from")
.AsDuplicable();
AddOutput(kOutputs,
"A set of variables, which will be assigned with values "
"generated by the operators inside the cases of Select Op.")
.AsDuplicable();
AddAttr<std::vector<std::string>>(kCases,
"(String vector) Serialized list of"
"all cases in the select op. Each"
"case is serialized as: "
"'<index>,<type>,<channel>,<value>'"
"where type is 0 for default, 1 for"
"send, and 2 for receive"
"No channel and values are needed for"
"default cases.");
AddAttr<framework::BlockDesc *>(kCasesBlock,
"The cases block inside select_op");
AddComment(R"DOC(
)DOC");
}
};
// TODO(thuan): Implement Gradient Operator for SELECT_OP
} // namespace operators
} // namespace paddle
REGISTER_OPERATOR(select, paddle::operators::SelectOp,
paddle::framework::EmptyGradOpMaker,
paddle::operators::SelectOpMaker);
...@@ -24,9 +24,9 @@ class SequenceEraseOp : public framework::OperatorWithKernel { ...@@ -24,9 +24,9 @@ class SequenceEraseOp : public framework::OperatorWithKernel {
void InferShape(framework::InferShapeContext* ctx) const override { void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of SequenceEraseOp should not be null."); "Input(X) of SequenceErase operator should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"), PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of SequenceEraseOp should not be null."); "Output(Out) of SequenceErase operator should not be null.");
auto x_dims = ctx->GetInputDim("X"); auto x_dims = ctx->GetInputDim("X");
PADDLE_ENFORCE(x_dims.size() == 2 && x_dims[1] == 1, PADDLE_ENFORCE(x_dims.size() == 2 && x_dims[1] == 1,
"Input(X) of SequenceEraseOp should be a 2-D LoDTensor " "Input(X) of SequenceEraseOp should be a 2-D LoDTensor "
......
...@@ -34,7 +34,7 @@ namespace operators { ...@@ -34,7 +34,7 @@ namespace operators {
using FluidDT = framework::proto::VarType_Type; using FluidDT = framework::proto::VarType_Type;
using TRT_DT = nvinfer1::DataType; using TRT_DT = nvinfer1::DataType;
namespace { // NOLINT namespace {
TRT_DT FluidDataType2TRT(FluidDT type) { TRT_DT FluidDataType2TRT(FluidDT type) {
switch (type) { switch (type) {
......
...@@ -30,8 +30,6 @@ class TopkOp : public framework::OperatorWithKernel { ...@@ -30,8 +30,6 @@ class TopkOp : public framework::OperatorWithKernel {
"Output(Indices) of TopkOp should not be null."); "Output(Indices) of TopkOp should not be null.");
auto input_dims = ctx->GetInputDim("X"); auto input_dims = ctx->GetInputDim("X");
PADDLE_ENFORCE_EQ(input_dims.size(), 2,
"Rank of TopK op's input must be 2.");
const int k = static_cast<int>(ctx->Attrs().Get<int>("k")); const int k = static_cast<int>(ctx->Attrs().Get<int>("k"));
PADDLE_ENFORCE_GE(k, 1, "k must >= 1"); PADDLE_ENFORCE_GE(k, 1, "k must >= 1");
......
...@@ -55,7 +55,7 @@ extern void *cublas_dso_handle; ...@@ -55,7 +55,7 @@ extern void *cublas_dso_handle;
struct DynLoad__##__name { \ struct DynLoad__##__name { \
template <typename... Args> \ template <typename... Args> \
inline cublasStatus_t operator()(Args... args) { \ inline cublasStatus_t operator()(Args... args) { \
return __name(args...); \ return ::__name(args...); \
} \ } \
}; \ }; \
extern DynLoad__##__name __name extern DynLoad__##__name __name
......
...@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#define GLOG_NO_ABBREVIATED_SEVERITIES
#define GOOGLE_GLOG_DLL_DECL
#include <glog/logging.h>
#include <cudnn.h> #include <cudnn.h>
#include <mutex> // NOLINT #include <mutex> // NOLINT
...@@ -47,13 +50,13 @@ extern void EnforceCUDNNLoaded(const char* fn_name); ...@@ -47,13 +50,13 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
#else #else
#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \ #define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \
struct DynLoad__##__name { \ struct DynLoad__##__name { \
template <typename... Args> \ template <typename... Args> \
auto operator()(Args... args) -> decltype(__name(args...)) { \ inline cudnnStatus_t operator()(Args... args) { \
return __name(args...); \ return ::__name(args...); \
} \ } \
}; \ }; \
extern DynLoad__##__name __name extern DynLoad__##__name __name
#endif #endif
......
...@@ -44,7 +44,7 @@ extern void *curand_dso_handle; ...@@ -44,7 +44,7 @@ extern void *curand_dso_handle;
struct DynLoad__##__name { \ struct DynLoad__##__name { \
template <typename... Args> \ template <typename... Args> \
curandStatus_t operator()(Args... args) { \ curandStatus_t operator()(Args... args) { \
return __name(args...); \ return ::__name(args...); \
} \ } \
}; \ }; \
extern DynLoad__##__name __name extern DynLoad__##__name __name
......
...@@ -107,7 +107,11 @@ static inline void* GetDsoHandleFromDefaultPath(const std::string& dso_path, ...@@ -107,7 +107,11 @@ static inline void* GetDsoHandleFromDefaultPath(const std::string& dso_path,
static inline void* GetDsoHandleFromSearchPath(const std::string& search_root, static inline void* GetDsoHandleFromSearchPath(const std::string& search_root,
const std::string& dso_name, const std::string& dso_name,
bool throw_on_error = true) { bool throw_on_error = true) {
#if !defined(_WIN32)
int dynload_flags = RTLD_LAZY | RTLD_LOCAL; int dynload_flags = RTLD_LAZY | RTLD_LOCAL;
#else
int dynload_flags = 0;
#endif // !_WIN32
void* dso_handle = nullptr; void* dso_handle = nullptr;
std::string dlPath = dso_name; std::string dlPath = dso_name;
...@@ -117,10 +121,15 @@ static inline void* GetDsoHandleFromSearchPath(const std::string& search_root, ...@@ -117,10 +121,15 @@ static inline void* GetDsoHandleFromSearchPath(const std::string& search_root,
// search xxx.so from custom path // search xxx.so from custom path
dlPath = join(search_root, dso_name); dlPath = join(search_root, dso_name);
dso_handle = dlopen(dlPath.c_str(), dynload_flags); dso_handle = dlopen(dlPath.c_str(), dynload_flags);
#if !defined(_WIN32)
auto errorno = dlerror();
#else
auto errorno = GetLastError();
#endif // !_WIN32
// if not found, search from default path // if not found, search from default path
if (nullptr == dso_handle) { if (nullptr == dso_handle) {
LOG(WARNING) << "Failed to find dynamic library: " << dlPath << " (" LOG(WARNING) << "Failed to find dynamic library: " << dlPath << " ("
<< dlerror() << ")"; << errorno << ")";
if (dlPath.find("nccl") != std::string::npos) { if (dlPath.find("nccl") != std::string::npos) {
std::cout std::cout
<< "You may need to install 'nccl2' from NVIDIA official website: " << "You may need to install 'nccl2' from NVIDIA official website: "
...@@ -139,10 +148,15 @@ static inline void* GetDsoHandleFromSearchPath(const std::string& search_root, ...@@ -139,10 +148,15 @@ static inline void* GetDsoHandleFromSearchPath(const std::string& search_root,
"export LD_LIBRARY_PATH=... \n Note: After Mac OS 10.11, " "export LD_LIBRARY_PATH=... \n Note: After Mac OS 10.11, "
"using the DYLD_LIBRARY_PATH is impossible unless System " "using the DYLD_LIBRARY_PATH is impossible unless System "
"Integrity Protection (SIP) is disabled."; "Integrity Protection (SIP) is disabled.";
#if !defined(_WIN32)
auto errorno = dlerror();
#else
auto errorno = GetLastError();
#endif // !_WIN32
if (throw_on_error) { if (throw_on_error) {
PADDLE_ENFORCE(nullptr != dso_handle, error_msg, dlPath, dlerror()); PADDLE_ENFORCE(nullptr != dso_handle, error_msg, dlPath, errorno);
} else if (nullptr == dso_handle) { } else if (nullptr == dso_handle) {
LOG(WARNING) << string::Sprintf(error_msg, dlPath, dlerror()); LOG(WARNING) << string::Sprintf(error_msg, dlPath, errorno);
} }
return dso_handle; return dso_handle;
......
set(PYBIND_DEPS pybind python proto_desc memory executor prune feed_fetch_method) set(PYBIND_DEPS pybind python proto_desc memory executor prune feed_fetch_method pass_builder)
set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc) set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc)
if(NOT WIN32) if(NOT WIN32)
list(APPEND PYBIND_DEPS parallel_executor profiler) list(APPEND PYBIND_DEPS parallel_executor profiler)
......
...@@ -48,9 +48,6 @@ void BindConstValue(pybind11::module* m) { ...@@ -48,9 +48,6 @@ void BindConstValue(pybind11::module* m) {
op_proto_and_checker_maker.def( op_proto_and_checker_maker.def(
"kOpNameScopeAttrName", "kOpNameScopeAttrName",
framework::OpProtoAndCheckerMaker::OpNamescopeAttrName); framework::OpProtoAndCheckerMaker::OpNamescopeAttrName);
op_proto_and_checker_maker.def(
"kOpCreationCallstackAttrName",
framework::OpProtoAndCheckerMaker::OpCreationCallstackAttrName);
} }
} // namespace pybind } // namespace pybind
......
...@@ -214,7 +214,6 @@ void BindVarDsec(pybind11::module *m) { ...@@ -214,7 +214,6 @@ void BindVarDsec(pybind11::module *m) {
.def("set_shapes", &pd::VarDesc::SetShapes) .def("set_shapes", &pd::VarDesc::SetShapes)
.def("set_dtype", &pd::VarDesc::SetDataType) .def("set_dtype", &pd::VarDesc::SetDataType)
.def("set_dtypes", &pd::VarDesc::SetDataTypes) .def("set_dtypes", &pd::VarDesc::SetDataTypes)
.def("set_capacity", &pd::VarDesc::SetCapacity)
.def("shape", &pd::VarDesc::GetShape, .def("shape", &pd::VarDesc::GetShape,
pybind11::return_value_policy::reference) pybind11::return_value_policy::reference)
.def("shapes", &pd::VarDesc::GetShapes, .def("shapes", &pd::VarDesc::GetShapes,
...@@ -251,7 +250,6 @@ void BindVarDsec(pybind11::module *m) { ...@@ -251,7 +250,6 @@ void BindVarDsec(pybind11::module *m) {
.value("STEP_SCOPES", pd::proto::VarType::STEP_SCOPES) .value("STEP_SCOPES", pd::proto::VarType::STEP_SCOPES)
.value("LOD_RANK_TABLE", pd::proto::VarType::LOD_RANK_TABLE) .value("LOD_RANK_TABLE", pd::proto::VarType::LOD_RANK_TABLE)
.value("LOD_TENSOR_ARRAY", pd::proto::VarType::LOD_TENSOR_ARRAY) .value("LOD_TENSOR_ARRAY", pd::proto::VarType::LOD_TENSOR_ARRAY)
.value("CHANNEL", pd::proto::VarType::CHANNEL)
.value("PLACE_LIST", pd::proto::VarType::PLACE_LIST) .value("PLACE_LIST", pd::proto::VarType::PLACE_LIST)
.value("READER", pd::proto::VarType::READER) .value("READER", pd::proto::VarType::READER)
.value("RAW", pd::proto::VarType::RAW); .value("RAW", pd::proto::VarType::RAW);
......
...@@ -21,10 +21,10 @@ limitations under the License. */ ...@@ -21,10 +21,10 @@ limitations under the License. */
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/framework/channel.h"
#include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/ir/pass_builder.h"
#include "paddle/fluid/framework/lod_rank_table.h" #include "paddle/fluid/framework/lod_rank_table.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/lod_tensor_array.h"
...@@ -595,6 +595,29 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -595,6 +595,29 @@ All parameter, weight, gradient are variables in Paddle.
m.def("is_profiler_enabled", platform::IsProfileEnabled); m.def("is_profiler_enabled", platform::IsProfileEnabled);
m.def("reset_profiler", platform::ResetProfiler); m.def("reset_profiler", platform::ResetProfiler);
py::class_<ir::Pass, std::shared_ptr<ir::Pass>> pass(m, "Pass");
pass.def(py::init())
.def("set_str", [](ir::Pass &self, const std::string &name,
const std::string &attr) {
self.Set<std::string>(name, new std::string(attr));
});
py::class_<ir::PassBuilder, std::shared_ptr<ir::PassBuilder>> pb(
m, "PassBuilder");
pb.def(py::init())
.def("append_pass",
[](ir::PassBuilder &self,
const std::string &pass_type) -> std::shared_ptr<ir::Pass> {
return self.AppendPass(pass_type);
})
.def("all_passes", [](ir::PassBuilder &self) { return self.AllPasses(); })
.def("insert_pass",
[](ir::PassBuilder &self, size_t idx, const std::string &pass_type) {
return self.InsertPass(idx, pass_type);
})
.def("remove_pass",
[](ir::PassBuilder &self, size_t idx) { self.RemovePass(idx); });
// -- python binds for parallel executor. // -- python binds for parallel executor.
py::class_<ParallelExecutor> pe(m, "ParallelExecutor"); py::class_<ParallelExecutor> pe(m, "ParallelExecutor");
py::class_<ExecutionStrategy> exec_strategy(pe, "ExecutionStrategy"); py::class_<ExecutionStrategy> exec_strategy(pe, "ExecutionStrategy");
...@@ -677,7 +700,11 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -677,7 +700,11 @@ All parameter, weight, gradient are variables in Paddle.
}, },
[](BuildStrategy &self, bool b) { [](BuildStrategy &self, bool b) {
self.fuse_elewise_add_act_ops_ = b; self.fuse_elewise_add_act_ops_ = b;
}); })
.def("_create_passes_from_strategy",
[](BuildStrategy &self) -> std::shared_ptr<ir::PassBuilder> {
return self.CreatePassesFromStrategy();
});
pe.def(py::init<const std::vector<platform::Place> &, pe.def(py::init<const std::vector<platform::Place> &,
const std::unordered_set<std::string> &, const std::unordered_set<std::string> &,
......
...@@ -56,13 +56,13 @@ struct Style { ...@@ -56,13 +56,13 @@ struct Style {
}; };
template <typename... Args> template <typename... Args>
static void PrettyLogEndl(const std::string& style, const char* fmt, static void PrettyLogEndl(const std::string &style, const char *fmt,
const Args&... args) { const Args &... args) {
std::cerr << style << Sprintf(fmt, args...) << reset() << std::endl; std::cerr << style << Sprintf(fmt, args...) << reset() << std::endl;
} }
template <typename... Args> template <typename... Args>
static void PrettyLog(const std::string& style, const char* fmt, static void PrettyLog(const std::string &style, const char *fmt,
const Args&... args) { const Args &... args) {
std::cerr << style << Sprintf(fmt, args...) << reset(); std::cerr << style << Sprintf(fmt, args...) << reset();
} }
......
...@@ -16,7 +16,11 @@ endfunction() ...@@ -16,7 +16,11 @@ endfunction()
trainer_test(test_Compare) trainer_test(test_Compare)
trainer_test(test_PyDataProviderWrapper) trainer_test(test_PyDataProviderWrapper)
trainer_test(test_recurrent_machine_generation) trainer_test(test_recurrent_machine_generation)
trainer_test(test_Trainer) if(NOT APPLE)
trainer_test(test_Trainer)
else()
message(WARNING "These tests has been disabled in OSX for random fail: \n test_Trainer")
endif()
############### test_TrainerOnePass ########################## ############### test_TrainerOnePass ##########################
if(WITH_PYTHON) if(WITH_PYTHON)
......
...@@ -395,10 +395,11 @@ EOF ...@@ -395,10 +395,11 @@ EOF
ctest --output-on-failure -j $1 ctest --output-on-failure -j $1
# make install should also be test when unittest # make install should also be test when unittest
make install -j 8 make install -j 8
pip install /usr/local/opt/paddle/share/wheels/*.whl pip install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]] ; then if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]] ; then
paddle version paddle version
fi fi
pip uninstall -y paddlepaddle
fi fi
} }
...@@ -654,11 +655,21 @@ function gen_fluid_inference_lib() { ...@@ -654,11 +655,21 @@ function gen_fluid_inference_lib() {
if [[ ${WITH_C_API:-OFF} == "OFF" && ${WITH_INFERENCE:-ON} == "ON" ]] ; then if [[ ${WITH_C_API:-OFF} == "OFF" && ${WITH_INFERENCE:-ON} == "ON" ]] ; then
cat <<EOF cat <<EOF
======================================== ========================================
Deploying fluid inference library ... Generating fluid inference library ...
======================================== ========================================
EOF EOF
cmake .. -DWITH_DISTRIBUTE=OFF cmake .. -DWITH_DISTRIBUTE=OFF
make -j `nproc` inference_lib_dist make -j `nproc` inference_lib_dist
fi
}
function tar_fluid_inference_lib() {
if [[ ${WITH_C_API:-OFF} == "OFF" && ${WITH_INFERENCE:-ON} == "ON" ]] ; then
cat <<EOF
========================================
Taring fluid inference library ...
========================================
EOF
cd ${PADDLE_ROOT}/build cd ${PADDLE_ROOT}/build
cp -r fluid_install_dir fluid cp -r fluid_install_dir fluid
tar -czf fluid.tgz fluid tar -czf fluid.tgz fluid
...@@ -673,7 +684,7 @@ function test_fluid_inference_lib() { ...@@ -673,7 +684,7 @@ function test_fluid_inference_lib() {
======================================== ========================================
EOF EOF
cd ${PADDLE_ROOT}/paddle/fluid/inference/api/demo_ci cd ${PADDLE_ROOT}/paddle/fluid/inference/api/demo_ci
./run.sh ${PADDLE_ROOT} ${WITH_MKL:-ON} ${WITH_GPU:-OFF} ./run.sh ${PADDLE_ROOT} ${WITH_MKL:-ON} ${WITH_GPU:-OFF} ${INFERENCE_DEMO_INSTALL_DIR}
./clean.sh ./clean.sh
fi fi
} }
...@@ -722,6 +733,7 @@ function main() { ...@@ -722,6 +733,7 @@ function main() {
fluid_inference_lib) fluid_inference_lib)
cmake_gen ${PYTHON_ABI:-""} cmake_gen ${PYTHON_ABI:-""}
gen_fluid_inference_lib gen_fluid_inference_lib
tar_fluid_inference_lib
test_fluid_inference_lib test_fluid_inference_lib
;; ;;
check_style) check_style)
...@@ -742,11 +754,15 @@ function main() { ...@@ -742,11 +754,15 @@ function main() {
build_mac build_mac
run_mac_test ${PROC_RUN:-1} run_mac_test ${PROC_RUN:-1}
;; ;;
macbuild)
cmake_gen ${PYTHON_ABI:-""}
build_mac
;;
cicheck_py35) cicheck_py35)
cmake_gen ${PYTHON_ABI:-""} cmake_gen ${PYTHON_ABI:-""}
build build
run_test run_test
assert_api_not_changed assert_api_not_changed ${PYTHON_ABI:-""}
;; ;;
*) *)
print_usage print_usage
......
...@@ -87,6 +87,7 @@ if (WITH_TESTING) ...@@ -87,6 +87,7 @@ if (WITH_TESTING)
endif() endif()
endif() endif()
add_subdirectory(paddle/fluid/tests) add_subdirectory(paddle/fluid/tests)
add_subdirectory(paddle/fluid/contrib/tests)
endif() endif()
install(DIRECTORY ${PADDLE_PYTHON_PACKAGE_DIR} install(DIRECTORY ${PADDLE_PYTHON_PACKAGE_DIR}
DESTINATION opt/paddle/share/wheels DESTINATION opt/paddle/share/wheels
......
...@@ -271,7 +271,8 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr): ...@@ -271,7 +271,8 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr):
"All parameters' 'clip_norm' of a same group should be the same" "All parameters' 'clip_norm' of a same group should be the same"
) )
local_norm_var = layers.reduce_sum(input=layers.pow(x=grad, factor=2.0)) square = grad * grad
local_norm_var = layers.cast(layers.reduce_sum(input=square), 'float64')
context[self.group_name].append(local_norm_var) context[self.group_name].append(local_norm_var)
self.context = context self.context = context
...@@ -281,6 +282,7 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr): ...@@ -281,6 +282,7 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr):
if group_scale_name not in self.context: if group_scale_name not in self.context:
group_norm_var = layers.sums(input=self.context[self.group_name]) group_norm_var = layers.sums(input=self.context[self.group_name])
group_norm_var = layers.sqrt(x=group_norm_var) group_norm_var = layers.sqrt(x=group_norm_var)
group_norm_var = layers.cast(group_norm_var, 'float32')
clip_var = self.context[self.group_name + "_clip"] clip_var = self.context[self.group_name + "_clip"]
group_scale_var = layers.elementwise_div( group_scale_var = layers.elementwise_div(
x=clip_var, x=clip_var,
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
from .layers.control_flow import BlockGuard, equal
from .framework import Operator
from .layer_helper import LayerHelper, unique_name
from .layers import fill_constant
from . import core
__all__ = [
'make_channel', 'channel_send', 'channel_recv', 'channel_close', 'Select'
]
class Go(BlockGuard):
def __init__(self, name=None):
self.helper = LayerHelper("go", name=name)
super(Go, self).__init__(self.helper.main_program)
def __enter__(self):
super(Go, self).__enter__()
def __exit__(self, exc_type, exc_val, exc_tb):
if exc_type is not None:
return False
self._construct_go_op()
return super(Go, self).__exit__(exc_type, exc_val, exc_tb)
def _construct_go_op(self):
main_program = self.helper.main_program
go_block = main_program.current_block()
parent_block = main_program.block(main_program.current_block()
.parent_idx)
inner_outputs = set()
x_name_list = set()
for op in go_block.ops:
# Iterate over all operators, get all the inputs
# and add as input to the Go operator.
for iname in op.input_names:
for in_var_name in op.input(iname):
if in_var_name not in inner_outputs:
x_name_list.add(in_var_name)
for oname in op.output_names:
for out_var_name in op.output(oname):
inner_outputs.add(out_var_name)
# Iterate over all operators , get all the outputs
# add to the output list of Go operator only if
# they exist in the parent block.
out_vars = []
for inner_out_name in inner_outputs:
if inner_out_name in parent_block.vars:
out_vars.append(parent_block.var(inner_out_name))
parent_block.append_op(
type='go',
inputs={
'X': [
parent_block._var_recursive(x_name)
for x_name in x_name_list
]
},
outputs={},
attrs={'sub_block': go_block})
class SelectCase(object):
DEFAULT = 0
SEND = 1
RECEIVE = 2
def __init__(self,
select,
case_idx,
case_to_execute,
channel_action_fn=None,
channel=None,
value=None,
is_copy=False):
self.select = select
self.helper = LayerHelper('conditional_block')
self.main_program = self.helper.main_program
self.is_scalar_condition = True
self.case_to_execute = case_to_execute
self.idx = case_idx
# Since we aren't going to use the `channel_send` or `channel_recv`
# functions directly, we just need to capture the name.
self.action = (self.SEND
if channel_action_fn.__name__ == ('channel_send') else
self.RECEIVE) if channel_action_fn else self.DEFAULT
X = value
if self.action == self.SEND and is_copy:
# We create of copy of the data we want to send
copied_X = self.select.parent_block.create_var(
name=unique_name.generate(value.name + '_copy'),
type=value.type,
dtype=value.dtype,
shape=value.shape,
lod_level=value.lod_level,
capacity=value.capacity
if hasattr(value, 'capacity') else None, )
self.select.parent_block.append_op(
type="assign", inputs={"X": value}, outputs={"Out": copied_X})
X = copied_X
self.value = X
self.channel = channel
def __enter__(self):
self.block = self.main_program._create_block()
def construct_op(self):
main_program = self.helper.main_program
cases_block = main_program.current_block()
inner_outputs = set()
input_set = set()
params = set()
for op in self.block.ops:
# Iterate over all operators, get all the inputs
# and add as input to the SelectCase operator.
for iname in op.input_names:
for in_var_name in op.input(iname):
if in_var_name not in inner_outputs:
input_set.add(in_var_name)
for oname in op.output_names:
for out_var_name in op.output(oname):
inner_outputs.add(out_var_name)
param_list = [
cases_block.var(each_name) for each_name in params
if each_name not in input_set
]
# Iterate over all operators, get all the outputs
# add to the output list of SelectCase operator only if
# they exist in the parent block.
out_vars = []
for inner_out_name in inner_outputs:
if inner_out_name in cases_block.vars:
out_vars.append(cases_block.var(inner_out_name))
# First, create an op that will determine whether or not this is the
# conditional variable to execute.
should_execute_block = equal(
fill_constant(
shape=[1], dtype=core.VarDesc.VarType.INT32, value=self.idx),
self.case_to_execute)
step_scope = cases_block.create_var(
type=core.VarDesc.VarType.STEP_SCOPES)
cases_block.append_op(
type='conditional_block',
inputs={'X': [should_execute_block],
'Params': param_list},
outputs={'Out': out_vars,
'Scope': [step_scope]},
attrs={
'sub_block': self.block,
'is_scalar_condition': self.is_scalar_condition
})
return '%s,%s,%s,%s' % (self.idx, self.action, self.channel.name
if self.channel else '', self.value.name
if self.value else '')
def __exit__(self, exc_type, exc_val, exc_tb):
self.main_program._rollback()
if exc_type is not None:
return False # re-raise exception
return True
class Select(BlockGuard):
def __init__(self, name=None):
self.helper = LayerHelper('select', name=name)
self.parent_block = self.helper.main_program.current_block()
self.cases = []
super(Select, self).__init__(self.helper.main_program)
self.case_to_execute = fill_constant(
shape=[1], dtype=core.VarDesc.VarType.INT32, value=-1)
def __enter__(self):
super(Select, self).__enter__()
return self
def case(self, channel_action_fn, channel, value, is_copy=False):
"""Create a new block for this condition.
"""
select_case = SelectCase(self,
len(self.cases), self.case_to_execute,
channel_action_fn, channel, value, is_copy)
self.cases.append(select_case)
return select_case
def default(self):
"""Create a default case block for this condition.
"""
default_case = SelectCase(self, len(self.cases), self.case_to_execute)
self.cases.append(default_case)
return default_case
def __exit__(self, exc_type, exc_val, exc_tb):
if exc_type is not None:
return False
# Create a select op and another block to wrap its
# case blocks.
select_block = self.helper.main_program.current_block()
parent_block = self.helper.main_program.block(select_block.parent_idx)
# Construct each case op, inside the newly created select block.
serialized_cases = []
for case in self.cases:
serialized_cases.append(case.construct_op())
intermediate = set()
params = set()
for case_block in select_block.ops:
if case_block.attrs and 'sub_block' in case_block.attrs:
for each_op in case_block.attrs['sub_block'].ops:
assert isinstance(each_op, Operator)
for iname in each_op.input_names:
for in_var_name in each_op.input(iname):
if in_var_name not in intermediate:
params.add(in_var_name)
for oname in each_op.output_names:
for out_var_name in each_op.output(oname):
intermediate.add(out_var_name)
out_list = [
parent_block.var(var_name) for var_name in parent_block.vars
if var_name in intermediate
]
X = [select_block._var_recursive(x_name) for x_name in params]
# Needs to be used by `equal` inside the cases block.
X.append(self.case_to_execute)
# Construct the select op.
parent_block.append_op(
type='select',
inputs={'X': X,
'case_to_execute': self.case_to_execute},
attrs={'sub_block': select_block,
'cases': serialized_cases},
outputs={'Out': out_list})
return super(Select, self).__exit__(exc_type, exc_val, exc_tb)
def make_channel(dtype, capacity=0):
"""
Helps implementation of a concurrent program by creating a "channel" of
a defined data type. Channels allow for the passing of data in
concurrent scenarios - such as when using threads to divide computation.
Channels can be used to "send" and "receive" such data concurrently.
There are two kinds of channels: unbuffered and buffered. Unbuffered
channels have no capacity - and thus, block on send and only unblock only
once what they have sent has been received.
On the other hand, buffered channels are initialized with a capacity -
and do not block on sends.
Use this method in combination with `channel_send`, `channel_recv`,
`channel_close`, and `Go` to design a concurrent Paddle program.
Args:
dtype (ParamAttr|string): Data type of the data sent in the channel.
This data type should be the string name of a numpy data type.
capacity (ParamAttr|int): Size of the channel. Defaults to 0 for
to create an unbuffered channel.
Returns:
Variable: The channel variable that can be used to send an receive data
of the defined dtype.
Examples:
.. code-block:: python
ch = fluid.make_channel(dtype='int32', capacity=10)
...
# Code to execute in a Go block, which receives the channel data.
fluid.channel_send(ch, 100)
fluid.channel_close(ch)
"""
helper = LayerHelper('channel_create', **locals())
main_program = helper.main_program
make_channel_block = main_program.current_block()
# Make a channel variable (using the channel data type) and make sure it
# persists into the global scope.
channel = helper.create_variable(
name=unique_name.generate('channel'),
type=core.VarDesc.VarType.CHANNEL,
persistable=True)
create_channel_op = make_channel_block.append_op(
type="channel_create",
outputs={"Out": channel},
attrs={"data_type": dtype,
"capacity": capacity})
return channel
def channel_send(channel, value, is_copy=False):
"""
Sends a value through a channel variable. Used by an unbuffered or buffered
channel to pass data from within or to a concurrent Go block, where
`channel_recv` to used to get the passed value.
Args:
channel (Variable|Channel): Channel variable created using
`make_channel`.
value (Variable): Value to send to channel
is_copy (bool): Copy data while channel send. If False, then data
is moved. The input cannot be used after move. (default False)
Returns:
Variable: The boolean status on whether or not the channel
successfully sent the passed value.
Examples:
.. code-block:: python
ch = fluid.make_channel(dtype='int32', capacity=10)
...
# Code to execute in a Go block, which receives the channel data.
fluid.channel_send(ch, 100)
"""
helper = LayerHelper('channel_send', **locals())
main_program = helper.main_program
channel_send_block = main_program.current_block()
X = value
if is_copy:
copied_X = helper.create_variable(
name=unique_name.generate(value.name + '_copy'),
type=value.type,
dtype=value.dtype,
shape=value.shape,
lod_level=value.lod_level,
capacity=value.capacity if hasattr(value, 'capacity') else None)
assign_op = channel_send_block.append_op(
type="assign", inputs={"X": value}, outputs={"Out": copied_X})
X = copied_X
channel_send_block.append_op(
type="channel_send", inputs={
"Channel": channel,
"X": X,
})
def channel_recv(channel, return_value):
"""
Receives a value through a channel variable. Used by an unbuffered or
buffered channel within a concurrent Go block to get data from originally
sent using `channel_send`, or from outside such a block where
`channel_send` is used to send the value.
Args:
channel (Variable|Channel): Channel variable created using
`make_channel`.
return_value (Variable): Variable to set as a result of running channel_recv_op
Returns:
Variable: The received value from the channel.
Variable: The boolean status on whether or not the channel
successfully received the passed value.
Examples:
.. code-block:: python
ch = fluid.make_channel(dtype='int32', capacity=10)
with fluid.Go():
returned_value, return_status = fluid.channel_recv(ch, 'int32')
# Code to send data through the channel.
"""
helper = LayerHelper('channel_recv', **locals())
main_program = helper.main_program
channel_recv_block = main_program.current_block()
status = helper.create_variable(
name=unique_name.generate('status'),
type=core.VarDesc.VarType.LOD_TENSOR,
dtype=core.VarDesc.VarType.BOOL)
channel_recv_op = channel_recv_block.append_op(
type="channel_recv",
inputs={"Channel": channel},
outputs={"Out": return_value,
"Status": status})
return return_value, status
def channel_close(channel):
"""
Closes a channel created using `make_channel`.
Args:
channel (Variable|Channel): Channel variable created using
`make_channel`.
Examples:
.. code-block:: python
ch = fluid.make_channel(dtype='int32', capacity=10)
...
# Code to receive and send data through a channel
...
fluid.channel_close(ch)
"""
helper = LayerHelper('channel_close', **locals())
main_program = helper.main_program
channel_close_block = main_program.current_block()
channel_close_op = channel_close_block.append_op(
type="channel_close", inputs={"Channel": channel})
...@@ -20,8 +20,11 @@ from . import memory_usage_calc ...@@ -20,8 +20,11 @@ from . import memory_usage_calc
from .memory_usage_calc import * from .memory_usage_calc import *
from . import op_frequence from . import op_frequence
from .op_frequence import * from .op_frequence import *
from . import quantize
from .quantize import *
__all__ = [] __all__ = []
__all__ += decoder.__all__ __all__ += decoder.__all__
__all__ += memory_usage_calc.__all__ __all__ += memory_usage_calc.__all__
__all__ += op_frequence.__all__ __all__ += op_frequence.__all__
__all__ += quantize.__all__
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. # Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License. # you may not use this file except in compliance with the License.
...@@ -14,28 +14,7 @@ ...@@ -14,28 +14,7 @@
from __future__ import print_function from __future__ import print_function
import unittest from . import quantize_transpiler
import paddle.fluid as fluid from .quantize_transpiler import *
import paddle.fluid.core as core
from paddle.fluid.executor import Executor
__all__ = quantize_transpiler.__all__
class TestRoutineOp(unittest.TestCase):
def test_simple_routine(self):
ch = fluid.make_channel(
dtype=core.VarDesc.VarType.BOOL, name="CreateChannel")
with fluid.Go():
fluid.channel_send(ch, True)
result = fluid.channel_recv(ch)
fluid.channel_close(ch)
cpu = core.CPUPlace()
exe = Executor(cpu)
outs = exe.run(fetch_list=[result])
self.assertEqual(outs[0], True)
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import numpy as np
from paddle.fluid.framework import default_main_program, default_startup_program, program_guard
from paddle.fluid.layer_helper import LayerHelper
from paddle.fluid import unique_name
from paddle.fluid import core
from paddle.fluid.initializer import Constant
from paddle.fluid.param_attr import ParamAttr
from paddle.fluid.layer_helper import LayerHelper
from paddle.fluid.layers.nn import autoincreased_step_counter
from paddle.fluid.framework import Variable
from paddle.fluid.executor import global_scope
from paddle.fluid.transpiler.inference_transpiler import InferenceTranspiler
__all__ = ['QuantizeTranspiler']
_QUANTIZABLE_OP_TYPES = ['conv2d', 'depthwise_conv2d', 'mul']
def _quantized_var_name(var_name):
"""
Return quantized variable name for the input `var_name`.
"""
return "%s.quantized" % (var_name)
def _dequantized_var_name(var_name):
"""
Return dequantized variable name for the input `var_name`.
"""
return "%s.dequantized" % (var_name)
def _quantized_scale_name(var_name):
"""
Return quantized variable name for the input `var_name`.
"""
return "%s.scale" % (var_name)
def _original_var_name(var_name):
"""
Return the original variable name.
"""
if var_name.endswith('.quantized.dequantized'):
return var_name[:-len('.quantized.dequantized')]
if var_name.endswith('.quantized'):
return var_name[:-len('.quantized')]
if var_name.endswith('.dequantized'):
return var_name[:-len('.dequantized')]
if var_name.endswith('.scale'):
return var_name[:-len('.scale')]
else:
return var_name
def _is_float(v):
return isinstance(v, float) or isinstance(v, np.float32)
def quant(x, scale, num_bits):
y = np.round(x / scale * ((1 << (num_bits - 1)) - 1))
return y
class QuantizeTranspiler(object):
def __init__(self,
weight_bits=8,
activation_bits=8,
activation_quantize_type='abs_max',
weight_quantize_type='abs_max',
window_size=10000):
"""
Convert and rewrite the fluid Program according to weight and
activation quantization type.
Args:
weight_bits (int): quantization bit number for weights,
the bias is not quantized.
activation_bits (int): quantization bit number for activation.
activation_quantize_type (str): quantization type for activation,
now support 'abs_max', 'range_abs_max'. If use 'abs_max' mode,
the quantization scale will be calculated dynamically each step
in both training and testing period. If use 'range_abs_max',
a static quantization scale will be calculated during training
and used in inference.
weight_quantize_type (str): quantization type for weights,
support 'abs_max'. The 'range_abs_max' usually is not used for
weight, since weights are fixed once the model is well trained.
window_size (int): the window size for 'range_abs_max' quantization.
Examples:
.. code-block:: python
# the original program will be rewrite, if you don't want to
# change it, please clone at first.
# quantize_program = program.clone()
t = fluid.QuantizeTranspiler()
t.transpile(quantize_program)
"""
self.weight_bits = weight_bits
self.activation_bits = activation_bits
quant_type = ['abs_max', 'range_abs_max']
if weight_quantize_type not in quant_type:
raise ValueError(
"Unknown weight_quantize_type: '%s'. It can only be ",
"'abs_max' or 'range_abs_max'.", str(weight_quantize_type))
if activation_quantize_type not in quant_type:
raise ValueError(
"Unknown activation_quantize_type : '%s'. It can only be ",
"'abs_max' or 'range_abs_max'.", str(activation_quantize_type))
self.weight_quantize_type = weight_quantize_type
self.activation_quantize_type = activation_quantize_type
self.window_size = window_size
self.helper = LayerHelper(self.__class__.__name__)
self.fake_quant_op_types = [
'fake_quantize_abs_max', 'fake_quantize_range_abs_max'
]
self.fake_dequant_op_types = ['fake_dequantize_max_abs']
self.is_test = None
self.global_step = None
def training_transpile(self, program=None, startup_program=None):
"""Rewrites a training input program in place for simulated
quantization. Insert fake quantization and de-quantization ops into
program to simulate the error introduced by quantization. And change
the graident ops' input by using the faked quantization weights and
activation. Since the program is transformed in place, the graph
connection will change.
Args:
program (Program): the input program to be transpile.
"""
self.is_test = False
program = default_main_program() if program is None else program
startup_program = default_startup_program() if startup_program is \
None else startup_program
# marked the variable which has been quantized and dequantized.
dequanted_vars = [
collections.OrderedDict() for _ in range(len(program.blocks))
]
grad_op_types = ['%s_grad' % (type) for type in _QUANTIZABLE_OP_TYPES]
params = [p.name for p in program.global_block().iter_parameters()]
def _transpile_forward(block, op):
idx = block.ops.index(op)
block_id = block.idx
# insert quant op and dequant op
for name in op.input_arg_names:
if name in dequanted_vars[block_id]:
dequant_var = dequanted_vars[block_id][name]
else:
var = block.var(name)
quant_bits = self.weight_bits if var.name in params \
else self.activation_bits
quant_type = self.weight_quantize_type if var.name \
in params else self.activation_quantize_type
quant_var, scale_var = self._insert_quant_op(
block, idx, var, quant_bits, quant_type)
dequant_var = self._insert_dequant_op(
block, idx + 1, quant_var, scale_var, quant_bits)
dequanted_vars[block_id][name] = dequant_var
# rename the forward op inputs
op._rename_input(name, dequant_var.name)
def _transpile_backward(block, op):
block_id = block.idx
no_dequanted_input_vars = True
for name in op.input_arg_names:
if name in dequanted_vars[block_id]:
dequant_var = dequanted_vars[block_id][name]
op._rename_input(name, dequant_var.name)
no_dequanted_input_vars = False
if no_dequanted_input_vars:
raise ValueError("There is no dequanted inputs for op %s." %
(op.type))
with program_guard(program, startup_program):
self._create_global_step()
for block in program.blocks:
ops = list(block.ops)
block_id = block.idx
for op in ops:
# rewrite the forward ProgramDes
if op.type in _QUANTIZABLE_OP_TYPES:
_transpile_forward(block, op)
# rename the backward op inputs
if op.type in grad_op_types:
_transpile_backward(block, op)
def _create_global_step(self):
if self.weight_quantize_type == 'range_abs_max' or \
self.activation_quantize_type == 'range_abs_max':
self.global_step = autoincreased_step_counter()
def freeze_program(self, program, place, fuse_bn=False, scope=None):
"""Freeze input training program for inference.
Args:
program (Program): the input program to be transpile.
"""
self.is_test = True
scope = global_scope() if scope is None else scope
program = default_main_program() if program is None else program
if fuse_bn:
bn_fuse_transpiler = BNFuseTranspiler()
bn_fuse_transpiler.transpile(program, place)
persistable_vars = [
v.name
for v in filter(lambda var: var.persistable, program.list_vars())
]
op_in_rename_map = [
collections.OrderedDict() for _ in range(len(program.blocks))
]
op_out_rename_map = [
collections.OrderedDict() for _ in range(len(program.blocks))
]
var_scale_map = [
collections.OrderedDict() for _ in range(len(program.blocks))
]
def _remove_fake_quant_and_dequant_op(block, op):
idx = block.ops.index(op)
block_id = block.idx
k = op.output('Out')[0]
v = op.input('X')[0]
if v not in op_in_rename_map[block_id]:
op_in_rename_map[block_id][k] = v
else:
op_in_rename_map[block_id][k] = op_in_rename_map[block_id][v]
block._remove_op(idx)
def _insert_post_dequant_op(block, op):
idx = block.ops.index(op)
block_id = block.idx
max_range = None
scale_var = None
for name in op.input_arg_names:
if name in op_in_rename_map[block_id]:
op._rename_input(name, op_in_rename_map[block_id][name])
scale_v = var_scale_map[block_id][_original_var_name(name)]
if _original_var_name(name) in persistable_vars:
param_range = (1 << (self.weight_bits - 1)) - 1
act_range = (1 << (self.activation_bits - 1)) - 1
assert _is_float(scale_v)
max_range = param_range * act_range / scale_v
else:
assert isinstance(scale_v, Variable)
scale_var = var_scale_map[block_id][_original_var_name(
name)]
if len(op.output_arg_names) != 1:
raise ValueError("Only support one output, but op %s has"
" more than one output." % (op.type))
out_var = block.var(op.output_arg_names[0])
dequant_var = block.create_var(
name=_dequantized_var_name(out_var.name),
type=out_var.type,
shape=out_var.shape,
dtype=out_var.dtype)
# insert fake_dequantize_op
dequant_op = block._insert_op(
idx + 1,
type="fake_dequantize_max_abs",
attrs={'max_range': float(max_range)},
inputs={"X": out_var,
'Scale': scale_var},
outputs={"Out": dequant_var})
op_out_rename_map[block_id][out_var.name] = dequant_var.name
return dequant_var
def _load_var(name):
return np.array(scope.find_var(name).get_tensor())
def _restore_var(name, arr):
t = scope.find_var(name).get_tensor()
t.set(arr, place)
for block in program.blocks:
ops = list(block.ops)
block_id = block.idx
for op in ops:
op_type = op.type
# insert dequant_op after fc/conv, need to rename
# input of the followed ops
for name in op.input_arg_names:
if name in op_out_rename_map[block_id]:
op._rename_input(name,
op_out_rename_map[block_id][name])
if op_type in self.fake_quant_op_types:
in_arg_name = op.input('X')[0]
if in_arg_name in persistable_vars:
if self.weight_quantize_type == 'abs_max':
param = _load_var(in_arg_name)
scale_v = np.max(np.abs(param))
else:
scale_v = _load_var(op.output('OutScale')[0])
var_scale_map[block_id][in_arg_name] = scale_v
else:
scale_v = block.var(op.output('OutScale')[0])
var_scale_map[block_id][in_arg_name] = scale_v
if in_arg_name in persistable_vars:
_remove_fake_quant_and_dequant_op(block, op)
# quantize weight and restore
param_t = _load_var(in_arg_name)
param_q_t = quant(param_t, scale_v, self.weight_bits)
_restore_var(in_arg_name, param_q_t)
if op_type in self.fake_dequant_op_types:
_remove_fake_quant_and_dequant_op(block, op)
if op_type in _QUANTIZABLE_OP_TYPES:
dequant_var = _insert_post_dequant_op(block, op)
# remove the unused var in ProgramDesc
self._remove_unused_var(program)
#program = program.clone()
def convert_to_int8(self, program, place, scope=None):
scope = global_scope() if scope is None else scope
program = default_main_program() if program is None else program
def _load_var(name):
return np.array(scope.find_var(name).get_tensor())
global_block = program.global_block()
def convert_to_int8(var):
int8_var_name = var.name + ".int8"
int8_var = global_block.create_parameter(
name=int8_var_name.encode('ascii'),
type=var.type,
dtype=core.VarDesc.VarType.INT8,
shape=var.shape)
tensor = _load_var(var.name)
scope.var(int8_var_name)
int8_tensor = scope.find_var(int8_var_name).get_tensor()
int8_tensor.set(tensor.astype(np.int8), place)
return int8_var
input_map = {}
for block in program.blocks:
for op in list(block.ops):
if op.type in _QUANTIZABLE_OP_TYPES:
for name in op.input_arg_names:
var = block.var(name)
if var.persistable:
if name not in input_map:
int8_var = convert_to_int8(var)
input_map[name] = int8_var.name
op._rename_input(name, input_map[name])
self._remove_unused_var(program)
def _remove_unused_var(self, program):
all_remove_vars = []
for block in program.blocks:
args = []
for op in block.ops:
args += op.input_arg_names
args += op.output_arg_names
args = list(set(args))
var_names = block.vars.keys()
sub_block_remove_vars = []
for var in var_names:
if var not in args:
sub_block_remove_vars.append(var)
all_remove_vars.append(sub_block_remove_vars)
remove_vars = [list(set(v)) for v in all_remove_vars]
for i, block in enumerate(program.blocks):
for v in remove_vars[i]:
block._remove_var(v)
def _insert_quant_abs_max_op(self, block, idx, var, quant_bits):
"""Insert fake_quantize_abs_max op.
"""
quant_var = block.create_var(
name=_quantized_var_name(var.name),
type=var.type,
shape=var.shape,
dtype=var.dtype)
scale = block.create_var(
name=_quantized_scale_name(var.name),
type=var.type,
shape=var.shape,
dtype=var.dtype)
quant_op = block._insert_op(
idx,
type='fake_quantize_abs_max',
attrs={'bit_length': quant_bits},
inputs={'X': var},
outputs={'Out': quant_var,
'OutScale': scale})
return quant_var, scale
def _insert_quant_range_abs_max_op(self, block, idx, var, quant_bits):
"""Insert fake_quantize_range_abs_max
"""
quant_var = block.create_var(
name=_quantized_var_name(var.name),
type=var.type,
shape=var.shape,
dtype=var.dtype)
scale = self.helper.create_parameter(
attr=ParamAttr(
name=_quantized_scale_name(var.name),
initializer=Constant(0.001),
trainable=False),
shape=[1],
dtype=var.dtype)
scale.stop_gradient = True
ins = {'X': var, 'InScale': scale}
outs = {'Out': quant_var, 'OutScale': scale}
if not self.is_test:
# A global step counter variable with type int64
scales = self.helper.create_global_variable(
name=unique_name.generate('scales'),
persistable=True,
dtype=var.dtype,
shape=[self.window_size])
self.helper.set_variable_initializer(
scales, initializer=Constant(value=0))
ins['Iter'] = self.global_step
outs['OutScales'] = scales
attrs = {
'window_size': self.window_size,
'bit_length': quant_bits,
'is_test': self.is_test
}
quant_op = block._insert_op(
idx,
type='fake_quantize_range_abs_max',
attrs=attrs,
inputs=ins,
outputs=outs)
return quant_var, scale
def _insert_quant_op(self, block, idx, var, quant_bits, quant_type):
"""
Insert fake_quantize_op
"""
if quant_type == 'abs_max':
return self._insert_quant_abs_max_op(block, idx, var, quant_bits)
elif quant_type == 'range_abs_max':
return self._insert_quant_range_abs_max_op(block, idx, var,
quant_bits)
def _insert_dequant_op(self, block, idx, var, scale, quant_bits):
"""
Insert fake_quantize_op
"""
dequant_var = block.create_var(
name=_dequantized_var_name(var.name),
type=var.type,
shape=var.shape,
dtype=var.dtype)
# insert fake_dequantize_op
max_range = (1 << (quant_bits - 1)) - 1
dequant_op = block._insert_op(
idx,
type="fake_dequantize_max_abs",
attrs={'max_range': float(max_range)},
inputs={"X": var,
'Scale': scale},
outputs={"Out": dequant_var})
return dequant_var
class BNFuseTranspiler(InferenceTranspiler):
def _fuse_param(self, current_op, bn_op, bias_op, with_bias):
def _update_param(op, param_name, new_param):
var = self.block.vars[param_name]
tensor = self.scope.find_var(param_name).get_tensor()
tensor.set(np.array(new_param), self.place)
def _load_param(param_name):
return np.array(self.scope.find_var(param_name).get_tensor())
bias_bn = _load_param(bn_op.input("Bias")[0]) #Bias
scale_bn = _load_param(bn_op.input("Scale")[0]) #Scale
mean_bn = _load_param(bn_op.input("Mean")[0]) #Mean
var_bn = _load_param(bn_op.input("Variance")[0]) #Variance
if current_op.type in ['conv2d', 'depthwise_conv2d']:
current_param = _load_param(
_original_var_name(current_op.input("Filter")[0]))
elif current_op.type == 'mul':
current_param = _load_param(
_original_var_name(current_op.input("Y")[0]))
std_bn = np.float32(np.sqrt(np.add(var_bn, 1e-5)))
tmp = np.float32(np.divide(scale_bn, std_bn))
# add bias of batch_norm_op to conv2d
if with_bias:
bias = _load_param(bias_op.input("Y"))
else:
bias = np.zeros(bias_bn.shape)
bias = np.float32(
np.add(np.multiply(np.subtract(bias, mean_bn), tmp), bias_bn))
# re-compute weight of conv2d/fc
tmp = tmp.reshape(tmp.shape[0], -1)
dst_param = current_param.reshape((tmp.shape[0], -1))
dst_param = np.float32(np.multiply(dst_param, tmp))
dst_param = dst_param.reshape(current_param.shape)
# update parameters
if current_op.type in ['conv2d', 'depthwise_conv2d']:
_update_param(current_op,
_original_var_name(current_op.input("Filter")[0]),
dst_param)
elif current_op.type == 'mul':
_update_param(current_op,
_original_var_name(current_op.input("Y")[0]),
dst_param)
_update_param(bias_op, bias_op.input("Y")[0], bias)
# collect the renamed input
self.input_map[bn_op.output("Y")[0]] = bias_op.output("Out")[0]
file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
foreach(src ${TEST_OPS})
py_test(${src} SRCS ${src}.py)
endforeach()
# copyright (c) 2018 paddlepaddle authors. all rights reserved.
#
# licensed under the apache license, version 2.0 (the "license");
# you may not use this file except in compliance with the license.
# you may obtain a copy of the license at
#
# http://www.apache.org/licenses/license-2.0
#
# unless required by applicable law or agreed to in writing, software
# distributed under the license is distributed on an "as is" basis,
# without warranties or conditions of any kind, either express or implied.
# see the license for the specific language governing permissions and
# limitations under the license.
import numpy as np
import six
import unittest
import paddle
import paddle.fluid as fluid
from paddle.fluid.contrib.quantize.quantize_transpiler import _original_var_name
from paddle.fluid.contrib.quantize.quantize_transpiler import QuantizeTranspiler
def linear_fc(num):
data = fluid.layers.data(name='image', shape=[1, 32, 32], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
hidden = data
for _ in six.moves.xrange(num):
hidden = fluid.layers.fc(hidden, size=128, act='relu')
loss = fluid.layers.cross_entropy(input=hidden, label=label)
loss = fluid.layers.mean(loss)
return loss
def residual_block(num):
def conv_bn_layer(input,
ch_out,
filter_size,
stride,
padding,
act='relu',
bias_attr=False):
tmp = fluid.layers.conv2d(
input=input,
filter_size=filter_size,
num_filters=ch_out,
stride=stride,
padding=padding,
act=None,
bias_attr=bias_attr)
return fluid.layers.batch_norm(input=tmp, act=act)
data = fluid.layers.data(name='image', shape=[1, 32, 32], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
hidden = data
for _ in six.moves.xrange(num):
conv = conv_bn_layer(hidden, 16, 3, 1, 1, act=None, bias_attr=True)
short = conv_bn_layer(hidden, 16, 1, 1, 0, act=None)
hidden = fluid.layers.elementwise_add(x=conv, y=short, act='relu')
fc = fluid.layers.fc(input=hidden, size=10)
loss = fluid.layers.cross_entropy(input=fc, label=label)
loss = fluid.layers.mean(loss)
return loss
def conv_net(img, label):
conv_pool_1 = fluid.nets.simple_img_conv_pool(
input=img,
filter_size=5,
num_filters=20,
pool_size=2,
pool_stride=2,
act="relu")
conv_pool_1 = fluid.layers.batch_norm(conv_pool_1)
conv_pool_2 = fluid.nets.simple_img_conv_pool(
input=conv_pool_1,
filter_size=5,
num_filters=50,
pool_size=2,
pool_stride=2,
act="relu")
prediction = fluid.layers.fc(input=conv_pool_2, size=10, act='softmax')
loss = fluid.layers.cross_entropy(input=prediction, label=label)
avg_loss = fluid.layers.mean(loss)
return avg_loss
class TestQuantizeTranspiler(unittest.TestCase):
def setUp(self):
# since quant_op and dequant_op is not ready, use cos and sin for test
self.weight_quant_op_type = 'fake_quantize_abs_max'
self.dequant_op_type = 'fake_dequantize_max_abs'
self.quantizable_op_and_inputs = {
'conv2d': ['Input', 'Filter'],
'depthwise_conv2d': ['Input', 'Filter'],
'mul': ['X', 'Y']
}
self.quantizable_op_grad_and_inputs = {
'conv2d_grad': ['Input', 'Filter'],
'depthwise_conv2d_grad': ['Input', 'Filter'],
'mul_grad': ['X', 'Y']
}
def check_program(self, program):
quantized_ops = {}
persistable_vars = [
v.name
for v in filter(lambda var: var.persistable, program.list_vars())
]
for block in program.blocks:
for idx, op in enumerate(block.ops):
# check forward
if op.type in self.quantizable_op_and_inputs:
for i, arg_name in enumerate(op.input_arg_names):
quant_op_type = self.weight_quant_op_type if \
_original_var_name(arg_name) \
in persistable_vars else self.act_quant_op_type
self.assertTrue(
arg_name.endswith('.quantized.dequantized'))
if arg_name not in quantized_ops:
self.assertEqual(block.ops[idx - 2 * i - 1].type,
self.dequant_op_type)
self.assertEqual(block.ops[idx - 2 * i - 2].type,
quant_op_type)
quantized_ops[arg_name] = block.ops[idx - 2 * i - 2]
else:
op_idx = block.ops.index(quantized_ops[arg_name])
self.assertLess(op_idx, idx)
# check backward
if op.type in self.quantizable_op_grad_and_inputs:
for pname in self.quantizable_op_grad_and_inputs[op.type]:
arg_name = op.input(pname)[0]
self.assertTrue(
arg_name.endswith('.quantized.dequantized'))
self.assertTrue(arg_name in quantized_ops)
def linear_fc_quant(self, quant_type):
main = fluid.Program()
startup = fluid.Program()
with fluid.program_guard(main, startup):
loss = linear_fc(3)
opt = fluid.optimizer.Adam(learning_rate=0.001)
opt.minimize(loss)
t = QuantizeTranspiler(activation_quantize_type=quant_type)
t.training_transpile(main)
self.check_program(main)
def test_linear_fc_quant_abs_max(self):
self.act_quant_op_type = 'fake_quantize_abs_max'
self.linear_fc_quant('abs_max')
def test_linear_fc_quant_range_abs_max(self):
self.act_quant_op_type = 'fake_quantize_range_abs_max'
self.linear_fc_quant('range_abs_max')
def residual_block_quant(self, quant_type):
main = fluid.Program()
startup = fluid.Program()
with fluid.program_guard(main, startup):
loss = residual_block(2)
opt = fluid.optimizer.Adam(learning_rate=0.001)
opt.minimize(loss)
t = QuantizeTranspiler(activation_quantize_type=quant_type)
t.training_transpile(main)
self.check_program(main)
def test_residual_block_abs_max(self):
self.act_quant_op_type = 'fake_quantize_abs_max'
self.residual_block_quant('abs_max')
def test_residual_block_range_abs_max(self):
self.act_quant_op_type = 'fake_quantize_range_abs_max'
self.residual_block_quant('range_abs_max')
def freeze_program(self, use_cuda, seed):
def build_program(main, startup, is_test):
main.random_seed = seed
startup.random_seed = seed
with fluid.unique_name.guard():
with fluid.program_guard(main, startup):
img = fluid.layers.data(
name='image', shape=[1, 28, 28], dtype='float32')
label = fluid.layers.data(
name='label', shape=[1], dtype='int64')
loss = conv_net(img, label)
if not is_test:
opt = fluid.optimizer.Adam(learning_rate=0.001)
opt.minimize(loss)
return [img, label], loss
main = fluid.Program()
startup = fluid.Program()
test_program = fluid.Program()
import random
random.seed(0)
np.random.seed(0)
feeds, loss = build_program(main, startup, False)
build_program(test_program, startup, True)
test_program = test_program.clone(for_test=True)
quant_transpiler = QuantizeTranspiler()
quant_transpiler.training_transpile(main)
quant_transpiler.training_transpile(test_program)
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
iters = 5
batch_size = 8
class_num = 10
exe.run(startup)
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.mnist.train(), buf_size=500),
batch_size=batch_size)
test_reader = paddle.batch(
paddle.dataset.mnist.test(), batch_size=batch_size)
feeder = fluid.DataFeeder(feed_list=feeds, place=place)
with fluid.program_guard(main):
for _ in range(iters):
data = next(train_reader())
loss_v = exe.run(program=main,
feed=feeder.feed(data),
fetch_list=[loss])
with fluid.program_guard(test_program):
test_data = next(test_reader())
w_var = fluid.framework._get_var('conv2d_1.w_0.quantized',
test_program)
# Testing during training
test_loss1, w_quant = exe.run(program=test_program,
feed=feeder.feed(test_data),
fetch_list=[loss, w_var])
# Freeze program for inference, but the weight of fc/conv is still float type.
quant_transpiler.freeze_program(test_program, place)
test_loss2, = exe.run(program=test_program,
feed=feeder.feed(test_data),
fetch_list=[loss])
self.assertAlmostEqual(test_loss1, test_loss2, delta=5e-3)
w_freeze = np.array(fluid.global_scope().find_var('conv2d_1.w_0')
.get_tensor())
# fail: -432.0 != -433.0, this is due to the calculation precision
#self.assertAlmostEqual(np.sum(w_freeze), np.sum(w_quant))
# Convert parameter to 8-bit.
quant_transpiler.convert_to_int8(test_program, place)
# Save the 8-bit parameter and model file.
fluid.io.save_inference_model('model_8bit', ['image', 'label'],
[loss], exe, test_program)
# Test whether the 8-bit parameter and model file can be loaded successfully.
[infer, feed, fetch] = fluid.io.load_inference_model('model_8bit',
exe)
# Check the loaded 8-bit weight.
w_8bit = np.array(fluid.global_scope().find_var('conv2d_1.w_0.int8')
.get_tensor())
self.assertEqual(w_8bit.dtype, np.int8)
self.assertEqual(np.sum(w_8bit), np.sum(w_freeze))
def not_test_freeze_program_cuda(self):
if fluid.core.is_compiled_with_cuda():
with fluid.unique_name.guard():
self.freeze_program(True, seed=1)
def not_test_freeze_program_cpu(self):
with fluid.unique_name.guard():
self.freeze_program(False, seed=2)
if __name__ == '__main__':
unittest.main()
...@@ -18,7 +18,6 @@ import collections ...@@ -18,7 +18,6 @@ import collections
import contextlib import contextlib
import re import re
import six import six
import traceback
import numpy as np import numpy as np
...@@ -35,8 +34,6 @@ except ImportError as e: ...@@ -35,8 +34,6 @@ except ImportError as e:
except Exception as e: except Exception as e:
raise e raise e
from . import unique_name from . import unique_name
import os
PADDLE_ON_MODEL_CE = os.environ.get('PADDLE_ON_MODEL_CE', None) is not None
__all__ = [ __all__ = [
'Program', 'Program',
...@@ -490,8 +487,7 @@ class OpProtoHolder(object): ...@@ -490,8 +487,7 @@ class OpProtoHolder(object):
return { return {
core.op_proto_and_checker_maker.kOpRoleAttrName(), core.op_proto_and_checker_maker.kOpRoleAttrName(),
core.op_proto_and_checker_maker.kOpRoleVarAttrName(), core.op_proto_and_checker_maker.kOpRoleVarAttrName(),
core.op_proto_and_checker_maker.kOpNameScopeAttrName(), core.op_proto_and_checker_maker.kOpNameScopeAttrName()
core.op_proto_and_checker_maker.kOpCreationCallstackAttrName()
} }
...@@ -541,8 +537,7 @@ class Operator(object): ...@@ -541,8 +537,7 @@ class Operator(object):
'feed', 'fetch', 'save', 'load', 'recurrent', 'go', 'feed', 'fetch', 'save', 'load', 'recurrent', 'go',
'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', 'recv', 'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', 'recv',
'listen_and_serv', 'parallel_do', 'save_combine', 'load_combine', 'listen_and_serv', 'parallel_do', 'save_combine', 'load_combine',
'ncclInit', 'channel_create', 'channel_close', 'channel_send', 'ncclInit', 'select', 'checkpoint_notify', 'gen_nccl_id'
'channel_recv', 'select', 'checkpoint_notify', 'gen_nccl_id'
} }
def __init__(self, def __init__(self,
...@@ -574,11 +569,6 @@ class Operator(object): ...@@ -574,11 +569,6 @@ class Operator(object):
if role_var_name in op_attrs and len(op_attrs[role_var_name]) == 0: if role_var_name in op_attrs and len(op_attrs[role_var_name]) == 0:
del op_attrs[role_var_name] del op_attrs[role_var_name]
if not PADDLE_ON_MODEL_CE:
callstack_var_name = op_maker.kOpCreationCallstackAttrName()
op_attrs[callstack_var_name] = list(
reversed(traceback.format_stack()))[1:]
if len(self.desc.type()) != 0: if len(self.desc.type()) != 0:
return return
if type is None: if type is None:
......
...@@ -21,7 +21,7 @@ from .. import core ...@@ -21,7 +21,7 @@ from .. import core
from ..framework import Program, Variable, Operator from ..framework import Program, Variable, Operator
from ..layer_helper import LayerHelper, unique_name from ..layer_helper import LayerHelper, unique_name
from ..initializer import force_init_on_cpu from ..initializer import force_init_on_cpu
from .ops import logical_and, logical_not, logical_or from .nn import logical_and, logical_not, logical_or
import numpy import numpy
import warnings import warnings
import six import six
......
...@@ -42,19 +42,11 @@ __all__ = [ ...@@ -42,19 +42,11 @@ __all__ = [
'roi_perspective_transform', 'roi_perspective_transform',
'generate_proposal_labels', 'generate_proposal_labels',
'generate_proposals', 'generate_proposals',
]
__auto__ = [
'iou_similarity', 'iou_similarity',
'box_coder', 'box_coder',
'polygon_box_transform', 'polygon_box_transform',
] ]
__all__ += __auto__
for _OP in set(__auto__):
globals()[_OP] = generate_layer_fn(_OP)
def rpn_target_assign(bbox_pred, def rpn_target_assign(bbox_pred,
cls_logits, cls_logits,
...@@ -308,6 +300,101 @@ def detection_output(loc, ...@@ -308,6 +300,101 @@ def detection_output(loc,
return nmsed_outs return nmsed_outs
@templatedoc()
def iou_similarity(x, y, name=None):
"""
${comment}
Args:
x(${x_type}): ${x_comment}
y(${y_type}): ${y_comment}
Returns:
out(${out_type}): ${out_comment}
"""
helper = LayerHelper("iou_similarity", **locals())
if name is None:
out = helper.create_tmp_variable(dtype=x.dtype)
else:
out = helper.create_variable(
name=name, dtype=x.dtype, persistable=False)
helper.append_op(
type="iou_similarity",
inputs={"X": x,
"Y": y},
attrs={},
outputs={"Out": out})
return out
@templatedoc()
def box_coder(prior_box,
prior_box_var,
target_box,
code_type="encode_center_size",
box_normalized=True,
name=None):
"""
${comment}
Args:
prior_box(${prior_box_type}): ${prior_box_comment}
prior_box_var(${prior_box_var_type}): ${prior_box_var_comment}
target_box(${target_box_type}): ${target_box_comment}
code_type(${code_type_type}): ${code_type_comment}
box_normalized(${box_normalized_type}): ${box_normalized_comment}
Returns:
output_box(${output_box_type}): ${output_box_comment}
"""
helper = LayerHelper("box_coder", **locals())
if name is None:
output_box = helper.create_tmp_variable(dtype=prior_box.dtype)
else:
output_box = helper.create_variable(
name=name, dtype=prior_box.dtype, persistable=False)
helper.append_op(
type="box_coder",
inputs={
"PriorBox": prior_box,
"PriorBoxVar": prior_box_var,
"TargetBox": target_box
},
attrs={"code_type": code_type,
"box_normalized": box_normalized},
outputs={"OutputBox": output_box})
return output_box
@templatedoc()
def polygon_box_transform(input, name=None):
"""
${comment}
Args:
input(${input_type}): ${input_comment}
Returns:
output(${output_type}): ${output_comment}
"""
helper = LayerHelper("polygon_box_transform", **locals())
if name is None:
output = helper.create_tmp_variable(dtype=input.dtype)
else:
output = helper.create_variable(
name=name, dtype=prior_box.input, persistable=False)
helper.append_op(
type="polygon_box_transform",
inputs={"Input": input},
attrs={},
outputs={"Output": output})
return output
@templatedoc() @templatedoc()
def detection_map(detect_res, def detection_map(detect_res,
label, label,
......
...@@ -660,7 +660,6 @@ def py_reader(capacity, ...@@ -660,7 +660,6 @@ def py_reader(capacity,
1. The basic usage of :code:`py_reader` is as follows: 1. The basic usage of :code:`py_reader` is as follows:
>>> import paddle.v2
>>> import paddle.fluid as fluid >>> import paddle.fluid as fluid
>>> import paddle.dataset.mnist as mnist >>> import paddle.dataset.mnist as mnist
>>> >>>
...@@ -668,7 +667,7 @@ def py_reader(capacity, ...@@ -668,7 +667,7 @@ def py_reader(capacity,
>>> shapes=[(-1,3,224,224), (-1,1)], >>> shapes=[(-1,3,224,224), (-1,1)],
>>> dtypes=['float32', 'int64']) >>> dtypes=['float32', 'int64'])
>>> reader.decorate_paddle_reader( >>> reader.decorate_paddle_reader(
>>> paddle.v2.reader.shuffle(paddle.batch(mnist.train()) >>> paddle.reader.shuffle(paddle.batch(mnist.train())
>>> >>>
>>> img, label = fluid.layers.read_file(reader) >>> img, label = fluid.layers.read_file(reader)
>>> loss = network(img, label) # some network definition >>> loss = network(img, label) # some network definition
...@@ -687,7 +686,6 @@ def py_reader(capacity, ...@@ -687,7 +686,6 @@ def py_reader(capacity,
2. When training and testing are both performed, two different 2. When training and testing are both performed, two different
:code:`py_reader` should be created with different names, e.g.: :code:`py_reader` should be created with different names, e.g.:
>>> import paddle.v2
>>> import paddle.fluid as fluid >>> import paddle.fluid as fluid
>>> import paddle.dataset.mnist as mnist >>> import paddle.dataset.mnist as mnist
>>> >>>
...@@ -701,7 +699,7 @@ def py_reader(capacity, ...@@ -701,7 +699,7 @@ def py_reader(capacity,
>>> dtypes=['float32', 'int64'], >>> dtypes=['float32', 'int64'],
>>> name='train_reader') >>> name='train_reader')
>>> train_reader.decorate_paddle_reader( >>> train_reader.decorate_paddle_reader(
>>> paddle.v2.reader.shuffle(paddle.batch(mnist.train()) >>> paddle.reader.shuffle(paddle.batch(mnist.train())
>>> >>>
>>> test_reader = fluid.layers.py_reader(capacity=32, >>> test_reader = fluid.layers.py_reader(capacity=32,
>>> shapes=[(-1,3,224,224), (-1,1)], >>> shapes=[(-1,3,224,224), (-1,1)],
......
...@@ -29,29 +29,127 @@ from .. import unique_name ...@@ -29,29 +29,127 @@ from .. import unique_name
from functools import reduce from functools import reduce
__all__ = [ __all__ = [
'fc', 'embedding', 'dynamic_lstm', 'dynamic_lstmp', 'dynamic_gru', 'fc',
'gru_unit', 'linear_chain_crf', 'crf_decoding', 'cos_sim', 'cross_entropy', 'embedding',
'square_error_cost', 'chunk_eval', 'sequence_conv', 'conv2d', 'conv3d', 'dynamic_lstm',
'sequence_pool', 'sequence_softmax', 'softmax', 'pool2d', 'pool3d', 'dynamic_lstmp',
'batch_norm', 'beam_search_decode', 'conv2d_transpose', 'conv3d_transpose', 'dynamic_gru',
'sequence_expand', 'sequence_expand_as', 'sequence_pad', 'lstm_unit', 'gru_unit',
'reduce_sum', 'reduce_mean', 'reduce_max', 'reduce_min', 'reduce_prod', 'linear_chain_crf',
'sequence_first_step', 'sequence_last_step', 'dropout', 'split', 'crf_decoding',
'ctc_greedy_decoder', 'edit_distance', 'l2_normalize', 'matmul', 'topk', 'cos_sim',
'warpctc', 'sequence_reshape', 'transpose', 'im2sequence', 'nce', 'cross_entropy',
'hsigmoid', 'beam_search', 'row_conv', 'multiplex', 'layer_norm', 'square_error_cost',
'softmax_with_cross_entropy', 'smooth_l1', 'one_hot', 'chunk_eval',
'autoincreased_step_counter', 'reshape', 'squeeze', 'unsqueeze', 'sequence_conv',
'lod_reset', 'lrn', 'pad', 'pad_constant_like', 'label_smooth', 'roi_pool', 'conv2d',
'dice_loss', 'image_resize', 'image_resize_short', 'resize_bilinear', 'conv3d',
'gather', 'scatter', 'sequence_scatter', 'random_crop', 'mean_iou', 'relu', 'sequence_pool',
'log', 'crop', 'rank_loss', 'elu', 'relu6', 'pow', 'stanh', 'hard_sigmoid', 'sequence_softmax',
'swish', 'prelu', 'brelu', 'leaky_relu', 'soft_relu', 'flatten', 'softmax',
'sequence_mask', 'stack', 'pad2d', 'unstack', 'sequence_enumerate', 'pool2d',
'expand', 'sequence_concat', 'scale', 'elementwise_add', 'elementwise_div', 'pool3d',
'elementwise_sub', 'elementwise_mul', 'elementwise_max', 'elementwise_min', 'batch_norm',
'elementwise_pow', 'uniform_random_batch_size_like', 'gaussian_random', 'beam_search_decode',
'sampling_id', 'gaussian_random_batch_size_like', 'sum', 'slice', 'shape' 'conv2d_transpose',
'conv3d_transpose',
'sequence_expand',
'sequence_expand_as',
'sequence_pad',
'lstm_unit',
'reduce_sum',
'reduce_mean',
'reduce_max',
'reduce_min',
'reduce_prod',
'sequence_first_step',
'sequence_last_step',
'dropout',
'split',
'ctc_greedy_decoder',
'edit_distance',
'l2_normalize',
'matmul',
'topk',
'warpctc',
'sequence_reshape',
'transpose',
'im2sequence',
'nce',
'hsigmoid',
'beam_search',
'row_conv',
'multiplex',
'layer_norm',
'softmax_with_cross_entropy',
'smooth_l1',
'one_hot',
'autoincreased_step_counter',
'reshape',
'squeeze',
'unsqueeze',
'lod_reset',
'lrn',
'pad',
'pad_constant_like',
'label_smooth',
'roi_pool',
'dice_loss',
'image_resize',
'image_resize_short',
'resize_bilinear',
'gather',
'scatter',
'sequence_scatter',
'random_crop',
'mean_iou',
'relu',
'log',
'crop',
'rank_loss',
'elu',
'relu6',
'pow',
'stanh',
'hard_sigmoid',
'swish',
'prelu',
'brelu',
'leaky_relu',
'soft_relu',
'flatten',
'sequence_mask',
'stack',
'pad2d',
'unstack',
'sequence_enumerate',
'expand',
'sequence_concat',
'scale',
'elementwise_add',
'elementwise_div',
'elementwise_sub',
'elementwise_mul',
'elementwise_max',
'elementwise_min',
'elementwise_pow',
'uniform_random_batch_size_like',
'gaussian_random',
'sampling_id',
'gaussian_random_batch_size_like',
'sum',
'slice',
'shape',
'logical_and',
'logical_or',
'logical_xor',
'logical_not',
'clip',
'clip_by_norm',
'mean',
'mul',
'sigmoid_cross_entropy_with_logits',
'maxout',
] ]
...@@ -60,7 +158,6 @@ def fc(input, ...@@ -60,7 +158,6 @@ def fc(input,
num_flatten_dims=1, num_flatten_dims=1,
param_attr=None, param_attr=None,
bias_attr=None, bias_attr=None,
use_mkldnn=False,
act=None, act=None,
is_test=False, is_test=False,
name=None): name=None):
...@@ -112,8 +209,6 @@ def fc(input, ...@@ -112,8 +209,6 @@ def fc(input,
If it is set to None, the bias is initialized zero. Default: None. If it is set to None, the bias is initialized zero. Default: None.
act (str, default None): Activation to be applied to the output of this layer. act (str, default None): Activation to be applied to the output of this layer.
is_test(bool): A flag indicating whether execution is in test phase. is_test(bool): A flag indicating whether execution is in test phase.
use_mkldnn(bool): Use mkldnn kernel or not, it is valid only when the mkldnn
library is installed. Default: False
name (str, default None): The name of this layer. name (str, default None): The name of this layer.
Returns: Returns:
...@@ -160,7 +255,7 @@ def fc(input, ...@@ -160,7 +255,7 @@ def fc(input,
type="sum", type="sum",
inputs={"X": mul_results}, inputs={"X": mul_results},
outputs={"Out": pre_bias}, outputs={"Out": pre_bias},
attrs={"use_mkldnn": use_mkldnn}) attrs={"use_mkldnn": False})
# add bias # add bias
pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims) pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims)
# add activation # add activation
...@@ -953,8 +1048,8 @@ def cross_entropy(input, label, soft_label=False, ignore_index=-100): ...@@ -953,8 +1048,8 @@ def cross_entropy(input, label, soft_label=False, ignore_index=-100):
soft_label (bool): a flag indicating whether to soft_label (bool): a flag indicating whether to
interpretate the given labels as soft interpretate the given labels as soft
labels. Default: `False`. labels. Default: `False`.
ignore_index (int): Specifies a target value that is ignored and does ignore_index (int): Specifies a target value that is ignored and does
not contribute to the input gradient. Only valid not contribute to the input gradient. Only valid
if soft_label is set to False. Default: -100 if soft_label is set to False. Default: -100
Returns: Returns:
...@@ -1324,7 +1419,6 @@ def conv2d(input, ...@@ -1324,7 +1419,6 @@ def conv2d(input,
param_attr=None, param_attr=None,
bias_attr=None, bias_attr=None,
use_cudnn=True, use_cudnn=True,
use_mkldnn=False,
act=None, act=None,
name=None): name=None):
""" """
...@@ -1402,8 +1496,6 @@ def conv2d(input, ...@@ -1402,8 +1496,6 @@ def conv2d(input,
bias_attr (ParamAttr): Bias parameter for the Conv2d layer. Default: None bias_attr (ParamAttr): Bias parameter for the Conv2d layer. Default: None
use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
library is installed. Default: True library is installed. Default: True
use_mkldnn (bool): Use mkldnn kernels or not, it is valid only when compiled
with mkldnn library. Default: False
act (str): Activation type. Default: None act (str): Activation type. Default: None
name (str|None): A name for this layer(optional). If set None, the layer name (str|None): A name for this layer(optional). If set None, the layer
will be named automatically. will be named automatically.
...@@ -1476,7 +1568,7 @@ def conv2d(input, ...@@ -1476,7 +1568,7 @@ def conv2d(input,
'dilations': dilation, 'dilations': dilation,
'groups': groups, 'groups': groups,
'use_cudnn': use_cudnn, 'use_cudnn': use_cudnn,
'use_mkldnn': use_mkldnn 'use_mkldnn': False
}) })
pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2) pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2)
...@@ -1494,7 +1586,6 @@ def conv3d(input, ...@@ -1494,7 +1586,6 @@ def conv3d(input,
param_attr=None, param_attr=None,
bias_attr=None, bias_attr=None,
use_cudnn=True, use_cudnn=True,
use_mkldnn=False,
act=None, act=None,
name=None): name=None):
""" """
...@@ -1568,7 +1659,6 @@ def conv3d(input, ...@@ -1568,7 +1659,6 @@ def conv3d(input,
bias_attr (ParamAttr): Bias parameter for the Conv3d layer. Default: None bias_attr (ParamAttr): Bias parameter for the Conv3d layer. Default: None
use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
library is installed. Default: True library is installed. Default: True
use_mkldnn (bool): Use mkldnn kernels or not.
act (str): Activation type. Default: None act (str): Activation type. Default: None
name (str|None): A name for this layer(optional). If set None, the layer name (str|None): A name for this layer(optional). If set None, the layer
will be named automatically. will be named automatically.
...@@ -1638,7 +1728,7 @@ def conv3d(input, ...@@ -1638,7 +1728,7 @@ def conv3d(input,
'dilations': dilation, 'dilations': dilation,
'groups': groups, 'groups': groups,
'use_cudnn': use_cudnn, 'use_cudnn': use_cudnn,
'use_mkldnn': use_mkldnn 'use_mkldnn': False
}) })
pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2) pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2)
...@@ -1820,7 +1910,6 @@ def pool2d(input, ...@@ -1820,7 +1910,6 @@ def pool2d(input,
global_pooling=False, global_pooling=False,
use_cudnn=True, use_cudnn=True,
ceil_mode=False, ceil_mode=False,
use_mkldnn=False,
name=None): name=None):
""" """
${comment} ${comment}
...@@ -1838,7 +1927,6 @@ def pool2d(input, ...@@ -1838,7 +1927,6 @@ def pool2d(input,
global_pooling: ${global_pooling_comment} global_pooling: ${global_pooling_comment}
use_cudnn: ${use_cudnn_comment} use_cudnn: ${use_cudnn_comment}
ceil_mode: ${ceil_mode_comment} ceil_mode: ${ceil_mode_comment}
use_mkldnn: ${use_mkldnn_comment}
name (str|None): A name for this layer(optional). If set None, the name (str|None): A name for this layer(optional). If set None, the
layer will be named automatically. layer will be named automatically.
...@@ -1898,7 +1986,7 @@ def pool2d(input, ...@@ -1898,7 +1986,7 @@ def pool2d(input,
"paddings": pool_padding, "paddings": pool_padding,
"use_cudnn": use_cudnn, "use_cudnn": use_cudnn,
"ceil_mode": ceil_mode, "ceil_mode": ceil_mode,
"use_mkldnn": use_mkldnn "use_mkldnn": False
}) })
return pool_out return pool_out
...@@ -1912,7 +2000,6 @@ def pool3d(input, ...@@ -1912,7 +2000,6 @@ def pool3d(input,
global_pooling=False, global_pooling=False,
use_cudnn=True, use_cudnn=True,
ceil_mode=False, ceil_mode=False,
use_mkldnn=False,
name=None): name=None):
""" """
This function adds the operator for pooling in 3-dimensions, using the This function adds the operator for pooling in 3-dimensions, using the
...@@ -1927,7 +2014,6 @@ def pool3d(input, ...@@ -1927,7 +2014,6 @@ def pool3d(input,
global_pooling (bool): ${global_pooling_comment} global_pooling (bool): ${global_pooling_comment}
use_cudnn (bool): ${use_cudnn_comment} use_cudnn (bool): ${use_cudnn_comment}
ceil_mode (bool): ${ceil_mode_comment} ceil_mode (bool): ${ceil_mode_comment}
use_mkldnn (bool): ${use_mkldnn_comment}
name (str): A name for this layer(optional). If set None, the layer name (str): A name for this layer(optional). If set None, the layer
will be named automatically. will be named automatically.
...@@ -1968,7 +2054,7 @@ def pool3d(input, ...@@ -1968,7 +2054,7 @@ def pool3d(input,
"paddings": pool_padding, "paddings": pool_padding,
"use_cudnn": use_cudnn, "use_cudnn": use_cudnn,
"ceil_mode": ceil_mode, "ceil_mode": ceil_mode,
"use_mkldnn": use_mkldnn "use_mkldnn": False
}) })
return pool_out return pool_out
...@@ -1983,7 +2069,6 @@ def batch_norm(input, ...@@ -1983,7 +2069,6 @@ def batch_norm(input,
bias_attr=None, bias_attr=None,
data_layout='NCHW', data_layout='NCHW',
in_place=False, in_place=False,
use_mkldnn=False,
name=None, name=None,
moving_mean_name=None, moving_mean_name=None,
moving_variance_name=None, moving_variance_name=None,
...@@ -2025,7 +2110,6 @@ def batch_norm(input, ...@@ -2025,7 +2110,6 @@ def batch_norm(input,
bias_attr(ParamAttr): The parameter attribute for Parameter `bias`. bias_attr(ParamAttr): The parameter attribute for Parameter `bias`.
data_layout(string, default NCHW): NCHW|NHWC data_layout(string, default NCHW): NCHW|NHWC
in_place(bool, Default False): Make the input and output of batch norm reuse memory. in_place(bool, Default False): Make the input and output of batch norm reuse memory.
use_mkldnn(bool, Default false): ${use_mkldnn_comment}
name(string, Default None): A name for this layer(optional). If set None, the layer name(string, Default None): A name for this layer(optional). If set None, the layer
will be named automatically. will be named automatically.
moving_mean_name(string, Default None): The name of moving_mean which store the global Mean. moving_mean_name(string, Default None): The name of moving_mean which store the global Mean.
...@@ -2117,7 +2201,7 @@ def batch_norm(input, ...@@ -2117,7 +2201,7 @@ def batch_norm(input,
"momentum": momentum, "momentum": momentum,
"epsilon": epsilon, "epsilon": epsilon,
"is_test": is_test, "is_test": is_test,
"use_mkldnn": use_mkldnn, "use_mkldnn": False,
"fuse_with_relu": fuse_with_relu "fuse_with_relu": fuse_with_relu
}) })
...@@ -2714,20 +2798,20 @@ def sequence_pad(x, pad_value, maxlen=None): ...@@ -2714,20 +2798,20 @@ def sequence_pad(x, pad_value, maxlen=None):
Args: Args:
x(Variable): Input variable which should contain lod information. x(Variable): Input variable which should contain lod information.
pad_value(Variable): The Variable that holds values that will be fill pad_value(Variable): The Variable that holds values that will be fill
into padded steps. It can be a scalar or a tensor whose shape into padded steps. It can be a scalar or a tensor whose shape
equals to time steps in sequences. If it's a scalar, it will be equals to time steps in sequences. If it's a scalar, it will be
automatically broadcasted to the shape of time step. automatically broadcasted to the shape of time step.
maxlen(int, default None): The length of padded sequences. It can be maxlen(int, default None): The length of padded sequences. It can be
None or any positive int. When it is None, all sequences will be None or any positive int. When it is None, all sequences will be
padded up to the length of the longest one among them; when it a padded up to the length of the longest one among them; when it a
certain positive value, it must be greater than the length of the certain positive value, it must be greater than the length of the
longest original sequence." longest original sequence."
Returns: Returns:
Variable: The padded sequence batch and the original lengths before Variable: The padded sequence batch and the original lengths before
padding. All sequences has the same length. padding. All sequences has the same length.
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -4343,8 +4427,8 @@ def softmax_with_cross_entropy(logits, ...@@ -4343,8 +4427,8 @@ def softmax_with_cross_entropy(logits,
soft_label is set to true, Label is a Tensor<float/double> with soft_label is set to true, Label is a Tensor<float/double> with
soft_label (bool): A flag to indicate whether to interpretate the given soft_label (bool): A flag to indicate whether to interpretate the given
labels as soft labels. By default, `soft_label` is set to False. labels as soft labels. By default, `soft_label` is set to False.
ignore_index (int): Specifies a target value that is ignored and does ignore_index (int): Specifies a target value that is ignored and does
not contribute to the input gradient. Only valid not contribute to the input gradient. Only valid
if soft_label is set to False. Default: -100 if soft_label is set to False. Default: -100
Returns: Returns:
...@@ -4601,14 +4685,14 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None): ...@@ -4601,14 +4685,14 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
def squeeze(input, axes, name=None): def squeeze(input, axes, name=None):
""" """
Remove single-dimensional entries from the shape of a tensor. Takes a Remove single-dimensional entries from the shape of a tensor. Takes a
parameter axes with a list of axes to squeeze. If axes is not provided, all parameter axes with a list of axes to squeeze. If axes is not provided, all
the single dimensions will be removed from the shape. If an axis is the single dimensions will be removed from the shape. If an axis is
selected with shape entry not equal to one, an error is raised. selected with shape entry not equal to one, an error is raised.
Examples: Examples:
Case 1: Case 1:
Given Given
X.shape = (1, 3, 1, 5) X.shape = (1, 3, 1, 5)
and and
axes = [0] axes = [0]
...@@ -4617,11 +4701,11 @@ def squeeze(input, axes, name=None): ...@@ -4617,11 +4701,11 @@ def squeeze(input, axes, name=None):
Case 2: Case 2:
Given Given
X.shape = (1, 3, 1, 5) X.shape = (1, 3, 1, 5)
and and
axes = [] axes = []
we get: we get:
Out.shape = (3, 5) Out.shape = (3, 5)
Args: Args:
input (Variable): The input variable to be squeezed. input (Variable): The input variable to be squeezed.
axes (list): List of integers, indicating the dimensions to be squeezed. axes (list): List of integers, indicating the dimensions to be squeezed.
...@@ -4651,14 +4735,14 @@ def squeeze(input, axes, name=None): ...@@ -4651,14 +4735,14 @@ def squeeze(input, axes, name=None):
def unsqueeze(input, axes, name=None): def unsqueeze(input, axes, name=None):
""" """
Insert single-dimensional entries to the shape of a tensor. Takes one Insert single-dimensional entries to the shape of a tensor. Takes one
required argument axes, a list of dimensions that will be inserted. required argument axes, a list of dimensions that will be inserted.
Dimension indices in axes are as seen in the output tensor. Dimension indices in axes are as seen in the output tensor.
For example: For example:
Given a tensor such that tensor with shape [3, 4, 5], Given a tensor such that tensor with shape [3, 4, 5],
then Unsqueezed tensor with axes=[0, 4] has shape [1, 3, 4, 5, 1]. then Unsqueezed tensor with axes=[0, 4] has shape [1, 3, 4, 5, 1].
Args: Args:
input (Variable): The input variable to be unsqueezed. input (Variable): The input variable to be unsqueezed.
axes (list): List of integers, indicating the dimensions to be inserted. axes (list): List of integers, indicating the dimensions to be inserted.
...@@ -5757,39 +5841,39 @@ def pad2d(input, ...@@ -5757,39 +5841,39 @@ def pad2d(input,
Example: Example:
Given that X is a channel of image from input: Given that X is a channel of image from input:
X = [[1, 2, 3], X = [[1, 2, 3],
[4, 5, 6]] [4, 5, 6]]
Case 0: Case 0:
paddings = [0, 1, 2, 3], paddings = [0, 1, 2, 3],
mode = 'constant' mode = 'constant'
pad_value = 0 pad_value = 0
Out = [[0, 0, 1, 2, 3, 0, 0, 0] Out = [[0, 0, 1, 2, 3, 0, 0, 0]
[0, 0, 4, 5, 6, 0, 0, 0] [0, 0, 4, 5, 6, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0]] [0, 0, 0, 0, 0, 0, 0, 0]]
Case 1: Case 1:
paddings = [0, 1, 2, 1], paddings = [0, 1, 2, 1],
mode = 'reflect' mode = 'reflect'
Out = [[3, 2, 1, 2, 3, 2] Out = [[3, 2, 1, 2, 3, 2]
[6, 5, 4, 5, 6, 5] [6, 5, 4, 5, 6, 5]
[3, 2, 1, 2, 3, 2]] [3, 2, 1, 2, 3, 2]]
Case 2: Case 2:
paddings = [0, 1, 2, 1], paddings = [0, 1, 2, 1],
mode = 'edge' mode = 'edge'
Out = [[1, 1, 1, 2, 3, 3] Out = [[1, 1, 1, 2, 3, 3]
[4, 4, 4, 5, 6, 6] [4, 4, 4, 5, 6, 6]
[4, 4, 4, 5, 6, 6]] [4, 4, 4, 5, 6, 6]]
Args: Args:
input (Variable): The input image with [N, C, H, W] format or [N, H, W, C] format. input (Variable): The input image with [N, C, H, W] format or [N, H, W, C] format.
paddings (tuple|list): The padding size. If padding is a tuple, it must paddings (tuple|list): The padding size. If padding is a tuple, it must
...@@ -5988,7 +6072,7 @@ def prelu(x, mode, param_attr=None, name=None): ...@@ -5988,7 +6072,7 @@ def prelu(x, mode, param_attr=None, name=None):
channel:elements in a channel share same weight channel:elements in a channel share same weight
element:each element has a weight element:each element has a weight
name(str|None): A name for this layer(optional). If set None, the layer name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically. will be named automatically.
Returns: Returns:
Variable: The output tensor with the same shape as input. Variable: The output tensor with the same shape as input.
...@@ -6166,10 +6250,10 @@ def flatten(x, axis=1, name=None): ...@@ -6166,10 +6250,10 @@ def flatten(x, axis=1, name=None):
def sequence_enumerate(input, win_size, pad_value=0, name=None): def sequence_enumerate(input, win_size, pad_value=0, name=None):
""" """
Generate a new sequence for the input index sequence, which enumerates all the Generate a new sequence for the input index sequence, which enumerates all the
sub-sequences with length `win_size` of the input. sub-sequences with length `win_size` of the input.
The enumerated sequence has the same 1st dimension with variable `input`, and The enumerated sequence has the same 1st dimension with variable `input`, and
the 2nd dimension is `win_size`, padded by `pad_value` if necessary in generation. the 2nd dimension is `win_size`, padded by `pad_value` if necessary in generation.
Examples: Examples:
Case 1: Case 1:
Input: Input:
...@@ -6296,20 +6380,20 @@ def unstack(x, axis=0, num=None): ...@@ -6296,20 +6380,20 @@ def unstack(x, axis=0, num=None):
**UnStack Layer** **UnStack Layer**
This layer unstacks input :code:`x` into several tensors along axis. This layer unstacks input :code:`x` into several tensors along axis.
If :code:`axis` < 0, it would be replaced with :code:`axis+rank(x)`. If :code:`axis` < 0, it would be replaced with :code:`axis+rank(x)`.
If :code:`num` is None, it would be inferred from :code:`x.shape[axis]`, If :code:`num` is None, it would be inferred from :code:`x.shape[axis]`,
and if :code:`x.shape[axis]` <= 0 or is unknown, :code:`ValueError` is and if :code:`x.shape[axis]` <= 0 or is unknown, :code:`ValueError` is
raised. raised.
Args: Args:
x (Variable): Input variable. x (Variable): Input variable.
axis (int): The axis along which the input is unstacked. axis (int): The axis along which the input is unstacked.
num (int|None): The number of output variables. num (int|None): The number of output variables.
Returns: Returns:
list(Variable): The unstacked variables. list(Variable): The unstacked variables.
""" """
helper = LayerHelper('unstack', **locals()) helper = LayerHelper('unstack', **locals())
...@@ -6342,21 +6426,21 @@ def expand(x, expand_times, name=None): ...@@ -6342,21 +6426,21 @@ def expand(x, expand_times, name=None):
.. code-block:: text .. code-block:: text
Input(X) is a 3-D tensor with shape [2, 3, 1]: Input(X) is a 3-D tensor with shape [2, 3, 1]:
[ [
[[1], [2], [3]], [[1], [2], [3]],
[[4], [5], [6]] [[4], [5], [6]]
] ]
Attr(expand_times): [1, 2, 2] Attr(expand_times): [1, 2, 2]
Output(Out) is a 3-D tensor with shape [2, 6, 2]: Output(Out) is a 3-D tensor with shape [2, 6, 2]:
[ [
[[1, 1], [2, 2], [3, 3], [1, 1], [2, 2], [3, 3]], [[1, 1], [2, 2], [3, 3], [1, 1], [2, 2], [3, 3]],
[[4, 4], [5, 5], [6, 6], [4, 4], [5, 5], [6, 6]] [[4, 4], [5, 5], [6, 6], [4, 4], [5, 5], [6, 6]]
] ]
Args: Args:
x (Variable): A tensor with rank in [1, 6]. x (Variable): A tensor with rank in [1, 6].
expand_times (list|tuple): Expand times number for each dimension. expand_times (list|tuple): Expand times number for each dimension.
...@@ -6432,12 +6516,7 @@ def uniform_random_batch_size_like(input, ...@@ -6432,12 +6516,7 @@ def uniform_random_batch_size_like(input,
@templatedoc() @templatedoc()
def gaussian_random(shape, def gaussian_random(shape, mean=0.0, std=1.0, seed=0, dtype='float32'):
mean=0.0,
std=1.0,
seed=0,
dtype='float32',
use_mkldnn=False):
""" """
${comment} ${comment}
...@@ -6447,7 +6526,6 @@ def gaussian_random(shape, ...@@ -6447,7 +6526,6 @@ def gaussian_random(shape,
std (Float): ${std_comment} std (Float): ${std_comment}
seed (Int): ${seed_comment} seed (Int): ${seed_comment}
dtype(np.dtype|core.VarDesc.VarType|str): Output data type. dtype(np.dtype|core.VarDesc.VarType|str): Output data type.
use_mkldnn (Bool): Only used in mkldnn kernel.
Returns: Returns:
out (Variable): ${out_comment} out (Variable): ${out_comment}
...@@ -6466,7 +6544,7 @@ def gaussian_random(shape, ...@@ -6466,7 +6544,7 @@ def gaussian_random(shape,
'std': std, 'std': std,
'seed': seed, 'seed': seed,
'dtype': c_dtype, 'dtype': c_dtype,
'use_mkldnn': use_mkldnn 'use_mkldnn': False
}) })
return out return out
...@@ -6549,13 +6627,12 @@ def gaussian_random_batch_size_like(input, ...@@ -6549,13 +6627,12 @@ def gaussian_random_batch_size_like(input,
@templatedoc() @templatedoc()
def sum(x, use_mkldnn=False): def sum(x):
""" """
${comment} ${comment}
Args: Args:
x (Variable): ${x_comment} x (Variable): ${x_comment}
use_mkldnn (Bool): ${use_mkldnn_comment}
Returns: Returns:
out (Variable): ${out_comment} out (Variable): ${out_comment}
...@@ -6567,7 +6644,7 @@ def sum(x, use_mkldnn=False): ...@@ -6567,7 +6644,7 @@ def sum(x, use_mkldnn=False):
type='sum', type='sum',
inputs={'X': x}, inputs={'X': x},
outputs={'Out': out}, outputs={'Out': out},
attrs={'use_mkldnn': use_mkldnn}) attrs={'use_mkldnn': False})
return out return out
...@@ -6630,14 +6707,12 @@ def _elementwise_op(helper): ...@@ -6630,14 +6707,12 @@ def _elementwise_op(helper):
assert y is not None, 'y cannot be None in {}'.format(op_type) assert y is not None, 'y cannot be None in {}'.format(op_type)
axis = helper.kwargs.get('axis', -1) axis = helper.kwargs.get('axis', -1)
use_mkldnn = helper.kwargs.get('use_mkldnn', False) use_mkldnn = helper.kwargs.get('use_mkldnn', False)
out = helper.kwargs.get('out', None) name = helper.kwargs.get('name', None)
if out is None: if name is None:
name = helper.kwargs.get('name', None) out = helper.create_tmp_variable(dtype=x.dtype)
if name is None: else:
out = helper.create_tmp_variable(dtype=x.dtype) out = helper.create_variable(
else: name=name, dtype=x.dtype, persistable=False)
out = helper.create_variable(
name=name, dtype=x.dtype, persistable=False)
helper.append_op( helper.append_op(
type=op_type, type=op_type,
...@@ -6650,13 +6725,7 @@ def _elementwise_op(helper): ...@@ -6650,13 +6725,7 @@ def _elementwise_op(helper):
@templatedoc() @templatedoc()
def scale(x, def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None):
scale=1.0,
bias=0.0,
bias_after_scale=True,
out=None,
act=None,
name=None):
""" """
${comment} ${comment}
...@@ -6665,21 +6734,19 @@ def scale(x, ...@@ -6665,21 +6734,19 @@ def scale(x,
scale(${scale_type}): ${scale_comment} scale(${scale_type}): ${scale_comment}
bias(${bias_type}): ${bias_comment} bias(${bias_type}): ${bias_comment}
bias_after_scale(${bias_after_scale_type}): ${bias_after_scale_comment} bias_after_scale(${bias_after_scale_type}): ${bias_after_scale_comment}
out(Tensor): Output tensor.
act(basestring|None): Activation applied to the output. act(basestring|None): Activation applied to the output.
name(basestring|None): Name of the output. name(basestring|None): Name of the output.
Returns: Returns:
out(${out_type}): ${out_comment} out(${out_type}): ${out_comment}
""" """
helper = LayerHelper('scale', **locals()) helper = LayerHelper('scale', **locals())
if out is None: if name is None:
if name is None: out = helper.create_tmp_variable(dtype=x.dtype)
out = helper.create_tmp_variable(dtype=x.dtype) else:
else: out = helper.create_variable(
out = helper.create_variable( name=name, dtype=x.dtype, persistable=False)
name=name, dtype=x.dtype, persistable=False)
helper.append_op( helper.append_op(
type='scale', type='scale',
...@@ -6693,73 +6760,31 @@ def scale(x, ...@@ -6693,73 +6760,31 @@ def scale(x,
return helper.append_activation(out) return helper.append_activation(out)
def elementwise_add(x, def elementwise_add(x, y, axis=-1, act=None, name=None):
y,
out=None,
axis=-1,
use_mkldnn=False,
act=None,
name=None):
return _elementwise_op(LayerHelper('elementwise_add', **locals())) return _elementwise_op(LayerHelper('elementwise_add', **locals()))
def elementwise_div(x, def elementwise_div(x, y, axis=-1, act=None, name=None):
y,
out=None,
axis=-1,
use_mkldnn=False,
act=None,
name=None):
return _elementwise_op(LayerHelper('elementwise_div', **locals())) return _elementwise_op(LayerHelper('elementwise_div', **locals()))
def elementwise_sub(x, def elementwise_sub(x, y, axis=-1, act=None, name=None):
y,
out=None,
axis=-1,
use_mkldnn=False,
act=None,
name=None):
return _elementwise_op(LayerHelper('elementwise_sub', **locals())) return _elementwise_op(LayerHelper('elementwise_sub', **locals()))
def elementwise_mul(x, def elementwise_mul(x, y, axis=-1, act=None, name=None):
y,
out=None,
axis=-1,
use_mkldnn=False,
act=None,
name=None):
return _elementwise_op(LayerHelper('elementwise_mul', **locals())) return _elementwise_op(LayerHelper('elementwise_mul', **locals()))
def elementwise_max(x, def elementwise_max(x, y, axis=-1, act=None, name=None):
y,
out=None,
axis=-1,
use_mkldnn=False,
act=None,
name=None):
return _elementwise_op(LayerHelper('elementwise_max', **locals())) return _elementwise_op(LayerHelper('elementwise_max', **locals()))
def elementwise_min(x, def elementwise_min(x, y, axis=-1, act=None, name=None):
y,
out=None,
axis=-1,
use_mkldnn=False,
act=None,
name=None):
return _elementwise_op(LayerHelper('elementwise_min', **locals())) return _elementwise_op(LayerHelper('elementwise_min', **locals()))
def elementwise_pow(x, def elementwise_pow(x, y, axis=-1, act=None, name=None):
y,
out=None,
axis=-1,
use_mkldnn=False,
act=None,
name=None):
return _elementwise_op(LayerHelper('elementwise_pow', **locals())) return _elementwise_op(LayerHelper('elementwise_pow', **locals()))
...@@ -6771,7 +6796,291 @@ for func in [ ...@@ -6771,7 +6796,291 @@ for func in [
func.__doc__ = _generate_doc_string_( func.__doc__ = _generate_doc_string_(
op_proto, op_proto,
additional_args_lines=[ additional_args_lines=[
"out (Tensor): The output tensor of elementwise op.",
"act (basestring|None): Activation applied to the output.", "act (basestring|None): Activation applied to the output.",
"name (basestring|None): Name of the output." "name (basestring|None): Name of the output."
]) ])
def _logical_op(op_name, x, y, out=None, name=None, binary_op=True):
helper = LayerHelper(op_name, **locals())
if binary_op:
assert x.dtype == y.dtype
if out is None:
if name is None:
out = helper.create_tmp_variable(dtype=x.dtype)
else:
out = helper.create_variable(
name=name, dtype=x.dtype, persistable=False)
if binary_op:
helper.append_op(
type=op_name, inputs={"X": x,
"Y": y}, outputs={"Out": out})
else:
helper.append_op(type=op_name, inputs={"X": x}, outputs={"Out": out})
return out
@templatedoc()
def logical_and(x, y, out=None, name=None):
"""
${comment}
Args:
x(${x_type}): ${x_comment}
y(${y_type}): ${y_comment}
out(Tensor): Output tensor of logical operation.
name(basestring|None): Name of the output.
Returns:
out(${out_type}): ${out_comment}
"""
return _logical_op(
op_name="logical_and", x=x, y=y, name=name, out=out, binary_op=True)
@templatedoc()
def logical_or(x, y, out=None, name=None):
"""
${comment}
Args:
x(${x_type}): ${x_comment}
y(${y_type}): ${y_comment}
out(Tensor): Output tensor of logical operation.
name(basestring|None): Name of the output.
Returns:
out(${out_type}): ${out_comment}
"""
return _logical_op(
op_name="logical_or", x=x, y=y, name=name, out=out, binary_op=True)
@templatedoc()
def logical_xor(x, y, out=None, name=None):
"""
${comment}
Args:
x(${x_type}): ${x_comment}
y(${y_type}): ${y_comment}
out(Tensor): Output tensor of logical operation.
name(basestring|None): Name of the output.
Returns:
out(${out_type}): ${out_comment}
"""
return _logical_op(
op_name="logical_xor", x=x, y=y, name=name, out=out, binary_op=True)
@templatedoc()
def logical_not(x, out=None, name=None):
"""
${comment}
Args:
x(${x_type}): ${x_comment}
out(Tensor): Output tensor of logical operation.
name(basestring|None): Name of the output.
Returns:
out(${out_type}): ${out_comment}
"""
return _logical_op(
op_name="logical_not", x=x, y=None, name=name, out=out, binary_op=False)
@templatedoc()
def clip(x, min, max, name=None):
"""
${comment}
Args:
x(${x_type}): ${x_comment}
min(${min_type}): ${min_comment}
max(${max_type}): ${max_comment}
name(basestring|None): Name of the output.
Returns:
out(${out_type}): ${out_comment}
"""
helper = LayerHelper("clip", **locals())
if name is None:
out = helper.create_tmp_variable(dtype=x.dtype)
else:
out = helper.create_variable(
name=name, dtype=x.dtype, persistable=False)
helper.append_op(
type="clip",
inputs={"X": x},
attrs={"min": min,
"max": max},
outputs={"Out": out})
return out
@templatedoc()
def clip_by_norm(x, max_norm, name=None):
"""
${comment}
Args:
x(${x_type}): ${x_comment}
max_norm(${max_norm_type}): ${max_norm_comment}
name(basestring|None): Name of the output.
Returns:
out(${out_type}): ${out_comment}
"""
helper = LayerHelper("clip_by_norm", **locals())
if name is None:
out = helper.create_tmp_variable(dtype=x.dtype)
else:
out = helper.create_variable(
name=name, dtype=x.dtype, persistable=False)
helper.append_op(
type="clip_by_norm",
inputs={"X": x},
attrs={"max_norm": max_norm},
outputs={"Out": out})
return out
@templatedoc()
def mean(x, name=None):
"""
${comment}
Args:
x(${x_type}): ${x_comment}
name(basestring|None): Name of the output.
Returns:
out(${out_type}): ${out_comment}
"""
helper = LayerHelper("mean", **locals())
if name is None:
out = helper.create_tmp_variable(dtype=x.dtype)
else:
out = helper.create_variable(
name=name, dtype=x.dtype, persistable=False)
helper.append_op(
type="mean", inputs={"X": x}, attrs={}, outputs={"Out": out})
return out
@templatedoc()
def mul(x, y, x_num_col_dims=1, y_num_col_dims=1, name=None):
"""
${comment}
Args:
x(${x_type}): ${x_comment}
y(${y_type}): ${y_comment}
x_num_col_dims(${x_num_col_dims_type}): ${x_num_col_dims_comment}
y_num_col_dims(${y_num_col_dims_type}): ${y_num_col_dims_comment}
name(basestring|None): Name of the output.
Returns:
out(${out_type}): ${out_comment}
"""
helper = LayerHelper("mul", **locals())
if name is None:
out = helper.create_tmp_variable(dtype=x.dtype)
else:
out = helper.create_variable(
name=name, dtype=x.dtype, persistable=False)
helper.append_op(
type="mul",
inputs={"X": x,
"Y": y},
attrs={
"x_num_col_dims": x_num_col_dims,
"y_num_col_dims": y_num_col_dims
},
outputs={"Out": out})
return out
@templatedoc()
def sigmoid_cross_entropy_with_logits(x, label, name=None):
"""
${comment}
Args:
x(${x_type}): ${x_comment}
label(${label_type}): ${label_comment}
name(basestring|None): Name of the output.
Returns:
out(${out_type}): ${out_comment}
"""
helper = LayerHelper("sigmoid_cross_entropy_with_logits", **locals())
if name is None:
out = helper.create_tmp_variable(dtype=x.dtype)
else:
out = helper.create_variable(
name=name, dtype=x.dtype, persistable=False)
helper.append_op(
type="sigmoid_cross_entropy_with_logits",
inputs={"X": x,
"Label": label},
attrs={},
outputs={"Out": out})
return out
@templatedoc()
def maxout(x, groups, name=None):
"""
${comment}
Args:
x(${x_type}): ${x_comment}
groups(${groups_type}): ${groups_comment}
name(basestring|None): Name of the output.
Returns:
out(${out_type}): ${out_comment}
"""
helper = LayerHelper("maxout", **locals())
if name is None:
out = helper.create_tmp_variable(dtype=x.dtype)
else:
out = helper.create_variable(
name=name, dtype=x.dtype, persistable=False)
helper.append_op(
type="maxout",
inputs={"X": x},
attrs={"groups": groups},
outputs={"Out": out})
return out
...@@ -35,18 +35,7 @@ __activations_noattr__ = [ ...@@ -35,18 +35,7 @@ __activations_noattr__ = [
'softsign', 'softsign',
] ]
__all__ = [ __all__ = []
'mean',
'mul',
'sigmoid_cross_entropy_with_logits',
'clip',
'clip_by_norm',
'logical_and',
'logical_or',
'logical_xor',
'logical_not',
'maxout',
]
for _OP in set(__all__): for _OP in set(__all__):
globals()[_OP] = generate_layer_fn(_OP) globals()[_OP] = generate_layer_fn(_OP)
......
...@@ -40,8 +40,7 @@ def simple_img_conv_pool(input, ...@@ -40,8 +40,7 @@ def simple_img_conv_pool(input,
param_attr=None, param_attr=None,
bias_attr=None, bias_attr=None,
act=None, act=None,
use_cudnn=True, use_cudnn=True):
use_mkldnn=False):
""" """
The simple_img_conv_pool is composed with one Convolution2d and one Pool2d. The simple_img_conv_pool is composed with one Convolution2d and one Pool2d.
...@@ -84,8 +83,6 @@ def simple_img_conv_pool(input, ...@@ -84,8 +83,6 @@ def simple_img_conv_pool(input,
act (str): Activation type for Conv2d. Default: None act (str): Activation type for Conv2d. Default: None
use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
library is installed. Default: True library is installed. Default: True
use_mkldnn (bool): Use mkldnn kernels or not, it is valid only when compiled
with mkldnn library. Default: False
Return: Return:
Variable: The result of input after Convolution2d and Pool2d. Variable: The result of input after Convolution2d and Pool2d.
...@@ -112,8 +109,7 @@ def simple_img_conv_pool(input, ...@@ -112,8 +109,7 @@ def simple_img_conv_pool(input,
param_attr=param_attr, param_attr=param_attr,
bias_attr=bias_attr, bias_attr=bias_attr,
act=act, act=act,
use_cudnn=use_cudnn, use_cudnn=use_cudnn)
use_mkldnn=use_mkldnn)
pool_out = layers.pool2d( pool_out = layers.pool2d(
input=conv_out, input=conv_out,
...@@ -122,8 +118,7 @@ def simple_img_conv_pool(input, ...@@ -122,8 +118,7 @@ def simple_img_conv_pool(input,
pool_stride=pool_stride, pool_stride=pool_stride,
pool_padding=pool_padding, pool_padding=pool_padding,
global_pooling=global_pooling, global_pooling=global_pooling,
use_cudnn=use_cudnn, use_cudnn=use_cudnn)
use_mkldnn=use_mkldnn)
return pool_out return pool_out
...@@ -138,8 +133,7 @@ def img_conv_group(input, ...@@ -138,8 +133,7 @@ def img_conv_group(input,
conv_batchnorm_drop_rate=0.0, conv_batchnorm_drop_rate=0.0,
pool_stride=1, pool_stride=1,
pool_type="max", pool_type="max",
use_cudnn=True, use_cudnn=True):
use_mkldnn=False):
""" """
The Image Convolution Group is composed of Convolution2d, BatchNorm, DropOut, The Image Convolution Group is composed of Convolution2d, BatchNorm, DropOut,
and Pool2d. According to the input arguments, img_conv_group will do serials of and Pool2d. According to the input arguments, img_conv_group will do serials of
...@@ -177,8 +171,6 @@ def img_conv_group(input, ...@@ -177,8 +171,6 @@ def img_conv_group(input,
average-pooling. Default :math:`max`. average-pooling. Default :math:`max`.
use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
library is installed. Default: True library is installed. Default: True
use_mkldnn (bool): Use mkldnn kernels or not, it is valid only when compiled
with mkldnn library. Default: False
Return: Return:
Variable: The final result after serial computation using Convolution2d, Variable: The final result after serial computation using Convolution2d,
...@@ -226,8 +218,7 @@ def img_conv_group(input, ...@@ -226,8 +218,7 @@ def img_conv_group(input,
padding=conv_padding[i], padding=conv_padding[i],
param_attr=param_attr[i], param_attr=param_attr[i],
act=local_conv_act, act=local_conv_act,
use_cudnn=use_cudnn, use_cudnn=use_cudnn)
use_mkldnn=use_mkldnn)
if conv_with_batchnorm[i]: if conv_with_batchnorm[i]:
tmp = layers.batch_norm(input=tmp, act=conv_act, in_place=True) tmp = layers.batch_norm(input=tmp, act=conv_act, in_place=True)
...@@ -240,8 +231,7 @@ def img_conv_group(input, ...@@ -240,8 +231,7 @@ def img_conv_group(input,
pool_size=pool_size, pool_size=pool_size,
pool_type=pool_type, pool_type=pool_type,
pool_stride=pool_stride, pool_stride=pool_stride,
use_cudnn=use_cudnn, use_cudnn=use_cudnn)
use_mkldnn=use_mkldnn)
return pool_out return pool_out
......
...@@ -2,6 +2,16 @@ file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") ...@@ -2,6 +2,16 @@ file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
# default test # default test
foreach(src ${TEST_OPS}) if(NOT APPLE)
py_test(${src} SRCS ${src}.py) foreach(src ${TEST_OPS})
endforeach() py_test(${src} SRCS ${src}.py)
endforeach()
else()
foreach(src ${TEST_OPS})
if(${src} STREQUAL "test_recognize_digits_conv")
message(WARNING "These tests has been disabled in OSX for random fail: \n" ${src})
else()
py_test(${src} SRCS ${src}.py)
endif()
endforeach()
endif()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid import framework, unique_name, layer_helper
from paddle.fluid.executor import Executor
from paddle.fluid.layers import fill_constant, assign, While, elementwise_add, Print
class TestRoutineOp(unittest.TestCase):
def test_simple_routine(self):
ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
# Create LOD_TENSOR<INT64> and put it into the scope. This placeholder
# variable will be filled in and returned by fluid.channel_recv
result = self._create_tensor('return_value',
core.VarDesc.VarType.LOD_TENSOR,
core.VarDesc.VarType.INT64)
with fluid.Go():
input_value = fill_constant(
shape=[1], dtype=core.VarDesc.VarType.FP64, value=1234)
fluid.channel_send(ch, input_value)
result, status = fluid.channel_recv(ch, result)
fluid.channel_close(ch)
cpu = core.CPUPlace()
exe = Executor(cpu)
outs = exe.run(fetch_list=[result])
self.assertEqual(outs[0], 1234)
def test_daisy_chain(self):
'''
Mimics classic Daisy-chain test: https://talks.golang.org/2012/concurrency.slide#39
'''
n = 100
leftmost = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
left = leftmost
# TODO(thuan): Use fluid.While() after scope capture is implemented.
# https://github.com/PaddlePaddle/Paddle/issues/8502
for i in range(n):
right = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
with fluid.Go():
one_tensor = self._create_one_dim_tensor(1)
result = self._create_tensor('return_value',
core.VarDesc.VarType.LOD_TENSOR,
core.VarDesc.VarType.INT64)
result, status = fluid.channel_recv(right, result)
one_added = fluid.layers.elementwise_add(x=one_tensor, y=result)
fluid.channel_send(left, one_added)
left = right
# Trigger the channel propagation by sending a "1" to rightmost channel
with fluid.Go():
one_tensor = self._create_one_dim_tensor(1)
fluid.channel_send(right, one_tensor)
leftmost_result = self._create_tensor('return_value',
core.VarDesc.VarType.LOD_TENSOR,
core.VarDesc.VarType.INT64)
leftmost_result, status = fluid.channel_recv(leftmost, leftmost_result)
cpu = core.CPUPlace()
exe = Executor(cpu)
leftmost_data = exe.run(fetch_list=[leftmost_result])
# The leftmost_data should be equal to the number of channels + 1
self.assertEqual(leftmost_data[0][0], n + 1)
def _create_one_dim_tensor(self, value):
one_dim_tensor = fill_constant(shape=[1], dtype='int', value=value)
one_dim_tensor.stop_gradient = True
return one_dim_tensor
def _create_tensor(self, name, type, dtype):
return framework.default_main_program().current_block().create_var(
name=unique_name.generate(name), type=type, dtype=dtype)
def _create_persistable_tensor(self, name, type, dtype):
return framework.default_main_program().current_block().create_var(
name=unique_name.generate(name),
type=type,
dtype=dtype,
persistable=True)
def test_select(self):
with framework.program_guard(framework.Program()):
ch1 = fluid.make_channel(
dtype=core.VarDesc.VarType.LOD_TENSOR, capacity=1)
result1 = self._create_tensor('return_value',
core.VarDesc.VarType.LOD_TENSOR,
core.VarDesc.VarType.FP64)
input_value = fill_constant(
shape=[1], dtype=core.VarDesc.VarType.FP64, value=10)
with fluid.Select() as select:
with select.case(fluid.channel_send, ch1, input_value):
# Execute something.
pass
with select.default():
pass
# This should not block because we are using a buffered channel.
result1, status = fluid.channel_recv(ch1, result1)
fluid.channel_close(ch1)
cpu = core.CPUPlace()
exe = Executor(cpu)
result = exe.run(fetch_list=[result1])
self.assertEqual(result[0][0], 10)
def test_fibonacci(self):
"""
Mimics Fibonacci Go example: https://tour.golang.org/concurrency/5
"""
with framework.program_guard(framework.Program()):
quit_ch_input_var = self._create_persistable_tensor(
'quit_ch_input', core.VarDesc.VarType.LOD_TENSOR,
core.VarDesc.VarType.INT32)
quit_ch_input = fill_constant(
shape=[1],
dtype=core.VarDesc.VarType.INT32,
value=0,
out=quit_ch_input_var)
result = self._create_persistable_tensor(
'result', core.VarDesc.VarType.LOD_TENSOR,
core.VarDesc.VarType.INT32)
fill_constant(
shape=[1],
dtype=core.VarDesc.VarType.INT32,
value=0,
out=result)
x = fill_constant(
shape=[1], dtype=core.VarDesc.VarType.INT32, value=0)
y = fill_constant(
shape=[1], dtype=core.VarDesc.VarType.INT32, value=1)
while_cond = fill_constant(
shape=[1], dtype=core.VarDesc.VarType.BOOL, value=True)
while_false = fill_constant(
shape=[1], dtype=core.VarDesc.VarType.BOOL, value=False)
x_tmp = fill_constant(
shape=[1], dtype=core.VarDesc.VarType.INT32, value=0)
def fibonacci(channel, quit_channel):
while_op = While(cond=while_cond)
with while_op.block():
result2 = fill_constant(
shape=[1], dtype=core.VarDesc.VarType.INT32, value=0)
with fluid.Select() as select:
with select.case(
fluid.channel_send, channel, x, is_copy=True):
assign(input=x, output=x_tmp)
assign(input=y, output=x)
assign(elementwise_add(x=x_tmp, y=y), output=y)
with select.case(fluid.channel_recv, quit_channel,
result2):
# Quit
helper = layer_helper.LayerHelper('assign')
helper.append_op(
type='assign',
inputs={'X': [while_false]},
outputs={'Out': [while_cond]})
ch1 = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
quit_ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
with fluid.Go():
for i in range(10):
fluid.channel_recv(ch1, result)
Print(result)
fluid.channel_send(quit_ch, quit_ch_input)
fibonacci(ch1, quit_ch)
fluid.channel_close(ch1)
fluid.channel_close(quit_ch)
cpu = core.CPUPlace()
exe = Executor(cpu)
exe_result = exe.run(fetch_list=[result])
self.assertEqual(exe_result[0][0], 34)
def test_ping_pong(self):
"""
Mimics Ping Pong example: https://gobyexample.com/channel-directions
"""
with framework.program_guard(framework.Program()):
result = self._create_tensor('return_value',
core.VarDesc.VarType.LOD_TENSOR,
core.VarDesc.VarType.FP64)
ping_result = self._create_tensor('ping_return_value',
core.VarDesc.VarType.LOD_TENSOR,
core.VarDesc.VarType.FP64)
def ping(ch, message):
fluid.channel_send(ch, message, is_copy=True)
def pong(ch1, ch2):
fluid.channel_recv(ch1, ping_result)
fluid.channel_send(ch2, ping_result, is_copy=True)
pings = fluid.make_channel(
dtype=core.VarDesc.VarType.LOD_TENSOR, capacity=1)
pongs = fluid.make_channel(
dtype=core.VarDesc.VarType.LOD_TENSOR, capacity=1)
msg = fill_constant(
shape=[1], dtype=core.VarDesc.VarType.FP64, value=9)
ping(pings, msg)
pong(pings, pongs)
fluid.channel_recv(pongs, result)
fluid.channel_close(pings)
fluid.channel_close(pongs)
cpu = core.CPUPlace()
exe = Executor(cpu)
exe_result = exe.run(fetch_list=[result])
self.assertEqual(exe_result[0][0], 9)
if __name__ == '__main__':
unittest.main()
...@@ -28,7 +28,6 @@ list(REMOVE_ITEM TEST_OPS test_cond_op) # FIXME(qijun): https://github.com/Paddl ...@@ -28,7 +28,6 @@ list(REMOVE_ITEM TEST_OPS test_cond_op) # FIXME(qijun): https://github.com/Paddl
list(REMOVE_ITEM TEST_OPS op_test) # op_test is a helper python file, not a test list(REMOVE_ITEM TEST_OPS op_test) # op_test is a helper python file, not a test
list(REMOVE_ITEM TEST_OPS decorators) # decorators is a helper python file, not a test list(REMOVE_ITEM TEST_OPS decorators) # decorators is a helper python file, not a test
if(APPLE) if(APPLE)
if(NOT WITH_DISTRIBUTE) if(NOT WITH_DISTRIBUTE)
list(REMOVE_ITEM TEST_OPS test_desc_clone) list(REMOVE_ITEM TEST_OPS test_desc_clone)
...@@ -77,11 +76,13 @@ if(WITH_DISTRIBUTE) ...@@ -77,11 +76,13 @@ if(WITH_DISTRIBUTE)
if(NOT APPLE) if(NOT APPLE)
set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 200) set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 200)
set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 200) set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 200)
py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext SERIAL) py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext)
set_tests_properties(test_dist_se_resnext PROPERTIES TIMEOUT 1000)
# TODO: fix this test
#py_test_modules(test_dist_transformer MODULES test_dist_transformer)
#set_tests_properties(test_dist_transformer PROPERTIES TIMEOUT 1000)
endif(NOT APPLE) endif(NOT APPLE)
py_test_modules(test_dist_transpiler MODULES test_dist_transpiler) py_test_modules(test_dist_transpiler MODULES test_dist_transpiler)
#FIXME(gongwb): random fails.
#py_test_modules(test_dist_transformer MODULES test_dist_transformer SERIAL)
endif() endif()
py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL) py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL)
py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL) py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL)
......
...@@ -247,7 +247,7 @@ class DistSeResneXt2x2(TestDistRunnerBase): ...@@ -247,7 +247,7 @@ class DistSeResneXt2x2(TestDistRunnerBase):
# Reader # Reader
train_reader = paddle.batch( train_reader = paddle.batch(
paddle.dataset.flowers.train(), batch_size=batch_size) paddle.dataset.flowers.test(use_xmap=False), batch_size=batch_size)
test_reader = paddle.batch( test_reader = paddle.batch(
paddle.dataset.flowers.test(use_xmap=False), batch_size=batch_size) paddle.dataset.flowers.test(use_xmap=False), batch_size=batch_size)
......
...@@ -67,6 +67,7 @@ class TestConv2dOp(OpTest): ...@@ -67,6 +67,7 @@ class TestConv2dOp(OpTest):
def setUp(self): def setUp(self):
self.op_type = "conv2d" self.op_type = "conv2d"
self.use_cudnn = False self.use_cudnn = False
self.use_cuda = False
self.use_mkldnn = False self.use_mkldnn = False
self.data_format = "AnyLayout" self.data_format = "AnyLayout"
self.dtype = np.float32 self.dtype = np.float32
...@@ -101,24 +102,25 @@ class TestConv2dOp(OpTest): ...@@ -101,24 +102,25 @@ class TestConv2dOp(OpTest):
} }
self.outputs = {'Output': output} self.outputs = {'Output': output}
def testcudnn(self): def testcuda(self):
return core.is_compiled_with_cuda() and self.use_cudnn return core.is_compiled_with_cuda() and (self.use_cudnn or
self.use_cuda)
def test_check_output(self): def test_check_output(self):
place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace() place = core.CUDAPlace(0) if self.testcuda() else core.CPUPlace()
self.check_output_with_place(place, atol=1e-5) self.check_output_with_place(place, atol=1e-5)
def test_check_grad(self): def test_check_grad(self):
if self.dtype == np.float16: if self.dtype == np.float16:
return return
place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace() place = core.CUDAPlace(0) if self.testcuda() else core.CPUPlace()
self.check_grad_with_place( self.check_grad_with_place(
place, set(['Input', 'Filter']), 'Output', max_relative_error=0.02) place, set(['Input', 'Filter']), 'Output', max_relative_error=0.02)
def test_check_grad_no_filter(self): def test_check_grad_no_filter(self):
if self.dtype == np.float16: if self.dtype == np.float16:
return return
place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace() place = core.CUDAPlace(0) if self.testcuda() else core.CPUPlace()
self.check_grad_with_place( self.check_grad_with_place(
place, ['Input'], place, ['Input'],
'Output', 'Output',
...@@ -128,7 +130,7 @@ class TestConv2dOp(OpTest): ...@@ -128,7 +130,7 @@ class TestConv2dOp(OpTest):
def test_check_grad_no_input(self): def test_check_grad_no_input(self):
if self.dtype == np.float16: if self.dtype == np.float16:
return return
place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace() place = core.CUDAPlace(0) if self.testcuda() else core.CPUPlace()
self.check_grad_with_place( self.check_grad_with_place(
place, ['Filter'], place, ['Filter'],
'Output', 'Output',
...@@ -325,18 +327,33 @@ class TestFP16CUDNNWithInput1x1Filter1x1(TestWithInput1x1Filter1x1): ...@@ -325,18 +327,33 @@ class TestFP16CUDNNWithInput1x1Filter1x1(TestWithInput1x1Filter1x1):
class TestDepthwiseConv(TestConv2dOp): class TestDepthwiseConv(TestConv2dOp):
def init_test_case(self): def init_test_case(self):
self.use_cuda = True
self.pad = [1, 1] self.pad = [1, 1]
self.stride = [2, 2] self.stride = [2, 2]
self.input_size = [2, 3, 5, 5] # NCHW self.input_size = [2, 3, 5, 5] # NCHW
self.groups = 3 self.groups = 3
assert np.mod(self.input_size[1], self.groups) == 0 assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] // self.groups f_c = self.input_size[1] // self.groups
self.filter_size = [6, f_c, 3, 3] self.filter_size = [3, f_c, 3, 3]
self.op_type = "depthwise_conv2d" self.op_type = "depthwise_conv2d"
class TestDepthwiseConv2(TestConv2dOp): class TestDepthwiseConv2(TestConv2dOp):
def init_test_case(self): def init_test_case(self):
self.use_cuda = True
self.pad = [1, 1]
self.stride = [1, 1]
self.input_size = [2, 3, 5, 5] # NCHW
self.groups = 3
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] // self.groups
self.filter_size = [3, f_c, 3, 3]
self.op_type = "depthwise_conv2d"
class TestDepthwiseConv3(TestConv2dOp):
def init_test_case(self):
self.use_cuda = True
self.pad = [1, 1] self.pad = [1, 1]
self.stride = [1, 1] self.stride = [1, 1]
self.input_size = [2, 3, 5, 5] # NCHW self.input_size = [2, 3, 5, 5] # NCHW
...@@ -347,6 +364,34 @@ class TestDepthwiseConv2(TestConv2dOp): ...@@ -347,6 +364,34 @@ class TestDepthwiseConv2(TestConv2dOp):
self.op_type = "depthwise_conv2d" self.op_type = "depthwise_conv2d"
class TestDepthwiseConvWithDilation(TestConv2dOp):
def init_test_case(self):
self.use_cuda = True
self.pad = [1, 1]
self.stride = [2, 2]
self.input_size = [2, 3, 5, 5] # NCHW
self.groups = 3
self.dilations = [2, 2]
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] // self.groups
self.filter_size = [6, f_c, 3, 3]
self.op_type = "depthwise_conv2d"
class TestDepthwiseConvWithDilation2(TestConv2dOp):
def init_test_case(self):
self.use_cuda = True
self.pad = [1, 1]
self.stride = [1, 1]
self.input_size = [2, 3, 5, 5] # NCHW
self.groups = 3
self.dilations = [2, 2]
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] // self.groups
self.filter_size = [6, f_c, 3, 3]
self.op_type = "depthwise_conv2d"
# Please Don't remove the following code. # Please Don't remove the following code.
# Currently, CI use cudnn V5.0 which not support dilation conv. # Currently, CI use cudnn V5.0 which not support dilation conv.
# class TestCUDNNWithDilation(TestWithDilation): # class TestCUDNNWithDilation(TestWithDilation):
......
...@@ -50,9 +50,7 @@ class TestDistRunnerBase(object): ...@@ -50,9 +50,7 @@ class TestDistRunnerBase(object):
def run_pserver(self, args): def run_pserver(self, args):
self.get_model(batch_size=2) self.get_model(batch_size=2)
# NOTE: pserver should not call memory optimize
if args.mem_opt:
fluid.memory_optimize(fluid.default_main_program())
t = self.get_transpiler(args.trainer_id, t = self.get_transpiler(args.trainer_id,
fluid.default_main_program(), args.endpoints, fluid.default_main_program(), args.endpoints,
args.trainers, args.sync_mode) args.trainers, args.sync_mode)
...@@ -70,7 +68,7 @@ class TestDistRunnerBase(object): ...@@ -70,7 +68,7 @@ class TestDistRunnerBase(object):
self.get_model(batch_size=2) self.get_model(batch_size=2)
if args.mem_opt: if args.mem_opt:
fluid.memory_optimize(fluid.default_main_program()) fluid.memory_optimize(fluid.default_main_program(), skip_grads=True)
if args.is_dist: if args.is_dist:
t = self.get_transpiler(args.trainer_id, t = self.get_transpiler(args.trainer_id,
fluid.default_main_program(), fluid.default_main_program(),
...@@ -166,6 +164,17 @@ class TestDistBase(unittest.TestCase): ...@@ -166,6 +164,17 @@ class TestDistBase(unittest.TestCase):
def _setup_config(self): def _setup_config(self):
raise NotImplementedError("tests should have _setup_config implemented") raise NotImplementedError("tests should have _setup_config implemented")
def _after_setup_config(self):
if self._enforce_place == "CPU":
self.__use_cuda = False
elif self._enforce_place == "GPU":
self.__use_cuda = True
else:
if fluid.core.is_compiled_with_cuda():
self.__use_cuda = True
else:
self.__use_cuda = False
def setUp(self): def setUp(self):
self._trainers = 2 self._trainers = 2
self._pservers = 2 self._pservers = 2
...@@ -173,11 +182,12 @@ class TestDistBase(unittest.TestCase): ...@@ -173,11 +182,12 @@ class TestDistBase(unittest.TestCase):
self._find_free_port(), self._find_free_port()) self._find_free_port(), self._find_free_port())
self._python_interp = "python" self._python_interp = "python"
self._sync_mode = True self._sync_mode = True
self._use_cuda = True self._enforce_place = None
self._mem_opt = False self._mem_opt = False
self._use_reduce = False self._use_reduce = False
self._use_reader_alloc = True self._use_reader_alloc = True
self._setup_config() self._setup_config()
self._after_setup_config()
def _find_free_port(self): def _find_free_port(self):
with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s: with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
...@@ -201,13 +211,10 @@ class TestDistBase(unittest.TestCase): ...@@ -201,13 +211,10 @@ class TestDistBase(unittest.TestCase):
ps0_cmd += " --mem_opt" ps0_cmd += " --mem_opt"
ps1_cmd += " --mem_opt" ps1_cmd += " --mem_opt"
ps0_pipe = subprocess.PIPE print(ps0_cmd)
ps1_pipe = subprocess.PIPE print(ps1_cmd)
if check_error_log: ps0_pipe = open("/tmp/ps0_err.log", "wb")
print(ps0_cmd) ps1_pipe = open("/tmp/ps1_err.log", "wb")
print(ps1_cmd)
ps0_pipe = open("/tmp/ps0_err.log", "wb")
ps1_pipe = open("/tmp/ps1_err.log", "wb")
ps0_proc = subprocess.Popen( ps0_proc = subprocess.Popen(
ps0_cmd.strip().split(" "), ps0_cmd.strip().split(" "),
...@@ -220,10 +227,7 @@ class TestDistBase(unittest.TestCase): ...@@ -220,10 +227,7 @@ class TestDistBase(unittest.TestCase):
stderr=ps1_pipe, stderr=ps1_pipe,
env=required_envs) env=required_envs)
if not check_error_log: return ps0_proc, ps1_proc, ps0_pipe, ps1_pipe
return ps0_proc, ps1_proc, None, None
else:
return ps0_proc, ps1_proc, ps0_pipe, ps1_pipe
def _wait_ps_ready(self, pid): def _wait_ps_ready(self, pid):
retry_times = 50 retry_times = 50
...@@ -244,7 +248,7 @@ class TestDistBase(unittest.TestCase): ...@@ -244,7 +248,7 @@ class TestDistBase(unittest.TestCase):
cmd = "%s %s --role trainer" % (self._python_interp, model) cmd = "%s %s --role trainer" % (self._python_interp, model)
if self._use_cuda: if self.__use_cuda:
cmd += " --use_cuda" cmd += " --use_cuda"
env_local = {"CUDA_VISIBLE_DEVICES": "0"} env_local = {"CUDA_VISIBLE_DEVICES": "0"}
else: else:
...@@ -252,7 +256,7 @@ class TestDistBase(unittest.TestCase): ...@@ -252,7 +256,7 @@ class TestDistBase(unittest.TestCase):
envs.update(env_local) envs.update(env_local)
if not check_error_log: if check_error_log:
err_log = open("/tmp/trainer.err.log", "wb") err_log = open("/tmp/trainer.err.log", "wb")
local_proc = subprocess.Popen( local_proc = subprocess.Popen(
cmd.split(" "), cmd.split(" "),
...@@ -266,7 +270,6 @@ class TestDistBase(unittest.TestCase): ...@@ -266,7 +270,6 @@ class TestDistBase(unittest.TestCase):
stderr=subprocess.PIPE, stderr=subprocess.PIPE,
env=envs) env=envs)
local_proc.wait()
local_out, local_err = local_proc.communicate() local_out, local_err = local_proc.communicate()
local_ret = cpt.to_text(local_out) local_ret = cpt.to_text(local_out)
...@@ -307,7 +310,7 @@ class TestDistBase(unittest.TestCase): ...@@ -307,7 +310,7 @@ class TestDistBase(unittest.TestCase):
if self._use_reader_alloc: if self._use_reader_alloc:
tr0_cmd += " --use_reader_alloc" tr0_cmd += " --use_reader_alloc"
tr1_cmd += " --use_reader_alloc" tr1_cmd += " --use_reader_alloc"
if self._use_cuda: if self.__use_cuda:
tr0_cmd += " --use_cuda" tr0_cmd += " --use_cuda"
tr1_cmd += " --use_cuda" tr1_cmd += " --use_cuda"
env0 = {"CUDA_VISIBLE_DEVICES": "0"} env0 = {"CUDA_VISIBLE_DEVICES": "0"}
...@@ -319,15 +322,10 @@ class TestDistBase(unittest.TestCase): ...@@ -319,15 +322,10 @@ class TestDistBase(unittest.TestCase):
env0.update(envs) env0.update(envs)
env1.update(envs) env1.update(envs)
FNULL = open(os.devnull, 'w') print("tr0_cmd:{}, env0: {}".format(tr0_cmd, env0))
print("tr1_cmd:{}, env1: {}".format(tr1_cmd, env1))
tr0_pipe = subprocess.PIPE tr0_pipe = open("/tmp/tr0_err.log", "wb")
tr1_pipe = subprocess.PIPE tr1_pipe = open("/tmp/tr1_err.log", "wb")
if check_error_log:
print("tr0_cmd:{}, env0: {}".format(tr0_cmd, env0))
print("tr1_cmd:{}, env1: {}".format(tr1_cmd, env1))
tr0_pipe = open("/tmp/tr0_err.log", "wb")
tr1_pipe = open("/tmp/tr1_err.log", "wb")
tr0_proc = subprocess.Popen( tr0_proc = subprocess.Popen(
tr0_cmd.strip().split(" "), tr0_cmd.strip().split(" "),
...@@ -340,29 +338,22 @@ class TestDistBase(unittest.TestCase): ...@@ -340,29 +338,22 @@ class TestDistBase(unittest.TestCase):
stderr=tr1_pipe, stderr=tr1_pipe,
env=env1) env=env1)
tr0_proc.wait()
tr1_proc.wait()
tr0_out, tr0_err = tr0_proc.communicate() tr0_out, tr0_err = tr0_proc.communicate()
tr0_loss_text = cpt.to_text(tr0_out) tr0_loss_text = cpt.to_text(tr0_out)
tr1_out, tr1_err = tr1_proc.communicate() tr1_out, tr1_err = tr1_proc.communicate()
tr1_loss_text = cpt.to_text(tr1_out) tr1_loss_text = cpt.to_text(tr1_out)
# close trainer file # close trainer file
if check_error_log: tr0_pipe.close()
tr0_pipe.close() tr1_pipe.close()
tr1_pipe.close()
ps0_pipe.close() ps0_pipe.close()
ps1_pipe.close() ps1_pipe.close()
# FIXME: use terminate() instead of sigkill. # FIXME: use terminate() instead of sigkill.
os.kill(ps0.pid, signal.SIGKILL) os.kill(ps0.pid, signal.SIGKILL)
os.kill(ps1.pid, signal.SIGKILL) os.kill(ps1.pid, signal.SIGKILL)
ps0.terminate() ps0.terminate()
ps1.terminate() ps1.terminate()
ps0.wait()
ps1.wait()
FNULL.close()
# print log # print log
sys.stderr.write('trainer 0 stdout:\n %s\n' % tr0_loss_text) sys.stderr.write('trainer 0 stdout:\n %s\n' % tr0_loss_text)
...@@ -387,6 +378,7 @@ class TestDistBase(unittest.TestCase): ...@@ -387,6 +378,7 @@ class TestDistBase(unittest.TestCase):
"LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""), "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
"FLAGS_fraction_of_gpu_memory_to_use": "0.15", "FLAGS_fraction_of_gpu_memory_to_use": "0.15",
"FLAGS_cudnn_deterministic": "1", "FLAGS_cudnn_deterministic": "1",
"http_proxy": ""
} }
required_envs.update(need_envs) required_envs.update(need_envs)
......
...@@ -21,10 +21,11 @@ from test_dist_base import TestDistBase ...@@ -21,10 +21,11 @@ from test_dist_base import TestDistBase
class TestDistCTR2x2(TestDistBase): class TestDistCTR2x2(TestDistBase):
def _setup_config(self): def _setup_config(self):
self._sync_mode = True self._sync_mode = True
self._use_cuda = False self._enforce_place = "CPU"
def test_dist_ctr(self):
self.check_with_place("dist_ctr.py", delta=1e-7, check_error_log=False) def test_dist_ctr(self):
self.check_with_place("dist_ctr.py", delta=1e-7, check_error_log=False)
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -26,14 +26,13 @@ class TestDistSeResneXt2x2(TestDistBase): ...@@ -26,14 +26,13 @@ class TestDistSeResneXt2x2(TestDistBase):
self.check_with_place("dist_se_resnext.py", delta=100) self.check_with_place("dist_se_resnext.py", delta=100)
# TODO(typhoonzero): fix this test class TestDistseResnXt2x2WithMemopt(TestDistBase):
# class TestDistseResnXt2x2WithMemopt(TestDistBase): def _setup_config(self):
# def _setup_config(self): self._sync_mode = True
# self._sync_mode = True self._mem_opt = True
# self._mem_opt = True
def test_dist_train(self):
# def test_dist_train(self): self.check_with_place("dist_se_resnext.py", delta=100)
# self.check_with_place("dist_se_resnext.py", delta=1e-7)
class TestDistSeResneXt2x2Async(TestDistBase): class TestDistSeResneXt2x2Async(TestDistBase):
......
...@@ -22,7 +22,7 @@ from test_dist_base import TestDistBase ...@@ -22,7 +22,7 @@ from test_dist_base import TestDistBase
class TestDistSimnetBowDense2x2(TestDistBase): class TestDistSimnetBowDense2x2(TestDistBase):
def _setup_config(self): def _setup_config(self):
self._sync_mode = True self._sync_mode = True
self._use_cuda = False self._enforce_place = "CPU"
def test_simnet_bow(self): def test_simnet_bow(self):
need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '0'} need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '0'}
...@@ -36,7 +36,7 @@ class TestDistSimnetBowDense2x2(TestDistBase): ...@@ -36,7 +36,7 @@ class TestDistSimnetBowDense2x2(TestDistBase):
class TestDistSimnetBow2x2DenseAsync(TestDistBase): class TestDistSimnetBow2x2DenseAsync(TestDistBase):
def _setup_config(self): def _setup_config(self):
self._sync_mode = False self._sync_mode = False
self._use_cuda = False self._enforce_place = "CPU"
def test_simnet_bow(self): def test_simnet_bow(self):
need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '0'} need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '0'}
...@@ -50,7 +50,7 @@ class TestDistSimnetBow2x2DenseAsync(TestDistBase): ...@@ -50,7 +50,7 @@ class TestDistSimnetBow2x2DenseAsync(TestDistBase):
class TestDistSimnetBowSparse2x2(TestDistBase): class TestDistSimnetBowSparse2x2(TestDistBase):
def _setup_config(self): def _setup_config(self):
self._sync_mode = True self._sync_mode = True
self._use_cuda = False self._enforce_place = "CPU"
def test_simnet_bow(self): def test_simnet_bow(self):
need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '1'} need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '1'}
...@@ -64,7 +64,7 @@ class TestDistSimnetBowSparse2x2(TestDistBase): ...@@ -64,7 +64,7 @@ class TestDistSimnetBowSparse2x2(TestDistBase):
class TestDistSimnetBow2x2SparseAsync(TestDistBase): class TestDistSimnetBow2x2SparseAsync(TestDistBase):
def _setup_config(self): def _setup_config(self):
self._sync_mode = False self._sync_mode = False
self._use_cuda = False self._enforce_place = "CPU"
def test_simnet_bow(self): def test_simnet_bow(self):
need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '1'} need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '1'}
......
...@@ -21,7 +21,7 @@ from test_dist_base import TestDistBase ...@@ -21,7 +21,7 @@ from test_dist_base import TestDistBase
class TestDistTextClassification2x2(TestDistBase): class TestDistTextClassification2x2(TestDistBase):
def _setup_config(self): def _setup_config(self):
self._sync_mode = True self._sync_mode = True
self._use_cuda = False self._enforce_place = "CPU"
def test_text_classification(self): def test_text_classification(self):
self.check_with_place("dist_text_classification.py", delta=1e-6) self.check_with_place("dist_text_classification.py", delta=1e-6)
...@@ -30,7 +30,7 @@ class TestDistTextClassification2x2(TestDistBase): ...@@ -30,7 +30,7 @@ class TestDistTextClassification2x2(TestDistBase):
class TestDistTextClassification2x2Async(TestDistBase): class TestDistTextClassification2x2Async(TestDistBase):
def _setup_config(self): def _setup_config(self):
self._sync_mode = False self._sync_mode = False
self._use_cuda = False self._enforce_place = "CPU"
def test_se_resnext(self): def test_se_resnext(self):
self.check_with_place("dist_text_classification.py", delta=100) self.check_with_place("dist_text_classification.py", delta=100)
......
...@@ -825,6 +825,15 @@ class TestBook(unittest.TestCase): ...@@ -825,6 +825,15 @@ class TestBook(unittest.TestCase):
self.assertIsNotNone(out) self.assertIsNotNone(out)
print(str(program)) print(str(program))
def iou_similarity(self):
program = Program()
with program_guard(program):
x = layers.data(name="x", shape=[16], dtype="float32")
y = layers.data(name="y", shape=[16], dtype="float32")
out = layers.iou_similarity(x, y, name='iou_similarity')
self.assertIsNotNone(out)
print(str(program))
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -69,7 +69,7 @@ class TestOperator(unittest.TestCase): ...@@ -69,7 +69,7 @@ class TestOperator(unittest.TestCase):
set(mul_op.attr_names), set(mul_op.attr_names),
set([ set([
"x_num_col_dims", "y_num_col_dims", "op_role", "op_role_var", "x_num_col_dims", "y_num_col_dims", "op_role", "op_role_var",
"op_namescope", "op_callstack" "op_namescope"
])) ]))
self.assertEqual(mul_op.has_attr("x_num_col_dims"), True) self.assertEqual(mul_op.has_attr("x_num_col_dims"), True)
self.assertEqual(mul_op.attr_type("x_num_col_dims"), core.AttrType.INT) self.assertEqual(mul_op.attr_type("x_num_col_dims"), core.AttrType.INT)
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import paddle.fluid as fluid
import paddle.fluid.core as core
import numpy as np
import unittest
import os
import sys
import math
def simple_fc_net():
img = fluid.layers.data(name='image', shape=[784], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
hidden = img
for _ in range(4):
hidden = fluid.layers.fc(
hidden,
size=200,
act='tanh',
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=1.0)))
prediction = fluid.layers.fc(hidden, size=10, act='softmax')
loss = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.mean(loss)
return loss
class TestPassBuilder(unittest.TestCase):
def check_network_convergence(self, use_cuda, build_strategy=None):
os.environ['CPU_NUM'] = str(4)
main = fluid.Program()
startup = fluid.Program()
with fluid.program_guard(main, startup):
loss = simple_fc_net()
test_program = main.clone(for_test=True)
opt = fluid.optimizer.SGD(learning_rate=0.001)
opt.minimize(loss)
batch_size = 32
image = np.random.normal(size=(batch_size, 784)).astype('float32')
label = np.random.randint(0, 10, (batch_size, 1), dtype="int64")
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(startup)
feed_dict = {'image': image, 'label': label}
train_exe = fluid.ParallelExecutor(
use_cuda=use_cuda,
loss_name=loss.name,
main_program=main,
build_strategy=build_strategy)
test_exe = fluid.ParallelExecutor(
use_cuda=use_cuda,
main_program=test_program,
share_vars_from=train_exe,
build_strategy=build_strategy)
for i in range(5):
test_loss, = test_exe.run([loss.name], feed=feed_dict)
train_loss, = train_exe.run([loss.name], feed=feed_dict)
avg_test_loss_val = np.array(test_loss).mean()
if math.isnan(float(avg_test_loss_val)):
sys.exit("got NaN loss, testing failed.")
avg_train_loss_val = np.array(train_loss).mean()
if math.isnan(float(avg_train_loss_val)):
sys.exit("got NaN loss, training failed.")
self.assertTrue(
np.allclose(
train_loss, test_loss, atol=1e-8),
"Train loss: " + str(train_loss) + "\n Test loss:" +
str(test_loss))
def test_parallel_testing_with_new_strategy(self):
build_strategy = fluid.BuildStrategy()
pass_builder = build_strategy._create_passes_from_strategy()
origin_len = len(pass_builder.all_passes())
viz_pass = pass_builder.append_pass("graph_viz_pass")
self.assertEqual(origin_len + 1, len(pass_builder.all_passes()))
pass_builder.insert_pass(
len(pass_builder.all_passes()), "graph_viz_pass")
self.assertEqual(origin_len + 2, len(pass_builder.all_passes()))
pass_builder.remove_pass(len(pass_builder.all_passes()) - 1)
self.assertEqual(origin_len + 1, len(pass_builder.all_passes()))
viz_pass.set_str("graph_viz_path", "/tmp/test_viz_pass")
self.check_network_convergence(
use_cuda=core.is_compiled_with_cuda(),
build_strategy=build_strategy)
try:
os.stat("/tmp/test_viz_pass")
except os.error:
self.assertFalse(True)
if __name__ == '__main__':
unittest.main()
...@@ -246,6 +246,7 @@ def prepare_encoder(src_word, ...@@ -246,6 +246,7 @@ def prepare_encoder(src_word,
padding_idx=pos_pad_idx, padding_idx=pos_pad_idx,
param_attr=fluid.ParamAttr( param_attr=fluid.ParamAttr(
name=pos_enc_param_name, trainable=False)) name=pos_enc_param_name, trainable=False))
src_pos_enc.stop_gradient = True
enc_input = src_word_emb + src_pos_enc enc_input = src_word_emb + src_pos_enc
# FIXME(guosheng): Decouple the program desc with batch_size. # FIXME(guosheng): Decouple the program desc with batch_size.
......
...@@ -20,6 +20,10 @@ from .memory_optimization_transpiler import memory_optimize, release_memory ...@@ -20,6 +20,10 @@ from .memory_optimization_transpiler import memory_optimize, release_memory
from .ps_dispatcher import HashName, RoundRobin from .ps_dispatcher import HashName, RoundRobin
__all__ = [ __all__ = [
"DistributeTranspiler", "memory_optimize", "release_memory", "HashName", "DistributeTranspiler",
"RoundRobin", "DistributeTranspilerConfig" "memory_optimize",
"release_memory",
"HashName",
"RoundRobin",
"DistributeTranspilerConfig",
] ]
...@@ -14,10 +14,10 @@ ...@@ -14,10 +14,10 @@
from __future__ import print_function from __future__ import print_function
from collections import defaultdict, OrderedDict, Callable from collections import defaultdict, MutableSet
from .. import core from .. import core
from ... import compat as cpt from ... import compat as cpt
from ..framework import Program, default_main_program, Parameter, Variable from ..framework import Program, default_main_program, Parameter, Variable, core
from ..backward import _rename_arg_ from ..backward import _rename_arg_
from functools import reduce from functools import reduce
from six.moves import range from six.moves import range
...@@ -44,17 +44,82 @@ SUB_BLOCK_PAIR = [("while", "while_grad"), ("parallel_do", "parallel_do_grad"), ...@@ -44,17 +44,82 @@ SUB_BLOCK_PAIR = [("while", "while_grad"), ("parallel_do", "parallel_do_grad"),
PRINT_LOG = False PRINT_LOG = False
class OrderedSet(MutableSet):
def __init__(self, iterable=None):
self.end = end = []
end += [None, end, end] # sentinel node for doubly linked list
self.map = {} # key --> [key, prev, next]
if iterable is not None:
self |= iterable
def __len__(self):
return len(self.map)
def __contains__(self, key):
return key in self.map
def add(self, key):
if key not in self.map:
end = self.end
curr = end[1]
curr[2] = end[1] = self.map[key] = [key, curr, end]
def update(self, other):
for e in other:
self.add(e)
def discard(self, key):
if key in self.map:
key, prev, next = self.map.pop(key)
prev[2] = next
next[1] = prev
def remove(self, key):
self.discard(key)
def __iter__(self):
end = self.end
curr = end[2]
while curr is not end:
yield curr[0]
curr = curr[2]
def __reversed__(self):
end = self.end
curr = end[1]
while curr is not end:
yield curr[0]
curr = curr[1]
def pop(self, last=True):
if not self:
raise KeyError('set is empty')
key = self.end[1][0] if last else self.end[2][0]
self.discard(key)
return key
def __repr__(self):
if not self:
return '%s()' % (self.__class__.__name__, )
return '%s(%r)' % (self.__class__.__name__, list(self))
def __eq__(self, other):
if isinstance(other, OrderedSet):
return len(self) == len(other) and list(self) == list(other)
return set(self) == set(other)
class ControlFlowGraph(object): class ControlFlowGraph(object):
def __init__(self, program, ops, forward_num, skip_opt): def __init__(self, program, ops, forward_num, skip_opt):
self._program = program self._program = program
self._ops = ops self._ops = ops
self._forward_num = forward_num self._forward_num = forward_num
self._successors = defaultdict(set) self._successors = defaultdict(OrderedSet)
self._presuccessors = defaultdict(set) self._presuccessors = defaultdict(OrderedSet)
self._uses = defaultdict(set) self._uses = defaultdict(OrderedSet)
self._defs = defaultdict(set) self._defs = defaultdict(OrderedSet)
self._live_in = defaultdict(set) self._live_in = defaultdict(OrderedSet)
self._live_out = defaultdict(set) self._live_out = defaultdict(OrderedSet)
self._skip_opt = skip_opt self._skip_opt = skip_opt
self.pool = [] self.pool = []
...@@ -116,7 +181,7 @@ class ControlFlowGraph(object): ...@@ -116,7 +181,7 @@ class ControlFlowGraph(object):
# NOTE: must sort the in_diff set for cases that get different cache var. # NOTE: must sort the in_diff set for cases that get different cache var.
# FIXME(typhoonzero): maybe use a "sorted set" is better than this. # FIXME(typhoonzero): maybe use a "sorted set" is better than this.
can_optimize = [ can_optimize = [
x for x in sorted(list(in_diff)) x for x in in_diff
if self._check_var_validity(block_desc, x, is_forward) if self._check_var_validity(block_desc, x, is_forward)
] ]
if can_optimize: if can_optimize:
...@@ -224,7 +289,7 @@ class ControlFlowGraph(object): ...@@ -224,7 +289,7 @@ class ControlFlowGraph(object):
if self.pool: if self.pool:
# NOTE: must sort the in_diff set for cases that get different cache var. # NOTE: must sort the in_diff set for cases that get different cache var.
defs_can_optimize = [ defs_can_optimize = [
x for x in sorted(list(self._defs[i])) x for x in self._defs[i]
if self._check_var_validity(block_desc, x, is_forward) if self._check_var_validity(block_desc, x, is_forward)
] ]
out_pair = [ out_pair = [
...@@ -381,7 +446,19 @@ def _get_cfgs(input_program): ...@@ -381,7 +446,19 @@ def _get_cfgs(input_program):
return cfgs return cfgs
def memory_optimize(input_program, skip_opt_set=None, print_log=False, level=0): def _is_opt_role_op(op):
op_maker = core.op_proto_and_checker_maker
optimize_role = core.op_proto_and_checker_maker.OpRole.Optimize
if op_maker.kOpRoleAttrName() in op.attr_names and \
int(op.all_attrs()[op_maker.kOpRoleAttrName()]) == int(optimize_role):
return True
def memory_optimize(input_program,
skip_opt_set=None,
print_log=False,
level=0,
skip_grads=False):
"""Optimize memory by reusing var memory. """Optimize memory by reusing var memory.
Note: it doesn't not support subblock nested in subblock. Note: it doesn't not support subblock nested in subblock.
...@@ -398,6 +475,19 @@ def memory_optimize(input_program, skip_opt_set=None, print_log=False, level=0): ...@@ -398,6 +475,19 @@ def memory_optimize(input_program, skip_opt_set=None, print_log=False, level=0):
raise ValueError("only support opt_level 0 or 1.") raise ValueError("only support opt_level 0 or 1.")
global PRINT_LOG global PRINT_LOG
PRINT_LOG = print_log PRINT_LOG = print_log
if skip_grads:
grad_set = set()
OP_ROLE_VAR = core.op_proto_and_checker_maker.kOpRoleVarAttrName()
for op in input_program.global_block().ops:
if _is_opt_role_op(op):
if op.attr(OP_ROLE_VAR):
grad_name = op.attr(OP_ROLE_VAR)[1]
grad_set.add(grad_name)
if not skip_opt_set:
skip_opt_set = grad_set
else:
skip_opt_set.update(grad_set)
cfgs = _get_cfgs(input_program) cfgs = _get_cfgs(input_program)
for cfg in cfgs: for cfg in cfgs:
cfg.memory_optimize(skip_opt_set=skip_opt_set, level=level) cfg.memory_optimize(skip_opt_set=skip_opt_set, level=level)
......
...@@ -106,6 +106,7 @@ packages=['paddle', ...@@ -106,6 +106,7 @@ packages=['paddle',
'paddle.fluid.layers', 'paddle.fluid.layers',
'paddle.fluid.contrib', 'paddle.fluid.contrib',
'paddle.fluid.contrib.decoder', 'paddle.fluid.contrib.decoder',
'paddle.fluid.contrib.quantize',
'paddle.fluid.transpiler', 'paddle.fluid.transpiler',
'paddle.fluid.transpiler.details'] 'paddle.fluid.transpiler.details']
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册