diff --git a/CMakeLists.txt b/CMakeLists.txt index d4fe4f9a0e4b90e34b95ddfba52e22ee762273a0..1cbfa6706160e26656f81e65a7f97b5cb928d0e7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -59,7 +59,6 @@ option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF) option(WITH_DISTRIBUTE "Compile with grpc distributed support" OFF) option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF) option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF) -option(WITH_FAST_BUNDLE_TEST "Bundle tests that can be run in a single process together to reduce launch overhead" OFF) # CMAKE_BUILD_TYPE if(NOT CMAKE_BUILD_TYPE) diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake index e90948782bb5e333bbdb47ef9d61c1e37e3cf9e4..9459f1ddfe85f5607880d3fdd968b494d6af592a 100644 --- a/cmake/external/grpc.cmake +++ b/cmake/external/grpc.cmake @@ -23,17 +23,20 @@ SET(GRPC_SOURCES_DIR ${THIRD_PARTY_PATH}/grpc) SET(GRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/grpc) SET(GRPC_INCLUDE_DIR "${GRPC_INSTALL_DIR}/include/" CACHE PATH "grpc include directory." 
FORCE) SET(GRPC_CPP_PLUGIN "${GRPC_INSTALL_DIR}/bin/grpc_cpp_plugin" CACHE FILEPATH "GRPC_CPP_PLUGIN" FORCE) + +include(ProcessorCount) +ProcessorCount(NUM_OF_PROCESSOR) + IF(APPLE) - SET(BUILD_CMD make -n HAS_SYSTEM_PROTOBUF=false -s -j static grpc_cpp_plugin | sed "s/-Werror//g" | sh) + SET(BUILD_CMD make -n HAS_SYSTEM_PROTOBUF=false -s -j ${NUM_OF_PROCESSOR} static grpc_cpp_plugin | sed "s/-Werror//g" | sh) ELSE() - SET(BUILD_CMD make HAS_SYSTEM_PROTOBUF=false -s -j static grpc_cpp_plugin) + SET(BUILD_CMD make HAS_SYSTEM_PROTOBUF=false -s -j ${NUM_OF_PROCESSOR} static grpc_cpp_plugin) ENDIF() ExternalProject_Add( extern_grpc DEPENDS protobuf zlib - GIT_REPOSITORY "https://github.com/grpc/grpc.git" - GIT_TAG "v1.10.x" + URL "http://paddlepaddledeps.bj.bcebos.com/grpc.tar.xz" PREFIX ${GRPC_SOURCES_DIR} UPDATE_COMMAND "" CONFIGURE_COMMAND "" diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 9de44beafbb69b3510b97afcc43d4b489a029c35..b69de2ced03569d5e9ffe313527ab776ee798496 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -36,5 +36,5 @@ cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_ha device_context broadcast_op_handle) cc_test(gather_op_test SRCS gather_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory device_context gather_op_handle) -cc_test(reduce_op_handle_test SRCS reduce_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory - device_context reduce_op_handle ) +#cc_test(reduce_op_handle_test SRCS reduce_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory +# device_context reduce_op_handle ) diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index 602246d75d708db5108e5320e50a27fd9cd580f8..79b1a248a0acfded0d2fcfadc041a6ad2a92ff3d 100644 --- 
a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -1,5 +1,5 @@ nv_test(test_op_converter SRCS test_op_converter.cc mul_op.cc conv2d_op.cc DEPS ${FLUID_CORE_MODULES}) nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc io_converter.cc - DEPS ${FLUID_CORE_MODULES} activation_op tensorrt_engine - SERIAL) + DEPS ${FLUID_CORE_MODULES} activation_op tensorrt_engine + SERIAL) nv_test(test_io_converter SRCS test_io_converter.cc io_converter.cc DEPS dynload_cuda dynamic_loader lod_tensor) diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index bc7faef8cd499e63af4d0ab2282897c39f2b7faa..f72997ca24ed837f761b52cbecdc05998424a675 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -201,9 +201,9 @@ if(WITH_DISTRIBUTE) set_source_files_properties(send_vars_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) op_library(send_barrier_op DEPS ${DISTRIBUTE_DEPS}) set_source_files_properties(send_barrier_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - set_source_files_properties(send_recv_op_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS prefetch_op send_op - listen_and_serv_op sum_op executor SERIAL) + #set_source_files_properties(send_recv_op_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + #cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS prefetch_op send_op + # listen_and_serv_op sum_op executor SERIAL) if(WITH_GPU) set_source_files_properties(test_send_nccl_id.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(test_send_nccl_id SRCS test_send_nccl_id.cc DEPS send_op diff --git a/paddle/fluid/operators/detail/grpc_server_test.cc b/paddle/fluid/operators/detail/grpc_server_test.cc index b8db0ad987cdfaec1fc9236c3f26e88891376dce..73e75c9087fef756840c76db249f8996253ced64 100644 --- 
a/paddle/fluid/operators/detail/grpc_server_test.cc +++ b/paddle/fluid/operators/detail/grpc_server_test.cc @@ -108,7 +108,7 @@ void StartServer(const std::string& endpoint) { rpc_service_->RunSyncUpdate(); } -TEST(PREFETCH, CPU) { +TEST(PREFETCH, DISABLED_CPU) { // start up a server instance backend std::thread server_thread(StartServer, "127.0.0.1:8889"); sleep(2); diff --git a/paddle/fluid/operators/test_send_nccl_id.cc b/paddle/fluid/operators/test_send_nccl_id.cc index bbae1d54aa3524fd45cb8ab13c86df8d54b8e643..719f039a0f5fcd7445bf1589a683f122e6d62ba0 100644 --- a/paddle/fluid/operators/test_send_nccl_id.cc +++ b/paddle/fluid/operators/test_send_nccl_id.cc @@ -63,7 +63,7 @@ void StartServer(std::atomic* initialized) { server_thread.join(); } -TEST(SendNcclId, Normal) { +TEST(SendNcclId, DISABLED_Normal) { std::atomic initialized{false}; std::thread server_thread(StartServer, &initialized); while (!initialized) { diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index be704a7be7d2c9f3c95ad81ca906eeaf73b35beb..eed1412ba4f2b8f2209c0573359bea1e4b20d8d5 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -17,7 +17,7 @@ endif(NOT WITH_DISTRIBUTE) list(REMOVE_ITEM TEST_OPS test_seq_concat_op) # FIXME(helin): https://github.com/PaddlePaddle/Paddle/issues/8290 list(REMOVE_ITEM TEST_OPS test_modified_huber_loss_op) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5184 list(REMOVE_ITEM TEST_OPS test_lstm_unit_op) # # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5185 -list(REMOVE_ITEM TEST_OPS test_nce) # IXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/7778 +list(REMOVE_ITEM TEST_OPS test_nce) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/7778 list(REMOVE_ITEM TEST_OPS test_recurrent_op) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/6152 list(REMOVE_ITEM TEST_OPS 
test_cond_op) # FIXME(qijun): https://github.com/PaddlePaddle/Paddle/issues/5101#issuecomment-339814957 @@ -39,74 +39,12 @@ function(py_test_modules TARGET_NAME) endif() endif() endfunction() - -list(REMOVE_ITEM TEST_OPS test_sequence_expand) - -# test time consuming OPs in a separate process for expliot parallism -list(REMOVE_ITEM TEST_OPS test_parallel_executor) list(REMOVE_ITEM TEST_OPS test_warpctc_op) -list(REMOVE_ITEM TEST_OPS test_dyn_rnn) -list(REMOVE_ITEM TEST_OPS test_mul_op) - -# tests that need to be run in separate process. -list(REMOVE_ITEM TEST_OPS test_multihead_attention) -list(REMOVE_ITEM TEST_OPS test_calc_gradient) -list(REMOVE_ITEM TEST_OPS test_while_op) -list(REMOVE_ITEM TEST_OPS test_lod_array_length_op) -list(REMOVE_ITEM TEST_OPS test_reorder_lod_tensor) -list(REMOVE_ITEM TEST_OPS test_profiler) -list(REMOVE_ITEM TEST_OPS test_nvprof) -list(REMOVE_ITEM TEST_OPS test_normalization_wrapper) -list(REMOVE_ITEM TEST_OPS test_executor_and_mul) -list(REMOVE_ITEM TEST_OPS test_assign_value_op) -list(REMOVE_ITEM TEST_OPS test_array_read_write_op) -list(REMOVE_ITEM TEST_OPS test_lod_rank_table) -list(REMOVE_ITEM TEST_OPS test_weight_normalization) -list(REMOVE_ITEM TEST_OPS test_conditional_block) -list(REMOVE_ITEM TEST_OPS test_parameter) -list(REMOVE_ITEM TEST_OPS test_registry) -list(REMOVE_ITEM TEST_OPS test_fetch_var) -list(REMOVE_ITEM TEST_OPS test_parallel_op) -list(REMOVE_ITEM TEST_OPS test_dynrnn_static_input) list(REMOVE_ITEM TEST_OPS test_dist_train) -list(REMOVE_ITEM TEST_OPS test_network_with_dtype) - -# tests that can be bundled together in one python process for speed. 
-if(WITH_FAST_BUNDLE_TEST) - py_test_modules("test_all_ops" MODULES ${TEST_OPS}) -else() - foreach(TEST_OP ${TEST_OPS}) - py_test_modules(${TEST_OP} MODULES ${TEST_OP}) - endforeach(TEST_OP) -endif(WITH_FAST_BUNDLE_TEST) - -# -py_test_modules(test_sequence_expand MODULES test_sequence_expand) -# tests with high overhead -py_test_modules(test_parallel_executor MODULES test_parallel_executor) +list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf) +list(REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_feed) +foreach(TEST_OP ${TEST_OPS}) + py_test_modules(${TEST_OP} MODULES ${TEST_OP}) +endforeach(TEST_OP) py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=${WARPCTC_LIB_DIR} SERIAL) -py_test_modules(test_train_dyn_rnn MODULES test_dyn_rnn) -py_test_modules(test_mul_op MODULES test_mul_op) -py_test_modules(test_network_with_dtype MODULES test_network_with_dtype) - -# tests that need to be run in separate process. -py_test_modules(test_multihead_attention MODULES test_multihead_attention) -py_test_modules(test_calc_gradient MODULES test_calc_gradient) -py_test_modules(test_while_op MODULES test_while_op) -py_test_modules(test_lod_array_length_op MODULES test_lod_array_length_op) -py_test_modules(test_reorder_lod_tensor MODULES test_reorder_lod_tensor) -py_test_modules(test_profiler MODULES test_profiler) -py_test_modules(test_nvprof MODULES test_nvprof) -py_test_modules(test_normalization_wrapper MODULES test_normalization_wrapper) -py_test_modules(test_executor_and_mul MODULES test_executor_and_mul) -py_test_modules(test_assign_value_op MODULES test_assign_value_op) -py_test_modules(test_array_read_write_op MODULES test_array_read_write_op) -py_test_modules(test_lod_rank_table MODULES test_lod_rank_table) -py_test_modules(test_weight_normalization MODULES test_weight_normalization) -py_test_modules(test_conditional_block MODULES test_conditional_block) -py_test_modules(test_parameter MODULES test_parameter) -py_test_modules(test_registry 
MODULES test_registry) -py_test_modules(test_fetch_var MODULES test_fetch_var) -py_test_modules(test_dynrnn_static_input MODULES test_dynrnn_static_input) -py_test_modules(test_parallel_op MODULES test_parallel_op) py_test_modules(test_dist_train MODULES test_dist_train SERIAL) diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py new file mode 100644 index 0000000000000000000000000000000000000000..c9c3c648717814c28c39a401487925824e885946 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py @@ -0,0 +1,96 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import paddle.fluid as fluid +import time +import numpy as np + +__all__ = ['TestParallelExecutorBase'] + + +class TestParallelExecutorBase(unittest.TestCase): + def check_network_convergence(self, + method, + memory_opt=True, + iter=50, + batch_size=None, + allow_op_delay=False, + feed_dict=None, + seed=None, + use_parallel_executor=True, + balance_parameter_opt_between_cards=False): + def run_executor(exe, feed, fetch_list, program=None): + if isinstance(exe, fluid.ParallelExecutor): + res = exe.run(fetch_list=fetch_list, feed=feed) + elif isinstance(exe, fluid.Executor): + if program is None: + program = fluid.default_main_program() + res = exe.run(program=program, feed=feed, fetch_list=fetch_list) + else: + raise ValueError('Unkown type exe') + return res + + main = fluid.Program() + startup = fluid.Program() + startup.random_seed = 1 # Fix random seed + with fluid.program_guard(main, startup): + if seed is not None: + startup.random_seed = seed + loss = method(use_feed=feed_dict is not None) + adam = fluid.optimizer.Adam() + adam.minimize(loss) + if memory_opt: + fluid.memory_optimize(main) + place = fluid.CUDAPlace(0) + startup_exe = fluid.Executor(place) + startup_exe.run(startup) + exec_strategy = fluid.ExecutionStrategy() + exec_strategy.allow_op_delay = allow_op_delay + + build_strategy = fluid.BuildStrategy() + build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce if balance_parameter_opt_between_cards else fluid.BuildStrategy.ReduceStrategy.AllReduce + + if use_parallel_executor: + exe = fluid.ParallelExecutor( + True, + loss_name=loss.name, + exec_strategy=exec_strategy, + build_strategy=build_strategy) + else: + exe = fluid.Executor(place=place) + + if batch_size is not None: + batch_size *= fluid.core.get_cuda_device_count() + begin = time.time() + first_loss, = run_executor( + exe=exe, feed=feed_dict, fetch_list=[loss.name]) + first_loss = np.array(first_loss) + + for i in xrange(iter): + 
run_executor(exe=exe, feed=feed_dict, fetch_list=[]) + + last_loss, = run_executor( + exe=exe, feed=feed_dict, fetch_list=[loss.name]) + end = time.time() + + if batch_size is not None: + print "%.4f Instance per second" % ( + (batch_size * iter + 2) / (end - begin)) + + last_loss = np.array(last_loss) + + print first_loss, last_loss + # self.assertGreater(first_loss[0], last_loss[0]) + return first_loss, last_loss diff --git a/python/paddle/fluid/tests/unittests/test_dist_train.py b/python/paddle/fluid/tests/unittests/test_dist_train.py index 08479202f637a2102d163f306e3e17bcfa873482..2314bb2ed8a4eeb34752fd5d040f8a8476798aa6 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_train.py +++ b/python/paddle/fluid/tests/unittests/test_dist_train.py @@ -12,19 +12,21 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os +import time import unittest +from multiprocessing import Process + +import numpy import paddle.fluid as fluid -import paddle.fluid.core as core import paddle.fluid.layers as layers -import numpy -from multiprocessing import Process -from threading import Thread -import os, sys -import time class TestSendOp(unittest.TestCase): + @unittest.skip( + "This test is buggy. We cannot use time.sleep to sync processes, the connection may fail in unittest." + ) def test_send(self): # Run init_serv in a thread place = fluid.CPUPlace() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor.py b/python/paddle/fluid/tests/unittests/test_parallel_executor.py deleted file mode 100644 index 056f9e1781997aa1586d972874b652d5b725fe3f..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor.py +++ /dev/null @@ -1,902 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np -import unittest - -import paddle.fluid as fluid -import paddle -import paddle.dataset.mnist as mnist -import paddle.dataset.wmt16 as wmt16 - - -def simple_fc_net(use_feed): - if use_feed: - img = fluid.layers.data(name='image', shape=[784], dtype='float32') - label = fluid.layers.data(name='label', shape=[1], dtype='int64') - else: - reader = fluid.layers.open_files( - filenames=['./mnist.recordio'], - shapes=[[-1, 784], [-1, 1]], - lod_levels=[0, 0], - dtypes=['float32', 'int64'], - thread_num=1, - for_parallel=True) - reader = fluid.layers.io.double_buffer(reader) - img, label = fluid.layers.read_file(reader) - hidden = img - for _ in xrange(4): - hidden = fluid.layers.fc( - hidden, - size=200, - act='tanh', - bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=1.0))) - prediction = fluid.layers.fc(hidden, size=10, act='softmax') - loss = fluid.layers.cross_entropy(input=prediction, label=label) - loss = fluid.layers.mean(loss) - return loss - - -def fc_with_batchnorm(use_feed): - if use_feed: - img = fluid.layers.data(name='image', shape=[784], dtype='float32') - label = fluid.layers.data(name='label', shape=[1], dtype='int64') - else: - reader = fluid.layers.open_files( - filenames=['mnist.recordio'], - shapes=[[-1, 784], [-1, 1]], - lod_levels=[0, 0], - dtypes=['float32', 'int64'], - thread_num=1, - for_parallel=True) - reader = fluid.layers.io.double_buffer(reader) - img, label = fluid.layers.read_file(reader) - - hidden = img - for _ in xrange(1): - hidden = fluid.layers.fc( - hidden, - size=200, - 
act='tanh', - bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=1.0))) - - hidden = fluid.layers.batch_norm(input=hidden) - - prediction = fluid.layers.fc(hidden, size=10, act='softmax') - loss = fluid.layers.cross_entropy(input=prediction, label=label) - loss = fluid.layers.mean(loss) - return loss - - -def squeeze_excitation(input, num_channels, reduction_ratio): - # pool = fluid.layers.pool2d( - # input=input, pool_size=0, pool_type='avg', global_pooling=True) - conv = input - shape = conv.shape - reshape = fluid.layers.reshape( - x=conv, shape=[-1, shape[1], shape[2] * shape[3]]) - pool = fluid.layers.reduce_mean(input=reshape, dim=2) - - squeeze = fluid.layers.fc(input=pool, - size=num_channels / reduction_ratio, - act='relu') - excitation = fluid.layers.fc(input=squeeze, - size=num_channels, - act='sigmoid') - scale = fluid.layers.elementwise_mul(x=input, y=excitation, axis=0) - return scale - - -def conv_bn_layer(input, num_filters, filter_size, stride=1, groups=1, - act=None): - conv = fluid.layers.conv2d( - input=input, - num_filters=num_filters, - filter_size=filter_size, - stride=stride, - padding=(filter_size - 1) / 2, - groups=groups, - act=None, - bias_attr=False) - return fluid.layers.batch_norm(input=conv, act=act, momentum=0.1) - - -def shortcut(input, ch_out, stride): - ch_in = input.shape[1] - if ch_in != ch_out: - if stride == 1: - filter_size = 1 - else: - filter_size = 3 - return conv_bn_layer(input, ch_out, filter_size, stride) - else: - return input - - -def bottleneck_block(input, num_filters, stride, cardinality, reduction_ratio): - # The number of first 1x1 convolutional channels for each bottleneck build block - # was halved to reduce the compution cost. 
- conv0 = conv_bn_layer( - input=input, num_filters=num_filters, filter_size=1, act='relu') - conv1 = conv_bn_layer( - input=conv0, - num_filters=num_filters * 2, - filter_size=3, - stride=stride, - groups=cardinality, - act='relu') - conv2 = conv_bn_layer( - input=conv1, num_filters=num_filters * 2, filter_size=1, act=None) - scale = squeeze_excitation( - input=conv2, - num_channels=num_filters * 2, - reduction_ratio=reduction_ratio) - - short = shortcut(input, num_filters * 2, stride) - - return fluid.layers.elementwise_add(x=short, y=scale, act='relu') - - -def SE_ResNeXt50Small(batch_size=2, use_feed=False): - assert not use_feed, "SE_ResNeXt doesn't support feed yet" - - img = fluid.layers.fill_constant( - shape=[batch_size, 3, 224, 224], dtype='float32', value=0.0) - label = fluid.layers.fill_constant( - shape=[batch_size, 1], dtype='int64', value=0.0) - - conv = conv_bn_layer( - input=img, num_filters=16, filter_size=3, stride=2, act='relu') - conv = conv_bn_layer( - input=conv, num_filters=16, filter_size=3, stride=1, act='relu') - conv = conv_bn_layer( - input=conv, num_filters=16, filter_size=3, stride=1, act='relu') - conv = fluid.layers.pool2d( - input=conv, pool_size=3, pool_stride=2, pool_padding=1, pool_type='max') - - cardinality = 32 - reduction_ratio = 16 - depth = [3, 4, 6, 3] - num_filters = [128, 256, 512, 1024] - - for block in range(len(depth)): - for i in range(depth[block]): - conv = bottleneck_block( - input=conv, - num_filters=num_filters[block], - stride=2 if i == 0 and block != 0 else 1, - cardinality=cardinality, - reduction_ratio=reduction_ratio) - - shape = conv.shape - reshape = fluid.layers.reshape( - x=conv, shape=[-1, shape[1], shape[2] * shape[3]]) - pool = fluid.layers.reduce_mean(input=reshape, dim=2) - dropout = fluid.layers.dropout(x=pool, dropout_prob=0.2) - # Classifier layer: - prediction = fluid.layers.fc(input=dropout, size=1000, act='softmax') - loss = fluid.layers.cross_entropy(input=prediction, label=label) - loss = 
fluid.layers.mean(loss) - return loss - - -import time - - -class TestParallelExecutorBase(unittest.TestCase): - def check_network_convergence(self, - method, - memory_opt=True, - iter=50, - batch_size=None, - allow_op_delay=False, - feed_dict=None, - seed=None, - use_parallel_executor=True, - balance_parameter_opt_between_cards=False): - def run_executor(exe, feed, fetch_list, program=None): - if isinstance(exe, fluid.ParallelExecutor): - res = exe.run(fetch_list=fetch_list, feed=feed) - elif isinstance(exe, fluid.Executor): - if program is None: - program = fluid.default_main_program() - res = exe.run(program=program, feed=feed, fetch_list=fetch_list) - else: - raise ValueError('Unkown type exe') - return res - - main = fluid.Program() - startup = fluid.Program() - startup.random_seed = 1 # Fix random seed - with fluid.program_guard(main, startup): - if seed is not None: - startup.random_seed = seed - loss = method(use_feed=feed_dict is not None) - adam = fluid.optimizer.Adam() - adam.minimize(loss) - if memory_opt: - fluid.memory_optimize(main) - place = fluid.CUDAPlace(0) - startup_exe = fluid.Executor(place) - startup_exe.run(startup) - exec_strategy = fluid.ExecutionStrategy() - exec_strategy.allow_op_delay = allow_op_delay - - build_strategy = fluid.BuildStrategy() - build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce if balance_parameter_opt_between_cards else fluid.BuildStrategy.ReduceStrategy.AllReduce - - if use_parallel_executor: - exe = fluid.ParallelExecutor( - True, - loss_name=loss.name, - exec_strategy=exec_strategy, - build_strategy=build_strategy) - else: - exe = fluid.Executor(place=place) - - if batch_size is not None: - batch_size *= fluid.core.get_cuda_device_count() - begin = time.time() - first_loss, = run_executor( - exe=exe, feed=feed_dict, fetch_list=[loss.name]) - first_loss = np.array(first_loss) - - for i in xrange(iter): - run_executor(exe=exe, feed=feed_dict, fetch_list=[]) - - last_loss, = run_executor( - 
exe=exe, feed=feed_dict, fetch_list=[loss.name]) - end = time.time() - - if batch_size is not None: - print "%.4f Instance per second" % ( - (batch_size * iter + 2) / (end - begin)) - - last_loss = np.array(last_loss) - - print first_loss, last_loss - # self.assertGreater(first_loss[0], last_loss[0]) - return first_loss, last_loss - - -class TestMNIST(TestParallelExecutorBase): - @classmethod - def setUpClass(cls): - # Convert mnist to recordio file - with fluid.program_guard(fluid.Program(), fluid.Program()): - reader = paddle.batch(mnist.train(), batch_size=4) - feeder = fluid.DataFeeder( - feed_list=[ # order is image and label - fluid.layers.data( - name='image', shape=[784]), - fluid.layers.data( - name='label', shape=[1], dtype='int64'), - ], - place=fluid.CPUPlace()) - fluid.recordio_writer.convert_reader_to_recordio_file( - './mnist.recordio', reader, feeder) - - def check_simple_fc_convergence(self, balance_parameter_opt_between_cards): - self.check_network_convergence(simple_fc_net) - self.check_network_convergence(simple_fc_net, allow_op_delay=True) - - img = np.zeros(shape=[32, 784], dtype='float32') - label = np.ones(shape=[32, 1], dtype='int64') - self.check_network_convergence( - simple_fc_net, - feed_dict={"image": img, - "label": label}, - balance_parameter_opt_between_cards=balance_parameter_opt_between_cards - ) - - def test_simple_fc(self): - self.check_simple_fc_convergence(False) - - def test_simple_fc_with_new_strategy(self): - self.check_simple_fc_convergence(True) - - def check_simple_fc_parallel_accuracy(self, - balance_parameter_opt_between_cards): - img = np.zeros(shape=[32, 784], dtype='float32') - label = np.ones(shape=[32, 1], dtype='int64') - single_first_loss, single_last_loss = self.check_network_convergence( - method=simple_fc_net, - seed=1000, - feed_dict={"image": img, - "label": label}, - use_parallel_executor=False) - parallel_first_loss, parallel_last_loss = self.check_network_convergence( - method=simple_fc_net, - seed=1000, 
- feed_dict={"image": img, - "label": label}, - use_parallel_executor=True, - balance_parameter_opt_between_cards=balance_parameter_opt_between_cards - ) - - for p_f in parallel_first_loss: - self.assertAlmostEquals(p_f, single_first_loss[0], delta=1e-6) - for p_l in parallel_last_loss: - self.assertAlmostEquals(p_l, single_last_loss[0], delta=1e-6) - - def test_simple_fc_parallel_accuracy(self): - self.check_simple_fc_parallel_accuracy(False) - - def test_simple_fc_parallel_accuracy_with_new_strategy(self): - self.check_simple_fc_parallel_accuracy(True) - - def check_batchnorm_fc_convergence(self, - balance_parameter_opt_between_cards): - self.check_network_convergence(fc_with_batchnorm) - img = np.zeros(shape=[32, 784], dtype='float32') - label = np.ones(shape=[32, 1], dtype='int64') - self.check_network_convergence( - fc_with_batchnorm, - feed_dict={"image": img, - "label": label}, - balance_parameter_opt_between_cards=balance_parameter_opt_between_cards - ) - - def test_batchnorm_fc(self): - self.check_batchnorm_fc_convergence(False) - - def test_batchnorm_fc_with_new_strategy(self): - self.check_batchnorm_fc_convergence(True) - - -class TestResnet(TestParallelExecutorBase): - # @classmethod - # def setUpClass(cls): - # # import os - # # if os.path.exists('./flowers.recordio'): - # # return - # with fluid.program_guard(fluid.Program(), fluid.Program()): - # reader = paddle.batch(flowers.train(), batch_size=4) - # feeder = fluid.DataFeeder( - # feed_list=[ - # fluid.layers.data( - # name='image', shape=[3, 224, 224]), - # fluid.layers.data( - # name='label', shape=[1], dtype='int64'), - # ], - # place=fluid.CPUPlace()) - # fluid.recordio_writer.convert_reader_to_recordio_file( - # "./flowers.recordio", reader, feeder, compressor=fluid.core.RecordIOWriter.Compressor.NoCompress) - - def check_resnet_convergence(self, balance_parameter_opt_between_cards): - import functools - batch_size = 2 - self.check_network_convergence( - functools.partial( - SE_ResNeXt50Small, 
batch_size=batch_size), - iter=20, - batch_size=batch_size, - balance_parameter_opt_between_cards=balance_parameter_opt_between_cards - ) - - def test_resnet(self): - self.check_resnet_convergence(False) - - def test_resnet_with_new_strategy(self): - self.check_resnet_convergence(True) - - -class ModelHyperParams(object): - # Dictionary size for source and target language. This model directly uses - # paddle.dataset.wmt16 in which , and token has - # alreay been added, but the token is not added. Transformer requires - # sequences in a mini-batch are padded to have the same length. A token is - # added into the original dictionary in paddle.dateset.wmt16. - - # size of source word dictionary. - src_vocab_size = 10000 - # index for token in source language. - src_pad_idx = src_vocab_size - - # size of target word dictionay - trg_vocab_size = 10000 - # index for token in target language. - trg_pad_idx = trg_vocab_size - - # position value corresponding to the token. - pos_pad_idx = 0 - - # max length of sequences. It should plus 1 to include position - # padding token for position encoding. - max_length = 50 - - # the dimension for word embeddings, which is also the last dimension of - # the input and output of multi-head attention, position-wise feed-forward - # networks, encoder and decoder. - - d_model = 512 - # size of the hidden layer in position-wise feed-forward networks. - d_inner_hid = 1024 - # the dimension that keys are projected to for dot-product attention. - d_key = 64 - # the dimension that values are projected to for dot-product attention. - d_value = 64 - # number of head used in multi-head attention. - n_head = 8 - # number of sub-layers to be stacked in the encoder and decoder. - n_layer = 6 - # dropout rate used by all dropout layers. - dropout = 0.1 - - -def prepare_batch_input(insts, src_pad_idx, trg_pad_idx, n_head): - """ - Pad the instances to the max sequence length in batch, and generate the - corresponding position data and attention bias. 
Then, convert the numpy - data to tensors and return a dict mapping names to tensors. - """ - - def __pad_batch_data(insts, - pad_idx, - is_target=False, - return_pos=True, - return_attn_bias=True, - return_max_len=True): - """ - Pad the instances to the max sequence length in batch, and generate the - corresponding position data and attention bias. - """ - return_list = [] - max_len = max(len(inst) for inst in insts) - inst_data = np.array( - [inst + [pad_idx] * (max_len - len(inst)) for inst in insts]) - return_list += [inst_data.astype("int64").reshape([-1, 1])] - if return_pos: - inst_pos = np.array([[ - pos_i + 1 if w_i != pad_idx else 0 - for pos_i, w_i in enumerate(inst) - ] for inst in inst_data]) - - return_list += [inst_pos.astype("int64").reshape([-1, 1])] - if return_attn_bias: - if is_target: - # This is used to avoid attention on paddings and subsequent - # words. - slf_attn_bias_data = np.ones((inst_data.shape[0], max_len, - max_len)) - slf_attn_bias_data = np.triu(slf_attn_bias_data, 1).reshape( - [-1, 1, max_len, max_len]) - slf_attn_bias_data = np.tile(slf_attn_bias_data, - [1, n_head, 1, 1]) * [-1e9] - else: - # This is used to avoid attention on paddings. 
- slf_attn_bias_data = np.array([[0] * len(inst) + [-1e9] * - (max_len - len(inst)) - for inst in insts]) - slf_attn_bias_data = np.tile( - slf_attn_bias_data.reshape([-1, 1, 1, max_len]), - [1, n_head, max_len, 1]) - return_list += [slf_attn_bias_data.astype("float32")] - if return_max_len: - return_list += [max_len] - return return_list if len(return_list) > 1 else return_list[0] - - def data_to_tensor(data_list, name_list, input_dict, place): - assert len(data_list) == len(name_list) - for i in range(len(name_list)): - tensor = fluid.LoDTensor() - tensor.set(data_list[i], place) - input_dict[name_list[i]] = tensor - - src_word, src_pos, src_slf_attn_bias, src_max_len = __pad_batch_data( - [inst[0] for inst in insts], src_pad_idx, is_target=False) - trg_word, trg_pos, trg_slf_attn_bias, trg_max_len = __pad_batch_data( - [inst[1] for inst in insts], trg_pad_idx, is_target=True) - trg_src_attn_bias = np.tile(src_slf_attn_bias[:, :, ::src_max_len, :], - [1, 1, trg_max_len, 1]).astype("float32") - lbl_word = __pad_batch_data([inst[2] for inst in insts], trg_pad_idx, False, - False, False, False) - lbl_weight = (lbl_word != trg_pad_idx).astype("float32").reshape([-1, 1]) - - return [ - src_word, src_pos, trg_word, trg_pos, src_slf_attn_bias, - trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight - ] - - -import transformer_model - - -def transformer(use_feed): - assert not use_feed, "transfomer doesn't support feed yet" - return transformer_model.transformer( - ModelHyperParams.src_vocab_size + 1, - ModelHyperParams.trg_vocab_size + 1, ModelHyperParams.max_length + 1, - ModelHyperParams.n_layer, ModelHyperParams.n_head, - ModelHyperParams.d_key, ModelHyperParams.d_value, - ModelHyperParams.d_model, ModelHyperParams.d_inner_hid, - ModelHyperParams.dropout, ModelHyperParams.src_pad_idx, - ModelHyperParams.trg_pad_idx, ModelHyperParams.pos_pad_idx) - - -class TestTransformer(TestParallelExecutorBase): - @classmethod - def setUpClass(cls): - reader = paddle.batch( - 
wmt16.train(ModelHyperParams.src_vocab_size, - ModelHyperParams.trg_vocab_size), - batch_size=transformer_model.batch_size) - - with fluid.recordio_writer.create_recordio_writer( - "./wmt16.recordio") as writer: - for batch in reader(): - for tensor in prepare_batch_input( - batch, ModelHyperParams.src_pad_idx, - ModelHyperParams.trg_pad_idx, ModelHyperParams.n_head): - t = fluid.LoDTensor() - t.set(tensor, fluid.CPUPlace()) - writer.append_tensor(t) - writer.complete_append_tensor() - - @unittest.skip("transformer is buggy in multi gpu") - def test_main(self): - self.check_network_convergence(transformer) - - -class ParallelExecutorTestingDuringTraining(unittest.TestCase): - def check_network_convergence(self, build_strategy=None): - main = fluid.Program() - startup = fluid.Program() - with fluid.program_guard(main, startup): - loss = simple_fc_net(True) - test_program = main.clone(for_test=True) - - opt = fluid.optimizer.SGD(learning_rate=0.001) - opt.minimize(loss) - - batch_size = 32 - image = np.random.normal(size=(batch_size, 784)).astype('float32') - label = np.random.randint(0, 10, (batch_size, 1), dtype="int64") - - place = fluid.CUDAPlace(0) - exe = fluid.Executor(place) - exe.run(startup) - feed_dict = {'image': image, 'label': label} - - train_exe = fluid.ParallelExecutor( - use_cuda=True, - loss_name=loss.name, - main_program=main, - build_strategy=build_strategy) - - test_exe = fluid.ParallelExecutor( - use_cuda=True, - main_program=test_program, - share_vars_from=train_exe, - build_strategy=build_strategy) - - for i in xrange(5): - test_loss, = test_exe.run([loss.name], feed=feed_dict) - test_loss = np.array(test_loss) - - train_loss, = train_exe.run([loss.name], feed=feed_dict) - train_loss = np.array(train_loss) - self.assertTrue( - np.allclose( - train_loss, test_loss, atol=1e-8), - "Train loss: " + str(train_loss) + "\n Test loss:" + - str(test_loss)) - - def test_parallel_testing(self): - build_strategy = fluid.BuildStrategy() - 
build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce - self.check_network_convergence(build_strategy) - - def test_parallel_testing_with_new_strategy(self): - build_strategy = fluid.BuildStrategy() - build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce - self.check_network_convergence(build_strategy) - - -import paddle.dataset.conll05 as conll05 -import paddle.fluid as fluid - -word_dict, verb_dict, label_dict = conll05.get_dict() -word_dict_len = len(word_dict) -label_dict_len = len(label_dict) -pred_dict_len = len(verb_dict) -mark_dict_len = 2 -word_dim = 32 -mark_dim = 5 -hidden_dim = 512 -depth = 8 -mix_hidden_lr = 1e-3 -embedding_name = 'emb' - - -def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, - is_sparse, **ignored): - # 8 features - predicate_embedding = fluid.layers.embedding( - input=predicate, - is_sparse=is_sparse, - size=[pred_dict_len, word_dim], - dtype='float32', - param_attr='vemb') - - mark_embedding = fluid.layers.embedding( - input=mark, - is_sparse=is_sparse, - size=[mark_dict_len, mark_dim], - dtype='float32') - - word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2] - emb_layers = [ - fluid.layers.embedding( - size=[word_dict_len, word_dim], - is_sparse=is_sparse, - input=x, - param_attr=fluid.ParamAttr( - name=embedding_name, trainable=False)) for x in word_input - ] - emb_layers.append(predicate_embedding) - emb_layers.append(mark_embedding) - - hidden_0_layers = [ - fluid.layers.fc(input=emb, size=hidden_dim, act='tanh') - for emb in emb_layers - ] - - hidden_0 = fluid.layers.sums(input=hidden_0_layers) - - lstm_0 = fluid.layers.dynamic_lstm( - input=hidden_0, - size=hidden_dim, - candidate_activation='relu', - gate_activation='sigmoid', - cell_activation='sigmoid') - - # stack L-LSTM and R-LSTM with direct edges - input_tmp = [hidden_0, lstm_0] - - for i in range(1, depth): - mix_hidden = fluid.layers.sums(input=[ - fluid.layers.fc(input=input_tmp[0], 
size=hidden_dim, act='tanh'), - fluid.layers.fc(input=input_tmp[1], size=hidden_dim, act='tanh') - ]) - - lstm = fluid.layers.dynamic_lstm( - input=mix_hidden, - size=hidden_dim, - candidate_activation='relu', - gate_activation='sigmoid', - cell_activation='sigmoid', - is_reverse=((i % 2) == 1)) - - input_tmp = [mix_hidden, lstm] - - feature_out = fluid.layers.sums(input=[ - fluid.layers.fc(input=input_tmp[0], size=label_dict_len, act='tanh'), - fluid.layers.fc(input=input_tmp[1], size=label_dict_len, act='tanh') - ]) - - return feature_out - - -class TestCRFModel(unittest.TestCase): - def check_network_convergence(self, is_sparse, build_strategy=None): - main = fluid.Program() - startup = fluid.Program() - with fluid.program_guard(main, startup): - word = fluid.layers.data( - name='word_data', shape=[1], dtype='int64', lod_level=1) - predicate = fluid.layers.data( - name='verb_data', shape=[1], dtype='int64', lod_level=1) - ctx_n2 = fluid.layers.data( - name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1) - ctx_n1 = fluid.layers.data( - name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1) - ctx_0 = fluid.layers.data( - name='ctx_0_data', shape=[1], dtype='int64', lod_level=1) - ctx_p1 = fluid.layers.data( - name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1) - ctx_p2 = fluid.layers.data( - name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1) - mark = fluid.layers.data( - name='mark_data', shape=[1], dtype='int64', lod_level=1) - - feature_out = db_lstm(**locals()) - target = fluid.layers.data( - name='target', shape=[1], dtype='int64', lod_level=1) - crf_cost = fluid.layers.linear_chain_crf( - input=feature_out, - label=target, - param_attr=fluid.ParamAttr( - name='crfw', learning_rate=1e-1)) - avg_cost = fluid.layers.mean(crf_cost) - - sgd_optimizer = fluid.optimizer.SGD( - learning_rate=fluid.layers.exponential_decay( - learning_rate=0.01, - decay_steps=100000, - decay_rate=0.5, - staircase=True)) - sgd_optimizer.minimize(avg_cost) - - 
train_data = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.conll05.test(), buf_size=8192), - batch_size=16) - - place = fluid.CUDAPlace(0) - exe = fluid.Executor(place) - exe.run(startup) - - pe = fluid.ParallelExecutor( - use_cuda=True, - loss_name=avg_cost.name, - build_strategy=build_strategy) - - feeder = fluid.DataFeeder( - feed_list=[ - word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate, - mark, target - ], - place=fluid.CPUPlace()) - - data = train_data() - for i in xrange(10): - cur_batch = next(data) - print map(np.array, - pe.run(feed=feeder.feed(cur_batch), - fetch_list=[avg_cost.name]))[0] - - def test_update_sparse_parameter_all_reduce(self): - build_strategy = fluid.BuildStrategy() - build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce - self.check_network_convergence( - is_sparse=True, build_strategy=build_strategy) - - def test_update_dense_parameter_all_reduce(self): - build_strategy = fluid.BuildStrategy() - build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce - self.check_network_convergence( - is_sparse=False, build_strategy=build_strategy) - - def test_update_sparse_parameter_reduce(self): - build_strategy = fluid.BuildStrategy() - build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce - self.check_network_convergence( - is_sparse=True, build_strategy=build_strategy) - - def test_update_dense_parameter_reduce(self): - build_strategy = fluid.BuildStrategy() - build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce - self.check_network_convergence( - is_sparse=False, build_strategy=build_strategy) - - -# test fetch all the variables of global_block - -import paddle.dataset.flowers as flowers -import math - - -def Lenet(data, class_dim): - conv1 = fluid.layers.conv2d(data, 32, 5, 1, act=None) - bn1 = fluid.layers.batch_norm(conv1, act='relu') - pool1 = fluid.layers.pool2d(bn1, 2, 'max', 2) - conv2 = fluid.layers.conv2d(pool1, 50, 5, 1, act=None) 
- bn2 = fluid.layers.batch_norm(conv2, act='relu') - pool2 = fluid.layers.pool2d(bn2, 2, 'max', 2) - - fc1 = fluid.layers.fc(pool2, size=500, act='relu') - fc2 = fluid.layers.fc(fc1, size=class_dim, act='softmax') - - return fc2 - - -class TestFetchOp(unittest.TestCase): - def parallel_exe(self, train_inputs, seed): - main = fluid.Program() - startup = fluid.Program() - startup.random_seed = seed - with fluid.program_guard(main, startup): - data = fluid.layers.data( - name='image', shape=[3, 224, 224], dtype='float32') - label = fluid.layers.data(name='label', shape=[1], dtype='int64') - out = Lenet(data, class_dim=102) - loss = fluid.layers.cross_entropy(input=out, label=label) - loss = fluid.layers.mean(loss) - - opt = fluid.optimizer.Momentum( - learning_rate=0.1, - momentum=0.9, - regularization=fluid.regularizer.L2Decay(1e-4)) - - opt.minimize(loss) - - # TODO(zcd): I found that onece the memory optimizer is open, - # parallel_exe doesn't fetch some variable, such as conv2d_0.b_0@GRAD, - # conv2d_1.b_0@GRAD. Those variables should not be pruned. 
- # fluid.memory_optimize(main) - - place = fluid.CUDAPlace(0) - exe = fluid.Executor(place) - exe.run(startup) - - feeder = fluid.DataFeeder(place=place, feed_list=[data, label]) - pe = fluid.ParallelExecutor( - use_cuda=True, loss_name=loss.name, main_program=main) - - fetch_list = [] - all_vars = main.global_block().vars - for k, v in all_vars.iteritems(): - if 'tmp' not in k and k[0] is not '_' or v.persistable: - fetch_list.append(k) - - for data in train_inputs: - ret = pe.run(fetch_list, feed=feeder.feed(data)) - for i in range(len(fetch_list)): - assert not math.isnan(np.sum(ret[i])) and \ - not math.isinf(np.sum(ret[i])) - - def test_fetch_op(self): - tst_reader = paddle.batch(flowers.test(use_xmap=False), batch_size=16) - tst_reader_iter = tst_reader() - - iters = 3 - train_inputs = [] - for i in range(iters): - train_inputs.append(tst_reader_iter.next()) - - self.parallel_exe(train_inputs, seed=1) - - -class TestFeedParallel(unittest.TestCase): - def test_main(self): - main = fluid.Program() - startup = fluid.Program() - startup.random_seed = 1 - with fluid.scope_guard(fluid.core.Scope()): - with fluid.program_guard(main, startup): - data = fluid.layers.data( - name='image', shape=[3, 224, 224], dtype='float32') - label = fluid.layers.data( - name='label', shape=[1], dtype='int64') - out = Lenet(data, class_dim=102) - loss = fluid.layers.cross_entropy(input=out, label=label) - loss = fluid.layers.mean(loss) - opt = fluid.optimizer.Momentum( - learning_rate=0.1, - momentum=0.9, - regularization=fluid.regularizer.L2Decay(1e-4)) - - opt.minimize(loss) - place = fluid.CUDAPlace(0) - feeder = fluid.DataFeeder(place=place, feed_list=[data, label]) - reader = feeder.decorate_reader( - paddle.batch( - flowers.train(), batch_size=16), multi_devices=True) - exe = fluid.Executor(place) - exe.run(startup) - pe = fluid.ParallelExecutor( - use_cuda=True, loss_name=loss.name, main_program=main) - - for batch_id, data in enumerate(reader()): - loss_np = 
np.array(pe.run(feed=data, fetch_list=[loss.name])[0]) - print batch_id, loss_np - if batch_id == 2: - break - - -if __name__ == '__main__': - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py new file mode 100644 index 0000000000000000000000000000000000000000..66e138b03f3b170aca4fb2207438eb9af1783c33 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py @@ -0,0 +1,197 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import paddle.dataset.conll05 as conll05 +import paddle.fluid as fluid +import unittest +import paddle +import numpy as np + +word_dict, verb_dict, label_dict = conll05.get_dict() +word_dict_len = len(word_dict) +label_dict_len = len(label_dict) +pred_dict_len = len(verb_dict) +mark_dict_len = 2 +word_dim = 32 +mark_dim = 5 +hidden_dim = 512 +depth = 8 +mix_hidden_lr = 1e-3 +embedding_name = 'emb' + + +def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, + is_sparse, **ignored): + # 8 features + predicate_embedding = fluid.layers.embedding( + input=predicate, + is_sparse=is_sparse, + size=[pred_dict_len, word_dim], + dtype='float32', + param_attr='vemb') + + mark_embedding = fluid.layers.embedding( + input=mark, + is_sparse=is_sparse, + size=[mark_dict_len, mark_dim], + dtype='float32') + + word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2] + emb_layers = [ + fluid.layers.embedding( + size=[word_dict_len, word_dim], + is_sparse=is_sparse, + input=x, + param_attr=fluid.ParamAttr( + name=embedding_name, trainable=False)) for x in word_input + ] + emb_layers.append(predicate_embedding) + emb_layers.append(mark_embedding) + + hidden_0_layers = [ + fluid.layers.fc(input=emb, size=hidden_dim, act='tanh') + for emb in emb_layers + ] + + hidden_0 = fluid.layers.sums(input=hidden_0_layers) + + lstm_0 = fluid.layers.dynamic_lstm( + input=hidden_0, + size=hidden_dim, + candidate_activation='relu', + gate_activation='sigmoid', + cell_activation='sigmoid') + + # stack L-LSTM and R-LSTM with direct edges + input_tmp = [hidden_0, lstm_0] + + for i in range(1, depth): + mix_hidden = fluid.layers.sums(input=[ + fluid.layers.fc(input=input_tmp[0], size=hidden_dim, act='tanh'), + fluid.layers.fc(input=input_tmp[1], size=hidden_dim, act='tanh') + ]) + + lstm = fluid.layers.dynamic_lstm( + input=mix_hidden, + size=hidden_dim, + candidate_activation='relu', + gate_activation='sigmoid', + cell_activation='sigmoid', + is_reverse=((i % 2) == 1)) + + 
input_tmp = [mix_hidden, lstm] + + feature_out = fluid.layers.sums(input=[ + fluid.layers.fc(input=input_tmp[0], size=label_dict_len, act='tanh'), + fluid.layers.fc(input=input_tmp[1], size=label_dict_len, act='tanh') + ]) + + return feature_out + + +class TestCRFModel(unittest.TestCase): + def check_network_convergence(self, is_sparse, build_strategy=None): + main = fluid.Program() + startup = fluid.Program() + with fluid.program_guard(main, startup): + word = fluid.layers.data( + name='word_data', shape=[1], dtype='int64', lod_level=1) + predicate = fluid.layers.data( + name='verb_data', shape=[1], dtype='int64', lod_level=1) + ctx_n2 = fluid.layers.data( + name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1) + ctx_n1 = fluid.layers.data( + name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1) + ctx_0 = fluid.layers.data( + name='ctx_0_data', shape=[1], dtype='int64', lod_level=1) + ctx_p1 = fluid.layers.data( + name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1) + ctx_p2 = fluid.layers.data( + name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1) + mark = fluid.layers.data( + name='mark_data', shape=[1], dtype='int64', lod_level=1) + + feature_out = db_lstm(**locals()) + target = fluid.layers.data( + name='target', shape=[1], dtype='int64', lod_level=1) + crf_cost = fluid.layers.linear_chain_crf( + input=feature_out, + label=target, + param_attr=fluid.ParamAttr( + name='crfw', learning_rate=1e-1)) + avg_cost = fluid.layers.mean(crf_cost) + + sgd_optimizer = fluid.optimizer.SGD( + learning_rate=fluid.layers.exponential_decay( + learning_rate=0.01, + decay_steps=100000, + decay_rate=0.5, + staircase=True)) + sgd_optimizer.minimize(avg_cost) + + train_data = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.conll05.test(), buf_size=8192), + batch_size=16) + + place = fluid.CUDAPlace(0) + exe = fluid.Executor(place) + exe.run(startup) + + pe = fluid.ParallelExecutor( + use_cuda=True, + loss_name=avg_cost.name, + 
build_strategy=build_strategy) + + feeder = fluid.DataFeeder( + feed_list=[ + word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate, + mark, target + ], + place=fluid.CPUPlace()) + + data = train_data() + for i in xrange(10): + cur_batch = next(data) + print map(np.array, + pe.run(feed=feeder.feed(cur_batch), + fetch_list=[avg_cost.name]))[0] + + def test_update_sparse_parameter_all_reduce(self): + build_strategy = fluid.BuildStrategy() + build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce + self.check_network_convergence( + is_sparse=True, build_strategy=build_strategy) + + def test_update_dense_parameter_all_reduce(self): + build_strategy = fluid.BuildStrategy() + build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce + self.check_network_convergence( + is_sparse=False, build_strategy=build_strategy) + + def test_update_sparse_parameter_reduce(self): + build_strategy = fluid.BuildStrategy() + build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce + self.check_network_convergence( + is_sparse=True, build_strategy=build_strategy) + + def test_update_dense_parameter_reduce(self): + build_strategy = fluid.BuildStrategy() + build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce + self.check_network_convergence( + is_sparse=False, build_strategy=build_strategy) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py new file mode 100644 index 0000000000000000000000000000000000000000..24f8d28c0304a77a99213374b25d0db728eca265 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py @@ -0,0 +1,132 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.dataset.flowers as flowers +import math +import paddle.fluid as fluid +import unittest +import numpy as np +import paddle + + +def Lenet(data, class_dim): + conv1 = fluid.layers.conv2d(data, 32, 5, 1, act=None) + bn1 = fluid.layers.batch_norm(conv1, act='relu') + pool1 = fluid.layers.pool2d(bn1, 2, 'max', 2) + conv2 = fluid.layers.conv2d(pool1, 50, 5, 1, act=None) + bn2 = fluid.layers.batch_norm(conv2, act='relu') + pool2 = fluid.layers.pool2d(bn2, 2, 'max', 2) + + fc1 = fluid.layers.fc(pool2, size=500, act='relu') + fc2 = fluid.layers.fc(fc1, size=class_dim, act='softmax') + + return fc2 + + +class TestFetchOp(unittest.TestCase): + def parallel_exe(self, train_inputs, seed): + main = fluid.Program() + startup = fluid.Program() + startup.random_seed = seed + with fluid.program_guard(main, startup): + data = fluid.layers.data( + name='image', shape=[3, 224, 224], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + out = Lenet(data, class_dim=102) + loss = fluid.layers.cross_entropy(input=out, label=label) + loss = fluid.layers.mean(loss) + + opt = fluid.optimizer.Momentum( + learning_rate=0.1, + momentum=0.9, + regularization=fluid.regularizer.L2Decay(1e-4)) + + opt.minimize(loss) + + # TODO(zcd): I found that onece the memory optimizer is open, + # parallel_exe doesn't fetch some variable, such as conv2d_0.b_0@GRAD, + # conv2d_1.b_0@GRAD. 
Those variables should not be pruned. + # fluid.memory_optimize(main) + + place = fluid.CUDAPlace(0) + exe = fluid.Executor(place) + exe.run(startup) + + feeder = fluid.DataFeeder(place=place, feed_list=[data, label]) + pe = fluid.ParallelExecutor( + use_cuda=True, loss_name=loss.name, main_program=main) + + fetch_list = [] + all_vars = main.global_block().vars + for k, v in all_vars.iteritems(): + if 'tmp' not in k and k[0] is not '_' or v.persistable: + fetch_list.append(k) + + for data in train_inputs: + ret = pe.run(fetch_list, feed=feeder.feed(data)) + for i in range(len(fetch_list)): + assert not math.isnan(np.sum(ret[i])) and \ + not math.isinf(np.sum(ret[i])) + + def test_fetch_op(self): + tst_reader = paddle.batch(flowers.test(use_xmap=False), batch_size=16) + tst_reader_iter = tst_reader() + + iters = 3 + train_inputs = [] + for i in range(iters): + train_inputs.append(tst_reader_iter.next()) + + self.parallel_exe(train_inputs, seed=1) + + +class TestFeedParallel(unittest.TestCase): + def test_main(self): + main = fluid.Program() + startup = fluid.Program() + startup.random_seed = 1 + with fluid.scope_guard(fluid.core.Scope()): + with fluid.program_guard(main, startup): + data = fluid.layers.data( + name='image', shape=[3, 224, 224], dtype='float32') + label = fluid.layers.data( + name='label', shape=[1], dtype='int64') + out = Lenet(data, class_dim=102) + loss = fluid.layers.cross_entropy(input=out, label=label) + loss = fluid.layers.mean(loss) + opt = fluid.optimizer.Momentum( + learning_rate=0.1, + momentum=0.9, + regularization=fluid.regularizer.L2Decay(1e-4)) + + opt.minimize(loss) + place = fluid.CUDAPlace(0) + feeder = fluid.DataFeeder(place=place, feed_list=[data, label]) + reader = feeder.decorate_reader( + paddle.batch( + flowers.train(), batch_size=16), multi_devices=True) + exe = fluid.Executor(place) + exe.run(startup) + pe = fluid.ParallelExecutor( + use_cuda=True, loss_name=loss.name, main_program=main) + + for batch_id, data in 
enumerate(reader()): + loss_np = np.array(pe.run(feed=data, fetch_list=[loss.name])[0]) + print batch_id, loss_np + if batch_id == 2: + break + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py new file mode 100644 index 0000000000000000000000000000000000000000..015703c3e25f4e11e64ab6a7de99da12bee608f6 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py @@ -0,0 +1,171 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from parallel_executor_test_base import TestParallelExecutorBase +import paddle.fluid as fluid +import numpy as np +import paddle +import paddle.dataset.mnist as mnist +import unittest + +MNIST_RECORDIO_FILE = "./mnist_test_pe.recordio" + + +def simple_fc_net(use_feed): + if use_feed: + img = fluid.layers.data(name='image', shape=[784], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + else: + reader = fluid.layers.open_files( + filenames=[MNIST_RECORDIO_FILE], + shapes=[[-1, 784], [-1, 1]], + lod_levels=[0, 0], + dtypes=['float32', 'int64'], + thread_num=1, + for_parallel=True) + reader = fluid.layers.io.double_buffer(reader) + img, label = fluid.layers.read_file(reader) + hidden = img + for _ in xrange(4): + hidden = fluid.layers.fc( + hidden, + size=200, + act='tanh', + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=1.0))) + prediction = fluid.layers.fc(hidden, size=10, act='softmax') + loss = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.mean(loss) + return loss + + +def fc_with_batchnorm(use_feed): + if use_feed: + img = fluid.layers.data(name='image', shape=[784], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + else: + reader = fluid.layers.open_files( + filenames=[MNIST_RECORDIO_FILE], + shapes=[[-1, 784], [-1, 1]], + lod_levels=[0, 0], + dtypes=['float32', 'int64'], + thread_num=1, + for_parallel=True) + reader = fluid.layers.io.double_buffer(reader) + img, label = fluid.layers.read_file(reader) + + hidden = img + for _ in xrange(1): + hidden = fluid.layers.fc( + hidden, + size=200, + act='tanh', + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=1.0))) + + hidden = fluid.layers.batch_norm(input=hidden) + + prediction = fluid.layers.fc(hidden, size=10, act='softmax') + loss = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.mean(loss) + return loss + + +class 
TestMNIST(TestParallelExecutorBase): + @classmethod + def setUpClass(cls): + # Convert mnist to recordio file + with fluid.program_guard(fluid.Program(), fluid.Program()): + reader = paddle.batch(mnist.train(), batch_size=4) + feeder = fluid.DataFeeder( + feed_list=[ # order is image and label + fluid.layers.data( + name='image', shape=[784]), + fluid.layers.data( + name='label', shape=[1], dtype='int64'), + ], + place=fluid.CPUPlace()) + fluid.recordio_writer.convert_reader_to_recordio_file( + MNIST_RECORDIO_FILE, reader, feeder) + + def check_simple_fc_convergence(self, balance_parameter_opt_between_cards): + self.check_network_convergence(simple_fc_net) + self.check_network_convergence(simple_fc_net, allow_op_delay=True) + + img = np.zeros(shape=[32, 784], dtype='float32') + label = np.ones(shape=[32, 1], dtype='int64') + self.check_network_convergence( + simple_fc_net, + feed_dict={"image": img, + "label": label}, + balance_parameter_opt_between_cards=balance_parameter_opt_between_cards + ) + + def test_simple_fc(self): + self.check_simple_fc_convergence(False) + + def test_simple_fc_with_new_strategy(self): + self.check_simple_fc_convergence(True) + + def check_simple_fc_parallel_accuracy(self, + balance_parameter_opt_between_cards): + img = np.zeros(shape=[32, 784], dtype='float32') + label = np.ones(shape=[32, 1], dtype='int64') + single_first_loss, single_last_loss = self.check_network_convergence( + method=simple_fc_net, + seed=1000, + feed_dict={"image": img, + "label": label}, + use_parallel_executor=False) + parallel_first_loss, parallel_last_loss = self.check_network_convergence( + method=simple_fc_net, + seed=1000, + feed_dict={"image": img, + "label": label}, + use_parallel_executor=True, + balance_parameter_opt_between_cards=balance_parameter_opt_between_cards + ) + + for p_f in parallel_first_loss: + self.assertAlmostEquals(p_f, single_first_loss[0], delta=1e-6) + for p_l in parallel_last_loss: + self.assertAlmostEquals(p_l, single_last_loss[0], 
delta=1e-6) + + def test_simple_fc_parallel_accuracy(self): + self.check_simple_fc_parallel_accuracy(False) + + def test_simple_fc_parallel_accuracy_with_new_strategy(self): + self.check_simple_fc_parallel_accuracy(True) + + def check_batchnorm_fc_convergence(self, + balance_parameter_opt_between_cards): + self.check_network_convergence(fc_with_batchnorm) + img = np.zeros(shape=[32, 784], dtype='float32') + label = np.ones(shape=[32, 1], dtype='int64') + self.check_network_convergence( + fc_with_batchnorm, + feed_dict={"image": img, + "label": label}, + balance_parameter_opt_between_cards=balance_parameter_opt_between_cards + ) + + def test_batchnorm_fc(self): + self.check_batchnorm_fc_convergence(False) + + def test_batchnorm_fc_with_new_strategy(self): + self.check_batchnorm_fc_convergence(True) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py new file mode 100644 index 0000000000000000000000000000000000000000..a3fa140cbb7994a36d2cbee26d598165f1f771d2 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py @@ -0,0 +1,152 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import paddle.fluid as fluid +from parallel_executor_test_base import TestParallelExecutorBase +import unittest + + +def squeeze_excitation(input, num_channels, reduction_ratio): + # pool = fluid.layers.pool2d( + # input=input, pool_size=0, pool_type='avg', global_pooling=True) + conv = input + shape = conv.shape + reshape = fluid.layers.reshape( + x=conv, shape=[-1, shape[1], shape[2] * shape[3]]) + pool = fluid.layers.reduce_mean(input=reshape, dim=2) + + squeeze = fluid.layers.fc(input=pool, + size=num_channels / reduction_ratio, + act='relu') + excitation = fluid.layers.fc(input=squeeze, + size=num_channels, + act='sigmoid') + scale = fluid.layers.elementwise_mul(x=input, y=excitation, axis=0) + return scale + + +def conv_bn_layer(input, num_filters, filter_size, stride=1, groups=1, + act=None): + conv = fluid.layers.conv2d( + input=input, + num_filters=num_filters, + filter_size=filter_size, + stride=stride, + padding=(filter_size - 1) / 2, + groups=groups, + act=None, + bias_attr=False) + return fluid.layers.batch_norm(input=conv, act=act, momentum=0.1) + + +def shortcut(input, ch_out, stride): + ch_in = input.shape[1] + if ch_in != ch_out: + if stride == 1: + filter_size = 1 + else: + filter_size = 3 + return conv_bn_layer(input, ch_out, filter_size, stride) + else: + return input + + +def bottleneck_block(input, num_filters, stride, cardinality, reduction_ratio): + # The number of first 1x1 convolutional channels for each bottleneck build block + # was halved to reduce the compution cost. 
+    conv0 = conv_bn_layer(
+        input=input, num_filters=num_filters, filter_size=1, act='relu')
+    conv1 = conv_bn_layer(
+        input=conv0,
+        num_filters=num_filters * 2,
+        filter_size=3,
+        stride=stride,
+        groups=cardinality,
+        act='relu')
+    conv2 = conv_bn_layer(
+        input=conv1, num_filters=num_filters * 2, filter_size=1, act=None)
+    scale = squeeze_excitation(
+        input=conv2,
+        num_channels=num_filters * 2,
+        reduction_ratio=reduction_ratio)
+
+    short = shortcut(input, num_filters * 2, stride)
+
+    return fluid.layers.elementwise_add(x=short, y=scale, act='relu')
+
+
+def SE_ResNeXt50Small(batch_size=2, use_feed=False):
+    """Build the small SE-ResNeXt-50 test network and return its loss.
+
+    The image and label are created with ``fill_constant`` (all zeros), so
+    the network needs no feeding.
+
+    Args:
+        batch_size: leading dimension of the constant image and label.
+        use_feed: must be falsy; feeding is not supported and asserts.
+
+    Returns:
+        The mean cross-entropy loss variable of the 1000-way classifier.
+    """
+    assert not use_feed, "SE_ResNeXt doesn't support feed yet"
+
+    img = fluid.layers.fill_constant(
+        shape=[batch_size, 3, 224, 224], dtype='float32', value=0.0)
+    label = fluid.layers.fill_constant(
+        shape=[batch_size, 1], dtype='int64', value=0.0)
+
+    # Stem: three 3x3 conv_bn_layer calls with 16 filters (only the first
+    # uses stride 2), followed by a 3x3 max pool with stride 2.
+    conv = conv_bn_layer(
+        input=img, num_filters=16, filter_size=3, stride=2, act='relu')
+    conv = conv_bn_layer(
+        input=conv, num_filters=16, filter_size=3, stride=1, act='relu')
+    conv = conv_bn_layer(
+        input=conv, num_filters=16, filter_size=3, stride=1, act='relu')
+    conv = fluid.layers.pool2d(
+        input=conv, pool_size=3, pool_stride=2, pool_padding=1, pool_type='max')
+
+    cardinality = 32
+    reduction_ratio = 16
+    # depth[k] bottleneck blocks are stacked in stage k, each built with
+    # num_filters[k].
+    depth = [3, 4, 6, 3]
+    num_filters = [128, 256, 512, 1024]
+
+    for block in range(len(depth)):
+        for i in range(depth[block]):
+            # The first bottleneck of every stage except stage 0 uses
+            # stride 2; all other bottlenecks use stride 1.
+            conv = bottleneck_block(
+                input=conv,
+                num_filters=num_filters[block],
+                stride=2 if i == 0 and block != 0 else 1,
+                cardinality=cardinality,
+                reduction_ratio=reduction_ratio)
+
+    # Global average pooling: merge the two trailing dimensions and take the
+    # mean over the merged axis.
+    shape = conv.shape
+    reshape = fluid.layers.reshape(
+        x=conv, shape=[-1, shape[1], shape[2] * shape[3]])
+    pool = fluid.layers.reduce_mean(input=reshape, dim=2)
+    dropout = fluid.layers.dropout(x=pool, dropout_prob=0.2)
+    # Classifier layer:
+    prediction = fluid.layers.fc(input=dropout, size=1000, act='softmax')
+    loss = fluid.layers.cross_entropy(input=prediction, label=label)
+    loss = fluid.layers.mean(loss)
+    return loss
+
+
+class TestResnet(TestParallelExecutorBase):
+    """Runs SE_ResNeXt50Small through check_network_convergence.
+
+    NOTE(review): check_network_convergence is not defined in this class;
+    presumably it is inherited from TestParallelExecutorBase -- confirm.
+    """
+
+    def check_resnet_convergence(self, balance_parameter_opt_between_cards):
+        """Run 20 iterations of SE_ResNeXt50Small with a batch size of 2.
+
+        Args:
+            balance_parameter_opt_between_cards: forwarded unchanged to
+                check_network_convergence under the same keyword.
+        """
+        import functools
+        batch_size = 2
+        # Bind batch_size so the network's constant inputs are built with the
+        # same batch size that is passed to check_network_convergence.
+        self.check_network_convergence(
+            functools.partial(
+                SE_ResNeXt50Small, batch_size=batch_size),
+            iter=20,
+            batch_size=batch_size,
+            balance_parameter_opt_between_cards=balance_parameter_opt_between_cards
+        )
+
+    def test_resnet(self):
+        """Check with balance_parameter_opt_between_cards=False."""
+        self.check_resnet_convergence(False)
+
+    def test_resnet_with_new_strategy(self):
+        """Check with balance_parameter_opt_between_cards=True."""
+        self.check_resnet_convergence(True)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
new file mode 100644
index 0000000000000000000000000000000000000000..93a5f767867d68110cf7b8f441cc740ecd843cf9
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
@@ -0,0 +1,93 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.fluid as fluid
+import numpy as np
+import unittest
+
+
+def simple_fc_net():
+    """Build a small fully-connected classifier and return its loss.
+
+    The net reads a 784-wide float32 'image' and an int64 'label', applies
+    four 200-unit tanh FC layers whose biases are initialised to 1.0, then a
+    10-way softmax FC layer.
+
+    Returns:
+        The mean cross-entropy loss variable.
+    """
+    img = fluid.layers.data(name='image', shape=[784], dtype='float32')
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+    hidden = img
+    # range (not the Python-2-only xrange) keeps this runnable on Python 3.
+    for _ in range(4):
+        hidden = fluid.layers.fc(
+            hidden,
+            size=200,
+            act='tanh',
+            bias_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Constant(value=1.0)))
+    prediction = fluid.layers.fc(hidden, size=10, act='softmax')
+    loss = fluid.layers.cross_entropy(input=prediction, label=label)
+    loss = fluid.layers.mean(loss)
+    return loss
+
+
+class ParallelExecutorTestingDuringTraining(unittest.TestCase):
+    """Checks that a test-mode ParallelExecutor sharing variables with a
+    training ParallelExecutor reports the same loss as the training one."""
+
+    def check_network_convergence(self, build_strategy=None):
+        """Interleave evaluation and training and compare the losses.
+
+        For five iterations the test executor is run first and the train
+        executor second, both on the same fixed random batch; the two losses
+        must agree within atol=1e-8. Uses CUDAPlace(0) and use_cuda=True.
+
+        Args:
+            build_strategy: passed to both ParallelExecutor instances.
+        """
+        main = fluid.Program()
+        startup = fluid.Program()
+        with fluid.program_guard(main, startup):
+            loss = simple_fc_net()
+            # Clone before minimize() so the test program carries no
+            # optimizer ops.
+            test_program = main.clone(for_test=True)
+
+            opt = fluid.optimizer.SGD(learning_rate=0.001)
+            opt.minimize(loss)
+
+            batch_size = 32
+            image = np.random.normal(size=(batch_size, 784)).astype('float32')
+            label = np.random.randint(0, 10, (batch_size, 1), dtype="int64")
+
+            place = fluid.CUDAPlace(0)
+            exe = fluid.Executor(place)
+            exe.run(startup)
+            feed_dict = {'image': image, 'label': label}
+
+            train_exe = fluid.ParallelExecutor(
+                use_cuda=True,
+                loss_name=loss.name,
+                main_program=main,
+                build_strategy=build_strategy)
+
+            test_exe = fluid.ParallelExecutor(
+                use_cuda=True,
+                main_program=test_program,
+                share_vars_from=train_exe,
+                build_strategy=build_strategy)
+
+            for _ in range(5):
+                # Evaluate before the training step: both runs then see the
+                # same parameter values for this batch.
+                test_loss, = test_exe.run([loss.name], feed=feed_dict)
+                test_loss = np.array(test_loss)
+
+                train_loss, = train_exe.run([loss.name], feed=feed_dict)
+                train_loss = np.array(train_loss)
+                self.assertTrue(
+                    np.allclose(
+                        train_loss, test_loss, atol=1e-8),
+                    "Train loss: " + str(train_loss) + "\n Test loss:" +
+                    str(test_loss))
+
+    def test_parallel_testing(self):
+        """Run the check with ReduceStrategy.AllReduce."""
+        build_strategy = fluid.BuildStrategy()
+        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
+        self.check_network_convergence(build_strategy)
+
+    def test_parallel_testing_with_new_strategy(self):
+        """Run the check with ReduceStrategy.Reduce."""
+        build_strategy = fluid.BuildStrategy()
+        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
+        self.check_network_convergence(build_strategy)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..c81df66d987f3d3856af0e19fc935df7de2edacc
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
@@ -0,0 +1,174 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.fluid as fluid
+import transformer_model
+import numpy as np
+from parallel_executor_test_base import TestParallelExecutorBase
+import unittest
+import paddle
+import paddle.dataset.wmt16 as wmt16
+
+WMT16_RECORDIO_FILE = "./wmt16_test_pe.recordio"
+
+
+class ModelHyperParams(object):
+    # Dictionary size for source and target language. This model directly uses
+    # paddle.dataset.wmt16, in which the start, end and unknown tokens have
+    # already been added but the padding token has not. Transformer requires
+    # that sequences in a mini-batch are padded to the same length, so a
+    # padding token is added on top of the original paddle.dataset.wmt16
+    # dictionary (its index is the vocabulary size, see below).
+
+    # size of source word dictionary.
+    src_vocab_size = 10000
+    # index for the padding token in source language.
+    src_pad_idx = src_vocab_size
+
+    # size of target word dictionary.
+    trg_vocab_size = 10000
+    # index for the padding token in target language.
+    trg_pad_idx = trg_vocab_size
+
+    # position value corresponding to the padding token.
+    pos_pad_idx = 0
+
+    # max length of sequences. One is added to it where the model is built
+    # to include the position padding token for position encoding.
+    max_length = 50
+
+    # the dimension for word embeddings, which is also the last dimension of
+    # the input and output of multi-head attention, position-wise feed-forward
+    # networks, encoder and decoder.
+    d_model = 512
+    # size of the hidden layer in position-wise feed-forward networks.
+    d_inner_hid = 1024
+    # the dimension that keys are projected to for dot-product attention.
+    d_key = 64
+    # the dimension that values are projected to for dot-product attention.
+    d_value = 64
+    # number of heads used in multi-head attention.
+    n_head = 8
+    # number of sub-layers to be stacked in the encoder and decoder.
+    n_layer = 6
+    # dropout rate used by all dropout layers.
+    dropout = 0.1
+
+
+def prepare_batch_input(insts, src_pad_idx, trg_pad_idx, n_head):
+    """
+    Pad the instances to the max sequence length in batch, and generate the
+    corresponding position data and attention bias.
+
+    Args:
+        insts: batch of instances; inst[0], inst[1] and inst[2] are the
+            source, target and label id lists of one instance.
+        src_pad_idx: id used to pad source sequences.
+        trg_pad_idx: id used to pad target and label sequences.
+        n_head: number of attention heads the biases are tiled over.
+
+    Returns:
+        A list of numpy arrays, in this order: src_word, src_pos, trg_word,
+        trg_pos, src_slf_attn_bias, trg_slf_attn_bias, trg_src_attn_bias,
+        lbl_word, lbl_weight.
+    """
+
+    def __pad_batch_data(insts,
+                         pad_idx,
+                         is_target=False,
+                         return_pos=True,
+                         return_attn_bias=True,
+                         return_max_len=True):
+        """
+        Pad the instances to the max sequence length in batch, and generate the
+        corresponding position data and attention bias.
+
+        Returns the padded ids as an int64 column, followed (when the
+        matching flag is set) by the positions, the float32 self-attention
+        bias and max_len. A single result is returned bare, not in a list.
+        """
+        return_list = []
+        max_len = max(len(inst) for inst in insts)
+        inst_data = np.array(
+            [inst + [pad_idx] * (max_len - len(inst)) for inst in insts])
+        return_list += [inst_data.astype("int64").reshape([-1, 1])]
+        if return_pos:
+            # Positions are 1-based; padded slots get position 0.
+            inst_pos = np.array([[
+                pos_i + 1 if w_i != pad_idx else 0
+                for pos_i, w_i in enumerate(inst)
+            ] for inst in inst_data])
+
+            return_list += [inst_pos.astype("int64").reshape([-1, 1])]
+        if return_attn_bias:
+            if is_target:
+                # This is used to avoid attention on paddings and subsequent
+                # words.
+                slf_attn_bias_data = np.ones((inst_data.shape[0], max_len,
+                                              max_len))
+                slf_attn_bias_data = np.triu(slf_attn_bias_data, 1).reshape(
+                    [-1, 1, max_len, max_len])
+                slf_attn_bias_data = np.tile(slf_attn_bias_data,
+                                             [1, n_head, 1, 1]) * [-1e9]
+            else:
+                # This is used to avoid attention on paddings.
+                slf_attn_bias_data = np.array([[0] * len(inst) + [-1e9] *
+                                               (max_len - len(inst))
+                                               for inst in insts])
+                slf_attn_bias_data = np.tile(
+                    slf_attn_bias_data.reshape([-1, 1, 1, max_len]),
+                    [1, n_head, max_len, 1])
+            return_list += [slf_attn_bias_data.astype("float32")]
+        if return_max_len:
+            return_list += [max_len]
+        return return_list if len(return_list) > 1 else return_list[0]
+
+    src_word, src_pos, src_slf_attn_bias, src_max_len = __pad_batch_data(
+        [inst[0] for inst in insts], src_pad_idx, is_target=False)
+    trg_word, trg_pos, trg_slf_attn_bias, trg_max_len = __pad_batch_data(
+        [inst[1] for inst in insts], trg_pad_idx, is_target=True)
+    # A step of src_max_len over an axis of length src_max_len keeps only
+    # row 0 of the source bias; that row is repeated once per target position.
+    trg_src_attn_bias = np.tile(src_slf_attn_bias[:, :, ::src_max_len, :],
+                                [1, 1, trg_max_len, 1]).astype("float32")
+    lbl_word = __pad_batch_data([inst[2] for inst in insts], trg_pad_idx, False,
+                                False, False, False)
+    # Weight is 1.0 for real label tokens and 0.0 for padding.
+    lbl_weight = (lbl_word != trg_pad_idx).astype("float32").reshape([-1, 1])
+
+    return [
+        src_word, src_pos, trg_word, trg_pos, src_slf_attn_bias,
+        trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight
+    ]
+
+
+def transformer(use_feed):
+    """Build the transformer network from ModelHyperParams.
+
+    Args:
+        use_feed: must be falsy; feeding is not supported and asserts.
+
+    Returns:
+        Whatever transformer_model.transformer returns.
+    """
+    assert not use_feed, "transfomer doesn't support feed yet"
+    # Vocabulary sizes and max_length are each passed with 1 added: the
+    # padding ids equal the vocabulary sizes, and max_length needs one more
+    # slot for the position padding token.
+    return transformer_model.transformer(
+        ModelHyperParams.src_vocab_size + 1,
+        ModelHyperParams.trg_vocab_size + 1, ModelHyperParams.max_length + 1,
+        ModelHyperParams.n_layer, ModelHyperParams.n_head,
+        ModelHyperParams.d_key, ModelHyperParams.d_value,
+        ModelHyperParams.d_model, ModelHyperParams.d_inner_hid,
+        ModelHyperParams.dropout, ModelHyperParams.src_pad_idx,
+        ModelHyperParams.trg_pad_idx, ModelHyperParams.pos_pad_idx)
+
+
+class TestTransformer(TestParallelExecutorBase):
+    """Transformer test case; test_main is currently skipped.
+
+    NOTE(review): check_network_convergence is not defined in this class;
+    presumably it is inherited from TestParallelExecutorBase -- confirm.
+    """
+
+    @classmethod
+    def setUpClass(cls):
+        """Write the batched WMT16 training data to WMT16_RECORDIO_FILE.
+
+        Each batch goes through prepare_batch_input; every array it returns
+        is appended as a tensor, then complete_append_tensor() is called
+        once for the batch.
+        """
+        reader = paddle.batch(
+            wmt16.train(ModelHyperParams.src_vocab_size,
+                        ModelHyperParams.trg_vocab_size),
+            batch_size=transformer_model.batch_size)
+
+        with fluid.recordio_writer.create_recordio_writer(
+                WMT16_RECORDIO_FILE) as writer:
+            for batch in reader():
+                for tensor in prepare_batch_input(
+                        batch, ModelHyperParams.src_pad_idx,
+                        ModelHyperParams.trg_pad_idx, ModelHyperParams.n_head):
+                    t = fluid.LoDTensor()
+                    t.set(tensor, fluid.CPUPlace())
+                    writer.append_tensor(t)
+                writer.complete_append_tensor()
+
+    @unittest.skip("transformer is buggy in multi gpu")
+    def test_main(self):
+        self.check_network_convergence(transformer)
+
+
+if __name__ == '__main__':
+    unittest.main()