Commit 4d2a2e75 authored by baiyfbupt

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into develop

@@ -24,7 +24,7 @@ set(BOOST_PROJECT "extern_boost")
 # So we use 1.41.0 here.
 set(BOOST_VER "1.41.0")
 set(BOOST_TAR "boost_1_41_0")
-set(BOOST_URL "http://paddlepaddledeps.bj.bcebos.com/${BOOST_TAR}.tar.gz")
+set(BOOST_URL "http://paddlepaddledeps.cdn.bcebos.com/${BOOST_TAR}.tar.gz")
 set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost)
 set(BOOST_DOWNLOAD_DIR "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}")
 set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}/${BOOST_TAR}" CACHE PATH "boost include directory." FORCE)
......
@@ -21,11 +21,12 @@ else()
 ExternalProject_Add(
     extern_eigen3
     ${EXTERNAL_PROJECT_LOG_ARGS}
-    GIT_REPOSITORY "https://github.com/RLovelett/eigen.git"
+    GIT_REPOSITORY "https://github.com/eigenteam/eigen-git-mirror"
     # eigen on cuda9.1 missing header of math_funtions.hpp
     # https://stackoverflow.com/questions/43113508/math-functions-hpp-not-found-when-using-cuda-with-eigen
     GIT_TAG 917060c364181f33a735dc023818d5a54f60e54c
     PREFIX ${EIGEN_SOURCE_DIR}
+    DOWNLOAD_NAME "eigen"
     UPDATE_COMMAND ""
     CONFIGURE_COMMAND ""
     BUILD_COMMAND ""
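Note: RLovelett/eigen and eigenteam/eigen-git-mirror are the same repository; it was renamed on GitHub, which is why the pinned GIT_TAG commit can stay unchanged.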
......
@@ -53,11 +53,9 @@ ExternalProject_Add(
     ${EXTERNAL_PROJECT_LOG_ARGS}
     DEPENDS ${MKLDNN_DEPENDS}
     GIT_REPOSITORY "https://github.com/01org/mkl-dnn.git"
-    GIT_TAG "v0.14"
+    GIT_TAG "db3424ad44901513c03a1ea31ccaacdf633fbe9f"
     PREFIX ${MKLDNN_SOURCES_DIR}
     UPDATE_COMMAND ""
-    # Patch MKLDNN to compile with gcc 4.8, the related issue is in intel/mkl-dnn#237.
-    PATCH_COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_SOURCE_DIR}/patches/mkldnn.hpp ${MKLDNN_SOURCES_DIR}/src/extern_mkldnn/include/mkldnn.hpp
     CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR}
     CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
     CMAKE_ARGS -DMKLROOT=${MKLML_ROOT}
......
@@ -28,7 +28,7 @@ INCLUDE(ExternalProject)
 SET(MKLML_PROJECT "extern_mklml")
 SET(MKLML_VER "mklml_lnx_2018.0.3.20180406")
-SET(MKLML_URL "http://paddlepaddledeps.bj.bcebos.com/${MKLML_VER}.tgz")
+SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz")
 SET(MKLML_SOURCE_DIR "${THIRD_PARTY_PATH}/mklml")
 SET(MKLML_DOWNLOAD_DIR "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}")
 SET(MKLML_DST_DIR "mklml")
......
@@ -98,6 +98,14 @@ elseif (WITH_MKLML)
     )
 endif()
 
+if(WITH_MKLDNN)
+    set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/mkldnn")
+    copy(mkldnn_lib
+        SRCS ${MKLDNN_INC_DIR} ${MKLDNN_SHARED_LIB}
+        DSTS ${dst_dir} ${dst_dir}/lib
+    )
+endif()
+
 if(NOT MOBILE_INFERENCE AND NOT RPI)
     set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/snappy")
     copy(snappy_lib
......
@@ -186,11 +186,7 @@ endif()
 add_subdirectory(detail)
 
 if(WITH_DISTRIBUTE)
-    if(WITH_GPU)
-        op_library(gen_nccl_id_op DEPS nccl_common)
-    else()
-        set(DEPS_OPS ${DEPS_OPS} gen_nccl_id_op)
-    endif()
     set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf)
     set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
     op_library(send_op DEPS ${DISTRIBUTE_DEPS})
@@ -208,6 +204,12 @@ if(WITH_DISTRIBUTE)
     set_source_files_properties(send_recv_op_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
     cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS prefetch_op send_op listen_and_serv_op sum_op executor)
     cc_test(test_send_nccl_id SRCS test_send_nccl_id.cc DEPS send_op listen_and_serv_op executor)
+    if(WITH_GPU)
+        op_library(gen_nccl_id_op DEPS nccl_common sendrecvop_grpc)
+        set_source_files_properties(gen_nccl_id_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+    else()
+        set(DEPS_OPS ${DEPS_OPS} gen_nccl_id_op)
+    endif()
 else()
     set(DEPS_OPS ${DEPS_OPS} send_op prefetch_op recv_op listen_and_serv_op send_vars_op send_barrier_op gen_nccl_id_op)
 endif()
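Note: gen_nccl_id_op now links against sendrecvop_grpc, so its registration moves below the definitions of DISTRIBUTE_DEPS and DISTRIBUTE_COMPILE_FLAGS and picks up the gRPC-specific compile flags.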
......
@@ -20,19 +20,15 @@
 #=================================================
 
 function print_usage() {
-    RED='\033[0;31m'
-    BLUE='\033[0;34m'
-    BOLD='\033[1m'
-    NONE='\033[0m'
-
     echo -e "\n${RED}Usage${NONE}:
-    ${BOLD}$0${NONE} [OPTION]"
+    ${BOLD}${SCRIPT_NAME}${NONE} [OPTION]"
 
     echo -e "\n${RED}Options${NONE}:
     ${BLUE}build${NONE}: run build for x86 platform
     ${BLUE}build_android${NONE}: run build for android platform
     ${BLUE}build_ios${NONE}: run build for ios platform
     ${BLUE}test${NONE}: run all unit tests
+    ${BLUE}single_test${NONE}: run a single unit test
     ${BLUE}bind_test${NONE}: parallel tests bind to different GPU
     ${BLUE}doc${NONE}: generate paddle documents
     ${BLUE}html${NONE}: convert C++ source code into HTML
@@ -45,7 +41,15 @@ function print_usage() {
 }
 
 function init() {
+    RED='\033[0;31m'
+    BLUE='\033[0;34m'
+    BOLD='\033[1m'
+    NONE='\033[0m'
+
     PADDLE_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}")/../../" && pwd )"
+    if [ -z "${SCRIPT_NAME}" ]; then
+        SCRIPT_NAME=$0
+    fi
 }
 
 function cmake_gen() {
@@ -91,7 +95,6 @@ function cmake_gen() {
         -DWITH_AVX=${WITH_AVX:-OFF}
         -DWITH_GOLANG=${WITH_GOLANG:-OFF}
         -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All}
-        -DWITH_SWIG_PY=ON
         -DWITH_C_API=${WITH_C_API:-OFF}
        -DWITH_PYTHON=${WITH_PYTHON:-ON}
        -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON}
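Note: the removed line duplicated the -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON} entry a few lines below; since the later definition of the same cache variable on the CMake command line takes precedence, the hard-coded ON had no effect and only obscured the configurable default.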
@@ -309,6 +312,25 @@ EOF
     fi
 }
 
+function single_test() {
+    TEST_NAME=$1
+    if [ -z "${TEST_NAME}" ]; then
+        echo -e "${RED}Usage:${NONE}"
+        echo -e "${BOLD}${SCRIPT_NAME}${NONE} ${BLUE}single_test${NONE} [test_name]"
+        exit 1
+    fi
+    mkdir -p ${PADDLE_ROOT}/build
+    cd ${PADDLE_ROOT}/build
+    if [ ${WITH_TESTING:-ON} == "ON" ] ; then
+    cat <<EOF
+    ========================================
+    Running ${TEST_NAME} ...
+    ========================================
EOF
+        ctest --output-on-failure -R ${TEST_NAME}
+    fi
+}
+
 function bind_test() {
     # the number of process to run tests
     NUM_PROC=6
@@ -491,6 +513,9 @@ function main() {
         test)
             run_test
             ;;
+        single_test)
+            single_test $2
+            ;;
         bind_test)
             bind_test
             ;;
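Usage note: with the dispatch above, a single CTest target can now be run directly, e.g. "paddle/scripts/paddle_build.sh single_test test_send_nccl_id" (test_send_nccl_id is just a convenient example from this commit; any name registered with ctest -R works). Called with no argument, single_test prints its usage line and exits.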
......
@@ -63,6 +63,7 @@ EOL
     ${DOCKER_CMD} run -it \
         --name $CONTAINER_ID \
         ${DOCKER_ENV} \
+        -e SCRIPT_NAME=$0 \
         -v $PADDLE_ROOT:/paddle \
         -v ${HOME}/.ccache:/root/.ccache \
         -w /paddle \
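Note: inside the container $0 would name the container-side entry script, so the host forwards SCRIPT_NAME through the environment; init() only falls back to $0 when SCRIPT_NAME is unset, letting print_usage() show the command the user actually ran.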
......
This diff is collapsed.
@@ -62,31 +62,31 @@ def train(use_cuda, train_program, save_dirname):
     optimizer = fluid.optimizer.Adam(learning_rate=0.001)
 
     trainer = fluid.Trainer(
-        train_func=train_program, place=place, optimizer=optimizer)
+        train_func=train_program,
+        place=place,
+        optimizer=optimizer,
+        parallel=True)
 
     def event_handler(event):
         if isinstance(event, fluid.EndEpochEvent):
             test_reader = paddle.batch(
                 paddle.dataset.mnist.test(), batch_size=BATCH_SIZE)
-            test_metrics = trainer.test(
+            avg_cost, acc = trainer.test(
                 reader=test_reader, feed_order=['img', 'label'])
-            avg_cost_set = test_metrics[0]
-            acc_set = test_metrics[1]
-
-            # get test acc and loss
-            acc = numpy.array(acc_set).mean()
-            avg_cost = numpy.array(avg_cost_set).mean()
 
             print("avg_cost: %s" % avg_cost)
             print("acc : %s" % acc)
 
-            if float(acc) > 0.2:  # Smaller value to increase CI speed
+            if acc > 0.2:  # Smaller value to increase CI speed
                 trainer.save_params(save_dirname)
             else:
                 print('BatchID {0}, Test Loss {1:0.2}, Acc {2:0.2}'.format(
-                    event.epoch + 1, float(avg_cost), float(acc)))
-                if math.isnan(float(avg_cost)):
+                    event.epoch + 1, avg_cost, acc))
+                if math.isnan(avg_cost):
                     sys.exit("got NaN loss, training failed.")
+        elif isinstance(event, fluid.EndStepEvent):
+            print("Step {0}, Epoch {1} Metrics {2}".format(
+                event.step, event.epoch, map(numpy.array, event.metrics)))
 
     train_reader = paddle.batch(
         paddle.reader.shuffle(
@@ -131,4 +131,4 @@ def main(use_cuda):
 
 if __name__ == '__main__':
     # for use_cuda in (False, True):
-    main(use_cuda=False)
+    main(use_cuda=True)
@@ -55,24 +55,18 @@ def train(use_cuda, train_program, save_dirname):
         if isinstance(event, fluid.EndEpochEvent):
             test_reader = paddle.batch(
                 paddle.dataset.mnist.test(), batch_size=BATCH_SIZE)
-            test_metrics = trainer.test(
+            avg_cost, acc = trainer.test(
                 reader=test_reader, feed_order=['img', 'label'])
-            avg_cost_set = test_metrics[0]
-            acc_set = test_metrics[1]
-
-            # get test acc and loss
-            acc = numpy.array(acc_set).mean()
-            avg_cost = numpy.array(avg_cost_set).mean()
 
             print("avg_cost: %s" % avg_cost)
             print("acc : %s" % acc)
 
-            if float(acc) > 0.2:  # Smaller value to increase CI speed
+            if acc > 0.2:  # Smaller value to increase CI speed
                 trainer.save_params(save_dirname)
             else:
                 print('BatchID {0}, Test Loss {1:0.2}, Acc {2:0.2}'.format(
-                    event.epoch + 1, float(avg_cost), float(acc)))
-                if math.isnan(float(avg_cost)):
+                    event.epoch + 1, avg_cost, acc))
+                if math.isnan(avg_cost):
                     sys.exit("got NaN loss, training failed.")
 
     train_reader = paddle.batch(
......
@@ -20,6 +20,7 @@ import data_feeder
 import contextlib
 import io
 import unique_name
+import parallel_executor
 # optimizer is same as the parameter of Trainer.__init__. Rename it to opt_module
 import optimizer as opt_module
@@ -48,12 +49,14 @@ class BeginStepEvent(object):
     def __init__(self, epoch_id, step_id):
         self.epoch = epoch_id
         self.step = step_id
+        self.fetch_metrics = True
 
 
 class EndStepEvent(object):
-    def __init__(self, epoch_id, step_id):
+    def __init__(self, epoch_id, step_id, metrics):
         self.epoch = epoch_id
         self.step = step_id
+        self.metrics = metrics
 
 
 def check_and_get_place(place):
@@ -87,12 +90,17 @@ class Trainer(object):
     Args:
         train_func(callable): A function which will return loss. The loss must be a scalar.
-        infer_func(callable): A function which will return predict, used to save inference model
         optimizer(optimizer.Optimizer): The optimizer should be an instance of Optimizer
         place: The device place of this trainer.
     """
 
-    def __init__(self, train_func, optimizer, param_path=None, place=None):
+    def __init__(self,
+                 train_func,
+                 optimizer,
+                 param_path=None,
+                 place=None,
+                 parallel=False):
+        self.parallel = parallel
         # 1. we need to generate a framework.Program by calling
         # program_func. Reference: fluid.program_guard in
         # test_word2vec.py
@@ -106,14 +114,14 @@ class Trainer(object):
         with framework.program_guard(self.train_program, self.startup_program):
             program_func_outs = train_func()
-            self.test_outputs = program_func_outs if isinstance(
+            self.train_func_outputs = program_func_outs if isinstance(
                 program_func_outs, list) else [program_func_outs]
             self.test_program = self.train_program.clone()
             if not isinstance(optimizer, opt_module.Optimizer):
                 raise TypeError(
                     "The optimizer should be an instance of Optimizer")
             # The first element of program_func_outs is loss.
-            loss = self.test_outputs[0]
+            loss = self.train_func_outputs[0]
             optimize_ops, params_grads = optimizer.minimize(loss)
 
             self.place = check_and_get_place(place)
@@ -202,12 +210,7 @@ class Trainer(object):
                 'TRAINING_ROLE environment variable must be either TRAINER or PSERVER'
             )
 
-    def train(self,
-              num_epochs,
-              event_handler,
-              reader,
-              feed_order,
-              parallel=False):
+    def train(self, num_epochs, event_handler, reader=None, feed_order=None):
         """
         Train the model.
@@ -215,25 +218,24 @@
             num_epochs: The number of epoch. An epoch will process all data in reader
             event_handler: The event handler. A function with type (ev:Event)->void
             reader:
-            parallel: True if use multi-CPUs or multi-GPUs
             feed_order: Feeding order of reader. None will follow the defining
                 order in program
 
         Returns:
 
         """
-        if parallel:
-            raise NotImplementedError(
-                "Parallel Executor version of trainer is not implemented")
         training_role = os.getenv("PADDLE_TRAINING_ROLE", "")
         if training_role == "PSERVER":
             with self._prog_and_scope_guard():
                 exe = executor.Executor(self.place)
                 exe.run()
                 return
-        self._train_by_executor(num_epochs, event_handler, reader, feed_order)
+        if self.parallel:
+            self._train_by_parallel_executor(num_epochs, event_handler, reader,
+                                             feed_order)
+        else:
+            self._train_by_executor(num_epochs, event_handler, reader,
+                                    feed_order)
 
     def test(self, reader, feed_order):
         """
@@ -245,7 +247,8 @@
                 order in program
 
         """
-        return self._test_by_executor(reader, feed_order, self.test_outputs)
+        return self._test_by_executor(reader, feed_order,
+                                      self.train_func_outputs)
 
     def save_params(self, param_path):
         # reference: save_persistables in io.py
@@ -279,12 +282,24 @@
             feeder = data_feeder.DataFeeder(
                 feed_list=feed_var_list, place=self.place)
             exe = executor.Executor(self.place)
+            reader = feeder.decorate_reader(reader, multi_devices=False)
+            self._train_by_any_executor(event_handler, exe, num_epochs, reader)
+
+    def _train_by_any_executor(self, event_handler, exe, num_epochs, reader):
         for epoch_id in range(num_epochs):
             event_handler(BeginEpochEvent(epoch_id))
             for step_id, data in enumerate(reader()):
-                event_handler(BeginStepEvent(epoch_id, step_id))
-                exe.run(feed=feeder.feed(data), fetch_list=[])
-                event_handler(EndStepEvent(epoch_id, step_id))
+                begin_event = BeginStepEvent(epoch_id, step_id)
+                event_handler(begin_event)
+                if begin_event.fetch_metrics:
+                    metrics = exe.run(feed=data,
+                                      fetch_list=[
+                                          var.name
+                                          for var in self.train_func_outputs
+                                      ])
+                else:
+                    metrics = exe.run(feed=data, fetch_list=[])
+                event_handler(EndStepEvent(epoch_id, step_id, metrics))
             event_handler(EndEpochEvent(epoch_id))
 
     def _test_by_executor(self, reader, feed_order, fetch_list):
@@ -304,6 +319,28 @@
             return [x / count for x in accumulated]
 
+    def _train_by_parallel_executor(self, num_epochs, event_handler, reader,
+                                    feed_order):
+        with self._prog_and_scope_guard():
+            pe = self._get_or_create_parallel_executor()
+            feed_var_list = build_feed_var_list(self.train_program, feed_order)
+            feeder = data_feeder.DataFeeder(
+                feed_list=feed_var_list, place=self.place)
+            reader = feeder.decorate_reader(reader, multi_devices=True)
+            for epoch_id in range(num_epochs):
+                self._train_by_any_executor(event_handler, pe, num_epochs,
+                                            reader)
+
+    def _get_parallel_executor(self):
+        return getattr(self, 'parallel_executor', None)
+
+    def _get_or_create_parallel_executor(self):
+        if self._get_parallel_executor() is None:
+            self.parallel_executor = parallel_executor.ParallelExecutor(
+                use_cuda=isinstance(self.place, core.CUDAPlace),
+                loss_name=self.train_func_outputs[0].name)
+        return self._get_parallel_executor()
+
 
 def build_feed_var_list(program, feed_order):
     if not isinstance(program, framework.Program):
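Since the executor now fetches the train_func outputs on every step by default, a handler can opt out of the per-step fetch through the new BeginStepEvent.fetch_metrics flag. A minimal sketch of a handler against the new API (assuming BeginStepEvent and EndStepEvent are exported under fluid, as the tests above suggest; the every-100-steps policy is just for illustration):

import numpy
import paddle.fluid as fluid


def event_handler(event):
    if isinstance(event, fluid.BeginStepEvent):
        # Fetching metrics synchronizes with the device each step;
        # only pay that cost on the steps we intend to log.
        event.fetch_metrics = (event.step % 100 == 0)
    elif isinstance(event, fluid.EndStepEvent):
        # event.metrics holds the fetched train_func outputs, or []
        # when fetch_metrics was left False for this step.
        if event.metrics:
            print("Step {0}, Epoch {1} Metrics {2}".format(
                event.step, event.epoch, map(numpy.array, event.metrics)))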
......
@@ -171,7 +171,7 @@ if args.timeline_path:
 profile_paths = profile_path.split(',')
 profile_dict = dict()
-if len(profile_path) == 1:
+if len(profile_paths) == 1:
     with open(profile_path, 'r') as f:
         profile_s = f.read()
         profile_pb = profiler_pb2.Profile()
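Note: the old condition measured the length of the comma-joined path string rather than the number of paths, so the single-profile branch was effectively unreachable for any real path; checking the split list restores it.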
......