Commit e8954a36 authored by: N nhzlx

merge develop

...@@ -27,15 +27,6 @@ script:
# 43min timeout
paddle/scripts/paddle_docker_build.sh ${JOB}
if [ $? -eq 0 ] || [ $? -eq 142 ]; then true; else exit 1; fi;
- |
if [[ "$JOB" != "doc" ]]; then exit 0; fi;
# For document only
if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi;
if [[ "$TRAVIS_BRANCH" != "develop" && ! "$TRAVIS_BRANCH" =~ ^v|release/[[:digit:]]+\.[[:digit:]]+(\.[[:digit:]]+)?(-\S*)?$ ]]; then exit 0; fi;
export DEPLOY_DOCS_SH=https://raw.githubusercontent.com/PaddlePaddle/PaddlePaddle.org/master/scripts/deploy/deploy_docs.sh
export DOCS_DIR=`pwd`
cd ..
curl $DEPLOY_DOCS_SH | bash -s $CONTENT_DEC_PASSWD $TRAVIS_BRANCH $DOCS_DIR $DOCS_DIR/build/doc/
notifications:
email:
on_success: change
...
...@@ -200,6 +200,14 @@ include(external/snappy) # download snappy
include(external/snappystream)
include(external/threadpool)
if(WITH_GPU)
include(cuda)
include(tensorrt)
include(external/anakin)
else()
set(WITH_ANAKIN OFF CACHE STRING "Anakin is valid only when GPU is set." FORCE)
endif()
include(cudnn) # set cudnn libraries, must before configure
include(cupti)
include(configure) # add paddle env configuration
...@@ -228,14 +236,6 @@ set(EXTERNAL_LIBS
${PYTHON_LIBRARIES}
)
if(WITH_GPU)
include(cuda)
include(tensorrt)
include(external/anakin)
else()
set(WITH_ANAKIN OFF CACHE STRING "Anakin is valid only when GPU is set." FORCE)
endif()
if(WITH_AMD_GPU)
find_package(HIP)
include(hip)
...
...@@ -21,6 +21,7 @@ list(APPEND CUDNN_CHECK_LIBRARY_DIRS
${CUDNN_ROOT}/lib64
${CUDNN_ROOT}/lib
${CUDNN_ROOT}/lib/${TARGET_ARCH}-linux-gnu
${CUDNN_ROOT}/local/cuda-${CUDA_VERSION}/targets/${TARGET_ARCH}-linux/lib/
$ENV{CUDNN_ROOT}
$ENV{CUDNN_ROOT}/lib64
$ENV{CUDNN_ROOT}/lib
...
...@@ -264,7 +264,10 @@ function(cc_test TARGET_NAME)
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
if (${cc_test_SERIAL})
set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
endif()
endif()
endfunction(cc_test)
...@@ -329,7 +332,10 @@ function(nv_test TARGET_NAME)
add_test(${TARGET_NAME} ${TARGET_NAME})
if (nv_test_SERIAL)
set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
endif()
endif()
endfunction(nv_test)
...@@ -577,7 +583,9 @@ function(py_test TARGET_NAME)
set(multiValueArgs SRCS DEPS ARGS ENVS)
cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_test(NAME ${TARGET_NAME}
COMMAND env FLAGS_init_allocated_mem=true PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
COMMAND env FLAGS_init_allocated_mem=true FLAGS_cudnn_deterministic=true
FLAGS_cpu_deterministic=true
PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
endif()
...
# How to use the timeline tool for performance profiling
1. Add `with profiler.profiler(...)` around the main training loop. After running, the code generates a profile record file in the `/tmp/profile` directory.
1. Add `profiler.start_profiler(...)` and `profiler.stop_profiler(...)` to the main training loop. After running, the code generates a profile record file in the `/tmp/profile` directory.

**Tip:**
Please do not run too many iterations while the timeline is recording, because the number of records in the timeline is proportional to the number of iterations.

```python
with profiler.profiler('All', 'total', '/tmp/profile') as prof:
    for pass_id in range(pass_num):
        for batch_id, data in enumerate(train_reader()):
            exe.run(fluid.default_main_program(),
                    feed=feeder.feed(data),
                    fetch_list=[])
            ...
```
```python
for pass_id in range(pass_num):
    for batch_id, data in enumerate(train_reader()):
        if pass_id == 0 and batch_id == 5:
            profiler.start_profiler("All")
        elif pass_id == 0 and batch_id == 10:
            profiler.stop_profiler("total", "/tmp/profile")
        exe.run(fluid.default_main_program(),
                feed=feeder.feed(data),
                fetch_list=[])
        ...
```
1. Run `python paddle/tools/timeline.py` to process `/tmp/profile`. By default this program generates a `/tmp/timeline` file; you can change the path with a command-line argument, see [timeline.py](https://github.com/PaddlePaddle/Paddle/blob/develop/tools/timeline.py).
```python
python Paddle/tools/timeline.py --profile_path=/tmp/profile --timeline_path=timeline
```
1. Open the Chrome browser, visit <chrome://tracing/>, and use the `load` button to load the generated `timeline` file.
...
# How to use the timeline tool to profile
1. Add `with profiler.profiler(...)` to the main training loop. After running, the code will generate a profile record file `/tmp/profile`. **Warning**: Please do not run too many batches when using the profiler to record timeline information, because the profile record grows with the number of batches.
1. Add `profiler.start_profiler(...)` and `profiler.stop_profiler(...)` to the main training loop. After running, the code will generate a profile record file `/tmp/profile`. **Warning**: Please do not run too many batches when using the profiler to record timeline information, because the profile record grows with the number of batches.

```python
with profiler.profiler('All', 'total', '/tmp/profile') as prof:
    for pass_id in range(pass_num):
        for batch_id, data in enumerate(train_reader()):
            exe.run(fluid.default_main_program(),
                    feed=feeder.feed(data),
                    fetch_list=[],
                    use_program_cache=True)
            ...
```
```python
for pass_id in range(pass_num):
    for batch_id, data in enumerate(train_reader()):
        if pass_id == 0 and batch_id == 5:
            profiler.start_profiler("All")
        elif pass_id == 0 and batch_id == 10:
            profiler.stop_profiler("total", "/tmp/profile")
        exe.run(fluid.default_main_program(),
                feed=feeder.feed(data),
                fetch_list=[])
        ...
```
...@@ -17,6 +19,10 @@
file `/tmp/timeline` by default. You can change the path with a command-line parameter; please take a look at
[timeline.py](https://github.com/PaddlePaddle/Paddle/blob/develop/tools/timeline.py) for details.
```python
python Paddle/tools/timeline.py --profile_path=/tmp/profile --timeline_path=timeline
```
1. Open Chrome and visit <chrome://tracing/>, then use the `load` button to load the generated `timeline` file.
![chrome tracing](./tracing.jpeg)
...
...@@ -17,6 +17,7 @@
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/reduce_and_gather.h"
#include "paddle/fluid/framework/details/variable_visitor.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace framework {
...@@ -45,6 +46,7 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
#endif
void AllReduceOpHandle::RunImpl() {
platform::RecordEvent r("all_reduce", nullptr);
if (NoDummyInputSize() == 1) {
return; // No need to all reduce when GPU count = 1;
} else {
...
...@@ -21,6 +21,26 @@ namespace framework {
namespace details {
struct BuildStrategy {
// ParallelExecutor supports two modes of ReduceStrategy, kAllReduce and
// kReduce, for CPU and GPU. If you use kAllReduce, different threads
// optimize their parameters separately. If you use kReduce, the
// optimization of the parameters is distributed across the threads.
// For example, if a model has 100 parameters and runs with four threads,
// then with kAllReduce every thread optimizes all 100 parameters
// separately, while with kReduce every thread optimizes 25 of them.
// Of particular note: if you use kReduce for CPU training, all the
// parameters are shared between the threads, which saves memory.
// FIXME(zcd): The results of the two modes (kAllReduce and kReduce) may
// not be equal on GPU, because summing floats in a different order can
// produce a different result; for example, the result of `a+b+c+d` may
// differ from the result of `c+a+b+d`.
// On GPU, both kAllReduce and kReduce are implemented with NCCL, so the
// results of kAllReduce and kReduce may not be equal.
// On CPU, if you want to fix the summing order so that kAllReduce and
// kReduce produce identical results, add `FLAGS_cpu_deterministic=true`
// to the environment.
enum class ReduceStrategy { kAllReduce = 0, kReduce = 1 };
enum class GradientScaleStrategy {
...
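For context, the strategy documented above is chosen from Python when building a `ParallelExecutor`. A minimal sketch, assuming the `fluid.BuildStrategy` binding exposed by this codebase (names shown for illustration, not a verified API reference):

```python
import paddle.fluid as fluid

build_strategy = fluid.BuildStrategy()
# kReduce: each thread/device optimizes its own slice of the parameters.
build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
# kAllReduce: every thread/device optimizes all parameters (the default).
# build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce

# The strategy is then passed to the executor, e.g. (assuming a built `loss`):
#   pe = fluid.ParallelExecutor(use_cuda=True, loss_name=loss.name,
#                               build_strategy=build_strategy)
```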
...@@ -16,12 +16,18 @@
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/reduce_and_gather.h"
#include "paddle/fluid/framework/details/variable_visitor.h"
#include "paddle/fluid/platform/profiler.h"
DEFINE_bool(
cpu_deterministic, false,
"Whether to make the result of computation deterministic in CPU side.");
namespace paddle {
namespace framework {
namespace details {
void ReduceOpHandle::RunImpl() {
platform::RecordEvent r("reduce", nullptr);
if (places_.size() == 1) return;
// the input and output may have dummy var.
auto in_var_handles = DynamicCast<VarHandle>(inputs_);
...@@ -89,11 +95,33 @@ void ReduceOpHandle::RunImpl() {
} else {
std::vector<const LoDTensor *> lod_tensors =
GetInputValues<LoDTensor>(in_var_handles, var_scopes);
if (paddle::platform::is_cpu_place(lod_tensors[0]->place())) {
this->RunAndRecordEvent([&] {
ReduceLoDTensor func(lod_tensors,
out_var->GetMutable<framework::LoDTensor>());
VisitDataType(ToDataType(lod_tensors[0]->type()), func);
// FIXME(zcd): The order of summing is important,
// especially when the type of data is float or double.
// For example, the result of `a+b+c+d` may be different
// with the result of `c+a+b+d`, so the summing order should be fixed.
if (!FLAGS_cpu_deterministic) {
ReduceLoDTensor func(lod_tensors,
out_var->GetMutable<framework::LoDTensor>());
VisitDataType(ToDataType(lod_tensors[0]->type()), func);
} else {
// We sum lod_tensors to reduce_sum_trg which is in local_scopes_0
// here, but it doesn't mean reduce_sum_trg must be in local_scopes_0.
auto &reduce_sum_trg = *this->local_scopes_[0]
->FindVar(kLocalExecScopeName)
->Get<Scope *>()
->FindVar(out_var_handle->name_)
->GetMutable<framework::LoDTensor>();
ReduceLoDTensor func(lod_tensors, &reduce_sum_trg);
VisitDataType(ToDataType(lod_tensors[0]->type()), func);
auto trg = out_var->GetMutable<framework::LoDTensor>();
if (reduce_sum_trg.data<void>() != trg->data<void>()) {
TensorCopy(reduce_sum_trg, platform::CPUPlace(), trg);
}
}
});
} else if (paddle::platform::is_gpu_place(lod_tensors[0]->place())) {
#ifdef PADDLE_WITH_CUDA
...
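As the FIXME above explains, floating-point addition is not associative, so the order in which gradients are summed changes the result. A tiny standalone illustration in plain Python (not Paddle code):

```python
# The same four addends, summed in two different orders, give different
# results because intermediate rounding differs.
a = [1e16, 1.0, -1e16, 1.0]   # 1e16 + 1.0 rounds back to 1e16 first
b = [1e16, -1e16, 1.0, 1.0]   # the large terms cancel first here
print(sum(a))  # 1.0
print(sum(b))  # 2.0
```

This is why `FLAGS_cpu_deterministic` pins a single summation order rather than trying to make an unordered reduction exact.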
...@@ -17,6 +17,7 @@
#include <string>
#include <vector>
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace framework {
...@@ -62,6 +63,7 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
eptr = std::current_exception();
}
platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun", nullptr);
drop_scope_counter_ += 1;
if (!fetch_tensors.empty() ||
drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) {
...
...@@ -15,6 +15,7 @@
#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
#include "paddle/fluid/framework/details/ssa_graph_builder.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace framework {
...@@ -34,6 +35,8 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
FeedFetchList ThreadedSSAGraphExecutor::Run(
const std::vector<std::string> &fetch_tensors) {
std::unique_ptr<platform::RecordEvent> event(
new platform::RecordEvent("ThreadedSSAGraphExecutorPrepare", nullptr));
std::unordered_map<OpHandleBase *, size_t> pending_ops;
std::unordered_set<VarHandleBase *> pending_vars;
BlockingQueue<VarHandleBase *> ready_vars;
...@@ -84,6 +87,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
// Clean run context
run_op_futures_.clear();
exception_holder_.Clear();
event.reset(nullptr);
// Step 3. Execution
while (!pending_vars.empty()) {
...
...@@ -18,6 +18,7 @@ limitations under the License. */
#include "paddle/fluid/framework/data_transform.h"
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/shape_inference.h" #include "paddle/fluid/framework/shape_inference.h"
#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/framework/var_type.h"
...@@ -57,7 +58,11 @@ static DDim GetDims(const Scope& scope, const std::string& name, ...@@ -57,7 +58,11 @@ static DDim GetDims(const Scope& scope, const std::string& name,
} }
if (var->IsType<LoDTensor>()) { if (var->IsType<LoDTensor>()) {
return var->Get<LoDTensor>().dims(); const LoDTensor& tensor = var->Get<LoDTensor>();
if (UNLIKELY(!tensor.IsInitialized())) {
return DDim({-1});
}
return tensor.dims();
} else if (var->IsType<SelectedRows>()) {
if (get_actual_dim) {
return var->Get<SelectedRows>().value().dims();
...@@ -74,8 +79,13 @@ static std::string GetDtype(const Scope& scope, const std::string& name) {
if (var == nullptr) {
return "";
}
if (var->IsType<LoDTensor>()) {
return DataTypeToString(ToDataType(var->Get<LoDTensor>().type()));
const LoDTensor& tensor = var->Get<LoDTensor>();
if (UNLIKELY(!tensor.IsInitialized())) {
return "";
}
return DataTypeToString(ToDataType(tensor.type()));
} else if (var->IsType<SelectedRows>()) {
return DataTypeToString(
ToDataType(var->Get<SelectedRows>().value().type()));
...@@ -106,7 +116,11 @@ static LoD GetLoD(const Scope& scope, const std::string& name) {
}
if (var->IsType<LoDTensor>()) {
return var->Get<LoDTensor>().lod();
const LoDTensor& tensor = var->Get<LoDTensor>();
if (UNLIKELY(!tensor.IsInitialized())) {
return default_lod;
}
return tensor.lod();
} else {
return default_lod;
}
...@@ -122,6 +136,8 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
platform::SetDeviceId(dev_id);
#endif
}
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
platform::RecordEvent record_event(Type(), pool.Get(place));
RunImpl(scope, place);
VLOG(10) << "+ " << DebugStringEx(&scope);
}
...@@ -625,9 +641,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto* dev_ctx = pool.Get(place);
// For profiling, don't move out of this function because that will result
// in the failure of multi-GPU profiling.
platform::RecordEvent record_event(Type(), dev_ctx);
// check if op[type] has kernel registered.
auto& all_op_kernels = AllOpKernels();
auto kernels_iter = all_op_kernels.find(type_);
...
...@@ -82,7 +82,7 @@ class Tensor {
template <typename T>
const T* data() const;
bool IsInitialized() const;
inline bool IsInitialized() const;
/**
* @brief Return a pointer to mutable memory block.
...
...@@ -74,9 +74,10 @@ if (WITH_ANAKIN) # only needed in CI
target_link_libraries(inference_anakin_api anakin anakin_saber_common)
target_link_libraries(inference_anakin_api_shared anakin anakin_saber_common)
if (WITH_TESTING)
cc_test(inference_anakin_test SRCS api_anakin_engine_tester.cc
ARGS --model=${ANAKIN_INSTALL_DIR}/mobilenet_v2.anakin.bin
DEPS inference_anakin_api_shared)
target_compile_options(inference_anakin_test BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
# this test is unstable, disable it first.
#cc_test(inference_anakin_test SRCS api_anakin_engine_tester.cc
#ARGS --model=${ANAKIN_INSTALL_DIR}/mobilenet_v2.anakin.bin
#DEPS inference_anakin_api_shared)
#target_compile_options(inference_anakin_test BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
endif(WITH_TESTING)
endif()
...@@ -31,7 +31,6 @@ class FeedOp : public framework::OperatorBase {
const platform::Place &place) const override {
// get device context from pool
auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place);
platform::RecordEvent record_event(Type(), dev_ctx);
auto feed_var_name = Input("X");
auto *feed_var = scope.FindVar(feed_var_name);
...
...@@ -36,12 +36,6 @@ class FetchBarrierOp : public framework::OperatorBase {
void RunImpl(const framework::Scope& scope,
const platform::Place& place) const override {
std::vector<std::string> eps = Attr<std::vector<std::string>>("endpoints");
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& ctx = *pool.Get(place);
// For profiling
platform::RecordEvent record_event(Type(), &ctx);
distributed::RPCClient* rpc_client =
distributed::RPCClient::GetInstance<RPCCLIENT_T>();
...
...@@ -30,9 +30,6 @@ class FetchOp : public framework::OperatorBase {
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override {
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
platform::RecordEvent record_event(Type(), pool.Get(place));
auto fetch_var_name = Input("X");
auto *fetch_var = scope.FindVar(fetch_var_name);
PADDLE_ENFORCE(fetch_var != nullptr,
...
...@@ -31,9 +31,6 @@ class LoadOp : public framework::OperatorBase {
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override {
auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place);
platform::RecordEvent record_event(Type(), dev_ctx);
// FIXME(yuyang18): We save variable to local file now, but we should change
// it to save an output stream.
auto filename = Attr<std::string>("file_path");
...
...@@ -32,11 +32,16 @@ class LookupTableOp : public framework::OperatorWithKernel {
auto table_dims = ctx->GetInputDim("W");
auto ids_dims = ctx->GetInputDim("Ids");
PADDLE_ENFORCE_EQ(ids_dims.size(), 2);
PADDLE_ENFORCE_EQ(ids_dims[1], 1);
ctx->SetOutputDim("Out", {ids_dims[0], table_dims[1]});
int ids_rank = ids_dims.size();
PADDLE_ENFORCE_EQ(table_dims.size(), 2);
PADDLE_ENFORCE_EQ(ids_dims[ids_rank - 1], 1,
"The last dimension of the 'Ids' tensor must be 1.");
auto output_dims =
framework::vectorize(framework::slice_ddim(ids_dims, 0, ids_rank - 1));
output_dims.push_back(table_dims[1]);
ctx->SetOutputDim("Out", framework::make_ddim(output_dims));
if (ctx->GetOutputsVarType("Out")[0] ==
framework::proto::VarType::LOD_TENSOR) {
...@@ -61,8 +66,7 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("Ids",
"An input with type int32 or int64 "
"contains the ids to be looked up in W. "
"Ids must be a column vector with rank = 2. "
"The 2nd dimension size must be 1.");
"The last dimension size must be 1.");
AddOutput("Out", "The lookup results, which have the same type as W.");
AddAttr<bool>("is_sparse",
"(boolean, default false) "
...
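The relaxed checks above accept an `Ids` tensor of any rank whose last dimension is 1, and infer the output shape as `ids_dims[:-1] + [table_dims[1]]`. A NumPy sketch of the same rule (shapes are illustrative only, not Paddle code):

```python
import numpy as np

W = np.random.rand(100, 8)                       # table: [vocab_size, emb_dim]
ids = np.random.randint(0, 100, size=(4, 6, 1))  # any rank; last dim must be 1
out = W[ids.squeeze(-1)]                         # gather rows of W by id
assert out.shape == ids.shape[:-1] + (W.shape[1],)  # -> (4, 6, 8)
```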
...@@ -118,28 +118,31 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
auto *d_table = context.Output<SelectedRows>(framework::GradVarName("W"));
auto *ids_data = ids->data<int64_t>();
auto ids_dim = ids->dims();
int64_t ids_num = ids->numel();
auto stream = dev_ctx.stream();
// copy GPU memory to CPU pinned memory
framework::Vector<int64_t> new_rows;
new_rows.resize(ids_dim[0]);
new_rows.resize(ids_num);
auto gpu_place = boost::get<platform::CUDAPlace>(context.GetPlace());
// TODO(yuyang18): Strange code here.
memory::Copy(platform::CPUPlace(),
new_rows.CUDAMutableData(context.GetPlace()), gpu_place,
ids_data, ids_dim[0] * sizeof(int64_t), stream);
ids_data, ids_num * sizeof(int64_t), stream);
d_table->set_rows(new_rows);
auto *d_table_value = d_table->mutable_value();
d_table_value->Resize({ids_dim[0], table->dims()[1]});
d_table_value->Resize({ids_num, table->dims()[1]});
d_table_value->mutable_data<T>(context.GetPlace());
auto *d_table_data = d_table_value->data<T>();
auto *d_output_data = d_output->data<T>();
PADDLE_ENFORCE_EQ(d_table_value->dims(), d_output->dims());
auto d_output_dims = d_output->dims();
PADDLE_ENFORCE_EQ(
d_table_value->dims(),
framework::flatten_to_2d(d_output_dims, d_output_dims.size() - 1));
memory::Copy(gpu_place, d_table_data, gpu_place, d_output_data,
d_output->numel() * sizeof(T), stream);
...
...@@ -109,17 +109,17 @@ class LookupTableGradKernel : public framework::OpKernel<T> {
auto *d_table = context.Output<SelectedRows>(framework::GradVarName("W"));
auto *ids_data = ids->data<int64_t>();
auto ids_dim = ids->dims();
int64_t ids_num = ids->numel();
framework::Vector<int64_t> new_rows;
new_rows.reserve(ids_dim[0]);
new_rows.reserve(ids_num);
for (int64_t i = 0; i < ids_dim[0]; i++) {
for (int64_t i = 0; i < ids_num; i++) {
new_rows.push_back(ids_data[i]);
}
d_table->set_rows(new_rows);
auto *d_table_value = d_table->mutable_value();
d_table_value->Resize({ids_dim[0], table_dim[1]});
d_table_value->Resize({ids_num, table_dim[1]});
d_table_value->mutable_data<T>(context.GetPlace());
d_table->set_height(table_dim[0]);
...@@ -127,7 +127,10 @@ class LookupTableGradKernel : public framework::OpKernel<T> {
auto *d_output_data = d_output->data<T>();
auto *d_table_data = d_table_value->data<T>();
PADDLE_ENFORCE_EQ(d_table_value->dims(), d_output->dims());
auto d_output_dims = d_output->dims();
PADDLE_ENFORCE_EQ(
d_table_value->dims(),
framework::flatten_to_2d(d_output_dims, d_output_dims.size() - 1));
memcpy(d_table_data, d_output_data, sizeof(T) * d_output->numel());
} else {
auto *ids = context.Input<LoDTensor>("Ids");
...@@ -135,10 +138,9 @@ class LookupTableGradKernel : public framework::OpKernel<T> {
auto *d_table = context.Output<LoDTensor>(framework::GradVarName("W"));
auto *ids_data = ids->data<int64_t>();
auto ids_dim = ids->dims();
int N = table_dim[0];
int D = d_output->dims()[1];
int D = table_dim[1];
auto *d_output_data = d_output->data<T>();
auto *d_table_data = d_table->mutable_data<T>(context.GetPlace());
...
...@@ -18,7 +18,6 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/operators/detail/safe_ref.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace operators {
...@@ -166,8 +165,6 @@ class ParallelDoOp : public framework::OperatorBase {
workers.emplace_back(
framework::Async([program, cur_scope, place, block, place_idx] {
// Give the thread an id to distinguish parallel block with same id.
platform::RecordThread rt(static_cast<int>(place_idx) + 1);
framework::Executor executor(place);
executor.Run(*program, cur_scope, block->ID(),
false /*create_local_scope*/);
...@@ -244,8 +241,6 @@ class ParallelDoGradOp : public framework::OperatorBase {
// execute
workers.emplace_back(
framework::Async([program, cur_scope, place, block, i] {
// Give the thread an id to distinguish parallel block with same id.
platform::RecordThread rt(static_cast<int>(i) + 1);
framework::Executor executor(place);
executor.Run(*program, cur_scope, block->ID(),
false /*create_local_scope*/);
...
...@@ -15,6 +15,7 @@
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/operators/detail/safe_ref.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace operators {
...@@ -65,6 +66,12 @@ class ReadOp : public framework::OperatorBase {
.GetMutable<framework::ReaderHolder>();
std::vector<std::string> out_arg_names = Outputs("Out");
std::vector<framework::LoDTensor> ins;
// For profiling
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& ctx = *pool.Get(dev_place);
platform::RecordEvent record_event(Type(), &ctx);
reader->ReadNext(&ins);
if (ins.empty()) {
if (Attr<bool>("throw_eof_exp")) {
...
...@@ -40,8 +40,6 @@ class RecvOp : public framework::OperatorBase {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& ctx = *pool.Get(place);
// For profiling
platform::RecordEvent record_event(Type(), &ctx);
distributed::RPCClient* rpc_client =
distributed::RPCClient::GetInstance<RPCCLIENT_T>();
...
...@@ -39,11 +39,6 @@ class SendBarrierOp : public framework::OperatorBase {
std::vector<std::string> eps = Attr<std::vector<std::string>>("endpoints");
bool sync_mode = Attr<bool>("sync_mode");
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& ctx = *pool.Get(place);
// For profiling
platform::RecordEvent record_event(Type(), &ctx);
distributed::RPCClient* rpc_client =
distributed::RPCClient::GetInstance<RPCCLIENT_T>();
...
...@@ -42,9 +42,6 @@ class SendOp : public framework::OperatorBase {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& ctx = *pool.Get(place);
// For profiling
platform::RecordEvent record_event(Type(), &ctx);
distributed::RPCClient* rpc_client =
distributed::RPCClient::GetInstance<RPCCLIENT_T>();
...
...@@ -30,8 +30,16 @@ class SoftmaxCUDNNKernel : public framework::OpKernel<T> {
// allocate memory on device.
Out->mutable_data<T>(context.GetPlace());
auto dims = X->dims();
auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1);
framework::LoDTensor flattened_x;
framework::LoDTensor flattened_out;
flattened_x.ShareDataWith(*X).Resize(flattened_dims);
flattened_out.ShareDataWith(*Out).Resize(flattened_dims);
math::SoftmaxCUDNNFunctor<T>()(
context.template device_context<platform::CUDADeviceContext>(), X, Out);
context.template device_context<platform::CUDADeviceContext>(),
&flattened_x, &flattened_out);
}
};
...@@ -46,9 +54,18 @@ class SoftmaxGradCUDNNKernel : public framework::OpKernel<T> {
// allocate memory on device.
dX->mutable_data<T>(context.GetPlace());
auto dims = Out->dims();
auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1);
framework::LoDTensor flattened_out;
framework::LoDTensor flattened_d_out;
framework::LoDTensor flattened_d_x;
flattened_out.ShareDataWith(*Out).Resize(flattened_dims);
flattened_d_out.ShareDataWith(*dOut).Resize(flattened_dims);
flattened_d_x.ShareDataWith(*dX).Resize(flattened_dims);
math::SoftmaxGradCUDNNFunctor<T>()(
context.template device_context<platform::CUDADeviceContext>(), Out,
dOut, dX);
context.template device_context<platform::CUDADeviceContext>(),
&flattened_out, &flattened_d_out, &flattened_d_x);
}
};
...
...@@ -26,9 +26,9 @@ using paddle::platform::MKLDNNMemDesc;
using mkldnn::memory; // Note: paddle has also "memory" namespace
using mkldnn::primitive;
using mkldnn::softmax_forward;
using mkldnn::softmax_backward;
using mkldnn::prop_kind;
using mkldnn::softmax_backward;
using mkldnn::softmax_forward;
using mkldnn::stream;
using platform::to_void_cast;
...@@ -113,17 +113,27 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel<T> {
auto mkldnn_engine = dev_ctx.GetEngine();
const Tensor* input = ctx.Input<Tensor>("X");
Tensor* output = ctx.Output<Tensor>("Out");
PADDLE_ENFORCE(input->dims().size() == 2UL,
"The input of softmax op must be a 2D matrix.");
const T* input_data = input->data<T>();
// allocate memory for output
T* output_data = output->mutable_data<T>(ctx.GetPlace());
std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
// MKL-DNN does support softmax over selected axis. Having 2D Tensor,
// we will make normalization after final eg. axis: 1
PADDLE_ENFORCE(((src_tz[0] == dst_tz[0]) && (src_tz[1] == dst_tz[1])),
"Softmax input and output dimensions should match");
PADDLE_ENFORCE_EQ(
input->dims(), output->dims(),
"The shape of softmax's input and output must be identical.");
// make sure 'output' holds memory, which will be shared by
// 'flattened_output' later.
output->mutable_data<T>(ctx.GetPlace());
// flatten input and output to 2-D matrices
auto dims = input->dims(); // input and output share the same shape
auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1);
framework::Tensor flattened_input;
framework::Tensor flattened_output;
flattened_input.ShareDataWith(*input).Resize(flattened_dims);
flattened_output.ShareDataWith(*output).Resize(flattened_dims);
const T* input_data = flattened_input.data<T>();
T* output_data = flattened_output.mutable_data<T>(ctx.GetPlace());
std::vector<int> src_tz = paddle::framework::vectorize2int(flattened_dims);
std::vector<int> dst_tz = src_tz;
// Same memory descriptor to be used for input and output
memory::dims softmax_tz = {src_tz[0], src_tz[1]};
// Generate keys for storing/retriving primitives for this operator
...@@ -174,23 +184,34 @@ class SoftmaxMKLDNNGradKernel : public paddle::framework::OpKernel<T> {
auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
auto mkldnn_engine = dev_ctx.GetEngine();
const Tensor* output = ctx.Input<Tensor>("Out");
const T* dst_data = output->data<T>();
auto* dout = ctx.template Input<Tensor>(framework::GradVarName("Out"));
const auto* diff_dst_ptr = dout->template data<T>();
auto* dx =
ctx.template Output<framework::Tensor>(framework::GradVarName("X"));
T* diff_src_ptr = dx->template mutable_data<T>(ctx.GetPlace());
std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
PADDLE_ENFORCE_EQ(
dout->dims(), dx->dims(),
"The shape of softmax_grad's input and output must be identical.");
// make sure 'dx' holds memory, which will be shared by 'flattened_dx'
// later.
dx->template mutable_data<T>(ctx.GetPlace());
auto dims = dout->dims(); // input and output share the same shape
auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1);
framework::Tensor flattened_output;
framework::Tensor flattened_dout;
framework::Tensor flattened_dx;
flattened_output.ShareDataWith(*output).Resize(flattened_dims);
flattened_dout.ShareDataWith(*dout).Resize(flattened_dims);
flattened_dx.ShareDataWith(*dx).Resize(flattened_dims);
const T* dst_data = flattened_output.data<T>();
const T* diff_dst_ptr = flattened_dout.template data<T>();
T* diff_src_ptr = flattened_dx.template mutable_data<T>(ctx.GetPlace());
std::vector<int> dst_tz = paddle::framework::vectorize2int(flattened_dims);
std::vector<int> src_tz(dst_tz);
PADDLE_ENFORCE(output->dims().size() == 2UL,
"The input of softmax op must be a 2D matrix.");
// MKL-DNN does support softmax over selected axis. Having 2D Tensor,
// we will make normalization after final eg. axis: 1
PADDLE_ENFORCE(((src_tz[0] == dst_tz[0]) && (src_tz[1] == dst_tz[1])),
"Softmax input and output dimensions should match");
// Same memory descriptor to be used for input and output
memory::dims softmax_tz = {src_tz[0], src_tz[1]};
// Currently only supports NC data format
...
...@@ -37,10 +37,7 @@ class SoftmaxOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of SoftmaxOp should not be null.");
auto x_dims = ctx->GetInputDim("X");
PADDLE_ENFORCE(x_dims.size() == 2UL,
"The input of softmax op must be a matrix.");
ctx->SetOutputDim("Out", x_dims);
ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
ctx->ShareLoD("X", /*->*/ "Out");
}
...@@ -81,8 +78,8 @@ class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"The input tensor of softmax. "
"2-D with shape [batch_size, input_feature_dimensions].");
"The input tensor of softmax, "
"whose last dimension is the input_feature_dimensions.");
AddOutput("Out", "The normalized values with the same shape as X.") AddOutput("Out", "The normalized values with the same shape as X.")
.Reuse("X"); .Reuse("X");
AddAttr<bool>( AddAttr<bool>(
...@@ -105,20 +102,23 @@ class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -105,20 +102,23 @@ class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
AddComment(R"DOC( AddComment(R"DOC(
Softmax Operator. Softmax Operator.
The input of the softmax operator is a 2-D tensor with shape N x K (N is the
batch_size, K is the dimension of input feature). The output tensor has the
same shape as the input tensor.
For each row of the input tensor, the softmax operator squashes the
K-dimensional vector of arbitrary real values to a K-dimensional vector of real
values in the range [0, 1] that add up to 1.
The input of the softmax operator is a tensor of any rank. The output tensor
has the same shape as the input.
The input tensor will first be logically flattened to a 2-D matrix. The matrix's
second dimension (row length) is the same as the last dimension of the input
tensor, and the first dimension (column length) is the product of all other
dimensions of the input tensor. For each row of the matrix, the softmax operator
squashes the K-dimensional (K is the width of the matrix, which is also the size
of the input tensor's last dimension) vector of arbitrary real values to a
K-dimensional vector of real values in the range [0, 1] that add up to 1.
It computes the exponential of the given dimension and the sum of exponential
values of all the other dimensions in the K-dimensional vector input.
Then the ratio of the exponential of the given dimension and the sum of
exponential values of all the other dimensions is the output of the softmax
operator.
For each row $i$ and each column $j$ in Input(X), we have:
For each row $i$ and each column $j$ in the matrix, we have:
$$Out[i, j] = \frac{\exp(X[i, j])}{\sum_j \exp(X[i, j])}$$
)DOC");
...
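The flattening described in the new operator comment is easy to see in NumPy: collapse every dimension but the last into rows, apply softmax per row, then restore the original shape. An illustrative sketch (not the Paddle kernel itself):

```python
import numpy as np

def softmax_lastdim(x):
    # flatten_to_2d: rows = product of all dims but the last, cols = last dim
    x2d = x.reshape(-1, x.shape[-1])
    e = np.exp(x2d - x2d.max(axis=1, keepdims=True))  # stabilized exponentials
    return (e / e.sum(axis=1, keepdims=True)).reshape(x.shape)

y = softmax_lastdim(np.random.rand(2, 3, 5))
assert np.allclose(y.sum(axis=-1), 1.0)  # every last-dim slice sums to 1
```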
...@@ -31,8 +31,16 @@ class SoftmaxKernel : public framework::OpKernel<T> {
// allocate memory on device.
Out->mutable_data<T>(context.GetPlace());
auto dims = X->dims();
auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1);
framework::LoDTensor flattened_x;
framework::LoDTensor flattened_out;
flattened_x.ShareDataWith(*X).Resize(flattened_dims);
flattened_out.ShareDataWith(*Out).Resize(flattened_dims);
math::SoftmaxFunctor<DeviceContext, T>()(
context.template device_context<DeviceContext>(), X, Out);
context.template device_context<DeviceContext>(), &flattened_x,
&flattened_out);
}
};
...@@ -47,8 +55,18 @@ class SoftmaxGradKernel : public framework::OpKernel<T> {
// allocate memory on device.
dX->mutable_data<T>(context.GetPlace());
auto dims = Out->dims();
auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1);
framework::LoDTensor flattened_out;
framework::LoDTensor flattened_d_out;
framework::LoDTensor flattened_d_x;
flattened_out.ShareDataWith(*Out).Resize(flattened_dims);
flattened_d_out.ShareDataWith(*dOut).Resize(flattened_dims);
flattened_d_x.ShareDataWith(*dX).Resize(flattened_dims);
math::SoftmaxGradFunctor<DeviceContext, T>()(
context.template device_context<DeviceContext>(), Out, dOut, dX);
context.template device_context<DeviceContext>(), &flattened_out,
&flattened_d_out, &flattened_d_x);
}
};
...
...@@ -30,9 +30,6 @@ limitations under the License. */
namespace paddle {
namespace platform {
namespace {
// Current thread's id. Note, we don't distinguish nested threads
// for now.
thread_local int cur_thread_id = 0;
// Tracking the nested block stacks of each thread.
thread_local std::deque<int> block_id_stack;
// Tracking the nested event stacks.
...@@ -413,12 +410,5 @@ void SetCurBlock(int block_id) { block_id_stack.push_back(block_id); }
void ClearCurBlock() { block_id_stack.pop_back(); }
int BlockDepth() { return block_id_stack.size(); }
void SetCurThread(int thread_id) { cur_thread_id = thread_id; }
void ClearCurThread() { cur_thread_id = 0; }
int CurThread() { return cur_thread_id; }
} // namespace platform
} // namespace paddle
...@@ -99,9 +99,5 @@ std::string CurAnnotation();
void SetCurBlock(int block_id);
void ClearCurBlock();
int BlockDepth();
void SetCurThread(int thread_id);
void ClearCurThread();
int CurThread();
} // namespace platform
} // namespace paddle
...@@ -110,6 +110,8 @@ Event::Event(EventType type, std::string name, uint32_t thread_id,
has_cuda_ = dev_ctx ? platform::is_gpu_place(dev_ctx->GetPlace()) : false;
if (has_cuda_) {
auto* cuda_dev_ctx = static_cast<const CUDADeviceContext*>(dev_ctx);
PADDLE_ENFORCE(cudaSetDevice(
boost::get<platform::CUDAPlace>(cuda_dev_ctx->GetPlace()).device));
PADDLE_ENFORCE(cudaGetDevice(&device_));
PADDLE_ENFORCE(cudaEventCreate(&event_));
auto stream = cuda_dev_ctx->stream();
...@@ -176,6 +178,7 @@ void PopEvent(const std::string& name, const DeviceContext* dev_ctx) {
RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx)
: is_enabled_(false), start_ns_(PosixInNsec()) {
std::lock_guard<std::mutex> l(profiler_mu);
if (g_state == ProfilerState::kDisabled) return;
is_enabled_ = true;
dev_ctx_ = dev_ctx;
...@@ -186,11 +189,12 @@ RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx)
}
RecordEvent::~RecordEvent() {
std::lock_guard<std::mutex> l(profiler_mu);
if (g_state == ProfilerState::kDisabled || !is_enabled_) return;
DeviceTracer* tracer = GetDeviceTracer();
if (tracer) {
tracer->AddCPURecords(CurAnnotation(), start_ns_, PosixInNsec(),
BlockDepth(), CurThread());
BlockDepth(), g_thread_id);
}
ClearCurAnnotation();
PopEvent(name_, dev_ctx_);
...@@ -198,6 +202,7 @@ RecordEvent::~RecordEvent() {
RecordBlock::RecordBlock(int block_id)
: is_enabled_(false), start_ns_(PosixInNsec()) {
std::lock_guard<std::mutex> l(profiler_mu);
if (g_state == ProfilerState::kDisabled) return;
is_enabled_ = true;
SetCurBlock(block_id);
...@@ -205,27 +210,18 @@ RecordBlock::RecordBlock(int block_id)
}
RecordBlock::~RecordBlock() {
std::lock_guard<std::mutex> l(profiler_mu);
if (g_state == ProfilerState::kDisabled || !is_enabled_) return;
DeviceTracer* tracer = GetDeviceTracer();
if (tracer) {
// We try to put all blocks at the same nested depth in the
// same timeline lane, and distinguish them using thread_id.
tracer->AddCPURecords(name_, start_ns_, PosixInNsec(), BlockDepth(),
CurThread());
g_thread_id);
}
ClearCurBlock();
}
RecordThread::RecordThread(int thread_id) {
if (g_state == ProfilerState::kDisabled) return;
SetCurThread(thread_id);
}
RecordThread::~RecordThread() {
if (g_state == ProfilerState::kDisabled) return;
ClearCurThread();
}
void EnableProfiler(ProfilerState state) {
PADDLE_ENFORCE(state != ProfilerState::kDisabled,
"Can't enbale profling, since the input state is ",
...
...@@ -95,11 +95,6 @@ struct RecordBlock {
uint64_t start_ns_;
};
struct RecordThread {
explicit RecordThread(int thread_id);
~RecordThread();
};
// Return the event list of all threads. Assumed the returned value calls
// event_lists, event_lists[i][j] represents the j-th Event of i-th thread.
std::vector<std::vector<Event>> GetAllEvents();
...
...@@ -419,6 +419,25 @@ EOF
linkchecker doc/v2/en/html/index.html
linkchecker doc/v2/cn/html/index.html
linkchecker doc/v2/api/en/html/index.html
if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi;
# Deploy to the content server if it's a "develop" or "release/version" branch
# The "develop_doc" branch is reserved to test full deploy process without impacting the real content.
if [ "$TRAVIS_BRANCH" == "develop_doc" ]; then
PPO_SCRIPT_BRANCH=develop
elif [[ "$TRAVIS_BRANCH" == "develop" || "$TRAVIS_BRANCH" =~ ^v|release/[[:digit:]]+\.[[:digit:]]+(\.[[:digit:]]+)?(-\S*)?$ ]]; then
PPO_SCRIPT_BRANCH=master
else
# Early exit, this branch doesn't require documentation build
return 0;
fi
# Fetch the paddlepaddle.org deploy_docs.sh from the appropriate branch
export DEPLOY_DOCS_SH=https://raw.githubusercontent.com/PaddlePaddle/PaddlePaddle.org/$PPO_SCRIPT_BRANCH/scripts/deploy/deploy_docs.sh
export PYTHONPATH=$PYTHONPATH:${PADDLE_ROOT}/build/python:/paddle/build/python
cd ..
curl $DEPLOY_DOCS_SH | bash -s $CONTENT_DEC_PASSWD $TRAVIS_BRANCH ${PADDLE_ROOT} ${PADDLE_ROOT}/build/doc/ ${PPO_SCRIPT_BRANCH}
cd -
}
function gen_html() {
......
...@@ -52,6 +52,9 @@ EOL
${DOCKER_CMD} run -it \
${DOCKER_ENV} \
-e SCRIPT_NAME=$0 \
-e CONTENT_DEC_PASSWD=$CONTENT_DEC_PASSWD \
-e TRAVIS_BRANCH=$TRAVIS_BRANCH \
-e TRAVIS_PULL_REQUEST=$TRAVIS_PULL_REQUEST \
-v $PADDLE_ROOT:/paddle \
-v ${HOME}/.ccache:/root/.ccache \
-w /paddle \
......
...@@ -40,4 +40,10 @@ def batch(reader, batch_size, drop_last=False):
if drop_last == False and len(b) != 0:
yield b
# Batch size check
batch_size = int(batch_size)
if batch_size <= 0:
raise ValueError("batch_size should be a positive integer value, "
"but got batch_size={}".format(batch_size))
return batch_reader
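As a reference for the check above, here is a self-contained sketch of the validated reader (plain Python; the range-based reader is a made-up stand-in for a real data source):

import sys

def batch(reader, batch_size, drop_last=False):
    # Reject non-positive sizes before the reader is built.
    batch_size = int(batch_size)
    if batch_size <= 0:
        raise ValueError("batch_size should be a positive integer value, "
                         "but got batch_size={}".format(batch_size))

    def batch_reader():
        b = []
        for item in reader():
            b.append(item)
            if len(b) == batch_size:
                yield b
                b = []
        # Keep the short tail batch unless drop_last is set.
        if not drop_last and len(b) != 0:
            yield b

    return batch_reader

r = batch(lambda: iter(range(5)), 2)
assert list(r()) == [[0, 1], [2, 3], [4]]   # tail batch kept when drop_last=False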
...@@ -123,7 +123,8 @@ def __bootstrap__():
read_env_flags = [
'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir',
'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb',
'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads'
'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads',
'cpu_deterministic'
]
if core.is_compiled_with_dist():
read_env_flags.append('rpc_deadline')
......
...@@ -572,8 +572,6 @@ def append_backward(loss, parameter_list=None, no_grad_set=None,
program.current_block_idx = current_block_idx
program._sync_with_cpp()
# FIXME(zcd): prevent loss.grad optimized by mem_opt.
loss.block.var(_append_grad_suffix_(loss.name)).persistable = True
if parameter_list is not None:
parameters = parameter_list
......
...@@ -1038,7 +1038,26 @@ class Block(object):
global_block = self.program.global_block()
param = Parameter(global_block, *args, **kwargs)
if 'initializer' in kwargs:
kwargs['initializer'](param, self)
def _is_inited_by(block, var):
init_ops = []
for op in block.ops:
if var.name in op.output_arg_names:
init_ops.append(op)
return init_ops
initializer = kwargs['initializer']
init_ops = _is_inited_by(global_block, param)
init_ops_len = len(init_ops)
if init_ops_len > 1:
raise RuntimeError("param " + param.name +
" is inited by multiple init ops " + str(
init_ops))
elif init_ops_len == 1:
# TODO: already initialized, do nothing, but a warning should be logged
pass
else:
initializer(param, self)
return param
def append_op(self, *args, **kwargs):
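With the guard above, calling create_parameter twice for the same name no longer stacks a second init op. A minimal sketch of the observable effect, assuming the fluid API of this release:

import paddle.fluid as fluid

prog = fluid.Program()
block = prog.global_block()
for _ in range(2):
    block.create_parameter(
        dtype='float32', shape=[5, 10], lod_level=0, name='w',
        initializer=fluid.initializer.ConstantInitializer())
# The second call finds an existing init op for 'w' and skips re-initialization.
assert len(block.ops) == 1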
......
...@@ -949,6 +949,10 @@ def dropout(x, dropout_prob, is_test=False, seed=None, name=None):
helper = LayerHelper('dropout', **locals())
out = helper.create_tmp_variable(dtype=x.dtype)
mask = helper.create_tmp_variable(dtype=x.dtype, stop_gradient=True)
if (seed is None or seed == 0) and helper.main_program.random_seed != 0:
seed = helper.main_program.random_seed
helper.append_op(
type='dropout',
inputs={'X': [x]},
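The two added lines let a program-level random_seed flow into dropout when no per-op seed is supplied; a hedged usage sketch (fluid API of this release assumed):

import paddle.fluid as fluid

prog = fluid.Program()
prog.random_seed = 42   # program-level seed
with fluid.program_guard(prog):
    x = fluid.layers.data(name='x', shape=[10], dtype='float32')
    # No explicit seed is passed, so the dropout op inherits seed 42.
    out = fluid.layers.dropout(x, dropout_prob=0.5)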
...@@ -1313,13 +1317,16 @@ def sequence_softmax(input, param_attr=None, bias_attr=None, use_cudnn=True):
def softmax(input, param_attr=None, bias_attr=None, use_cudnn=True, name=None):
"""
The input of the softmax layer is a 2-D tensor with shape N x K (N is the
batch_size, K is the dimension of input feature). The output tensor has the
same shape as the input tensor. For each row of the input tensor, the softmax
operator squashes the K-dimensional vector of arbitrary real values to a
K-dimensional vector of real values in the range [0, 1] that add up to 1.
The input of the softmax operator is a tensor of any rank. The output tensor
has the same shape as the input.
The input tensor will first be logically flattened to a 2-D matrix. The
matrix's second dimension (row length) is the same as the last dimension of
the input tensor, and the first dimension (column length) is the product of
all other dimensions of the input tensor. For each row of the matrix, the
softmax operator squashes the K-dimensional (K is the width of the matrix,
which is also the size of the input tensor's last dimension) vector of
arbitrary real values to a K-dimensional vector of real values in the range
[0, 1] that add up to 1.
It computes the exponential of the given dimension and the sum of exponential
values of all the other dimensions in the K-dimensional vector input.
...@@ -1327,7 +1334,7 @@ def softmax(input, param_attr=None, bias_attr=None, use_cudnn=True, name=None):
exponential values of all the other dimensions is the output of the softmax
operator.
For each row :math:`i` and each column :math:`j` in Input(X), we have:
For each row :math:`i` and each column :math:`j` in the matrix, we have:
.. math::
......
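The flattening rule described in the new docstring can be checked directly with numpy:

import numpy as np

x = np.random.rand(2, 3, 4)              # input of any rank
mat = x.reshape(-1, x.shape[-1])         # logical 2-D view: [6, 4]
e = np.exp(mat - mat.max(axis=1, keepdims=True))
out = (e / e.sum(axis=1, keepdims=True)).reshape(x.shape)
assert np.allclose(out.sum(axis=-1), 1.0)   # every last-dim row sums to 1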
...@@ -50,6 +50,8 @@ list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf)
list(REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_feed)
list(REMOVE_ITEM TEST_OPS test_dist_se_resnext)
list(REMOVE_ITEM TEST_OPS test_dist_transformer)
list(REMOVE_ITEM TEST_OPS test_parallel_executor_transformer)
list(REMOVE_ITEM TEST_OPS test_image_classification_resnet)
foreach(TEST_OP ${TEST_OPS})
py_test_modules(${TEST_OP} MODULES ${TEST_OP})
endforeach(TEST_OP)
...@@ -64,3 +66,5 @@ py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SE
py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL)
py_test_modules(test_dist_transformer MODULES test_dist_transformer SERIAL)
py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext SERIAL)
py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executor_transformer SERIAL)
py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL)
...@@ -174,6 +174,9 @@ class SE_ResNeXt():
padding=(filter_size - 1) / 2,
groups=groups,
act=None,
# avoid pserver CPU init differs from GPU
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant()),
bias_attr=False)
return fluid.layers.batch_norm(input=conv, act=act)
...@@ -194,10 +197,8 @@ class SE_ResNeXt():
def get_model(batch_size):
# Input data
image = fluid.layers.fill_constant(
shape=[batch_size, 3, 224, 224], dtype='float32', value=0.0)
label = fluid.layers.fill_constant(
shape=[batch_size, 1], dtype='int64', value=0.0)
image = fluid.layers.data(name="data", shape=[3, 224, 224], dtype='float32')
label = fluid.layers.data(name="int64", shape=[1], dtype='int64')
# Train program
model = SE_ResNeXt(layers=50)
...@@ -222,8 +223,10 @@ def get_model(batch_size):
lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
optimizer = fluid.optimizer.Momentum(
learning_rate=fluid.layers.piecewise_decay(
boundaries=bd, values=lr),
# FIXME(typhoonzero): add back LR decay once ParallelExecutor fixed.
#learning_rate=fluid.layers.piecewise_decay(
#    boundaries=bd, values=lr),
learning_rate=base_lr,
momentum=0.9,
regularization=fluid.regularizer.L2Decay(1e-4))
optimizer.minimize(avg_cost)
...@@ -232,7 +235,7 @@ def get_model(batch_size):
train_reader = paddle.batch(
paddle.dataset.flowers.train(), batch_size=batch_size)
test_reader = paddle.batch(
paddle.dataset.flowers.test(), batch_size=batch_size)
paddle.dataset.flowers.test(use_xmap=False), batch_size=batch_size)
return test_program, avg_cost, train_reader, test_reader, acc_top1, out
...@@ -256,7 +259,6 @@ class DistSeResneXt2x2:
trainers)
pserver_prog = t.get_pserver_program(current_endpoint)
startup_prog = t.get_startup_program(current_endpoint, pserver_prog)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(startup_prog)
...@@ -302,12 +304,19 @@ class DistSeResneXt2x2:
]
feeder = fluid.DataFeeder(feed_var_list, place)
reader_generator = train_reader()
reader_generator = test_reader()
first_loss, = exe.run(fetch_list=[avg_cost.name])
data = next(reader_generator)
first_loss, = exe.run(fetch_list=[avg_cost.name],
feed=feeder.feed(data))
print(first_loss)
for i in xrange(5):
loss, = exe.run(fetch_list=[avg_cost.name])
data = next(reader_generator)
loss, = exe.run(fetch_list=[avg_cost.name], feed=feeder.feed(data))
last_loss, = exe.run(fetch_list=[avg_cost.name])
data = next(reader_generator)
last_loss, = exe.run(fetch_list=[avg_cost.name], feed=feeder.feed(data))
print(last_loss)
......
...@@ -63,7 +63,8 @@ class TestDistBase(unittest.TestCase):
"PATH": os.getenv("PATH"),
"PYTHONPATH": os.getenv("PYTHONPATH"),
"LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH"),
"FLAGS_fraction_of_gpu_memory_to_use": "0.15",
"FLAGS_cudnn_deterministic": "1"
}
# Run local to get a base line
env_local = {"CUDA_VISIBLE_DEVICES": "0"}
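These entries end up in the environment of the spawned trainer process; a rough sketch of the mechanism (the script invocation here is only illustrative, not the harness's exact call):

import os
import subprocess

env = dict(os.environ)
env["FLAGS_fraction_of_gpu_memory_to_use"] = "0.15"
env["FLAGS_cudnn_deterministic"] = "1"   # pick deterministic cudnn kernels
subprocess.check_call(["python", "dist_se_resnext.py"], env=env)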
......
...@@ -17,8 +17,7 @@ from test_dist_base import TestDistBase
class TestDistSeResneXt2x2(TestDistBase):
def test_se_resnext(self):
# TODO(paddle-dev): Is the delta too large?
self.check_with_place("dist_se_resnext.py", delta=0.2)
self.check_with_place("dist_se_resnext.py")
if __name__ == "__main__":
......
...@@ -73,9 +73,18 @@ class TranspilerTest(unittest.TestCase):
return self.transpiler
def transpiler_test_impl(self):
pass
def test_transpiler(self):
main = fluid.Program()
startup = fluid.Program()
with fluid.program_guard(main, startup):
self.transpiler_test_impl()
class TestBasicModel(TranspilerTest):
def transpiler_test_impl(self):
pserver, startup = self.get_pserver(self.pserver1_ep)
pserver2, startup2 = self.get_pserver(self.pserver2_ep)
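After this refactor the base class owns the fresh-Program scaffolding, so a case only overrides transpiler_test_impl. A hypothetical new case (not part of the commit) would look like:

class TestMyConfig(TranspilerTest):   # illustrative subclass
    def transpiler_test_impl(self):
        # Runs inside the program_guard set up by test_transpiler above.
        pserver, startup = self.get_pserver(self.pserver1_ep)
        self.assertGreaterEqual(len(pserver.blocks), 1)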
...@@ -123,7 +132,7 @@ class TestBasicModel(TranspilerTest):
class TestBasicModelWithLargeBlockSize(TranspilerTest):
def test_transpiler(self):
def transpiler_test_impl(self):
config = fluid.DistributeTranspilerConfig()
config.min_block_size = 1048576
...@@ -148,7 +157,7 @@ class TestBasicModelWithLargeBlockSize(TranspilerTest):
["sum", "scale", "sgd"])
# confirm startup program
self.assertEqual([op.type for op in startup.global_block().ops],
["fill_constant", "fill_constant", "fill_constant"])
["fill_constant", "fill_constant"])
# the variable #fc_w will be split into two blocks
fc_w_var = startup2.global_block().var("fc_w")
self.assertEqual(fc_w_var.shape, (1000L, 1000L))
...@@ -177,7 +186,7 @@ class TestNoSliceVar(TranspilerTest):
def setUp(self):
super(TestNoSliceVar, self).setUp()
def test_transpiler(self):
def transpiler_test_impl(self):
config = fluid.DistributeTranspilerConfig()
config.slice_var_up = False
...@@ -212,7 +221,7 @@ class TestLRDecay(TranspilerTest):
sgd_optimizer.minimize(avg_cost)
return
def test_transpiler(self):
def transpiler_test_impl(self):
pserver, startup = self.get_pserver(self.pserver1_ep)
trainer = self.get_trainer()
...@@ -242,7 +251,7 @@ class TestLRDecayConditional(TranspilerTest):
sgd_optimizer.minimize(avg_cost)
return
def test_transpiler(self):
def transpiler_test_impl(self):
pserver, startup = self.get_pserver(self.pserver1_ep)
trainer = self.get_trainer()
...@@ -291,7 +300,7 @@ class TestL2Decay(TranspilerTest):
sgd_optimizer.minimize(avg_cost)
return
def test_transpiler(self):
def transpiler_test_impl(self):
pserver, startup = self.get_pserver(self.pserver1_ep)
trainer = self.get_trainer()
...@@ -326,7 +335,7 @@ class TestL2DecayWithPiecewise(TranspilerTest):
sgd_optimizer.minimize(avg_cost)
return
def test_transpiler(self):
def transpiler_test_impl(self):
pserver, startup = self.get_pserver(self.pserver1_ep)
trainer = self.get_trainer()
...@@ -350,5 +359,110 @@ class TestL2DecayWithPiecewise(TranspilerTest):
["sum", "scale", "scale", "elementwise_add", "momentum"])
class TestDistLookupTableBase(TranspilerTest):
def network_with_table(self, is_sparse, is_distributed):
def emb_pool(ids):
table_size = 1000
emb_size = 64
emb = fluid.layers.embedding(
input=ids,
size=[table_size, emb_size],
dtype='float32',
param_attr='shared_w', # share parameter
is_sparse=is_sparse,
is_distributed=is_distributed)
pool = fluid.layers.sequence_pool(input=emb, pool_type='average')
return pool
title_ids = fluid.layers.data(
name='title_ids', shape=[1], dtype='int64', lod_level=1)
brand_ids = fluid.layers.data(
name='brand_ids', shape=[1], dtype='int64', lod_level=1)
title_emb = emb_pool(title_ids)
brand_emb = emb_pool(brand_ids)
fc0 = fluid.layers.concat(input=[title_emb, brand_emb], axis=1)
predict = fluid.layers.fc(input=fc0,
size=2,
act=None,
param_attr=fluid.ParamAttr(name='fc_w'),
bias_attr=fluid.ParamAttr(name='fc_b'))
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(cost)
optimizer = fluid.optimizer.Adam(learning_rate=0.003)
optimizer.minimize(avg_cost)
class TestLocalLookupTable(TestDistLookupTableBase):
def net_conf(self):
self.network_with_table(is_sparse=True, is_distributed=False)
def transpiler_test_impl(self):
pserver1, startup1 = self.get_pserver(self.pserver1_ep)
self.assertEqual(len(pserver1.blocks), 3)
# 0 listen_and_serv
# 1 optimize for fc_w or fc_b adam
self.assertEqual([op.type for op in pserver1.blocks[1].ops],
["sum", "scale", "adam", "scale", "scale"])
# 2 optimize for table adam
# NOTE: if param is not selected rows, the grad will be scaled to grad / trainer_num
self.assertEqual([op.type for op in pserver1.blocks[2].ops],
["sum", "adam", "scale", "scale"])
trainer = self.get_trainer()
self.assertEqual(len(trainer.blocks), 1)
ops = [
'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool',
'concat', 'mul', 'elementwise_add', 'cross_entropy', 'mean',
'fill_constant', 'mean_grad', 'cross_entropy_grad',
'elementwise_add_grad', 'send', 'mul_grad', 'send', 'concat_grad',
'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad',
'lookup_table_grad', 'sum', 'split_selected_rows', 'send',
'send_barrier', 'recv', 'recv', 'recv', 'fetch_barrier', 'concat'
]
self.assertEqual([op.type for op in trainer.blocks[0].ops], ops)
class TestDistLookupTable(TestDistLookupTableBase):
def net_conf(self):
self.network_with_table(is_sparse=True, is_distributed=True)
def transpiler_test_impl(self):
pserver1, startup1 = self.get_pserver(self.pserver1_ep)
self.assertEqual(len(pserver1.blocks), 6)
# 0 listen_and_serv
# 1 optimize for fc_w or fc_b adam
self.assertEqual([op.type for op in pserver1.blocks[1].ops],
["sum", "scale", "adam", "scale", "scale"])
# 2 optimize for table sgd
self.assertEqual([op.type for op in pserver1.blocks[2].ops],
["sum", "sgd"])
# 3 prefetch -> lookup_sparse_table for data0
self.assertEqual([op.type for op in pserver1.blocks[3].ops],
["lookup_sparse_table"])
# 4 prefetch -> lookup_sparse_table for data1
self.assertEqual([op.type for op in pserver1.blocks[4].ops],
["lookup_sparse_table"])
# 5 save table
self.assertEqual([op.type for op in pserver1.blocks[5].ops], ["save"])
trainer = self.get_trainer()
self.assertEqual(len(trainer.blocks), 1)
ops = [
'split_ids', 'prefetch', 'merge_ids', 'sequence_pool', 'split_ids',
'prefetch', 'merge_ids', 'sequence_pool', 'concat', 'mul',
'elementwise_add', 'cross_entropy', 'mean', 'fill_constant',
'mean_grad', 'cross_entropy_grad', 'elementwise_add_grad', 'send',
'mul_grad', 'send', 'concat_grad', 'sequence_pool_grad',
'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad',
'sum', 'split_ids', 'send', 'send_barrier', 'recv', 'recv',
'fetch_barrier'
]
self.assertEqual([op.type for op in trainer.blocks[0].ops], ops)
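The expected op list encodes the distributed-lookup data path: ids are split per pserver, prefetch fetches the rows remotely, and merge_ids restores the original order. A loose numpy analogy of that path (the mod-sharding and helpers here are illustrative, not the real op implementations):

import numpy as np

ids = np.array([3, 7, 2, 8])
even = ids % 2 == 0                                   # split_ids: shard by id % 2
table = {0: np.random.rand(10, 4), 1: np.random.rand(10, 4)}
parts = {0: ids[even], 1: ids[~even]}
fetched = {k: table[k][v] for k, v in parts.items()}  # prefetch on each pserver
out = np.empty((len(ids), 4))
out[even] = fetched[0]                                # merge_ids: restore order
out[~even] = fetched[1]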
if __name__ == "__main__":
unittest.main()
...@@ -27,12 +27,13 @@ class TestConstantInitializer(unittest.TestCase):
"""
program = framework.Program()
block = program.global_block()
block.create_parameter(
for _ in range(2):
block.create_parameter(
dtype="float32",
shape=[5, 10],
lod_level=0,
name="param",
initializer=initializer.ConstantInitializer())
self.assertEqual(len(block.ops), 1)
init_op = block.ops[0]
self.assertEqual(init_op.type, 'fill_constant')
...@@ -43,12 +44,13 @@ class TestConstantInitializer(unittest.TestCase):
"""
program = framework.Program()
block = program.global_block()
block.create_parameter(
for _ in range(2):
block.create_parameter(
dtype="float32",
shape=[5, 10],
lod_level=0,
name="param",
initializer=initializer.ConstantInitializer(2.3))
self.assertEqual(len(block.ops), 1)
init_op = block.ops[0]
self.assertEqual(init_op.type, 'fill_constant')
...@@ -61,12 +63,13 @@ class TestUniformInitializer(unittest.TestCase):
"""
program = framework.Program()
block = program.global_block()
block.create_parameter(
for _ in range(2):
block.create_parameter(
dtype="float32",
shape=[5, 10],
lod_level=0,
name="param",
initializer=initializer.UniformInitializer())
self.assertEqual(len(block.ops), 1)
init_op = block.ops[0]
self.assertEqual(init_op.type, 'uniform_random')
...@@ -80,18 +83,19 @@ class TestUniformInitializer(unittest.TestCase):
program = framework.Program()
program.random_seed = 123
block = program.global_block()
block.create_parameter(
for _ in range(2):
block.create_parameter(
dtype="float32",
shape=[5, 10],
lod_level=0,
name="param",
name="param1",
initializer=initializer.UniformInitializer())
block.create_parameter(
dtype="float32",
shape=[5, 10],
lod_level=0,
name="param",
name="param2",
initializer=initializer.UniformInitializer(seed=456))
init_op = block.ops[1]
self.assertEqual(init_op.attr("seed"), 123)
init_op1 = block.ops[0]
...@@ -102,12 +106,13 @@ class TestUniformInitializer(unittest.TestCase):
"""
program = framework.Program()
block = program.global_block()
block.create_parameter(
for _ in range(2):
block.create_parameter(
dtype="float32",
shape=[5, 10],
lod_level=0,
name="param",
initializer=initializer.UniformInitializer(-4.2, 3.1, 123))
self.assertEqual(len(block.ops), 1)
init_op = block.ops[0]
self.assertEqual(init_op.type, 'uniform_random')
...@@ -115,6 +120,25 @@ class TestUniformInitializer(unittest.TestCase):
self.assertAlmostEqual(init_op.attr('max'), 3.1, delta=DELTA)
self.assertEqual(init_op.attr('seed'), 123)
def test_uniform_initializer_two_op(self):
"""Test uniform initializer with supplied attributes
"""
program = framework.Program()
block = program.global_block()
for i in range(2):
block.create_parameter(
dtype="float32",
shape=[5, 10],
lod_level=0,
name="param",
initializer=initializer.UniformInitializer(-4.2, float(i), 123))
self.assertEqual(len(block.ops), 1)
init_op0 = block.ops[0]
self.assertEqual(init_op0.type, 'uniform_random')
self.assertAlmostEqual(init_op0.attr('min'), -4.2, delta=DELTA)
self.assertAlmostEqual(init_op0.attr('max'), 0.0, delta=DELTA)
self.assertEqual(init_op0.attr('seed'), 123)
class TestNormalInitializer(unittest.TestCase):
def test_normal_initializer_default_value(self):
...@@ -122,12 +146,13 @@ class TestNormalInitializer(unittest.TestCase):
"""
program = framework.Program()
block = program.global_block()
block.create_parameter(
for _ in range(2):
block.create_parameter(
dtype="float32",
shape=[5, 10],
lod_level=0,
name="param",
initializer=initializer.NormalInitializer())
self.assertEqual(len(block.ops), 1)
init_op = block.ops[0]
self.assertEqual(init_op.type, 'gaussian_random')
...@@ -140,12 +165,13 @@ class TestNormalInitializer(unittest.TestCase):
"""
program = framework.Program()
block = program.global_block()
block.create_parameter(
for _ in range(2):
block.create_parameter(
dtype="float32",
shape=[5, 10],
lod_level=0,
name="param",
initializer=initializer.NormalInitializer(2.3, 1.9, 123))
self.assertEqual(len(block.ops), 1)
init_op = block.ops[0]
self.assertEqual(init_op.type, 'gaussian_random')
...@@ -161,12 +187,13 @@ class TestXavierInitializer(unittest.TestCase):
"""
program = framework.Program()
block = program.global_block()
param = block.create_parameter(
for _ in range(2):
param = block.create_parameter(
dtype="float32",
shape=[5, 10],
lod_level=0,
name="param",
initializer=initializer.XavierInitializer())
self.assertEqual(len(block.ops), 1)
init_op = block.ops[0]
self.assertEqual(init_op.type, 'uniform_random')
...@@ -181,12 +208,13 @@ class TestXavierInitializer(unittest.TestCase):
"""
program = framework.Program()
block = program.global_block()
param = block.create_parameter(
for _ in range(2):
param = block.create_parameter(
dtype="float32",
shape=[5, 10, 15, 20],
lod_level=0,
name="param",
initializer=initializer.XavierInitializer())
self.assertEqual(len(block.ops), 1)
init_op = block.ops[0]
self.assertEqual(init_op.type, 'uniform_random')
...@@ -203,12 +231,13 @@ class TestXavierInitializer(unittest.TestCase):
"""
program = framework.Program()
block = program.global_block()
param = block.create_parameter(
for _ in range(2):
param = block.create_parameter(
dtype="float32",
shape=[5, 10],
lod_level=0,
name="param",
initializer=initializer.XavierInitializer(uniform=False))
self.assertEqual(len(block.ops), 1)
init_op = block.ops[0]
self.assertEqual(init_op.type, 'gaussian_random')
...@@ -223,12 +252,13 @@ class TestXavierInitializer(unittest.TestCase):
"""
program = framework.Program()
block = program.global_block()
param = block.create_parameter(
for _ in range(2):
param = block.create_parameter(
dtype="float32",
shape=[5, 10, 15, 20],
lod_level=0,
name="param",
initializer=initializer.XavierInitializer(uniform=False))
self.assertEqual(len(block.ops), 1)
init_op = block.ops[0]
self.assertEqual(init_op.type, 'gaussian_random')
...@@ -244,13 +274,14 @@ class TestXavierInitializer(unittest.TestCase):
"""
program = framework.Program()
block = program.global_block()
block.create_parameter(
for _ in range(2):
block.create_parameter(
dtype="float32",
shape=[5, 10],
lod_level=0,
name="param",
initializer=initializer.XavierInitializer(
fan_in=12, fan_out=23, seed=134))
self.assertEqual(len(block.ops), 1)
init_op = block.ops[0]
self.assertEqual(init_op.type, 'uniform_random')
...@@ -267,12 +298,13 @@ class TestMSRAInitializer(unittest.TestCase):
"""
program = framework.Program()
block = program.global_block()
param = block.create_parameter(
for _ in range(2):
param = block.create_parameter(
dtype="float32",
shape=[5, 10],
lod_level=0,
name="param",
initializer=initializer.MSRAInitializer())
self.assertEqual(len(block.ops), 1)
init_op = block.ops[0]
self.assertEqual(init_op.type, 'uniform_random')
...@@ -287,12 +319,13 @@ class TestMSRAInitializer(unittest.TestCase):
"""
program = framework.Program()
block = program.global_block()
param = block.create_parameter(
for _ in range(2):
param = block.create_parameter(
dtype="float32",
shape=[5, 10, 15, 20],
lod_level=0,
name="param",
initializer=initializer.MSRAInitializer())
self.assertEqual(len(block.ops), 1)
init_op = block.ops[0]
self.assertEqual(init_op.type, 'uniform_random')
...@@ -308,12 +341,13 @@ class TestMSRAInitializer(unittest.TestCase):
"""
program = framework.Program()
block = program.global_block()
param = block.create_parameter(
for _ in range(2):
param = block.create_parameter(
dtype="float32",
shape=[5, 10],
lod_level=0,
name="param",
initializer=initializer.MSRAInitializer(uniform=False))
self.assertEqual(len(block.ops), 1)
init_op = block.ops[0]
self.assertEqual(init_op.type, 'gaussian_random')
...@@ -328,12 +362,13 @@ class TestMSRAInitializer(unittest.TestCase):
"""
program = framework.Program()
block = program.global_block()
param = block.create_parameter(
for _ in range(2):
param = block.create_parameter(
dtype="float32",
shape=[5, 10, 15, 20],
lod_level=0,
name="param",
initializer=initializer.MSRAInitializer(uniform=False))
self.assertEqual(len(block.ops), 1)
init_op = block.ops[0]
self.assertEqual(init_op.type, 'gaussian_random')
...@@ -348,13 +383,14 @@ class TestMSRAInitializer(unittest.TestCase):
"""
program = framework.Program()
block = program.global_block()
block.create_parameter(
for _ in range(2):
block.create_parameter(
dtype="float32",
shape=[5, 10],
lod_level=0,
name="param",
initializer=initializer.MSRAInitializer(
fan_in=12, seed=134))
self.assertEqual(len(block.ops), 1)
init_op = block.ops[0]
self.assertEqual(init_op.type, 'uniform_random')
...@@ -370,12 +406,13 @@ class TestMSRAInitializer(unittest.TestCase):
"""
program = framework.Program()
block = program.global_block()
block.create_parameter(
for _ in range(2):
block.create_parameter(
dtype="float32",
shape=[8, 1, 3, 3],
lod_level=0,
name="param",
initializer=initializer.BilinearInitializer())
self.assertEqual(len(block.ops), 1)
init_op = block.ops[0]
self.assertEqual(init_op.type, 'assign_value')
......
...@@ -35,6 +35,22 @@ class TestLookupTableOp(OpTest):
self.check_grad(['W'], 'Out', no_grad_set=set('Ids'))
class TestLookupTableOpWithTensorIds(OpTest):
def setUp(self):
self.op_type = "lookup_table"
table = np.random.random((17, 31)).astype("float32")
ids = np.random.randint(
low=0, high=17, size=(2, 4, 5, 1)).astype("int64")
self.inputs = {'W': table, 'Ids': ids}
self.outputs = {'Out': table[ids.flatten()].reshape((2, 4, 5, 31))}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(['W'], 'Out', no_grad_set=set('Ids'))
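The reshape in this test states the general rule: the lookup keeps every leading dimension of Ids (minus the trailing 1) and appends the embedding width. Checked directly with numpy:

import numpy as np

table = np.random.random((17, 31)).astype("float32")
ids = np.random.randint(low=0, high=17, size=(2, 4, 5, 1)).astype("int64")
out = table[ids.flatten()].reshape((2, 4, 5, 31))
assert out.shape == ids.shape[:-1] + (31,)   # leading id dims + embedding width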
class TestLookupTableOpWithPadding(TestLookupTableOp):
def test_check_output(self):
ids = np.squeeze(self.inputs['Ids'])
...@@ -44,21 +60,34 @@ class TestLookupTableOpWithPadding(TestLookupTableOp):
self.check_output()
def test_check_grad(self):
# Since paddings are not trainable and fixed in forward, the gradient of
# paddings makes no sense and we don't test the gradient here.
pass
class TestLookupTableOpWithTensorIdsAndPadding(TestLookupTableOpWithTensorIds):
def test_check_output(self):
ids = self.inputs['Ids']
flatten_idx = ids.flatten()
padding_idx = np.random.choice(flatten_idx, 1)[0]
self.outputs['Out'][np.squeeze(ids == padding_idx)] = np.zeros(31)
self.attrs = {'padding_idx': long(padding_idx)}
self.check_output()
def test_check_grad(self):
# Since paddings are not trainable and fixed in forward, the gradient of
# paddings makes no sense and we don't test the gradient here.
pass
class TestLookupTableWIsSelectedRows(OpTest):
def check_with_place(self, place):
scope = core.Scope()
# create and initialize Id Variable
def prepare_ids(self, scope, place):
ids_tensor = scope.var('Ids').get_tensor()
ids_array = np.array([[0], [4], [3], [5]]).astype("int64")
ids_tensor.set(ids_array, place)
return ids_array
# create and initialize W Variable
def prepare_w(self, scope, place):
rows = [0, 1, 2, 3, 4, 5, 6]
row_numel = 12
...@@ -71,8 +100,22 @@ class TestLookupTableWIsSelectedRows(OpTest):
w_tensor = w_selected_rows.get_tensor()
w_tensor.set(w_array, place)
# create Out Variable
out_tensor = scope.var('Out').get_tensor()
def create_out_tensor(self, scope, place):
return scope.var('Out').get_tensor()
def check_result(self, ids_array, result_array):
# all(): return True if all elements of the iterable are true (or if the iterable is empty)
for idx, row in enumerate(ids_array):
assert (row[0] == result_array[idx]).all()
def check_with_place(self, place):
scope = core.Scope()
ids_array = self.prepare_ids(scope, place)
self.prepare_w(scope, place)
out_tensor = self.create_out_tensor(scope, place)
# create and run lookup_table operator
lookup_table = Operator("lookup_table", W='W', Ids='Ids', Out='Out')
...@@ -80,9 +123,8 @@ class TestLookupTableWIsSelectedRows(OpTest):
# get result from Out
result_array = np.array(out_tensor)
# all(): return True if all elements of the iterable are true (or if the iterable is empty)
for idx, row in enumerate(ids_array):
assert (row[0] == result_array[idx]).all()
self.check_result(ids_array, result_array)
def test_w_is_selected_rows(self):
places = [core.CPUPlace()]
...@@ -91,5 +133,19 @@ class TestLookupTableWIsSelectedRows(OpTest):
self.check_with_place(place)
class TestLookupTableWithTensorIdsWIsSelectedRows(
TestLookupTableWIsSelectedRows):
def prepare_ids(self, scope, place):
ids_tensor = scope.var('Ids').get_tensor()
ids_array = np.random.randint(
low=0, high=6, size=(2, 4, 3, 1)).astype("int64")
ids_tensor.set(ids_array, place)
return ids_array
def check_result(self, ids_array, result_array):
for idx, row in np.ndenumerate(ids_array):
assert (row == result_array[idx]).all()
if __name__ == "__main__":
unittest.main()
...@@ -98,16 +98,13 @@ class TestMNIST(TestParallelExecutorBase):
fluid.recordio_writer.convert_reader_to_recordio_file(
MNIST_RECORDIO_FILE, reader, feeder)
def _init_data(self, random=True):
def _init_data(self):
np.random.seed(5)
if random:
img = np.random.random(size=[32, 784]).astype(np.float32)
else:
img = np.ones(shape=[32, 784], dtype='float32')
img = np.random.random(size=[32, 784]).astype(np.float32)
label = np.ones(shape=[32, 1], dtype='int64')
return img, label
def _compare_reduce_and_allreduce(self, model, use_cuda, random_data=True):
def _compare_reduce_and_allreduce(self, model, use_cuda):
if use_cuda and not core.is_compiled_with_cuda():
return
self.check_network_convergence(
...@@ -115,7 +112,7 @@ class TestMNIST(TestParallelExecutorBase):
self.check_network_convergence(
model, use_cuda=use_cuda, allow_op_delay=True, use_reduce=True)
img, label = self._init_data(random_data)
img, label = self._init_data()
all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence(
model,
...@@ -166,27 +163,27 @@ class TestMNIST(TestParallelExecutorBase):
if use_cuda and not core.is_compiled_with_cuda():
return
img, label = self._init_data(random=False)
img, label = self._init_data()
single_first_loss, single_last_loss = self.check_network_convergence(
method=simple_fc_net,
seed=1000,
seed=1,
feed_dict={"image": img,
"label": label},
use_cuda=use_cuda,
use_parallel_executor=False)
parallel_first_loss, parallel_last_loss = self.check_network_convergence(
method=simple_fc_net,
seed=1000,
seed=1,
feed_dict={"image": img,
"label": label},
use_cuda=use_cuda,
use_parallel_executor=True)
for p_f in parallel_first_loss:
self.assertAlmostEquals(p_f, single_first_loss[0], delta=1e-6)
for p_l in parallel_last_loss:
self.assertAlmostEquals(p_l, single_last_loss[0], delta=1e-6)
self.assertAlmostEquals(
np.mean(parallel_first_loss), single_first_loss, delta=1e-6)
self.assertAlmostEquals(
np.mean(parallel_last_loss), single_last_loss, delta=1e-6)
def test_simple_fc_parallel_accuracy(self):
self.check_simple_fc_parallel_accuracy(True)
......
...@@ -21,6 +21,19 @@ from parallel_executor_test_base import TestParallelExecutorBase
import unittest
import math
import os
import numpy as np
# FIXME(zcd): If the neural net has dropout_op, the outputs of ParallelExecutor
# and Executor differ. For ParallelExecutor, the dropout_op of the neural net
# is copied N times (N is the number of devices), so the random numbers
# generated by ParallelExecutor and Executor are different. Therefore, to
# compare the losses of ParallelExecutor and Executor, we should remove the
# dropout_op.
remove_dropout = False
# FIXME(zcd): If the neural net has batch_norm, the output of ParallelExecutor
# and Executor is different.
remove_bn = False
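A tiny numpy illustration of why copied dropout ops diverge: two devices seeding their masks independently keep different elements, so losses differ even on identical inputs (the seeds here are arbitrary):

import numpy as np

rng_dev0 = np.random.RandomState(1)   # dropout copy on device 0
rng_dev1 = np.random.RandomState(2)   # dropout copy on device 1
x = np.ones(4)
keep0 = rng_dev0.rand(4) >= 0.5
keep1 = rng_dev1.rand(4) >= 0.5
print((x * keep0).sum(), (x * keep1).sum())   # generally unequal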
def squeeze_excitation(input, num_channels, reduction_ratio):
...@@ -53,7 +66,8 @@ def conv_bn_layer(input, num_filters, filter_size, stride=1, groups=1,
groups=groups,
act=None,
bias_attr=False)
return fluid.layers.batch_norm(input=conv, act=act, momentum=0.1)
return conv if remove_bn else fluid.layers.batch_norm(
input=conv, act=act, momentum=0.1)
def shortcut(input, ch_out, stride):
...@@ -92,13 +106,14 @@ def bottleneck_block(input, num_filters, stride, cardinality, reduction_ratio):
return fluid.layers.elementwise_add(x=short, y=scale, act='relu')
def SE_ResNeXt50Small(batch_size=2, use_feed=False):
assert not use_feed, "SE_ResNeXt doesn't support feed yet"
img = fluid.layers.fill_constant(
shape=[batch_size, 3, 224, 224], dtype='float32', value=0.0)
label = fluid.layers.fill_constant(
shape=[batch_size, 1], dtype='int64', value=0.0)
batch_size = 12
img_shape = [3, 224, 224]
def SE_ResNeXt50Small(use_feed):
img = fluid.layers.data(name='image', shape=img_shape, dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
conv = conv_bn_layer(
input=img, num_filters=16, filter_size=3, stride=2, act='relu')
...@@ -127,7 +142,8 @@ def SE_ResNeXt50Small(batch_size=2, use_feed=False):
reshape = fluid.layers.reshape(
x=conv, shape=[-1, shape[1], shape[2] * shape[3]])
pool = fluid.layers.reduce_mean(input=reshape, dim=2)
dropout = fluid.layers.dropout(x=pool, dropout_prob=0.2)
dropout = pool if remove_dropout else fluid.layers.dropout(
x=pool, dropout_prob=0.2, seed=1)
# Classifier layer:
prediction = fluid.layers.fc(input=dropout, size=1000, act='softmax')
loss = fluid.layers.cross_entropy(input=prediction, label=label)
...@@ -135,75 +151,135 @@ def SE_ResNeXt50Small(batch_size=2, use_feed=False):
return loss
class TestResnet(TestParallelExecutorBase):
def check_resnet_convergence_with_learning_rate_decay(self,
use_cuda=True,
use_reduce=False,
iter=20):
if use_cuda and not core.is_compiled_with_cuda():
return
os.environ['CPU_NUM'] = str(4)
def _cosine_decay(learning_rate, step_each_epoch, epochs=120):
"""
Applies cosine decay to the learning rate.
lr = 0.05 * (math.cos(epoch * (math.pi / 120)) + 1)
"""
global_step = _decay_step_counter()
with init_on_cpu():
epoch = ops.floor(global_step / step_each_epoch)
decayed_lr = learning_rate * \
(ops.cos(epoch * (math.pi / epochs)) + 1)/2
return decayed_lr
def _optimizer(learning_rate=0.01):
optimizer = fluid.optimizer.Momentum(
learning_rate=_cosine_decay(
learning_rate=learning_rate, step_each_epoch=2, epochs=1),
momentum=0.9,
regularization=fluid.regularizer.L2Decay(1e-4))
return optimizer
def cosine_decay(learning_rate, step_each_epoch, epochs=120):
"""
Applies cosine decay to the learning rate:
lr = learning_rate * (math.cos(epoch * (math.pi / epochs)) + 1) / 2
"""
global_step = _decay_step_counter()
with init_on_cpu():
epoch = ops.floor(global_step / step_each_epoch)
decayed_lr = learning_rate * \
(ops.cos(epoch * (math.pi / epochs)) + 1)/2
return decayed_lr
def optimizer(learning_rate=0.01):
optimizer = fluid.optimizer.Momentum(
learning_rate=cosine_decay(
learning_rate=learning_rate, step_each_epoch=2, epochs=1),
momentum=0.9,
regularization=fluid.regularizer.L2Decay(1e-4))
return optimizer
class TestResnet(TestParallelExecutorBase):
@classmethod
def setUpClass(cls):
os.environ['CPU_NUM'] = str(4)
global remove_dropout
global remove_bn
remove_dropout = False
remove_bn = False
def _init_data(self, batch_size=2, random=True):
np.random.seed(5)
if random:
img = np.random.random(
size=[batch_size] + img_shape).astype(np.float32)
else:
img = np.ones(shape=[batch_size] + img_shape, dtype='float32')
label = [np.random.randint(0, 999) for _ in range(batch_size)]
label = np.array(label).astype(np.int64).reshape(-1, 1)
return img, label
def _compare_reduce_and_allreduce(self,
model,
use_cuda,
iter=20,
delta2=1e-6):
if use_cuda and not core.is_compiled_with_cuda():
return
import functools
global remove_bn
remove_bn = True
batch_size = 2
img, label = self._init_data(batch_size=batch_size)
all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence(
model,
feed_dict={"image": img,
"label": label},
iter=iter,
batch_size=batch_size,
use_cuda=use_cuda,
use_reduce=False,
optimizer=optimizer)
reduce_first_loss, reduce_last_loss = self.check_network_convergence(
model,
feed_dict={"image": img,
"label": label},
iter=iter,
batch_size=batch_size,
use_cuda=use_cuda,
use_reduce=True,
optimizer=optimizer)
for loss in zip(all_reduce_first_loss, reduce_first_loss):
self.assertAlmostEquals(loss[0], loss[1], delta=1e-6)
for loss in zip(all_reduce_last_loss, reduce_last_loss):
self.assertAlmostEquals(loss[0], loss[1], delta=delta2)
def _check_resnet_convergence(self,
model,
use_cuda=True,
use_reduce=False,
iter=20,
delta2=1e-6):
if use_cuda and not core.is_compiled_with_cuda():
return
global remove_dropout
global remove_bn
remove_dropout = True
remove_bn = True
img, label = self._init_data(batch_size=batch_size)
single_first_loss, single_last_loss = self.check_network_convergence(
functools.partial(
SE_ResNeXt50Small, batch_size=batch_size),
model,
feed_dict={"image": img,
"label": label},
iter=iter,
batch_size=batch_size,
use_cuda=use_cuda,
use_reduce=use_reduce,
optimizer=_optimizer,
optimizer=optimizer,
use_parallel_executor=False)
parallel_first_loss, parallel_last_loss = self.check_network_convergence(
functools.partial(
SE_ResNeXt50Small, batch_size=batch_size),
model,
feed_dict={"image": img,
"label": label},
iter=iter,
batch_size=batch_size,
use_cuda=use_cuda,
use_reduce=use_reduce,
optimizer=_optimizer)
optimizer=optimizer)
for p_f in parallel_first_loss:
self.assertAlmostEquals(p_f, single_first_loss[0], delta=1e-6)
for p_l in parallel_last_loss:
self.assertAlmostEquals(p_l, single_last_loss[0], delta=1e-6)
self.assertAlmostEquals(
np.mean(parallel_first_loss), single_first_loss[0], delta=1e-6)
self.assertAlmostEquals(
np.mean(parallel_last_loss), single_last_loss[0], delta=delta2)
def test_seresnext_with_learning_rate_decay(self):
self.check_resnet_convergence_with_learning_rate_decay(True, False)
self.check_resnet_convergence_with_learning_rate_decay(
False, False, iter=5)
self._check_resnet_convergence(model=SE_ResNeXt50Small, use_cuda=True)
self._check_resnet_convergence(
model=SE_ResNeXt50Small, use_cuda=False, iter=2, delta2=1e-3)
def test_seresnext_with_new_strategy_with_learning_rate_decay(self):
def test_seresnext_with_new_strategy(self):
self.check_resnet_convergence_with_learning_rate_decay(True, True)
self.check_resnet_convergence_with_learning_rate_decay(
False, True, iter=5)
self._compare_reduce_and_allreduce(
model=SE_ResNeXt50Small, use_cuda=True, delta2=1e-2)
self._compare_reduce_and_allreduce(
model=SE_ResNeXt50Small, use_cuda=False, iter=5)
if __name__ == '__main__':
......
...@@ -26,15 +26,22 @@ def stable_softmax(x):
class TestSoftmaxOp(OpTest):
def get_x_shape(self):
return [10, 10]
def setUp(self):
self.op_type = "softmax"
self.use_cudnn = False
self.use_mkldnn = False
self.dtype = np.float32
self.init_kernel_type()
x = np.random.uniform(0.1, 1, [10, 10]).astype(self.dtype)
out = np.apply_along_axis(stable_softmax, 1, x)
self.shape = self.get_x_shape()
x = np.random.uniform(0.1, 1, self.shape).astype(self.dtype)
out = np.apply_along_axis(stable_softmax, 1,
x.reshape([-1, self.shape[-1]]))
out = out.reshape(self.shape)
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
self.attrs = {
...@@ -63,6 +70,11 @@ class TestSoftmaxOp(OpTest):
self.check_grad(["X"], "Out", max_relative_error=0.01)
class TestSoftmaxOp2(TestSoftmaxOp):
def get_x_shape(self):
return [2, 3, 4, 5]
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestSoftmaxCUDNNOp(TestSoftmaxOp):
...@@ -70,6 +82,13 @@ class TestSoftmaxCUDNNOp(TestSoftmaxOp):
self.use_cudnn = True
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestSoftmaxCUDNNOp2(TestSoftmaxCUDNNOp):
def get_x_shape(self):
return [2, 3, 4, 5]
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestSoftmaxFP16Op(TestSoftmaxOp):
...@@ -83,6 +102,13 @@ class TestSoftmaxFP16Op(TestSoftmaxOp):
self.check_output_with_place(place, atol=1e-3)
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestSoftmaxFP16Op2(TestSoftmaxFP16Op):
def get_x_shape(self):
return [2, 3, 4, 5]
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestSoftmaxFP16CUDNNOp(TestSoftmaxOp):
...@@ -97,10 +123,22 @@ class TestSoftmaxFP16CUDNNOp(TestSoftmaxOp):
self.check_output_with_place(place, atol=1e-3)
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestSoftmaxFP16CUDNNOp2(TestSoftmaxFP16CUDNNOp):
def get_x_shape(self):
return [2, 3, 4, 5]
class TestSoftmaxMKLDNNOp(TestSoftmaxOp):
def init_kernel_type(self):
self.use_mkldnn = True
class TestSoftmaxMKLDNNOp2(TestSoftmaxMKLDNNOp):
def get_x_shape(self):
return [2, 3, 4, 5]
if __name__ == "__main__":
unittest.main()
...@@ -896,8 +896,6 @@ class DistributeTranspiler(object):
self.table_name
][0]
table_opt_block = pserver_program.create_block(pre_block_idx)
# only support sgd now
assert table_opt_op.type == "sgd"
if self.sync_mode:
# create grad vars in pserver program
...@@ -937,11 +935,12 @@ class DistributeTranspiler(object):
"LearningRate": [lr_var]
}
outputs = {"ParamOut": [param_var]}
table_opt_block.append_op(
type=table_opt_op.type,
inputs=inputs,
outputs=outputs,
attrs=table_opt_op.attrs)
# only support sgd now
import logging
logging.warn(
"distribute lookup table only supports sgd optimizer; changing its "
"optimizer to sgd instead of " + table_opt_op.type)
table_opt_block.append_op(type="sgd", inputs=inputs, outputs=outputs)
# add table parameter gradient and its block id to grad_to_block_id
grad_to_block_id.append(grad_var.name + ":" + str(table_opt_block.idx))
......