Commit 0a885ac1 authored by Yancey1989

Merge branch 'develop' of github.com:PaddlePaddle/Paddle into parallel_graph_mode

test=develop
...@@ -139,10 +139,12 @@ endfunction() ...@@ -139,10 +139,12 @@ endfunction()
message(STATUS "CUDA detected: " ${CUDA_VERSION}) message(STATUS "CUDA detected: " ${CUDA_VERSION})
if (${CUDA_VERSION} LESS 7.0) if (${CUDA_VERSION} LESS 7.0)
set(paddle_known_gpu_archs ${paddle_known_gpu_archs}) set(paddle_known_gpu_archs ${paddle_known_gpu_archs})
add_definitions("-DPADDLE_CUDA_BINVER=\"60\"")
elseif (${CUDA_VERSION} LESS 8.0) # CUDA 7.x elseif (${CUDA_VERSION} LESS 8.0) # CUDA 7.x
set(paddle_known_gpu_archs ${paddle_known_gpu_archs7}) set(paddle_known_gpu_archs ${paddle_known_gpu_archs7})
list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED") list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__") list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__")
add_definitions("-DPADDLE_CUDA_BINVER=\"70\"")
elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x
set(paddle_known_gpu_archs ${paddle_known_gpu_archs8}) set(paddle_known_gpu_archs ${paddle_known_gpu_archs8})
list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED") list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
...@@ -150,6 +152,7 @@ elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x ...@@ -150,6 +152,7 @@ elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x
# CUDA 8 may complain that sm_20 is no longer supported. Suppress the # CUDA 8 may complain that sm_20 is no longer supported. Suppress the
# warning for now. # warning for now.
list(APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets") list(APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets")
add_definitions("-DPADDLE_CUDA_BINVER=\"80\"")
endif() endif()
include_directories(${CUDA_INCLUDE_DIRS}) include_directories(${CUDA_INCLUDE_DIRS})
......
...@@ -89,6 +89,7 @@ if(CUDNN_FOUND) ...@@ -89,6 +89,7 @@ if(CUDNN_FOUND)
if(NOT CUDNN_MAJOR_VERSION) if(NOT CUDNN_MAJOR_VERSION)
set(CUDNN_VERSION "???") set(CUDNN_VERSION "???")
else() else()
add_definitions("-DPADDLE_CUDNN_BINVER=\"${CUDNN_MAJOR_VERSION}\"")
math(EXPR CUDNN_VERSION math(EXPR CUDNN_VERSION
"${CUDNN_MAJOR_VERSION} * 1000 + "${CUDNN_MAJOR_VERSION} * 1000 +
${CUDNN_MINOR_VERSION} * 100 + ${CUDNN_PATCHLEVEL_VERSION}") ${CUDNN_MINOR_VERSION} * 100 + ${CUDNN_PATCHLEVEL_VERSION}")
......
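The add_definitions calls above stamp the detected CUDA and cuDNN major versions into every translation unit as the string macros PADDLE_CUDA_BINVER and PADDLE_CUDNN_BINVER. A minimal, hedged C++ sketch of how such compile-time version strings could be surfaced at runtime follows; the BuildVersionBanner helper and the fallback values are illustrative assumptions, not part of this commit.

#include <iostream>
#include <string>

// Fallbacks for builds where CMake did not pass the definitions (assumed values).
#ifndef PADDLE_CUDA_BINVER
#define PADDLE_CUDA_BINVER "none"
#endif
#ifndef PADDLE_CUDNN_BINVER
#define PADDLE_CUDNN_BINVER "none"
#endif

// Hypothetical helper: report the versions this binary was compiled against.
std::string BuildVersionBanner() {
  return std::string("CUDA binver: ") + PADDLE_CUDA_BINVER +
         ", cuDNN binver: " + PADDLE_CUDNN_BINVER;
}

int main() {
  std::cout << BuildVersionBanner() << std::endl;
  return 0;
}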
...@@ -32,4 +32,4 @@ endif() ...@@ -32,4 +32,4 @@ endif()
add_dependencies(cub extern_cub) add_dependencies(cub extern_cub)
LIST(APPEND externl_project_dependencies cub) LIST(APPEND external_project_dependencies cub)
...@@ -28,4 +28,4 @@ endif() ...@@ -28,4 +28,4 @@ endif()
add_dependencies(dlpack extern_dlpack) add_dependencies(dlpack extern_dlpack)
LIST(APPEND externl_project_dependencies dlpack) LIST(APPEND external_project_dependencies dlpack)
...@@ -37,13 +37,12 @@ INCLUDE(GNUInstallDirs) ...@@ -37,13 +37,12 @@ INCLUDE(GNUInstallDirs)
INCLUDE(ExternalProject) INCLUDE(ExternalProject)
SET(NGRAPH_PROJECT "extern_ngraph") SET(NGRAPH_PROJECT "extern_ngraph")
SET(NGRAPH_VERSION "0.9") SET(NGRAPH_GIT_TAG "v0.10.1")
SET(NGRAPH_GIT_TAG "f9fd9d4cc318dc59dd4b68448e7fbb5f67a28bd0")
SET(NGRAPH_SOURCES_DIR ${THIRD_PARTY_PATH}/ngraph) SET(NGRAPH_SOURCES_DIR ${THIRD_PARTY_PATH}/ngraph)
SET(NGRAPH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/ngraph) SET(NGRAPH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/ngraph)
SET(NGRAPH_INC_DIR ${NGRAPH_INSTALL_DIR}/include) SET(NGRAPH_INC_DIR ${NGRAPH_INSTALL_DIR}/include)
SET(NGRAPH_LIB_DIR ${NGRAPH_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}) SET(NGRAPH_LIB_DIR ${NGRAPH_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR})
SET(NGRAPH_SHARED_LIB_NAME libngraph.so.${NGRAPH_VERSION}) SET(NGRAPH_SHARED_LIB_NAME libngraph.so)
SET(NGRAPH_CPU_LIB_NAME libcpu_backend.so) SET(NGRAPH_CPU_LIB_NAME libcpu_backend.so)
SET(NGRAPH_TBB_LIB_NAME libtbb.so.2) SET(NGRAPH_TBB_LIB_NAME libtbb.so.2)
SET(NGRAPH_GIT_REPO "https://github.com/NervanaSystems/ngraph.git") SET(NGRAPH_GIT_REPO "https://github.com/NervanaSystems/ngraph.git")
......
...@@ -110,7 +110,7 @@ function(op_library TARGET) ...@@ -110,7 +110,7 @@ function(op_library TARGET)
# Define operators that don't need pybind here. # Define operators that don't need pybind here.
foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op" foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op"
"tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op" "tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op"
"fusion_transpose_flatten_concat_op") "fusion_transpose_flatten_concat_op" "fusion_conv_inception_op")
if ("${TARGET}" STREQUAL "${manual_pybind_op}") if ("${TARGET}" STREQUAL "${manual_pybind_op}")
set(pybind_flag 1) set(pybind_flag 1)
endif() endif()
......
...@@ -27,9 +27,10 @@ add_subdirectory(details) ...@@ -27,9 +27,10 @@ add_subdirectory(details)
proto_library(framework_proto SRCS framework.proto) proto_library(framework_proto SRCS framework.proto)
proto_library(async_executor_proto SRCS data_feed.proto) proto_library(async_executor_proto SRCS data_feed.proto)
cc_library(ddim SRCS ddim.cc DEPS eigen3 boost) cc_library(ddim SRCS ddim.cc DEPS eigen3 boost enforce)
cc_test(ddim_test SRCS ddim_test.cc DEPS ddim) cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
nv_test(dim_test SRCS dim_test.cu DEPS ddim) nv_test(dim_test SRCS dim_test.cu DEPS ddim)
cc_test(unroll_array_ops_test SRCS unroll_array_ops_test.cc)
cc_library(data_type SRCS data_type.cc DEPS framework_proto ddim device_context) cc_library(data_type SRCS data_type.cc DEPS framework_proto ddim device_context)
cc_test(data_type_test SRCS data_type_test.cc DEPS data_type place tensor) cc_test(data_type_test SRCS data_type_test.cc DEPS data_type place tensor)
if(WITH_GPU) if(WITH_GPU)
...@@ -77,7 +78,7 @@ if (WITH_GPU) ...@@ -77,7 +78,7 @@ if (WITH_GPU)
endif() endif()
cc_test(var_type_traits_test SRCS var_type_traits_test.cc DEPS var_type_traits) cc_test(var_type_traits_test SRCS var_type_traits_test.cc DEPS var_type_traits)
cc_library(scope SRCS scope.cc DEPS glog threadpool var_type_traits) cc_library(scope SRCS scope.cc DEPS glog threadpool xxhash var_type_traits)
cc_library(scope_pool SRCS scope_pool.cc DEPS scope) cc_library(scope_pool SRCS scope_pool.cc DEPS scope)
cc_test(scope_test SRCS scope_test.cc DEPS scope) cc_test(scope_test SRCS scope_test.cc DEPS scope)
cc_test(variable_test SRCS variable_test.cc DEPS tensor var_type_traits) cc_test(variable_test SRCS variable_test.cc DEPS tensor var_type_traits)
...@@ -129,11 +130,9 @@ cc_test(version_test SRCS version_test.cc DEPS version) ...@@ -129,11 +130,9 @@ cc_test(version_test SRCS version_test.cc DEPS version)
cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version) cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version)
if(WITH_NGRAPH) if(WITH_NGRAPH)
if(NOT WIN32)
cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph) cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph)
cc_library(ngraph_operator SRCS ngraph_operator.cc DEPS ngraph_bridge operator op_info device_context tensor scope glog cc_library(ngraph_operator SRCS ngraph_operator.cc DEPS ngraph_bridge operator op_info device_context tensor scope glog
shape_inference data_transform lod_tensor profiler ngraph) shape_inference data_transform lod_tensor profiler)
endif(NOT WIN32)
endif(WITH_NGRAPH) endif(WITH_NGRAPH)
cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc) cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
...@@ -175,11 +174,7 @@ if(WITH_DISTRIBUTE) ...@@ -175,11 +174,7 @@ if(WITH_DISTRIBUTE)
else() else()
if(WITH_NGRAPH) if(WITH_NGRAPH)
if(NOT WIN32) cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph_operator variable_helper)
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph ngraph_operator variable_helper)
else(NOT WIN32)
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper)
endif(NOT WIN32)
else(WITH_NGRAPH) else(WITH_NGRAPH)
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper) cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper)
endif(WITH_NGRAPH) endif(WITH_NGRAPH)
...@@ -194,9 +189,9 @@ cc_library(parallel_executor SRCS parallel_executor.cc DEPS ...@@ -194,9 +189,9 @@ cc_library(parallel_executor SRCS parallel_executor.cc DEPS
fast_threaded_ssa_graph_executor variable_helper) fast_threaded_ssa_graph_executor variable_helper)
if(WITH_PSLIB) if(WITH_PSLIB)
cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper pslib_brpc pslib) cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper pslib_brpc pslib timer)
else() else()
cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper) cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper timer)
endif(WITH_PSLIB) endif(WITH_PSLIB)
......
...@@ -15,34 +15,123 @@ ...@@ -15,34 +15,123 @@
#pragma once #pragma once
#include <cstdint> #include <cstdint>
#include "paddle/fluid/platform/hostdevice.h" #include "paddle/fluid/framework/unroll_array_ops.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
template <typename T, size_t N> template <typename T, size_t N>
class Array { class Array {
static_assert(N > 0, "The size of array must be larger than 0");
public: public:
HOSTDEVICE Array() {} static constexpr size_t kSize = N;
HOSTDEVICE inline Array() {}
HOSTDEVICE explicit Array(const T &val) { template <typename... Args>
for (size_t i = 0; i < N; ++i) data_[i] = val; HOSTDEVICE inline explicit Array(const T &val, Args... args) {
static_assert(N == sizeof...(Args) + 1, "Invalid argument");
UnrollVarArgsAssign<T>::Run(data_, val, args...);
} }
HOSTDEVICE const T *Get() const { return data_; } HOSTDEVICE inline void Fill(const T &val) {
UnrollFillConstant<N>::Run(data_, val);
}
HOSTDEVICE T *GetMutable() { return data_; } HOSTDEVICE inline const T *Get() const { return data_; }
HOSTDEVICE T &operator[](size_t index) { return data_[index]; } HOSTDEVICE inline T *GetMutable() { return data_; }
HOSTDEVICE const T &operator[](size_t index) const { return data_[index]; } HOSTDEVICE inline T &operator[](size_t i) { return *advance(data_, i); }
  // Writing "return data_[i]" would trigger the compilation warning/error
  // "array subscript is above array bound" in the Python 3.5 CI job.
  // That appears to be a GCC false positive when the index is not bounds-checked.
  // For performance, operator[] does not check bounds, following the STL
  // convention. If bounds checking is needed, use at() instead.
HOSTDEVICE inline const T &operator[](size_t i) const {
return *advance(data_, i);
}
HOSTDEVICE inline T &at(size_t i) {
#ifndef __CUDA_ARCH__
PADDLE_ENFORCE_LT(i, N, "Array index out of bounds");
#endif
return (*this)[i];
}
HOSTDEVICE inline const T &at(size_t i) const {
#ifndef __CUDA_ARCH__
PADDLE_ENFORCE_LT(i, N, "Array index out of bounds");
#endif
return (*this)[i];
}
HOSTDEVICE constexpr size_t size() const { return N; } HOSTDEVICE constexpr size_t size() const { return N; }
HOSTDEVICE inline bool operator==(const Array<T, N> &other) const {
return UnrollCompare<N>::Run(data_, other.data_);
}
HOSTDEVICE inline bool operator!=(const Array<T, N> &other) const {
return !(*this == other);
}
private: private:
template <typename U>
HOSTDEVICE static inline U *advance(U *ptr, size_t i) {
return ptr + i;
}
T data_[N]; T data_[N];
}; };
template <typename T>
class Array<T, 0> {
public:
static constexpr size_t kSize = 0;
HOSTDEVICE inline Array() {}
HOSTDEVICE inline void Fill(const T &val) {}
HOSTDEVICE inline constexpr T *Get() const { return nullptr; }
  // Adding constexpr to GetMutable() causes a warning on macOS
HOSTDEVICE inline T *GetMutable() { return nullptr; }
HOSTDEVICE inline T &operator[](size_t) {
#ifdef __CUDA_ARCH__
static T obj();
return obj;
#else
PADDLE_THROW("Array<T, 0> has no element");
#endif
}
HOSTDEVICE inline const T &operator[](size_t) const {
#ifdef __CUDA_ARCH__
static const T obj();
return obj;
#else
PADDLE_THROW("Array<T, 0> has no element");
#endif
}
HOSTDEVICE inline T &at(size_t i) { return (*this)[i]; }
HOSTDEVICE inline const T &at(size_t i) const { return (*this)[i]; }
HOSTDEVICE constexpr size_t size() const { return 0; }
HOSTDEVICE constexpr bool operator==(const Array<T, 0> &other) const {
return true;
}
HOSTDEVICE constexpr bool operator!=(const Array<T, 0> &other) const {
return false;
}
};
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
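The rewritten Array<T, N> replaces the element-wise loops with unrolled helpers (UnrollVarArgsAssign, UnrollFillConstant, UnrollCompare) and adds Fill, bounds-checked at(), and comparison operators, plus a zero-length specialization. A small host-side usage sketch, assuming the header compiles as shown above:

#include "paddle/fluid/framework/array.h"

using paddle::framework::Array;

void ArrayDemo() {
  Array<int64_t, 3> a(1, 2, 3);  // variadic constructor, unrolled assignment
  Array<int64_t, 3> b;
  b.Fill(2);                     // unrolled fill: {2, 2, 2}

  int64_t x = a[1];              // unchecked access, as in the STL
  int64_t y = a.at(2);           // host-side bounds check via PADDLE_ENFORCE_LT
  bool same = (a == b);          // element-wise comparison via UnrollCompare
  (void)x; (void)y; (void)same;
}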
...@@ -304,9 +304,14 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program, ...@@ -304,9 +304,14 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program,
// start executing ops in multiple threads // start executing ops in multiple threads
for (int thidx = 0; thidx < actual_thread_num; ++thidx) { for (int thidx = 0; thidx < actual_thread_num; ++thidx) {
if (debug) {
threads.push_back(std::thread(&ExecutorThreadWorker::TrainFilesWithTimer,
workers[thidx].get()));
} else {
threads.push_back( threads.push_back(
std::thread(&ExecutorThreadWorker::TrainFiles, workers[thidx].get())); std::thread(&ExecutorThreadWorker::TrainFiles, workers[thidx].get()));
} }
}
for (auto& th : threads) { for (auto& th : threads) {
th.join(); th.join();
......
...@@ -18,312 +18,159 @@ limitations under the License. */ ...@@ -18,312 +18,159 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
/// @cond HIDDEN
template <int i>
Dim<i> make_dim(const int64_t* d) {
return Dim<i>(*d, make_dim<i - 1>(d + 1));
}
template <>
Dim<0> make_dim<0>(const int64_t* d) {
return Dim<0>(*d);
}
void make_ddim(DDim& ddim, const int64_t* dims, int n) {
switch (n) {
case 0:
ddim = make_dim<0>(dims);
break;
case 1:
ddim = make_dim<1>(dims);
break;
case 2:
ddim = make_dim<2>(dims);
break;
case 3:
ddim = make_dim<3>(dims);
break;
case 4:
ddim = make_dim<4>(dims);
break;
case 5:
ddim = make_dim<5>(dims);
break;
case 6:
ddim = make_dim<6>(dims);
break;
case 7:
ddim = make_dim<7>(dims);
break;
case 8:
ddim = make_dim<8>(dims);
break;
case 9:
ddim = make_dim<9>(dims);
break;
default:
PADDLE_THROW("Dynamic dimensions must have between [1, 9] dimensions.");
}
}
/// @endcond
DDim make_ddim(std::initializer_list<int64_t> dims) { DDim make_ddim(std::initializer_list<int64_t> dims) {
DDim result(make_dim(0)); return DDim(dims.begin(), dims.size());
make_ddim(result, dims.begin(), dims.size());
return result;
} }
DDim make_ddim(const std::vector<int64_t>& dims) { DDim make_ddim(const std::vector<int64_t>& dims) {
DDim result(make_dim(0)); return DDim(dims.data(), dims.size());
make_ddim(result, &dims[0], dims.size());
return result;
} }
DDim make_ddim(const std::vector<int>& dims) { DDim make_ddim(const std::vector<int>& dims) {
std::vector<int64_t> res(dims.size()); return DDim(dims.data(), dims.size());
std::transform(dims.begin(), dims.end(), res.begin(),
[](int d) { return static_cast<int64_t>(d); });
return make_ddim(res);
} }
/// @cond HIDDEN struct DDimEqualityVisitor {
// XXX For some reason, putting this in an anonymous namespace causes errors explicit DDimEqualityVisitor(const int64_t* d) : d_(d) {}
class DynamicMutableIndexer : public boost::static_visitor<int64_t&> {
public:
explicit DynamicMutableIndexer(int idx) : idx_(idx) {}
template <int D> template <int D>
int64_t& operator()(Dim<D>& dim) const { inline bool operator()(const Dim<D>& self) const {
return dim[idx_]; return UnrollCompare<D>::Run(self.Get(), d_);
} }
private: const int64_t* d_;
int idx_;
}; };
class DynamicConstIndexer : public boost::static_visitor<int64_t> { bool DDim::operator==(const DDim& d) const {
public: return size() == d.size() &&
explicit DynamicConstIndexer(int idx) : idx_(idx) {} this->apply_visitor(DDimEqualityVisitor(d.Get()));
template <int D>
int64_t operator()(const Dim<D>& dim) const {
return dim[idx_];
}
private:
int idx_;
};
/// @endcond
int64_t& DDim::operator[](int idx) {
return boost::apply_visitor(DynamicMutableIndexer(idx), var);
}
int64_t DDim::operator[](int idx) const {
return boost::apply_visitor(DynamicConstIndexer(idx), var);
} }
int DDim::size() const { return arity(*this); } bool DDim::operator!=(const DDim& d) const { return !(*this == d); }
bool DDim::operator==(DDim d) const { struct DDimPlusVisitor {
if (var.which() != d.getVar().which()) { explicit DDimPlusVisitor(const int64_t* d1, const int64_t* d2)
return false; : d1_(d1), d2_(d2) {}
} else {
std::vector<int64_t> v1 = vectorize(*this);
std::vector<int64_t> v2 = vectorize(d);
for (unsigned int i = 0; i < v1.size(); i++) { template <int D>
if (v1[i] != v2[i]) { inline void operator()(Dim<D>& self) const {
return false; UnrollAdd<D>::Run(d1_, d2_, self.GetMutable());
}
} }
return true; const int64_t* d1_;
} const int64_t* d2_;
} };
bool DDim::operator!=(DDim d) const { return !(*this == d); }
DDim DDim::operator+(DDim d) const {
std::vector<int64_t> v1 = vectorize(*this);
std::vector<int64_t> v2 = vectorize(d);
std::vector<int64_t> v3;
assert(v1.size() == v2.size());
for (unsigned int i = 0; i < v1.size(); i++) {
v3.push_back(v1[i] + v2[i]);
}
return make_ddim(v3); DDim DDim::operator+(const DDim& d) const {
PADDLE_ENFORCE(size() == d.size());
DDim ret;
ret.rank_ = rank_;
ret.apply_visitor(DDimPlusVisitor(Get(), d.Get()));
return ret;
} }
DDim DDim::operator*(DDim d) const { struct DDimMulVisitor {
std::vector<int64_t> v1 = vectorize(*this); explicit DDimMulVisitor(const int64_t* d1, const int64_t* d2)
std::vector<int64_t> v2 = vectorize(d); : d1_(d1), d2_(d2) {}
std::vector<int64_t> v3; template <int D>
inline void operator()(Dim<D>& self) const {
assert(v1.size() == v2.size()); UnrollMul<D>::Run(d1_, d2_, self.GetMutable());
for (unsigned int i = 0; i < v1.size(); i++) {
v3.push_back(v1[i] * v2[i]);
} }
return make_ddim(v3); const int64_t* d1_;
const int64_t* d2_;
};
DDim DDim::operator*(const DDim& d) const {
PADDLE_ENFORCE(size() == d.size());
DDim ret;
ret.rank_ = rank_;
ret.apply_visitor(DDimMulVisitor(Get(), d.Get()));
return ret;
} }
int64_t get(const DDim& ddim, int idx) { return ddim[idx]; } int64_t get(const DDim& ddim, int idx) { return ddim[idx]; }
void set(DDim& ddim, int idx, int value) { ddim[idx] = value; } void set(DDim& ddim, int idx, int value) { ddim[idx] = value; } // NOLINT
/// @cond HIDDEN
struct VectorizeVisitor : public boost::static_visitor<> {
std::vector<int64_t>& vector;
explicit VectorizeVisitor(std::vector<int64_t>& v) : vector(v) {}
template <typename T>
void operator()(const T& t) {
vector.push_back(t.head);
this->operator()(t.tail);
}
void operator()(const Dim<0>& t) {}
};
/// @endcond
std::vector<int64_t> vectorize(const DDim& ddim) { std::vector<int64_t> vectorize(const DDim& ddim) {
std::vector<int64_t> result; std::vector<int64_t> result(DDim::kMaxRank);
VectorizeVisitor visitor(result); dynamic_dim_assign(ddim.Get(), result.data(), ddim.size());
boost::apply_visitor(visitor, ddim); result.resize(ddim.size());
return result; return result;
} }
// NOTE: framework::vectorize converts to type int64_t // NOTE: framework::vectorize converts to type int64_t
// which does not fit cudnn inputs. // which does not fit cudnn inputs.
std::vector<int> vectorize2int(const DDim& ddim) { std::vector<int> vectorize2int(const DDim& ddim) {
std::vector<int64_t> temp = vectorize(ddim); std::vector<int> result(DDim::kMaxRank);
std::vector<int> result(temp.begin(), temp.end()); dynamic_dim_assign(ddim.Get(), result.data(), ddim.size());
result.resize(ddim.size());
return result; return result;
} }
struct ProductVisitor : public boost::static_visitor<int64_t> { struct ProductVisitor {
template <int D> template <int D>
int64_t operator()(const Dim<D>& dim) { inline int64_t operator()(const Dim<D>& dim) {
return product(dim); return product(dim);
} }
}; };
int64_t product(const DDim& ddim) { int64_t product(const DDim& ddim) {
ProductVisitor visitor; return ddim.apply_visitor(ProductVisitor());
return boost::apply_visitor(visitor, ddim);
} }
struct SliceVectorizeVisitor : public boost::static_visitor<> {
std::vector<int64_t>& vector;
int begin;
int end;
SliceVectorizeVisitor(std::vector<int64_t>& v, int b, int e)
: vector(v), begin(b), end(e) {
PADDLE_ENFORCE(begin < end,
"Begin index must be less than end index in ddim slice.");
PADDLE_ENFORCE(begin >= 0,
"Begin index can't be less than zero in ddim slice.");
}
template <int S>
void operator()(const Dim<S>& dim) {
if (begin == 0) {
vector.push_back(dim.head);
} else {
--begin;
}
--end;
if (end > 0) {
this->operator()(dim.tail);
}
}
void operator()(const Dim<0>& dim) {
PADDLE_ENFORCE(end == 0, "End index in ddim slice is out of bound.");
}
};
DDim slice_ddim(const DDim& dim, int begin, int end) { DDim slice_ddim(const DDim& dim, int begin, int end) {
std::vector<int64_t> vec; PADDLE_ENFORCE(begin >= 0 && end <= dim.size(),
vec.reserve(end - begin); "[begin(%d), end(%d)) must be inside [0, %d) in ddim slice.",
SliceVectorizeVisitor visitor(vec, begin, end); begin, end, dim.size());
boost::apply_visitor(visitor, dim); // Constructor of DDim would check whether end - begin is valid
return make_ddim(vec); return DDim(dim.Get() + begin, end - begin);
} }
/// \cond HIDDEN int arity(const DDim& d) { return d.size(); }
struct ArityVisitor : boost::static_visitor<int> {
template <int D>
int operator()(Dim<D>) const {
return D;
}
};
/// \endcond struct DDimPrinter {
int arity(const DDim& d) { return boost::apply_visitor(ArityVisitor(), d); }
/// \cond HIDDEN
struct DDimPrinter : boost::static_visitor<void> {
std::ostream& os; std::ostream& os;
explicit DDimPrinter(std::ostream& os_) : os(os_) {} explicit DDimPrinter(std::ostream& os_) : os(os_) {}
template <typename T> template <int D>
void operator()(const T& t) { void operator()(const Dim<D>& t) {
os << t; os << t;
} }
}; };
/// \endcond
std::ostream& operator<<(std::ostream& os, const DDim& ddim) { std::ostream& operator<<(std::ostream& os, const DDim& ddim) {
DDimPrinter printer(os); ddim.apply_visitor(DDimPrinter(os));
boost::apply_visitor(printer, ddim);
return os; return os;
} }
DDim::DDim(std::initializer_list<int64_t> init_list) {
*this = make_ddim(init_list);
}
DDim flatten_to_2d(const DDim& src, int num_col_dims) { DDim flatten_to_2d(const DDim& src, int num_col_dims) {
int rank = src.size(); return DDim({product(slice_ddim(src, 0, num_col_dims)),
return make_ddim({product(slice_ddim(src, 0, num_col_dims)), product(slice_ddim(src, num_col_dims, src.size()))});
product(slice_ddim(src, num_col_dims, rank))});
} }
DDim flatten_to_1d(const DDim& src) { return make_ddim({product(src)}); } DDim flatten_to_1d(const DDim& src) { return DDim({product(src)}); }
DDim stride(const DDim& ddim) { DDim stride(const DDim& ddim) {
std::vector<int64_t> strides(ddim.size()); DDim strides;
strides.rank_ = ddim.size();
strides[ddim.size() - 1] = 1; strides[ddim.size() - 1] = 1;
for (int i = ddim.size() - 2; i >= 0; --i) { for (int i = ddim.size() - 2; i >= 0; --i) {
strides[i] = strides[i + 1] * ddim[i + 1]; strides[i] = strides[i + 1] * ddim[i + 1];
} }
return framework::make_ddim(strides); return strides;
} }
DDim stride_numel(const framework::DDim& ddim) { DDim stride_numel(const DDim& ddim) {
std::vector<int64_t> strides(ddim.size()); DDim strides;
strides.rank_ = ddim.size();
strides[ddim.size() - 1] = ddim[ddim.size() - 1]; strides[ddim.size() - 1] = ddim[ddim.size() - 1];
for (int i = ddim.size() - 2; i >= 0; --i) { for (int i = ddim.size() - 2; i >= 0; --i) {
strides[i] = strides[i + 1] * ddim[i]; strides[i] = strides[i + 1] * ddim[i];
} }
return framework::make_ddim(strides); return strides;
} }
} // namespace framework } // namespace framework
......
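With boost::variant removed, rank dispatch in ddim.cc now goes through DDim::apply_visitor, which switches on the runtime rank and calls a templated functor with the statically sized Dim<D> (see the PADDLE_VISIT_DDIM macro in the header diff below). A hedged sketch of a custom visitor in the same style; DDimSumVisitor and SumDims are made-up names for illustration:

// Hypothetical visitor that sums all dimension values of a DDim.
struct DDimSumVisitor {
  template <int D>
  inline int64_t operator()(const paddle::framework::Dim<D>& dim) const {
    int64_t sum = 0;
    for (int i = 0; i < D; ++i) sum += dim[i];
    return sum;
  }
};

int64_t SumDims(const paddle::framework::DDim& ddim) {
  // apply_visitor selects the Dim<D> instantiation matching ddim's rank.
  return ddim.apply_visitor(DDimSumVisitor());
}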
...@@ -18,62 +18,145 @@ limitations under the License. */ ...@@ -18,62 +18,145 @@ limitations under the License. */
#include <stdexcept> #include <stdexcept>
#include <vector> #include <vector>
#include "paddle/fluid/framework/dim.h" #include "paddle/fluid/framework/dim.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/variant.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
#define PADDLE_VISIT_DDIM_BASE(rank, callback) \
case (rank): { \
constexpr auto kRank = (rank); \
return (callback); \
}
#define PADDLE_VISIT_DDIM(rank, callback) \
switch (rank) { \
PADDLE_VISIT_DDIM_BASE(0, callback); \
PADDLE_VISIT_DDIM_BASE(1, callback); \
PADDLE_VISIT_DDIM_BASE(2, callback); \
PADDLE_VISIT_DDIM_BASE(3, callback); \
PADDLE_VISIT_DDIM_BASE(4, callback); \
PADDLE_VISIT_DDIM_BASE(5, callback); \
PADDLE_VISIT_DDIM_BASE(6, callback); \
PADDLE_VISIT_DDIM_BASE(7, callback); \
PADDLE_VISIT_DDIM_BASE(8, callback); \
PADDLE_VISIT_DDIM_BASE(9, callback); \
default: \
PADDLE_THROW("Invalid rank %d", rank); \
}
template <typename T1, typename T2>
inline void dynamic_dim_assign(const T1* in, T2* out, int n) {
PADDLE_VISIT_DDIM(n, (static_dim_assign<kRank, T1, T2>(in, out)));
}
/** /**
* \brief A dynamically sized dimension. * \brief A dynamically sized dimension.
* *
* The number of dimensions must be between [1, 9]. * The number of dimensions must be between [1, 9].
*/ */
struct DDim { class DDim {
typedef boost::variant<Dim<0>, Dim<1>, Dim<2>, Dim<3>, Dim<4>, Dim<5>, Dim<6>, public:
Dim<7>, Dim<8>, Dim<9>> constexpr static int kMaxRank = 9;
DDimVar;
DDimVar var; DDim() : rank_(1) { dim_[0] = 0; }
DDim() : var(Dim<1>()) {} DDim(const DDim& ddim) : dim_() { CopyFrom(ddim); }
DDim(const int* d, int n) : rank_(n) {
dynamic_dim_assign(d, dim_.GetMutable(), n);
}
DDim(const int64_t* d, int n) : rank_(n) {
dynamic_dim_assign(d, dim_.GetMutable(), n);
}
template <int D> template <int D>
explicit DDim(const Dim<D>& in) : var(in) {} /*implicit*/ DDim(const Dim<D>& in) : rank_(D) { // NOLINT
UnsafeCast<D>() = in;
}
/*implicit*/ DDim(std::initializer_list<int64_t> init_list)
: DDim(init_list.begin(), init_list.size()) {}
/*implicit*/ DDim(std::initializer_list<int64_t> init_list); inline DDim& operator=(const DDim& ddim) { return CopyFrom(ddim); }
template <int D> template <int D>
DDim& operator=(const Dim<D>& in) { inline DDim& operator=(const Dim<D>& dim) {
var = in; rank_ = D;
UnsafeCast<D>() = dim;
return *this; return *this;
} }
int64_t& operator[](int idx); inline int64_t& operator[](int idx) { return dim_[idx]; }
int64_t operator[](int idx) const;
inline int64_t operator[](int idx) const { return dim_[idx]; }
inline int64_t& at(int idx) {
PADDLE_ENFORCE(idx >= 0 && idx < rank_, "Invalid idx %d", idx);
return dim_[idx];
}
inline int64_t at(int idx) const {
PADDLE_ENFORCE(idx >= 0 && idx < rank_, "Invalid idx %d", idx);
return dim_[idx];
}
template <typename Visitor> template <typename Visitor>
typename Visitor::result_type apply_visitor(Visitor& visitor) { typename std::result_of<Visitor(Dim<0>&)>::type apply_visitor(
return var.apply_visitor(visitor); Visitor&& visitor) {
PADDLE_VISIT_DDIM(rank_, visitor(UnsafeCast<kRank>()));
} }
template <typename Visitor> template <typename Visitor>
typename Visitor::result_type apply_visitor(Visitor& visitor) const { typename std::result_of<Visitor(const Dim<0>&)>::type apply_visitor(
return var.apply_visitor(visitor); Visitor&& visitor) const {
PADDLE_VISIT_DDIM(rank_, visitor(UnsafeCast<kRank>()));
} }
DDimVar getVar() { return var; } bool operator==(const DDim& d) const;
bool operator!=(const DDim& d) const;
DDim operator+(const DDim& d) const;
bool operator==(DDim d) const; DDim operator*(const DDim& d) const;
bool operator!=(DDim d) const; inline const int64_t* Get() const { return dim_.Get(); }
DDim operator+(DDim d) const; inline int64_t* GetMutable() { return dim_.GetMutable(); }
DDim operator*(DDim d) const; inline int size() const { return rank_; }
private:
template <int D>
inline Dim<D>& UnsafeCast() {
static_assert(D >= 0 && D <= kMaxRank, "Invalid rank");
auto* p = static_cast<void*>(&dim_);
return *reinterpret_cast<Dim<D>*>(p);
}
template <int D>
inline const Dim<D>& UnsafeCast() const {
static_assert(D >= 0 && D <= kMaxRank, "Invalid rank");
auto* p = static_cast<const void*>(&dim_);
return *reinterpret_cast<const Dim<D>*>(p);
}
int size() const; inline DDim& CopyFrom(const DDim& ddim) {
PADDLE_VISIT_DDIM(ddim.rank_, (*this = ddim.UnsafeCast<kRank>()));
}
friend DDim stride(const DDim& ddim);
friend DDim stride_numel(const DDim& ddim);
private:
Dim<kMaxRank> dim_;
int rank_;
}; };
#undef PADDLE_VISIT_DDIM_BASE
#undef PADDLE_VISIT_DDIM
/** /**
* \brief Make a DDim from std::vector<int64_t> * \brief Make a DDim from std::vector<int64_t>
* *
...@@ -92,7 +175,7 @@ DDim make_ddim(const std::vector<int>& dims); ...@@ -92,7 +175,7 @@ DDim make_ddim(const std::vector<int>& dims);
DDim make_ddim(std::initializer_list<int64_t> dims); DDim make_ddim(std::initializer_list<int64_t> dims);
int64_t get(const DDim& dim, int idx); int64_t get(const DDim& dim, int idx);
void set(DDim& dim, int idx, int val); void set(DDim& dim, int idx, int val); // NOLINT
std::vector<int64_t> vectorize(const DDim& ddim); std::vector<int64_t> vectorize(const DDim& ddim);
std::vector<int> vectorize2int(const DDim& ddim); std::vector<int> vectorize2int(const DDim& ddim);
...@@ -129,12 +212,3 @@ DDim stride(const DDim& ddim); ...@@ -129,12 +212,3 @@ DDim stride(const DDim& ddim);
DDim stride_numel(const DDim& ddim); DDim stride_numel(const DDim& ddim);
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
namespace boost {
template <typename T>
T get(const paddle::framework::DDim& in) {
return boost::get<T>(in.var);
}
} // namespace boost
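Taken together, the new DDim stores a fixed Dim<kMaxRank> buffer plus a runtime rank_ while keeping the existing free-function API (make_ddim, product, slice_ddim, vectorize, stride). A brief host-only usage sketch, assuming the header above:

#include "paddle/fluid/framework/ddim.h"

using namespace paddle::framework;

void DDimDemo() {
  DDim d = make_ddim({2, 3, 4});           // rank 3 held in a Dim<9> buffer
  int64_t numel = product(d);              // 24
  DDim tail = slice_ddim(d, 1, 3);         // {3, 4}
  std::vector<int64_t> v = vectorize(d);   // {2, 3, 4}
  DDim strides = stride(d);                // {12, 4, 1}, row-major strides
  (void)numel; (void)tail; (void)v; (void)strides;
}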
...@@ -55,9 +55,6 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, ...@@ -55,9 +55,6 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
void AllReduceOpHandle::RunImpl() { void AllReduceOpHandle::RunImpl() {
platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second); platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second);
// FIXME(typhoonzero): If scope0(global scope) have NCCL_ID_VAR,
// this is a distributed or inter-process call, find a better way.
// Wait input done
WaitInputVarGenerated(); WaitInputVarGenerated();
auto in_var_handles = DynamicCast<VarHandle>(this->Inputs()); auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
auto out_var_handles = DynamicCast<VarHandle>(this->Outputs()); auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
......
...@@ -25,7 +25,7 @@ struct ExecutionStrategy { ...@@ -25,7 +25,7 @@ struct ExecutionStrategy {
size_t num_threads_{0}; size_t num_threads_{0};
bool use_cuda_{true}; bool use_cuda_{true};
bool allow_op_delay_{false}; bool allow_op_delay_{false};
size_t num_iteration_per_drop_scope_{100}; size_t num_iteration_per_drop_scope_{1};
ExecutorType type_{kDefault}; ExecutorType type_{kDefault};
bool dry_run_{false}; bool dry_run_{false};
}; };
......
...@@ -64,20 +64,26 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run( ...@@ -64,20 +64,26 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
} }
platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun", nullptr); platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun", nullptr);
drop_scope_counter_ += 1; ++drop_scope_counter_;
if (!fetch_tensors.empty() || bool stream_end = false;
drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) { if (!fetch_tensors.empty()) {
drop_scope_counter_ = 0; WaitComputationalStreams();
// Wait All computational streams stream_end = true;
for (auto p : places_) { }
platform::DeviceContextPool::Instance().Get(p)->Wait();
if (drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) {
if (!stream_end) {
WaitComputationalStreams();
} }
for (auto &scope : local_scopes_) { for (auto &scope : local_scopes_) {
auto &local_scope = auto &local_scope =
*scope->Var(details::kLocalExecScopeName)->GetMutable<Scope *>(); *scope->Var(details::kLocalExecScopeName)->GetMutable<Scope *>();
scope->DeleteScope(local_scope); scope->DeleteScope(local_scope);
} }
drop_scope_counter_ = 0;
} }
if (eptr) { if (eptr) {
std::rethrow_exception(eptr); std::rethrow_exception(eptr);
......
...@@ -47,6 +47,14 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor { ...@@ -47,6 +47,14 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor {
FeedFetchList Run(const std::vector<std::string>& fetch_tensors) override; FeedFetchList Run(const std::vector<std::string>& fetch_tensors) override;
private:
inline void WaitComputationalStreams() {
// Wait All computational streams
for (auto p : places_) {
platform::DeviceContextPool::Instance().Get(p)->Wait();
}
}
private: private:
size_t drop_scope_counter_{0}; size_t drop_scope_counter_{0};
......
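The executor change above separates stream synchronization from scope cleanup: streams are waited on whenever tensors are fetched, while local scopes are dropped only every num_iteration_per_drop_scope iterations (whose default drops from 100 to 1 in this commit). A simplified model of that counter logic; the struct and helper names are placeholders, not the real class:

// Simplified model of the counter logic in ScopeBufferedSSAGraphExecutor::Run.
struct DropScopeModel {
  size_t drop_scope_counter_{0};
  size_t num_iteration_per_drop_scope_{1};

  void Step(bool has_fetch) {
    ++drop_scope_counter_;
    bool stream_end = false;
    if (has_fetch) {
      WaitComputationalStreams();    // always sync when tensors are fetched
      stream_end = true;
    }
    if (drop_scope_counter_ == num_iteration_per_drop_scope_) {
      if (!stream_end) {
        WaitComputationalStreams();  // sync once before deleting local scopes
      }
      DeleteLocalScopes();           // stands in for scope->DeleteScope(local_scope)
      drop_scope_counter_ = 0;
    }
  }

  void WaitComputationalStreams() {}  // placeholder for device-context Wait()
  void DeleteLocalScopes() {}         // placeholder for the local-scope cleanup
};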
...@@ -16,332 +16,184 @@ ...@@ -16,332 +16,184 @@
#include <iostream> #include <iostream>
#include <sstream> #include <sstream>
#include <stdexcept> #include <stdexcept>
#include <string>
#include <type_traits> #include <type_traits>
#include "paddle/fluid/framework/array.h"
#include "paddle/fluid/platform/assert.h" #include "paddle/fluid/platform/assert.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/hostdevice.h" #include "paddle/fluid/platform/hostdevice.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
// Statically sized, statically indexed dimension // Statically sized, statically indexed dimension
template <int i> template <int D>
struct Dim { class Dim : public Array<int64_t, D> {
static constexpr int dimensions = i; public:
static_assert(D >= 0, "D must be not less than 0");
template <typename... Args> static constexpr int kRank = D;
HOSTDEVICE Dim(int64_t _head, Args... _tail) : head(_head), tail(_tail...) { using BaseClass = Array<int64_t, D>;
static_assert(sizeof...(_tail) == i - 1,
"Dim initialized with the wrong number of parameters");
}
HOSTDEVICE inline Dim(int64_t head, const Dim<D - 1>& tail) {
Dim(int64_t _head, const Dim<i - 1>& _tail) : head(_head), tail(_tail) {} (*this)[0] = head;
new (this->GetMutable() + 1) Dim<D - 1>(tail);
}
HOSTDEVICE template <typename... Args>
Dim() : head(0), tail() {} HOSTDEVICE explicit Dim(int64_t head, Args... args)
: BaseClass(head, args...) {}
/** Construct a Dim from a linear index and size. Uses Fortran order /** Construct a Dim from a linear index and size. Uses Fortran order
* indexing. */ * indexing. */
HOSTDEVICE HOSTDEVICE Dim(int64_t idx, const Dim<D>& size);
Dim(int64_t idx, const Dim<i>& size)
: head(idx % size.head), tail(idx / size.head, size.tail) {}
/** Construct a Dim with each dimension set to the given index */ /** Construct a Dim with each dimension set to the given index */
HOSTDEVICE HOSTDEVICE explicit Dim(int64_t idx) { this->Fill(idx); }
Dim(int64_t idx) : head(idx), tail(idx) {}
HOSTDEVICE HOSTDEVICE Dim() = default;
bool operator==(const Dim<i>& o) const {
return (head == o.head) && (tail == o.tail);
}
HOSTDEVICE
bool operator!=(const Dim<i>& o) const { return !(*this == o); }
HOSTDEVICE
int64_t& operator[](int idx);
HOSTDEVICE
int64_t operator[](int idx) const;
HOST std::string to_string() const; HOST std::string to_string() const;
int64_t head;
Dim<i - 1> tail;
};
// Base case specialization
template <>
struct Dim<0> {
static constexpr int dimensions = 0;
HOSTDEVICE
Dim(int64_t _head) {}
HOSTDEVICE
Dim() {}
HOSTDEVICE
Dim(int idx, const Dim<0>& size) {
#ifndef __CUDA_ARCH__
if (idx > 0) {
throw std::invalid_argument("Index out of range.");
}
#else
PADDLE_ASSERT(idx == 0);
#endif
}
HOSTDEVICE
bool operator==(const Dim<0>& o) const { return true; }
HOSTDEVICE
bool operator!=(const Dim<0>& o) const { return false; }
HOSTDEVICE
int64_t& operator[](int idx);
HOSTDEVICE
int64_t operator[](int idx) const;
}; };
namespace { namespace detail {
template <int kStart, int kEnd, bool kStop>
// Helper for accessing Dim classes struct FortranOrderIndexingConstructorFunctor {
template <int i> HOSTDEVICE inline static void Run(const int64_t* in, int64_t* idx,
struct DimGetter { int64_t* out) {
// Return a copy if Dim is const out[kStart] = (*idx) % in[kStart];
template <typename D> (*idx) /= in[kStart];
HOSTDEVICE static int64_t impl(const D& d) { FortranOrderIndexingConstructorFunctor<kStart + 1, kEnd,
return DimGetter<i - 1>::impl(d.tail); kStart + 1 == kEnd>::Run(in, idx,
} out);
// Return a reference if Dim is mutable
template <typename D>
HOSTDEVICE static int64_t& impl(D& d) {
return DimGetter<i - 1>::impl(d.tail);
} }
}; };
// Eureka! We found the element! template <int kStart, int kEnd>
template <> struct FortranOrderIndexingConstructorFunctor<kStart, kEnd, true> {
struct DimGetter<0> { HOSTDEVICE inline static void Run(const int64_t* in, int64_t* idx,
// Return a copy if Dim is const int64_t* out) {}
template <typename D>
HOSTDEVICE static int64_t impl(const D& d) {
return d.head;
}
// Return a reference if Dim is mutable
template <typename D>
HOSTDEVICE static int64_t& impl(D& d) {
return d.head;
}
}; };
} // namespace detail
template <int D> template <int D>
HOSTDEVICE int64_t& indexer(Dim<D>& dim, int idx) { HOSTDEVICE Dim<D>::Dim(int64_t idx, const Dim<D>& size) {
#ifndef __CUDA_ARCH__ detail::FortranOrderIndexingConstructorFunctor<0, D, D == 0>::Run(
if (idx < 0) { size.Get(), &idx, this->GetMutable());
throw std::invalid_argument("Tried to access a negative dimension");
}
#else
PADDLE_ASSERT(idx >= 0);
#endif
if (idx == 0) {
return dim.head;
}
return indexer(dim.tail, idx - 1);
}
template <>
HOSTDEVICE int64_t& indexer<0>(Dim<0>& dim, int idx) {
#ifndef __CUDA_ARCH__
throw std::invalid_argument("Invalid index");
#else
PADDLE_ASSERT(false);
#if CUDA_VERSION < 8000
// On CUDA versions previous to 8.0, only __shared__ variables
// could be declared as static in the device code.
int64_t head = 0;
#else
static int64_t head = 0;
#endif
return head;
#endif
}
template <int D>
HOSTDEVICE int64_t indexer(const Dim<D>& dim, int idx) {
#ifndef __CUDA_ARCH__
if (idx < 0) {
throw std::invalid_argument("Tried to access a negative dimension");
}
#else
PADDLE_ASSERT(idx >= 0);
#endif
if (idx == 0) {
return dim.head;
}
return indexer(dim.tail, idx - 1);
}
template <>
HOSTDEVICE int64_t indexer<0>(const Dim<0>& dim, int idx) {
#ifndef __CUDA_ARCH__
throw std::invalid_argument("Invalid index");
#else
PADDLE_ASSERT(false);
#if CUDA_VERSION < 8000
// On CUDA versions previous to 8.0, only __shared__ variables
// could be declared as static in the device code.
int64_t head = 0;
#else
static int64_t head = 0;
#endif
return head;
#endif
}
} // namespace
// Static access to constant Dim
template <int i, int l>
HOSTDEVICE int64_t get(const Dim<l>& d) {
return DimGetter<i>::impl(d);
} }
// Static access to mutable Dim template <int idx, int D>
template <int i, int l> HOSTDEVICE inline int64_t get(const Dim<D>& dim) {
HOSTDEVICE int64_t& get(Dim<l>& d) { return dim[idx];
return DimGetter<i>::impl(d);
} }
// Dynamic access to constant Dim template <int idx, int D>
template <int l> HOSTDEVICE inline int64_t& get(Dim<D>& dim) { // NOLINT
HOSTDEVICE int64_t Dim<l>::operator[](int i) const { return dim[idx];
return indexer(*this, i);
} }
// Dynamic access to mutable Dim template <int D>
template <int l> HOSTDEVICE inline int64_t get(const Dim<D>& dim, int idx) {
HOSTDEVICE int64_t& Dim<l>::operator[](int i) { return dim[idx];
return indexer(*this, i);
}
// Dynamic access to constant Dim
inline HOSTDEVICE int64_t Dim<0>::operator[](int i) const {
return indexer(*this, i);
}
// Dynamic access to mutable Dim
inline HOSTDEVICE int64_t& Dim<0>::operator[](int i) {
return indexer(*this, i);
}
// Dynamic access to constant Dim
// without std::enable_if will try to instantiate this on get<0>(d)
template <int l>
HOSTDEVICE typename std::enable_if<(l > 0), int64_t>::type get(const Dim<l>& d,
int i) {
return d[i];
} }
// Dynamic access to mutable Dim template <int D>
template <int l> HOSTDEVICE inline int64_t& get(Dim<D>& dim, int idx) { // NOLINT
HOSTDEVICE typename std::enable_if<(l > 0), int64_t&>::type get(Dim<l>& d, return dim[idx];
int i) {
return d[i];
} }
// Dot product of two dims // Dot product of two dims
template <int i> template <int D>
HOSTDEVICE int64_t linearize(const Dim<i>& a, const Dim<i>& b) { HOSTDEVICE inline int64_t linearize(const Dim<D>& a, const Dim<D>& b) {
return a.head * b.head + linearize(a.tail, b.tail); return UnrollProduct<D>::Run(a.Get(), b.Get());
}
// Base case dot product of two Dims
// Notice it is inline because it is no longer a template
template <>
HOSTDEVICE inline int64_t linearize(const Dim<0>& a, const Dim<0>& b) {
return 0;
} }
// Product of a Dim // Product of a Dim
template <int i> template <int D>
HOSTDEVICE int64_t product(const Dim<i>& a, int prod = 1) { HOSTDEVICE inline int64_t product(const Dim<D>& a) {
return prod * a.head * product(a.tail); return UnrollProduct<D>::Run(a.Get());
}
// Base case product of a Dim
// Notice it is inline because it is no longer a template
template <>
HOSTDEVICE inline int64_t product(const Dim<0>& a, int prod) {
return prod;
} }
// Is 0 <= idx_i < size_i for all i? // Is 0 <= idx_i < size_i for all i?
template <int i> namespace detail {
HOSTDEVICE bool contained(const Dim<i>& idx, const Dim<i>& size) { template <int kStart, int kEnd, bool kStop>
return ((0 <= idx.head) && (idx.head < size.head) && struct ContainedFunctor {
contained(idx.tail, size.tail)); HOSTDEVICE static inline bool Run(const int64_t* idx, const int64_t* size) {
} return (idx[kStart] >= 0 && idx[kStart] < size[kStart]) &&
ContainedFunctor<kStart + 1, kEnd, kStart + 1 == kEnd>::Run(idx,
size);
}
};
// Base case of is 0 <= idx_i < size_i ? template <int kStart, int kEnd>
// Notice it is inline because it is no longer a template struct ContainedFunctor<kStart, kEnd, true> {
template <> HOSTDEVICE static constexpr inline bool Run(const int64_t* idx,
HOSTDEVICE inline bool contained(const Dim<0>& idx, const Dim<0>& size) { const int64_t* size) {
return true; return true;
}
};
} // namespace detail
template <int D>
HOSTDEVICE inline bool contained(const Dim<D>& idx, const Dim<D>& size) {
return detail::ContainedFunctor<0, D, D == 0>::Run(idx.Get(), size.Get());
} }
/** /**
* \brief Compute exclusive prefix-multiply of a Dim. * \brief Compute exclusive prefix-multiply of a Dim.
*/ */
template <int i> namespace detail {
HOSTDEVICE Dim<i> ex_prefix_mul(const Dim<i>& src, int mul = 1) { template <int kStart, int kEnd, bool kStop>
return Dim<i>(mul, ex_prefix_mul(src.tail, mul * src.head)); struct ExPrefixMulFunctor {
} HOSTDEVICE static inline void Run(const int64_t* in, int64_t* out) {
kStart == 0 ? out[kStart] = 1 : out[kStart] =
out[kStart - 1] * in[kStart - 1];
detail::ExPrefixMulFunctor<kStart + 1, kEnd, kStart + 1 == kEnd>::Run(in,
out);
}
};
///\cond HIDDEN template <int kStart, int kEnd>
// Base case of ex_prefix_mul struct ExPrefixMulFunctor<kStart, kEnd, true> {
// Notice it is inline because it is no longer a template HOSTDEVICE static inline void Run(const int64_t* in, int64_t* out) {}
template <> };
HOSTDEVICE inline Dim<0> ex_prefix_mul(const Dim<0>& src, int mul) { } // namespace detail
return Dim<0>();
template <int D>
HOSTDEVICE inline Dim<D> ex_prefix_mul(const Dim<D>& src) {
Dim<D> ret;
detail::ExPrefixMulFunctor<0, D, D == 0>::Run(src.Get(), ret.GetMutable());
return ret;
} }
///\endcond
/** /**
* Add two dimensions together * Add two dimensions together
*/ */
template <int i> template <int D>
HOSTDEVICE Dim<i> dim_plus(const Dim<i>& a, const Dim<i>& b) { HOSTDEVICE inline Dim<D> dim_plus(const Dim<D>& a, const Dim<D>& b) {
return Dim<i>(a.head + b.head, dim_plus(a.tail, b.tail)); Dim<D> ret;
} UnrollAdd<D>::Run(a.Get(), b.Get(), ret.GetMutable());
return ret;
// Base case
template <>
HOSTDEVICE inline Dim<0> dim_plus(const Dim<0>& a, const Dim<0>& b) {
return Dim<0>();
} }
template <int i> template <int D>
HOSTDEVICE Dim<i> operator+(const Dim<i>& lhs, const Dim<i>& rhs) { HOSTDEVICE inline Dim<D> operator+(const Dim<D>& lhs, const Dim<D>& rhs) {
return dim_plus(lhs, rhs); return dim_plus(lhs, rhs);
} }
/** /**
* Multiply two dimensions together * Multiply two dimensions together
*/ */
template <int i> template <int D>
HOSTDEVICE Dim<i> dim_mult(const Dim<i>& a, const Dim<i>& b) { HOSTDEVICE inline Dim<D> dim_mult(const Dim<D>& a, const Dim<D>& b) {
return Dim<i>(a.head * b.head, dim_mult(a.tail, b.tail)); Dim<D> ret;
} UnrollMul<D>::Run(a.Get(), b.Get(), ret.GetMutable());
return ret;
// Base case
template <>
HOSTDEVICE inline Dim<0> dim_mult(const Dim<0>& a, const Dim<0>& b) {
return Dim<0>();
} }
template <int i> template <int D>
HOSTDEVICE Dim<i> operator*(const Dim<i>& lhs, const Dim<i>& rhs) { HOSTDEVICE Dim<D> operator*(const Dim<D>& lhs, const Dim<D>& rhs) {
return dim_mult(lhs, rhs); return dim_mult(lhs, rhs);
} }
...@@ -354,23 +206,32 @@ HOSTDEVICE Dim<i> operator*(const Dim<i>& lhs, const Dim<i>& rhs) { ...@@ -354,23 +206,32 @@ HOSTDEVICE Dim<i> operator*(const Dim<i>& lhs, const Dim<i>& rhs) {
* \return Dim object the same size as \p size with normalized strides * \return Dim object the same size as \p size with normalized strides
* *
*/ */
namespace detail {
template <int kStart, int kEnd, bool kStop>
struct NormalizeStridesFunctor {
HOSTDEVICE static void Run(const int64_t* size, const int64_t* stride,
int64_t* ret) {
ret[kStart] = (size[kStart] == 1 ? 0 : stride[kStart]);
NormalizeStridesFunctor<kStart + 1, kEnd, kStart + 1 == kEnd>::Run(
size, stride, ret);
}
};
template <int i> template <int kStart, int kEnd>
HOSTDEVICE Dim<i> normalize_strides(const Dim<i>& size, const Dim<i>& stride) { struct NormalizeStridesFunctor<kStart, kEnd, true> {
int norm_stride = size.head == 1 ? 0 : stride.head; HOSTDEVICE static void Run(const int64_t* size, const int64_t* stride,
return Dim<i>(norm_stride, normalize_strides(size.tail, stride.tail)); int64_t* ret) {}
} };
} // namespace detail
///\cond HIDDEN
template <> template <int D>
HOSTDEVICE inline Dim<0> normalize_strides(const Dim<0>& size, HOSTDEVICE Dim<D> normalize_strides(const Dim<D>& size, const Dim<D>& stride) {
const Dim<0>& stride) { Dim<D> ret;
return Dim<0>(); detail::NormalizeStridesFunctor<0, D, D == 0>::Run(size.Get(), stride.Get(),
ret.GetMutable());
return ret;
} }
///\endcond
/** /**
* Helper function to create a Dim * Helper function to create a Dim
* *
...@@ -379,25 +240,17 @@ HOSTDEVICE inline Dim<0> normalize_strides(const Dim<0>& size, ...@@ -379,25 +240,17 @@ HOSTDEVICE inline Dim<0> normalize_strides(const Dim<0>& size,
*/ */
template <typename... Args> template <typename... Args>
HOSTDEVICE Dim<sizeof...(Args)> make_dim(Args... idxes) { HOSTDEVICE inline Dim<sizeof...(Args)> make_dim(Args... idxes) {
return Dim<sizeof...(Args)>(idxes...); return Dim<sizeof...(Args)>(idxes...);
} }
// Allows us to output a Dim // Allows us to output a Dim
// XXX For some reason, overloading fails to resolve this correctly template <int D>
template <int i> inline std::ostream& operator<<(std::ostream& os, const Dim<D>& d) {
typename std::enable_if<(i > 1), std::ostream&>::type operator<<( os << d[0];
std::ostream& os, const Dim<i>& d) { for (int i = 1; i < D; ++i) {
os << d.head << ", " << d.tail; os << ", " << d[i];
return os; }
}
// Base case that allows us to output a Dim
// XXX I wish this could be an overload instead of a template
template <int i>
typename std::enable_if<(i == 1), std::ostream&>::type operator<<(
std::ostream& os, const Dim<i>& d) {
os << d.head;
return os; return os;
} }
...@@ -405,17 +258,15 @@ inline std::ostream& operator<<(std::ostream& os, const Dim<0>& d) { ...@@ -405,17 +258,15 @@ inline std::ostream& operator<<(std::ostream& os, const Dim<0>& d) {
return os; return os;
} }
template <int i> template <int D>
HOST std::string Dim<i>::to_string() const { HOST std::string Dim<D>::to_string() const {
std::stringstream stream; std::stringstream stream;
stream << *this; stream << *this;
return stream.str(); return stream.str();
} }
template <int D> template <int D>
HOSTDEVICE Dim<D> linear_to_dimension(int linear_index, Dim<D> extents) { HOSTDEVICE Dim<D> linear_to_dimension(int linear_index, const Dim<D>& extents) {
Dim<D> result; Dim<D> result;
for (int i = 0; i < D - 1; ++i) { for (int i = 0; i < D - 1; ++i) {
...@@ -428,5 +279,10 @@ HOSTDEVICE Dim<D> linear_to_dimension(int linear_index, Dim<D> extents) { ...@@ -428,5 +279,10 @@ HOSTDEVICE Dim<D> linear_to_dimension(int linear_index, Dim<D> extents) {
return result; return result;
} }
template <int D, typename T1, typename T2>
inline void static_dim_assign(const T1* in, T2* out) {
UnrollAssign<D>::Run(in, out);
}
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
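With the head/tail recursion gone, the Dim<D> helpers (make_dim, linearize, product, contained, ex_prefix_mul) delegate to the unrolled functors above. A small host-side usage sketch, assuming the header compiles as shown:

#include "paddle/fluid/framework/dim.h"

using namespace paddle::framework;

void DimDemo() {
  auto size = make_dim(2, 3, 4);    // Dim<3> holding {2, 3, 4}
  auto idx = make_dim(1, 2, 3);     // an index within 'size'

  bool inside = contained(idx, size);                    // all 0 <= idx_i < size_i
  int64_t offset = linearize(idx, ex_prefix_mul(size));  // Fortran-order linear offset
  int64_t numel = product(size);                         // 24
  (void)inside; (void)offset; (void)numel;
}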
...@@ -59,7 +59,7 @@ static DLDataType GetDLDataTypeFromTypeIndex(proto::VarType::Type type) { ...@@ -59,7 +59,7 @@ static DLDataType GetDLDataTypeFromTypeIndex(proto::VarType::Type type) {
struct DLContextVisitor : public boost::static_visitor<::DLContext> { struct DLContextVisitor : public boost::static_visitor<::DLContext> {
inline ::DLContext operator()(const platform::CPUPlace &place) const { inline ::DLContext operator()(const platform::CPUPlace &place) const {
DLContext ctx; ::DLContext ctx;
ctx.device_type = kDLCPU; ctx.device_type = kDLCPU;
ctx.device_id = 0; ctx.device_id = 0;
return ctx; return ctx;
...@@ -67,7 +67,7 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> { ...@@ -67,7 +67,7 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> {
inline ::DLContext operator()(const platform::CUDAPlace &place) const { inline ::DLContext operator()(const platform::CUDAPlace &place) const {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
DLContext ctx; ::DLContext ctx;
ctx.device_type = kDLGPU; ctx.device_type = kDLGPU;
ctx.device_id = place.device; ctx.device_id = place.device;
return ctx; return ctx;
...@@ -78,7 +78,7 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> { ...@@ -78,7 +78,7 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> {
inline ::DLContext operator()(const platform::CUDAPinnedPlace &place) const { inline ::DLContext operator()(const platform::CUDAPinnedPlace &place) const {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
DLContext ctx; ::DLContext ctx;
ctx.device_type = kDLCPUPinned; ctx.device_type = kDLCPUPinned;
ctx.device_id = 0; ctx.device_id = 0;
return ctx; return ctx;
......
...@@ -38,7 +38,7 @@ class DLPackTensor { ...@@ -38,7 +38,7 @@ class DLPackTensor {
// The shape in DLTensor is defined as int64_t* // The shape in DLTensor is defined as int64_t*
// Add this member to make TVMTensor init without heap allocation // Add this member to make TVMTensor init without heap allocation
ShapeType shape_[9]; ShapeType shape_[DDim::kMaxRank];
}; };
} // namespace framework } // namespace framework
......
...@@ -22,7 +22,7 @@ limitations under the License. */ ...@@ -22,7 +22,7 @@ limitations under the License. */
#include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/transfer_scope_cache.h" #include "paddle/fluid/framework/transfer_scope_cache.h"
#include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/distributed/distributed.h"
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
......
...@@ -29,6 +29,7 @@ limitations under the License. */ ...@@ -29,6 +29,7 @@ limitations under the License. */
#include "paddle/fluid/inference/io.h" #include "paddle/fluid/inference/io.h"
#include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/timer.h"
#include "paddle/fluid/pybind/pybind.h" #include "paddle/fluid/pybind/pybind.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -180,6 +181,7 @@ void ExecutorThreadWorker::SetDevice() { ...@@ -180,6 +181,7 @@ void ExecutorThreadWorker::SetDevice() {
return; return;
#else #else
static unsigned concurrency_cap = std::thread::hardware_concurrency(); static unsigned concurrency_cap = std::thread::hardware_concurrency();
LOG(WARNING) << "concurrency capacity " << concurrency_cap;
int thread_id = this->thread_id_; int thread_id = this->thread_id_;
if (static_cast<unsigned>(thread_id) < concurrency_cap) { if (static_cast<unsigned>(thread_id) < concurrency_cap) {
...@@ -238,6 +240,55 @@ static void print_fetch_var(Scope* scope, const std::string& var_name) { ...@@ -238,6 +240,55 @@ static void print_fetch_var(Scope* scope, const std::string& var_name) {
VLOG(1) << "print_fetch_var: unrecognized data type:" << tensor.type(); VLOG(1) << "print_fetch_var: unrecognized data type:" << tensor.type();
} }
void ExecutorThreadWorker::TrainFilesWithTimer() {
platform::SetNumThreads(1);
SetDevice();
thread_reader_->Start();
std::vector<double> op_total_time;
std::vector<std::string> op_name;
for (auto& op : ops_) {
op_name.push_back(op->Type());
}
op_total_time.resize(ops_.size());
for (size_t i = 0; i < op_total_time.size(); ++i) {
op_total_time[i] = 0.0;
}
platform::Timer timeline;
double total_time = 0.0;
double read_time = 0.0;
int cur_batch;
int batch_cnt = 0;
timeline.Start();
while ((cur_batch = thread_reader_->Next()) > 0) {
timeline.Pause();
read_time += timeline.ElapsedSec();
total_time += timeline.ElapsedSec();
for (size_t i = 0; i < ops_.size(); ++i) {
timeline.Start();
ops_[i]->Run(*thread_scope_, place_);
timeline.Pause();
op_total_time[i] += timeline.ElapsedSec();
total_time += timeline.ElapsedSec();
}
++batch_cnt;
thread_scope_->DropKids();
if (thread_id_ == 0) {
if (batch_cnt > 0 && batch_cnt % 1000 == 0) {
for (size_t i = 0; i < ops_.size(); ++i) {
fprintf(stderr, "op_name:[%zu][%s], op_mean_time:[%fs]\n", i,
op_name[i].c_str(), op_total_time[i] / batch_cnt);
}
fprintf(stderr, "mean read time: %fs\n", read_time / batch_cnt);
int fetch_var_num = fetch_var_names_.size();
for (int i = 0; i < fetch_var_num; ++i) {
print_fetch_var(thread_scope_, fetch_var_names_[i]);
}
}
}
timeline.Start();
}
}
void ExecutorThreadWorker::TrainFiles() { void ExecutorThreadWorker::TrainFiles() {
platform::SetNumThreads(1); platform::SetNumThreads(1);
...@@ -320,10 +371,12 @@ void AsyncExecutorThreadWorker::SetPSlibPtr( ...@@ -320,10 +371,12 @@ void AsyncExecutorThreadWorker::SetPSlibPtr(
std::shared_ptr<paddle::distributed::PSlib> pslib_ptr) { std::shared_ptr<paddle::distributed::PSlib> pslib_ptr) {
_pslib_ptr = pslib_ptr; _pslib_ptr = pslib_ptr;
} }
void AsyncExecutorThreadWorker::SetPullDenseThread( void AsyncExecutorThreadWorker::SetPullDenseThread(
std::shared_ptr<DensePullThread> dpt) { std::shared_ptr<DensePullThread> dpt) {
_pull_dense_thread = dpt; _pull_dense_thread = dpt;
} }
void AsyncExecutorThreadWorker::TrainOneNetwork() { void AsyncExecutorThreadWorker::TrainOneNetwork() {
PrepareParams(); PrepareParams();
......
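TrainFilesWithTimer above drives a single platform::Timer through the reader and every op, accumulating per-op wall time and printing averages every 1000 batches on thread 0. The diff only adds the timer.h include, so the sketch below uses a generic steady_clock stand-in with the same Start/Pause/ElapsedSec surface; the class and function names are illustrative assumptions:

#include <chrono>
#include <cstdio>
#include <vector>

// Minimal stand-in for platform::Timer as used in the loop above (assumed API).
class SimpleTimer {
 public:
  void Start() { begin_ = std::chrono::steady_clock::now(); }
  void Pause() {
    elapsed_ = std::chrono::duration<double>(
                   std::chrono::steady_clock::now() - begin_).count();
  }
  double ElapsedSec() const { return elapsed_; }

 private:
  std::chrono::steady_clock::time_point begin_;
  double elapsed_{0.0};
};

// Mirrors the periodic reporting: print mean per-op time over batch_cnt batches.
void ReportMeanTimes(const std::vector<double>& op_total_time, int batch_cnt) {
  for (size_t i = 0; i < op_total_time.size(); ++i) {
    std::fprintf(stderr, "op:[%zu], op_mean_time:[%fs]\n", i,
                 op_total_time[i] / batch_cnt);
  }
}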
...@@ -155,6 +155,8 @@ class ExecutorThreadWorker { ...@@ -155,6 +155,8 @@ class ExecutorThreadWorker {
void SetDataFeed(const std::shared_ptr<DataFeed>& datafeed); void SetDataFeed(const std::shared_ptr<DataFeed>& datafeed);
// A multi-thread training function // A multi-thread training function
virtual void TrainFiles(); virtual void TrainFiles();
// with timer log
virtual void TrainFilesWithTimer();
// set fetch variable names from python interface assigned by users // set fetch variable names from python interface assigned by users
void SetFetchVarNames(const std::vector<std::string>& fetch_var_names); void SetFetchVarNames(const std::vector<std::string>& fetch_var_names);
#ifdef PADDLE_WITH_PSLIB #ifdef PADDLE_WITH_PSLIB
......
...@@ -75,6 +75,7 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl( ...@@ -75,6 +75,7 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl(
std::vector<Node*> optimize_ops; std::vector<Node*> optimize_ops;
std::vector<Node*> lr_ops; // ops other than forward/backward/optimize std::vector<Node*> lr_ops; // ops other than forward/backward/optimize
std::unordered_set<std::string> grad_names; std::unordered_set<std::string> grad_names;
std::unordered_map<std::string, std::string> gradname2paramname;
std::vector<ir::Node*> nodes = TopologySortOperations(*graph); std::vector<ir::Node*> nodes = TopologySortOperations(*graph);
auto origin_nodes = graph->ReleaseNodes(); auto origin_nodes = graph->ReleaseNodes();
...@@ -99,6 +100,7 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl( ...@@ -99,6 +100,7 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl(
auto op_role_vars = boost::get<std::vector<std::string>>(op_role_var); auto op_role_vars = boost::get<std::vector<std::string>>(op_role_var);
for (size_t i = 0; i < op_role_vars.size(); i += 2) { for (size_t i = 0; i < op_role_vars.size(); i += 2) {
grad_names.insert(op_role_vars[i + 1]); grad_names.insert(op_role_vars[i + 1]);
gradname2paramname[op_role_vars[i + 1]] = op_role_vars[i];
} }
} else if (op_role & static_cast<int>(framework::OpRole::kLRSched)) { } else if (op_role & static_cast<int>(framework::OpRole::kLRSched)) {
lr_ops.push_back(node); lr_ops.push_back(node);
...@@ -109,7 +111,7 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl( ...@@ -109,7 +111,7 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl(
// 2. copy forward backward // 2. copy forward backward
ir::Node* prev_repeat_last_op_node = nullptr; ir::Node* prev_repeat_last_op_node = nullptr;
// record origin_grad -> repeated grad list map. // record origin_grad -> repeated_grad_list map.
std::map<ir::Node*, std::vector<ir::Node*>> grad_repeated_map; std::map<ir::Node*, std::vector<ir::Node*>> grad_repeated_map;
std::map<std::string, std::vector<ir::Node*>> created; std::map<std::string, std::vector<ir::Node*>> created;
std::unordered_set<std::string> bn_vars_need_rename; std::unordered_set<std::string> bn_vars_need_rename;
...@@ -124,10 +126,16 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl( ...@@ -124,10 +126,16 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl(
if (grad_names.find(outname) != grad_names.end()) { if (grad_names.find(outname) != grad_names.end()) {
std::string new_gname = string::Sprintf("%s.repeat.%d", outname, i); std::string new_gname = string::Sprintf("%s.repeat.%d", outname, i);
repeated_op.RenameOutput(outname, new_gname); repeated_op.RenameOutput(outname, new_gname);
// remove op_role_var for backward ops that outputs grad for a
// parameter.
repeated_op.SetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName(),
std::vector<std::string>());
} }
} }
// 3.5 let batch_norm ops use independent vars, note batch_norm_grad do // 3.5 let batch_norm ops use independent vars, note batch_norm_grad do
// not need this update     // not need this update, because only the moving mean and variance
// should differ; the trainable scale and bias parameters are the same as
// other parameters.
if (node->Name() == "batch_norm") { if (node->Name() == "batch_norm") {
// NOTE: assume bn op created by layers use save var as output mean and // NOTE: assume bn op created by layers use save var as output mean and
// variance // variance
...@@ -224,16 +232,25 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl( ...@@ -224,16 +232,25 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl(
var->inputs.push_back(repeated_node); var->inputs.push_back(repeated_node);
} }
} }
} } // end copy forward backward
// 5. create GRAD merge op node // 5. create GRAD merge op node: sum(repeat.0...repeat.n) ->
// scale(1/num_repeats)
for (auto kv : grad_repeated_map) { for (auto kv : grad_repeated_map) {
OpDesc sum_op; OpDesc sum_op;
sum_op.SetType("sum"); sum_op.SetType("sum");
std::vector<std::string> repeated_grad_names; std::vector<std::string> repeated_grad_names;
std::vector<std::string> param_grad_op_role_var;
for (auto r : kv.second) { for (auto r : kv.second) {
repeated_grad_names.push_back(r->Var()->Name()); repeated_grad_names.push_back(r->Var()->Name());
} }
// NOTE: use op_role_var to control allreduce op appending in
// multi_devices_graph_pass; we want to append op_role_var
// only once for the merged gradient, so it is set once here, outside the per-repeat loop.
param_grad_op_role_var.push_back(
gradname2paramname.at(kv.first->Var()->Name())); // param
param_grad_op_role_var.push_back(kv.first->Var()->Name()); // grad
sum_op.SetInput("X", repeated_grad_names); sum_op.SetInput("X", repeated_grad_names);
sum_op.SetOutput("Out", {kv.first->Var()->Name()}); sum_op.SetOutput("Out", {kv.first->Var()->Name()});
sum_op.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), sum_op.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
...@@ -256,6 +273,10 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl( ...@@ -256,6 +273,10 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl(
scale_op.SetAttr("scale", static_cast<float>(1.0f / num_repeats)); scale_op.SetAttr("scale", static_cast<float>(1.0f / num_repeats));
scale_op.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), scale_op.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
static_cast<int>(OpRole::kBackward)); static_cast<int>(OpRole::kBackward));
scale_op.SetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName(),
param_grad_op_role_var);
auto scale_op_node = result.CreateOpNode(&scale_op); auto scale_op_node = result.CreateOpNode(&scale_op);
scale_op_node->inputs.push_back(sum_out_var_node); scale_op_node->inputs.push_back(sum_out_var_node);
sum_out_var_node->outputs.push_back(scale_op_node); sum_out_var_node->outputs.push_back(scale_op_node);
......
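The batch-merge pass above merges the repeated gradients of each parameter with a sum op followed by a scale op using factor 1/num_repeats, i.e. it averages the per-repeat gradients before the allreduce/optimizer stage. A minimal numeric sketch of that reduce-then-scale step on plain float buffers (the function and buffers are illustrative, not part of the pass):

#include <cassert>
#include <vector>

// merged = (1 / num_repeats) * sum_i repeats[i], mirroring sum -> scale above.
void MergeRepeatedGrads(const std::vector<std::vector<float>>& repeats,
                        std::vector<float>* merged) {
  const size_t num_repeats = repeats.size();
  assert(num_repeats > 0);
  merged->assign(repeats[0].size(), 0.0f);
  for (const auto& g : repeats) {                      // sum(repeat.0..repeat.n)
    for (size_t i = 0; i < g.size(); ++i) (*merged)[i] += g[i];
  }
  for (float& v : *merged) v *= 1.0f / num_repeats;    // scale(1/num_repeats)
}

int main() {
  std::vector<std::vector<float>> repeats = {{2.f, 4.f}, {4.f, 8.f}};
  std::vector<float> merged;
  MergeRepeatedGrads(repeats, &merged);
  assert(merged[0] == 3.f && merged[1] == 6.f);
  return 0;
}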
...@@ -399,7 +399,7 @@ void NgraphEngine::BuildNgFunction() { ...@@ -399,7 +399,7 @@ void NgraphEngine::BuildNgFunction() {
BuildNgNodes(); BuildNgNodes();
ngraph_function_ = nullptr; ngraph_function_ = nullptr;
ngraph::NodeVector func_outputs; ngraph::NodeVector func_outputs;
ngraph::op::ParameterVector func_inputs; ngraph::ParameterVector func_inputs;
for (auto& vo : var_out_) { for (auto& vo : var_out_) {
func_outputs.push_back(var_node_map_->at(vo)); func_outputs.push_back(var_node_map_->at(vo));
......
...@@ -16,7 +16,6 @@ limitations under the License. */ ...@@ -16,7 +16,6 @@ limitations under the License. */
#include <glog/logging.h> #include <glog/logging.h>
#include <algorithm> #include <algorithm>
#include "paddle/fluid/framework/data_transform.h" #include "paddle/fluid/framework/data_transform.h"
#include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
...@@ -1041,12 +1040,11 @@ Scope* OperatorWithKernel::PrepareData( ...@@ -1041,12 +1040,11 @@ Scope* OperatorWithKernel::PrepareData(
proto::VarType::Type OperatorWithKernel::IndicateDataType( proto::VarType::Type OperatorWithKernel::IndicateDataType(
const ExecutionContext& ctx) const { const ExecutionContext& ctx) const {
auto& scope = ctx.scope();
int data_type = -1; int data_type = -1;
std::string last_input_name;
for (auto& input : this->inputs_) { for (auto& input : this->inputs_) {
for (auto& ipt_name : input.second) { const std::vector<const Variable*> vars = ctx.MultiInputVar(input.first);
auto* var = scope.FindVar(ipt_name); for (size_t i = 0; i < vars.size(); ++i) {
const Variable* var = vars[i];
if (var != nullptr) { if (var != nullptr) {
const Tensor* t = nullptr; const Tensor* t = nullptr;
if (var->IsType<Tensor>()) { if (var->IsType<Tensor>()) {
...@@ -1057,15 +1055,14 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType( ...@@ -1057,15 +1055,14 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType(
t = &(var->Get<SelectedRows>().value()); t = &(var->Get<SelectedRows>().value());
} }
if (t != nullptr) { if (t != nullptr) {
PADDLE_ENFORCE(t->IsInitialized(), "Input %s is not initialized",         PADDLE_ENFORCE(t->IsInitialized(), "Input %s(%lu) is not initialized",
ipt_name); input.first, i);
int tmp = static_cast<int>(t->type()); int tmp = static_cast<int>(t->type());
PADDLE_ENFORCE( PADDLE_ENFORCE(
tmp == data_type || data_type == -1, tmp == data_type || data_type == -1,
"DataType of Paddle Op %s must be the same. Get %s(%d) != %s(%d)", "DataType of Paddle Op %s must be the same. Get (%d) != (%d)",
Type(), last_input_name, data_type, ipt_name, tmp); Type(), data_type, tmp);
data_type = tmp; data_type = tmp;
last_input_name = ipt_name;
} }
} }
} }
......
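The rewritten IndicateDataType above walks every input through ctx.MultiInputVar, skips uninitialized variables, and enforces that all initialized input tensors share a single data type, which then becomes the kernel's data type. A hedged sketch of that consistency rule over a flat list of optional dtypes (the enum and helper are assumptions for illustration only):

#include <stdexcept>
#include <vector>

enum class DType { kUnset = -1, kFP32, kFP64, kINT64 };  // illustrative enum

// Pick the single data type shared by all present inputs; throw on a mismatch.
DType IndicateSingleDType(const std::vector<DType>& input_dtypes) {
  DType result = DType::kUnset;
  for (DType t : input_dtypes) {
    if (t == DType::kUnset) continue;            // uninitialized input: skip it
    if (result != DType::kUnset && t != result)  // all inputs must agree
      throw std::runtime_error("DataType of inputs must be the same");
    result = t;
  }
  return result;
}

int main() {
  DType d = IndicateSingleDType({DType::kUnset, DType::kFP32, DType::kFP32});
  return d == DType::kFP32 ? 0 : 1;
}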
...@@ -81,6 +81,10 @@ class RuntimeContext { ...@@ -81,6 +81,10 @@ class RuntimeContext {
RuntimeContext(const VariableNameMap& innames, RuntimeContext(const VariableNameMap& innames,
const VariableNameMap& outnames, const Scope& scope); const VariableNameMap& outnames, const Scope& scope);
RuntimeContext(const VariableValueMap& invars,
const VariableValueMap& outvars)
: inputs(invars), outputs(outvars) {}
VariableValueMap inputs; VariableValueMap inputs;
VariableValueMap outputs; VariableValueMap outputs;
}; };
...@@ -447,8 +451,9 @@ class OperatorWithKernel : public OperatorBase { ...@@ -447,8 +451,9 @@ class OperatorWithKernel : public OperatorBase {
void RuntimeInferShape(const Scope& scope, const platform::Place& place, void RuntimeInferShape(const Scope& scope, const platform::Place& place,
const RuntimeContext& ctx) const override; const RuntimeContext& ctx) const override;
protected:
virtual OpKernelType GetExpectedKernelType(const ExecutionContext& ctx) const; virtual OpKernelType GetExpectedKernelType(const ExecutionContext& ctx) const;
protected:
virtual OpKernelType GetKernelTypeForVar( virtual OpKernelType GetKernelTypeForVar(
const std::string& var_name, const Tensor& tensor, const std::string& var_name, const Tensor& tensor,
const OpKernelType& expected_kernel_type) const; const OpKernelType& expected_kernel_type) const;
......
...@@ -16,6 +16,8 @@ limitations under the License. */ ...@@ -16,6 +16,8 @@ limitations under the License. */
#if !defined(_WIN32) #if !defined(_WIN32)
#include <pthread.h> #include <pthread.h>
#else
#include <mutex> // NOLINT
#endif // !_WIN32 #endif // !_WIN32
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
...@@ -29,17 +31,17 @@ struct RWLock { ...@@ -29,17 +31,17 @@ struct RWLock {
~RWLock() { pthread_rwlock_destroy(&lock_); } ~RWLock() { pthread_rwlock_destroy(&lock_); }
void RDLock() { inline void RDLock() {
PADDLE_ENFORCE_EQ(pthread_rwlock_rdlock(&lock_), 0, PADDLE_ENFORCE_EQ(pthread_rwlock_rdlock(&lock_), 0,
"acquire read lock failed"); "acquire read lock failed");
} }
void WRLock() { inline void WRLock() {
PADDLE_ENFORCE_EQ(pthread_rwlock_wrlock(&lock_), 0, PADDLE_ENFORCE_EQ(pthread_rwlock_wrlock(&lock_), 0,
"acquire write lock failed"); "acquire write lock failed");
} }
void UNLock() { inline void UNLock() {
PADDLE_ENFORCE_EQ(pthread_rwlock_unlock(&lock_), 0, "unlock failed"); PADDLE_ENFORCE_EQ(pthread_rwlock_unlock(&lock_), 0, "unlock failed");
} }
...@@ -51,81 +53,46 @@ struct RWLock { ...@@ -51,81 +53,46 @@ struct RWLock {
// https://stackoverflow.com/questions/7125250/making-pthread-rwlock-wrlock-recursive // https://stackoverflow.com/questions/7125250/making-pthread-rwlock-wrlock-recursive
// In windows, rw_lock seems like a hack. Use empty object and do nothing. // In windows, rw_lock seems like a hack. Use empty object and do nothing.
struct RWLock { struct RWLock {
void RDLock() {} // FIXME(minqiyang): use mutex here to do fake lock
void WRLock() {} inline void RDLock() { mutex_.lock(); }
void UNLock() {}
inline void WRLock() { mutex_.lock(); }
inline void UNLock() { mutex_.unlock(); }
private:
std::mutex mutex_;
}; };
#endif #endif
class RWLockGuard { class AutoWRLock {
public: public:
enum Status { kUnLock, kWRLock, kRDLock }; explicit AutoWRLock(RWLock* rw_lock) : lock_(rw_lock) { Lock(); }
RWLockGuard(RWLock* rw_lock, Status init_status)
: lock_(rw_lock), status_(Status::kUnLock) {
switch (init_status) {
case Status::kRDLock: {
RDLock();
break;
}
case Status::kWRLock: {
WRLock();
break;
}
case Status::kUnLock: {
break;
}
}
}
void WRLock() { ~AutoWRLock() { UnLock(); }
switch (status_) {
case Status::kUnLock: {
lock_->WRLock();
status_ = Status::kWRLock;
break;
}
case Status::kWRLock: {
break;
}
case Status::kRDLock: {
PADDLE_THROW(
"Please unlock read lock first before invoking write lock.");
break;
}
}
}
void RDLock() { private:
switch (status_) { inline void Lock() { lock_->WRLock(); }
case Status::kUnLock: {
lock_->RDLock();
status_ = Status::kRDLock;
break;
}
case Status::kRDLock: {
break;
}
case Status::kWRLock: {
PADDLE_THROW(
"Please unlock write lock first before invoking read lock.");
break;
}
}
}
void UnLock() { inline void UnLock() { lock_->UNLock(); }
if (status_ != Status::kUnLock) {
lock_->UNLock(); private:
status_ = Status::kUnLock; RWLock* lock_;
} };
}
class AutoRDLock {
public:
explicit AutoRDLock(RWLock* rw_lock) : lock_(rw_lock) { Lock(); }
~AutoRDLock() { UnLock(); }
private:
inline void Lock() { lock_->RDLock(); }
~RWLockGuard() { UnLock(); } inline void UnLock() { lock_->UNLock(); }
private: private:
RWLock* lock_; RWLock* lock_;
Status status_;
}; };
} // namespace framework } // namespace framework
......
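AutoRDLock and AutoWRLock above are plain RAII guards: the constructor acquires the read or write side of the RWLock and the destructor releases it, so a scoped block is the whole critical section. A minimal usage sketch with std::shared_mutex standing in for RWLock (the stand-in type is an assumption; only the guard shape mirrors the classes above):

#include <shared_mutex>
#include <string>
#include <unordered_map>

std::shared_mutex rw_lock;                      // stands in for RWLock
std::unordered_map<std::string, int> table;

int Read(const std::string& key) {
  std::shared_lock<std::shared_mutex> guard(rw_lock);  // like AutoRDLock
  auto it = table.find(key);
  return it == table.end() ? -1 : it->second;
}                                                      // lock released here

void Write(const std::string& key, int value) {
  std::unique_lock<std::shared_mutex> guard(rw_lock);  // like AutoWRLock
  table[key] = value;
}

int main() {
  Write("a", 1);
  return Read("a") == 1 ? 0 : 1;
}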
...@@ -47,9 +47,15 @@ DEFINE_bool(fast_eager_deletion_mode, false, ...@@ -47,9 +47,15 @@ DEFINE_bool(fast_eager_deletion_mode, false,
// the mutex will cause serious performance issue. // the mutex will cause serious performance issue.
// So the mutex is disabled when `ON_INFER`. // So the mutex is disabled when `ON_INFER`.
#ifdef PADDLE_ON_INFERENCE #ifdef PADDLE_ON_INFERENCE
#define SCOPE_LOCK_GUARD #define SCOPE_KIDS_READER_LOCK
#define SCOPE_KIDS_WRITER_LOCK
#define SCOPE_VARS_READER_LOCK
#define SCOPE_VARS_WRITER_LOCK
#else #else
#define SCOPE_LOCK_GUARD std::lock_guard<std::mutex> lock(mutex_); #define SCOPE_KIDS_READER_LOCK AutoRDLock auto_lock(&kids_lock_);
#define SCOPE_KIDS_WRITER_LOCK AutoWRLock auto_lock(&kids_lock_);
#define SCOPE_VARS_READER_LOCK AutoRDLock auto_lock(&vars_lock_);
#define SCOPE_VARS_WRITER_LOCK AutoWRLock auto_lock(&vars_lock_);
#endif #endif
namespace paddle { namespace paddle {
...@@ -67,64 +73,69 @@ bool IsFastEagerDeletionModeEnabled() { return FLAGS_fast_eager_deletion_mode; } ...@@ -67,64 +73,69 @@ bool IsFastEagerDeletionModeEnabled() { return FLAGS_fast_eager_deletion_mode; }
Scope::~Scope() { DropKids(); } Scope::~Scope() { DropKids(); }
Scope& Scope::NewScope() const { Scope& Scope::NewScope() const {
SCOPE_LOCK_GUARD Scope* child = new Scope(this);
kids_.push_back(new Scope(this)); {
return *kids_.back(); SCOPE_KIDS_WRITER_LOCK
kids_.push_back(child);
}
return *child;
} }
Variable* Scope::Var(const std::string& name) { Variable* Scope::Var(const std::string& name) {
SCOPE_LOCK_GUARD SCOPE_VARS_WRITER_LOCK
return VarInternal(name); return VarInternal(name);
} }
Variable* Scope::Var(std::string* name) { Variable* Scope::Var(std::string* name) {
SCOPE_LOCK_GUARD
auto new_name = string::Sprintf("%p.%d", this, vars_.size()); auto new_name = string::Sprintf("%p.%d", this, vars_.size());
if (name != nullptr) { if (name != nullptr) {
*name = new_name; *name = new_name;
} }
SCOPE_VARS_WRITER_LOCK
return VarInternal(new_name); return VarInternal(new_name);
} }
Variable* Scope::FindVar(const std::string& name) const { Variable* Scope::FindVar(const std::string& name) const {
SCOPE_LOCK_GUARD SCOPE_VARS_READER_LOCK
return FindVarInternal(name); return FindVarInternal(name);
} }
Variable* Scope::FindLocalVar(const std::string& name) const { Variable* Scope::FindLocalVar(const std::string& name) const {
SCOPE_LOCK_GUARD SCOPE_VARS_READER_LOCK
return FindVarLocally(name); return FindVarLocally(name);
} }
const Scope* Scope::FindScope(const Variable* var) const { const Scope* Scope::FindScope(const Variable* var) const {
SCOPE_LOCK_GUARD SCOPE_VARS_READER_LOCK
return FindScopeInternal(var); return FindScopeInternal(var);
} }
void Scope::DropKids() { void Scope::DropKids() {
SCOPE_LOCK_GUARD SCOPE_KIDS_WRITER_LOCK
for (Scope* s : kids_) delete s; for (Scope* s : kids_) delete s;
kids_.clear(); kids_.clear();
} }
bool Scope::HasKid(const Scope* scope) const { bool Scope::HasKid(const Scope* scope) const {
SCOPE_LOCK_GUARD SCOPE_KIDS_READER_LOCK
auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
return it != this->kids_.end(); return it != this->kids_.end();
} }
std::vector<std::string> Scope::LocalVarNames() const { std::vector<std::string> Scope::LocalVarNames() const {
SCOPE_LOCK_GUARD
std::vector<std::string> known_vars; std::vector<std::string> known_vars;
{
SCOPE_VARS_READER_LOCK
known_vars.reserve(this->vars_.size()); known_vars.reserve(this->vars_.size());
for (auto& p : vars_) { for (auto& p : vars_) {
known_vars.emplace_back(p.first); known_vars.emplace_back(p.first);
} }
}
return known_vars; return known_vars;
} }
void Scope::DeleteScope(Scope* scope) const { void Scope::DeleteScope(Scope* scope) const {
SCOPE_LOCK_GUARD SCOPE_KIDS_WRITER_LOCK
auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
PADDLE_ENFORCE(it != this->kids_.end(), "%p Cannot find %p as kid scope", PADDLE_ENFORCE(it != this->kids_.end(), "%p Cannot find %p as kid scope",
this, scope); this, scope);
...@@ -138,8 +149,8 @@ void Scope::DeleteScope(Scope* scope) const { ...@@ -138,8 +149,8 @@ void Scope::DeleteScope(Scope* scope) const {
} }
void Scope::EraseVars(const std::vector<std::string>& var_names) { void Scope::EraseVars(const std::vector<std::string>& var_names) {
SCOPE_LOCK_GUARD
std::set<std::string> var_set(var_names.begin(), var_names.end()); std::set<std::string> var_set(var_names.begin(), var_names.end());
SCOPE_VARS_WRITER_LOCK
for (auto it = vars_.begin(); it != vars_.end();) { for (auto it = vars_.begin(); it != vars_.end();) {
if (var_set.find(it->first) != var_set.end()) { if (var_set.find(it->first) != var_set.end()) {
it = vars_.erase(it); it = vars_.erase(it);
...@@ -151,12 +162,12 @@ void Scope::EraseVars(const std::vector<std::string>& var_names) { ...@@ -151,12 +162,12 @@ void Scope::EraseVars(const std::vector<std::string>& var_names) {
void Scope::Rename(const std::string& origin_name, void Scope::Rename(const std::string& origin_name,
const std::string& new_name) const { const std::string& new_name) const {
SCOPE_LOCK_GUARD SCOPE_VARS_WRITER_LOCK
RenameInternal(origin_name, new_name); RenameInternal(origin_name, new_name);
} }
std::string Scope::Rename(const std::string& origin_name) const { std::string Scope::Rename(const std::string& origin_name) const {
SCOPE_LOCK_GUARD SCOPE_VARS_WRITER_LOCK
auto new_name = string::Sprintf("%p.%d", this, vars_.size()); auto new_name = string::Sprintf("%p.%d", this, vars_.size());
RenameInternal(origin_name, new_name); RenameInternal(origin_name, new_name);
return new_name; return new_name;
......
...@@ -14,12 +14,18 @@ limitations under the License. */ ...@@ -14,12 +14,18 @@ limitations under the License. */
#pragma once #pragma once
extern "C" {
#include <xxhash.h>
}
#include <list> #include <list>
#include <mutex> // NOLINT #include <memory>
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/framework/rw_lock.h"
#include "paddle/fluid/framework/variable.h" #include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/macros.h" #include "paddle/fluid/platform/macros.h"
...@@ -95,7 +101,14 @@ class Scope { ...@@ -95,7 +101,14 @@ class Scope {
std::string Rename(const std::string& origin_name) const; std::string Rename(const std::string& origin_name) const;
protected: protected:
mutable std::unordered_map<std::string, std::unique_ptr<Variable>> vars_; struct KeyHasher {
std::size_t operator()(const std::string& key) const {
return XXH32(key.c_str(), key.size(), 1);
}
};
mutable std::unordered_map<std::string, std::unique_ptr<Variable>, KeyHasher>
vars_;
private: private:
// Call Scope::NewScope for a sub-scope. // Call Scope::NewScope for a sub-scope.
...@@ -124,7 +137,8 @@ class Scope { ...@@ -124,7 +137,8 @@ class Scope {
DISABLE_COPY_AND_ASSIGN(Scope); DISABLE_COPY_AND_ASSIGN(Scope);
private: private:
mutable std::mutex mutex_; mutable RWLock kids_lock_;
mutable RWLock vars_lock_;
}; };
// Generate some debug string about the inheritance structure of scope, quite // Generate some debug string about the inheritance structure of scope, quite
......
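Scope above now hashes variable names with XXH32 through the KeyHasher functor rather than std::hash<std::string>; the hook is simply the third template argument of std::unordered_map. A small sketch of the same plumbing with a toy FNV-1a hasher standing in for xxhash (only the wiring matches the change above; the hash body is illustrative):

#include <cstdint>
#include <string>
#include <unordered_map>

// Toy FNV-1a hash standing in for XXH32(key.c_str(), key.size(), 1).
struct KeyHasher {
  std::size_t operator()(const std::string& key) const {
    std::uint64_t h = 1469598103934665603ull;
    for (unsigned char c : key) {
      h ^= c;
      h *= 1099511628211ull;
    }
    return static_cast<std::size_t>(h);
  }
};

int main() {
  std::unordered_map<std::string, int, KeyHasher> vars;  // hasher plugged in here
  vars["fc_0.w_0@GRAD"] = 1;
  return vars.count("fc_0.w_0@GRAD") == 1 ? 0 : 1;
}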
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cstddef>
#include <type_traits>
#include "paddle/fluid/platform/hostdevice.h"
namespace paddle {
namespace framework {
namespace detail {
template <size_t kStart, size_t kEnd, bool kStop>
struct UnrollFillConstant {
template <typename T>
HOSTDEVICE inline static void Run(T *data, T val) {
data[kStart] = val;
UnrollFillConstant<kStart + 1, kEnd, kStart + 1 == kEnd>::Run(data, val);
}
};
template <size_t kStart, size_t kEnd>
struct UnrollFillConstant<kStart, kEnd, true> {
template <typename T>
HOSTDEVICE inline static void Run(T *data, T val) {}
};
template <size_t kStart, size_t kEnd, bool kStop>
struct UnrollAssign {
template <typename Tin, typename Tout>
HOSTDEVICE inline static void Run(const Tin *d1, Tout *d2) {
d2[kStart] = static_cast<Tout>(d1[kStart]);
UnrollAssign<kStart + 1, kEnd, kStart + 1 == kEnd>::Run(d1, d2);
}
};
template <size_t kStart, size_t kEnd>
struct UnrollAssign<kStart, kEnd, true> {
template <typename Tin, typename Tout>
HOSTDEVICE inline static void Run(const Tin *d1, Tout *d2) {}
};
template <typename T, size_t kStart, size_t kEnd, bool kStop>
struct UnrollVarArgsAssignImpl {
template <typename... Args>
HOSTDEVICE inline static void Run(T *d, T val, Args... args) {
static_assert(sizeof...(args) + 1 == kEnd - kStart, "Wrong argument");
d[kStart] = val;
UnrollVarArgsAssignImpl<T, kStart + 1, kEnd, kStart + 1 == kEnd>::Run(
d, args...);
}
};
template <typename T, size_t kStart, size_t kEnd>
struct UnrollVarArgsAssignImpl<T, kStart, kEnd, true> {
HOSTDEVICE inline static void Run(T *d) {}
};
template <typename T>
struct UnrollVarArgsAssign {
template <typename... Args>
HOSTDEVICE inline static void Run(T *d, Args... args) {
UnrollVarArgsAssignImpl<T, 0, sizeof...(Args), sizeof...(Args) == 0>::Run(
d, args...);
}
};
template <size_t kStart, size_t kEnd, bool kStop>
struct UnrollCompare {
template <typename T>
HOSTDEVICE inline static bool Run(const T *d1, const T *d2) {
return d1[kStart] == d2[kStart] &&
UnrollCompare<kStart + 1, kEnd, kStart + 1 == kEnd>::Run(d1, d2);
}
};
template <size_t kStart, size_t kEnd>
struct UnrollCompare<kStart, kEnd, true> {
template <typename T>
HOSTDEVICE inline constexpr static bool Run(const T *d1, const T *d2) {
return true;
}
};
template <size_t kStart, size_t kEnd, bool kStop>
struct UnrollAdd {
template <typename T>
HOSTDEVICE inline static void Run(const T *d1, const T *d2, T *d3) {
d3[kStart] = d1[kStart] + d2[kStart];
UnrollAdd<kStart + 1, kEnd, kStart + 1 == kEnd>::Run(d1, d2, d3);
}
};
template <size_t kStart, size_t kEnd>
struct UnrollAdd<kStart, kEnd, true> {
template <typename T>
HOSTDEVICE inline static void Run(const T *d1, const T *d2, T *d3) {}
};
template <size_t kStart, size_t kEnd, bool kStop>
struct UnrollMul {
template <typename T>
HOSTDEVICE inline static void Run(const T *d1, const T *d2, T *d3) {
d3[kStart] = d1[kStart] * d2[kStart];
UnrollMul<kStart + 1, kEnd, kStart + 1 == kEnd>::Run(d1, d2, d3);
}
};
template <size_t kStart, size_t kEnd>
struct UnrollMul<kStart, kEnd, true> {
template <typename T>
HOSTDEVICE inline static void Run(const T *d1, const T *d2, T *d3) {}
};
template <size_t kStart, size_t kEnd, bool kStop>
struct UnrollProduct {
template <typename T>
HOSTDEVICE inline static T Run(const T *d) {
return d[kStart] *
UnrollProduct<kStart + 1, kEnd, kStart + 1 == kEnd>::Run(d);
}
template <typename T>
HOSTDEVICE inline static T Run(const T *d1, const T *d2) {
return d1[kStart] * d2[kStart] +
UnrollProduct<kStart + 1, kEnd, kStart + 1 == kEnd>::Run(d1, d2);
}
};
template <size_t kStart, size_t kEnd>
struct UnrollProduct<kStart, kEnd, true> {
template <typename T>
HOSTDEVICE inline constexpr static T Run(const T *d) {
return 1;
}
template <typename T>
HOSTDEVICE inline constexpr static T Run(const T *d1, const T *d2) {
return 0;
}
};
} // namespace detail
template <size_t N>
using UnrollFillConstant = detail::UnrollFillConstant<0, N, N == 0>;
template <size_t N>
using UnrollAssign = detail::UnrollAssign<0, N, N == 0>;
template <typename T>
using UnrollVarArgsAssign = detail::UnrollVarArgsAssign<T>;
template <size_t N>
using UnrollCompare = detail::UnrollCompare<0, N, N == 0>;
template <size_t N>
using UnrollAdd = detail::UnrollAdd<0, N, N == 0>;
template <size_t N>
using UnrollMul = detail::UnrollMul<0, N, N == 0>;
template <size_t N>
using UnrollProduct = detail::UnrollProduct<0, N, N == 0>;
} // namespace framework
} // namespace paddle
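Each Unroll* helper above recurses on kStart + 1 and terminates through the partial specialization selected once kStart + 1 == kEnd, so the whole loop is expanded at compile time. A self-contained miniature of the same termination idiom, checked with static_assert (MiniProduct is an illustrative stand-in, not the header's UnrollProduct):

#include <cstddef>

template <size_t kStart, size_t kEnd, bool kStop>
struct MiniProduct {
  template <typename T>
  static constexpr T Run(const T *d) {
    // The next instantiation flips kStop to true once kStart + 1 == kEnd.
    return d[kStart] *
           MiniProduct<kStart + 1, kEnd, kStart + 1 == kEnd>::Run(d);
  }
};

template <size_t kStart, size_t kEnd>
struct MiniProduct<kStart, kEnd, true> {  // kStop == true ends the expansion
  template <typename T>
  static constexpr T Run(const T *) {
    return 1;
  }
};

constexpr int kDims[] = {2, 3, 4};
// Unfolds to kDims[0] * kDims[1] * kDims[2] * 1 == 24, all at compile time.
static_assert(MiniProduct<0, 3, false>::Run(kDims) == 24, "unrolled product");

int main() { return 0; }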
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/unroll_array_ops.h"
#include <gtest/gtest.h>
#include <algorithm>
#include <array>
#include <cstdint>
namespace paddle {
namespace framework {
template <typename T>
bool CheckEquality(const T* p, size_t n, T val) {
return std::all_of(p, p + n, [val](const T& v) { return v == val; });
}
template <int D1, int D2>
bool FillConstantTestMain() {
static_assert(D1 >= D2, "");
std::array<int, D1> arr;
arr.fill(0);
UnrollFillConstant<D2>::Run(arr.data(), 1);
return CheckEquality(arr.data(), D2, 1) &&
CheckEquality(arr.data() + D2, arr.size() - D2, 0);
}
TEST(unroll_ops, fill_constant) {
EXPECT_TRUE((FillConstantTestMain<9, 0>()));
EXPECT_TRUE((FillConstantTestMain<9, 1>()));
EXPECT_TRUE((FillConstantTestMain<9, 4>()));
EXPECT_TRUE((FillConstantTestMain<9, 9>()));
}
TEST(unroll_ops, assign) {
const int a[] = {1, 2, 3, 4, 5};
int b[] = {0, 0, 0, 0, 0};
UnrollAssign<3>::Run(a, b);
EXPECT_EQ(b[0], 1);
EXPECT_EQ(b[1], 2);
EXPECT_EQ(b[2], 3);
EXPECT_EQ(b[3], 0);
EXPECT_EQ(b[4], 0);
}
TEST(unroll_ops, var_args_assign) {
int a[] = {0, 0, 0};
UnrollVarArgsAssign<int>::Run(a, 1, 2);
EXPECT_EQ(a[0], 1);
EXPECT_EQ(a[1], 2);
EXPECT_EQ(a[2], 0);
}
TEST(unroll_ops, compare) {
int a[] = {1, 2, 3};
int b[] = {1, 2, 4};
EXPECT_TRUE(UnrollCompare<2>::Run(a, b));
EXPECT_FALSE(UnrollCompare<3>::Run(a, b));
b[0] = -1;
EXPECT_TRUE(UnrollCompare<0>::Run(a, b));
EXPECT_FALSE(UnrollCompare<1>::Run(a, b));
}
TEST(unroll_ops, add) {
int a[] = {2, 3, 4};
int b[] = {5, 10, 102};
int c[] = {0, 0, 0};
UnrollAdd<2>::Run(a, b, c);
EXPECT_EQ(a[0] + b[0], c[0]);
EXPECT_EQ(a[1] + b[1], c[1]);
EXPECT_EQ(c[2], 0);
}
TEST(unroll_ops, mul) {
int a[] = {2, 3, 4};
int b[] = {5, 10, 102};
int c[] = {0, 0, 0};
UnrollMul<2>::Run(a, b, c);
EXPECT_EQ(a[0] * b[0], c[0]);
EXPECT_EQ(a[1] * b[1], c[1]);
EXPECT_EQ(c[2], 0);
}
TEST(unroll_ops, product) {
int a[] = {2, 3, 4};
int b[] = {5, 10, 102};
EXPECT_EQ(UnrollProduct<3>::Run(a), a[0] * a[1] * a[2]);
EXPECT_EQ(UnrollProduct<3>::Run(a, b),
a[0] * b[0] + a[1] * b[1] + a[2] * b[2]);
}
} // namespace framework
} // namespace paddle
...@@ -42,13 +42,9 @@ void AddTo(Variable* src, Variable* dst) { ...@@ -42,13 +42,9 @@ void AddTo(Variable* src, Variable* dst) {
class Autograd { class Autograd {
public: public:
explicit Autograd(framework::Scope* scope) : scope_(scope) {} Autograd() {}
void RunBackward(VarBase* var) { void RunBackward(VarBase* var) {
PADDLE_ENFORCE(var->pre_op_->op_desc_);
// TODO(panyx0718): Only create for vars that "require_grad"
(*var->pre_op_->output_vars_)[var->pre_op_out_idx_]->grads_ = var->grads_;
std::deque<OpBase*> ready; std::deque<OpBase*> ready;
ready.push_back(var->pre_op_); ready.push_back(var->pre_op_);
...@@ -57,11 +53,14 @@ class Autograd { ...@@ -57,11 +53,14 @@ class Autograd {
while (!ready.empty()) { while (!ready.empty()) {
OpBase* ready_op = ready.front(); OpBase* ready_op = ready.front();
ready.pop_front(); ready.pop_front();
std::vector<Variable*> input_grads = ready_op->ApplyGrad(scope_); std::map<std::string, std::vector<VarBase*>> input_grads =
ready_op->ApplyGrad();
for (size_t i = 0; i < input_grads.size(); ++i) {
if (!input_grads[i]) continue; for (auto it : input_grads) {
OpBase* pre_op = ready_op->pre_ops_->at(i); const std::vector<VarBase*>& ingrads = it.second;
for (size_t i = 0; i < ingrads.size(); ++i) {
if (!ingrads[i]) continue;
OpBase* pre_op = ready_op->pre_ops_[it.first][i];
if (!pre_op) continue; if (!pre_op) continue;
dep_counts[pre_op] -= 1; dep_counts[pre_op] -= 1;
...@@ -73,6 +72,7 @@ class Autograd { ...@@ -73,6 +72,7 @@ class Autograd {
} }
} }
} }
}
private: private:
std::map<OpBase*, int> ComputeDepCounts(OpBase* op) { std::map<OpBase*, int> ComputeDepCounts(OpBase* op) {
...@@ -85,7 +85,8 @@ class Autograd { ...@@ -85,7 +85,8 @@ class Autograd {
while (!queue.empty()) { while (!queue.empty()) {
OpBase* candidate = queue.front(); OpBase* candidate = queue.front();
queue.pop_front(); queue.pop_front();
for (OpBase* pre_op : *(candidate->pre_ops_)) { for (auto it : candidate->pre_ops_) {
for (OpBase* pre_op : it.second) {
if (!pre_op) continue; if (!pre_op) continue;
if (visited.find(pre_op) == visited.end()) { if (visited.find(pre_op) == visited.end()) {
visited.insert(pre_op); visited.insert(pre_op);
...@@ -94,129 +95,74 @@ class Autograd { ...@@ -94,129 +95,74 @@ class Autograd {
ret[pre_op] += 1; ret[pre_op] += 1;
} }
} }
}
return ret; return ret;
} }
framework::Scope* scope_;
}; };
framework::Variable* CreateVariable(const std::string& name,
const framework::DDim& dim, float val,
framework::Scope* scope,
bool random_name = true) {
std::string varname = name;
if (random_name) {
std::mt19937 rng;
rng.seed(std::random_device()());
std::uniform_int_distribution<std::mt19937::result_type> dist6(
1, std::numeric_limits<int>::max());
int id = dist6(rng);
varname = string::Sprintf("%s@%d", varname, id);
}
VLOG(3) << "creating var " << varname;
framework::Variable* var = scope->Var(varname);
framework::LoDTensor* tensor = var->GetMutable<framework::LoDTensor>();
float* data = tensor->mutable_data<float>(dim, platform::CPUPlace());
std::fill(data, data + tensor->numel(), val);
return var;
}
framework::LoDTensor& VarBase::Grad() { framework::LoDTensor& VarBase::Grad() {
VLOG(3) << "get var grad " << var_desc_->Name(); VLOG(3) << "get var grad " << var_desc_->Name();
return *grads_->GetMutable<framework::LoDTensor>(); return *grads_->GetMutable<framework::LoDTensor>();
} }
void VarBase::ApplyGrad(framework::Scope* scope, Variable* grad) { std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {
VLOG(3) << "apply var grad " << var_desc_->Name() << " " if (!grad_op_desc_) {
<< grad->Get<framework::LoDTensor>().data<float>()[0]; VLOG(3) << "op with no grad: " << op_desc_->Type();
if (!grads_) { return {};
grads_ = }
CreateVariable(string::Sprintf("%s@IGrad", var_desc_->Name()),
var_->Get<framework::LoDTensor>().dims(), 0.0, scope);
}
AddTo(grad, grads_);
VLOG(3) << "grad_ after apply var grad " << var_desc_->Name() << " "
<< grads_->Get<framework::LoDTensor>().data<float>()[0];
}
std::vector<Variable*> OpBase::ApplyGrad(framework::Scope* scope) {
VLOG(3) << "op grad " << grad_op_desc_->Type(); VLOG(3) << "op grad " << grad_op_desc_->Type();
for (const std::string& grad_invar : grad_op_desc_->InputArgumentNames()) { std::vector<std::unique_ptr<framework::Variable>> tmp_vars;
if (grad_to_var_->find(grad_invar) == grad_to_var_->end()) { std::map<std::string, std::vector<framework::Variable*>> grad_outputs;
// grad op inputs can be forward inputs, so not in grad_to_var. for (auto it : grad_output_vars_) {
continue; auto& outputs = grad_outputs[it.first];
} for (size_t i = 0; i < it.second.size(); ++i) {
VLOG(3) << "op grad in var " << grad_invar; tmp_vars.emplace_back(new framework::Variable());
block_->FindRecursiveOrCreateVar(grad_invar); outputs.push_back(tmp_vars.back().get());
framework::Variable* var = scope->Var(grad_invar); outputs.back()->GetMutable<framework::LoDTensor>();
const std::string& invar = grad_to_var_->at(grad_invar);
for (VarBase* varbase : *output_vars_) {
// Use the accumulated grads_ by sharing the input with grads_.
if (varbase->var_desc_->Name() == invar) {
var->GetMutable<framework::LoDTensor>()->ShareDataWith(
varbase->grads_->Get<framework::LoDTensor>());
break;
}
} }
} }
for (const std::string& outvar : grad_op_desc_->OutputArgumentNames()) { framework::RuntimeContext ctx(grad_input_vars_, grad_outputs);
VLOG(3) << "grad outvar " << outvar;
block_->FindRecursiveOrCreateVar(outvar); // No need to do static infer shape here.
framework::Variable* var = scope->Var(outvar); // grad_op_desc_->InferShape(*block_);
if (!var->IsInitialized()) {
framework::VarDesc* var_desc = block_->FindVar(outvar);
if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) {
var->GetMutable<framework::LoDTensor>();
} else {
LOG(ERROR) << "tracer doesn't support yet";
}
}
}
grad_op_desc_->InferShape(*block_);
grad_op_desc_->InferVarType(block_); grad_op_desc_->InferVarType(block_);
std::unique_ptr<framework::OperatorBase> opbase = std::unique_ptr<framework::OperatorBase> opbase =
framework::OpRegistry::CreateOp(*grad_op_desc_); framework::OpRegistry::CreateOp(*grad_op_desc_);
framework::OperatorWithKernel* op_kernel =
dynamic_cast<framework::OperatorWithKernel*>(opbase.get());
PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel");
opbase->Run(*scope, platform::CPUPlace()); framework::Scope scope;
platform::CPUPlace place;
PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place);
p.op.RuntimeInferShape(scope, place, ctx);
p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx));
// `ret` matches exactly with `input_vars_` of forward op. for (auto it : grad_output_vars_) {
std::vector<Variable*> ret; auto& outputs = grad_outputs[it.first];
for (size_t i = 0; i < input_vars_->size(); ++i) { auto& origin_outputs = it.second;
bool found = false; for (size_t i = 0; i < outputs.size(); ++i) {
VarBase* origin_var = (*input_vars_)[i]; framework::Variable* orig_grad = origin_outputs[i];
for (const std::string& outvar : grad_op_desc_->OutputArgumentNames()) { AddTo(outputs[i], orig_grad);
Variable* var = scope->FindVar(outvar);
std::string orig_var = grad_to_var_->at(outvar);
if (origin_var->var_desc_->Name() != orig_var) {
continue;
} }
VLOG(3) << "apply grad " << outvar << " with origin " << orig_var;
origin_var->ApplyGrad(scope, var);
found = true;
ret.push_back(var);
// TODO(panyx0718): There might be another outvar with the same name.
// In that case, it doesn't matter the first one or the second one is
// used.
break;
} }
if (!found) { return input_vars_;
ret.push_back(nullptr);
}
}
return ret;
} }
void VarBase::RunBackward(framework::Scope* scope) { void VarBase::RunBackward() {
grads_ = CreateVariable(framework::GradVarName(var_desc_->Name()),
var_->Get<framework::LoDTensor>().dims(), 1.0, scope,
false);
if (!pre_op_) return; if (!pre_op_) return;
Autograd(scope).RunBackward(this);
auto grads_t = grads_->GetMutable<framework::LoDTensor>();
float* data = grads_t->mutable_data<float>(platform::CPUPlace());
std::fill(data, data + grads_t->numel(), 1.0);
PADDLE_ENFORCE(
grads_ ==
pre_op_->output_vars_[pre_op_out_name_][pre_op_out_idx_]->grads_);
Autograd().RunBackward(this);
} }
} // namespace imperative } // namespace imperative
......
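OpBase::ApplyGrad above runs the grad op into freshly created temporary output variables and only afterwards folds each result into the persistent gradient with AddTo, so repeated backward contributions accumulate rather than overwrite each other. A minimal sketch of that compute-into-scratch-then-accumulate pattern on raw float buffers (names are illustrative):

#include <cassert>
#include <vector>

// Accumulate src into dst, mirroring what imperative::AddTo does for tensors.
void AddTo(const std::vector<float>& src, std::vector<float>* dst) {
  for (size_t i = 0; i < src.size(); ++i) (*dst)[i] += src[i];
}

int main() {
  std::vector<float> grads(2, 0.0f);  // persistent grads_ of a VarBase
  for (int step = 0; step < 2; ++step) {
    std::vector<float> scratch = {1.0f, 2.0f};  // temporary grad-op output
    AddTo(scratch, &grads);                     // fold into the persistent grad
  }
  assert(grads[0] == 2.0f && grads[1] == 4.0f);
  return 0;
}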
...@@ -14,17 +14,69 @@ ...@@ -14,17 +14,69 @@
#pragma once #pragma once
#include <map>
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/var_desc.h" #include "paddle/fluid/framework/var_desc.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
namespace paddle { namespace paddle {
namespace imperative { namespace imperative {
class PreparedOp {
public:
PreparedOp(const framework::OperatorBase& op,
const framework::RuntimeContext& ctx,
framework::OperatorWithKernel::OpKernelFunc func,
platform::DeviceContext* dev_ctx)
: op(op), ctx(ctx), func(func), dev_ctx(dev_ctx) {}
static PreparedOp Prepare(const framework::RuntimeContext& ctx,
const framework::OperatorWithKernel& op,
const platform::Place& place) {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto* dev_ctx = pool.Get(place);
// check if op[type] has kernel registered.
auto& all_op_kernels = op.AllOpKernels();
auto kernels_iter = all_op_kernels.find(op.Type());
if (kernels_iter == all_op_kernels.end()) {
PADDLE_THROW(
"There are no kernels which are registered in the %s operator.",
op.Type());
}
framework::OperatorWithKernel::OpKernelMap& kernels = kernels_iter->second;
auto expected_kernel_key = op.GetExpectedKernelType(
framework::ExecutionContext(op, framework::Scope(), *dev_ctx, ctx));
VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
auto kernel_iter = kernels.find(expected_kernel_key);
#ifdef PADDLE_WITH_MKLDNN
// workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set
if (kernel_iter == kernels.end() &&
expected_kernel_key.library_type_ == framework::LibraryType::kMKLDNN) {
VLOG(3) << "missing MKLDNN kernel: falling back to the PLAIN one";
expected_kernel_key.library_type_ = framework::LibraryType::kPlain;
expected_kernel_key.data_layout_ = framework::DataLayout::kAnyLayout;
kernel_iter = kernels.find(expected_kernel_key);
}
#endif
if (kernel_iter == kernels.end()) {
PADDLE_THROW("op %s does not have kernel for %s", op.Type(),
KernelTypeToString(expected_kernel_key));
}
return PreparedOp(op, ctx, kernel_iter->second, dev_ctx);
}
const framework::OperatorBase& op;
const framework::RuntimeContext& ctx;
framework::OperatorWithKernel::OpKernelFunc func;
platform::DeviceContext* dev_ctx;
};
class OpBase; class OpBase;
class VarBase { class VarBase {
...@@ -33,18 +85,26 @@ class VarBase { ...@@ -33,18 +85,26 @@ class VarBase {
: pre_op_(nullptr), : pre_op_(nullptr),
pre_op_out_idx_(-1), pre_op_out_idx_(-1),
var_desc_(nullptr), var_desc_(nullptr),
var_(nullptr), var_(new framework::Variable()),
grads_(nullptr) {} grads_(new framework::Variable()) {}
virtual ~VarBase() {}
void ApplyGrad(framework::Scope* scope, framework::Variable* grad); virtual ~VarBase() {
if (var_) {
delete var_;
var_ = nullptr;
}
if (grads_) {
delete grads_;
grads_ = nullptr;
}
}
void RunBackward(framework::Scope* scope); void RunBackward();
framework::LoDTensor& Grad(); framework::LoDTensor& Grad();
OpBase* pre_op_; OpBase* pre_op_;
std::string pre_op_out_name_;
int pre_op_out_idx_; int pre_op_out_idx_;
framework::VarDesc* var_desc_; framework::VarDesc* var_desc_;
...@@ -54,35 +114,24 @@ class VarBase { ...@@ -54,35 +114,24 @@ class VarBase {
class OpBase { class OpBase {
public: public:
OpBase() OpBase() : op_desc_(nullptr), grad_op_desc_(nullptr) {}
: input_vars_(new std::vector<VarBase*>()),
output_vars_(new std::vector<VarBase*>()),
pre_ops_(new std::vector<OpBase*>()),
pre_ops_out_idx_(new std::vector<int>()),
op_desc_(nullptr),
grad_op_desc_(nullptr) {}
virtual ~OpBase() { virtual ~OpBase() {
delete input_vars_;
delete output_vars_;
delete pre_ops_;
delete pre_ops_out_idx_;
if (grad_op_desc_) delete grad_op_desc_; if (grad_op_desc_) delete grad_op_desc_;
if (grad_to_var_) delete grad_to_var_;
} }
std::vector<framework::Variable*> ApplyGrad(framework::Scope* scope); std::map<std::string, std::vector<VarBase*>> ApplyGrad();
std::vector<VarBase*>* input_vars_;
std::vector<VarBase*>* output_vars_;
std::vector<OpBase*>* pre_ops_;
std::vector<int>* pre_ops_out_idx_;
framework::OpDesc* op_desc_; framework::OpDesc* op_desc_;
framework::OpDesc* grad_op_desc_; framework::OpDesc* grad_op_desc_;
std::unordered_map<std::string, std::string>* grad_to_var_;
std::map<std::string, std::vector<VarBase*>> input_vars_;
std::map<std::string, std::vector<VarBase*>> output_vars_;
std::map<std::string, std::vector<OpBase*>> pre_ops_;
std::map<std::string, std::vector<int>> pre_ops_out_idx_;
std::map<std::string, std::vector<framework::Variable*>> grad_input_vars_;
std::map<std::string, std::vector<framework::Variable*>> grad_output_vars_;
framework::BlockDesc* block_; framework::BlockDesc* block_;
}; };
......
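PreparedOp::Prepare above resolves a kernel by its expected kernel key and, when an MKLDNN kernel is requested but not registered, retries with a plain-layout key before failing. A hedged sketch of that lookup-with-fallback protocol over a toy registry (the key and registry types are assumptions, not the framework's):

#include <functional>
#include <map>
#include <stdexcept>
#include <string>
#include <utility>

using Kernel = std::function<void()>;
using KernelKey = std::pair<std::string, std::string>;  // {library, layout}

Kernel Prepare(const std::map<KernelKey, Kernel>& kernels, KernelKey expected) {
  auto it = kernels.find(expected);
  if (it == kernels.end() && expected.first == "MKLDNN") {
    // Missing MKLDNN kernel: fall back to the plain / any-layout variant.
    expected = {"PLAIN", "ANY"};
    it = kernels.find(expected);
  }
  if (it == kernels.end()) throw std::runtime_error("no kernel registered");
  return it->second;
}

int main() {
  std::map<KernelKey, Kernel> kernels{{{"PLAIN", "ANY"}, [] {}}};
  Prepare(kernels, {"MKLDNN", "NCHW"})();  // falls back and runs the plain kernel
  return 0;
}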
...@@ -20,7 +20,6 @@ ...@@ -20,7 +20,6 @@
#include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/imperative/engine.h" #include "paddle/fluid/imperative/engine.h"
#include "paddle/fluid/imperative/layer.h" #include "paddle/fluid/imperative/layer.h"
...@@ -41,22 +40,28 @@ void CreateGradOp(const framework::OpDesc& op_desc, ...@@ -41,22 +40,28 @@ void CreateGradOp(const framework::OpDesc& op_desc,
*grad_op_desc = grad_op_descs[0].release(); *grad_op_desc = grad_op_descs[0].release();
} }
void InitVar(framework::Variable* var, framework::Variable* grad_var) {
auto& var_t = var->Get<framework::LoDTensor>();
float* data =
grad_var->GetMutable<framework::LoDTensor>()->mutable_data<float>(
var_t.dims(), platform::CPUPlace());
std::fill(data, data + var_t.numel(), 0.0);
}
class Tracer { class Tracer {
public: public:
explicit Tracer(framework::BlockDesc* root_block, explicit Tracer(framework::BlockDesc* root_block,
framework::BlockDesc* startup_block) framework::BlockDesc* startup_block)
: root_block_(root_block), startup_block_(startup_block) { : root_block_(root_block), startup_block_(startup_block) {}
root_scope_ = new framework::Scope();
scopes_[root_block_] = root_scope_;
scopes_[startup_block_] = root_scope_;
}
virtual ~Tracer() { delete root_scope_; } virtual ~Tracer() {}
void Trace(OpBase* op, const std::vector<VarBase*>& inputs, void Trace(OpBase* op,
const std::vector<VarBase*>& outputs, const std::map<std::string, std::vector<VarBase*>>& inputs,
const std::map<std::string, std::vector<VarBase*>>& outputs,
framework::BlockDesc* block) { framework::BlockDesc* block) {
framework::Scope* scope = GetScope(block); std::map<std::string, VarBase*> vars;
framework::OpDesc* op_desc = op->op_desc_; framework::OpDesc* op_desc = op->op_desc_;
VLOG(3) << "tracer tracing " << op_desc->Type(); VLOG(3) << "tracer tracing " << op_desc->Type();
op_desc->InferShape(*block); op_desc->InferShape(*block);
...@@ -64,77 +69,113 @@ class Tracer { ...@@ -64,77 +69,113 @@ class Tracer {
std::unique_ptr<framework::OperatorBase> op_base = std::unique_ptr<framework::OperatorBase> op_base =
framework::OpRegistry::CreateOp(*op_desc); framework::OpRegistry::CreateOp(*op_desc);
*op->input_vars_ = inputs; framework::VariableValueMap invars_map;
for (VarBase* input : inputs) { framework::VariableValueMap outvars_map;
const std::string vname = input->var_desc_->Name();
framework::Variable* var = scope->Var(vname); op->input_vars_ = inputs;
input->var_ = var; for (auto it : op->input_vars_) {
if (!var->IsInitialized()) { auto& invars = invars_map[it.first];
framework::VarDesc* var_desc = block->FindVar(vname); for (VarBase* inp : it.second) {
if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) { PADDLE_ENFORCE_NOT_NULL(inp->var_, "op %s input %s nullptr",
var->GetMutable<framework::LoDTensor>(); op->op_desc_->Type(), inp->var_desc_->Name());
invars.push_back(inp->var_);
vars[inp->var_desc_->Name()] = inp;
if (inp->pre_op_) {
op->pre_ops_[it.first].push_back(inp->pre_op_);
op->pre_ops_out_idx_[it.first].push_back(inp->pre_op_out_idx_);
} else { } else {
LOG(ERROR) << "tracer doesn't support yet"; op->pre_ops_[it.first].push_back(nullptr);
}
} }
if (input->pre_op_) { VLOG(3) << "input vname " << inp->var_desc_->Name() << " "
op->pre_ops_->push_back(input->pre_op_); << inp->var_->IsInitialized();
op->pre_ops_out_idx_->push_back(input->pre_op_out_idx_);
} else {
op->pre_ops_->push_back(nullptr);
} }
VLOG(3) << "input vname " << vname << " "
<< var->Get<framework::LoDTensor>().dims().size();
} }
*op->output_vars_ = outputs; op->output_vars_ = outputs;
for (auto it : op->output_vars_) {
auto& outvars = outvars_map[it.first];
const std::vector<VarBase*>& outputs = it.second;
for (size_t i = 0; i < outputs.size(); ++i) { for (size_t i = 0; i < outputs.size(); ++i) {
const std::string vname = outputs[i]->var_desc_->Name(); VarBase* out = outputs[i];
framework::Variable* var = scope->Var(vname); outvars.push_back(out->var_);
if (!var->IsInitialized()) { vars[out->var_desc_->Name()] = out;
framework::VarDesc* var_desc = block->FindVar(vname);
framework::VarDesc* var_desc = block->FindVar(out->var_desc_->Name());
if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) { if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) {
var->GetMutable<framework::LoDTensor>(); out->var_->GetMutable<framework::LoDTensor>();
} else { } else {
LOG(ERROR) << "tracer doesn't support yet"; LOG(ERROR) << "tracer doesn't support yet";
} }
out->pre_op_ = op;
out->pre_op_out_name_ = it.first;
out->pre_op_out_idx_ = i;
VLOG(3) << "output vname " << out->var_desc_->Name() << " "
<< out->var_->IsInitialized();
} }
outputs[i]->var_ = var;
outputs[i]->pre_op_ = op;
outputs[i]->pre_op_out_idx_ = i;
} }
VLOG(3) << "tracer running " << op_desc->Type(); VLOG(3) << "tracer running " << op_desc->Type();
op_base->Run(*scope, platform::CPUPlace()); framework::RuntimeContext ctx(invars_map, outvars_map);
// TODO(panyx0718): Cache p.
framework::OperatorWithKernel* op_kernel =
dynamic_cast<framework::OperatorWithKernel*>(op_base.get());
PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel");
framework::Scope scope;
platform::CPUPlace place;
PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place);
p.op.RuntimeInferShape(scope, place, ctx);
p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx));
if (block == startup_block_) { if (block == startup_block_) {
op->grad_op_desc_ = nullptr; op->grad_op_desc_ = nullptr;
op->grad_to_var_ = nullptr;
} else { } else {
framework::OpDesc* grad_op_desc; framework::OpDesc* grad_op_desc;
auto grad_to_var = new std::unordered_map<std::string, std::string>(); auto grad_to_var = new std::unordered_map<std::string, std::string>();
CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var); CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var);
op->grad_op_desc_ = grad_op_desc; op->grad_op_desc_ = grad_op_desc;
op->grad_to_var_ = grad_to_var;
for (auto it : grad_op_desc->Inputs()) {
auto& grad_in_vars = op->grad_input_vars_[it.first];
for (const std::string& grad_invar : it.second) {
block->FindRecursiveOrCreateVar(grad_invar);
auto var_it = grad_to_var->find(grad_invar);
if (var_it == grad_to_var->end()) {
auto fwd_var_it = vars.find(grad_invar);
PADDLE_ENFORCE(fwd_var_it != vars.end());
grad_in_vars.push_back(fwd_var_it->second->var_);
} else {
VarBase* var = vars[var_it->second];
if (!var->grads_->IsInitialized()) {
InitVar(var->var_, var->grads_);
} }
op->block_ = block; grad_in_vars.push_back(var->grads_);
} }
framework::Scope* GetScope(framework::BlockDesc* block) {
if (scopes_.find(block) != scopes_.end()) {
return scopes_.at(block);
} }
framework::BlockDesc* parent_block = block->ParentBlock(); }
PADDLE_ENFORCE(scopes_.find(parent_block) != scopes_.end()); for (auto it : grad_op_desc->Outputs()) {
framework::Scope* scope = &scopes_[parent_block]->NewScope(); auto& grad_out_vars = op->grad_output_vars_[it.first];
scopes_[block] = scope; for (const std::string& grad_outvar : it.second) {
return scope; block->FindRecursiveOrCreateVar(grad_outvar);
auto var_it = grad_to_var->find(grad_outvar);
PADDLE_ENFORCE(var_it != grad_to_var->end());
VarBase* var = vars[var_it->second];
if (!var->grads_->IsInitialized()) {
InitVar(var->var_, var->grads_);
}
grad_out_vars.push_back(var->grads_);
}
}
}
op->block_ = block;
} }
private: private:
std::map<framework::BlockDesc*, framework::Scope*> scopes_;
framework::BlockDesc* root_block_; framework::BlockDesc* root_block_;
framework::BlockDesc* startup_block_; framework::BlockDesc* startup_block_;
framework::Scope* root_scope_;
}; };
} // namespace imperative } // namespace imperative
......
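InitVar above lazily materializes a gradient variable the first time the tracer needs it, sized like its forward variable and filled with zeros. A minimal sketch of that lazy zero-initialization with plain vectors in place of LoDTensor (names are illustrative):

#include <cassert>
#include <vector>

struct Var {
  std::vector<float> data;
  bool initialized = false;
};

// Mirror of InitVar: size the grad like the forward var and zero-fill it.
void InitGrad(const Var& fwd, Var* grad) {
  grad->data.assign(fwd.data.size(), 0.0f);
  grad->initialized = true;
}

int main() {
  Var x;                       // forward variable produced by the traced op
  x.data = {1.f, 2.f, 3.f};
  x.initialized = true;
  Var x_grad;                  // gradient buffer, created on demand
  if (!x_grad.initialized) InitGrad(x, &x_grad);
  assert(x_grad.data.size() == 3 && x_grad.data[1] == 0.f);
  return 0;
}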
...@@ -86,8 +86,6 @@ class UnaryLogicalOpInferShape : public framework::InferShapeBase { ...@@ -86,8 +86,6 @@ class UnaryLogicalOpInferShape : public framework::InferShapeBase {
OpComment comment; OpComment comment;
PADDLE_ENFORCE(context->HasInput("X"), PADDLE_ENFORCE(context->HasInput("X"),
"Input(X) of %s operator must not be null", comment.type); "Input(X) of %s operator must not be null", comment.type);
auto dim_x = context->GetInputDim("X");
context->SetOutputDim("Out", context->GetInputDim("X")); context->SetOutputDim("Out", context->GetInputDim("X"));
context->ShareLoD("X", "Out"); context->ShareLoD("X", "Out");
} }
......
...@@ -19,6 +19,10 @@ limitations under the License. */ ...@@ -19,6 +19,10 @@ limitations under the License. */
#include <vector> #include <vector>
#include "paddle/fluid/platform/cudnn_helper.h" #include "paddle/fluid/platform/cudnn_helper.h"
DECLARE_uint64(conv_workspace_size_limit);
DECLARE_bool(cudnn_exhaustive_search);
DECLARE_int64(cudnn_exhaustive_search_times);
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -45,6 +49,7 @@ static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS = 5; ...@@ -45,6 +49,7 @@ static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS = 5;
template <typename TAlgorithm> template <typename TAlgorithm>
class AlgorithmsCache { class AlgorithmsCache {
public: public:
AlgorithmsCache() : search_times_(0) { hash_.clear(); }
// Caches the best algorithm for a given // Caches the best algorithm for a given
// combination of tensor dimensions & compute data type. // combination of tensor dimensions & compute data type.
TAlgorithm GetAlgorithm( TAlgorithm GetAlgorithm(
...@@ -54,9 +59,14 @@ class AlgorithmsCache { ...@@ -54,9 +59,14 @@ class AlgorithmsCache {
int algorithmFlags, // can set for different data type int algorithmFlags, // can set for different data type
std::function<TAlgorithm()> gen_func); std::function<TAlgorithm()> gen_func);
TAlgorithm GetAlgorithm(int64_t area, int search_times, int algorithmFlags,
std::function<TAlgorithm()> gen_func);
private: private:
std::unordered_map<int64_t, TAlgorithm> hash_; std::unordered_map<int64_t, TAlgorithm> hash_;
std::mutex mutex_; std::mutex mutex_;
int search_times_;
}; };
template <typename TAlgorithm> template <typename TAlgorithm>
...@@ -107,5 +117,29 @@ TAlgorithm AlgorithmsCache<TAlgorithm>::GetAlgorithm( ...@@ -107,5 +117,29 @@ TAlgorithm AlgorithmsCache<TAlgorithm>::GetAlgorithm(
return hash_[seed]; return hash_[seed];
} }
template <typename TAlgorithm>
TAlgorithm AlgorithmsCache<TAlgorithm>::GetAlgorithm(
int64_t area, int search_times, int algorithmFlags,
std::function<TAlgorithm()> gen_func) {
if (hash_.find(area) != hash_.end()) {
return hash_[area];
}
if (search_times_ < search_times) {
auto algo = gen_func();
hash_[area] = algo;
++search_times_;
return algo;
}
TAlgorithm algo;
int64_t min = static_cast<uint64_t>(INT_MAX);
for (const auto& m : hash_) {
if (m.first < min) {
min = m.first;
algo = m.second;
}
}
return algo;
}
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
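The new area-keyed GetAlgorithm overload above runs the expensive gen_func for at most search_times distinct input areas, caches each result, and afterwards answers from the cache (picking the entry with the smallest stored area). A hedged usage sketch of that budgeted-search cache shape (BudgetedCache is a simplified stand-in, not the Paddle template):

#include <cstdint>
#include <functional>
#include <map>

struct BudgetedCache {
  std::map<int64_t, int> cached;  // input area -> algorithm id
  int searches_done = 0;

  int Get(int64_t area, int budget, const std::function<int()>& search) {
    auto it = cached.find(area);
    if (it != cached.end()) return it->second;  // hit: reuse cached algo
    if (searches_done < budget) {               // budget left: search and cache
      int algo = search();
      cached[area] = algo;
      ++searches_done;
      return algo;
    }
    if (cached.empty()) return search();        // nothing cached yet
    return cached.begin()->second;              // smallest cached area wins
  }
};

int main() {
  BudgetedCache cache;
  auto exhaustive_search = [] { return 42; };   // stands in for cudnnFind*
  int a = cache.Get(224 * 224, /*budget=*/1, exhaustive_search);  // searches
  int b = cache.Get(112 * 112, /*budget=*/1, exhaustive_search);  // cached path
  return (a == 42 && b == 42) ? 0 : 1;
}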
...@@ -28,6 +28,8 @@ namespace operators { ...@@ -28,6 +28,8 @@ namespace operators {
// x is Input, // x is Input,
// z is ResidualData, // z is ResidualData,
// bias is Bias // bias is Bias
// When `split_channels` is set, y will be split into multiple outputs,
// each output having split_channels[i] channels.
class Conv2DFusionOpMaker : public Conv2DOpMaker { class Conv2DFusionOpMaker : public Conv2DOpMaker {
protected: protected:
void Apply() override { void Apply() override {
...@@ -36,8 +38,65 @@ class Conv2DFusionOpMaker : public Conv2DOpMaker { ...@@ -36,8 +38,65 @@ class Conv2DFusionOpMaker : public Conv2DOpMaker {
"The activation type can be 'identity', 'sigmoid', 'relu', 'relu6' " "The activation type can be 'identity', 'sigmoid', 'relu', 'relu6' "
"'relux' , 'tanh', 'band_pass'") "'relux' , 'tanh', 'band_pass'")
.SetDefault("relu"); .SetDefault("relu");
    AddAttr<std::vector<int>>(
        "split_channels",
        "When `split_channels` is set, there will be multiple outputs; the "
        "number of outputs equals the number of entries in `split_channels`.")
.SetDefault({});
    AddOutput("Outputs",
              "Outputs is used when `split_channels` is set. "
              "Usually used to fuse convs that share the same input and the "
              "same filter size, padding, stride, and dilation.")
.AsDuplicable()
.AsDispensable();
AddInput("AlgoCache",
"The cache of convolution algorithm, a RAW type variable.")
.AsDispensable();
AddAttr<int>(
"search_times",
"The number of exhaustive search times for convolution algorithm.")
.SetDefault(-1);
} }
}; };
class Conv2DFusionOpInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Input"),
"Input(Input) of ConvOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Filter"),
"Input(Filter) of ConvOp should not be null.");
auto in_dims = ctx->GetInputDim("Input");
auto filter_dims = ctx->GetInputDim("Filter");
std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
std::vector<int> dilations =
ctx->Attrs().Get<std::vector<int>>("dilations");
std::vector<int64_t> oshape({in_dims[0], filter_dims[0]});
for (size_t i = 0; i < strides.size(); ++i) {
oshape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
dilations[i], paddings[i], strides[i]));
}
PADDLE_ENFORCE(ctx->HasOutput("Output"),
"Output(Output) of ConvOp should not be null.");
ctx->SetOutputDim("Output", framework::make_ddim(oshape));
std::vector<int> channels =
ctx->Attrs().Get<std::vector<int>>("split_channels");
if (channels.size()) {
PADDLE_ENFORCE(ctx->HasOutputs("Outputs"),
"Output(Outputs) of ConvOp should not be null.");
std::vector<framework::DDim> oshapes;
oshapes.reserve(channels.size());
for (size_t i = 0; i < channels.size(); ++i) {
oshapes.push_back({oshape[0], channels[i], oshape[2], oshape[3]});
}
ctx->SetOutputsDim("Outputs", oshapes);
}
}
};
// TODO(qingqing): add gradient operator for conv2d_fusion // TODO(qingqing): add gradient operator for conv2d_fusion
} // namespace operators } // namespace operators
...@@ -45,4 +104,5 @@ class Conv2DFusionOpMaker : public Conv2DOpMaker { ...@@ -45,4 +104,5 @@ class Conv2DFusionOpMaker : public Conv2DOpMaker {
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR(conv2d_fusion, ops::ConvOp, ops::Conv2DFusionOpMaker, REGISTER_OPERATOR(conv2d_fusion, ops::ConvOp, ops::Conv2DFusionOpMaker,
ops::ConvOpInferVarType, paddle::framework::EmptyGradOpMaker); ops::Conv2DFusionOpInferShape, ops::ConvOpInferVarType,
paddle::framework::EmptyGradOpMaker);
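Conv2DFusionOpInferShape above derives each spatial output size from ConvOutputSize. Assuming the usual convolution arithmetic, out = (in + 2*pad - (dilation*(kernel-1) + 1)) / stride + 1; a small numeric check of that formula (ConvOut below is a local assumption mirroring that arithmetic, not the Paddle helper itself):

#include <cassert>

// Standard convolution output-size arithmetic.
int ConvOut(int in, int kernel, int dilation, int pad, int stride) {
  int dkernel = dilation * (kernel - 1) + 1;
  return (in + 2 * pad - dkernel) / stride + 1;
}

int main() {
  // 224x224 input, 3x3 filter, stride 2, pad 1, dilation 1 -> 112x112.
  assert(ConvOut(224, 3, 1, 1, 2) == 112);
  // With split_channels = {32, 32}, a 64-channel Output of that size would be
  // split into two NCHW Outputs with 32 channels each.
  assert(ConvOut(7, 3, 1, 0, 1) == 5);
  return 0;
}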
...@@ -16,8 +16,9 @@ limitations under the License. */ ...@@ -16,8 +16,9 @@ limitations under the License. */
#include "paddle/fluid/operators/conv_cudnn_op_cache.h" #include "paddle/fluid/operators/conv_cudnn_op_cache.h"
#include "paddle/fluid/platform/cudnn_helper.h" #include "paddle/fluid/platform/cudnn_helper.h"
DECLARE_uint64(conv_workspace_size_limit); DEFINE_int64(cudnn_exhaustive_search_times, -1,
DECLARE_bool(cudnn_exhaustive_search); "Exhaustive search times for cuDNN convolution, "
"defalut is 1, only search once.");
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -117,20 +118,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> { ...@@ -117,20 +118,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
workspace_size_limit, &algo)); workspace_size_limit, &algo));
VLOG(3) << "cuDNN forward algo " << algo; VLOG(3) << "cuDNN forward algo " << algo;
} else { } else {
AlgorithmsCache<cudnnConvolutionFwdAlgo_t>* algo_cache = nullptr; auto search_func = [&]() {
if (ctx.scope().FindVar(kCUDNNFwdAlgoCache)) {
algo_cache =
ctx.scope()
.FindVar(kCUDNNFwdAlgoCache)
->GetMutable<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>();
} else {
algo_cache =
const_cast<framework::Scope&>(ctx.scope())
.Var(kCUDNNFwdAlgoCache)
->GetMutable<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>();
}
algo = algo_cache->GetAlgorithm(
x_dims, f_dims, strides, paddings, dilations, 0, [&]() {
int returned_algo_count; int returned_algo_count;
std::array<cudnnConvolutionFwdAlgoPerf_t, kNUM_CUDNN_FWD_ALGS> std::array<cudnnConvolutionFwdAlgoPerf_t, kNUM_CUDNN_FWD_ALGS>
fwd_perf_stat; fwd_perf_stat;
...@@ -138,20 +126,52 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> { ...@@ -138,20 +126,52 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
CUDNN_ENFORCE( CUDNN_ENFORCE(
platform::dynload::cudnnFindConvolutionForwardAlgorithmEx( platform::dynload::cudnnFindConvolutionForwardAlgorithmEx(
handle, cudnn_input_desc, input_data, cudnn_filter_desc, handle, cudnn_input_desc, input_data, cudnn_filter_desc,
filter_data, cudnn_conv_desc, cudnn_output_desc, filter_data, cudnn_conv_desc, cudnn_output_desc, output_data,
output_data, kNUM_CUDNN_FWD_ALGS, &returned_algo_count, kNUM_CUDNN_FWD_ALGS, &returned_algo_count,
fwd_perf_stat.data(), cudnn_workspace, fwd_perf_stat.data(), cudnn_workspace, workspace_size_limit));
workspace_size_limit));
}; };
workspace_handle.RunFunc(cudnn_find_func, workspace_size_limit); workspace_handle.RunFunc(cudnn_find_func, workspace_size_limit);
VLOG(3) << "Perf result: (algo: stat, time, memory)"; VLOG(3) << "Perf result: (algo: stat, time, memory)";
for (int i = 0; i < returned_algo_count; ++i) { for (int i = 0; i < returned_algo_count; ++i) {
const auto& stat = fwd_perf_stat[i]; const auto& stat = fwd_perf_stat[i];
VLOG(3) << stat.algo << ": " << stat.status << " " << stat.time VLOG(3) << stat.algo << ": " << stat.status << " " << stat.time << " "
<< " " << stat.memory; << stat.memory;
} }
return fwd_perf_stat[0].algo; return fwd_perf_stat[0].algo;
}); };
AlgorithmsCache<cudnnConvolutionFwdAlgo_t>* algo_cache = nullptr;
int search_times = ctx.Attr<int>("search_times");
search_times = std::max(
static_cast<int>(FLAGS_cudnn_exhaustive_search_times), search_times);
if (search_times > 0) {
      // The searched algorithm is cached for up to `search_times` different
      // input dimensions. For other dimensions, the cached algorithm whose
      // input area is closest is selected.
auto var_name = ctx.Inputs("AlgoCache")[0];
algo_cache =
ctx.scope()
.FindVar(var_name)
->GetMutable<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>();
algo = algo_cache->GetAlgorithm(x_dims[2] * x_dims[3], search_times, 0,
search_func);
} else {
      // Cache the searched algorithm in Var(kCUDNNFwdAlgoCache).
      // All conv ops share the same kCUDNNFwdAlgoCache variable.
if (ctx.scope().FindVar(kCUDNNFwdAlgoCache)) {
algo_cache =
ctx.scope()
.FindVar(kCUDNNFwdAlgoCache)
->GetMutable<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>();
} else {
// TODO(qingqing) remove const_cast
algo_cache =
const_cast<framework::Scope*>(ctx.scope().parent())
->Var(kCUDNNFwdAlgoCache)
->GetMutable<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>();
}
algo = algo_cache->GetAlgorithm(x_dims, f_dims, strides, paddings,
dilations, 0, search_func);
}
VLOG(3) << "choose algo " << algo; VLOG(3) << "choose algo " << algo;
} }
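When `search_times > 0`, the algorithm cache above is keyed by the input area (`x_dims[2] * x_dims[3]`) rather than the full shape tuple. A minimal sketch of that idea, assuming a simplified stand-in for `AlgorithmsCache` (this is not the actual implementation):

```cpp
#include <cstdint>
#include <functional>
#include <map>

// Hypothetical simplified cache: remembers one algorithm per observed input
// area, up to max_entries entries; once full, unseen areas reuse the entry
// whose area is closest instead of triggering another exhaustive search.
template <typename AlgoT>
class AreaAlgoCache {
 public:
  AlgoT Get(int64_t area, int max_entries,
            const std::function<AlgoT()>& search) {
    auto it = cache_.find(area);
    if (it != cache_.end()) return it->second;
    if (static_cast<int>(cache_.size()) < max_entries) {
      return cache_[area] = search();  // exhaustive search, then remember
    }
    // Cache full: fall back to the cached algorithm with the closest area.
    AlgoT best{};
    int64_t best_dist = -1;
    for (const auto& kv : cache_) {
      int64_t dist = kv.first > area ? kv.first - area : area - kv.first;
      if (best_dist < 0 || dist < best_dist) {
        best_dist = dist;
        best = kv.second;
      }
    }
    return best;
  }

 private:
  std::map<int64_t, AlgoT> cache_;
};
```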
...@@ -195,6 +215,27 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> { ...@@ -195,6 +215,27 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
}; };
workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
} }
std::vector<int> channels = ctx.Attr<std::vector<int>>("split_channels");
if (channels.size()) {
auto outs = ctx.MultiOutput<framework::Tensor>("Outputs");
if (x_dims[0] == 1) {
// share data with Output
framework::Tensor t;
t.ShareDataWith(*output);
auto y_dims = output->dims();
t.Resize({y_dims[1], y_dims[2], y_dims[3]});
int s = 0;
for (size_t i = 0; i < channels.size(); ++i) {
int e = s + channels[i];
outs[i]->ShareDataWith(t.Slice(s, e));
outs[i]->Resize({x_dims[0], channels[i], y_dims[2], y_dims[3]});
s = e;
}
} else {
        // TODO(qingqing): do the copy when batch size is larger than 1
        PADDLE_THROW("Batch size greater than 1 is unsupported");
}
}
} }
}; };
#endif #endif
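The `split_channels` branch above slices the fused output along the channel axis without copying: with batch size 1, each `Outputs[i]` shares a contiguous range of channels of `Output`. A standalone sketch of the same bookkeeping (offsets are in elements; names are illustrative only):

```cpp
#include <cstdint>
#include <vector>

struct ChannelSlice {
  int64_t offset;  // element offset into the fused [1, C, H, W] buffer
  int64_t numel;   // number of elements in this split
};

// Example: H = W = 16, split_channels = {32, 32, 64} -> three views into one
// [1, 128, 16, 16] buffer, mirroring ShareDataWith + Slice with no data copy.
std::vector<ChannelSlice> split_by_channels(
    int64_t H, int64_t W, const std::vector<int>& split_channels) {
  std::vector<ChannelSlice> slices;
  int64_t start = 0;
  for (int c : split_channels) {
    slices.push_back({start * H * W, static_cast<int64_t>(c) * H * W});
    start += c;
  }
  return slices;
}
```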
......
...@@ -68,7 +68,6 @@ void CropFunction(const framework::ExecutionContext& context) { ...@@ -68,7 +68,6 @@ void CropFunction(const framework::ExecutionContext& context) {
} }
out->mutable_data<T>(out_dims, context.GetPlace()); out->mutable_data<T>(out_dims, context.GetPlace());
auto x_stride = framework::stride(x->dims()); auto x_stride = framework::stride(x->dims());
auto out_stride = framework::stride(out->dims());
auto offsets = GetOffsets(context); auto offsets = GetOffsets(context);
int64_t offset = 0; int64_t offset = 0;
for (size_t i = 0; i < offsets.size(); ++i) { for (size_t i = 0; i < offsets.size(); ++i) {
......
...@@ -147,7 +147,6 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel<T> { ...@@ -147,7 +147,6 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel<T> {
->GetMutable<CudnnRNNCache>(); ->GetMutable<CudnnRNNCache>();
auto input_dims = input->dims(); auto input_dims = input->dims();
auto weight_dims = weight->dims();
auto init_h_dims = init_h->dims(); auto init_h_dims = init_h->dims();
auto init_c_dims = init_c->dims(); auto init_c_dims = init_c->dims();
in_grad->mutable_data<T>(ctx.GetPlace()); in_grad->mutable_data<T>(ctx.GetPlace());
......
...@@ -27,8 +27,8 @@ struct StridedMemcpyFunctor; ...@@ -27,8 +27,8 @@ struct StridedMemcpyFunctor;
template <typename T> template <typename T>
struct StridedMemcpyFunctor<T, 0> { struct StridedMemcpyFunctor<T, 0> {
void operator()(const platform::DeviceContext& dev_ctx, const T* src, void operator()(const platform::DeviceContext& dev_ctx, const T* src,
framework::Dim<0> src_stride, framework::Dim<0> dst_dim, const int64_t* src_stride, const int64_t* dst_dim,
framework::Dim<0> dst_stride, T* dst) const { const int64_t* dst_stride, T* dst) const {
auto place = dev_ctx.GetPlace(); auto place = dev_ctx.GetPlace();
if (platform::is_cpu_place(place)) { if (platform::is_cpu_place(place)) {
auto& cpu_place = boost::get<platform::CPUPlace>(place); auto& cpu_place = boost::get<platform::CPUPlace>(place);
...@@ -50,18 +50,18 @@ struct StridedMemcpyFunctor<T, 0> { ...@@ -50,18 +50,18 @@ struct StridedMemcpyFunctor<T, 0> {
template <typename T> template <typename T>
struct StridedMemcpyFunctor<T, 1> { struct StridedMemcpyFunctor<T, 1> {
void operator()(const platform::DeviceContext& dev_ctx, const T* src, void operator()(const platform::DeviceContext& dev_ctx, const T* src,
framework::Dim<1> src_stride, framework::Dim<1> dst_dim, const int64_t* src_stride, const int64_t* dst_dim,
framework::Dim<1> dst_stride, T* dst) const { const int64_t* dst_stride, T* dst) const {
auto place = dev_ctx.GetPlace(); auto place = dev_ctx.GetPlace();
if (platform::is_cpu_place(place)) { if (platform::is_cpu_place(place)) {
auto& cpu_place = boost::get<platform::CPUPlace>(place); auto& cpu_place = boost::get<platform::CPUPlace>(place);
memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T) * dst_dim.head); memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T) * dst_dim[0]);
} else { } else {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
auto& gpu_place = boost::get<platform::CUDAPlace>(place); auto& gpu_place = boost::get<platform::CUDAPlace>(place);
auto& cuda_ctx = auto& cuda_ctx =
reinterpret_cast<const platform::CUDADeviceContext&>(dev_ctx); reinterpret_cast<const platform::CUDADeviceContext&>(dev_ctx);
memory::Copy(gpu_place, dst, gpu_place, src, sizeof(T) * dst_dim.head, memory::Copy(gpu_place, dst, gpu_place, src, sizeof(T) * dst_dim[0],
cuda_ctx.stream()); cuda_ctx.stream());
#else #else
PADDLE_THROW("Paddle is not compiled with GPU"); PADDLE_THROW("Paddle is not compiled with GPU");
...@@ -73,19 +73,19 @@ struct StridedMemcpyFunctor<T, 1> { ...@@ -73,19 +73,19 @@ struct StridedMemcpyFunctor<T, 1> {
template <typename T, int Rank> template <typename T, int Rank>
struct StridedMemcpyFunctor { struct StridedMemcpyFunctor {
void operator()(const platform::DeviceContext& dev_ctx, const T* src, void operator()(const platform::DeviceContext& dev_ctx, const T* src,
framework::Dim<Rank> src_stride, framework::Dim<Rank> dst_dim, const int64_t* src_stride, const int64_t* dst_dim,
framework::Dim<Rank> dst_stride, T* dst) const { const int64_t* dst_stride, T* dst) const {
for (int64_t i = 0; i < dst_dim.head; ++i) { for (int64_t i = 0; i < dst_dim[0]; ++i) {
StridedMemcpyFunctor<T, Rank - 1> func; StridedMemcpyFunctor<T, Rank - 1> func;
func(dev_ctx, src, src_stride.tail, dst_dim.tail, dst_stride.tail, dst); func(dev_ctx, src, src_stride + 1, dst_dim + 1, dst_stride + 1, dst);
src += src_stride.head; src += src_stride[0];
dst += dst_stride.head; dst += dst_stride[0];
} }
} }
}; };
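The refactor above replaces `framework::Dim<N>` head/tail recursion with raw `int64_t*` stride and dim pointers. A CPU-only sketch of the same recursion, written with a runtime rank purely for illustration (the real functor keeps the rank as a template parameter and also handles CUDA places):

```cpp
#include <cstdint>
#include <cstring>

// Copies a dst-shaped block out of a strided src buffer. Rank 0 copies one
// element, rank 1 copies dst_dim[0] contiguous elements, and higher ranks
// recurse while advancing src/dst by their leading strides.
template <typename T>
void strided_copy(const T* src, const int64_t* src_stride,
                  const int64_t* dst_dim, const int64_t* dst_stride, T* dst,
                  int rank) {
  if (rank == 0) {
    std::memcpy(dst, src, sizeof(T));
    return;
  }
  if (rank == 1) {
    std::memcpy(dst, src, sizeof(T) * dst_dim[0]);
    return;
  }
  for (int64_t i = 0; i < dst_dim[0]; ++i) {
    strided_copy(src, src_stride + 1, dst_dim + 1, dst_stride + 1, dst,
                 rank - 1);
    src += src_stride[0];
    dst += dst_stride[0];
  }
}
```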
template <typename T> template <typename T>
struct StridedCopyDimVisitor : public boost::static_visitor<void> { struct StridedCopyDimVisitor {
StridedCopyDimVisitor(const platform::DeviceContext& dev_ctx, const T* src, StridedCopyDimVisitor(const platform::DeviceContext& dev_ctx, const T* src,
const framework::DDim& src_stride, const framework::DDim& src_stride,
const framework::DDim& dst_stride, T* dst) const framework::DDim& dst_stride, T* dst)
...@@ -95,13 +95,11 @@ struct StridedCopyDimVisitor : public boost::static_visitor<void> { ...@@ -95,13 +95,11 @@ struct StridedCopyDimVisitor : public boost::static_visitor<void> {
dst_stride_(dst_stride), dst_stride_(dst_stride),
dst_(dst) {} dst_(dst) {}
template <typename Dim> template <int D>
void operator()(Dim dst_dim) const { void operator()(const framework::Dim<D>& dst_dim) const {
Dim src_stride = boost::get<Dim>(src_stride_); StridedMemcpyFunctor<T, D> functor;
Dim dst_stride = boost::get<Dim>(dst_stride_); functor(dev_ctx_, src_, src_stride_.Get(), dst_dim.Get(), dst_stride_.Get(),
constexpr int dim = Dim::dimensions; dst_);
StridedMemcpyFunctor<T, dim> functor;
functor(dev_ctx_, src_, src_stride, dst_dim, dst_stride, dst_);
} }
const platform::DeviceContext& dev_ctx_; const platform::DeviceContext& dev_ctx_;
......
...@@ -64,8 +64,6 @@ class GenerateProposalLabelsOp : public framework::OperatorWithKernel { ...@@ -64,8 +64,6 @@ class GenerateProposalLabelsOp : public framework::OperatorWithKernel {
"Output(BboxOutsideWeights) of RpnTargetAssignOp should not be null"); "Output(BboxOutsideWeights) of RpnTargetAssignOp should not be null");
auto rpn_rois_dims = ctx->GetInputDim("RpnRois"); auto rpn_rois_dims = ctx->GetInputDim("RpnRois");
auto gt_classes_dims = ctx->GetInputDim("GtClasses");
auto is_crowd_dims = ctx->GetInputDim("IsCrowd");
auto gt_boxes_dims = ctx->GetInputDim("GtBoxes"); auto gt_boxes_dims = ctx->GetInputDim("GtBoxes");
auto im_info_dims = ctx->GetInputDim("ImInfo"); auto im_info_dims = ctx->GetInputDim("ImInfo");
......
...@@ -53,12 +53,6 @@ class GenerateProposalsOp : public framework::OperatorWithKernel { ...@@ -53,12 +53,6 @@ class GenerateProposalsOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE(ctx->HasInput("Variances"), PADDLE_ENFORCE(ctx->HasInput("Variances"),
"Input(Variances) shouldn't be null."); "Input(Variances) shouldn't be null.");
auto scores_dims = ctx->GetInputDim("Scores");
auto bbox_deltas_dims = ctx->GetInputDim("BboxDeltas");
auto im_info_dims = ctx->GetInputDim("ImInfo");
auto anchors_dims = ctx->GetInputDim("Anchors");
auto variances_dims = ctx->GetInputDim("Variances");
ctx->SetOutputDim("RpnRois", {-1, 4}); ctx->SetOutputDim("RpnRois", {-1, 4});
ctx->SetOutputDim("RpnRoiProbs", {-1, 1}); ctx->SetOutputDim("RpnRoiProbs", {-1, 1});
} }
......
...@@ -58,7 +58,6 @@ class RpnTargetAssignOp : public framework::OperatorWithKernel { ...@@ -58,7 +58,6 @@ class RpnTargetAssignOp : public framework::OperatorWithKernel {
auto anchor_dims = ctx->GetInputDim("Anchor"); auto anchor_dims = ctx->GetInputDim("Anchor");
auto gt_boxes_dims = ctx->GetInputDim("GtBoxes"); auto gt_boxes_dims = ctx->GetInputDim("GtBoxes");
auto is_crowd_dims = ctx->GetInputDim("IsCrowd");
auto im_info_dims = ctx->GetInputDim("ImInfo"); auto im_info_dims = ctx->GetInputDim("ImInfo");
PADDLE_ENFORCE_EQ(anchor_dims.size(), 2, PADDLE_ENFORCE_EQ(anchor_dims.size(), 2,
"The rank of Input(Anchor) must be 2."); "The rank of Input(Anchor) must be 2.");
......
...@@ -7,56 +7,52 @@ if(WITH_GRPC) ...@@ -7,56 +7,52 @@ if(WITH_GRPC)
else() else()
set(cc_generic_services "true") set(cc_generic_services "true")
endif() endif()
configure_file(send_recv.proto.in ${CMAKE_CURRENT_SOURCE_DIR}/send_recv.proto @ONLY) configure_file(send_recv.proto.in ${CMAKE_CURRENT_BINARY_DIR}/send_recv.proto @ONLY)
# FIXME(typhoonzero): use add_subdirectory once we clean the dependency of these files
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
if(WITH_GRPC) if(WITH_GRPC)
grpc_library(sendrecvop_rpc SRCS grpc_bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc set(GRPC_SRCS grpc/grpc_client.cc grpc/grpc_server.cc grpc/grpc_serde.cc grpc/grpc_bytebuffer_stream.cc grpc/grpc_variable_response.cc)
request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc grpc_variable_response.cc grpc_serde.cc collective_client.cc collective_server.cc grpc_library(sendrecvop_rpc SRCS sendrecvop_utils.cc
PROTO send_recv.proto request_handler_impl.cc rpc_client.cc rpc_server.cc
variable_response.cc
collective_client.cc collective_server.cc
${GRPC_SRCS}
PROTO ${CMAKE_CURRENT_BINARY_DIR}/send_recv.proto
DEPS lod_tensor selected_rows_functor memory) DEPS lod_tensor selected_rows_functor memory)
set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set(RPC_DEPS sendrecvop_rpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf)
cc_test(grpc_serde_test SRCS grpc_serde_test.cc cc_test(grpc_serde_test SRCS grpc/grpc_serde_test.cc
DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_rpc scope profiler math_function SERIAL) DEPS ${RPC_DEPS} scope profiler math_function SERIAL)
cc_test(rpc_server_test SRCS rpc_server_test.cc
DEPS sendrecvop_rpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_sparse_table_op SERIAL)
cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler)
if(WITH_GPU)
cc_test(collective_server_test SRCS collective_server_test.cc
DEPS sendrecvop_rpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor
selected_rows_functor scope math_function SERIAL)
endif()
cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory)
else() else()
set_source_files_properties(brpc_server.cc parameter_prefetch.cc brpc_client.cc rpc_server_test.cc brpc_serde_test.cc set_source_files_properties(brpc_server.cc parameter_prefetch.cc brpc_client.cc rpc_server_test.cc brpc_serde_test.cc
brpc_variable_response.cc brpc_sendrecvop_utils.cc brpc_rdma_pool.cc collective_server.cc collective_server_test.cc brpc_variable_response.cc brpc_sendrecvop_utils.cc brpc_rdma_pool.cc collective_server.cc collective_server_test.cc
collective_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) collective_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
  brpc_library(sendrecvop_rpc SRCS brpc_client.cc brpc_server.cc rpc_server.cc rpc_client.cc request_handler_impl.cc brpc_sendrecvop_utils.cc   set(BRPC_SRCS brpc/brpc_client.cc brpc/brpc_server.cc brpc/brpc_sendrecvop_utils.cc brpc/brpc_variable_response.cc brpc/brpc_rdma_pool.cc)
brpc_variable_response.cc variable_response.cc sendrecvop_utils.cc brpc_rdma_pool.cc collective_client.cc collective_server.cc brpc_library(sendrecvop_rpc SRCS sendrecvop_utils.cc
PROTO send_recv.proto request_handler_impl.cc rpc_client.cc rpc_server.cc
variable_response.cc
collective_client.cc collective_server.cc
${BRPC_SRCS}
PROTO ${CMAKE_CURRENT_BINARY_DIR}/send_recv.proto
DEPS lod_tensor selected_rows memory) DEPS lod_tensor selected_rows memory)
cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory) set(RPC_DEPS sendrecvop_rpc brpc ssl crypto protobuf leveldb snappystream snappy zlib)
cc_test(brpc_serde_test SRCS brpc/brpc_serde_test.cc
set(brpc_test_depends sendrecvop_rpc brpc ssl crypto protobuf leveldb gflags glog executor DEPS ${RPC_DEPS} gflags glog executor proto_desc lookup_sparse_table_op SERIAL)
proto_desc lookup_sparse_table_op snappystream snappy zlib) endif()
cc_test(rpc_server_test SRCS rpc_server_test.cc
DEPS ${brpc_test_depends} SERIAL)
cc_test(brpc_serde_test SRCS brpc_serde_test.cc
DEPS ${brpc_test_depends} SERIAL)
if(WITH_GPU) cc_test(rpc_server_test SRCS rpc_server_test.cc
DEPS ${RPC_DEPS} executor proto_desc lookup_sparse_table_op SERIAL)
cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler)
cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory)
if(WITH_GPU)
cc_test(collective_server_test SRCS collective_server_test.cc cc_test(collective_server_test SRCS collective_server_test.cc
DEPS ${brpc_test_depends} selected_rows_functor scope math_function SERIAL) DEPS sendrecvop_rpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor
endif() selected_rows_functor scope math_function SERIAL)
endif() endif()
...@@ -12,9 +12,9 @@ ...@@ -12,9 +12,9 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/operators/distributed/brpc_client.h" #include "paddle/fluid/operators/distributed/brpc/brpc_client.h"
#include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h" #include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
namespace paddle { namespace paddle {
......
...@@ -31,10 +31,10 @@ limitations under the License. */ ...@@ -31,10 +31,10 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h" #include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h"
#include "paddle/fluid/operators/distributed/distributed_pb.h"
#include "paddle/fluid/operators/distributed/request_handler.h" #include "paddle/fluid/operators/distributed/request_handler.h"
#include "paddle/fluid/operators/distributed/rpc_client.h" #include "paddle/fluid/operators/distributed/rpc_client.h"
#include "paddle/fluid/operators/distributed/send_recv.pb.h"
#include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN #include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN
namespace paddle { namespace paddle {
......
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
#ifdef PADDLE_WITH_BRPC_RDMA #ifdef PADDLE_WITH_BRPC_RDMA
#include "paddle/fluid/operators/distributed/brpc_rdma_pool.h" #include "paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.h"
#include "brpc/channel.h" #include "brpc/channel.h"
#include "brpc/rdma/rdma_helper.h" #include "brpc/rdma/rdma_helper.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
......
...@@ -20,10 +20,10 @@ limitations under the License. */ ...@@ -20,10 +20,10 @@ limitations under the License. */
#include <thread> // NOLINT #include <thread> // NOLINT
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/operators/distributed/brpc_rdma_pool.h" #include "paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.h"
#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h" #include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h"
#include "paddle/fluid/operators/distributed/brpc_variable_response.h" #include "paddle/fluid/operators/distributed/brpc/brpc_variable_response.h"
#include "paddle/fluid/operators/distributed/send_recv.pb.h" #include "paddle/fluid/operators/distributed/distributed_pb.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
namespace paddle { namespace paddle {
......
...@@ -26,7 +26,7 @@ limitations under the License. */ ...@@ -26,7 +26,7 @@ limitations under the License. */
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/distributed/send_recv.pb.h" #include "paddle/fluid/operators/distributed/distributed_pb.h"
#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
namespace paddle { namespace paddle {
......
...@@ -22,8 +22,8 @@ limitations under the License. */ ...@@ -22,8 +22,8 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/framework/variable.h" #include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h" #include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h"
#include "paddle/fluid/operators/distributed/brpc_variable_response.h" #include "paddle/fluid/operators/distributed/brpc/brpc_variable_response.h"
#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
#include "paddle/fluid/operators/distributed/variable_response.h" #include "paddle/fluid/operators/distributed/variable_response.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
......
...@@ -12,10 +12,10 @@ ...@@ -12,10 +12,10 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/operators/distributed/brpc_server.h" #include "paddle/fluid/operators/distributed/brpc/brpc_server.h"
#include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h" #include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h"
#include "paddle/fluid/operators/distributed/brpc_variable_response.h" #include "paddle/fluid/operators/distributed/brpc/brpc_variable_response.h"
#include "paddle/fluid/operators/distributed/request_handler.h" #include "paddle/fluid/operators/distributed/request_handler.h"
namespace sendrecv { namespace sendrecv {
......
...@@ -19,8 +19,8 @@ limitations under the License. */ ...@@ -19,8 +19,8 @@ limitations under the License. */
#include <string> #include <string>
#include "brpc/server.h" #include "brpc/server.h"
#include "paddle/fluid/operators/distributed/distributed_pb.h"
#include "paddle/fluid/operators/distributed/rpc_server.h" #include "paddle/fluid/operators/distributed/rpc_server.h"
#include "paddle/fluid/operators/distributed/send_recv.pb.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
// limitations under the License. // limitations under the License.
// //
#include "paddle/fluid/operators/distributed/brpc_variable_response.h" #include "paddle/fluid/operators/distributed/brpc/brpc_variable_response.h"
#include "paddle/fluid/operators/distributed/send_recv.pb.h" #include "paddle/fluid/operators/distributed/send_recv.pb.h"
namespace paddle { namespace paddle {
......
...@@ -23,7 +23,7 @@ ...@@ -23,7 +23,7 @@
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/distributed/send_recv.pb.h" #include "paddle/fluid/operators/distributed/distributed_pb.h"
#include "google/protobuf/io/coded_stream.h" #include "google/protobuf/io/coded_stream.h"
#include "google/protobuf/io/zero_copy_stream.h" #include "google/protobuf/io/zero_copy_stream.h"
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/distributed/distributed.h"
#include "paddle/fluid/operators/distributed/request_handler.h" #include "paddle/fluid/operators/distributed/request_handler.h"
DECLARE_int32(rpc_deadline); DECLARE_int32(rpc_deadline);
......
...@@ -23,7 +23,7 @@ limitations under the License. */ ...@@ -23,7 +23,7 @@ limitations under the License. */
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/distributed/distributed.h"
#include "paddle/fluid/operators/distributed/request_handler.h" #include "paddle/fluid/operators/distributed/request_handler.h"
#include "paddle/fluid/operators/distributed/request_handler_impl.h" #include "paddle/fluid/operators/distributed/request_handler_impl.h"
#include "paddle/fluid/operators/distributed/rpc_server.h" #include "paddle/fluid/operators/distributed/rpc_server.h"
......
...@@ -21,9 +21,9 @@ limitations under the License. */ ...@@ -21,9 +21,9 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/detail/macros.h"
#include "paddle/fluid/operators/distributed/collective_client.h" #include "paddle/fluid/operators/distributed/collective_client.h"
#include "paddle/fluid/operators/distributed/collective_server.h" #include "paddle/fluid/operators/distributed/collective_server.h"
#include "paddle/fluid/operators/distributed/distributed.h"
#include "paddle/fluid/operators/distributed/request_handler_impl.h" #include "paddle/fluid/operators/distributed/request_handler_impl.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
...@@ -52,12 +52,12 @@ std::unique_ptr<framework::Scope> GenerateVars(platform::Place place) { ...@@ -52,12 +52,12 @@ std::unique_ptr<framework::Scope> GenerateVars(platform::Place place) {
framework::Scope* scope = new framework::Scope(); framework::Scope* scope = new framework::Scope();
framework::Variable* var = scope->Var("var1"); framework::Variable* var = scope->Var("var1");
auto* slr = var->GetMutable<framework::SelectedRows>(); auto* slr = var->GetMutable<framework::SelectedRows>();
slr->set_height(1000); slr->set_height(20000);
auto* tensor = slr->mutable_value(); auto* tensor = slr->mutable_value();
auto* rows = slr->mutable_rows(); auto* rows = slr->mutable_rows();
tensor->Resize(framework::make_ddim({3, 5})); tensor->Resize(framework::make_ddim({20000, 1024}));
tensor->mutable_data<float>(place); tensor->mutable_data<float>(place);
paddle::operators::math::set_constant(ctx, tensor, 32.7); paddle::operators::math::set_constant(ctx, tensor, 32.7);
...@@ -83,6 +83,7 @@ void Gather(const std::vector<distributed::RemoteVar>& vars, ...@@ -83,6 +83,7 @@ void Gather(const std::vector<distributed::RemoteVar>& vars,
} }
TEST(PREFETCH, GPU) { TEST(PREFETCH, GPU) {
setenv("FLAGS_max_body_size", "2147483647", 1);
platform::CUDAPlace place; platform::CUDAPlace place;
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& ctx = *pool.Get(place); auto& ctx = *pool.Get(place);
......
...@@ -18,15 +18,15 @@ ...@@ -18,15 +18,15 @@
#ifdef PADDLE_WITH_GRPC #ifdef PADDLE_WITH_GRPC
#include "paddle/fluid/operators/distributed/grpc_client.h" #include "paddle/fluid/operators/distributed/grpc/grpc_client.h"
#include "paddle/fluid/operators/distributed/grpc_server.h" #include "paddle/fluid/operators/distributed/grpc/grpc_server.h"
#define RPCSERVER_T paddle::operators::distributed::AsyncGRPCServer #define RPCSERVER_T paddle::operators::distributed::AsyncGRPCServer
#define RPCCLIENT_T paddle::operators::distributed::GRPCClient #define RPCCLIENT_T paddle::operators::distributed::GRPCClient
#else // PADDLE_WITH_GRPC #else // PADDLE_WITH_GRPC
#include "paddle/fluid/operators/distributed/brpc_client.h" #include "paddle/fluid/operators/distributed/brpc/brpc_client.h"
#include "paddle/fluid/operators/distributed/brpc_server.h" #include "paddle/fluid/operators/distributed/brpc/brpc_server.h"
#define RPCSERVER_T paddle::operators::distributed::AsyncBRPCServer #define RPCSERVER_T paddle::operators::distributed::AsyncBRPCServer
#define RPCCLIENT_T paddle::operators::distributed::BRPCClient #define RPCCLIENT_T paddle::operators::distributed::BRPCClient
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#ifdef PADDLE_WITH_DISTRIBUTE
#ifdef PADDLE_WITH_GRPC
#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h"
#include "paddle/fluid/operators/distributed/send_recv.pb.h"
#else // PADDLE_WITH_GRPC
#include "paddle/fluid/operators/distributed/send_recv.pb.h"
#endif // PADDLE_WITH_GRPC
#endif // PADDLE_WITH_DISTRIBUTE
...@@ -17,7 +17,7 @@ limitations under the License. */ ...@@ -17,7 +17,7 @@ limitations under the License. */
// file and did some modifications so that we can send gRPC // file and did some modifications so that we can send gRPC
// requests without too much copying of the tensor data. // requests without too much copying of the tensor data.
#include "paddle/fluid/operators/distributed/grpc_bytebuffer_stream.h" #include "paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -17,8 +17,8 @@ limitations under the License. */ ...@@ -17,8 +17,8 @@ limitations under the License. */
#include "glog/logging.h" // For VLOG #include "glog/logging.h" // For VLOG
#include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/operators/distributed/grpc_client.h" #include "paddle/fluid/operators/distributed/grpc/grpc_client.h"
#include "paddle/fluid/operators/distributed/grpc_serde.h" #include "paddle/fluid/operators/distributed/grpc/grpc_serde.h"
#include "paddle/fluid/operators/distributed/request_handler.h" #include "paddle/fluid/operators/distributed/request_handler.h"
#include "paddle/fluid/platform/port.h" #include "paddle/fluid/platform/port.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
......
...@@ -39,10 +39,9 @@ limitations under the License. */ ...@@ -39,10 +39,9 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/operators/distributed/distributed_pb.h"
#include "paddle/fluid/operators/distributed/request_handler.h" #include "paddle/fluid/operators/distributed/request_handler.h"
#include "paddle/fluid/operators/distributed/rpc_client.h" #include "paddle/fluid/operators/distributed/rpc_client.h"
#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h"
#include "paddle/fluid/operators/distributed/send_recv.pb.h"
#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
#include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN #include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN
......
...@@ -21,9 +21,9 @@ limitations under the License. */ ...@@ -21,9 +21,9 @@ limitations under the License. */
#include "google/protobuf/io/coded_stream.h" #include "google/protobuf/io/coded_stream.h"
#include "google/protobuf/io/zero_copy_stream.h" #include "google/protobuf/io/zero_copy_stream.h"
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/operators/distributed/grpc_bytebuffer_stream.h" #include "paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h"
#include "paddle/fluid/operators/distributed/grpc_serde.h" #include "paddle/fluid/operators/distributed/grpc/grpc_serde.h"
#include "paddle/fluid/operators/distributed/grpc_variable_response.h" #include "paddle/fluid/operators/distributed/grpc/grpc_variable_response.h"
#include "paddle/fluid/operators/distributed/proto_encoder_helper.h" #include "paddle/fluid/operators/distributed/proto_encoder_helper.h"
#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
#include "paddle/fluid/platform/port.h" #include "paddle/fluid/platform/port.h"
......
...@@ -27,8 +27,7 @@ limitations under the License. */ ...@@ -27,8 +27,7 @@ limitations under the License. */
#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
#include "paddle/fluid/platform/port.h" #include "paddle/fluid/platform/port.h"
#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h" #include "paddle/fluid/operators/distributed/distributed_pb.h"
#include "paddle/fluid/operators/distributed/send_recv.pb.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -21,9 +21,9 @@ limitations under the License. */ ...@@ -21,9 +21,9 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/framework/variable.h" #include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/distributed/distributed.h"
#include "paddle/fluid/operators/distributed/grpc_serde.h" #include "paddle/fluid/operators/distributed/grpc/grpc_serde.h"
#include "paddle/fluid/operators/distributed/grpc_variable_response.h" #include "paddle/fluid/operators/distributed/grpc/grpc_variable_response.h"
#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
......
...@@ -15,8 +15,8 @@ limitations under the License. */ ...@@ -15,8 +15,8 @@ limitations under the License. */
#include <limits> #include <limits>
#include <string> #include <string>
#include "paddle/fluid/operators/distributed/grpc_serde.h" #include "paddle/fluid/operators/distributed/grpc/grpc_serde.h"
#include "paddle/fluid/operators/distributed/grpc_server.h" #include "paddle/fluid/operators/distributed/grpc/grpc_server.h"
using ::grpc::ServerAsyncResponseWriter; using ::grpc::ServerAsyncResponseWriter;
......
...@@ -29,11 +29,10 @@ limitations under the License. */ ...@@ -29,11 +29,10 @@ limitations under the License. */
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/distributed/grpc_service.h" #include "paddle/fluid/operators/distributed/distributed_pb.h"
#include "paddle/fluid/operators/distributed/grpc/grpc_service.h"
#include "paddle/fluid/operators/distributed/request_handler.h" #include "paddle/fluid/operators/distributed/request_handler.h"
#include "paddle/fluid/operators/distributed/rpc_server.h" #include "paddle/fluid/operators/distributed/rpc_server.h"
#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h"
#include "paddle/fluid/operators/distributed/send_recv.pb.h"
#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
......
...@@ -23,7 +23,7 @@ ...@@ -23,7 +23,7 @@
#include <grpc++/impl/codegen/stub_options.h> #include <grpc++/impl/codegen/stub_options.h>
#include <grpc++/impl/codegen/sync_stream.h> #include <grpc++/impl/codegen/sync_stream.h>
#include <grpc++/support/byte_buffer.h> #include <grpc++/support/byte_buffer.h>
#include "paddle/fluid/operators/distributed/grpc_variable_response.h" #include "paddle/fluid/operators/distributed/grpc/grpc_variable_response.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
// NOTE: This method was originally created by tensorflow // NOTE: This method was originally created by tensorflow
......
...@@ -19,7 +19,7 @@ ...@@ -19,7 +19,7 @@
#include <nccl.h> #include <nccl.h>
#endif #endif
#include "paddle/fluid/operators/distributed/grpc_variable_response.h" #include "paddle/fluid/operators/distributed/grpc/grpc_variable_response.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
namespace paddle { namespace paddle {
......
...@@ -22,13 +22,11 @@ ...@@ -22,13 +22,11 @@
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h"
#include "paddle/fluid/operators/distributed/send_recv.pb.h"
#include "google/protobuf/io/coded_stream.h" #include "google/protobuf/io/coded_stream.h"
#include "google/protobuf/io/zero_copy_stream.h" #include "google/protobuf/io/zero_copy_stream.h"
#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/distributed/grpc_bytebuffer_stream.h" #include "paddle/fluid/operators/distributed/distributed_pb.h"
#include "paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h"
#include "paddle/fluid/operators/distributed/variable_response.h" #include "paddle/fluid/operators/distributed/variable_response.h"
namespace paddle { namespace paddle {
......
...@@ -23,7 +23,7 @@ ...@@ -23,7 +23,7 @@
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/distributed/distributed.h"
#include "paddle/fluid/operators/distributed/rpc_client.h" #include "paddle/fluid/operators/distributed/rpc_client.h"
#include "paddle/fluid/operators/distributed/variable_response.h" #include "paddle/fluid/operators/distributed/variable_response.h"
#include "paddle/fluid/operators/distributed_ops/send_recv_util.h" #include "paddle/fluid/operators/distributed_ops/send_recv_util.h"
......
...@@ -12,12 +12,12 @@ ...@@ -12,12 +12,12 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/operators/distributed/rpc_server.h"
#include <fstream> #include <fstream>
#include <iostream> #include <iostream>
#include <limits> #include <limits>
#include <string> #include <string>
#include "paddle/fluid/operators/distributed/rpc_server.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
namespace paddle { namespace paddle {
......
...@@ -21,7 +21,7 @@ limitations under the License. */ ...@@ -21,7 +21,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/distributed/distributed.h"
#include "paddle/fluid/operators/distributed/request_handler_impl.h" #include "paddle/fluid/operators/distributed/request_handler_impl.h"
#include "paddle/fluid/operators/distributed/rpc_client.h" #include "paddle/fluid/operators/distributed/rpc_client.h"
#include "paddle/fluid/operators/distributed/rpc_server.h" #include "paddle/fluid/operators/distributed/rpc_server.h"
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. Licensed under /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. Licensed under
the Apache License, Version 2.0 (the "License"); you may not use this file the Apache License, Version 2.0 (the "License"); you may not use this file
except in compliance with the License. except in compliance with the License.
...@@ -18,13 +17,8 @@ package sendrecv; ...@@ -18,13 +17,8 @@ package sendrecv;
option cc_generic_services = @cc_generic_services@; option cc_generic_services = @cc_generic_services@;
service SendRecvService { service SendRecvService {
// For parameter server round-robin like hashing, do not split tensors.
// Send and recv only one tensor
// TODO(typhoonzero): add streaming API
rpc SendVariable(VariableMessage) returns (VoidMessage) {} rpc SendVariable(VariableMessage) returns (VoidMessage) {}
// Argument VariableMessage for GetVariable should only contain varname.
rpc GetVariable(VariableMessage) returns (VariableMessage) {} rpc GetVariable(VariableMessage) returns (VariableMessage) {}
// pre-fetch variable by given variable name and Ids
rpc PrefetchVariable(VariableMessage) returns (VariableMessage) {} rpc PrefetchVariable(VariableMessage) returns (VariableMessage) {}
rpc CheckpointNotify(VariableMessage) returns (VoidMessage) {} rpc CheckpointNotify(VariableMessage) returns (VoidMessage) {}
...@@ -33,19 +27,12 @@ service SendRecvService { ...@@ -33,19 +27,12 @@ service SendRecvService {
rpc GetMonomerBarrier(VariableMessage) returns (VoidMessage) {} rpc GetMonomerBarrier(VariableMessage) returns (VoidMessage) {}
} }
// VariableMessage is serialized paddle variable message.
// It can be:
// LoDTensor
// SelectedRows
enum VarType { enum VarType {
LOD_TENSOR = 0; LOD_TENSOR = 0;
SELECTED_ROWS = 1; SELECTED_ROWS = 1;
NCCL_ID = 2; NCCL_ID = 2;
} }
// NOTICE(gongwb):don't modify this proto if you are not
// not familar with how we serialize in sendrecvop_utils.h
// and deserilize it in variable_response.h.
message VariableMessage { message VariableMessage {
enum Type { enum Type {
// Pod Types // Pod Types
...@@ -62,21 +49,14 @@ message VariableMessage { ...@@ -62,21 +49,14 @@ message VariableMessage {
string varname = 1; string varname = 1;
// TODO(Yancey1989): reference framework::proto::VarDesc::VarType // TODO(Yancey1989): reference framework::proto::VarDesc::VarType
VarType type = 2; VarType type = 2;
// bool persistable is not needed for sending.
// tensor info:
Type data_type = 3; Type data_type = 3;
repeated int64 dims = 4; repeated int64 dims = 4;
// lod details:
int64 lod_level = 5; int64 lod_level = 5;
repeated LodData lod = 6; repeated LodData lod = 6;
// selected_rows height, aka. original dim0
int64 slr_height = 7; int64 slr_height = 7;
// tensor data
bytes serialized = 8; bytes serialized = 8;
// selected_rows data
bytes rows = 9; bytes rows = 9;
// Look up table block execution output variable name.
string out_varname = 10; string out_varname = 10;
// If 1, the ps server will start profiling, the ps // If 1, the ps server will start profiling, the ps
// server stops profiling and generates a profile to /tmp/profile_ps_* // server stops profiling and generates a profile to /tmp/profile_ps_*
......
...@@ -18,7 +18,6 @@ limitations under the License. */ ...@@ -18,7 +18,6 @@ limitations under the License. */
#include <thread> // NOLINT #include <thread> // NOLINT
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/operators/distributed/brpc_rdma_pool.h"
#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
#include "paddle/fluid/operators/distributed/variable_response.h" #include "paddle/fluid/operators/distributed/variable_response.h"
#include "paddle/fluid/platform/port.h" #include "paddle/fluid/platform/port.h"
......
...@@ -24,7 +24,7 @@ limitations under the License. */ ...@@ -24,7 +24,7 @@ limitations under the License. */
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/distributed/send_recv.pb.h" #include "paddle/fluid/operators/distributed/distributed_pb.h"
#include "paddle/fluid/platform/port.h" #include "paddle/fluid/platform/port.h"
namespace paddle { namespace paddle {
......
...@@ -25,7 +25,7 @@ ...@@ -25,7 +25,7 @@
#include "google/protobuf/io/coded_stream.h" #include "google/protobuf/io/coded_stream.h"
#include "google/protobuf/io/zero_copy_stream.h" #include "google/protobuf/io/zero_copy_stream.h"
#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/distributed/send_recv.pb.h" #include "paddle/fluid/operators/distributed/distributed_pb.h"
DECLARE_string(rpc_server_profile_path); DECLARE_string(rpc_server_profile_path);
......
...@@ -18,7 +18,7 @@ limitations under the License. */ ...@@ -18,7 +18,7 @@ limitations under the License. */
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/distributed/distributed.h"
#include "paddle/fluid/operators/distributed_ops/send_recv_util.h" #include "paddle/fluid/operators/distributed_ops/send_recv_util.h"
#include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/printf.h"
......
...@@ -19,7 +19,7 @@ limitations under the License. */ ...@@ -19,7 +19,7 @@ limitations under the License. */
#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/distributed/distributed.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
namespace paddle { namespace paddle {
......
...@@ -21,7 +21,7 @@ limitations under the License. */ ...@@ -21,7 +21,7 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/distributed/distributed.h"
#include "paddle/fluid/operators/distributed/request_handler_impl.h" #include "paddle/fluid/operators/distributed/request_handler_impl.h"
#include "paddle/fluid/platform/nccl_helper.h" #include "paddle/fluid/platform/nccl_helper.h"
......
...@@ -21,7 +21,7 @@ limitations under the License. */ ...@@ -21,7 +21,7 @@ limitations under the License. */
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/distributed/distributed.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/distributed/request_handler_impl.h" #include "paddle/fluid/operators/distributed/request_handler_impl.h"
......
...@@ -18,7 +18,7 @@ limitations under the License. */ ...@@ -18,7 +18,7 @@ limitations under the License. */
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/distributed/distributed.h"
#include "paddle/fluid/operators/distributed_ops/send_recv_util.h" #include "paddle/fluid/operators/distributed_ops/send_recv_util.h"
namespace paddle { namespace paddle {
......
...@@ -19,7 +19,7 @@ limitations under the License. */ ...@@ -19,7 +19,7 @@ limitations under the License. */
#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/distributed/distributed.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
namespace paddle { namespace paddle {
......
...@@ -19,7 +19,7 @@ limitations under the License. */ ...@@ -19,7 +19,7 @@ limitations under the License. */
#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/distributed/distributed.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
......
...@@ -19,7 +19,7 @@ limitations under the License. */ ...@@ -19,7 +19,7 @@ limitations under the License. */
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/distributed/distributed.h"
#include "paddle/fluid/operators/distributed_ops/send_recv_util.h" #include "paddle/fluid/operators/distributed_ops/send_recv_util.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
......
...@@ -20,7 +20,7 @@ limitations under the License. */ ...@@ -20,7 +20,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/distributed/distributed.h"
#include "paddle/fluid/operators/distributed/request_handler_impl.h" #include "paddle/fluid/operators/distributed/request_handler_impl.h"
#include "paddle/fluid/operators/distributed_ops/listen_and_serv_op.h" #include "paddle/fluid/operators/distributed_ops/listen_and_serv_op.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
......
...@@ -178,7 +178,6 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel { ...@@ -178,7 +178,6 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel {
auto x_dims = ctx->GetInputDim("X"); auto x_dims = ctx->GetInputDim("X");
auto y_dims = ctx->GetInputDim("Y"); auto y_dims = ctx->GetInputDim("Y");
auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(), PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(),
"Rank of first input must >= rank of second input."); "Rank of first input must >= rank of second input.");
......
...@@ -77,7 +77,6 @@ class ExpandKernel : public framework::OpKernel<T> { ...@@ -77,7 +77,6 @@ class ExpandKernel : public framework::OpKernel<T> {
auto& expand_times = context.Attr<std::vector<int>>("expand_times"); auto& expand_times = context.Attr<std::vector<int>>("expand_times");
auto* out0 = context.Output<Tensor>("Out"); auto* out0 = context.Output<Tensor>("Out");
Eigen::DSizes<int, Rank> bcast_dims; Eigen::DSizes<int, Rank> bcast_dims;
auto x_dims = in0->dims();
for (size_t i = 0; i < expand_times.size(); ++i) { for (size_t i = 0; i < expand_times.size(); ++i) {
bcast_dims[i] = expand_times[i]; bcast_dims[i] = expand_times[i];
} }
......
...@@ -146,7 +146,6 @@ class FCOpKernel : public framework::OpKernel<T> { ...@@ -146,7 +146,6 @@ class FCOpKernel : public framework::OpKernel<T> {
auto w = ctx.Input<Tensor>("W"); auto w = ctx.Input<Tensor>("W");
auto bias = ctx.Input<Tensor>("Bias"); auto bias = ctx.Input<Tensor>("Bias");
auto output = ctx.Output<Tensor>("Out"); auto output = ctx.Output<Tensor>("Out");
auto in_dims = input->dims();
auto w_dims = w->dims(); auto w_dims = w->dims();
auto out_dims = output->dims(); auto out_dims = output->dims();
int M = framework::product(out_dims) / out_dims[out_dims.size() - 1]; int M = framework::product(out_dims) / out_dims[out_dims.size() - 1];
......
...@@ -12,68 +12,40 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,68 +12,40 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/operators/fill_constant_op.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
class FillConstantInferShape : public framework::InferShapeBase { class FillConstantOp : public framework::OperatorWithKernel {
public: public:
void operator()(framework::InferShapeContext *ctx) const override { using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasOutput("Out"), PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of FillConstantOp should not be null."); "Output(Out) of FillConstantOp should not be null.");
auto &shape = ctx->Attrs().Get<std::vector<int64_t>>("shape"); auto& shape = ctx->Attrs().Get<std::vector<int64_t>>("shape");
ctx->SetOutputDim("Out", framework::make_ddim(shape)); ctx->SetOutputDim("Out", framework::make_ddim(shape));
} }
};
class FillConstantOp : public framework::OperatorBase {
public:
using framework::OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &dev_place) const override {
auto data_type =
static_cast<framework::proto::VarType::Type>(Attr<int>("dtype"));
auto value = Attr<float>("value");
auto force_cpu = Attr<bool>("force_cpu");
framework::Tensor *tensor = nullptr;
auto &out_var = *scope.FindVar(Output("Out"));
if (out_var.IsType<framework::LoDTensor>()) {
tensor = out_var.GetMutable<framework::LoDTensor>();
tensor->Resize(framework::make_ddim(Attr<std::vector<int64_t>>("shape")));
} else if (out_var.IsType<framework::SelectedRows>()) {
tensor = out_var.GetMutable<framework::SelectedRows>()->mutable_value();
tensor->Resize(framework::make_ddim(Attr<std::vector<int64_t>>("shape")));
} else {
PADDLE_THROW(
"fill constant op's output only"
"supports SelectedRows and LoDTensor");
}
if (force_cpu) {
auto cpu = platform::CPUPlace();
tensor->mutable_data(cpu, data_type);
} else {
tensor->mutable_data(dev_place, data_type);
}
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); protected:
auto &dev_ctx = *pool.Get(dev_place); framework::OpKernelType GetExpectedKernelType(
math::set_constant(dev_ctx, tensor, value); const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
framework::proto::VarType::Type(ctx.Attr<int>("dtype")),
ctx.GetPlace());
} }
}; };
class FillConstantOpVarTypeInference : public framework::VarTypeInference { class FillConstantOpVarTypeInference : public framework::VarTypeInference {
public: public:
void operator()(const framework::OpDesc &op_desc, void operator()(const framework::OpDesc& op_desc,
framework::BlockDesc *block) const override {} framework::BlockDesc* block) const override {
auto data_type = static_cast<framework::proto::VarType::Type>(
boost::get<int>(op_desc.GetAttr("dtype")));
auto& out_var_name = op_desc.Output("Out").front();
block->Var(out_var_name)->SetDataType(data_type);
}
}; };
class FillConstantOpMaker : public framework::OpProtoAndCheckerMaker { class FillConstantOpMaker : public framework::OpProtoAndCheckerMaker {
...@@ -107,7 +79,13 @@ Fill up a variable with specified constant value. ...@@ -107,7 +79,13 @@ Fill up a variable with specified constant value.
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR(fill_constant, ops::FillConstantOp,
ops::FillConstantInferShape, ops::FillConstantOpMaker, REGISTER_OPERATOR(fill_constant, ops::FillConstantOp, ops::FillConstantOpMaker,
paddle::framework::EmptyGradOpMaker, ops::FillConstantOpVarTypeInference,
ops::FillConstantOpVarTypeInference); paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(fill_constant, ops::FillConstantKernel<float>,
ops::FillConstantKernel<double>,
ops::FillConstantKernel<int64_t>,
ops::FillConstantKernel<int>,
ops::FillConstantKernel<paddle::platform::float16>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/fill_constant_op.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(fill_constant, ops::FillConstantKernel<float>,
ops::FillConstantKernel<double>,
ops::FillConstantKernel<int64_t>,
ops::FillConstantKernel<int>,
ops::FillConstantKernel<paddle::platform::float16>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {
template <typename T>
class FillConstantKernel : public framework::OpKernel<T> {
public:
void Compute(const paddle::framework::ExecutionContext &ctx) const override {
auto data_type =
static_cast<framework::proto::VarType::Type>(ctx.Attr<int>("dtype"));
auto value = ctx.Attr<float>("value");
auto force_cpu = ctx.Attr<bool>("force_cpu");
framework::Tensor *tensor = nullptr;
framework::Variable *out_var = ctx.OutputVar("Out");
if (out_var->IsType<framework::LoDTensor>()) {
tensor = out_var->GetMutable<framework::LoDTensor>();
tensor->Resize(
framework::make_ddim(ctx.Attr<std::vector<int64_t>>("shape")));
} else if (out_var->IsType<framework::SelectedRows>()) {
tensor = out_var->GetMutable<framework::SelectedRows>()->mutable_value();
tensor->Resize(
framework::make_ddim(ctx.Attr<std::vector<int64_t>>("shape")));
} else {
PADDLE_THROW(
"fill constant op's output only"
"supports SelectedRows and LoDTensor");
}
if (force_cpu) {
tensor->mutable_data(platform::CPUPlace(), data_type);
} else {
tensor->mutable_data(ctx.GetPlace(), data_type);
}
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(ctx.GetPlace());
math::set_constant(dev_ctx, tensor, value);
}
};
} // namespace operators
} // namespace paddle
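For reference, a minimal sketch of how the reworked fill_constant kernel is exercised end to end from Python; it assumes the usual fluid.layers.fill_constant entry point and a CPU executor, and is illustrative rather than part of this change.

```python
import paddle.fluid as fluid

# Build a tiny program whose only op is fill_constant; the shape/dtype/value/
# force_cpu arguments map directly onto the attributes read by the kernel above.
out = fluid.layers.fill_constant(shape=[2, 3], dtype='int64', value=7, force_cpu=True)

exe = fluid.Executor(fluid.CPUPlace())
result, = exe.run(fluid.default_main_program(), fetch_list=[out])
print(result)  # a 2x3 array filled with 7
```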
include(operators) include(operators)
register_operators(EXCLUDES fusion_transpose_flatten_concat_op) register_operators(EXCLUDES fusion_transpose_flatten_concat_op fusion_conv_inception_op)
if (WITH_GPU) if (WITH_GPU)
op_library(fusion_transpose_flatten_concat_op) op_library(fusion_transpose_flatten_concat_op)
op_library(fusion_conv_inception_op)
file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(fusion_transpose_flatten_concat);\n") file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(fusion_transpose_flatten_concat);\n")
file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(conv2d_inception_fusion);\n")
endif() endif()
...@@ -243,7 +243,7 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel<T> { ...@@ -243,7 +243,7 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel<T> {
#define INIT_BASE_SIZES \ #define INIT_BASE_SIZES \
auto ids_dims = ids->dims(); /* T x M*/ \ auto ids_dims = ids->dims(); /* T x M*/ \
auto ids_numel = ids->numel(); /* T x 1*/ \ auto ids_numel = framework::product(ids_dims); /* T x 1*/ \
auto wh_dims = wh->dims(); /* D x 4D*/ \ auto wh_dims = wh->dims(); /* D x 4D*/ \
const int D = wh_dims[0]; \ const int D = wh_dims[0]; \
const int D2 = D * 2; \ const int D2 = D * 2; \
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cudnn_helper.h"
#endif
namespace paddle {
namespace operators {
class ConvInceptionFusionOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
// 1 input (x)
auto in_dims = ctx->GetInputDim("Input");
// 4 filters
auto w_dims = ctx->GetInputsDim("Filter");
PADDLE_ENFORCE_EQ(in_dims.size(), 4, "Conv input should be 4-D tensor.");
PADDLE_ENFORCE_EQ(w_dims.size(), 4, "There should be 4 filters");
PADDLE_ENFORCE_EQ(w_dims[0][1], in_dims[1]);
PADDLE_ENFORCE_EQ(w_dims[1][1], in_dims[1]);
int n = in_dims[0];
// compute output channel
// 1st channel
int c = w_dims[0][0];
// add 2nd channel
c += (w_dims[1][0] - w_dims[2][1] * 2);
// add 3rd channel
c += (w_dims[2][0] - w_dims[3][1]);
// add 4-th channel
c += w_dims[3][0];
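// Worked example with hypothetical filter shapes [64, C, 1, 1], [96, C, 1, 1],
// [128, 32, 3, 3] and [192, 64, 3, 3]:
// c = 64 + (96 - 2 * 32) + (128 - 64) + 192 = 352.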
int h = in_dims[2];
int w = in_dims[3];
ctx->SetOutputDim("Output", {n, c, h, w});
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
ctx.Input<framework::LoDTensor>("Input")->type(), ctx.device_context());
}
};
class ConvInceptionFusionOpMaker : public framework::OpProtoAndCheckerMaker {
protected:
void Make() override {
AddInput("Input", "(Tensor) NCHW layout.");
AddInput("Filter", "(vector<Tensor>) 4 aggregated filters").AsDuplicable();
AddInput("Bias", "(vector<Tensor>) it's lenght is equal to Filter")
.AsDuplicable();
AddOutput("Output",
"(Tensor) The output tensor of convolution operator. "
"The format of output tensor is also NCHW.");
AddOutput("TempOutput", "").AsDuplicable();
AddAttr<std::string>(
"pooling_type",
"(string), pooling type, can be \"max\" for max-pooling "
"and \"avg\" for average-pooling.")
.InEnum({"max", "avg"});
AddAttr<bool>(
"exclusive",
"(bool, default True) When true, will exclude the zero-padding in the "
"averaging calculating, otherwise, include the zero-padding. Note, it "
"is only used when pooling_type is avg. The defalut is True.")
.SetDefault(true);
AddAttr<std::string>(
"activation",
"The activation type can be 'identity', 'sigmoid', 'relu', 'relu6' "
"'relux' , 'tanh', 'band_pass'")
.SetDefault("relu");
AddAttr<int>("workspace_size_MB",
"Only used in cudnn kernel. Need set use_cudnn to true."
"workspace size for cudnn, in MB, "
"workspace is a section of GPU memory which will be "
"allocated/freed each time the operator runs, larger "
"workspace size can increase performance but also requires "
"better hardware. This size should be chosen carefully.")
.SetDefault(4096);
AddComment(R"DOC(
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(conv2d_inception_fusion, ops::ConvInceptionFusionOp,
ops::ConvInceptionFusionOpMaker,
paddle::framework::EmptyGradOpMaker);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
#include "paddle/fluid/platform/cudnn_helper.h"
DECLARE_uint64(conv_workspace_size_limit);
namespace paddle {
namespace operators {
#if CUDNN_VERSION >= 7001
using Tensor = framework::Tensor;
using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
using ScopedFilterDescriptor = platform::ScopedFilterDescriptor;
using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor;
using ScopedActivationDescriptor = platform::ScopedActivationDescriptor;
using DataLayout = platform::DataLayout;
using ScopedPoolingDescriptor = platform::ScopedPoolingDescriptor;
using PoolingMode = platform::PoolingMode;
template <typename T>
using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType;
template <typename T>
using CudnnDataType = platform::CudnnDataType<T>;
template <typename T>
class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto* input = ctx.Input<Tensor>("Input");
auto filters = ctx.MultiInput<framework::Tensor>("Filter");
auto bias = ctx.MultiInput<framework::Tensor>("Bias");
auto* output = ctx.Output<Tensor>("Output");
auto temp_outs = ctx.MultiOutput<framework::Tensor>("TempOutput");
const std::string pool_type = ctx.Attr<std::string>("pooling_type");
const std::string activation = ctx.Attr<std::string>("activation");
const bool exclusive = ctx.Attr<bool>("exclusive");
int64_t user_workspace_size =
static_cast<size_t>(ctx.Attr<int>("workspace_size_MB"));
const T* input_data = input->data<T>();
T* output_data = output->mutable_data<T>(ctx.GetPlace());
T* temp_data = temp_outs[0]->mutable_data<T>(input->dims(), ctx.GetPlace());
DataLayout layout = DataLayout::kNCHW;
std::vector<int> in_dim = framework::vectorize2int(input->dims());
// ------------------- cudnn descriptors ---------------------
PoolingMode pooling_mode;
if (pool_type == "max") {
pooling_mode = PoolingMode::kMaximum;
} else {
pooling_mode = exclusive ? PoolingMode::kAverageExclusive
: (PoolingMode::kAverageInclusive);
}
std::vector<int> k0x0 = {0, 0};
std::vector<int> k1x1 = {1, 1};
std::vector<int> k1x1_2 = {1, 1};
std::vector<int> k3x3 = {3, 3};
ScopedPoolingDescriptor pool_desc;
ScopedActivationDescriptor act_desc;
ScopedTensorDescriptor out_pool_desc;
ScopedTensorDescriptor input_desc;
cudnnPoolingDescriptor_t cudnn_pool_desc =
pool_desc.descriptor(pooling_mode, k3x3, k1x1, k1x1);
cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
layout, framework::vectorize2int(input->dims()));
cudnnTensorDescriptor_t pool_out_desc = out_pool_desc.descriptor<T>(
layout, framework::vectorize2int(input->dims()));
cudnnDataType_t cudnn_dtype = CudnnDataType<T>::type;
cudnnTensorDescriptor_t* out_desc = new cudnnTensorDescriptor_t[4];
cudnnFilterDescriptor_t* filter_desc = new cudnnFilterDescriptor_t[4];
cudnnTensorDescriptor_t* bias_desc = new cudnnTensorDescriptor_t[4];
cudnnTensorDescriptor_t* in_desc = new cudnnTensorDescriptor_t[4];
cudnnConvolutionDescriptor_t* conv_desc =
new cudnnConvolutionDescriptor_t[4];
for (int i = 0; i < 4; ++i) {
CUDNN_ENFORCE(
platform::dynload::cudnnCreateFilterDescriptor(&filter_desc[i]));
CUDNN_ENFORCE(
platform::dynload::cudnnCreateTensorDescriptor(&bias_desc[i]));
CUDNN_ENFORCE(
platform::dynload::cudnnCreateTensorDescriptor(&in_desc[i]));
CUDNN_ENFORCE(
platform::dynload::cudnnCreateTensorDescriptor(&out_desc[i]));
CUDNN_ENFORCE(
platform::dynload::cudnnCreateConvolutionDescriptor(&conv_desc[i]));
}
std::vector<std::vector<int>> filter_dims;
std::vector<std::vector<int>> bias_dims;
std::vector<std::vector<int>> in_dims;
std::vector<std::vector<int>> out_dims;
std::vector<std::vector<int>> in_strides;
std::vector<std::vector<int>> out_strides;
std::vector<std::vector<int>> bias_strides;
cudnnTensorFormat_t format = CUDNN_TENSOR_NCHW;
int n = in_dim[0];
int h = in_dim[2];
int w = in_dim[3];
int oc = output->dims()[1];
cudnnDataType_t compute_type = (cudnn_dtype == CUDNN_DATA_DOUBLE)
? CUDNN_DATA_DOUBLE
: CUDNN_DATA_FLOAT;
for (int i = 0; i < 4; ++i) {
filter_dims.push_back(framework::vectorize2int(filters[i]->dims()));
CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor(
filter_desc[i], cudnn_dtype, format, 4, filter_dims[i].data()));
bias_dims.push_back({1, filter_dims[i][0], 1, 1});
bias_strides.push_back({filter_dims[i][0], 1, 1, 1});
CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
bias_desc[i], cudnn_dtype, 4, bias_dims[i].data(),
bias_strides[i].data()));
in_dims.push_back({n, filter_dims[i][1], h, w});
out_dims.push_back({n, filter_dims[i][0], h, w});
in_strides.push_back({filter_dims[i][1] * h * w, h * w, w, 1});
out_strides.push_back({oc * h * w, h * w, w, 1});
if (i < 2) {
CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionNdDescriptor(
conv_desc[i], 2, k0x0.data(), k1x1.data(), k1x1.data(),
CUDNN_CROSS_CORRELATION, compute_type));
} else {
CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionNdDescriptor(
conv_desc[i], 2, k1x1.data(), k1x1.data(), k1x1.data(),
CUDNN_CROSS_CORRELATION, compute_type));
}
CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
conv_desc[i], CUDNN_DEFAULT_MATH));
}
in_dims[2][1] *= 2;
in_strides[2][0] = oc * h * w;
out_strides[2][0] = filter_dims[2][0] * h * w;  // this output is contiguous.
in_strides[3][0] = filter_dims[2][0] * h * w;
CUDNN_ENFORCE(
platform::dynload::cudnnSetConvolutionGroupCount(conv_desc[2], 2));
cudnnConvolutionFwdAlgo_t algo[4];
auto handle = dev_ctx.cudnn_handle();
size_t workspace_size_in_bytes = 0; // final workspace to allocate.
size_t workspace_size_limit = kCONV_CUDNN_WORKSPACE_LIMIT_BYTES;
if (FLAGS_conv_workspace_size_limit > 0 || user_workspace_size > 0) {
int64_t max_user_size =
std::max(static_cast<int64_t>(FLAGS_conv_workspace_size_limit),
user_workspace_size);
workspace_size_limit = max_user_size * 1024 * 1024;
}
for (int i = 0; i < 4; ++i) {
CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
in_desc[i], cudnn_dtype, 4, in_dims[i].data(), in_strides[i].data()));
CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
out_desc[i], cudnn_dtype, 4, out_dims[i].data(),
out_strides[i].data()));
CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
handle, in_desc[i], filter_desc[i], conv_desc[i], out_desc[i],
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, workspace_size_limit,
&algo[i]));
size_t tmp_size = 0;
CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
handle, in_desc[i], filter_desc[i], conv_desc[i], out_desc[i],
algo[i], &tmp_size));
workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size);
}
cudnnActivationDescriptor_t cudnn_act_desc =
act_desc.descriptor<T>(activation);
int oc0 = filter_dims[0][0];
int oc1 = filter_dims[1][0] - filter_dims[2][1] * 2;
int oc3 = filter_dims[3][0];
int oc2 = oc - oc0 - oc1 - oc3;
// branch1: pool + 1x1 conv
ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
CUDNN_ENFORCE(platform::dynload::cudnnPoolingForward(
handle, cudnn_pool_desc, &alpha, cudnn_input_desc, input_data, &beta,
pool_out_desc, temp_data));
std::vector<const void*> in_datas;
in_datas.push_back(static_cast<const void*>(temp_data));
in_datas.push_back(static_cast<const void*>(input_data));
in_datas.push_back(
static_cast<const void*>(output_data + (oc0 + oc1) * h * w));
T* temp2_data = temp_outs[1]->mutable_data<T>(
framework::make_ddim(out_dims[2]), ctx.GetPlace());
in_datas.push_back(static_cast<const void*>(temp2_data + oc2 * h * w));
std::vector<void*> out_datas;
out_datas.push_back(static_cast<void*>(output_data));
out_datas.push_back(static_cast<void*>(output_data + oc0 * h * w));
out_datas.push_back(static_cast<void*>(temp2_data));
out_datas.push_back(
static_cast<void*>(output_data + (oc0 + oc1 + oc2) * h * w));
for (int i = 0; i < 4; ++i) {
auto func = [&](void* cudnn_workspace) {
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward(
handle, &alpha, in_desc[i], in_datas[i], filter_desc[i],
static_cast<const void*>(filters[i]->data<T>()), conv_desc[i],
algo[i], cudnn_workspace, workspace_size_in_bytes, &beta,
out_desc[i], out_datas[i], bias_desc[i],
static_cast<const void*>(bias[i]->data<T>()), cudnn_act_desc,
out_desc[i], out_datas[i]));
};
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
workspace_handle.RunFunc(func, workspace_size_in_bytes);
}
cudnnTensorDescriptor_t x_desc;
cudnnTensorDescriptor_t y_desc;
CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&x_desc));
CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&y_desc));
CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
x_desc, cudnn_dtype, 4, out_dims[3].data(), out_strides[2].data()));
CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
y_desc, cudnn_dtype, 4, out_dims[3].data(), out_strides[3].data()));
CUDNN_ENFORCE(platform::dynload::cudnnTransformTensor(
handle, CudnnDataType<T>::kOne(), x_desc,
static_cast<const void*>(out_datas[2]), CudnnDataType<T>::kZero(),
y_desc, static_cast<void*>(output_data + (oc0 + oc1) * h * w)));
for (int i = 0; i < 4; ++i) {
CUDNN_ENFORCE(
platform::dynload::cudnnDestroyTensorDescriptor(in_desc[i]));
CUDNN_ENFORCE(
platform::dynload::cudnnDestroyTensorDescriptor(out_desc[i]));
CUDNN_ENFORCE(
platform::dynload::cudnnDestroyFilterDescriptor(filter_desc[i]));
CUDNN_ENFORCE(
platform::dynload::cudnnDestroyTensorDescriptor(bias_desc[i]));
CUDNN_ENFORCE(
platform::dynload::cudnnDestroyConvolutionDescriptor(conv_desc[i]));
}
CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(x_desc));
CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(y_desc));
}
};
#endif
} // namespace operators
} // namespace paddle
#if CUDNN_VERSION >= 7001
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(conv2d_inception_fusion,
ops::CUDNNConvInceptionFusionOpKernel<float>,
ops::CUDNNConvInceptionFusionOpKernel<double>);
#endif
...@@ -88,7 +88,6 @@ class HingeLossGradOp : public framework::OperatorWithKernel { ...@@ -88,7 +88,6 @@ class HingeLossGradOp : public framework::OperatorWithKernel {
"Input(Logits@GRAD) should not be null."); "Input(Logits@GRAD) should not be null.");
auto pred_dims = ctx->GetInputDim("Logits"); auto pred_dims = ctx->GetInputDim("Logits");
auto lab_dims = ctx->GetInputDim("Labels");
auto loss_grad_dims = ctx->GetInputDim(framework::GradVarName("Loss")); auto loss_grad_dims = ctx->GetInputDim(framework::GradVarName("Loss"));
PADDLE_ENFORCE_EQ(loss_grad_dims, pred_dims); PADDLE_ENFORCE_EQ(loss_grad_dims, pred_dims);
......
...@@ -92,7 +92,6 @@ class LogLossGradOp : public framework::OperatorWithKernel { ...@@ -92,7 +92,6 @@ class LogLossGradOp : public framework::OperatorWithKernel {
"Output(Predicted@GRAD) should not be null."); "Output(Predicted@GRAD) should not be null.");
auto pred_dims = ctx->GetInputDim("Predicted"); auto pred_dims = ctx->GetInputDim("Predicted");
auto label_dims = ctx->GetInputDim("Labels");
auto loss_grad_dims = ctx->GetInputDim(framework::GradVarName("Loss")); auto loss_grad_dims = ctx->GetInputDim(framework::GradVarName("Loss"));
PADDLE_ENFORCE_EQ(loss_grad_dims, pred_dims); PADDLE_ENFORCE_EQ(loss_grad_dims, pred_dims);
......
...@@ -37,9 +37,6 @@ void Transpose<DeviceContext, T, Rank>::operator()( ...@@ -37,9 +37,6 @@ void Transpose<DeviceContext, T, Rank>::operator()(
for (int i = 0; i < Rank; i++) { for (int i = 0; i < Rank; i++) {
permute[i] = axis[i]; permute[i] = axis[i];
} }
auto in_dim = in.dims();
auto out_dim = out->dims();
auto eigen_in = framework::EigenTensor<T, Rank>::From(in); auto eigen_in = framework::EigenTensor<T, Rank>::From(in);
auto eigen_out = framework::EigenTensor<T, Rank>::From(*out); auto eigen_out = framework::EigenTensor<T, Rank>::From(*out);
auto* dev = context.eigen_device(); auto* dev = context.eigen_device();
......
...@@ -76,7 +76,6 @@ class SoftmaxFunctor<DeviceContext, float, true, enable_if_CPU<DeviceContext>> { ...@@ -76,7 +76,6 @@ class SoftmaxFunctor<DeviceContext, float, true, enable_if_CPU<DeviceContext>> {
void operator()(const DeviceContext& context, const framework::Tensor* X, void operator()(const DeviceContext& context, const framework::Tensor* X,
framework::Tensor* Y) { framework::Tensor* Y) {
auto in_dims = X->dims(); auto in_dims = X->dims();
auto out_dims = Y->dims();
const float* in_data = X->data<float>(); const float* in_data = X->data<float>();
float* out_data = Y->data<float>(); float* out_data = Y->data<float>();
const int kBatchDim = 0; const int kBatchDim = 0;
......
...@@ -87,7 +87,6 @@ class ModifiedHuberLossGradOp : public framework::OperatorWithKernel { ...@@ -87,7 +87,6 @@ class ModifiedHuberLossGradOp : public framework::OperatorWithKernel {
"Input(Out@Grad) must not be null."); "Input(Out@Grad) must not be null.");
auto x_dims = ctx->GetInputDim("X"); auto x_dims = ctx->GetInputDim("X");
auto y_dims = ctx->GetInputDim("Y");
auto intermediate_dims = ctx->GetInputDim("IntermediateVal"); auto intermediate_dims = ctx->GetInputDim("IntermediateVal");
auto out_grad_dims = ctx->GetInputDim(framework::GradVarName("Out")); auto out_grad_dims = ctx->GetInputDim(framework::GradVarName("Out"));
......
...@@ -147,12 +147,6 @@ class MulGradOp : public framework::OperatorWithKernel { ...@@ -147,12 +147,6 @@ class MulGradOp : public framework::OperatorWithKernel {
"Input(Out@GRAD) should not be null"); "Input(Out@GRAD) should not be null");
auto x_dims = ctx->GetInputDim("X"); auto x_dims = ctx->GetInputDim("X");
auto y_dims = ctx->GetInputDim("Y"); auto y_dims = ctx->GetInputDim("Y");
auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
auto x_mat_dims = framework::flatten_to_2d(
x_dims, ctx->Attrs().Get<int>("x_num_col_dims"));
auto y_mat_dims = framework::flatten_to_2d(
y_dims, ctx->Attrs().Get<int>("y_num_col_dims"));
auto x_grad_name = framework::GradVarName("X"); auto x_grad_name = framework::GradVarName("X");
auto y_grad_name = framework::GradVarName("Y"); auto y_grad_name = framework::GradVarName("Y");
......
...@@ -36,7 +36,6 @@ class NCEOp : public framework::OperatorWithKernel { ...@@ -36,7 +36,6 @@ class NCEOp : public framework::OperatorWithKernel {
auto x_dims = ctx->GetInputDim("Input"); auto x_dims = ctx->GetInputDim("Input");
auto label_dims = ctx->GetInputDim("Label"); auto label_dims = ctx->GetInputDim("Label");
auto w_dims = ctx->GetInputDim("Weight");
PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0]); PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0]);
int num_true_classes = label_dims.size() == 2 ? label_dims[1] : 1; int num_true_classes = label_dims.size() == 2 ? label_dims[1] : 1;
if (ctx->HasInput("Bias")) { if (ctx->HasInput("Bias")) {
......
...@@ -43,7 +43,6 @@ class NormKernel : public framework::OpKernel<T> { ...@@ -43,7 +43,6 @@ class NormKernel : public framework::OpKernel<T> {
out_norm->mutable_data<T>(ctx.GetPlace()); out_norm->mutable_data<T>(ctx.GetPlace());
auto xdim = in_x->dims(); auto xdim = in_x->dims();
auto ndim = out_norm->dims();
T eps = static_cast<T>(ctx.Attr<float>("epsilon")); T eps = static_cast<T>(ctx.Attr<float>("epsilon"));
int axis = ctx.Attr<int>("axis"); int axis = ctx.Attr<int>("axis");
if (axis < 0) axis = xdim.size() + axis; if (axis < 0) axis = xdim.size() + axis;
......
...@@ -41,7 +41,6 @@ class CPUPSROIPoolOpKernel : public framework::OpKernel<T> { ...@@ -41,7 +41,6 @@ class CPUPSROIPoolOpKernel : public framework::OpKernel<T> {
int rois_num = rois->dims()[0]; int rois_num = rois->dims()[0];
auto in_stride = framework::stride(in_dims); auto in_stride = framework::stride(in_dims);
auto roi_stride = framework::stride(rois->dims());
auto out_stride = framework::stride(out->dims()); auto out_stride = framework::stride(out->dims());
const T* input_data = in->data<T>(); const T* input_data = in->data<T>();
......
...@@ -143,8 +143,6 @@ class SequenceSliceGradOpKernel : public framework::OpKernel<T> { ...@@ -143,8 +143,6 @@ class SequenceSliceGradOpKernel : public framework::OpKernel<T> {
set_zero(ctx.template device_context<DeviceContext>(), x_grad, set_zero(ctx.template device_context<DeviceContext>(), x_grad,
static_cast<T>(0)); static_cast<T>(0));
auto out_grad_stride = framework::stride(out_grad->dims());
for (size_t i = 0; i < out_lod[0].size() - 1; ++i) { for (size_t i = 0; i < out_lod[0].size() - 1; ++i) {
Tensor out_grad_t = Tensor out_grad_t =
out_grad->Slice(static_cast<int>(out_lod[0][i]), out_grad->Slice(static_cast<int>(out_lod[0][i]),
......
...@@ -40,7 +40,7 @@ inline void StridedMemcpy(const platform::DeviceContext& dev_ctx, const T* src, ...@@ -40,7 +40,7 @@ inline void StridedMemcpy(const platform::DeviceContext& dev_ctx, const T* src,
const framework::DDim& dst_stride, T* dst) { const framework::DDim& dst_stride, T* dst) {
paddle::operators::detail::StridedCopyDimVisitor<T> func( paddle::operators::detail::StridedCopyDimVisitor<T> func(
dev_ctx, src, src_stride, dst_stride, dst); dev_ctx, src, src_stride, dst_stride, dst);
boost::apply_visitor(func, dst_dim); dst_dim.apply_visitor(func);
} }
// Strided numel memory copy from src to dst by the specified axis // Strided numel memory copy from src to dst by the specified axis
......
...@@ -84,6 +84,9 @@ cc_test(init_test SRCS init_test.cc DEPS device_context) ...@@ -84,6 +84,9 @@ cc_test(init_test SRCS init_test.cc DEPS device_context)
nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda) nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context) nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context)
cc_library(timer SRCS timer.cc)
cc_test(timer_test SRCS timer_test.cc DEPS timer)
cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS}) cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS})
cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer) cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer)
cc_test(profiler_test SRCS profiler_test.cc DEPS profiler) cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)
......
...@@ -38,6 +38,10 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DEFINE_WRAP); ...@@ -38,6 +38,10 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DEFINE_WRAP);
CUDNN_DNN_ROUTINE_EACH_R5(DEFINE_WRAP); CUDNN_DNN_ROUTINE_EACH_R5(DEFINE_WRAP);
#endif #endif
#ifdef CUDNN_DNN_ROUTINE_EACH_R6
CUDNN_DNN_ROUTINE_EACH_R6(DEFINE_WRAP);
#endif
#ifdef CUDNN_DNN_ROUTINE_EACH_R7 #ifdef CUDNN_DNN_ROUTINE_EACH_R7
CUDNN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP); CUDNN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP);
#endif #endif
......
...@@ -53,6 +53,12 @@ namespace platform { ...@@ -53,6 +53,12 @@ namespace platform {
namespace dynload { namespace dynload {
static constexpr char cupti_lib_path[] = CUPTI_LIB_PATH; static constexpr char cupti_lib_path[] = CUPTI_LIB_PATH;
#if defined(_WIN32) && defined(PADDLE_WITH_CUDA)
static constexpr char* win_cublas_lib = "cublas64_" PADDLE_CUDA_BINVER ".dll";
static constexpr char* win_curand_lib = "curand64_" PADDLE_CUDA_BINVER ".dll";
static constexpr char* win_cudnn_lib = "cudnn64_" PADDLE_CUDNN_BINVER ".dll";
#endif
static inline std::string join(const std::string& part1, static inline std::string join(const std::string& part1,
const std::string& part2) { const std::string& part2) {
// directory separator // directory separator
...@@ -165,6 +171,8 @@ static inline void* GetDsoHandleFromSearchPath(const std::string& search_root, ...@@ -165,6 +171,8 @@ static inline void* GetDsoHandleFromSearchPath(const std::string& search_root,
void* GetCublasDsoHandle() { void* GetCublasDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__) #if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib"); return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib");
#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_cublas_lib);
#else #else
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so"); return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so");
#endif #endif
...@@ -173,6 +181,8 @@ void* GetCublasDsoHandle() { ...@@ -173,6 +181,8 @@ void* GetCublasDsoHandle() {
void* GetCUDNNDsoHandle() { void* GetCUDNNDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__) #if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", false); return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", false);
#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, win_cudnn_lib);
#else #else
return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", false); return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", false);
#endif #endif
...@@ -193,6 +203,8 @@ void* GetCUPTIDsoHandle() { ...@@ -193,6 +203,8 @@ void* GetCUPTIDsoHandle() {
void* GetCurandDsoHandle() { void* GetCurandDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__) #if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib"); return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib");
#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_curand_lib);
#else #else
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so"); return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so");
#endif #endif
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/timer.h"
namespace paddle {
namespace platform {
void Timer::Reset() {
_start.tv_sec = 0;
_start.tv_usec = 0;
_count = 0;
_elapsed = 0;
_paused = true;
}
void Timer::Start() {
Reset();
Resume();
}
void Timer::Pause() {
if (_paused) {
return;
}
_elapsed += Tickus();
++_count;
_paused = true;
}
void Timer::Resume() {
gettimeofday(&_start, NULL);
_paused = false;
}
int Timer::Count() { return _count; }
double Timer::ElapsedUS() { return static_cast<double>(_elapsed); }
double Timer::ElapsedMS() { return _elapsed / 1000.0; }
double Timer::ElapsedSec() { return _elapsed / 1000000.0; }
int64_t Timer::Tickus() {
gettimeofday(&_now, NULL);
return (_now.tv_sec - _start.tv_sec) * 1000 * 1000L +
(_now.tv_usec - _start.tv_usec);
}
} // namespace platform
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stdlib.h>
#include "paddle/fluid/platform/port.h"
namespace paddle {
namespace platform {
// A Standard Timer implementation for debugging
class Timer {
public:
// a timer class for profiling
// Reset() will be called during initialization
// all timing variables will be set 0 in Reset()
Timer() { Reset(); }
void Reset();
void Start();
void Pause();
// Resume will get current system time
void Resume();
int Count();
// return elapsed time in us
double ElapsedUS();
// return elapsed time in ms
double ElapsedMS();
// return elapsed time in sec
double ElapsedSec();
private:
struct timeval _start;
struct timeval _now;
int _count;
int64_t _elapsed;
bool _paused;
// microseconds elapsed between _start and _now
int64_t Tickus();
};
} // namespace platform
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/timer.h"
#include "gtest/gtest.h"
TEST(Timer, Reset) {
paddle::platform::Timer timeline;
timeline.Start();
sleep(3);
timeline.Pause();
timeline.Reset();
}
TEST(Timer, Start) {
paddle::platform::Timer timeline;
timeline.Start();
sleep(3);
timeline.Pause();
}
TEST(Timer, Pause) {
paddle::platform::Timer timeline;
timeline.Start();
sleep(3);
timeline.Pause();
}
TEST(Timer, Resume) {
paddle::platform::Timer timeline;
timeline.Start();
sleep(3);
timeline.Pause();
timeline.Resume();
}
...@@ -14,7 +14,6 @@ limitations under the License. */ ...@@ -14,7 +14,6 @@ limitations under the License. */
#include "paddle/fluid/pybind/imperative.h" #include "paddle/fluid/pybind/imperative.h"
#include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/imperative/tracer.h" #include "paddle/fluid/imperative/tracer.h"
namespace paddle { namespace paddle {
...@@ -28,9 +27,7 @@ void BindTracer(pybind11::module *m) { ...@@ -28,9 +27,7 @@ void BindTracer(pybind11::module *m) {
framework::BlockDesc *startup_block) { framework::BlockDesc *startup_block) {
new (&self) imperative::Tracer(root_block, startup_block); new (&self) imperative::Tracer(root_block, startup_block);
}) })
.def("trace", &imperative::Tracer::Trace) .def("trace", &imperative::Tracer::Trace);
.def("get_scope", &imperative::Tracer::GetScope,
pybind11::return_value_policy::reference);
} }
} // namespace pybind } // namespace pybind
......
...@@ -84,11 +84,15 @@ bool IsCompiledWithCUDA() { ...@@ -84,11 +84,15 @@ bool IsCompiledWithCUDA() {
} }
bool IsCompiledWithBrpc() { bool IsCompiledWithBrpc() {
#if defined(PADDLE_WITH_BRPC) || defined(PADDLE_WITH_BRPC_RDMA) #ifndef PADDLE_WITH_DISTRIBUTE
return true; return false;
#else #endif
#ifdef PADDLE_WITH_GRPC
return false; return false;
#endif #endif
return true;
} }
bool IsCompiledWithDIST() { bool IsCompiledWithDIST() {
...@@ -124,9 +128,7 @@ PYBIND11_MODULE(core, m) { ...@@ -124,9 +128,7 @@ PYBIND11_MODULE(core, m) {
py::class_<imperative::VarBase, PyVarBase>(m, "VarBase", R"DOC()DOC") py::class_<imperative::VarBase, PyVarBase>(m, "VarBase", R"DOC()DOC")
.def(py::init<>()) .def(py::init<>())
.def("_run_backward", .def("_run_backward",
[](imperative::VarBase &self, framework::Scope *scope) { [](imperative::VarBase &self) { self.RunBackward(); })
self.RunBackward(scope);
})
.def("_grad", &imperative::VarBase::Grad) .def("_grad", &imperative::VarBase::Grad)
.def_property( .def_property(
"desc", "desc",
...@@ -134,6 +136,12 @@ PYBIND11_MODULE(core, m) { ...@@ -134,6 +136,12 @@ PYBIND11_MODULE(core, m) {
[](imperative::VarBase &self, framework::VarDesc *var_desc) { [](imperative::VarBase &self, framework::VarDesc *var_desc) {
self.var_desc_ = var_desc; self.var_desc_ = var_desc;
}, },
py::return_value_policy::reference)
.def_property("var",
[](const imperative::VarBase &self) { return self.var_; },
[](imperative::VarBase &self, framework::Variable *var) {
self.var_ = var;
},
py::return_value_policy::reference); py::return_value_policy::reference);
py::class_<imperative::OpBase, PyOpBase>(m, "OpBase", R"DOC()DOC") py::class_<imperative::OpBase, PyOpBase>(m, "OpBase", R"DOC()DOC")
......
...@@ -28,20 +28,53 @@ int main(int argc, char** argv) { ...@@ -28,20 +28,53 @@ int main(int argc, char** argv) {
for (int i = 0; i < argc; ++i) { for (int i = 0; i < argc; ++i) {
new_argv.push_back(argv[i]); new_argv.push_back(argv[i]);
} }
std::vector<std::string> envs;
std::vector<std::string> undefok;
#if defined(PADDLE_WITH_DISTRIBUTE) && !defined(PADDLE_WITH_GRPC)
envs.push_back("max_body_size");
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
new_argv.push_back( envs.push_back("fraction_of_gpu_memory_to_use");
strdup("--tryfromenv=fraction_of_gpu_memory_to_use,allocator_strategy")); envs.push_back("allocator_strategy");
#elif __clang__ #elif __clang__
new_argv.push_back( envs.push_back("use_mkldnn");
strdup("--tryfromenv=use_mkldnn,initial_cpu_memory_in_" envs.push_back("initial_cpu_memory_in_mb");
"mb,allocator_strategy")); envs.push_back("allocator_strategy");
new_argv.push_back(strdup("--undefok=use_mkldnn,initial_cpu_memory_in_mb"));
undefok.push_back("use_mkldnn");
undefok.push_back("initial_cpu_memory_in_mb");
#else #else
new_argv.push_back( envs.push_back("use_pinned_memory");
strdup("--tryfromenv=use_pinned_memory,use_mkldnn,initial_cpu_memory_in_" envs.push_back("use_mkldnn");
"mb,allocator_strategy")); envs.push_back("initial_cpu_memory_in_mb");
new_argv.push_back(strdup("--undefok=use_mkldnn,initial_cpu_memory_in_mb")); envs.push_back("allocator_strategy");
undefok.push_back("use_mkldnn");
undefok.push_back("initial_cpu_memory_in_mb");
#endif #endif
if (envs.size() > 0) {
std::string env_string = "--tryfromenv=";
for (auto t : envs) {
env_string += t + ",";
}
env_string = env_string.substr(0, env_string.length() - 1);
new_argv.push_back(strdup(env_string.c_str()));
VLOG(1) << "gtest env_string:" << env_string;
}
if (undefok.size() > 0) {
std::string undefok_string = "--undefok=";
for (auto t : undefok) {
undefok_string += t + ",";
}
undefok_string = undefok_string.substr(0, undefok_string.length() - 1);
new_argv.push_back(strdup(undefok_string.c_str()));
VLOG(1) << "gtest undefok_string:" << undefok_string;
}
int new_argc = static_cast<int>(new_argv.size()); int new_argc = static_cast<int>(new_argv.size());
char** new_argv_address = new_argv.data(); char** new_argv_address = new_argv.data();
google::ParseCommandLineFlags(&new_argc, &new_argv_address, false); google::ParseCommandLineFlags(&new_argc, &new_argv_address, false);
......
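For the CUDA build, for example, the loops above assemble new_argv entries equivalent to the old hard-coded string, i.e. `--tryfromenv=fraction_of_gpu_memory_to_use,allocator_strategy`, and since `undefok` stays empty no `--undefok` argument is appended.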
...@@ -151,13 +151,17 @@ def __bootstrap__(): ...@@ -151,13 +151,17 @@ def __bootstrap__():
read_env_flags.append('rpc_get_thread_num') read_env_flags.append('rpc_get_thread_num')
read_env_flags.append('rpc_prefetch_thread_num') read_env_flags.append('rpc_prefetch_thread_num')
read_env_flags.append('rpc_disable_reuse_port') read_env_flags.append('rpc_disable_reuse_port')
if core.is_compiled_with_brpc():
read_env_flags.append('max_body_size')
# set brpc max body size
os.environ['FLAGS_max_body_size'] = "2147483647"
if core.is_compiled_with_cuda(): if core.is_compiled_with_cuda():
read_env_flags += [ read_env_flags += [
'fraction_of_gpu_memory_to_use', 'cudnn_deterministic', 'fraction_of_gpu_memory_to_use', 'cudnn_deterministic',
'enable_cublas_tensor_op_math', 'conv_workspace_size_limit', 'enable_cublas_tensor_op_math', 'conv_workspace_size_limit',
'cudnn_exhaustive_search', 'memory_optimize_debug', 'selected_gpus', 'cudnn_exhaustive_search', 'memory_optimize_debug', 'selected_gpus',
'sync_nccl_allreduce' 'cudnn_exhaustive_search_times', 'sync_nccl_allreduce'
] ]
core.init_gflags([sys.argv[0]] + core.init_gflags([sys.argv[0]] +
......
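As a hedged illustration of how these read_env_flags are consumed: the values are picked up from FLAGS_-prefixed environment variables the first time paddle.fluid is imported (the values below are made up).

```python
import os

# Hypothetical values; any flag listed in read_env_flags above can be set this way.
os.environ['FLAGS_fraction_of_gpu_memory_to_use'] = '0.5'
os.environ['FLAGS_cudnn_exhaustive_search_times'] = '3'

import paddle.fluid as fluid  # __bootstrap__ runs on first import and reads the flags
```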
...@@ -272,8 +272,7 @@ class DataFeeder(object): ...@@ -272,8 +272,7 @@ class DataFeeder(object):
dict: the result of conversion. dict: the result of conversion.
Raises: Raises:
ValueError: If drop_last is False and the data batch which cannot ValueError: If drop_last is False and the data batch cannot fit evenly across the devices.
fit for devices.
""" """
def __reader_creator__(): def __reader_creator__():
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
from __future__ import print_function from __future__ import print_function
import collections import collections
from collections import defaultdict
import contextlib import contextlib
import os import os
import re import re
...@@ -369,13 +370,11 @@ class Variable(object): ...@@ -369,13 +370,11 @@ class Variable(object):
self._ivar.desc = self.desc self._ivar.desc = self.desc
def _numpy(self): def _numpy(self):
scope = _imperative_tracer().get_scope(self.block.desc) tensor = self._ivar.var.get_tensor()
tensor = core.get_variable_tensor(scope, self.desc.name())
return np.array(tensor) return np.array(tensor)
def _backward(self): def _backward(self):
scope = _imperative_tracer().get_scope(self.block.desc) self._ivar._run_backward()
self._ivar._run_backward(scope)
def _gradient(self): def _gradient(self):
return np.array(self._ivar._grad()) return np.array(self._ivar._grad())
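A small sketch of the imperative flow after this change, assuming the guard()/to_variable helpers exported from paddle.fluid.imperative; _numpy(), _backward() and _gradient() now go through self._ivar directly, with no tracer-owned scope.

```python
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.imperative.base import to_variable

with fluid.imperative.guard():                  # assumed imperative-mode context manager
    x = to_variable(np.ones([2, 2], dtype='float32'))
    y = fluid.layers.relu(x)
    print(y._numpy())                           # tensor fetched via y._ivar.var
    y._backward()                               # no explicit scope argument any more
    print(x._gradient())
```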
...@@ -648,20 +647,16 @@ class Operator(object): ...@@ -648,20 +647,16 @@ class Operator(object):
self.desc.set_input(in_proto.name, []) self.desc.set_input(in_proto.name, [])
if outputs is not None: if outputs is not None:
given = set()
need = set()
for n in outputs:
given.add(n)
for m in proto.outputs: for m in proto.outputs:
need.add(m.name) if (m.name not in outputs) and m.dispensable:
if not given == need: continue
raise ValueError(("Incorrect setting for output(s) of " if not ((m.name in outputs) or m.dispensable):
"operator \"%s\". Need: [%s] Given: [%s]") % raise ValueError(
(type, ("Incorrect setting for output(s) of "
", ".join(six.binary_type(e) for e in need), "operator \"%s\", should set: [%s].") % (type, m.name))
", ".join(six.binary_type(e) for e in given)))
for out_proto in proto.outputs: for out_proto in proto.outputs:
if out_proto.name not in outputs:
continue
out_args = outputs[out_proto.name] out_args = outputs[out_proto.name]
if not isinstance(out_args, list): if not isinstance(out_args, list):
out_args = [out_args] out_args = [out_args]
...@@ -692,20 +687,20 @@ class Operator(object): ...@@ -692,20 +687,20 @@ class Operator(object):
if _in_imperative_mode(): if _in_imperative_mode():
self.iop = core.OpBase() self.iop = core.OpBase()
self.iop.desc = self.desc self.iop.desc = self.desc
self.inputs = [] self.inputs = defaultdict(list)
if inputs is not None: if inputs is not None:
for inp in inputs.values(): for k, v in six.iteritems(inputs):
if isinstance(inp, Variable): if isinstance(v, Variable):
self.inputs.append(inp) self.inputs[k].append(v._ivar)
elif isinstance(inp, list) or isinstance(inp, tuple): elif isinstance(v, list) or isinstance(v, tuple):
self.inputs.extend(inp[:]) self.inputs[k].extend([var._ivar for var in v])
self.outputs = [] self.outputs = defaultdict(list)
if outputs is not None: if outputs is not None:
for out in outputs.values(): for k, v in six.iteritems(outputs):
if isinstance(out, Variable): if isinstance(v, Variable):
self.outputs.append(out) self.outputs[k].append(v._ivar)
elif isinstance(out, list) or isinstance(out, tuple): elif isinstance(v, list) or isinstance(v, tuple):
self.outputs.extend(out[:]) self.outputs[k].extend([var._ivar for var in v])
def _has_kernel(self, op_type): def _has_kernel(self, op_type):
return op_type not in self.OP_WITHOUT_KERNEL_SET return op_type not in self.OP_WITHOUT_KERNEL_SET
...@@ -1273,8 +1268,7 @@ class Block(object): ...@@ -1273,8 +1268,7 @@ class Block(object):
op_desc = self.desc.append_op() op_desc = self.desc.append_op()
op = Operator(block=self, desc=op_desc, *args, **kwargs) op = Operator(block=self, desc=op_desc, *args, **kwargs)
if _in_imperative_mode(): if _in_imperative_mode():
_imperative_tracer().trace(op.iop, [v._ivar for v in op.inputs], _imperative_tracer().trace(op.iop, op.inputs, op.outputs, self.desc)
[v._ivar for v in op.outputs], self.desc)
self.ops.append(op) self.ops.append(op)
return op return op
...@@ -1325,8 +1319,7 @@ class Block(object): ...@@ -1325,8 +1319,7 @@ class Block(object):
op_desc = self.desc._prepend_op() op_desc = self.desc._prepend_op()
op = Operator(self, op_desc, *args, **kwargs) op = Operator(self, op_desc, *args, **kwargs)
if _in_imperative_mode(): if _in_imperative_mode():
_imperative_tracer().trace(op.iop, [v._ivar for v in op.inputs], _imperative_tracer().trace(op.iop, op.inputs, op.outputs, self.desc)
[v._ivar for v in op.outputs], self.desc)
self.ops.insert(0, op) self.ops.insert(0, op)
return op return op
...@@ -1641,8 +1634,8 @@ class Program(object): ...@@ -1641,8 +1634,8 @@ class Program(object):
parameters, e.g., :code:`trainable`, :code:`optimize_attr`, need parameters, e.g., :code:`trainable`, :code:`optimize_attr`, need
to print. to print.
Returns Returns:
(str): The debug string. str : The debug string.
Raises: Raises:
ValueError: If any of required fields is not set and throw_on_error is ValueError: If any of required fields is not set and throw_on_error is
......
...@@ -46,8 +46,7 @@ def to_variable(value, block=None): ...@@ -46,8 +46,7 @@ def to_variable(value, block=None):
name=None, name=None,
shape=value.shape, shape=value.shape,
dtype=value.dtype) dtype=value.dtype)
scope = framework._imperative_tracer().get_scope(block.desc) var = py_var._ivar.var
var = scope.var(py_var.name)
tensor = var.get_tensor() tensor = var.get_tensor()
tensor.set(value, core.CPUPlace()) tensor.set(value, core.CPUPlace())
return py_var return py_var
......
...@@ -20,7 +20,7 @@ import six ...@@ -20,7 +20,7 @@ import six
import sys import sys
import numpy as np import numpy as np
from .framework import Variable, Parameter, default_main_program, default_startup_program, dtype_is_floating from .framework import Variable, Parameter, default_main_program, default_startup_program, dtype_is_floating, _in_imperative_mode
from . import unique_name from . import unique_name
from paddle.fluid.initializer import Constant, Xavier from paddle.fluid.initializer import Constant, Xavier
from paddle.fluid.imperative import base from paddle.fluid.imperative import base
...@@ -313,9 +313,20 @@ class LayerHelper(object): ...@@ -313,9 +313,20 @@ class LayerHelper(object):
param = self._create_weight_normalize(attr, shape, dtype) param = self._create_weight_normalize(attr, shape, dtype)
WeightNormParamAttr.params_with_weight_norm.append(param) WeightNormParamAttr.params_with_weight_norm.append(param)
return param return param
if _in_imperative_mode():
self.main_program.global_block().create_parameter(
dtype=dtype, shape=shape, **attr._to_kwargs())
# In imperative mode, we want the returned parameter to be
# initialized so that it can be used imperatively.
return self.startup_program.global_block().create_parameter(
dtype=dtype,
shape=shape,
**attr._to_kwargs(with_initializer=True))
else:
self.startup_program.global_block().create_parameter( self.startup_program.global_block().create_parameter(
dtype=dtype, shape=shape, **attr._to_kwargs(with_initializer=True)) dtype=dtype,
shape=shape,
**attr._to_kwargs(with_initializer=True))
return self.main_program.global_block().create_parameter( return self.main_program.global_block().create_parameter(
dtype=dtype, shape=shape, **attr._to_kwargs()) dtype=dtype, shape=shape, **attr._to_kwargs())
......
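A hypothetical illustration of why the imperative branch returns the startup-block parameter: a layer created under the imperative guard can be run immediately because its weights are already initialized (fc being traceable in this mode is an assumption here, not something this diff states).

```python
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.imperative.base import to_variable

with fluid.imperative.guard():
    x = to_variable(np.random.rand(3, 4).astype('float32'))
    y = fluid.layers.fc(input=x, size=5)  # parameters are created and initialized eagerly
    print(y._numpy())
```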
...@@ -1452,6 +1452,7 @@ class DynamicRNN(object): ...@@ -1452,6 +1452,7 @@ class DynamicRNN(object):
def step_input(self, x): def step_input(self, x):
""" """
Mark a sequence as a dynamic RNN input. Mark a sequence as a dynamic RNN input.
Args: Args:
x(Variable): The input sequence. x(Variable): The input sequence.
...@@ -1505,6 +1506,7 @@ class DynamicRNN(object): ...@@ -1505,6 +1506,7 @@ class DynamicRNN(object):
""" """
Mark a variable as a RNN input. The input will not be scattered into Mark a variable as a RNN input. The input will not be scattered into
time steps. time steps.
Args: Args:
x(Variable): The input variable. x(Variable): The input variable.
...@@ -1629,13 +1631,11 @@ class DynamicRNN(object): ...@@ -1629,13 +1631,11 @@ class DynamicRNN(object):
Args: Args:
init(Variable|None): The initialized variable. init(Variable|None): The initialized variable.
shape(list|tuple): The memory shape. NOTE the shape does not contain shape(list|tuple): The memory shape. NOTE the shape does not contain batch_size.
batch_size.
value(float): the initialized value. value(float): the initialized value.
need_reorder(bool): True if the initialized memory depends on the need_reorder(bool): True if the initialized memory depends on the input sample.
input sample.
dtype(str|numpy.dtype): The data type of the initialized memory. dtype(str|numpy.dtype): The data type of the initialized memory.
...@@ -1714,6 +1714,7 @@ class DynamicRNN(object): ...@@ -1714,6 +1714,7 @@ class DynamicRNN(object):
""" """
Update the memory from ex_mem to new_mem. NOTE that the shape and data Update the memory from ex_mem to new_mem. NOTE that the shape and data
type of :code:`ex_mem` and :code:`new_mem` must be same. type of :code:`ex_mem` and :code:`new_mem` must be same.
Args: Args:
ex_mem(Variable): the memory variable. ex_mem(Variable): the memory variable.
new_mem(Variable): the plain variable generated in RNN block. new_mem(Variable): the plain variable generated in RNN block.
......
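A compact usage sketch of the DynamicRNN pieces documented above (step_input, memory, update_memory); the names and sizes are illustrative only.

```python
import paddle.fluid as fluid

sentence = fluid.layers.data(name='sentence', shape=[32], dtype='float32', lod_level=1)

drnn = fluid.layers.DynamicRNN()
with drnn.block():
    word = drnn.step_input(sentence)             # one time step of the input sequence
    prev = drnn.memory(shape=[200], value=0.0)   # zero-initialized state; shape has no batch_size
    hidden = fluid.layers.fc(input=[word, prev], size=200, act='tanh')
    drnn.update_memory(prev, hidden)             # ex_mem and new_mem share shape and dtype
    drnn.output(hidden)

last = fluid.layers.sequence_last_step(drnn())
```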
...@@ -65,7 +65,7 @@ def rpn_target_assign(bbox_pred, ...@@ -65,7 +65,7 @@ def rpn_target_assign(bbox_pred,
rpn_negative_overlap=0.3, rpn_negative_overlap=0.3,
use_random=True): use_random=True):
""" """
** Target Assign Layer for region proposal network (RPN) in Faster-RCNN detection. ** **Target Assign Layer for region proposal network (RPN) in Faster-RCNN detection.**
This layer can be, for given the Intersection-over-Union (IoU) overlap This layer can be, for given the Intersection-over-Union (IoU) overlap
between anchors and ground truth boxes, to assign classification and between anchors and ground truth boxes, to assign classification and
...@@ -148,6 +148,7 @@ def rpn_target_assign(bbox_pred, ...@@ -148,6 +148,7 @@ def rpn_target_assign(bbox_pred,
cls_logits=cls_logits, cls_logits=cls_logits,
anchor_box=anchor_box, anchor_box=anchor_box,
gt_boxes=gt_boxes) gt_boxes=gt_boxes)
""" """
helper = LayerHelper('rpn_target_assign', **locals()) helper = LayerHelper('rpn_target_assign', **locals())
...@@ -1525,20 +1526,23 @@ def anchor_generator(input, ...@@ -1525,20 +1526,23 @@ def anchor_generator(input,
anchors, e.g. [0.5, 1.0, 2.0]. anchors, e.g. [0.5, 1.0, 2.0].
variance(list|tuple): The variances to be used in box regression deltas. variance(list|tuple): The variances to be used in box regression deltas.
Default:[0.1, 0.1, 0.2, 0.2]. Default:[0.1, 0.1, 0.2, 0.2].
stride(list|tuple): The anchors stride across width and height, stride(list|tuple): The anchors stride across width and height, e.g. [16.0, 16.0]
e.g. [16.0, 16.0]
offset(float): Prior boxes center offset. Default: 0.5 offset(float): Prior boxes center offset. Default: 0.5
name(str): Name of the prior box op. Default: None. name(str): Name of the prior box op. Default: None.
Returns: Returns:
Anchors(Variable): The output anchors with a layout of [H, W, num_anchors, 4]. Anchors(Variable),Variances(Variable):
H is the height of input, W is the width of input,
num_anchors is the box count of each position. two variables:
- Anchors(Variable): The output anchors with a layout of [H, W, num_anchors, 4]. \
H is the height of input, W is the width of input, \
num_anchors is the box count of each position. \
Each anchor is in (xmin, ymin, xmax, ymax) format and unnormalized. Each anchor is in (xmin, ymin, xmax, ymax) format and unnormalized.
Variances(Variable): The expanded variances of anchors - Variances(Variable): The expanded variances of anchors \
with a layout of [H, W, num_priors, 4]. with a layout of [H, W, num_priors, 4]. \
H is the height of input, W is the width of input H is the height of input, W is the width of input \
num_anchors is the box count of each position. num_anchors is the box count of each position. \
Each variance is in (xcenter, ycenter, w, h) format. Each variance is in (xcenter, ycenter, w, h) format.
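A short sketch of the documented return layout, using made-up feature-map and anchor settings.

```python
import paddle.fluid as fluid

image = fluid.layers.data(name='image', shape=[3, 320, 320], dtype='float32')
conv = fluid.layers.conv2d(input=image, num_filters=48, filter_size=3, padding=1)

# With 4 sizes x 3 ratios there are 12 anchors per position, so
# anchor.shape == [H, W, 12, 4] and var.shape == [H, W, 12, 4].
anchor, var = fluid.layers.anchor_generator(
    input=conv,
    anchor_sizes=[64, 128, 256, 512],
    aspect_ratios=[0.5, 1.0, 2.0],
    variance=[0.1, 0.1, 0.2, 0.2],
    stride=[16.0, 16.0],
    offset=0.5)
```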
...@@ -1748,7 +1752,7 @@ def generate_proposals(scores, ...@@ -1748,7 +1752,7 @@ def generate_proposals(scores,
eta=1.0, eta=1.0,
name=None): name=None):
""" """
** Generate proposal Faster-RCNN ** **Generate proposal Faster-RCNN**
This operation proposes RoIs according to each box with its probability of being a foreground object, and This operation proposes RoIs according to each box with its probability of being a foreground object, and
the box can be calculated by anchors. Bbox_deltas and the scores of boxes being an object are the output of RPN. Final proposals the box can be calculated by anchors. Bbox_deltas and the scores of boxes being an object are the output of RPN. Final proposals
...@@ -1762,7 +1766,6 @@ def generate_proposals(scores, ...@@ -1762,7 +1766,6 @@ def generate_proposals(scores,
4. Remove predicted boxes with small area. 4. Remove predicted boxes with small area.
5. Apply NMS to get final proposals as output. 5. Apply NMS to get final proposals as output.
Args: Args:
scores(Variable): A 4-D Tensor with shape [N, A, H, W] represents the probability for each box to be an object. scores(Variable): A 4-D Tensor with shape [N, A, H, W] represents the probability for each box to be an object.
N is batch size, A is number of anchors, H and W are height and width of the feature map. N is batch size, A is number of anchors, H and W are height and width of the feature map.
...@@ -1777,6 +1780,7 @@ def generate_proposals(scores, ...@@ -1777,6 +1780,7 @@ def generate_proposals(scores,
nms_thresh(float): Threshold in NMS, 0.5 by default. nms_thresh(float): Threshold in NMS, 0.5 by default.
min_size(float): Remove predicted boxes with either height or width < min_size. 0.1 by default. min_size(float): Remove predicted boxes with either height or width < min_size. 0.1 by default.
eta(float): Apply in adaptive NMS, if adaptive threshold > 0.5, adaptive_threshold = adaptive_threshold * eta in each iteration. eta(float): Apply in adaptive NMS, if adaptive threshold > 0.5, adaptive_threshold = adaptive_threshold * eta in each iteration.
""" """
helper = LayerHelper('generate_proposals', **locals()) helper = LayerHelper('generate_proposals', **locals())
......
...@@ -949,12 +949,11 @@ def shuffle(reader, buffer_size): ...@@ -949,12 +949,11 @@ def shuffle(reader, buffer_size):
is determined by argument buf_size. is determined by argument buf_size.
Args: Args:
param reader: the original reader whose output will be shuffled. reader(callable): the original reader whose output will be shuffled.
type reader: callable buf_size(int): shuffle buffer size.
param buf_size: shuffle buffer size.
type buf_size: int Returns:
return: the new reader whose output is shuffled. callable: the new reader whose output is shuffled.
rtype: callable
""" """
return __create_unshared_decorated_reader__( return __create_unshared_decorated_reader__(
'create_shuffle_reader', reader, {'buffer_size': int(buffer_size)}) 'create_shuffle_reader', reader, {'buffer_size': int(buffer_size)})
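As a side note, the buffering behaviour documented above can be sketched in a few lines of plain Python. This is only an illustration of the semantics (a reader is assumed to be any no-argument callable returning an iterable); the real operator is created by create_shuffle_reader on the C++ side.

.. code-block:: python

    import random

    def shuffle_reader(reader, buf_size):
        def decorated():
            buf = []
            for item in reader():
                buf.append(item)
                if len(buf) >= buf_size:
                    random.shuffle(buf)
                    for b in buf:
                        yield b
                    buf = []
            # flush the remaining items in the buffer
            random.shuffle(buf)
            for b in buf:
                yield b
        return decorated

    shuffled = shuffle_reader(lambda: iter(range(10)), buf_size=4)
    print(list(shuffled()))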
......
...@@ -233,7 +233,7 @@ def fc(input, ...@@ -233,7 +233,7 @@ def fc(input,
dimensions will be flatten to form the first dimension of the final matrix (height of dimensions will be flatten to form the first dimension of the final matrix (height of
the matrix), and the rest `rank(X) - num_flatten_dims` dimensions are flattened to the matrix), and the rest `rank(X) - num_flatten_dims` dimensions are flattened to
form the second dimension of the final matrix (width of the matrix). For example, suppose form the second dimension of the final matrix (width of the matrix). For example, suppose
`X` is a 6-dimensional tensor with a shape [2, 3, 4, 5, 6], and `num_flatten_dims` = 3. `X` is a 5-dimensional tensor with a shape [2, 3, 4, 5, 6], and `num_flatten_dims` = 3.
Then, the flattened matrix will have a shape [2 x 3 x 4, 5 x 6] = [24, 30]. Then, the flattened matrix will have a shape [2 x 3 x 4, 5 x 6] = [24, 30].
param_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for learnable param_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for learnable
parameters/weights of this layer. parameters/weights of this layer.
...@@ -505,31 +505,33 @@ def lstm(input, ...@@ -505,31 +505,33 @@ def lstm(input,
In the forward pass the output ht and cell output ct for a given iteration can be computed from the recurrent input ht-1, In the forward pass the output ht and cell output ct for a given iteration can be computed from the recurrent input ht-1,
the cell input ct-1 and the previous layer input xt given matrices W, R and biases bW, bR from the following equations: the cell input ct-1 and the previous layer input xt given matrices W, R and biases bW, bR from the following equations:
$$ i_t = \\sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + bx_i + bh_i) $$ .. math::
i_t &= \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + bx_i + bh_i)
$$ f_t = \\sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + bx_f + bh_f) $$ f_t &= \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + bx_f + bh_f)
$$ o_t = \\sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + bx_o + bh_o) $$ o_t &= \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + bx_o + bh_o)
$$ \\tilde{c_t} = tanh(W_{cx}x_t + W_{ch}h_{t-1} + bx_c + bh_c) $$ \\tilde{c_t} &= tanh(W_{cx}x_t + W_{ch}h_{t-1} + bx_c + bh_c)
$$ c_t = f_t \\odot c_{t-1} + i_t \\odot \\tilde{c_t} $$ c_t &= f_t \odot c_{t-1} + i_t \odot \\tilde{c_t}
$$ h_t = o_t \\odot tanh(c_t) $$ h_t &= o_t \odot tanh(c_t)
- W terms denote weight matrices (e.g. $W_{ix}$ is the matrix - $W$ terms denote weight matrices (e.g. $W_{ix}$ is the matrix
of weights from the input gate to the input) of weights from the input gate to the input)
- The b terms denote bias vectors ($bx_i$ and $bh_i$ are the input gate bias vector). - The b terms denote bias vectors ($bx_i$ and $bh_i$ are the input gate bias vector).
- sigmoid is the logistic sigmoid function. - sigmoid is the logistic sigmoid function.
- $i, f, o$ and $c$ are the input gate, forget gate, output gate, - $i, f, o$ and $c$ are the input gate, forget gate, output gate,
and cell activation vectors, respectively, all of which have the same size as and cell activation vectors, respectively, all of which have the same size as
the cell output activation vector $h$. the cell output activation vector $h$.
- The $\odot$ is the element-wise product of the vectors. - The :math:`\odot` is the element-wise product of the vectors.
- `tanh` is the activation function. - :math:`tanh` is the activation function.
- $\tilde{c_t}$ is also called candidate hidden state, - :math:`\\tilde{c_t}` is also called candidate hidden state,
which is computed based on the current input and the previous hidden state. which is computed based on the current input and the previous hidden state.
Where sigmoid is the sigmoid operator: sigmoid(x) = 1 / (1 + e^-x), * represents a point-wise multiplication, Where sigmoid is the sigmoid operator: :math:`sigmoid(x) = 1 / (1 + e^{-x})` , * represents a point-wise multiplication,
X represents a matrix multiplication X represents a matrix multiplication
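To make the gate equations above concrete, here is a minimal numpy sketch of a single forward step for one layer. The shapes and the combined [W_x | W_h] weight layout are illustrative assumptions, not the layout used by the cudnn-backed lstm op.

.. code-block:: python

    import numpy as np

    def sigmoid(z):
        return 1.0 / (1.0 + np.exp(-z))

    hidden, inp = 4, 3
    rng = np.random.RandomState(0)
    W = {g: rng.randn(hidden, inp + hidden) for g in 'ifoc'}  # one [W_x | W_h] per gate
    b = {g: np.zeros(hidden) for g in 'ifoc'}

    x_t = rng.randn(inp)
    h_prev, c_prev = np.zeros(hidden), np.zeros(hidden)
    xh = np.concatenate([x_t, h_prev])

    i_t = sigmoid(W['i'].dot(xh) + b['i'])        # input gate
    f_t = sigmoid(W['f'].dot(xh) + b['f'])        # forget gate
    o_t = sigmoid(W['o'].dot(xh) + b['o'])        # output gate
    c_tilde = np.tanh(W['c'].dot(xh) + b['c'])    # candidate cell state
    c_t = f_t * c_prev + i_t * c_tilde
    h_t = o_t * np.tanh(c_t)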
...@@ -556,13 +558,17 @@ def lstm(input, ...@@ -556,13 +558,17 @@ def lstm(input,
Returns: Returns:
rnn_out(Tensor): result of LSTM hidden, shape is (seq_len x batch_size x hidden_size) rnn_out(Tensor),last_h(Tensor),last_c(Tensor):
Three tensors, rnn_out, last_h, last_c:
- rnn_out is result of LSTM hidden, shape is (seq_len x batch_size x hidden_size) \
if is_bidirec set to True, shape will be ( seq_len x batch_size x hidden_size*2) if is_bidirec set to True, shape will be ( seq_len x batch_size x hidden_size*2)
last_h(Tensor): the hidden state of the last step of LSTM - last_h is the hidden state of the last step of LSTM \
shape is ( num_layers x batch_size x hidden_size ) shape is ( num_layers x batch_size x hidden_size ) \
if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size) if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size)
last_c(Tensor): the cell state of the last step of LSTM - last_c(Tensor): the cell state of the last step of LSTM \
shape is ( num_layers x batch_size x hidden_size ) shape is ( num_layers x batch_size x hidden_size ) \
if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size) if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size)
...@@ -1220,6 +1226,8 @@ def dropout(x, ...@@ -1220,6 +1226,8 @@ def dropout(x,
probability) the outputs of some units to zero, while others remain probability) the outputs of some units to zero, while others remain
unchanged. unchanged.
dropout op can be removed from the program to make the program more efficient.
Args: Args:
x (Variable): The input tensor variable. x (Variable): The input tensor variable.
dropout_prob (float): Probability of setting units to zero. dropout_prob (float): Probability of setting units to zero.
...@@ -1230,20 +1238,22 @@ def dropout(x, ...@@ -1230,20 +1238,22 @@ def dropout(x,
units will be dropped. DO NOT use a fixed seed in training. units will be dropped. DO NOT use a fixed seed in training.
name (str|None): A name for this layer(optional). If set None, the layer name (str|None): A name for this layer(optional). If set None, the layer
will be named automatically. will be named automatically.
dropout_implementation(string): ['downgrade_in_infer'(defauld)|'upscale_in_train'] dropout_implementation(string): ['downgrade_in_infer'(default)|'upscale_in_train']
1. downgrade_in_infer(default), downgrade the outcome at inference 1. downgrade_in_infer(default), downgrade the outcome at inference
train: out = input * mask
inference: out = input * dropout_prob - train: out = input * mask
(make is a tensor same shape with input, value is 0 or 1 - inference: out = input * dropout_prob
(mask is a tensor same shape with input, value is 0 or 1
ratio of 0 is dropout_prob) ratio of 0 is dropout_prob)
2. upscale_in_train, upscale the outcome at training time 2. upscale_in_train, upscale the outcome at training time
train: out = input * mask / ( 1.0 - dropout_prob )
inference: out = input
(make is a tensor same shape with input, value is 0 or 1
ratio of 0 is dropout_prob)
dropout op can be removed from the program.
the program will be efficient
- train: out = input * mask / ( 1.0 - dropout_prob )
- inference: out = input
(mask is a tensor same shape with input, value is 0 or 1
ratio of 0 is dropout_prob)
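A direct numpy transcription of the two formula sets above, just to show how the modes differ at training and inference time (illustrative only, not the device kernel):

.. code-block:: python

    import numpy as np

    def make_mask(shape, dropout_prob, seed=0):
        # 0/1 mask whose ratio of zeros is roughly dropout_prob
        rng = np.random.RandomState(seed)
        return (rng.rand(*shape) >= dropout_prob).astype('float32')

    def dropout_downgrade_in_infer(x, mask, dropout_prob, is_test):
        # train: out = input * mask; inference: out = input * dropout_prob
        return x * dropout_prob if is_test else x * mask

    def dropout_upscale_in_train(x, mask, dropout_prob, is_test):
        # train: out = input * mask / (1 - dropout_prob); inference: out = input
        return x if is_test else x * mask / (1.0 - dropout_prob)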
Returns: Returns:
...@@ -1333,11 +1343,15 @@ def cross_entropy(input, label, soft_label=False, ignore_index=kIgnoreIndex): ...@@ -1333,11 +1343,15 @@ def cross_entropy(input, label, soft_label=False, ignore_index=kIgnoreIndex):
A 2-D tensor with shape [N x 1], the cross entropy loss. A 2-D tensor with shape [N x 1], the cross entropy loss.
Raises: Raises:
`ValueError`: 1) the 1st dimension of `input` and `label` are not equal. ValueError:
2) when `soft_label == True`, and the 2nd dimension of
`input` and `label` are not equal. 1. the 1st dimension of ``input`` and ``label`` are not equal.
3) when `soft_label == False`, and the 2nd dimension of
`label` is not 1. 2. when ``soft_label == True``, and the 2nd dimension of
``input`` and ``label`` are not equal.
3. when ``soft_label == False``, and the 2nd dimension of
``label`` is not 1.
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -1458,7 +1472,7 @@ def chunk_eval(input, ...@@ -1458,7 +1472,7 @@ def chunk_eval(input,
F1-score of chunk detection. F1-score of chunk detection.
For some basics of chunking, please refer to For some basics of chunking, please refer to
'Chunking with Support Vector Machines <https://aclanthology.info/pdf/N/N01/N01-1025.pdf>'. `Chunking with Support Vector Machines <https://aclanthology.info/pdf/N/N01/N01-1025.pdf>`_ .
ChunkEvalOp computes the precision, recall, and F1-score of chunk detection, ChunkEvalOp computes the precision, recall, and F1-score of chunk detection,
and supports IOB, IOE, IOBES and IO (also known as plain) tagging schemes. and supports IOB, IOE, IOBES and IO (also known as plain) tagging schemes.
...@@ -2292,7 +2306,8 @@ def sequence_slice(input, offset, length, name=None): ...@@ -2292,7 +2306,8 @@ def sequence_slice(input, offset, length, name=None):
out.lod = [[2, 1]], out.lod = [[2, 1]],
out.dims = (3, 2). out.dims = (3, 2).
NOTE: The first dimension size of **input**, **offset** and **length** Note:
The first dimension size of **input**, **offset** and **length**
should be equal. The **offset** should start from 0. should be equal. The **offset** should start from 0.
Args: Args:
...@@ -2570,12 +2585,7 @@ def adaptive_pool2d(input, ...@@ -2570,12 +2585,7 @@ def adaptive_pool2d(input,
raise ValueError( raise ValueError(
"invalid setting 'require_index' true when 'pool_type' is 'avg'.") "invalid setting 'require_index' true when 'pool_type' is 'avg'.")
def _is_list_or_tuple_(data): pool_size = utils.convert_to_list(pool_size, 2, 'pool_size')
return (isinstance(data, list) or isinstance(data, tuple))
if not _is_list_or_tuple_(pool_size) or len(pool_size) != 2:
raise ValueError(
"'pool_size' should be a list or tuple with length as 2.")
if pool_type == "max": if pool_type == "max":
l_type = 'max_pool2d_with_index' l_type = 'max_pool2d_with_index'
...@@ -2671,12 +2681,7 @@ def adaptive_pool3d(input, ...@@ -2671,12 +2681,7 @@ def adaptive_pool3d(input,
raise ValueError( raise ValueError(
"invalid setting 'require_index' true when 'pool_type' is 'avg'.") "invalid setting 'require_index' true when 'pool_type' is 'avg'.")
def _is_list_or_tuple_(data): pool_size = utils.convert_to_list(pool_size, 3, 'pool_size')
return (isinstance(data, list) or isinstance(data, tuple))
if not _is_list_or_tuple_(pool_size) or len(pool_size) != 3:
raise ValueError(
"'pool_size' should be a list or tuple with length as 3.")
if pool_type == "max": if pool_type == "max":
l_type = 'max_pool3d_with_index' l_type = 'max_pool3d_with_index'
...@@ -3013,7 +3018,7 @@ def group_norm(input, ...@@ -3013,7 +3018,7 @@ def group_norm(input,
""" """
**Group Normalization Layer** **Group Normalization Layer**
Refer to `Group Normalization <https://arxiv.org/abs/1803.08494>` Refer to `Group Normalization <https://arxiv.org/abs/1803.08494>`_ .
Args: Args:
input(Variable): The input tensor variable. input(Variable): The input tensor variable.
...@@ -3140,8 +3145,8 @@ def conv2d_transpose(input, ...@@ -3140,8 +3145,8 @@ def conv2d_transpose(input,
H^\prime_{out} &= (H_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (H_f - 1) + 1 \\\\ H^\prime_{out} &= (H_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (H_f - 1) + 1 \\\\
W^\prime_{out} &= (W_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (W_f - 1) + 1 \\\\ W^\prime_{out} &= (W_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (W_f - 1) + 1 \\\\
H_{out} \in [ H^\prime_{out}, H^\prime_{out} + strides[0] ) \\\\ H_{out} &\in [ H^\prime_{out}, H^\prime_{out} + strides[0] ) \\\\
W_{out} \in [ W^\prime_{out}, W^\prime_{out} + strides[1] ) W_{out} &\in [ W^\prime_{out}, W^\prime_{out} + strides[1] )
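A tiny helper that evaluates the formulas above for one spatial dimension makes the admissible output range easy to check (hypothetical numbers, independent of the operator itself):

.. code-block:: python

    def deconv_out_range(in_size, stride, padding, dilation, filter_size):
        # H'_out = (H_in - 1) * stride - 2 * padding + dilation * (H_f - 1) + 1
        lo = (in_size - 1) * stride - 2 * padding + dilation * (filter_size - 1) + 1
        # H_out may be any value in [H'_out, H'_out + stride)
        return lo, lo + stride

    print(deconv_out_range(in_size=7, stride=2, padding=1, dilation=1, filter_size=3))
    # (13, 15): any output height in [13, 15) is consistent with these settings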
Args: Args:
input(Variable): The input image with [N, C, H, W] format. input(Variable): The input image with [N, C, H, W] format.
...@@ -4704,9 +4709,9 @@ def ctc_greedy_decoder(input, blank, name=None): ...@@ -4704,9 +4709,9 @@ def ctc_greedy_decoder(input, blank, name=None):
name (str): The name of this layer. It is optional. name (str): The name of this layer. It is optional.
Returns: Returns:
Variable: CTC greedy decode result which is a 2-D tensor with shape [Lp, 1]. Variable: CTC greedy decode result which is a 2-D tensor with shape [Lp, 1]. \
'Lp' is the sum of all output sequences' length. If all the sequences 'Lp' is the sum of all output sequences' length. If all the sequences \
in result were empty, the result LoDTensor will be [-1] with in result were empty, the result LoDTensor will be [-1] with \
LoD [[]] and dims [1, 1]. LoD [[]] and dims [1, 1].
Examples: Examples:
...@@ -5072,6 +5077,7 @@ def hsigmoid(input, ...@@ -5072,6 +5077,7 @@ def hsigmoid(input,
<http://www.iro.umontreal.ca/~lisa/pointeurs/hierarchical-nnlm-aistats05.pdf>`_ <http://www.iro.umontreal.ca/~lisa/pointeurs/hierarchical-nnlm-aistats05.pdf>`_
And if you want to use the customized tree by setting 'is_custom' as true, you may need to do the following things first: And if you want to use the customized tree by setting 'is_custom' as true, you may need to do the following things first:
1. using your word dict to build a binary tree, each leaf node should be a word of your word dict 1. using your word dict to build a binary tree, each leaf node should be a word of your word dict
2. build a dict to store word_id -> word's leaf to root path, we call it path_table. 2. build a dict to store word_id -> word's leaf to root path, we call it path_table.
3. build a dict to store word_id -> code of word's leaf to root path, we call it path_code. Code 3. build a dict to store word_id -> code of word's leaf to root path, we call it path_code. Code
...@@ -5079,7 +5085,6 @@ def hsigmoid(input, ...@@ -5079,7 +5085,6 @@ def hsigmoid(input,
4. now, each word should have its path and code along the path, you can pass a batch of path and code 4. now, each word should have its path and code along the path, you can pass a batch of path and code
related to the same batch of inputs. related to the same batch of inputs.
Args: Args:
input (Variable): The input tensor variable with shape input (Variable): The input tensor variable with shape
:math:`[N \\times D]`, where :math:`N` is the size of mini-batch, :math:`[N \\times D]`, where :math:`N` is the size of mini-batch,
...@@ -5485,11 +5490,11 @@ def softmax_with_cross_entropy(logits, ...@@ -5485,11 +5490,11 @@ def softmax_with_cross_entropy(logits,
.. math:: .. math::
max_j = \\max_{i=0}^{K}{\\text{logit}_i} max_j &= \\max_{i=0}^{K}{\\text{logit}_i}
log\\_max\\_sum_j = \\log\\sum_{i=0}^{K}\\exp(logit_i - max_j) log\\_max\\_sum_j &= \\log\\sum_{i=0}^{K}\\exp(logit_i - max_j)
softmax_j = \\exp(logit_j - max_j - {log\\_max\\_sum}_j) softmax_j &= \\exp(logit_j - max_j - {log\\_max\\_sum}_j)
and then cross entropy loss is calculated by softmax and label. and then cross entropy loss is calculated by softmax and label.
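For hard labels, the numerically stable evaluation described by the equations above can be sketched in numpy as follows (this mirrors the math only; the fused operator runs on device and also handles soft labels):

.. code-block:: python

    import numpy as np

    def stable_softmax_cross_entropy(logits, label):
        # logits: [N, K]; label: [N] integer class ids
        max_j = logits.max(axis=1, keepdims=True)
        log_sum = np.log(np.exp(logits - max_j).sum(axis=1, keepdims=True))
        log_softmax = logits - max_j - log_sum
        loss = -log_softmax[np.arange(len(label)), label].reshape(-1, 1)  # [N, 1]
        return loss, np.exp(log_softmax)

    loss, softmax = stable_softmax_cross_entropy(
        np.array([[2.0, 1.0, 0.1]]), np.array([0]))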
...@@ -5515,10 +5520,10 @@ def softmax_with_cross_entropy(logits, ...@@ -5515,10 +5520,10 @@ def softmax_with_cross_entropy(logits,
along with the cross entropy loss. Default: False along with the cross entropy loss. Default: False
Returns: Returns:
Variable or Tuple of two Variables: Return the cross entropy loss if Variable or Tuple of two Variables: Return the cross entropy loss if \
`return_softmax` is False, otherwise the tuple `return_softmax` is False, otherwise the tuple \
(loss, softmax), where the cross entropy loss is (loss, softmax), where the cross entropy loss is \
a 2-D tensor with shape [N x 1], and softmax is a a 2-D tensor with shape [N x 1], and softmax is a \
2-D tensor with shape [N x K]. 2-D tensor with shape [N x K].
Examples: Examples:
...@@ -5792,15 +5797,21 @@ def squeeze(input, axes, name=None): ...@@ -5792,15 +5797,21 @@ def squeeze(input, axes, name=None):
the single dimensions will be removed from the shape. If an axis is the single dimensions will be removed from the shape. If an axis is
selected with shape entry not equal to one, an error is raised. selected with shape entry not equal to one, an error is raised.
Examples: For example:
.. code-block:: text
Case 1: Case 1:
Given Given
X.shape = (1, 3, 1, 5) X.shape = (1, 3, 1, 5)
and and
axes = [0] axes = [0]
we get: we get:
Out.shape = (3, 1, 5) Out.shape = (3, 1, 5)
Case 2: Case 2:
Given Given
X.shape = (1, 3, 1, 5) X.shape = (1, 3, 1, 5)
and and
...@@ -5842,6 +5853,9 @@ def unsqueeze(input, axes, name=None): ...@@ -5842,6 +5853,9 @@ def unsqueeze(input, axes, name=None):
Dimension indices in axes are as seen in the output tensor. Dimension indices in axes are as seen in the output tensor.
For example: For example:
.. code-block:: text
Given a tensor such that tensor with shape [3, 4, 5], Given a tensor such that tensor with shape [3, 4, 5],
then Unsqueezed tensor with axes=[0, 4] has shape [1, 3, 4, 5, 1]. then Unsqueezed tensor with axes=[0, 4] has shape [1, 3, 4, 5, 1].
...@@ -6729,8 +6743,11 @@ def sequence_scatter(input, index, updates, name=None): ...@@ -6729,8 +6743,11 @@ def sequence_scatter(input, index, updates, name=None):
the columns to update in each row of X. the columns to update in each row of X.
Here is an example: Here is an example:
Given the following input: Given the following input:
.. code-block:: text .. code-block:: text
input.data = [[1.0, 1.0, 1.0, 1.0, 1.0, 1.0], input.data = [[1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0], [1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0]] [1.0, 1.0, 1.0, 1.0, 1.0, 1.0]]
...@@ -6743,7 +6760,9 @@ def sequence_scatter(input, index, updates, name=None): ...@@ -6743,7 +6760,9 @@ def sequence_scatter(input, index, updates, name=None):
updates.lod = [[ 0, 3, 8, 12]] updates.lod = [[ 0, 3, 8, 12]]
Then we have the output: Then we have the output:
.. code-block:: text .. code-block:: text
out.data = [[1.3, 1.3, 1.4, 1.0, 1.0, 1.0], out.data = [[1.3, 1.3, 1.4, 1.0, 1.0, 1.0],
[1.0, 1.0, 1.4, 1.3, 1.2, 1.1], [1.0, 1.0, 1.4, 1.3, 1.2, 1.1],
[1.0, 1.0, 1.3, 1.2, 1.4, 1.1]] [1.0, 1.0, 1.3, 1.2, 1.4, 1.1]]
...@@ -6759,7 +6778,7 @@ def sequence_scatter(input, index, updates, name=None): ...@@ -6759,7 +6778,7 @@ def sequence_scatter(input, index, updates, name=None):
name (str|None): The output variable name. Default None. name (str|None): The output variable name. Default None.
Returns: Returns:
output (Variable): The output is a tensor with the same shape as input. Variable: The output is a tensor with the same shape as input.
Examples: Examples:
...@@ -6933,7 +6952,7 @@ def mean_iou(input, label, num_classes): ...@@ -6933,7 +6952,7 @@ def mean_iou(input, label, num_classes):
.. math:: .. math::
IOU = \\frac{true\_positiv}{(true\_positive + false\_positive + false\_negative)}. IOU = \\frac{true\_positive}{(true\_positive + false\_positive + false\_negative)}.
The predictions are accumulated in a confusion matrix and mean-IOU The predictions are accumulated in a confusion matrix and mean-IOU
is then calculated from it. is then calculated from it.
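The metric itself is easy to reproduce from an accumulated confusion matrix; the numpy sketch below follows the formula above and is only for illustration (the op additionally returns the per-class wrong/correct counts):

.. code-block:: python

    import numpy as np

    def mean_iou(pred, label, num_classes):
        cm = np.zeros((num_classes, num_classes), dtype=np.int64)
        for p, l in zip(pred.ravel(), label.ravel()):
            cm[l, p] += 1                              # accumulate confusion matrix
        tp = np.diag(cm)
        denom = cm.sum(axis=0) + cm.sum(axis=1) - tp   # TP + FP + FN per class
        return (tp / np.maximum(denom, 1)).mean()

    print(mean_iou(np.array([0, 1, 1, 2]), np.array([0, 1, 2, 2]), num_classes=3))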
...@@ -6946,9 +6965,13 @@ def mean_iou(input, label, num_classes): ...@@ -6946,9 +6965,13 @@ def mean_iou(input, label, num_classes):
num_classes (int): The possible number of labels. num_classes (int): The possible number of labels.
Returns: Returns:
mean_iou (Variable): A Tensor representing the mean intersection-over-union with shape [1]. mean_iou (Variable),out_wrong(Variable),out_correct(Variable):
out_wrong(Variable): A Tensor with shape [num_classes]. The wrong numbers of each class.
out_correct(Variable): A Tensor with shape [num_classes]. The correct numbers of each class. Three variables:
- mean_iou : A Tensor representing the mean intersection-over-union with shape [1].
- out_wrong: A Tensor with shape [num_classes]. The wrong numbers of each class.
- out_correct: A Tensor with shape [num_classes]. The correct numbers of each class.
Examples: Examples:
...@@ -7144,7 +7167,7 @@ def affine_grid(theta, out_shape, name=None): ...@@ -7144,7 +7167,7 @@ def affine_grid(theta, out_shape, name=None):
Args: Args:
theta (Variable): A batch of affine transform parameters with shape [N, 2, 3]. theta (Variable): A batch of affine transform parameters with shape [N, 2, 3].
out_shape (Variable | list | tuple): The shape of target output with format [N, C, H, W]. out_shape (Variable | list | tuple): The shape of target output with format [N, C, H, W].
out_shape can be a Variable or a list or tuple. ``out_shape`` can be a Variable or a list or tuple.
name(str|None): A name for this layer(optional). If set None, the layer name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically. will be named automatically.
...@@ -7157,6 +7180,7 @@ def affine_grid(theta, out_shape, name=None): ...@@ -7157,6 +7180,7 @@ def affine_grid(theta, out_shape, name=None):
Examples: Examples:
.. code-block:: python .. code-block:: python
theta = fluid.layers.data(name="x", shape=[2, 3], dtype="float32") theta = fluid.layers.data(name="x", shape=[2, 3], dtype="float32")
out_shape = fluid.layers.data(name="y", shape=[-1], dtype="float32") out_shape = fluid.layers.data(name="y", shape=[-1], dtype="float32")
data = fluid.layers.affine_grid(theta, out_shape) data = fluid.layers.affine_grid(theta, out_shape)
...@@ -7192,9 +7216,10 @@ def affine_grid(theta, out_shape, name=None): ...@@ -7192,9 +7216,10 @@ def affine_grid(theta, out_shape, name=None):
def rank_loss(label, left, right, name=None): def rank_loss(label, left, right, name=None):
""" """
**Rank loss layer for RankNet** **Rank loss layer for RankNet**
RankNet(http://icml.cc/2015/wp-content/uploads/2015/06/icml_ranking.pdf) `RankNet <http://icml.cc/2015/wp-content/uploads/2015/06/icml_ranking.pdf>`_
is a pairwise ranking model with a training sample consisting of a pair is a pairwise ranking model with a training sample consisting of a pair
of documents, A and B. Label P indicates whether A is ranked higher than B of documents, A and B. Label P indicates whether A is ranked higher than B
or not: or not:
...@@ -7202,16 +7227,19 @@ def rank_loss(label, left, right, name=None): ...@@ -7202,16 +7227,19 @@ def rank_loss(label, left, right, name=None):
P = {0, 1} or {0, 0.5, 1}, where 0.5 means that there is no information P = {0, 1} or {0, 0.5, 1}, where 0.5 means that there is no information
about the rank of the input pair. about the rank of the input pair.
Rank loss layer takes three inputs: left (o_i), right (o_j) and Rank loss layer takes three inputs: left ( :math:`o_i` ), right ( :math:`o_j` ) and
label (P_{i,j}). The inputs respectively represent RankNet's output scores label ( :math:`P_{i,j}` ). The inputs respectively represent RankNet's output scores
for documents A and B and the value of label P. The following equation for documents A and B and the value of label P. The following equation
computes rank loss C_{i,j} from the inputs: computes rank loss C_{i,j} from the inputs:
$$ .. math::
C_{i,j} = -\tilde{P_{ij}} * o_{i,j} + \log(1 + e^{o_{i,j}}) \\
o_{i,j} = o_i - o_j \\ C_{i,j} &= -\\tilde{P_{ij}} * o_{i,j} + \log(1 + e^{o_{i,j}}) \\\\
\tilde{P_{i,j}} = \left \{0, 0.5, 1 \right \} \ or \ \left \{0, 1 \right \}
$$ o_{i,j} &= o_i - o_j \\\\
\\tilde{P_{i,j}} &= \\left \{0, 0.5, 1 \\right \} \ or \ \\left \{0, 1 \\right \}
Rank loss layer takes batch inputs with size batch_size (batch_size >= 1). Rank loss layer takes batch inputs with size batch_size (batch_size >= 1).
...@@ -7237,7 +7265,6 @@ def rank_loss(label, left, right, name=None): ...@@ -7237,7 +7265,6 @@ def rank_loss(label, left, right, name=None):
right = fluid.layers.data(name="right", shape=[4, 1], dtype="float32") right = fluid.layers.data(name="right", shape=[4, 1], dtype="float32")
out = fluid.layers.rank_loss(label, left, right) out = fluid.layers.rank_loss(label, left, right)
""" """
helper = LayerHelper('rank_loss', **locals()) helper = LayerHelper('rank_loss', **locals())
...@@ -7269,7 +7296,7 @@ def margin_rank_loss(label, left, right, margin=0.1, name=None): ...@@ -7269,7 +7296,7 @@ def margin_rank_loss(label, left, right, margin=0.1, name=None):
.. math:: .. math::
rank\_loss &= max(0, -label * (left - right) + margin) rank\_loss = max(0, -label * (left - right) + margin)
Args: Args:
label (Variable): Indicates whether the left is ranked higher than the right or not. label (Variable): Indicates whether the left is ranked higher than the right or not.
...@@ -7278,12 +7305,17 @@ def margin_rank_loss(label, left, right, margin=0.1, name=None): ...@@ -7278,12 +7305,17 @@ def margin_rank_loss(label, left, right, margin=0.1, name=None):
margin (float): Indicates the given margin. margin (float): Indicates the given margin.
name (str|None): A name for this layer (optional). If set None, the layer name (str|None): A name for this layer (optional). If set None, the layer
will be named automatically. will be named automatically.
Returns: Returns:
Variable: The ranking loss. Variable: The ranking loss.
Raises: Raises:
ValueError: Any of label, left, and right is not a Variable. ValueError: Any of label, left, and right is not a Variable.
Examples: Examples:
.. code-block:: python .. code-block:: python
label = fluid.layers.data(name="label", shape=[4, 1], dtype="float32") label = fluid.layers.data(name="label", shape=[4, 1], dtype="float32")
left = fluid.layers.data(name="left", shape=[4, 1], dtype="float32") left = fluid.layers.data(name="left", shape=[4, 1], dtype="float32")
right = fluid.layers.data(name="right", shape=[4, 1], dtype="float32") right = fluid.layers.data(name="right", shape=[4, 1], dtype="float32")
...@@ -7587,7 +7619,8 @@ def prelu(x, mode, param_attr=None, name=None): ...@@ -7587,7 +7619,8 @@ def prelu(x, mode, param_attr=None, name=None):
""" """
Equation: Equation:
y = \max(0, x) + alpha * \min(0, x) .. math::
y = \max(0, x) + \\alpha * \min(0, x)
Args: Args:
x (Variable): The input tensor. x (Variable): The input tensor.
...@@ -7730,20 +7763,29 @@ def flatten(x, axis=1, name=None): ...@@ -7730,20 +7763,29 @@ def flatten(x, axis=1, name=None):
**Flatten layer** **Flatten layer**
Flattens the input tensor into a 2D matrix. Flattens the input tensor into a 2D matrix.
Examples: For example:
.. code-block:: text
Case 1: Case 1:
Given Given
X.shape = (3, 100, 100, 4) X.shape = (3, 100, 100, 4)
and and
axis = 2 axis = 2
We get: We get:
Out.shape = (3 * 100, 4 * 100) Out.shape = (3 * 100, 4 * 100)
Case 2: Case 2:
Given Given
X.shape = (3, 100, 100, 4) X.shape = (3, 100, 100, 4)
and and
axis = 0 axis = 0
We get: We get:
Out.shape = (1, 3 * 100 * 100 * 4) Out.shape = (1, 3 * 100 * 100 * 4)
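The same shape arithmetic can be checked with numpy (only the two cases above are reproduced; the helper below is hypothetical):

.. code-block:: python

    import numpy as np

    def flatten_shape(shape, axis):
        outer = int(np.prod(shape[:axis])) if axis > 0 else 1
        inner = int(np.prod(shape[axis:]))
        return (outer, inner)

    print(flatten_shape((3, 100, 100, 4), axis=2))  # (300, 400)
    print(flatten_shape((3, 100, 100, 4), axis=0))  # (1, 120000)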
...@@ -7759,9 +7801,9 @@ def flatten(x, axis=1, name=None): ...@@ -7759,9 +7801,9 @@ def flatten(x, axis=1, name=None):
will be named automatically. will be named automatically.
Returns: Returns:
Variable: A 2D tensor with the contents of the input tensor, with input Variable: A 2D tensor with the contents of the input tensor, with input \
dimensions up to axis flattened to the outer dimension of dimensions up to axis flattened to the outer dimension of \
the output and remaining input dimensions flattened into the the output and remaining input dimensions flattened into the \
inner dimension of the output. inner dimension of the output.
Raises: Raises:
...@@ -7801,15 +7843,19 @@ def sequence_enumerate(input, win_size, pad_value=0, name=None): ...@@ -7801,15 +7843,19 @@ def sequence_enumerate(input, win_size, pad_value=0, name=None):
The enumerated sequence has the same 1st dimension with variable `input`, and The enumerated sequence has the same 1st dimension with variable `input`, and
the 2nd dimension is `win_size`, padded by `pad_value` if necessary in generation. the 2nd dimension is `win_size`, padded by `pad_value` if necessary in generation.
Examples: .. code-block:: text
Case 1: Case 1:
Input: Input:
X.lod = [[0, 3, 5]] X.lod = [[0, 3, 5]]
X.data = [[1], [2], [3], [4], [5]] X.data = [[1], [2], [3], [4], [5]]
X.dims = [5, 1] X.dims = [5, 1]
Attrs: Attrs:
win_size = 2 win_size = 2
pad_value = 0 pad_value = 0
Output: Output:
Out.lod = [[0, 3, 5]] Out.lod = [[0, 3, 5]]
Out.data = [[1, 2], [2, 3], [3, 0], [4, 5], [5, 0]] Out.data = [[1, 2], [2, 3], [3, 0], [4, 5], [5, 0]]
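The windowing in Case 1 can be mimicked per sequence with a few lines of Python (a hypothetical helper working on plain lists; the real op consumes LoDTensors):

.. code-block:: python

    def enumerate_sequence(seq, win_size, pad_value=0):
        # for every position take win_size consecutive elements, padding at the end
        out = []
        for i in range(len(seq)):
            win = seq[i:i + win_size]
            out.append(win + [pad_value] * (win_size - len(win)))
        return out

    print(enumerate_sequence([1, 2, 3], win_size=2))  # [[1, 2], [2, 3], [3, 0]]
    print(enumerate_sequence([4, 5], win_size=2))     # [[4, 5], [5, 0]]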
...@@ -8896,6 +8942,7 @@ def similarity_focus(input, axis, indexes, name=None): ...@@ -8896,6 +8942,7 @@ def similarity_focus(input, axis, indexes, name=None):
SimilarityFocus Operator SimilarityFocus Operator
Generate a similarity focus mask with the same shape of input using the following method: Generate a similarity focus mask with the same shape of input using the following method:
1. Extract the 3-D tensor(here the first dimension is BatchSize) corresponding 1. Extract the 3-D tensor(here the first dimension is BatchSize) corresponding
to the axis according to the indexes. For example, if axis=1 and indexes=[a], to the axis according to the indexes. For example, if axis=1 and indexes=[a],
it will get the matrix T=X[:, a, :, :]. In this case, if the shape of input X it will get the matrix T=X[:, a, :, :]. In this case, if the shape of input X
...@@ -8969,14 +9016,16 @@ def similarity_focus(input, axis, indexes, name=None): ...@@ -8969,14 +9016,16 @@ def similarity_focus(input, axis, indexes, name=None):
indexes(list): Indicating the indexes of the selected dimension. indexes(list): Indicating the indexes of the selected dimension.
Returns: Returns:
Variable: A tensor variable with the same shape and same type Variable: A tensor variable with the same shape and same type \
as the input. as the input.
Examples: Examples:
.. code-block:: python .. code-block:: python
data = fluid.layers.data( data = fluid.layers.data(
name='data', shape=[2, 3, 2, 2], dtype='float32') name='data', shape=[2, 3, 2, 2], dtype='float32')
x = fluid.layers.similarity_focus(input=data, axis=1, indexes=[0]) x = fluid.layers.similarity_focus(input=data, axis=1, indexes=[0])
""" """
helper = LayerHelper('similarity_focus', **locals()) helper = LayerHelper('similarity_focus', **locals())
# check attrs # check attrs
...@@ -9055,6 +9104,7 @@ def hash(input, hash_size, num_hash=1, name=None): ...@@ -9055,6 +9104,7 @@ def hash(input, hash_size, num_hash=1, name=None):
Examples: Examples:
.. code-block:: python .. code-block:: python
word_dict = paddle.dataset.imdb.word_dict() word_dict = paddle.dataset.imdb.word_dict()
x = fluid.layers.data(name='x', shape=[1], dtype='int32', lod_level=1) x = fluid.layers.data(name='x', shape=[1], dtype='int32', lod_level=1)
out = fluid.layers.hash(input=x, num_hash=4, hash_size=1000) out = fluid.layers.hash(input=x, num_hash=4, hash_size=1000)
...@@ -9075,13 +9125,15 @@ def hash(input, hash_size, num_hash=1, name=None): ...@@ -9075,13 +9125,15 @@ def hash(input, hash_size, num_hash=1, name=None):
def grid_sampler(x, grid, name=None): def grid_sampler(x, grid, name=None):
""" """
This operation samples input X by using bilinear interpolation based on This operation samples input X by using bilinear interpolation based on
flow field grid, which is usually generated by affine_grid. The grid of flow field grid, which is usually generated by :code:`affine_grid` . The grid of
shape [N, H, W, 2] is the concatenation of (grid_x, grid_y) coordinates shape [N, H, W, 2] is the concatenation of (grid_x, grid_y) coordinates
with shape [N, H, W] each, where grid_x is indexing the 4th dimension with shape [N, H, W] each, where grid_x is indexing the 4th dimension
(in width dimension) of input data x and grid_y is indexing the 3rd (in width dimension) of input data x and grid_y is indexing the 3rd
dimension (in height dimension); finally the result is the bilinear dimension (in height dimension); finally the result is the bilinear
interpolation value of the 4 nearest corner points. interpolation value of the 4 nearest corner points.
.. code-block:: text
Step 1: Step 1:
Get (x, y) grid coordinates and scale to [0, H-1/W-1]. Get (x, y) grid coordinates and scale to [0, H-1/W-1].
...@@ -9126,16 +9178,18 @@ def grid_sampler(x, grid, name=None): ...@@ -9126,16 +9178,18 @@ def grid_sampler(x, grid, name=None):
name (str, default None): The name of this layer. name (str, default None): The name of this layer.
Returns: Returns:
out(Variable): Output of shape [N, C, H, W] data samples input X Variable: Output of shape [N, C, H, W] data samples input X
using bilinear interpolation based on input grid. using bilinear interpolation based on input grid.
Exmples: Examples:
.. code-block:: python .. code-block:: python
x = fluid.layers.data(name='x', shape=[3, 10, 32, 32], dtype='float32') x = fluid.layers.data(name='x', shape=[3, 10, 32, 32], dtype='float32')
theta = fluid.layers.data(name='theta', shape=[3, 2, 3], dtype='float32') theta = fluid.layers.data(name='theta', shape=[3, 2, 3], dtype='float32')
grid = fluid.layers.affine_grid(input=theta, size=[3, 10, 32, 32]) grid = fluid.layers.affine_grid(input=theta, size=[3, 10, 32, 32])
out = fluid.layers.grid_sampler(x=x, grid=grid) out = fluid.layers.grid_sampler(x=x, grid=grid)
""" """
helper = LayerHelper("grid_sampler", **locals()) helper = LayerHelper("grid_sampler", **locals())
...@@ -9203,19 +9257,19 @@ def add_position_encoding(input, alpha, beta, name=None): ...@@ -9203,19 +9257,19 @@ def add_position_encoding(input, alpha, beta, name=None):
""" """
**Add Position Encoding Layer** **Add Position Encoding Layer**
This layer accepts an input 3D-Tensor of shape [N x M x P], and return an This layer accepts an input 3D-Tensor of shape [N x M x P], and returns an
output Tensor of shape [N x M x P] with positional encoding value. output Tensor of shape [N x M x P] with positional encoding value.
Refer to `Attention Is All You Need<http://arxiv.org/pdf/1706.03762.pdf>`_ . Refer to `Attention Is All You Need <http://arxiv.org/pdf/1706.03762.pdf>`_ .
.. math:: .. math::
PE(pos, 2i) = \\sin{(pos / 10000^{2i / P})} \\\\ PE(pos, 2i) &= \\sin{(pos / 10000^{2i / P})} \\\\
PE(pos, 2i + 1) = \\cos{(pos / 10000^{2i / P})} \\\\ PE(pos, 2i + 1) &= \\cos{(pos / 10000^{2i / P})} \\\\
Out(:, pos, i) = \\alpha * input(:, pos, i) + \\beta * PE(pos, i) Out(:, pos, i) &= \\alpha * input(:, pos, i) + \\beta * PE(pos, i)
Where: Where:
* PE(pos, 2i): the increment for the number at even position - :math:`PE(pos, 2i)` : the increment for the number at even position
* PE(pos, 2i + 1): the increment for the number at odd position - :math:`PE(pos, 2i + 1)` : the increment for the number at odd position
Args: Args:
input (Variable): 3-D input tensor with shape [N x M x P] input (Variable): 3-D input tensor with shape [N x M x P]
...@@ -9230,6 +9284,7 @@ def add_position_encoding(input, alpha, beta, name=None): ...@@ -9230,6 +9284,7 @@ def add_position_encoding(input, alpha, beta, name=None):
.. code-block:: python .. code-block:: python
position_tensor = fluid.layers.add_position_encoding(input=tensor) position_tensor = fluid.layers.add_position_encoding(input=tensor)
""" """
helper = LayerHelper('add_position_encoding', **locals()) helper = LayerHelper('add_position_encoding', **locals())
dtype = helper.input_dtype() dtype = helper.input_dtype()
...@@ -9262,13 +9317,13 @@ def bilinear_tensor_product(x, ...@@ -9262,13 +9317,13 @@ def bilinear_tensor_product(x,
For example: For example:
.. math:: .. math::
out{i} = x * W_{i} * {y^\mathrm{T}}, i=0,1,...,size-1 out_{i} = x * W_{i} * {y^\mathrm{T}}, i=0,1,...,size-1
In this formula: In this formula:
- :math:`x`: the first input contains M elements, shape is [batch_size, M]. - :math:`x`: the first input contains M elements, shape is [batch_size, M].
- :math:`y`: the second input contains N elements, shape is [batch_size, N]. - :math:`y`: the second input contains N elements, shape is [batch_size, N].
- :math:`W_{i}`: the i-th learned weight, shape is [M, N] - :math:`W_{i}`: the i-th learned weight, shape is [M, N]
- :math:`out{i}`: the i-th element of out, shape is [batch_size, size]. - :math:`out_{i}`: the i-th element of out, shape is [batch_size, size].
- :math:`y^\mathrm{T}`: the transpose of :math:`y_{2}`. - :math:`y^\mathrm{T}`: the transpose of :math:`y_{2}`.
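A numpy sketch of the formula, with random data standing in for the learned weight W (shapes follow the description above):

.. code-block:: python

    import numpy as np

    batch, M, N, size = 2, 3, 4, 5
    rng = np.random.RandomState(0)
    x = rng.randn(batch, M)
    y = rng.randn(batch, N)
    W = rng.randn(size, M, N)            # one [M, N] weight matrix per output element

    # out[:, i] = x * W_i * y^T, evaluated row-wise over the batch
    out = np.einsum('bm,imn,bn->bi', x, W, y)
    print(out.shape)                      # (2, 5) -> [batch_size, size]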
Args: Args:
......
...@@ -393,9 +393,6 @@ def fill_constant_batch_size_like(input, ...@@ -393,9 +393,6 @@ def fill_constant_batch_size_like(input,
It also sets *stop_gradient* to True. It also sets *stop_gradient* to True.
>>> data = fluid.layers.fill_constant_batch_size_like(
>>> input=like, shape=[1], value=0, dtype='int64')
Args: Args:
input(${input_type}): ${input_comment}. input(${input_type}): ${input_comment}.
...@@ -411,6 +408,14 @@ def fill_constant_batch_size_like(input, ...@@ -411,6 +408,14 @@ def fill_constant_batch_size_like(input,
Returns: Returns:
${out_comment}. ${out_comment}.
Examples:
.. code-block:: python
data = fluid.layers.fill_constant_batch_size_like(
input=like, shape=[1], value=0, dtype='int64')
""" """
helper = LayerHelper("fill_constant_batch_size_like", **locals()) helper = LayerHelper("fill_constant_batch_size_like", **locals())
out = helper.create_variable_for_type_inference(dtype=dtype) out = helper.create_variable_for_type_inference(dtype=dtype)
......
...@@ -362,7 +362,7 @@ class ChunkEvaluator(MetricBase): ...@@ -362,7 +362,7 @@ class ChunkEvaluator(MetricBase):
compute the precision recall and F1-score using the accumulated counter compute the precision recall and F1-score using the accumulated counter
numbers. numbers.
For some basics of chunking, please refer to For some basics of chunking, please refer to
'Chunking with Support Vector Machines <https://aclanthology.info/pdf/N/N01/N01-1025.pdf>'. `Chunking with Support Vector Machines <https://aclanthology.info/pdf/N/N01/N01-1025.pdf>`_ .
ChunkEvalEvaluator computes the precision, recall, and F1-score of chunk detection, ChunkEvalEvaluator computes the precision, recall, and F1-score of chunk detection,
and supports IOB, IOE, IOBES and IO (also known as plain) tagging schemes. and supports IOB, IOE, IOBES and IO (also known as plain) tagging schemes.
...@@ -391,6 +391,7 @@ class ChunkEvaluator(MetricBase): ...@@ -391,6 +391,7 @@ class ChunkEvaluator(MetricBase):
def update(self, num_infer_chunks, num_label_chunks, num_correct_chunks): def update(self, num_infer_chunks, num_label_chunks, num_correct_chunks):
""" """
Update the states based on the layers.chunk_eval() outputs. Update the states based on the layers.chunk_eval() outputs.
Args: Args:
num_infer_chunks(int|numpy.array): The number of chunks in Inference on the given minibatch. num_infer_chunks(int|numpy.array): The number of chunks in Inference on the given minibatch.
num_label_chunks(int|numpy.array): The number of chunks in Label on the given mini-batch. num_label_chunks(int|numpy.array): The number of chunks in Label on the given mini-batch.
...@@ -450,9 +451,9 @@ class EditDistance(MetricBase): ...@@ -450,9 +451,9 @@ class EditDistance(MetricBase):
distance, instance_error = distance_evaluator.eval() distance, instance_error = distance_evaluator.eval()
In the above example: In the above example:
'distance' is the average of the edit distance in a pass.
'instance_error' is the instance error rate in a pass. - 'distance' is the average of the edit distance in a pass.
- 'instance_error' is the instance error rate in a pass.
""" """
...@@ -567,12 +568,15 @@ class DetectionMAP(object): ...@@ -567,12 +568,15 @@ class DetectionMAP(object):
Calculate the detection mean average precision (mAP). Calculate the detection mean average precision (mAP).
The general steps are as follows: The general steps are as follows:
1. calculate the true positive and false positive according to the input 1. calculate the true positive and false positive according to the input
of detection and labels. of detection and labels.
2. calculate mAP value, support two versions: '11 point' and 'integral'. 2. calculate mAP value, support two versions: '11 point' and 'integral'.
Please get more information from the following articles: Please get more information from the following articles:
https://sanchom.wordpress.com/tag/average-precision/ https://sanchom.wordpress.com/tag/average-precision/
https://arxiv.org/abs/1512.02325 https://arxiv.org/abs/1512.02325
Args: Args:
...@@ -615,8 +619,10 @@ class DetectionMAP(object): ...@@ -615,8 +619,10 @@ class DetectionMAP(object):
In the above example: In the above example:
'cur_map_v' is the mAP of current mini-batch. - 'cur_map_v' is the mAP of current mini-batch.
'accum_map_v' is the accumulative mAP of one pass. - 'accum_map_v' is the accumulative mAP of one pass.
""" """
def __init__(self, def __init__(self,
......
...@@ -32,6 +32,8 @@ class TestConv2dFusionOp(OpTest): ...@@ -32,6 +32,8 @@ class TestConv2dFusionOp(OpTest):
self.activation = 'relu' self.activation = 'relu'
self.add_bias = True self.add_bias = True
self.add_residual_data = True self.add_residual_data = True
self.channels = None
self.outputs = None
self.init_group() self.init_group()
self.init_dilation() self.init_dilation()
...@@ -49,7 +51,7 @@ class TestConv2dFusionOp(OpTest): ...@@ -49,7 +51,7 @@ class TestConv2dFusionOp(OpTest):
input = np.random.random(self.input_size).astype(self.dtype) input = np.random.random(self.input_size).astype(self.dtype)
filter = np.random.random(self.filter_size).astype(self.dtype) filter = np.random.random(self.filter_size).astype(self.dtype)
output = conv2d_forward_naive(input, filter, self.groups, self.output = conv2d_forward_naive(input, filter, self.groups,
conv2d_param).astype(self.dtype) conv2d_param).astype(self.dtype)
self.inputs = { self.inputs = {
...@@ -58,19 +60,20 @@ class TestConv2dFusionOp(OpTest): ...@@ -58,19 +60,20 @@ class TestConv2dFusionOp(OpTest):
} }
if self.add_residual_data: if self.add_residual_data:
residual_data = np.random.random(output.shape).astype(self.dtype) residual_data = np.random.random(self.output.shape).astype(
self.dtype)
self.inputs['ResidualData'] = OpTest.np_dtype_to_fluid_dtype( self.inputs['ResidualData'] = OpTest.np_dtype_to_fluid_dtype(
residual_data) residual_data)
output += residual_data self.output += residual_data
if self.add_bias: if self.add_bias:
bias = np.random.random(self.filter_size[0]).astype(self.dtype) bias = np.random.random(self.filter_size[0]).astype(self.dtype)
self.inputs['Bias'] = OpTest.np_dtype_to_fluid_dtype(bias) self.inputs['Bias'] = OpTest.np_dtype_to_fluid_dtype(bias)
output = output + bias.reshape((1, bias.size, 1, 1)) self.output = self.output + bias.reshape((1, bias.size, 1, 1))
assert self.activation in ['relu', 'identity'] assert self.activation in ['relu', 'identity']
if self.activation == 'relu': if self.activation == 'relu':
output = np.maximum(output, 0) self.output = np.maximum(self.output, 0)
self.attrs = { self.attrs = {
'strides': self.stride, 'strides': self.stride,
...@@ -79,9 +82,12 @@ class TestConv2dFusionOp(OpTest): ...@@ -79,9 +82,12 @@ class TestConv2dFusionOp(OpTest):
'dilations': self.dilations, 'dilations': self.dilations,
'data_format': self.data_format, 'data_format': self.data_format,
'exhaustive_search': self.exhaustive_search, 'exhaustive_search': self.exhaustive_search,
'activation': self.activation 'activation': self.activation,
'split_channels': self.channels
} }
self.outputs = {'Output': output} self.outputs = {'Output': self.output}
self.set_outputs()
def testcuda(self): def testcuda(self):
return core.is_compiled_with_cuda() return core.is_compiled_with_cuda()
...@@ -117,6 +123,9 @@ class TestConv2dFusionOp(OpTest): ...@@ -117,6 +123,9 @@ class TestConv2dFusionOp(OpTest):
def set_search_method(self): def set_search_method(self):
self.exhaustive_search = False self.exhaustive_search = False
def set_outputs(self):
pass
class TestWithoutResidual(TestConv2dFusionOp): class TestWithoutResidual(TestConv2dFusionOp):
def init_bias_residual(self): def init_bias_residual(self):
...@@ -160,5 +169,21 @@ class TestCUDNNExhaustiveSearch(TestConv2dFusionOp): ...@@ -160,5 +169,21 @@ class TestCUDNNExhaustiveSearch(TestConv2dFusionOp):
self.exhaustive_search = True self.exhaustive_search = True
class TestMultipleOutputs(TestConv2dFusionOp):
def init_test_case(self):
self.pad = [1, 1]
self.stride = [1, 1]
self.input_size = [1, 32, 17, 17] # NCHW
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] // self.groups
self.filter_size = [126, f_c, 3, 3]
self.channels = [84, 42]
def set_outputs(self):
out1 = self.output[:, 0:84, :, :]
out2 = self.output[:, 84:126, :, :]
self.outputs['Outputs'] = [('out1', out1), ('out2', out2)]
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -194,4 +194,6 @@ class TestDataBalance(unittest.TestCase): ...@@ -194,4 +194,6 @@ class TestDataBalance(unittest.TestCase):
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() # Disable data balance unittest, because data balance would be removed
# unittest.main()
pass
...@@ -38,7 +38,9 @@ class MyLayer(fluid.imperative.PyLayer): ...@@ -38,7 +38,9 @@ class MyLayer(fluid.imperative.PyLayer):
def forward(self, inputs): def forward(self, inputs):
x = fluid.layers.relu(inputs[0]) x = fluid.layers.relu(inputs[0])
self._x_for_debug = x self._x_for_debug = x
return [fluid.layers.elementwise_mul(x, x)] x = fluid.layers.elementwise_mul(x, x)
x = fluid.layers.reduce_sum(x)
return [x]
class MLP(fluid.imperative.PyLayer): class MLP(fluid.imperative.PyLayer):
......
...@@ -243,6 +243,10 @@ class TestBook(unittest.TestCase): ...@@ -243,6 +243,10 @@ class TestBook(unittest.TestCase):
pool, mask = layers.adaptive_pool2d(x, [3, 3], require_index=True) pool, mask = layers.adaptive_pool2d(x, [3, 3], require_index=True)
self.assertIsNotNone(pool) self.assertIsNotNone(pool)
self.assertIsNotNone(mask) self.assertIsNotNone(mask)
self.assertIsNotNone(layers.adaptive_pool2d(x, 3, pool_type='avg'))
pool, mask = layers.adaptive_pool2d(x, 3, require_index=True)
self.assertIsNotNone(pool)
self.assertIsNotNone(mask)
def test_adaptive_pool3d(self): def test_adaptive_pool3d(self):
program = Program() program = Program()
...@@ -255,6 +259,10 @@ class TestBook(unittest.TestCase): ...@@ -255,6 +259,10 @@ class TestBook(unittest.TestCase):
x, [3, 3, 3], require_index=True) x, [3, 3, 3], require_index=True)
self.assertIsNotNone(pool) self.assertIsNotNone(pool)
self.assertIsNotNone(mask) self.assertIsNotNone(mask)
self.assertIsNotNone(layers.adaptive_pool3d(x, 3, pool_type='avg'))
pool, mask = layers.adaptive_pool3d(x, 3, require_index=True)
self.assertIsNotNone(pool)
self.assertIsNotNone(mask)
def test_lstm_unit(self): def test_lstm_unit(self):
program = Program() program = Program()
......
...@@ -209,6 +209,7 @@ class TestPyReaderUsingExecutor(unittest.TestCase): ...@@ -209,6 +209,7 @@ class TestPyReaderUsingExecutor(unittest.TestCase):
else: else:
thread = threading.Thread( thread = threading.Thread(
target=feed_data, args=(feed_queue, reader)) target=feed_data, args=(feed_queue, reader))
thread.daemon = True
thread.start() thread.start()
self.outputs = [] self.outputs = []
...@@ -219,6 +220,8 @@ class TestPyReaderUsingExecutor(unittest.TestCase): ...@@ -219,6 +220,8 @@ class TestPyReaderUsingExecutor(unittest.TestCase):
feed_queue.close() feed_queue.close()
self.validate() self.validate()
if not use_decorate_paddle_reader:
thread.join()
def validate(self): def validate(self):
self.assertEqual(len(self.inputs), len(self.outputs)) self.assertEqual(len(self.inputs), len(self.outputs))
......
...@@ -137,9 +137,9 @@ def append_input_output(block, op_proto, np_list, is_input, dtype): ...@@ -137,9 +137,9 @@ def append_input_output(block, op_proto, np_list, is_input, dtype):
var_dict = {} var_dict = {}
for var_proto in proto_list: for var_proto in proto_list:
var_name = str(var_proto.name) var_name = str(var_proto.name)
if is_input:
if (var_name not in np_list) and var_proto.dispensable: if (var_name not in np_list) and var_proto.dispensable:
continue continue
if is_input:
assert (var_name in np_list) or (var_proto.dispensable), \ assert (var_name in np_list) or (var_proto.dispensable), \
"Missing {} as input".format(var_name) "Missing {} as input".format(var_name)
if var_proto.duplicable: if var_proto.duplicable:
......
...@@ -125,14 +125,23 @@ def slice_variable(var_list, slice_count, min_block_size): ...@@ -125,14 +125,23 @@ def slice_variable(var_list, slice_count, min_block_size):
class DistributeTranspilerConfig(object): class DistributeTranspilerConfig(object):
""" """
Args: .. py:attribute:: slice_var_up (bool)
slice_var_up (bool): Do Tensor slice for pservers, default is True.
split_method (PSDispatcher): RoundRobin or HashName can be used Do Tensor slice for pservers, default is True.
try to choose the best method to balance loads for pservers.
min_block_size (int): Minimum splitted element number in block. .. py:attribute:: split_method (PSDispatcher)
According:https://github.com/PaddlePaddle/Paddle/issues/8638#issuecomment-369912156
RoundRobin or HashName can be used.
Try to choose the best method to balance loads for pservers.
.. py:attribute:: min_block_size (int)
Minimum number of split elements in a block.
According to : https://github.com/PaddlePaddle/Paddle/issues/8638#issuecomment-369912156
We can use bandwidth efficiently when data size is larger than 2MB. If you We can use bandwidth efficiently when data size is larger than 2MB. If you
want to change it, please be sure you see the slice_variable function. want to change it, please be sure you have read the slice_variable function.
""" """
slice_var_up = True slice_var_up = True
......
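For context, a typical way these attributes are consumed when transpiling a program for parameter servers looks roughly like the sketch below; the endpoint strings and trainer count are placeholders, and the classes are assumed to be exposed as fluid.DistributeTranspilerConfig and fluid.DistributeTranspiler.

.. code-block:: python

    import paddle.fluid as fluid

    config = fluid.DistributeTranspilerConfig()
    config.slice_var_up = True       # slice large variables across pservers
    config.min_block_size = 8192     # minimum number of elements per block

    t = fluid.DistributeTranspiler(config=config)
    t.transpile(trainer_id=0,
                program=fluid.default_main_program(),
                pservers="127.0.0.1:6170,127.0.0.1:6171",  # placeholder endpoints
                trainers=2)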