提交 ceec1356 编写于 作者: D Dun Liang

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into my_checkpoint

test=develop
...@@ -25,12 +25,18 @@ message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: " ...@@ -25,12 +25,18 @@ message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: "
message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: " message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: "
"${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}") "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
if(WIN32) if(WIN32)
set(CMAKE_SUPPRESS_REGENERATION ON)
set(CMAKE_STATIC_LIBRARY_PREFIX lib) set(CMAKE_STATIC_LIBRARY_PREFIX lib)
add_definitions("/DGOOGLE_GLOG_DLL_DECL=") add_definitions("/DGOOGLE_GLOG_DLL_DECL=")
set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd") set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd")
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT") set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd") set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT")
add_compile_options(/wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838)
set(PADDLE_LINK_FLAGS "/IGNORE:4006 /IGNORE:4098 /IGNORE:4217 /IGNORE:4221")
set(CMAKE_STATIC_LINKER_FLAGS "${CMAKE_STATIC_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
endif(WIN32) endif(WIN32)
find_package(CUDA QUIET) find_package(CUDA QUIET)
......
...@@ -152,7 +152,12 @@ endif() ...@@ -152,7 +152,12 @@ endif()
if (WITH_MKLML AND MKLML_IOMP_LIB) if (WITH_MKLML AND MKLML_IOMP_LIB)
message(STATUS "Enable Intel OpenMP with ${MKLML_IOMP_LIB}") message(STATUS "Enable Intel OpenMP with ${MKLML_IOMP_LIB}")
if(WIN32)
# openmp not support well for now on windows
set(OPENMP_FLAGS "")
else(WIN32)
set(OPENMP_FLAGS "-fopenmp") set(OPENMP_FLAGS "-fopenmp")
endif(WIN32)
set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS}) set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS}) set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}")
......
...@@ -203,25 +203,26 @@ list(APPEND CUDA_NVCC_FLAGS "-w") ...@@ -203,25 +203,26 @@ list(APPEND CUDA_NVCC_FLAGS "-w")
list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr") list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr")
if (NOT WIN32) if (NOT WIN32)
if(CMAKE_BUILD_TYPE STREQUAL "Debug") if(CMAKE_BUILD_TYPE STREQUAL "Debug")
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG}) list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG})
elseif(CMAKE_BUILD_TYPE STREQUAL "Release") elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE}) list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE})
elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO}) list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO})
elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel") elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel")
# nvcc 9 does not support -Os. Use Release flags instead # nvcc 9 does not support -Os. Use Release flags instead
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE}) list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE})
endif() endif()
else(NOT WIN32) else(NOT WIN32)
list(APPEND CUDA_NVCC_FLAGS "--compiler-options;/bigobj") list(APPEND CUDA_NVCC_FLAGS "-Xcompiler \"/wd 4244 /wd 4267 /wd 4819\"")
if(CMAKE_BUILD_TYPE STREQUAL "Debug") list(APPEND CUDA_NVCC_FLAGS "--compiler-options;/bigobj")
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
list(APPEND CUDA_NVCC_FLAGS "-g -G") list(APPEND CUDA_NVCC_FLAGS "-g -G")
# match the cl's _ITERATOR_DEBUG_LEVEL # match the cl's _ITERATOR_DEBUG_LEVEL
list(APPEND CUDA_NVCC_FLAGS "-D_DEBUG") list(APPEND CUDA_NVCC_FLAGS "-D_DEBUG")
elseif(CMAKE_BUILD_TYPE STREQUAL "Release") elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
list(APPEND CUDA_NVCC_FLAGS "-O3 -DNDEBUG") list(APPEND CUDA_NVCC_FLAGS "-O3 -DNDEBUG")
else() else()
message(FATAL "Windows only support Release or Debug build now. Please set visual studio build type to Release/Debug, x64 build.") message(FATAL "Windows only support Release or Debug build now. Please set visual studio build type to Release/Debug, x64 build.")
endif() endif()
endif(NOT WIN32) endif(NOT WIN32)
......
...@@ -20,8 +20,10 @@ SET(GLOG_INCLUDE_DIR "${GLOG_INSTALL_DIR}/include" CACHE PATH "glog include dire ...@@ -20,8 +20,10 @@ SET(GLOG_INCLUDE_DIR "${GLOG_INSTALL_DIR}/include" CACHE PATH "glog include dire
IF(WIN32) IF(WIN32)
SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.lib" CACHE FILEPATH "glog library." FORCE) SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.lib" CACHE FILEPATH "glog library." FORCE)
SET(GLOG_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4267 /wd4530")
ELSE(WIN32) ELSE(WIN32)
SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.a" CACHE FILEPATH "glog library." FORCE) SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.a" CACHE FILEPATH "glog library." FORCE)
SET(GLOG_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
ENDIF(WIN32) ENDIF(WIN32)
INCLUDE_DIRECTORIES(${GLOG_INCLUDE_DIR}) INCLUDE_DIRECTORIES(${GLOG_INCLUDE_DIR})
...@@ -39,7 +41,7 @@ ExternalProject_Add( ...@@ -39,7 +41,7 @@ ExternalProject_Add(
UPDATE_COMMAND "" UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} -DCMAKE_CXX_FLAGS=${GLOG_CMAKE_CXX_FLAGS}
-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
......
...@@ -49,6 +49,8 @@ IF(NOT WIN32) ...@@ -49,6 +49,8 @@ IF(NOT WIN32)
SET(MKLDNN_FLAG "${MKLDNN_FLAG} -Wno-unused-result -Wno-unused-value") SET(MKLDNN_FLAG "${MKLDNN_FLAG} -Wno-unused-result -Wno-unused-value")
SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} ${MKLDNN_FLAG}") SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} ${MKLDNN_FLAG}")
SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} ${MKLDNN_FLAG}") SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} ${MKLDNN_FLAG}")
ELSE()
SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} /EHsc")
ENDIF(NOT WIN32) ENDIF(NOT WIN32)
ExternalProject_Add( ExternalProject_Add(
...@@ -61,7 +63,6 @@ ExternalProject_Add( ...@@ -61,7 +63,6 @@ ExternalProject_Add(
UPDATE_COMMAND "" UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
CMAKE_ARGS -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} CMAKE_ARGS -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
CMAKE_ARGS -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} CMAKE_ARGS -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
......
...@@ -20,6 +20,12 @@ set(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy) ...@@ -20,6 +20,12 @@ set(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy)
set(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy) set(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy)
set(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include" CACHE PATH "snappy include directory." FORCE) set(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include" CACHE PATH "snappy include directory." FORCE)
if(WIN32)
SET(SNAPPY_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4244 /wd4267")
else()
SET(SNAPPY_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
endif()
ExternalProject_Add( ExternalProject_Add(
extern_snappy extern_snappy
GIT_REPOSITORY "https://github.com/google/snappy" GIT_REPOSITORY "https://github.com/google/snappy"
...@@ -31,7 +37,7 @@ ExternalProject_Add( ...@@ -31,7 +37,7 @@ ExternalProject_Add(
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} -DCMAKE_CXX_FLAGS=${SNAPPY_CMAKE_CXX_FLAGS}
-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR} -DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR}
......
...@@ -147,12 +147,6 @@ set(GPU_COMMON_FLAGS ...@@ -147,12 +147,6 @@ set(GPU_COMMON_FLAGS
-Wno-error=unused-function # Warnings in Numpy Header. -Wno-error=unused-function # Warnings in Numpy Header.
-Wno-error=array-bounds # Warnings in Eigen::array -Wno-error=array-bounds # Warnings in Eigen::array
) )
else(NOT WIN32)
set(COMMON_FLAGS
"/w") #disable all warnings.
set(GPU_COMMON_FLAGS
"/w") #disable all warnings
endif(NOT WIN32) endif(NOT WIN32)
if (APPLE) if (APPLE)
...@@ -193,8 +187,7 @@ safe_set_static_flag() ...@@ -193,8 +187,7 @@ safe_set_static_flag()
CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO) CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
if(${flag_var} MATCHES "/W3") string(REGEX REPLACE "(^| )/W[0-9]( |$)" " " ${flag_var} "${${flag_var}}")
string(REGEX REPLACE "/W3" "/w" ${flag_var} "${${flag_var}}") set(flag_var "${flag_var} /w")
endif(${flag_var} MATCHES "/W3")
endforeach(flag_var) endforeach(flag_var)
endif(WIN32) endif(WIN32)
...@@ -30,10 +30,25 @@ while ("${PADDLE_VERSION}" STREQUAL "") ...@@ -30,10 +30,25 @@ while ("${PADDLE_VERSION}" STREQUAL "")
else() # otherwise, get the previous git tag name. else() # otherwise, get the previous git tag name.
set(tmp_version "${GIT_TAG_NAME}~1") set(tmp_version "${GIT_TAG_NAME}~1")
endif() endif()
else()
execute_process(
COMMAND ${GIT_EXECUTABLE} describe --exact-match --tags ${tmp_version}
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
OUTPUT_VARIABLE GIT_EXACT_TAG_NAME
RESULT_VARIABLE GIT_EXACT_TAG_RESULT
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
if (NOT ${GIT_EXACT_TAG_NAME})
# Check if current branch is tag branch
if (${GIT_EXACT_TAG_NAME} MATCHES "v${TAG_VERSION_REGEX}")
string(REPLACE "v" "" PADDLE_VERSION ${GIT_EXACT_TAG_NAME})
else()
set(PADDLE_VERSION "0.0.0")
endif()
else() else()
# otherwise, we always set PADDLE_VERSION to 0.0.0 to represent latest # otherwise, we always set PADDLE_VERSION to 0.0.0 to represent latest
set(PADDLE_VERSION "0.0.0") set(PADDLE_VERSION "0.0.0")
endif() endif()
endif()
else() else()
set(PADDLE_VERSION "0.0.0") set(PADDLE_VERSION "0.0.0")
message(WARNING "Cannot add paddle version from git tag") message(WARNING "Cannot add paddle version from git tag")
......
...@@ -403,18 +403,20 @@ void GraphView::Build(ir::Graph* g) { ...@@ -403,18 +403,20 @@ void GraphView::Build(ir::Graph* g) {
// 2. track the nodes which used by parameter server. // 2. track the nodes which used by parameter server.
// these node can not be inplaced, otherwise trainer // these node can not be inplaced, otherwise trainer
// pserver can not find each other name. // pserver can not find each other name.
for (auto& node : g->Nodes()) { auto update_skip_set = [&](ir::Node* node) {
if (!node->IsOp()) continue;
if (node->Name() == "send") {
for (auto& in : node->inputs) { for (auto& in : node->inputs) {
dup_nodes_.emplace(in->Name()); if (in->IsVar() && in->Var() != nullptr) dup_nodes_.emplace(in->Name());
}
} }
if (node->Name() == "recv") {
for (auto& out : node->outputs) { for (auto& out : node->outputs) {
if (out->IsVar() && out->Var() != nullptr)
dup_nodes_.emplace(out->Name()); dup_nodes_.emplace(out->Name());
} }
} };
for (auto& node : g->Nodes()) {
if (!node->IsOp()) continue;
if (node->Name() == "send") update_skip_set(node);
if (node->Name() == "recv") update_skip_set(node);
if (node->Name() == "prefetch") update_skip_set(node);
} }
} }
......
...@@ -51,8 +51,7 @@ static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) { ...@@ -51,8 +51,7 @@ static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) {
std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl( std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const { std::unique_ptr<ir::Graph> graph) const {
auto nodes = graph->Nodes(); auto nodes = graph->Nodes();
auto subblock_vars = GetSubBlockVars(nodes); CollectSkipVarsSet(nodes);
skip_set_.insert(subblock_vars.begin(), subblock_vars.end());
cfg_.reset(new details::ControlFlowGraph(*graph)); cfg_.reset(new details::ControlFlowGraph(*graph));
cfg_->LiveVariableAnalysis(); cfg_->LiveVariableAnalysis();
...@@ -224,20 +223,27 @@ void MemoryOptimizePass::SubGraphOptimize(OpDesc* op_desc) const { ...@@ -224,20 +223,27 @@ void MemoryOptimizePass::SubGraphOptimize(OpDesc* op_desc) const {
} }
} }
std::unordered_set<std::string> MemoryOptimizePass::GetSubBlockVars( void MemoryOptimizePass::CollectSkipVarsSet(
const std::unordered_set<ir::Node*>& nodes) const { const std::unordered_set<ir::Node*>& nodes) const {
std::unordered_set<std::string> vars; auto update_skip_set = [&](OpDesc* op_desc) {
auto inputs = op_desc->InputArgumentNames();
auto outputs = op_desc->OutputArgumentNames();
skip_set_.insert(inputs.begin(), inputs.end());
skip_set_.insert(outputs.begin(), outputs.end());
};
for (auto& op : nodes) { for (auto& op : nodes) {
if (!op->IsOp() || op->Op() == nullptr) continue; if (!op->IsOp() || op->Op() == nullptr) continue;
auto* op_desc = op->Op(); auto* op_desc = op->Op();
if (OpHasSubBlock(op_desc)) { // NOTE(dzhwinter):
auto inputs = op_desc->InputArgumentNames(); // current block can not reuse next level block vars.
auto outputs = op_desc->OutputArgumentNames(); if (OpHasSubBlock(op_desc)) update_skip_set(op_desc);
vars.insert(inputs.begin(), inputs.end()); // NOTE(dzhwinter):
vars.insert(outputs.begin(), outputs.end()); // distributed ops input/output name need to
} // keep same bettwen trainer/pserver
if (op_desc->Type() == "send") update_skip_set(op_desc);
if (op_desc->Type() == "recv") update_skip_set(op_desc);
if (op_desc->Type() == "prefetch") update_skip_set(op_desc);
} }
return vars;
} }
void MemoryOptimizePass::RenameVarInGraphDesc(const std::string& var, void MemoryOptimizePass::RenameVarInGraphDesc(const std::string& var,
......
...@@ -55,9 +55,10 @@ class MemoryOptimizePass : public ir::Pass { ...@@ -55,9 +55,10 @@ class MemoryOptimizePass : public ir::Pass {
ir::Graph* graph) const; ir::Graph* graph) const;
void SubGraphOptimize(OpDesc* op_desc) const; void SubGraphOptimize(OpDesc* op_desc) const;
// scan subblock and collect the output/input variables. // 1. scan op with subblock and collect the output/input vars.
std::unordered_set<std::string> GetSubBlockVars( // while, while_grad, conditional_block
const std::unordered_set<ir::Node*>&) const; // 2. scan distributed ops and collect the output/input vars
void CollectSkipVarsSet(const std::unordered_set<ir::Node*>&) const;
private: private:
// Reuse Node Pool, Owned. // Reuse Node Pool, Owned.
......
...@@ -276,6 +276,7 @@ TEST(InferInplace, MultiGradInplaceInToOut) { ...@@ -276,6 +276,7 @@ TEST(InferInplace, MultiGradInplaceInToOut) {
auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_; auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
auto in_to_outs = infer_inplace(*op, op->Block()); auto in_to_outs = infer_inplace(*op, op->Block());
EXPECT_EQ(in_to_outs.size(), 3ul); EXPECT_EQ(in_to_outs.size(), 3ul);
std::unordered_map<std::string, std::string> expects = { std::unordered_map<std::string, std::string> expects = {
{"o0", "a0"}, {"y0", "b0"}, {"z0", "c0"}, {"o0", "a0"}, {"y0", "b0"}, {"z0", "c0"},
......
...@@ -141,7 +141,8 @@ class Graph { ...@@ -141,7 +141,8 @@ class Graph {
ir::Node *CreateControlDepVar() { ir::Node *CreateControlDepVar() {
// TODO(panyx0718): control var name should be really unique. // TODO(panyx0718): control var name should be really unique.
const std::string name = string::Sprintf( const std::string name = string::Sprintf(
"%s@%llu", ir::Node::kControlDepVarName, node_set_.size()); "%s@%llu", static_cast<const char *>(ir::Node::kControlDepVarName),
node_set_.size());
auto *x = AddNode(new ir::Node(name, ir::Node::Type::kVariable)); auto *x = AddNode(new ir::Node(name, ir::Node::Type::kVariable));
x->SetId(num_node_created_++); x->SetId(num_node_created_++);
return x; return x;
......
...@@ -22,7 +22,11 @@ limitations under the License. */ ...@@ -22,7 +22,11 @@ limitations under the License. */
#include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/printf.h"
DECLARE_bool(benchmark); DEFINE_bool(benchmark, false,
"Doing memory benchmark. It will make deleting scope synchronized, "
"and add some memory usage logs."
"Default cuda is asynchronous device, set to True will"
"force op run in synchronous mode.");
DEFINE_bool( DEFINE_bool(
eager_delete_scope, true, eager_delete_scope, true,
......
if(WITH_PYTHON) if(WITH_PYTHON)
cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas) cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas pybind)
cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context) cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context pybind)
cc_library(engine SRCS engine.cc) cc_library(engine SRCS engine.cc)
endif() endif()
...@@ -58,12 +58,13 @@ if(WIN32) ...@@ -58,12 +58,13 @@ if(WIN32)
sep_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS} sep_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array
analysis_config paddle_pass_builder) analysis_config paddle_pass_builder)
target_link_libraries(paddle_fluid_shared shlwapi)
else(WIN32) else(WIN32)
cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS} cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array
analysis_config paddle_pass_builder) analysis_config paddle_pass_builder)
endif() endif()
get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
target_link_libraries(paddle_fluid_shared ${os_dependency_modules})
set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid) set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid)
if(NOT APPLE AND NOT WIN32) if(NOT APPLE AND NOT WIN32)
......
cc_library(subgraph_detector SRCS subgraph_detector.cc DEPS proto_desc) cc_library(subgraph_detector SRCS subgraph_detector.cc DEPS proto_desc)
if(WITH_TESTING)
add_dependencies(subgraph_detector gtest)
endif()
if (WITH_GPU AND TENSORRT_FOUND) if (WITH_GPU AND TENSORRT_FOUND)
cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_detector tensorrt_op_teller) cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_detector tensorrt_op_teller)
......
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
#include <limits> #include <limits>
#include <map> #include <map>
#include <string> #include <string>
#include <type_traits>
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/graph_helper.h"
...@@ -168,7 +169,11 @@ bool FindSuitableTensorToReuse( ...@@ -168,7 +169,11 @@ bool FindSuitableTensorToReuse(
if (!cluster->count(candidate)) continue; if (!cluster->count(candidate)) continue;
size_t space = space_table.at(candidate); size_t space = space_table.at(candidate);
size_t space_diff = std::abs<size_t>(space - space_required); PADDLE_ENFORCE(
space <= std::numeric_limits<std::make_signed<size_t>::type>::max(),
"space overload");
size_t space_diff =
std::abs((std::make_signed<size_t>::type)space - space_required);
if (space_diff < best_fit.second) { if (space_diff < best_fit.second) {
best_fit.first = candidate; best_fit.first = candidate;
best_fit.second = space_diff; best_fit.second = space_diff;
......
...@@ -35,7 +35,6 @@ DEFINE_bool(init_allocated_mem, false, ...@@ -35,7 +35,6 @@ DEFINE_bool(init_allocated_mem, false,
"To find this error in time, we use init_allocated_mem to indicate " "To find this error in time, we use init_allocated_mem to indicate "
"that initializing the allocated memory with a small value " "that initializing the allocated memory with a small value "
"during unit testing."); "during unit testing.");
DECLARE_bool(benchmark);
DECLARE_double(fraction_of_gpu_memory_to_use); DECLARE_double(fraction_of_gpu_memory_to_use);
namespace paddle { namespace paddle {
...@@ -188,21 +187,20 @@ void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place, ...@@ -188,21 +187,20 @@ void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place,
platform::SetDeviceId(place.device); platform::SetDeviceId(place.device);
size_t avail, total; size_t avail, total;
platform::GpuMemoryUsage(&avail, &total); platform::GpuMemoryUsage(&avail, &total);
LOG(WARNING) << "Cannot allocate " << string::HumanReadableSize(size) LOG(FATAL) << "Cannot allocate " << string::HumanReadableSize(size)
<< " in GPU " << place.device << ", available " << " in GPU " << place.device << ", available "
<< string::HumanReadableSize(avail); << string::HumanReadableSize(avail) << "total " << total
LOG(WARNING) << "total " << total; << "GpuMinChunkSize "
LOG(WARNING) << "GpuMinChunkSize " << string::HumanReadableSize(buddy_allocator->GetMinChunkSize())
<< string::HumanReadableSize( << "GpuMaxChunkSize "
buddy_allocator->GetMinChunkSize()); << string::HumanReadableSize(buddy_allocator->GetMaxChunkSize())
LOG(WARNING) << "GpuMaxChunkSize " << "GPU memory used: "
<< string::HumanReadableSize(
buddy_allocator->GetMaxChunkSize());
LOG(WARNING) << "GPU memory used: "
<< string::HumanReadableSize(Used<platform::CUDAPlace>(place)); << string::HumanReadableSize(Used<platform::CUDAPlace>(place));
platform::SetDeviceId(cur_dev); platform::SetDeviceId(cur_dev);
} else { } else {
if (FLAGS_benchmark) allocation::GPUMemMonitor.Add(place.device, size); if (VLOG_IS_ON(3)) {
allocation::GPUMemMonitor.Add(place.device, size);
}
if (FLAGS_init_allocated_mem) { if (FLAGS_init_allocated_mem) {
cudaMemset(ptr, 0xEF, size); cudaMemset(ptr, 0xEF, size);
} }
...@@ -218,7 +216,9 @@ void Free<platform::CUDAPlace>(const platform::CUDAPlace &place, void *p, ...@@ -218,7 +216,9 @@ void Free<platform::CUDAPlace>(const platform::CUDAPlace &place, void *p,
size_t size) { size_t size) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
GetGPUBuddyAllocator(place.device)->Free(p); GetGPUBuddyAllocator(place.device)->Free(p);
if (FLAGS_benchmark) allocation::GPUMemMonitor.Minus(place.device, size); if (VLOG_IS_ON(3)) {
allocation::GPUMemMonitor.Minus(place.device, size);
}
#else #else
PADDLE_THROW("'CUDAPlace' is not supported in CPU only device."); PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
#endif #endif
......
...@@ -38,20 +38,12 @@ class BoxCoderOp : public framework::OperatorWithKernel { ...@@ -38,20 +38,12 @@ class BoxCoderOp : public framework::OperatorWithKernel {
"The shape of PriorBox is [N, 4]"); "The shape of PriorBox is [N, 4]");
if (ctx->HasInput("PriorBoxVar")) { if (ctx->HasInput("PriorBoxVar")) {
auto prior_box_var_dims = ctx->GetInputDim("PriorBoxVar"); auto prior_box_var_dims = ctx->GetInputDim("PriorBoxVar");
PADDLE_ENFORCE( PADDLE_ENFORCE(prior_box_var_dims.size() == 2,
prior_box_var_dims.size() == 1 || prior_box_var_dims.size() == 2, "Input(PriorBoxVar) of BoxCoderOp should be 2.");
"Input(PriorBoxVar) of BoxCoderOp should be 1 or 2.");
if (prior_box_var_dims.size() == 1) {
PADDLE_ENFORCE_EQ(
prior_box_var_dims[0], 4,
"The 1st dimension of Input(PriorBoxVar) should be 4"
"when the rank is 1.");
} else {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
prior_box_dims, prior_box_var_dims, prior_box_dims, prior_box_var_dims,
"The dimension of Input(PriorBoxVar) should be equal to" "The dimension of Input(PriorBoxVar) should be equal to"
"the dimension of Input(PriorBox when the rank is 2.)"); "the dimension of Input(PriorBox) when the rank is 2.");
}
} }
} }
......
...@@ -56,10 +56,7 @@ __global__ void EncodeCenterSizeKernel( ...@@ -56,10 +56,7 @@ __global__ void EncodeCenterSizeKernel(
output[idx * len + 2] = log(fabs(target_box_width / prior_box_width)); output[idx * len + 2] = log(fabs(target_box_width / prior_box_width));
output[idx * len + 3] = log(fabs(target_box_height / prior_box_height)); output[idx * len + 3] = log(fabs(target_box_height / prior_box_height));
if (prior_box_var_data) { if (prior_box_var_data) {
int prior_var_offset = 0; int prior_var_offset = col_idx * len;
if (prior_box_var_size == 2) {
prior_var_offset = col_idx * len;
}
output[idx * len] /= prior_box_var_data[prior_var_offset]; output[idx * len] /= prior_box_var_data[prior_var_offset];
output[idx * len + 1] /= prior_box_var_data[prior_var_offset + 1]; output[idx * len + 1] /= prior_box_var_data[prior_var_offset + 1];
output[idx * len + 2] /= prior_box_var_data[prior_var_offset + 2]; output[idx * len + 2] /= prior_box_var_data[prior_var_offset + 2];
...@@ -99,10 +96,7 @@ __global__ void DecodeCenterSizeKernel( ...@@ -99,10 +96,7 @@ __global__ void DecodeCenterSizeKernel(
T box_var_x = T(1), box_var_y = T(1); T box_var_x = T(1), box_var_y = T(1);
T box_var_w = T(1), box_var_h = T(1); T box_var_w = T(1), box_var_h = T(1);
if (prior_box_var_data) { if (prior_box_var_data) {
int prior_var_offset = 0; int prior_var_offset = axis == 0 ? col_idx * len : row_idx * len;
if (prior_box_var_size == 2) {
prior_var_offset = axis == 0 ? col_idx * len : row_idx * len;
}
box_var_x = prior_box_var_data[prior_var_offset]; box_var_x = prior_box_var_data[prior_var_offset];
box_var_y = prior_box_var_data[prior_var_offset + 1]; box_var_y = prior_box_var_data[prior_var_offset + 1];
box_var_w = prior_box_var_data[prior_var_offset + 2]; box_var_w = prior_box_var_data[prior_var_offset + 2];
......
...@@ -79,10 +79,7 @@ class BoxCoderKernel : public framework::OpKernel<T> { ...@@ -79,10 +79,7 @@ class BoxCoderKernel : public framework::OpKernel<T> {
output[offset + 3] = output[offset + 3] =
std::log(std::fabs(target_box_height / prior_box_height)); std::log(std::fabs(target_box_height / prior_box_height));
if (prior_box_var) { if (prior_box_var) {
int prior_var_offset = 0; int prior_var_offset = j * len;
if (prior_box_var->dims().size() == 2) {
prior_var_offset = j * len;
}
output[offset] /= prior_box_var_data[prior_var_offset]; output[offset] /= prior_box_var_data[prior_var_offset];
output[offset + 1] /= prior_box_var_data[prior_var_offset + 1]; output[offset + 1] /= prior_box_var_data[prior_var_offset + 1];
output[offset + 2] /= prior_box_var_data[prior_var_offset + 2]; output[offset + 2] /= prior_box_var_data[prior_var_offset + 2];
...@@ -95,11 +92,12 @@ class BoxCoderKernel : public framework::OpKernel<T> { ...@@ -95,11 +92,12 @@ class BoxCoderKernel : public framework::OpKernel<T> {
} }
} }
} }
template <int axis, int var_size>
void DecodeCenterSize(const framework::Tensor* target_box, void DecodeCenterSize(const framework::Tensor* target_box,
const framework::Tensor* prior_box, const framework::Tensor* prior_box,
const framework::Tensor* prior_box_var, const framework::Tensor* prior_box_var,
const bool normalized, const int axis, const bool normalized, std::vector<float> variance,
const std::vector<float> variance, T* output) const { T* output) const {
int64_t row = target_box->dims()[0]; int64_t row = target_box->dims()[0];
int64_t col = target_box->dims()[1]; int64_t col = target_box->dims()[1];
int64_t len = target_box->dims()[2]; int64_t len = target_box->dims()[2];
...@@ -107,19 +105,17 @@ class BoxCoderKernel : public framework::OpKernel<T> { ...@@ -107,19 +105,17 @@ class BoxCoderKernel : public framework::OpKernel<T> {
auto* target_box_data = target_box->data<T>(); auto* target_box_data = target_box->data<T>();
auto* prior_box_data = prior_box->data<T>(); auto* prior_box_data = prior_box->data<T>();
const T* prior_box_var_data = nullptr; const T* prior_box_var_data = nullptr;
if (prior_box_var) prior_box_var_data = prior_box_var->data<T>(); if (var_size == 2) prior_box_var_data = prior_box_var->data<T>();
int prior_box_offset = 0; int prior_box_offset = 0;
T var_data[4] = {1., 1., 1., 1.};
T* var_ptr = var_data;
#ifdef PADDLE_WITH_MKLML #ifdef PADDLE_WITH_MKLML
#pragma omp parallel for collapse(2) #pragma omp parallel for collapse(2)
#endif #endif
for (int64_t i = 0; i < row; ++i) { for (int64_t i = 0; i < row; ++i) {
for (int64_t j = 0; j < col; ++j) { for (int64_t j = 0; j < col; ++j) {
size_t offset = i * col * len + j * len; size_t offset = i * col * len + j * len;
if (axis == 0) { prior_box_offset = axis == 0 ? j * len : i * len;
prior_box_offset = j * len;
} else if (axis == 1) {
prior_box_offset = i * len;
}
T prior_box_width = prior_box_data[prior_box_offset + 2] - T prior_box_width = prior_box_data[prior_box_offset + 2] -
prior_box_data[prior_box_offset] + prior_box_data[prior_box_offset] +
(normalized == false); (normalized == false);
...@@ -133,26 +129,18 @@ class BoxCoderKernel : public framework::OpKernel<T> { ...@@ -133,26 +129,18 @@ class BoxCoderKernel : public framework::OpKernel<T> {
T target_box_center_x = 0, target_box_center_y = 0; T target_box_center_x = 0, target_box_center_y = 0;
T target_box_width = 0, target_box_height = 0; T target_box_width = 0, target_box_height = 0;
T box_var_x = T(1), box_var_y = T(1); int prior_var_offset = axis == 0 ? j * len : i * len;
T box_var_w = T(1), box_var_h = T(1); if (var_size == 2) {
if (prior_box_var) { std::memcpy(var_ptr, prior_box_var_data + prior_var_offset,
int prior_var_offset = 0; 4 * sizeof(T));
if (prior_box_var->dims().size() == 2) { } else if (var_size == 1) {
if (axis == 0) var_ptr = reinterpret_cast<T*>(variance.data());
prior_var_offset = j * len; }
else if (axis == 1) T box_var_x = *var_ptr;
prior_var_offset = i * len; T box_var_y = *(var_ptr + 1);
} T box_var_w = *(var_ptr + 2);
box_var_x = prior_box_var_data[prior_var_offset]; T box_var_h = *(var_ptr + 3);
box_var_y = prior_box_var_data[prior_var_offset + 1];
box_var_w = prior_box_var_data[prior_var_offset + 2];
box_var_h = prior_box_var_data[prior_var_offset + 3];
} else if (!(variance.empty())) {
box_var_x = static_cast<T>(variance[0]);
box_var_y = static_cast<T>(variance[1]);
box_var_w = static_cast<T>(variance[2]);
box_var_h = static_cast<T>(variance[3]);
}
target_box_center_x = target_box_center_x =
box_var_x * target_box_data[offset] * prior_box_width + box_var_x * target_box_data[offset] * prior_box_width +
prior_box_center_x; prior_box_center_x;
...@@ -211,8 +199,31 @@ class BoxCoderKernel : public framework::OpKernel<T> { ...@@ -211,8 +199,31 @@ class BoxCoderKernel : public framework::OpKernel<T> {
EncodeCenterSize(target_box, prior_box, prior_box_var, normalized, EncodeCenterSize(target_box, prior_box, prior_box_var, normalized,
variance, output); variance, output);
} else if (code_type == BoxCodeType::kDecodeCenterSize) { } else if (code_type == BoxCodeType::kDecodeCenterSize) {
DecodeCenterSize(target_box, prior_box, prior_box_var, normalized, axis, if (prior_box_var) {
variance, output); if (axis == 0) {
DecodeCenterSize<0, 2>(target_box, prior_box, prior_box_var,
normalized, variance, output);
} else {
DecodeCenterSize<1, 2>(target_box, prior_box, prior_box_var,
normalized, variance, output);
}
} else if (!(variance.empty())) {
if (axis == 0) {
DecodeCenterSize<0, 1>(target_box, prior_box, prior_box_var,
normalized, variance, output);
} else {
DecodeCenterSize<1, 1>(target_box, prior_box, prior_box_var,
normalized, variance, output);
}
} else {
if (axis == 0) {
DecodeCenterSize<0, 0>(target_box, prior_box, prior_box_var,
normalized, variance, output);
} else {
DecodeCenterSize<1, 0>(target_box, prior_box, prior_box_var,
normalized, variance, output);
}
}
} }
} }
}; };
......
...@@ -37,7 +37,7 @@ math_library(concat_and_split) ...@@ -37,7 +37,7 @@ math_library(concat_and_split)
math_library(context_project DEPS im2col math_function) math_library(context_project DEPS im2col math_function)
math_library(cross_entropy) math_library(cross_entropy)
math_library(cos_sim_functor) math_library(cos_sim_functor)
math_library(depthwise_conv) math_library(depthwise_conv DEPS cub)
math_library(im2col) math_library(im2col)
math_library(sampler) math_library(sampler)
......
...@@ -31,6 +31,7 @@ std::map<std::string, ...@@ -31,6 +31,7 @@ std::map<std::string,
std::shared_ptr<std::unordered_map< std::shared_ptr<std::unordered_map<
std::string, std::shared_ptr<ngraph::Node>>>)>> std::string, std::shared_ptr<ngraph::Node>>>)>>
NgraphBridge::NG_NODE_MAP = { NgraphBridge::NG_NODE_MAP = {
{"accuracy", NG_OPS::BuildAccuracyNode},
{"conv2d", NG_OPS::BuildConv2dNode}, {"conv2d", NG_OPS::BuildConv2dNode},
{"conv2d_grad", NG_OPS::BuildConv2dGradNode}, {"conv2d_grad", NG_OPS::BuildConv2dGradNode},
{"elementwise_add", NG_OPS::BuildElementwiseAddNode}, {"elementwise_add", NG_OPS::BuildElementwiseAddNode},
......
...@@ -21,7 +21,8 @@ limitations under the License. */ ...@@ -21,7 +21,8 @@ limitations under the License. */
#pragma once #pragma once
#include "ops/binary_unnary_op.h" #include "ops/accuracy_op.h"
#include "ops/binary_unary_op.h"
#include "ops/conv2d_op.h" #include "ops/conv2d_op.h"
#include "ops/elementwise_add_op.h" #include "ops/elementwise_add_op.h"
#include "ops/fill_constant_op.h" #include "ops/fill_constant_op.h"
......
/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/platform/ngraph_helper.h"
namespace paddle {
namespace operators {
namespace ngraphs {
void BuildAccuracyNode(
const std::shared_ptr<framework::OperatorBase>& op,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
auto indices = platform::GetInputNode(op, "Indices", ngb_node_map);
auto label = platform::GetInputNode(op, "Label", ngb_node_map);
auto inference = platform::GetInputNode(op, "Out", ngb_node_map);
auto inference_shape = inference->get_shape();
size_t num_samples = inference_shape.at(0);
size_t k = inference_shape.at(1);
std::shared_ptr<ngraph::Node> label_k = label;
if (k > 1) {
auto label_1d = std::make_shared<ngraph::op::Reshape>(
label, ngraph::AxisVector{0, 1}, ngraph::Shape{num_samples});
label_k = std::make_shared<ngraph::op::Broadcast>(label_1d, inference_shape,
ngraph::AxisSet{1});
}
auto node_equal = std::make_shared<ngraph::op::Equal>(indices, label_k);
auto node_eq_int =
std::make_shared<ngraph::op::Convert>(node_equal, ngraph::element::i64);
auto num_correct_0d =
std::make_shared<ngraph::op::Sum>(node_eq_int, ngraph::AxisSet{0, 1});
std::shared_ptr<ngraph::Node> num_correct =
platform::NgReshaper(num_correct_0d, ngraph::Shape{1});
std::shared_ptr<ngraph::Node> n_samples = ngraph::op::Constant::create(
ngraph::element::i64, ngraph::Shape{1}, {num_samples});
std::shared_ptr<ngraph::Node> accuracy = std::make_shared<ngraph::op::Divide>(
std::make_shared<ngraph::op::Convert>(num_correct, ngraph::element::f32),
std::make_shared<ngraph::op::Convert>(n_samples, ngraph::element::f32));
platform::SetOutputNode(op, "Accuracy", accuracy, ngb_node_map);
platform::SetOutputNode(op, "Correct", num_correct, ngb_node_map);
platform::SetOutputNode(op, "Total", n_samples, ngb_node_map);
}
} // namespace ngraphs
} // namespace operators
} // namespace paddle
...@@ -36,11 +36,6 @@ void BuildTopKNode( ...@@ -36,11 +36,6 @@ void BuildTopKNode(
std::make_shared<ngraph::op::GetOutputElement>(top_k, 0); std::make_shared<ngraph::op::GetOutputElement>(top_k, 0);
std::shared_ptr<ngraph::Node> out = std::shared_ptr<ngraph::Node> out =
std::make_shared<ngraph::op::GetOutputElement>(top_k, 1); std::make_shared<ngraph::op::GetOutputElement>(top_k, 1);
auto dummy_out = paddle::platform::GetOutputNode(op, "Out", ngb_node_map);
if (dummy_out && dummy_out->get_element_type() != out->get_element_type()) {
out = std::make_shared<ngraph::op::Convert>(out,
dummy_out->get_element_type());
}
paddle::platform::SetOutputNode(op, "Indices", indices, ngb_node_map); paddle::platform::SetOutputNode(op, "Indices", indices, ngb_node_map);
paddle::platform::SetOutputNode(op, "Out", out, ngb_node_map); paddle::platform::SetOutputNode(op, "Out", out, ngb_node_map);
} }
......
...@@ -259,7 +259,7 @@ Example: ...@@ -259,7 +259,7 @@ Example:
W_{out} = \\frac{(W_{in} - ksize[1] + 2 * paddings[1] + strides[1] - 1)}{strides[1]} + 1 W_{out} = \\frac{(W_{in} - ksize[1] + 2 * paddings[1] + strides[1] - 1)}{strides[1]} + 1
$$ $$
For exclusive = true: For exclusive = false:
$$ $$
hstart = i * strides[0] - paddings[0] hstart = i * strides[0] - paddings[0]
hend = hstart + ksize[0] hend = hstart + ksize[0]
...@@ -267,7 +267,7 @@ Example: ...@@ -267,7 +267,7 @@ Example:
wend = wstart + ksize[1] wend = wstart + ksize[1]
Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{ksize[0] * ksize[1]} Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{ksize[0] * ksize[1]}
$$ $$
For exclusive = false: For exclusive = true:
$$ $$
hstart = max(0, i * strides[0] - paddings[0]) hstart = max(0, i * strides[0] - paddings[0])
hend = min(H, hstart + ksize[0]) hend = min(H, hstart + ksize[0])
...@@ -403,7 +403,7 @@ Example: ...@@ -403,7 +403,7 @@ Example:
H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1] + strides[1] -1)}{strides[1]} + 1 \\ H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1] + strides[1] -1)}{strides[1]} + 1 \\
W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2] + strides[2] -1)}{strides[2]} + 1 W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2] + strides[2] -1)}{strides[2]} + 1
$$ $$
For exclusive = true: For exclusive = false:
$$ $$
dstart = i * strides[0] - paddings[0] dstart = i * strides[0] - paddings[0]
dend = dstart + ksize[0] dend = dstart + ksize[0]
...@@ -413,7 +413,7 @@ Example: ...@@ -413,7 +413,7 @@ Example:
wend = wstart + ksize[2] wend = wstart + ksize[2]
Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{ksize[0] * ksize[1] * ksize[2]} Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{ksize[0] * ksize[1] * ksize[2]}
$$ $$
For exclusive = false: For exclusive = true:
$$ $$
dstart = max(0, i * strides[0] - paddings[0]) dstart = max(0, i * strides[0] - paddings[0])
dend = min(D, dstart + ksize[0]) dend = min(D, dstart + ksize[0])
......
...@@ -213,7 +213,7 @@ void ReadSvmData(const DataDesc& data_desc, std::shared_ptr<Reader> reader, ...@@ -213,7 +213,7 @@ void ReadSvmData(const DataDesc& data_desc, std::shared_ptr<Reader> reader,
framework::LoD lod{lod_data}; framework::LoD lod{lod_data};
lod_tensor.set_lod(lod); lod_tensor.set_lod(lod);
int64_t* tensor_data = lod_tensor.mutable_data<int64_t>( int64_t* tensor_data = lod_tensor.mutable_data<int64_t>(
framework::make_ddim({1, static_cast<int64_t>(batch_feasign.size())}), framework::make_ddim({static_cast<int64_t>(batch_feasign.size()), 1}),
platform::CPUPlace()); platform::CPUPlace());
memcpy(tensor_data, batch_feasign.data(), memcpy(tensor_data, batch_feasign.data(),
batch_feasign.size() * sizeof(int64_t)); batch_feasign.size() * sizeof(int64_t));
...@@ -223,7 +223,7 @@ void ReadSvmData(const DataDesc& data_desc, std::shared_ptr<Reader> reader, ...@@ -223,7 +223,7 @@ void ReadSvmData(const DataDesc& data_desc, std::shared_ptr<Reader> reader,
// insert label tensor // insert label tensor
framework::LoDTensor label_tensor; framework::LoDTensor label_tensor;
auto* label_tensor_data = label_tensor.mutable_data<int64_t>( auto* label_tensor_data = label_tensor.mutable_data<int64_t>(
framework::make_ddim({1, static_cast<int64_t>(batch_label.size())}), framework::make_ddim({static_cast<int64_t>(batch_label.size()), 1}),
platform::CPUPlace()); platform::CPUPlace());
memcpy(label_tensor_data, batch_label.data(), memcpy(label_tensor_data, batch_label.data(),
batch_label.size() * sizeof(int64_t)); batch_label.size() * sizeof(int64_t));
......
...@@ -123,7 +123,7 @@ TEST(CTR_READER, read_data) { ...@@ -123,7 +123,7 @@ TEST(CTR_READER, read_data) {
std::vector<std::tuple<LoD, std::vector<int64_t>>> data_slot_6003{b1, b2, b3, std::vector<std::tuple<LoD, std::vector<int64_t>>> data_slot_6003{b1, b2, b3,
b4}; b4};
std::vector<DDim> label_dims = {{1, 3}, {1, 3}, {1, 3}, {1, 1}}; std::vector<DDim> label_dims = {{3, 1}, {3, 1}, {3, 1}, {1, 1}};
LoDTensorBlockingQueueHolder queue_holder; LoDTensorBlockingQueueHolder queue_holder;
int capacity = 64; int capacity = 64;
......
include(operators) include(operators)
register_operators() if(WITH_GPU)
register_operators(DEPS cub)
else()
register_operators()
endif()
if(WITH_GPU) if(WITH_GPU)
file(GLOB OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.part.cu") file(GLOB OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.part.cu")
......
proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto) proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto simple_threadpool)
py_proto_compile(profiler_py_proto SRCS profiler.proto) py_proto_compile(profiler_py_proto SRCS profiler.proto)
add_custom_target(profiler_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) add_custom_target(profiler_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
...@@ -36,7 +36,7 @@ cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info) ...@@ -36,7 +36,7 @@ cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info)
nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce) nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce)
cc_library(place SRCS place.cc DEPS enforce boost) cc_library(place SRCS place.cc DEPS enforce boost lib_any)
cc_test(place_test SRCS place_test.cc DEPS place glog gflags) cc_test(place_test SRCS place_test.cc DEPS place glog gflags)
add_subdirectory(dynload) add_subdirectory(dynload)
......
...@@ -43,13 +43,14 @@ std::shared_ptr<ngraph::Node> NgReshaper(std::shared_ptr<ngraph::Node> input, ...@@ -43,13 +43,14 @@ std::shared_ptr<ngraph::Node> NgReshaper(std::shared_ptr<ngraph::Node> input,
std::shared_ptr<ngraph::Node> GetNode( std::shared_ptr<ngraph::Node> GetNode(
const std::shared_ptr<paddle::framework::OperatorBase>& op, const std::shared_ptr<paddle::framework::OperatorBase>& op,
const std::string prm, const paddle::framework::VariableNameMap& var_map, const std::string name, const paddle::framework::VariableNameMap& var_map,
std::shared_ptr< std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>> std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) { ngb_node_map) {
auto& var_names = var_map.at(prm); auto& var_names = var_map.at(name);
PADDLE_ENFORCE_EQ(var_names.size(), 1, PADDLE_ENFORCE_EQ(var_names.size(), 1,
"op %s prm %s expects one associated var", op->Type(), prm); "op %s name %s expects one associated var", op->Type(),
name);
if (ngb_node_map->find(var_names[0]) != ngb_node_map->end()) { if (ngb_node_map->find(var_names[0]) != ngb_node_map->end()) {
return (*ngb_node_map)[var_names[0]]; return (*ngb_node_map)[var_names[0]];
} else { } else {
...@@ -59,43 +60,53 @@ std::shared_ptr<ngraph::Node> GetNode( ...@@ -59,43 +60,53 @@ std::shared_ptr<ngraph::Node> GetNode(
std::shared_ptr<ngraph::Node> GetInputNode( std::shared_ptr<ngraph::Node> GetInputNode(
const std::shared_ptr<paddle::framework::OperatorBase>& op, const std::shared_ptr<paddle::framework::OperatorBase>& op,
const std::string prm, const std::string name,
std::shared_ptr< std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>> std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) { ngb_node_map) {
return GetNode(op, prm, op->Inputs(), ngb_node_map); return GetNode(op, name, op->Inputs(), ngb_node_map);
} }
std::shared_ptr<ngraph::Node> GetOutputNode( std::shared_ptr<ngraph::Node> GetOutputNode(
const std::shared_ptr<paddle::framework::OperatorBase>& op, const std::shared_ptr<paddle::framework::OperatorBase>& op,
const std::string prm, const std::string name,
std::shared_ptr< std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>> std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) { ngb_node_map) {
return GetNode(op, prm, op->Outputs(), ngb_node_map); return GetNode(op, name, op->Outputs(), ngb_node_map);
} }
void SetOutputNode( void SetOutputNode(
const std::shared_ptr<paddle::framework::OperatorBase>& op, const std::shared_ptr<paddle::framework::OperatorBase>& op,
const std::string prm, std::shared_ptr<ngraph::Node> node, const std::string name, std::shared_ptr<ngraph::Node> node,
std::shared_ptr< std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>> std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) { ngb_node_map) {
auto& var_names = op->Outputs().at(prm); auto& var_names = op->Outputs().at(name);
if (var_names.size() == 1) { if (var_names.size() == 1) {
/* */
auto dummy_out = GetOutputNode(op, name, ngb_node_map);
if (dummy_out && dummy_out->get_shape() != node->get_shape()) {
node = NgReshaper(node, dummy_out->get_shape());
}
if (dummy_out &&
dummy_out->get_element_type() != node->get_element_type()) {
node = std::make_shared<ngraph::op::Convert>(
node, dummy_out->get_element_type());
}
(*ngb_node_map)[var_names[0]] = node; (*ngb_node_map)[var_names[0]] = node;
} else if (var_names.size() == 0) { } else if (var_names.size() == 0) {
(*ngb_node_map)[""] = node; (*ngb_node_map)[""] = node;
} else { } else {
PADDLE_THROW("prm %s has more than 1 var_names.", prm); PADDLE_THROW("name %s has more than 1 var_names.", name);
} }
} }
bool HasOutput(const std::shared_ptr<paddle::framework::OperatorBase>& op, bool HasOutput(const std::shared_ptr<paddle::framework::OperatorBase>& op,
const std::string prm) { const std::string name) {
auto& outputs = op->Outputs(); auto& outputs = op->Outputs();
if (outputs.find(prm) == outputs.end()) return false; if (outputs.find(name) == outputs.end()) return false;
return outputs.at(prm).size() > 0; return outputs.at(name).size() > 0;
} }
inline void GetMidDims(const ngraph::Shape& x_shape, inline void GetMidDims(const ngraph::Shape& x_shape,
......
...@@ -14,12 +14,6 @@ limitations under the License. */ ...@@ -14,12 +14,6 @@ limitations under the License. */
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
DEFINE_bool(benchmark, false,
"Doing memory benchmark. It will make deleting scope synchronized, "
"and add some memory usage logs."
"Default cuda is asynchronous device, set to True will"
"force op run in synchronous mode.");
namespace paddle { namespace paddle {
namespace platform { namespace platform {
......
...@@ -26,5 +26,5 @@ if(WITH_PYTHON) ...@@ -26,5 +26,5 @@ if(WITH_PYTHON)
get_property (os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) get_property (os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
target_link_libraries(paddle_pybind ${os_dependency_modules}) target_link_libraries(paddle_pybind ${os_dependency_modules})
cc_test(tensor_py_test SRCS tensor_py_test.cc DEPS python) cc_test(tensor_py_test SRCS tensor_py_test.cc DEPS python pybind)
endif(WITH_PYTHON) endif(WITH_PYTHON)
...@@ -54,7 +54,7 @@ ELSE(WIN32) ...@@ -54,7 +54,7 @@ ELSE(WIN32)
DEPENDS copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER}) DEPENDS copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER})
ENDIF() ENDIF()
set(paddle_python_deps ${PADDLE_PYTHON_BUILD_DIR}/.timestamp ${MKL_DEPENDS}) set(paddle_python_deps ${PADDLE_PYTHON_BUILD_DIR}/.timestamp ${MKL_DEPENDS} ${external_project_dependencies})
add_custom_target(paddle_python ALL DEPENDS ${paddle_python_deps}) add_custom_target(paddle_python ALL DEPENDS ${paddle_python_deps})
set(PADDLE_PYTHON_PACKAGE_DIR ${CMAKE_CURRENT_BINARY_DIR}/dist/) set(PADDLE_PYTHON_PACKAGE_DIR ${CMAKE_CURRENT_BINARY_DIR}/dist/)
......
...@@ -397,10 +397,10 @@ def box_coder(prior_box, ...@@ -397,10 +397,10 @@ def box_coder(prior_box,
input is image feature map, they are close to input is image feature map, they are close to
the origin of the coordinate system. [xmax, ymax] the origin of the coordinate system. [xmax, ymax]
is the right bottom coordinate of the anchor box. is the right bottom coordinate of the anchor box.
prior_box_var(Variable|list): prior_box_var supports two types of input. prior_box_var(Variable|list|None): prior_box_var supports two types
One is variable with shape [M, 4] holds M group. of input. One is variable with shape [M, 4]
The other one is list consist of 4 elements holds M group. The other one is list consist of
shared by all boxes. 4 elements shared by all boxes.
target_box(Variable): This input can be a 2-D LoDTensor with shape target_box(Variable): This input can be a 2-D LoDTensor with shape
[N, 4] when code_type is 'encode_center_size'. [N, 4] when code_type is 'encode_center_size'.
This input also can be a 3-D Tensor with shape This input also can be a 3-D Tensor with shape
......
...@@ -484,7 +484,7 @@ def _py_reader(capacity, ...@@ -484,7 +484,7 @@ def _py_reader(capacity,
name=None, name=None,
use_double_buffer=True, use_double_buffer=True,
feed_list=None): feed_list=None):
use_cuda_pinned_place = use_double_buffer and core.is_compiled_with_cuda()
if feed_list is not None: if feed_list is not None:
if not isinstance(feed_list, list): if not isinstance(feed_list, list):
raise TypeError("feed_list should be a list of Variable" raise TypeError("feed_list should be a list of Variable"
...@@ -565,9 +565,6 @@ def _py_reader(capacity, ...@@ -565,9 +565,6 @@ def _py_reader(capacity,
for item in tensors: for item in tensors:
if not isinstance(item, core.LoDTensor): if not isinstance(item, core.LoDTensor):
tmp = core.LoDTensor() tmp = core.LoDTensor()
if use_cuda_pinned_place:
tmp.set(item, core.CUDAPinnedPlace())
else:
tmp.set(item, core.CPUPlace()) tmp.set(item, core.CPUPlace())
item = tmp item = tmp
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import paddle.fluid.core as core
from paddle.fluid.tests.unittests.op_test import OpTest
from paddle.fluid.tests.unittests.test_accuracy_op import TestAccuracyOp
class TestNGRAPHAccuracyOp(TestAccuracyOp):
def setUp(self):
super(TestNGRAPHAccuracyOp, self).setUp()
if __name__ == '__main__':
unittest.main()
...@@ -34,7 +34,9 @@ def box_decoder(t_box, p_box, pb_v, output_box, norm, axis=0): ...@@ -34,7 +34,9 @@ def box_decoder(t_box, p_box, pb_v, output_box, norm, axis=0):
pb_y = pb_y.reshape(shape) pb_y = pb_y.reshape(shape)
if pb_v.ndim == 2: if pb_v.ndim == 2:
pb_v = pb_v.reshape(1, pb_v.shape[0], pb_v.shape[1]) var_shape = (1, pb_v.shape[0], pb_v.shape[1]) if axis == 0 else (
pb_v.shape[0], 1, pb_v.shape[1])
pb_v = pb_v.reshape(var_shape)
if pb_v.ndim == 1: if pb_v.ndim == 1:
tb_x = pb_v[0] * t_box[:, :, 0] * pb_w + pb_x tb_x = pb_v[0] * t_box[:, :, 0] * pb_w + pb_x
tb_y = pb_v[1] * t_box[:, :, 1] * pb_h + pb_y tb_y = pb_v[1] * t_box[:, :, 1] * pb_h + pb_y
...@@ -125,33 +127,6 @@ class TestBoxCoderOp(OpTest): ...@@ -125,33 +127,6 @@ class TestBoxCoderOp(OpTest):
self.outputs = {'OutputBox': output_box} self.outputs = {'OutputBox': output_box}
class TestBoxCoderOpWithOneRankVar(OpTest):
def test_check_output(self):
self.check_output()
def setUp(self):
self.op_type = "box_coder"
lod = [[1, 1, 1, 1, 1]]
prior_box = np.random.random((81, 4)).astype('float32')
prior_box_var = np.random.random((4)).astype('float32')
target_box = np.random.random((20, 81, 4)).astype('float32')
code_type = "DecodeCenterSize"
box_normalized = False
output_box = batch_box_coder(prior_box, prior_box_var, target_box,
lod[0], code_type, box_normalized)
self.inputs = {
'PriorBox': prior_box,
'PriorBoxVar': prior_box_var,
'TargetBox': target_box,
}
self.attrs = {
'code_type': 'decode_center_size',
'box_normalized': False
}
self.outputs = {'OutputBox': output_box}
class TestBoxCoderOpWithoutBoxVar(OpTest): class TestBoxCoderOpWithoutBoxVar(OpTest):
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
...@@ -210,7 +185,7 @@ class TestBoxCoderOpWithAxis(OpTest): ...@@ -210,7 +185,7 @@ class TestBoxCoderOpWithAxis(OpTest):
self.op_type = "box_coder" self.op_type = "box_coder"
lod = [[1, 1, 1, 1, 1]] lod = [[1, 1, 1, 1, 1]]
prior_box = np.random.random((30, 4)).astype('float32') prior_box = np.random.random((30, 4)).astype('float32')
prior_box_var = np.random.random((4)).astype('float32') prior_box_var = np.random.random((30, 4)).astype('float32')
target_box = np.random.random((30, 81, 4)).astype('float32') target_box = np.random.random((30, 81, 4)).astype('float32')
code_type = "DecodeCenterSize" code_type = "DecodeCenterSize"
box_normalized = False box_normalized = False
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册