diff --git a/CMakeLists.txt b/CMakeLists.txt
index e4442d254901e2524385452ebe5ac6f6df3056f9..61f5e63098c40f140774ba6bfd9a2de8d2d67bfb 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -25,12 +25,18 @@ message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: "
 message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: "
         "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
 if(WIN32)
+    set(CMAKE_SUPPRESS_REGENERATION ON)
     set(CMAKE_STATIC_LIBRARY_PREFIX lib)
     add_definitions("/DGOOGLE_GLOG_DLL_DECL=")
     set(CMAKE_C_FLAGS_DEBUG   "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd")
     set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT")
     set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd")
     set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT")
+    add_compile_options(/wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838)
+    set(PADDLE_LINK_FLAGS "/IGNORE:4006 /IGNORE:4098 /IGNORE:4217 /IGNORE:4221")
+    set(CMAKE_STATIC_LINKER_FLAGS "${CMAKE_STATIC_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
+    set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
+    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
 endif(WIN32)
 
 find_package(CUDA QUIET)
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index 076e839120d98d801de4374f2f8338ebd918b88f..b0f54bf49aafb65f1a92fa95877de2cc61fc67d3 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -152,7 +152,12 @@ endif()
 
 if (WITH_MKLML AND MKLML_IOMP_LIB)
     message(STATUS "Enable Intel OpenMP with ${MKLML_IOMP_LIB}")
-    set(OPENMP_FLAGS "-fopenmp")
+    if(WIN32)
+        # OpenMP is not yet well supported on Windows
+        set(OPENMP_FLAGS "")
+    else(WIN32)
+        set(OPENMP_FLAGS "-fopenmp")
+    endif(WIN32)
     set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
     set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}")
diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake
index ea46f6418edf1db70b2a308dd49cf2131cc89d3b..ef4192ecc98ea6de0c81c1f33320528d547b818a 100644
--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@@ -203,25 +203,26 @@ list(APPEND CUDA_NVCC_FLAGS "-w")
 list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr")
 
 if (NOT WIN32)
-if(CMAKE_BUILD_TYPE STREQUAL "Debug")
-  list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG})
-elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
-  list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE})
-elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
-  list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO})
-elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel")
-  # nvcc 9 does not support -Os. Use Release flags instead
-  list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE})
-endif()
+  if(CMAKE_BUILD_TYPE STREQUAL "Debug")
+    list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG})
+  elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
+    list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE})
+  elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
+    list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO})
+  elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel")
+    # nvcc 9 does not support -Os. Use Release flags instead
+    list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE})
+  endif()
 else(NOT WIN32)
-list(APPEND CUDA_NVCC_FLAGS "--compiler-options;/bigobj")
-if(CMAKE_BUILD_TYPE STREQUAL "Debug")
-  list(APPEND CUDA_NVCC_FLAGS "-g -G")
-  # match the cl's _ITERATOR_DEBUG_LEVEL
-  list(APPEND CUDA_NVCC_FLAGS "-D_DEBUG")
-elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
-  list(APPEND CUDA_NVCC_FLAGS "-O3 -DNDEBUG")
-else()
+  list(APPEND CUDA_NVCC_FLAGS "-Xcompiler \"/wd 4244 /wd 4267 /wd 4819\"")
+  list(APPEND CUDA_NVCC_FLAGS "--compiler-options;/bigobj")
+  if(CMAKE_BUILD_TYPE STREQUAL "Debug")
+    list(APPEND CUDA_NVCC_FLAGS "-g -G")
+    # match the cl's _ITERATOR_DEBUG_LEVEL
+    list(APPEND CUDA_NVCC_FLAGS "-D_DEBUG")
+  elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
+    list(APPEND CUDA_NVCC_FLAGS "-O3 -DNDEBUG")
+  else()
     message(FATAL "Windows only support Release or Debug build now. Please set visual studio build type to Release/Debug, x64 build.")
 endif()
 endif(NOT WIN32)
diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake
index 7a6a4523886824a67c82f9ce978de025ddb9c2cd..d3a4d69d3a05515fdf72074083470e19b4ec255c 100644
--- a/cmake/external/glog.cmake
+++ b/cmake/external/glog.cmake
@@ -20,8 +20,10 @@ SET(GLOG_INCLUDE_DIR "${GLOG_INSTALL_DIR}/include" CACHE PATH "glog include dire
 
 IF(WIN32)
   SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.lib" CACHE FILEPATH "glog library." FORCE)
+  SET(GLOG_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4267 /wd4530")
 ELSE(WIN32)
   SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.a" CACHE FILEPATH "glog library." FORCE)
+  SET(GLOG_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
 ENDIF(WIN32)
 
 INCLUDE_DIRECTORIES(${GLOG_INCLUDE_DIR})
@@ -39,7 +41,7 @@ ExternalProject_Add(
     UPDATE_COMMAND  ""
     CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
                     -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-                    -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+                    -DCMAKE_CXX_FLAGS=${GLOG_CMAKE_CXX_FLAGS}
                     -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
                     -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
                     -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake
index 6a7be73f09a278ab0fd29c7599a7781df3d29413..92fe76d05c7507c295b784bc37870abfc31a0a29 100644
--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@@ -49,6 +49,8 @@ IF(NOT WIN32)
   SET(MKLDNN_FLAG "${MKLDNN_FLAG} -Wno-unused-result -Wno-unused-value")
   SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} ${MKLDNN_FLAG}")
   SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} ${MKLDNN_FLAG}")
+ELSE()
+  SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} /EHsc")
 ENDIF(NOT WIN32)
 
 ExternalProject_Add(
@@ -61,7 +63,6 @@ ExternalProject_Add(
     UPDATE_COMMAND  ""
     CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
     CMAKE_ARGS      -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-    CMAKE_ARGS      -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
     CMAKE_ARGS      -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
     CMAKE_ARGS      -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
     CMAKE_ARGS      -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
diff --git a/cmake/external/snappy.cmake b/cmake/external/snappy.cmake
index 27d075336d556528ffaf1929c34753494692f0a0..1e01057aa606af78cd722d3619a710cb35817174 100644
--- a/cmake/external/snappy.cmake
+++ b/cmake/external/snappy.cmake
@@ -20,6 +20,12 @@ set(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy)
 set(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy)
 set(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include" CACHE PATH "snappy include directory." FORCE)
 
+if(WIN32)
+  SET(SNAPPY_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4244 /wd4267")
+else()
+  SET(SNAPPY_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
+endif()
+
 ExternalProject_Add(
     extern_snappy
     GIT_REPOSITORY "https://github.com/google/snappy"
@@ -31,7 +37,7 @@ ExternalProject_Add(
                     -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
                     -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
                     -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-                    -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+                    -DCMAKE_CXX_FLAGS=${SNAPPY_CMAKE_CXX_FLAGS}
                    -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
                     -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
                     -DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR}
diff --git a/cmake/flags.cmake b/cmake/flags.cmake
index 9e6c47f016fe6dfd809c5b2bc88ff59d0a6b2b84..81e7868a6ad3fee16911a49ff9d1394a103706c5 100644
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -147,12 +147,6 @@ set(GPU_COMMON_FLAGS
     -Wno-error=unused-function  # Warnings in Numpy Header.
     -Wno-error=array-bounds     # Warnings in Eigen::array
 )
-
-else(NOT WIN32)
-set(COMMON_FLAGS
-    "/w") #disable all warnings.
-set(GPU_COMMON_FLAGS
-    "/w") #disable all warnings
 endif(NOT WIN32)
 
 if (APPLE)
@@ -193,8 +187,7 @@ safe_set_static_flag()
                 CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
                 CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
                 CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
-      if(${flag_var} MATCHES "/W3")
-        string(REGEX REPLACE "/W3" "/w" ${flag_var} "${${flag_var}}")
-      endif(${flag_var} MATCHES "/W3")
+      string(REGEX REPLACE "(^| )/W[0-9]( |$)" " " ${flag_var} "${${flag_var}}")
+      set(${flag_var} "${${flag_var}} /w")
     endforeach(flag_var)
 endif(WIN32)
diff --git a/cmake/version.cmake b/cmake/version.cmake
index ac10bdf067be549fe90112aef73fd6e1fbe0ac48..dd57d4ab9969ce530f93ca1694350b1a26b5b543 100644
--- a/cmake/version.cmake
+++ b/cmake/version.cmake
@@ -31,8 +31,23 @@ while ("${PADDLE_VERSION}" STREQUAL "")
       set(tmp_version "${GIT_TAG_NAME}~1")
     endif()
   else()
-    # otherwise, we always set PADDLE_VERSION to 0.0.0 to represent latest
-    set(PADDLE_VERSION "0.0.0")
+    execute_process(
+      COMMAND ${GIT_EXECUTABLE} describe --exact-match --tags ${tmp_version}
+      WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
+      OUTPUT_VARIABLE GIT_EXACT_TAG_NAME
+      RESULT_VARIABLE GIT_EXACT_TAG_RESULT
+      ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+    if (NOT ${GIT_EXACT_TAG_NAME})
+      # Check if current branch is a tag branch
+      if (${GIT_EXACT_TAG_NAME} MATCHES "v${TAG_VERSION_REGEX}")
+        string(REPLACE "v" "" PADDLE_VERSION ${GIT_EXACT_TAG_NAME})
+      else()
+        set(PADDLE_VERSION "0.0.0")
+      endif()
+    else()
+      # otherwise, we always set PADDLE_VERSION to 0.0.0 to represent latest
+      set(PADDLE_VERSION "0.0.0")
+    endif()
   endif()
 else()
   set(PADDLE_VERSION "0.0.0")
diff --git a/paddle/fluid/framework/details/inplace_op_pass.cc b/paddle/fluid/framework/details/inplace_op_pass.cc
index 64368a5e8737b2484bda9b7dd52451b4d4f760ff..78c5d5b50e606daa963e728355dc1bce83cd5484 100644
--- a/paddle/fluid/framework/details/inplace_op_pass.cc
+++ b/paddle/fluid/framework/details/inplace_op_pass.cc
@@ -403,18 +403,20 @@ void GraphView::Build(ir::Graph* g) {
   // 2. track the nodes which used by parameter server.
   // these node can not be inplaced, otherwise trainer
   // pserver can not find each other name.
-  for (auto& node : g->Nodes()) {
-    if (!node->IsOp()) continue;
-    if (node->Name() == "send") {
-      for (auto& in : node->inputs) {
-        dup_nodes_.emplace(in->Name());
-      }
+  auto update_skip_set = [&](ir::Node* node) {
+    for (auto& in : node->inputs) {
+      if (in->IsVar() && in->Var() != nullptr) dup_nodes_.emplace(in->Name());
     }
-    if (node->Name() == "recv") {
-      for (auto& out : node->outputs) {
+    for (auto& out : node->outputs) {
+      if (out->IsVar() && out->Var() != nullptr)
         dup_nodes_.emplace(out->Name());
-      }
     }
+  };
+  for (auto& node : g->Nodes()) {
+    if (!node->IsOp()) continue;
+    if (node->Name() == "send") update_skip_set(node);
+    if (node->Name() == "recv") update_skip_set(node);
+    if (node->Name() == "prefetch") update_skip_set(node);
   }
 }
diff --git a/paddle/fluid/framework/details/memory_optimize_pass.cc b/paddle/fluid/framework/details/memory_optimize_pass.cc
index 33ca45668e86bdbe615b91366db7e286258dd7d6..85de14a60a8fe6958794f0ac25768b9da1943f9d 100644
--- a/paddle/fluid/framework/details/memory_optimize_pass.cc
+++ b/paddle/fluid/framework/details/memory_optimize_pass.cc
@@ -51,8 +51,7 @@ static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) {
 std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl(
     std::unique_ptr<ir::Graph> graph) const {
   auto nodes = graph->Nodes();
-  auto subblock_vars = GetSubBlockVars(nodes);
-  skip_set_.insert(subblock_vars.begin(), subblock_vars.end());
+  CollectSkipVarsSet(nodes);
 
   cfg_.reset(new details::ControlFlowGraph(*graph));
   cfg_->LiveVariableAnalysis();
@@ -224,20 +223,27 @@ void MemoryOptimizePass::SubGraphOptimize(OpDesc* op_desc) const {
   }
 }
 
-std::unordered_set<std::string> MemoryOptimizePass::GetSubBlockVars(
+void MemoryOptimizePass::CollectSkipVarsSet(
     const std::unordered_set<ir::Node*>& nodes) const {
-  std::unordered_set<std::string> vars;
+  auto update_skip_set = [&](OpDesc* op_desc) {
+    auto inputs = op_desc->InputArgumentNames();
+    auto outputs = op_desc->OutputArgumentNames();
+    skip_set_.insert(inputs.begin(), inputs.end());
+    skip_set_.insert(outputs.begin(), outputs.end());
+  };
   for (auto& op : nodes) {
     if (!op->IsOp() || op->Op() == nullptr) continue;
     auto* op_desc = op->Op();
-    if (OpHasSubBlock(op_desc)) {
-      auto inputs = op_desc->InputArgumentNames();
-      auto outputs = op_desc->OutputArgumentNames();
-      vars.insert(inputs.begin(), inputs.end());
-      vars.insert(outputs.begin(), outputs.end());
-    }
+    // NOTE(dzhwinter):
+    // the current block can not reuse vars from a nested block.
+    if (OpHasSubBlock(op_desc)) update_skip_set(op_desc);
+    // NOTE(dzhwinter):
+    // distributed ops' input/output names must stay the same
+    // between trainer and pserver.
+    if (op_desc->Type() == "send") update_skip_set(op_desc);
+    if (op_desc->Type() == "recv") update_skip_set(op_desc);
+    if (op_desc->Type() == "prefetch") update_skip_set(op_desc);
   }
-  return vars;
 }
 
 void MemoryOptimizePass::RenameVarInGraphDesc(const std::string& var,
diff --git a/paddle/fluid/framework/details/memory_optimize_pass.h b/paddle/fluid/framework/details/memory_optimize_pass.h
index b3e026e0bc1e222e82a22b343c86ddc87a967e8f..3d6b1897f3b5106054b8f647f9cf613ebd1d65ff 100644
--- a/paddle/fluid/framework/details/memory_optimize_pass.h
+++ b/paddle/fluid/framework/details/memory_optimize_pass.h
@@ -55,9 +55,10 @@ class MemoryOptimizePass : public ir::Pass {
                          ir::Graph* graph) const;
 
   void SubGraphOptimize(OpDesc* op_desc) const;
-  // scan subblock and collect the output/input variables.
-  std::unordered_set<std::string> GetSubBlockVars(
-      const std::unordered_set<ir::Node*>&) const;
+  // 1. scan ops with a subblock (while, while_grad, conditional_block)
+  //    and collect their input/output vars.
+  // 2. scan distributed ops and collect their input/output vars.
+  void CollectSkipVarsSet(const std::unordered_set<ir::Node*>&) const;
 
  private:
   // Reuse Node Pool, Owned.
diff --git a/paddle/fluid/framework/inplace_op_inference_test.cc b/paddle/fluid/framework/inplace_op_inference_test.cc
index 121f648a5f04ae65560ae8d04042e40df61aad50..3e4d715c6f089496d1b1f7906e3f10147a073622 100644
--- a/paddle/fluid/framework/inplace_op_inference_test.cc
+++ b/paddle/fluid/framework/inplace_op_inference_test.cc
@@ -276,6 +276,7 @@ TEST(InferInplace, MultiGradInplaceInToOut) {
 
   auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
   auto in_to_outs = infer_inplace(*op, op->Block());
+  EXPECT_EQ(in_to_outs.size(), 3ul);
   std::unordered_map<std::string, std::string> expects = {
       {"o0", "a0"}, {"y0", "b0"}, {"z0", "c0"},
   };
diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h
index 8bb3c27bdd32d07d58913db043569f6a3bf69aeb..b7f7c3d82e0da4d3ca8795487fa52fba0394e365 100644
--- a/paddle/fluid/framework/ir/graph.h
+++ b/paddle/fluid/framework/ir/graph.h
@@ -141,7 +141,8 @@ class Graph {
   ir::Node *CreateControlDepVar() {
     // TODO(panyx0718): control var name should be really unique.
     const std::string name = string::Sprintf(
-        "%s@%llu", ir::Node::kControlDepVarName, node_set_.size());
+        "%s@%llu", static_cast<const char *>(ir::Node::kControlDepVarName),
+        node_set_.size());
     auto *x = AddNode(new ir::Node(name, ir::Node::Type::kVariable));
     x->SetId(num_node_created_++);
     return x;
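A note on the graph.h hunk above: string::Sprintf deduces its argument types, so passing the static constexpr member array ir::Node::kControlDepVarName directly makes deduction see the raw array type (and odr-uses the member, which can fail to link on some toolchains before C++17 unless an out-of-class definition exists). The added cast hands Sprintf a plain pointer instead. A minimal sketch of the deduction difference; Inspect is a hypothetical stand-in for string::Sprintf, not Paddle code:

```cpp
#include <iostream>
#include <typeinfo>

struct Node {
  static constexpr char kControlDepVarName[] = "__control_var";
};
// Out-of-class definition; required (pre-C++17) once the array is odr-used.
constexpr char Node::kControlDepVarName[];

// Stand-in for string::Sprintf: arguments are taken by const reference, so
// template deduction sees exactly what the caller passes.
template <typename T>
void Inspect(const T& arg) {
  std::cout << typeid(T).name() << "\n";
}

int main() {
  Inspect(Node::kControlDepVarName);  // T deduced as char[14]
  Inspect(static_cast<const char *>(Node::kControlDepVarName));  // T = const char *
}
```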
+ "Default cuda is asynchronous device, set to True will" + "force op run in synchronous mode."); DEFINE_bool( eager_delete_scope, true, diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index 5db422119966948f75970874e13d416ea699158a..ec8dedd605235a2d197e6a313bd589d5b9520cdf 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -1,5 +1,5 @@ if(WITH_PYTHON) -cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas) -cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context) +cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas pybind) +cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context pybind) cc_library(engine SRCS engine.cc) endif() diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 11484a647303b32a6006bef3cfe4be6b3f0d533d..157862016e3556902f6507e02417624363ed1029 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -58,12 +58,13 @@ if(WIN32) sep_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS} DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array analysis_config paddle_pass_builder) - target_link_libraries(paddle_fluid_shared shlwapi) else(WIN32) cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS} DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array analysis_config paddle_pass_builder) endif() +get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) +target_link_libraries(paddle_fluid_shared ${os_dependency_modules}) set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid) if(NOT APPLE AND NOT WIN32) diff --git a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt index eb6e1768a2c01f1388962eefe8e70368cae8cf8b..410a90132aa7657a23b858570763547fe53730a0 100644 --- a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt @@ -1,4 +1,7 @@ cc_library(subgraph_detector SRCS subgraph_detector.cc DEPS proto_desc) +if(WITH_TESTING) + add_dependencies(subgraph_detector gtest) +endif() if (WITH_GPU AND TENSORRT_FOUND) cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_detector tensorrt_op_teller) diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc index 3d1be9196fdeacd8ff852dbb595473a687352ccf..4b0a9d9b1c48fcb0d5e44ec1b977c817f3c70b2e 100644 --- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc +++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include "paddle/fluid/framework/ir/graph_helper.h" @@ -168,7 +169,11 @@ bool FindSuitableTensorToReuse( if (!cluster->count(candidate)) continue; size_t space = space_table.at(candidate); - size_t space_diff = std::abs(space - space_required); + PADDLE_ENFORCE( + space <= std::numeric_limits::type>::max(), + "space overload"); + size_t space_diff = + std::abs((std::make_signed::type)space - space_required); if (space_diff < best_fit.second) { best_fit.first = candidate; best_fit.second = space_diff; diff --git a/paddle/fluid/memory/allocation/legacy_allocator.cc b/paddle/fluid/memory/allocation/legacy_allocator.cc index 
diff --git a/paddle/fluid/memory/allocation/legacy_allocator.cc b/paddle/fluid/memory/allocation/legacy_allocator.cc
index ef62f758e37f28ab826faac84fd1276b14de7980..327adcc4aac1c50b51942c557d66dae6770e24f2 100644
--- a/paddle/fluid/memory/allocation/legacy_allocator.cc
+++ b/paddle/fluid/memory/allocation/legacy_allocator.cc
@@ -35,7 +35,6 @@ DEFINE_bool(init_allocated_mem, false,
             "To find this error in time, we use init_allocated_mem to indicate "
             "that initializing the allocated memory with a small value "
             "during unit testing.");
-DECLARE_bool(benchmark);
 DECLARE_double(fraction_of_gpu_memory_to_use);
 
 namespace paddle {
@@ -188,21 +187,20 @@ void *Alloc(const platform::CUDAPlace &place,
     platform::SetDeviceId(place.device);
     size_t avail, total;
     platform::GpuMemoryUsage(&avail, &total);
-    LOG(WARNING) << "Cannot allocate " << string::HumanReadableSize(size)
-                 << " in GPU " << place.device << ", available "
-                 << string::HumanReadableSize(avail);
-    LOG(WARNING) << "total " << total;
-    LOG(WARNING) << "GpuMinChunkSize "
-                 << string::HumanReadableSize(
-                        buddy_allocator->GetMinChunkSize());
-    LOG(WARNING) << "GpuMaxChunkSize "
-                 << string::HumanReadableSize(
-                        buddy_allocator->GetMaxChunkSize());
-    LOG(WARNING) << "GPU memory used: "
-                 << string::HumanReadableSize(Used(place));
+    LOG(FATAL) << "Cannot allocate " << string::HumanReadableSize(size)
+               << " in GPU " << place.device << ", available "
+               << string::HumanReadableSize(avail) << ", total " << total
+               << ", GpuMinChunkSize "
+               << string::HumanReadableSize(buddy_allocator->GetMinChunkSize())
+               << ", GpuMaxChunkSize "
+               << string::HumanReadableSize(buddy_allocator->GetMaxChunkSize())
+               << ", GPU memory used: "
+               << string::HumanReadableSize(Used(place));
     platform::SetDeviceId(cur_dev);
   } else {
-    if (FLAGS_benchmark) allocation::GPUMemMonitor.Add(place.device, size);
+    if (VLOG_IS_ON(3)) {
+      allocation::GPUMemMonitor.Add(place.device, size);
+    }
     if (FLAGS_init_allocated_mem) {
       cudaMemset(ptr, 0xEF, size);
     }
@@ -218,7 +216,9 @@ void Free(const platform::CUDAPlace &place, void *p,
           size_t size) {
 #ifdef PADDLE_WITH_CUDA
   GetGPUBuddyAllocator(place.device)->Free(p);
-  if (FLAGS_benchmark) allocation::GPUMemMonitor.Minus(place.device, size);
+  if (VLOG_IS_ON(3)) {
+    allocation::GPUMemMonitor.Minus(place.device, size);
+  }
 #else
   PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
 #endif
diff --git a/paddle/fluid/operators/detection/box_coder_op.cc b/paddle/fluid/operators/detection/box_coder_op.cc
index fdcff62e1fe59b3a2f4925bdff98632f71220abb..0a51d50e06176e713922837861f2102c9ee8a899 100644
--- a/paddle/fluid/operators/detection/box_coder_op.cc
+++ b/paddle/fluid/operators/detection/box_coder_op.cc
@@ -38,20 +38,12 @@ class BoxCoderOp : public framework::OperatorWithKernel {
                       "The shape of PriorBox is [N, 4]");
     if (ctx->HasInput("PriorBoxVar")) {
       auto prior_box_var_dims = ctx->GetInputDim("PriorBoxVar");
-      PADDLE_ENFORCE(
-          prior_box_var_dims.size() == 1 || prior_box_var_dims.size() == 2,
-          "Input(PriorBoxVar) of BoxCoderOp should be 1 or 2.");
-      if (prior_box_var_dims.size() == 1) {
-        PADDLE_ENFORCE_EQ(
-            prior_box_var_dims[0], 4,
-            "The 1st dimension of Input(PriorBoxVar) should be 4"
-            "when the rank is 1.");
-      } else {
-        PADDLE_ENFORCE_EQ(
-            prior_box_dims, prior_box_var_dims,
-            "The dimension of Input(PriorBoxVar) should be equal to"
-            "the dimension of Input(PriorBox when the rank is 2.)");
-      }
+      PADDLE_ENFORCE(prior_box_var_dims.size() == 2,
+                     "The rank of Input(PriorBoxVar) of BoxCoderOp should be 2.");
+      PADDLE_ENFORCE_EQ(
+          prior_box_dims, prior_box_var_dims,
+          "The dimension of Input(PriorBoxVar) should be equal to "
+          "the dimension of Input(PriorBox) when the rank is 2.");
     }
   }
 
diff --git a/paddle/fluid/operators/detection/box_coder_op.cu b/paddle/fluid/operators/detection/box_coder_op.cu
index e078af3eb478a8bebc6a7fc6460d169d803a3c4b..19a5bb90fa828899ad6270c051090dd3662aeed8 100644
--- a/paddle/fluid/operators/detection/box_coder_op.cu
+++ b/paddle/fluid/operators/detection/box_coder_op.cu
@@ -56,10 +56,7 @@ __global__ void EncodeCenterSizeKernel(
     output[idx * len + 2] = log(fabs(target_box_width / prior_box_width));
     output[idx * len + 3] = log(fabs(target_box_height / prior_box_height));
     if (prior_box_var_data) {
-      int prior_var_offset = 0;
-      if (prior_box_var_size == 2) {
-        prior_var_offset = col_idx * len;
-      }
+      int prior_var_offset = col_idx * len;
       output[idx * len] /= prior_box_var_data[prior_var_offset];
       output[idx * len + 1] /= prior_box_var_data[prior_var_offset + 1];
       output[idx * len + 2] /= prior_box_var_data[prior_var_offset + 2];
@@ -99,10 +96,7 @@ __global__ void DecodeCenterSizeKernel(
     T box_var_x = T(1), box_var_y = T(1);
     T box_var_w = T(1), box_var_h = T(1);
     if (prior_box_var_data) {
-      int prior_var_offset = 0;
-      if (prior_box_var_size == 2) {
-        prior_var_offset = axis == 0 ? col_idx * len : row_idx * len;
-      }
+      int prior_var_offset = axis == 0 ? col_idx * len : row_idx * len;
       box_var_x = prior_box_var_data[prior_var_offset];
       box_var_y = prior_box_var_data[prior_var_offset + 1];
       box_var_w = prior_box_var_data[prior_var_offset + 2];
diff --git a/paddle/fluid/operators/detection/box_coder_op.h b/paddle/fluid/operators/detection/box_coder_op.h
index a0b1faf7bdc7001eba2d92b4d03fbaf9feb7bcbb..6d406f8196f9964c85bb94541fa7a7a23857539b 100644
--- a/paddle/fluid/operators/detection/box_coder_op.h
+++ b/paddle/fluid/operators/detection/box_coder_op.h
@@ -79,10 +79,7 @@ class BoxCoderKernel : public framework::OpKernel<T> {
         output[offset + 3] =
             std::log(std::fabs(target_box_height / prior_box_height));
         if (prior_box_var) {
-          int prior_var_offset = 0;
-          if (prior_box_var->dims().size() == 2) {
-            prior_var_offset = j * len;
-          }
+          int prior_var_offset = j * len;
           output[offset] /= prior_box_var_data[prior_var_offset];
           output[offset + 1] /= prior_box_var_data[prior_var_offset + 1];
           output[offset + 2] /= prior_box_var_data[prior_var_offset + 2];
@@ -95,11 +92,12 @@ class BoxCoderKernel : public framework::OpKernel<T> {
       }
     }
   }
+  template <int axis, int var_size>
   void DecodeCenterSize(const framework::Tensor* target_box,
                         const framework::Tensor* prior_box,
                         const framework::Tensor* prior_box_var,
-                        const bool normalized, const int axis,
-                        const std::vector<float> variance, T* output) const {
+                        const bool normalized, std::vector<float> variance,
+                        T* output) const {
     int64_t row = target_box->dims()[0];
     int64_t col = target_box->dims()[1];
     int64_t len = target_box->dims()[2];
@@ -107,19 +105,17 @@ class BoxCoderKernel : public framework::OpKernel<T> {
     auto* target_box_data = target_box->data<T>();
     auto* prior_box_data = prior_box->data<T>();
     const T* prior_box_var_data = nullptr;
-    if (prior_box_var) prior_box_var_data = prior_box_var->data<T>();
+    if (var_size == 2) prior_box_var_data = prior_box_var->data<T>();
     int prior_box_offset = 0;
+    T var_data[4] = {1., 1., 1., 1.};
+    T* var_ptr = var_data;
 #ifdef PADDLE_WITH_MKLML
 #pragma omp parallel for collapse(2)
 #endif
     for (int64_t i = 0; i < row; ++i) {
       for (int64_t j = 0; j < col; ++j) {
         size_t offset = i * col * len + j * len;
-        if (axis == 0) {
-          prior_box_offset = j * len;
-        } else if (axis == 1) {
-          prior_box_offset = i * len;
-        }
+        prior_box_offset = axis == 0 ? j * len : i * len;
         T prior_box_width = prior_box_data[prior_box_offset + 2] -
                             prior_box_data[prior_box_offset] +
                             (normalized == false);
@@ -133,26 +129,18 @@ class BoxCoderKernel : public framework::OpKernel<T> {
 
         T target_box_center_x = 0, target_box_center_y = 0;
         T target_box_width = 0, target_box_height = 0;
-        T box_var_x = T(1), box_var_y = T(1);
-        T box_var_w = T(1), box_var_h = T(1);
-        if (prior_box_var) {
-          int prior_var_offset = 0;
-          if (prior_box_var->dims().size() == 2) {
-            if (axis == 0)
-              prior_var_offset = j * len;
-            else if (axis == 1)
-              prior_var_offset = i * len;
-          }
-          box_var_x = prior_box_var_data[prior_var_offset];
-          box_var_y = prior_box_var_data[prior_var_offset + 1];
-          box_var_w = prior_box_var_data[prior_var_offset + 2];
-          box_var_h = prior_box_var_data[prior_var_offset + 3];
-        } else if (!(variance.empty())) {
-          box_var_x = static_cast<T>(variance[0]);
-          box_var_y = static_cast<T>(variance[1]);
-          box_var_w = static_cast<T>(variance[2]);
-          box_var_h = static_cast<T>(variance[3]);
+        int prior_var_offset = axis == 0 ? j * len : i * len;
+        if (var_size == 2) {
+          std::memcpy(var_ptr, prior_box_var_data + prior_var_offset,
+                      4 * sizeof(T));
+        } else if (var_size == 1) {
+          var_ptr = reinterpret_cast<T*>(variance.data());
         }
+        T box_var_x = *var_ptr;
+        T box_var_y = *(var_ptr + 1);
+        T box_var_w = *(var_ptr + 2);
+        T box_var_h = *(var_ptr + 3);
+
         target_box_center_x =
             box_var_x * target_box_data[offset] * prior_box_width +
             prior_box_center_x;
@@ -211,8 +199,31 @@ class BoxCoderKernel : public framework::OpKernel<T> {
       EncodeCenterSize(target_box, prior_box, prior_box_var, normalized,
                        variance, output);
     } else if (code_type == BoxCodeType::kDecodeCenterSize) {
-      DecodeCenterSize(target_box, prior_box, prior_box_var, normalized, axis,
-                       variance, output);
+      if (prior_box_var) {
+        if (axis == 0) {
+          DecodeCenterSize<0, 2>(target_box, prior_box, prior_box_var,
+                                 normalized, variance, output);
+        } else {
+          DecodeCenterSize<1, 2>(target_box, prior_box, prior_box_var,
+                                 normalized, variance, output);
+        }
+      } else if (!(variance.empty())) {
+        if (axis == 0) {
+          DecodeCenterSize<0, 1>(target_box, prior_box, prior_box_var,
+                                 normalized, variance, output);
+        } else {
+          DecodeCenterSize<1, 1>(target_box, prior_box, prior_box_var,
+                                 normalized, variance, output);
+        }
+      } else {
+        if (axis == 0) {
+          DecodeCenterSize<0, 0>(target_box, prior_box, prior_box_var,
+                                 normalized, variance, output);
+        } else {
+          DecodeCenterSize<1, 0>(target_box, prior_box, prior_box_var,
+                                 normalized, variance, output);
+        }
+      }
     }
   }
 };
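The rewritten Compute above converts the runtime axis and variance-kind flags into the template parameters of DecodeCenterSize<axis, var_size>, so each of the six instantiations compiles with those branches folded away from the inner loop. A stripped-down sketch of this runtime-to-compile-time dispatch pattern (names are illustrative, not Paddle's API):

```cpp
#include <iostream>

// The branch on axis is resolved at instantiation time, so each generated
// Decode<> body contains a single offset computation, with no per-element test.
template <int axis, int var_size>
void Decode(int i, int j, int len) {
  int offset = axis == 0 ? j * len : i * len;  // constant-folded per instance
  std::cout << "axis=" << axis << " var_size=" << var_size
            << " offset=" << offset << "\n";
}

// Runtime flags are picked apart once, outside the hot loop.
void Dispatch(bool has_var_tensor, bool has_var_list, int axis) {
  if (has_var_tensor) {
    if (axis == 0) Decode<0, 2>(1, 2, 4); else Decode<1, 2>(1, 2, 4);
  } else if (has_var_list) {
    if (axis == 0) Decode<0, 1>(1, 2, 4); else Decode<1, 1>(1, 2, 4);
  } else {
    if (axis == 0) Decode<0, 0>(1, 2, 4); else Decode<1, 0>(1, 2, 4);
  }
}

int main() { Dispatch(true, false, 1); }
```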
diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt
index e20524012a5839fd250b7426a5efc42b7e87fe87..4b6eef18d8b967af5f3a5df0dee750620e7e412a 100644
--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
@@ -37,7 +37,7 @@ math_library(concat_and_split)
 math_library(context_project DEPS im2col math_function)
 math_library(cross_entropy)
 math_library(cos_sim_functor)
-math_library(depthwise_conv)
+math_library(depthwise_conv DEPS cub)
 math_library(im2col)
 math_library(sampler)
 
diff --git a/paddle/fluid/operators/ngraph/ngraph_bridge.cc b/paddle/fluid/operators/ngraph/ngraph_bridge.cc
index 9f92bc01befb496c103bcd367ae9cfc5c8f402b0..38e65524e870834710ff29f722c69eadf67d9dbe 100644
--- a/paddle/fluid/operators/ngraph/ngraph_bridge.cc
+++ b/paddle/fluid/operators/ngraph/ngraph_bridge.cc
@@ -31,6 +31,7 @@ std::map<std::string,
          std::function<void(const std::shared_ptr<framework::OperatorBase>&,
                             std::shared_ptr<std::unordered_map<
                                 std::string, std::shared_ptr<ngraph::Node>>>)>>
     NgraphBridge::NG_NODE_MAP = {
+        {"accuracy", NG_OPS::BuildAccuracyNode},
         {"conv2d", NG_OPS::BuildConv2dNode},
         {"conv2d_grad", NG_OPS::BuildConv2dGradNode},
         {"elementwise_add", NG_OPS::BuildElementwiseAddNode},
diff --git a/paddle/fluid/operators/ngraph/ngraph_ops.h b/paddle/fluid/operators/ngraph/ngraph_ops.h
index a827f7cb5b7200aaa32d6b3e32f5941860709cf3..fb574f1bc1160c79f5802f11c00716eccad7f48d 100644
--- a/paddle/fluid/operators/ngraph/ngraph_ops.h
+++ b/paddle/fluid/operators/ngraph/ngraph_ops.h
@@ -21,7 +21,8 @@ limitations under the License. */
 
 #pragma once
 
-#include "ops/binary_unnary_op.h"
+#include "ops/accuracy_op.h"
+#include "ops/binary_unary_op.h"
 #include "ops/conv2d_op.h"
 #include "ops/elementwise_add_op.h"
 #include "ops/fill_constant_op.h"
diff --git a/paddle/fluid/operators/ngraph/ops/accuracy_op.h b/paddle/fluid/operators/ngraph/ops/accuracy_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..bf37ce48d8c2ce3b97fac154be9d1dfb08421f97
--- /dev/null
+++ b/paddle/fluid/operators/ngraph/ops/accuracy_op.h
@@ -0,0 +1,65 @@
+/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include "ngraph/ngraph.hpp"
+#include "paddle/fluid/platform/ngraph_helper.h"
+
+namespace paddle {
+namespace operators {
+namespace ngraphs {
+
+void BuildAccuracyNode(
+    const std::shared_ptr<paddle::framework::OperatorBase>& op,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  auto indices = platform::GetInputNode(op, "Indices", ngb_node_map);
+  auto label = platform::GetInputNode(op, "Label", ngb_node_map);
+  auto inference = platform::GetInputNode(op, "Out", ngb_node_map);
+  auto inference_shape = inference->get_shape();
+  size_t num_samples = inference_shape.at(0);
+  size_t k = inference_shape.at(1);
+
+  std::shared_ptr<ngraph::Node> label_k = label;
+  if (k > 1) {
+    auto label_1d = std::make_shared<ngraph::op::Reshape>(
+        label, ngraph::AxisVector{0, 1}, ngraph::Shape{num_samples});
+    label_k = std::make_shared<ngraph::op::Broadcast>(
+        label_1d, inference_shape, ngraph::AxisSet{1});
+  }
+
+  auto node_equal = std::make_shared<ngraph::op::Equal>(indices, label_k);
+  auto node_eq_int =
+      std::make_shared<ngraph::op::Convert>(node_equal, ngraph::element::i64);
+  auto num_correct_0d =
+      std::make_shared<ngraph::op::Sum>(node_eq_int, ngraph::AxisSet{0, 1});
+  std::shared_ptr<ngraph::Node> num_correct =
+      platform::NgReshaper(num_correct_0d, ngraph::Shape{1});
+  std::shared_ptr<ngraph::Node> n_samples = ngraph::op::Constant::create(
+      ngraph::element::i64, ngraph::Shape{1}, {num_samples});
+  std::shared_ptr<ngraph::Node> accuracy = std::make_shared<ngraph::op::Divide>(
+      std::make_shared<ngraph::op::Convert>(num_correct, ngraph::element::f32),
+      std::make_shared<ngraph::op::Convert>(n_samples, ngraph::element::f32));
+
+  platform::SetOutputNode(op, "Accuracy", accuracy, ngb_node_map);
+  platform::SetOutputNode(op, "Correct", num_correct, ngb_node_map);
+  platform::SetOutputNode(op, "Total", n_samples, ngb_node_map);
+}
+}  // namespace ngraphs
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/ngraph/ops/binary_unnary_op.h b/paddle/fluid/operators/ngraph/ops/binary_unary_op.h
similarity index 100%
rename from paddle/fluid/operators/ngraph/ops/binary_unnary_op.h
rename to paddle/fluid/operators/ngraph/ops/binary_unary_op.h
diff --git a/paddle/fluid/operators/ngraph/ops/top_k_op.h b/paddle/fluid/operators/ngraph/ops/top_k_op.h
index ea66953a125860ab1ce8309819b6c433ff32eaaa..852ecd7139a3c7046e78265ca021b2ce286c63c0 100644
--- a/paddle/fluid/operators/ngraph/ops/top_k_op.h
+++ b/paddle/fluid/operators/ngraph/ops/top_k_op.h
@@ -36,11 +36,6 @@ void BuildTopKNode(
       std::make_shared<ngraph::op::GetOutputElement>(top_k, 0);
   std::shared_ptr<ngraph::Node> out =
       std::make_shared<ngraph::op::GetOutputElement>(top_k, 1);
-  auto dummy_out = paddle::platform::GetOutputNode(op, "Out", ngb_node_map);
-  if (dummy_out && dummy_out->get_element_type() != out->get_element_type()) {
-    out = std::make_shared<ngraph::op::Convert>(out,
-                                                dummy_out->get_element_type());
-  }
   paddle::platform::SetOutputNode(op, "Indices", indices, ngb_node_map);
   paddle::platform::SetOutputNode(op, "Out", out, ngb_node_map);
 }
diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc
index 5399ae556e7f38a551d680704d8d825e2fdba88a..fc3636e0b24765f681d3260b07fe854309774a40 100644
--- a/paddle/fluid/operators/pool_op.cc
+++ b/paddle/fluid/operators/pool_op.cc
@@ -259,7 +259,7 @@ Example:
        W_{out} = \\frac{(W_{in} - ksize[1] + 2 * paddings[1] + strides[1] - 1)}{strides[1]} + 1
   $$
 
-  For exclusive = true:
+  For exclusive = false:
   $$
   hstart = i * strides[0] - paddings[0]
   hend = hstart + ksize[0]
@@ -267,7 +267,7 @@ Example:
   wend = wstart + ksize[1]
   Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{ksize[0] * ksize[1]}
   $$
-  For exclusive = false:
+  For exclusive = true:
   $$
   hstart = max(0, i * strides[0] - paddings[0])
   hend = min(H, hstart + ksize[0])
@@ -403,7 +403,7 @@ Example:
        H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1] + strides[1] -1)}{strides[1]} + 1 \\
        W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2] + strides[2] -1)}{strides[2]} + 1
   $$
-  For exclusive = true:
+  For exclusive = false:
   $$
   dstart = i * strides[0] - paddings[0]
   dend = dstart + ksize[0]
@@ -413,7 +413,7 @@ Example:
   wend = wstart + ksize[2]
   Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{ksize[0] * ksize[1] * ksize[2]}
   $$
-  For exclusive = false:
+  For exclusive = true:
   $$
   dstart = max(0, i * strides[0] - paddings[0])
   dend = min(D, dstart + ksize[0])
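The pool_op.cc documentation fix above swaps two mislabeled formulas: with exclusive = false the divisor is the full ksize window (padding counted), while exclusive = true divides only by the part of the window that overlaps real input. A worked 1-D example at a padded border, assuming average pooling:

```cpp
#include <algorithm>
#include <iostream>
#include <vector>

int main() {
  std::vector<double> in = {1, 2, 3, 4, 5};       // W_in = 5
  int ksize = 3, stride = 1, padding = 1, i = 0;  // first output element

  int wstart = i * stride - padding;  // -1: the window covers one padded cell
  int wend = wstart + ksize;          // 2

  // Only real elements contribute to the sum either way.
  int cstart = std::max(wstart, 0);
  int cend = std::min(wend, static_cast<int>(in.size()));
  double sum = 0;
  for (int w = cstart; w < cend; ++w) sum += in[w];

  std::cout << "exclusive=false: " << sum / ksize << "\n";            // 3/3 = 1
  std::cout << "exclusive=true:  " << sum / (cend - cstart) << "\n";  // 3/2 = 1.5
}
```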
diff --git a/paddle/fluid/operators/reader/ctr_reader.cc b/paddle/fluid/operators/reader/ctr_reader.cc
index f08798794a2f9fc042800583cbc032d6f12bf3dc..43a49de52242b96aade91013e89228fcb3247302 100644
--- a/paddle/fluid/operators/reader/ctr_reader.cc
+++ b/paddle/fluid/operators/reader/ctr_reader.cc
@@ -213,7 +213,7 @@ void ReadSvmData(const DataDesc& data_desc, std::shared_ptr<Reader> reader,
     framework::LoD lod{lod_data};
     lod_tensor.set_lod(lod);
     int64_t* tensor_data = lod_tensor.mutable_data<int64_t>(
-        framework::make_ddim({1, static_cast<int64_t>(batch_feasign.size())}),
+        framework::make_ddim({static_cast<int64_t>(batch_feasign.size()), 1}),
         platform::CPUPlace());
     memcpy(tensor_data, batch_feasign.data(),
            batch_feasign.size() * sizeof(int64_t));
@@ -223,7 +223,7 @@ void ReadSvmData(const DataDesc& data_desc, std::shared_ptr<Reader> reader,
     // insert label tensor
     framework::LoDTensor label_tensor;
     auto* label_tensor_data = label_tensor.mutable_data<int64_t>(
-        framework::make_ddim({1, static_cast<int64_t>(batch_label.size())}),
+        framework::make_ddim({static_cast<int64_t>(batch_label.size()), 1}),
         platform::CPUPlace());
     memcpy(label_tensor_data, batch_label.data(),
            batch_label.size() * sizeof(int64_t));
diff --git a/paddle/fluid/operators/reader/ctr_reader_test.cc b/paddle/fluid/operators/reader/ctr_reader_test.cc
index 9f3a254c84d4e04fbcd449644a7e138eff520fbc..6410439816d8ae4a9d1df507819071ce76b5308e 100644
--- a/paddle/fluid/operators/reader/ctr_reader_test.cc
+++ b/paddle/fluid/operators/reader/ctr_reader_test.cc
@@ -123,7 +123,7 @@ TEST(CTR_READER, read_data) {
   std::vector<std::tuple<LoD, std::vector<int64_t>>> data_slot_6003{b1, b2, b3,
                                                                     b4};
 
-  std::vector<DDim> label_dims = {{1, 3}, {1, 3}, {1, 3}, {1, 1}};
+  std::vector<DDim> label_dims = {{3, 1}, {3, 1}, {3, 1}, {1, 1}};
 
   LoDTensorBlockingQueueHolder queue_holder;
   int capacity = 64;
diff --git a/paddle/fluid/operators/reduce_ops/CMakeLists.txt b/paddle/fluid/operators/reduce_ops/CMakeLists.txt
index 5fe4d15ae2c6254a50318813c852b6c314880aba..ebd07d90ebe6b0ba008ac89c01c4f054f96a6da9 100644
--- a/paddle/fluid/operators/reduce_ops/CMakeLists.txt
+++ b/paddle/fluid/operators/reduce_ops/CMakeLists.txt
@@ -1,5 +1,9 @@
 include(operators)
-register_operators()
+if(WITH_GPU)
+  register_operators(DEPS cub)
+else()
+  register_operators()
+endif()
 
 if(WITH_GPU)
   file(GLOB OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.part.cu")
diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
index 1f51b5bab3068cc89bffa85de28a9438359659f3..fbb2ac3fe8c5de9b0be593df225677c6a7a89e9c 100644
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -1,4 +1,4 @@
-proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto)
+proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto simple_threadpool)
 py_proto_compile(profiler_py_proto SRCS profiler.proto)
 add_custom_target(profiler_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
 
@@ -36,7 +36,7 @@ cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info)
 
 nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce)
 
-cc_library(place SRCS place.cc DEPS enforce boost)
+cc_library(place SRCS place.cc DEPS enforce boost lib_any)
 cc_test(place_test SRCS place_test.cc DEPS place glog gflags)
 
 add_subdirectory(dynload)
diff --git a/paddle/fluid/platform/ngraph_helper.h b/paddle/fluid/platform/ngraph_helper.h
index c5b65d6636945b85603c07aeae0290ef9cadb396..b84315995a9d8a65668f57eef67f6dab8c20f9b3 100644
--- a/paddle/fluid/platform/ngraph_helper.h
+++ b/paddle/fluid/platform/ngraph_helper.h
@@ -43,13 +43,14 @@ std::shared_ptr<ngraph::Node> NgReshaper(std::shared_ptr<ngraph::Node> input,
 
 std::shared_ptr<ngraph::Node> GetNode(
     const std::shared_ptr<paddle::framework::OperatorBase>& op,
-    const std::string prm, const paddle::framework::VariableNameMap& var_map,
+    const std::string name, const paddle::framework::VariableNameMap& var_map,
     std::shared_ptr<
        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
         ngb_node_map) {
-  auto& var_names = var_map.at(prm);
+  auto& var_names = var_map.at(name);
   PADDLE_ENFORCE_EQ(var_names.size(), 1,
-                    "op %s prm %s expects one associated var", op->Type(), prm);
+                    "op %s name %s expects one associated var", op->Type(),
+                    name);
   if (ngb_node_map->find(var_names[0]) != ngb_node_map->end()) {
     return (*ngb_node_map)[var_names[0]];
   } else {
@@ -59,43 +60,53 @@ std::shared_ptr<ngraph::Node> GetNode(
 
 std::shared_ptr<ngraph::Node> GetInputNode(
     const std::shared_ptr<paddle::framework::OperatorBase>& op,
-    const std::string prm,
+    const std::string name,
     std::shared_ptr<
         std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
         ngb_node_map) {
-  return GetNode(op, prm, op->Inputs(), ngb_node_map);
+  return GetNode(op, name, op->Inputs(), ngb_node_map);
 }
 
 std::shared_ptr<ngraph::Node> GetOutputNode(
     const std::shared_ptr<paddle::framework::OperatorBase>& op,
-    const std::string prm,
+    const std::string name,
     std::shared_ptr<
         std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
         ngb_node_map) {
-  return GetNode(op, prm, op->Outputs(), ngb_node_map);
+  return GetNode(op, name, op->Outputs(), ngb_node_map);
 }
 
 void SetOutputNode(
     const std::shared_ptr<paddle::framework::OperatorBase>& op,
-    const std::string prm, std::shared_ptr<ngraph::Node> node,
+    const std::string name, std::shared_ptr<ngraph::Node> node,
     std::shared_ptr<
         std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
         ngb_node_map) {
-  auto& var_names = op->Outputs().at(prm);
+  auto& var_names = op->Outputs().at(name);
   if (var_names.size() == 1) {
+    // Match the shape and element type of a pre-registered output var, if any.
+    auto dummy_out = GetOutputNode(op, name, ngb_node_map);
+    if (dummy_out && dummy_out->get_shape() != node->get_shape()) {
+      node = NgReshaper(node, dummy_out->get_shape());
+    }
+    if (dummy_out &&
+        dummy_out->get_element_type() != node->get_element_type()) {
+      node = std::make_shared<ngraph::op::Convert>(
+          node, dummy_out->get_element_type());
+    }
     (*ngb_node_map)[var_names[0]] = node;
   } else if (var_names.size() == 0) {
     (*ngb_node_map)[""] = node;
   } else {
-    PADDLE_THROW("prm %s has more than 1 var_names.", prm);
+    PADDLE_THROW("name %s has more than 1 var_names.", name);
  }
 }
 
 bool HasOutput(const std::shared_ptr<paddle::framework::OperatorBase>& op,
-               const std::string prm) {
+               const std::string name) {
   auto& outputs = op->Outputs();
-  if (outputs.find(prm) == outputs.end()) return false;
-  return outputs.at(prm).size() > 0;
+  if (outputs.find(name) == outputs.end()) return false;
+  return outputs.at(name).size() > 0;
 }
 
 inline void GetMidDims(const ngraph::Shape& x_shape,
diff --git a/paddle/fluid/platform/place.cc b/paddle/fluid/platform/place.cc
index 60b2d83f15746eab0a4d29c7965c064690b6d46d..655ce8485d4584aa0955315b045da6bf541f7fe2 100644
--- a/paddle/fluid/platform/place.cc
+++ b/paddle/fluid/platform/place.cc
@@ -14,12 +14,6 @@ limitations under the License. */
 
 #include "paddle/fluid/platform/place.h"
 
-DEFINE_bool(benchmark, false,
-            "Doing memory benchmark. It will make deleting scope synchronized, "
-            "and add some memory usage logs."
-            "Default cuda is asynchronous device, set to True will"
-            "force op run in synchronous mode.");
-
 namespace paddle {
 namespace platform {
 
diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt
index 803ea6b26087884ad79c6bf80238953a012eaddc..4ac5b83c56b114f4e3e4c78710716adc636ebe1d 100644
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -26,5 +26,5 @@ if(WITH_PYTHON)
   get_property (os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
   target_link_libraries(paddle_pybind ${os_dependency_modules})
 
-  cc_test(tensor_py_test SRCS tensor_py_test.cc DEPS python)
+  cc_test(tensor_py_test SRCS tensor_py_test.cc DEPS python pybind)
 endif(WITH_PYTHON)
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 59e695e6fcb66cbaed1bcc9e861df81b5f73c1ed..90b8fd1a0aab159eb1a829d67485c845182d295b 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -54,7 +54,7 @@ ELSE(WIN32)
       DEPENDS copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER})
 ENDIF()
 
-set(paddle_python_deps ${PADDLE_PYTHON_BUILD_DIR}/.timestamp ${MKL_DEPENDS})
+set(paddle_python_deps ${PADDLE_PYTHON_BUILD_DIR}/.timestamp ${MKL_DEPENDS} ${external_project_dependencies})
 add_custom_target(paddle_python ALL DEPENDS ${paddle_python_deps})
 
 set(PADDLE_PYTHON_PACKAGE_DIR ${CMAKE_CURRENT_BINARY_DIR}/dist/)
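The earlier scope.cc/place.cc hunks relocate the definition of the benchmark flag. gflags allows exactly one DEFINE per flag across the whole binary, with every other translation unit using DECLARE, so moving the definition (and dropping the now-unused DECLARE_bool in legacy_allocator.cc) keeps the link clean. A minimal sketch of the convention, assuming the stock gflags API:

```cpp
#include <gflags/gflags.h>
#include <iostream>

// The single owning definition (here standing in for scope.cc).
DEFINE_bool(benchmark, false, "Enable memory benchmarking.");

// Any other .cc file that needs the flag would instead write:
//   DECLARE_bool(benchmark);   // extern declaration, no storage
// and then read FLAGS_benchmark.

int main(int argc, char* argv[]) {
  gflags::ParseCommandLineFlags(&argc, &argv, /*remove_flags=*/true);
  std::cout << "benchmark=" << std::boolalpha << FLAGS_benchmark << "\n";
  return 0;
}
```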
diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py
index c983e2a44b25c5943df5e822e2e363b2557a6ac3..3b43ae0b9cb63a9f4708a680cb1021d74c197550 100644
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -397,10 +397,10 @@ def box_coder(prior_box,
                               input is image feature map, they are close to
                               the origin of the coordinate system. [xmax, ymax]
                               is the right bottom coordinate of the anchor box.
-        prior_box_var(Variable|list): prior_box_var supports two types of input.
-                              One is variable with shape [M, 4] holds M group.
-                              The other one is list consist of 4 elements
-                              shared by all boxes.
+        prior_box_var(Variable|list|None): prior_box_var supports two types
+                              of input. One is a variable with shape [M, 4]
+                              that holds M groups of variances. The other is
+                              a list of 4 elements shared by all boxes.
         target_box(Variable): This input can be a 2-D LoDTensor with shape
                               [N, 4] when code_type is 'encode_center_size'.
                               This input also can be a 3-D Tensor with shape
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_accuracy_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_accuracy_ngraph_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..13a33e20478372af370d38ab2b475e4425dc8d6e
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ngraph/test_accuracy_ngraph_op.py
@@ -0,0 +1,30 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import paddle.fluid.core as core
+from paddle.fluid.tests.unittests.op_test import OpTest
+from paddle.fluid.tests.unittests.test_accuracy_op import TestAccuracyOp
+
+
+class TestNGRAPHAccuracyOp(TestAccuracyOp):
+    def setUp(self):
+        super(TestNGRAPHAccuracyOp, self).setUp()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_box_coder_op.py b/python/paddle/fluid/tests/unittests/test_box_coder_op.py
index 6156268bf25ada310a3d22242ecff4b9cdf1759a..220bffebe83925c60af65aa9594ddd8a29c38145 100644
--- a/python/paddle/fluid/tests/unittests/test_box_coder_op.py
+++ b/python/paddle/fluid/tests/unittests/test_box_coder_op.py
@@ -34,7 +34,9 @@ def box_decoder(t_box, p_box, pb_v, output_box, norm, axis=0):
     pb_y = pb_y.reshape(shape)
 
     if pb_v.ndim == 2:
-        pb_v = pb_v.reshape(1, pb_v.shape[0], pb_v.shape[1])
+        var_shape = (1, pb_v.shape[0], pb_v.shape[1]) if axis == 0 else (
+            pb_v.shape[0], 1, pb_v.shape[1])
+        pb_v = pb_v.reshape(var_shape)
     if pb_v.ndim == 1:
         tb_x = pb_v[0] * t_box[:, :, 0] * pb_w + pb_x
         tb_y = pb_v[1] * t_box[:, :, 1] * pb_h + pb_y
@@ -125,33 +127,6 @@ class TestBoxCoderOp(OpTest):
         self.outputs = {'OutputBox': output_box}
 
 
-class TestBoxCoderOpWithOneRankVar(OpTest):
-    def test_check_output(self):
-        self.check_output()
-
-    def setUp(self):
-        self.op_type = "box_coder"
-        lod = [[1, 1, 1, 1, 1]]
-        prior_box = np.random.random((81, 4)).astype('float32')
-        prior_box_var = np.random.random((4)).astype('float32')
-        target_box = np.random.random((20, 81, 4)).astype('float32')
-        code_type = "DecodeCenterSize"
-        box_normalized = False
-        output_box = batch_box_coder(prior_box, prior_box_var, target_box,
-                                     lod[0], code_type, box_normalized)
-
-        self.inputs = {
-            'PriorBox': prior_box,
-            'PriorBoxVar': prior_box_var,
-            'TargetBox': target_box,
-        }
-        self.attrs = {
-            'code_type': 'decode_center_size',
-            'box_normalized': False
-        }
-        self.outputs = {'OutputBox': output_box}
-
-
 class TestBoxCoderOpWithoutBoxVar(OpTest):
     def test_check_output(self):
         self.check_output()
@@ -210,7 +185,7 @@ class TestBoxCoderOpWithAxis(OpTest):
         self.op_type = "box_coder"
         lod = [[1, 1, 1, 1, 1]]
         prior_box = np.random.random((30, 4)).astype('float32')
-        prior_box_var = np.random.random((4)).astype('float32')
+        prior_box_var = np.random.random((30, 4)).astype('float32')
         target_box = np.random.random((30, 81, 4)).astype('float32')
         code_type = "DecodeCenterSize"
         box_normalized = False