提交 36e7800a 编写于 作者: X xzl

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into mobilenet_gpu

...@@ -21,3 +21,10 @@ ...@@ -21,3 +21,10 @@
sha: 28c0ea8a67a3e2dbbf4822ef44e85b63a0080a29 sha: 28c0ea8a67a3e2dbbf4822ef44e85b63a0080a29
hooks: hooks:
- id: clang-formater - id: clang-formater
- repo: https://github.com/dnephin/pre-commit-golang
sha: e4693a4c282b4fc878eda172a929f7a6508e7d16
hooks:
- id: go-fmt
files: (.*\.go)
- id: go-lint
files: (.*\.go)
...@@ -37,6 +37,7 @@ before_install: ...@@ -37,6 +37,7 @@ before_install:
# protobuf version. # protobuf version.
- pip install numpy wheel 'protobuf==3.1' sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit requests==2.9.2 LinkChecker - pip install numpy wheel 'protobuf==3.1' sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit requests==2.9.2 LinkChecker
- pip install rarfile - pip install rarfile
- curl https://glide.sh/get | bash
- eval "$(GIMME_GO_VERSION=1.8.3 gimme)" - eval "$(GIMME_GO_VERSION=1.8.3 gimme)"
- | - |
function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; } function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; }
......
...@@ -38,12 +38,14 @@ ExternalProject_Add( ...@@ -38,12 +38,14 @@ ExternalProject_Add(
CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR} CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR}
CMAKE_ARGS -DCMAKE_INSTALL_LIBDIR=${GLOG_INSTALL_DIR}/lib
CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON
CMAKE_ARGS -DWITH_GFLAGS=ON CMAKE_ARGS -DWITH_GFLAGS=ON
CMAKE_ARGS -Dgflags_DIR=${GFLAGS_INSTALL_DIR}/lib/cmake/gflags CMAKE_ARGS -Dgflags_DIR=${GFLAGS_INSTALL_DIR}/lib/cmake/gflags
CMAKE_ARGS -DBUILD_TESTING=OFF CMAKE_ARGS -DBUILD_TESTING=OFF
CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GLOG_INSTALL_DIR} CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GLOG_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR:PATH=${GLOG_INSTALL_DIR}/lib
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_BUILD_TYPE:STRING=Release -DCMAKE_BUILD_TYPE:STRING=Release
) )
......
...@@ -17,6 +17,65 @@ INCLUDE(ExternalProject) ...@@ -17,6 +17,65 @@ INCLUDE(ExternalProject)
FIND_PACKAGE(Protobuf QUIET) FIND_PACKAGE(Protobuf QUIET)
SET(PROTOBUF_FOUND "OFF") SET(PROTOBUF_FOUND "OFF")
if(NOT COMMAND protobuf_generate_python) # before cmake 3.4, protobuf_genrerate_python is not defined.
function(protobuf_generate_python SRCS)
# shameless copy from https://github.com/Kitware/CMake/blob/master/Modules/FindProtobuf.cmake
if(NOT ARGN)
message(SEND_ERROR "Error: PROTOBUF_GENERATE_PYTHON() called without any proto files")
return()
endif()
if(PROTOBUF_GENERATE_CPP_APPEND_PATH)
# Create an include path for each file specified
foreach(FIL ${ARGN})
get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
get_filename_component(ABS_PATH ${ABS_FIL} PATH)
list(FIND _protobuf_include_path ${ABS_PATH} _contains_already)
if(${_contains_already} EQUAL -1)
list(APPEND _protobuf_include_path -I ${ABS_PATH})
endif()
endforeach()
else()
set(_protobuf_include_path -I ${CMAKE_CURRENT_SOURCE_DIR})
endif()
if(DEFINED PROTOBUF_IMPORT_DIRS AND NOT DEFINED Protobuf_IMPORT_DIRS)
set(Protobuf_IMPORT_DIRS "${PROTOBUF_IMPORT_DIRS}")
endif()
if(DEFINED Protobuf_IMPORT_DIRS)
foreach(DIR ${Protobuf_IMPORT_DIRS})
get_filename_component(ABS_PATH ${DIR} ABSOLUTE)
list(FIND _protobuf_include_path ${ABS_PATH} _contains_already)
if(${_contains_already} EQUAL -1)
list(APPEND _protobuf_include_path -I ${ABS_PATH})
endif()
endforeach()
endif()
set(${SRCS})
foreach(FIL ${ARGN})
get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
get_filename_component(FIL_WE ${FIL} NAME_WE)
if(NOT PROTOBUF_GENERATE_CPP_APPEND_PATH)
get_filename_component(FIL_DIR ${FIL} DIRECTORY)
if(FIL_DIR)
set(FIL_WE "${FIL_DIR}/${FIL_WE}")
endif()
endif()
list(APPEND ${SRCS} "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py")
add_custom_command(
OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py"
COMMAND ${Protobuf_PROTOC_EXECUTABLE} --python_out ${CMAKE_CURRENT_BINARY_DIR} ${_protobuf_include_path} ${ABS_FIL}
DEPENDS ${ABS_FIL} ${Protobuf_PROTOC_EXECUTABLE}
COMMENT "Running Python protocol buffer compiler on ${FIL}"
VERBATIM )
endforeach()
set(${SRCS} ${${SRCS}} PARENT_SCOPE)
endfunction()
endif()
# Print and set the protobuf library information, # Print and set the protobuf library information,
# finish this cmake process and exit from this file. # finish this cmake process and exit from this file.
......
...@@ -99,15 +99,33 @@ function(merge_static_libs TARGET_NAME) ...@@ -99,15 +99,33 @@ function(merge_static_libs TARGET_NAME)
set(libs ${ARGN}) set(libs ${ARGN})
list(REMOVE_DUPLICATES libs) list(REMOVE_DUPLICATES libs)
# First get the file names of the libraries to be merged # Get all propagation dependencies from the merged libraries
foreach(lib ${libs}) foreach(lib ${libs})
set(libfiles ${libfiles} $<TARGET_FILE:${lib}>) list(APPEND libs_deps ${${lib}_LIB_DEPENDS})
endforeach() endforeach()
if(APPLE) # Use OSX's libtool to merge archives if(APPLE) # Use OSX's libtool to merge archives
# To produce a library we need at least one source file.
# It is created by add_custom_command below and will helps
# also help to track dependencies.
set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c) set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c)
# Make the generated dummy source file depended on all static input
# libs. If input lib changes,the source file is touched
# which causes the desired effect (relink).
add_custom_command(OUTPUT ${dummyfile}
COMMAND ${CMAKE_COMMAND} -E touch ${dummyfile}
DEPENDS ${libs})
# Generate dummy staic lib
file(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";") file(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";")
add_library(${TARGET_NAME} STATIC ${dummyfile}) add_library(${TARGET_NAME} STATIC ${dummyfile})
target_link_libraries(${TARGET_NAME} ${libs_deps})
foreach(lib ${libs})
# Get the file names of the libraries to be merged
set(libfiles ${libfiles} $<TARGET_FILE:${lib}>)
endforeach()
add_custom_command(TARGET ${TARGET_NAME} POST_BUILD add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
COMMAND rm "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" COMMAND rm "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a"
COMMAND /usr/bin/libtool -static -o "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" ${libfiles}) COMMAND /usr/bin/libtool -static -o "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" ${libfiles})
...@@ -117,7 +135,8 @@ function(merge_static_libs TARGET_NAME) ...@@ -117,7 +135,8 @@ function(merge_static_libs TARGET_NAME)
set(objdir ${lib}.objdir) set(objdir ${lib}.objdir)
add_custom_command(OUTPUT ${objdir} add_custom_command(OUTPUT ${objdir}
COMMAND ${CMAKE_COMMAND} -E make_directory ${objdir}) COMMAND ${CMAKE_COMMAND} -E make_directory ${objdir}
DEPENDS ${lib})
add_custom_command(OUTPUT ${objlistfile} add_custom_command(OUTPUT ${objlistfile}
COMMAND ${CMAKE_AR} -x "$<TARGET_FILE:${lib}>" COMMAND ${CMAKE_AR} -x "$<TARGET_FILE:${lib}>"
...@@ -134,18 +153,18 @@ function(merge_static_libs TARGET_NAME) ...@@ -134,18 +153,18 @@ function(merge_static_libs TARGET_NAME)
list(APPEND mergebases "${mergebase}") list(APPEND mergebases "${mergebase}")
endforeach() endforeach()
# We need a target for the output merged library
add_library(${TARGET_NAME} STATIC ${mergebases}) add_library(${TARGET_NAME} STATIC ${mergebases})
target_link_libraries(${TARGET_NAME} ${libs_deps})
# Get the file name of the generated library
set(outlibfile "$<TARGET_FILE:${TARGET_NAME}>") set(outlibfile "$<TARGET_FILE:${TARGET_NAME}>")
foreach(lib ${libs}) foreach(lib ${libs})
add_custom_command(TARGET ${TARGET_NAME} POST_BUILD add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
COMMAND ${CMAKE_AR} ru ${outlibfile} @"../${lib}.objlist" COMMAND ${CMAKE_AR} cr ${outlibfile} *.o
COMMAND ${CMAKE_RANLIB} ${outlibfile}
WORKING_DIRECTORY ${lib}.objdir) WORKING_DIRECTORY ${lib}.objdir)
endforeach() endforeach()
add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
COMMAND ${CMAKE_RANLIB} ${outlibfile})
endif() endif()
endfunction(merge_static_libs) endfunction(merge_static_libs)
...@@ -194,7 +213,7 @@ function(cc_test TARGET_NAME) ...@@ -194,7 +213,7 @@ function(cc_test TARGET_NAME)
add_executable(${TARGET_NAME} ${cc_test_SRCS}) add_executable(${TARGET_NAME} ${cc_test_SRCS})
target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} gtest gtest_main) target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} gtest gtest_main)
add_dependencies(${TARGET_NAME} ${cc_test_DEPS} gtest gtest_main) add_dependencies(${TARGET_NAME} ${cc_test_DEPS} gtest gtest_main)
add_test(${TARGET_NAME} ${TARGET_NAME}) add_test(NAME ${TARGET_NAME} COMMAND ${TARGET_NAME} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
endif() endif()
endfunction(cc_test) endfunction(cc_test)
...@@ -281,10 +300,11 @@ function(go_library TARGET_NAME) ...@@ -281,10 +300,11 @@ function(go_library TARGET_NAME)
file(GLOB GO_SOURCE RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.go") file(GLOB GO_SOURCE RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.go")
string(REPLACE "${PADDLE_GO_PATH}/" "" CMAKE_CURRENT_SOURCE_REL_DIR ${CMAKE_CURRENT_SOURCE_DIR}) string(REPLACE "${PADDLE_GO_PATH}/" "" CMAKE_CURRENT_SOURCE_REL_DIR ${CMAKE_CURRENT_SOURCE_DIR})
# FIXME: link path
add_custom_command(TARGET ${TARGET_NAME} POST_BUILD add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
COMMAND rm "${${TARGET_NAME}_LIB_PATH}" COMMAND rm "${${TARGET_NAME}_LIB_PATH}"
# Golang build source code # Golang build source code
COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build ${BUILD_MODE} COMMAND GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build ${BUILD_MODE}
-o "${${TARGET_NAME}_LIB_PATH}" -o "${${TARGET_NAME}_LIB_PATH}"
"./${CMAKE_CURRENT_SOURCE_REL_DIR}/${GO_SOURCE}" "./${CMAKE_CURRENT_SOURCE_REL_DIR}/${GO_SOURCE}"
# must run under GOPATH # must run under GOPATH
...@@ -299,8 +319,10 @@ function(go_binary TARGET_NAME) ...@@ -299,8 +319,10 @@ function(go_binary TARGET_NAME)
cmake_parse_arguments(go_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cmake_parse_arguments(go_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
string(REPLACE "${PADDLE_GO_PATH}/" "" CMAKE_CURRENT_SOURCE_REL_DIR ${CMAKE_CURRENT_SOURCE_DIR}) string(REPLACE "${PADDLE_GO_PATH}/" "" CMAKE_CURRENT_SOURCE_REL_DIR ${CMAKE_CURRENT_SOURCE_DIR})
# FIXME: link path
add_custom_command(OUTPUT ${TARGET_NAME}_timestamp add_custom_command(OUTPUT ${TARGET_NAME}_timestamp
COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build COMMAND env LIBRARY_PATH=${CMAKE_BINARY_DIR}/go/pserver/client/c/:$ENV{LIBRARY_PATH}
GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build
-o "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}" -o "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}"
"./${CMAKE_CURRENT_SOURCE_REL_DIR}/${go_binary_SRCS}" "./${CMAKE_CURRENT_SOURCE_REL_DIR}/${go_binary_SRCS}"
WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go") WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go")
...@@ -332,3 +354,12 @@ function(proto_library TARGET_NAME) ...@@ -332,3 +354,12 @@ function(proto_library TARGET_NAME)
protobuf_generate_cpp(proto_srcs proto_hdrs ${proto_library_SRCS}) protobuf_generate_cpp(proto_srcs proto_hdrs ${proto_library_SRCS})
cc_library(${TARGET_NAME} SRCS ${proto_srcs} DEPS ${proto_library_DEPS} protobuf) cc_library(${TARGET_NAME} SRCS ${proto_srcs} DEPS ${proto_library_DEPS} protobuf)
endfunction() endfunction()
function(py_proto_compile TARGET_NAME)
set(oneValueArgs "")
set(multiValueArgs SRCS)
cmake_parse_arguments(py_proto_compile "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(py_srcs)
protobuf_generate_python(py_srcs ${py_proto_compile_SRCS})
add_custom_target(${TARGET_NAME} ALL DEPENDS ${py_srcs})
endfunction()
\ No newline at end of file
...@@ -445,6 +445,11 @@ smooth_l1_cost ...@@ -445,6 +445,11 @@ smooth_l1_cost
.. autoclass:: paddle.v2.layer.smooth_l1_cost .. autoclass:: paddle.v2.layer.smooth_l1_cost
:noindex: :noindex:
multibox_loss
--------------
.. autoclass:: paddle.v2.layer.multibox_loss
:noindex:
Check Layer Check Layer
============ ============
...@@ -468,3 +473,11 @@ prelu ...@@ -468,3 +473,11 @@ prelu
-------- --------
.. autoclass:: paddle.v2.layer.prelu .. autoclass:: paddle.v2.layer.prelu
:noindex: :noindex:
Detection output Layer
======================
detection_output
----------------
.. autoclass:: paddle.v2.layer.detection_output
:noindex:
...@@ -101,7 +101,7 @@ ...@@ -101,7 +101,7 @@
</div> </div>
<div class="site-nav-links"> <div class="site-nav-links">
<div class="site-menu"> <div class="site-menu">
<a class="fork-on-github" href="https://github.com/PaddlePaddle/Paddle" target="_blank"><i class="fa fa-github"></i>Folk me on Github</a> <a class="fork-on-github" href="https://github.com/PaddlePaddle/Paddle" target="_blank"><i class="fa fa-github"></i>Fork me on Github</a>
<div class="language-switcher dropdown"> <div class="language-switcher dropdown">
<a type="button" data-toggle="dropdown"> <a type="button" data-toggle="dropdown">
<span>English</span> <span>English</span>
......
...@@ -12,4 +12,4 @@ ...@@ -12,4 +12,4 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
go_binary(master SRC master.go) go_binary(master SRC master.go DEPS paddle_go_optimizer)
...@@ -12,4 +12,4 @@ ...@@ -12,4 +12,4 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
go_binary(pserver SRCS pserver.go) go_binary(pserver SRCS pserver.go DEPS paddle_go_optimizer)
go_library(paddle_master SHARED) go_library(paddle_master SHARED DEPS paddle_go_optimizer)
...@@ -104,11 +104,22 @@ func paddle_set_dataset(client C.paddle_master_client, path **C.char, size C.int ...@@ -104,11 +104,22 @@ func paddle_set_dataset(client C.paddle_master_client, path **C.char, size C.int
return C.PADDLE_MASTER_OK return C.PADDLE_MASTER_OK
} }
// return value:
// 0:ok
// -1:error
//export paddle_next_record //export paddle_next_record
func paddle_next_record(client C.paddle_master_client, record **C.uchar) C.int { func paddle_next_record(client C.paddle_master_client, record **C.uchar) C.int {
c := get(client) c := get(client)
r := c.NextRecord() r, err := c.NextRecord()
if err != nil {
// Error
// TODO: return the type of error?
*record = (*C.uchar)(nullPtr)
return -1
}
if len(r) == 0 { if len(r) == 0 {
// Empty record
*record = (*C.uchar)(nullPtr) *record = (*C.uchar)(nullPtr)
return 0 return 0
} }
......
...@@ -11,7 +11,12 @@ import ( ...@@ -11,7 +11,12 @@ import (
// Client is the client of the master server. // Client is the client of the master server.
type Client struct { type Client struct {
conn *connection.Conn conn *connection.Conn
ch chan []byte ch chan record
}
type record struct {
r []byte
err error
} }
// NewClient creates a new Client. // NewClient creates a new Client.
...@@ -21,7 +26,7 @@ type Client struct { ...@@ -21,7 +26,7 @@ type Client struct {
func NewClient(addrCh <-chan string, bufSize int) *Client { func NewClient(addrCh <-chan string, bufSize int) *Client {
c := &Client{} c := &Client{}
c.conn = connection.New() c.conn = connection.New()
c.ch = make(chan []byte, bufSize) c.ch = make(chan record, bufSize)
go c.monitorMaster(addrCh) go c.monitorMaster(addrCh)
go c.getRecords() go c.getRecords()
return c return c
...@@ -46,10 +51,11 @@ func (c *Client) getRecords() { ...@@ -46,10 +51,11 @@ func (c *Client) getRecords() {
s := recordio.NewRangeScanner(f, &chunk.Index, -1, -1) s := recordio.NewRangeScanner(f, &chunk.Index, -1, -1)
for s.Scan() { for s.Scan() {
c.ch <- s.Record() c.ch <- record{s.Record(), nil}
} }
if s.Err() != nil { if s.Err() != nil {
c.ch <- record{nil, s.Err()}
log.Errorln(err, chunk.Path) log.Errorln(err, chunk.Path)
} }
...@@ -116,6 +122,7 @@ func (c *Client) taskFinished(taskID int) error { ...@@ -116,6 +122,7 @@ func (c *Client) taskFinished(taskID int) error {
// //
// NextRecord will block until the next record is available. It is // NextRecord will block until the next record is available. It is
// thread-safe. // thread-safe.
func (c *Client) NextRecord() []byte { func (c *Client) NextRecord() ([]byte, error) {
return <-c.ch r := <-c.ch
return r.r, r.err
} }
...@@ -68,12 +68,17 @@ func TestNextRecord(t *testing.T) { ...@@ -68,12 +68,17 @@ func TestNextRecord(t *testing.T) {
for pass := 0; pass < 50; pass++ { for pass := 0; pass < 50; pass++ {
received := make(map[byte]bool) received := make(map[byte]bool)
for i := 0; i < total; i++ { for i := 0; i < total; i++ {
r := c.NextRecord() r, err := c.NextRecord()
if err != nil {
t.Fatal(pass, i, "Read error:", err)
}
if len(r) != 1 { if len(r) != 1 {
t.Fatal("Length should be 1.", r) t.Fatal(pass, i, "Length should be 1.", r)
} }
if received[r[0]] { if received[r[0]] {
t.Fatal("Received duplicate.", received, r) t.Fatal(pass, i, "Received duplicate.", received, r)
} }
received[r[0]] = true received[r[0]] = true
} }
......
cc_library(paddle_go_optimizer DEPS paddle_optimizer paddle_proto glog gflags protobuf) cc_library(paddle_go_optimizer DEPS paddle_optimizer paddle_proto glog gflags protobuf)
target_link_libraries(paddle_go_optimizer stdc++ m)
go_library(paddle_pserver_cclient STATIC DEPS paddle_go_optimizer) go_library(paddle_pserver_cclient STATIC DEPS paddle_go_optimizer)
if(WITH_TESTING) if(WITH_TESTING)
add_subdirectory(test) # FIXME: this test requires pserver which is not managed by the test
# we need some kind of e2e testing machanism.
# add_subdirectory(test)
endif() endif()
cc_test(test_cclient SRCS test_cclient.c DEPS paddle_pserver_cclient) cc_test(test_cclient SRCS test_cclient.c DEPS paddle_pserver_cclient paddle_go_optimizer)
add_style_check_target(test_cclient test_cclient.c) add_style_check_target(test_cclient test_cclient.c)
...@@ -2,7 +2,7 @@ package pserver ...@@ -2,7 +2,7 @@ package pserver
// #cgo CFLAGS: -I ../../ // #cgo CFLAGS: -I ../../
// //FIXME: ldflags contain "build" path // //FIXME: ldflags contain "build" path
// #cgo LDFLAGS: ../../build/go/pserver/client/c/libpaddle_go_optimizer.a -lstdc++ // #cgo LDFLAGS: ${SRCDIR}/../../build/go/pserver/client/c/libpaddle_go_optimizer.a -lstdc++ -lm
// #include "paddle/optimizer/optimizer.h" // #include "paddle/optimizer/optimizer.h"
// #include <stdlib.h> // #include <stdlib.h>
// #include <string.h> // #include <string.h>
...@@ -56,8 +56,8 @@ func newOptimizer(paramWithConfigs ParameterWithConfig) *optimizer { ...@@ -56,8 +56,8 @@ func newOptimizer(paramWithConfigs ParameterWithConfig) *optimizer {
func (o *optimizer) GetWeights() []byte { func (o *optimizer) GetWeights() []byte {
var buffer unsafe.Pointer var buffer unsafe.Pointer
buffer_len := C.paddle_optimizer_get_weights(o.opt, &buffer) bufferLen := C.paddle_optimizer_get_weights(o.opt, &buffer)
return cArrayToSlice(buffer, int(buffer_len)*C.sizeof_float) return cArrayToSlice(buffer, int(bufferLen)*C.sizeof_float)
} }
func (o *optimizer) UpdateParameter(g Gradient) error { func (o *optimizer) UpdateParameter(g Gradient) error {
......
...@@ -10,7 +10,9 @@ import ( ...@@ -10,7 +10,9 @@ import (
type ElementType int type ElementType int
const ( const (
// AlreadyInitialized is true if pserver is initialized
AlreadyInitialized = "pserver already initialized" AlreadyInitialized = "pserver already initialized"
// Uninitialized is true if pserver not fully initialized
Uninitialized = "pserver not fully initialized" Uninitialized = "pserver not fully initialized"
) )
......
...@@ -66,6 +66,7 @@ SWIG_LINK_LIBRARIES(swig_paddle ...@@ -66,6 +66,7 @@ SWIG_LINK_LIBRARIES(swig_paddle
paddle_trainer_lib paddle_trainer_lib
paddle_network paddle_network
paddle_parameter paddle_parameter
paddle_optimizer
paddle_math paddle_math
paddle_utils paddle_utils
paddle_proto paddle_proto
......
...@@ -2,9 +2,17 @@ ...@@ -2,9 +2,17 @@
cc_library(ddim SRCS ddim.cc) cc_library(ddim SRCS ddim.cc)
cc_test(ddim_test SRCS ddim_test.cc DEPS ddim) cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
nv_test(dim_test SRCS dim_test.cu DEPS ddim) nv_test(dim_test SRCS dim_test.cu DEPS ddim)
cc_test(tensor_test SRCS tensor_test.cc DEPS ddim)
cc_test(variable_test SRCS variable_test.cc) cc_test(variable_test SRCS variable_test.cc)
cc_test(scope_test SRCS scope_test.cc) cc_test(scope_test SRCS scope_test.cc)
cc_test(enforce_test SRCS enforce_test.cc) cc_test(enforce_test SRCS enforce_test.cc)
proto_library(attr_type SRCS attr_type.proto) proto_library(attr_type SRCS attr_type.proto)
proto_library(op_proto SRCS op_proto.proto DEPS attr_type) proto_library(op_proto SRCS op_proto.proto DEPS attr_type)
cc_test(op_proto_test SRCS op_proto_test.cc DEPS op_proto protobuf) cc_test(op_proto_test SRCS op_proto_test.cc DEPS op_proto protobuf)
proto_library(op_desc SRCS op_desc.proto DEPS attr_type)
cc_test(op_desc_test SRCS op_desc_test.cc DEPS op_desc protobuf)
cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_proto op_desc)
py_proto_compile(framework_py_proto SRCS attr_type.proto op_proto.proto op_desc.proto)
# Generate an empty __init__.py to make framework_py_proto as a valid python module.
add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
add_dependencies(framework_py_proto framework_py_proto_init)
#pragma once
#include <boost/variant.hpp>
#include <functional>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/framework/enforce.h"
namespace paddle {
namespace framework {
typedef boost::variant<boost::blank, int, float, std::string, std::vector<int>,
std::vector<float>, std::vector<std::string>>
Attribute;
typedef std::unordered_map<std::string, Attribute> AttributeMap;
// check whether a value(attribute) fit a certain limit
template <typename T>
class LargerThanChecker {
public:
LargerThanChecker(T lower_bound) : lower_bound_(lower_bound) {}
void operator()(T& value) const {
PADDLE_ENFORCE(value > lower_bound_, "larger_than check fail");
}
private:
T lower_bound_;
};
// we can provide users more common Checker, like 'LessThanChecker',
// 'BetweenChecker'...
template <typename T>
class DefaultValueSetter {
public:
DefaultValueSetter(T default_value) : default_value_(default_value) {}
void operator()(T& value) const { value = default_value_; }
private:
T default_value_;
};
// check whether a certain attribute fit its limits
// an attribute can have more than one limits
template <typename T>
class TypedAttrChecker {
typedef std::function<void(T&)> ValueChecker;
public:
TypedAttrChecker(const std::string& attr_name) : attr_name_(attr_name) {}
TypedAttrChecker& LargerThan(const T& lower_bound) {
value_checkers_.push_back(LargerThanChecker<T>(lower_bound));
return *this;
}
// we can add more common limits, like LessThan(), Between()...
TypedAttrChecker& SetDefault(const T& default_value) {
PADDLE_ENFORCE(default_value_setter_.empty(),
"%s can't have more than one default value!", attr_name_);
default_value_setter_.push_back(DefaultValueSetter<T>(default_value));
return *this;
}
// allow users provide their own checker
TypedAttrChecker& AddCustomChecker(const ValueChecker& checker) {
value_checkers_.push_back(checker);
return *this;
}
void operator()(AttributeMap& attr_map) const {
if (!attr_map.count(attr_name_)) {
// user do not set this attr
PADDLE_ENFORCE(!default_value_setter_.empty(),
"Attribute '%s' is required!", attr_name_);
// default_value_setter_ has no more than one element
T val;
(default_value_setter_[0])(val);
attr_map[attr_name_] = val;
}
Attribute& attr = attr_map.at(attr_name_);
T& attr_value = boost::get<T>(attr);
for (const auto& checker : value_checkers_) {
checker(attr_value);
}
}
private:
std::string attr_name_;
std::vector<ValueChecker> value_checkers_;
std::vector<ValueChecker> default_value_setter_;
};
// check whether op's all attributes fit their own limits
class OpAttrChecker {
typedef std::function<void(AttributeMap&)> AttrChecker;
public:
template <typename T>
TypedAttrChecker<T>& AddAttrChecker(const std::string& attr_name) {
attr_checkers_.push_back(TypedAttrChecker<T>(attr_name));
AttrChecker& checker = attr_checkers_.back();
return *(checker.target<TypedAttrChecker<T>>());
}
void Check(AttributeMap& attr_map) const {
for (const auto& checker : attr_checkers_) {
checker(attr_map);
}
}
private:
std::vector<AttrChecker> attr_checkers_;
};
} // namespace framework
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
syntax="proto2";
package paddle.framework;
import "attr_type.proto";
// AttrDesc is used to describe Attributes of an Operator. It contain's
// name, type, and value of Attribute.
//
// e.g, for scale=3.0: name=scala, type=AttrType.FLOAT, value=3.0
message AttrDesc {
required string name = 1;
required AttrType type = 2;
optional int32 i = 3;
optional float f = 4;
optional string s = 5;
repeated int32 ints = 6;
repeated float floats = 7;
repeated string strings = 8;
};
// Protocol Message to describe an Operator.
//
// In PaddlePaddle, Operator is used to do a certain computation such
// as "add", "sub", "cosine", etc.
// (1) Operator needs to know the input and output variable names.
// (2) Some ops may have special attributes such as "scale" in "CosineOp".
//
// 3rd-party language can build this proto message and call
// AddOp(const OpDesc& op_desc) of Paddle core to create an Operator.
message OpDesc {
// input names of this Operator.
repeated string inputs = 1;
// output names of this Operator.
repeated string outputs = 2;
// type of this Operator, such as "add", "sub", "fc".
required string type = 3;
// Attributes of this Operator. e.g., scale=3.0 in cosine op.
repeated AttrDesc attrs = 4;
};
\ No newline at end of file
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include <paddle/framework/op_desc.pb.h>
TEST(OpDesc, Create) {
paddle::framework::OpDesc op_desc;
op_desc.set_type("add");
op_desc.add_inputs("X");
op_desc.add_inputs("Y");
op_desc.add_outputs("Z");
auto attr = op_desc.mutable_attrs()->Add();
attr->set_type(paddle::framework::AttrType::FLOAT);
attr->set_f(3.14);
// required field name is not set, so IsInitialized should be false.
ASSERT_FALSE(op_desc.IsInitialized());
attr->set_name("add");
// after all required fields are set, IsInitialized should be true now.
ASSERT_TRUE(op_desc.IsInitialized());
}
\ No newline at end of file
#pragma once
#include "paddle/framework/attr_checker.h"
//#include "paddle/framework/op_base.h"
#include "paddle/framework/op_desc.pb.h"
#include "paddle/framework/op_proto.pb.h"
namespace paddle {
namespace framework {
//==================For test================//
class OpBase {
public:
std::vector<std::string> inputs_;
std::vector<std::string> outputs_;
AttributeMap attr_map_;
virtual std::string Run() const = 0;
virtual ~OpBase() {}
};
//=========================================//
// helper class to set attribute type
struct AttrTypeHelper {
template <typename T>
static void SetAttrType(AttrProto* attr);
static Attribute GetAttrValue(const AttrDesc& attr_desc) {
switch (attr_desc.type()) {
case paddle::framework::AttrType::INT: {
return attr_desc.i();
}
case paddle::framework::AttrType::FLOAT: {
return attr_desc.f();
}
case paddle::framework::AttrType::STRING: {
return attr_desc.s();
}
case paddle::framework::AttrType::INTS: {
std::vector<int> val(attr_desc.ints_size());
for (int i = 0; i < attr_desc.ints_size(); ++i) {
val[i] = attr_desc.ints(i);
}
return val;
}
case paddle::framework::AttrType::FLOATS: {
std::vector<float> val(attr_desc.floats_size());
for (int i = 0; i < attr_desc.floats_size(); ++i) {
val[i] = attr_desc.floats(i);
}
return val;
}
case paddle::framework::AttrType::STRINGS: {
std::vector<std::string> val(attr_desc.strings_size());
for (int i = 0; i < attr_desc.strings_size(); ++i) {
val[i] = attr_desc.strings(i);
}
return val;
}
}
PADDLE_ENFORCE(false, "Unknown OpDesc::AttrDesc::type !");
return boost::blank();
}
};
template <>
void AttrTypeHelper::SetAttrType<int>(AttrProto* attr) {
attr->set_type(paddle::framework::AttrType::INT);
}
template <>
void AttrTypeHelper::SetAttrType<float>(AttrProto* attr) {
attr->set_type(paddle::framework::AttrType::FLOAT);
}
template <>
void AttrTypeHelper::SetAttrType<std::string>(AttrProto* attr) {
attr->set_type(paddle::framework::AttrType::STRING);
}
template <>
void AttrTypeHelper::SetAttrType<std::vector<int>>(AttrProto* attr) {
attr->set_type(paddle::framework::AttrType::INTS);
}
template <>
void AttrTypeHelper::SetAttrType<std::vector<float>>(AttrProto* attr) {
attr->set_type(paddle::framework::AttrType::FLOATS);
}
template <>
void AttrTypeHelper::SetAttrType<std::vector<std::string>>(AttrProto* attr) {
attr->set_type(paddle::framework::AttrType::STRINGS);
}
// this class not only make proto but also init attribute checkers.
class OpProtoAndCheckerMaker {
public:
OpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
: proto_(proto), op_checker_(op_checker) {}
protected:
void AddInput(const std::string& name, const std::string& comment) {
auto input = proto_->mutable_inputs()->Add();
*(input->mutable_name()) = name;
*(input->mutable_comment()) = comment;
}
void AddOutput(const std::string& name, const std::string& comment) {
auto output = proto_->mutable_outputs()->Add();
*(output->mutable_name()) = name;
*(output->mutable_comment()) = comment;
}
template <typename T>
TypedAttrChecker<T>& AddAttr(const std::string& name,
const std::string& comment) {
auto attr = proto_->mutable_attrs()->Add();
*(attr->mutable_name()) = name;
*(attr->mutable_comment()) = comment;
AttrTypeHelper::SetAttrType<T>(attr);
return op_checker_->AddAttrChecker<T>(name);
}
void AddType(const std::string& op_type) { proto_->set_type(op_type); }
void AddComment(const std::string& comment) {
*(proto_->mutable_comment()) = comment;
}
OpProto* proto_;
OpAttrChecker* op_checker_;
};
class OpRegistry {
typedef std::function<OpBase*()> OpCreator;
public:
template <typename OpType, typename ProtoMakerType>
static void RegisterOp(const std::string& op_type) {
creators_[op_type] = []() { return new OpType; };
OpProto& op_proto = protos_[op_type];
OpAttrChecker& op_checker = op_checkers_[op_type];
ProtoMakerType(&op_proto, &op_checker);
PADDLE_ENFORCE(op_proto.IsInitialized() == true,
"Fail to initialize %s's OpProto !", op_type);
}
static OpBase* CreateOp(const OpDesc& op_desc) {
std::string op_type = op_desc.type();
OpBase* op = (creators_.at(op_type))();
(op->inputs_).resize(op_desc.inputs_size());
for (int i = 0; i < op_desc.inputs_size(); ++i) {
(op->inputs_)[i] = op_desc.inputs(i);
}
(op->outputs_).resize(op_desc.outputs_size());
for (int i = 0; i < op_desc.outputs_size(); ++i) {
(op->outputs_)[i] = op_desc.outputs(i);
}
for (int i = 0; i < op_desc.attrs_size(); ++i) {
const AttrDesc& ith_attr = op_desc.attrs(i);
std::string name = ith_attr.name();
(op->attr_map_)[name] = AttrTypeHelper::GetAttrValue(ith_attr);
}
const OpAttrChecker& op_checker = op_checkers_.at(op_type);
op_checker.Check(op->attr_map_);
return op;
}
private:
static std::unordered_map<std::string, OpCreator> creators_;
static std::unordered_map<std::string, OpProto> protos_;
static std::unordered_map<std::string, OpAttrChecker> op_checkers_;
};
std::unordered_map<std::string, std::function<OpBase*()>> OpRegistry::creators_;
std::unordered_map<std::string, OpProto> OpRegistry::protos_;
std::unordered_map<std::string, OpAttrChecker> OpRegistry::op_checkers_;
template <typename OpType, typename ProtoMakerType>
class OpRegisterHelper {
public:
OpRegisterHelper(std::string op_type) {
OpRegistry::RegisterOp<OpType, ProtoMakerType>(op_type);
}
};
#define REGISTER_OP(__op_class, __op_maker_class, __op_type) \
class __op_class##Register { \
private: \
const static OpRegisterHelper<__op_class, __op_maker_class> reg; \
}; \
const OpRegisterHelper<__op_class, __op_maker_class> \
__op_class##Register::reg(#__op_type);
// Demos
class CosineOp : public OpBase {
public:
virtual std::string Run() const {
std::string msg = "CosineOp runs! scale = " +
std::to_string(boost::get<float>(attr_map_.at("scale")));
return msg;
}
};
class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
public:
CosineOpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("input", "input of cosine op");
AddOutput("output", "output of cosine op");
AddAttr<float>("scale", "scale of cosine op")
.SetDefault(1.0)
.LargerThan(0.0);
AddType("cos");
AddComment("This is cos op");
}
};
REGISTER_OP(CosineOp, CosineOpProtoAndCheckerMaker, cos_sim)
class MyTestOp : public OpBase {
public:
virtual std::string Run() const {
std::string msg =
"MyTestOp runs! test_attr = " +
std::to_string(boost::get<int>(attr_map_.at("test_attr")));
return msg;
}
};
class MyTestOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
public:
MyTestOpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("input", "input of cosine op");
AddOutput("output", "output of cosine op");
auto my_checker = [](int i) {
PADDLE_ENFORCE(i % 2 == 0, "'test_attr' must be even!");
};
AddAttr<int>("test_attr", "a simple test attribute")
.AddCustomChecker(my_checker);
AddType("my_test_op");
AddComment("This is my_test op");
}
};
REGISTER_OP(MyTestOp, MyTestOpProtoAndCheckerMaker, my_test_op)
} // namespace framework
} // namespace paddle
#include "paddle/framework/op_registry.h"
#include <gtest/gtest.h>
TEST(OpRegistry, CreateOp) {
paddle::framework::OpDesc op_desc;
op_desc.set_type("cos_sim");
op_desc.add_inputs("aa");
op_desc.add_outputs("bb");
auto attr = op_desc.mutable_attrs()->Add();
attr->set_name("scale");
attr->set_type(paddle::framework::AttrType::FLOAT);
attr->set_f(3.3);
paddle::framework::OpBase* op =
paddle::framework::OpRegistry::CreateOp(op_desc);
std::string debug_str = op->Run();
std::string str = "CosineOp runs! scale = " + std::to_string(3.3);
ASSERT_EQ(str.size(), debug_str.size());
for (size_t i = 0; i < debug_str.length(); ++i) {
ASSERT_EQ(debug_str[i], str[i]);
}
}
TEST(OpRegistry, IllegalAttr) {
paddle::framework::OpDesc op_desc;
op_desc.set_type("cos_sim");
op_desc.add_inputs("aa");
op_desc.add_outputs("bb");
auto attr = op_desc.mutable_attrs()->Add();
attr->set_name("scale");
attr->set_type(paddle::framework::AttrType::FLOAT);
attr->set_f(-2.0);
bool caught = false;
try {
paddle::framework::OpBase* op __attribute__((unused)) =
paddle::framework::OpRegistry::CreateOp(op_desc);
} catch (paddle::framework::EnforceNotMet err) {
caught = true;
std::string msg = "larger_than check fail";
const char* err_msg = err.what();
for (size_t i = 0; i < msg.length(); ++i) {
ASSERT_EQ(err_msg[i], msg[i]);
}
}
ASSERT_TRUE(caught);
}
TEST(OpRegistry, DefaultValue) {
paddle::framework::OpDesc op_desc;
op_desc.set_type("cos_sim");
op_desc.add_inputs("aa");
op_desc.add_outputs("bb");
paddle::framework::OpBase* op =
paddle::framework::OpRegistry::CreateOp(op_desc);
std::string debug_str = op->Run();
float default_value = 1.0;
std::string str = "CosineOp runs! scale = " + std::to_string(default_value);
ASSERT_EQ(str.size(), debug_str.size());
for (size_t i = 0; i < debug_str.length(); ++i) {
ASSERT_EQ(debug_str[i], str[i]);
}
}
TEST(OpRegistry, CustomChecker) {
paddle::framework::OpDesc op_desc;
op_desc.set_type("my_test_op");
op_desc.add_inputs("ii");
op_desc.add_outputs("oo");
// attr 'test_attr' is not set
bool caught = false;
try {
paddle::framework::OpBase* op __attribute__((unused)) =
paddle::framework::OpRegistry::CreateOp(op_desc);
} catch (paddle::framework::EnforceNotMet err) {
caught = true;
std::string msg = "Attribute 'test_attr' is required!";
const char* err_msg = err.what();
for (size_t i = 0; i < msg.length(); ++i) {
ASSERT_EQ(err_msg[i], msg[i]);
}
}
ASSERT_TRUE(caught);
// set 'test_attr' set to an illegal value
auto attr = op_desc.mutable_attrs()->Add();
attr->set_name("test_attr");
attr->set_type(paddle::framework::AttrType::INT);
attr->set_i(3);
caught = false;
try {
paddle::framework::OpBase* op __attribute__((unused)) =
paddle::framework::OpRegistry::CreateOp(op_desc);
} catch (paddle::framework::EnforceNotMet err) {
caught = true;
std::string msg = "'test_attr' must be even!";
const char* err_msg = err.what();
for (size_t i = 0; i < msg.length(); ++i) {
ASSERT_EQ(err_msg[i], msg[i]);
}
}
ASSERT_TRUE(caught);
// set 'test_attr' set to a legal value
op_desc.mutable_attrs()->Clear();
attr = op_desc.mutable_attrs()->Add();
attr->set_name("test_attr");
attr->set_type(paddle::framework::AttrType::INT);
attr->set_i(4);
paddle::framework::OpBase* op =
paddle::framework::OpRegistry::CreateOp(op_desc);
std::string debug_str = op->Run();
std::string str = "MyTestOp runs! test_attr = " + std::to_string(4);
ASSERT_EQ(str.size(), debug_str.size());
for (size_t i = 0; i < debug_str.length(); ++i) {
ASSERT_EQ(debug_str[i], str[i]);
}
}
\ No newline at end of file
...@@ -14,33 +14,39 @@ limitations under the License. */ ...@@ -14,33 +14,39 @@ limitations under the License. */
#pragma once #pragma once
#include <memory>
#include <type_traits>
#include "paddle/framework/ddim.h"
#include "paddle/framework/enforce.h"
#include "paddle/memory/memory.h"
#include "paddle/platform/place.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
class Tensor { class Tensor {
using paddle::platform::Place;
using paddle::platform::get_place;
public: public:
template <typename T> template <typename T>
const T* data() const { const T* data() const {
PADDLE_ASSERT(holder_ != nullptr, PADDLE_ENFORCE(holder_ != nullptr,
"Tensor::data must be called after Tensor::mutable_data"); "Tensor::data must be called after Tensor::mutable_data.");
return static_cast<const T*>(holder->Ptr()); return static_cast<const T*>(holder_->Ptr());
} }
template <typename T, // must be POD types template <typename T, // must be POD types
typename = std::enable_if<std::is_pod<T>::value>::type> typename std::enable_if<std::is_pod<T>::value>::type* = nullptr>
T* mutable_data(DDim dims, Place place) { T* mutable_data(DDim dims, paddle::platform::Place place) {
if (holder_ == nullptr || holder_->Place() != place || if (holder_ == nullptr ||
holder_->Size() < dims.product() * sizeof(T)) { !(holder_->Place() ==
holder_.reset(new PlaceholderImpl(place, dims.product() * sizeof(T))); place) /* some versions of boost::variant don't have operator!= */
|| holder_->Size() < product(dims) * sizeof(T)) {
holder_.reset(new PlaceholderImpl<T>(place, product(dims) * sizeof(T)));
} }
return static_cast<T*>(holder_->Ptr()); return static_cast<T*>(holder_->Ptr());
} }
template <typename T, // must be POD types template <typename T, // must be POD types
typename = std::enable_if<std::is_pod<T>::value>::type> typename std::enable_if<std::is_pod<T>::value>::type* = nullptr>
T* mutable_data(DDim dims) { T* mutable_data(DDim dims) {
return mutable_data<T>(dims, paddle::platform::get_place()); return mutable_data<T>(dims, paddle::platform::get_place());
} }
...@@ -51,27 +57,41 @@ class Tensor { ...@@ -51,27 +57,41 @@ class Tensor {
struct Placeholder { struct Placeholder {
virtual ~Placeholder() {} virtual ~Placeholder() {}
virtual void* Ptr() const = 0; virtual void* Ptr() const = 0;
virtual Place Place() const = 0; virtual paddle::platform::Place Place() const = 0;
virtual size_t Size() const = 0; virtual size_t Size() const = 0;
}; };
template <typename T> template <typename T>
struct PlaceholderImpl : public Placeholder { struct PlaceholderImpl : public Placeholder {
PlaceholderImpl(Place pl, size_t size) private:
: ptr_(paddle::memory::Alloc(pl, size), paddle::memory::Deleter(pl)), class Deleter {
place_(pl), public:
Deleter(platform::Place place) : place_(place) {}
void operator()(T* ptr) {
paddle::memory::Free(place_, static_cast<void*>(ptr));
}
private:
paddle::platform::Place place_;
};
public:
PlaceholderImpl(paddle::platform::Place place, size_t size)
: ptr_(static_cast<T*>(paddle::memory::Alloc(place, size)),
Deleter(place)),
place_(place),
size_(size) {} size_(size) {}
virtual void* Ptr() const { return static_cast<void*>(ptr_.get()); } virtual void* Ptr() const { return static_cast<void*>(ptr_.get()); }
virtual size_t Size() const { return size_; } virtual size_t Size() const { return size_; }
virtual Place Place() const { return place_; } virtual paddle::platform::Place Place() const { return place_; }
std::unique_ptr<T, memory::Deleter> ptr_; std::unique_ptr<T, Deleter> ptr_;
Place place_; // record the place of ptr_. paddle::platform::Place place_; // record the place of ptr_.
size_t size_; // size of the memory block. size_t size_; // size of the memory block.
}; };
std::unique_ptr<Placeholder> holder_; // holds the memory block if allocated. std::shared_ptr<Placeholder> holder_; // holds the memory block if allocated.
}; };
} // namespace framework } // namespace framework
......
/*
Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "paddle/framework/tensor.h"
#include <gtest/gtest.h>
#include <string>
TEST(Tensor, ASSERT) {
paddle::framework::Tensor cpu_tensor;
bool caught = false;
try {
const double* p __attribute__((unused)) = cpu_tensor.data<double>();
} catch (paddle::framework::EnforceNotMet err) {
caught = true;
std::string msg = "Tensor::data must be called after Tensor::mutable_data.";
const char* what = err.what();
for (size_t i = 0; i < msg.length(); ++i) {
ASSERT_EQ(what[i], msg[i]);
}
}
ASSERT_TRUE(caught);
}
/* mutable_data() is not tested at present
because Memory::Alloc() and Memory::Free() have not been ready.
TEST(Tensor, MutableData) {
using namespace paddle::framework;
using namespace paddle::platform;
{
Tensor cpu_tensor;
float* p1 = nullptr;
float* p2 = nullptr;
// initialization
p1 = cpu_tensor.mutable_data<float>(make_ddim({1, 2, 3}), CPUPlace());
EXPECT_NE(p1, nullptr);
// set cpu_tensor a new dim with large size
// momery is supposed to be re-allocated
p2 = cpu_tensor.mutable_data<float>(make_ddim({3, 4}));
EXPECT_NE(p2, nullptr);
EXPECT_NE(p1, p2);
// set cpu_tensor a new dim with same size
// momery block is supposed to be unchanged
p1 = cpu_tensor.mutable_data<float>(make_ddim({2, 2, 3}));
EXPECT_EQ(p1, p2);
// set cpu_tensor a new dim with smaller size
// momery block is supposed to be unchanged
p2 = cpu_tensor.mutable_data<float>(make_ddim({2, 2}));
EXPECT_EQ(p1, p2);
}
{
Tensor gpu_tensor;
float* p1 = nullptr;
float* p2 = nullptr;
// initialization
p1 = gpu_tensor.mutable_data<float>(make_ddim({1, 2, 3}), GPUPlace());
EXPECT_NE(p1, nullptr);
// set gpu_tensor a new dim with large size
// momery is supposed to be re-allocated
p2 = gpu_tensor.mutable_data<float>(make_ddim({3, 4}));
EXPECT_NE(p2, nullptr);
EXPECT_NE(p1, p2);
// set gpu_tensor a new dim with same size
// momery block is supposed to be unchanged
p1 = gpu_tensor.mutable_data<float>(make_ddim({2, 2, 3}));
EXPECT_EQ(p1, p2);
// set gpu_tensor a new dim with smaller size
// momery block is supposed to be unchanged
p2 = gpu_tensor.mutable_data<float>(make_ddim({2, 2}));
EXPECT_EQ(p1, p2);
}
}
*/
...@@ -25,6 +25,10 @@ namespace paddle { ...@@ -25,6 +25,10 @@ namespace paddle {
* If SequenceLevel = kNonSeq: * If SequenceLevel = kNonSeq:
* Output: output size is the number of input sequences (NOT input instances) * Output: output size is the number of input sequences (NOT input instances)
* output[i] = average_{for each instance in this sequence}{input[i]} * output[i] = average_{for each instance in this sequence}{input[i]}
* If stride_ > 0:
* Output: a shorten sequence. Stride is the step size by which we slide a
* window upon the input sequence, and the average pooling
* operation is then applied to each interval independently.
* If SequenceLevel = kSeq: * If SequenceLevel = kSeq:
* Check input sequence must has sub-sequence * Check input sequence must has sub-sequence
* Output: output size is the number of input sub-sequences * Output: output size is the number of input sub-sequences
......
...@@ -36,6 +36,16 @@ MatrixPtr CrossChannelNormLayer::createSpatialMatrix(MatrixPtr data, ...@@ -36,6 +36,16 @@ MatrixPtr CrossChannelNormLayer::createSpatialMatrix(MatrixPtr data,
data->getData() + iter * spatialDim, 1, spatialDim, false, useGpu_); data->getData() + iter * spatialDim, 1, spatialDim, false, useGpu_);
} }
bool CrossChannelNormLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
Layer::init(layerMap, parameterMap);
CHECK(parameters_[0]);
const NormConfig& conf = config_.inputs(0).norm_conf();
channels_ = conf.channels();
scale_.reset(new Weight(channels_, 1, parameters_[0]));
return true;
}
void CrossChannelNormLayer::forward(PassType passType) { void CrossChannelNormLayer::forward(PassType passType) {
Layer::forward(passType); Layer::forward(passType);
MatrixPtr inV = getInputValue(0); MatrixPtr inV = getInputValue(0);
...@@ -51,9 +61,7 @@ void CrossChannelNormLayer::forward(PassType passType) { ...@@ -51,9 +61,7 @@ void CrossChannelNormLayer::forward(PassType passType) {
Matrix::resizeOrCreate(dataBuffer_, batchSize, dataDim, false, useGpu_); Matrix::resizeOrCreate(dataBuffer_, batchSize, dataDim, false, useGpu_);
Matrix::resizeOrCreate(spatialBuffer_, 1, spatialDim, false, useGpu_); Matrix::resizeOrCreate(spatialBuffer_, 1, spatialDim, false, useGpu_);
Matrix::resizeOrCreate(normBuffer_, batchSize, spatialDim, false, useGpu_); Matrix::resizeOrCreate(normBuffer_, batchSize, spatialDim, false, useGpu_);
normBuffer_->zeroMem();
// add eps to avoid overflow
normBuffer_->addScalar(*normBuffer_, 1e-6);
inV->square2(*dataBuffer_); inV->square2(*dataBuffer_);
for (size_t i = 0; i < batchSize; i++) { for (size_t i = 0; i < batchSize; i++) {
const MatrixPtr inVTmp = createSampleMatrix(inV, i, spatialDim); const MatrixPtr inVTmp = createSampleMatrix(inV, i, spatialDim);
...@@ -63,6 +71,8 @@ void CrossChannelNormLayer::forward(PassType passType) { ...@@ -63,6 +71,8 @@ void CrossChannelNormLayer::forward(PassType passType) {
// compute norm. // compute norm.
spatialBuffer_->sumCols(*dataTmp, 1, 0); spatialBuffer_->sumCols(*dataTmp, 1, 0);
// add eps to avoid overflow
spatialBuffer_->add(1e-6);
spatialBuffer_->sqrt2(*spatialBuffer_); spatialBuffer_->sqrt2(*spatialBuffer_);
normTmp->copyFrom(*spatialBuffer_); normTmp->copyFrom(*spatialBuffer_);
outVTmp->copyFrom(*inVTmp); outVTmp->copyFrom(*inVTmp);
...@@ -82,6 +92,9 @@ void CrossChannelNormLayer::backward(const UpdateCallback& callback) { ...@@ -82,6 +92,9 @@ void CrossChannelNormLayer::backward(const UpdateCallback& callback) {
size_t dataDim = inG->getWidth(); size_t dataDim = inG->getWidth();
size_t spatialDim = dataDim / channels_; size_t spatialDim = dataDim / channels_;
MatrixPtr inGBuffer;
Matrix::resizeOrCreate(inGBuffer, channels_, spatialDim, false, useGpu_);
dataBuffer_->dotMul(*outG, *outV); dataBuffer_->dotMul(*outG, *outV);
Matrix::resizeOrCreate(scaleDiff_, channels_, 1, false, useGpu_); Matrix::resizeOrCreate(scaleDiff_, channels_, 1, false, useGpu_);
Matrix::resizeOrCreate(channelBuffer_, channels_, 1, false, useGpu_); Matrix::resizeOrCreate(channelBuffer_, channels_, 1, false, useGpu_);
...@@ -100,22 +113,24 @@ void CrossChannelNormLayer::backward(const UpdateCallback& callback) { ...@@ -100,22 +113,24 @@ void CrossChannelNormLayer::backward(const UpdateCallback& callback) {
scaleDiff_->add(*channelBuffer_, 1.); scaleDiff_->add(*channelBuffer_, 1.);
sampleBuffer_->dotMul(*inVTmp, *outGTmp); sampleBuffer_->dotMul(*inVTmp, *outGTmp);
spatialBuffer_->sumCols(*sampleBuffer_, 1., 1.); spatialBuffer_->sumCols(*sampleBuffer_, 1., 0.);
// scale the grad // scale the grad
inGTmp->copyFrom(*inVTmp); inGBuffer->copyFrom(*inVTmp);
inGTmp->mulRowVector(*spatialBuffer_); inGBuffer->mulRowVector(*spatialBuffer_);
// divide by square of norm // divide by square of norm
spatialBuffer_->dotMul(*normTmp, *normTmp); spatialBuffer_->dotMul(*normTmp, *normTmp);
inGTmp->divRowVector(*spatialBuffer_); inGBuffer->divRowVector(*spatialBuffer_);
// subtract // subtract
inGTmp->add(*outGTmp, -1, 1); inGBuffer->add(*outGTmp, -1, 1);
// divide by norm // divide by norm
inGTmp->divRowVector(*normTmp); inGBuffer->divRowVector(*normTmp);
// scale the diff // scale the diff
inGTmp->mulColVector(*scale_->getW()); inGBuffer->mulColVector(*scale_->getW());
inGTmp->add(*inGBuffer);
} }
// updata scale // updata scale
if (scale_->getWGrad()) scale_->getWGrad()->copyFrom(*scaleDiff_); if (scale_->getWGrad()) scale_->getWGrad()->add(*scaleDiff_);
scale_->getParameterPtr()->incUpdate(callback); scale_->getParameterPtr()->incUpdate(callback);
} }
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "DetectionOutputLayer.h"
namespace paddle {
REGISTER_LAYER(detection_output, DetectionOutputLayer);
bool DetectionOutputLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
Layer::init(layerMap, parameterMap);
auto& layerConf = config_.inputs(0).detection_output_conf();
numClasses_ = layerConf.num_classes();
inputNum_ = layerConf.input_num();
nmsThreshold_ = layerConf.nms_threshold();
confidenceThreshold_ = layerConf.confidence_threshold();
nmsTopK_ = layerConf.nms_top_k();
keepTopK_ = layerConf.keep_top_k();
backgroundId_ = layerConf.background_id();
return true;
}
void DetectionOutputLayer::forward(PassType passType) {
Layer::forward(passType);
size_t batchSize = getInputValue(*getLocInputLayer(0))->getHeight();
locSizeSum_ = 0;
confSizeSum_ = 0;
for (size_t n = 0; n < inputNum_; ++n) {
const MatrixPtr inLoc = getInputValue(*getLocInputLayer(n));
const MatrixPtr inConf = getInputValue(*getConfInputLayer(n));
locSizeSum_ += inLoc->getElementCnt();
confSizeSum_ += inConf->getElementCnt();
}
Matrix::resizeOrCreate(locTmpBuffer_, 1, locSizeSum_, false, useGpu_);
Matrix::resizeOrCreate(
confTmpBuffer_, confSizeSum_ / numClasses_, numClasses_, false, useGpu_);
size_t locOffset = 0;
size_t confOffset = 0;
auto& layerConf = config_.inputs(0).detection_output_conf();
for (size_t n = 0; n < inputNum_; ++n) {
const MatrixPtr inLoc = getInputValue(*getLocInputLayer(n));
const MatrixPtr inConf = getInputValue(*getConfInputLayer(n));
size_t height = getInput(*getLocInputLayer(n)).getFrameHeight();
if (!height) height = layerConf.height();
size_t width = getInput(*getLocInputLayer(n)).getFrameWidth();
if (!width) width = layerConf.width();
locOffset += appendWithPermute(*inLoc,
height,
width,
locSizeSum_,
locOffset,
batchSize,
*locTmpBuffer_,
kNCHWToNHWC);
confOffset += appendWithPermute(*inConf,
height,
width,
confSizeSum_,
confOffset,
batchSize,
*confTmpBuffer_,
kNCHWToNHWC);
}
CHECK_EQ(locOffset, locSizeSum_ / batchSize);
CHECK_EQ(confOffset, confSizeSum_ / batchSize);
MatrixPtr priorValue;
if (useGpu_) {
Matrix::resizeOrCreate(locCpuBuffer_, 1, locSizeSum_, false, false);
Matrix::resizeOrCreate(
confCpuBuffer_, confSizeSum_ / numClasses_, numClasses_, false, false);
MatrixPtr priorTmpValue = getInputValue(*getPriorBoxLayer());
Matrix::resizeOrCreate(
priorCpuValue_, 1, priorTmpValue->getElementCnt(), false, false);
locCpuBuffer_->copyFrom(*locTmpBuffer_);
confCpuBuffer_->copyFrom(*confTmpBuffer_);
priorCpuValue_->copyFrom(*priorTmpValue);
locBuffer_ = locCpuBuffer_;
confBuffer_ = confCpuBuffer_;
priorValue = priorCpuValue_;
} else {
priorValue = getInputValue(*getPriorBoxLayer());
locBuffer_ = locTmpBuffer_;
confBuffer_ = confTmpBuffer_;
}
confBuffer_->softmax(*confBuffer_);
size_t numPriors = priorValue->getElementCnt() / 8;
std::vector<std::vector<NormalizedBBox>> allDecodedBBoxes;
for (size_t n = 0; n < batchSize; ++n) {
std::vector<NormalizedBBox> decodedBBoxes;
for (size_t i = 0; i < numPriors; ++i) {
size_t priorOffset = i * 8;
size_t locPredOffset = n * numPriors * 4 + i * 4;
std::vector<NormalizedBBox> priorBBoxVec;
getBBoxFromPriorData(
priorValue->getData() + priorOffset, 1, priorBBoxVec);
std::vector<std::vector<real>> priorBBoxVar;
getBBoxVarFromPriorData(
priorValue->getData() + priorOffset, 1, priorBBoxVar);
std::vector<real> locPredData;
for (size_t j = 0; j < 4; ++j)
locPredData.push_back(*(locBuffer_->getData() + locPredOffset + j));
NormalizedBBox bbox =
decodeBBoxWithVar(priorBBoxVec[0], priorBBoxVar[0], locPredData);
decodedBBoxes.push_back(bbox);
}
allDecodedBBoxes.push_back(decodedBBoxes);
}
std::vector<std::map<size_t, std::vector<size_t>>> allIndices;
size_t numKept = getDetectionIndices(confBuffer_->getData(),
numPriors,
numClasses_,
backgroundId_,
batchSize,
confidenceThreshold_,
nmsTopK_,
nmsThreshold_,
keepTopK_,
allDecodedBBoxes,
&allIndices);
resetOutput(numKept, 7);
MatrixPtr outV = getOutputValue();
getDetectionOutput(confBuffer_->getData(),
numKept,
numPriors,
numClasses_,
batchSize,
allIndices,
allDecodedBBoxes,
*outV);
}
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <map>
#include <vector>
#include "DetectionUtil.h"
#include "Layer.h"
namespace paddle {
/**
* The detection output layer for a SSD detection task. This layer applies the
* Non-maximum suppression to the all predicted bounding box and keeps the
* Top-K bounding boxes.
* - Input: This layer needs three input layers: The first input layer
* is the priorbox layer. The rest two input layers are convolution
* layers for generating bbox location offset and the classification
* confidence.
* - Output: The predict bounding box locations.
*/
class DetectionOutputLayer : public Layer {
public:
explicit DetectionOutputLayer(const LayerConfig& config) : Layer(config) {}
bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
void forward(PassType passType);
void backward(const UpdateCallback& callback = nullptr) {}
protected:
inline LayerPtr getPriorBoxLayer() { return inputLayers_[0]; }
inline LayerPtr getLocInputLayer(size_t index) {
return inputLayers_[1 + index];
}
inline LayerPtr getConfInputLayer(size_t index) {
return inputLayers_[1 + inputNum_ + index];
}
private:
size_t numClasses_; // number of classes
size_t inputNum_; // number of input layers
real nmsThreshold_;
real confidenceThreshold_;
size_t nmsTopK_;
size_t keepTopK_;
size_t backgroundId_;
size_t locSizeSum_;
size_t confSizeSum_;
MatrixPtr locBuffer_;
MatrixPtr confBuffer_;
MatrixPtr locTmpBuffer_;
MatrixPtr confTmpBuffer_;
MatrixPtr priorCpuValue_;
MatrixPtr locCpuBuffer_;
MatrixPtr confCpuBuffer_;
};
} // namespace paddle
...@@ -26,6 +26,10 @@ namespace paddle { ...@@ -26,6 +26,10 @@ namespace paddle {
* If SequenceLevel = kNonSeq: * If SequenceLevel = kNonSeq:
* Output: output size is the number of input sequences (NOT input instances) * Output: output size is the number of input sequences (NOT input instances)
* output[i] = max_{for each instance in this sequence}{input[i]} * output[i] = max_{for each instance in this sequence}{input[i]}
* If stride_ > 0:
* Output: a shorten sequence. Stride is the step size by which we slide a
* window upon the input sequence, and the max pooling operation is
* then applied to each interval independently.
* If SequenceLevel = kSeq: * If SequenceLevel = kSeq:
* Check input sequence must has sub-sequence * Check input sequence must has sub-sequence
* Output: output size is the number of input sub-sequences * Output: output size is the number of input sub-sequences
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "MultiBoxLossLayer.h"
#include <float.h>
#include <vector>
#include "DataLayer.h"
namespace paddle {
REGISTER_LAYER(multibox_loss, MultiBoxLossLayer);
bool MultiBoxLossLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
Layer::init(layerMap, parameterMap);
auto layerConf = config_.inputs(0).multibox_loss_conf();
numClasses_ = layerConf.num_classes();
inputNum_ = layerConf.input_num();
overlapThreshold_ = layerConf.overlap_threshold();
negPosRatio_ = layerConf.neg_pos_ratio();
negOverlap_ = layerConf.neg_overlap();
backgroundId_ = layerConf.background_id();
return true;
}
void MultiBoxLossLayer::forward(PassType passType) {
Layer::forward(passType);
size_t batchSize = getInputValue(*getLocInputLayer(0))->getHeight();
resetOutput(batchSize, 1);
// all location data and confidence score data
locSizeSum_ = 0;
confSizeSum_ = 0;
for (size_t n = 0; n < inputNum_; ++n) {
const MatrixPtr inLoc = getInputValue(*getLocInputLayer(n));
const MatrixPtr inConf = getInputValue(*getConfInputLayer(n));
locSizeSum_ += inLoc->getElementCnt();
confSizeSum_ += inConf->getElementCnt();
}
// locBuffer layout:
// | xmin1 | ymin1 | xmax1 | ymax1 | xmin2 ......
Matrix::resizeOrCreate(locTmpBuffer_, 1, locSizeSum_, false, useGpu_);
locBuffer_ = locTmpBuffer_;
// confBuffer layout:
// | class1 score | class2 score | ... |classN score | class1 score | ......
Matrix::resizeOrCreate(confTmpBuffer_, 1, confSizeSum_, false, useGpu_);
confBuffer_ = confTmpBuffer_;
// concate location data and confidence score data
size_t locOffset = 0;
size_t confOffset = 0;
auto& layerConf = config_.inputs(0).multibox_loss_conf();
for (size_t n = 0; n < inputNum_; ++n) {
const MatrixPtr inLoc = getInputValue(*getLocInputLayer(n));
const MatrixPtr inConf = getInputValue(*getConfInputLayer(n));
size_t height = getInput(*getLocInputLayer(n)).getFrameHeight();
if (!height) height = layerConf.height();
size_t width = getInput(*getLocInputLayer(n)).getFrameWidth();
if (!width) width = layerConf.width();
locOffset += appendWithPermute(*inLoc,
height,
width,
locSizeSum_,
locOffset,
batchSize,
*locBuffer_,
kNCHWToNHWC);
confOffset += appendWithPermute(*inConf,
height,
width,
confSizeSum_,
confOffset,
batchSize,
*confBuffer_,
kNCHWToNHWC);
}
CHECK_EQ(locOffset, locSizeSum_ / batchSize);
CHECK_EQ(confOffset, confSizeSum_ / batchSize);
// priorValue layout:
// | xmin1 | ymin1 | xmax1 | ymax1 | xmin1Var | ymin1Var | xmax1Var | ymax1Var
// | xmin2 | ......
MatrixPtr priorValue;
// labelValue layout:
// | class1_1 | xmin1_1 | ymin1_1 | xmax1_1 | ymax1_1 | difficult1_1 | ......
MatrixPtr labelValue;
// Copy data from GPU to CPU if use GPU
if (useGpu_) {
Matrix::resizeOrCreate(locCpuBuffer_, 1, locSizeSum_, false, false);
Matrix::resizeOrCreate(confCpuBuffer_, 1, confSizeSum_, false, false);
MatrixPtr priorTmpValue = getInputValue(*getPriorBoxLayer());
Matrix::resizeOrCreate(
priorCpuValue_, 1, priorTmpValue->getElementCnt(), false, false);
MatrixPtr labelTmpValue = getInputValue(*getLabelLayer());
Matrix::resizeOrCreate(labelCpuValue_,
labelTmpValue->getHeight(),
labelTmpValue->getWidth(),
false,
false);
locCpuBuffer_->copyFrom(*locTmpBuffer_);
confCpuBuffer_->copyFrom(*confTmpBuffer_);
priorCpuValue_->copyFrom(*priorTmpValue);
labelCpuValue_->copyFrom(*labelTmpValue);
locBuffer_ = locCpuBuffer_;
confBuffer_ = confCpuBuffer_;
priorValue = priorCpuValue_;
labelValue = labelCpuValue_;
} else {
priorValue = getInputValue(*getPriorBoxLayer());
labelValue = getInputValue(*getLabelLayer());
}
// Get max scores for each prior bbox. Used in negative mining
std::vector<std::vector<real>> allMaxConfScore;
numPriors_ = priorValue->getElementCnt() / 8;
getMaxConfidenceScores(confBuffer_->getData(),
batchSize,
numPriors_,
numClasses_,
backgroundId_,
&allMaxConfScore);
// Match prior bbox to groundtruth bbox
Argument label = getInput(*getLabelLayer());
const int* labelIndex = label.sequenceStartPositions->getData(false);
size_t seqNum = label.getNumSequences();
numMatches_ = 0;
numNegs_ = 0;
allMatchIndices_.clear();
allNegIndices_.clear();
std::pair<size_t, size_t> retPair = generateMatchIndices(*priorValue,
numPriors_,
*labelValue,
labelIndex,
seqNum,
allMaxConfScore,
batchSize,
overlapThreshold_,
negOverlap_,
negPosRatio_,
&allMatchIndices_,
&allNegIndices_);
numMatches_ = retPair.first;
numNegs_ = retPair.second;
// BBox location L1 smooth loss
locLoss_ = 0.0;
if (numMatches_ >= 1) {
size_t count = 0;
MatrixPtr locLossOutput;
Matrix::resizeOrCreate(locLossOutput, numMatches_ * 4, 1, false, false);
Matrix::resizeOrCreate(locGTData_, numMatches_ * 4, 1, false, false);
Matrix::resizeOrCreate(locDiff_, numMatches_ * 4, 1, false, false);
locDiff_->zeroMem();
std::vector<real> locGTData;
real* locDiffData = locDiff_->getData();
const real* locBufferData = locBuffer_->getData();
for (size_t n = 0; n < batchSize; ++n) {
for (size_t i = 0; i < numPriors_; ++i) {
if (allMatchIndices_[n][i] == -1) continue; // match none
size_t locOffset =
n * (locBuffer_->getElementCnt() / batchSize) + i * 4;
std::copy(locBufferData + locOffset,
locBufferData + locOffset + 4,
locDiffData + count);
count += 4;
const int gtIdx = allMatchIndices_[n][i];
size_t priorOffset = i * 8;
std::vector<NormalizedBBox> priorBBoxVec;
getBBoxFromPriorData(
priorValue->getData() + priorOffset, 1, priorBBoxVec);
std::vector<std::vector<real>> priorBBoxVar;
getBBoxVarFromPriorData(
priorValue->getData() + priorOffset, 1, priorBBoxVar);
size_t labelOffset = (labelIndex[n] + gtIdx) * 6;
std::vector<NormalizedBBox> gtBBoxVec;
getBBoxFromLabelData(labelValue->getData() + labelOffset, 1, gtBBoxVec);
std::vector<real> gtEncode;
encodeBBoxWithVar(
priorBBoxVec[0], priorBBoxVar[0], gtBBoxVec[0], gtEncode);
locGTData.insert(locGTData.end(), gtEncode.begin(), gtEncode.end());
}
}
locGTData_->copyFrom(&locGTData[0], numMatches_ * 4);
locLossOutput->smoothL1(*locDiff_, *locGTData_, 0.0);
locLoss_ = locLossOutput->getSum() / numMatches_;
}
// BBox confidence softmax loss
confLoss_ = 0;
numConf_ = numMatches_ + numNegs_;
if (numConf_ >= 1) {
Matrix::resizeOrCreate(confProb_, numConf_, numClasses_, false, false);
IVector::resizeOrCreate(confGTData_, numConf_, false);
confProb_->zeroMem();
size_t count = 0;
std::vector<real> confPredData;
real* confProbData = confProb_->getData();
const real* confBufferData = confBuffer_->getData();
for (size_t n = 0; n < batchSize; ++n) {
for (size_t i = 0; i < numPriors_; ++i) {
if (allMatchIndices_[n][i] == -1) continue;
size_t labelOffset = (labelIndex[n] + allMatchIndices_[n][i]) * 6;
const int gtLabel = (labelValue->getData() + labelOffset)[0];
confGTData_->getData()[count] = gtLabel;
size_t confOffset = n * numPriors_ * numClasses_ + i * numClasses_;
std::copy(confBufferData + confOffset,
confBufferData + confOffset + numClasses_,
confProbData + count * numClasses_);
confPredData.reserve(confPredData.size() + numClasses_);
confPredData.insert(confPredData.end(),
confBufferData + confOffset,
confBufferData + confOffset + numClasses_);
++count;
}
// Negative mining samples
for (size_t i = 0; i < allNegIndices_[n].size(); ++i) {
confGTData_->getData()[count] = backgroundId_;
size_t confOffset =
n * numPriors_ * numClasses_ + allNegIndices_[n][i] * numClasses_;
std::copy(confBufferData + confOffset,
confBufferData + confOffset + numClasses_,
confProbData + count * numClasses_);
confPredData.reserve(confPredData.size() + numClasses_);
confPredData.insert(confPredData.end(),
confBufferData + confOffset,
confBufferData + confOffset + numClasses_);
++count;
}
}
CHECK_EQ(numConf_, count);
confProb_->softmax(*confProb_);
MatrixPtr confLossOutput;
Matrix::resizeOrCreate(confLossOutput, numConf_, 1, false, false);
confLossOutput->oneHotCrossEntropy(*confProb_, *confGTData_);
confLoss_ = confLossOutput->getSum() / numMatches_;
}
real loss = locLoss_ + confLoss_;
MatrixPtr outV = getOutputValue();
outV->assign(loss);
}
void MultiBoxLossLayer::backward(const UpdateCallback& callback) {
size_t batchSize = getInputValue(*getLocInputLayer(0))->getHeight();
locBuffer_->zeroMem();
confBuffer_->zeroMem();
// Back propagate on location prediction
if (numMatches_ >= 1) {
MatrixPtr locDiffBuffer;
Matrix::resizeOrCreate(locDiffBuffer, numMatches_ * 4, 1, false, false);
locDiffBuffer->smoothL1Bp(*locDiff_, *locGTData_, 0.0);
locDiff_->copyFrom(*locDiffBuffer);
// scale gradient
for (size_t i = 0; i < numMatches_ * 4; ++i)
locDiff_->getData()[i] *= (1. / numMatches_);
// Copy gradient back
size_t count = 0;
const real* locDiffData = locDiff_->getData();
for (size_t n = 0; n < batchSize; ++n) {
for (size_t i = 0; i < numPriors_; ++i) {
if (allMatchIndices_[n][i] == -1) continue;
real* locBufferData =
locBuffer_->getData() + n * numPriors_ * 4 + i * 4;
std::copy(locDiffData + count * 4,
locDiffData + (count + 1) * 4,
locBufferData);
++count;
}
}
CHECK_EQ(count, numMatches_);
}
if (numConf_ >= 1) {
for (size_t i = 0; i < numConf_; ++i)
confProb_->getData()[i * numClasses_ + confGTData_->getData()[i]] -= 1;
for (size_t i = 0; i < numConf_ * numClasses_; ++i)
confProb_->getData()[i] *= (1. / numMatches_);
size_t count = 0;
const real* confProbData = confProb_->getData();
for (size_t n = 0; n < batchSize; ++n) {
for (size_t i = 0; i < numPriors_; ++i) {
if (allMatchIndices_[n][i] == -1) continue;
real* confDiffData = confBuffer_->getData() +
n * numPriors_ * numClasses_ + i * numClasses_;
std::copy(confProbData + count * numClasses_,
confProbData + (count + 1) * numClasses_,
confDiffData);
++count;
}
for (size_t i = 0; i < allNegIndices_[n].size(); ++i) {
int idx = allNegIndices_[n][i];
real* confDiffData = confBuffer_->getData() +
n * numPriors_ * numClasses_ + idx * numClasses_;
std::copy(confProbData + count * numClasses_,
confProbData + (count + 1) * numClasses_,
confDiffData);
++count;
}
}
CHECK_EQ(count, numConf_);
}
if (useGpu_) {
locTmpBuffer_->copyFrom(*locCpuBuffer_);
confTmpBuffer_->copyFrom(*confCpuBuffer_);
locBuffer_ = locTmpBuffer_;
confBuffer_ = confTmpBuffer_;
}
// copy back
size_t locOffset = 0;
size_t confOffset = 0;
auto layerConf = config_.inputs(0).multibox_loss_conf();
for (size_t n = 0; n < inputNum_; ++n) {
const MatrixPtr inLocG = getInputGrad(*getLocInputLayer(n));
const MatrixPtr inConfG = getInputGrad(*getConfInputLayer(n));
size_t height = getInput(*getLocInputLayer(n)).getFrameHeight();
// only for unittest, there are no width and height information
// when constructing matrix in unittest, so we should
// set the shape in configuration
if (!height) height = layerConf.height();
size_t width = getInput(*getLocInputLayer(n)).getFrameWidth();
if (!width) width = layerConf.width();
// NHWC to NCHW
MatrixPtr locGBuffer;
Matrix::resizeOrCreate(
locGBuffer, inLocG->getHeight(), inLocG->getWidth(), false, useGpu_);
MatrixPtr confGBuffer;
Matrix::resizeOrCreate(
confGBuffer, inConfG->getHeight(), inConfG->getWidth(), false, useGpu_);
locOffset += decomposeWithPermute(*locBuffer_,
height,
width,
locSizeSum_,
locOffset,
batchSize,
*locGBuffer,
kNHWCToNCHW);
inLocG->add(*locGBuffer);
confOffset += decomposeWithPermute(*confBuffer_,
height,
width,
confSizeSum_,
confOffset,
batchSize,
*confGBuffer,
kNHWCToNCHW);
inConfG->add(*confGBuffer);
}
CHECK_EQ(locOffset, locSizeSum_ / batchSize);
CHECK_EQ(confOffset, confSizeSum_ / batchSize);
}
} // namespace paddle
/* copyright (c) 2016 paddlepaddle authors. all rights reserve.
licensed under the apache license, version 2.0 (the "license");
you may not use this file except in compliance with the license.
you may obtain a copy of the license at
http://www.apache.org/licenses/license-2.0
unless required by applicable law or agreed to in writing, software
distributed under the license is distributed on an "as is" basis,
without warranties or conditions of any kind, either express or implied.
see the license for the specific language governing permissions and
limitations under the license. */
#pragma once
#include <vector>
#include "CostLayer.h"
#include "DataLayer.h"
#include "DetectionUtil.h"
#include "Layer.h"
using std::vector;
using std::pair;
namespace paddle {
/**
* The multibox loss layer for a SSD detection task.
* The loss is composed by the location loss and the confidence loss.
* The location loss is a smooth L1 loss and the confidence loss is
* a softmax loss.
* - Input: This layer needs four input layers: The first input layer
* is the priorbox layer and the second layer is a label layer.
* The rest two input layers are convolution layers for generating
* bbox location offset and the classification confidence.
* - Output: The Single Shot Multibox Detection loss value.
* Reference:
* Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed,
* Cheng-Yang Fu, Alexander C. Berg. SSD: Single Shot MultiBox Detector
*/
class MultiBoxLossLayer : public CostLayer {
public:
explicit MultiBoxLossLayer(const LayerConfig& config) : CostLayer(config) {}
bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
void forward(PassType passType);
void backward(const UpdateCallback& callback = nullptr);
void forwardImp(Matrix& output, Argument& label, Matrix& cost) {}
void backwardImp(Matrix& outputValue, Argument& label, Matrix& outputGrad) {}
protected:
inline LayerPtr getPriorBoxLayer() { return inputLayers_[0]; }
inline LayerPtr getLabelLayer() { return inputLayers_[1]; }
inline LayerPtr getLocInputLayer(size_t index) {
return inputLayers_[2 + index];
}
inline LayerPtr getConfInputLayer(size_t index) {
return inputLayers_[2 + inputNum_ + index];
}
protected:
size_t numClasses_;
real overlapThreshold_;
real negPosRatio_;
real negOverlap_;
size_t inputNum_;
size_t backgroundId_;
real locLoss_;
real confLoss_;
size_t numPriors_;
size_t numMatches_;
size_t numNegs_;
size_t numConf_;
size_t locSizeSum_;
size_t confSizeSum_;
vector<vector<int>> allMatchIndices_;
vector<vector<int>> allNegIndices_;
MatrixPtr locGTData_;
IVectorPtr confGTData_;
MatrixPtr locBuffer_;
MatrixPtr confBuffer_;
MatrixPtr locDiff_;
MatrixPtr confProb_;
MatrixPtr labelCpuValue_;
MatrixPtr priorCpuValue_;
MatrixPtr locCpuBuffer_;
MatrixPtr confCpuBuffer_;
MatrixPtr locTmpBuffer_;
MatrixPtr confTmpBuffer_;
};
} // namespace paddle
...@@ -56,14 +56,4 @@ bool ResponseNormLayer::init(const LayerMap& layerMap, ...@@ -56,14 +56,4 @@ bool ResponseNormLayer::init(const LayerMap& layerMap,
return true; return true;
} }
bool CrossChannelNormLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
Layer::init(layerMap, parameterMap);
CHECK(parameters_[0]);
const NormConfig& conf = config_.inputs(0).norm_conf();
channels_ = conf.channels();
scale_.reset(new Weight(channels_, 1, parameters_[0]));
return true;
}
} // namespace paddle } // namespace paddle
...@@ -26,10 +26,9 @@ namespace paddle { ...@@ -26,10 +26,9 @@ namespace paddle {
* If SequenceLevel = kNonseq: * If SequenceLevel = kNonseq:
* Output: a sequence containing only the last instance of the input sequence * Output: a sequence containing only the last instance of the input sequence
* If stride_ > 0: * If stride_ > 0:
* Output: a shorten sequence. The operation of getting last instance of a * Output: a shorten sequence. Stride is the step size by which we slide a
* sequence is independently performed on every slice of the input * window upon the input sequence, and getting last instance
* sequence, which is obtained by sliding a window with the window * operation is then applied to each interval independently.
* size set to stride_.
* If SequenceLevel = kSeq: * If SequenceLevel = kSeq:
* Check input sequence must has sub-sequence * Check input sequence must has sub-sequence
* Output: a sequence containing only the last instance of each sub-sequence * Output: a sequence containing only the last instance of each sub-sequence
...@@ -73,8 +72,7 @@ bool SequenceLastInstanceLayer::init(const LayerMap& layerMap, ...@@ -73,8 +72,7 @@ bool SequenceLastInstanceLayer::init(const LayerMap& layerMap,
void SequenceLastInstanceLayer::forward(PassType passType) { void SequenceLastInstanceLayer::forward(PassType passType) {
SequencePoolLayer::forward(passType); SequencePoolLayer::forward(passType);
auto starts = (stride_ > 0) ? stridePositions_->getData() auto starts = startPositions_->getData(false);
: startPositions_->getData(false);
MatrixPtr inputValue = getInputValue(0); MatrixPtr inputValue = getInputValue(0);
MatrixPtr outputValue = getOutputValue(); MatrixPtr outputValue = getOutputValue();
......
...@@ -72,9 +72,8 @@ void SequencePoolLayer::forward(PassType passType) { ...@@ -72,9 +72,8 @@ void SequencePoolLayer::forward(PassType passType) {
if (stride_ > 0) { if (stride_ > 0) {
CHECK_EQ(input.hasSubseq(), 0UL) CHECK_EQ(input.hasSubseq(), 0UL)
<< "sequence stride pooling is invalid for hasSubseq now"; << "sequence stride pooling is invalid for hasSubseq now";
output_.poolSequenceWithStride( output_.poolSequenceWithStride(input, stride_, &startPositions_, reversed_);
input, stride_, &stridePositions_, reversed_); newBatchSize_ = startPositions_->getSize() - 1;
newBatchSize_ = stridePositions_->getSize() - 1;
} }
resetOutput(newBatchSize_, dim); resetOutput(newBatchSize_, dim);
......
...@@ -28,8 +28,9 @@ namespace paddle { ...@@ -28,8 +28,9 @@ namespace paddle {
* sequence}{input[i]} * sequence}{input[i]}
* If stride_ > 0: * If stride_ > 0:
* Check input sequence must not have sub-sequence * Check input sequence must not have sub-sequence
* Output: a shorten sequence, pooling is performed upon a small local * Output: a shorten sequence. Stride is the step size by which we slide
* area * a window upon the input sequence, and the pooling operation
* is then applied to each interval independently.
* If SequenceLevel = kSeq: * If SequenceLevel = kSeq:
* Check input sequence must has sub-sequence * Check input sequence must has sub-sequence
* Output: output size is the number of input sub-sequences * Output: output size is the number of input sub-sequences
...@@ -47,8 +48,6 @@ protected: ...@@ -47,8 +48,6 @@ protected:
size_t newBatchSize_; size_t newBatchSize_;
ICpuGpuVectorPtr startPositions_; ICpuGpuVectorPtr startPositions_;
int stride_; int stride_;
// Store the start position of each window.
IVectorPtr stridePositions_;
// Whether the input sequence is reversed or not. // Whether the input sequence is reversed or not.
bool reversed_ = false; bool reversed_ = false;
......
...@@ -45,6 +45,13 @@ add_unittest_without_exec(test_PriorBox ...@@ -45,6 +45,13 @@ add_unittest_without_exec(test_PriorBox
add_test(NAME test_PriorBox add_test(NAME test_PriorBox
COMMAND test_PriorBox) COMMAND test_PriorBox)
################# test_DetectionOutput #######################
add_unittest_without_exec(test_DetectionOutput
test_DetectionOutput.cpp
LayerGradUtil.cpp)
add_test(NAME test_DetectionOutput
COMMAND test_DetectionOutput)
################# test_ConvUnify ####################### ################# test_ConvUnify #######################
add_unittest_without_exec(test_ConvUnify add_unittest_without_exec(test_ConvUnify
test_ConvUnify.cpp test_ConvUnify.cpp
......
...@@ -241,7 +241,7 @@ void testBatchState(LayerPtr testLayer, ...@@ -241,7 +241,7 @@ void testBatchState(LayerPtr testLayer,
std::vector<Argument> args; std::vector<Argument> args;
args.push_back(out); args.push_back(out);
EXPECT_EQ(0, Argument::sum(args)) << "testBatchState failed"; ASSERT_NEAR(0, Argument::sum(args), 1e-5) << "testBatchState failed";
for (size_t seqId = 0; seqId < numSequences; ++seqId) { for (size_t seqId = 0; seqId < numSequences; ++seqId) {
start[seqId] += seqLens[seqId]; start[seqId] += seqLens[seqId];
} }
...@@ -387,6 +387,31 @@ void initDataLayer(TestConfig testConf, ...@@ -387,6 +387,31 @@ void initDataLayer(TestConfig testConf,
data.value->sigmoid(*data.value); data.value->sigmoid(*data.value);
data.grad->zeroMem(); data.grad->zeroMem();
break; break;
case INPUT_SELF_DEFINE_DATA: {
size_t height = testConf.inputDefs[i].selfDefinedData->getHeight();
size_t width = testConf.inputDefs[i].selfDefinedData->getWidth();
CHECK_GT(static_cast<int>(height), 0);
CHECK_GT(static_cast<int>(width), 0);
data.value = Matrix::create(height, width, false, useGpu);
data.grad = Matrix::create(height, width, false, useGpu);
data.value->copyFrom(*testConf.inputDefs[i].selfDefinedData);
data.grad->zeroMem();
const std::vector<int>& labelSeqStartPositions =
testConf.inputDefs[i].labelSeqStartPositions;
if (labelSeqStartPositions.size() != 0) {
CHECK(!sequenceStartPositions);
CHECK_GE(static_cast<int>(labelSeqStartPositions.size()), 2);
sequenceStartPositions =
ICpuGpuVector::create(labelSeqStartPositions.size(), useGpu);
sequenceStartPositions->copyFrom(labelSeqStartPositions.data(),
labelSeqStartPositions.size(),
useGpu);
data.sequenceStartPositions = sequenceStartPositions;
}
break;
}
default: default:
LOG(FATAL) << " unknown inputType "; LOG(FATAL) << " unknown inputType ";
return; return;
...@@ -440,7 +465,6 @@ void initTestLayer(TestConfig testConf, ...@@ -440,7 +465,6 @@ void initTestLayer(TestConfig testConf,
ParameterConfig paraConfig) { ParameterConfig paraConfig) {
paraConfig.set_name(paraName); paraConfig.set_name(paraName);
paraConfig.set_size(paraSize); paraConfig.set_size(paraSize);
paraConfig.set_initial_std(1);
paraConfig.set_is_static(isStatic); paraConfig.set_is_static(isStatic);
auto para = auto para =
std::make_shared<Parameter>(paraConfig, FLAGS_use_gpu, initialize); std::make_shared<Parameter>(paraConfig, FLAGS_use_gpu, initialize);
...@@ -474,6 +498,9 @@ void initTestLayer(TestConfig testConf, ...@@ -474,6 +498,9 @@ void initTestLayer(TestConfig testConf,
paraConfig.add_dims((*layerMap)[input.input_layer_name()]->getSize()); paraConfig.add_dims((*layerMap)[input.input_layer_name()]->getSize());
paraConfig.add_dims(testConf.layerConfig.size()); paraConfig.add_dims(testConf.layerConfig.size());
} }
CHECK_GE(testConf.paramInitialStd, 0);
paraConfig.set_initial_mean(testConf.paramInitialMean);
paraConfig.set_initial_std(testConf.paramInitialStd);
initParameter(paraName, paraSize, inputDef.isStatic, false, paraConfig); initParameter(paraName, paraSize, inputDef.isStatic, false, paraConfig);
} }
} }
......
...@@ -32,6 +32,7 @@ enum InputType { ...@@ -32,6 +32,7 @@ enum InputType {
INPUT_SPARSE_NON_VALUE_DATA, INPUT_SPARSE_NON_VALUE_DATA,
INPUT_SPARSE_FLOAT_VALUE_DATA, INPUT_SPARSE_FLOAT_VALUE_DATA,
INPUT_DENSE_DIM_DATA, // using sequence length to init dense data INPUT_DENSE_DIM_DATA, // using sequence length to init dense data
INPUT_SELF_DEFINE_DATA, // support customizing for input value
}; };
struct ParaSparse { struct ParaSparse {
...@@ -66,6 +67,7 @@ struct InputDef { ...@@ -66,6 +67,7 @@ struct InputDef {
bool isStatic; bool isStatic;
std::vector<int> labelInitValue; std::vector<int> labelInitValue;
std::vector<int> labelSeqStartPositions; std::vector<int> labelSeqStartPositions;
MatrixPtr selfDefinedData;
InputDef(InputType type, string nameIn, size_t dimIn, size_t sizeIn) { InputDef(InputType type, string nameIn, size_t dimIn, size_t sizeIn) {
inputType = type; inputType = type;
...@@ -76,6 +78,20 @@ struct InputDef { ...@@ -76,6 +78,20 @@ struct InputDef {
isStatic = false; isStatic = false;
} }
InputDef(InputType type,
string nameIn,
MatrixPtr selfDefinedData,
std::vector<int> selfDefinedSeqStartPos = {})
: labelSeqStartPositions(selfDefinedSeqStartPos),
selfDefinedData(selfDefinedData) {
inputType = type;
name = nameIn;
dim = 0;
sparse = {""};
paraSize = 0;
isStatic = false;
}
InputDef(InputType type, InputDef(InputType type,
string nameIn, string nameIn,
size_t dimIn, size_t dimIn,
...@@ -109,12 +125,16 @@ struct TestConfig { ...@@ -109,12 +125,16 @@ struct TestConfig {
LayerConfig layerConfig; LayerConfig layerConfig;
std::vector<InputDef> inputDefs; std::vector<InputDef> inputDefs;
size_t biasSize; size_t biasSize;
real paramInitialMean;
real paramInitialStd;
bool testAccumulate; bool testAccumulate;
bool testState; bool testState;
bool staticBias; bool staticBias;
bool testBatchState; bool testBatchState;
TestConfig() TestConfig()
: biasSize(0), : biasSize(0),
paramInitialMean(0.0),
paramInitialStd(1.0),
testAccumulate(true), testAccumulate(true),
testState(false), testState(false),
staticBias(false), staticBias(false),
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include <string>
#include <vector>
#include "LayerGradUtil.h"
#include "paddle/testing/TestUtil.h"
using namespace paddle; // NOLINT
using namespace std; // NOLINT
// Do one forward pass of priorBox layer and check to see if its output
// matches the given result
void doOneDetectionOutputTest(MatrixPtr& inputLoc,
MatrixPtr& inputConf,
MatrixPtr& inputPriorBox,
size_t feature_map_width,
size_t feature_map_height,
real nms_threshold,
bool use_gpu,
MatrixPtr& result) {
// Setting up the detection output layer
TestConfig configt;
configt.layerConfig.set_type("detection_output");
LayerInputConfig* input = configt.layerConfig.add_inputs();
configt.layerConfig.add_inputs();
configt.layerConfig.add_inputs();
DetectionOutputConfig* detOutput = input->mutable_detection_output_conf();
detOutput->set_width(feature_map_width);
detOutput->set_height(feature_map_height);
detOutput->set_nms_threshold(nms_threshold);
detOutput->set_num_classes(2);
detOutput->set_nms_top_k(20);
detOutput->set_keep_top_k(10);
detOutput->set_background_id(0);
detOutput->set_confidence_threshold(0.01);
detOutput->set_input_num(1);
configt.inputDefs.push_back({INPUT_DATA_TARGET, "priorbox", 32, 0});
configt.inputDefs.push_back({INPUT_DATA, "input_loc", 16, 0});
configt.inputDefs.push_back({INPUT_DATA, "input_conf", 8, 0});
// data layer initialize
std::vector<DataLayerPtr> dataLayers;
LayerMap layerMap;
vector<Argument> datas;
initDataLayer(
configt, &dataLayers, &datas, &layerMap, "priorbox", 1, false, use_gpu);
dataLayers[0]->getOutputValue()->copyFrom(*inputPriorBox);
dataLayers[1]->getOutputValue()->copyFrom(*inputLoc);
dataLayers[2]->getOutputValue()->copyFrom(*inputConf);
// test layer initialize
bool store_FLAGS_use_gpu = FLAGS_use_gpu;
FLAGS_use_gpu = use_gpu;
std::vector<ParameterPtr> parameters;
LayerPtr detectionOutputLayer;
initTestLayer(configt, &layerMap, &parameters, &detectionOutputLayer);
FLAGS_use_gpu = store_FLAGS_use_gpu;
detectionOutputLayer->forward(PASS_GC);
checkMatrixEqual(detectionOutputLayer->getOutputValue(), result);
}
TEST(Layer, detectionOutputLayerFwd) {
bool useGpu = false;
// CPU case 1.
MatrixPtr inputLoc;
MatrixPtr inputConf;
MatrixPtr inputPriorBox;
MatrixPtr result, result2, result3, result4;
real nmsTreshold = 0.01;
real inputLocData[] = {0.1,
0.1,
0.1,
0.1,
0.1,
0.1,
0.1,
0.1,
0.1,
0.1,
0.1,
0.1,
0.1,
0.1,
0.1,
0.1};
real inputConfData[] = {0.1, 0.9, 0.2, 0.8, 0.3, 0.7, 0.4, 0.6};
real inputPriorBoxData[] = {0.1, 0.1, 0.5, 0.5, 0.1, 0.1, 0.2, 0.2,
0.2, 0.2, 0.6, 0.6, 0.1, 0.1, 0.2, 0.2,
0.3, 0.3, 0.7, 0.7, 0.1, 0.1, 0.2, 0.2,
0.4, 0.4, 0.8, 0.8, 0.1, 0.1, 0.2, 0.2};
real resultData[] = {
0, 1, 0.68997443, 0.099959746, 0.099959746, 0.50804031, 0.50804031};
inputLoc = Matrix::create(1, 16, false, useGpu);
inputConf = Matrix::create(1, 8, false, useGpu);
inputPriorBox = Matrix::create(1, 32, false, useGpu);
result = Matrix::create(1, 7, false, useGpu);
inputLoc->setData(inputLocData);
inputConf->setData(inputConfData);
inputPriorBox->setData(inputPriorBoxData);
result->setData(resultData);
doOneDetectionOutputTest(inputLoc,
inputConf,
inputPriorBox,
/* feature_map_width */ 1,
/* feature_map_height */ 1,
nmsTreshold,
useGpu,
result);
// CPU case 2.
nmsTreshold = 0.2;
result2 = Matrix::create(2, 7, false, useGpu);
real resultData2[] = {0,
1,
0.68997443,
0.099959746,
0.099959746,
0.50804031,
0.50804031,
0,
1,
0.59868765,
0.29995975,
0.29995975,
0.70804024,
0.70804024};
result2->setData(resultData2);
doOneDetectionOutputTest(inputLoc,
inputConf,
inputPriorBox,
/* feature_map_width */ 1,
/* feature_map_height */ 1,
nmsTreshold,
useGpu,
result2);
#ifndef PADDLE_ONLY_CPU
// GPU case 1.
useGpu = true;
inputLoc = Matrix::create(1, 16, false, useGpu);
inputConf = Matrix::create(1, 8, false, useGpu);
inputPriorBox = Matrix::create(1, 32, false, useGpu);
inputLoc->copyFrom(inputLocData, 16);
inputConf->copyFrom(inputConfData, 8);
inputPriorBox->copyFrom(inputPriorBoxData, 32);
nmsTreshold = 0.01;
result3 = Matrix::create(1, 7, false, useGpu);
result3->copyFrom(resultData, 7);
doOneDetectionOutputTest(inputLoc,
inputConf,
inputPriorBox,
/* feature_map_width */ 1,
/* feature_map_height */ 1,
nmsTreshold,
useGpu,
result3);
// GPU case 2.
nmsTreshold = 0.2;
result4 = Matrix::create(2, 7, false, useGpu);
result4->copyFrom(resultData2, 14);
doOneDetectionOutputTest(inputLoc,
inputConf,
inputPriorBox,
/* feature_map_width */ 1,
/* feature_map_height */ 1,
nmsTreshold,
useGpu,
result4);
#endif
}
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
initMain(argc, argv);
return RUN_ALL_TESTS();
}
...@@ -891,6 +891,10 @@ void testDegradeLayer(bool hasSubseq, ...@@ -891,6 +891,10 @@ void testDegradeLayer(bool hasSubseq,
TEST(Layer, MaxLayer) { TEST(Layer, MaxLayer) {
testDegradeLayer(false, "max", "non-seq", -1); // seq max to non-seq testDegradeLayer(false, "max", "non-seq", -1); // seq max to non-seq
testDegradeLayer(false,
"max",
"non-seq",
5); // seq max to a shorten seq, stride window = 5
testDegradeLayer(true, "max", "non-seq", -1); // hasSubseq max to non-seq testDegradeLayer(true, "max", "non-seq", -1); // hasSubseq max to non-seq
testDegradeLayer(true, "max", "seq", -1); // hasSubseq max to seq testDegradeLayer(true, "max", "seq", -1); // hasSubseq max to seq
} }
...@@ -914,6 +918,10 @@ TEST(Layer, SequenceLastInstanceLayer) { ...@@ -914,6 +918,10 @@ TEST(Layer, SequenceLastInstanceLayer) {
TEST(Layer, AverageLayer) { TEST(Layer, AverageLayer) {
testDegradeLayer(false, "average", "non-seq", -1); // seq average to non-seq testDegradeLayer(false, "average", "non-seq", -1); // seq average to non-seq
testDegradeLayer(false,
"average",
"non-seq",
5); // seq average to a shorten seq, stride window = 5
testDegradeLayer( testDegradeLayer(
true, "average", "non-seq", -1); // hasSubseq average to non-seq true, "average", "non-seq", -1); // hasSubseq average to non-seq
testDegradeLayer(true, "average", "seq", -1); // hasSubseq average to seq testDegradeLayer(true, "average", "seq", -1); // hasSubseq average to seq
...@@ -1707,6 +1715,8 @@ TEST(Layer, PadLayer) { ...@@ -1707,6 +1715,8 @@ TEST(Layer, PadLayer) {
TEST(Layer, CrossChannelNormLayer) { TEST(Layer, CrossChannelNormLayer) {
TestConfig config; TestConfig config;
config.paramInitialMean = 1.;
config.paramInitialStd = 0.;
config.layerConfig.set_type("norm"); config.layerConfig.set_type("norm");
config.layerConfig.set_size(100); config.layerConfig.set_size(100);
LayerInputConfig* input = config.layerConfig.add_inputs(); LayerInputConfig* input = config.layerConfig.add_inputs();
...@@ -1720,7 +1730,7 @@ TEST(Layer, CrossChannelNormLayer) { ...@@ -1720,7 +1730,7 @@ TEST(Layer, CrossChannelNormLayer) {
config.inputDefs.push_back({INPUT_DATA, "layer_0", 100, 10}); config.inputDefs.push_back({INPUT_DATA, "layer_0", 100, 10});
for (auto useGpu : {false, true}) { for (auto useGpu : {false, true}) {
testLayerGrad(config, "cross-channel-norm", 10, false, useGpu, false, 5); testLayerGrad(config, "cross-channel-norm", 10, false, useGpu, false);
} }
} }
...@@ -1738,6 +1748,70 @@ TEST(Layer, smooth_l1) { ...@@ -1738,6 +1748,70 @@ TEST(Layer, smooth_l1) {
} }
} }
TEST(Layer, multibox_loss) {
TestConfig config;
config.layerConfig.set_type("multibox_loss");
config.biasSize = 0;
LayerInputConfig* input = config.layerConfig.add_inputs();
MultiBoxLossConfig* multiboxLoss = input->mutable_multibox_loss_conf();
multiboxLoss->set_num_classes(21);
multiboxLoss->set_input_num(1);
multiboxLoss->set_overlap_threshold(0.5);
multiboxLoss->set_neg_pos_ratio(3);
multiboxLoss->set_neg_overlap(0.5);
multiboxLoss->set_background_id(0);
multiboxLoss->set_height(3);
multiboxLoss->set_width(3);
size_t gtNum = 1;
MatrixPtr labelValue = Matrix::create(gtNum, 6, false, false);
labelValue->randomizeUniform();
labelValue->add(-0.5);
labelValue->sigmoid(*labelValue);
real* labelData = labelValue->getData();
size_t labelWidth = labelValue->getWidth();
for (size_t i = 0; i < gtNum; ++i) {
*(labelData + i * labelWidth) = std::rand() % 20 + 1;
*(labelData + i * labelWidth + 1) = 0.400259;
*(labelData + i * labelWidth + 2) = 0.377857;
*(labelData + i * labelWidth + 3) = 0.525712;
*(labelData + i * labelWidth + 4) = 0.519368;
}
vector<int> seqStartPositions(gtNum + 1, 0);
for (size_t i = 1; i <= gtNum; ++i) {
seqStartPositions[i] = i;
}
// Ensure at lease one matched bbox
MatrixPtr priorValue = Matrix::create(1, 72, false, false);
priorValue->randomizeUniform();
priorValue->add(-0.5);
priorValue->sigmoid(*priorValue);
real* priorData = priorValue->getData();
*(priorData) = 0.424811;
*(priorData + 1) = 0.397059;
*(priorData + 2) = 0.538905;
*(priorData + 3) = 0.447091;
*(priorData + 4) = 0.425720;
*(priorData + 5) = 0.515228;
*(priorData + 6) = 0.519452;
*(priorData + 7) = 0.591065;
config.inputDefs.push_back(
{INPUT_SELF_DEFINE_DATA, "priorbox", priorValue, {}});
config.inputDefs.push_back(
{INPUT_SELF_DEFINE_DATA, "label", labelValue, seqStartPositions});
config.inputDefs.push_back({INPUT_DATA, "locPred", 36, 0});
config.inputDefs.push_back({INPUT_DATA, "confPred", 189, 0});
config.layerConfig.add_inputs();
config.layerConfig.add_inputs();
config.layerConfig.add_inputs();
for (auto useGpu : {false, true}) {
testLayerGrad(config, "multibox_loss", 1, false, useGpu, false);
}
}
TEST(Layer, TransLayer) { TEST(Layer, TransLayer) {
TestConfig config; TestConfig config;
const int height = 128; const int height = 128;
......
...@@ -561,7 +561,7 @@ void Argument::degradeSequence(const Argument& input) { ...@@ -561,7 +561,7 @@ void Argument::degradeSequence(const Argument& input) {
void Argument::poolSequenceWithStride(const Argument& input, void Argument::poolSequenceWithStride(const Argument& input,
size_t stride, size_t stride,
IVectorPtr* stridePostions, ICpuGpuVectorPtr* stridePostions,
bool reversed) { bool reversed) {
// If input.sequenceStartPositions = [0, 9, 14, 17, 30] and stride = 5, // If input.sequenceStartPositions = [0, 9, 14, 17, 30] and stride = 5,
// then sequenceStartPositions = [0, 2, 3, 4, 7]. // then sequenceStartPositions = [0, 2, 3, 4, 7].
...@@ -598,8 +598,8 @@ void Argument::poolSequenceWithStride(const Argument& input, ...@@ -598,8 +598,8 @@ void Argument::poolSequenceWithStride(const Argument& input,
stridePos.emplace_back(starts[numSequences]); stridePos.emplace_back(starts[numSequences]);
int size = stridePos.size(); int size = stridePos.size();
CHECK_EQ(size - 1, tgtBuf[numSequences]); CHECK_EQ(size - 1, tgtBuf[numSequences]);
IVector::resizeOrCreate(*stridePostions, size, false); ICpuGpuVector::resizeOrCreate(*stridePostions, size, false);
(*stridePostions)->copyFrom(stridePos.data(), size); (*stridePostions)->getMutableVector(false)->copyFrom(stridePos.data(), size);
} }
void Argument::getValueString( void Argument::getValueString(
......
...@@ -299,7 +299,7 @@ struct Argument { ...@@ -299,7 +299,7 @@ struct Argument {
*/ */
void poolSequenceWithStride(const Argument& input, void poolSequenceWithStride(const Argument& input,
size_t stride, size_t stride,
IVectorPtr* stridePositions, ICpuGpuVectorPtr* stridePositions,
bool reversed = false); bool reversed = false);
/** /**
* @brief getValueString will return the argument's output in string. There * @brief getValueString will return the argument's output in string. There
......
...@@ -31,7 +31,7 @@ TEST(Argument, poolSequenceWithStride) { ...@@ -31,7 +31,7 @@ TEST(Argument, poolSequenceWithStride) {
int strideResultReversed[] = {0, 4, 9, 14, 17, 20, 25, 30}; int strideResultReversed[] = {0, 4, 9, 14, 17, 20, 25, 30};
for (auto reversed : {false, true}) { for (auto reversed : {false, true}) {
IVectorPtr stridePositions; ICpuGpuVectorPtr stridePositions;
output.poolSequenceWithStride( output.poolSequenceWithStride(
input, 5 /* stride */, &stridePositions, reversed); input, 5 /* stride */, &stridePositions, reversed);
...@@ -45,7 +45,7 @@ TEST(Argument, poolSequenceWithStride) { ...@@ -45,7 +45,7 @@ TEST(Argument, poolSequenceWithStride) {
CHECK_EQ(stridePositions->getSize(), 8UL); CHECK_EQ(stridePositions->getSize(), 8UL);
auto result = reversed ? strideResultReversed : strideResult; auto result = reversed ? strideResultReversed : strideResult;
for (int i = 0; i < 8; i++) { for (int i = 0; i < 8; i++) {
CHECK_EQ(stridePositions->getData()[i], result[i]); CHECK_EQ(stridePositions->getData(false)[i], result[i]);
} }
} }
} }
......
...@@ -172,53 +172,3 @@ TEST_F(CommonTest, syncThreadPool) { ...@@ -172,53 +172,3 @@ TEST_F(CommonTest, syncThreadPool) {
EXPECT_EQ((int)0, nums[i]); EXPECT_EQ((int)0, nums[i]);
} }
} }
TEST_F(CommonTest, barrierStat) {
const int threadNum = 10;
SyncThreadPool pool(threadNum);
#define TEST_BARRIER_RANDOM(statName, numConnThreads, ...) \
pool.exec([&](int tid, size_t numThreads) { \
struct timeval time; \
gettimeofday(&time, nullptr); \
uint64_t usec = timeToMicroSecond(time); \
std::srand(usec); \
auto value = std::rand() % 100000; \
usleep(value); \
REGISTER_SLOW_NODES_PROBE( \
globalStat, statName, numConnThreads, tid, __VA_ARGS__); \
});
for (auto i = 0; i < 10; i++) {
TEST_BARRIER_RANDOM("synThreadBarrier1", threadNum);
TEST_BARRIER_RANDOM("synThreadBarrier2", threadNum);
}
globalStat.printAllStatus();
globalStat.reset();
for (auto i = 0; i < 10; i++) {
TEST_BARRIER_RANDOM("synThreadBarrier3", threadNum, "tag0");
TEST_BARRIER_RANDOM("synThreadBarrier4", threadNum, "tag1");
}
globalStat.printAllStatus();
globalStat.reset();
// use it to test accurate barrier gap
#define TEST_BARRIER(statName, numConnThreads, ...) \
pool.exec([&](int tid, size_t numThreads) { \
usleep(tid * 10000); \
REGISTER_SLOW_NODES_PROBE( \
globalStat, statName, numConnThreads, tid, __VA_ARGS__); \
});
for (auto i = 0; i < 10; i++) {
TEST_BARRIER("synThreadBarrier3", threadNum, "tag0");
TEST_BARRIER("synThreadBarrier4", threadNum, "tag1");
}
globalStat.printAllStatus();
globalStat.reset();
}
...@@ -15,7 +15,9 @@ limitations under the License. */ ...@@ -15,7 +15,9 @@ limitations under the License. */
#pragma once #pragma once
#include <cublas_v2.h> #include <cublas_v2.h>
#include "paddle/platform/dynamic_loader.h" #include <dlfcn.h>
#include <mutex>
#include "paddle/platform/dynload/dynamic_loader.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
......
...@@ -15,7 +15,9 @@ limitations under the License. */ ...@@ -15,7 +15,9 @@ limitations under the License. */
#pragma once #pragma once
#include <cudnn.h> #include <cudnn.h>
#include "paddle/platform/dynamic_loader.h" #include <dlfcn.h>
#include <mutex>
#include "paddle/platform/dynload/dynamic_loader.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
......
...@@ -15,7 +15,9 @@ limitations under the License. */ ...@@ -15,7 +15,9 @@ limitations under the License. */
#pragma once #pragma once
#include <curand.h> #include <curand.h>
#include "paddle/platform/dynamic_loader.h" #include <dlfcn.h>
#include <mutex>
#include "paddle/platform/dynload/dynamic_loader.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
......
...@@ -12,13 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,13 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "dynamic_loader.h" #include "paddle/platform/dynload/dynamic_loader.h"
#include <dlfcn.h> #include <dlfcn.h>
#include <memory> #include <memory>
#include <mutex> #include <mutex>
#include <string> #include <string>
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "glog/logging.h" #include "glog/logging.h"
#include "paddle/framework/enforce.h"
DEFINE_string(cudnn_dir, "", DEFINE_string(cudnn_dir, "",
"Specify path for loading libcudnn.so. For instance, " "Specify path for loading libcudnn.so. For instance, "
...@@ -72,13 +73,12 @@ static inline void GetDsoHandleFromDefaultPath(std::string& dso_path, ...@@ -72,13 +73,12 @@ static inline void GetDsoHandleFromDefaultPath(std::string& dso_path,
*dso_handle = dlopen(dso_path.c_str(), dynload_flags); *dso_handle = dlopen(dso_path.c_str(), dynload_flags);
if (nullptr == *dso_handle) { if (nullptr == *dso_handle) {
if (dso_path == "libcudnn.dylib") { if (dso_path == "libcudnn.dylib") {
LOG(FATAL) PADDLE_ENFORCE(true,
<< "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n" // NOLINT "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n "
<< "For instance, sudo tar -xzf " "For instance, sudo tar -xzf "
"cudnn-7.5-osx-x64-v5.0-ga.tgz -C " // NOLINT "cudnn-7.5-osx-x64-v5.0-ga.tgz -C /usr/local \n sudo "
<< "/usr/local \n sudo chmod a+r " "chmod a+r /usr/local/cuda/include/cudnn.h "
"/usr/local/cuda/include/cudnn.h " // NOLINT "/usr/local/cuda/lib/libcudnn*");
<< "/usr/local/cuda/lib/libcudnn*";
} }
} }
} }
...@@ -106,22 +106,15 @@ static inline void GetDsoHandleFromSearchPath(const std::string& search_root, ...@@ -106,22 +106,15 @@ static inline void GetDsoHandleFromSearchPath(const std::string& search_root,
GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags); GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags);
} }
} }
PADDLE_ENFORCE(nullptr != *dso_handle,
CHECK(nullptr != *dso_handle) << "Failed to find dynamic library: " << dlPath "Failed to find dynamic library: %s ( %s ) \n Please specify "
<< " (" << dlerror() << ") \n" "its path correctly using following ways: \n Method. set "
<< "Please specify its path correctly using " "environment variable LD_LIBRARY_PATH on Linux or "
"following ways: \n" "DYLD_LIBRARY_PATH on Mac OS. \n For instance, issue command: "
"export LD_LIBRARY_PATH=... \n Note: After Mac OS 10.11, "
<< "Method. set environment variable " "using the DYLD_LIBRARY_PATH is impossible unless System "
"LD_LIBRARY_PATH on Linux or " "Integrity Protection (SIP) is disabled.",
<< "DYLD_LIBRARY_PATH on Mac OS. \n" dlPath, dlerror());
<< "For instance, issue command: export "
"LD_LIBRARY_PATH=... \n"
<< "Note: After Mac OS 10.11, using the "
"DYLD_LIBRARY_PATH is impossible "
<< "unless System Integrity Protection (SIP) "
"is disabled.";
} }
void GetCublasDsoHandle(void** dso_handle) { void GetCublasDsoHandle(void** dso_handle) {
......
...@@ -142,7 +142,7 @@ SocketServer::SocketServer(const std::string &addr, int port, int rdmaCpu) ...@@ -142,7 +142,7 @@ SocketServer::SocketServer(const std::string &addr, int port, int rdmaCpu)
} }
/// trigger to initialize RDMA lib /// trigger to initialize RDMA lib
PCHECK(RdmaClientDaemons::get()) << "initilizate RDMA failed\n"; CHECK(RdmaClientDaemons::get()) << "initilizate RDMA failed\n";
} }
SocketServer::~SocketServer() { SocketServer::~SocketServer() {
...@@ -168,7 +168,7 @@ void SocketServer::tcpServer() { ...@@ -168,7 +168,7 @@ void SocketServer::tcpServer() {
/// First call to socket() function /// First call to socket() function
socket_ = socket(AF_INET, SOCK_STREAM, 0); socket_ = socket(AF_INET, SOCK_STREAM, 0);
PCHECK(socket_ >= 0) << "ERROR opening socket"; CHECK(socket_ >= 0) << "ERROR opening socket";
/// Initialize socket structure /// Initialize socket structure
bzero((char *)&serv_addr, sizeof(serv_addr)); bzero((char *)&serv_addr, sizeof(serv_addr));
...@@ -176,7 +176,7 @@ void SocketServer::tcpServer() { ...@@ -176,7 +176,7 @@ void SocketServer::tcpServer() {
serv_addr.sin_port = htons(port_); serv_addr.sin_port = htons(port_);
if (!addr_.empty()) { if (!addr_.empty()) {
server = gethostbyname(addr_.c_str()); server = gethostbyname(addr_.c_str());
PCHECK(server) << "ERROR, no such host: " << addr_; CHECK(server) << "ERROR, no such host: " << addr_;
bcopy((char *)server->h_addr, bcopy((char *)server->h_addr,
(char *)&serv_addr.sin_addr.s_addr, (char *)&serv_addr.sin_addr.s_addr,
server->h_length); server->h_length);
...@@ -187,7 +187,7 @@ void SocketServer::tcpServer() { ...@@ -187,7 +187,7 @@ void SocketServer::tcpServer() {
setOption(socket_); setOption(socket_);
/// Now bind the host address using bind() call. /// Now bind the host address using bind() call.
PCHECK(bind(socket_, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) >= 0) CHECK(bind(socket_, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) >= 0)
<< "ERROR on binding " << addr_; << "ERROR on binding " << addr_;
/// Now start listening for the clients, here process will /// Now start listening for the clients, here process will
...@@ -201,7 +201,7 @@ void SocketServer::tcpServer() { ...@@ -201,7 +201,7 @@ void SocketServer::tcpServer() {
if (stopping_) { if (stopping_) {
break; break;
} }
PCHECK(newsockfd >= 0) << "ERROR on accept"; CHECK(newsockfd >= 0) << "ERROR on accept";
constexpr int kPeerNameLen = 128; constexpr int kPeerNameLen = 128;
char peerName[kPeerNameLen]; char peerName[kPeerNameLen];
CHECK(inet_ntop(AF_INET, &cli_addr.sin_addr, peerName, kPeerNameLen)); CHECK(inet_ntop(AF_INET, &cli_addr.sin_addr, peerName, kPeerNameLen));
...@@ -227,14 +227,14 @@ void SocketServer::rdmaServer() { ...@@ -227,14 +227,14 @@ void SocketServer::rdmaServer() {
/// First call to socket() function /// First call to socket() function
rdmaSocket_ = rdma::ssocket(rdmaCpu_); rdmaSocket_ = rdma::ssocket(rdmaCpu_);
PCHECK(rdmaSocket_) << "ERROR opening RDMA socket"; CHECK(rdmaSocket_) << "ERROR opening RDMA socket";
PCHECK(rdma::bind(rdmaSocket_, rdmaUri_.c_str()) == 0) CHECK(rdma::bind(rdmaSocket_, rdmaUri_.c_str()) == 0)
<< "ERROR bind RDMA socket"; << "ERROR bind RDMA socket";
/// Now start listening for the clients, here process will /// Now start listening for the clients, here process will
/// go in sleep mode and will wait for the incoming connection /// go in sleep mode and will wait for the incoming connection
PCHECK(rdma::listen(rdmaSocket_) == 0) << "ERROR listen RDMA socket"; CHECK(rdma::listen(rdmaSocket_) == 0) << "ERROR listen RDMA socket";
while (true) { while (true) {
/// Accept actual connection from the client /// Accept actual connection from the client
...@@ -242,7 +242,7 @@ void SocketServer::rdmaServer() { ...@@ -242,7 +242,7 @@ void SocketServer::rdmaServer() {
if (stopping_) { if (stopping_) {
break; break;
} }
PCHECK(newsock) << "ERROR on accept"; CHECK(newsock) << "ERROR on accept";
constexpr int kPeerNameLen = 128; constexpr int kPeerNameLen = 128;
char peerName[kPeerNameLen]; char peerName[kPeerNameLen];
...@@ -290,7 +290,7 @@ RdmaClientDaemons::RdmaClientDaemons() { ...@@ -290,7 +290,7 @@ RdmaClientDaemons::RdmaClientDaemons() {
onlineCpus_ = rdma::numCpus(); onlineCpus_ = rdma::numCpus();
for (auto i = 0; i < onlineCpus_; i++) { for (auto i = 0; i < onlineCpus_; i++) {
socket = rdma::csocket(i); socket = rdma::csocket(i);
PCHECK(socket) << "ERROR open client socket daemon"; CHECK(socket) << "ERROR open client socket daemon";
rdmaClientSocket_.push_back(socket); rdmaClientSocket_.push_back(socket);
} }
...@@ -355,7 +355,7 @@ void SocketClient::TcpClient(const std::string &serverAddr, int serverPort) { ...@@ -355,7 +355,7 @@ void SocketClient::TcpClient(const std::string &serverAddr, int serverPort) {
/// Create a socket point /// Create a socket point
int sockfd = socket(AF_INET, SOCK_STREAM, 0); int sockfd = socket(AF_INET, SOCK_STREAM, 0);
PCHECK(sockfd >= 0) << "ERROR opening socket"; CHECK(sockfd >= 0) << "ERROR opening socket";
#if defined(__OSX__) || defined(__APPLE__) #if defined(__OSX__) || defined(__APPLE__)
server = getipnodebyname(serverAddr.c_str(), AF_INET, AI_DEFAULT, &errRet); server = getipnodebyname(serverAddr.c_str(), AF_INET, AI_DEFAULT, &errRet);
...@@ -396,7 +396,7 @@ void SocketClient::TcpClient(const std::string &serverAddr, int serverPort) { ...@@ -396,7 +396,7 @@ void SocketClient::TcpClient(const std::string &serverAddr, int serverPort) {
} }
std::this_thread::sleep_for(std::chrono::seconds(1)); std::this_thread::sleep_for(std::chrono::seconds(1));
} else { } else {
PCHECK(errno != 0) << "ERROR connecting to " << serverAddr << ":" CHECK(errno != 0) << "ERROR connecting to " << serverAddr << ":"
<< serverPort << "errorno: " << errno; << serverPort << "errorno: " << errno;
} }
} while (errno == ECONNREFUSED); } while (errno == ECONNREFUSED);
...@@ -426,7 +426,7 @@ void SocketClient::RdmaClient(const std::string &serverAddr, int serverPort) { ...@@ -426,7 +426,7 @@ void SocketClient::RdmaClient(const std::string &serverAddr, int serverPort) {
/// connect to server with socket daemon /// connect to server with socket daemon
sock = rdma::connect(socketDaemon_, rdmaUri.c_str()); sock = rdma::connect(socketDaemon_, rdmaUri.c_str());
PCHECK(sock) << "ERROR connect to server" << rdmaUri; CHECK(sock) << "ERROR connect to server" << rdmaUri;
std::vector<std::string> seg; std::vector<std::string> seg;
str::split(rdmaUri, '/', &seg); str::split(rdmaUri, '/', &seg);
......
...@@ -217,10 +217,6 @@ void ParameterServer2::setConfig(const SetConfigRequest& request, ...@@ -217,10 +217,6 @@ void ParameterServer2::setConfig(const SetConfigRequest& request,
SetConfigResponse response; SetConfigResponse response;
callback(response); callback(response);
/// always defined, barrier slowest node function need it.
statSet_.reset(new StatSet("ParameterServer" +
str::to_string(static_cast<int>(serverId_))));
} }
real bufferSum(const std::vector<ParameterServer2::Buffer>& buffers) { real bufferSum(const std::vector<ParameterServer2::Buffer>& buffers) {
...@@ -369,50 +365,7 @@ void ParameterServer2::addGradient(const SendParameterRequest& request, ...@@ -369,50 +365,7 @@ void ParameterServer2::addGradient(const SendParameterRequest& request,
std::vector<Buffer>* outputBuffers) { std::vector<Buffer>* outputBuffers) {
VLOG(1) << "pserver: addGradient"; VLOG(1) << "pserver: addGradient";
// forwardbackward delta from all trainers
// indicate the fluctuation caused by forwardbackward.
if (!numPassFinishClients_) {
REGISTER_BARRIER_DELTA_SERVER_SET(
*statSet_,
"forwardbackwardDelta",
FLAGS_num_gradient_servers,
request.trainer_id(),
request.forwardbackward_time(),
isSparseServer_ ? "_sparseUpdater" : "_denseUpdater");
}
{
/// approximately pure network overhead
REGISTER_TIMER_DYNAMIC_SET(
"pushRecv", timeToMicroSecond(*handleRequestBegin_), -1, *statSet_);
}
#ifndef PADDLE_DISABLE_TIMER
gettimeofday(&(*addGradBegin_), nullptr);
#endif
/// barrier fluctuation caused by network and previous forwardbackward
if (!numPassFinishClients_) {
REGISTER_BARRIER_TIMER_SERVER_SET(
*statSet_,
"handleReqBegin",
FLAGS_num_gradient_servers,
request.trainer_id(),
(*handleRequestBegin_),
isSparseServer_ ? "_sparseUpdater" : "_denseUpdater");
}
if (!numPassFinishClients_) {
REGISTER_BARRIER_TIMER_SERVER(
*statSet_,
"addGradBegin",
FLAGS_num_gradient_servers,
request.trainer_id(),
isSparseServer_ ? "_sparseUpdater" : "_denseUpdater");
}
{ {
REGISTER_TIMER_DYNAMIC("addGradCore", -1, *statSet_);
ReadLockGuard guard(parameterMutex_); ReadLockGuard guard(parameterMutex_);
int bufferIndex = 0; int bufferIndex = 0;
for (const auto& block : request.blocks()) { for (const auto& block : request.blocks()) {
...@@ -444,15 +397,6 @@ void ParameterServer2::addGradient(const SendParameterRequest& request, ...@@ -444,15 +397,6 @@ void ParameterServer2::addGradient(const SendParameterRequest& request,
std::lock_guard<std::mutex> guard(*info.lock); std::lock_guard<std::mutex> guard(*info.lock);
simd::addTo(gradientSumBuffer, gradientBuffer, size); simd::addTo(gradientSumBuffer, gradientBuffer, size);
} }
if (!numPassFinishClients_) {
REGISTER_BARRIER_TIMER_SERVER(
*statSet_,
"addGradCoreFinish",
FLAGS_num_gradient_servers,
request.trainer_id(),
isSparseServer_ ? "_sparseUpdater" : "_denseUpdater");
}
} }
if (request.batch_status() == BATCH_FINISH || if (request.batch_status() == BATCH_FINISH ||
request.batch_status() == BATCH_START_AND_FINISH) { request.batch_status() == BATCH_START_AND_FINISH) {
...@@ -461,47 +405,12 @@ void ParameterServer2::addGradient(const SendParameterRequest& request, ...@@ -461,47 +405,12 @@ void ParameterServer2::addGradient(const SendParameterRequest& request,
VLOG(1) << "num samples: " << numSamplesProcessed_ VLOG(1) << "num samples: " << numSamplesProcessed_
<< ", new cost:" << cost_; << ", new cost:" << cost_;
/// numPassFinishClients_ means some trainer has entered finishPass
if (!numPassFinishClients_) {
REGISTER_SLOW_NODES_PROBE(
*statSet_,
"SLOW_NODES",
FLAGS_num_gradient_servers,
request.trainer_id(),
isSparseServer_ ? "_sparseUpdater" : "_denseUpdater");
}
/// notify doOperation gradient ready /// notify doOperation gradient ready
gradientReadyBarrier_.wait(); gradientReadyBarrier_.wait();
/// if wait pass finish does not start, do check
if (!numPassFinishClients_) {
CHECK_BARRIER_TIMER(*statSet_,
"SLOW_NODES",
FLAGS_num_gradient_servers,
isSparseServer_ ? "_sparseUpdater" : "_denseUpdater");
}
/// barrier performance while all parameter add is finished
/// can indicate the fluctation caused by computation at pserver.
if (!numPassFinishClients_) {
REGISTER_BARRIER_TIMER_SERVER(
*statSet_,
"paraReady",
FLAGS_num_gradient_servers,
request.trainer_id(),
isSparseServer_ ? "_sparseUpdater" : "_denseUpdater");
}
/// wait doOperation finish /// wait doOperation finish
parameterReadyBarrier_.wait(); parameterReadyBarrier_.wait();
VLOG(1) << "start send back"; VLOG(1) << "start send back";
{
/// total time except overhead of network.
REGISTER_TIMER_DYNAMIC_SET("sendParaNoRecvNoSend",
timeToMicroSecond(*addGradBegin_),
-1,
*statSet_);
}
} }
} }
...@@ -543,57 +452,6 @@ bool ParameterServer2::asyncGrdientCommitCheckAndStat( ...@@ -543,57 +452,6 @@ bool ParameterServer2::asyncGrdientCommitCheckAndStat(
return commitGradient; return commitGradient;
} }
void ParameterServer2::printAsyncGradientCommitStatAndReset() {
std::stringstream statFormat;
if (asyncUpdateSteps_) {
statFormat << "async discard gradients stat: " << std::endl;
statFormat << "serverId: " << serverId_
<< " serverType: " << isSparseServer_
<< " total updates: " << asyncUpdateSteps_
<< " discard updates: " << asyncLaggedGradientsNum_
<< " discard ratio: "
<< (real)asyncLaggedGradientsNum_ / (real)asyncUpdateSteps_;
statFormat << std::endl;
statFormat << std::endl;
statFormat << "Async Gradient Update Steps distribution: " << std::endl
<< "Sample: 1:1912(0.00284449) means "
<< "the updates step=1 count 1912 times "
<< "and account for 0.284449% of total updates" << std::endl;
size_t index = 0;
for (const auto& stat : asyncUpdateStat_) {
statFormat << index << ":" << stat << "("
<< (real)stat / (real)asyncUpdateSteps_ << ") ";
index++;
}
statFormat << std::endl;
statFormat << std::endl;
statFormat << "Async Gradient Discard based on trainer_id: " << std::endl
<< "Sample: 2:22(0.0016363) means "
<< "total discarded updates from trainer_id=2 count 22 "
<< "and account for 0.16363% of all updates from trainer_id=2"
<< std::endl;
for (auto i = 0; i < FLAGS_num_gradient_servers; i++) {
real ratio =
(real)asyncTrainerDiscardStat_[i] /
(real)(asyncTrainerCommitStat_[i] + asyncTrainerDiscardStat_[i]);
statFormat << i << ":" << asyncTrainerDiscardStat_[i] << "(" << ratio
<< ")"
<< " ";
}
LOG(INFO) << statFormat.str();
/// reset stat
asyncUpdateSteps_ = 0;
asyncTrainerSteps_.assign(asyncTrainerSteps_.size(), 0);
asyncLaggedGradientsNum_ = 0;
asyncUpdateStat_.assign(asyncUpdateStat_.size(), 0);
asyncTrainerDiscardStat_.assign(asyncTrainerDiscardStat_.size(), 0);
asyncTrainerCommitStat_.assign(asyncTrainerCommitStat_.size(), 0);
}
}
static ThreadLocal<std::vector<bool>> localBlockBitset_; static ThreadLocal<std::vector<bool>> localBlockBitset_;
void ParameterServer2::asyncSGD(const SendParameterRequest& request, void ParameterServer2::asyncSGD(const SendParameterRequest& request,
...@@ -695,7 +553,6 @@ void ParameterServer2::asyncSGD(const SendParameterRequest& request, ...@@ -695,7 +553,6 @@ void ParameterServer2::asyncSGD(const SendParameterRequest& request,
if (request.trainer_id() == 0) { if (request.trainer_id() == 0) {
/// batchId_ is approximately equal to "real batchId_" /// batchId_ is approximately equal to "real batchId_"
batchId_++; batchId_++;
tuningAsyncsgdMidOutput();
} }
} }
...@@ -881,34 +738,6 @@ void ParameterServer2::sendParameter(const SendParameterRequest& request, ...@@ -881,34 +738,6 @@ void ParameterServer2::sendParameter(const SendParameterRequest& request,
} }
(*requestVec_).clear(); (*requestVec_).clear();
(*callbackVec_).clear(); (*callbackVec_).clear();
/// barrier perfromance while all data are send finished.
/// indicates network flucatuation for big message.
if (!numPassFinishClients_) {
REGISTER_BARRIER_TIMER_SERVER(
*statSet_,
"sendParamFinish",
FLAGS_num_gradient_servers,
request.trainer_id(),
isSparseServer_ ? "_sparseUpdater" : "_denseUpdater");
}
/// all time exhausted in parameterServer for big message.
/// it contains network and computation at pserver.
{
/// total time including overhead of network.
REGISTER_TIMER_DYNAMIC_SET("sendParaTotal",
timeToMicroSecond(*handleRequestBegin_),
-1,
*statSet_);
}
/// all time exhausted in pserverServer except recieve network.
{
/// total time except overhead of network receive
REGISTER_TIMER_DYNAMIC_SET("sendParaNoRecv",
timeToMicroSecond(*addGradBegin_),
-1,
*statSet_);
}
} }
break; break;
case PSERVER_UPDATE_MODE_SET_PARAM: case PSERVER_UPDATE_MODE_SET_PARAM:
...@@ -1088,8 +917,6 @@ void ParameterServer2::op_SGD(const Operation& operation, ...@@ -1088,8 +917,6 @@ void ParameterServer2::op_SGD(const Operation& operation,
} }
{ {
REGISTER_TIMER_DYNAMIC("op_SGD", -1, *statSet_);
parallelExecForEachBlock([&](int64_t blockId, const VectorPtr vecs[]) { parallelExecForEachBlock([&](int64_t blockId, const VectorPtr vecs[]) {
BlockInfo& info = blockInfos_[blockId]; BlockInfo& info = blockInfos_[blockId];
const ParameterConfig& config = getParameterConfig(blockId); const ParameterConfig& config = getParameterConfig(blockId);
...@@ -1113,7 +940,6 @@ void ParameterServer2::op_SGD(const Operation& operation, ...@@ -1113,7 +940,6 @@ void ParameterServer2::op_SGD(const Operation& operation,
} }
batchId_++; batchId_++;
tuningSgdMidOutput();
} }
void ParameterServer2::op_start_pass(const Operation& operation, void ParameterServer2::op_start_pass(const Operation& operation,
...@@ -1146,8 +972,6 @@ void ParameterServer2::op_finish_pass(const Operation& operation, ...@@ -1146,8 +972,6 @@ void ParameterServer2::op_finish_pass(const Operation& operation,
/// finish pass /// finish pass
info.optimizer->finishPass(); info.optimizer->finishPass();
}); });
tuningSgdFinished();
batchId_ = 0; batchId_ = 0;
} }
...@@ -1515,7 +1339,6 @@ void ParameterServer2::asyncFinishPass(const SynchronizeRequest& request, ...@@ -1515,7 +1339,6 @@ void ParameterServer2::asyncFinishPass(const SynchronizeRequest& request,
callback(SynchronizeResponse()); callback(SynchronizeResponse());
if (request.trainer_id() == 0) { if (request.trainer_id() == 0) {
tuningAsyncsgdFinished();
batchId_ = 0; batchId_ = 0;
} }
} }
...@@ -1574,42 +1397,4 @@ void ParameterServer2::releaseMatrix(const ReleaseMatrixRequest& request, ...@@ -1574,42 +1397,4 @@ void ParameterServer2::releaseMatrix(const ReleaseMatrixRequest& request,
callback(response); callback(response);
} }
void ParameterServer2::tuningSgdMidOutput() {
if (batchId_ && batchId_ % FLAGS_log_period_server == 0) {
LOG(INFO) << "======== Batch=" << batchId_ << "=======";
statSet_->setThreadInfo(true);
statSet_->printAllStatus();
/// not reset raw data for reducing the overhead of performance tuning
statSet_->reset(false);
}
}
void ParameterServer2::tuningSgdFinished() {
LOG(INFO) << "======== Batch=" << batchId_ << " pass END"
<< "=======";
statSet_->setThreadInfo(true);
statSet_->printAllStatus();
/**
* reset raw data at end of pass since some raw data could be not
* complete. Otherwise the raw data will pollute next pass performance
* tuning
*/
statSet_->reset();
}
void ParameterServer2::tuningAsyncsgdMidOutput() {
#ifndef PADDLE_DISABLE_TIMER
if (batchId_ && batchId_ % FLAGS_log_period_server == 0) {
LOG(INFO) << "======== [not accurate] Batch=" << batchId_ << "=======";
printAsyncGradientCommitStatAndReset();
}
#endif
}
void ParameterServer2::tuningAsyncsgdFinished() {
LOG(INFO) << "======== [not accurate] Batch=" << batchId_ << " pass END"
<< "=======";
printAsyncGradientCommitStatAndReset();
}
} // namespace paddle } // namespace paddle
...@@ -298,24 +298,6 @@ protected: ...@@ -298,24 +298,6 @@ protected:
/// barrier performance tuning sync-sgd required /// barrier performance tuning sync-sgd required
std::atomic<int64_t> batchId_; std::atomic<int64_t> batchId_;
/// the beginning of addGradient without network overhead
ThreadLocal<struct timeval> addGradBegin_;
/**
* tuning barrier performance
* to better control log for sparse and dense parameter,
* we use different log entities for different parameterServer
* objects.
* it will output lots of performance stats to perceive the
* overhead of network, fluctuation of computation from
* forwardbackward and network, computation from optimization
* at pserver end, barrier overhead, etc. to understand tuning
* data, focus on the synchronization between addGradient and
* doOperation which indirectly call op_SGD operation controlled
* by remote updater controller
*/
std::unique_ptr<StatSet> statSet_;
public: public:
struct Buffer { struct Buffer {
real* base; real* base;
...@@ -325,7 +307,6 @@ public: ...@@ -325,7 +307,6 @@ public:
protected: protected:
/// async gradient commit control /// async gradient commit control
bool asyncGrdientCommitCheckAndStat(const SendParameterRequest& request); bool asyncGrdientCommitCheckAndStat(const SendParameterRequest& request);
void printAsyncGradientCommitStatAndReset();
public: public:
/// disable default parameter for overloading /// disable default parameter for overloading
...@@ -710,36 +691,6 @@ public: ...@@ -710,36 +691,6 @@ public:
void op_load(const Operation& operation, OperationResult* result); void op_load(const Operation& operation, OperationResult* result);
void op_save(const Operation& operation, OperationResult* result); void op_save(const Operation& operation, OperationResult* result);
/**
* @brief output log in at the middle stage of training
*
* @note flush log histroy and state at the end for sgd
*/
void tuningSgdMidOutput();
/**
* @brief output log in at the end stage of training
*
* @note flush log histroy and state at the end for sgd. it will also
* flush some stateful stat for next pass.
*/
void tuningSgdFinished();
/**
* @brief output log in at the middle stage of training
*
* @note flush log histroy and state at the end for async-sgd.
* it will log some performance log if some lagged node are found
*/
void tuningAsyncsgdMidOutput();
/**
* @brief output log in at the end stage of training
*
* @note flush log histroy and state at the end for async-sgd.
*/
void tuningAsyncsgdFinished();
}; };
} // namespace paddle } // namespace paddle
...@@ -51,7 +51,7 @@ size_t SocketChannel::read(void* buf, size_t size) { ...@@ -51,7 +51,7 @@ size_t SocketChannel::read(void* buf, size_t size) {
else else
len = rdma::read(rdmaSocket_, (char*)buf + total, size - total); len = rdma::read(rdmaSocket_, (char*)buf + total, size - total);
PCHECK(len >= 0) << " peer=" << peerName_; CHECK(len >= 0) << " peer=" << peerName_;
if (len <= 0) { if (len <= 0) {
return total; return total;
} }
...@@ -69,7 +69,7 @@ size_t SocketChannel::write(const void* buf, size_t size) { ...@@ -69,7 +69,7 @@ size_t SocketChannel::write(const void* buf, size_t size) {
else else
len = rdma::write(rdmaSocket_, (char*)buf + total, size - total); len = rdma::write(rdmaSocket_, (char*)buf + total, size - total);
PCHECK(len >= 0) << " peer=" << peerName_; CHECK(len >= 0) << " peer=" << peerName_;
if (len <= 0) { if (len <= 0) {
return total; return total;
} }
...@@ -98,7 +98,7 @@ static size_t readwritev(IOFunc iofunc, ...@@ -98,7 +98,7 @@ static size_t readwritev(IOFunc iofunc,
while (size < total) { while (size < total) {
ssize_t len = ssize_t len =
iofunc(socket, &iovs[curIov], std::min(iovcnt - curIov, maxiovs)); iofunc(socket, &iovs[curIov], std::min(iovcnt - curIov, maxiovs));
PCHECK(len > 0) << " peer=" << peerName << " curIov=" << curIov CHECK(len > 0) << " peer=" << peerName << " curIov=" << curIov
<< " iovCnt=" << iovcnt << " iovCnt=" << iovcnt
<< " iovs[curIov].base=" << iovs[curIov].iov_base << " iovs[curIov].base=" << iovs[curIov].iov_base
<< " iovs[curIov].iov_len=" << iovs[curIov].iov_len; << " iovs[curIov].iov_len=" << iovs[curIov].iov_len;
...@@ -183,7 +183,7 @@ void SocketChannel::writeMessage(const std::vector<struct iovec>& userIovs) { ...@@ -183,7 +183,7 @@ void SocketChannel::writeMessage(const std::vector<struct iovec>& userIovs) {
header.totalLength += iov.iov_len; header.totalLength += iov.iov_len;
} }
PCHECK(writev(iovs) == (size_t)header.totalLength); CHECK(writev(iovs) == (size_t)header.totalLength);
} }
std::unique_ptr<MsgReader> SocketChannel::readMessage() { std::unique_ptr<MsgReader> SocketChannel::readMessage() {
...@@ -194,7 +194,7 @@ std::unique_ptr<MsgReader> SocketChannel::readMessage() { ...@@ -194,7 +194,7 @@ std::unique_ptr<MsgReader> SocketChannel::readMessage() {
return nullptr; return nullptr;
} }
PCHECK(len == sizeof(header)); CHECK(len == sizeof(header));
std::unique_ptr<MsgReader> msgReader(new MsgReader(this, header.numIovs)); std::unique_ptr<MsgReader> msgReader(new MsgReader(this, header.numIovs));
...@@ -209,7 +209,7 @@ std::unique_ptr<MsgReader> SocketChannel::readMessage() { ...@@ -209,7 +209,7 @@ std::unique_ptr<MsgReader> SocketChannel::readMessage() {
MsgReader::MsgReader(SocketChannel* channel, size_t numBlocks) MsgReader::MsgReader(SocketChannel* channel, size_t numBlocks)
: channel_(channel), blockLengths_(numBlocks), currentBlockIndex_(0) { : channel_(channel), blockLengths_(numBlocks), currentBlockIndex_(0) {
size_t size = numBlocks * sizeof(blockLengths_[0]); size_t size = numBlocks * sizeof(blockLengths_[0]);
PCHECK(channel_->read(&blockLengths_[0], size) == size); CHECK(channel_->read(&blockLengths_[0], size) == size);
} }
void MsgReader::readBlocks(const std::vector<void*>& bufs) { void MsgReader::readBlocks(const std::vector<void*>& bufs) {
...@@ -223,12 +223,12 @@ void MsgReader::readBlocks(const std::vector<void*>& bufs) { ...@@ -223,12 +223,12 @@ void MsgReader::readBlocks(const std::vector<void*>& bufs) {
++currentBlockIndex_; ++currentBlockIndex_;
} }
PCHECK(channel_->readv(&iovs) == totalLength); CHECK(channel_->readv(&iovs) == totalLength);
} }
void MsgReader::readNextBlock(void* buf) { void MsgReader::readNextBlock(void* buf) {
CHECK_LT(currentBlockIndex_, blockLengths_.size()); CHECK_LT(currentBlockIndex_, blockLengths_.size());
PCHECK(channel_->read(buf, getNextBlockLength()) == getNextBlockLength()); CHECK(channel_->read(buf, getNextBlockLength()) == getNextBlockLength());
++currentBlockIndex_; ++currentBlockIndex_;
} }
......
...@@ -113,7 +113,7 @@ void SocketServer::run() { ...@@ -113,7 +113,7 @@ void SocketServer::run() {
/* First call to socket() function */ /* First call to socket() function */
socket_ = socket(AF_INET, SOCK_STREAM, 0); socket_ = socket(AF_INET, SOCK_STREAM, 0);
PCHECK(socket_ >= 0) << "ERROR opening socket"; CHECK(socket_ >= 0) << "ERROR opening socket";
/* Initialize socket structure */ /* Initialize socket structure */
bzero((char*)&serv_addr, sizeof(serv_addr)); bzero((char*)&serv_addr, sizeof(serv_addr));
...@@ -122,7 +122,7 @@ void SocketServer::run() { ...@@ -122,7 +122,7 @@ void SocketServer::run() {
serv_addr.sin_port = htons(port_); serv_addr.sin_port = htons(port_);
/* Now bind the host address using bind() call.*/ /* Now bind the host address using bind() call.*/
PCHECK(bind(socket_, (struct sockaddr*)&serv_addr, sizeof(serv_addr)) >= 0) CHECK(bind(socket_, (struct sockaddr*)&serv_addr, sizeof(serv_addr)) >= 0)
<< "ERROR on binding"; << "ERROR on binding";
/* Now start listening for the clients, here process will /* Now start listening for the clients, here process will
...@@ -134,7 +134,7 @@ void SocketServer::run() { ...@@ -134,7 +134,7 @@ void SocketServer::run() {
while (true) { while (true) {
/* Accept actual connection from the client */ /* Accept actual connection from the client */
newsockfd = accept(socket_, (struct sockaddr*)&cli_addr, &clilen); newsockfd = accept(socket_, (struct sockaddr*)&cli_addr, &clilen);
PCHECK(newsockfd >= 0) << "ERROR on accept"; CHECK(newsockfd >= 0) << "ERROR on accept";
SocketWorker* worker = new SocketWorker(newsockfd); SocketWorker* worker = new SocketWorker(newsockfd);
worker->start(); worker->start();
...@@ -146,17 +146,17 @@ void SocketWorker::run() { ...@@ -146,17 +146,17 @@ void SocketWorker::run() {
while (true) { while (true) {
int64_t n = channel_.readAll(&header, sizeof(header)); int64_t n = channel_.readAll(&header, sizeof(header));
PCHECK(n == sizeof(header)) << "ERROR reading from socket"; CHECK(n == sizeof(header)) << "ERROR reading from socket";
buffer_.resize(header.dataLength); buffer_.resize(header.dataLength);
n = channel_.readAll(&buffer_[0], header.dataLength); n = channel_.readAll(&buffer_[0], header.dataLength);
PCHECK(n == header.dataLength) << "ERROR reading from socket"; CHECK(n == header.dataLength) << "ERROR reading from socket";
/* Write a response to the client */ /* Write a response to the client */
n = channel_.writeAll(&header, sizeof(header)); n = channel_.writeAll(&header, sizeof(header));
PCHECK(n == sizeof(header)) << "ERROR reading from socket"; CHECK(n == sizeof(header)) << "ERROR reading from socket";
n = channel_.writeAll(buffer_.data(), buffer_.size()); n = channel_.writeAll(buffer_.data(), buffer_.size());
PCHECK(n == header.dataLength) << "ERROR writing to socket"; CHECK(n == header.dataLength) << "ERROR writing to socket";
} }
} }
...@@ -177,9 +177,9 @@ SocketClient::SocketClient(const std::string& serverAddr, int serverPort) { ...@@ -177,9 +177,9 @@ SocketClient::SocketClient(const std::string& serverAddr, int serverPort) {
/* Create a socket point */ /* Create a socket point */
int sockfd = socket(AF_INET, SOCK_STREAM, 0); int sockfd = socket(AF_INET, SOCK_STREAM, 0);
PCHECK(sockfd >= 0) << "ERROR opening socket"; CHECK(sockfd >= 0) << "ERROR opening socket";
server = gethostbyname(serverAddr.c_str()); server = gethostbyname(serverAddr.c_str());
PCHECK(server) << "ERROR, no such host: " << serverAddr; CHECK(server) << "ERROR, no such host: " << serverAddr;
bzero((char*)&serv_addr, sizeof(serv_addr)); bzero((char*)&serv_addr, sizeof(serv_addr));
serv_addr.sin_family = AF_INET; serv_addr.sin_family = AF_INET;
...@@ -189,7 +189,7 @@ SocketClient::SocketClient(const std::string& serverAddr, int serverPort) { ...@@ -189,7 +189,7 @@ SocketClient::SocketClient(const std::string& serverAddr, int serverPort) {
serv_addr.sin_port = htons(serverPort); serv_addr.sin_port = htons(serverPort);
/* Now connect to the server */ /* Now connect to the server */
PCHECK(connect(sockfd, (sockaddr*)&serv_addr, sizeof(serv_addr)) >= 0) CHECK(connect(sockfd, (sockaddr*)&serv_addr, sizeof(serv_addr)) >= 0)
<< "ERROR connecting"; << "ERROR connecting";
channel_.reset(new SocketChannel(sockfd)); channel_.reset(new SocketChannel(sockfd));
...@@ -234,18 +234,18 @@ int main(int argc, char** argv) { ...@@ -234,18 +234,18 @@ int main(int argc, char** argv) {
cpuGrad.copyFrom(gpuGrad); cpuGrad.copyFrom(gpuGrad);
header.dataLength = dataSize; header.dataLength = dataSize;
PCHECK(channel->writeAll(&header, sizeof(header)) == sizeof(header)) CHECK(channel->writeAll(&header, sizeof(header)) == sizeof(header))
<< "Client write header error"; << "Client write header error";
PCHECK(channel->writeAll(cpuGrad.getData(), dataSize) == dataSize) CHECK(channel->writeAll(cpuGrad.getData(), dataSize) == dataSize)
<< "Client write data error"; << "Client write data error";
/* Now read server response */ /* Now read server response */
PCHECK(channel->readAll(&header, sizeof(header)) == sizeof(header)) CHECK(channel->readAll(&header, sizeof(header)) == sizeof(header))
<< "Client read header error"; << "Client read header error";
CHECK_EQ((uint64_t)header.dataLength, dataSize); CHECK_EQ((uint64_t)header.dataLength, dataSize);
PCHECK(channel->readAll(cpuParam.getData(), dataSize) == dataSize) CHECK(channel->readAll(cpuParam.getData(), dataSize) == dataSize)
<< "Client read data error"; << "Client read data error";
gpuParam.copyFrom(cpuParam); gpuParam.copyFrom(cpuParam);
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
set -xe set -xe
# Set BASE_IMAGE according to env variables # Set BASE_IMAGE according to env variables
if [ ${WITH_GPU} == "ON" ]; then if [[ ${WITH_GPU} == "ON" ]]; then
BASE_IMAGE="nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04" BASE_IMAGE="nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04"
else else
BASE_IMAGE="ubuntu:16.04" BASE_IMAGE="ubuntu:16.04"
...@@ -78,7 +78,7 @@ paddle version ...@@ -78,7 +78,7 @@ paddle version
# PaddlePaddle. This awkwardness is due to # PaddlePaddle. This awkwardness is due to
# https://github.com/PaddlePaddle/Paddle/issues/1854. It also # https://github.com/PaddlePaddle/Paddle/issues/1854. It also
# describes a solution. # describes a solution.
if [ ${WITH_DOC} == "ON" ]; then if [[ ${WITH_DOC} == "ON" ]]; then
cat <<EOF cat <<EOF
======================================== ========================================
Building documentation ... Building documentation ...
......
...@@ -5,13 +5,14 @@ set -e ...@@ -5,13 +5,14 @@ set -e
mkdir -p $TRAVIS_BUILD_DIR/build mkdir -p $TRAVIS_BUILD_DIR/build
cd $TRAVIS_BUILD_DIR/build cd $TRAVIS_BUILD_DIR/build
# Compile Documentation only. # Compile paddle binaries first
cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=OFF -DWITH_STYLE_CHECK=OFF cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=OFF -DWITH_GOLANG=ON -DWITH_STYLE_CHECK=OFF
mkdir output mkdir output
make -j `nproc` make -j `nproc`
find .. -name '*whl' | xargs pip install # install all wheels. find .. -name '*whl' | xargs pip install # install all wheels.
rm -rf * rm -rf *
# Compile Documentation only.
cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=ON cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=ON
make -j `nproc` paddle_docs paddle_docs_cn make -j `nproc` paddle_docs paddle_docs_cn
......
...@@ -175,7 +175,7 @@ real Tester::forwardOneBatch(const DataBatch& dataBatch, ...@@ -175,7 +175,7 @@ real Tester::forwardOneBatch(const DataBatch& dataBatch,
} }
hl_stream_synchronize(HPPL_STREAM_DEFAULT); hl_stream_synchronize(HPPL_STREAM_DEFAULT);
FILE* fp = fopen(featFile.c_str(), "ab+"); FILE* fp = fopen(featFile.c_str(), "ab+");
PCHECK(!ferror(fp)) << "Fail to open " << featFile; CHECK(!ferror(fp)) << "Fail to open " << featFile;
size_t sampleNum = featMatrices[0]->getHeight(); size_t sampleNum = featMatrices[0]->getHeight();
for (size_t i = 0; i < sampleNum; ++i) { for (size_t i = 0; i < sampleNum; ++i) {
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/utils/BarrierStat.h"
#include <string.h>
#include <sys/types.h>
#include <algorithm>
#include <iomanip>
#include "paddle/utils/Flags.h"
#include "paddle/utils/Stat.h"
DEFINE_bool(log_barrier_abstract,
true,
"if true, show abstract of barrier performance");
DEFINE_int32(log_barrier_lowest_nodes,
5,
"how many lowest node will be logged");
DEFINE_bool(log_barrier_show_log,
false, // for performance tuning insight
"if true, always show barrier abstract even with little gap");
namespace paddle {
std::ostream &operator<<(std::ostream &output, const BarrierStatBase &stat) {
if (FLAGS_log_barrier_abstract) {
std::lock_guard<std::mutex> guard(stat.lock_);
stat.showAbstract(output);
}
return output;
}
BarrierStatBase::BarrierStatBase(uint16_t numConnThreads,
const std::string &name)
: totSamples_(0), numConnThreads_(numConnThreads), name_(name) {
abstract_.resize(numConnThreads_);
if (FLAGS_log_barrier_show_log) {
rateThreshold_ = 0.0;
} else {
/* probablity of abnormal node
* p = 1/n + (n/8)/(n+1), n = nodes, n > 1
* if the freq of lowest trainerId larger than p,
* output FLAGS_log_barrier_lowest_nodes lastTrainerId.
* numConnThreads_ indicates nodes
*/
float n = (float)numConnThreads;
rateThreshold_ = 1.0 / n + (n / 8.0) / (n + 1.0);
}
}
BarrierEndStat::BarrierEndStat(uint16_t numConnThreads, const std::string &name)
: BarrierStatBase(numConnThreads, name) {
timeVector_.reset(new TimeVectorEnd(numConnThreads_));
reset(true);
LOG(INFO) << " create barrierEndStat: " << name
<< " endBarrier warning rate: " << rateThreshold_;
}
/*
* Note:
* the design different pserver entity owns different statSet to obey
* the background that different pserver runs separately.
*/
void BarrierEndStat::updateStat(struct timeval &cur, int32_t trainerId) {
CHECK_LT(trainerId, numConnThreads_) << "trainerId is invalid in barrier";
std::lock_guard<std::mutex> guard(lock_);
timeVector_->addTimeval(cur, trainerId);
if (timeVector_->full()) {
std::lock_guard<std::mutex> abstractGuard(abstractLock_);
auto id = timeVector_->getLastTrainerId();
auto delta = timeToMicroSecond(timeVector_->getDelta());
auto secondDelta = timeToMicroSecond(timeVector_->get1NDelta());
auto lastTwoDelta = timeToMicroSecond(timeVector_->getMinus1NDelta());
auto midDelta = timeToMicroSecond(timeVector_->getMidNDelta());
// discard first sample, since first sample probably is abnormal.
if (totSamples_) {
abstract_[id].freq++;
if (delta < abstract_[id].minDelta) {
abstract_[id].minDelta = delta;
}
if (delta > abstract_[id].maxDelta) {
abstract_[id].maxDelta = delta;
}
abstract_[id].totDelta += delta;
abstract_[id].totSecondDelta += secondDelta;
abstract_[id].totLastTwoDelta += lastTwoDelta;
abstract_[id].totMidDelta += midDelta;
// update totAbstract_
totAbstract_.freq++;
if (delta < totAbstract_.minDelta) {
totAbstract_.minDelta = delta;
}
if (delta > totAbstract_.maxDelta) {
totAbstract_.maxDelta = delta;
}
totAbstract_.totDelta += delta;
totAbstract_.totSecondDelta += secondDelta;
totAbstract_.totLastTwoDelta += lastTwoDelta;
totAbstract_.totMidDelta += midDelta;
}
totSamples_++;
timeVector_->reset();
}
}
void BarrierEndStat::reset(bool clearRawData) {
int32_t i = 0;
totSamples_ = 0;
std::lock_guard<std::mutex> guard(abstractLock_);
if (clearRawData) {
timeVector_->reset();
}
for (auto &abstract : abstract_) {
memset((void *)&abstract, 0, sizeof(abstract));
abstract.minDelta = UINT64_MAX;
abstract.trainerId = i++;
}
memset((void *)&totAbstract_, 0, sizeof(Abstract));
totAbstract_.minDelta = UINT64_MAX;
}
void BarrierEndStat::showAbstract(std::ostream &output) const {
// do not support the case "<=2 pserver"
if (numConnThreads_ <= 2 || !totSamples_) {
return;
}
// duplicate freq info
std::vector<struct Abstract> outputAbstract = abstract_;
std::sort(outputAbstract.begin(),
outputAbstract.end(),
[](const struct Abstract &a, const struct Abstract &b) {
return a.freq > b.freq;
});
auto rate = (float)outputAbstract[0].freq / (float)totSamples_;
if (rate < rateThreshold_) {
return;
}
output << std::setw(20) << name_ << std::endl;
/*
* Note:
* avgGap: the average delta between 1 -- n arriving trainers
* avgSecondGap: the average delta between 2 -- n arriving trainers
* avgLastTwoGap: the average delta between n-1 -- n arriving trainers
* avgMidGap: the average delta between n/2 -- n arriving trainers
* rato: samples / totSamples
*
* the stat is based on per trainer if trainer_id is set, totAbstract is
* stat based on all trainers scope.
*/
output << std::setw(42) << " " << std::setw(15) << "trainerId"
<< std::setw(15) << "avgGap" << std::setw(15) << "avgSecondGap"
<< std::setw(15) << "avgLastTwoGap" << std::setw(15) << "avgMidGap"
<< std::setw(10) << "rate" << std::setw(10) << "samples"
<< std::setw(10) << "totSamples" << std::endl;
// show totAbstract, it's valuable when lastTrainerId is even-distributed'
if (!totAbstract_.freq) return;
output << std::setw(42) << " " << std::setw(15) << "totAbstract"
<< std::setw(15) << (totAbstract_.totDelta / totAbstract_.freq) * 0.001
<< std::setw(15)
<< (totAbstract_.totSecondDelta / totAbstract_.freq) * 0.001
<< std::setw(15)
<< (totAbstract_.totLastTwoDelta / totAbstract_.freq) * 0.001
<< std::setw(15)
<< (totAbstract_.totMidDelta / totAbstract_.freq) * 0.001
<< std::setw(10) << (float)totAbstract_.freq / (float)totSamples_
<< std::setw(10) << (float)totAbstract_.freq << std::setw(10)
<< (float)totSamples_ << std::endl;
// show lastTrainerId abstract
int count = 0;
for (auto &abstract : outputAbstract) {
if (!abstract.freq || count++ >= FLAGS_log_barrier_lowest_nodes) {
break;
}
// output format control
output << std::setw(42) << " " << std::setw(15) << abstract.trainerId
<< std::setw(15) << (abstract.totDelta / abstract.freq) * 0.001
<< std::setw(15) << (abstract.totSecondDelta / abstract.freq) * 0.001
<< std::setw(15)
<< (abstract.totLastTwoDelta / abstract.freq) * 0.001
<< std::setw(15) << (abstract.totMidDelta / abstract.freq) * 0.001
<< std::setw(10) << (float)abstract.freq / (float)totSamples_
<< std::setw(10) << (float)abstract.freq << std::setw(10)
<< (float)totSamples_ << std::endl;
}
}
BarrierDeltaStat::BarrierDeltaStat(uint16_t numConnThreads,
const std::string &name)
: BarrierStatBase(numConnThreads, name) {
timeVector_.reset(new TimeVectorDelta(numConnThreads_));
reset(true);
LOG(INFO) << " create barrierDeltaStat: " << name
<< " barrierDelta warning rate: " << rateThreshold_;
}
void BarrierDeltaStat::updateStat(uint64_t delta, int32_t trainerId) {
CHECK_LT(trainerId, numConnThreads_) << "trainerId is invalid in barrier";
std::lock_guard<std::mutex> guard(lock_);
timeVector_->addTimeval(delta, trainerId);
if (timeVector_->full()) {
std::lock_guard<std::mutex> abstractGuard(abstractLock_);
auto id = timeVector_->getMaxTrainerId();
auto delta = timeVector_->getDelta();
// discard first sample, since first sample probably is abnormal.
if (totSamples_) {
abstract_[id].freq++;
if (delta < abstract_[id].minDelta) {
abstract_[id].minDelta = delta;
}
if (delta > abstract_[id].maxDelta) {
abstract_[id].maxDelta = delta;
}
abstract_[id].totDelta += delta;
// update totAbstract_
totAbstract_.freq++;
if (delta < totAbstract_.minDelta) {
totAbstract_.minDelta = delta;
}
if (delta > totAbstract_.maxDelta) {
totAbstract_.maxDelta = delta;
}
totAbstract_.totDelta += delta;
}
totSamples_++;
timeVector_->reset();
}
}
void BarrierDeltaStat::reset(bool clearRawData) {
int32_t i = 0;
totSamples_ = 0;
std::lock_guard<std::mutex> guard(abstractLock_);
if (clearRawData) {
timeVector_->reset();
}
for (auto &abstract : abstract_) {
memset((void *)&abstract, 0, sizeof(abstract));
abstract.minDelta = UINT64_MAX;
abstract.trainerId = i++;
}
memset((void *)&totAbstract_, 0, sizeof(Abstract));
totAbstract_.minDelta = UINT64_MAX;
}
void BarrierDeltaStat::showAbstract(std::ostream &output) const {
// do not support the case "<=2 pserver"
if (numConnThreads_ <= 2 || !totSamples_) {
return;
}
// duplicate freq info
std::vector<struct Abstract> outputAbstract = abstract_;
std::sort(outputAbstract.begin(),
outputAbstract.end(),
[](const struct Abstract &a, const struct Abstract &b) {
return a.freq > b.freq;
});
auto rate = (float)outputAbstract[0].freq / (float)totSamples_;
if (rate < rateThreshold_) {
return;
}
output << std::setw(20) << name_ << std::endl;
/* Note:
* Gap means the delta from all trainers' forwardbackward
* avgGap: average Gap in log_period batches
* minGap: min Gap in log_period batches
* maxGap: max Gap in log_period batches
* trainerId: the slowest trainer_id
*
* the stat is based on per trainer if trainer_id is set, totAbstract is
* stat based on all trainers scope.
*/
output << std::setw(42) << " " << std::setw(15) << "trainerId"
<< std::setw(15) << "avgGap" << std::setw(10) << "minGap"
<< std::setw(10) << "maxGap" << std::setw(10) << "rate"
<< std::setw(10) << "samples" << std::setw(10) << "totSamples"
<< std::endl;
// show totAbstract, it's valuable when lastTrainerId is even-distributed'
if (!totAbstract_.freq) return;
output << std::setw(42) << " " << std::setw(15) << "totAbstract"
<< std::setw(15) << (totAbstract_.totDelta / totAbstract_.freq) * 0.001
<< std::setw(10) << totAbstract_.minDelta * 0.001 << std::setw(10)
<< totAbstract_.maxDelta * 0.001 << std::setw(10)
<< (float)totAbstract_.freq / (float)totSamples_ << std::setw(10)
<< (float)totAbstract_.freq << std::setw(10) << (float)totSamples_
<< std::endl;
// show lastTrainerId abstract
int count = 0;
for (auto &abstract : outputAbstract) {
if (!abstract.freq || count++ >= FLAGS_log_barrier_lowest_nodes) {
break;
}
// output format control
output << std::setw(42) << " " << std::setw(15) << abstract.trainerId
<< std::setw(15) << (abstract.totDelta / abstract.freq) * 0.001
<< std::setw(10) << abstract.minDelta * 0.001 << std::setw(10)
<< abstract.maxDelta * 0.001 << std::setw(10)
<< (float)abstract.freq / (float)totSamples_ << std::setw(10)
<< (float)abstract.freq << std::setw(10) << (float)totSamples_
<< std::endl;
}
}
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stdint.h>
#include <sys/time.h>
#include <iostream>
#include <list>
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>
#include "Locks.h"
#include "Logging.h"
#include "ThreadLocal.h"
namespace paddle {
inline uint64_t timeToMicroSecond(struct timeval time) {
return time.tv_sec * 1000000LU + time.tv_usec;
}
class TimeVectorEnd {
/*
* help class for gathering all barrier performance data
* which shows time point property.
* freqently used in barrier performance tuning API, such
* as tuning which is slowest node in sync-sgd mode training.
*/
public:
explicit TimeVectorEnd(uint16_t size) : size_(size) {
index_ = 0;
timeArray_.resize(size);
trainerIds_.resize(size);
}
~TimeVectorEnd() {}
uint16_t size() { return size_; }
bool full() { return index_ == size_; }
bool empty() { return index_ == 0; }
void reset() { index_ = 0; }
void addTimeval(struct timeval time, int32_t trainerId) {
timeArray_[index_] = time;
trainerIds_[index_] = trainerId;
index_++;
}
struct timeval getDelta() const {
struct timeval delta;
CHECK_GT(size_, 1) << "not support with 1 pserver";
timersub(&timeArray_[size_ - 1], &timeArray_[0], &delta);
return delta;
}
/* 2, n delta */
struct timeval get1NDelta() const {
CHECK_GT(size_, 2) << "not support with less than 2 pservers";
struct timeval delta;
timersub(&timeArray_[size_ - 1], &timeArray_[1], &delta);
return delta;
}
/* n-1, n delta */
struct timeval getMinus1NDelta() const {
CHECK_GT(size_, 2) << "not support with less than 2 pservers";
struct timeval delta;
timersub(&timeArray_[size_ - 1], &timeArray_[size_ - 2], &delta);
return delta;
}
/* n/2, n delta */
struct timeval getMidNDelta() const {
CHECK_GT(size_, 2) << "not support with less than 2 pservers";
struct timeval delta;
timersub(&timeArray_[size_ - 1], &timeArray_[size_ / 2], &delta);
return delta;
}
int32_t getLastTrainerId() const { return trainerIds_[index_ - 1]; }
private:
uint16_t size_;
uint16_t index_;
std::vector<struct timeval> timeArray_;
std::vector<int32_t> trainerIds_;
};
class TimeVectorDelta {
/*
* help class for gathering performance data which shows time
* delta property, such as tuning the time distribution of
* forwardBackward time from all cluster nodes.
*/
public:
explicit TimeVectorDelta(uint16_t size)
: size_(size), min_(UINT64_MAX), max_(0) {
index_ = 0;
timeArray_.resize(size);
}
~TimeVectorDelta() {}
uint16_t size() { return size_; }
bool full() { return index_ == size_; }
bool empty() { return index_ == 0; }
void reset() {
index_ = 0;
min_ = UINT64_MAX;
max_ = 0;
}
void addTimeval(uint64_t delta, int32_t trainerId) {
timeArray_[index_] = delta;
index_++;
if (delta < min_) {
min_ = delta;
}
if (delta > max_) {
max_ = delta;
maxTrainerId_ = trainerId;
}
}
uint64_t getDelta() const {
CHECK_GT(size_, 1) << "not support with 1 pserver";
return max_ - min_;
}
/* 2, n delta */
uint64_t get1NDelta() const {
CHECK_GT(size_, 2) << "not support with less than 2 pservers";
LOG(FATAL) << "Not implemented";
}
/* n-1, n delta */
uint64_t getMinus1NDelta() const {
CHECK_GT(size_, 2) << "not support with less than 2 pservers";
LOG(FATAL) << "Not implemented";
}
/* n/2, n delta */
uint64_t getMidNDelta() const {
CHECK_GT(size_, 2) << "not support with less than 2 pservers";
LOG(FATAL) << "Not implemented";
}
int32_t getMaxTrainerId() const { return maxTrainerId_; }
private:
uint16_t size_;
uint16_t index_;
std::vector<uint64_t> timeArray_;
private:
uint64_t min_;
uint64_t max_;
int32_t maxTrainerId_;
};
// total samples stats, us
struct Abstract {
// last trainerId for barrier end, maxDelta trainerId for barrier delta
int32_t trainerId;
uint64_t minDelta;
uint64_t maxDelta;
uint64_t totDelta;
// first one is probably itself, so discard it.
uint64_t totSecondDelta;
// to confirm if last node destroy barrier performance.
uint64_t totLastTwoDelta;
// n/2-n delta
uint64_t totMidDelta;
uint64_t freq;
};
// barrier performance tunning stats
class BarrierStatBase {
public:
BarrierStatBase(uint16_t numConnThreads, const std::string &name);
virtual ~BarrierStatBase() {}
// if called at pserver end, then trainId means trainer's id.
// by default trainer does not use trainerId, so set it to -1
virtual void updateStat(struct timeval &cur, int32_t trainerId = -1) = 0;
virtual void updateStat(uint64_t delta, int32_t trainerId = -1) = 0;
const std::string &getName() { return name_; }
virtual void reset(bool clearRawData = true) {}
// since the timeVector_ is not stateful, so it's not clear whether the
// the barrier delta is correct. if one timestamp was lost, the all data
// from barrier stat becomes rubbish. -_-
virtual bool checkPassBarrier() {
LOG(INFO) << "bug implementation found";
return false;
}
protected:
virtual void showAbstract(std::ostream &output) const {}
friend std::ostream &operator<<(std::ostream &output,
const BarrierStatBase &stat);
protected:
mutable std::mutex lock_;
std::mutex abstractLock_; // see note on updaterStat
// each freqency for each barrier trainer
std::vector<struct Abstract> abstract_;
// it is valuable when do perf-tuining, if lastTrainerId acts uniform
// distribution
struct Abstract totAbstract_;
uint64_t totSamples_;
protected:
uint16_t numConnThreads_; // total updates needed
float rateThreshold_;
std::string name_;
};
// the end-time of arriving real/forged barrier position
class BarrierEndStat : public BarrierStatBase {
public:
BarrierEndStat(uint16_t numConnThreads, const std::string &name);
~BarrierEndStat() {}
virtual void updateStat(struct timeval &cur, int32_t trainerId = -1);
virtual void updateStat(uint64_t delta, int32_t trainerId = -1) {
LOG(INFO) << "have no delta updateStat in BarrierEndStat";
}
virtual void reset(bool clearRawData = true);
virtual bool checkPassBarrier() { return timeVector_->empty(); }
protected:
/*
* LOG:
* readAllBlocks_denseUpdater
* trainerId avgGap avgSecondGap avgLastTwoGap avgMidGap rate
* 44 86.702 81.022 9.984 50.472 0.144737
* 46 87.723 82.939 8.737 50.019 0.118421
* 35 100.923 96.752 14.305 61.979
* 0.0657895
* log_barrier_abstract, log_barrier_lowest_nodes, log_barrier_threshold
* control details.
*/
virtual void showAbstract(std::ostream &output) const;
private:
std::unique_ptr<TimeVectorEnd> timeVector_;
};
// the delta-time from different trainers,
// eg, find the degree of imbalance of BP time at pserver end
// the entry value in timerVector_ is BP delta, do evaluation to BP delta.
class BarrierDeltaStat : public BarrierStatBase {
public:
BarrierDeltaStat(uint16_t numConnThreads, const std::string &name);
~BarrierDeltaStat() {}
virtual void updateStat(uint64_t delta, int32_t trainerId = -1);
virtual void updateStat(struct timeval &cur, int32_t trainerId = -1) {
LOG(INFO) << "have no timeval updateStat in BarrierDeltaStat";
}
virtual void reset(bool clearRawData = true);
virtual bool checkPassBarrier() { return timeVector_->empty(); }
protected:
virtual void showAbstract(std::ostream &outPut) const;
private:
// store delta time in uint64_t, eg BP time of all trainers
std::unique_ptr<TimeVectorDelta> timeVector_;
};
// to distinguish different contexts for same parallel threads, and different
// threads with same code-sgement, just use tagName to tag the run-time
// position.
// in Sparse, sendParallel threads can not only run in the stage of push&pull
// with same thread group, but also run in the stage of pull&push with different
// thread group, tag will be used to distinguish different run-time barrier
// position.
// trainerId in REGISTER_BARRIER_TIMER_SERVER is used to retreive lowest trainer
// nodes.
// end barrier
#define __REGISTER_BARRIER_TIMER_SERVER( \
set, statName, numConnThreads, trainerId, ...) \
do { \
if (numConnThreads > 2) { \
std::string internalName = \
std::string(statName) + std::string(__VA_ARGS__); \
BarrierStatPtr __stat = \
(set).getStat(numConnThreads, internalName, BARRIER_END); \
struct timeval cur; \
gettimeofday(&cur, nullptr); \
__stat->updateStat(cur, trainerId); \
} \
} while (0);
// end barrier with user-defined timer
#define __REGISTER_BARRIER_TIMER_SERVER_SET( \
set, statName, numConnThreads, trainerId, cur, ...) \
do { \
if (numConnThreads > 2) { \
std::string internalName = \
std::string(statName) + std::string(__VA_ARGS__); \
BarrierStatPtr __stat = \
(set).getStat(numConnThreads, internalName, BARRIER_END); \
__stat->updateStat(cur, trainerId); \
} \
} while (0);
// delta barrier
#define __REGISTER_BARRIER_DELTA_SERVER_SET( \
set, statName, numConnThreads, trainerId, delta, ...) \
do { \
if (numConnThreads > 2) { \
std::string internalName = \
std::string(statName) + std::string(__VA_ARGS__); \
BarrierStatPtr __stat = \
(set).getStat(numConnThreads, internalName, BARRIER_DELTA); \
__stat->updateStat(delta, trainerId); \
} \
} while (0);
// check end barrier
#define __CHECK_BARRIER_TIMER(set, statName, numConnThreads, ...) \
do { \
std::string internalName = \
std::string(statName) + std::string(__VA_ARGS__); \
BarrierStatPtr __stat = \
(set).getStat(numConnThreads, internalName, BARRIER_END); \
PCHECK(__stat->checkPassBarrier()) << internalName \
<< ": invalid barrier data"; \
} while (0);
/*
* Note:
* with sync-sgd algriothm in cluster mode, lots of synchronize action exsit at
* pserve end. these synchronizaton actions have impact on the efficiency of
* parameter exchange. the synchronizaton(barrier) GAP is composed of lots of
* factors, such as the forwardBackward variance, network fluncation. we try
* to have a quantitative analysis on these factor, so we design lots of barrier
* time to capture these performance. these barrier also can be placed at
* implict barrier position.
*
* example:
* in sync-sgd algorithm, each parameter server waits for all gradients from
* all trainers, thus, an explict barrier point exsit before doing optimization.
* the barrier timer located before the point can sense the barrier condition.
*
*/
// try to capture which trainer is slowest node in sync-sgd at pserver.
#define REGISTER_SLOW_NODES_PROBE( \
set, statName, numConnThreads, trainerId, ...) \
__REGISTER_BARRIER_TIMER_SERVER( \
(set), statName, numConnThreads, trainerId, __VA_ARGS__)
// try to check if all threads or trainers have passed barriers for data
// accuracy.
#define CHECK_BARRIER_TIMER(set, statName, numConnThreads, ...) \
__CHECK_BARRIER_TIMER((set), statName, numConnThreads, __VA_ARGS__)
#ifdef PADDLE_DISABLE_TIMER
#define REGISTER_BARRIER_TIMER_SERVER( \
set, statName, numConnThreads, trainerId, ...)
#define REGISTER_BARRIER_TIMER_SERVER_SET( \
set, statName, numConnThreads, trainerId, cur, ...)
#define REGISTER_BARRIER_DELTA_SERVER_SET( \
set, statName, numConnThreads, trainerId, cur, ...)
#else
/*
* sensing barrier time distribution for all parallelization threads.
* it provides low API for slow node check(REGISTER_SLOW_NODES_PROBE)
*/
#define REGISTER_BARRIER_TIMER_SERVER( \
set, statName, numConnThreads, trainerId, ...) \
__REGISTER_BARRIER_TIMER_SERVER( \
(set), statName, numConnThreads, trainerId, __VA_ARGS__)
/*
* sensing barrier time distribution for all parallelization threads.
* but time point for barrier performance is set by user.
* eg, with this api, you can get implict barrier point such as the beginning
* time distribution
* for receiving data.
*/
#define REGISTER_BARRIER_TIMER_SERVER_SET( \
set, statName, numConnThreads, trainerId, cur, ...) \
__REGISTER_BARRIER_TIMER_SERVER_SET( \
(set), statName, numConnThreads, trainerId, cur, __VA_ARGS__)
// try to capture time delta from all trainers, such as forwardBackward time
// which implies
// computation fluctuation
#define REGISTER_BARRIER_DELTA_SERVER_SET( \
set, statName, numConnThreads, trainerId, delta, ...) \
__REGISTER_BARRIER_DELTA_SERVER_SET( \
(set), statName, numConnThreads, trainerId, delta, __VA_ARGS__)
#endif // DISABLE_TIMER
} // namespace paddle
...@@ -97,34 +97,6 @@ std::ostream& operator<<(std::ostream& outPut, const Stat& stat) { ...@@ -97,34 +97,6 @@ std::ostream& operator<<(std::ostream& outPut, const Stat& stat) {
return outPut; return outPut;
} }
BarrierStatPtr StatSet::getStat(uint16_t numConnThreads,
const std::string& name,
BarrierStatType bType) {
{
ReadLockGuard guard(lock_);
auto it = barrierStatSet_.find(name);
if (it != barrierStatSet_.end()) {
return it->second;
}
}
std::lock_guard<RWLock> guard(lock_);
// test again with lock_guard
auto it = barrierStatSet_.find(name);
if (it != barrierStatSet_.end()) {
return it->second;
}
BarrierStatPtr stat;
if (bType == BARRIER_END) {
stat = std::make_shared<BarrierEndStat>(numConnThreads, name);
} else if (bType == BARRIER_DELTA) {
stat = std::make_shared<BarrierDeltaStat>(numConnThreads, name);
}
auto ret = barrierStatSet_.insert(std::make_pair(name, stat));
return ret.first->second;
}
void StatSet::printSegTimerStatus() { void StatSet::printSegTimerStatus() {
ReadLockGuard guard(lock_); ReadLockGuard guard(lock_);
LOG(INFO) << std::setiosflags(std::ios::left) << std::setfill(' ') LOG(INFO) << std::setiosflags(std::ios::left) << std::setfill(' ')
...@@ -135,46 +107,20 @@ void StatSet::printSegTimerStatus() { ...@@ -135,46 +107,20 @@ void StatSet::printSegTimerStatus() {
} }
} }
void StatSet::printBarrierTimerStatus() {
ReadLockGuard guard(lock_);
if (barrierStatSet_.empty()) {
return;
}
// control barrierAbstact in runtime, so enable compliation
LOG(INFO) << std::setiosflags(std::ios::left) << std::setfill(' ')
<< "======= BarrierStatSet status ======" << std::endl;
for (auto& stat : barrierStatSet_) {
LOG(INFO) << std::setiosflags(std::ios::left) << std::setfill(' ')
<< *(stat.second);
}
}
void StatSet::printAllStatus() { void StatSet::printAllStatus() {
#ifndef PADDLE_DISABLE_TIMER #ifndef PADDLE_DISABLE_TIMER
printSegTimerStatus(); printSegTimerStatus();
#endif #endif
printBarrierTimerStatus();
LOG(INFO) << std::setiosflags(std::ios::left) LOG(INFO) << std::setiosflags(std::ios::left)
<< "--------------------------------------------------" << "--------------------------------------------------"
<< std::endl; << std::endl;
} }
void StatSet::printStatus(const std::string& name) {
ReadLockGuard guard(lock_);
auto iter = statSet_.find(name);
CHECK(iter != statSet_.end()) << name << " is not registed in " << name_;
LOG(INFO) << *(iter->second);
}
void StatSet::reset(bool clearRawData) { void StatSet::reset(bool clearRawData) {
ReadLockGuard guard(lock_); ReadLockGuard guard(lock_);
for (auto& stat : statSet_) { for (auto& stat : statSet_) {
stat.second->reset(); stat.second->reset();
} }
// reset barrierStat
for (auto& stat : barrierStatSet_) {
stat.second->reset(clearRawData);
}
} }
void StatSet::setThreadInfo(const std::string& name, bool flag) { void StatSet::setThreadInfo(const std::string& name, bool flag) {
...@@ -184,13 +130,6 @@ void StatSet::setThreadInfo(const std::string& name, bool flag) { ...@@ -184,13 +130,6 @@ void StatSet::setThreadInfo(const std::string& name, bool flag) {
iter->second->setThreadInfo(flag); iter->second->setThreadInfo(flag);
} }
void StatSet::deleteStat(const std::string& name) {
std::lock_guard<RWLock> guard(lock_);
auto iter = statSet_.find(name);
CHECK(iter != statSet_.end()) << name << " is not registed in " << name_;
statSet_.erase(iter);
}
StatInfo::~StatInfo() { StatInfo::~StatInfo() {
if (stat_) { if (stat_) {
std::lock_guard<std::mutex> guard(stat_->lock_); std::lock_guard<std::mutex> guard(stat_->lock_);
......
...@@ -23,7 +23,6 @@ limitations under the License. */ ...@@ -23,7 +23,6 @@ limitations under the License. */
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include "BarrierStat.h"
#include "Locks.h" #include "Locks.h"
#include "Logging.h" #include "Logging.h"
#include "ThreadLocal.h" #include "ThreadLocal.h"
...@@ -60,12 +59,6 @@ public: ...@@ -60,12 +59,6 @@ public:
class Stat; class Stat;
typedef std::shared_ptr<Stat> StatPtr; typedef std::shared_ptr<Stat> StatPtr;
typedef std::shared_ptr<BarrierStatBase> BarrierStatPtr;
enum BarrierStatType {
BARRIER_END = 0,
BARRIER_DELTA = 1,
};
class StatSet { class StatSet {
public: public:
...@@ -74,11 +67,8 @@ public: ...@@ -74,11 +67,8 @@ public:
// print to LOG(INFO) // print to LOG(INFO)
void printSegTimerStatus(); void printSegTimerStatus();
void printBarrierTimerStatus();
void printAllStatus(); void printAllStatus();
void printStatus(const std::string& name);
StatPtr getStat(const std::string& name) { StatPtr getStat(const std::string& name) {
{ {
ReadLockGuard guard(lock_); ReadLockGuard guard(lock_);
...@@ -93,12 +83,6 @@ public: ...@@ -93,12 +83,6 @@ public:
return ret.first->second; return ret.first->second;
} }
BarrierStatPtr getStat(uint16_t numConnThreads,
const std::string& name,
BarrierStatType bType);
void deleteStat(const std::string& name);
// true for showing stats for each thread // true for showing stats for each thread
// false for showing stats aggragated over threads // false for showing stats aggragated over threads
void setThreadInfo(const std::string& name, bool flag); void setThreadInfo(const std::string& name, bool flag);
...@@ -120,7 +104,6 @@ public: ...@@ -120,7 +104,6 @@ public:
private: private:
std::unordered_map<std::string, StatPtr> statSet_; std::unordered_map<std::string, StatPtr> statSet_;
std::unordered_map<std::string, BarrierStatPtr> barrierStatSet_;
const std::string name_; const std::string name_;
RWLock lock_; RWLock lock_;
}; };
......
...@@ -51,7 +51,7 @@ template <class T> ...@@ -51,7 +51,7 @@ template <class T>
class ThreadLocal { class ThreadLocal {
public: public:
ThreadLocal() { ThreadLocal() {
PCHECK(pthread_key_create(&threadSpecificKey_, dataDestructor) == 0); CHECK(pthread_key_create(&threadSpecificKey_, dataDestructor) == 0);
} }
~ThreadLocal() { pthread_key_delete(threadSpecificKey_); } ~ThreadLocal() { pthread_key_delete(threadSpecificKey_); }
...@@ -65,7 +65,7 @@ public: ...@@ -65,7 +65,7 @@ public:
if (!p && createLocal) { if (!p && createLocal) {
p = new T(); p = new T();
int ret = pthread_setspecific(threadSpecificKey_, p); int ret = pthread_setspecific(threadSpecificKey_, p);
PCHECK(ret == 0); CHECK(ret == 0);
} }
return p; return p;
} }
...@@ -79,7 +79,7 @@ public: ...@@ -79,7 +79,7 @@ public:
if (T* q = get(false)) { if (T* q = get(false)) {
dataDestructor(q); dataDestructor(q);
} }
PCHECK(pthread_setspecific(threadSpecificKey_, p) == 0); CHECK(pthread_setspecific(threadSpecificKey_, p) == 0);
} }
/** /**
...@@ -112,7 +112,7 @@ private: ...@@ -112,7 +112,7 @@ private:
template <class T> template <class T>
class ThreadLocalD { class ThreadLocalD {
public: public:
ThreadLocalD() { PCHECK(pthread_key_create(&threadSpecificKey_, NULL) == 0); } ThreadLocalD() { CHECK(pthread_key_create(&threadSpecificKey_, NULL) == 0); }
~ThreadLocalD() { ~ThreadLocalD() {
pthread_key_delete(threadSpecificKey_); pthread_key_delete(threadSpecificKey_);
for (auto t : threadMap_) { for (auto t : threadMap_) {
...@@ -127,7 +127,7 @@ public: ...@@ -127,7 +127,7 @@ public:
T* p = (T*)pthread_getspecific(threadSpecificKey_); T* p = (T*)pthread_getspecific(threadSpecificKey_);
if (!p) { if (!p) {
p = new T(); p = new T();
PCHECK(pthread_setspecific(threadSpecificKey_, p) == 0); CHECK(pthread_setspecific(threadSpecificKey_, p) == 0);
updateMap(p); updateMap(p);
} }
return p; return p;
...@@ -141,7 +141,7 @@ public: ...@@ -141,7 +141,7 @@ public:
if (T* q = (T*)pthread_getspecific(threadSpecificKey_)) { if (T* q = (T*)pthread_getspecific(threadSpecificKey_)) {
dataDestructor(q); dataDestructor(q);
} }
PCHECK(pthread_setspecific(threadSpecificKey_, p) == 0); CHECK(pthread_setspecific(threadSpecificKey_, p) == 0);
updateMap(p); updateMap(p);
} }
......
...@@ -266,6 +266,29 @@ message PadConfig { ...@@ -266,6 +266,29 @@ message PadConfig {
repeated uint32 pad_w = 4; repeated uint32 pad_w = 4;
} }
message MultiBoxLossConfig {
required uint32 num_classes = 1;
required float overlap_threshold = 2;
required float neg_pos_ratio = 3;
required float neg_overlap = 4;
required uint32 background_id = 5;
required uint32 input_num = 6;
optional uint32 height = 7 [default = 1];
optional uint32 width = 8 [default = 1];
}
message DetectionOutputConfig {
required uint32 num_classes = 1;
required float nms_threshold = 2;
required uint32 nms_top_k = 3;
required uint32 background_id = 4;
required uint32 input_num = 5;
required uint32 keep_top_k = 6;
required float confidence_threshold = 7;
optional uint32 height = 8 [default = 1];
optional uint32 width = 9 [default = 1];
}
message LayerInputConfig { message LayerInputConfig {
required string input_layer_name = 1; required string input_layer_name = 1;
optional string input_parameter_name = 2; optional string input_parameter_name = 2;
...@@ -284,6 +307,8 @@ message LayerInputConfig { ...@@ -284,6 +307,8 @@ message LayerInputConfig {
optional PriorBoxConfig priorbox_conf = 13; optional PriorBoxConfig priorbox_conf = 13;
optional PadConfig pad_conf = 14; optional PadConfig pad_conf = 14;
optional RowConvConfig row_conv_conf = 15; optional RowConvConfig row_conv_conf = 15;
optional MultiBoxLossConfig multibox_loss_conf = 16;
optional DetectionOutputConfig detection_output_conf = 17;
} }
message LayerConfig { message LayerConfig {
......
...@@ -29,7 +29,7 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in ...@@ -29,7 +29,7 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
add_custom_command(OUTPUT ${OUTPUT_DIR}/.timestamp add_custom_command(OUTPUT ${OUTPUT_DIR}/.timestamp
COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
COMMAND ${CMAKE_COMMAND} -E touch ${OUTPUT_DIR}/.timestamp COMMAND ${CMAKE_COMMAND} -E touch ${OUTPUT_DIR}/.timestamp
DEPENDS gen_proto_py ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER}) DEPENDS gen_proto_py framework_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER})
add_custom_target(paddle_python ALL DEPENDS add_custom_target(paddle_python ALL DEPENDS
${OUTPUT_DIR}/.timestamp) ${OUTPUT_DIR}/.timestamp)
...@@ -43,6 +43,7 @@ if (WITH_TESTING) ...@@ -43,6 +43,7 @@ if (WITH_TESTING)
add_subdirectory(paddle/v2/tests) add_subdirectory(paddle/v2/tests)
add_subdirectory(paddle/v2/reader/tests) add_subdirectory(paddle/v2/reader/tests)
add_subdirectory(paddle/v2/plot/tests) add_subdirectory(paddle/v2/plot/tests)
add_subdirectory(paddle/v2/framework/tests)
endif() endif()
endif() endif()
install(DIRECTORY ${PADDLE_PYTHON_PACKAGE_DIR} install(DIRECTORY ${PADDLE_PYTHON_PACKAGE_DIR}
......
...@@ -1353,7 +1353,8 @@ class LayerBase(object): ...@@ -1353,7 +1353,8 @@ class LayerBase(object):
device=None, device=None,
active_type="", active_type="",
drop_rate=0., drop_rate=0.,
coeff=None): coeff=None,
error_clipping_threshold=None):
config_assert('@' not in name, config_assert('@' not in name,
"layer name: %s contain special character @" % name) "layer name: %s contain special character @" % name)
global g_current_submodel global g_current_submodel
...@@ -1387,6 +1388,9 @@ class LayerBase(object): ...@@ -1387,6 +1388,9 @@ class LayerBase(object):
elif g_default_device is not None: elif g_default_device is not None:
self.config.device = g_default_device self.config.device = g_default_device
if error_clipping_threshold is not None:
self.config.error_clipping_threshold = error_clipping_threshold
for input_index in xrange(len(self.inputs)): for input_index in xrange(len(self.inputs)):
input = self.inputs[input_index] input = self.inputs[input_index]
input_config = None input_config = None
...@@ -1674,6 +1678,52 @@ class PriorBoxLayer(LayerBase): ...@@ -1674,6 +1678,52 @@ class PriorBoxLayer(LayerBase):
self.config.size = size self.config.size = size
@config_layer('multibox_loss')
class MultiBoxLossLayer(LayerBase):
def __init__(self, name, inputs, input_num, num_classes, overlap_threshold,
neg_pos_ratio, neg_overlap, background_id, **xargs):
super(MultiBoxLossLayer, self).__init__(name, 'multibox_loss', 0,
inputs)
config_assert(
len(inputs) == (input_num * 2 + 2),
'MultiBoxLossLayer does not have enough inputs')
config_assert(num_classes > background_id,
'Classes number must greater than background ID')
self.config.inputs[0].multibox_loss_conf.num_classes = num_classes
self.config.inputs[
0].multibox_loss_conf.overlap_threshold = overlap_threshold
self.config.inputs[0].multibox_loss_conf.neg_pos_ratio = neg_pos_ratio
self.config.inputs[0].multibox_loss_conf.neg_overlap = neg_overlap
self.config.inputs[0].multibox_loss_conf.background_id = background_id
self.config.inputs[0].multibox_loss_conf.input_num = input_num
self.config.size = 1
@config_layer('detection_output')
class DetectionOutputLayer(LayerBase):
def __init__(self, name, inputs, size, input_num, num_classes,
nms_threshold, nms_top_k, keep_top_k, confidence_threshold,
background_id, **xargs):
super(DetectionOutputLayer, self).__init__(name, 'detection_output', 0,
inputs)
config_assert(
len(inputs) == (input_num * 2 + 1),
'DetectionOutputLayer does not have enough inputs')
config_assert(num_classes > background_id,
'Classes number must greater than background ID')
self.config.inputs[0].detection_output_conf.num_classes = num_classes
self.config.inputs[
0].detection_output_conf.nms_threshold = nms_threshold
self.config.inputs[0].detection_output_conf.nms_top_k = nms_top_k
self.config.inputs[0].detection_output_conf.keep_top_k = keep_top_k
self.config.inputs[
0].detection_output_conf.confidence_threshold = confidence_threshold
self.config.inputs[
0].detection_output_conf.background_id = background_id
self.config.inputs[0].detection_output_conf.input_num = input_num
self.config.size = size
@config_layer('data') @config_layer('data')
class DataLayer(LayerBase): class DataLayer(LayerBase):
def __init__(self, name, size, height=None, width=None, device=None): def __init__(self, name, size, height=None, width=None, device=None):
...@@ -2473,10 +2523,14 @@ class MaxLayer(LayerBase): ...@@ -2473,10 +2523,14 @@ class MaxLayer(LayerBase):
trans_type='non-seq', trans_type='non-seq',
bias=False, bias=False,
output_max_index=None, output_max_index=None,
stride=-1,
**xargs): **xargs):
super(MaxLayer, self).__init__(name, 'max', 0, inputs=inputs, **xargs) super(MaxLayer, self).__init__(name, 'max', 0, inputs=inputs, **xargs)
config_assert(len(self.inputs) == 1, 'MaxLayer must have 1 input') config_assert(len(self.inputs) == 1, 'MaxLayer must have 1 input')
if trans_type == 'seq':
config_assert(stride == -1, 'subseq does not support stride window')
self.config.trans_type = trans_type self.config.trans_type = trans_type
self.config.seq_pool_stride = stride
for input_index in xrange(len(self.inputs)): for input_index in xrange(len(self.inputs)):
input_layer = self.get_input_layer(input_index) input_layer = self.get_input_layer(input_index)
self.set_layer_size(input_layer.size) self.set_layer_size(input_layer.size)
...@@ -2738,11 +2792,15 @@ class AverageLayer(LayerBase): ...@@ -2738,11 +2792,15 @@ class AverageLayer(LayerBase):
average_strategy='average', average_strategy='average',
trans_type='non-seq', trans_type='non-seq',
bias=False, bias=False,
stride=-1,
**xargs): **xargs):
super(AverageLayer, self).__init__( super(AverageLayer, self).__init__(
name, 'average', 0, inputs=inputs, **xargs) name, 'average', 0, inputs=inputs, **xargs)
self.config.average_strategy = average_strategy self.config.average_strategy = average_strategy
if trans_type == 'seq':
config_assert(stride == -1, 'subseq does not support stride window')
self.config.trans_type = trans_type self.config.trans_type = trans_type
self.config.seq_pool_stride = stride
config_assert(len(inputs) == 1, 'AverageLayer must have 1 input') config_assert(len(inputs) == 1, 'AverageLayer must have 1 input')
for input_index in xrange(len(self.inputs)): for input_index in xrange(len(self.inputs)):
input_layer = self.get_input_layer(input_index) input_layer = self.get_input_layer(input_index)
...@@ -2781,13 +2839,7 @@ class TensorLayer(LayerBase): ...@@ -2781,13 +2839,7 @@ class TensorLayer(LayerBase):
@config_layer('mixed') @config_layer('mixed')
class MixedLayer(LayerBase): class MixedLayer(LayerBase):
def __init__(self, def __init__(self, name, inputs, size=0, bias=True, **xargs):
name,
inputs,
size=0,
bias=True,
error_clipping_threshold=None,
**xargs):
config_assert(inputs, 'inputs cannot be empty') config_assert(inputs, 'inputs cannot be empty')
super(MixedLayer, self).__init__( super(MixedLayer, self).__init__(
name, 'mixed', size, inputs=inputs, **xargs) name, 'mixed', size, inputs=inputs, **xargs)
...@@ -2869,9 +2921,6 @@ class MixedLayer(LayerBase): ...@@ -2869,9 +2921,6 @@ class MixedLayer(LayerBase):
self.config.bias_size = psize self.config.bias_size = psize
self.create_bias_parameter(bias, psize) self.create_bias_parameter(bias, psize)
if error_clipping_threshold is not None:
self.config.error_clipping_threshold = error_clipping_threshold
# like MixedLayer, but no bias parameter # like MixedLayer, but no bias parameter
@config_func @config_func
......
...@@ -116,6 +116,8 @@ __all__ = [ ...@@ -116,6 +116,8 @@ __all__ = [
'print_layer', 'print_layer',
'priorbox_layer', 'priorbox_layer',
'cross_channel_norm_layer', 'cross_channel_norm_layer',
'multibox_loss_layer',
'detection_output_layer',
'spp_layer', 'spp_layer',
'pad_layer', 'pad_layer',
'eos_layer', 'eos_layer',
...@@ -197,6 +199,8 @@ class LayerType(object): ...@@ -197,6 +199,8 @@ class LayerType(object):
PRINT_LAYER = 'print' PRINT_LAYER = 'print'
PRIORBOX_LAYER = 'priorbox' PRIORBOX_LAYER = 'priorbox'
MULTIBOX_LOSS_LAYER = 'multibox_loss'
DETECTION_OUTPUT_LAYER = 'detection_output'
CTC_LAYER = 'ctc' CTC_LAYER = 'ctc'
WARP_CTC_LAYER = 'warp_ctc' WARP_CTC_LAYER = 'warp_ctc'
...@@ -1043,6 +1047,158 @@ def priorbox_layer(input, ...@@ -1043,6 +1047,158 @@ def priorbox_layer(input,
size=size) size=size)
@wrap_name_default("multibox_loss")
def multibox_loss_layer(input_loc,
input_conf,
priorbox,
label,
num_classes,
overlap_threshold=0.5,
neg_pos_ratio=3.0,
neg_overlap=0.5,
background_id=0,
name=None):
"""
Compute the location loss and the confidence loss for ssd.
:param name: The Layer Name.
:type name: basestring
:param input_loc: The input predict locations.
:type input_loc: LayerOutput | List of LayerOutput
:param input_conf: The input priorbox confidence.
:type input_conf: LayerOutput | List of LayerOutput
:param priorbox: The input priorbox location and the variance.
:type priorbox: LayerOutput
:param label: The input label.
:type label: LayerOutput
:param num_classes: The number of the classification.
:type num_classes: int
:param overlap_threshold: The threshold of the overlap.
:type overlap_threshold: float
:param neg_pos_ratio: The ratio of the negative bbox to the positive bbox.
:type neg_pos_ratio: float
:param neg_overlap: The negative bbox overlap threshold.
:type neg_overlap: float
:param background_id: The background class index.
:type background_id: int
:return: LayerOutput
"""
if isinstance(input_loc, LayerOutput):
input_loc = [input_loc]
assert isinstance(input_loc, collections.Sequence) # list or tuple
for each in input_loc:
assert isinstance(each, LayerOutput)
input_loc_num = len(input_loc)
if isinstance(input_conf, LayerOutput):
input_conf = [input_conf]
assert isinstance(input_conf, collections.Sequence) # list or tuple
for each in input_conf:
assert isinstance(each, LayerOutput)
input_conf_num = len(input_conf)
# Check the input layer number.
assert input_loc_num == input_conf_num
inputs = [priorbox.name, label.name]
inputs.extend([l.name for l in input_loc])
inputs.extend([l.name for l in input_conf])
parents = [priorbox, label]
parents.extend(input_loc)
parents.extend(input_conf)
Layer(
name=name,
type=LayerType.MULTIBOX_LOSS_LAYER,
inputs=inputs,
input_num=input_loc_num,
num_classes=num_classes,
overlap_threshold=overlap_threshold,
neg_pos_ratio=neg_pos_ratio,
neg_overlap=neg_overlap,
background_id=background_id)
return LayerOutput(
name, LayerType.MULTIBOX_LOSS_LAYER, parents=parents, size=1)
@wrap_name_default("detection_output")
def detection_output_layer(input_loc,
input_conf,
priorbox,
num_classes,
nms_threshold=0.45,
nms_top_k=400,
keep_top_k=200,
confidence_threshold=0.01,
background_id=0,
name=None):
"""
Apply the NMS to the output of network and compute the predict bounding
box location.
:param name: The Layer Name.
:type name: basestring
:param input_loc: The input predict locations.
:type input_loc: LayerOutput | List of LayerOutput.
:param input_conf: The input priorbox confidence.
:type input_conf: LayerOutput | List of LayerOutput.
:param priorbox: The input priorbox location and the variance.
:type priorbox: LayerOutput
:param num_classes: The number of the classification.
:type num_classes: int
:param nms_threshold: The Non-maximum suppression threshold.
:type nms_threshold: float
:param nms_top_k: The bbox number kept of the NMS's output
:type nms_top_k: int
:param keep_top_k: The bbox number kept of the layer's output
:type keep_top_k: int
:param confidence_threshold: The classification confidence threshold
:type confidence_threshold: float
:param background_id: The background class index.
:type background_id: int
:return: LayerOutput
"""
if isinstance(input_loc, LayerOutput):
input_loc = [input_loc]
assert isinstance(input_loc, collections.Sequence) # list or tuple
for each in input_loc:
assert isinstance(each, LayerOutput)
input_loc_num = len(input_loc)
if isinstance(input_conf, LayerOutput):
input_conf = [input_conf]
assert isinstance(input_conf, collections.Sequence) # list or tuple
for each in input_conf:
assert isinstance(each, LayerOutput)
input_conf_num = len(input_conf)
# Check the input layer number.
assert input_loc_num == input_conf_num
inputs = [priorbox.name]
inputs.extend([l.name for l in input_loc])
inputs.extend([l.name for l in input_conf])
parents = [priorbox]
parents.extend(input_loc)
parents.extend(input_conf)
size = keep_top_k * 7
Layer(
name=name,
type=LayerType.DETECTION_OUTPUT_LAYER,
inputs=inputs,
size=size,
input_num=input_loc_num,
num_classes=num_classes,
nms_threshold=nms_threshold,
nms_top_k=nms_top_k,
keep_top_k=keep_top_k,
confidence_threshold=confidence_threshold,
background_id=background_id)
return LayerOutput(
name, LayerType.DETECTION_OUTPUT_LAYER, parents=parents, size=size)
@wrap_name_default("cross_channel_norm") @wrap_name_default("cross_channel_norm")
def cross_channel_norm_layer(input, name=None, param_attr=None): def cross_channel_norm_layer(input, name=None, param_attr=None):
""" """
...@@ -1092,10 +1248,19 @@ def pooling_layer(input, ...@@ -1092,10 +1248,19 @@ def pooling_layer(input,
name=None, name=None,
bias_attr=None, bias_attr=None,
agg_level=AggregateLevel.TO_NO_SEQUENCE, agg_level=AggregateLevel.TO_NO_SEQUENCE,
stride=-1,
layer_attr=None): layer_attr=None):
""" """
Pooling layer for sequence inputs, not used for Image. Pooling layer for sequence inputs, not used for Image.
If stride > 0, this layer slides a window whose size is determined by stride,
and return the pooling value of the window as the output. Thus, a long sequence
will be shorten.
The parameter stride specifies the intervals at which to apply the pooling
operation. Note that for sequence with sub-sequence, the default value
of stride is -1.
The example usage is: The example usage is:
.. code-block:: python .. code-block:: python
...@@ -1114,6 +1279,8 @@ def pooling_layer(input, ...@@ -1114,6 +1279,8 @@ def pooling_layer(input,
:param pooling_type: Type of pooling, MaxPooling(default), AvgPooling, :param pooling_type: Type of pooling, MaxPooling(default), AvgPooling,
SumPooling, SquareRootNPooling. SumPooling, SquareRootNPooling.
:type pooling_type: BasePoolingType|None :type pooling_type: BasePoolingType|None
:param stride: The step size between successive pooling regions.
:type stride: Int
:param bias_attr: Bias parameter attribute. False if no bias. :param bias_attr: Bias parameter attribute. False if no bias.
:type bias_attr: ParameterAttribute|None|False :type bias_attr: ParameterAttribute|None|False
:param layer_attr: The Extra Attributes for layer, such as dropout. :param layer_attr: The Extra Attributes for layer, such as dropout.
...@@ -1131,12 +1298,16 @@ def pooling_layer(input, ...@@ -1131,12 +1298,16 @@ def pooling_layer(input,
extra_dict['output_max_index'] = pooling_type.output_max_index extra_dict['output_max_index'] = pooling_type.output_max_index
extra_dict.update(ExtraLayerAttribute.to_kwargs(layer_attr)) extra_dict.update(ExtraLayerAttribute.to_kwargs(layer_attr))
if agg_level == AggregateLevel.TO_SEQUENCE:
assert stride == -1
Layer( Layer(
name=name, name=name,
type=pooling_type.name, type=pooling_type.name,
inputs=[Input(input.name)], inputs=[Input(input.name)],
bias=ParamAttr.to_bias(bias_attr), bias=ParamAttr.to_bias(bias_attr),
trans_type=agg_level, trans_type=agg_level,
stride=stride,
**extra_dict) **extra_dict)
return LayerOutput( return LayerOutput(
...@@ -1398,7 +1569,7 @@ def last_seq(input, ...@@ -1398,7 +1569,7 @@ def last_seq(input,
:type name: basestring :type name: basestring
:param input: Input layer name. :param input: Input layer name.
:type input: LayerOutput :type input: LayerOutput
:param stride: window size. :param stride: The step size between successive pooling regions.
:type stride: Int :type stride: Int
:param layer_attr: extra layer attributes. :param layer_attr: extra layer attributes.
:type layer_attr: ExtraLayerAttribute. :type layer_attr: ExtraLayerAttribute.
...@@ -1454,7 +1625,7 @@ def first_seq(input, ...@@ -1454,7 +1625,7 @@ def first_seq(input,
:type name: basestring :type name: basestring
:param input: Input layer name. :param input: Input layer name.
:type input: LayerOutput :type input: LayerOutput
:param stride: window size. :param stride: The step size between successive pooling regions.
:type stride: Int :type stride: Int
:param layer_attr: extra layer attributes. :param layer_attr: extra layer attributes.
:type layer_attr: ExtraLayerAttribute. :type layer_attr: ExtraLayerAttribute.
...@@ -4724,6 +4895,14 @@ def maxout_layer(input, groups, num_channels=None, name=None, layer_attr=None): ...@@ -4724,6 +4895,14 @@ def maxout_layer(input, groups, num_channels=None, name=None, layer_attr=None):
So groups should be larger than 1, and the num of channels should be able So groups should be larger than 1, and the num of channels should be able
to devided by groups. to devided by groups.
.. math::
y_{si+j} = \max_k x_{gsi + sk + j}
g = groups
s = input.size / num_channels
0 \le i < num_channels / groups
0 \le j < s
0 \le k < groups
Please refer to Paper: Please refer to Paper:
- Maxout Networks: http://www.jmlr.org/proceedings/papers/v28/goodfellow13.pdf - Maxout Networks: http://www.jmlr.org/proceedings/papers/v28/goodfellow13.pdf
- Multi-digit Number Recognition from Street View \ - Multi-digit Number Recognition from Street View \
......
...@@ -6,6 +6,6 @@ img_layers img_trans_layers util_layers simple_rnn_layers unused_layers test_cos ...@@ -6,6 +6,6 @@ img_layers img_trans_layers util_layers simple_rnn_layers unused_layers test_cos
test_rnn_group shared_fc shared_lstm shared_gru test_cost_layers_with_weight test_rnn_group shared_fc shared_lstm shared_gru test_cost_layers_with_weight
test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops
test_seq_concat_reshape test_pad test_smooth_l1 test_multiplex_layer test_seq_concat_reshape test_pad test_smooth_l1 test_multiplex_layer
test_prelu_layer test_row_conv) test_prelu_layer test_row_conv test_detection_output_layer test_multibox_loss_layer)
export whole_configs=(test_split_datasource) export whole_configs=(test_split_datasource)
type: "nn"
layers {
name: "input_loc"
type: "data"
size: 16
active_type: ""
height: 16
width: 1
}
layers {
name: "input_conf"
type: "data"
size: 8
active_type: ""
height: 1
width: 8
}
layers {
name: "priorbox"
type: "data"
size: 32
active_type: ""
height: 4
width: 8
}
layers {
name: "test_detection_output"
type: "detection_output"
size: 1400
active_type: ""
inputs {
input_layer_name: "priorbox"
detection_output_conf {
num_classes: 21
nms_threshold: 0.45
nms_top_k: 400
background_id: 0
input_num: 1
keep_top_k: 200
confidence_threshold: 0.01
}
}
inputs {
input_layer_name: "input_loc"
}
inputs {
input_layer_name: "input_conf"
}
}
input_layer_names: "priorbox"
input_layer_names: "input_loc"
input_layer_names: "input_conf"
output_layer_names: "test_detection_output"
sub_models {
name: "root"
layer_names: "input_loc"
layer_names: "input_conf"
layer_names: "priorbox"
layer_names: "test_detection_output"
input_layer_names: "priorbox"
input_layer_names: "input_loc"
input_layer_names: "input_conf"
output_layer_names: "test_detection_output"
is_recurrent_layer_group: false
}
type: "nn"
layers {
name: "input_loc"
type: "data"
size: 16
active_type: ""
height: 16
width: 1
}
layers {
name: "input_conf"
type: "data"
size: 8
active_type: ""
height: 1
width: 8
}
layers {
name: "priorbox"
type: "data"
size: 32
active_type: ""
height: 4
width: 8
}
layers {
name: "label"
type: "data"
size: 24
active_type: ""
height: 4
width: 6
}
layers {
name: "test_multibox_loss"
type: "multibox_loss"
size: 1
active_type: ""
inputs {
input_layer_name: "priorbox"
multibox_loss_conf {
num_classes: 21
overlap_threshold: 0.5
neg_pos_ratio: 3.0
neg_overlap: 0.5
background_id: 0
input_num: 1
}
}
inputs {
input_layer_name: "label"
}
inputs {
input_layer_name: "input_loc"
}
inputs {
input_layer_name: "input_conf"
}
}
input_layer_names: "priorbox"
input_layer_names: "label"
input_layer_names: "input_loc"
input_layer_names: "input_conf"
output_layer_names: "test_multibox_loss"
sub_models {
name: "root"
layer_names: "input_loc"
layer_names: "input_conf"
layer_names: "priorbox"
layer_names: "label"
layer_names: "test_multibox_loss"
input_layer_names: "priorbox"
input_layer_names: "label"
input_layer_names: "input_loc"
input_layer_names: "input_conf"
output_layer_names: "test_multibox_loss"
is_recurrent_layer_group: false
}
...@@ -14,6 +14,7 @@ layers { ...@@ -14,6 +14,7 @@ layers {
input_layer_name: "dat_in" input_layer_name: "dat_in"
} }
trans_type: "seq" trans_type: "seq"
seq_pool_stride: -1
} }
layers { layers {
name: "__seq_pooling_1__" name: "__seq_pooling_1__"
...@@ -24,6 +25,7 @@ layers { ...@@ -24,6 +25,7 @@ layers {
input_layer_name: "dat_in" input_layer_name: "dat_in"
} }
trans_type: "non-seq" trans_type: "non-seq"
seq_pool_stride: -1
} }
layers { layers {
name: "__seq_pooling_2__" name: "__seq_pooling_2__"
...@@ -35,6 +37,7 @@ layers { ...@@ -35,6 +37,7 @@ layers {
} }
average_strategy: "average" average_strategy: "average"
trans_type: "seq" trans_type: "seq"
seq_pool_stride: -1
} }
layers { layers {
name: "__seq_pooling_3__" name: "__seq_pooling_3__"
...@@ -46,6 +49,7 @@ layers { ...@@ -46,6 +49,7 @@ layers {
} }
average_strategy: "average" average_strategy: "average"
trans_type: "non-seq" trans_type: "non-seq"
seq_pool_stride: -1
} }
layers { layers {
name: "__seq_pooling_4__" name: "__seq_pooling_4__"
...@@ -57,6 +61,7 @@ layers { ...@@ -57,6 +61,7 @@ layers {
} }
average_strategy: "sum" average_strategy: "sum"
trans_type: "seq" trans_type: "seq"
seq_pool_stride: -1
} }
layers { layers {
name: "__seq_pooling_5__" name: "__seq_pooling_5__"
...@@ -68,6 +73,7 @@ layers { ...@@ -68,6 +73,7 @@ layers {
} }
average_strategy: "sum" average_strategy: "sum"
trans_type: "non-seq" trans_type: "non-seq"
seq_pool_stride: -1
} }
layers { layers {
name: "__seq_pooling_6__" name: "__seq_pooling_6__"
...@@ -77,8 +83,44 @@ layers { ...@@ -77,8 +83,44 @@ layers {
inputs { inputs {
input_layer_name: "dat_in" input_layer_name: "dat_in"
} }
trans_type: "non-seq"
seq_pool_stride: 5
}
layers {
name: "__seq_pooling_7__"
type: "average"
size: 100
active_type: ""
inputs {
input_layer_name: "dat_in"
}
average_strategy: "average"
trans_type: "non-seq"
seq_pool_stride: 5
}
layers {
name: "__seq_pooling_8__"
type: "average"
size: 100
active_type: ""
inputs {
input_layer_name: "dat_in"
}
average_strategy: "sum"
trans_type: "non-seq"
seq_pool_stride: 5
}
layers {
name: "__seq_pooling_9__"
type: "max"
size: 100
active_type: ""
inputs {
input_layer_name: "dat_in"
}
output_max_index: true output_max_index: true
trans_type: "non-seq" trans_type: "non-seq"
seq_pool_stride: -1
} }
input_layer_names: "dat_in" input_layer_names: "dat_in"
output_layer_names: "__seq_pooling_0__" output_layer_names: "__seq_pooling_0__"
...@@ -88,6 +130,9 @@ output_layer_names: "__seq_pooling_3__" ...@@ -88,6 +130,9 @@ output_layer_names: "__seq_pooling_3__"
output_layer_names: "__seq_pooling_4__" output_layer_names: "__seq_pooling_4__"
output_layer_names: "__seq_pooling_5__" output_layer_names: "__seq_pooling_5__"
output_layer_names: "__seq_pooling_6__" output_layer_names: "__seq_pooling_6__"
output_layer_names: "__seq_pooling_7__"
output_layer_names: "__seq_pooling_8__"
output_layer_names: "__seq_pooling_9__"
sub_models { sub_models {
name: "root" name: "root"
layer_names: "dat_in" layer_names: "dat_in"
...@@ -98,6 +143,9 @@ sub_models { ...@@ -98,6 +143,9 @@ sub_models {
layer_names: "__seq_pooling_4__" layer_names: "__seq_pooling_4__"
layer_names: "__seq_pooling_5__" layer_names: "__seq_pooling_5__"
layer_names: "__seq_pooling_6__" layer_names: "__seq_pooling_6__"
layer_names: "__seq_pooling_7__"
layer_names: "__seq_pooling_8__"
layer_names: "__seq_pooling_9__"
input_layer_names: "dat_in" input_layer_names: "dat_in"
output_layer_names: "__seq_pooling_0__" output_layer_names: "__seq_pooling_0__"
output_layer_names: "__seq_pooling_1__" output_layer_names: "__seq_pooling_1__"
...@@ -106,6 +154,9 @@ sub_models { ...@@ -106,6 +154,9 @@ sub_models {
output_layer_names: "__seq_pooling_4__" output_layer_names: "__seq_pooling_4__"
output_layer_names: "__seq_pooling_5__" output_layer_names: "__seq_pooling_5__"
output_layer_names: "__seq_pooling_6__" output_layer_names: "__seq_pooling_6__"
output_layer_names: "__seq_pooling_7__"
output_layer_names: "__seq_pooling_8__"
output_layer_names: "__seq_pooling_9__"
is_recurrent_layer_group: false is_recurrent_layer_group: false
} }
from paddle.trainer_config_helpers import *
settings(batch_size=1000, learning_rate=1e-5)
input_loc = data_layer(name='input_loc', size=16, height=16, width=1)
input_conf = data_layer(name='input_conf', size=8, height=1, width=8)
priorbox = data_layer(name='priorbox', size=32, height=4, width=8)
detout = detection_output_layer(
input_loc=input_loc,
input_conf=input_conf,
priorbox=priorbox,
num_classes=21,
nms_threshold=0.45,
nms_top_k=400,
keep_top_k=200,
confidence_threshold=0.01,
background_id=0,
name='test_detection_output')
outputs(detout)
from paddle.trainer_config_helpers import *
settings(batch_size=1000, learning_rate=1e-5)
input_loc = data_layer(name='input_loc', size=16, height=16, width=1)
input_conf = data_layer(name='input_conf', size=8, height=1, width=8)
priorbox = data_layer(name='priorbox', size=32, height=4, width=8)
label = data_layer(name='label', size=24, height=4, width=6)
multibox_loss = multibox_loss_layer(
input_loc=input_loc,
input_conf=input_conf,
priorbox=priorbox,
label=label,
num_classes=21,
overlap_threshold=0.5,
neg_pos_ratio=3.0,
neg_overlap=0.5,
background_id=0,
name='test_multibox_loss')
outputs(multibox_loss)
...@@ -14,6 +14,14 @@ for pt in POOL_TYPE: ...@@ -14,6 +14,14 @@ for pt in POOL_TYPE:
for al in AGG_LEVEL: for al in AGG_LEVEL:
opts.append(pooling_layer(input=din, agg_level=al, pooling_type=pt())) opts.append(pooling_layer(input=din, agg_level=al, pooling_type=pt()))
for pt in POOL_TYPE:
opts.append(
pooling_layer(
input=din,
agg_level=AggregateLevel.TO_NO_SEQUENCE,
pooling_type=pt(),
stride=5))
opts.append( opts.append(
pooling_layer( pooling_layer(
input=din, pooling_type=MaxPooling(output_max_index=True))) input=din, pooling_type=MaxPooling(output_max_index=True)))
......
add_python_test(test_framework test_protobuf.py)
import paddle.v2.framework.proto.op_proto_pb2
import paddle.v2.framework.proto.attr_type_pb2
import unittest
class TestFrameworkProto(unittest.TestCase):
def test_all(self):
op_proto_lib = paddle.v2.framework.proto.op_proto_pb2
attr_type_lib = paddle.v2.framework.proto.attr_type_pb2
op_proto = op_proto_lib.OpProto()
ipt0 = op_proto.inputs.add()
ipt0.name = "a"
ipt0.comment = "the input of cosine op"
ipt1 = op_proto.inputs.add()
ipt1.name = "b"
ipt1.comment = "the other input of cosine op"
opt = op_proto.outputs.add()
opt.name = "output"
opt.comment = "the output of cosine op"
op_proto.comment = "cosine op, output = scale*cos(a, b)"
attr = op_proto.attrs.add()
attr.name = "scale"
attr.comment = "scale of cosine op"
attr.type = attr_type_lib.FLOAT
op_proto.type = "cos"
self.assertTrue(op_proto.IsInitialized())
...@@ -26,14 +26,22 @@ class client(object): ...@@ -26,14 +26,22 @@ class client(object):
holder[idx] = c_ptr holder[idx] = c_ptr
lib.paddle_set_dataset(self.c, holder, len(paths)) lib.paddle_set_dataset(self.c, holder, len(paths))
# return format: (record, errno)
# errno = 0: ok
# < 0: error
def next_record(self): def next_record(self):
p = ctypes.c_char_p() p = ctypes.c_char_p()
ret = ctypes.pointer(p) ret = ctypes.pointer(p)
size = lib.paddle_next_record(self.c, ret) size = lib.paddle_next_record(self.c, ret)
if size < 0:
# Error
return None, size
if size == 0: if size == 0:
# Empty record # Empty record
return "" return "", 0
record = ret.contents.value[:size] record = ret.contents.value[:size]
# Memory created from C should be freed. # Memory created from C should be freed.
lib.mem_free(ret.contents) lib.mem_free(ret.contents)
return record return record, 0
...@@ -57,17 +57,20 @@ def text_file(path): ...@@ -57,17 +57,20 @@ def text_file(path):
return reader return reader
def recordio(path): def recordio_local(paths, buf_size=100):
""" """
Creates a data reader that outputs record one one by one from given recordio file Creates a data reader from given RecordIO file paths separated by ",",
:path: path of recordio file glob pattern is supported.
:returns: data reader of recordio file :path: path of recordio files.
:returns: data reader of recordio files.
""" """
import recordio as rec import recordio as rec
import paddle.v2.reader.decorator as dec
def reader(): def reader():
f = rec.reader(path) a = ','.join(paths)
f = rec.reader(a)
while True: while True:
r = f.read() r = f.read()
if r is None: if r is None:
...@@ -75,4 +78,38 @@ def recordio(path): ...@@ -75,4 +78,38 @@ def recordio(path):
yield r yield r
f.close() f.close()
return dec.buffered(reader, buf_size)
def recordio(paths, buf_size=100):
"""
Creates a data reader that outputs record one one by one
from given local or cloud recordio path.
:path: path of recordio files.
:returns: data reader of recordio files.
"""
import os
import paddle.v2.master.client as cloud
if "KUBERNETES_SERVICE_HOST" not in os.environ.keys():
return recordio_local(paths)
host_name = "MASTER_SERVICE_HOST"
if host_name not in os.environ.keys():
raise Exception('not find ' + host_name + ' in environ.')
addr = os.environ(host)
def reader():
c = cloud(addr, buf_size)
c.set_dataset(paths)
while True:
r, err = client.next_record()
if err < 0:
break
yield r
c.close()
return reader return reader
...@@ -38,7 +38,7 @@ class TestRecordIO(unittest.TestCase): ...@@ -38,7 +38,7 @@ class TestRecordIO(unittest.TestCase):
def test_recordio(self): def test_recordio(self):
path = os.path.join( path = os.path.join(
os.path.dirname(__file__), "test_recordio_creator.dat") os.path.dirname(__file__), "test_recordio_creator.dat")
reader = paddle.v2.reader.creator.recordio(path) reader = paddle.v2.reader.creator.recordio([path])
for idx, r in enumerate(reader()): for idx, r in enumerate(reader()):
self.assertSequenceEqual(r, str(idx)) self.assertSequenceEqual(r, str(idx))
......
...@@ -9,7 +9,9 @@ packages=['paddle', ...@@ -9,7 +9,9 @@ packages=['paddle',
'paddle.v2.dataset', 'paddle.v2.dataset',
'paddle.v2.reader', 'paddle.v2.reader',
'paddle.v2.master', 'paddle.v2.master',
'paddle.v2.plot'] 'paddle.v2.plot',
'paddle.v2.framework',
'paddle.v2.framework.proto']
setup_requires=["requests", setup_requires=["requests",
"numpy", "numpy",
...@@ -27,8 +29,11 @@ setup(name='paddle', ...@@ -27,8 +29,11 @@ setup(name='paddle',
description='Parallel Distributed Deep Learning', description='Parallel Distributed Deep Learning',
install_requires=setup_requires, install_requires=setup_requires,
packages=packages, packages=packages,
package_data={'paddle.v2.master': ['${paddle_master_LIB_NAME}'], }, package_data={'paddle.v2.master': ['libpaddle_master.so'], },
package_dir={ package_dir={
'': '${CMAKE_CURRENT_SOURCE_DIR}' '': '${CMAKE_CURRENT_SOURCE_DIR}',
# The paddle.v2.framework.proto will be generated while compiling.
# So that package points to other directory.
'paddle.v2.framework.proto': '${CMAKE_BINARY_DIR}/paddle/framework'
}, },
) )
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册