提交 7663a40c 编写于 作者: G gongweibao

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into taskfail

......@@ -42,7 +42,7 @@ before_install:
function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; }
script:
- |
timeout 2580 paddle/scripts/travis/${JOB}.sh # 43min timeout
export WITH_GOLANG=ON && timeout 2580 paddle/scripts/travis/${JOB}.sh # 43min timeout
RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true; else false; fi;
notifications:
email:
......
| Github account | name |
|---|---|
| reyoung | Yang Yu |
| backyes | Yan-Fei Wang |
| beckett1124 | Bin Qi |
| Canpio | Jia-Yi Feng |
| chengxiaohua1105 | Xiao-Hua Cheng |
| cxwangyi, yiwangbaidu, wangkuiyi | Yi Wang |
| cxysteven | Xing-Yi Cheng |
| dzhwinter | Zhi-Hong Dong |
| emailweixu | Wei Xu |
| gangliao | Gang Liao |
| luotao01 | Tao Luo |
| jacquesqiao | Long-Fei Qiao |
| qingqing01 | Qing-Qing Dang |
| gongweibao | Wei-Bao Gong |
| Guo Sheng | Sheng Guo |
| Haichao-Zhang | Hai-Chao Zhang |
| hedaoyuan | Dao-Yuan He |
| wangyang59 | Yang Wang |
| helinwang | He-Lin Wang |
| jacquesqiao | Long-Fei Qiao |
| kuke | Yi-Bing Liu |
| lcy-seso | Ying Cao |
| lipeng-unisound | Peng Li |
| liuyuan | Yuan Liu |
| livc | Zhao Li |
| llxxxll | Yong-Feng Liu |
| luotao01 | Tao Luo |
| lzhao4ever | Liang Zhao |
| NHZlX | Zhao-Long Xing |
| pakchoi | Chuan-Jiang Song |
| pengli09 | Peng Li |
| pkuyym | Ya-Ming Yang |
| QiJune | Jun Qi |
| qingqing01 | Qing-Qing Dang |
| reyoung | Yang Yu |
| Superjom | Chun-Wei Yan |
| tianbingsz | Tian-Bing Xu |
| cxwangyi, yiwangbaidu, wangkuiyi | Yi Wang |
| typhoonzero | Yi Wu |
| backyes | Yan-Fei Wang |
| pengli09 | Peng Li |
| livc | Zhao Li |
| wanghaoshuang | Hao-Shuang Wang |
| wangyang59 | Yang Wang |
| wangzhen-nlp | Zhen Wang |
| wen-bo-yang | Wen-Bo Yang |
| wwhu | Wei-Wei Hu |
| xinghai-sun | Xing-Hai Sun |
| Xreki | Yi-Qun Liu |
| xujun05 | Jun Xu |
| xushaoyong | Shao-Yong Xu |
| Yancey1989 | Xu Yan |
| emailweixu | Wei Xu |
| wen-bo-yang | Wen-Bo Yang |
| helinwang | He-Lin Wang |
| lcy-seso | Ying Cao |
| Zrachel | Rui-Qing Zhang |
| Haichao-Zhang | Hai-Chao Zhang |
| gongweibao | Wei-Bao Gong |
| lzhao4ever | Liang Zhao |
| zhaopu7 | Pu Zhao |
| zhouxiao-coder | Xiao Zhou |
| lipeng-unisound | Peng Li |
| Zrachel | Rui-Qing Zhang |
......@@ -113,7 +113,7 @@ include(coveralls) # set code coverage
include_directories("${PROJ_ROOT}")
include_directories("${PROJ_ROOT}/paddle/cuda/include")
include_directories("${CMAKE_CURRENT_BINARY_DIR}/proto")
include_directories("${CMAKE_CURRENT_BINARY_DIR}/go/pserver/cclient")
include_directories("${CMAKE_CURRENT_BINARY_DIR}/go/pserver/client/c")
include_directories(${Boost_INCLUDE_DIRS})
set(EXTERNAL_LIBS
......
......@@ -2,10 +2,10 @@ INCLUDE(ExternalProject)
SET(ANY_SOURCE_DIR ${THIRD_PARTY_PATH}/any)
INCLUDE_DIRECTORIES(${ANY_SOURCE_DIR}/src/linb_any)
INCLUDE_DIRECTORIES(${ANY_SOURCE_DIR}/src/extern_lib_any)
ExternalProject_Add(
linb_any
extern_lib_any
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/thelink2012/any.git"
GIT_TAG "8fef1e93710a0edf8d7658999e284a1142c4c020"
......@@ -17,5 +17,15 @@ ExternalProject_Add(
TEST_COMMAND ""
)
# Expose the ExternalProject `extern_lib_any` through a regular library target
# named `lib_any`, so other targets can list `lib_any` as a dependency.
#
# For CMake older than 3.3.0 a STATIC library built from a generated one-line
# dummy C file is used; newer versions use an INTERFACE library instead.
# NOTE(review): presumably because add_dependencies() on INTERFACE libraries
# is only supported from CMake 3.3 on -- confirm against the CMake docs.
if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
# The dummy source is written into the binary dir at configure time.
set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/lib_any_dummy.c)
file(WRITE ${dummyfile} "const char * dummy_any = \"${dummyfile}\";")
add_library(lib_any STATIC ${dummyfile})
else()
add_library(lib_any INTERFACE)
endif()
# Building lib_any first triggers the external project's download step.
add_dependencies(lib_any extern_lib_any)
add_definitions(-DANY_IMPL_ANY_CAST_MOVEABLE)
LIST(APPEND external_project_dependencies linb_any)
\ No newline at end of file
LIST(APPEND external_project_dependencies lib_any)
......@@ -2,10 +2,10 @@ INCLUDE(ExternalProject)
SET(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3)
INCLUDE_DIRECTORIES(${EIGEN_SOURCE_DIR}/src/eigen3)
INCLUDE_DIRECTORIES(${EIGEN_SOURCE_DIR}/src/extern_eigen3)
ExternalProject_Add(
eigen3
extern_eigen3
${EXTERNAL_PROJECT_LOG_ARGS}
# for latest version, please get from official website
# URL "https://bitbucket.org/eigen/eigen/get/3.3.4.tar.gz"
......@@ -26,4 +26,14 @@ ExternalProject_Add(
TEST_COMMAND ""
)
# Expose the ExternalProject `extern_eigen3` through a regular library target
# named `eigen3`, so other targets can list `eigen3` as a dependency.
#
# For CMake older than 3.3.0 a STATIC library built from a generated one-line
# dummy C file is used; newer versions use an INTERFACE library instead.
# NOTE(review): presumably because add_dependencies() on INTERFACE libraries
# is only supported from CMake 3.3 on -- confirm against the CMake docs.
if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
# The dummy source is written into the binary dir at configure time.
set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/eigen3_dummy.c)
file(WRITE ${dummyfile} "const char * dummy_eigen3 = \"${dummyfile}\";")
add_library(eigen3 STATIC ${dummyfile})
else()
add_library(eigen3 INTERFACE)
endif()
# Building eigen3 first triggers the external project's steps.
add_dependencies(eigen3 extern_eigen3)
# Register the wrapper target in the list of third-party dependencies.
LIST(APPEND external_project_dependencies eigen3)
......@@ -162,6 +162,7 @@ function(cc_library TARGET_NAME)
endif()
if (cc_library_DEPS)
add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
endif()
else(cc_library_SRCS)
if (cc_library_DEPS)
......@@ -191,9 +192,9 @@ function(cc_test TARGET_NAME)
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_executable(${TARGET_NAME} ${cc_test_SRCS})
target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} gtest gtest_main)
target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} gtest gtest_main -lstdc++ -lm)
add_dependencies(${TARGET_NAME} ${cc_test_DEPS} gtest gtest_main)
add_test(${TARGET_NAME} ${TARGET_NAME})
add_test(NAME ${TARGET_NAME} COMMAND ${TARGET_NAME} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
endif()
endfunction(cc_test)
......@@ -211,6 +212,7 @@ function(nv_library TARGET_NAME)
endif()
if (nv_library_DEPS)
add_dependencies(${TARGET_NAME} ${nv_library_DEPS})
target_link_libraries(${TARGET_NAME} ${nv_library_DEPS})
endif()
else(nv_library_SRCS)
if (nv_library_DEPS)
......@@ -279,10 +281,11 @@ function(go_library TARGET_NAME)
file(GLOB GO_SOURCE RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.go")
string(REPLACE "${PADDLE_GO_PATH}/" "" CMAKE_CURRENT_SOURCE_REL_DIR ${CMAKE_CURRENT_SOURCE_DIR})
# FIXME: link path
add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
COMMAND rm "${${TARGET_NAME}_LIB_PATH}"
# Golang build source code
COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build ${BUILD_MODE}
COMMAND env LIBRARY_PATH=${CMAKE_BINARY_DIR}/go/pserver/client/c/:$ENV{LIBRARY_PATH} GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build ${BUILD_MODE}
-o "${${TARGET_NAME}_LIB_PATH}"
"./${CMAKE_CURRENT_SOURCE_REL_DIR}/${GO_SOURCE}"
# must run under GOPATH
......@@ -297,11 +300,13 @@ function(go_binary TARGET_NAME)
cmake_parse_arguments(go_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
string(REPLACE "${PADDLE_GO_PATH}/" "" CMAKE_CURRENT_SOURCE_REL_DIR ${CMAKE_CURRENT_SOURCE_DIR})
# FIXME: link path
add_custom_command(OUTPUT ${TARGET_NAME}_timestamp
COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build
COMMAND env LIBRARY_PATH=${CMAKE_BINARY_DIR}/go/pserver/client/c/:$ENV{LIBRARY_PATH}
GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build
-o "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}"
"./${CMAKE_CURRENT_SOURCE_REL_DIR}/${go_binary_SRCS}"
WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go")
WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go")
# TODO: don't know what ${TARGET_NAME}_link does
add_custom_target(${TARGET_NAME} ALL DEPENDS go_vendor ${TARGET_NAME}_timestamp ${go_binary_DEPS})
install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME} DESTINATION bin)
......@@ -323,10 +328,10 @@ endfunction(go_test)
function(proto_library TARGET_NAME)
set(oneValueArgs "")
set(multiValueArgs SRCS)
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(proto_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(proto_srcs)
set(proto_hdrs)
protobuf_generate_cpp(proto_srcs proto_hdrs ${proto_library_SRCS})
cc_library(${TARGET_NAME} SRCS ${proto_srcs} DEPS protobuf)
cc_library(${TARGET_NAME} SRCS ${proto_srcs} DEPS ${proto_library_DEPS} protobuf)
endfunction()
......@@ -445,6 +445,11 @@ smooth_l1_cost
.. autoclass:: paddle.v2.layer.smooth_l1_cost
:noindex:
multibox_loss
--------------
.. autoclass:: paddle.v2.layer.multibox_loss
:noindex:
Check Layer
============
......@@ -468,3 +473,11 @@ prelu
--------
.. autoclass:: paddle.v2.layer.prelu
:noindex:
Detection output Layer
======================
detection_output
----------------
.. autoclass:: paddle.v2.layer.detection_output
:noindex:
......@@ -13,7 +13,7 @@
# limitations under the License.
#
add_subdirectory(pserver/cclient)
add_subdirectory(pserver/client/c)
add_subdirectory(cmd/pserver)
add_subdirectory(cmd/master)
add_subdirectory(master/c)
......@@ -12,4 +12,4 @@
# See the License for the specific language governing permissions and
# limitations under the License.
go_binary(master SRC master.go)
go_binary(master SRC master.go DEPS paddle_go_optimizer)
......@@ -12,4 +12,4 @@
# See the License for the specific language governing permissions and
# limitations under the License.
go_binary(pserver SRCS pserver.go)
go_binary(pserver SRCS pserver.go DEPS paddle_go_optimizer)
......@@ -15,6 +15,7 @@ import (
func main() {
port := flag.Int("port", 0, "port of the pserver")
index := flag.Int("index", -1, "index of this pserver, should be larger or equal than 0")
etcdEndpoint := flag.String("etcd-endpoint", "http://127.0.0.1:2379",
"comma separated endpoint string for pserver to connect to etcd")
etcdTimeout := flag.Int("etcd-timeout", 5, "timeout for etcd calls")
......@@ -29,11 +30,16 @@ func main() {
}
log.SetLevel(level)
timeout := time.Second * time.Duration((*etcdTimeout))
e := pserver.NewEtcdClient(*etcdEndpoint, *numPservers, timeout)
idx, err := e.Register()
if err != nil {
panic(err)
var idx int
if *index >= 0 {
idx = *index
} else {
timeout := time.Second * time.Duration((*etcdTimeout))
e := pserver.NewEtcdClient(*etcdEndpoint, *numPservers, timeout)
idx, err = e.Register()
if err != nil {
panic(err)
}
}
s, err := pserver.NewService(idx)
......
go_library(paddle_master SHARED)
go_library(paddle_master SHARED DEPS paddle_go_optimizer)
......@@ -50,7 +50,7 @@ func NewEtcdClient(endpoints []string, addr string, lockPath, addrPath, statePat
lock := concurrency.NewMutex(sess, lockPath)
// It's fine for the lock to get stuck, in this case we have
// multiple master servers running (only configured to have
// one master running, but split-brain problem may cuase
// one master running, but split-brain problem may cause
// multiple master servers running), and the cluster management
// software will kill one of them.
log.Debugf("Trying to acquire lock at %s.", lockPath)
......@@ -98,7 +98,7 @@ func (e *EtcdClient) Save(state []byte) error {
// We lost the master lock and can not acquire
// it back, it means some other master is
// already started. We don't want cluster
// managment system to kill the master server
// management system to kill the master server
// who is holding the lock and running
// correctly. So the most feasible solution is
// to kill current master server. The current
......
cc_library(paddle_go_optimizer DEPS paddle_optimizer paddle_proto glog gflags protobuf)
go_library(paddle_pserver_cclient STATIC)
go_library(paddle_pserver_cclient STATIC DEPS paddle_go_optimizer)
if(WITH_TESTING)
add_subdirectory(test)
# TODO: add unit test
#add_subdirectory(test)
endif()
......@@ -30,15 +30,16 @@ import (
"unsafe"
"github.com/PaddlePaddle/Paddle/go/pserver"
"github.com/PaddlePaddle/Paddle/go/pserver/client"
log "github.com/sirupsen/logrus"
)
var nullPtr = unsafe.Pointer(uintptr(0))
var mu sync.Mutex
var handleMap = make(map[C.paddle_pserver_client]*pserver.Client)
var handleMap = make(map[C.paddle_pserver_client]*client.Client)
var curHandle C.paddle_pserver_client
func add(c *pserver.Client) C.paddle_pserver_client {
func add(c *client.Client) C.paddle_pserver_client {
mu.Lock()
defer mu.Unlock()
client := curHandle
......@@ -47,13 +48,13 @@ func add(c *pserver.Client) C.paddle_pserver_client {
return client
}
func get(client C.paddle_pserver_client) *pserver.Client {
func get(client C.paddle_pserver_client) *client.Client {
mu.Lock()
defer mu.Unlock()
return handleMap[client]
}
func remove(client C.paddle_pserver_client) *pserver.Client {
func remove(client C.paddle_pserver_client) *client.Client {
mu.Lock()
defer mu.Unlock()
h := handleMap[client]
......@@ -80,9 +81,9 @@ func (s selector) Select() bool {
return bool(s)
}
type lister []pserver.Server
type lister []client.Server
func (l lister) List() []pserver.Server {
func (l lister) List() []client.Server {
return l
}
......@@ -90,19 +91,22 @@ func (l lister) List() []pserver.Server {
func paddle_new_pserver_client(addrs *C.char, selected int) C.paddle_pserver_client {
a := C.GoString(addrs)
as := strings.Split(a, ",")
servers := make([]pserver.Server, len(as))
servers := make([]client.Server, len(as))
for i := range as {
servers[i].Index = i
servers[i].Addr = as[i]
}
c := pserver.NewClient(lister(servers), len(as), selector(selected != 0))
c := client.NewClient(lister(servers), len(as), selector(selected != 0))
return add(c)
}
//export paddle_new_etcd_pserver_client
func paddle_new_etcd_pserver_client(etcd_addr *C.char) C.paddle_pserver_client {
// TODO(helin): fault tolerant pserver client using etcd.
panic("not implemented.")
func paddle_new_etcd_pserver_client(etcd_endpoints *C.char, selected int) C.paddle_pserver_client {
// TODO(Longfei): use etcd lock to decide which trainer to initialize the parameters.
addr := C.GoString(etcd_endpoints)
etcd_client := client.NewEtcd(addr)
c := client.NewClient(etcd_client, etcd_client.Desired(), selector(selected != 0))
return add(c)
}
//export paddle_pserver_client_release
......
cc_test(test_cclient SRCS test_cclient.c DEPS paddle_pserver_cclient)
cc_test(test_cclient SRCS test_cclient.c DEPS paddle_pserver_cclient paddle_go_optimizer)
add_style_check_target(test_cclient test_cclient.c)
package pserver
package client
import (
"errors"
......@@ -7,6 +7,7 @@ import (
"time"
"github.com/PaddlePaddle/Paddle/go/connection"
"github.com/PaddlePaddle/Paddle/go/pserver"
log "github.com/sirupsen/logrus"
)
......@@ -105,7 +106,7 @@ func (c *Client) BeginInitParams() bool {
}
// InitParam initializes the parameter on parameter servers.
func (c *Client) InitParam(paramWithConfigs ParameterWithConfig) error {
func (c *Client) InitParam(paramWithConfigs pserver.ParameterWithConfig) error {
return c.pservers[c.partition(paramWithConfigs.Param.Name)].Call("Service.InitParam", paramWithConfigs, nil)
}
......@@ -123,13 +124,13 @@ func (c *Client) FinishInitParams() error {
// SendGrads sends gradients to parameter servers for updating
// parameters.
func (c *Client) SendGrads(grads []Gradient) error {
func (c *Client) SendGrads(grads []pserver.Gradient) error {
if len(grads) == 0 {
return errors.New("no gradient received")
}
errCh := make(chan error, len(grads))
for _, g := range grads {
go func(g Gradient) {
go func(g pserver.Gradient) {
err := c.pservers[c.partition(g.Name)].Call("Service.SendGrad", g, nil)
errCh <- err
}(g)
......@@ -151,7 +152,7 @@ func (c *Client) SendGrads(grads []Gradient) error {
type result struct {
idx int
param Parameter
param pserver.Parameter
err error
}
......@@ -170,12 +171,12 @@ func (r results) Swap(i int, j int) {
}
// GetParams gets parameters from parameter servers.
func (c *Client) GetParams(names []string) ([]Parameter, error) {
func (c *Client) GetParams(names []string) ([]pserver.Parameter, error) {
rCh := make(chan result, len(names))
for idx, name := range names {
go func(name string, idx int) {
var parameter Parameter
var parameter pserver.Parameter
err := c.pservers[c.partition(name)].Call("Service.GetParam", name, &parameter)
rCh <- result{idx: idx, param: parameter, err: err}
}(name, idx)
......@@ -196,7 +197,7 @@ func (c *Client) GetParams(names []string) ([]Parameter, error) {
}
sort.Sort(rs)
ps := make([]Parameter, len(rs))
ps := make([]pserver.Parameter, len(rs))
for i := range rs {
ps[i] = rs[i].param
}
......
package pserver_test
package client_test
import (
"context"
"io/ioutil"
"net"
"net/http"
......@@ -8,15 +9,25 @@ import (
"strconv"
"strings"
"testing"
"time"
"github.com/PaddlePaddle/Paddle/go/pserver"
"github.com/PaddlePaddle/Paddle/go/pserver/client"
"github.com/coreos/etcd/clientv3"
log "github.com/sirupsen/logrus"
)
const numPserver = 10
const (
numPserver = 10
etcdEndpoints = "127.0.0.1:2379"
timeout = 2 * time.Second
)
var port [numPserver]int
var pserverClientPorts [numPserver]int
func init() {
// this function init pserver client and return their ports in an array.
func initClient() [numPserver]int {
var ports [numPserver]int
for i := 0; i < numPserver; i++ {
l, err := net.Listen("tcp", ":0")
if err != nil {
......@@ -28,7 +39,7 @@ func init() {
if err != nil {
panic(err)
}
port[i] = p
ports[i] = p
go func(l net.Listener) {
s, err := pserver.NewService(0)
......@@ -49,6 +60,31 @@ func init() {
}
}(l)
}
return ports
}
func initNativeClient() {
pserverClientPorts = initClient()
}
// initEtcdClient prepares etcd for the etcd-backed client test: it clears the
// pserver registry, publishes the desired pserver count, starts the local
// pservers via initClient and registers each of their addresses under
// pserver.PsPath + <index>.
//
// Any etcd failure panics, matching how initClient reports setup errors;
// continuing with a nil or half-initialised client would only fail later with
// a less useful message.
func initEtcdClient() {
	// Named cli (not client) so the imported client package is not shadowed.
	cli, err := clientv3.New(clientv3.Config{
		Endpoints:   []string{etcdEndpoints},
		DialTimeout: time.Second * time.Duration(1),
	})
	if err != nil {
		panic(err)
	}
	defer cli.Close()

	// One deadline covers the whole setup sequence.
	ctx, cancel := context.WithTimeout(context.Background(), timeout)
	defer cancel()

	if _, err = cli.Delete(ctx, pserver.PsDesired); err != nil {
		panic(err)
	}
	// Pservers are stored under PsPath + <index>, so the whole prefix has to
	// be removed; deleting the bare PsPath key would leave stale entries.
	if _, err = cli.Delete(ctx, pserver.PsPath, clientv3.WithPrefix()); err != nil {
		panic(err)
	}
	if _, err = cli.Put(ctx, pserver.PsDesired, strconv.Itoa(numPserver)); err != nil {
		panic(err)
	}

	ports := initClient()
	for i := 0; i < numPserver; i++ {
		key := pserver.PsPath + strconv.Itoa(i)
		addr := ":" + strconv.Itoa(ports[i])
		if _, err = cli.Put(ctx, key, addr); err != nil {
			panic(err)
		}
	}
}
type selector bool
......@@ -57,25 +93,20 @@ func (s selector) Select() bool {
return bool(s)
}
type lister []pserver.Server
type lister []client.Server
func (l lister) List() []pserver.Server {
func (l lister) List() []client.Server {
return l
}
func TestClientFull(t *testing.T) {
servers := make([]pserver.Server, numPserver)
for i := 0; i < numPserver; i++ {
servers[i] = pserver.Server{Index: i, Addr: ":" + strconv.Itoa(port[i])}
}
c := pserver.NewClient(lister(servers), len(servers), selector(true))
func ClientTest(t *testing.T, c *client.Client) {
selected := c.BeginInitParams()
if !selected {
t.Fatal("should be selected.")
}
const numParameter = 100
config, err := ioutil.ReadFile("./cclient/test/testdata/optimizer.pb")
config, err := ioutil.ReadFile("./c/test/testdata/optimizer.pb")
if err != nil {
t.Fatalf("read optimizer proto failed")
}
......@@ -129,3 +160,21 @@ func TestClientFull(t *testing.T) {
}
}
}
// TestNativeClient runs the shared client test suite against a client that is
// given a static list of locally started pservers.
func TestNativeClient(t *testing.T) {
	initNativeClient()

	// One Server entry per started pserver, addressed by its listening port.
	var servers []client.Server
	for idx, p := range pserverClientPorts {
		servers = append(servers, client.Server{Index: idx, Addr: ":" + strconv.Itoa(p)})
	}

	ClientTest(t, client.NewClient(lister(servers), len(servers), selector(true)))
}
// EtcdClient runs the shared client test suite against a client that
// discovers its pservers through etcd.
// TODO: temporarily disabled because it depends on a running etcd.
// NOTE: the name has no "Test" prefix, so `go test` does not pick it up.
func EtcdClient(t *testing.T) {
initEtcdClient()
etcd_client := client.NewEtcd(etcdEndpoints)
c2 := client.NewClient(etcd_client, etcd_client.Desired(), selector(true))
ClientTest(t, c2)
}
package client
import (
"context"
"strconv"
"strings"
"time"
"github.com/PaddlePaddle/Paddle/go/pserver"
"github.com/coreos/etcd/clientv3"
log "github.com/sirupsen/logrus"
)
const (
// DefaultEtcdTimeout is the timeout NewEtcd uses as the etcd dial timeout,
// as its retry back-off, and as the timeout stored in the EtcdClient it
// returns.
DefaultEtcdTimeout time.Duration = 5 * time.Second
)
// EtcdClient is used by the pserver client that is a part of the trainer process.
// TODO:
// 1. add a watcher to watch the state changes of pservers
// 2. add an etcd lock
type EtcdClient struct {
// client is the underlying etcd v3 client.
client *clientv3.Client
// timeout bounds each etcd request and is also the sleep between retries.
timeout time.Duration
// endpoints is the list of etcd endpoints the client was created with.
endpoints []string
}
// Desired reads the desired number of pservers from etcd.
//
// It blocks, retrying every p.timeout, until the pserver.PsDesired key exists
// and holds a valid integer, and then returns that integer.
func (p *EtcdClient) Desired() int {
	for {
		ctx, cancel := context.WithTimeout(context.Background(), p.timeout)
		resp, err := p.client.Get(ctx, pserver.PsDesired)
		cancel()
		if err != nil {
			log.Errorf("Get ps desired number failed! reconnecting..., %v", err)
			time.Sleep(p.timeout)
			continue
		}

		if len(resp.Kvs) == 0 {
			log.Infoln("Waiting for ps desired registered ...")
			time.Sleep(p.timeout)
			continue
		}

		raw := string(resp.Kvs[0].Value)
		psDesired, err := strconv.Atoi(raw)
		if err != nil {
			// Log the raw etcd value: the parsed int is always 0 here and
			// would hide what was actually stored.
			log.Errorf("psDesired %q invalid %v", raw, err)
			time.Sleep(p.timeout)
			continue
		}

		log.Debugf("Get psDesired number: %d", psDesired)
		return psDesired
	}
}
// List returns the pserver list read from etcd.
//
// It first obtains the desired pserver count via Desired, then reads the
// address stored under pserver.PsPath + <index> for every index. Each lookup
// is retried every p.timeout until a non-empty address is found, so the call
// blocks until all desired pservers have registered and every returned entry
// is filled in.
func (p *EtcdClient) List() []Server {
	psDesired := p.Desired()

	servers := make([]Server, psDesired)
	for i := 0; i < psDesired; i++ {
		psKey := pserver.PsPath + strconv.Itoa(i)

		// Retry the same key until it yields an address; moving on to the
		// next index would leave servers[i] empty.
		for {
			log.Debugf("checking %s", psKey)

			ctx, cancel := context.WithTimeout(context.Background(), p.timeout)
			resp, err := p.client.Get(ctx, psKey)
			// Release the context only after the request has completed.
			cancel()
			if err != nil {
				log.Infof("Get psKey= %s error, %v", psKey, err)
				time.Sleep(p.timeout)
				continue
			}

			if len(resp.Kvs) == 0 {
				log.Infof("Waiting for ps addr registered ...")
				time.Sleep(p.timeout)
				continue
			}

			psAddr := string(resp.Kvs[0].Value)
			// TODO(Longfei) check the ps address
			if psAddr == "" {
				log.Infof("Get psKey = %s, psAddr is empty", psKey)
				time.Sleep(p.timeout)
				continue
			}

			log.Infof("got value (%s) for key: %s", psAddr, psKey)
			servers[i].Index = i
			servers[i].Addr = psAddr
			break
		}
	}

	return servers
}
// NewEtcd creates an EtcdClient that reports the state of the pservers stored
// in etcd. endpoints is a comma separated list of etcd endpoints. The call
// keeps retrying, sleeping DefaultEtcdTimeout between attempts, until the
// etcd client can be created.
func NewEtcd(endpoints string) *EtcdClient {
	ep := strings.Split(endpoints, ",")
	cfg := clientv3.Config{
		Endpoints:   ep,
		DialTimeout: DefaultEtcdTimeout,
	}

	cli, err := clientv3.New(cfg)
	for err != nil {
		log.Errorf("Init etcd connection failed: %v", err)
		time.Sleep(DefaultEtcdTimeout)
		cli, err = clientv3.New(cfg)
	}
	log.Infof("Connected to etcd: %s\n", endpoints)

	return &EtcdClient{
		client:    cli,
		timeout:   DefaultEtcdTimeout,
		endpoints: ep,
	}
}
......@@ -13,6 +13,13 @@ import (
log "github.com/sirupsen/logrus"
)
const (
// PsDesired is the etcd key that stores the desired pserver count.
PsDesired = "/ps_desired"
// PsPath is the etcd key prefix under which each pserver stores its
// address; the pserver index is appended to form the full key.
PsPath = "/ps/"
)
// EtcdClient is the etcd client that the pserver uses for fault
// tolerance, service registry and coordination.
type EtcdClient struct {
......@@ -68,7 +75,7 @@ func (e *EtcdClient) Register() (int, error) {
// it at the same time.
for {
ctx, cancel := context.WithTimeout(context.Background(), time.Second)
_, err := e.initDesiredPsercers(ctx, e.numPservers)
_, err := e.initDesiredPservers(ctx, e.numPservers)
cancel()
if err != nil {
log.Warn(err)
......@@ -120,7 +127,7 @@ func (e *EtcdClient) Register() (int, error) {
return pserverIdx, nil
}
func (e *EtcdClient) initDesiredPsercers(ctx context.Context, numPservers int) (*clientv3.TxnResponse, error) {
func (e *EtcdClient) initDesiredPservers(ctx context.Context, numPservers int) (*clientv3.TxnResponse, error) {
return concurrency.NewSTM(e.etcdClient, func(c concurrency.STM) error {
dsStr := c.Get(PsDesired)
if dsStr == "" {
......@@ -136,7 +143,7 @@ func (e *EtcdClient) registerPserverEtcd(ctx context.Context) (int, error) {
_, err := concurrency.NewSTM(e.etcdClient, func(c concurrency.STM) error {
registered := false
for i := 0; i < e.desired; i++ {
psKey := "/ps/" + strconv.Itoa(i)
psKey := PsPath + strconv.Itoa(i)
log.Debugf("checking %s", psKey)
ps := c.Get(psKey)
log.Debugf("got value (%s) for key: %s", ps, psKey)
......
package pserver
// #cgo CFLAGS: -I ../../
// //FIXME: ldflags contain "build" path
// #cgo LDFLAGS: ../../build/go/pserver/cclient/libpaddle_go_optimizer.a -lstdc++
// #cgo LDFLAGS: -lpaddle_go_optimizer -lstdc++ -lm
// #include "paddle/optimizer/optimizer.h"
// #include <stdlib.h>
// #include <string.h>
......
......@@ -11,7 +11,7 @@ func TestOptimizerCreateRelease(t *testing.T) {
ElementType: Int32,
}
p.Content = []byte{1, 3}
config, err := ioutil.ReadFile("./cclient/test/testdata/optimizer.pb")
config, err := ioutil.ReadFile("./client/c/test/testdata/optimizer.pb")
if err != nil {
t.Fatalf("read optimizer proto failed")
}
......
......@@ -24,9 +24,6 @@ const (
Float64
)
// PsDesired is etcd path for store desired pserver count
const PsDesired = "/ps_desired"
// Parameter is a piece of data to sync with the parameter server.
type Parameter struct {
Name string
......
......@@ -10,6 +10,10 @@ import (
"github.com/PaddlePaddle/Paddle/go/pserver"
)
const (
OptimizerConfig = "./client/c/test/testdata/optimizer.pb"
)
func TestServiceFull(t *testing.T) {
s, err := pserver.NewService(0)
if err != nil {
......@@ -19,7 +23,7 @@ func TestServiceFull(t *testing.T) {
p.Name = "param_a"
p.Content = []byte{1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0}
p.ElementType = pserver.Int32
config, err := ioutil.ReadFile("./cclient/test/testdata/optimizer.pb")
config, err := ioutil.ReadFile(OptimizerConfig)
if err != nil {
t.Fatalf("read optimizer proto failed")
}
......@@ -149,7 +153,7 @@ func TestBlockUntilInitialized(t *testing.T) {
p.Name = "param_a"
p.Content = []byte{1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0}
p.ElementType = pserver.Int32
config, err := ioutil.ReadFile("./cclient/test/testdata/optimizer.pb")
config, err := ioutil.ReadFile(OptimizerConfig)
if err != nil {
t.Fatalf("read optimizer proto failed")
}
......
......@@ -66,6 +66,7 @@ SWIG_LINK_LIBRARIES(swig_paddle
paddle_trainer_lib
paddle_network
paddle_parameter
paddle_optimizer
paddle_math
paddle_utils
paddle_proto
......
......@@ -2,9 +2,13 @@
cc_library(ddim SRCS ddim.cc)
cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
nv_test(dim_test SRCS dim_test.cu DEPS ddim)
cc_test(tensor_test SRCS tensor_test.cc DEPS ddim)
cc_test(variable_test SRCS variable_test.cc)
cc_test(scope_test SRCS scope_test.cc)
cc_test(enforce_test SRCS enforce_test.cc)
proto_library(attr_type SRCS attr_type.proto)
proto_library(op_proto SRCS op_proto.proto)
cc_test(op_proto_test SRCS op_proto_test.cc DEPS op_proto attr_type protobuf)
proto_library(op_proto SRCS op_proto.proto DEPS attr_type)
cc_test(op_proto_test SRCS op_proto_test.cc DEPS op_proto protobuf)
proto_library(op_desc SRCS op_desc.proto DEPS attr_type)
cc_test(op_desc_test SRCS op_desc_test.cc DEPS op_desc protobuf)
# Network Design
`Network` is the container and controller of a set of operators,
user can build a real network from a `NetDesc` which is a protobuf message
and use `Network.Run()` to run all the operators in the network.
A network object knows all Operators belonging to this network. Variables,
which are inputs and outputs of these operators,
are created and managed by a hierarchy of Scope objects.
# API
## Net
To make the `Network` extendable, a base class is defined like this
```c++
// operator's index stored in a network.
typedef int OpIndex;
// The minimum a network should be implemented.
class Net {
public:
// run all the operators and return success(true) or not, with all the
// variables are located in `scope`. `context` describes the detail execution
// environment for ops. `begin` and `end` specify the scope of `ops_` to run,
// If no positive indexes are provided, all operators in `ops_` will run.
virtual Error Run(Scope *scope, OpContext *context, OpIndex begin = -1,
OpIndex end = -1) const = 0;
// Add an Operator according to `def`.
virtual OpIndex AddOp(const proto::OpDef &def) = 0;
// Add optimizer operators according to `attrs`.
virtual Error AddOptimizerOps(const OptAttrs &attrs) = 0;
// Add backward operators.
virtual Error AddBackwardOps() = 0;
// Infer the shapes of variables required by operators in the network. The
// `scope` will be mutated according to the inferred shapes.
static std::unique_ptr<Net> Create(const NetDesc &def = NetDesc());
};
```
All network implementations should build networks from a protobuf message which
describes the structure of a real network; `Run` method should be implemented by
all implementations to offer a universal method to forward or backward compute a network.
`Net::Create` is a method of factory pattern and can be implemented like
```c++
std::unique_ptr<Net> Net::Create(const NetDesc& def) {
switch (def.model_type()) {
case NN:
return new Network(def);
case Recursive:
return new RecursiveNet(def);
case Recurrent:
return new RecurrentNet(def);
}
return nullptr;
}
```
Network is designed as the container of operators. To make it more extendable,
we decouple it from the related variable resources.
`Run(Scope* scope)` takes the scope as an argument so that it can run in different scopes.
Finally, `Net` can be used as follows
```c++
Scope default_scope;
OpContext default_context;
auto net = Net::CreateNet(def);
if (net) {
net.Run(&default_scope, &default_context);
}
```
## `PlainNet` as a simple implementation of `BaseNet`
A very basic implementation is as follows. All it does is simply run every operator in sequence.
```c++
class PlainNet : public Net {
public:
// Create a network described by `def`. NetDesc is the definition of a network.
PlainNet(const NetDesc &def);
// Infer all the operators' input and output variables' shapes, will be called
// before every mini-batch training.
virtual Error InferShape(Scope *scope) override;
// Run all the operators with the `scope`, if no scope is provided, default
// scope will be used instead. If no OpContext is provided, default context will be used.
virtual Error Run(Scope *scope = nullptr, OpContext *context=nullptr, OpIndex begin = -1,
OpIndex end = -1) const override;
virtual OpIndex AddOp(const proto::OpDef &def) override;
virtual Error AddOptimizerOps(const OptAttrs &attrs) override;
virtual Error AddBackwardOps() override;
protected:
// Create operators according to `def`, will be called by the constructor.
Error BuildNet(const NetDesc &def);
// Add an operator which is identified as `type` and has attributes described
// in `attrs`, the `inputs` are the keys of readonly input variables,
// `outputs` are keys of mutable output variables. An `OpIndex` will be
// returned to indicate the offset of the new operator in `ops_`.
OpIndex AddOp(const std::string &type, const std::vector<string> &inputs,
const std::vector<string> &outputs,
const OprAttr &attrs = OprAttr());
private:
// the operators owned by `Network`.
std::vector<Operator> ops_;
};
```
`PlainNet` will create operators so that a private member `ops_` is defined,
the operators are created by `CreateNet`, and each operator is created by `AddOp`.
## PlainNet Usage
`PlainNet` can be used to define and run a network as follows
```c++
// create an empty scope located on CPU device.
Scope scope(CPUPlace());
// create and init variables described in `net_desc`.
scope.CreateVariables(net_desc);
scope.InitVariables(net_desc);
// create a network according to `net_desc`
auto net = Net::CreateNet(net_desc);
// Add more operators if needed.
net->AddOp(add...);
net->AddOp(fc...);
net->AddBackwardOps();
net->AddOptimizerOps();
// run the network providing the `scope`.
net.Run(&scope);
```
## `NetBuilder` as a C++ syntax wrapper
This is a detailed description of the user-related C++ network API, and may not be needed in the prototype development stage.
The `NetBuilder` will give users a much simpler syntax as follows to create a network, and demonstrates how to use the `BaseNet`'s raw interfaces.
```c++
Variable* fc_out = builder.AddOp("fc", input=image, size=100, activation="Sigmoid");
Variable* prediction = builder.AddOp("fc", input=fc_out, size=10, activation="Sigmoid");
Variable* loss = builder.AddOp("cross_entropy", input=prediction, label=label);
Variable* avg_loss = builder.AddOp("mean", loss);
builder.BackwardFrom(avg_loss)
builder.AddOptimization(1e-4, "adam");
builder.Run();
```
`NetBuilder` will call `Net` 's virtual functions to change the real network structure, here is a sample definition
```c++
class NetBuilder final {
public:
NetBuilder(Net* net) : net_(net) {}
Variable* AddOp(const string& type, const vector<Variable>& inputs,
size_t size, Activation act) {
// much code here.
// ...
net_->AddOp(def);
need_rebuild_net_ = true;
net_->InferShape();
// ...
}
Error BackwardFrom(const Variable& cost);
Error Run(Scope* scope, OpContext* context, bool need_backward = true) {
// backward.
if (need_backward) {
if (need_rebuild_net_) {
AddBackwardOps();
AddOptimizerOps();
}
net_->Run(scope, context);
return;
}
// just forward.
net_->Run(scope, context, 0, last_forward_op_);
}
protected:
Error AddBackwardOps();
Error AddOptimizerOps();
private:
Net* net_;
OpIndex last_forward_op_{-1};
bool need_rebuild_net_{true};
}
```
## Compatibility with RNN
Benefitting from the decoupling of `PlainNet.Run` and `Scope`, `PlainNet` is compatible with future RNN design,
for example we can implement a simple recurrent neural network as follows
```c++
// copy some `vars` from `source` to `target`
void Copy(const Scope &source, Scope &target,
          const std::vector<std::string> &vars);

Scope default_scope;
// some initial mutations on `default_scope` here.
auto rnn_step_net = PlainNet(rnn_step_net_def);

// Create rnn's states, the last scope is used to store rnn outputs.
Scope *rnn_states = new Scope[num_states + 1];
for (int i = 0; i < num_states + 1; i++) {
  // Initialize all rnn state scopes, copy parameters and so on.
  rnn_states[i].CreateVars(rnn_step_net_def);
  Copy(default_scope, rnn_states[i], rnn_related_vars);
  // Prepare rnn's inlinks, just copy inlink variables to each state.
  Copy(default_scope, rnn_states[i], inlink_vars);
}

// Run the rnn.
for (int i = 0; i < num_states; i++) {
  rnn_step_net.Run(rnn_states[i]);
  // Copy current state's state variables to next state, the related variables
  // are named like "previous_state_xxx".
  Copy(rnn_states[i], rnn_states[i + 1], pre_state_vars);
}

// Copy rnn's final outputs to `default_scope`.
Copy(rnn_states[num_states], default_scope, outlink_vars);
```
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
syntax="proto2";
package paddle.framework;
import "attr_type.proto";
// AttrDesc describes a single Attribute of an Operator. It contains the
// name, type, and value of the Attribute.
//
// e.g., for scale=3.0: name=scale, type=AttrType.FLOAT, value=3.0
message AttrDesc {
// Name of the attribute.
required string name = 1;
// Type of the attribute (AttrType; see the imported attr_type.proto).
required AttrType type = 2;
// Value slots, one per supported kind of value. Presumably only the slot
// matching `type` is meaningful -- TODO confirm against the code reading it.
optional int32 i = 3;
optional float f = 4;
optional string s = 5;
repeated int32 ints = 6;
repeated float floats = 7;
repeated string strings = 8;
};
// Protocol message describing an Operator.
//
// In PaddlePaddle, an Operator is used to do a certain computation such
// as "add", "sub", "cosine", etc.
// (1) An Operator needs to know its input and output variable names.
// (2) Some ops may have special attributes such as "scale" in "CosineOp".
//
// Third-party languages can build this proto message and call
// AddOp(const OpDesc& op_desc) of Paddle core to create an Operator.
message OpDesc {
// input names of this Operator.
repeated string inputs = 1;
// output names of this Operator.
repeated string outputs = 2;
// type of this Operator, such as "add", "sub", "fc". This is the only
// required field of OpDesc itself.
required string type = 3;
// Attributes of this Operator. e.g., scale=3.0 in cosine op.
repeated AttrDesc attrs = 4;
};
\ No newline at end of file
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include <paddle/framework/op_desc.pb.h>
// Checks that an OpDesc only reports IsInitialized() once every required
// field, including those of its nested AttrDesc, has been set.
TEST(OpDesc, Create) {
  paddle::framework::OpDesc desc;
  desc.set_type("add");
  desc.add_inputs("X");
  desc.add_inputs("Y");
  desc.add_outputs("Z");

  // Append one attribute, leaving its required `name` unset for now.
  paddle::framework::AttrDesc* attr_desc = desc.add_attrs();
  attr_desc->set_type(paddle::framework::AttrType::FLOAT);
  attr_desc->set_f(3.14);
  // The nested required field `name` is missing, so the message is incomplete.
  ASSERT_FALSE(desc.IsInitialized());

  // Setting the last missing required field completes the message.
  attr_desc->set_name("add");
  ASSERT_TRUE(desc.IsInitialized());
}
\ No newline at end of file
......@@ -14,33 +14,39 @@ limitations under the License. */
#pragma once
#include <memory>
#include <type_traits>
#include "paddle/framework/ddim.h"
#include "paddle/framework/enforce.h"
#include "paddle/memory/memory.h"
#include "paddle/platform/place.h"
namespace paddle {
namespace framework {
class Tensor {
using paddle::platform::Place;
using paddle::platform::get_place;
public:
template <typename T>
const T* data() const {
PADDLE_ASSERT(holder_ != nullptr,
"Tensor::data must be called after Tensor::mutable_data");
return static_cast<const T*>(holder->Ptr());
PADDLE_ENFORCE(holder_ != nullptr,
"Tensor::data must be called after Tensor::mutable_data.");
return static_cast<const T*>(holder_->Ptr());
}
template <typename T, // must be POD types
typename = std::enable_if<std::is_pod<T>::value>::type>
T* mutable_data(DDim dims, Place place) {
if (holder_ == nullptr || holder_->Place() != place ||
holder_->Size() < dims.product() * sizeof(T)) {
holder_.reset(new PlaceholderImpl(place, dims.product() * sizeof(T)));
typename std::enable_if<std::is_pod<T>::value>::type* = nullptr>
T* mutable_data(DDim dims, paddle::platform::Place place) {
if (holder_ == nullptr ||
!(holder_->Place() ==
place) /* some versions of boost::variant don't have operator!= */
|| holder_->Size() < product(dims) * sizeof(T)) {
holder_.reset(new PlaceholderImpl<T>(place, product(dims) * sizeof(T)));
}
return static_cast<T*>(holder_->Ptr());
}
template <typename T, // must be POD types
typename = std::enable_if<std::is_pod<T>::value>::type>
typename std::enable_if<std::is_pod<T>::value>::type* = nullptr>
T* mutable_data(DDim dims) {
return mutable_data<T>(dims, paddle::platform::get_place());
}
......@@ -51,27 +57,41 @@ class Tensor {
struct Placeholder {
virtual ~Placeholder() {}
virtual void* Ptr() const = 0;
virtual Place Place() const = 0;
virtual paddle::platform::Place Place() const = 0;
virtual size_t Size() const = 0;
};
template <typename T>
struct PlaceholderImpl : public Placeholder {
PlaceholderImpl(Place pl, size_t size)
: ptr_(paddle::memory::Alloc(pl, size), paddle::memory::Deleter(pl)),
place_(pl),
private:
class Deleter {
public:
Deleter(platform::Place place) : place_(place) {}
void operator()(T* ptr) {
paddle::memory::Free(place_, static_cast<void*>(ptr));
}
private:
paddle::platform::Place place_;
};
public:
PlaceholderImpl(paddle::platform::Place place, size_t size)
: ptr_(static_cast<T*>(paddle::memory::Alloc(place, size)),
Deleter(place)),
place_(place),
size_(size) {}
virtual void* Ptr() const { return static_cast<void*>(ptr_.get()); }
virtual size_t Size() const { return size_; }
virtual Place Place() const { return place_; }
virtual paddle::platform::Place Place() const { return place_; }
std::unique_ptr<T, memory::Deleter> ptr_;
Place place_; // record the place of ptr_.
size_t size_; // size of the memory block.
std::unique_ptr<T, Deleter> ptr_;
paddle::platform::Place place_; // record the place of ptr_.
size_t size_; // size of the memory block.
};
std::unique_ptr<Placeholder> holder_; // holds the memory block if allocated.
std::shared_ptr<Placeholder> holder_; // holds the memory block if allocated.
};
} // namespace framework
......
/*
Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "paddle/framework/tensor.h"
#include <gtest/gtest.h>
#include <string>
// Verifies that reading data from a Tensor whose memory has never been
// allocated through mutable_data() throws EnforceNotMet, and that the
// exception text starts with the expected message.
TEST(Tensor, ASSERT) {
  paddle::framework::Tensor cpu_tensor;

  bool caught = false;
  try {
    const double* p __attribute__((unused)) = cpu_tensor.data<double>();
  } catch (const paddle::framework::EnforceNotMet& err) {
    // Caught by const reference so the exception object is not copied.
    caught = true;
    const std::string msg =
        "Tensor::data must be called after Tensor::mutable_data.";
    // Only the leading msg.length() characters are compared, so any text
    // that what() carries after the message is ignored.
    const char* what = err.what();
    for (size_t i = 0; i < msg.length(); ++i) {
      ASSERT_EQ(what[i], msg[i]);
    }
  }
  ASSERT_TRUE(caught);
}
/* mutable_data() is not tested at present
because Memory::Alloc() and Memory::Free() are not ready yet.
TEST(Tensor, MutableData) {
using namespace paddle::framework;
using namespace paddle::platform;
{
Tensor cpu_tensor;
float* p1 = nullptr;
float* p2 = nullptr;
// initialization
p1 = cpu_tensor.mutable_data<float>(make_ddim({1, 2, 3}), CPUPlace());
EXPECT_NE(p1, nullptr);
// set cpu_tensor a new dim with larger size
// memory is supposed to be re-allocated
p2 = cpu_tensor.mutable_data<float>(make_ddim({3, 4}));
EXPECT_NE(p2, nullptr);
EXPECT_NE(p1, p2);
// set cpu_tensor a new dim with same size
// memory block is supposed to be unchanged
p1 = cpu_tensor.mutable_data<float>(make_ddim({2, 2, 3}));
EXPECT_EQ(p1, p2);
// set cpu_tensor a new dim with smaller size
// memory block is supposed to be unchanged
p2 = cpu_tensor.mutable_data<float>(make_ddim({2, 2}));
EXPECT_EQ(p1, p2);
}
{
Tensor gpu_tensor;
float* p1 = nullptr;
float* p2 = nullptr;
// initialization
p1 = gpu_tensor.mutable_data<float>(make_ddim({1, 2, 3}), GPUPlace());
EXPECT_NE(p1, nullptr);
// set gpu_tensor a new dim with larger size
// memory is supposed to be re-allocated
p2 = gpu_tensor.mutable_data<float>(make_ddim({3, 4}));
EXPECT_NE(p2, nullptr);
EXPECT_NE(p1, p2);
// set gpu_tensor a new dim with same size
// memory block is supposed to be unchanged
p1 = gpu_tensor.mutable_data<float>(make_ddim({2, 2, 3}));
EXPECT_EQ(p1, p2);
// set gpu_tensor a new dim with smaller size
// memory block is supposed to be unchanged
p2 = gpu_tensor.mutable_data<float>(make_ddim({2, 2}));
EXPECT_EQ(p1, p2);
}
}
*/
......@@ -25,6 +25,10 @@ namespace paddle {
* If SequenceLevel = kNonSeq:
* Output: output size is the number of input sequences (NOT input instances)
* output[i] = average_{for each instance in this sequence}{input[i]}
* If stride_ > 0:
* Output: a shortened sequence. Stride is the step size by which we slide a
* window upon the input sequence, and the average pooling
* operation is then applied to each interval independently.
* If SequenceLevel = kSeq:
* Check that the input sequence has sub-sequences
* Output: output size is the number of input sub-sequences
......
......@@ -36,6 +36,16 @@ MatrixPtr CrossChannelNormLayer::createSpatialMatrix(MatrixPtr data,
data->getData() + iter * spatialDim, 1, spatialDim, false, useGpu_);
}
bool CrossChannelNormLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
Layer::init(layerMap, parameterMap);
CHECK(parameters_[0]);
const NormConfig& conf = config_.inputs(0).norm_conf();
channels_ = conf.channels();
scale_.reset(new Weight(channels_, 1, parameters_[0]));
return true;
}
void CrossChannelNormLayer::forward(PassType passType) {
Layer::forward(passType);
MatrixPtr inV = getInputValue(0);
......@@ -51,9 +61,7 @@ void CrossChannelNormLayer::forward(PassType passType) {
Matrix::resizeOrCreate(dataBuffer_, batchSize, dataDim, false, useGpu_);
Matrix::resizeOrCreate(spatialBuffer_, 1, spatialDim, false, useGpu_);
Matrix::resizeOrCreate(normBuffer_, batchSize, spatialDim, false, useGpu_);
normBuffer_->zeroMem();
// add eps to avoid overflow
normBuffer_->addScalar(*normBuffer_, 1e-6);
inV->square2(*dataBuffer_);
for (size_t i = 0; i < batchSize; i++) {
const MatrixPtr inVTmp = createSampleMatrix(inV, i, spatialDim);
......@@ -63,6 +71,8 @@ void CrossChannelNormLayer::forward(PassType passType) {
// compute norm.
spatialBuffer_->sumCols(*dataTmp, 1, 0);
// add eps to avoid overflow
spatialBuffer_->add(1e-6);
spatialBuffer_->sqrt2(*spatialBuffer_);
normTmp->copyFrom(*spatialBuffer_);
outVTmp->copyFrom(*inVTmp);
......@@ -82,6 +92,9 @@ void CrossChannelNormLayer::backward(const UpdateCallback& callback) {
size_t dataDim = inG->getWidth();
size_t spatialDim = dataDim / channels_;
MatrixPtr inGBuffer;
Matrix::resizeOrCreate(inGBuffer, channels_, spatialDim, false, useGpu_);
dataBuffer_->dotMul(*outG, *outV);
Matrix::resizeOrCreate(scaleDiff_, channels_, 1, false, useGpu_);
Matrix::resizeOrCreate(channelBuffer_, channels_, 1, false, useGpu_);
......@@ -100,22 +113,24 @@ void CrossChannelNormLayer::backward(const UpdateCallback& callback) {
scaleDiff_->add(*channelBuffer_, 1.);
sampleBuffer_->dotMul(*inVTmp, *outGTmp);
spatialBuffer_->sumCols(*sampleBuffer_, 1., 1.);
spatialBuffer_->sumCols(*sampleBuffer_, 1., 0.);
// scale the grad
inGTmp->copyFrom(*inVTmp);
inGTmp->mulRowVector(*spatialBuffer_);
inGBuffer->copyFrom(*inVTmp);
inGBuffer->mulRowVector(*spatialBuffer_);
// divide by square of norm
spatialBuffer_->dotMul(*normTmp, *normTmp);
inGTmp->divRowVector(*spatialBuffer_);
inGBuffer->divRowVector(*spatialBuffer_);
// subtract
inGTmp->add(*outGTmp, -1, 1);
inGBuffer->add(*outGTmp, -1, 1);
// divide by norm
inGTmp->divRowVector(*normTmp);
inGBuffer->divRowVector(*normTmp);
// scale the diff
inGTmp->mulColVector(*scale_->getW());
inGBuffer->mulColVector(*scale_->getW());
inGTmp->add(*inGBuffer);
}
// update scale
if (scale_->getWGrad()) scale_->getWGrad()->copyFrom(*scaleDiff_);
if (scale_->getWGrad()) scale_->getWGrad()->add(*scaleDiff_);
scale_->getParameterPtr()->incUpdate(callback);
}
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "DetectionOutputLayer.h"
namespace paddle {
REGISTER_LAYER(detection_output, DetectionOutputLayer);
bool DetectionOutputLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
Layer::init(layerMap, parameterMap);
auto& layerConf = config_.inputs(0).detection_output_conf();
numClasses_ = layerConf.num_classes();
inputNum_ = layerConf.input_num();
nmsThreshold_ = layerConf.nms_threshold();
confidenceThreshold_ = layerConf.confidence_threshold();
nmsTopK_ = layerConf.nms_top_k();
keepTopK_ = layerConf.keep_top_k();
backgroundId_ = layerConf.background_id();
return true;
}
// Forward pass: gathers the location and confidence predictions of all input
// layers, decodes the location predictions against the prior boxes, lets
// getDetectionIndices() choose the detections to keep, and writes them to the
// output matrix (numKept rows, 7 columns).
void DetectionOutputLayer::forward(PassType passType) {
Layer::forward(passType);
// The batch size is taken from the height of the first location input.
size_t batchSize = getInputValue(*getLocInputLayer(0))->getHeight();
// Total element counts of all location / confidence inputs.
locSizeSum_ = 0;
confSizeSum_ = 0;
for (size_t n = 0; n < inputNum_; ++n) {
const MatrixPtr inLoc = getInputValue(*getLocInputLayer(n));
const MatrixPtr inConf = getInputValue(*getConfInputLayer(n));
locSizeSum_ += inLoc->getElementCnt();
confSizeSum_ += inConf->getElementCnt();
}
// locTmpBuffer_ is a single row holding every location value;
// confTmpBuffer_ has numClasses_ columns, i.e. one row per set of class scores.
Matrix::resizeOrCreate(locTmpBuffer_, 1, locSizeSum_, false, useGpu_);
Matrix::resizeOrCreate(
confTmpBuffer_, confSizeSum_ / numClasses_, numClasses_, false, useGpu_);
// Append every input to the temporary buffers, permuting NCHW to NHWC.
// The offsets advance by what appendWithPermute() returns for each input.
size_t locOffset = 0;
size_t confOffset = 0;
auto& layerConf = config_.inputs(0).detection_output_conf();
for (size_t n = 0; n < inputNum_; ++n) {
const MatrixPtr inLoc = getInputValue(*getLocInputLayer(n));
const MatrixPtr inConf = getInputValue(*getConfInputLayer(n));
// Fall back to the configured frame size when the input carries none.
size_t height = getInput(*getLocInputLayer(n)).getFrameHeight();
if (!height) height = layerConf.height();
size_t width = getInput(*getLocInputLayer(n)).getFrameWidth();
if (!width) width = layerConf.width();
locOffset += appendWithPermute(*inLoc,
height,
width,
locSizeSum_,
locOffset,
batchSize,
*locTmpBuffer_,
kNCHWToNHWC);
confOffset += appendWithPermute(*inConf,
height,
width,
confSizeSum_,
confOffset,
batchSize,
*confTmpBuffer_,
kNCHWToNHWC);
}
// The accumulated offsets must equal the per-sample share of each buffer.
CHECK_EQ(locOffset, locSizeSum_ / batchSize);
CHECK_EQ(confOffset, confSizeSum_ / batchSize);
MatrixPtr priorValue;
if (useGpu_) {
// On GPU, copy predictions and priors into CPU-side matrices; the code below
// reads them through raw data pointers.
Matrix::resizeOrCreate(locCpuBuffer_, 1, locSizeSum_, false, false);
Matrix::resizeOrCreate(
confCpuBuffer_, confSizeSum_ / numClasses_, numClasses_, false, false);
MatrixPtr priorTmpValue = getInputValue(*getPriorBoxLayer());
Matrix::resizeOrCreate(
priorCpuValue_, 1, priorTmpValue->getElementCnt(), false, false);
locCpuBuffer_->copyFrom(*locTmpBuffer_);
confCpuBuffer_->copyFrom(*confTmpBuffer_);
priorCpuValue_->copyFrom(*priorTmpValue);
locBuffer_ = locCpuBuffer_;
confBuffer_ = confCpuBuffer_;
priorValue = priorCpuValue_;
} else {
priorValue = getInputValue(*getPriorBoxLayer());
locBuffer_ = locTmpBuffer_;
confBuffer_ = confTmpBuffer_;
}
// Apply softmax to the confidence scores in place.
confBuffer_->softmax(*confBuffer_);
// Every prior occupies 8 consecutive values of priorValue.
size_t numPriors = priorValue->getElementCnt() / 8;
// Decode, for every sample, the 4 location predictions of each prior into a
// bounding box using that prior's box and variance.
std::vector<std::vector<NormalizedBBox>> allDecodedBBoxes;
for (size_t n = 0; n < batchSize; ++n) {
std::vector<NormalizedBBox> decodedBBoxes;
for (size_t i = 0; i < numPriors; ++i) {
size_t priorOffset = i * 8;
size_t locPredOffset = n * numPriors * 4 + i * 4;
std::vector<NormalizedBBox> priorBBoxVec;
getBBoxFromPriorData(
priorValue->getData() + priorOffset, 1, priorBBoxVec);
std::vector<std::vector<real>> priorBBoxVar;
getBBoxVarFromPriorData(
priorValue->getData() + priorOffset, 1, priorBBoxVar);
std::vector<real> locPredData;
for (size_t j = 0; j < 4; ++j)
locPredData.push_back(*(locBuffer_->getData() + locPredOffset + j));
NormalizedBBox bbox =
decodeBBoxWithVar(priorBBoxVec[0], priorBBoxVar[0], locPredData);
decodedBBoxes.push_back(bbox);
}
allDecodedBBoxes.push_back(decodedBBoxes);
}
// getDetectionIndices() fills allIndices; its return value is used as the
// number of output rows.
std::vector<std::map<size_t, std::vector<size_t>>> allIndices;
size_t numKept = getDetectionIndices(confBuffer_->getData(),
numPriors,
numClasses_,
backgroundId_,
batchSize,
confidenceThreshold_,
nmsTopK_,
nmsThreshold_,
keepTopK_,
allDecodedBBoxes,
&allIndices);
// One output row per kept detection, 7 values each, filled by
// getDetectionOutput().
resetOutput(numKept, 7);
MatrixPtr outV = getOutputValue();
getDetectionOutput(confBuffer_->getData(),
numKept,
numPriors,
numClasses_,
batchSize,
allIndices,
allDecodedBBoxes,
*outV);
}
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <map>
#include <vector>
#include "DetectionUtil.h"
#include "Layer.h"
namespace paddle {
/**
 * Detection output layer for an SSD detection task.
 *
 * The layer applies non-maximum suppression to all predicted bounding boxes
 * and keeps the top-K bounding boxes.
 * - Input: the first input layer is the priorbox layer. It is followed by
 *          inputNum_ layers giving the bbox location offsets and then by
 *          inputNum_ layers giving the classification confidences.
 * - Output: the predicted bounding box locations.
 */
class DetectionOutputLayer : public Layer {
 public:
  explicit DetectionOutputLayer(const LayerConfig& config) : Layer(config) {}

  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);

  void forward(PassType passType);

  // Intentionally empty: this layer does nothing in the backward pass.
  void backward(const UpdateCallback& callback = nullptr) {}

 protected:
  // Input 0 is the priorbox layer.
  LayerPtr getPriorBoxLayer() { return inputLayers_[0]; }

  // Location inputs occupy the slots right after the priorbox layer.
  LayerPtr getLocInputLayer(size_t index) { return inputLayers_[1 + index]; }

  // Confidence inputs follow the inputNum_ location inputs.
  LayerPtr getConfInputLayer(size_t index) {
    return inputLayers_[1 + inputNum_ + index];
  }

 private:
  size_t numClasses_;  // number of classes
  size_t inputNum_;    // number of input layers
  real nmsThreshold_;
  real confidenceThreshold_;
  size_t nmsTopK_;
  size_t keepTopK_;
  size_t backgroundId_;

  // Total element counts of the location / confidence inputs.
  size_t locSizeSum_;
  size_t confSizeSum_;

  // Buffers the forward pass reads from; they alias either the Tmp buffers
  // or, when running on GPU, the Cpu copies below.
  MatrixPtr locBuffer_;
  MatrixPtr confBuffer_;
  MatrixPtr locTmpBuffer_;
  MatrixPtr confTmpBuffer_;
  MatrixPtr priorCpuValue_;
  MatrixPtr locCpuBuffer_;
  MatrixPtr confCpuBuffer_;
};
} // namespace paddle
......@@ -26,6 +26,10 @@ namespace paddle {
* If SequenceLevel = kNonSeq:
* Output: output size is the number of input sequences (NOT input instances)
* output[i] = max_{for each instance in this sequence}{input[i]}
* If stride_ > 0:
* Output: a shortened sequence. Stride is the step size by which we slide a
* window upon the input sequence, and the max pooling operation is
* then applied to each interval independently.
* If SequenceLevel = kSeq:
* Check that the input sequence has sub-sequences
* Output: output size is the number of input sub-sequences
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "MultiBoxLossLayer.h"
#include <float.h>
#include <vector>
#include "DataLayer.h"
namespace paddle {
REGISTER_LAYER(multibox_loss, MultiBoxLossLayer);
// Caches the multibox-loss settings found in the first input's configuration
// in member variables. Always returns true.
bool MultiBoxLossLayer::init(const LayerMap& layerMap,
                             const ParameterMap& parameterMap) {
  Layer::init(layerMap, parameterMap);

  // Bind by reference: a plain `auto` would copy the whole config message.
  const auto& layerConf = config_.inputs(0).multibox_loss_conf();
  numClasses_ = layerConf.num_classes();
  inputNum_ = layerConf.input_num();
  overlapThreshold_ = layerConf.overlap_threshold();
  negPosRatio_ = layerConf.neg_pos_ratio();
  negOverlap_ = layerConf.neg_overlap();
  backgroundId_ = layerConf.background_id();
  return true;
}
// Forward pass: computes the multibox loss as the sum of a smooth-L1 location
// loss over the matched priors and a softmax cross-entropy confidence loss
// over the matched priors plus the mined negatives, and assigns that value to
// the batchSize x 1 output. Both losses are divided by the number of matches.
// The match results and intermediate matrices are kept in members because
// backward() reads them.
void MultiBoxLossLayer::forward(PassType passType) {
  Layer::forward(passType);
  size_t batchSize = getInputValue(*getLocInputLayer(0))->getHeight();
  resetOutput(batchSize, 1);

  // all location data and confidence score data
  locSizeSum_ = 0;
  confSizeSum_ = 0;
  for (size_t n = 0; n < inputNum_; ++n) {
    const MatrixPtr inLoc = getInputValue(*getLocInputLayer(n));
    const MatrixPtr inConf = getInputValue(*getConfInputLayer(n));
    locSizeSum_ += inLoc->getElementCnt();
    confSizeSum_ += inConf->getElementCnt();
  }

  // locBuffer layout:
  // | xmin1 | ymin1 | xmax1 | ymax1 | xmin2 ......
  Matrix::resizeOrCreate(locTmpBuffer_, 1, locSizeSum_, false, useGpu_);
  locBuffer_ = locTmpBuffer_;

  // confBuffer layout:
  // | class1 score | class2 score | ... |classN score | class1 score | ......
  Matrix::resizeOrCreate(confTmpBuffer_, 1, confSizeSum_, false, useGpu_);
  confBuffer_ = confTmpBuffer_;

  // Concatenate location data and confidence score data (NCHW -> NHWC).
  size_t locOffset = 0;
  size_t confOffset = 0;
  auto& layerConf = config_.inputs(0).multibox_loss_conf();
  for (size_t n = 0; n < inputNum_; ++n) {
    const MatrixPtr inLoc = getInputValue(*getLocInputLayer(n));
    const MatrixPtr inConf = getInputValue(*getConfInputLayer(n));
    // Fall back to the configured frame size when the input carries none.
    size_t height = getInput(*getLocInputLayer(n)).getFrameHeight();
    if (!height) height = layerConf.height();
    size_t width = getInput(*getLocInputLayer(n)).getFrameWidth();
    if (!width) width = layerConf.width();
    locOffset += appendWithPermute(*inLoc,
                                   height,
                                   width,
                                   locSizeSum_,
                                   locOffset,
                                   batchSize,
                                   *locBuffer_,
                                   kNCHWToNHWC);
    confOffset += appendWithPermute(*inConf,
                                    height,
                                    width,
                                    confSizeSum_,
                                    confOffset,
                                    batchSize,
                                    *confBuffer_,
                                    kNCHWToNHWC);
  }
  CHECK_EQ(locOffset, locSizeSum_ / batchSize);
  CHECK_EQ(confOffset, confSizeSum_ / batchSize);

  // priorValue layout:
  // | xmin1 | ymin1 | xmax1 | ymax1 | xmin1Var | ymin1Var | xmax1Var | ymax1Var
  // | xmin2 | ......
  MatrixPtr priorValue;

  // labelValue layout:
  // | class1_1 | xmin1_1 | ymin1_1 | xmax1_1 | ymax1_1 | difficult1_1 | ......
  MatrixPtr labelValue;

  // Copy data from GPU to CPU if use GPU
  if (useGpu_) {
    Matrix::resizeOrCreate(locCpuBuffer_, 1, locSizeSum_, false, false);
    Matrix::resizeOrCreate(confCpuBuffer_, 1, confSizeSum_, false, false);
    MatrixPtr priorTmpValue = getInputValue(*getPriorBoxLayer());
    Matrix::resizeOrCreate(
        priorCpuValue_, 1, priorTmpValue->getElementCnt(), false, false);
    MatrixPtr labelTmpValue = getInputValue(*getLabelLayer());
    Matrix::resizeOrCreate(labelCpuValue_,
                           labelTmpValue->getHeight(),
                           labelTmpValue->getWidth(),
                           false,
                           false);

    locCpuBuffer_->copyFrom(*locTmpBuffer_);
    confCpuBuffer_->copyFrom(*confTmpBuffer_);
    priorCpuValue_->copyFrom(*priorTmpValue);
    labelCpuValue_->copyFrom(*labelTmpValue);

    locBuffer_ = locCpuBuffer_;
    confBuffer_ = confCpuBuffer_;
    priorValue = priorCpuValue_;
    labelValue = labelCpuValue_;
  } else {
    priorValue = getInputValue(*getPriorBoxLayer());
    labelValue = getInputValue(*getLabelLayer());
  }

  // Get max scores for each prior bbox. Used in negative mining
  std::vector<std::vector<real>> allMaxConfScore;
  numPriors_ = priorValue->getElementCnt() / 8;
  getMaxConfidenceScores(confBuffer_->getData(),
                         batchSize,
                         numPriors_,
                         numClasses_,
                         backgroundId_,
                         &allMaxConfScore);

  // Match prior bbox to groundtruth bbox
  Argument label = getInput(*getLabelLayer());
  const int* labelIndex = label.sequenceStartPositions->getData(false);
  size_t seqNum = label.getNumSequences();
  numMatches_ = 0;
  numNegs_ = 0;
  allMatchIndices_.clear();
  allNegIndices_.clear();

  std::pair<size_t, size_t> retPair = generateMatchIndices(*priorValue,
                                                           numPriors_,
                                                           *labelValue,
                                                           labelIndex,
                                                           seqNum,
                                                           allMaxConfScore,
                                                           batchSize,
                                                           overlapThreshold_,
                                                           negOverlap_,
                                                           negPosRatio_,
                                                           &allMatchIndices_,
                                                           &allNegIndices_);
  numMatches_ = retPair.first;
  numNegs_ = retPair.second;

  // BBox location L1 smooth loss
  locLoss_ = 0.0;
  if (numMatches_ >= 1) {
    // Counts the values written to locDiff_ so far (4 per matched prior).
    size_t count = 0;
    MatrixPtr locLossOutput;
    Matrix::resizeOrCreate(locLossOutput, numMatches_ * 4, 1, false, false);
    Matrix::resizeOrCreate(locGTData_, numMatches_ * 4, 1, false, false);
    Matrix::resizeOrCreate(locDiff_, numMatches_ * 4, 1, false, false);
    locDiff_->zeroMem();
    std::vector<real> locGTData;

    real* locDiffData = locDiff_->getData();
    const real* locBufferData = locBuffer_->getData();
    for (size_t n = 0; n < batchSize; ++n) {
      for (size_t i = 0; i < numPriors_; ++i) {
        if (allMatchIndices_[n][i] == -1) continue;  // match none
        // Gather the 4 predicted location values of this matched prior.
        size_t locPredOffset =
            n * (locBuffer_->getElementCnt() / batchSize) + i * 4;
        std::copy(locBufferData + locPredOffset,
                  locBufferData + locPredOffset + 4,
                  locDiffData + count);
        count += 4;

        // Encode the matched ground-truth box relative to the prior box.
        const int gtIdx = allMatchIndices_[n][i];
        size_t priorOffset = i * 8;
        std::vector<NormalizedBBox> priorBBoxVec;
        getBBoxFromPriorData(
            priorValue->getData() + priorOffset, 1, priorBBoxVec);
        std::vector<std::vector<real>> priorBBoxVar;
        getBBoxVarFromPriorData(
            priorValue->getData() + priorOffset, 1, priorBBoxVar);
        // Each label row holds 6 values (see the labelValue layout above).
        size_t labelOffset = (labelIndex[n] + gtIdx) * 6;
        std::vector<NormalizedBBox> gtBBoxVec;
        getBBoxFromLabelData(labelValue->getData() + labelOffset, 1, gtBBoxVec);
        std::vector<real> gtEncode;
        encodeBBoxWithVar(
            priorBBoxVec[0], priorBBoxVar[0], gtBBoxVec[0], gtEncode);
        locGTData.insert(locGTData.end(), gtEncode.begin(), gtEncode.end());
      }
    }
    locGTData_->copyFrom(&locGTData[0], numMatches_ * 4);
    locLossOutput->smoothL1(*locDiff_, *locGTData_, 0.0);
    locLoss_ = locLossOutput->getSum() / numMatches_;
  }

  // BBox confidence softmax loss
  confLoss_ = 0;
  numConf_ = numMatches_ + numNegs_;
  if (numConf_ >= 1) {
    Matrix::resizeOrCreate(confProb_, numConf_, numClasses_, false, false);
    IVector::resizeOrCreate(confGTData_, numConf_, false);
    confProb_->zeroMem();
    // Counts the rows of confProb_ filled so far.
    size_t count = 0;

    real* confProbData = confProb_->getData();
    const real* confBufferData = confBuffer_->getData();
    for (size_t n = 0; n < batchSize; ++n) {
      // Matched priors: the target is the class of the matched ground truth.
      for (size_t i = 0; i < numPriors_; ++i) {
        if (allMatchIndices_[n][i] == -1) continue;
        size_t labelOffset = (labelIndex[n] + allMatchIndices_[n][i]) * 6;
        const int gtLabel = (labelValue->getData() + labelOffset)[0];
        confGTData_->getData()[count] = gtLabel;
        size_t confPredOffset = n * numPriors_ * numClasses_ + i * numClasses_;
        std::copy(confBufferData + confPredOffset,
                  confBufferData + confPredOffset + numClasses_,
                  confProbData + count * numClasses_);
        ++count;
      }
      // Negative mining samples: the target is the background class.
      for (size_t i = 0; i < allNegIndices_[n].size(); ++i) {
        confGTData_->getData()[count] = backgroundId_;
        size_t negConfOffset =
            n * numPriors_ * numClasses_ + allNegIndices_[n][i] * numClasses_;
        std::copy(confBufferData + negConfOffset,
                  confBufferData + negConfOffset + numClasses_,
                  confProbData + count * numClasses_);
        ++count;
      }
    }
    CHECK_EQ(numConf_, count);
    confProb_->softmax(*confProb_);
    MatrixPtr confLossOutput;
    Matrix::resizeOrCreate(confLossOutput, numConf_, 1, false, false);
    confLossOutput->oneHotCrossEntropy(*confProb_, *confGTData_);
    confLoss_ = confLossOutput->getSum() / numMatches_;
  }
  real loss = locLoss_ + confLoss_;
  MatrixPtr outV = getOutputValue();
  outV->assign(loss);
}
// Backward pass: turns the values cached by forward() into gradients for the
// location and confidence predictions, scatters them back to the positions of
// the matched priors and mined negatives, and adds them (permuted from NHWC
// back to NCHW) to the gradients of the corresponding input layers.
void MultiBoxLossLayer::backward(const UpdateCallback& callback) {
  size_t batchSize = getInputValue(*getLocInputLayer(0))->getHeight();
  // Priors that are neither matched nor mined keep a zero gradient.
  locBuffer_->zeroMem();
  confBuffer_->zeroMem();

  // Back propagate on location prediction
  if (numMatches_ >= 1) {
    MatrixPtr locDiffBuffer;
    Matrix::resizeOrCreate(locDiffBuffer, numMatches_ * 4, 1, false, false);
    locDiffBuffer->smoothL1Bp(*locDiff_, *locGTData_, 0.0);
    locDiff_->copyFrom(*locDiffBuffer);
    // scale gradient
    for (size_t i = 0; i < numMatches_ * 4; ++i)
      locDiff_->getData()[i] *= (1. / numMatches_);
    // Copy gradient back
    size_t count = 0;
    const real* locDiffData = locDiff_->getData();
    for (size_t n = 0; n < batchSize; ++n) {
      for (size_t i = 0; i < numPriors_; ++i) {
        if (allMatchIndices_[n][i] == -1) continue;
        real* locBufferData =
            locBuffer_->getData() + n * numPriors_ * 4 + i * 4;
        std::copy(locDiffData + count * 4,
                  locDiffData + (count + 1) * 4,
                  locBufferData);
        ++count;
      }
    }
    CHECK_EQ(count, numMatches_);
  }

  // Back propagate on confidence prediction
  if (numConf_ >= 1) {
    // confProb_ holds the softmax output of forward(); subtracting 1 at the
    // target class and scaling by 1 / numMatches_ yields the gradient.
    for (size_t i = 0; i < numConf_; ++i)
      confProb_->getData()[i * numClasses_ + confGTData_->getData()[i]] -= 1;
    for (size_t i = 0; i < numConf_ * numClasses_; ++i)
      confProb_->getData()[i] *= (1. / numMatches_);
    // Scatter the rows back in the order forward() gathered them: matched
    // priors first, then the mined negatives, sample by sample.
    size_t count = 0;
    const real* confProbData = confProb_->getData();
    for (size_t n = 0; n < batchSize; ++n) {
      for (size_t i = 0; i < numPriors_; ++i) {
        if (allMatchIndices_[n][i] == -1) continue;
        real* confDiffData = confBuffer_->getData() +
                             n * numPriors_ * numClasses_ + i * numClasses_;
        std::copy(confProbData + count * numClasses_,
                  confProbData + (count + 1) * numClasses_,
                  confDiffData);
        ++count;
      }
      for (size_t i = 0; i < allNegIndices_[n].size(); ++i) {
        int idx = allNegIndices_[n][i];
        real* confDiffData = confBuffer_->getData() +
                             n * numPriors_ * numClasses_ + idx * numClasses_;
        std::copy(confProbData + count * numClasses_,
                  confProbData + (count + 1) * numClasses_,
                  confDiffData);
        ++count;
      }
    }
    CHECK_EQ(count, numConf_);
  }

  // On GPU the gradients were written to the CPU copies; move them to the
  // device-side buffers before they are added to the input gradients.
  if (useGpu_) {
    locTmpBuffer_->copyFrom(*locCpuBuffer_);
    confTmpBuffer_->copyFrom(*confCpuBuffer_);
    locBuffer_ = locTmpBuffer_;
    confBuffer_ = confTmpBuffer_;
  }

  // copy back
  size_t locOffset = 0;
  size_t confOffset = 0;
  // Bind by reference: a plain `auto` would copy the whole config message.
  const auto& layerConf = config_.inputs(0).multibox_loss_conf();
  for (size_t n = 0; n < inputNum_; ++n) {
    const MatrixPtr inLocG = getInputGrad(*getLocInputLayer(n));
    const MatrixPtr inConfG = getInputGrad(*getConfInputLayer(n));
    size_t height = getInput(*getLocInputLayer(n)).getFrameHeight();
    // only for unittest, there are no width and height information
    // when constructing matrix in unittest, so we should
    // set the shape in configuration
    if (!height) height = layerConf.height();
    size_t width = getInput(*getLocInputLayer(n)).getFrameWidth();
    if (!width) width = layerConf.width();

    // NHWC to NCHW
    MatrixPtr locGBuffer;
    Matrix::resizeOrCreate(
        locGBuffer, inLocG->getHeight(), inLocG->getWidth(), false, useGpu_);
    MatrixPtr confGBuffer;
    Matrix::resizeOrCreate(
        confGBuffer, inConfG->getHeight(), inConfG->getWidth(), false, useGpu_);

    locOffset += decomposeWithPermute(*locBuffer_,
                                      height,
                                      width,
                                      locSizeSum_,
                                      locOffset,
                                      batchSize,
                                      *locGBuffer,
                                      kNHWCToNCHW);
    // Accumulate into the input gradient rather than overwriting it.
    inLocG->add(*locGBuffer);
    confOffset += decomposeWithPermute(*confBuffer_,
                                       height,
                                       width,
                                       confSizeSum_,
                                       confOffset,
                                       batchSize,
                                       *confGBuffer,
                                       kNHWCToNCHW);
    inConfG->add(*confGBuffer);
  }
  CHECK_EQ(locOffset, locSizeSum_ / batchSize);
  CHECK_EQ(confOffset, confSizeSum_ / batchSize);
}
} // namespace paddle
/* copyright (c) 2016 paddlepaddle authors. all rights reserve.
licensed under the apache license, version 2.0 (the "license");
you may not use this file except in compliance with the license.
you may obtain a copy of the license at
http://www.apache.org/licenses/license-2.0
unless required by applicable law or agreed to in writing, software
distributed under the license is distributed on an "as is" basis,
without warranties or conditions of any kind, either express or implied.
see the license for the specific language governing permissions and
limitations under the license. */
#pragma once
#include <vector>
#include "CostLayer.h"
#include "DataLayer.h"
#include "DetectionUtil.h"
#include "Layer.h"
using std::vector;
using std::pair;
namespace paddle {
/**
* The multibox loss layer for an SSD detection task.
* The loss is composed of the location loss and the confidence loss.
* The location loss is a smooth L1 loss and the confidence loss is
* a softmax loss.
* - Input: The first input layer is the priorbox layer and the second
* is the label layer. They are followed by the convolution layers that
* generate the bbox location offsets, and then by the convolution layers
* that generate the classification confidences. The accessors below
* index inputNum_ layers of each kind, so the layer takes
* 2 + 2 * inputNum_ inputs (four when inputNum_ is 1).
* - Output: The Single Shot Multibox Detection loss value.
* Reference:
* Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed,
* Cheng-Yang Fu, Alexander C. Berg. SSD: Single Shot MultiBox Detector
*/
class MultiBoxLossLayer : public CostLayer {
public:
explicit MultiBoxLossLayer(const LayerConfig& config) : CostLayer(config) {}
bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
void forward(PassType passType);
void backward(const UpdateCallback& callback = nullptr);
// The two CostLayer hooks below are intentionally empty here; this class
// declares its own forward() and backward() instead.
void forwardImp(Matrix& output, Argument& label, Matrix& cost) {}
void backwardImp(Matrix& outputValue, Argument& label, Matrix& outputGrad) {}
protected:
// Input layout: [0] priorbox, [1] label, [2, 2 + inputNum_) location
// predictions, [2 + inputNum_, 2 + 2 * inputNum_) confidence predictions.
inline LayerPtr getPriorBoxLayer() { return inputLayers_[0]; }
inline LayerPtr getLabelLayer() { return inputLayers_[1]; }
inline LayerPtr getLocInputLayer(size_t index) {
return inputLayers_[2 + index];
}
inline LayerPtr getConfInputLayer(size_t index) {
return inputLayers_[2 + inputNum_ + index];
}
protected:
// Layer settings. NOTE(review): the names mirror the MultiBoxLossConfig
// fields (num_classes, overlap_threshold, neg_pos_ratio, neg_overlap,
// input_num, background_id); presumably they are read in init() - confirm.
size_t numClasses_;
real overlapThreshold_;
real negPosRatio_;
real negOverlap_;
size_t inputNum_;
size_t backgroundId_;
// Loss terms. NOTE(review): presumably the last computed location and
// confidence losses - confirm in forward().
real locLoss_;
real confLoss_;
// Per-batch bookkeeping. backward() iterates numPriors_ priors per sample,
// scales the confidence gradient by 1 / numMatches_, and checks that the
// number of scattered confidence rows equals numConf_.
size_t numPriors_;
size_t numMatches_;
size_t numNegs_;
size_t numConf_;
// backward() checks the accumulated per-input offsets against
// locSizeSum_ / batchSize and confSizeSum_ / batchSize.
size_t locSizeSum_;
size_t confSizeSum_;
// allMatchIndices_[n][i] is -1 when prior i of sample n has no match
// (backward() skips those entries); allNegIndices_[n] lists the prior
// indices of sample n that backward() also scatters confidence rows to.
vector<vector<int>> allMatchIndices_;
vector<vector<int>> allNegIndices_;
// NOTE(review): presumably the ground-truth location targets and class
// labels; backward() indexes confProb_ with confGTData_ - confirm the rest.
MatrixPtr locGTData_;
IVectorPtr confGTData_;
// Buffers that backward() fills with the scattered location / confidence
// gradients before they are permuted back into the inputs' gradients.
MatrixPtr locBuffer_;
MatrixPtr confBuffer_;
// NOTE(review): presumably the location difference and class probabilities
// produced by forward(); backward() rewrites confProb_ in place - confirm.
MatrixPtr locDiff_;
MatrixPtr confProb_;
// NOTE(review): presumably CPU copies of the label and priorbox inputs -
// confirm in forward().
MatrixPtr labelCpuValue_;
MatrixPtr priorCpuValue_;
// When useGpu_ is set, backward() copies the *CpuBuffer_ matrices into the
// *TmpBuffer_ matrices and repoints locBuffer_ / confBuffer_ at the latter.
MatrixPtr locCpuBuffer_;
MatrixPtr confCpuBuffer_;
MatrixPtr locTmpBuffer_;
MatrixPtr confTmpBuffer_;
};
} // namespace paddle
......@@ -56,14 +56,4 @@ bool ResponseNormLayer::init(const LayerMap& layerMap,
return true;
}
bool CrossChannelNormLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
Layer::init(layerMap, parameterMap);
CHECK(parameters_[0]);
const NormConfig& conf = config_.inputs(0).norm_conf();
channels_ = conf.channels();
scale_.reset(new Weight(channels_, 1, parameters_[0]));
return true;
}
} // namespace paddle
......@@ -26,10 +26,9 @@ namespace paddle {
* If SequenceLevel = kNonseq:
* Output: a sequence containing only the last instance of the input sequence
* If stride_ > 0:
* Output: a shorten sequence. The operation of getting last instance of a
* sequence is independently performed on every slice of the input
* sequence, which is obtained by sliding a window with the window
* size set to stride_.
* Output: a shortened sequence. Stride is the step size by which a window
* slides over the input sequence, and the get-last-instance
* operation is then applied to each interval independently.
* If SequenceLevel = kSeq:
* Check input sequence must has sub-sequence
* Output: a sequence containing only the last instance of each sub-sequence
......@@ -73,8 +72,7 @@ bool SequenceLastInstanceLayer::init(const LayerMap& layerMap,
void SequenceLastInstanceLayer::forward(PassType passType) {
SequencePoolLayer::forward(passType);
auto starts = (stride_ > 0) ? stridePositions_->getData()
: startPositions_->getData(false);
auto starts = startPositions_->getData(false);
MatrixPtr inputValue = getInputValue(0);
MatrixPtr outputValue = getOutputValue();
......
......@@ -72,9 +72,8 @@ void SequencePoolLayer::forward(PassType passType) {
if (stride_ > 0) {
CHECK_EQ(input.hasSubseq(), 0UL)
<< "sequence stride pooling is invalid for hasSubseq now";
output_.poolSequenceWithStride(
input, stride_, &stridePositions_, reversed_);
newBatchSize_ = stridePositions_->getSize() - 1;
output_.poolSequenceWithStride(input, stride_, &startPositions_, reversed_);
newBatchSize_ = startPositions_->getSize() - 1;
}
resetOutput(newBatchSize_, dim);
......
......@@ -28,8 +28,9 @@ namespace paddle {
* sequence}{input[i]}
* If stride_ > 0:
* Check input sequence must not have sub-sequence
* Output: a shorten sequence, pooling is performed upon a small local
* area
* Output: a shortened sequence. Stride is the step size by which a window
* slides over the input sequence, and the pooling operation
* is then applied to each interval independently.
* If SequenceLevel = kSeq:
* Check input sequence must has sub-sequence
* Output: output size is the number of input sub-sequences
......@@ -47,8 +48,6 @@ protected:
size_t newBatchSize_;
ICpuGpuVectorPtr startPositions_;
int stride_;
// Store the start position of each window.
IVectorPtr stridePositions_;
// Whether the input sequence is reversed or not.
bool reversed_ = false;
......
......@@ -45,6 +45,13 @@ add_unittest_without_exec(test_PriorBox
add_test(NAME test_PriorBox
COMMAND test_PriorBox)
################# test_DetectionOutput #######################
# Declares the test_DetectionOutput binary from its own source plus
# LayerGradUtil.cpp (the shared layer-test helpers it includes), then
# registers that binary with CTest under the same name.
# NOTE(review): add_unittest_without_exec is a project helper defined
# elsewhere; presumably it builds the target without registering a test,
# which is why add_test follows - confirm.
add_unittest_without_exec(test_DetectionOutput
test_DetectionOutput.cpp
LayerGradUtil.cpp)
add_test(NAME test_DetectionOutput
COMMAND test_DetectionOutput)
################# test_ConvUnify #######################
add_unittest_without_exec(test_ConvUnify
test_ConvUnify.cpp
......
......@@ -387,6 +387,31 @@ void initDataLayer(TestConfig testConf,
data.value->sigmoid(*data.value);
data.grad->zeroMem();
break;
case INPUT_SELF_DEFINE_DATA: {
size_t height = testConf.inputDefs[i].selfDefinedData->getHeight();
size_t width = testConf.inputDefs[i].selfDefinedData->getWidth();
CHECK_GT(static_cast<int>(height), 0);
CHECK_GT(static_cast<int>(width), 0);
data.value = Matrix::create(height, width, false, useGpu);
data.grad = Matrix::create(height, width, false, useGpu);
data.value->copyFrom(*testConf.inputDefs[i].selfDefinedData);
data.grad->zeroMem();
const std::vector<int>& labelSeqStartPositions =
testConf.inputDefs[i].labelSeqStartPositions;
if (labelSeqStartPositions.size() != 0) {
CHECK(!sequenceStartPositions);
CHECK_GE(static_cast<int>(labelSeqStartPositions.size()), 2);
sequenceStartPositions =
ICpuGpuVector::create(labelSeqStartPositions.size(), useGpu);
sequenceStartPositions->copyFrom(labelSeqStartPositions.data(),
labelSeqStartPositions.size(),
useGpu);
data.sequenceStartPositions = sequenceStartPositions;
}
break;
}
default:
LOG(FATAL) << " unknown inputType ";
return;
......@@ -440,7 +465,6 @@ void initTestLayer(TestConfig testConf,
ParameterConfig paraConfig) {
paraConfig.set_name(paraName);
paraConfig.set_size(paraSize);
paraConfig.set_initial_std(1);
paraConfig.set_is_static(isStatic);
auto para =
std::make_shared<Parameter>(paraConfig, FLAGS_use_gpu, initialize);
......@@ -474,6 +498,9 @@ void initTestLayer(TestConfig testConf,
paraConfig.add_dims((*layerMap)[input.input_layer_name()]->getSize());
paraConfig.add_dims(testConf.layerConfig.size());
}
CHECK_GE(testConf.paramInitialStd, 0);
paraConfig.set_initial_mean(testConf.paramInitialMean);
paraConfig.set_initial_std(testConf.paramInitialStd);
initParameter(paraName, paraSize, inputDef.isStatic, false, paraConfig);
}
}
......
......@@ -31,7 +31,8 @@ enum InputType {
INPUT_SEQUENCE_LABEL,
INPUT_SPARSE_NON_VALUE_DATA,
INPUT_SPARSE_FLOAT_VALUE_DATA,
INPUT_DENSE_DIM_DATA, // using sequence length to init dense data
INPUT_DENSE_DIM_DATA, // using sequence length to init dense data
INPUT_SELF_DEFINE_DATA, // support customizing for input value
};
struct ParaSparse {
......@@ -66,6 +67,7 @@ struct InputDef {
bool isStatic;
std::vector<int> labelInitValue;
std::vector<int> labelSeqStartPositions;
MatrixPtr selfDefinedData;
InputDef(InputType type, string nameIn, size_t dimIn, size_t sizeIn) {
inputType = type;
......@@ -76,6 +78,20 @@ struct InputDef {
isStatic = false;
}
// Constructs an input definition whose value is supplied by the caller
// instead of being generated.
//   type                    input type tag stored in inputType
//   nameIn                  input layer name
//   selfDefinedData         matrix holding the input value
//   selfDefinedSeqStartPos  optional sequence start positions, stored in
//                           labelSeqStartPositions (empty by default)
// dim and paraSize are set to 0, sparse to {""}, and isStatic to false.
InputDef(InputType type,
string nameIn,
MatrixPtr selfDefinedData,
std::vector<int> selfDefinedSeqStartPos = {})
: labelSeqStartPositions(selfDefinedSeqStartPos),
selfDefinedData(selfDefinedData) {
inputType = type;
name = nameIn;
// No dimension is recorded here; initDataLayer takes the height and width
// from selfDefinedData for INPUT_SELF_DEFINE_DATA inputs.
dim = 0;
sparse = {""};
paraSize = 0;
isStatic = false;
}
InputDef(InputType type,
string nameIn,
size_t dimIn,
......@@ -109,12 +125,16 @@ struct TestConfig {
LayerConfig layerConfig;
std::vector<InputDef> inputDefs;
size_t biasSize;
real paramInitialMean;
real paramInitialStd;
bool testAccumulate;
bool testState;
bool staticBias;
bool testBatchState;
TestConfig()
: biasSize(0),
paramInitialMean(0.0),
paramInitialStd(1.0),
testAccumulate(true),
testState(false),
staticBias(false),
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include <string>
#include <vector>
#include "LayerGradUtil.h"
#include "paddle/testing/TestUtil.h"
using namespace paddle; // NOLINT
using namespace std; // NOLINT
// Runs one forward pass of the detection_output layer on the given location,
// confidence and prior-box inputs and checks that the layer output equals
// `result`.
//   inputLoc / inputConf / inputPriorBox  values copied into the data layers
//   feature_map_width / feature_map_height  written to the layer config
//   nms_threshold  NMS threshold written to the layer config
//   use_gpu        whether the data layers and the tested layer use the GPU
//   result         expected output matrix
void doOneDetectionOutputTest(MatrixPtr& inputLoc,
                              MatrixPtr& inputConf,
                              MatrixPtr& inputPriorBox,
                              size_t feature_map_width,
                              size_t feature_map_height,
                              real nms_threshold,
                              bool use_gpu,
                              MatrixPtr& result) {
  // Describe the layer under test: three inputs, with the detection settings
  // attached to the first one.
  TestConfig testConfig;
  testConfig.layerConfig.set_type("detection_output");
  LayerInputConfig* firstInput = testConfig.layerConfig.add_inputs();
  testConfig.layerConfig.add_inputs();
  testConfig.layerConfig.add_inputs();

  DetectionOutputConfig* detConf = firstInput->mutable_detection_output_conf();
  detConf->set_width(feature_map_width);
  detConf->set_height(feature_map_height);
  detConf->set_nms_threshold(nms_threshold);
  detConf->set_num_classes(2);
  detConf->set_nms_top_k(20);
  detConf->set_keep_top_k(10);
  detConf->set_background_id(0);
  detConf->set_confidence_threshold(0.01);
  detConf->set_input_num(1);

  // Input order: prior boxes, location predictions, confidence predictions.
  testConfig.inputDefs.push_back({INPUT_DATA_TARGET, "priorbox", 32, 0});
  testConfig.inputDefs.push_back({INPUT_DATA, "input_loc", 16, 0});
  testConfig.inputDefs.push_back({INPUT_DATA, "input_conf", 8, 0});

  // Create the data layers, then overwrite their generated values with the
  // caller-provided inputs.
  std::vector<DataLayerPtr> inputLayers;
  LayerMap layers;
  vector<Argument> arguments;
  initDataLayer(testConfig,
                &inputLayers,
                &arguments,
                &layers,
                "priorbox",
                1,
                false,
                use_gpu);
  inputLayers[0]->getOutputValue()->copyFrom(*inputPriorBox);
  inputLayers[1]->getOutputValue()->copyFrom(*inputLoc);
  inputLayers[2]->getOutputValue()->copyFrom(*inputConf);

  // Build the tested layer with FLAGS_use_gpu temporarily set to use_gpu,
  // then restore the previous flag value.
  const bool savedUseGpuFlag = FLAGS_use_gpu;
  FLAGS_use_gpu = use_gpu;
  std::vector<ParameterPtr> layerParameters;
  LayerPtr layerUnderTest;
  initTestLayer(testConfig, &layers, &layerParameters, &layerUnderTest);
  FLAGS_use_gpu = savedUseGpuFlag;

  layerUnderTest->forward(PASS_GC);
  checkMatrixEqual(layerUnderTest->getOutputValue(), result);
}
// Forward-only check of the detection_output layer against fixed expected
// values, for two NMS thresholds on the CPU and, unless PADDLE_ONLY_CPU is
// defined, on the GPU as well.
TEST(Layer, detectionOutputLayerFwd) {
  // Fixture values shared by every case.
  real inputLocData[] = {0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1,
                         0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1};
  real inputConfData[] = {0.1, 0.9, 0.2, 0.8, 0.3, 0.7, 0.4, 0.6};
  real inputPriorBoxData[] = {0.1, 0.1, 0.5, 0.5, 0.1, 0.1, 0.2, 0.2,
                              0.2, 0.2, 0.6, 0.6, 0.1, 0.1, 0.2, 0.2,
                              0.3, 0.3, 0.7, 0.7, 0.1, 0.1, 0.2, 0.2,
                              0.4, 0.4, 0.8, 0.8, 0.1, 0.1, 0.2, 0.2};
  // Expected output with NMS threshold 0.01: a single 7-value row.
  real resultData[] = {
      0, 1, 0.68997443, 0.099959746, 0.099959746, 0.50804031, 0.50804031};
  // Expected output with NMS threshold 0.2: two 7-value rows.
  real resultData2[] = {
      0, 1, 0.68997443, 0.099959746, 0.099959746, 0.50804031, 0.50804031,
      0, 1, 0.59868765, 0.29995975,  0.29995975,  0.70804024, 0.70804024};

  bool useGpu = false;

  // CPU case 1: the CPU matrices reference the arrays above directly.
  MatrixPtr inputLoc = Matrix::create(1, 16, false, useGpu);
  MatrixPtr inputConf = Matrix::create(1, 8, false, useGpu);
  MatrixPtr inputPriorBox = Matrix::create(1, 32, false, useGpu);
  MatrixPtr result = Matrix::create(1, 7, false, useGpu);
  inputLoc->setData(inputLocData);
  inputConf->setData(inputConfData);
  inputPriorBox->setData(inputPriorBoxData);
  result->setData(resultData);

  real nmsThreshold = 0.01;
  doOneDetectionOutputTest(inputLoc,
                           inputConf,
                           inputPriorBox,
                           /* feature_map_width */ 1,
                           /* feature_map_height */ 1,
                           nmsThreshold,
                           useGpu,
                           result);

  // CPU case 2.
  nmsThreshold = 0.2;
  MatrixPtr result2 = Matrix::create(2, 7, false, useGpu);
  result2->setData(resultData2);
  doOneDetectionOutputTest(inputLoc,
                           inputConf,
                           inputPriorBox,
                           /* feature_map_width */ 1,
                           /* feature_map_height */ 1,
                           nmsThreshold,
                           useGpu,
                           result2);

#ifndef PADDLE_ONLY_CPU
  // GPU case 1: same data, copied into GPU matrices.
  useGpu = true;
  inputLoc = Matrix::create(1, 16, false, useGpu);
  inputConf = Matrix::create(1, 8, false, useGpu);
  inputPriorBox = Matrix::create(1, 32, false, useGpu);
  inputLoc->copyFrom(inputLocData, 16);
  inputConf->copyFrom(inputConfData, 8);
  inputPriorBox->copyFrom(inputPriorBoxData, 32);

  nmsThreshold = 0.01;
  MatrixPtr result3 = Matrix::create(1, 7, false, useGpu);
  result3->copyFrom(resultData, 7);
  doOneDetectionOutputTest(inputLoc,
                           inputConf,
                           inputPriorBox,
                           /* feature_map_width */ 1,
                           /* feature_map_height */ 1,
                           nmsThreshold,
                           useGpu,
                           result3);

  // GPU case 2.
  nmsThreshold = 0.2;
  MatrixPtr result4 = Matrix::create(2, 7, false, useGpu);
  result4->copyFrom(resultData2, 14);
  doOneDetectionOutputTest(inputLoc,
                           inputConf,
                           inputPriorBox,
                           /* feature_map_width */ 1,
                           /* feature_map_height */ 1,
                           nmsThreshold,
                           useGpu,
                           result4);
#endif
}
// Test entry point: lets googletest consume its command-line flags first,
// then runs the project initialization with the remaining arguments, and
// finally returns the status reported by RUN_ALL_TESTS().
int main(int argc, char** argv) {
  testing::InitGoogleTest(&argc, argv);
  initMain(argc, argv);
  const int exitStatus = RUN_ALL_TESTS();
  return exitStatus;
}
......@@ -845,8 +845,12 @@ void testDegradeLayer(bool hasSubseq,
// Runs testDegradeLayer for the "max" sequence pooling type over each
// combination of input kind (plain sequence or sequence with sub-sequences),
// target level ("non-seq" or "seq") and stride listed below. A stride of -1
// is used for the non-strided cases, and 5 for the strided one.
TEST(Layer, MaxLayer) {
testDegradeLayer(false, "max", "non-seq", -1); // seq max to non-seq
testDegradeLayer(true, "max", "non-seq", -1); // hasSubseq max to non-seq
testDegradeLayer(true, "max", "seq", -1); // hasSubseq max to seq
testDegradeLayer(false,
"max",
"non-seq",
5); // seq max to a shorten seq, stride window = 5
testDegradeLayer(true, "max", "non-seq", -1); // hasSubseq max to non-seq
testDegradeLayer(true, "max", "seq", -1); // hasSubseq max to seq
}
TEST(Layer, SequenceLastInstanceLayer) {
......@@ -868,6 +872,10 @@ TEST(Layer, SequenceLastInstanceLayer) {
TEST(Layer, AverageLayer) {
testDegradeLayer(false, "average", "non-seq", -1); // seq average to non-seq
testDegradeLayer(false,
"average",
"non-seq",
5); // seq average to a shorten seq, stride window = 5
testDegradeLayer(
true, "average", "non-seq", -1); // hasSubseq average to non-seq
testDegradeLayer(true, "average", "seq", -1); // hasSubseq average to seq
......@@ -1661,6 +1669,8 @@ TEST(Layer, PadLayer) {
TEST(Layer, CrossChannelNormLayer) {
TestConfig config;
config.paramInitialMean = 1.;
config.paramInitialStd = 0.;
config.layerConfig.set_type("norm");
config.layerConfig.set_size(100);
LayerInputConfig* input = config.layerConfig.add_inputs();
......@@ -1674,7 +1684,7 @@ TEST(Layer, CrossChannelNormLayer) {
config.inputDefs.push_back({INPUT_DATA, "layer_0", 100, 10});
for (auto useGpu : {false, true}) {
testLayerGrad(config, "cross-channel-norm", 10, false, useGpu, false, 5);
testLayerGrad(config, "cross-channel-norm", 10, false, useGpu, false);
}
}
......@@ -1692,6 +1702,70 @@ TEST(Layer, smooth_l1) {
}
}
// Gradient check for the multibox_loss layer. The prior-box and label inputs
// are supplied as self-defined matrices so that the first prior box is close
// to the ground-truth box and at least one match exists.
TEST(Layer, multibox_loss) {
  TestConfig config;
  config.layerConfig.set_type("multibox_loss");
  config.biasSize = 0;

  // The loss settings live on the first input of the layer.
  LayerInputConfig* firstInput = config.layerConfig.add_inputs();
  MultiBoxLossConfig* lossConf = firstInput->mutable_multibox_loss_conf();
  lossConf->set_num_classes(21);
  lossConf->set_input_num(1);
  lossConf->set_overlap_threshold(0.5);
  lossConf->set_neg_pos_ratio(3);
  lossConf->set_neg_overlap(0.5);
  lossConf->set_background_id(0);
  lossConf->set_height(3);
  lossConf->set_width(3);

  // Ground-truth labels: random values squashed by a sigmoid, then the first
  // five columns of every row are overwritten with a class id in [1, 20] and
  // a fixed box. The sixth column keeps its random value.
  const size_t gtNum = 1;
  MatrixPtr labelValue = Matrix::create(gtNum, 6, false, false);
  labelValue->randomizeUniform();
  labelValue->add(-0.5);
  labelValue->sigmoid(*labelValue);
  real* labelData = labelValue->getData();
  const size_t labelWidth = labelValue->getWidth();
  for (size_t row = 0; row < gtNum; ++row) {
    real* labelRow = labelData + row * labelWidth;
    labelRow[0] = std::rand() % 20 + 1;
    labelRow[1] = 0.400259;
    labelRow[2] = 0.377857;
    labelRow[3] = 0.525712;
    labelRow[4] = 0.519368;
  }

  // One label per sequence: start positions are 0, 1, ..., gtNum.
  vector<int> seqStartPositions;
  for (size_t pos = 0; pos <= gtNum; ++pos) {
    seqStartPositions.push_back(pos);
  }

  // Ensure at least one matched bbox: the first eight prior values are fixed,
  // the rest stay random.
  MatrixPtr priorValue = Matrix::create(1, 72, false, false);
  priorValue->randomizeUniform();
  priorValue->add(-0.5);
  priorValue->sigmoid(*priorValue);
  real* priorData = priorValue->getData();
  priorData[0] = 0.424811;
  priorData[1] = 0.397059;
  priorData[2] = 0.538905;
  priorData[3] = 0.447091;
  priorData[4] = 0.425720;
  priorData[5] = 0.515228;
  priorData[6] = 0.519452;
  priorData[7] = 0.591065;

  // Input order: prior boxes, labels, location predictions, confidence
  // predictions.
  config.inputDefs.push_back(
      {INPUT_SELF_DEFINE_DATA, "priorbox", priorValue, {}});
  config.inputDefs.push_back(
      {INPUT_SELF_DEFINE_DATA, "label", labelValue, seqStartPositions});
  config.inputDefs.push_back({INPUT_DATA, "locPred", 36, 0});
  config.inputDefs.push_back({INPUT_DATA, "confPred", 189, 0});

  // Register the three remaining layer inputs (the first was added above).
  config.layerConfig.add_inputs();
  config.layerConfig.add_inputs();
  config.layerConfig.add_inputs();

  for (auto useGpu : {false, true}) {
    testLayerGrad(config, "multibox_loss", 1, false, useGpu, false);
  }
}
TEST(Layer, TransLayer) {
TestConfig config;
const int height = 128;
......
......@@ -561,7 +561,7 @@ void Argument::degradeSequence(const Argument& input) {
void Argument::poolSequenceWithStride(const Argument& input,
size_t stride,
IVectorPtr* stridePostions,
ICpuGpuVectorPtr* stridePostions,
bool reversed) {
// If input.sequenceStartPositions = [0, 9, 14, 17, 30] and stride = 5,
// then sequenceStartPositions = [0, 2, 3, 4, 7].
......@@ -598,8 +598,8 @@ void Argument::poolSequenceWithStride(const Argument& input,
stridePos.emplace_back(starts[numSequences]);
int size = stridePos.size();
CHECK_EQ(size - 1, tgtBuf[numSequences]);
IVector::resizeOrCreate(*stridePostions, size, false);
(*stridePostions)->copyFrom(stridePos.data(), size);
ICpuGpuVector::resizeOrCreate(*stridePostions, size, false);
(*stridePostions)->getMutableVector(false)->copyFrom(stridePos.data(), size);
}
void Argument::getValueString(
......
......@@ -299,7 +299,7 @@ struct Argument {
*/
void poolSequenceWithStride(const Argument& input,
size_t stride,
IVectorPtr* stridePositions,
ICpuGpuVectorPtr* stridePositions,
bool reversed = false);
/**
* @brief getValueString will return the argument's output in string. There
......
......@@ -31,7 +31,7 @@ TEST(Argument, poolSequenceWithStride) {
int strideResultReversed[] = {0, 4, 9, 14, 17, 20, 25, 30};
for (auto reversed : {false, true}) {
IVectorPtr stridePositions;
ICpuGpuVectorPtr stridePositions;
output.poolSequenceWithStride(
input, 5 /* stride */, &stridePositions, reversed);
......@@ -45,7 +45,7 @@ TEST(Argument, poolSequenceWithStride) {
CHECK_EQ(stridePositions->getSize(), 8UL);
auto result = reversed ? strideResultReversed : strideResult;
for (int i = 0; i < 8; i++) {
CHECK_EQ(stridePositions->getData()[i], result[i]);
CHECK_EQ(stridePositions->getData(false)[i], result[i]);
}
}
}
......
......@@ -172,53 +172,3 @@ TEST_F(CommonTest, syncThreadPool) {
EXPECT_EQ((int)0, nums[i]);
}
}
// Exercises REGISTER_SLOW_NODES_PROBE from a pool of 10 worker threads and
// prints the collected statistics. The test contains no assertions; it only
// checks that the probes can be registered, printed and reset.
TEST_F(CommonTest, barrierStat) {
const int threadNum = 10;
SyncThreadPool pool(threadNum);
// Each worker seeds std::rand with the current time in microseconds, sleeps
// for a random duration below 100000 microseconds, and then registers a
// probe for `statName`.
// NOTE(review): std::srand / std::rand are called concurrently from several
// threads here; the standard does not guarantee they are thread-safe.
#define TEST_BARRIER_RANDOM(statName, numConnThreads, ...) \
pool.exec([&](int tid, size_t numThreads) { \
struct timeval time; \
gettimeofday(&time, nullptr); \
uint64_t usec = timeToMicroSecond(time); \
std::srand(usec); \
auto value = std::rand() % 100000; \
usleep(value); \
REGISTER_SLOW_NODES_PROBE( \
globalStat, statName, numConnThreads, tid, __VA_ARGS__); \
});
// Round 1: untagged probes with random arrival times.
for (auto i = 0; i < 10; i++) {
TEST_BARRIER_RANDOM("synThreadBarrier1", threadNum);
TEST_BARRIER_RANDOM("synThreadBarrier2", threadNum);
}
globalStat.printAllStatus();
globalStat.reset();
// Round 2: tagged probes with random arrival times.
for (auto i = 0; i < 10; i++) {
TEST_BARRIER_RANDOM("synThreadBarrier3", threadNum, "tag0");
TEST_BARRIER_RANDOM("synThreadBarrier4", threadNum, "tag1");
}
globalStat.printAllStatus();
globalStat.reset();
// use it to test accurate barrier gap
// Here each worker sleeps for tid * 10000 microseconds, so arrival times are
// staggered deterministically by thread id.
#define TEST_BARRIER(statName, numConnThreads, ...) \
pool.exec([&](int tid, size_t numThreads) { \
usleep(tid * 10000); \
REGISTER_SLOW_NODES_PROBE( \
globalStat, statName, numConnThreads, tid, __VA_ARGS__); \
});
// Round 3: tagged probes with the staggered arrival times.
for (auto i = 0; i < 10; i++) {
TEST_BARRIER("synThreadBarrier3", threadNum, "tag0");
TEST_BARRIER("synThreadBarrier4", threadNum, "tag1");
}
globalStat.printAllStatus();
globalStat.reset();
}
add_subdirectory(dynload)
nv_test(cuda_test SRCS cuda_test.cu)
cc_library(place SRCS place.cc)
......
......@@ -34,6 +34,16 @@ int GetDeviceCount(void) {
return count;
}
// Returns the id of the device that cudaGetDevice reports as current.
// A failing CUDA call is handed to throw_on_error together with the message
// "cudaGetDevice failed".
int GetCurrentDeviceId(void) {
  int current_id;
  throw_on_error(cudaGetDevice(&current_id), "cudaGetDevice failed");
  return current_id;
}
// Makes `device_id` the current device via cudaSetDevice. A failing CUDA
// call is handed to throw_on_error together with the message
// "cudaSetDevice failed".
void SetDeviceId(int device_id) {
throw_on_error(cudaSetDevice(device_id), "cudaSetDevice failed");
}
} // namespace platform
} // namespace paddle
......
# Library holding the dynamic loader (dynamic_loader.cc); it depends on glog
# and gflags, which that source includes.
# NOTE(review): dynamic_loader.cc also includes paddle/framework/enforce.h;
# confirm whether that needs an additional DEPS entry.
cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags)
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cublas_v2.h>
#include <dlfcn.h>
#include <mutex>
#include "paddle/platform/dynload/dynamic_loader.h"
namespace paddle {
namespace platform {
namespace dynload {
std::once_flag cublas_dso_flag;
void *cublas_dso_handle = nullptr;
/**
* DYNAMIC_LOAD_CUBLAS_WRAP(__name) defines a struct DynLoad__<name> with a
* variadic call operator returning cublasStatus_t, plus one object of that
* struct named <name> in this namespace, so that calling
* paddle::platform::dynload::<name>(...) forwards to the cuBLAS routine.
*
* With PADDLE_USE_DSO defined, the first call through any wrapper invokes
* GetCublasDsoHandle(&cublas_dso_handle) exactly once (std::call_once);
* every call then looks the symbol up with dlsym and invokes it through a
* function pointer whose parameter types are the deduced argument types.
* Without PADDLE_USE_DSO, the call is forwarded directly to the routine.
*
* NOTE(review): the dlsym result is not checked before it is called, and
* the lookup is repeated on every call - confirm that this is acceptable.
* NOTE(review): cublas_dso_flag and cublas_dso_handle above are defined
* (not just declared) in this header; including it from more than one
* translation unit would presumably produce duplicate definitions - confirm.
*/
#ifdef PADDLE_USE_DSO
#define DYNAMIC_LOAD_CUBLAS_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
cublasStatus_t operator()(Args... args) { \
typedef cublasStatus_t (*cublasFunc)(Args...); \
std::call_once(cublas_dso_flag, \
paddle::platform::dynload::GetCublasDsoHandle, \
&cublas_dso_handle); \
void *p_##__name = dlsym(cublas_dso_handle, #__name); \
return reinterpret_cast<cublasFunc>(p_##__name)(args...); \
} \
} __name; // struct DynLoad__##__name
#else
// Statically linked variant: no handle lookup, just a forwarding call.
#define DYNAMIC_LOAD_CUBLAS_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
cublasStatus_t operator()(Args... args) { \
return __name(args...); \
} \
} __name; // struct DynLoad__##__name
#endif
#define DYNAMIC_LOAD_CUBLAS_V2_WRAP(__name) DYNAMIC_LOAD_CUBLAS_WRAP(__name)
// include all needed cublas functions in HPPL
// clang-format off
// X-macro listing the cuBLAS BLAS routines that get a dynload wrapper:
// CUBLAS_BLAS_ROUTINE_EACH(M) expands to M(<routine>) for each routine below.
// The last entry deliberately has no trailing line continuation, so the
// macro body ends here and cannot absorb whatever line follows it.
#define CUBLAS_BLAS_ROUTINE_EACH(__macro) \
  __macro(cublasSgemv)                    \
  __macro(cublasDgemv)                    \
  __macro(cublasSgemm)                    \
  __macro(cublasDgemm)                    \
  __macro(cublasSgeam)                    \
  __macro(cublasDgeam)
DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasCreate)
DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasDestroy)
DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasSetStream)
DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasSetPointerMode)
DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasGetPointerMode)
DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgemmBatched)
DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgemmBatched)
DYNAMIC_LOAD_CUBLAS_WRAP(cublasCgemmBatched)
DYNAMIC_LOAD_CUBLAS_WRAP(cublasZgemmBatched)
DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgetrfBatched)
DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgetriBatched)
DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgetrfBatched)
DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgetriBatched)
CUBLAS_BLAS_ROUTINE_EACH(DYNAMIC_LOAD_CUBLAS_V2_WRAP)
#undef DYNAMIC_LOAD_CUBLAS_WRAP
#undef DYNAMIC_LOAD_CUBLAS_V2_WRAP
#undef CUBLAS_BLAS_ROUTINE_EACH
// clang-format on
#ifndef PADDLE_TYPE_DOUBLE
#define CUBLAS_GEAM paddle::platform::dynload::cublasSgeam
#define CUBLAS_GEMV paddle::platform::dynload::cublasSgemv
#define CUBLAS_GEMM paddle::platform::dynload::cublasSgemm
#define CUBLAS_GETRF paddle::platform::dynload::cublasSgetrfBatched
#define CUBLAS_GETRI paddle::platform::dynload::cublasSgetriBatched
#else
#define CUBLAS_GEAM paddle::platform::dynload::cublasDgeam
#define CUBLAS_GEMV paddle::platform::dynload::cublasDgemv
#define CUBLAS_GEMM paddle::platform::dynload::cublasDgemm
#define CUBLAS_GETRF paddle::platform::dynload::cublasDgetrfBatched
#define CUBLAS_GETRI paddle::platform::dynload::cublasDgetriBatched
#endif
} // namespace dynload
} // namespace platform
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cudnn.h>
#include <dlfcn.h>
#include <mutex>
#include "paddle/platform/dynload/dynamic_loader.h"
namespace paddle {
namespace platform {
namespace dynload {
std::once_flag cudnn_dso_flag;
void* cudnn_dso_handle = nullptr;
// DYNAMIC_LOAD_CUDNN_WRAP(__name) defines a struct DynLoad__<name> with a
// variadic call operator, plus one object of that struct named <name> in this
// namespace. The operator's return type is whatever calling __name with the
// given arguments would return (decltype(__name(args...))).
//
// With PADDLE_USE_DSO defined, the first call through any wrapper invokes
// GetCudnnDsoHandle(&cudnn_dso_handle) exactly once (std::call_once); every
// call then looks the symbol up with dlsym and invokes it through a function
// pointer built from the deduced argument types.
// NOTE(review): the dlsym result is not checked before it is called, and the
// lookup is repeated on every call - confirm that this is acceptable.
#ifdef PADDLE_USE_DSO
#define DYNAMIC_LOAD_CUDNN_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> decltype(__name(args...)) { \
using cudnn_func = decltype(__name(args...)) (*)(Args...); \
std::call_once(cudnn_dso_flag, \
paddle::platform::dynload::GetCudnnDsoHandle, \
&cudnn_dso_handle); \
void* p_##__name = dlsym(cudnn_dso_handle, #__name); \
return reinterpret_cast<cudnn_func>(p_##__name)(args...); \
} \
} __name; /* struct DynLoad__##__name */
#else
// Statically linked variant: no handle lookup, just a forwarding call.
#define DYNAMIC_LOAD_CUDNN_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> decltype(__name(args...)) { \
return __name(args...); \
} \
} __name; /* struct DynLoad__##__name */
#endif
/**
* include all needed cudnn functions in HPPL
* different cudnn version has different interfaces
**/
// clang-format off
#define CUDNN_DNN_ROUTINE_EACH(__macro) \
__macro(cudnnSetTensor4dDescriptor) \
__macro(cudnnSetTensor4dDescriptorEx) \
__macro(cudnnGetConvolutionNdForwardOutputDim) \
__macro(cudnnGetConvolutionForwardAlgorithm) \
__macro(cudnnCreateTensorDescriptor) \
__macro(cudnnDestroyTensorDescriptor) \
__macro(cudnnCreateFilterDescriptor) \
__macro(cudnnSetFilter4dDescriptor) \
__macro(cudnnSetPooling2dDescriptor) \
__macro(cudnnDestroyFilterDescriptor) \
__macro(cudnnCreateConvolutionDescriptor) \
__macro(cudnnCreatePoolingDescriptor) \
__macro(cudnnDestroyPoolingDescriptor) \
__macro(cudnnSetConvolution2dDescriptor) \
__macro(cudnnDestroyConvolutionDescriptor) \
__macro(cudnnCreate) \
__macro(cudnnDestroy) \
__macro(cudnnSetStream) \
__macro(cudnnActivationForward) \
__macro(cudnnConvolutionForward) \
__macro(cudnnConvolutionBackwardBias) \
__macro(cudnnGetConvolutionForwardWorkspaceSize) \
__macro(cudnnTransformTensor) \
__macro(cudnnPoolingForward) \
__macro(cudnnPoolingBackward) \
__macro(cudnnSoftmaxBackward) \
__macro(cudnnSoftmaxForward) \
__macro(cudnnGetVersion) \
__macro(cudnnGetErrorString)
CUDNN_DNN_ROUTINE_EACH(DYNAMIC_LOAD_CUDNN_WRAP)
#define CUDNN_DNN_ROUTINE_EACH_R2(__macro) \
__macro(cudnnAddTensor) \
__macro(cudnnConvolutionBackwardData) \
__macro(cudnnConvolutionBackwardFilter)
CUDNN_DNN_ROUTINE_EACH_R2(DYNAMIC_LOAD_CUDNN_WRAP)
// APIs available after R3:
#if CUDNN_VERSION >= 3000
#define CUDNN_DNN_ROUTINE_EACH_AFTER_R3(__macro) \
__macro(cudnnGetConvolutionBackwardFilterWorkspaceSize) \
__macro(cudnnGetConvolutionBackwardDataAlgorithm) \
__macro(cudnnGetConvolutionBackwardFilterAlgorithm) \
__macro(cudnnGetConvolutionBackwardDataWorkspaceSize)
CUDNN_DNN_ROUTINE_EACH_AFTER_R3(DYNAMIC_LOAD_CUDNN_WRAP)
#undef CUDNN_DNN_ROUTINE_EACH_AFTER_R3
#endif
// APIs available after R4:
#if CUDNN_VERSION >= 4007
#define CUDNN_DNN_ROUTINE_EACH_AFTER_R4(__macro) \
__macro(cudnnBatchNormalizationForwardTraining) \
__macro(cudnnBatchNormalizationForwardInference) \
__macro(cudnnBatchNormalizationBackward)
CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DYNAMIC_LOAD_CUDNN_WRAP)
#undef CUDNN_DNN_ROUTINE_EACH_AFTER_R4
#endif
// APIs in R5
#if CUDNN_VERSION >= 5000
#define CUDNN_DNN_ROUTINE_EACH_R5(__macro) \
__macro(cudnnCreateActivationDescriptor) \
__macro(cudnnSetActivationDescriptor) \
__macro(cudnnGetActivationDescriptor) \
__macro(cudnnDestroyActivationDescriptor)
CUDNN_DNN_ROUTINE_EACH_R5(DYNAMIC_LOAD_CUDNN_WRAP)
#undef CUDNN_DNN_ROUTINE_EACH_R5
#endif
#undef CUDNN_DNN_ROUTINE_EACH
// clang-format on
} // namespace dynload
} // namespace platform
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <curand.h>
#include <dlfcn.h>
#include <mutex>
#include "paddle/platform/dynload/dynamic_loader.h"
namespace paddle {
namespace platform {
namespace dynload {
std::once_flag curand_dso_flag;
void *curand_dso_handle = nullptr;
// DYNAMIC_LOAD_CURAND_WRAP(__name) defines a struct DynLoad__<name> with a
// variadic call operator returning curandStatus_t, plus one object of that
// struct named <name> in this namespace.
//
// With PADDLE_USE_DSO defined, the first call through any wrapper invokes
// GetCurandDsoHandle(&curand_dso_handle) exactly once (std::call_once); every
// call then looks the symbol up with dlsym and invokes it through a function
// pointer built from the deduced argument types.
// NOTE(review): the dlsym result is not checked before it is called, and the
// lookup is repeated on every call - confirm that this is acceptable.
#ifdef PADDLE_USE_DSO
#define DYNAMIC_LOAD_CURAND_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
curandStatus_t operator()(Args... args) { \
typedef curandStatus_t (*curandFunc)(Args...); \
std::call_once(curand_dso_flag, \
paddle::platform::dynload::GetCurandDsoHandle, \
&curand_dso_handle); \
void *p_##__name = dlsym(curand_dso_handle, #__name); \
return reinterpret_cast<curandFunc>(p_##__name)(args...); \
} \
} __name; /* struct DynLoad__##__name */
#else
// Statically linked variant: no handle lookup, just a forwarding call.
#define DYNAMIC_LOAD_CURAND_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
curandStatus_t operator()(Args... args) { \
return __name(args...); \
} \
} __name; /* struct DynLoad__##__name */
#endif
/* include all needed curand functions in HPPL */
// clang-format off
#define CURAND_RAND_ROUTINE_EACH(__macro) \
__macro(curandCreateGenerator) \
__macro(curandSetStream) \
__macro(curandSetPseudoRandomGeneratorSeed)\
__macro(curandGenerateUniform) \
__macro(curandGenerateUniformDouble) \
__macro(curandDestroyGenerator)
// clang-format on
CURAND_RAND_ROUTINE_EACH(DYNAMIC_LOAD_CURAND_WRAP)
#undef CURAND_RAND_ROUTINE_EACH
#undef DYNAMIC_LOAD_CURAND_WRAP
} // namespace dynload
} // namespace platform
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/platform/dynload/dynamic_loader.h"
#include <dlfcn.h>
#include <memory>
#include <mutex>
#include <string>
#include "gflags/gflags.h"
#include "glog/logging.h"
#include "paddle/framework/enforce.h"
// Command-line flags (gflags) naming the directory to try first for each
// dynamically loaded library. An empty value means the loader goes straight to
// dlopen's default search.
DEFINE_string(cudnn_dir, "",
              "Specify path for loading libcudnn.so. For instance, "
              "/usr/local/cudnn/lib. If empty [default], dlopen "
              "will search cudnn from LD_LIBRARY_PATH");
DEFINE_string(cuda_dir, "",
              "Specify path for loading cuda library, such as libcublas, "
              "libcurand. For instance, /usr/local/cuda/lib64. If default, "
              "dlopen will search cuda from LD_LIBRARY_PATH");
DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so.");
// GetLapackDsoHandle opens liblapacke (not liblapack), so the help text names
// the library that is actually searched for in this directory.
DEFINE_string(lapack_dir, "", "Specify path for loading liblapacke.so.");
namespace paddle {
namespace platform {
namespace dynload {
/**
 * Concatenate two path fragments with exactly one '/' between them.
 *
 * If part2 is absolute (starts with '/'), it is returned unchanged and part1
 * is ignored. Otherwise the result is part1, a '/' if part1 is non-empty and
 * does not already end with one, followed by part2.
 */
static inline std::string join(const std::string& part1,
                               const std::string& part2) {
  const char kSeparator = '/';

  // An absolute second fragment wins outright.
  const bool part2_is_absolute = !part2.empty() && part2[0] == kSeparator;
  if (part2_is_absolute) {
    return part2;
  }

  std::string joined;
  joined.reserve(part1.size() + part2.size() + 1);
  joined.append(part1);
  // Only insert a separator when there is a prefix that lacks one.
  if (!joined.empty() && *joined.rbegin() != kSeparator) {
    joined.push_back(kSeparator);
  }
  joined.append(part2);
  return joined;
}
/**
 * Try to dlopen a library by bare name using the system's default search.
 *
 * @param dso_path      [in/out] library name to open. On Apple platforms, if
 *                      the first attempt fails, it is rewritten to the
 *                      /usr/local/cuda/lib/ fallback path that was tried next.
 * @param dso_handle    [out] receives the dlopen result (nullptr on failure).
 * @param dynload_flags flags passed straight to dlopen.
 *
 * This function never fails by itself; the caller decides what to do with a
 * null handle.
 */
static inline void GetDsoHandleFromDefaultPath(std::string& dso_path,
                                               void** dso_handle,
                                               int dynload_flags) {
  VLOG(3) << "Try to find library: " << dso_path
          << " from default system path.";
  // default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH
  *dso_handle = dlopen(dso_path.c_str(), dynload_flags);

// DYLD_LIBRARY_PATH is disabled after Mac OS 10.11 to
// bring System Integrity Protection (SIP), if dso_handle
// is null, search from default package path in Mac OS.
#if defined(__APPLE__) || defined(__OSX__)
  if (nullptr == *dso_handle) {
    // Remember whether this is cudnn BEFORE the name is replaced by the
    // fallback path; comparing afterwards can never match the bare name.
    const bool is_cudnn = (dso_path == "libcudnn.dylib");
    dso_path = join("/usr/local/cuda/lib/", dso_path);
    *dso_handle = dlopen(dso_path.c_str(), dynload_flags);
    if (nullptr == *dso_handle && is_cudnn) {
      // Emit the installation hint as a warning; the caller still reports the
      // hard failure for the null handle.
      LOG(WARNING) << "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n "
                      "For instance, sudo tar -xzf "
                      "cudnn-7.5-osx-x64-v5.0-ga.tgz -C /usr/local \n sudo "
                      "chmod a+r /usr/local/cuda/include/cudnn.h "
                      "/usr/local/cuda/lib/libcudnn*";
    }
  }
#endif
}
/**
 * Open the shared library `dso_name`, preferring the directory `search_root`.
 *
 * When search_root is non-empty the library is first opened from that
 * directory; if that fails a warning is logged and the default search is used.
 * When search_root is empty only the default search is used. A handle that is
 * still null afterwards is reported through PADDLE_ENFORCE.
 *
 * @param search_root directory to try first; may be empty.
 * @param dso_name    file name of the library.
 * @param dso_handle  [out] receives the dlopen handle.
 */
static inline void GetDsoHandleFromSearchPath(const std::string& search_root,
                                              const std::string& dso_name,
                                              void** dso_handle) {
  const int dynload_flags = RTLD_LAZY | RTLD_LOCAL;
  *dso_handle = nullptr;
  std::string resolved_path = dso_name;

  if (!search_root.empty()) {
    // search xxx.so from custom path
    resolved_path = join(search_root, dso_name);
    *dso_handle = dlopen(resolved_path.c_str(), dynload_flags);
    if (*dso_handle == nullptr) {
      LOG(WARNING) << "Failed to find dynamic library: " << resolved_path
                   << " (" << dlerror() << ")";
      // fall back to the bare name for the default search below
      resolved_path = dso_name;
    }
  }

  // Either no custom directory was given or it did not contain the library.
  if (*dso_handle == nullptr) {
    GetDsoHandleFromDefaultPath(resolved_path, dso_handle, dynload_flags);
  }

  PADDLE_ENFORCE(nullptr != *dso_handle,
                 "Failed to find dynamic library: %s ( %s ) \n Please specify "
                 "its path correctly using following ways: \n Method. set "
                 "environment variable LD_LIBRARY_PATH on Linux or "
                 "DYLD_LIBRARY_PATH on Mac OS. \n For instance, issue command: "
                 "export LD_LIBRARY_PATH=... \n Note: After Mac OS 10.11, "
                 "using the DYLD_LIBRARY_PATH is impossible unless System "
                 "Integrity Protection (SIP) is disabled.",
                 resolved_path, dlerror());
}
// Load cuBLAS: the library file name depends on the platform, the directory
// hint comes from the cuda_dir flag.
void GetCublasDsoHandle(void** dso_handle) {
#if defined(__APPLE__) || defined(__OSX__)
  const char* const lib_name = "libcublas.dylib";
#else
  const char* const lib_name = "libcublas.so";
#endif
  GetDsoHandleFromSearchPath(FLAGS_cuda_dir, lib_name, dso_handle);
}
// Load cuDNN: the library file name depends on the platform, the directory
// hint comes from the cudnn_dir flag.
void GetCudnnDsoHandle(void** dso_handle) {
#if defined(__APPLE__) || defined(__OSX__)
  const char* const lib_name = "libcudnn.dylib";
#else
  const char* const lib_name = "libcudnn.so";
#endif
  GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, lib_name, dso_handle);
}
// Load cuRAND: the library file name depends on the platform, the directory
// hint comes from the cuda_dir flag.
void GetCurandDsoHandle(void** dso_handle) {
#if defined(__APPLE__) || defined(__OSX__)
  const char* const lib_name = "libcurand.dylib";
#else
  const char* const lib_name = "libcurand.so";
#endif
  GetDsoHandleFromSearchPath(FLAGS_cuda_dir, lib_name, dso_handle);
}
// Load warp-ctc: the library file name depends on the platform, the directory
// hint comes from the warpctc_dir flag.
void GetWarpCTCDsoHandle(void** dso_handle) {
#if defined(__APPLE__) || defined(__OSX__)
  const char* const lib_name = "libwarpctc.dylib";
#else
  const char* const lib_name = "libwarpctc.so";
#endif
  GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, lib_name, dso_handle);
}
// Load LAPACKE (the file opened is liblapacke): the library file name depends
// on the platform, the directory hint comes from the lapack_dir flag.
void GetLapackDsoHandle(void** dso_handle) {
#if defined(__APPLE__) || defined(__OSX__)
  const char* const lib_name = "liblapacke.dylib";
#else
  const char* const lib_name = "liblapacke.so";
#endif
  GetDsoHandleFromSearchPath(FLAGS_lapack_dir, lib_name, dso_handle);
}
} // namespace dynload
} // namespace platform
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
namespace paddle {
namespace platform {
namespace dynload {
// Each loader below opens one shared library with dlopen and stores the result
// in *dso_handle. The directory given by the matching command-line flag is
// tried first; otherwise (or on failure) the default dlopen search is used.
// If the library cannot be opened at all, the loader reports the failure
// through PADDLE_ENFORCE.
/**
 * @brief load the DSO of CUBLAS (libcublas; directory hint: flag cuda_dir)
 *
 * @param[out] dso_handle receives the dlopen handle of the library
 *
 */
void GetCublasDsoHandle(void** dso_handle);
/**
 * @brief load the DSO of CUDNN (libcudnn; directory hint: flag cudnn_dir)
 *
 * @param[out] dso_handle receives the dlopen handle of the library
 *
 */
void GetCudnnDsoHandle(void** dso_handle);
/**
 * @brief load the DSO of CURAND (libcurand; directory hint: flag cuda_dir)
 *
 * @param[out] dso_handle receives the dlopen handle of the library
 *
 */
void GetCurandDsoHandle(void** dso_handle);
/**
 * @brief load the DSO of warp-ctc (libwarpctc; directory hint: flag
 *        warpctc_dir)
 *
 * @param[out] dso_handle receives the dlopen handle of the library
 *
 */
void GetWarpCTCDsoHandle(void** dso_handle);
/**
 * @brief load the DSO of lapack (the file opened is liblapacke; directory
 *        hint: flag lapack_dir)
 *
 * @param[out] dso_handle receives the dlopen handle of the library
 *
 */
void GetLapackDsoHandle(void** dso_handle);
} // namespace dynload
} // namespace platform
} // namespace paddle
......@@ -142,7 +142,7 @@ SocketServer::SocketServer(const std::string &addr, int port, int rdmaCpu)
}
/// trigger to initialize RDMA lib
PCHECK(RdmaClientDaemons::get()) << "initilizate RDMA failed\n";
CHECK(RdmaClientDaemons::get()) << "initilizate RDMA failed\n";
}
SocketServer::~SocketServer() {
......@@ -168,7 +168,7 @@ void SocketServer::tcpServer() {
/// First call to socket() function
socket_ = socket(AF_INET, SOCK_STREAM, 0);
PCHECK(socket_ >= 0) << "ERROR opening socket";
CHECK(socket_ >= 0) << "ERROR opening socket";
/// Initialize socket structure
bzero((char *)&serv_addr, sizeof(serv_addr));
......@@ -176,7 +176,7 @@ void SocketServer::tcpServer() {
serv_addr.sin_port = htons(port_);
if (!addr_.empty()) {
server = gethostbyname(addr_.c_str());
PCHECK(server) << "ERROR, no such host: " << addr_;
CHECK(server) << "ERROR, no such host: " << addr_;
bcopy((char *)server->h_addr,
(char *)&serv_addr.sin_addr.s_addr,
server->h_length);
......@@ -187,7 +187,7 @@ void SocketServer::tcpServer() {
setOption(socket_);
/// Now bind the host address using bind() call.
PCHECK(bind(socket_, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) >= 0)
CHECK(bind(socket_, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) >= 0)
<< "ERROR on binding " << addr_;
/// Now start listening for the clients, here process will
......@@ -201,7 +201,7 @@ void SocketServer::tcpServer() {
if (stopping_) {
break;
}
PCHECK(newsockfd >= 0) << "ERROR on accept";
CHECK(newsockfd >= 0) << "ERROR on accept";
constexpr int kPeerNameLen = 128;
char peerName[kPeerNameLen];
CHECK(inet_ntop(AF_INET, &cli_addr.sin_addr, peerName, kPeerNameLen));
......@@ -227,14 +227,14 @@ void SocketServer::rdmaServer() {
/// First call to socket() function
rdmaSocket_ = rdma::ssocket(rdmaCpu_);
PCHECK(rdmaSocket_) << "ERROR opening RDMA socket";
CHECK(rdmaSocket_) << "ERROR opening RDMA socket";
PCHECK(rdma::bind(rdmaSocket_, rdmaUri_.c_str()) == 0)
CHECK(rdma::bind(rdmaSocket_, rdmaUri_.c_str()) == 0)
<< "ERROR bind RDMA socket";
/// Now start listening for the clients, here process will
/// go in sleep mode and will wait for the incoming connection
PCHECK(rdma::listen(rdmaSocket_) == 0) << "ERROR listen RDMA socket";
CHECK(rdma::listen(rdmaSocket_) == 0) << "ERROR listen RDMA socket";
while (true) {
/// Accept actual connection from the client
......@@ -242,7 +242,7 @@ void SocketServer::rdmaServer() {
if (stopping_) {
break;
}
PCHECK(newsock) << "ERROR on accept";
CHECK(newsock) << "ERROR on accept";
constexpr int kPeerNameLen = 128;
char peerName[kPeerNameLen];
......@@ -290,7 +290,7 @@ RdmaClientDaemons::RdmaClientDaemons() {
onlineCpus_ = rdma::numCpus();
for (auto i = 0; i < onlineCpus_; i++) {
socket = rdma::csocket(i);
PCHECK(socket) << "ERROR open client socket daemon";
CHECK(socket) << "ERROR open client socket daemon";
rdmaClientSocket_.push_back(socket);
}
......@@ -355,7 +355,7 @@ void SocketClient::TcpClient(const std::string &serverAddr, int serverPort) {
/// Create a socket point
int sockfd = socket(AF_INET, SOCK_STREAM, 0);
PCHECK(sockfd >= 0) << "ERROR opening socket";
CHECK(sockfd >= 0) << "ERROR opening socket";
#if defined(__OSX__) || defined(__APPLE__)
server = getipnodebyname(serverAddr.c_str(), AF_INET, AI_DEFAULT, &errRet);
......@@ -396,8 +396,8 @@ void SocketClient::TcpClient(const std::string &serverAddr, int serverPort) {
}
std::this_thread::sleep_for(std::chrono::seconds(1));
} else {
PCHECK(errno != 0) << "ERROR connecting to " << serverAddr << ":"
<< serverPort << "errorno: " << errno;
CHECK(errno != 0) << "ERROR connecting to " << serverAddr << ":"
<< serverPort << "errorno: " << errno;
}
} while (errno == ECONNREFUSED);
......@@ -426,7 +426,7 @@ void SocketClient::RdmaClient(const std::string &serverAddr, int serverPort) {
/// connect to server with socket daemon
sock = rdma::connect(socketDaemon_, rdmaUri.c_str());
PCHECK(sock) << "ERROR connect to server" << rdmaUri;
CHECK(sock) << "ERROR connect to server" << rdmaUri;
std::vector<std::string> seg;
str::split(rdmaUri, '/', &seg);
......
......@@ -217,10 +217,6 @@ void ParameterServer2::setConfig(const SetConfigRequest& request,
SetConfigResponse response;
callback(response);
/// always defined, barrier slowest node function need it.
statSet_.reset(new StatSet("ParameterServer" +
str::to_string(static_cast<int>(serverId_))));
}
real bufferSum(const std::vector<ParameterServer2::Buffer>& buffers) {
......@@ -369,50 +365,7 @@ void ParameterServer2::addGradient(const SendParameterRequest& request,
std::vector<Buffer>* outputBuffers) {
VLOG(1) << "pserver: addGradient";
// forwardbackward delta from all trainers
// indicate the fluctuation caused by forwardbackward.
if (!numPassFinishClients_) {
REGISTER_BARRIER_DELTA_SERVER_SET(
*statSet_,
"forwardbackwardDelta",
FLAGS_num_gradient_servers,
request.trainer_id(),
request.forwardbackward_time(),
isSparseServer_ ? "_sparseUpdater" : "_denseUpdater");
}
{
/// approximately pure network overhead
REGISTER_TIMER_DYNAMIC_SET(
"pushRecv", timeToMicroSecond(*handleRequestBegin_), -1, *statSet_);
}
#ifndef PADDLE_DISABLE_TIMER
gettimeofday(&(*addGradBegin_), nullptr);
#endif
/// barrier fluctuation caused by network and previous forwardbackward
if (!numPassFinishClients_) {
REGISTER_BARRIER_TIMER_SERVER_SET(
*statSet_,
"handleReqBegin",
FLAGS_num_gradient_servers,
request.trainer_id(),
(*handleRequestBegin_),
isSparseServer_ ? "_sparseUpdater" : "_denseUpdater");
}
if (!numPassFinishClients_) {
REGISTER_BARRIER_TIMER_SERVER(
*statSet_,
"addGradBegin",
FLAGS_num_gradient_servers,
request.trainer_id(),
isSparseServer_ ? "_sparseUpdater" : "_denseUpdater");
}
{
REGISTER_TIMER_DYNAMIC("addGradCore", -1, *statSet_);
ReadLockGuard guard(parameterMutex_);
int bufferIndex = 0;
for (const auto& block : request.blocks()) {
......@@ -444,15 +397,6 @@ void ParameterServer2::addGradient(const SendParameterRequest& request,
std::lock_guard<std::mutex> guard(*info.lock);
simd::addTo(gradientSumBuffer, gradientBuffer, size);
}
if (!numPassFinishClients_) {
REGISTER_BARRIER_TIMER_SERVER(
*statSet_,
"addGradCoreFinish",
FLAGS_num_gradient_servers,
request.trainer_id(),
isSparseServer_ ? "_sparseUpdater" : "_denseUpdater");
}
}
if (request.batch_status() == BATCH_FINISH ||
request.batch_status() == BATCH_START_AND_FINISH) {
......@@ -461,47 +405,12 @@ void ParameterServer2::addGradient(const SendParameterRequest& request,
VLOG(1) << "num samples: " << numSamplesProcessed_
<< ", new cost:" << cost_;
/// numPassFinishClients_ means some trainer has entered finishPass
if (!numPassFinishClients_) {
REGISTER_SLOW_NODES_PROBE(
*statSet_,
"SLOW_NODES",
FLAGS_num_gradient_servers,
request.trainer_id(),
isSparseServer_ ? "_sparseUpdater" : "_denseUpdater");
}
/// notify doOperation gradient ready
gradientReadyBarrier_.wait();
/// if wait pass finish does not start, do check
if (!numPassFinishClients_) {
CHECK_BARRIER_TIMER(*statSet_,
"SLOW_NODES",
FLAGS_num_gradient_servers,
isSparseServer_ ? "_sparseUpdater" : "_denseUpdater");
}
/// barrier performance while all parameter add is finished
/// can indicate the fluctation caused by computation at pserver.
if (!numPassFinishClients_) {
REGISTER_BARRIER_TIMER_SERVER(
*statSet_,
"paraReady",
FLAGS_num_gradient_servers,
request.trainer_id(),
isSparseServer_ ? "_sparseUpdater" : "_denseUpdater");
}
/// wait doOperation finish
parameterReadyBarrier_.wait();
VLOG(1) << "start send back";
{
/// total time except overhead of network.
REGISTER_TIMER_DYNAMIC_SET("sendParaNoRecvNoSend",
timeToMicroSecond(*addGradBegin_),
-1,
*statSet_);
}
}
}
......@@ -543,57 +452,6 @@ bool ParameterServer2::asyncGrdientCommitCheckAndStat(
return commitGradient;
}
/**
 * Log the async-SGD gradient commit/discard statistics gathered since the last
 * reset, then clear all counters.
 *
 * Nothing is logged or cleared when no update step has been counted
 * (asyncUpdateSteps_ is zero). The report contains the overall discard ratio,
 * the distribution of update steps, and a per-trainer discard ratio for
 * trainer ids in [0, FLAGS_num_gradient_servers).
 */
void ParameterServer2::printAsyncGradientCommitStatAndReset() {
  std::stringstream statFormat;
  if (asyncUpdateSteps_) {
    statFormat << "async discard gradients stat: " << std::endl;
    statFormat << "serverId: " << serverId_
               << " serverType: " << isSparseServer_
               << " total updates: " << asyncUpdateSteps_
               << " discard updates: " << asyncLaggedGradientsNum_
               << " discard ratio: "
               << (real)asyncLaggedGradientsNum_ / (real)asyncUpdateSteps_;
    statFormat << std::endl;
    statFormat << std::endl;

    statFormat << "Async Gradient Update Steps distribution: " << std::endl
               << "Sample: 1:1912(0.00284449) means "
               << "the updates step=1 count 1912 times "
               << "and account for 0.284449% of total updates" << std::endl;
    size_t index = 0;
    for (const auto& stat : asyncUpdateStat_) {
      statFormat << index << ":" << stat << "("
                 << (real)stat / (real)asyncUpdateSteps_ << ") ";
      index++;
    }
    statFormat << std::endl;
    statFormat << std::endl;

    statFormat << "Async Gradient Discard based on trainer_id: " << std::endl
               << "Sample: 2:22(0.0016363) means "
               << "total discarded updates from trainer_id=2 count 22 "
               << "and account for 0.16363% of all updates from trainer_id=2"
               << std::endl;
    for (auto i = 0; i < FLAGS_num_gradient_servers; i++) {
      const auto discarded = asyncTrainerDiscardStat_[i];
      const auto total = asyncTrainerCommitStat_[i] + discarded;
      // A trainer with no commits and no discards would otherwise yield 0/0
      // (NaN) in the log; report a ratio of 0 for it instead.
      real ratio = total ? (real)discarded / (real)total : (real)0;
      statFormat << i << ":" << discarded << "(" << ratio << ")"
                 << " ";
    }
    LOG(INFO) << statFormat.str();

    /// reset stat
    asyncUpdateSteps_ = 0;
    asyncTrainerSteps_.assign(asyncTrainerSteps_.size(), 0);
    asyncLaggedGradientsNum_ = 0;
    asyncUpdateStat_.assign(asyncUpdateStat_.size(), 0);
    asyncTrainerDiscardStat_.assign(asyncTrainerDiscardStat_.size(), 0);
    asyncTrainerCommitStat_.assign(asyncTrainerCommitStat_.size(), 0);
  }
}
static ThreadLocal<std::vector<bool>> localBlockBitset_;
void ParameterServer2::asyncSGD(const SendParameterRequest& request,
......@@ -695,7 +553,6 @@ void ParameterServer2::asyncSGD(const SendParameterRequest& request,
if (request.trainer_id() == 0) {
/// batchId_ is approximately equal to "real batchId_"
batchId_++;
tuningAsyncsgdMidOutput();
}
}
......@@ -881,34 +738,6 @@ void ParameterServer2::sendParameter(const SendParameterRequest& request,
}
(*requestVec_).clear();
(*callbackVec_).clear();
/// barrier perfromance while all data are send finished.
/// indicates network flucatuation for big message.
if (!numPassFinishClients_) {
REGISTER_BARRIER_TIMER_SERVER(
*statSet_,
"sendParamFinish",
FLAGS_num_gradient_servers,
request.trainer_id(),
isSparseServer_ ? "_sparseUpdater" : "_denseUpdater");
}
/// all time exhausted in parameterServer for big message.
/// it contains network and computation at pserver.
{
/// total time including overhead of network.
REGISTER_TIMER_DYNAMIC_SET("sendParaTotal",
timeToMicroSecond(*handleRequestBegin_),
-1,
*statSet_);
}
/// all time exhausted in pserverServer except recieve network.
{
/// total time except overhead of network receive
REGISTER_TIMER_DYNAMIC_SET("sendParaNoRecv",
timeToMicroSecond(*addGradBegin_),
-1,
*statSet_);
}
}
break;
case PSERVER_UPDATE_MODE_SET_PARAM:
......@@ -1088,8 +917,6 @@ void ParameterServer2::op_SGD(const Operation& operation,
}
{
REGISTER_TIMER_DYNAMIC("op_SGD", -1, *statSet_);
parallelExecForEachBlock([&](int64_t blockId, const VectorPtr vecs[]) {
BlockInfo& info = blockInfos_[blockId];
const ParameterConfig& config = getParameterConfig(blockId);
......@@ -1113,7 +940,6 @@ void ParameterServer2::op_SGD(const Operation& operation,
}
batchId_++;
tuningSgdMidOutput();
}
void ParameterServer2::op_start_pass(const Operation& operation,
......@@ -1146,8 +972,6 @@ void ParameterServer2::op_finish_pass(const Operation& operation,
/// finish pass
info.optimizer->finishPass();
});
tuningSgdFinished();
batchId_ = 0;
}
......@@ -1515,7 +1339,6 @@ void ParameterServer2::asyncFinishPass(const SynchronizeRequest& request,
callback(SynchronizeResponse());
if (request.trainer_id() == 0) {
tuningAsyncsgdFinished();
batchId_ = 0;
}
}
......@@ -1574,42 +1397,4 @@ void ParameterServer2::releaseMatrix(const ReleaseMatrixRequest& request,
callback(response);
}
// Periodic sync-SGD report: every FLAGS_log_period_server batches (and never
// for batch 0) print all collected stats and reset them, keeping raw data.
void ParameterServer2::tuningSgdMidOutput() {
  // Not a reporting batch: nothing to do.
  if (!batchId_ || batchId_ % FLAGS_log_period_server != 0) {
    return;
  }
  LOG(INFO) << "======== Batch=" << batchId_ << "=======";
  statSet_->setThreadInfo(true);
  statSet_->printAllStatus();
  /// not reset raw data for reducing the overhead of performance tuning
  statSet_->reset(false);
}
// End-of-pass sync-SGD report: logs a banner with the current batch id, prints
// every stat held in statSet_ and then resets statSet_ with its default
// arguments (see the note below about raw data).
void ParameterServer2::tuningSgdFinished() {
LOG(INFO) << "======== Batch=" << batchId_ << " pass END"
<< "=======";
statSet_->setThreadInfo(true);
statSet_->printAllStatus();
/**
* reset raw data at end of pass since some raw data could be not
* complete. Otherwise the raw data will pollute next pass performance
* tuning
*/
statSet_->reset();
}
// Periodic async-SGD report. Compiled to a no-op when PADDLE_DISABLE_TIMER is
// defined; otherwise, every FLAGS_log_period_server batches (never for batch
// 0), logs a banner and prints/resets the async gradient commit statistics.
void ParameterServer2::tuningAsyncsgdMidOutput() {
#ifndef PADDLE_DISABLE_TIMER
  // Not a reporting batch: nothing to do.
  if (!batchId_ || batchId_ % FLAGS_log_period_server != 0) {
    return;
  }
  LOG(INFO) << "======== [not accurate] Batch=" << batchId_ << "=======";
  printAsyncGradientCommitStatAndReset();
#endif
}
// End-of-pass async-SGD report: logs a banner with the current batch id and
// then prints and resets the async gradient commit statistics. Unlike the
// mid-pass variant this is not guarded by PADDLE_DISABLE_TIMER.
void ParameterServer2::tuningAsyncsgdFinished() {
LOG(INFO) << "======== [not accurate] Batch=" << batchId_ << " pass END"
<< "=======";
printAsyncGradientCommitStatAndReset();
}
} // namespace paddle
......@@ -298,24 +298,6 @@ protected:
/// barrier performance tuning sync-sgd required
std::atomic<int64_t> batchId_;
/// the beginning of addGradient without network overhead
ThreadLocal<struct timeval> addGradBegin_;
/**
* tuning barrier performance
* to better control log for sparse and dense parameter,
* we use different log entities for different parameterServer
* objects.
* it will output lots of performance stats to perceive the
* overhead of network, fluctuation of computation from
* forwardbackward and network, computation from optimization
* at pserver end, barrier overhead, etc. to understand tuning
* data, focus on the synchronization between addGradient and
* doOperation which indirectly call op_SGD operation controlled
* by remote updater controller
*/
std::unique_ptr<StatSet> statSet_;
public:
struct Buffer {
real* base;
......@@ -325,7 +307,6 @@ public:
protected:
/// async gradient commit control
bool asyncGrdientCommitCheckAndStat(const SendParameterRequest& request);
void printAsyncGradientCommitStatAndReset();
public:
/// disable default parameter for overloading
......@@ -710,36 +691,6 @@ public:
void op_load(const Operation& operation, OperationResult* result);
void op_save(const Operation& operation, OperationResult* result);
/**
* @brief output log in at the middle stage of training
*
* @note flush log histroy and state at the end for sgd
*/
void tuningSgdMidOutput();
/**
* @brief output log in at the end stage of training
*
* @note flush log histroy and state at the end for sgd. it will also
* flush some stateful stat for next pass.
*/
void tuningSgdFinished();
/**
* @brief output log in at the middle stage of training
*
* @note flush log histroy and state at the end for async-sgd.
* it will log some performance log if some lagged node are found
*/
void tuningAsyncsgdMidOutput();
/**
* @brief output log in at the end stage of training
*
* @note flush log histroy and state at the end for async-sgd.
*/
void tuningAsyncsgdFinished();
};
} // namespace paddle
......@@ -51,7 +51,7 @@ size_t SocketChannel::read(void* buf, size_t size) {
else
len = rdma::read(rdmaSocket_, (char*)buf + total, size - total);
PCHECK(len >= 0) << " peer=" << peerName_;
CHECK(len >= 0) << " peer=" << peerName_;
if (len <= 0) {
return total;
}
......@@ -69,7 +69,7 @@ size_t SocketChannel::write(const void* buf, size_t size) {
else
len = rdma::write(rdmaSocket_, (char*)buf + total, size - total);
PCHECK(len >= 0) << " peer=" << peerName_;
CHECK(len >= 0) << " peer=" << peerName_;
if (len <= 0) {
return total;
}
......@@ -98,10 +98,10 @@ static size_t readwritev(IOFunc iofunc,
while (size < total) {
ssize_t len =
iofunc(socket, &iovs[curIov], std::min(iovcnt - curIov, maxiovs));
PCHECK(len > 0) << " peer=" << peerName << " curIov=" << curIov
<< " iovCnt=" << iovcnt
<< " iovs[curIov].base=" << iovs[curIov].iov_base
<< " iovs[curIov].iov_len=" << iovs[curIov].iov_len;
CHECK(len > 0) << " peer=" << peerName << " curIov=" << curIov
<< " iovCnt=" << iovcnt
<< " iovs[curIov].base=" << iovs[curIov].iov_base
<< " iovs[curIov].iov_len=" << iovs[curIov].iov_len;
size += len;
/// restore iovs[curIov] to the original value
......@@ -183,7 +183,7 @@ void SocketChannel::writeMessage(const std::vector<struct iovec>& userIovs) {
header.totalLength += iov.iov_len;
}
PCHECK(writev(iovs) == (size_t)header.totalLength);
CHECK(writev(iovs) == (size_t)header.totalLength);
}
std::unique_ptr<MsgReader> SocketChannel::readMessage() {
......@@ -194,7 +194,7 @@ std::unique_ptr<MsgReader> SocketChannel::readMessage() {
return nullptr;
}
PCHECK(len == sizeof(header));
CHECK(len == sizeof(header));
std::unique_ptr<MsgReader> msgReader(new MsgReader(this, header.numIovs));
......@@ -209,7 +209,7 @@ std::unique_ptr<MsgReader> SocketChannel::readMessage() {
MsgReader::MsgReader(SocketChannel* channel, size_t numBlocks)
: channel_(channel), blockLengths_(numBlocks), currentBlockIndex_(0) {
size_t size = numBlocks * sizeof(blockLengths_[0]);
PCHECK(channel_->read(&blockLengths_[0], size) == size);
CHECK(channel_->read(&blockLengths_[0], size) == size);
}
void MsgReader::readBlocks(const std::vector<void*>& bufs) {
......@@ -223,12 +223,12 @@ void MsgReader::readBlocks(const std::vector<void*>& bufs) {
++currentBlockIndex_;
}
PCHECK(channel_->readv(&iovs) == totalLength);
CHECK(channel_->readv(&iovs) == totalLength);
}
void MsgReader::readNextBlock(void* buf) {
CHECK_LT(currentBlockIndex_, blockLengths_.size());
PCHECK(channel_->read(buf, getNextBlockLength()) == getNextBlockLength());
CHECK(channel_->read(buf, getNextBlockLength()) == getNextBlockLength());
++currentBlockIndex_;
}
......
......@@ -113,7 +113,7 @@ void SocketServer::run() {
/* First call to socket() function */
socket_ = socket(AF_INET, SOCK_STREAM, 0);
PCHECK(socket_ >= 0) << "ERROR opening socket";
CHECK(socket_ >= 0) << "ERROR opening socket";
/* Initialize socket structure */
bzero((char*)&serv_addr, sizeof(serv_addr));
......@@ -122,7 +122,7 @@ void SocketServer::run() {
serv_addr.sin_port = htons(port_);
/* Now bind the host address using bind() call.*/
PCHECK(bind(socket_, (struct sockaddr*)&serv_addr, sizeof(serv_addr)) >= 0)
CHECK(bind(socket_, (struct sockaddr*)&serv_addr, sizeof(serv_addr)) >= 0)
<< "ERROR on binding";
/* Now start listening for the clients, here process will
......@@ -134,7 +134,7 @@ void SocketServer::run() {
while (true) {
/* Accept actual connection from the client */
newsockfd = accept(socket_, (struct sockaddr*)&cli_addr, &clilen);
PCHECK(newsockfd >= 0) << "ERROR on accept";
CHECK(newsockfd >= 0) << "ERROR on accept";
SocketWorker* worker = new SocketWorker(newsockfd);
worker->start();
......@@ -146,17 +146,17 @@ void SocketWorker::run() {
while (true) {
int64_t n = channel_.readAll(&header, sizeof(header));
PCHECK(n == sizeof(header)) << "ERROR reading from socket";
CHECK(n == sizeof(header)) << "ERROR reading from socket";
buffer_.resize(header.dataLength);
n = channel_.readAll(&buffer_[0], header.dataLength);
PCHECK(n == header.dataLength) << "ERROR reading from socket";
CHECK(n == header.dataLength) << "ERROR reading from socket";
/* Write a response to the client */
n = channel_.writeAll(&header, sizeof(header));
PCHECK(n == sizeof(header)) << "ERROR reading from socket";
CHECK(n == sizeof(header)) << "ERROR reading from socket";
n = channel_.writeAll(buffer_.data(), buffer_.size());
PCHECK(n == header.dataLength) << "ERROR writing to socket";
CHECK(n == header.dataLength) << "ERROR writing to socket";
}
}
......@@ -177,9 +177,9 @@ SocketClient::SocketClient(const std::string& serverAddr, int serverPort) {
/* Create a socket point */
int sockfd = socket(AF_INET, SOCK_STREAM, 0);
PCHECK(sockfd >= 0) << "ERROR opening socket";
CHECK(sockfd >= 0) << "ERROR opening socket";
server = gethostbyname(serverAddr.c_str());
PCHECK(server) << "ERROR, no such host: " << serverAddr;
CHECK(server) << "ERROR, no such host: " << serverAddr;
bzero((char*)&serv_addr, sizeof(serv_addr));
serv_addr.sin_family = AF_INET;
......@@ -189,7 +189,7 @@ SocketClient::SocketClient(const std::string& serverAddr, int serverPort) {
serv_addr.sin_port = htons(serverPort);
/* Now connect to the server */
PCHECK(connect(sockfd, (sockaddr*)&serv_addr, sizeof(serv_addr)) >= 0)
CHECK(connect(sockfd, (sockaddr*)&serv_addr, sizeof(serv_addr)) >= 0)
<< "ERROR connecting";
channel_.reset(new SocketChannel(sockfd));
......@@ -234,18 +234,18 @@ int main(int argc, char** argv) {
cpuGrad.copyFrom(gpuGrad);
header.dataLength = dataSize;
PCHECK(channel->writeAll(&header, sizeof(header)) == sizeof(header))
CHECK(channel->writeAll(&header, sizeof(header)) == sizeof(header))
<< "Client write header error";
PCHECK(channel->writeAll(cpuGrad.getData(), dataSize) == dataSize)
CHECK(channel->writeAll(cpuGrad.getData(), dataSize) == dataSize)
<< "Client write data error";
/* Now read server response */
PCHECK(channel->readAll(&header, sizeof(header)) == sizeof(header))
CHECK(channel->readAll(&header, sizeof(header)) == sizeof(header))
<< "Client read header error";
CHECK_EQ((uint64_t)header.dataLength, dataSize);
PCHECK(channel->readAll(cpuParam.getData(), dataSize) == dataSize)
CHECK(channel->readAll(cpuParam.getData(), dataSize) == dataSize)
<< "Client read data error";
gpuParam.copyFrom(cpuParam);
......
......@@ -3,7 +3,7 @@
set -xe
# Set BASE_IMAGE according to env variables
if [ ${WITH_GPU} == "ON" ]; then
if [[ ${WITH_GPU} == "ON" ]]; then
BASE_IMAGE="nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04"
else
BASE_IMAGE="ubuntu:16.04"
......@@ -78,7 +78,7 @@ paddle version
# PaddlePaddle. This awkwardness is due to
# https://github.com/PaddlePaddle/Paddle/issues/1854. It also
# describes a solution.
if [ ${WITH_DOC} == "ON" ]; then
if [[ ${WITH_DOC} == "ON" ]]; then
cat <<EOF
========================================
Building documentation ...
......
......@@ -175,7 +175,7 @@ real Tester::forwardOneBatch(const DataBatch& dataBatch,
}
hl_stream_synchronize(HPPL_STREAM_DEFAULT);
FILE* fp = fopen(featFile.c_str(), "ab+");
PCHECK(!ferror(fp)) << "Fail to open " << featFile;
CHECK(!ferror(fp)) << "Fail to open " << featFile;
size_t sampleNum = featMatrices[0]->getHeight();
for (size_t i = 0; i < sampleNum; ++i) {
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/utils/BarrierStat.h"
#include <string.h>
#include <sys/types.h>
#include <algorithm>
#include <iomanip>
#include "paddle/utils/Flags.h"
#include "paddle/utils/Stat.h"
// Command-line flags (gflags) controlling barrier statistics output.
// log_barrier_abstract: when false, streaming a BarrierStatBase with
// operator<< writes nothing.
DEFINE_bool(log_barrier_abstract,
true,
"if true, show abstract of barrier performance");
// log_barrier_lowest_nodes: number of slowest nodes to report (its use is not
// visible in this part of the file).
DEFINE_int32(log_barrier_lowest_nodes,
5,
"how many lowest node will be logged");
// log_barrier_show_log: when true, the BarrierStatBase constructor sets the
// reporting threshold (rateThreshold_) to 0.
DEFINE_bool(log_barrier_show_log,
false, // for performance tuning insight
"if true, always show barrier abstract even with little gap");
namespace paddle {
// Stream the barrier abstract of `stat` into `output`. Writes nothing when
// FLAGS_log_barrier_abstract is off; otherwise holds stat.lock_ while
// showAbstract() produces the text. Always returns `output` for chaining.
std::ostream &operator<<(std::ostream &output, const BarrierStatBase &stat) {
  if (!FLAGS_log_barrier_abstract) {
    return output;
  }
  std::lock_guard<std::mutex> guard(stat.lock_);
  stat.showAbstract(output);
  return output;
}
// Build the shared barrier-stat state: one abstract slot per connection
// thread plus the rate threshold that decides whether an abstract is printed.
BarrierStatBase::BarrierStatBase(uint16_t numConnThreads,
                                 const std::string &name)
    : totSamples_(0), numConnThreads_(numConnThreads), name_(name) {
  abstract_.resize(numConnThreads_);

  // Forced logging: a zero threshold makes every rate pass the check.
  if (FLAGS_log_barrier_show_log) {
    rateThreshold_ = 0.0;
    return;
  }

  /* probability of abnormal node
   * p = 1/n + (n/8)/(n+1), n = nodes, n > 1
   * if the freq of lowest trainerId larger than p,
   * output FLAGS_log_barrier_lowest_nodes lastTrainerId.
   * numConnThreads_ indicates nodes
   */
  const float nodes = (float)numConnThreads;
  rateThreshold_ = 1.0 / nodes + (nodes / 8.0) / (nodes + 1.0);
}
// Construct an end-barrier stat: after the base class is initialised, allocate
// the time vector sized by numConnThreads_, clear all counters (including raw
// data, since reset is called with true), and log the threshold chosen by the
// base-class constructor.
BarrierEndStat::BarrierEndStat(uint16_t numConnThreads, const std::string &name)
: BarrierStatBase(numConnThreads, name) {
timeVector_.reset(new TimeVectorEnd(numConnThreads_));
// reset() dereferences timeVector_ when asked to clear raw data, so it must
// run after the allocation above.
reset(true);
LOG(INFO) << " create barrierEndStat: " << name
<< " endBarrier warning rate: " << rateThreshold_;
}
/*
* Note:
* the design different pserver entity owns different statSet to obey
* the background that different pserver runs separately.
*/
void BarrierEndStat::updateStat(struct timeval &cur, int32_t trainerId) {
CHECK_LT(trainerId, numConnThreads_) << "trainerId is invalid in barrier";
std::lock_guard<std::mutex> guard(lock_);
timeVector_->addTimeval(cur, trainerId);
if (timeVector_->full()) {
std::lock_guard<std::mutex> abstractGuard(abstractLock_);
auto id = timeVector_->getLastTrainerId();
auto delta = timeToMicroSecond(timeVector_->getDelta());
auto secondDelta = timeToMicroSecond(timeVector_->get1NDelta());
auto lastTwoDelta = timeToMicroSecond(timeVector_->getMinus1NDelta());
auto midDelta = timeToMicroSecond(timeVector_->getMidNDelta());
// discard first sample, since first sample probably is abnormal.
if (totSamples_) {
abstract_[id].freq++;
if (delta < abstract_[id].minDelta) {
abstract_[id].minDelta = delta;
}
if (delta > abstract_[id].maxDelta) {
abstract_[id].maxDelta = delta;
}
abstract_[id].totDelta += delta;
abstract_[id].totSecondDelta += secondDelta;
abstract_[id].totLastTwoDelta += lastTwoDelta;
abstract_[id].totMidDelta += midDelta;
// update totAbstract_
totAbstract_.freq++;
if (delta < totAbstract_.minDelta) {
totAbstract_.minDelta = delta;
}
if (delta > totAbstract_.maxDelta) {
totAbstract_.maxDelta = delta;
}
totAbstract_.totDelta += delta;
totAbstract_.totSecondDelta += secondDelta;
totAbstract_.totLastTwoDelta += lastTwoDelta;
totAbstract_.totMidDelta += midDelta;
}
totSamples_++;
timeVector_->reset();
}
}
/*
 * Clears all accumulated statistics.
 *
 * clearRawData: when true the partially filled raw timestamp buffer is
 * dropped as well.
 *
 * totSamples_ is cleared while abstractLock_ is held, the same lock
 * updateStat() holds when it increments the counter, so the two cannot
 * interleave. Slots are value-initialised instead of memset so no raw-memory
 * routine (and its header) is needed.
 */
void BarrierEndStat::reset(bool clearRawData) {
  std::lock_guard<std::mutex> guard(abstractLock_);
  totSamples_ = 0;
  if (clearRawData) {
    timeVector_->reset();
  }

  int32_t nextTrainerId = 0;
  for (auto &abstract : abstract_) {
    abstract = Abstract();  // zero every field
    abstract.minDelta = UINT64_MAX;
    abstract.trainerId = nextTrainerId++;
  }

  totAbstract_ = Abstract();
  totAbstract_.minDelta = UINT64_MAX;
}
/*
 * Writes a table describing which trainers arrive last at the barrier.
 * Output is suppressed when there are at most two connection threads, when no
 * sample exists yet, or when the most frequent last-arriving trainer stays
 * below rateThreshold_.
 */
void BarrierEndStat::showAbstract(std::ostream &output) const {
  // do not support the case "<=2 pserver"
  if (numConnThreads_ <= 2 || !totSamples_) {
    return;
  }

  // rank a copy of the per-trainer slots by descending frequency
  std::vector<struct Abstract> ranked(abstract_);
  std::sort(ranked.begin(),
            ranked.end(),
            [](const struct Abstract &lhs, const struct Abstract &rhs) {
              return lhs.freq > rhs.freq;
            });

  const float topRate = (float)ranked[0].freq / (float)totSamples_;
  if (topRate < rateThreshold_) {
    return;
  }

  output << std::setw(20) << name_ << std::endl;

  /*
   * Note:
   * avgGap: the average delta between 1 -- n arriving trainers
   * avgSecondGap: the average delta between 2 -- n arriving trainers
   * avgLastTwoGap: the average delta between n-1 -- n arriving trainers
   * avgMidGap: the average delta between n/2 -- n arriving trainers
   * rate: samples / totSamples
   *
   * the stat is based on per trainer if trainer_id is set, totAbstract is
   * stat based on all trainers scope.
   */
  output << std::setw(42) << " " << std::setw(15) << "trainerId"
         << std::setw(15) << "avgGap" << std::setw(15) << "avgSecondGap"
         << std::setw(15) << "avgLastTwoGap" << std::setw(15) << "avgMidGap"
         << std::setw(10) << "rate" << std::setw(10) << "samples"
         << std::setw(10) << "totSamples" << std::endl;

  // show totAbstract, it's valuable when lastTrainerId is evenly distributed
  if (!totAbstract_.freq) return;

  // integer average in microseconds, then scaled by 0.001 for display
  auto avgGap = [](uint64_t total, uint64_t freq) {
    return (total / freq) * 0.001;
  };
  // the numeric columns that follow the first (label / trainerId) column
  auto writeStats = [&](const struct Abstract &entry) {
    output << std::setw(15) << avgGap(entry.totDelta, entry.freq)
           << std::setw(15) << avgGap(entry.totSecondDelta, entry.freq)
           << std::setw(15) << avgGap(entry.totLastTwoDelta, entry.freq)
           << std::setw(15) << avgGap(entry.totMidDelta, entry.freq)
           << std::setw(10) << (float)entry.freq / (float)totSamples_
           << std::setw(10) << (float)entry.freq << std::setw(10)
           << (float)totSamples_ << std::endl;
  };

  output << std::setw(42) << " " << std::setw(15) << "totAbstract";
  writeStats(totAbstract_);

  // show lastTrainerId abstract
  int shown = 0;
  for (const auto &entry : ranked) {
    if (!entry.freq) {
      break;
    }
    if (shown >= FLAGS_log_barrier_lowest_nodes) {
      break;
    }
    ++shown;
    // output format control
    output << std::setw(42) << " " << std::setw(15) << entry.trainerId;
    writeStats(entry);
  }
}
// Creates a delta-time barrier stat: allocates the raw delta buffer with one
// slot per connection thread, clears all counters and logs the threshold.
BarrierDeltaStat::BarrierDeltaStat(uint16_t numConnThreads,
                                   const std::string &name)
    : BarrierStatBase(numConnThreads, name) {
  timeVector_ =
      std::unique_ptr<TimeVectorDelta>(new TimeVectorDelta(numConnThreads_));
  reset(true);
  LOG(INFO) << " create barrierDeltaStat: " << name
            << " barrierDelta warning rate: " << rateThreshold_;
}
void BarrierDeltaStat::updateStat(uint64_t delta, int32_t trainerId) {
CHECK_LT(trainerId, numConnThreads_) << "trainerId is invalid in barrier";
std::lock_guard<std::mutex> guard(lock_);
timeVector_->addTimeval(delta, trainerId);
if (timeVector_->full()) {
std::lock_guard<std::mutex> abstractGuard(abstractLock_);
auto id = timeVector_->getMaxTrainerId();
auto delta = timeVector_->getDelta();
// discard first sample, since first sample probably is abnormal.
if (totSamples_) {
abstract_[id].freq++;
if (delta < abstract_[id].minDelta) {
abstract_[id].minDelta = delta;
}
if (delta > abstract_[id].maxDelta) {
abstract_[id].maxDelta = delta;
}
abstract_[id].totDelta += delta;
// update totAbstract_
totAbstract_.freq++;
if (delta < totAbstract_.minDelta) {
totAbstract_.minDelta = delta;
}
if (delta > totAbstract_.maxDelta) {
totAbstract_.maxDelta = delta;
}
totAbstract_.totDelta += delta;
}
totSamples_++;
timeVector_->reset();
}
}
/*
 * Clears all accumulated statistics.
 *
 * clearRawData: when true the partially filled raw delta buffer is dropped
 * as well.
 *
 * totSamples_ is cleared while abstractLock_ is held, the same lock
 * updateStat() holds when it increments the counter, so the two cannot
 * interleave. Slots are value-initialised instead of memset so no raw-memory
 * routine (and its header) is needed.
 */
void BarrierDeltaStat::reset(bool clearRawData) {
  std::lock_guard<std::mutex> guard(abstractLock_);
  totSamples_ = 0;
  if (clearRawData) {
    timeVector_->reset();
  }

  int32_t nextTrainerId = 0;
  for (auto &abstract : abstract_) {
    abstract = Abstract();  // zero every field
    abstract.minDelta = UINT64_MAX;
    abstract.trainerId = nextTrainerId++;
  }

  totAbstract_ = Abstract();
  totAbstract_.minDelta = UINT64_MAX;
}
void BarrierDeltaStat::showAbstract(std::ostream &output) const {
// do not support the case "<=2 pserver"
if (numConnThreads_ <= 2 || !totSamples_) {
return;
}
// duplicate freq info
std::vector<struct Abstract> outputAbstract = abstract_;
std::sort(outputAbstract.begin(),
outputAbstract.end(),
[](const struct Abstract &a, const struct Abstract &b) {
return a.freq > b.freq;
});
auto rate = (float)outputAbstract[0].freq / (float)totSamples_;
if (rate < rateThreshold_) {
return;
}
output << std::setw(20) << name_ << std::endl;
/* Note:
* Gap means the delta from all trainers' forwardbackward
* avgGap: average Gap in log_period batches
* minGap: min Gap in log_period batches
* maxGap: max Gap in log_period batches
* trainerId: the slowest trainer_id
*
* the stat is based on per trainer if trainer_id is set, totAbstract is
* stat based on all trainers scope.
*/
output << std::setw(42) << " " << std::setw(15) << "trainerId"
<< std::setw(15) << "avgGap" << std::setw(10) << "minGap"
<< std::setw(10) << "maxGap" << std::setw(10) << "rate"
<< std::setw(10) << "samples" << std::setw(10) << "totSamples"
<< std::endl;
// show totAbstract, it's valuable when lastTrainerId is even-distributed'
if (!totAbstract_.freq) return;
output << std::setw(42) << " " << std::setw(15) << "totAbstract"
<< std::setw(15) << (totAbstract_.totDelta / totAbstract_.freq) * 0.001
<< std::setw(10) << totAbstract_.minDelta * 0.001 << std::setw(10)
<< totAbstract_.maxDelta * 0.001 << std::setw(10)
<< (float)totAbstract_.freq / (float)totSamples_ << std::setw(10)
<< (float)totAbstract_.freq << std::setw(10) << (float)totSamples_
<< std::endl;
// show lastTrainerId abstract
int count = 0;
for (auto &abstract : outputAbstract) {
if (!abstract.freq || count++ >= FLAGS_log_barrier_lowest_nodes) {
break;
}
// output format control
output << std::setw(42) << " " << std::setw(15) << abstract.trainerId
<< std::setw(15) << (abstract.totDelta / abstract.freq) * 0.001
<< std::setw(10) << abstract.minDelta * 0.001 << std::setw(10)
<< abstract.maxDelta * 0.001 << std::setw(10)
<< (float)abstract.freq / (float)totSamples_ << std::setw(10)
<< (float)abstract.freq << std::setw(10) << (float)totSamples_
<< std::endl;
}
}
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stdint.h>
#include <sys/time.h>
#include <iostream>
#include <list>
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>
#include "Locks.h"
#include "Logging.h"
#include "ThreadLocal.h"
namespace paddle {
// Converts a timeval to a microsecond count.
// The arithmetic is done explicitly in 64 bits: `unsigned long` is only
// 32 bits wide on some platforms, where seconds * 1000000 would overflow
// after roughly 71 minutes.
inline uint64_t timeToMicroSecond(struct timeval time) {
  return static_cast<uint64_t>(time.tv_sec) * 1000000ULL +
         static_cast<uint64_t>(time.tv_usec);
}
class TimeVectorEnd {
/*
* help class for gathering all barrier performance data
* which shows time point property.
* freqently used in barrier performance tuning API, such
* as tuning which is slowest node in sync-sgd mode training.
*/
public:
explicit TimeVectorEnd(uint16_t size) : size_(size) {
index_ = 0;
timeArray_.resize(size);
trainerIds_.resize(size);
}
~TimeVectorEnd() {}
uint16_t size() { return size_; }
bool full() { return index_ == size_; }
bool empty() { return index_ == 0; }
void reset() { index_ = 0; }
void addTimeval(struct timeval time, int32_t trainerId) {
timeArray_[index_] = time;
trainerIds_[index_] = trainerId;
index_++;
}
struct timeval getDelta() const {
struct timeval delta;
CHECK_GT(size_, 1) << "not support with 1 pserver";
timersub(&timeArray_[size_ - 1], &timeArray_[0], &delta);
return delta;
}
/* 2, n delta */
struct timeval get1NDelta() const {
CHECK_GT(size_, 2) << "not support with less than 2 pservers";
struct timeval delta;
timersub(&timeArray_[size_ - 1], &timeArray_[1], &delta);
return delta;
}
/* n-1, n delta */
struct timeval getMinus1NDelta() const {
CHECK_GT(size_, 2) << "not support with less than 2 pservers";
struct timeval delta;
timersub(&timeArray_[size_ - 1], &timeArray_[size_ - 2], &delta);
return delta;
}
/* n/2, n delta */
struct timeval getMidNDelta() const {
CHECK_GT(size_, 2) << "not support with less than 2 pservers";
struct timeval delta;
timersub(&timeArray_[size_ - 1], &timeArray_[size_ / 2], &delta);
return delta;
}
int32_t getLastTrainerId() const { return trainerIds_[index_ - 1]; }
private:
uint16_t size_;
uint16_t index_;
std::vector<struct timeval> timeArray_;
std::vector<int32_t> trainerIds_;
};
class TimeVectorDelta {
/*
* help class for gathering performance data which shows time
* delta property, such as tuning the time distribution of
* forwardBackward time from all cluster nodes.
*/
public:
explicit TimeVectorDelta(uint16_t size)
: size_(size), min_(UINT64_MAX), max_(0) {
index_ = 0;
timeArray_.resize(size);
}
~TimeVectorDelta() {}
uint16_t size() { return size_; }
bool full() { return index_ == size_; }
bool empty() { return index_ == 0; }
void reset() {
index_ = 0;
min_ = UINT64_MAX;
max_ = 0;
}
void addTimeval(uint64_t delta, int32_t trainerId) {
timeArray_[index_] = delta;
index_++;
if (delta < min_) {
min_ = delta;
}
if (delta > max_) {
max_ = delta;
maxTrainerId_ = trainerId;
}
}
uint64_t getDelta() const {
CHECK_GT(size_, 1) << "not support with 1 pserver";
return max_ - min_;
}
/* 2, n delta */
uint64_t get1NDelta() const {
CHECK_GT(size_, 2) << "not support with less than 2 pservers";
LOG(FATAL) << "Not implemented";
}
/* n-1, n delta */
uint64_t getMinus1NDelta() const {
CHECK_GT(size_, 2) << "not support with less than 2 pservers";
LOG(FATAL) << "Not implemented";
}
/* n/2, n delta */
uint64_t getMidNDelta() const {
CHECK_GT(size_, 2) << "not support with less than 2 pservers";
LOG(FATAL) << "Not implemented";
}
int32_t getMaxTrainerId() const { return maxTrainerId_; }
private:
uint16_t size_;
uint16_t index_;
std::vector<uint64_t> timeArray_;
private:
uint64_t min_;
uint64_t max_;
int32_t maxTrainerId_;
};
// Aggregated barrier statistics; all delta fields are in microseconds.
// One instance exists per trainer slot plus one global summary.
struct Abstract {
// last trainerId for barrier end, maxDelta trainerId for barrier delta
int32_t trainerId;
// smallest and largest delta observed so far
uint64_t minDelta;
uint64_t maxDelta;
// sum of all observed deltas; divide by freq for the average
uint64_t totDelta;
// first one is probably itself, so discard it.
uint64_t totSecondDelta;
// to confirm whether the last node destroys barrier performance.
uint64_t totLastTwoDelta;
// n/2-n delta
uint64_t totMidDelta;
// number of samples accumulated into this entry
uint64_t freq;
};
// Barrier performance tuning stats: state and interface shared by
// BarrierEndStat and BarrierDeltaStat.
class BarrierStatBase {
public:
BarrierStatBase(uint16_t numConnThreads, const std::string &name);
virtual ~BarrierStatBase() {}
// if called at pserver end, then trainerId means the trainer's id.
// by default trainer does not use trainerId, so it is set to -1
virtual void updateStat(struct timeval &cur, int32_t trainerId = -1) = 0;
virtual void updateStat(uint64_t delta, int32_t trainerId = -1) = 0;
const std::string &getName() { return name_; }
// The base implementation does nothing; subclasses clear their statistics.
virtual void reset(bool clearRawData = true) {}
// since the timeVector_ is not stateful, it is not clear whether the
// barrier delta is correct. if one timestamp was lost, all data
// from the barrier stat becomes rubbish. -_-
// The base implementation only logs and reports failure; subclasses
// override it.
virtual bool checkPassBarrier() {
LOG(INFO) << "bug implementation found";
return false;
}
protected:
// Renders the statistics into output; called by operator<< with lock_ held.
virtual void showAbstract(std::ostream &output) const {}
friend std::ostream &operator<<(std::ostream &output,
const BarrierStatBase &stat);
protected:
mutable std::mutex lock_;
std::mutex abstractLock_; // see note on updateStat
// frequency (and deltas) for each barrier trainer
std::vector<struct Abstract> abstract_;
// it is valuable when doing perf-tuning, if lastTrainerId follows a uniform
// distribution
struct Abstract totAbstract_;
// number of completed rounds, including the discarded first one
uint64_t totSamples_;
protected:
uint16_t numConnThreads_; // total updates needed
// showAbstract() prints nothing while the top trainer's rate is below this
float rateThreshold_;
std::string name_;
};
// the end-time of arriving at a real/forged barrier position
class BarrierEndStat : public BarrierStatBase {
public:
BarrierEndStat(uint16_t numConnThreads, const std::string &name);
~BarrierEndStat() {}
virtual void updateStat(struct timeval &cur, int32_t trainerId = -1);
// Delta samples are not supported by this stat; the call is only logged.
virtual void updateStat(uint64_t delta, int32_t trainerId = -1) {
LOG(INFO) << "have no delta updateStat in BarrierEndStat";
}
virtual void reset(bool clearRawData = true);
// True when no partially collected round is pending in timeVector_.
virtual bool checkPassBarrier() { return timeVector_->empty(); }
protected:
/*
* Example of the emitted LOG:
* readAllBlocks_denseUpdater
* trainerId avgGap avgSecondGap avgLastTwoGap avgMidGap rate
* 44 86.702 81.022 9.984 50.472 0.144737
* 46 87.723 82.939 8.737 50.019 0.118421
* 35 100.923 96.752 14.305 61.979 0.0657895
* The flags log_barrier_abstract, log_barrier_lowest_nodes and
* log_barrier_show_log control the details.
*/
virtual void showAbstract(std::ostream &output) const;
private:
// raw arrival timestamps of the round currently being collected
std::unique_ptr<TimeVectorEnd> timeVector_;
};
// the delta-time from different trainers,
// eg, find the degree of imbalance of BP time at pserver end.
// the entry values in timeVector_ are BP deltas; the evaluation is done on
// these deltas.
class BarrierDeltaStat : public BarrierStatBase {
public:
BarrierDeltaStat(uint16_t numConnThreads, const std::string &name);
~BarrierDeltaStat() {}
virtual void updateStat(uint64_t delta, int32_t trainerId = -1);
// Timestamp samples are not supported by this stat; the call is only logged.
virtual void updateStat(struct timeval &cur, int32_t trainerId = -1) {
LOG(INFO) << "have no timeval updateStat in BarrierDeltaStat";
}
virtual void reset(bool clearRawData = true);
// True when no partially collected round is pending in timeVector_.
virtual bool checkPassBarrier() { return timeVector_->empty(); }
protected:
virtual void showAbstract(std::ostream &outPut) const;
private:
// store delta time in uint64_t, eg BP time of all trainers
std::unique_ptr<TimeVectorDelta> timeVector_;
};
// to distinguish different contexts for the same parallel threads, and
// different threads with the same code segment, just use tagName to tag the
// run-time position.
// in Sparse, sendParallel threads can not only run in the stage of push&pull
// with the same thread group, but also run in the stage of pull&push with a
// different thread group; the tag is used to distinguish the different
// run-time barrier positions.
// trainerId in REGISTER_BARRIER_TIMER_SERVER is used to retrieve the lowest
// trainer nodes.
//
// end barrier: looks up (or creates) the BARRIER_END stat named
// statName + tag in `set` and feeds it the current gettimeofday() value.
// Does nothing unless numConnThreads > 2.
// NOTE(review): the expansion already ends with ';' after while (0), so an
// invocation followed by its own ';' yields an extra empty statement, which
// breaks use as the body of an unbraced if/else.
#define __REGISTER_BARRIER_TIMER_SERVER( \
set, statName, numConnThreads, trainerId, ...) \
do { \
if (numConnThreads > 2) { \
std::string internalName = \
std::string(statName) + std::string(__VA_ARGS__); \
BarrierStatPtr __stat = \
(set).getStat(numConnThreads, internalName, BARRIER_END); \
struct timeval cur; \
gettimeofday(&cur, nullptr); \
__stat->updateStat(cur, trainerId); \
} \
} while (0);
// end barrier with user-defined timer: same as the end-barrier macro, but
// the time point `cur` is supplied by the caller instead of being sampled
// with gettimeofday(). Does nothing unless numConnThreads > 2.
#define __REGISTER_BARRIER_TIMER_SERVER_SET( \
set, statName, numConnThreads, trainerId, cur, ...) \
do { \
if (numConnThreads > 2) { \
std::string internalName = \
std::string(statName) + std::string(__VA_ARGS__); \
BarrierStatPtr __stat = \
(set).getStat(numConnThreads, internalName, BARRIER_END); \
__stat->updateStat(cur, trainerId); \
} \
} while (0);
// delta barrier: looks up (or creates) the BARRIER_DELTA stat named
// statName + tag in `set` and feeds it the caller-supplied `delta`.
// Does nothing unless numConnThreads > 2.
#define __REGISTER_BARRIER_DELTA_SERVER_SET( \
set, statName, numConnThreads, trainerId, delta, ...) \
do { \
if (numConnThreads > 2) { \
std::string internalName = \
std::string(statName) + std::string(__VA_ARGS__); \
BarrierStatPtr __stat = \
(set).getStat(numConnThreads, internalName, BARRIER_DELTA); \
__stat->updateStat(delta, trainerId); \
} \
} while (0);
// check end barrier: fails (via PCHECK) with "<name>: invalid barrier data"
// when checkPassBarrier() of the BARRIER_END stat named statName + tag
// returns false. Unlike the REGISTER macros there is no numConnThreads > 2
// guard here.
#define __CHECK_BARRIER_TIMER(set, statName, numConnThreads, ...) \
do { \
std::string internalName = \
std::string(statName) + std::string(__VA_ARGS__); \
BarrierStatPtr __stat = \
(set).getStat(numConnThreads, internalName, BARRIER_END); \
PCHECK(__stat->checkPassBarrier()) << internalName \
<< ": invalid barrier data"; \
} while (0);
/*
* Note:
* with the sync-sgd algorithm in cluster mode, lots of synchronization
* actions exist at the pserver end. these synchronization actions have an
* impact on the efficiency of parameter exchange. the synchronization
* (barrier) GAP is composed of lots of factors, such as the forwardBackward
* variance and network fluctuation. we try to have a quantitative analysis
* of these factors, so we design lots of barrier timers to capture this
* performance. these barriers can also be placed at implicit barrier
* positions.
*
* example:
* in the sync-sgd algorithm, each parameter server waits for all gradients
* from all trainers, thus an explicit barrier point exists before doing
* optimization. the barrier timer located before that point can sense the
* barrier condition.
*
*/
// try to capture which trainer is the slowest node in sync-sgd at pserver.
// Always forwards to the end-barrier timer; it is defined outside the
// PADDLE_DISABLE_TIMER conditional that follows.
#define REGISTER_SLOW_NODES_PROBE( \
set, statName, numConnThreads, trainerId, ...) \
__REGISTER_BARRIER_TIMER_SERVER( \
(set), statName, numConnThreads, trainerId, __VA_ARGS__)
// try to check whether all threads or trainers have passed the barrier, for
// data accuracy. Thin public wrapper around __CHECK_BARRIER_TIMER.
#define CHECK_BARRIER_TIMER(set, statName, numConnThreads, ...) \
__CHECK_BARRIER_TIMER((set), statName, numConnThreads, __VA_ARGS__)
// With PADDLE_DISABLE_TIMER defined the three REGISTER_BARRIER_* macros
// expand to nothing; otherwise they forward to their __-prefixed
// implementations.
#ifdef PADDLE_DISABLE_TIMER
#define REGISTER_BARRIER_TIMER_SERVER( \
set, statName, numConnThreads, trainerId, ...)
#define REGISTER_BARRIER_TIMER_SERVER_SET( \
set, statName, numConnThreads, trainerId, cur, ...)
#define REGISTER_BARRIER_DELTA_SERVER_SET( \
set, statName, numConnThreads, trainerId, cur, ...)
#else
/*
* sensing barrier time distribution for all parallelization threads.
* it provides the low-level API for the slow node check
* (REGISTER_SLOW_NODES_PROBE)
*/
#define REGISTER_BARRIER_TIMER_SERVER( \
set, statName, numConnThreads, trainerId, ...) \
__REGISTER_BARRIER_TIMER_SERVER( \
(set), statName, numConnThreads, trainerId, __VA_ARGS__)
/*
* sensing barrier time distribution for all parallelization threads,
* but the time point for barrier performance is set by the user.
* eg, with this api you can get an implicit barrier point such as the
* distribution of the beginning time for receiving data.
*/
#define REGISTER_BARRIER_TIMER_SERVER_SET( \
set, statName, numConnThreads, trainerId, cur, ...) \
__REGISTER_BARRIER_TIMER_SERVER_SET( \
(set), statName, numConnThreads, trainerId, cur, __VA_ARGS__)
// try to capture the time delta from all trainers, such as forwardBackward
// time, which implies computation fluctuation
#define REGISTER_BARRIER_DELTA_SERVER_SET( \
set, statName, numConnThreads, trainerId, delta, ...) \
__REGISTER_BARRIER_DELTA_SERVER_SET( \
(set), statName, numConnThreads, trainerId, delta, __VA_ARGS__)
#endif // PADDLE_DISABLE_TIMER
} // namespace paddle
......@@ -97,34 +97,6 @@ std::ostream& operator<<(std::ostream& outPut, const Stat& stat) {
return outPut;
}
/*
 * Returns the barrier stat registered under `name`, creating it on first use.
 *
 * The lookup is first attempted under the read lock. On a miss the exclusive
 * lock is taken and the lookup repeated, because another thread may have
 * inserted the entry in between.
 *
 * numConnThreads and bType are only used when the stat has to be created.
 * An unknown bType is fatal: previously it stored a null pointer in the map
 * and returned it, which the caller would then dereference.
 */
BarrierStatPtr StatSet::getStat(uint16_t numConnThreads,
                                const std::string& name,
                                BarrierStatType bType) {
  {
    ReadLockGuard guard(lock_);
    auto it = barrierStatSet_.find(name);
    if (it != barrierStatSet_.end()) {
      return it->second;
    }
  }

  std::lock_guard<RWLock> guard(lock_);
  // test again with lock_guard
  auto it = barrierStatSet_.find(name);
  if (it != barrierStatSet_.end()) {
    return it->second;
  }

  BarrierStatPtr stat;
  if (bType == BARRIER_END) {
    stat = std::make_shared<BarrierEndStat>(numConnThreads, name);
  } else if (bType == BARRIER_DELTA) {
    stat = std::make_shared<BarrierDeltaStat>(numConnThreads, name);
  } else {
    LOG(FATAL) << "unknown barrier stat type " << static_cast<int>(bType)
               << " for " << name;
  }
  auto ret = barrierStatSet_.insert(std::make_pair(name, stat));
  return ret.first->second;
}
void StatSet::printSegTimerStatus() {
ReadLockGuard guard(lock_);
LOG(INFO) << std::setiosflags(std::ios::left) << std::setfill(' ')
......@@ -135,46 +107,20 @@ void StatSet::printSegTimerStatus() {
}
}
void StatSet::printBarrierTimerStatus() {
ReadLockGuard guard(lock_);
if (barrierStatSet_.empty()) {
return;
}
// control barrierAbstact in runtime, so enable compliation
LOG(INFO) << std::setiosflags(std::ios::left) << std::setfill(' ')
<< "======= BarrierStatSet status ======" << std::endl;
for (auto& stat : barrierStatSet_) {
LOG(INFO) << std::setiosflags(std::ios::left) << std::setfill(' ')
<< *(stat.second);
}
}
// Logs the segment timers (unless PADDLE_DISABLE_TIMER is defined), then the
// barrier timers, then a separator line.
void StatSet::printAllStatus() {
#ifndef PADDLE_DISABLE_TIMER
printSegTimerStatus();
#endif
printBarrierTimerStatus();
LOG(INFO) << std::setiosflags(std::ios::left)
<< "--------------------------------------------------"
<< std::endl;
}
// Logs the single stat registered under `name`, holding the read lock.
// The CHECK fails when no stat with that name exists in statSet_.
void StatSet::printStatus(const std::string& name) {
ReadLockGuard guard(lock_);
auto iter = statSet_.find(name);
CHECK(iter != statSet_.end()) << name << " is not registed in " << name_;
LOG(INFO) << *(iter->second);
}
// Resets every registered stat while holding the read lock. clearRawData is
// forwarded to the barrier stats only; the plain stats take no argument.
void StatSet::reset(bool clearRawData) {
  ReadLockGuard guard(lock_);
  for (auto& entry : statSet_) {
    auto& plainStat = entry.second;
    plainStat->reset();
  }
  // reset barrierStat
  for (auto& entry : barrierStatSet_) {
    auto& barrierStat = entry.second;
    barrierStat->reset(clearRawData);
  }
}
void StatSet::setThreadInfo(const std::string& name, bool flag) {
......@@ -184,13 +130,6 @@ void StatSet::setThreadInfo(const std::string& name, bool flag) {
iter->second->setThreadInfo(flag);
}
// Removes the stat registered under `name` from statSet_ while holding the
// exclusive lock. The CHECK fails when no stat with that name exists.
void StatSet::deleteStat(const std::string& name) {
std::lock_guard<RWLock> guard(lock_);
auto iter = statSet_.find(name);
CHECK(iter != statSet_.end()) << name << " is not registed in " << name_;
statSet_.erase(iter);
}
StatInfo::~StatInfo() {
if (stat_) {
std::lock_guard<std::mutex> guard(stat_->lock_);
......
......@@ -23,7 +23,6 @@ limitations under the License. */
#include <string>
#include <unordered_map>
#include "BarrierStat.h"
#include "Locks.h"
#include "Logging.h"
#include "ThreadLocal.h"
......@@ -60,12 +59,6 @@ public:
class Stat;
typedef std::shared_ptr<Stat> StatPtr;
typedef std::shared_ptr<BarrierStatBase> BarrierStatPtr;
enum BarrierStatType {
BARRIER_END = 0,
BARRIER_DELTA = 1,
};
class StatSet {
public:
......@@ -74,11 +67,8 @@ public:
// print to LOG(INFO)
void printSegTimerStatus();
void printBarrierTimerStatus();
void printAllStatus();
void printStatus(const std::string& name);
StatPtr getStat(const std::string& name) {
{
ReadLockGuard guard(lock_);
......@@ -93,12 +83,6 @@ public:
return ret.first->second;
}
BarrierStatPtr getStat(uint16_t numConnThreads,
const std::string& name,
BarrierStatType bType);
void deleteStat(const std::string& name);
// true for showing stats for each thread
// false for showing stats aggragated over threads
void setThreadInfo(const std::string& name, bool flag);
......@@ -120,7 +104,6 @@ public:
private:
std::unordered_map<std::string, StatPtr> statSet_;
std::unordered_map<std::string, BarrierStatPtr> barrierStatSet_;
const std::string name_;
RWLock lock_;
};
......
......@@ -51,7 +51,7 @@ template <class T>
class ThreadLocal {
public:
ThreadLocal() {
PCHECK(pthread_key_create(&threadSpecificKey_, dataDestructor) == 0);
CHECK(pthread_key_create(&threadSpecificKey_, dataDestructor) == 0);
}
~ThreadLocal() { pthread_key_delete(threadSpecificKey_); }
......@@ -65,7 +65,7 @@ public:
if (!p && createLocal) {
p = new T();
int ret = pthread_setspecific(threadSpecificKey_, p);
PCHECK(ret == 0);
CHECK(ret == 0);
}
return p;
}
......@@ -79,7 +79,7 @@ public:
if (T* q = get(false)) {
dataDestructor(q);
}
PCHECK(pthread_setspecific(threadSpecificKey_, p) == 0);
CHECK(pthread_setspecific(threadSpecificKey_, p) == 0);
}
/**
......@@ -112,7 +112,7 @@ private:
template <class T>
class ThreadLocalD {
public:
ThreadLocalD() { PCHECK(pthread_key_create(&threadSpecificKey_, NULL) == 0); }
ThreadLocalD() { CHECK(pthread_key_create(&threadSpecificKey_, NULL) == 0); }
~ThreadLocalD() {
pthread_key_delete(threadSpecificKey_);
for (auto t : threadMap_) {
......@@ -127,7 +127,7 @@ public:
T* p = (T*)pthread_getspecific(threadSpecificKey_);
if (!p) {
p = new T();
PCHECK(pthread_setspecific(threadSpecificKey_, p) == 0);
CHECK(pthread_setspecific(threadSpecificKey_, p) == 0);
updateMap(p);
}
return p;
......@@ -141,7 +141,7 @@ public:
if (T* q = (T*)pthread_getspecific(threadSpecificKey_)) {
dataDestructor(q);
}
PCHECK(pthread_setspecific(threadSpecificKey_, p) == 0);
CHECK(pthread_setspecific(threadSpecificKey_, p) == 0);
updateMap(p);
}
......
......@@ -266,6 +266,29 @@ message PadConfig {
repeated uint32 pad_w = 4;
}
// Settings of a 'multibox_loss' layer. The Python config parser
// (MultiBoxLossLayer) stores them on the layer's first input.
message MultiBoxLossConfig {
// the config parser requires num_classes > background_id
required uint32 num_classes = 1;
required float overlap_threshold = 2;
required float neg_pos_ratio = 3;
required float neg_overlap = 4;
required uint32 background_id = 5;
// the config parser requires the layer to have input_num * 2 + 2 inputs
required uint32 input_num = 6;
optional uint32 height = 7 [default = 1];
optional uint32 width = 8 [default = 1];
}
// Settings of a 'detection_output' layer. The Python config parser
// (DetectionOutputLayer) stores them on the layer's first input.
message DetectionOutputConfig {
// the config parser requires num_classes > background_id
required uint32 num_classes = 1;
required float nms_threshold = 2;
required uint32 nms_top_k = 3;
required uint32 background_id = 4;
// the config parser requires the layer to have input_num * 2 + 1 inputs
required uint32 input_num = 5;
required uint32 keep_top_k = 6;
required float confidence_threshold = 7;
optional uint32 height = 8 [default = 1];
optional uint32 width = 9 [default = 1];
}
message LayerInputConfig {
required string input_layer_name = 1;
optional string input_parameter_name = 2;
......@@ -284,6 +307,8 @@ message LayerInputConfig {
optional PriorBoxConfig priorbox_conf = 13;
optional PadConfig pad_conf = 14;
optional RowConvConfig row_conv_conf = 15;
optional MultiBoxLossConfig multibox_loss_conf = 16;
optional DetectionOutputConfig detection_output_conf = 17;
}
message LayerConfig {
......
......@@ -1674,6 +1674,52 @@ class PriorBoxLayer(LayerBase):
self.config.size = size
@config_layer('multibox_loss')
class MultiBoxLossLayer(LayerBase):
    # Config-parser entry for the 'multibox_loss' layer type. It validates
    # the number of inputs and the class/background ids, then stores the
    # settings on the multibox_loss_conf of the first input. The layer size
    # is fixed to 1.
    def __init__(self, name, inputs, input_num, num_classes, overlap_threshold,
                 neg_pos_ratio, neg_overlap, background_id, **xargs):
        super(MultiBoxLossLayer, self).__init__(name, 'multibox_loss', 0,
                                                inputs)
        expected_inputs = input_num * 2 + 2
        config_assert(
            len(inputs) == expected_inputs,
            'MultiBoxLossLayer does not have enough inputs')
        config_assert(num_classes > background_id,
                      'Classes number must greater than background ID')

        loss_conf = self.config.inputs[0].multibox_loss_conf
        loss_conf.num_classes = num_classes
        loss_conf.overlap_threshold = overlap_threshold
        loss_conf.neg_pos_ratio = neg_pos_ratio
        loss_conf.neg_overlap = neg_overlap
        loss_conf.background_id = background_id
        loss_conf.input_num = input_num
        self.config.size = 1
@config_layer('detection_output')
class DetectionOutputLayer(LayerBase):
    # Config-parser entry for the 'detection_output' layer type. It validates
    # the number of inputs and the class/background ids, then stores the
    # settings on the detection_output_conf of the first input. The layer
    # size is taken from the `size` argument.
    def __init__(self, name, inputs, size, input_num, num_classes,
                 nms_threshold, nms_top_k, keep_top_k, confidence_threshold,
                 background_id, **xargs):
        super(DetectionOutputLayer, self).__init__(name, 'detection_output', 0,
                                                   inputs)
        expected_inputs = input_num * 2 + 1
        config_assert(
            len(inputs) == expected_inputs,
            'DetectionOutputLayer does not have enough inputs')
        config_assert(num_classes > background_id,
                      'Classes number must greater than background ID')

        detection_conf = self.config.inputs[0].detection_output_conf
        detection_conf.num_classes = num_classes
        detection_conf.nms_threshold = nms_threshold
        detection_conf.nms_top_k = nms_top_k
        detection_conf.keep_top_k = keep_top_k
        detection_conf.confidence_threshold = confidence_threshold
        detection_conf.background_id = background_id
        detection_conf.input_num = input_num
        self.config.size = size
@config_layer('data')
class DataLayer(LayerBase):
def __init__(self, name, size, height=None, width=None, device=None):
......@@ -2420,10 +2466,14 @@ class MaxLayer(LayerBase):
trans_type='non-seq',
bias=False,
output_max_index=None,
stride=-1,
**xargs):
super(MaxLayer, self).__init__(name, 'max', 0, inputs=inputs, **xargs)
config_assert(len(self.inputs) == 1, 'MaxLayer must have 1 input')
if trans_type == 'seq':
config_assert(stride == -1, 'subseq does not support stride window')
self.config.trans_type = trans_type
self.config.seq_pool_stride = stride
for input_index in xrange(len(self.inputs)):
input_layer = self.get_input_layer(input_index)
self.set_layer_size(input_layer.size)
......@@ -2685,11 +2735,15 @@ class AverageLayer(LayerBase):
average_strategy='average',
trans_type='non-seq',
bias=False,
stride=-1,
**xargs):
super(AverageLayer, self).__init__(
name, 'average', 0, inputs=inputs, **xargs)
self.config.average_strategy = average_strategy
if trans_type == 'seq':
config_assert(stride == -1, 'subseq does not support stride window')
self.config.trans_type = trans_type
self.config.seq_pool_stride = stride
config_assert(len(inputs) == 1, 'AverageLayer must have 1 input')
for input_index in xrange(len(self.inputs)):
input_layer = self.get_input_layer(input_index)
......
......@@ -6,6 +6,6 @@ img_layers img_trans_layers util_layers simple_rnn_layers unused_layers test_cos
test_rnn_group shared_fc shared_lstm shared_gru test_cost_layers_with_weight
test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops
test_seq_concat_reshape test_pad test_smooth_l1 test_multiplex_layer
test_prelu_layer test_row_conv)
test_prelu_layer test_row_conv test_detection_output_layer test_multibox_loss_layer)
export whole_configs=(test_split_datasource)
......@@ -14,6 +14,14 @@ for pt in POOL_TYPE:
for al in AGG_LEVEL:
opts.append(pooling_layer(input=din, agg_level=al, pooling_type=pt()))
for pt in POOL_TYPE:
opts.append(
pooling_layer(
input=din,
agg_level=AggregateLevel.TO_NO_SEQUENCE,
pooling_type=pt(),
stride=5))
opts.append(
pooling_layer(
input=din, pooling_type=MaxPooling(output_max_index=True)))
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册