diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index efb4dcb2dfbc63bb6905961b054cdef860cf4573..980a97a07c996eca2e8c126a6ad5ab7f340fa1e5 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -22,9 +22,11 @@ hooks: - id: clang-formater - repo: https://github.com/PaddlePaddle/pre-commit-golang - sha: 16398aeccf263adaf53b2495eed0406347d76281 + sha: 8337620115c25ff8333f1b1a493bd031049bd7c0 hooks: - - id: go-fmt - types: [go] - - id: gometalinter - types: [go] + - id: go-fmt + types: + - go + - id: gometalinter + types: + - go diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index 3e6cedbb0d718cfd4454f95dedf7e02a24f2981b..f7483f6be9169eb58f0148cd3a956a8c881e1fe3 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -7,17 +7,8 @@ INCLUDE_DIRECTORIES(${EIGEN_SOURCE_DIR}/src/extern_eigen3) ExternalProject_Add( extern_eigen3 ${EXTERNAL_PROJECT_LOG_ARGS} - # for latest version, please get from official website - # URL "https://bitbucket.org/eigen/eigen/get/3.3.4.tar.gz" - # URL_MD5 "1a47e78efe365a97de0c022d127607c3" - - # for no-ssl http support, please get from bazel's mirror - # URL "http://mirror.bazel.build/bitbucket.org/eigen/eigen/get/f3a22f35b044.tar.gz" - # URL_MD5 "4645c66075982da6fa0bcf6b20f3e8f7" - - # get from github mirror GIT_REPOSITORY "https://github.com/RLovelett/eigen.git" - GIT_TAG "a46d2e7337c4656f00abe54a8115f6d76153a048" + GIT_TAG "master" PREFIX ${EIGEN_SOURCE_DIR} UPDATE_COMMAND "" CONFIGURE_COMMAND "" diff --git a/cmake/flags.cmake b/cmake/flags.cmake index 34fd348893058980964d723490d9cc220a157b5a..ef31c252038ce18655913c0f41343fe6dc7dbb86 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -153,7 +153,7 @@ set(CUDA_PROPAGATE_HOST_FLAGS OFF) # Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc. # So, don't set these flags here. -LIST(APPEND CUDA_NVCC_FLAGS -std=c++11) +LIST(APPEND CUDA_NVCC_FLAGS -std=c++11 --default-stream per-thread) LIST(APPEND CUDA_NVCC_FLAGS --use_fast_math) if(CMAKE_BUILD_TYPE STREQUAL "Debug") diff --git a/go/cmd/master/master.go b/go/cmd/master/master.go index 287da694915ca383dc29e6d33201dc701cb7de87..739c4c01e02b10f46c36b997f8c4700150da2a26 100644 --- a/go/cmd/master/master.go +++ b/go/cmd/master/master.go @@ -19,6 +19,8 @@ import ( "net" "net/http" "net/rpc" + "os" + "os/signal" "strconv" "strings" "time" @@ -68,6 +70,20 @@ func main() { store = &master.InMemStore{} } + shutdown := func() { + log.Infoln("shutting down gracefully") + err := store.Shutdown() + if err != nil { + log.Errorln(err) + } + } + + // Guaranteed to run even panic happens. 
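+	// (A deferred call runs during panic unwinding as well as on a normal
+	// return, so the store is shut down on both paths. Note it does not
+	// run if the process exits via log.Fatal/os.Exit below.)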
+ defer shutdown() + + c := make(chan os.Signal, 1) + signal.Notify(c, os.Interrupt) + s, err := master.NewService(store, *chunkPerTask, *taskTimeoutDur, *taskTimeoutMax) if err != nil { log.Fatal(err) @@ -84,8 +100,12 @@ func main() { log.Fatal(err) } - err = http.Serve(l, nil) - if err != nil { - log.Fatal(err) - } + go func() { + err = http.Serve(l, nil) + if err != nil { + log.Fatal(err) + } + }() + + <-c } diff --git a/go/cmd/pserver/pserver.go b/go/cmd/pserver/pserver.go index aa81d0432b1d4f411644e0a5b703d7ea74d144b7..f9cd8f87e8f2e715c87834ee08482be0f511f681 100644 --- a/go/cmd/pserver/pserver.go +++ b/go/cmd/pserver/pserver.go @@ -18,6 +18,8 @@ import ( "net" "net/http" "net/rpc" + "os" + "os/signal" "strconv" "time" @@ -33,7 +35,8 @@ func main() { index := flag.Int("index", -1, "index of this pserver, should be larger or equal than 0") etcdEndpoint := flag.String("etcd-endpoint", "http://127.0.0.1:2379", "comma separated endpoint string for pserver to connect to etcd") - etcdTimeout := flag.Duration("etcd-timeout", 5*time.Second, "timeout for etcd calls") + dialTimeout := flag.Duration("dial-timeout", 5*time.Second, "dial timeout") + etcdTTL := flag.Int("etcd-ttl", 5, "etcd time to live in seconds") numPservers := flag.Int("num-pservers", 1, "total pserver count in a training job") checkpointPath := flag.String("checkpoint-path", "/checkpoints/", "save checkpoint path") checkpointInterval := flag.Duration("checkpoint-interval", 600*time.Second, "save checkpoint per interval seconds") @@ -53,7 +56,7 @@ func main() { if *index >= 0 { idx = *index } else { - e = pserver.NewEtcdClient(*etcdEndpoint, *numPservers, *etcdTimeout) + e = pserver.NewEtcdClient(*etcdEndpoint, *numPservers, *dialTimeout, *etcdTTL) idx, err = e.Register(*port) candy.Must(err) @@ -67,6 +70,20 @@ func main() { } } + shutdown := func() { + log.Infoln("shutting down gracefully") + sErr := e.Shutdown() + if sErr != nil { + log.Errorln(sErr) + } + } + + // Guaranteed to run even panic happens. 
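+	// (Shutting down the etcd client closes its session, revoking the
+	// session lease so the keys this pserver registered expire promptly
+	// rather than waiting out the TTL.)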
+ defer shutdown() + + c := make(chan os.Signal, 1) + signal.Notify(c, os.Interrupt) + s, err := pserver.NewService(idx, *checkpointInterval, *checkpointPath, e, cp) candy.Must(err) @@ -77,7 +94,11 @@ func main() { l, err := net.Listen("tcp", ":"+strconv.Itoa(*port)) candy.Must(err) - log.Infof("start pserver at port %d", *port) - err = http.Serve(l, nil) - candy.Must(err) + go func() { + log.Infof("start pserver at port %d", *port) + err = http.Serve(l, nil) + candy.Must(err) + }() + + <-c } diff --git a/go/glide.lock b/go/glide.lock index f71ae643d68d29846611ec52d0ae7d67e4ced850..1f16abdf66422abcd0ab7987cab3499d02cf1b9c 100644 --- a/go/glide.lock +++ b/go/glide.lock @@ -1,15 +1,105 @@ -hash: a8faea3a363468a88917ddeb3b1c9ea36886fb2c622acbad42604fa9cb4d3855 -updated: 2017-07-11T10:04:40.786745417+08:00 +hash: 2a1c0eca5c07a130e3d224f9821f96cfa37a39bf6bce141c855bbc57ef569f1c +updated: 2017-07-29T07:34:48.722757905+08:00 imports: +- name: github.com/beorn7/perks + version: 4c0e84591b9aa9e6dcfdf3e020114cd81f89d5f9 + subpackages: + - quantile +- name: github.com/boltdb/bolt + version: 583e8937c61f1af6513608ccc75c97b6abdf4ff9 +- name: github.com/cockroachdb/cmux + version: 112f0506e7743d64a6eb8fedbcff13d9979bbf92 - name: github.com/coreos/etcd - version: cb2a496c4ddd1c87a9f280e116649b599999ec79 + version: c31bec0f29facff13f7c3e3d948e55dd6689ed42 subpackages: + - alarm + - auth - auth/authpb + - client - clientv3 - clientv3/concurrency + - compactor + - discovery + - embed + - error + - etcdserver + - etcdserver/api + - etcdserver/api/v2http + - etcdserver/api/v2http/httptypes + - etcdserver/api/v3client + - etcdserver/api/v3election + - etcdserver/api/v3election/v3electionpb + - etcdserver/api/v3election/v3electionpb/gw + - etcdserver/api/v3lock + - etcdserver/api/v3lock/v3lockpb + - etcdserver/api/v3lock/v3lockpb/gw + - etcdserver/api/v3rpc - etcdserver/api/v3rpc/rpctypes + - etcdserver/auth - etcdserver/etcdserverpb + - etcdserver/etcdserverpb/gw + - etcdserver/membership + - etcdserver/stats + - lease + - lease/leasehttp + - lease/leasepb + - mvcc + - mvcc/backend - mvcc/mvccpb + - pkg/adt + - pkg/contention + - pkg/cors + - pkg/cpuutil + - pkg/crc + - pkg/debugutil + - pkg/fileutil + - pkg/httputil + - pkg/idutil + - pkg/ioutil + - pkg/logutil + - pkg/monotime + - pkg/netutil + - pkg/pathutil + - pkg/pbutil + - pkg/runtime + - pkg/schedule + - pkg/srv + - pkg/tlsutil + - pkg/transport + - pkg/types + - pkg/wait + - proxy/grpcproxy/adapter + - raft + - raft/raftpb + - rafthttp + - snap + - snap/snappb + - store + - version + - wal + - wal/walpb +- name: github.com/coreos/go-semver + version: 8ab6407b697782a06568d4b7f1db25550ec2e4c6 + subpackages: + - semver +- name: github.com/coreos/go-systemd + version: 48702e0da86bd25e76cfef347e2adeb434a0d0a6 + subpackages: + - daemon + - journal + - util +- name: github.com/coreos/pkg + version: 3ac0863d7acf3bc44daf49afef8919af12f704ef + subpackages: + - capnslog +- name: github.com/dgrijalva/jwt-go + version: d2709f9f1f31ebcda9651b03077758c1f3a0018c +- name: github.com/ghodss/yaml + version: 0ca9ea5df5451ffdf184b4428c902747c2c11cd7 +- name: github.com/gogo/protobuf + version: 909568be09de550ed094403c2bf8a261b5bb730a + subpackages: + - proto - name: github.com/golang/protobuf version: 4bd1920723d7b7c925de087aa32e2187708897f7 subpackages: @@ -17,14 +107,61 @@ imports: - proto - name: github.com/golang/snappy version: 553a641470496b2327abcac10b36396bd98e45c9 +- name: github.com/google/btree + version: 925471ac9e2131377a91e1595defec898166fe49 +- name: 
github.com/grpc-ecosystem/go-grpc-prometheus + version: 6b7015e65d366bf3f19b2b2a000a831940f0f7e0 +- name: github.com/grpc-ecosystem/grpc-gateway + version: 18d159699f2e83fc5bb9ef2f79465ca3f3122676 + subpackages: + - runtime + - runtime/internal + - utilities +- name: github.com/jonboulle/clockwork + version: 2eee05ed794112d45db504eb05aa693efd2b8b09 +- name: github.com/matttproud/golang_protobuf_extensions + version: c12348ce28de40eed0136aa2b644d0ee0650e56c + subpackages: + - pbutil - name: github.com/namsral/flag version: 71ceffbeb0ba60fccc853971bb3ed4d7d90bfd04 - name: github.com/PaddlePaddle/recordio - version: edfb82af0739c84f241c87390ec5649c7b28c129 + version: 0432dee9fd4b24fb6840fb20a8c055b0c933fb81 +- name: github.com/prometheus/client_golang + version: c5b7fccd204277076155f10851dad72b76a49317 + subpackages: + - prometheus +- name: github.com/prometheus/client_model + version: 6f3806018612930941127f2a7c6c453ba2c527d2 + subpackages: + - go +- name: github.com/prometheus/common + version: 49fee292b27bfff7f354ee0f64e1bc4850462edf + subpackages: + - expfmt + - internal/bitbucket.org/ww/goautoneg + - model +- name: github.com/prometheus/procfs + version: a1dba9ce8baed984a2495b658c82687f8157b98f + subpackages: + - xfs - name: github.com/sirupsen/logrus - version: 7f976d3a76720c4c27af2ba716b85d2e0a7e38b1 + version: a3f95b5c423586578a4e099b11a46c2479628cac - name: github.com/topicai/candy version: 1b9030d056fa9f8c4b1f9c91b52fe4b8ab4cd8cc +- name: github.com/ugorji/go + version: ded73eae5db7e7a0ef6f55aace87a2873c5d2b74 + subpackages: + - codec +- name: github.com/xiang90/probing + version: 07dd2e8dfe18522e9c447ba95f2fe95262f63bb2 +- name: golang.org/x/crypto + version: 1351f936d976c60a0a48d728281922cf63eafb8d + repo: https://github.com/golang/crypto.git + vcs: git + subpackages: + - bcrypt + - blowfish - name: golang.org/x/net version: c8c74377599bd978aee1cf3b9b63a8634051cec2 subpackages: @@ -36,11 +173,15 @@ imports: - lex/httplex - trace - name: golang.org/x/sys - version: abf9c25f54453410d0c6668e519582a9e1115027 + version: 0f826bdd13b500be0f1d4004938ad978fcc6031e + repo: https://github.com/golang/sys.git + vcs: git subpackages: - unix - name: golang.org/x/text - version: cfdf022e86b4ecfb646e1efbd7db175dd623a8fa + version: 836efe42bb4aa16aaa17b9c155d8813d336ed720 + repo: https://github.com/golang/text.git + vcs: git subpackages: - secure/bidirule - transform @@ -60,4 +201,23 @@ imports: - stats - tap - transport -testImports: [] +- name: gopkg.in/yaml.v2 + version: cd8b52f8269e0feb286dfeef29f8fe4d5b397e0b +testImports: +- name: github.com/davecgh/go-spew + version: 04cdfd42973bb9c8589fd6a731800cf222fde1a9 + subpackages: + - spew +- name: github.com/docker/docker + version: b6d164e6c46d8115b146e4c3ac93784e9ef8b49e + subpackages: + - pkg/ioutils + - pkg/longpath +- name: github.com/pmezard/go-difflib + version: d8ed2627bdf02c080bf22230dbb337003b7aba2d + subpackages: + - difflib +- name: github.com/stretchr/testify + version: 05e8a0eda380579888eb53c394909df027f06991 + subpackages: + - assert diff --git a/go/glide.yaml b/go/glide.yaml index ab472c7cda9755d0399bb8376b16589be8b53057..bc23fa6ebf2c3db61e2d63e5f7e7ddcb595dfed0 100644 --- a/go/glide.yaml +++ b/go/glide.yaml @@ -6,8 +6,19 @@ import: subpackages: - clientv3 - clientv3/concurrency + - embed + - etcdserver - package: github.com/namsral/flag version: ^1.7.4-pre - package: github.com/sirupsen/logrus version: ^1.0.0 - package: github.com/topicai/candy +- package: golang.org/x/crypto + vcs: git + repo: https://github.com/golang/crypto.git +- 
package: golang.org/x/sys + vcs: git + repo: https://github.com/golang/sys.git +- package: golang.org/x/text + vcs: git + repo: https://github.com/golang/text.git diff --git a/go/master/c/client.go b/go/master/c/client.go index a2b18e4b474e039e661a3ae130379b41e76f29bd..b5759c30b1d7f7dc33e162e959c7de165e02e1da 100644 --- a/go/master/c/client.go +++ b/go/master/c/client.go @@ -18,7 +18,6 @@ package main #include #include #include - #define PADDLE_MASTER_OK 0 #define PADDLE_MASTER_ERROR -1 @@ -101,6 +100,12 @@ func paddle_release_master_client(client C.paddle_master_client) { remove(client) } +//export paddle_start_get_records +func paddle_start_get_records(client C.paddle_master_client, pass C.int) { + c := get(client) + c.StartGetRecords(int(pass)) +} + //export paddle_set_dataset func paddle_set_dataset(client C.paddle_master_client, path **C.char, size C.int) C.int { c := get(client) @@ -121,15 +126,19 @@ func paddle_set_dataset(client C.paddle_master_client, path **C.char, size C.int // paddle_next_record gets the nexts training record. // -// returns number of bytes of the records if success, -1 if failed. +// returns number of bytes of the records if success, -1 if failed, -2 if pass end. // //export paddle_next_record func paddle_next_record(client C.paddle_master_client, record **C.uchar) C.int { c := get(client) r, err := c.NextRecord() if err != nil { - // Error - // TODO: return the type of error? + // NOTE: use errors to indicate pass ends + if err.Error() == master.ErrAllTaskFailed.Error() || + err.Error() == master.ErrNoMoreAvailable.Error() || + err.Error() == master.ErrPassBefore.Error() { + return -2 + } *record = (*C.uchar)(nil) return -1 } diff --git a/go/master/client.go b/go/master/client.go index bbf3768d96ead1911508486410d2402ea0ac8b12..62801b9b7fe85fe27147b12160f48d988623d547 100644 --- a/go/master/client.go +++ b/go/master/client.go @@ -16,7 +16,6 @@ package master import ( "os" - "sync" "time" "github.com/PaddlePaddle/Paddle/go/connection" @@ -27,9 +26,9 @@ import ( // Client is the client of the master server. type Client struct { - conn *connection.Conn - ch chan record - initChOnce sync.Once + conn *connection.Conn + ch chan record + bufSize int } type record struct { @@ -46,11 +45,7 @@ func WithBuffer(bufSize int) func(*Client) error { if bufSize <= 0 { return nil } - - c.initChOnce.Do(func() { - c.ch = make(chan record, bufSize) - go c.getRecords() - }) + c.bufSize = bufSize return nil } } @@ -104,25 +99,41 @@ func NewClient(opts ...func(*Client) error) (*Client, error) { if err != nil { return nil, err } - } - + c.ch = make(chan record, c.bufSize) + // FIXME: connection is created asyncrosly in monitorMaster go routine, + // ensure the connection is ready for use before calling c.addClient. 
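+	// (A sketch of a sturdier fix, assuming monitorMaster could be
+	// extended to signal readiness: close a `ready` channel once the
+	// first connection is established and block on <-ready here instead
+	// of sleeping a fixed second, which can still race on slow networks.)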
+	time.Sleep(time.Second)
 	return c, nil
 }
 
-func (c *Client) getRecords() {
+// StartGetRecords must be called at the beginning of each pass.
+func (c *Client) StartGetRecords(passID int) {
+	go c.getRecords(passID)
+}
+
+func (c *Client) getRecords(passID int) {
 	for {
-		t, err := c.getTask()
+		t, err := c.getTask(passID)
 		if err != nil {
-			log.Errorf("Get task failed, sleep 3 seconds and continue, %s", err)
-			time.Sleep(3 * time.Second)
-			continue
+			if err.Error() == ErrPassBefore.Error() ||
+				err.Error() == ErrNoMoreAvailable.Error() ||
+				err.Error() == ErrAllTaskFailed.Error() {
+				c.ch <- record{nil, err}
+				break
+			}
+			if err.Error() == ErrPassAfter.Error() {
+				// wait until the last pass finishes
+				time.Sleep(time.Second * 3)
+				continue
+			}
+			log.Errorf("getTask error: %s", err)
 		}
 
 		for _, chunk := range t.Chunks {
-			f, err := os.Open(chunk.Path)
-			if err != nil {
-				log.Errorln(err)
+			f, e := os.Open(chunk.Path)
+			if e != nil {
+				log.Errorln(e)
 				continue
 			}
@@ -178,18 +189,21 @@ func (c *Client) monitorMaster(addrCh <-chan string) {
 		}
 	}
 }
 
-// SetDataset set dataset for the master server to dispatch.
+// SetDataset sets the dataset for the master server to dispatch.
+//
+// SetDataset can be called multiple times in one pass, but only the first
+// call will be honored.
 //
-// SetDataset can be call multiple times from different nodes. But
-// only the first call will be honored.
+// After all tasks are done, another call of SetDataset will start another pass.
 func (c *Client) SetDataset(globPaths []string) error {
-	return c.conn.Call("Service.SetDataset", globPaths, nil)
+	err := c.conn.Call("Service.SetDataset", globPaths, nil)
+	return err
 }
 
 // getTask gets a new task from the master server.
-func (c *Client) getTask() (Task, error) {
+func (c *Client) getTask(passID int) (Task, error) {
 	var t Task
-	err := c.conn.Call("Service.GetTask", 0, &t)
+	err := c.conn.Call("Service.GetTask", passID, &t)
 	return t, err
 }
 
@@ -208,12 +222,6 @@ func (c *Client) taskFailed(meta TaskMeta) error {
 // NextRecord will block until the next record is available. It is
 // thread-safe.
 func (c *Client) NextRecord() ([]byte, error) {
-	c.initChOnce.Do(func() {
-		// initialize with in case WithBuffer is not used.
- c.ch = make(chan record, 0) - go c.getRecords() - }) - r := <-c.ch return r.r, r.err } diff --git a/go/master/client_internal_test.go b/go/master/client_internal_test.go index ee305e2c80f54ebee2e5011ca7ff0cf5e0612f41..d5f3d79464655540a29eaa6395057aa5795c4615 100644 --- a/go/master/client_internal_test.go +++ b/go/master/client_internal_test.go @@ -54,22 +54,22 @@ func TestGetFinishTask(t *testing.T) { panic(err) } go func(l net.Listener) { - s, err := NewService(&InMemStore{}, chunkPerTask, time.Second, 1) - if err != nil { - panic(err) + s, sErr := NewService(&InMemStore{}, chunkPerTask, time.Second, 1) + if sErr != nil { + panic(sErr) } server := rpc.NewServer() - err = server.Register(s) - if err != nil { - panic(err) + sErr = server.Register(s) + if sErr != nil { + panic(sErr) } mux := http.NewServeMux() mux.Handle(rpc.DefaultRPCPath, server) - err = http.Serve(l, mux) - if err != nil { - panic(err) + sErr = http.Serve(l, mux) + if sErr != nil { + panic(sErr) } }(l) @@ -103,6 +103,7 @@ func TestGetFinishTask(t *testing.T) { ch := make(chan string, 1) ch <- addr go c.monitorMaster(ch) + err = c.SetDataset([]string{path}) if err != nil { panic(err) @@ -111,44 +112,47 @@ func TestGetFinishTask(t *testing.T) { checkOnePass := func(i int) { var tasks []Task for idx := 0; idx < totalTask; idx++ { - task, err := c.getTask() - if err != nil { - t.Fatalf("Error: %v, pass: %d\n", err, i) + task, cErr := c.getTask(i) + if cErr != nil && cErr.Error() != ErrNoMoreAvailable.Error() && cErr.Error() != ErrPassAfter.Error() { + t.Fatalf("error: %v, pass: %d\n", cErr, i) } tasks = append(tasks, task) } - _, err = c.getTask() - if err == nil { + // getting task before task finishes should return error + _, cErr := c.getTask(i) + if cErr == nil { t.Fatalf("Should get error, pass: %d\n", i) } - err = c.taskFinished(tasks[0].Meta.ID) - if err != nil { - t.Fatalf("Error: %v, pass: %d\n", err, i) + cErr = c.taskFinished(tasks[0].Meta.ID) + if cErr != nil { + t.Fatalf("Error: %v, pass: %d\n", cErr, i) } - - err = c.taskFailed(tasks[0].Meta) - if err != nil { - t.Fatalf("Error: %v, pass: %d\n", err, i) + // call taskFailed once won't put the task to failed queue, just ensure + // the call + cErr = c.taskFailed(tasks[0].Meta) + if cErr != nil { + t.Fatalf("Error: %v, pass: %d\n", cErr, i) } tasks = tasks[1:] - task, err := c.getTask() - if err != nil { - t.Fatal(err) + _, cErr = c.getTask(i) + if cErr != nil && cErr.Error() != ErrNoMoreAvailable.Error() && cErr.Error() != ErrPassAfter.Error() { + t.Fatalf("Should be ErrNoMoreAvailable or ErrPassAfter: %s", cErr) } - tasks = append(tasks, task) for _, task := range tasks { - err = c.taskFinished(task.Meta.ID) - if err != nil { - t.Fatalf("Error: %v, pass: %d\n", err, i) + cErr = c.taskFinished(task.Meta.ID) + if cErr != nil { + t.Fatal(cErr) } } } for i := 0; i < 10; i++ { + // init pass data + c.StartGetRecords(i) checkOnePass(i) } } diff --git a/go/master/client_test.go b/go/master/client_test.go index a3a434ae7e855c1cd2043d196435b42d2520f003..79b9cc844d1ff938915a622bf19a7d772682becf 100644 --- a/go/master/client_test.go +++ b/go/master/client_test.go @@ -20,8 +20,10 @@ import ( "net/http" "net/rpc" "os" + "runtime" "strconv" "strings" + "sync" "testing" "time" @@ -29,6 +31,18 @@ import ( "github.com/PaddlePaddle/recordio" ) +// tool function for testing output goroutine ids +func goid() int { + var buf [64]byte + n := runtime.Stack(buf[:], false) + idField := strings.Fields(strings.TrimPrefix(string(buf[:n]), "goroutine "))[0] + id, err := 
strconv.Atoi(idField) + if err != nil { + panic(fmt.Sprintf("cannot get goroutine id: %v", err)) + } + return id +} + func TestNextRecord(t *testing.T) { const ( path = "/tmp/master_client_TestFull" @@ -45,7 +59,7 @@ func TestNextRecord(t *testing.T) { panic(err) } go func(l net.Listener) { - s, err := master.NewService(&master.InMemStore{}, 10, time.Second, 1) + s, err := master.NewService(&master.InMemStore{}, 1, time.Second*60, 1) if err != nil { panic(err) } @@ -69,7 +83,7 @@ func TestNextRecord(t *testing.T) { panic(err) } - w := recordio.NewWriter(f, -1, -1) + w := recordio.NewWriter(f, 1, -1) for i := 0; i < total; i++ { _, err = w.Write([]byte{byte(i)}) if err != nil { @@ -87,32 +101,49 @@ func TestNextRecord(t *testing.T) { panic(err) } - c, err := master.NewClient(master.WithAddr(fmt.Sprintf(":%d", p)), master.WithBuffer(10)) - if err != nil { - panic(err) - } - - err = c.SetDataset([]string{path}) - if err != nil { - panic(err) - } - - for pass := 0; pass < 50; pass++ { - received := make(map[byte]bool) - for i := 0; i < total; i++ { - r, err := c.NextRecord() - if err != nil { - t.Fatal(pass, i, "Read error:", err) + // start several client to test task fetching + var wg sync.WaitGroup + for i := 0; i < 4; i++ { + wg.Add(1) + // test for multiple concurrent clients + go func() { + defer wg.Done() + // each go-routine needs a single client connection instance + c, e := master.NewClient(master.WithAddr(fmt.Sprintf(":%d", p)), master.WithBuffer(1)) + if e != nil { + t.Fatal(e) } - - if len(r) != 1 { - t.Fatal(pass, i, "Length should be 1.", r) + e = c.SetDataset([]string{path}) + if e != nil { + panic(e) } - - if received[r[0]] { - t.Fatal(pass, i, "Received duplicate.", received, r) + // test for n passes + for pass := 0; pass < 10; pass++ { + c.StartGetRecords(pass) + + received := make(map[byte]bool) + taskid := 0 + for { + r, e := c.NextRecord() + if e != nil { + // ErrorPassAfter will wait, else break for next pass + if e.Error() == master.ErrPassBefore.Error() || + e.Error() == master.ErrNoMoreAvailable.Error() { + break + } + t.Fatal(pass, taskid, "Read error:", e) + } + if len(r) != 1 { + t.Fatal(pass, taskid, "Length should be 1.", r) + } + if received[r[0]] { + t.Fatal(pass, taskid, "Received duplicate.", received, r) + } + taskid++ + received[r[0]] = true + } } - received[r[0]] = true - } + }() } + wg.Wait() } diff --git a/go/master/etcd_client.go b/go/master/etcd_client.go index ae6b6f776bec9ccaead4465ad233fc8ed6c3a418..94848d887e8bc4b055a7c8b89b9b7f26a39229d1 100644 --- a/go/master/etcd_client.go +++ b/go/master/etcd_client.go @@ -39,15 +39,12 @@ type EtcdClient struct { statePath string client *clientv3.Client lock *concurrency.Mutex + sess *concurrency.Session } // NewEtcdClient creates a new EtcdClient. func NewEtcdClient(endpoints []string, addr string, lockPath, addrPath, statePath string, ttlSec int) (*EtcdClient, error) { log.Debugf("Connecting to etcd at %v", endpoints) - // TODO(helin): gracefully shutdown etcd store. Because etcd - // store holds a etcd lock, even though the lock will expire - // when the lease timeout, we need to implement graceful - // shutdown to release the lock. cli, err := clientv3.New(clientv3.Config{ Endpoints: endpoints, DialTimeout: dialTimeout, @@ -67,12 +64,12 @@ func NewEtcdClient(endpoints []string, addr string, lockPath, addrPath, statePat // one master running, but split-brain problem may cause // multiple master servers running), and the cluster management // software will kill one of them. 
-	log.Debugf("Trying to acquire lock at %s.", lockPath)
+	log.Infof("Trying to acquire lock at %s.", lockPath)
 	err = lock.Lock(context.TODO())
 	if err != nil {
 		return nil, err
 	}
-	log.Debugf("Successfully acquired lock at %s.", lockPath)
+	log.Infof("Successfully acquired lock at %s.", lockPath)
 
 	put := clientv3.OpPut(addrPath, addr)
 	resp, err := cli.Txn(context.Background()).If(lock.IsOwner()).Then(put).Commit()
@@ -89,6 +86,7 @@ func NewEtcdClient(endpoints []string, addr string, lockPath, addrPath, statePat
 		statePath: statePath,
 		client:    cli,
 		lock:      lock,
+		sess:      sess,
 	}
 
 	return e, nil
@@ -157,6 +155,21 @@ func (e *EtcdClient) Load() ([]byte, error) {
 	return state, nil
 }
 
+// Shutdown shuts down the etcd client gracefully.
+func (e *EtcdClient) Shutdown() error {
+	err := e.sess.Close()
+	newErr := e.client.Close()
+	if newErr != nil {
+		if err == nil {
+			err = newErr
+		} else {
+			log.Errorln(newErr)
+		}
+	}
+
+	return err
+}
+
 // GetKey gets the value by the specify key.
 func GetKey(c *clientv3.Client, key string, timeout time.Duration) (string, error) {
 	ctx, cancel := context.WithTimeout(context.Background(), timeout)
diff --git a/go/master/inmem_store.go b/go/master/inmem_store.go
index ffd663f7f0b25c29f0bab082d27b29dcfeb60826..a5bd2d4fe150cd34c699ccfae1f3d3e0fb2ef3d6 100644
--- a/go/master/inmem_store.go
+++ b/go/master/inmem_store.go
@@ -40,3 +40,8 @@ func (m *InMemStore) Load() ([]byte, error) {
 
 	return m.buf, nil
 }
+
+// Shutdown shuts down the in-memory store.
+func (m *InMemStore) Shutdown() error {
+	return nil
+}
diff --git a/go/master/service.go b/go/master/service.go
index d1ec8939e18e8f4a7b4578a9399e2fa9f24325f3..d30e9a33229c0aff354417771b5bf2ae6a781715 100644
--- a/go/master/service.go
+++ b/go/master/service.go
@@ -19,6 +19,7 @@ import (
 	"compress/gzip"
 	"encoding/gob"
 	"errors"
+	"math/rand"
 	"os"
 	"path/filepath"
 	"sync"
@@ -33,10 +34,23 @@ const (
 	dialTimeout = 5 * time.Second
 )
 
+// ErrAllTaskFailed occurs when every task is in the done or failed state.
+var ErrAllTaskFailed = errors.New("all task finished")
+
+// ErrNoMoreAvailable occurs when no task is in todo but not all are done or failed.
+var ErrNoMoreAvailable = errors.New("no more available task")
+
+// ErrPassBefore occurs when the client-side pass number is smaller than the master's counter.
+var ErrPassBefore = errors.New("pass number smaller than master")
+
+// ErrPassAfter occurs when the client-side pass number is larger than the master's counter.
+var ErrPassAfter = errors.New("pass number larger than master")
+
 // Store is the interface for save and load the master state.
 type Store interface {
 	Save([]byte) error
 	Load() ([]byte, error)
+	Shutdown() error
 }
 
 // Chunk is a chunk of data consisted of several data instances.
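// A minimal client-side sketch of the pass protocol these errors define
// (hypothetical usage, mirroring the updated client_test.go later in this
// patch; numPasses and process are placeholders, not part of this change):
//
//	for pass := 0; pass < numPasses; pass++ {
//		c.StartGetRecords(pass)
//		for {
//			r, err := c.NextRecord()
//			if err != nil {
//				// ErrPassBefore / ErrNoMoreAvailable mark the end of the pass.
//				break
//			}
//			process(r) // placeholder for user code
//		}
//	}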
@@ -75,17 +89,26 @@ type Service struct {
 	chunksPerTask int
 	timeoutDur    time.Duration
 	failureMax    int
-	ready         chan struct{}
 	store         Store
 
-	mu         sync.Mutex
-	initDone   bool
-	taskQueues taskQueues
+	ready    chan struct{}
+	initDone bool
+
+	mu            sync.Mutex
+	taskQueues    taskQueues
+	currPass      int
+	jobTasks      []taskEntry
+	savingTrainer string
 }
 
 func partition(chunks []Chunk, chunksPerTask int) []taskEntry {
-	id := 0
+	// generate a unique id across the job using nanosecond timestamp + random int + counter
+	// FIXME(typhoonzero): this is a workaround, use uuid
+	randStart := rand.Int()
+	counter := 0
+	timestamp := time.Now().Nanosecond()
+	id := timestamp + randStart + counter
 	if chunksPerTask <= 0 {
 		chunksPerTask = 1
 	}
@@ -95,7 +118,8 @@ func partition(chunks []Chunk, chunksPerTask int) []taskEntry {
 	for i, c := range chunks {
 		if i%chunksPerTask == 0 && len(cur.Task.Chunks) > 0 {
 			cur.Task.Meta.ID = id
-			id++
+			counter++
+			id = timestamp + randStart + counter
 			result = append(result, cur)
 			cur.Task.Chunks = nil
 		}
@@ -266,19 +290,21 @@ func (s *Service) SetDataset(globPaths []string, _ *int) error {
 		return err
 	}
 
-	s.taskQueues.Todo = partition(chunks, s.chunksPerTask)
+	s.jobTasks = partition(chunks, s.chunksPerTask)
+	s.taskQueues.Todo = s.jobTasks
 
 	err = s.snapshot()
 	if err != nil {
 		log.Errorln(err)
 		return err
 	}
-
 	close(s.ready)
 	s.initDone = true
 	return nil
 }
 
+// processFailedTask re-dispatches a failed task unless it has already
+// failed s.failureMax times, in which case the task is discarded.
 func (s *Service) processFailedTask(t taskEntry, epoch int) {
 	if t.Task.Meta.Epoch != epoch {
 		// new epoch, task launched after the
@@ -302,8 +328,9 @@ func (s *Service) processFailedTask(t taskEntry, epoch int) {
 		return
 	}
 
-	log.Warningf("Task %v failed %d times, discard.", t.Task, t.NumFailure)
+	log.Warningf("Task %v failed %d times, re-dispatch.", t.Task, t.NumFailure)
 	s.taskQueues.Todo = append(s.taskQueues.Todo, t)
+	return
 }
 
 func (s *Service) checkTimeoutFunc(taskID int, epoch int) func() {
@@ -331,37 +358,30 @@ func (s *Service) logFields() log.Fields {
 }
 
 // GetTask gets a new task from the service.
-func (s *Service) GetTask(_ int, task *Task) error {
+// passID is the client-side pass count.
+func (s *Service) GetTask(passID int, task *Task) error {
 	select {
 	case <-s.ready:
 	}
 
 	s.mu.Lock()
 	defer s.mu.Unlock()
 
+	if passID < s.currPass {
+		return ErrPassBefore
+	}
+	if passID > s.currPass {
+		// A client may run ahead of the master's pass counter when it is
+		// faster than the others.
+		return ErrPassAfter
+	}
 	if len(s.taskQueues.Todo) == 0 {
-		if len(s.taskQueues.Done) == 0 {
-			if len(s.taskQueues.Pending) == 0 {
-				err := errors.New("all task failed")
-				log.WithFields(s.logFields()).Warningln("All tasks failed.")
-				return err
-			}
-
-			// TODO(helin): client need to retry in this
-			// error case. Gotcha: RPC client can't
-			// compare returned error with predefined
-			// errors like io.EOF, because the error
-			// instance deserialized from RPC is a
-			// different instance than the error defined
-			// in package. So we need to figure out a way
-			// for client to check this error correctly.
-			err := errors.New("no more available task")
-			log.WithFields(s.logFields()).Warningln("No more available task.")
-			return err
+		if len(s.taskQueues.Done) == 0 && len(s.taskQueues.Pending) == 0 {
+			log.WithFields(s.logFields()).Warningln("All tasks failed, may start next pass")
+			return ErrAllTaskFailed
 		}
-		s.taskQueues.Todo = s.taskQueues.Done
-		s.taskQueues.Done = nil
-		log.WithFields(s.logFields()).Infoln("No more todo task, but trainer is requesting task to do. Move all done task to todo.")
+		log.WithFields(s.logFields()).Warningln("No more available task.")
+		return ErrNoMoreAvailable
 	}
 
 	t := s.taskQueues.Todo[0]
@@ -381,7 +401,7 @@
 }
 
 // TaskFinished tell the service that a task is finished.
-func (s *Service) TaskFinished(taskID int, _ *int) error {
+func (s *Service) TaskFinished(taskID int, dummy *int) error {
 	select {
 	case <-s.ready:
 	}
@@ -401,11 +421,14 @@ func (s *Service) TaskFinished(taskID int, dummy *int) error {
 	delete(s.taskQueues.Pending, taskID)
 
 	log.WithFields(s.logFields()).Infof("Task #%d finished.", taskID)
-
-	if len(s.taskQueues.Pending) == 0 && len(s.taskQueues.Todo) == 0 {
-		log.WithFields(s.logFields()).Infoln("No more todo and pending task, start a new pass.")
-		s.taskQueues.Todo = append(s.taskQueues.Todo, s.taskQueues.Done...)
-		s.taskQueues.Done = nil
+	if len(s.taskQueues.Todo) == 0 && len(s.taskQueues.Pending) == 0 {
+		// increase the master-side pass count when all tasks are finished
+		s.currPass++
+		s.taskQueues.Todo = s.jobTasks
+		s.taskQueues.Done = []taskEntry{}
+		// TODO(typhoonzero): deal with failed tasks
+		s.taskQueues.Failed = []taskEntry{}
+		log.WithFields(s.logFields()).Warningf("all task finished, add new pass data, newpass: %d.", s.currPass)
 	}
 
 	err := s.snapshot()
@@ -416,7 +439,7 @@
 }
 
 // TaskFailed tells the service that a task is failed.
-func (s *Service) TaskFailed(meta TaskMeta, _ *int) error {
+func (s *Service) TaskFailed(meta TaskMeta, dummy *int) error {
 	select {
 	case <-s.ready:
 	}
diff --git a/go/master/service_internal_test.go b/go/master/service_internal_test.go
index 69a882fc33668a8cdefa30ae394f6c605f3bf099..bd1a939a55553b558181d91a757c487d0f97b40b 100644
--- a/go/master/service_internal_test.go
+++ b/go/master/service_internal_test.go
@@ -44,7 +44,8 @@ func TestPartionIndex(t *testing.T) {
 	cs := make([]Chunk, 100)
 	ts := partition(cs, 20)
 	for i := range ts {
-		if ts[i].Task.Meta.ID != i {
+		// test the auto-increment ids
+		if i > 0 && ts[i].Task.Meta.ID != ts[i-1].Task.Meta.ID+1 {
 			t.Error(ts[i], i)
 		}
 	}
diff --git a/go/master/service_test.go b/go/master/service_test.go
new file mode 100644
index 0000000000000000000000000000000000000000..5f91910ecc8cf32289e71e2e41e8b283acc115e6
--- /dev/null
+++ b/go/master/service_test.go
@@ -0,0 +1,68 @@
+package master_test
+
+import (
+	"os"
+	"testing"
+	"time"
+
+	"github.com/PaddlePaddle/Paddle/go/master"
+	"github.com/coreos/etcd/clientv3"
+	"github.com/coreos/etcd/embed"
+	"github.com/docker/docker/pkg/ioutils"
+	"github.com/stretchr/testify/assert"
+)
+
+func TestNewServiceWithEtcd(t *testing.T) {
+	// set up an embedded etcd server
+	etcdDir, err := ioutils.TempDir("", "")
+	if err != nil {
+		t.Fatal(err)
+	}
+	cfg := embed.NewConfig()
+	cfg.Dir = etcdDir
+	e, err := embed.StartEtcd(cfg)
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer func() {
+		e.Close()
+		if err := os.RemoveAll(etcdDir); err != nil {
+			t.Fatal(err)
+		}
+	}()
+	select {
+	case <-e.Server.ReadyNotify():
+		t.Log("Server is ready!")
+	case <-time.After(60 * time.Second):
+		e.Server.Stop() // trigger a shutdown
+		t.Fatal("Server took too long to start!")
+	}
+
+	ep := []string{"127.0.0.1:2379"}
+	masterAddr := "127.0.0.1:3306"
+	store, err := master.NewEtcdClient(ep, masterAddr, master.DefaultLockPath, master.DefaultAddrPath, master.DefaultStatePath, 30)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	_, err = master.NewService(store, 10, 10, 3)
+	if err != nil {
+		t.Fatal(err)
+	}
+	cli, err :=
clientv3.New(clientv3.Config{ + Endpoints: ep, + DialTimeout: 3 * time.Second, + }) + if err != nil { + t.Fatal(err) + } + v, err := master.GetKey(cli, master.DefaultAddrPath, 3*time.Second) + if err != nil { + t.Fatal(err) + } + if err := cli.Close(); err != nil { + t.Fatal(err) + } + // test master process registry itself into etcd server. + assert.Equal(t, masterAddr, v, "master process should registry itself into etcd server.") +} diff --git a/go/pserver/client/c/cclient.go b/go/pserver/client/c/cclient.go index 0f7e20cdd8d20e37b586c22377a89fca4c3cf7ce..14ad0774550f6e5a5d8610d6007904cd2820432c 100644 --- a/go/pserver/client/c/cclient.go +++ b/go/pserver/client/c/cclient.go @@ -55,10 +55,10 @@ var curHandle C.paddle_pserver_client func add(c *client.Client) C.paddle_pserver_client { mu.Lock() defer mu.Unlock() - client := curHandle + cli := curHandle curHandle++ - handleMap[client] = c - return client + handleMap[cli] = c + return cli } func get(client C.paddle_pserver_client) *client.Client { diff --git a/go/pserver/client/c/test/test_train.py b/go/pserver/client/c/test/test_train.py index e9264592b4f18fddf68b198d73bf907206e77a3f..85cb399590f7a5e7e73285ca87c49ea5f24afb32 100644 --- a/go/pserver/client/c/test/test_train.py +++ b/go/pserver/client/c/test/test_train.py @@ -6,16 +6,19 @@ import cPickle as pickle etcd_ip = os.getenv("MASTER_IP", "127.0.0.1") etcd_endpoint = "http://" + etcd_ip + ":2379" +print "connecting to master, etcd endpoints: ", etcd_endpoint +master_client = master.client(etcd_endpoint, 5, 64) def cloud_reader(): - print "connecting to master, etcd endpoints: ", etcd_endpoint - master_client = master.client(etcd_endpoint, 5, 64) + global master_client master_client.set_dataset( - ["/pfs/dlnel/public/dataset/uci_housing/uci_housing-*-of-*"]) + ["/pfs/dlnel/public/dataset/uci_housing/uci_housing-*"], passes=30) while 1: r, e = master_client.next_record() if not r: + if e != -2: # other errors + print "get record error:", e break yield pickle.loads(r) @@ -27,10 +30,12 @@ def main(): # network config x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13)) y_predict = paddle.layer.fc(input=x, - param_attr=paddle.attr.Param(name='w'), + param_attr=paddle.attr.Param( + name='w', learning_rate=1e-3), size=1, act=paddle.activation.Linear(), - bias_attr=paddle.attr.Param(name='b')) + bias_attr=paddle.attr.Param( + name='b', learning_rate=1e-3)) y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1)) cost = paddle.layer.mse_cost(input=y_predict, label=y) @@ -38,9 +43,8 @@ def main(): parameters = paddle.parameters.create(cost) # create optimizer of new remote updater to pserver - optimizer = paddle.optimizer.Momentum(momentum=0) + optimizer = paddle.optimizer.Momentum(momentum=0, learning_rate=1e-3) - print "etcd endoint: ", etcd_endpoint trainer = paddle.trainer.SGD(cost=cost, parameters=parameters, update_equation=optimizer, @@ -51,6 +55,8 @@ def main(): # event_handler to print training and testing info def event_handler(event): if isinstance(event, paddle.event.EndIteration): + # FIXME: for cloud data reader, pass number is managed by master + # should print the server side pass number if event.batch_id % 100 == 0: print "Pass %d, Batch %d, Cost %f" % ( event.pass_id, event.batch_id, event.cost) diff --git a/go/pserver/etcd_client.go b/go/pserver/etcd_client.go index 98ff8ce827c7cfcd9122cb043f2a6226057cc95a..4fb26307667295ab825d07be6c3d1d4b33f6eb8b 100644 --- a/go/pserver/etcd_client.go +++ b/go/pserver/etcd_client.go @@ -34,16 +34,19 @@ const 
( PsPath = "/ps/" // PsCheckpoint is the etcd path for store checkpoints information PsCheckpoint = "/checkpoints/" + + retryTimeout = 5 * time.Second ) // EtcdClient is the etcd client that the pserver uses for fault // tolerance, service registry and coordination. type EtcdClient struct { - numPservers int - etcdEndpoints string - etcdClient *clientv3.Client - // etcdTimeout is also used as retry intervals. - etcdTimeout time.Duration + numPservers int + endpoints string + client *clientv3.Client + sess *concurrency.Session + dialTimeout time.Duration + ttlSec int // FIXME: ensure GetExternalIP gets the correct ip for trainers to connect. externalIP string // desired number of pservers in the job. @@ -52,11 +55,12 @@ type EtcdClient struct { } // NewEtcdClient creates an EtcdClient -func NewEtcdClient(endpoints string, numPservers int, timeout time.Duration) *EtcdClient { +func NewEtcdClient(endpoints string, numPservers int, dialtimeout time.Duration, ttlSec int) *EtcdClient { return &EtcdClient{ - etcdTimeout: timeout, - numPservers: numPservers, - etcdEndpoints: endpoints, + dialTimeout: dialtimeout, + ttlSec: ttlSec, + numPservers: numPservers, + endpoints: endpoints, } } @@ -64,7 +68,6 @@ func NewEtcdClient(endpoints string, numPservers int, timeout time.Duration) *Et // // Register returns the index of the current pserver. func (e *EtcdClient) Register(port int) (int, error) { - var err error e.externalIP, err = networkhelper.GetExternalIP() if err != nil { @@ -72,19 +75,26 @@ func (e *EtcdClient) Register(port int) (int, error) { } // initialize connection to etcd. - ep := strings.Split(e.etcdEndpoints, ",") + ep := strings.Split(e.endpoints, ",") for { cli, err := clientv3.New(clientv3.Config{ Endpoints: ep, - DialTimeout: e.etcdTimeout, + DialTimeout: e.dialTimeout, }) if err != nil { log.Errorf("connect to etcd error: %v", err) - time.Sleep(e.etcdTimeout) + time.Sleep(retryTimeout) + continue + } + e.client = cli + sess, err := concurrency.NewSession(cli, concurrency.WithTTL(e.ttlSec)) + if err != nil { + log.Errorf("create etcd session error: %v", err) + time.Sleep(retryTimeout) continue } - e.etcdClient = cli - log.Debugf("inited client to %s", e.etcdEndpoints) + e.sess = sess + log.Debugf("inited client to %s", e.endpoints) break } // init /ps_desired using transaction, for multiple pservers may want to write @@ -95,7 +105,7 @@ func (e *EtcdClient) Register(port int) (int, error) { cancel() if err != nil { log.Warn(err) - time.Sleep(e.etcdTimeout) + time.Sleep(retryTimeout) continue } break @@ -106,18 +116,18 @@ func (e *EtcdClient) Register(port int) (int, error) { // wait and set s.desired init value for { ctx, cancel := context.WithTimeout(context.Background(), time.Second) - resp, err := e.etcdClient.Get(ctx, PsDesired) + resp, err := e.client.Get(ctx, PsDesired) cancel() if err != nil { log.Errorf("getting %s error: %v", PsDesired, err) - time.Sleep(e.etcdTimeout) + time.Sleep(retryTimeout) continue } if len(resp.Kvs) != 0 { e.desired, err = strconv.Atoi(string(resp.Kvs[0].Value)) if err != nil { log.Errorf("value of %s invalid %v\n", PsDesired, err) - time.Sleep(e.etcdTimeout) + time.Sleep(retryTimeout) // NOTE: wait util ps_desired value change continue } @@ -134,7 +144,7 @@ func (e *EtcdClient) Register(port int) (int, error) { cancel() if err != nil { log.Warn(err) - time.Sleep(e.etcdTimeout) + time.Sleep(retryTimeout) continue } break @@ -144,10 +154,10 @@ func (e *EtcdClient) Register(port int) (int, error) { } func (e *EtcdClient) initDesiredPservers(ctx 
context.Context, numPservers int) (*clientv3.TxnResponse, error) { - return concurrency.NewSTM(e.etcdClient, func(c concurrency.STM) error { + return concurrency.NewSTM(e.client, func(c concurrency.STM) error { dsStr := c.Get(PsDesired) if dsStr == "" { - c.Put(PsDesired, strconv.Itoa(numPservers)) + c.Put(PsDesired, strconv.Itoa(numPservers), clientv3.WithLease(e.sess.Lease())) } return nil }, concurrency.WithAbortContext(ctx), concurrency.WithIsolation(concurrency.RepeatableReads)) @@ -156,7 +166,7 @@ func (e *EtcdClient) initDesiredPservers(ctx context.Context, numPservers int) ( // registerPserverEtcd registers pserver node on etcd using transaction. func (e *EtcdClient) registerPserverEtcd(ctx context.Context, port int) (int, error) { var idx int - _, err := concurrency.NewSTM(e.etcdClient, func(c concurrency.STM) error { + _, err := concurrency.NewSTM(e.client, func(c concurrency.STM) error { registered := false for i := 0; i < e.desired; i++ { psKey := PsPath + strconv.Itoa(i) @@ -165,26 +175,10 @@ func (e *EtcdClient) registerPserverEtcd(ctx context.Context, port int) (int, er log.Debugf("got value (%s) for key: %s", ps, psKey) if ps == "" { - resp, err := e.etcdClient.Grant(context.TODO(), 5) - if err != nil { - log.Fatal(err) - } // find the first id and write info pserverAddr := e.externalIP + ":" + strconv.Itoa(port) - c.Put(psKey, pserverAddr, clientv3.WithLease(resp.ID)) + c.Put(psKey, pserverAddr, clientv3.WithLease(e.sess.Lease())) log.Debugf("set pserver node %s with value %s", psKey, pserverAddr) - ch, kaerr := e.etcdClient.KeepAlive(context.TODO(), resp.ID) - if kaerr != nil { - log.Errorf("keepalive etcd node error: %v", kaerr) - return kaerr - } - - // Eat the keep alive message so etcd - // will not expire the lease. - go func(ch <-chan *clientv3.LeaseKeepAliveResponse) { - ka := <-ch - log.Debugf("keepalive: %d\n", ka.TTL) - }(ch) log.Debug("register finished") idx = i registered = true @@ -207,7 +201,7 @@ func (e *EtcdClient) registerPserverEtcd(ctx context.Context, port int) (int, er // GetKey gets the value by the specified key func (e *EtcdClient) GetKey(key string, timeout time.Duration) ([]byte, error) { ctx, cancel := context.WithTimeout(context.Background(), timeout) - resp, err := e.etcdClient.Get(ctx, key) + resp, err := e.client.Get(ctx, key) cancel() if err != nil { return []byte{}, err @@ -223,7 +217,27 @@ func (e *EtcdClient) GetKey(key string, timeout time.Duration) ([]byte, error) { // PutKey put into etcd with value by key specified func (e *EtcdClient) PutKey(key string, value []byte, timeout time.Duration) error { ctx, cancel := context.WithTimeout(context.Background(), timeout) - _, err := e.etcdClient.Put(ctx, key, string(value)) + _, err := e.client.Put(ctx, key, string(value), clientv3.WithLease(e.sess.Lease())) cancel() return err } + +// Shutdown shuts down the etcd client gracefully. 
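+// It closes the concurrency session first, which revokes the session lease
+// (and with it every key registered under that lease), then closes the
+// underlying client; the first error encountered is returned.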
+func (e *EtcdClient) Shutdown() error { + var err error + if e.sess != nil { + err = e.sess.Close() + } + + if e.client != nil { + newErr := e.client.Close() + if newErr != nil { + if err != nil { + log.Errorln(newErr) + } else { + err = newErr + } + } + } + return err +} diff --git a/paddle/api/Evaluator.cpp b/paddle/api/Evaluator.cpp index 681e3a380912339c531c16c88f43255c2f34c32f..fcda6eaf031c02f2314298f88b3af2c08ba6fa11 100644 --- a/paddle/api/Evaluator.cpp +++ b/paddle/api/Evaluator.cpp @@ -37,7 +37,7 @@ std::vector Evaluator::getNames() const { double Evaluator::getValue(const std::string name) const { paddle::Error err; double v = m->rawPtr->getValue(name, &err); - if (err) { + if (!err.isOK()) { throw std::runtime_error(err.msg()); } return v; diff --git a/paddle/cuda/include/hl_cnn.h b/paddle/cuda/include/hl_cnn.h index f55197c8c9ebb4a0f67ab915abfefd6a45cd13aa..9f84db72da24b0e678520b077f9cba7ffc2d589a 100644 --- a/paddle/cuda/include/hl_cnn.h +++ b/paddle/cuda/include/hl_cnn.h @@ -17,73 +17,6 @@ limitations under the License. */ #include "hl_base.h" -/** - * @brief Shrink column to feature. - * - * @param[in] dataCol expand data. - * @param[in] channels number of channel. - * @param[in] height image height. - * @param[in] width image width. - * @param[in] blockH filter height. - * @param[in] blockW filter width. - * @param[in] strideH stride height. - * @param[in] strideW stride width. - * @param[in] paddingH padding height. - * @param[in] paddingW padding width. - * @param[in] outputH output height. - * @param[in] outputW output width. - * @param[out] dataIm output image data. - * @param[in] alpha - * @param[in] beta - */ -extern void hl_shrink_col2feature(const real* dataCol, - size_t channels, - size_t height, - size_t width, - size_t blockH, - size_t blockW, - size_t strideH, - size_t strideW, - size_t paddingH, - size_t paddingW, - size_t outputH, - size_t outputW, - real* dataIm, - real alpha = 1.0f, - real beta = 0.0f); - -/** - * @brief Expand feature to column. - * - * @param[in] dataIm input image data. - * @param[in] channels number of channel. - * @param[in] height image height. - * @param[in] width image width. - * @param[in] blockH filter height. - * @param[in] blockW filter width. - * @param[in] strideH stride height. - * @param[in] strideW stride width. - * @param[in] paddingH padding height. - * @param[in] paddingW padding width. - * @param[in] outputH output height. - * @param[in] outputW output width. - * @param[out] dataCol expand data. - * - */ -extern void hl_expand_feature2col(const real* dataIm, - size_t channels, - size_t height, - size_t width, - size_t blockH, - size_t blockW, - size_t strideH, - size_t strideW, - size_t paddingH, - size_t paddingW, - size_t outputH, - size_t outputW, - real* dataCol); - /** * @brief Maximum pool forward. * diff --git a/paddle/cuda/include/stub/hl_cnn_stub.h b/paddle/cuda/include/stub/hl_cnn_stub.h index 039551c6cc69525e71c8c311f78fb6dec07d7fed..2bbb9fa8dfd5eeac9d55aa67a28ebfbffa2acd46 100644 --- a/paddle/cuda/include/stub/hl_cnn_stub.h +++ b/paddle/cuda/include/stub/hl_cnn_stub.h @@ -17,36 +17,6 @@ limitations under the License. 
*/ #include "hl_cnn.h" -inline void hl_shrink_col2feature(const real* dataCol, - size_t channels, - size_t height, - size_t width, - size_t blockH, - size_t blockW, - size_t strideH, - size_t strideW, - size_t paddingH, - size_t paddingW, - size_t outputH, - size_t outputW, - real* dataIm, - real alpha, - real beta) {} - -inline void hl_expand_feature2col(const real* dataIm, - size_t channels, - size_t height, - size_t width, - size_t blockH, - size_t blockW, - size_t strideH, - size_t strideW, - size_t paddingH, - size_t paddingW, - size_t outputH, - size_t outputW, - real* dataCol) {} - inline void hl_maxpool_forward(const int frameCnt, const real* inputData, const int channels, diff --git a/paddle/cuda/src/hl_cuda_cnn.cu b/paddle/cuda/src/hl_cuda_cnn.cu index b94f4d8fe4a251750c527d4b686fcc8f452d4606..b6e3e63a4f52261e49467bd82fdabd063e81460e 100644 --- a/paddle/cuda/src/hl_cuda_cnn.cu +++ b/paddle/cuda/src/hl_cuda_cnn.cu @@ -18,134 +18,6 @@ limitations under the License. */ #include "hl_cnn.h" #include "hl_device_functions.cuh" -__global__ void KeFeature2col(size_t n, size_t height, const real* data_im, - size_t blockH, size_t blockW, size_t width, - size_t strideH, size_t strideW, - size_t paddingH, size_t paddingW, - size_t height_col, size_t width_col, - real* data_col) { - size_t index = - (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; - if (index < n) { - size_t w_out = index % width_col; - index /= width_col; - size_t h_out = index % height_col; - size_t channel_in = index / height_col; - size_t channel_out = channel_in * blockH * blockW; - size_t h_in = h_out * strideH; - size_t w_in = w_out * strideW; - - data_col += (channel_out * height_col + h_out) * width_col + w_out; - for (size_t i = 0; i < blockH; ++i) { - for (size_t j = 0; j < blockW; ++j) { - int rIdx = int(h_in+i); - int cIdx = int(w_in+j); - if ((rIdx-(int)paddingH) >= (int)height || - (rIdx-(int)paddingH) < 0 || - (cIdx-(int)paddingW) >= (int)width || - (cIdx-(int)paddingW) < 0) { - *data_col = 0; - } else { - rIdx = rIdx + channel_in*height - paddingH; - cIdx = cIdx - paddingW; - *data_col = data_im[rIdx* width + cIdx]; - } - data_col += height_col * width_col; - } - } - } -} - -void hl_expand_feature2col(const real* dataIm, size_t channels, - size_t height, size_t width, - size_t blockH, size_t blockW, - size_t strideH, size_t strideW, - size_t paddingH, size_t paddingW, - size_t outputH, size_t outputW, - real* dataCol) { - size_t numKernels = channels * outputH * outputW; - - size_t blocks = (numKernels + 1024 -1) / 1024; - size_t blockX = 512; - size_t blockY = (blocks+512-1)/512; - dim3 threads(1024, 1); - dim3 grid(blockX, blockY); - KeFeature2col<<< grid, threads, 0, STREAM_DEFAULT >>> - (numKernels, height, dataIm, blockH, blockW, width, - strideH, strideW, paddingH, paddingW, - outputH, outputW, dataCol); - CHECK_SYNC("hl_expand_feature2col failed"); -} - -__global__ void KeCol2Feature(size_t n, const real* data_col, size_t height, - size_t width, size_t channels, - size_t blockH, size_t blockW, - size_t strideH, size_t strideW, - size_t paddingH, size_t paddingW, - size_t height_col, size_t width_col, - real* data_im, real alpha, real beta) { - size_t index = - (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; - if (index < n) { - real val = 0; - int w = int(index % width); - int h = int((index / width) % height); - int c = int(index / (width * height)); - if ((w - (int)paddingW) >= 0 && - (w - (int)paddingW) < (width-2 * paddingW) && - (h - (int)paddingH) >= 0 && - (h - 
paddingH) < (height - 2 * paddingH)) { - // compute the start and end of the output - int w_col_start = - (w < (int)blockW) ? 0 : (w - int(blockW)) / (int)strideW + 1; - int w_col_end = - min((int)(w / (int)strideW + 1), (int)(width_col)); - int h_col_start = - (h < (int)blockH) ? 0 : (h - (int)blockH) / (int)strideH + 1; - int h_col_end = min(int(h / strideH + 1), int(height_col)); - for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { - for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { - // the col location: [c * width * height + h_out, w_out] - int c_col = int(c * blockH* blockW) + \ - (h - h_col * (int)strideH) * (int)blockW + - (w - w_col * (int)strideW); - val += data_col[(c_col * height_col + h_col) * width_col + w_col]; - } - } - h -= paddingH; - w -= paddingW; - real tD = data_im[c*((width-2*paddingW) * (height-2*paddingH)) + - h*(width-2*paddingW) + w]; - data_im[c*((width-2*paddingW) * (height-2*paddingH)) + - h*(width-2*paddingW) + w] = alpha * val + beta*tD; - } - } -} - -void hl_shrink_col2feature(const real * dataCol, size_t channels, - size_t height, size_t width, - size_t blockH, size_t blockW, - size_t strideH, size_t strideW, - size_t paddingH, size_t paddingW, - size_t outputH, size_t outputW, - real* dataIm, real alpha, real beta) { - size_t numKernels = channels * (height + 2*paddingH) * (width + 2*paddingW); - - size_t blocks = (numKernels + 1024 -1) / 1024; - size_t blockX = 512; - size_t blockY = (blocks+512-1)/512; - dim3 threads(1024, 1); - dim3 grid(blockX, blockY); - - // To avoid involving atomic operations, we will launch one kernel per - // bottom dimension, and then in the kernel add up the top dimensions. - KeCol2Feature<<< grid, threads, 0, STREAM_DEFAULT >>> - (numKernels, dataCol, height + 2*paddingH, width + 2*paddingW, - channels, blockH, blockW, strideH, strideW, paddingH, paddingW, - outputH, outputW, dataIm, alpha, beta); - CHECK_SYNC("hl_shrink_col2feature failed"); -} - __global__ void KeMaxPoolForward(const int nthreads, const real* inputData, const int channels, const int height, const int width, diff --git a/paddle/cuda/src/hl_cuda_sequence.cu b/paddle/cuda/src/hl_cuda_sequence.cu index 0fe2877f89f8d0fbc4db40c400037be30bb87ff7..4f650ce03ccb2d14cc2997e9cd426acb91439539 100644 --- a/paddle/cuda/src/hl_cuda_sequence.cu +++ b/paddle/cuda/src/hl_cuda_sequence.cu @@ -330,7 +330,7 @@ __global__ void KeSequenceAvgForward(real* dst, } sum = mode == 1 ? sum : (mode == 0 ? sum / seqLength : sum * my_rsqrt((real)seqLength)); - dst[gid] = sum; + dst[gid] += sum; } } diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index 433edbfda742d3be9915eade7b0a455398a501dc..21cb7c7265e0052630b68954fa25f9189e641e7b 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -3,7 +3,7 @@ cc_library(ddim SRCS ddim.cc DEPS eigen3) cc_test(ddim_test SRCS ddim_test.cc DEPS ddim) nv_test(dim_test SRCS dim_test.cu DEPS ddim) -cc_library(tensor SRCS tensor.cc DEPS ddim place paddle_memory) +cc_library(tensor SRCS tensor.cc DEPS ddim place paddle_memory device_context) cc_test(tensor_test SRCS tensor_test.cc DEPS tensor) cc_test(eigen_test SRCS eigen_test.cc DEPS tensor) @@ -29,7 +29,5 @@ py_proto_compile(framework_py_proto SRCS attr_type.proto op_proto.proto op_desc. 
add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) add_dependencies(framework_py_proto framework_py_proto_init) -proto_library(net_proto SRCS net_proto.proto DEPS op_proto) -# cc_library(net SRCS net.cc DEPS operator net_proto op_registry fc_op) -cc_library(net SRCS net.cc DEPS operator net_proto op_registry) +cc_library(net SRCS net.cc DEPS op_registry) cc_test(net_op_test SRCS net_op_test.cc DEPS net add_op mul_op sigmoid_op softmax_op fc_op) diff --git a/paddle/framework/detail/tensor-inl.h b/paddle/framework/detail/tensor-inl.h new file mode 100644 index 0000000000000000000000000000000000000000..e7ff09dd5c954378afeca299e901277c3ebdb96a --- /dev/null +++ b/paddle/framework/detail/tensor-inl.h @@ -0,0 +1,142 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/memory/memcpy.h" + +namespace paddle { +namespace framework { + +template +inline void Tensor::check_memory_size() const { + PADDLE_ENFORCE(holder_ != nullptr, + "Tenosr holds no memory. Call Tensor::mutable_data first."); + PADDLE_ENFORCE(holder_->size() >= product(dims_) * sizeof(T) + offset_, + "Tensor's dims_ is out of bound. Call Tensor::mutable_data " + "first to re-allocate memory."); +} + +template +inline const T* Tensor::data() const { + check_memory_size(); + return reinterpret_cast( + reinterpret_cast(holder_->ptr()) + offset_); +} + +template +inline T* Tensor::data() { + check_memory_size(); + return reinterpret_cast(reinterpret_cast(holder_->ptr()) + + offset_); +} + +template +inline T* Tensor::mutable_data(DDim dims, platform::Place place) { + static_assert(std::is_pod::value, "T must be POD"); + Resize(dims); + return mutable_data(place); +} + +template +inline T* Tensor::mutable_data(platform::Place place) { + static_assert(std::is_pod::value, "T must be POD"); + PADDLE_ENFORCE(product(dims_) > 0, + "Tensor's numel must be larger than zero to call " + "Tensor::mutable_data. 
Call Tensor::set_dim first."); + /* some versions of boost::variant don't have operator!= */ + size_t size = product(dims_) * sizeof(T); + if (holder_ == nullptr || !(holder_->place() == place) || + holder_->size() < size + offset_) { + if (platform::is_cpu_place(place)) { + holder_.reset(new PlaceholderImpl( + boost::get(place), size)); + } +#ifndef PADDLE_ONLY_CPU + else if (platform::is_gpu_place(place)) { + holder_.reset(new PlaceholderImpl( + boost::get(place), size)); + } +#endif + offset_ = 0; + } + return reinterpret_cast(reinterpret_cast(holder_->ptr()) + + offset_); +} + +template +inline void Tensor::ShareDataWith(const Tensor& src) { + src.check_memory_size(); + *this = src; +} + +template +inline void Tensor::CopyFrom(const Tensor& src, + const platform::Place& dst_place) { + src.check_memory_size(); + Resize(src.dims()); + + auto src_place = src.holder_->place(); + auto src_ptr = static_cast(src.data()); + + auto dst_ptr = static_cast(mutable_data(dst_place)); + + auto size = product(src.dims_) * sizeof(T); + + if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) { + memory::Copy(boost::get(dst_place), dst_ptr, + boost::get(src_place), src_ptr, size); + } +#ifndef PADDLE_ONLY_CPU + else if (platform::is_gpu_place(src_place) && + platform::is_cpu_place(dst_place)) { + memory::Copy(boost::get(dst_place), dst_ptr, + boost::get(src_place), src_ptr, size, 0); + } else if (platform::is_cpu_place(src_place) && + platform::is_gpu_place(dst_place)) { + memory::Copy(boost::get(dst_place), dst_ptr, + boost::get(src_place), src_ptr, size, 0); + } else if (platform::is_gpu_place(src_place) && + platform::is_gpu_place(dst_place)) { + memory::Copy(boost::get(dst_place), dst_ptr, + boost::get(src_place), src_ptr, size, 0); + } + +#endif +} + +template +inline Tensor Tensor::Slice(const int& begin_idx, const int& end_idx) const { + check_memory_size(); + PADDLE_ENFORCE(begin_idx >= 0, "Slice begin index is less than zero."); + PADDLE_ENFORCE(end_idx <= dims_[0], "Slice end index is out of bound."); + PADDLE_ENFORCE(begin_idx < end_idx, + "Begin index must be less than end index."); + PADDLE_ENFORCE(dims_[0] != 1, "Can not slice a tensor with dims_[0] = 1."); + int base = product(dims_) / dims_[0]; + Tensor dst; + dst.holder_ = holder_; + DDim dst_dims = dims_; + dst_dims[0] = end_idx - begin_idx; + dst.Resize(dst_dims); + dst.offset_ = offset_ + begin_idx * base * sizeof(T); + return dst; +} + +inline void Tensor::Resize(const DDim& dims) { dims_ = dims; } + +inline const DDim& Tensor::dims() const { return dims_; } + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/net.cc b/paddle/framework/net.cc index bc23b63b35d37eea01ae6b9b8891e9cd94615898..2cd378c6b21303d1a24206ba3010b0d035aaa766 100644 --- a/paddle/framework/net.cc +++ b/paddle/framework/net.cc @@ -20,17 +20,7 @@ namespace paddle { namespace framework { -std::shared_ptr AddBackwardOp(std::shared_ptr ForwardOps) { - auto grad_ops = std::make_shared(); - for (auto& op : ForwardOps->ops_) { - auto op_grad = OpRegistry::CreateGradOp(op); - grad_ops->AddOp(op_grad); - } - grad_ops->CompleteAddOp(); - return grad_ops; -} - -void PlainNet::CompleteAddOp(bool calc) { +void NetOp::CompleteAddOp(bool calc) { add_op_done_ = true; if (!calc) return; std::unordered_set input_set; @@ -70,7 +60,7 @@ void PlainNet::CompleteAddOp(bool calc) { attrs_["temporary_index"] = tmp_index; } -std::string PlainNet::DebugString() const { +std::string NetOp::DebugString() const { std::ostringstream os; os << 
diff --git a/paddle/framework/net.h b/paddle/framework/net.h
index 3264f1f565e3efc188e7835cb9b44e5741e1eea8..089c1355951f59d51db16d4b4bdce4282d6e5c25 100644
--- a/paddle/framework/net.h
+++ b/paddle/framework/net.h
@@ -37,21 +37,7 @@ namespace framework {
  * This is the base class of network, all the networks should implement the APIs
  * it defines.
  */
-class Net : public OperatorBase {
- public:
-  virtual void AddOp(const std::shared_ptr<OperatorBase>& op) = 0;
-  virtual void CompleteAddOp(bool calc) = 0;
-};
-
-using NetPtr = std::shared_ptr<Net>;
-
-/**
- * @brief a basic implementation of Net.
- *
- * PlainNet is a very simple Net, it create a list of operators, and run them
- * sequentially following the order they added.
- */
-class PlainNet : public Net {
+class NetOp : public OperatorBase {
  public:
   /**
    * Infer all the operators' input and output variables' shapes, will be called
@@ -80,15 +66,17 @@
   /**
    * @brief Add an operator by ptr
    */
-  void AddOp(const std::shared_ptr<OperatorBase>& op) override {
+  void AddOp(const std::shared_ptr<OperatorBase>& op) {
     PADDLE_ENFORCE(!add_op_done_, "Cannot AddOp when this network is sealed");
     ops_.push_back(op);
   }
 
-  void CompleteAddOp(bool calculate = true) override;
+  void CompleteAddOp(bool calculate = true);
 
   std::string DebugString() const override;
 
+  bool IsNetOp() const override;
+
   std::vector<std::shared_ptr<OperatorBase>> ops_;
 
  private:
@@ -100,7 +88,5 @@
   }
 };
 
-std::shared_ptr<PlainNet> AddBackwardOp(std::shared_ptr<PlainNet> ForwardOps);
-
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/net_op_test.cc b/paddle/framework/net_op_test.cc
index 20b42cbb4923590804a7806ac42347590c73d62f..8048311fe54ee1827fb5b91577478a1d30803e43 100644
--- a/paddle/framework/net_op_test.cc
+++ b/paddle/framework/net_op_test.cc
@@ -40,7 +40,7 @@ void AssertSameVectorWithoutOrder(const std::vector<T>& expected,
 }
 
 TEST(OpKernel, all) {
-  auto net = std::make_shared<PlainNet>();
+  auto net = std::make_shared<NetOp>();
   ASSERT_NE(net, nullptr);
 
   auto op1 = std::make_shared<TestOp>();
@@ -69,30 +69,23 @@ TEST(OpKernel, all) {
   net->Run(scope, dev_ctx);
   ASSERT_EQ(2, infer_shape_cnt);
   ASSERT_EQ(2, run_cnt);
-  ASSERT_THROW(net->AddOp(op2), std::runtime_error);
-}
-
-TEST(AddBackwardOp, TestGradOp) {
-  auto net = std::make_shared<PlainNet>();
-  ASSERT_NE(net, nullptr);
-  net->AddOp(framework::OpRegistry::CreateOp("mul", {"X", "Y"}, {"Out"}, {}));
-  net->AddOp(
-      framework::OpRegistry::CreateOp("add_two", {"X", "Y"}, {"Out"}, {}));
-  net->AddOp(framework::OpRegistry::CreateOp("add_two", {"X", "Y"}, {""}, {}));
-  auto grad_ops = AddBackwardOp(net);
-  for (auto& op : grad_ops->ops_) {
-    op->DebugString();
-  }
+  ASSERT_THROW(net->AddOp(op2), paddle::platform::EnforceNotMet);
 }
 
-// TODO(zhihong): add fc grad without registering.
-// TEST(AddBackwardOp, TestNoGradOp) {
-// auto net = std::make_shared<PlainNet>();
-// ASSERT_NE(net, nullptr);
-// net->AddOp(framework::OpRegistry::CreateOp("fc", {"X", "W", "b"}, {"Y"},
-// {})); auto grad_ops = AddBackwardOp(net); for (auto& op : grad_ops->ops_) {
-// op->DebugString();
-// }
-// }
+//! TODO(yuyang18): Refine Backward Op.
+// TEST(AddBackwardOp, TestGradOp) {
+// auto net = std::make_shared<NetOp>();
+// ASSERT_NE(net, nullptr);
+// net->AddOp(framework::OpRegistry::CreateOp("mul", {"X", "Y"}, {"Out"}, {}));
+// net->AddOp(
+// framework::OpRegistry::CreateOp("add_two", {"X", "Y"}, {"Out"}, {}));
+// net->AddOp(framework::OpRegistry::CreateOp("add_two", {"X", "Y"}, {""},
+// {}));
+// auto grad_ops = AddBackwardOp(net);
+// for (auto& op : grad_ops->ops_) {
+// op->DebugString();
+// }
+//}
 
 }  // namespace framework
 }  // namespace paddle
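With `IsNetOp()` now on OperatorBase, callers can branch on "is this a network" without RTTI. A small sketch of how this might be used (the traversal helper is hypothetical; `NetOp::ops_` is the public member added above):

    // Hypothetical helper: counts leaf operators, recursing into nested nets.
    size_t CountLeafOps(
        const std::shared_ptr<paddle::framework::OperatorBase>& op) {
      if (!op->IsNetOp()) return 1;
      auto net = std::static_pointer_cast<paddle::framework::NetOp>(op);
      size_t n = 0;
      for (auto& sub : net->ops_) n += CountLeafOps(sub);
      return n;
    }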
diff --git a/paddle/framework/net_proto.proto b/paddle/framework/net_proto.proto
deleted file mode 100644
index 0779f49fe2a9a6d0d1ea5ec11ba3befeb0a67fa1..0000000000000000000000000000000000000000
--- a/paddle/framework/net_proto.proto
+++ /dev/null
@@ -1,15 +0,0 @@
-syntax="proto2";
-package paddle.framework;
-
-import "op_proto.proto";
-
-message NetDesc {
-  // network identification
-  optional string name = 1;
-  // operator contains in network
-  repeated OpProto operators = 2;
-  // network type to run with. e.g "plainNet", "DAG"
-  optional string net_type = 3;
-  // num worker always
-  optional int32 num_workers = 4;
-}
diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h
index f16deae028d76dc40d6bc589648b461c430c3c98..384f0f631dd9b9a4dd7c0c628340afe668bc248f 100644
--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
@@ -403,15 +403,16 @@ class GradOpRegisterHelper {
   STATIC_ASSERT_GLOBAL_NAMESPACE(                                         \
       __reg_op_kernel_##type##_##DEVICE_TYPE##__,                         \
       "REGISTER_OP_KERNEL must be in global namespace");                  \
-  struct __op_kernel_register__##type##__ {                               \
-    __op_kernel_register__##type##__() {                                  \
+  struct __op_kernel_register__##type##__##DEVICE_TYPE##__ {              \
+    __op_kernel_register__##type##__##DEVICE_TYPE##__() {                 \
      ::paddle::framework::OperatorWithKernel::OpKernelKey key;            \
      key.place_ = PlaceType();                                            \
      ::paddle::framework::OperatorWithKernel::AllOpKernels()[#type][key]  \
          .reset(new __VA_ARGS__());                                       \
    }                                                                      \
  };                                                                       \
-  static __op_kernel_register__##type##__ __reg_kernel_##type##__;        \
+  static __op_kernel_register__##type##__##DEVICE_TYPE##__                \
+      __reg_kernel_##type##__##DEVICE_TYPE##__;                           \
   int __op_kernel_register_##type##_handle_##DEVICE_TYPE##__() { return 0; }
 
 // (type, KernelType)
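The rename above fixes a symbol clash: registering kernels for both devices for the same op in one translation unit previously defined the same registrar struct and static twice. A stripped-down sketch of the pattern (not the actual macro; names are illustrative):

    // Keying the registrar on DEVICE_TYPE yields distinct symbols, e.g.
    // demo_reg_add_CPU__ and demo_reg_add_GPU__, so both can coexist.
    #define DEMO_REGISTER_KERNEL(type, DEVICE_TYPE)                  \
      struct demo_reg_##type##_##DEVICE_TYPE##__ {                   \
        demo_reg_##type##_##DEVICE_TYPE##__() { /* insert kernel */ } \
      };                                                             \
      static demo_reg_##type##_##DEVICE_TYPE##__                     \
          demo_var_##type##_##DEVICE_TYPE##__;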
diff --git a/paddle/framework/op_registry_test.cc b/paddle/framework/op_registry_test.cc
index 05095372d835e7137daedb548b4bb78043e586ea..2ef781bf8672c8aa53ae32a44f1ea61973f3792c 100644
--- a/paddle/framework/op_registry_test.cc
+++ b/paddle/framework/op_registry_test.cc
@@ -90,7 +90,7 @@ TEST(OpRegistry, IllegalAttr) {
   bool caught = false;
   try {
     paddle::framework::OpRegistry::CreateOp(op_desc);
-  } catch (std::runtime_error& err) {
+  } catch (paddle::platform::EnforceNotMet err) {
     caught = true;
     std::string msg = "larger_than check fail";
     const char* err_msg = err.what();
@@ -136,7 +136,7 @@ TEST(OpRegistry, CustomChecker) {
   bool caught = false;
   try {
     paddle::framework::OpRegistry::CreateOp(op_desc);
-  } catch (std::runtime_error& err) {
+  } catch (paddle::platform::EnforceNotMet err) {
     caught = true;
     std::string msg = "Attribute 'test_attr' is required!";
     const char* err_msg = err.what();
@@ -154,7 +154,7 @@ TEST(OpRegistry, CustomChecker) {
   caught = false;
   try {
     paddle::framework::OpRegistry::CreateOp(op_desc);
-  } catch (std::runtime_error& err) {
+  } catch (paddle::platform::EnforceNotMet err) {
     caught = true;
     std::string msg = "'test_attr' must be even!";
     const char* err_msg = err.what();
@@ -192,7 +192,7 @@ TEST(ProtoMaker, DuplicatedAttr) {
   pd::OpProto op_proto;
   pd::OpAttrChecker op_checker;
   auto proto_maker = TestAttrProtoMaker(&op_proto, &op_checker);
-  ASSERT_THROW(proto_maker.Validate(), std::runtime_error);
+  ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet);
 }
 
 class TestInOutProtoMaker : public pd::OpProtoAndCheckerMaker {
@@ -208,5 +208,5 @@ TEST(ProtoMaker, DuplicatedInOut) {
   pd::OpProto op_proto;
   pd::OpAttrChecker op_checker;
   auto proto_maker = TestInOutProtoMaker(&op_proto, &op_checker);
-  ASSERT_THROW(proto_maker.Validate(), std::runtime_error);
+  ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet);
 }
diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc
index 0a317dffa961a365ad8981aa8cae53c27e80c16b..745508e6ac6e8f310a8ebd8b8d0762fb8ab39bd4 100644
--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -34,22 +34,26 @@ KernelContext::GetEigenDevice() const {
 #endif
 
 const std::string& OperatorBase::Input(const std::string& name) const {
+  PADDLE_ENFORCE(in_out_idxs_ != nullptr,
+                 "Input Output Indices could not be nullptr");
   auto it = in_out_idxs_->find(name);
   PADDLE_ENFORCE(it != in_out_idxs_->end(), "no key [%s] in in_out_idxs_",
                  name);
   if (attrs_.count("input_format") == 0) {
-    return inputs_[it->second];
+    return inputs_.at((size_t)it->second);
   } else {
     const auto& input_format = GetAttr<std::vector<int>>("input_format");
     int idx = input_format[it->second];
-    return inputs_.at(idx);
+    return inputs_.at((size_t)idx);
   }
 }
 
 std::vector<std::string> OperatorBase::Inputs(const std::string& name) const {
+  PADDLE_ENFORCE(in_out_idxs_ != nullptr, "IO Idx could not be nullptr");
   auto input_format = GetAttr<std::vector<int>>("input_format");
   auto offset = in_out_idxs_->at(name);
+  PADDLE_ENFORCE(input_format.at((size_t)offset + 1) <= inputs_.size(),
+                 "Input Out Of Range");
 
   return std::vector<std::string>{
       inputs_.begin() + input_format.at(offset),
@@ -57,23 +61,25 @@ std::vector<std::string> OperatorBase::Inputs(const std::string& name) const {
 }
 
 const std::string& OperatorBase::Output(const std::string& name) const {
+  PADDLE_ENFORCE(in_out_idxs_ != nullptr, "InOut Indice could not be nullptr");
   auto it = in_out_idxs_->find(name);
   PADDLE_ENFORCE(it != in_out_idxs_->end(), "no key [%s] in in_out_idxs_",
                  name);
   if (attrs_.count("output_format") == 0) {
-    return outputs_[it->second];
+    return outputs_.at((size_t)it->second);
   } else {
     const auto& output_format = GetAttr<std::vector<int>>("output_format");
     int idx = output_format[it->second];
-    return outputs_.at(idx);
+    return outputs_.at((size_t)idx);
   }
 }
 
 std::vector<std::string> OperatorBase::Outputs(const std::string& name) const {
+  PADDLE_ENFORCE(in_out_idxs_ != nullptr, "InOut Indice could not be nullptr");
   auto output_format = GetAttr<std::vector<int>>("output_format");
   auto offset = in_out_idxs_->at(name);
-
+  PADDLE_ENFORCE(output_format.at((size_t)offset + 1) <= outputs_.size(),
+                 "Output Out of Range");
   return std::vector<std::string>{
       outputs_.begin() + output_format.at(offset),
       outputs_.begin() + output_format.at(offset + 1)};
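`Inputs(name)` treats the `input_format` attribute as an offset table: entries i and i + 1 bracket the slice of `inputs_` owned by named argument i. A worked sketch with hypothetical values:

    #include <string>
    #include <vector>

    // Mirrors OperatorBase::Inputs: argument "X" owns slots [0, 3) of inputs_.
    std::vector<std::string> inputs = {"x0", "x1", "x2", "b"};
    std::vector<int> input_format = {0, 3, 4};  // per-argument offsets
    int offset = 0;  // in_out_idxs_ entry for "X"
    std::vector<std::string> X(inputs.begin() + input_format.at(offset),
                               inputs.begin() + input_format.at(offset + 1));
    // X == {"x0", "x1", "x2"}. The new PADDLE_ENFORCE rejects a format whose
    // end offset exceeds inputs.size() with EnforceNotMet instead of reading
    // past the end of the vector.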
diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h
index 9ba661968cd2bb60c7aa76513363db65b54c8923..8d9ca92565b3d34c5ede7c66cc9eb0d629e8d4e4 100644
--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@@ -90,15 +90,17 @@ class OperatorBase {
   virtual void Run(const std::shared_ptr<Scope>& scope,
                    const platform::DeviceContext& dev_ctx) const = 0;
 
-  // Get a input with argument's name described in `op_proto`
+  virtual bool IsNetOp() const { return false; }
+
+  //! Get an input with argument's name described in `op_proto`
   const std::string& Input(const std::string& name) const;
-  // Get a input which has multiple variables.
-  // TODO add a vector_view to prevent memory copy.
+  //! Get an input which has multiple variables.
+  //! TODO add a vector_view to prevent memory copy.
   std::vector<std::string> Inputs(const std::string& name) const;
-  // Get a output with argument's name described in `op_proto`
+  //! Get an output with argument's name described in `op_proto`
   const std::string& Output(const std::string& name) const;
-  // Get an output which has multiple variables.
-  // TODO add a vector_view to prevent memory copy.
+  //! Get an output which has multiple variables.
+  //! TODO add a vector_view to prevent memory copy.
   std::vector<std::string> Outputs(const std::string& name) const;
 
  public:
@@ -199,7 +201,9 @@ class OperatorWithKernel : public OperatorBase {
       place_ = dev_ctx.GetPlace();
     }
 
-    bool operator==(const OpKernelKey& o) const { return place_ == o.place_; }
+    bool operator==(const OpKernelKey& o) const {
+      return platform::places_are_same_class(place_, o.place_);
+    }
   };
 
   struct OpKernelHash {
diff --git a/paddle/framework/scope.h b/paddle/framework/scope.h
index 79c9ffd1a677346fac7712373681acdbaa8116d6..4faaf841440ba30b79c83d09fea977186bd0270a 100644
--- a/paddle/framework/scope.h
+++ b/paddle/framework/scope.h
@@ -56,7 +56,9 @@ class Scope {
     if (var) {
       return var;
     } else {
-      vars_[name] = std::unique_ptr<Variable>(new Variable());
+      auto ptr = new Variable();
+      name_to_var_[name] = std::unique_ptr<Variable>(ptr);
+      var_to_name_[ptr] = name;
       return GetVariable(name);
     }
   }
@@ -68,8 +70,8 @@ class Scope {
    * from it's parent scope. Return nullptr if not found.
    */
   Variable* GetVariable(const std::string& name) const {
-    auto it = vars_.find(name);
-    if (it != vars_.end()) {
+    auto it = name_to_var_.find(name);
+    if (it != name_to_var_.end()) {
       return it->second.get();
     } else if (parent_ != nullptr) {
       return parent_->GetVariable(name);
@@ -84,12 +86,21 @@ class Scope {
    * Find if there is a Variable in this scope and it's parent scope
    */
   bool HasVariable(const std::string& name) const {
-    return (vars_.find(name) != vars_.end() ||
+    return (name_to_var_.find(name) != name_to_var_.end() ||
            (parent_ && parent_->HasVariable(name)));
   }
 
+  std::string GetVariableName(Variable* const var) const {
+    try {
+      return var_to_name_.at(var);
+    } catch (...) {
+      return "";
+    }
+  }
+
  private:
-  std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;
+  std::unordered_map<Variable*, std::string> var_to_name_;
+  std::unordered_map<std::string, std::unique_ptr<Variable>> name_to_var_;
   std::shared_ptr<Scope> parent_{nullptr};
 };
diff --git a/paddle/framework/scope_test.cc b/paddle/framework/scope_test.cc
index df1afb200ce9e75c5b1e40f2da56667487ae3576..ff069c7be002e9bcfd63225c3d80aa958935ba14 100644
--- a/paddle/framework/scope_test.cc
+++ b/paddle/framework/scope_test.cc
@@ -40,6 +40,11 @@ TEST(Scope, Create) {
   /// already exist.
   Variable* var4 = scope->CreateVariable("a");
   EXPECT_EQ(var4, var2);
+
+  EXPECT_EQ("a", scope->GetVariableName(var4));
+  Scope scope2;
+  auto var = scope2.CreateVariable("tmp");
+  EXPECT_EQ("", scope->GetVariableName(var));
 }
 
 TEST(Scope, Parent) {
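`GetVariableName` is backed by a second map kept in sync at creation time: the forward map owns the Variable, the reverse map records its name. A sketch of the round trip (variable names hypothetical):

    #include <cassert>

    auto scope = std::make_shared<paddle::framework::Scope>();
    paddle::framework::Variable* w = scope->CreateVariable("weights");
    // name_to_var_: name -> owning unique_ptr; var_to_name_: raw ptr -> name.
    assert(scope->GetVariable("weights") == w);
    assert(scope->GetVariableName(w) == "weights");
    // A pointer this scope never created misses var_to_name_, so the
    // catch(...) path returns the empty string, as the test above checks.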
diff --git a/paddle/framework/tensor.cc b/paddle/framework/tensor.cc
index 964f15ab66bca7da75824e192e61600c29e572c0..ea7b2a1f7b17d9abc2c2e14de5ecd1cf4a1a5027 100644
--- a/paddle/framework/tensor.cc
+++ b/paddle/framework/tensor.cc
@@ -12,7 +12,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <paddle/framework/tensor.h>
+#include "paddle/framework/tensor.h"
 
 namespace paddle {
 namespace framework {}
diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h
index a36f375d2e42ee3c46ddef42954335cba7eb88f2..76070f636b0971f4a136042e056c59adb5dc2d40 100644
--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -20,6 +20,7 @@ limitations under the License. */
 #include <typeindex>
 #include "paddle/framework/ddim.h"
 #include "paddle/memory/memory.h"
+#include "paddle/platform/device_context.h"
 #include "paddle/platform/enforce.h"
 #include "paddle/platform/place.h"
 #include "unsupported/Eigen/CXX11/Tensor"
@@ -31,9 +32,11 @@ template <bool less, size_t i, typename... args>
 struct CastToPyBufferImpl;
 }  // namespace details
 }  // namespace pybind
+
 namespace framework {
 
 class Tensor {
+ public:
   template <bool less, size_t i, typename... args>
   friend struct paddle::pybind::details::CastToPyBufferImpl;
@@ -46,143 +49,122 @@ class Tensor {
  public:
   Tensor() : offset_(0) {}
 
+  /*! Return a pointer to mutable memory block. */
   template <typename T>
-  const T* data() const {
-    EnforceSufficientMemory<T>();
-    return reinterpret_cast<const T*>(
-        reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
-  }
+  inline T* data();
 
+  /*! Return a pointer to constant memory block. */
   template <typename T>
-  T* data() {
-    EnforceSufficientMemory<T>();
-    return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
-                                offset_);
-  }
-
-  template <typename T,
-            typename std::enable_if<std::is_pod<T>::value>::type* = nullptr>
-  T* mutable_data(DDim dims, platform::Place place) {
-    Resize(dims);
-    return mutable_data<T>(place);
-  }
-
-  template <typename T,
-            typename std::enable_if<std::is_pod<T>::value>::type* = nullptr>
-  T* mutable_data(platform::Place place) {
-    PADDLE_ENFORCE(product(dims_) > 0,
-                   "Tensor's numel must be larger than zero to call "
-                   "Tensor::mutable_data. Call Tensor::set_dim first.");
-    if (holder_ == nullptr ||
-        !(holder_->place() ==
-          place) /* some versions of boost::variant don't have operator!= */
-        || holder_->size() < product(dims_) * sizeof(T) + offset_) {
-      if (platform::is_cpu_place(place)) {
-        holder_.reset(new PlaceholderImpl<T, platform::CPUPlace>(
-            boost::get<platform::CPUPlace>(place), product(dims_) * sizeof(T)));
-      } else if (platform::is_gpu_place(place)) {
-#ifdef PADDLE_ONLY_CPU
-        PADDLE_THROW("'GPUPlace' is not supported in CPU only device.");
-#else
-        holder_.reset(new PlaceholderImpl<T, platform::GPUPlace>(
-            boost::get<platform::GPUPlace>(place), product(dims_) * sizeof(T)));
-#endif
-      } else {
-        PADDLE_THROW("Unknown 'place'.");
-      }
-      offset_ = 0;
-    }
-    return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
-                                offset_);
-  }
+  inline const T* data() const;
 
+  /**
+   * @brief   Return a pointer to mutable memory block.
+   * @note    If not exist, then allocation.
+   */
   template <typename T>
-  void ShareDataWith(const Tensor& src) {
-    src.EnforceSufficientMemory<T>();
-    *this = src;
-  }
+  inline T* mutable_data(platform::Place place);
+
+  /**
+   * @brief     Return a pointer to mutable memory block.
+   *
+   * @param[in] dims    The dimensions of the memory block.
+   * @param[in] place   The place of the memory block.
+   *
+   * @note      If not exist, then allocation.
+   */
+  template <typename T>
+  inline T* mutable_data(DDim dims, platform::Place place);
+
+  /*! Return the dimensions of the memory block. */
+  inline const DDim& dims() const;
+
+  /*! Resize the dimensions of the memory block. */
+  inline void Resize(const DDim& dims);
+
+  /*! The internal of two tensors share the same memory block. */
+  template <typename T>
+  inline void ShareDataWith(const Tensor& src);
+
+  /**
+   * @brief   Copy the content of external tensor to a new place.
+   *
+   * @param[in] src   The external tensor.
+   * @param[in] ctx   The device context contains place where to store.
+   *
+   * @note    CopyFrom supports CPU <-> GPU, GPU <-> GPU.
+ */ + template + inline void CopyFrom(const Tensor& src, const platform::Place& dst_place); + + /** + * @brief Return the slice of the tensor. + * + * @param[in] begin_idx The begin index of the slice. + * @param[in] end_idx The end index of the slice. + */ template - void CopyFrom(const Tensor& src, platform::Place dst_place) { - PADDLE_ENFORCE(platform::is_cpu_place(src.holder_->place()) && - platform::is_cpu_place(dst_place), - "Tensor::CopyFrom only support CPU now."); - src.EnforceSufficientMemory(); - size_t size = product(src.dims_) * sizeof(T); - Resize(src.dims()); - const void* src_ptr = static_cast(src.data()); - void* dst_ptr = static_cast(mutable_data(dst_place)); - memcpy(dst_ptr, src_ptr, size); - } + inline Tensor Slice(const int& begin_idx, const int& end_idx) const; + private: template - Tensor Slice(const int& begin_idx, const int& end_idx) const { - EnforceSufficientMemory(); - PADDLE_ENFORCE(begin_idx >= 0, "Slice begin index is less than zero."); - PADDLE_ENFORCE(end_idx <= dims_[0], "Slice end index is out of bound."); - PADDLE_ENFORCE(begin_idx < end_idx, - "Begin index must be less than end index."); - PADDLE_ENFORCE(dims_[0] != 1, "Can not slice a tensor with dims_[0] = 1."); - int base = product(dims_) / dims_[0]; - Tensor dst; - dst.holder_ = holder_; - DDim dst_dims = dims_; - dst_dims[0] = end_idx - begin_idx; - dst.Resize(dst_dims); - dst.offset_ = offset_ + begin_idx * base * sizeof(T); - return dst; - } - - void Resize(const DDim& dims) { dims_ = dims; } - - const DDim& dims() const { return dims_; } + inline void check_memory_size() const; private: - // Placeholder hides type T, so it doesn't appear as a template - // parameter of Variable. + /** + * @note Placeholder hides type T, so it doesn't appear as a template + * parameter of Variable. + */ struct Placeholder { virtual ~Placeholder() {} virtual void* ptr() const = 0; - virtual platform::Place place() const = 0; virtual size_t size() const = 0; virtual std::type_index type() const = 0; + virtual platform::Place place() const = 0; }; - template + template struct PlaceholderImpl : public Placeholder { - PlaceholderImpl(PlaceType place, size_t size) + PlaceholderImpl(Place place, size_t size) : ptr_(static_cast(memory::Alloc(place, size)), - memory::PODDeleter(place)), + memory::PODDeleter(place)), place_(place), - size_(size) {} + size_(size) { + PADDLE_ENFORCE(ptr_ != nullptr, "Insufficient %s memory to allocation.", + is_cpu_place(place_) ? "CPU" : "GPU"); + } - virtual void* ptr() const { return static_cast(ptr_.get()); } virtual size_t size() const { return size_; } - virtual paddle::platform::Place place() const { return place_; } + virtual platform::Place place() const { return place_; } + virtual void* ptr() const { return static_cast(ptr_.get()); } virtual std::type_index type() const { return std::type_index(typeid(T)); } - std::unique_ptr> ptr_; - platform::Place place_; // record the place of ptr_. - size_t size_; // size of the memory block. + /*! the pointer of memory block. */ + std::unique_ptr> ptr_; + + /*! the place of memory block. */ + platform::Place place_; + + /*! the size of memory block. */ + size_t size_; }; - template - inline void EnforceSufficientMemory() const { - PADDLE_ENFORCE(holder_ != nullptr, - "Tenosr holds no memory. Call Tensor::mutable_data first."); - PADDLE_ENFORCE(holder_->size() >= product(dims_) * sizeof(T) + offset_, - "Tensor's dims_ is out of bound. 
Call Tensor::mutable_data " - "first to re-allocate memory."); - } - - std::shared_ptr holder_; // holds the memory block if allocated. + /*! holds the memory block if allocated. */ + std::shared_ptr holder_; + + /*! points to dimensions of memory block. */ DDim dims_; - // A PlaceHolder may be shared by more than one tensor. Some of them may be - // slices of the others. So the offset_ is introduced here to indicate the - // byte offset between PlaceHolder::ptr_ and where tensor's data really - // begins. + + /** + * @brief A PlaceHolder may be shared by more than one tensor. + * + * @note Some of them may be slices of the others. So the offset_ + * is introduced here to indicate the byte offset between + * PlaceHolder::ptr_ and where the tensor data really begins. + */ size_t offset_; }; } // namespace framework } // namespace paddle + +#include "paddle/framework/detail/tensor-inl.h" diff --git a/paddle/framework/tensor_test.cc b/paddle/framework/tensor_test.cc index 089844dc0164dae8067846a8e6846d47fb1b0833..ef1cc10b840896d9ab97f963fc12a4971cd74e1f 100644 --- a/paddle/framework/tensor_test.cc +++ b/paddle/framework/tensor_test.cc @@ -33,7 +33,7 @@ TEST(Tensor, DataAssert) { bool caught = false; try { src_tensor.data(); - } catch (std::runtime_error& err) { + } catch (paddle::platform::EnforceNotMet err) { caught = true; std::string msg = "Tenosr holds no memory. Call Tensor::mutable_data first."; @@ -72,7 +72,8 @@ TEST(Tensor, MutableData) { p2 = src_tensor.mutable_data(make_ddim({2, 2}), CPUPlace()); EXPECT_EQ(p1, p2); } -#ifdef __CUDACC__ + +#ifndef PADDLE_ONLY_CPU { Tensor src_tensor; float* p1 = nullptr; @@ -107,7 +108,7 @@ TEST(Tensor, ShareDataWith) { bool caught = false; try { dst_tensor.ShareDataWith(src_tensor); - } catch (std::runtime_error& err) { + } catch (paddle::platform::EnforceNotMet err) { caught = true; std::string msg = "Tenosr holds no memory. 
Call Tensor::mutable_data first."; @@ -123,7 +124,7 @@ TEST(Tensor, ShareDataWith) { ASSERT_EQ(src_tensor.data(), dst_tensor.data()); } -#ifdef __CUDACC__ +#ifndef PADDLE_ONLY_CPU { Tensor src_tensor; Tensor dst_tensor; @@ -160,7 +161,7 @@ TEST(Tensor, Slice) { EXPECT_EQ(src_data_address + 3 * 4 * 1 * sizeof(int), slice_data_address); } -#ifdef __CUDACC__ +#ifndef PADDLE_ONLY_CPU { Tensor src_tensor; src_tensor.mutable_data(make_ddim({6, 9}), GPUPlace()); @@ -188,25 +189,74 @@ TEST(Tensor, Slice) { TEST(Tensor, CopyFrom) { using namespace paddle::framework; using namespace paddle::platform; + { + Tensor src_tensor; + Tensor dst_tensor; + + int* src_ptr = src_tensor.mutable_data(make_ddim({3, 3}), CPUPlace()); + + int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9}; + memcpy(src_ptr, arr, 9 * sizeof(int)); - Tensor src_tensor; - int* src_ptr = src_tensor.mutable_data(make_ddim({3, 3}), CPUPlace()); - int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9}; - memcpy(src_ptr, arr, 9 * sizeof(int)); - Tensor dst_tensor; - dst_tensor.CopyFrom(src_tensor, CPUPlace()); - const int* dst_ptr = dst_tensor.data(); - ASSERT_NE(src_ptr, dst_ptr); - for (size_t i = 0; i < 9; ++i) { - EXPECT_EQ(src_ptr[i], dst_ptr[i]); + auto cpu_place = new paddle::platform::CPUPlace(); + dst_tensor.CopyFrom(src_tensor, *cpu_place); + + const int* dst_ptr = dst_tensor.data(); + ASSERT_NE(src_ptr, dst_ptr); + for (size_t i = 0; i < 9; ++i) { + EXPECT_EQ(src_ptr[i], dst_ptr[i]); + } + + Tensor slice_tensor = src_tensor.Slice(1, 2); + dst_tensor.CopyFrom(slice_tensor, *cpu_place); + const int* slice_ptr = slice_tensor.data(); + dst_ptr = dst_tensor.data(); + ASSERT_NE(dst_ptr, slice_ptr); + for (size_t i = 0; i < 3; ++i) { + EXPECT_EQ(dst_ptr[i], slice_ptr[i]); + } } +#ifndef PADDLE_ONLY_CPU + { + Tensor src_tensor; + Tensor gpu_tensor; + Tensor dst_tensor; + + int* src_ptr = src_tensor.mutable_data(make_ddim({3, 3}), CPUPlace()); + + int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9}; + memcpy(src_ptr, arr, 9 * sizeof(int)); + + // CPU Tensor to GPU Tensor + auto gpu_place = new paddle::platform::GPUPlace(0); + gpu_tensor.CopyFrom(src_tensor, *gpu_place); + + // GPU Tensor to CPU Tensor + auto cpu_place = new paddle::platform::CPUPlace(); + dst_tensor.CopyFrom(gpu_tensor, *cpu_place); + + // Compare Tensors + const int* dst_ptr = dst_tensor.data(); + ASSERT_NE(src_ptr, dst_ptr); + for (size_t i = 0; i < 9; ++i) { + EXPECT_EQ(src_ptr[i], dst_ptr[i]); + } + + Tensor slice_tensor = src_tensor.Slice(1, 2); + + // CPU Slice Tensor to GPU Tensor + gpu_tensor.CopyFrom(slice_tensor, *gpu_place); - Tensor slice_tensor = src_tensor.Slice(1, 2); - dst_tensor.CopyFrom(slice_tensor, CPUPlace()); - const int* slice_ptr = slice_tensor.data(); - dst_ptr = dst_tensor.data(); - ASSERT_NE(dst_ptr, slice_ptr); - for (size_t i = 0; i < 3; ++i) { - EXPECT_EQ(dst_ptr[i], slice_ptr[i]); + // GPU Tensor to CPU Tensor + dst_tensor.CopyFrom(gpu_tensor, *cpu_place); + + // Compare Slice Tensors + const int* slice_ptr = slice_tensor.data(); + dst_ptr = dst_tensor.data(); + ASSERT_NE(dst_ptr, slice_ptr); + for (size_t i = 0; i < 3; ++i) { + EXPECT_EQ(dst_ptr[i], slice_ptr[i]); + } } +#endif } diff --git a/paddle/function/BlockExpandOp.cpp b/paddle/function/BlockExpandOp.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a89b6bba45843d81264819cad6ba053f28314f6b --- /dev/null +++ b/paddle/function/BlockExpandOp.cpp @@ -0,0 +1,202 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Function.h" +#include "Im2Col.h" + +namespace paddle { + +/* + * \brief Converts the image data of four dimensions(NCHW) into + * a sequence data of three dimensions(NST) in the forward calculation, + * which is reversed in the backward calculation. + * Where N is batch size, S is the length of the sequence after each + * image is expanded, T is the size of each time step in the sequence. + * + * Arguments in forward function: + * \param inputs[0] Image data of NCHW format. + * \param outputs[0] Sequence data of NST format. + * + * Arguments in backward function: + * \param inputs[0] Sequence data of NST format. + * \param outputs[0] Image data of NCHW format. + */ +class BlockExpandFunction : public FunctionBase { +public: + void init(const FuncConfig& config) override { + // function arguments + strides_ = config.get>("strides"); + paddings_ = config.get>("paddings"); + blocks_ = config.get>("blocks"); + + // number of inputs and outputs + numInputs_ = 1; + numOutputs_ = 1; + } + + void checkShape(const TensorShape& image, const TensorShape& sequence) const { + // image shape should be 4-dimensional. + CHECK_EQ(image.ndims(), (size_t)4); + // sequence shape should be 3-dimensional. + CHECK_EQ(sequence.ndims(), (size_t)3); + // The batchSize of the image needs to be equal to + // the batchSize of the sequence. + CHECK_EQ(image[0], sequence[0]); + } + + // Calculate the shape of colData based on the shape of the image + // and the shape of the sequence. 
+ TensorShape getColShape(const TensorShape& image, + const TensorShape& sequence) const { + size_t inputChannels = image[1]; + size_t inputHeight = image[2]; + size_t inputWidth = image[3]; + size_t seqLength = sequence[1]; + size_t stepSize = sequence[2]; + size_t outputHeight = + 1 + + (inputHeight + 2 * paddingH() - blockH() + strideH() - 1) / strideH(); + size_t outputWidth = + 1 + + (inputWidth + 2 * paddingW() - blockW() + strideW() - 1) / strideW(); + CHECK_EQ(seqLength, outputHeight * outputWidth); + CHECK_EQ(stepSize, inputChannels * blockH() * blockW()); + + // [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth] + return TensorShape({outputHeight, + outputWidth, + inputChannels, + (size_t)blockH(), + (size_t)blockW()}); + } + +protected: + std::vector strides_; + std::vector paddings_; + std::vector blocks_; + + inline int strideH() const { return strides_[0]; } + + inline int strideW() const { return strides_[1]; } + + inline int paddingH() const { return paddings_[0]; } + + inline int paddingW() const { return paddings_[1]; } + + inline int blockH() const { return blocks_[0]; } + + inline int blockW() const { return blocks_[1]; } +}; + +template +class BlockExpandForward : public BlockExpandFunction { +public: + void init(const FuncConfig& config) override { + BlockExpandFunction::init(config); + } + + void check(const BufferArgs& inputs, const BufferArgs& outputs) override { + const TensorShape& image = inputs[0].shape(); + const TensorShape& sequence = outputs[0].shape(); + checkShape(image, sequence); + } + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ(numInputs_, inputs.size()); + CHECK_EQ(numOutputs_, outputs.size()); + check(inputs, outputs); + CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO); + const TensorShape& image = inputs[0].shape(); + const TensorShape& sequence = outputs[0].shape(); + + TensorShape imShape = TensorShape({image[1], image[2], image[3]}); + TensorShape colShape = getColShape(image, sequence); + size_t batchSize = image[0]; + + real* imageData = inputs[0].data(); + real* seqData = outputs[0].data(); + Im2ColFunctor im2col; + for (size_t i = 0; i < batchSize; i++) { + // The result of im2col is [outputHeight, outputWidth, + // inputChannels, filterHeight, filterWidth], and it is easy to + // reshape into [seqLength, stepSize], where seqLength is equal + // output_height * output_width, stepSize is equal + // input_channels * filter_height * filter_width + im2col(imageData, + imShape, + seqData, + colShape, + strideH(), + strideW(), + paddingH(), + paddingW()); + imageData += imShape.getElements(); + seqData += colShape.getElements(); + } + } +}; + +template +class BlockExpandBackward : public BlockExpandFunction { +public: + void init(const FuncConfig& config) override { + BlockExpandFunction::init(config); + } + + void check(const BufferArgs& inputs, const BufferArgs& outputs) override { + const TensorShape& image = outputs[0].shape(); + const TensorShape& sequence = inputs[0].shape(); + checkShape(image, sequence); + } + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ(numInputs_, inputs.size()); + CHECK_EQ(numOutputs_, outputs.size()); + check(inputs, outputs); + // Since the implementation of Col2ImFunctor is ADD_TO, + // this function only supports ADD_TO mode. 
+ CHECK_EQ(outputs[0].getArgType(), ADD_TO); + const TensorShape& image = outputs[0].shape(); + const TensorShape& sequence = inputs[0].shape(); + + TensorShape imShape = TensorShape({image[1], image[2], image[3]}); + TensorShape colShape = getColShape(image, sequence); + size_t batchSize = image[0]; + + real* imageData = outputs[0].data(); + real* seqData = inputs[0].data(); + Col2ImFunctor col2im; + for (size_t i = 0; i < batchSize; i++) { + col2im(imageData, + imShape, + seqData, + colShape, + strideH(), + strideW(), + paddingH(), + paddingW()); + imageData += imShape.getElements(); + seqData += colShape.getElements(); + } + } +}; + +REGISTER_TYPED_FUNC(BlockExpand, CPU, BlockExpandForward); +REGISTER_TYPED_FUNC(BlockExpandGrad, CPU, BlockExpandBackward); +#ifndef PADDLE_ONLY_CPU +REGISTER_TYPED_FUNC(BlockExpand, GPU, BlockExpandForward); +REGISTER_TYPED_FUNC(BlockExpandGrad, GPU, BlockExpandBackward); +#endif + +} // namespace paddle diff --git a/paddle/function/BlockExpandOpTest.cpp b/paddle/function/BlockExpandOpTest.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5e4897e72ba9fab2dd9e25d90313dc1b4d38e2d4 --- /dev/null +++ b/paddle/function/BlockExpandOpTest.cpp @@ -0,0 +1,107 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include "FunctionTest.h" + +namespace paddle { + +TEST(BlockExpandForward, real) { + for (size_t batchSize : {5, 32}) { + for (size_t channels : {1, 5, 32}) { + for (size_t inputHeight : {5, 33, 100}) { + for (size_t inputWidth : {5, 32, 96}) { + for (size_t block : {1, 3, 5}) { + for (size_t stride : {1, 2}) { + for (size_t padding : {0, 1}) { + // init Test object + std::vector strides = {stride, stride}; + std::vector paddings = {padding, padding}; + std::vector blocks = {block, block}; + CpuGpuFuncCompare test("BlockExpand", + FuncConfig() + .set("strides", strides) + .set("paddings", paddings) + .set("blocks", blocks)); + + size_t outputHeight = + 1 + + (inputHeight + 2 * padding - block + stride - 1) / stride; + size_t outputWidth = + 1 + + (inputWidth + 2 * padding - block + stride - 1) / stride; + TensorShape inputShape = + TensorShape({batchSize, channels, inputHeight, inputWidth}); + TensorShape outputShape = + TensorShape({batchSize, + outputHeight * outputWidth, + channels * block * block}); + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, inputShape)); + test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, outputShape)); + // run Function + test.run(); + } + } + } + } + } + } + } +} + +TEST(BlockExpandBackward, real) { + for (size_t batchSize : {5, 32}) { + for (size_t channels : {1, 5, 32}) { + for (size_t inputHeight : {5, 33, 100}) { + for (size_t inputWidth : {5, 32, 96}) { + for (size_t block : {1, 3, 5}) { + for (size_t stride : {1, 2}) { + for (size_t padding : {0, 1}) { + // init Test object + std::vector strides = {stride, stride}; + std::vector paddings = {padding, padding}; + std::vector blocks = {block, block}; + CpuGpuFuncCompare test("BlockExpandGrad", + FuncConfig() + .set("strides", strides) + .set("paddings", paddings) + .set("blocks", blocks)); + + size_t outputHeight = + 1 + + (inputHeight + 2 * padding - block + stride - 1) / stride; + size_t outputWidth = + 1 + + (inputWidth + 2 * padding - block + stride - 1) / stride; + TensorShape inputShape = + TensorShape({batchSize, channels, inputHeight, inputWidth}); + TensorShape outputShape = + TensorShape({batchSize, + outputHeight * outputWidth, + channels * block * block}); + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, outputShape)); + test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, inputShape), + ADD_TO); + // run Function + test.run(); + } + } + } + } + } + } + } +} + +} // namespace paddle diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt index 2bec00cdb2d32d01a5a24e662bcca07f4154939c..93304f73037690b5cf3ac8189aabc28f51316a77 100644 --- a/paddle/function/CMakeLists.txt +++ b/paddle/function/CMakeLists.txt @@ -36,10 +36,12 @@ if(WITH_GPU) add_simple_unittest(MulOpTest) add_simple_unittest(CosSimOpTest) add_simple_unittest(RowConvOpTest) + add_simple_unittest(BlockExpandOpTest) add_simple_unittest(CropOpTest) endif() add_simple_unittest(ConvOpTest) +add_simple_unittest(Im2ColTest) endif() add_style_check_target(paddle_function ${h_files}) diff --git a/paddle/function/GemmConvOp.cpp b/paddle/function/GemmConvOp.cpp index 00880effc59cc80b2761fb6a4d9f3246439afd3f..9deb2739fcfff935a98a0b5b31b5d11819d81227 100644 --- a/paddle/function/GemmConvOp.cpp +++ b/paddle/function/GemmConvOp.cpp @@ -12,101 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "GemmConvOp.h" +#include "ConvOp.h" #include "GemmFunctor.h" +#include "Im2Col.h" #include "paddle/math/MemoryHandle.h" namespace paddle { -/* - * imData = [input_channels, input_height, input_width] - * colData = [input_channels, filter_height, filter_width, - * output_height, output_width] - */ -template -class Im2ColFunctor { -public: - void operator()(const T* imData, - int inputChannels, - int inputHeight, - int inputWidth, - int filterHeight, - int filterWidth, - int strideHeight, - int strideWidth, - int paddingHeight, - int paddingWidth, - int outputHeight, - int outputWidth, - T* colData) { - int channelsCol = inputChannels * filterHeight * filterWidth; - - for (int c = 0; c < channelsCol; ++c) { - int wOffset = c % filterWidth; - int hOffset = (c / filterWidth) % filterHeight; - int c_im = c / filterWidth / filterHeight; - for (int h = 0; h < outputHeight; ++h) { - for (int w = 0; w < outputWidth; ++w) { - int imRowIdx = h * strideHeight + hOffset; - int imColIdx = w * strideWidth + wOffset; - if ((imRowIdx - paddingHeight) < 0 || - (imRowIdx - paddingHeight) >= inputHeight || - (imColIdx - paddingWidth) < 0 || - (imColIdx - paddingWidth) >= inputWidth) { - colData[(c * outputHeight + h) * outputWidth + w] = T(0); - } else { - imRowIdx += c_im * inputHeight - paddingHeight; - imColIdx -= paddingWidth; - colData[(c * outputHeight + h) * outputWidth + w] = - imData[imRowIdx * inputWidth + imColIdx]; - } - } - } - } - } -}; - -template -class Col2ImFunctor { -public: - void operator()(const T* colData, - int inputChannels, - int inputHeight, - int inputWidth, - int filterHeight, - int filterWidth, - int strideHeight, - int strideWidth, - int paddingHeight, - int paddingWidth, - int outputHeight, - int outputWidth, - T* imData) { - int channelsCol = inputChannels * filterHeight * filterWidth; - - for (int c = 0; c < channelsCol; ++c) { - int wOffset = c % filterWidth; - int hOffset = (c / filterWidth) % filterHeight; - int c_im = c / filterWidth / filterHeight; - for (int h = 0; h < outputHeight; ++h) { - for (int w = 0; w < outputWidth; ++w) { - int imRowIdx = h * strideHeight + hOffset; - int imColIdx = w * strideWidth + wOffset; - if ((imRowIdx - paddingHeight) >= 0 && - (imRowIdx - paddingHeight) < inputHeight && - (imColIdx - paddingWidth) >= 0 && - (imColIdx - paddingWidth) < inputWidth) { - imRowIdx += c_im * inputHeight - paddingHeight; - imColIdx -= paddingWidth; - imData[imRowIdx * inputWidth + imColIdx] += - colData[(c * outputHeight + h) * outputWidth + w]; - } - } - } - } - } -}; - /* * \brief Forward calculation of convolution. 
*/ @@ -154,15 +66,20 @@ public: real* inputData = inputs[0].data(); real* filterData = inputs[1].data(); real* outputData = outputs[0].data(); - - size_t size = inputChannels / groups_ * filterHeight * filterWidth * - outputHeight * outputWidth; - resizeBuffer(size); + TensorShape imShape = + TensorShape({inputChannels / groups_, inputHeight, inputWidth}); + TensorShape colShape = TensorShape({inputChannels / groups_, + filterHeight, + filterWidth, + outputHeight, + outputWidth}); + + resizeBuffer(colShape.getElements()); real* colData = reinterpret_cast(memory_->getBuf()); - Im2ColFunctor im2col; + Im2ColFunctor im2col; GemmFunctor gemm; - size_t inputOffset = (inputChannels / groups_) * inputHeight * inputWidth; + size_t inputOffset = imShape.getElements(); size_t outputOffset = (outputChannels / groups_) * outputHeight * outputWidth; size_t filterOffset = filter.getElements() / groups_; @@ -170,18 +87,13 @@ public: for (size_t i = 0; i < batchSize; i++) { for (size_t g = 0; g < groups_; g++) { im2col(inputData + g * inputOffset, - inputChannels / groups_, - inputHeight, - inputWidth, - filterHeight, - filterWidth, + imShape, + colData, + colShape, strideH(), strideW(), paddingH(), - paddingW(), - outputHeight, - outputWidth, - colData); + paddingW()); int M = outputChannels / groups_; int N = outputHeight * outputWidth; @@ -247,15 +159,20 @@ public: real* outputGrad = inputs[0].data(); real* filterData = inputs[1].data(); real* inputGrad = outputs[0].data(); - - size_t size = inputChannels / groups_ * filterHeight * filterWidth * - outputHeight * outputWidth; - resizeBuffer(size); + TensorShape imShape = + TensorShape({inputChannels / groups_, inputHeight, inputWidth}); + TensorShape colShape = TensorShape({inputChannels / groups_, + filterHeight, + filterWidth, + outputHeight, + outputWidth}); + + resizeBuffer(colShape.getElements()); real* colData = reinterpret_cast(memory_->getBuf()); - Col2ImFunctor col2im; + Col2ImFunctor col2im; GemmFunctor gemm; - size_t inputOffset = (inputChannels / groups_) * inputHeight * inputWidth; + size_t inputOffset = imShape.getElements(); size_t outputOffset = (outputChannels / groups_) * outputHeight * outputWidth; size_t filterOffset = filter.getElements() / groups_; @@ -278,20 +195,14 @@ public: 0.0f, colData, N); - - col2im(colData, - inputChannels / groups_, - inputHeight, - inputWidth, - filterHeight, - filterWidth, + col2im(inputGrad + g * inputOffset, + imShape, + colData, + colShape, strideH(), strideW(), paddingH(), - paddingW(), - outputHeight, - outputWidth, - inputGrad + g * inputOffset); + paddingW()); } inputGrad += inputChannels * inputHeight * inputWidth; outputGrad += outputChannels * outputHeight * outputWidth; @@ -344,33 +255,33 @@ public: real* outputGrad = inputs[0].data(); real* inputData = inputs[1].data(); real* filterGrad = outputs[0].data(); - - size_t size = inputChannels / groups_ * filterHeight * filterWidth * - outputHeight * outputWidth; - resizeBuffer(size); + TensorShape imShape = + TensorShape({inputChannels / groups_, inputHeight, inputWidth}); + TensorShape colShape = TensorShape({inputChannels / groups_, + filterHeight, + filterWidth, + outputHeight, + outputWidth}); + + resizeBuffer(colShape.getElements()); real* colData = reinterpret_cast(memory_->getBuf()); - Im2ColFunctor im2col; + Im2ColFunctor im2col; GemmFunctor gemm; - size_t inputOffset = (inputChannels / groups_) * inputHeight * inputWidth; + size_t inputOffset = imShape.getElements(); size_t outputOffset = (outputChannels / groups_) * outputHeight * 
outputWidth; size_t filterOffset = filter.getElements() / groups_; for (size_t i = 0; i < batchSize; i++) { for (size_t g = 0; g < groups_; g++) { im2col(inputData + g * inputOffset, - inputChannels / groups_, - inputHeight, - inputWidth, - filterHeight, - filterWidth, + imShape, + colData, + colShape, strideH(), strideW(), paddingH(), - paddingW(), - outputHeight, - outputWidth, - colData); + paddingW()); int M = outputChannels / groups_; int K = outputHeight * outputWidth; diff --git a/paddle/function/GemmConvOp.h b/paddle/function/GemmConvOp.h deleted file mode 100644 index 9f11cce597a07ce2a54f518be30b657c26ab7516..0000000000000000000000000000000000000000 --- a/paddle/function/GemmConvOp.h +++ /dev/null @@ -1,62 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "ConvOp.h" - -namespace paddle { - -/* - * imData = [input_channels, input_height, input_width] - * colData = [input_channels, filter_height, filter_width, - * output_height, output_width] - */ -template -class Im2ColFunctor { -public: - void operator()(const T* imData, - int inputChannels, - int inputHeight, - int inputWidth, - int filterHeight, - int filterWidth, - int strideHeight, - int strideWidth, - int paddingHeight, - int paddingWidth, - int outputHeight, - int outputWidth, - T* colData); -}; - -template -class Col2ImFunctor { -public: - void operator()(const T* colData, - int inputChannels, - int inputHeight, - int inputWidth, - int filterHeight, - int filterWidth, - int strideHeight, - int strideWidth, - int paddingHeight, - int paddingWidth, - int outputHeight, - int outputWidth, - T* imData); -}; - -} // namespace paddle diff --git a/paddle/function/GemmConvOpGpu.cu b/paddle/function/GemmConvOpGpu.cu deleted file mode 100644 index 2a1795ff0fb5643ea436c94fe893fe866056fccb..0000000000000000000000000000000000000000 --- a/paddle/function/GemmConvOpGpu.cu +++ /dev/null @@ -1,186 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "ConvOp.h" -#include "GemmConvOp.h" - -namespace paddle { - -template -__global__ -void im2col(const T* data_im, int numOuts, int height, int width, - int blockH, int blockW, - int strideH, int strideW, - int paddingH, int paddingW, - int height_col, int width_col, - T* data_col) { - int index = - (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; - if (index < numOuts) { - int w_out = index % width_col; - index /= width_col; - int h_out = index % height_col; - int channel_in = index / height_col; - int channel_out = channel_in * blockH * blockW; - int h_in = h_out * strideH; - int w_in = w_out * strideW; - - data_col += (channel_out * height_col + h_out) * width_col + w_out; - for (int i = 0; i < blockH; ++i) { - for (int j = 0; j < blockW; ++j) { - int rIdx = int(h_in+i); - int cIdx = int(w_in+j); - if ((rIdx-(int)paddingH) >= (int)height || - (rIdx-(int)paddingH) < 0 || - (cIdx-(int)paddingW) >= (int)width || - (cIdx-(int)paddingW) < 0) { - *data_col = 0; - } else { - rIdx = rIdx + channel_in*height - paddingH; - cIdx = cIdx - paddingW; - *data_col = data_im[rIdx* width + cIdx]; - } - data_col += height_col * width_col; - } - } - } -} - -template -class Im2ColFunctor { -public: - void operator()(const T* imData, - int inputChannels, - int inputHeight, - int inputWidth, - int filterHeight, - int filterWidth, - int strideHeight, - int strideWidth, - int paddingHeight, - int paddingWidth, - int outputHeight, - int outputWidth, - T* colData) { - int numKernels = inputChannels * outputHeight * outputWidth; - int blocks = (numKernels + 1024 -1) / 1024; - int blockX = 512; - int blockY = (blocks + 512 - 1) / 512; - dim3 threads(1024, 1); - dim3 grid(blockX, blockY); - im2col<<< grid, threads, 0, STREAM_DEFAULT >>> - (imData, numKernels, inputHeight, inputWidth, filterHeight, filterWidth, - strideHeight, strideWidth, paddingHeight, paddingWidth, - outputHeight, outputWidth, colData); - CHECK_SYNC("Im2ColFunctor GPU failed"); - } -}; - -template -__global__ -void col2im(size_t n, const T* data_col, size_t height, - size_t width, size_t channels, - size_t blockH, size_t blockW, - size_t strideH, size_t strideW, - size_t paddingH, size_t paddingW, - size_t height_col, size_t width_col, - T* data_im) { - size_t index = - (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; - if (index < n) { - T val = 0; - int w = int(index % width); - int h = int((index / width) % height); - int c = int(index / (width * height)); - if ((w - (int)paddingW) >= 0 && - (w - (int)paddingW) < (width-2 * paddingW) && - (h - (int)paddingH) >= 0 && - (h - paddingH) < (height - 2 * paddingH)) { - // compute the start and end of the output - int w_col_start = - (w < (int)blockW) ? 0 : (w - int(blockW)) / (int)strideW + 1; - int w_col_end = - min((int)(w / (int)strideW + 1), (int)(width_col)); - int h_col_start = - (h < (int)blockH) ? 
0 : (h - (int)blockH) / (int)strideH + 1; - int h_col_end = min(int(h / strideH + 1), int(height_col)); - for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { - for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { - // the col location: [c * width * height + h_out, w_out] - int c_col = int(c * blockH* blockW) + \ - (h - h_col * (int)strideH) * (int)blockW + - (w - w_col * (int)strideW); - val += data_col[(c_col * height_col + h_col) * width_col + w_col]; - } - } - h -= paddingH; - w -= paddingW; - data_im[c*((width-2*paddingW) * (height-2*paddingH)) + - h*(width-2*paddingW) + w] += val; - } - } -} - -template -class Col2ImFunctor { -public: - void operator()(const T* colData, - int inputChannels, - int inputHeight, - int inputWidth, - int filterHeight, - int filterWidth, - int strideHeight, - int strideWidth, - int paddingHeight, - int paddingWidth, - int outputHeight, - int outputWidth, - T* imData) { - size_t numKernels = inputChannels * (inputHeight + 2*paddingHeight) - * (inputWidth + 2*paddingWidth); - - size_t blocks = (numKernels + 1024 -1) / 1024; - size_t blockX = 512; - size_t blockY = (blocks+512-1)/512; - dim3 threads(1024, 1); - dim3 grid(blockX, blockY); - - // To avoid involving atomic operations, we will launch one kernel per - // bottom dimension, and then in the kernel add up the top dimensions. - col2im<<< grid, threads, 0, STREAM_DEFAULT >>> - (numKernels, - colData, - inputHeight + 2*paddingHeight, - inputWidth + 2*paddingWidth, - inputChannels, - filterHeight, - filterWidth, - strideHeight, - strideWidth, - paddingHeight, - paddingWidth, - outputHeight, - outputWidth, - imData); - CHECK_SYNC("Col2ImFunctor GPU failed"); - } -}; - -template class Im2ColFunctor; -template class Im2ColFunctor; -template class Col2ImFunctor; -template class Col2ImFunctor; - -} // namespace paddle diff --git a/paddle/function/Im2Col.h b/paddle/function/Im2Col.h new file mode 100644 index 0000000000000000000000000000000000000000..48e2e32f9256fb49c67ba25e9b5a47d72499758b --- /dev/null +++ b/paddle/function/Im2Col.h @@ -0,0 +1,96 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "TensorShape.h" +#include "TensorType.h" + +namespace paddle { + +/* The storage format of the coldata in the Im2ColFunctor and Col2ImFunctor. */ +enum ColFormat { kCFO = 0, kOCF = 1 }; + +/* + * \brief Converts the image data of three dimensions(CHW) into a colData of + * five dimensions in the Im2ColFunctor calculation, + * And in the Col2ImFunctor calculation, it is reversed. + * + * \param imData Image data. + * \param imShape The shape of imData, + * [inputChannels, inputHeight, inputWidth]. + * \param colData Column data. + * \param colShape The shape of colData. + * + * If the template argument Format is kCFO, the shape of colData is: + * [inputChannels, filterHeight, filterWidth, outputHeight, outputWidth] + * So, it is easy to reshape into a convolution matrix for convolution + * calculation based on matrix multiplication. 
+ * The shape of convolution matrix is [height, width], where the height is equal + * inputChannels * filterHeight * filterWidth, and the width is equal + * outputHeight * outputWidth. + * + * Reshape: + * shape of colData shape of convolution matrix + * [inputChannels, + * filterHeight, + * filterWidth, ======> [height, width] + * outputHeight, + * outputWidth] + * + * If the template argument Format is kOCF, the shape of colData is: + * [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth] + * So, it is easy to reshape into a sequence matrix for rnn calculation. + * The shape of sequence matrix is [seqLength, stepSize], where the seqLength + * is equal outputHeight * outputWidth, and the stepSize is equal + * inputChannels * filterHeight * filterWidth. + * + * Reshape: + * shape of colData shape of sequence matrix + * [outputHeight, + * outputWidth, + * inputChannels, ======> [seqLength, stepSize] + * filterHeight, + * filterWidth] + * + * \note The caller needs to ensure that imShape.inputChannels is equal to + * colShape.inputChannels. + */ +template +class Im2ColFunctor { +public: + void operator()(const T* imData, + const TensorShape& imShape, + T* colData, + const TensorShape& colShape, + int strideHeight, + int strideWidth, + int paddingHeight, + int paddingWidth); +}; + +template +class Col2ImFunctor { +public: + void operator()(T* imData, + const TensorShape& imShape, + const T* colData, + const TensorShape& colShape, + int strideHeight, + int strideWidth, + int paddingHeight, + int paddingWidth); +}; + +} // namespace paddle diff --git a/paddle/function/Im2ColOp.cpp b/paddle/function/Im2ColOp.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b7d1eb1eded7a7471fd5833a649916d3ee3e598e --- /dev/null +++ b/paddle/function/Im2ColOp.cpp @@ -0,0 +1,235 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
diff --git a/paddle/function/Im2ColOp.cpp b/paddle/function/Im2ColOp.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b7d1eb1eded7a7471fd5833a649916d3ee3e598e
--- /dev/null
+++ b/paddle/function/Im2ColOp.cpp
@@ -0,0 +1,235 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Im2Col.h"
+
+namespace paddle {
+
+/*
+ * imShape = [inputChannels, inputHeight, inputWidth]
+ * colShape =
+ *   [inputChannels, filterHeight, filterWidth, outputHeight, outputWidth]
+ */
+template <class T>
+class Im2ColFunctor<kCFO, DEVICE_TYPE_CPU, T> {
+public:
+  void operator()(const T* imData,
+                  const TensorShape& imShape,
+                  T* colData,
+                  const TensorShape& colShape,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth) {
+    int inputChannels = imShape[0];
+    int inputHeight = imShape[1];
+    int inputWidth = imShape[2];
+    int filterHeight = colShape[1];
+    int filterWidth = colShape[2];
+    int outputHeight = colShape[3];
+    int outputWidth = colShape[4];
+    int channelsCol = inputChannels * filterHeight * filterWidth;
+
+    for (int c = 0; c < channelsCol; ++c) {
+      int wOffset = c % filterWidth;
+      int hOffset = (c / filterWidth) % filterHeight;
+      int c_im = c / filterWidth / filterHeight;
+      for (int h = 0; h < outputHeight; ++h) {
+        for (int w = 0; w < outputWidth; ++w) {
+          int imRowIdx = h * strideHeight + hOffset;
+          int imColIdx = w * strideWidth + wOffset;
+          if ((imRowIdx - paddingHeight) < 0 ||
+              (imRowIdx - paddingHeight) >= inputHeight ||
+              (imColIdx - paddingWidth) < 0 ||
+              (imColIdx - paddingWidth) >= inputWidth) {
+            colData[(c * outputHeight + h) * outputWidth + w] = T(0);
+          } else {
+            imRowIdx += c_im * inputHeight - paddingHeight;
+            imColIdx -= paddingWidth;
+            colData[(c * outputHeight + h) * outputWidth + w] =
+                imData[imRowIdx * inputWidth + imColIdx];
+          }
+        }
+      }
+    }
+  }
+};
+
+/*
+ * imShape = [inputChannels, inputHeight, inputWidth]
+ * colShape =
+ *   [inputChannels, filterHeight, filterWidth, outputHeight, outputWidth]
+ */
+template <class T>
+class Col2ImFunctor<kCFO, DEVICE_TYPE_CPU, T> {
+public:
+  void operator()(T* imData,
+                  const TensorShape& imShape,
+                  const T* colData,
+                  const TensorShape& colShape,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth) {
+    int inputChannels = imShape[0];
+    int inputHeight = imShape[1];
+    int inputWidth = imShape[2];
+    int filterHeight = colShape[1];
+    int filterWidth = colShape[2];
+    int outputHeight = colShape[3];
+    int outputWidth = colShape[4];
+    int channelsCol = inputChannels * filterHeight * filterWidth;
+
+    for (int c = 0; c < channelsCol; ++c) {
+      int wOffset = c % filterWidth;
+      int hOffset = (c / filterWidth) % filterHeight;
+      int c_im = c / filterWidth / filterHeight;
+      for (int h = 0; h < outputHeight; ++h) {
+        for (int w = 0; w < outputWidth; ++w) {
+          int imRowIdx = h * strideHeight + hOffset;
+          int imColIdx = w * strideWidth + wOffset;
+          if ((imRowIdx - paddingHeight) >= 0 &&
+              (imRowIdx - paddingHeight) < inputHeight &&
+              (imColIdx - paddingWidth) >= 0 &&
+              (imColIdx - paddingWidth) < inputWidth) {
+            imRowIdx += c_im * inputHeight - paddingHeight;
+            imColIdx -= paddingWidth;
+            imData[imRowIdx * inputWidth + imColIdx] +=
+                colData[(c * outputHeight + h) * outputWidth + w];
+          }
+        }
+      }
+    }
+  }
+};
+
+template class Im2ColFunctor<kCFO, DEVICE_TYPE_CPU, float>;
+template class Im2ColFunctor<kCFO, DEVICE_TYPE_CPU, double>;
+template class Col2ImFunctor<kCFO, DEVICE_TYPE_CPU, float>;
+template class Col2ImFunctor<kCFO, DEVICE_TYPE_CPU, double>;
+
+/*
+ * imShape = [inputChannels, inputHeight, inputWidth]
+ * colShape =
+ *   [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth]
+ */
+template <class T>
+class Im2ColFunctor<kOCF, DEVICE_TYPE_CPU, T> {
+public:
+  void operator()(const T* imData,
+                  const TensorShape& imShape,
+                  T* colData,
+                  const TensorShape& colShape,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth) {
+    int inputChannels = imShape[0];
+    int inputHeight = imShape[1];
+    int inputWidth = imShape[2];
+    int filterHeight = colShape[3];
+    int filterWidth = colShape[4];
+    int outputHeight = colShape[0];
+    int outputWidth = colShape[1];
+    for (int outputH = 0; outputH < outputHeight; ++outputH) {
+      for (int outputW = 0; outputW < outputWidth; ++outputW) {
+        for (int channel = 0; channel < inputChannels; ++channel) {
+          for (int filterH = 0; filterH < filterHeight; ++filterH) {
+            for (int filterW = 0; filterW < filterWidth; ++filterW) {
+              int imRowOffset =
+                  outputH * strideHeight + filterH - paddingHeight;
+              int imColOffset = outputW * strideWidth + filterW - paddingWidth;
+              int colDataOffset =
+                  (((outputH * outputWidth + outputW) * inputChannels +
+                    channel) *
+                       filterHeight +
+                   filterH) *
+                      filterWidth +
+                  filterW;
+              if (imRowOffset < 0 || imRowOffset >= inputHeight ||
+                  imColOffset < 0 || imColOffset >= inputWidth) {
+                colData[colDataOffset] = T(0);
+              } else {
+                int imDataOffset =
+                    (channel * inputHeight + imRowOffset) * inputWidth +
+                    imColOffset;
+                colData[colDataOffset] = imData[imDataOffset];
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
+/*
+ * imShape = [inputChannels, inputHeight, inputWidth]
+ * colShape =
+ *   [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth]
+ */
+template <class T>
+class Col2ImFunctor<kOCF, DEVICE_TYPE_CPU, T> {
+public:
+  void operator()(T* imData,
+                  const TensorShape& imShape,
+                  const T* colData,
+                  const TensorShape& colShape,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth) {
+    int inputChannels = imShape[0];
+    int inputHeight = imShape[1];
+    int inputWidth = imShape[2];
+    int filterHeight = colShape[3];
+    int filterWidth = colShape[4];
+    int outputHeight = colShape[0];
+    int outputWidth = colShape[1];
+    for (int outputH = 0; outputH < outputHeight; ++outputH) {
+      for (int outputW = 0; outputW < outputWidth; ++outputW) {
+        for (int channel = 0; channel < inputChannels; ++channel) {
+          for (int filterH = 0; filterH < filterHeight; ++filterH) {
+            for (int filterW = 0; filterW < filterWidth; ++filterW) {
+              int imRowOffset =
+                  outputH * strideHeight + filterH - paddingHeight;
+              int imColOffset = outputW * strideWidth + filterW - paddingWidth;
+              int colDataOffset =
+                  (((outputH * outputWidth + outputW) * inputChannels +
+                    channel) *
+                       filterHeight +
+                   filterH) *
+                      filterWidth +
+                  filterW;
+              if (imRowOffset >= 0 && imRowOffset < inputHeight &&
+                  imColOffset >= 0 && imColOffset < inputWidth) {
+                int imDataOffset =
+                    (channel * inputHeight + imRowOffset) * inputWidth +
+                    imColOffset;
+                imData[imDataOffset] += colData[colDataOffset];
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
+template class Im2ColFunctor<kOCF, DEVICE_TYPE_CPU, float>;
+template class Im2ColFunctor<kOCF, DEVICE_TYPE_CPU, double>;
+template class Col2ImFunctor<kOCF, DEVICE_TYPE_CPU, float>;
+template class Col2ImFunctor<kOCF, DEVICE_TYPE_CPU, double>;
+
+}  // namespace paddle
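One property of the CPU pair above that is easy to verify and worth keeping in mind for review: Col2ImFunctor accumulates (`+=`) rather than assigns, so an im2col/col2im round trip multiplies each pixel by the number of filter windows covering it. A standalone check (a sketch under the same assumptions as the earlier example; `roundTripCheck` is an invented name):

```cpp
#include <cassert>
#include <vector>
#include "Im2Col.h"

void roundTripCheck() {
  using namespace paddle;
  const size_t C = 1, H = 5, W = 5, kH = 3, kW = 3;
  const size_t outH = H - kH + 1, outW = W - kW + 1;  // stride 1, no padding

  TensorShape imShape({C, H, W});
  TensorShape colShape({C, kH, kW, outH, outW});
  std::vector<float> im(imShape.getElements(), 1.0f);
  std::vector<float> col(colShape.getElements(), 0.0f);
  std::vector<float> back(imShape.getElements(), 0.0f);

  Im2ColFunctor<kCFO, DEVICE_TYPE_CPU, float> im2col;
  Col2ImFunctor<kCFO, DEVICE_TYPE_CPU, float> col2im;
  im2col(im.data(), imShape, col.data(), colShape, 1, 1, 0, 0);
  col2im(back.data(), imShape, col.data(), colShape, 1, 1, 0, 0);

  // The center pixel lies in all kH * kW = 9 windows, so it comes back as 9.
  assert(back[2 * W + 2] == 9.0f);
}
```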
diff --git a/paddle/function/Im2ColOpGpu.cu b/paddle/function/Im2ColOpGpu.cu
new file mode 100644
index 0000000000000000000000000000000000000000..15ba854009636d027447d104071163100d5e3f4b
--- /dev/null
+++ b/paddle/function/Im2ColOpGpu.cu
@@ -0,0 +1,381 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Im2Col.h"
+#include "hl_device_functions.cuh"
+
+namespace paddle {
+
+template <class T>
+__global__ void im2col(const T* data_im, int numOuts, int height, int width,
+                       int blockH, int blockW,
+                       int strideH, int strideW,
+                       int paddingH, int paddingW,
+                       int height_col, int width_col,
+                       T* data_col) {
+  int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  if (index < numOuts) {
+    int w_out = index % width_col;
+    index /= width_col;
+    int h_out = index % height_col;
+    int channel_in = index / height_col;
+    int channel_out = channel_in * blockH * blockW;
+    int h_in = h_out * strideH;
+    int w_in = w_out * strideW;
+
+    data_col += (channel_out * height_col + h_out) * width_col + w_out;
+    for (int i = 0; i < blockH; ++i) {
+      for (int j = 0; j < blockW; ++j) {
+        int rIdx = int(h_in + i);
+        int cIdx = int(w_in + j);
+        if ((rIdx - (int)paddingH) >= (int)height ||
+            (rIdx - (int)paddingH) < 0 ||
+            (cIdx - (int)paddingW) >= (int)width ||
+            (cIdx - (int)paddingW) < 0) {
+          *data_col = 0;
+        } else {
+          rIdx = rIdx + channel_in * height - paddingH;
+          cIdx = cIdx - paddingW;
+          *data_col = data_im[rIdx * width + cIdx];
+        }
+        data_col += height_col * width_col;
+      }
+    }
+  }
+}
+
+/*
+ * imShape = [inputChannels, inputHeight, inputWidth]
+ * colShape =
+ *   [inputChannels, filterHeight, filterWidth, outputHeight, outputWidth]
+ */
+template <class T>
+class Im2ColFunctor<kCFO, DEVICE_TYPE_GPU, T> {
+public:
+  void operator()(const T* imData,
+                  const TensorShape& imShape,
+                  T* colData,
+                  const TensorShape& colShape,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth) {
+    int inputChannels = imShape[0];
+    int inputHeight = imShape[1];
+    int inputWidth = imShape[2];
+    int filterHeight = colShape[1];
+    int filterWidth = colShape[2];
+    int outputHeight = colShape[3];
+    int outputWidth = colShape[4];
+
+    int numKernels = inputChannels * outputHeight * outputWidth;
+    int blocks = (numKernels + 1024 - 1) / 1024;
+    int blockX = 512;
+    int blockY = (blocks + 512 - 1) / 512;
+    dim3 threads(1024, 1);
+    dim3 grid(blockX, blockY);
+    im2col<<< grid, threads, 0, STREAM_DEFAULT >>>
+        (imData, numKernels, inputHeight, inputWidth,
+         filterHeight, filterWidth, strideHeight, strideWidth,
+         paddingHeight, paddingWidth, outputHeight, outputWidth, colData);
+    CHECK_SYNC("Im2ColFunctor GPU failed");
+  }
+};
+
+template <class T>
+__global__ void col2im(size_t n, const T* data_col, size_t height,
+                       size_t width, size_t channels,
+                       size_t blockH, size_t blockW,
+                       size_t strideH, size_t strideW,
+                       size_t paddingH, size_t paddingW,
+                       size_t height_col, size_t width_col,
+                       T* data_im) {
+  size_t index =
+      (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  if (index < n) {
+    T val = 0;
+    int w = int(index % width);
+    int h = int((index / width) % height);
+    int c = int(index / (width * height));
+    if ((w - (int)paddingW) >= 0 &&
+        (w - (int)paddingW) < int(width - 2 * paddingW) &&
+        (h - (int)paddingH) >= 0 &&
+        (h - (int)paddingH) < int(height - 2 * paddingH)) {
+      // compute the start and end of the output
+      int w_col_start =
+          (w < (int)blockW) ? 0 : (w - int(blockW)) / (int)strideW + 1;
+      int w_col_end = min((int)(w / (int)strideW + 1), (int)(width_col));
+      int h_col_start =
+          (h < (int)blockH) ? 0 : (h - (int)blockH) / (int)strideH + 1;
+      int h_col_end = min(int(h / strideH + 1), int(height_col));
+      for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
+        for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
+          // the col location: [c * width * height + h_out, w_out]
+          int c_col = int(c * blockH * blockW) +
+                      (h - h_col * (int)strideH) * (int)blockW +
+                      (w - w_col * (int)strideW);
+          val += data_col[(c_col * height_col + h_col) * width_col + w_col];
+        }
+      }
+      h -= paddingH;
+      w -= paddingW;
+      data_im[c * ((width - 2 * paddingW) * (height - 2 * paddingH)) +
+              h * (width - 2 * paddingW) + w] += val;
+    }
+  }
+}
+
+/*
+ * imShape = [inputChannels, inputHeight, inputWidth]
+ * colShape =
+ *   [inputChannels, filterHeight, filterWidth, outputHeight, outputWidth]
+ */
+template <class T>
+class Col2ImFunctor<kCFO, DEVICE_TYPE_GPU, T> {
+public:
+  void operator()(T* imData,
+                  const TensorShape& imShape,
+                  const T* colData,
+                  const TensorShape& colShape,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth) {
+    int inputChannels = imShape[0];
+    int inputHeight = imShape[1];
+    int inputWidth = imShape[2];
+    int filterHeight = colShape[1];
+    int filterWidth = colShape[2];
+    int outputHeight = colShape[3];
+    int outputWidth = colShape[4];
+
+    size_t numKernels = inputChannels * (inputHeight + 2 * paddingHeight) *
+                        (inputWidth + 2 * paddingWidth);
+
+    size_t blocks = (numKernels + 1024 - 1) / 1024;
+    size_t blockX = 512;
+    size_t blockY = (blocks + 512 - 1) / 512;
+    dim3 threads(1024, 1);
+    dim3 grid(blockX, blockY);
+
+    // To avoid atomic operations, launch one thread per (padded) input
+    // element and let it accumulate the column entries that reference it.
+    col2im<<< grid, threads, 0, STREAM_DEFAULT >>>
+        (numKernels,
+         colData,
+         inputHeight + 2 * paddingHeight,
+         inputWidth + 2 * paddingWidth,
+         inputChannels,
+         filterHeight,
+         filterWidth,
+         strideHeight,
+         strideWidth,
+         paddingHeight,
+         paddingWidth,
+         outputHeight,
+         outputWidth,
+         imData);
+    CHECK_SYNC("Col2ImFunctor GPU failed");
+  }
+};
+
+template class Im2ColFunctor<kCFO, DEVICE_TYPE_GPU, float>;
+template class Im2ColFunctor<kCFO, DEVICE_TYPE_GPU, double>;
+template class Col2ImFunctor<kCFO, DEVICE_TYPE_GPU, float>;
+template class Col2ImFunctor<kCFO, DEVICE_TYPE_GPU, double>;
+
+template <class T>
+__global__ void im2colOCF(const T* imData, T* colData,
+                          int inputChannels,
+                          int inputHeight, int inputWidth,
+                          int filterHeight, int filterWidth,
+                          int strideHeight, int strideWidth,
+                          int paddingHeight, int paddingWidth,
+                          int outputHeight, int outputWidth) {
+  int swId = blockIdx.x;
+  int shId = blockIdx.y;
+  for (int channelId = threadIdx.z;
+       channelId < inputChannels;
+       channelId += blockDim.z) {
+    for (int idy = threadIdx.y; idy < filterHeight; idy += blockDim.y) {
+      for (int idx = threadIdx.x; idx < filterWidth; idx += blockDim.x) {
+        int widthOffset = idx + swId * strideWidth - paddingWidth;
+        int heightOffset = idy + shId * strideHeight - paddingHeight;
+        int imOffset = widthOffset + heightOffset * inputWidth +
+                       channelId * inputHeight * inputWidth;
+
+        int colOffset = idx + idy * filterWidth +
+                        channelId * filterHeight * filterWidth +
+                        (shId * outputWidth + swId) *
+                            (inputChannels * filterHeight * filterWidth);
+
+        if (heightOffset >= inputHeight || heightOffset < 0 ||
+            widthOffset >= inputWidth || widthOffset < 0) {
+          colData[colOffset] = T(0);
+        } else {
+          colData[colOffset] = imData[imOffset];
+        }
+      }
+    }
+  }
+}
+
+/*
+ * imShape = [inputChannels, inputHeight, inputWidth]
+ * colShape =
+ *   [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth]
+ */
+template <class T>
+class Im2ColFunctor<kOCF, DEVICE_TYPE_GPU, T> {
+public:
+  void operator()(const T* imData,
+                  const TensorShape& imShape,
+                  T* colData,
+                  const TensorShape& colShape,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth) {
+    int inputChannels = imShape[0];
+    int inputHeight = imShape[1];
+    int inputWidth = imShape[2];
+    int filterHeight = colShape[3];
+    int filterWidth = colShape[4];
+    int outputHeight = colShape[0];
+    int outputWidth = colShape[1];
+
+    int blockDimX = 0;
+    int blockDimY = 0;
+    if (filterHeight <= 4 && filterWidth <= 4) {
+      blockDimX = 4;
+      blockDimY = 4;
+    } else if (filterHeight <= 8 && filterWidth <= 8) {
+      blockDimX = 8;
+      blockDimY = 8;
+    } else if (filterHeight <= 16 && filterWidth <= 16) {
+      blockDimX = 16;
+      blockDimY = 16;
+    } else {
+      blockDimX = 32;
+      blockDimY = 32;
+    }
+
+    int blockDimZ = 1024 / blockDimX / blockDimY;
+    dim3 threads(blockDimX, blockDimY, std::min(blockDimZ, inputChannels));
+    dim3 grid(outputWidth, outputHeight);
+    im2colOCF<<< grid, threads, 0, STREAM_DEFAULT >>>
+        (imData, colData, inputChannels, inputHeight, inputWidth,
+         filterHeight, filterWidth, strideHeight, strideWidth,
+         paddingHeight, paddingWidth, outputHeight, outputWidth);
+    CHECK_SYNC("Im2ColFunctor GPU failed");
+  }
+};
+
+template <class T>
+__global__ void col2imOCF(T* imData, const T* colData,
+                          int inputChannels,
+                          int inputHeight, int inputWidth,
+                          int filterHeight, int filterWidth,
+                          int strideHeight, int strideWidth,
+                          int paddingHeight, int paddingWidth,
+                          int outputHeight, int outputWidth) {
+  int swId = blockIdx.x;
+  int shId = blockIdx.y;
+  for (int channelId = threadIdx.z;
+       channelId < inputChannels;
+       channelId += blockDim.z) {
+    for (int idy = threadIdx.y; idy < filterHeight; idy += blockDim.y) {
+      for (int idx = threadIdx.x; idx < filterWidth; idx += blockDim.x) {
+        int widthOffset = idx + swId * strideWidth - paddingWidth;
+        int heightOffset = idy + shId * strideHeight - paddingHeight;
+        int imOffset = widthOffset + heightOffset * inputWidth +
+                       channelId * inputHeight * inputWidth;
+
+        int colOffset = idx + idy * filterWidth +
+                        channelId * filterHeight * filterWidth +
+                        (shId * outputWidth + swId) *
+                            (inputChannels * filterHeight * filterWidth);
+
+        if (heightOffset >= 0 && heightOffset < inputHeight &&
+            widthOffset >= 0 && widthOffset < inputWidth) {
+          paddle::paddleAtomicAdd(imData + imOffset, colData[colOffset]);
+        }
+      }
+    }
+  }
+}
+
+/*
+ * imShape = [inputChannels, inputHeight, inputWidth]
+ * colShape =
+ *   [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth]
+ */
+template <class T>
+class Col2ImFunctor<kOCF, DEVICE_TYPE_GPU, T> {
+public:
+  void operator()(T* imData,
+                  const TensorShape& imShape,
+                  const T* colData,
+                  const TensorShape& colShape,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth) {
+    int inputChannels = imShape[0];
+    int inputHeight = imShape[1];
+    int inputWidth = imShape[2];
+    int filterHeight = colShape[3];
+    int filterWidth = colShape[4];
+    int outputHeight = colShape[0];
+    int outputWidth = colShape[1];
+
+    int blockDimX = 0;
+    int blockDimY = 0;
+    if (filterHeight <= 4 && filterWidth <= 4) {
+      blockDimX = 4;
+      blockDimY = 4;
+    } else if (filterHeight <= 8 && filterWidth <= 8) {
+      blockDimX = 8;
+      blockDimY = 8;
+    } else if (filterHeight <= 16 && filterWidth <= 16) {
+      blockDimX = 16;
+      blockDimY = 16;
+    } else {
+      blockDimX = 32;
+      blockDimY = 32;
+    }
+
+    int blockDimZ = 1024 / blockDimX / blockDimY;
+    dim3 threads(blockDimX, blockDimY, std::min(blockDimZ, inputChannels));
+    dim3 grid(outputWidth, outputHeight);
+    col2imOCF<<< grid, threads, 0, STREAM_DEFAULT >>>
+        (imData, colData, inputChannels, inputHeight, inputWidth,
+         filterHeight, filterWidth, strideHeight, strideWidth,
+         paddingHeight, paddingWidth, outputHeight, outputWidth);
+    CHECK_SYNC("Col2ImFunctor GPU failed");
+  }
+};
+
+template class Im2ColFunctor<kOCF, DEVICE_TYPE_GPU, float>;
+template class Im2ColFunctor<kOCF, DEVICE_TYPE_GPU, double>;
+template class Col2ImFunctor<kOCF, DEVICE_TYPE_GPU, float>;
+template class Col2ImFunctor<kOCF, DEVICE_TYPE_GPU, double>;
+
+}  // namespace paddle
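Two GPU details above deserve a note. First, the kCFO col2im kernel avoids atomics by assigning one thread per (padded) input element and summing the column entries that reference it, while the OCF kernels let threads race on the image and rely on paddle::paddleAtomicAdd instead. Second, both OCF functors pick their thread-block shape from a small ladder; a condensed restatement of that heuristic follows (a sketch only; `ocfBlockShape` is an invented helper, not part of the patch):

```cpp
#include <algorithm>

// Thread-block shape used by the OCF kernels: the smallest square tile from
// {4, 8, 16, 32} that covers the filter, with the remaining threads (out of
// 1024) spread across input channels in the z dimension.
inline void ocfBlockShape(int filterHeight, int filterWidth, int inputChannels,
                          int* x, int* y, int* z) {
  int side = std::max(filterHeight, filterWidth);
  int dim = 4;
  while (dim < side && dim < 32) dim *= 2;  // 4 -> 8 -> 16 -> 32
  *x = dim;
  *y = dim;
  *z = std::min(1024 / (dim * dim), inputChannels);
}
```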
diff --git a/paddle/function/Im2ColTest.cpp b/paddle/function/Im2ColTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..acc88a553abe7ac58b629aba9b850df58cee7f81
--- /dev/null
+++ b/paddle/function/Im2ColTest.cpp
@@ -0,0 +1,125 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Im2Col.h"
+#include <gtest/gtest.h>
+#include "Function.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/math/tests/TensorCheck.h"
+
+namespace paddle {
+
+template <DeviceType Device, class T>
+void TestIm2ColFunctor() {
+  for (size_t channels : {1, 5, 32}) {
+    for (size_t inputHeight : {5, 33, 100}) {
+      for (size_t inputWidth : {5, 32, 96}) {
+        for (size_t filterHeight : {1, 5}) {
+          for (size_t filterWidth : {3, 7}) {
+            for (size_t stride : {1, 2}) {
+              for (size_t padding : {0, 1}) {
+                if (inputHeight <= filterHeight || inputWidth <= filterWidth)
+                  break;
+                if (padding >= filterHeight || padding >= filterWidth) break;
+                size_t outputHeight =
+                    (inputHeight - filterHeight + 2 * padding + stride) /
+                    stride;
+                size_t outputWidth =
+                    (inputWidth - filterWidth + 2 * padding + stride) / stride;
+
+                TensorShape imShape =
+                    TensorShape({channels, inputHeight, inputWidth});
+                TensorShape colShape1 = TensorShape({channels,
+                                                     filterHeight,
+                                                     filterWidth,
+                                                     outputHeight,
+                                                     outputWidth});
+                TensorShape colShape2 = TensorShape({outputHeight,
+                                                     outputWidth,
+                                                     channels,
+                                                     filterHeight,
+                                                     filterWidth});
+
+                size_t height = channels * filterHeight * filterWidth;
+                size_t width = outputHeight * outputWidth;
+                VectorPtr input1 = Vector::create(imShape.getElements(), false);
+                VectorPtr input2 = Vector::create(imShape.getElements(), false);
+                MatrixPtr output1 = Matrix::create(height, width, false, false);
+                MatrixPtr output2 = Matrix::create(width, height, false, false);
+                input1->uniform(0.001, 1);
+                input2->copyFrom(*input1);
+
+                Im2ColFunctor<kCFO, Device, T> im2Col1;
+                Im2ColFunctor<kOCF, Device, T> im2Col2;
+                im2Col1(input1->getData(),
+                        imShape,
+                        output1->getData(),
+                        colShape1,
+                        stride,
+                        stride,
+                        padding,
+                        padding);
+                im2Col2(input2->getData(),
+                        imShape,
+                        output2->getData(),
+                        colShape2,
+                        stride,
+                        stride,
+                        padding,
+                        padding);
+
+                // The transposition of the result of ColFormat == kCFO
+                // is equal to the result of ColFormat == kOCF.
+                MatrixPtr test;
+                output2->transpose(test, true);
+                autotest::TensorCheckErr(*output1, *test);
+
+                Col2ImFunctor<kCFO, Device, T> col2Im1;
+                Col2ImFunctor<kOCF, Device, T> col2Im2;
+                col2Im1(input1->getData(),
+                        imShape,
+                        output1->getData(),
+                        colShape1,
+                        stride,
+                        stride,
+                        padding,
+                        padding);
+                col2Im2(input2->getData(),
+                        imShape,
+                        output2->getData(),
+                        colShape2,
+                        stride,
+                        stride,
+                        padding,
+                        padding);
+
+                autotest::TensorCheckErr(*input1, *input2);
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+TEST(Im2ColFunctor, CPU) { TestIm2ColFunctor<DEVICE_TYPE_CPU, float>(); }
+
+#ifndef PADDLE_ONLY_CPU
+
+TEST(Im2ColFunctor, GPU) { TestIm2ColFunctor<DEVICE_TYPE_GPU, float>(); }
+
+#endif
+
+}  // namespace paddle
diff --git a/paddle/gserver/activations/ActivationFunction.cpp b/paddle/gserver/activations/ActivationFunction.cpp
index a40530f41313be27dc1c2606501c6c00bed11c8b..81cc3c890b6d4ad048e4edc03208c85778244078 100644
--- a/paddle/gserver/activations/ActivationFunction.cpp
+++ b/paddle/gserver/activations/ActivationFunction.cpp
@@ -207,8 +207,8 @@ Error __must_check backward(Argument& act) {
     argument_.value->setData(act.value->getData() + offset, 1UL, size);
     argument_.grad->setData(act.grad->getData() + offset, 1UL, size);
 
-    Error status = softmax_.backward(argument_);
-    if (!status) return status;
+    Error err = softmax_.backward(argument_);
+    if (!err.isOK()) return err;
   }
   return Error();
 }
diff --git a/paddle/gserver/layers/BlockExpandLayer.cpp b/paddle/gserver/layers/BlockExpandLayer.cpp
index 2bafeb92158c56efe32f90742807f0af07bda5af..3b1f34635917290c6e4a9230b546892cc5cb7bfa 100644
--- a/paddle/gserver/layers/BlockExpandLayer.cpp
+++ b/paddle/gserver/layers/BlockExpandLayer.cpp
@@ -37,6 +37,22 @@ bool BlockExpandLayer::init(const LayerMap& layerMap,
   imgSizeH_ = blockConf.img_size_y();
   imgSizeW_ = blockConf.img_size_x();
 
+  std::vector<size_t> strides = {(size_t)strideH_, (size_t)strideW_};
+  std::vector<size_t> paddings = {(size_t)paddingH_, (size_t)paddingW_};
+  std::vector<size_t> blocks = {(size_t)blockH_, (size_t)blockW_};
+  createFunction(forward_,
+                 "BlockExpand",
+                 FuncConfig()
+                     .set("strides", strides)
+                     .set("paddings", paddings)
+                     .set("blocks", blocks));
+  createFunction(backward_,
+                 "BlockExpandGrad",
+                 FuncConfig()
+                     .set("strides", strides)
+                     .set("paddings", paddings)
+                     .set("blocks", blocks));
+
   return true;
 }
 
@@ -63,48 +79,27 @@ void BlockExpandLayer::forward(PassType passType) {
   Layer::forward(passType);
 
   size_t batchSize = inputLayers_[0]->getOutputValue()->getHeight();
-
   size_t blockNum = getBlockNum();
   size_t blockSize = blockH_ * blockW_ * channels_;
   resetOutput(blockNum * batchSize, blockSize);
-  Argument& out = getOutput();
-  MatrixPtr outV = getOutputValue();
 
-  MatrixPtr input = getPrev(0)->getOutputValue();
-  Matrix::resizeOrCreate(outVTrans_, blockSize, blockNum, false, useGpu_);
+  // calculate output_.value
+  inputShape_ = TensorShape({batchSize, channels_, imgSizeH_, imgSizeW_});
+  outputShape_ = TensorShape({batchSize, blockNum, blockSize});
+  BufferArgs inputs;
+  BufferArgs outputs;
+  inputs.addArg(*getInputValue(0), inputShape_);
+  outputs.addArg(*getOutputValue(), outputShape_, ASSIGN_TO);
+  forward_[0]->calc(inputs, outputs);
+
+  // calculate output_.sequenceStartPositions and output_.cpuSequenceDims
+  Argument& out = getOutput();
   ICpuGpuVector::resizeOrCreate(
       out.sequenceStartPositions, batchSize + 1, false);
   IVector::resizeOrCreate(out.cpuSequenceDims, 2 * batchSize, false);
   int* start = out.sequenceStartPositions->getMutableData(false);
   int* dims = out.cpuSequenceDims->getData();
   for (size_t i = 0; i < batchSize; i++) {
-    outVTrans_->zeroMem();
-    /* expand each 
block as one row */ - MatrixPtr inputTmp = - Matrix::create(input->getData() + i * input->getWidth(), - 1, - input->getWidth(), - false, - useGpu_); - outVTrans_->convExpand(*inputTmp, - imgSizeH_, - imgSizeW_, - channels_, - blockH_, - blockW_, - strideH_, - strideW_, - paddingH_, - paddingW_, - outputH_, - outputW_); - MatrixPtr outVTmp = - Matrix::create(outV->getData() + i * blockNum * blockSize, - blockNum, - blockSize, - false, - useGpu_); - outVTrans_->transpose(outVTmp, false); start[i] = i * blockNum; dims[2 * i] = outputH_; dims[2 * i + 1] = outputW_; @@ -113,48 +108,13 @@ void BlockExpandLayer::forward(PassType passType) { } void BlockExpandLayer::backward(const UpdateCallback& callback) { - size_t blockNum = outputH_ * outputW_; - size_t blockSize = blockH_ * blockW_ * channels_; /* Calculate the input layers error */ - MatrixPtr preGrad = inputLayers_[0]->getOutputGrad(); - if (!preGrad) { - return; - } - MatrixPtr grad = getOutputGrad(); - MatrixPtr gradTrans = Matrix::create(blockSize, blockNum, false, useGpu_); - size_t batchSize = preGrad->getHeight(); - - CHECK_EQ(batchSize * blockNum, grad->getHeight()); - CHECK_EQ(blockSize, grad->getWidth()); - - for (size_t i = 0; i < batchSize; i++) { - MatrixPtr gradTmp = - Matrix::create(grad->getData() + i * blockNum * blockSize, - blockNum, - blockSize, - false, - useGpu_); - gradTmp->transpose(gradTrans, false); - MatrixPtr preGradTmp = - Matrix::create(preGrad->getData() + i * preGrad->getWidth(), - 1, - preGrad->getWidth(), - false, - useGpu_); - preGradTmp->convShrink(*gradTrans, - imgSizeH_, - imgSizeW_, - channels_, - blockH_, - blockW_, - strideH_, - strideW_, - paddingH_, - paddingW_, - outputH_, - outputW_, - 1.0, - 1.0); + if (getInputGrad(0)) { + BufferArgs inputs; + BufferArgs outputs; + inputs.addArg(*getOutputGrad(), outputShape_); + outputs.addArg(*getInputGrad(0), inputShape_, ADD_TO); + backward_[0]->calc(inputs, outputs); } } diff --git a/paddle/gserver/layers/BlockExpandLayer.h b/paddle/gserver/layers/BlockExpandLayer.h index 8f347400e60ec84fc1b5fdbc1c911a8768b306d0..15ce73ab8b2ca16ba1e9329ed5c00dc7239e8b93 100644 --- a/paddle/gserver/layers/BlockExpandLayer.h +++ b/paddle/gserver/layers/BlockExpandLayer.h @@ -50,8 +50,8 @@ protected: size_t blockH_, blockW_, strideH_, strideW_, paddingH_, paddingW_; size_t imgSizeH_, imgSizeW_, outputH_, outputW_, channels_; - /// auxiliary variable, which saves the transposed output value. 
- MatrixPtr outVTrans_; + TensorShape inputShape_; + TensorShape outputShape_; public: explicit BlockExpandLayer(const LayerConfig& config) : Layer(config) {} diff --git a/paddle/gserver/layers/ConvBaseProjection.cpp b/paddle/gserver/layers/ConvBaseProjection.cpp index d1e932ded595c90cbe6040c330c5c8663d81e2b4..eb6b0445c95a9e9a7acd5d693ecdb11a263f41fd 100644 --- a/paddle/gserver/layers/ConvBaseProjection.cpp +++ b/paddle/gserver/layers/ConvBaseProjection.cpp @@ -87,9 +87,6 @@ void ConvBaseProjection::initCudnn() { bwdDataLimitBytes_ = 0; bwdFilterLimitBytes_ = 0; workSpaceInBytes_ = 0; - - batchNum_ = 0; - isSelectAlgo_ = false; } void ConvBaseProjection::reshapeTensorDesc(int batchSize) { @@ -142,32 +139,25 @@ void ConvBaseProjection::reshape(int batchSize) { CHECK_EQ(width, out_->value->getWidth()); CHECK_EQ(calInputSize(), in_->value->getWidth()); - isSelectAlgo_ = (batchSize == batchNum_); - batchNum_ = batchSize; - - if (!isSelectAlgo_) { - reshapeTensorDesc(batchSize); - hl_conv_workspace(imageDesc_, - outputDesc_, - filterDesc_, - convDesc_, - &fwdAlgo_, - &fwdLimitBytes_, - &bwdDataAlgo_, - &bwdDataLimitBytes_, - &bwdFilterAlgo_, - &bwdFilterLimitBytes_); - - size_t maxWorkSpace = 0; - maxWorkSpace = std::max(fwdLimitBytes_, bwdDataLimitBytes_); - maxWorkSpace = std::max(maxWorkSpace, bwdFilterLimitBytes_); - workSpaceInBytes_ = maxWorkSpace; - - VLOG(3) << getName() << " Fwd / BwdData / BwdFilter algo: " << fwdAlgo_ - << " / " << bwdDataAlgo_ << " / " << bwdFilterAlgo_; - } - - isSelectAlgo_ = true; + reshapeTensorDesc(batchSize); + hl_conv_workspace(imageDesc_, + outputDesc_, + filterDesc_, + convDesc_, + &fwdAlgo_, + &fwdLimitBytes_, + &bwdDataAlgo_, + &bwdDataLimitBytes_, + &bwdFilterAlgo_, + &bwdFilterLimitBytes_); + + size_t maxWorkSpace = 0; + maxWorkSpace = std::max(fwdLimitBytes_, bwdDataLimitBytes_); + maxWorkSpace = std::max(maxWorkSpace, bwdFilterLimitBytes_); + workSpaceInBytes_ = maxWorkSpace; + + VLOG(3) << getName() << " Fwd / BwdData / BwdFilter algo: " << fwdAlgo_ + << " / " << bwdDataAlgo_ << " / " << bwdFilterAlgo_; } void *ConvBaseProjection::getSpaceBytes(size_t size) { diff --git a/paddle/gserver/layers/ConvBaseProjection.h b/paddle/gserver/layers/ConvBaseProjection.h index 4a33aa1837dfc36dbead60deaccbc6b772fe4754..e9d9f8f1b2937b3a3b7323c43ef5608ffc5f82ca 100644 --- a/paddle/gserver/layers/ConvBaseProjection.h +++ b/paddle/gserver/layers/ConvBaseProjection.h @@ -101,12 +101,6 @@ protected: size_t bwdFilterLimitBytes_; /// Size of total work space. size_t workSpaceInBytes_; - - /// Whether to call cuDNN api to choose conv algorithm. - bool isSelectAlgo_; - /// batchNum is used to record batch size. If the batch size is changed, - /// the selection algorithm will be called. 
- int batchNum_; bool bias_; std::unique_ptr weight_; diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp index 4431d613f655c1d0c8da13bb5ac9225980c650ad..27f7d95b752d4a423bf99fa425b10b2816575d6a 100644 --- a/paddle/math/Matrix.cpp +++ b/paddle/math/Matrix.cpp @@ -1016,81 +1016,6 @@ void GpuMatrix::check(std::ostream& os, Matrix& refMat, bool printDiff) { LOG(INFO) << "the diffCnt is " << diffCnt; } -void GpuMatrix::convExpand(Matrix& feature, - int feaImgHeight, - int feaImgWidth, - int channels, - int blockH, - int blockW, - int strideH, - int strideW, - int paddingH, - int paddingW, - int outputH, - int outputW) { - CHECK(feature.useGpu_ == true) << "Matrix type are not equal"; - - CHECK_EQ(size_t(feaImgHeight * feaImgWidth * channels), - feature.getHeight() * feature.getWidth()) - << "Matrix dimensions are not equal"; - - size_t elemCnt = outputH * outputW * blockH * blockW * channels; - CHECK_EQ(elemCnt, height_ * width_) << "Matrix dimensions are not equal"; - - hl_expand_feature2col(feature.getData(), - channels, - feaImgHeight, - feaImgWidth, - blockH, - blockW, - strideH, - strideW, - paddingH, - paddingW, - outputH, - outputW, - getData()); -} - -void GpuMatrix::convShrink(Matrix& expandFeat, - int thisImgHeight, - int thisImgWidth, - int channels, - int blockH, - int blockW, - int strideH, - int strideW, - int paddingH, - int paddingW, - int outputH, - int outputW, - real alpha, - real beta) { - CHECK(expandFeat.useGpu_ == true) << "Matrix type are not equal"; - CHECK_EQ(size_t(thisImgHeight * thisImgWidth * channels), - getHeight() * getWidth()) - << "Matrix dimensions are not equal"; - - size_t elemCnt = outputH * outputW * blockW * blockH * channels; - CHECK(elemCnt == expandFeat.getHeight() * expandFeat.getWidth()) - << "Matrix dimensions are not equal"; - hl_shrink_col2feature(expandFeat.getData(), - channels, - thisImgHeight, - thisImgWidth, - blockH, - blockW, - strideH, - strideW, - paddingH, - paddingW, - outputH, - outputW, - getData(), - alpha, - beta); -} - void GpuMatrix::maxPoolForward(Matrix& inputMat, size_t imgSizeH, size_t imgSizeW, @@ -1777,103 +1702,6 @@ void CpuMatrix::inverse(MatrixPtr& matInv, bool memAlloc) { CHECK_EQ(info, 0); } -void CpuMatrix::convExpand(Matrix& feature, - int feaImgHeight, - int feaImgWidth, - int channels, - int blockH, - int blockW, - int strideH, - int strideW, - int paddingH, - int paddingW, - int outputH, - int outputW) { - CHECK(feature.useGpu_ == false) << "Matrix type are not equal"; - - CHECK_EQ(size_t(feaImgHeight * feaImgWidth * channels), - feature.getHeight() * feature.getWidth()) - << "Matrix dimensions are not equal"; - - size_t elemCnt = outputH * outputW * blockH * blockW * channels; - CHECK_EQ(elemCnt, height_ * width_) << "Matrix dimensions are not equal"; - - int channelsCol = channels * blockH * blockW; - real* srcData = feature.getData(); - for (int c = 0; c < channelsCol; ++c) { - int wOffset = c % blockW; - int hOffset = (c / blockW) % blockH; - int c_im = c / blockH / blockW; - for (int h = 0; h < outputH; ++h) { - for (int w = 0; w < outputW; ++w) { - // no c_im*height to Exclude the channel number - int imgRowIdx = h * strideH + hOffset; - int imgColIdx = w * strideW + wOffset; - if ((imgRowIdx - paddingH) < 0 || - (imgRowIdx - paddingH) >= feaImgHeight || - (imgColIdx - paddingW) < 0 || - (imgColIdx - paddingW) >= feaImgWidth) { - data_[(c * outputH + h) * outputW + w] = 0; - } else { - imgRowIdx += c_im * feaImgHeight - paddingH; - imgColIdx -= paddingW; - data_[(c * outputH + h) * outputW + w] = - 
srcData[imgRowIdx * feaImgWidth + imgColIdx]; - } - } - } - } -} - -void CpuMatrix::convShrink(Matrix& expandFeat, - int thisImgHeight, - int thisImgWidth, - int channels, - int blockH, - int blockW, - int strideH, - int strideW, - int paddingH, - int paddingW, - int outputH, - int outputW, - real alpha, - real beta) { - CHECK(expandFeat.useGpu_ == false) << "Matrix type are not equal"; - CHECK_EQ(size_t(thisImgHeight * thisImgWidth * channels), - getHeight() * getWidth()) - << "Matrix dimensions are not equal"; - - size_t elemCnt = outputH * outputW * blockH * blockW * channels; - - CHECK(elemCnt == expandFeat.getHeight() * expandFeat.getWidth()) - << "Matrix dimensions are not equal"; - - real* expandData = expandFeat.getData(); - int channelsCol = channels * blockH * blockW; - for (int c = 0; c < channelsCol; ++c) { - int wOffset = c % blockW; - int hOffset = (c / blockW) % blockH; - int c_im = c / blockW / blockH; - for (int h = 0; h < outputH; ++h) { - for (int w = 0; w < outputW; ++w) { - int imRowIdx = h * strideH + hOffset; - int imColIdx = w * strideW + wOffset; - if ((imRowIdx - paddingH) >= 0 && - (imRowIdx - paddingH) < thisImgHeight && - (imColIdx - paddingW) >= 0 && - (imColIdx - paddingW) < thisImgWidth) { - imRowIdx += c_im * thisImgHeight - paddingH; - imColIdx -= paddingW; - data_[imRowIdx * thisImgWidth + imColIdx] = - alpha * expandData[(c * outputH + h) * outputW + w] + - beta * data_[imRowIdx * thisImgWidth + imColIdx]; - } - } - } - } -} - void CpuMatrix::maxPoolForward(Matrix& inputMat, size_t imgSizeH, size_t imgSizeW, diff --git a/paddle/math/Matrix.h b/paddle/math/Matrix.h index 7dfd593225065e18830b2b0c0ce854fe7a2d5178..bb802bbb2c75289a45d987b22ad41ce8b1e95c98 100644 --- a/paddle/math/Matrix.h +++ b/paddle/math/Matrix.h @@ -859,49 +859,6 @@ public: LOG(FATAL) << "Not implemented"; } - /** - * This function is used to calculate the convolution: - * - * It will expand a feature matrix according to the - * convolution filters - */ - virtual void convExpand(Matrix& feature, - int feaImgHeight, - int feaImgWidth, - int channels, - int blockH, - int blockW, - int strideH, - int strideW, - int paddingH, - int paddingW, - int outputH, - int outputW) { - LOG(FATAL) << "Not implemeted"; - } - - /** - * This function is the reverse implementation of convExpand: - * - * Its function is to restore a expanded-matrix into a feature matrix - */ - virtual void convShrink(Matrix& expandColMat, - int thisImgHeight, - int thisImgWidth, - int channels, - int blockH, - int blockW, - int strideH, - int strideW, - int paddingH, - int paddingW, - int outputH, - int outputW, - real alpha = 1.0f, - real beta = 0.0f) { - LOG(FATAL) << "Not implemeted"; - } - /** * Pooling forward operation, pick out the largest element * in the sizeX of value @@ -1335,34 +1292,6 @@ public: void classificationError(Matrix& output, IVector& label, size_t topkSize = 1); - void convExpand(Matrix& feature, - int feaImgHeight, - int feaImgWidth, - int channels, - int blockH, - int blockW, - int strideH, - int strideW, - int paddingH, - int paddingW, - int outputH, - int outputW); - - void convShrink(Matrix& expandColMat, - int thisImgHeight, - int thisImgWidth, - int channels, - int blockH, - int blochW, - int strideH, - int strideW, - int paddingH, - int paddingWreal, - int outputH, - int outputW, - real alpha = 1.0f, - real beta = 0.0f); - void maxPoolForward(Matrix& inputMat, size_t imgSizeH, size_t imgSizeW, @@ -1522,34 +1451,6 @@ public: MatrixPtr clone(size_t height, size_t width, bool useGpu = false); - 
void convExpand(Matrix& feature, - int feaImgHeight, - int feaImgWidth, - int channels, - int blcokH, - int blockW, - int strideH, - int strideW, - int paddingH, - int paddingW, - int outputH, - int outputW); - - void convShrink(Matrix& expandFeat, - int thisImgHeight, - int thisImgWidth, - int channels, - int blockH, - int blockW, - int strideH, - int strideW, - int paddingH, - int paddingW, - int outputH, - int outputW, - real alpha = 1.0f, - real beta = 0.0f); - void maxPoolForward(Matrix& inputMat, size_t imgSizeH, size_t imgSizeW, diff --git a/paddle/memory/detail/buddy_allocator.cc b/paddle/memory/detail/buddy_allocator.cc index 27c1b4033b53b059d38ed88694b20b429cbb4cce..bb44970109c05d239e6b92d90b2079b752fa0104 100644 --- a/paddle/memory/detail/buddy_allocator.cc +++ b/paddle/memory/detail/buddy_allocator.cc @@ -27,12 +27,11 @@ BuddyAllocator::BuddyAllocator(SystemAllocator* system_allocator, system_allocator_(std::move(system_allocator)) {} BuddyAllocator::~BuddyAllocator() { - DLOG(INFO) << "BuddyAllocator Disconstructor makes sure that all of these " - "have actually been freed"; + VLOG(3) << "BuddyAllocator Disconstructor makes sure that all of these " + "have actually been freed"; while (!pool_.empty()) { auto block = static_cast(std::get<2>(*pool_.begin())); - DLOG(INFO) << "Free from block (" << block << ", " << max_chunk_size_ - << ")"; + VLOG(3) << "Free from block (" << block << ", " << max_chunk_size_ << ")"; system_allocator_->Free(block, max_chunk_size_, block->index(cache_)); cache_.invalidate(block); @@ -52,12 +51,11 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) { // acquire the allocator lock std::lock_guard lock(mutex_); - DLOG(INFO) << "Allocate " << unaligned_size << " bytes from chunk size " - << size; + VLOG(3) << "Allocate " << unaligned_size << " bytes from chunk size " << size; // if the allocation is huge, send directly to the system allocator if (size > max_chunk_size_) { - DLOG(INFO) << "Allocate from system allocator."; + VLOG(3) << "Allocate from system allocator."; return SystemAlloc(size); } @@ -72,9 +70,9 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) { return nullptr; } } else { - DLOG(INFO) << "Allocation from existing memory block " << std::get<2>(*it) - << " at address " - << reinterpret_cast(std::get<2>(*it))->data(); + VLOG(3) << "Allocation from existing memory block " << std::get<2>(*it) + << " at address " + << reinterpret_cast(std::get<2>(*it))->data(); } total_used_ += size; @@ -91,10 +89,10 @@ void BuddyAllocator::Free(void* p) { // Acquire the allocator lock std::lock_guard lock(mutex_); - DLOG(INFO) << "Free from address " << block; + VLOG(3) << "Free from address " << block; if (block->type(cache_) == MemoryBlock::HUGE_CHUNK) { - DLOG(INFO) << "Free directly from system allocator"; + VLOG(3) << "Free directly from system allocator"; system_allocator_->Free(block, block->total_size(cache_), block->index(cache_)); @@ -111,8 +109,8 @@ void BuddyAllocator::Free(void* p) { // Trying to merge the right buddy if (block->has_right_buddy(cache_)) { - DLOG(INFO) << "Merging this block " << block << " with its right buddy " - << block->right_buddy(cache_); + VLOG(3) << "Merging this block " << block << " with its right buddy " + << block->right_buddy(cache_); auto right_buddy = block->right_buddy(cache_); @@ -129,8 +127,8 @@ void BuddyAllocator::Free(void* p) { // Trying to merge the left buddy if (block->has_left_buddy(cache_)) { - DLOG(INFO) << "Merging this block " << block << " with its left buddy " - << block->left_buddy(cache_); 
+ VLOG(3) << "Merging this block " << block << " with its left buddy " + << block->left_buddy(cache_); auto left_buddy = block->left_buddy(cache_); @@ -146,8 +144,8 @@ void BuddyAllocator::Free(void* p) { } // Dumping this block into pool - DLOG(INFO) << "Inserting free block (" << block << ", " - << block->total_size(cache_) << ")"; + VLOG(3) << "Inserting free block (" << block << ", " + << block->total_size(cache_) << ")"; pool_.insert( IndexSizeAddress(block->index(cache_), block->total_size(cache_), block)); @@ -166,7 +164,7 @@ void* BuddyAllocator::SystemAlloc(size_t size) { size_t index = 0; void* p = system_allocator_->Alloc(index, size); - DLOG(INFO) << "Allocated " << p << " from system allocator."; + VLOG(3) << "Allocated " << p << " from system allocator."; if (p == nullptr) return nullptr; @@ -192,8 +190,8 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() { if (p == nullptr) return pool_.end(); - DLOG(INFO) << "Creating and inserting new block " << p - << " from system allocator"; + VLOG(3) << "Creating and inserting new block " << p + << " from system allocator"; static_cast(p)->init(cache_, MemoryBlock::FREE_CHUNK, index, max_chunk_size_, nullptr, nullptr); @@ -237,19 +235,19 @@ void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it, auto block = static_cast(std::get<2>(*it)); pool_.erase(it); - DLOG(INFO) << "Split block (" << block << ", " << block->total_size(cache_) - << ") into"; + VLOG(3) << "Split block (" << block << ", " << block->total_size(cache_) + << ") into"; block->split(cache_, size); - DLOG(INFO) << "Left block (" << block << ", " << block->total_size(cache_) - << ")"; + VLOG(3) << "Left block (" << block << ", " << block->total_size(cache_) + << ")"; block->set_type(cache_, MemoryBlock::ARENA_CHUNK); // the rest of memory if exist if (block->has_right_buddy(cache_)) { if (block->right_buddy(cache_)->type(cache_) == MemoryBlock::FREE_CHUNK) { - DLOG(INFO) << "Insert right block (" << block->right_buddy(cache_) << ", " - << block->right_buddy(cache_)->total_size(cache_) << ")"; + VLOG(3) << "Insert right block (" << block->right_buddy(cache_) << ", " + << block->right_buddy(cache_)->total_size(cache_) << ")"; pool_.insert( IndexSizeAddress(block->right_buddy(cache_)->index(cache_), @@ -276,7 +274,7 @@ void BuddyAllocator::CleanIdleFallBackAlloc() { return; } - DLOG(INFO) << "Return block " << block << " to fallback allocator."; + VLOG(3) << "Return block " << block << " to fallback allocator."; system_allocator_->Free(block, max_chunk_size_, block->index(cache_)); cache_.invalidate(block); @@ -312,7 +310,7 @@ void BuddyAllocator::CleanIdleNormalAlloc() { MemoryBlock* block = static_cast(std::get<2>(*pool)); - DLOG(INFO) << "Return block " << block << " to base allocator."; + VLOG(3) << "Return block " << block << " to base allocator."; system_allocator_->Free(block, max_chunk_size_, block->index(cache_)); cache_.invalidate(block); diff --git a/paddle/memory/memcpy.cc b/paddle/memory/memcpy.cc index 098931c887479ce6f1afc8b90e4003758d88c018..aaab1142ca18d3319469a4d685fde9d30929113f 100644 --- a/paddle/memory/memcpy.cc +++ b/paddle/memory/memcpy.cc @@ -35,7 +35,7 @@ void Copy(platform::CPUPlace dst_place, platform::GPUPlace src_place, const void* src, size_t num, cudaStream_t stream) { - platform::GPUPlaceGuard g(src_place.device); + platform::SetDeviceId(src_place.device); platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream); } @@ -45,7 +45,7 @@ void Copy(platform::GPUPlace dst_place, platform::CPUPlace 
src_place, const void* src, size_t num, cudaStream_t stream) { - platform::GPUPlaceGuard g(dst_place.device); + platform::SetDeviceId(dst_place.device); platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream); } @@ -56,7 +56,7 @@ void Copy(platform::GPUPlace dst_place, const void* src, size_t num, cudaStream_t stream) { if (dst_place == src_place) { - platform::GPUPlaceGuard g(src_place.device); + platform::SetDeviceId(src_place.device); platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, stream); } else { platform::GpuMemcpyPeer(dst, dst_place.device, src, src_place.device, num, diff --git a/paddle/memory/memcpy.h b/paddle/memory/memcpy.h index 99b1c2e1c3e5ae4facaeb4fd0b773a7531448f03..2b9c0eada6e8406fc81baec7f331a8dd5b8b0ec1 100644 --- a/paddle/memory/memcpy.h +++ b/paddle/memory/memcpy.h @@ -20,13 +20,39 @@ limitations under the License. */ namespace paddle { namespace memory { +/** + * \brief Copy memory from one place to another place. + * + * \param[in] DstPlace Destination allocation place (CPU). + * \param[in] dst Destination memory address. + * \param[in] SrcPlace Source allocation place (CPU). + * \param[in] src Source memory address. + * \param[in] num memory size in bytes to copy. + * + */ template void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num); #ifndef PADDLE_ONLY_CPU + +/** + * \brief Copy memory from one place to another place. + * + * \param[in] DstPlace Destination allocation place (CPU or GPU). + * \param[in] dst Destination memory address. + * \param[in] SrcPlace Source allocation place (CPU or GPU). + * \param[in] src Source memory address. + * \param[in] num memory size in bytes to copy. + * \param[in] stream CUDA stream. + * + * \note For GPU memory copy, CUDA stream need to be specified + * for asynchronously memory copy. + * + */ template void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num, cudaStream_t stream); + #endif // PADDLE_ONLY_CPU } // namespace memory diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc index c2e046926fafd8f4cfc4cd81d8f32e3882ff02ec..207025f9b1c64f0f8943f9fae5edefc9328a1d26 100644 --- a/paddle/memory/memory.cc +++ b/paddle/memory/memory.cc @@ -60,6 +60,7 @@ detail::BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { platform::GpuMaxChunkSize()); } } + platform::SetDeviceId(gpu_id); return as[gpu_id]; } diff --git a/paddle/memory/memory.h b/paddle/memory/memory.h index 5e0d64707299acb22aacff0fad237c135f614d9c..44f567caf9c19775f17988b5142b7693b41a126d 100644 --- a/paddle/memory/memory.h +++ b/paddle/memory/memory.h @@ -20,19 +20,53 @@ limitations under the License. */ namespace paddle { namespace memory { +/** + * \brief Allocate memory block in one place. + * + * \param[in] place Allocation place (CPU or GPU). + * \param[in] size Allocation size. + * + * \return Allocated memory block address. + * + * \note If return nullptr, it indicates memory allocation failed + * because insufficient memory in current system. When Alloc + * function is invoked, you must check the returned memory + * address is valid or not. + */ template -void* Alloc(Place, size_t); +void* Alloc(Place place, size_t size); +/** + * \brief Free memory block in one place. + * + * \param[in] place Allocation place (CPU or GPU). + * \param[in] ptr Memory block address to free. + * + */ template -void Free(Place, void*); +void Free(Place place, void* ptr); +/** + * \brief Total size of used memory in one place. + * + * \param[in] place Allocation place (CPU or GPU). 
+ *
+ */
 template <typename Place>
-size_t Used(Place);
+size_t Used(Place place);
 
-template <typename T,
-          typename std::enable_if<std::is_pod<T>::value>::type* = nullptr>
+/**
+ * \brief Free memory block in one place.
+ *
+ * \note In some cases, a custom deleter is used to
+ *       deallocate the memory automatically for
+ *       std::unique_ptr<T> in tensor.h.
+ *
+ */
+template <typename T, typename Place>
 class PODDeleter {
+  static_assert(std::is_pod<T>::value, "T must be POD");
+
  public:
   PODDeleter(Place place) : place_(place) {}
   void operator()(T* ptr) { Free(place_, static_cast<void*>(ptr)); }
diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
index ded963deb2b21ab797ee661b6c42b9f3495c671e..4e34ba012038dda897244b3d491a5127ee10bf45 100644
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -48,10 +48,15 @@ op_library(mul_op SRCS mul_op.cc mul_op.cu)
 op_library(rowwise_add_op SRCS rowwise_add_op.cu rowwise_add_op.cc)
 op_library(sigmoid_op SRCS sigmoid_op.cu sigmoid_op.cc)
 op_library(softmax_op SRCS softmax_op.cc softmax_op.cu)
-op_library(random_op SRCS random_op.cc random_op.cu)
+op_library(gaussian_random_op SRCS gaussian_random_op.cc gaussian_random_op.cu)
 op_library(cross_entropy_op SRCS cross_entropy_op.cc cross_entropy_op.cu)
 op_library(fc_op SRCS fc_op.cc DEPS mul_op rowwise_add_op sigmoid_op softmax_op net)
 
 op_library(sgd_op SRCS sgd_op.cc sgd_op.cu)
+
+op_library(recurrent_network_op SRCS recurrent_network_op.cc DEPS op_desc
+tensor op_registry operator net)
+cc_test(recurrent_network_op_test SRCS recurrent_network_op_test.cc DEPS
+recurrent_network_op gtest mul_op add_op)
diff --git a/paddle/operators/add_op.cc b/paddle/operators/add_op.cc
index 8d415fbd2e72af556e21f89c37d31b9fad130e3d..1424b0284372d8dfe9eb93ee251b121a48b19b0b 100644
--- a/paddle/operators/add_op.cc
+++ b/paddle/operators/add_op.cc
@@ -13,17 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License. 
*/ #include "paddle/operators/add_op.h" -#include "paddle/framework/op_registry.h" -#include "paddle/framework/tensor.h" namespace paddle { namespace operators { -class AddOp : public framework::OperatorWithKernel { +class AddOp : public OperatorWithKernel { protected: - void InferShape( - const std::vector &inputs, - const std::vector &outputs) const override { + void InferShape(const std::vector &inputs, + const std::vector &outputs) const override { PADDLE_ENFORCE(inputs.size() == 2, "Input size of AddOp must be two"); PADDLE_ENFORCE(outputs.size() == 1, "Output size of AddOp must be one"); PADDLE_ENFORCE( @@ -35,10 +32,10 @@ protected: } }; -class AddOpMaker : public framework::OpProtoAndCheckerMaker { +class AddOpMaker : public OpProtoAndCheckerMaker { public: - AddOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) - : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The first input of add op"); AddInput("Y", "The second input of add op"); AddOutput("Out", "The output of add op"); @@ -50,11 +47,10 @@ The equation is: Out = X + Y } }; -class AddOpGrad : public framework::OperatorWithKernel { +class AddOpGrad : public OperatorWithKernel { protected: - void InferShape( - const std::vector &inputs, - const std::vector &outputs) const override {} + void InferShape(const std::vector &inputs, + const std::vector &outputs) const override {} std::string DebugString() const override { LOG(INFO) << "AddOpGrad"; return ""; @@ -64,7 +60,6 @@ protected: } // namespace operators } // namespace paddle -REGISTER_OP(add_two, paddle::operators::AddOp, paddle::operators::AddOpMaker); -REGISTER_GRADIENT_OP(add_two, add_two_grad, paddle::operators::AddOpGrad); -REGISTER_OP_CPU_KERNEL( - add_two, paddle::operators::AddKernel); +REGISTER_OP(add_two, ops::AddOp, ops::AddOpMaker); +REGISTER_GRADIENT_OP(add_two, add_two_grad, ops::AddOpGrad); +REGISTER_OP_CPU_KERNEL(add_two, ops::AddKernel); diff --git a/paddle/operators/add_op.cu b/paddle/operators/add_op.cu index 2e5a755f92e4d1fa487152ed453fe3b2823062ed..79d8de6cd46e1c72b14b0554c7be7b4eee281f4c 100644 --- a/paddle/operators/add_op.cu +++ b/paddle/operators/add_op.cu @@ -1,5 +1,4 @@ -#include "paddle/operators/add_op.h" #include "paddle/framework/op_registry.h" +#include "paddle/operators/add_op.h" -REGISTER_OP_GPU_KERNEL(add_two, - paddle::operators::AddKernel); \ No newline at end of file +REGISTER_OP_GPU_KERNEL(add_two, ops::AddKernel); diff --git a/paddle/operators/add_op.h b/paddle/operators/add_op.h index 39d54a63bd16cdafeec1cfcd86ef5d142382e880..0c39433788e1e07e30aaadc4766028219b05bfa5 100644 --- a/paddle/operators/add_op.h +++ b/paddle/operators/add_op.h @@ -13,27 +13,24 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once -#include "glog/logging.h" -#include "paddle/framework/eigen.h" -#include "paddle/framework/operator.h" +#include "paddle/operators/type_alias.h" namespace paddle { namespace operators { template -class AddKernel : public framework::OpKernel { +class AddKernel : public OpKernel { public: - void Compute(const framework::KernelContext& context) const override { - auto input0 = context.Input(0)->Get(); - auto input1 = context.Input(1)->Get(); - auto* output = context.Output(0)->GetMutable(); + void Compute(const KernelContext& context) const override { + auto input0 = context.Input(0)->Get(); + auto input1 = context.Input(1)->Get(); + auto output = context.Output(0)->GetMutable(); output->mutable_data(context.GetPlace()); - framework::EigenVector::Flatten(*output).device( + EigenVector::Flatten(*output).device( *(context.GetEigenDevice())) = - framework::EigenVector::Flatten(input0) + - framework::EigenVector::Flatten(input1); + EigenVector::Flatten(input0) + EigenVector::Flatten(input1); } }; diff --git a/paddle/operators/cross_entropy_op.cc b/paddle/operators/cross_entropy_op.cc index 7d7bb09f3d63bef49913c3c7501082c509c45653..46c88d4d1a28eeedd02eb699562244651ead6d68 100644 --- a/paddle/operators/cross_entropy_op.cc +++ b/paddle/operators/cross_entropy_op.cc @@ -13,17 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/operators/cross_entropy_op.h" -#include "paddle/framework/op_registry.h" -#include "paddle/framework/tensor.h" namespace paddle { namespace operators { -class OnehotCrossEntropyOp : public framework::OperatorWithKernel { +class OnehotCrossEntropyOp : public OperatorWithKernel { protected: - void InferShape( - const std::vector &inputs, - const std::vector &outputs) const override { + void InferShape(const std::vector &inputs, + const std::vector &outputs) const override { PADDLE_ENFORCE(inputs.size() == 2, "Input size of OnehotCrossEntropyOp must be two"); PADDLE_ENFORCE(outputs.size() == 1, @@ -35,15 +32,14 @@ protected: PADDLE_ENFORCE(inputs[0]->dims().size() == 2, "X's dimension must be 2."); PADDLE_ENFORCE(outputs[0]->dims().size() == 1, "label's dimension must be 1."); - outputs[0]->Resize(framework::make_ddim({inputs[0]->dims()[0]})); + outputs[0]->Resize({inputs[0]->dims()[0]}); } }; -class OnehotCrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker { +class OnehotCrossEntropyOpMaker : public OpProtoAndCheckerMaker { public: - OnehotCrossEntropyOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) - : framework::OpProtoAndCheckerMaker(proto, op_checker) { + OnehotCrossEntropyOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The first input of OnehotCrossEntropyOp"); AddInput("label", "The second input of OnehotCrossEntropyOp"); AddOutput("Y", "The output of OnehotCrossEntropyOp"); @@ -59,9 +55,7 @@ OnehotCrossEntropy Operator. 
} // namespace paddle REGISTER_OP(onehot_cross_entropy, - paddle::operators::OnehotCrossEntropyOp, - paddle::operators::OnehotCrossEntropyOpMaker); -REGISTER_OP_CPU_KERNEL( - onehot_cross_entropy, - paddle::operators::OnehotCrossEntropyOpKernel<::paddle::platform::CPUPlace, - float>); + ops::OnehotCrossEntropyOp, + ops::OnehotCrossEntropyOpMaker); +REGISTER_OP_CPU_KERNEL(onehot_cross_entropy, + ops::OnehotCrossEntropyOpKernel); diff --git a/paddle/operators/cross_entropy_op.cu b/paddle/operators/cross_entropy_op.cu index 1bcdcb7ea650a361cad376ecdd5e96fe8e8f7c94..19e4b74596a0f59edd04db830ec6f6f481373465 100644 --- a/paddle/operators/cross_entropy_op.cu +++ b/paddle/operators/cross_entropy_op.cu @@ -1,6 +1,4 @@ #include "paddle/operators/cross_entropy_op.h" -#include "paddle/framework/op_registry.h" REGISTER_OP_GPU_KERNEL(onehot_cross_entropy, - paddle::operators::OnehotCrossEntropyOpKernel< - ::paddle::platform::GPUPlace, float>); \ No newline at end of file + ops::OnehotCrossEntropyOpKernel); \ No newline at end of file diff --git a/paddle/operators/cross_entropy_op.h b/paddle/operators/cross_entropy_op.h index ad2c7f34e1fd91b97287b4c5f4004d5b79ea4f82..0383df46be3a3cea7dde8f1b45857e64d5a2f2d8 100644 --- a/paddle/operators/cross_entropy_op.h +++ b/paddle/operators/cross_entropy_op.h @@ -13,23 +13,21 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#include "glog/logging.h" -#include "paddle/framework/operator.h" +#include "paddle/operators/type_alias.h" namespace paddle { namespace operators { template -class OnehotCrossEntropyOpKernel : public framework::OpKernel { +class OnehotCrossEntropyOpKernel : public OpKernel { public: constexpr T LOG_THRESHOLD() const { return static_cast(1e-20); } - void Compute(const framework::KernelContext& context) const override { - auto X = context.Input(0)->Get(); + void Compute(const KernelContext& context) const override { + auto X = context.Input(0)->Get(); const T* X_data = X.data(); - const int* label_data = - context.Input(1)->Get().data(); - auto* Y = context.Output(0)->GetMutable(); + const int* label_data = context.Input(1)->Get().data(); + auto* Y = context.Output(0)->GetMutable(); Y->mutable_data(context.GetPlace()); diff --git a/paddle/operators/fc_op.cc b/paddle/operators/fc_op.cc index 01e96f4c4817466e3266ca57a0d0ae2368b3e097..c4a9f5937f4fa8c60989bea1726cedbb73330156 100644 --- a/paddle/operators/fc_op.cc +++ b/paddle/operators/fc_op.cc @@ -12,41 +12,38 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/framework/net.h" -#include "paddle/framework/op_registry.h" -#include "paddle/framework/operator.h" +#include "type_alias.h" namespace paddle { namespace operators { -class FullyConnectedOp : public framework::PlainNet { +class FullyConnectedOp : public NetOp { public: void Init() override { - AddOp(framework::OpRegistry::CreateOp("mul", - { - Input("X"), Input("W"), - }, - {Output("before_act")}, - {})); + AddOp(OpRegistry::CreateOp("mul", + { + Input("X"), Input("W"), + }, + {Output("before_act")}, + {})); auto b = Input("b"); - if (b != framework::OperatorBase::EMPTY_VAR_NAME()) { - AddOp(framework::OpRegistry::CreateOp("rowwise_add", - {Output("before_act"), Input("b")}, - {Output("before_act")}, - {})); + if (b != EMPTY_VAR_NAME()) { + AddOp(OpRegistry::CreateOp("rowwise_add", + {Output("before_act"), Input("b")}, + {Output("before_act")}, + {})); } auto activation = GetAttr("activation"); - AddOp(framework::OpRegistry::CreateOp( + AddOp(OpRegistry::CreateOp( activation, {Output("before_act")}, {Output("Y")}, {})); CompleteAddOp(false); } }; -class FullyConnectedOpMaker : public framework::OpProtoAndCheckerMaker { +class FullyConnectedOpMaker : public OpProtoAndCheckerMaker { public: - FullyConnectedOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + FullyConnectedOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "the input of fc operator"); AddInput("W", "the weight of fc operator"); @@ -71,6 +68,4 @@ USE_OP(rowwise_add); USE_OP(sigmoid); USE_OP(softmax); -REGISTER_OP(fc, - paddle::operators::FullyConnectedOp, - paddle::operators::FullyConnectedOpMaker); +REGISTER_OP(fc, ops::FullyConnectedOp, ops::FullyConnectedOpMaker); diff --git a/paddle/operators/random_op.cc b/paddle/operators/gaussian_random_op.cc similarity index 100% rename from paddle/operators/random_op.cc rename to paddle/operators/gaussian_random_op.cc index 674c85134529c3bfb7a8810c9a1ee15fe249c05a..7afc0cd56b2d4719d2346f1d826e0a7bd0a796b9 100644 --- a/paddle/operators/random_op.cc +++ b/paddle/operators/gaussian_random_op.cc @@ -12,9 +12,9 @@ See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/operators/random_op.h" #include "glog/logging.h" #include "paddle/framework/op_registry.h" +#include "paddle/operators/random_op.h" namespace paddle { namespace operators { diff --git a/paddle/operators/random_op.cu b/paddle/operators/gaussian_random_op.cu similarity index 100% rename from paddle/operators/random_op.cu rename to paddle/operators/gaussian_random_op.cu diff --git a/paddle/operators/random_op.h b/paddle/operators/gaussian_random_op.h similarity index 100% rename from paddle/operators/random_op.h rename to paddle/operators/gaussian_random_op.h diff --git a/paddle/operators/mul_op.cc b/paddle/operators/mul_op.cc index cd74c8b976d18ffecd50077cc81e1fce56bea155..22c1b78005358a934c57d487f5b0cff133f61f0c 100644 --- a/paddle/operators/mul_op.cc +++ b/paddle/operators/mul_op.cc @@ -13,17 +13,14 @@ limitations under the License. 
*/ #include "paddle/operators/mul_op.h" -#include "paddle/framework/op_registry.h" -#include "paddle/framework/tensor.h" namespace paddle { namespace operators { -class MulOp : public framework::OperatorWithKernel { +class MulOp : public OperatorWithKernel { protected: - void InferShape( - const std::vector &inputs, - const std::vector &outputs) const override { + void InferShape(const std::vector &inputs, + const std::vector &outputs) const override { PADDLE_ENFORCE(inputs.size() == 2, "The mul op must take two inputs"); auto dim0 = inputs[0]->dims(); auto dim1 = inputs[1]->dims(); @@ -37,10 +34,10 @@ protected: } }; -class MulOpMaker : public framework::OpProtoAndCheckerMaker { +class MulOpMaker : public OpProtoAndCheckerMaker { public: - MulOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) - : framework::OpProtoAndCheckerMaker(proto, op_checker) { + MulOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The first input of mul op"); AddInput("Y", "The second input of mul op"); AddOutput("Out", "The output of mul op"); @@ -52,11 +49,10 @@ The equation is: Out = X * Y } }; -class MulOpGrad : public framework::OperatorWithKernel { +class MulOpGrad : public OperatorWithKernel { protected: - void InferShape( - const std::vector &inputs, - const std::vector &outputs) const override {} + void InferShape(const std::vector &inputs, + const std::vector &outputs) const override {} std::string DebugString() const override { LOG(INFO) << "MulGrad"; return ""; @@ -66,8 +62,7 @@ protected: } // namespace operators } // namespace paddle -REGISTER_OP(mul, paddle::operators::MulOp, paddle::operators::MulOpMaker); -REGISTER_GRADIENT_OP(mul, mul_grad, paddle::operators::MulOpGrad); +REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker); +REGISTER_GRADIENT_OP(mul, mul_grad, ops::MulOpGrad); -REGISTER_OP_CPU_KERNEL( - mul, paddle::operators::MulKernel); +REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel); diff --git a/paddle/operators/mul_op.cu b/paddle/operators/mul_op.cu index 3ee581dc77dc08e6e47b240588811fbc7c6ea303..c27fc886ce7238a13c8ef86bce673a2b54949a9d 100644 --- a/paddle/operators/mul_op.cu +++ b/paddle/operators/mul_op.cu @@ -13,8 +13,5 @@ limitations under the License. 
*/ #include "paddle/operators/mul_op.h" -#include "paddle/framework/op_registry.h" -REGISTER_OP_GPU_KERNEL(mul, - paddle::operators::MulKernel); \ No newline at end of file +REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel); \ No newline at end of file diff --git a/paddle/operators/mul_op.h b/paddle/operators/mul_op.h index e6bad7fb9da2d489666aa67f032552e48a86c6cb..467975044638a3f034ceec84173e8d3fed43cc0c 100644 --- a/paddle/operators/mul_op.h +++ b/paddle/operators/mul_op.h @@ -14,30 +14,27 @@ #pragma once -#include "glog/logging.h" -#include "paddle/framework/eigen.h" -#include "paddle/framework/operator.h" +#include "paddle/operators/type_alias.h" namespace paddle { namespace operators { template -class MulKernel : public framework::OpKernel { +class MulKernel : public OpKernel { public: - void Compute(const framework::KernelContext& context) const override { + void Compute(const KernelContext& context) const override { Eigen::array, 1> dim_pair = { {Eigen::IndexPair(1, 0)}}; - auto input0 = context.Input(0)->Get(); - auto input1 = context.Input(1)->Get(); - auto* output = context.Output(0)->GetMutable(); + auto input0 = context.Input(0)->Get(); + auto input1 = context.Input(1)->Get(); + auto* output = context.Output(0)->GetMutable(); output->mutable_data(context.GetPlace()); - framework::EigenMatrix::From(*output).device( - *(context.GetEigenDevice())) = - framework::EigenMatrix::From(input0).contract( - framework::EigenMatrix::From(input1), dim_pair); + EigenMatrix::From(*output).device(*(context.GetEigenDevice())) = + EigenMatrix::From(input0).contract(EigenMatrix::From(input1), + dim_pair); } }; } // namespace operators diff --git a/paddle/operators/recurrent_network_op.cc b/paddle/operators/recurrent_network_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..1a101d6ddf149d608dbdbe048ef43d86bacbcc16 --- /dev/null +++ b/paddle/operators/recurrent_network_op.cc @@ -0,0 +1,419 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/
+
+#include "paddle/operators/recurrent_network_op.h"
+
+#include
+#include
+#include
+
+#include "paddle/framework/net.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/platform/enforce.h"
+
+namespace paddle {
+namespace operators {
+
+namespace rnn {
+
+void SegmentInputs(std::vector<std::shared_ptr<Scope>>& step_scopes,
+                   const std::vector<Link>& inlinks,
+                   const size_t seq_len) {
+  PADDLE_ENFORCE(!inlinks.empty(), "no inlinks are provided.");
+  for (size_t i = 0; i < inlinks.size(); ++i) {
+    Tensor* input =
+        step_scopes[0]->GetVariable(inlinks[i].external)->GetMutable<Tensor>();
+    DDim dims = input->dims();
+    PADDLE_ENFORCE(static_cast<size_t>(dims[0]) == seq_len,
+                   "all the inlinks must have the same length");
+    DDim step_dims = slice_ddim(dims, 1, dims.size());
+    for (size_t j = 0; j < seq_len; j++) {
+      Tensor* step_input = step_scopes[j]
+                               ->CreateVariable(inlinks[i].internal)
+                               ->GetMutable<Tensor>();
+      *step_input = input->Slice<float>(j, j + 1);
+      step_input->Resize(step_dims);
+    }
+  }
+}
+
+void ConcatOutputs(std::vector<std::shared_ptr<Scope>>& step_scopes,
+                   const std::vector<Link>& outlinks,
+                   const size_t seq_len) {
+  for (size_t i = 0; i < outlinks.size(); i++) {
+    Tensor* output =
+        step_scopes[0]->GetVariable(outlinks[i].external)->GetMutable<Tensor>();
+
+    // TODO(qingqing) remove following code after adding
+    // InferShape in RecurrentGradientOp
+    DDim step_dims = step_scopes[0]
+                         ->GetVariable(outlinks[i].internal)
+                         ->GetMutable<Tensor>()
+                         ->dims();
+    std::vector<int> dims_vec = vectorize(step_dims);
+    dims_vec.insert(dims_vec.begin(), seq_len);
+    output->mutable_data<float>(make_ddim(dims_vec), platform::CPUPlace());
+
+    for (size_t j = 0; j < seq_len; j++) {
+      Tensor* step_output = step_scopes[j]
+                                ->GetVariable(outlinks[i].internal)
+                                ->GetMutable<Tensor>();
+      // TODO(luotao02) the data type and platform::DeviceContext() should be
+      // set correctly
+      (output->Slice<float>(j, j + 1))
+          .CopyFrom<float>(*step_output, platform::CPUPlace());
+    }
+  }
+}
+
+void LinkMemories(std::vector<std::shared_ptr<Scope>>& scopes,
+                  const std::vector<MemoryAttr>& memories,
+                  size_t step_id,
+                  int offset) {
+  PADDLE_ENFORCE(step_id < scopes.size(),
+                 "step [%d] is out of range of step scopes' size [%d]",
+                 step_id,
+                 scopes.size());
+  PADDLE_ENFORCE(static_cast<int>(step_id) + offset >= 0,
+                 "offset [%d] must be larger than -[%d]",
+                 offset,
+                 step_id);
+  PADDLE_ENFORCE(step_id + offset < scopes.size(),
+                 "offset [%d] is out of range, it must be less than (%d - %d)",
+                 offset,
+                 scopes.size(),
+                 step_id);
+  std::shared_ptr<Scope> scope = scopes[step_id];
+  std::shared_ptr<Scope> linked_scope = scopes[step_id + offset];
+  for (auto& attr : memories) {
+    auto mem = scope->CreateVariable(attr.pre_var)->GetMutable<Tensor>();
+    // maybe sharing the variable is better?
+    auto linked_mem = linked_scope->GetVariable(attr.var)->GetMutable<Tensor>();
+    mem->ShareDataWith<float>(*linked_mem);
+
+    // TODO(qingqing) remove following code
+    // the memory of the current step should be allocated in the step net
+    auto m = scope->CreateVariable(attr.var)->GetMutable<Tensor>();
+    // for unit test, as addOp and mulOp are null currently, without
+    // mutable_data, mem.data() in the output will be an error. We will
+    // remove this line after merging the correct addOp and mulOp.
+    m->mutable_data<float>(mem->dims(), platform::CPUPlace());
+  }
+}
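
`rnn::LinkMemories` above never copies state: the pre-state variable of one step scope is made to share the buffer of a neighboring step's state, with offset -1 used by the forward pass and offset +1 by the backward pass. A minimal standalone analogy of that sharing, using toy types and hypothetical names rather than the framework API:

```c++
// Toy model of rnn::LinkMemories: "pre_h" of one step aliases "h" of the
// neighboring step, so writes through one name are visible through the other.
#include <cassert>
#include <map>
#include <memory>
#include <string>
#include <vector>

using Buffer = std::shared_ptr<std::vector<float>>;
using Scope = std::map<std::string, Buffer>;

// offset = -1 links forward (step i sees step i-1's state);
// offset = +1 links backward for the gradient pass.
void LinkMemories(std::vector<Scope>& scopes, size_t step_id, int offset) {
  Scope& scope = scopes[step_id];
  Scope& linked = scopes[step_id + offset];
  scope["pre_h"] = linked["h"];  // share the buffer, do not copy it
}

int main() {
  std::vector<Scope> scopes(4);
  for (auto& s : scopes) s["h"] = std::make_shared<std::vector<float>>(8, 1.f);
  for (size_t i = 1; i < scopes.size(); ++i) LinkMemories(scopes, i, -1);
  (*scopes[0]["h"])[0] = 42.f;  // visible through step 1's "pre_h" alias
  assert((*scopes[1]["pre_h"])[0] == 42.f);
  return 0;
}
```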
+
+void InitArgument(const ArgumentName& name,
+                  Argument* arg,
+                  const OperatorBase& op) {
+  arg->step_net = op.Input(name.step_net);
+  arg->step_scopes = op.Output(name.step_scopes);
+
+  auto inlinks = op.Inputs(name.inlinks);
+  auto inlink_alias = op.GetAttr<std::vector<std::string>>(name.inlink_alias);
+  PADDLE_ENFORCE(inlinks.size() == inlink_alias.size(),
+                 "the sizes of inlinks and inlink_alias do not match: %d, %d",
+                 inlinks.size(),
+                 inlink_alias.size());
+  for (size_t i = 0; i < inlinks.size(); ++i) {
+    rnn::Link link;
+    link.external = inlinks[i];
+    link.internal = inlink_alias[i];
+    (arg->inlinks).push_back(link);
+  }
+
+  auto outlinks = op.Outputs(name.outlinks);
+  auto outlink_alias = op.GetAttr<std::vector<std::string>>(name.outlink_alias);
+  PADDLE_ENFORCE(outlinks.size() == outlink_alias.size(),
+                 "the sizes of outlinks and outlink_alias do not match: %d, %d",
+                 outlinks.size(),
+                 outlink_alias.size());
+  for (size_t i = 0; i < outlinks.size(); ++i) {
+    rnn::Link link;
+    link.external = outlinks[i];
+    link.internal = outlink_alias[i];
+    (arg->outlinks).push_back(link);
+  }
+
+  auto boot_memories = op.Inputs(name.boot_memories);
+
+  // attributes
+  auto memories = op.GetAttr<std::vector<std::string>>(name.memories);
+  auto pre_memories = op.GetAttr<std::vector<std::string>>(name.pre_memories);
+
+  PADDLE_ENFORCE(memories.size() == boot_memories.size(),
+                 "the sizes of memories and boot_memories do not match: %d, %d",
+                 memories.size(),
+                 boot_memories.size());
+  PADDLE_ENFORCE(pre_memories.size() == boot_memories.size(),
+                 "the sizes of pre_memories and boot_memories do not match: %d, %d",
+                 pre_memories.size(),
+                 boot_memories.size());
+  PADDLE_ENFORCE(memories.size() > 0, "at least one memory should be set");
+
+  for (size_t i = 0; i < memories.size(); ++i) {
+    rnn::MemoryAttr mem_attr;
+    mem_attr.var = memories[i];
+    mem_attr.pre_var = pre_memories[i];
+    mem_attr.boot_var = boot_memories[i];
+    (arg->memories).push_back(mem_attr);
+  }
+}
+
+}  // namespace rnn
+
+void RecurrentAlgorithm::InferShape(const std::shared_ptr<Scope>& scope) const {
+  seq_len_ = scope->GetVariable((arg_->inlinks[0]).external)
+                 ->GetMutable<Tensor>()
+                 ->dims()[0];
+  CreateScopes(scope);
+  auto step_scopes = GetStepScopes(scope);
+
+  // SegmentInputs is called in InferShape. The input must hold memory in
+  // SegmentInputs. But the other ops only set the dimension of the output in
+  // InferShape. That is a problem: does the RNN op need InferShape or not?
+  // Do the following functions (SegmentInputs, InitMemories, ...) need to be
+  // rewritten for the RNN op?
+ rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len_); + + InitMemories(step_scopes[0]); + + PADDLE_ENFORCE(scope->HasVariable(arg_->step_net), + "stepnet [%s] is not in scope.", + arg_->step_net); + Variable* net = scope->GetVariable(arg_->step_net); + PADDLE_ENFORCE(net != nullptr, "failed to get step net"); + // If the InferShape is called in OperatorBase's run function, + // the rnn op only needs to do InferShape for the first time step + for (size_t i = 0; i < seq_len_; i++) { + if (i > 0) { + rnn::LinkMemories(step_scopes, arg_->memories, i, -1); + } + net->GetMutable()->InferShape(step_scopes[i]); + } + + auto outlinks = arg_->outlinks; + for (size_t i = 0; i < outlinks.size(); i++) { + DDim step_dims = step_scopes[0] + ->GetVariable(outlinks[i].internal) + ->GetMutable() + ->dims(); + std::vector dims_vec = vectorize(step_dims); + // now only support fixed length + dims_vec.insert(dims_vec.begin(), seq_len_); + Tensor* output = + step_scopes[0]->GetVariable(outlinks[i].external)->GetMutable(); + output->Resize(make_ddim(dims_vec)); + } +} + +void RecurrentAlgorithm::Run(const std::shared_ptr& scope, + const platform::DeviceContext& dev_ctx) const { + auto step_scopes = GetStepScopes(scope); + + Variable* net = scope->GetVariable(arg_->step_net); + for (size_t step_id = 0; step_id < seq_len_; step_id++) { + // the link memory is done in InferShape + // maybe remove following code after testing + if (step_id > 0) { + rnn::LinkMemories(step_scopes, arg_->memories, step_id, -1); + } + net->GetMutable()->Run(step_scopes[step_id], dev_ctx); + } + + rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len_); +} + +void RecurrentAlgorithm::CreateScopes(std::shared_ptr scope) const { + // TODO(xxx) Only two scopes are needed for inference, this case will be + // supported later. + auto step_scopes = scope->GetVariable(arg_->step_scopes) + ->GetMutable>>(); + + if (seq_len_ > step_scopes->size()) { + for (size_t i = step_scopes->size(); i < seq_len_; ++i) { + std::shared_ptr step_scope = std::make_shared(scope); + + // Now all variables in scope must be created outside of op. 
+ auto net_op = scope->GetVariable(arg_->step_net)->GetMutable(); + for (auto& input : net_op->inputs_) { + step_scope->CreateVariable(input); + } + for (auto& output : net_op->outputs_) { + step_scope->CreateVariable(output); + } + + step_scopes->push_back(std::make_shared(step_scope)); + } + } +} + +void RecurrentAlgorithm::InitMemories(std::shared_ptr step_scope) const { + for (auto& attr : arg_->memories) { + Tensor* pre_mem = + step_scope->CreateVariable(attr.pre_var)->GetMutable(); + PADDLE_ENFORCE(step_scope->HasVariable(attr.boot_var), + "memory [%s]'s boot variable [%s] not exists", + attr.var, + attr.boot_var); + Tensor* boot_mem = + step_scope->GetVariable(attr.boot_var)->GetMutable(); + pre_mem->ShareDataWith(*boot_mem); + + // TODO(qingqing) remove following code + // the memory of current step should be allocated in step net + // here for unit test + auto cur_step_mem = + step_scope->CreateVariable(attr.var)->GetMutable(); + cur_step_mem->mutable_data(boot_mem->dims(), platform::CPUPlace()); + } +} + +const rnn::ArgumentName RecurrentOp::kArgName{"step_net", + "step_scopes", + "inlinks", + "outlinks", + "inlink_alias", + "outlink_alias", + "memories", + "pre_memories", + "boot_memories"}; + +const rnn::ArgumentName RecurrentGradientOp::kArgName{"step_net", + "step_scopes", + "outlink@grad", + "inlink@grad", + "inlink_alias", + "outlink_alias", + "memories", + "pre_memories", + "boot_memories@grad"}; + +void RecurrentOp::Init() { + OperatorBase::Init(); + std::unique_ptr arg(new rnn::Argument()); + rnn::InitArgument(kArgName, arg.get(), *this); + alg_.Init(std::move(arg)); +} + +class RecurrentAlgorithmProtoAndCheckerMaker : public OpProtoAndCheckerMaker { +public: + RecurrentAlgorithmProtoAndCheckerMaker(OpProto* proto, + OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + const auto& name = RecurrentOp::kArgName; + // inputs and outputs stored in proto + AddInputs(name.inlinks, + "the input that need to be segmented for each step."); + AddInputs(name.boot_memories, "variables to initialize memories."); + AddInput(name.step_net, "network shared by all steps."); + + AddOutputs(name.outlinks, + "the output that need to concated for all steps."); + AddOutput(name.step_scopes, "step scopes"); + + // Attributes stored in AttributeMap + AddAttr>(name.inlink_alias, "alias of inlinks"); + AddAttr>(name.outlink_alias, "alias of outlinks"); + AddAttr>(name.pre_memories, + "names of pre-memories"); + AddAttr>(name.memories, "names of memories"); + + AddComment("This is a recurrent group operator."); + } +}; + +void RecurrentGradientAlgorithm::Run( + const std::shared_ptr& scope, + const platform::DeviceContext& dev_ctx) const { + auto step_scopes = GetStepScopes(scope); + rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len_); + PADDLE_ENFORCE(scope->HasVariable(arg_->step_net), + "step net is not in scope."); + Variable* net = scope->GetVariable(arg_->step_net); + PADDLE_ENFORCE(net != nullptr, "failed to get step net"); + for (int step_id = seq_len_ - 1; step_id >= 0; --step_id) { + if (static_cast(step_id) != seq_len_ - 1) { + rnn::LinkMemories(step_scopes, arg_->memories, step_id, 1); + } + net->GetMutable()->Run(step_scopes[step_id], dev_ctx); + } + LinkBootMemoryGradients(step_scopes[0]); + rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len_); +} + +void RecurrentGradientAlgorithm::LinkBootMemoryGradients( + std::shared_ptr step_scope) const { + for (auto& attr : arg_->memories) { + Tensor* mem_grad = + 
step_scope->CreateVariable(attr.var)->GetMutable<Tensor>();
+    PADDLE_ENFORCE(mem_grad != nullptr,
+                   "boot_tensor should be retrieved before");
+    PADDLE_ENFORCE(step_scope->HasVariable(attr.boot_var),
+                   "memory [%s]'s boot variable [%s] does not exist",
+                   attr.var,
+                   attr.boot_var);
+    Tensor* boot_mem_grad =
+        step_scope->CreateVariable(attr.boot_var)->GetMutable<Tensor>();
+    boot_mem_grad->ShareDataWith<float>(*mem_grad);
+  }
+}
+
+void RecurrentGradientAlgorithm::InferShape(
+    const std::shared_ptr<Scope>& scope) const {
+  seq_len_ = scope->GetVariable((arg_->inlinks[0]).external)
+                 ->GetMutable<Tensor>()
+                 ->dims()[0];
+  auto step_scopes = GetStepScopes(scope);
+  rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len_);
+
+  PADDLE_ENFORCE(scope->HasVariable(arg_->step_net),
+                 "step net is not in scope.");
+  Variable* net = scope->GetVariable(arg_->step_net);
+  PADDLE_ENFORCE(net != nullptr, "failed to get step net");
+
+  for (int step_id = seq_len_ - 1; step_id >= 0; --step_id) {
+    if (static_cast<size_t>(step_id) != seq_len_ - 1) {
+      rnn::LinkMemories(step_scopes, arg_->memories, step_id, 1);
+    }
+    net->GetMutable<NetOp>()->InferShape(step_scopes[step_id]);
+  }
+
+  auto outlinks = arg_->outlinks;
+  for (size_t i = 0; i < outlinks.size(); i++) {
+    DDim step_dims = step_scopes[0]
+                         ->GetVariable(outlinks[i].internal)
+                         ->GetMutable<Tensor>()
+                         ->dims();
+    std::vector<int> dims_vec = vectorize(step_dims);
+    // now only support fixed length
+    dims_vec.insert(dims_vec.begin(), seq_len_);
+    Tensor* output =
+        step_scopes[0]->GetVariable(outlinks[i].external)->GetMutable<Tensor>();
+    output->Resize(make_ddim(dims_vec));
+  }
+  LinkBootMemoryGradients(step_scopes[0]);
+}
+
+void RecurrentGradientOp::Init() {
+  OperatorBase::Init();
+  std::unique_ptr<rnn::Argument> arg(new rnn::Argument());
+  rnn::InitArgument(kArgName, arg.get(), *this);
+  alg_.Init(std::move(arg));
+}
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OP(recurrent_op,
+            paddle::operators::RecurrentOp,
+            paddle::operators::RecurrentAlgorithmProtoAndCheckerMaker);
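
The registration above completes the forward operator. Its `InitArgument` (earlier in this file) leans on one convention: the i-th entries of the `memories`, `pre_memories` and `boot_memories` lists describe the same recurrent state. A standalone sketch of that zipping, with hypothetical names and none of the framework types:

```c++
// Zip three parallel name lists into per-state attributes, mirroring the
// convention InitArgument enforces with its size checks.
#include <cassert>
#include <iostream>
#include <string>
#include <vector>

struct MemoryAttr {
  std::string var, pre_var, boot_var;
};

std::vector<MemoryAttr> ZipMemories(const std::vector<std::string>& memories,
                                    const std::vector<std::string>& pre_memories,
                                    const std::vector<std::string>& boot_memories) {
  assert(memories.size() == pre_memories.size());
  assert(memories.size() == boot_memories.size());
  std::vector<MemoryAttr> attrs;
  for (size_t i = 0; i < memories.size(); ++i) {
    attrs.push_back({memories[i], pre_memories[i], boot_memories[i]});
  }
  return attrs;
}

int main() {
  auto attrs = ZipMemories({"rnn/h"}, {"rnn/h@pre"}, {"h_boot"});
  std::cout << attrs[0].var << " <- " << attrs[0].pre_var
            << " (boot: " << attrs[0].boot_var << ")\n";
  return 0;
}
```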
diff --git a/paddle/operators/recurrent_network_op.h b/paddle/operators/recurrent_network_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..8946c8ce38117c391edcf56558c640ebd0d7f75c
--- /dev/null
+++ b/paddle/operators/recurrent_network_op.h
@@ -0,0 +1,216 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/operator.h"
+
+namespace paddle {
+namespace operators {
+
+using namespace paddle::framework;
+
+namespace rnn {
+
+/**
+ * Memory of an RNN (same as the role of `Memory` in PaddlePaddle).
+ *
+ * Memory attributes cached by this op; dims will be inferred from the
+ * boot memories in the parent scope. Other attributes are copied from the
+ * Op's proto attributes.
+ */
+struct MemoryAttr {
+  // name of current state variable
+  std::string var;
+  // name of previous step's state variable
+  std::string pre_var;
+  // name of the variable used to init this memory (same role as `boot_layer`
+  // in PaddlePaddle), which is stored in the parent scope.
+  std::string boot_var;
+};
+
+struct Link {
+  // the input or output link's name.
+  std::string internal;
+  // alias to avoid duplicate keys in scopes.
+  std::string external;
+};
+
+struct Argument {
+  std::string step_net;
+  std::string step_scopes;
+  std::vector<Link> inlinks;
+  std::vector<Link> outlinks;
+  std::vector<MemoryAttr> memories;
+};
+
+struct ArgumentName {
+  std::string step_net;
+  std::string step_scopes;
+  std::string inlinks;
+  std::string outlinks;
+  std::string inlink_alias;   // the alias of inlinks in step net.
+  std::string outlink_alias;  // the alias of outlinks in step net.
+  std::string memories;       // the memory name
+  std::string pre_memories;   // the previous memory name
+  std::string boot_memories;  // the boot memory name
+};
+
+/**
+ * Prepare inputs for each step net.
+ */
+void SegmentInputs(std::vector<std::shared_ptr<Scope>>& step_scopes,
+                   const std::vector<Link>& inlinks,
+                   const size_t seq_len);
+
+/**
+ * Process outputs of step nets and merge them into variables.
+ */
+void ConcatOutputs(std::vector<std::shared_ptr<Scope>>& step_scopes,
+                   const std::vector<Link>& outlinks,
+                   const size_t seq_len);
+
+void LinkMemories(std::vector<std::shared_ptr<Scope>>& step_scopes,
+                  const std::vector<MemoryAttr>& memories,
+                  size_t step_id,
+                  int offset);
+
+void InitArgument(const ArgumentName& name,
+                  Argument* arg,
+                  const OperatorBase& op);
+
+}  // namespace rnn
+
+// The sequence format in RecurrentOp is Tensor now.
+// TODO:
+// 1. No-padding computing for sequences with indefinite length in one batch.
+// 2. Hierarchical RNN for sequences with sub-sequences.
+// 3. Internal Memory.
+// 4. More complex RNN architectures, such as Gated Feedback RNN.
+//    Refer to: https://arxiv.org/pdf/1502.02367.pdf
+
+class RecurrentAlgorithm {
+public:
+  void Run(const std::shared_ptr<Scope>& scope,
+           const platform::DeviceContext& dev_ctx) const;
+
+  void Init(std::unique_ptr<rnn::Argument> arg) { arg_ = std::move(arg); }
+
+  /**
+   * InferShape must be called before Run.
+   */
+  void InferShape(const std::shared_ptr<Scope>& scope) const;
+
+protected:
+  /*
+   * The step scopes will be stored in the parent scope as a variable.
+   *
+   * NOTE the scopes are reused by both the forward and the backward pass, so
+   * they are created once and expanded if more steps are needed.
+   */
+  void CreateScopes(std::shared_ptr<Scope> scope) const;
+
+  inline const std::vector<std::shared_ptr<Scope>>& GetStepScopes(
+      std::shared_ptr<Scope> scope) const {
+    return *(scope->GetVariable(arg_->step_scopes))
+                ->GetMutable<std::vector<std::shared_ptr<Scope>>>();
+  }
+
+  void InitMemories(std::shared_ptr<Scope> step_scopes) const;
+
+private:
+  std::unique_ptr<rnn::Argument> arg_;
+  mutable size_t seq_len_;
+};
+
+class RecurrentGradientAlgorithm {
+  /**
+   * RNN's backward algorithm.
+   *
+   * To accelerate the development of RecurrentGradientOp, we decouple the
+   * RNN's algorithm from the `OperatorBase` implementation: the former
+   * contains the core implementation of an RNN and stays stable even if the
+   * framework changes a lot, while the latter is a wrapper that acts as an
+   * adapter to make the RNN an operator.
+   */
+public:
+  void Init(std::unique_ptr<rnn::Argument> arg) { arg_ = std::move(arg); }
+
+  void Run(const std::shared_ptr<Scope>& scope,
+           const platform::DeviceContext& dev_ctx) const;
+
+  void LinkBootMemoryGradients(std::shared_ptr<Scope> step_scopes) const;
+
+  /**
+   * InferShape must be called before Run.
+ */ + void InferShape(const std::shared_ptr& scope) const; + +protected: + inline const std::vector>& GetStepScopes( + std::shared_ptr scope) const { + return *(scope->GetVariable(arg_->step_scopes)) + ->GetMutable>>(); + } + +private: + std::unique_ptr arg_; + mutable size_t seq_len_; +}; + +class RecurrentOp final : public OperatorBase { +public: + void Init() override; + + /** + * InferShape must be called before Run. + */ + virtual void InferShape(const std::shared_ptr& scope) const override { + alg_.InferShape(scope); + } + + virtual void Run(const std::shared_ptr& scope, + const platform::DeviceContext& dev_ctx) const override { + alg_.Run(scope, dev_ctx); + } + + static const rnn::ArgumentName kArgName; + +private: + RecurrentAlgorithm alg_; +}; + +class RecurrentGradientOp final : public OperatorBase { +public: + void Init() override; + + /** + * InferShape must be called before Run. + */ + virtual void InferShape(const std::shared_ptr& scope) const override { + alg_.InferShape(scope); + } + + virtual void Run(const std::shared_ptr& scope, + const platform::DeviceContext& dev_ctx) const override { + alg_.Run(scope, dev_ctx); + } + + static const rnn::ArgumentName kArgName; + +private: + RecurrentGradientAlgorithm alg_; +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/recurrent_network_op_test.cc b/paddle/operators/recurrent_network_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..6784ac6001ad1b464d65814cff1ad6247826ad66 --- /dev/null +++ b/paddle/operators/recurrent_network_op_test.cc @@ -0,0 +1,400 @@ +/* + Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#include +#include + +#include "paddle/framework/net.h" +#include "paddle/framework/op_registry.h" +#include "paddle/framework/operator.h" +#include "paddle/framework/tensor.h" +#include "paddle/operators/recurrent_network_op.h" + +namespace paddle { +namespace operators { + +class RecurrentOpTest : public ::testing::Test { +protected: + virtual void SetUp() override { + CreateGlobalVariables(); + CreateStepNet(); + CreateRNNOp(); + } + + virtual void TearDown() override {} + + void CreateGlobalVariables() { + scope_ = std::make_shared(); + // create input, and init content + LOG(INFO) << "create global variable x"; + for (auto inlink : std::vector{"x", "x0", "x1", "h"}) { + Variable* x = scope_->CreateVariable(inlink); + DDim dims = make_ddim(std::vector{ + 10 /*sent size*/, 20 /*batch size*/, 30 /*input dim*/}); + x->GetMutable()->mutable_data(dims, platform::CPUPlace()); + } + // create output alias just for test + for (auto inlink : std::vector{"h@alias"}) { + Variable* x = scope_->CreateVariable(inlink); + DDim dims = + make_ddim(std::vector{20 /*batch size*/, 30 /*input dim*/}); + x->GetMutable()->mutable_data(dims, platform::CPUPlace()); + } + + LOG(INFO) << "create global variable w"; + Variable* w = scope_->CreateVariable("rnn/w"); + w->GetMutable()->mutable_data( + make_ddim(std::vector{30, 30}), platform::CPUPlace()); + + for (auto boot : std::vector{"x_boot", "h_boot"}) { + LOG(INFO) << "create global variable " << boot; + Variable* h_boot = scope_->CreateVariable(boot); + h_boot->GetMutable()->mutable_data( + make_ddim(std::vector{20 /*batch size*/, 30 /*input dim*/}), + platform::CPUPlace()); + } + + LOG(INFO) << "create variable step_scopes"; + scope_->CreateVariable("step_scopes"); + + LOG(INFO) << "create variable h"; + scope_->CreateVariable("h"); + } + + void CreateRNNOp() { + OpDesc op_desc; + + op_desc.set_type("recurrent_op"); + // inlinks 0 + op_desc.add_inputs("x"); + op_desc.add_inputs("x0"); + op_desc.add_inputs("x1"); + // boot_memories 3 + op_desc.add_inputs("x_boot"); + op_desc.add_inputs("h_boot"); + // step net 5 + op_desc.add_inputs("step_net"); + // outlinks 6 + op_desc.add_outputs("h"); + // step scopes 7 + op_desc.add_outputs("step_scopes"); + + auto _input_format = std::vector{ + 0, // in_link + 3, // memories + 5 // step_net + }; + auto input_format = op_desc.add_attrs(); + input_format->set_name("input_format"); + input_format->set_type(paddle::framework::AttrType::INTS); + for (auto i : _input_format) { + input_format->add_ints(i); + } + + auto output_format = op_desc.add_attrs(); + output_format->set_name("output_format"); + output_format->set_type(paddle::framework::AttrType::INTS); + for (auto i : std::vector{0, 1, 2}) { + output_format->add_ints(i); + } + + auto inlink_alias = op_desc.add_attrs(); + inlink_alias->set_name("inlink_alias"); + inlink_alias->set_type(paddle::framework::AttrType::STRINGS); + + auto outlink_alias = op_desc.add_attrs(); + outlink_alias->set_name("outlink_alias"); + outlink_alias->set_type(paddle::framework::AttrType::STRINGS); + + auto pre_memories = op_desc.add_attrs(); + pre_memories->set_name("pre_memories"); + pre_memories->set_type(paddle::framework::AttrType::STRINGS); + + auto memories = op_desc.add_attrs(); + memories->set_name("memories"); + memories->set_type(paddle::framework::AttrType::STRINGS); + + // create inlink_alias + for (const auto& item : + std::vector{"x@alias", "x0@alias", "x1@alias"}) { + inlink_alias->add_strings(item); + } + // pre memories + for (const auto& item : + 
std::vector{"rnn/x@pre", "rnn/h@pre"}) { + pre_memories->add_strings(item); + } + // memories + for (const auto& item : std::vector{"rnn/x", "rnn/h"}) { + memories->add_strings(item); + } + // output alias + for (const auto& item : std::vector{"h@alias"}) { + outlink_alias->add_strings(item); + } + + rnn_op_ = OpRegistry::CreateOp(op_desc); + + LOG(INFO) << "rnn_op finish init"; + } + + void CreateStepNet() { + LOG(INFO) << "create variable step_net"; + Variable* var = scope_->CreateVariable("step_net"); + auto net = var->GetMutable(); + // rnn/s is net's input or output? + net->inputs_ = {"rnn/h@pre", "rnn/w", "rnn/x"}; + net->inputs_ = {"rnn/s", "rnn/h"}; + net->AddOp( + OpRegistry::CreateOp("mul", {"rnn/h@pre", "rnn/w"}, {"rnn/s"}, {})); + + net->AddOp( + OpRegistry::CreateOp("add_two", {"rnn/x", "rnn/s"}, {"rnn/h"}, {})); + net->CompleteAddOp(); + } + + // father scope + std::shared_ptr scope_; + std::shared_ptr rnn_op_; +}; + +TEST_F(RecurrentOpTest, Run) { + platform::CPUDeviceContext ctx; + rnn_op_->InferShape(scope_); + rnn_op_->Run(scope_, ctx); +} + +class RecurrentGradientAlgorithmTest : public ::testing::Test { +protected: + virtual void SetUp() override { + CreateGlobalVariables(); + CreateStepScopes(); + CreateStepNet(); + CreateRNNGradientAlgorithm(); + + // segment inputs + SegmentInputs(); + // link forward memories + LinkeMemories(); + } + + virtual void TearDown() override {} + + void CreateGlobalVariables() { + scope_ = std::make_shared(); + // inputs: x + LOG(INFO) << "create global variable x"; + Variable* x = scope_->CreateVariable("x"); + DDim dims = + make_ddim({10 /*sent size*/, 20 /*batch size*/, 30 /*input dim*/}); + x->GetMutable()->mutable_data(dims, platform::CPUPlace()); + // inputs: h_boot + LOG(INFO) << "create global variable h_boot"; + Variable* h_boot = scope_->CreateVariable("h_boot"); + h_boot->GetMutable()->mutable_data( + make_ddim({20 /*batch size*/, 30 /*input dim*/}), platform::CPUPlace()); + // inputs: w + LOG(INFO) << "create global variable w"; + Variable* w = scope_->CreateVariable("rnn/w"); + w->GetMutable()->mutable_data(make_ddim({30, 30}), + platform::CPUPlace()); + // inputs: h_grad + LOG(INFO) << "create variable h_grad"; + Variable* dh = scope_->CreateVariable("h_grad"); + dh->GetMutable()->mutable_data(make_ddim({10, 20, 30}), + platform::CPUPlace()); + // inputs: step_scopes + LOG(INFO) << "create variable step_scopes"; + scope_->CreateVariable("step_scopes"); + // inputs: step_net + LOG(INFO) << "create variable step_net"; + scope_->CreateVariable("step_net"); + // outputs: w_grad + LOG(INFO) << "create global variable w_grad"; + scope_->CreateVariable("rnn/w_grad"); + // outputs: x_grad + LOG(INFO) << "create global variable x_grad"; + scope_->CreateVariable("x_grad"); + // outputs: h_boot_grad + LOG(INFO) << "create global variable h_boot_grad"; + scope_->CreateVariable("h_boot_grad"); + } + + void CreateStepScopes() { + std::vector>* step_scopes = + scope_->GetVariable("step_scopes") + ->GetMutable>>(); + for (int i = 0; i < 10; ++i) { + auto scope = std::make_shared(scope_); + auto pre_t = scope->CreateVariable("rnn/pre_h")->GetMutable(); + pre_t->mutable_data(make_ddim({20, 30}), platform::CPUPlace()); + auto tensor = scope->CreateVariable("rnn/h")->GetMutable(); + tensor->mutable_data(make_ddim({20, 30}), platform::CPUPlace()); + + // for unit test of ConcatOutputs + auto xg = scope->CreateVariable("rnn/x_grad")->GetMutable(); + xg->mutable_data(make_ddim({20, 30}), platform::CPUPlace()); + + step_scopes->push_back(scope); + } 
+
+    // last time step
+    auto g = (*step_scopes)[9]
+                 ->CreateVariable("rnn/h_pre_grad")
+                 ->GetMutable<Tensor>();
+    g->mutable_data<float>(make_ddim({20, 30}), platform::CPUPlace());
+  }
+
+  void CreateRNNGradientAlgorithm() {
+    std::unique_ptr<rnn::Argument> arg(new rnn::Argument());
+    arg->step_net = "step_net";
+    arg->step_scopes = "step_scopes";
+    rnn::Link inlink;
+    inlink.external = "h_grad";
+    inlink.internal = "rnn/h_grad";
+    arg->inlinks = std::vector<rnn::Link>{inlink};
+
+    rnn::Link outlink;
+    outlink.external = "x_grad";
+    outlink.internal = "rnn/x_grad";
+    arg->outlinks = std::vector<rnn::Link>{outlink};
+
+    rnn::MemoryAttr mem_attr;
+    mem_attr.pre_var = "rnn/h_pre_grad";
+    mem_attr.var = "rnn/h_grad";
+    mem_attr.boot_var = "h_boot_grad";
+    arg->memories = std::vector<rnn::MemoryAttr>{mem_attr};
+
+    rnn_grad_algo_.Init(std::move(arg));
+  }
+
+  void CreateStepNet() {
+    LOG(INFO) << "create variable step_net";
+    Variable* var = scope_->CreateVariable("step_net");
+    auto net = var->GetMutable<NetOp>();
+    net->AddOp(OpRegistry::CreateOp("mul",
+                                    {"rnn/h_pre", "rnn/w", "rnn/s_grad"},
+                                    {"rnn/h_pre_grad", "rnn/w_grad"},
+                                    {}));
+
+    net->AddOp(OpRegistry::CreateOp(
+        "add_two", {"rnn/h_grad"}, {"rnn/x_grad", "rnn/s_grad"}, {}));
+    net->CompleteAddOp();
+  }
+
+  void SegmentInputs() {
+    LOG(INFO) << "segment inputs";
+    std::vector<std::string> inlinks = {"x"};
+    std::vector<std::string> inlinks_alias = {"rnn/x"};
+
+    rnn::Link inlink;
+    inlink.external = "x";
+    inlink.internal = "rnn/x";
+    std::vector<std::shared_ptr<Scope>>* step_scopes =
+        scope_->GetVariable("step_scopes")
+            ->GetMutable<std::vector<std::shared_ptr<Scope>>>();
+    rnn::SegmentInputs(*step_scopes, std::vector<rnn::Link>{inlink}, 10);
+  }
+
+  void LinkMemories() {
+    LOG(INFO) << "link memories";
+    rnn::MemoryAttr mem_attr;
+    mem_attr.pre_var = "rnn/h_pre";
+    mem_attr.var = "rnn/h";
+    mem_attr.boot_var = "boot_h";
+    std::vector<rnn::MemoryAttr> memories;
+    memories.push_back(mem_attr);
+    std::vector<std::shared_ptr<Scope>>* step_scopes =
+        scope_->GetVariable("step_scopes")
+            ->GetMutable<std::vector<std::shared_ptr<Scope>>>();
+    for (int i = 1; i < 10; ++i) {
+      rnn::LinkMemories(*step_scopes, memories, i, -1);
+    }
+  }
+
+  std::shared_ptr<Scope> scope_;
+  RecurrentGradientAlgorithm rnn_grad_algo_;
+};
+
+// TEST_F(RecurrentGradientAlgorithmTest, Run) {
+//   platform::CPUDeviceContext ctx;
+//   rnn_grad_algo_.Run(scope_, ctx);
+// }
+
+}  // namespace operators
+}  // namespace paddle
+
+TEST(RecurrentOp, LinkMemories) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+  using namespace paddle::operators;
+
+  // create and init step scopes
+  int len = 10;
+  std::vector<std::shared_ptr<Scope>> step_scopes;
+  for (int i = 0; i < len; ++i) {
+    auto scope = std::make_shared<Scope>();
+    scope->CreateVariable("pre_h");
+    auto tensor = scope->CreateVariable("h")->GetMutable<Tensor>();
+    float* data = tensor->mutable_data<float>(make_ddim({15, 20}), CPUPlace());
+    for (int j = 0; j < 15 * 20; ++j) {
+      data[j] = rand() * (1. / (double)RAND_MAX);
+    }
+    step_scopes.push_back(scope);
+  }
+
+  // create MemoryAttr
+  rnn::MemoryAttr mem_attr;
+  mem_attr.pre_var = "pre_h";
+  mem_attr.var = "h";
+  mem_attr.boot_var = "boot_h";
+  std::vector<rnn::MemoryAttr> memories;
+  memories.push_back(mem_attr);
+
+  for (int i = 1; i < len; ++i) {
+    rnn::LinkMemories(step_scopes, memories, i, -1);
+  }
+  // check
+  for (int i = 0; i < len - 1; ++i) {
+    const float* a =
+        step_scopes[i]->GetVariable("h")->GetMutable<Tensor>()->data<float>();
+    const float* b = step_scopes[i + 1]
+                         ->GetVariable("pre_h")
+                         ->GetMutable<Tensor>()
+                         ->data<float>();
+    for (size_t j = 0; j < 15 * 20; ++j) {
+      ASSERT_FLOAT_EQ(a[j], b[j]);
+    }
+  }
+
+  for (int i = len - 2; i >= 0; --i) {
+    rnn::LinkMemories(step_scopes, memories, i, 1);
+  }
+  // check
+  for (int i = len - 2; i >= 0; --i) {
+    const float* a = step_scopes[i]
+                         ->GetVariable("pre_h")
+                         ->GetMutable<Tensor>()
+                         ->data<float>();
+    const float* b = step_scopes[i + 1]
+                         ->GetVariable("h")
+                         ->GetMutable<Tensor>()
+                         ->data<float>();
+    for (size_t j = 0; j < 15 * 20; ++j) {
+      ASSERT_FLOAT_EQ(a[j], b[j]);
+    }
+  }
+}
+
+USE_OP(add_two);
+USE_OP(mul);
diff --git a/paddle/operators/rnn_design.md b/paddle/operators/rnn_design.md
new file mode 100644
index 0000000000000000000000000000000000000000..3d38b9a0ad225fd8e0c1bb037474b292b1887f5b
--- /dev/null
+++ b/paddle/operators/rnn_design.md
@@ -0,0 +1,239 @@
+# Design for variable-length input to RNNs
+For learning from variable-length sequences, the current mainstream frameworks such as TensorFlow, PyTorch, Caffe2 and MXNet all use padding,
+that is, the sequences of different lengths within a mini-batch are padded with zeros to a fixed length before taking part in the computation.
+
+The existing RNNs in Paddle, including `RecurrentLayerGroup`, already support variable-length sequences without padding. Based on the ideas of that module, this document designs the variable-length sequence support of the refactored framework.
+
+## Background
+Since a tensor must have a definite shape, the mainstream tensor-based frameworks have to use zero-padding to complete variable-length sequences into tensors of fixed shape when storing them.
+
+Padding is a compromise a framework makes to handle variable-length sequences; from the user's point of view, people naturally mind the padding when using RNN-like models,
+which is why there are long discussions in PyTorch about supporting variable-length sequences without padding [3].
+
+Since padding costs extra memory and computation, TensorFlow and MXNet both use bucketing as an optimization [1][2],
+but whether padding or bucketing, it is an extra burden on the user.
+
+Therefore, **native support for variable-length sequences directly satisfies the user's most immediate need, and can be counted as a major advantage of Paddle among the current mainstream platforms**.
+
+Supporting variable-length sequences, however, requires some changes to the current framework; the following discusses how to support them with minimal modification.
+
+## `LODTensor`, a data format for multi-level sequences
+Currently Paddle stores the data of a mini-batch in one-dimensional memory,
+and uses an extra `Argument.sequenceStartPositions` to store the boundaries of each sentence.
+
+Paddle uses `Argument.subSequenceStartPositions` to store two-level sequence information; higher-level sequences cannot be supported directly.
+
+To store `N-level` sequences, this document defines the sequence information as the following data structure:
+
+```c++
+std::shared_ptr<std::vector<std::vector<int>>> lod_start_pos_;
+```
+
+or, more explicitly,
+
+```c++
+typedef std::vector<int> level_t;
+std::vector<level_t> lod_start_pos;
+```
+
+Each `level_t` here stores the offsets of one granularity (level), consistent with what Paddle does today.
+
+To pass the sequence information around more transparently, we introduce a new kind of tensor called `LODTensor`[4];
+its tensor-related interfaces are all inherited directly from `Tensor`, with sequence-related interfaces added on top.
+Thus, when operating on a `LODTensor`, an ordinary `Op` treats it simply as a `Tensor`,
+while a sequence-aware `Op` additionally uses the variable-length-sequence interfaces of `LODTensor`.
+
+`LODTensor` is defined as follows:
+
+```c++
+class LODTensor : public Tensor {
+public:
+  size_t Levels() const { return seq_start_positions_.size(); }
+  size_t Elements(int level = 0) const {
+    return seq_start_positions_[level].size();
+  }
+  // slice of level[elem_begin: elem_end]
+  // NOTE low performance in slice seq_start_positions_.
+  // TODO should call Tensor's Slice.
+  LODTensor LODSlice(int level, int elem_begin, int elem_end) const;
+
+  // slice with tensor's data shared with this.
+  LODTensor LODSliceShared(int level, int elem_begin, int elem_end) const;
+
+  // copy other's lod_start_pos_, to share LOD info.
+  // NOTE the LOD info should not be changed.
+  void ShareConstLODFrom(const LODTensor &other) {
+    lod_start_pos_ = other.lod_start_pos_;
+  }
+  // copy other's lod_start_pos_'s content, free to mutate.
+  void ShareMutableLODFrom(const LODTensor &other) {
+    lod_start_pos_ = std::make_shared<std::vector<std::vector<int>>>(
+        other.lod_start_pos_->begin(), other.lod_start_pos_->end());
+  }
+
+private:
+  std::shared_ptr<std::vector<std::vector<int>>> lod_start_pos_;
+};
+```
+
+Here `lod_start_pos_` uses a `shared_ptr` to reduce the cost of storage and copying;
+`LODTensor` can be regarded as an extension of `Tensor` that is almost fully compatible with the original `Tensor`.
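
As a standalone illustration of the structure just defined (hypothetical numbers, no Paddle dependency), the sketch below builds a two-level `lod_start_pos` for a mini-batch and shows how sequence lengths fall out of adjacent offsets:

```c++
#include <cstddef>
#include <iostream>
#include <vector>

int main() {
  // Level 0 partitions 6 words into 3 sentences of 3, 1 and 2 words;
  // level 1 gives per-word offsets into the underlying data buffer.
  std::vector<std::vector<int>> lod_start_pos = {
      {0, 3, 4, 6},              // level 0: sentence boundaries over words
      {0, 2, 5, 7, 9, 12, 13},   // level 1: word boundaries over raw elements
  };
  // The length of sequence i at a level is the difference of two offsets.
  const auto& level0 = lod_start_pos[0];
  for (size_t i = 0; i + 1 < level0.size(); ++i) {
    std::cout << "sequence " << i << " has "
              << level0[i + 1] - level0[i] << " elements\n";
  }
  return 0;
}
```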
+
+## Framework support
+### Replacing the framework's existing `Tensor` calls with `LODTensor`
+To pass `LODTensor` around, many of the `Tensor`s in the framework need to become `LODTensor`s.
+A simple implementation is to **replace all existing `Tensor`s with `LODTensor` directly, which only requires changing the `Tensor`-creating interfaces in `pybind.cc`**.
+
+Besides, users may need to be aware of sequences (for example, visualizing a sequence requires parsing the sequences a model outputs), so some sequence-related APIs also need to be exposed to the Python layer.
+
+### Passing `lod_start_pos` along the Op call chain
+The framework needs to support the following features to pass `lod_start_pos` along:
+
+1. Pass it as a `shared_ptr`
+   - an Op that does not modify the content of `lod_start_pos` acts as a consumer
+   - an Op that modifies `lod_start_pos` acts as a producer
+   - by convention, a consumer only copies the `shared_ptr` passed to it
+   - a producer creates its own independent memory to store its modification, and exposes a `shared_ptr` to the subsequent consumers
+   - since passing is implemented by copying the `shared_ptr`, the framework only needs to pass `lod_start_pos` once
+
+2. Be transparent to Ops that are unaware of `lod_start_pos`
+3. A producer Op that needs to modify `lod_start_pos` can update its own `lod_start_pos` data at `Run` time
+
+The concrete design is split into the following three subsections.
+
+#### Passing `lod_start_pos`
+
+- when `lod_start_pos` does not need to be modified, call `ShareConstLODFrom` of LODTensor to implement the copy
+- when it does, call `ShareMutableLODFrom`, which allocates its own memory to store the modification
+
+#### Framework transparency
+The passing has to be added to the initialization that runs before the network runs, and only needs to happen once. A preliminary scheme based on the current framework design is:
+
+- add an attribute `do_mutate_lod_info` to the Op's `attrs`, defaulting to `false`
+  - an Op that needs to modify `lod_start_pos` sets it to `true` when defining its `OpProto`
+- `InferShape` of `OperatorBase` reads `do_mutate_lod_info` and calls the related `LODTensor` methods to copy `lod_start_pos`
+- add a member `is_lod_inited{false}` to `OperatorBase` to guarantee the passing happens only once
+
+The logic looks like:
+
+```c++
+class OperatorBase {
+public:
+  // ...
+  void InferShape() {
+    if (!is_lod_inited) {
+      bool do_mutate_lod_info = GetAttr<bool>("do_mutate_lod_info");
+      // find an input having LOD to copy
+      auto lod_input = ValidLODInput();
+      for (auto &output : outputs) {
+        if (do_mutate_lod_info) {
+          output.ShareMutableLODFrom(lod_input);
+        } else {
+          output.ShareConstLODFrom(lod_input);
+        }
+      }
+      is_lod_inited = true;
+    }
+
+    // call op's InferShape
+    // ...
+  }
+
+private:
+  // ...
+  bool is_lod_inited{false};
+};
+```
+
+This way, passing the `lod_start_pos` information is completely transparent to the implementation of non-LOD Ops.
+
+#### Updating `lod_start_pos`
+As introduced in the previous subsection, for an Op that needs to modify `lod_start_pos`, `OperatorBase` allocates a piece of its own memory to store the modification;
+the Op updates its own `lod_start_pos` inside its `Run` implementation,
+and all ops that depend on its outputs get the update automatically through the shared pointer.
+
+## Sorting by length
+After sorting by length, the batch sizes of successive time steps decrease naturally, so the steps can be fed directly into the Net for batched computation.
+
+For example, the original input:
+
+```
+origin:
+xxxx
+xx
+xxx
+
+-> sorted:
+xxxx
+xxx
+xx
+```
+
+After `SegmentInputs`, there will be 4 time steps, and the inputs of each time step are as follows (arranged vertically):
+
+```
+0 1 2 3
+x x x x
+x x x
+x x
+```
+
+To track where the sequences end up after sorting, we use
+```c++
+struct SortedSeqItem {
+  void *start{nullptr};
+  void *end{nullptr};
+};
+
+std::vector<SortedSeqItem> sorted_seqs;
+```
+to record the positions of the sequences after sorting, and add a new interface
+
+```c++
+std::vector<SortedSeqItem> SortBySeqLen(const LODTensor& tensor);
+```
+
+Because the order of the input sequences changes, the following existing interfaces need corresponding modifications:
+
+- InitMemories: memories need to be rearranged according to `sorted_seqs`
+- SegmentInputs
+- ConcatOutputs
+
+Besides, since `sorted_seqs` needs to be reused by `RecurrentGradientOp`, it becomes a new output of `RecurrentOp` and is later passed in as an input of `RecurrentGradientOp`.
+
+## InitMemories
+Because the order of the sequences changes, the order of the elements in the batch of `boot_memories` also needs to be rearranged accordingly.
+
+## SegmentInputs
+`SegmentInputs` relies on the information in `sorted_seqs`; it slices the original sequences horizontally, following the sorted order, into the inputs of each step.
+
+That is, the transformation below:
+```
+origin:
+xxxx
+xx
+xxx
+
+   |
+   |
+  \ /
+   !
+0 1 2 3
+x x x x
+x x x
+x x
+```
+## ConcatOutputs
+`ConcatOutputs` needs to
+
+- restore the outputs of every time step to the order of the original input sequences (to prevent the order from being shuffled during inference)
+- concatenate every sequence into a regular mini-batch representation
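
A standalone sketch of the index bookkeeping this section implies (hypothetical names; the real `SortedSeqItem` stores raw pointers, plain indices are used here): sort positions by descending length, then keep the inverse permutation that `ConcatOutputs`-style code can use to restore the user's original order:

```c++
#include <algorithm>
#include <iostream>
#include <numeric>
#include <vector>

int main() {
  std::vector<int> seq_len = {4, 2, 3};  // lengths in the user's order
  std::vector<size_t> order(seq_len.size());
  std::iota(order.begin(), order.end(), 0);
  // Sort indices by descending length so per-step batch sizes shrink
  // monotonically (4, 3, 2 here).
  std::sort(order.begin(), order.end(),
            [&](size_t a, size_t b) { return seq_len[a] > seq_len[b]; });
  // inverse[i] tells where original sequence i ended up after sorting;
  // restoring the original order is a gather through this table.
  std::vector<size_t> inverse(order.size());
  for (size_t pos = 0; pos < order.size(); ++pos) inverse[order[pos]] = pos;
  for (size_t i = 0; i < inverse.size(); ++i)
    std::cout << "original seq " << i << " -> sorted slot " << inverse[i] << "\n";
  return 0;
}
```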
+
+## References
+1. [Tensorflow Bucketing](https://www.tensorflow.org/versions/r0.12/api_docs/python/contrib.training/bucketing)
+2. [mxnet Bucketing](http://mxnet.io/how_to/bucketing.html)
+3. [variable length input in RNN scenario](https://discuss.pytorch.org/t/about-the-variable-length-input-in-rnn-scenario/345/5)
+4. [Level of details](https://en.wikipedia.org/wiki/Level_of_detail)
diff --git a/paddle/operators/rowwise_add_op.cc b/paddle/operators/rowwise_add_op.cc
index e04d69fa72a2f54cc1cc0829d12e0da1609b3383..4129422fa744b2a7cf135b681efa73ffb2ebcdcc 100644
--- a/paddle/operators/rowwise_add_op.cc
+++ b/paddle/operators/rowwise_add_op.cc
@@ -13,15 +13,13 @@ limitations under the License. */
 
 #include "paddle/operators/rowwise_add_op.h"
-#include "paddle/framework/op_registry.h"
+
 namespace paddle {
 namespace operators {
 
-class RowWiseAddOp : public framework::OperatorWithKernel {
+class RowWiseAddOp : public OperatorWithKernel {
 protected:
-  void InferShape(
-      const std::vector<const framework::Tensor *> &inputs,
-      const std::vector<framework::Tensor *> &outputs) const override {
+  void InferShape(const std::vector<const Tensor *> &inputs,
+                  const std::vector<Tensor *> &outputs) const override {
     PADDLE_ENFORCE(inputs.size() == 2UL, "Two inputs are needed by rowwise add");
     auto dim0 = inputs[0]->dims();
     auto dim1 = inputs[1]->dims();
@@ -34,11 +32,10 @@ protected:
   }
 };
 
-class RowWiseAddOpMaker : public framework::OpProtoAndCheckerMaker {
+class RowWiseAddOpMaker : public OpProtoAndCheckerMaker {
 public:
-  RowWiseAddOpMaker(framework::OpProto *proto,
-                    framework::OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+  RowWiseAddOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The left input of row-wise add op, must be matrix");
     AddInput("b", "The right input of row-wise add op, must be vector");
     AddOutput("Out", "The output of row-wise add op");
@@ -53,9 +50,6 @@ for i in xrange(X.shape[0]):
 }  // namespace operators
 }  // namespace paddle
 
-REGISTER_OP(rowwise_add,
-            paddle::operators::RowWiseAddOp,
-            paddle::operators::RowWiseAddOpMaker);
-REGISTER_OP_CPU_KERNEL(
-    rowwise_add,
-    paddle::operators::RowWiseAddKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP(rowwise_add, ops::RowWiseAddOp, ops::RowWiseAddOpMaker);
+REGISTER_OP_CPU_KERNEL(rowwise_add,
+                       ops::RowWiseAddKernel<ops::CPUPlace, float>);
diff --git a/paddle/operators/rowwise_add_op.cu b/paddle/operators/rowwise_add_op.cu
index 5dfac4fd2cf9b7da24dcfa5e7583b9ece12bad1e..4b33e38ebabe853e179fe70ef7fde0a80b9050e2 100644
--- a/paddle/operators/rowwise_add_op.cu
+++ b/paddle/operators/rowwise_add_op.cu
@@ -1,6 +1,4 @@
-#include "paddle/framework/op_registry.h"
 #include "paddle/operators/rowwise_add_op.h"
 
-REGISTER_OP_GPU_KERNEL(
-    rowwise_add,
-    paddle::operators::RowWiseAddKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(rowwise_add,
+                       ops::RowWiseAddKernel<ops::GPUPlace, float>);
diff --git a/paddle/operators/rowwise_add_op.h b/paddle/operators/rowwise_add_op.h
index dc47fe7c847bd0c8c179ac0a5f44b8cc541b47cb..4596925e9322f373c822608fd9aa6ecee6144d4c 100644
--- a/paddle/operators/rowwise_add_op.h
+++ b/paddle/operators/rowwise_add_op.h
@@ -13,25 +13,23 @@ limitations under the License. */
*/ #pragma once -#include "glog/logging.h" -#include "paddle/framework/eigen.h" -#include "paddle/framework/operator.h" +#include "paddle/operators/type_alias.h" namespace paddle { namespace operators { template -class RowWiseAddKernel : public framework::OpKernel { +class RowWiseAddKernel : public OpKernel { public: - void Compute(const framework::KernelContext& context) const override { - auto in0 = context.Input(0)->Get(); - auto in1 = context.Input(1)->Get(); - auto* out = context.Output(0)->GetMutable(); + void Compute(const KernelContext& context) const override { + auto in0 = context.Input(0)->Get(); + auto in1 = context.Input(1)->Get(); + auto* out = context.Output(0)->GetMutable(); out->mutable_data(context.GetPlace()); - auto input = framework::EigenMatrix::From(in0); - auto bias = framework::EigenVector::From(in1); - auto output = framework::EigenMatrix::From(*out); + auto input = EigenMatrix::From(in0); + auto bias = EigenVector::From(in1); + auto output = EigenMatrix::From(*out); const int bias_size = bias.dimension(0); const int rest_size = input.size() / bias_size; diff --git a/paddle/operators/sgd_op.cc b/paddle/operators/sgd_op.cc index 66ab1e001142bfb005d3c2e2ea29e01a32dce507..f6c654a9e7083704e353c276e0abc975f4e61ef9 100644 --- a/paddle/operators/sgd_op.cc +++ b/paddle/operators/sgd_op.cc @@ -13,17 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/operators/sgd_op.h" -#include "paddle/framework/op_registry.h" -#include "paddle/framework/tensor.h" namespace paddle { namespace operators { -class SGDOp : public framework::OperatorWithKernel { +class SGDOp : public OperatorWithKernel { protected: - void InferShape( - const std::vector &inputs, - const std::vector &outputs) const override { + void InferShape(const std::vector &inputs, + const std::vector &outputs) const override { PADDLE_ENFORCE(inputs.size() == 2, "Input size of SGDOp must be two"); PADDLE_ENFORCE(outputs.size() == 1, "Output size of SGDOp must be one"); PADDLE_ENFORCE(inputs[0] != nullptr, "inputs[0] mast be set"); @@ -35,10 +32,10 @@ protected: } }; -class SGDOpMaker : public framework::OpProtoAndCheckerMaker { +class SGDOpMaker : public OpProtoAndCheckerMaker { public: - SGDOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) - : framework::OpProtoAndCheckerMaker(proto, op_checker) { + SGDOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("param", "input parameter"); AddInput("grad", "input gradient"); AddOutput("param_out", "output parameter"); @@ -55,7 +52,5 @@ param_out = param - learning_rate * grad; } // namespace operators } // namespace paddle -REGISTER_OP(sgd, paddle::operators::SGDOp, paddle::operators::SGDOpMaker); -typedef paddle::operators::SGDOpKernel<::paddle::platform::CPUPlace, float> - SGDOpKernel_CPU_float; -REGISTER_OP_CPU_KERNEL(sgd, SGDOpKernel_CPU_float); +REGISTER_OP(sgd, ops::SGDOp, ops::SGDOpMaker); +REGISTER_OP_CPU_KERNEL(sgd, ops::SGDOpKernel); diff --git a/paddle/operators/sgd_op.cu b/paddle/operators/sgd_op.cu index 400425db10896e3970fc7468e34aba596a536184..f8f5b90cab460b4457cfb0a88bfc012bafe0fbc2 100644 --- a/paddle/operators/sgd_op.cu +++ b/paddle/operators/sgd_op.cu @@ -1,5 +1,3 @@ #include "paddle/operators/sgd_op.h" -#include "paddle/framework/op_registry.h" -typedef paddle::operators::SGDOpKernel<::paddle::platform::GPUPlace, float> SGDOpKernel_GPU_float; -REGISTER_OP_GPU_KERNEL(sgd, SGDOpKernel_GPU_float); \ No 
newline at end of file +REGISTER_OP_GPU_KERNEL(sgd, ops::SGDOpKernel); \ No newline at end of file diff --git a/paddle/operators/sgd_op.h b/paddle/operators/sgd_op.h index 4b2d214618e5c7c15695bd66604139d805255c47..65179d323bd991b8b4e196c069a11cd901c62082 100644 --- a/paddle/operators/sgd_op.h +++ b/paddle/operators/sgd_op.h @@ -13,28 +13,24 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#include "glog/logging.h" -#include "paddle/framework/eigen.h" -#include "paddle/framework/operator.h" +#include "paddle/operators/type_alias.h" namespace paddle { namespace operators { template -class SGDOpKernel : public framework::OpKernel { +class SGDOpKernel : public OpKernel { public: - void Compute(const framework::KernelContext& ctx) const override { - auto param = ctx.Input("param")->Get(); - auto grad = ctx.Input("grad")->Get(); - auto* param_out = ctx.Output(0)->GetMutable(); + void Compute(const KernelContext& ctx) const override { + auto param = ctx.Input("param")->Get(); + auto grad = ctx.Input("grad")->Get(); + auto* param_out = ctx.Output(0)->GetMutable(); float lr = ctx.op_.GetAttr("learning_rate"); param_out->mutable_data(ctx.GetPlace()); - framework::EigenVector::Flatten(*param_out) - .device(*(ctx.GetEigenDevice())) = - framework::EigenVector::Flatten(param) - - lr * framework::EigenVector::Flatten(grad); + EigenVector::Flatten(*param_out).device(*(ctx.GetEigenDevice())) = + EigenVector::Flatten(param) - lr * EigenVector::Flatten(grad); } }; diff --git a/paddle/operators/sigmoid_op.cc b/paddle/operators/sigmoid_op.cc index bf63af28b003daad0ab8c223e71a561437ee663a..716f1d9c4dbc45e2d5569f8d634b06fd988a149c 100644 --- a/paddle/operators/sigmoid_op.cc +++ b/paddle/operators/sigmoid_op.cc @@ -13,37 +13,33 @@ limitations under the License. 
*/ #include "paddle/operators/sigmoid_op.h" -#include "paddle/framework/op_registry.h" namespace paddle { namespace operators { -class SigmoidOp : public framework::OperatorWithKernel { +class SigmoidOp : public OperatorWithKernel { protected: - void InferShape( - const std::vector &inputs, - const std::vector &outputs) const override { + void InferShape(const std::vector &inputs, + const std::vector &outputs) const override { PADDLE_ENFORCE(inputs.size() == 1, "Sigmoid Op only have one input"); PADDLE_ENFORCE(outputs.size() == 1, "Sigmoid Op only have one output"); outputs[0]->Resize(inputs[0]->dims()); } }; -class SigmoidOpMaker : public framework::OpProtoAndCheckerMaker { +class SigmoidOpMaker : public OpProtoAndCheckerMaker { public: - SigmoidOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) - : framework::OpProtoAndCheckerMaker(proto, op_checker) { + SigmoidOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "sigmoid input"); AddOutput("Y", "sigmoid output"); AddComment("Sigmoid function"); } }; -class SigmoidOpGrad : public framework::OperatorWithKernel { +class SigmoidOpGrad : public OperatorWithKernel { protected: - void InferShape( - const std::vector &inputs, - const std::vector &outputs) const override {} + void InferShape(const std::vector &inputs, + const std::vector &outputs) const override {} std::string DebugString() const override { LOG(INFO) << "SigmoidGrad"; return ""; @@ -53,11 +49,7 @@ protected: } // namespace operators } // namespace paddle -REGISTER_OP(sigmoid, - paddle::operators::SigmoidOp, - paddle::operators::SigmoidOpMaker); -REGISTER_GRADIENT_OP(sigmoid, sigmoid_grad, paddle::operators::SigmoidOpGrad); +REGISTER_OP(sigmoid, ops::SigmoidOp, ops::SigmoidOpMaker); +REGISTER_GRADIENT_OP(sigmoid, sigmoid_grad, ops::SigmoidOpGrad); -REGISTER_OP_CPU_KERNEL( - sigmoid, - paddle::operators::SigmoidKernel); +REGISTER_OP_CPU_KERNEL(sigmoid, ops::SigmoidKernel); diff --git a/paddle/operators/sigmoid_op.cu b/paddle/operators/sigmoid_op.cu index ed344b2bfd4a9eeef2ce79746bec608469503c9c..f679b20418f04eff4310efe4e121963ce5a235e0 100644 --- a/paddle/operators/sigmoid_op.cu +++ b/paddle/operators/sigmoid_op.cu @@ -1,5 +1,3 @@ #include "paddle/operators/sigmoid_op.h" -#include "paddle/framework/op_registry.h" -REGISTER_OP_GPU_KERNEL( - sigmoid, paddle::operators::SigmoidKernel); +REGISTER_OP_GPU_KERNEL(sigmoid, ops::SigmoidKernel); diff --git a/paddle/operators/sigmoid_op.h b/paddle/operators/sigmoid_op.h index 2b9356246c471853b53af1d73f8b2a3c206db7ad..896a6f5d83e0f96de50e3aaae6f545172bf5da14 100644 --- a/paddle/operators/sigmoid_op.h +++ b/paddle/operators/sigmoid_op.h @@ -14,25 +14,23 @@ #pragma once -#include "glog/logging.h" -#include "paddle/framework/eigen.h" -#include "paddle/framework/operator.h" +#include "paddle/operators/type_alias.h" namespace paddle { namespace operators { template -class SigmoidKernel : public framework::OpKernel { +class SigmoidKernel : public OpKernel { public: - void Compute(const framework::KernelContext& context) const override { - auto input = context.Input(0)->Get(); - auto* output = context.Output(0)->GetMutable(); + void Compute(const KernelContext& context) const override { + auto input = context.Input(0)->Get(); + auto* output = context.Output(0)->GetMutable(); output->mutable_data(context.GetPlace()); - framework::EigenVector::Flatten(*output).device( + EigenVector::Flatten(*output).device( *(context.GetEigenDevice())) = - 1.0 / (1.0 + (-1.0 * 
+        1.0 / (1.0 + (-1.0 * EigenVector<T>::Flatten(input)).exp());
   }
 };
 }  // namespace operators
diff --git a/paddle/operators/softmax_op.cc b/paddle/operators/softmax_op.cc
index 82f72fa19f690bebdff01629e75d17eecd6ada74..df60b62fa6ac8d67c9dadc40ec49aaedab92bc88 100644
--- a/paddle/operators/softmax_op.cc
+++ b/paddle/operators/softmax_op.cc
@@ -12,16 +12,14 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/operators/softmax_op.h"
-#include "paddle/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
 
-class SoftmaxOp : public framework::OperatorWithKernel {
+class SoftmaxOp : public OperatorWithKernel {
 protected:
-  void InferShape(
-      const std::vector<const framework::Tensor *> &inputs,
-      const std::vector<framework::Tensor *> &outputs) const override {
+  void InferShape(const std::vector<const Tensor *> &inputs,
+                  const std::vector<Tensor *> &outputs) const override {
     PADDLE_ENFORCE(inputs.size() == 1, "Only one input is needed for softmax");
     PADDLE_ENFORCE(inputs[0]->dims().size() == 2,
                    "The input of softmax op must be a matrix");
@@ -31,10 +29,9 @@ protected:
   }
 };
 
-class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
+class SoftmaxOpMaker : public OpProtoAndCheckerMaker {
 public:
-  SoftmaxOpMaker(framework::OpProto *proto,
-                 framework::OpAttrChecker *op_checker)
+  SoftmaxOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "input of softmax");
     AddOutput("Y", "output of softmax");
@@ -42,11 +39,10 @@ public:
   }
 };
 
-class SoftmaxOpGrad : public framework::OperatorWithKernel {
+class SoftmaxOpGrad : public OperatorWithKernel {
 protected:
-  void InferShape(
-      const std::vector<const framework::Tensor *> &inputs,
-      const std::vector<framework::Tensor *> &outputs) const override {}
+  void InferShape(const std::vector<const Tensor *> &inputs,
+                  const std::vector<Tensor *> &outputs) const override {}
   std::string DebugString() const override {
     LOG(INFO) << "SoftmaxOpGrad";
     return "";
@@ -56,9 +52,6 @@ protected:
 }  // namespace operators
 }  // namespace paddle
 
-namespace ops = paddle::operators;
-
 REGISTER_OP(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker);
-REGISTER_GRADIENT_OP(softmax, softmax_grad, paddle::operators::SoftmaxOpGrad);
-REGISTER_OP_CPU_KERNEL(softmax,
-                       ops::SoftmaxKernel<paddle::platform::CPUPlace, float>);
+REGISTER_GRADIENT_OP(softmax, softmax_grad, ops::SoftmaxOpGrad);
+REGISTER_OP_CPU_KERNEL(softmax, ops::SoftmaxKernel<ops::CPUPlace, float>);
diff --git a/paddle/operators/softmax_op.cu b/paddle/operators/softmax_op.cu
index 60676191eb9460868a266d0e4f70357fa78bec2c..a1f6944a369fe5148ffcfeabf3bf7063dcbc2664 100644
--- a/paddle/operators/softmax_op.cu
+++ b/paddle/operators/softmax_op.cu
@@ -1,5 +1,4 @@
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/softmax_op.h"
 
-REGISTER_OP_GPU_KERNEL(
-    softmax, paddle::operators::SoftmaxKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(softmax, ops::SoftmaxKernel<ops::GPUPlace, float>);
diff --git a/paddle/operators/softmax_op.h b/paddle/operators/softmax_op.h
index 500c188dbfcf28ae52c2d5b06466539e115acc4a..625a87b58560231572c1cca2a21bd0c47c8cb296 100644
--- a/paddle/operators/softmax_op.h
+++ b/paddle/operators/softmax_op.h
@@ -14,23 +14,21 @@
 
 #pragma once
 
-#include "glog/logging.h"
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/operator.h"
+#include "paddle/operators/type_alias.h"
 
 namespace paddle {
 namespace operators {
 
 template <typename Place, typename T>
-class SoftmaxKernel : public framework::OpKernel {
+class SoftmaxKernel : public OpKernel {
 public:
-  void Compute(const framework::KernelContext& context) const override {
-    auto input = context.Input(0)->Get<framework::Tensor>();
-    auto*
diff --git a/paddle/operators/softmax_op.h b/paddle/operators/softmax_op.h
index 500c188dbfcf28ae52c2d5b06466539e115acc4a..625a87b58560231572c1cca2a21bd0c47c8cb296 100644
--- a/paddle/operators/softmax_op.h
+++ b/paddle/operators/softmax_op.h
@@ -14,23 +14,21 @@

 #pragma once

-#include "glog/logging.h"
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/operator.h"
+#include "paddle/operators/type_alias.h"

 namespace paddle {
 namespace operators {

 template <typename Place, typename T>
-class SoftmaxKernel : public framework::OpKernel {
+class SoftmaxKernel : public OpKernel {
 public:
-  void Compute(const framework::KernelContext& context) const override {
-    auto input = context.Input(0)->Get<framework::Tensor>();
-    auto* output = context.Output(0)->GetMutable<framework::Tensor>();
+  void Compute(const KernelContext& context) const override {
+    auto input = context.Input(0)->Get<Tensor>();
+    auto* output = context.Output(0)->GetMutable<Tensor>();
     output->mutable_data<T>(context.GetPlace());

-    auto logits = framework::EigenMatrix<T>::From(input);
-    auto softmax = framework::EigenMatrix<T>::From(*output);
+    auto logits = EigenMatrix<T>::From(input);
+    auto softmax = EigenMatrix<T>::From(*output);

     const int kBatchDim = 0;
     const int kClassDim = 1;
diff --git a/paddle/operators/type_alias.h b/paddle/operators/type_alias.h
new file mode 100644
index 0000000000000000000000000000000000000000..b712e457ff60e8b30b87c0d549693d53e9f05d59
--- /dev/null
+++ b/paddle/operators/type_alias.h
@@ -0,0 +1,51 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/net.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using OpKernel = framework::OpKernel;
+using KernelContext = framework::KernelContext;
+template <typename T,
+          int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+template <typename T,
+          int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+template <typename T,
+          size_t D,
+          int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
+using Tensor = framework::Tensor;
+using OperatorWithKernel = framework::OperatorWithKernel;
+using OpProtoAndCheckerMaker = framework::OpProtoAndCheckerMaker;
+using OpProto = framework::OpProto;
+using OpAttrChecker = framework::OpAttrChecker;
+using CPUPlace = platform::CPUPlace;
+using GPUPlace = platform::GPUPlace;
+using NetOp = framework::NetOp;
+using OpRegistry = framework::OpRegistry;
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
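type_alias.h pulls the framework and platform names every operator needs into the ops namespace. A hedged sketch of what a kernel written against these aliases looks like; ReluKernel is hypothetical (not part of this diff) and only mirrors the structure of the SigmoidKernel above:

#include "paddle/operators/type_alias.h"

namespace paddle {
namespace operators {

// With the aliases, framework::OpKernel, framework::Tensor,
// framework::KernelContext etc. lose their namespace prefixes.
template <typename Place, typename T>
class ReluKernel : public OpKernel {
 public:
  void Compute(const KernelContext& context) const override {
    auto input = context.Input(0)->Get<Tensor>();
    auto* output = context.Output(0)->GetMutable<Tensor>();
    output->mutable_data<T>(context.GetPlace());

    auto x = EigenVector<T>::Flatten(input);
    // relu(x) = max(x, 0), evaluated on the context's Eigen device.
    EigenVector<T>::Flatten(*output).device(
        *(context.GetEigenDevice<Place>())) = x.cwiseMax(x.constant(0));
  }
};

}  // namespace operators
}  // namespace paddle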
diff --git a/paddle/platform/device_context.cc b/paddle/platform/device_context.cc
index 9c1d94e9e703caf2db92ca4a8eac975317e6b945..f80c36b5b23b4e8bac3bee52d5492ffaa43778d0 100644
--- a/paddle/platform/device_context.cc
+++ b/paddle/platform/device_context.cc
@@ -20,12 +20,104 @@ Eigen::DefaultDevice* DeviceContext::get_eigen_device<Eigen::DefaultDevice>()
   return reinterpret_cast<const CPUDeviceContext*>(this)->eigen_device();
 }

+CPUDeviceContext::CPUDeviceContext() {
+  random_seed_ = std::chrono::system_clock::now().time_since_epoch().count();
+  eigen_device_.reset(new Eigen::DefaultDevice());
+}
+
+CPUDeviceContext::CPUDeviceContext(CPUPlace place) {
+  random_seed_ = std::chrono::system_clock::now().time_since_epoch().count();
+  eigen_device_.reset(new Eigen::DefaultDevice());
+}
+
+Eigen::DefaultDevice* CPUDeviceContext::eigen_device() const {
+  return eigen_device_.get();
+}
+
+Place CPUDeviceContext::GetPlace() const { return CPUPlace(); }
+
 #ifndef PADDLE_ONLY_CPU
+
 template <>
 Eigen::GpuDevice* DeviceContext::get_eigen_device<Eigen::GpuDevice>() const {
   return reinterpret_cast<const CUDADeviceContext*>(this)->eigen_device();
 }
-#endif
+
+CUDADeviceContext::CUDADeviceContext(GPUPlace place) : place_(place) {
+  random_seed_ = std::chrono::system_clock::now().time_since_epoch().count();
+  SetDeviceId(place_.device);
+  // TODO(qijun) Passing a created cuda stream to Eigen::CudaStreamDevice
+  // directly here causes a segmentation fault. We must implement a class
+  // derived from Eigen::StreamInterface, and reinitialize it with a cuda
+  // stream and a gpu id later. Please refer to the implementation of class
+  // EigenCudaStreamDevice in TensorFlow.
+  //
+  // We find that CUDA 7 introduces a new option, the per-thread default stream,
+  // that has two effects. Please refer to https://devblogs.nvidia.com/
+  // parallelforall/gpu-pro-tip-cuda-7-streams-simplify-concurrency/
+  //
+  // So we decided to use the default stream and add the --default-stream
+  // per-thread nvcc flag. Then, two threads with two CUDADeviceContexts will
+  // run in parallel.
+  eigen_stream_.reset(new Eigen::CudaStreamDevice());
+  eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get()));
+}
+
+CUDADeviceContext::~CUDADeviceContext() {
+  SetDeviceId(place_.device);
+  Wait();
+  if (cublas_handle_) {
+    PADDLE_ENFORCE(dynload::cublasDestroy(cublas_handle_));
+  }
+
+  if (cudnn_handle_) {
+    PADDLE_ENFORCE(dynload::cudnnDestroy(cudnn_handle_));
+  }
+
+  if (curand_generator_) {
+    PADDLE_ENFORCE(dynload::curandDestroyGenerator(curand_generator_));
+  }
+  eigen_stream_.reset();
+  eigen_device_.reset();
+}
+
+Place CUDADeviceContext::GetPlace() const { return place_; }
+
+void CUDADeviceContext::Wait() const {
+  PADDLE_ENFORCE(cudaStreamSynchronize(0));
+}
+
+Eigen::GpuDevice* CUDADeviceContext::eigen_device() const {
+  return eigen_device_.get();
+}
+
+cublasHandle_t CUDADeviceContext::cublas_handle() {
+  if (!cublas_handle_) {
+    SetDeviceId(place_.device);
+    PADDLE_ENFORCE(dynload::cublasCreate(&cublas_handle_));
+  }
+  return cublas_handle_;
+}
+
+cudnnHandle_t CUDADeviceContext::cudnn_handle() {
+  if (!cudnn_handle_) {
+    SetDeviceId(place_.device);
+    PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_));
+  }
+  return cudnn_handle_;
+}
+
+curandGenerator_t CUDADeviceContext::curand_generator() {
+  if (!curand_generator_) {
+    SetDeviceId(place_.device);
+    PADDLE_ENFORCE(dynload::curandCreateGenerator(&curand_generator_,
+                                                  CURAND_RNG_PSEUDO_DEFAULT));
+    PADDLE_ENFORCE(dynload::curandSetPseudoRandomGeneratorSeed(
+        curand_generator_, random_seed_));
+  }
+  return curand_generator_;
+}
+
+#endif  // PADDLE_ONLY_CPU
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h
index 239c25a90c4f98f72fdfb3dff9dfb011e316b101..f5182707611318669b98e6f7b49344604bf7ab4a 100644
--- a/paddle/platform/device_context.h
+++ b/paddle/platform/device_context.h
@@ -41,17 +41,13 @@ class DeviceContext {
 class CPUDeviceContext : public DeviceContext {
  public:
   typedef std::mt19937 random_generator_type;
-  CPUDeviceContext() {
-    random_seed_ = std::chrono::system_clock::now().time_since_epoch().count();
-    eigen_device_.reset(new Eigen::DefaultDevice());
-  }
+  CPUDeviceContext();
+  CPUDeviceContext(CPUPlace);
+  virtual ~CPUDeviceContext() {}

-  Eigen::DefaultDevice* eigen_device() const { return eigen_device_.get(); }
+  Eigen::DefaultDevice* eigen_device() const;

-  Place GetPlace() const override {
-    Place retv = CPUPlace();
-    return retv;
-  }
+  Place GetPlace() const override;

   random_generator_type& RandGenerator() {
     if (!rand_generator_) {
@@ -68,122 +64,45 @@ class CPUDeviceContext : public DeviceContext {

 #ifndef PADDLE_ONLY_CPU

-class GPUPlaceGuard {
- public:
-  explicit GPUPlaceGuard(GPUPlace new_place) : previous_(GetCurrentDeviceId()) {
-    if (previous_ != new_place) {
-      paddle::platform::SetDeviceId(new_place.device);
-    }
-  }
-
-  ~GPUPlaceGuard() 
{ paddle::platform::SetDeviceId(previous_.device); } - - private: - GPUPlace previous_; -}; - class CUDADeviceContext : public DeviceContext { public: - CUDADeviceContext() { - random_seed_ = std::chrono::system_clock::now().time_since_epoch().count(); - } - explicit CUDADeviceContext(const GPUPlace gpu_place) : gpu_place_(gpu_place) { - GPUPlaceGuard guard(gpu_place_); - PADDLE_ENFORCE(cudaStreamCreate(&stream_), "cudaStreamCreate failed"); - eigen_stream_.reset(new Eigen::CudaStreamDevice(&stream_)); - eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get())); - } + explicit CUDADeviceContext(GPUPlace); + virtual ~CUDADeviceContext(); - Place GetPlace() const override { - Place retv = GPUPlace(); - return retv; - } + /*! \brief Wait for all operations completion in the stream. */ + void Wait() const; - void Wait() { - PADDLE_ENFORCE(cudaStreamSynchronize(stream_), - "cudaStreamSynchronize failed"); - } + /*! \brief Return place in the device context. */ + Place GetPlace() const override; - curandGenerator_t RandGenerator() { - if (!rand_generator_) { - GPUPlaceGuard guard(gpu_place_); - PADDLE_ENFORCE(paddle::platform::dynload::curandCreateGenerator( - &rand_generator_, CURAND_RNG_PSEUDO_DEFAULT), - "curandCreateGenerator failed"); - PADDLE_ENFORCE( - paddle::platform::dynload::curandSetPseudoRandomGeneratorSeed( - rand_generator_, random_seed_), - "curandSetPseudoRandomGeneratorSeed failed"); - PADDLE_ENFORCE( - paddle::platform::dynload::curandSetStream(rand_generator_, stream_), - "curandSetStream failed"); - } - return rand_generator_; - } + /*! \brief Return eigen device in the device context. */ + Eigen::GpuDevice* eigen_device() const; - cudaStream_t stream() { return stream_; } + // clang-format off + /*! \brief Return cublas handle in the device context. */ + cublasHandle_t cublas_handle (); - Eigen::GpuDevice* eigen_device() const { return eigen_device_.get(); } + /*! \brief Return cudnn handle in the device context. */ + cudnnHandle_t cudnn_handle (); - cublasHandle_t cublas_handle() { - if (!blas_handle_) { - GPUPlaceGuard guard(gpu_place_); - PADDLE_ENFORCE(paddle::platform::dynload::cublasCreate(&blas_handle_), - "cublasCreate failed"); - PADDLE_ENFORCE( - paddle::platform::dynload::cublasSetStream(blas_handle_, stream_), - "cublasSetStream failed"); - } - return blas_handle_; - } - - cudnnHandle_t cudnn_handle() { - if (!dnn_handle_) { - GPUPlaceGuard guard(gpu_place_); - PADDLE_ENFORCE(paddle::platform::dynload::cudnnCreate(&dnn_handle_), - "cudnnCreate failed"); - PADDLE_ENFORCE( - paddle::platform::dynload::cudnnSetStream(dnn_handle_, stream_), - "cudnnSetStream failed"); - } - return dnn_handle_; - } - - ~CUDADeviceContext() { - Wait(); - if (blas_handle_) { - PADDLE_ENFORCE(paddle::platform::dynload::cublasDestroy(blas_handle_), - "cublasDestroy failed"); - } - - if (dnn_handle_) { - PADDLE_ENFORCE(paddle::platform::dynload::cudnnDestroy(dnn_handle_), - "cudnnDestroy failed"); - } - - if (rand_generator_) { - PADDLE_ENFORCE( - paddle::platform::dynload::curandDestroyGenerator(rand_generator_), - "curandDestroyGenerator failed"); - } - eigen_stream_.reset(); - eigen_device_.reset(); - PADDLE_ENFORCE(cudaStreamDestroy(stream_), "cudaStreamDestroy failed"); - } + /*! \brief Return curand handle in the device context. 
*/ + curandGenerator_t curand_generator(); + // clang-format on private: - GPUPlace gpu_place_; - cudaStream_t stream_; + GPUPlace place_; - std::unique_ptr eigen_stream_; + private: std::unique_ptr eigen_device_; + std::unique_ptr eigen_stream_; - cublasHandle_t blas_handle_{nullptr}; - - cudnnHandle_t dnn_handle_{nullptr}; - + private: unsigned random_seed_; - curandGenerator_t rand_generator_{nullptr}; + // clang-format off + cudnnHandle_t cudnn_handle_ = nullptr; + cublasHandle_t cublas_handle_ = nullptr; + curandGenerator_t curand_generator_ = nullptr; + // clang-format on }; #endif diff --git a/paddle/platform/enforce.h b/paddle/platform/enforce.h index b06ab8a2f184e7bb7dd9cb39f377b087c5258dc4..fd4adbd9deca12ad6c3a59cfd5d30fb0cb6fcf98 100644 --- a/paddle/platform/enforce.h +++ b/paddle/platform/enforce.h @@ -36,6 +36,21 @@ limitations under the License. */ namespace paddle { namespace platform { +struct EnforceNotMet : public std::exception { + std::exception_ptr exp_; + std::string err_str_; + + EnforceNotMet(std::exception_ptr e, const char* f, int l) : exp_(e) { + try { + std::rethrow_exception(exp_); + } catch (const std::exception& exp) { + err_str_ = string::Sprintf("%s at [%s:%d]", exp.what(), f, l); + } + } + + const char* what() const noexcept { return err_str_.c_str(); } +}; + // Because most enforce conditions would evaluate to true, we can use // __builtin_expect to instruct the C++ compiler to generate code that // always forces branch prediction of true. @@ -43,18 +58,11 @@ namespace platform { // For more details, please check https://stackoverflow.com/a/43870188/724872. #define UNLIKELY(condition) __builtin_expect(static_cast(condition), 0) -template -inline void throw_on_error(T e) { - throw_on_error(e, ""); -} - template inline typename std::enable_if::type throw_on_error( int stat, const Args&... args) { if (UNLIKELY(!(stat))) { - throw std::runtime_error( - string::Sprintf(args...) + - string::Sprintf(" at [%s:%s];", __FILE__, __LINE__)); + throw std::runtime_error(string::Sprintf(args...)); } } @@ -64,12 +72,8 @@ template inline typename std::enable_if::type throw_on_error( cudaError_t e, const Args&... args) { if (UNLIKELY(e)) { - // clang-format off - throw thrust::system_error( - e, thrust::cuda_category(), - string::Sprintf(args...) + - string::Sprintf(" at [%s:%s];", __FILE__, __LINE__)); - // clang-format on + throw thrust::system_error(e, thrust::cuda_category(), + string::Sprintf(args...)); } } @@ -77,12 +81,8 @@ template inline typename std::enable_if::type throw_on_error( curandStatus_t stat, const Args&... args) { if (stat != CURAND_STATUS_SUCCESS) { - // clang-format off - throw thrust::system_error( - cudaErrorLaunchFailure, thrust::cuda_category(), - string::Sprintf(args...) + - string::Sprintf(" at [%s:%s];", __FILE__, __LINE__)); - // clang-format on + throw thrust::system_error(cudaErrorLaunchFailure, thrust::cuda_category(), + string::Sprintf(args...)); } } @@ -92,12 +92,8 @@ inline typename std::enable_if::type throw_on_error( if (stat == CUDNN_STATUS_SUCCESS) { return; } else { - // clang-format off - throw std::runtime_error( - platform::dynload::cudnnGetErrorString(stat) + - string::Sprintf(args...) 
+ - string::Sprintf(" at [%s:%s];", __FILE__, __LINE__)); - // clang-format on + throw std::runtime_error(platform::dynload::cudnnGetErrorString(stat) + + string::Sprintf(args...)); } } @@ -126,22 +122,32 @@ inline typename std::enable_if::type throw_on_error( } else if (stat == CUBLAS_STATUS_LICENSE_ERROR) { err = "CUBLAS: license error, "; } - throw std::runtime_error(err + string::Sprintf(args...) + - string::Sprintf(" at [%s:%s];", __FILE__, __LINE__)); + throw std::runtime_error(err + string::Sprintf(args...)); } #endif // PADDLE_ONLY_CPU -#define PADDLE_THROW(...) \ - do { \ - throw std::runtime_error( \ - string::Sprintf(__VA_ARGS__) + \ - string::Sprintf(" at [%s:%s];", __FILE__, __LINE__)); \ +template +inline void throw_on_error(T e) { + throw_on_error(e, ""); +} + +#define PADDLE_THROW(...) \ + do { \ + throw ::paddle::platform::EnforceNotMet( \ + std::make_exception_ptr( \ + std::runtime_error(string::Sprintf(__VA_ARGS__))), \ + __FILE__, __LINE__); \ } while (0) -#define PADDLE_ENFORCE(...) \ - do { \ - ::paddle::platform::throw_on_error(__VA_ARGS__); \ +#define PADDLE_ENFORCE(...) \ + do { \ + try { \ + ::paddle::platform::throw_on_error(__VA_ARGS__); \ + } catch (...) { \ + throw ::paddle::platform::EnforceNotMet(std::current_exception(), \ + __FILE__, __LINE__); \ + } \ } while (0) } // namespace platform diff --git a/paddle/platform/enforce_test.cc b/paddle/platform/enforce_test.cc index d7152f81509a35e4ce36d5649e7d209f51e34b86..2ac31812a80d8dd57ce82234cb5835e029a46067 100644 --- a/paddle/platform/enforce_test.cc +++ b/paddle/platform/enforce_test.cc @@ -23,7 +23,7 @@ TEST(ENFORCE, FAILED) { bool in_catch = false; try { PADDLE_ENFORCE(false, "Enforce is not ok %d at all", 123); - } catch (const std::runtime_error& error) { + } catch (paddle::platform::EnforceNotMet error) { // your error handling code here in_catch = true; std::string msg = "Enforce is not ok 123 at all"; diff --git a/paddle/pybind/CMakeLists.txt b/paddle/pybind/CMakeLists.txt index 8010369b410e7dc5bebdb63bec981fcfd90235a3..a8994366bc34ee3ec2c39e3482fc3757b089e61f 100644 --- a/paddle/pybind/CMakeLists.txt +++ b/paddle/pybind/CMakeLists.txt @@ -1,2 +1,3 @@ cc_library(paddle_pybind SHARED SRCS pybind.cc DEPS pybind python add_op fc_op sgd_op cross_entropy_op random_op) + add_op fc_op sgd_op cross_entropy_op recurrent_network_op guassian_random_op) diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index 88deb56207c04c202260d846ea36f80fdb5f9685..5f3b24a4a1370a49a792d02a4a814c8044741b25 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -37,6 +37,7 @@ USE_OP(sigmoid); USE_OP(softmax); USE_OP(rowwise_add); USE_OP(gaussian_random); +USE_OP_WITHOUT_KERNEL(recurrent_op); template void ExposeOperator(ClassType& m) { @@ -49,6 +50,11 @@ void ExposeOperator(ClassType& m) { .def("__str__", &ClassType::type::DebugString); } +static size_t UniqueIntegerGenerator() { + static std::atomic generator; + return generator.fetch_add(1); +} + PYBIND11_PLUGIN(core) { py::module m("core", "C++ core of PaddlePaddle"); @@ -90,6 +96,11 @@ All parameter, weight, gradient are variables in Paddle. [](pd::Variable& self) -> pd::Tensor* { return self.GetMutable(); }, + py::return_value_policy::reference) + .def("get_net", + [](pd::Variable& self) -> pd::NetOp* { + return self.GetMutable(); + }, py::return_value_policy::reference); py::class_>(m, "Scope") @@ -99,7 +110,8 @@ All parameter, weight, gradient are variables in Paddle. 
py::return_value_policy::reference) .def("create_var", &pd::Scope::CreateVariable, - py::return_value_policy::reference); + py::return_value_policy::reference) + .def("get_var_name", &pd::Scope::GetVariableName); //! @note: Be careful! PyBind will return std::string as an unicode, not //! Python str. If you want a str object, you should cast them in Python. @@ -141,24 +153,25 @@ All parameter, weight, gradient are variables in Paddle. }); ExposeOperator(operator_base); - using PlainNetPtr = std::shared_ptr; - py::class_ plain_net(m, "PlainNet"); - - plain_net - .def_static("create", - []() -> std::shared_ptr { - auto retv = std::make_shared(); - retv->type_ = "plain_net"; - return retv; - }) - .def("add_op", &pd::PlainNet::AddOp) + py::class_> net(m, "Net"); + + net.def_static("create", + []() -> std::shared_ptr { + auto retv = std::make_shared(); + retv->type_ = "plain_net"; + return retv; + }) + .def("add_op", &pd::NetOp::AddOp) .def("add_op", - [](PlainNetPtr& self, const PlainNetPtr& plain_net) -> void { - self->AddOp(std::static_pointer_cast(plain_net)); + [](pd::NetOp& self, const std::shared_ptr& net) -> void { + self.AddOp(std::static_pointer_cast(net)); }) - .def("complete_add_op", &pd::PlainNet::CompleteAddOp) - .def("complete_add_op", [](PlainNetPtr& self) { self->CompleteAddOp(); }); - ExposeOperator(plain_net); + .def("complete_add_op", &pd::NetOp::CompleteAddOp) + .def("complete_add_op", + [](std::shared_ptr& self) { self->CompleteAddOp(); }); + ExposeOperator(net); + + m.def("unique_integer", UniqueIntegerGenerator); return m.ptr(); } diff --git a/paddle/trainer/NewRemoteParameterUpdater.cpp b/paddle/trainer/NewRemoteParameterUpdater.cpp index a830ceba5772846cd9255a3eeb26e8d6a17dcfbc..e1558e3fdfbcf296be0ee64202132f53bf901be9 100644 --- a/paddle/trainer/NewRemoteParameterUpdater.cpp +++ b/paddle/trainer/NewRemoteParameterUpdater.cpp @@ -76,7 +76,11 @@ void NewRemoteParameterUpdater::init( sgdConfigV2->set_decay(paramConfig.decay_rate()); optimizeConfigV2.set_lr_policy(paddle::OptimizerConfig::Const); auto constlr = optimizeConfigV2.mutable_const_lr(); - constlr->set_learning_rate(paramConfig.learning_rate()); + if (paramConfig.has_learning_rate()) { + constlr->set_learning_rate(paramConfig.learning_rate()); + } else { + constlr->set_learning_rate(trainerConfig_.learning_rate()); + } if (trainerConfig_.algorithm() == "sgd") { optimizeConfigV2.set_optimizer(paddle::OptimizerConfig::SGD); // FIXME: config all algorithms diff --git a/paddle/utils/Error.h b/paddle/utils/Error.h index 27ddaab3f003110a2684a871a2de17afb473d660..7cde98306026ca1de76089749aaea265d151da33 100644 --- a/paddle/utils/Error.h +++ b/paddle/utils/Error.h @@ -126,9 +126,11 @@ public: } /** - * @brief operator bool, return True if there is something error. + * @brief check this status by glog. + * @note It is a temp method used during cleaning Paddle code. It will be + * removed later. */ - operator bool() const { return !this->isOK(); } + void check() const { CHECK(this->isOK()) << msg(); } /** * @brief isOK return True if there is no error. @@ -136,13 +138,6 @@ public: */ bool isOK() const { return msg_ == nullptr; } - /** - * @brief check this status by glog. - * @note It is a temp method used during cleaning Paddle code. It will be - * removed later. 
- */ - void check() const { CHECK(this->isOK()) << msg(); } - private: std::shared_ptr msg_; }; diff --git a/paddle/utils/tests/test_Error.cpp b/paddle/utils/tests/test_Error.cpp index fdf326b17a1c8baa87e2a17fafae253565d1e699..6f311fa6b80191de1e11ce1f63c31b64fe2eeb80 100644 --- a/paddle/utils/tests/test_Error.cpp +++ b/paddle/utils/tests/test_Error.cpp @@ -18,17 +18,17 @@ limitations under the License. */ TEST(Error, testAll) { paddle::Error error; - ASSERT_FALSE(error); + ASSERT_TRUE(error.isOK()); error = paddle::Error("I'm the error"); - ASSERT_TRUE(error); + ASSERT_FALSE(error.isOK()); ASSERT_STREQ("I'm the error", error.msg()); error = paddle::Error("error2"); - ASSERT_TRUE(error); + ASSERT_FALSE(error.isOK()); ASSERT_STREQ("error2", error.msg()); int i = 3; auto error3 = paddle::Error("error%d", i); - ASSERT_TRUE(error3); + ASSERT_FALSE(error3.isOK()); ASSERT_STREQ("error3", error3.msg()); } diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index fc112f1327f5ad5f1bdd04873394b1fa0e761e29..5477158ecb8646992ebdded0b15cce50720ebf36 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -2055,8 +2055,7 @@ class BatchNormLayer(LayerBase): # Automatically select cudnn_batch_norm for GPU and batch_norm for CPU. # Also based on cudnn version. use_cudnn = use_gpu and batch_norm_type != "batch_norm" and \ - ((not parallel_nn) or self.config.device > -1) and \ - cudnn_version >= 4007 + ((not parallel_nn) or self.config.device > -1) self.layer_type = "cudnn_batch_norm" if use_cudnn else "batch_norm" super(BatchNormLayer, self).__init__( name, self.layer_type, 0, inputs=inputs, **xargs) diff --git a/python/paddle/trainer_config_helpers/attrs.py b/python/paddle/trainer_config_helpers/attrs.py index 9b9f979bb615f37ec1dc9baa154d28741b1400d5..ecba87191045cff6c05014010e60575741238f8d 100644 --- a/python/paddle/trainer_config_helpers/attrs.py +++ b/python/paddle/trainer_config_helpers/attrs.py @@ -272,7 +272,7 @@ class ExtraLayerAttribute(object): for key in self.attr: if not hasattr(self, 'can_%s' % key) or \ not getattr(self, 'can_%s' % key): - raise NotImplementedError("Layer %s cannot support %s" % + raise NotImplementedError("Layer %s does not support %s" % (layer_name, key)) @staticmethod diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index 21eba71527e60833e0c69b344ecc639626faa529..14f072fc55109d770edf469ad7c574b8dda8a434 100755 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -865,7 +865,7 @@ def data_layer(name, size, height=None, width=None, layer_attr=None): @wrap_name_default("embedding") @wrap_param_attr_default() -@layer_support(ERROR_CLIPPING) +@layer_support(ERROR_CLIPPING, DROPOUT) def embedding_layer(input, size, name=None, param_attr=None, layer_attr=None): """ Define a embedding Layer. 
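The layers.py hunks that follow mostly move ERROR_CLIPPING and DROPOUT support between layers. A hedged sketch of how that support is exercised from a v1 config (argument names taken from ExtraLayerAttribute above; the check in attrs.py maps drop_rate to can_drop_rate):

# Illustrative only; embedding_layer now declares ERROR_CLIPPING and DROPOUT,
# so these attributes pass the can_<attr> check instead of raising
# NotImplementedError.
from paddle.trainer_config_helpers import *

word = data_layer(name="word", size=10000)
emb = embedding_layer(
    input=word,
    size=256,
    layer_attr=ExtraLayerAttribute(
        drop_rate=0.5,                   # checked against can_drop_rate
        error_clipping_threshold=10.0))  # relies on ERROR_CLIPPING support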
@@ -1320,7 +1320,7 @@ def pooling_layer(input, @wrap_act_default(param_names=['gate_act'], act=SigmoidActivation()) @wrap_act_default(param_names=["act", 'state_act'], act=TanhActivation()) @wrap_name_default("lstmemory") -@layer_support(DROPOUT) +@layer_support() def lstmemory(input, name=None, size=None, @@ -1429,7 +1429,7 @@ def lstmemory(input, @wrap_act_default(param_names=['gate_act'], act=SigmoidActivation()) @wrap_act_default(param_names=["act"], act=TanhActivation()) @wrap_name_default("gru") -@layer_support(DROPOUT) +@layer_support() def grumemory(input, size=None, name=None, @@ -1793,7 +1793,7 @@ def repeat_layer(input, @wrap_name_default("seqreshape") @wrap_act_default(act=IdentityActivation()) @wrap_bias_attr_default(has_bias=False) -@layer_support() +@layer_support(ERROR_CLIPPING, DROPOUT) def seq_reshape_layer(input, reshape_size, act=None, @@ -2703,7 +2703,7 @@ def img_cmrnorm_layer(input, default_factory=lambda _: ParamAttr(initial_mean=1.0, initial_std=0.)) @wrap_act_default(act=ReluActivation()) @wrap_name_default("batch_norm") -@layer_support(DROPOUT) +@layer_support(DROPOUT, ERROR_CLIPPING) def batch_norm_layer(input, act=None, name=None, @@ -2783,15 +2783,6 @@ def batch_norm_layer(input, :return: LayerOutput object. :rtype: LayerOutput """ - if not isinstance(act, ReluActivation): - logger.log(logging.WARN, - "%s is not recommend for batch normalization's activation, " - "maybe the relu is better" % act.name) - - if not isinstance(input.activation, LinearActivation): - logger.log(logging.WARN, - "The activation should be inside batch normalization, the " - "previous layer's activation may be Linear") if num_channels is None: if input.num_filters is not None: @@ -2861,7 +2852,7 @@ def sum_to_one_norm_layer(input, name=None, layer_attr=None): @wrap_name_default("addto") @wrap_act_default(act=LinearActivation()) @wrap_bias_attr_default(has_bias=False) -@layer_support(DROPOUT) +@layer_support(DROPOUT, ERROR_CLIPPING) def addto_layer(input, act=None, name=None, bias_attr=None, layer_attr=None): """ AddtoLayer. @@ -2940,7 +2931,7 @@ def addto_layer(input, act=None, name=None, bias_attr=None, layer_attr=None): @wrap_act_default(act=IdentityActivation()) @wrap_name_default("concat") -@layer_support() +@layer_support(DROPOUT, ERROR_CLIPPING) def concat_layer(input, act=None, name=None, layer_attr=None, bias_attr=None): """ Concat all input vector into one huge vector. @@ -3024,7 +3015,7 @@ def concat_layer(input, act=None, name=None, layer_attr=None, bias_attr=None): @wrap_name_default("seqconcat") @wrap_act_default(act=IdentityActivation()) @wrap_bias_attr_default(has_bias=False) -@layer_support() +@layer_support(DROPOUT, ERROR_CLIPPING) def seq_concat_layer(a, b, act=None, name=None, layer_attr=None, bias_attr=None): """ @@ -3177,7 +3168,7 @@ def memory(name, @wrap_act_default(param_names=['state_act'], act=TanhActivation()) @wrap_act_default(act=TanhActivation()) @wrap_name_default('lstm_step') -@layer_support(ERROR_CLIPPING, DROPOUT) +@layer_support() def lstm_step_layer(input, state, size=None, @@ -4480,7 +4471,7 @@ def tensor_layer(a, @wrap_param_attr_default() @wrap_bias_attr_default() @wrap_act_default() -@layer_support() +@layer_support(DROPOUT, ERROR_CLIPPING) def selective_fc_layer(input, size, select=None, @@ -5974,7 +5965,7 @@ def crop_layer(input, offset, axis=2, shape=None, name=None, layer_attr=None): """ The crop layer crops images by offset and shape. User can set crop shape by args 'shape' explicitly or by reference input layer. - + The example usage is: .. 
code-block:: python
diff --git a/python/paddle/v2/__init__.py b/python/paddle/v2/__init__.py
index 07ab2c9b1898f0ec7a5ca168912f2f03597b094a..5bea980611904b37a4a5d4e2cbbee13503a61ff0 100644
--- a/python/paddle/v2/__init__.py
+++ b/python/paddle/v2/__init__.py
@@ -34,6 +34,7 @@ import minibatch
 import plot
 import image
 import model
+import paddle.trainer.config_parser as cp

 __all__ = [
     'optimizer',
@@ -58,6 +59,8 @@ __all__ = [
     'model',
 ]

+cp.begin_parse()
+

 def init(**kwargs):
     import py_paddle.swig_paddle as api
@@ -73,6 +76,11 @@ def init(**kwargs):
     for key in args_dict.keys():
         args.append('--%s=%s' % (key, str(args_dict[key])))

+    if 'use_gpu' in kwargs:
+        cp.g_command_config_args['use_gpu'] = kwargs['use_gpu']
+    assert 'parallel_nn' not in kwargs, ("currently 'parallel_nn' is not "
+                                         "supported in v2 APIs.")
+
     api.initPaddle(*args)
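A standalone sketch of the flag plumbing in init() above: keyword arguments become --key=value strings for api.initPaddle, with use_gpu additionally mirrored into the config parser (to_flags is a hypothetical helper; sorted() is used only to make the output deterministic):

def to_flags(**kwargs):
    args = []
    for key in sorted(kwargs.keys()):
        args.append('--%s=%s' % (key, str(kwargs[key])))
    return args

print(to_flags(trainer_count=1, use_gpu=False))
# ['--trainer_count=1', '--use_gpu=False']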
""" - assert num_shards >= 1 - assert max_lines_to_shuffle >= 1 - - def open_writers(): - w = [] - for i in range(0, num_shards): - n = "%s/%s-%05d-of-%05d" % (output_path, name_prefix, i, - num_shards - 1) - w.append(recordio.writer(n)) - - return w - - def close_writers(w): - for i in range(0, num_shards): - w[i].close() + assert line_count >= 1 + indx_f = 0 - def write_data(w, lines): + def write_data(indx_f, lines): random.shuffle(lines) - for i, d in enumerate(lines): + filename = "%s/%s-%05d" % (output_path, name_prefix, indx_f) + writer = recordio.writer(filename) + for l in lines: # FIXME(Yancey1989): # dumps with protocol: pickle.HIGHEST_PROTOCOL - o = pickle.dumps(d) - w[i % num_shards].write(o) + writer.write(cPickle.dumps(l)) + writer.close() - w = open_writers() lines = [] - for i, d in enumerate(reader()): lines.append(d) - if i % max_lines_to_shuffle == 0 and i >= max_lines_to_shuffle: - write_data(w, lines) + if i % line_count == 0 and i >= line_count: + write_data(indx_f, lines) lines = [] + indx_f += 1 continue - write_data(w, lines) - close_writers(w) + write_data(indx_f, lines) diff --git a/python/paddle/v2/dataset/mq2007.py b/python/paddle/v2/dataset/mq2007.py index cffb319ad8f56ccddba3fef63e1b6ec68e5bac1e..b705c9109b2b6769c9fafa9241db5d81c682f9e3 100644 --- a/python/paddle/v2/dataset/mq2007.py +++ b/python/paddle/v2/dataset/mq2007.py @@ -242,9 +242,9 @@ def gen_list(querylist): if not isinstance(querylist, QueryList): querylist = QueryList(querylist) querylist._correct_ranking_() - relevance_score_list = [query.relevance_score for query in querylist] + relevance_score_list = [[query.relevance_score] for query in querylist] feature_vector_list = [query.feature_vector for query in querylist] - yield np.array(relevance_score_list).T, np.array(feature_vector_list) + yield np.array(relevance_score_list), np.array(feature_vector_list) def query_filter(querylists): diff --git a/python/paddle/v2/framework/create_op_creation_methods.py b/python/paddle/v2/framework/create_op_creation_methods.py index 7248c3f52a9902e8c08ac2f1405801a5710459e5..b034efffb69030cb09e09ea545e9bff6f1744671 100644 --- a/python/paddle/v2/framework/create_op_creation_methods.py +++ b/python/paddle/v2/framework/create_op_creation_methods.py @@ -220,6 +220,9 @@ def create_op_creation_method(op_proto): __impl__.all_input_args = [var.name for var in op_proto.inputs] __impl__.all_output_args = [var.name for var in op_proto.outputs] __impl__.all_attr_args = [attr.name for attr in op_proto.attrs] + __impl__.all_not_temp_output_args = [ + var.name for var in op_proto.outputs if not var.temporary + ] return __impl__ diff --git a/python/paddle/v2/framework/network.py b/python/paddle/v2/framework/network.py new file mode 100644 index 0000000000000000000000000000000000000000..c85e87413ef45f40755709e134a277b8d8d1e233 --- /dev/null +++ b/python/paddle/v2/framework/network.py @@ -0,0 +1,124 @@ +import paddle.v2.framework.core as core +from paddle.v2.framework.create_op_creation_methods import op_creations +from default_scope_funcs import create_var, get_var, get_cur_scope + +__all__ = ['Network'] # Only expose Network + + +class NetworkFunctor(object): + """ + Network Op Creation Function. Used internally in this module. + It convert string input to Variable. If it is not created before, just + create in scope. + + It is a functor object. means the instances are callable. + + :param func: The op creation function which generated in Python. + :param net: The Network instance. 
+ """ + + def __init__(self, func, net): + self.func = func + self.net = net + + def __call__(self, *args, **kwargs): + if len(args) != 0: + raise ValueError("Paddle must use keyword argument") + inputs = self.func.all_input_args + for ipt in inputs: + if ipt in kwargs: + var = kwargs[ipt] + if isinstance(var, basestring): + var = create_var(var) + if not isinstance(var, core.Variable): + raise TypeError( + "Input of op creation must be string or variable") + + kwargs[ipt] = get_cur_scope().get_var_name(var) + + notemp_outputs = self.func.all_not_temp_output_args + + for name in notemp_outputs: + if name not in kwargs: + kwargs[ + name] = self.func.__name__ + "@OUT@%d" % core.unique_integer( + ) + + outputs = self.func.all_output_args + for opt in outputs: + if opt in kwargs: + var = kwargs[opt] + if isinstance(var, basestring): + var = create_var(var) + if not isinstance(var, core.Variable): + raise TypeError( + "Output of op creation must be string or variable") + kwargs[opt] = get_cur_scope().get_var_name(var) + + op = self.func(**kwargs) + + self.net.net.add_op(op) + + lst = [get_var(kwargs[opt]) for opt in notemp_outputs] + if len(lst) == 1: + return lst[0] + elif len(lst) == 0: + return None + else: + return lst + + +class Network(object): + """ + The network concept. It avoid user to manually create operator, create + variable, and combine them into a Net. Just use Network.xxx can create the + operator, create variables in default scope, and add them into `self.net`. + + For example: + + .. code-block: python + + net = Network() + out = net.add_two(X="a", Y="b") + fc_out = net.fc(X="out", W="fc.w") + + net.run(...) + """ + + def __init__(self): + self.net = core.Net.create() + funcs = (func_name for func_name in dir(op_creations) + if not func_name.startswith("__")) + + # TODO(yuyang18): This code can work, but do not generate a good + # docstring, try to give a better way generate function in runtime + # later. 
+ for func_name in funcs: + func = getattr(op_creations, func_name) + impl = NetworkFunctor(func, self) + setattr(self, func_name, impl.__call__) + self.__complete_add_op__ = False + + def infer_shape(self): + self.complete_add_op() + self.net.infer_shape(get_cur_scope()) + + def run(self, device_context): + self.complete_add_op() + self.net.run(get_cur_scope(), device_context) + + def __str__(self): + return str(self.net) + + def complete_add_op(self): + if not self.__complete_add_op__: + self.net.complete_add_op() + self.__complete_add_op__ = True + + +if __name__ == '__main__': + net = Network() + out = net.add_two(X="a", Y="b") + fc_out = net.fc(X=out, W="fc.w", b="fc.b", activation="softmax") + net.complete_add_op() + print net diff --git a/python/paddle/v2/framework/tests/CMakeLists.txt b/python/paddle/v2/framework/tests/CMakeLists.txt index 254e8d37d13b2c0d2d6dcceb6c96ac0c2c02592d..deaa350133a88d8c9902b3c120f6583580ccd759 100644 --- a/python/paddle/v2/framework/tests/CMakeLists.txt +++ b/python/paddle/v2/framework/tests/CMakeLists.txt @@ -3,7 +3,7 @@ add_python_test(test_framework test_scope.py test_default_scope_funcs.py test_op_creation_methods.py - test_plain_net.py + test_net.py test_tensor.py test_fc_op.py test_add_two_op.py @@ -13,4 +13,5 @@ add_python_test(test_framework test_sigmoid_op.py test_softmax_op.py test_rowwise_add_op.py - test_random_op.py) + test_random_op.py + test_network.py) diff --git a/python/paddle/v2/framework/tests/test_plain_net.py b/python/paddle/v2/framework/tests/test_net.py similarity index 92% rename from python/paddle/v2/framework/tests/test_plain_net.py rename to python/paddle/v2/framework/tests/test_net.py index 53c8dd6c225df2fef9069816935e2778c36c10ee..c4cc5b43bc9cfb857c4d82d5f6a2940235949dc0 100644 --- a/python/paddle/v2/framework/tests/test_plain_net.py +++ b/python/paddle/v2/framework/tests/test_net.py @@ -5,11 +5,11 @@ import unittest class TestNet(unittest.TestCase): def test_net_all(self): - net = core.PlainNet.create() + net = core.Net.create() op1 = op_creations.add_two(X="X", Y="Y", Out="Out") net.add_op(op1) - net2 = core.PlainNet.create() + net2 = core.Net.create() net2.add_op(op_creations.fc(X="X", W="w", Y="fc.out")) net2.complete_add_op(True) net.add_op(net2) diff --git a/python/paddle/v2/framework/tests/test_network.py b/python/paddle/v2/framework/tests/test_network.py new file mode 100644 index 0000000000000000000000000000000000000000..6d53e233e959bd39b558ac97cdca381135505f8d --- /dev/null +++ b/python/paddle/v2/framework/tests/test_network.py @@ -0,0 +1,32 @@ +from paddle.v2.framework.network import Network +import paddle.v2.framework.core as core +import unittest + + +class TestNet(unittest.TestCase): + def test_net_all(self): + net = Network() + out = net.add_two(X="X", Y="Y") + fc_out = net.fc(X=out, W="w") + net.complete_add_op() + self.assertTrue(isinstance(fc_out, core.Variable)) + self.assertEqual( + '''Op(plain_net), inputs:(@EMPTY@, X, Y, w), outputs:(@TEMP@fc@0, add_two@OUT@0, fc@OUT@1). + Op(add_two), inputs:(X, Y), outputs:(add_two@OUT@0). + Op(fc), inputs:(add_two@OUT@0, w, @EMPTY@), outputs:(fc@OUT@1, @TEMP@fc@0). + Op(mul), inputs:(add_two@OUT@0, w), outputs:(@TEMP@fc@0). + Op(sigmoid), inputs:(@TEMP@fc@0), outputs:(fc@OUT@1). +''', str(net)) + + net2 = Network() + tmp = net2.add_two(X="X", Y="Y") + self.assertTrue(isinstance(tmp, core.Variable)) + net2.complete_add_op() + self.assertEqual( + '''Op(plain_net), inputs:(X, Y), outputs:(add_two@OUT@2). + Op(add_two), inputs:(X, Y), outputs:(add_two@OUT@2). 
+''', str(net2))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_recurrent_op.py b/python/paddle/v2/framework/tests/test_recurrent_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..0457e3f16a709140180ce433c1d56d146f0b6974
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_recurrent_op.py
@@ -0,0 +1,92 @@
+import paddle.v2.framework.core as core
+import unittest
+import numpy as np
+import paddle.v2.framework.create_op_creation_methods as creation
+
+ops = creation.op_creations
+
+
+def create_tensor(scope, name, shape):
+    tensor = scope.create_var(name).get_tensor()
+    tensor.set_dims(shape)
+    tensor.alloc_float()
+    tensor.set(np.random.random(shape))
+    return tensor
+
+
+class TestRNN(unittest.TestCase):
+    '''
+    Test RNNOp
+
+    equation:
+        h_t = \sigma (W x_t + U h_{t-1})
+    weights:
+        - W
+        - U
+    vars:
+        - x
+    memories:
+        - h
+    outputs:
+        - h
+    '''
+
+    def init(self):
+        input_dim = 30
+        batch_size = 50
+        weight_dim = 15
+
+        self.scope = core.Scope(None)
+
+        # create vars
+        create_tensor(self.scope, "x", [batch_size, input_dim])
+        create_tensor(self.scope, "W", [input_dim, weight_dim])
+        create_tensor(self.scope, "U", [weight_dim, weight_dim])
+        create_tensor(self.scope, "h_boot", [batch_size, weight_dim])
+
+        x_alias = "x@alias"
+        y_alias = "y@alias"
+        memory = "h@alias"
+        prememory = "h@pre"
+        output = "rnn_out"
+        output_alias = "rnn_out@alias"
+
+        # create step net
+        stepnet_var = self.scope.create_var("stepnet")
+        stepnet = stepnet_var.get_net()
+        # stepnet = core.Net.create()
+        x_fc_op = ops.fc(X=x_alias, W="W", Y="Wx")
+        h_fc_op = ops.fc(X=prememory, W="U", Y="Uh")
+        sum_op = ops.add_two(X="Wx", Y="Uh", Out="sum")
+        sig_op = ops.sigmoid(X="sum", Y=memory)
+        stepnet.add_op(x_fc_op)
+        stepnet.add_op(h_fc_op)
+        stepnet.add_op(sum_op)
+        stepnet.add_op(sig_op)
+        stepnet.complete_add_op(True)
+
+        # create RNNOp
+        rnnop = ops.recurrent_op(
+            # inputs
+            inlinks=["x"],
+            boot_memories=["h_boot"],
+            step_net="stepnet",
+            # outputs
+            outlinks=[output],
+            step_scopes="step_scopes",
+            # attributes
+            inlink_alias=["x@alias"],
+            outlink_alias=[output_alias],
+            pre_memories=[prememory],
+            memories=[memory])
+
+        ctx = core.DeviceContext.cpu_context()
+        rnnop.infer_shape(self.scope)
+        rnnop.run(self.scope, ctx)
+
+    def test_recurrent(self):
+        self.init()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/inference.py b/python/paddle/v2/inference.py
index 40134a3270c3579fd2f6a891af66ff241050f60c..4dcc3ab57e7e6dfbe040ac61025e55b9e48b4415 100644
--- a/python/paddle/v2/inference.py
+++ b/python/paddle/v2/inference.py
@@ -35,6 +35,13 @@ class Inference(object):
                 name = param.getName()
                 assert isinstance(val, api.Vector)
                 val.copyFromNumpyArray(parameters.get(name).flatten())
+                # setValueUpdated is called in the randomize, zeroMem and
+                # load functions in paddle/parameter/Parameter.cpp. But in
+                # inference mode it is never called, so the parameter would
+                # not be dispatched in MultiGradientMachine for multi-GPU.
+                # setValueUpdated is therefore called here, though it would
+                # be better to call this function in one place.
+ param.setValueUpdated() self.__gradient_machine__ = gm self.__data_types__ = topo.data_type() diff --git a/python/paddle/v2/layer.py b/python/paddle/v2/layer.py index 4ade1c6f329ae39769279963af6809f938807bdd..6a2bb8d337b7667aa2b1e3ef0815bb80f6e38d6a 100644 --- a/python/paddle/v2/layer.py +++ b/python/paddle/v2/layer.py @@ -324,6 +324,3 @@ def parse_network(output_layers, extra_layers=None): def get_layer(name): return config_base.__layer_map__.get(name) - - -cp.begin_parse() diff --git a/python/paddle/v2/master/client.py b/python/paddle/v2/master/client.py index 4dc31bff583ee933b33e475d9421c21a7bb74449..b658a81630733fea3976b812afe819d76de4cb25 100644 --- a/python/paddle/v2/master/client.py +++ b/python/paddle/v2/master/client.py @@ -1,8 +1,15 @@ import ctypes import os -path = os.path.join(os.path.dirname(__file__), "libpaddle_master.so") -lib = ctypes.cdll.LoadLibrary(path) +__lib__ = None + + +def get_c_lib(): + global __lib__ + if __lib__ is None: + path = os.path.join(os.path.dirname(__file__), "libpaddle_master.so") + __lib__ = ctypes.cdll.LoadLibrary(path) + return __lib__ class client(object): @@ -11,8 +18,8 @@ class client(object): """ def __init__(self, etcd_endpoints, timeout_sec, buf_size=0): - self.c = lib.paddle_new_etcd_master_client(etcd_endpoints, timeout_sec, - buf_size) + self.c = get_c_lib().paddle_new_etcd_master_client( + etcd_endpoints, timeout_sec, buf_size) def request_save_model(self, trainer_id, block_ms): """request to save model @@ -32,20 +39,20 @@ class client(object): saving the model, -1 if error happened. """ - return lib.paddle_request_save_model(self.c, trainer_id, block_ms) + return get_c_lib().paddle_request_save_model(self.c, trainer_id, + block_ms) def release(self): - lib.paddle_release_master_client(self.c) + get_c_lib().paddle_release_master_client(self.c) self.c = None def set_dataset(self, paths): holder_type = ctypes.c_char_p * len(paths) holder = holder_type() - print paths for idx, path in enumerate(paths): c_ptr = ctypes.c_char_p(path) holder[idx] = c_ptr - lib.paddle_set_dataset(self.c, holder, len(paths)) + get_c_lib().paddle_set_dataset(self.c, holder, len(paths)) def next_record(self): """gets next record for training @@ -56,7 +63,7 @@ class client(object): """ p = ctypes.c_char_p() ret = ctypes.pointer(p) - size = lib.paddle_next_record(self.c, ret) + size = get_c_lib().paddle_next_record(self.c, ret) if size < 0: # Error return None, size @@ -67,5 +74,5 @@ class client(object): record = ret.contents.value[:size] # Memory created from C should be freed. - lib.mem_free(ret.contents) + get_c_lib().mem_free(ret.contents) return record, 0
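A hedged usage sketch for the lazily loaded master client above (the endpoint and paths are made up; it assumes a running master service and a libpaddle_master.so next to the module, which is now only loaded on first use rather than at import time):

# Illustrative only.
from paddle.v2.master.client import client

c = client("http://127.0.0.1:2379", 5)
c.set_dataset(["/data/train-00000.recordio", "/data/train-00001.recordio"])

while True:
    record, err = c.next_record()
    if record is None:  # err < 0 signals an error; otherwise no record left
        break
    # feed `record` (a pickled instance) to the trainer here

c.release()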