diff --git a/.travis.yml b/.travis.yml index 498674469b27f585af798b95f30b74ebed99e32c..2cf7666fb5d0c47034676864a52c3d3dbce19683 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,6 +4,7 @@ cache: - $HOME/.ccache - $HOME/.cache/pip - $TRAVIS_BUILD_DIR/build/third_party + - $TRAVIS_BUILD_DIR/build_android/third_party sudo: required dist: trusty os: @@ -11,6 +12,7 @@ os: env: - JOB=build_doc - JOB=check_style + - JOB=build_android addons: apt: packages: diff --git a/CMakeLists.txt b/CMakeLists.txt index 6bc6a8077c1e9b242af50f1b912e91bbc6c3a9fb..dcff6b54cafce35846627e78cfcdac65fae7e686 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,7 +28,9 @@ if(NOT CMAKE_CROSSCOMPILING) endif(NOT CMAKE_CROSSCOMPILING) find_package(Git REQUIRED) find_package(Threads REQUIRED) -find_package(Boost QUIET) +if(NOT ANDROID) + find_package(Boost QUIET) +endif() include(simd) @@ -135,7 +137,8 @@ if(WITH_GPU) endif(WITH_GPU) if(USE_NNPACK) - list(APPEND EXTERNAL_LIBS ${NNPACK_LIB} ${PTHREADPOOL_LIB} "rt") + include(external/nnpack) + list(APPEND EXTERNAL_LIBS ${NNPACK_LIBS}) endif(USE_NNPACK) add_subdirectory(proto) @@ -151,7 +154,9 @@ if(WITH_GOLANG) endif(WITH_GOLANG) add_subdirectory(paddle) -add_subdirectory(python) +if(WITH_PYTHON) + add_subdirectory(python) +endif() if(WITH_DOC) add_subdirectory(doc) endif() diff --git a/Dockerfile b/Dockerfile index ed5910d93b41dba8d50b2ba01c59c635797edd29..8cfb16928c95dcbfac08383d32562ff67933d873 100644 --- a/Dockerfile +++ b/Dockerfile @@ -25,7 +25,7 @@ COPY ./paddle/scripts/docker/root/ /root/ RUN apt-get update && \ apt-get install -y \ git python-pip python-dev openssh-server bison \ - wget unzip tar xz-utils bzip2 gzip coreutils ntp \ + wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \ curl sed grep graphviz libjpeg-dev zlib1g-dev \ python-numpy python-matplotlib gcc g++ \ automake locales clang-format-3.8 swig doxygen cmake \ diff --git a/Dockerfile.android b/Dockerfile.android index fa24f6f06c4e76444c83bcf13fe312afdcb6c348..c0fa58c384f9ebcae60477ffce49ea4ffa929db9 100644 --- a/Dockerfile.android +++ b/Dockerfile.android @@ -14,6 +14,17 @@ RUN apt-get update && \ wget curl tar unzip gcc g++ locales clang-format-3.8 swig cmake && \ apt-get clean -y +# Install Go and glide +RUN wget -O go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz && \ + tar -C /usr/local -xzf go.tgz && \ + mkdir /root/gopath && \ + mkdir /root/gopath/bin && \ + mkdir /root/gopath/src && \ + rm go.tgz +ENV GOROOT=/usr/local/go GOPATH=/root/gopath +# should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT. +ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin + # git credential to skip password typing RUN git config --global credential.helper store diff --git a/cmake/configure.cmake b/cmake/configure.cmake index a4f98ec7d4af652d0dd0650f4906696ff3a4efb9..7afab5d5344b704a9329e313a81379032ba0cc97 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -102,12 +102,19 @@ if(WITH_GOLANG) message(FATAL_ERROR "no glide executeble found: $ENV{GOPATH}/bin/glide") endif() - add_custom_target(go_vendor) - add_custom_command(TARGET go_vendor + # this command will only run when the file it depends is missing + # or has changed, or the output is missing. 
+ add_custom_command(OUTPUT ${CMAKE_BINARY_DIR}/glide COMMAND env GOPATH=${GOPATH} ${GLIDE} install + COMMAND touch ${CMAKE_BINARY_DIR}/glide + DEPENDS ${PROJ_ROOT}/go/glide.lock WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go" - ) - add_dependencies(go_vendor go_path) + ) + + # depends on the custom command which outputs + # ${CMAKE_BINARY_DIR}/glide, the custom command does not need to + # run every time this target is built. + add_custom_target(go_vendor DEPENDS ${CMAKE_BINARY_DIR}/glide go_path) endif() endif(WITH_GOLANG) diff --git a/cmake/cpplint.cmake b/cmake/cpplint.cmake index 6bbcd730e1b5ac49415cac676352e6df00eb6eb5..656e1a0803c6e389d70f37f592c3aa2e95a2bcd4 100644 --- a/cmake/cpplint.cmake +++ b/cmake/cpplint.cmake @@ -27,7 +27,8 @@ set(IGNORE_PATTERN .*cblas\\.h.* .*\\.pb\\.txt .*LtrDataProvider.* - .*MultiDataProvider.*) + .*MultiDataProvider.* + .*pb.*) # add_style_check_target # @@ -52,14 +53,13 @@ macro(add_style_check_target TARGET_NAME) endif() endforeach() if(LINT MATCHES ON) + # cpplint code style get_filename_component(base_filename ${filename} NAME) set(CUR_GEN ${CMAKE_CURRENT_BINARY_DIR}/${base_filename}.cpplint) - add_custom_command(OUTPUT ${CUR_GEN} - PRE_BUILD - COMMAND env ${py_env} "${PYTHON_EXECUTABLE}" "${PROJ_ROOT}/paddle/scripts/cpplint.py" - "--filter=${STYLE_FILTER}" - "--write-success=${CUR_GEN}" ${filename} - DEPENDS ${filename} + add_custom_command(TARGET ${TARGET_NAME} PRE_BUILD + COMMAND "${PYTHON_EXECUTABLE}" "${PROJ_ROOT}/paddle/scripts/cpplint.py" + "--filter=${STYLE_FILTER}" + "--write-success=${CUR_GEN}" ${filename} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) endif() endforeach() diff --git a/cmake/cross_compiling/android.cmake b/cmake/cross_compiling/android.cmake index 9724c16122ab2e6be55864c8716698c9b9d7c3f0..5e3e437a8da9624df35a5c754fe77be73f20361d 100644 --- a/cmake/cross_compiling/android.cmake +++ b/cmake/cross_compiling/android.cmake @@ -106,6 +106,10 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0") SET(CMAKE_SYSTEM_PROCESSOR armv7-a) ENDIF() ENDIF() + IF(ANDROID_ABI STREQUAL "arm64-v8a") + SET(ANDROID_TOOLCHAIN_NAME aarch64-linux-android) + SET(CMAKE_SYSTEM_PROCESSOR aarch64) + ENDIF() SET(ANDROID_TOOLCHAIN_PREFIX "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_NAME}-") ENDIF() @@ -162,6 +166,10 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0") ENDIF() ENDIF() + IF(ANDROID_ABI STREQUAL "arm64-v8a") + LIST(APPEND ANDROID_COMPILER_FLAGS -march=armv8-a) + ENDIF() + STRING(REPLACE ";" " " ANDROID_COMPILER_FLAGS "${ANDROID_COMPILER_FLAGS}") STRING(REPLACE ";" " " ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS}") @@ -186,6 +194,10 @@ ELSE() SET(CMAKE_ANDROID_STANDALONE_TOOLCHAIN ${ANDROID_STANDALONE_TOOLCHAIN}) ENDIF() SET(CMAKE_ANDROID_ARCH_ABI ${ANDROID_ABI}) - SET(CMAKE_ANDROID_ARM_MODE ${ANDROID_ARM_MODE}) - SET(CMAKE_ANDROID_ARM_NEON ${ANDROID_ARM_NEON}) + IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$") + SET(CMAKE_ANDROID_ARM_MODE ${ANDROID_ARM_MODE}) + IF(ANDROID_ABI STREQUAL "armeabi-v7a") + SET(CMAKE_ANDROID_ARM_NEON ${ANDROID_ARM_NEON}) + ENDIF() + ENDIF() ENDIF() diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake index bd401faa6eb8a583bce542db68852f8571681daf..8a594a825abdca6a0f989b94fa42f97d6df5e10a 100644 --- a/cmake/external/glog.cmake +++ b/cmake/external/glog.cmake @@ -52,6 +52,7 @@ ExternalProject_Add( ADD_LIBRARY(glog STATIC IMPORTED GLOBAL) SET_PROPERTY(TARGET glog PROPERTY IMPORTED_LOCATION ${GLOG_LIBRARIES}) -ADD_DEPENDENCIES(glog extern_glog) +ADD_DEPENDENCIES(glog extern_glog gflags) +LINK_LIBRARIES(glog gflags) 
LIST(APPEND external_project_dependencies glog) diff --git a/paddle/function/nnpack/nnpack.cmake b/cmake/external/nnpack.cmake similarity index 54% rename from paddle/function/nnpack/nnpack.cmake rename to cmake/external/nnpack.cmake index 7182730ae8f133bdc4f73bfc46fa8acbe5f3b603..d42bcb0f329041462bd8b568052fbb8226d18e4e 100644 --- a/paddle/function/nnpack/nnpack.cmake +++ b/cmake/external/nnpack.cmake @@ -7,10 +7,24 @@ set(NNPACK_ROOT $ENV{NNPACK_ROOT} CACHE PATH "Folder contains NNPACK") find_path(NNPACK_INC_DIR nnpack.h PATHS ${NNPACK_ROOT}/include) find_library(NNPACK_LIB NAMES nnpack PATHS ${NNPACK_ROOT}/lib) find_library(PTHREADPOOL_LIB NAMES pthreadpool PATHS ${NNPACK_ROOT}/lib) +find_library(NNPACK_UKERNELS_LIB NAMES nnpack_ukernels PATHS ${NNPACK_ROOT}/lib) +find_library(NNPACK_CPUFEATURES_LIB NAMES cpufeatures PATHS ${NNPACK_ROOT}/lib) if(NNPACK_INC_DIR AND NNPACK_LIB AND PTHREADPOOL_LIB) set(NNPACK_FOUND ON) INCLUDE_DIRECTORIES(${NNPACK_INC_DIR}) + + set(NNPACK_LIBS) + list(APPEND NNPACK_LIBS ${NNPACK_LIB} ${PTHREADPOOL_LIB}) + if (NNPACK_UKERNELS_LIB) + list(APPEND NNPACK_LIBS ${NNPACK_UKERNELS_LIB}) + endif() + if (NNPACK_CPUFEATURES_LIB) + list(APPEND NNPACK_LIBS ${NNPACK_CPUFEATURES_LIB}) + endif() + if(NOT ANDROID) + list(APPEND NNPACK_LIBS "rt") + endif() else() message(FATAL_ERROR "Cannot find NNPACK in (${NNPACK_ROOT})") endif() diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index 5b9d9844ed21ceb507a8e01676c3533f4e3dd8fb..60a1041936437775e0994157b8ffcb7c52b7ab87 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -32,7 +32,12 @@ IF(NOT ${CBLAS_FOUND}) # arm_soft_fp_abi branch of OpenBLAS to support softfp # https://github.com/xianyi/OpenBLAS/tree/arm_soft_fp_abi SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5") - SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER} TARGET=ARMV7 ARM_SOFTFP_ABI=1 USE_THREAD=0) + IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$") + SET(TARGET "ARMV7") + ELSEIF(ANDROID_ABI STREQUAL "arm64-v8a") + SET(TARGET "ARMV8") + ENDIF() + SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER} TARGET=${TARGET} ARM_SOFTFP_ABI=1 USE_THREAD=0) ELSEIF(RPI) # use hardfp SET(OPENBLAS_COMMIT "v0.2.19") diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 71ee266611df27cd72c586c49ce2765e6ea0471b..e42e75c12ab1e5133f5ecbdb90ef26e3f8df5133 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -90,11 +90,11 @@ # including binary directory for generated headers. include_directories(${CMAKE_CURRENT_BINARY_DIR}) -if(NOT APPLE) +if(NOT APPLE AND NOT ANDROID) find_package(Threads REQUIRED) link_libraries(${CMAKE_THREAD_LIBS_INIT}) set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -ldl -lrt") -endif(NOT APPLE) +endif(NOT APPLE AND NOT ANDROID) function(merge_static_libs TARGET_NAME) set(libs ${ARGN}) @@ -104,6 +104,7 @@ function(merge_static_libs TARGET_NAME) foreach(lib ${libs}) list(APPEND libs_deps ${${lib}_LIB_DEPENDS}) endforeach() + list(REMOVE_DUPLICATES libs_deps) if(APPLE) # Use OSX's libtool to merge archives # To produce a library we need at least one source file. 
@@ -127,7 +128,7 @@ function(merge_static_libs TARGET_NAME) # Get the file names of the libraries to be merged set(libfiles ${libfiles} $) endforeach() - add_custom_command(TARGET ${TARGET_NAME} POST_BUILD + add_custom_command(TARGET ${TARGET_NAME} POST_BUILD COMMAND rm "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" COMMAND /usr/bin/libtool -static -o "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" ${libfiles}) else() # general UNIX: use "ar" to extract objects and re-add to a common lib @@ -145,11 +146,11 @@ function(merge_static_libs TARGET_NAME) DEPENDS ${lib} ${objdir} WORKING_DIRECTORY ${objdir}) - # Empty dummy source file that goes into merged library - set(mergebase ${lib}.mergebase.c) - add_custom_command(OUTPUT ${mergebase} - COMMAND ${CMAKE_COMMAND} -E touch ${mergebase} - DEPENDS ${objlistfile}) + # Empty dummy source file that goes into merged library + set(mergebase ${lib}.mergebase.c) + add_custom_command(OUTPUT ${mergebase} + COMMAND ${CMAKE_COMMAND} -E touch ${mergebase} + DEPENDS ${objlistfile}) list(APPEND mergebases "${mergebase}") endforeach() @@ -184,6 +185,10 @@ function(cc_library TARGET_NAME) add_dependencies(${TARGET_NAME} ${cc_library_DEPS}) target_link_libraries(${TARGET_NAME} ${cc_library_DEPS}) endif() + + # cpplint code style + add_style_check_target(${TARGET_NAME} ${cc_library_SRCS}) + else(cc_library_SRCS) if (cc_library_DEPS) merge_static_libs(${TARGET_NAME} ${cc_library_DEPS}) @@ -337,7 +342,7 @@ function(go_test TARGET_NAME) string(REPLACE "${PADDLE_GO_PATH}" "" CMAKE_CURRENT_SOURCE_REL_DIR ${CMAKE_CURRENT_SOURCE_DIR}) add_custom_target(${TARGET_NAME} ALL DEPENDS go_vendor ${go_test_DEPS}) add_custom_command(TARGET ${TARGET_NAME} POST_BUILD - COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} test + COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} test -race -c -o "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}" ".${CMAKE_CURRENT_SOURCE_REL_DIR}" WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go") diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst index 4f4a9187bcbe8ef902e923622552909808b121d6..daee55b7f9adfffdf709ed2b5b0d957c7ca1aea4 100644 --- a/doc/api/v2/config/layer.rst +++ b/doc/api/v2/config/layer.rst @@ -474,6 +474,11 @@ prelu .. autoclass:: paddle.v2.layer.prelu :noindex: +gated_unit +----------- +.. autoclass:: paddle.v2.layer.gated_unit + :noindex: + Detection output Layer ====================== diff --git a/go/cmd/master/master.go b/go/cmd/master/master.go index 54fa254863156455f66fa87de9077042a45f9735..9eaf8c04ae01fe7eebc92c51803bfcf977995ee3 100644 --- a/go/cmd/master/master.go +++ b/go/cmd/master/master.go @@ -11,6 +11,7 @@ import ( "github.com/namsral/flag" log "github.com/sirupsen/logrus" + "github.com/topicai/candy" "github.com/PaddlePaddle/Paddle/go/master" "github.com/PaddlePaddle/Paddle/go/utils/networkhelper" @@ -20,11 +21,18 @@ func main() { port := flag.Int("port", 8080, "port of the master server.") ttlSec := flag.Int("ttl", 60, "etcd lease TTL in seconds.") endpoints := flag.String("endpoints", "http://127.0.0.1:2379", "comma separated etcd endpoints. 
If empty, fault tolerance will not be enabled.") - taskTimeoutDur := flag.Duration("task_timout_dur", 20*time.Minute, "task timout duration.") - taskTimeoutMax := flag.Int("task_timeout_max", 3, "max timtout count for each task before it being declared failed task.") - chunkPerTask := flag.Int("chunk_per_task", 10, "chunk per task.") + taskTimeoutDur := flag.Duration("task-timout-dur", 20*time.Minute, "task timout duration.") + taskTimeoutMax := flag.Int("task-timeout-max", 3, "max timtout count for each task before it being declared failed task.") + chunkPerTask := flag.Int("chunk-per-task", 10, "chunk per task.") + logLevel := flag.String("log-level", "info", + "log level, possible values: debug, info, warning, error, fatal, panic") flag.Parse() + level, e := log.ParseLevel(*logLevel) + candy.Must(e) + + log.SetLevel(level) + if *endpoints == "" { log.Warningln("-endpoints not set, fault tolerance not be enabled.") } diff --git a/go/cmd/pserver/pserver.go b/go/cmd/pserver/pserver.go index b331b8126cadc2c5df516fb241913415b2e3e73d..652d7ba315d72ff19931b82a4b0d1c30b2ff8f37 100644 --- a/go/cmd/pserver/pserver.go +++ b/go/cmd/pserver/pserver.go @@ -40,7 +40,7 @@ func main() { idx = *index } else { e = pserver.NewEtcdClient(*etcdEndpoint, *numPservers, *etcdTimeout) - idx, err = e.Register() + idx, err = e.Register(*port) candy.Must(err) cp, err = pserver.NewCheckpointFromFile(*checkpointPath, idx, e) diff --git a/go/master/client.go b/go/master/client.go index a2ca3f3ef8ce300e3df09a302d74b56ee23c6d10..de883bf4b9a3de8d6d6e35e8e808dcf7ba54cb46 100644 --- a/go/master/client.go +++ b/go/master/client.go @@ -2,6 +2,7 @@ package master import ( "os" + "time" "github.com/PaddlePaddle/Paddle/go/connection" "github.com/PaddlePaddle/recordio" @@ -36,9 +37,9 @@ func (c *Client) getRecords() { for { t, err := c.getTask() if err != nil { - // TODO(helin): wait before move on with next // getTask call. 
- log.Errorln(err) + log.Errorf("Get task failed, sleep 3 seconds and continue, %s", err) + time.Sleep(3 * time.Second) continue } diff --git a/go/master/service.go b/go/master/service.go index a6050ab99437244dade83f2943f6649453d47fff..9cef2270ce6a51425e40b9281f93f2f9c9981329 100644 --- a/go/master/service.go +++ b/go/master/service.go @@ -215,6 +215,7 @@ func readChunks(globPaths []string) ([]Chunk, error) { } count := index.NumChunks() + log.Infof("readChunks: file %s has %d chunks", path, count) for i := 0; i < count; i++ { chunk := Chunk{ Path: path, diff --git a/go/pserver/client/c/test/test_train.py b/go/pserver/client/c/test/test_train.py index d6922672f4c1253e62cfe54965f6c2f3b5e6c7bf..e9264592b4f18fddf68b198d73bf907206e77a3f 100644 --- a/go/pserver/client/c/test/test_train.py +++ b/go/pserver/client/c/test/test_train.py @@ -1,5 +1,23 @@ import paddle.v2 as paddle import paddle.v2.dataset.uci_housing as uci_housing +import paddle.v2.master as master +import os +import cPickle as pickle + +etcd_ip = os.getenv("MASTER_IP", "127.0.0.1") +etcd_endpoint = "http://" + etcd_ip + ":2379" + + +def cloud_reader(): + print "connecting to master, etcd endpoints: ", etcd_endpoint + master_client = master.client(etcd_endpoint, 5, 64) + master_client.set_dataset( + ["/pfs/dlnel/public/dataset/uci_housing/uci_housing-*-of-*"]) + while 1: + r, e = master_client.next_record() + if not r: + break + yield pickle.loads(r) def main(): @@ -22,13 +40,13 @@ def main(): # create optimizer of new remote updater to pserver optimizer = paddle.optimizer.Momentum(momentum=0) - #TODO(zhihong) : replace optimizer with new OptimizerConfig - + print "etcd endoint: ", etcd_endpoint trainer = paddle.trainer.SGD(cost=cost, parameters=parameters, update_equation=optimizer, is_local=False, - pserver_spec="localhost:3000") + pserver_spec=etcd_endpoint, + use_etcd=True) # event_handler to print training and testing info def event_handler(event): @@ -47,11 +65,11 @@ def main(): print "Test %d, %.2f" % (event.pass_id, result.cost) # training + # NOTE: use uci_housing.train() as reader for non-paddlecloud training trainer.train( reader=paddle.batch( paddle.reader.shuffle( - uci_housing.train(), buf_size=500), - batch_size=2), + cloud_reader, buf_size=500), batch_size=2), feeding={'x': 0, 'y': 1}, event_handler=event_handler, diff --git a/go/pserver/client/client_test.go b/go/pserver/client/client_test.go index 2b72a202b55eab4b0e9107b93807f3ceea95f099..aab91556b4b91fab6de66322987eabe24f1b0f70 100644 --- a/go/pserver/client/client_test.go +++ b/go/pserver/client/client_test.go @@ -3,11 +3,13 @@ package client_test import ( "context" "io/ioutil" + "math/rand" "net" "net/http" "net/rpc" "strconv" "strings" + "sync" "testing" "time" @@ -100,27 +102,34 @@ func (l lister) List() []client.Server { return l } -func ClientTest(t *testing.T, c *client.Client) { +func testClient(t *testing.T, c *client.Client) { selected := c.BeginInitParams() if !selected { t.Fatal("should be selected.") } - const numParameter = 100 + const numParameter = 1000 config, err := ioutil.ReadFile("./c/test/testdata/optimizer.pb") if err != nil { t.Fatalf("read optimizer proto failed") } + + var wg sync.WaitGroup for i := 0; i < numParameter; i++ { - var p pserver.Parameter - p.Name = "p_" + strconv.Itoa(i) - p.ElementType = pserver.Float32 - p.Content = make([]byte, (i+1)*100) - err := c.InitParam(pserver.ParameterWithConfig{Param: p, Config: config}) - if err != nil { - t.Fatal(err) - } + wg.Add(1) + go func(i int) { + var p pserver.Parameter + p.Name = "p_" 
+ strconv.Itoa(i) + p.ElementType = pserver.Float32 + p.Content = make([]byte, (i+1)*100) + err := c.InitParam(pserver.ParameterWithConfig{Param: p, Config: config}) + if err != nil { + t.Fatal(err) + } + wg.Done() + }(i) } + wg.Wait() err = c.FinishInitParams() if err != nil { @@ -128,7 +137,7 @@ func ClientTest(t *testing.T, c *client.Client) { } var grads []pserver.Gradient - for i := 0; i < numParameter/2; i++ { + for i := 0; i < numParameter; i++ { var g pserver.Gradient g.Name = "p_" + strconv.Itoa(i) g.ElementType = pserver.Float32 @@ -136,9 +145,31 @@ func ClientTest(t *testing.T, c *client.Client) { grads = append(grads, g) } - err = c.SendGrads(grads) - if err != nil { - t.Fatal(err) + const paramPerGroup = 10 + const numGroups = numParameter / paramPerGroup + + // shuffle send grads order + for i := range grads { + j := rand.Intn(i + 1) + grads[i], grads[j] = grads[j], grads[i] + } + + for i := 0; i < numGroups; i++ { + var gs []pserver.Gradient + if i == numGroups-1 { + gs = grads[i*paramPerGroup:] + } else { + gs = grads[i*paramPerGroup : (i+1)*paramPerGroup] + } + + wg.Add(1) + go func(gs []pserver.Gradient) { + err := c.SendGrads(gs) + if err != nil { + t.Fatal(err) + } + wg.Done() + }(gs) } names := make([]string, numParameter) @@ -146,20 +177,35 @@ func ClientTest(t *testing.T, c *client.Client) { names[i] = "p_" + strconv.Itoa(i) } - params, err := c.GetParams(names) - if err != nil { - t.Fatal(err) - } + for i := 0; i < numGroups; i++ { + var ns []string + if i == numGroups-1 { + ns = names[i*paramPerGroup:] + } else { + ns = names[i*paramPerGroup : (i+1)*paramPerGroup] + } - if len(names) != len(params) { - t.Fatalf("parameter size not match, need: %d, have: %d", len(names), len(params)) - } + wg.Add(1) + go func(ns []string) { + params, err := c.GetParams(ns) + if err != nil { + t.Fatal(err) + } - for i := range params { - if names[i] != params[i].Name { - t.Fatalf("order of returned parameter does not required: parameter name: %s, required name: %s", names[i], params[i].Name) - } + if len(ns) != len(params) { + t.Fatalf("parameter size not match, need: %d, have: %d", len(names), len(params)) + } + + for i := range params { + if ns[i] != params[i].Name { + t.Fatalf("order of returned parameter does not required: parameter name: %s, required name: %s", ns[i], params[i].Name) + } + } + wg.Done() + }(ns) } + + wg.Wait() } func TestNativeClient(t *testing.T) { @@ -169,13 +215,14 @@ func TestNativeClient(t *testing.T) { servers[i] = client.Server{Index: i, Addr: ":" + strconv.Itoa(pserverClientPorts[i])} } c1 := client.NewClient(lister(servers), len(servers), selector(true)) - ClientTest(t, c1) + testClient(t, c1) } -// TODO: tmperary disable etcdClient test for dependency of etcd) +// EtcdClient is a disabled test, since we have not embedded etcd into +// our test. 
func EtcdClient(t *testing.T) { initEtcdClient() etcdClient := client.NewEtcd(etcdEndpoints) c2 := client.NewClient(etcdClient, etcdClient.Desired(), selector(true)) - ClientTest(t, c2) + testClient(t, c2) } diff --git a/go/pserver/client/etcd_client.go b/go/pserver/client/etcd_client.go index 1fd3479aa88ccbbe7c5067da1e9886b65352e847..8eb2a4f4511fc7139a55a2cd47ad73a82137b260 100644 --- a/go/pserver/client/etcd_client.go +++ b/go/pserver/client/etcd_client.go @@ -12,6 +12,7 @@ import ( ) const ( + // DefaultEtcdTimeout is the default etcd timeout DefaultEtcdTimeout time.Duration = 5 * time.Second ) @@ -66,12 +67,12 @@ func (p *EtcdClient) List() []Server { for { for i := 0; i < psDesired; i++ { ctx, cancel := context.WithTimeout(context.Background(), p.timeout) - cancel() psKey := pserver.PsPath + strconv.Itoa(i) log.Debugf("checking %s", psKey) resp, err := p.client.Get(ctx, psKey) + cancel() if err != nil { - log.Infof("Get psKey= %s error, %v", psKey, err) + log.Infof("Get psKey=%s error, %v", psKey, err) time.Sleep(p.timeout) continue } diff --git a/go/pserver/etcd_client.go b/go/pserver/etcd_client.go index 4a694b97f47b2ab85d1e109ef7545d104194b5cf..66af4fa0b483f1caea385df61e54d871072a0375 100644 --- a/go/pserver/etcd_client.go +++ b/go/pserver/etcd_client.go @@ -49,7 +49,7 @@ func NewEtcdClient(endpoints string, numPservers int, timeout time.Duration) *Et // Register registers the pserver on etcd // // Register returns the index of the current pserver. -func (e *EtcdClient) Register() (int, error) { +func (e *EtcdClient) Register(port int) (int, error) { var err error e.externalIP, err = networkhelper.GetExternalIP() @@ -116,7 +116,7 @@ func (e *EtcdClient) Register() (int, error) { for { ctx, cancel := context.WithTimeout(context.Background(), time.Second) var err error - pserverIdx, err = e.registerPserverEtcd(ctx) + pserverIdx, err = e.registerPserverEtcd(ctx, port) cancel() if err != nil { log.Warn(err) @@ -140,7 +140,7 @@ func (e *EtcdClient) initDesiredPservers(ctx context.Context, numPservers int) ( } // registerPserverEtcd registers pserver node on etcd using transaction. 
-func (e *EtcdClient) registerPserverEtcd(ctx context.Context) (int, error) { +func (e *EtcdClient) registerPserverEtcd(ctx context.Context, port int) (int, error) { var idx int _, err := concurrency.NewSTM(e.etcdClient, func(c concurrency.STM) error { registered := false @@ -156,8 +156,9 @@ func (e *EtcdClient) registerPserverEtcd(ctx context.Context) (int, error) { log.Fatal(err) } // find the first id and write info - c.Put(psKey, e.externalIP, clientv3.WithLease(resp.ID)) - log.Debugf("set pserver node %s with value %s", psKey, e.externalIP) + pserverAddr := e.externalIP + ":" + strconv.Itoa(port) + c.Put(psKey, pserverAddr, clientv3.WithLease(resp.ID)) + log.Debugf("set pserver node %s with value %s", psKey, pserverAddr) ch, kaerr := e.etcdClient.KeepAlive(context.TODO(), resp.ID) if kaerr != nil { log.Errorf("keepalive etcd node error: %v", kaerr) diff --git a/go/pserver/optimizer.go b/go/pserver/optimizer.go index a6b73dd5a187727f29b6bb0c8b949e5c8bea2195..d6b7fafd59c0453b9f40019166d01ebdc9775117 100644 --- a/go/pserver/optimizer.go +++ b/go/pserver/optimizer.go @@ -19,6 +19,7 @@ var nullPtr = unsafe.Pointer(uintptr(0)) type optimizer struct { opt *C.struct_paddle_optimizer elementType ElementType + contentLen int } func cArrayToSlice(p unsafe.Pointer, len int) []byte { @@ -37,10 +38,11 @@ func cArrayToSlice(p unsafe.Pointer, len int) []byte { func newOptimizer(paramWithConfigs ParameterWithConfig, State []byte) *optimizer { o := &optimizer{} o.elementType = paramWithConfigs.Param.ElementType + o.contentLen = len(paramWithConfigs.Param.Content) p := paramWithConfigs.Param c := paramWithConfigs.Config s := State - paramBufferSize := C.size_t(len(p.Content) / C.sizeof_float) + paramBufferSize := C.size_t(len(p.Content)) log.WithFields(log.Fields{ "ElementType": p.ElementType, "ParamSize": paramBufferSize, @@ -78,7 +80,11 @@ func (o *optimizer) UpdateParameter(g Gradient) error { return fmt.Errorf("Name: %s, parameter and gradient element type not match, parameter: %v, gradient: %v", g.Name, o.elementType, g.ElementType) } - r := C.paddle_update_parameter(o.opt, C.paddle_element_type(g.ElementType), unsafe.Pointer(&g.Content[0]), C.int(len(g.Content))/C.sizeof_float) + if o.contentLen != len(g.Content) { + return fmt.Errorf("Name: %s, parameter and gradient does not have same content len, parameter: %d, gradient: %d", g.Name, o.contentLen, len(g.Content)) + } + + r := C.paddle_update_parameter(o.opt, C.paddle_element_type(g.ElementType), unsafe.Pointer(&g.Content[0]), C.int(len(g.Content))) if r != 0 { return fmt.Errorf("optimizer update returned error code: %d", r) } diff --git a/go/pserver/service_test.go b/go/pserver/service_test.go index 9bf1a48a596f3e3e73a2e4df651855fd5f4e775f..a191f689fea9b5e64204c3ddfd12edf92f5ddb09 100644 --- a/go/pserver/service_test.go +++ b/go/pserver/service_test.go @@ -31,7 +31,7 @@ func TestServiceFull(t *testing.T) { err = s.InitParam(pserver.ParameterWithConfig{Param: p, Config: config}, nil) if err != nil { - t.FailNow() + t.Fatal(err) } var p1 pserver.Parameter @@ -40,40 +40,40 @@ func TestServiceFull(t *testing.T) { p1.ElementType = pserver.Float32 err = s.InitParam(pserver.ParameterWithConfig{Param: p1, Config: config}, nil) if err != nil { - t.FailNow() + t.Fatal(err) } err = s.FinishInitParams(0, nil) if err != nil { - t.FailNow() + t.Fatal(err) } var param pserver.Parameter err = s.GetParam("param_b", ¶m) if err != nil { - t.FailNow() + t.Fatal(err) } if !reflect.DeepEqual(param, p1) { - t.FailNow() + t.Fatal("not equal:", param, p1) } g1, g2 := 
pserver.Gradient(p1), pserver.Gradient(p) err = s.SendGrad(g1, nil) if err != nil { - t.FailNow() + t.Fatal(err) } err = s.SendGrad(g2, nil) if err != nil { - t.FailNow() + t.Fatal(err) } var param1 pserver.Parameter err = s.GetParam("param_a", ¶m1) if err != nil { - t.FailNow() + t.Fatal(err) } // don't compare content, since it's already changed by @@ -82,7 +82,7 @@ func TestServiceFull(t *testing.T) { p.Content = nil if !reflect.DeepEqual(param1, p) { - t.FailNow() + t.Fatal("not equal:", param1, p) } } @@ -90,16 +90,16 @@ func TestMultipleInit(t *testing.T) { var cp pserver.Checkpoint s, err := pserver.NewService(0, 1, "", nil, cp) if err != nil { - t.Error(err) + t.Fatal(err) } err = s.FinishInitParams(0, nil) if err != nil { - t.FailNow() + t.Fatal(err) } err = s.FinishInitParams(0, nil) if err.Error() != pserver.AlreadyInitialized { - t.FailNow() + t.Fatal(err) } } @@ -108,7 +108,7 @@ func TestUninitialized(t *testing.T) { s, err := pserver.NewService(0, 1, "", nil, cp) err = s.SendGrad(pserver.Gradient{}, nil) if err.Error() != pserver.Uninitialized { - t.FailNow() + t.Fatal(err) } } @@ -154,12 +154,12 @@ func TestBlockUntilInitialized(t *testing.T) { err = s.InitParam(pserver.ParameterWithConfig{Param: p, Config: config}, nil) if err != nil { - t.FailNow() + t.Fatal(err) } err = s.FinishInitParams(0, nil) if err != nil { - t.FailNow() + t.Fatal(err) } wg.Wait() diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt index 0b5e9a2599dbec6d40eb70609d61ebea2b7e1486..4b06966fba2bc9f92756be0cb8110bbcd5272423 100644 --- a/paddle/CMakeLists.txt +++ b/paddle/CMakeLists.txt @@ -14,6 +14,7 @@ if(Boost_FOUND) add_subdirectory(memory) add_subdirectory(platform) add_subdirectory(framework) + add_subdirectory(operators) add_subdirectory(pybind) endif() diff --git a/paddle/api/ConfigParser.cpp b/paddle/api/ConfigParser.cpp index 2f45173bfd401ddda26d61ab7fcfe131d079f710..b6ff6ec7890c0b79d52a2f0784743289c7bc213f 100644 --- a/paddle/api/ConfigParser.cpp +++ b/paddle/api/ConfigParser.cpp @@ -64,11 +64,7 @@ ModelConfig* TrainerConfig::getModelConfig() const { ParameterConfig::ParameterConfig() : m(new ParameterConfigPrivate()) {} -ParameterConfig::~ParameterConfig() { - if (m) { - delete m; - } -} +ParameterConfig::~ParameterConfig() { delete m; } ParameterConfig* ParameterConfig::createParameterConfigFromParameterSharedPtr( void* ptr) { @@ -98,11 +94,7 @@ void* ParameterConfig::getRawPtr() { return m->getConfigPtr(); } OptimizationConfig::OptimizationConfig() : m(new OptimizationConfigPrivate()) {} -OptimizationConfig::~OptimizationConfig() { - if (m) { - delete m; - } -} +OptimizationConfig::~OptimizationConfig() { delete m; } std::string OptimizationConfig::toProtoString() { return m->getConfig().SerializeAsString(); diff --git a/paddle/api/PaddleAPI.h b/paddle/api/PaddleAPI.h index 5fb3d1c73bc56e921f13aafd27c25224e259b3fe..0b9b83d42974151d49250bdf0e7c397f59bf6a62 100644 --- a/paddle/api/PaddleAPI.h +++ b/paddle/api/PaddleAPI.h @@ -843,7 +843,8 @@ public: bool useSparseUpdater); static ParameterUpdater* createNewRemoteUpdater( OptimizationConfig* config, - const std::string pserverSpec) throw(UnsupportError); + const std::string pserverSpec, + const bool useEtcd) throw(UnsupportError); ~ParameterUpdater(); /** diff --git a/paddle/api/ParameterOptimizer.cpp b/paddle/api/ParameterOptimizer.cpp index 21b851dd5e26c4752888067b20d0b1e16a4ab52d..120eea3f70125a57fb5ad685f2a11479bce12d0c 100644 --- a/paddle/api/ParameterOptimizer.cpp +++ b/paddle/api/ParameterOptimizer.cpp @@ -53,11 +53,7 @@ struct 
ParameterTraverseCallbackPrivate { ParameterOptimizer::ParameterOptimizer() : m(new ParameterOptimizerPrivate()) {} -ParameterOptimizer::~ParameterOptimizer() { - if (m) { - delete m; - } -} +ParameterOptimizer::~ParameterOptimizer() { delete m; } ParameterOptimizer* ParameterOptimizer::create(OptimizationConfig* config) { CHECK(config != nullptr); @@ -104,11 +100,7 @@ std::vector ParameterOptimizer::getParameterTypes() const { ParameterTraverseCallback::ParameterTraverseCallback() : m(new ParameterTraverseCallbackPrivate()) {} -ParameterTraverseCallback::~ParameterTraverseCallback() { - if (m) { - delete m; - } -} +ParameterTraverseCallback::~ParameterTraverseCallback() { delete m; } void ParameterTraverseCallback::apply(const std::vector& vecs, const ParameterConfig& conf, diff --git a/paddle/api/ParameterUpdater.cpp b/paddle/api/ParameterUpdater.cpp index 1aaefdfb8107a2eaa0432211fd7df4f5f12d537f..5934cb898b5f6adc74c237b1733a7459d8437a28 100644 --- a/paddle/api/ParameterUpdater.cpp +++ b/paddle/api/ParameterUpdater.cpp @@ -33,11 +33,12 @@ ParameterUpdater *ParameterUpdater::createLocalUpdater( ParameterUpdater *ParameterUpdater::createNewRemoteUpdater( OptimizationConfig *config, - const std::string pserverSpec) throw(UnsupportError) { + const std::string pserverSpec, + const bool useEtcd) throw(UnsupportError) { #ifndef PADDLE_WITHOUT_GOLANG auto updater = new ParameterUpdater(); updater->m->updater.reset(new paddle::NewRemoteParameterUpdater( - config->m->getConfig(), pserverSpec)); + config->m->getConfig(), pserverSpec, useEtcd)); return updater; #else throw UnsupportError(); diff --git a/paddle/api/Vector.cpp b/paddle/api/Vector.cpp index db8f005929d90f718fc1ad42c60b68108ff55005..500bc448c92630f4fc2f4df603c955e572d868ec 100644 --- a/paddle/api/Vector.cpp +++ b/paddle/api/Vector.cpp @@ -171,11 +171,7 @@ struct VectorPrivate { Vector::Vector() : m(new VectorPrivate()) {} -Vector::~Vector() { - if (m) { - delete m; - } -} +Vector::~Vector() { delete m; } Vector* Vector::createZero(size_t sz, bool useGpu) { auto retVec = new Vector(); diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index b8642ca22ab340cade5ded62b6e1b5d38680869d..e7d1c7203aa3422b3f5dab4a83da4e175219ba81 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -1,24 +1,29 @@ -# ddim lib +cc_library(enforce SRCS enforce.cc DEPS glog) +cc_test(enforce_test SRCS enforce_test.cc DEPS enforce) cc_library(ddim SRCS ddim.cc) cc_test(ddim_test SRCS ddim_test.cc DEPS ddim) nv_test(dim_test SRCS dim_test.cu DEPS ddim) -cc_test(tensor_test SRCS tensor_test.cc DEPS ddim) +cc_library(tensor SRCS tensor.cc DEPS ddim place enforce paddle_memory) +cc_test(tensor_test SRCS tensor_test.cc DEPS tensor) cc_test(variable_test SRCS variable_test.cc) cc_test(scope_test SRCS scope_test.cc) -cc_test(enforce_test SRCS enforce_test.cc) proto_library(attr_type SRCS attr_type.proto) proto_library(op_proto SRCS op_proto.proto DEPS attr_type) cc_test(op_proto_test SRCS op_proto_test.cc DEPS op_proto protobuf) proto_library(op_desc SRCS op_desc.proto DEPS attr_type) cc_test(op_desc_test SRCS op_desc_test.cc DEPS op_desc protobuf) -cc_library(operator SRCS operator.cc DEPS op_desc protobuf) -cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry place) -cc_library(op_registry SRCS op_registry.cc DEPS op_proto op_desc) + +cc_library(operator SRCS operator.cc DEPS op_desc device_context tensor) +cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry) + 
+cc_library(op_registry SRCS op_registry.cc DEPS op_proto op_desc enforce) cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry operator) + py_proto_compile(framework_py_proto SRCS attr_type.proto op_proto.proto op_desc.proto) # Generate an empty __init__.py to make framework_py_proto as a valid python module. add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) add_dependencies(framework_py_proto framework_py_proto_init) proto_library(net_proto SRCS net_proto.proto DEPS op_proto) -cc_library(net SRCS net.cc DEPS net_proto) +cc_library(net SRCS net.cc DEPS operator net_proto op_registry) +cc_test(net_op_test SRCS net_op_test.cc DEPS net) diff --git a/paddle/framework/ddim.cc b/paddle/framework/ddim.cc index 3f949a6595ea326b97ac567daf9b35a68c8cf7f8..b6ad8b60aaf7b5657e1f18defe3c8d2c2ebbc060 100644 --- a/paddle/framework/ddim.cc +++ b/paddle/framework/ddim.cc @@ -1,9 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + #include "paddle/framework/ddim.h" +#include "paddle/framework/enforce.h" namespace paddle { namespace framework { -///@cond HIDDEN +/// @cond HIDDEN template Dim make_dim(const int* d) { @@ -50,7 +65,7 @@ void make_ddim(DDim& ddim, const int* dims, int n) { } } -///@endcond +/// @endcond DDim make_ddim(std::initializer_list dims) { DDim result(make_dim(0)); @@ -64,11 +79,11 @@ DDim make_ddim(const std::vector& dims) { return result; } -///@cond HIDDEN +/// @cond HIDDEN // XXX For some reason, putting this in an anonymous namespace causes errors class DynamicMutableIndexer : public boost::static_visitor { public: - DynamicMutableIndexer(int idx) : idx_(idx) {} + explicit DynamicMutableIndexer(int idx) : idx_(idx) {} template int& operator()(Dim& dim) const { @@ -81,7 +96,7 @@ class DynamicMutableIndexer : public boost::static_visitor { class DynamicConstIndexer : public boost::static_visitor { public: - DynamicConstIndexer(int idx) : idx_(idx) {} + explicit DynamicConstIndexer(int idx) : idx_(idx) {} template int operator()(const Dim& dim) const { @@ -92,7 +107,7 @@ class DynamicConstIndexer : public boost::static_visitor { int idx_; }; -///@endcond +/// @endcond int& DDim::operator[](int idx) { return boost::apply_visitor(DynamicMutableIndexer(idx), var); @@ -102,6 +117,8 @@ int DDim::operator[](int idx) const { return boost::apply_visitor(DynamicConstIndexer(idx), var); } +ssize_t DDim::size() const { return arity(*this); } + bool DDim::operator==(DDim d) const { if (var.which() != d.getVar().which()) { return false; @@ -155,11 +172,11 @@ int get(const DDim& ddim, int idx) { return ddim[idx]; } void set(DDim& ddim, int idx, int value) { ddim[idx] = value; } -///@cond HIDDEN +/// @cond HIDDEN struct VectorizeVisitor : public boost::static_visitor<> { std::vector& vector; - VectorizeVisitor(std::vector& v) : vector(v) {} + explicit VectorizeVisitor(std::vector& v) : vector(v) {} template void operator()(const T& t) { @@ -169,7 +186,7 @@ struct VectorizeVisitor : public 
boost::static_visitor<> { void operator()(const Dim<1>& t) { vector.push_back(t.head); } }; -///@endcond +/// @endcond std::vector vectorize(const DDim& ddim) { std::vector result; @@ -178,16 +195,59 @@ std::vector vectorize(const DDim& ddim) { return result; } +struct ProductVisitor : public boost::static_visitor { + template + ssize_t operator()(const Dim& dim) { + return product(dim); + } +}; + ssize_t product(const DDim& ddim) { - ssize_t result = 1; - std::vector v = vectorize(ddim); - for (auto i : v) { - result *= i; + ProductVisitor visitor; + return boost::apply_visitor(visitor, ddim); +} + +struct SliceVectorizeVisitor : public boost::static_visitor<> { + std::vector& vector; + int begin; + int end; + + SliceVectorizeVisitor(std::vector& v, int b, int e) + : vector(v), begin(b), end(e) { + PADDLE_ENFORCE(begin < end, + "Begin index must be less than end index in ddim slice."); + PADDLE_ENFORCE(begin >= 0, + "Begin index can't be less than zero in ddim slice."); } - return result; + + template + void operator()(const Dim& dim) { + if (begin == 0) { + vector.push_back(dim.head); + } else { + --begin; + } + --end; + if (end > 0) { + this->operator()(dim.tail); + } + } + + void operator()(const Dim<1>& dim) { + PADDLE_ENFORCE(end == 1, "End index in ddim slice is out of bound."); + vector.push_back(dim.head); + } +}; + +DDim slice_ddim(const DDim& dim, int begin, int end) { + std::vector vec; + vec.reserve(end - begin); + SliceVectorizeVisitor visitor(vec, begin, end); + boost::apply_visitor(visitor, dim); + return make_ddim(vec); } -///\cond HIDDEN +/// \cond HIDDEN struct ArityVisitor : boost::static_visitor { template @@ -196,15 +256,15 @@ struct ArityVisitor : boost::static_visitor { } }; -///\endcond +/// \endcond int arity(const DDim& d) { return boost::apply_visitor(ArityVisitor(), d); } -///\cond HIDDEN +/// \cond HIDDEN struct DDimPrinter : boost::static_visitor { std::ostream& os; - DDimPrinter(std::ostream& os_) : os(os_) {} + explicit DDimPrinter(std::ostream& os_) : os(os_) {} template void operator()(const T& t) { @@ -212,7 +272,7 @@ struct DDimPrinter : boost::static_visitor { } }; -///\endcond +/// \endcond std::ostream& operator<<(std::ostream& os, const DDim& ddim) { DDimPrinter printer(os); diff --git a/paddle/framework/ddim.h b/paddle/framework/ddim.h index 223c4180bee45e21547364441476b27051daca56..7bc21a1e3455bcf8889c18084e8c96e7a3c76a13 100644 --- a/paddle/framework/ddim.h +++ b/paddle/framework/ddim.h @@ -27,7 +27,7 @@ struct DDim { DDim() : var(Dim<1>()) {} template - DDim(const Dim& in) : var(in) {} + explicit DDim(const Dim& in) : var(in) {} template DDim& operator=(const Dim& in) { @@ -50,6 +50,8 @@ struct DDim { DDimVar getVar() { return var; } + ssize_t size() const; + bool operator==(DDim d) const; bool operator!=(DDim d) const; @@ -81,6 +83,15 @@ std::vector vectorize(const DDim& ddim); ssize_t product(const DDim& ddim); +/** + * \brief Slice a ddim + * + * Slice dim with [begin, end). + * e.g. DDim d = make_ddim({1,2,3,4,5}); + * slice_ddim(d, 1, 3); ====> {2,3} + */ +DDim slice_ddim(const DDim& dim, int begin, int end); + /** * \brief What is the length of this dimension? 
* diff --git a/paddle/framework/ddim_test.cc b/paddle/framework/ddim_test.cc index 36eef02370e0196c2af2c05f49176b70ce69235a..9d18a2972ce62139430b240b4599854b14290a32 100644 --- a/paddle/framework/ddim_test.cc +++ b/paddle/framework/ddim_test.cc @@ -49,9 +49,30 @@ TEST(DDim, Equality) { // arity of a DDim EXPECT_EQ(paddle::framework::arity(ddim), 3); + EXPECT_EQ(ddim.size(), 3); // product of a DDim EXPECT_EQ(paddle::framework::product(vddim), 45); + EXPECT_EQ( + paddle::framework::product(paddle::framework::make_ddim({3, 2, 5, 3})), + 90); + + // slice a DDim + paddle::framework::DDim ddim2 = + paddle::framework::make_ddim({1, 2, 3, 4, 5, 6}); + paddle::framework::DDim ss = paddle::framework::slice_ddim(ddim2, 2, 5); + EXPECT_EQ(arity(ss), 3); + EXPECT_EQ(ss[0], 3); + EXPECT_EQ(ss[1], 4); + EXPECT_EQ(ss[2], 5); + paddle::framework::DDim ss2 = paddle::framework::slice_ddim(ddim2, 0, 6); + EXPECT_EQ(arity(ss2), 6); + EXPECT_EQ(ss2[0], 1); + EXPECT_EQ(ss2[1], 2); + EXPECT_EQ(ss2[2], 3); + EXPECT_EQ(ss2[3], 4); + EXPECT_EQ(ss2[4], 5); + EXPECT_EQ(ss2[5], 6); } TEST(DDim, Print) { diff --git a/paddle/framework/dim_test.cu b/paddle/framework/dim_test.cu index 05217415196f3ec3ce9b5de7cb2f82c9de960ba7..3898d0a447aa502813b3cb5e86c29eebb814ff84 100644 --- a/paddle/framework/dim_test.cu +++ b/paddle/framework/dim_test.cu @@ -1,100 +1,101 @@ #include #include -#include "paddle/framework/dim.h" #include "gtest/gtest.h" +#include "paddle/framework/dim.h" __global__ void test(paddle::framework::Dim<2>* o) { - o[0] = paddle::framework::make_dim(5, 6); + o[0] = paddle::framework::make_dim(5, 6); } __global__ void dyn_idx_gpu(int* o) { - auto d = paddle::framework::make_dim(5, 6); - o[0] = d[1]; + auto d = paddle::framework::make_dim(5, 6); + o[0] = d[1]; } TEST(Dim, Equality) { - // construct a Dim on the CPU - auto a = paddle::framework::make_dim(3, 4); - EXPECT_EQ(paddle::framework::get<0>(a), 3); - EXPECT_EQ(paddle::framework::get<1>(a), 4); - - // construct a Dim on the GPU - thrust::device_vector> t(2); - test<<<1,1>>>(thrust::raw_pointer_cast(t.data())); - a = t[0]; - EXPECT_EQ(paddle::framework::get<0>(a), 5); - EXPECT_EQ(paddle::framework::get<1>(a), 6); - - // linearization - auto b = paddle::framework::make_dim(7, 8); - EXPECT_EQ(paddle::framework::linearize(a, b), 83); - - // product - EXPECT_EQ(paddle::framework::product(a), 30); - - // mutate a Dim - paddle::framework::get<1>(b) = 10; - EXPECT_EQ(paddle::framework::get<0>(b), 7); - EXPECT_EQ(paddle::framework::get<1>(b), 10); - - // dynamic access - paddle::framework::get(b, 0) = 8; - b[1] = 11; - EXPECT_EQ(paddle::framework::get<0>(b), 8); - EXPECT_EQ(paddle::framework::get<1>(b), 11); - EXPECT_EQ(paddle::framework::get(b, 0), 8); - EXPECT_EQ(b[1], 11); - - // dynamic access on GPU - thrust::device_vector r(1); - dyn_idx_gpu<<<1,1>>>(thrust::raw_pointer_cast(r.data())); - int res = r[0]; - EXPECT_EQ(res, 6); - - // ex_prefix_mul - paddle::framework::Dim<3> c = paddle::framework::ex_prefix_mul(paddle::framework::Dim<3>(3, 4, 5)); - EXPECT_EQ(paddle::framework::get<0>(c), 1); - EXPECT_EQ(paddle::framework::get<1>(c), 3); - EXPECT_EQ(paddle::framework::get<2>(c), 12); - - // generate from an index - auto size = paddle::framework::make_dim(4, 5, 2); - c = paddle::framework::Dim<3>(14, size); - EXPECT_EQ(paddle::framework::get<0>(c), 2); - EXPECT_EQ(paddle::framework::get<1>(c), 3); - EXPECT_EQ(paddle::framework::get<2>(c), 0); - c = paddle::framework::Dim<3>(25, size); - EXPECT_EQ(paddle::framework::get<0>(c), 1); - 
EXPECT_EQ(paddle::framework::get<1>(c), 1); - EXPECT_EQ(paddle::framework::get<2>(c), 1); + // construct a Dim on the CPU + auto a = paddle::framework::make_dim(3, 4); + EXPECT_EQ(paddle::framework::get<0>(a), 3); + EXPECT_EQ(paddle::framework::get<1>(a), 4); + + // construct a Dim on the GPU + thrust::device_vector> t(2); + test<<<1, 1>>>(thrust::raw_pointer_cast(t.data())); + a = t[0]; + EXPECT_EQ(paddle::framework::get<0>(a), 5); + EXPECT_EQ(paddle::framework::get<1>(a), 6); + + // linearization + auto b = paddle::framework::make_dim(7, 8); + EXPECT_EQ(paddle::framework::linearize(a, b), 83); + + // product + EXPECT_EQ(paddle::framework::product(a), 30); + + // mutate a Dim + paddle::framework::get<1>(b) = 10; + EXPECT_EQ(paddle::framework::get<0>(b), 7); + EXPECT_EQ(paddle::framework::get<1>(b), 10); + + // dynamic access + paddle::framework::get(b, 0) = 8; + b[1] = 11; + EXPECT_EQ(paddle::framework::get<0>(b), 8); + EXPECT_EQ(paddle::framework::get<1>(b), 11); + EXPECT_EQ(paddle::framework::get(b, 0), 8); + EXPECT_EQ(b[1], 11); + + // dynamic access on GPU + thrust::device_vector r(1); + dyn_idx_gpu<<<1, 1>>>(thrust::raw_pointer_cast(r.data())); + int res = r[0]; + EXPECT_EQ(res, 6); + + // ex_prefix_mul + paddle::framework::Dim<3> c = + paddle::framework::ex_prefix_mul(paddle::framework::Dim<3>(3, 4, 5)); + EXPECT_EQ(paddle::framework::get<0>(c), 1); + EXPECT_EQ(paddle::framework::get<1>(c), 3); + EXPECT_EQ(paddle::framework::get<2>(c), 12); + + // generate from an index + auto size = paddle::framework::make_dim(4, 5, 2); + c = paddle::framework::Dim<3>(14, size); + EXPECT_EQ(paddle::framework::get<0>(c), 2); + EXPECT_EQ(paddle::framework::get<1>(c), 3); + EXPECT_EQ(paddle::framework::get<2>(c), 0); + c = paddle::framework::Dim<3>(25, size); + EXPECT_EQ(paddle::framework::get<0>(c), 1); + EXPECT_EQ(paddle::framework::get<1>(c), 1); + EXPECT_EQ(paddle::framework::get<2>(c), 1); } TEST(Dim, Bool) { - auto a = paddle::framework::make_dim(3, 4); - auto b = paddle::framework::make_dim(5, 6); - auto c = paddle::framework::make_dim(3, 4); - - // in_bounds check - EXPECT_TRUE(paddle::framework::contained(a, b)); - EXPECT_FALSE(paddle::framework::contained(b, a)); - - // comparison - EXPECT_TRUE(a == a); - EXPECT_FALSE(a == b); - EXPECT_TRUE(a == c); + auto a = paddle::framework::make_dim(3, 4); + auto b = paddle::framework::make_dim(5, 6); + auto c = paddle::framework::make_dim(3, 4); + + // in_bounds check + EXPECT_TRUE(paddle::framework::contained(a, b)); + EXPECT_FALSE(paddle::framework::contained(b, a)); + + // comparison + EXPECT_TRUE(a == a); + EXPECT_FALSE(a == b); + EXPECT_TRUE(a == c); } TEST(Dim, Print) { - { - std::stringstream ss; - auto a = paddle::framework::make_dim(2, 3); - ss << a; - EXPECT_EQ(ss.str(), "2, 3"); - } - { - std::stringstream ss; - ss << paddle::framework::make_dim(8); - EXPECT_EQ(ss.str(), "8"); - } + { + std::stringstream ss; + auto a = paddle::framework::make_dim(2, 3); + ss << a; + EXPECT_EQ(ss.str(), "2, 3"); + } + { + std::stringstream ss; + ss << paddle::framework::make_dim(8); + EXPECT_EQ(ss.str(), "8"); + } } diff --git a/paddle/framework/enforce.cc b/paddle/framework/enforce.cc new file mode 100644 index 0000000000000000000000000000000000000000..644930ff989bb8935f37642c117084f580379bd7 --- /dev/null +++ b/paddle/framework/enforce.cc @@ -0,0 +1,15 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/framework/enforce.h" diff --git a/paddle/framework/enforce.h b/paddle/framework/enforce.h index 56cb7f95647e81efef58b156002d0d378ee22820..ffce8148e9516a5720757c87685ff6bd2937977c 100644 --- a/paddle/framework/enforce.h +++ b/paddle/framework/enforce.h @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include #include #include #include @@ -58,12 +59,17 @@ class EnforceNotMet : public std::exception { /** * @brief Enforce a condition, otherwise throw an EnforceNotMet */ +#ifdef NDEBUG #define PADDLE_ENFORCE(condition, ...) \ do { \ if (UNLIKELY(!(condition))) { \ PADDLE_THROW(__VA_ARGS__); \ } \ } while (0) +#else +#define PADDLE_ENFORCE(condition, ...) \ + CHECK(condition) << ::paddle::string::Sprintf(__VA_ARGS__); +#endif } // namespace framework } // namespace paddle diff --git a/paddle/framework/net.cc b/paddle/framework/net.cc index 73b3051235ee90b31bd65acb22f454fc13d64da9..7311cda9a9ad282b21711d8eb0b9ba1cf9542296 100644 --- a/paddle/framework/net.cc +++ b/paddle/framework/net.cc @@ -1,20 +1,59 @@ +/* + Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + #include "paddle/framework/net.h" namespace paddle { namespace framework { -PlainNet::PlainNet(const NetDesc& def) {} - -void PlainNet::InferShape(Scope* scope) { +void PlainNet::CompleteAddOp() { + std::unordered_set input_set; + std::unordered_set output_set; + std::unordered_set temp_output; for (auto& op : ops_) { - op.InferShape(); + for (auto& ipt : op->inputs_) { + if (!Contains(output_set, ipt)) { // Not other op's output + input_set.insert(ipt); + } else { + temp_output.insert(ipt); + } + } + + for (auto& opt : op->outputs_) { + output_set.insert(opt); + } } -} + inputs_.reserve(input_set.size()); + std::copy(input_set.begin(), input_set.end(), std::back_inserter(inputs_)); -void PlainNet::Run(std::shared_ptr scope, DeviceContext* ctx) { - for (auto& op : ops_) { - op.Run(ctx); + outputs_.reserve(output_set.size()); + std::vector tmp_index; + tmp_index.reserve(temp_output.size()); + int idx = 0; + for (auto& opt : output_set) { + if (Contains(temp_output, opt)) { + tmp_index.push_back(idx); + } + outputs_.push_back(opt); + ++idx; } + + attrs_["temporary_index"] = tmp_index; + add_op_done_ = true; } + } // namespace framework } // namespace paddle diff --git a/paddle/framework/net.h b/paddle/framework/net.h index 76992e07282904fd1074bb0ced2367a8d20e3ec2..19a1620e29b86fbccfc112a5f85a1784a197dd0b 100644 --- a/paddle/framework/net.h +++ b/paddle/framework/net.h @@ -1,99 +1,51 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #pragma once +#include +#include #include "paddle/framework/net_proto.pb.h" #include "paddle/framework/op_proto.pb.h" +#include "paddle/framework/op_registry.h" #include "paddle/framework/scope.h" #include "paddle/platform/device_context.h" namespace paddle { namespace framework { -using namespace paddle::platform; - -// operator's index stored in a network. -typedef int OpIndex; -/** - * NOTE following codes are some definitions of unimplemented concepts. - * We write some basic implementation to make Net compilable. These APIs will - * keep updating if the concepts related are implemented. - */ - -struct OpDesc; -struct OpAttrs {}; - -class Operator { - public: - Operator(const OpDesc &def) {} - void InferShape() {} - void Run(DeviceContext *ctx) {} -}; - /** - * @brief Network that manage the operators it has. + * @brief Network is also a type of Operator + * + * It will manage the operators it has. 
* - * Network is the container and controller of a set of operators, user can build - * a real network from a NetDesc which is a protobuf message and use - * Network.Run() * to run all the operators in the network. + * Network is the container and controller of a set of operators. * A network object knows all Operators belonging to this network. Variables, * which are inputs and outputs of these operators, are created and managed by a * hierarchy of Scope objects. * - * This is the base class of network, all the networks should implement the apis + * This is the base class of network, all the networks should implement the APIs * it defines. */ -class Net { +class Net : public OperatorBase { public: - /** - * @brief Infer shapes of all inputs and outputs of operators. - */ - virtual void InferShape(Scope *scope) = 0; - /** - * @brief Run the network. - * - * Run all the operators and return success(true) or not, with all the - * variables are located in `scope`. `context` describes the detail execution - * environment for ops. `begin` and `end` specify the scope of `ops_` to run, - * If no positive indexes are provided, all operators in `ops_` will run. - */ - virtual void Run(std::shared_ptr scope, DeviceContext *ctx) = 0; - - /** - * @brief Add an Operator according to `def`. - */ - virtual OpIndex AddOp(const OpProto &def) = 0; - - /** - * @brief Add optimizer operators acctording to `attrs`. - */ - virtual void AddOptimizerOps(const OpAttrs &attrs) = 0; - - /** - * @brief Add backward operators. - */ - virtual void AddBackwardOps() = 0; - - /** - * @brief Create a network. - */ - static std::unique_ptr Create(const NetDesc &def = NetDesc()); - - virtual ~Net() {} + virtual void AddOp(const OperatorPtr& op) = 0; + virtual void CompleteAddOp() = 0; }; +using NetPtr = std::shared_ptr; + /** * @brief a basic implementation of Net. * @@ -103,18 +55,14 @@ class Net { class PlainNet : public Net { public: /** - * @brief Initialize a PlainNet. - * - * Initialize from a network describe by `def`. NetDesc is the definition of - * a network. - */ - PlainNet(const NetDesc &def); - - /** - * Infer all the operators' input and output varialbes' shapes, will be called + * Infer all the operators' input and output variables' shapes, will be called * before every mini-batch */ - virtual void InferShape(Scope *scope) override; + void InferShape(const ScopePtr& scope) const override { + for (auto& op : ops_) { + op->InferShape(scope); + } + } /** * @brief Run the network. @@ -123,48 +71,32 @@ class PlainNet : public Net { * scope will be used instead. If no OpContext is provicded, default context * will be used. */ - virtual void Run(std::shared_ptr scope, DeviceContext *ctx) override; + void Run(const ScopePtr& scope, + const platform::DeviceContext& dev_ctx) const override { + for (auto& op : ops_) { + op->Run(scope, dev_ctx); + } + } /** - * @brief Add an operator to this network. + * @brief Add an operator by ptr */ - virtual OpIndex AddOp(const OpProto &def) override; + void AddOp(const OperatorPtr& op) override { + PADDLE_ENFORCE(!add_op_done_, "Cannot AddOp when this network is sealed"); + ops_.push_back(op); + } - /** - * @brief Add all optimizer operators related into the network. - */ - virtual void AddOptimizerOps(const OpAttrs &attrs) override; + void CompleteAddOp() override; - /** - * @brief Add all backward operators related into the network. - */ - virtual void AddBackwardOps() override; - - virtual ~PlainNet() override {} - - protected: - /** - * @brief Build the network. 
- * - * Create operators accordding to `def`, will be called by the constructor. - */ - void BuildNet(const NetDesc &def); - - /** - * @brief Add an operator into this network. - * - * Add a operator which is identified as `type` and has attributes described - * in `attrs`, the `inputs` are the keys of readonly input variables, - * `outputs` are keys of mutable output variables. An `OpIndex` will be - * returned to indicate the offset of the new operator in `ops_`. - */ - OpIndex AddOp(const std::string &type, const std::vector &inputs, - const std::vector &outputs, - const OpAttrs &attrs = OpAttrs()); + std::vector ops_; private: - // the operators owned by `Network`. - std::vector ops_; + bool add_op_done_{false}; + + template + static bool Contains(T container, KeyType key) { + return container.find(key) != container.end(); + } }; } // namespace framework diff --git a/paddle/framework/net_op_test.cc b/paddle/framework/net_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..f5e1c22400a73c3aa09839ef9654f87def99bc77 --- /dev/null +++ b/paddle/framework/net_op_test.cc @@ -0,0 +1,67 @@ +#include +#include +#include +#include + +namespace pd = paddle::framework; + +static int infer_shape_cnt = 0; +static int run_cnt = 0; + +class TestOp : public pd::OperatorBase { + public: + void InferShape(const paddle::framework::ScopePtr& scope) const override { + ++infer_shape_cnt; + } + void Run(const paddle::framework::ScopePtr& scope, + const paddle::platform::DeviceContext& dev_ctx) const override { + ++run_cnt; + } +}; + +template +void AssertSameVectorWithoutOrder(const std::vector& expected, + const std::vector& actual) { + ASSERT_EQ(expected.size(), actual.size()); + std::unordered_set expected_set; + for (auto& tmp : expected) { + expected_set.insert(tmp); + } + for (auto& act : actual) { + ASSERT_NE(expected_set.end(), expected_set.find(act)); + } +} + +TEST(OpKernel, all) { + auto net = std::make_shared(); + ASSERT_NE(net, nullptr); + + auto op1 = std::make_shared(); + op1->inputs_ = {"x", "w1", "b1"}; + op1->outputs_ = {"y"}; + net->AddOp(op1); + + auto op2 = std::make_shared(); + op2->inputs_ = {"y", "w2", "b2"}; + op2->outputs_ = {"z"}; + net->AddOp(op2); + + net->CompleteAddOp(); + AssertSameVectorWithoutOrder({"x", "w1", "b1", "w2", "b2"}, net->inputs_); + AssertSameVectorWithoutOrder({"y", "z"}, net->outputs_); + auto tmp_idx_iter = net->attrs_.find("temporary_index"); + ASSERT_NE(net->attrs_.end(), tmp_idx_iter); + auto& tmp_idx = boost::get>(tmp_idx_iter->second); + ASSERT_EQ(1UL, tmp_idx.size()); + ASSERT_EQ("y", net->outputs_[tmp_idx[0]]); + + auto scope = std::make_shared(); + paddle::platform::CPUDeviceContext dev_ctx; + + net->InferShape(scope); + net->Run(scope, dev_ctx); + ASSERT_EQ(2, infer_shape_cnt); + ASSERT_EQ(2, run_cnt); + + ASSERT_THROW(net->AddOp(op2), paddle::framework::EnforceNotMet); +} diff --git a/paddle/framework/op_proto.proto b/paddle/framework/op_proto.proto index 22df6f9c6b70277ddbf31e0432401889e3aa7483..596b8588e783722362815f75db876931f83484ec 100644 --- a/paddle/framework/op_proto.proto +++ b/paddle/framework/op_proto.proto @@ -34,6 +34,11 @@ message AttrProto { // Supported attribute comments. It helps 3rd-party language generate doc-string. required string comment = 3; + + // If that attribute is generated, it means the Paddle third language + // binding has responsibility to fill that attribute. End-User should + // not set that attribute. 
+ optional bool generated = 4 [default=false]; } // Input or output message for 3rd-party language binding. @@ -45,6 +50,40 @@ message VarProto { // The comment for that input. It helps 3rd-party language generate doc-string. required string comment = 2; + + // Is that input/output could be a list or not. + // If so, that Op should write a attributed named `input_format` or + // `output_format`. + // + // e.g. + // If the op is a fc op, the inputs are `X`, `W`, `b`. The `X` and `W` + // could be multiple, so the multiple of `X` and `W` is True, and OpDesc + // will hold a attribute of them. + // + // The Op desc of same fc could be + // { + // "type": "fc", + // "input": ["X1", "X2", "W1", "W2", "b"], + // "output": "fc.out", + // "attrs" : { + // "input_format": [0, 2, 4, 5] + // } + // } + // + optional bool multiple = 3 [default=false]; + + // It marks that output is a temporary output. That output is not used by + // user, but used by other op internally as input. If other op is not use + // that output, it could be optimized early. + // + // Attribute temporary_index will be set in OpDesc if there is some + // outputs are temporary. + // + // output = [ "xxx.out1", "xxx.tmp", "xxx.out2"], + // attrs = { + // "temporary_index": [1] + // } + optional bool temporary = 4 [default=false]; } // Op protocol message for 3rd-party language binding. diff --git a/paddle/framework/op_registry.cc b/paddle/framework/op_registry.cc index 4b35e04e681b414c36cf6d9aee9e64dd68ba5da9..1d14535c50b542733663a6900a8b5f2033290ea6 100644 --- a/paddle/framework/op_registry.cc +++ b/paddle/framework/op_registry.cc @@ -1,3 +1,17 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + #include namespace paddle { @@ -33,4 +47,4 @@ void AttrTypeHelper::SetAttrType>(AttrProto* attr) { attr->set_type(paddle::framework::AttrType::STRINGS); } } // namespace framework -} // namespace paddle \ No newline at end of file +} // namespace paddle diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h index 248c7a1a3b866ae3bf2af33d0ff67b92d0f9c456..7aa59f0b630d5a99fa15c00c9e32a22dd59b9a70 100644 --- a/paddle/framework/op_registry.h +++ b/paddle/framework/op_registry.h @@ -1,6 +1,10 @@ #pragma once #include +#include +#include +#include +#include #include "paddle/framework/attr_checker.h" #include "paddle/framework/op_desc.pb.h" #include "paddle/framework/op_proto.pb.h" @@ -58,37 +62,138 @@ class OpProtoAndCheckerMaker { OpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker) : proto_(proto), op_checker_(op_checker) {} + ~OpProtoAndCheckerMaker() { + PADDLE_ENFORCE(validated_, "should call Validate after build"); + } + + void Validate() { + validated_ = true; + CheckNoDuplicatedInOutAttrs(); + } + protected: - void AddInput(const std::string& name, const std::string& comment) { + void AddInput(const std::string& name, const std::string& comment, + bool multiple = false) { auto input = proto_->mutable_inputs()->Add(); *input->mutable_name() = name; *input->mutable_comment() = comment; + input->set_multiple(multiple); + if (multiple) { + SetHasMultipleInput(); + } + } + + void AddInputs(const std::string& name, const std::string& comment) { + AddInput(name, comment, true); } - void AddOutput(const std::string& name, const std::string& comment) { + void AddOutput(const std::string& name, const std::string& comment, + bool temporary = false, bool multiple = false) { auto output = proto_->mutable_outputs()->Add(); *output->mutable_name() = name; *output->mutable_comment() = comment; + output->set_multiple(multiple); + if (multiple) { + SetHasMultipleOutput(); + } + output->set_temporary(temporary); + if (temporary) { + SetHasTemporaryOutput(); + } + } + + void AddOutputs(const std::string& name, const std::string& comment, + bool temporary = false) { + AddOutput(name, comment, temporary, true); } template TypedAttrChecker& AddAttr(const std::string& name, - const std::string& comment) { + const std::string& comment, + bool generated = false) { auto attr = proto_->mutable_attrs()->Add(); *attr->mutable_name() = name; *attr->mutable_comment() = comment; + attr->set_generated(generated); AttrTypeHelper::SetAttrType(attr); return op_checker_->AddAttrChecker(name); } - void AddType(const std::string& op_type) { proto_->set_type(op_type); } - void AddComment(const std::string& comment) { *(proto_->mutable_comment()) = comment; } + private: + void SetHasMultiple(const std::string& in_out, bool* flag) { + if (!*flag) { + AddAttr>(in_out + "_format", + "The multiple index of " + in_out + + "\n" + R"DOC( +This attribute is used by Paddle core framework. Paddle's Op support each input +or output could be a list of variable. This attribute is used to show how that +list organized. + +e.g. + input = ["a", "b", "c", "d", "e", "f"] + input_format = [0, 4, 5, 6] + +means + The number of all input variables this op is six, and they are segmented into + three inputs. + + The first input is input[0:4], second is input[4:5], third is input[5:6]. 
+)DOC", + /*generated*/ true); + *flag = true; + } + } + + void SetHasMultipleInput() { SetHasMultiple("input", &has_multiple_input_); } + void SetHasMultipleOutput() { + SetHasMultiple("output", &has_multiple_output_); + } + + void SetHasTemporaryOutput() { + if (!has_temporary_output_) { + AddAttr>("temporary_index", + R"DOC(The temporary index of output. + +Not all output of Paddle Op is used by user. For faster computation, each op +could output some its internal state to other op, other op could take that +output to make compute faster. + +Add a mark to which output is temporary is helpful for future optimization. +)DOC", + /*generated*/ true) + .SetDefault(std::vector()); + has_temporary_output_ = true; + } + } + + void CheckNoDuplicatedInOutAttrs() { + std::unordered_set names; + auto checker = [&](const std::string& name) { + PADDLE_ENFORCE(!names.count(name), "[%s] is duplicated", name); + names.insert(name); + }; + for (auto& attr : proto_->attrs()) { + checker(attr.name()); + } + for (auto& input : proto_->inputs()) { + checker(input.name()); + } + for (auto& output : proto_->outputs()) { + checker(output.name()); + } + } + OpProto* proto_; OpAttrChecker* op_checker_; + bool validated_{false}; + bool has_multiple_input_{false}; + bool has_multiple_output_{false}; + bool has_temporary_output_{false}; }; class OpRegistry { @@ -100,32 +205,52 @@ class OpRegistry { creators()[op_type] = [] { return new OpType; }; OpProto& op_proto = protos()[op_type]; OpAttrChecker& op_checker = op_checkers()[op_type]; - ProtoMakerType(&op_proto, &op_checker); - PADDLE_ENFORCE(op_proto.IsInitialized(), - "Fail to initialize %s's OpProto !", op_type); + auto maker = ProtoMakerType(&op_proto, &op_checker); + maker.Validate(); + *op_proto.mutable_type() = op_type; + PADDLE_ENFORCE( + op_proto.IsInitialized(), + "Fail to initialize %s's OpProto, because %s is not initialized", + op_type, op_proto.InitializationErrorString()); } - static OperatorBase* CreateOp(const OpDesc& op_desc) { + static OperatorPtr CreateOp(const OpDesc& op_desc) { + //! Create a OpPtr by type. std::string op_type = op_desc.type(); - OperatorBase* op = creators().at(op_type)(); - op->desc_ = op_desc; + OperatorPtr op(creators().at(op_type)()); + //! Fill op's data member. Not use constructor because it will be noising + //! for Op developer. + const OpProto& op_proto = protos().at(op_type); + op->type_ = op_desc.type(); + // set op's inputs_ from desc. op->inputs_.reserve((size_t)op_desc.inputs_size()); std::copy(op_desc.inputs().begin(), op_desc.inputs().end(), std::back_inserter(op->inputs_)); + // set op's outputs_ from desc. op->outputs_.reserve((size_t)op_desc.outputs_size()); std::copy(op_desc.outputs().begin(), op_desc.outputs().end(), std::back_inserter(op->outputs_)); + + //! Fill attrs, and validate attrs. for (auto& attr : op_desc.attrs()) { op->attrs_[attr.name()] = AttrTypeHelper::GetAttrValue(attr); } op_checkers().at(op_type).Check(op->attrs_); + + //! Convert Temporary variable name to an unique variable name. + GenerateTempVariableName(op.get()); + + // set argument offsets stored in op. + CreateInOutOffsetMap(op, op_proto); + //! Other op's custom Init for a complex Op. For simple Op, the Init + //! method do nothing. + op->Init(); return op; } - private: - static std::unordered_map& creators() { - static std::unordered_map creators_; - return creators_; + // init op.in_out_idxs_ to accelerate argument's offset lookup. 
+ static void CreateInOutOffsetMap(OperatorPtr op, const OpProto& proto) { + op->CreateInOutOffsetMap(proto); } static std::unordered_map& protos() { @@ -133,6 +258,23 @@ class OpRegistry { return protos_; }; + private: + static void GenerateTempVariableName(OperatorBase* op) { + static std::atomic gUniqId(0UL); + for (auto& outname : op->outputs_) { + if (outname == OperatorBase::TMP_VAR_NAME()) { + outname += op->type_; + outname += "@"; + outname += std::to_string(gUniqId.fetch_add(1)); + } + } + } + + static std::unordered_map& creators() { + static std::unordered_map creators_; + return creators_; + } + static std::unordered_map& op_checkers() { static std::unordered_map op_checkers_; return op_checkers_; @@ -142,18 +284,87 @@ class OpRegistry { template class OpRegisterHelper { public: - OpRegisterHelper(std::string op_type) { + OpRegisterHelper(const char* op_type) { OpRegistry::RegisterOp(op_type); } }; -#define REGISTER_OP(type, op_class, op_maker_class) \ - class op_class##Register { \ - private: \ - const static OpRegisterHelper reg; \ - }; \ - const OpRegisterHelper op_class##Register::reg( \ - #type) +/** + * check if MACRO is used in GLOBAL NAMESPACE. + */ +#define STATIC_ASSERT_GLOBAL_NAMESPACE(uniq_name, msg) \ + struct __test_global_namespace_##uniq_name##__ {}; \ + static_assert(std::is_same<::__test_global_namespace_##uniq_name##__, \ + __test_global_namespace_##uniq_name##__>::value, \ + msg) + +/** + * Macro to Register Operator. + */ +#define REGISTER_OP(__op_type, __op_class, __op_maker_class) \ + STATIC_ASSERT_GLOBAL_NAMESPACE(__reg_op__##__op_type, \ + "REGISTER_OP must be in global namespace"); \ + static ::paddle::framework::OpRegisterHelper<__op_class, __op_maker_class> \ + __op_register_##__op_type##__(#__op_type); \ + int __op_register_##__op_type##_handle__() { return 0; } + +/** + * Macro to Register OperatorKernel. + */ +#define REGISTER_OP_KERNEL(type, DEVICE_TYPE, PlaceType, KernelType) \ + STATIC_ASSERT_GLOBAL_NAMESPACE( \ + __reg_op_kernel_##type##_##DEVICE_TYPE##__, \ + "REGISTER_OP_KERNEL must be in global namespace"); \ + struct __op_kernel_register__##type##__ { \ + __op_kernel_register__##type##__() { \ + ::paddle::framework::OperatorWithKernel::OpKernelKey key; \ + key.place_ = PlaceType(); \ + ::paddle::framework::OperatorWithKernel::AllOpKernels()[#type][key] \ + .reset(new KernelType()); \ + } \ + }; \ + static __op_kernel_register__##type##__ __reg_kernel_##type##__; \ + int __op_kernel_register_##type##_handle_##DEVICE_TYPE##__() { return 0; } + +#define REGISTER_OP_GPU_KERNEL(type, KernelType) \ + REGISTER_OP_KERNEL(type, GPU, ::paddle::platform::GPUPlace, KernelType) + +#define REGISTER_OP_CPU_KERNEL(type, KernelType) \ + REGISTER_OP_KERNEL(type, CPU, ::paddle::platform::CPUPlace, KernelType) + +/** + * Macro to mark what Operator and Kernel we will use and tell the compiler to + * link them into target. 
+ */ +#define USE_OP_WITHOUT_KERNEL(op_type) \ + STATIC_ASSERT_GLOBAL_NAMESPACE( \ + __use_op_without_kernel_##op_type, \ + "USE_OP_WITHOUT_KERNEL must be in global namespace"); \ + extern int __op_register_##op_type##_handle__(); \ + static int __use_op_ptr_##op_type##_without_kernel__ \ + __attribute__((unused)) = __op_register_##op_type##_handle__() + +#define USE_OP_KERNEL(op_type, DEVICE_TYPE) \ + STATIC_ASSERT_GLOBAL_NAMESPACE( \ + __use_op_kernel_##op_type##_##DEVICE_TYPE##__, \ + "USE_OP_KERNEL must be in global namespace"); \ + extern int __op_kernel_register_##op_type##_handle_##DEVICE_TYPE##__(); \ + static int __use_op_ptr_##op_type##_##DEVICE_TYPE##_kernel__ \ + __attribute__((unused)) = \ + __op_kernel_register_##op_type##_handle_##DEVICE_TYPE##__() + +// use Operator with only cpu kernel. +#define USE_OP_CPU(op_type) \ + USE_OP_WITHOUT_KERNEL(op_type); \ + USE_OP_KERNEL(op_type, CPU) + +#ifdef PADDLE_ONLY_CPU +#define USE_OP(op_type) USE_OP_CPU(op_type) +#else +#define USE_OP(op_type) \ + USE_OP_CPU(op_type); \ + USE_OP_KERNEL(op_type, GPU) +#endif } // namespace framework } // namespace paddle diff --git a/paddle/framework/op_registry_test.cc b/paddle/framework/op_registry_test.cc index f5162fb870a91e566a0d2b1419050fe0799b199b..d3a51a361aa56b26b87d79057f6700bd87264ca4 100644 --- a/paddle/framework/op_registry_test.cc +++ b/paddle/framework/op_registry_test.cc @@ -1,15 +1,15 @@ #include "paddle/framework/op_registry.h" #include -using namespace paddle::framework; +namespace pd = paddle::framework; namespace paddle { namespace framework { class CosineOp : public OperatorBase { public: - void Run(const std::shared_ptr& scope, + void Run(const ScopePtr& scope, const platform::DeviceContext& dev_ctx) const override {} - void InferShape(const std::shared_ptr& scope) const override {} + void InferShape(const ScopePtr& scope) const override {} }; class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker { @@ -21,42 +21,40 @@ class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker { AddAttr("scale", "scale of cosine op") .SetDefault(1.0) .LargerThan(0.0); - AddType("cos"); AddComment("This is cos op"); } }; -REGISTER_OP(cos_sim, CosineOp, CosineOpProtoAndCheckerMaker); - class MyTestOp : public OperatorBase { public: - void InferShape(const std::shared_ptr& scope) const override {} - void Run(const std::shared_ptr& scope, + void InferShape(const ScopePtr& scope) const override {} + void Run(const ScopePtr& scope, const platform::DeviceContext& dev_ctx) const override {} - - public: }; class MyTestOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker { public: MyTestOpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("input", "input of cosine op"); - AddOutput("output", "output of cosine op"); + AddInputs("input", "input of cosine op"); + AddOutput("output", "output of cosine op", + /*temporary*/ true); auto my_checker = [](int i) { PADDLE_ENFORCE(i % 2 == 0, "'test_attr' must be even!"); }; AddAttr("test_attr", "a simple test attribute") .AddCustomChecker(my_checker); - AddType("my_test_op"); AddComment("This is my_test op"); } }; - -REGISTER_OP(my_test_op, MyTestOp, MyTestOpProtoAndCheckerMaker); } // namespace framework } // namespace paddle +REGISTER_OP(cos_sim, paddle::framework::CosineOp, + paddle::framework::CosineOpProtoAndCheckerMaker); +REGISTER_OP(my_test_op, paddle::framework::MyTestOp, + paddle::framework::MyTestOpProtoAndCheckerMaker); + TEST(OpRegistry, CreateOp) { 
paddle::framework::OpDesc op_desc; op_desc.set_type("cos_sim"); @@ -69,9 +67,9 @@ TEST(OpRegistry, CreateOp) { attr->set_type(paddle::framework::AttrType::FLOAT); attr->set_f(scale); - paddle::framework::OperatorBase* op = + paddle::framework::OperatorPtr op = paddle::framework::OpRegistry::CreateOp(op_desc); - auto scope = std::make_shared(); + auto scope = std::make_shared(); paddle::platform::CPUDeviceContext dev_ctx; op->Run(scope, dev_ctx); float scale_get = op->GetAttr("scale"); @@ -91,7 +89,7 @@ TEST(OpRegistry, IllegalAttr) { bool caught = false; try { - paddle::framework::OperatorBase* op __attribute__((unused)) = + paddle::framework::OperatorPtr op __attribute__((unused)) = paddle::framework::OpRegistry::CreateOp(op_desc); } catch (paddle::framework::EnforceNotMet err) { caught = true; @@ -112,24 +110,33 @@ TEST(OpRegistry, DefaultValue) { ASSERT_TRUE(op_desc.IsInitialized()); - paddle::framework::OperatorBase* op = + paddle::framework::OperatorPtr op = paddle::framework::OpRegistry::CreateOp(op_desc); - auto scope = std::make_shared(); + auto scope = std::make_shared(); paddle::platform::CPUDeviceContext dev_ctx; op->Run(scope, dev_ctx); ASSERT_EQ(op->GetAttr("scale"), 1.0); } +static void SetInputFormat(paddle::framework::OpDesc* desc) { + auto attr = desc->add_attrs(); + attr->set_name("input_format"); + attr->set_type(paddle::framework::INTS); + attr->mutable_ints()->Add(0); + attr->mutable_ints()->Add(1); +} + TEST(OpRegistry, CustomChecker) { paddle::framework::OpDesc op_desc; op_desc.set_type("my_test_op"); op_desc.add_inputs("ii"); op_desc.add_outputs("oo"); + SetInputFormat(&op_desc); // attr 'test_attr' is not set bool caught = false; try { - paddle::framework::OperatorBase* op __attribute__((unused)) = + paddle::framework::OperatorPtr op __attribute__((unused)) = paddle::framework::OpRegistry::CreateOp(op_desc); } catch (paddle::framework::EnforceNotMet err) { caught = true; @@ -148,7 +155,7 @@ TEST(OpRegistry, CustomChecker) { attr->set_i(3); caught = false; try { - paddle::framework::OperatorBase* op __attribute__((unused)) = + paddle::framework::OperatorPtr op __attribute__((unused)) = paddle::framework::OpRegistry::CreateOp(op_desc); } catch (paddle::framework::EnforceNotMet err) { caught = true; @@ -166,16 +173,44 @@ TEST(OpRegistry, CustomChecker) { attr->set_name("test_attr"); attr->set_type(paddle::framework::AttrType::INT); attr->set_i(4); - paddle::framework::OperatorBase* op = + SetInputFormat(&op_desc); + paddle::framework::OperatorPtr op = paddle::framework::OpRegistry::CreateOp(op_desc); paddle::platform::CPUDeviceContext dev_ctx; - auto scope = std::make_shared(); + auto scope = std::make_shared(); op->Run(scope, dev_ctx); int test_attr = op->GetAttr("test_attr"); ASSERT_EQ(test_attr, 4); } -int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} \ No newline at end of file +class TestAttrProtoMaker : public pd::OpProtoAndCheckerMaker { + public: + TestAttrProtoMaker(pd::OpProto* proto, pd::OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddAttr("scale", "scale of test op"); + AddAttr("scale", "scale of test op"); + } +}; + +TEST(ProtoMaker, DuplicatedAttr) { + pd::OpProto op_proto; + pd::OpAttrChecker op_checker; + auto proto_maker = TestAttrProtoMaker(&op_proto, &op_checker); + ASSERT_THROW(proto_maker.Validate(), paddle::framework::EnforceNotMet); +} + +class TestInOutProtoMaker : public pd::OpProtoAndCheckerMaker { + public: + TestInOutProtoMaker(pd::OpProto* proto, 
pd::OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("input", "input of test op"); + AddInput("input", "input of test op"); + } +}; + +TEST(ProtoMaker, DuplicatedInOut) { + pd::OpProto op_proto; + pd::OpAttrChecker op_checker; + auto proto_maker = TestInOutProtoMaker(&op_proto, &op_checker); + ASSERT_THROW(proto_maker.Validate(), paddle::framework::EnforceNotMet); +} diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc index 8f7adff8b3982e91a3d7f6d598cd62d5005d5f17..50cb2d936274dcc046d5641ff276aae77358d1bf 100644 --- a/paddle/framework/operator.cc +++ b/paddle/framework/operator.cc @@ -12,32 +12,88 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include + #include "paddle/framework/operator.h" namespace paddle { namespace framework { +void OperatorBase::CreateInOutOffsetMap(const OpProto& proto) { + PADDLE_ENFORCE(in_out_idxs_.empty(), "duplicate call CreateInOutOffsetMap"); + for (int i = 0; i < proto.inputs_size(); i++) { + const auto& name = proto.inputs()[i].name(); + in_out_idxs_[name] = i; + } + for (int i = 0; i < proto.outputs_size(); i++) { + const auto& name = proto.outputs()[i].name(); + in_out_idxs_[name] = i; + } +} + +const std::string& OperatorBase::Input(const std::string& name) const { + auto it = in_out_idxs_.find(name); + PADDLE_ENFORCE(it != in_out_idxs_.end(), "no key [%s] in in_out_idxs_", name); + + if (attrs_.count("input_format") == 0) { + return inputs_[it->second]; + } else { + const auto& input_format = GetAttr>("input_format"); + int idx = input_format[it->second]; + return inputs_.at(idx); + } +} + +std::vector OperatorBase::Inputs(const std::string& name) const { + auto input_format = GetAttr>("input_format"); + auto offset = in_out_idxs_.at(name); + + return std::vector{ + inputs_.begin() + input_format.at(offset), + inputs_.begin() + input_format.at(offset + 1)}; +} + +const std::string& OperatorBase::Output(const std::string& name) const { + auto it = in_out_idxs_.find(name); + PADDLE_ENFORCE(it != in_out_idxs_.end(), "no key [%s] in in_out_idxs_", name); + + if (attrs_.count("output_format") == 0) { + return outputs_[it->second]; + } else { + const auto& output_format = GetAttr>("output_format"); + int idx = output_format[it->second]; + return outputs_.at(idx); + } +} + +std::vector OperatorBase::Outputs(const std::string& name) const { + auto output_format = GetAttr>("output_format"); + auto offset = in_out_idxs_.at(name); + + return std::vector{ + outputs_.begin() + output_format.at(offset), + outputs_.begin() + output_format.at(offset + 1)}; +} + std::string OperatorBase::DebugString() const { std::stringstream ss; - ss << "=================\n"; - ss << "type = " << desc_.type() << "\n"; - ss << "inputs = ["; - for (auto& ipt : inputs_) { - ss << ipt << ", "; - } - ss << "]\n"; - ss << "outputs = ["; - for (auto& opt : outputs_) { - ss << opt << ", "; + ss << "Op(" << type_ << "), inputs:("; + for (size_t i = 0; i < inputs_.size(); ++i) { + ss << inputs_[i]; + if (i != inputs_.size() - 1) { + ss << ", "; + } } - ss << "]\n"; - ss << "attr_keys = ["; - for (auto& attr : attrs_) { - ss << attr.first << ", "; + ss << "), outputs:("; + for (size_t i = 0; i < outputs_.size(); ++i) { + ss << outputs_[i]; + if (i != outputs_.size() - 1) { + ss << ", "; + } } - ss << "]\n"; + ss << ")."; return ss.str(); } } // namespace framework -} // namespace paddle \ No newline at end 
of file +} // namespace paddle diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h index 0ce422e007c39ce0c3f5f7a89650cc211919ea8f..2fe9670677c6041c0360d096b88e818676a8c929 100644 --- a/paddle/framework/operator.h +++ b/paddle/framework/operator.h @@ -14,22 +14,25 @@ limitations under the License. */ #pragma once -#include -#include -#include -#include -#include -#include #include #include #include #include +#include "paddle/framework/attr_checker.h" +#include "paddle/framework/op_desc.pb.h" +#include "paddle/framework/op_proto.pb.h" +#include "paddle/framework/scope.h" +#include "paddle/framework/tensor.h" +#include "paddle/platform/device_context.h" +#include "paddle/platform/place.h" +#include "paddle/utils/Error.h" + namespace paddle { namespace framework { class OperatorBase; - +using OperatorPtr = std::shared_ptr; /** * OperatorBase has the basic element that Net will call to do computation. * Only CreateOperator from OpRegistry will new Operator directly. User @@ -38,6 +41,13 @@ class OperatorBase; */ class OperatorBase { public: + /// If a variable is a empty variable, that name will be used. + static std::string EMPTY_VAR_NAME() { return "@EMPTY@"; } + + /// If a variable is a temporary variable, that name will be set in Python, + /// but it will be convert to a unique name in scope after OpCreator. + static std::string TMP_VAR_NAME() { return "@TEMP@"; } + virtual ~OperatorBase() {} template @@ -49,22 +59,84 @@ class OperatorBase { std::string DebugString() const; + /// Init will be called after CreateOperator, you can put some initialization + /// logic here. + virtual void Init() {} + /// InferShape infer the size of Variables used by this Operator with /// information inside scope - virtual void InferShape(const std::shared_ptr& scope) const = 0; + virtual void InferShape(const ScopePtr& scope) const = 0; /// Net will call this function to Run an op. - virtual void Run(const std::shared_ptr& scope, + virtual void Run(const ScopePtr& scope, const platform::DeviceContext& dev_ctx) const = 0; - protected: - std::string Type() const { return desc_.type(); } + // Get a input with argument's name described in `op_proto` + const std::string& Input(const std::string& name) const; + // Get a input which has multiple variables. + // TODO add a vector_view to prevent memory copy. + std::vector Inputs(const std::string& name) const; + // Get a output with argument's name described in `op_proto` + const std::string& Output(const std::string& name) const; + // Get an output which has multiple variables. + // TODO add a vector_view to prevent memory copy. + std::vector Outputs(const std::string& name) const; + + // init in_out_idxs_ to accelerate argument's offset lookup. + void CreateInOutOffsetMap(const OpProto& proto); public: - OpDesc desc_; + std::string type_; std::vector inputs_; std::vector outputs_; AttributeMap attrs_; + // store the arguments' offset described in op_desc. 
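To make the offset lookup concrete, suppose (hypothetically) the proto declares inputs X (multiple) and b, and the OpDesc supplies four input names with input_format = {0, 3, 4}; the accessors above then resolve as follows.

    // Hypothetical values, following the input_format convention from op_proto.proto.
    // op->inputs_          == {"x0", "x1", "x2", "b0"}
    // attr "input_format"  == {0, 3, 4}       // X -> inputs_[0,3), b -> inputs_[3,4)
    std::vector<std::string> xs = op->Inputs("X");  // {"x0", "x1", "x2"}
    const std::string& b = op->Input("b");          // "b0"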
+ std::unordered_map in_out_idxs_; +}; + +class KernelContext { + public: + KernelContext(const OperatorBase* op, const std::shared_ptr& scope, + const platform::DeviceContext& device_context) + : op_(*op), scope_(scope), device_context_(device_context) {} + + const Variable* Input(int index) const { + return scope_->GetVariable(op_.inputs_[index]); + } + + Variable* Output(int index) const { + return scope_->GetVariable(op_.outputs_[index]); + } + + const Variable* Input(const std::string& name) const { + return scope_->GetVariable(op_.Input(name)); + } + + const Variable* Output(const std::string& name) const { + return scope_->GetVariable(op_.Output(name)); + } + + const std::vector Inputs(const std::string& name) const { + auto names = op_.Inputs(name); + std::vector res; + std::transform( + names.begin(), names.end(), res.begin(), + [this](const std::string& name) { return scope_->GetVariable(name); }); + return res; + } + + const std::vector Outputs(const std::string& name) const { + auto names = op_.Outputs(name); + std::vector res; + std::transform( + names.begin(), names.end(), res.begin(), + [this](const std::string& name) { return scope_->GetVariable(name); }); + return res; + } + + const OperatorBase& op_; + const std::shared_ptr& scope_; + const platform::DeviceContext& device_context_; }; class OpKernel { @@ -75,28 +147,22 @@ class OpKernel { * device resource such as CUDA stream, cublas handle, etc. from * KernelContext. User should construct it before run the Operator. */ - class KernelContext { - public: - KernelContext(const OperatorBase* op, const std::shared_ptr& scope, - const platform::DeviceContext& device_context) - : op_(*op), scope_(scope), device_context_(device_context) {} - - const Variable* Input(int index) const { - return scope_->GetVariable(op_.inputs_[index]); - } + virtual void Compute(const KernelContext& context) const = 0; - Variable* Output(int index) const { - return scope_->GetVariable(op_.outputs_[index]); - } + virtual ~OpKernel() {} +}; - const OperatorBase& op_; - const std::shared_ptr& scope_; - const platform::DeviceContext& device_context_; - }; +template +struct VarToTensor {}; - virtual void Compute(const KernelContext& context) const = 0; +template <> +struct VarToTensor { + Tensor* operator()(Variable* var) { return var->GetMutable(); } +}; - virtual ~OpKernel() {} +template <> +struct VarToTensor { + const Tensor* operator()(Variable* var) { return &var->Get(); } }; class OperatorWithKernel : public OperatorBase { @@ -122,29 +188,47 @@ class OperatorWithKernel : public OperatorBase { using OpKernelMap = std::unordered_map, OpKernelHash>; - void Run(const std::shared_ptr& scope, + void Run(const ScopePtr& scope, const platform::DeviceContext& dev_ctx) const final { - auto& opKernel = AllOpKernels().at(Type()).at(OpKernelKey(dev_ctx)); - opKernel->Compute(OpKernel::KernelContext(this, scope, dev_ctx)); + auto& opKernel = AllOpKernels().at(type_).at(OpKernelKey(dev_ctx)); + opKernel->Compute(KernelContext(this, scope, dev_ctx)); } static std::unordered_map& AllOpKernels() { static std::unordered_map g_all_op_kernels; return g_all_op_kernels; + } + + void InferShape(const std::shared_ptr& scope) const final { + std::vector ins; + VarNamesToTensors(scope, inputs_, &ins); + std::vector outs; + VarNamesToTensors(scope, outputs_, &outs); + InferShape(ins, outs); }; + + private: + template + void VarNamesToTensors(const std::shared_ptr& scope, + const std::vector& var_names, + std::vector* container) const { + 
container->reserve(var_names.size()); + VarToTensor convert; + for (auto& name : var_names) { + auto var = scope->GetVariable(name); + if (var != nullptr) { + container->push_back(convert(var)); + } else { + container->push_back(nullptr); + } + } + } + + protected: + virtual void InferShape(const std::vector& inputs, + const std::vector& outputs) const = 0; }; } // namespace framework } // namespace paddle - -#define REGISTER_OP_KERNEL(type, PlaceType, KernelType) \ - struct __op_kernel_register__##type##__ { \ - __op_kernel_register__##type##__() { \ - ::paddle::framework::OperatorWithKernel::OpKernelKey key; \ - key.place_ = PlaceType(); \ - ::paddle::framework::OperatorWithKernel::AllOpKernels()[#type][key] \ - .reset(new KernelType()); \ - } \ - }; \ - static __op_kernel_register__##type##__ __reg_kernel_##type##__ diff --git a/paddle/framework/operator_test.cc b/paddle/framework/operator_test.cc index be8c4be2d429648b3c8a708c7f8bdcae3ff2d283..6fa110f94ccc0c0a2f2e61316aa5dc271631a11c 100644 --- a/paddle/framework/operator_test.cc +++ b/paddle/framework/operator_test.cc @@ -19,90 +19,163 @@ limitations under the License. */ namespace paddle { namespace framework { -class OperatorTest : public OperatorBase { +static int op_run_num = 0; + +class OpWithoutKernelTest : public OperatorBase { public: - void InferShape(const std::shared_ptr& scope) const override {} - void Run(const std::shared_ptr& scope, + void Init() override { x = 1; } + void InferShape(const ScopePtr& scope) const override {} + void Run(const ScopePtr& scope, const platform::DeviceContext& dev_ctx) const override { - float scale = GetAttr("scale"); - ASSERT_NEAR(scale, 3.14, 1e-5); + op_run_num++; + ASSERT_EQ((int)inputs_.size(), 1); + ASSERT_EQ((int)outputs_.size(), 1); ASSERT_EQ(scope->GetVariable(inputs_[0]), nullptr); + ASSERT_EQ(x, 1); ASSERT_NE(scope->GetVariable(outputs_[0]), nullptr); } + + public: + float x = 0; }; -class OperatorTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker { +class OpeWithoutKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker { public: - OperatorTestProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker) + OpeWithoutKernelTestProtoAndCheckerMaker(OpProto* proto, + OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("input", "input of test op"); AddOutput("output", "output of test op"); - AddAttr("scale", "scale of cosine op") - .SetDefault(1.0) - .LargerThan(0.0); - AddType("test_operator"); + AddAttr("scale", "scale of cosine op"); AddComment("This is test op"); } }; -REGISTER_OP(test_operator, OperatorTest, OperatorTestProtoAndCheckerMaker); +} // namespace framework +} // namespace paddle + +REGISTER_OP(test_operator, paddle::framework::OpWithoutKernelTest, + paddle::framework::OpeWithoutKernelTestProtoAndCheckerMaker); TEST(OperatorBase, all) { - OpDesc op_desc; + paddle::framework::OpDesc op_desc; op_desc.set_type("test_operator"); *op_desc.mutable_inputs()->Add() = "IN1"; *op_desc.mutable_outputs()->Add() = "OUT1"; auto attr = op_desc.mutable_attrs()->Add(); attr->set_name("scale"); attr->set_type(paddle::framework::AttrType::FLOAT); - float scale = 3.14; - attr->set_f(scale); + attr->set_f(3.14); - platform::CPUDeviceContext device_context; - auto scope = std::make_shared(); + paddle::platform::CPUDeviceContext device_context; + auto scope = std::make_shared(); - OperatorBase* op = paddle::framework::OpRegistry::CreateOp(op_desc); - ASSERT_EQ(op->GetAttr("scale"), scale); + paddle::framework::OperatorPtr op = + 
paddle::framework::OpRegistry::CreateOp(op_desc); scope->CreateVariable("OUT1"); + ASSERT_EQ(paddle::framework::op_run_num, 0); op->Run(scope, device_context); - std::cout << op->DebugString() << std::endl; - delete op; + ASSERT_EQ(paddle::framework::op_run_num, 1); } +namespace paddle { +namespace framework { + class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker { public: OpKernelTestProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("input", "input of test op"); - AddOutput("output", "output of test op"); + AddInput("x", "input of test op"); + AddOutput("y", "output of test op"); AddAttr("scale", "scale of cosine op") .SetDefault(1.0) .LargerThan(0.0); - AddType("test_operator"); AddComment("This is test op"); } }; +static int cpu_kernel_run_num = 0; + class OpWithKernelTest : public OperatorWithKernel { - public: - void InferShape(const std::shared_ptr& scope) const override {} + protected: + void InferShape(const std::vector& inputs, + const std::vector& outputs) const override {} }; class CPUKernelTest : public OpKernel { public: - void Compute(const KernelContext& context) const { - float scale = context.op_.GetAttr("scale"); - ASSERT_NEAR(scale, 3.14, 1e-5); + void Compute(const KernelContext& ctx) const { std::cout << "this is cpu kernel" << std::endl; - std::cout << context.op_.DebugString() << std::endl; + std::cout << ctx.op_.DebugString() << std::endl; + cpu_kernel_run_num++; + ASSERT_EQ(ctx.op_.Input("x"), "IN1"); + ASSERT_EQ(ctx.op_.Output("y"), "OUT1"); + } +}; + +// multiple inputs test +class OperatorMultiInputsTest : public OperatorBase { + public: + void Init() override { x = 1; } + void InferShape(const std::shared_ptr& scope) const override {} + void Run(const std::shared_ptr& scope, + const platform::DeviceContext& dev_ctx) const override { + ASSERT_EQ(scope->GetVariable(inputs_[0]), nullptr); + ASSERT_EQ(x, 1); + ASSERT_NE(scope->GetVariable(outputs_[0]), nullptr); + ASSERT_EQ(Input("x"), "IN1"); + ASSERT_EQ(Input("y"), "OUT1"); + } + + public: + float x = 0; +}; + +class OpKernelTestMultiInputsProtoAndCheckerMaker + : public OpProtoAndCheckerMaker { + public: + OpKernelTestMultiInputsProtoAndCheckerMaker(OpProto* proto, + OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInputs("xs", "inputs of test op"); + AddInput("k", "input of test op"); + AddOutputs("ys", "outputs of test op"); + AddAttr("scale", "scale of cosine op") + .SetDefault(1.0) + .LargerThan(0.0); + AddComment("This is test op"); + } +}; + +class CPUKernalMultiInputsTest : public OpKernel { + public: + void Compute(const KernelContext& ctx) const { + auto xs = ctx.op_.Inputs("xs"); + ASSERT_EQ(xs.size(), 3UL); + ASSERT_EQ(xs[0], "x0"); + ASSERT_EQ(xs[1], "x1"); + ASSERT_EQ(xs[2], "x2"); + + auto k = ctx.op_.Input("k"); + ASSERT_EQ(k, "k0"); + + auto ys = ctx.op_.Outputs("ys"); + ASSERT_EQ(ys.size(), 2UL); + ASSERT_EQ(ys[0], "y0"); + ASSERT_EQ(ys[1], "y1"); } }; -REGISTER_OP(op_with_kernel, OpWithKernelTest, OpKernelTestProtoAndCheckerMaker); -REGISTER_OP_KERNEL(op_with_kernel, platform::CPUPlace, CPUKernelTest); +} // namespace framework +} // namespace paddle + +REGISTER_OP(op_with_kernel, paddle::framework::OpWithKernelTest, + paddle::framework::OpKernelTestProtoAndCheckerMaker); +REGISTER_OP_CPU_KERNEL(op_with_kernel, paddle::framework::CPUKernelTest); +// test with single input TEST(OpKernel, all) { - OpDesc op_desc; + paddle::framework::OpDesc op_desc; 
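The registration macros are meant to be paired with a USE_OP declaration in whichever translation unit creates the op through OpRegistry, so the linker keeps the registering object files. A sketch with a hypothetical op name (AddOp, AddOpMaker, AddCpuKernel are placeholders):

    // In the op's implementation file(s):
    REGISTER_OP(add_two, AddOp, AddOpMaker);
    REGISTER_OP_CPU_KERNEL(add_two, AddCpuKernel);
    // REGISTER_OP_GPU_KERNEL(add_two, AddGpuKernel);  // in the .cu file

    // In a file that only creates "add_two" via OpRegistry::CreateOp:
    USE_OP(add_two);  // USE_OP_CPU, plus the GPU kernel unless PADDLE_ONLY_CPU is set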
op_desc.set_type("op_with_kernel"); *op_desc.mutable_inputs()->Add() = "IN1"; *op_desc.mutable_outputs()->Add() = "OUT1"; @@ -111,13 +184,56 @@ TEST(OpKernel, all) { attr->set_type(paddle::framework::AttrType::FLOAT); attr->set_f(3.14); - platform::CPUDeviceContext cpu_device_context; - auto scope = std::make_shared(); + paddle::platform::CPUDeviceContext cpu_device_context; + auto scope = std::make_shared(); - OperatorBase* op = paddle::framework::OpRegistry::CreateOp(op_desc); + paddle::framework::OperatorPtr op = + paddle::framework::OpRegistry::CreateOp(op_desc); + ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 0); op->Run(scope, cpu_device_context); + ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 1); +} + +REGISTER_OP(op_multi_inputs_with_kernel, paddle::framework::OpWithKernelTest, + paddle::framework::OpKernelTestMultiInputsProtoAndCheckerMaker); +REGISTER_OP_CPU_KERNEL(op_multi_inputs_with_kernel, + paddle::framework::CPUKernalMultiInputsTest); - delete op; +// test with multi inputs +TEST(OpKernel, multi_inputs) { + using namespace paddle::framework; + + OpDesc op_desc; + op_desc.set_type("op_multi_inputs_with_kernel"); + *op_desc.mutable_inputs()->Add() = "x0"; + *op_desc.mutable_inputs()->Add() = "x1"; + *op_desc.mutable_inputs()->Add() = "x2"; + *op_desc.mutable_inputs()->Add() = "k0"; + *op_desc.mutable_outputs()->Add() = "y0"; + *op_desc.mutable_outputs()->Add() = "y1"; + auto attr = op_desc.mutable_attrs()->Add(); + attr->set_name("scale"); + attr->set_type(paddle::framework::AttrType::FLOAT); + attr->set_f(3.14); + + auto attr0 = op_desc.mutable_attrs()->Add(); + attr0->set_name("input_format"); + attr0->set_type(paddle::framework::AttrType::INTS); + auto input_format = attr0->mutable_ints(); + input_format->Add(0); // x0 + input_format->Add(3); // k + input_format->Add(4); // end + + auto attr1 = op_desc.mutable_attrs()->Add(); + attr1->set_name("output_format"); + attr1->set_type(paddle::framework::AttrType::INTS); + auto output_format = attr1->mutable_ints(); + output_format->Add(0); // y0 + output_format->Add(2); // y1 + + paddle::platform::CPUDeviceContext cpu_device_context; + auto scope = std::make_shared(); + + OperatorPtr op(paddle::framework::OpRegistry::CreateOp(op_desc)); + op->Run(scope, cpu_device_context); } -} // namespace framework -} // namespace paddle \ No newline at end of file diff --git a/paddle/framework/scope.h b/paddle/framework/scope.h index a4470f726fb0d59a82db29b3239c111ea1569c55..ec62c9189fd2a5ea74c6d6e5635a4d500e4823e2 100644 --- a/paddle/framework/scope.h +++ b/paddle/framework/scope.h @@ -23,6 +23,9 @@ limitations under the License. */ namespace paddle { namespace framework { +class Scope; +using ScopePtr = std::shared_ptr; + /** * @brief Scope that manage all variables. * @@ -41,7 +44,7 @@ class Scope { /** * @brief Initialize a Scope with parent. */ - explicit Scope(const std::shared_ptr& parent) : parent_(parent) {} + explicit Scope(const ScopePtr& parent) : parent_(parent) {} /** * @brief Create Variable @@ -88,7 +91,7 @@ class Scope { private: std::unordered_map> vars_; - std::shared_ptr parent_{nullptr}; + ScopePtr parent_{nullptr}; }; } // namespace framework diff --git a/paddle/framework/tensor.cc b/paddle/framework/tensor.cc new file mode 100644 index 0000000000000000000000000000000000000000..964f15ab66bca7da75824e192e61600c29e572c0 --- /dev/null +++ b/paddle/framework/tensor.cc @@ -0,0 +1,19 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include + +namespace paddle { +namespace framework {} +} // namespace paddle diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h index a0945e8055625ca4c21ea1c3fa9f27321ca9ba3c..b405e3877c9f15712dd5bbf08de1e4f7148e0fd9 100644 --- a/paddle/framework/tensor.h +++ b/paddle/framework/tensor.h @@ -15,8 +15,8 @@ limitations under the License. */ #pragma once #include +#include #include -#include #include "paddle/framework/ddim.h" #include "paddle/framework/enforce.h" #include "paddle/memory/memory.h" @@ -29,43 +29,71 @@ class Tensor { public: Tensor() : offset_(0) {} - explicit Tensor(const DDim& dims) : dims_(dims), offset_(0) {} - template const T* data() const { - PADDLE_ENFORCE( - holder_ != nullptr, - "Tenosr has not been initialized. Call Tensor::mutable_data first."); + CheckDims(); return reinterpret_cast( - reinterpret_cast(holder_->Ptr()) + offset_); + reinterpret_cast(holder_->ptr()) + offset_); } - template ::value>::type* = nullptr> - T* mutable_data(DDim dims, paddle::platform::Place place) { - dims_ = dims; + template + T* mutable_data(DDim dims, platform::Place place) { + set_dims(dims); + return mutable_data(place); + } + + template + T* mutable_data(platform::Place place) { + PADDLE_ENFORCE(product(dims_) > 0, + "Tensor's numel must be larger than zero to call " + "Tensor::mutable_data. 
Call Tensor::set_dim first."); if (holder_ == nullptr || - !(holder_->Place() == + !(holder_->place() == place) /* some versions of boost::variant don't have operator!= */ - || holder_->Size() < product(dims) * sizeof(T) + offset_) { - holder_.reset(new PlaceholderImpl(place, product(dims) * sizeof(T))); + || holder_->size() < product(dims_) * sizeof(T) + offset_) { + if (platform::is_cpu_place(place)) { + holder_.reset(new PlaceholderImpl( + boost::get(place), product(dims_) * sizeof(T))); + } else if (platform::is_gpu_place(place)) { +#ifdef __CUDACC__ + holder_.reset(new PlaceholderImpl( + boost::get(place), product(dims_) * sizeof(T))); +#else + PADDLE_ENFORCE(true, "'GPUPlace' is not supported in CPU only device."); +#endif + } else { + PADDLE_ENFORCE(true, "Unknown 'place'."); + } offset_ = 0; } - return reinterpret_cast(reinterpret_cast(holder_->Ptr()) + + return reinterpret_cast(reinterpret_cast(holder_->ptr()) + offset_); } + template void ShareDataFrom(const Tensor& src) { - PADDLE_ENFORCE(src.holder_ != nullptr, - "Can not share data from an uninitialized tensor."); + src.CheckDims(); holder_ = src.holder_; - dims_ = src.dims_; + set_dims(src.dims()); offset_ = src.offset_; } + template + void CopyFrom(const Tensor& src, platform::Place dst_place) { + PADDLE_ENFORCE(platform::is_cpu_place(src.holder_->place()) && + platform::is_cpu_place(dst_place), + "Tensor::CopyFrom only support CPU now."); + src.CheckDims(); + size_t size = product(src.dims_) * sizeof(T); + set_dims(src.dims()); + const void* src_ptr = static_cast(src.data()); + void* dst_ptr = static_cast(mutable_data(dst_place)); + memcpy(dst_ptr, src_ptr, size); + } + + template Tensor Slice(const int& begin_idx, const int& end_idx) const { - PADDLE_ENFORCE(holder_ != nullptr, - "The sliced tenosr has not been initialized."); + CheckDims(); PADDLE_ENFORCE(begin_idx >= 0 && end_idx <= dims_[0], "Slice index is less than zero or out of bound."); PADDLE_ENFORCE(begin_idx < end_idx, @@ -78,12 +106,20 @@ class Tensor { } Tensor dst; dst.holder_ = holder_; - dst.dims_ = dims_; - dst.dims_[0] = end_idx - begin_idx; - dst.offset_ = offset_ + begin_idx * base * holder_->TypeSize(); + DDim dst_dims = dims_; + dst_dims[0] = end_idx - begin_idx; + dst.set_dims(dst_dims); + dst.offset_ = offset_ + begin_idx * base * sizeof(T); return dst; } + void set_dims(const DDim& dims) { + if (dims == dims_) { + return; + } + dims_ = dims; + } + DDim dims() const { return dims_; } private: @@ -91,43 +127,49 @@ class Tensor { // parameter of Variable. 
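A small CPU-only sketch of the reworked Tensor interface (the element type is now a template argument on mutable_data, ShareDataFrom, CopyFrom and Slice; this mirrors tensor_test.cc further below, and all values are made up):

    paddle::framework::Tensor src;
    float* p = src.mutable_data<float>(paddle::framework::make_ddim({5, 4}),
                                       paddle::platform::CPUPlace());
    (void)p;                                                   // fill the buffer as needed
    paddle::framework::Tensor view = src.Slice<float>(1, 3);   // rows [1, 3), shares holder_
    paddle::framework::Tensor copy;
    copy.CopyFrom<float>(view, paddle::platform::CPUPlace());  // deep memcpy of the slice
    paddle::framework::Tensor alias;
    alias.ShareDataFrom<float>(src);                           // aliases src's memory, no copy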
struct Placeholder { virtual ~Placeholder() {} - virtual void* Ptr() const = 0; - virtual paddle::platform::Place Place() const = 0; - virtual size_t Size() const = 0; - virtual size_t TypeSize() const = 0; + virtual void* ptr() const = 0; + virtual platform::Place place() const = 0; + virtual size_t size() const = 0; }; - template + template struct PlaceholderImpl : public Placeholder { private: + template class Deleter { public: - Deleter(platform::Place place) : place_(place) {} - void operator()(T* ptr) { - paddle::memory::Free(place_, static_cast(ptr)); - } + Deleter(PType place) : place_(place) {} + void operator()(T* ptr) { memory::Free(place_, static_cast(ptr)); } private: - paddle::platform::Place place_; + PType place_; }; public: - PlaceholderImpl(paddle::platform::Place place, size_t size) - : ptr_(static_cast(paddle::memory::Alloc(place, size)), - Deleter(place)), + PlaceholderImpl(PlaceType place, size_t size) + : ptr_(static_cast(memory::Alloc(place, size)), + Deleter(place)), place_(place), size_(size) {} - virtual void* Ptr() const { return static_cast(ptr_.get()); } - virtual size_t Size() const { return size_; } - virtual paddle::platform::Place Place() const { return place_; } - virtual size_t TypeSize() const { return sizeof(T); } + virtual void* ptr() const { return static_cast(ptr_.get()); } + virtual size_t size() const { return size_; } + virtual platform::Place place() const { return place_; } - std::unique_ptr ptr_; - paddle::platform::Place place_; // record the place of ptr_. - size_t size_; // size of the memory block. + std::unique_ptr> ptr_; + platform::Place place_; // record the place of ptr_. + size_t size_; // size of the memory block. }; + template + inline void CheckDims() const { + PADDLE_ENFORCE(holder_ != nullptr, + "Tenosr holds no memory. Call Tensor::mutable_data first."); + PADDLE_ENFORCE(holder_->size() >= product(dims_) * sizeof(T) + offset_, + "Tensor's dims_ is out of bound. Call Tensor::mutable_data " + "first to re-allocate memory."); + } + std::shared_ptr holder_; // holds the memory block if allocated. DDim dims_; size_t offset_; // marks the begin of tensor data area. diff --git a/paddle/framework/tensor_test.cc b/paddle/framework/tensor_test.cc index f4822838cfbd27656232a23b14f716f2fbe510e0..84c6f0cf6558819440458688ca52b06c1cf11dd0 100644 --- a/paddle/framework/tensor_test.cc +++ b/paddle/framework/tensor_test.cc @@ -18,7 +18,8 @@ TEST(Tensor, Dims) { using namespace paddle::framework; using namespace paddle::platform; - Tensor tt(make_ddim({2, 3, 4})); + Tensor tt; + tt.set_dims(make_ddim({2, 3, 4})); DDim dims = tt.dims(); ASSERT_EQ(arity(dims), 3); for (int i = 0; i < 3; ++i) { @@ -35,7 +36,7 @@ TEST(Tensor, DataAssert) { } catch (paddle::framework::EnforceNotMet err) { caught = true; std::string msg = - "Tenosr has not been initialized. Call Tensor::mutable_data first."; + "Tenosr holds no memory. Call Tensor::mutable_data first."; const char* what = err.what(); for (size_t i = 0; i < msg.length(); ++i) { ASSERT_EQ(what[i], msg[i]); @@ -46,7 +47,7 @@ TEST(Tensor, DataAssert) { /* following tests are not available at present because Memory::Alloc() and Memory::Free() have not been ready. 
- +*/ TEST(Tensor, MutableData) { using namespace paddle::framework; using namespace paddle::platform; @@ -71,7 +72,7 @@ TEST(Tensor, MutableData) { p2 = src_tensor.mutable_data(make_ddim({2, 2}), CPUPlace()); EXPECT_EQ(p1, p2); } - +#ifdef __CUDACC__ { Tensor src_tensor; float* p1 = nullptr; @@ -93,6 +94,7 @@ TEST(Tensor, MutableData) { p2 = src_tensor.mutable_data(make_ddim({2, 2}), GPUPlace()); EXPECT_EQ(p1, p2); } +#endif } TEST(Tensor, ShareDataFrom) { @@ -104,10 +106,11 @@ TEST(Tensor, ShareDataFrom) { // Try to share data form uninitialized tensor bool caught = false; try { - dst_tensor.ShareDataFrom(src_tensor); + dst_tensor.ShareDataFrom(src_tensor); } catch (EnforceNotMet err) { caught = true; - std::string msg = "Can not share data from an uninitialized tensor."; + std::string msg = + "Tenosr holds no memory. Call Tensor::mutable_data first."; const char* what = err.what(); for (size_t i = 0; i < msg.length(); ++i) { ASSERT_EQ(what[i], msg[i]); @@ -116,17 +119,19 @@ TEST(Tensor, ShareDataFrom) { ASSERT_TRUE(caught); src_tensor.mutable_data(make_ddim({2, 3, 4}), CPUPlace()); - dst_tensor.ShareDataFrom(src_tensor); + dst_tensor.ShareDataFrom(src_tensor); ASSERT_EQ(src_tensor.data(), dst_tensor.data()); } +#ifdef __CUDACC__ { Tensor src_tensor; Tensor dst_tensor; src_tensor.mutable_data(make_ddim({2, 3, 4}), GPUPlace()); - dst_tensor.ShareDataFrom(src_tensor); + dst_tensor.ShareDataFrom(src_tensor); ASSERT_EQ(src_tensor.data(), dst_tensor.data()); } +#endif } TEST(Tensor, Slice) { @@ -135,7 +140,7 @@ TEST(Tensor, Slice) { { Tensor src_tensor; src_tensor.mutable_data(make_ddim({5, 3, 4}), CPUPlace()); - Tensor slice_tensor = src_tensor.Slice(1, 3); + Tensor slice_tensor = src_tensor.Slice(1, 3); DDim slice_dims = slice_tensor.dims(); ASSERT_EQ(arity(slice_dims), 3); EXPECT_EQ(slice_dims[0], 2); @@ -155,10 +160,11 @@ TEST(Tensor, Slice) { EXPECT_EQ(src_data_address + 3 * 4 * 1 * sizeof(int), slice_data_address); } +#ifdef __CUDACC__ { Tensor src_tensor; src_tensor.mutable_data(make_ddim({6, 9}), GPUPlace()); - Tensor slice_tensor = src_tensor.Slice(2, 6); + Tensor slice_tensor = src_tensor.Slice(2, 6); DDim slice_dims = slice_tensor.dims(); ASSERT_EQ(arity(slice_dims), 2); EXPECT_EQ(slice_dims[0], 4); @@ -176,6 +182,31 @@ TEST(Tensor, Slice) { EXPECT_EQ(slice_data_address, slice_mutable_data_address); EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address); } +#endif } -*/ \ No newline at end of file +TEST(Tensor, CopyFrom) { + using namespace paddle::framework; + using namespace paddle::platform; + + Tensor src_tensor; + int* src_ptr = src_tensor.mutable_data(make_ddim({3, 3}), CPUPlace()); + int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9}; + memcpy(src_ptr, arr, 9 * sizeof(int)); + Tensor dst_tensor; + dst_tensor.CopyFrom(src_tensor, CPUPlace()); + const int* dst_ptr = dst_tensor.data(); + ASSERT_NE(src_ptr, dst_ptr); + for (size_t i = 0; i < 9; ++i) { + EXPECT_EQ(src_ptr[i], dst_ptr[i]); + } + + Tensor slice_tensor = src_tensor.Slice(1, 2); + dst_tensor.CopyFrom(slice_tensor, CPUPlace()); + const int* slice_ptr = slice_tensor.data(); + dst_ptr = dst_tensor.data(); + ASSERT_NE(dst_ptr, slice_ptr); + for (size_t i = 0; i < 3; ++i) { + EXPECT_EQ(dst_ptr[i], slice_ptr[i]); + } +} diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt index 1518a8a654cfb54376a49760dc5873733c916937..a5b14c0c71c18da1bb0b506c663f8680b1c3830a 100644 --- a/paddle/function/CMakeLists.txt +++ b/paddle/function/CMakeLists.txt @@ -11,7 +11,6 @@ if(WITH_GPU) endif() if(USE_NNPACK) 
- include(nnpack/nnpack.cmake) list(APPEND cpp_files nnpack/NNPACKConvOp.cpp) if(WITH_TESTING) add_unittest(NNPACKConvOpTest nnpack/NNPACKConvOpTest.cpp) diff --git a/paddle/function/GemmConvOp.cpp b/paddle/function/GemmConvOp.cpp index a40e5d9d2e76605525f0956445fc43c693933cf8..00880effc59cc80b2761fb6a4d9f3246439afd3f 100644 --- a/paddle/function/GemmConvOp.cpp +++ b/paddle/function/GemmConvOp.cpp @@ -117,8 +117,7 @@ public: ConvFunctionBase::init(config); } - virtual void check(const BufferArgs& inputs, - const BufferArgs& outputs) override { + void check(const BufferArgs& inputs, const BufferArgs& outputs) override { const TensorShape& input = inputs[0].shape(); const TensorShape& filter = inputs[1].shape(); const TensorShape& output = outputs[0].shape(); @@ -217,8 +216,7 @@ public: ConvFunctionBase::init(config); } - virtual void check(const BufferArgs& inputs, - const BufferArgs& outputs) override { + void check(const BufferArgs& inputs, const BufferArgs& outputs) override { const TensorShape& output = inputs[0].shape(); const TensorShape& filter = inputs[1].shape(); const TensorShape& input = outputs[0].shape(); @@ -311,8 +309,7 @@ public: ConvFunctionBase::init(config); } - virtual void check(const BufferArgs& inputs, - const BufferArgs& outputs) override { + void check(const BufferArgs& inputs, const BufferArgs& outputs) override { const TensorShape& output = inputs[0].shape(); const TensorShape& input = inputs[1].shape(); const TensorShape& filter = outputs[0].shape(); diff --git a/paddle/function/NaiveConvOp.cpp b/paddle/function/NaiveConvOp.cpp index 4348f0f775e9442c50a3c45b9a8e6dad5c6b198d..e0692fa06d6e0c35cfa742ca3eac7fe2037b1a80 100644 --- a/paddle/function/NaiveConvOp.cpp +++ b/paddle/function/NaiveConvOp.cpp @@ -90,8 +90,7 @@ public: ConvFunctionBase::init(config); } - virtual void check(const BufferArgs& inputs, - const BufferArgs& outputs) override { + void check(const BufferArgs& inputs, const BufferArgs& outputs) override { const TensorShape& input = inputs[0].shape(); const TensorShape& filter = inputs[1].shape(); const TensorShape& output = outputs[0].shape(); diff --git a/paddle/function/RowConvOpGpu.cu b/paddle/function/RowConvOpGpu.cu index c0b947e224313abaf4fadfb8293dc78ca085ff84..d9dcc7d59d1e3c222f5a7ce448daa8d7edb6c978 100644 --- a/paddle/function/RowConvOpGpu.cu +++ b/paddle/function/RowConvOpGpu.cu @@ -32,7 +32,7 @@ __global__ void KeRowConv(real* y, const real* x, const real* w, for (int i = tidy; i < context; i += blky) { sw[i][tidx] = gidx + tidx < width ? w[i*width + gidx + tidx] : 0.0; } - + __syncthreads(); for (int i = 0; i < numSeq; ++i) { @@ -144,12 +144,15 @@ __global__ void KeRowConvBwWeight(real* dw, const real* x, const real* dy, int yoff = start + j; // transpose - sh_x[tidx][tidy] = (xoff < width && yoff < end) ? x[yoff * width + xoff] : 0.0; - sh_dy[tidx][tidy + context - 1] = (xoff < width && yoff < end) ? dy[yoff * width + xoff] : 0.0; + sh_x[tidx][tidy] = (xoff < width && yoff < end) ? + x[yoff * width + xoff] : 0.0; + sh_dy[tidx][tidy + context - 1] = (xoff < width && yoff < end) ? + dy[yoff * width + xoff] : 0.0; __syncthreads(); if (tidy < (context - 1)) { yoff = yoff - context + 1; - sh_dy[tidx][tidy] = (xoff < width && yoff >= start) ? dy[yoff * width + xoff] : 0.0; + sh_dy[tidx][tidy] = (xoff < width && yoff >= start) ? 
+ dy[yoff * width + xoff] : 0.0; } __syncthreads(); @@ -199,11 +202,13 @@ __global__ void KeRowConvBwWeight2(real* dw, const real* x, const real* dy, int yoff = start + j; // transpose - sh_x[tidx][tidy] = (xoff < width && yoff < end) ? x[yoff * width + xoff] : 0.0; + sh_x[tidx][tidy] = (xoff < width && yoff < end) ? + x[yoff * width + xoff] : 0.0; __syncthreads(); for (int t = 0; t < context; t++) { - sh_dy[tidx][tidy] = (xoff < width && (yoff - t) >= start && yoff - t < end) ? dy[(yoff - t) * width + xoff] : 0.0; + sh_dy[tidx][tidy] = (xoff < width && (yoff - t) >= start && + yoff - t < end) ? dy[(yoff - t) * width + xoff] : 0.0; __syncthreads(); real val = sh_x[tidy][tidx] * sh_dy[tidy][tidx]; @@ -239,7 +244,7 @@ __global__ void KeRowConvBwData(real* dx, const real* w, const real* dy, for (int i = tidy; i < context; i += blky) { sw[i][tidx] = gidx + tidx < width ? w[i*width + gidx + tidx] : 0.0; } - + __syncthreads(); for (int i = 0; i < numSeq; ++i) { @@ -312,7 +317,7 @@ void RowConvGrad(const GpuMatrix& outG, dim3 dimBlock(32, 32); dim3 dimGrid(DIVUP(width, dimBlock.x), 1); real* dw = filterG.getData(); - if (contextLength <= 32) { + if (contextLength <= 32) { KeRowConvBwWeight<32, 32, 32> <<>> (dw, x, dy, starts, height, width, numSeq, contextLength); diff --git a/paddle/function/nnpack/NNPACKConvOp.cpp b/paddle/function/nnpack/NNPACKConvOp.cpp index e8080c3d714b324f072a380f738b9764477dfe04..f0ec77a5d00333993427fb8d0bc938c884e50c95 100644 --- a/paddle/function/nnpack/NNPACKConvOp.cpp +++ b/paddle/function/nnpack/NNPACKConvOp.cpp @@ -16,7 +16,7 @@ limitations under the License. */ #include "paddle/function/ConvOp.h" DEFINE_bool(nnpack_allocate_outside, - false, + true, "Allocate and free workspace memory outside the NNPACK interface."); DEFINE_int32(nnpack_num_threads, 0, @@ -58,18 +58,10 @@ public: workspaceBuffer_ = nullptr; workspaceSize_ = 0; - threadpool_ = nullptr; - if (FLAGS_nnpack_num_threads) { - threadpool_ = pthreadpool_create(FLAGS_nnpack_num_threads); - VLOG(3) << "Number of threads " - << pthreadpool_get_threads_count(threadpool_); - } + create_nnpack_threadpool(); } ~NNPACKConvFunction() { - if (threadpool_) { - pthreadpool_destroy(threadpool_); - } if (workspaceBuffer_) { free(workspaceBuffer_); } @@ -225,14 +217,25 @@ public: } } + static void create_nnpack_threadpool() { + if (FLAGS_nnpack_num_threads && threadpool_ == nullptr) { + threadpool_ = pthreadpool_create(FLAGS_nnpack_num_threads); + VLOG(3) << "Number of threads " + << pthreadpool_get_threads_count(threadpool_); + } + } + private: nnp_convolution_algorithm algorithm_; nnp_convolution_transform_strategy transform_strategy_; void* workspaceBuffer_; size_t workspaceSize_; - pthreadpool_t threadpool_; + static pthreadpool_t threadpool_; }; +template +pthreadpool_t NNPACKConvFunction::threadpool_ = nullptr; + REGISTER_TYPED_FUNC(NNPACKConv, CPU, NNPACKConvFunction); } // namespace paddle diff --git a/paddle/gserver/dataproviders/DataProvider.h b/paddle/gserver/dataproviders/DataProvider.h index 40036762179ebb1495b90907f16b97e3c60c50d8..265dbb54933540ff8b0d1e2e2d985b4b7fa51ecd 100644 --- a/paddle/gserver/dataproviders/DataProvider.h +++ b/paddle/gserver/dataproviders/DataProvider.h @@ -205,10 +205,8 @@ public: hl_destroy_event(hlEvent_); hlEvent_ = NULL; } - if (batchData_) { - delete batchData_; - batchData_ = NULL; - } + delete batchData_; + batchData_ = NULL; } void setDataBatch(DataBatch* batchData) { batchData_ = batchData; } diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.cpp 
b/paddle/gserver/gradientmachines/NeuralNetwork.cpp index 2e839f640503b8f4e390fc87d9d59960dbc37f6e..cfa80a89365af5111746eec9599d16e37532a9f7 100644 --- a/paddle/gserver/gradientmachines/NeuralNetwork.cpp +++ b/paddle/gserver/gradientmachines/NeuralNetwork.cpp @@ -403,7 +403,7 @@ public: : layerName_(layerName) { addEvaluator(std::move(evaluator)); } - virtual void eval(const NeuralNetwork& nn) override { + void eval(const NeuralNetwork& nn) override { const LayerPtr& layer = nn.getLayer(layerName_); CHECK(layer) << "Nonexisted layer: " << layerName_ << " in submodel " << nn.getName(); diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp index 9a972466d66ba1417b2c31e66dc375b3da229aa8..9ddd449de7500f5682d59469328f06971c6e83bf 100644 --- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp +++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp @@ -636,7 +636,7 @@ void lenToStarts(std::vector& starts) { } starts.back() = pos; } -} +} // namespace void RecurrentGradientMachine::calcSequenceStartPositions() { std::vector starts(commonSeqInfo_.size() + 1); diff --git a/paddle/gserver/layers/AgentLayer.cpp b/paddle/gserver/layers/AgentLayer.cpp index 15e7411b5fde0fa3a532394cf7d0e8477ef052d0..bdae7e623ae0472d4fe5ef3a88fc1e93bbf1e52c 100644 --- a/paddle/gserver/layers/AgentLayer.cpp +++ b/paddle/gserver/layers/AgentLayer.cpp @@ -124,7 +124,7 @@ void copyElements(const IVector& srcVec, dest[index[i]] = src[i]; } } -} +} // namespace void GatherAgentLayer::forwardIds(PassType passType) { IVectorPtr realId = realLayers_[0]->getOutputLabel(); diff --git a/paddle/math/Storage.cpp b/paddle/math/Storage.cpp index 7ce17a3207becb176a852a16fca52376009db9ee..4adaaef9838f0d178468af3af142031325bfc11d 100644 --- a/paddle/math/Storage.cpp +++ b/paddle/math/Storage.cpp @@ -32,9 +32,7 @@ static InitFunction __init_storage_engine([]() { StorageEngine::singleton(); }, StorageEngine::StorageEngine() : cpuAllocator_(nullptr) {} StorageEngine::~StorageEngine() { - if (cpuAllocator_) { - delete cpuAllocator_; - } + delete cpuAllocator_; for (auto it : gpuAllocator_) { delete it; } diff --git a/paddle/memory/CMakeLists.txt b/paddle/memory/CMakeLists.txt index 3943c3cfad31d13a00645aba6fc153d3d13da987..fac442cca56b81f56a750bd3b1c2c0911e79e468 100644 --- a/paddle/memory/CMakeLists.txt +++ b/paddle/memory/CMakeLists.txt @@ -1 +1,11 @@ add_subdirectory(detail) + +cc_library(memory SRCS memory.cc) + +cc_library(paddle_memory + DEPS + memory meta_data + meta_cache memory_block + buddy_allocator system_allocator) + +cc_test(memory_test SRCS memory_test.cc DEPS place paddle_memory) diff --git a/paddle/memory/detail/CMakeLists.txt b/paddle/memory/detail/CMakeLists.txt index 72d3749ad789eca9a4b10944131171c0cf8dfe5a..b9c3fc31c1523abf3acbd116745bbf1596454aac 100644 --- a/paddle/memory/detail/CMakeLists.txt +++ b/paddle/memory/detail/CMakeLists.txt @@ -1,7 +1,15 @@ if(${WITH_GPU}) - nv_library(system_allocator SRCS system_allocator.cc DEPS gflags) - nv_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator gflags) + nv_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info gpu_info) else(${WITH_GPU}) - cc_library(system_allocator SRCS system_allocator.cc DEPS gflags) - cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator gflags) + cc_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info) endif(${WITH_GPU}) + +cc_test(system_allocator_test SRCS 
system_allocator_test.cc DEPS system_allocator) + +cc_library(meta_data SRCS meta_data.cc) + +cc_library(meta_cache SRCS meta_cache.cc) + +cc_library(memory_block SRCS memory_block.cc) + +cc_library(buddy_allocator SRCS buddy_allocator.cc DEPS glog) diff --git a/paddle/memory/detail/buddy_allocator.cc b/paddle/memory/detail/buddy_allocator.cc index ebe680f5eea4948339fb8c5584a5b9f5d71c752e..27c1b4033b53b059d38ed88694b20b429cbb4cce 100644 --- a/paddle/memory/detail/buddy_allocator.cc +++ b/paddle/memory/detail/buddy_allocator.cc @@ -12,22 +12,317 @@ See the License for the specific language governing permissions and limitations under the License. */ -#pragma once - #include "paddle/memory/detail/buddy_allocator.h" +#include "glog/logging.h" namespace paddle { namespace memory { namespace detail { -BuddyAllocator::BuddyAllocator(size_t pool_size, size_t max_pools, - SystemAllocator* system_allocator) - : pool_size_(pool_size), - max_pools_(max_pools), - system_allocator_(system_allocator) { - PADDLE_ASSERT(pool_size > 0); - PADDLE_ASSERT(max_pools > 0); - PADDLE_ASSERT(system_allocator != nullptr); +BuddyAllocator::BuddyAllocator(SystemAllocator* system_allocator, + size_t min_chunk_size, size_t max_chunk_size) + : min_chunk_size_(min_chunk_size), + max_chunk_size_(max_chunk_size), + cache_(system_allocator->UseGpu()), + system_allocator_(std::move(system_allocator)) {} + +BuddyAllocator::~BuddyAllocator() { + DLOG(INFO) << "BuddyAllocator destructor makes sure that all of these " + "have actually been freed"; + while (!pool_.empty()) { + auto block = static_cast(std::get<2>(*pool_.begin())); + DLOG(INFO) << "Free from block (" << block << ", " << max_chunk_size_ + << ")"; + + system_allocator_->Free(block, max_chunk_size_, block->index(cache_)); + cache_.invalidate(block); + pool_.erase(pool_.begin()); + } +} + +inline size_t align(size_t size, size_t alignment) { + size_t remaining = size % alignment; + return remaining == 0 ?
size : size + (alignment - remaining); +} + +void* BuddyAllocator::Alloc(size_t unaligned_size) { + // adjust allocation alignment + size_t size = align(unaligned_size + sizeof(Metadata), min_chunk_size_); + + // acquire the allocator lock + std::lock_guard lock(mutex_); + + DLOG(INFO) << "Allocate " << unaligned_size << " bytes from chunk size " + << size; + + // if the allocation is huge, send directly to the system allocator + if (size > max_chunk_size_) { + DLOG(INFO) << "Allocate from system allocator."; + return SystemAlloc(size); + } + + // query and allocate from the existing chunk + auto it = FindExistChunk(size); + + // refill the pool if failure + if (it == pool_.end()) { + it = RefillPool(); + // if still failure, fail fatally + if (it == pool_.end()) { + return nullptr; + } + } else { + DLOG(INFO) << "Allocation from existing memory block " << std::get<2>(*it) + << " at address " + << reinterpret_cast(std::get<2>(*it))->data(); + } + + total_used_ += size; + total_free_ -= size; + + // split the allocation and return data for use + return reinterpret_cast(SplitToAlloc(it, size))->data(); +} + +void BuddyAllocator::Free(void* p) { + // Point back to metadata + auto block = static_cast(p)->metadata(); + + // Acquire the allocator lock + std::lock_guard lock(mutex_); + + DLOG(INFO) << "Free from address " << block; + + if (block->type(cache_) == MemoryBlock::HUGE_CHUNK) { + DLOG(INFO) << "Free directly from system allocator"; + system_allocator_->Free(block, block->total_size(cache_), + block->index(cache_)); + + // Invalidate GPU allocation from cache + cache_.invalidate(block); + + return; + } + + block->mark_as_free(cache_); + + total_used_ -= block->total_size(cache_); + total_free_ += block->total_size(cache_); + + // Trying to merge the right buddy + if (block->has_right_buddy(cache_)) { + DLOG(INFO) << "Merging this block " << block << " with its right buddy " + << block->right_buddy(cache_); + + auto right_buddy = block->right_buddy(cache_); + + if (right_buddy->type(cache_) == MemoryBlock::FREE_CHUNK) { + // Take away right buddy from pool + pool_.erase(IndexSizeAddress(right_buddy->index(cache_), + right_buddy->total_size(cache_), + right_buddy)); + + // merge its right buddy to the block + block->merge(cache_, right_buddy); + } + } + + // Trying to merge the left buddy + if (block->has_left_buddy(cache_)) { + DLOG(INFO) << "Merging this block " << block << " with its left buddy " + << block->left_buddy(cache_); + + auto left_buddy = block->left_buddy(cache_); + + if (left_buddy->type(cache_) == MemoryBlock::FREE_CHUNK) { + // Take away right buddy from pool + pool_.erase(IndexSizeAddress(left_buddy->index(cache_), + left_buddy->total_size(cache_), left_buddy)); + + // merge the block to its left buddy + left_buddy->merge(cache_, block); + block = left_buddy; + } + } + + // Dumping this block into pool + DLOG(INFO) << "Inserting free block (" << block << ", " + << block->total_size(cache_) << ")"; + pool_.insert( + IndexSizeAddress(block->index(cache_), block->total_size(cache_), block)); + + // Clean up if existing too much free memory + + // Prefer freeing fallback allocation first + CleanIdleFallBackAlloc(); + + // Free normal allocation + CleanIdleNormalAlloc(); +} + +size_t BuddyAllocator::Used() { return total_used_; } + +void* BuddyAllocator::SystemAlloc(size_t size) { + size_t index = 0; + void* p = system_allocator_->Alloc(index, size); + + DLOG(INFO) << "Allocated " << p << " from system allocator."; + + if (p == nullptr) return nullptr; + + 
static_cast(p)->init(cache_, MemoryBlock::HUGE_CHUNK, index, + size, nullptr, nullptr); + + return static_cast(p)->data(); +} + +BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() { +#ifndef PADDLE_ONLY_CPU + if (system_allocator_->UseGpu()) { + if ((total_used_ + total_free_) == 0) { + // Compute the maximum allocation size for the first allocation. + max_chunk_size_ = platform::GpuMaxChunkSize(); + } + } +#endif // PADDLE_ONLY_CPU + + // Allocate a new maximum sized block + size_t index = 0; + void* p = system_allocator_->Alloc(index, max_chunk_size_); + + if (p == nullptr) return pool_.end(); + + DLOG(INFO) << "Creating and inserting new block " << p + << " from system allocator"; + + static_cast(p)->init(cache_, MemoryBlock::FREE_CHUNK, index, + max_chunk_size_, nullptr, nullptr); + + // gpu fallback allocation + if (system_allocator_->UseGpu() && + static_cast(p)->index(cache_) == 1) { + fallback_alloc_count_++; + } + + total_free_ += max_chunk_size_; + + // dump the block into pool + return pool_.insert(IndexSizeAddress(index, max_chunk_size_, p)).first; +} + +BuddyAllocator::PoolSet::iterator BuddyAllocator::FindExistChunk(size_t size) { + size_t index = 0; + + while (1) { + auto it = pool_.lower_bound(IndexSizeAddress(index, size, nullptr)); + + // no matching chunk in the pool + if (it == pool_.end()) return it; + + if (std::get<0>(*it) > index) { + // find suitable one + if (std::get<1>(*it) >= size) { + return it; + } + // update and continue + index = std::get<0>(*it); + continue; + } + return it; + } +} + +void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it, + size_t size) { + auto block = static_cast(std::get<2>(*it)); + pool_.erase(it); + + DLOG(INFO) << "Split block (" << block << ", " << block->total_size(cache_) + << ") into"; + block->split(cache_, size); + + DLOG(INFO) << "Left block (" << block << ", " << block->total_size(cache_) + << ")"; + block->set_type(cache_, MemoryBlock::ARENA_CHUNK); + + // insert the remaining memory back into the pool, if any + if (block->has_right_buddy(cache_)) { + if (block->right_buddy(cache_)->type(cache_) == MemoryBlock::FREE_CHUNK) { + DLOG(INFO) << "Insert right block (" << block->right_buddy(cache_) << ", " + << block->right_buddy(cache_)->total_size(cache_) << ")"; + + pool_.insert( + IndexSizeAddress(block->right_buddy(cache_)->index(cache_), + block->right_buddy(cache_)->total_size(cache_), + block->right_buddy(cache_))); + } + } + + return block; +} + +void BuddyAllocator::CleanIdleFallBackAlloc() { + // If fallback allocation does not exist, return directly + if (!fallback_alloc_count_) return; + + for (auto pool = pool_.rbegin(); pool != pool_.rend();) { + // If free memory block less than max_chunk_size_, return directly + if (std::get<1>(*pool) < max_chunk_size_) return; + + MemoryBlock* block = static_cast(std::get<2>(*pool)); + + // If no GPU fallback allocator, return + if (!system_allocator_->UseGpu() || block->index(cache_) == 0) { + return; + } + + DLOG(INFO) << "Return block " << block << " to fallback allocator."; + + system_allocator_->Free(block, max_chunk_size_, block->index(cache_)); + cache_.invalidate(block); + + pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base())); + + total_free_ -= max_chunk_size_; + fallback_alloc_count_--; + + // If no fallback allocation exists, return directly + if (!fallback_alloc_count_) return; + } +} + +void BuddyAllocator::CleanIdleNormalAlloc() { + auto shall_free_alloc = [&]() -> bool { + // free all fallback allocations + if (fallback_alloc_count_ > 0) { + return true;
+ } + // keep 2x overhead if we haven't fallen back + if ((total_used_ + max_chunk_size_) * 2 < total_free_) { + return true; + } + return false; + }; + + if (!shall_free_alloc()) return; + + for (auto pool = pool_.rbegin(); pool != pool_.rend();) { + // If free memory block less than max_chunk_size_, return directly + if (std::get<1>(*pool) < max_chunk_size_) return; + + MemoryBlock* block = static_cast(std::get<2>(*pool)); + + DLOG(INFO) << "Return block " << block << " to base allocator."; + + system_allocator_->Free(block, max_chunk_size_, block->index(cache_)); + cache_.invalidate(block); + + pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base())); + + total_free_ -= max_chunk_size_; + + if (!shall_free_alloc()) return; + } } } // namespace detail diff --git a/paddle/memory/detail/buddy_allocator.h b/paddle/memory/detail/buddy_allocator.h index 82e6aaedc719966b4074449ce1ef7193c73dc265..4fa3fb0ee5f826d2b084c0ba184c505aee3acc48 100644 --- a/paddle/memory/detail/buddy_allocator.h +++ b/paddle/memory/detail/buddy_allocator.h @@ -14,9 +14,16 @@ #pragma once +#include "paddle/memory/detail/meta_cache.h" +#include "paddle/memory/detail/meta_data.h" #include "paddle/memory/detail/system_allocator.h" +#include "paddle/platform/assert.h" +#include "paddle/platform/cpu_info.h" +#include "paddle/platform/gpu_info.h" #include +#include +#include #include namespace paddle { @@ -25,61 +32,80 @@ namespace detail { class BuddyAllocator { public: - BuddyAllocator(size_t pool_size, size_t max_pools, - SystemAllocator* system_allocator); + BuddyAllocator(SystemAllocator* system_allocator, size_t min_chunk_size, + size_t max_chunk_size); + ~BuddyAllocator(); - void* Alloc(size_t size); + public: + void* Alloc(size_t unaligned_size); void Free(void*); size_t Used(); + public: + // Disable copy and assignment + BuddyAllocator(const BuddyAllocator&) = delete; + BuddyAllocator& operator=(const BuddyAllocator&) = delete; + private: - struct Block { - size_t size_; - Block* left_; // left buddy - Block* right_; // right buddy - }; + // Tuple (allocator index, memory size, memory address) + using IndexSizeAddress = std::tuple; + // Each element in PoolSet is a free allocation + using PoolSet = std::set; - // Initially, there is only one pool. If a Alloc founds not enough - // memory from that pool, and there has not been max_num_pools_, - // create a new pool by calling system_allocator_.Alloc(pool_size_). - std::vector pools_; + /*! \brief Allocate fixed-size memory from system */ + void* SystemAlloc(size_t size); - size_t pool_size_; // the size of each pool; - size_t max_num_pools_; // the size of all pools; + /*! \brief If existing chunks are not suitable, refill pool */ + PoolSet::iterator RefillPool(); - SystemAllocator* system_allocator_; + /** + * \brief Find the suitable chunk from existing pool and split + * it to left and right buddies + * + * \param it the iterator of pool list + * \param size the size of allocation + * + * \return the left buddy address + */ + void* SplitToAlloc(PoolSet::iterator it, size_t size); - std::mutex mutex_; + /*! \brief Find the existing chunk which used to allocation */ + PoolSet::iterator FindExistChunk(size_t size); - // Disable copy and assignment. - BuddyAllocator(const BuddyAllocator&) = delete; - BuddyAllocator& operator=(const BuddyAllocator&) = delete; -}; + /*! \brief Clean idle fallback allocation */ + void CleanIdleFallBackAlloc(); + + /*! 
\brief Clean idle normal allocation */ + void CleanIdleNormalAlloc(); -BuddyAllocator* GetCPUBuddyAllocator() { - static BuddyAllocator* a = nullptr; - if (a == nullptr) { - a = new BuddyAllocator(); - } - return a; -} - -#ifndef PADDLE_ONLY_CPU // The following code are for CUDA. - -BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { - static BuddyAllocator** as = NULL; - if (as == NULL) { - int gpu_num = platform::GetDeviceCount(); - as = new BuddyAllocator*[gpu_num]; - for (int gpu = 0; gpu < gpu_num; gpu++) { - as[gpu] = new BuddyAllocator(); - } - } - return as[gpu_id]; -} - -#endif // PADDLE_ONLY_CPU + private: + size_t total_used_ = 0; // the total size of used memory + size_t total_free_ = 0; // the total size of free memory + + size_t min_chunk_size_; // the minimum size of each chunk + size_t max_chunk_size_; // the maximum size of each chunk + + private: + /** + * \brief A list of free allocation + * + * \note Only store free chunk memory in pool + */ + PoolSet pool_; + + /*! Record fallback allocation count for auto-scaling */ + size_t fallback_alloc_count_ = 0; + + private: + /*! Unify the metadata format between GPU and CPU allocations */ + MetadataCache cache_; + + private: + /*! Allocate CPU/GPU memory from system */ + SystemAllocator* system_allocator_; + std::mutex mutex_; +}; } // namespace detail } // namespace memory diff --git a/paddle/memory/detail/memory_block.cc b/paddle/memory/detail/memory_block.cc new file mode 100644 index 0000000000000000000000000000000000000000..fc40993208323f1f5d18103165c8835b5f829613 --- /dev/null +++ b/paddle/memory/detail/memory_block.cc @@ -0,0 +1,157 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include "paddle/memory/detail/memory_block.h" +#include "paddle/memory/detail/meta_cache.h" +#include "paddle/memory/detail/meta_data.h" +#include "paddle/platform/assert.h" + +namespace paddle { +namespace memory { +namespace detail { + +void MemoryBlock::init(MetadataCache& cache, Type t, size_t index, size_t size, + void* left_buddy, void* right_buddy) { + cache.store(this, Metadata(t, index, size - sizeof(Metadata), size, + static_cast(left_buddy), + static_cast(right_buddy))); +} + +MemoryBlock::Type MemoryBlock::type(MetadataCache& cache) const { + return cache.load(this).type; +} + +size_t MemoryBlock::size(MetadataCache& cache) const { + return cache.load(this).size; +} + +size_t MemoryBlock::total_size(MetadataCache& cache) const { + return cache.load(this).total_size; +} + +MemoryBlock* MemoryBlock::left_buddy(MetadataCache& cache) const { + return cache.load(this).left_buddy; +} + +MemoryBlock* MemoryBlock::right_buddy(MetadataCache& cache) const { + return cache.load(this).right_buddy; +} + +void MemoryBlock::split(MetadataCache& cache, size_t size) { + // make sure the split fits + PADDLE_ASSERT(total_size(cache) >= size); + + // bail out if there is no room for another partition + if (total_size(cache) - size <= sizeof(Metadata)) { + return; + } + + // find the position of the split + void* right_partition = reinterpret_cast(this) + size; + + size_t remaining_size = total_size(cache) - size; + + // Add the new block as a buddy + auto metadata = cache.load(this); + + // Write the metadata for the new block + auto new_block_right_buddy = metadata.right_buddy; + + cache.store( + static_cast(right_partition), + Metadata(FREE_CHUNK, index(cache), remaining_size - sizeof(Metadata), + remaining_size, this, new_block_right_buddy)); + + metadata.right_buddy = static_cast(right_partition); + metadata.size = size - sizeof(Metadata); + metadata.total_size = size; + + cache.store(this, metadata); + + // Write metadata for the new block's right buddy + if (new_block_right_buddy != nullptr) { + auto buddy_metadata = cache.load(new_block_right_buddy); + + buddy_metadata.left_buddy = static_cast(right_partition); + + cache.store(new_block_right_buddy, buddy_metadata); + } +} + +void MemoryBlock::merge(MetadataCache& cache, MemoryBlock* right_buddy) { + // only free blocks can be merged + PADDLE_ASSERT(type(cache) == FREE_CHUNK); + PADDLE_ASSERT(right_buddy->type(cache) == FREE_CHUNK); + + auto metadata = cache.load(this); + + // link this->buddy's buddy + metadata.right_buddy = right_buddy->right_buddy(cache); + + // link buddy's buddy -> this + if (metadata.right_buddy != nullptr) { + auto buddy_metadata = cache.load(metadata.right_buddy); + + buddy_metadata.left_buddy = this; + + cache.store(metadata.right_buddy, buddy_metadata); + } + + metadata.size += right_buddy->total_size(cache); + metadata.total_size += right_buddy->total_size(cache); + + cache.store(this, metadata); + cache.store(right_buddy, Metadata(INVALID_CHUNK, 0, 0, 0, nullptr, nullptr)); +} + +void MemoryBlock::mark_as_free(MetadataCache& cache) { + // check for double free or corruption + PADDLE_ASSERT(type(cache) != FREE_CHUNK); + PADDLE_ASSERT(type(cache) != INVALID_CHUNK); + + set_type(cache, FREE_CHUNK); +} + +void MemoryBlock::set_type(MetadataCache& cache, Type t) { + auto metadata = cache.load(this); + + metadata.type = t; + + cache.store(this, metadata); +} + +bool MemoryBlock::has_left_buddy(MetadataCache& cache) const { + return left_buddy(cache) != nullptr; +} + +bool 
MemoryBlock::has_right_buddy(MetadataCache& cache) const { + return right_buddy(cache) != nullptr; +} + +size_t MemoryBlock::index(MetadataCache& cache) const { + return cache.load(this).index; +} + +void* MemoryBlock::data() const { + return const_cast(reinterpret_cast(this)) + 1; +} + +MemoryBlock* MemoryBlock::metadata() const { + return const_cast(reinterpret_cast( + reinterpret_cast(this) - 1)); +} + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/memory/detail/memory_block.h b/paddle/memory/detail/memory_block.h new file mode 100644 index 0000000000000000000000000000000000000000..a5168b519f3a3747f34ef2ea7b87d72dce70064d --- /dev/null +++ b/paddle/memory/detail/memory_block.h @@ -0,0 +1,91 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once + +#include + +namespace paddle { +namespace memory { +namespace detail { + +// Forward Declarations +class MetadataCache; + +/*! \brief A class used to interpret the contents of a memory block */ +class MemoryBlock { + public: + enum Type { + FREE_CHUNK, // memory is free and idle + ARENA_CHUNK, // memory is being occupied + HUGE_CHUNK, // memory is out of management + INVALID_CHUNK // memory is invalid + }; + + public: + void init(MetadataCache& cache, Type t, size_t index, size_t size, + void* left_buddy, void* right_buddy); + + public: + /*! \brief The type of the allocation */ + Type type(MetadataCache& cache) const; + + /*! \brief The size of the data region */ + size_t size(MetadataCache& cache) const; + + /*! \brief An index to track the allocator */ + size_t index(MetadataCache& cache) const; + + /*! \brief The total size of the block */ + size_t total_size(MetadataCache& cache) const; + + /*! \brief Check the left buddy of the block */ + bool has_left_buddy(MetadataCache& cache) const; + + /*! \brief Check the right buddy of the block */ + bool has_right_buddy(MetadataCache& cache) const; + + /*! \brief Get the left buddy */ + MemoryBlock* left_buddy(MetadataCache& cache) const; + + /*! \brief Get the right buddy */ + MemoryBlock* right_buddy(MetadataCache& cache) const; + + public: + /*! \brief Split the allocation into left/right blocks */ + void split(MetadataCache& cache, size_t size); + + /*! \brief Merge left and right blocks together */ + void merge(MetadataCache& cache, MemoryBlock* right_buddy); + + /*! \brief Mark the allocation as free */ + void mark_as_free(MetadataCache& cache); + + /*! \brief Change the type of the allocation */ + void set_type(MetadataCache& cache, Type t); + + public: + /*! \brief Get a pointer to the memory block's data */ + void* data() const; + + /*! 
\brief Get a pointer to the memory block's metadata */ + MemoryBlock* metadata() const; + + public: + static size_t overhead(); +}; + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/memory/detail/meta_cache.cc b/paddle/memory/detail/meta_cache.cc new file mode 100644 index 0000000000000000000000000000000000000000..30ff80e7bac0b595fe60aeab0a3c59f4e23eae2d --- /dev/null +++ b/paddle/memory/detail/meta_cache.cc @@ -0,0 +1,57 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/memory/detail/meta_cache.h" +#include "paddle/memory/detail/memory_block.h" +#include "paddle/platform/assert.h" + +namespace paddle { +namespace memory { +namespace detail { + +MetadataCache::MetadataCache(bool uses_gpu) : uses_gpu_(uses_gpu) {} + +Metadata MetadataCache::load(const MemoryBlock* block) { + if (uses_gpu_) { + auto existing_metadata = cache_.find(block); + PADDLE_ASSERT(existing_metadata->second.check_guards()); + return existing_metadata->second; + } else { + PADDLE_ASSERT(reinterpret_cast(block)->check_guards()); + return *reinterpret_cast(block); + } +} + +void MetadataCache::store(MemoryBlock* block, + const Metadata& original_metadata) { + auto metadata = original_metadata; + + metadata.update_guards(); + + if (uses_gpu_) { + cache_[block] = metadata; + } else { + *reinterpret_cast(block) = metadata; + } +} + +void MetadataCache::invalidate(MemoryBlock* block) { + if (uses_gpu_) { + cache_.erase(block); + } +} + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/memory/detail/meta_cache.h b/paddle/memory/detail/meta_cache.h new file mode 100644 index 0000000000000000000000000000000000000000..ca0789779e273fb71c3d6282c0a921cda2d776cc --- /dev/null +++ b/paddle/memory/detail/meta_cache.h @@ -0,0 +1,64 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once + +#include "paddle/memory/detail/memory_block.h" +#include "paddle/memory/detail/meta_data.h" + +#include + +namespace paddle { +namespace memory { +namespace detail { + +/** + * \brief A cache for accessing memory block meta-data that may be expensive + * to access directly. + * + * \note This class exists to unify the metadata format between GPU and CPU + * allocations. It should be removed when the CPU can access all GPU + * allocations directly via UVM. 
+ */ +class MetadataCache { + public: + MetadataCache(bool uses_gpu); + + public: + /*! \brief Load the associated metadata for the specified memory block. */ + Metadata load(const MemoryBlock*); + + /*! \brief Store the associated metadata for the specified memory block. */ + void store(MemoryBlock*, const Metadata&); + + /*! \brief Indicate that the specified metadata will no longer be used. */ + void invalidate(MemoryBlock*); + + public: + MetadataCache(const MetadataCache&) = delete; + MetadataCache& operator=(const MetadataCache&) = delete; + + private: + bool uses_gpu_; + + private: + typedef std::unordered_map MetadataMap; + + private: + MetadataMap cache_; +}; + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/memory/detail/meta_data.cc b/paddle/memory/detail/meta_data.cc new file mode 100644 index 0000000000000000000000000000000000000000..70c5c1f439e84ec33cf0507beae33f9cdfa51727 --- /dev/null +++ b/paddle/memory/detail/meta_data.cc @@ -0,0 +1,70 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/memory/detail/meta_data.h" + +#include + +namespace paddle { +namespace memory { +namespace detail { + +Metadata::Metadata(MemoryBlock::Type t, size_t i, size_t s, size_t ts, + MemoryBlock* l, MemoryBlock* r) + : type(t), + index(i), + size(s), + total_size(ts), + left_buddy(l), + right_buddy(r) {} + +Metadata::Metadata() + : type(MemoryBlock::INVALID_CHUNK), + index(0), + size(0), + total_size(0), + left_buddy(nullptr), + right_buddy(nullptr) {} + +template +inline void hash_combine(std::size_t& seed, const T& v) { + std::hash hasher; + seed ^= hasher(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2); +} + +inline size_t hash(const Metadata* metadata, size_t initial_seed) { + size_t seed = initial_seed; + + hash_combine(seed, (size_t)metadata->type); + hash_combine(seed, metadata->index); + hash_combine(seed, metadata->size); + hash_combine(seed, metadata->total_size); + hash_combine(seed, metadata->left_buddy); + hash_combine(seed, metadata->right_buddy); + + return seed; +} + +void Metadata::update_guards() { + guard_begin = hash(this, 1); + guard_end = hash(this, 2); +} + +bool Metadata::check_guards() const { + return guard_begin == hash(this, 1) && guard_end == hash(this, 2); +} + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/memory/detail/meta_data.h b/paddle/memory/detail/meta_data.h new file mode 100644 index 0000000000000000000000000000000000000000..628cf1f2e347e288d1bf34c14c7b2f13a28d3662 --- /dev/null +++ b/paddle/memory/detail/meta_data.h @@ -0,0 +1,54 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once + +#include "paddle/memory/detail/memory_block.h" + +#include + +namespace paddle { +namespace memory { +namespace detail { + +class Metadata { + public: + Metadata(MemoryBlock::Type t, size_t i, size_t s, size_t ts, MemoryBlock* l, + MemoryBlock* r); + Metadata(); + + public: + /*! \brief Update the guards when metadata is changed */ + void update_guards(); + + /*! \brief Check consistency to previous modification */ + bool check_guards() const; + + public: + // TODO(gangliao): compress this + // clang-format off + size_t guard_begin = 0; + MemoryBlock::Type type = MemoryBlock::INVALID_CHUNK; + size_t index = 0; + size_t size = 0; + size_t total_size = 0; + MemoryBlock* left_buddy = nullptr; + MemoryBlock* right_buddy = nullptr; + size_t guard_end = 0; + // clang-format on +}; + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/memory/detail/system_allocator.cc b/paddle/memory/detail/system_allocator.cc index 50bec926f83dee8a4343d0b16aeb088f9d2a4871..1579174b1a6ff08824629d833d01411cff651f48 100644 --- a/paddle/memory/detail/system_allocator.cc +++ b/paddle/memory/detail/system_allocator.cc @@ -13,76 +13,128 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/memory/detail/system_allocator.h" +#include "paddle/platform/assert.h" +#include "paddle/platform/error.h" +#include "paddle/platform/gpu_info.h" #include // for malloc and free #include // for mlock and munlock #include "gflags/gflags.h" -#include "paddle/platform/assert.h" -#include "paddle/platform/cuda.h" // If use_pinned_memory is true, CPUAllocator calls mlock, which // returns pinned and locked memory as staging areas for data exchange // between host and device. Allocates too much would reduce the amount // of memory available to the system for paging. So, by default, we // should set false to use_pinned_memory. -DEFINE_bool(use_pinned_memory, false, - "If set, allocate cpu/gpu pinned memory."); +DEFINE_bool(use_pinned_memory, false, "If set, allocate cpu pinned memory."); namespace paddle { namespace memory { namespace detail { -void* CPUAllocator::Alloc(size_t size) { +void* CPUAllocator::Alloc(size_t& index, size_t size) { // According to http://www.cplusplus.com/reference/cstdlib/malloc/, // malloc might not return nullptr if size is zero, but the returned // pointer shall not be dereferenced -- so we make it nullptr. 
if (size <= 0) return nullptr; + index = 0; // unlock memory + void* p = malloc(size); - if (p != nullptr && FLAGS_use_pinned_memory) { - mlock(p, size); + + if (p != nullptr) { + if (FLAGS_use_pinned_memory) { + index = 1; + mlock(p, size); // lock memory + } } + return p; } -void CPUAllocator::Free(void* p, size_t size) { - if (p != nullptr && FLAGS_use_pinned_memory) { +void CPUAllocator::Free(void* p, size_t size, size_t index) { + if (p != nullptr && index == 1) { munlock(p, size); } free(p); } +bool CPUAllocator::UseGpu() const { return false; } + #ifndef PADDLE_ONLY_CPU -void* GPUAllocator::Alloc(size_t size) { +void* GPUAllocator::Alloc(size_t& index, size_t size) { // CUDA documentation doesn't explain if cudaMalloc returns nullptr // if size is 0. We just make sure it does. - if (size <= 0) { - return nullptr; - } + if (size <= 0) return nullptr; + size_t available = 0; + size_t capacity = 0; + paddle::platform::GpuMemoryUsage(available, capacity); + + // Reserve memory for page tables, etc. + size_t reserving = capacity - paddle::platform::GpuMaxAllocSize(); + size_t usable = available > reserving ? available - reserving : 0; + + // If remaining size no less than expected size, using general + // cudaMalloc to allocate GPU memory. void* p = 0; - cudaError_t result = - FLAGS_use_pinned_memory ? cudaMallocHost(&p, size) : cudaMalloc(&p, size); - if (result != cudaSuccess) { - cudaGetLastError(); // clear error if there is any. + if (size <= usable) { + cudaError_t result = cudaMalloc(&p, size); + if (result == cudaSuccess) { + index = 0; + gpu_alloc_size_ += size; + return p; + } + } + + // If remaining size less than expected size or cudaMalloc failed, + // cudaMallocHost will be considered as a fallback allocator. + // + // NOTE: here, we use GpuMaxAllocSize() as the maximum memory size + // of host fallback allocation. Allocates too much would reduce + // the amount of memory available to the underlying system for paging. + usable = paddle::platform::GpuMaxAllocSize() - fallback_alloc_size_; + + if (size > usable) return nullptr; + + cudaError_t result = cudaMallocHost(&p, size); + if (result == cudaSuccess) { + index = 1; + fallback_alloc_size_ += size; + return p; } - return result == cudaSuccess ? p : nullptr; + + return nullptr; } -void GPUAllocator::Free(void* p, size_t size) { +void GPUAllocator::Free(void* p, size_t size, size_t index) { + cudaError_t err; + + if (index == 0) { + PADDLE_ASSERT(gpu_alloc_size_ >= size); + gpu_alloc_size_ -= size; + err = cudaFree(p); + } else { + PADDLE_ASSERT(fallback_alloc_size_ >= size); + fallback_alloc_size_ -= size; + err = cudaFreeHost(p); + } + // Purposefully allow cudaErrorCudartUnloading, because // that is returned if you ever call cudaFree after the // driver has already shutdown. This happens only if the // process is terminating, in which case we don't care if // cudaFree succeeds. - cudaError_t err = FLAGS_use_pinned_memory ? 
cudaFreeHost(p) : cudaFree(p); if (err != cudaErrorCudartUnloading) { - platform::throw_on_error(err, "cudaFree{Host} failed"); + platform::throw_on_error(err, + "cudaFree{Host} failed in GPUAllocator::Free."); } } +bool GPUAllocator::UseGpu() const { return true; } + #endif // PADDLE_ONLY_CPU } // namespace detail diff --git a/paddle/memory/detail/system_allocator.h b/paddle/memory/detail/system_allocator.h index 184b383f7f78244fa6632a3bffb1a0a78b3aa664..82ba322e057575c460b1d51d719c9b0fa459273e 100644 --- a/paddle/memory/detail/system_allocator.h +++ b/paddle/memory/detail/system_allocator.h @@ -20,31 +20,36 @@ namespace paddle { namespace memory { namespace detail { -// SystemAllocator is the parent class of CPUAllocator and -// GPUAllocator. A BuddyAllocator object uses a SystemAllocator* -// pointing to the underlying system allocator. An alternative to -// this class hierarchy is to pass a system allocator class to -// BuddyAllocator as a template parameter. This approach makes -// BuddyAllocator a class template, and it's very complicated -// algorithm would make the buddy_allocator.h messy. +/** + * \brief SystemAllocator is the parent class of CPUAllocator and GPUAllocator. + * A BuddyAllocator object uses a SystemAllocator* pointing to the + * underlying system allocator. + */ class SystemAllocator { public: virtual ~SystemAllocator() {} - virtual void* Alloc(size_t size) = 0; - virtual void Free(void* p, size_t size) = 0; + virtual void* Alloc(size_t& index, size_t size) = 0; + virtual void Free(void* p, size_t size, size_t index) = 0; + virtual bool UseGpu() const = 0; }; class CPUAllocator : public SystemAllocator { public: - virtual void* Alloc(size_t size); - virtual void Free(void* p, size_t size); + virtual void* Alloc(size_t& index, size_t size); + virtual void Free(void* p, size_t size, size_t index); + virtual bool UseGpu() const; }; #ifndef PADDLE_ONLY_CPU class GPUAllocator : public SystemAllocator { public: - virtual void* Alloc(size_t size); - virtual void Free(void* p, size_t size); + virtual void* Alloc(size_t& index, size_t size); + virtual void Free(void* p, size_t size, size_t index); + virtual bool UseGpu() const; + + private: + size_t gpu_alloc_size_ = 0; + size_t fallback_alloc_size_ = 0; }; #endif // PADDLE_ONLY_CPU diff --git a/paddle/memory/detail/system_allocator_test.cc b/paddle/memory/detail/system_allocator_test.cc index 9bd5706a4e4d1546a8c879ebbac0f3349c9d59f6..ba44e06ddb68e92e4086a8006b868557b0c89b50 100644 --- a/paddle/memory/detail/system_allocator_test.cc +++ b/paddle/memory/detail/system_allocator_test.cc @@ -25,7 +25,8 @@ DECLARE_bool(use_pinned_memory); void TestAllocator(paddle::memory::detail::SystemAllocator& a, size_t size) { bool freed = false; { - void* p = a.Alloc(size); + size_t index; + void* p = a.Alloc(index, size); if (size > 0) { EXPECT_NE(p, nullptr); } else { @@ -35,7 +36,7 @@ void TestAllocator(paddle::memory::detail::SystemAllocator& a, size_t size) { int* i = static_cast(p); std::shared_ptr ptr(i, [&](void* p) { freed = true; - a.Free(p, size); + a.Free(p, size, index); }); } EXPECT_TRUE(freed); @@ -56,14 +57,7 @@ TEST(CPUAllocator, LockMem) { } #ifndef PADDLE_ONLY_CPU -TEST(GPUAllocator, NoStaging) { - FLAGS_use_pinned_memory = false; - paddle::memory::detail::GPUAllocator a; - TestAllocator(a, 2048); - TestAllocator(a, 0); -} -TEST(GPUAllocator, Staging) { - FLAGS_use_pinned_memory = true; +TEST(GPUAllocator, Alloc) { paddle::memory::detail::GPUAllocator a; TestAllocator(a, 2048); TestAllocator(a, 0); diff --git 
a/paddle/memory/memory.cc b/paddle/memory/memory.cc index 0d123d99e234a378ee64850eebacece223e2b121..df3d57d629184d28fd42130df9b020a7b52ade72 100644 --- a/paddle/memory/memory.cc +++ b/paddle/memory/memory.cc @@ -17,43 +17,67 @@ limitations under the License. */ #include "paddle/memory/detail/system_allocator.h" #include "paddle/platform/assert.h" -#include - namespace paddle { namespace memory { -void* Alloc(platform::Place pl, size_t size) { -#ifndef PADDLE_ONLY_CPU - if (paddle::platform::is_gpu_place(pl)) { - size_t gpu_id = boost::get(pl).device; - return detail::GetGPUBuddyAllocator(gpu_id)->Alloc(size); +detail::BuddyAllocator* GetCPUBuddyAllocator() { + static detail::BuddyAllocator* a = nullptr; + if (a == nullptr) { + a = new detail::BuddyAllocator(new detail::CPUAllocator, + platform::CpuMinChunkSize(), + platform::CpuMaxChunkSize()); } -#endif // PADDLE_ONLY_CPU - PADDLE_ASSERT(paddle::platform::is_cpu_place(pl)); - return detail::GetCPUBuddyAllocator()->Alloc(size); + return a; } -void Free(paddle::platform::Place pl, void* p) { -#ifndef PADDLE_ONLY_CPU - if (paddle::platform::is_gpu_place(pl)) { - size_t gpu_id = boost::get(pl).device; - detail::GetGPUBuddyAllocator(gpu_id)->Free(p); - } -#endif // PADDLE_ONLY_CPU - PADDLE_ASSERT(paddle::platform::is_cpu_place(pl)); - detail::GetCPUBuddyAllocator()->Free(p); +template <> +void* Alloc(platform::CPUPlace place, size_t size) { + return GetCPUBuddyAllocator()->Alloc(size); +} + +template <> +void Free(platform::CPUPlace place, void* p) { + GetCPUBuddyAllocator()->Free(p); +} + +template <> +size_t Used(platform::CPUPlace place) { + return GetCPUBuddyAllocator()->Used(); } -size_t Used(paddle::platform::Place pl) { #ifndef PADDLE_ONLY_CPU - if (paddle::platform::is_gpu_place(pl)) { - size_t gpu_id = boost::get(pl).device; - return detail::GetGPUBuddyAllocator(gpu_id)->Used(); + +detail::BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { + static detail::BuddyAllocator** as = NULL; + if (as == NULL) { + int gpu_num = platform::GetDeviceCount(); + as = new detail::BuddyAllocator*[gpu_num]; + for (int gpu = 0; gpu < gpu_num; gpu++) { + platform::SetDeviceId(gpu); + as[gpu] = new detail::BuddyAllocator(new detail::GPUAllocator, + platform::GpuMinChunkSize(), + platform::GpuMaxChunkSize()); + } } -#endif // PADDLE_ONLY_CPU - PADDLE_ASSERT(paddle::platform::is_cpu_place(pl)); - return detail::GetCPUBuddyAllocator()->Used(); + return as[gpu_id]; +} + +template <> +void* Alloc(platform::GPUPlace place, size_t size) { + return GetGPUBuddyAllocator(place.device)->Alloc(size); +} + +template <> +void Free(platform::GPUPlace place, void* p) { + GetGPUBuddyAllocator(place.device)->Free(p); +} + +template <> +size_t Used(platform::GPUPlace place) { + return GetGPUBuddyAllocator(place.device)->Used(); } +#endif // PADDLE_ONLY_CPU + } // namespace memory } // namespace paddle diff --git a/paddle/memory/memory.h b/paddle/memory/memory.h index a33092bade65e6df0faee226a8967c9fc9caa032..2d6f4fd2a08ee0039647d276476263d0f8d00329 100644 --- a/paddle/memory/memory.h +++ b/paddle/memory/memory.h @@ -19,9 +19,14 @@ limitations under the License. 
*/ namespace paddle { namespace memory { -void* Alloc(paddle::platform::Place, size_t); -void Free(paddle::platform::Place, void*); -size_t Used(paddle::platform::Place); +template +void* Alloc(Place, size_t); + +template +void Free(Place, void*); + +template +size_t Used(Place); } // namespace memory } // namespace paddle diff --git a/paddle/memory/memory_test.cc b/paddle/memory/memory_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..53cc63a098d0802479e3a371717adb7596c249ed --- /dev/null +++ b/paddle/memory/memory_test.cc @@ -0,0 +1,138 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/memory/memory.h" +#include "paddle/memory/detail/memory_block.h" +#include "paddle/memory/detail/meta_data.h" + +#include "paddle/platform/cpu_info.h" +#include "paddle/platform/gpu_info.h" +#include "paddle/platform/place.h" + +#include +#include + +inline bool is_aligned(void const *p) { + return 0 == (reinterpret_cast(p) & 0x3); +} + +size_t align(size_t size, paddle::platform::CPUPlace place) { + size += sizeof(paddle::memory::detail::Metadata); + size_t alignment = paddle::platform::CpuMinChunkSize(); + size_t remaining = size % alignment; + return remaining == 0 ? size : size + (alignment - remaining); +} + +TEST(BuddyAllocator, CPUAllocation) { + void *p = nullptr; + + EXPECT_EQ(p, nullptr); + + paddle::platform::CPUPlace cpu; + p = paddle::memory::Alloc(cpu, 4096); + + EXPECT_NE(p, nullptr); + + paddle::memory::Free(cpu, p); +} + +TEST(BuddyAllocator, CPUMultAlloc) { + paddle::platform::CPUPlace cpu; + + std::unordered_map ps; + + size_t total_size = paddle::memory::Used(cpu); + EXPECT_EQ(total_size, 0UL); + + for (auto size : + {128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) { + ps[paddle::memory::Alloc(cpu, size)] = size; + + // Buddy Allocator doesn't manage too large memory chunk + if (paddle::memory::Used(cpu) == total_size) continue; + + size_t aligned_size = align(size, cpu); + total_size += aligned_size; + EXPECT_EQ(total_size, paddle::memory::Used(cpu)); + } + + for (auto p : ps) { + EXPECT_EQ(is_aligned(p.first), true); + paddle::memory::Free(cpu, p.first); + + // Buddy Allocator doesn't manage too large memory chunk + if (paddle::memory::Used(cpu) == total_size) continue; + + size_t aligned_size = align(p.second, cpu); + total_size -= aligned_size; + EXPECT_EQ(total_size, paddle::memory::Used(cpu)); + } +} + +#ifndef PADDLE_ONLY_CPU + +size_t align(size_t size, paddle::platform::GPUPlace place) { + size += sizeof(paddle::memory::detail::Metadata); + size_t alignment = paddle::platform::GpuMinChunkSize(); + size_t remaining = size % alignment; + return remaining == 0 ? 
size : size + (alignment - remaining); +} + +TEST(BuddyAllocator, GPUAllocation) { + void *p = nullptr; + + EXPECT_EQ(p, nullptr); + + paddle::platform::GPUPlace gpu(0); + p = paddle::memory::Alloc(gpu, 4096); + + EXPECT_NE(p, nullptr); + + paddle::memory::Free(gpu, p); +} + +TEST(BuddyAllocator, GPUMultAlloc) { + paddle::platform::GPUPlace gpu; + + std::unordered_map ps; + + size_t total_size = paddle::memory::Used(gpu); + EXPECT_EQ(total_size, 0UL); + + for (auto size : + {128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) { + ps[paddle::memory::Alloc(gpu, size)] = size; + + // Buddy Allocator doesn't manage too large memory chunk + if (paddle::memory::Used(gpu) == total_size) continue; + + size_t aligned_size = align(size, gpu); + total_size += aligned_size; + EXPECT_EQ(total_size, paddle::memory::Used(gpu)); + } + + for (auto p : ps) { + EXPECT_EQ(is_aligned(p.first), true); + paddle::memory::Free(gpu, p.first); + + // Buddy Allocator doesn't manage too large memory chunk + if (paddle::memory::Used(gpu) == total_size) continue; + + size_t aligned_size = align(p.second, gpu); + total_size -= aligned_size; + EXPECT_EQ(total_size, paddle::memory::Used(gpu)); + } +} + +#endif // PADDLE_ONLY_CPU diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..441b9e30c4268cce899ac873b2657c34603e0035 --- /dev/null +++ b/paddle/operators/CMakeLists.txt @@ -0,0 +1,44 @@ +function(op_library TARGET) + # op_library is a function to create an op library. The interface is the same as + # cc_library, but it handles splitting GPU/CPU code and linking the common libraries + # needed by ops. + set(cc_srcs) + set(cu_srcs) + set(op_common_deps operator op_registry) + set(options "") + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(op_library "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) + + foreach(src ${op_library_SRCS}) + if (${src} MATCHES ".*\\.cu$") + list(APPEND cu_srcs ${src}) + elseif(${src} MATCHES ".*\\.cc$") + list(APPEND cc_srcs ${src}) + else() + message(FATAL_ERROR "${TARGET} Source file ${src} should only be .cc or .cu") + endif() + endforeach() + + list(LENGTH cc_srcs cc_srcs_len) + if (${cc_srcs_len} EQUAL 0) + message(FATAL_ERROR "The op library ${TARGET} should contain at least one .cc file") + endif() + + list(LENGTH cu_srcs cu_srcs_len) + if (${cu_srcs_len} EQUAL 0) + message(WARNING "The op library ${TARGET} does not support GPU!") + endif() + + if (WITH_GPU) + nv_library(${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS} + ${op_common_deps}) + else() + cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${op_library_DEPS} + ${op_common_deps}) + endif() +endfunction() + +op_library(add_op SRCS add_op.cc add_op.cu) +cc_test(add_op_test SRCS add_op_test.cc DEPS add_op) diff --git a/paddle/operators/add_op.cc b/paddle/operators/add_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..522b23cbc49f025a1ff674ce157358899d690e6d --- /dev/null +++ b/paddle/operators/add_op.cc @@ -0,0 +1,58 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include +#include +#include + +namespace paddle { +namespace operators { + +class AddOp : public framework::OperatorWithKernel { +protected: + void InferShape( + const std::vector &inputs, + const std::vector &outputs) const override { + PADDLE_ENFORCE(inputs.size() == 2, "Input size of AddOp must be two"); + PADDLE_ENFORCE(outputs.size() == 1, "Output size of AddOp must be one"); + PADDLE_ENFORCE( + inputs[0] != nullptr && inputs[1] != nullptr && outputs[0] != nullptr, + "Inputs/Outputs of AddOp must all be set"); + PADDLE_ENFORCE(inputs[0]->dims() == inputs[1]->dims(), + "Two input of Add Op's dimension must be same."); + // Need set dims in Tensor + // outputs[0]->set_dims(inputs[0]->dims()) + } +}; + +class AddOpMaker : public framework::OpProtoAndCheckerMaker { +public: + AddOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The first input of add op"); + AddInput("Y", "The second input of add op"); + AddOutput("Out", "The output of add op"); + AddComment(R"DOC( +Two Element Add Operator. + +The equation is: Out = X + Y +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +REGISTER_OP(add_two, paddle::operators::AddOp, paddle::operators::AddOpMaker); +REGISTER_OP_CPU_KERNEL( + add_two, ::paddle::operators::AddKernel<::paddle::platform::CPUPlace>); diff --git a/paddle/operators/add_op.cu b/paddle/operators/add_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..5979345fffd68d71ba09dc96874d8ff9471bdbcc --- /dev/null +++ b/paddle/operators/add_op.cu @@ -0,0 +1,5 @@ +#include +#include + +REGISTER_OP_GPU_KERNEL(add_two, + paddle::operators::AddKernel); \ No newline at end of file diff --git a/paddle/operators/add_op.h b/paddle/operators/add_op.h new file mode 100644 index 0000000000000000000000000000000000000000..000564f66dff2b20d17cee10e44aaca114c2c908 --- /dev/null +++ b/paddle/operators/add_op.h @@ -0,0 +1,17 @@ +#pragma once +#include +#include + +namespace paddle { +namespace operators { + +template +class AddKernel : public framework::OpKernel { +public: + void Compute(const framework::KernelContext &context) const override { + LOG(INFO) << "Add kernel in " << typeid(Place).name(); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/add_op_test.cc b/paddle/operators/add_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..f554ac1bef3255f136ad4407a7a1096bdc2b1db5 --- /dev/null +++ b/paddle/operators/add_op_test.cc @@ -0,0 +1,9 @@ +#include +#define private public +#include +USE_OP(add_two); +TEST(AddOp, GetOpProto) { + auto& protos = paddle::framework::OpRegistry::protos(); + auto it = protos.find("add_two"); + ASSERT_NE(it, protos.end()); +} \ No newline at end of file diff --git a/paddle/optimizer/optimizer.cc b/paddle/optimizer/optimizer.cc index 54662dc37891d3211950453b210db4b475837df4..eb7125adee769c97e16986cabf06ea389bf4c143 100644 --- a/paddle/optimizer/optimizer.cc +++ b/paddle/optimizer/optimizer.cc @@ -44,8 +44,8 @@ paddle_optimizer* paddle_create_optimizer(const unsigned 
char* config_proto, const int state_len) { paddle_optimizer* optimizer = new paddle_optimizer; std::string config(config_proto, config_proto + config_proto_len); - Tensor* parameter = - new Tensor(reinterpret_cast(param_buffer), num_bytes); + Tensor* parameter = new Tensor(reinterpret_cast(param_buffer), + num_bytes / sizeof(float)); optimizer->impl = ParameterOptimizer::Create(config, parameter); if (state != nullptr) { std::string s(state, state + state_len); @@ -65,7 +65,8 @@ int paddle_update_parameter(paddle_optimizer* o, int num_bytes) { // TOOD(zhihong): datatype not work. need to add the runtime datatype auto grad_type = reinterpret_cast(grad_buffer); - Tensor* gradient = new Tensor(const_cast(grad_type), num_bytes); + Tensor* gradient = + new Tensor(const_cast(grad_type), num_bytes / sizeof(float)); o->impl->Update(gradient); return PADDLE_SUCCESS; } diff --git a/paddle/optimizer/parameter_optimizer_test.cpp b/paddle/optimizer/parameter_optimizer_test.cpp index 4e6254d9e4dab48279b4a880695959526d30d70c..edf4ae37a9beee2911d23dd1ab23e67a18065b1b 100644 --- a/paddle/optimizer/parameter_optimizer_test.cpp +++ b/paddle/optimizer/parameter_optimizer_test.cpp @@ -1,3 +1,19 @@ +/* + Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
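The Tensor constructor used in optimizer.cc expects an element count, while the C API above passes a buffer length in bytes; the num_bytes / sizeof(float) change fixes that mismatch. A small numpy sketch of the same conversion, illustrative only:

import numpy as np

# The C API hands the optimizer a raw buffer size in bytes, but Tensor
# wants the number of float elements -- hence num_bytes / sizeof(float).
param = np.arange(10, dtype=np.float32)
num_bytes = param.nbytes                                    # 40
num_elements = num_bytes // np.dtype(np.float32).itemsize   # 40 / 4 = 10
assert num_elements == param.size
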
+*/ + #include "parameter_optimizer.h" #include #include @@ -5,21 +21,18 @@ #include "gtest/gtest.h" #include "lr_policy.h" -using namespace paddle; -using namespace paddle::optimizer; - -Tensor* FillTensor(size_t size) { - Tensor* param = new Tensor(size); - Tensor& p = *param; +paddle::optimizer::Tensor* FillTensor(size_t size) { + paddle::optimizer::Tensor* param = new paddle::optimizer::Tensor(size); + paddle::optimizer::Tensor& p = *param; for (size_t i = 0; i < p.size(); ++i) { p[i] = (float)rand() / (float)RAND_MAX; } return param; } -Tensor* FixedTensor(size_t size) { - Tensor* param = new Tensor(size); - Tensor& p = *param; +paddle::optimizer::Tensor* FixedTensor(size_t size) { + paddle::optimizer::Tensor* param = new paddle::optimizer::Tensor(size); + paddle::optimizer::Tensor& p = *param; for (size_t i = 0; i < p.size(); ++i) { p[i] = i; } @@ -28,7 +41,8 @@ Tensor* FixedTensor(size_t size) { class OptimizerTest : public testing::Test { public: - // init tensor shape + virtual ~OptimizerTest() {} + // init paddle::optimizer::Tensor shape const size_t kSize = 5; virtual void SetUp() { @@ -38,34 +52,36 @@ public: virtual void TearDown() {} void CreateSGD() { - Tensor* parameter = FixedTensor(kSize); - config_.set_optimizer(OptimizerConfig::SGD); + paddle::optimizer::Tensor* parameter = FixedTensor(kSize); + config_.set_optimizer(paddle::OptimizerConfig::SGD); config_.mutable_sgd()->set_momentum(0.0); config_.mutable_sgd()->set_decay(0.0); config_.mutable_sgd()->set_nesterov(false); - config_.set_lr_policy(OptimizerConfig::Const); + config_.set_lr_policy(paddle::OptimizerConfig::Const); config_.mutable_const_lr()->set_learning_rate(0.1); std::string str = config_.SerializeAsString(); - ParameterOptimizer* opt = ParameterOptimizer::Create(str, parameter); + paddle::optimizer::ParameterOptimizer* opt = + paddle::optimizer::ParameterOptimizer::Create(str, parameter); opts_.push_back(opt); } void CreateAdam() { - Tensor* parameter = FixedTensor(kSize); - config_.set_optimizer(OptimizerConfig::Adam); + paddle::optimizer::Tensor* parameter = FixedTensor(kSize); + config_.set_optimizer(paddle::OptimizerConfig::Adam); config_.mutable_adam()->set_beta_1(0.9); config_.mutable_adam()->set_beta_2(0.1); config_.mutable_adam()->set_epsilon(1e-3); config_.mutable_adam()->set_decay(0.0); - config_.set_lr_policy(OptimizerConfig::Const); + config_.set_lr_policy(paddle::OptimizerConfig::Const); config_.mutable_const_lr()->set_learning_rate(0.1); std::string str = config_.SerializeAsString(); - ParameterOptimizer* opt = ParameterOptimizer::Create(str, parameter); + paddle::optimizer::ParameterOptimizer* opt = + paddle::optimizer::ParameterOptimizer::Create(str, parameter); opts_.push_back(opt); } void TestGetWeight() { - Tensor* p = FixedTensor(kSize); + paddle::optimizer::Tensor* p = FixedTensor(kSize); for (size_t i = 0; i < opts_.size(); ++i) { int s = 0; float* newp = (float*)opts_[i]->get_weight(&s); @@ -76,7 +92,7 @@ public: } void TestUpdate() { - Tensor* g = FixedTensor(kSize); + paddle::optimizer::Tensor* g = FixedTensor(kSize); for (size_t i = 0; i < opts_.size(); ++i) { opts_[i]->Update(g); } @@ -91,8 +107,8 @@ public: } private: - std::vector opts_; - OptimizerConfig config_; + std::vector opts_; + paddle::OptimizerConfig config_; }; TEST_F(OptimizerTest, TestGetWeight) { TestGetWeight(); } diff --git a/paddle/optimizer/serialization_test.cpp b/paddle/optimizer/serialization_test.cpp index d2454140dc243b40ed8348578360b30894213838..e4d97cbdba545c4ba5adf5b30efd3fc9f3f744ee 100644 --- 
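For intuition about the configuration built in CreateSGD() (momentum 0, decay 0, nesterov off, constant learning rate 0.1): the sketch below hand-rolls one plain SGD step on the fixed tensors the test uses (p[i] = i, g[i] = i). This is an illustration of textbook SGD under those settings, not the library's Update() implementation:

lr = 0.1
param = [float(i) for i in range(5)]   # FixedTensor(kSize) with kSize = 5
grad = [float(i) for i in range(5)]    # the test reuses the same fixed values
updated = [p - lr * g for p, g in zip(param, grad)]
assert [round(x, 6) for x in updated] == [0.0, 0.9, 1.8, 2.7, 3.6]
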
a/paddle/optimizer/serialization_test.cpp +++ b/paddle/optimizer/serialization_test.cpp @@ -1,19 +1,32 @@ +/* + Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + #include "serialization.h" #include "gtest/gtest.h" -using namespace paddle; -using namespace paddle::optimizer; - TEST(TensorToProto, Case1) { - Tensor t(3), t1(3); + paddle::optimizer::Tensor t(3), t1(3); for (size_t i = 0; i < t.size(); ++i) { t[i] = i; t1[i] = 0; } - TensorProto proto; - TensorToProto(t, &proto); - ProtoToTensor(proto, &t1); + paddle::TensorProto proto; + paddle::optimizer::TensorToProto(t, &proto); + paddle::optimizer::ProtoToTensor(proto, &t1); for (size_t i = 0; i < t1.size(); ++i) { EXPECT_EQ(t1[i], t[i]); } diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt index 358d14f4555e1d046c8e7b91e23d54fb504926e5..6ac4035c0f863c5f63d17b523a7a8be668ff3da0 100644 --- a/paddle/platform/CMakeLists.txt +++ b/paddle/platform/CMakeLists.txt @@ -1,10 +1,13 @@ -add_subdirectory(dynload) +cc_library(cpu_info SRCS cpu_info.cc DEPS gflags glog) +cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info) -nv_test(cuda_test SRCS cuda_test.cu) +nv_library(gpu_info SRCS gpu_info.cc DEPS gflags) cc_library(place SRCS place.cc) cc_test(place_test SRCS place_test.cc DEPS place glog gflags) +add_subdirectory(dynload) + IF(WITH_GPU) set(GPU_CTX_DEPS dynload_cuda dynamic_loader) ELSE() @@ -12,4 +15,4 @@ ELSE() ENDIF() cc_library(device_context SRCS device_context.cc DEPS place eigen3 ${GPU_CTX_DEPS}) -nv_test(device_context_test SRCS device_context_test.cc DEPS device_context glog gflags) +nv_test(device_context_test SRCS device_context_test.cc DEPS device_context gpu_info) diff --git a/paddle/platform/cpu_info.cc b/paddle/platform/cpu_info.cc new file mode 100644 index 0000000000000000000000000000000000000000..dfab391cfbe1f04bc2a998233f7e7909579ca72b --- /dev/null +++ b/paddle/platform/cpu_info.cc @@ -0,0 +1,67 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/platform/cpu_info.h" + +#ifdef __APPLE__ +#include +#include +#else +#include +#endif + +#include "gflags/gflags.h" +#include "paddle/platform/error.h" + +DEFINE_double(fraction_of_cpu_memory_to_use, 1, + "Default use 100% of CPU memory for PaddlePaddle," + "reserve the rest for page tables, etc"); + +namespace paddle { +namespace platform { + +inline size_t CpuTotalPhysicalMemory() { +#ifdef __APPLE__ + int mib[2]; + mib[0] = CTL_HW; + mib[1] = HW_MEMSIZE; + int64_t size = 0; + size_t len = sizeof(size); + if (sysctl(mib, 2, &size, &len, NULL, 0) == 0) return (size_t)size; + return 0L; +#else + int64_t pages = sysconf(_SC_PHYS_PAGES); + int64_t page_size = sysconf(_SC_PAGE_SIZE); + return pages * page_size; +#endif +} + +size_t CpuMaxAllocSize() { + // For distributed systems, it requires configuring and limiting + // the fraction of memory to use. + return FLAGS_fraction_of_cpu_memory_to_use * CpuTotalPhysicalMemory(); +} + +size_t CpuMinChunkSize() { + // Allow to allocate the minimum chunk size is 4 KB. + return 1 << 12; +} + +size_t CpuMaxChunkSize() { + // Allow to allocate the maximum chunk size is roughly 3% of CPU memory. + return CpuMaxAllocSize() / 32; +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/platform/cpu_info.h b/paddle/platform/cpu_info.h new file mode 100644 index 0000000000000000000000000000000000000000..8df7c7b4bca5bc88f6ed95d6ab82c81b73918e92 --- /dev/null +++ b/paddle/platform/cpu_info.h @@ -0,0 +1,32 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include + +namespace paddle { +namespace platform { + +//! Get the maximum allocation size for a machine. +size_t CpuMaxAllocSize(); + +//! Get the minimum chunk size for buddy allocator. +size_t CpuMinChunkSize(); + +//! Get the maximum chunk size for buddy allocator. 
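The three functions above encode the CPU-side sizing policy used by the buddy allocator. A worked example with illustrative numbers (a 16 GiB host and the default flag value; these are assumptions, not measured output):

# Sizing policy from cpu_info.cc, evaluated on made-up numbers.
GiB = 1 << 30
total_physical = 16 * GiB
fraction_of_cpu_memory_to_use = 1.0          # the flag's default

cpu_max_alloc_size = int(fraction_of_cpu_memory_to_use * total_physical)
cpu_min_chunk_size = 1 << 12                 # 4 KiB
cpu_max_chunk_size = cpu_max_alloc_size // 32    # roughly 3% of CPU memory

assert cpu_max_chunk_size == 512 * (1 << 20)     # 512 MiB for a 16 GiB host
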
+size_t CpuMaxChunkSize(); + +} // namespace platform +} // namespace paddle diff --git a/paddle/platform/cpu_info_test.cc b/paddle/platform/cpu_info_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..8fb195aa7c0a41b7417ff5cf63394046e9c72267 --- /dev/null +++ b/paddle/platform/cpu_info_test.cc @@ -0,0 +1,21 @@ +#include "paddle/platform/cpu_info.h" +#include "paddle/string/printf.h" + +#include +#include + +#include "gflags/gflags.h" +#include "glog/logging.h" +#include "gtest/gtest.h" + +DECLARE_double(fraction_of_cpu_memory_to_use); + +TEST(CpuMemoryUsage, Print) { + std::stringstream ss; + size_t memory_size = paddle::platform::CpuMaxAllocSize() / 1024 / 1024 / 1024; + float use_percent = FLAGS_fraction_of_cpu_memory_to_use * 100; + + std::cout << paddle::string::Sprintf("\n%.2f %% of CPU Memory Usage: %d GB\n", + use_percent, memory_size) + << std::endl; +} diff --git a/paddle/platform/cuda_test.cu b/paddle/platform/cuda_test.cu deleted file mode 100644 index 4067dda2f19f7661722d8a14a27c7b32ed6afc92..0000000000000000000000000000000000000000 --- a/paddle/platform/cuda_test.cu +++ /dev/null @@ -1,59 +0,0 @@ -#include -#include -#include "gtest/gtest.h" - -#define CHECK_ERR(x) \ - if (x != cudaSuccess) { \ - fprintf(stderr, \ - "%s in %s at line %d\n", \ - cudaGetErrorString(err), \ - __FILE__, \ - __LINE__); \ - exit(-1); \ - } - -__global__ void vecAdd(float *d_A, float *d_B, float *d_C, int n) { - int i = blockDim.x * blockIdx.x + threadIdx.x; - if (i < n) { - d_C[i] = d_A[i] + d_B[i]; - } -} - -TEST(Cuda, Equality) { - int n = 10; - // Memory allocation for h_A, h_B and h_C (in the host) - float h_A[10] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 0.0}; - float h_B[10] = {0.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0}; - float h_C[10]; - float *d_A, *d_B, *d_C; - cudaError_t err; - // Memory allocation for d_A, d_B and d_C (in the device) - err = cudaMalloc((void **)&d_A, sizeof(float) * n); - CHECK_ERR(err); - - err = cudaMalloc((void **)&d_B, sizeof(float) * n); - CHECK_ERR(err); - - err = cudaMalloc((void **)&d_C, sizeof(float) * n); - CHECK_ERR(err); - - // Copying memory to device - err = cudaMemcpy(d_A, h_A, sizeof(float) * n, cudaMemcpyHostToDevice); - CHECK_ERR(err); - - err = cudaMemcpy(d_B, h_B, sizeof(float) * n, cudaMemcpyHostToDevice); - CHECK_ERR(err); - - // Calling the kernel - vecAdd<<>>(d_A, d_B, d_C, n); - - // Copying results back to host - err = cudaMemcpy(h_C, d_C, sizeof(float) * n, cudaMemcpyDeviceToHost); - CHECK_ERR(err); - - EXPECT_EQ(h_C[0], 1.0); - for (int i = 1; i < n - 1; ++i) { - EXPECT_EQ(h_C[i], 11.0); - } - EXPECT_EQ(h_C[9], 1.0); -} diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h index 7de07d06bed885d6529a884fb81fedbdaba78f4a..51c8e1391324649d8e845e902a5632f6bca1fa58 100644 --- a/paddle/platform/device_context.h +++ b/paddle/platform/device_context.h @@ -13,10 +13,11 @@ limitations under the License. 
*/ #include "paddle/framework/enforce.h" #ifndef PADDLE_ONLY_CPU -#include "paddle/platform/cuda.h" #include "paddle/platform/dynload/cublas.h" #include "paddle/platform/dynload/cudnn.h" #include "paddle/platform/dynload/curand.h" +#include "paddle/platform/error.h" +#include "paddle/platform/gpu_info.h" #define EIGEN_USE_GPU #endif #include diff --git a/paddle/platform/dynload/cublas.h b/paddle/platform/dynload/cublas.h index 47c7a8ec21fc6e84d5f45f68b7dc8a766003be75..c44b7240a885c2ef71e550df645dbaded69f9944 100644 --- a/paddle/platform/dynload/cublas.h +++ b/paddle/platform/dynload/cublas.h @@ -67,20 +67,20 @@ extern void *cublas_dso_handle; __macro(cublasSgemm); \ __macro(cublasDgemm); \ __macro(cublasSgeam); \ - __macro(cublasDgeam); - -DECLARE_DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasCreate); -DECLARE_DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasDestroy); -DECLARE_DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasSetStream); -DECLARE_DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasSetPointerMode); -DECLARE_DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasGetPointerMode); -DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgemmBatched); -DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgemmBatched); -DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasCgemmBatched); -DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasZgemmBatched); -DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgetrfBatched); -DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgetriBatched); -DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgetrfBatched); + __macro(cublasDgeam); \ + __macro(cublasCreate_v2); \ + __macro(cublasDestroy_v2); \ + __macro(cublasSetStream_v2); \ + __macro(cublasSetPointerMode_v2); \ + __macro(cublasGetPointerMode_v2); \ + __macro(cublasSgemmBatched); \ + __macro(cublasDgemmBatched); \ + __macro(cublasCgemmBatched); \ + __macro(cublasZgemmBatched); \ + __macro(cublasSgetrfBatched); \ + __macro(cublasSgetriBatched); \ + __macro(cublasDgetrfBatched); \ + __macro(cublasDgetriBatched) CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP); diff --git a/paddle/platform/error.h b/paddle/platform/error.h new file mode 100644 index 0000000000000000000000000000000000000000..93424bb61096503a4843da7942853a113f612e3b --- /dev/null +++ b/paddle/platform/error.h @@ -0,0 +1,87 @@ +#pragma once + +#include +#include +#include + +#ifndef PADDLE_ONLY_CPU + +#include +#include +#include +#include +#include + +#endif // PADDLE_ONLY_CPU + +namespace paddle { +namespace platform { + +#ifndef PADDLE_ONLY_CPU + +inline void throw_on_error(cudaError_t e, const char* message) { + if (e) { + throw thrust::system_error(e, thrust::cuda_category(), message); + } +} + +inline void throw_on_error(curandStatus_t stat, const char* message) { + if (stat != CURAND_STATUS_SUCCESS) { + throw thrust::system_error(cudaErrorLaunchFailure, thrust::cuda_category(), + message); + } +} + +inline void throw_on_error(cudnnStatus_t stat, const char* message) { + std::stringstream ss; + if (stat == CUDNN_STATUS_SUCCESS) { + return; + } else { + ss << cudnnGetErrorString(stat); + ss << ", " << message; + throw std::runtime_error(ss.str()); + } +} + +inline void throw_on_error(cublasStatus_t stat, const char* message) { + std::stringstream ss; + if (stat == CUBLAS_STATUS_SUCCESS) { + return; + } else if (stat == CUBLAS_STATUS_NOT_INITIALIZED) { + ss << "CUBLAS: not initialized"; + } else if (stat == CUBLAS_STATUS_ALLOC_FAILED) { + ss << "CUBLAS: alloc failed"; + } else if (stat == CUBLAS_STATUS_INVALID_VALUE) { + ss << "CUBLAS: invalid value"; + } else if (stat == CUBLAS_STATUS_ARCH_MISMATCH) { + ss << "CUBLAS: arch mismatch"; + } else if (stat == 
CUBLAS_STATUS_MAPPING_ERROR) { + ss << "CUBLAS: mapping error"; + } else if (stat == CUBLAS_STATUS_EXECUTION_FAILED) { + ss << "CUBLAS: execution failed"; + } else if (stat == CUBLAS_STATUS_INTERNAL_ERROR) { + ss << "CUBLAS: internal error"; + } else if (stat == CUBLAS_STATUS_NOT_SUPPORTED) { + ss << "CUBLAS: not supported"; + } else if (stat == CUBLAS_STATUS_LICENSE_ERROR) { + ss << "CUBLAS: license error"; + } + ss << ", " << message; + throw std::runtime_error(ss.str()); +} + +inline void throw_on_error(cublasStatus_t stat) { + const char* message = ""; + throw_on_error(stat, message); +} + +#endif // PADDLE_ONLY_CPU + +inline void throw_on_error(int stat, const char* message) { + if (stat) { + throw std::runtime_error(message + (", stat = " + std::to_string(stat))); + } +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/platform/gpu_info.cc b/paddle/platform/gpu_info.cc new file mode 100644 index 0000000000000000000000000000000000000000..a1383d3524aedf834c329425419b989d47668bea --- /dev/null +++ b/paddle/platform/gpu_info.cc @@ -0,0 +1,86 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/platform/gpu_info.h" +#include "gflags/gflags.h" +#include "paddle/platform/error.h" + +DEFINE_double(fraction_of_gpu_memory_to_use, 0.95, + "Default use 95% of GPU memory for PaddlePaddle," + "reserve the rest for page tables, etc"); + +namespace paddle { +namespace platform { + +int GetDeviceCount() { + int count; + throw_on_error( + cudaGetDeviceCount(&count), + "cudaGetDeviceCount failed in paddle::platform::GetDeviceCount"); + return count; +} + +int GetCurrentDeviceId() { + int device_id; + throw_on_error( + cudaGetDevice(&device_id), + "cudaGetDevice failed in paddle::platform::GetCurrentDeviceId"); + return device_id; +} + +void SetDeviceId(int id) { + throw_on_error(cudaSetDevice(id), + "cudaSetDevice failed in paddle::platform::SetDeviceId"); +} + +void GpuMemoryUsage(size_t& available, size_t& total) { + throw_on_error(cudaMemGetInfo(&available, &total), + "cudaMemGetInfo failed in paddle::platform::GetMemoryUsage"); +} + +size_t GpuMaxAllocSize() { + size_t total = 0; + size_t available = 0; + + GpuMemoryUsage(available, total); + + // Reserve the rest for page tables, etc. + return static_cast(total * FLAGS_fraction_of_gpu_memory_to_use); +} + +size_t GpuMinChunkSize() { + // Allow to allocate the minimum chunk size is 256 bytes. + return 1 << 8; +} + +size_t GpuMaxChunkSize() { + size_t total = 0; + size_t available = 0; + + GpuMemoryUsage(available, total); + + // Reserving the rest memory for page tables, etc. + size_t reserving = (1 - FLAGS_fraction_of_gpu_memory_to_use) * total; + + // If available less than minimum chunk size, no usable memory exists. + available = std::max(available, GpuMinChunkSize()) - GpuMinChunkSize(); + + // If available less than reserving, no usable memory exists. 
+ size_t usable = std::max(available, reserving) - reserving; + + return usable; +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/platform/cuda.h b/paddle/platform/gpu_info.h similarity index 54% rename from paddle/platform/cuda.h rename to paddle/platform/gpu_info.h index 96889abf9eb14dd203eb55ffd0b720450323b38e..79e71956bd32e8c253ac4192a04e5903bed1c94a 100644 --- a/paddle/platform/cuda.h +++ b/paddle/platform/gpu_info.h @@ -16,33 +16,31 @@ limitations under the License. */ #ifndef PADDLE_ONLY_CPU -#include -#include +#include namespace paddle { namespace platform { -inline void throw_on_error(cudaError_t e, const char* message) { - if (e) { - throw thrust::system_error(e, thrust::cuda_category(), message); - } -} - -inline int GetDeviceCount(void) { - int count; - throw_on_error(cudaGetDeviceCount(&count), "cudaGetDeviceCount failed"); - return count; -} - -inline int GetCurrentDeviceId(void) { - int device_id; - throw_on_error(cudaGetDevice(&device_id), "cudaGetDevice failed"); - return device_id; -} - -inline void SetDeviceId(int device_id) { - throw_on_error(cudaSetDevice(device_id), "cudaSetDevice failed"); -} +//! Get the total number of GPU devices in system. +int GetDeviceCount(); + +//! Get the current GPU device id in system. +int GetCurrentDeviceId(); + +//! Set the GPU device id for next execution. +void SetDeviceId(int device_id); + +//!Get the memory usage of current GPU device. +void GpuMemoryUsage(size_t& available, size_t& total); + +//! Get the maximum allocation size of current GPU device. +size_t GpuMaxAllocSize(); + +//! Get the minimum chunk size for GPU buddy allocator. +size_t GpuMinChunkSize(); + +//! Get the maximum chunk size for GPU buddy allocator. +size_t GpuMaxChunkSize(); } // namespace platform } // namespace paddle diff --git a/paddle/platform/place.cc b/paddle/platform/place.cc index 0704820aa05079401eb56814d689d6e280311edb..b31515e1f028acac885a506ff1c20479407a05e3 100644 --- a/paddle/platform/place.cc +++ b/paddle/platform/place.cc @@ -1,3 +1,17 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
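GpuMaxChunkSize() above combines the free/total numbers reported by cudaMemGetInfo with the reservation fraction. A worked example with illustrative numbers (a 12 GiB device with 11 GiB currently free and the default flag value; assumptions for illustration only):

GiB = 1 << 30
total, available = 12 * GiB, 11 * GiB
fraction_of_gpu_memory_to_use = 0.95          # the flag's default

gpu_min_chunk_size = 1 << 8                                   # 256 bytes
reserving = int((1 - fraction_of_gpu_memory_to_use) * total)  # page tables, etc.

# Same clamping order as the C++ code: never let the subtractions underflow.
available = max(available, gpu_min_chunk_size) - gpu_min_chunk_size
usable = max(available, reserving) - reserving
print(usable)   # roughly 10.4 GiB with these numbers
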
*/ + #include "paddle/platform/place.h" namespace paddle { @@ -7,7 +21,7 @@ namespace detail { class PlacePrinter : public boost::static_visitor<> { public: - PlacePrinter(std::ostream &os) : os_(os) {} + explicit PlacePrinter(std::ostream &os) : os_(os) {} void operator()(const CPUPlace &) { os_ << "CPUPlace"; } void operator()(const GPUPlace &p) { os_ << "GPUPlace(" << p.device << ")"; } diff --git a/paddle/pybind/CMakeLists.txt b/paddle/pybind/CMakeLists.txt index af85fdeecb57729d7fb580ebd4c59c1afc61d61a..8564a5f5fe474dbd55ab3e413f9c2cf93f88e38e 100644 --- a/paddle/pybind/CMakeLists.txt +++ b/paddle/pybind/CMakeLists.txt @@ -1 +1 @@ -cc_library(paddle_pybind SHARED SRCS pybind.cc DEPS pybind python) +cc_library(paddle_pybind SHARED SRCS pybind.cc DEPS pybind python add_op) diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index f9f87acf15a6b62c343cc0e3db9ebc7e0aabb786..b5ead21fd0128f65f4adc662209af842a631fc41 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -13,12 +13,18 @@ See the License for the specific language governing permissions and limitations under the License. */ #include +#include #include #include +#include +#include +#include namespace py = pybind11; namespace pd = paddle::framework; +USE_OP(add_two); + PYBIND11_PLUGIN(core) { py::module m("core", "C++ core of Paddle Paddle"); @@ -43,5 +49,37 @@ All parameter, weight, gradient are variables in Paddle. &pd::Scope::CreateVariable, py::return_value_policy::reference); + //! @note: Be careful! PyBind will return std::string as an unicode, not + //! Python str. If you want a str object, you should cast them in Python. + m.def("get_all_op_protos", []() -> std::vector { + auto& protos = pd::OpRegistry::protos(); + std::vector ret_values; + for (auto it = protos.begin(); it != protos.end(); ++it) { + PADDLE_ENFORCE(it->second.IsInitialized(), + "OpProto must all be initialized"); + ret_values.emplace_back(); + PADDLE_ENFORCE(it->second.SerializeToString(&ret_values.back()), + "Serialize OpProto Error. 
This could be a bug of Paddle."); + } + return ret_values; + }); + m.def_submodule( + "var_names", + "The module will return special predefined variable name in Paddle") + .def("empty", pd::OperatorBase::EMPTY_VAR_NAME) + .def("temp", pd::OperatorBase::TMP_VAR_NAME); + + py::class_(m, "Operator") + .def("__str__", &pd::OperatorBase::DebugString) + .def_static("create", [](const std::string& protobin) { + pd::OpDesc desc; + PADDLE_ENFORCE(desc.ParsePartialFromString(protobin), + "Cannot parse user input to OpDesc"); + PADDLE_ENFORCE(desc.IsInitialized(), + "User OpDesc is not initialized, reason %s", + desc.InitializationErrorString()); + return pd::OpRegistry::CreateOp(desc); + }); + return m.ptr(); } diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh index ab60f1a38dd4cd1d9799c0019dccae5f1c7d4310..3860facb099950a5287d3f6b89c3de38f588f568 100644 --- a/paddle/scripts/docker/build.sh +++ b/paddle/scripts/docker/build.sh @@ -155,7 +155,8 @@ RUN apt-get update &&\ paddle version ${DOCKERFILE_CUDNN_DSO} ${DOCKERFILE_GPU_ENV} - +ADD go/cmd/pserver/pserver /usr/bin/ +ADD go/cmd/master/master /usr/bin/ # default command shows the paddle version and exit CMD ["paddle", "version"] EOF diff --git a/paddle/scripts/docker/build_android.sh b/paddle/scripts/docker/build_android.sh index bfa10c91553563bddac8c1b41bf21490fb89d3cf..56d290be4ab04a9f6974023159aa8571d27f8dd5 100644 --- a/paddle/scripts/docker/build_android.sh +++ b/paddle/scripts/docker/build_android.sh @@ -2,9 +2,9 @@ set -xe -mkdir -p /paddle/build -cd /paddle/build -rm -f /paddle/install 2>/dev/null || true +mkdir -p /paddle/build_android +cd /paddle/build_android +rm -rf /paddle/install 2>/dev/null || true cmake -DCMAKE_SYSTEM_NAME=Android \ -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_STANDALONE_TOOLCHAIN \ -DANDROID_ABI=armeabi-v7a \ @@ -21,6 +21,3 @@ cmake -DCMAKE_SYSTEM_NAME=Android \ .. make -j `nproc` make install - -export PATH=/paddle/install/bin:/paddle/install/opt/paddle/bin:$PATH -paddle version diff --git a/paddle/scripts/travis/build_android.sh b/paddle/scripts/travis/build_android.sh new file mode 100755 index 0000000000000000000000000000000000000000..004067a8f55351509caaf2bbf6d5c349a4698a79 --- /dev/null +++ b/paddle/scripts/travis/build_android.sh @@ -0,0 +1,30 @@ +#!/bin/bash +set -e + +ANDROID_STANDALONE_TOOLCHAIN=$HOME/android-toolchain-gcc +TMP_DIR=$HOME/$JOB/tmp +mkdir -p $TMP_DIR +cd $TMP_DIR +wget -q https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip +unzip -q android-ndk-r14b-linux-x86_64.zip +chmod +x $TMP_DIR/android-ndk-r14b/build/tools/make-standalone-toolchain.sh +$TMP_DIR/android-ndk-r14b/build/tools/make-standalone-toolchain.sh --force --arch=arm --platform=android-21 --install-dir=$ANDROID_STANDALONE_TOOLCHAIN +cd $HOME +rm -rf $TMP_DIR + +# Create the build directory for CMake. +mkdir -p $TRAVIS_BUILD_DIR/build_android +cd $TRAVIS_BUILD_DIR/build_android + +# Compile paddle binaries +cmake -DCMAKE_SYSTEM_NAME=Android \ + -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_STANDALONE_TOOLCHAIN \ + -DANDROID_ABI=armeabi-v7a \ + -DANDROID_ARM_NEON=ON \ + -DANDROID_ARM_MODE=ON \ + -DWITH_C_API=ON \ + -DWITH_SWIG_PY=OFF \ + -DWITH_STYLE_CHECK=OFF \ + .. 
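Stepping back to the pybind changes above: get_all_op_protos() hands serialized OpProto messages to Python (as unicode, per the note in pybind.cc), and the framework code added later in this patch parses them back. A minimal sketch of that consumption, assuming the compiled core extension is importable:

import paddle.v2.framework.core as core
import paddle.v2.framework.proto.op_proto_pb2 as op_proto_pb2

# The str() cast works around the unicode caveat noted in pybind.cc; this
# is essentially what get_all_op_protos() in create_op_creation_methods.py
# (added below) does.
for pbstr in core.get_all_op_protos():
    proto = op_proto_pb2.OpProto.FromString(str(pbstr))
    assert proto.IsInitialized()
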
+ +make -j `nproc` diff --git a/paddle/string/piece.h b/paddle/string/piece.h index db7c3e69804a6a8f0510ba376432fe560ae74442..0272529d1c9b2cb6000a26f1d4d80276d06bf27b 100644 --- a/paddle/string/piece.h +++ b/paddle/string/piece.h @@ -35,7 +35,7 @@ public: // We provide non-explicit singleton constructors so users can // pass in a "const char*" or a "string" wherever a "Piece" - // is expected. These contructors ensure that if data_ is NULL, + // is expected. These constructors ensure that if data_ is NULL, // size_ is 0. Piece(); Piece(const char* d, size_t n); diff --git a/paddle/trainer/NewRemoteParameterUpdater.cpp b/paddle/trainer/NewRemoteParameterUpdater.cpp index b359d9da2167bf459504e15c3140b3d956f417f3..a830ceba5772846cd9255a3eeb26e8d6a17dcfbc 100644 --- a/paddle/trainer/NewRemoteParameterUpdater.cpp +++ b/paddle/trainer/NewRemoteParameterUpdater.cpp @@ -28,6 +28,17 @@ NewRemoteParameterUpdater::NewRemoteParameterUpdater( newGradients_(nullptr), pserverSpec_(pserverSpec) {} +NewRemoteParameterUpdater::NewRemoteParameterUpdater( + const OptimizationConfig &config, + const std::string pserverSpec, + const bool useEtcd) + : trainerConfig_(config), + parameterClient_(-1), + newParameters_(nullptr), + newGradients_(nullptr), + pserverSpec_(pserverSpec), + useEtcd_(useEtcd) {} + void NewRemoteParameterUpdater::init( const std::vector ¶meters) { ParameterUpdater::init(parameters); @@ -38,8 +49,13 @@ void NewRemoteParameterUpdater::init( } // create parameter server client. - parameterClient_ = paddle_new_pserver_client((char *)pserverSpec_.c_str(), - FLAGS_trainer_id == 0); + if (useEtcd_) { + parameterClient_ = paddle_new_etcd_pserver_client( + (char *)pserverSpec_.c_str(), FLAGS_trainer_id == 0); + } else { + parameterClient_ = paddle_new_pserver_client((char *)pserverSpec_.c_str(), + FLAGS_trainer_id == 0); + } // init new parameter and gradient. 
newParameters_ = initNewParameter(PARAMETER_VALUE); diff --git a/paddle/trainer/NewRemoteParameterUpdater.h b/paddle/trainer/NewRemoteParameterUpdater.h index dfed00bc216b1d41bb7520619b76702f9fe650f2..6223ba427c9b94494c2bee8f0847442f1b0574c9 100644 --- a/paddle/trainer/NewRemoteParameterUpdater.h +++ b/paddle/trainer/NewRemoteParameterUpdater.h @@ -32,6 +32,9 @@ class NewRemoteParameterUpdater : public ParameterUpdater { public: NewRemoteParameterUpdater(const OptimizationConfig& config, const std::string pserverSpec); + NewRemoteParameterUpdater(const OptimizationConfig& config, + const std::string pserverSpec, + const bool useEtcd); ~NewRemoteParameterUpdater() { releaseNewParameter(newParameters_); releaseNewParameter(newGradients_); @@ -111,6 +114,8 @@ protected: paddle_parameter** newGradients_; /// the specification of parameter server "host1:port,host1:port" std::string pserverSpec_; + /// true if pserverSpec_ is etcd endpoint, else pserverSpec_ is pserver addr + bool useEtcd_; }; } // namespace paddle diff --git a/paddle/trainer/TrainerConfigHelper.cpp b/paddle/trainer/TrainerConfigHelper.cpp index 60ac8459a12db801321da4a9d9c1d48ac8bd6d16..133e2be104c6fbfddefd8698d2b6aa8315c56c70 100644 --- a/paddle/trainer/TrainerConfigHelper.cpp +++ b/paddle/trainer/TrainerConfigHelper.cpp @@ -62,11 +62,7 @@ TrainerConfigHelper::TrainerConfigHelper(const TrainerConfig &config) m->conf = config; } -TrainerConfigHelper::~TrainerConfigHelper() { - if (m) { - delete m; - } -} +TrainerConfigHelper::~TrainerConfigHelper() { delete m; } const TrainerConfig &TrainerConfigHelper::getConfig() const { return m->conf; } diff --git a/paddle/utils/DynamicLoader.h b/paddle/utils/DynamicLoader.h index 9b5ad21724afd7176f958619e7e10d12dc08fa49..2e5ff76a06152b6a12818f06baaeaa6a69726ba8 100644 --- a/paddle/utils/DynamicLoader.h +++ b/paddle/utils/DynamicLoader.h @@ -12,8 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#ifndef DYNAMIC_LOAD_H_ -#define DYNAMIC_LOAD_H_ +#pragma once #include #include @@ -59,5 +58,3 @@ void GetWarpCTCDsoHandle(void** dso_handle); * */ void GetLapackDsoHandle(void** dso_handle); - -#endif // DYNAMIC_LOAD_H_ diff --git a/paddle/utils/ThreadLocal.h b/paddle/utils/ThreadLocal.h index b5e2862546212041a774599ec664b95e56224a07..0a27b8b97b83a9066af23039a317c437ea56777a 100644 --- a/paddle/utils/ThreadLocal.h +++ b/paddle/utils/ThreadLocal.h @@ -51,7 +51,7 @@ template class ThreadLocal { public: ThreadLocal() { - CHECK(pthread_key_create(&threadSpecificKey_, dataDestructor) == 0); + CHECK_EQ(pthread_key_create(&threadSpecificKey_, dataDestructor), 0); } ~ThreadLocal() { pthread_key_delete(threadSpecificKey_); } @@ -65,7 +65,7 @@ public: if (!p && createLocal) { p = new T(); int ret = pthread_setspecific(threadSpecificKey_, p); - CHECK(ret == 0); + CHECK_EQ(ret, 0); } return p; } @@ -79,7 +79,7 @@ public: if (T* q = get(false)) { dataDestructor(q); } - CHECK(pthread_setspecific(threadSpecificKey_, p) == 0); + CHECK_EQ(pthread_setspecific(threadSpecificKey_, p), 0); } /** @@ -112,7 +112,7 @@ private: template class ThreadLocalD { public: - ThreadLocalD() { CHECK(pthread_key_create(&threadSpecificKey_, NULL) == 0); } + ThreadLocalD() { CHECK_EQ(pthread_key_create(&threadSpecificKey_, NULL), 0); } ~ThreadLocalD() { pthread_key_delete(threadSpecificKey_); for (auto t : threadMap_) { @@ -127,7 +127,7 @@ public: T* p = (T*)pthread_getspecific(threadSpecificKey_); if (!p) { p = new T(); - CHECK(pthread_setspecific(threadSpecificKey_, p) == 0); + CHECK_EQ(pthread_setspecific(threadSpecificKey_, p), 0); updateMap(p); } return p; @@ -141,7 +141,7 @@ public: if (T* q = (T*)pthread_getspecific(threadSpecificKey_)) { dataDestructor(q); } - CHECK(pthread_setspecific(threadSpecificKey_, p) == 0); + CHECK_EQ(pthread_setspecific(threadSpecificKey_, p), 0); updateMap(p); } diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index b0524a507bacec6768424045e58bf91305de2d08..78aa0778f8d1dca9fae82f0411be5a00e636cbc9 100755 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -126,6 +126,7 @@ __all__ = [ 'row_conv_layer', 'dropout_layer', 'prelu_layer', + 'gated_unit_layer', ] @@ -5862,7 +5863,7 @@ def prelu_layer(input, :rtype: LayerOutput """ - assert isinstance(input, LayerOutput), 'prelu_layer only accepts one input' + assert isinstance(input, LayerOutput), 'prelu_layer accepts only one input.' assert isinstance(param_attr, ParameterAttribute) l = Layer( @@ -5876,3 +5877,96 @@ def prelu_layer(input, layer_type=LayerType.PRELU, parents=input, size=l.config.size) + + +@wrap_name_default() +@layer_support(ERROR_CLIPPING, DROPOUT) +@wrap_act_default(act=LinearActivation()) +def gated_unit_layer(input, + size, + act=None, + name=None, + gate_attr=None, + gate_param_attr=None, + gate_bias_attr=True, + inproj_attr=None, + inproj_param_attr=None, + inproj_bias_attr=True, + layer_attr=None): + """ + The gated unit layer implements a simple gating mechanism over the input. + The input :math:`X` is first projected into a new space :math:`X'`, and + it is also used to produce a gate weight :math:`\sigma`. Element-wise + prodict between :match:`X'` and :math:`\sigma` is finally returned. + + Reference: + Language Modeling with Gated Convolutional Networks + https://arxiv.org/abs/1612.08083 + + .. math:: + y=\\text{act}(X \cdot W + b)\otimes \sigma(X \cdot V + c) + + The example usage is: + + .. 
code-block:: python + gated_unit = gated_unit_layer(size=128, input=input_layer)) + + :param input: input for this layer. + :type input: LayerOutput + :param size: output size of the gated unit. + :type size: int + :param act: activation type of the projected input. + :type act: BaseActivation + :param name: name of this layer. + :type name: basestring + :param gate_attr: Attributes to tune the gate output, for example, error + clipping threshold, dropout and so on. See ExtraLayerAttribute for + more details. + :type gate_attr: ExtraLayerAttribute|None + :param gate_param_attr: Attributes to tune the learnable projected matrix + parameter of the gate. + :type gate_param_attr: ParameterAttribute|None + :param gate_bias_attr: Attributes to tune the learnable bias of the gate. + :type gate_bias_attr: ParameterAttribute|None + :param inproj_attr: Attributes to the tune the projected input, for + example, error clipping threshold, dropout and so on. See + ExtraLayerAttribute for more details. + :type inproj_attr: ExtraLayerAttribute|None + :param inproj_param_attr: Attributes to tune the learnable parameter of + the projection of input. + :type inproj_param_attr: ParameterAttribute|None + :param inproj_bias_attr: Attributes to tune the learnable bias of + projection of the input. + :type inproj_bias_attr: ParameterAttribute|None + :param layer_attr: Attributes to tune the final output of the gated unit, + for example, error clipping threshold, dropout and so on. See + ExtraLayerAttribute for more details. + :type layer_attr: ExtraLayerAttribute|None + :return: LayerOutput object. + :rtype: LayerOutput + """ + + assert isinstance( + input, LayerOutput), 'The gated linear unit accepts only one input.' + + input_proj = fc_layer( + input=input, + name="%s_input_proj" % name, + size=size, + act=act, + layer_attr=inproj_attr, + param_attr=inproj_param_attr, + bias_attr=inproj_bias_attr) + + gate = fc_layer( + size=size, + name="%s_gate" % name, + act=SigmoidActivation(), + input=input, + layer_attr=gate_attr, + param_attr=gate_param_attr, + bias_attr=gate_bias_attr) + return mixed_layer( + name="%s_gated_act" % name, + input=dotmul_operator(input_proj, gate), + layer_attr=layer_attr) diff --git a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh index 70e342fb79ab51e3376ea6ad8f593c4c3a1fff18..cdf9b2eab733adb173cf33cd6a93ef7b5abefc50 100755 --- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh +++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh @@ -7,6 +7,6 @@ test_rnn_group shared_fc shared_lstm shared_gru test_cost_layers_with_weight test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops test_seq_concat_reshape test_pad test_smooth_l1 test_multiplex_layer test_prelu_layer test_row_conv test_detection_output_layer test_multibox_loss_layer -test_recursive_topology) +test_recursive_topology test_gated_unit_layer) export whole_configs=(test_split_datasource) diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_gated_unit_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_gated_unit_layer.protostr new file mode 100644 index 0000000000000000000000000000000000000000..f1e4d894a5fb0040f48bdb5a751c3f0d956c23bb --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_gated_unit_layer.protostr @@ -0,0 +1,106 @@ +type: "nn" +layers { + name: "input" + type: "data" + size: 256 + active_type: "" +} 
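The gated unit defined above computes y = act(X·W + b) ⊗ σ(X·V + c). Below is a small numpy sketch of that formula; W, V, b, c and the batch size are stand-ins invented for illustration, with shapes following the 256-to-512 test configuration that appears later in this patch:

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

rng = np.random.RandomState(0)
X = rng.randn(8, 256)                            # a made-up batch of 8 samples
W, V = rng.randn(256, 512), rng.randn(256, 512)  # projection and gate weights
b, c = np.zeros(512), np.zeros(512)

# y = act(X.W + b) * sigma(X.V + c), with act = tanh as in the test config.
y = np.tanh(X.dot(W) + b) * sigmoid(X.dot(V) + c)
assert y.shape == (8, 512)
assert W.size == 131072    # matches the w0 parameter size in the protostr
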
+layers { + name: "__gated_unit_layer_0___input_proj" + type: "fc" + size: 512 + active_type: "tanh" + inputs { + input_layer_name: "input" + input_parameter_name: "___gated_unit_layer_0___input_proj.w0" + } + bias_parameter_name: "___gated_unit_layer_0___input_proj.wbias" + error_clipping_threshold: 100.0 +} +layers { + name: "__gated_unit_layer_0___gate" + type: "fc" + size: 512 + active_type: "sigmoid" + inputs { + input_layer_name: "input" + input_parameter_name: "___gated_unit_layer_0___gate.w0" + } + bias_parameter_name: "___gated_unit_layer_0___gate.wbias" + error_clipping_threshold: 100.0 +} +layers { + name: "__gated_unit_layer_0___gated_act" + type: "mixed" + size: 512 + active_type: "" + inputs { + input_layer_name: "__gated_unit_layer_0___input_proj" + } + inputs { + input_layer_name: "__gated_unit_layer_0___gate" + } + error_clipping_threshold: 100.0 + operator_confs { + type: "dot_mul" + input_indices: 0 + input_indices: 1 + input_sizes: 512 + input_sizes: 512 + output_size: 512 + dotmul_scale: 1 + } +} +parameters { + name: "___gated_unit_layer_0___input_proj.w0" + size: 131072 + initial_mean: 0.0 + initial_std: 0.0001 + dims: 256 + dims: 512 + initial_strategy: 0 + initial_smart: false +} +parameters { + name: "___gated_unit_layer_0___input_proj.wbias" + size: 512 + initial_mean: 0.0 + initial_std: 1 + dims: 1 + dims: 512 + initial_strategy: 0 + initial_smart: false +} +parameters { + name: "___gated_unit_layer_0___gate.w0" + size: 131072 + initial_mean: 0.0 + initial_std: 0.0001 + dims: 256 + dims: 512 + initial_strategy: 0 + initial_smart: false +} +parameters { + name: "___gated_unit_layer_0___gate.wbias" + size: 512 + initial_mean: 0.0 + initial_std: 1 + dims: 1 + dims: 512 + initial_strategy: 0 + initial_smart: false +} +input_layer_names: "input" +output_layer_names: "__gated_unit_layer_0___gated_act" +sub_models { + name: "root" + layer_names: "input" + layer_names: "__gated_unit_layer_0___input_proj" + layer_names: "__gated_unit_layer_0___gate" + layer_names: "__gated_unit_layer_0___gated_act" + input_layer_names: "input" + output_layer_names: "__gated_unit_layer_0___gated_act" + is_recurrent_layer_group: false +} + diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_gated_unit_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_gated_unit_layer.py new file mode 100644 index 0000000000000000000000000000000000000000..9dab45519c65b0ca686558ec7fe2064bb9ad8824 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/test_gated_unit_layer.py @@ -0,0 +1,16 @@ +from paddle.trainer_config_helpers import * + +data = data_layer(name='input', size=256) +glu = gated_unit_layer( + size=512, + input=data, + act=TanhActivation(), + gate_attr=ExtraLayerAttribute(error_clipping_threshold=100.0), + gate_param_attr=ParamAttr(initial_std=1e-4), + gate_bias_attr=ParamAttr(initial_std=1), + inproj_attr=ExtraLayerAttribute(error_clipping_threshold=100.0), + inproj_param_attr=ParamAttr(initial_std=1e-4), + inproj_bias_attr=ParamAttr(initial_std=1), + layer_attr=ExtraLayerAttribute(error_clipping_threshold=100.0)) + +outputs(glu) diff --git a/python/paddle/v2/__init__.py b/python/paddle/v2/__init__.py index 3ba5c31871807027e452df5d889b3b403e1c6414..3c75ca4c3abf1e94fc00b87f3af51d1cbf6dc430 100644 --- a/python/paddle/v2/__init__.py +++ b/python/paddle/v2/__init__.py @@ -20,7 +20,6 @@ import trainer import event import data_type import topology -import data_feeder import networks import evaluator from . 
import dataset @@ -31,7 +30,6 @@ import op import pooling import inference import networks -import py_paddle.swig_paddle as api import minibatch import plot import image @@ -47,7 +45,6 @@ __all__ = [ 'data_type', 'attr', 'pooling', - 'data_feeder', 'dataset', 'reader', 'topology', @@ -61,6 +58,7 @@ __all__ = [ def init(**kwargs): + import py_paddle.swig_paddle as api args = [] args_dict = {} # NOTE: append arguments if they are in ENV diff --git a/python/paddle/v2/data_feeder.py b/python/paddle/v2/data_feeder.py index 2698251b9e15046eb14f71c3f5b0546ecbb4a5dd..98dfb85a0ea57050bf8dd8d46fca9574801d8eb3 100644 --- a/python/paddle/v2/data_feeder.py +++ b/python/paddle/v2/data_feeder.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - from py_paddle import DataProviderConverter import collections import paddle.trainer.PyDataProvider2 as pydp2 diff --git a/python/paddle/v2/dataset/common.py b/python/paddle/v2/dataset/common.py index 4a2eb59c340f5d0d3818170e56d730330e0bab29..645f3cc0dce70752c20569523e4bab440861f6a1 100644 --- a/python/paddle/v2/dataset/common.py +++ b/python/paddle/v2/dataset/common.py @@ -22,6 +22,8 @@ import importlib import paddle.v2.dataset import cPickle import glob +import cPickle as pickle +import random __all__ = [ 'DATA_HOME', 'download', 'md5file', 'split', 'cluster_files_reader', @@ -170,8 +172,6 @@ def convert(output_path, name_prefix, max_lines_to_shuffle=1000): import recordio - import cPickle as pickle - import random """ Convert data from reader to recordio format files. @@ -201,8 +201,10 @@ def convert(output_path, def write_data(w, lines): random.shuffle(lines) for i, d in enumerate(lines): - d = pickle.dumps(d, pickle.HIGHEST_PROTOCOL) - w[i % num_shards].write(d) + # FIXME(Yancey1989): + # dumps with protocol: pickle.HIGHEST_PROTOCOL + o = pickle.dumps(d) + w[i % num_shards].write(o) w = open_writers() lines = [] diff --git a/python/paddle/v2/dataset/mq2007.py b/python/paddle/v2/dataset/mq2007.py index fd71b341662ca6f540ce44a86348e782561a97d7..cffb319ad8f56ccddba3fef63e1b6ec68e5bac1e 100644 --- a/python/paddle/v2/dataset/mq2007.py +++ b/python/paddle/v2/dataset/mq2007.py @@ -212,19 +212,19 @@ def gen_pair(querylist, partial_order="full"): for j in range(i + 1, len(querylist)): query_right = querylist[j] if query_left.relevance_score > query_right.relevance_score: - labels.append(1) + labels.append([1]) docpairs.append([ np.array(query_left.feature_vector), np.array(query_right.feature_vector) ]) elif query_left.relevance_score < query_right.relevance_score: - labels.append(1) + labels.append([1]) docpairs.append([ np.array(query_right.feature_vector), np.array(query_left.feature_vector) ]) for label, pair in zip(labels, docpairs): - yield label, pair[0], pair[1] + yield np.array(label), pair[0], pair[1] def gen_list(querylist): diff --git a/python/paddle/v2/event.py b/python/paddle/v2/event.py index fd6050fa339d280ad54e40128ea6bae25132c873..7589cc9917f26375d595e200245d5ba099bc38d7 100644 --- a/python/paddle/v2/event.py +++ b/python/paddle/v2/event.py @@ -9,8 +9,6 @@ There are: * BeginPass * EndPass """ -import py_paddle.swig_paddle as api - __all__ = [ 'EndIteration', 'BeginIteration', 'BeginPass', 'EndPass', 'TestResult' ] @@ -18,6 +16,7 @@ __all__ = [ class WithMetric(object): def __init__(self, evaluator): + import py_paddle.swig_paddle as api if not isinstance(evaluator, api.Evaluator): raise 
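The changes to python/paddle/v2/__init__.py and event.py above (and to inference.py further below) all apply the same deferred-import pattern, so that a plain `import paddle.v2` no longer requires the compiled swig bindings. A minimal sketch of the idea; the helper name is hypothetical:

def use_swig_api():
    # The heavy compiled module is resolved on first call, not at import time.
    import py_paddle.swig_paddle as api
    return api

# Importing the module that defines use_swig_api() never touches py_paddle;
# an ImportError can only surface once the swig-backed functionality is used.
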
TypeError("Evaluator should be api.Evaluator type") self.__evaluator__ = evaluator diff --git a/python/paddle/v2/framework/create_op_creation_methods.py b/python/paddle/v2/framework/create_op_creation_methods.py new file mode 100644 index 0000000000000000000000000000000000000000..c2a7ae7692b08762ffbc91726be7bfa90e8ddedb --- /dev/null +++ b/python/paddle/v2/framework/create_op_creation_methods.py @@ -0,0 +1,246 @@ +import paddle.v2.framework.core as core +import paddle.v2.framework.proto.op_proto_pb2 as op_proto_pb2 +import paddle.v2.framework.proto.op_desc_pb2 as op_desc_pb2 +import paddle.v2.framework.proto.attr_type_pb2 as attr_type_pb2 +import cStringIO + + +def get_all_op_protos(): + """ + Get all registered op proto from Paddle C++ + :return: list of OpProto + """ + protostrs = core.get_all_op_protos() + ret_values = [] + for pbstr in protostrs: + op_proto = op_proto_pb2.OpProto.FromString(str(pbstr)) + ret_values.append(op_proto) + return ret_values + + +class OpDescCreationMethod(object): + """ + A Functor object to convert user input(use key word args) to OpDesc based on + OpProto. + + :param op_proto: The OpProto object. + :type op_proto: op_proto_pb2.OpProto + """ + + def __init__(self, op_proto): + if not isinstance(op_proto, op_proto_pb2.OpProto): + raise TypeError("Argument should be OpProto") + self.__op_proto__ = op_proto + + def __call__(self, *args, **kwargs): + """ + Convert user input to OpDesc. Only key-word args are supported. + :return: OpDesc based on user input + :rtype: op_desc_pb2.OpDesc + """ + if len(args) != 0: + raise ValueError("Only keyword arguments is supported by Paddle") + op_desc = op_desc_pb2.OpDesc() + + # Inputs + ipts, ipt_format, _ = OpDescCreationMethod.extract_input_or_output( + "input", kwargs, self.__op_proto__.inputs) + op_desc.inputs.extend(ipts) + if ipt_format is not None: + op_desc.attrs.extend([ipt_format]) + + # Outputs + outs, out_format, tmp_index = OpDescCreationMethod.extract_input_or_output( + "output", kwargs, self.__op_proto__.outputs) + op_desc.outputs.extend(outs) + if out_format is not None: + op_desc.attrs.extend([out_format]) + if len(tmp_index) != 0: + tmp_index_attr = op_desc.attrs.add() + tmp_index_attr.type = attr_type_pb2.INTS + tmp_index_attr.name = "temporary_index" + tmp_index_attr.ints.extend(tmp_index) + + # Types + op_desc.type = self.__op_proto__.type + + # Attrs + for attr in self.__op_proto__.attrs: + if attr.generated: + continue + user_defined_attr = kwargs.get(attr.name, None) + if user_defined_attr is not None: + new_attr = op_desc.attrs.add() + new_attr.name = attr.name + new_attr.type = attr.type + if attr.type == attr_type_pb2.INT: + new_attr.i = user_defined_attr + elif attr.type == attr_type_pb2.FLOAT: + new_attr.f = user_defined_attr + elif attr.type == attr_type_pb2.STRING: + new_attr.s = user_defined_attr + elif attr.type == attr_type_pb2.INTS: + new_attr.ints.extend(user_defined_attr) + elif attr.type == attr_type_pb2.FLOATS: + new_attr.floats.extend(user_defined_attr) + elif attr.type == attr_type_pb2.STRINGS: + new_attr.strings.extend(user_defined_attr) + else: + raise NotImplementedError("Not support attribute type " + + attr.type) + + return op_desc + + @staticmethod + def extract_input_or_output(in_out, kwargs, meta): + """ + Extract input variable names or output variable names from key-word + arguments, which base on VarProtos. + + :param in_out: "input" or "output" + :param kwargs: key-word arguments that user inputted. 
+ :param meta: a list of VarProto + :return: The three object will be return. The variable names. The + input_format or output_format attribute(None if the input or output is + not multiple). The temporary variable index list. + """ + multiple = OpDescCreationMethod.any_is_true((m.multiple for m in meta)) + tmp_index = [] + retv = [] + if multiple: + var_format = op_desc_pb2.AttrDesc() + var_format.type = attr_type_pb2.INTS + var_format.name = "%s_format" % in_out + var_format.ints.append(0) + + for var in meta: + var_name = var.name + + if var.temporary: + var_name = [core.var_names.temp()] + tmp_index.append(len(retv)) + else: + var_name = kwargs.get(var_name, []) + if not isinstance(var_name, list): + var_name = [var_name] + retv.extend(var_name) + var_format.ints.append(len(var_name) + var_format.ints[-1]) + return retv, var_format, tmp_index + else: + for var in meta: + if var.temporary: + retv.append(kwargs.get(var.name, core.var_names.temp())) + tmp_index.append(len(retv)) + else: + retv.append(kwargs.get(var.name, core.var_names.empty())) + return retv, None, tmp_index + + @staticmethod + def any_is_true(generator): + """ + Reduce a bool array to one. If any of them is True, then return True. + """ + for flag in generator: + if flag: + return True + return False + + +def get_docstring_from_op_proto(op_proto): + """ + Generate docstring from a OpProto + :param op_proto: a OpProto instance. + :type op_proto: op_proto_pb2.OpProto + :return: docstring + """ + if not isinstance(op_proto, op_proto_pb2.OpProto): + raise TypeError("Input must be OpProto") + f = cStringIO.StringIO() + f.write(op_proto.comment) + f.write("\n") + + def __append_param__(name, comment, type): + # Maybe replace the following line with template engine is better. + f.write(":param ") + f.write(name) + f.write(": ") + f.write(comment) + f.write("\n") + f.write(":type ") + f.write(name) + f.write(": ") + f.write(type) + f.write("\n") + + for ipt in op_proto.inputs: + __append_param__(ipt.name, ipt.comment, "list | basestr" + if ipt.multiple else "basestr") + + temp_var_prefix = \ + "This is a temporary variable. It does not have to set by user. " + for opt in op_proto.outputs: + __append_param__(opt.name, opt.comment if not opt.temporary else + temp_var_prefix + opt.comment, "list | basestr" + if opt.multiple else "basestr") + + for attr in op_proto.attrs: + attr_type = None + if attr.type == attr_type_pb2.INT: + attr_type = "int" + elif attr.type == attr_type_pb2.FLOAT: + attr_type = "float" + elif attr.type == attr_type_pb2.STRING: + attr_type = "basestr" + elif attr.type == attr_type_pb2.INTS: + attr_type = "list of int" + elif attr.type == attr_type_pb2.FLOATS: + attr_type = "list of float" + elif attr.type == attr_type_pb2.STRINGS: + attr_type = "list of basestr" + + if attr_type is None: + raise RuntimeError("Not supported attribute type " + attr.type) + + __append_param__(attr.name, attr.comment, attr_type) + + return f.getvalue() + + +def create_op_creation_method(op_proto): + """ + Generate op creation method for an OpProto + """ + method = OpDescCreationMethod(op_proto) + + def __impl__(*args, **kwargs): + opdesc = method(*args, **kwargs) + return core.Operator.create(opdesc.SerializeToString()) + + __impl__.__doc__ = get_docstring_from_op_proto(op_proto) + return __impl__ + + +class OpCreationsHolder(object): + """ + A object will holds all op creation methods. + + Use `op_creations.xxx_op` to access them. 
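The running offsets written into the input_format attribute above can be reproduced by hand; the numbers below match what test_op_creation_methods.py later in this patch expects for an "fc" op called as method(X=['x1', 'x2', 'x3'], W=['w1', 'w2', 'w3'], b='b', Y='y'):

offsets = [0]
for names in (['x1', 'x2', 'x3'], ['w1', 'w2', 'w3'], ['b']):
    # Each declared input slot appends a running offset into the flattened
    # name list ['x1', 'x2', 'x3', 'w1', 'w2', 'w3', 'b'].
    offsets.append(offsets[-1] + len(names))
assert offsets == [0, 3, 6, 7]
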
+ """ + pass + + +op_creations = OpCreationsHolder() + + +def __bootstrap__(): + """ + Bootstrap function for this module. It will dynamic create all op creation + methods in runtime. + """ + for op_proto in get_all_op_protos(): + func = create_op_creation_method(op_proto) + func.__name__ = str(op_proto.type) + setattr(op_creations, func.__name__, func) + + +__bootstrap__() diff --git a/python/paddle/v2/framework/default_scope_funcs.py b/python/paddle/v2/framework/default_scope_funcs.py new file mode 100644 index 0000000000000000000000000000000000000000..4e772326c94b7ee44906c71f13e9420e078a1917 --- /dev/null +++ b/python/paddle/v2/framework/default_scope_funcs.py @@ -0,0 +1,83 @@ +""" +Default scope function. + +`Paddle` manages Scope as programming language's scope. It just a +thread-local stack of Scope. Top of that stack is current scope, the bottom +of that stack is all scopes' parent. + +Invoking `create_var/get_var` can `create/get` variable in current scope. +Invoking `enter_local_scope/leave_local_scope` can create or destroy local +scope. + +A `scoped_function` will take a `function` as input. That function will be +invoked in a new local scope. +""" + +import paddle.v2.framework.core +import threading + +__tl_scope__ = threading.local() + +__all__ = [ + 'get_cur_scope', 'enter_local_scope', 'leave_local_scope', 'create_var', + 'get_var', 'scoped_function' +] + + +def get_cur_scope(): + """ + Get current scope. + :rtype: paddle.v2.framework.core.Scope + """ + cur_scope_stack = getattr(__tl_scope__, 'cur_scope', None) + if cur_scope_stack is None: + __tl_scope__.cur_scope = list() + if len(__tl_scope__.cur_scope) == 0: + __tl_scope__.cur_scope.append(paddle.v2.framework.core.Scope(None)) + return __tl_scope__.cur_scope[-1] + + +def enter_local_scope(): + """ + Enter a new local scope + """ + cur_scope = get_cur_scope() + new_scope = paddle.v2.framework.core.Scope(cur_scope) + __tl_scope__.cur_scope.append(new_scope) + + +def leave_local_scope(): + """ + Leave local scope + """ + __tl_scope__.cur_scope.pop() + + +def create_var(name): + """ + create variable in current scope. + """ + return get_cur_scope().create_var(name) + + +def get_var(name): + """ + get variable in current scope. + """ + return get_cur_scope().get_var(name) + + +def scoped_function(func): + """ + invoke `func` in new scope. + + :param func: a callable function that will be run in new scope. 
+ :type func: callable + """ + enter_local_scope() + try: + func() + except: + raise + finally: + leave_local_scope() diff --git a/python/paddle/v2/framework/tests/CMakeLists.txt b/python/paddle/v2/framework/tests/CMakeLists.txt index d809917af14aaf0ffc3a3d95fd5d8fbe028a61cb..86fc60f26aee0fbdcf4ac4938d20d26d35df57f6 100644 --- a/python/paddle/v2/framework/tests/CMakeLists.txt +++ b/python/paddle/v2/framework/tests/CMakeLists.txt @@ -1 +1,2 @@ -add_python_test(test_framework test_protobuf.py test_scope.py) +add_python_test(test_framework test_protobuf.py test_scope.py + test_default_scope_funcs.py test_op_creation_methods.py) diff --git a/python/paddle/v2/framework/tests/test_default_scope_funcs.py b/python/paddle/v2/framework/tests/test_default_scope_funcs.py new file mode 100644 index 0000000000000000000000000000000000000000..81033deb1546c81e2566ec5474f45dc56781644a --- /dev/null +++ b/python/paddle/v2/framework/tests/test_default_scope_funcs.py @@ -0,0 +1,33 @@ +from paddle.v2.framework.default_scope_funcs import * +import unittest + + +class TestDefaultScopeFuncs(unittest.TestCase): + def test_cur_scope(self): + self.assertIsNotNone(get_cur_scope()) + + def test_none_variable(self): + self.assertIsNone(get_var("test")) + + def test_create_var_get_var(self): + var_a = create_var("var_a") + self.assertIsNotNone(var_a) + self.assertIsNotNone(get_cur_scope().get_var('var_a')) + enter_local_scope() + self.assertIsNotNone(get_cur_scope().get_var('var_a')) + leave_local_scope() + + def test_var_get_int(self): + def __new_scope__(): + i = create_var("var_i") + self.assertFalse(i.is_int()) + i.set_int(10) + self.assertTrue(i.is_int()) + self.assertEqual(10, i.get_int()) + + for _ in xrange(10): + scoped_function(__new_scope__) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/framework/tests/test_op_creation_methods.py b/python/paddle/v2/framework/tests/test_op_creation_methods.py new file mode 100644 index 0000000000000000000000000000000000000000..41db7c0d535aa920b34d6cc346090a8c15bfb110 --- /dev/null +++ b/python/paddle/v2/framework/tests/test_op_creation_methods.py @@ -0,0 +1,254 @@ +import unittest +import paddle.v2.framework.create_op_creation_methods as creation +import paddle.v2.framework.core as core +import paddle.v2.framework.proto.op_proto_pb2 as op_proto_pb2 +import paddle.v2.framework.proto.op_desc_pb2 as op_desc_pb2 +import paddle.v2.framework.proto.attr_type_pb2 as attr_type_pb2 + + +class TestGetAllProtos(unittest.TestCase): + def test_all(self): + all_protos = creation.get_all_op_protos() + self.assertNotEqual(0, len(all_protos)) + + for each in all_protos: + self.assertTrue(each.IsInitialized()) + + +class TestOpDescCreationMethod(unittest.TestCase): + def test_plain_input_output(self): + op = op_proto_pb2.OpProto() + op.type = "test" + ipt = op.inputs.add() + ipt.name = "X" + ipt.comment = "not matter" + + ipt = op.inputs.add() + ipt.name = "Y" + ipt.comment = "not matter" + + opt = op.outputs.add() + opt.name = "Z" + opt.comment = "not matter" + + op.comment = "not matter" + + self.assertTrue(op.IsInitialized()) + + method = creation.OpDescCreationMethod(op) + output = method(X="a", Y="b", Z="c") + + expected = op_desc_pb2.OpDesc() + expected.type = "test" + expected.inputs.extend(["a", "b"]) + expected.outputs.append("c") + self.assertEqual(expected, output) + + def test_multiple_input_plain_output(self): + op = op_proto_pb2.OpProto() + op.type = "fc" + ipt = op.inputs.add() + ipt.name = "X" + ipt.comment = "" + ipt.multiple = True + + ipt = 
+        ipt.name = "W"
+        ipt.comment = ""
+        ipt.multiple = True
+
+        ipt = op.inputs.add()
+        ipt.name = "b"
+        ipt.comment = ""
+
+        out = op.outputs.add()
+        out.name = "Y"
+        out.comment = ""
+
+        op.comment = ""
+        self.assertTrue(op.IsInitialized())
+        method = creation.OpDescCreationMethod(op)
+
+        generated1 = method(X="x", W="w", b="b", Y="y")
+        expected1 = op_desc_pb2.OpDesc()
+        expected1.inputs.extend(['x', 'w', 'b'])
+        expected1.outputs.extend(['y'])
+        expected1.type = 'fc'
+        attr = expected1.attrs.add()
+        attr.name = 'input_format'
+        attr.type = attr_type_pb2.INTS
+        attr.ints.extend([0, 1, 2, 3])
+        self.assertEqual(expected1, generated1)
+
+        generated2 = method(
+            X=['x1', 'x2', 'x3'], b='b', W=['w1', 'w2', 'w3'], Y='y')
+        expected2 = op_desc_pb2.OpDesc()
+        expected2.inputs.extend(['x1', 'x2', 'x3', 'w1', 'w2', 'w3', 'b'])
+        expected2.outputs.extend(['y'])
+        expected2.type = 'fc'
+        attr = expected2.attrs.add()
+        attr.name = 'input_format'
+        attr.type = attr_type_pb2.INTS
+        attr.ints.extend([0, 3, 6, 7])
+        self.assertEqual(expected2, generated2)
+
+    def test_attrs(self):
+        op = op_proto_pb2.OpProto()
+        op.type = "test"
+        ipt = op.inputs.add()
+        ipt.name = 'X'
+        ipt.comment = ""
+
+        def __add_attr__(name, type):
+            attr = op.attrs.add()
+            attr.name = name
+            attr.comment = ""
+            attr.type = type
+
+        __add_attr__("int_attr", attr_type_pb2.INT)
+        __add_attr__("float_attr", attr_type_pb2.FLOAT)
+        __add_attr__("string_attr", attr_type_pb2.STRING)
+        __add_attr__("ints_attr", attr_type_pb2.INTS)
+        __add_attr__("floats_attr", attr_type_pb2.FLOATS)
+        __add_attr__("strings_attr", attr_type_pb2.STRINGS)
+
+        op.comment = ""
+        self.assertTrue(op.IsInitialized())
+
+        method = creation.OpDescCreationMethod(op)
+
+        generated = method(
+            X="a",
+            int_attr=10,
+            float_attr=3.2,
+            string_attr="test_str",
+            ints_attr=[0, 1, 2, 3, 4],
+            floats_attr=[0.2, 3.2, 4.5],
+            strings_attr=["a", "b", "c"])
+
+        expected = op_desc_pb2.OpDesc()
+        expected.type = "test"
+        expected.inputs.extend(['a'])
+        attr = expected.attrs.add()
+        attr.name = "int_attr"
+        attr.type = attr_type_pb2.INT
+        attr.i = 10
+
+        attr = expected.attrs.add()
+        attr.name = "float_attr"
+        attr.type = attr_type_pb2.FLOAT
+        attr.f = 3.2
+
+        attr = expected.attrs.add()
+        attr.name = "string_attr"
+        attr.type = attr_type_pb2.STRING
+        attr.s = "test_str"
+
+        attr = expected.attrs.add()
+        attr.name = "ints_attr"
+        attr.type = attr_type_pb2.INTS
+        attr.ints.extend([0, 1, 2, 3, 4])
+
+        attr = expected.attrs.add()
+        attr.name = 'floats_attr'
+        attr.type = attr_type_pb2.FLOATS
+        attr.floats.extend([0.2, 3.2, 4.5])
+
+        attr = expected.attrs.add()
+        attr.name = 'strings_attr'
+        attr.type = attr_type_pb2.STRINGS
+        attr.strings.extend(['a', 'b', 'c'])
+
+        self.assertEqual(expected, generated)
+
+    def test_input_temporary_output(self):
+        op = op_proto_pb2.OpProto()
+        op.type = "test"
+        out = op.outputs.add()
+        out.name = "OUT"
+        out.comment = ""
+
+        out = op.outputs.add()
+        out.name = "TMP"
+        out.comment = ""
+        out.temporary = True
+
+        out = op.outputs.add()
+        out.name = "OUT2"
+        out.comment = ""
+        op.comment = ""
+
+        method = creation.OpDescCreationMethod(op)
+        generated = method(OUT="a", OUT2="b")
+        desc = op_desc_pb2.OpDesc()
+        desc.outputs.extend(["a", core.var_names.temp(), "b"])
+        desc.type = "test"
+        attr = desc.attrs.add()
+        attr.name = "temporary_index"
+        attr.type = attr_type_pb2.INTS
+        attr.ints.append(2)
+        self.assertEqual(generated, desc)
+
+
+class TestOpCreationDocStr(unittest.TestCase):
+    def test_all(self):
+        op = op_proto_pb2.OpProto()
+        op.type = "test"
op.type = "test" + op.comment = """Test Op. + +This op is used for unit test, not a real op. +""" + a = op.inputs.add() + a.name = "a" + a.comment = "Input a for test op" + a.multiple = True + + b = op.inputs.add() + b.name = "b" + b.comment = "Input b for test op" + self.assertTrue(op.IsInitialized()) + + o1 = op.outputs.add() + o1.name = "output" + o1.comment = "The output of test op" + + o2 = op.outputs.add() + o2.name = "temp output" + o2.comment = "The temporary output of test op" + o2.temporary = True + + test_str = op.attrs.add() + test_str.name = "str_attr" + test_str.type = attr_type_pb2.STRING + test_str.comment = "A string attribute for test op" + + actual = creation.get_docstring_from_op_proto(op) + expected_docstring = '''Test Op. + +This op is used for unit test, not a real op. + +:param a: Input a for test op +:type a: list | basestr +:param b: Input b for test op +:type b: basestr +:param output: The output of test op +:type output: basestr +:param temp output: This is a temporary variable. It does not have to set by user. The temporary output of test op +:type temp output: basestr +:param str_attr: A string attribute for test op +:type str_attr: basestr +''' + self.assertEqual(expected_docstring, actual) + + +class TestOpCreations(unittest.TestCase): + def test_all(self): + add_op = creation.op_creations.add_two(X="a", Y="b", Out="z") + self.assertIsNotNone(add_op) + # Invoke C++ DebugString() + self.assertEqual('Op(add_two), inputs:(a, b), outputs:(z).', + str(add_op)) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/v2/inference.py b/python/paddle/v2/inference.py index 34b7308601390a4ccb0c19ef10d2c7a60b3fa576..40134a3270c3579fd2f6a891af66ff241050f60c 100644 --- a/python/paddle/v2/inference.py +++ b/python/paddle/v2/inference.py @@ -1,9 +1,7 @@ import numpy -import py_paddle.swig_paddle as api import collections import topology import minibatch -from data_feeder import DataFeeder __all__ = ['infer', 'Inference'] @@ -28,6 +26,7 @@ class Inference(object): """ def __init__(self, output_layer, parameters): + import py_paddle.swig_paddle as api topo = topology.Topology(output_layer) gm = api.GradientMachine.createFromConfigProto( topo.proto(), api.CREATE_MODE_TESTING, [api.PARAMETER_VALUE]) @@ -40,6 +39,7 @@ class Inference(object): self.__data_types__ = topo.data_type() def iter_infer(self, input, feeding=None): + from data_feeder import DataFeeder feeder = DataFeeder(self.__data_types__, feeding) batch_size = len(input) diff --git a/python/paddle/v2/master/client.py b/python/paddle/v2/master/client.py index 70f9e43c9683033233d48a750668771a4c7ba045..4c041fb509903008a7a5648a112b2472ed856aea 100644 --- a/python/paddle/v2/master/client.py +++ b/python/paddle/v2/master/client.py @@ -10,8 +10,9 @@ class client(object): client is a client to the master server. 
""" - def __init__(self, addr, buf_size): - self.c = lib.paddle_new_master_client(addr, buf_size) + def __init__(self, etcd_endpoints, timeout, buf_size): + self.c = lib.paddle_new_etcd_master_client(etcd_endpoints, timeout, + buf_size) def close(self): lib.paddle_release_master_client(self.c) diff --git a/python/paddle/v2/optimizer.py b/python/paddle/v2/optimizer.py index 390c22ee552c506fde1567efba1326a6d735ad2e..ba581980334fec6226a537af2cf53b3465d32c1e 100644 --- a/python/paddle/v2/optimizer.py +++ b/python/paddle/v2/optimizer.py @@ -1,5 +1,3 @@ -import py_paddle.swig_paddle as swig_api - import paddle.trainer_config_helpers.config_parser_utils as config_parser_utils import paddle.trainer_config_helpers.optimizers as v1_optimizers """ @@ -18,6 +16,7 @@ __all__ = [ class Optimizer(object): def __init__(self, **kwargs): + import py_paddle.swig_paddle as swig_api if 'batch_size' in kwargs: del kwargs['batch_size'] # not important for python library. @@ -36,23 +35,27 @@ class Optimizer(object): For each optimizer(SGD, Adam), GradientMachine should enable different buffers. """ + import py_paddle.swig_paddle as swig_api tmp = swig_api.ParameterOptimizer.create(self.__opt_conf__) assert isinstance(tmp, swig_api.ParameterOptimizer) return tmp.getParameterTypes() def __create_local_updater__(self): + import py_paddle.swig_paddle as swig_api return swig_api.ParameterUpdater.createLocalUpdater(self.__opt_conf__) def __create_remote_updater__(self, pass_num, use_sparse_updater): + import py_paddle.swig_paddle as swig_api return swig_api.ParameterUpdater.createRemoteUpdater( self.__opt_conf__, pass_num, use_sparse_updater) - def __create_new_remote_updater__(self, pserver_spec): + def __create_new_remote_updater__(self, pserver_spec, use_etcd): + import py_paddle.swig_paddle as swig_api return swig_api.ParameterUpdater.createNewRemoteUpdater( - self.__opt_conf__, pserver_spec) + self.__opt_conf__, pserver_spec, use_etcd) def create_updater(self, is_local, num_passes, use_sparse_updater, - pserver_spec): + pserver_spec, use_etcd): """ create proper parameter_updater by configuration. 
         :param is_local: create local or remote parameter updater
@@ -78,7 +81,7 @@ class Optimizer(object):
                 num_passes, use_sparse_updater)
         else:
             parameter_updater = self.__create_new_remote_updater__(
-                pserver_spec)
+                pserver_spec, use_etcd)
         return parameter_updater
 
 
@@ -268,6 +271,7 @@ ModelAverage = v1_optimizers.ModelAverage
 L2Regularization = v1_optimizers.L2Regularization
 
 if __name__ == '__main__':
+    import py_paddle.swig_paddle as swig_api
     swig_api.initPaddle('--use_gpu=false')
     for opt in [
             Momentum(), Adam(), Adamax(), AdaGrad(), DecayedAdaGrad(),
diff --git a/python/paddle/v2/parameters.py b/python/paddle/v2/parameters.py
index bbaf8bfa979fbbf460561ebf7077b75b9c41a11a..a9cba8ca0b1efd4149463f6c7bf2dcdfbea350c9 100644
--- a/python/paddle/v2/parameters.py
+++ b/python/paddle/v2/parameters.py
@@ -1,5 +1,4 @@
 import numpy as np
-import py_paddle.swig_paddle as api
 from paddle.proto.ParameterConfig_pb2 import ParameterConfig
 import paddle.trainer.config_parser as cp
 import struct
@@ -124,6 +123,7 @@ class Parameters(object):
         :return: parameter value
         :rtype: np.ndarray
         """
+        import py_paddle.swig_paddle as api
         shape = self.get_shape(key)
 
         if len(self.__gradient_machines__) == 0:
@@ -223,7 +223,7 @@ class Parameters(object):
         :type gradient_machine: api.GradientMachine
         :return:
         """
-
+        import py_paddle.swig_paddle as api
         if not isinstance(gradient_machine, api.GradientMachine):
             raise ValueError("gradient_machine should be api.GradientMachine")
 
@@ -359,6 +359,7 @@ def __copy_parameter_to_gradient_machine__(gradient_machine, name, arr):
     :return:
     :rtype: api.Parameter
     """
+    import py_paddle.swig_paddle as api
     param = __get_parameter_in_gradient_machine__(gradient_machine, name)
     vec = param.getBuf(api.PARAMETER_VALUE)
     assert isinstance(vec, api.Vector)
diff --git a/python/paddle/v2/trainer.py b/python/paddle/v2/trainer.py
index 96c6c4b89a2f2e2c3ecb95213e0e0191b1998f50..76bae0bb12b6c33f88530386f9cc19ae9b59f457 100644
--- a/python/paddle/v2/trainer.py
+++ b/python/paddle/v2/trainer.py
@@ -2,12 +2,6 @@ Module Trainer
 """
 import collections
-import gzip
-import os
-
-import py_paddle.swig_paddle as api
-
-from data_feeder import DataFeeder
 from topology import Topology
 from . import event as v2_event
 from . import optimizer as v2_optimizer
@@ -51,7 +45,8 @@ class SGD(object):
                  update_equation,
                  extra_layers=None,
                  is_local=True,
-                 pserver_spec=None):
+                 pserver_spec=None,
+                 use_etcd=True):
 
         if not isinstance(parameters, v2_parameters.Parameters):
             raise TypeError('parameters should be parameters')
@@ -59,6 +54,7 @@ class SGD(object):
         if not isinstance(update_equation, v2_optimizer.Optimizer):
             raise TypeError("update equation parameter must be "
                             "paddle.v2.optimizer.Optimizer")
+        import py_paddle.swig_paddle as api
         topology = Topology(cost, extra_layers=extra_layers)
         self.__optimizer__ = update_equation
         self.__topology__ = topology
@@ -66,6 +62,7 @@ class SGD(object):
         self.__topology_in_proto__ = topology.proto()
         self.__is_local__ = is_local
         self.__pserver_spec__ = pserver_spec
+        self.__use_etcd__ = use_etcd
         self.__use_sparse_updater__ = self.__topology__.use_sparse_updater()
         #
         # In local mode, disable sparse_remote_update.
@@ -124,13 +121,15 @@ class SGD(object):
         :type feeding: dict|list
         :return:
         """
+        import py_paddle.swig_paddle as api
+        from data_feeder import DataFeeder
         if event_handler is None:
             event_handler = default_event_handler
         __check_train_args__(**locals())
 
         self.__parameter_updater__ = self.__optimizer__.create_updater(
             self.__is_local__, num_passes, self.__use_sparse_updater__,
-            self.__pserver_spec__)
+            self.__pserver_spec__, self.__use_etcd__)
         self.__parameter_updater__.init(self.__gradient_machine__)
 
         self.__gradient_machine__.start()
@@ -187,6 +186,8 @@ class SGD(object):
         :type feeding: dict
         :return:
         """
+        import py_paddle.swig_paddle as api
+        from data_feeder import DataFeeder
         feeder = DataFeeder(self.__data_types__, feeding)
         evaluator = self.__gradient_machine__.makeEvaluator()
         out_args = api.Arguments.createArguments(0)
diff --git a/python/setup.py.in b/python/setup.py.in
index 310ac403a98fb142813a3f9d3ca9147b03127ea0..65a26940d4d703ea4fbb5022523a90716982ec10 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -20,7 +20,8 @@ setup_requires=["requests",
                 "matplotlib",
                 "rarfile",
                 "scipy>=0.19.0",
-                "Pillow"]
+                "Pillow",
+                "nltk"]
 
 if '${CMAKE_SYSTEM_PROCESSOR}' not in ['arm', 'armv7-a', 'aarch64']:
     setup_requires+=["opencv-python"]
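
Usage note (not part of the patch): the sketch below shows how the new Python pieces added above, the generated op creation methods and the default scope helpers, could be exercised together. It mirrors the new unit tests and assumes the compiled paddle.v2.framework.core module is importable and that an add_two operator is registered, as those tests do.

    # Illustration only; mirrors test_default_scope_funcs.py and
    # test_op_creation_methods.py.
    import paddle.v2.framework.create_op_creation_methods as creation
    from paddle.v2.framework.default_scope_funcs import (create_var, get_var,
                                                         scoped_function)


    def __build_add__():
        # Variables created here live in a fresh local scope.
        create_var("a")
        create_var("b")
        # add_two was generated at import time by __bootstrap__() from the
        # registered OpProto messages; str() invokes the C++ DebugString().
        add_op = creation.op_creations.add_two(X="a", Y="b", Out="z")
        print(str(add_op))  # e.g. Op(add_two), inputs:(a, b), outputs:(z).


    scoped_function(__build_add__)
    assert get_var("a") is None  # the local scope was destroyed on exit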