Commit b142a6b0 authored by W wanghaoshuang

Merge branch 'develop' of https://github.com/paddlepaddle/paddle into voc_dataset

......@@ -4,6 +4,7 @@ cache:
- $HOME/.ccache
- $HOME/.cache/pip
- $TRAVIS_BUILD_DIR/build/third_party
- $TRAVIS_BUILD_DIR/build_android/third_party
sudo: required
dist: trusty
os:
......@@ -11,6 +12,7 @@ os:
env:
- JOB=build_doc
- JOB=check_style
- JOB=build_android
addons:
apt:
packages:
......
......@@ -28,7 +28,9 @@ if(NOT CMAKE_CROSSCOMPILING)
endif(NOT CMAKE_CROSSCOMPILING)
find_package(Git REQUIRED)
find_package(Threads REQUIRED)
find_package(Boost QUIET)
if(NOT ANDROID)
find_package(Boost QUIET)
endif()
include(simd)
......@@ -135,7 +137,8 @@ if(WITH_GPU)
endif(WITH_GPU)
if(USE_NNPACK)
list(APPEND EXTERNAL_LIBS ${NNPACK_LIB} ${PTHREADPOOL_LIB} "rt")
include(external/nnpack)
list(APPEND EXTERNAL_LIBS ${NNPACK_LIBS})
endif(USE_NNPACK)
add_subdirectory(proto)
......@@ -151,7 +154,9 @@ if(WITH_GOLANG)
endif(WITH_GOLANG)
add_subdirectory(paddle)
add_subdirectory(python)
if(WITH_PYTHON)
add_subdirectory(python)
endif()
if(WITH_DOC)
add_subdirectory(doc)
endif()
......@@ -25,7 +25,7 @@ COPY ./paddle/scripts/docker/root/ /root/
RUN apt-get update && \
apt-get install -y \
git python-pip python-dev openssh-server bison \
wget unzip tar xz-utils bzip2 gzip coreutils ntp \
wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
curl sed grep graphviz libjpeg-dev zlib1g-dev \
python-numpy python-matplotlib gcc g++ \
automake locales clang-format-3.8 swig doxygen cmake \
......
......@@ -14,6 +14,17 @@ RUN apt-get update && \
wget curl tar unzip gcc g++ locales clang-format-3.8 swig cmake && \
apt-get clean -y
# Install Go and glide
RUN wget -O go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz && \
tar -C /usr/local -xzf go.tgz && \
mkdir /root/gopath && \
mkdir /root/gopath/bin && \
mkdir /root/gopath/src && \
rm go.tgz
ENV GOROOT=/usr/local/go GOPATH=/root/gopath
# must not be on the same line as the GOROOT definition, otherwise docker build cannot find GOROOT.
ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin
# git credential to skip password typing
RUN git config --global credential.helper store
......
......@@ -102,12 +102,19 @@ if(WITH_GOLANG)
message(FATAL_ERROR "no glide executable found: $ENV{GOPATH}/bin/glide")
endif()
add_custom_target(go_vendor)
add_custom_command(TARGET go_vendor
# this command will only run when the file it depends on is missing
# or has changed, or when the output is missing.
add_custom_command(OUTPUT ${CMAKE_BINARY_DIR}/glide
COMMAND env GOPATH=${GOPATH} ${GLIDE} install
COMMAND touch ${CMAKE_BINARY_DIR}/glide
DEPENDS ${PROJ_ROOT}/go/glide.lock
WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go"
)
add_dependencies(go_vendor go_path)
)
# Because go_vendor depends on the custom command that outputs
# ${CMAKE_BINARY_DIR}/glide, that command does not need to run
# every time this target is built.
add_custom_target(go_vendor DEPENDS ${CMAKE_BINARY_DIR}/glide go_path)
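# Net effect (summarizing the two comments above): `glide install` runs only
# when go/glide.lock changes or the ${CMAKE_BINARY_DIR}/glide stamp file is
# missing, instead of on every build as the old TARGET-attached command did.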
endif()
endif(WITH_GOLANG)
......@@ -27,7 +27,8 @@ set(IGNORE_PATTERN
.*cblas\\.h.*
.*\\.pb\\.txt
.*LtrDataProvider.*
.*MultiDataProvider.*)
.*MultiDataProvider.*
.*pb.*)
# add_style_check_target
#
......@@ -52,14 +53,13 @@ macro(add_style_check_target TARGET_NAME)
endif()
endforeach()
if(LINT MATCHES ON)
# cpplint code style
get_filename_component(base_filename ${filename} NAME)
set(CUR_GEN ${CMAKE_CURRENT_BINARY_DIR}/${base_filename}.cpplint)
add_custom_command(OUTPUT ${CUR_GEN}
PRE_BUILD
COMMAND env ${py_env} "${PYTHON_EXECUTABLE}" "${PROJ_ROOT}/paddle/scripts/cpplint.py"
"--filter=${STYLE_FILTER}"
"--write-success=${CUR_GEN}" ${filename}
DEPENDS ${filename}
add_custom_command(TARGET ${TARGET_NAME} PRE_BUILD
COMMAND "${PYTHON_EXECUTABLE}" "${PROJ_ROOT}/paddle/scripts/cpplint.py"
"--filter=${STYLE_FILTER}"
"--write-success=${CUR_GEN}" ${filename}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
endif()
endforeach()
......
......@@ -106,6 +106,10 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
SET(CMAKE_SYSTEM_PROCESSOR armv7-a)
ENDIF()
ENDIF()
IF(ANDROID_ABI STREQUAL "arm64-v8a")
SET(ANDROID_TOOLCHAIN_NAME aarch64-linux-android)
SET(CMAKE_SYSTEM_PROCESSOR aarch64)
ENDIF()
SET(ANDROID_TOOLCHAIN_PREFIX "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_NAME}-")
ENDIF()
......@@ -162,6 +166,10 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
ENDIF()
ENDIF()
IF(ANDROID_ABI STREQUAL "arm64-v8a")
LIST(APPEND ANDROID_COMPILER_FLAGS -march=armv8-a)
ENDIF()
STRING(REPLACE ";" " " ANDROID_COMPILER_FLAGS "${ANDROID_COMPILER_FLAGS}")
STRING(REPLACE ";" " " ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS}")
......@@ -186,6 +194,10 @@ ELSE()
SET(CMAKE_ANDROID_STANDALONE_TOOLCHAIN ${ANDROID_STANDALONE_TOOLCHAIN})
ENDIF()
SET(CMAKE_ANDROID_ARCH_ABI ${ANDROID_ABI})
SET(CMAKE_ANDROID_ARM_MODE ${ANDROID_ARM_MODE})
SET(CMAKE_ANDROID_ARM_NEON ${ANDROID_ARM_NEON})
IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$")
SET(CMAKE_ANDROID_ARM_MODE ${ANDROID_ARM_MODE})
IF(ANDROID_ABI STREQUAL "armeabi-v7a")
SET(CMAKE_ANDROID_ARM_NEON ${ANDROID_ARM_NEON})
ENDIF()
ENDIF()
ENDIF()
......@@ -52,6 +52,7 @@ ExternalProject_Add(
ADD_LIBRARY(glog STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET glog PROPERTY IMPORTED_LOCATION ${GLOG_LIBRARIES})
ADD_DEPENDENCIES(glog extern_glog)
ADD_DEPENDENCIES(glog extern_glog gflags)
LINK_LIBRARIES(glog gflags)
LIST(APPEND external_project_dependencies glog)
......@@ -7,10 +7,24 @@ set(NNPACK_ROOT $ENV{NNPACK_ROOT} CACHE PATH "Folder contains NNPACK")
find_path(NNPACK_INC_DIR nnpack.h PATHS ${NNPACK_ROOT}/include)
find_library(NNPACK_LIB NAMES nnpack PATHS ${NNPACK_ROOT}/lib)
find_library(PTHREADPOOL_LIB NAMES pthreadpool PATHS ${NNPACK_ROOT}/lib)
find_library(NNPACK_UKERNELS_LIB NAMES nnpack_ukernels PATHS ${NNPACK_ROOT}/lib)
find_library(NNPACK_CPUFEATURES_LIB NAMES cpufeatures PATHS ${NNPACK_ROOT}/lib)
if(NNPACK_INC_DIR AND NNPACK_LIB AND PTHREADPOOL_LIB)
set(NNPACK_FOUND ON)
INCLUDE_DIRECTORIES(${NNPACK_INC_DIR})
set(NNPACK_LIBS)
list(APPEND NNPACK_LIBS ${NNPACK_LIB} ${PTHREADPOOL_LIB})
if (NNPACK_UKERNELS_LIB)
list(APPEND NNPACK_LIBS ${NNPACK_UKERNELS_LIB})
endif()
if (NNPACK_CPUFEATURES_LIB)
list(APPEND NNPACK_LIBS ${NNPACK_CPUFEATURES_LIB})
endif()
if(NOT ANDROID)
list(APPEND NNPACK_LIBS "rt")
endif()
else()
message(FATAL_ERROR "Cannot find NNPACK in (${NNPACK_ROOT})")
endif()
......@@ -32,7 +32,12 @@ IF(NOT ${CBLAS_FOUND})
# arm_soft_fp_abi branch of OpenBLAS to support softfp
# https://github.com/xianyi/OpenBLAS/tree/arm_soft_fp_abi
SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5")
SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER} TARGET=ARMV7 ARM_SOFTFP_ABI=1 USE_THREAD=0)
IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$")
SET(TARGET "ARMV7")
ELSEIF(ANDROID_ABI STREQUAL "arm64-v8a")
SET(TARGET "ARMV8")
ENDIF()
SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER} TARGET=${TARGET} ARM_SOFTFP_ABI=1 USE_THREAD=0)
ELSEIF(RPI)
# use hardfp
SET(OPENBLAS_COMMIT "v0.2.19")
......
......@@ -90,11 +90,11 @@
# including binary directory for generated headers.
include_directories(${CMAKE_CURRENT_BINARY_DIR})
if(NOT APPLE)
if(NOT APPLE AND NOT ANDROID)
find_package(Threads REQUIRED)
link_libraries(${CMAKE_THREAD_LIBS_INIT})
set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -ldl -lrt")
endif(NOT APPLE)
endif(NOT APPLE AND NOT ANDROID)
function(merge_static_libs TARGET_NAME)
set(libs ${ARGN})
......@@ -104,6 +104,7 @@ function(merge_static_libs TARGET_NAME)
foreach(lib ${libs})
list(APPEND libs_deps ${${lib}_LIB_DEPENDS})
endforeach()
list(REMOVE_DUPLICATES libs_deps)
if(APPLE) # Use OSX's libtool to merge archives
# To produce a library we need at least one source file.
......@@ -127,7 +128,7 @@ function(merge_static_libs TARGET_NAME)
# Get the file names of the libraries to be merged
set(libfiles ${libfiles} $<TARGET_FILE:${lib}>)
endforeach()
add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
COMMAND rm "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a"
COMMAND /usr/bin/libtool -static -o "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" ${libfiles})
else() # general UNIX: use "ar" to extract objects and re-add to a common lib
......@@ -145,11 +146,11 @@ function(merge_static_libs TARGET_NAME)
DEPENDS ${lib} ${objdir}
WORKING_DIRECTORY ${objdir})
# Empty dummy source file that goes into merged library
set(mergebase ${lib}.mergebase.c)
add_custom_command(OUTPUT ${mergebase}
COMMAND ${CMAKE_COMMAND} -E touch ${mergebase}
DEPENDS ${objlistfile})
list(APPEND mergebases "${mergebase}")
endforeach()
......@@ -184,6 +185,10 @@ function(cc_library TARGET_NAME)
add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
endif()
# cpplint code style
add_style_check_target(${TARGET_NAME} ${cc_library_SRCS})
else(cc_library_SRCS)
if (cc_library_DEPS)
merge_static_libs(${TARGET_NAME} ${cc_library_DEPS})
......@@ -337,7 +342,7 @@ function(go_test TARGET_NAME)
string(REPLACE "${PADDLE_GO_PATH}" "" CMAKE_CURRENT_SOURCE_REL_DIR ${CMAKE_CURRENT_SOURCE_DIR})
add_custom_target(${TARGET_NAME} ALL DEPENDS go_vendor ${go_test_DEPS})
add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} test
COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} test -race
-c -o "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}"
".${CMAKE_CURRENT_SOURCE_REL_DIR}"
WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go")
......
......@@ -474,6 +474,11 @@ prelu
.. autoclass:: paddle.v2.layer.prelu
:noindex:
gated_unit
-----------
.. autoclass:: paddle.v2.layer.gated_unit
:noindex:
Detection output Layer
======================
......
......@@ -11,6 +11,7 @@ import (
"github.com/namsral/flag"
log "github.com/sirupsen/logrus"
"github.com/topicai/candy"
"github.com/PaddlePaddle/Paddle/go/master"
"github.com/PaddlePaddle/Paddle/go/utils/networkhelper"
......@@ -20,11 +21,18 @@ func main() {
port := flag.Int("port", 8080, "port of the master server.")
ttlSec := flag.Int("ttl", 60, "etcd lease TTL in seconds.")
endpoints := flag.String("endpoints", "http://127.0.0.1:2379", "comma separated etcd endpoints. If empty, fault tolerance will not be enabled.")
taskTimeoutDur := flag.Duration("task_timout_dur", 20*time.Minute, "task timout duration.")
taskTimeoutMax := flag.Int("task_timeout_max", 3, "max timtout count for each task before it being declared failed task.")
chunkPerTask := flag.Int("chunk_per_task", 10, "chunk per task.")
taskTimeoutDur := flag.Duration("task-timout-dur", 20*time.Minute, "task timout duration.")
taskTimeoutMax := flag.Int("task-timeout-max", 3, "max timtout count for each task before it being declared failed task.")
chunkPerTask := flag.Int("chunk-per-task", 10, "chunk per task.")
logLevel := flag.String("log-level", "info",
"log level, possible values: debug, info, warning, error, fatal, panic")
flag.Parse()
level, e := log.ParseLevel(*logLevel)
candy.Must(e)
log.SetLevel(level)
if *endpoints == "" {
log.Warningln("-endpoints not set, fault tolerance will not be enabled.")
}
......
......@@ -40,7 +40,7 @@ func main() {
idx = *index
} else {
e = pserver.NewEtcdClient(*etcdEndpoint, *numPservers, *etcdTimeout)
idx, err = e.Register()
idx, err = e.Register(*port)
candy.Must(err)
cp, err = pserver.NewCheckpointFromFile(*checkpointPath, idx, e)
......
......@@ -2,6 +2,7 @@ package master
import (
"os"
"time"
"github.com/PaddlePaddle/Paddle/go/connection"
"github.com/PaddlePaddle/recordio"
......@@ -36,9 +37,9 @@ func (c *Client) getRecords() {
for {
t, err := c.getTask()
if err != nil {
// TODO(helin): wait before move on with next
// getTask call.
log.Errorln(err)
log.Errorf("Get task failed, sleep 3 seconds and continue, %s", err)
time.Sleep(3 * time.Second)
continue
}
......
......@@ -215,6 +215,7 @@ func readChunks(globPaths []string) ([]Chunk, error) {
}
count := index.NumChunks()
log.Infof("readChunks: file %s has %d chunks", path, count)
for i := 0; i < count; i++ {
chunk := Chunk{
Path: path,
......
import paddle.v2 as paddle
import paddle.v2.dataset.uci_housing as uci_housing
import paddle.v2.master as master
import os
import cPickle as pickle
etcd_ip = os.getenv("MASTER_IP", "127.0.0.1")
etcd_endpoint = "http://" + etcd_ip + ":2379"
def cloud_reader():
print "connecting to master, etcd endpoints: ", etcd_endpoint
master_client = master.client(etcd_endpoint, 5, 64)
master_client.set_dataset(
["/pfs/dlnel/public/dataset/uci_housing/uci_housing-*-of-*"])
while 1:
r, e = master_client.next_record()
if not r:
break
yield pickle.loads(r)
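# cloud_reader follows the paddle.v2 reader convention: a generator that
# yields one deserialized training sample at a time, ending the pass when
# the master has no more records.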
def main():
......@@ -22,13 +40,13 @@ def main():
# create an optimizer for the new remote updater to the pserver
optimizer = paddle.optimizer.Momentum(momentum=0)
#TODO(zhihong) : replace optimizer with new OptimizerConfig
print "etcd endoint: ", etcd_endpoint
trainer = paddle.trainer.SGD(cost=cost,
parameters=parameters,
update_equation=optimizer,
is_local=False,
pserver_spec="localhost:3000")
pserver_spec=etcd_endpoint,
use_etcd=True)
# event_handler to print training and testing info
def event_handler(event):
......@@ -47,11 +65,11 @@ def main():
print "Test %d, %.2f" % (event.pass_id, result.cost)
# training
# NOTE: use uci_housing.train() as reader for non-paddlecloud training
trainer.train(
reader=paddle.batch(
paddle.reader.shuffle(
uci_housing.train(), buf_size=500),
batch_size=2),
cloud_reader, buf_size=500), batch_size=2),
feeding={'x': 0,
'y': 1},
event_handler=event_handler,
......
......@@ -3,11 +3,13 @@ package client_test
import (
"context"
"io/ioutil"
"math/rand"
"net"
"net/http"
"net/rpc"
"strconv"
"strings"
"sync"
"testing"
"time"
......@@ -100,27 +102,34 @@ func (l lister) List() []client.Server {
return l
}
func ClientTest(t *testing.T, c *client.Client) {
func testClient(t *testing.T, c *client.Client) {
selected := c.BeginInitParams()
if !selected {
t.Fatal("should be selected.")
}
const numParameter = 100
const numParameter = 1000
config, err := ioutil.ReadFile("./c/test/testdata/optimizer.pb")
if err != nil {
t.Fatalf("read optimizer proto failed")
}
var wg sync.WaitGroup
for i := 0; i < numParameter; i++ {
var p pserver.Parameter
p.Name = "p_" + strconv.Itoa(i)
p.ElementType = pserver.Float32
p.Content = make([]byte, (i+1)*100)
err := c.InitParam(pserver.ParameterWithConfig{Param: p, Config: config})
if err != nil {
t.Fatal(err)
}
wg.Add(1)
go func(i int) {
var p pserver.Parameter
p.Name = "p_" + strconv.Itoa(i)
p.ElementType = pserver.Float32
p.Content = make([]byte, (i+1)*100)
err := c.InitParam(pserver.ParameterWithConfig{Param: p, Config: config})
if err != nil {
t.Fatal(err)
}
wg.Done()
}(i)
}
wg.Wait()
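// Each InitParam call above now runs in its own goroutine; wg.Wait() makes
// sure every parameter is initialized before FinishInitParams is sent.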
err = c.FinishInitParams()
if err != nil {
......@@ -128,7 +137,7 @@ func ClientTest(t *testing.T, c *client.Client) {
}
var grads []pserver.Gradient
for i := 0; i < numParameter/2; i++ {
for i := 0; i < numParameter; i++ {
var g pserver.Gradient
g.Name = "p_" + strconv.Itoa(i)
g.ElementType = pserver.Float32
......@@ -136,9 +145,31 @@ func ClientTest(t *testing.T, c *client.Client) {
grads = append(grads, g)
}
err = c.SendGrads(grads)
if err != nil {
t.Fatal(err)
const paramPerGroup = 10
const numGroups = numParameter / paramPerGroup
// shuffle send grads order
for i := range grads {
j := rand.Intn(i + 1)
grads[i], grads[j] = grads[j], grads[i]
}
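// The loop above is a Fisher-Yates shuffle: sending the gradient groups in
// random order exercises the client's handling of concurrent, out-of-order
// SendGrads calls below.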
for i := 0; i < numGroups; i++ {
var gs []pserver.Gradient
if i == numGroups-1 {
gs = grads[i*paramPerGroup:]
} else {
gs = grads[i*paramPerGroup : (i+1)*paramPerGroup]
}
wg.Add(1)
go func(gs []pserver.Gradient) {
err := c.SendGrads(gs)
if err != nil {
t.Fatal(err)
}
wg.Done()
}(gs)
}
names := make([]string, numParameter)
......@@ -146,20 +177,35 @@ func ClientTest(t *testing.T, c *client.Client) {
names[i] = "p_" + strconv.Itoa(i)
}
params, err := c.GetParams(names)
if err != nil {
t.Fatal(err)
}
for i := 0; i < numGroups; i++ {
var ns []string
if i == numGroups-1 {
ns = names[i*paramPerGroup:]
} else {
ns = names[i*paramPerGroup : (i+1)*paramPerGroup]
}
if len(names) != len(params) {
t.Fatalf("parameter size not match, need: %d, have: %d", len(names), len(params))
}
wg.Add(1)
go func(ns []string) {
params, err := c.GetParams(ns)
if err != nil {
t.Fatal(err)
}
for i := range params {
if names[i] != params[i].Name {
t.Fatalf("order of returned parameter does not required: parameter name: %s, required name: %s", names[i], params[i].Name)
}
if len(ns) != len(params) {
t.Fatalf("parameter size not match, need: %d, have: %d", len(names), len(params))
}
for i := range params {
if ns[i] != params[i].Name {
t.Fatalf("order of returned parameter does not required: parameter name: %s, required name: %s", ns[i], params[i].Name)
}
}
wg.Done()
}(ns)
}
wg.Wait()
}
func TestNativeClient(t *testing.T) {
......@@ -169,13 +215,14 @@ func TestNativeClient(t *testing.T) {
servers[i] = client.Server{Index: i, Addr: ":" + strconv.Itoa(pserverClientPorts[i])}
}
c1 := client.NewClient(lister(servers), len(servers), selector(true))
ClientTest(t, c1)
testClient(t, c1)
}
// TODO: temporarily disable the etcdClient test due to its dependency on etcd
// EtcdClient is a disabled test, since we have not embedded etcd into
// our test.
func EtcdClient(t *testing.T) {
initEtcdClient()
etcdClient := client.NewEtcd(etcdEndpoints)
c2 := client.NewClient(etcdClient, etcdClient.Desired(), selector(true))
ClientTest(t, c2)
testClient(t, c2)
}
......@@ -12,6 +12,7 @@ import (
)
const (
// DefaultEtcdTimeout is the default etcd timeout
DefaultEtcdTimeout time.Duration = 5 * time.Second
)
......@@ -66,12 +67,12 @@ func (p *EtcdClient) List() []Server {
for {
for i := 0; i < psDesired; i++ {
ctx, cancel := context.WithTimeout(context.Background(), p.timeout)
cancel()
psKey := pserver.PsPath + strconv.Itoa(i)
log.Debugf("checking %s", psKey)
resp, err := p.client.Get(ctx, psKey)
cancel()
if err != nil {
log.Infof("Get psKey= %s error, %v", psKey, err)
log.Infof("Get psKey=%s error, %v", psKey, err)
time.Sleep(p.timeout)
continue
}
......
......@@ -49,7 +49,7 @@ func NewEtcdClient(endpoints string, numPservers int, timeout time.Duration) *Et
// Register registers the pserver on etcd
//
// Register returns the index of the current pserver.
func (e *EtcdClient) Register() (int, error) {
func (e *EtcdClient) Register(port int) (int, error) {
var err error
e.externalIP, err = networkhelper.GetExternalIP()
......@@ -116,7 +116,7 @@ func (e *EtcdClient) Register() (int, error) {
for {
ctx, cancel := context.WithTimeout(context.Background(), time.Second)
var err error
pserverIdx, err = e.registerPserverEtcd(ctx)
pserverIdx, err = e.registerPserverEtcd(ctx, port)
cancel()
if err != nil {
log.Warn(err)
......@@ -140,7 +140,7 @@ func (e *EtcdClient) initDesiredPservers(ctx context.Context, numPservers int) (
}
// registerPserverEtcd registers pserver node on etcd using transaction.
func (e *EtcdClient) registerPserverEtcd(ctx context.Context) (int, error) {
func (e *EtcdClient) registerPserverEtcd(ctx context.Context, port int) (int, error) {
var idx int
_, err := concurrency.NewSTM(e.etcdClient, func(c concurrency.STM) error {
registered := false
......@@ -156,8 +156,9 @@ func (e *EtcdClient) registerPserverEtcd(ctx context.Context) (int, error) {
log.Fatal(err)
}
// find the first id and write info
c.Put(psKey, e.externalIP, clientv3.WithLease(resp.ID))
log.Debugf("set pserver node %s with value %s", psKey, e.externalIP)
pserverAddr := e.externalIP + ":" + strconv.Itoa(port)
c.Put(psKey, pserverAddr, clientv3.WithLease(resp.ID))
log.Debugf("set pserver node %s with value %s", psKey, pserverAddr)
ch, kaerr := e.etcdClient.KeepAlive(context.TODO(), resp.ID)
if kaerr != nil {
log.Errorf("keepalive etcd node error: %v", kaerr)
......
......@@ -19,6 +19,7 @@ var nullPtr = unsafe.Pointer(uintptr(0))
type optimizer struct {
opt *C.struct_paddle_optimizer
elementType ElementType
contentLen int
}
func cArrayToSlice(p unsafe.Pointer, len int) []byte {
......@@ -37,10 +38,11 @@ func cArrayToSlice(p unsafe.Pointer, len int) []byte {
func newOptimizer(paramWithConfigs ParameterWithConfig, State []byte) *optimizer {
o := &optimizer{}
o.elementType = paramWithConfigs.Param.ElementType
o.contentLen = len(paramWithConfigs.Param.Content)
p := paramWithConfigs.Param
c := paramWithConfigs.Config
s := State
paramBufferSize := C.size_t(len(p.Content) / C.sizeof_float)
paramBufferSize := C.size_t(len(p.Content))
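// paramBufferSize is now the raw byte length of p.Content; the old code
// divided by sizeof(float) and therefore passed an element count instead.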
log.WithFields(log.Fields{
"ElementType": p.ElementType,
"ParamSize": paramBufferSize,
......@@ -78,7 +80,11 @@ func (o *optimizer) UpdateParameter(g Gradient) error {
return fmt.Errorf("Name: %s, parameter and gradient element type not match, parameter: %v, gradient: %v", g.Name, o.elementType, g.ElementType)
}
r := C.paddle_update_parameter(o.opt, C.paddle_element_type(g.ElementType), unsafe.Pointer(&g.Content[0]), C.int(len(g.Content))/C.sizeof_float)
if o.contentLen != len(g.Content) {
return fmt.Errorf("Name: %s, parameter and gradient does not have same content len, parameter: %d, gradient: %d", g.Name, o.contentLen, len(g.Content))
}
r := C.paddle_update_parameter(o.opt, C.paddle_element_type(g.ElementType), unsafe.Pointer(&g.Content[0]), C.int(len(g.Content)))
if r != 0 {
return fmt.Errorf("optimizer update returned error code: %d", r)
}
......
......@@ -31,7 +31,7 @@ func TestServiceFull(t *testing.T) {
err = s.InitParam(pserver.ParameterWithConfig{Param: p, Config: config}, nil)
if err != nil {
t.FailNow()
t.Fatal(err)
}
var p1 pserver.Parameter
......@@ -40,40 +40,40 @@ func TestServiceFull(t *testing.T) {
p1.ElementType = pserver.Float32
err = s.InitParam(pserver.ParameterWithConfig{Param: p1, Config: config}, nil)
if err != nil {
t.FailNow()
t.Fatal(err)
}
err = s.FinishInitParams(0, nil)
if err != nil {
t.FailNow()
t.Fatal(err)
}
var param pserver.Parameter
err = s.GetParam("param_b", &param)
if err != nil {
t.FailNow()
t.Fatal(err)
}
if !reflect.DeepEqual(param, p1) {
t.FailNow()
t.Fatal("not equal:", param, p1)
}
g1, g2 := pserver.Gradient(p1), pserver.Gradient(p)
err = s.SendGrad(g1, nil)
if err != nil {
t.FailNow()
t.Fatal(err)
}
err = s.SendGrad(g2, nil)
if err != nil {
t.FailNow()
t.Fatal(err)
}
var param1 pserver.Parameter
err = s.GetParam("param_a", &param1)
if err != nil {
t.FailNow()
t.Fatal(err)
}
// don't compare content, since it's already changed by
......@@ -82,7 +82,7 @@ func TestServiceFull(t *testing.T) {
p.Content = nil
if !reflect.DeepEqual(param1, p) {
t.FailNow()
t.Fatal("not equal:", param1, p)
}
}
......@@ -90,16 +90,16 @@ func TestMultipleInit(t *testing.T) {
var cp pserver.Checkpoint
s, err := pserver.NewService(0, 1, "", nil, cp)
if err != nil {
t.Error(err)
t.Fatal(err)
}
err = s.FinishInitParams(0, nil)
if err != nil {
t.FailNow()
t.Fatal(err)
}
err = s.FinishInitParams(0, nil)
if err.Error() != pserver.AlreadyInitialized {
t.FailNow()
t.Fatal(err)
}
}
......@@ -108,7 +108,7 @@ func TestUninitialized(t *testing.T) {
s, err := pserver.NewService(0, 1, "", nil, cp)
err = s.SendGrad(pserver.Gradient{}, nil)
if err.Error() != pserver.Uninitialized {
t.FailNow()
t.Fatal(err)
}
}
......@@ -154,12 +154,12 @@ func TestBlockUntilInitialized(t *testing.T) {
err = s.InitParam(pserver.ParameterWithConfig{Param: p, Config: config}, nil)
if err != nil {
t.FailNow()
t.Fatal(err)
}
err = s.FinishInitParams(0, nil)
if err != nil {
t.FailNow()
t.Fatal(err)
}
wg.Wait()
......
......@@ -14,6 +14,7 @@ if(Boost_FOUND)
add_subdirectory(memory)
add_subdirectory(platform)
add_subdirectory(framework)
add_subdirectory(operators)
add_subdirectory(pybind)
endif()
......
......@@ -64,11 +64,7 @@ ModelConfig* TrainerConfig::getModelConfig() const {
ParameterConfig::ParameterConfig() : m(new ParameterConfigPrivate()) {}
ParameterConfig::~ParameterConfig() {
if (m) {
delete m;
}
}
ParameterConfig::~ParameterConfig() { delete m; }
ParameterConfig* ParameterConfig::createParameterConfigFromParameterSharedPtr(
void* ptr) {
......@@ -98,11 +94,7 @@ void* ParameterConfig::getRawPtr() { return m->getConfigPtr(); }
OptimizationConfig::OptimizationConfig() : m(new OptimizationConfigPrivate()) {}
OptimizationConfig::~OptimizationConfig() {
if (m) {
delete m;
}
}
OptimizationConfig::~OptimizationConfig() { delete m; }
std::string OptimizationConfig::toProtoString() {
return m->getConfig().SerializeAsString();
......
......@@ -843,7 +843,8 @@ public:
bool useSparseUpdater);
static ParameterUpdater* createNewRemoteUpdater(
OptimizationConfig* config,
const std::string pserverSpec) throw(UnsupportError);
const std::string pserverSpec,
const bool useEtcd) throw(UnsupportError);
~ParameterUpdater();
/**
......
......@@ -53,11 +53,7 @@ struct ParameterTraverseCallbackPrivate {
ParameterOptimizer::ParameterOptimizer() : m(new ParameterOptimizerPrivate()) {}
ParameterOptimizer::~ParameterOptimizer() {
if (m) {
delete m;
}
}
ParameterOptimizer::~ParameterOptimizer() { delete m; }
ParameterOptimizer* ParameterOptimizer::create(OptimizationConfig* config) {
CHECK(config != nullptr);
......@@ -104,11 +100,7 @@ std::vector<int> ParameterOptimizer::getParameterTypes() const {
ParameterTraverseCallback::ParameterTraverseCallback()
: m(new ParameterTraverseCallbackPrivate()) {}
ParameterTraverseCallback::~ParameterTraverseCallback() {
if (m) {
delete m;
}
}
ParameterTraverseCallback::~ParameterTraverseCallback() { delete m; }
void ParameterTraverseCallback::apply(const std::vector<Vector*>& vecs,
const ParameterConfig& conf,
......
......@@ -33,11 +33,12 @@ ParameterUpdater *ParameterUpdater::createLocalUpdater(
ParameterUpdater *ParameterUpdater::createNewRemoteUpdater(
OptimizationConfig *config,
const std::string pserverSpec) throw(UnsupportError) {
const std::string pserverSpec,
const bool useEtcd) throw(UnsupportError) {
#ifndef PADDLE_WITHOUT_GOLANG
auto updater = new ParameterUpdater();
updater->m->updater.reset(new paddle::NewRemoteParameterUpdater(
config->m->getConfig(), pserverSpec));
config->m->getConfig(), pserverSpec, useEtcd));
return updater;
#else
throw UnsupportError();
......
......@@ -171,11 +171,7 @@ struct VectorPrivate {
Vector::Vector() : m(new VectorPrivate()) {}
Vector::~Vector() {
if (m) {
delete m;
}
}
Vector::~Vector() { delete m; }
Vector* Vector::createZero(size_t sz, bool useGpu) {
auto retVec = new Vector();
......
# ddim lib
cc_library(enforce SRCS enforce.cc DEPS glog)
cc_test(enforce_test SRCS enforce_test.cc DEPS enforce)
cc_library(ddim SRCS ddim.cc)
cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
nv_test(dim_test SRCS dim_test.cu DEPS ddim)
cc_test(tensor_test SRCS tensor_test.cc DEPS ddim)
cc_library(tensor SRCS tensor.cc DEPS ddim place enforce paddle_memory)
cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
cc_test(variable_test SRCS variable_test.cc)
cc_test(scope_test SRCS scope_test.cc)
cc_test(enforce_test SRCS enforce_test.cc)
proto_library(attr_type SRCS attr_type.proto)
proto_library(op_proto SRCS op_proto.proto DEPS attr_type)
cc_test(op_proto_test SRCS op_proto_test.cc DEPS op_proto protobuf)
proto_library(op_desc SRCS op_desc.proto DEPS attr_type)
cc_test(op_desc_test SRCS op_desc_test.cc DEPS op_desc protobuf)
cc_library(operator SRCS operator.cc DEPS op_desc protobuf)
cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry place)
cc_library(op_registry SRCS op_registry.cc DEPS op_proto op_desc)
cc_library(operator SRCS operator.cc DEPS op_desc device_context tensor)
cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry)
cc_library(op_registry SRCS op_registry.cc DEPS op_proto op_desc enforce)
cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry operator)
py_proto_compile(framework_py_proto SRCS attr_type.proto op_proto.proto op_desc.proto)
# Generate an empty __init__.py to make framework_py_proto as a valid python module.
add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
add_dependencies(framework_py_proto framework_py_proto_init)
proto_library(net_proto SRCS net_proto.proto DEPS op_proto)
cc_library(net SRCS net.cc DEPS net_proto)
cc_library(net SRCS net.cc DEPS operator net_proto op_registry)
cc_test(net_op_test SRCS net_op_test.cc DEPS net)
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/framework/ddim.h"
#include "paddle/framework/enforce.h"
namespace paddle {
namespace framework {
///@cond HIDDEN
/// @cond HIDDEN
template <int i>
Dim<i> make_dim(const int* d) {
......@@ -50,7 +65,7 @@ void make_ddim(DDim& ddim, const int* dims, int n) {
}
}
///@endcond
/// @endcond
DDim make_ddim(std::initializer_list<int> dims) {
DDim result(make_dim(0));
......@@ -64,11 +79,11 @@ DDim make_ddim(const std::vector<int>& dims) {
return result;
}
///@cond HIDDEN
/// @cond HIDDEN
// XXX For some reason, putting this in an anonymous namespace causes errors
class DynamicMutableIndexer : public boost::static_visitor<int&> {
public:
DynamicMutableIndexer(int idx) : idx_(idx) {}
explicit DynamicMutableIndexer(int idx) : idx_(idx) {}
template <int D>
int& operator()(Dim<D>& dim) const {
......@@ -81,7 +96,7 @@ class DynamicMutableIndexer : public boost::static_visitor<int&> {
class DynamicConstIndexer : public boost::static_visitor<int> {
public:
DynamicConstIndexer(int idx) : idx_(idx) {}
explicit DynamicConstIndexer(int idx) : idx_(idx) {}
template <int D>
int operator()(const Dim<D>& dim) const {
......@@ -92,7 +107,7 @@ class DynamicConstIndexer : public boost::static_visitor<int> {
int idx_;
};
///@endcond
/// @endcond
int& DDim::operator[](int idx) {
return boost::apply_visitor(DynamicMutableIndexer(idx), var);
......@@ -102,6 +117,8 @@ int DDim::operator[](int idx) const {
return boost::apply_visitor(DynamicConstIndexer(idx), var);
}
ssize_t DDim::size() const { return arity(*this); }
bool DDim::operator==(DDim d) const {
if (var.which() != d.getVar().which()) {
return false;
......@@ -155,11 +172,11 @@ int get(const DDim& ddim, int idx) { return ddim[idx]; }
void set(DDim& ddim, int idx, int value) { ddim[idx] = value; }
///@cond HIDDEN
/// @cond HIDDEN
struct VectorizeVisitor : public boost::static_visitor<> {
std::vector<int>& vector;
VectorizeVisitor(std::vector<int>& v) : vector(v) {}
explicit VectorizeVisitor(std::vector<int>& v) : vector(v) {}
template <typename T>
void operator()(const T& t) {
......@@ -169,7 +186,7 @@ struct VectorizeVisitor : public boost::static_visitor<> {
void operator()(const Dim<1>& t) { vector.push_back(t.head); }
};
///@endcond
/// @endcond
std::vector<int> vectorize(const DDim& ddim) {
std::vector<int> result;
......@@ -178,16 +195,59 @@ std::vector<int> vectorize(const DDim& ddim) {
return result;
}
struct ProductVisitor : public boost::static_visitor<ssize_t> {
template <int D>
ssize_t operator()(const Dim<D>& dim) {
return product(dim);
}
};
ssize_t product(const DDim& ddim) {
ssize_t result = 1;
std::vector<int> v = vectorize(ddim);
for (auto i : v) {
result *= i;
}
return result;
}
ssize_t product(const DDim& ddim) {
ProductVisitor visitor;
return boost::apply_visitor(visitor, ddim);
}
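// The visitor lets product() run directly on the statically-typed Dim<D>
// held in the variant, instead of first materializing a std::vector as the
// old vectorize()-based implementation above did.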
struct SliceVectorizeVisitor : public boost::static_visitor<> {
std::vector<int>& vector;
int begin;
int end;
SliceVectorizeVisitor(std::vector<int>& v, int b, int e)
: vector(v), begin(b), end(e) {
PADDLE_ENFORCE(begin < end,
"Begin index must be less than end index in ddim slice.");
PADDLE_ENFORCE(begin >= 0,
"Begin index can't be less than zero in ddim slice.");
}
template <int S>
void operator()(const Dim<S>& dim) {
if (begin == 0) {
vector.push_back(dim.head);
} else {
--begin;
}
--end;
if (end > 0) {
this->operator()(dim.tail);
}
}
void operator()(const Dim<1>& dim) {
PADDLE_ENFORCE(end == 1, "End index in ddim slice is out of bound.");
vector.push_back(dim.head);
}
};
DDim slice_ddim(const DDim& dim, int begin, int end) {
std::vector<int> vec;
vec.reserve(end - begin);
SliceVectorizeVisitor visitor(vec, begin, end);
boost::apply_visitor(visitor, dim);
return make_ddim(vec);
}
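// Example (mirroring the doc comment in ddim.h): for d = make_ddim({1, 2, 3,
// 4, 5}), slice_ddim(d, 1, 3) skips one head element, collects dims until
// `end` is exhausted, and returns make_ddim({2, 3}).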
///\cond HIDDEN
/// \cond HIDDEN
struct ArityVisitor : boost::static_visitor<int> {
template <int D>
......@@ -196,15 +256,15 @@ struct ArityVisitor : boost::static_visitor<int> {
}
};
///\endcond
/// \endcond
int arity(const DDim& d) { return boost::apply_visitor(ArityVisitor(), d); }
///\cond HIDDEN
/// \cond HIDDEN
struct DDimPrinter : boost::static_visitor<void> {
std::ostream& os;
DDimPrinter(std::ostream& os_) : os(os_) {}
explicit DDimPrinter(std::ostream& os_) : os(os_) {}
template <typename T>
void operator()(const T& t) {
......@@ -212,7 +272,7 @@ struct DDimPrinter : boost::static_visitor<void> {
}
};
///\endcond
/// \endcond
std::ostream& operator<<(std::ostream& os, const DDim& ddim) {
DDimPrinter printer(os);
......
......@@ -27,7 +27,7 @@ struct DDim {
DDim() : var(Dim<1>()) {}
template <int D>
DDim(const Dim<D>& in) : var(in) {}
explicit DDim(const Dim<D>& in) : var(in) {}
template <int D>
DDim& operator=(const Dim<D>& in) {
......@@ -50,6 +50,8 @@ struct DDim {
DDimVar getVar() { return var; }
ssize_t size() const;
bool operator==(DDim d) const;
bool operator!=(DDim d) const;
......@@ -81,6 +83,15 @@ std::vector<int> vectorize(const DDim& ddim);
ssize_t product(const DDim& ddim);
/**
* \brief Slice a ddim
*
* Slice dim with [begin, end).
* e.g. DDim d = make_ddim({1,2,3,4,5});
* slice_ddim(d, 1, 3); ====> {2,3}
*/
DDim slice_ddim(const DDim& dim, int begin, int end);
/**
* \brief What is the length of this dimension?
*
......
......@@ -49,9 +49,30 @@ TEST(DDim, Equality) {
// arity of a DDim
EXPECT_EQ(paddle::framework::arity(ddim), 3);
EXPECT_EQ(ddim.size(), 3);
// product of a DDim
EXPECT_EQ(paddle::framework::product(vddim), 45);
EXPECT_EQ(
paddle::framework::product(paddle::framework::make_ddim({3, 2, 5, 3})),
90);
// slice a DDim
paddle::framework::DDim ddim2 =
paddle::framework::make_ddim({1, 2, 3, 4, 5, 6});
paddle::framework::DDim ss = paddle::framework::slice_ddim(ddim2, 2, 5);
EXPECT_EQ(arity(ss), 3);
EXPECT_EQ(ss[0], 3);
EXPECT_EQ(ss[1], 4);
EXPECT_EQ(ss[2], 5);
paddle::framework::DDim ss2 = paddle::framework::slice_ddim(ddim2, 0, 6);
EXPECT_EQ(arity(ss2), 6);
EXPECT_EQ(ss2[0], 1);
EXPECT_EQ(ss2[1], 2);
EXPECT_EQ(ss2[2], 3);
EXPECT_EQ(ss2[3], 4);
EXPECT_EQ(ss2[4], 5);
EXPECT_EQ(ss2[5], 6);
}
TEST(DDim, Print) {
......
#include <thrust/device_vector.h>
#include <sstream>
#include "paddle/framework/dim.h"
#include "gtest/gtest.h"
#include "paddle/framework/dim.h"
__global__ void test(paddle::framework::Dim<2>* o) {
o[0] = paddle::framework::make_dim(5, 6);
o[0] = paddle::framework::make_dim(5, 6);
}
__global__ void dyn_idx_gpu(int* o) {
auto d = paddle::framework::make_dim(5, 6);
o[0] = d[1];
auto d = paddle::framework::make_dim(5, 6);
o[0] = d[1];
}
TEST(Dim, Equality) {
// construct a Dim on the CPU
auto a = paddle::framework::make_dim(3, 4);
EXPECT_EQ(paddle::framework::get<0>(a), 3);
EXPECT_EQ(paddle::framework::get<1>(a), 4);
// construct a Dim on the GPU
thrust::device_vector<paddle::framework::Dim<2>> t(2);
test<<<1,1>>>(thrust::raw_pointer_cast(t.data()));
a = t[0];
EXPECT_EQ(paddle::framework::get<0>(a), 5);
EXPECT_EQ(paddle::framework::get<1>(a), 6);
// linearization
auto b = paddle::framework::make_dim(7, 8);
EXPECT_EQ(paddle::framework::linearize(a, b), 83);
// product
EXPECT_EQ(paddle::framework::product(a), 30);
// mutate a Dim
paddle::framework::get<1>(b) = 10;
EXPECT_EQ(paddle::framework::get<0>(b), 7);
EXPECT_EQ(paddle::framework::get<1>(b), 10);
// dynamic access
paddle::framework::get(b, 0) = 8;
b[1] = 11;
EXPECT_EQ(paddle::framework::get<0>(b), 8);
EXPECT_EQ(paddle::framework::get<1>(b), 11);
EXPECT_EQ(paddle::framework::get(b, 0), 8);
EXPECT_EQ(b[1], 11);
// dynamic access on GPU
thrust::device_vector<int> r(1);
dyn_idx_gpu<<<1,1>>>(thrust::raw_pointer_cast(r.data()));
int res = r[0];
EXPECT_EQ(res, 6);
// ex_prefix_mul
paddle::framework::Dim<3> c = paddle::framework::ex_prefix_mul(paddle::framework::Dim<3>(3, 4, 5));
EXPECT_EQ(paddle::framework::get<0>(c), 1);
EXPECT_EQ(paddle::framework::get<1>(c), 3);
EXPECT_EQ(paddle::framework::get<2>(c), 12);
// generate from an index
auto size = paddle::framework::make_dim(4, 5, 2);
c = paddle::framework::Dim<3>(14, size);
EXPECT_EQ(paddle::framework::get<0>(c), 2);
EXPECT_EQ(paddle::framework::get<1>(c), 3);
EXPECT_EQ(paddle::framework::get<2>(c), 0);
c = paddle::framework::Dim<3>(25, size);
EXPECT_EQ(paddle::framework::get<0>(c), 1);
EXPECT_EQ(paddle::framework::get<1>(c), 1);
EXPECT_EQ(paddle::framework::get<2>(c), 1);
// construct a Dim on the CPU
auto a = paddle::framework::make_dim(3, 4);
EXPECT_EQ(paddle::framework::get<0>(a), 3);
EXPECT_EQ(paddle::framework::get<1>(a), 4);
// construct a Dim on the GPU
thrust::device_vector<paddle::framework::Dim<2>> t(2);
test<<<1, 1>>>(thrust::raw_pointer_cast(t.data()));
a = t[0];
EXPECT_EQ(paddle::framework::get<0>(a), 5);
EXPECT_EQ(paddle::framework::get<1>(a), 6);
// linearization
auto b = paddle::framework::make_dim(7, 8);
EXPECT_EQ(paddle::framework::linearize(a, b), 83);
// product
EXPECT_EQ(paddle::framework::product(a), 30);
// mutate a Dim
paddle::framework::get<1>(b) = 10;
EXPECT_EQ(paddle::framework::get<0>(b), 7);
EXPECT_EQ(paddle::framework::get<1>(b), 10);
// dynamic access
paddle::framework::get(b, 0) = 8;
b[1] = 11;
EXPECT_EQ(paddle::framework::get<0>(b), 8);
EXPECT_EQ(paddle::framework::get<1>(b), 11);
EXPECT_EQ(paddle::framework::get(b, 0), 8);
EXPECT_EQ(b[1], 11);
// dynamic access on GPU
thrust::device_vector<int> r(1);
dyn_idx_gpu<<<1, 1>>>(thrust::raw_pointer_cast(r.data()));
int res = r[0];
EXPECT_EQ(res, 6);
// ex_prefix_mul
paddle::framework::Dim<3> c =
paddle::framework::ex_prefix_mul(paddle::framework::Dim<3>(3, 4, 5));
EXPECT_EQ(paddle::framework::get<0>(c), 1);
EXPECT_EQ(paddle::framework::get<1>(c), 3);
EXPECT_EQ(paddle::framework::get<2>(c), 12);
// generate from an index
auto size = paddle::framework::make_dim(4, 5, 2);
c = paddle::framework::Dim<3>(14, size);
EXPECT_EQ(paddle::framework::get<0>(c), 2);
EXPECT_EQ(paddle::framework::get<1>(c), 3);
EXPECT_EQ(paddle::framework::get<2>(c), 0);
c = paddle::framework::Dim<3>(25, size);
EXPECT_EQ(paddle::framework::get<0>(c), 1);
EXPECT_EQ(paddle::framework::get<1>(c), 1);
EXPECT_EQ(paddle::framework::get<2>(c), 1);
}
TEST(Dim, Bool) {
auto a = paddle::framework::make_dim(3, 4);
auto b = paddle::framework::make_dim(5, 6);
auto c = paddle::framework::make_dim(3, 4);
// in_bounds check
EXPECT_TRUE(paddle::framework::contained(a, b));
EXPECT_FALSE(paddle::framework::contained(b, a));
// comparison
EXPECT_TRUE(a == a);
EXPECT_FALSE(a == b);
EXPECT_TRUE(a == c);
auto a = paddle::framework::make_dim(3, 4);
auto b = paddle::framework::make_dim(5, 6);
auto c = paddle::framework::make_dim(3, 4);
// in_bounds check
EXPECT_TRUE(paddle::framework::contained(a, b));
EXPECT_FALSE(paddle::framework::contained(b, a));
// comparison
EXPECT_TRUE(a == a);
EXPECT_FALSE(a == b);
EXPECT_TRUE(a == c);
}
TEST(Dim, Print) {
{
std::stringstream ss;
auto a = paddle::framework::make_dim(2, 3);
ss << a;
EXPECT_EQ(ss.str(), "2, 3");
}
{
std::stringstream ss;
ss << paddle::framework::make_dim(8);
EXPECT_EQ(ss.str(), "8");
}
{
std::stringstream ss;
auto a = paddle::framework::make_dim(2, 3);
ss << a;
EXPECT_EQ(ss.str(), "2, 3");
}
{
std::stringstream ss;
ss << paddle::framework::make_dim(8);
EXPECT_EQ(ss.str(), "8");
}
}
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/framework/enforce.h"
......@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <glog/logging.h>
#include <paddle/string/printf.h>
#include <exception>
#include <sstream>
......@@ -58,12 +59,17 @@ class EnforceNotMet : public std::exception {
/**
* @brief Enforce a condition, otherwise throw an EnforceNotMet
*/
#ifdef NDEBUG
#define PADDLE_ENFORCE(condition, ...) \
do { \
if (UNLIKELY(!(condition))) { \
PADDLE_THROW(__VA_ARGS__); \
} \
} while (0)
#else
#define PADDLE_ENFORCE(condition, ...) \
CHECK(condition) << ::paddle::string::Sprintf(__VA_ARGS__);
#endif
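// Illustrative call site (not part of this commit): in NDEBUG builds a
// failed condition throws EnforceNotMet with the Sprintf-formatted message;
// in debug builds it aborts through glog's CHECK.
//
// PADDLE_ENFORCE(dims.size() == 2, "expected a matrix, got rank %d",
// dims.size());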
} // namespace framework
} // namespace paddle
/*
Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "paddle/framework/net.h"
namespace paddle {
namespace framework {
PlainNet::PlainNet(const NetDesc& def) {}
void PlainNet::InferShape(Scope* scope) {
for (auto& op : ops_) {
op.InferShape();
}
}
void PlainNet::Run(std::shared_ptr<Scope> scope, DeviceContext* ctx) {
for (auto& op : ops_) {
op.Run(ctx);
}
}
void PlainNet::CompleteAddOp() {
std::unordered_set<std::string> input_set;
std::unordered_set<std::string> output_set;
std::unordered_set<std::string> temp_output;
for (auto& op : ops_) {
for (auto& ipt : op->inputs_) {
if (!Contains(output_set, ipt)) { // Not other op's output
input_set.insert(ipt);
} else {
temp_output.insert(ipt);
}
}
for (auto& opt : op->outputs_) {
output_set.insert(opt);
}
}
inputs_.reserve(input_set.size());
std::copy(input_set.begin(), input_set.end(), std::back_inserter(inputs_));
outputs_.reserve(output_set.size());
std::vector<int> tmp_index;
tmp_index.reserve(temp_output.size());
int idx = 0;
for (auto& opt : output_set) {
if (Contains(temp_output, opt)) {
tmp_index.push_back(idx);
}
outputs_.push_back(opt);
++idx;
}
attrs_["temporary_index"] = tmp_index;
add_op_done_ = true;
}
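// CompleteAddOp seals the network: any variable produced by one op and
// consumed by another is recorded in the "temporary_index" attribute, so
// callers can tell internal intermediates from user-visible outputs.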
} // namespace framework
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <paddle/framework/op_desc.pb.h>
#include <paddle/framework/operator.h>
#include "paddle/framework/net_proto.pb.h"
#include "paddle/framework/op_proto.pb.h"
#include "paddle/framework/op_registry.h"
#include "paddle/framework/scope.h"
#include "paddle/platform/device_context.h"
namespace paddle {
namespace framework {
using namespace paddle::platform;
// operator's index stored in a network.
typedef int OpIndex;
/**
* NOTE the following code contains definitions of some unimplemented concepts.
* We write some basic implementations to make Net compilable. These APIs will
* keep updating as the related concepts are implemented.
*/
struct OpDesc;
struct OpAttrs {};
class Operator {
public:
Operator(const OpDesc &def) {}
void InferShape() {}
void Run(DeviceContext *ctx) {}
};
/**
* @brief Network that manage the operators it has.
* @brief Network is also a type of Operator
*
* It will manage the operators it has.
*
* Network is the container and controller of a set of operators; a user can
* build a real network from a NetDesc, which is a protobuf message, and use
* Network.Run() to run all the operators in the network.
* Network is the container and controller of a set of operators.
* A network object knows all Operators belonging to this network. Variables,
* which are inputs and outputs of these operators, are created and managed by a
* hierarchy of Scope objects.
*
* This is the base class of network, all the networks should implement the apis
* This is the base class of network, all the networks should implement the APIs
* it defines.
*/
class Net {
class Net : public OperatorBase {
public:
/**
* @brief Infer shapes of all inputs and outputs of operators.
*/
virtual void InferShape(Scope *scope) = 0;
/**
* @brief Run the network.
*
* Run all the operators and return success (true) or not, with all the
* variables located in `scope`. `context` describes the detailed execution
* environment for ops. `begin` and `end` specify the range of `ops_` to run;
* if no positive indexes are provided, all operators in `ops_` will run.
*/
virtual void Run(std::shared_ptr<Scope> scope, DeviceContext *ctx) = 0;
/**
* @brief Add an Operator according to `def`.
*/
virtual OpIndex AddOp(const OpProto &def) = 0;
/**
* @brief Add optimizer operators according to `attrs`.
*/
virtual void AddOptimizerOps(const OpAttrs &attrs) = 0;
/**
* @brief Add backward operators.
*/
virtual void AddBackwardOps() = 0;
/**
* @brief Create a network.
*/
static std::unique_ptr<Net> Create(const NetDesc &def = NetDesc());
virtual ~Net() {}
virtual void AddOp(const OperatorPtr& op) = 0;
virtual void CompleteAddOp() = 0;
};
using NetPtr = std::shared_ptr<Net>;
/**
* @brief a basic implementation of Net.
*
......@@ -103,18 +55,14 @@ class Net {
class PlainNet : public Net {
public:
/**
* @brief Initialize a PlainNet.
*
* Initialize from a network described by `def`. NetDesc is the definition of
* a network.
*/
PlainNet(const NetDesc &def);
/**
* Infer all the operators' input and output varialbes' shapes, will be called
* Infer all the operators' input and output variables' shapes, will be called
* before every mini-batch
*/
virtual void InferShape(Scope *scope) override;
void InferShape(const ScopePtr& scope) const override {
for (auto& op : ops_) {
op->InferShape(scope);
}
}
/**
* @brief Run the network.
......@@ -123,48 +71,32 @@ class PlainNet : public Net {
* scope will be used instead. If no OpContext is provided, default context
* will be used.
*/
virtual void Run(std::shared_ptr<Scope> scope, DeviceContext *ctx) override;
void Run(const ScopePtr& scope,
const platform::DeviceContext& dev_ctx) const override {
for (auto& op : ops_) {
op->Run(scope, dev_ctx);
}
}
/**
* @brief Add an operator to this network.
* @brief Add an operator by ptr
*/
virtual OpIndex AddOp(const OpProto &def) override;
void AddOp(const OperatorPtr& op) override {
PADDLE_ENFORCE(!add_op_done_, "Cannot AddOp when this network is sealed");
ops_.push_back(op);
}
/**
* @brief Add all optimizer operators related into the network.
*/
virtual void AddOptimizerOps(const OpAttrs &attrs) override;
void CompleteAddOp() override;
/**
* @brief Add all backward operators related into the network.
*/
virtual void AddBackwardOps() override;
virtual ~PlainNet() override {}
protected:
/**
* @brief Build the network.
*
* Create operators according to `def`; will be called by the constructor.
*/
void BuildNet(const NetDesc &def);
/**
* @brief Add an operator into this network.
*
* Add an operator identified as `type` with attributes described in
* `attrs`; `inputs` are the keys of read-only input variables and
* `outputs` are keys of mutable output variables. An `OpIndex` will be
* returned to indicate the offset of the new operator in `ops_`.
*/
OpIndex AddOp(const std::string &type, const std::vector<std::string> &inputs,
const std::vector<std::string> &outputs,
const OpAttrs &attrs = OpAttrs());
std::vector<OperatorPtr> ops_;
private:
// the operators owned by `Network`.
std::vector<Operator> ops_;
bool add_op_done_{false};
template <typename T, typename KeyType>
static bool Contains(T container, KeyType key) {
return container.find(key) != container.end();
}
};
} // namespace framework
......
#include <gtest/gtest.h>
#include <paddle/framework/net.h>
#include <paddle/framework/op_registry.h>
#include <paddle/framework/operator.h>
namespace pd = paddle::framework;
static int infer_shape_cnt = 0;
static int run_cnt = 0;
class TestOp : public pd::OperatorBase {
public:
void InferShape(const paddle::framework::ScopePtr& scope) const override {
++infer_shape_cnt;
}
void Run(const paddle::framework::ScopePtr& scope,
const paddle::platform::DeviceContext& dev_ctx) const override {
++run_cnt;
}
};
template <typename T>
void AssertSameVectorWithoutOrder(const std::vector<T>& expected,
const std::vector<T>& actual) {
ASSERT_EQ(expected.size(), actual.size());
std::unordered_set<T> expected_set;
for (auto& tmp : expected) {
expected_set.insert(tmp);
}
for (auto& act : actual) {
ASSERT_NE(expected_set.end(), expected_set.find(act));
}
}
TEST(OpKernel, all) {
auto net = std::make_shared<paddle::framework::PlainNet>();
ASSERT_NE(net, nullptr);
auto op1 = std::make_shared<TestOp>();
op1->inputs_ = {"x", "w1", "b1"};
op1->outputs_ = {"y"};
net->AddOp(op1);
auto op2 = std::make_shared<TestOp>();
op2->inputs_ = {"y", "w2", "b2"};
op2->outputs_ = {"z"};
net->AddOp(op2);
net->CompleteAddOp();
AssertSameVectorWithoutOrder({"x", "w1", "b1", "w2", "b2"}, net->inputs_);
AssertSameVectorWithoutOrder({"y", "z"}, net->outputs_);
auto tmp_idx_iter = net->attrs_.find("temporary_index");
ASSERT_NE(net->attrs_.end(), tmp_idx_iter);
auto& tmp_idx = boost::get<std::vector<int>>(tmp_idx_iter->second);
ASSERT_EQ(1UL, tmp_idx.size());
ASSERT_EQ("y", net->outputs_[tmp_idx[0]]);
auto scope = std::make_shared<pd::Scope>();
paddle::platform::CPUDeviceContext dev_ctx;
net->InferShape(scope);
net->Run(scope, dev_ctx);
ASSERT_EQ(2, infer_shape_cnt);
ASSERT_EQ(2, run_cnt);
ASSERT_THROW(net->AddOp(op2), paddle::framework::EnforceNotMet);
}
......@@ -34,6 +34,11 @@ message AttrProto {
// Supported attribute comments. It helps 3rd-party language bindings generate doc-strings.
required string comment = 3;
// If this attribute is generated, the Paddle third-party language binding
// is responsible for filling it. End users should not set it.
optional bool generated = 4 [default=false];
}
// Input or output message for 3rd-party language binding.
......@@ -45,6 +50,40 @@ message VarProto {
// The comment for that input. It helps 3rd-party language bindings generate doc-strings.
required string comment = 2;
// Whether this input/output can be a list.
// If so, the Op should write an attribute named `input_format` or
// `output_format`.
//
// e.g.
// If the op is an fc op, the inputs are `X`, `W`, `b`. `X` and `W`
// can each be multiple, so `multiple` is true for `X` and `W`, and
// OpDesc will hold an attribute describing them.
//
// The OpDesc of such an fc op could be
// {
// "type": "fc",
// "input": ["X1", "X2", "W1", "W2", "b"],
// "output": "fc.out",
// "attrs" : {
// "input_format": [0, 2, 4, 5]
// }
// }
//
optional bool multiple = 3 [default=false];
// It marks that the output is temporary. A temporary output is not used by
// the user, but is used internally as an input by other ops. If no other op
// uses that output, it can be optimized away early.
//
// The attribute temporary_index will be set in OpDesc if some
// outputs are temporary.
//
// output = [ "xxx.out1", "xxx.tmp", "xxx.out2"],
// attrs = {
// "temporary_index": [1]
// }
optional bool temporary = 4 [default=false];
}
// Op protocol message for 3rd-party language binding.
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <paddle/framework/op_registry.h>
namespace paddle {
......@@ -33,4 +47,4 @@ void AttrTypeHelper::SetAttrType<std::vector<std::string>>(AttrProto* attr) {
attr->set_type(paddle::framework::AttrType::STRINGS);
}
} // namespace framework
} // namespace paddle
\ No newline at end of file
} // namespace paddle
#pragma once
#include <algorithm>
#include <atomic>
#include <type_traits>
#include <unordered_map>
#include <unordered_set>
#include "paddle/framework/attr_checker.h"
#include "paddle/framework/op_desc.pb.h"
#include "paddle/framework/op_proto.pb.h"
......@@ -58,37 +62,138 @@ class OpProtoAndCheckerMaker {
OpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
: proto_(proto), op_checker_(op_checker) {}
~OpProtoAndCheckerMaker() {
PADDLE_ENFORCE(validated_, "should call Validate after build");
}
void Validate() {
validated_ = true;
CheckNoDuplicatedInOutAttrs();
}
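// Validate() must be called once all inputs, outputs and attributes have
// been added; the destructor enforces that it ran.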
protected:
void AddInput(const std::string& name, const std::string& comment) {
void AddInput(const std::string& name, const std::string& comment,
bool multiple = false) {
auto input = proto_->mutable_inputs()->Add();
*input->mutable_name() = name;
*input->mutable_comment() = comment;
input->set_multiple(multiple);
if (multiple) {
SetHasMultipleInput();
}
}
void AddInputs(const std::string& name, const std::string& comment) {
AddInput(name, comment, true);
}
void AddOutput(const std::string& name, const std::string& comment) {
void AddOutput(const std::string& name, const std::string& comment,
bool temporary = false, bool multiple = false) {
auto output = proto_->mutable_outputs()->Add();
*output->mutable_name() = name;
*output->mutable_comment() = comment;
output->set_multiple(multiple);
if (multiple) {
SetHasMultipleOutput();
}
output->set_temporary(temporary);
if (temporary) {
SetHasTemporaryOutput();
}
}
void AddOutputs(const std::string& name, const std::string& comment,
bool temporary = false) {
AddOutput(name, comment, temporary, true);
}
template <typename T>
TypedAttrChecker<T>& AddAttr(const std::string& name,
const std::string& comment) {
const std::string& comment,
bool generated = false) {
auto attr = proto_->mutable_attrs()->Add();
*attr->mutable_name() = name;
*attr->mutable_comment() = comment;
attr->set_generated(generated);
AttrTypeHelper::SetAttrType<T>(attr);
return op_checker_->AddAttrChecker<T>(name);
}
void AddType(const std::string& op_type) { proto_->set_type(op_type); }
void AddComment(const std::string& comment) {
*(proto_->mutable_comment()) = comment;
}
private:
void SetHasMultiple(const std::string& in_out, bool* flag) {
if (!*flag) {
AddAttr<std::vector<int>>(in_out + "_format",
"The multiple index of " + in_out +
"\n"
R"DOC(
This attribute is used by the Paddle core framework. A Paddle Op allows each
input or output to be a list of variables. This attribute shows how that
list is organized.
e.g.
input = ["a", "b", "c", "d", "e", "f"]
input_format = [0, 4, 5, 6]
means
this op has six input variables in total, and they are segmented into
three inputs.
The first input is input[0:4], the second is input[4:5], the third is input[5:6].
)DOC",
/*generated*/ true);
*flag = true;
}
}
void SetHasMultipleInput() { SetHasMultiple("input", &has_multiple_input_); }
void SetHasMultipleOutput() {
SetHasMultiple("output", &has_multiple_output_);
}
void SetHasTemporaryOutput() {
if (!has_temporary_output_) {
AddAttr<std::vector<int>>("temporary_index",
R"DOC(The temporary index of output.
Not every output of a Paddle Op is used by the user. For faster computation,
an op can expose some of its internal state as outputs for other ops to
consume, so that they can compute faster.
Marking which outputs are temporary helps future optimization.
)DOC",
/*generated*/ true)
.SetDefault(std::vector<int>());
has_temporary_output_ = true;
}
}
void CheckNoDuplicatedInOutAttrs() {
std::unordered_set<std::string> names;
auto checker = [&](const std::string& name) {
PADDLE_ENFORCE(!names.count(name), "[%s] is duplicated", name);
names.insert(name);
};
for (auto& attr : proto_->attrs()) {
checker(attr.name());
}
for (auto& input : proto_->inputs()) {
checker(input.name());
}
for (auto& output : proto_->outputs()) {
checker(output.name());
}
}
OpProto* proto_;
OpAttrChecker* op_checker_;
bool validated_{false};
bool has_multiple_input_{false};
bool has_multiple_output_{false};
bool has_temporary_output_{false};
};
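// Illustrative sketch (not part of this change): a concrete maker built on the
// interface above. The op name and attribute below are hypothetical.
//
// class ScaleOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
//  public:
//   ScaleOpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
//       : OpProtoAndCheckerMaker(proto, op_checker) {
//     AddInput("X", "input tensor of scale op");
//     AddOutput("Out", "scaled output");
//     AddAttr<float>("scale", "scaling factor").SetDefault(1.0);
//     AddComment("Out = scale * X");
//   }
// };
//
// The registry constructs the maker and then calls Validate(), which runs
// CheckNoDuplicatedInOutAttrs(); skipping Validate() trips the PADDLE_ENFORCE
// in the destructor.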
class OpRegistry {
......@@ -100,32 +205,52 @@ class OpRegistry {
creators()[op_type] = [] { return new OpType; };
OpProto& op_proto = protos()[op_type];
OpAttrChecker& op_checker = op_checkers()[op_type];
ProtoMakerType(&op_proto, &op_checker);
PADDLE_ENFORCE(op_proto.IsInitialized(),
"Fail to initialize %s's OpProto !", op_type);
auto maker = ProtoMakerType(&op_proto, &op_checker);
maker.Validate();
*op_proto.mutable_type() = op_type;
PADDLE_ENFORCE(
op_proto.IsInitialized(),
"Fail to initialize %s's OpProto, because %s is not initialized",
op_type, op_proto.InitializationErrorString());
}
static OperatorBase* CreateOp(const OpDesc& op_desc) {
static OperatorPtr CreateOp(const OpDesc& op_desc) {
    //! Create an OperatorPtr by type.
std::string op_type = op_desc.type();
OperatorBase* op = creators().at(op_type)();
op->desc_ = op_desc;
OperatorPtr op(creators().at(op_type)());
    //! Fill the op's data members. We do not use a constructor because it
    //! would be noisy for Op developers.
const OpProto& op_proto = protos().at(op_type);
op->type_ = op_desc.type();
// set op's inputs_ from desc.
op->inputs_.reserve((size_t)op_desc.inputs_size());
std::copy(op_desc.inputs().begin(), op_desc.inputs().end(),
std::back_inserter(op->inputs_));
// set op's outputs_ from desc.
op->outputs_.reserve((size_t)op_desc.outputs_size());
std::copy(op_desc.outputs().begin(), op_desc.outputs().end(),
std::back_inserter(op->outputs_));
//! Fill attrs, and validate attrs.
for (auto& attr : op_desc.attrs()) {
op->attrs_[attr.name()] = AttrTypeHelper::GetAttrValue(attr);
}
op_checkers().at(op_type).Check(op->attrs_);
    //! Convert temporary variable names to unique variable names.
GenerateTempVariableName(op.get());
// set argument offsets stored in op.
CreateInOutOffsetMap(op, op_proto);
    //! Run the op's custom Init for complex Ops. For a simple Op, the Init
    //! method does nothing.
op->Init();
return op;
}
private:
static std::unordered_map<std::string, OpCreator>& creators() {
static std::unordered_map<std::string, OpCreator> creators_;
return creators_;
  // Initialize op->in_out_idxs_ to accelerate argument offset lookup.
static void CreateInOutOffsetMap(OperatorPtr op, const OpProto& proto) {
op->CreateInOutOffsetMap(proto);
}
static std::unordered_map<std::string, OpProto>& protos() {
......@@ -133,6 +258,23 @@ class OpRegistry {
return protos_;
};
private:
static void GenerateTempVariableName(OperatorBase* op) {
static std::atomic<size_t> gUniqId(0UL);
for (auto& outname : op->outputs_) {
if (outname == OperatorBase::TMP_VAR_NAME()) {
outname += op->type_;
outname += "@";
outname += std::to_string(gUniqId.fetch_add(1));
}
}
}
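  // For example (values illustrative): an op of type "cos_sim" whose outputs_
  // contains two "@TEMP@" entries ends up with
  //   outputs_ == {"real_out", "@TEMP@cos_sim@0", "@TEMP@cos_sim@1"}
  // since each temporary name is rewritten to TMP_VAR_NAME() + type_ + "@" +
  // id, with the id drawn from a process-wide atomic counter.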
static std::unordered_map<std::string, OpCreator>& creators() {
static std::unordered_map<std::string, OpCreator> creators_;
return creators_;
}
static std::unordered_map<std::string, OpAttrChecker>& op_checkers() {
static std::unordered_map<std::string, OpAttrChecker> op_checkers_;
return op_checkers_;
......@@ -142,18 +284,87 @@ class OpRegistry {
template <typename OpType, typename ProtoMakerType>
class OpRegisterHelper {
public:
OpRegisterHelper(std::string op_type) {
OpRegisterHelper(const char* op_type) {
OpRegistry::RegisterOp<OpType, ProtoMakerType>(op_type);
}
};
#define REGISTER_OP(type, op_class, op_maker_class) \
class op_class##Register { \
private: \
const static OpRegisterHelper<op_class, op_maker_class> reg; \
}; \
const OpRegisterHelper<op_class, op_maker_class> op_class##Register::reg( \
#type)
/**
* check if MACRO is used in GLOBAL NAMESPACE.
*/
#define STATIC_ASSERT_GLOBAL_NAMESPACE(uniq_name, msg) \
struct __test_global_namespace_##uniq_name##__ {}; \
static_assert(std::is_same<::__test_global_namespace_##uniq_name##__, \
__test_global_namespace_##uniq_name##__>::value, \
msg)
/**
* Macro to Register Operator.
*/
#define REGISTER_OP(__op_type, __op_class, __op_maker_class) \
STATIC_ASSERT_GLOBAL_NAMESPACE(__reg_op__##__op_type, \
"REGISTER_OP must be in global namespace"); \
static ::paddle::framework::OpRegisterHelper<__op_class, __op_maker_class> \
__op_register_##__op_type##__(#__op_type); \
int __op_register_##__op_type##_handle__() { return 0; }
/**
* Macro to Register OperatorKernel.
*/
#define REGISTER_OP_KERNEL(type, DEVICE_TYPE, PlaceType, KernelType) \
STATIC_ASSERT_GLOBAL_NAMESPACE( \
__reg_op_kernel_##type##_##DEVICE_TYPE##__, \
"REGISTER_OP_KERNEL must be in global namespace"); \
struct __op_kernel_register__##type##__ { \
__op_kernel_register__##type##__() { \
::paddle::framework::OperatorWithKernel::OpKernelKey key; \
key.place_ = PlaceType(); \
::paddle::framework::OperatorWithKernel::AllOpKernels()[#type][key] \
.reset(new KernelType()); \
} \
}; \
static __op_kernel_register__##type##__ __reg_kernel_##type##__; \
int __op_kernel_register_##type##_handle_##DEVICE_TYPE##__() { return 0; }
#define REGISTER_OP_GPU_KERNEL(type, KernelType) \
REGISTER_OP_KERNEL(type, GPU, ::paddle::platform::GPUPlace, KernelType)
#define REGISTER_OP_CPU_KERNEL(type, KernelType) \
REGISTER_OP_KERNEL(type, CPU, ::paddle::platform::CPUPlace, KernelType)
/**
* Macro to mark what Operator and Kernel we will use and tell the compiler to
 * link them into the target.
*/
#define USE_OP_WITHOUT_KERNEL(op_type) \
STATIC_ASSERT_GLOBAL_NAMESPACE( \
__use_op_without_kernel_##op_type, \
"USE_OP_WITHOUT_KERNEL must be in global namespace"); \
extern int __op_register_##op_type##_handle__(); \
static int __use_op_ptr_##op_type##_without_kernel__ \
__attribute__((unused)) = __op_register_##op_type##_handle__()
#define USE_OP_KERNEL(op_type, DEVICE_TYPE) \
STATIC_ASSERT_GLOBAL_NAMESPACE( \
__use_op_kernel_##op_type##_##DEVICE_TYPE##__, \
"USE_OP_KERNEL must be in global namespace"); \
extern int __op_kernel_register_##op_type##_handle_##DEVICE_TYPE##__(); \
static int __use_op_ptr_##op_type##_##DEVICE_TYPE##_kernel__ \
__attribute__((unused)) = \
__op_kernel_register_##op_type##_handle_##DEVICE_TYPE##__()
// Use an Operator that has only a CPU kernel.
#define USE_OP_CPU(op_type) \
USE_OP_WITHOUT_KERNEL(op_type); \
USE_OP_KERNEL(op_type, CPU)
#ifdef PADDLE_ONLY_CPU
#define USE_OP(op_type) USE_OP_CPU(op_type)
#else
#define USE_OP(op_type) \
USE_OP_CPU(op_type); \
USE_OP_KERNEL(op_type, GPU)
#endif
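// A minimal sketch of how these macros fit together across translation units
// (the op and kernel names below are hypothetical):
//
//   // in scale_op.cc: define and register the operator and its CPU kernel
//   REGISTER_OP(scale, ScaleOp, ScaleOpProtoAndCheckerMaker);
//   REGISTER_OP_CPU_KERNEL(scale, ScaleOpCPUKernel);
//
//   // in the binary that uses it: reference the registration symbols so the
//   // linker does not drop them from a static library
//   USE_OP(scale);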
} // namespace framework
} // namespace paddle
#include "paddle/framework/op_registry.h"
#include <gtest/gtest.h>
using namespace paddle::framework;
namespace pd = paddle::framework;
namespace paddle {
namespace framework {
class CosineOp : public OperatorBase {
public:
void Run(const std::shared_ptr<Scope>& scope,
void Run(const ScopePtr& scope,
const platform::DeviceContext& dev_ctx) const override {}
void InferShape(const std::shared_ptr<Scope>& scope) const override {}
void InferShape(const ScopePtr& scope) const override {}
};
class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
......@@ -21,42 +21,40 @@ class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
AddAttr<float>("scale", "scale of cosine op")
.SetDefault(1.0)
.LargerThan(0.0);
AddType("cos");
AddComment("This is cos op");
}
};
REGISTER_OP(cos_sim, CosineOp, CosineOpProtoAndCheckerMaker);
class MyTestOp : public OperatorBase {
public:
void InferShape(const std::shared_ptr<Scope>& scope) const override {}
void Run(const std::shared_ptr<Scope>& scope,
void InferShape(const ScopePtr& scope) const override {}
void Run(const ScopePtr& scope,
const platform::DeviceContext& dev_ctx) const override {}
public:
};
class MyTestOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
public:
MyTestOpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("input", "input of cosine op");
AddOutput("output", "output of cosine op");
AddInputs("input", "input of cosine op");
AddOutput("output", "output of cosine op",
/*temporary*/ true);
auto my_checker = [](int i) {
PADDLE_ENFORCE(i % 2 == 0, "'test_attr' must be even!");
};
AddAttr<int>("test_attr", "a simple test attribute")
.AddCustomChecker(my_checker);
AddType("my_test_op");
AddComment("This is my_test op");
}
};
REGISTER_OP(my_test_op, MyTestOp, MyTestOpProtoAndCheckerMaker);
} // namespace framework
} // namespace paddle
REGISTER_OP(cos_sim, paddle::framework::CosineOp,
paddle::framework::CosineOpProtoAndCheckerMaker);
REGISTER_OP(my_test_op, paddle::framework::MyTestOp,
paddle::framework::MyTestOpProtoAndCheckerMaker);
TEST(OpRegistry, CreateOp) {
paddle::framework::OpDesc op_desc;
op_desc.set_type("cos_sim");
......@@ -69,9 +67,9 @@ TEST(OpRegistry, CreateOp) {
attr->set_type(paddle::framework::AttrType::FLOAT);
attr->set_f(scale);
paddle::framework::OperatorBase* op =
paddle::framework::OperatorPtr op =
paddle::framework::OpRegistry::CreateOp(op_desc);
auto scope = std::make_shared<Scope>();
auto scope = std::make_shared<paddle::framework::Scope>();
paddle::platform::CPUDeviceContext dev_ctx;
op->Run(scope, dev_ctx);
float scale_get = op->GetAttr<float>("scale");
......@@ -91,7 +89,7 @@ TEST(OpRegistry, IllegalAttr) {
bool caught = false;
try {
paddle::framework::OperatorBase* op __attribute__((unused)) =
paddle::framework::OperatorPtr op __attribute__((unused)) =
paddle::framework::OpRegistry::CreateOp(op_desc);
} catch (paddle::framework::EnforceNotMet err) {
caught = true;
......@@ -112,24 +110,33 @@ TEST(OpRegistry, DefaultValue) {
ASSERT_TRUE(op_desc.IsInitialized());
paddle::framework::OperatorBase* op =
paddle::framework::OperatorPtr op =
paddle::framework::OpRegistry::CreateOp(op_desc);
auto scope = std::make_shared<Scope>();
auto scope = std::make_shared<paddle::framework::Scope>();
paddle::platform::CPUDeviceContext dev_ctx;
op->Run(scope, dev_ctx);
ASSERT_EQ(op->GetAttr<float>("scale"), 1.0);
}
static void SetInputFormat(paddle::framework::OpDesc* desc) {
auto attr = desc->add_attrs();
attr->set_name("input_format");
attr->set_type(paddle::framework::INTS);
attr->mutable_ints()->Add(0);
attr->mutable_ints()->Add(1);
}
TEST(OpRegistry, CustomChecker) {
paddle::framework::OpDesc op_desc;
op_desc.set_type("my_test_op");
op_desc.add_inputs("ii");
op_desc.add_outputs("oo");
SetInputFormat(&op_desc);
// attr 'test_attr' is not set
bool caught = false;
try {
paddle::framework::OperatorBase* op __attribute__((unused)) =
paddle::framework::OperatorPtr op __attribute__((unused)) =
paddle::framework::OpRegistry::CreateOp(op_desc);
} catch (paddle::framework::EnforceNotMet err) {
caught = true;
......@@ -148,7 +155,7 @@ TEST(OpRegistry, CustomChecker) {
attr->set_i(3);
caught = false;
try {
paddle::framework::OperatorBase* op __attribute__((unused)) =
paddle::framework::OperatorPtr op __attribute__((unused)) =
paddle::framework::OpRegistry::CreateOp(op_desc);
} catch (paddle::framework::EnforceNotMet err) {
caught = true;
......@@ -166,16 +173,44 @@ TEST(OpRegistry, CustomChecker) {
attr->set_name("test_attr");
attr->set_type(paddle::framework::AttrType::INT);
attr->set_i(4);
paddle::framework::OperatorBase* op =
SetInputFormat(&op_desc);
paddle::framework::OperatorPtr op =
paddle::framework::OpRegistry::CreateOp(op_desc);
paddle::platform::CPUDeviceContext dev_ctx;
auto scope = std::make_shared<Scope>();
auto scope = std::make_shared<paddle::framework::Scope>();
op->Run(scope, dev_ctx);
int test_attr = op->GetAttr<int>("test_attr");
ASSERT_EQ(test_attr, 4);
}
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
\ No newline at end of file
class TestAttrProtoMaker : public pd::OpProtoAndCheckerMaker {
public:
TestAttrProtoMaker(pd::OpProto* proto, pd::OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddAttr<float>("scale", "scale of test op");
AddAttr<float>("scale", "scale of test op");
}
};
TEST(ProtoMaker, DuplicatedAttr) {
pd::OpProto op_proto;
pd::OpAttrChecker op_checker;
auto proto_maker = TestAttrProtoMaker(&op_proto, &op_checker);
ASSERT_THROW(proto_maker.Validate(), paddle::framework::EnforceNotMet);
}
class TestInOutProtoMaker : public pd::OpProtoAndCheckerMaker {
public:
TestInOutProtoMaker(pd::OpProto* proto, pd::OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("input", "input of test op");
AddInput("input", "input of test op");
}
};
TEST(ProtoMaker, DuplicatedInOut) {
pd::OpProto op_proto;
pd::OpAttrChecker op_checker;
auto proto_maker = TestInOutProtoMaker(&op_proto, &op_checker);
ASSERT_THROW(proto_maker.Validate(), paddle::framework::EnforceNotMet);
}
......@@ -12,32 +12,88 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include "paddle/framework/operator.h"
namespace paddle {
namespace framework {
void OperatorBase::CreateInOutOffsetMap(const OpProto& proto) {
  PADDLE_ENFORCE(in_out_idxs_.empty(), "duplicated call to CreateInOutOffsetMap");
for (int i = 0; i < proto.inputs_size(); i++) {
const auto& name = proto.inputs()[i].name();
in_out_idxs_[name] = i;
}
for (int i = 0; i < proto.outputs_size(); i++) {
const auto& name = proto.outputs()[i].name();
in_out_idxs_[name] = i;
}
}
const std::string& OperatorBase::Input(const std::string& name) const {
auto it = in_out_idxs_.find(name);
PADDLE_ENFORCE(it != in_out_idxs_.end(), "no key [%s] in in_out_idxs_", name);
if (attrs_.count("input_format") == 0) {
return inputs_[it->second];
} else {
const auto& input_format = GetAttr<std::vector<int>>("input_format");
int idx = input_format[it->second];
return inputs_.at(idx);
}
}
std::vector<std::string> OperatorBase::Inputs(const std::string& name) const {
auto input_format = GetAttr<std::vector<int>>("input_format");
auto offset = in_out_idxs_.at(name);
return std::vector<std::string>{
inputs_.begin() + input_format.at(offset),
inputs_.begin() + input_format.at(offset + 1)};
}
const std::string& OperatorBase::Output(const std::string& name) const {
auto it = in_out_idxs_.find(name);
PADDLE_ENFORCE(it != in_out_idxs_.end(), "no key [%s] in in_out_idxs_", name);
if (attrs_.count("output_format") == 0) {
return outputs_[it->second];
} else {
const auto& output_format = GetAttr<std::vector<int>>("output_format");
int idx = output_format[it->second];
return outputs_.at(idx);
}
}
std::vector<std::string> OperatorBase::Outputs(const std::string& name) const {
auto output_format = GetAttr<std::vector<int>>("output_format");
auto offset = in_out_idxs_.at(name);
return std::vector<std::string>{
outputs_.begin() + output_format.at(offset),
outputs_.begin() + output_format.at(offset + 1)};
}
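// Worked example of the offset lookup above (names illustrative):
//   inputs_      == {"a", "b", "c", "d", "e", "f"}
//   input_format == {0, 4, 5, 6}    // offsets of three logical arguments
//   in_out_idxs_ == {{"xs", 0}, {"k", 1}, {"y", 2}}
// then
//   Inputs("xs") == {"a", "b", "c", "d"}   // inputs_[0:4]
//   Input("k")   == "e"                    // inputs_[input_format[1]]
//   Input("y")   == "f"                    // inputs_[input_format[2]]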
std::string OperatorBase::DebugString() const {
std::stringstream ss;
ss << "=================\n";
ss << "type = " << desc_.type() << "\n";
ss << "inputs = [";
for (auto& ipt : inputs_) {
ss << ipt << ", ";
}
ss << "]\n";
ss << "outputs = [";
for (auto& opt : outputs_) {
ss << opt << ", ";
ss << "Op(" << type_ << "), inputs:(";
for (size_t i = 0; i < inputs_.size(); ++i) {
ss << inputs_[i];
if (i != inputs_.size() - 1) {
ss << ", ";
}
}
ss << "]\n";
ss << "attr_keys = [";
for (auto& attr : attrs_) {
ss << attr.first << ", ";
ss << "), outputs:(";
for (size_t i = 0; i < outputs_.size(); ++i) {
ss << outputs_[i];
if (i != outputs_.size() - 1) {
ss << ", ";
}
}
ss << "]\n";
ss << ").";
return ss.str();
}
} // namespace framework
} // namespace paddle
\ No newline at end of file
} // namespace paddle
......@@ -14,22 +14,25 @@ limitations under the License. */
#pragma once
#include <paddle/framework/attr_checker.h>
#include <paddle/framework/op_desc.pb.h>
#include <paddle/framework/scope.h>
#include <paddle/platform/device_context.h>
#include <paddle/platform/place.h>
#include <paddle/utils/Error.h>
#include <boost/variant.hpp>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/framework/attr_checker.h"
#include "paddle/framework/op_desc.pb.h"
#include "paddle/framework/op_proto.pb.h"
#include "paddle/framework/scope.h"
#include "paddle/framework/tensor.h"
#include "paddle/platform/device_context.h"
#include "paddle/platform/place.h"
#include "paddle/utils/Error.h"
namespace paddle {
namespace framework {
class OperatorBase;
using OperatorPtr = std::shared_ptr<OperatorBase>;
/**
* OperatorBase has the basic element that Net will call to do computation.
* Only CreateOperator from OpRegistry will new Operator directly. User
......@@ -38,6 +41,13 @@ class OperatorBase;
*/
class OperatorBase {
public:
  /// If a variable is an empty variable, this name is used.
  static std::string EMPTY_VAR_NAME() { return "@EMPTY@"; }
  /// If a variable is a temporary variable, its name is set in Python,
  /// but it is converted to a unique name in the scope after OpCreator runs.
  static std::string TMP_VAR_NAME() { return "@TEMP@"; }
virtual ~OperatorBase() {}
template <typename T>
......@@ -49,22 +59,84 @@ class OperatorBase {
std::string DebugString() const;
  /// Init is called after CreateOperator; put custom initialization
  /// logic here.
virtual void Init() {}
  /// InferShape infers the sizes of the Variables used by this Operator from
  /// information inside the scope.
virtual void InferShape(const std::shared_ptr<Scope>& scope) const = 0;
virtual void InferShape(const ScopePtr& scope) const = 0;
/// Net will call this function to Run an op.
virtual void Run(const std::shared_ptr<Scope>& scope,
virtual void Run(const ScopePtr& scope,
const platform::DeviceContext& dev_ctx) const = 0;
protected:
std::string Type() const { return desc_.type(); }
  // Get an input with the argument name described in `op_proto`.
  const std::string& Input(const std::string& name) const;
  // Get an input which has multiple variables.
  // TODO: add a vector_view to prevent memory copy.
  std::vector<std::string> Inputs(const std::string& name) const;
  // Get an output with the argument name described in `op_proto`.
  const std::string& Output(const std::string& name) const;
  // Get an output which has multiple variables.
  // TODO: add a vector_view to prevent memory copy.
  std::vector<std::string> Outputs(const std::string& name) const;
  // Initialize in_out_idxs_ to accelerate argument offset lookup.
void CreateInOutOffsetMap(const OpProto& proto);
public:
OpDesc desc_;
std::string type_;
std::vector<std::string> inputs_;
std::vector<std::string> outputs_;
AttributeMap attrs_;
  // stores the argument offsets described in op_desc.
std::unordered_map<std::string, int> in_out_idxs_;
};
class KernelContext {
public:
KernelContext(const OperatorBase* op, const std::shared_ptr<Scope>& scope,
const platform::DeviceContext& device_context)
: op_(*op), scope_(scope), device_context_(device_context) {}
const Variable* Input(int index) const {
return scope_->GetVariable(op_.inputs_[index]);
}
Variable* Output(int index) const {
return scope_->GetVariable(op_.outputs_[index]);
}
const Variable* Input(const std::string& name) const {
return scope_->GetVariable(op_.Input(name));
}
const Variable* Output(const std::string& name) const {
return scope_->GetVariable(op_.Output(name));
}
  const std::vector<const Variable*> Inputs(const std::string& name) const {
    auto names = op_.Inputs(name);
    std::vector<const Variable*> res;
    res.reserve(names.size());
    // Use back_inserter: writing through res.begin() on an empty vector
    // would be undefined behavior.
    std::transform(
        names.begin(), names.end(), std::back_inserter(res),
        [this](const std::string& name) { return scope_->GetVariable(name); });
    return res;
  }
  const std::vector<const Variable*> Outputs(const std::string& name) const {
    auto names = op_.Outputs(name);
    std::vector<const Variable*> res;
    res.reserve(names.size());
    // Use back_inserter: writing through res.begin() on an empty vector
    // would be undefined behavior.
    std::transform(
        names.begin(), names.end(), std::back_inserter(res),
        [this](const std::string& name) { return scope_->GetVariable(name); });
    return res;
  }
const OperatorBase& op_;
const std::shared_ptr<Scope>& scope_;
const platform::DeviceContext& device_context_;
};
class OpKernel {
......@@ -75,28 +147,22 @@ class OpKernel {
* device resource such as CUDA stream, cublas handle, etc. from
* KernelContext. User should construct it before run the Operator.
*/
class KernelContext {
public:
KernelContext(const OperatorBase* op, const std::shared_ptr<Scope>& scope,
const platform::DeviceContext& device_context)
: op_(*op), scope_(scope), device_context_(device_context) {}
const Variable* Input(int index) const {
return scope_->GetVariable(op_.inputs_[index]);
}
virtual void Compute(const KernelContext& context) const = 0;
Variable* Output(int index) const {
return scope_->GetVariable(op_.outputs_[index]);
}
virtual ~OpKernel() {}
};
const OperatorBase& op_;
const std::shared_ptr<Scope>& scope_;
const platform::DeviceContext& device_context_;
};
template <typename T>
struct VarToTensor {};
virtual void Compute(const KernelContext& context) const = 0;
template <>
struct VarToTensor<Tensor*> {
Tensor* operator()(Variable* var) { return var->GetMutable<Tensor>(); }
};
virtual ~OpKernel() {}
template <>
struct VarToTensor<const Tensor*> {
const Tensor* operator()(Variable* var) { return &var->Get<Tensor>(); }
};
class OperatorWithKernel : public OperatorBase {
......@@ -122,29 +188,47 @@ class OperatorWithKernel : public OperatorBase {
using OpKernelMap =
std::unordered_map<OpKernelKey, std::unique_ptr<OpKernel>, OpKernelHash>;
void Run(const std::shared_ptr<Scope>& scope,
void Run(const ScopePtr& scope,
const platform::DeviceContext& dev_ctx) const final {
auto& opKernel = AllOpKernels().at(Type()).at(OpKernelKey(dev_ctx));
opKernel->Compute(OpKernel::KernelContext(this, scope, dev_ctx));
auto& opKernel = AllOpKernels().at(type_).at(OpKernelKey(dev_ctx));
opKernel->Compute(KernelContext(this, scope, dev_ctx));
}
static std::unordered_map<std::string /* op_type */, OpKernelMap>&
AllOpKernels() {
static std::unordered_map<std::string, OpKernelMap> g_all_op_kernels;
return g_all_op_kernels;
}
void InferShape(const std::shared_ptr<Scope>& scope) const final {
std::vector<const Tensor*> ins;
VarNamesToTensors(scope, inputs_, &ins);
std::vector<Tensor*> outs;
VarNamesToTensors(scope, outputs_, &outs);
InferShape(ins, outs);
};
private:
template <typename T>
void VarNamesToTensors(const std::shared_ptr<Scope>& scope,
const std::vector<std::string>& var_names,
std::vector<T>* container) const {
container->reserve(var_names.size());
VarToTensor<T> convert;
for (auto& name : var_names) {
auto var = scope->GetVariable(name);
if (var != nullptr) {
container->push_back(convert(var));
} else {
container->push_back(nullptr);
}
}
}
protected:
virtual void InferShape(const std::vector<const Tensor*>& inputs,
const std::vector<Tensor*>& outputs) const = 0;
};
} // namespace framework
} // namespace paddle
#define REGISTER_OP_KERNEL(type, PlaceType, KernelType) \
struct __op_kernel_register__##type##__ { \
__op_kernel_register__##type##__() { \
::paddle::framework::OperatorWithKernel::OpKernelKey key; \
key.place_ = PlaceType(); \
::paddle::framework::OperatorWithKernel::AllOpKernels()[#type][key] \
.reset(new KernelType()); \
} \
}; \
static __op_kernel_register__##type##__ __reg_kernel_##type##__
......@@ -19,90 +19,163 @@ limitations under the License. */
namespace paddle {
namespace framework {
class OperatorTest : public OperatorBase {
static int op_run_num = 0;
class OpWithoutKernelTest : public OperatorBase {
public:
void InferShape(const std::shared_ptr<Scope>& scope) const override {}
void Run(const std::shared_ptr<Scope>& scope,
void Init() override { x = 1; }
void InferShape(const ScopePtr& scope) const override {}
void Run(const ScopePtr& scope,
const platform::DeviceContext& dev_ctx) const override {
float scale = GetAttr<float>("scale");
ASSERT_NEAR(scale, 3.14, 1e-5);
op_run_num++;
ASSERT_EQ((int)inputs_.size(), 1);
ASSERT_EQ((int)outputs_.size(), 1);
ASSERT_EQ(scope->GetVariable(inputs_[0]), nullptr);
ASSERT_EQ(x, 1);
ASSERT_NE(scope->GetVariable(outputs_[0]), nullptr);
}
public:
float x = 0;
};
class OperatorTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
class OpWithoutKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
public:
OperatorTestProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
  OpWithoutKernelTestProtoAndCheckerMaker(OpProto* proto,
                                          OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("input", "input of test op");
AddOutput("output", "output of test op");
AddAttr<float>("scale", "scale of cosine op")
.SetDefault(1.0)
.LargerThan(0.0);
AddType("test_operator");
AddAttr<float>("scale", "scale of cosine op");
AddComment("This is test op");
}
};
REGISTER_OP(test_operator, OperatorTest, OperatorTestProtoAndCheckerMaker);
} // namespace framework
} // namespace paddle
REGISTER_OP(test_operator, paddle::framework::OpWithoutKernelTest,
            paddle::framework::OpWithoutKernelTestProtoAndCheckerMaker);
TEST(OperatorBase, all) {
OpDesc op_desc;
paddle::framework::OpDesc op_desc;
op_desc.set_type("test_operator");
*op_desc.mutable_inputs()->Add() = "IN1";
*op_desc.mutable_outputs()->Add() = "OUT1";
auto attr = op_desc.mutable_attrs()->Add();
attr->set_name("scale");
attr->set_type(paddle::framework::AttrType::FLOAT);
float scale = 3.14;
attr->set_f(scale);
attr->set_f(3.14);
platform::CPUDeviceContext device_context;
auto scope = std::make_shared<Scope>();
paddle::platform::CPUDeviceContext device_context;
auto scope = std::make_shared<paddle::framework::Scope>();
OperatorBase* op = paddle::framework::OpRegistry::CreateOp(op_desc);
ASSERT_EQ(op->GetAttr<float>("scale"), scale);
paddle::framework::OperatorPtr op =
paddle::framework::OpRegistry::CreateOp(op_desc);
scope->CreateVariable("OUT1");
ASSERT_EQ(paddle::framework::op_run_num, 0);
op->Run(scope, device_context);
std::cout << op->DebugString() << std::endl;
delete op;
ASSERT_EQ(paddle::framework::op_run_num, 1);
}
namespace paddle {
namespace framework {
class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
public:
OpKernelTestProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("input", "input of test op");
AddOutput("output", "output of test op");
AddInput("x", "input of test op");
AddOutput("y", "output of test op");
AddAttr<float>("scale", "scale of cosine op")
.SetDefault(1.0)
.LargerThan(0.0);
AddType("test_operator");
AddComment("This is test op");
}
};
static int cpu_kernel_run_num = 0;
class OpWithKernelTest : public OperatorWithKernel {
public:
void InferShape(const std::shared_ptr<Scope>& scope) const override {}
protected:
void InferShape(const std::vector<const Tensor*>& inputs,
const std::vector<Tensor*>& outputs) const override {}
};
class CPUKernelTest : public OpKernel {
public:
void Compute(const KernelContext& context) const {
float scale = context.op_.GetAttr<float>("scale");
ASSERT_NEAR(scale, 3.14, 1e-5);
void Compute(const KernelContext& ctx) const {
std::cout << "this is cpu kernel" << std::endl;
std::cout << context.op_.DebugString() << std::endl;
std::cout << ctx.op_.DebugString() << std::endl;
cpu_kernel_run_num++;
ASSERT_EQ(ctx.op_.Input("x"), "IN1");
ASSERT_EQ(ctx.op_.Output("y"), "OUT1");
}
};
// multiple inputs test
class OperatorMultiInputsTest : public OperatorBase {
public:
void Init() override { x = 1; }
void InferShape(const std::shared_ptr<Scope>& scope) const override {}
void Run(const std::shared_ptr<Scope>& scope,
const platform::DeviceContext& dev_ctx) const override {
ASSERT_EQ(scope->GetVariable(inputs_[0]), nullptr);
ASSERT_EQ(x, 1);
ASSERT_NE(scope->GetVariable(outputs_[0]), nullptr);
ASSERT_EQ(Input("x"), "IN1");
ASSERT_EQ(Input("y"), "OUT1");
}
public:
float x = 0;
};
class OpKernelTestMultiInputsProtoAndCheckerMaker
: public OpProtoAndCheckerMaker {
public:
OpKernelTestMultiInputsProtoAndCheckerMaker(OpProto* proto,
OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInputs("xs", "inputs of test op");
AddInput("k", "input of test op");
AddOutputs("ys", "outputs of test op");
AddAttr<float>("scale", "scale of cosine op")
.SetDefault(1.0)
.LargerThan(0.0);
AddComment("This is test op");
}
};
class CPUKernelMultiInputsTest : public OpKernel {
public:
void Compute(const KernelContext& ctx) const {
auto xs = ctx.op_.Inputs("xs");
ASSERT_EQ(xs.size(), 3UL);
ASSERT_EQ(xs[0], "x0");
ASSERT_EQ(xs[1], "x1");
ASSERT_EQ(xs[2], "x2");
auto k = ctx.op_.Input("k");
ASSERT_EQ(k, "k0");
auto ys = ctx.op_.Outputs("ys");
ASSERT_EQ(ys.size(), 2UL);
ASSERT_EQ(ys[0], "y0");
ASSERT_EQ(ys[1], "y1");
}
};
REGISTER_OP(op_with_kernel, OpWithKernelTest, OpKernelTestProtoAndCheckerMaker);
REGISTER_OP_KERNEL(op_with_kernel, platform::CPUPlace, CPUKernelTest);
} // namespace framework
} // namespace paddle
REGISTER_OP(op_with_kernel, paddle::framework::OpWithKernelTest,
paddle::framework::OpKernelTestProtoAndCheckerMaker);
REGISTER_OP_CPU_KERNEL(op_with_kernel, paddle::framework::CPUKernelTest);
// test with single input
TEST(OpKernel, all) {
OpDesc op_desc;
paddle::framework::OpDesc op_desc;
op_desc.set_type("op_with_kernel");
*op_desc.mutable_inputs()->Add() = "IN1";
*op_desc.mutable_outputs()->Add() = "OUT1";
......@@ -111,13 +184,56 @@ TEST(OpKernel, all) {
attr->set_type(paddle::framework::AttrType::FLOAT);
attr->set_f(3.14);
platform::CPUDeviceContext cpu_device_context;
auto scope = std::make_shared<Scope>();
paddle::platform::CPUDeviceContext cpu_device_context;
auto scope = std::make_shared<paddle::framework::Scope>();
OperatorBase* op = paddle::framework::OpRegistry::CreateOp(op_desc);
paddle::framework::OperatorPtr op =
paddle::framework::OpRegistry::CreateOp(op_desc);
ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 0);
op->Run(scope, cpu_device_context);
ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 1);
}
REGISTER_OP(op_multi_inputs_with_kernel, paddle::framework::OpWithKernelTest,
paddle::framework::OpKernelTestMultiInputsProtoAndCheckerMaker);
REGISTER_OP_CPU_KERNEL(op_multi_inputs_with_kernel,
                       paddle::framework::CPUKernelMultiInputsTest);
delete op;
// test with multi inputs
TEST(OpKernel, multi_inputs) {
using namespace paddle::framework;
OpDesc op_desc;
op_desc.set_type("op_multi_inputs_with_kernel");
*op_desc.mutable_inputs()->Add() = "x0";
*op_desc.mutable_inputs()->Add() = "x1";
*op_desc.mutable_inputs()->Add() = "x2";
*op_desc.mutable_inputs()->Add() = "k0";
*op_desc.mutable_outputs()->Add() = "y0";
*op_desc.mutable_outputs()->Add() = "y1";
auto attr = op_desc.mutable_attrs()->Add();
attr->set_name("scale");
attr->set_type(paddle::framework::AttrType::FLOAT);
attr->set_f(3.14);
auto attr0 = op_desc.mutable_attrs()->Add();
attr0->set_name("input_format");
attr0->set_type(paddle::framework::AttrType::INTS);
auto input_format = attr0->mutable_ints();
input_format->Add(0); // x0
input_format->Add(3); // k
input_format->Add(4); // end
auto attr1 = op_desc.mutable_attrs()->Add();
attr1->set_name("output_format");
attr1->set_type(paddle::framework::AttrType::INTS);
auto output_format = attr1->mutable_ints();
output_format->Add(0); // y0
output_format->Add(2); // y1
paddle::platform::CPUDeviceContext cpu_device_context;
auto scope = std::make_shared<Scope>();
OperatorPtr op(paddle::framework::OpRegistry::CreateOp(op_desc));
op->Run(scope, cpu_device_context);
}
} // namespace framework
} // namespace paddle
\ No newline at end of file
......@@ -23,6 +23,9 @@ limitations under the License. */
namespace paddle {
namespace framework {
class Scope;
using ScopePtr = std::shared_ptr<Scope>;
/**
* @brief Scope that manage all variables.
*
......@@ -41,7 +44,7 @@ class Scope {
/**
* @brief Initialize a Scope with parent.
*/
explicit Scope(const std::shared_ptr<Scope>& parent) : parent_(parent) {}
explicit Scope(const ScopePtr& parent) : parent_(parent) {}
/**
* @brief Create Variable
......@@ -88,7 +91,7 @@ class Scope {
private:
std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;
std::shared_ptr<Scope> parent_{nullptr};
ScopePtr parent_{nullptr};
};
} // namespace framework
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <paddle/framework/tensor.h>
namespace paddle {
namespace framework {}
} // namespace paddle
......@@ -15,8 +15,8 @@ limitations under the License. */
#pragma once
#include <cstdint>
#include <cstring>
#include <memory>
#include <type_traits>
#include "paddle/framework/ddim.h"
#include "paddle/framework/enforce.h"
#include "paddle/memory/memory.h"
......@@ -29,43 +29,71 @@ class Tensor {
public:
Tensor() : offset_(0) {}
explicit Tensor(const DDim& dims) : dims_(dims), offset_(0) {}
template <typename T>
const T* data() const {
PADDLE_ENFORCE(
holder_ != nullptr,
"Tenosr has not been initialized. Call Tensor::mutable_data first.");
CheckDims<T>();
return reinterpret_cast<const T*>(
reinterpret_cast<uintptr_t>(holder_->Ptr()) + offset_);
reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
}
template <typename T, // must be POD types
typename std::enable_if<std::is_pod<T>::value>::type* = nullptr>
T* mutable_data(DDim dims, paddle::platform::Place place) {
dims_ = dims;
template <typename T>
T* mutable_data(DDim dims, platform::Place place) {
set_dims(dims);
return mutable_data<T>(place);
}
template <typename T>
T* mutable_data(platform::Place place) {
PADDLE_ENFORCE(product(dims_) > 0,
"Tensor's numel must be larger than zero to call "
"Tensor::mutable_data. Call Tensor::set_dim first.");
if (holder_ == nullptr ||
!(holder_->Place() ==
!(holder_->place() ==
place) /* some versions of boost::variant don't have operator!= */
|| holder_->Size() < product(dims) * sizeof(T) + offset_) {
holder_.reset(new PlaceholderImpl<T>(place, product(dims) * sizeof(T)));
|| holder_->size() < product(dims_) * sizeof(T) + offset_) {
if (platform::is_cpu_place(place)) {
holder_.reset(new PlaceholderImpl<T, platform::CPUPlace>(
boost::get<platform::CPUPlace>(place), product(dims_) * sizeof(T)));
} else if (platform::is_gpu_place(place)) {
#ifdef __CUDACC__
holder_.reset(new PlaceholderImpl<T, platform::GPUPlace>(
boost::get<platform::GPUPlace>(place), product(dims_) * sizeof(T)));
#else
PADDLE_ENFORCE(true, "'GPUPlace' is not supported in CPU only device.");
#endif
} else {
PADDLE_ENFORCE(true, "Unknown 'place'.");
}
offset_ = 0;
}
return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->Ptr()) +
return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
offset_);
}
template <typename T>
void ShareDataFrom(const Tensor& src) {
PADDLE_ENFORCE(src.holder_ != nullptr,
"Can not share data from an uninitialized tensor.");
src.CheckDims<T>();
holder_ = src.holder_;
dims_ = src.dims_;
set_dims(src.dims());
offset_ = src.offset_;
}
template <typename T>
void CopyFrom(const Tensor& src, platform::Place dst_place) {
PADDLE_ENFORCE(platform::is_cpu_place(src.holder_->place()) &&
platform::is_cpu_place(dst_place),
"Tensor::CopyFrom only support CPU now.");
src.CheckDims<T>();
size_t size = product(src.dims_) * sizeof(T);
set_dims(src.dims());
const void* src_ptr = static_cast<const void*>(src.data<T>());
void* dst_ptr = static_cast<void*>(mutable_data<T>(dst_place));
memcpy(dst_ptr, src_ptr, size);
}
template <typename T>
Tensor Slice(const int& begin_idx, const int& end_idx) const {
PADDLE_ENFORCE(holder_ != nullptr,
"The sliced tenosr has not been initialized.");
CheckDims<T>();
PADDLE_ENFORCE(begin_idx >= 0 && end_idx <= dims_[0],
"Slice index is less than zero or out of bound.");
PADDLE_ENFORCE(begin_idx < end_idx,
......@@ -78,12 +106,20 @@ class Tensor {
}
Tensor dst;
dst.holder_ = holder_;
dst.dims_ = dims_;
dst.dims_[0] = end_idx - begin_idx;
dst.offset_ = offset_ + begin_idx * base * holder_->TypeSize();
DDim dst_dims = dims_;
dst_dims[0] = end_idx - begin_idx;
dst.set_dims(dst_dims);
dst.offset_ = offset_ + begin_idx * base * sizeof(T);
return dst;
}
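  // Sanity check of the offset arithmetic above: for an int tensor with dims
  // {5, 3, 4}, base == 3 * 4 == 12, so Slice<int>(1, 3) yields dims {2, 3, 4}
  // and offset_ == 0 + 1 * 12 * sizeof(int) == 48 bytes, matching the
  // "src_data_address + 3 * 4 * 1 * sizeof(int)" expectation in the unit test.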
void set_dims(const DDim& dims) {
if (dims == dims_) {
return;
}
dims_ = dims;
}
DDim dims() const { return dims_; }
private:
......@@ -91,43 +127,49 @@ class Tensor {
// parameter of Variable.
struct Placeholder {
virtual ~Placeholder() {}
virtual void* Ptr() const = 0;
virtual paddle::platform::Place Place() const = 0;
virtual size_t Size() const = 0;
virtual size_t TypeSize() const = 0;
virtual void* ptr() const = 0;
virtual platform::Place place() const = 0;
virtual size_t size() const = 0;
};
template <typename T>
template <typename T, typename PlaceType>
struct PlaceholderImpl : public Placeholder {
private:
template <typename PType>
class Deleter {
public:
Deleter(platform::Place place) : place_(place) {}
void operator()(T* ptr) {
paddle::memory::Free(place_, static_cast<void*>(ptr));
}
Deleter(PType place) : place_(place) {}
void operator()(T* ptr) { memory::Free(place_, static_cast<void*>(ptr)); }
private:
paddle::platform::Place place_;
PType place_;
};
public:
PlaceholderImpl(paddle::platform::Place place, size_t size)
: ptr_(static_cast<T*>(paddle::memory::Alloc(place, size)),
Deleter(place)),
PlaceholderImpl(PlaceType place, size_t size)
: ptr_(static_cast<T*>(memory::Alloc(place, size)),
Deleter<PlaceType>(place)),
place_(place),
size_(size) {}
virtual void* Ptr() const { return static_cast<void*>(ptr_.get()); }
virtual size_t Size() const { return size_; }
virtual paddle::platform::Place Place() const { return place_; }
virtual size_t TypeSize() const { return sizeof(T); }
virtual void* ptr() const { return static_cast<void*>(ptr_.get()); }
virtual size_t size() const { return size_; }
virtual platform::Place place() const { return place_; }
std::unique_ptr<T, Deleter> ptr_;
paddle::platform::Place place_; // record the place of ptr_.
size_t size_; // size of the memory block.
std::unique_ptr<T, Deleter<PlaceType>> ptr_;
platform::Place place_; // record the place of ptr_.
size_t size_; // size of the memory block.
};
template <typename T>
inline void CheckDims() const {
PADDLE_ENFORCE(holder_ != nullptr,
"Tenosr holds no memory. Call Tensor::mutable_data first.");
PADDLE_ENFORCE(holder_->size() >= product(dims_) * sizeof(T) + offset_,
"Tensor's dims_ is out of bound. Call Tensor::mutable_data "
"first to re-allocate memory.");
}
std::shared_ptr<Placeholder> holder_; // holds the memory block if allocated.
DDim dims_;
size_t offset_; // marks the begin of tensor data area.
......
......@@ -18,7 +18,8 @@
TEST(Tensor, Dims) {
using namespace paddle::framework;
using namespace paddle::platform;
Tensor tt(make_ddim({2, 3, 4}));
Tensor tt;
tt.set_dims(make_ddim({2, 3, 4}));
DDim dims = tt.dims();
ASSERT_EQ(arity(dims), 3);
for (int i = 0; i < 3; ++i) {
......@@ -35,7 +36,7 @@ TEST(Tensor, DataAssert) {
} catch (paddle::framework::EnforceNotMet err) {
caught = true;
std::string msg =
"Tenosr has not been initialized. Call Tensor::mutable_data first.";
"Tenosr holds no memory. Call Tensor::mutable_data first.";
const char* what = err.what();
for (size_t i = 0; i < msg.length(); ++i) {
ASSERT_EQ(what[i], msg[i]);
......@@ -46,7 +47,7 @@ TEST(Tensor, DataAssert) {
/* following tests are not available at present
because Memory::Alloc() and Memory::Free() have not been ready.
*/
TEST(Tensor, MutableData) {
using namespace paddle::framework;
using namespace paddle::platform;
......@@ -71,7 +72,7 @@ TEST(Tensor, MutableData) {
p2 = src_tensor.mutable_data<float>(make_ddim({2, 2}), CPUPlace());
EXPECT_EQ(p1, p2);
}
#ifdef __CUDACC__
{
Tensor src_tensor;
float* p1 = nullptr;
......@@ -93,6 +94,7 @@ TEST(Tensor, MutableData) {
p2 = src_tensor.mutable_data<float>(make_ddim({2, 2}), GPUPlace());
EXPECT_EQ(p1, p2);
}
#endif
}
TEST(Tensor, ShareDataFrom) {
......@@ -104,10 +106,11 @@ TEST(Tensor, ShareDataFrom) {
// Try to share data form uninitialized tensor
bool caught = false;
try {
dst_tensor.ShareDataFrom(src_tensor);
dst_tensor.ShareDataFrom<float>(src_tensor);
} catch (EnforceNotMet err) {
caught = true;
std::string msg = "Can not share data from an uninitialized tensor.";
std::string msg =
"Tenosr holds no memory. Call Tensor::mutable_data first.";
const char* what = err.what();
for (size_t i = 0; i < msg.length(); ++i) {
ASSERT_EQ(what[i], msg[i]);
......@@ -116,17 +119,19 @@ TEST(Tensor, ShareDataFrom) {
ASSERT_TRUE(caught);
src_tensor.mutable_data<int>(make_ddim({2, 3, 4}), CPUPlace());
dst_tensor.ShareDataFrom(src_tensor);
dst_tensor.ShareDataFrom<int>(src_tensor);
ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
}
#ifdef __CUDACC__
{
Tensor src_tensor;
Tensor dst_tensor;
src_tensor.mutable_data<int>(make_ddim({2, 3, 4}), GPUPlace());
dst_tensor.ShareDataFrom(src_tensor);
dst_tensor.ShareDataFrom<int>(src_tensor);
ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
}
#endif
}
TEST(Tensor, Slice) {
......@@ -135,7 +140,7 @@ TEST(Tensor, Slice) {
{
Tensor src_tensor;
src_tensor.mutable_data<int>(make_ddim({5, 3, 4}), CPUPlace());
Tensor slice_tensor = src_tensor.Slice(1, 3);
Tensor slice_tensor = src_tensor.Slice<int>(1, 3);
DDim slice_dims = slice_tensor.dims();
ASSERT_EQ(arity(slice_dims), 3);
EXPECT_EQ(slice_dims[0], 2);
......@@ -155,10 +160,11 @@ TEST(Tensor, Slice) {
EXPECT_EQ(src_data_address + 3 * 4 * 1 * sizeof(int), slice_data_address);
}
#ifdef __CUDACC__
{
Tensor src_tensor;
src_tensor.mutable_data<double>(make_ddim({6, 9}), GPUPlace());
Tensor slice_tensor = src_tensor.Slice(2, 6);
Tensor slice_tensor = src_tensor.Slice<double>(2, 6);
DDim slice_dims = slice_tensor.dims();
ASSERT_EQ(arity(slice_dims), 2);
EXPECT_EQ(slice_dims[0], 4);
......@@ -176,6 +182,31 @@ TEST(Tensor, Slice) {
EXPECT_EQ(slice_data_address, slice_mutable_data_address);
EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address);
}
#endif
}
*/
\ No newline at end of file
TEST(Tensor, CopyFrom) {
using namespace paddle::framework;
using namespace paddle::platform;
Tensor src_tensor;
int* src_ptr = src_tensor.mutable_data<int>(make_ddim({3, 3}), CPUPlace());
int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
memcpy(src_ptr, arr, 9 * sizeof(int));
Tensor dst_tensor;
dst_tensor.CopyFrom<int>(src_tensor, CPUPlace());
const int* dst_ptr = dst_tensor.data<int>();
ASSERT_NE(src_ptr, dst_ptr);
for (size_t i = 0; i < 9; ++i) {
EXPECT_EQ(src_ptr[i], dst_ptr[i]);
}
Tensor slice_tensor = src_tensor.Slice<int>(1, 2);
dst_tensor.CopyFrom<int>(slice_tensor, CPUPlace());
const int* slice_ptr = slice_tensor.data<int>();
dst_ptr = dst_tensor.data<int>();
ASSERT_NE(dst_ptr, slice_ptr);
for (size_t i = 0; i < 3; ++i) {
EXPECT_EQ(dst_ptr[i], slice_ptr[i]);
}
}
......@@ -11,7 +11,6 @@ if(WITH_GPU)
endif()
if(USE_NNPACK)
include(nnpack/nnpack.cmake)
list(APPEND cpp_files nnpack/NNPACKConvOp.cpp)
if(WITH_TESTING)
add_unittest(NNPACKConvOpTest nnpack/NNPACKConvOpTest.cpp)
......
......@@ -117,8 +117,7 @@ public:
ConvFunctionBase::init(config);
}
virtual void check(const BufferArgs& inputs,
const BufferArgs& outputs) override {
void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
const TensorShape& input = inputs[0].shape();
const TensorShape& filter = inputs[1].shape();
const TensorShape& output = outputs[0].shape();
......@@ -217,8 +216,7 @@ public:
ConvFunctionBase::init(config);
}
virtual void check(const BufferArgs& inputs,
const BufferArgs& outputs) override {
void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
const TensorShape& output = inputs[0].shape();
const TensorShape& filter = inputs[1].shape();
const TensorShape& input = outputs[0].shape();
......@@ -311,8 +309,7 @@ public:
ConvFunctionBase::init(config);
}
virtual void check(const BufferArgs& inputs,
const BufferArgs& outputs) override {
void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
const TensorShape& output = inputs[0].shape();
const TensorShape& input = inputs[1].shape();
const TensorShape& filter = outputs[0].shape();
......
......@@ -90,8 +90,7 @@ public:
ConvFunctionBase::init(config);
}
virtual void check(const BufferArgs& inputs,
const BufferArgs& outputs) override {
void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
const TensorShape& input = inputs[0].shape();
const TensorShape& filter = inputs[1].shape();
const TensorShape& output = outputs[0].shape();
......
......@@ -32,7 +32,7 @@ __global__ void KeRowConv(real* y, const real* x, const real* w,
for (int i = tidy; i < context; i += blky) {
sw[i][tidx] = gidx + tidx < width ? w[i*width + gidx + tidx] : 0.0;
}
__syncthreads();
for (int i = 0; i < numSeq; ++i) {
......@@ -144,12 +144,15 @@ __global__ void KeRowConvBwWeight(real* dw, const real* x, const real* dy,
int yoff = start + j;
// transpose
sh_x[tidx][tidy] = (xoff < width && yoff < end) ? x[yoff * width + xoff] : 0.0;
sh_dy[tidx][tidy + context - 1] = (xoff < width && yoff < end) ? dy[yoff * width + xoff] : 0.0;
sh_x[tidx][tidy] = (xoff < width && yoff < end) ?
x[yoff * width + xoff] : 0.0;
sh_dy[tidx][tidy + context - 1] = (xoff < width && yoff < end) ?
dy[yoff * width + xoff] : 0.0;
__syncthreads();
if (tidy < (context - 1)) {
yoff = yoff - context + 1;
sh_dy[tidx][tidy] = (xoff < width && yoff >= start) ? dy[yoff * width + xoff] : 0.0;
sh_dy[tidx][tidy] = (xoff < width && yoff >= start) ?
dy[yoff * width + xoff] : 0.0;
}
__syncthreads();
......@@ -199,11 +202,13 @@ __global__ void KeRowConvBwWeight2(real* dw, const real* x, const real* dy,
int yoff = start + j;
// transpose
sh_x[tidx][tidy] = (xoff < width && yoff < end) ? x[yoff * width + xoff] : 0.0;
sh_x[tidx][tidy] = (xoff < width && yoff < end) ?
x[yoff * width + xoff] : 0.0;
__syncthreads();
for (int t = 0; t < context; t++) {
sh_dy[tidx][tidy] = (xoff < width && (yoff - t) >= start && yoff - t < end) ? dy[(yoff - t) * width + xoff] : 0.0;
sh_dy[tidx][tidy] = (xoff < width && (yoff - t) >= start &&
yoff - t < end) ? dy[(yoff - t) * width + xoff] : 0.0;
__syncthreads();
real val = sh_x[tidy][tidx] * sh_dy[tidy][tidx];
......@@ -239,7 +244,7 @@ __global__ void KeRowConvBwData(real* dx, const real* w, const real* dy,
for (int i = tidy; i < context; i += blky) {
sw[i][tidx] = gidx + tidx < width ? w[i*width + gidx + tidx] : 0.0;
}
__syncthreads();
for (int i = 0; i < numSeq; ++i) {
......@@ -312,7 +317,7 @@ void RowConvGrad<DEVICE_TYPE_GPU>(const GpuMatrix& outG,
dim3 dimBlock(32, 32);
dim3 dimGrid(DIVUP(width, dimBlock.x), 1);
real* dw = filterG.getData();
if (contextLength <= 32) {
if (contextLength <= 32) {
KeRowConvBwWeight<32, 32, 32>
<<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>
(dw, x, dy, starts, height, width, numSeq, contextLength);
......
......@@ -16,7 +16,7 @@ limitations under the License. */
#include "paddle/function/ConvOp.h"
DEFINE_bool(nnpack_allocate_outside,
false,
true,
"Allocate and free workspace memory outside the NNPACK interface.");
DEFINE_int32(nnpack_num_threads,
0,
......@@ -58,18 +58,10 @@ public:
workspaceBuffer_ = nullptr;
workspaceSize_ = 0;
threadpool_ = nullptr;
if (FLAGS_nnpack_num_threads) {
threadpool_ = pthreadpool_create(FLAGS_nnpack_num_threads);
VLOG(3) << "Number of threads "
<< pthreadpool_get_threads_count(threadpool_);
}
create_nnpack_threadpool();
}
~NNPACKConvFunction() {
if (threadpool_) {
pthreadpool_destroy(threadpool_);
}
if (workspaceBuffer_) {
free(workspaceBuffer_);
}
......@@ -225,14 +217,25 @@ public:
}
}
static void create_nnpack_threadpool() {
if (FLAGS_nnpack_num_threads && threadpool_ == nullptr) {
threadpool_ = pthreadpool_create(FLAGS_nnpack_num_threads);
VLOG(3) << "Number of threads "
<< pthreadpool_get_threads_count(threadpool_);
}
}
private:
nnp_convolution_algorithm algorithm_;
nnp_convolution_transform_strategy transform_strategy_;
void* workspaceBuffer_;
size_t workspaceSize_;
pthreadpool_t threadpool_;
static pthreadpool_t threadpool_;
};
template <DeviceType Device>
pthreadpool_t NNPACKConvFunction<Device>::threadpool_ = nullptr;
REGISTER_TYPED_FUNC(NNPACKConv, CPU, NNPACKConvFunction);
} // namespace paddle
......@@ -205,10 +205,8 @@ public:
hl_destroy_event(hlEvent_);
hlEvent_ = NULL;
}
if (batchData_) {
delete batchData_;
batchData_ = NULL;
}
delete batchData_;
batchData_ = NULL;
}
void setDataBatch(DataBatch* batchData) { batchData_ = batchData; }
......
......@@ -403,7 +403,7 @@ public:
: layerName_(layerName) {
addEvaluator(std::move(evaluator));
}
virtual void eval(const NeuralNetwork& nn) override {
void eval(const NeuralNetwork& nn) override {
const LayerPtr& layer = nn.getLayer(layerName_);
CHECK(layer) << "Nonexisted layer: " << layerName_ << " in submodel "
<< nn.getName();
......
......@@ -636,7 +636,7 @@ void lenToStarts(std::vector<int>& starts) {
}
starts.back() = pos;
}
}
} // namespace
void RecurrentGradientMachine::calcSequenceStartPositions() {
std::vector<int> starts(commonSeqInfo_.size() + 1);
......
......@@ -124,7 +124,7 @@ void copyElements(const IVector& srcVec,
dest[index[i]] = src[i];
}
}
}
} // namespace
void GatherAgentLayer::forwardIds(PassType passType) {
IVectorPtr realId = realLayers_[0]->getOutputLabel();
......
......@@ -32,9 +32,7 @@ static InitFunction __init_storage_engine([]() { StorageEngine::singleton(); },
StorageEngine::StorageEngine() : cpuAllocator_(nullptr) {}
StorageEngine::~StorageEngine() {
if (cpuAllocator_) {
delete cpuAllocator_;
}
delete cpuAllocator_;
for (auto it : gpuAllocator_) {
delete it;
}
......
add_subdirectory(detail)
cc_library(memory SRCS memory.cc)
cc_library(paddle_memory
DEPS
memory meta_data
meta_cache memory_block
buddy_allocator system_allocator)
cc_test(memory_test SRCS memory_test.cc DEPS place paddle_memory)
if(${WITH_GPU})
nv_library(system_allocator SRCS system_allocator.cc DEPS gflags)
nv_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator gflags)
nv_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info gpu_info)
else(${WITH_GPU})
cc_library(system_allocator SRCS system_allocator.cc DEPS gflags)
cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator gflags)
cc_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info)
endif(${WITH_GPU})
cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator)
cc_library(meta_data SRCS meta_data.cc)
cc_library(meta_cache SRCS meta_cache.cc)
cc_library(memory_block SRCS memory_block.cc)
cc_library(buddy_allocator SRCS buddy_allocator.cc DEPS glog)
......@@ -12,22 +12,317 @@
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/memory/detail/buddy_allocator.h"
#include "glog/logging.h"
namespace paddle {
namespace memory {
namespace detail {
BuddyAllocator::BuddyAllocator(size_t pool_size, size_t max_pools,
SystemAllocator* system_allocator)
: pool_size_(pool_size),
max_pools_(max_pools),
system_allocator_(system_allocator) {
PADDLE_ASSERT(pool_size > 0);
PADDLE_ASSERT(max_pools > 0);
PADDLE_ASSERT(system_allocator != nullptr);
BuddyAllocator::BuddyAllocator(SystemAllocator* system_allocator,
size_t min_chunk_size, size_t max_chunk_size)
: min_chunk_size_(min_chunk_size),
max_chunk_size_(max_chunk_size),
cache_(system_allocator->UseGpu()),
system_allocator_(std::move(system_allocator)) {}
BuddyAllocator::~BuddyAllocator() {
DLOG(INFO) << "BuddyAllocator Disconstructor makes sure that all of these "
"have actually been freed";
while (!pool_.empty()) {
auto block = static_cast<MemoryBlock*>(std::get<2>(*pool_.begin()));
DLOG(INFO) << "Free from block (" << block << ", " << max_chunk_size_
<< ")";
system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
cache_.invalidate(block);
pool_.erase(pool_.begin());
}
}
inline size_t align(size_t size, size_t alignment) {
size_t remaining = size % alignment;
return remaining == 0 ? size : size + (alignment - remaining);
}
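// The align helper rounds a request up to the chunk granularity, e.g.
//   align(13, 8) == 16   // 13 % 8 == 5, so add (8 - 5)
//   align(16, 8) == 16   // already aligned
// so, assuming for illustration that sizeof(Metadata) == 32 and
// min_chunk_size_ == 256, Alloc(100) requests align(132, 256) == 256 bytes.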
void* BuddyAllocator::Alloc(size_t unaligned_size) {
// adjust allocation alignment
size_t size = align(unaligned_size + sizeof(Metadata), min_chunk_size_);
// acquire the allocator lock
std::lock_guard<std::mutex> lock(mutex_);
DLOG(INFO) << "Allocate " << unaligned_size << " bytes from chunk size "
<< size;
// if the allocation is huge, send directly to the system allocator
if (size > max_chunk_size_) {
DLOG(INFO) << "Allocate from system allocator.";
return SystemAlloc(size);
}
// query and allocate from the existing chunk
auto it = FindExistChunk(size);
  // refill the pool on failure
  if (it == pool_.end()) {
    it = RefillPool();
    // if refilling also fails, give up and return nullptr
if (it == pool_.end()) {
return nullptr;
}
} else {
DLOG(INFO) << "Allocation from existing memory block " << std::get<2>(*it)
<< " at address "
<< reinterpret_cast<MemoryBlock*>(std::get<2>(*it))->data();
}
total_used_ += size;
total_free_ -= size;
// split the allocation and return data for use
return reinterpret_cast<MemoryBlock*>(SplitToAlloc(it, size))->data();
}
void BuddyAllocator::Free(void* p) {
// Point back to metadata
auto block = static_cast<MemoryBlock*>(p)->metadata();
// Acquire the allocator lock
std::lock_guard<std::mutex> lock(mutex_);
DLOG(INFO) << "Free from address " << block;
if (block->type(cache_) == MemoryBlock::HUGE_CHUNK) {
DLOG(INFO) << "Free directly from system allocator";
system_allocator_->Free(block, block->total_size(cache_),
block->index(cache_));
// Invalidate GPU allocation from cache
cache_.invalidate(block);
return;
}
block->mark_as_free(cache_);
total_used_ -= block->total_size(cache_);
total_free_ += block->total_size(cache_);
// Trying to merge the right buddy
if (block->has_right_buddy(cache_)) {
DLOG(INFO) << "Merging this block " << block << " with its right buddy "
<< block->right_buddy(cache_);
auto right_buddy = block->right_buddy(cache_);
if (right_buddy->type(cache_) == MemoryBlock::FREE_CHUNK) {
// Take away right buddy from pool
pool_.erase(IndexSizeAddress(right_buddy->index(cache_),
right_buddy->total_size(cache_),
right_buddy));
// merge its right buddy to the block
block->merge(cache_, right_buddy);
}
}
// Trying to merge the left buddy
if (block->has_left_buddy(cache_)) {
DLOG(INFO) << "Merging this block " << block << " with its left buddy "
<< block->left_buddy(cache_);
auto left_buddy = block->left_buddy(cache_);
if (left_buddy->type(cache_) == MemoryBlock::FREE_CHUNK) {
// Take away right buddy from pool
pool_.erase(IndexSizeAddress(left_buddy->index(cache_),
left_buddy->total_size(cache_), left_buddy));
// merge the block to its left buddy
left_buddy->merge(cache_, block);
block = left_buddy;
}
}
// Dumping this block into pool
DLOG(INFO) << "Inserting free block (" << block << ", "
<< block->total_size(cache_) << ")";
pool_.insert(
IndexSizeAddress(block->index(cache_), block->total_size(cache_), block));
  // Clean up if there is too much idle free memory
  // Prefer freeing fallback allocations first
CleanIdleFallBackAlloc();
// Free normal allocation
CleanIdleNormalAlloc();
}
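// Illustrative free-and-merge sequence (sizes hypothetical): freeing block B
// whose right buddy R is a FREE_CHUNK first erases (index, size_R, R) from
// pool_, merges R into B, and later re-inserts (index, size_B + size_R, B);
// a free left buddy is handled symmetrically, with the merged block
// re-inserted under the left buddy's address.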
size_t BuddyAllocator::Used() { return total_used_; }
void* BuddyAllocator::SystemAlloc(size_t size) {
size_t index = 0;
void* p = system_allocator_->Alloc(index, size);
DLOG(INFO) << "Allocated " << p << " from system allocator.";
if (p == nullptr) return nullptr;
static_cast<MemoryBlock*>(p)->init(cache_, MemoryBlock::HUGE_CHUNK, index,
size, nullptr, nullptr);
return static_cast<MemoryBlock*>(p)->data();
}
BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
#ifndef PADDLE_ONLY_CPU
if (system_allocator_->UseGpu()) {
if ((total_used_ + total_free_) == 0) {
// Compute the maximum allocation size for the first allocation.
max_chunk_size_ = platform::GpuMaxChunkSize();
}
}
#endif // PADDLE_ONLY_CPU
// Allocate a new maximum sized block
size_t index = 0;
void* p = system_allocator_->Alloc(index, max_chunk_size_);
if (p == nullptr) return pool_.end();
DLOG(INFO) << "Creating and inserting new block " << p
<< " from system allocator";
static_cast<MemoryBlock*>(p)->init(cache_, MemoryBlock::FREE_CHUNK, index,
max_chunk_size_, nullptr, nullptr);
// gpu fallback allocation
if (system_allocator_->UseGpu() &&
static_cast<MemoryBlock*>(p)->index(cache_) == 1) {
fallback_alloc_count_++;
}
total_free_ += max_chunk_size_;
// dump the block into pool
return pool_.insert(IndexSizeAddress(index, max_chunk_size_, p)).first;
}
BuddyAllocator::PoolSet::iterator BuddyAllocator::FindExistChunk(size_t size) {
size_t index = 0;
while (1) {
auto it = pool_.lower_bound(IndexSizeAddress(index, size, nullptr));
    // no matching chunk in the pool
if (it == pool_.end()) return it;
if (std::get<0>(*it) > index) {
// find suitable one
if (std::get<1>(*it) >= size) {
return it;
}
// update and continue
index = std::get<0>(*it);
continue;
}
return it;
}
}
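The first-fit search in FindExistChunk relies on PoolSet's lexicographic tuple ordering: lower_bound on (index, size, nullptr) lands on the first free chunk, in (index, size, address) order, that is not smaller than the request; the loop above additionally handles the jump into a higher allocator index. A self-contained sketch of the basic lookup, with made-up chunk entries:

#include <cstddef>
#include <cstdio>
#include <set>
#include <tuple>

using IndexSizeAddress = std::tuple<std::size_t, std::size_t, void*>;
using PoolSet = std::set<IndexSizeAddress>;

int main() {
  char a, b, c;  // stand-ins for chunk addresses
  PoolSet pool;
  pool.insert(IndexSizeAddress(0, 4096, &a));
  pool.insert(IndexSizeAddress(0, 16384, &b));
  pool.insert(IndexSizeAddress(1, 8192, &c));
  // First fit for a 6000-byte request among index-0 chunks: skips the
  // 4096-byte chunk and lands on the 16384-byte one.
  auto it = pool.lower_bound(IndexSizeAddress(0, 6000, nullptr));
  std::printf("index=%zu size=%zu\n", std::get<0>(*it), std::get<1>(*it));
  return 0;
}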
void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it,
size_t size) {
auto block = static_cast<MemoryBlock*>(std::get<2>(*it));
pool_.erase(it);
DLOG(INFO) << "Split block (" << block << ", " << block->total_size(cache_)
<< ") into";
block->split(cache_, size);
DLOG(INFO) << "Left block (" << block << ", " << block->total_size(cache_)
<< ")";
block->set_type(cache_, MemoryBlock::ARENA_CHUNK);
// insert the remaining memory, if any exists
if (block->has_right_buddy(cache_)) {
if (block->right_buddy(cache_)->type(cache_) == MemoryBlock::FREE_CHUNK) {
DLOG(INFO) << "Insert right block (" << block->right_buddy(cache_) << ", "
<< block->right_buddy(cache_)->total_size(cache_) << ")";
pool_.insert(
IndexSizeAddress(block->right_buddy(cache_)->index(cache_),
block->right_buddy(cache_)->total_size(cache_),
block->right_buddy(cache_)));
}
}
return block;
}
void BuddyAllocator::CleanIdleFallBackAlloc() {
// If fallback allocation does not exist, return directly
if (!fallback_alloc_count_) return;
for (auto pool = pool_.rbegin(); pool != pool_.rend();) {
// If the free memory block is smaller than max_chunk_size_, return directly
if (std::get<1>(*pool) < max_chunk_size_) return;
MemoryBlock* block = static_cast<MemoryBlock*>(std::get<2>(*pool));
// If not on GPU, or this block is not a fallback allocation, return
if (!system_allocator_->UseGpu() || block->index(cache_) == 0) {
return;
}
DLOG(INFO) << "Return block " << block << " to fallback allocator.";
system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
cache_.invalidate(block);
pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base()));
total_free_ -= max_chunk_size_;
fallback_alloc_count_--;
// If no fallback allocation remains, return directly
if (!fallback_alloc_count_) return;
}
}
void BuddyAllocator::CleanIdleNormalAlloc() {
auto shall_free_alloc = [&]() -> bool {
// free all fallback allocations
if (fallback_alloc_count_ > 0) {
return true;
}
// keep 2x overhead if we haven't fallen back
if ((total_used_ + max_chunk_size_) * 2 < total_free_) {
return true;
}
return false;
};
if (!shall_free_alloc()) return;
for (auto pool = pool_.rbegin(); pool != pool_.rend();) {
// If the free memory block is smaller than max_chunk_size_, return directly
if (std::get<1>(*pool) < max_chunk_size_) return;
MemoryBlock* block = static_cast<MemoryBlock*>(std::get<2>(*pool));
DLOG(INFO) << "Return block " << block << " to base allocator.";
system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
cache_.invalidate(block);
pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base()));
total_free_ -= max_chunk_size_;
if (!shall_free_alloc()) return;
}
}
} // namespace detail
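CleanIdleNormalAlloc's policy can be read off in isolation: idle chunks are released once free memory exceeds twice the used memory plus one chunk of headroom, or whenever fallback allocations exist. A minimal self-contained sketch of that predicate, with arbitrary illustrative values:

#include <cstddef>
#include <cstdio>

// Mirrors the shall_free_alloc() lambda above; the allocator's fields are
// plain parameters here.
static bool shall_free_alloc(std::size_t total_used, std::size_t total_free,
                             std::size_t max_chunk_size,
                             std::size_t fallback_count) {
  if (fallback_count > 0) return true;  // always release fallback memory
  return (total_used + max_chunk_size) * 2 < total_free;
}

int main() {
  const std::size_t MB = 1 << 20;
  // 32 MB used, 64 MB chunks: chunks are freed only once more than
  // (32 + 64) * 2 = 192 MB sits idle in the pool.
  std::printf("%d\n", shall_free_alloc(32 * MB, 200 * MB, 64 * MB, 0));  // 1
  std::printf("%d\n", shall_free_alloc(32 * MB, 150 * MB, 64 * MB, 0));  // 0
  return 0;
}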
......
......@@ -14,9 +14,16 @@
#pragma once
#include "paddle/memory/detail/meta_cache.h"
#include "paddle/memory/detail/meta_data.h"
#include "paddle/memory/detail/system_allocator.h"
#include "paddle/platform/assert.h"
#include "paddle/platform/cpu_info.h"
#include "paddle/platform/gpu_info.h"
#include <mutex>
#include <set>
#include <unordered_map>
#include <vector>
namespace paddle {
......@@ -25,61 +32,80 @@ namespace detail {
class BuddyAllocator {
public:
BuddyAllocator(size_t pool_size, size_t max_pools,
SystemAllocator* system_allocator);
BuddyAllocator(SystemAllocator* system_allocator, size_t min_chunk_size,
size_t max_chunk_size);
~BuddyAllocator();
void* Alloc(size_t size);
public:
void* Alloc(size_t unaligned_size);
void Free(void*);
size_t Used();
public:
// Disable copy and assignment
BuddyAllocator(const BuddyAllocator&) = delete;
BuddyAllocator& operator=(const BuddyAllocator&) = delete;
private:
struct Block {
size_t size_;
Block* left_; // left buddy
Block* right_; // right buddy
};
// Tuple (allocator index, memory size, memory address)
using IndexSizeAddress = std::tuple<size_t, size_t, void*>;
// Each element in PoolSet is a free allocation
using PoolSet = std::set<IndexSizeAddress>;
// Initially, there is only one pool. If an Alloc cannot find enough
// memory in that pool, and fewer than max_num_pools_ pools exist,
// a new pool is created by calling system_allocator_.Alloc(pool_size_).
std::vector<void*> pools_;
/*! \brief Allocate memory directly from the system allocator */
void* SystemAlloc(size_t size);
size_t pool_size_; // the size of each pool
size_t max_num_pools_; // the maximum number of pools
/*! \brief If existing chunks are not suitable, refill pool */
PoolSet::iterator RefillPool();
SystemAllocator* system_allocator_;
/**
* \brief Find a suitable chunk from the existing pool and split
* it into left and right buddies
*
* \param it the iterator of pool list
* \param size the size of allocation
*
* \return the left buddy address
*/
void* SplitToAlloc(PoolSet::iterator it, size_t size);
std::mutex mutex_;
/*! \brief Find an existing chunk suitable for the allocation */
PoolSet::iterator FindExistChunk(size_t size);
// Disable copy and assignment.
BuddyAllocator(const BuddyAllocator&) = delete;
BuddyAllocator& operator=(const BuddyAllocator&) = delete;
};
/*! \brief Clean idle fallback allocation */
void CleanIdleFallBackAlloc();
/*! \brief Clean idle normal allocation */
void CleanIdleNormalAlloc();
BuddyAllocator<CPUAllocator>* GetCPUBuddyAllocator() {
static BuddyAllocator<CPUAllocator>* a = nullptr;
if (a == nullptr) {
a = new BuddyAllocator<CPUAllocator>();
}
return a;
}
#ifndef PADDLE_ONLY_CPU // The following code are for CUDA.
BuddyAllocator<GPUAllocator>* GetGPUBuddyAllocator(int gpu_id) {
static BuddyAllocator<GPUAllocator>** as = NULL;
if (as == NULL) {
int gpu_num = platform::GetDeviceCount();
as = new BuddyAllocator<GPUAllocator>*[gpu_num];
for (int gpu = 0; gpu < gpu_num; gpu++) {
as[gpu] = new BuddyAllocator<GPUAllocator>();
}
}
return as[gpu_id];
}
#endif // PADDLE_ONLY_CPU
private:
size_t total_used_ = 0; // the total size of used memory
size_t total_free_ = 0; // the total size of free memory
size_t min_chunk_size_; // the minimum size of each chunk
size_t max_chunk_size_; // the maximum size of each chunk
private:
/**
* \brief The set of free allocations
*
* \note Only free chunk memory is stored in the pool
*/
PoolSet pool_;
/*! Record fallback allocation count for auto-scaling */
size_t fallback_alloc_count_ = 0;
private:
/*! Unify the metadata format between GPU and CPU allocations */
MetadataCache cache_;
private:
/*! Allocate CPU/GPU memory from system */
SystemAllocator* system_allocator_;
std::mutex mutex_;
};
} // namespace detail
} // namespace memory
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/memory/detail/memory_block.h"
#include "paddle/memory/detail/meta_cache.h"
#include "paddle/memory/detail/meta_data.h"
#include "paddle/platform/assert.h"
namespace paddle {
namespace memory {
namespace detail {
void MemoryBlock::init(MetadataCache& cache, Type t, size_t index, size_t size,
void* left_buddy, void* right_buddy) {
cache.store(this, Metadata(t, index, size - sizeof(Metadata), size,
static_cast<MemoryBlock*>(left_buddy),
static_cast<MemoryBlock*>(right_buddy)));
}
MemoryBlock::Type MemoryBlock::type(MetadataCache& cache) const {
return cache.load(this).type;
}
size_t MemoryBlock::size(MetadataCache& cache) const {
return cache.load(this).size;
}
size_t MemoryBlock::total_size(MetadataCache& cache) const {
return cache.load(this).total_size;
}
MemoryBlock* MemoryBlock::left_buddy(MetadataCache& cache) const {
return cache.load(this).left_buddy;
}
MemoryBlock* MemoryBlock::right_buddy(MetadataCache& cache) const {
return cache.load(this).right_buddy;
}
void MemoryBlock::split(MetadataCache& cache, size_t size) {
// make sure the split fits
PADDLE_ASSERT(total_size(cache) >= size);
// bail out if there is no room for another partition
if (total_size(cache) - size <= sizeof(Metadata)) {
return;
}
// find the position of the split
void* right_partition = reinterpret_cast<uint8_t*>(this) + size;
size_t remaining_size = total_size(cache) - size;
// Add the new block as a buddy
auto metadata = cache.load(this);
// Write the metadata for the new block
auto new_block_right_buddy = metadata.right_buddy;
cache.store(
static_cast<MemoryBlock*>(right_partition),
Metadata(FREE_CHUNK, index(cache), remaining_size - sizeof(Metadata),
remaining_size, this, new_block_right_buddy));
metadata.right_buddy = static_cast<MemoryBlock*>(right_partition);
metadata.size = size - sizeof(Metadata);
metadata.total_size = size;
cache.store(this, metadata);
// Write metadata for the new block's right buddy
if (new_block_right_buddy != nullptr) {
auto buddy_metadata = cache.load(new_block_right_buddy);
buddy_metadata.left_buddy = static_cast<MemoryBlock*>(right_partition);
cache.store(new_block_right_buddy, buddy_metadata);
}
}
void MemoryBlock::merge(MetadataCache& cache, MemoryBlock* right_buddy) {
// only free blocks can be merged
PADDLE_ASSERT(type(cache) == FREE_CHUNK);
PADDLE_ASSERT(right_buddy->type(cache) == FREE_CHUNK);
auto metadata = cache.load(this);
// link this->buddy's buddy
metadata.right_buddy = right_buddy->right_buddy(cache);
// link buddy's buddy -> this
if (metadata.right_buddy != nullptr) {
auto buddy_metadata = cache.load(metadata.right_buddy);
buddy_metadata.left_buddy = this;
cache.store(metadata.right_buddy, buddy_metadata);
}
metadata.size += right_buddy->total_size(cache);
metadata.total_size += right_buddy->total_size(cache);
cache.store(this, metadata);
cache.store(right_buddy, Metadata(INVALID_CHUNK, 0, 0, 0, nullptr, nullptr));
}
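merge() is doubly-linked-list surgery on the buddy chain: the right buddy is spliced out, its successor is re-pointed at this block, the sizes are folded together, and the absorbed block's metadata is invalidated. The same relinking on a plain doubly linked node, as a sketch without the metadata cache:

#include <cassert>

struct Node {
  Node* left = nullptr;
  Node* right = nullptr;
  int size = 0;
};

// Mirrors MemoryBlock::merge's relinking.
void merge(Node* block, Node* right_buddy) {
  block->right = right_buddy->right;             // link this -> buddy's buddy
  if (block->right) block->right->left = block;  // link buddy's buddy -> this
  block->size += right_buddy->size;
  *right_buddy = Node{};                         // invalidate the merged node
}

int main() {
  Node a, b, c;
  a.right = &b; b.left = &a;
  b.right = &c; c.left = &b;
  a.size = 64;  b.size = 64;
  merge(&a, &b);
  assert(a.right == &c && c.left == &a && a.size == 128);
  return 0;
}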
void MemoryBlock::mark_as_free(MetadataCache& cache) {
// check for double free or corruption
PADDLE_ASSERT(type(cache) != FREE_CHUNK);
PADDLE_ASSERT(type(cache) != INVALID_CHUNK);
set_type(cache, FREE_CHUNK);
}
void MemoryBlock::set_type(MetadataCache& cache, Type t) {
auto metadata = cache.load(this);
metadata.type = t;
cache.store(this, metadata);
}
bool MemoryBlock::has_left_buddy(MetadataCache& cache) const {
return left_buddy(cache) != nullptr;
}
bool MemoryBlock::has_right_buddy(MetadataCache& cache) const {
return right_buddy(cache) != nullptr;
}
size_t MemoryBlock::index(MetadataCache& cache) const {
return cache.load(this).index;
}
void* MemoryBlock::data() const {
return const_cast<Metadata*>(reinterpret_cast<const Metadata*>(this)) + 1;
}
MemoryBlock* MemoryBlock::metadata() const {
return const_cast<MemoryBlock*>(reinterpret_cast<const MemoryBlock*>(
reinterpret_cast<const Metadata*>(this) - 1));
}
} // namespace detail
} // namespace memory
} // namespace paddle
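data() and metadata() implement a header-before-payload layout: one Metadata record sits immediately in front of the bytes handed to the user, so converting between the two views is a one-element pointer step. A stripped-down sketch of that arithmetic, with a simplified Header standing in for Metadata:

#include <cassert>
#include <cstddef>
#include <cstdlib>

struct Header { std::size_t size; };  // stands in for Metadata

// Mirrors MemoryBlock::data(): user bytes start right after the header.
void* data_of(void* block) { return static_cast<Header*>(block) + 1; }

// Mirrors MemoryBlock::metadata(): step back one header from the user pointer.
void* block_of(void* data) { return static_cast<Header*>(data) - 1; }

int main() {
  void* block = std::malloc(sizeof(Header) + 64);
  void* user = data_of(block);
  assert(block_of(user) == block);
  std::free(block);
  return 0;
}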
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cstddef>
namespace paddle {
namespace memory {
namespace detail {
// Forward Declarations
class MetadataCache;
/*! \brief A class used to interpret the contents of a memory block */
class MemoryBlock {
public:
enum Type {
FREE_CHUNK, // memory is free and idle
ARENA_CHUNK, // memory is in use
HUGE_CHUNK, // memory is managed directly by the system allocator
INVALID_CHUNK // memory is invalid
};
public:
void init(MetadataCache& cache, Type t, size_t index, size_t size,
void* left_buddy, void* right_buddy);
public:
/*! \brief The type of the allocation */
Type type(MetadataCache& cache) const;
/*! \brief The size of the data region */
size_t size(MetadataCache& cache) const;
/*! \brief An index to track the allocator */
size_t index(MetadataCache& cache) const;
/*! \brief The total size of the block */
size_t total_size(MetadataCache& cache) const;
/*! \brief Check the left buddy of the block */
bool has_left_buddy(MetadataCache& cache) const;
/*! \brief Check the right buddy of the block */
bool has_right_buddy(MetadataCache& cache) const;
/*! \brief Get the left buddy */
MemoryBlock* left_buddy(MetadataCache& cache) const;
/*! \brief Get the right buddy */
MemoryBlock* right_buddy(MetadataCache& cache) const;
public:
/*! \brief Split the allocation into left/right blocks */
void split(MetadataCache& cache, size_t size);
/*! \brief Merge left and right blocks together */
void merge(MetadataCache& cache, MemoryBlock* right_buddy);
/*! \brief Mark the allocation as free */
void mark_as_free(MetadataCache& cache);
/*! \brief Change the type of the allocation */
void set_type(MetadataCache& cache, Type t);
public:
/*! \brief Get a pointer to the memory block's data */
void* data() const;
/*! \brief Get a pointer to the memory block's metadata */
MemoryBlock* metadata() const;
public:
static size_t overhead();
};
} // namespace detail
} // namespace memory
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/memory/detail/meta_cache.h"
#include "paddle/memory/detail/memory_block.h"
#include "paddle/platform/assert.h"
namespace paddle {
namespace memory {
namespace detail {
MetadataCache::MetadataCache(bool uses_gpu) : uses_gpu_(uses_gpu) {}
Metadata MetadataCache::load(const MemoryBlock* block) {
if (uses_gpu_) {
auto existing_metadata = cache_.find(block);
PADDLE_ASSERT(existing_metadata->second.check_guards());
return existing_metadata->second;
} else {
PADDLE_ASSERT(reinterpret_cast<const Metadata*>(block)->check_guards());
return *reinterpret_cast<const Metadata*>(block);
}
}
void MetadataCache::store(MemoryBlock* block,
const Metadata& original_metadata) {
auto metadata = original_metadata;
metadata.update_guards();
if (uses_gpu_) {
cache_[block] = metadata;
} else {
*reinterpret_cast<Metadata*>(block) = metadata;
}
}
void MetadataCache::invalidate(MemoryBlock* block) {
if (uses_gpu_) {
cache_.erase(block);
}
}
} // namespace detail
} // namespace memory
} // namespace paddle
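The branch on uses_gpu_ exists because the host cannot dereference device pointers: CPU blocks keep their Metadata in-place at the head of the block, while GPU blocks use their device address only as a key into a host-side map. A minimal sketch of the two storage strategies (the Meta struct and the fake device address are illustrative):

#include <cassert>
#include <unordered_map>

struct Meta { int type; };
struct Block { Meta header; char payload[64]; };  // CPU layout: metadata in-place

int main() {
  // CPU: metadata is readable straight through the block pointer.
  Block cpu_block{{1}, {}};
  assert(reinterpret_cast<Meta*>(&cpu_block)->type == 1);

  // GPU: the block address is only a key; the metadata stays host-side.
  std::unordered_map<const void*, Meta> gpu_cache;
  const void* device_ptr = reinterpret_cast<const void*>(0x7000);  // never dereferenced
  gpu_cache[device_ptr] = Meta{2};
  assert(gpu_cache.at(device_ptr).type == 2);
  return 0;
}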
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/memory/detail/memory_block.h"
#include "paddle/memory/detail/meta_data.h"
#include <unordered_map>
namespace paddle {
namespace memory {
namespace detail {
/**
* \brief A cache for accessing memory block meta-data that may be expensive
* to access directly.
*
* \note This class exists to unify the metadata format between GPU and CPU
* allocations. It should be removed when the CPU can access all GPU
* allocations directly via UVM.
*/
class MetadataCache {
public:
MetadataCache(bool uses_gpu);
public:
/*! \brief Load the associated metadata for the specified memory block. */
Metadata load(const MemoryBlock*);
/*! \brief Store the associated metadata for the specified memory block. */
void store(MemoryBlock*, const Metadata&);
/*! \brief Indicate that the specified metadata will no longer be used. */
void invalidate(MemoryBlock*);
public:
MetadataCache(const MetadataCache&) = delete;
MetadataCache& operator=(const MetadataCache&) = delete;
private:
bool uses_gpu_;
private:
typedef std::unordered_map<const MemoryBlock*, Metadata> MetadataMap;
private:
MetadataMap cache_;
};
} // namespace detail
} // namespace memory
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/memory/detail/meta_data.h"
#include <functional>
namespace paddle {
namespace memory {
namespace detail {
Metadata::Metadata(MemoryBlock::Type t, size_t i, size_t s, size_t ts,
MemoryBlock* l, MemoryBlock* r)
: type(t),
index(i),
size(s),
total_size(ts),
left_buddy(l),
right_buddy(r) {}
Metadata::Metadata()
: type(MemoryBlock::INVALID_CHUNK),
index(0),
size(0),
total_size(0),
left_buddy(nullptr),
right_buddy(nullptr) {}
template <class T>
inline void hash_combine(std::size_t& seed, const T& v) {
std::hash<T> hasher;
seed ^= hasher(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
}
inline size_t hash(const Metadata* metadata, size_t initial_seed) {
size_t seed = initial_seed;
hash_combine(seed, (size_t)metadata->type);
hash_combine(seed, metadata->index);
hash_combine(seed, metadata->size);
hash_combine(seed, metadata->total_size);
hash_combine(seed, metadata->left_buddy);
hash_combine(seed, metadata->right_buddy);
return seed;
}
void Metadata::update_guards() {
guard_begin = hash(this, 1);
guard_end = hash(this, 2);
}
bool Metadata::check_guards() const {
return guard_begin == hash(this, 1) && guard_end == hash(this, 2);
}
} // namespace detail
} // namespace memory
} // namespace paddle
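update_guards()/check_guards() make the metadata record its own checksum: two seeded hashes over every field bracket the record, so any field that changes after update_guards() makes the recomputed hashes disagree with the stored guards. A compressed sketch of the same idea, with a single field and a single guard:

#include <cassert>
#include <cstddef>
#include <functional>

struct Rec {
  std::size_t size = 0;
  std::size_t guard = 0;
  std::size_t hash() const { return std::hash<std::size_t>()(size) ^ 0x9e3779b9; }
  void update_guard() { guard = hash(); }
  bool check_guard() const { return guard == hash(); }
};

int main() {
  Rec r;
  r.size = 4096;
  r.update_guard();
  assert(r.check_guard());
  r.size = 8192;  // simulate corruption after the guard was written
  assert(!r.check_guard());
  return 0;
}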
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/memory/detail/memory_block.h"
#include <stddef.h>
namespace paddle {
namespace memory {
namespace detail {
class Metadata {
public:
Metadata(MemoryBlock::Type t, size_t i, size_t s, size_t ts, MemoryBlock* l,
MemoryBlock* r);
Metadata();
public:
/*! \brief Update the guards when metadata is changed */
void update_guards();
/*! \brief Check consistency to previous modification */
bool check_guards() const;
public:
// TODO(gangliao): compress this
// clang-format off
size_t guard_begin = 0;
MemoryBlock::Type type = MemoryBlock::INVALID_CHUNK;
size_t index = 0;
size_t size = 0;
size_t total_size = 0;
MemoryBlock* left_buddy = nullptr;
MemoryBlock* right_buddy = nullptr;
size_t guard_end = 0;
// clang-format on
};
} // namespace detail
} // namespace memory
} // namespace paddle
......@@ -13,76 +13,128 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/memory/detail/system_allocator.h"
#include "paddle/platform/assert.h"
#include "paddle/platform/error.h"
#include "paddle/platform/gpu_info.h"
#include <stdlib.h> // for malloc and free
#include <sys/mman.h> // for mlock and munlock
#include "gflags/gflags.h"
#include "paddle/platform/assert.h"
#include "paddle/platform/cuda.h"
// If use_pinned_memory is true, CPUAllocator calls mlock, which
// returns pinned and locked memory as staging areas for data exchange
// between host and device. Allocating too much pinned memory would
// reduce the amount of memory available to the system for paging, so
// use_pinned_memory defaults to false.
DEFINE_bool(use_pinned_memory, false,
"If set, allocate cpu/gpu pinned memory.");
DEFINE_bool(use_pinned_memory, false, "If set, allocate cpu pinned memory.");
namespace paddle {
namespace memory {
namespace detail {
void* CPUAllocator::Alloc(size_t size) {
void* CPUAllocator::Alloc(size_t& index, size_t size) {
// According to http://www.cplusplus.com/reference/cstdlib/malloc/,
// malloc might not return nullptr if size is zero, but the returned
// pointer shall not be dereferenced -- so we make it nullptr.
if (size <= 0) return nullptr;
index = 0; // unpinned memory by default
void* p = malloc(size);
if (p != nullptr && FLAGS_use_pinned_memory) {
mlock(p, size);
if (p != nullptr) {
if (FLAGS_use_pinned_memory) {
index = 1;
mlock(p, size); // lock memory
}
}
return p;
}
void CPUAllocator::Free(void* p, size_t size) {
if (p != nullptr && FLAGS_use_pinned_memory) {
void CPUAllocator::Free(void* p, size_t size, size_t index) {
if (p != nullptr && index == 1) {
munlock(p, size);
}
free(p);
}
bool CPUAllocator::UseGpu() const { return false; }
#ifndef PADDLE_ONLY_CPU
void* GPUAllocator::Alloc(size_t size) {
void* GPUAllocator::Alloc(size_t& index, size_t size) {
// CUDA documentation doesn't explain if cudaMalloc returns nullptr
// if size is 0. We just make sure it does.
if (size <= 0) {
return nullptr;
}
if (size <= 0) return nullptr;
size_t available = 0;
size_t capacity = 0;
paddle::platform::GpuMemoryUsage(available, capacity);
// Reserve memory for page tables, etc.
size_t reserving = capacity - paddle::platform::GpuMaxAllocSize();
size_t usable = available > reserving ? available - reserving : 0;
// If the remaining usable size is at least the requested size, use
// plain cudaMalloc to allocate GPU memory.
void* p = nullptr;
cudaError_t result =
FLAGS_use_pinned_memory ? cudaMallocHost(&p, size) : cudaMalloc(&p, size);
if (result != cudaSuccess) {
cudaGetLastError(); // clear error if there is any.
if (size <= usable) {
cudaError_t result = cudaMalloc(&p, size);
if (result == cudaSuccess) {
index = 0;
gpu_alloc_size_ += size;
return p;
}
}
// If the remaining size is less than the requested size, or cudaMalloc
// failed, cudaMallocHost is used as a fallback allocator.
//
// NOTE: here, we use GpuMaxAllocSize() as the maximum size of the
// host fallback allocation. Allocating too much would reduce the
// amount of memory available to the underlying system for paging.
usable = paddle::platform::GpuMaxAllocSize() - fallback_alloc_size_;
if (size > usable) return nullptr;
cudaError_t result = cudaMallocHost(&p, size);
if (result == cudaSuccess) {
index = 1;
fallback_alloc_size_ += size;
return p;
}
return result == cudaSuccess ? p : nullptr;
return nullptr;
}
void GPUAllocator::Free(void* p, size_t size) {
void GPUAllocator::Free(void* p, size_t size, size_t index) {
cudaError_t err;
if (index == 0) {
PADDLE_ASSERT(gpu_alloc_size_ >= size);
gpu_alloc_size_ -= size;
err = cudaFree(p);
} else {
PADDLE_ASSERT(fallback_alloc_size_ >= size);
fallback_alloc_size_ -= size;
err = cudaFreeHost(p);
}
// Purposefully allow cudaErrorCudartUnloading, because
// that is returned if you ever call cudaFree after the
// driver has already shut down. This happens only if the
// process is terminating, in which case we don't care if
// cudaFree succeeds.
cudaError_t err = FLAGS_use_pinned_memory ? cudaFreeHost(p) : cudaFree(p);
if (err != cudaErrorCudartUnloading) {
platform::throw_on_error(err, "cudaFree{Host} failed");
platform::throw_on_error(err,
"cudaFree{Host} failed in GPUAllocator::Free.");
}
}
bool GPUAllocator::UseGpu() const { return true; }
#endif // PADDLE_ONLY_CPU
} // namespace detail
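GPUAllocator::Alloc is a two-tier policy: device memory first (index 0), pinned host memory as a fallback (index 1), each capped by its own budget. A sketch of just the decision cascade, with the CUDA calls replaced by stubbed budget checks so it runs anywhere:

#include <cstddef>
#include <cstdio>

// Stand-ins for cudaMalloc / cudaMallocHost succeeding within budget.
static bool device_fits(std::size_t size, std::size_t device_usable) {
  return size <= device_usable;
}
static bool host_fits(std::size_t size, std::size_t host_budget) {
  return size <= host_budget;
}

// Returns the chosen allocator index (0 = device, 1 = pinned host, -1 = failure).
static int choose_tier(std::size_t size, std::size_t device_usable,
                       std::size_t host_budget) {
  if (device_fits(size, device_usable)) return 0;
  if (host_fits(size, host_budget)) return 1;
  return -1;
}

int main() {
  std::printf("%d\n", choose_tier(1u << 20, 1u << 30, 1u << 28));  // 0: fits on device
  std::printf("%d\n", choose_tier(1u << 29, 1u << 20, 1u << 30));  // 1: host fallback
  std::printf("%d\n", choose_tier(1u << 30, 1u << 20, 1u << 20));  // -1: both exhausted
  return 0;
}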
......
......@@ -20,31 +20,36 @@ namespace paddle {
namespace memory {
namespace detail {
// SystemAllocator is the parent class of CPUAllocator and
// GPUAllocator. A BuddyAllocator object uses a SystemAllocator*
// pointing to the underlying system allocator. An alternative to
// this class hierarchy is to pass a system allocator class to
// BuddyAllocator as a template parameter. This approach makes
// BuddyAllocator a class template, whose very complicated
// algorithm would make buddy_allocator.h messy.
/**
* \brief SystemAllocator is the parent class of CPUAllocator and GPUAllocator.
* A BuddyAllocator object uses a SystemAllocator* pointing to the
* underlying system allocator.
*/
class SystemAllocator {
public:
virtual ~SystemAllocator() {}
virtual void* Alloc(size_t size) = 0;
virtual void Free(void* p, size_t size) = 0;
virtual void* Alloc(size_t& index, size_t size) = 0;
virtual void Free(void* p, size_t size, size_t index) = 0;
virtual bool UseGpu() const = 0;
};
class CPUAllocator : public SystemAllocator {
public:
virtual void* Alloc(size_t size);
virtual void Free(void* p, size_t size);
virtual void* Alloc(size_t& index, size_t size);
virtual void Free(void* p, size_t size, size_t index);
virtual bool UseGpu() const;
};
#ifndef PADDLE_ONLY_CPU
class GPUAllocator : public SystemAllocator {
public:
virtual void* Alloc(size_t size);
virtual void Free(void* p, size_t size);
virtual void* Alloc(size_t& index, size_t size);
virtual void Free(void* p, size_t size, size_t index);
virtual bool UseGpu() const;
private:
size_t gpu_alloc_size_ = 0;
size_t fallback_alloc_size_ = 0;
};
#endif // PADDLE_ONLY_CPU
......
......@@ -25,7 +25,8 @@ DECLARE_bool(use_pinned_memory);
void TestAllocator(paddle::memory::detail::SystemAllocator& a, size_t size) {
bool freed = false;
{
void* p = a.Alloc(size);
size_t index;
void* p = a.Alloc(index, size);
if (size > 0) {
EXPECT_NE(p, nullptr);
} else {
......@@ -35,7 +36,7 @@ void TestAllocator(paddle::memory::detail::SystemAllocator& a, size_t size) {
int* i = static_cast<int*>(p);
std::shared_ptr<int> ptr(i, [&](void* p) {
freed = true;
a.Free(p, size);
a.Free(p, size, index);
});
}
EXPECT_TRUE(freed);
......@@ -56,14 +57,7 @@ TEST(CPUAllocator, LockMem) {
}
#ifndef PADDLE_ONLY_CPU
TEST(GPUAllocator, NoStaging) {
FLAGS_use_pinned_memory = false;
paddle::memory::detail::GPUAllocator a;
TestAllocator(a, 2048);
TestAllocator(a, 0);
}
TEST(GPUAllocator, Staging) {
FLAGS_use_pinned_memory = true;
TEST(GPUAllocator, Alloc) {
paddle::memory::detail::GPUAllocator a;
TestAllocator(a, 2048);
TestAllocator(a, 0);
......
......@@ -17,43 +17,67 @@ limitations under the License. */
#include "paddle/memory/detail/system_allocator.h"
#include "paddle/platform/assert.h"
#include <boost/variant.hpp>
namespace paddle {
namespace memory {
void* Alloc(platform::Place pl, size_t size) {
#ifndef PADDLE_ONLY_CPU
if (paddle::platform::is_gpu_place(pl)) {
size_t gpu_id = boost::get<platform::GPUPlace>(pl).device;
return detail::GetGPUBuddyAllocator(gpu_id)->Alloc(size);
detail::BuddyAllocator* GetCPUBuddyAllocator() {
static detail::BuddyAllocator* a = nullptr;
if (a == nullptr) {
a = new detail::BuddyAllocator(new detail::CPUAllocator,
platform::CpuMinChunkSize(),
platform::CpuMaxChunkSize());
}
#endif // PADDLE_ONLY_CPU
PADDLE_ASSERT(paddle::platform::is_cpu_place(pl));
return detail::GetCPUBuddyAllocator()->Alloc(size);
return a;
}
void Free(paddle::platform::Place pl, void* p) {
#ifndef PADDLE_ONLY_CPU
if (paddle::platform::is_gpu_place(pl)) {
size_t gpu_id = boost::get<platform::GPUPlace>(pl).device;
detail::GetGPUBuddyAllocator(gpu_id)->Free(p);
}
#endif // PADDLE_ONLY_CPU
PADDLE_ASSERT(paddle::platform::is_cpu_place(pl));
detail::GetCPUBuddyAllocator()->Free(p);
template <>
void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size) {
return GetCPUBuddyAllocator()->Alloc(size);
}
template <>
void Free<platform::CPUPlace>(platform::CPUPlace place, void* p) {
GetCPUBuddyAllocator()->Free(p);
}
template <>
size_t Used<platform::CPUPlace>(platform::CPUPlace place) {
return GetCPUBuddyAllocator()->Used();
}
size_t Used(paddle::platform::Place pl) {
#ifndef PADDLE_ONLY_CPU
if (paddle::platform::is_gpu_place(pl)) {
size_t gpu_id = boost::get<platform::GPUPlace>(pl).device;
return detail::GetGPUBuddyAllocator(gpu_id)->Used();
detail::BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
static detail::BuddyAllocator** as = NULL;
if (as == NULL) {
int gpu_num = platform::GetDeviceCount();
as = new detail::BuddyAllocator*[gpu_num];
for (int gpu = 0; gpu < gpu_num; gpu++) {
platform::SetDeviceId(gpu);
as[gpu] = new detail::BuddyAllocator(new detail::GPUAllocator,
platform::GpuMinChunkSize(),
platform::GpuMaxChunkSize());
}
}
#endif // PADDLE_ONLY_CPU
PADDLE_ASSERT(paddle::platform::is_cpu_place(pl));
return detail::GetCPUBuddyAllocator()->Used();
return as[gpu_id];
}
template <>
void* Alloc<platform::GPUPlace>(platform::GPUPlace place, size_t size) {
return GetGPUBuddyAllocator(place.device)->Alloc(size);
}
template <>
void Free<platform::GPUPlace>(platform::GPUPlace place, void* p) {
GetGPUBuddyAllocator(place.device)->Free(p);
}
template <>
size_t Used<platform::GPUPlace>(platform::GPUPlace place) {
return GetGPUBuddyAllocator(place.device)->Used();
}
#endif // PADDLE_ONLY_CPU
} // namespace memory
} // namespace paddle
......@@ -19,9 +19,14 @@ limitations under the License. */
namespace paddle {
namespace memory {
void* Alloc(paddle::platform::Place, size_t);
void Free(paddle::platform::Place, void*);
size_t Used(paddle::platform::Place);
template <class Place>
void* Alloc(Place, size_t);
template <class Place>
void Free(Place, void*);
template <class Place>
size_t Used(Place);
} // namespace memory
} // namespace paddle
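memory.h only declares the function templates; memory.cc supplies explicit specializations per place type, so the place argument selects an allocator at compile time and an unsupported place fails at link time rather than at run time. A minimal single-file sketch of that declare/specialize split, with hypothetical names:

#include <cstddef>
#include <cstdlib>

struct CPUPlace {};
struct GPUPlace { int device = 0; };

// Declared but never defined: only explicit specializations are usable,
// mirroring the memory.h / memory.cc split.
template <class Place>
void* Alloc(Place, std::size_t);

template <>
void* Alloc<CPUPlace>(CPUPlace, std::size_t size) { return std::malloc(size); }

int main() {
  void* p = Alloc(CPUPlace{}, 64);  // resolves to the CPU specialization
  // Alloc(GPUPlace{}, 64);         // would fail to link: no specialization defined
  std::free(p);
  return 0;
}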
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/memory/memory.h"
#include "paddle/memory/detail/memory_block.h"
#include "paddle/memory/detail/meta_data.h"
#include "paddle/platform/cpu_info.h"
#include "paddle/platform/gpu_info.h"
#include "paddle/platform/place.h"
#include <gtest/gtest.h>
#include <unordered_map>
inline bool is_aligned(void const *p) {
return 0 == (reinterpret_cast<uintptr_t>(p) & 0x3);
}
size_t align(size_t size, paddle::platform::CPUPlace place) {
size += sizeof(paddle::memory::detail::Metadata);
size_t alignment = paddle::platform::CpuMinChunkSize();
size_t remaining = size % alignment;
return remaining == 0 ? size : size + (alignment - remaining);
}
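For example, assuming sizeof(paddle::memory::detail::Metadata) is 64 bytes and CpuMinChunkSize() returns 4096 (both values illustrative): align(4096, cpu) first grows the request to 4160, and 4160 % 4096 = 64, so the result is 4160 + (4096 - 64) = 8192. That is, every request is rounded up to whole minimum-size chunks, metadata header included.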
TEST(BuddyAllocator, CPUAllocation) {
void *p = nullptr;
EXPECT_EQ(p, nullptr);
paddle::platform::CPUPlace cpu;
p = paddle::memory::Alloc(cpu, 4096);
EXPECT_NE(p, nullptr);
paddle::memory::Free(cpu, p);
}
TEST(BuddyAllocator, CPUMultAlloc) {
paddle::platform::CPUPlace cpu;
std::unordered_map<void *, size_t> ps;
size_t total_size = paddle::memory::Used(cpu);
EXPECT_EQ(total_size, 0UL);
for (auto size :
{128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) {
ps[paddle::memory::Alloc(cpu, size)] = size;
// The buddy allocator doesn't manage overly large memory chunks
if (paddle::memory::Used(cpu) == total_size) continue;
size_t aligned_size = align(size, cpu);
total_size += aligned_size;
EXPECT_EQ(total_size, paddle::memory::Used(cpu));
}
for (auto p : ps) {
EXPECT_EQ(is_aligned(p.first), true);
paddle::memory::Free(cpu, p.first);
// The buddy allocator doesn't manage overly large memory chunks
if (paddle::memory::Used(cpu) == total_size) continue;
size_t aligned_size = align(p.second, cpu);
total_size -= aligned_size;
EXPECT_EQ(total_size, paddle::memory::Used(cpu));
}
}
#ifndef PADDLE_ONLY_CPU
size_t align(size_t size, paddle::platform::GPUPlace place) {
size += sizeof(paddle::memory::detail::Metadata);
size_t alignment = paddle::platform::GpuMinChunkSize();
size_t remaining = size % alignment;
return remaining == 0 ? size : size + (alignment - remaining);
}
TEST(BuddyAllocator, GPUAllocation) {
void *p = nullptr;
EXPECT_EQ(p, nullptr);
paddle::platform::GPUPlace gpu(0);
p = paddle::memory::Alloc(gpu, 4096);
EXPECT_NE(p, nullptr);
paddle::memory::Free(gpu, p);
}
TEST(BuddyAllocator, GPUMultAlloc) {
paddle::platform::GPUPlace gpu;
std::unordered_map<void *, size_t> ps;
size_t total_size = paddle::memory::Used(gpu);
EXPECT_EQ(total_size, 0UL);
for (auto size :
{128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) {
ps[paddle::memory::Alloc(gpu, size)] = size;
// The buddy allocator doesn't manage overly large memory chunks
if (paddle::memory::Used(gpu) == total_size) continue;
size_t aligned_size = align(size, gpu);
total_size += aligned_size;
EXPECT_EQ(total_size, paddle::memory::Used(gpu));
}
for (auto p : ps) {
EXPECT_EQ(is_aligned(p.first), true);
paddle::memory::Free(gpu, p.first);
// The buddy allocator doesn't manage overly large memory chunks
if (paddle::memory::Used(gpu) == total_size) continue;
size_t aligned_size = align(p.second, gpu);
total_size -= aligned_size;
EXPECT_EQ(total_size, paddle::memory::Used(gpu));
}
}
#endif // PADDLE_ONLY_CPU
function(op_library TARGET)
# op_library is a function to create an op library. The interface is the same
# as cc_library, but it handles splitting GPU/CPU code and links some common
# libraries for ops.
set(cc_srcs)
set(cu_srcs)
set(op_common_deps operator op_registry)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(op_library "${options}" "${oneValueArgs}"
"${multiValueArgs}" ${ARGN})
foreach(src ${op_library_SRCS})
if (${src} MATCHES ".*\\.cu$")
list(APPEND cu_srcs ${src})
elseif(${src} MATCHES ".*\\.cc$")
list(APPEND cc_srcs ${src})
else()
message(FATAL_ERROR "${TARGET} Source file ${src} should only be .cc or .cu")
endif()
endforeach()
list(LENGTH cc_srcs cc_srcs_len)
if (${cc_srcs_len} EQUAL 0)
message(FATAL_ERROR "The op library ${TARGET} should contains at least one .cc file")
endif()
list(LENGTH cu_srcs cu_srcs_len)
if (${cu_srcs_len} EQUAL 0)
message(WARNING "The op library ${TARGET} not support GPU!")
endif()
if (WITH_GPU)
nv_library(${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS}
${op_common_deps})
else()
cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${op_library_DEPS}
${op_common_deps})
endif()
endfunction()
op_library(add_op SRCS add_op.cc add_op.cu)
cc_test(add_op_test SRCS add_op_test.cc DEPS add_op)
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <paddle/framework/op_registry.h>
#include <paddle/framework/tensor.h>
#include <paddle/operators/add_op.h>
namespace paddle {
namespace operators {
class AddOp : public framework::OperatorWithKernel {
protected:
void InferShape(
const std::vector<const framework::Tensor *> &inputs,
const std::vector<framework::Tensor *> &outputs) const override {
PADDLE_ENFORCE(inputs.size() == 2, "Input size of AddOp must be two");
PADDLE_ENFORCE(outputs.size() == 1, "Output size of AddOp must be one");
PADDLE_ENFORCE(
inputs[0] != nullptr && inputs[1] != nullptr && outputs[0] != nullptr,
"Inputs/Outputs of AddOp must all be set");
PADDLE_ENFORCE(inputs[0]->dims() == inputs[1]->dims(),
"The dimensions of AddOp's two inputs must be the same.");
// Need to set dims in Tensor:
// outputs[0]->set_dims(inputs[0]->dims())
}
};
class AddOpMaker : public framework::OpProtoAndCheckerMaker {
public:
AddOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
: framework::OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "The first input of add op");
AddInput("Y", "The second input of add op");
AddOutput("Out", "The output of add op");
AddComment(R"DOC(
Two Element Add Operator.
The equation is: Out = X + Y
)DOC");
}
};
} // namespace operators
} // namespace paddle
REGISTER_OP(add_two, paddle::operators::AddOp, paddle::operators::AddOpMaker);
REGISTER_OP_CPU_KERNEL(
add_two, ::paddle::operators::AddKernel<::paddle::platform::CPUPlace>);
#include <paddle/operators/add_op.h>
#include <paddle/framework/op_registry.h>
REGISTER_OP_GPU_KERNEL(add_two,
paddle::operators::AddKernel<paddle::platform::GPUPlace>);
\ No newline at end of file
cc_library(paddle_pybind SHARED SRCS pybind.cc DEPS pybind python)
cc_library(paddle_pybind SHARED SRCS pybind.cc DEPS pybind python add_op)