提交 ddafe5ce 编写于 作者: S Superjom

Merge branch 'develop' of github.com:PaddlePaddle/Paddle into rnn_varilen_design

......@@ -36,6 +36,8 @@ include(simd)
################################ Configurations #######################################
option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND})
option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND})
option(WITH_MKLDNN "Compile PaddlePaddle with mkl-dnn support." OFF)
option(WITH_MKLML "Compile PaddlePaddle with mklml package." OFF)
option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON)
option(WITH_TESTING "Compile PaddlePaddle with unit testing" ON)
option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON)
......@@ -74,6 +76,10 @@ if(ANDROID)
"Disable PYTHON when cross-compiling for Android" FORCE)
set(WITH_RDMA OFF CACHE STRING
"Disable RDMA when cross-compiling for Android" FORCE)
set(WITH_MKLDNN OFF CACHE STRING
"Disable MKLDNN when cross-compiling for Android" FORCE)
set(WITH_MKLML OFF CACHE STRING
"Disable MKLML package when cross-compiling for Android" FORCE)
endif(ANDROID)
set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
......@@ -87,6 +93,7 @@ endif()
########################################################################################
include(external/mklml) # download mklml package
include(external/zlib) # download, build, install zlib
include(external/gflags) # download, build, install gflags
include(external/glog) # download, build, install glog
......@@ -94,6 +101,7 @@ include(external/gtest) # download, build, install gtest
include(external/protobuf) # download, build, install protobuf
include(external/python) # download, build, install python
include(external/openblas) # download, build, install openblas
include(external/mkldnn) # download, build, install mkldnn
include(external/swig) # download, build, install swig
include(external/warpctc) # download, build, install warpctc
include(external/any) # download libn::any
......@@ -135,6 +143,10 @@ if(WITH_GPU)
endif(NOT WITH_DSO)
endif(WITH_GPU)
if(WITH_MKLDNN)
list(APPEND EXTERNAL_LIBS ${MKLDNN_LIBRARY} ${MKLDNN_IOMP_LIB})
endif()
if(USE_NNPACK)
include(external/nnpack)
list(APPEND EXTERNAL_LIBS ${NNPACK_LIBS})
......
......@@ -15,23 +15,44 @@
set(CBLAS_FOUND OFF)
## Find MKL First.
set(INTEL_ROOT "/opt/intel" CACHE PATH "Folder contains intel libs")
set(MKL_ROOT ${INTEL_ROOT}/mkl CACHE PATH "Folder contains MKL")
## Find MKLML First.
# When the MKLML package is enabled and its include dir / library are known,
# use it as the cblas + lapack provider and stop processing this file.
if(WITH_MKLML AND MKLML_INC_DIR AND MKLML_LIB)
  set(CBLAS_PROVIDER MKLML)
  set(CBLAS_INC_DIR ${MKLML_INC_DIR})
  set(CBLAS_LIBRARIES ${MKLML_LIB})
  set(CBLAS_FOUND ON)

  # Compile-time switches consumed by the C++ sources.
  add_definitions(-DPADDLE_USE_MKLML -DLAPACK_FOUND)

  message(STATUS "Found cblas and lapack in MKLML "
    "(include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")

  # A provider has been selected; skip the remaining detection logic.
  return()
endif()
## Then find MKL.
set(INTEL_MKL_ROOT "/opt/intel/mkl" CACHE PATH "Folder contains intel mkl libs")
set(MKL_ROOT $ENV{MKL_ROOT} CACHE PATH "Folder contains env MKL")
set(MKL_INCLUDE_SEARCH_PATHS
${MKL_ROOT}/include
${INTEL_MKL_ROOT}/include)
set(MKL_LIB_SEARCH_PATHS
${MKL_ROOT}/lib
${MKL_ROOT}/lib/intel64
${INTEL_MKL_ROOT}/lib
${INTEL_MKL_ROOT}/lib/intel64)
find_path(MKL_INC_DIR mkl.h PATHS
${MKL_ROOT}/include)
${MKL_INCLUDE_SEARCH_PATHS})
find_path(MKL_LAPACK_INC_DIR mkl_lapacke.h PATHS
${MKL_ROOT}/include)
${MKL_INCLUDE_SEARCH_PATHS})
find_library(MKL_CORE_LIB NAMES mkl_core PATHS
${MKL_ROOT}/lib
${MKL_ROOT}/lib/intel64)
${MKL_LIB_SEARCH_PATHS})
find_library(MKL_SEQUENTIAL_LIB NAMES mkl_sequential PATHS
${MKL_ROOT}/lib
${MKL_ROOT}/lib/intel64)
${MKL_LIB_SEARCH_PATHS})
find_library(MKL_INTEL_LP64 NAMES mkl_intel_lp64 PATHS
${MKL_ROOT}/lib
${MKL_ROOT}/lib/intel64)
${MKL_LIB_SEARCH_PATHS})
if(MKL_LAPACK_INC_DIR AND MKL_INC_DIR AND MKL_CORE_LIB AND MKL_SEQUENTIAL_LIB AND MKL_INTEL_LP64)
set(CBLAS_FOUND ON)
......
......@@ -67,6 +67,30 @@ else()
include_directories(${CUDA_TOOLKIT_INCLUDE})
endif(NOT WITH_GPU)
# Enable MKL-DNN support and pick the OpenMP runtime used with it.
#   * With MKLML available (MKLDNN_IOMP_DIR set), link against Intel OpenMP
#     (libiomp5) found in MKLDNN_IOMP_DIR.
#   * Otherwise fall back to whatever find_package(OpenMP) detects.
if(WITH_MKLDNN)
  add_definitions(-DPADDLE_USE_MKLDNN)
  if (WITH_MKLML AND MKLDNN_IOMP_DIR)
    message(STATUS "Enable Intel OpenMP at ${MKLDNN_IOMP_DIR}")
    set(OPENMP_FLAGS "-fopenmp")
    # NOTE(review): presumably this keeps -fopenmp out of the shared-library
    # link rule so that only the explicitly linked libiomp5 is used -- confirm.
    set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
    set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
    set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -L${MKLDNN_IOMP_DIR} -liomp5 -Wl,--as-needed")
    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -L${MKLDNN_IOMP_DIR} -liomp5 -Wl,--as-needed")
    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENMP_FLAGS}")
  else()
    find_package(OpenMP)
    if(OPENMP_FOUND)
      set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
    else()
      # message() joins its arguments without a separator, so the space
      # between the two sentences must be part of the first string.
      message(WARNING "Can not find OpenMP. "
        "Some performance features in MKLDNN may not be available")
    endif()
  endif()
endif(WITH_MKLDNN)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SIMD_FLAG}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SIMD_FLAG}")
......
......@@ -34,9 +34,15 @@ IF(WITH_TESTING)
"${GTEST_INSTALL_DIR}/lib/libgtest_main.a" CACHE FILEPATH "gtest main libraries." FORCE)
ENDIF(WIN32)
IF(WITH_MKLML)
# wait for mklml downloading completed
SET(GTEST_DEPENDS ${MKLML_PROJECT})
ENDIF()
ExternalProject_Add(
extern_gtest
${EXTERNAL_PROJECT_LOG_ARGS}
DEPENDS ${GTEST_DEPENDS}
GIT_REPOSITORY "https://github.com/google/googletest.git"
GIT_TAG "release-1.8.0"
PREFIX ${GTEST_SOURCES_DIR}
......
# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Downloads, builds and installs MKL-DNN as the external project
# "extern_mkldnn" and exposes it as the imported shared library target
# `mkldnn`. Does nothing unless WITH_MKLDNN is enabled.
#
# Variables set for the rest of the build:
#   MKLDNN_INCLUDE_DIR  (cache) - installed header directory
#   MKLDNN_LIBRARY      (cache) - path of libmkldnn.so
#   MKLDNN_IOMP_LIB / MKLDNN_IOMP_DIR - Intel OpenMP from MKLML, only when
#                                       MKLML is the cblas provider

# Test the variable by name: `${WITH_MKLDNN}` would expand to nothing (and
# break the condition) if the option were ever undefined.
IF(NOT WITH_MKLDNN)
  return()
ENDIF()

INCLUDE(ExternalProject)

SET(MKLDNN_PROJECT       "extern_mkldnn")
SET(MKLDNN_SOURCES_DIR   ${THIRD_PARTY_PATH}/mkldnn)

# Install below CMAKE_INSTALL_PREFIX only when HOME is /root; otherwise
# install below the current user's HOME.
SET(MKLDNN_INSTALL_ROOT  ${CMAKE_INSTALL_PREFIX})
IF(NOT "$ENV{HOME}" STREQUAL "/root")
  SET(MKLDNN_INSTALL_ROOT  "$ENV{HOME}")
ENDIF()

SET(MKLDNN_INSTALL_DIR   "${MKLDNN_INSTALL_ROOT}/opt/paddle/third_party/mkldnn")
SET(MKLDNN_INCLUDE_DIR   "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE)

IF(WIN32)
  # message() joins its arguments without a separator, so the space between
  # the two sentences must be part of the first string.
  MESSAGE(WARNING "It is not supported compiling with mkldnn in windows Paddle yet. "
    "Force WITH_MKLDNN=OFF")
  SET(WITH_MKLDNN OFF)
  return()
ELSE(WIN32)
  SET(MKLDNN_LIBRARY "${MKLDNN_INSTALL_DIR}/lib/libmkldnn.so" CACHE FILEPATH "mkldnn library." FORCE)
  MESSAGE(STATUS "Set ${MKLDNN_INSTALL_DIR}/lib to runtime path")
  SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
  #SET(CMAKE_MACOSX_RPATH 1) # hold for MacOS
  # list(APPEND) avoids the empty leading element that
  # SET(X "${X}" "new") produces when X is still empty.
  LIST(APPEND CMAKE_INSTALL_RPATH "${MKLDNN_INSTALL_DIR}/lib")
ENDIF(WIN32)

INCLUDE_DIRECTORIES(${MKLDNN_INCLUDE_DIR})

# Quote the expansion: CBLAS_PROVIDER may be empty when no cblas provider has
# been selected yet, and an unquoted empty expansion makes IF() a hard error.
IF("${CBLAS_PROVIDER}" STREQUAL "MKLML")
  # Build MKL-DNN against the downloaded MKLML package and reuse its
  # Intel OpenMP runtime.
  SET(MKLDNN_DEPENDS   ${MKLML_PROJECT})
  SET(MKLDNN_MKLROOT   ${MKLML_ROOT})
  SET(MKLDNN_IOMP_LIB  ${MKLML_IOMP_LIB})
  SET(MKLDNN_IOMP_DIR  ${MKLML_LIB_DIR})
ENDIF()

ExternalProject_Add(
  ${MKLDNN_PROJECT}
  ${EXTERNAL_PROJECT_LOG_ARGS}
  DEPENDS             ${MKLDNN_DEPENDS}
  GIT_REPOSITORY      "https://github.com/01org/mkl-dnn.git"
  GIT_TAG             "v0.9"
  PREFIX              ${MKLDNN_SOURCES_DIR}
  CONFIGURE_COMMAND   mkdir -p <SOURCE_DIR>/build
  BUILD_COMMAND       cd <SOURCE_DIR>/build
                      && cmake .. -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR} -DMKLROOT=${MKLDNN_MKLROOT}
                      && $(MAKE)
  INSTALL_COMMAND     cd <SOURCE_DIR>/build && $(MAKE) install
  UPDATE_COMMAND      ""
)

# Imported target wrapping the installed shared library; it depends on the
# external project so the library exists before anything links against it.
ADD_LIBRARY(mkldnn SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIBRARY})
ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT})
MESSAGE(STATUS "Mkldnn library: ${MKLDNN_LIBRARY}")
LIST(APPEND external_project_dependencies mkldnn)
# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Downloads the prebuilt MKLML package as the external project "extern_mklml",
# installs it, and exposes it as the imported shared library target `mklml`.
# Does nothing unless WITH_MKLML is enabled.
#
# Variables set for the rest of the build:
#   MKLML_ROOT, MKLML_INC_DIR, MKLML_LIB_DIR - installed package layout
#   MKLML_LIB       - path of libmklml_intel.so
#   MKLML_IOMP_LIB  - path of libiomp5.so

# Test the variable by name: `${WITH_MKLML}` would expand to nothing (and
# break the condition) if the option were ever undefined.
IF(NOT WITH_MKLML)
  return()
ENDIF()

INCLUDE(ExternalProject)

SET(MKLML_PROJECT       "extern_mklml")
SET(MKLML_VER           "mklml_lnx_2018.0.20170425")
SET(MKLML_URL           "https://github.com/01org/mkl-dnn/releases/download/v0.9/${MKLML_VER}.tgz")
SET(MKLML_SOURCE_DIR    "${THIRD_PARTY_PATH}/mklml")
SET(MKLML_DOWNLOAD_DIR  "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}")
SET(MKLML_DST_DIR       "opt/paddle/third_party/mklml")

# Install below CMAKE_INSTALL_PREFIX only when HOME is /root; otherwise
# install below the current user's HOME.
SET(MKLML_INSTALL_ROOT  "${CMAKE_INSTALL_PREFIX}")
IF(NOT "$ENV{HOME}" STREQUAL "/root")
  SET(MKLML_INSTALL_ROOT  "$ENV{HOME}")
ENDIF()

SET(MKLML_INSTALL_DIR   ${MKLML_INSTALL_ROOT}/${MKLML_DST_DIR})
SET(MKLML_ROOT          ${MKLML_INSTALL_DIR}/${MKLML_VER})
SET(MKLML_INC_DIR       ${MKLML_ROOT}/include)
SET(MKLML_LIB_DIR       ${MKLML_ROOT}/lib)
SET(MKLML_LIB           ${MKLML_LIB_DIR}/libmklml_intel.so)
SET(MKLML_IOMP_LIB      ${MKLML_LIB_DIR}/libiomp5.so)
# list(APPEND) avoids the empty leading element that
# SET(X "${X}" "new") produces when X is still empty.
LIST(APPEND CMAKE_INSTALL_RPATH "${MKLML_ROOT}/lib")

INCLUDE_DIRECTORIES(${MKLML_INC_DIR})

# The downloaded archive has no build system of its own. Generate a minimal
# CMakeLists.txt next to the extracted package whose only job is to install
# the package directory.
SET(mklml_cmakefile ${MKLML_DOWNLOAD_DIR}/CMakeLists.txt)
FILE(WRITE ${mklml_cmakefile} "PROJECT(MKLML)\n"
                              "cmake_minimum_required(VERSION 3.0)\n"
                              "install(DIRECTORY ${MKLML_VER}\n"
                              "        DESTINATION ${MKLML_DST_DIR})\n")

# NOTE(review): `wget --no-check-certificate` disables TLS certificate
# verification for the download, and no checksum is verified afterwards.
# Consider dropping the flag and/or pinning a hash of the archive.
ExternalProject_Add(
  ${MKLML_PROJECT}
  ${EXTERNAL_PROJECT_LOG_ARGS}
  PREFIX                ${MKLML_SOURCE_DIR}
  DOWNLOAD_DIR          ${MKLML_DOWNLOAD_DIR}
  DOWNLOAD_COMMAND      wget --no-check-certificate -O ${MKLML_DOWNLOAD_DIR}/${MKLML_VER}.tgz ${MKLML_URL}
                        && tar -xzf ${MKLML_DOWNLOAD_DIR}/${MKLML_VER}.tgz
  DOWNLOAD_NO_PROGRESS  1
  UPDATE_COMMAND        ""
  CMAKE_ARGS            -DCMAKE_INSTALL_PREFIX=${MKLML_INSTALL_ROOT}
  CMAKE_CACHE_ARGS      -DCMAKE_INSTALL_PREFIX:PATH=${MKLML_INSTALL_ROOT}
)

# Imported target wrapping the installed shared library; it depends on the
# external project so the library exists before anything links against it.
ADD_LIBRARY(mklml SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET mklml PROPERTY IMPORTED_LOCATION ${MKLML_LIB})
ADD_DEPENDENCIES(mklml ${MKLML_PROJECT})
LIST(APPEND external_project_dependencies mklml)
......@@ -124,6 +124,7 @@ set(GPU_COMMON_FLAGS
-Wno-error=literal-suffix
-Wno-error=unused-local-typedefs
-Wno-error=unused-function # Warnings in Numpy Header.
-Wno-error=array-bounds # Warnings in Eigen::array
)
if (APPLE)
......
......@@ -75,10 +75,11 @@ snapshot to a model will be a TODO for future.
### Trainer Election
One trainer will be elected as the one to save the model. When using
etcd, trainer ID is a randomly generated UUID, we will utilize etcd to
elect one trainer. When not using etcd, unique trainer IDs will be
given by the administrator, the trainer whose ID is "0" is elected to
save the model.
etcd, the trainer ID is a randomly generated UUID; the trainer contacts
the master server to request permission to save the model and learns
from the response whether it has been elected. When the master server
is not used, unique trainer IDs are assigned by the administrator, and
the trainer whose ID is "0" is elected to save the model.
### Model Save Path
......
......@@ -49,6 +49,7 @@ message AttrProto {
message VarProto {
required string name = 1;
required string comment = 2;
required bool is_tensor = 3;
};
message OpProto {
......
......@@ -311,3 +311,13 @@ Paddle二进制在运行时捕获了浮点数异常,只要出现浮点数异
* 训练数据有问题,导致参数收敛到了一些奇异的情况。或者输入数据尺度过大,有些特征的取值达到数百万,这时进行矩阵乘法运算就可能导致浮点数溢出。
主要的解决办法是减小学习率或者对数据进行归一化处理。
15. 编译安装后执行 import paddle.v2 as paddle 报ImportError: No module named v2
------------------------------------------------------------------------
先查看一下是否曾经安装过paddle v1版本,有的话需要先卸载:
pip uninstall py_paddle paddle
然后安装paddle的python环境, 在build目录下执行
pip install python/dist/paddle*.whl && pip install ../paddle/dist/py_paddle*.whl
......@@ -59,7 +59,11 @@ func main() {
cp, err = pserver.NewCheckpointFromFile(*checkpointPath, idx, e)
if err != nil {
log.Errorf("Fetch checkpoint failed, %s", err)
if err == pserver.ErrCheckpointNotFound {
log.Infof("Could not find the pserver checkpoint.")
} else {
log.Errorf("Fetch checkpoint failed, %s", err)
}
}
}
......
......@@ -22,6 +22,9 @@ package main
#define PADDLE_MASTER_OK 0
#define PADDLE_MASTER_ERROR -1
#define PADDLE_SAVE_MODEL_OK 1
#define PADDLE_SAVE_MODEL_SKIP 0
typedef int paddle_master_client;
*/
import "C"
......@@ -33,7 +36,6 @@ import (
"unsafe"
"github.com/PaddlePaddle/Paddle/go/master"
"github.com/coreos/etcd/clientv3"
log "github.com/sirupsen/logrus"
)
......@@ -65,32 +67,32 @@ func remove(client C.paddle_master_client) *master.Client {
}
//export paddle_new_etcd_master_client
//
// bufSize is the record buffer size.
func paddle_new_etcd_master_client(etcdEndpoints *C.char, timeout int, bufSize int) C.paddle_master_client {
p := C.GoString(etcdEndpoints)
cli, err := clientv3.New(clientv3.Config{
Endpoints: strings.Split(p, ","),
DialTimeout: time.Second * time.Duration(timeout),
})
endpoints := strings.Split(p, ",")
c, err := master.NewClient(
master.WithEtcd(endpoints, time.Duration(timeout)*time.Second),
master.WithBuffer(bufSize),
)
if err != nil {
panic(err)
}
ch := make(chan string, 1)
a, err := master.GetKey(cli, master.DefaultAddrPath, timeout)
if err != nil {
panic(err)
}
ch <- a
go master.WatchKey(cli, master.DefaultAddrPath, ch)
c := master.NewClient(ch, bufSize)
return add(c)
}
//export paddle_new_master_client
//
// bufSize is the record buffer size.
func paddle_new_master_client(addr *C.char, bufSize int) C.paddle_master_client {
a := C.GoString(addr)
ch := make(chan string, 1)
ch <- a
c := master.NewClient(ch, bufSize)
c, err := master.NewClient(master.WithAddr(a), master.WithBuffer(bufSize))
if err != nil {
panic(err)
}
return add(c)
}
......@@ -117,9 +119,10 @@ func paddle_set_dataset(client C.paddle_master_client, path **C.char, size C.int
return C.PADDLE_MASTER_OK
}
// return value:
// 0:ok
// -1:error
// paddle_next_record gets the next training record.
//
// returns number of bytes of the records if success, -1 if failed.
//
//export paddle_next_record
func paddle_next_record(client C.paddle_master_client, record **C.uchar) C.int {
c := get(client)
......@@ -143,6 +146,29 @@ func paddle_next_record(client C.paddle_master_client, record **C.uchar) C.int {
return C.int(size)
}
// paddle_request_save_model asks the master server whether the calling
// trainer may save the model. blockMS is converted to a duration in
// milliseconds and forwarded to the master together with trainerID.
//
// Return values:
//   PADDLE_SAVE_MODEL_OK (1)   the request is approved,
//   PADDLE_SAVE_MODEL_SKIP (0) the request is rejected because another
//                              trainer is saving the model,
//   PADDLE_MASTER_ERROR (-1)   an error happened (it is logged).
//
//export paddle_request_save_model
func paddle_request_save_model(client C.paddle_master_client, trainerID string, blockMS int) C.int {
	blockDur := time.Duration(blockMS) * time.Millisecond
	approved, err := get(client).RequestSaveModel(trainerID, blockDur)

	switch {
	case err != nil:
		log.Errorln(err)
		return C.PADDLE_MASTER_ERROR
	case approved:
		return C.PADDLE_SAVE_MODEL_OK
	default:
		return C.PADDLE_SAVE_MODEL_SKIP
	}
}
//export mem_free
func mem_free(p unsafe.Pointer) {
// "free" may be a better name for this function, but doing so
......
......@@ -16,17 +16,20 @@ package master
import (
"os"
"sync"
"time"
"github.com/PaddlePaddle/Paddle/go/connection"
"github.com/PaddlePaddle/recordio"
"github.com/coreos/etcd/clientv3"
log "github.com/sirupsen/logrus"
)
// Client is the client of the master server.
type Client struct {
conn *connection.Conn
ch chan record
conn *connection.Conn
ch chan record
initChOnce sync.Once
}
type record struct {
......@@ -34,24 +37,83 @@ type record struct {
err error
}
// NewClient creates a new Client.
// WithBuffer sets the client to buffer the training record.
//
// bufSize is the record buffer size. NextRecord will read from this
// buffer.
func NewClient(addrCh <-chan string, bufSize int) *Client {
func WithBuffer(bufSize int) func(*Client) error {
	return func(c *Client) error {
		// A non-positive size leaves the client untouched; NextRecord
		// sets up the record channel lazily in that case.
		if bufSize > 0 {
			// initChOnce guarantees the channel and the fetching
			// goroutine are created at most once per client.
			c.initChOnce.Do(func() {
				c.ch = make(chan record, bufSize)
				go c.getRecords()
			})
		}
		return nil
	}
}
// WithAddr returns a client option that makes the client use the fixed
// master address addr.
func WithAddr(addr string) func(c *Client) error {
	option := func(c *Client) error {
		// One-slot buffer so the address can be queued before the
		// monitoring goroutine starts receiving.
		addrCh := make(chan string, 1)
		addrCh <- addr
		go c.monitorMaster(addrCh)
		return nil
	}
	return option
}
// WithEtcd returns a client option that discovers the master address
// through etcd.
//
// endpoints are the etcd endpoints to dial. timeout is used both as the
// etcd dial timeout and as the timeout of the initial lookup of
// DefaultAddrPath.
//
// The option returns an error if the etcd client cannot be created or if
// the initial lookup fails. On success it starts one goroutine watching
// DefaultAddrPath and one goroutine feeding address updates to the client.
func WithEtcd(endpoints []string, timeout time.Duration) func(*Client) error {
	return func(c *Client) error {
		cli, err := clientv3.New(clientv3.Config{
			Endpoints:   endpoints,
			DialTimeout: timeout,
		})
		if err != nil {
			return err
		}

		a, err := GetKey(cli, DefaultAddrPath, timeout)
		if err != nil {
			// Nothing else holds the etcd client on this path, so
			// close it instead of leaking its connections. The
			// lookup error is the one worth reporting.
			_ = cli.Close()
			return err
		}

		ch := make(chan string, 1)
		if a != "" {
			// Master is registered, send to the master address
			// channel.
			ch <- a
		}

		go watchKey(cli, DefaultAddrPath, ch)
		go c.monitorMaster(ch)
		return nil
	}
}
// NewClient creates a new Client.
func NewClient(opts ...func(*Client) error) (*Client, error) {
c := &Client{}
c.conn = connection.New()
c.ch = make(chan record, bufSize)
go c.monitorMaster(addrCh)
go c.getRecords()
return c
for _, opt := range opts {
err := opt(c)
if err != nil {
return nil, err
}
}
return c, nil
}
func (c *Client) getRecords() {
for {
t, err := c.getTask()
if err != nil {
// getTask call.
log.Errorf("Get task failed, sleep 3 seconds and continue, %s", err)
time.Sleep(3 * time.Second)
continue
......@@ -146,6 +208,20 @@ func (c *Client) taskFailed(meta TaskMeta) error {
// NextRecord blocks until the next record is available and returns it
// together with the error attached to it. It is thread-safe.
func (c *Client) NextRecord() ([]byte, error) {
	// If WithBuffer did not already initialize the record channel, create
	// an unbuffered one and start fetching records now.
	c.initChOnce.Do(func() {
		c.ch = make(chan record)
		go c.getRecords()
	})

	rec := <-c.ch
	return rec.r, rec.err
}
// RequestSaveModel asks the master server, via the
// "Service.RequestSaveModel" RPC, to approve the caller to save the model.
//
// It returns the master's decision and the error of the RPC call, if any.
func (c *Client) RequestSaveModel(trainerID string, blockDur time.Duration) (bool, error) {
	req := SaveModelRequest{
		TrainerID: trainerID,
		BlockDur:  blockDur,
	}

	var approved bool
	err := c.conn.Call("Service.RequestSaveModel", req, &approved)
	return approved, err
}
......@@ -87,9 +87,11 @@ func TestNextRecord(t *testing.T) {
panic(err)
}
curAddr := make(chan string, 1)
curAddr <- fmt.Sprintf(":%d", p)
c := master.NewClient(curAddr, 10)
c, err := master.NewClient(master.WithAddr(fmt.Sprintf(":%d", p)), master.WithBuffer(10))
if err != nil {
panic(err)
}
err = c.SetDataset([]string{path})
if err != nil {
panic(err)
......
......@@ -158,8 +158,8 @@ func (e *EtcdClient) Load() ([]byte, error) {
}
// GetKey gets the value of the specified key.
func GetKey(c *clientv3.Client, key string, timeout int) (string, error) {
ctx, cancel := context.WithTimeout(context.Background(), time.Second*time.Duration(timeout))
func GetKey(c *clientv3.Client, key string, timeout time.Duration) (string, error) {
ctx, cancel := context.WithTimeout(context.Background(), timeout)
resp, err := c.Get(ctx, key)
cancel()
if err != nil {
......@@ -173,8 +173,8 @@ func GetKey(c *clientv3.Client, key string, timeout int) (string, error) {
return string(v), nil
}
// WatchKey watches the specify key and send to valChan if there is some event.
func WatchKey(c *clientv3.Client, key string, valChan chan<- string) {
// watchKey watches the specified key and sends to valChan when an event occurs.
func watchKey(c *clientv3.Client, key string, valChan chan<- string) {
rch := c.Watch(context.Background(), key)
for wresp := range rch {
for _, ev := range wresp.Events {
......
......@@ -78,9 +78,10 @@ type Service struct {
ready chan struct{}
store Store
mu sync.Mutex
initDone bool
taskQueues taskQueues
mu sync.Mutex
initDone bool
taskQueues taskQueues
savingTrainer string
}
func partition(chunks []Chunk, chunksPerTask int) []taskEntry {
......@@ -246,7 +247,7 @@ func readChunks(globPaths []string) ([]Chunk, error) {
//
// SetDataset can be call multiple times. But only the first call will
// be honored.
func (s *Service) SetDataset(globPaths []string, dummy *int) error {
func (s *Service) SetDataset(globPaths []string, _ *int) error {
if len(globPaths) == 0 {
return errors.New("no dataset specified")
}
......@@ -330,7 +331,7 @@ func (s *Service) logFields() log.Fields {
}
// GetTask gets a new task from the service.
func (s *Service) GetTask(dummy int, task *Task) error {
func (s *Service) GetTask(_ int, task *Task) error {
select {
case <-s.ready:
}
......@@ -380,7 +381,7 @@ func (s *Service) GetTask(dummy int, task *Task) error {
}
// TaskFinished tell the service that a task is finished.
func (s *Service) TaskFinished(taskID int, dummy *int) error {
func (s *Service) TaskFinished(taskID int, _ *int) error {
select {
case <-s.ready:
}
......@@ -415,7 +416,7 @@ func (s *Service) TaskFinished(taskID int, dummy *int) error {
}
// TaskFailed tells the service that a task is failed.
func (s *Service) TaskFailed(meta TaskMeta, dummy *int) error {
func (s *Service) TaskFailed(meta TaskMeta, _ *int) error {
select {
case <-s.ready:
}
......@@ -432,3 +433,42 @@ func (s *Service) TaskFailed(meta TaskMeta, dummy *int) error {
s.processFailedTask(t, meta.Epoch)
return nil
}
// SaveModelRequest is the argument of the Service.RequestSaveModel RPC,
// sent by a trainer that wants to save the model.
type SaveModelRequest struct {
// TrainerID identifies the requesting trainer. Service.RequestSaveModel
// rejects an empty ID with an error.
TrainerID string
// BlockDur is the delay after which Service.RequestSaveModel clears the
// recorded saving trainer again once this request has been approved.
BlockDur time.Duration
}
// RequestSaveModel requests the master server to approve the caller
// to save the model.
//
// *need is set to true when no trainer is currently recorded as saving the
// model, or when the recorded trainer is the caller itself; otherwise it is
// set to false. On approval the caller is recorded as the saving trainer
// and the record is cleared again after req.BlockDur.
//
// An error is returned (and *need is left untouched) when req.TrainerID is
// empty.
func (s *Service) RequestSaveModel(req SaveModelRequest, need *bool) error {
	// Validation reads no shared state, so do it before taking the lock.
	if req.TrainerID == "" {
		return errors.New("trainer id is empty")
	}

	s.mu.Lock()
	defer s.mu.Unlock()

	*need = s.savingTrainer == "" || s.savingTrainer == req.TrainerID
	if !*need {
		return nil
	}

	holder := req.TrainerID
	s.savingTrainer = holder
	time.AfterFunc(req.BlockDur, func() {
		s.mu.Lock()
		defer s.mu.Unlock()
		// Only release the reservation if it still belongs to the
		// trainer this timer was created for. A stale timer from an
		// earlier approval must not release a reservation that has
		// since been granted to a different trainer.
		//
		// NOTE(review): a repeated request by the same trainer still
		// leaves the earlier timer armed, so the reservation can end
		// before the latest BlockDur elapses. Fixing that needs the
		// timer (or a generation counter) stored on Service.
		if s.savingTrainer == holder {
			s.savingTrainer = ""
		}
	})

	return nil
}
......@@ -127,13 +127,19 @@ func paddle_pserver_client_release(client C.paddle_pserver_client) {
remove(client)
}
// paddle_begin_init_params tells trainer if it needs to init the
// parameters.
//
// returns 1 if the trainer needs to init the parameters. 0 if the
// trainer does not need to init the parameters.
//
//export paddle_begin_init_params
func paddle_begin_init_params(client C.paddle_pserver_client) C.int {
c := get(client)
if selected := c.BeginInitParams(); selected {
return 1
}
return C.PSERVER_OK
return 0
}
//export paddle_init_param
......@@ -256,17 +262,4 @@ func paddle_get_params(client C.paddle_pserver_client, dst **C.paddle_parameter,
return C.PSERVER_OK
}
//export paddle_save_model
func paddle_save_model(client C.paddle_pserver_client, path *C.char) C.int {
p := C.GoString(path)
c := get(client)
err := c.Save(p)
if err != nil {
log.Errorln(err)
return C.PSERVER_ERROR
}
return C.PSERVER_OK
}
func main() {} // Required but ignored
......@@ -111,9 +111,5 @@ retry:
getParams(c);
}
if (paddle_save_model(c, "/tmp/")) {
fail();
}
return 0;
}
......@@ -219,32 +219,6 @@ func (c *Client) GetParams(names []string) ([]pserver.Parameter, error) {
return ps, nil
}
// Save asks every parameter server to save its parameters to the given
// path by calling the "Service.Save" RPC on each of them in turn.
//
// All parameter servers are called even if one of them fails. The first
// error encountered is returned, or nil if every call succeeded. With no
// parameter servers configured, Save returns nil immediately.
func (c *Client) Save(path string) error {
	var firstErr error
	for _, p := range c.pservers {
		// Keep going after a failure so that every server still gets
		// the request; remember only the first error.
		if err := p.Call("Service.Save", path, nil); err != nil && firstErr == nil {
			firstErr = err
		}
	}

	// TODO(helin): there will be many files under path, need to
	// merge them into a single file.
	return firstErr
}
func strHash(s string) uint32 {
h := fnv.New32a()
_, _ = h.Write([]byte(s))
......
......@@ -36,6 +36,10 @@ import (
// ElementType is the type of elements of a Parameter.
type ElementType int
// ErrCheckpointNotFound indicates that the pserver checkpoint could
// not be found.
var ErrCheckpointNotFound = errors.New("checkpoint not found")
// RPC error message.
const (
AlreadyInitialized = "pserver already initialized"
......@@ -103,6 +107,10 @@ func NewCheckpointFromFile(cpPath string, idx int, e *EtcdClient) (Checkpoint, e
return nil, err
}
if len(v) == 0 {
return nil, ErrCheckpointNotFound
}
var cpMeta checkpointMeta
if err = json.Unmarshal(v, &cpMeta); err != nil {
return nil, err
......@@ -156,7 +164,7 @@ func NewService(idx int, interval time.Duration, path string, client *EtcdClient
}
// InitParam initializes a parameter.
func (s *Service) InitParam(paramWithConfigs ParameterWithConfig, dummy *int) error {
func (s *Service) InitParam(paramWithConfigs ParameterWithConfig, _ *int) error {
select {
case <-s.initialized:
return errors.New(AlreadyInitialized)
......@@ -177,7 +185,7 @@ func (s *Service) InitParam(paramWithConfigs ParameterWithConfig, dummy *int) er
// FinishInitParams tells the parameter server that the parameter
// initialization has finished.
func (s *Service) FinishInitParams(dummy0 int, dummy1 *int) error {
func (s *Service) FinishInitParams(_ int, _ *int) error {
select {
case <-s.initialized:
return errors.New(AlreadyInitialized)
......@@ -190,7 +198,7 @@ func (s *Service) FinishInitParams(dummy0 int, dummy1 *int) error {
// SendGrad sends gradient to parameter servers for parameter
// optimization.
func (s *Service) SendGrad(g Gradient, dummy *int) error {
func (s *Service) SendGrad(g Gradient, _ *int) error {
select {
case <-s.initialized:
default:
......
......@@ -330,7 +330,7 @@ __global__ void KeSequenceAvgForward(real* dst,
}
sum = mode == 1 ? sum :
(mode == 0 ? sum / seqLength : sum * my_rsqrt((real)seqLength));
dst[gid] = sum;
dst[gid] += sum;
}
}
......
......@@ -19,8 +19,10 @@ cc_test(op_desc_test SRCS op_desc_test.cc DEPS op_desc protobuf)
cc_library(operator SRCS operator.cc DEPS op_desc device_context tensor)
cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry)
cc_library(op_registry SRCS op_registry.cc DEPS op_proto op_desc)
cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry operator)
cc_library(grad_op_builder SRCS grad_op_builder.cc DEPS op_proto operator)
cc_library(op_registry SRCS op_registry.cc DEPS op_desc grad_op_builder)
cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
cc_test(grad_op_builder_test SRCS grad_op_builder_test.cc DEPS grad_op_builder op_registry add_op)
py_proto_compile(framework_py_proto SRCS attr_type.proto op_proto.proto op_desc.proto)
# Generate an empty __init__.py to make framework_py_proto as a valid python module.
......@@ -28,5 +30,6 @@ add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch
add_dependencies(framework_py_proto framework_py_proto_init)
proto_library(net_proto SRCS net_proto.proto DEPS op_proto)
# cc_library(net SRCS net.cc DEPS operator net_proto op_registry fc_op)
cc_library(net SRCS net.cc DEPS operator net_proto op_registry)
cc_test(net_op_test SRCS net_op_test.cc DEPS net)
cc_test(net_op_test SRCS net_op_test.cc DEPS net add_op mul_op sigmoid_op softmax_op fc_op)
......@@ -61,25 +61,24 @@ struct EigenTensor {
}
};
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
struct EigenMatrix : public EigenTensor<T, 2, MajorType, IndexType> {};
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
struct EigenVector : public EigenTensor<T, 1, MajorType, IndexType> {
// Flatten is to reshape a Tensor into a one dimension EigenVector
using Parent = EigenTensor<T, 1, MajorType, IndexType>;
static typename Parent::Type Flatten(Tensor& tensor) {
return Parent::From(tensor,
make_ddim({static_cast<int>(product(tensor.dims_))}));
// Flatten reshapes a Tensor into an EigenVector.
static typename EigenVector::Type Flatten(Tensor& tensor) {
return EigenVector::From(
tensor, make_ddim({static_cast<int>(product(tensor.dims_))}));
}
static typename Parent::ConstType Flatten(const Tensor& tensor) {
return Parent::From(tensor,
make_ddim({static_cast<int>(product(tensor.dims_))}));
static typename EigenVector::ConstType Flatten(const Tensor& tensor) {
return EigenVector::From(
tensor, make_ddim({static_cast<int>(product(tensor.dims_))}));
}
};
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenMatrix = EigenTensor<T, 2, MajorType, IndexType>;
} // namespace framework
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/framework/grad_op_builder.h"
#include "paddle/framework/op_registry.h"
namespace paddle {
namespace framework {
// Creates the gradient operator of op_.
//
// The gradient op type is looked up in OpRegistry::grad_ops() by the forward
// op's type, an instance is produced by the matching creator in
// OpRegistry::op_creators(), and its attributes, inputs and outputs are then
// filled in by CompleteGradOp(). Both lookups use at(), so an unregistered
// type throws std::out_of_range.
//
// NOTE(review): the result is returned as a raw pointer; presumably the
// caller takes ownership -- confirm against OpRegistry::CreateGradOp.
OperatorBase* GradOpBuilder::Build() {
  BuildOpInOutArgList();

  const std::string grad_type = OpRegistry::grad_ops().at(op_->type_);
  OperatorBase* result = OpRegistry::op_creators().at(grad_type)();
  result->type_ = grad_type;

  CompleteGradOp(result);
  return result;
}
// Describes one declared input/output `var` of the forward op.
//
// var_map maps the variable's proto name to its position. When `format` is
// empty the variable covers exactly one name, [pos, pos + 1). Otherwise the
// covered range is [format[pos], format[pos + 1]).
//
// Returns a newly allocated OpInOutArg; the caller is responsible for it.
OpInOutArg* GradOpBuilder::BuildArg(const VarProto& var,
                                    const VarIndexMap& var_map,
                                    const std::vector<int>& format,
                                    InOutType type) {
  const int pos = var_map.at(var.name());

  int first = pos;
  int last = pos + 1;
  if (!format.empty()) {
    first = format.at(pos);
    last = format.at(pos + 1);
  }

  // A variable is passed on to the gradient op unless the proto marks it
  // with ignore_gradient.
  const bool used_by_grad = !var.ignore_gradient();
  return new OpInOutArg(var.name(), type, used_by_grad, first, last);
}
// Rebuilds arg_list_ with one OpInOutArg per input and per output declared
// in the forward op's OpProto, inputs first, in declaration order.
//
// The "input_format" / "output_format" attributes of op_ are used when
// present; otherwise an empty format is passed to BuildArg(), which then
// treats every variable as covering a single name.
void GradOpBuilder::BuildOpInOutArgList() {
  // Start from an empty list so that calling Build() more than once on the
  // same builder does not append every argument a second time.
  arg_list_.clear();

  const OpProto& op_proto = OpRegistry::protos().at(op_->type_);
  const auto& var_map = *(OpRegistry::VarIndexMaps().at(op_->type_));
  const std::vector<int>& in_format =
      op_->attrs_.count("input_format")
          ? op_->GetAttr<std::vector<int>>("input_format")
          : std::vector<int>();
  const std::vector<int>& out_format =
      op_->attrs_.count("output_format")
          ? op_->GetAttr<std::vector<int>>("output_format")
          : std::vector<int>();
  for (const auto& var : op_proto.inputs()) {
    arg_list_.emplace_back(
        std::shared_ptr<OpInOutArg>(BuildArg(var, var_map, in_format, IN)));
  }
  for (const auto& var : op_proto.outputs()) {
    arg_list_.emplace_back(
        std::shared_ptr<OpInOutArg>(BuildArg(var, var_map, out_format, OUT)));
  }
}
// Appends the variable names covered by `arg` to `in_out` (an input or
// output name list of the gradient op).
//
//   arg     - which names of op_ to copy: [begin_idx_, end_idx_) of op_'s
//             inputs_ (type_ == IN) or outputs_ (type_ == OUT).
//   in_out  - destination name list.
//   format  - receives the size of in_out after the append.
//   varmap  - receives the mapping proto name -> idx for this argument.
//   idx     - position counter; used for the mapping and then incremented.
//   is_grad - when true, OperatorBase::GRAD_VAR_SUFFIX() is appended to the
//             proto name and to every copied variable name.
void GradOpBuilder::AddArgIntoGradOp(const OpInOutArg* arg,
                                     std::vector<std::string>& in_out,
                                     std::vector<int>& format,
                                     VarIndexMap* varmap, int& idx,
                                     bool is_grad) const {
  // Empty unless gradient names are requested.
  std::string suffix;
  if (is_grad) {
    suffix = OperatorBase::GRAD_VAR_SUFFIX();
  }

  (*varmap)[arg->proto_name_ + suffix] = idx++;

  const auto& source = (arg->type_ == IN) ? op_->inputs_ : op_->outputs_;
  for (size_t i = arg->begin_idx_; i < arg->end_idx_; ++i) {
    in_out.push_back(source[i] + suffix);
  }

  format.push_back(in_out.size());
}
// Fills in the attributes, inputs, outputs and name-index map of grad_op
// from the forward op op_ and the previously built arg_list_.
//
// Inputs of the gradient op, in order of arg_list_:
//   * every forward input/output that is needed in the gradient, and
//   * the gradient of every forward output.
// Outputs of the gradient op:
//   * the gradient of every forward input.
//
// The forward op's attributes are copied, except that "input_format" and
// "output_format" are replaced by formats describing the new lists.
void GradOpBuilder::CompleteGradOp(OperatorBase* grad_op) const {
  grad_op->attrs_ = op_->attrs_;
  grad_op->attrs_.erase("input_format");
  grad_op->attrs_.erase("output_format");

  VarIndexMap* name_to_idx = new VarIndexMap();
  int next_in = 0;
  int next_out = 0;
  std::vector<int> grad_in_format({0});
  std::vector<int> grad_out_format({0});

  for (const auto& entry : arg_list_) {
    const OpInOutArg* arg = entry.get();

    // The forward variable itself, when the gradient op needs it.
    if (arg->needed_in_grad_) {
      AddArgIntoGradOp(arg, grad_op->inputs_, grad_in_format, name_to_idx,
                       next_in, false);
    }

    if (arg->type_ == OUT) {
      // Gradient of a forward output is an input of the gradient op.
      AddArgIntoGradOp(arg, grad_op->inputs_, grad_in_format, name_to_idx,
                       next_in, true);
    } else {
      // Gradient of a forward input is an output of the gradient op.
      AddArgIntoGradOp(arg, grad_op->outputs_, grad_out_format, name_to_idx,
                       next_out, true);
    }
  }

  grad_op->attrs_["input_format"] = grad_in_format;
  grad_op->attrs_["output_format"] = grad_out_format;
  // Ownership of the map passes to grad_op here.
  grad_op->in_out_idxs_.reset(name_to_idx);
}
} // namespace framework
} // namespace paddle
#pragma once
#include "paddle/framework/op_proto.pb.h"
#include "paddle/framework/operator.h"
namespace paddle {
namespace framework {
class OpRegistry;
enum InOutType { IN, OUT };
// Describes one argument (an input or an output) of a forward operator, as
// consumed by GradOpBuilder when it assembles the gradient operator.
struct OpInOutArg {
OpInOutArg(const std::string& proto_name, const InOutType& type,
bool needed_in_grad, size_t begin_idx, size_t end_idx)
: proto_name_(proto_name),
type_(type),
needed_in_grad_(needed_in_grad),
begin_idx_(begin_idx),
end_idx_(end_idx) {}
// Argument name; used as the key in the grad op's name -> index map.
std::string proto_name_;
// IN: the variables are taken from the forward op's inputs_;
// OUT: from its outputs_.
InOutType type_;
// When true, the argument itself (not only its gradient) is added to the
// grad op's inputs.
bool needed_in_grad_;
// The argument's variables occupy [begin_idx_, end_idx_) of the forward
// op's inputs_ / outputs_.
size_t begin_idx_;
size_t end_idx_;
};
// Builds the gradient operator of a given forward operator.
class GradOpBuilder {
// Maps an argument name to its slot index.
using VarIndexMap = std::unordered_map<std::string, int>;
public:
// `op` is stored as a raw pointer and is not owned; it must stay alive
// while the builder is in use.
GradOpBuilder(const OperatorBase* op) : op_(op) {}
// Returns the newly built gradient operator as a raw pointer.
// NOTE(review): the definition is not visible here; the caller in
// OpRegistry::CreateGradOp wraps the result in a shared_ptr.
OperatorBase* Build();
private:
// Creates the OpInOutArg describing `var`.
// NOTE(review): definition not visible here; the returned pointer is wrapped
// in a shared_ptr by BuildOpInOutArgList.
OpInOutArg* BuildArg(const VarProto& var, const VarIndexMap& var_map,
const std::vector<int>& format, InOutType type);
// Fills arg_list_ with one entry per input and per output of the forward op.
void BuildOpInOutArgList();
// Appends the variables of `arg` (optionally with the gradient suffix) to
// `in_out`, and updates `format`, `varmap` and `idx` accordingly.
void AddArgIntoGradOp(const OpInOutArg* arg, std::vector<std::string>& in_out,
std::vector<int>& format, VarIndexMap* varmap, int& idx,
bool is_grad) const;
// Populates attributes, inputs, outputs and the index map of `grad_op`.
void CompleteGradOp(OperatorBase* grad_op) const;
// The forward operator (not owned).
const OperatorBase* op_;
// Arguments of the forward operator: inputs first, then outputs.
std::vector<std::shared_ptr<OpInOutArg>> arg_list_;
};
} // namespace framework
} // namespace paddle
#include "paddle/framework/grad_op_builder.h"
#include <gtest/gtest.h>
#include "paddle/framework/op_registry.h"
#include "paddle/framework/operator.h"
USE_OP(add_two);
namespace paddle {
namespace framework {
// Builds the gradient operator of "add_two" and verifies its argument layout:
// inputs are X, Y, Out and Out@GRAD; outputs are X@GRAD and Y@GRAD.
TEST(GradOpBuilder, AddTwo) {
  std::shared_ptr<OperatorBase> forward_op =
      OpRegistry::CreateOp("add_two", {"x", "y"}, {"out"}, {});
  std::shared_ptr<OperatorBase> backward_op =
      OpRegistry::CreateGradOp(forward_op);

  // Argument counts.
  EXPECT_EQ(static_cast<int>(backward_op->inputs_.size()), 4);
  EXPECT_EQ(static_cast<int>(backward_op->outputs_.size()), 2);

  // Forward variables are passed through unchanged.
  EXPECT_EQ(backward_op->Input("X"), "x");
  EXPECT_EQ(backward_op->Input("Y"), "y");
  EXPECT_EQ(backward_op->Input("Out"), "out");

  // Gradient variables carry the "@GRAD" suffix.
  EXPECT_EQ(backward_op->Input("Out@GRAD"), "out@GRAD");
  EXPECT_EQ(backward_op->Output("X@GRAD"), "x@GRAD");
  EXPECT_EQ(backward_op->Output("Y@GRAD"), "y@GRAD");
}
} // namespace framework
} // namespace paddle
\ No newline at end of file
......@@ -15,14 +15,24 @@
*/
#include "paddle/framework/net.h"
#include "paddle/framework/op_registry.h"
namespace paddle {
namespace framework {
// Creates a new PlainNet containing the gradient operator of every operator
// in `ForwardOps`, then seals it with CompleteAddOp().
//
// NOTE(review): gradient ops are appended in the same order as the forward
// ops; confirm whether a reversed order is intended for back-propagation.
std::shared_ptr<PlainNet> AddBackwardOp(std::shared_ptr<PlainNet> ForwardOps) {
  auto backward_net = std::make_shared<PlainNet>();
  for (auto& forward_op : ForwardOps->ops_) {
    backward_net->AddOp(OpRegistry::CreateGradOp(forward_op));
  }
  backward_net->CompleteAddOp();
  return backward_net;
}
void PlainNet::CompleteAddOp(bool calc) {
add_op_done_ = true;
if (!calc) return;
std::unordered_set<std::string> input_set;
std::unordered_set<std::string> output_set;
std::unordered_set<std::string> temp_output;
......@@ -39,19 +49,22 @@ void PlainNet::CompleteAddOp(bool calc) {
output_set.insert(opt);
}
}
inputs_.reserve(input_set.size());
std::copy(input_set.begin(), input_set.end(), std::back_inserter(inputs_));
std::sort(inputs_.begin(), inputs_.end());
outputs_.reserve(output_set.size());
std::copy(output_set.begin(), output_set.end(), std::back_inserter(outputs_));
std::sort(outputs_.begin(), outputs_.end());
std::vector<int> tmp_index;
tmp_index.reserve(temp_output.size());
int idx = 0;
for (auto& opt : output_set) {
if (Contains(temp_output, opt)) {
tmp_index.push_back(idx);
int output_len = static_cast<int>(outputs_.size());
for (int i = 0; i < output_len; ++i) {
if (Contains(temp_output, outputs_[i])) {
tmp_index.push_back(i);
}
outputs_.push_back(opt);
++idx;
}
attrs_["temporary_index"] = tmp_index;
......@@ -59,9 +72,12 @@ void PlainNet::CompleteAddOp(bool calc) {
std::string PlainNet::DebugString() const {
std::ostringstream os;
os << this->type_ << ":" << std::endl;
os << OperatorBase::DebugString() << std::endl;
for (auto& op : ops_) {
os << "\t" << op->DebugString() << std::endl;
std::istringstream is(op->DebugString());
for (std::string line; std::getline(is, line);) {
os << " " << line << std::endl;
}
}
return os.str();
}
......
......@@ -39,7 +39,7 @@ namespace framework {
*/
class Net : public OperatorBase {
public:
virtual void AddOp(const OperatorPtr& op) = 0;
virtual void AddOp(const std::shared_ptr<OperatorBase>& op) = 0;
virtual void CompleteAddOp(bool calc) = 0;
};
......@@ -57,7 +57,7 @@ class PlainNet : public Net {
* Infer all the operators' input and output variables' shapes, will be called
* before every mini-batch
*/
void InferShape(const ScopePtr& scope) const override {
void InferShape(const std::shared_ptr<Scope>& scope) const override {
for (auto& op : ops_) {
op->InferShape(scope);
}
......@@ -70,7 +70,7 @@ class PlainNet : public Net {
* scope will be used instead. If no OpContext is provided, default context
* will be used.
*/
void Run(const ScopePtr& scope,
void Run(const std::shared_ptr<Scope>& scope,
const platform::DeviceContext& dev_ctx) const override {
for (auto& op : ops_) {
op->Run(scope, dev_ctx);
......@@ -80,7 +80,7 @@ class PlainNet : public Net {
/**
* @brief Add an operator by ptr
*/
void AddOp(const OperatorPtr& op) override {
void AddOp(const std::shared_ptr<OperatorBase>& op) override {
PADDLE_ENFORCE(!add_op_done_, "Cannot AddOp when this network is sealed");
ops_.push_back(op);
}
......@@ -89,7 +89,7 @@ class PlainNet : public Net {
std::string DebugString() const override;
std::vector<OperatorPtr> ops_;
std::vector<std::shared_ptr<OperatorBase>> ops_;
private:
bool add_op_done_{false};
......@@ -100,5 +100,7 @@ class PlainNet : public Net {
}
};
std::shared_ptr<PlainNet> AddBackwardOp(std::shared_ptr<PlainNet> ForwardOps);
} // namespace framework
} // namespace paddle
......@@ -3,17 +3,24 @@
#include <paddle/framework/op_registry.h>
#include <paddle/framework/operator.h>
namespace pd = paddle::framework;
USE_OP(add_two);
USE_OP(mul);
USE_OP(sigmoid);
USE_OP(softmax);
namespace paddle {
namespace framework {
static int infer_shape_cnt = 0;
static int run_cnt = 0;
class TestOp : public pd::OperatorBase {
class TestOp : public OperatorBase {
public:
void InferShape(const paddle::framework::ScopePtr& scope) const override {
void InferShape(
const std::shared_ptr<framework::Scope>& scope) const override {
++infer_shape_cnt;
}
void Run(const paddle::framework::ScopePtr& scope,
void Run(const std::shared_ptr<framework::Scope>& scope,
const paddle::platform::DeviceContext& dev_ctx) const override {
++run_cnt;
}
......@@ -33,7 +40,7 @@ void AssertSameVectorWithoutOrder(const std::vector<T>& expected,
}
TEST(OpKernel, all) {
auto net = std::make_shared<paddle::framework::PlainNet>();
auto net = std::make_shared<PlainNet>();
ASSERT_NE(net, nullptr);
auto op1 = std::make_shared<TestOp>();
......@@ -55,13 +62,37 @@ TEST(OpKernel, all) {
ASSERT_EQ(1UL, tmp_idx.size());
ASSERT_EQ("y", net->outputs_[tmp_idx[0]]);
auto scope = std::make_shared<pd::Scope>();
paddle::platform::CPUDeviceContext dev_ctx;
auto scope = std::make_shared<Scope>();
platform::CPUDeviceContext dev_ctx;
net->InferShape(scope);
net->Run(scope, dev_ctx);
ASSERT_EQ(2, infer_shape_cnt);
ASSERT_EQ(2, run_cnt);
ASSERT_THROW(net->AddOp(op2), std::runtime_error);
}
// Smoke test: builds a small forward net, derives its backward net and makes
// sure DebugString() can be called on every generated gradient operator.
TEST(AddBackwardOp, TestGradOp) {
  auto forward_net = std::make_shared<PlainNet>();
  ASSERT_NE(forward_net, nullptr);

  forward_net->AddOp(OpRegistry::CreateOp("mul", {"X", "Y"}, {"Out"}, {}));
  forward_net->AddOp(OpRegistry::CreateOp("add_two", {"X", "Y"}, {"Out"}, {}));
  forward_net->AddOp(OpRegistry::CreateOp("add_two", {"X", "Y"}, {""}, {}));

  auto backward_net = AddBackwardOp(forward_net);
  for (auto& grad_op : backward_net->ops_) {
    // The returned string is intentionally discarded.
    grad_op->DebugString();
  }
}
// TODO(zhihong): add fc grad without registering.
// TEST(AddBackwardOp, TestNoGradOp) {
// auto net = std::make_shared<PlainNet>();
// ASSERT_NE(net, nullptr);
// net->AddOp(framework::OpRegistry::CreateOp("fc", {"X", "W", "b"}, {"Y"},
// {})); auto grad_ops = AddBackwardOp(net); for (auto& op : grad_ops->ops_) {
// op->DebugString();
// }
// }
} // namespace framework
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/framework/net.h"
#include "paddle/framework/op_registry.h"
#include <gtest/gtest.h>
namespace paddle {
namespace framework {
// Placeholder operator type for the backward tests.
// NOTE(review): no type named `Operator` is declared in the parts of the
// framework visible here (only OperatorBase / OperatorWithKernel) - confirm
// the intended base class.
class FakeFC : public Operator {};
} // namespace framework
} // namespace paddle
......@@ -84,6 +84,11 @@ message VarProto {
// "temporary_index": [1]
// }
optional bool temporary = 4 [default=false];
// Marks that the gradient of this variable can be ignored.
// e.g. for operator AddOp, y = x1 + x2, the gradients dy/dx1 and dy/dx2
// can be ignored, which allows future optimizations on the graph.
optional bool ignore_gradient = 6;
}
// Op protocol message for 3rd-party language binding.
......@@ -105,4 +110,5 @@ message OpProto {
// The type of that Op.
required string type = 5;
}
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
......@@ -6,9 +20,9 @@
#include <unordered_map>
#include <unordered_set>
#include "paddle/framework/attr_checker.h"
#include "paddle/framework/grad_op_builder.h"
#include "paddle/framework/op_desc.pb.h"
#include "paddle/framework/op_proto.pb.h"
#include "paddle/framework/operator.h"
#include "paddle/framework/scope.h"
namespace paddle {
namespace framework {
......@@ -73,25 +87,29 @@ class OpProtoAndCheckerMaker {
protected:
void AddInput(const std::string& name, const std::string& comment,
bool multiple = false) {
bool multiple = false, bool ignore_gradient = false) {
auto input = proto_->mutable_inputs()->Add();
*input->mutable_name() = name;
*input->mutable_comment() = comment;
input->set_ignore_gradient(ignore_gradient);
input->set_multiple(multiple);
if (multiple) {
SetHasMultipleInput();
}
}
void AddInputs(const std::string& name, const std::string& comment) {
AddInput(name, comment, true);
void AddInputs(const std::string& name, const std::string& comment,
bool ignore_gradient = false) {
AddInput(name, comment, true, ignore_gradient);
}
void AddOutput(const std::string& name, const std::string& comment,
bool temporary = false, bool multiple = false) {
bool temporary = false, bool multiple = false,
bool ignore_gradient = false) {
auto output = proto_->mutable_outputs()->Add();
*output->mutable_name() = name;
*output->mutable_comment() = comment;
output->set_ignore_gradient(ignore_gradient);
output->set_multiple(multiple);
if (multiple) {
SetHasMultipleOutput();
......@@ -103,8 +121,8 @@ class OpProtoAndCheckerMaker {
}
void AddOutputs(const std::string& name, const std::string& comment,
bool temporary = false) {
AddOutput(name, comment, temporary, true);
bool temporary = false, bool ignore_gradient = false) {
AddOutput(name, comment, temporary, true, ignore_gradient);
}
template <typename T>
......@@ -204,9 +222,9 @@ class OpRegistry {
public:
template <typename OpType, typename ProtoMakerType>
static void RegisterOp(const std::string& op_type) {
creators()[op_type] = [] { return new OpType; };
OpProto& op_proto = protos()[op_type];
op_creators()[op_type] = [] { return new OpType; };
OpAttrChecker& op_checker = op_checkers()[op_type];
OpProto& op_proto = protos()[op_type];
auto maker = ProtoMakerType(&op_proto, &op_checker);
maker.Validate();
*op_proto.mutable_type() = op_type;
......@@ -227,18 +245,26 @@ class OpRegistry {
}
}
static OperatorPtr CreateOp(const std::string& type,
const VarNameList& inputs,
const VarNameList& outputs,
const AttributeMap& attrs) {
auto op_create_it = creators().find(type);
PADDLE_ENFORCE(op_create_it != creators().end(),
"Operator %s cannot be found", type);
// Registers `GradOpType` as the gradient operator of `op_type`.
//
// A creator that default-constructs `GradOpType` is stored under
// `grad_op_type`, and `op_type` is mapped to `grad_op_type` in grad_ops().
// Existing entries with the same keys are overwritten.
template <typename GradOpType>
static void RegisterGradOp(const std::string& op_type,
const std::string& grad_op_type) {
op_creators()[grad_op_type] = [] { return new GradOpType; };
grad_ops()[op_type] = grad_op_type;
}
static std::shared_ptr<OperatorBase> CreateOp(const std::string& type,
const VarNameList& inputs,
const VarNameList& outputs,
const AttributeMap& attrs) {
auto op_create_it = op_creators().find(type);
PADDLE_ENFORCE(op_create_it != op_creators().end(),
"Operator %s cannot be found.", type);
auto op = op_create_it->second();
op->type_ = type;
op->inputs_ = inputs;
op->outputs_ = outputs;
op->attrs_ = attrs;
op_checkers().at(type).Check(op->attrs_);
......@@ -252,10 +278,10 @@ class OpRegistry {
}
op->Init();
return OperatorPtr(op);
return std::shared_ptr<OperatorBase>(op);
}
static OperatorPtr CreateOp(const OpDesc& op_desc) {
static std::shared_ptr<OperatorBase> CreateOp(const OpDesc& op_desc) {
std::vector<std::string> inputs;
inputs.reserve((size_t)op_desc.inputs_size());
std::copy(op_desc.inputs().begin(), op_desc.inputs().end(),
......@@ -274,18 +300,41 @@ class OpRegistry {
return CreateOp(op_desc.type(), inputs, outputs, attrs);
}
static std::shared_ptr<OperatorBase> CreateGradOp(
std::shared_ptr<OperatorBase> op) {
GradOpBuilder builder(op.get());
std::shared_ptr<OperatorBase> grad_op(builder.Build());
grad_op->Init();
return grad_op;
}
static std::unordered_map<std::string, OpProto>& protos() {
static std::unordered_map<std::string, OpProto> protos_;
return protos_;
};
private:
// Registry mapping a forward operator type to the type name of its gradient
// operator (filled by RegisterGradOp). Returns a reference to a
// function-local static, created on first use.
static std::unordered_map<std::string, std::string>& grad_ops() {
static std::unordered_map<std::string, std::string> grad_ops_;
return grad_ops_;
}
// Registry of shared VarIndexMap objects keyed by string.
// NOTE(review): presumably keyed by operator type - confirm against the
// callers, which are not visible here.
static std::unordered_map<std::string, std::shared_ptr<VarIndexMap>>&
VarIndexMaps() {
static std::unordered_map<std::string, std::shared_ptr<VarIndexMap>> maps_;
return maps_;
}
// Registry mapping an operator type name to the factory that creates it.
static std::unordered_map<std::string, OpCreator>& op_creators() {
static std::unordered_map<std::string, OpCreator> op_creators_;
return op_creators_;
}
private:
static std::unordered_map<std::string, OpAttrChecker>& op_checkers() {
static std::unordered_map<std::string, OpAttrChecker> op_checkers_;
return op_checkers_;
};
static void GenerateTempVariableName(OperatorBase* op) {
static std::atomic<size_t> gUniqId(0UL);
for (auto& outname : op->outputs_) {
......@@ -296,16 +345,6 @@ class OpRegistry {
}
}
}
static std::unordered_map<std::string, OpCreator>& creators() {
static std::unordered_map<std::string, OpCreator> creators_;
return creators_;
}
static std::unordered_map<std::string, OpAttrChecker>& op_checkers() {
static std::unordered_map<std::string, OpAttrChecker> op_checkers_;
return op_checkers_;
};
};
template <typename OpType, typename ProtoMakerType>
......@@ -316,6 +355,14 @@ class OpRegisterHelper {
}
};
// Helper whose constructor registers `GradOpType` with OpRegistry as the
// gradient operator of `op_type`, under the name `grad_op_type`.
// Instantiated as a static object by the REGISTER_GRADIENT_OP macro.
template <typename GradOpType>
class GradOpRegisterHelper {
public:
GradOpRegisterHelper(const char* op_type, const char* grad_op_type) {
OpRegistry::RegisterGradOp<GradOpType>(op_type, grad_op_type);
}
};
/**
* check if MACRO is used in GLOBAL NAMESPACE.
*/
......@@ -335,6 +382,20 @@ class OpRegisterHelper {
__op_register_##__op_type##__(#__op_type); \
int __op_register_##__op_type##_handle__() { return 0; }
/**
 * Macro to Register Gradient Operator.
 *
 * __op_type        type name of the forward operator.
 * __grad_op_type   type name under which the gradient operator is registered.
 * __grad_op_class  C++ class implementing the gradient operator.
 *
 * Expands to a global-namespace check, a static GradOpRegisterHelper object
 * whose constructor performs the registration, and an int function
 * `__op_gradient_register_<op_type><grad_op_type>_handle__` that returns 0.
 * NOTE(review): the handle function is presumably referenced from other
 * translation units to force this object file to be linked - confirm.
 */
#define REGISTER_GRADIENT_OP(__op_type, __grad_op_type, __grad_op_class) \
STATIC_ASSERT_GLOBAL_NAMESPACE( \
__reg_gradient_op__##__op_type##__grad_op_type, \
"REGISTER_GRADIENT_OP must be in global namespace"); \
static ::paddle::framework::GradOpRegisterHelper<__grad_op_class> \
__op_gradient_register_##__op_type##__grad_op_type##__(#__op_type, \
#__grad_op_type); \
int __op_gradient_register_##__op_type##__grad_op_type##_handle__() { \
return 0; \
}
/**
* Macro to Register OperatorKernel.
*/
......
......@@ -7,9 +7,9 @@ namespace paddle {
namespace framework {
class CosineOp : public OperatorBase {
public:
void Run(const ScopePtr& scope,
void Run(const std::shared_ptr<Scope>& scope,
const platform::DeviceContext& dev_ctx) const override {}
void InferShape(const ScopePtr& scope) const override {}
void InferShape(const std::shared_ptr<Scope>& scope) const override {}
};
class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
......@@ -27,8 +27,8 @@ class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
class MyTestOp : public OperatorBase {
public:
void InferShape(const ScopePtr& scope) const override {}
void Run(const ScopePtr& scope,
void InferShape(const std::shared_ptr<Scope>& scope) const override {}
void Run(const std::shared_ptr<Scope>& scope,
const platform::DeviceContext& dev_ctx) const override {}
};
......@@ -67,7 +67,7 @@ TEST(OpRegistry, CreateOp) {
attr->set_type(paddle::framework::AttrType::FLOAT);
attr->set_f(scale);
paddle::framework::OperatorPtr op =
std::shared_ptr<paddle::framework::OperatorBase> op =
paddle::framework::OpRegistry::CreateOp(op_desc);
auto scope = std::make_shared<paddle::framework::Scope>();
paddle::platform::CPUDeviceContext dev_ctx;
......@@ -89,8 +89,7 @@ TEST(OpRegistry, IllegalAttr) {
bool caught = false;
try {
paddle::framework::OperatorPtr op __attribute__((unused)) =
paddle::framework::OpRegistry::CreateOp(op_desc);
paddle::framework::OpRegistry::CreateOp(op_desc);
} catch (std::runtime_error& err) {
caught = true;
std::string msg = "larger_than check fail";
......@@ -110,7 +109,7 @@ TEST(OpRegistry, DefaultValue) {
ASSERT_TRUE(op_desc.IsInitialized());
paddle::framework::OperatorPtr op =
std::shared_ptr<paddle::framework::OperatorBase> op =
paddle::framework::OpRegistry::CreateOp(op_desc);
auto scope = std::make_shared<paddle::framework::Scope>();
paddle::platform::CPUDeviceContext dev_ctx;
......@@ -136,8 +135,7 @@ TEST(OpRegistry, CustomChecker) {
// attr 'test_attr' is not set
bool caught = false;
try {
paddle::framework::OperatorPtr op __attribute__((unused)) =
paddle::framework::OpRegistry::CreateOp(op_desc);
paddle::framework::OpRegistry::CreateOp(op_desc);
} catch (std::runtime_error& err) {
caught = true;
std::string msg = "Attribute 'test_attr' is required!";
......@@ -155,8 +153,7 @@ TEST(OpRegistry, CustomChecker) {
attr->set_i(3);
caught = false;
try {
paddle::framework::OperatorPtr op __attribute__((unused)) =
paddle::framework::OpRegistry::CreateOp(op_desc);
paddle::framework::OpRegistry::CreateOp(op_desc);
} catch (std::runtime_error& err) {
caught = true;
std::string msg = "'test_attr' must be even!";
......@@ -174,8 +171,7 @@ TEST(OpRegistry, CustomChecker) {
attr->set_type(paddle::framework::AttrType::INT);
attr->set_i(4);
SetInputFormat(&op_desc);
paddle::framework::OperatorPtr op =
paddle::framework::OpRegistry::CreateOp(op_desc);
auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
paddle::platform::CPUDeviceContext dev_ctx;
auto scope = std::make_shared<paddle::framework::Scope>();
op->Run(scope, dev_ctx);
......
......@@ -47,7 +47,6 @@ struct EigenDeviceConverter<platform::GPUPlace> {
#endif
class OperatorBase;
using OperatorPtr = std::shared_ptr<OperatorBase>;
/**
* OperatorBase has the basic element that Net will call to do computation.
* Only CreateOperator from OpRegistry will new Operator directly. User
......@@ -63,6 +62,11 @@ class OperatorBase {
/// but it will be convert to a unique name in scope after OpCreator.
static std::string TMP_VAR_NAME() { return "@TEMP@"; }
/// If a variable's name has a certain suffix, it means that the
/// variable is the gradient of another variable.
/// e.g. Variable "x@GRAD" is the gradient of variable "x".
static std::string GRAD_VAR_SUFFIX() { return "@GRAD"; }
virtual ~OperatorBase() {}
template <typename T>
......@@ -80,10 +84,10 @@ class OperatorBase {
/// InferShape infer the size of Variables used by this Operator with
/// information inside scope
virtual void InferShape(const ScopePtr& scope) const = 0;
virtual void InferShape(const std::shared_ptr<Scope>& scope) const = 0;
/// Net will call this function to Run an op.
virtual void Run(const ScopePtr& scope,
virtual void Run(const std::shared_ptr<Scope>& scope,
const platform::DeviceContext& dev_ctx) const = 0;
// Get a input with argument's name described in `op_proto`
......@@ -208,7 +212,7 @@ class OperatorWithKernel : public OperatorBase {
using OpKernelMap =
std::unordered_map<OpKernelKey, std::unique_ptr<OpKernel>, OpKernelHash>;
void Run(const ScopePtr& scope,
void Run(const std::shared_ptr<Scope>& scope,
const platform::DeviceContext& dev_ctx) const final {
auto& opKernel = AllOpKernels().at(type_).at(OpKernelKey(dev_ctx));
opKernel->Compute(KernelContext(this, scope, dev_ctx));
......
......@@ -24,8 +24,8 @@ static int op_run_num = 0;
class OpWithoutKernelTest : public OperatorBase {
public:
void Init() override { x = 1; }
void InferShape(const ScopePtr& scope) const override {}
void Run(const ScopePtr& scope,
void InferShape(const std::shared_ptr<Scope>& scope) const override {}
void Run(const std::shared_ptr<Scope>& scope,
const platform::DeviceContext& dev_ctx) const override {
op_run_num++;
ASSERT_EQ((int)inputs_.size(), 1);
......@@ -70,8 +70,7 @@ TEST(OperatorBase, all) {
paddle::platform::CPUDeviceContext device_context;
auto scope = std::make_shared<paddle::framework::Scope>();
paddle::framework::OperatorPtr op =
paddle::framework::OpRegistry::CreateOp(op_desc);
auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
scope->CreateVariable("OUT1");
ASSERT_EQ(paddle::framework::op_run_num, 0);
op->Run(scope, device_context);
......@@ -189,8 +188,7 @@ TEST(OpKernel, all) {
paddle::platform::CPUDeviceContext cpu_device_context;
auto scope = std::make_shared<paddle::framework::Scope>();
paddle::framework::OperatorPtr op =
paddle::framework::OpRegistry::CreateOp(op_desc);
auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 0);
op->Run(scope, cpu_device_context);
ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 1);
......@@ -236,6 +234,6 @@ TEST(OpKernel, multi_inputs) {
paddle::platform::CPUDeviceContext cpu_device_context;
auto scope = std::make_shared<Scope>();
OperatorPtr op(paddle::framework::OpRegistry::CreateOp(op_desc));
auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
op->Run(scope, cpu_device_context);
}
......@@ -24,7 +24,6 @@ namespace paddle {
namespace framework {
class Scope;
using ScopePtr = std::shared_ptr<Scope>;
/**
* @brief Scope that manage all variables.
......@@ -44,7 +43,7 @@ class Scope {
/**
* @brief Initialize a Scope with parent.
*/
explicit Scope(const ScopePtr& parent) : parent_(parent) {}
explicit Scope(const std::shared_ptr<Scope>& parent) : parent_(parent) {}
/**
* @brief Create Variable
......@@ -91,7 +90,7 @@ class Scope {
private:
std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;
ScopePtr parent_{nullptr};
std::shared_ptr<Scope> parent_{nullptr};
};
} // namespace framework
......
......@@ -48,25 +48,27 @@ class Tensor {
template <typename T>
const T* data() const {
CheckDims<T>();
EnforceSufficientMemory<T>();
return reinterpret_cast<const T*>(
reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
}
template <typename T>
T* data() {
CheckDims<T>();
EnforceSufficientMemory<T>();
return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
offset_);
}
template <typename T>
template <typename T, // must be POD types
typename std::enable_if<std::is_pod<T>::value>::type* = nullptr>
T* mutable_data(DDim dims, platform::Place place) {
set_dims(dims);
Resize(dims);
return mutable_data<T>(place);
}
template <typename T>
template <typename T, // must be POD types
typename std::enable_if<std::is_pod<T>::value>::type* = nullptr>
T* mutable_data(platform::Place place) {
PADDLE_ENFORCE(product(dims_) > 0,
"Tensor's numel must be larger than zero to call "
......@@ -95,11 +97,9 @@ class Tensor {
}
template <typename T>
void ShareDataFrom(const Tensor& src) {
src.CheckDims<T>();
holder_ = src.holder_;
set_dims(src.dims());
offset_ = src.offset_;
void ShareDataWith(const Tensor& src) {
src.EnforceSufficientMemory<T>();
*this = src;
}
template <typename T>
......@@ -107,9 +107,9 @@ class Tensor {
PADDLE_ENFORCE(platform::is_cpu_place(src.holder_->place()) &&
platform::is_cpu_place(dst_place),
"Tensor::CopyFrom only support CPU now.");
src.CheckDims<T>();
src.EnforceSufficientMemory<T>();
size_t size = product(src.dims_) * sizeof(T);
set_dims(src.dims());
Resize(src.dims());
const void* src_ptr = static_cast<const void*>(src.data<T>());
void* dst_ptr = static_cast<void*>(mutable_data<T>(dst_place));
memcpy(dst_ptr, src_ptr, size);
......@@ -117,34 +117,25 @@ class Tensor {
template <typename T>
Tensor Slice(const int& begin_idx, const int& end_idx) const {
CheckDims<T>();
PADDLE_ENFORCE(begin_idx >= 0 && end_idx <= dims_[0],
"Slice index is less than zero or out of bound.");
EnforceSufficientMemory<T>();
PADDLE_ENFORCE(begin_idx >= 0, "Slice begin index is less than zero.");
PADDLE_ENFORCE(end_idx <= dims_[0], "Slice end index is out of bound.");
PADDLE_ENFORCE(begin_idx < end_idx,
"Begin index must be less than end index.");
PADDLE_ENFORCE(dims_[0] != 1, "Can not slice a tensor with dims_[0] = 1.");
std::vector<int> d = vectorize(dims_);
int base = 1;
for (size_t i = 1; i < d.size(); ++i) {
base *= d[i];
}
int base = product(dims_) / dims_[0];
Tensor dst;
dst.holder_ = holder_;
DDim dst_dims = dims_;
dst_dims[0] = end_idx - begin_idx;
dst.set_dims(dst_dims);
dst.Resize(dst_dims);
dst.offset_ = offset_ + begin_idx * base * sizeof(T);
return dst;
}
void set_dims(const DDim& dims) {
if (dims == dims_) {
return;
}
dims_ = dims;
}
void Resize(const DDim& dims) { dims_ = dims; }
DDim dims() const { return dims_; }
const DDim& dims() const { return dims_; }
private:
// Placeholder hides type T, so it doesn't appear as a template
......@@ -159,21 +150,9 @@ class Tensor {
template <typename T, typename PlaceType>
struct PlaceholderImpl : public Placeholder {
private:
template <typename PType>
class Deleter {
public:
Deleter(PType place) : place_(place) {}
void operator()(T* ptr) { memory::Free(place_, static_cast<void*>(ptr)); }
private:
PType place_;
};
public:
PlaceholderImpl(PlaceType place, size_t size)
: ptr_(static_cast<T*>(memory::Alloc(place, size)),
Deleter<PlaceType>(place)),
memory::PODDeleter<T, PlaceType>(place)),
place_(place),
size_(size) {}
......@@ -182,13 +161,13 @@ class Tensor {
virtual paddle::platform::Place place() const { return place_; }
virtual std::type_index type() const { return std::type_index(typeid(T)); }
std::unique_ptr<T, Deleter<PlaceType>> ptr_;
std::unique_ptr<T, memory::PODDeleter<T, PlaceType>> ptr_;
platform::Place place_; // record the place of ptr_.
size_t size_; // size of the memory block.
};
template <typename T>
inline void CheckDims() const {
inline void EnforceSufficientMemory() const {
PADDLE_ENFORCE(holder_ != nullptr,
"Tenosr holds no memory. Call Tensor::mutable_data first.");
PADDLE_ENFORCE(holder_->size() >= product(dims_) * sizeof(T) + offset_,
......@@ -198,7 +177,11 @@ class Tensor {
std::shared_ptr<Placeholder> holder_; // holds the memory block if allocated.
DDim dims_;
size_t offset_; // marks the begin of tensor data area.
// A PlaceHolder may be shared by more than one tensor. Some of them may be
// slices of the others. So the offset_ is introduced here to indicate the
// byte offset between PlaceHolder::ptr_ and where tensor's data really
// begins.
size_t offset_;
};
} // namespace framework
......
......@@ -19,7 +19,7 @@ TEST(Tensor, Dims) {
using namespace paddle::framework;
using namespace paddle::platform;
Tensor tt;
tt.set_dims(make_ddim({2, 3, 4}));
tt.Resize(make_ddim({2, 3, 4}));
DDim dims = tt.dims();
ASSERT_EQ(arity(dims), 3);
for (int i = 0; i < 3; ++i) {
......@@ -97,7 +97,7 @@ TEST(Tensor, MutableData) {
#endif
}
TEST(Tensor, ShareDataFrom) {
TEST(Tensor, ShareDataWith) {
using namespace paddle::framework;
using namespace paddle::platform;
{
......@@ -106,7 +106,7 @@ TEST(Tensor, ShareDataFrom) {
// Try to share data from uninitialized tensor
bool caught = false;
try {
dst_tensor.ShareDataFrom<float>(src_tensor);
dst_tensor.ShareDataWith<float>(src_tensor);
} catch (std::runtime_error& err) {
caught = true;
std::string msg =
......@@ -119,7 +119,7 @@ TEST(Tensor, ShareDataFrom) {
ASSERT_TRUE(caught);
src_tensor.mutable_data<int>(make_ddim({2, 3, 4}), CPUPlace());
dst_tensor.ShareDataFrom<int>(src_tensor);
dst_tensor.ShareDataWith<int>(src_tensor);
ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
}
......@@ -128,7 +128,7 @@ TEST(Tensor, ShareDataFrom) {
Tensor src_tensor;
Tensor dst_tensor;
src_tensor.mutable_data<int>(make_ddim({2, 3, 4}), GPUPlace());
dst_tensor.ShareDataFrom<int>(src_tensor);
dst_tensor.ShareDataWith<int>(src_tensor);
ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
}
#endif
......
......@@ -31,13 +31,22 @@ public:
ConvolutionTest(const std::string& conv1,
const std::string& conv2,
TestType type,
bool useGroups = true,
std::string algo = "auto") {
for (size_t batchSize : {1, 32}) {
for (size_t inputSize : {7, 14, 54}) {
for (size_t filterSize : {1, 3, 5}) {
for (size_t inputChannels : {3, 64}) {
for (size_t outputChannels : {3, 64, 128}) {
if (inputChannels < outputChannels) break;
for (size_t outputChannels : {3, 64}) {
if (inputChannels > outputChannels) break;
size_t groups;
if (!useGroups) {
groups = 1;
} else {
if (outputChannels % inputChannels != 0) continue;
groups = inputChannels;
}
for (size_t stride : {1, 2}) {
for (size_t padding : {0, 1}) {
if (padding >= filterSize) break;
......@@ -62,13 +71,24 @@ public:
FuncConfig()
.set("paddings", paddings)
.set("strides", strides)
.set("groups", (size_t)1)
.set("groups", groups)
.set("algo", algo));
TensorShape input{
batchSize, inputChannels, inputSize, inputSize};
TensorShape filter{
outputChannels, inputChannels, filterSize, filterSize};
TensorShape filter;
if (groups > 1)
filter = TensorShape({groups,
outputChannels / groups,
inputChannels / groups,
filterSize,
filterSize});
else
filter = TensorShape({outputChannels,
inputChannels,
filterSize,
filterSize});
TensorShape output{
batchSize, outputChannels, outputSize, outputSize};
......@@ -85,7 +105,8 @@ public:
} else if (type == kBackwardFilterTest) {
test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output));
test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input));
test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, filter));
test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, filter),
ADD_TO);
test.run();
}
}
......@@ -106,6 +127,7 @@ public:
ConvolutionTest2(const std::string& conv1,
const std::string& conv2,
TestType type,
bool useGroups = true,
std::string algo = "auto") {
for (size_t batchSize : {16}) {
for (size_t inputHeight : {7, 31}) {
......@@ -113,7 +135,15 @@ public:
for (size_t filterHeight : {1, 5}) {
for (size_t filterWidth : {3, 7}) {
for (size_t inputChannels : {7}) {
for (size_t outputChannels : {32}) {
for (size_t outputChannels : {7}) {
size_t groups;
if (!useGroups) {
groups = 1;
} else {
if (outputChannels % inputChannels != 0) continue;
groups = inputChannels;
}
size_t stride = 1;
size_t padding = 0;
size_t outputHeight =
......@@ -141,13 +171,24 @@ public:
FuncConfig()
.set("paddings", paddings)
.set("strides", strides)
.set("groups", (size_t)1)
.set("groups", groups)
.set("algo", algo));
TensorShape input{
batchSize, inputChannels, inputHeight, inputWidth};
TensorShape filter{
outputChannels, inputChannels, filterHeight, filterWidth};
TensorShape filter;
if (groups > 1)
filter = TensorShape({groups,
outputChannels / groups,
inputChannels / groups,
filterHeight,
filterWidth});
else
filter = TensorShape({outputChannels,
inputChannels,
filterHeight,
filterWidth});
TensorShape output{
batchSize, outputChannels, outputHeight, outputWidth};
......@@ -164,7 +205,8 @@ public:
} else if (type == kBackwardFilterTest) {
test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output));
test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input));
test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, filter));
test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, filter),
ADD_TO);
test.run();
}
}
......@@ -177,34 +219,88 @@ public:
}
};
// ======Start Convolution TEST======
TEST(Forward, GEMM) {
ConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_CPU> test(
"NaiveConv-CPU", "GemmConv-CPU", kForwardTest);
"NaiveConv-CPU", "GemmConv-CPU", kForwardTest, false);
ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_CPU> test2(
"NaiveConv-CPU", "GemmConv-CPU", kForwardTest);
"NaiveConv-CPU", "GemmConv-CPU", kForwardTest, false);
}
#ifndef PADDLE_ONLY_CPU
TEST(Forward, GEMM2) {
ConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test(
"GemmConv-CPU", "GemmConv-GPU", kForwardTest);
"GemmConv-CPU", "GemmConv-GPU", kForwardTest, false);
ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test2(
"GemmConv-CPU", "GemmConv-GPU", kForwardTest);
"GemmConv-CPU", "GemmConv-GPU", kForwardTest, false);
}
TEST(BackwardInput, GEMM) {
ConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test(
"GemmConvGradInput-CPU", "GemmConvGradInput-GPU", kBackwardInputTest);
"GemmConvGradInput-CPU",
"GemmConvGradInput-GPU",
kBackwardInputTest,
false);
ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test2(
"GemmConvGradInput-CPU", "GemmConvGradInput-GPU", kBackwardInputTest);
"GemmConvGradInput-CPU",
"GemmConvGradInput-GPU",
kBackwardInputTest,
false);
}
TEST(BackwardFilter, GEMM) {
ConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test(
"GemmConvGradFilter-CPU", "GemmConvGradFilter-GPU", kBackwardFilterTest);
"GemmConvGradFilter-CPU",
"GemmConvGradFilter-GPU",
kBackwardFilterTest,
false);
ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test2(
"GemmConvGradFilter-CPU", "GemmConvGradFilter-GPU", kBackwardFilterTest);
"GemmConvGradFilter-CPU",
"GemmConvGradFilter-GPU",
kBackwardFilterTest,
false);
}
#endif
// ======End Convolution TEST======
// ======Start DepthwiseConvolution TEST======
// TODO(zhaolong) The depthwise convolution cpu test will be added when the cpu
// version of depthwiseConv is implemented.
#ifndef PADDLE_ONLY_CPU
// Forward pass: runs "GemmConv-CPU" and "DepthwiseConv-GPU" through both
// convolution test drivers, relying on the drivers' default arguments.
TEST(DepthwiseConvForward, GEMM2) {
  ConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> equalSizeCase(
      "GemmConv-CPU", "DepthwiseConv-GPU", kForwardTest);

  ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> heightWidthCase(
      "GemmConv-CPU", "DepthwiseConv-GPU", kForwardTest);
}
// Input gradient: runs "GemmConvGradInput-CPU" and
// "DepthwiseConvGradInput-GPU" through both convolution test drivers,
// relying on the drivers' default arguments.
TEST(DepthwiseConvBackwardInput, GEMM) {
  ConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> equalSizeCase(
      "GemmConvGradInput-CPU",
      "DepthwiseConvGradInput-GPU",
      kBackwardInputTest);

  ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> heightWidthCase(
      "GemmConvGradInput-CPU",
      "DepthwiseConvGradInput-GPU",
      kBackwardInputTest);
}
// Filter gradient: runs "GemmConvGradFilter-CPU" and
// "DepthwiseConvGradFilter-GPU" through both convolution test drivers,
// relying on the drivers' default arguments.
TEST(DepthwiseConvBackwardFilter, GEMM) {
  ConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> equalSizeCase(
      "GemmConvGradFilter-CPU",
      "DepthwiseConvGradFilter-GPU",
      kBackwardFilterTest);

  ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> heightWidthCase(
      "GemmConvGradFilter-CPU",
      "DepthwiseConvGradFilter-GPU",
      kBackwardFilterTest);
}
#endif
// ======End DepthwiseConvolution TEST======
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "DepthwiseConvOp.h"
#include "ConvOp.h"
#include "GemmFunctor.h"
namespace paddle {
/**
 * \brief CPU implementation of the depthwise convolution forward pass.
 *
 * Uses the same data layout as the CUDA kernel in DepthwiseConvOpGpu.cu:
 * input is [batch][inputChannels][inputHeight][inputWidth], output is
 * [batch][outputChannels][outputHeight][outputWidth], and the filter holds
 * one filterHeight x filterWidth window per output channel. Output channel
 * c_out reads input channel c_out / filterMultiplier. Positions that fall
 * into the padding contribute nothing. Each output element is overwritten
 * (not accumulated).
 *
 * \param[in]  inputData        input data.
 * \param[in]  filterData       filter weights.
 * \param[in]  batchSize        number of samples.
 * \param[in]  outputChannels   channels of outputData.
 * \param[in]  outputHeight     height of outputData.
 * \param[in]  outputWidth      width of outputData.
 * \param[in]  inputChannels    channels of inputData.
 * \param[in]  inputHeight      height of inputData.
 * \param[in]  inputWidth       width of inputData.
 * \param[in]  filterMultiplier output channels produced per input channel.
 * \param[in]  filterHeight     height of the filter window.
 * \param[in]  filterWidth      width of the filter window.
 * \param[in]  strideH          stride in the height direction.
 * \param[in]  strideW          stride in the width direction.
 * \param[in]  paddingH         padding in the height direction.
 * \param[in]  paddingW         padding in the width direction.
 * \param[out] outputData       convolution result.
 */
template <class T>
class DepthwiseConvFunctor<DEVICE_TYPE_CPU, T> {
public:
  void operator()(const T* inputData,
                  const T* filterData,
                  int batchSize,
                  int outputChannels,
                  int outputHeight,
                  int outputWidth,
                  int inputChannels,
                  int inputHeight,
                  int inputWidth,
                  int filterMultiplier,
                  int filterHeight,
                  int filterWidth,
                  int strideH,
                  int strideW,
                  int paddingH,
                  int paddingW,
                  T* outputData) {
    // Plane sizes are kept as size_t so pointer offsets cannot overflow int.
    const size_t inputPlaneSize =
        static_cast<size_t>(inputHeight) * inputWidth;
    const size_t outputPlaneSize =
        static_cast<size_t>(outputHeight) * outputWidth;
    const size_t filterSize = static_cast<size_t>(filterHeight) * filterWidth;

    for (int batch = 0; batch < batchSize; ++batch) {
      for (int c_out = 0; c_out < outputChannels; ++c_out) {
        const int c_in = c_out / filterMultiplier;
        const T* weight = filterData + c_out * filterSize;
        const T* inputPlane =
            inputData +
            (static_cast<size_t>(batch) * inputChannels + c_in) *
                inputPlaneSize;
        T* outputPlane =
            outputData +
            (static_cast<size_t>(batch) * outputChannels + c_out) *
                outputPlaneSize;

        for (int h_out = 0; h_out < outputHeight; ++h_out) {
          for (int w_out = 0; w_out < outputWidth; ++w_out) {
            T value = 0;
            for (int kh = 0; kh < filterHeight; ++kh) {
              const int h_in = h_out * strideH - paddingH + kh;
              // Rows inside the padding contribute zero.
              if (h_in < 0 || h_in >= inputHeight) continue;
              for (int kw = 0; kw < filterWidth; ++kw) {
                const int w_in = w_out * strideW - paddingW + kw;
                if (w_in < 0 || w_in >= inputWidth) continue;
                value += weight[kh * filterWidth + kw] *
                         inputPlane[h_in * inputWidth + w_in];
              }
            }
            outputPlane[h_out * outputWidth + w_out] = value;
          }
        }
      }
    }
  }
};
/**
 * \brief CPU implementation of the depthwise convolution gradient w.r.t. the
 *        input.
 *
 * Uses the same data layout as the CUDA kernel in DepthwiseConvOpGpu.cu.
 * Every output-gradient element is scattered back through the filter window
 * onto the input positions it was computed from, and the result is
 * accumulated (+=) into inputGrad, matching the ADD_TO contract checked by
 * DepthwiseConvGradInputFunction.
 *
 * \param[in]     outputGrad       gradient of the output.
 * \param[in]     filterData       filter weights.
 * \param[in]     batchSize        number of samples.
 * \param[in]     outputChannels   channels of outputGrad.
 * \param[in]     outputHeight     height of outputGrad.
 * \param[in]     outputWidth      width of outputGrad.
 * \param[in]     inputChannels    channels of inputGrad.
 * \param[in]     inputHeight      height of inputGrad.
 * \param[in]     inputWidth       width of inputGrad.
 * \param[in]     filterMultiplier output channels produced per input channel.
 * \param[in]     filterHeight     height of the filter window.
 * \param[in]     filterWidth      width of the filter window.
 * \param[in]     strideH          stride in the height direction.
 * \param[in]     strideW          stride in the width direction.
 * \param[in]     paddingH         padding in the height direction.
 * \param[in]     paddingW         padding in the width direction.
 * \param[in,out] inputGrad        gradient of the input; accumulated into.
 */
template <class T>
class DepthwiseConvGradInputFunctor<DEVICE_TYPE_CPU, T> {
public:
  void operator()(const T* outputGrad,
                  const T* filterData,
                  int batchSize,
                  int outputChannels,
                  int outputHeight,
                  int outputWidth,
                  int inputChannels,
                  int inputHeight,
                  int inputWidth,
                  int filterMultiplier,
                  int filterHeight,
                  int filterWidth,
                  int strideH,
                  int strideW,
                  int paddingH,
                  int paddingW,
                  T* inputGrad) {
    // Plane sizes are kept as size_t so pointer offsets cannot overflow int.
    const size_t inputPlaneSize =
        static_cast<size_t>(inputHeight) * inputWidth;
    const size_t outputPlaneSize =
        static_cast<size_t>(outputHeight) * outputWidth;
    const size_t filterSize = static_cast<size_t>(filterHeight) * filterWidth;

    for (int batch = 0; batch < batchSize; ++batch) {
      for (int c_out = 0; c_out < outputChannels; ++c_out) {
        const int c_in = c_out / filterMultiplier;
        const T* weight = filterData + c_out * filterSize;
        const T* outputGradPlane =
            outputGrad +
            (static_cast<size_t>(batch) * outputChannels + c_out) *
                outputPlaneSize;
        T* inputGradPlane =
            inputGrad +
            (static_cast<size_t>(batch) * inputChannels + c_in) *
                inputPlaneSize;

        for (int h_out = 0; h_out < outputHeight; ++h_out) {
          for (int w_out = 0; w_out < outputWidth; ++w_out) {
            const T grad = outputGradPlane[h_out * outputWidth + w_out];
            for (int kh = 0; kh < filterHeight; ++kh) {
              const int h_in = h_out * strideH - paddingH + kh;
              // Positions inside the padding receive no gradient.
              if (h_in < 0 || h_in >= inputHeight) continue;
              for (int kw = 0; kw < filterWidth; ++kw) {
                const int w_in = w_out * strideW - paddingW + kw;
                if (w_in < 0 || w_in >= inputWidth) continue;
                inputGradPlane[h_in * inputWidth + w_in] +=
                    grad * weight[kh * filterWidth + kw];
              }
            }
          }
        }
      }
    }
  }
};
/**
 * \brief CPU implementation of the depthwise convolution gradient w.r.t. the
 *        filter.
 *
 * Uses the same data layout as the CUDA kernel in DepthwiseConvOpGpu.cu.
 * For every output channel and filter tap, the products of output gradient
 * and the matching input element are summed over all output positions and
 * all samples, and the sum is accumulated (+=) into filterGrad, matching the
 * ADD_TO contract checked by DepthwiseConvGradFilterFunction.
 *
 * \param[in]     outputGrad       gradient of the output.
 * \param[in]     inputData        input data of the forward pass.
 * \param[in]     batchSize        number of samples.
 * \param[in]     outputChannels   channels of outputGrad.
 * \param[in]     outputHeight     height of outputGrad.
 * \param[in]     outputWidth      width of outputGrad.
 * \param[in]     inputChannels    channels of inputData.
 * \param[in]     inputHeight      height of inputData.
 * \param[in]     inputWidth       width of inputData.
 * \param[in]     filterMultiplier output channels produced per input channel.
 * \param[in]     filterHeight     height of the filter window.
 * \param[in]     filterWidth      width of the filter window.
 * \param[in]     strideH          stride in the height direction.
 * \param[in]     strideW          stride in the width direction.
 * \param[in]     paddingH         padding in the height direction.
 * \param[in]     paddingW         padding in the width direction.
 * \param[in]     colData          scratch buffer; not used by this CPU path.
 * \param[in,out] filterGrad       gradient of the filter; accumulated into.
 */
template <class T>
class DepthwiseConvGradFilterFunctor<DEVICE_TYPE_CPU, T> {
public:
  void operator()(const T* outputGrad,
                  const T* inputData,
                  int batchSize,
                  int outputChannels,
                  int outputHeight,
                  int outputWidth,
                  int inputChannels,
                  int inputHeight,
                  int inputWidth,
                  int filterMultiplier,
                  int filterHeight,
                  int filterWidth,
                  int strideH,
                  int strideW,
                  int paddingH,
                  int paddingW,
                  T* colData,
                  T* filterGrad) {
    // The direct summation below needs no intermediate buffer.
    (void)colData;

    // Plane sizes are kept as size_t so pointer offsets cannot overflow int.
    const size_t inputPlaneSize =
        static_cast<size_t>(inputHeight) * inputWidth;
    const size_t outputPlaneSize =
        static_cast<size_t>(outputHeight) * outputWidth;
    const size_t filterSize = static_cast<size_t>(filterHeight) * filterWidth;

    for (int batch = 0; batch < batchSize; ++batch) {
      for (int c_out = 0; c_out < outputChannels; ++c_out) {
        const int c_in = c_out / filterMultiplier;
        const T* outputGradPlane =
            outputGrad +
            (static_cast<size_t>(batch) * outputChannels + c_out) *
                outputPlaneSize;
        const T* inputPlane =
            inputData +
            (static_cast<size_t>(batch) * inputChannels + c_in) *
                inputPlaneSize;
        T* weightGrad = filterGrad + c_out * filterSize;

        for (int kh = 0; kh < filterHeight; ++kh) {
          for (int kw = 0; kw < filterWidth; ++kw) {
            T sum = 0;
            for (int h_out = 0; h_out < outputHeight; ++h_out) {
              const int h_in = h_out * strideH - paddingH + kh;
              // Taps that land in the padding contribute zero.
              if (h_in < 0 || h_in >= inputHeight) continue;
              for (int w_out = 0; w_out < outputWidth; ++w_out) {
                const int w_in = w_out * strideW - paddingW + kw;
                if (w_in < 0 || w_in >= inputWidth) continue;
                sum += outputGradPlane[h_out * outputWidth + w_out] *
                       inputPlane[h_in * inputWidth + w_in];
              }
            }
            weightGrad[kh * filterWidth + kw] += sum;
          }
        }
      }
    }
  }
};
/*
 * \brief Forward calculation of depthwise convolution.
 *
 * Argument layout: inputs[0] is the input image, inputs[1] the filter and
 * outputs[0] the convolution result. calc() unpacks the tensor shapes and
 * forwards everything to DepthwiseConvFunctor for the selected device.
 */
template <DeviceType Device>
class DepthwiseConvFunction : public ConvFunctionBase {
public:
  void init(const FuncConfig& config) override {
    ConvFunctionBase::init(config);
  }

  // Validates the (input, filter, output) shape triple via checkShape().
  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
    checkShape(/* input */ inputs[0].shape(),
               /* filter */ inputs[1].shape(),
               /* output */ outputs[0].shape());
  }

  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
    CHECK_EQ(numInputs_, inputs.size());
    CHECK_EQ(numOutputs_, outputs.size());
    check(inputs, outputs);

    const TensorShape& inShape = inputs[0].shape();
    const TensorShape& filterShape = inputs[1].shape();
    const TensorShape& outShape = outputs[0].shape();

    // Only the channel counts are needed as named values; they drive the
    // multiplier and the depthwise precondition below.
    size_t inputChannels = inShape[1];
    size_t outputChannels = outShape[1];
    size_t filterMultiplier = outputChannels / groups_;
    // Depthwise means exactly one group per input channel.
    CHECK_EQ(inputChannels, groups_);

    DepthwiseConvFunctor<Device, real> forward;
    forward(inputs[0].data<real>(),
            inputs[1].data<real>(),
            inShape[0],  // batchSize
            outputChannels,
            outShape[2],  // outputHeight
            outShape[3],  // outputWidth
            inputChannels,
            inShape[2],  // inputHeight
            inShape[3],  // inputWidth
            filterMultiplier,
            getFilterHeight(filterShape),
            getFilterWidth(filterShape),
            strideH(),
            strideW(),
            paddingH(),
            paddingW(),
            outputs[0].data<real>());
  }
};
/*
 * \brief Backward input calculation of depthwise convolution.
 *
 * Argument layout: inputs[0] is the output gradient, inputs[1] the filter
 * and outputs[0] the input gradient. The output argument must be ADD_TO
 * because the functor accumulates into it.
 */
template <DeviceType Device>
class DepthwiseConvGradInputFunction : public ConvFunctionBase {
public:
  void init(const FuncConfig& config) override {
    ConvFunctionBase::init(config);
  }

  // Validates the (input, filter, output) shape triple via checkShape().
  // Note the roles: the forward output arrives in inputs[0] and the forward
  // input is outputs[0].
  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
    const TensorShape& output = inputs[0].shape();
    const TensorShape& filter = inputs[1].shape();
    const TensorShape& input = outputs[0].shape();
    checkShape(input, filter, output);
  }

  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
    CHECK_EQ(numInputs_, inputs.size());
    CHECK_EQ(numOutputs_, outputs.size());
    // The gradient is accumulated, so the caller must request ADD_TO.
    // (This check used to be repeated after check(); once is sufficient.)
    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
    check(inputs, outputs);

    const TensorShape& output = inputs[0].shape();
    const TensorShape& filter = inputs[1].shape();
    const TensorShape& input = outputs[0].shape();

    size_t batchSize = input[0];
    size_t inputChannels = input[1];
    size_t inputHeight = input[2];
    size_t inputWidth = input[3];
    size_t filterHeight = getFilterHeight(filter);
    size_t filterWidth = getFilterWidth(filter);
    size_t outputChannels = output[1];
    size_t outputHeight = output[2];
    size_t outputWidth = output[3];
    size_t filterMultiplier = outputChannels / groups_;
    // Depthwise means exactly one group per input channel.
    CHECK_EQ(inputChannels, groups_);

    real* outputGrad = inputs[0].data<real>();
    real* filterData = inputs[1].data<real>();
    real* inputGrad = outputs[0].data<real>();

    DepthwiseConvGradInputFunctor<Device, real> depthwiseConvGradInput;
    depthwiseConvGradInput(outputGrad,
                           filterData,
                           batchSize,
                           outputChannels,
                           outputHeight,
                           outputWidth,
                           inputChannels,
                           inputHeight,
                           inputWidth,
                           filterMultiplier,
                           filterHeight,
                           filterWidth,
                           strideH(),
                           strideW(),
                           paddingH(),
                           paddingW(),
                           inputGrad);
  }
};
/*
 * \brief Backward filter calculation of depthwise convolution.
 *
 * Argument layout: inputs[0] is the output gradient, inputs[1] the forward
 * input and outputs[0] the filter gradient. The output argument must be
 * ADD_TO because the functor accumulates into it.
 */
template <DeviceType Device>
class DepthwiseConvGradFilterFunction : public ConvFunctionBase {
public:
  void init(const FuncConfig& config) override {
    ConvFunctionBase::init(config);
  }

  // Validates the (input, filter, output) shape triple via checkShape().
  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
    const TensorShape& output = inputs[0].shape();
    const TensorShape& input = inputs[1].shape();
    const TensorShape& filter = outputs[0].shape();
    checkShape(input, filter, output);
  }

  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
    CHECK_EQ(numInputs_, inputs.size());
    CHECK_EQ(numOutputs_, outputs.size());
    // The gradient is accumulated, so the caller must request ADD_TO.
    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
    check(inputs, outputs);

    const TensorShape& output = inputs[0].shape();
    const TensorShape& input = inputs[1].shape();
    const TensorShape& filter = outputs[0].shape();

    size_t batchSize = input[0];
    size_t inputChannels = input[1];
    size_t inputHeight = input[2];
    size_t inputWidth = input[3];
    size_t filterHeight = getFilterHeight(filter);
    size_t filterWidth = getFilterWidth(filter);
    size_t outputChannels = output[1];
    size_t outputHeight = output[2];
    size_t outputWidth = output[3];
    size_t filterMultiplier = outputChannels / groups_;
    // Depthwise means exactly one group per input channel.
    CHECK_EQ(inputChannels, groups_);

    real* outputGrad = inputs[0].data<real>();
    real* inputData = inputs[1].data<real>();
    real* filterGrad = outputs[0].data<real>();

    // Scratch space with one element per (output channel, filter tap,
    // output position). Kept as size_t: every factor is size_t, and
    // narrowing the product to int could truncate it for large shapes.
    size_t size = outputChannels * filterHeight * filterWidth * outputHeight *
                  outputWidth;
    resizeBuffer<Device>(size);
    real* colData = reinterpret_cast<real*>(memory_->getBuf());

    DepthwiseConvGradFilterFunctor<Device, real> depthwiseConvGradFilter;
    depthwiseConvGradFilter(outputGrad,
                            inputData,
                            batchSize,
                            outputChannels,
                            outputHeight,
                            outputWidth,
                            inputChannels,
                            inputHeight,
                            inputWidth,
                            filterMultiplier,
                            filterHeight,
                            filterWidth,
                            strideH(),
                            strideW(),
                            paddingH(),
                            paddingW(),
                            colData,
                            filterGrad);
  }
};
// Register the three depthwise convolution functions for the CPU device.
// NOTE(review): the tests refer to these as "<name>-CPU" / "<name>-GPU", so
// the macro presumably builds the lookup name from the first two arguments --
// confirm against the REGISTER_TYPED_FUNC definition.
REGISTER_TYPED_FUNC(DepthwiseConv, CPU, DepthwiseConvFunction);
REGISTER_TYPED_FUNC(DepthwiseConvGradInput,
CPU,
DepthwiseConvGradInputFunction);
REGISTER_TYPED_FUNC(DepthwiseConvGradFilter,
CPU,
DepthwiseConvGradFilterFunction);
// The GPU variants are registered only when the build is not CPU-only.
#ifndef PADDLE_ONLY_CPU
REGISTER_TYPED_FUNC(DepthwiseConv, GPU, DepthwiseConvFunction);
REGISTER_TYPED_FUNC(DepthwiseConvGradInput,
GPU,
DepthwiseConvGradInputFunction);
REGISTER_TYPED_FUNC(DepthwiseConvGradFilter,
GPU,
DepthwiseConvGradFilterFunction);
#endif
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "TensorType.h"
namespace paddle {
/**
 * \brief Depthwise convolution forward. The output of depthwise
 *        convolution is the same as that of ExpandConvLayer when
 *        groups equals inputChannels in ExpandConvLayer.
 *
 * \param[in]  inputData         input data.
 * \param[in]  filterData        the parameters of the depthwise conv layer.
 * \param[in]  batchSize         batch size of input data.
 * \param[in]  outputChannels    channels of outputData.
 * \param[in]  outputHeight      height of outputData.
 * \param[in]  outputWidth       width of outputData.
 * \param[in]  inputChannels     channels of inputData.
 * \param[in]  inputHeight       height of inputData.
 * \param[in]  inputWidth        width of inputData.
 * \param[in]  filterMultiplier  equal to outputChannels / groups_.
 * \param[in]  filterHeight      height of filter.
 * \param[in]  filterWidth       width of filter.
 * \param[in]  strideH           stride size in height direction.
 * \param[in]  strideW           stride size in width direction.
 * \param[in]  paddingH          padding size in height direction.
 * \param[in]  paddingW          padding size in width direction.
 * \param[out] outputData        output data.
 *
 */
template <DeviceType Device, class T>
class DepthwiseConvFunctor {
public:
void operator()(const T* inputData,
const T* filterData,
int batchSize,
int outputChannels,
int outputHeight,
int outputWidth,
int inputChannels,
int inputHeight,
int inputWidth,
int filterMultiplier,
int filterHeight,
int filterWidth,
int strideH,
int strideW,
int paddingH,
int paddingW,
T* outputData);
};
/**
 * \brief Functor to compute the depthwise convolution backprop w.r.t. input.
 *
 * \param[in]  outputGrad        the grad data of output.
 * \param[in]  filterData        the parameters of the depthwise conv layer.
 * \param[in]  batchSize         batch size of input data.
 * \param[in]  outputChannels    channels of outputData.
 * \param[in]  outputHeight      height of outputData.
 * \param[in]  outputWidth       width of outputData.
 * \param[in]  inputChannels     channels of input data.
 * \param[in]  inputHeight       height of inputData.
 * \param[in]  inputWidth        width of inputData.
 * \param[in]  filterMultiplier  equal to outputChannels / groups_.
 * \param[in]  filterHeight      height of filter.
 * \param[in]  filterWidth       width of filter.
 * \param[in]  strideH           stride size in height direction.
 * \param[in]  strideW           stride size in width direction.
 * \param[in]  paddingH          padding size in height direction.
 * \param[in]  paddingW          padding size in width direction.
 * \param[out] inputGrad         the grad data of input.
 *
 */
template <DeviceType Device, class T>
class DepthwiseConvGradInputFunctor {
public:
void operator()(const T* outputGrad,
const T* filterData,
int batchSize,
int outputChannels,
int outputHeight,
int outputWidth,
int inputChannels,
int inputHeight,
int inputWidth,
int filterMultiplier,
int filterHeight,
int filterWidth,
int strideH,
int strideW,
int paddingH,
int paddingW,
T* inputGrad);
};
/**
 * \brief Functor to compute the depthwise convolution backprop w.r.t. filter.
 *
 * \param[in]     outputGrad        the grad data of output.
 * \param[in]     inputData         input data.
 * \param[in]     batchSize         batch size of input data.
 * \param[in]     outputChannels    channels of outputData.
 * \param[in]     outputHeight      height of outputData.
 * \param[in]     outputWidth       width of outputData.
 * \param[in]     inputChannels     channels of input data.
 * \param[in]     inputHeight       height of inputData.
 * \param[in]     inputWidth        width of inputData.
 * \param[in]     filterMultiplier  equal to outputChannels / groups_.
 * \param[in]     filterHeight      height of filter.
 * \param[in]     filterWidth       width of filter.
 * \param[in]     strideH           stride size in height direction.
 * \param[in]     strideW           stride size in width direction.
 * \param[in]     paddingH          padding size in height direction.
 * \param[in]     paddingW          padding size in width direction.
 * \param[in,out] colData           auxiliary (scratch) buffer used while
 *                                  calculating filterGrad.
 * \param[out]    filterGrad        the grad data of filter.
 *
 */
template <DeviceType Device, class T>
class DepthwiseConvGradFilterFunctor {
public:
void operator()(const T* outputGrad,
const T* inputData,
int batchSize,
int outputChannels,
int outputHeight,
int outputWidth,
int inputChannels,
int inputHeight,
int inputWidth,
int filterMultiplier,
int filterHeight,
int filterWidth,
int strideH,
int strideW,
int paddingH,
int paddingW,
T* colData,
T* filterGrad);
};
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "DepthwiseConvOp.h"
#include "GemmFunctor.h"
#include "paddle/math/BaseMatrix.h"
namespace paddle {
// CUDA kernel for the depthwise convolution forward pass.
//
// Each thread with index < nthreads produces one element of outputData. The
// flat index is derived from a 2-D grid of 1-D blocks and is unpacked as
// [batch][c_out][h_out][w_out]. Output channel c_out reads input channel
// c_out / filterMultiplier and its own filterHeight x filterWidth weights.
template <class T>
__global__
void ConvolutionDepthwiseForward(const int nthreads,
    const T* const inputData, const T* const filterData,
    const int batchSize, const int outputChannels, const int outputHeight,
    const int outputWidth, const int inputChannels, const int inputHeight,
    const int inputWidth, const int filterMultiplier, const int filterHeight,
    const int filterWidth, const int strideH, const int strideW,
    const int paddingH, const int paddingW, T* const outputData) {
  const int index =
      (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  if (index >= nthreads) return;

  // Peel the coordinates off the flat index, innermost dimension first.
  int rest = index;
  const int w_out = rest % outputWidth;
  rest /= outputWidth;
  const int h_out = rest % outputHeight;
  rest /= outputHeight;
  const int c_out = rest % outputChannels;
  const int batch = rest / outputChannels;

  const int c_in = c_out / filterMultiplier;
  const T* const weight = filterData + c_out * filterHeight * filterWidth;

  // Corners of the input window covered by this output element.
  const int h_in_start = -paddingH + h_out * strideH;
  const int w_in_start = -paddingW + w_out * strideW;
  const int h_in_end = h_in_start + filterHeight - 1;
  const int w_in_end = w_in_start + filterWidth - 1;

  const bool windowInsideInput = (h_in_start >= 0) &&
                                 (h_in_end < inputHeight) &&
                                 (w_in_start >= 0) && (w_in_end < inputWidth);

  // Row index of the first row of the (batch, c_in) input plane.
  const int planeRow = (batch * inputChannels + c_in) * inputHeight;

  T value = 0;
  if (windowInsideInput) {
    // Whole window is valid: no per-element bounds checks needed.
    for (int kh = 0; kh < filterHeight; ++kh) {
      const int rowOffset = (planeRow + h_in_start + kh) * inputWidth;
      for (int kw = 0; kw < filterWidth; ++kw) {
        value += weight[kh * filterWidth + kw] *
                 inputData[rowOffset + w_in_start + kw];
      }
    }
  } else {
    // Window overlaps the padding: skip taps that fall outside the input.
    for (int kh = 0; kh < filterHeight; ++kh) {
      const int h_in = h_in_start + kh;
      const bool rowValid = (h_in >= 0) && (h_in < inputHeight);
      for (int kw = 0; kw < filterWidth; ++kw) {
        const int w_in = w_in_start + kw;
        if (rowValid && (w_in >= 0) && (w_in < inputWidth)) {
          value += weight[kh * filterWidth + kw] *
                   inputData[(planeRow + h_in) * inputWidth + w_in];
        }
      }
    }
  }
  outputData[index] = value;
}
// CUDA kernel for the depthwise convolution gradient w.r.t. the input.
//
// Each thread with index < nthreads handles one element of bottom_diff,
// unpacked as [batch][c_in][h_in][w_in]. It gathers contributions from the
// filterMultiplier output channels fed by c_in and from every output
// position whose window covers (h_in, w_in), then adds the sum to
// bottom_diff[index].
template <class T>
__global__
void ConvolutionDepthwiseInputBackward(const int nthreads,
    const T* const top_diff, const T* const weight_data,
    const int num, const int outputChannels, const int outputHeight,
    const int outputWidth, const int inputChannels, const int inputHeight,
    const int inputWidth, const int filterMultiplier, const int filterHeight,
    const int filterWidth, const int strideH, const int strideW,
    const int paddingH, const int paddingW, T* const bottom_diff) {
  const int index =
      (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  if (index >= nthreads) return;

  // Peel the coordinates off the flat index, innermost dimension first.
  int rest = index;
  const int w_in = rest % inputWidth;
  rest /= inputWidth;
  const int h_in = rest % inputHeight;
  rest /= inputHeight;
  const int c_in = rest % inputChannels;
  const int batch = rest / inputChannels;

  // Output channels [c_out_begin, c_out_limit) were computed from c_in.
  const int c_out_begin = c_in * filterMultiplier;
  const int c_out_limit = c_out_begin + filterMultiplier;

  // Inclusive range of output rows whose window contains h_in, clamped to
  // the valid output extent.
  int h_out_first = (h_in - filterHeight + paddingH + strideH) / strideH;
  if (h_out_first < 0) h_out_first = 0;
  int h_out_last = (h_in + paddingH) / strideH;
  if (h_out_last > outputHeight - 1) h_out_last = outputHeight - 1;

  // Same for output columns and w_in.
  int w_out_first = (w_in - filterWidth + paddingW + strideW) / strideW;
  if (w_out_first < 0) w_out_first = 0;
  int w_out_last = (w_in + paddingW) / strideW;
  if (w_out_last > outputWidth - 1) w_out_last = outputWidth - 1;

  T value = 0;
  for (int c_out = c_out_begin; c_out < c_out_limit; ++c_out) {
    const int filterBase = c_out * filterHeight * filterWidth;
    const int gradPlaneRow = (batch * outputChannels + c_out) * outputHeight;
    for (int h_out = h_out_first; h_out <= h_out_last; ++h_out) {
      const int filter_h = h_in + paddingH - h_out * strideH;
      for (int w_out = w_out_first; w_out <= w_out_last; ++w_out) {
        const int filter_w = w_in + paddingW - w_out * strideW;
        value += top_diff[(gradPlaneRow + h_out) * outputWidth + w_out] *
                 weight_data[filterBase + filter_h * filterWidth + filter_w];
      }
    }
  }
  bottom_diff[index] += value;
}
// CUDA kernel for the depthwise convolution gradient w.r.t. the filter,
// processing the single sample num_i.
//
// Each thread with index < nthreads writes one element of buffer_data,
// unpacked as [c_out][kh][kw][h_out][w_out]: the product of the output
// gradient at (c_out, h_out, w_out) and the input element that filter tap
// (kh, kw) reads there, or 0 when that tap falls outside the input.
template <class T>
__global__
void ConvolutionDepthwiseFilterBackward(const int num_i, const int nthreads,
    const T* const top_diff, const T* const inputData,
    const int num, const int outputChannels, const int outputHeight,
    const int outputWidth, const int inputChannels, const int inputHeight,
    const int inputWidth, const int filterMultiplier, const int filterHeight,
    const int filterWidth, const int strideH, const int strideW,
    const int paddingH, const int paddingW, T* const buffer_data) {
  const int index =
      (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  if (index >= nthreads) return;

  // Peel the coordinates off the flat index, innermost dimension first.
  int rest = index;
  const int w_out = rest % outputWidth;
  rest /= outputWidth;
  const int h_out = rest % outputHeight;
  rest /= outputHeight;
  const int kw = rest % filterWidth;
  rest /= filterWidth;
  const int kh = rest % filterHeight;
  const int c_out = rest / filterHeight;

  // Input position read by tap (kh, kw) at output position (h_out, w_out).
  const int h_in = -paddingH + h_out * strideH + kh;
  const int w_in = -paddingW + w_out * strideW + kw;
  const bool tapInsideInput = (h_in >= 0) && (h_in < inputHeight) &&
                              (w_in >= 0) && (w_in < inputWidth);
  if (!tapInsideInput) {
    buffer_data[index] = 0;
    return;
  }

  const int c_in = c_out / filterMultiplier;
  const int gradOffset =
      ((num_i * outputChannels + c_out) * outputHeight + h_out) * outputWidth +
      w_out;
  const int inputOffset =
      ((num_i * inputChannels + c_in) * inputHeight + h_in) * inputWidth +
      w_in;
  buffer_data[index] = top_diff[gradOffset] * inputData[inputOffset];
}
// GPU forward functor: launches ConvolutionDepthwiseForward with one thread
// per output element on STREAM_DEFAULT.
template <class T>
class DepthwiseConvFunctor<DEVICE_TYPE_GPU, T>{
public:
  void operator()(const T* inputData,
                  const T* filterData,
                  int batchSize,
                  int outputChannels,
                  int outputHeight,
                  int outputWidth,
                  int inputChannels,
                  int inputHeight,
                  int inputWidth,
                  int filterMultiplier,
                  int filterHeight,
                  int filterWidth,
                  int strideH,
                  int strideW,
                  int paddingH,
                  int paddingW,
                  T* outputData){
    const int kThreadsPerBlock = 1024;
    const size_t kGridX = 512;

    const int numOutputs =
        batchSize * outputChannels * outputHeight * outputWidth;

    // Blocks needed to cover every output element, laid out as a
    // kGridX x gridY grid so the x dimension stays fixed.
    const size_t numBlocks =
        (numOutputs + kThreadsPerBlock - 1) / kThreadsPerBlock;
    const size_t gridY = (numBlocks + kGridX - 1) / kGridX;

    dim3 threads(kThreadsPerBlock, 1);
    dim3 grid(kGridX, gridY);

    ConvolutionDepthwiseForward<T>
        <<< grid, threads, 0, STREAM_DEFAULT >>>(
            numOutputs,
            inputData,
            filterData,
            batchSize,
            outputChannels,
            outputHeight,
            outputWidth,
            inputChannels,
            inputHeight,
            inputWidth,
            filterMultiplier,
            filterHeight,
            filterWidth,
            strideH,
            strideW,
            paddingH,
            paddingW,
            outputData);
  }
};
// GPU input-gradient functor: launches ConvolutionDepthwiseInputBackward
// with one thread per input-gradient element on STREAM_DEFAULT.
template <class T>
class DepthwiseConvGradInputFunctor<DEVICE_TYPE_GPU, T>{
public:
  void operator()(const T* outputGrad,
                  const T* filterData,
                  int batchSize,
                  int outputChannels,
                  int outputHeight,
                  int outputWidth,
                  int inputChannels,
                  int inputHeight,
                  int inputWidth,
                  int filterMultiplier,
                  int filterHeight,
                  int filterWidth,
                  int strideH,
                  int strideW,
                  int paddingH,
                  int paddingW,
                  T* inputGrad){
    const int kThreadsPerBlock = 1024;
    const size_t kGridX = 512;

    const int numInputs = batchSize * inputChannels * inputHeight * inputWidth;

    // Blocks needed to cover every input element, laid out as a
    // kGridX x gridY grid so the x dimension stays fixed.
    const size_t numBlocks =
        (numInputs + kThreadsPerBlock - 1) / kThreadsPerBlock;
    const size_t gridY = (numBlocks + kGridX - 1) / kGridX;

    dim3 threads(kThreadsPerBlock, 1);
    dim3 grid(kGridX, gridY);

    ConvolutionDepthwiseInputBackward<T>
        <<< grid, threads, 0, STREAM_DEFAULT >>>(
            numInputs,
            outputGrad,
            filterData,
            batchSize,
            outputChannels,
            outputHeight,
            outputWidth,
            inputChannels,
            inputHeight,
            inputWidth,
            filterMultiplier,
            filterHeight,
            filterWidth,
            strideH,
            strideW,
            paddingH,
            paddingW,
            inputGrad);
  }
};
// GPU filter-gradient functor. For every sample it launches
// ConvolutionDepthwiseFilterBackward to fill colData with per-position
// products, then folds colData into filterGrad through BaseMatrix::sumRows.
// NOTE(review): sumRows(colMatrix, 1, 1) presumably adds each row's sum to
// the existing filterGrad entry -- confirm against BaseMatrix.
template <class T>
class DepthwiseConvGradFilterFunctor<DEVICE_TYPE_GPU, T> {
public:
  void operator()(const T* outputGrad,
                  const T* inputData,
                  int batchSize,
                  int outputChannels,
                  int outputHeight,
                  int outputWidth,
                  int inputChannels,
                  int inputHeight,
                  int inputWidth,
                  int filterMultiplier,
                  int filterHeight,
                  int filterWidth,
                  int strideH,
                  int strideW,
                  int paddingH,
                  int paddingW,
                  T* colData,
                  T* filterGrad){
    const int kThreadsPerBlock = 1024;
    const size_t kGridX = 512;

    // One scratch element per (output channel, filter tap, output position).
    const int colDataSize = outputChannels * filterHeight * filterWidth
        * outputHeight * outputWidth;

    // Blocks needed to cover the scratch buffer, laid out as a
    // kGridX x gridY grid so the x dimension stays fixed.
    const size_t numBlocks =
        (colDataSize + kThreadsPerBlock - 1) / kThreadsPerBlock;
    const size_t gridY = (numBlocks + kGridX - 1) / kGridX;

    dim3 threads(kThreadsPerBlock, 1);
    dim3 grid(kGridX, gridY);

    // Column-vector view over filterGrad, one row per filter weight.
    BaseMatrix filterGradMatrix(outputChannels * filterHeight * filterWidth,
                                1, filterGrad, false, true);

    for (int sample = 0; sample < batchSize; ++sample) {
      ConvolutionDepthwiseFilterBackward<T>
          <<< grid, threads, 0, STREAM_DEFAULT >>>(
              sample,
              colDataSize,
              outputGrad,
              inputData,
              batchSize,
              outputChannels,
              outputHeight,
              outputWidth,
              inputChannels,
              inputHeight,
              inputWidth,
              filterMultiplier,
              filterHeight,
              filterWidth,
              strideH,
              strideW,
              paddingH,
              paddingW,
              colData);

      // View colData as M rows (one per filter weight) of K output positions.
      const int K = outputHeight * outputWidth;
      const int M = colDataSize / K;
      BaseMatrix colMatrix(M, K, colData, false, true);
      filterGradMatrix.sumRows(colMatrix, (T)1.0, (T)1.0);
    }
  }
};
// Explicitly instantiate the three GPU functors for the element type chosen
// at build time: double when PADDLE_TYPE_DOUBLE is defined, float otherwise.
#ifdef PADDLE_TYPE_DOUBLE
template class DepthwiseConvGradInputFunctor<DEVICE_TYPE_GPU, double>;
template class DepthwiseConvFunctor<DEVICE_TYPE_GPU, double>;
template class DepthwiseConvGradFilterFunctor<DEVICE_TYPE_GPU, double>;
#else
template class DepthwiseConvGradInputFunctor<DEVICE_TYPE_GPU, float>;
template class DepthwiseConvFunctor<DEVICE_TYPE_GPU, float>;
template class DepthwiseConvGradFilterFunctor<DEVICE_TYPE_GPU, float>;
#endif
} // namespace paddle
......@@ -87,9 +87,6 @@ void ConvBaseProjection::initCudnn() {
bwdDataLimitBytes_ = 0;
bwdFilterLimitBytes_ = 0;
workSpaceInBytes_ = 0;
batchNum_ = 0;
isSelectAlgo_ = false;
}
void ConvBaseProjection::reshapeTensorDesc(int batchSize) {
......@@ -142,32 +139,25 @@ void ConvBaseProjection::reshape(int batchSize) {
CHECK_EQ(width, out_->value->getWidth());
CHECK_EQ(calInputSize(), in_->value->getWidth());
isSelectAlgo_ = (batchSize == batchNum_);
batchNum_ = batchSize;
if (!isSelectAlgo_) {
reshapeTensorDesc(batchSize);
hl_conv_workspace(imageDesc_,
outputDesc_,
filterDesc_,
convDesc_,
&fwdAlgo_,
&fwdLimitBytes_,
&bwdDataAlgo_,
&bwdDataLimitBytes_,
&bwdFilterAlgo_,
&bwdFilterLimitBytes_);
size_t maxWorkSpace = 0;
maxWorkSpace = std::max(fwdLimitBytes_, bwdDataLimitBytes_);
maxWorkSpace = std::max(maxWorkSpace, bwdFilterLimitBytes_);
workSpaceInBytes_ = maxWorkSpace;
VLOG(3) << getName() << " Fwd / BwdData / BwdFilter algo: " << fwdAlgo_
<< " / " << bwdDataAlgo_ << " / " << bwdFilterAlgo_;
}
isSelectAlgo_ = true;
reshapeTensorDesc(batchSize);
hl_conv_workspace(imageDesc_,
outputDesc_,
filterDesc_,
convDesc_,
&fwdAlgo_,
&fwdLimitBytes_,
&bwdDataAlgo_,
&bwdDataLimitBytes_,
&bwdFilterAlgo_,
&bwdFilterLimitBytes_);
size_t maxWorkSpace = 0;
maxWorkSpace = std::max(fwdLimitBytes_, bwdDataLimitBytes_);
maxWorkSpace = std::max(maxWorkSpace, bwdFilterLimitBytes_);
workSpaceInBytes_ = maxWorkSpace;
VLOG(3) << getName() << " Fwd / BwdData / BwdFilter algo: " << fwdAlgo_
<< " / " << bwdDataAlgo_ << " / " << bwdFilterAlgo_;
}
void *ConvBaseProjection::getSpaceBytes(size_t size) {
......
......@@ -101,12 +101,6 @@ protected:
size_t bwdFilterLimitBytes_;
/// Size of total work space.
size_t workSpaceInBytes_;
/// Whether to call cuDNN api to choose conv algorithm.
bool isSelectAlgo_;
/// batchNum is used to record batch size. If the batch size is changed,
/// the selection algorithm will be called.
int batchNum_;
bool bias_;
std::unique_ptr<Weight> weight_;
......
......@@ -38,10 +38,25 @@ bool ExpandConvLayer::init(const LayerMap &layerMap,
inputShape_.resize(numInputs);
filterShape_.resize(numInputs);
outputShape_.resize(numInputs);
std::string convType;
std::string convGradInputType;
std::string convGradFilterType;
for (int i = 0; i < config_.inputs_size(); i++) {
std::vector<size_t> paddings = {(size_t)paddingY_[i], (size_t)padding_[i]};
std::vector<size_t> strides = {(size_t)strideY_[i], (size_t)stride_[i]};
if (useGpu_ && (size_t)groups_[i] == (size_t)channels_[i] && !isDeconv_) {
convType = "DepthwiseConv";
convGradInputType = "DepthwiseConvGradInput";
convGradFilterType = "DepthwiseConvGradFilter";
} else {
convType = "GemmConv";
convGradInputType = "GemmConvGradInput";
convGradFilterType = "GemmConvGradFilter";
}
if (FLAGS_use_nnpack) {
CHECK_EQ(isDeconv_, false);
createFunction(forward_,
......@@ -53,21 +68,21 @@ bool ExpandConvLayer::init(const LayerMap &layerMap,
.set("algo", std::string("auto")));
} else {
createFunction(forward_,
!isDeconv_ ? "GemmConv" : "GemmConvGradInput",
!isDeconv_ ? convType : convGradInputType,
FuncConfig()
.set("paddings", paddings)
.set("strides", strides)
.set("groups", (size_t)groups_[i]));
createFunction(backward_,
!isDeconv_ ? "GemmConvGradInput" : "GemmConv",
!isDeconv_ ? convGradInputType : convType,
FuncConfig()
.set("paddings", paddings)
.set("strides", strides)
.set("groups", (size_t)groups_[i]));
createFunction(backward_,
"GemmConvGradFilter",
convGradFilterType,
FuncConfig()
.set("paddings", paddings)
.set("strides", strides)
......
......@@ -347,6 +347,55 @@ TEST(Layer, CosSimVecMatLayer) {
}
}
void testDepthwiseConvLayer(const string& type, bool useGpu) {
TestConfig config;
config.biasSize = 32;
config.layerConfig.set_type(type);
config.layerConfig.set_num_filters(32);
config.layerConfig.set_partial_sum(1);
config.layerConfig.set_shared_biases(true);
config.inputDefs.push_back({INPUT_DATA, "layer_0", 2048, 192});
LayerInputConfig* input = config.layerConfig.add_inputs();
ConvConfig* conv = input->mutable_conv_conf();
conv->set_filter_size(2);
conv->set_filter_size_y(3);
conv->set_channels(16);
conv->set_padding(0);
conv->set_padding_y(1);
conv->set_stride(2);
conv->set_stride_y(2);
conv->set_groups(16);
conv->set_filter_channels(conv->channels() / conv->groups());
conv->set_img_size(16);
conv->set_img_size_y(8);
conv->set_output_x(outputSize(conv->img_size(),
conv->filter_size(),
conv->padding(),
conv->stride(),
/* caffeMode */ true));
conv->set_output_y(outputSize(conv->img_size_y(),
conv->filter_size_y(),
conv->padding_y(),
conv->stride_y(),
/* caffeMode */ true));
config.layerConfig.set_size(conv->output_x() * conv->output_y() *
config.layerConfig.num_filters());
testLayerGrad(config, "depthwise_conv", 100, false, useGpu);
// Use small batch_size and useWeight=true to test biasGrad
testLayerGrad(config, "depthwise_conv", 2, false, useGpu, true, 0.02);
}
TEST(Layer, depthwiseConvLayer) {
  // 'depthwise_conv' is a special case of 'exconv' in which the number of
  // groups equals the number of input channels, so the layer type under
  // test is still "exconv".
  const string layerType = "exconv";

  testDepthwiseConvLayer(layerType, /* useGpu= */ false);
#ifndef PADDLE_ONLY_CPU
  testDepthwiseConvLayer(layerType, /* useGpu= */ true);
#endif
}
void testConvLayer(const string& type, bool trans, bool useGpu) {
TestConfig config;
config.biasSize = 16;
......
......@@ -202,7 +202,7 @@ double dotProduct<double>(const int n, const double* x, const double* y) {
return cblas_ddot(n, x, 1, y, 1);
}
#ifdef PADDLE_USE_MKL
#if defined(PADDLE_USE_MKL) || defined(PADDLE_USE_MKLML)
template <>
void vExp<float>(const int n, const float* a, float* r) {
......@@ -243,7 +243,55 @@ template <>
void vAdd<double>(const int n, const double* a, const double* b, double* r) {
vdAdd(n, a, b, r);
}
#else
DEFINE_MATRIX_BINARY_OP(vExp, b = std::exp(a));
template <class T>
void vExp(const int n, const T* a, T* r) {
hl_cpu_apply_binary_op<T, binary::vExp<T>, 0, 0>(
binary::vExp<T>(), const_cast<T*>(a), r, 1, n, n, n);
}
DEFINE_MATRIX_BINARY_OP(vLog, b = std::log(a));
template <class T>
void vLog(const int n, const T* a, T* r) {
hl_cpu_apply_binary_op<T, binary::vLog<T>, 0, 0>(
binary::vLog<T>(), const_cast<T*>(a), r, 1, n, n, n);
}
DEFINE_MATRIX_BINARY_PARAMETER_OP(vPow, ONE_PARAMETER, b = std::pow(a, p));
template <class T>
void vPow(const int n, const T* a, const T b, T* r) {
hl_cpu_apply_binary_op<T, binary::vPow<T>, 0, 0>(
binary::vPow<T>(b), const_cast<T*>(a), r, 1, n, n, n);
}
// Generic (non-MKL) implementation of vAdd.
// The macro declares the functor ternary::vAdd<T> from the rule c = a + b.
DEFINE_MATRIX_TERNARY_OP(vAdd, c = a + b);
// Applies ternary::vAdd to the buffers `a` and `b` and writes the result to
// `r`.
//   n: element count.  a, b: input buffers.  r: output buffer.
// The trailing arguments (1, n, n, n, n) presumably describe a 1 x n matrix
// followed by the leading dimensions of a, b and r -- TODO confirm against
// hl_cpu_apply_ternary_op.
template <class T>
void vAdd(const int n, const T* a, const T* b, T* r) {
hl_cpu_apply_ternary_op<T, ternary::vAdd<T>, 0, 0>(ternary::vAdd<T>(),
const_cast<T*>(a),
const_cast<T*>(b),
r,
1,
n,
n,
n,
n);
}
// Explicit instantiations of the generic vExp / vLog / vPow / vAdd templates
// for float and double, so that this translation unit emits their definitions.
template void vExp(const int n, const float* a, float* r);
template void vExp(const int n, const double* a, double* r);
template void vLog(const int n, const float* a, float* r);
template void vLog(const int n, const double* a, double* r);
template void vPow(const int n, const float* a, const float b, float* r);
template void vPow(const int n, const double* a, const double b, double* r);
template void vAdd(const int n, const float* a, const float* b, float* r);
template void vAdd(const int n, const double* a, const double* b, double* r);
#endif
#ifdef PADDLE_USE_MKL
template <>
void vInvSqrt<float>(const int n, const float* a, float* r) {
vsInvSqrt(n, a, r);
......@@ -275,20 +323,6 @@ void vTanh<double>(const int n, const double* a, double* r) {
}
#else
DEFINE_MATRIX_BINARY_OP(vExp, b = std::exp(a));
template <class T>
void vExp(const int n, const T* a, T* r) {
hl_cpu_apply_binary_op<T, binary::vExp<T>, 0, 0>(
binary::vExp<T>(), const_cast<T*>(a), r, 1, n, n, n);
}
DEFINE_MATRIX_BINARY_OP(vLog, b = std::log(a));
template <class T>
void vLog(const int n, const T* a, T* r) {
hl_cpu_apply_binary_op<T, binary::vLog<T>, 0, 0>(
binary::vLog<T>(), const_cast<T*>(a), r, 1, n, n, n);
}
DEFINE_MATRIX_BINARY_OP(vInvSqrt, b = 1.0f / std::sqrt(a));
template <class T>
void vInvSqrt(const int n, const T* a, T* r) {
......@@ -312,41 +346,12 @@ void vTanh(const int n, const T* a, T* r) {
binary::vTanh<T>(), const_cast<T*>(a), r, 1, n, n, n);
}
DEFINE_MATRIX_BINARY_PARAMETER_OP(vPow, ONE_PARAMETER, b = std::pow(a, p));
template <class T>
void vPow(const int n, const T* a, const T b, T* r) {
hl_cpu_apply_binary_op<T, binary::vPow<T>, 0, 0>(
binary::vPow<T>(b), const_cast<T*>(a), r, 1, n, n, n);
}
DEFINE_MATRIX_TERNARY_OP(vAdd, c = a + b);
template <class T>
void vAdd(const int n, const T* a, const T* b, T* r) {
hl_cpu_apply_ternary_op<T, ternary::vAdd<T>, 0, 0>(ternary::vAdd<T>(),
const_cast<T*>(a),
const_cast<T*>(b),
r,
1,
n,
n,
n,
n);
}
template void vExp(const int n, const float* a, float* r);
template void vExp(const int n, const double* a, double* r);
template void vLog(const int n, const float* a, float* r);
template void vLog(const int n, const double* a, double* r);
template void vInvSqrt(const int n, const double* a, double* r);
template void vInvSqrt(const int n, const float* a, float* r);
template void vLog1p(const int n, const float* a, float* r);
template void vLog1p(const int n, const double* a, double* r);
template void vTanh(const int n, const float* a, float* r);
template void vTanh(const int n, const double* a, double* r);
template void vPow(const int n, const float* a, const float b, float* r);
template void vPow(const int n, const double* a, const double b, double* r);
template void vAdd(const int n, const float* a, const float* b, float* r);
template void vAdd(const int n, const double* a, const double* b, double* r);
#endif
......
......@@ -15,6 +15,12 @@ limitations under the License. */
#ifndef MATHFUNCTIONS_H_
#define MATHFUNCTIONS_H_
#ifdef PADDLE_USE_MKLML
#include <mkl_cblas.h>
#include <mkl_lapacke.h>
#include <mkl_vml_functions.h>
#endif
#ifdef PADDLE_USE_MKL
#include <mkl.h>
#include <mkl_lapacke.h>
......
add_subdirectory(detail)
cc_library(memory SRCS memory.cc)
cc_library(memcpy SRCS memcpy.cc DEPS device_context)
cc_library(paddle_memory
DEPS
memory meta_data
meta_cache memory_block
buddy_allocator system_allocator)
memory
memcpy
meta_data
meta_cache
memory_block
buddy_allocator
system_allocator)
cc_test(memory_test SRCS memory_test.cc DEPS place paddle_memory)
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/memory/memcpy.h"
#include <cstring> // for memcpy
#include "paddle/platform/device_context.h"
namespace paddle {
namespace memory {
// Host-to-host copy: copies `num` bytes from `src` to `dst` with std::memcpy.
// Both place arguments are unnamed and unused; they only select this
// specialization. As with std::memcpy, the two ranges must not overlap.
template <>
void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
platform::CPUPlace,
const void* src, size_t num) {
std::memcpy(dst, src, num);
}
#ifndef PADDLE_ONLY_CPU
// Device-to-host copy: enqueues a cudaMemcpyDeviceToHost transfer of `num`
// bytes from `src` to `dst` on `stream` via platform::GpuMemcpyAsync.
// `dst_place` is not used in the body.
// The GPUPlaceGuard presumably makes src_place.device the current device for
// the duration of this call -- TODO confirm against GPUPlaceGuard.
// NOTE(review): the copy is only enqueued here; callers presumably have to
// synchronize `stream` before reading `dst` -- confirm.
template <>
void Copy<platform::CPUPlace, platform::GPUPlace>(platform::CPUPlace dst_place,
void* dst,
platform::GPUPlace src_place,
const void* src, size_t num,
cudaStream_t stream) {
platform::GPUPlaceGuard g(src_place.device);
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
}
// Host-to-device copy: enqueues a cudaMemcpyHostToDevice transfer of `num`
// bytes from `src` to `dst` on `stream` via platform::GpuMemcpyAsync.
// `src_place` is not used in the body.
// The GPUPlaceGuard presumably makes dst_place.device the current device for
// the duration of this call -- TODO confirm against GPUPlaceGuard.
template <>
void Copy<platform::GPUPlace, platform::CPUPlace>(platform::GPUPlace dst_place,
void* dst,
platform::CPUPlace src_place,
const void* src, size_t num,
cudaStream_t stream) {
platform::GPUPlaceGuard g(dst_place.device);
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
}
// Device-to-device copy of `num` bytes from `src` to `dst` on `stream`.
// When the two places differ, the transfer goes through
// platform::GpuMemcpyPeer; when they compare equal, it is a
// cudaMemcpyDeviceToDevice transfer issued while a GPUPlaceGuard for that
// device is alive.
template <>
void Copy<platform::GPUPlace, platform::GPUPlace>(platform::GPUPlace dst_place,
                                                  void* dst,
                                                  platform::GPUPlace src_place,
                                                  const void* src, size_t num,
                                                  cudaStream_t stream) {
  // Cross-device case first: hand both device ids to the peer copy.
  if (!(dst_place == src_place)) {
    platform::GpuMemcpyPeer(dst, dst_place.device, src, src_place.device, num,
                            stream);
    return;
  }

  // Same device: keep the guard alive until the copy has been enqueued.
  platform::GPUPlaceGuard device_guard(src_place.device);
  platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, stream);
}
#endif // PADDLE_ONLY_CPU
} // namespace memory
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/platform/gpu_info.h"
#include "paddle/platform/place.h"
namespace paddle {
namespace memory {
// Copies `num` bytes from `src` to `dst`.
//   DstPlace / SrcPlace: place types of the destination and source buffers;
//   the two place values are passed by value as the first and third
//   arguments.
// Only the primary template is declared here; definitions are expected to be
// supplied as explicit specializations in the implementation file.
template <typename DstPlace, typename SrcPlace>
void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num);
#ifndef PADDLE_ONLY_CPU
// Stream-taking overload of Copy, only declared when the build is not
// CPU-only: copies `num` bytes from `src` to `dst`, issuing the work on
// `stream`.
// Only the primary template is declared here; definitions are expected to be
// supplied as explicit specializations in the implementation file.
template <typename DstPlace, typename SrcPlace>
void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num,
cudaStream_t stream);
#endif // PADDLE_ONLY_CPU
} // namespace memory
} // namespace paddle
......@@ -15,7 +15,8 @@ limitations under the License. */
#include "paddle/memory/memory.h"
#include "paddle/memory/detail/buddy_allocator.h"
#include "paddle/memory/detail/system_allocator.h"
#include "paddle/platform/assert.h"
#include <cstring> // for memcpy
namespace paddle {
namespace memory {
......
......@@ -14,19 +14,32 @@ limitations under the License. */
#pragma once
#include "paddle/platform/gpu_info.h"
#include "paddle/platform/place.h"
namespace paddle {
namespace memory {
template <class Place>
template <typename Place>
void* Alloc(Place, size_t);
template <class Place>
template <typename Place>
void Free(Place, void*);
template <class Place>
template <typename Place>
size_t Used(Place);
// Deleter functor that releases a T* by calling Free(place, ptr) for the
// place it was constructed with. The defaulted enable_if template parameter
// restricts instantiation to POD element types.
// Presumably meant to be used as the deleter of a smart pointer -- TODO
// confirm against callers.
// NOTE(review): std::enable_if / std::is_pod come from <type_traits>, which
// is not among the includes visible in this header -- confirm it arrives
// transitively.
template <typename T, /* must be POD types */
typename Place /* platform::GPUPlace or platform::CPUPlace */,
typename std::enable_if<std::is_pod<T>::value>::type* = nullptr>
class PODDeleter {
public:
// Stores the place whose Free() will be used for every pointer handed to
// operator(). NOTE(review): the constructor is not `explicit`, so a Place
// converts implicitly to a PODDeleter -- confirm this is intended.
PODDeleter(Place place) : place_(place) {}
// Releases `ptr` through Free(place_, ptr); no destructor is invoked on *ptr.
void operator()(T* ptr) { Free(place_, static_cast<void*>(ptr)); }
private:
// Place passed to Free() on every call.
Place place_;
};
} // namespace memory
} // namespace paddle
......@@ -31,7 +31,7 @@ protected:
"Inputs/Outputs of AddOp must all be set");
PADDLE_ENFORCE(inputs[0]->dims() == inputs[1]->dims(),
"Two input of Add Op's dimension must be same.");
outputs[0]->set_dims(inputs[0]->dims());
outputs[0]->Resize(inputs[0]->dims());
}
};
......@@ -49,10 +49,22 @@ The equation is: Out = X + Y
)DOC");
}
};
// Gradient operator class for the add operator.
// Both overrides are stubs in this revision: InferShape does nothing and
// DebugString returns an empty string.
class AddOpGrad : public framework::OperatorWithKernel {
protected:
// Intentionally empty: no output shapes are set.
void InferShape(
const std::vector<const framework::Tensor *> &inputs,
const std::vector<framework::Tensor *> &outputs) const override {}
// Logs the operator name at INFO level as a side effect and returns "".
// NOTE(review): callers that print the result will see an empty string --
// confirm this is intended.
std::string DebugString() const override {
LOG(INFO) << "AddOpGrad";
return "";
}
};
} // namespace operators
} // namespace paddle
REGISTER_OP(add_two, paddle::operators::AddOp, paddle::operators::AddOpMaker);
typedef paddle::operators::AddKernel<::paddle::platform::CPUPlace, float>
AddKernel_CPU_float;
REGISTER_OP_CPU_KERNEL(add_two, AddKernel_CPU_float);
REGISTER_GRADIENT_OP(add_two, add_two_grad, paddle::operators::AddOpGrad);
REGISTER_OP_CPU_KERNEL(
add_two, paddle::operators::AddKernel<paddle::platform::CPUPlace, float>);
#include "paddle/operators/add_op.h"
#include "paddle/framework/op_registry.h"
typedef paddle::operators::AddKernel<::paddle::platform::GPUPlace, float> AddKernel_GPU_float;
REGISTER_OP_GPU_KERNEL(add_two,
AddKernel_GPU_float);
\ No newline at end of file
paddle::operators::AddKernel<paddle::platform::GPUPlace, float>);
\ No newline at end of file
......@@ -16,8 +16,13 @@ limitations under the License. */
#define private public
#include <paddle/framework/op_registry.h>
USE_OP(add_two);
// USE_OP(add_two_grad);
TEST(AddOp, GetOpProto) {
auto& protos = paddle::framework::OpRegistry::protos();
auto it = protos.find("add_two");
ASSERT_NE(it, protos.end());
}
\ No newline at end of file
auto& op_creators = paddle::framework::OpRegistry::op_creators();
auto it1 = op_creators.find("add_two_grad");
ASSERT_NE(it1, op_creators.end());
}
......@@ -35,7 +35,7 @@ protected:
PADDLE_ENFORCE(inputs[0]->dims().size() == 2, "X's dimension must be 2.");
PADDLE_ENFORCE(outputs[0]->dims().size() == 1,
"label's dimension must be 1.");
outputs[0]->set_dims(framework::make_ddim({inputs[0]->dims()[0]}));
outputs[0]->Resize(framework::make_ddim({inputs[0]->dims()[0]}));
}
};
......
......@@ -12,9 +12,9 @@
See the License for the specific language governing permissions and
limitations under the License. */
#include <paddle/framework/op_registry.h>
#include <paddle/framework/tensor.h>
#include <paddle/operators/mul_op.h>
#include "paddle/operators/mul_op.h"
#include "paddle/framework/op_registry.h"
#include "paddle/framework/tensor.h"
namespace paddle {
namespace operators {
......@@ -33,7 +33,7 @@ protected:
dim0[1] == dim1[0],
"First matrix's width must be equal with second matrix's height.");
PADDLE_ENFORCE(outputs.size() == 1, "The mul op must take one output");
outputs[0]->set_dims({dim0[0], dim1[1]});
outputs[0]->Resize({dim0[0], dim1[1]});
}
};
......@@ -52,9 +52,22 @@ The equation is: Out = X * Y
}
};
// Gradient operator class for the mul operator.
// Both overrides are stubs in this revision: InferShape does nothing and
// DebugString returns an empty string.
class MulOpGrad : public framework::OperatorWithKernel {
protected:
// Intentionally empty: no output shapes are set.
void InferShape(
const std::vector<const framework::Tensor *> &inputs,
const std::vector<framework::Tensor *> &outputs) const override {}
// Logs "MulGrad" at INFO level as a side effect and returns "".
// NOTE(review): callers that print the result will see an empty string --
// confirm this is intended.
std::string DebugString() const override {
LOG(INFO) << "MulGrad";
return "";
}
};
} // namespace operators
} // namespace paddle
REGISTER_OP(mul, paddle::operators::MulOp, paddle::operators::MulOpMaker);
REGISTER_GRADIENT_OP(mul, mul_grad, paddle::operators::MulOpGrad);
REGISTER_OP_CPU_KERNEL(
mul, paddle::operators::MulKernel<paddle::platform::CPUPlace>);
mul, paddle::operators::MulKernel<paddle::platform::CPUPlace, float>);
......@@ -12,9 +12,9 @@
See the License for the specific language governing permissions and
limitations under the License. */
#include <paddle/operators/mul_op.h>
#include <paddle/framework/op_registry.h>
#include "paddle/operators/mul_op.h"
#include "paddle/framework/op_registry.h"
REGISTER_OP_GPU_KERNEL(mul,
paddle::operators::MulKernel<paddle::platform
::GPUPlace>);
\ No newline at end of file
::GPUPlace, float>);
\ No newline at end of file
......@@ -14,17 +14,30 @@
#pragma once
#include <glog/logging.h>
#include <paddle/framework/operator.h>
#include "glog/logging.h"
#include "paddle/framework/eigen.h"
#include "paddle/framework/operator.h"
namespace paddle {
namespace operators {
template <typename Place>
template <typename Place, typename T>
class MulKernel : public framework::OpKernel {
public:
void Compute(const framework::KernelContext &context) const override {
LOG(INFO) << "Mul kernel in " << typeid(Place).name();
void Compute(const framework::KernelContext& context) const override {
Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair = {
{Eigen::IndexPair<Eigen::DenseIndex>(1, 0)}};
auto input0 = context.Input(0)->Get<framework::Tensor>();
auto input1 = context.Input(1)->Get<framework::Tensor>();
auto* output = context.Output(0)->GetMutable<framework::Tensor>();
output->mutable_data<T>(context.GetPlace());
framework::EigenMatrix<T>::From(*output).device(
*(context.GetEigenDevice<Place>())) =
framework::EigenMatrix<T>::From(input0).contract(
framework::EigenMatrix<T>::From(input1), dim_pair);
}
};
} // namespace operators
......
......@@ -12,8 +12,8 @@
See the License for the specific language governing permissions and
limitations under the License. */
#include <paddle/framework/op_registry.h>
#include <paddle/operators/rowwise_add_op.h>
#include "paddle/operators/rowwise_add_op.h"
#include "paddle/framework/op_registry.h"
namespace paddle {
namespace operators {
......@@ -30,7 +30,7 @@ protected:
PADDLE_ENFORCE(dim1.size() == 1, "The second input must be vector");
PADDLE_ENFORCE(dim0[1] == dim1[0], "The width of two input must be same");
PADDLE_ENFORCE(outputs.size() == 1, "The output size must be 1");
outputs[0]->set_dims(inputs[0]->dims());
outputs[0]->Resize(inputs[0]->dims());
}
};
......@@ -58,4 +58,4 @@ REGISTER_OP(rowwise_add,
paddle::operators::RowWiseAddOpMaker);
REGISTER_OP_CPU_KERNEL(
rowwise_add,
paddle::operators::RowWiseAddKernel<paddle::platform::CPUPlace>);
paddle::operators::RowWiseAddKernel<paddle::platform::CPUPlace, float>);
#include <paddle/framework/op_registry.h>
#include <paddle/operators/rowwise_add_op.h>
#include "paddle/framework/op_registry.h"
#include "paddle/operators/rowwise_add_op.h"
REGISTER_OP_GPU_KERNEL(
rowwise_add,
paddle::operators::RowWiseAddKernel<paddle::platform ::GPUPlace>);
paddle::operators::RowWiseAddKernel<paddle::platform ::GPUPlace, float>);
......@@ -13,17 +13,32 @@
limitations under the License. */
#pragma once
#include <glog/logging.h>
#include <paddle/framework/operator.h>
#include "glog/logging.h"
#include "paddle/framework/eigen.h"
#include "paddle/framework/operator.h"
namespace paddle {
namespace operators {
template <typename Place>
template <typename Place, typename T>
class RowWiseAddKernel : public framework::OpKernel {
public:
void Compute(const framework::KernelContext &context) const override {
LOG(INFO) << "RowWiseAdd kernel in " << typeid(Place).name();
void Compute(const framework::KernelContext& context) const override {
auto in0 = context.Input(0)->Get<framework::Tensor>();
auto in1 = context.Input(1)->Get<framework::Tensor>();
auto* out = context.Output(0)->GetMutable<framework::Tensor>();
out->mutable_data<T>(context.GetPlace());
auto input = framework::EigenMatrix<T>::From(in0);
auto bias = framework::EigenVector<T>::From(in1);
auto output = framework::EigenMatrix<T>::From(*out);
const int bias_size = bias.dimension(0);
const int rest_size = input.size() / bias_size;
Eigen::DSizes<int, 1> one_d(input.size());
Eigen::DSizes<int, 1> bcast(rest_size);
output.reshape(one_d).device(*(context.GetEigenDevice<Place>())) =
input.reshape(one_d) + bias.broadcast(bcast).reshape(one_d);
}
};
......
......@@ -31,7 +31,7 @@ protected:
PADDLE_ENFORCE(outputs[0] != nullptr, "outputs[0] mast be set");
PADDLE_ENFORCE(inputs[0]->dims() == inputs[1]->dims(),
"Two input of SGD Op's dimension must be same.");
outputs[0]->set_dims(inputs[0]->dims());
outputs[0]->Resize(inputs[0]->dims());
}
};
......
......@@ -12,8 +12,8 @@
See the License for the specific language governing permissions and
limitations under the License. */
#include <paddle/framework/op_registry.h>
#include <paddle/operators/sigmoid_op.h>
#include "paddle/operators/sigmoid_op.h"
#include "paddle/framework/op_registry.h"
namespace paddle {
namespace operators {
......@@ -24,7 +24,7 @@ protected:
const std::vector<framework::Tensor *> &outputs) const override {
PADDLE_ENFORCE(inputs.size() == 1, "Sigmoid Op only have one input");
PADDLE_ENFORCE(outputs.size() == 1, "Sigmoid Op only have one output");
outputs[0]->set_dims(inputs[0]->dims());
outputs[0]->Resize(inputs[0]->dims());
}
};
......@@ -34,16 +34,30 @@ public:
framework::OpAttrChecker *op_checker)
: framework::OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "sigmoid input");
AddInput("Y", "sigmoid output");
AddOutput("Y", "sigmoid output");
AddComment("Sigmoid function");
}
};
// Gradient operator class for the sigmoid operator.
// Both overrides are stubs in this revision: InferShape does nothing and
// DebugString returns an empty string.
class SigmoidOpGrad : public framework::OperatorWithKernel {
protected:
// Intentionally empty: no output shapes are set.
void InferShape(
const std::vector<const framework::Tensor *> &inputs,
const std::vector<framework::Tensor *> &outputs) const override {}
// Logs "SigmoidGrad" at INFO level as a side effect and returns "".
// NOTE(review): callers that print the result will see an empty string --
// confirm this is intended.
std::string DebugString() const override {
LOG(INFO) << "SigmoidGrad";
return "";
}
};
} // namespace operators
} // namespace paddle
REGISTER_OP(sigmoid,
paddle::operators::SigmoidOp,
paddle::operators::SigmoidOpMaker);
REGISTER_GRADIENT_OP(sigmoid, sigmoid_grad, paddle::operators::SigmoidOpGrad);
REGISTER_OP_CPU_KERNEL(
sigmoid, paddle::operators::SigmoidKernel<paddle::platform::CPUPlace>);
sigmoid,
paddle::operators::SigmoidKernel<paddle::platform::CPUPlace, float>);
#include <paddle/operators/sigmoid_op.h>
#include <paddle/framework/op_registry.h>
#include "paddle/operators/sigmoid_op.h"
#include "paddle/framework/op_registry.h"
REGISTER_OP_GPU_KERNEL(
sigmoid, paddle::operators::SigmoidKernel<paddle::platform::GPUPlace>);
sigmoid, paddle::operators::SigmoidKernel<paddle::platform::GPUPlace, float>);
......@@ -14,17 +14,25 @@
#pragma once
#include <glog/logging.h>
#include <paddle/framework/operator.h>
#include "glog/logging.h"
#include "paddle/framework/eigen.h"
#include "paddle/framework/operator.h"
namespace paddle {
namespace operators {
template <typename Place>
template <typename Place, typename T>
class SigmoidKernel : public framework::OpKernel {
public:
void Compute(const framework::KernelContext &context) const override {
LOG(INFO) << "Sigmoid kernel in " << typeid(Place).name();
void Compute(const framework::KernelContext& context) const override {
auto input = context.Input(0)->Get<framework::Tensor>();
auto* output = context.Output(0)->GetMutable<framework::Tensor>();
output->mutable_data<T>(context.GetPlace());
framework::EigenVector<T>::Flatten(*output).device(
*(context.GetEigenDevice<Place>())) =
1.0 / (1.0 + (-1.0 * framework::EigenVector<T>::Flatten(input)).exp());
}
};
} // namespace operators
......
......@@ -11,8 +11,8 @@
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <paddle/framework/op_registry.h>
#include <paddle/operators/softmax_op.h>
#include "paddle/operators/softmax_op.h"
#include "paddle/framework/op_registry.h"
namespace paddle {
namespace operators {
......@@ -23,9 +23,11 @@ protected:
const std::vector<const framework::Tensor *> &inputs,
const std::vector<framework::Tensor *> &outputs) const override {
PADDLE_ENFORCE(inputs.size() == 1, "Only one input is need for softmax");
PADDLE_ENFORCE(inputs[0]->dims().size() == 2,
"The input of softmax op must be matrix");
PADDLE_ENFORCE(outputs.size() == 1, "Only one output is need for softmax");
outputs[0]->set_dims(inputs[0]->dims());
outputs[0]->Resize(inputs[0]->dims());
}
};
......@@ -40,10 +42,23 @@ public:
}
};
// Gradient operator class for the softmax operator.
// Both overrides are stubs in this revision: InferShape does nothing and
// DebugString returns an empty string.
class SoftmaxOpGrad : public framework::OperatorWithKernel {
protected:
// Intentionally empty: no output shapes are set.
void InferShape(
const std::vector<const framework::Tensor *> &inputs,
const std::vector<framework::Tensor *> &outputs) const override {}
// Logs "SoftmaxOpGrad" at INFO level as a side effect and returns "".
// NOTE(review): callers that print the result will see an empty string --
// confirm this is intended.
std::string DebugString() const override {
LOG(INFO) << "SoftmaxOpGrad";
return "";
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker);
REGISTER_OP_CPU_KERNEL(softmax, ops::SoftmaxKernel<paddle::platform::CPUPlace>);
REGISTER_GRADIENT_OP(softmax, softmax_grad, paddle::operators::SoftmaxOpGrad);
REGISTER_OP_CPU_KERNEL(softmax,
ops::SoftmaxKernel<paddle::platform::CPUPlace, float>);
#include <paddle/framework/op_registry.h>
#include <paddle/operators/softmax_op.h>
#include "paddle/framework/op_registry.h"
#include "paddle/operators/softmax_op.h"
REGISTER_OP_GPU_KERNEL(
softmax, paddle::operators::SoftmaxKernel<paddle::platform::GPUPlace>);
softmax, paddle::operators::SoftmaxKernel<paddle::platform::GPUPlace, float>);
......@@ -14,17 +14,49 @@
#pragma once
#include <glog/logging.h>
#include <paddle/framework/operator.h>
#include "glog/logging.h"
#include "paddle/framework/eigen.h"
#include "paddle/framework/operator.h"
namespace paddle {
namespace operators {
template <typename Place>
template <typename Place, typename T>
class SoftmaxKernel : public framework::OpKernel {
public:
void Compute(const framework::KernelContext &context) const override {
LOG(INFO) << "Softmax kernel in " << typeid(Place).name();
void Compute(const framework::KernelContext& context) const override {
auto input = context.Input(0)->Get<framework::Tensor>();
auto* output = context.Output(0)->GetMutable<framework::Tensor>();
output->mutable_data<T>(context.GetPlace());
auto logits = framework::EigenMatrix<T>::From(input);
auto softmax = framework::EigenMatrix<T>::From(*output);
const int kBatchDim = 0;
const int kClassDim = 1;
const int batch_size = logits.dimension(kBatchDim);
const int num_classes = logits.dimension(kClassDim);
Eigen::DSizes<int, 1> along_class(kClassDim);
Eigen::DSizes<int, 2> batch_by_one(batch_size, 1);
Eigen::DSizes<int, 2> one_by_class(1, num_classes);
auto shifted_logits = (logits -
logits.maximum(along_class)
.eval()
.reshape(batch_by_one)
.broadcast(one_by_class));
softmax.device(*(context.GetEigenDevice<Place>())) = shifted_logits.exp();
softmax.device(*(context.GetEigenDevice<Place>())) =
(softmax *
softmax.sum(along_class)
.inverse()
.eval()
.reshape(batch_by_one)
.broadcast(one_by_class));
}
};
} // namespace operators
......
......@@ -43,10 +43,26 @@ namespace platform {
// For more details, please check https://stackoverflow.com/a/43870188/724872.
#define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
// Single-argument form of throw_on_error: forwards to the two-argument
// overloads with an empty message. Those overloads are disabled for an empty
// argument pack, so a one-argument call selects this template.
// NOTE(review): the two-argument overloads are declared after this template.
// The call below is a dependent name, so at instantiation they can only be
// found through argument-dependent lookup, which has no associated namespace
// in paddle::platform for int/bool or for the global-namespace CUDA status
// enums. A one-argument call may therefore fail to compile -- confirm, and
// consider forward-declaring the overloads before this template.
template <typename T>
inline void throw_on_error(T e) {
throw_on_error(e, "");
}
// Integer/boolean condition overload: throws std::runtime_error when `stat`
// is zero (the condition is false). The exception text is the formatted
// message string::Sprintf(args...) followed by " at [file:line];".
// The enable_if removes this overload when no message arguments are given.
// NOTE(review): __FILE__ and __LINE__ are expanded inside this function, so
// they name this header and the line below rather than the call site --
// confirm whether that is intended.
// NOTE(review): __LINE__ is an integer formatted with "%s"; this relies on
// string::Sprintf accepting non-string arguments for %s -- confirm.
template <typename... Args>
inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
int stat, const Args&... args) {
if (UNLIKELY(!(stat))) {
throw std::runtime_error(
string::Sprintf(args...) +
string::Sprintf(" at [%s:%s];", __FILE__, __LINE__));
}
}
#ifndef PADDLE_ONLY_CPU
template <typename... Args>
inline void throw_on_error(cudaError_t e, const Args&... args) {
inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
cudaError_t e, const Args&... args) {
if (UNLIKELY(e)) {
// clang-format off
throw thrust::system_error(
......@@ -58,7 +74,8 @@ inline void throw_on_error(cudaError_t e, const Args&... args) {
}
template <typename... Args>
inline void throw_on_error(curandStatus_t stat, const Args&... args) {
inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
curandStatus_t stat, const Args&... args) {
if (stat != CURAND_STATUS_SUCCESS) {
// clang-format off
throw thrust::system_error(
......@@ -70,7 +87,8 @@ inline void throw_on_error(curandStatus_t stat, const Args&... args) {
}
template <typename... Args>
inline void throw_on_error(cudnnStatus_t stat, const Args&... args) {
inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
cudnnStatus_t stat, const Args&... args) {
if (stat == CUDNN_STATUS_SUCCESS) {
return;
} else {
......@@ -84,7 +102,8 @@ inline void throw_on_error(cudnnStatus_t stat, const Args&... args) {
}
template <typename... Args>
inline void throw_on_error(cublasStatus_t stat, const Args&... args) {
inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
cublasStatus_t stat, const Args&... args) {
std::string err;
if (stat == CUBLAS_STATUS_SUCCESS) {
return;
......@@ -113,15 +132,6 @@ inline void throw_on_error(cublasStatus_t stat, const Args&... args) {
#endif // PADDLE_ONLY_CPU
template <typename... Args>
inline void throw_on_error(int stat, const Args&... args) {
if (UNLIKELY(!(stat))) {
throw std::runtime_error(
string::Sprintf(args...) +
string::Sprintf(" at [%s:%s];", __FILE__, __LINE__));
}
}
#define PADDLE_THROW(...) \
do { \
throw std::runtime_error( \
......@@ -129,12 +139,9 @@ inline void throw_on_error(int stat, const Args&... args) {
string::Sprintf(" at [%s:%s];", __FILE__, __LINE__)); \
} while (0)
/**
* @brief Enforce a condition, otherwise throw an EnforceNotMet
*/
#define PADDLE_ENFORCE(condition, ...) \
do { \
::paddle::platform::throw_on_error(condition, __VA_ARGS__); \
#define PADDLE_ENFORCE(...) \
do { \
::paddle::platform::throw_on_error(__VA_ARGS__); \
} while (0)
} // namespace platform
......
......@@ -44,7 +44,7 @@ void SetDeviceId(int id) {
"cudaSetDevice failed in paddle::platform::SetDeviceId");
}
void GpuMemoryUsage(size_t& available, size_t& total) {
void GpuMemoryUsage(size_t &available, size_t &total) {
PADDLE_ENFORCE(cudaMemGetInfo(&available, &total),
"cudaMemGetInfo failed in paddle::platform::GetMemoryUsage");
}
......@@ -82,5 +82,28 @@ size_t GpuMaxChunkSize() {
return usable;
}
// Thin wrapper around cudaMemcpyAsync: copies `count` bytes from `src` to
// `dst` in direction `kind` on `stream`. The returned CUDA status is passed to
// PADDLE_ENFORCE together with the message below.
void GpuMemcpyAsync(void *dst, const void *src, size_t count,
enum cudaMemcpyKind kind, cudaStream_t stream) {
PADDLE_ENFORCE(cudaMemcpyAsync(dst, src, count, kind, stream),
"cudaMemcpyAsync failed in paddle::platform::GpuMemcpyAsync");
}
// Copies `count` bytes from `src` to `dst` in direction `kind` with
// cudaMemcpy, then calls cudaStreamSynchronize(0) before returning. Both CUDA
// statuses are passed to PADDLE_ENFORCE.
void GpuMemcpySync(void *dst, const void *src, size_t count,
enum cudaMemcpyKind kind) {
PADDLE_ENFORCE(cudaMemcpy(dst, src, count, kind),
"cudaMemcpy failed in paddle::platform::GpuMemcpySync");
// note: cudaMemcpy may actually be asynchronous with respect to the caller,
// block on stream 0 to make sure the copy has completed
PADDLE_ENFORCE(
cudaStreamSynchronize(0),
"cudaStreamSynchronize failed in paddle::platform::GpuMemcpySync");
}
// Thin wrapper around cudaMemcpyPeerAsync: copies `count` bytes from `src` on
// device `src_device` to `dst` on device `dst_device`, on `stream`. The
// returned CUDA status is passed to PADDLE_ENFORCE.
// The copy is issued through the asynchronous API; this function does not
// synchronize the stream.
void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device,
size_t count, cudaStream_t stream) {
PADDLE_ENFORCE(
cudaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream),
"cudaMemcpyPeerAsync failed in paddle::platform::GpuMemcpyPeer");
}
} // namespace platform
} // namespace paddle
......@@ -16,6 +16,7 @@ limitations under the License. */
#ifndef PADDLE_ONLY_CPU
#include <cuda_runtime.h>
#include <stddef.h>
namespace paddle {
......@@ -31,7 +32,7 @@ int GetCurrentDeviceId();
void SetDeviceId(int device_id);
//!Get the memory usage of current GPU device.
void GpuMemoryUsage(size_t& available, size_t& total);
void GpuMemoryUsage(size_t &available, size_t &total);
//! Get the maximum allocation size of current GPU device.
size_t GpuMaxAllocSize();
......@@ -42,6 +43,18 @@ size_t GpuMinChunkSize();
//! Get the maximum chunk size for GPU buddy allocator.
size_t GpuMaxChunkSize();
//! Copy `count` bytes from address src to dst asynchronously on `stream`.
//! `kind` is the CUDA transfer direction (e.g. cudaMemcpyHostToDevice).
void GpuMemcpyAsync(void *dst, const void *src, size_t count,
enum cudaMemcpyKind kind, cudaStream_t stream);
//! Copy `count` bytes from address src to dst synchronously.
//! `kind` is the CUDA transfer direction (e.g. cudaMemcpyHostToDevice).
void GpuMemcpySync(void *dst, const void *src, size_t count,
enum cudaMemcpyKind kind);
//! Copy `count` bytes from `src` on device `src_device` to `dst` on device
//! `dst_device`, issuing the copy on `stream`.
void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device,
size_t count, cudaStream_t stream);
} // namespace platform
} // namespace paddle
......
......@@ -13,16 +13,18 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include <Python.h>
#include <paddle/framework/op_registry.h>
#include <paddle/framework/operator.h>
#include <paddle/framework/scope.h>
#include <paddle/pybind/tensor_bind.h>
#include <pybind11/numpy.h>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <fstream>
#include <vector>
#include "paddle/framework/net.h"
#include "paddle/framework/op_registry.h"
#include "paddle/framework/operator.h"
#include "paddle/framework/scope.h"
#include "paddle/pybind/tensor_bind.h"
#include "pybind11/numpy.h"
#include "pybind11/pybind11.h"
#include "pybind11/stl.h"
namespace py = pybind11;
namespace pd = paddle::framework;
......@@ -30,9 +32,24 @@ USE_OP(add_two);
USE_OP(onehot_cross_entropy);
USE_OP_WITHOUT_KERNEL(fc);
USE_OP(sgd);
USE_OP(mul);
USE_OP(sigmoid);
USE_OP(softmax);
USE_OP(rowwise_add);
// Registers the Python-visible methods shared by every operator binding on
// the pybind11 class object `m`: infer_shape, run, outputs and __str__.
// `ClassType::type` is the C++ class wrapped by `m`.
template <typename ClassType>
void ExposeOperator(ClassType& m) {
  using OpType = typename ClassType::type;

  m.def("infer_shape", &OpType::InferShape);
  m.def("run", &OpType::Run);
  // Return a copy of the operator's output names.
  m.def("outputs", [](const OpType& op) -> std::vector<std::string> {
    return op.outputs_;
  });
  m.def("__str__", &OpType::DebugString);
}
PYBIND11_PLUGIN(core) {
py::module m("core", "C++ core of Paddle Paddle");
py::module m("core", "C++ core of PaddlePaddle");
py::class_<pd::Tensor>(m, "Tensor", py::buffer_protocol())
.def_buffer([](pd::Tensor& self) -> py::buffer_info {
......@@ -42,7 +59,7 @@ PYBIND11_PLUGIN(core) {
[](const pd::Tensor& self) { return pd::vectorize(self.dims()); })
.def("set_dims",
[](pd::Tensor& self, const std::vector<int>& dim) {
self.set_dims(pd::make_ddim(dim));
self.Resize(pd::make_ddim(dim));
})
.def("alloc_float",
[](pd::Tensor& self) {
......@@ -109,21 +126,38 @@ All parameter, weight, gradient are variables in Paddle.
return new paddle::platform::CPUDeviceContext();
});
py::class_<pd::OperatorBase, pd::OperatorPtr>(m, "Operator")
.def("__str__", &pd::OperatorBase::DebugString)
py::class_<pd::OperatorBase, std::shared_ptr<pd::OperatorBase>> operator_base(
m, "Operator");
operator_base.def_static("create", [](py::bytes protobin) {
pd::OpDesc desc;
PADDLE_ENFORCE(desc.ParsePartialFromString(protobin),
"Cannot parse user input to OpDesc");
PADDLE_ENFORCE(desc.IsInitialized(),
"User OpDesc is not initialized, reason %s",
desc.InitializationErrorString());
return pd::OpRegistry::CreateOp(desc);
});
ExposeOperator(operator_base);
using PlainNetPtr = std::shared_ptr<pd::PlainNet>;
py::class_<pd::PlainNet, PlainNetPtr> plain_net(m, "PlainNet");
plain_net
.def_static("create",
[](py::bytes protobin) {
pd::OpDesc desc;
PADDLE_ENFORCE(desc.ParsePartialFromString(protobin),
"Cannot parse user input to OpDesc");
PADDLE_ENFORCE(desc.IsInitialized(),
"User OpDesc is not initialized, reason %s",
desc.InitializationErrorString());
return pd::OpRegistry::CreateOp(desc);
[]() -> std::shared_ptr<pd::PlainNet> {
auto retv = std::make_shared<pd::PlainNet>();
retv->type_ = "plain_net";
return retv;
})
.def("infer_shape", &pd::OperatorBase::InferShape)
.def("run", &pd::OperatorBase::Run)
.def("outputs", [](const pd::OperatorPtr& op) { return op->outputs_; });
.def("add_op", &pd::PlainNet::AddOp)
.def("add_op",
[](PlainNetPtr& self, const PlainNetPtr& plain_net) -> void {
self->AddOp(std::static_pointer_cast<pd::OperatorBase>(plain_net));
})
.def("complete_add_op", &pd::PlainNet::CompleteAddOp)
.def("complete_add_op", [](PlainNetPtr& self) { self->CompleteAddOp(); });
ExposeOperator(plain_net);
return m.ptr();
}
......@@ -86,7 +86,7 @@ void PyTensorSetFromArray(
dims.push_back((int)array.shape()[i]);
}
self.set_dims(framework::make_ddim(dims));
self.Resize(framework::make_ddim(dims));
auto *dst = self.mutable_data<T>(paddle::platform::CPUPlace());
std::memcpy(dst, array.data(), sizeof(T) * array.size());
}
......
......@@ -3219,6 +3219,10 @@ def ParameterHook(type, **kwargs):
if sparsity_ratio is not None:
hook.sparsity_ratio = sparsity_ratio
return hook
elif type == 'dpruning':
hook = ParameterUpdaterHookConfig()
hook.type = type
return hook
else:
return None
......
......@@ -340,24 +340,40 @@ def img_conv_group(input,
conv_with_batchnorm=False,
conv_batchnorm_drop_rate=0,
pool_stride=1,
pool_type=None):
pool_type=None,
param_attr=None):
"""
Image Convolution Group, Used for vgg net.
TODO(yuyang18): Complete docs
:param conv_batchnorm_drop_rate:
:param input:
:param conv_num_filter:
:param pool_size:
:param num_channels:
:param conv_padding:
:param conv_filter_size:
:param conv_act:
:param conv_with_batchnorm:
:param pool_stride:
:param pool_type:
:return:
:param conv_batchnorm_drop_rate: if conv_with_batchnorm[i] is true,
conv_batchnorm_drop_rate[i] represents the drop rate of each batch norm.
:type conv_batchnorm_drop_rate: list
:param input: layer's input.
:type input: LayerOutput
:param conv_num_filter: output channels num.
:type conv_num_filter: int
:param pool_size: pooling filter size.
:type pool_size: int
:param num_channels: input channels num.
:type num_channels: int
:param conv_padding: convolution padding size.
:type conv_padding: int
:param conv_filter_size: convolution filter size.
:type conv_filter_size: int
:param conv_act: activation function after convolution.
:type conv_act: BaseActivation
:param conv_with_batchnorm: conv_with_batchnorm[i] represents
if there is a batch normalization after each convolution.
:type conv_with_batchnorm: list
:param pool_stride: pooling stride size.
:type pool_stride: int
:param pool_type: pooling type.
:type pool_type: BasePoolingType
:param param_attr: Convolution param attribute.
None means default attribute.
:type param_attr: ParameterAttribute
:return: Layer's output
:type: LayerOutput
"""
tmp = input
......@@ -397,6 +413,7 @@ def img_conv_group(input,
padding=conv_padding[i],
filter_size=conv_filter_size[i],
num_filters=conv_num_filter[i],
param_attr=param_attr,
**extra_kwargs)
# logger.debug("tmp.num_filters = %d" % tmp.num_filters)
......
......@@ -33,6 +33,7 @@ import networks
import minibatch
import plot
import image
import model
__all__ = [
'optimizer',
......@@ -54,6 +55,7 @@ __all__ = [
'evaluator',
'image',
'master',
'model',
]
......
add_python_test(test_framework test_protobuf.py test_scope.py
test_default_scope_funcs.py test_op_creation_methods.py
test_tensor.py test_fc_op.py test_add_two_op.py test_sgd_op.py test_cross_entropy_op.py)
# Python test scripts passed to add_python_test under the name test_framework,
# listed one per line so that adding or removing a test is a one-line diff.
# NOTE(review): add_python_test is defined elsewhere in the project; presumably
# it registers every listed script as a test -- confirm against its definition.
add_python_test(test_framework
test_protobuf.py
test_scope.py
test_default_scope_funcs.py
test_op_creation_methods.py
test_plain_net.py
test_tensor.py
test_fc_op.py
test_add_two_op.py
test_sgd_op.py
test_cross_entropy_op.py
test_mul_op.py
test_sigmoid_op.py
test_softmax_op.py
test_rowwise_add_op.py)
......@@ -56,7 +56,10 @@ class OpTestMeta(type):
for out_name in func.all_output_args:
actual = numpy.array(scope.get_var(out_name).get_tensor())
expect = getattr(self, out_name)
numpy.testing.assert_almost_equal(actual, expect)
# TODO(qijun): The default decimal is 7, but the results of numpy.dot and
# eigen.mul differ slightly, so the unit test cannot pass at that precision.
# Decimal is set to 3 here for now; this should be revisited in the future.
numpy.testing.assert_almost_equal(actual, expect, decimal=3)
obj.test_all = test_all
return obj
import unittest
from op_test_util import OpTestMeta
import numpy as np
class TestMulOp(unittest.TestCase):
    """Test data for the "mul" operator: Out is expected to equal X . Y."""

    __metaclass__ = OpTestMeta

    def setUp(self):
        # Shapes of the left and right operands of the matrix product.
        lhs_shape, rhs_shape = (32, 784), (784, 100)

        self.type = "mul"
        # X is drawn before Y, so the random stream is consumed in a fixed order.
        self.X = np.random.random(lhs_shape).astype("float32")
        self.Y = np.random.random(rhs_shape).astype("float32")
        # Reference result computed by numpy.
        self.Out = self.X.dot(self.Y)


if __name__ == '__main__':
    unittest.main()
import paddle.v2.framework.core as core
from paddle.v2.framework.create_op_creation_methods import op_creations
import unittest
class TestNet(unittest.TestCase):
    """Builds a PlainNet that nests another PlainNet and checks its string form."""

    def test_net_all(self):
        # Outer net: a single add_two operator.
        net = core.PlainNet.create()
        op1 = op_creations.add_two(X="X", Y="Y", Out="Out")
        net.add_op(op1)

        # Inner net: a single fc operator; it is completed before being nested.
        net2 = core.PlainNet.create()
        net2.add_op(op_creations.fc(X="X", W="w", Y="fc.out"))
        net2.complete_add_op(True)
        # A PlainNet is added to another PlainNet the same way an operator is.
        net.add_op(net2)
        net.complete_add_op(True)

        # The comparison below is exact, so every character inside this
        # literal (including whitespace at the start of each line) matters.
        expected = '''
Op(plain_net), inputs:(@EMPTY@, X, Y, w), outputs:(@TEMP@fc@0, Out, fc.out).
Op(add_two), inputs:(X, Y), outputs:(Out).
Op(plain_net), inputs:(@EMPTY@, X, w), outputs:(@TEMP@fc@0, fc.out).
Op(fc), inputs:(X, w, @EMPTY@), outputs:(fc.out, @TEMP@fc@0).
Op(mul), inputs:(X, w), outputs:(@TEMP@fc@0).
Op(sigmoid), inputs:(@TEMP@fc@0), outputs:(fc.out).
'''
        # The literal starts with a newline, hence the "\n" prefix on str(net).
        self.assertEqual(expected, "\n" + str(net))


if __name__ == '__main__':
    unittest.main()
import unittest
from op_test_util import OpTestMeta
import numpy as np
class TestRowwiseAddOp(unittest.TestCase):
    """Test data for the "rowwise_add" operator: Out is X with b added to each row."""

    __metaclass__ = OpTestMeta

    def setUp(self):
        row_count, row_width = 32, 784

        self.type = "rowwise_add"
        # X is drawn before b, so the random stream is consumed in a fixed order.
        self.X = np.random.random((row_count, row_width)).astype("float32")
        self.b = np.random.random(row_width).astype("float32")
        # numpy broadcasting adds the 1-D vector b to every row of X.
        self.Out = self.X + self.b


if __name__ == '__main__':
    unittest.main()
import unittest
from op_test_util import OpTestMeta
import numpy as np
class TestSigmoidOp(unittest.TestCase):
    """Test data for the "sigmoid" operator: Y is the logistic function of X."""

    __metaclass__ = OpTestMeta

    def setUp(self):
        self.type = "sigmoid"
        self.X = np.random.random((32, 100)).astype("float32")
        # Reference value 1 / (1 + e^-x), computed element-wise with numpy.
        decay = np.exp(-self.X)
        self.Y = 1 / (1 + decay)


if __name__ == '__main__':
    unittest.main()
import unittest
from op_test_util import OpTestMeta
import numpy as np
def stable_softmax(x):
    """Return the softmax of vector x.

    The maximum of x is subtracted before exponentiating so that the largest
    exponent is 0, which avoids overflow without changing the result.
    """
    peak = np.max(x)
    numerators = np.exp(x - peak)
    denominator = np.sum(numerators)
    return numerators / denominator
class TestSoftmaxOp(unittest.TestCase):
    """Test data for the "softmax" operator: Y is the row-wise softmax of X."""

    __metaclass__ = OpTestMeta

    def setUp(self):
        self.type = "softmax"
        self.X = np.random.random((32, 100)).astype("float32")
        # Apply the reference softmax independently to every row (axis 1).
        self.Y = np.apply_along_axis(func1d=stable_softmax, axis=1, arr=self.X)


if __name__ == '__main__':
    unittest.main()
......@@ -10,11 +10,31 @@ class client(object):
client is a client to the master server.
"""
def __init__(self, etcd_endpoints, timeout, buf_size):
self.c = lib.paddle_new_etcd_master_client(etcd_endpoints, timeout,
def __init__(self, etcd_endpoints, timeout_sec, buf_size=0):
self.c = lib.paddle_new_etcd_master_client(etcd_endpoints, timeout_sec,
buf_size)
def close(self):
def request_save_model(self, trainer_id, block_ms):
    """Ask the master server whether this trainer should save the model.

    Conventionally the 0-th trainer saves the model, but in distributed
    training any trainer could be killed, so the decision is delegated to
    the master server.

    :param trainer_id: trainer id.
    :param block_ms: number of milliseconds during which other save-model
        requests will be blocked if this request succeeds.

    Returns:
        int: 1 if the save-model request is approved, 0 if the request
        is rejected because another trainer is saving the model, -1 if
        an error happened.
    """
    verdict = lib.paddle_request_save_model(self.c, trainer_id, block_ms)
    return verdict
def release(self):
    """Release the underlying master client handle and clear it."""
    lib.paddle_release_master_client(self.c)
    # Drop the handle so this object no longer refers to the released client.
    self.c = None
......@@ -27,10 +47,13 @@ class client(object):
holder[idx] = c_ptr
lib.paddle_set_dataset(self.c, holder, len(paths))
# return format: (record, errno)
# errno = 0: ok
# < 0: error
def next_record(self):
"""gets next record for training
Returns:
string: the record.
int: error code, 0 if successful, < 0 otherwise.
"""
p = ctypes.c_char_p()
ret = ctypes.pointer(p)
size = lib.paddle_next_record(self.c, ret)
......
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import errno
import uuid
import paddle.v2.master
__all__ = ["save_model", "load_model"]
# Random UUID string generated once when this module is imported. save_model
# sends it to the master as the trainer id and uses it as a sub-directory name
# for the saved model.
trainer_id = str(uuid.uuid4())
def mkdir_p(path):
    """Create directory ``path`` and any missing parents, like ``mkdir -p``.

    An already existing directory is not an error. Any other failure,
    including ``path`` existing as a non-directory, re-raises the OSError.
    """
    try:
        os.makedirs(path)
    except OSError as err:
        already_a_directory = (err.errno == errno.EEXIST and
                               os.path.isdir(path))
        if not already_a_directory:
            raise
def save_model(parameters, path):
    """Write ``parameters`` as a tar archive.

    Outside Kubernetes, ``path`` is the archive file to write. When the
    ``KUBERNETES_SERVICE_HOST`` environment variable is set, the master is
    first asked (through etcd at ``MASTER_IP``) whether this trainer should
    save; if it should, ``path`` is treated as a base directory and the
    archive is written to ``<path>/<trainer_id>/model.tar``. If the request
    is rejected or fails, nothing is written.

    :param parameters: object providing ``to_tar(file)``.
    :param path: archive file path, or base directory under Kubernetes.
    :raises Exception: if running under Kubernetes and ``MASTER_IP`` is not
        set in the environment.
    """
    need_request = "KUBERNETES_SERVICE_HOST" in os.environ

    if need_request:
        # TODO(helin): figure out how MPI trains, since MPI only save
        # model when trainer_id == "0", we can consolidate the logic
        # here.

        # TODO(helin): change this environment variable name from
        # MASTER_IP to ETCD_IP
        etcd_name = "MASTER_IP"
        if etcd_name not in os.environ:
            raise Exception('not find ' + etcd_name +
                            ' in environment variable.')

        etcd_ip = os.environ.get(etcd_name)
        # The module is imported as `paddle.v2.master`, so it must be
        # referenced by its full dotted name; a bare `master` is undefined.
        client = paddle.v2.master.client("http://" + etcd_ip + ":2379", 5, 0)
        try:
            r = client.request_save_model(trainer_id, 5000)
        finally:
            # The client is only needed for this single request.
            client.release()

        if r == 0:
            # do not need to save
            return
        elif r < 0:
            # error
            return
        else:
            # save model
            path = os.path.join(path, trainer_id)
            path = os.path.join(path, "model.tar")

    # Create the directory that will contain the archive, not a directory at
    # the archive path itself (which would make the open() below fail).
    parent_dir = os.path.dirname(path)
    if parent_dir:
        mkdir_p(parent_dir)

    with open(path, 'wb') as f:
        parameters.to_tar(f)
def load_model(parameters, path):
    """Load ``parameters`` from the tar archive at ``path``.

    :param parameters: object providing ``from_tar(file)``.
    :param path: path of the archive file, opened in binary mode.
    """
    with open(path, 'rb') as tar_file:
        parameters.from_tar(tar_file)
......@@ -12,8 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Creator package contains some simple reader creator, which could be used in user
program.
Creator package contains some simple reader creator, which could
be used in user program.
"""
__all__ = ['np_array', 'text_file', "recordio"]
......@@ -59,7 +59,7 @@ def text_file(path):
def recordio_local(paths, buf_size=100):
"""
Creates a data reader from given RecordIO file paths separated by ",",
Creates a data reader from given RecordIO file paths separated by ",",
glob pattern is supported.
:path: path of recordio files.
:returns: data reader of recordio files.
......@@ -83,7 +83,7 @@ def recordio_local(paths, buf_size=100):
def recordio(paths, buf_size=100):
"""
Creates a data reader that outputs record one one by one
Creates a data reader that outputs records one by one
from given local or cloud recordio path.
:path: path of recordio files.
:returns: data reader of recordio files.
......@@ -96,7 +96,7 @@ def recordio(paths, buf_size=100):
host_name = "MASTER_SERVICE_HOST"
if host_name not in os.environ.keys():
raise Exception('not find ' + host_name + ' in environ.')
raise Exception('not find ' + host_name + ' in environment variable.')
addr = os.environ(host)
......@@ -110,6 +110,6 @@ def recordio(paths, buf_size=100):
break
yield r
c.close()
c.release()
return reader
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册