Commit 2a9d71a5 authored by Yuan Gao, committed by GitHub

Merge pull request #1 from PaddlePaddle/develop

Update from the origin
@@ -4,6 +4,7 @@ cache:
- $HOME/third_party
- $HOME/.ccache
- $HOME/.cache/pip
+ - $HOME/Library/Caches/Homebrew
sudo: required
dist: trusty
os:
@@ -25,9 +26,9 @@ addons:
packages:
- gcc-4.8
- g++-4.8
+ - gfortran-4.8
- git
- build-essential
+ - libatlas-base-dev
- python
- python-pip
- python2.7-dev
@@ -54,7 +55,9 @@ before_install:
fi
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then paddle/scripts/travis/before_install.osx.sh; fi
- if [[ "$JOB" == "PRE_COMMIT" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi
- - pip install numpy wheel protobuf sphinx recommonmark sphinx_rtd_theme virtualenv pre-commit requests==2.9.2 LinkChecker
+ # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python
+ # protobuf version.
+ - pip install numpy wheel 'protobuf==3.1' sphinx recommonmark sphinx_rtd_theme virtualenv pre-commit requests==2.9.2 LinkChecker
script:
- paddle/scripts/travis/main.sh
notifications:
......
@@ -16,7 +16,8 @@
set(CBLAS_FOUND OFF)
## Find MKL First.
- set(MKL_ROOT $ENV{MKLROOT} CACHE PATH "Folder contains MKL")
+ set(INTEL_ROOT "/opt/intel" CACHE PATH "Folder contains intel libs")
+ set(MKL_ROOT ${INTEL_ROOT}/mkl CACHE PATH "Folder contains MKL")
find_path(MKL_INCLUDE_DIR mkl.h PATHS
${MKL_ROOT}/include)
......
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -29,12 +29,14 @@ INCLUDE_DIRECTORIES(${GLOG_INCLUDE_DIR})
ExternalProject_Add(
glog
${EXTERNAL_PROJECT_LOG_ARGS}
+ DEPENDS gflags
GIT_REPOSITORY "https://github.com/google/glog.git"
PREFIX ${GLOG_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR}
CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON
- CMAKE_ARGS -DWITH_GFLAGS=OFF
+ CMAKE_ARGS -DWITH_GFLAGS=ON
+ CMAKE_ARGS -Dgflags_DIR=${GFLAGS_INSTALL_DIR}/lib/cmake/gflags
CMAKE_ARGS -DBUILD_TESTING=OFF
)
......
@@ -15,7 +15,6 @@
INCLUDE(cblas)
IF(NOT ${CBLAS_FOUND})
- MESSAGE(FATAL_ERROR "Please install OpenBlas, MKL or ATLAS.")
INCLUDE(ExternalProject)
SET(CBLAS_SOURCES_DIR ${THIRD_PARTY_PATH}/openblas)
@@ -28,20 +27,40 @@ IF(NOT ${CBLAS_FOUND})
SET(CBLAS_LIBRARIES "${CBLAS_INSTALL_DIR}/lib/libopenblas.a" CACHE FILEPATH "openblas library" FORCE)
ENDIF(WIN32)
+ IF(CMAKE_COMPILER_IS_GNUCC)
+   ENABLE_LANGUAGE(Fortran)
+   LIST(APPEND CBLAS_LIBRARIES gfortran pthread)
+ ENDIF(CMAKE_COMPILER_IS_GNUCC)
+ IF(NOT CMAKE_Fortran_COMPILER)
+   MESSAGE(FATAL_ERROR "To build lapack in libopenblas, "
+     "you need to set gfortran compiler: cmake .. -DCMAKE_Fortran_COMPILER=...")
+ ENDIF(NOT CMAKE_Fortran_COMPILER)
ExternalProject_Add(
openblas
${EXTERNAL_PROJECT_LOG_ARGS}
- URL "https://github.com/xianyi/OpenBLAS/archive/v0.2.19.tar.gz"
+ GIT_REPOSITORY https://github.com/xianyi/OpenBLAS.git
+ GIT_TAG v0.2.19
PREFIX ${CBLAS_SOURCES_DIR}
INSTALL_DIR ${CBLAS_INSTALL_DIR}
BUILD_IN_SOURCE 1
- CONFIGURE_COMMAND ""
- BUILD_COMMAND make CC=${CMAKE_C_COMPILER} FC=${CMAKE_Fortran_COMPILER}
- INSTALL_COMMAND make install PREFIX=<INSTALL_DIR>
+ BUILD_COMMAND ${CMAKE_MAKE_PROGRAM} FC=${CMAKE_Fortran_COMPILER} CC=${CMAKE_C_COMPILER} HOSTCC=${CMAKE_C_COMPILER} NO_SHARED=1 libs netlib
+ INSTALL_COMMAND ${CMAKE_MAKE_PROGRAM} install NO_SHARED=1 PREFIX=<INSTALL_DIR>
UPDATE_COMMAND ""
+ CONFIGURE_COMMAND ""
)
+ ExternalProject_Add_Step(
+   openblas lapacke_install
+   COMMAND ${CMAKE_COMMAND} -E copy "${CBLAS_SOURCES_DIR}/src/openblas/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h" "${CBLAS_INSTALL_DIR}/include/lapacke_mangling.h"
+   COMMAND ${CMAKE_COMMAND} -E copy "${CBLAS_SOURCES_DIR}/src/openblas/lapack-netlib/LAPACKE/include/lapacke.h" "${CBLAS_INSTALL_DIR}/include/lapacke.h"
+   COMMAND ${CMAKE_COMMAND} -E copy "${CBLAS_SOURCES_DIR}/src/openblas/lapack-netlib/LAPACKE/include/lapacke_config.h" "${CBLAS_INSTALL_DIR}/include/lapacke_config.h"
+   COMMAND ${CMAKE_COMMAND} -E copy "${CBLAS_SOURCES_DIR}/src/openblas/lapack-netlib/LAPACKE/include/lapacke_utils.h" "${CBLAS_INSTALL_DIR}/include/lapacke_utils.h"
+   DEPENDEES install
+ )
LIST(APPEND external_project_dependencies openblas)
- ENDIF()
+ ENDIF(NOT ${CBLAS_FOUND})
INCLUDE_DIRECTORIES(${CBLAS_INC_DIR})
@@ -29,17 +29,12 @@ IF(WIN32)
"${PROTOBUF_INSTALL_DIR}/lib/libprotoc.lib" CACHE FILEPATH "protoc library." FORCE)
SET(PROTOBUF_PROTOC_EXECUTABLE "${PROTOBUF_INSTALL_DIR}/bin/protoc.exe" CACHE FILEPATH "protobuf executable." FORCE)
ELSE(WIN32)
- IF(${HOST_SYSTEM} STREQUAL "centos")
-   SET(LIB "lib64")
- ELSE()
-   SET(LIB "lib")
- ENDIF()
SET(PROTOBUF_LITE_LIBRARY
- "${PROTOBUF_INSTALL_DIR}/${LIB}/libprotobuf-lite.a" CACHE FILEPATH "protobuf lite library." FORCE)
+ "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite.a" CACHE FILEPATH "protobuf lite library." FORCE)
SET(PROTOBUF_LIBRARY
- "${PROTOBUF_INSTALL_DIR}/${LIB}/libprotobuf.a" CACHE FILEPATH "protobuf library." FORCE)
+ "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf.a" CACHE FILEPATH "protobuf library." FORCE)
SET(PROTOBUF_PROTOC_LIBRARY
- "${PROTOBUF_INSTALL_DIR}/${LIB}/libprotoc.a" CACHE FILEPATH "protoc library." FORCE)
+ "${PROTOBUF_INSTALL_DIR}/lib/libprotoc.a" CACHE FILEPATH "protoc library." FORCE)
SET(PROTOBUF_PROTOC_EXECUTABLE "${PROTOBUF_INSTALL_DIR}/bin/protoc" CACHE FILEPATH "protobuf executable." FORCE)
ENDIF(WIN32)
@@ -58,6 +53,7 @@ ExternalProject_Add(
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DCMAKE_BUILD_TYPE=Release
-DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR}
+ -DCMAKE_INSTALL_LIBDIR=lib
)
LIST(APPEND external_project_dependencies protobuf)
@@ -26,11 +26,12 @@ IF(PYTHONLIBS_FOUND AND PYTHONINTERP_FOUND)
find_python_module(wheel REQUIRED)
find_python_module(google.protobuf REQUIRED)
FIND_PACKAGE(NumPy REQUIRED)
- IF(${PY_GOOGLE.PROTOBUF_VERSION} VERSION_LESS "3.0.0")
+ IF(${PY_GOOGLE.PROTOBUF_VERSION} AND ${PY_GOOGLE.PROTOBUF_VERSION} VERSION_LESS "3.0.0")
MESSAGE(FATAL_ERROR "Found Python Protobuf ${PY_GOOGLE.PROTOBUF_VERSION} < 3.0.0, "
- "please use pip to upgrade protobuf.")
+ "please use pip to upgrade protobuf. pip install -U protobuf")
- ENDIF(${PY_GOOGLE.PROTOBUF_VERSION} VERSION_LESS "3.0.0")
+ ENDIF()
ELSE(PYTHONLIBS_FOUND AND PYTHONINTERP_FOUND)
+ MESSAGE(FATAL_ERROR "Please install python 2.7 before building PaddlePaddle.")
##################################### PYTHON ########################################
SET(PYTHON_SOURCES_DIR ${THIRD_PARTY_PATH}/python)
SET(PYTHON_INSTALL_DIR ${THIRD_PARTY_PATH}/install/python)
......
@@ -38,14 +38,6 @@ IF(NOT SWIG_FOUND)
SET(SWIG_DIR ${SWIG_SOURCES_DIR} CACHE FILEPATH "SWIG Directory" FORCE)
SET(SWIG_EXECUTABLE ${SWIG_SOURCES_DIR}/swig.exe CACHE FILEPATH "SWIG Executable" FORCE)
ELSE(WIN32)
- # From PCRE configure
- ExternalProject_Add(pcre
-   ${EXTERNAL_PROJECT_LOG_ARGS}
-   GIT_REPOSITORY https://github.com/svn2github/pcre.git
-   PREFIX ${SWIG_SOURCES_DIR}/pcre
-   CMAKE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${SWIG_INSTALL_DIR}/pcre
- )
# swig uses bison find it by cmake and pass it down
FIND_PACKAGE(BISON)
@@ -54,16 +46,11 @@ IF(NOT SWIG_FOUND)
GIT_REPOSITORY https://github.com/swig/swig.git
GIT_TAG rel-3.0.10
PREFIX ${SWIG_SOURCES_DIR}
- CONFIGURE_COMMAND cd ${SWIG_SOURCES_DIR}/src/swig && ./autogen.sh
- CONFIGURE_COMMAND cd ${SWIG_SOURCES_DIR}/src/swig &&
-   env "PCRE_LIBS=${SWIG_INSTALL_DIR}/pcre/lib/libpcre.a ${SWIG_INSTALL_DIR}/pcre/lib/libpcrecpp.a ${SWIG_INSTALL_DIR}/pcre/lib/libpcreposix.a"
-   ./configure
-   --prefix=${SWIG_INSTALL_DIR}
-   --with-pcre-prefix=${SWIG_INSTALL_DIR}/pcre
- BUILD_COMMAND cd ${SWIG_SOURCES_DIR}/src/swig && make
- INSTALL_COMMAND cd ${SWIG_SOURCES_DIR}/src/swig && make install
- UPDATE_COMMAND ""
- DEPENDS pcre
+ CONFIGURE_COMMAND cd <SOURCE_DIR> && ./autogen.sh && ./configure
+   --prefix=${SWIG_INSTALL_DIR} --without-pcre
+ BUILD_COMMAND cd <SOURCE_DIR> && make
+ INSTALL_COMMAND cd <SOURCE_DIR> && make install
+ UPDATE_COMMAND ""
)
SET(SWIG_DIR ${SWIG_INSTALL_DIR}/share/swig/${SWIG_TARGET_VERSION})
......
@@ -54,6 +54,7 @@ ExternalProject_Add(
CMAKE_ARGS -DWITH_GPU=${WITH_GPU}
CMAKE_ARGS -DWITH_OMP=${USE_OMP}
CMAKE_ARGS -DWITH_TORCH=OFF
+ CMAKE_ARGS -DCMAKE_DISABLE_FIND_PACKAGE_Torch=TRUE
CMAKE_ARGS -DBUILD_SHARED=ON
)
......
@@ -12,6 +12,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+ # Detects the OS and sets appropriate variables.
+ # CMAKE_SYSTEM_NAME only gives us a coarse-grained name,
+ # but a name like "centos" is necessary in some scenarios
+ # to distinguish the system for customization.
+ #
+ # For instance, the protobuf libs path is <install_dir>/lib64
+ # on CentOS, but <install_dir>/lib on other systems.
IF(WIN32)
SET(HOST_SYSTEM "win32")
ELSE(WIN32)
@@ -21,6 +29,7 @@ ELSE(WIN32)
SET(MACOS_VERSION ${VERSION})
SET(HOST_SYSTEM "macosx")
ELSE(APPLE)
IF(EXISTS "/etc/issue")
FILE(READ "/etc/issue" LINUX_ISSUE)
IF(LINUX_ISSUE MATCHES "CentOS")
@@ -29,8 +38,24 @@ ELSE(WIN32)
SET(HOST_SYSTEM "debian")
ELSEIF(LINUX_ISSUE MATCHES "Ubuntu")
SET(HOST_SYSTEM "ubuntu")
+ ELSEIF(LINUX_ISSUE MATCHES "Red Hat")
+   SET(HOST_SYSTEM "redhat")
+ ELSEIF(LINUX_ISSUE MATCHES "Fedora")
+   SET(HOST_SYSTEM "fedora")
ENDIF()
ENDIF(EXISTS "/etc/issue")
+ IF(EXISTS "/etc/redhat-release")
+   FILE(READ "/etc/redhat-release" LINUX_ISSUE)
+   IF(LINUX_ISSUE MATCHES "CentOS")
+     SET(HOST_SYSTEM "centos")
+   ENDIF()
+ ENDIF(EXISTS "/etc/redhat-release")
+ IF(NOT HOST_SYSTEM)
+   SET(HOST_SYSTEM ${CMAKE_SYSTEM_NAME})
+ ENDIF()
ENDIF(APPLE)
ENDIF(WIN32)
@@ -47,7 +72,7 @@ SET(EXTERNAL_PROJECT_LOG_ARGS
LOG_DOWNLOAD 0 # Wrap download in script to log output
LOG_UPDATE 1 # Wrap update in script to log output
LOG_CONFIGURE 1 # Wrap configure in script to log output
- LOG_BUILD 1 # Wrap build in script to log output
+ LOG_BUILD 0 # Wrap build in script to log output
LOG_TEST 1 # Wrap test in script to log output
- LOG_INSTALL 1 # Wrap install in script to log output
+ LOG_INSTALL 0 # Wrap install in script to log output
)
@@ -126,7 +126,7 @@ class ImageClassifier():
# For oversampling, average predictions across crops.
# If not, the shape of output[name]: (1, class_number),
# the mean is also applicable.
- return output[output_layer].mean(0)
+ return output[output_layer]['value'].mean(0)
def predict(self, image=None, output_layer=None):
assert isinstance(image, basestring)
......
@@ -6,33 +6,15 @@ passed to C++ side of Paddle.
The user api could be simpler and carefully designed.
"""
- import py_paddle.swig_paddle as api
- from py_paddle import DataProviderConverter
- import paddle.trainer.PyDataProvider2 as dp
- import numpy as np
import random
- from mnist_util import read_from_mnist
- from paddle.trainer_config_helpers import *
- def optimizer_config():
-     settings(
-         learning_rate=1e-4,
-         learning_method=AdamOptimizer(),
-         batch_size=1000,
-         model_average=ModelAverage(average_window=0.5),
-         regularization=L2Regularization(rate=0.5))
- def network_config():
-     imgs = data_layer(name='pixel', size=784)
-     hidden1 = fc_layer(input=imgs, size=200)
-     hidden2 = fc_layer(input=hidden1, size=200)
-     inference = fc_layer(input=hidden2, size=10, act=SoftmaxActivation())
-     cost = classification_cost(
-         input=inference, label=data_layer(
-             name='label', size=10))
-     outputs(cost)
+ import numpy as np
+ import paddle.v2 as paddle_v2
+ import py_paddle.swig_paddle as api
+ from paddle.trainer_config_helpers import *
+ from py_paddle import DataProviderConverter
+ from mnist_util import read_from_mnist
def init_parameter(network):
@@ -75,19 +57,35 @@ def input_order_converter(generator)
def main():
    api.initPaddle("-use_gpu=false", "-trainer_count=4")  # use 4 cpu cores
-   # get enable_types for each optimizer.
-   # enable_types = [value, gradient, momentum, etc]
-   # For each optimizer(SGD, Adam), GradientMachine should enable different
-   # buffers.
-   opt_config_proto = parse_optimizer_config(optimizer_config)
-   opt_config = api.OptimizationConfig.createFromProto(opt_config_proto)
-   _temp_optimizer_ = api.ParameterOptimizer.create(opt_config)
-   enable_types = _temp_optimizer_.getParameterTypes()
+   optimizer = paddle_v2.optimizer.Adam(
+       learning_rate=1e-4,
+       batch_size=1000,
+       model_average=ModelAverage(average_window=0.5),
+       regularization=L2Regularization(rate=0.5))
+   # Create Local Updater. Local means not run in cluster.
+   # For a cluster training, here we can change to createRemoteUpdater
+   # in future.
+   updater = optimizer.create_local_updater()
+   assert isinstance(updater, api.ParameterUpdater)
+   # define network
+   images = paddle_v2.layer.data(
+       name='pixel', type=paddle_v2.data_type.dense_vector(784))
+   label = paddle_v2.layer.data(
+       name='label', type=paddle_v2.data_type.integer_value(10))
+   hidden1 = paddle_v2.layer.fc(input=images, size=200)
+   hidden2 = paddle_v2.layer.fc(input=hidden1, size=200)
+   inference = paddle_v2.layer.fc(input=hidden2,
+                                  size=10,
+                                  act=paddle_v2.activation.Softmax())
+   cost = paddle_v2.layer.classification_cost(input=inference, label=label)
    # Create Simple Gradient Machine.
-   model_config = parse_network_config(network_config)
-   m = api.GradientMachine.createFromConfigProto(
-       model_config, api.CREATE_MODE_NORMAL, enable_types)
+   model_config = paddle_v2.layer.parse_network(cost)
+   m = api.GradientMachine.createFromConfigProto(model_config,
+                                                 api.CREATE_MODE_NORMAL,
+                                                 optimizer.enable_types())
    # This type check is not useful. Only enable type hint in IDE.
    # Such as PyCharm
@@ -96,19 +94,12 @@ def main():
    # Initialize Parameter by numpy.
    init_parameter(network=m)
-   # Create Local Updater. Local means not run in cluster.
-   # For a cluster training, here we can change to createRemoteUpdater
-   # in future.
-   updater = api.ParameterUpdater.createLocalUpdater(opt_config)
-   assert isinstance(updater, api.ParameterUpdater)
    # Initialize ParameterUpdater.
    updater.init(m)
    # DataProvider Converter is a utility convert Python Object to Paddle C++
    # Input. The input format is as same as Paddle's DataProvider.
-   converter = DataProviderConverter(
-       input_types=[dp.dense_vector(784), dp.integer_value(10)])
+   converter = DataProviderConverter(input_types=[images.type, label.type])
    train_file = './data/raw_data/train'
    test_file = './data/raw_data/t10k'
......
import paddle.v2 as paddle


def main():
    paddle.init(use_gpu=False, trainer_count=1)

    # define network topology
    images = paddle.layer.data(
        name='pixel', type=paddle.data_type.dense_vector(784))
    label = paddle.layer.data(
        name='label', type=paddle.data_type.integer_value(10))
    hidden1 = paddle.layer.fc(input=images, size=200)
    hidden2 = paddle.layer.fc(input=hidden1, size=200)
    inference = paddle.layer.fc(input=hidden2,
                                size=10,
                                act=paddle.activation.Softmax())
    cost = paddle.layer.classification_cost(input=inference, label=label)

    parameters = paddle.parameters.create(cost)
    adam_optimizer = paddle.optimizer.Adam(learning_rate=0.01)

    trainer = paddle.trainer.SGD(cost=cost,
                                 parameters=parameters,
                                 update_equation=adam_optimizer)

    def event_handler(event):
        if isinstance(event, paddle.event.EndIteration):
            if event.batch_id % 1000 == 0:
                result = trainer.test(reader=paddle.reader.batched(
                    paddle.dataset.mnist.test(), batch_size=256))

                print "Pass %d, Batch %d, Cost %f, %s, Testing metrics %s" % (
                    event.pass_id, event.batch_id, event.cost, event.metrics,
                    result.metrics)
        else:
            pass

    trainer.train(
        reader=paddle.reader.batched(
            paddle.reader.shuffle(
                paddle.dataset.mnist.train(), buf_size=8192),
            batch_size=32),
        event_handler=event_handler)

    # output is a softmax layer. It returns probabilities.
    # Shape should be (100, 10)
    probs = paddle.infer(
        output=inference,
        parameters=parameters,
        reader=paddle.reader.batched(
            paddle.reader.firstn(
                paddle.reader.map_readers(lambda item: (item[0], ),
                                          paddle.dataset.mnist.test()),
                n=100),
            batch_size=32))
    print probs.shape


if __name__ == '__main__':
    main()
@@ -156,7 +156,7 @@ class ImageClassifier():
# For oversampling, average predictions across crops.
# If not, the shape of output[name]: (1, class_number),
# the mean is also applicable.
- res[name] = output[name].mean(0)
+ res[name] = output[name]['value'].mean(0)
return res
......
@@ -32,4 +32,6 @@ def process(settings, file_name):
word_slot = [
    settings.word_dict[w] for w in words if w in settings.word_dict
]
+ if not word_slot:
+     continue
yield word_slot, label
@@ -138,7 +138,11 @@ def main():
batch = []
for line in sys.stdin:
-     batch.append([predict.get_index(line)])
+     words = predict.get_index(line)
+     if words:
+         batch.append([words])
+     else:
+         print('All the words in [%s] are not in the dictionary.' % line)
    if len(batch) == batch_size:
        predict.batch_predict(batch)
        batch = []
......
@@ -25,6 +25,6 @@ paddle train \
--config_args=is_predict=1 \
--predict_output_dir=.
- python gen_result.py > result.txt
+ python gen_result.py > result.csv
rm -rf rank-00000
@@ -139,24 +139,12 @@ lstmemory
   :members: lstmemory
   :noindex:
- lstm_step_layer
- ---------------
- .. automodule:: paddle.trainer_config_helpers.layers
-    :members: lstm_step_layer
-    :noindex:
grumemory
---------
.. automodule:: paddle.trainer_config_helpers.layers
   :members: grumemory
   :noindex:
- gru_step_layer
- ---------------
- .. automodule:: paddle.trainer_config_helpers.layers
-    :members: gru_step_layer
-    :noindex:
Recurrent Layer Group
=====================
@@ -172,6 +160,18 @@ recurrent_group
   :members: recurrent_group
   :noindex:
+ lstm_step_layer
+ ---------------
+ .. automodule:: paddle.trainer_config_helpers.layers
+    :members: lstm_step_layer
+    :noindex:
+ gru_step_layer
+ ---------------
+ .. automodule:: paddle.trainer_config_helpers.layers
+    :members: gru_step_layer
+    :noindex:
beam_search
------------
.. automodule:: paddle.trainer_config_helpers.layers
@@ -279,6 +279,12 @@ concat_layer
   :members: concat_layer
   :noindex:
+ seq_concat_layer
+ ----------------
+ .. automodule:: paddle.trainer_config_helpers.layers
+    :members: seq_concat_layer
+    :noindex:
Reshaping Layers
================
@@ -302,6 +308,18 @@ repeat_layer
   :members: repeat_layer
   :noindex:
+ rotate_layer
+ ------------
+ .. automodule:: paddle.trainer_config_helpers.layers
+    :members: rotate_layer
+    :noindex:
+ seq_reshape_layer
+ -----------------
+ .. automodule:: paddle.trainer_config_helpers.layers
+    :members: seq_reshape_layer
+    :noindex:
Math Layers
===========
@@ -382,6 +400,15 @@ sampling_id_layer
   :members: sampling_id_layer
   :noindex:
+ Slicing and Joining Layers
+ ==========================
+ pad_layer
+ -----------
+ .. automodule:: paddle.trainer_config_helpers.layers
+    :members: pad_layer
+    :noindex:
.. _api_trainer_config_helpers_layers_cost_layers:
Cost Layers
@@ -441,6 +468,12 @@ ctc_layer
   :members: ctc_layer
   :noindex:
+ warp_ctc_layer
+ --------------
+ .. automodule:: paddle.trainer_config_helpers.layers
+    :members: warp_ctc_layer
+    :noindex:
nce_layer
-----------
.. automodule:: paddle.trainer_config_helpers.layers
......
# PaddlePaddle Design Doc
## Ingredients
Our design principle is to start from the essence: how can we
allow users to express and solve their problems as neural networks?
Some essential concepts that our API has to provide include:
1. A *topology* is an expression of *layers*.
1. A layer could be any kind of computation, including *cost*.
1. Some layers have parameters, some don't. Most costs don't have
parameters.
1. In some topologies, layers share parameters. For
example,
[the network for training a ranking model](https://github.com/PaddlePaddle/Paddle/issues/1311#issuecomment-279121850).
1. At programming time, users specify topologies and possible sharing
of parameters. PaddlePaddle can figure out and create parameters
required (and possibly shared) by one or more topologies.
## Starting from Examples
As a summary
of
[our discussion](https://github.com/PaddlePaddle/Paddle/issues/1315),
let us present two examples here:
### Example 1. Sharing Parameters between Layers
We use
the
[3-branch ranking](https://github.com/PaddlePaddle/Paddle/issues/1311#issuecomment-279121850) model
in this example. For your convenience, we copy-paste the model's
topology as follows:
```
A -> f -\
Q -> f --> cost
B -> f -/
```
The following program trains the topology including the cost, and then
uses the sub-network of the trained topology for inference:
```python
def f(x):
    e = paddle.layer.embedding(x, parameter_name="embedding")
    o = paddle.layer.softmax(e, parameter_name="semantic")
    return o

# Create 3 topologies (subnets); they share parameters because all
# corresponding layers have the same parameter names.
fA = f(paddle.layer.data(input_name="A"))
fB = f(paddle.layer.data(input_name="B"))
fQ = f(paddle.layer.data(input_name="Q"))

topology = paddle.layer.less_than(
    paddle.layer.cross_entropy(fA, fQ),
    paddle.layer.cross_entropy(fB, fQ))

# Derive parameters required in topology and create them in model.
parameters = paddle.parameters.create(topology)

# Estimate parameters used in topology from data.
paddle.train(topology, parameters, reader=read_ranking_model_data)

# Inference using fA (or fB or fQ, as they share their parameters).
[testA, testB, testQ] = read_ranking_model_data()
print "The semantic-vector of testA: ", paddle.infer(fA, parameters, testA)
```
### Example 2. Sharing Parameters between "Models"
We use [GAN](https://github.com/PaddlePaddle/book/tree/develop/gan) in
this example. In the following example program, `d0` and `d1`
correspond to the two networks in the following figure:
<img src="https://github.com/wangyang59/book/raw/00036f4b0da5225041a6824587c1a01cf20159b1/gan/image/gan_ig.png" width=400 />
```python
def G(x):
    # over-simplified example as G has only one layer:
    return paddle.layer.fc(x, parameter_name="G")

def D(x):
    # again, over-simplified:
    return paddle.layer.fc(x, parameter_name="D")

# Construct the first topology, which contains both D and G.
# By learning this topology, we update parameters of G.
d0 = paddle.layer.should_be_false(D(G(paddle.layer.data())))

# Construct a second topology d1, which contains only D. By
# training this topology, we update parameters of D. Note
# that d1 shares parameters with d0.
d1 = paddle.layer.should_be_true(D(paddle.layer.data()))

# Create parameters from a list of multiple topologies (models) for
# the chance to share parameters between these topologies.
parameters = paddle.parameters.create([d0, d1])

# Iterative training of GAN.
for ...:
    train(d0, parameters, reader=read_from_rng, immutable_parameters={"D"})
    train(d1, parameters, reader=read_from_realistic_images)

# Use d1 for inference:
print "D thinks a batch of images are realistic ", infer(d1, parameters, read_mnist_images)
```
### Summarization
The above two programs reveal some important design concerns:
1. Users describe a topology as an expression of layers. Every layer
has a *parameter name*. If the users don't specify it explicitly, it's automatically generated as a unique name. By
specifying the parameter name, users can specify the sharing of
parameters between layers and even between topologies.
1. `paddle.parameters.create` figures out parameters required by one
or more topologies from parameter names of layers. It creates these
parameters and returns a `ParameterSet` object, which is in essence
a map from *parameter names* to *parameters*.
1. At training and inference time, `paddle.train` and `paddle.infer`
require both a topology and the parameter set that holds the parameters of that topology. There are some reasons:
1. This prevents users from forgetting to call
`paddle.parameters.create`.
1. `paddle.train` needs to know which parameter set to update.
1. Users could load another (pre-trained) parameter set and use it
with a topology in training or inference.
1. By specifying the `immutable_parameters` parameter of
`paddle.train`, we can forbid the update of these parameters.
## Reader
Not all programming frameworks allow users to define I/O functions.
An example is Google MapReduce, which can only read from text,
SSTable, and RecordIO files. Hadoop MapReduce allows users to define
readers and writers by deriving from base classes `Reader` and
`Writer`. The former approach is less flexible but also less error-prone. We
decided to give users the flexibility to define their own readers.
There are some open questions here:
1. **Should a reader return a Python dictionary?**
1. **How to map multiple outputs from a reader to multiple data layers?**
1. **How to easily compose some existing readers to read more data and
feed a topology with more data layers?**
## Training
The recommended way to train a model is to call `paddle.train`,
which simply calls `paddle.trainer.Default`, a global variable of
type `paddle.trainer.SGD`. Equivalently, we can do
```python
opt = paddle.trainer.SGD(..., paddle.updater.Adam(...))
opt.train(topology, parameters, reader=read, ...)
```
### Updater
Please be aware that a trainer can accept an updater as its data
member, where an updater is a class derived from
`paddle.trainer.Updater`. This is to make it easier to customize
trainers, as discussed
[here](https://github.com/PaddlePaddle/Paddle/issues/1319).
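For illustration, a custom updater might be wired in roughly as follows. This is a hypothetical sketch: the constructor arguments and the `update` hook are assumptions made for the example, not part of the proposed `paddle.trainer.Updater` API.

```python
# Hypothetical sketch -- the update() hook and its signature are assumed,
# not part of the proposed paddle.trainer.Updater API.
class ClippedAdam(paddle.trainer.Updater):
    def __init__(self, learning_rate, clip):
        self.adam = paddle.updater.Adam(learning_rate)
        self.clip = clip

    def update(self, gradient):
        # Clip gradients before delegating to the wrapped Adam updater.
        gradient = gradient.clip(-self.clip, self.clip)
        return self.adam.update(gradient)

# Pass the custom updater to a trainer, as in the Training section above.
opt = paddle.trainer.SGD(..., ClippedAdam(learning_rate=1e-3, clip=5.0))
```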
### Event Handler
`paddle.train` and `paddle.trainer.XXX.train` take an optional
parameter `event_handler`, which should be either `None` or a function
that handles some events:
1. BeginTraining
1. EndTraining
1. BeginIteration
1. EndIteration
1. BeginPass
1. EndPass
where EndPass is sent if and only if the reader yields
`end_pass=True`.
An example follows:

```python
def event_handler(event):
    if isinstance(event, paddle.event.EndIteration):
        print paddle.test(...)

paddle.train(topology, parameters, reader, event_handler)
```
If we are writing a PaddlePaddle program in and for IPython/Jupyter,
we can use matplotlib in the event handler to plot a curve of
cost/error versus iterations, as shown
[here](https://blog.dominodatalab.com/interactive-dashboards-in-jupyter/).
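For instance, a minimal sketch along those lines could collect the cost at each `EndIteration` event and plot it with matplotlib. The `event.cost` field is an assumption here (it appears in the MNIST example elsewhere in this change); the rest mirrors the event-handler example above.

```python
# Illustrative sketch: collect the cost at every EndIteration event and plot
# it with matplotlib. event.cost is assumed (see the MNIST v2 example in
# this change); the rest mirrors the event-handler example above.
import matplotlib.pyplot as plt

costs = []

def event_handler(event):
    if isinstance(event, paddle.event.EndIteration):
        costs.append(event.cost)

paddle.train(topology, parameters, reader, event_handler)

plt.plot(costs)
plt.xlabel('iteration')
plt.ylabel('cost')
plt.show()
```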
### Distributed Training
If users want to do distributed training on a cluster, they should
call `paddle.dist_train` and provide access tokens to the cluster as
a parameter.
For example, if the user has a TLS certificate that grants access to a
Kubernetes cluster, they should be able to call
```python
paddle.dist_train(model,
                  trainer=paddle.trainer.SGD(...,
                                             paddle.updater.Adam(...)),
                  reader=read,
                  k8s_user="yi",
                  k8s_token="kube_cluster_tls.pem",
                  k8s_job="hello",
                  num_parameter_servers=15)
```
The pseudo code of `paddle.dist_train` is as follows:
```python
def dist_train(topology, parameters, trainer, reader, ...):
    if os.getenv("KUBERNETES_SERVICE_HOST") == None:
        image_name = k8s_user + '/' + k8s_job
        docker_build(image_name)
        docker_push()
        kube_ctrl_start_job(image_name, k8s_user, k8s_token)
    else:
        rank = kube_list_containers_in_job_and_return_current_containers_rank()
        if rank == 0:
            master()
        elif rank < 15:
            parameter_server()
        else:
            trainer.train(model, reader=read)
```
Please be aware that if a process is running on the Kubernetes
cluster, it will have some environment variables pre-defined.
If `dist_train` doesn't see these environment variables, it knows
that it is running on a user's personal computer and should work as a
*launcher*. Otherwise, it knows that it's running on the cluster and
needs to figure out its role as either the master, a trainer, or a
parameter server.
# Python Data Reader Design Doc
At training and testing time, PaddlePaddle programs need to read data. To ease users' work of writing data-reading code, we define that
- A *reader* is a function that reads data (from file, network, random number generator, etc) and yields data items.
- A *reader creator* is a function that returns a reader function.
- A *reader decorator* is a function, which accepts one or more readers and returns a reader.
We also provide frequently used reader creators and reader decorators; a minimal sketch of the three concepts follows.
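To make these three terms concrete, here is a minimal, framework-free sketch in plain Python (all names below are illustrative, not part of the Paddle API):

```python
# Illustrative only; none of these names are part of the Paddle API.

def counter_creator(n):
    """A reader creator: returns a reader."""
    def reader():  # a reader: no parameters, yields single data entries
        for i in range(n):
            yield i
    return reader

def double(reader):
    """A reader decorator: takes a reader, returns a new reader."""
    def new_reader():
        for item in reader():
            yield item * 2
    return new_reader

reader = double(counter_creator(5))
print list(reader())  # prints [0, 2, 4, 6, 8]
```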
## Data Reader Interface
Indeed, a *data reader* doesn't have to be a function that reads and yields data items. It can be any function with no parameter that creates an iterable (anything that can be used in `for x in iterable`):
```
iterable = data_reader()
```
Each element produced by the iterable should be a **single** entry of data, **not** a mini batch. That entry of data could be a single item, or a tuple of items. Each item should be of a [supported type](http://www.paddlepaddle.org/doc/ui/data_provider/pydataprovider2.html?highlight=dense_vector#input-types) (e.g., a numpy 1d array of float32, an int, or a list of ints).
An example implementation for single item data reader creator:
```python
def reader_creator_random_image(width, height):
    def reader():
        while True:
            yield numpy.random.uniform(-1, 1, size=width*height)
    return reader
```
An example implementation for multiple item data reader creator:
```python
def reader_creator_random_image_and_label(width, height, label):
    def reader():
        while True:
            yield numpy.random.uniform(-1, 1, size=width*height), label
    return reader
```
## Usage
The data reader, the mapping from reader items to data layers, the batch size, and the total number of passes are passed into `paddle.train`:
```python
# two data layers are created:
image_layer = paddle.layer.data("image", ...)
label_layer = paddle.layer.data("label", ...)
# ...
paddle.train(paddle.dataset.mnist, {"image":0, "label":1}, 128, 10, ...)
```
## Data Reader Decorator
A *data reader decorator* takes one or more data readers and returns a new data reader. It is similar to a [python decorator](https://wiki.python.org/moin/PythonDecorators), but it does not use `@` syntax.
Since data readers have a strict interface (no parameters, return a single data entry), they can be used flexibly via data reader decorators. Following are a few examples:
### Prefetch Data
Since reading data may take time and training cannot proceed without data, it is generally a good idea to prefetch it.
Use `paddle.reader.buffered` to prefetch data:
```python
buffered_reader = paddle.reader.buffered(paddle.dataset.mnist, 100)
```
`buffered_reader` will try to buffer (prefetch) `100` data entries.
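For intuition only, a rough sketch of what such a buffering decorator could look like is shown below. This is an assumption about the mechanism (a bounded queue filled by a background thread), not the actual implementation of `paddle.reader.buffered`:

```python
# Illustrative sketch of a buffering (prefetch) decorator; this is NOT the
# actual implementation of paddle.reader.buffered.
import threading
import Queue  # Python 2, matching the examples in this document

def buffered(reader, size):
    end = object()  # sentinel marking the end of the stream

    def data_reader():
        q = Queue.Queue(maxsize=size)

        def producer():
            for item in reader():
                q.put(item)  # blocks while the buffer is full
            q.put(end)

        t = threading.Thread(target=producer)
        t.daemon = True
        t.start()

        while True:
            item = q.get()
            if item is end:
                break
            yield item

    return data_reader
```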
### Compose Multiple Data Readers
For example, we want to use a source of real images (reusing mnist dataset), and a source of random images as input for [Generative Adversarial Networks](https://arxiv.org/abs/1406.2661).
We can do:
```python
def reader_creator_random_image(width, height):
    def reader():
        while True:
            yield numpy.random.uniform(-1, 1, size=width*height)
    return reader

def reader_creator_bool(t):
    def reader():
        while True:
            yield t
    return reader

true_reader = reader_creator_bool(True)
false_reader = reader_creator_bool(False)

reader = paddle.reader.compose(paddle.dataset.mnist, reader_creator_random_image(20, 20), true_reader, false_reader)
# Index 1 is skipped because paddle.dataset.mnist produces two items per data
# entry, and we don't care about the second item at this time.
paddle.train(reader, {"true_image":0, "fake_image": 2, "true_label": 3, "false_label": 4}, ...)
```
### Shuffle
Given a shuffle buffer size `n`, `paddle.reader.shuffle` returns a data reader that buffers `n` data entries and shuffles them before the data entries are read.
Example:
```python
reader = paddle.reader.shuffle(paddle.dataset.mnist, 512)
```
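For intuition, a buffered-shuffle decorator could be sketched as follows; this is illustrative only and not the actual `paddle.reader.shuffle` implementation:

```python
# Illustrative sketch of a buffered-shuffle decorator; not the actual
# implementation of paddle.reader.shuffle.
import random

def shuffle(reader, buf_size):
    def data_reader():
        buf = []
        for item in reader():
            buf.append(item)
            if len(buf) >= buf_size:
                random.shuffle(buf)
                for b in buf:
                    yield b
                buf = []
        # flush whatever remains in the buffer
        random.shuffle(buf)
        for b in buf:
            yield b
    return data_reader
```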
## Q & A
### Why return only a single entry, but not a mini batch?
If a mini batch were returned, the data reader would need to take care of batch size. But batch size is a training concept; it makes more sense for the user to specify it as a parameter of `train`.

Practically, always returning a single entry makes reusing existing data readers much easier (e.g., if an existing reader returned 3 entries instead of a single entry, training code would be more complex because it would need to handle cases like a batch size of 2).
### Why use a dictionary but not a list to provide mapping?
We decided to use a dictionary (`{"image":0, "label":1}`) instead of a list (`["image", "label"]`) because users can easily reuse an item (e.g., using `{"image_a":0, "image_b":0, "label":1}`) or skip an item (e.g., using `{"image_a":0, "label":2}`).
### How to create a custom data reader creator
```python
def image_reader_creator(image_path, label_path, n):
    def reader():
        f = open(image_path)
        l = open(label_path)
        images = numpy.fromfile(
            f, 'ubyte', count=n * 28 * 28).reshape((n, 28 * 28)).astype('float32')
        images = images / 255.0 * 2.0 - 1.0
        labels = numpy.fromfile(l, 'ubyte', count=n).astype("int")
        for i in xrange(n):
            yield images[i, :], labels[i]  # a single entry of data is created each time
        f.close()
        l.close()
    return reader

# image_reader_creator creates a reader
reader = image_reader_creator("/path/to/image_file", "/path/to/label_file", 1024)
paddle.train(reader, {"image":0, "label":1}, ...)
```
### How is `paddle.train` implemented
An example implementation of paddle.train could be:
```python
import itertools

def make_minibatch(reader, minibatch_size):
    def ret():
        r = reader()
        while True:
            buf = list(itertools.islice(r, minibatch_size))
            if not buf:
                break
            yield buf
    return ret

def train(reader, mapping, batch_size, total_pass):
    for pass_idx in range(total_pass):
        # this loop will never end in online learning.
        for mini_batch in make_minibatch(reader, batch_size)():
            do_forward_backward(mini_batch, mapping)
```
@@ -4,6 +4,8 @@ Installing from Sources
* [1. Download and Setup](#download)
* [2. Requirements](#requirements)
* [3. Build on Ubuntu](#ubuntu)
+ * [4. Build on Centos](#centos)
## <span id="download">Download and Setup</span>
You can download PaddlePaddle from the [github source](https://github.com/PaddlePaddle/Paddle).
@@ -16,9 +18,10 @@ cd paddle
To compile the source code, your computer must be equipped with the following dependencies.
- - **Compiler**: GCC >= 4.8 or Clang >= 3.3 (AppleClang >= 5.1)
+ - **Compiler**: GCC >= 4.8 or Clang >= 3.3 (AppleClang >= 5.1) and gfortran compiler
- - **CMake**: version >= 3.0 (at least CMake 3.4 on Mac OS X)
+ - **CMake**: CMake >= 3.0 (at least CMake 3.4 on Mac OS X)
- **BLAS**: MKL, OpenBlas or ATLAS
+ - **Python**: only support Python 2.7
**Note:** For CUDA 7.0 and CUDA 7.5, GCC 5.0 and up are not supported!
For CUDA 8.0, GCC versions later than 5.3 are not supported!
@@ -64,7 +67,8 @@ As a simple example, consider the following:
1. **BLAS Dependencies(optional)**
- Paddle will find BLAS from system's default path. But you can specify MKL, OpenBLAS or ATLAS via `MKL_ROOT`, `OPENBLAS_ROOT` or `ATLAS_ROOT`.
+ CMake will search BLAS libraries from system. If not found, OpenBLAS will be downloaded, built and installed automatically.
+ To utilize preinstalled BLAS, you can simply specify MKL, OpenBLAS or ATLAS via `MKL_ROOT`, `OPENBLAS_ROOT` or `ATLAS_ROOT`.
```bash
# specify MKL
@@ -94,12 +98,78 @@ As a simple example, consider the following:
### Install Dependencies
- - **CPU Dependencies**
+ - **Paddle Dependencies**
```bash
# necessary
sudo apt-get update
- sudo apt-get install -y g++ make cmake build-essential libatlas-base-dev python python-pip libpython-dev git
+ sudo apt-get install -y git curl gcc g++ gfortran make build-essential automake
sudo apt-get install -y python python-pip python-numpy libpython-dev bison
sudo pip install 'protobuf==3.1.0.post1'
# install cmake 3.4
curl -sSL https://cmake.org/files/v3.4/cmake-3.4.1.tar.gz | tar -xz && \
cd cmake-3.4.1 && ./bootstrap && make -j4 && sudo make install && \
cd .. && rm -rf cmake-3.4.1
```
- **GPU Dependencies (optional)**
To build GPU version, you will need the following installed:
1. a CUDA-capable GPU
2. A supported version of Linux with a gcc compiler and toolchain
3. NVIDIA CUDA Toolkit (available at http://developer.nvidia.com/cuda-downloads)
4. NVIDIA cuDNN Library (available at https://developer.nvidia.com/cudnn)
The CUDA development environment relies on tight integration with the host development environment,
including the host compiler and C runtime libraries, and is therefore only supported on
distribution versions that have been qualified for this CUDA Toolkit release.
After downloading cuDNN library, issue the following commands:
```bash
sudo tar -xzf cudnn-7.5-linux-x64-v5.1.tgz -C /usr/local
sudo chmod a+r /usr/local/cuda/include/cudnn.h /usr/local/cuda/lib64/libcudnn*
```
Then you need to set LD\_LIBRARY\_PATH, PATH environment variables in ~/.bashrc.
```bash
export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
export PATH=/usr/local/cuda/bin:$PATH
```
### Build and Install
As usual, the best option is to create a build folder under the paddle project directory.
```bash
mkdir build && cd build
```
Finally, you can build and install PaddlePaddle:
```bash
# you can add build option here, such as:
cmake .. -DCMAKE_INSTALL_PREFIX=<path to install>
# please use sudo make install, if you want to install PaddlePaddle into the system
make -j `nproc` && make install
# set PaddlePaddle installation path in ~/.bashrc
export PATH=<path to install>/bin:$PATH
# install PaddlePaddle Python modules.
sudo pip install <path to install>/opt/paddle/share/wheels/*.whl
```
## <span id="centos">Build on Centos 7</span>
### Install Dependencies
- **CPU Dependencies**
```bash
# necessary
sudo yum update
sudo yum install -y epel-release
sudo yum install -y make cmake3 python-devel python-pip gcc-gfortran swig git
sudo pip install wheel numpy
sudo pip install 'protobuf>=3.0.0'
```
@@ -142,7 +212,7 @@ Finally, you can build and install PaddlePaddle:
```bash
# you can add build option here, such as:
- cmake .. -DCMAKE_INSTALL_PREFIX=<path to install>
+ cmake3 .. -DCMAKE_INSTALL_PREFIX=<path to install>
# please use sudo make install, if you want to install PaddlePaddle into the system
make -j `nproc` && make install
# set PaddlePaddle installation path in ~/.bashrc
......
@@ -12,7 +12,7 @@ The PaddlePaddle project provides official `Docker <https://www.docker.com/>`_ images.
Docker image versions provided by PaddlePaddle
--------------------------------
- We provide 12 `Docker images <https://hub.docker.com/r/paddledev/paddle/tags/>`_ ; their image name is :code:`paddle-dev/paddle`, and the tags are
+ We provide 12 `Docker images <https://hub.docker.com/r/paddledev/paddle/tags/>`_ ; their image name is :code:`paddledev/paddle`, and the tags are
+-----------------+------------------+------------------------+-----------------------+
| | normal | devel | demo |
@@ -45,7 +45,7 @@ Docker image versions provided by PaddlePaddle
if cat /proc/cpuinfo | grep -q avx ; then echo "Support AVX"; else echo "Not support AVX"; fi
- If the output is :code:`Support AVX`, you can choose an AVX version of PaddlePaddle from the table above; otherwise you need the non-AVX build. To use the plain-CPU devel image, refer to it as :code:`paddle-dev/paddle:cpu-devel-latest`.
+ If the output is :code:`Support AVX`, you can choose an AVX version of PaddlePaddle from the table above; otherwise you need the non-AVX build. To use the plain-CPU devel image, refer to it as :code:`paddledev/paddle:cpu-devel-latest`.
The images provided by PaddlePaddle do not run any command by default. To run PaddlePaddle, you need to enter the image and run a PaddlePaddle
program, or customize an image with a startup script. For details, see the note :code:`Accessing the PaddlePaddle image via ssh`.
......
@@ -16,70 +16,71 @@ Developers can work on PaddlePaddle using Docker. This allows
developers to work on different platforms -- Linux, Mac OS X, and
Windows -- in a consistent way.
- The general development workflow with Docker and CMake is as follows:
+ 1. Build the Development Environment as a Docker Image
- 1. Get the source code of Paddle:
   .. code-block:: bash
-      git clone https://github.com/PaddlePaddle/Paddle.git
+      git clone --recursive https://github.com/PaddlePaddle/Paddle
+      cd Paddle
+      docker build -t paddle:dev -f paddle/scripts/docker/Dockerfile .
- 2. Build a development Docker image :code:`paddle:dev` from the source
-    code. This image contains all the development tools and
-    dependencies of PaddlePaddle.
+    Note that by default :code:`docker build` wouldn't import source
+    tree into the image and build it. If we want to do that, we need
+    to set a build arg:
   .. code-block:: bash
-      cd paddle
-      docker build -t paddle:dev -f paddle/scripts/docker/Dockerfile .
+      docker build -t paddle:dev -f paddle/scripts/docker/Dockerfile --build-arg BUILD_AND_INSTALL=ON .
+ 2. Run the Development Environment
- Sometimes docker build might suffer from a slow network connection to the official Ubuntu apt-source servers. In such case, we can specify an apt-source mirror server that is geologically nearer to us. In the following example, we specified an apt-source server that responds fast in China.You can specify the UBUNTU MIRROR with :code:`--build-arg UBUNTU_MIRROR` like the example below.
+    Once we got the image :code:`paddle:dev`, we can use it to develop
+    Paddle by mounting the local source code tree into a container that
+    runs the image:
   .. code-block:: bash
-      docker build \
-      --build-arg UBUNTU_MIRROR="http://mirrors.163.com" \
-      -t paddle:dev \
-      -f paddle/scripts/docker/Dockerfile .
+      docker run -d -p 2202:22 -v $PWD:/paddle paddle:dev
+    This runs a container of the development environment Docker image
+    with the local source tree mounted to :code:`/paddle` of the
+    container.
+    Note that the default entry-point of :code:`paddle:dev` is
+    :code:`sshd`, and above :code:`docker run` commands actually starts
+    an SSHD server listening on port 2202. This allows us to log into
+    this container with:
+    .. code-block:: bash
+       ssh root@localhost -p 2202
- 3. Run the image as a container and mounting local source code
-    directory into the container. This allows us to change the code on
-    the host and build it within the container.
+    Usually, I run above commands on my Mac. I can also run them on a
+    GPU server :code:`xxx.yyy.zzz.www` and ssh from my Mac to it:
   .. code-block:: bash
-      docker run \
-      -d \
-      --name paddle \
-      -p 2022:22 \
-      -v $PWD:/paddle \
-      paddle:dev
+      my-mac$ ssh root@xxx.yyy.zzz.www -p 2202
- where :code:`-d` makes the container running in background,
- :code:`--name paddle` allows us to run a nginx container to serve
- documents in this container, :code:`-p 2022:22` allows us to SSH
- into this container, :code:`-v $PWD:/paddle` shares the source code
- on the host with the container.
+ 3. Build and Install Using the Development Environment
- 4. SSH into the container:
+    Once I am in the container, I can use
+    :code:`paddle/scripts/docker/build.sh` to build, install, and test
+    Paddle:
   .. code-block:: bash
-      ssh root@localhost -p 2022
+      /paddle/paddle/scripts/docker/build.sh
- 5. We can edit the source code in the container or on this host. Then
-    we can build using cmake
+    This builds everything about Paddle in :code:`/paddle/build`. And
+    we can run unit tests there:
   .. code-block:: bash
-      cd /paddle  # where paddle source code has been mounted into the container
-      mkdir -p build
-      cd build
-      cmake -DWITH_TESTING=ON ..
-      make -j `nproc`
-      CTEST_OUTPUT_ON_FAILURE=1 ctest
+      cd /paddle/build
+      ctest
CPU-only and GPU Images
......
@@ -32,7 +32,7 @@ An example of pooling_layer usage follows; see :ref:`api_trainer_config_helpers
- `pooling_type` currently supports two values: MaxPooling() and AvgPooling().
- - When `agg_level=AggregateLevel.TIMESTEP` (the default):
+ - When `agg_level=AggregateLevel.EACH_TIMESTEP` (the default):
  - Effect: a double-level sequence is reduced to a 0-level sequence, or a single-level sequence is reduced to a 0-level sequence
  - Input: a double-level sequence, or a single-level sequence
@@ -54,7 +54,7 @@ An example of last_seq usage follows ( :ref:`api_trainer_config_helpers_layers_first_
last = last_seq(input=layer,
                agg_level=AggregateLevel.EACH_SEQUENCE)
- - When `agg_level=AggregateLevel.TIMESTEP` (the default):
+ - When `agg_level=AggregateLevel.EACH_TIMESTEP` (the default):
  - Effect: a double-level sequence is reduced to a 0-level sequence, or a single-level sequence is reduced to a 0-level sequence
  - Input: a double-level sequence or a single-level sequence
......
@@ -10,6 +10,7 @@
usage/cmd_parameter/index_cn.rst
usage/concepts/use_concepts_cn.rst
usage/cluster/cluster_train_cn.md
+ usage/k8s/k8s_basis_cn.md
usage/k8s/k8s_cn.md
usage/k8s/k8s_distributed_cn.md
......
@@ -6,7 +6,7 @@
In this article, we explain how to run distributed Paddle training jobs on clusters. We will use the [recommendation](https://github.com/baidu/Paddle/tree/develop/demo/recommendation) demo as an example to create a distributed version of the single-process training.
- The [scripts](https://github.com/baidu/Paddle/tree/develop/paddle/scripts/cluster_train) used in this article launch distributed jobs via SSH. They can also serve as a reference for users running more sophisticated cluster management systems such as MPI and [Kubernetes](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/k8s).
+ The [scripts](https://github.com/baidu/Paddle/tree/develop/paddle/scripts/cluster_train) used in this article launch distributed jobs via SSH. They can also serve as a reference for users running more sophisticated cluster management systems such as MPI and [Kubernetes](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/k8s).
## Prerequisite
......
@@ -2,7 +2,7 @@
In this article, we explain how to run distributed Paddle training jobs on clusters. We will create the distributed version of the single-process training example, [recommendation](https://github.com/baidu/Paddle/tree/develop/demo/recommendation).
- [Scripts](https://github.com/baidu/Paddle/tree/develop/paddle/scripts/cluster_train) used in this article launch distributed jobs via SSH. They also work as a reference for users running more sophisticated cluster management systems like MPI and [Kubernetes](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/k8s).
+ [Scripts](https://github.com/baidu/Paddle/tree/develop/paddle/scripts/cluster_train) used in this article launch distributed jobs via SSH. They also work as a reference for users running more sophisticated cluster management systems like MPI and [Kubernetes](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/k8s).
## Prerequisite
......
@@ -127,11 +127,6 @@
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
- <tr>
- <td class="left">allow_inefficient_sparse_update</td>
- <td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
- </tr>
<tr>
<td class="left">start_pass</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
......
@@ -127,11 +127,6 @@ It looks like there are a lot of arguments. However, most of them are for developers.
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
- <tr>
- <td class="left">allow_inefficient_sparse_update</td>
- <td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
- </tr>
<tr>
<td class="left">start_pass</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
......
@@ -306,10 +306,6 @@
  - Indicates whether to show log details of the sparse parameter distribution on the parameter server.
  - Type: bool (default: 0).
- * `--allow_inefficient_sparse_update`
-   - Indicates whether to allow inefficient sparse updates.
-   - Type: bool (default: 0).
* `--check_sparse_distribution_batches`
  - Run a sparse parameter distribution check every this many batches.
  - Type: int32 (default: 100).
......
@@ -310,10 +310,6 @@
  - show log details for sparse parameter distribution in pserver.
  - type: bool (default: 0).
- * `--allow_inefficient_sparse_update`
-   - Whether to allow inefficient sparse update.
-   - type: bool (default: 0).
* `--check_sparse_distribution_batches`
  - Running sparse parameter distribution check every so many batches.
  - type: int32 (default: 100).
......
This diff is collapsed.
# Introduction to Kubernetes
[*Kubernetes*](http://kubernetes.io/) is an open-source container cluster management system from Google. It provides application deployment, maintenance, and scaling mechanisms, and makes it easy to manage containerized applications running across machines. Kubernetes can run on physical or virtual machines and can be deployed to public clouds such as [AWS](http://kubernetes.io/docs/getting-started-guides/aws), [Azure](http://kubernetes.io/docs/getting-started-guides/azure/), and [GCE](http://kubernetes.io/docs/getting-started-guides/gce). Before describing distributed training, you need a basic understanding of [Kubernetes](http://kubernetes.io/), so the Kubernetes concepts used in this article are briefly introduced below.
- [*Node*](http://kubernetes.io/docs/admin/node/) represents a worker node in a Kubernetes cluster. A node can be a physical or a virtual machine; a Kubernetes cluster is made up of nodes and master nodes.
- [*Pod*](http://kubernetes.io/docs/user-guide/pods/) is a group of one or more containers and is the smallest scheduling unit in Kubernetes; all containers in a pod are scheduled onto the same node. Containers in a pod share the NET, PID, IPC, UTS, and other Linux namespaces. Because they share the NET namespace, they use the same IP address and can talk to each other via *localhost*; different pods can reach each other by IP address.
- [*Job*](http://kubernetes.io/docs/user-guide/jobs/) describes a batch job running on Kubernetes. One run is called a job; each job usually consists of one or more pods. After the job starts, these pods are created and run a program; the job succeeds when the program finishes and returns 0, and different retry policies can be configured for failures.
- [*Volume*](http://kubernetes.io/docs/user-guide/volumes/) is a shared directory accessible to all containers in a pod and is also how containers share files with the node. Files inside a container are ephemeral: when the container is destroyed for any reason, its files disappear with it. With a volume, these files can be persisted. Kubernetes supports many volume types, such as hostPath (a directory on the host), gcePersistentDisk, and awsElasticBlockStore.
- [*Namespaces*](https://kubernetes.io/docs/user-guide/namespaces/) provide logical grouping of resources. Every resource object created in Kubernetes (such as the pods and jobs above) belongs to a namespace; within one namespace resource names must be unique, while names may repeat across namespaces. Namespaces mainly exist to group objects for easier management. This article only uses the default namespace.
- [*PersistentVolume*](https://kubernetes.io/docs/user-guide/persistent-volumes/): combined with a [*PersistentVolumeClaim*](https://kubernetes.io/docs/user-guide/persistent-volumes/#persistentvolumeclaims), it describes an external storage service as a uniform resource inside Kubernetes, which simplifies storage management and referencing storage from pods.
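To make the PersistentVolume/PersistentVolumeClaim pairing above concrete, here is a minimal sketch that registers a hypothetical NFS export as a PV and then claims it; the server address, path, and capacity are illustrative placeholders, not values required by this tutorial:
```bash
# Sketch only: 172.16.0.10 and /export/paddle are hypothetical.
kubectl create -f - <<EOF
apiVersion: v1
kind: PersistentVolume
metadata:
  name: paddle-data-pv
spec:
  capacity:
    storage: 10Gi
  accessModes:
    - ReadWriteMany
  nfs:
    server: 172.16.0.10
    path: /export/paddle
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: paddle-data-pvc
spec:
  accessModes:
    - ReadWriteMany
  resources:
    requests:
      storage: 10Gi
EOF
```
A pod can then reference the claim through a `persistentVolumeClaim` volume instead of a hostPath.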
# Deploying a Kubernetes Cluster
Kubernetes offers many cluster deployment options, which this document does not repeat. A few common deployment methods are listed here:
- [*minikube*](https://kubernetes.io/docs/getting-started-guides/minikube/): quickly start a single-node Kubernetes server locally, convenient for local verification and testing (a quick local sketch follows after this list).
- [*kubeadm*](http://kubernetes.io/docs/getting-started-guides/kubeadm/): quickly deploy a cluster on different operating systems and different hosts (Bare-Metal, AWS, GCE).
- [*AWS EC2*](https://kubernetes.io/docs/getting-started-guides/aws/): quickly deploy a cluster on AWS.
- [*Bare-Metal*](https://kubernetes.io/docs/getting-started-guides/centos/centos_manual_config/): deploy manually on physical machines.
You can consult [this table](https://kubernetes.io/docs/getting-started-guides/#table-of-solutions) to choose the option that best fits your scenario.
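If you only want to verify things locally before committing to one of the options above, a quick sketch with minikube (assuming minikube and kubectl are already installed) is:
```bash
minikube start        # start a local single-node cluster
kubectl cluster-info  # confirm kubectl can reach the apiserver
kubectl get nodes     # the single minikube node should show as Ready
```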
# Choosing a Storage Solution
Containers do not preserve data generated at runtime; data produced by a job or application running in a container disappears when the container is destroyed. To complete a distributed machine-learning training task, an external storage service is needed to hold the training data and the training output.
Common storage options include:
- [*NFS*](https://github.com/kubernetes/kubernetes/tree/master/examples/volumes/nfs): shares a directory on disk with other machines on the network. It is simple to deploy and configure and is good for validating with small amounts of data, but it provides no distributed storage, high availability, or redundancy. NFS deployment is described [here](http://www.tecmint.com/how-to-setup-nfs-server-in-linux/) (a mounting sketch follows after this list).
- [*GlusterFS*](http://gluster.readthedocs.io/en/latest/Quick-Start-Guide/Quickstart/): a networked distributed file system; it can be used in Kubernetes following [this](https://github.com/kubernetes/kubernetes/tree/master/examples/volumes/glusterfs) example.
- [*Ceph*](http://docs.ceph.com/docs/master/): a distributed file system supporting rbd, a POSIX API (ceph fs), and an object storage API; see [here](https://kubernetes.io/docs/user-guide/volumes/#rbd).
- [*MooseFS*](https://moosefs.com/documentation.html): a distributed storage system. It must first be mounted on the server node and then mounted into the container through a Kubernetes hostPath volume.
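As a concrete illustration of the simplest option above (NFS mounted on the nodes and exposed via hostPath, in the same spirit as the MooseFS note), a hedged sketch is shown below; the NFS server address and paths are placeholders:
```bash
# Run on every Kubernetes node (sketch; 172.16.0.10:/export/paddle is hypothetical).
sudo mkdir -p /mnt/nfs
sudo mount -t nfs 172.16.0.10:/export/paddle /mnt/nfs
# Pods can then mount /mnt/nfs through a hostPath volume, in the same way the
# job YAML later in this document mounts /home/work/mfs.
```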
# Configuring kubectl
## Installing kubectl
```
# OS X
curl -LO https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/darwin/amd64/kubectl
# Linux
curl -LO https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/linux/amd64/kubectl
# Windows
curl -LO https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/windows/amd64/kubectl.exe
```
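The downloaded file is not executable yet; on Linux or OS X a typical follow-up is:
```bash
chmod +x ./kubectl
sudo mv ./kubectl /usr/local/bin/kubectl
kubectl version --client   # verify the binary runs
```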
## Configuring kubectl to access your Kubernetes cluster
Edit the `~/.kube/config` configuration file and change the `Master-IP` address. If SSL authentication is used, you also need to configure `certificate-authority` and the user certificates under `users`. If the cluster is accessed without SSL (for example via port 8080), these certificate settings can be removed.
```
apiVersion: v1
clusters:
- cluster:
certificate-authority: /path/to/ca.crt
server: https://[Master-IP]:443
name: minikube
contexts:
- context:
cluster: minikube
user: minikube
name: minikube
current-context: minikube
kind: Config
preferences: {}
users:
- name: minikube
user:
client-certificate: /path/to/apiserver.crt
client-key: /Users/wuyi/.minikube/apiserver.key
```
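After editing the file, you can confirm that kubectl can actually reach the cluster, for example:
```bash
kubectl config view   # show the merged kubeconfig currently in effect
kubectl get nodes     # lists the cluster's nodes if access is configured correctly
```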
...@@ -2,168 +2,50 @@ ...@@ -2,168 +2,50 @@
The previous article showed how to run a single-node PaddlePaddle training job (Job) on a Kubernetes cluster. In this article we describe how to run distributed PaddlePaddle training jobs on a Kubernetes cluster. For PaddlePaddle distributed training, the article [Cluster Training](https://github.com/baidu/Paddle/blob/develop/doc/cluster/opensource/cluster_train.md) describes an approach that dispatches tasks remotely over SSH; in contrast, this article shows how to quickly build a PaddlePaddle container cluster on the Kubernetes container management platform and run distributed training there. The previous article showed how to run a single-node PaddlePaddle training job (Job) on a Kubernetes cluster. In this article we describe how to run distributed PaddlePaddle training jobs on a Kubernetes cluster. For PaddlePaddle distributed training, the article [Cluster Training](https://github.com/baidu/Paddle/blob/develop/doc/cluster/opensource/cluster_train.md) describes an approach that dispatches tasks remotely over SSH; in contrast, this article shows how to quickly build a PaddlePaddle container cluster on the Kubernetes container management platform and run distributed training there.
## Basic Kubernetes Concepts For Kubernetes concepts and how to set up and configure a Kubernetes cluster, see [k8s_basis](./k8s_basis_cn.md).
[*Kubernetes*](http://kubernetes.io/) is an open-source container cluster management system from Google. It provides application deployment, maintenance, and scaling mechanisms, and makes it easy to manage containerized applications running across machines. Kubernetes can run on physical or virtual machines and can be deployed to public clouds such as [AWS](http://kubernetes.io/docs/getting-started-guides/aws), [Azure](http://kubernetes.io/docs/getting-started-guides/azure/), and [GCE](http://kubernetes.io/docs/getting-started-guides/gce). Before describing distributed training, you need a basic understanding of [Kubernetes](http://kubernetes.io/), so the Kubernetes concepts used in this article are briefly introduced below.
- [*Node*](http://kubernetes.io/docs/admin/node/) represents a worker node in a Kubernetes cluster. A node can be a physical or a virtual machine; a Kubernetes cluster is made up of nodes and master nodes.
- [*Pod*](http://kubernetes.io/docs/user-guide/pods/) is a group of one or more containers and is the smallest scheduling unit in Kubernetes; all containers in a pod are scheduled onto the same node. Containers in a pod share the NET, PID, IPC, UTS, and other Linux namespaces. Because they share the NET namespace, they use the same IP address and can talk to each other via *localhost*; different pods can reach each other by IP address.
- [*Job*](http://kubernetes.io/docs/user-guide/jobs/) is a batch job running on Kubernetes; one run is called a job, and each job usually consists of one or more pods.
- [*Volume*](http://kubernetes.io/docs/user-guide/volumes/) is a shared directory accessible to all containers in a pod and is also how containers share files with the node. Files inside a container are ephemeral: when the container is destroyed for any reason, its files disappear with it. With a volume, these files can be persisted. Kubernetes supports many volume types, such as hostPath (a directory on the host), gcePersistentDisk, and awsElasticBlockStore.
- [*Namespaces*](http://kubernetes.io/docs/user-guide/volumes/) provide logical grouping of resources. Every resource object created in Kubernetes (such as the pods and jobs above) belongs to a namespace; within one namespace resource names must be unique, while names may repeat across namespaces. Namespaces mainly exist to group objects for easier management. This article only uses the default namespace.
## Overall Design ## Overall Design
### Deploying a Kubernetes Cluster Before training, the user splits the configuration and training data and places them in a pre-allocated directory on a distributed file system (different distributed file systems must be mounted in their prescribed way before the data is imported). During training, the program copies files from this directory into the container, trains, and saves the results back to this directory. The overall architecture is shown below:
First, we need a Kubernetes cluster in which all nodes and pods can communicate with each other. For setting up a Kubernetes cluster, refer to the [official documentation](http://kubernetes.io/docs/getting-started-guides/kubeadm/); a later article will also cover setting one up on AWS. This article assumes you have a few physical machines available and can deploy Kubernetes on them following the official documentation. In the environment used here, every node in the Kubernetes cluster mounts an [MFS](http://moosefs.org/) (Moose File System, a distributed file system) shared directory, which we use to store the training files and the final output model. For installing and deploying MFS, see the [MooseFS documentation](https://moosefs.com/documentation.html). Before training, the user splits the configuration and training data and places them in the MFS directory; during training, the program copies files from this directory into the container, trains, and saves the results back to this directory. The overall architecture is shown below:
![PaddlePaddle on Kubernetes architecture](src/k8s-paddle-arch.png) ![PaddlePaddle on Kubernetes architecture](src/k8s-paddle-arch.png)
The figure above depicts a 3-node distributed training scenario. Every node in the Kubernetes cluster mounts an MFS directory, which can be mounted into containers as a volume. Kubernetes creates 3 pods for this training run and schedules them onto 3 nodes, each pod containing one PaddlePaddle container. After the containers are created, the pserver and trainer processes start and read the data in the volume to carry out this distributed training. The figure above depicts a 3-node distributed training scenario. On every pod, a directory of the distributed file system is mounted as a volume to hold the training data and the output results. Kubernetes creates 3 pods for this training run and schedules them onto 3 nodes, each pod containing one PaddlePaddle container. After the containers are created, the pserver and trainer processes start and read the data in the volume to carry out this distributed training.
### Using a Job
We use the Kubernetes job concept to represent one distributed training run. A job is a one-off task; after it finishes, Kubernetes destroys the containers it created and releases the related resources.
In Kubernetes, a job is described by a YAML file. This file mainly contains configuration such as the number of PaddlePaddle nodes, the number and values of the ports opened by `paddle pserver`, and the network interface to use; this information is passed to the program inside the container through environment variables.
In one distributed training run, the user decides how many PaddlePaddle nodes are needed, uploads the split training data and configuration files to the MFS shared directory, then writes the job YAML file for this run and submits it to the Kubernetes cluster to create and start the job. Following the description above, to run PaddlePaddle distributed training on an existing Kubernetes cluster, simply follow the steps below:
### Creating PaddlePaddle Nodes 1. [Build the PaddlePaddle image](#制作镜像)
1. [Upload the training files and the split data to shared storage](#上传训练文件)
After the Kubernetes master receives the request and parses the YAML file, it creates multiple pods (one per PaddlePaddle node) and schedules them onto the cluster's nodes. Each pod represents one PaddlePaddle node; once a pod has been assigned to a physical/virtual machine, Kubernetes starts the container inside it, and the container launches the `paddle pserver` and `paddle train` processes according to the environment variables in the YAML file. 1. [Write the YAML file for this training run and create a Kubernetes job](#创建Job)
1. [Inspect the output after training finishes](#查看输出)
### Starting Training
After the container starts, a script launches the distributed training run. When the `paddle train` process starts, it must know the IP addresses of the other nodes and this node's trainer_id. Since PaddlePaddle itself does not provide anything like service discovery, in the startup script used here each node queries the Kubernetes apiserver, using the job name, for the information of all pods belonging to this job (Kubernetes writes the apiserver address into every container's environment variables by default).
From this pod information each pod can be assigned a unique trainer_id. Here we sort the IP addresses of all pods and use each pod's position in the sorted order as that PaddlePaddle node's trainer_id. The startup script roughly works as follows:
1. Query the Kubernetes apiserver for pod information and assign trainer_id based on the IPs
1. Copy the training files from the MFS shared directory into the container
1. Parse the launch arguments of `paddle pserver` and `paddle train` from the environment variables and start the processes
1. During training, PaddlePaddle automatically saves the results on the node with trainer_id 0; the output path is set to the MFS directory so the output files are persisted
## Setup Procedure
Following the description above, running PaddlePaddle distributed training on an existing Kubernetes cluster mainly involves the following steps:
1. Build the PaddlePaddle image
1. Upload the training files and the split data to shared storage
1. Write the YAML file for this training run and create a Kubernetes job
1. Inspect the output after training finishes
Each of these steps is described below. Each of these steps is described below.
### Building the Image ### Building the Image
The PaddlePaddle image must provide the runtime environment for the `paddle pserver` and `paddle train` processes. Containers created from this image need the following two capabilities: The PaddlePaddle image must provide the runtime environment for the `paddle pserver` and `paddle train` processes. Containers created from this image need the following two capabilities:
- Copy the training files into the container - Copy the training files into the container
- Generate the launch arguments for the `paddle pserver` and `paddle train` processes and start training - Generate the launch arguments for the `paddle pserver` and `paddle train` processes and start training
Because the official image `paddledev/paddle:cpu-latest` already contains the PaddlePaddle executables but not the capabilities above, we can build a new image on top of it by adding a startup script. The image's *Dockerfile* is as follows: Because the official image `paddledev/paddle:cpu-latest` already contains the PaddlePaddle executables but not the capabilities above, we can build a new image on top of it by adding a startup script. See the reference image's [*Dockerfile*](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/usage/cluster/k8s/src/k8s_train/Dockerfile).
```Dockerfile
FROM paddledev/paddle:cpu-latest
MAINTAINER zjsxzong89@gmail.com
COPY start.sh /root/
COPY start_paddle.py /root/
CMD ["bash"," -c","/root/start.sh"]
```
The [start.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/usage/cluster/k8s/start.sh) script copies the training files into the container and then runs the [start_paddle.py](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/usage/cluster/k8s/start_paddle.py) script to start training; the tasks mentioned earlier, such as obtaining the other nodes' IP addresses and assigning the `trainer_id`, are all done in the `start_paddle.py` script.
At the beginning of the `start_paddle.py` script, the arguments are initialized and parsed.
```python
parser = argparse.ArgumentParser(prog="start_paddle.py",
description='simple tool for k8s')
args, train_args_list = parser.parse_known_args()
train_args = refine_unknown_args(train_args_list)
train_args_dict = dict(zip(train_args[:-1:2], train_args[1::2]))
podlist = getPodList()
```
The function `getPodList()` then queries the Kubernetes API for the information of all pods belonging to this job. Once all pods are in the running state (that is, all containers are up), `getIdMap(podlist)` is called to obtain the trainer_id.
```python
podlist = getPodList()
# need to wait until all pods are running
while not isPodAllRunning(podlist):
time.sleep(10)
podlist = getPodList()
idMap = getIdMap(podlist)
```
Inside `getIdMap(podlist)`, we read the IP address of every pod in `podlist`, sort the IPs, and use each pod's position in the sorted order as its trainer_id.
```python
def getIdMap(podlist):
'''
generate trainer_id by ip
'''
ips = []
for pod in podlist["items"]:
ips.append(pod["status"]["podIP"])
ips.sort()
idMap = {}
for i in range(len(ips)):
idMap[ips[i]] = i
return idMap
```
After obtaining `idMap`, the function `startPaddle(idMap, train_args_dict)` builds the launch arguments for `paddle pserver` and `paddle train` and starts the processes.
The main work in `startPaddle` is to derive the launch arguments for `paddle pserver` and `paddle train`. For the `paddle train` arguments, for example, it reads `PADDLE_NIC`, `PADDLE_PORT`, `PADDLE_PORTS_NUM`, and other values from the environment variables, and then looks up `trainerId` in `idMap` using the node's own IP address.
```python
program = 'paddle train'
args = " --nics=" + PADDLE_NIC
args += " --port=" + str(PADDLE_PORT)
args += " --ports_num=" + str(PADDLE_PORTS_NUM)
args += " --comment=" + "paddle_process_by_paddle"
ip_string = ""
for ip in idMap.keys():
ip_string += (ip + ",")
ip_string = ip_string.rstrip(",")
args += " --pservers=" + ip_string
args_ext = ""
for key, value in train_args_dict.items():
args_ext += (' --' + key + '=' + value)
localIP = socket.gethostbyname(socket.gethostname())
trainerId = idMap[localIP]
args += " " + args_ext + " --trainer_id=" + \
str(trainerId) + " --save_dir=" + JOB_PATH_OUTPUT
```
Build the image with `docker build`:
```bash ```bash
docker build -t your_repo/paddle:mypaddle . $ cd doc/howto/usage/k8s/src/k8s_train
$ docker build -t [YOUR_REPO]/paddle:mypaddle .
``` ```
Then push the successfully built image to an image registry. Then push the successfully built image to an image registry.
```bash ```bash
docker push your_repo/paddle:mypaddle docker push [YOUR_REPO]/paddle:mypaddle
``` ```
Note that `your_repo` in the commands above stands for the reader's own Docker registry address and must be replaced accordingly. Below, `your_repo/paddle:mypaddle` refers to the image built in this step. Note that `[YOUR_REPO]` in the commands above stands for the reader's own Docker registry address and must be replaced accordingly. Below, `[YOUR_REPO]/paddle:mypaddle` refers to the image built in this step.
### Uploading the Training Files ### Uploading the Training Files
This article uses PaddlePaddle's official [recommendation demo](http://www.paddlepaddle.org/doc/demo/index.html#recommendation) as the training content. We put the training files and data into a directory named after the job name and upload it to the MFS shared storage. Afterwards, the contents on MFS look roughly as follows: This article uses PaddlePaddle's official [recommendation demo](http://www.paddlepaddle.org/doc/demo/index.html#recommendation) as the training content. We put the training files and data into a directory named after the job name and upload it to the shared storage backing the volume (different distributed storage systems are mounted in different ways; mount the directory first and then copy the data). Afterwards, the contents of the volume look roughly as follows:
```bash ```bash
[root@paddle-kubernetes-node0 mfs]# tree -d [root@paddle-kubernetes-node0 mfs]# tree -d
...@@ -205,7 +87,7 @@ spec: ...@@ -205,7 +87,7 @@ spec:
path: /home/work/mfs path: /home/work/mfs
containers: containers:
- name: trainer - name: trainer
image: your_repo/paddle:mypaddle image: [YOUR_REPO]/paddle:mypaddle
command: ["bin/bash", "-c", "/root/start.sh"] command: ["bin/bash", "-c", "/root/start.sh"]
env: env:
- name: JOB_NAME - name: JOB_NAME
...@@ -289,8 +171,8 @@ I1116 09:10:17.123121 50 Util.cpp:155] commandline: ...@@ -289,8 +171,8 @@ I1116 09:10:17.123121 50 Util.cpp:155] commandline:
--ports_num=2 --comment=paddle_process_by_paddle --ports_num=2 --comment=paddle_process_by_paddle
--pservers=192.168.129.66,192.168.223.143,192.168.129.71 --pservers=192.168.129.66,192.168.223.143,192.168.129.71
--ports_num_for_sparse=2 --config=./trainer_config.py --ports_num_for_sparse=2 --config=./trainer_config.py
--trainer_count=4 --num_passes=10 --use_gpu=0 --trainer_count=4 --num_passes=10 --use_gpu=0
--log_period=50 --dot_period=10 --saving_period=1 --log_period=50 --dot_period=10 --saving_period=1
--local=0 --trainer_id=0 --local=0 --trainer_id=0
--save_dir=/home/jobpath/paddle-cluster-job/output --save_dir=/home/jobpath/paddle-cluster-job/output
I1116 09:10:17.123440 50 Util.cpp:130] Calling runInitFunctions I1116 09:10:17.123440 50 Util.cpp:130] Calling runInitFunctions
...@@ -310,3 +192,90 @@ I1116 09:10:18.019492 50 ParameterClient2.cpp:122] pserver 3 192.168.223.143: ...@@ -310,3 +192,90 @@ I1116 09:10:18.019492 50 ParameterClient2.cpp:122] pserver 3 192.168.223.143:
I1116 09:10:18.019716 50 ParameterClient2.cpp:122] pserver 4 192.168.129.71:7164 I1116 09:10:18.019716 50 ParameterClient2.cpp:122] pserver 4 192.168.129.71:7164
I1116 09:10:18.019836 50 ParameterClient2.cpp:122] pserver 5 192.168.129.71:7165 I1116 09:10:18.019836 50 ParameterClient2.cpp:122] pserver 5 192.168.129.71:7165
``` ```
## Additional Details
### Using Environment Variables
A Kubernetes Job that runs the training task in containers usually passes the Job's configuration through environment variables. `start_paddle.py` provides a startup script that converts these environment variables into paddle command-line arguments:
```
API = "/api/v1/namespaces/"
JOBSELECTOR = "labelSelector=job-name="
JOB_PATH = os.getenv("JOB_PATH") + "/" + os.getenv("JOB_NAME")
JOB_PATH_OUTPUT = JOB_PATH + "/output"
JOBNAME = os.getenv("JOB_NAME")
NAMESPACE = os.getenv("JOB_NAMESPACE")
PADDLE_NIC = os.getenv("CONF_PADDLE_NIC")
PADDLE_PORT = os.getenv("CONF_PADDLE_PORT")
PADDLE_PORTS_NUM = os.getenv("CONF_PADDLE_PORTS_NUM")
PADDLE_PORTS_NUM_SPARSE = os.getenv("CONF_PADDLE_PORTS_NUM_SPARSE")
PADDLE_SERVER_NUM = os.getenv("CONF_PADDLE_GRADIENT_NUM")
```
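These variables are normally injected by the Job definition (see the job YAML later in this document). For a local dry run of `start_paddle.py`, the same values could be exported by hand; the values below simply mirror that YAML:
```bash
export JOB_NAME=paddle-cluster-job
export JOB_PATH=/home/jobpath
export JOB_NAMESPACE=default
export CONF_PADDLE_NIC=eth0
export CONF_PADDLE_PORT=7164
export CONF_PADDLE_PORTS_NUM=2
export CONF_PADDLE_PORTS_NUM_SPARSE=2
export CONF_PADDLE_GRADIENT_NUM=3
```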
### Communication Between Pods
At the beginning of the `start_paddle.py` script, the arguments are initialized and parsed.
```python
parser = argparse.ArgumentParser(prog="start_paddle.py",
description='simple tool for k8s')
args, train_args_list = parser.parse_known_args()
train_args = refine_unknown_args(train_args_list)
train_args_dict = dict(zip(train_args[:-1:2], train_args[1::2]))
podlist = getPodList()
```
The function `getPodList()` then queries the Kubernetes API for the information of all pods belonging to this job. Once all pods are in the running state (that is, all containers are up), `getIdMap(podlist)` is called to obtain the trainer_id.
```python
podlist = getPodList()
# need to wait until all pods are running
while not isPodAllRunning(podlist):
time.sleep(10)
podlist = getPodList()
idMap = getIdMap(podlist)
```
* *Note*: `getPodList()` fetches all pods in the current namespace, so if other pods are already running there, this may cause errors. This way of managing cluster nodes will be replaced by [StatefulSets](https://kubernetes.io/docs/concepts/abstractions/controllers/statefulsets/) in the future.
Inside `getIdMap(podlist)`, we read the IP address of every pod in `podlist`, sort the IPs, and use each pod's position in the sorted order as its trainer_id.
```python
def getIdMap(podlist):
'''
generate trainer_id by ip
'''
ips = []
for pod in podlist["items"]:
ips.append(pod["status"]["podIP"])
ips.sort()
idMap = {}
for i in range(len(ips)):
idMap[ips[i]] = i
return idMap
```
After obtaining `idMap`, the function `startPaddle(idMap, train_args_dict)` builds the launch arguments for `paddle pserver` and `paddle train` and starts the processes.
### Launching the Tasks
The main work in `startPaddle` is to derive the launch arguments for `paddle pserver` and `paddle train`. For the `paddle train` arguments, for example, it reads `PADDLE_NIC`, `PADDLE_PORT`, `PADDLE_PORTS_NUM`, and other values from the environment variables, and then looks up `trainerId` in `idMap` using the node's own IP address.
```python
program = 'paddle train'
args = " --nics=" + PADDLE_NIC
args += " --port=" + str(PADDLE_PORT)
args += " --ports_num=" + str(PADDLE_PORTS_NUM)
args += " --comment=" + "paddle_process_by_paddle"
ip_string = ""
for ip in idMap.keys():
ip_string += (ip + ",")
ip_string = ip_string.rstrip(",")
args += " --pservers=" + ip_string
args_ext = ""
for key, value in train_args_dict.items():
args_ext += (' --' + key + '=' + value)
localIP = socket.gethostbyname(socket.gethostname())
trainerId = idMap[localIP]
args += " " + args_ext + " --trainer_id=" + \
str(trainerId) + " --save_dir=" + JOB_PATH_OUTPUT
```
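With three pods whose sorted IPs match the log excerpt shown earlier in this document, the command line assembled by this code would look roughly like the following (illustrative only; the extra arguments contributed by `train_args_dict` are omitted):
```bash
paddle train --nics=eth0 --port=7164 --ports_num=2 \
    --comment=paddle_process_by_paddle \
    --pservers=192.168.129.66,192.168.223.143,192.168.129.71 \
    --trainer_id=0 --save_dir=/home/jobpath/paddle-cluster-job/output
```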
doc/howto/usage/k8s/src/create_efs.png (image updated: 244.5 KB -> 236.1 KB)
apiVersion: batch/v1
kind: Job
metadata:
name: paddle-cluster-job
spec:
parallelism: 3
completions: 3
template:
metadata:
name: paddle-cluster-job
spec:
volumes:
- name: jobpath
hostPath:
path: /home/work/paddle_output
containers:
- name: trainer
image: registry.baidu.com/public/paddle:mypaddle
command: ["bin/bash", "-c", "/root/start.sh"]
env:
- name: JOB_NAME
value: paddle-cluster-job
- name: JOB_PATH
value: /home/jobpath
- name: JOB_NAMESPACE
value: default
- name: TRAIN_CONFIG_DIR
value: recommendation
- name: CONF_PADDLE_NIC
value: eth0
- name: CONF_PADDLE_PORT
value: "7164"
- name: CONF_PADDLE_PORTS_NUM
value: "2"
- name: CONF_PADDLE_PORTS_NUM_SPARSE
value: "2"
- name: CONF_PADDLE_GRADIENT_NUM
value: "3"
volumeMounts:
- name: jobpath
mountPath: /home/jobpath
restartPolicy: Never
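Assuming the manifest above is saved as `job.yaml` (the filename is arbitrary), the job can be submitted and watched with standard kubectl commands, for example:
```bash
kubectl create -f job.yaml
kubectl get pods -l job-name=paddle-cluster-job   # the Job controller labels its pods with job-name
kubectl logs <pod-name>                           # inspect one trainer's output
```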
FROM alpine
RUN apk update && apk upgrade && apk add coreutils
ADD quick_start /quick_start
ADD get_data.sh /bin/
RUN chmod +x /bin/get_data.sh
ENTRYPOINT ["/bin/get_data.sh"]
To build the PaddlePaddle data preparation image used in the tutorial [Distributed PaddlePaddle Training on AWS with Kubernetes](../../k8s_aws_en.md), run the following commands:
```
cp -r ../../../../../../demo/quick_start .
docker build . -t prepare-data-image-name
```
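The image's entrypoint is the `get_data.sh` script below, which reads `OUT_DIR` and `SPLIT_COUNT` from the environment. A hedged local invocation (the mount path and image name are placeholders) might look like:
```bash
docker run --rm \
    -e OUT_DIR=/mnt/efs/paddle-cluster-job \
    -e SPLIT_COUNT=3 \
    -v /mnt/efs:/mnt/efs \
    prepare-data-image-name
```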
#!/bin/sh
out_dir=$OUT_DIR
split_count=$SPLIT_COUNT
set -e
mkdir -p $out_dir
cp -r /quick_start $out_dir/
mkdir -p $out_dir/0/data
cd $out_dir/0/data
wget http://paddlepaddle.bj.bcebos.com/demo/quick_start_preprocessed_data/preprocessed_data.tar.gz
tar zxvf preprocessed_data.tar.gz
rm preprocessed_data.tar.gz
split -d --number=l/$split_count -a 5 train.txt train.
mv train.00000 train.txt
cd $out_dir
end=$(expr $split_count - 1)
for i in $(seq 1 $end); do
mkdir -p $i/data
cp -r 0/data/* $i/data
mv $i/data/train.`printf %05d $i` $i/data/train.txt
done;
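To make the `split` step above concrete: with `SPLIT_COUNT=3`, the script produces numbered shards and then renames the first one back to `train.txt`. A standalone sketch of the same GNU split invocation is:
```bash
# Splits train.txt by lines into 3 roughly equal parts:
#   train.00000  train.00001  train.00002
split -d --number=l/3 -a 5 train.txt train.
ls train.0000*
```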
FROM paddledev/paddle:cpu-latest
COPY start.sh /root/
COPY start_paddle.py /root/
RUN chmod +x /root/start.sh
CMD ["bash"," -c","/root/start.sh"]
To build the PaddlePaddle training image used in the tutorial [Distributed PaddlePaddle Training on AWS with Kubernetes](../../k8s_aws_en.md), run the following command:
```
docker build . -t train-image-name
```
#!/bin/sh #!/bin/sh
set -eu set -eu
jobconfig=${JOB_PATH}"/"${JOB_NAME}"/"${TRAIN_CONFIG_DIR} jobconfig=${JOB_PATH}"/"${JOB_NAME}"/"${TRAIN_CONFIG_DIR}
cd /root cd /root
cp -rf $jobconfig . cp -rf $jobconfig/* .
cd $TRAIN_CONFIG_DIR
python /root/start_paddle.py \ python /root/start_paddle.py \
--dot_period=10 \ --dot_period=10 \
--ports_num_for_sparse=$CONF_PADDLE_PORTS_NUM \ --ports_num=$CONF_PADDLE_PORTS_NUM \
--ports_num_for_sparse=$CONF_PADDLE_PORTS_NUM_SPARSE \
--log_period=50 \ --log_period=50 \
--num_passes=10 \ --num_passes=10 \
--trainer_count=4 \ --trainer_count=$TRAINER_COUNT \
--saving_period=1 \ --saving_period=1 \
--local=0 \ --local=0 \
--config=./trainer_config.py \ --config=trainer_config.lr.py \
--use_gpu=0 --use_gpu=0
...@@ -23,7 +23,6 @@ import argparse ...@@ -23,7 +23,6 @@ import argparse
API = "/api/v1/namespaces/" API = "/api/v1/namespaces/"
JOBSELECTOR = "labelSelector=job-name=" JOBSELECTOR = "labelSelector=job-name="
JOB_PATH = os.getenv("JOB_PATH") + "/" + os.getenv("JOB_NAME") JOB_PATH = os.getenv("JOB_PATH") + "/" + os.getenv("JOB_NAME")
JOB_PATH_DATA = JOB_PATH + "/data"
JOB_PATH_OUTPUT = JOB_PATH + "/output" JOB_PATH_OUTPUT = JOB_PATH + "/output"
JOBNAME = os.getenv("JOB_NAME") JOBNAME = os.getenv("JOB_NAME")
NAMESPACE = os.getenv("JOB_NAMESPACE") NAMESPACE = os.getenv("JOB_NAMESPACE")
...@@ -33,6 +32,8 @@ PADDLE_PORTS_NUM = os.getenv("CONF_PADDLE_PORTS_NUM") ...@@ -33,6 +32,8 @@ PADDLE_PORTS_NUM = os.getenv("CONF_PADDLE_PORTS_NUM")
PADDLE_PORTS_NUM_SPARSE = os.getenv("CONF_PADDLE_PORTS_NUM_SPARSE") PADDLE_PORTS_NUM_SPARSE = os.getenv("CONF_PADDLE_PORTS_NUM_SPARSE")
PADDLE_SERVER_NUM = os.getenv("CONF_PADDLE_GRADIENT_NUM") PADDLE_SERVER_NUM = os.getenv("CONF_PADDLE_GRADIENT_NUM")
tokenpath = '/var/run/secrets/kubernetes.io/serviceaccount/token'
def refine_unknown_args(cmd_args): def refine_unknown_args(cmd_args):
''' '''
...@@ -64,6 +65,7 @@ def isPodAllRunning(podlist): ...@@ -64,6 +65,7 @@ def isPodAllRunning(podlist):
for pod in podlist["items"]: for pod in podlist["items"]:
if pod["status"]["phase"] == "Running": if pod["status"]["phase"] == "Running":
running += 1 running += 1
print "waiting for pods running, require:", require, "running:", running
if require == running: if require == running:
return True return True
return False return False
...@@ -79,8 +81,17 @@ def getPodList(): ...@@ -79,8 +81,17 @@ def getPodList():
pod = API + NAMESPACE + "/pods?" pod = API + NAMESPACE + "/pods?"
job = JOBNAME job = JOBNAME
return requests.get(apiserver + pod + JOBSELECTOR + job, if os.path.isfile(tokenpath):
verify=False).json() tokenfile = open(tokenpath, mode='r')
token = tokenfile.read()
Bearer = "Bearer " + token
headers = {"Authorization": Bearer}
return requests.get(apiserver + pod + JOBSELECTOR + job,
headers=headers,
verify=False).json()
else:
return requests.get(apiserver + pod + JOBSELECTOR + job,
verify=False).json()
def getIdMap(podlist): def getIdMap(podlist):
...@@ -122,8 +133,8 @@ def startPaddle(idMap={}, train_args_dict=None): ...@@ -122,8 +133,8 @@ def startPaddle(idMap={}, train_args_dict=None):
if not os.path.exists(JOB_PATH_OUTPUT): if not os.path.exists(JOB_PATH_OUTPUT):
os.makedirs(JOB_PATH_OUTPUT) os.makedirs(JOB_PATH_OUTPUT)
os.mkdir(logDir) os.mkdir(logDir)
copyCommand = 'cp -rf ' + JOB_PATH_DATA + \ copyCommand = 'cp -rf ' + JOB_PATH + \
"/" + str(trainerId) + " ./data" "/" + str(trainerId) + "/data/*" + " ./data/"
os.system(copyCommand) os.system(copyCommand)
startPserver = 'nohup paddle pserver' + \ startPserver = 'nohup paddle pserver' + \
" --port=" + str(PADDLE_PORT) + \ " --port=" + str(PADDLE_PORT) + \
...@@ -136,9 +147,9 @@ def startPaddle(idMap={}, train_args_dict=None): ...@@ -136,9 +147,9 @@ def startPaddle(idMap={}, train_args_dict=None):
print startPserver print startPserver
os.system(startPserver) os.system(startPserver)
# wait until pservers completely start # wait until pservers completely start
time.sleep(10) time.sleep(20)
startTrainer = program + args + " > " + \ startTrainer = program + args + " 2>&1 | tee " + \
logDir + "/train.log 2>&1 < /dev/null" logDir + "/train.log"
print startTrainer print startTrainer
os.system(startTrainer) os.system(startTrainer)
...@@ -152,7 +163,7 @@ if __name__ == '__main__': ...@@ -152,7 +163,7 @@ if __name__ == '__main__':
podlist = getPodList() podlist = getPodList()
# need to wait until all pods are running # need to wait until all pods are running
while not isPodAllRunning(podlist): while not isPodAllRunning(podlist):
time.sleep(10) time.sleep(20)
podlist = getPodList() podlist = getPodList()
idMap = getIdMap(podlist) idMap = getIdMap(podlist)
startPaddle(idMap, train_args_dict) startPaddle(idMap, train_args_dict)
...@@ -38,6 +38,13 @@ Arguments* Arguments::createByPaddleArgumentVector(void* ptr) { ...@@ -38,6 +38,13 @@ Arguments* Arguments::createByPaddleArgumentVector(void* ptr) {
return args; return args;
} }
Arguments* Arguments::createByPaddleArgument(const void* ptr) {
auto p = (paddle::Argument*)(ptr);
auto args = new Arguments();
args->m->outputs.push_back(*p);
return args;
}
Matrix* Arguments::getSlotValue(size_t idx) const throw(RangeError) { Matrix* Arguments::getSlotValue(size_t idx) const throw(RangeError) {
auto& a = m->getArg(idx); auto& a = m->getArg(idx);
return Matrix::createByPaddleMatrixPtr(&a.value); return Matrix::createByPaddleMatrixPtr(&a.value);
......
...@@ -27,3 +27,18 @@ std::string Evaluator::toString() { ...@@ -27,3 +27,18 @@ std::string Evaluator::toString() {
m->rawPtr->printStats(sout); m->rawPtr->printStats(sout);
return sout.str(); return sout.str();
} }
std::vector<std::string> Evaluator::getNames() const {
std::vector<std::string> retv;
m->rawPtr->getNames(&retv);
return retv;
}
double Evaluator::getValue(const std::string name) const {
paddle::Error err;
double v = m->rawPtr->getValue(name, &err);
if (err) {
throw std::runtime_error(err.msg());
}
return v;
}
...@@ -144,12 +144,12 @@ Parameter* GradientMachine::getParameter(size_t i) throw(RangeError) { ...@@ -144,12 +144,12 @@ Parameter* GradientMachine::getParameter(size_t i) throw(RangeError) {
void GradientMachine::randParameters() { m->machine->randParameters(); } void GradientMachine::randParameters() { m->machine->randParameters(); }
Matrix* GradientMachine::getLayerOutput(const std::string& layerName) const Arguments* GradientMachine::getLayerOutput(const std::string& layerName) const
throw(UnsupportError) { throw(UnsupportError) {
auto nn = std::dynamic_pointer_cast<paddle::NeuralNetwork>(m->machine); auto nn = m->machine;
if (nn) { if (nn) {
auto mat = nn->getLayerOutput(layerName); auto arg = nn->getLayerOutput(layerName);
return Matrix::createByPaddleMatrixPtr(&mat); return Arguments::createByPaddleArgument(&arg);
} else { } else {
throw UnsupportError(); throw UnsupportError();
} }
......
...@@ -47,6 +47,9 @@ void setUseGpu(bool useGpu); ...@@ -47,6 +47,9 @@ void setUseGpu(bool useGpu);
/// Return true if this py_paddle is compiled in GPU Version /// Return true if this py_paddle is compiled in GPU Version
bool isGpuVersion(); bool isGpuVersion();
/// Return FLAGS_trainer_count
int getTrainerCount();
/// The Error of IO Operation. Such as file not found, etc. /// The Error of IO Operation. Such as file not found, etc.
class IOError {}; class IOError {};
...@@ -454,6 +457,7 @@ public: ...@@ -454,6 +457,7 @@ public:
private: private:
static Arguments* createByPaddleArgumentVector(void* ptr); static Arguments* createByPaddleArgumentVector(void* ptr);
static Arguments* createByPaddleArgument(const void* ptr);
void* getInternalArgumentsPtr() const; void* getInternalArgumentsPtr() const;
private: private:
...@@ -769,7 +773,7 @@ public: ...@@ -769,7 +773,7 @@ public:
void randParameters(); void randParameters();
Matrix* getLayerOutput(const std::string& layerName) const Arguments* getLayerOutput(const std::string& layerName) const
throw(UnsupportError); throw(UnsupportError);
/** /**
...@@ -900,6 +904,10 @@ public: ...@@ -900,6 +904,10 @@ public:
*/ */
std::string toString(); std::string toString();
std::vector<std::string> getNames() const;
double getValue(const std::string name) const;
private: private:
EvaluatorPrivate* m; EvaluatorPrivate* m;
...@@ -952,7 +960,7 @@ public: ...@@ -952,7 +960,7 @@ public:
Arguments* getForwardOutput(); Arguments* getForwardOutput();
Matrix* getLayerOutput(const std::string& layerName); Arguments* getLayerOutput(const std::string& layerName) const;
}; };
/// the N-Best results generated from one input sequence. /// the N-Best results generated from one input sequence.
......
...@@ -131,12 +131,11 @@ void Trainer::testOneDataBatch(size_t batchSize, const Arguments& args) { ...@@ -131,12 +131,11 @@ void Trainer::testOneDataBatch(size_t batchSize, const Arguments& args) {
void TrainerPrivate::finishTestPeriod() { tester_->finishTestPeriod(); } void TrainerPrivate::finishTestPeriod() { tester_->finishTestPeriod(); }
void Trainer::finishTestPeriod() { m->finishTestPeriod(); } void Trainer::finishTestPeriod() { m->finishTestPeriod(); }
Matrix* Trainer::getLayerOutput(const std::string& layerName) { Arguments* Trainer::getLayerOutput(const std::string& layerName) const {
auto nn = std::dynamic_pointer_cast<paddle::NeuralNetwork>( auto nn = this->m->getGradientMachine();
this->m->getGradientMachine());
CHECK(nn) << "trainerInternal_.getGradientMachine() is not NeuralNetwork"; CHECK(nn) << "trainerInternal_.getGradientMachine() is not NeuralNetwork";
auto m = nn->getLayerOutput(layerName); auto arg = nn->getLayerOutput(layerName);
return Matrix::createByPaddleMatrixPtr(&m); return Arguments::createByPaddleArgument(&arg);
} }
void Trainer::forwardOneBatch(size_t batchSize) { void Trainer::forwardOneBatch(size_t batchSize) {
......
...@@ -54,5 +54,7 @@ bool isGpuVersion() { ...@@ -54,5 +54,7 @@ bool isGpuVersion() {
#endif #endif
} }
int getTrainerCount() { return FLAGS_trainer_count; }
static_assert(NUM_PARAMETER_TYPES == paddle::NUM_PARAMETER_TYPES, static_assert(NUM_PARAMETER_TYPES == paddle::NUM_PARAMETER_TYPES,
"The Parameter Type should be same in core/api and core/common"); "The Parameter Type should be same in core/api and core/common");
...@@ -68,7 +68,7 @@ class TestMatrix(unittest.TestCase): ...@@ -68,7 +68,7 @@ class TestMatrix(unittest.TestCase):
def test_numpyCpu(self): def test_numpyCpu(self):
numpy_mat = np.matrix([[1, 2], [3, 4], [5, 6]], dtype="float32") numpy_mat = np.matrix([[1, 2], [3, 4], [5, 6]], dtype="float32")
m = swig_paddle.Matrix.createCpuDenseFromNumpy(numpy_mat, copy=False) m = swig_paddle.Matrix.createCpuDenseFromNumpy(numpy_mat, False)
self.assertEqual((int(m.getHeight()), int(m.getWidth())), self.assertEqual((int(m.getHeight()), int(m.getWidth())),
numpy_mat.shape) numpy_mat.shape)
......
...@@ -89,9 +89,14 @@ def main(): ...@@ -89,9 +89,14 @@ def main():
except Exception as e: except Exception as e:
print e print e
ev = m.makeEvaluator()
ev.start()
m.forwardBackward(inArgs, outArgs, swig_paddle.PASS_TRAIN, m.forwardBackward(inArgs, outArgs, swig_paddle.PASS_TRAIN,
update_callback) update_callback)
m.eval(ev)
ev.finish()
for name in ev.getNames():
print name, ev.getValue(name)
for optimizer in optimizers: for optimizer in optimizers:
optimizer.finishBatch() optimizer.finishBatch()
......
...@@ -43,7 +43,7 @@ class TestIVector(unittest.TestCase): ...@@ -43,7 +43,7 @@ class TestIVector(unittest.TestCase):
def test_cpu_numpy(self): def test_cpu_numpy(self):
vec = np.array([1, 3, 4, 65, 78, 1, 4], dtype="int32") vec = np.array([1, 3, 4, 65, 78, 1, 4], dtype="int32")
iv = swig_paddle.IVector.createCpuVectorFromNumpy(vec, copy=False) iv = swig_paddle.IVector.createCpuVectorFromNumpy(vec, False)
self.assertEqual(vec.shape[0], int(iv.__len__())) self.assertEqual(vec.shape[0], int(iv.__len__()))
vec[4] = 832 vec[4] = 832
for i in xrange(len(iv)): for i in xrange(len(iv)):
...@@ -106,7 +106,7 @@ class TestVector(unittest.TestCase): ...@@ -106,7 +106,7 @@ class TestVector(unittest.TestCase):
def testCpuNumpy(self): def testCpuNumpy(self):
numpy_arr = np.array([1.2, 2.3, 3.4, 4.5], dtype="float32") numpy_arr = np.array([1.2, 2.3, 3.4, 4.5], dtype="float32")
vec = swig_paddle.Vector.createCpuVectorFromNumpy(numpy_arr, copy=False) vec = swig_paddle.Vector.createCpuVectorFromNumpy(numpy_arr, False)
assert isinstance(vec, swig_paddle.Vector) assert isinstance(vec, swig_paddle.Vector)
numpy_arr[0] = 0.1 numpy_arr[0] = 0.1
for n, v in zip(numpy_arr, vec): for n, v in zip(numpy_arr, vec):
......
...@@ -69,19 +69,6 @@ extern void hl_sequence_softmax_forward(real* A_d, ...@@ -69,19 +69,6 @@ extern void hl_sequence_softmax_forward(real* A_d,
const int* index, const int* index,
int numSequence); int numSequence);
/**
* @brief Matrix classification error.
*
* @param[in] A_d input matrix (M x N).
* @param[in] B_d input vector (M x 1).
* @param[out] C_d output vector (M x 1).
* @param[in] dimM matrix height.
* @param[in] dimN matrix width.
*
*/
extern void hl_matrix_classification_error(
real* A_d, int* B_d, real* C_d, int dimM, int dimN);
/** /**
* @brief Matrix cross entropy. * @brief Matrix cross entropy.
* *
...@@ -188,48 +175,6 @@ extern void hl_param_relu_backward_diff(real* grad_o, ...@@ -188,48 +175,6 @@ extern void hl_param_relu_backward_diff(real* grad_o,
int width, int width,
int height, int height,
int partial_sum); int partial_sum);
/**
* @brief cos sim forward
*
* @param[out] output output data
* @param[in] input1 input1 data(matrix)
* @param[in] input2 input2 data(matrix or vector)
* @param[in] width matrix width
* @param[in] input1_height input1_height
* @param[in] input2_height input2_height
* @param[in] scale scale factor
*/
extern void hl_cossim(real* output,
real* input1,
real* input2,
int width,
int input1_height,
int input2_height,
real scale);
/**
* @brief cos sim derivate
*
* @param[in] grad output grad
* @param[in] output output data
* @param[in] prevOutX input1 data
* @param[in] prevOutY input2 data
* @param[out] prevGradX input1 grad
* @param[out] prevGradY input2 grad
* @param[in] width matrix width
* @param[in] input1_height input1 height
* @param[in] input2_height input2 height
* @param[in] scale scale factor
*/
extern void hl_cossim_derivative(real* grad,
real* output,
real* prevOutX,
real* prevOutY,
real* prevGradX,
real* prevGradY,
int width,
int input1_height,
int input2_height,
real scale);
/** /**
* @brief Matrix addition: A_d[i][j] += scale * B_d[j/channel]. * @brief Matrix addition: A_d[i][j] += scale * B_d[j/channel].
...@@ -267,4 +212,16 @@ extern void hl_matrix_collect_shared_bias(real* B_d, ...@@ -267,4 +212,16 @@ extern void hl_matrix_collect_shared_bias(real* B_d,
const int dimN, const int dimN,
real scale); real scale);
/**
* @brief Matrix rotation in 90 degrees
*
* @param[in] mat input matrix (M x N).
* @param[out] matRot output matrix (N x M).
* @param[in] dimM input matrix height.
* @param[in] dimN input matrix width.
* @param[in] clockWise rotation direction
*/
extern void hl_matrix_rotate(
real* mat, real* matRot, int dimM, int dimN, bool clockWise);
#endif /* HL_MATRIX_H_ */ #endif /* HL_MATRIX_H_ */
...@@ -58,4 +58,30 @@ extern void hl_sparse_matrix_top_k(real* topVal, ...@@ -58,4 +58,30 @@ extern void hl_sparse_matrix_top_k(real* topVal,
int beamSize, int beamSize,
int numSamples); int numSamples);
#endif /* HL_TOP_K_H_ */ /**
* @brief Matrix classification error.
*
* @param[out] topVal top k element.
* @param[in] ldv leading dimension of topVal.
* @param[out] topIds top k index.
* @param[in] src input value.
* @param[in] lds leading dimension of src.
* @param[in] dim width of input value.
* @param[in] topkSize size of top k element.
* @param[in] numSamples height of input value.
* @param[in] label ground truth label.
* @param[out] recResult top-k classification error.
*
*/
extern void hl_matrix_classification_error(real* topVal,
int ldv,
int* topIds,
real* src,
int lds,
int dim,
int topkSize,
int numSamples,
int* label,
real* recResult);
#endif // HL_TOP_K_H_
...@@ -35,8 +35,16 @@ inline void hl_sequence_softmax_forward(real* A_d, ...@@ -35,8 +35,16 @@ inline void hl_sequence_softmax_forward(real* A_d,
inline void hl_matrix_softmax_derivative( inline void hl_matrix_softmax_derivative(
real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN) {} real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN) {}
inline void hl_matrix_classification_error( inline void hl_matrix_classification_error(real* topVal,
real* A_d, int* B_d, real* C_d, int dimM, int dimN) {} int ldv,
int* topIds,
real* src,
int lds,
int dim,
int topkSize,
int numSamples,
int* label,
real* recResult) {}
inline void hl_matrix_cross_entropy( inline void hl_matrix_cross_entropy(
real* A_d, real* C_d, int* label_d, int dimM, int dimN) {} real* A_d, real* C_d, int* label_d, int dimM, int dimN) {}
...@@ -74,25 +82,6 @@ inline void hl_param_relu_backward_diff(real* grad_o, ...@@ -74,25 +82,6 @@ inline void hl_param_relu_backward_diff(real* grad_o,
int height, int height,
int partial_sum) {} int partial_sum) {}
inline void hl_cossim(real* output,
real* input1,
real* input2,
int width,
int input1_height,
int input2_height,
real scale) {}
inline void hl_cossim_derivative(real* grad,
real* output,
real* prevOutX,
real* prevOutY,
real* prevGradX,
real* prevGradY,
int width,
int input1_height,
int input2_height,
real scale) {}
inline void hl_matrix_add_shared_bias(real* A_d, inline void hl_matrix_add_shared_bias(real* A_d,
real* B_d, real* B_d,
const int channel, const int channel,
...@@ -106,4 +95,8 @@ inline void hl_matrix_collect_shared_bias(real* B_d, ...@@ -106,4 +95,8 @@ inline void hl_matrix_collect_shared_bias(real* B_d,
const int dimM, const int dimM,
const int dimN, const int dimN,
real scale) {} real scale) {}
inline void hl_matrix_rotate(
real* mat, real* matRot, int dimM, int dimN, bool clockWise) {}
#endif // HL_MATRIX_STUB_H_ #endif // HL_MATRIX_STUB_H_
...@@ -265,59 +265,6 @@ void hl_matrix_softmax_derivative(real *grad_d, ...@@ -265,59 +265,6 @@ void hl_matrix_softmax_derivative(real *grad_d,
CHECK_SYNC("hl_matrix_softmax_derivative failed"); CHECK_SYNC("hl_matrix_softmax_derivative failed");
} }
template<int blockSize>
__global__ void KeMatrixClassificationError(real* in_A,
int* in_B,
real* out_C,
int dimN) {
__shared__ real max_s[blockSize];
__shared__ int max_l[blockSize];
const int tid = threadIdx.x;
const int rowId = blockIdx.x;
max_s[tid] = -1e30f;
in_A += rowId * dimN;
real tmp;
for (int colId = tid; colId < dimN; colId += blockSize) {
tmp = in_A[colId];
if (max_s[tid] < tmp) {
max_s[tid] = tmp;
max_l[tid] = colId;
}
}
__syncthreads();
for (int stride = blockSize/2; stride > 0; stride = stride/2) {
if (tid < stride) {
if (max_s[tid] < max_s[tid + stride]) {
max_s[tid] = max_s[tid + stride];
max_l[tid] = max_l[tid + stride];
}
}
__syncthreads();
}
__syncthreads();
if (tid == 0) {
out_C[rowId] = (max_l[0] == in_B[rowId] ? 0 : 1.0f);
}
}
void hl_matrix_classification_error(real* A_d,
int* B_d,
real* C_d,
int dimM,
int dimN) {
CHECK_NOTNULL(A_d);
CHECK_NOTNULL(B_d);
CHECK_NOTNULL(C_d);
// each sample is calculated by one block
KeMatrixClassificationError<1024><<< dimM, 1024, 0, STREAM_DEFAULT >>>
(A_d, B_d, C_d, dimN);
CHECK_SYNC("hl_matrix_classification_error");
}
__global__ void KeMatrixMultiBinaryCrossEntropy(real* output, __global__ void KeMatrixMultiBinaryCrossEntropy(real* output,
real* entropy, real* entropy,
int* row, int* row,
...@@ -584,177 +531,6 @@ void hl_param_relu_backward_diff(real* grad_o, ...@@ -584,177 +531,6 @@ void hl_param_relu_backward_diff(real* grad_o,
CHECK_SYNC("hl_param_relu_backward_diff failed"); CHECK_SYNC("hl_param_relu_backward_diff failed");
} }
template<int blockSize>
__global__ void KeCosSim(real* output,
real* input1,
real* input2,
int width,
int input1_height,
int input2_height,
real scale) {
const int ty = blockIdx.y;
int tid = threadIdx.x;
__shared__ real xx[blockSize];
__shared__ real yy[blockSize];
__shared__ real xy[blockSize];
xx[tid] = 0.0;
yy[tid] = 0.0;
xy[tid] = 0.0;
__syncthreads();
input1 += ty * width;
if (input2_height > 1) {
input2 += ty * width;
}
for (int index = tid; index < width; index += blockSize) {
real x = input1[index];
real y = input2[index];
xx[tid] += x * x;
yy[tid] += y * y;
xy[tid] += x * y;
}
__syncthreads();
for (int s = blockSize / 2; s > 0; s >>= 1) {
if (tid < s) {
xx[tid] += xx[tid + s];
yy[tid] += yy[tid + s];
xy[tid] += xy[tid + s];
}
__syncthreads();
}
if (tid == 0) {
output[ty] = scale * xy[0] / (sqrt(xx[0]) * sqrt(yy[0]));
}
}
void hl_cossim(real* output,
real* input1,
real* input2,
int width,
int input1_height,
int input2_height,
real scale) {
CHECK_NOTNULL(output);
CHECK_NOTNULL(input1);
CHECK_NOTNULL(input2);
const int blockSize = 256;
dim3 threads(blockSize, 1);
dim3 grid(1, input1_height);
KeCosSim<blockSize><<<grid, threads, 0, STREAM_DEFAULT>>>
(output, input1, input2, width, input1_height, input2_height, scale);
CHECK_SYNC("hl_cossim failed");
}
template<int blockSize>
__global__ void KeCosSimDerivative(real* grad,
real* output,
real* prevOutX,
real* prevOutY,
real* prevGradX,
real* prevGradY,
int width,
int input1_height,
int input2_height,
real scale) {
const int ty = blockIdx.y;
int tid = threadIdx.x;
__shared__ real xx[blockSize];
__shared__ real yy[blockSize];
__shared__ real xy[blockSize];
xx[tid] = 0.0;
yy[tid] = 0.0;
xy[tid] = 0.0;
__syncthreads();
prevOutX += ty * width;
prevGradX += ty * width;
if (input2_height > 1) {
prevOutY += ty * width;
prevGradY += ty * width;
}
for (int index = tid; index < width; index += blockSize) {
real x = prevOutX[index];
real y = prevOutY[index];
xx[tid] += x * x;
yy[tid] += y * y;
xy[tid] += x * y;
}
__syncthreads();
for (int s = blockSize / 2; s > 0; s >>= 1) {
if (tid < s) {
xx[tid] += xx[tid + s];
yy[tid] += yy[tid + s];
xy[tid] += xy[tid + s];
}
__syncthreads();
}
if (xy[0] == 0) {
real reciprocal = 1.0 / (sqrt(xx[0]) * sqrt(yy[0]));
for (int index = tid; index < width; index += blockSize) {
prevGradX[index] +=
scale * grad[ty] * prevOutY[index] * reciprocal;
if (input2_height > 1) {
prevGradY[index] +=
scale * grad[ty] * prevOutX[index] * reciprocal;
} else {
paddle::paddleAtomicAdd(prevGradY + index,
scale * grad[ty] * prevOutX[index] * reciprocal);
}
}
} else {
real reciprocalXY = 1.0 / xy[0];
real reciprocalSquareSumX = 1.0 / xx[0];
real reciprocalSquareSumY = 1.0 / yy[0];
for (int index = tid; index < width; index += blockSize) {
prevGradX[index] += output[ty] * grad[ty] *
(prevOutY[index] * reciprocalXY -
prevOutX[index] * reciprocalSquareSumX);
if (input2_height > 1) {
prevGradY[index] += output[ty] * grad[ty] *
(prevOutX[index] * reciprocalXY -
prevOutY[index] * reciprocalSquareSumY);
} else {
paddle::paddleAtomicAdd(prevGradY + index, output[ty] * grad[ty] *
(prevOutX[index] * reciprocalXY -
prevOutY[index] * reciprocalSquareSumY));
}
}
}
}
void hl_cossim_derivative(real* grad,
real* output,
real* prevOutX,
real* prevOutY,
real* prevGradX,
real* prevGradY,
int width,
int input1_height,
int input2_height,
real scale) {
CHECK_NOTNULL(grad);
CHECK_NOTNULL(output);
CHECK_NOTNULL(prevOutX);
CHECK_NOTNULL(prevOutY);
CHECK_NOTNULL(prevGradX);
CHECK_NOTNULL(prevGradY);
const int blockSize = 256;
dim3 threads(blockSize, 1);
dim3 grid(1, input1_height);
KeCosSimDerivative<blockSize><<<grid, threads, 0, STREAM_DEFAULT>>>
(grad, output, prevOutX, prevOutY, prevGradX, prevGradY, width,
input1_height, input2_height, scale);
CHECK_SYNC("hl_cossim_derivate failed");
}
__global__ void KeMatrixAddSharedBias(real* A, __global__ void KeMatrixAddSharedBias(real* A,
real* B, real* B,
const int channel, const int channel,
...@@ -840,3 +616,28 @@ void hl_matrix_collect_shared_bias(real* B_d, ...@@ -840,3 +616,28 @@ void hl_matrix_collect_shared_bias(real* B_d,
(B_d, A_d, channel, dimM, dimN, dim, limit, scale); (B_d, A_d, channel, dimM, dimN, dim, limit, scale);
CHECK_SYNC("hl_matrix_collect_shared_bias failed"); CHECK_SYNC("hl_matrix_collect_shared_bias failed");
} }
__global__ void keMatrixRotate(real* mat, real* matRot,
int dimM, int dimN, bool clockWise) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < dimM * dimN) {
int i = idx / dimN;
int j = idx % dimN;
if (clockWise) {
matRot[j * dimM + i] = mat[(dimM - i - 1) * dimN + j];
} else {
matRot[j * dimM + i] = mat[i * dimN + (dimN - j - 1)];
}
}
}
void hl_matrix_rotate(real *mat, real* matRot,
int dimM, int dimN, bool clockWise) {
CHECK_NOTNULL(mat);
CHECK_NOTNULL(matRot);
const int threads = 512;
const int blocks = DIVUP(dimM * dimN, threads);
keMatrixRotate<<< blocks, threads, 0, STREAM_DEFAULT >>>
(mat, matRot, dimM, dimN, clockWise);
CHECK_SYNC("hl_matrix_rotate failed");
}
...@@ -384,3 +384,81 @@ void hl_sparse_matrix_top_k(real* topVal, int ldv, ...@@ -384,3 +384,81 @@ void hl_sparse_matrix_top_k(real* topVal, int ldv,
CHECK_SYNC("hl_sparse_matrix_top_k failed"); CHECK_SYNC("hl_sparse_matrix_top_k failed");
} }
/**
 * Each block computes one sample.
 * In a block:
 * 1. every thread gets its top maxLength values;
 * 2. merge to shTopK, block reduce and get the max value;
 * 3. go to the second step, until one thread's topK value is null;
 * 4. go to the first step, until the topK values are obtained.
*/
template<int maxLength, int blockSize>
__global__ void KeMatrixTopKClassificationError(real* topVal, int ldv,
int * topIds,
real* src, int lds,
int dim,
int beamSize,
int* label,
real* recResult) {
__shared__ Pair shTopK[blockSize];
__shared__ int maxId[blockSize / 2];
const int tid = threadIdx.x;
const int warp = threadIdx.x / 32;
src += blockIdx.x * lds;
topVal += blockIdx.x * ldv;
topIds += blockIdx.x * beamSize;
Pair topK[maxLength]; // NOLINT
int beam = maxLength;
Pair max;
bool isEmpty = false;
bool firstStep = true;
int topkSize = beamSize;
for (int k = 0; k < maxLength; k++) {
topK[k].set(-HL_FLOAT_MAX, -1);
}
while (beamSize) {
threadGetTopK<maxLength, blockSize>
(topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid);
shTopK[tid] = topK[0];
blockReduce<maxLength, blockSize>
(shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp);
}
__syncthreads();
if (tid == 0) {
for (int i = 0; i < topkSize; i++) {
if (*--topIds == label[blockIdx.x]) {
recResult[blockIdx.x] = 0;
break;
}
recResult[blockIdx.x] = 1.0f;
}
}
}
void hl_matrix_classification_error(real* topVal, int ldv,
int* topIds,
real* src, int lds,
int dim,
int topkSize,
int numSamples,
int* label,
real* recResult) {
CHECK_NOTNULL(topVal);
CHECK_NOTNULL(topIds);
CHECK_NOTNULL(src);
if (topkSize > dim) topkSize = dim;
dim3 threads(256, 1);
dim3 grid(numSamples, 1);
KeMatrixTopKClassificationError<5, 256>
<<< grid, threads, 0, STREAM_DEFAULT >>>
(topVal, ldv, topIds, src, lds, dim, topkSize, label, recResult);
CHECK_SYNC("hl_matrix_top_k classification error failed");
}
...@@ -54,22 +54,26 @@ DYNAMIC_LOAD_WARPCTC_WRAP(get_workspace_size) ...@@ -54,22 +54,26 @@ DYNAMIC_LOAD_WARPCTC_WRAP(get_workspace_size)
#define WARPCTC_GET_VERSION dynload::get_warpctc_version #define WARPCTC_GET_VERSION dynload::get_warpctc_version
#define WARPCTC_GET_STATUS_STRING dynload::ctcGetStatusString #define WARPCTC_GET_STATUS_STRING dynload::ctcGetStatusString
static int g_warpctcVersion = -1;
#ifndef PADDLE_TYPE_DOUBLE #ifndef PADDLE_TYPE_DOUBLE
#define WARPCTC_COMPUTE_LOSS dynload::compute_ctc_loss #define WARPCTC_COMPUTE_LOSS dynload::compute_ctc_loss
#define WARPCTC_GET_WORKSPACE_SIZE dynload::get_workspace_size #define WARPCTC_GET_WORKSPACE_SIZE dynload::get_workspace_size
#else #else
#define WARPCTC_LOG_FATAL \ hl_warpctc_status_t fatal(...) {
LOG(FATAL) << "warp-ctc [version " << g_warpctcVersion \ LOG(FATAL) << "warp-ctc [version " << g_warpctcVersion
<< "] Error: not support double precision." << "] Error: not support double precision.";
#define WARPCTC_COMPUTE_LOSS(...) WARPCTC_LOG_FATAL(__VA_ARGS__) // both of get_warpctc_version() and get_workspace_size() return an ctcStatus
#define WARPCTC_GET_WORKSPACE_SIZE(...) WARPCTC_LOG_FATAL(__VA_ARGS__) // type value
return CTC_STATUS_EXECUTION_FAILED;
}
#define WARPCTC_COMPUTE_LOSS fatal
#define WARPCTC_GET_WORKSPACE_SIZE fatal
#endif #endif
/** /**
* Check build-in warp-ctc function using glog and it also * Check build-in warp-ctc function using glog and it also
* support << operator for more details error info. * support << operator for more details error info.
*/ */
static int g_warpctcVersion = -1;
#define CHECK_WARPCTC(warpctcStat) \ #define CHECK_WARPCTC(warpctcStat) \
CHECK_EQ(CTC_STATUS_SUCCESS, warpctcStat) \ CHECK_EQ(CTC_STATUS_SUCCESS, warpctcStat) \
<< "warp-ctc [version " << g_warpctcVersion \ << "warp-ctc [version " << g_warpctcVersion \
......
...@@ -20,23 +20,33 @@ limitations under the License. */ ...@@ -20,23 +20,33 @@ limitations under the License. */
namespace paddle { namespace paddle {
const SequenceArg& BufferArg::sequence() const { const SequenceArg& BufferArg::sequence() const {
// CHECK_EQ(bufferType_, TENSOR_SEQUENCE_DATA); CHECK_EQ(bufferType_, TENSOR_SEQUENCE_DATA);
return dynamic_cast<const SequenceArg&>(*this); return dynamic_cast<const SequenceArg&>(*this);
} }
const SparseMatrixArg& BufferArg::sparse() const { const SparseMatrixArg& BufferArg::sparse() const {
// CHECK_EQ(bufferType_, TENSOR_SPARSE); CHECK_EQ(bufferType_, TENSOR_SPARSE);
return dynamic_cast<const SparseMatrixArg&>(*this); return dynamic_cast<const SparseMatrixArg&>(*this);
} }
SparseMatrixArg::SparseMatrixArg(const CpuSparseMatrix& sparse, ArgType argType) SparseMatrixArg::SparseMatrixArg(const CpuSparseMatrix& sparse, ArgType argType)
: BufferArg(sparse, argType), : BufferArg(sparse, argType),
row_(reinterpret_cast<void*>(sparse.getRows()), VALUE_TYPE_INT32), row_(reinterpret_cast<void*>(sparse.getRows()), VALUE_TYPE_INT32),
col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32) {} col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32),
nnz_(sparse.getElementCnt()),
format_(static_cast<SparseDataFormat>(sparse.getFormat())),
type_(static_cast<SparseDataType>(sparse.getValueType())) {
bufferType_ = TENSOR_SPARSE;
}
SparseMatrixArg::SparseMatrixArg(const GpuSparseMatrix& sparse, ArgType argType) SparseMatrixArg::SparseMatrixArg(const GpuSparseMatrix& sparse, ArgType argType)
: BufferArg(sparse, argType), : BufferArg(sparse, argType),
row_(reinterpret_cast<void*>(sparse.getRows()), VALUE_TYPE_INT32), row_(reinterpret_cast<void*>(sparse.getRows()), VALUE_TYPE_INT32),
col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32) {} col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32),
nnz_(sparse.getElementCnt()),
format_(static_cast<SparseDataFormat>(sparse.getFormat())),
type_(static_cast<SparseDataType>(sparse.getValueType())) {
bufferType_ = TENSOR_SPARSE;
}
} // namespace paddle } // namespace paddle
...@@ -23,23 +23,16 @@ limitations under the License. */ ...@@ -23,23 +23,16 @@ limitations under the License. */
namespace paddle { namespace paddle {
enum BufferType { enum BufferType {
TENSOR_NORMAL = 0, TENSOR_UNKNOWN = 0,
TENSOR_SEQUENCE_ID = 1, TENSOR_NORMAL = 1,
TENSOR_SEQUENCE_DATA = 2, TENSOR_SEQUENCE_ID = 2,
TENSOR_SPARSE = 3 TENSOR_SEQUENCE_DATA = 3,
TENSOR_SPARSE = 4
}; };
enum SparseDataType {
SPARSE_NO_VALUE = 0, // do not need value pointer, all values are 1
SPARSE_FLOAT_VALUE = 1
};
enum SparseDataFormat { SPARSE_CSR_FORMAT = 0, SPARSE_CSC_FORMAT = 1 };
class BufferArg; class BufferArg;
class SequenceArg; class SequenceArg;
class SparseMatrixArg; class SparseMatrixArg;
typedef std::shared_ptr<BufferArg> BufferArgPtr;
/** /**
* \brief BufferArg used as the argument type of Function. * \brief BufferArg used as the argument type of Function.
...@@ -50,6 +43,11 @@ typedef std::shared_ptr<BufferArg> BufferArgPtr; ...@@ -50,6 +43,11 @@ typedef std::shared_ptr<BufferArg> BufferArgPtr;
* 3. SequenceArg for a Buffer of sequence data. * 3. SequenceArg for a Buffer of sequence data.
* 4. SparseMatrixArg for a Buffer of sparse matrix. * 4. SparseMatrixArg for a Buffer of sparse matrix.
* *
* Buffer shape
* For most buffers, the first dimension `shape()[0]` represents
* the size of the mini-batch.
*
* Buffer argType
* There is an ArgType property for the BufferArg used as Function Output. * There is an ArgType property for the BufferArg used as Function Output.
* Whether the result of the Function calculation is assigned to the * Whether the result of the Function calculation is assigned to the
* output Buffer or added to the output Buffer is determined by the * output Buffer or added to the output Buffer is determined by the
...@@ -71,14 +69,24 @@ public: ...@@ -71,14 +69,24 @@ public:
ArgType getArgType() const { return argType_; } ArgType getArgType() const { return argType_; }
public: public:
BufferArg(ValueType valueType,
const TensorShape& shape,
ArgType argType = UNSPECIFIED)
: buf_(nullptr), valueType_(valueType), shape_(shape), argType_(argType) {
bufferType_ = TENSOR_NORMAL;
}
BufferArg(void* buf, BufferArg(void* buf,
ValueType valueType, ValueType valueType,
const TensorShape& shape, const TensorShape& shape,
ArgType argType = UNSPECIFIED) ArgType argType = UNSPECIFIED)
: buf_(buf), valueType_(valueType), shape_(shape), argType_(argType) {} : buf_(buf), valueType_(valueType), shape_(shape), argType_(argType) {
bufferType_ = TENSOR_NORMAL;
}
BufferArg(void* buf, ValueType valueType) BufferArg(void* buf, ValueType valueType) : buf_(buf), valueType_(valueType) {
: buf_(buf), valueType_(valueType) {} bufferType_ = TENSOR_NORMAL;
}
BufferArg(const Matrix& matrix, ArgType argType = UNSPECIFIED) BufferArg(const Matrix& matrix, ArgType argType = UNSPECIFIED)
: buf_( : buf_(
...@@ -86,6 +94,7 @@ public: ...@@ -86,6 +94,7 @@ public:
valueType_(DataType<real>::value), valueType_(DataType<real>::value),
shape_(2), shape_(2),
argType_(argType) { argType_(argType) {
bufferType_ = TENSOR_NORMAL;
shape_.setDim(0, matrix.getHeight()); shape_.setDim(0, matrix.getHeight());
shape_.setDim(1, matrix.getWidth()); shape_.setDim(1, matrix.getWidth());
} }
...@@ -98,6 +107,7 @@ public: ...@@ -98,6 +107,7 @@ public:
valueType_(DataType<real>::value), valueType_(DataType<real>::value),
shape_(shape), shape_(shape),
argType_(argType) { argType_(argType) {
bufferType_ = TENSOR_NORMAL;
CHECK_EQ(matrix.getElementCnt(), shape.getElements()); CHECK_EQ(matrix.getElementCnt(), shape.getElements());
} }
...@@ -107,6 +117,7 @@ public: ...@@ -107,6 +117,7 @@ public:
valueType_(DataType<real>::value), valueType_(DataType<real>::value),
shape_(1), shape_(1),
argType_(argType) { argType_(argType) {
bufferType_ = TENSOR_NORMAL;
shape_.setDim(0, vector.getSize()); shape_.setDim(0, vector.getSize());
} }
...@@ -116,6 +127,7 @@ public: ...@@ -116,6 +127,7 @@ public:
valueType_(VALUE_TYPE_INT32), valueType_(VALUE_TYPE_INT32),
shape_(1), shape_(1),
argType_(argType) { argType_(argType) {
bufferType_ = TENSOR_NORMAL;
shape_.setDim(0, vector.getSize()); shape_.setDim(0, vector.getSize());
} }
...@@ -150,6 +162,9 @@ public: ...@@ -150,6 +162,9 @@ public:
ValueType valueType() const { return valueType_; } ValueType valueType() const { return valueType_; }
BufferType bufferType() const { return bufferType_; } BufferType bufferType() const { return bufferType_; }
const TensorShape& shape() const { return shape_; } const TensorShape& shape() const { return shape_; }
bool isSparseArg() const { return TENSOR_SPARSE == bufferType_; }
bool isSequenceArg() const { return TENSOR_SEQUENCE_DATA == bufferType_; }
virtual size_t numElements() const { return shape_.getElements(); }
const SequenceArg& sequence() const; const SequenceArg& sequence() const;
const SparseMatrixArg& sparse() const; const SparseMatrixArg& sparse() const;
...@@ -158,8 +173,9 @@ protected: ...@@ -158,8 +173,9 @@ protected:
void* buf_; void* buf_;
ValueType valueType_; ValueType valueType_;
TensorShape shape_; TensorShape shape_;
BufferType bufferType_; BufferType bufferType_{TENSOR_UNKNOWN};
ArgType argType_ = UNSPECIFIED; ArgType argType_{UNSPECIFIED};
// TODO(tianbing), add deviceType_
// leading dimensions. The size is dims_.size() // leading dimensions. The size is dims_.size()
// Dims lds_; // Dims lds_;
}; };
...@@ -170,15 +186,25 @@ protected: ...@@ -170,15 +186,25 @@ protected:
// if a < b then value_.buf_[a] < value_.buf_[b] // if a < b then value_.buf_[a] < value_.buf_[b]
class SequenceIdArg : public BufferArg { class SequenceIdArg : public BufferArg {
public: public:
SequenceIdArg(const TensorShape& shape, ArgType argType = UNSPECIFIED)
: BufferArg(VALUE_TYPE_INT32, shape, argType) {
bufferType_ = TENSOR_SEQUENCE_ID;
CHECK_EQ(shape_.ndims(), 1UL);
CHECK_GE(shape_[0], 1UL);
numSeqs_ = shape_[0] - 1;
}
SequenceIdArg(void* buf, SequenceIdArg(void* buf,
const TensorShape& shape, const TensorShape& shape,
ArgType argType = UNSPECIFIED) ArgType argType = UNSPECIFIED)
: BufferArg(buf, VALUE_TYPE_INT32, shape, argType) { : BufferArg(buf, VALUE_TYPE_INT32, shape, argType) {
CHECK_EQ(shape_.ndims(), (size_t)1); bufferType_ = TENSOR_SEQUENCE_ID;
CHECK_EQ(shape_.ndims(), 1UL);
numSeqs_ = shape_[0] - 1; numSeqs_ = shape_[0] - 1;
} }
SequenceIdArg(const IVector& vector) : BufferArg(vector) { SequenceIdArg(const IVector& vector) : BufferArg(vector) {
bufferType_ = TENSOR_SEQUENCE_ID;
numSeqs_ = shape_[0] - 1; numSeqs_ = shape_[0] - 1;
} }
...@@ -190,26 +216,44 @@ private: ...@@ -190,26 +216,44 @@ private:
size_t numSeqs_; size_t numSeqs_;
}; };
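An aside on the start-position convention used by SequenceIdArg above (an illustrative sketch only, not part of this patch; the helper names are hypothetical): a monotone vector of start positions with N + 1 entries encodes N sequences, which is why the constructors set numSeqs_ = shape_[0] - 1.

#include <cstddef>
#include <vector>

// starts = {0, 4, 6, 9} encodes 3 sequences occupying rows [0,4), [4,6), [6,9)
// of the value matrix.
std::size_t numSequences(const std::vector<int>& starts) {
  return starts.size() - 1;  // one more entry than sequences
}

std::size_t sequenceLength(const std::vector<int>& starts, std::size_t i) {
  return static_cast<std::size_t>(starts[i + 1] - starts[i]);
}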
// sequence data // sequences data
// For mini-batch calculation,
// one batch can contain more than one sequence of data.
// SequenceArg can be used to represent sequences of unequal lengths.
class SequenceArg : public BufferArg { class SequenceArg : public BufferArg {
public: public:
SequenceArg(ValueType valueType,
const TensorShape& shape,
ArgType argType = UNSPECIFIED)
: BufferArg(valueType, shape, argType),
startPositions_(TensorShape({shape[0]})) {
bufferType_ = TENSOR_SEQUENCE_DATA;
}
SequenceArg(void* buf, SequenceArg(void* buf,
ValueType valueType, ValueType valueType,
const TensorShape& shape, const TensorShape& shape,
const SequenceIdArg& startPositions, const SequenceIdArg& startPositions,
ArgType argType = UNSPECIFIED) ArgType argType = UNSPECIFIED)
: BufferArg(buf, valueType, shape, argType), : BufferArg(buf, valueType, shape, argType),
startPositions_(startPositions) {} startPositions_(startPositions) {
bufferType_ = TENSOR_SEQUENCE_DATA;
}
SequenceArg(const Matrix& matrix, SequenceArg(const Matrix& matrix,
const IVector& vector, const IVector& vector,
ArgType argType = UNSPECIFIED) ArgType argType = UNSPECIFIED)
: BufferArg(matrix, argType), startPositions_(vector) {} : BufferArg(matrix, argType), startPositions_(vector) {
bufferType_ = TENSOR_SEQUENCE_DATA;
}
~SequenceArg() {} ~SequenceArg() {}
void* getIdBuf() const { return startPositions_.data(); } void* getIdBuf() const { return startPositions_.data(); }
size_t numSeqs() const { return startPositions_.numSeqs(); } size_t numSeqs() const { return startPositions_.numSeqs(); }
SequenceIdArg& getSequenceId() { return startPositions_; }
const SequenceIdArg& getSequenceId() const { return startPositions_; }
private: private:
SequenceIdArg startPositions_; SequenceIdArg startPositions_;
...@@ -226,30 +270,75 @@ public: ...@@ -226,30 +270,75 @@ public:
const BufferArg& row, const BufferArg& row,
const BufferArg& col, const BufferArg& col,
size_t nnz, size_t nnz,
SparseDataFormat format, SparseFormat format,
SparseDataType type, SparseValueType type,
ArgType argType = UNSPECIFIED) ArgType argType = UNSPECIFIED)
: BufferArg(buf, valueType, shape, argType), : BufferArg(buf, valueType, shape, argType),
row_(row), row_(row),
col_(col), col_(col),
nnz_(nnz), nnz_(nnz),
format_(format), format_(static_cast<SparseDataFormat>(format)),
type_(type) { type_(static_cast<SparseDataType>(type)) {
bufferType_ = TENSOR_SPARSE;
CHECK((valueType == VALUE_TYPE_FLOAT) || (valueType == VALUE_TYPE_DOUBLE)); CHECK((valueType == VALUE_TYPE_FLOAT) || (valueType == VALUE_TYPE_DOUBLE));
CHECK_EQ(shape_.ndims(), (size_t)2); CHECK_EQ(shape_.ndims(), 2UL);
CHECK_EQ(row_.shape().ndims(), (size_t)1); CHECK_EQ(row_.shape().ndims(), 1UL);
CHECK_EQ(col_.shape().ndims(), (size_t)1); CHECK_EQ(col_.shape().ndims(), 1UL);
if (format == SPARSE_CSR_FORMAT) { if (format_ == T_SPARSE_CSR) {
CHECK_EQ(nnz, col.shape()[0]); CHECK_EQ(nnz, col.shape()[0]);
} else if (format == SPARSE_CSC_FORMAT) { } else if (format_ == T_SPARSE_CSC) {
CHECK_EQ(nnz, row.shape()[0]); CHECK_EQ(nnz, row.shape()[0]);
} }
} }
SparseMatrixArg(ValueType valueType,
const TensorShape& shape,
size_t nnz,
SparseFormat format,
SparseValueType type,
ArgType argType = UNSPECIFIED)
: BufferArg(valueType, shape, argType),
row_(BufferArg(nullptr, VALUE_TYPE_INT32)),
col_(BufferArg(nullptr, VALUE_TYPE_INT32)),
nnz_(nnz),
format_(static_cast<SparseDataFormat>(format)),
type_(static_cast<SparseDataType>(type)) {
bufferType_ = TENSOR_SPARSE;
CHECK((valueType == VALUE_TYPE_FLOAT) || (valueType == VALUE_TYPE_DOUBLE));
CHECK_EQ(shape_.ndims(), 2UL);
/// len of row_ : height + 1 (CSR) or nnz (CSC), buf_ == nullptr
row_ = (format_ == T_SPARSE_CSR
? BufferArg(VALUE_TYPE_INT32, TensorShape{shape_[0] + 1})
: BufferArg(VALUE_TYPE_INT32, TensorShape{nnz}));
/// len of col_ : width + 1 (CSC) or nnz (CSR), buf_ == nullptr
col_ = (format_ == T_SPARSE_CSR
? BufferArg(VALUE_TYPE_INT32, TensorShape{nnz})
: BufferArg(VALUE_TYPE_INT32, TensorShape{shape_[1] + 1}));
}
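As a concrete illustration of the row_/col_ sizing rule in the constructor above (an editorial sketch under assumed values, not part of the patch): the CSR row pointer has height + 1 entries while the CSC column pointer has width + 1 entries, and the other index array always has nnz entries.

// Sketch only: index arrays for a hypothetical 2 x 3 matrix
//   [ 1 0 2 ]
//   [ 0 3 0 ]   (nnz = 3)
int csr_rows[] = {0, 2, 3};     // height + 1 entries; row i spans [csr_rows[i], csr_rows[i+1])
int csr_cols[] = {0, 2, 1};     // nnz entries: column index of each stored value
int csc_cols[] = {0, 1, 2, 3};  // width + 1 entries; column j spans [csc_cols[j], csc_cols[j+1])
int csc_rows[] = {0, 1, 0};     // nnz entries: row index of each stored value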
SparseMatrixArg(const CpuSparseMatrix& sparse, ArgType argType = UNSPECIFIED); SparseMatrixArg(const CpuSparseMatrix& sparse, ArgType argType = UNSPECIFIED);
SparseMatrixArg(const GpuSparseMatrix& sparse, ArgType argType = UNSPECIFIED); SparseMatrixArg(const GpuSparseMatrix& sparse, ArgType argType = UNSPECIFIED);
template <DeviceType DType>
typename Tensor<real, DType>::SparseMatrix SparseMatrix() const {
CHECK(buf_);
CHECK(valueType_ == DataType<real>::value);
// CHECK(deviceType_ == DType);
CHECK_EQ(2UL, shape_.ndims());
return typename Tensor<real, DType>::SparseMatrix(
reinterpret_cast<real*>(buf_),
reinterpret_cast<int*>(row_.data()),
reinterpret_cast<int*>(col_.data()),
shape_[0],
shape_[1],
nnz_,
static_cast<SparseValueType>(type_),
static_cast<SparseFormat>(format_),
false);
}
~SparseMatrixArg() {} ~SparseMatrixArg() {}
void* getRowBuf() const { return row_.data(); } void* getRowBuf() const { return row_.data(); }
...@@ -258,6 +347,8 @@ public: ...@@ -258,6 +347,8 @@ public:
size_t nnz() const { return nnz_; } size_t nnz() const { return nnz_; }
size_t numElements() const override { return nnz_; }
SparseDataFormat dataFormat() const { return format_; } SparseDataFormat dataFormat() const { return format_; }
SparseDataType dataType() const { return type_; } SparseDataType dataType() const { return type_; }
......
...@@ -14,9 +14,7 @@ limitations under the License. */ ...@@ -14,9 +14,7 @@ limitations under the License. */
#include "BufferArg.h" #include "BufferArg.h"
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include "Function.h"
#include "paddle/math/MemoryHandle.h" #include "paddle/math/MemoryHandle.h"
#include "paddle/math/SparseMatrix.h"
namespace paddle { namespace paddle {
...@@ -37,55 +35,4 @@ TEST(BufferTest, SequenceIdArg) { ...@@ -37,55 +35,4 @@ TEST(BufferTest, SequenceIdArg) {
EXPECT_EQ(buffer.numSeqs(), 9); EXPECT_EQ(buffer.numSeqs(), 9);
} }
TEST(BufferTest, asArgument) {
MatrixPtr matrix = Matrix::create(100, 200);
VectorPtr vector = Vector::create(100, false);
CpuSparseMatrix sparse(200, 300, 50);
// prepare arguments
BufferArgs argments;
argments.addArg(*matrix);
argments.addArg(*vector);
argments.addArg(sparse);
// function
auto function = [=](const BufferArgs& inputs) {
EXPECT_EQ(inputs.size(), 3);
// check inputs[0]
EXPECT_EQ(inputs[0].shape().ndims(), 2);
EXPECT_EQ(inputs[0].shape()[0], 100);
EXPECT_EQ(inputs[0].shape()[1], 200);
EXPECT_EQ(inputs[0].data(), matrix->getData());
EXPECT_EQ(inputs[0].matrix<DEVICE_TYPE_CPU>().getHeight(),
matrix->getHeight());
EXPECT_EQ(inputs[0].matrix<DEVICE_TYPE_CPU>().getWidth(),
matrix->getWidth());
EXPECT_EQ(inputs[0].matrix<DEVICE_TYPE_CPU>().getData(), matrix->getData());
// check inputs[1]
EXPECT_EQ(inputs[1].shape().ndims(), 1);
EXPECT_EQ(inputs[1].shape()[0], 100);
EXPECT_EQ(inputs[1].data(), vector->getData());
CpuVector inVector = inputs[1].vector<real, DEVICE_TYPE_CPU>();
EXPECT_EQ(inVector.getSize(), vector->getSize());
EXPECT_EQ(inVector.getData(), vector->getData());
// check inputs[2]
EXPECT_EQ(inputs[2].shape().ndims(), 2);
EXPECT_EQ(inputs[2].shape()[0], 200);
EXPECT_EQ(inputs[2].shape()[1], 300);
EXPECT_EQ(inputs[2].data(), sparse.getData());
// CHECK_EQ(inputs[2].sparse().nnz(), 50);
// CHECK_EQ(inputs[2].sparse().dataFormat(), SPARSE_CSR_FORMAT);
// CHECK_EQ(inputs[2].sparse().dataType(), SPARSE_FLOAT_VALUE);
EXPECT_EQ(inputs[2].sparse().getRowBuf(), sparse.getRows());
EXPECT_EQ(inputs[2].sparse().getColBuf(), sparse.getCols());
};
// call function
function(argments);
}
} // namespace paddle } // namespace paddle
...@@ -19,12 +19,15 @@ if(WITH_TESTING) ...@@ -19,12 +19,15 @@ if(WITH_TESTING)
# TODO: # TODO:
# file(GLOB test_files . *OpTest.cpp) # file(GLOB test_files . *OpTest.cpp)
# add_executable(${test_bin} EXCLUDE_FROM_ALL ${test_files}) # add_executable(${test_bin} EXCLUDE_FROM_ALL ${test_files})
# add_simple_unittest(CrossMapNormalOpTest) add_simple_unittest(CrossMapNormalOpTest)
add_simple_unittest(TensorShapeTest) add_simple_unittest(TensorShapeTest)
add_simple_unittest(TensorTypeTest) add_simple_unittest(TensorTypeTest)
add_simple_unittest(BufferArgTest) add_simple_unittest(BufferArgTest)
add_simple_unittest(FunctionTest) add_simple_unittest(FunctionTest)
# add_simple_unittest(ContextProjectionOpTest) add_simple_unittest(ContextProjectionOpTest)
add_simple_unittest(PadOpTest)
add_simple_unittest(MulOpTest)
add_simple_unittest(CosSimOpTest)
endif() endif()
endif() endif()
......
...@@ -17,7 +17,10 @@ limitations under the License. */ ...@@ -17,7 +17,10 @@ limitations under the License. */
#include "paddle/math/Vector.h" #include "paddle/math/Vector.h"
namespace paddle { namespace paddle {
/**
* Context Projection Forward with CPU Matrix Device.
*
*/
template <> template <>
void ContextProjectionForward<DEVICE_TYPE_CPU>(CpuMatrix& out_mat, void ContextProjectionForward<DEVICE_TYPE_CPU>(CpuMatrix& out_mat,
const CpuMatrix& input_mat, const CpuMatrix& input_mat,
...@@ -70,10 +73,30 @@ void ContextProjectionForward<DEVICE_TYPE_CPU>(CpuMatrix& out_mat, ...@@ -70,10 +73,30 @@ void ContextProjectionForward<DEVICE_TYPE_CPU>(CpuMatrix& out_mat,
} }
/** /**
* \param inputs[0] input value. * Paddle Function for Context Projection Forward.
* \param inputs[1] input weight. * Calculate the output layer value sequence after context projection.
* \param inputs[2] input sequence. *
* \param outputs[0] output value. * What is Context Projection for a sequence?
 * For example, assume the input (x) has 4 words and the dimension of each word
 * representation is 2. If we pad with zeros instead of learned weights,
 * and the context_length is 3, the output (y) is:
*
* @code
* x = [a1, a2;
* b1, b2;
* c1, c2;
* d1, d2]
* y = [0, 0, a1, a2, b1, b2;
* a1, a2, b1, b2, c1, c2;
* b1, b2, c1, c2, d1, d2;
* c1, c2, d1, d2, 0, 0]
* @endcode
*
* \param outputs[0].matrix output layer value, n * (d * l)
* \param outputs[0].vector start position sequence, n * 1
* \param inputs[0].matrix input layer value, n * d
* \param inputs[0].vector start position sequence, n * 1
* \param inputs[1].matrix input layer weight, pad * d
*/ */
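A standalone sketch of the zero-padded case described above (editorial illustration only, independent of the Paddle matrix types; the function name is hypothetical): each output row i concatenates the context_length input rows starting at row i + context_start, and rows that fall outside the sequence contribute zeros.

#include <cstddef>
#include <vector>

// in  : num_rows x dim (row-major), out : num_rows x (dim * context_length)
std::vector<float> contextProjectZeroPad(const std::vector<float>& in,
                                         int num_rows, int dim,
                                         int context_length, int context_start) {
  std::vector<float> out(
      static_cast<std::size_t>(num_rows) * dim * context_length, 0.0f);
  for (int i = 0; i < num_rows; ++i) {
    for (int k = 0; k < context_length; ++k) {
      int src = i + context_start + k;
      if (src < 0 || src >= num_rows) continue;  // out of range: keep zeros
      for (int d = 0; d < dim; ++d) {
        out[(static_cast<std::size_t>(i) * context_length + k) * dim + d] =
            in[static_cast<std::size_t>(src) * dim + d];
      }
    }
  }
  return out;
}

// With the 4 x 2 example above and context_start = -1, context_length = 3,
// row 0 of the result is [0, 0, a1, a2, b1, b2], matching the doc comment.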
template <DeviceType Device> template <DeviceType Device>
class ContextProjectionForwardFunc : public FunctionBase { class ContextProjectionForwardFunc : public FunctionBase {
...@@ -85,28 +108,35 @@ public: ...@@ -85,28 +108,35 @@ public:
} }
void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
CHECK_EQ((size_t)3, inputs.size()); CHECK(1UL == inputs.size() || 2UL == inputs.size());
CHECK_EQ((size_t)1, outputs.size()); CHECK_EQ(1UL, outputs.size());
CHECK(inputs[0].isSequenceArg() && outputs[0].isSequenceArg())
<< "SequenceArg required here";
const auto val_seqs = dynamic_cast<const SequenceArg&>(inputs[0]);
auto out_seq = dynamic_cast<const SequenceArg&>(outputs[0]);
CHECK(outputs[0].data() && inputs[0].data() && inputs[2].data()); CHECK(out_seq.data() && val_seqs.data() && val_seqs.getSequenceId().data());
CHECK_EQ(outputs[0].shape().ndims(), (size_t)2); CHECK_EQ(out_seq.shape().ndims(), 2UL);
CHECK_EQ(inputs[0].shape().ndims(), (size_t)2); CHECK_EQ(val_seqs.shape().ndims(), 2UL);
CHECK_EQ(inputs[1].shape().ndims(), (size_t)2);
CHECK_EQ(inputs[2].shape().ndims(), (size_t)1);
/// dim of output = dim of input * context_length /// dim of output = dim of input * context_length
CHECK_EQ(outputs[0].shape()[1], inputs[0].shape()[1] * context_length_); CHECK_EQ(out_seq.shape()[1], val_seqs.shape()[1] * context_length_);
/// dim of input == dim of weight
CHECK_EQ(inputs[0].shape()[1], inputs[1].shape()[1]);
/// input and output have the same batch_size /// input and output have the same batch_size
CHECK_EQ(inputs[0].shape()[0], outputs[0].shape()[0]); CHECK_EQ(val_seqs.shape()[0], out_seq.shape()[0]);
if (2UL == inputs.size()) {
CHECK_EQ(inputs[1].shape().ndims(), 2UL);
/// dim of input == dim of weight
CHECK_EQ(val_seqs.shape()[1], inputs[1].shape()[1]);
}
CHECK_EQ(out_seq.getArgType(), ADD_TO);
auto out_mat = out_seq.matrix<Device>();
const auto in_mat = val_seqs.matrix<Device>();
const auto w_mat =
(2UL == inputs.size() && inputs[1].data())
? inputs[1].matrix<Device>()
: typename Tensor<real, Device>::Matrix(nullptr, 0, 0);
const auto seq_vec = val_seqs.getSequenceId().vector<int, Device>();
CHECK_EQ(outputs[0].getArgType(), ADD_TO);
auto out_mat = outputs[0].matrix<Device>();
auto in_mat = inputs[0].matrix<Device>();
auto w_mat = !inputs[1].data()
? typename Tensor<real, Device>::Matrix(nullptr, 0, 0)
: inputs[1].matrix<Device>();
auto seq_vec = inputs[2].vector<int, Device>();
ContextProjectionForward<Device>(out_mat, ContextProjectionForward<Device>(out_mat,
in_mat, in_mat,
w_mat, w_mat,
...@@ -122,8 +152,12 @@ private: ...@@ -122,8 +152,12 @@ private:
size_t begin_pad_; size_t begin_pad_;
}; };
/**
* Context Projection Backward with CPU Matrix Device.
*
*/
template <> template <>
void ContextProjectionBackward<DEVICE_TYPE_CPU>(CpuMatrix& out_grad_mat, void ContextProjectionBackward<DEVICE_TYPE_CPU>(const CpuMatrix& out_grad_mat,
CpuMatrix& in_grad_mat, CpuMatrix& in_grad_mat,
CpuMatrix& w_grad_mat, CpuMatrix& w_grad_mat,
const CpuIVector& seq_vec, const CpuIVector& seq_vec,
...@@ -146,7 +180,8 @@ void ContextProjectionBackward<DEVICE_TYPE_CPU>(CpuMatrix& out_grad_mat, ...@@ -146,7 +180,8 @@ void ContextProjectionBackward<DEVICE_TYPE_CPU>(CpuMatrix& out_grad_mat,
int64_t pad_size = int64_t pad_size =
std::min(starts[i] - begin, starts[i + 1] - starts[i]); std::min(starts[i] - begin, starts[i + 1] - starts[i]);
if (is_padding && w_grad_mat) { if (is_padding && w_grad_mat) {
MatrixPtr mat = out_grad_mat.subMatrix(starts[i], pad_size); MatrixPtr mat = const_cast<CpuMatrix&>(out_grad_mat)
.subMatrix(starts[i], pad_size);
MatrixPtr sub = w_grad_mat.subMatrix(j, pad_size); MatrixPtr sub = w_grad_mat.subMatrix(j, pad_size);
sub->addAtOffset(*mat, j * input_dim); sub->addAtOffset(*mat, j * input_dim);
} }
...@@ -157,8 +192,8 @@ void ContextProjectionBackward<DEVICE_TYPE_CPU>(CpuMatrix& out_grad_mat, ...@@ -157,8 +192,8 @@ void ContextProjectionBackward<DEVICE_TYPE_CPU>(CpuMatrix& out_grad_mat,
int64_t pad_size = int64_t pad_size =
std::min(end - starts[i + 1], starts[i + 1] - starts[i]); std::min(end - starts[i + 1], starts[i + 1] - starts[i]);
if (is_padding && w_grad_mat) { if (is_padding && w_grad_mat) {
MatrixPtr mat = MatrixPtr mat = const_cast<CpuMatrix&>(out_grad_mat)
out_grad_mat.subMatrix(starts[i + 1] - pad_size, pad_size); .subMatrix(starts[i + 1] - pad_size, pad_size);
MatrixPtr sub = w_grad_mat.subMatrix( MatrixPtr sub = w_grad_mat.subMatrix(
begin_pad + context_start + j - pad_size, pad_size); begin_pad + context_start + j - pad_size, pad_size);
sub->addAtOffset(*mat, j * input_dim); sub->addAtOffset(*mat, j * input_dim);
...@@ -169,17 +204,22 @@ void ContextProjectionBackward<DEVICE_TYPE_CPU>(CpuMatrix& out_grad_mat, ...@@ -169,17 +204,22 @@ void ContextProjectionBackward<DEVICE_TYPE_CPU>(CpuMatrix& out_grad_mat,
if (end <= begin) continue; if (end <= begin) continue;
if (!in_grad_mat) continue; if (!in_grad_mat) continue;
MatrixPtr src = in_grad_mat.subMatrix(begin, end - begin); MatrixPtr src = in_grad_mat.subMatrix(begin, end - begin);
MatrixPtr dst = out_grad_mat.subMatrix(dst_begin, dst_end - dst_begin); MatrixPtr dst = const_cast<CpuMatrix&>(out_grad_mat)
.subMatrix(dst_begin, dst_end - dst_begin);
src->addAtOffset(*dst, j * input_dim); src->addAtOffset(*dst, j * input_dim);
} }
} }
} }
/** /**
* \param inputs[0] input grad. * Context Projection Backward Function.
* \param inputs[1] weight grad. * Update the weight gradient and input layer gradient with backprop
* \param inputs[2] input sequence. *
* \param outputs[0] output value. * \param inputs[0].matrix output layer grad, n * (d * l)
* \param inputs[0].vector start position sequence, n * 1
* \param outputs[0].matrix input layer grad, n * d
* \param outputs[0].vector start position sequence, n * 1
* \param outputs[1] weight grad, pad * d
*/ */
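Stated as a formula (an editorial note, not part of the patch): with c = context_start, l = context_length and d the input dimension, the forward pass writes the k-th column block of output row i from input row i + c + k, so the backward pass below accumulates, within each sequence,

\[
\frac{\partial L}{\partial x_j} \mathrel{+}= \sum_{k=0}^{l-1}
\left.\frac{\partial L}{\partial y_{\,j-c-k}}\right|_{\text{columns } [kd,\,(k+1)d)},
\]

where terms whose output row index falls outside the sequence are skipped, and blocks that correspond to padding rows are added to the weight gradient instead (when is_padding is true).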
template <DeviceType Device> template <DeviceType Device>
class ContextProjectionBackwardFunc : public FunctionBase { class ContextProjectionBackwardFunc : public FunctionBase {
...@@ -193,32 +233,40 @@ public: ...@@ -193,32 +233,40 @@ public:
} }
void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
CHECK_EQ((size_t)3, inputs.size()); CHECK_EQ(1UL, inputs.size());
CHECK_EQ((size_t)1, outputs.size()); CHECK(1UL == outputs.size() || 2UL == outputs.size());
CHECK(inputs[0].isSequenceArg() && outputs[0].isSequenceArg())
CHECK(outputs[0].data() && inputs[2].data()); << "SequenceArg required here";
CHECK_EQ(outputs[0].shape().ndims(), (size_t)2); const auto in_seq = dynamic_cast<const SequenceArg&>(inputs[0]);
CHECK_EQ(inputs[0].shape().ndims(), (size_t)2); auto out_seq = dynamic_cast<const SequenceArg&>(outputs[0]);
CHECK_EQ(inputs[1].shape().ndims(), (size_t)2); CHECK(in_seq.data() && in_seq.getSequenceId().data());
CHECK_EQ(inputs[2].shape().ndims(), (size_t)1); CHECK_EQ(in_seq.shape().ndims(), 2UL);
CHECK_EQ(out_seq.shape().ndims(), 2UL);
CHECK_EQ(out_seq.getSequenceId().shape().ndims(), 1UL);
/// dim of input == dim of weight /// input and output grads have the same batch_size
CHECK_EQ(inputs[0].shape()[1], inputs[1].shape()[1]); CHECK_EQ(out_seq.shape()[0], in_seq.shape()[0]);
/// input and output has the same batch_size /// dim of output grad = dim of input grad * context_length
CHECK_EQ(inputs[0].shape()[0], outputs[0].shape()[0]); CHECK_EQ(in_seq.shape()[1], out_seq.shape()[1] * context_length_);
/// dim of output = dim of input * context_length CHECK_EQ(out_seq.getArgType(), ADD_TO);
CHECK_EQ(outputs[0].shape()[1], inputs[0].shape()[1] * context_length_);
CHECK_EQ(outputs[0].getArgType(), ADD_TO); if (2UL == outputs.size()) {
CHECK_EQ(outputs[1].shape().ndims(), 2UL);
/// dim of input grad == dim of weight
CHECK_EQ(out_seq.shape()[1], outputs[1].shape()[1]);
CHECK_EQ(outputs[1].getArgType(), ADD_TO);
}
auto out_grad_mat = outputs[0].matrix<Device>(); const auto seq_vec = in_seq.getSequenceId().vector<int, Device>();
const auto out_grad_mat = in_seq.matrix<Device>();
auto in_grad_mat = auto in_grad_mat =
!inputs[0].data() ? typename Tensor<real, Device>::Matrix(nullptr, 0, 0) !out_seq.data() ? typename Tensor<real, Device>::Matrix(nullptr, 0, 0)
: inputs[0].matrix<Device>(); : out_seq.matrix<Device>();
auto w_grad_mat = !inputs[1].data() auto w_grad_mat =
? typename Tensor<real, Device>::Matrix(nullptr, 0, 0) (2UL == outputs.size() && outputs[1].data())
: inputs[1].matrix<Device>(); ? outputs[1].matrix<Device>()
auto seq_vec = inputs[2].vector<int, Device>(); : typename Tensor<real, Device>::Matrix(nullptr, 0, 0);
ContextProjectionBackward<Device>(out_grad_mat, ContextProjectionBackward<Device>(out_grad_mat,
in_grad_mat, in_grad_mat,
w_grad_mat, w_grad_mat,
...@@ -238,11 +286,16 @@ private: ...@@ -238,11 +286,16 @@ private:
size_t total_pad_; size_t total_pad_;
}; };
#if 0
/** /**
* \param inputs[0] input grad. * Context Projection Backward Data Function
* \param inputs[1] input sequence. * Update input layer grad
* \param outputs[0] output grad. * input: sequence of output layer grad
* output: sequence of input layer grad
*
* \param outputs[0].matrix input layer grad, n * d
* \param outputs[0].vector start position sequence, n * 1
* \param inputs[0].matrix output layer grad, n * (d * l)
 * \param inputs[0].vector start position sequence, n * 1
*/ */
template <DeviceType Device> template <DeviceType Device>
class ContextProjectionBackwardDataFunc : public FunctionBase { class ContextProjectionBackwardDataFunc : public FunctionBase {
...@@ -252,32 +305,30 @@ public: ...@@ -252,32 +305,30 @@ public:
context_start_ = config.get<int>("context_start"); context_start_ = config.get<int>("context_start");
} }
void calc(const Arguments& inputs, void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
const Arguments& outputs, CHECK_EQ(1UL, inputs.size());
const Arguments& inouts) override { CHECK_EQ(1UL, outputs.size());
CHECK_EQ(2, static_cast<int>(inputs.size())); CHECK(inputs[0].isSequenceArg() && outputs[0].isSequenceArg())
CHECK_EQ(1, static_cast<int>(outputs.size())); << "SequenceArg required here";
CHECK_EQ(0, static_cast<int>(inouts.size())); const auto in_seq = dynamic_cast<const SequenceArg&>(inputs[0]);
CHECK(inputs[0].getData() && outputs[0].getData() && inputs[1].getData()); const auto out_seq = dynamic_cast<const SequenceArg&>(outputs[0]);
CHECK_EQ(static_cast<int>(outputs[0].dims_.size()), 2);
CHECK_EQ(static_cast<int>(inputs[0].dims_.size()), 2); CHECK(in_seq.data() && out_seq.data() && in_seq.getSequenceId().data());
CHECK_EQ(static_cast<int>(inputs[1].dims_.size()), 1); CHECK_EQ(out_seq.shape().ndims(), 2UL);
CHECK_EQ(outputs[0].dims_[1], inputs[0].dims_[1] * context_length_); CHECK_EQ(in_seq.shape().ndims(), 2UL);
CHECK_EQ(in_seq.getSequenceId().shape().ndims(), 1UL);
/// output layer grad dim == input layer grad dim * context_length_
CHECK_EQ(in_seq.shape()[1], out_seq.shape()[1] * context_length_);
/// input and output have the same batch_size /// input and output have the same batch_size
CHECK_EQ(inputs[0].dims_[0], outputs[0].dims_[0]); CHECK_EQ(in_seq.shape()[0], out_seq.shape()[0]);
CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
auto out_grad_mat = std::make_shared<typename MatrixT<Device>::type>( const auto out_grad_mat = in_seq.matrix<Device>();
outputs[0].getData(), outputs[0].dims_[0], outputs[0].dims_[1]); const auto seq_vec = in_seq.getSequenceId().vector<int, Device>();
const auto in_grad_mat = std::make_shared<typename MatrixT<Device>::type>( auto in_grad_mat = out_seq.matrix<Device>();
inputs[0].getData(), inputs[0].dims_[0], inputs[0].dims_[1]);
typename SequenceT<Device>::type seq_vec(
inputs[1].dims_[0], reinterpret_cast<int*>(inputs[1].getData()));
ContextProjectionBackwardData<Device>(out_grad_mat.get(), ContextProjectionBackwardData<Device>(
in_grad_mat.get(), out_grad_mat, in_grad_mat, seq_vec, context_length_, context_start_);
seq_vec,
context_length_,
context_start_);
} }
private: private:
...@@ -286,9 +337,14 @@ private: ...@@ -286,9 +337,14 @@ private:
}; };
/** /**
* \param inputs[0] weight grad. * Context Projection Backward Weight Function
* \param inputs[1] input sequence. * Update weight grad by backprop
* \param outputs[0] output grad. * input: sequence of output layer grad
* output: weight grad
*
* \param outputs[0] weight grad, pad * d
* \param inputs[0].matrix output layer grad, n * (d * l)
 * \param inputs[0].vector start position sequence, n * 1
*/ */
template <DeviceType Device> template <DeviceType Device>
class ContextProjectionBackwardWeightFunc : public FunctionBase { class ContextProjectionBackwardWeightFunc : public FunctionBase {
...@@ -300,28 +356,25 @@ public: ...@@ -300,28 +356,25 @@ public:
total_pad_ = config.get<size_t>("total_pad"); total_pad_ = config.get<size_t>("total_pad");
} }
void calc(const Arguments& inputs, void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
const Arguments& outputs, CHECK_EQ(1UL, inputs.size());
const Arguments& inouts) override { CHECK_EQ(1UL, outputs.size());
CHECK_EQ(2, static_cast<int>(inputs.size())); CHECK(inputs[0].isSequenceArg()) << "SequenceArg required here";
CHECK_EQ(1, static_cast<int>(outputs.size())); const auto in_seq = dynamic_cast<const SequenceArg&>(inputs[0]);
CHECK_EQ(0, static_cast<int>(inouts.size())); CHECK(in_seq.data() && in_seq.getSequenceId().data() && outputs[0].data());
CHECK_EQ(outputs[0].shape().ndims(), 2UL);
CHECK(inputs[0].getData() && outputs[0].getData() && inputs[1].getData()); CHECK_EQ(in_seq.shape().ndims(), 2UL);
CHECK_EQ(static_cast<int>(outputs[0].dims_.size()), 2); CHECK_EQ(in_seq.getSequenceId().shape().ndims(), 1UL);
CHECK_EQ(static_cast<int>(inputs[0].dims_.size()), 2); CHECK_EQ(in_seq.shape()[0], outputs[0].shape()[0]);
CHECK_EQ(static_cast<int>(inputs[1].dims_.size()), 1); /// output layer grad dim == weight dim * context_length_
CHECK_EQ(outputs[0].dims_[1], inputs[0].dims_[1] * context_length_); CHECK_EQ(in_seq.shape()[1], outputs[0].shape()[1] * context_length_);
CHECK_EQ(outputs[0].getArgType(), ADD_TO);
auto out_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
outputs[0].getData(), outputs[0].dims_[0], outputs[0].dims_[1]);
auto w_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
inputs[0].getData(), inputs[0].dims_[0], inputs[0].dims_[1]);
typename SequenceT<Device>::type seq_vec(
inputs[1].dims_[0], reinterpret_cast<int*>(inputs[1].getData()));
ContextProjectionBackwardWeight<Device>(out_grad_mat.get(), const auto seq_vec = in_seq.getSequenceId().vector<int, Device>();
w_grad_mat.get(), const auto out_grad_mat = in_seq.matrix<Device>();
auto w_grad_mat = outputs[0].matrix<Device>();
ContextProjectionBackwardWeight<Device>(out_grad_mat,
w_grad_mat,
seq_vec, seq_vec,
context_length_, context_length_,
context_start_, context_start_,
...@@ -335,7 +388,6 @@ private: ...@@ -335,7 +388,6 @@ private:
size_t begin_pad_; size_t begin_pad_;
size_t total_pad_; size_t total_pad_;
}; };
#endif
REGISTER_TYPED_FUNC(ContextProjectionForward, REGISTER_TYPED_FUNC(ContextProjectionForward,
CPU, CPU,
...@@ -350,7 +402,6 @@ REGISTER_TYPED_FUNC(ContextProjectionForward, ...@@ -350,7 +402,6 @@ REGISTER_TYPED_FUNC(ContextProjectionForward,
REGISTER_TYPED_FUNC(ContextProjectionBackward, REGISTER_TYPED_FUNC(ContextProjectionBackward,
GPU, GPU,
ContextProjectionBackwardFunc); ContextProjectionBackwardFunc);
#if 0
REGISTER_TYPED_FUNC(ContextProjectionBackwardData, REGISTER_TYPED_FUNC(ContextProjectionBackwardData,
GPU, GPU,
ContextProjectionBackwardDataFunc); ContextProjectionBackwardDataFunc);
...@@ -358,5 +409,4 @@ REGISTER_TYPED_FUNC(ContextProjectionBackwardWeight, ...@@ -358,5 +409,4 @@ REGISTER_TYPED_FUNC(ContextProjectionBackwardWeight,
GPU, GPU,
ContextProjectionBackwardWeightFunc); ContextProjectionBackwardWeightFunc);
#endif #endif
#endif
} // namespace paddle } // namespace paddle
...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include "Function.h" #include "Function.h"
namespace paddle { namespace paddle {
...@@ -21,14 +20,14 @@ namespace paddle { ...@@ -21,14 +20,14 @@ namespace paddle {
/** /**
* \brief Context Projection Forward. * \brief Context Projection Forward.
* *
* \param[out] outputs output data. * \param[in/out] outputs output data.
* \param[in] input input data. * \param[in] input input data.
* \param[in] weight input weight. * \param[in] weight input weight.
* \param[in] sequence input data. * \param[in] sequence input data.
* \param[in] context_length consecutive rows for concatenation. * \param[in] context_length consecutive rows for concatenation.
* \param[in] context_start context start position. * \param[in] context_start context start position.
 * \param[in] begin_pad beginning pad position. * \param[in] begin_pad beginning pad position.
* \param[in] is_padding whether padding 0 or not. * \param[in] is_padding whether padding 0 or not.
* *
*/ */
template <DeviceType DType> template <DeviceType DType>
...@@ -56,7 +55,7 @@ void ContextProjectionForward( ...@@ -56,7 +55,7 @@ void ContextProjectionForward(
*/ */
template <DeviceType DType> template <DeviceType DType>
void ContextProjectionBackward( void ContextProjectionBackward(
typename Tensor<real, DType>::Matrix& out_grad, const typename Tensor<real, DType>::Matrix& out_grad,
typename Tensor<real, DType>::Matrix& in_grad, typename Tensor<real, DType>::Matrix& in_grad,
typename Tensor<real, DType>::Matrix& w_grad, typename Tensor<real, DType>::Matrix& w_grad,
const typename Tensor<int, DType>::Vector& seq_vec, const typename Tensor<int, DType>::Vector& seq_vec,
...@@ -68,7 +67,7 @@ void ContextProjectionBackward( ...@@ -68,7 +67,7 @@ void ContextProjectionBackward(
template <DeviceType DType> template <DeviceType DType>
void ContextProjectionBackwardData( void ContextProjectionBackwardData(
typename Tensor<real, DType>::Matrix& out_grad, const typename Tensor<real, DType>::Matrix& out_grad,
typename Tensor<real, DType>::Matrix& in_grad, typename Tensor<real, DType>::Matrix& in_grad,
const typename Tensor<int, DType>::Vector& sequence, const typename Tensor<int, DType>::Vector& sequence,
size_t context_length, size_t context_length,
...@@ -76,7 +75,7 @@ void ContextProjectionBackwardData( ...@@ -76,7 +75,7 @@ void ContextProjectionBackwardData(
template <DeviceType DType> template <DeviceType DType>
void ContextProjectionBackwardWeight( void ContextProjectionBackwardWeight(
typename Tensor<real, DType>::Matrix& out_grad, const typename Tensor<real, DType>::Matrix& out_grad,
typename Tensor<real, DType>::Matrix& w_grad, typename Tensor<real, DType>::Matrix& w_grad,
const typename Tensor<int, DType>::Vector& seq_vec, const typename Tensor<int, DType>::Vector& seq_vec,
size_t context_length, size_t context_length,
......
...@@ -138,10 +138,10 @@ void ContextProjectionForward<DEVICE_TYPE_GPU>(GpuMatrix& output, ...@@ -138,10 +138,10 @@ void ContextProjectionForward<DEVICE_TYPE_GPU>(GpuMatrix& output,
begin_pad); begin_pad);
} }
__global__ void KeContextProjectionBackwardData(real* out_grad, __global__ void KeContextProjectionBackwardData(const real* out_grad,
const int* sequence, const int* sequence,
real* in_grad, real* in_grad,
int input_dim, size_t input_dim,
int context_length, int context_length,
int context_start) { int context_start) {
int idx = threadIdx.x; int idx = threadIdx.x;
...@@ -152,7 +152,8 @@ __global__ void KeContextProjectionBackwardData(real* out_grad, ...@@ -152,7 +152,8 @@ __global__ void KeContextProjectionBackwardData(real* out_grad,
real value = 0; real value = 0;
int instances = seq_end - seq_start + context_length - 1; int instances = seq_end - seq_start + context_length - 1;
out_grad += seq_start * input_dim * context_length; auto out = const_cast<real*>(out_grad);
out += seq_start * input_dim * context_length;
in_grad += seq_start * input_dim; in_grad += seq_start * input_dim;
for (int k = 0; k <= input_dim / block_size; k++) { for (int k = 0; k <= input_dim / block_size; k++) {
if (idx < input_dim) { if (idx < input_dim) {
...@@ -169,7 +170,7 @@ __global__ void KeContextProjectionBackwardData(real* out_grad, ...@@ -169,7 +170,7 @@ __global__ void KeContextProjectionBackwardData(real* out_grad,
int outx = (i - context_length) < 0 ? i : (context_length - 1); int outx = (i - context_length) < 0 ? i : (context_length - 1);
int outy = (i - context_length) < 0 ? 0 : (i - (context_length - 1)); int outy = (i - context_length) < 0 ? 0 : (i - (context_length - 1));
real* output_r = real* output_r =
out_grad + outy * input_dim * context_length + outx * input_dim; out + outy * input_dim * context_length + outx * input_dim;
for (int j = outy; j < seq_end - seq_start; j++) { for (int j = outy; j < seq_end - seq_start; j++) {
value += output_r[idx]; value += output_r[idx];
if (j - outy == outx) break; if (j - outy == outx) break;
...@@ -194,7 +195,7 @@ __global__ void KeContextProjectionBackwardData(real* out_grad, ...@@ -194,7 +195,7 @@ __global__ void KeContextProjectionBackwardData(real* out_grad,
* @param[in] context_start context start. * @param[in] context_start context start.
* *
*/ */
void hl_context_projection_backward_data(real* out_grad, void hl_context_projection_backward_data(const real* out_grad,
const int* sequence, const int* sequence,
real* input_grad, real* input_grad,
size_t num_sequences, size_t num_sequences,
...@@ -216,7 +217,7 @@ void hl_context_projection_backward_data(real* out_grad, ...@@ -216,7 +217,7 @@ void hl_context_projection_backward_data(real* out_grad,
} }
template <> template <>
void ContextProjectionBackwardData<DEVICE_TYPE_GPU>(GpuMatrix& out_grad, void ContextProjectionBackwardData<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
GpuMatrix& in_grad, GpuMatrix& in_grad,
const GpuIVector& sequence, const GpuIVector& sequence,
size_t context_length, size_t context_length,
...@@ -231,7 +232,7 @@ void ContextProjectionBackwardData<DEVICE_TYPE_GPU>(GpuMatrix& out_grad, ...@@ -231,7 +232,7 @@ void ContextProjectionBackwardData<DEVICE_TYPE_GPU>(GpuMatrix& out_grad,
} }
template<int THREADS_X, int THREADS_Y> template<int THREADS_X, int THREADS_Y>
__global__ void KeContextProjectionBackwardWeight(real* out_grad, __global__ void KeContextProjectionBackwardWeight(const real* out_grad,
const int* sequence, const int* sequence,
real* w_grad, real* w_grad,
int num_sequences, int num_sequences,
...@@ -254,7 +255,8 @@ __global__ void KeContextProjectionBackwardWeight(real* out_grad, ...@@ -254,7 +255,8 @@ __global__ void KeContextProjectionBackwardWeight(real* out_grad,
for (int seqId = idy; seqId < num_sequences; seqId += THREADS_Y) { for (int seqId = idy; seqId < num_sequences; seqId += THREADS_Y) {
int seq_start = sequence[seqId]; int seq_start = sequence[seqId];
int seq_end = sequence[seqId+1]; int seq_end = sequence[seqId+1];
output_r = out_grad + seq_start * w_dim * context_length; output_r = const_cast<real*>(out_grad)
+ seq_start * w_dim * context_length;
if (context_start < 0) { if (context_start < 0) {
if (padId + context_start < 0) { if (padId + context_start < 0) {
...@@ -318,7 +320,7 @@ __global__ void KeContextProjectionBackwardWeight(real* out_grad, ...@@ -318,7 +320,7 @@ __global__ void KeContextProjectionBackwardWeight(real* out_grad,
* beginning. * beginning.
* *
*/ */
void hl_context_projection_backward_weight(real* out_grad, void hl_context_projection_backward_weight(const real* out_grad,
const int* sequence, const int* sequence,
real* w_grad, real* w_grad,
size_t num_sequences, size_t num_sequences,
...@@ -346,7 +348,7 @@ void hl_context_projection_backward_weight(real* out_grad, ...@@ -346,7 +348,7 @@ void hl_context_projection_backward_weight(real* out_grad,
template <> template <>
void ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>( void ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(
GpuMatrix& out_grad, const GpuMatrix& out_grad,
GpuMatrix& w_grad, GpuMatrix& w_grad,
const GpuIVector& seq_vec, const GpuIVector& seq_vec,
size_t context_length, size_t context_length,
...@@ -365,7 +367,7 @@ void ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>( ...@@ -365,7 +367,7 @@ void ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(
} }
template <> template <>
void ContextProjectionBackward<DEVICE_TYPE_GPU>(GpuMatrix& out_grad, void ContextProjectionBackward<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
GpuMatrix& in_grad, GpuMatrix& in_grad,
GpuMatrix& w_grad, GpuMatrix& w_grad,
const GpuIVector& sequence, const GpuIVector& sequence,
......
...@@ -28,52 +28,26 @@ void testMatrixProjectionForward(int context_start, ...@@ -28,52 +28,26 @@ void testMatrixProjectionForward(int context_start,
std::max(0, (int)(context_start + context_length - 1)); std::max(0, (int)(context_start + context_length - 1));
if (pad == 0) is_padding = false; if (pad == 0) is_padding = false;
FunctionCompare compare("ContextProjectionForward", FunctionCompare test("ContextProjectionForward",
FuncConfig() FuncConfig()
.set("context_length", context_length) .set("context_length", context_length)
.set("context_start", context_start) .set("context_start", context_start)
.set("begin_pad", std::max(0, -context_start))); .set("begin_pad", std::max(0, -context_start)));
CpuMatrix cpu_in(batch_size, input_dim); // prepare input arguments
cpu_in.randomizeUniform(); test.addSequence(SequenceIdArg(TensorShape{batch_size}));
GpuMatrix gpu_in(batch_size, input_dim); test.addInputs(
gpu_in.copyFrom(cpu_in); SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batch_size, input_dim}));
auto cpu_weight = if (is_padding) { // weight
is_padding ? std::make_shared<CpuMatrix>(pad, input_dim) : nullptr; test.addInputs(SequenceArg(VALUE_TYPE_FLOAT, TensorShape{pad, input_dim}));
auto gpu_weight =
is_padding ? std::make_shared<GpuMatrix>(pad, input_dim) : nullptr;
if (is_padding) {
cpu_weight->randomizeUniform();
gpu_weight->copyFrom(*cpu_weight);
} }
IVectorPtr cpu_seq; test.addOutputs(
generateSequenceStartPositions(batch_size, cpu_seq); SequenceArg(VALUE_TYPE_FLOAT,
IVectorPtr gpu_seq = IVector::create(cpu_seq->getSize(), true); TensorShape{batch_size, input_dim * context_length}),
gpu_seq->copyFrom(*cpu_seq); ADD_TO);
CpuMatrix cpu_out(batch_size, input_dim * context_length);
GpuMatrix gpu_out(batch_size, input_dim * context_length);
cpu_out.randomizeUniform();
gpu_out.copyFrom(cpu_out);
compare.getCpuFunction()->calc(
{Tensor(cpu_in.getData(), Dims{batch_size, input_dim}),
Tensor(cpu_weight ? cpu_weight->getData() : nullptr,
Dims{pad, input_dim}),
Tensor(reinterpret_cast<real*>(cpu_seq->getData()),
Dims{cpu_seq->getSize()})},
{Tensor(cpu_out.getData(), Dims{batch_size, input_dim * context_length})},
{});
compare.getGpuFunction()->calc(
{Tensor(gpu_in.getData(), Dims{batch_size, input_dim}),
Tensor(gpu_weight ? gpu_weight->getData() : nullptr,
Dims{pad, input_dim}),
Tensor(reinterpret_cast<real*>(gpu_seq->getData()),
Dims{gpu_seq->getSize()})},
{Tensor(gpu_out.getData(), Dims{batch_size, input_dim * context_length})},
{});
autotest::TensorCheckEqual(cpu_out, gpu_out); // run Function
test.run();
} }
void testMatrixProjectionBackward(int context_start, void testMatrixProjectionBackward(int context_start,
...@@ -85,65 +59,31 @@ void testMatrixProjectionBackward(int context_start, ...@@ -85,65 +59,31 @@ void testMatrixProjectionBackward(int context_start,
std::max(0, (int)(context_start + context_length - 1)); std::max(0, (int)(context_start + context_length - 1));
if (pad == 0) is_padding = false; if (pad == 0) is_padding = false;
FunctionCompare compare("ContextProjectionBackward", FunctionCompare test("ContextProjectionBackward",
FuncConfig() FuncConfig()
.set("context_length", context_length) .set("context_length", context_length)
.set("context_start", context_start) .set("context_start", context_start)
.set("begin_pad", std::max(0, -context_start)) .set("begin_pad", std::max(0, -context_start))
.set("is_padding", is_padding) .set("is_padding", is_padding)
.set("total_pad", pad)); .set("total_pad", pad));
CpuMatrix cpu_in_grad(batch_size, input_dim); // prepare input arguments
cpu_in_grad.randomizeUniform(); test.addSequence(SequenceIdArg(TensorShape{batch_size}));
GpuMatrix gpu_in_grad(batch_size, input_dim); test.addInputs(SequenceArg(
gpu_in_grad.copyFrom(cpu_in_grad); VALUE_TYPE_FLOAT, TensorShape{batch_size, input_dim * context_length}));
test.addOutputs(
CpuMatrix cpu_out_grad(batch_size, input_dim * context_length); SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batch_size, input_dim}),
cpu_out_grad.randomizeUniform(); ADD_TO);
GpuMatrix gpu_out_grad(batch_size, input_dim * context_length); if (is_padding) { // weight
gpu_out_grad.copyFrom(cpu_out_grad); test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{pad, input_dim}),
ADD_TO);
IVectorPtr cpu_seq;
generateSequenceStartPositions(batch_size, cpu_seq);
IVectorPtr gpu_seq = IVector::create(cpu_seq->getSize(), true);
gpu_seq->copyFrom(*cpu_seq);
auto cpu_w_grad =
is_padding ? std::make_shared<CpuMatrix>(pad, input_dim) : nullptr;
auto gpu_w_grad =
is_padding ? std::make_shared<GpuMatrix>(pad, input_dim) : nullptr;
if (is_padding) {
cpu_w_grad->randomizeUniform();
gpu_w_grad->copyFrom(*cpu_w_grad);
} }
compare.getCpuFunction()->calc( // run Function
{Tensor(cpu_in_grad.getData(), Dims{batch_size, input_dim}), test.run();
Tensor(cpu_w_grad ? cpu_w_grad->getData() : nullptr,
Dims{pad, input_dim}),
Tensor(reinterpret_cast<real*>(cpu_seq->getData()),
Dims{cpu_seq->getSize()})},
{Tensor(cpu_out_grad.getData(),
Dims{batch_size, input_dim * context_length})},
{});
compare.getGpuFunction()->calc(
{Tensor(gpu_in_grad.getData(), Dims{batch_size, input_dim}),
Tensor(gpu_w_grad ? gpu_w_grad->getData() : nullptr,
Dims{pad, input_dim}),
Tensor(reinterpret_cast<real*>(gpu_seq->getData()),
Dims{gpu_seq->getSize()})},
{Tensor(gpu_out_grad.getData(),
Dims{batch_size, input_dim * context_length})},
{});
autotest::TensorCheckErr(cpu_in_grad, gpu_in_grad);
if (is_padding) {
autotest::TensorCheckErr(*cpu_w_grad, *gpu_w_grad);
}
} }
TEST(ContextProjection, projection) { TEST(ContextProjection, Projection) {
for (auto context_start : {-5, -3, -1, 0, 3}) { for (auto context_start : {-5, -3, -1, 0, 3}) {
for (auto context_length : {1, 2, 5, 7}) { for (auto context_length : {1, 2, 5, 7}) {
for (auto trainable_padding : {false, true}) { for (auto trainable_padding : {false, true}) {
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "CosSimOp.h"
#include "paddle/math/Matrix.h"
#include "paddle/math/Vector.h"
namespace paddle {
/**
* Cosine Similarity for CpuMatrix
*
* \param out_mat, output value, size: nSamples * 1.
* \param in1_mat, input value 1, size: nSamples * dim.
* \param in2_mat, input value 2, size: n2 * dim (n2 == 1 or n2 == nSamples).
* \param scale, default 1.0
*
*/
template <>
void CosSimForward<DEVICE_TYPE_CPU>(CpuMatrix& out_mat,
const CpuMatrix& in1_mat,
const CpuMatrix& in2_mat,
real scale) {
CHECK(out_mat.getData() && in1_mat.getData() && in2_mat.getData());
size_t num_samples = out_mat.getHeight();
size_t dim = in1_mat.getWidth();
/// column vector [nSamples, 1]
real* out = out_mat.getData();
const real* x = in1_mat.getData();
const real* y = in2_mat.getData();
/// in2 might only have one row or full rows
CHECK(in2_mat.getHeight() == 1LU || in2_mat.getHeight() == num_samples);
size_t inc = (in2_mat.getHeight() == 1LU) ? 0 : dim;
for (size_t i = 0; i < num_samples; ++i, x += dim, y += inc) {
real square_sum_x = 0;
real square_sum_y = 0;
real xy = 0;
for (size_t j = 0; j < dim; ++j) {
square_sum_x += x[j] * x[j];
square_sum_y += y[j] * y[j];
xy += x[j] * y[j];
}
CHECK(square_sum_x > 0 && square_sum_y > 0);
out[i] = scale * xy / (std::sqrt(square_sum_x) * std::sqrt(square_sum_y));
}
}
/**
* Cosine Similarity
* for each row i,
* out[i] = scale * cos(input1[i], input2[i])
* = scale * <input1[i], input2[i]>/sqrt(|input1[i]|^2 * |input2[i]|^2)
* when input2 only has one row, then for each row i,
* out[i] = cos(input1[i], input2[0])
*
* \param inputs[0] input matrix 1, size: nSamples * dim.
* \param inputs[1] input matrix 2, size: n2 * dim (n2 == 1 or n2 == nSamples).
* \param outputs[0] output matrix, size : nSamples * 1.
*/
template <DeviceType Device>
class CosSimForwardFunc : public FunctionBase {
void init(const FuncConfig& config) override {
scale_ = config.get<real>("scale");
}
void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
CHECK_EQ(inputs.size(), 2UL);
CHECK_EQ(outputs.size(), 1UL);
CHECK_EQ(inputs[0].shape().ndims(), 2UL);
CHECK_EQ(inputs[1].shape().ndims(), 2UL);
CHECK_EQ(outputs[0].shape().ndims(), 2UL);
CHECK_EQ(inputs[0].shape()[0], outputs[0].shape()[0]);
CHECK_EQ(inputs[0].shape()[1], inputs[1].shape()[1]);
CHECK_EQ(outputs[0].shape()[1], 1UL);
CHECK(outputs[0].data() && inputs[0].data() && inputs[1].data());
CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
auto out_mat = outputs[0].matrix<Device>();
const auto in1_mat = inputs[0].matrix<Device>();
const auto in2_mat = inputs[1].matrix<Device>();
CosSimForward<Device>(out_mat, in1_mat, in2_mat, scale_);
}
private:
real scale_;
};
/**
* Cosine Similarity Derivative for CpuMatrix
*
* \param in1_grad forward input grad 1, size: nSamples * dim.
* \param in2_grad forward input grad 2,
* size: n2 * dim (n2 == 1 or n2 == nSamples).
*
* \param out_grad backward loss output grad, size : nSamples * 1.
* \param out_val forward output value, size: nSamples * 1.
* \param in1_val forward input value 1, size: nSamples * dim.
* \param in2_val forward input value 2,
* size: n2 * dim (n2 == 1 or n2 == nSamples).
* \param scale, default 1.0
*/
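For reference (an editorial note, not part of the patch), the per-element updates in the loop below follow from differentiating out_i = scale * (x . y) / (||x|| ||y||):

\[
\frac{\partial L}{\partial x_j} \mathrel{+}= g_i\,\mathrm{out}_i
\left(\frac{y_j}{x\cdot y} - \frac{x_j}{\lVert x\rVert^2}\right),
\qquad
\frac{\partial L}{\partial y_j} \mathrel{+}= g_i\,\mathrm{out}_i
\left(\frac{x_j}{x\cdot y} - \frac{y_j}{\lVert y\rVert^2}\right),
\]

where g_i = out_grad[i]. When x . y = 0 (so out_i = 0), the code uses the limit form g_i * scale * y_j / (||x|| ||y||) for the x gradient, and symmetrically for y.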
template <>
void CosSimBackward<DEVICE_TYPE_CPU>(const CpuMatrix& out_grad,
const CpuMatrix& out_val,
const CpuMatrix& in1_val,
const CpuMatrix& in2_val,
CpuMatrix& in1_grad,
CpuMatrix& in2_grad,
real scale) {
CHECK(out_grad.getData() && out_val.getData() && in1_val.getData() &&
in2_val.getData() && in1_grad.getData() && in2_grad.getData());
  CHECK_EQ(out_val.useGpu_, false) << "Matrix type is GPU, but CPU is required";
const real* grad = out_grad.getData();
const real* out = out_val.getData();
const real* prev_out_x = in1_val.getData();
const real* prev_out_y = in2_val.getData();
real* prev_grad_x = in1_grad.getData();
real* prev_grad_y = in2_grad.getData();
size_t num_samples = out_grad.getHeight();
size_t dim = in1_val.getWidth();
CHECK_EQ(in2_val.getHeight(), in2_grad.getHeight());
CHECK(in2_val.getHeight() == 1LU || in2_val.getHeight() == num_samples);
size_t inc = (in2_val.getHeight() == 1LU) ? 0 : dim;
for (size_t i = 0; i < num_samples; ++i,
prev_out_x += dim,
prev_out_y += inc,
prev_grad_x += dim,
prev_grad_y += inc) {
real square_sum_x = 0;
real square_sum_y = 0;
real xy = 0;
for (size_t j = 0; j < dim; ++j) {
square_sum_x += prev_out_x[j] * prev_out_x[j];
square_sum_y += prev_out_y[j] * prev_out_y[j];
xy += prev_out_x[j] * prev_out_y[j];
}
CHECK(square_sum_x > 0 && square_sum_y > 0);
if (xy == 0) {
real reciprocal =
1.0f / (std::sqrt(square_sum_x) * std::sqrt(square_sum_y));
for (size_t j = 0; j < dim; ++j) {
prev_grad_x[j] += scale * grad[i] * prev_out_y[j] * reciprocal;
prev_grad_y[j] += scale * grad[i] * prev_out_x[j] * reciprocal;
}
} else {
real reciprocal_xy = 1.0f / xy;
real reciprocal_square_sum_x = 1.0f / square_sum_x;
real reciprocal_square_sum_y = 1.0f / square_sum_y;
for (size_t j = 0; j < dim; ++j) {
prev_grad_x[j] +=
out[i] * grad[i] * (prev_out_y[j] * reciprocal_xy -
prev_out_x[j] * reciprocal_square_sum_x);
prev_grad_y[j] +=
out[i] * grad[i] * (prev_out_x[j] * reciprocal_xy -
prev_out_y[j] * reciprocal_square_sum_y);
}
}
}
}
/**
* Cosine Similarity backward Derivative
*
* \param outputs[0] forward input grad 1, size: nSamples * dim.
* \param outputs[1] forward input grad 2,
* size: n2 * dim (n2 == 1 or n2 == nSamples).
*
* \param inputs[0] backward loss output grad, size : nSamples * 1.
* \param inputs[1] forward output value, size: nSamples * 1.
* \param inputs[2] forward input value 1, size: nSamples * dim.
* \param inputs[3] forward input value 2,
* size: n2 * dim (n2 == 1 or n2 == nSamples).
*/
template <DeviceType Device>
class CosSimBackwardFunc : public FunctionBase {
void init(const FuncConfig& config) override {
scale_ = config.get<real>("scale");
}
void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
CHECK_EQ(inputs.size(), 4UL);
CHECK_EQ(outputs.size(), 2UL);
/// dim of out_grad and out_val == 1, column vector
CHECK_EQ(inputs[0].shape()[1], 1UL);
CHECK_EQ(inputs[1].shape()[1], 1UL);
/// nSamples of out_grad == out_val == in_val1 == in_grad1
CHECK_EQ(inputs[1].shape()[0], inputs[0].shape()[0]);
    CHECK_EQ(inputs[2].shape()[0], inputs[0].shape()[0]);
CHECK_EQ(outputs[0].shape()[0], inputs[0].shape()[0]);
    /// dim of in1_val == in2_val == in1_grad == in2_grad
CHECK_EQ(inputs[3].shape()[1], inputs[2].shape()[1]);
CHECK_EQ(outputs[0].shape()[1], inputs[2].shape()[1]);
CHECK_EQ(outputs[1].shape()[1], inputs[2].shape()[1]);
CHECK(inputs[0].data() && inputs[1].data() && inputs[2].data() &&
inputs[3].data() && outputs[0].data() && outputs[1].data());
CHECK_EQ(outputs[0].getArgType(), ADD_TO);
CHECK_EQ(outputs[1].getArgType(), ADD_TO);
const auto out_grad = inputs[0].matrix<Device>();
const auto out_val = inputs[1].matrix<Device>();
const auto in1_val = inputs[2].matrix<Device>();
const auto in2_val = inputs[3].matrix<Device>();
auto in1_grad = outputs[0].matrix<Device>();
auto in2_grad = outputs[1].matrix<Device>();
CosSimBackward<Device>(
out_grad, out_val, in1_val, in2_val, in1_grad, in2_grad, scale_);
}
private:
real scale_;
};
REGISTER_TYPED_FUNC(CosSimForward, CPU, CosSimForwardFunc);
REGISTER_TYPED_FUNC(CosSimBackward, CPU, CosSimBackwardFunc);
#ifndef PADDLE_ONLY_CPU
REGISTER_TYPED_FUNC(CosSimForward, GPU, CosSimForwardFunc);
REGISTER_TYPED_FUNC(CosSimBackward, GPU, CosSimBackwardFunc);
#endif
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "Function.h"
namespace paddle {
/**
* \brief Cosine Similarity Forward.
* for each row i,
* out[i] = scale * cos(in1[i], in2[i])
 * = scale * \sum_j (in1[i][j] * in2[i][j]) /
 *   sqrt(\sum_j (in1[i][j]^2) * \sum_j (in2[i][j]^2))
*
* \param[out] output output value.
 * \param[in] input1 input value.
 * \param[in] input2 input value.
* \param[in] scale default 1.0.
*
*/
template <DeviceType Device>
void CosSimForward(typename Tensor<real, Device>::Matrix& output,
const typename Tensor<real, Device>::Matrix& input1,
const typename Tensor<real, Device>::Matrix& input2,
real scale);
/**
* \brief Cosine Similarity BackWard for Derivative.
*
 * \param[in] out_grad backward loss output grad.
 * \param[in] out_value forward output value.
 * \param[in] in1_value forward input value 1.
 * \param[in] in2_value forward input value 2.
 * \param[in/out] in1_grad forward input grad 1.
 * \param[in/out] in2_grad forward input grad 2.
* \param[in] scale default 1.0.
*
*/
template <DeviceType Device>
void CosSimBackward(const typename Tensor<real, Device>::Matrix& out_grad,
const typename Tensor<real, Device>::Matrix& out_value,
const typename Tensor<real, Device>::Matrix& in1_value,
const typename Tensor<real, Device>::Matrix& in2_value,
typename Tensor<real, Device>::Matrix& in1_grad,
typename Tensor<real, Device>::Matrix& in2_grad,
real scale);
} // namespace paddle
This diff is collapsed.
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "FunctionTest.h"
#include "paddle/math/Matrix.h"
using namespace paddle; // NOLINT
void testCosSimForward(size_t height_x,
size_t height_y,
size_t width,
real scale) {
FunctionCompare test("CosSimForward", FuncConfig().set("scale", scale));
// prepare input arguments
test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, width}));
test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_y, width}));
test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, 1}),
ASSIGN_TO);
// run Function
test.run();
}
void testCosSimBackward(size_t height_x,
size_t height_y,
size_t width,
real scale) {
FunctionCompare test("CosSimBackward", FuncConfig().set("scale", scale));
// prepare input arguments
test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, 1}));
test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, 1}));
test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, width}));
test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_y, width}));
test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, width}),
ADD_TO);
test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_y, width}),
ADD_TO);
// run Function
test.run();
}
TEST(Matrix, cosSim) {
for (auto height_x : {10, 100, 1000}) {
for (auto height_y : {1, height_x}) {
for (auto width : {10, 100, 1000}) {
for (auto scale : {1.0, 2.0}) {
testCosSimForward(height_x, height_y, width, scale);
testCosSimBackward(height_x, height_y, width, scale);
}
}
}
}
}
(Diffs for additional files are collapsed.)
...@@ -647,7 +647,7 @@ public: ...@@ -647,7 +647,7 @@ public:
DataBatch& gpuBatch = *batch; DataBatch& gpuBatch = *batch;
std::vector<Argument>& gpuArguments = gpuBatch.getStreams(); std::vector<Argument>& gpuArguments = gpuBatch.getStreams();
gpuArguments.resize(cpuArguments.size()); gpuArguments.resize(cpuArguments.size());
gpuBatch.setSize(size); gpuBatch.setSize(bsize);
for (size_t i = 0; i < headers_.size(); ++i) { for (size_t i = 0; i < headers_.size(); ++i) {
gpuArguments[i].resizeAndCopyFrom( gpuArguments[i].resizeAndCopyFrom(
cpuArguments[i], useGpu_, HPPL_STREAM_1); cpuArguments[i], useGpu_, HPPL_STREAM_1);
......
...@@ -20,7 +20,7 @@ namespace paddle { ...@@ -20,7 +20,7 @@ namespace paddle {
/** /**
* calculate sequence-to-sequence edit distance * calculate sequence-to-sequence edit distance
*/ */
class CTCErrorEvaluator : public Evaluator { class CTCErrorEvaluator : public NotGetableEvaluator {
private: private:
MatrixPtr outActivations_; MatrixPtr outActivations_;
int numTimes_, numClasses_, numSequences_, blank_; int numTimes_, numClasses_, numSequences_, blank_;
......
(Diffs for additional files are collapsed.)