提交 1ce2fca4 编写于 作者: L liaogang

Merge conflicts

......@@ -21,3 +21,10 @@
sha: 28c0ea8a67a3e2dbbf4822ef44e85b63a0080a29
hooks:
- id: clang-formater
- repo: https://github.com/dnephin/pre-commit-golang
sha: e4693a4c282b4fc878eda172a929f7a6508e7d16
hooks:
- id: go-fmt
files: (.*\.go)
- id: go-lint
files: (.*\.go)
......@@ -33,16 +33,17 @@ addons:
- ccache
before_install:
- if [[ "$JOB" == "check_style" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi
# Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python
# Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python
# protobuf version.
- pip install numpy wheel 'protobuf==3.1' sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit requests==2.9.2 LinkChecker
- pip install rarfile
- curl https://glide.sh/get | bash
- eval "$(GIMME_GO_VERSION=1.8.3 gimme)"
- |
function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; }
script:
- |
timeout 2580 paddle/scripts/travis/${JOB}.sh # 43min timeout
timeout 2580 paddle/scripts/travis/${JOB}.sh # 43min timeout
RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true; else false; fi;
notifications:
email:
......
......@@ -16,6 +16,7 @@ cmake_minimum_required(VERSION 3.0)
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
set(PROJ_ROOT ${CMAKE_CURRENT_SOURCE_DIR})
set(PROJ_BINARY_ROOT ${CMAKE_CURRENT_BINARY_DIR})
include(system)
......
......@@ -38,12 +38,14 @@ ExternalProject_Add(
CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR}
CMAKE_ARGS -DCMAKE_INSTALL_LIBDIR=${GLOG_INSTALL_DIR}/lib
CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON
CMAKE_ARGS -DWITH_GFLAGS=ON
CMAKE_ARGS -Dgflags_DIR=${GFLAGS_INSTALL_DIR}/lib/cmake/gflags
CMAKE_ARGS -DBUILD_TESTING=OFF
CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GLOG_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR:PATH=${GLOG_INSTALL_DIR}/lib
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_BUILD_TYPE:STRING=Release
)
......
......@@ -17,6 +17,65 @@ INCLUDE(ExternalProject)
FIND_PACKAGE(Protobuf QUIET)
SET(PROTOBUF_FOUND "OFF")
# Before CMake 3.4, protobuf_generate_python is not defined by FindProtobuf,
# so a compatible fallback is provided here.
if(NOT COMMAND protobuf_generate_python)
  # protobuf_generate_python(<out_var> <proto>...)
  #
  # Adds one custom command per .proto file that runs protoc with --python_out
  # and writes <name>_pb2.py below ${CMAKE_CURRENT_BINARY_DIR}. The list of
  # generated files is returned to the caller through <out_var>.
  #
  # Include paths passed to protoc:
  #   * the directory of every proto file when
  #     PROTOBUF_GENERATE_CPP_APPEND_PATH is true, otherwise
  #     ${CMAKE_CURRENT_SOURCE_DIR};
  #   * every directory in Protobuf_IMPORT_DIRS (or PROTOBUF_IMPORT_DIRS).
  function(protobuf_generate_python SRCS)
    # Adapted from https://github.com/Kitware/CMake/blob/master/Modules/FindProtobuf.cmake
    if(NOT ARGN)
      message(SEND_ERROR "Error: PROTOBUF_GENERATE_PYTHON() called without any proto files")
      return()
    endif()

    # Older FindProtobuf modules export the compiler only under the upper-case
    # name. Fall back to it so COMMAND/DEPENDS below never expand to nothing.
    if(NOT Protobuf_PROTOC_EXECUTABLE AND PROTOBUF_PROTOC_EXECUTABLE)
      set(Protobuf_PROTOC_EXECUTABLE "${PROTOBUF_PROTOC_EXECUTABLE}")
    endif()

    if(PROTOBUF_GENERATE_CPP_APPEND_PATH)
      # Create an include path for each file specified
      foreach(FIL ${ARGN})
        get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
        get_filename_component(ABS_PATH ${ABS_FIL} PATH)
        list(FIND _protobuf_include_path ${ABS_PATH} _contains_already)
        if(${_contains_already} EQUAL -1)
          list(APPEND _protobuf_include_path -I ${ABS_PATH})
        endif()
      endforeach()
    else()
      set(_protobuf_include_path -I ${CMAKE_CURRENT_SOURCE_DIR})
    endif()

    # Accept the legacy upper-case spelling of the import directory list.
    if(DEFINED PROTOBUF_IMPORT_DIRS AND NOT DEFINED Protobuf_IMPORT_DIRS)
      set(Protobuf_IMPORT_DIRS "${PROTOBUF_IMPORT_DIRS}")
    endif()

    if(DEFINED Protobuf_IMPORT_DIRS)
      foreach(DIR ${Protobuf_IMPORT_DIRS})
        get_filename_component(ABS_PATH ${DIR} ABSOLUTE)
        list(FIND _protobuf_include_path ${ABS_PATH} _contains_already)
        if(${_contains_already} EQUAL -1)
          list(APPEND _protobuf_include_path -I ${ABS_PATH})
        endif()
      endforeach()
    endif()

    # Start from an empty result list.
    set(${SRCS})
    foreach(FIL ${ARGN})
      get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
      get_filename_component(FIL_WE ${FIL} NAME_WE)
      if(NOT PROTOBUF_GENERATE_CPP_APPEND_PATH)
        # Keep the relative sub-directory of the proto file in the output path.
        get_filename_component(FIL_DIR ${FIL} DIRECTORY)
        if(FIL_DIR)
          set(FIL_WE "${FIL_DIR}/${FIL_WE}")
        endif()
      endif()

      list(APPEND ${SRCS} "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py")
      add_custom_command(
        OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py"
        COMMAND ${Protobuf_PROTOC_EXECUTABLE} --python_out ${CMAKE_CURRENT_BINARY_DIR} ${_protobuf_include_path} ${ABS_FIL}
        DEPENDS ${ABS_FIL} ${Protobuf_PROTOC_EXECUTABLE}
        COMMENT "Running Python protocol buffer compiler on ${FIL}"
        VERBATIM )
    endforeach()

    # Hand the list of generated files back to the caller.
    set(${SRCS} ${${SRCS}} PARENT_SCOPE)
  endfunction()
endif()
# Print and set the protobuf library information,
# finish this cmake process and exit from this file.
......
......@@ -88,7 +88,7 @@
#
# including binary directory for generated headers.
include_directories(${CMAKE_BINARY_DIR})
include_directories(${CMAKE_CURRENT_BINARY_DIR})
if(NOT APPLE)
find_package(Threads REQUIRED)
......@@ -106,7 +106,7 @@ function(merge_static_libs TARGET_NAME)
if(APPLE) # Use OSX's libtool to merge archives
# To produce a library we need at least one source file.
# It is created by add_custom_command below and will helps
# It is created by add_custom_command below and will helps
# also help to track dependencies.
set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c)
......@@ -126,7 +126,7 @@ function(merge_static_libs TARGET_NAME)
# Get the file names of the libraries to be merged
set(libfiles ${libfiles} $<TARGET_FILE:${lib}>)
endforeach()
add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
COMMAND rm "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a"
COMMAND /usr/bin/libtool -static -o "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" ${libfiles})
else() # general UNIX: use "ar" to extract objects and re-add to a common lib
......@@ -154,14 +154,14 @@ function(merge_static_libs TARGET_NAME)
endforeach()
add_library(${TARGET_NAME} STATIC ${mergebases})
target_link_libraries(${TARGET_NAME} ${libs_deps})
target_link_libraries(${TARGET_NAME} ${libs_deps})
# Get the file name of the generated library
set(outlibfile "$<TARGET_FILE:${TARGET_NAME}>")
foreach(lib ${libs})
add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
COMMAND ${CMAKE_AR} cr ${outlibfile} *.o
COMMAND ${CMAKE_AR} cr ${outlibfile} *.o
COMMAND ${CMAKE_RANLIB} ${outlibfile}
WORKING_DIRECTORY ${lib}.objdir)
endforeach()
......@@ -213,7 +213,7 @@ function(cc_test TARGET_NAME)
add_executable(${TARGET_NAME} ${cc_test_SRCS})
target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} gtest gtest_main)
add_dependencies(${TARGET_NAME} ${cc_test_DEPS} gtest gtest_main)
add_test(${TARGET_NAME} ${TARGET_NAME})
add_test(NAME ${TARGET_NAME} COMMAND ${TARGET_NAME} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
endif()
endfunction(cc_test)
......@@ -300,10 +300,11 @@ function(go_library TARGET_NAME)
file(GLOB GO_SOURCE RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.go")
string(REPLACE "${PADDLE_GO_PATH}/" "" CMAKE_CURRENT_SOURCE_REL_DIR ${CMAKE_CURRENT_SOURCE_DIR})
# FIXME: link path
add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
COMMAND rm "${${TARGET_NAME}_LIB_PATH}"
# Golang build source code
COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build ${BUILD_MODE}
COMMAND GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build ${BUILD_MODE}
-o "${${TARGET_NAME}_LIB_PATH}"
"./${CMAKE_CURRENT_SOURCE_REL_DIR}/${GO_SOURCE}"
# must run under GOPATH
......@@ -318,11 +319,13 @@ function(go_binary TARGET_NAME)
cmake_parse_arguments(go_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
string(REPLACE "${PADDLE_GO_PATH}/" "" CMAKE_CURRENT_SOURCE_REL_DIR ${CMAKE_CURRENT_SOURCE_DIR})
# FIXME: link path
add_custom_command(OUTPUT ${TARGET_NAME}_timestamp
COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build
COMMAND env LIBRARY_PATH=${CMAKE_BINARY_DIR}/go/pserver/client/c/:$ENV{LIBRARY_PATH}
GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build
-o "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}"
"./${CMAKE_CURRENT_SOURCE_REL_DIR}/${go_binary_SRCS}"
WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go")
WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go")
# TODO: don't know what ${TARGET_NAME}_link does
add_custom_target(${TARGET_NAME} ALL DEPENDS go_vendor ${TARGET_NAME}_timestamp ${go_binary_DEPS})
install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME} DESTINATION bin)
......@@ -351,3 +354,12 @@ function(proto_library TARGET_NAME)
protobuf_generate_cpp(proto_srcs proto_hdrs ${proto_library_SRCS})
cc_library(${TARGET_NAME} SRCS ${proto_srcs} DEPS ${proto_library_DEPS} protobuf)
endfunction()
# py_proto_compile(<target> SRCS <proto>...)
#
# Creates a custom target <target>, built as part of ALL, that depends on the
# Python sources generated by protobuf_generate_python() for the given .proto
# files.
function(py_proto_compile TARGET_NAME)
  # Define every keyword list locally: an unset `options` would otherwise be
  # taken from whatever the calling scope happens to define.
  set(options "")
  set(oneValueArgs "")
  set(multiValueArgs SRCS)
  cmake_parse_arguments(py_proto_compile "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
  set(py_srcs)
  protobuf_generate_python(py_srcs ${py_proto_compile_SRCS})
  add_custom_target(${TARGET_NAME} ALL DEPENDS ${py_srcs})
endfunction()
......@@ -445,6 +445,11 @@ smooth_l1_cost
.. autoclass:: paddle.v2.layer.smooth_l1_cost
:noindex:
multibox_loss
--------------
.. autoclass:: paddle.v2.layer.multibox_loss
:noindex:
Check Layer
============
......@@ -468,3 +473,11 @@ prelu
--------
.. autoclass:: paddle.v2.layer.prelu
:noindex:
Detection output Layer
======================
detection_output
----------------
.. autoclass:: paddle.v2.layer.detection_output
:noindex:
......@@ -101,7 +101,7 @@
</div>
<div class="site-nav-links">
<div class="site-menu">
<a class="fork-on-github" href="https://github.com/PaddlePaddle/Paddle" target="_blank"><i class="fa fa-github"></i>Folk me on Github</a>
<a class="fork-on-github" href="https://github.com/PaddlePaddle/Paddle" target="_blank"><i class="fa fa-github"></i>Fork me on Github</a>
<div class="language-switcher dropdown">
<a type="button" data-toggle="dropdown">
<span>English</span>
......
......@@ -12,4 +12,4 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# go_binary() reads its sources from the SRCS keyword (it expands
# ${go_binary_SRCS}); with SRC the file name is silently left unparsed.
go_binary(master SRCS master.go)
go_binary(master SRCS master.go DEPS paddle_go_optimizer)
......@@ -12,4 +12,4 @@
# See the License for the specific language governing permissions and
# limitations under the License.
go_binary(pserver SRCS pserver.go)
go_binary(pserver SRCS pserver.go DEPS paddle_go_optimizer)
go_library(paddle_master SHARED)
go_library(paddle_master SHARED DEPS paddle_go_optimizer)
......@@ -104,11 +104,22 @@ func paddle_set_dataset(client C.paddle_master_client, path **C.char, size C.int
return C.PADDLE_MASTER_OK
}
// return value:
// 0:ok
// -1:error
//export paddle_next_record
func paddle_next_record(client C.paddle_master_client, record **C.uchar) C.int {
c := get(client)
r := c.NextRecord()
r, err := c.NextRecord()
if err != nil {
// Error
// TODO: return the type of error?
*record = (*C.uchar)(nullPtr)
return -1
}
if len(r) == 0 {
// Empty record
*record = (*C.uchar)(nullPtr)
return 0
}
......
......@@ -11,7 +11,12 @@ import (
// Client is the client of the master server.
type Client struct {
conn *connection.Conn
ch chan []byte
ch chan record
}
type record struct {
r []byte
err error
}
// NewClient creates a new Client.
......@@ -21,7 +26,7 @@ type Client struct {
func NewClient(addrCh <-chan string, bufSize int) *Client {
c := &Client{}
c.conn = connection.New()
c.ch = make(chan []byte, bufSize)
c.ch = make(chan record, bufSize)
go c.monitorMaster(addrCh)
go c.getRecords()
return c
......@@ -46,10 +51,11 @@ func (c *Client) getRecords() {
s := recordio.NewRangeScanner(f, &chunk.Index, -1, -1)
for s.Scan() {
c.ch <- s.Record()
c.ch <- record{s.Record(), nil}
}
if s.Err() != nil {
c.ch <- record{nil, s.Err()}
log.Errorln(err, chunk.Path)
}
......@@ -116,6 +122,7 @@ func (c *Client) taskFinished(taskID int) error {
//
// NextRecord will block until the next record is available. It is
// thread-safe.
func (c *Client) NextRecord() []byte {
return <-c.ch
func (c *Client) NextRecord() ([]byte, error) {
r := <-c.ch
return r.r, r.err
}
......@@ -68,12 +68,17 @@ func TestNextRecord(t *testing.T) {
for pass := 0; pass < 50; pass++ {
received := make(map[byte]bool)
for i := 0; i < total; i++ {
r := c.NextRecord()
r, err := c.NextRecord()
if err != nil {
t.Fatal(pass, i, "Read error:", err)
}
if len(r) != 1 {
t.Fatal("Length should be 1.", r)
t.Fatal(pass, i, "Length should be 1.", r)
}
if received[r[0]] {
t.Fatal("Received duplicate.", received, r)
t.Fatal(pass, i, "Received duplicate.", received, r)
}
received[r[0]] = true
}
......
cc_library(paddle_go_optimizer DEPS paddle_optimizer paddle_proto glog gflags protobuf)
target_link_libraries(paddle_go_optimizer stdc++ m)
go_library(paddle_pserver_cclient STATIC DEPS paddle_go_optimizer)
if(WITH_TESTING)
add_subdirectory(test)
# FIXME: this test requires pserver which is not managed by the test
# we need some kind of e2e testing mechanism.
# add_subdirectory(test)
endif()
cc_test(test_cclient SRCS test_cclient.c DEPS paddle_pserver_cclient)
cc_test(test_cclient SRCS test_cclient.c DEPS paddle_pserver_cclient paddle_go_optimizer)
add_style_check_target(test_cclient test_cclient.c)
......@@ -2,7 +2,7 @@ package pserver
// #cgo CFLAGS: -I ../../
// //FIXME: ldflags contain "build" path
// #cgo LDFLAGS: ../../build/go/pserver/client/c/libpaddle_go_optimizer.a -lstdc++
// #cgo LDFLAGS: ${SRCDIR}/../../build/go/pserver/client/c/libpaddle_go_optimizer.a -lstdc++ -lm
// #include "paddle/optimizer/optimizer.h"
// #include <stdlib.h>
// #include <string.h>
......@@ -56,8 +56,8 @@ func newOptimizer(paramWithConfigs ParameterWithConfig) *optimizer {
func (o *optimizer) GetWeights() []byte {
var buffer unsafe.Pointer
buffer_len := C.paddle_optimizer_get_weights(o.opt, &buffer)
return cArrayToSlice(buffer, int(buffer_len)*C.sizeof_float)
bufferLen := C.paddle_optimizer_get_weights(o.opt, &buffer)
return cArrayToSlice(buffer, int(bufferLen)*C.sizeof_float)
}
func (o *optimizer) UpdateParameter(g Gradient) error {
......
......@@ -10,8 +10,10 @@ import (
type ElementType int
const (
// AlreadyInitialized is the message text reporting that the pserver is already initialized
AlreadyInitialized = "pserver already initialized"
Uninitialized = "pserver not fully initialized"
// Uninitialized is the message text reporting that the pserver is not fully initialized
Uninitialized = "pserver not fully initialized"
)
// Supported element types
......@@ -55,7 +57,7 @@ func NewService(idx int) (*Service, error) {
s := &Service{
idx: idx,
}
s.optMap = make(map[string]*optimizer)
s.optMap = make(map[string]*optimizer)
s.initialized = make(chan struct{})
return s, nil
}
......
......@@ -66,6 +66,7 @@ SWIG_LINK_LIBRARIES(swig_paddle
paddle_trainer_lib
paddle_network
paddle_parameter
paddle_optimizer
paddle_math
paddle_utils
paddle_proto
......
......@@ -2,9 +2,17 @@
# Dimension library and its unit tests (dim_test is the CUDA variant).
cc_library(ddim SRCS ddim.cc)
cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
nv_test(dim_test SRCS dim_test.cu DEPS ddim)
# Unit tests for the remaining framework components.
cc_test(tensor_test SRCS tensor_test.cc DEPS ddim)
cc_test(variable_test SRCS variable_test.cc)
cc_test(scope_test SRCS scope_test.cc)
cc_test(enforce_test SRCS enforce_test.cc)
# Protobuf message libraries; op_proto and op_desc both depend on attr_type.
proto_library(attr_type SRCS attr_type.proto)
proto_library(op_proto SRCS op_proto.proto DEPS attr_type)
cc_test(op_proto_test SRCS op_proto_test.cc DEPS op_proto protobuf)
proto_library(op_desc SRCS op_desc.proto DEPS attr_type)
cc_test(op_desc_test SRCS op_desc_test.cc DEPS op_desc protobuf)
cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_proto op_desc)
# Python bindings generated from the same three .proto files.
py_proto_compile(framework_py_proto SRCS attr_type.proto op_proto.proto op_desc.proto)
# Generate an empty __init__.py next to the generated *_pb2.py files so that
# framework_py_proto is a valid Python module. The path is spelled out rather
# than relying on the command's implicit working directory.
add_custom_target(framework_py_proto_init ALL
  COMMAND ${CMAKE_COMMAND} -E touch "${CMAKE_CURRENT_BINARY_DIR}/__init__.py"
  VERBATIM)
add_dependencies(framework_py_proto framework_py_proto_init)
#pragma once
#include <boost/variant.hpp>
#include <functional>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/framework/enforce.h"
namespace paddle {
namespace framework {
// Attribute holds the value of a single operator attribute. It is a variant
// over the supported value types: int, float, std::string and vectors of
// each, with boost::blank as the first alternative.
typedef boost::variant<boost::blank, int, float, std::string, std::vector<int>,
std::vector<float>, std::vector<std::string>>
Attribute;
// AttributeMap maps an attribute name to its value.
typedef std::unordered_map<std::string, Attribute> AttributeMap;
// Checks whether an attribute value satisfies a lower-bound limit.
//
// LargerThanChecker is a functor: calling it with a value enforces
// `value > lower_bound` through PADDLE_ENFORCE, using the message
// "larger_than check fail".
template <typename T>
class LargerThanChecker {
public:
// Stores a copy of the bound the checked value must exceed.
LargerThanChecker(T lower_bound) : lower_bound_(lower_bound) {}
// Enforces that `value` is strictly greater than the stored bound.
void operator()(T& value) const {
PADDLE_ENFORCE(value > lower_bound_, "larger_than check fail");
}
private:
// Exclusive lower bound.
T lower_bound_;
};
// we can provide users more common Checker, like 'LessThanChecker',
// 'BetweenChecker'...
// Functor that overwrites the value it is called with by a stored default.
template <typename T>
class DefaultValueSetter {
 public:
  // Keeps a copy of `default_value` for later assignment.
  DefaultValueSetter(T default_value) : fallback_(default_value) {}

  // Assigns the stored default to `value`.
  void operator()(T& value) const {
    value = fallback_;
  }

 private:
  T fallback_;
};
// check whether a certain attribute fit its limits
// an attribute can have more than one limits
template <typename T>
class TypedAttrChecker {
typedef std::function<void(T&)> ValueChecker;
public:
TypedAttrChecker(const std::string& attr_name) : attr_name_(attr_name) {}
TypedAttrChecker& LargerThan(const T& lower_bound) {
value_checkers_.push_back(LargerThanChecker<T>(lower_bound));
return *this;
}
// we can add more common limits, like LessThan(), Between()...
TypedAttrChecker& SetDefault(const T& default_value) {
PADDLE_ENFORCE(default_value_setter_.empty(),
"%s can't have more than one default value!", attr_name_);
default_value_setter_.push_back(DefaultValueSetter<T>(default_value));
return *this;
}
// allow users provide their own checker
TypedAttrChecker& AddCustomChecker(const ValueChecker& checker) {
value_checkers_.push_back(checker);
return *this;
}
void operator()(AttributeMap& attr_map) const {
if (!attr_map.count(attr_name_)) {
// user do not set this attr
PADDLE_ENFORCE(!default_value_setter_.empty(),
"Attribute '%s' is required!", attr_name_);
// default_value_setter_ has no more than one element
T val;
(default_value_setter_[0])(val);
attr_map[attr_name_] = val;
}
Attribute& attr = attr_map.at(attr_name_);
T& attr_value = boost::get<T>(attr);
for (const auto& checker : value_checkers_) {
checker(attr_value);
}
}
private:
std::string attr_name_;
std::vector<ValueChecker> value_checkers_;
std::vector<ValueChecker> default_value_setter_;
};
// Validates every attribute of an operator against its own limits.
class OpAttrChecker {
  typedef std::function<void(AttributeMap&)> AttrChecker;

 public:
  // Appends a TypedAttrChecker<T> for `attr_name` and returns a reference to
  // the stored instance so that limits can be chained onto it.
  template <typename T>
  TypedAttrChecker<T>& AddAttrChecker(const std::string& attr_name) {
    attr_checkers_.push_back(TypedAttrChecker<T>(attr_name));
    AttrChecker& stored = attr_checkers_.back();
    TypedAttrChecker<T>* typed = stored.target<TypedAttrChecker<T>>();
    return *typed;
  }

  // Runs every registered attribute checker, in registration order.
  void Check(AttributeMap& attr_map) const {
    for (size_t i = 0; i < attr_checkers_.size(); ++i) {
      attr_checkers_[i](attr_map);
    }
  }

 private:
  std::vector<AttrChecker> attr_checkers_;
};
} // namespace framework
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
syntax="proto2";
package paddle.framework;
import "attr_type.proto";
// AttrDesc is used to describe Attributes of an Operator. It contains the
// name, type, and value of an Attribute. The value is stored in the field
// matching the type: i, f, s, ints, floats or strings.
//
// e.g., for scale=3.0: name=scale, type=AttrType.FLOAT, f=3.0
message AttrDesc {
required string name = 1;
required AttrType type = 2;
optional int32 i = 3;
optional float f = 4;
optional string s = 5;
repeated int32 ints = 6;
repeated float floats = 7;
repeated string strings = 8;
};
// Protocol Message to describe an Operator.
//
// In PaddlePaddle, an Operator is used to do a certain computation such
// as "add", "sub", "cosine", etc.
// (1) An Operator needs to know the input and output variable names.
// (2) Some ops may have special attributes such as "scale" in "CosineOp".
//
// A 3rd-party language can build this proto message and call
// AddOp(const OpDesc& op_desc) of Paddle core to create an Operator.
message OpDesc {
// input names of this Operator.
repeated string inputs = 1;
// output names of this Operator.
repeated string outputs = 2;
// type of this Operator, such as "add", "sub", "fc". Required.
required string type = 3;
// Attributes of this Operator. e.g., scale=3.0 in cosine op.
repeated AttrDesc attrs = 4;
};
\ No newline at end of file
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include <paddle/framework/op_desc.pb.h>
// Builds an OpDesc step by step and checks that IsInitialized() only becomes
// true once every required field, including the attribute name, is set.
TEST(OpDesc, Create) {
  using paddle::framework::OpDesc;

  OpDesc desc;
  desc.set_type("add");
  desc.add_inputs("X");
  desc.add_inputs("Y");
  desc.add_outputs("Z");

  auto scale_attr = desc.add_attrs();
  scale_attr->set_type(paddle::framework::AttrType::FLOAT);
  scale_attr->set_f(3.14);

  // The required field `name` is still missing on the attribute.
  ASSERT_FALSE(desc.IsInitialized());

  scale_attr->set_name("add");

  // Every required field is set now.
  ASSERT_TRUE(desc.IsInitialized());
}
\ No newline at end of file
#pragma once
#include "paddle/framework/attr_checker.h"
//#include "paddle/framework/op_base.h"
#include "paddle/framework/op_desc.pb.h"
#include "paddle/framework/op_proto.pb.h"
namespace paddle {
namespace framework {
//==================For test================//
// Minimal operator base class used by the registry in this header.
// It carries the input/output variable names and the attribute map that
// OpRegistry::CreateOp fills in from an OpDesc.
class OpBase {
public:
// Names of the input variables.
std::vector<std::string> inputs_;
// Names of the output variables.
std::vector<std::string> outputs_;
// Attribute values keyed by attribute name.
AttributeMap attr_map_;
// Implemented by concrete operators; returns a string.
virtual std::string Run() const = 0;
virtual ~OpBase() {}
};
//=========================================//
// Helper that translates between C++ attribute types and AttrType values.
struct AttrTypeHelper {
  // Stores the AttrType matching T in `attr`; specialized per supported type.
  template <typename T>
  static void SetAttrType(AttrProto* attr);

  // Converts the value held by `attr_desc` into an Attribute, reading the
  // field selected by attr_desc.type(). An unknown type trips PADDLE_ENFORCE.
  static Attribute GetAttrValue(const AttrDesc& attr_desc) {
    switch (attr_desc.type()) {
      case paddle::framework::AttrType::INT:
        return attr_desc.i();
      case paddle::framework::AttrType::FLOAT:
        return attr_desc.f();
      case paddle::framework::AttrType::STRING:
        return attr_desc.s();
      case paddle::framework::AttrType::INTS: {
        const int count = attr_desc.ints_size();
        std::vector<int> values;
        values.reserve(count);
        for (int idx = 0; idx < count; ++idx) {
          values.push_back(attr_desc.ints(idx));
        }
        return values;
      }
      case paddle::framework::AttrType::FLOATS: {
        const int count = attr_desc.floats_size();
        std::vector<float> values;
        values.reserve(count);
        for (int idx = 0; idx < count; ++idx) {
          values.push_back(attr_desc.floats(idx));
        }
        return values;
      }
      case paddle::framework::AttrType::STRINGS: {
        const int count = attr_desc.strings_size();
        std::vector<std::string> values;
        values.reserve(count);
        for (int idx = 0; idx < count; ++idx) {
          values.push_back(attr_desc.strings(idx));
        }
        return values;
      }
    }
    PADDLE_ENFORCE(false, "Unknown OpDesc::AttrDesc::type !");
    return boost::blank();
  }
};
// Explicit specializations of AttrTypeHelper::SetAttrType, one per supported
// attribute type. A full specialization of a function template is an ordinary
// function, so each is marked `inline`: this header can then be included from
// several translation units without multiple-definition link errors.
template <>
inline void AttrTypeHelper::SetAttrType<int>(AttrProto* attr) {
  attr->set_type(paddle::framework::AttrType::INT);
}

template <>
inline void AttrTypeHelper::SetAttrType<float>(AttrProto* attr) {
  attr->set_type(paddle::framework::AttrType::FLOAT);
}

template <>
inline void AttrTypeHelper::SetAttrType<std::string>(AttrProto* attr) {
  attr->set_type(paddle::framework::AttrType::STRING);
}

template <>
inline void AttrTypeHelper::SetAttrType<std::vector<int>>(AttrProto* attr) {
  attr->set_type(paddle::framework::AttrType::INTS);
}

template <>
inline void AttrTypeHelper::SetAttrType<std::vector<float>>(AttrProto* attr) {
  attr->set_type(paddle::framework::AttrType::FLOATS);
}

template <>
inline void AttrTypeHelper::SetAttrType<std::vector<std::string>>(
    AttrProto* attr) {
  attr->set_type(paddle::framework::AttrType::STRINGS);
}
// Base class for per-operator "maker" classes. Besides filling in the OpProto
// it also registers the attribute checkers of the operator.
class OpProtoAndCheckerMaker {
 public:
  OpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
      : proto_(proto), op_checker_(op_checker) {}

 protected:
  // Appends an input entry with the given name and comment to the proto.
  void AddInput(const std::string& name, const std::string& comment) {
    auto entry = proto_->add_inputs();
    entry->set_name(name);
    entry->set_comment(comment);
  }

  // Appends an output entry with the given name and comment to the proto.
  void AddOutput(const std::string& name, const std::string& comment) {
    auto entry = proto_->add_outputs();
    entry->set_name(name);
    entry->set_comment(comment);
  }

  // Appends an attribute entry of type T to the proto and returns the checker
  // registered for it, so that limits can be chained by the caller.
  template <typename T>
  TypedAttrChecker<T>& AddAttr(const std::string& name,
                               const std::string& comment) {
    auto entry = proto_->add_attrs();
    entry->set_name(name);
    entry->set_comment(comment);
    AttrTypeHelper::SetAttrType<T>(entry);
    return op_checker_->AddAttrChecker<T>(name);
  }

  // Sets the operator type recorded in the proto.
  void AddType(const std::string& op_type) { proto_->set_type(op_type); }

  // Sets the operator-level comment recorded in the proto.
  void AddComment(const std::string& comment) { proto_->set_comment(comment); }

  OpProto* proto_;
  OpAttrChecker* op_checker_;
};
class OpRegistry {
typedef std::function<OpBase*()> OpCreator;
public:
template <typename OpType, typename ProtoMakerType>
static void RegisterOp(const std::string& op_type) {
creators_[op_type] = []() { return new OpType; };
OpProto& op_proto = protos_[op_type];
OpAttrChecker& op_checker = op_checkers_[op_type];
ProtoMakerType(&op_proto, &op_checker);
PADDLE_ENFORCE(op_proto.IsInitialized() == true,
"Fail to initialize %s's OpProto !", op_type);
}
static OpBase* CreateOp(const OpDesc& op_desc) {
std::string op_type = op_desc.type();
OpBase* op = (creators_.at(op_type))();
(op->inputs_).resize(op_desc.inputs_size());
for (int i = 0; i < op_desc.inputs_size(); ++i) {
(op->inputs_)[i] = op_desc.inputs(i);
}
(op->outputs_).resize(op_desc.outputs_size());
for (int i = 0; i < op_desc.outputs_size(); ++i) {
(op->outputs_)[i] = op_desc.outputs(i);
}
for (int i = 0; i < op_desc.attrs_size(); ++i) {
const AttrDesc& ith_attr = op_desc.attrs(i);
std::string name = ith_attr.name();
(op->attr_map_)[name] = AttrTypeHelper::GetAttrValue(ith_attr);
}
const OpAttrChecker& op_checker = op_checkers_.at(op_type);
op_checker.Check(op->attr_map_);
return op;
}
private:
static std::unordered_map<std::string, OpCreator> creators_;
static std::unordered_map<std::string, OpProto> protos_;
static std::unordered_map<std::string, OpAttrChecker> op_checkers_;
};
std::unordered_map<std::string, std::function<OpBase*()>> OpRegistry::creators_;
std::unordered_map<std::string, OpProto> OpRegistry::protos_;
std::unordered_map<std::string, OpAttrChecker> OpRegistry::op_checkers_;
// Helper whose constructor registers OpType, described by ProtoMakerType,
// with OpRegistry under the given type name.
template <typename OpType, typename ProtoMakerType>
class OpRegisterHelper {
public:
OpRegisterHelper(std::string op_type) {
OpRegistry::RegisterOp<OpType, ProtoMakerType>(op_type);
}
};
// REGISTER_OP(op_class, op_maker_class, op_type)
//
// Declares a class named <op_class>Register holding a static const
// OpRegisterHelper member and defines that member, constructing it with the
// stringified op_type. Constructing the member performs the registration.
#define REGISTER_OP(__op_class, __op_maker_class, __op_type) \
class __op_class##Register { \
private: \
const static OpRegisterHelper<__op_class, __op_maker_class> reg; \
}; \
const OpRegisterHelper<__op_class, __op_maker_class> \
__op_class##Register::reg(#__op_type);
// Demos
// Demo operator: Run() reports the value of its float attribute "scale".
class CosineOp : public OpBase {
 public:
  virtual std::string Run() const {
    const float scale = boost::get<float>(attr_map_.at("scale"));
    return "CosineOp runs! scale = " + std::to_string(scale);
  }
};
// Describes the demo CosineOp: one input, one output and a float attribute
// "scale" that defaults to 1.0 and must be larger than 0.0.
class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
 public:
  CosineOpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("input", "input of cosine op");
    AddOutput("output", "output of cosine op");
    AddAttr<float>("scale", "scale of cosine op")
        .SetDefault(1.0)
        .LargerThan(0.0);
    // Keep the proto type identical to the name the op is registered under
    // below, so that the proto and the registry key agree.
    AddType("cos_sim");
    AddComment("This is cos op");
  }
};

REGISTER_OP(CosineOp, CosineOpProtoAndCheckerMaker, cos_sim)
// Demo operator: Run() reports the value of its int attribute "test_attr".
class MyTestOp : public OpBase {
 public:
  virtual std::string Run() const {
    const int test_attr = boost::get<int>(attr_map_.at("test_attr"));
    return "MyTestOp runs! test_attr = " + std::to_string(test_attr);
  }
};
// Describes the demo MyTestOp: one input, one output and a required int
// attribute "test_attr" that must be even (enforced by a custom checker).
class MyTestOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
 public:
  MyTestOpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    // The descriptions name this op rather than the cosine op they were
    // copied from.
    AddInput("input", "input of my_test op");
    AddOutput("output", "output of my_test op");
    auto my_checker = [](int i) {
      PADDLE_ENFORCE(i % 2 == 0, "'test_attr' must be even!");
    };
    AddAttr<int>("test_attr", "a simple test attribute")
        .AddCustomChecker(my_checker);
    AddType("my_test_op");
    AddComment("This is my_test op");
  }
};

REGISTER_OP(MyTestOp, MyTestOpProtoAndCheckerMaker, my_test_op)
} // namespace framework
} // namespace paddle
#include "paddle/framework/op_registry.h"
#include <gtest/gtest.h>
// Creates a "cos_sim" op with an explicit scale and checks the string
// returned by Run().
TEST(OpRegistry, CreateOp) {
  paddle::framework::OpDesc op_desc;
  op_desc.set_type("cos_sim");
  op_desc.add_inputs("aa");
  op_desc.add_outputs("bb");

  auto attr = op_desc.mutable_attrs()->Add();
  attr->set_name("scale");
  attr->set_type(paddle::framework::AttrType::FLOAT);
  attr->set_f(3.3);

  paddle::framework::OpBase* op =
      paddle::framework::OpRegistry::CreateOp(op_desc);
  std::string debug_str = op->Run();
  // The op is allocated with `new` by the registry; release it here.
  delete op;

  std::string str = "CosineOp runs! scale = " + std::to_string(3.3);
  ASSERT_EQ(str.size(), debug_str.size());
  for (size_t i = 0; i < debug_str.length(); ++i) {
    ASSERT_EQ(debug_str[i], str[i]);
  }
}
// A negative scale violates the LargerThan(0.0) limit of "cos_sim", so
// CreateOp must throw EnforceNotMet with the larger_than message.
TEST(OpRegistry, IllegalAttr) {
  paddle::framework::OpDesc op_desc;
  op_desc.set_type("cos_sim");
  op_desc.add_inputs("aa");
  op_desc.add_outputs("bb");

  auto attr = op_desc.mutable_attrs()->Add();
  attr->set_name("scale");
  attr->set_type(paddle::framework::AttrType::FLOAT);
  attr->set_f(-2.0);

  bool caught = false;
  try {
    paddle::framework::OpBase* op __attribute__((unused)) =
        paddle::framework::OpRegistry::CreateOp(op_desc);
  } catch (const paddle::framework::EnforceNotMet& err) {
    // Caught by const reference to avoid copying the exception object.
    caught = true;
    std::string msg = "larger_than check fail";
    const char* err_msg = err.what();
    for (size_t i = 0; i < msg.length(); ++i) {
      ASSERT_EQ(err_msg[i], msg[i]);
    }
  }
  ASSERT_TRUE(caught);
}
// Without an explicit "scale" attribute the registered default (1.0) is used.
TEST(OpRegistry, DefaultValue) {
  paddle::framework::OpDesc op_desc;
  op_desc.set_type("cos_sim");
  op_desc.add_inputs("aa");
  op_desc.add_outputs("bb");

  paddle::framework::OpBase* op =
      paddle::framework::OpRegistry::CreateOp(op_desc);
  std::string debug_str = op->Run();
  // The op is allocated with `new` by the registry; release it here.
  delete op;

  float default_value = 1.0;
  std::string str = "CosineOp runs! scale = " + std::to_string(default_value);
  ASSERT_EQ(str.size(), debug_str.size());
  for (size_t i = 0; i < debug_str.length(); ++i) {
    ASSERT_EQ(debug_str[i], str[i]);
  }
}
// Exercises the custom checker of "my_test_op" in three steps: attribute
// missing, attribute set to an odd (illegal) value, attribute set to an even
// (legal) value.
TEST(OpRegistry, CustomChecker) {
  paddle::framework::OpDesc op_desc;
  op_desc.set_type("my_test_op");
  op_desc.add_inputs("ii");
  op_desc.add_outputs("oo");

  // attr 'test_attr' is not set
  bool caught = false;
  try {
    paddle::framework::OpBase* op __attribute__((unused)) =
        paddle::framework::OpRegistry::CreateOp(op_desc);
  } catch (const paddle::framework::EnforceNotMet& err) {
    caught = true;
    std::string msg = "Attribute 'test_attr' is required!";
    const char* err_msg = err.what();
    for (size_t i = 0; i < msg.length(); ++i) {
      ASSERT_EQ(err_msg[i], msg[i]);
    }
  }
  ASSERT_TRUE(caught);

  // set 'test_attr' to an illegal value
  auto attr = op_desc.mutable_attrs()->Add();
  attr->set_name("test_attr");
  attr->set_type(paddle::framework::AttrType::INT);
  attr->set_i(3);
  caught = false;
  try {
    paddle::framework::OpBase* op __attribute__((unused)) =
        paddle::framework::OpRegistry::CreateOp(op_desc);
  } catch (const paddle::framework::EnforceNotMet& err) {
    caught = true;
    std::string msg = "'test_attr' must be even!";
    const char* err_msg = err.what();
    for (size_t i = 0; i < msg.length(); ++i) {
      ASSERT_EQ(err_msg[i], msg[i]);
    }
  }
  ASSERT_TRUE(caught);

  // set 'test_attr' to a legal value
  op_desc.mutable_attrs()->Clear();
  attr = op_desc.mutable_attrs()->Add();
  attr->set_name("test_attr");
  attr->set_type(paddle::framework::AttrType::INT);
  attr->set_i(4);
  paddle::framework::OpBase* op =
      paddle::framework::OpRegistry::CreateOp(op_desc);
  std::string debug_str = op->Run();
  // The op is allocated with `new` by the registry; release it here.
  delete op;

  std::string str = "MyTestOp runs! test_attr = " + std::to_string(4);
  ASSERT_EQ(str.size(), debug_str.size());
  for (size_t i = 0; i < debug_str.length(); ++i) {
    ASSERT_EQ(debug_str[i], str[i]);
  }
}
\ No newline at end of file
......@@ -14,33 +14,39 @@ limitations under the License. */
#pragma once
#include <memory>
#include <type_traits>
#include "paddle/framework/ddim.h"
#include "paddle/framework/enforce.h"
#include "paddle/memory/memory.h"
#include "paddle/platform/place.h"
namespace paddle {
namespace framework {
class Tensor {
using paddle::platform::Place;
using paddle::platform::get_place;
public:
template <typename T>
const T* data() const {
PADDLE_ASSERT(holder_ != nullptr,
"Tensor::data must be called after Tensor::mutable_data");
return static_cast<const T*>(holder->Ptr());
PADDLE_ENFORCE(holder_ != nullptr,
"Tensor::data must be called after Tensor::mutable_data.");
return static_cast<const T*>(holder_->Ptr());
}
template <typename T, // must be POD types
typename = std::enable_if<std::is_pod<T>::value>::type>
T* mutable_data(DDim dims, Place place) {
if (holder_ == nullptr || holder_->Place() != place ||
holder_->Size() < dims.product() * sizeof(T)) {
holder_.reset(new PlaceholderImpl(place, dims.product() * sizeof(T)));
typename std::enable_if<std::is_pod<T>::value>::type* = nullptr>
T* mutable_data(DDim dims, paddle::platform::Place place) {
if (holder_ == nullptr ||
!(holder_->Place() ==
place) /* some versions of boost::variant don't have operator!= */
|| holder_->Size() < product(dims) * sizeof(T)) {
holder_.reset(new PlaceholderImpl<T>(place, product(dims) * sizeof(T)));
}
return static_cast<T*>(holder_->Ptr());
}
template <typename T, // must be POD types
typename = std::enable_if<std::is_pod<T>::value>::type>
typename std::enable_if<std::is_pod<T>::value>::type* = nullptr>
T* mutable_data(DDim dims) {
return mutable_data<T>(dims, paddle::platform::get_place());
}
......@@ -51,27 +57,41 @@ class Tensor {
struct Placeholder {
virtual ~Placeholder() {}
virtual void* Ptr() const = 0;
virtual Place Place() const = 0;
virtual paddle::platform::Place Place() const = 0;
virtual size_t Size() const = 0;
};
template <typename T>
struct PlaceholderImpl : public Placeholder {
PlaceholderImpl(Place pl, size_t size)
: ptr_(paddle::memory::Alloc(pl, size), paddle::memory::Deleter(pl)),
place_(pl),
private:
class Deleter {
public:
Deleter(platform::Place place) : place_(place) {}
void operator()(T* ptr) {
paddle::memory::Free(place_, static_cast<void*>(ptr));
}
private:
paddle::platform::Place place_;
};
public:
PlaceholderImpl(paddle::platform::Place place, size_t size)
: ptr_(static_cast<T*>(paddle::memory::Alloc(place, size)),
Deleter(place)),
place_(place),
size_(size) {}
virtual void* Ptr() const { return static_cast<void*>(ptr_.get()); }
virtual size_t Size() const { return size_; }
virtual Place Place() const { return place_; }
virtual paddle::platform::Place Place() const { return place_; }
std::unique_ptr<T, memory::Deleter> ptr_;
Place place_; // record the place of ptr_.
size_t size_; // size of the memory block.
std::unique_ptr<T, Deleter> ptr_;
paddle::platform::Place place_; // record the place of ptr_.
size_t size_; // size of the memory block.
};
std::unique_ptr<Placeholder> holder_; // holds the memory block if allocated.
std::shared_ptr<Placeholder> holder_; // holds the memory block if allocated.
};
} // namespace framework
......
/*
Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "paddle/framework/tensor.h"
#include <gtest/gtest.h>
#include <string>
// Calling Tensor::data<T>() on a tensor that has never been allocated must
// throw EnforceNotMet, and what() must start with the expected message.
TEST(Tensor, ASSERT) {
  paddle::framework::Tensor cpu_tensor;

  bool caught = false;
  try {
    const double* p __attribute__((unused)) = cpu_tensor.data<double>();
  } catch (const paddle::framework::EnforceNotMet& err) {
    // Catch by const reference: avoids copying the exception object.
    caught = true;
    std::string msg = "Tensor::data must be called after Tensor::mutable_data.";
    const char* what = err.what();
    // Only the first msg.length() characters of what() are compared.
    for (size_t i = 0; i < msg.length(); ++i) {
      ASSERT_EQ(what[i], msg[i]);
    }
  }
  ASSERT_TRUE(caught);
}
/* mutable_data() is not tested at present
because Memory::Alloc() and Memory::Free() are not ready yet.
TEST(Tensor, MutableData) {
using namespace paddle::framework;
using namespace paddle::platform;
{
Tensor cpu_tensor;
float* p1 = nullptr;
float* p2 = nullptr;
// initialization
p1 = cpu_tensor.mutable_data<float>(make_ddim({1, 2, 3}), CPUPlace());
EXPECT_NE(p1, nullptr);
// set cpu_tensor a new dim with large size
// memory is supposed to be re-allocated
p2 = cpu_tensor.mutable_data<float>(make_ddim({3, 4}));
EXPECT_NE(p2, nullptr);
EXPECT_NE(p1, p2);
// set cpu_tensor a new dim with same size
// memory block is supposed to be unchanged
p1 = cpu_tensor.mutable_data<float>(make_ddim({2, 2, 3}));
EXPECT_EQ(p1, p2);
// set cpu_tensor a new dim with smaller size
// memory block is supposed to be unchanged
p2 = cpu_tensor.mutable_data<float>(make_ddim({2, 2}));
EXPECT_EQ(p1, p2);
}
{
Tensor gpu_tensor;
float* p1 = nullptr;
float* p2 = nullptr;
// initialization
p1 = gpu_tensor.mutable_data<float>(make_ddim({1, 2, 3}), GPUPlace());
EXPECT_NE(p1, nullptr);
// set gpu_tensor a new dim with large size
// memory is supposed to be re-allocated
p2 = gpu_tensor.mutable_data<float>(make_ddim({3, 4}));
EXPECT_NE(p2, nullptr);
EXPECT_NE(p1, p2);
// set gpu_tensor a new dim with same size
// memory block is supposed to be unchanged
p1 = gpu_tensor.mutable_data<float>(make_ddim({2, 2, 3}));
EXPECT_EQ(p1, p2);
// set gpu_tensor a new dim with smaller size
// memory block is supposed to be unchanged
p2 = gpu_tensor.mutable_data<float>(make_ddim({2, 2}));
EXPECT_EQ(p1, p2);
}
}
*/
......@@ -25,6 +25,10 @@ namespace paddle {
* If SequenceLevel = kNonSeq:
* Output: output size is the number of input sequences (NOT input instances)
* output[i] = average_{for each instance in this sequence}{input[i]}
* If stride_ > 0:
* Output: a shortened sequence. Stride is the step size by which we slide a
* window upon the input sequence, and the average pooling
* operation is then applied to each interval independently.
* If SequenceLevel = kSeq:
* Check input sequence must has sub-sequence
* Output: output size is the number of input sub-sequences
......
......@@ -36,6 +36,16 @@ MatrixPtr CrossChannelNormLayer::createSpatialMatrix(MatrixPtr data,
data->getData() + iter * spatialDim, 1, spatialDim, false, useGpu_);
}
// Initializes the layer from its configuration: reads the channel count from
// the first input's norm_conf and wraps the first parameter as a
// channels_ x 1 scale weight.
// Returns false if the base-class initialization fails, true otherwise.
bool CrossChannelNormLayer::init(const LayerMap& layerMap,
                                 const ParameterMap& parameterMap) {
  // Propagate a base-class failure instead of silently ignoring it.
  if (!Layer::init(layerMap, parameterMap)) return false;
  // The scale weight below is built on top of parameters_[0].
  CHECK(parameters_[0]);
  const NormConfig& conf = config_.inputs(0).norm_conf();
  channels_ = conf.channels();
  scale_.reset(new Weight(channels_, 1, parameters_[0]));
  return true;
}
void CrossChannelNormLayer::forward(PassType passType) {
Layer::forward(passType);
MatrixPtr inV = getInputValue(0);
......@@ -51,9 +61,7 @@ void CrossChannelNormLayer::forward(PassType passType) {
Matrix::resizeOrCreate(dataBuffer_, batchSize, dataDim, false, useGpu_);
Matrix::resizeOrCreate(spatialBuffer_, 1, spatialDim, false, useGpu_);
Matrix::resizeOrCreate(normBuffer_, batchSize, spatialDim, false, useGpu_);
normBuffer_->zeroMem();
// add eps to avoid overflow
normBuffer_->addScalar(*normBuffer_, 1e-6);
inV->square2(*dataBuffer_);
for (size_t i = 0; i < batchSize; i++) {
const MatrixPtr inVTmp = createSampleMatrix(inV, i, spatialDim);
......@@ -63,6 +71,8 @@ void CrossChannelNormLayer::forward(PassType passType) {
// compute norm.
spatialBuffer_->sumCols(*dataTmp, 1, 0);
// add eps to avoid overflow
spatialBuffer_->add(1e-6);
spatialBuffer_->sqrt2(*spatialBuffer_);
normTmp->copyFrom(*spatialBuffer_);
outVTmp->copyFrom(*inVTmp);
......@@ -82,6 +92,9 @@ void CrossChannelNormLayer::backward(const UpdateCallback& callback) {
size_t dataDim = inG->getWidth();
size_t spatialDim = dataDim / channels_;
MatrixPtr inGBuffer;
Matrix::resizeOrCreate(inGBuffer, channels_, spatialDim, false, useGpu_);
dataBuffer_->dotMul(*outG, *outV);
Matrix::resizeOrCreate(scaleDiff_, channels_, 1, false, useGpu_);
Matrix::resizeOrCreate(channelBuffer_, channels_, 1, false, useGpu_);
......@@ -100,22 +113,24 @@ void CrossChannelNormLayer::backward(const UpdateCallback& callback) {
scaleDiff_->add(*channelBuffer_, 1.);
sampleBuffer_->dotMul(*inVTmp, *outGTmp);
spatialBuffer_->sumCols(*sampleBuffer_, 1., 1.);
spatialBuffer_->sumCols(*sampleBuffer_, 1., 0.);
// scale the grad
inGTmp->copyFrom(*inVTmp);
inGTmp->mulRowVector(*spatialBuffer_);
inGBuffer->copyFrom(*inVTmp);
inGBuffer->mulRowVector(*spatialBuffer_);
// divide by square of norm
spatialBuffer_->dotMul(*normTmp, *normTmp);
inGTmp->divRowVector(*spatialBuffer_);
inGBuffer->divRowVector(*spatialBuffer_);
// subtract
inGTmp->add(*outGTmp, -1, 1);
inGBuffer->add(*outGTmp, -1, 1);
// divide by norm
inGTmp->divRowVector(*normTmp);
inGBuffer->divRowVector(*normTmp);
// scale the diff
inGTmp->mulColVector(*scale_->getW());
inGBuffer->mulColVector(*scale_->getW());
inGTmp->add(*inGBuffer);
}
// updata scale
if (scale_->getWGrad()) scale_->getWGrad()->copyFrom(*scaleDiff_);
if (scale_->getWGrad()) scale_->getWGrad()->add(*scaleDiff_);
scale_->getParameterPtr()->incUpdate(callback);
}
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "DetectionOutputLayer.h"
namespace paddle {
REGISTER_LAYER(detection_output, DetectionOutputLayer);
bool DetectionOutputLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
Layer::init(layerMap, parameterMap);
auto& layerConf = config_.inputs(0).detection_output_conf();
numClasses_ = layerConf.num_classes();
inputNum_ = layerConf.input_num();
nmsThreshold_ = layerConf.nms_threshold();
confidenceThreshold_ = layerConf.confidence_threshold();
nmsTopK_ = layerConf.nms_top_k();
keepTopK_ = layerConf.keep_top_k();
backgroundId_ = layerConf.background_id();
return true;
}
// Forward pass of the detection output layer.
//
// Steps:
//   1. Sum the element counts of all location / confidence inputs.
//   2. Concatenate the inputs into locTmpBuffer_ / confTmpBuffer_ through
//      appendWithPermute(..., kNCHWToNHWC).
//   3. When running on GPU, copy the buffers and the prior boxes to CPU
//      matrices; the remaining steps read the CPU copies.
//   4. Apply softmax to the confidence buffer.
//   5. Decode one bounding box per (sample, prior) pair from the prior box,
//      its variance and the 4 predicted location values.
//   6. Select the detections to keep with getDetectionIndices() and write
//      them into the output, which is resized to numKept x 7.
void DetectionOutputLayer::forward(PassType passType) {
  Layer::forward(passType);
  size_t batchSize = getInputValue(*getLocInputLayer(0))->getHeight();

  // Total number of location / confidence values over all input layers.
  locSizeSum_ = 0;
  confSizeSum_ = 0;
  for (size_t n = 0; n < inputNum_; ++n) {
    const MatrixPtr inLoc = getInputValue(*getLocInputLayer(n));
    const MatrixPtr inConf = getInputValue(*getConfInputLayer(n));
    locSizeSum_ += inLoc->getElementCnt();
    confSizeSum_ += inConf->getElementCnt();
  }

  Matrix::resizeOrCreate(locTmpBuffer_, 1, locSizeSum_, false, useGpu_);
  // One row per (sample, prior) pair, one column per class.
  Matrix::resizeOrCreate(
      confTmpBuffer_, confSizeSum_ / numClasses_, numClasses_, false, useGpu_);

  // Concatenate the permuted inputs into the temporary buffers.
  size_t locOffset = 0;
  size_t confOffset = 0;
  auto& layerConf = config_.inputs(0).detection_output_conf();
  for (size_t n = 0; n < inputNum_; ++n) {
    const MatrixPtr inLoc = getInputValue(*getLocInputLayer(n));
    const MatrixPtr inConf = getInputValue(*getConfInputLayer(n));

    // Fall back to the configured shape when the input carries no frame size.
    size_t height = getInput(*getLocInputLayer(n)).getFrameHeight();
    if (!height) height = layerConf.height();
    size_t width = getInput(*getLocInputLayer(n)).getFrameWidth();
    if (!width) width = layerConf.width();

    locOffset += appendWithPermute(*inLoc,
                                   height,
                                   width,
                                   locSizeSum_,
                                   locOffset,
                                   batchSize,
                                   *locTmpBuffer_,
                                   kNCHWToNHWC);
    confOffset += appendWithPermute(*inConf,
                                    height,
                                    width,
                                    confSizeSum_,
                                    confOffset,
                                    batchSize,
                                    *confTmpBuffer_,
                                    kNCHWToNHWC);
  }
  CHECK_EQ(locOffset, locSizeSum_ / batchSize);
  CHECK_EQ(confOffset, confSizeSum_ / batchSize);

  // The decoding below reads raw data pointers, so on GPU the data is first
  // copied into CPU matrices.
  MatrixPtr priorValue;
  if (useGpu_) {
    Matrix::resizeOrCreate(locCpuBuffer_, 1, locSizeSum_, false, false);
    Matrix::resizeOrCreate(
        confCpuBuffer_, confSizeSum_ / numClasses_, numClasses_, false, false);
    MatrixPtr priorTmpValue = getInputValue(*getPriorBoxLayer());
    Matrix::resizeOrCreate(
        priorCpuValue_, 1, priorTmpValue->getElementCnt(), false, false);
    locCpuBuffer_->copyFrom(*locTmpBuffer_);
    confCpuBuffer_->copyFrom(*confTmpBuffer_);
    priorCpuValue_->copyFrom(*priorTmpValue);

    locBuffer_ = locCpuBuffer_;
    confBuffer_ = confCpuBuffer_;
    priorValue = priorCpuValue_;
  } else {
    priorValue = getInputValue(*getPriorBoxLayer());
    locBuffer_ = locTmpBuffer_;
    confBuffer_ = confTmpBuffer_;
  }
  confBuffer_->softmax(*confBuffer_);

  // Each prior occupies 8 values in the prior matrix.
  size_t numPriors = priorValue->getElementCnt() / 8;

  // The prior boxes and their variances do not depend on the sample, so they
  // are decoded once here instead of once per sample.
  std::vector<NormalizedBBox> priorBBoxes;
  std::vector<std::vector<real>> priorBBoxVars;
  priorBBoxes.reserve(numPriors);
  priorBBoxVars.reserve(numPriors);
  for (size_t i = 0; i < numPriors; ++i) {
    size_t priorOffset = i * 8;
    std::vector<NormalizedBBox> priorBBoxVec;
    getBBoxFromPriorData(
        priorValue->getData() + priorOffset, 1, priorBBoxVec);
    std::vector<std::vector<real>> priorBBoxVar;
    getBBoxVarFromPriorData(
        priorValue->getData() + priorOffset, 1, priorBBoxVar);
    priorBBoxes.push_back(priorBBoxVec[0]);
    priorBBoxVars.push_back(priorBBoxVar[0]);
  }

  // Decode every predicted box; each sample's boxes are built in place so the
  // per-sample vector is not copied afterwards.
  std::vector<std::vector<NormalizedBBox>> allDecodedBBoxes(batchSize);
  for (size_t n = 0; n < batchSize; ++n) {
    std::vector<NormalizedBBox>& decodedBBoxes = allDecodedBBoxes[n];
    decodedBBoxes.reserve(numPriors);
    for (size_t i = 0; i < numPriors; ++i) {
      // 4 location values per prior, stored sample by sample.
      size_t locPredOffset = n * numPriors * 4 + i * 4;
      std::vector<real> locPredData;
      locPredData.reserve(4);
      for (size_t j = 0; j < 4; ++j)
        locPredData.push_back(*(locBuffer_->getData() + locPredOffset + j));
      decodedBBoxes.push_back(
          decodeBBoxWithVar(priorBBoxes[i], priorBBoxVars[i], locPredData));
    }
  }

  // Pick the detections to keep for every sample.
  std::vector<std::map<size_t, std::vector<size_t>>> allIndices;
  size_t numKept = getDetectionIndices(confBuffer_->getData(),
                                       numPriors,
                                       numClasses_,
                                       backgroundId_,
                                       batchSize,
                                       confidenceThreshold_,
                                       nmsTopK_,
                                       nmsThreshold_,
                                       keepTopK_,
                                       allDecodedBBoxes,
                                       &allIndices);

  // One output row of 7 values per kept detection.
  resetOutput(numKept, 7);
  MatrixPtr outV = getOutputValue();
  getDetectionOutput(confBuffer_->getData(),
                     numKept,
                     numPriors,
                     numClasses_,
                     batchSize,
                     allIndices,
                     allDecodedBBoxes,
                     *outV);
}
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <map>
#include <vector>
#include "DetectionUtil.h"
#include "Layer.h"
namespace paddle {
/**
* The detection output layer for a SSD detection task. This layer applies the
* Non-maximum suppression to the all predicted bounding box and keeps the
* Top-K bounding boxes.
* - Input: This layer needs three input layers: The first input layer
* is the priorbox layer. The rest two input layers are convolution
* layers for generating bbox location offset and the classification
* confidence.
* - Output: The predict bounding box locations.
*/
class DetectionOutputLayer : public Layer {
public:
explicit DetectionOutputLayer(const LayerConfig& config) : Layer(config) {}
// Reads the detection_output_conf of the first input into the members below.
bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
// Decodes the predicted boxes and writes the kept detections to the output.
void forward(PassType passType);
// Intentionally empty: this layer propagates no gradient.
void backward(const UpdateCallback& callback = nullptr) {}
protected:
// Input layout: [0] prior box layer, then inputNum_ location layers,
// then inputNum_ confidence layers.
inline LayerPtr getPriorBoxLayer() { return inputLayers_[0]; }
// Returns the index-th location input layer.
inline LayerPtr getLocInputLayer(size_t index) {
return inputLayers_[1 + index];
}
// Returns the index-th confidence input layer.
inline LayerPtr getConfInputLayer(size_t index) {
return inputLayers_[1 + inputNum_ + index];
}
private:
size_t numClasses_; // number of classes
size_t inputNum_; // number of input layers
// The next five values are read from the layer configuration in init() and
// passed to getDetectionIndices() in forward().
real nmsThreshold_;
real confidenceThreshold_;
size_t nmsTopK_;
size_t keepTopK_;
size_t backgroundId_;
// Total element count over all location / confidence inputs; recomputed on
// every forward() call.
size_t locSizeSum_;
size_t confSizeSum_;
// Buffers that forward() actually reads: they alias the *CpuBuffer_ matrices
// when useGpu_ is set and the *TmpBuffer_ matrices otherwise.
MatrixPtr locBuffer_;
MatrixPtr confBuffer_;
// Concatenation of all permuted inputs, created with the useGpu_ flag.
MatrixPtr locTmpBuffer_;
MatrixPtr confTmpBuffer_;
// CPU-side copies of the prior boxes and of the two buffers above; only
// filled when useGpu_ is set.
MatrixPtr priorCpuValue_;
MatrixPtr locCpuBuffer_;
MatrixPtr confCpuBuffer_;
};
} // namespace paddle
......@@ -26,6 +26,10 @@ namespace paddle {
* If SequenceLevel = kNonSeq:
* Output: output size is the number of input sequences (NOT input instances)
* output[i] = max_{for each instance in this sequence}{input[i]}
* If stride_ > 0:
* Output: a shortened sequence. Stride is the step size by which we slide a
* window upon the input sequence, and the max pooling operation is
* then applied to each interval independently.
* If SequenceLevel = kSeq:
* Check input sequence must has sub-sequence
* Output: output size is the number of input sub-sequences
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "MultiBoxLossLayer.h"
#include <float.h>
#include <vector>
#include "DataLayer.h"
namespace paddle {
REGISTER_LAYER(multibox_loss, MultiBoxLossLayer);
// Initializes the layer from the multibox_loss_conf of its first input:
// class count, number of location/confidence input layers, the overlap
// thresholds, the negative/positive ratio and the background class id.
// Returns false if the base-class initialization fails, true otherwise.
bool MultiBoxLossLayer::init(const LayerMap& layerMap,
                             const ParameterMap& parameterMap) {
  // Propagate a base-class failure instead of silently ignoring it.
  if (!Layer::init(layerMap, parameterMap)) return false;
  // Bind by reference: only a handful of scalar fields are read, so there is
  // no reason to copy the whole configuration message.
  const auto& layerConf = config_.inputs(0).multibox_loss_conf();
  numClasses_ = layerConf.num_classes();
  inputNum_ = layerConf.input_num();
  overlapThreshold_ = layerConf.overlap_threshold();
  negPosRatio_ = layerConf.neg_pos_ratio();
  negOverlap_ = layerConf.neg_overlap();
  backgroundId_ = layerConf.background_id();
  return true;
}
// Forward pass of the multibox loss layer.
//
// Steps:
//   1. Concatenate all location / confidence inputs into locBuffer_ /
//      confBuffer_ through appendWithPermute(..., kNCHWToNHWC).
//   2. When running on GPU, copy predictions, priors and labels to CPU
//      matrices; the loss itself is computed from the CPU data.
//   3. Match priors to ground-truth boxes and select negatives with
//      generateMatchIndices().
//   4. Location loss: smooth L1 between the predicted offsets of matched
//      priors and the encoded ground-truth boxes, divided by numMatches_.
//   5. Confidence loss: softmax + one-hot cross entropy over matched priors
//      and selected negatives, divided by numMatches_.
//   6. Assign locLoss_ + confLoss_ to the output matrix (batchSize x 1).
//
// locDiff_, locGTData_, confProb_, confGTData_ and the match/negative index
// tables are kept as members because backward() reads them.
void MultiBoxLossLayer::forward(PassType passType) {
  Layer::forward(passType);
  size_t batchSize = getInputValue(*getLocInputLayer(0))->getHeight();
  resetOutput(batchSize, 1);

  // all location data and confidence score data
  locSizeSum_ = 0;
  confSizeSum_ = 0;
  for (size_t n = 0; n < inputNum_; ++n) {
    const MatrixPtr inLoc = getInputValue(*getLocInputLayer(n));
    const MatrixPtr inConf = getInputValue(*getConfInputLayer(n));
    locSizeSum_ += inLoc->getElementCnt();
    confSizeSum_ += inConf->getElementCnt();
  }

  // locBuffer layout:
  // | xmin1 | ymin1 | xmax1 | ymax1 | xmin2 ......
  Matrix::resizeOrCreate(locTmpBuffer_, 1, locSizeSum_, false, useGpu_);
  locBuffer_ = locTmpBuffer_;

  // confBuffer layout:
  // | class1 score | class2 score | ... |classN score | class1 score | ......
  Matrix::resizeOrCreate(confTmpBuffer_, 1, confSizeSum_, false, useGpu_);
  confBuffer_ = confTmpBuffer_;

  // concatenate location data and confidence score data
  size_t locOffset = 0;
  size_t confOffset = 0;
  auto& layerConf = config_.inputs(0).multibox_loss_conf();
  for (size_t n = 0; n < inputNum_; ++n) {
    const MatrixPtr inLoc = getInputValue(*getLocInputLayer(n));
    const MatrixPtr inConf = getInputValue(*getConfInputLayer(n));

    // Fall back to the configured shape when the input carries no frame size.
    size_t height = getInput(*getLocInputLayer(n)).getFrameHeight();
    if (!height) height = layerConf.height();
    size_t width = getInput(*getLocInputLayer(n)).getFrameWidth();
    if (!width) width = layerConf.width();

    locOffset += appendWithPermute(*inLoc,
                                   height,
                                   width,
                                   locSizeSum_,
                                   locOffset,
                                   batchSize,
                                   *locBuffer_,
                                   kNCHWToNHWC);
    confOffset += appendWithPermute(*inConf,
                                    height,
                                    width,
                                    confSizeSum_,
                                    confOffset,
                                    batchSize,
                                    *confBuffer_,
                                    kNCHWToNHWC);
  }
  CHECK_EQ(locOffset, locSizeSum_ / batchSize);
  CHECK_EQ(confOffset, confSizeSum_ / batchSize);

  // priorValue layout:
  // | xmin1 | ymin1 | xmax1 | ymax1 | xmin1Var | ymin1Var | xmax1Var | ymax1Var
  // | xmin2 | ......
  MatrixPtr priorValue;

  // labelValue layout:
  // | class1_1 | xmin1_1 | ymin1_1 | xmax1_1 | ymax1_1 | difficult1_1 | ......
  MatrixPtr labelValue;

  // Copy data from GPU to CPU if use GPU
  if (useGpu_) {
    Matrix::resizeOrCreate(locCpuBuffer_, 1, locSizeSum_, false, false);
    Matrix::resizeOrCreate(confCpuBuffer_, 1, confSizeSum_, false, false);
    MatrixPtr priorTmpValue = getInputValue(*getPriorBoxLayer());
    Matrix::resizeOrCreate(
        priorCpuValue_, 1, priorTmpValue->getElementCnt(), false, false);
    MatrixPtr labelTmpValue = getInputValue(*getLabelLayer());
    Matrix::resizeOrCreate(labelCpuValue_,
                           labelTmpValue->getHeight(),
                           labelTmpValue->getWidth(),
                           false,
                           false);

    locCpuBuffer_->copyFrom(*locTmpBuffer_);
    confCpuBuffer_->copyFrom(*confTmpBuffer_);
    priorCpuValue_->copyFrom(*priorTmpValue);
    labelCpuValue_->copyFrom(*labelTmpValue);

    locBuffer_ = locCpuBuffer_;
    confBuffer_ = confCpuBuffer_;
    priorValue = priorCpuValue_;
    labelValue = labelCpuValue_;
  } else {
    priorValue = getInputValue(*getPriorBoxLayer());
    labelValue = getInputValue(*getLabelLayer());
  }

  // Get max scores for each prior bbox. Used in negative mining
  std::vector<std::vector<real>> allMaxConfScore;
  numPriors_ = priorValue->getElementCnt() / 8;
  getMaxConfidenceScores(confBuffer_->getData(),
                         batchSize,
                         numPriors_,
                         numClasses_,
                         backgroundId_,
                         &allMaxConfScore);

  // Match prior bbox to groundtruth bbox
  Argument label = getInput(*getLabelLayer());
  const int* labelIndex = label.sequenceStartPositions->getData(false);
  size_t seqNum = label.getNumSequences();
  numMatches_ = 0;
  numNegs_ = 0;
  allMatchIndices_.clear();
  allNegIndices_.clear();

  std::pair<size_t, size_t> retPair = generateMatchIndices(*priorValue,
                                                           numPriors_,
                                                           *labelValue,
                                                           labelIndex,
                                                           seqNum,
                                                           allMaxConfScore,
                                                           batchSize,
                                                           overlapThreshold_,
                                                           negOverlap_,
                                                           negPosRatio_,
                                                           &allMatchIndices_,
                                                           &allNegIndices_);
  numMatches_ = retPair.first;
  numNegs_ = retPair.second;

  // BBox location L1 smooth loss
  locLoss_ = 0.0;
  if (numMatches_ >= 1) {
    size_t count = 0;
    MatrixPtr locLossOutput;
    Matrix::resizeOrCreate(locLossOutput, numMatches_ * 4, 1, false, false);
    Matrix::resizeOrCreate(locGTData_, numMatches_ * 4, 1, false, false);
    Matrix::resizeOrCreate(locDiff_, numMatches_ * 4, 1, false, false);
    locDiff_->zeroMem();
    // 4 encoded ground-truth values are appended per matched prior.
    std::vector<real> locGTData;
    locGTData.reserve(numMatches_ * 4);
    real* locDiffData = locDiff_->getData();
    const real* locBufferData = locBuffer_->getData();
    for (size_t n = 0; n < batchSize; ++n) {
      for (size_t i = 0; i < numPriors_; ++i) {
        if (allMatchIndices_[n][i] == -1) continue;  // match none
        // Gather the 4 predicted offsets of this matched prior.
        size_t locPredOffset =
            n * (locBuffer_->getElementCnt() / batchSize) + i * 4;
        std::copy(locBufferData + locPredOffset,
                  locBufferData + locPredOffset + 4,
                  locDiffData + count);
        count += 4;

        // Encode the matched ground-truth box relative to the prior box.
        const int gtIdx = allMatchIndices_[n][i];
        size_t priorOffset = i * 8;
        std::vector<NormalizedBBox> priorBBoxVec;
        getBBoxFromPriorData(
            priorValue->getData() + priorOffset, 1, priorBBoxVec);
        std::vector<std::vector<real>> priorBBoxVar;
        getBBoxVarFromPriorData(
            priorValue->getData() + priorOffset, 1, priorBBoxVar);
        // Each label row holds 6 values (see the labelValue layout above).
        size_t labelOffset = (labelIndex[n] + gtIdx) * 6;
        std::vector<NormalizedBBox> gtBBoxVec;
        getBBoxFromLabelData(labelValue->getData() + labelOffset, 1, gtBBoxVec);
        std::vector<real> gtEncode;
        encodeBBoxWithVar(
            priorBBoxVec[0], priorBBoxVar[0], gtBBoxVec[0], gtEncode);
        locGTData.insert(locGTData.end(), gtEncode.begin(), gtEncode.end());
      }
    }
    locGTData_->copyFrom(&locGTData[0], numMatches_ * 4);
    locLossOutput->smoothL1(*locDiff_, *locGTData_, 0.0);
    locLoss_ = locLossOutput->getSum() / numMatches_;
  }

  // BBox confidence softmax loss
  confLoss_ = 0;
  numConf_ = numMatches_ + numNegs_;
  if (numConf_ >= 1) {
    Matrix::resizeOrCreate(confProb_, numConf_, numClasses_, false, false);
    IVector::resizeOrCreate(confGTData_, numConf_, false);
    confProb_->zeroMem();
    size_t count = 0;

    real* confProbData = confProb_->getData();
    const real* confBufferData = confBuffer_->getData();
    for (size_t n = 0; n < batchSize; ++n) {
      // Matched (positive) priors: the target is the ground-truth class.
      for (size_t i = 0; i < numPriors_; ++i) {
        if (allMatchIndices_[n][i] == -1) continue;
        size_t labelOffset = (labelIndex[n] + allMatchIndices_[n][i]) * 6;
        const int gtLabel = (labelValue->getData() + labelOffset)[0];
        confGTData_->getData()[count] = gtLabel;
        size_t confPredOffset = n * numPriors_ * numClasses_ + i * numClasses_;
        std::copy(confBufferData + confPredOffset,
                  confBufferData + confPredOffset + numClasses_,
                  confProbData + count * numClasses_);
        ++count;
      }
      // Negative mining samples
      for (size_t i = 0; i < allNegIndices_[n].size(); ++i) {
        confGTData_->getData()[count] = backgroundId_;
        size_t confPredOffset =
            n * numPriors_ * numClasses_ + allNegIndices_[n][i] * numClasses_;
        std::copy(confBufferData + confPredOffset,
                  confBufferData + confPredOffset + numClasses_,
                  confProbData + count * numClasses_);
        ++count;
      }
    }
    CHECK_EQ(numConf_, count);
    confProb_->softmax(*confProb_);
    MatrixPtr confLossOutput;
    Matrix::resizeOrCreate(confLossOutput, numConf_, 1, false, false);
    confLossOutput->oneHotCrossEntropy(*confProb_, *confGTData_);
    // NOTE(review): this divides by numMatches_, which is zero if negatives
    // were selected without any match; confirm generateMatchIndices() never
    // returns negatives in that case.
    confLoss_ = confLossOutput->getSum() / numMatches_;
  }
  real loss = locLoss_ + confLoss_;
  MatrixPtr outV = getOutputValue();
  outV->assign(loss);
}
void MultiBoxLossLayer::backward(const UpdateCallback& callback) {
size_t batchSize = getInputValue(*getLocInputLayer(0))->getHeight();
locBuffer_->zeroMem();
confBuffer_->zeroMem();
// Back propagate on location prediction
if (numMatches_ >= 1) {
MatrixPtr locDiffBuffer;
Matrix::resizeOrCreate(locDiffBuffer, numMatches_ * 4, 1, false, false);
locDiffBuffer->smoothL1Bp(*locDiff_, *locGTData_, 0.0);
locDiff_->copyFrom(*locDiffBuffer);
// scale gradient
for (size_t i = 0; i < numMatches_ * 4; ++i)
locDiff_->getData()[i] *= (1. / numMatches_);
// Copy gradient back
size_t count = 0;
const real* locDiffData = locDiff_->getData();
for (size_t n = 0; n < batchSize; ++n) {
for (size_t i = 0; i < numPriors_; ++i) {
if (allMatchIndices_[n][i] == -1) continue;
real* locBufferData =
locBuffer_->getData() + n * numPriors_ * 4 + i * 4;
std::copy(locDiffData + count * 4,
locDiffData + (count + 1) * 4,
locBufferData);
++count;
}
}
CHECK_EQ(count, numMatches_);
}
if (numConf_ >= 1) {
for (size_t i = 0; i < numConf_; ++i)
confProb_->getData()[i * numClasses_ + confGTData_->getData()[i]] -= 1;
for (size_t i = 0; i < numConf_ * numClasses_; ++i)
confProb_->getData()[i] *= (1. / numMatches_);
size_t count = 0;
const real* confProbData = confProb_->getData();
for (size_t n = 0; n < batchSize; ++n) {
for (size_t i = 0; i < numPriors_; ++i) {
if (allMatchIndices_[n][i] == -1) continue;
real* confDiffData = confBuffer_->getData() +
n * numPriors_ * numClasses_ + i * numClasses_;
std::copy(confProbData + count * numClasses_,
confProbData + (count + 1) * numClasses_,
confDiffData);
++count;
}
for (size_t i = 0; i < allNegIndices_[n].size(); ++i) {
int idx = allNegIndices_[n][i];
real* confDiffData = confBuffer_->getData() +
n * numPriors_ * numClasses_ + idx * numClasses_;
std::copy(confProbData + count * numClasses_,
confProbData + (count + 1) * numClasses_,
confDiffData);
++count;
}
}
CHECK_EQ(count, numConf_);
}
if (useGpu_) {
locTmpBuffer_->copyFrom(*locCpuBuffer_);
confTmpBuffer_->copyFrom(*confCpuBuffer_);
locBuffer_ = locTmpBuffer_;
confBuffer_ = confTmpBuffer_;
}
// copy back
size_t locOffset = 0;
size_t confOffset = 0;
auto layerConf = config_.inputs(0).multibox_loss_conf();
for (size_t n = 0; n < inputNum_; ++n) {
const MatrixPtr inLocG = getInputGrad(*getLocInputLayer(n));
const MatrixPtr inConfG = getInputGrad(*getConfInputLayer(n));
size_t height = getInput(*getLocInputLayer(n)).getFrameHeight();
// only for unittest, there are no width and height information
// when constructing matrix in unittest, so we should
// set the shape in configuration
if (!height) height = layerConf.height();
size_t width = getInput(*getLocInputLayer(n)).getFrameWidth();
if (!width) width = layerConf.width();
// NHWC to NCHW
MatrixPtr locGBuffer;
Matrix::resizeOrCreate(
locGBuffer, inLocG->getHeight(), inLocG->getWidth(), false, useGpu_);
MatrixPtr confGBuffer;
Matrix::resizeOrCreate(
confGBuffer, inConfG->getHeight(), inConfG->getWidth(), false, useGpu_);
locOffset += decomposeWithPermute(*locBuffer_,
height,
width,
locSizeSum_,
locOffset,
batchSize,
*locGBuffer,
kNHWCToNCHW);
inLocG->add(*locGBuffer);
confOffset += decomposeWithPermute(*confBuffer_,
height,
width,
confSizeSum_,
confOffset,
batchSize,
*confGBuffer,
kNHWCToNCHW);
inConfG->add(*confGBuffer);
}
CHECK_EQ(locOffset, locSizeSum_ / batchSize);
CHECK_EQ(confOffset, confSizeSum_ / batchSize);
}
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "CostLayer.h"
#include "DataLayer.h"
#include "DetectionUtil.h"
#include "Layer.h"
using std::vector;
using std::pair;
namespace paddle {
/**
* The multibox loss layer for a SSD detection task.
* The loss is composed by the location loss and the confidence loss.
* The location loss is a smooth L1 loss and the confidence loss is
* a softmax loss.
* - Input: This layer needs four input layers: The first input layer
* is the priorbox layer and the second layer is a label layer.
* The rest two input layers are convolution layers for generating
* bbox location offset and the classification confidence.
* - Output: The Single Shot Multibox Detection loss value.
* Reference:
* Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed,
* Cheng-Yang Fu, Alexander C. Berg. SSD: Single Shot MultiBox Detector
*/
class MultiBoxLossLayer : public CostLayer {
public:
explicit MultiBoxLossLayer(const LayerConfig& config) : CostLayer(config) {}
// Reads the multibox_loss_conf of the first input into the members below.
bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
// Computes the location and confidence losses and assigns their sum to the
// output.
void forward(PassType passType);
// Propagates the loss gradient to the location and confidence inputs.
void backward(const UpdateCallback& callback = nullptr);
// Both hooks are intentionally empty; forward() and backward() above carry
// the whole computation.
void forwardImp(Matrix& output, Argument& label, Matrix& cost) {}
void backwardImp(Matrix& outputValue, Argument& label, Matrix& outputGrad) {}
protected:
// Input layout: [0] prior box layer, [1] label layer, then inputNum_
// location layers, then inputNum_ confidence layers.
inline LayerPtr getPriorBoxLayer() { return inputLayers_[0]; }
inline LayerPtr getLabelLayer() { return inputLayers_[1]; }
// Returns the index-th location input layer.
inline LayerPtr getLocInputLayer(size_t index) {
return inputLayers_[2 + index];
}
// Returns the index-th confidence input layer.
inline LayerPtr getConfInputLayer(size_t index) {
return inputLayers_[2 + inputNum_ + index];
}
protected:
// Values read from the layer configuration in init().
size_t numClasses_;
real overlapThreshold_;
real negPosRatio_;
real negOverlap_;
size_t inputNum_;
size_t backgroundId_;
// Losses computed by the last forward() call.
real locLoss_;
real confLoss_;
// Counters set by forward() and reused by backward().
// numPriors_ is the prior element count divided by 8.
size_t numPriors_;
// Number of matched priors and of selected negatives, as returned by
// generateMatchIndices().
size_t numMatches_;
size_t numNegs_;
// numMatches_ + numNegs_.
size_t numConf_;
// Total element count over all location / confidence inputs.
size_t locSizeSum_;
size_t confSizeSum_;
// Indexed as [sample][prior]; an entry of -1 marks a prior with no match,
// any other value is the index of the matched ground-truth box within the
// sample's label rows.
vector<vector<int>> allMatchIndices_;
// Indexed as [sample]; each entry lists the prior indices used as negatives.
vector<vector<int>> allNegIndices_;
// Encoded ground-truth box values for the matched priors (numMatches_ * 4).
MatrixPtr locGTData_;
// Target class per confidence row: the ground-truth label for matched
// priors, backgroundId_ for negatives.
IVectorPtr confGTData_;
// Working buffers: they alias the *CpuBuffer_ matrices after a forward() on
// GPU and the *TmpBuffer_ matrices otherwise; backward() reuses them for
// gradients.
MatrixPtr locBuffer_;
MatrixPtr confBuffer_;
// Predicted location values of the matched priors (numMatches_ * 4).
MatrixPtr locDiff_;
// Confidence scores of matched and negative priors (numConf_ x numClasses_);
// holds softmax probabilities after forward().
MatrixPtr confProb_;
// CPU-side copies of labels, priors and predictions; only filled when
// useGpu_ is set.
MatrixPtr labelCpuValue_;
MatrixPtr priorCpuValue_;
MatrixPtr locCpuBuffer_;
MatrixPtr confCpuBuffer_;
// Concatenation of all permuted inputs, created with the useGpu_ flag.
MatrixPtr locTmpBuffer_;
MatrixPtr confTmpBuffer_;
};
} // namespace paddle
......@@ -56,14 +56,4 @@ bool ResponseNormLayer::init(const LayerMap& layerMap,
return true;
}
// Initializes the layer from its configuration: reads the channel count from
// the first input's norm_conf and wraps the first parameter as a
// channels_ x 1 scale weight.
// Returns false if the base-class initialization fails, true otherwise.
bool CrossChannelNormLayer::init(const LayerMap& layerMap,
                                 const ParameterMap& parameterMap) {
  // Propagate a base-class failure instead of silently ignoring it.
  if (!Layer::init(layerMap, parameterMap)) return false;
  // The scale weight below is built on top of parameters_[0].
  CHECK(parameters_[0]);
  const NormConfig& conf = config_.inputs(0).norm_conf();
  channels_ = conf.channels();
  scale_.reset(new Weight(channels_, 1, parameters_[0]));
  return true;
}
} // namespace paddle
......@@ -26,10 +26,9 @@ namespace paddle {
* If SequenceLevel = kNonseq:
* Output: a sequence containing only the last instance of the input sequence
* If stride_ > 0:
* Output: a shorten sequence. The operation of getting last instance of a
* sequence is independently performed on every slice of the input
* sequence, which is obtained by sliding a window with the window
* size set to stride_.
* Output: a shortened sequence. Stride is the step size by which we slide a
* window upon the input sequence, and the get-last-instance
* operation is then applied to each interval independently.
* If SequenceLevel = kSeq:
* Check input sequence must has sub-sequence
* Output: a sequence containing only the last instance of each sub-sequence
......@@ -73,8 +72,7 @@ bool SequenceLastInstanceLayer::init(const LayerMap& layerMap,
void SequenceLastInstanceLayer::forward(PassType passType) {
SequencePoolLayer::forward(passType);
auto starts = (stride_ > 0) ? stridePositions_->getData()
: startPositions_->getData(false);
auto starts = startPositions_->getData(false);
MatrixPtr inputValue = getInputValue(0);
MatrixPtr outputValue = getOutputValue();
......
......@@ -72,9 +72,8 @@ void SequencePoolLayer::forward(PassType passType) {
if (stride_ > 0) {
CHECK_EQ(input.hasSubseq(), 0UL)
<< "sequence stride pooling is invalid for hasSubseq now";
output_.poolSequenceWithStride(
input, stride_, &stridePositions_, reversed_);
newBatchSize_ = stridePositions_->getSize() - 1;
output_.poolSequenceWithStride(input, stride_, &startPositions_, reversed_);
newBatchSize_ = startPositions_->getSize() - 1;
}
resetOutput(newBatchSize_, dim);
......
......@@ -28,8 +28,9 @@ namespace paddle {
* sequence}{input[i]}
* If stride_ > 0:
* Check input sequence must not have sub-sequence
* Output: a shorten sequence, pooling is performed upon a small local
* area
* Output: a shortened sequence. Stride is the step size by which we slide
* a window over the input sequence, and the pooling operation
* is then applied to each interval independently.
* If SequenceLevel = kSeq:
* Check that the input sequence has sub-sequences
* Output: output size is the number of input sub-sequences
......@@ -47,8 +48,6 @@ protected:
size_t newBatchSize_;
ICpuGpuVectorPtr startPositions_;
int stride_;
// Store the start position of each window.
IVectorPtr stridePositions_;
// Whether the input sequence is reversed or not.
bool reversed_ = false;
......
......@@ -45,6 +45,13 @@ add_unittest_without_exec(test_PriorBox
add_test(NAME test_PriorBox
COMMAND test_PriorBox)
################# test_DetectionOutput #######################
# Compile the detection-output layer test together with the shared
# LayerGradUtil helpers, then register the binary as a CTest test.
# (add_unittest_without_exec is a project macro; presumably it only creates
# the test target without registering it -- hence the explicit add_test.)
add_unittest_without_exec(test_DetectionOutput test_DetectionOutput.cpp LayerGradUtil.cpp)
add_test(NAME test_DetectionOutput COMMAND test_DetectionOutput)
################# test_ConvUnify #######################
add_unittest_without_exec(test_ConvUnify
test_ConvUnify.cpp
......
......@@ -241,7 +241,7 @@ void testBatchState(LayerPtr testLayer,
std::vector<Argument> args;
args.push_back(out);
EXPECT_EQ(0, Argument::sum(args)) << "testBatchState failed";
ASSERT_NEAR(0, Argument::sum(args), 1e-5) << "testBatchState failed";
for (size_t seqId = 0; seqId < numSequences; ++seqId) {
start[seqId] += seqLens[seqId];
}
......@@ -387,6 +387,31 @@ void initDataLayer(TestConfig testConf,
data.value->sigmoid(*data.value);
data.grad->zeroMem();
break;
case INPUT_SELF_DEFINE_DATA: {
size_t height = testConf.inputDefs[i].selfDefinedData->getHeight();
size_t width = testConf.inputDefs[i].selfDefinedData->getWidth();
CHECK_GT(static_cast<int>(height), 0);
CHECK_GT(static_cast<int>(width), 0);
data.value = Matrix::create(height, width, false, useGpu);
data.grad = Matrix::create(height, width, false, useGpu);
data.value->copyFrom(*testConf.inputDefs[i].selfDefinedData);
data.grad->zeroMem();
const std::vector<int>& labelSeqStartPositions =
testConf.inputDefs[i].labelSeqStartPositions;
if (labelSeqStartPositions.size() != 0) {
CHECK(!sequenceStartPositions);
CHECK_GE(static_cast<int>(labelSeqStartPositions.size()), 2);
sequenceStartPositions =
ICpuGpuVector::create(labelSeqStartPositions.size(), useGpu);
sequenceStartPositions->copyFrom(labelSeqStartPositions.data(),
labelSeqStartPositions.size(),
useGpu);
data.sequenceStartPositions = sequenceStartPositions;
}
break;
}
default:
LOG(FATAL) << " unknown inputType ";
return;
......@@ -440,7 +465,6 @@ void initTestLayer(TestConfig testConf,
ParameterConfig paraConfig) {
paraConfig.set_name(paraName);
paraConfig.set_size(paraSize);
paraConfig.set_initial_std(1);
paraConfig.set_is_static(isStatic);
auto para =
std::make_shared<Parameter>(paraConfig, FLAGS_use_gpu, initialize);
......@@ -474,6 +498,9 @@ void initTestLayer(TestConfig testConf,
paraConfig.add_dims((*layerMap)[input.input_layer_name()]->getSize());
paraConfig.add_dims(testConf.layerConfig.size());
}
CHECK_GE(testConf.paramInitialStd, 0);
paraConfig.set_initial_mean(testConf.paramInitialMean);
paraConfig.set_initial_std(testConf.paramInitialStd);
initParameter(paraName, paraSize, inputDef.isStatic, false, paraConfig);
}
}
......
......@@ -31,7 +31,8 @@ enum InputType {
INPUT_SEQUENCE_LABEL,
INPUT_SPARSE_NON_VALUE_DATA,
INPUT_SPARSE_FLOAT_VALUE_DATA,
INPUT_DENSE_DIM_DATA, // using sequence length to init dense data
INPUT_DENSE_DIM_DATA, // using sequence length to init dense data
INPUT_SELF_DEFINE_DATA, // support customizing for input value
};
struct ParaSparse {
......@@ -66,6 +67,7 @@ struct InputDef {
bool isStatic;
std::vector<int> labelInitValue;
std::vector<int> labelSeqStartPositions;
MatrixPtr selfDefinedData;
InputDef(InputType type, string nameIn, size_t dimIn, size_t sizeIn) {
inputType = type;
......@@ -76,6 +78,20 @@ struct InputDef {
isStatic = false;
}
// Build an input definition whose value matrix is supplied by the caller
// (consumed by the INPUT_SELF_DEFINE_DATA branch of initDataLayer).
//   type                    - input type tag stored in inputType
//   nameIn                  - name of the input
//   selfDefinedData         - matrix copied into the data layer's value
//   selfDefinedSeqStartPos  - optional sequence start positions; stored in
//                             labelSeqStartPositions (empty by default)
// No dimension, sparse description or parameter is attached to this input.
InputDef(InputType type,
         string nameIn,
         MatrixPtr selfDefinedData,
         std::vector<int> selfDefinedSeqStartPos = {})
    : labelSeqStartPositions(selfDefinedSeqStartPos),
      selfDefinedData(selfDefinedData) {
  dim = 0;
  paraSize = 0;
  isStatic = false;
  sparse = {""};
  name = nameIn;
  inputType = type;
}
InputDef(InputType type,
string nameIn,
size_t dimIn,
......@@ -109,12 +125,16 @@ struct TestConfig {
LayerConfig layerConfig;
std::vector<InputDef> inputDefs;
size_t biasSize;
real paramInitialMean;
real paramInitialStd;
bool testAccumulate;
bool testState;
bool staticBias;
bool testBatchState;
TestConfig()
: biasSize(0),
paramInitialMean(0.0),
paramInitialStd(1.0),
testAccumulate(true),
testState(false),
staticBias(false),
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include <string>
#include <vector>
#include "LayerGradUtil.h"
#include "paddle/testing/TestUtil.h"
using namespace paddle; // NOLINT
using namespace std; // NOLINT
// Run one forward pass of a "detection_output" layer fed with the given
// location, confidence and prior-box matrices, and compare the layer output
// against the expected `result` matrix.
//   feature_map_width / feature_map_height - written to the layer config
//   nms_threshold                          - NMS threshold of the layer
//   use_gpu                                - device used for the data layers
//                                            and the layer under test
void doOneDetectionOutputTest(MatrixPtr& inputLoc,
                              MatrixPtr& inputConf,
                              MatrixPtr& inputPriorBox,
                              size_t feature_map_width,
                              size_t feature_map_height,
                              real nms_threshold,
                              bool use_gpu,
                              MatrixPtr& result) {
  // Layer configuration: three inputs, the first of which carries the
  // detection-output settings.
  TestConfig testConfig;
  testConfig.layerConfig.set_type("detection_output");
  LayerInputConfig* firstInput = testConfig.layerConfig.add_inputs();
  testConfig.layerConfig.add_inputs();
  testConfig.layerConfig.add_inputs();

  DetectionOutputConfig* detConf = firstInput->mutable_detection_output_conf();
  detConf->set_width(feature_map_width);
  detConf->set_height(feature_map_height);
  detConf->set_nms_threshold(nms_threshold);
  detConf->set_num_classes(2);
  detConf->set_nms_top_k(20);
  detConf->set_keep_top_k(10);
  detConf->set_background_id(0);
  detConf->set_confidence_threshold(0.01);
  detConf->set_input_num(1);

  // Input definitions, in the order: prior box, location, confidence.
  testConfig.inputDefs.push_back({INPUT_DATA_TARGET, "priorbox", 32, 0});
  testConfig.inputDefs.push_back({INPUT_DATA, "input_loc", 16, 0});
  testConfig.inputDefs.push_back({INPUT_DATA, "input_conf", 8, 0});

  // Create the data layers, then overwrite their generated values with the
  // caller-provided matrices.
  std::vector<DataLayerPtr> dataLayers;
  LayerMap layerMap;
  vector<Argument> datas;
  initDataLayer(testConfig,
                &dataLayers,
                &datas,
                &layerMap,
                "priorbox",
                1,
                false,
                use_gpu);
  dataLayers[0]->getOutputValue()->copyFrom(*inputPriorBox);
  dataLayers[1]->getOutputValue()->copyFrom(*inputLoc);
  dataLayers[2]->getOutputValue()->copyFrom(*inputConf);

  // Temporarily switch the global FLAGS_use_gpu while the layer under test
  // is created, then put the previous value back.
  const bool savedUseGpu = FLAGS_use_gpu;
  FLAGS_use_gpu = use_gpu;
  std::vector<ParameterPtr> parameters;
  LayerPtr detectionOutputLayer;
  initTestLayer(testConfig, &layerMap, &parameters, &detectionOutputLayer);
  FLAGS_use_gpu = savedUseGpu;

  detectionOutputLayer->forward(PASS_GC);
  checkMatrixEqual(detectionOutputLayer->getOutputValue(), result);
}
// Forward-only checks of the detection-output layer on a 1x1 feature map.
// The same inputs are evaluated with two NMS thresholds: 0.01 (one expected
// output row) and 0.2 (two expected output rows). The checks run on CPU and,
// unless PADDLE_ONLY_CPU is defined, are repeated on GPU.
TEST(Layer, detectionOutputLayerFwd) {
  // Shared input data.
  real inputLocData[] = {0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1,
                         0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1};
  real inputConfData[] = {0.1, 0.9, 0.2, 0.8, 0.3, 0.7, 0.4, 0.6};
  real inputPriorBoxData[] = {0.1, 0.1, 0.5, 0.5, 0.1, 0.1, 0.2, 0.2,
                              0.2, 0.2, 0.6, 0.6, 0.1, 0.1, 0.2, 0.2,
                              0.3, 0.3, 0.7, 0.7, 0.1, 0.1, 0.2, 0.2,
                              0.4, 0.4, 0.8, 0.8, 0.1, 0.1, 0.2, 0.2};

  // Expected output for nms threshold 0.01 (one row of seven values).
  real resultData[] = {
      0, 1, 0.68997443, 0.099959746, 0.099959746, 0.50804031, 0.50804031};
  // Expected output for nms threshold 0.2 (two rows of seven values).
  real resultData2[] = {
      0, 1, 0.68997443, 0.099959746, 0.099959746, 0.50804031, 0.50804031,
      0, 1, 0.59868765, 0.29995975,  0.29995975,  0.70804024, 0.70804024};

  // CPU case 1.
  bool useGpu = false;
  real nmsThreshold = 0.01;
  MatrixPtr inputLoc = Matrix::create(1, 16, false, useGpu);
  MatrixPtr inputConf = Matrix::create(1, 8, false, useGpu);
  MatrixPtr inputPriorBox = Matrix::create(1, 32, false, useGpu);
  MatrixPtr result = Matrix::create(1, 7, false, useGpu);
  inputLoc->setData(inputLocData);
  inputConf->setData(inputConfData);
  inputPriorBox->setData(inputPriorBoxData);
  result->setData(resultData);
  doOneDetectionOutputTest(inputLoc,
                           inputConf,
                           inputPriorBox,
                           /* feature_map_width */ 1,
                           /* feature_map_height */ 1,
                           nmsThreshold,
                           useGpu,
                           result);

  // CPU case 2.
  nmsThreshold = 0.2;
  MatrixPtr result2 = Matrix::create(2, 7, false, useGpu);
  result2->setData(resultData2);
  doOneDetectionOutputTest(inputLoc,
                           inputConf,
                           inputPriorBox,
                           /* feature_map_width */ 1,
                           /* feature_map_height */ 1,
                           nmsThreshold,
                           useGpu,
                           result2);

#ifndef PADDLE_ONLY_CPU
  // GPU case 1.
  useGpu = true;
  inputLoc = Matrix::create(1, 16, false, useGpu);
  inputConf = Matrix::create(1, 8, false, useGpu);
  inputPriorBox = Matrix::create(1, 32, false, useGpu);
  inputLoc->copyFrom(inputLocData, 16);
  inputConf->copyFrom(inputConfData, 8);
  inputPriorBox->copyFrom(inputPriorBoxData, 32);

  nmsThreshold = 0.01;
  MatrixPtr result3 = Matrix::create(1, 7, false, useGpu);
  result3->copyFrom(resultData, 7);
  doOneDetectionOutputTest(inputLoc,
                           inputConf,
                           inputPriorBox,
                           /* feature_map_width */ 1,
                           /* feature_map_height */ 1,
                           nmsThreshold,
                           useGpu,
                           result3);

  // GPU case 2.
  nmsThreshold = 0.2;
  MatrixPtr result4 = Matrix::create(2, 7, false, useGpu);
  result4->copyFrom(resultData2, 14);
  doOneDetectionOutputTest(inputLoc,
                           inputConf,
                           inputPriorBox,
                           /* feature_map_width */ 1,
                           /* feature_map_height */ 1,
                           nmsThreshold,
                           useGpu,
                           result4);
#endif
}
// Test entry point: initialize GoogleTest from the command line, run the
// project's initMain on the arguments, then execute every registered test
// and return its status as the process exit code.
int main(int argc, char** argv) {
  testing::InitGoogleTest(&argc, argv);
  initMain(argc, argv);
  const int exitCode = RUN_ALL_TESTS();
  return exitCode;
}
......@@ -845,8 +845,12 @@ void testDegradeLayer(bool hasSubseq,
// Runs testDegradeLayer with the "max" pooling type for each combination of
// input kind (plain sequence / sequence with sub-sequences), target level
// ("non-seq" / "seq") and stride listed below; a stride of -1 is used for
// the non-strided cases.
// NOTE(review): the two hasSubseq calls appear twice in this listing, which
// looks like old and new diff lines shown together -- confirm before
// de-duplicating.
TEST(Layer, MaxLayer) {
testDegradeLayer(false, "max", "non-seq", -1); // seq max to non-seq
testDegradeLayer(true, "max", "non-seq", -1); // hasSubseq max to non-seq
testDegradeLayer(true, "max", "seq", -1); // hasSubseq max to seq
testDegradeLayer(false,
"max",
"non-seq",
5); // seq max to a shortened seq, stride window = 5
testDegradeLayer(true, "max", "non-seq", -1); // hasSubseq max to non-seq
testDegradeLayer(true, "max", "seq", -1); // hasSubseq max to seq
}
TEST(Layer, SequenceLastInstanceLayer) {
......@@ -868,6 +872,10 @@ TEST(Layer, SequenceLastInstanceLayer) {
TEST(Layer, AverageLayer) {
testDegradeLayer(false, "average", "non-seq", -1); // seq average to non-seq
testDegradeLayer(false,
"average",
"non-seq",
5); // seq average to a shorten seq, stride window = 5
testDegradeLayer(
true, "average", "non-seq", -1); // hasSubseq average to non-seq
testDegradeLayer(true, "average", "seq", -1); // hasSubseq average to seq
......@@ -1661,6 +1669,8 @@ TEST(Layer, PadLayer) {
TEST(Layer, CrossChannelNormLayer) {
TestConfig config;
config.paramInitialMean = 1.;
config.paramInitialStd = 0.;
config.layerConfig.set_type("norm");
config.layerConfig.set_size(100);
LayerInputConfig* input = config.layerConfig.add_inputs();
......@@ -1674,7 +1684,7 @@ TEST(Layer, CrossChannelNormLayer) {
config.inputDefs.push_back({INPUT_DATA, "layer_0", 100, 10});
for (auto useGpu : {false, true}) {
testLayerGrad(config, "cross-channel-norm", 10, false, useGpu, false, 5);
testLayerGrad(config, "cross-channel-norm", 10, false, useGpu, false);
}
}
......@@ -1692,6 +1702,70 @@ TEST(Layer, smooth_l1) {
}
}
// Gradient check of the "multibox_loss" layer with a single ground-truth box.
// The prior-box and label inputs are supplied as self-defined matrices so the
// test can force at least one prior box to match the ground truth; the
// location and confidence predictions are ordinary generated inputs.
TEST(Layer, multibox_loss) {
  TestConfig config;
  config.layerConfig.set_type("multibox_loss");
  config.biasSize = 0;

  LayerInputConfig* firstInput = config.layerConfig.add_inputs();
  MultiBoxLossConfig* lossConf = firstInput->mutable_multibox_loss_conf();
  lossConf->set_num_classes(21);
  lossConf->set_input_num(1);
  lossConf->set_overlap_threshold(0.5);
  lossConf->set_neg_pos_ratio(3);
  lossConf->set_neg_overlap(0.5);
  lossConf->set_background_id(0);
  lossConf->set_height(3);
  lossConf->set_width(3);

  // Label matrix: one row of six values per ground-truth box. The values
  // start as sigmoid(uniform - 0.5); the first five columns of every row are
  // then overwritten with a random class id in [1, 20] and fixed numbers.
  const size_t gtNum = 1;
  MatrixPtr labelValue = Matrix::create(gtNum, 6, false, false);
  labelValue->randomizeUniform();
  labelValue->add(-0.5);
  labelValue->sigmoid(*labelValue);
  real* labelData = labelValue->getData();
  const size_t labelWidth = labelValue->getWidth();
  for (size_t row = 0; row < gtNum; ++row) {
    real* box = labelData + row * labelWidth;
    box[0] = std::rand() % 20 + 1;
    box[1] = 0.400259;
    box[2] = 0.377857;
    box[3] = 0.525712;
    box[4] = 0.519368;
  }

  // Sequence start positions 0, 1, ..., gtNum.
  vector<int> seqStartPositions;
  for (size_t pos = 0; pos <= gtNum; ++pos) {
    seqStartPositions.push_back(static_cast<int>(pos));
  }

  // Ensure at least one matched bbox: the first eight prior values are
  // replaced with fixed numbers, the rest stay sigmoid(uniform - 0.5).
  MatrixPtr priorValue = Matrix::create(1, 72, false, false);
  priorValue->randomizeUniform();
  priorValue->add(-0.5);
  priorValue->sigmoid(*priorValue);
  real* priorData = priorValue->getData();
  const real fixedPriors[] = {0.424811, 0.397059, 0.538905, 0.447091,
                              0.425720, 0.515228, 0.519452, 0.591065};
  for (size_t k = 0; k < 8; ++k) {
    priorData[k] = fixedPriors[k];
  }

  config.inputDefs.push_back(
      {INPUT_SELF_DEFINE_DATA, "priorbox", priorValue, {}});
  config.inputDefs.push_back(
      {INPUT_SELF_DEFINE_DATA, "label", labelValue, seqStartPositions});
  config.inputDefs.push_back({INPUT_DATA, "locPred", 36, 0});
  config.inputDefs.push_back({INPUT_DATA, "confPred", 189, 0});

  // Three more layer inputs, for a total of four matching the input defs.
  config.layerConfig.add_inputs();
  config.layerConfig.add_inputs();
  config.layerConfig.add_inputs();

  for (auto useGpu : {false, true}) {
    testLayerGrad(config, "multibox_loss", 1, false, useGpu, false);
  }
}
TEST(Layer, TransLayer) {
TestConfig config;
const int height = 128;
......
......@@ -27,22 +27,24 @@ void AdadeltaOptimizer::Update(const Tensor* gradient) {
// Serialize the Adadelta optimizer state (sample counter, learning-rate
// policy state, parameter and the three accumulator tensors) into an
// AdadeltaOptimizerState message; the byte count is reported via state_len.
//
// NOTE(review): `str` is a local std::string, so the pointer returned by
// str.c_str() refers to storage that is destroyed when this function
// returns -- the caller receives a dangling pointer.
// NOTE(review): both the `=` and the `+=` assignment to *state_len appear in
// this listing (looks like old and new diff lines shown together). With only
// `+=`, the length written by lr_policy_->SerializeState is added on top of
// the full message size -- confirm which one is intended.
const char* AdadeltaOptimizer::SerializeState(int* state_len) {
AdadeltaOptimizerState state;
// TODO(zhihong) : add lr_policy serialization
state.set_num_sample_passed(num_sample_passed_);
// NOTE(review): building a std::string from the returned const char* stops
// at the first '\0' byte and ignores the length stored in *state_len, so a
// serialized policy state containing zero bytes would be truncated.
std::string lr_str = this->lr_policy_->SerializeState(state_len);
state.mutable_lr_state()->ParseFromString(lr_str);
TensorToProto(*parameter_, state.mutable_parameter());
TensorToProto(*accum_gradient_, state.mutable_accum_gradient());
TensorToProto(*accum_delta_, state.mutable_accum_delta());
TensorToProto(*update_delta_, state.mutable_update_delta());
auto str = state.SerializeAsString();
*state_len = str.size();
*state_len += str.size();
return str.c_str();
}
void AdadeltaOptimizer::DeserializeState(const std::string& str) {
AdadeltaOptimizerState state;
state.ParseFromString(str);
// TODO(zhihong) : add lr_policy DeserializeState
auto lr_state = state.lr_state();
this->lr_policy_->DeserializeState(lr_state.SerializeAsString());
num_sample_passed_ = state.num_sample_passed();
ProtoToTensor(state.parameter(), parameter_);
......
......@@ -19,20 +19,23 @@ void AdagradOptimizer::Update(const Tensor* gradient) {
}
// Serialize the Adagrad optimizer state (sample counter, learning-rate
// policy state, parameter and accumulated-gradient tensors) into an
// AdagradOptimizerState message; the byte count is reported via state_len.
//
// NOTE(review): `str` is a local std::string, so the pointer returned by
// str.c_str() refers to storage that is destroyed when this function
// returns -- the caller receives a dangling pointer.
// NOTE(review): both the `=` and the `+=` assignment to *state_len appear in
// this listing (looks like old and new diff lines shown together). With only
// `+=`, the length written by lr_policy_->SerializeState is added on top of
// the full message size -- confirm which one is intended.
const char* AdagradOptimizer::SerializeState(int* state_len) {
AdagradOptimizerState state;
// TODO(zhihong) : add lr_policy serialization
state.set_num_sample_passed(num_sample_passed_);
// NOTE(review): building a std::string from the returned const char* stops
// at the first '\0' byte and ignores the length stored in *state_len, so a
// serialized policy state containing zero bytes would be truncated.
std::string lr_str = this->lr_policy_->SerializeState(state_len);
state.mutable_lr_state()->ParseFromString(lr_str);
TensorToProto(*parameter_, state.mutable_parameter());
TensorToProto(*accum_gradient_, state.mutable_accum_gradient());
auto str = state.SerializeAsString();
*state_len = str.size();
*state_len += str.size();
return str.c_str();
}
void AdagradOptimizer::DeserializeState(const std::string& str) {
AdagradOptimizerState state;
state.ParseFromString(str);
// TODO(zhihong) : add lr_policy DeserializeState
auto lr_state = state.lr_state();
this->lr_policy_->DeserializeState(lr_state.SerializeAsString());
num_sample_passed_ = state.num_sample_passed();
ProtoToTensor(state.parameter(), parameter_);
ProtoToTensor(state.accum_gradient(), accum_gradient_);
......
......@@ -24,20 +24,23 @@ void AdamOptimizer::Update(const Tensor *gradient) {
// Serialize the Adam optimizer state (learning-rate policy state, sample
// counter, parameter, momentum and velocity tensors) into an
// AdamOptimizerState message; the byte count is reported via state_len.
//
// NOTE(review): `str` is a local std::string, so the pointer returned by
// str.c_str() refers to storage that is destroyed when this function
// returns -- the caller receives a dangling pointer.
// NOTE(review): both the `=` and the `+=` assignment to *state_len appear in
// this listing (looks like old and new diff lines shown together). With only
// `+=`, the length written by lr_policy_->SerializeState is added on top of
// the full message size -- confirm which one is intended.
const char *AdamOptimizer::SerializeState(int *state_len) {
AdamOptimizerState state;
// TODO(zhihong) : add lr_policy serialization
// NOTE(review): building a std::string from the returned const char* stops
// at the first '\0' byte and ignores the length stored in *state_len, so a
// serialized policy state containing zero bytes would be truncated.
std::string lr_str = this->lr_policy_->SerializeState(state_len);
state.mutable_lr_state()->ParseFromString(lr_str);
state.set_num_sample_passed(num_sample_passed_);
TensorToProto(*parameter_, state.mutable_parameter());
TensorToProto(*momentums_, state.mutable_momentums());
TensorToProto(*velocitys_, state.mutable_velocitys());
auto str = state.SerializeAsString();
*state_len = str.size();
*state_len += str.size();
return str.c_str();
}
void AdamOptimizer::DeserializeState(const std::string &str) {
AdamOptimizerState state;
state.ParseFromString(str);
// TODO(zhihong) : add lr_policy DeserializeState
auto lr_state = state.lr_state();
this->lr_policy_->DeserializeState(lr_state.SerializeAsString());
num_sample_passed_ = state.num_sample_passed();
ProtoToTensor(state.parameter(), parameter_);
......
......@@ -17,36 +17,56 @@ public:
// constant learning rate policy
// Learning-rate policy that always returns the rate it was constructed with.
// NOTE(review): this listing shows old and new versions of several members
// side by side (learning_rate vs. learning_rate_, two SerializeState /
// DeserializeState definitions); it looks like diff residue -- only one of
// each can exist in the real class.
class ConstLr final : public LrPolicy {
public:
ConstLr(double lr) : learning_rate(lr){};
ConstLr(double lr) : learning_rate_(lr){};
// The sample count is ignored; the stored rate is returned unchanged.
double LearningRate(const uint64_t num_sample_passed) {
return learning_rate;
return learning_rate_;
}
// Serialize the rate into an LrPolicyState message and report its size via
// state_len.
// NOTE(review): `str` is a local std::string, so the returned c_str()
// pointer dangles as soon as this function returns.
const char *SerializeState(int *state_len) {
LrPolicyState state;
state.set_learning_rate(learning_rate_);
auto str = state.SerializeAsString();
*state_len = str.size();
return str.c_str();
}
// Restore the rate from a serialized LrPolicyState.
void DeserializeState(const std::string &str) {
LrPolicyState state;
state.ParseFromString(str);
learning_rate_ = state.learning_rate();
}
const char *SerializeState(int *state_len) { return nullptr; }
void DeserializeState(const std::string &state) {}
private:
double learning_rate;
double learning_rate_;
};
// Learning-rate policy that decays linearly with the number of samples seen:
// max(learning_rate - lr_decay_a * num_sample_passed, lr_decay_b).
// NOTE(review): this listing shows old and new versions of several lines
// side by side (members with and without the trailing underscore, two
// bodies for SerializeState / DeserializeState); it looks like diff residue.
class LinearLr final : public LrPolicy {
public:
LinearLr(double lr, double lr_decay_a, double lr_decay_b)
: learning_rate(lr), lr_decay_a(lr_decay_a), lr_decay_b(lr_decay_b) {}
: learning_rate_(lr), lr_decay_a_(lr_decay_a), lr_decay_b_(lr_decay_b) {}
// lr_decay_b acts as the lower bound of the returned rate.
double LearningRate(const uint64_t num_sample_passed) {
return std::max(learning_rate - lr_decay_a * num_sample_passed, lr_decay_b);
return std::max(learning_rate_ - lr_decay_a_ * num_sample_passed,
lr_decay_b_);
}
// Serialize the three policy values into an LrPolicyState message and
// report its size via state_len.
// NOTE(review): `str` is a local std::string, so the returned c_str()
// pointer dangles as soon as this function returns.
const char *SerializeState(int *state_len) {
// TODO(zhihong) : add lr_policy serialization
return nullptr;
LrPolicyState state;
state.set_learning_rate(learning_rate_);
state.set_lr_decay_a(lr_decay_a_);
state.set_lr_decay_b(lr_decay_b_);
auto str = state.SerializeAsString();
*state_len = str.size();
return str.c_str();
}
void DeserializeState(const std::string &state) {
// TODO(zhihong) : add lr_policy serialization
// Restore the three policy values from a serialized LrPolicyState.
void DeserializeState(const std::string &str) {
LrPolicyState state;
state.ParseFromString(str);
learning_rate_ = state.learning_rate();
lr_decay_a_ = state.lr_decay_a();
lr_decay_b_ = state.lr_decay_b();
}
private:
double learning_rate;
double lr_decay_a;
double lr_decay_b;
double learning_rate_;
double lr_decay_a_;
double lr_decay_b_;
};
} // namespace optimizer
......
......@@ -30,16 +30,20 @@ void SGDOptimizer::Update(const Tensor *gradient) {
// Serialize the SGD optimizer state (sample counter, learning-rate policy
// state, parameter tensor and -- only when momentum_ is non-zero -- the
// momentum tensor) into an SGDOptimizerState message; the byte count is
// reported via state_len.
//
// NOTE(review): `str` is a local std::string, so the pointer returned by
// str.c_str() refers to storage that is destroyed when this function
// returns -- the caller receives a dangling pointer.
// NOTE(review): both the `=` and the `+=` assignment to *state_len appear in
// this listing (looks like old and new diff lines shown together). With only
// `+=`, the length written by lr_policy_->SerializeState is added on top of
// the full message size -- confirm which one is intended.
const char *SGDOptimizer::SerializeState(int *state_len) {
SGDOptimizerState state;
state.set_num_sample_passed(num_sample_passed_);
// NOTE(review): building a std::string from the returned const char* stops
// at the first '\0' byte and ignores the length stored in *state_len, so a
// serialized policy state containing zero bytes would be truncated.
std::string lr_str = this->lr_policy_->SerializeState(state_len);
state.mutable_lr_state()->ParseFromString(lr_str);
TensorToProto(*parameter_, state.mutable_parameter());
if (momentum_ != 0.0) TensorToProto(*momentums_, state.mutable_momentums());
auto str = state.SerializeAsString();
*state_len = str.size();
*state_len += str.size();
return str.c_str();
}
void SGDOptimizer::DeserializeState(const std::string &str) {
SGDOptimizerState state;
state.ParseFromString(str);
auto lr_state = state.lr_state();
this->lr_policy_->DeserializeState(lr_state.SerializeAsString());
num_sample_passed_ = state.num_sample_passed();
ProtoToTensor(state.parameter(), parameter_);
if (momentum_ != 0.0) ProtoToTensor(state.parameter(), momentums_);
......
......@@ -561,7 +561,7 @@ void Argument::degradeSequence(const Argument& input) {
void Argument::poolSequenceWithStride(const Argument& input,
size_t stride,
IVectorPtr* stridePostions,
ICpuGpuVectorPtr* stridePostions,
bool reversed) {
// If input.sequenceStartPositions = [0, 9, 14, 17, 30] and stride = 5,
// then sequenceStartPositions = [0, 2, 3, 4, 7].
......@@ -598,8 +598,8 @@ void Argument::poolSequenceWithStride(const Argument& input,
stridePos.emplace_back(starts[numSequences]);
int size = stridePos.size();
CHECK_EQ(size - 1, tgtBuf[numSequences]);
IVector::resizeOrCreate(*stridePostions, size, false);
(*stridePostions)->copyFrom(stridePos.data(), size);
ICpuGpuVector::resizeOrCreate(*stridePostions, size, false);
(*stridePostions)->getMutableVector(false)->copyFrom(stridePos.data(), size);
}
void Argument::getValueString(
......
......@@ -299,7 +299,7 @@ struct Argument {
*/
void poolSequenceWithStride(const Argument& input,
size_t stride,
IVectorPtr* stridePositions,
ICpuGpuVectorPtr* stridePositions,
bool reversed = false);
/**
* @brief getValueString will return the argument's output in string. There
......
......@@ -31,7 +31,7 @@ TEST(Argument, poolSequenceWithStride) {
int strideResultReversed[] = {0, 4, 9, 14, 17, 20, 25, 30};
for (auto reversed : {false, true}) {
IVectorPtr stridePositions;
ICpuGpuVectorPtr stridePositions;
output.poolSequenceWithStride(
input, 5 /* stride */, &stridePositions, reversed);
......@@ -45,7 +45,7 @@ TEST(Argument, poolSequenceWithStride) {
CHECK_EQ(stridePositions->getSize(), 8UL);
auto result = reversed ? strideResultReversed : strideResult;
for (int i = 0; i < 8; i++) {
CHECK_EQ(stridePositions->getData()[i], result[i]);
CHECK_EQ(stridePositions->getData(false)[i], result[i]);
}
}
}
......
......@@ -172,53 +172,3 @@ TEST_F(CommonTest, syncThreadPool) {
EXPECT_EQ((int)0, nums[i]);
}
}
// Drives REGISTER_SLOW_NODES_PROBE from a 10-thread SyncThreadPool in three
// rounds of ten iterations each, printing and resetting globalStat after
// every round. The test only exercises the probes; it contains no
// assertions.
TEST_F(CommonTest, barrierStat) {
const int threadNum = 10;
SyncThreadPool pool(threadNum);
// Random-delay variant: every pool task reseeds std::rand with the current
// time in microseconds, sleeps for std::rand() % 100000 microseconds, and
// then registers the probe under statName (extra arguments are forwarded).
#define TEST_BARRIER_RANDOM(statName, numConnThreads, ...) \
pool.exec([&](int tid, size_t numThreads) { \
struct timeval time; \
gettimeofday(&time, nullptr); \
uint64_t usec = timeToMicroSecond(time); \
std::srand(usec); \
auto value = std::rand() % 100000; \
usleep(value); \
REGISTER_SLOW_NODES_PROBE( \
globalStat, statName, numConnThreads, tid, __VA_ARGS__); \
});
// Round 1: random delays, no tag argument.
for (auto i = 0; i < 10; i++) {
TEST_BARRIER_RANDOM("synThreadBarrier1", threadNum);
TEST_BARRIER_RANDOM("synThreadBarrier2", threadNum);
}
globalStat.printAllStatus();
globalStat.reset();
// Round 2: random delays, with a tag argument.
for (auto i = 0; i < 10; i++) {
TEST_BARRIER_RANDOM("synThreadBarrier3", threadNum, "tag0");
TEST_BARRIER_RANDOM("synThreadBarrier4", threadNum, "tag1");
}
globalStat.printAllStatus();
globalStat.reset();
// use it to test accurate barrier gap
// Deterministic variant: task `tid` sleeps tid * 10000 microseconds before
// registering the probe, so the delay grows with the thread id.
#define TEST_BARRIER(statName, numConnThreads, ...) \
pool.exec([&](int tid, size_t numThreads) { \
usleep(tid * 10000); \
REGISTER_SLOW_NODES_PROBE( \
globalStat, statName, numConnThreads, tid, __VA_ARGS__); \
});
// Round 3: deterministic delays, with a tag argument.
for (auto i = 0; i < 10; i++) {
TEST_BARRIER("synThreadBarrier3", threadNum, "tag0");
TEST_BARRIER("synThreadBarrier4", threadNum, "tag1");
}
globalStat.printAllStatus();
globalStat.reset();
}
......@@ -15,7 +15,9 @@ limitations under the License. */
#pragma once
#include <cublas_v2.h>
#include "paddle/platform/dynamic_loader.h"
#include <dlfcn.h>
#include <mutex>
#include "paddle/platform/dynload/dynamic_loader.h"
namespace paddle {
namespace platform {
......
......@@ -15,7 +15,9 @@ limitations under the License. */
#pragma once
#include <cudnn.h>
#include "paddle/platform/dynamic_loader.h"
#include <dlfcn.h>
#include <mutex>
#include "paddle/platform/dynload/dynamic_loader.h"
namespace paddle {
namespace platform {
......
......@@ -15,7 +15,9 @@ limitations under the License. */
#pragma once
#include <curand.h>
#include "paddle/platform/dynamic_loader.h"
#include <dlfcn.h>
#include <mutex>
#include "paddle/platform/dynload/dynamic_loader.h"
namespace paddle {
namespace platform {
......
......@@ -12,13 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "dynamic_loader.h"
#include "paddle/platform/dynload/dynamic_loader.h"
#include <dlfcn.h>
#include <memory>
#include <mutex>
#include <string>
#include "gflags/gflags.h"
#include "glog/logging.h"
#include "paddle/framework/enforce.h"
DEFINE_string(cudnn_dir, "",
"Specify path for loading libcudnn.so. For instance, "
......@@ -72,13 +73,12 @@ static inline void GetDsoHandleFromDefaultPath(std::string& dso_path,
*dso_handle = dlopen(dso_path.c_str(), dynload_flags);
if (nullptr == *dso_handle) {
if (dso_path == "libcudnn.dylib") {
LOG(FATAL)
<< "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n" // NOLINT
<< "For instance, sudo tar -xzf "
"cudnn-7.5-osx-x64-v5.0-ga.tgz -C " // NOLINT
<< "/usr/local \n sudo chmod a+r "
"/usr/local/cuda/include/cudnn.h " // NOLINT
<< "/usr/local/cuda/lib/libcudnn*";
PADDLE_ENFORCE(true,
"Note: [Recommend] copy cudnn into /usr/local/cuda/ \n "
"For instance, sudo tar -xzf "
"cudnn-7.5-osx-x64-v5.0-ga.tgz -C /usr/local \n sudo "
"chmod a+r /usr/local/cuda/include/cudnn.h "
"/usr/local/cuda/lib/libcudnn*");
}
}
}
......@@ -106,22 +106,15 @@ static inline void GetDsoHandleFromSearchPath(const std::string& search_root,
GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags);
}
}
CHECK(nullptr != *dso_handle) << "Failed to find dynamic library: " << dlPath
<< " (" << dlerror() << ") \n"
<< "Please specify its path correctly using "
"following ways: \n"
<< "Method. set environment variable "
"LD_LIBRARY_PATH on Linux or "
<< "DYLD_LIBRARY_PATH on Mac OS. \n"
<< "For instance, issue command: export "
"LD_LIBRARY_PATH=... \n"
<< "Note: After Mac OS 10.11, using the "
"DYLD_LIBRARY_PATH is impossible "
<< "unless System Integrity Protection (SIP) "
"is disabled.";
PADDLE_ENFORCE(nullptr != *dso_handle,
"Failed to find dynamic library: %s ( %s ) \n Please specify "
"its path correctly using following ways: \n Method. set "
"environment variable LD_LIBRARY_PATH on Linux or "
"DYLD_LIBRARY_PATH on Mac OS. \n For instance, issue command: "
"export LD_LIBRARY_PATH=... \n Note: After Mac OS 10.11, "
"using the DYLD_LIBRARY_PATH is impossible unless System "
"Integrity Protection (SIP) is disabled.",
dlPath, dlerror());
}
void GetCublasDsoHandle(void** dso_handle) {
......
......@@ -142,7 +142,7 @@ SocketServer::SocketServer(const std::string &addr, int port, int rdmaCpu)
}
/// trigger to initialize RDMA lib
PCHECK(RdmaClientDaemons::get()) << "initilizate RDMA failed\n";
CHECK(RdmaClientDaemons::get()) << "initilizate RDMA failed\n";
}
SocketServer::~SocketServer() {
......@@ -168,7 +168,7 @@ void SocketServer::tcpServer() {
/// First call to socket() function
socket_ = socket(AF_INET, SOCK_STREAM, 0);
PCHECK(socket_ >= 0) << "ERROR opening socket";
CHECK(socket_ >= 0) << "ERROR opening socket";
/// Initialize socket structure
bzero((char *)&serv_addr, sizeof(serv_addr));
......@@ -176,7 +176,7 @@ void SocketServer::tcpServer() {
serv_addr.sin_port = htons(port_);
if (!addr_.empty()) {
server = gethostbyname(addr_.c_str());
PCHECK(server) << "ERROR, no such host: " << addr_;
CHECK(server) << "ERROR, no such host: " << addr_;
bcopy((char *)server->h_addr,
(char *)&serv_addr.sin_addr.s_addr,
server->h_length);
......@@ -187,7 +187,7 @@ void SocketServer::tcpServer() {
setOption(socket_);
/// Now bind the host address using bind() call.
PCHECK(bind(socket_, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) >= 0)
CHECK(bind(socket_, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) >= 0)
<< "ERROR on binding " << addr_;
/// Now start listening for the clients, here process will
......@@ -201,7 +201,7 @@ void SocketServer::tcpServer() {
if (stopping_) {
break;
}
PCHECK(newsockfd >= 0) << "ERROR on accept";
CHECK(newsockfd >= 0) << "ERROR on accept";
constexpr int kPeerNameLen = 128;
char peerName[kPeerNameLen];
CHECK(inet_ntop(AF_INET, &cli_addr.sin_addr, peerName, kPeerNameLen));
......@@ -227,14 +227,14 @@ void SocketServer::rdmaServer() {
/// First call to socket() function
rdmaSocket_ = rdma::ssocket(rdmaCpu_);
PCHECK(rdmaSocket_) << "ERROR opening RDMA socket";
CHECK(rdmaSocket_) << "ERROR opening RDMA socket";
PCHECK(rdma::bind(rdmaSocket_, rdmaUri_.c_str()) == 0)
CHECK(rdma::bind(rdmaSocket_, rdmaUri_.c_str()) == 0)
<< "ERROR bind RDMA socket";
/// Now start listening for the clients, here process will
/// go in sleep mode and will wait for the incoming connection
PCHECK(rdma::listen(rdmaSocket_) == 0) << "ERROR listen RDMA socket";
CHECK(rdma::listen(rdmaSocket_) == 0) << "ERROR listen RDMA socket";
while (true) {
/// Accept actual connection from the client
......@@ -242,7 +242,7 @@ void SocketServer::rdmaServer() {
if (stopping_) {
break;
}
PCHECK(newsock) << "ERROR on accept";
CHECK(newsock) << "ERROR on accept";
constexpr int kPeerNameLen = 128;
char peerName[kPeerNameLen];
......@@ -290,7 +290,7 @@ RdmaClientDaemons::RdmaClientDaemons() {
onlineCpus_ = rdma::numCpus();
for (auto i = 0; i < onlineCpus_; i++) {
socket = rdma::csocket(i);
PCHECK(socket) << "ERROR open client socket daemon";
CHECK(socket) << "ERROR open client socket daemon";
rdmaClientSocket_.push_back(socket);
}
......@@ -355,7 +355,7 @@ void SocketClient::TcpClient(const std::string &serverAddr, int serverPort) {
/// Create a socket point
int sockfd = socket(AF_INET, SOCK_STREAM, 0);
PCHECK(sockfd >= 0) << "ERROR opening socket";
CHECK(sockfd >= 0) << "ERROR opening socket";
#if defined(__OSX__) || defined(__APPLE__)
server = getipnodebyname(serverAddr.c_str(), AF_INET, AI_DEFAULT, &errRet);
......@@ -396,8 +396,8 @@ void SocketClient::TcpClient(const std::string &serverAddr, int serverPort) {
}
std::this_thread::sleep_for(std::chrono::seconds(1));
} else {
PCHECK(errno != 0) << "ERROR connecting to " << serverAddr << ":"
<< serverPort << "errorno: " << errno;
CHECK(errno != 0) << "ERROR connecting to " << serverAddr << ":"
<< serverPort << "errorno: " << errno;
}
} while (errno == ECONNREFUSED);
......@@ -426,7 +426,7 @@ void SocketClient::RdmaClient(const std::string &serverAddr, int serverPort) {
/// connect to server with socket daemon
sock = rdma::connect(socketDaemon_, rdmaUri.c_str());
PCHECK(sock) << "ERROR connect to server" << rdmaUri;
CHECK(sock) << "ERROR connect to server" << rdmaUri;
std::vector<std::string> seg;
str::split(rdmaUri, '/', &seg);
......
......@@ -217,10 +217,6 @@ void ParameterServer2::setConfig(const SetConfigRequest& request,
SetConfigResponse response;
callback(response);
/// always defined, barrier slowest node function need it.
statSet_.reset(new StatSet("ParameterServer" +
str::to_string(static_cast<int>(serverId_))));
}
real bufferSum(const std::vector<ParameterServer2::Buffer>& buffers) {
......@@ -369,50 +365,7 @@ void ParameterServer2::addGradient(const SendParameterRequest& request,
std::vector<Buffer>* outputBuffers) {
VLOG(1) << "pserver: addGradient";
// forwardbackward delta from all trainers
// indicate the fluctuation caused by forwardbackward.
if (!numPassFinishClients_) {
REGISTER_BARRIER_DELTA_SERVER_SET(
*statSet_,
"forwardbackwardDelta",
FLAGS_num_gradient_servers,
request.trainer_id(),
request.forwardbackward_time(),
isSparseServer_ ? "_sparseUpdater" : "_denseUpdater");
}
{
/// approximately pure network overhead
REGISTER_TIMER_DYNAMIC_SET(
"pushRecv", timeToMicroSecond(*handleRequestBegin_), -1, *statSet_);
}
#ifndef PADDLE_DISABLE_TIMER
gettimeofday(&(*addGradBegin_), nullptr);
#endif
/// barrier fluctuation caused by network and previous forwardbackward
if (!numPassFinishClients_) {
REGISTER_BARRIER_TIMER_SERVER_SET(
*statSet_,
"handleReqBegin",
FLAGS_num_gradient_servers,
request.trainer_id(),
(*handleRequestBegin_),
isSparseServer_ ? "_sparseUpdater" : "_denseUpdater");
}
if (!numPassFinishClients_) {
REGISTER_BARRIER_TIMER_SERVER(
*statSet_,
"addGradBegin",
FLAGS_num_gradient_servers,
request.trainer_id(),
isSparseServer_ ? "_sparseUpdater" : "_denseUpdater");
}
{
REGISTER_TIMER_DYNAMIC("addGradCore", -1, *statSet_);
ReadLockGuard guard(parameterMutex_);
int bufferIndex = 0;
for (const auto& block : request.blocks()) {
......@@ -444,15 +397,6 @@ void ParameterServer2::addGradient(const SendParameterRequest& request,
std::lock_guard<std::mutex> guard(*info.lock);
simd::addTo(gradientSumBuffer, gradientBuffer, size);
}
if (!numPassFinishClients_) {
REGISTER_BARRIER_TIMER_SERVER(
*statSet_,
"addGradCoreFinish",
FLAGS_num_gradient_servers,
request.trainer_id(),
isSparseServer_ ? "_sparseUpdater" : "_denseUpdater");
}
}
if (request.batch_status() == BATCH_FINISH ||
request.batch_status() == BATCH_START_AND_FINISH) {
......@@ -461,47 +405,12 @@ void ParameterServer2::addGradient(const SendParameterRequest& request,
VLOG(1) << "num samples: " << numSamplesProcessed_
<< ", new cost:" << cost_;
/// numPassFinishClients_ means some trainer has entered finishPass
if (!numPassFinishClients_) {
REGISTER_SLOW_NODES_PROBE(
*statSet_,
"SLOW_NODES",
FLAGS_num_gradient_servers,
request.trainer_id(),
isSparseServer_ ? "_sparseUpdater" : "_denseUpdater");
}
/// notify doOperation gradient ready
gradientReadyBarrier_.wait();
/// if wait pass finish does not start, do check
if (!numPassFinishClients_) {
CHECK_BARRIER_TIMER(*statSet_,
"SLOW_NODES",
FLAGS_num_gradient_servers,
isSparseServer_ ? "_sparseUpdater" : "_denseUpdater");
}
/// barrier performance while all parameter add is finished
/// can indicate the fluctuation caused by computation at pserver.
if (!numPassFinishClients_) {
REGISTER_BARRIER_TIMER_SERVER(
*statSet_,
"paraReady",
FLAGS_num_gradient_servers,
request.trainer_id(),
isSparseServer_ ? "_sparseUpdater" : "_denseUpdater");
}
/// wait doOperation finish
parameterReadyBarrier_.wait();
VLOG(1) << "start send back";
{
/// total time except overhead of network.
REGISTER_TIMER_DYNAMIC_SET("sendParaNoRecvNoSend",
timeToMicroSecond(*addGradBegin_),
-1,
*statSet_);
}
}
}
......@@ -543,57 +452,6 @@ bool ParameterServer2::asyncGrdientCommitCheckAndStat(
return commitGradient;
}
void ParameterServer2::printAsyncGradientCommitStatAndReset() {
std::stringstream statFormat;
if (asyncUpdateSteps_) {
statFormat << "async discard gradients stat: " << std::endl;
statFormat << "serverId: " << serverId_
<< " serverType: " << isSparseServer_
<< " total updates: " << asyncUpdateSteps_
<< " discard updates: " << asyncLaggedGradientsNum_
<< " discard ratio: "
<< (real)asyncLaggedGradientsNum_ / (real)asyncUpdateSteps_;
statFormat << std::endl;
statFormat << std::endl;
statFormat << "Async Gradient Update Steps distribution: " << std::endl
<< "Sample: 1:1912(0.00284449) means "
<< "the updates step=1 count 1912 times "
<< "and account for 0.284449% of total updates" << std::endl;
size_t index = 0;
for (const auto& stat : asyncUpdateStat_) {
statFormat << index << ":" << stat << "("
<< (real)stat / (real)asyncUpdateSteps_ << ") ";
index++;
}
statFormat << std::endl;
statFormat << std::endl;
statFormat << "Async Gradient Discard based on trainer_id: " << std::endl
<< "Sample: 2:22(0.0016363) means "
<< "total discarded updates from trainer_id=2 count 22 "
<< "and account for 0.16363% of all updates from trainer_id=2"
<< std::endl;
for (auto i = 0; i < FLAGS_num_gradient_servers; i++) {
real ratio =
(real)asyncTrainerDiscardStat_[i] /
(real)(asyncTrainerCommitStat_[i] + asyncTrainerDiscardStat_[i]);
statFormat << i << ":" << asyncTrainerDiscardStat_[i] << "(" << ratio
<< ")"
<< " ";
}
LOG(INFO) << statFormat.str();
/// reset stat
asyncUpdateSteps_ = 0;
asyncTrainerSteps_.assign(asyncTrainerSteps_.size(), 0);
asyncLaggedGradientsNum_ = 0;
asyncUpdateStat_.assign(asyncUpdateStat_.size(), 0);
asyncTrainerDiscardStat_.assign(asyncTrainerDiscardStat_.size(), 0);
asyncTrainerCommitStat_.assign(asyncTrainerCommitStat_.size(), 0);
}
}
static ThreadLocal<std::vector<bool>> localBlockBitset_;
void ParameterServer2::asyncSGD(const SendParameterRequest& request,
......@@ -695,7 +553,6 @@ void ParameterServer2::asyncSGD(const SendParameterRequest& request,
if (request.trainer_id() == 0) {
/// batchId_ is approximately equal to "real batchId_"
batchId_++;
tuningAsyncsgdMidOutput();
}
}
......@@ -881,34 +738,6 @@ void ParameterServer2::sendParameter(const SendParameterRequest& request,
}
(*requestVec_).clear();
(*callbackVec_).clear();
/// barrier performance once all data has finished sending.
/// indicates network fluctuation for big messages.
if (!numPassFinishClients_) {
REGISTER_BARRIER_TIMER_SERVER(
*statSet_,
"sendParamFinish",
FLAGS_num_gradient_servers,
request.trainer_id(),
isSparseServer_ ? "_sparseUpdater" : "_denseUpdater");
}
/// all time exhausted in parameterServer for big message.
/// it contains network and computation at pserver.
{
/// total time including overhead of network.
REGISTER_TIMER_DYNAMIC_SET("sendParaTotal",
timeToMicroSecond(*handleRequestBegin_),
-1,
*statSet_);
}
/// all time spent in the pserver except network receive.
{
/// total time except overhead of network receive
REGISTER_TIMER_DYNAMIC_SET("sendParaNoRecv",
timeToMicroSecond(*addGradBegin_),
-1,
*statSet_);
}
}
break;
case PSERVER_UPDATE_MODE_SET_PARAM:
......@@ -1088,8 +917,6 @@ void ParameterServer2::op_SGD(const Operation& operation,
}
{
REGISTER_TIMER_DYNAMIC("op_SGD", -1, *statSet_);
parallelExecForEachBlock([&](int64_t blockId, const VectorPtr vecs[]) {
BlockInfo& info = blockInfos_[blockId];
const ParameterConfig& config = getParameterConfig(blockId);
......@@ -1113,7 +940,6 @@ void ParameterServer2::op_SGD(const Operation& operation,
}
batchId_++;
tuningSgdMidOutput();
}
void ParameterServer2::op_start_pass(const Operation& operation,
......@@ -1146,8 +972,6 @@ void ParameterServer2::op_finish_pass(const Operation& operation,
/// finish pass
info.optimizer->finishPass();
});
tuningSgdFinished();
batchId_ = 0;
}
......@@ -1515,7 +1339,6 @@ void ParameterServer2::asyncFinishPass(const SynchronizeRequest& request,
callback(SynchronizeResponse());
if (request.trainer_id() == 0) {
tuningAsyncsgdFinished();
batchId_ = 0;
}
}
......@@ -1574,42 +1397,4 @@ void ParameterServer2::releaseMatrix(const ReleaseMatrixRequest& request,
callback(response);
}
void ParameterServer2::tuningSgdMidOutput() {
if (batchId_ && batchId_ % FLAGS_log_period_server == 0) {
LOG(INFO) << "======== Batch=" << batchId_ << "=======";
statSet_->setThreadInfo(true);
statSet_->printAllStatus();
/// not reset raw data for reducing the overhead of performance tuning
statSet_->reset(false);
}
}
void ParameterServer2::tuningSgdFinished() {
LOG(INFO) << "======== Batch=" << batchId_ << " pass END"
<< "=======";
statSet_->setThreadInfo(true);
statSet_->printAllStatus();
/**
* reset raw data at end of pass since some raw data could be not
* complete. Otherwise the raw data will pollute next pass performance
* tuning
*/
statSet_->reset();
}
/**
 * Async-SGD counterpart of the periodic report: every
 * FLAGS_log_period_server batches (never for batch 0) print and reset the
 * async gradient commit statistics. Compiled out entirely when
 * PADDLE_DISABLE_TIMER is defined.
 */
void ParameterServer2::tuningAsyncsgdMidOutput() {
#ifndef PADDLE_DISABLE_TIMER
  const bool atLogPeriod =
      batchId_ && batchId_ % FLAGS_log_period_server == 0;
  if (!atLogPeriod) {
    return;
  }
  LOG(INFO) << "======== [not accurate] Batch=" << batchId_ << "=======";
  printAsyncGradientCommitStatAndReset();
#endif
}
/**
 * End-of-pass report for async-SGD: log a banner with the current batch id,
 * then print and reset the async gradient commit statistics.
 */
void ParameterServer2::tuningAsyncsgdFinished() {
  // Banner first, so the statistics that follow can be attributed to a pass.
  LOG(INFO) << "======== [not accurate] Batch=" << batchId_ << " pass END"
            << "=======";

  printAsyncGradientCommitStatAndReset();
}
} // namespace paddle
......@@ -298,24 +298,6 @@ protected:
/// barrier performance tuning sync-sgd required
std::atomic<int64_t> batchId_;
/// the beginning of addGradient without network overhead
ThreadLocal<struct timeval> addGradBegin_;
/**
* tuning barrier performance
* to better control log for sparse and dense parameter,
* we use different log entities for different parameterServer
* objects.
* it will output lots of performance stats to perceive the
* overhead of network, fluctuation of computation from
* forwardbackward and network, computation from optimization
* at pserver end, barrier overhead, etc. to understand tuning
* data, focus on the synchronization between addGradient and
* doOperation which indirectly call op_SGD operation controlled
* by remote updater controller
*/
std::unique_ptr<StatSet> statSet_;
public:
struct Buffer {
real* base;
......@@ -325,7 +307,6 @@ public:
protected:
/// async gradient commit control
bool asyncGrdientCommitCheckAndStat(const SendParameterRequest& request);
void printAsyncGradientCommitStatAndReset();
public:
/// disable default parameter for overloading
......@@ -710,36 +691,6 @@ public:
void op_load(const Operation& operation, OperationResult* result);
void op_save(const Operation& operation, OperationResult* result);
/**
* @brief output log at the middle stage of training
*
* @note flush log history and state at the end for sgd
*/
void tuningSgdMidOutput();
/**
* @brief output log at the end stage of training
*
* @note flush log history and state at the end for sgd. it will also
* flush some stateful stat for next pass.
*/
void tuningSgdFinished();
/**
* @brief output log at the middle stage of training
*
* @note flush log history and state at the end for async-sgd.
* it will log some performance log if some lagged nodes are found
*/
void tuningAsyncsgdMidOutput();
/**
* @brief output log at the end stage of training
*
* @note flush log history and state at the end for async-sgd.
*/
void tuningAsyncsgdFinished();
};
} // namespace paddle
......@@ -51,7 +51,7 @@ size_t SocketChannel::read(void* buf, size_t size) {
else
len = rdma::read(rdmaSocket_, (char*)buf + total, size - total);
PCHECK(len >= 0) << " peer=" << peerName_;
CHECK(len >= 0) << " peer=" << peerName_;
if (len <= 0) {
return total;
}
......@@ -69,7 +69,7 @@ size_t SocketChannel::write(const void* buf, size_t size) {
else
len = rdma::write(rdmaSocket_, (char*)buf + total, size - total);
PCHECK(len >= 0) << " peer=" << peerName_;
CHECK(len >= 0) << " peer=" << peerName_;
if (len <= 0) {
return total;
}
......@@ -98,10 +98,10 @@ static size_t readwritev(IOFunc iofunc,
while (size < total) {
ssize_t len =
iofunc(socket, &iovs[curIov], std::min(iovcnt - curIov, maxiovs));
PCHECK(len > 0) << " peer=" << peerName << " curIov=" << curIov
<< " iovCnt=" << iovcnt
<< " iovs[curIov].base=" << iovs[curIov].iov_base
<< " iovs[curIov].iov_len=" << iovs[curIov].iov_len;
CHECK(len > 0) << " peer=" << peerName << " curIov=" << curIov
<< " iovCnt=" << iovcnt
<< " iovs[curIov].base=" << iovs[curIov].iov_base
<< " iovs[curIov].iov_len=" << iovs[curIov].iov_len;
size += len;
/// restore iovs[curIov] to the original value
......@@ -183,7 +183,7 @@ void SocketChannel::writeMessage(const std::vector<struct iovec>& userIovs) {
header.totalLength += iov.iov_len;
}
PCHECK(writev(iovs) == (size_t)header.totalLength);
CHECK(writev(iovs) == (size_t)header.totalLength);
}
std::unique_ptr<MsgReader> SocketChannel::readMessage() {
......@@ -194,7 +194,7 @@ std::unique_ptr<MsgReader> SocketChannel::readMessage() {
return nullptr;
}
PCHECK(len == sizeof(header));
CHECK(len == sizeof(header));
std::unique_ptr<MsgReader> msgReader(new MsgReader(this, header.numIovs));
......@@ -209,7 +209,7 @@ std::unique_ptr<MsgReader> SocketChannel::readMessage() {
MsgReader::MsgReader(SocketChannel* channel, size_t numBlocks)
: channel_(channel), blockLengths_(numBlocks), currentBlockIndex_(0) {
size_t size = numBlocks * sizeof(blockLengths_[0]);
PCHECK(channel_->read(&blockLengths_[0], size) == size);
CHECK(channel_->read(&blockLengths_[0], size) == size);
}
void MsgReader::readBlocks(const std::vector<void*>& bufs) {
......@@ -223,12 +223,12 @@ void MsgReader::readBlocks(const std::vector<void*>& bufs) {
++currentBlockIndex_;
}
PCHECK(channel_->readv(&iovs) == totalLength);
CHECK(channel_->readv(&iovs) == totalLength);
}
void MsgReader::readNextBlock(void* buf) {
CHECK_LT(currentBlockIndex_, blockLengths_.size());
PCHECK(channel_->read(buf, getNextBlockLength()) == getNextBlockLength());
CHECK(channel_->read(buf, getNextBlockLength()) == getNextBlockLength());
++currentBlockIndex_;
}
......
......@@ -113,7 +113,7 @@ void SocketServer::run() {
/* First call to socket() function */
socket_ = socket(AF_INET, SOCK_STREAM, 0);
PCHECK(socket_ >= 0) << "ERROR opening socket";
CHECK(socket_ >= 0) << "ERROR opening socket";
/* Initialize socket structure */
bzero((char*)&serv_addr, sizeof(serv_addr));
......@@ -122,7 +122,7 @@ void SocketServer::run() {
serv_addr.sin_port = htons(port_);
/* Now bind the host address using bind() call.*/
PCHECK(bind(socket_, (struct sockaddr*)&serv_addr, sizeof(serv_addr)) >= 0)
CHECK(bind(socket_, (struct sockaddr*)&serv_addr, sizeof(serv_addr)) >= 0)
<< "ERROR on binding";
/* Now start listening for the clients, here process will
......@@ -134,7 +134,7 @@ void SocketServer::run() {
while (true) {
/* Accept actual connection from the client */
newsockfd = accept(socket_, (struct sockaddr*)&cli_addr, &clilen);
PCHECK(newsockfd >= 0) << "ERROR on accept";
CHECK(newsockfd >= 0) << "ERROR on accept";
SocketWorker* worker = new SocketWorker(newsockfd);
worker->start();
......@@ -146,17 +146,17 @@ void SocketWorker::run() {
while (true) {
int64_t n = channel_.readAll(&header, sizeof(header));
PCHECK(n == sizeof(header)) << "ERROR reading from socket";
CHECK(n == sizeof(header)) << "ERROR reading from socket";
buffer_.resize(header.dataLength);
n = channel_.readAll(&buffer_[0], header.dataLength);
PCHECK(n == header.dataLength) << "ERROR reading from socket";
CHECK(n == header.dataLength) << "ERROR reading from socket";
/* Write a response to the client */
n = channel_.writeAll(&header, sizeof(header));
PCHECK(n == sizeof(header)) << "ERROR reading from socket";
CHECK(n == sizeof(header)) << "ERROR reading from socket";
n = channel_.writeAll(buffer_.data(), buffer_.size());
PCHECK(n == header.dataLength) << "ERROR writing to socket";
CHECK(n == header.dataLength) << "ERROR writing to socket";
}
}
......@@ -177,9 +177,9 @@ SocketClient::SocketClient(const std::string& serverAddr, int serverPort) {
/* Create a socket point */
int sockfd = socket(AF_INET, SOCK_STREAM, 0);
PCHECK(sockfd >= 0) << "ERROR opening socket";
CHECK(sockfd >= 0) << "ERROR opening socket";
server = gethostbyname(serverAddr.c_str());
PCHECK(server) << "ERROR, no such host: " << serverAddr;
CHECK(server) << "ERROR, no such host: " << serverAddr;
bzero((char*)&serv_addr, sizeof(serv_addr));
serv_addr.sin_family = AF_INET;
......@@ -189,7 +189,7 @@ SocketClient::SocketClient(const std::string& serverAddr, int serverPort) {
serv_addr.sin_port = htons(serverPort);
/* Now connect to the server */
PCHECK(connect(sockfd, (sockaddr*)&serv_addr, sizeof(serv_addr)) >= 0)
CHECK(connect(sockfd, (sockaddr*)&serv_addr, sizeof(serv_addr)) >= 0)
<< "ERROR connecting";
channel_.reset(new SocketChannel(sockfd));
......@@ -234,18 +234,18 @@ int main(int argc, char** argv) {
cpuGrad.copyFrom(gpuGrad);
header.dataLength = dataSize;
PCHECK(channel->writeAll(&header, sizeof(header)) == sizeof(header))
CHECK(channel->writeAll(&header, sizeof(header)) == sizeof(header))
<< "Client write header error";
PCHECK(channel->writeAll(cpuGrad.getData(), dataSize) == dataSize)
CHECK(channel->writeAll(cpuGrad.getData(), dataSize) == dataSize)
<< "Client write data error";
/* Now read server response */
PCHECK(channel->readAll(&header, sizeof(header)) == sizeof(header))
CHECK(channel->readAll(&header, sizeof(header)) == sizeof(header))
<< "Client read header error";
CHECK_EQ((uint64_t)header.dataLength, dataSize);
PCHECK(channel->readAll(cpuParam.getData(), dataSize) == dataSize)
CHECK(channel->readAll(cpuParam.getData(), dataSize) == dataSize)
<< "Client read data error";
gpuParam.copyFrom(cpuParam);
......
......@@ -3,7 +3,7 @@
set -xe
# Set BASE_IMAGE according to env variables
if [ ${WITH_GPU} == "ON" ]; then
if [[ ${WITH_GPU} == "ON" ]]; then
BASE_IMAGE="nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04"
else
BASE_IMAGE="ubuntu:16.04"
......@@ -78,7 +78,7 @@ paddle version
# PaddlePaddle. This awkwardness is due to
# https://github.com/PaddlePaddle/Paddle/issues/1854. It also
# describes a solution.
if [ ${WITH_DOC} == "ON" ]; then
if [[ ${WITH_DOC} == "ON" ]]; then
cat <<EOF
========================================
Building documentation ...
......
......@@ -5,13 +5,14 @@ set -e
mkdir -p $TRAVIS_BUILD_DIR/build
cd $TRAVIS_BUILD_DIR/build
# Compile Documentation only.
cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=OFF -DWITH_STYLE_CHECK=OFF
# Compile paddle binaries first
cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=OFF -DWITH_GOLANG=ON -DWITH_STYLE_CHECK=OFF
mkdir output
make -j `nproc`
find .. -name '*whl' | xargs pip install # install all wheels.
rm -rf *
# Compile Documentation only.
cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=ON
make -j `nproc` paddle_docs paddle_docs_cn
......@@ -25,7 +26,7 @@ SSH_REPO=${REPO/https:\/\/github.com\//git@github.com:}
SHA=`git rev-parse --verify HEAD`
# Documentation branch name
# gh-pages branch is used for PaddlePaddle.org. The English version of
# gh-pages branch is used for PaddlePaddle.org. The English version of
# documentation in `doc` directory, and the chinese version in `doc_cn`
# directory.
TARGET_BRANCH="gh-pages"
......@@ -51,7 +52,7 @@ function deploy_docs() {
# checkout github page branch
git checkout $TARGET_BRANCH || git checkout --orphan $TARGET_BRANCH
mkdir -p ${DIR}
# remove old docs. mv new docs.
set +e
......@@ -62,7 +63,7 @@ function deploy_docs() {
git add .
}
deploy_docs "master" "."
deploy_docs "master" "."
deploy_docs "develop" "./develop/"
# Check is there anything changed.
......
......@@ -175,7 +175,7 @@ real Tester::forwardOneBatch(const DataBatch& dataBatch,
}
hl_stream_synchronize(HPPL_STREAM_DEFAULT);
FILE* fp = fopen(featFile.c_str(), "ab+");
PCHECK(!ferror(fp)) << "Fail to open " << featFile;
CHECK(!ferror(fp)) << "Fail to open " << featFile;
size_t sampleNum = featMatrices[0]->getHeight();
for (size_t i = 0; i < sampleNum; ++i) {
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/utils/BarrierStat.h"
#include <string.h>
#include <sys/types.h>
#include <algorithm>
#include <iomanip>
#include "paddle/utils/Flags.h"
#include "paddle/utils/Stat.h"
DEFINE_bool(log_barrier_abstract,
true,
"if true, show abstract of barrier performance");
DEFINE_int32(log_barrier_lowest_nodes,
5,
"how many lowest node will be logged");
DEFINE_bool(log_barrier_show_log,
false, // for performance tuning insight
"if true, always show barrier abstract even with little gap");
namespace paddle {
/**
 * Stream the barrier summary of `stat` into `output`.
 *
 * Nothing is written when FLAGS_log_barrier_abstract is false. Otherwise the
 * stat's lock_ is held while showAbstract() runs.
 *
 * @return the same stream, to allow chaining
 */
std::ostream &operator<<(std::ostream &output, const BarrierStatBase &stat) {
  if (!FLAGS_log_barrier_abstract) {
    return output;
  }
  std::lock_guard<std::mutex> guard(stat.lock_);
  stat.showAbstract(output);
  return output;
}
/**
 * Size the per-trainer summary table and choose the reporting threshold.
 *
 * @param numConnThreads  number of connections (treated as the node count)
 * @param name            label stored in name_
 */
BarrierStatBase::BarrierStatBase(uint16_t numConnThreads,
                                 const std::string &name)
    : totSamples_(0), numConnThreads_(numConnThreads), name_(name) {
  abstract_.resize(numConnThreads_);
  if (!FLAGS_log_barrier_show_log) {
    /* probability of an abnormal node:
     *   p = 1/n + (n/8)/(n+1), n = nodes, n > 1
     * if the frequency of the slowest trainerId is larger than p,
     * output FLAGS_log_barrier_lowest_nodes lastTrainerId.
     * numConnThreads_ indicates the number of nodes.
     */
    const float nodes = static_cast<float>(numConnThreads);
    rateThreshold_ = 1.0 / nodes + (nodes / 8.0) / (nodes + 1.0);
  } else {
    // Threshold 0 means the summary is never suppressed by the rate test.
    rateThreshold_ = 0.0;
  }
}
/**
 * Build a barrier-end stat: allocate a TimeVectorEnd with one slot per
 * connection, clear all summaries and raw data, and log the threshold in use.
 */
BarrierEndStat::BarrierEndStat(uint16_t numConnThreads, const std::string &name)
    : BarrierStatBase(numConnThreads, name) {
  // The base constructor has already stored numConnThreads_.
  timeVector_.reset(new TimeVectorEnd(numConnThreads_));

  const bool clearRawData = true;
  reset(clearRawData);

  LOG(INFO) << " create barrierEndStat: " << name
            << " endBarrier warning rate: " << rateThreshold_;
}
/*
* Note:
* the design different pserver entity owns different statSet to obey
* the background that different pserver runs separately.
*/
void BarrierEndStat::updateStat(struct timeval &cur, int32_t trainerId) {
CHECK_LT(trainerId, numConnThreads_) << "trainerId is invalid in barrier";
std::lock_guard<std::mutex> guard(lock_);
timeVector_->addTimeval(cur, trainerId);
if (timeVector_->full()) {
std::lock_guard<std::mutex> abstractGuard(abstractLock_);
auto id = timeVector_->getLastTrainerId();
auto delta = timeToMicroSecond(timeVector_->getDelta());
auto secondDelta = timeToMicroSecond(timeVector_->get1NDelta());
auto lastTwoDelta = timeToMicroSecond(timeVector_->getMinus1NDelta());
auto midDelta = timeToMicroSecond(timeVector_->getMidNDelta());
// discard first sample, since first sample probably is abnormal.
if (totSamples_) {
abstract_[id].freq++;
if (delta < abstract_[id].minDelta) {
abstract_[id].minDelta = delta;
}
if (delta > abstract_[id].maxDelta) {
abstract_[id].maxDelta = delta;
}
abstract_[id].totDelta += delta;
abstract_[id].totSecondDelta += secondDelta;
abstract_[id].totLastTwoDelta += lastTwoDelta;
abstract_[id].totMidDelta += midDelta;
// update totAbstract_
totAbstract_.freq++;
if (delta < totAbstract_.minDelta) {
totAbstract_.minDelta = delta;
}
if (delta > totAbstract_.maxDelta) {
totAbstract_.maxDelta = delta;
}
totAbstract_.totDelta += delta;
totAbstract_.totSecondDelta += secondDelta;
totAbstract_.totLastTwoDelta += lastTwoDelta;
totAbstract_.totMidDelta += midDelta;
}
totSamples_++;
timeVector_->reset();
}
}
void BarrierEndStat::reset(bool clearRawData) {
int32_t i = 0;
totSamples_ = 0;
std::lock_guard<std::mutex> guard(abstractLock_);
if (clearRawData) {
timeVector_->reset();
}
for (auto &abstract : abstract_) {
memset((void *)&abstract, 0, sizeof(abstract));
abstract.minDelta = UINT64_MAX;
abstract.trainerId = i++;
}
memset((void *)&totAbstract_, 0, sizeof(Abstract));
totAbstract_.minDelta = UINT64_MAX;
}
/**
 * Write the barrier-end summary table to `output`.
 *
 * Nothing is written when there are at most two connections, when no sample
 * has been taken, or when the most frequent last-arriving trainer accounts
 * for a smaller share of samples than rateThreshold_.
 */
void BarrierEndStat::showAbstract(std::ostream &output) const {
  // do not support the case "<=2 pserver"
  if (numConnThreads_ <= 2 || !totSamples_) {
    return;
  }

  // Work on a copy ordered by how often each trainer arrived last.
  std::vector<struct Abstract> ranked = abstract_;
  std::sort(ranked.begin(),
            ranked.end(),
            [](const struct Abstract &lhs, const struct Abstract &rhs) {
              return lhs.freq > rhs.freq;
            });

  const float samples = static_cast<float>(totSamples_);
  const auto topRate = static_cast<float>(ranked[0].freq) / samples;
  if (topRate < rateThreshold_) {
    return;
  }

  output << std::setw(20) << name_ << std::endl;

  /*
   * Note:
   * avgGap: the average delta between 1 -- n arriving trainers
   * avgSecondGap: the average delta between 2 -- n arriving trainers
   * avgLastTwoGap: the average delta between n-1 -- n arriving trainers
   * avgMidGap: the average delta between n/2 -- n arriving trainers
   * rate: samples / totSamples
   *
   * the stat is based on per trainer if trainer_id is set, totAbstract is
   * stat based on all trainers scope.
   */
  output << std::setw(42) << " " << std::setw(15) << "trainerId"
         << std::setw(15) << "avgGap" << std::setw(15) << "avgSecondGap"
         << std::setw(15) << "avgLastTwoGap" << std::setw(15) << "avgMidGap"
         << std::setw(10) << "rate" << std::setw(10) << "samples"
         << std::setw(10) << "totSamples" << std::endl;

  // Integer average in microseconds, then scaled by 0.001 for display.
  auto average = [](uint64_t total, uint64_t freq) {
    return (total / freq) * 0.001;
  };
  // Emits every numeric column that follows the row label.
  auto writeColumns = [&](const struct Abstract &entry) {
    output << std::setw(15) << average(entry.totDelta, entry.freq)
           << std::setw(15) << average(entry.totSecondDelta, entry.freq)
           << std::setw(15) << average(entry.totLastTwoDelta, entry.freq)
           << std::setw(15) << average(entry.totMidDelta, entry.freq)
           << std::setw(10) << static_cast<float>(entry.freq) / samples
           << std::setw(10) << static_cast<float>(entry.freq)
           << std::setw(10) << samples << std::endl;
  };

  // show totAbstract, it's valuable when lastTrainerId is evenly distributed
  if (!totAbstract_.freq) {
    return;
  }
  output << std::setw(42) << " " << std::setw(15) << "totAbstract";
  writeColumns(totAbstract_);

  // show lastTrainerId abstract: at most FLAGS_log_barrier_lowest_nodes rows,
  // stopping at the first trainer that was never last.
  int shown = 0;
  for (const auto &entry : ranked) {
    if (!entry.freq) {
      break;
    }
    if (shown >= FLAGS_log_barrier_lowest_nodes) {
      break;
    }
    ++shown;
    output << std::setw(42) << " " << std::setw(15) << entry.trainerId;
    writeColumns(entry);
  }
}
/**
 * Build a barrier-delta stat: allocate a TimeVectorDelta with one slot per
 * connection, clear all summaries and raw data, and log the threshold in use.
 */
BarrierDeltaStat::BarrierDeltaStat(uint16_t numConnThreads,
                                   const std::string &name)
    : BarrierStatBase(numConnThreads, name) {
  // The base constructor has already stored numConnThreads_.
  timeVector_.reset(new TimeVectorDelta(numConnThreads_));

  const bool clearRawData = true;
  reset(clearRawData);

  LOG(INFO) << " create barrierDeltaStat: " << name
            << " barrierDelta warning rate: " << rateThreshold_;
}
void BarrierDeltaStat::updateStat(uint64_t delta, int32_t trainerId) {
CHECK_LT(trainerId, numConnThreads_) << "trainerId is invalid in barrier";
std::lock_guard<std::mutex> guard(lock_);
timeVector_->addTimeval(delta, trainerId);
if (timeVector_->full()) {
std::lock_guard<std::mutex> abstractGuard(abstractLock_);
auto id = timeVector_->getMaxTrainerId();
auto delta = timeVector_->getDelta();
// discard first sample, since first sample probably is abnormal.
if (totSamples_) {
abstract_[id].freq++;
if (delta < abstract_[id].minDelta) {
abstract_[id].minDelta = delta;
}
if (delta > abstract_[id].maxDelta) {
abstract_[id].maxDelta = delta;
}
abstract_[id].totDelta += delta;
// update totAbstract_
totAbstract_.freq++;
if (delta < totAbstract_.minDelta) {
totAbstract_.minDelta = delta;
}
if (delta > totAbstract_.maxDelta) {
totAbstract_.maxDelta = delta;
}
totAbstract_.totDelta += delta;
}
totSamples_++;
timeVector_->reset();
}
}
void BarrierDeltaStat::reset(bool clearRawData) {
int32_t i = 0;
totSamples_ = 0;
std::lock_guard<std::mutex> guard(abstractLock_);
if (clearRawData) {
timeVector_->reset();
}
for (auto &abstract : abstract_) {
memset((void *)&abstract, 0, sizeof(abstract));
abstract.minDelta = UINT64_MAX;
abstract.trainerId = i++;
}
memset((void *)&totAbstract_, 0, sizeof(Abstract));
totAbstract_.minDelta = UINT64_MAX;
}
/**
 * Write the barrier-delta summary table to `output`.
 *
 * Nothing is written when there are at most two connections, when no sample
 * has been taken, or when the most frequent slowest trainer accounts for a
 * smaller share of samples than rateThreshold_.
 */
void BarrierDeltaStat::showAbstract(std::ostream &output) const {
  // do not support the case "<=2 pserver"
  if (numConnThreads_ <= 2 || !totSamples_) {
    return;
  }

  // Work on a copy ordered by how often each trainer was the slowest.
  std::vector<struct Abstract> ranked = abstract_;
  std::sort(ranked.begin(),
            ranked.end(),
            [](const struct Abstract &lhs, const struct Abstract &rhs) {
              return lhs.freq > rhs.freq;
            });

  const float samples = static_cast<float>(totSamples_);
  const auto topRate = static_cast<float>(ranked[0].freq) / samples;
  if (topRate < rateThreshold_) {
    return;
  }

  output << std::setw(20) << name_ << std::endl;

  /* Note:
   * Gap means the delta from all trainers' forwardbackward
   * avgGap: average Gap in log_period batches
   * minGap: min Gap in log_period batches
   * maxGap: max Gap in log_period batches
   * trainerId: the slowest trainer_id
   *
   * the stat is based on per trainer if trainer_id is set, totAbstract is
   * stat based on all trainers scope.
   */
  output << std::setw(42) << " " << std::setw(15) << "trainerId"
         << std::setw(15) << "avgGap" << std::setw(10) << "minGap"
         << std::setw(10) << "maxGap" << std::setw(10) << "rate"
         << std::setw(10) << "samples" << std::setw(10) << "totSamples"
         << std::endl;

  // Emits every numeric column that follows the row label. The average is
  // an integer division first, then scaled by 0.001 for display.
  auto writeColumns = [&](const struct Abstract &entry) {
    output << std::setw(15) << (entry.totDelta / entry.freq) * 0.001
           << std::setw(10) << entry.minDelta * 0.001
           << std::setw(10) << entry.maxDelta * 0.001
           << std::setw(10) << static_cast<float>(entry.freq) / samples
           << std::setw(10) << static_cast<float>(entry.freq)
           << std::setw(10) << samples << std::endl;
  };

  // show totAbstract, it's valuable when lastTrainerId is evenly distributed
  if (!totAbstract_.freq) {
    return;
  }
  output << std::setw(42) << " " << std::setw(15) << "totAbstract";
  writeColumns(totAbstract_);

  // show lastTrainerId abstract: at most FLAGS_log_barrier_lowest_nodes rows,
  // stopping at the first trainer that was never the slowest.
  int shown = 0;
  for (const auto &entry : ranked) {
    if (!entry.freq) {
      break;
    }
    if (shown >= FLAGS_log_barrier_lowest_nodes) {
      break;
    }
    ++shown;
    output << std::setw(42) << " " << std::setw(15) << entry.trainerId;
    writeColumns(entry);
  }
}
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stdint.h>
#include <sys/time.h>
#include <iostream>
#include <list>
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>
#include "Locks.h"
#include "Logging.h"
#include "ThreadLocal.h"
namespace paddle {
/**
 * Convert a timeval into a single count of microseconds.
 *
 * The seconds field is widened to 64 bits before scaling, so the product
 * cannot overflow on platforms where `long` (and therefore arithmetic with
 * an `LU` literal) is only 32 bits wide.
 *
 * @param time  value to convert
 * @return      time.tv_sec * 1000000 + time.tv_usec
 */
inline uint64_t timeToMicroSecond(struct timeval time) {
  return static_cast<uint64_t>(time.tv_sec) * 1000000ULL +
         static_cast<uint64_t>(time.tv_usec);
}
class TimeVectorEnd {
/*
* help class for gathering all barrier performance data
* which shows time point property.
* freqently used in barrier performance tuning API, such
* as tuning which is slowest node in sync-sgd mode training.
*/
public:
explicit TimeVectorEnd(uint16_t size) : size_(size) {
index_ = 0;
timeArray_.resize(size);
trainerIds_.resize(size);
}
~TimeVectorEnd() {}
uint16_t size() { return size_; }
bool full() { return index_ == size_; }
bool empty() { return index_ == 0; }
void reset() { index_ = 0; }
void addTimeval(struct timeval time, int32_t trainerId) {
timeArray_[index_] = time;
trainerIds_[index_] = trainerId;
index_++;
}
struct timeval getDelta() const {
struct timeval delta;
CHECK_GT(size_, 1) << "not support with 1 pserver";
timersub(&timeArray_[size_ - 1], &timeArray_[0], &delta);
return delta;
}
/* 2, n delta */
struct timeval get1NDelta() const {
CHECK_GT(size_, 2) << "not support with less than 2 pservers";
struct timeval delta;
timersub(&timeArray_[size_ - 1], &timeArray_[1], &delta);
return delta;
}
/* n-1, n delta */
struct timeval getMinus1NDelta() const {
CHECK_GT(size_, 2) << "not support with less than 2 pservers";
struct timeval delta;
timersub(&timeArray_[size_ - 1], &timeArray_[size_ - 2], &delta);
return delta;
}
/* n/2, n delta */
struct timeval getMidNDelta() const {
CHECK_GT(size_, 2) << "not support with less than 2 pservers";
struct timeval delta;
timersub(&timeArray_[size_ - 1], &timeArray_[size_ / 2], &delta);
return delta;
}
int32_t getLastTrainerId() const { return trainerIds_[index_ - 1]; }
private:
uint16_t size_;
uint16_t index_;
std::vector<struct timeval> timeArray_;
std::vector<int32_t> trainerIds_;
};
class TimeVectorDelta {
/*
* help class for gathering performance data which shows time
* delta property, such as tuning the time distribution of
* forwardBackward time from all cluster nodes.
*/
public:
explicit TimeVectorDelta(uint16_t size)
: size_(size), min_(UINT64_MAX), max_(0) {
index_ = 0;
timeArray_.resize(size);
}
~TimeVectorDelta() {}
uint16_t size() { return size_; }
bool full() { return index_ == size_; }
bool empty() { return index_ == 0; }
void reset() {
index_ = 0;
min_ = UINT64_MAX;
max_ = 0;
}
void addTimeval(uint64_t delta, int32_t trainerId) {
timeArray_[index_] = delta;
index_++;
if (delta < min_) {
min_ = delta;
}
if (delta > max_) {
max_ = delta;
maxTrainerId_ = trainerId;
}
}
uint64_t getDelta() const {
CHECK_GT(size_, 1) << "not support with 1 pserver";
return max_ - min_;
}
/* 2, n delta */
uint64_t get1NDelta() const {
CHECK_GT(size_, 2) << "not support with less than 2 pservers";
LOG(FATAL) << "Not implemented";
}
/* n-1, n delta */
uint64_t getMinus1NDelta() const {
CHECK_GT(size_, 2) << "not support with less than 2 pservers";
LOG(FATAL) << "Not implemented";
}
/* n/2, n delta */
uint64_t getMidNDelta() const {
CHECK_GT(size_, 2) << "not support with less than 2 pservers";
LOG(FATAL) << "Not implemented";
}
int32_t getMaxTrainerId() const { return maxTrainerId_; }
private:
uint16_t size_;
uint16_t index_;
std::vector<uint64_t> timeArray_;
private:
uint64_t min_;
uint64_t max_;
int32_t maxTrainerId_;
};
// Aggregated barrier statistics over all samples; per the original note the
// values are in microseconds (us).
// Plain data holder: it declares no constructor, so callers are responsible
// for zeroing it before accumulating.
struct Abstract {
// last trainerId for barrier end, maxDelta trainerId for barrier delta
int32_t trainerId;
// smallest and largest delta seen
uint64_t minDelta;
uint64_t maxDelta;
// running total of the deltas
uint64_t totDelta;
// first one is probably itself, so discard it.
uint64_t totSecondDelta;
// to confirm whether the last node ruins barrier performance.
uint64_t totLastTwoDelta;
// n/2-n delta
uint64_t totMidDelta;
// NOTE(review): presumably the number of samples folded in - confirm against
// the .cpp that updates it.
uint64_t freq;
};
// barrier performance tuning stats
// Abstract base of the barrier statistics classes. It owns the shared
// bookkeeping (name, locks, per-trainer and total Abstract records) and
// declares the two updateStat() overloads that concrete stats implement.
class BarrierStatBase {
public:
// Constructor is defined out of line (not visible in this header).
BarrierStatBase(uint16_t numConnThreads, const std::string &name);
virtual ~BarrierStatBase() {}
// if called at pserver end, then trainId means trainer's id.
// by default trainer does not use trainerId, so set it to -1
// Overload taking an absolute timestamp.
virtual void updateStat(struct timeval &cur, int32_t trainerId = -1) = 0;
// Overload taking an already computed time delta.
virtual void updateStat(uint64_t delta, int32_t trainerId = -1) = 0;
// Name given at construction.
const std::string &getName() { return name_; }
// Default implementation does nothing; subclasses override it.
virtual void reset(bool clearRawData = true) {}
// since the timeVector_ is not stateful, it is not clear whether the
// barrier delta is correct. if one timestamp was lost, all the data
// from the barrier stat becomes rubbish. -_-
// The base version only logs and reports failure; subclasses are expected
// to override it.
virtual bool checkPassBarrier() {
LOG(INFO) << "bug implementation found";
return false;
}
protected:
// Hook used when streaming the stat; the base version prints nothing.
virtual void showAbstract(std::ostream &output) const {}
friend std::ostream &operator<<(std::ostream &output,
const BarrierStatBase &stat);
protected:
// mutable so that const member functions can take the lock as well.
mutable std::mutex lock_;
std::mutex abstractLock_; // see note on updaterStat
// each frequency for each barrier trainer
std::vector<struct Abstract> abstract_;
// it is valuable when doing perf-tuning, if lastTrainerId acts as a uniform
// distribution
struct Abstract totAbstract_;
uint64_t totSamples_;
protected:
uint16_t numConnThreads_; // total updates needed
// NOTE(review): not used anywhere in this header - confirm its meaning in
// the implementation file.
float rateThreshold_;
std::string name_;
};
// the end-time of arriving real/forged barrier position
class BarrierEndStat : public BarrierStatBase {
public:
BarrierEndStat(uint16_t numConnThreads, const std::string &name);
~BarrierEndStat() {}
virtual void updateStat(struct timeval &cur, int32_t trainerId = -1);
virtual void updateStat(uint64_t delta, int32_t trainerId = -1) {
LOG(INFO) << "have no delta updateStat in BarrierEndStat";
}
virtual void reset(bool clearRawData = true);
virtual bool checkPassBarrier() { return timeVector_->empty(); }
protected:
/*
* LOG:
* readAllBlocks_denseUpdater
* trainerId avgGap avgSecondGap avgLastTwoGap avgMidGap rate
* 44 86.702 81.022 9.984 50.472 0.144737
* 46 87.723 82.939 8.737 50.019 0.118421
* 35 100.923 96.752 14.305 61.979
* 0.0657895
* log_barrier_abstract, log_barrier_lowest_nodes, log_barrier_threshold
* control details.
*/
virtual void showAbstract(std::ostream &output) const;
private:
std::unique_ptr<TimeVectorEnd> timeVector_;
};
// the delta-time from different trainers,
// eg, find the degree of imbalance of BP time at pserver end
// the entry value in timerVector_ is BP delta, do evaluation to BP delta.
class BarrierDeltaStat : public BarrierStatBase {
public:
BarrierDeltaStat(uint16_t numConnThreads, const std::string &name);
~BarrierDeltaStat() {}
virtual void updateStat(uint64_t delta, int32_t trainerId = -1);
virtual void updateStat(struct timeval &cur, int32_t trainerId = -1) {
LOG(INFO) << "have no timeval updateStat in BarrierDeltaStat";
}
virtual void reset(bool clearRawData = true);
virtual bool checkPassBarrier() { return timeVector_->empty(); }
protected:
virtual void showAbstract(std::ostream &outPut) const;
private:
// store delta time in uint64_t, eg BP time of all trainers
std::unique_ptr<TimeVectorDelta> timeVector_;
};
// to distinguish different contexts for the same parallel threads, and
// different threads running the same code segment, just use tagName to tag
// the run-time position.
// in Sparse, sendParallel threads can not only run in the push&pull stage
// with the same thread group, but also run in the pull&push stage with a
// different thread group; the tag is used to distinguish the different
// run-time barrier positions.
// trainerId in REGISTER_BARRIER_TIMER_SERVER is used to retrieve the lowest
// trainer nodes.
// end barrier
// Looks up (or creates) the BARRIER_END stat named statName + <tag> in `set`
// and feeds it the current gettimeofday() timestamp for `trainerId`.
// Nothing happens unless numConnThreads > 2.
// The optional trailing argument is the tag string appended to statName.
// numConnThreads is parenthesised in the comparison so that an expression
// argument (e.g. `a ? b : c`) is compared as a whole.
// NOTE: the expansion ends with `while (0);` (with the semicolon), which is
// kept for compatibility with call sites written without a trailing `;`.
// Avoid using this macro as the unbraced body of an if/else.
#define __REGISTER_BARRIER_TIMER_SERVER(                            \
    set, statName, numConnThreads, trainerId, ...)                  \
  do {                                                              \
    if ((numConnThreads) > 2) {                                     \
      std::string internalName =                                    \
          std::string(statName) + std::string(__VA_ARGS__);         \
      BarrierStatPtr __stat =                                       \
          (set).getStat(numConnThreads, internalName, BARRIER_END); \
      struct timeval cur;                                           \
      gettimeofday(&cur, nullptr);                                  \
      __stat->updateStat(cur, trainerId);                           \
    }                                                               \
  } while (0);
// end barrier with user-defined timer
// Same as __REGISTER_BARRIER_TIMER_SERVER, except that the timestamp is the
// caller-supplied `cur` instead of a fresh gettimeofday() reading.
// Nothing happens unless numConnThreads > 2.
// numConnThreads is parenthesised in the comparison so that an expression
// argument is compared as a whole.
// NOTE: the expansion ends with `while (0);` (with the semicolon), which is
// kept for compatibility with call sites written without a trailing `;`.
#define __REGISTER_BARRIER_TIMER_SERVER_SET(                        \
    set, statName, numConnThreads, trainerId, cur, ...)             \
  do {                                                              \
    if ((numConnThreads) > 2) {                                     \
      std::string internalName =                                    \
          std::string(statName) + std::string(__VA_ARGS__);         \
      BarrierStatPtr __stat =                                       \
          (set).getStat(numConnThreads, internalName, BARRIER_END); \
      __stat->updateStat(cur, trainerId);                           \
    }                                                               \
  } while (0);
// delta barrier
// Looks up (or creates) the BARRIER_DELTA stat named statName + <tag> in
// `set` and feeds it the caller-supplied `delta` for `trainerId`.
// Nothing happens unless numConnThreads > 2.
// numConnThreads is parenthesised in the comparison so that an expression
// argument is compared as a whole.
// NOTE: the expansion ends with `while (0);` (with the semicolon), which is
// kept for compatibility with call sites written without a trailing `;`.
#define __REGISTER_BARRIER_DELTA_SERVER_SET(                          \
    set, statName, numConnThreads, trainerId, delta, ...)             \
  do {                                                                \
    if ((numConnThreads) > 2) {                                       \
      std::string internalName =                                      \
          std::string(statName) + std::string(__VA_ARGS__);           \
      BarrierStatPtr __stat =                                         \
          (set).getStat(numConnThreads, internalName, BARRIER_DELTA); \
      __stat->updateStat(delta, trainerId);                           \
    }                                                                 \
  } while (0);
// check end barrier
// Looks up (or creates) the BARRIER_END stat named statName + <tag> in `set`
// and aborts via PCHECK, printing "<name>: invalid barrier data", when its
// checkPassBarrier() returns false.
// Unlike the REGISTER macros there is no numConnThreads > 2 guard here.
// NOTE: the expansion ends with `while (0);` (with the semicolon), so avoid
// using the macro as the unbraced body of an if/else.
#define __CHECK_BARRIER_TIMER(set, statName, numConnThreads, ...) \
do { \
std::string internalName = \
std::string(statName) + std::string(__VA_ARGS__); \
BarrierStatPtr __stat = \
(set).getStat(numConnThreads, internalName, BARRIER_END); \
PCHECK(__stat->checkPassBarrier()) << internalName \
<< ": invalid barrier data"; \
} while (0);
/*
 * Note:
 * with the sync-sgd algorithm in cluster mode, lots of synchronization
 * actions exist at the pserver end. these synchronization actions have an
 * impact on the efficiency of parameter exchange. the synchronization
 * (barrier) GAP is composed of lots of factors, such as the forwardBackward
 * variance and network fluctuation. we try to have a quantitative analysis of
 * these factors, so we design lots of barrier timers to capture this
 * performance. these barriers can also be placed at implicit barrier
 * positions.
 *
 * example:
 * in the sync-sgd algorithm, each parameter server waits for all gradients
 * from all trainers, thus an explicit barrier point exists before doing
 * optimization. the barrier timer located before that point can sense the
 * barrier condition.
 *
 */
// try to capture which trainer is the slowest node in sync-sgd at pserver.
// Thin alias: forwards every argument unchanged to
// __REGISTER_BARRIER_TIMER_SERVER. It is defined before the
// PADDLE_DISABLE_TIMER conditional, so that switch does not compile it out.
#define REGISTER_SLOW_NODES_PROBE( \
set, statName, numConnThreads, trainerId, ...) \
__REGISTER_BARRIER_TIMER_SERVER( \
(set), statName, numConnThreads, trainerId, __VA_ARGS__)
// try to check if all threads or trainers have passed barriers for data
// accuracy.
// Thin alias for __CHECK_BARRIER_TIMER; also defined regardless of
// PADDLE_DISABLE_TIMER.
#define CHECK_BARRIER_TIMER(set, statName, numConnThreads, ...) \
__CHECK_BARRIER_TIMER((set), statName, numConnThreads, __VA_ARGS__)
#ifdef PADDLE_DISABLE_TIMER
#define REGISTER_BARRIER_TIMER_SERVER( \
set, statName, numConnThreads, trainerId, ...)
#define REGISTER_BARRIER_TIMER_SERVER_SET( \
set, statName, numConnThreads, trainerId, cur, ...)
#define REGISTER_BARRIER_DELTA_SERVER_SET( \
set, statName, numConnThreads, trainerId, cur, ...)
#else
/*
* sensing barrier time distribution for all parallelization threads.
* it provides low API for slow node check(REGISTER_SLOW_NODES_PROBE)
*/
#define REGISTER_BARRIER_TIMER_SERVER( \
set, statName, numConnThreads, trainerId, ...) \
__REGISTER_BARRIER_TIMER_SERVER( \
(set), statName, numConnThreads, trainerId, __VA_ARGS__)
/*
* sensing barrier time distribution for all parallelization threads.
* but time point for barrier performance is set by user.
* eg, with this api, you can get implict barrier point such as the beginning
* time distribution
* for receiving data.
*/
#define REGISTER_BARRIER_TIMER_SERVER_SET( \
set, statName, numConnThreads, trainerId, cur, ...) \
__REGISTER_BARRIER_TIMER_SERVER_SET( \
(set), statName, numConnThreads, trainerId, cur, __VA_ARGS__)
// try to capture time delta from all trainers, such as forwardBackward time
// which implies
// computation fluctuation
#define REGISTER_BARRIER_DELTA_SERVER_SET( \
set, statName, numConnThreads, trainerId, delta, ...) \
__REGISTER_BARRIER_DELTA_SERVER_SET( \
(set), statName, numConnThreads, trainerId, delta, __VA_ARGS__)
#endif // DISABLE_TIMER
} // namespace paddle
......@@ -97,34 +97,6 @@ std::ostream& operator<<(std::ostream& outPut, const Stat& stat) {
return outPut;
}
/**
 * Return the barrier stat registered under `name`, creating it on first use.
 *
 * The map is first probed under the read lock; on a miss the exclusive lock
 * is taken and the map is probed again before inserting, because another
 * thread may have inserted the entry between the two lock acquisitions.
 *
 * @param numConnThreads  passed to the constructor of a newly created stat.
 * @param name            key of the stat inside this set.
 * @param bType           BARRIER_END or BARRIER_DELTA; selects the concrete
 *                        class of a newly created stat. It is ignored when an
 *                        entry with this name already exists.
 * @return shared pointer to the existing or newly created stat.
 */
BarrierStatPtr StatSet::getStat(uint16_t numConnThreads,
                                const std::string& name,
                                BarrierStatType bType) {
  {
    ReadLockGuard guard(lock_);
    auto it = barrierStatSet_.find(name);
    if (it != barrierStatSet_.end()) {
      return it->second;
    }
  }

  std::lock_guard<RWLock> guard(lock_);
  // test again with lock_guard
  auto it = barrierStatSet_.find(name);
  if (it != barrierStatSet_.end()) {
    return it->second;
  }

  BarrierStatPtr stat;
  if (bType == BARRIER_END) {
    stat = std::make_shared<BarrierEndStat>(numConnThreads, name);
  } else if (bType == BARRIER_DELTA) {
    stat = std::make_shared<BarrierDeltaStat>(numConnThreads, name);
  } else {
    // Previously an unknown type silently registered a null pointer, which
    // callers then dereferenced; fail at the source instead.
    LOG(FATAL) << "unknown barrier stat type " << static_cast<int>(bType)
               << " for " << name;
  }
  auto ret = barrierStatSet_.insert(std::make_pair(name, stat));
  return ret.first->second;
}
void StatSet::printSegTimerStatus() {
ReadLockGuard guard(lock_);
LOG(INFO) << std::setiosflags(std::ios::left) << std::setfill(' ')
......@@ -135,46 +107,20 @@ void StatSet::printSegTimerStatus() {
}
}
// Log a banner followed by every registered barrier stat, one LOG(INFO) line
// per stat. Produces no output when no barrier stat is registered.
void StatSet::printBarrierTimerStatus() {
  ReadLockGuard guard(lock_);
  if (!barrierStatSet_.empty()) {
    // control barrierAbstact in runtime, so enable compliation
    LOG(INFO) << std::setiosflags(std::ios::left) << std::setfill(' ')
              << "======= BarrierStatSet status ======" << std::endl;
    for (auto entry = barrierStatSet_.begin(); entry != barrierStatSet_.end();
         ++entry) {
      LOG(INFO) << std::setiosflags(std::ios::left) << std::setfill(' ')
                << *(entry->second);
    }
  }
}
// Print the segment timers (unless compiled out by PADDLE_DISABLE_TIMER),
// then the barrier timers, then a closing separator line.
void StatSet::printAllStatus() {
#if !defined(PADDLE_DISABLE_TIMER)
  printSegTimerStatus();
#endif
  printBarrierTimerStatus();

  const char* const separator =
      "--------------------------------------------------";
  LOG(INFO) << std::setiosflags(std::ios::left) << separator << std::endl;
}
// Log the single stat registered under `name`.
// Aborts through CHECK when no stat with that name exists in this set.
void StatSet::printStatus(const std::string& name) {
// Only reads the map, so the shared (read) lock is sufficient.
ReadLockGuard guard(lock_);
auto iter = statSet_.find(name);
CHECK(iter != statSet_.end()) << name << " is not registed in " << name_;
LOG(INFO) << *(iter->second);
}
// Reset every registered stat. Segment stats are reset unconditionally;
// clearRawData is forwarded only to the barrier stats.
void StatSet::reset(bool clearRawData) {
  ReadLockGuard guard(lock_);

  for (auto entry = statSet_.begin(); entry != statSet_.end(); ++entry) {
    entry->second->reset();
  }

  // reset barrierStat
  for (auto entry = barrierStatSet_.begin(); entry != barrierStatSet_.end();
       ++entry) {
    entry->second->reset(clearRawData);
  }
}
void StatSet::setThreadInfo(const std::string& name, bool flag) {
......@@ -184,13 +130,6 @@ void StatSet::setThreadInfo(const std::string& name, bool flag) {
iter->second->setThreadInfo(flag);
}
// Remove the stat registered under `name` from this set.
// Aborts through CHECK when no stat with that name exists.
void StatSet::deleteStat(const std::string& name) {
// Erasing modifies the map, hence lock_guard rather than ReadLockGuard.
std::lock_guard<RWLock> guard(lock_);
auto iter = statSet_.find(name);
CHECK(iter != statSet_.end()) << name << " is not registed in " << name_;
statSet_.erase(iter);
}
StatInfo::~StatInfo() {
if (stat_) {
std::lock_guard<std::mutex> guard(stat_->lock_);
......
......@@ -23,7 +23,6 @@ limitations under the License. */
#include <string>
#include <unordered_map>
#include "BarrierStat.h"
#include "Locks.h"
#include "Logging.h"
#include "ThreadLocal.h"
......@@ -60,12 +59,6 @@ public:
class Stat;
typedef std::shared_ptr<Stat> StatPtr;
typedef std::shared_ptr<BarrierStatBase> BarrierStatPtr;
// Selects which concrete barrier stat StatSet::getStat() creates.
enum BarrierStatType {
// end-time based stat (BarrierEndStat)
BARRIER_END = 0,
// delta-time based stat (BarrierDeltaStat)
BARRIER_DELTA = 1,
};
class StatSet {
public:
......@@ -74,11 +67,8 @@ public:
// print to LOG(INFO)
void printSegTimerStatus();
void printBarrierTimerStatus();
void printAllStatus();
void printStatus(const std::string& name);
StatPtr getStat(const std::string& name) {
{
ReadLockGuard guard(lock_);
......@@ -93,12 +83,6 @@ public:
return ret.first->second;
}
BarrierStatPtr getStat(uint16_t numConnThreads,
const std::string& name,
BarrierStatType bType);
void deleteStat(const std::string& name);
// true for showing stats for each thread
// false for showing stats aggragated over threads
void setThreadInfo(const std::string& name, bool flag);
......@@ -120,7 +104,6 @@ public:
private:
std::unordered_map<std::string, StatPtr> statSet_;
std::unordered_map<std::string, BarrierStatPtr> barrierStatSet_;
const std::string name_;
RWLock lock_;
};
......
......@@ -51,7 +51,7 @@ template <class T>
class ThreadLocal {
public:
ThreadLocal() {
PCHECK(pthread_key_create(&threadSpecificKey_, dataDestructor) == 0);
CHECK(pthread_key_create(&threadSpecificKey_, dataDestructor) == 0);
}
~ThreadLocal() { pthread_key_delete(threadSpecificKey_); }
......@@ -65,7 +65,7 @@ public:
if (!p && createLocal) {
p = new T();
int ret = pthread_setspecific(threadSpecificKey_, p);
PCHECK(ret == 0);
CHECK(ret == 0);
}
return p;
}
......@@ -79,7 +79,7 @@ public:
if (T* q = get(false)) {
dataDestructor(q);
}
PCHECK(pthread_setspecific(threadSpecificKey_, p) == 0);
CHECK(pthread_setspecific(threadSpecificKey_, p) == 0);
}
/**
......@@ -112,7 +112,7 @@ private:
template <class T>
class ThreadLocalD {
public:
ThreadLocalD() { PCHECK(pthread_key_create(&threadSpecificKey_, NULL) == 0); }
ThreadLocalD() { CHECK(pthread_key_create(&threadSpecificKey_, NULL) == 0); }
~ThreadLocalD() {
pthread_key_delete(threadSpecificKey_);
for (auto t : threadMap_) {
......@@ -127,7 +127,7 @@ public:
T* p = (T*)pthread_getspecific(threadSpecificKey_);
if (!p) {
p = new T();
PCHECK(pthread_setspecific(threadSpecificKey_, p) == 0);
CHECK(pthread_setspecific(threadSpecificKey_, p) == 0);
updateMap(p);
}
return p;
......@@ -141,7 +141,7 @@ public:
if (T* q = (T*)pthread_getspecific(threadSpecificKey_)) {
dataDestructor(q);
}
PCHECK(pthread_setspecific(threadSpecificKey_, p) == 0);
CHECK(pthread_setspecific(threadSpecificKey_, p) == 0);
updateMap(p);
}
......
......@@ -266,6 +266,29 @@ message PadConfig {
repeated uint32 pad_w = 4;
}
// Configuration of the multibox_loss layer; it is attached to a layer input
// through LayerInputConfig.multibox_loss_conf.
message MultiBoxLossConfig {
// Number of classes of the classification.
required uint32 num_classes = 1;
// Threshold of the overlap.
required float overlap_threshold = 2;
// Ratio of the negative bboxes to the positive bboxes.
required float neg_pos_ratio = 3;
// Overlap threshold of the negative bboxes.
required float neg_overlap = 4;
// Index of the background class.
required uint32 background_id = 5;
// Number of location inputs (the config helper passes len(input_loc)).
required uint32 input_num = 6;
// NOTE(review): not set by the config parser, so both keep the default 1 -
// confirm their meaning against the layer implementation.
optional uint32 height = 7 [default = 1];
optional uint32 width = 8 [default = 1];
}
// Configuration of the detection_output layer; it is attached to a layer
// input through LayerInputConfig.detection_output_conf.
message DetectionOutputConfig {
// Number of classes of the classification.
required uint32 num_classes = 1;
// Non-maximum suppression (NMS) threshold.
required float nms_threshold = 2;
// Number of bboxes kept in the NMS output.
required uint32 nms_top_k = 3;
// Index of the background class.
required uint32 background_id = 4;
// Number of location inputs (the config helper passes len(input_loc)).
required uint32 input_num = 5;
// Number of bboxes kept in the layer output.
required uint32 keep_top_k = 6;
// Classification confidence threshold.
required float confidence_threshold = 7;
// NOTE(review): not set by the config parser, so both keep the default 1 -
// confirm their meaning against the layer implementation.
optional uint32 height = 8 [default = 1];
optional uint32 width = 9 [default = 1];
}
message LayerInputConfig {
required string input_layer_name = 1;
optional string input_parameter_name = 2;
......@@ -284,6 +307,8 @@ message LayerInputConfig {
optional PriorBoxConfig priorbox_conf = 13;
optional PadConfig pad_conf = 14;
optional RowConvConfig row_conv_conf = 15;
optional MultiBoxLossConfig multibox_loss_conf = 16;
optional DetectionOutputConfig detection_output_conf = 17;
}
message LayerConfig {
......
......@@ -78,11 +78,15 @@ enum DataType {
repeated bytes content = 2;
}
// Serialized state of the learning rate policy, shared by the optimizer
// state messages through their lr_state field.
message LrPolicyState {
// learning rate policy
optional double learning_rate = 1 [default = 1.0];
// NOTE(review): presumably the two coefficients of the decay schedule -
// confirm against the optimizer code that reads them.
optional double lr_decay_a = 2;
optional double lr_decay_b = 3;
}
message SGDOptimizerState {
// learning rate policy
optional double learning_rate = 101;
optional double lr_decay_a = 102;
optional double lr_decay_b = 103;
optional LrPolicyState lr_state = 101;
optional double num_sample_passed = 104;
// state
optional TensorProto parameter = 1;
......@@ -91,9 +95,7 @@ message SGDOptimizerState {
message AdadeltaOptimizerState {
// learning rate policy
optional double learning_rate = 101;
optional double lr_decay_a = 102;
optional double lr_decay_b = 103;
optional LrPolicyState lr_state = 101;
optional double num_sample_passed = 104;
// state
optional TensorProto parameter = 1;
......@@ -102,11 +104,9 @@ message AdadeltaOptimizerState {
optional TensorProto update_delta = 4;
}
message AdagradOptimizerState {
// learning rate policy
optional double learning_rate = 101;
optional double lr_decay_a = 102;
optional double lr_decay_b = 103;
optional LrPolicyState lr_state = 101;
optional double num_sample_passed = 104;
// state
optional TensorProto parameter = 1;
......@@ -114,10 +114,7 @@ message AdagradOptimizerState {
}
message AdamOptimizerState {
// learning rate policy
optional double learning_rate = 101;
optional double lr_decay_a = 102;
optional double lr_decay_b = 103;
optional LrPolicyState lr_state = 101;
optional double num_sample_passed = 104;
// state
optional TensorProto parameter = 1;
......
......@@ -29,7 +29,7 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
add_custom_command(OUTPUT ${OUTPUT_DIR}/.timestamp
COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
COMMAND ${CMAKE_COMMAND} -E touch ${OUTPUT_DIR}/.timestamp
DEPENDS gen_proto_py ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER})
DEPENDS gen_proto_py framework_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER})
add_custom_target(paddle_python ALL DEPENDS
${OUTPUT_DIR}/.timestamp)
......@@ -43,6 +43,7 @@ if (WITH_TESTING)
add_subdirectory(paddle/v2/tests)
add_subdirectory(paddle/v2/reader/tests)
add_subdirectory(paddle/v2/plot/tests)
add_subdirectory(paddle/v2/framework/tests)
endif()
endif()
install(DIRECTORY ${PADDLE_PYTHON_PACKAGE_DIR}
......
......@@ -1353,7 +1353,8 @@ class LayerBase(object):
device=None,
active_type="",
drop_rate=0.,
coeff=None):
coeff=None,
error_clipping_threshold=None):
config_assert('@' not in name,
"layer name: %s contain special character @" % name)
global g_current_submodel
......@@ -1387,6 +1388,9 @@ class LayerBase(object):
elif g_default_device is not None:
self.config.device = g_default_device
if error_clipping_threshold is not None:
self.config.error_clipping_threshold = error_clipping_threshold
for input_index in xrange(len(self.inputs)):
input = self.inputs[input_index]
input_config = None
......@@ -1674,6 +1678,52 @@ class PriorBoxLayer(LayerBase):
self.config.size = size
@config_layer('multibox_loss')
class MultiBoxLossLayer(LayerBase):
    """Config-parser entry for layers of type 'multibox_loss'.

    Expects ``input_num * 2 + 2`` inputs and stores the loss hyper-parameters
    in the ``multibox_loss_conf`` of the first input. The layer size is
    always 1.
    """

    def __init__(self, name, inputs, input_num, num_classes, overlap_threshold,
                 neg_pos_ratio, neg_overlap, background_id, **xargs):
        super(MultiBoxLossLayer, self).__init__(
            name, 'multibox_loss', 0, inputs)

        expected_inputs = input_num * 2 + 2
        config_assert(
            len(inputs) == expected_inputs,
            'MultiBoxLossLayer does not have enough inputs')
        config_assert(num_classes > background_id,
                      'Classes number must greater than background ID')

        # All hyper-parameters live on the first input's sub-config.
        loss_conf = self.config.inputs[0].multibox_loss_conf
        loss_conf.num_classes = num_classes
        loss_conf.overlap_threshold = overlap_threshold
        loss_conf.neg_pos_ratio = neg_pos_ratio
        loss_conf.neg_overlap = neg_overlap
        loss_conf.background_id = background_id
        loss_conf.input_num = input_num

        self.config.size = 1
@config_layer('detection_output')
class DetectionOutputLayer(LayerBase):
    """Config-parser entry for layers of type 'detection_output'.

    Expects ``input_num * 2 + 1`` inputs and stores the NMS / filtering
    hyper-parameters in the ``detection_output_conf`` of the first input.
    The layer size is the caller-supplied ``size``.
    """

    def __init__(self, name, inputs, size, input_num, num_classes,
                 nms_threshold, nms_top_k, keep_top_k, confidence_threshold,
                 background_id, **xargs):
        super(DetectionOutputLayer, self).__init__(
            name, 'detection_output', 0, inputs)

        expected_inputs = input_num * 2 + 1
        config_assert(
            len(inputs) == expected_inputs,
            'DetectionOutputLayer does not have enough inputs')
        config_assert(num_classes > background_id,
                      'Classes number must greater than background ID')

        # All hyper-parameters live on the first input's sub-config.
        det_conf = self.config.inputs[0].detection_output_conf
        det_conf.num_classes = num_classes
        det_conf.nms_threshold = nms_threshold
        det_conf.nms_top_k = nms_top_k
        det_conf.keep_top_k = keep_top_k
        det_conf.confidence_threshold = confidence_threshold
        det_conf.background_id = background_id
        det_conf.input_num = input_num

        self.config.size = size
@config_layer('data')
class DataLayer(LayerBase):
def __init__(self, name, size, height=None, width=None, device=None):
......@@ -2420,10 +2470,14 @@ class MaxLayer(LayerBase):
trans_type='non-seq',
bias=False,
output_max_index=None,
stride=-1,
**xargs):
super(MaxLayer, self).__init__(name, 'max', 0, inputs=inputs, **xargs)
config_assert(len(self.inputs) == 1, 'MaxLayer must have 1 input')
if trans_type == 'seq':
config_assert(stride == -1, 'subseq does not support stride window')
self.config.trans_type = trans_type
self.config.seq_pool_stride = stride
for input_index in xrange(len(self.inputs)):
input_layer = self.get_input_layer(input_index)
self.set_layer_size(input_layer.size)
......@@ -2685,11 +2739,15 @@ class AverageLayer(LayerBase):
average_strategy='average',
trans_type='non-seq',
bias=False,
stride=-1,
**xargs):
super(AverageLayer, self).__init__(
name, 'average', 0, inputs=inputs, **xargs)
self.config.average_strategy = average_strategy
if trans_type == 'seq':
config_assert(stride == -1, 'subseq does not support stride window')
self.config.trans_type = trans_type
self.config.seq_pool_stride = stride
config_assert(len(inputs) == 1, 'AverageLayer must have 1 input')
for input_index in xrange(len(self.inputs)):
input_layer = self.get_input_layer(input_index)
......@@ -2728,13 +2786,7 @@ class TensorLayer(LayerBase):
@config_layer('mixed')
class MixedLayer(LayerBase):
def __init__(self,
name,
inputs,
size=0,
bias=True,
error_clipping_threshold=None,
**xargs):
def __init__(self, name, inputs, size=0, bias=True, **xargs):
config_assert(inputs, 'inputs cannot be empty')
super(MixedLayer, self).__init__(
name, 'mixed', size, inputs=inputs, **xargs)
......@@ -2816,9 +2868,6 @@ class MixedLayer(LayerBase):
self.config.bias_size = psize
self.create_bias_parameter(bias, psize)
if error_clipping_threshold is not None:
self.config.error_clipping_threshold = error_clipping_threshold
# like MixedLayer, but no bias parameter
@config_func
......
......@@ -115,6 +115,8 @@ __all__ = [
'print_layer',
'priorbox_layer',
'cross_channel_norm_layer',
'multibox_loss_layer',
'detection_output_layer',
'spp_layer',
'pad_layer',
'eos_layer',
......@@ -195,6 +197,8 @@ class LayerType(object):
PRINT_LAYER = 'print'
PRIORBOX_LAYER = 'priorbox'
MULTIBOX_LOSS_LAYER = 'multibox_loss'
DETECTION_OUTPUT_LAYER = 'detection_output'
CTC_LAYER = 'ctc'
WARP_CTC_LAYER = 'warp_ctc'
......@@ -1041,6 +1045,158 @@ def priorbox_layer(input,
size=size)
@wrap_name_default("multibox_loss")
def multibox_loss_layer(input_loc,
                        input_conf,
                        priorbox,
                        label,
                        num_classes,
                        overlap_threshold=0.5,
                        neg_pos_ratio=3.0,
                        neg_overlap=0.5,
                        background_id=0,
                        name=None):
    """
    Compute the location loss and the confidence loss for ssd.

    The layer inputs are ordered as: priorbox, label, every location input,
    then every confidence input.

    :param name: The Layer Name.
    :type name: basestring
    :param input_loc: The input predict locations.
    :type input_loc: LayerOutput | List of LayerOutput
    :param input_conf: The input priorbox confidence.
    :type input_conf: LayerOutput | List of LayerOutput
    :param priorbox: The input priorbox location and the variance.
    :type priorbox: LayerOutput
    :param label: The input label.
    :type label: LayerOutput
    :param num_classes: The number of the classification.
    :type num_classes: int
    :param overlap_threshold: The threshold of the overlap.
    :type overlap_threshold: float
    :param neg_pos_ratio: The ratio of the negative bbox to the positive bbox.
    :type neg_pos_ratio: float
    :param neg_overlap: The negative bbox overlap threshold.
    :type neg_overlap: float
    :param background_id: The background class index.
    :type background_id: int
    :return: LayerOutput
    """
    # A single LayerOutput is accepted as shorthand for a one-element list.
    if isinstance(input_loc, LayerOutput):
        input_loc = [input_loc]
    assert isinstance(input_loc, collections.Sequence)  # list or tuple
    assert all(isinstance(loc, LayerOutput) for loc in input_loc)

    if isinstance(input_conf, LayerOutput):
        input_conf = [input_conf]
    assert isinstance(input_conf, collections.Sequence)  # list or tuple
    assert all(isinstance(conf, LayerOutput) for conf in input_conf)

    # Check the input layer number.
    loc_count = len(input_loc)
    assert loc_count == len(input_conf)

    parents = [priorbox, label] + list(input_loc) + list(input_conf)
    input_names = [parent.name for parent in parents]

    Layer(
        name=name,
        type=LayerType.MULTIBOX_LOSS_LAYER,
        inputs=input_names,
        input_num=loc_count,
        num_classes=num_classes,
        overlap_threshold=overlap_threshold,
        neg_pos_ratio=neg_pos_ratio,
        neg_overlap=neg_overlap,
        background_id=background_id)
    return LayerOutput(
        name, LayerType.MULTIBOX_LOSS_LAYER, parents=parents, size=1)
@wrap_name_default("detection_output")
def detection_output_layer(input_loc,
                           input_conf,
                           priorbox,
                           num_classes,
                           nms_threshold=0.45,
                           nms_top_k=400,
                           keep_top_k=200,
                           confidence_threshold=0.01,
                           background_id=0,
                           name=None):
    """
    Apply the NMS to the output of network and compute the predict bounding
    box location.

    The layer inputs are ordered as: priorbox, every location input, then
    every confidence input. The output size is ``keep_top_k * 7``.

    :param name: The Layer Name.
    :type name: basestring
    :param input_loc: The input predict locations.
    :type input_loc: LayerOutput | List of LayerOutput.
    :param input_conf: The input priorbox confidence.
    :type input_conf: LayerOutput | List of LayerOutput.
    :param priorbox: The input priorbox location and the variance.
    :type priorbox: LayerOutput
    :param num_classes: The number of the classification.
    :type num_classes: int
    :param nms_threshold: The Non-maximum suppression threshold.
    :type nms_threshold: float
    :param nms_top_k: The bbox number kept of the NMS's output
    :type nms_top_k: int
    :param keep_top_k: The bbox number kept of the layer's output
    :type keep_top_k: int
    :param confidence_threshold: The classification confidence threshold
    :type confidence_threshold: float
    :param background_id: The background class index.
    :type background_id: int
    :return: LayerOutput
    """
    # A single LayerOutput is accepted as shorthand for a one-element list.
    if isinstance(input_loc, LayerOutput):
        input_loc = [input_loc]
    assert isinstance(input_loc, collections.Sequence)  # list or tuple
    assert all(isinstance(loc, LayerOutput) for loc in input_loc)

    if isinstance(input_conf, LayerOutput):
        input_conf = [input_conf]
    assert isinstance(input_conf, collections.Sequence)  # list or tuple
    assert all(isinstance(conf, LayerOutput) for conf in input_conf)

    # Check the input layer number.
    loc_count = len(input_loc)
    assert loc_count == len(input_conf)

    parents = [priorbox] + list(input_loc) + list(input_conf)
    input_names = [parent.name for parent in parents]

    size = keep_top_k * 7

    Layer(
        name=name,
        type=LayerType.DETECTION_OUTPUT_LAYER,
        inputs=input_names,
        size=size,
        input_num=loc_count,
        num_classes=num_classes,
        nms_threshold=nms_threshold,
        nms_top_k=nms_top_k,
        keep_top_k=keep_top_k,
        confidence_threshold=confidence_threshold,
        background_id=background_id)
    return LayerOutput(
        name, LayerType.DETECTION_OUTPUT_LAYER, parents=parents, size=size)
@wrap_name_default("cross_channel_norm")
def cross_channel_norm_layer(input, name=None, param_attr=None):
"""
......@@ -1090,10 +1246,19 @@ def pooling_layer(input,
name=None,
bias_attr=None,
agg_level=AggregateLevel.TO_NO_SEQUENCE,
stride=-1,
layer_attr=None):
"""
Pooling layer for sequence inputs, not used for Image.
If stride > 0, this layer slides a window whose size is determined by stride,
and return the pooling value of the window as the output. Thus, a long sequence
will be shorten.
The parameter stride specifies the intervals at which to apply the pooling
operation. Note that for sequence with sub-sequence, the default value
of stride is -1.
The example usage is:
.. code-block:: python
......@@ -1112,6 +1277,8 @@ def pooling_layer(input,
:param pooling_type: Type of pooling, MaxPooling(default), AvgPooling,
SumPooling, SquareRootNPooling.
:type pooling_type: BasePoolingType|None
:param stride: The step size between successive pooling regions.
:type stride: Int
:param bias_attr: Bias parameter attribute. False if no bias.
:type bias_attr: ParameterAttribute|None|False
:param layer_attr: The Extra Attributes for layer, such as dropout.
......@@ -1129,12 +1296,16 @@ def pooling_layer(input,
extra_dict['output_max_index'] = pooling_type.output_max_index
extra_dict.update(ExtraLayerAttribute.to_kwargs(layer_attr))
if agg_level == AggregateLevel.TO_SEQUENCE:
assert stride == -1
Layer(
name=name,
type=pooling_type.name,
inputs=[Input(input.name)],
bias=ParamAttr.to_bias(bias_attr),
trans_type=agg_level,
stride=stride,
**extra_dict)
return LayerOutput(
......@@ -1396,7 +1567,7 @@ def last_seq(input,
:type name: basestring
:param input: Input layer name.
:type input: LayerOutput
:param stride: window size.
:param stride: The step size between successive pooling regions.
:type stride: Int
:param layer_attr: extra layer attributes.
:type layer_attr: ExtraLayerAttribute.
......@@ -1452,7 +1623,7 @@ def first_seq(input,
:type name: basestring
:param input: Input layer name.
:type input: LayerOutput
:param stride: window size.
:param stride: The step size between successive pooling regions.
:type stride: Int
:param layer_attr: extra layer attributes.
:type layer_attr: ExtraLayerAttribute.
......@@ -4634,6 +4805,14 @@ def maxout_layer(input, groups, num_channels=None, name=None, layer_attr=None):
So groups should be larger than 1, and the num of channels should be able
to devided by groups.
.. math::
y_{si+j} = \max_k x_{gsi + sk + j}
g = groups
s = input.size / num_channels
0 \le i < num_channels / groups
0 \le j < s
0 \le k < groups
Please refer to Paper:
- Maxout Networks: http://www.jmlr.org/proceedings/papers/v28/goodfellow13.pdf
- Multi-digit Number Recognition from Street View \
......
......@@ -6,6 +6,6 @@ img_layers img_trans_layers util_layers simple_rnn_layers unused_layers test_cos
test_rnn_group shared_fc shared_lstm shared_gru test_cost_layers_with_weight
test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops
test_seq_concat_reshape test_pad test_smooth_l1 test_multiplex_layer
test_prelu_layer test_row_conv)
test_prelu_layer test_row_conv test_detection_output_layer test_multibox_loss_layer)
export whole_configs=(test_split_datasource)
# Trainer config exercising detection_output_layer: three data layers
# (locations, confidences, prior boxes) feed a single detection_output layer,
# which is registered as the network output.
from paddle.trainer_config_helpers import *
settings(batch_size=1000, learning_rate=1e-5)
# One location input and one confidence input, passed as single LayerOutputs.
input_loc = data_layer(name='input_loc', size=16, height=16, width=1)
input_conf = data_layer(name='input_conf', size=8, height=1, width=8)
priorbox = data_layer(name='priorbox', size=32, height=4, width=8)
# Every optional argument is spelled out with the same value as the helper's
# default.
detout = detection_output_layer(
input_loc=input_loc,
input_conf=input_conf,
priorbox=priorbox,
num_classes=21,
nms_threshold=0.45,
nms_top_k=400,
keep_top_k=200,
confidence_threshold=0.01,
background_id=0,
name='test_detection_output')
outputs(detout)
# Trainer config exercising multibox_loss_layer: four data layers (locations,
# confidences, prior boxes, labels) feed a single multibox_loss layer, which
# is registered as the network output.
from paddle.trainer_config_helpers import *
settings(batch_size=1000, learning_rate=1e-5)
# One location input and one confidence input, passed as single LayerOutputs.
input_loc = data_layer(name='input_loc', size=16, height=16, width=1)
input_conf = data_layer(name='input_conf', size=8, height=1, width=8)
priorbox = data_layer(name='priorbox', size=32, height=4, width=8)
label = data_layer(name='label', size=24, height=4, width=6)
# Every optional argument is spelled out with the same value as the helper's
# default.
multibox_loss = multibox_loss_layer(
input_loc=input_loc,
input_conf=input_conf,
priorbox=priorbox,
label=label,
num_classes=21,
overlap_threshold=0.5,
neg_pos_ratio=3.0,
neg_overlap=0.5,
background_id=0,
name='test_multibox_loss')
outputs(multibox_loss)
add_python_test(test_framework test_protobuf.py)
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册