提交 b15c69f5 编写于 作者: C chengduoZH

remove conflict

......@@ -127,6 +127,7 @@ include(external/warpctc) # download, build, install warpctc
include(external/any) # download libn::any
include(external/eigen) # download eigen3
include(external/pybind11) # download pybind11
include(external/nccl)
include(cudnn) # set cudnn libraries, must before configure
include(configure) # add paddle env configuration
......@@ -159,7 +160,7 @@ set(EXTERNAL_LIBS
if(WITH_GPU)
list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY})
if(NOT WITH_DSO)
list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY})
list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY})
endif(NOT WITH_DSO)
endif(WITH_GPU)
......
......@@ -62,11 +62,11 @@ else()
FIND_PACKAGE(CUDA REQUIRED)
if(${CUDA_VERSION_MAJOR} VERSION_LESS 7)
message(FATAL_ERROR "Paddle need CUDA >= 7.0 to compile")
message(FATAL_ERROR "Paddle needs CUDA >= 7.0 to compile")
endif()
if(NOT CUDNN_FOUND)
message(FATAL_ERROR "Paddle need cudnn to compile")
message(FATAL_ERROR "Paddle needs cudnn to compile")
endif()
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SIMD_FLAG}")
......
INCLUDE(ExternalProject)
SET(NCCL_SOURCE_DIR ${THIRD_PARTY_PATH}/nccl)
INCLUDE_DIRECTORIES(${NCCL_SOURCE_DIR}/src/extern_nccl/src)
if(WITH_DSO)
# If we use DSO, we do not build nccl, just download the dependencies
set(NCCL_BUILD_COMMAND "")
set(NCCL_INSTALL_COMMAND "")
set(NCCL_INSTALL_DIR "")
else()
# otherwise, we build nccl and link it.
set(NCCL_BUILD_COMMAND "make -j 8")
set(NCCL_INSTALL_COMMAND "make install")
SET(NCCL_INSTALL_DIR ${THIRD_PARTY_PATH}/install/nccl)
endif()
ExternalProject_Add(
extern_nccl
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/NVIDIA/nccl.git"
GIT_TAG "v1.3.4-1"
PREFIX "${NCCL_SOURCE_DIR}"
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_COMMAND "${NCCL_BUILD_COMMAND}"
INSTALL_COMMAND "${NCCL_INSTALL_COMMAND}"
INSTALL_DIR "${NCCL_INSTALL_DIR}"
TEST_COMMAND ""
)
if (WITH_DSO)
if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/lib_any_dummy.c)
file(WRITE ${dummyfile} "const char * dummy_any = \"${dummyfile}\";")
add_library(nccl STATIC ${dummyfile})
else()
add_library(nccl INTERFACE)
endif()
else()
ADD_LIBRARY(nccl STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET nccl PROPERTY IMPORTED_LOCATION
${NCCL_INSTALL_DIR}/lib/libnccl.a)
endif()
add_dependencies(nccl extern_nccl)
LIST(APPEND external_project_dependencies nccl)
......@@ -174,7 +174,7 @@ decoder_inputs = paddle.layer.fc(
1. 两者都是对梯度的截断,但截断时机不同,前者在 :code:`optimzier` 更新网络参数时应用;后者在激活函数反向计算时被调用;
2. 截断对象不同:前者截断可学习参数的梯度,后者截断回传给前层的梯度;
除此之外,还可以通过减小学习或者对数据进行归一化处理来解决这类问题。
除此之外,还可以通过减小学习或者对数据进行归一化处理来解决这类问题。
5. 如何调用 infer 接口输出多个layer的预测结果
-----------------------------------------------
......
......@@ -41,6 +41,19 @@ bool BlockDescBind::HasVar(const std::string &name) const {
return vars_.find(name) != vars_.end();
}
VarDescBind *BlockDescBind::FindVarRecursive(const std::string &name) const {
auto it = vars_.find(name);
if (it == vars_.end()) {
return Parent() == kNoneBlockIndex ? nullptr
: ParentBlock()->FindVarRecursive(name);
}
return it->second.get();
}
bool BlockDescBind::HasVarRecursive(const std::string &name) const {
return FindVarRecursive(name) != nullptr;
}
std::vector<VarDescBind *> BlockDescBind::AllVars() const {
std::vector<VarDescBind *> res;
for (const auto &p : vars_) {
......@@ -97,7 +110,7 @@ void BlockDescBind::Flush() {
}
BlockDescBind *BlockDescBind::ParentBlock() const {
if (this->desc_->parent_idx() == -1) {
if (this->desc_->parent_idx() == kNoneBlockIndex) {
return nullptr;
}
return prog_->Block(static_cast<size_t>(this->desc_->parent_idx()));
......
......@@ -21,6 +21,7 @@ limitations under the License. */
#include <vector>
#include "paddle/framework/op_desc.h"
#include "paddle/framework/proto_desc.h"
#include "paddle/framework/var_desc.h"
#include "paddle/platform/macros.h"
......@@ -56,6 +57,10 @@ class BlockDescBind {
bool HasVar(const std::string &var_name) const;
VarDescBind *FindVarRecursive(const std::string &name_bytes) const;
bool HasVarRecursive(const std::string &var_name) const;
std::set<std::string> LocalVarNames() const {
std::set<std::string> var_names;
for (auto &var : vars_) {
......
......@@ -87,11 +87,8 @@ class OpInfoMap {
}
}
template <typename Callback>
void IterAllInfo(Callback callback) {
for (auto& it : map_) {
callback(it.first, it.second);
}
const std::unordered_map<std::string, const OpInfo>& map() const {
return map_;
}
private:
......
......@@ -327,37 +327,47 @@ class CompileTimeInferShapeContext : public InferShapeContext {
bool HasInput(const std::string& name) const override {
const std::vector<std::string>& input_names = op_.Input(name);
auto length = input_names.size();
if (length == 0) {
return false;
}
PADDLE_ENFORCE_EQ(length, 1UL,
"Input(%s) should have only one value, "
"but it have %d now",
name, length);
return block_.HasVar(input_names[0]);
return block_.HasVarRecursive(input_names[0]);
}
bool HasOutput(const std::string& name) const override {
const std::vector<std::string>& output_names = op_.Output(name);
auto length = output_names.size();
if (length == 0) {
return false;
}
PADDLE_ENFORCE_EQ(length, 1UL,
"Output(%s) should have only one value, "
"but it have %d now",
name, length);
return block_.HasVar(output_names[0]);
return block_.HasVarRecursive(output_names[0]);
}
bool HasInputs(const std::string& name) const override {
const std::vector<std::string>& input_names = op_.Input(name);
PADDLE_ENFORCE(!input_names.empty(), "Inputs(%s) length is 0", name);
if (input_names.empty()) {
return false;
}
for (auto& input : input_names) {
if (!block_.HasVar(input)) return false;
if (!block_.HasVarRecursive(input)) return false;
}
return true;
}
bool HasOutputs(const std::string& name) const override {
const std::vector<std::string>& output_names = op_.Output(name);
PADDLE_ENFORCE(!output_names.empty(), "Inputs(%s) length is 0", name);
if (output_names.empty()) {
return false;
}
for (auto& output : output_names) {
if (!block_.HasVar(output)) return false;
if (!block_.HasVarRecursive(output)) return false;
}
return true;
}
......@@ -404,11 +414,11 @@ class CompileTimeInferShapeContext : public InferShapeContext {
private:
DDim GetDim(const std::string& name) const override {
return framework::make_ddim(block_.FindVar(name)->Shape());
return framework::make_ddim(block_.FindVarRecursive(name)->Shape());
}
void SetDim(const std::string& name, const DDim& dim) override {
block_.FindVar(name)->SetShape(framework::vectorize(dim));
block_.FindVarRecursive(name)->SetShape(framework::vectorize(dim));
}
const OpDescBind& op_;
......@@ -421,13 +431,27 @@ class RuntimeInferShapeContext : public InferShapeContext {
: op_(op), scope_(scope) {}
bool HasInput(const std::string& name) const override {
auto ipt = op_.Input(name);
auto& ins = Inputs(name);
size_t length = ins.size();
if (length == 0) {
return false;
}
PADDLE_ENFORCE_EQ(length, 1UL, "Input %s should have more than one inputs",
name);
auto ipt = ins[0];
auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
return var != nullptr;
}
bool HasOutput(const std::string& name) const override {
auto ipt = op_.Output(name);
auto& outs = Outputs(name);
size_t length = outs.size();
if (length == 0) {
return false;
}
PADDLE_ENFORCE_EQ(length, 1UL, "Output %s should have more than one inputs",
name);
auto ipt = outs[0];
auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
return var != nullptr;
}
......
......@@ -35,8 +35,8 @@ ProgramDesc *ProgramDescBind::Proto() {
ProgramDescBind::ProgramDescBind() {
auto *block = prog_.mutable_blocks()->Add();
block->set_idx(0);
block->set_parent_idx(-1);
block->set_idx(kRootBlockIndex);
block->set_parent_idx(kNoneBlockIndex);
blocks_.emplace_back(new BlockDescBind(this, block));
}
......
......@@ -17,6 +17,7 @@ limitations under the License. */
#include <memory>
#include <vector>
#include "paddle/framework/framework.pb.h"
#include "paddle/framework/proto_desc.h"
#include "paddle/platform/macros.h"
namespace paddle {
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
namespace paddle {
namespace framework {
// The Index of first Block in Program. also called root block.
constexpr int kRootBlockIndex = 0;
// The Parent Index of root Block, this block does not exist.
constexpr int kNoneBlockIndex = -1;
} // namespace framework
} // namespace paddle
......@@ -18,6 +18,10 @@ limitations under the License. */
namespace paddle {
namespace framework {
VarDesc::VarType VarDescBind::GetType() const { return desc_.type(); }
void VarDescBind::SetType(VarDesc::VarType type) { desc_.set_type(type); }
void VarDescBind::SetShape(const std::vector<int64_t> &dims) {
VectorToRepeated(dims, mutable_tensor_desc()->mutable_dims());
}
......
......@@ -75,9 +75,9 @@ class VarDescBind {
int32_t GetLodLevel() const;
VarDesc::VarType GetType() const { return desc_.type(); }
VarDesc::VarType GetType() const;
void SetType(VarDesc::VarType type) { desc_.set_type(type); }
void SetType(VarDesc::VarType type);
bool Persistable() const { return desc_.persistable(); }
......
......@@ -126,7 +126,7 @@ void MKLDNNEltwiseActivation::resetFwd(Argument& act) {
copyInVal_ = nullptr;
if (act.grad && algo == algorithm::eltwise_tanh) {
// tanh need save src input for backward
inVal_ = MKLDNNMatrix::create(nullptr, val_->getPrimitiveDesc());
inVal_ = MKLDNNMatrix::create(val_->getPrimitiveDesc());
copyInVal_ = std::make_shared<mkldnn::reorder>(*val_, *inVal_);
CHECK(copyInVal_) << "should not be emptry";
pipelineFwd_.push_back(*copyInVal_);
......@@ -145,7 +145,7 @@ void MKLDNNEltwiseActivation::resetBwd(Argument& act) {
algorithm algo = getAlgo(this->getName());
float alpha = getBwdAlpha();
float beta = getBeta();
grad_ = MKLDNNMatrix::create(act.grad, val_->getPrimitiveDesc());
grad_ = MKLDNNMatrix::create(val_->getPrimitiveDesc(), act.grad);
auto eng = CPUEngine::Instance().getEngine();
auto bwdDesc = eltwise_bwd::desc(
algo, grad_->getMemoryDesc(), val_->getMemoryDesc(), alpha, beta);
......@@ -230,7 +230,7 @@ void MKLDNNActivation::resetFwd(Argument& act) {
int ic = cnt_ / bs / ih / iw;
CHECK_EQ(cnt_, (size_t)bs * ic * ih * iw);
val_ = MKLDNNMatrix::create(
act.value, {bs, ic, ih, iw}, mkldnn::memory::format::nchw, *engine_);
{bs, ic, ih, iw}, mkldnn::memory::format::nchw, *engine_, act.value);
CHECK(val_);
val_->downSpatial();
}
......
......@@ -21,8 +21,8 @@ namespace paddle {
typedef enum {
MKLDNN_BASE = 1, // basical info of MKLDNN
MKLDNN_TESTS = 1, // gtest info of MKLDNN
MKLDNN_SIZES = 2, // size info of MKLDNN
MKLDNN_FMTS = 3, // format info of MKLDNN
MKLDNN_FMTS = 2, // format info of MKLDNN
MKLDNN_SIZES = 3, // size info of MKLDNN
MKLDNN_ALL = 4, // show all info of MKLDNN
} MKLDNN_LOG_LEVEL;
......
......@@ -116,8 +116,6 @@ void MKLDNNConvLayer::resetFwd(std::vector<primitive>& pipeline,
resetFwdBuffers(fwdPD_, in, wgt, bias, out);
resetFwdPipeline(pipeline, fwdPD_, in, wgt, bias, out);
printValueFormatFlow();
}
void MKLDNNConvLayer::resetBwd(std::vector<primitive>& pipeline,
......@@ -135,12 +133,6 @@ void MKLDNNConvLayer::resetBwd(std::vector<primitive>& pipeline,
resetBwdBuffers(bwdWgtPD, bwdDataPD, in, wgt, bias, out);
resetBwdPipeline(pipeline, bwdWgtPD, bwdDataPD, in, wgt, bias, out);
printGradFormatFlow();
}
void MKLDNNConvLayer::updateInputData() {
cpuInVal_->setData(getInputValue(0, CPU_DEVICE)->getData());
}
void MKLDNNConvLayer::updateWeights(const UpdateCallback& callback) {
......@@ -211,11 +203,18 @@ void MKLDNNConvLayer::resetFwdBuffers(
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) {
CHECK(pd);
resetInValue(pd, in);
resetInValue(
in, std::make_shared<memory::primitive_desc>(pd->src_primitive_desc()));
resetOutValue(out, pd->dst_primitive_desc());
resetWgtBiasValue(pd, wgt, bias);
resetWithMatrix(wgt, weight_->getW(), pd->weights_primitive_desc());
resetOutValue(pd, out);
if (biases_ && biases_->getW()) {
resetWithMatrix(bias, biases_->getW(), pd->bias_primitive_desc());
} else {
bias = nullptr;
}
}
void MKLDNNConvLayer::resetFwdPipeline(
......@@ -225,104 +224,12 @@ void MKLDNNConvLayer::resetFwdPipeline(
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) {
if (cvtInVal_) {
pipeline.push_back(*cvtInVal_);
}
if (bias) {
fwd_.reset(new conv_fwd(*pd, *in, *wgt, *bias, *out));
} else {
fwd_.reset(new conv_fwd(*pd, *in, *wgt, *out));
}
pipeline.push_back(*fwd_);
if (cvtOutVal_) {
pipeline.push_back(*cvtOutVal_);
}
}
void MKLDNNConvLayer::resetInValue(
std::shared_ptr<conv_fwd::primitive_desc>& pd, MKLDNNMatrixPtr& in) {
const MatrixPtr& inMat = inputLayers_[0]->getOutputValue();
in = MKLDNNMatrix::create(inMat, pd->src_primitive_desc());
// create buffer and reorder if input value do not match
cpuInVal_ = nullptr;
cvtInVal_ = nullptr;
MKLDNNMatrixPtr dnnIn = std::dynamic_pointer_cast<MKLDNNMatrix>(inMat);
CHECK_EQ(inputIsOnlyMKLDNN(), dnnIn != nullptr);
if (dnnIn != nullptr && dnnIn->getPrimitiveDesc() == in->getPrimitiveDesc()) {
in = dnnIn;
return;
}
if (dnnIn) {
if (dnnIn->getFormat() == format::nc) {
CHECK(ih_ == 1 && iw_ == 1) << "when input is nc format";
// create a new one with nchw format and same data
memory::dims inDims = memory::dims{bs_, ic_, 1, 1};
dnnIn = MKLDNNMatrix::create(inMat, inDims, format::nchw, engine_);
}
if (dnnIn->getPrimitiveDesc() == in->getPrimitiveDesc()) {
in = dnnIn;
return;
}
cpuInVal_ = dnnIn;
in = MKLDNNMatrix::create(nullptr, pd->src_primitive_desc());
cvtInVal_ = MKLDNNMatrix::createReorder(cpuInVal_, in);
CHECK(cvtInVal_) << "should not be emptry";
} else {
memory::dims inDims = memory::dims{bs_, ic_, ih_, iw_};
cpuInVal_ = MKLDNNMatrix::create(inMat, inDims, format::nchw, engine_);
if (cpuInVal_->getPrimitiveDesc() != in->getPrimitiveDesc()) {
// create new mkldnn matrix
in = MKLDNNMatrix::create(nullptr, pd->src_primitive_desc());
cvtInVal_ = MKLDNNMatrix::createReorder(cpuInVal_, in);
CHECK(cvtInVal_) << "should not be emptry";
} else {
in = cpuInVal_;
}
}
}
void MKLDNNConvLayer::resetWgtBiasValue(
std::shared_ptr<conv_fwd::primitive_desc>& pd,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias) {
wgt = MKLDNNMatrix::create(weight_->getW(), pd->weights_primitive_desc());
VLOG(MKLDNN_FMTS) << "Weight value format: " << wgt->getFormat();
bias = (biases_ && biases_->getW())
? MKLDNNMatrix::create(biases_->getW(), pd->bias_primitive_desc())
: nullptr;
}
void MKLDNNConvLayer::resetOutValue(
std::shared_ptr<conv_fwd::primitive_desc>& pd, MKLDNNMatrixPtr& out) {
out = MKLDNNMatrix::create(output_.value, pd->dst_primitive_desc());
// create reorder if output value has cpu device and pd do not match
cpuOutVal_ = nullptr;
cvtOutVal_ = nullptr;
if (!outputIsOnlyMKLDNN()) {
const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).value;
memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_};
cpuOutVal_ = MKLDNNMatrix::create(cpuOut, outDims, format::nchw, engine_);
if (cpuOutVal_->getPrimitiveDesc() != pd->dst_primitive_desc()) {
out = MKLDNNMatrix::create(nullptr, pd->dst_primitive_desc());
cvtOutVal_ = MKLDNNMatrix::createReorder(out, cpuOutVal_);
CHECK(cvtOutVal_) << "should not be empty";
} else {
cpuOut->setData(output_.value->getData());
cpuOutVal_ = out;
}
// when output is cpu device, change the mkldnn output value and make them
// share the same data. Then if next layer use inputlayer->getOuputValue()
// to achieve the input value, it will get the right data.
output_.value = std::dynamic_pointer_cast<Matrix>(cpuOutVal_);
return;
}
output_.value = std::dynamic_pointer_cast<Matrix>(out);
}
void MKLDNNConvLayer::resetBwdWgtPD(
......@@ -331,8 +238,8 @@ void MKLDNNConvLayer::resetBwdWgtPD(
loadConvSettings(wgtDims, biasDims, strides, dilations, padL, padR);
// create backward weight using input, output and weight value memory desc
CHECK(inVal_) << "Should have input value";
CHECK(outVal_) << "Should have output value";
CHECK(inVal_) << "Should have internal input value";
CHECK(outVal_) << "Should have internal output value";
CHECK(wgtVal_) << "Should have weight value";
algorithm algo = algorithm::convolution_direct;
padding_kind padKind = padding_kind::zero;
......@@ -372,8 +279,8 @@ void MKLDNNConvLayer::resetBwdDataPD(
memory::dims wgtDims, biasDims, strides, dilations, padL, padR;
loadConvSettings(wgtDims, biasDims, strides, dilations, padL, padR);
CHECK(inVal_) << "Should have input value";
CHECK(outVal_) << "Should have output value";
CHECK(inVal_) << "Should have internal input value";
CHECK(outVal_) << "Should have internal output value";
// create backward data using input and output value memory desc
// but using weight memory desc with any format
auto bwdDataDesc = conv_bwdData::desc(algorithm::convolution_direct,
......@@ -399,12 +306,27 @@ void MKLDNNConvLayer::resetBwdBuffers(
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) {
CHECK(wgtPD);
resetOutGrad(wgtPD, out);
resetOutGrad(out, wgtPD->diff_dst_primitive_desc());
resetWgtBiasGrad(wgtPD, wgt, bias);
resetWithMatrix(
wgt, weight_->getWGrad(), wgtPD->diff_weights_primitive_desc());
CHECK(wgtVal_ != nullptr &&
wgt->getPrimitiveDesc() == wgtVal_->getPrimitiveDesc())
<< "primitive desc of weight grad and value should be equal";
resetInGrad(dataPD, in);
bias = nullptr;
if (biases_ && biases_->getWGrad()) {
resetWithMatrix(
bias, biases_->getWGrad(), wgtPD->diff_bias_primitive_desc());
CHECK(bias && biasVal_ &&
bias->getPrimitiveDesc() == biasVal_->getPrimitiveDesc())
<< "primitive desc of bias grad should equal the bias value";
}
if (dataPD == nullptr) {
return;
}
resetInGrad(in, dataPD->diff_src_primitive_desc());
resetWgtValBwdData(dataPD, wgtValBwdData_);
}
......@@ -416,10 +338,7 @@ void MKLDNNConvLayer::resetBwdPipeline(
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) {
if (cvtOutGrad_) {
pipeline.push_back(*cvtOutGrad_);
}
CHECK(inVal_);
// add bwdWgt handle
if (bias) {
bwdWgt_.reset(new conv_bwdWgt(*wgtPD, *inVal_, *out, *wgt, *bias));
......@@ -431,99 +350,13 @@ void MKLDNNConvLayer::resetBwdPipeline(
if (dataPD == nullptr) {
return;
}
if (cvtWgtVal_) {
pipeline.push_back(*cvtWgtVal_);
}
// add bwdData handle
CHECK(wgtValBwdData_) << "Should have weight memory";
bwdData_.reset(new conv_bwdData(*dataPD, *out, *wgtValBwdData_, *in));
pipeline.push_back(*bwdData_);
if (cvtInGrad_) {
pipeline.push_back(*cvtInGrad_);
}
}
void MKLDNNConvLayer::resetOutGrad(
std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD, MKLDNNMatrixPtr& out) {
cpuOutGrad_ = nullptr;
cvtOutGrad_ = nullptr;
CHECK(outVal_ != nullptr &&
outVal_->getPrimitiveDesc() == wgtPD->diff_dst_primitive_desc())
<< "primitive desc of out grad and value should be equal";
if (outputIsOnlyMKLDNN()) {
MKLDNNLayer::resetOutGrad(out, outVal_->getPrimitiveDesc());
} else {
const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).grad;
// always share the same grad data of CPU output
// then the activation can get the right grad from output_.grad
output_.grad->setData(cpuOut->getData());
// same PrimitiveDesc with cpuInVal_
CHECK(cpuOutVal_);
cpuOutGrad_ = MKLDNNMatrix::create(cpuOut, cpuOutVal_->getPrimitiveDesc());
// create reorder if primitive desc does not match
if (cpuOutGrad_->getPrimitiveDesc() != outVal_->getPrimitiveDesc()) {
out = MKLDNNMatrix::create(nullptr, outVal_->getPrimitiveDesc());
cvtOutGrad_ = MKLDNNMatrix::createReorder(cpuOutGrad_, out);
CHECK(cvtOutGrad_);
} else {
out = cpuOutGrad_;
}
}
}
void MKLDNNConvLayer::resetWgtBiasGrad(
std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias) {
wgt = MKLDNNMatrix::create(weight_->getWGrad(),
wgtPD->diff_weights_primitive_desc());
CHECK(nullptr != wgtVal_ &&
wgt->getPrimitiveDesc() == wgtVal_->getPrimitiveDesc())
<< "primitive desc of weight grad and value should be equal";
VLOG(MKLDNN_FMTS) << "weight grad format: " << wgt->getFormat();
bias = nullptr;
if (biasVal_ == nullptr) {
return;
}
bias = MKLDNNMatrix::create(biases_->getWGrad(),
wgtPD->diff_bias_primitive_desc());
CHECK(bias->getPrimitiveDesc() == biasVal_->getPrimitiveDesc())
<< "primitive desc of bias grad should equal the bias value";
}
void MKLDNNConvLayer::resetInGrad(
std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
MKLDNNMatrixPtr& in) {
in = nullptr;
cpuInGrad_ = nullptr;
cvtInGrad_ = nullptr;
if (dataPD == nullptr) {
return;
}
if (inputIsOnlyMKLDNN()) {
MKLDNNLayer::resetInGrad(in, dataPD->diff_src_primitive_desc());
CHECK(nullptr != inVal_ &&
in->getPrimitiveDesc() == inVal_->getPrimitiveDesc())
<< "primitive desc of input grad and value should be equal";
} else {
const MatrixPtr& cpuIn = getInputGrad(0, CPU_DEVICE);
// same PrimitiveDesc with cpuInVal_
CHECK(cpuInVal_);
cpuInGrad_ = MKLDNNMatrix::create(cpuIn, cpuInVal_->getPrimitiveDesc());
in = cpuInGrad_;
// create reorder if PrimitiveDesc does not match
if (cpuInGrad_->getPrimitiveDesc() != dataPD->diff_src_primitive_desc()) {
in = MKLDNNMatrix::create(getInputGrad(0, MKLDNN_DEVICE),
dataPD->diff_src_primitive_desc());
cvtInGrad_ = MKLDNNMatrix::createReorder(in, cpuInGrad_);
CHECK(cvtInGrad_);
}
}
}
void MKLDNNConvLayer::resetWgtValBwdData(
......@@ -537,8 +370,7 @@ void MKLDNNConvLayer::resetWgtValBwdData(
// since the primitive_desc would be different with wgtVal_
CHECK(wgtVal_) << "should have weight value";
if (dataPD->weights_primitive_desc() != wgtVal_->getPrimitiveDesc()) {
wgtValBwdData_ =
MKLDNNMatrix::create(nullptr, dataPD->weights_primitive_desc());
wgtValBwdData_ = MKLDNNMatrix::create(dataPD->weights_primitive_desc());
cvtWgtVal_ = MKLDNNMatrix::createReorder(wgtVal_, wgtValBwdData_);
CHECK(cvtWgtVal_);
} else {
......
......@@ -48,17 +48,6 @@ protected:
// save forward primitive_desc, which can be used backward
std::shared_ptr<conv_fwd::primitive_desc> fwdPD_;
// MKLDNNMatrixPtr which should be created from CPU Device
MKLDNNMatrixPtr cpuInVal_;
MKLDNNMatrixPtr cpuInGrad_;
MKLDNNMatrixPtr cpuOutVal_;
MKLDNNMatrixPtr cpuOutGrad_;
// convert handle between CPU device and MKLDNN device
std::shared_ptr<mkldnn::reorder> cvtInVal_;
std::shared_ptr<mkldnn::reorder> cvtInGrad_;
std::shared_ptr<mkldnn::reorder> cvtOutVal_;
std::shared_ptr<mkldnn::reorder> cvtOutGrad_;
// whether the weight has been init
bool hasInitedWgt_;
......@@ -94,8 +83,6 @@ public:
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) override;
void updateInputData() override;
void updateWeights(const UpdateCallback& callback) override;
void convertWeightsFromPaddle() override;
......@@ -109,26 +96,6 @@ public:
<< ", sw: " << sw_ << ", dh: " << dh_ << ", dw: " << dw_;
}
void printValueFormatFlow() override {
if (cpuInVal_) {
VLOG(MKLDNN_FMTS) << cpuInVal_->getFormat() << " >>>";
}
MKLDNNLayer::printValueFormatFlow();
if (cpuOutVal_) {
VLOG(MKLDNN_FMTS) << " >>> " << cpuOutVal_->getFormat();
}
}
void printGradFormatFlow() override {
if (cpuInGrad_) {
VLOG(MKLDNN_FMTS) << cpuInGrad_->getFormat() << " <<<";
}
MKLDNNLayer::printGradFormatFlow();
if (cpuOutGrad_) {
VLOG(MKLDNN_FMTS) << " <<< " << cpuOutGrad_->getFormat();
}
}
protected:
/**
* load the dims settings of this conv
......@@ -162,23 +129,6 @@ protected:
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out);
/**
* reset MKLDNNMatrix of input value
*/
void resetInValue(std::shared_ptr<conv_fwd::primitive_desc>& pd,
MKLDNNMatrixPtr& in);
/**
* reset MKLDNNMatrix of weight and bias value
*/
void resetWgtBiasValue(std::shared_ptr<conv_fwd::primitive_desc>& pd,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias);
/**
* reset MKLDNNMatrix of output value
*/
void resetOutValue(std::shared_ptr<conv_fwd::primitive_desc>& pd,
MKLDNNMatrixPtr& out);
/**
* reset the backward weight primitive descriptor.
*/
......@@ -207,22 +157,6 @@ protected:
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out);
/**
* reset MKLDNNMatrix of output grad
*/
void resetOutGrad(std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD,
MKLDNNMatrixPtr& out);
/**
* reset MKLDNNMatrix of weight and bias grad
*/
void resetWgtBiasGrad(std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias);
/**
* reset MKLDNNMatrix of input grad
*/
void resetInGrad(std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
MKLDNNMatrixPtr& in);
/**
* reset MKLDNNMatrix of weight value for backward data
* since the primitive_desc would be different with wgtVal_
......
......@@ -62,7 +62,7 @@ void MKLDNNFcLayer::convertWeightsFromPaddle() {
CHECK(wgtVal_) << "should have been initialized";
bool hasNoSpatial_ = ih_ == 1 && iw_ == 1;
auto targetDim = wgtVal_->getDims();
auto srcFmt = hasNoSpatial_ ? memory::format::io : memory::format::ihwo;
auto srcFmt = hasNoSpatial_ ? format::io : format::ihwo;
wgtVal_->reorderDataFrom(wgtVal_, srcFmt, targetDim);
hasInitedWgt_ = true;
}
......@@ -71,7 +71,7 @@ void MKLDNNFcLayer::convertWeightsToPaddle() {
CHECK(wgtVal_) << "should have been initialized";
bool hasNoSpatial_ = ih_ == 1 && iw_ == 1;
auto targetDim = wgtVal_->getDims();
auto dstFmt = hasNoSpatial_ ? memory::format::io : memory::format::ihwo;
auto dstFmt = hasNoSpatial_ ? format::io : format::ihwo;
wgtVal_->reorderDataTo(wgtVal_, dstFmt, targetDim);
}
......@@ -100,8 +100,6 @@ void MKLDNNFcLayer::resetFwd(std::vector<primitive>& pipeline,
resetFwdPD(fwdPD_, in, wgt, bias, out);
resetFwdPipeline(pipeline, fwdPD_, in, wgt, bias, out);
printValueFormatFlow();
}
void MKLDNNFcLayer::resetBwd(std::vector<primitive>& pipeline,
......@@ -119,12 +117,6 @@ void MKLDNNFcLayer::resetBwd(std::vector<primitive>& pipeline,
resetBwdDataPD(bwdDataPD, in, out);
resetBwdPipeline(pipeline, bwdWgtPD, bwdDataPD, in, wgt, bias, out);
printGradFormatFlow();
}
void MKLDNNFcLayer::updateInputData() {
inVal_->setData(getInputValue(0, CPU_DEVICE)->getData());
}
void MKLDNNFcLayer::updateWeights(const UpdateCallback& callback) {
......@@ -139,51 +131,30 @@ void MKLDNNFcLayer::resetFwdBuffers(MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) {
resetInValue(in);
resetWgtBiasValue(wgt, bias);
resetOutValue(out);
}
void MKLDNNFcLayer::resetInValue(MKLDNNMatrixPtr& in) {
if (inputIsOnlyMKLDNN()) {
const MatrixPtr& dnnIn = getInputValue(0);
in = std::dynamic_pointer_cast<MKLDNNMatrix>(dnnIn);
CHECK(in) << "Input should be MKLDNNMatrix";
} else {
CHECK_EQ(getPrev(0)->getDeviceId(), CPU_DEVICE) << "Only support CPU yet";
const MatrixPtr& cpuIn = getInputValue(0, CPU_DEVICE);
in = MKLDNNMatrix::create(
cpuIn, {bs_, ic_, ih_, iw_}, format::nchw, engine_);
}
CHECK(in);
in->downSpatial();
}
void MKLDNNFcLayer::resetWgtBiasValue(MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias) {
auto outPD =
MKLDNNMatrix::createPrimitiveDesc({bs_, oc_}, format::nc, engine_);
resetOutValue(out, outPD);
format wgtFmt = format::oihw;
if (inVal_->getFormat() == format::nChw8c) {
if (in->getFormat() == format::nChw8c) {
wgtFmt = format::oIhw8i;
} else if (inVal_->getFormat() == format::nChw16c) {
} else if (in->getFormat() == format::nChw16c) {
wgtFmt = format::oIhw16i;
}
wgt = MKLDNNMatrix::create(
weight_->getW(), {oc_, ic_, ih_, iw_}, wgtFmt, engine_);
auto wgtPD =
MKLDNNMatrix::createPrimitiveDesc({oc_, ic_, ih_, iw_}, wgtFmt, engine_);
resetWithMatrix(wgt, weight_->getW(), wgtPD);
wgt->downSpatial();
VLOG(MKLDNN_FMTS) << "Weight value format: " << wgt->getFormat();
bias = (biases_ && biases_->getW())
? MKLDNNMatrix::create(biases_->getW(), {oc_}, format::x, engine_)
: nullptr;
}
void MKLDNNFcLayer::resetOutValue(MKLDNNMatrixPtr& out) {
out = MKLDNNMatrix::create(output_.value, {bs_, oc_}, format::nc, engine_);
if (!outputIsOnlyMKLDNN()) {
// fc cpu output value do not need create convert, just share data
getOutput(CPU_DEVICE).value->setData(out->getData());
if (biases_ && biases_->getW()) {
auto biasPD = MKLDNNMatrix::createPrimitiveDesc({oc_}, format::x, engine_);
resetWithMatrix(bias, biases_->getW(), biasPD);
} else {
bias = nullptr;
}
output_.value = std::dynamic_pointer_cast<Matrix>(out);
}
void MKLDNNFcLayer::resetFwdPD(std::shared_ptr<fc_fwd::primitive_desc>& pd,
......@@ -219,7 +190,6 @@ void MKLDNNFcLayer::resetFwdPipeline(
} else {
fwd_.reset(new fc_fwd(*pd, *in, *wgt, *out));
}
pipeline.push_back(*fwd_);
}
......@@ -227,44 +197,18 @@ void MKLDNNFcLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) {
resetOutGrad(out);
resetWgtBiasGrad(wgt, bias);
resetInGrad(in);
}
void MKLDNNFcLayer::resetOutGrad(MKLDNNMatrixPtr& out) {
CHECK(outVal_);
if (outputIsOnlyMKLDNN()) {
MKLDNNLayer::resetOutGrad(out, outVal_->getPrimitiveDesc());
} else {
const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).grad;
output_.grad->setData(cpuOut->getData());
out = MKLDNNMatrix::create(cpuOut, outVal_->getPrimitiveDesc());
}
}
CHECK(inVal_ && outVal_);
resetOutGrad(out, outVal_->getPrimitiveDesc());
resetInGrad(in, inVal_->getPrimitiveDesc());
void MKLDNNFcLayer::resetWgtBiasGrad(MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias) {
CHECK(wgtVal_);
wgt = MKLDNNMatrix::create(weight_->getWGrad(), wgtVal_->getPrimitiveDesc());
resetWithMatrix(wgt, weight_->getWGrad(), wgtVal_->getPrimitiveDesc());
if (biasVal_) {
resetWithMatrix(bias, biases_->getWGrad(), biasVal_->getPrimitiveDesc());
} else {
bias = nullptr;
if (biasVal_ == nullptr) {
return;
}
bias =
MKLDNNMatrix::create(biases_->getWGrad(), biasVal_->getPrimitiveDesc());
}
void MKLDNNFcLayer::resetInGrad(MKLDNNMatrixPtr& in) {
in = nullptr;
if (inputLayers_[0]->getOutput().grad == nullptr) {
return;
}
CHECK(inVal_);
MKLDNNLayer::resetInGrad(in, inVal_->getPrimitiveDesc());
}
void MKLDNNFcLayer::resetBwdWgtPD(
......
......@@ -66,8 +66,6 @@ public:
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) override;
void updateInputData() override;
void updateWeights(const UpdateCallback& callback) override;
void convertWeightsFromPaddle() override;
......@@ -84,9 +82,6 @@ protected:
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out);
void resetInValue(MKLDNNMatrixPtr& in);
void resetWgtBiasValue(MKLDNNMatrixPtr& wgt, MKLDNNMatrixPtr& bias);
void resetOutValue(MKLDNNMatrixPtr& out);
void resetFwdPD(std::shared_ptr<fc_fwd::primitive_desc>& pd,
MKLDNNMatrixPtr in,
MKLDNNMatrixPtr wgt,
......@@ -109,9 +104,6 @@ protected:
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out);
void resetOutGrad(MKLDNNMatrixPtr& out);
void resetWgtBiasGrad(MKLDNNMatrixPtr& wgt, MKLDNNMatrixPtr& bias);
void resetInGrad(MKLDNNMatrixPtr& in);
void resetBwdWgtPD(std::shared_ptr<fc_bwdWgt::primitive_desc>& pd,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
......
/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "MKLDNNLayer.h"
using namespace mkldnn; // NOLINT
typedef memory::format format;
namespace paddle {
bool MKLDNNLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
CHECK(FLAGS_use_mkldnn) << "MkldnnLayers only support use_mkldnn."
<< "Please set WITH_MKLDNN=ON "
<< "and set use_mkldnn=True";
CHECK(!useGpu_) << "Do not support GPU yet";
// set device id before Layer::init
setDevice(MKLDNN_DEVICE);
// change param device to MKLDNN device
setParamsDevice(MKLDNN_DEVICE, parameterMap);
if (!Layer::init(layerMap, parameterMap)) {
return false;
}
setOutputMap();
checkCPUOutputsNumber();
stream_.reset(new MKLDNNStream());
engine_ = CPUEngine::Instance().getEngine();
return true;
}
void MKLDNNLayer::forward(PassType passType) {
passType_ = passType;
{
REGISTER_TIMER_INFO("mkldnn_FwdTimer", getName().c_str());
CHECK(!inputLayers_.empty());
copySeqInfoToOutputs();
size_t elemenCnt = inputLayers_[0]->getOutputValue()->getElementCnt();
if (inputElemenCnt_ != elemenCnt) {
VLOG(MKLDNN_BASE) << getName() << " reset mkldnn forward";
// reset when input total sizes changed, not only the batchsize
inputElemenCnt_ = elemenCnt;
pipelineFwd_.clear();
reshape(bs_, ic_, ih_, iw_, oc_, oh_, ow_);
// all cpu device output grad or value share output's
shareCPUDevice();
resetFwd(pipelineFwd_, inVal_, wgtVal_, biasVal_, outVal_);
// MKLDNNLayer output value should be MKLDNNMatrix
// so external output value is necessary.
// Then external input value is not necessary,
// since input may be mkldnn internal buffer.
CHECK(extOutVal_) << "external output value is necessary";
output_.value = std::dynamic_pointer_cast<Matrix>(extOutVal_);
CHECK(inVal_ && outVal_) << "internal memories are necessary";
if (cvtInVal_) {
pipelineFwd_.insert(pipelineFwd_.begin(), *cvtInVal_);
}
if (cvtOutVal_) {
pipelineFwd_.push_back(*cvtOutVal_);
}
convertWeightsFromPaddle();
printSizeInfo();
printValueFormat();
needResetBwd_ = true;
}
if (inputLayers_[0]->getType() == "data") {
// Update input value data when input layer is "data" type,
// since the input value data address might be changed.
CHECK(extInVal_);
extInVal_->setData(getInputValue(0, CPU_DEVICE)->getData());
}
if (!outputOnlyMKLDNN_) {
clearGrads();
}
stream_->submit(pipelineFwd_);
}
{
REGISTER_TIMER_INFO("FwActTimer", getName().c_str());
forwardActivation();
}
}
void MKLDNNLayer::backward(const UpdateCallback& callback) {
if (needResetBwd_) {
VLOG(MKLDNN_BASE) << getName() << " reset mkldnn backward";
pipelineBwd_.clear();
pipelineMergeGrad_.clear();
mergeGrad_ = nullptr;
resetBwd(pipelineBwd_, inGrad_, wgtGrad_, biasGrad_, outGrad_);
// external output grad is not necessary
// since output may be mkldnn internal buffer or merge them directly.
CHECK(outGrad_) << "internal output grad is necessary";
if (extOutGrad_) {
CHECK_EQ(extOutGrad_->getData(), output_.grad->getData())
<< "the external buffer should share the same data with output_.grad";
}
if (cvtOutGrad_) {
pipelineBwd_.insert(pipelineBwd_.begin(), *cvtOutGrad_);
}
if (cvtInGrad_) {
pipelineBwd_.push_back(*cvtInGrad_);
}
printGradFormat();
needResetBwd_ = false;
}
// merge grad must before backward activation
if (mergeGrad_) {
REGISTER_TIMER_INFO("MergeBpGrad", getName().c_str());
stream_->submit(pipelineMergeGrad_);
}
{
REGISTER_TIMER_INFO("BpActTimer", getName().c_str());
backwardActivation();
}
{
REGISTER_TIMER_INFO("mkldnn_bwdTimer", getName().c_str());
stream_->submit(pipelineBwd_);
}
{
REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
updateWeights(callback);
}
}
void MKLDNNLayer::reshapeInput(int& batchsize, int& height, int& width) {
const Argument& input = inputLayers_[0]->getOutput();
batchsize = input.getBatchSize();
int h = input.getFrameHeight();
int w = input.getFrameWidth();
if (h != 0) {
height = h;
}
if (w != 0) {
width = w;
}
}
void MKLDNNLayer::reshapeOutput(size_t height, size_t width) {
output_.setFrameHeight(height);
output_.setFrameWidth(width);
for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
outputOtherDevice_[i].setFrameHeight(height);
outputOtherDevice_[i].setFrameWidth(width);
}
}
void MKLDNNLayer::resetWithMatrix(MKLDNNMatrixPtr& dnn,
const MatrixPtr& mat,
memory::primitive_desc pd) {
dnn = nullptr;
if (mat == nullptr) {
return;
}
dnn = MKLDNNMatrix::create(pd, mat);
}
void MKLDNNLayer::resetInValue(
MKLDNNMatrixPtr& in, const std::shared_ptr<memory::primitive_desc>& intPD) {
cvtInVal_ = nullptr;
extInVal_ = nullptr;
in = nullptr;
CHECK_GT(bs_ * ic_ * ih_ * iw_, 0);
auto extPD = MKLDNNMatrix::createPrimitiveDesc(
{bs_, ic_, ih_, iw_}, format::nchw, engine_);
const MatrixPtr& inMat = inputLayers_[0]->getOutputValue();
in = std::dynamic_pointer_cast<MKLDNNMatrix>(inMat);
CHECK_EQ(inputIsOnlyMKLDNN(), in != nullptr);
if (in == nullptr || in->getFormat() == format::nc) {
in = MKLDNNMatrix::create(extPD, inMat);
}
extInVal_ = isPaddleFormat(in->getFormat()) ? in : nullptr;
if (in->getFormat() == format::nc) {
CHECK(ih_ == 1 && iw_ == 1);
}
if (nullptr == intPD || in->getPrimitiveDesc() == *intPD) {
return;
}
// need create reorder
in = MKLDNNMatrix::create(*intPD);
extInVal_ = extInVal_ ? extInVal_ : MKLDNNMatrix::create(extPD, inMat);
cvtInVal_ = MKLDNNMatrix::createReorder(extInVal_, in);
CHECK(cvtInVal_) << "should not be emptry";
}
void MKLDNNLayer::resetOutValue(MKLDNNMatrixPtr& out,
memory::primitive_desc intPD) {
cvtOutVal_ = nullptr;
out = MKLDNNMatrix::create(intPD, output_.value);
extOutVal_ = out;
if (outputIsOnlyMKLDNN() || isPaddleFormat(extOutVal_->getFormat())) {
return;
}
// need create reorder
CHECK_GT(bs_ * oc_ * oh_ * ow_, 0);
extOutVal_ = MKLDNNMatrix::create(
memory::dims{bs_, oc_, oh_, ow_}, format::nchw, engine_, output_.value);
out = MKLDNNMatrix::create(intPD);
cvtOutVal_ = MKLDNNMatrix::createReorder(out, extOutVal_);
CHECK(cvtOutVal_) << "should not be empty";
}
void MKLDNNLayer::resetInGrad(MKLDNNMatrixPtr& in,
memory::primitive_desc intPD) {
cvtInGrad_ = nullptr;
extInGrad_ = nullptr;
in = nullptr;
LayerPtr& input = inputLayers_[0];
if (input->getOutputGrad() == nullptr) {
// no need input grad
return;
}
CHECK(inputIsOnlyMKLDNN() || input->getOutputMapSize() <= 1)
<< "only support input is MKLDNN layer or only have one output layer";
// when input is a mkldnn branch node,
// this layer will save input grad to a internal buffer,
// and the mkldnn input layer will merge them to actual prev->output_.grad
const MatrixPtr& inMat =
input->getOutputMapSize() <= 1 ? input->getOutputGrad() : nullptr;
in = MKLDNNMatrix::create(intPD, inMat);
Argument& arg = input->getOutput(this->getName());
arg.grad = std::dynamic_pointer_cast<Matrix>(in);
CHECK(inVal_);
CHECK(inVal_->getPrimitiveDesc() == intPD) << "the primitive desc must equal";
if (inputIsOnlyMKLDNN()) {
return;
}
extInGrad_ = in;
if (isPaddleFormat(extInGrad_->getFormat())) {
return;
}
// need create reorder
// TODO(TJ): add macro definition to simplify it
CHECK(extInVal_ != nullptr && isPaddleFormat(extInVal_->getFormat()))
<< "should have external input value and the format must be nchw(nc)";
extInGrad_ = MKLDNNMatrix::create(extInVal_->getPrimitiveDesc(), inMat);
CHECK(inVal_ != nullptr && inVal_->getPrimitiveDesc() == intPD)
<< "should have internal input value and primitive desc must equal";
in = MKLDNNMatrix::create(intPD);
cvtInGrad_ = MKLDNNMatrix::createReorder(in, extInGrad_);
CHECK(cvtInGrad_);
}
void MKLDNNLayer::resetOutGrad(MKLDNNMatrixPtr& out,
memory::primitive_desc intPD) {
cvtOutGrad_ = nullptr;
extOutGrad_ = nullptr;
out = nullptr;
MatrixPtr& outMat = output_.grad;
out = MKLDNNMatrix::create(intPD, outMat);
resetMergeGrad(out);
if (outputIsOnlyMKLDNN()) {
return;
}
CHECK_LE(outputMap_.size(), 1U) << "do not support mixed with cpu device";
extOutGrad_ = out;
if (isPaddleFormat(extOutGrad_->getFormat())) {
return;
}
// need create reorder
CHECK(extOutVal_ != nullptr && isPaddleFormat(extOutVal_->getFormat()))
<< "should have external output value and the format must be nchw(nc)";
extOutGrad_ = MKLDNNMatrix::create(extOutVal_->getPrimitiveDesc(), outMat);
CHECK(outVal_ != nullptr && outVal_->getPrimitiveDesc() == intPD)
<< "should have internal output value and primitive desc must equal";
out = MKLDNNMatrix::create(intPD);
cvtOutGrad_ = MKLDNNMatrix::createReorder(extOutGrad_, out);
CHECK(cvtOutGrad_);
}
void MKLDNNLayer::resetMergeGrad(MKLDNNMatrixPtr& out) {
mergeGrad_ = nullptr;
pipelineMergeGrad_.clear();
if (outputMap_.size() <= 1 || !outputIsOnlyMKLDNN()) {
// do not merge when output is not all MKLDNN or only one output
return;
}
CHECK(out) << "should have reset internal ouput grad";
std::vector<double> scales(outputMap_.size(), 1.0);
std::vector<memory::primitive_desc> srcPDs;
std::vector<primitive::at> srcs;
for (auto it = outputMap_.begin(); it != outputMap_.end(); ++it) {
MKLDNNMatrixPtr src =
std::dynamic_pointer_cast<MKLDNNMatrix>(it->second->grad);
CHECK(src) << "should be MKLDNNMatrix";
auto srcDims = src->getDims();
auto dstDims = out->getDims();
CHECK_EQ(srcDims.size(), dstDims.size());
for (size_t i = 0; i < srcDims.size(); ++i) {
CHECK_EQ(srcDims[i], dstDims[i]);
}
VLOG(MKLDNN_BASE) << getName() << " has output grad " << it->first
<< ", format " << src->getFormat();
srcPDs.push_back(src->getPrimitiveDesc());
srcs.push_back(*src);
}
// TODO(TJ): remove me when mkldnn sum support different formats
for (size_t i = 1; i < srcPDs.size(); ++i) {
CHECK(srcPDs[0] == srcPDs[i]);
}
tmpOutGrad_ = out;
tmpCvt_ = nullptr;
if (out->getPrimitiveDesc() != srcPDs[0]) {
tmpOutGrad_ = MKLDNNMatrix::create(srcPDs[0]);
tmpCvt_ = MKLDNNMatrix::createReorder(tmpOutGrad_, out);
CHECK(tmpCvt_);
pipelineMergeGrad_.push_back(*tmpCvt_);
}
auto sumPD =
sum::primitive_desc(tmpOutGrad_->getMemoryDesc(), scales, srcPDs);
mergeGrad_.reset(new sum(sumPD, srcs, *tmpOutGrad_));
pipelineMergeGrad_.insert(pipelineMergeGrad_.begin(), *mergeGrad_);
}
} // namespace paddle
......@@ -58,11 +58,31 @@ protected:
std::vector<mkldnn::primitive> pipelineFwd_;
std::vector<mkldnn::primitive> pipelineBwd_;
// MKLDNNMatrixPtr with internal format
/* Value and grad are seperated as internal and external buffers.
* Each MKLDNNLayer must init or reset internal buffer at least,
* and the external buffer format is always nchw of nc(when h==w==1),
* which is the same format as paddle.
* The output_.value and output_.grad always save the external data,
* when mixed with cpu device.
* When all layers are mkldnn layers, they could save internal data.
*/
// below MKLDNNMatrix buffers are all internal buffers
MKLDNNMatrixPtr inVal_;
MKLDNNMatrixPtr inGrad_;
MKLDNNMatrixPtr outVal_;
MKLDNNMatrixPtr outGrad_;
// below are external value and grad
MKLDNNMatrixPtr extInVal_;
MKLDNNMatrixPtr extInGrad_;
MKLDNNMatrixPtr extOutVal_;
MKLDNNMatrixPtr extOutGrad_;
// convert handle between external and internal buffers
std::shared_ptr<mkldnn::reorder> cvtInVal_;
std::shared_ptr<mkldnn::reorder> cvtInGrad_;
std::shared_ptr<mkldnn::reorder> cvtOutVal_;
std::shared_ptr<mkldnn::reorder> cvtOutGrad_;
// weight and bias are always internal buffers
MKLDNNMatrixPtr wgtVal_;
MKLDNNMatrixPtr wgtGrad_;
MKLDNNMatrixPtr biasVal_;
......@@ -91,6 +111,7 @@ public:
oh_(0),
ow_(0),
needResetBwd_(true),
outputOnlyMKLDNN_(false),
engine_(mkldnn::engine::cpu, 0),
stream_(nullptr),
fwd_(nullptr),
......@@ -99,92 +120,9 @@ public:
~MKLDNNLayer() {}
virtual bool init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
CHECK(FLAGS_use_mkldnn) << "MkldnnLayers only support use_mkldnn."
<< "Please set WITH_MKLDNN=ON "
<< "and set use_mkldnn=True";
CHECK(!useGpu_) << "Do not support GPU yet";
// set device id before Layer::init
setDevice(MKLDNN_DEVICE);
// change param device to MKLDNN device
setParamsDevice(MKLDNN_DEVICE, parameterMap);
if (!Layer::init(layerMap, parameterMap)) {
return false;
}
setOutputMap();
checkCPUOutputsNumber();
stream_.reset(new MKLDNNStream());
engine_ = CPUEngine::Instance().getEngine();
return true;
}
void forward(PassType passType) override {
passType_ = passType;
{
REGISTER_TIMER_INFO("mkldnn_FwdTimer", getName().c_str());
CHECK(!inputLayers_.empty());
copySeqInfoToOutputs();
size_t elemenCnt = inputLayers_[0]->getOutput().value->getElementCnt();
if (inputElemenCnt_ != elemenCnt) {
VLOG(MKLDNN_BASE) << getName() << " reset mkldnn forward";
// reset when input total sizes changed, not only the batchsize
inputElemenCnt_ = elemenCnt;
pipelineFwd_.clear();
reshape(bs_, ic_, ih_, iw_, oc_, oh_, ow_);
resetFwd(pipelineFwd_, inVal_, wgtVal_, biasVal_, outVal_);
convertWeightsFromPaddle();
needResetBwd_ = true;
}
if (inputLayers_[0]->getType() == "data") {
updateInputData();
}
if (!outputOnlyMKLDNN_) {
clearGrads();
}
stream_->submit(pipelineFwd_);
}
/* activation */ {
REGISTER_TIMER_INFO("FwActTimer", getName().c_str());
forwardActivation();
}
}
void backward(const UpdateCallback& callback) override {
if (needResetBwd_) {
VLOG(MKLDNN_BASE) << getName() << " reset mkldnn backward";
pipelineBwd_.clear();
pipelineMergeGrad_.clear();
mergeGrad_ = nullptr;
resetBwd(pipelineBwd_, inGrad_, wgtGrad_, biasGrad_, outGrad_);
needResetBwd_ = false;
}
// merge grad must before backward activation
if (mergeGrad_) {
REGISTER_TIMER_INFO("MergeBpGrad", getName().c_str());
stream_->submit(pipelineMergeGrad_);
}
{
REGISTER_TIMER_INFO("BpActTimer", getName().c_str());
backwardActivation();
}
{
REGISTER_TIMER_INFO("mkldnn_bwdTimer", getName().c_str());
stream_->submit(pipelineBwd_);
}
{
REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
updateWeights(callback);
}
}
virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
virtual void forward(PassType passType);
virtual void backward(const UpdateCallback& callback);
/**
* reshape the input image sizes
......@@ -195,7 +133,7 @@ public:
int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) = 0;
/**
* reset the mkldnn forward primitve and memory
* reset the mkldnn forward primitve and memories
* only would be called when input size changes
*/
virtual void resetFwd(std::vector<mkldnn::primitive>& pipeline,
......@@ -205,7 +143,7 @@ public:
MKLDNNMatrixPtr& out) = 0;
/**
* reset the mkldnn backward primitve and memory for mkldnn fc
* reset the mkldnn backward primitve and memories
* only would be called when needed
*/
virtual void resetBwd(std::vector<mkldnn::primitive>& pipeline,
......@@ -214,12 +152,6 @@ public:
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) = 0;
/**
* Update input value data when input layer is "data" type.
* Since the input value data address might be changed.
*/
virtual void updateInputData() {}
/**
* Update weights and biases if necessary.
*/
......@@ -246,131 +178,78 @@ protected:
/**
* reshape the input image sizes and input batchsize
*/
virtual void reshapeInput(int& batchsize, int& height, int& width) {
const Argument& input = inputLayers_[0]->getOutput();
batchsize = input.getBatchSize();
int h = input.getFrameHeight();
int w = input.getFrameWidth();
if (h != 0) {
height = h;
}
if (w != 0) {
width = w;
}
}
void reshapeInput(int& batchsize, int& height, int& width);
/**
* reshape output image sizes
*/
virtual void reshapeOutput(size_t height, size_t width) {
output_.setFrameHeight(height);
output_.setFrameWidth(width);
for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
outputOtherDevice_[i].setFrameHeight(height);
outputOtherDevice_[i].setFrameWidth(width);
}
}
void reshapeOutput(size_t height, size_t width);
/**
* reset the output grad matrix from primitive desc.
* and reset the merge grad primitive if needed.
* note: when this layer has serval outputs,
* it could not be mixed with cpu device,
* since it can not get memory desc from cpu device.
* reset MKLDNNMatrix from Matrix and internal primitive desc.
* reset nullptr if matrix or primitive desc is empty
*/
virtual void resetOutGrad(MKLDNNMatrixPtr& out,
mkldnn::memory::primitive_desc pd) {
CHECK(outputIsOnlyMKLDNN()) << "do not support mixed with other device yet";
mergeGrad_ = nullptr;
pipelineMergeGrad_.clear();
out = MKLDNNMatrix::create(output_.grad, pd);
if (outputMap_.size() <= 1) {
return;
}
std::vector<double> scales(outputMap_.size(), 1.0);
std::vector<mkldnn::memory::primitive_desc> srcPDs;
std::vector<mkldnn::primitive::at> srcs;
for (auto it = outputMap_.begin(); it != outputMap_.end(); ++it) {
MKLDNNMatrixPtr src =
std::dynamic_pointer_cast<MKLDNNMatrix>(it->second->grad);
VLOG(MKLDNN_BASE) << getName() << " has output grad " << it->first;
CHECK(src) << "should be MKLDNNMatrix";
auto srcDims = src->getDims();
auto dstDims = out->getDims();
CHECK_EQ(srcDims.size(), dstDims.size());
for (size_t i = 0; i < srcDims.size(); ++i) {
CHECK_EQ(srcDims[i], dstDims[i]);
}
srcPDs.push_back(src->getPrimitiveDesc());
srcs.push_back(*src);
}
// TODO(TJ): remove me when mkldnn sum support different formats
for (size_t i = 1; i < srcPDs.size(); ++i) {
CHECK(srcPDs[0] == srcPDs[i]);
}
tmpOutGrad_ = nullptr;
tmpCvt_ = nullptr;
if (out->getPrimitiveDesc() != srcPDs[0]) {
tmpOutGrad_ = MKLDNNMatrix::create(nullptr, srcPDs[0]);
tmpCvt_ = MKLDNNMatrix::createReorder(tmpOutGrad_, out);
CHECK(tmpCvt_);
pipelineMergeGrad_.push_back(*tmpCvt_);
} else {
tmpOutGrad_ = out;
}
void resetWithMatrix(MKLDNNMatrixPtr& dnn,
const MatrixPtr& mat,
mkldnn::memory::primitive_desc pd);
auto sumPD = mkldnn::sum::primitive_desc(
tmpOutGrad_->getMemoryDesc(), scales, srcPDs);
mergeGrad_.reset(new mkldnn::sum(sumPD, srcs, *tmpOutGrad_));
pipelineMergeGrad_.insert(pipelineMergeGrad_.begin(), *mergeGrad_);
}
/**
* reset input value from input MKLDNNMatrix and internal primitive desc.
* reset both internal and external buffer and create reorder if necessary.
*/
void resetInValue(
MKLDNNMatrixPtr& in,
const std::shared_ptr<mkldnn::memory::primitive_desc>& intPD = nullptr);
/**
* reset input grad from primitive desc.
* this function is avaiable for input is only mkldnn
* or input do not care cpu device
* reset output value from internal primitive desc.
* reset both internal and external buffer and create reorder if necessary.
*/
virtual void resetInGrad(MKLDNNMatrixPtr& in,
mkldnn::memory::primitive_desc pd) {
LayerPtr& input = inputLayers_[0];
const MatrixPtr& grad =
input->getOutputMapSize() > 1 ? nullptr : input->getOutput().grad;
in = MKLDNNMatrix::create(grad, pd);
Argument& arg = input->getOutput(this->getName());
arg.grad = std::dynamic_pointer_cast<Matrix>(in);
}
void resetOutValue(MKLDNNMatrixPtr& out,
mkldnn::memory::primitive_desc intPD);
/**
* print info about sizes
* reset input grad from internal primitive desc.
* reset both internal and external buffer and create reorder if necessary.
*/
virtual void printSizeInfo() {
VLOG(MKLDNN_SIZES) << getName() << ": bs: " << bs_ << ", ic: " << ic_
<< ", ih: " << ih_ << ", iw: " << iw_ << ", oc: " << oc_
<< ", oh: " << oh_ << ", ow: " << ow_;
}
void resetInGrad(MKLDNNMatrixPtr& in, mkldnn::memory::primitive_desc intPD);
/**
* Print the mkldnn memory format flow of value
* reset output grad from internal primitive desc.
* merge grad if necessary.
* reset both internal and external buffer and create reorder if necessary.
* note: about merge grad, when this layer has several outputs,
* it could not be mixed with cpu device,
* since it can not get memory desc from cpu device.
*/
virtual void printValueFormatFlow() {
if (inVal_ && outVal_) {
VLOG(MKLDNN_FMTS) << inVal_->getFormat() << " >>> "
<< outVal_->getFormat();
}
}
void resetOutGrad(MKLDNNMatrixPtr& out, mkldnn::memory::primitive_desc intPD);
/**
* reset the merge grad primitive if necessary.
* note: do not support the grads mixed with cpu device,
* since it can not get memory desc from cpu device.
*/
void resetMergeGrad(MKLDNNMatrixPtr& out);
protected:
/**
* Set deviceId of this layer.
*/
void setDevice(int id) { deviceId_ = id; }
/**
* Print the mkldnn memory format flow of grad
* check the format is nchw or nc,
* which is supported by Paddle default memory layout
*/
virtual void printGradFormatFlow() {
if (inGrad_ && outGrad_) {
VLOG(MKLDNN_FMTS) << inGrad_->getFormat() << " <<< "
<< outGrad_->getFormat();
bool isPaddleFormat(mkldnn::memory::format fmt) {
if (fmt == mkldnn::memory::format::nchw ||
fmt == mkldnn::memory::format::nc) {
return true;
} else {
return false;
}
}
protected:
/**
* If input only has MKLDNN device.
* Otherwise, only support the previous layer using CPU device.
......@@ -380,7 +259,6 @@ protected:
if (prevDevice == MKLDNN_DEVICE) {
return true;
} else {
// do not support GPU yet
CHECK_EQ(prevDevice, CPU_DEVICE) << "Only support CPU yet";
return false;
}
......@@ -400,20 +278,76 @@ protected:
}
/**
* Set deviceId of this layer.
* print info about sizes
*/
void setDevice(int id) { deviceId_ = id; }
virtual void printSizeInfo() {
VLOG(MKLDNN_SIZES) << getName() << ": bs: " << bs_ << ", ic: " << ic_
<< ", ih: " << ih_ << ", iw: " << iw_ << ", oc: " << oc_
<< ", oh: " << oh_ << ", ow: " << ow_;
}
/**
* print the mkldnn memory format of value
*/
virtual void printValueFormat() {
if (extInVal_) {
VLOG(MKLDNN_FMTS) << extInVal_->getFormat() << " >>> ";
}
if (inVal_) {
VLOG(MKLDNN_FMTS) << inVal_->getFormat() << " >>>";
}
if (outVal_) {
VLOG(MKLDNN_FMTS) << outVal_->getFormat() << " >>> ";
}
if (extOutVal_) {
VLOG(MKLDNN_FMTS) << extOutVal_->getFormat();
}
if (wgtVal_) {
VLOG(MKLDNN_FMTS) << "Weight value format: " << wgtVal_->getFormat();
}
if (biasVal_) {
VLOG(MKLDNN_FMTS) << "Bias value format: " << biasVal_->getFormat();
}
}
/**
* print the mkldnn memory format of grad
*/
virtual void printGradFormat() {
if (extOutGrad_) {
VLOG(MKLDNN_FMTS) << extOutGrad_->getFormat();
}
if (outGrad_) {
VLOG(MKLDNN_FMTS) << outGrad_->getFormat() << " <<< ";
}
if (inGrad_) {
VLOG(MKLDNN_FMTS) << inGrad_->getFormat() << " <<<";
}
if (extInGrad_) {
VLOG(MKLDNN_FMTS) << extInGrad_->getFormat() << " <<< ";
}
if (wgtGrad_) {
VLOG(MKLDNN_FMTS) << "Weight grad format: " << wgtGrad_->getFormat();
}
if (biasGrad_) {
VLOG(MKLDNN_FMTS) << "Bias grad format: " << biasGrad_->getFormat();
}
}
private:
/**
* clear all grad
*/
void clearGrads() {
if (output_.grad) {
output_.grad->zeroMem();
}
for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
if (outputOtherDevice_[i].grad) {
outputOtherDevice_[i].grad->zeroMem();
}
}
}
/**
* Set deviceId of the params used in this layer.
......@@ -449,6 +383,19 @@ private:
}
}
/**
* if have cpu device, share value and grad data with output_
*/
void shareCPUDevice() {
if (outputIsOnlyMKLDNN()) {
return;
}
for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
outputOtherDevice_[i].value = output_.value;
outputOtherDevice_[i].grad = output_.grad;
}
}
/**
* Check the cpu device number of outputOtherDevice_.
* should have only one at most.
......
......@@ -85,8 +85,6 @@ void MKLDNNPoolLayer::resetFwd(std::vector<primitive>& pipeline,
resetFwdPD(fwdPD_, in, out);
resetFwdPipeline(pipeline, fwdPD_, in, out);
printValueFormatFlow();
}
void MKLDNNPoolLayer::resetBwd(std::vector<primitive>& pipeline,
......@@ -101,65 +99,22 @@ void MKLDNNPoolLayer::resetBwd(std::vector<primitive>& pipeline,
resetBwdPD(pd, in, out);
resetBwdPipeline(pipeline, pd, in, out);
printGradFormatFlow();
}
void MKLDNNPoolLayer::updateInputData() {
inVal_->setData(getInputValue(0, CPU_DEVICE)->getData());
}
void MKLDNNPoolLayer::resetFwdBuffers(MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& out) {
resetInValue(in);
resetOutValue(out);
}
void MKLDNNPoolLayer::resetInValue(MKLDNNMatrixPtr& in) {
if (inputIsOnlyMKLDNN()) {
const MatrixPtr& dnnIn = getInputValue(0);
in = std::dynamic_pointer_cast<MKLDNNMatrix>(dnnIn);
CHECK(in) << "Input should be MKLDNNMatrix";
} else {
CHECK_EQ(getPrev(0)->getDeviceId(), CPU_DEVICE) << "Only support CPU yet";
const MatrixPtr& cpuIn = getInputValue(0, CPU_DEVICE);
in = MKLDNNMatrix::create(
cpuIn, {bs_, ic_, ih_, iw_}, format::nchw, engine_);
}
}
void MKLDNNPoolLayer::resetOutValue(MKLDNNMatrixPtr& out) {
CHECK(inVal_) << "Should reset input value first";
memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_};
out = MKLDNNMatrix::create(
output_.value, outDims, inVal_->getFormat(), engine_);
// create reorder if output value has cpu device and pd do not match
cpuOutVal_ = nullptr;
cvtOutVal_ = nullptr;
if (!outputIsOnlyMKLDNN()) {
const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).value;
cpuOutVal_ = MKLDNNMatrix::create(cpuOut, outDims, format::nchw, engine_);
if (cpuOutVal_->getPrimitiveDesc() != out->getPrimitiveDesc()) {
out = MKLDNNMatrix::create(nullptr, out->getPrimitiveDesc());
cvtOutVal_ = MKLDNNMatrix::createReorder(out, cpuOutVal_);
CHECK(cvtOutVal_) << "should not be emptry";
} else {
cpuOut->setData(output_.value->getData());
cpuOutVal_ = out;
}
output_.value = std::dynamic_pointer_cast<Matrix>(cpuOutVal_);
return;
}
output_.value = std::dynamic_pointer_cast<Matrix>(outVal_);
CHECK(in);
auto outPD =
MKLDNNMatrix::createPrimitiveDesc(outDims, in->getFormat(), engine_);
resetOutValue(out, outPD);
}
void MKLDNNPoolLayer::resetFwdPD(std::shared_ptr<pool_fwd::primitive_desc>& pd,
MKLDNNMatrixPtr in,
MKLDNNMatrixPtr out) {
memory::dims inDims = memory::dims{bs_, ic_, ih_, iw_};
memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_};
memory::dims kernels = memory::dims{fh_, fw_};
memory::dims strides = memory::dims{sh_, sw_};
memory::dims padL = memory::dims{ph_, pw_};
......@@ -194,58 +149,26 @@ void MKLDNNPoolLayer::resetFwdPipeline(
? std::make_shared<pool_fwd>(pool_fwd(*pd, *in, *out, *workspace_))
: std::make_shared<pool_fwd>(pool_fwd(*pd, *in, *out));
pipeline.push_back(*fwd_);
if (cvtOutVal_) {
pipeline.push_back(*cvtOutVal_);
}
}
void MKLDNNPoolLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& out) {
resetOutGrad(out);
resetInGrad(in);
}
void MKLDNNPoolLayer::resetOutGrad(MKLDNNMatrixPtr& out) {
cpuOutGrad_ = nullptr;
cvtOutGrad_ = nullptr;
CHECK(outVal_);
if (outputIsOnlyMKLDNN()) {
MKLDNNLayer::resetOutGrad(out, outVal_->getPrimitiveDesc());
} else {
const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).grad;
// always share the same grad data of CPU output
// then the activation can get the right grad from output_.grad
output_.grad->setData(cpuOut->getData());
cpuOutGrad_ = MKLDNNMatrix::create(
cpuOut, memory::dims{bs_, oc_, oh_, ow_}, format::nchw, engine_);
if (cpuOutGrad_->getPrimitiveDesc() != outVal_->getPrimitiveDesc()) {
out = MKLDNNMatrix::create(nullptr, outVal_->getPrimitiveDesc());
cvtOutGrad_ = MKLDNNMatrix::createReorder(cpuOutGrad_, out);
CHECK(cvtOutGrad_) << "should not be emptry";
} else {
out = cpuOutGrad_;
}
}
}
void MKLDNNPoolLayer::resetInGrad(MKLDNNMatrixPtr& in) {
in = nullptr;
if (inputLayers_[0]->getOutput().grad == nullptr) {
return;
}
CHECK(inVal_);
MKLDNNLayer::resetInGrad(in, inVal_->getPrimitiveDesc());
CHECK(inVal_ && outVal_);
resetOutGrad(out, outVal_->getPrimitiveDesc());
resetInGrad(in, inVal_->getPrimitiveDesc());
}
void MKLDNNPoolLayer::resetBwdPD(std::shared_ptr<pool_bwd::primitive_desc>& pd,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& out) {
pd = nullptr;
if (in == nullptr) {
return;
}
memory::dims kernels = memory::dims{fh_, fw_};
memory::dims strides = memory::dims{sh_, sw_};
memory::dims padL = memory::dims{ph_, pw_};
memory::dims padR = getPaddingR();
CHECK(in);
CHECK(out);
auto bwdDesc = pool_bwd::desc(poolAlgo_,
in->getMemoryDesc(),
......@@ -263,8 +186,8 @@ void MKLDNNPoolLayer::resetBwdPipeline(
std::shared_ptr<pool_bwd::primitive_desc>& pd,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& out) {
if (cvtOutGrad_) {
pipeline.push_back(*cvtOutGrad_);
if (pd == nullptr) {
return;
}
bwdData_ =
......
......@@ -38,13 +38,6 @@ protected:
// pooling_avg or pooling_max
mkldnn::algorithm poolAlgo_;
// MKLDNNMatrixPtr which should be created from CPU Device
MKLDNNMatrixPtr cpuOutVal_;
MKLDNNMatrixPtr cpuOutGrad_;
// convert handle between CPU device and MKLDNN device
std::shared_ptr<mkldnn::reorder> cvtOutVal_;
std::shared_ptr<mkldnn::reorder> cvtOutGrad_;
// save forward primitive_desc, which can be used backward
std::shared_ptr<pool_fwd::primitive_desc> fwdPD_;
// according to https://github.com/01org/mkl-dnn/blob/master/tests/gtests/
......@@ -74,8 +67,6 @@ public:
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) override;
void updateInputData() override;
void printSizeInfo() override {
MKLDNNLayer::printSizeInfo();
VLOG(MKLDNN_SIZES) << getName() << ": fh: " << fh_ << ", fw: " << fw_
......@@ -90,8 +81,6 @@ protected:
* reset pipeline.
*/
void resetFwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out);
void resetInValue(MKLDNNMatrixPtr& in);
void resetOutValue(MKLDNNMatrixPtr& out);
void resetFwdPD(std::shared_ptr<pool_fwd::primitive_desc>& pd,
MKLDNNMatrixPtr in,
MKLDNNMatrixPtr out);
......@@ -106,8 +95,6 @@ protected:
* reset pipeline.
*/
void resetBwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out);
void resetOutGrad(MKLDNNMatrixPtr& out);
void resetInGrad(MKLDNNMatrixPtr& in);
void resetBwdPD(std::shared_ptr<pool_bwd::primitive_desc>& pd,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& out);
......
......@@ -97,7 +97,7 @@ void MKLDNNTester::randomWgtDatas() {
parameters_[REF][i]->randomize();
dnnValue->copyFrom(*refValue);
VLOG(lvl_) << "Random weight data " << parameters_[DNN][i]->getName();
VLOG(MKLDNN_TESTS) << "Random weight " << parameters_[DNN][i]->getName();
printVector(dnnValue);
}
}
......@@ -109,7 +109,7 @@ void MKLDNNTester::randomBotDatas() {
dataLayers_[REF][i]->getOutputValue()->randomizeUniform();
dataLayers_[DNN][i]->getOutputValue()->copyFrom(
*(dataLayers_[REF][i]->getOutputValue()));
VLOG(lvl_) << "Input " << i << " data:";
VLOG(MKLDNN_TESTS) << "Random Foward, InputValue " << i;
printMatrix(dataLayers_[REF][i]->getOutputValue());
}
}
......@@ -118,12 +118,12 @@ void MKLDNNTester::randomTopDiffs() {
refLayer_->getOutputGrad()->randomizeUniform();
dnnLayer_->getOutput(CPU_DEVICE)
.grad->copyFrom(*(refLayer_->getOutputGrad()));
VLOG(lvl_) << "Random Backward Input, TopDiff: ";
VLOG(MKLDNN_TESTS) << "Random Backward, OutputGrad";
printMatrix(refLayer_->getOutputGrad());
}
void MKLDNNTester::checkForward() {
VLOG(MKLDNN_ALL) << "Check Forward";
VLOG(MKLDNN_TESTS) << "Check Forward";
printTopDatas();
double delta =
compareMatrix(dnnLayer_->getOutputValue(), refLayer_->getOutputValue());
......@@ -131,15 +131,15 @@ void MKLDNNTester::checkForward() {
}
void MKLDNNTester::checkBackwardData() {
VLOG(MKLDNN_ALL) << "Check Backward Data";
VLOG(MKLDNN_TESTS) << "Check Backward Data";
// TODO(TJ): uncomment me when batch norm ready
// const bool isBN = dnnLayer_->getType() == "mkldnn_batch_norm";
for (size_t i = 0; i < dataLayers_[DNN].size(); ++i) {
const MatrixPtr& dnnDiff = dataLayers_[DNN][i]->getOutputGrad();
const MatrixPtr& refDiff = dataLayers_[REF][i]->getOutputGrad();
VLOG(lvl_) << "Mkldnn Backward Output BotDiff " << i;
VLOG(MKLDNN_ALL) << "MKLDNN Backward Result: InputGrad " << i;
printMatrix(dnnDiff);
VLOG(lvl_) << "Reference Backward Output BotDiff " << i;
VLOG(MKLDNN_ALL) << "Reference Backward Result: InputGrad " << i;
printMatrix(refDiff);
double delta = compareMatrix(dnnDiff, refDiff);
......@@ -153,7 +153,7 @@ void MKLDNNTester::checkBackwardData() {
}
void MKLDNNTester::checkBackwardWgts() {
VLOG(MKLDNN_ALL) << "Check Backward Weight";
VLOG(MKLDNN_TESTS) << "Check Backward Weight";
CHECK_EQ(parameters_[DNN].size(), parameters_[REF].size());
vector<VectorPtr> dnnWgts; // used to temply save mkldnn weights
saveWgt(parameters_[DNN], dnnWgts);
......@@ -165,9 +165,11 @@ void MKLDNNTester::checkBackwardWgts() {
for (size_t i = 0; i < parameters_[DNN].size(); ++i) {
const VectorPtr& dnn = parameters_[DNN][i]->getBuf(PARAMETER_VALUE);
const VectorPtr& ref = parameters_[REF][i]->getBuf(PARAMETER_VALUE);
VLOG(lvl_) << "Mkldnn Output weight " << parameters_[DNN][i]->getName();
VLOG(MKLDNN_ALL) << "MKLDNN Result: weight value"
<< parameters_[DNN][i]->getName();
printVector(dnn);
VLOG(lvl_) << "Reference Output weight " << parameters_[REF][i]->getName();
VLOG(MKLDNN_ALL) << "Reference Result: weight value "
<< parameters_[REF][i]->getName();
printVector(ref);
double delta = compareVector(dnn, ref);
......@@ -240,7 +242,8 @@ void MKLDNNTester::printTopDatas() {
}
for (int n = 0; n < NUM; ++n) {
VLOG(lvl_) << testLayers_[n]->getType() << " forward output TopData: ";
VLOG(MKLDNN_ALL) << testLayers_[n]->getType()
<< " Forward Result: OutputValue";
printMatrix(testLayers_[n]->getOutputValue());
}
}
......@@ -252,7 +255,7 @@ void MKLDNNTester::printMatrix(const MatrixPtr& m) {
std::ostringstream ostr;
m->print(ostr);
VLOG(lvl_) << std::endl << ostr.str();
VLOG(MKLDNN_ALL) << std::endl << ostr.str();
}
void MKLDNNTester::printVector(const VectorPtr& v) {
......@@ -262,7 +265,7 @@ void MKLDNNTester::printVector(const VectorPtr& v) {
std::ostringstream ostr;
v->print(ostr, v->getSize());
VLOG(lvl_) << std::endl << ostr.str();
VLOG(MKLDNN_ALL) << std::endl << ostr.str();
}
double MKLDNNTester::getDelta(const real* d1,
......@@ -314,7 +317,7 @@ void MKLDNNTester::runOnce() {
UpdateCallback updateCallback = [](Parameter* para) {
auto& grad = para->getBuf(PARAMETER_GRADIENT);
auto& value = para->getBuf(PARAMETER_VALUE);
real lr = 1e-3;
real lr = 1e-2;
value->add(*grad, lr);
grad->zeroMem();
};
......@@ -340,10 +343,9 @@ void MKLDNNTester::run(const TestConfig& dnn,
size_t batchSize,
size_t inputImgH,
size_t inputImgW,
bool printDetails,
size_t iter,
float epsilon,
bool log,
int level) {
float epsilon) {
CHECK(dnn.layerConfig.type().compare(0, 7, "mkldnn_") == 0 ||
dnn.layerConfig.active_type().compare(0, 7, "mkldnn_") == 0)
<< "should be MKLDNN layer or MKLDNN activation";
......@@ -359,10 +361,9 @@ void MKLDNNTester::run(const TestConfig& dnn,
ih_ = inputImgH;
iw_ = inputImgW;
log_ = printDetails;
iter_ = iter;
eps_ = epsilon;
log_ = log;
lvl_ = level;
// Firstly test mkldnn init from PARAM_FORMAT_ORIGINAL weight
reset(dnn, ref, batchSize);
......@@ -531,9 +532,11 @@ void MKLDNNTester::getOutResult(const std::string& configPath,
void MKLDNNTester::compareResult(DataOut& ref, DataOut& dnn, float eps) {
CHECK_EQ(ref.outValues.size(), dnn.outValues.size());
CHECK_EQ(ref.paraValues.size(), dnn.paraValues.size());
VLOG(MKLDNN_TESTS) << "compare value size: " << ref.outValues.size();
for (size_t i = 0; i < ref.outValues.size(); i++) {
EXPECT_LE(fabs(compareMatrix(ref.outValues[i], dnn.outValues[i])), eps);
}
VLOG(MKLDNN_TESTS) << "compare param size: " << ref.outValues.size();
for (size_t i = 0; i < ref.paraValues.size(); i++) {
EXPECT_LE(fabs(compareVector(ref.paraValues[i], dnn.paraValues[i])), eps);
}
......@@ -544,9 +547,10 @@ void MKLDNNTester::runBranchesTest(const std::string& configPath,
float eps) {
DataIn in;
initArgument(in, configPath, iter);
DataOut outCpu, outDnn;
VLOG(MKLDNN_TESTS) << "runing cpu network";
getOutResult(configPath, in, outCpu, false, iter);
VLOG(MKLDNN_TESTS) << "runing mkldnn network";
getOutResult(configPath, in, outDnn, true, iter);
compareResult(outCpu, outDnn, eps);
......
......@@ -58,8 +58,6 @@ protected:
size_t iter_;
/// whether to print out the details
bool log_;
/// vlog level to print the matrix details datas
int lvl_;
/// epsilon
float eps_;
/// input image size, default 1
......@@ -70,7 +68,6 @@ public:
iter_ = iter;
eps_ = epsilon;
log_ = false;
lvl_ = MKLDNN_ALL;
}
~MKLDNNTester() {}
......@@ -81,10 +78,9 @@ public:
size_t batchSize,
size_t inputImgH = 1,
size_t inputImgW = 1,
bool printDetails = false,
size_t iter = 3,
float epsilon = 1e-4,
bool log = false,
int level = MKLDNN_ALL);
float epsilon = 1e-4);
static void runBranchesTest(const std::string& configPath,
size_t iter = 3,
float eps = 1e-4);
......
......@@ -51,7 +51,10 @@ def test_sparse_non_value_no_seq(setting, filename):
yield [(i + 1) * (j + 1) for j in xrange(10)]
@provider(input_types=[sparse_vector(30000, seq_type=SequenceType.NO_SEQUENCE)])
@provider(input_types=[
sparse_float_vector(
30000, seq_type=SequenceType.NO_SEQUENCE)
])
def test_sparse_value_no_seq(setting, filename):
for i in xrange(200):
yield [((i + 1) * (j + 1), float(j) / float(i + 1)) for j in xrange(10)]
......
......@@ -18,7 +18,7 @@ using namespace mkldnn; // NOLINT
namespace paddle {
MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m, memory::primitive_desc pd) {
MKLDNNMatrixPtr MKLDNNMatrix::create(memory::primitive_desc pd, MatrixPtr m) {
memory::desc md = pd.desc();
size_t ndims = md.data.ndims;
int* dims = md.data.dims;
......@@ -41,12 +41,12 @@ MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m, memory::primitive_desc pd) {
return std::make_shared<MKLDNNMatrix>(cpuMatrix, pd);
}
MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m,
memory::dims dims,
MKLDNNMatrixPtr MKLDNNMatrix::create(memory::dims dims,
memory::format fmt,
engine& eg,
MatrixPtr m,
mkldnn::memory::data_type dtype) {
return create(m, memory::primitive_desc(memory::desc(dims, dtype, fmt), eg));
return create(createPrimitiveDesc(dims, fmt, eg, dtype), m);
}
std::shared_ptr<reorder> MKLDNNMatrix::createReorder(const MKLDNNMatrixPtr& src,
......
......@@ -40,24 +40,37 @@ public:
/**
* Create MKLDNNMatrix from a MatrixPtr and memory primitive_desc
*/
static MKLDNNMatrixPtr create(MatrixPtr m, mkldnn::memory::primitive_desc pd);
static MKLDNNMatrixPtr create(mkldnn::memory::primitive_desc pd,
MatrixPtr m = nullptr);
/**
* Create MKLDNNMatrix from a MatrixPtr and memory details info
*/
static MKLDNNMatrixPtr create(
MatrixPtr m,
mkldnn::memory::dims dims,
mkldnn::memory::format fmt,
mkldnn::engine& eg,
MatrixPtr m = nullptr,
mkldnn::memory::data_type dtype = mkldnn::memory::data_type::f32);
/**
* Create primitive descriptor.
* default with f32 dtype
*/
static mkldnn::memory::primitive_desc createPrimitiveDesc(
const mkldnn::memory::dims dims,
const mkldnn::memory::format& fmt,
const mkldnn::engine& eg,
const mkldnn::memory::data_type& dtype = mkldnn::memory::data_type::f32) {
return mkldnn::memory::primitive_desc(memory::desc(dims, dtype, fmt), eg);
}
/**
* Create Memory descriptor.
* default with any format and f32 dtype
*/
static mkldnn::memory::desc createMemoryDesc(
const mkldnn::memory::dims& dims,
const mkldnn::memory::dims dims,
const mkldnn::memory::format& fmt = mkldnn::memory::format::any,
const mkldnn::memory::data_type& dtype = mkldnn::memory::data_type::f32) {
return mkldnn::memory::desc(dims, dtype, fmt);
......
......@@ -116,7 +116,8 @@ set(DEPS_OPS
sum_op
pool_op
pool_with_index_op
sequence_conv_op)
sequence_conv_op
lstm_op)
op_library(recurrent_op SRCS recurrent_op.cc rnn/recurrent_op_utils.cc
......@@ -128,7 +129,7 @@ op_library(sum_op DEPS net_op)
op_library(pool_op DEPS pooling)
op_library(pool_with_index_op DEPS pooling)
op_library(sequence_conv_op DEPS sequence_project)
op_library(lstm_op DEPS sequence2batch lstm_compute)
list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
foreach(src ${GENERAL_OPS})
......
......@@ -27,8 +27,8 @@ class ClipOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of ClipOp should not be null.");
auto x_dims = ctx->GetInputDim("X");
auto max = Attr<float>("max");
auto min = Attr<float>("min");
auto max = ctx->Attrs().Get<float>("max");
auto min = ctx->Attrs().Get<float>("min");
PADDLE_ENFORCE_LT(min, max, "max should be greater than min.");
ctx->SetOutputDim("Out", x_dims);
ctx->ShareLoD("X", /*->*/ "Out");
......
......@@ -114,7 +114,7 @@ class GemmConv2DKernel : public framework::OpKernel<T> {
// im2col
Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
im2col(context.device_context(), in_slice, col, strides[0], strides[1],
paddings[0], paddings[1]);
paddings[0], paddings[0], paddings[1], paddings[1]);
// gemm
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
......@@ -213,7 +213,8 @@ class GemmConvGrad2DKernel : public framework::OpKernel<T> {
Tensor in_grad_slice =
in_grad_batch.Slice(g * in_step, (g + 1) * in_step);
col2im(context.device_context(), in_grad_slice, col, strides[0],
strides[1], paddings[0], paddings[1]);
strides[1], paddings[0], paddings[0], paddings[1],
paddings[1]);
}
}
}
......@@ -235,7 +236,8 @@ class GemmConvGrad2DKernel : public framework::OpKernel<T> {
out_grad_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
im2col(context.device_context(), in_slice, col, strides[0],
strides[1], paddings[0], paddings[1]);
strides[1], paddings[0], paddings[0], paddings[1],
paddings[1]);
// gemm
Tensor filter_grad_slice =
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/conv2dtranspose_op.h"
namespace paddle {
namespace operators {
void Conv2DTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
PADDLE_ENFORCE(ctx->HasInput("Input"),
"Input(Input) of Conv2DTransposeOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Filter"),
"Input(Filter) of Conv2DTransposeOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Output"),
"Output(Output) of Conv2DTransposeOp should not be null.");
auto in_dims = ctx->GetInputDim("Input");
auto filter_dims = ctx->GetInputDim("Filter");
std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
for (size_t i = 0; i < paddings.size(); ++i) {
PADDLE_ENFORCE_EQ(paddings[i], 0,
"No Padding allowed in conv transpose op.");
}
PADDLE_ENFORCE_EQ(in_dims.size(), 4,
"Conv2DTransposeOp input should be 4-D tensor.");
PADDLE_ENFORCE_EQ(filter_dims.size(), 4,
"Conv2DTransposeOp filter should be 4-D tensor.");
PADDLE_ENFORCE_EQ(in_dims[1], filter_dims[0],
"input and kernel input dimension should be equal.");
auto output_height = (in_dims[2] - 1) * strides[0] + filter_dims[2];
auto output_width = (in_dims[3] - 1) * strides[1] + filter_dims[3];
ctx->SetOutputDim("Output",
{in_dims[0], filter_dims[1], output_height, output_width});
}
Conv2DTransposeOpMaker::Conv2DTransposeOpMaker(
framework::OpProto* proto, framework::OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput(
"Input",
"(Tensor) The input tensor of convolution transpose operator. "
"The format of input tensor is NCHW. Where N is batch size, C is the "
"number of input channels, H and W is the height and width of image.");
AddInput("Filter",
"(Tensor) The filter tensor of convolution transpose operator."
"The format of the filter tensor is CMHW, where C is the number of "
"output image channels, M is the number of input image channels, "
"H and W is height and width of filter. "
"We enforce groups number == 1 and padding == 0 in "
"convolution transpose Scenario.");
AddOutput("Output",
"(Tensor) The output tensor of convolution transpose operator."
"The format of output tensor is also NCHW.");
AddAttr<std::vector<int>>("strides",
"strides of convolution transpose operator.")
.SetDefault({1, 1});
AddAttr<std::vector<int>>("paddings",
"paddings of convolution transpose operator.")
.SetDefault({0, 0});
AddComment(R"DOC(
The convolution transpose operation calculates the output based on the input, filter
and strides, paddings, groups parameters. The size of each dimension of the
parameters is checked in the infer-shape.
)DOC");
}
void Conv2DTransposeOpGrad::InferShape(
framework::InferShapeContext* ctx) const {
auto in_dims = ctx->GetInputDim("Input");
auto filter_dims = ctx->GetInputDim("Filter");
if (ctx->HasOutput(framework::GradVarName("Input"))) {
ctx->SetOutputDim(framework::GradVarName("Input"), in_dims);
}
if (ctx->HasOutput(framework::GradVarName("Filter"))) {
ctx->SetOutputDim(framework::GradVarName("Filter"), filter_dims);
}
}
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(conv2dtranspose, ops::Conv2DTransposeOp,
ops::Conv2DTransposeOpMaker, conv2dtranspose_grad,
ops::Conv2DTransposeOpGrad);
REGISTER_OP_CPU_KERNEL(
conv2dtranspose,
ops::GemmConv2DTransposeKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(
conv2dtranspose_grad,
ops::GemmConv2DTransposeGradKernel<paddle::platform::CPUPlace, float>);
/* Copyright (c) 2016 PaddlePaddle Authors All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/conv2dtranspose_op.h"
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(
conv2dtranspose,
ops::GemmConv2DTransposeKernel<paddle::platform::GPUPlace, float>);
REGISTER_OP_GPU_KERNEL(
conv2dtranspose_grad,
ops::GemmConv2DTransposeGradKernel<paddle::platform::GPUPlace, float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h"
#include "paddle/operators/math/im2col.h"
#include "paddle/operators/math/math_function.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using DDim = framework::DDim;
// Define Op classes in .h file so that other conv transpose
// operator implementations can reuse the code.
class Conv2DTransposeOpMaker : public framework::OpProtoAndCheckerMaker {
public:
Conv2DTransposeOpMaker(framework::OpProto* proto,
framework::OpAttrChecker* op_checker);
};
class Conv2DTransposeOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(framework::InferShapeContext* ctx) const override;
};
class Conv2DTransposeOpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(framework::InferShapeContext* ctx) const override;
};
template <typename Place, typename T>
class GemmConv2DTransposeKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const Tensor* input = context.Input<Tensor>("Input");
// The filter will be reshaped, so it should not be constant pointer
Tensor filter = *context.Input<Tensor>("Filter");
Tensor* output = context.Output<Tensor>("Output");
std::vector<int> strides = context.Attr<std::vector<int>>("strides");
// TODO(Zhuoyuan): Paddings can be added in future.
// groups will alway be disabled in conv2dtranspose.
const int batch_size = input->dims()[0];
const int m = input->dims()[1];
const int h = input->dims()[2];
const int w = input->dims()[3];
const int k_h = filter.dims()[2];
const int k_w = filter.dims()[3];
const int c = output->dims()[1]; // output channels
const int o_h = output->dims()[2];
const int o_w = output->dims()[3];
paddle::operators::math::Col2ImFunctor<
paddle::operators::math::ColFormat::kCFO, Place, T>
col2im;
// use col_shape in the im2col and col2im calculation
DDim col_shape = {c, k_h, k_w, h, w};
// use col_matrix_shape in the gemm calculation
DDim col_matrix_shape = {c * k_h * k_w, h * w};
Tensor col;
col.mutable_data<T>(col_shape, context.GetPlace());
// col_matrix shares the same piece of data with col,
// but will be reshaped into a two-dimensional matrix shape
// to call the matrix multiplication interface.
Tensor col_matrix;
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
DDim output_shape = {c, o_h, o_w};
DDim input_matrix_shape = {m, h * w};
DDim filter_matrix_shape = {m, c * k_h * k_w};
filter.Resize(filter_matrix_shape);
// convolution transpose: gemm + col2im (similar to conv-backward on input)
output->mutable_data<T>(context.GetPlace());
auto t = framework::EigenVector<T>::Flatten(*output);
t.device(context.GetEigenDevice<Place>()) = t.constant(static_cast<T>(0));
for (int i = 0; i < batch_size; i++) {
// batch with size (M, h * w)
Tensor input_batch = input->Slice(i, i + 1).Resize(input_matrix_shape);
// filter size: (M, c * k_h * k_w)
// output size: (c, o_h, o_w)
Tensor output_batch = output->Slice(i, i + 1).Resize(output_shape);
// col_matrix = filter * input_batch
// of shape (c * k_h * k_w, h * w)
math::matmul<Place, T>(context.device_context(), filter, true,
input_batch, false, T(1.0), &col_matrix, T(0.0));
col2im(context.device_context(), output_batch, col, strides[0],
strides[1], 0, 0, 0, 0);
}
}
};
template <typename Place, typename T>
class GemmConv2DTransposeGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const Tensor* input = context.Input<Tensor>("Input");
const Tensor* output_grad =
context.Input<Tensor>(framework::GradVarName("Output"));
// For filter, we do not use const pointer b/c we will do reshape,
// but we should avoid modifying its value.
Tensor filter = *context.Input<Tensor>("Filter");
Tensor* input_grad =
context.Output<Tensor>(framework::GradVarName("Input"));
Tensor* filter_grad =
context.Output<Tensor>(framework::GradVarName("Filter"));
std::vector<int> strides = context.Attr<std::vector<int>>("strides");
// Actually, no paddings and groups allowed in conv transpose.
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
const int batch_size = input->dims()[0];
const int m = input->dims()[1];
const int h = input->dims()[2];
const int w = input->dims()[3];
const int k_h = filter.dims()[2];
const int k_w = filter.dims()[3];
const int c = output_grad->dims()[1]; // output channels
const int o_h = output_grad->dims()[2];
const int o_w = output_grad->dims()[3];
// Only im2col functor required for bp to get to the right shape
paddle::operators::math::Im2ColFunctor<
paddle::operators::math::ColFormat::kCFO, Place, T>
im2col;
// use col_shape in the im2col and col2im calculation
DDim col_shape = {c, k_h, k_w, h, w};
// use col_matrix_shape in the gemm calculation
DDim col_matrix_shape_f = {c * h * w, k_h * k_w};
Tensor col;
col.mutable_data<T>(col_shape, context.GetPlace());
// col_matrix shares the same piece of data with col,
// but will be reshaped into a two-dimensional matrix shape
// to call the matrix multiplication interface.
DDim output_shape = {c, o_h, o_w};
DDim input_matrix_shape = {m, h * w};
DDim filter_matrix_shape = {m, c * k_h * k_w};
filter.Resize(filter_matrix_shape);
// convolution transpose grad on input:
// im2col + gemm (similar to conv-forward)
// input need to compute gradient
if (input_grad) {
Tensor col_matrix;
col_matrix.ShareDataWith(col);
DDim col_matrix_shape = {c * k_h * k_w, h * w};
col_matrix.Resize(col_matrix_shape);
input_grad->mutable_data<T>(context.GetPlace());
auto t = framework::EigenVector<T>::Flatten(*input_grad);
t.device(context.GetEigenDevice<Place>()) = t.constant(static_cast<T>(0));
for (int i = 0; i < batch_size; i++) {
// batch with size (c, o_h * o_w)
Tensor output_grad_batch =
output_grad->Slice(i, i + 1).Resize(output_shape);
// filter of size (m, c * k_h * k_w)
// batch with size (m, h, w)
Tensor input_grad_batch =
input_grad->Slice(i, i + 1).Resize(input_matrix_shape);
// im2col: dy from (c, o_h, o_w) -> (c * k_h * k_w, h * w)
im2col(context.device_context(), output_grad_batch, col, strides[0],
strides[1], paddings[0], paddings[0], paddings[1], paddings[1]);
// gemm: dx = filter * dy
// (m, c * k_h * k_w) * (c * k_h * k_w, h * w) -> (m, c, h)
math::matmul<Place, T>(context.device_context(), filter, false,
col_matrix, false, T(1.0), &input_grad_batch,
T(0.0));
}
}
// filter gradient required
if (filter_grad) {
Tensor col_matrix_f;
col_matrix_f.ShareDataWith(col);
DDim col_matrix_shape_f = {c * h * w, k_h * k_w};
col_matrix_f.Resize(col_matrix_shape_f);
filter_grad->mutable_data<T>(context.GetPlace());
Tensor filter_grad_ = *filter_grad;
filter_grad_.Resize(filter_matrix_shape);
auto t = framework::EigenVector<T>::Flatten(filter_grad_);
t.device(context.GetEigenDevice<Place>()) = t.constant(static_cast<T>(0));
for (int i = 0; i < batch_size; ++i) {
// batch with size (c, o_h, o_w)
Tensor output_grad_batch =
output_grad->Slice(i, i + 1).Resize(output_shape);
// input batch
Tensor in_batch = input->Slice(i, i + 1).Resize(input_matrix_shape);
// im2col: (c * h * w, k_h * k_w)
im2col(context.device_context(), output_grad_batch, col, strides[0],
strides[1], paddings[0], paddings[0], paddings[1], paddings[1]);
// gemm: d_filter = x * y_grad^T
// (m, c * h * w) * (k_h * k_w, c * h * w) -> (m, c, h)
math::matmul<Place, T>(context.device_context(), in_batch, false,
col_matrix_f, true, T(1.0), &filter_grad_,
T(1.0));
}
}
}
};
} // namespace operators
} // namespace paddle
......@@ -59,7 +59,8 @@ class CropOpMaker : public framework::OpProtoAndCheckerMaker {
"The input should be a k-D tensor(k > 0 and k < 7)");
AddInput("Y",
"The input used as reference for cropping"
" with the same dimension as X. ");
" with the same dimension as X. ")
.AsDispensable();
AddOutput("Out",
"The output of crop op "
"with the same dimension as X.");
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/framework/op_registry.h"
#include "paddle/operators/net_op.h"
namespace paddle {
namespace operators {
class FCOp : public NetOp {
public:
FCOp(const std::string &type, const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: NetOp(type, inputs, outputs, attrs) {
PADDLE_ENFORCE(!Inputs("X").empty(),
"Inputs(X) of FCOp should not be null.");
PADDLE_ENFORCE(!Inputs("W").empty(),
"Inputs(W) of FCOp should not be null.");
PADDLE_ENFORCE(!Outputs("MulOut").empty(),
"Outputs(MulOut) of FCOp should not be null.");
PADDLE_ENFORCE_NE(Output("Out"), framework::kEmptyVarName,
"Output(Out) of FCOp should not be null.");
auto x = Inputs("X");
auto w = Inputs("W");
auto mul_out = Outputs("MulOut");
PADDLE_ENFORCE_EQ(
x.size(), w.size(),
"The size of inputs X(%d) should be the same as that of weights W(%d).",
x.size(), w.size());
PADDLE_ENFORCE_EQ(mul_out.size(), x.size(),
"The size of intermediate mul_out(%d) should be the same "
"as that of inputs X(%d).",
mul_out.size(), x.size());
size_t n = x.size();
PADDLE_ENFORCE_GE(n, static_cast<size_t>(1),
"The size of inputs X(%d) should be no less than 1.", n);
auto x_num_col_dims = Attr<std::vector<int>>("xNumColDims");
// Set all values or set no values (use the default value)
if (!x_num_col_dims.empty()) {
PADDLE_ENFORCE_EQ(x_num_col_dims.size(), n,
"The size of attribute xNumColDims(%d) should be the "
"same as that of inputs X(%d).",
x_num_col_dims.size(), n);
} else {
x_num_col_dims.resize(n);
for (size_t i = 0; i < n; i++) {
x_num_col_dims[i] = 1;
}
}
// mul_out[i] = X[i] * W[i]
for (size_t i = 0; i < n; i++) {
framework::AttributeMap mul_attr;
mul_attr["x_num_col_dims"] = static_cast<int>(x_num_col_dims[i]);
mul_attr["y_num_col_dims"] = static_cast<int>(1);
AppendOp(
framework::OpRegistry::CreateOp("mul", {{"X", {x[i]}}, {"Y", {w[i]}}},
{{"Out", {mul_out[i]}}}, mul_attr));
}
// sum_out = X[0] * W[0] + ... + X[n-1] * W[n-1]
auto sum_out = mul_out[0];
if (n > 1) {
PADDLE_ENFORCE_NE(Output("SumOut"), framework::kEmptyVarName,
"Output(SumOut) of FCOp should not be null when the "
"size of Inputs(X) > 1.");
sum_out = Output("SumOut");
AppendOp(framework::OpRegistry::CreateOp("sum", {{"X", {mul_out}}},
{{"Out", {sum_out}}}, {}));
} else {
if (Output("SumOut") != framework::kEmptyVarName) {
this->Rename(Output("SumOut"), framework::kEmptyVarName);
}
}
// add_out = sum_out + b
auto b = Input("B");
auto add_out = sum_out;
if (b != framework::kEmptyVarName) {
PADDLE_ENFORCE_NE(
Output("AddOut"), framework::kEmptyVarName,
"Output(AddOut) of FCOp should not be null when Input(B) is set.");
add_out = Output("AddOut");
AppendOp(framework::OpRegistry::CreateOp(
"elementwise_add", {{"X", {sum_out}}, {"Y", {Input("B")}}},
{{"Out", {add_out}}}, {}));
} else {
if (Output("AddOut") != framework::kEmptyVarName) {
this->Rename(Output("AddOut"), framework::kEmptyVarName);
}
}
auto activation = Attr<std::string>("activation");
AppendOp(framework::OpRegistry::CreateOp(activation, {{"X", {add_out}}},
{{"Y", {Output("Out")}}}, {}));
CompleteAddOp(false);
}
};
class FCOpMaker : public framework::OpProtoAndCheckerMaker {
public:
FCOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X",
"(A vector of Tensors) each input Tensor can be of arbitrary "
"dimension, and will be reshaped to a 2-D matrix of size "
"(minibatch, number_of_input_features) according to attribute "
"xNumColDims.")
.AsDuplicable();
AddInput("W",
"(A vector of Tensors) the weights of FC operator, a "
"vector of 2-D matrix of size "
"(number_of_input_features, number_of_neurons).")
.AsDuplicable();
AddInput("B",
"(Tensor) the bias of FC operator, a 1-D vector of size "
"number_of_neurons.");
AddOutput("Out",
"(Tensor) the activated output matrix of FC operator, a 2-D "
"matrix of size (minibatch, number_of_neurons).");
AddOutput("MulOut",
"(A vector of Tensors) the intermediate outputs of FC operator, "
"each Tensor saving the product of X_i * W_i.")
.AsIntermediate()
.AsDuplicable();
AddOutput(
"SumOut",
"(Tensor) the intermediate output of FC operator, "
"saving the sum of the products of X and W, that is sum{X_i * W_i}.")
.AsIntermediate();
AddOutput("AddOut",
"(Tensor) the non-actived output of FC operator, "
"saving sum{X_i * W_i} + B.")
.AsIntermediate();
AddAttr<std::string>(
"activation",
"(string, default identity) the activation type of FC operator.")
.SetDefault("identity")
.InEnum({"identity", "sigmoid", "softmax"});
AddAttr<std::vector<int>>(
"xNumColDims",
"(std::vector<int>) The inputs Tensors of FC operator can be of "
"more than 2 dimensions. In that case, each input Tensor `X_i` will be "
"reshaped to a 2-D matrix. The matrix's first dimension "
"(the length of column) will be the product of `X_i`'s last "
"`xNumColDims_i` dimensions, that is "
"`X_i.dims[0] x ... x X_i.dims[xNumColDims_i - 1]`. "
"The matrix's second dimension (the length of row) will be the product "
"of `X_i`'s first `rank - xNumColDims_i` dimensions, that is "
"`X_i.dims[xNumColDims_i] x ... x X_i.dims[rank - 1]`)")
.SetDefault(std::vector<int>{});
AddComment(R"DOC(
Fully Connected Operator, known as Fully Connected Layer or Inner Product Layer
in Convolutional Neural Networks. Neurons in a fully connected layer have
full connections to all activations in the previous layer.
It computes an inner product of a set of
learned weights with a matrix multiplication followed by a bias offset
(optionally).
Equation:
Out = Act(sum_n{X_i * W_i} + B)
where X_i is Tensor that will be reshaped to a 2-D matrix of size (M x K),
usually M is the minibatch size and K is the number of input features.
W_i is a 2-D matrix of size (K x N), where N means the number of neurons
in the fully connected layer. B is a 1-D vector of size N.
Thus, the output Out is a 2-D matrix of size (M x N).
Activation type can be set to `identity` (default), `sigmoid` or `softmax`.
All the inputs can carry the LoD (Level of Details) information,
or not. But the output only shares the LoD with first input (`X[0]`).
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(fc, ops::FCOp, ops::FCOpMaker);
......@@ -59,7 +59,7 @@ class GaussianRandomOp : public framework::OperatorWithKernel {
protected:
framework::DataType IndicateDataType(
const framework::ExecutionContext& ctx) const override {
return static_cast<framework::DataType>(Attr<int>("data_type"));
return static_cast<framework::DataType>(ctx.Attr<int>("data_type"));
}
};
......
......@@ -54,8 +54,7 @@ class GRUUnitOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE_EQ(
weight_width, frame_size * 3,
"The shape of Weight matrix must be [frame_size, frame_size * 3].");
auto bias = Input("Bias");
if (bias != framework::kEmptyVarName) {
if (ctx->HasInput("Bias")) {
auto bias_dims = ctx->GetInputDim("Bias");
int bias_height = bias_dims[0];
int bias_width = bias_dims[1];
......@@ -89,7 +88,8 @@ class GRUUnitOpMaker : public framework::OpProtoAndCheckerMaker {
"weights of output candidate with shape [frame_size, frame_size]");
AddInput("Bias",
"(Tensor) Bias vector with shape [1, frame_size * 3] concating "
"bias of the update gate, reset gate and output candidate.");
"bias of the update gate, reset gate and output candidate.")
.AsDispensable();
AddOutput("Gate",
"(Tensor) Matrix with shape [batch_size, frame_size * 3] for the "
"output of update gate, reset gate and output candidate")
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/net_op.h"
#include "paddle/operators/scale_op.h"
namespace paddle {
namespace operators {
// The identity operator is an alias of the scale operator. This is also an
// example for creating an alias for an existing operator.
template <typename AttrType>
class IdentityOpMaker : public framework::OpProtoAndCheckerMaker {
public:
IdentityOpMaker(framework::OpProto *proto,
framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "The input tensor of identity operator.");
AddOutput("Y", "The output tensor of identity operator.");
AddComment(R"DOC(
The identity operator is an alias of the scale operator
with the attribute scale fixed to 1.0.
)DOC");
}
};
template <typename AttrType>
class IdentityOp : public NetOp {
public:
IdentityOp(const std::string &type, const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: NetOp(type, inputs, outputs, attrs) {
PADDLE_ENFORCE_NE(Input("X"), framework::kEmptyVarName,
"Input(X) of IdentityOp should not be null.");
PADDLE_ENFORCE_NE(Output("Y"), framework::kEmptyVarName,
"Output(Y) of IdentityOp should not be null.");
AppendOp(framework::OpRegistry::CreateOp(
"scale", {{"X", {Input("X")}}}, {{"Out", {Output("Y")}}},
{{"scale", static_cast<AttrType>(1)}}));
CompleteAddOp(false);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(identity, ops::IdentityOp<float>,
ops::IdentityOpMaker<float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/framework/op_registry.h"
#include "paddle/operators/net_op.h"
namespace paddle {
namespace operators {
class InterpOp : public NetOp {
public:
InterpOp(const std::string &type, const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: NetOp(type, inputs, outputs, attrs) {
PADDLE_ENFORCE_NE(Input("X"), framework::kEmptyVarName,
"Input(X) of InterpOp should not be null.");
PADDLE_ENFORCE_NE(Input("Y"), framework::kEmptyVarName,
"Input(Y) of InterpOp should not be null.");
PADDLE_ENFORCE_NE(Input("W"), framework::kEmptyVarName,
"Input(W) of InterpOp should not be null.");
PADDLE_ENFORCE_NE(Output("SubOut"), framework::kEmptyVarName,
"Output(SubOut) of InterpOp should not be null.");
PADDLE_ENFORCE_NE(Output("MulOut"), framework::kEmptyVarName,
"Output(MulOut) of InterpOp should not be null.");
PADDLE_ENFORCE_NE(Output("Out"), framework::kEmptyVarName,
"Output(Out) of InterpOp should not be null.");
// SubOut = X - Y
auto x = Input("X");
auto y = Input("Y");
auto sub_out = Output("SubOut");
AppendOp(framework::OpRegistry::CreateOp(
"elementwise_sub", {{"X", {x}}, {"Y", {y}}}, {{"Out", {sub_out}}}, {}));
// MulOut = SubOut * W = (X - Y) * W
auto w = Input("W");
auto mul_out = Output("MulOut");
AppendOp(framework::OpRegistry::CreateOp(
"elementwise_mul", {{"X", {sub_out}}, {"Y", {w}}}, {{"Out", {mul_out}}},
{{"axis", 0}}));
// Out = MulOut + Y = (X - Y) * W + Y = X * W + Y * (1 - W)
AppendOp(framework::OpRegistry::CreateOp("elementwise_add",
{{"X", {mul_out}}, {"Y", {y}}},
{{"Out", {Output("Out")}}}, {}));
CompleteAddOp(false);
}
};
class InterpOpMaker : public framework::OpProtoAndCheckerMaker {
public:
InterpOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X",
"(Tensor), 2-D Matrix of shape [batch_size, data_dim]"
"containing data samples, the first input of interp_op");
AddInput("Y",
"(Tensor), 2-D Matrix of shape `[batch_size, data_dim]`"
"containing data samples, the second input of interp_op");
AddInput("W",
"(Tensor), 1-D Vector of shape [batch_size],"
"the interpolated values in the half-open interval [0.0, 1.0)");
AddOutput("SubOut",
"(Tensor), the intermediate subtraction outputs, saving X - Y.")
.AsIntermediate();
AddOutput("MulOut",
"(Tensor), the intermediate multiplication outputs,"
"saving the elementwise multiplication of (X - Y) and W.")
.AsIntermediate();
AddOutput("Out",
"(Tensor), the output of interp_op, same shape with X,"
"returns the first-dimensional piecewise linear interpolant "
"between X and Y");
AddComment(R"DOC(
Linear Interpolation with two inputs, used in NEURAL TURING MACHINE.
Equation:
Out.row[i] = X.row[i] * W[i] + Y.row[i] * (1 - W[i])
= (X.row[i] - Y.row[i]) * W[i] + Y.row[i]
Example:
X = [[1,2],[3,4]],
Y = [[2,1],[4,3]],
W = [0.3, 0.4]
Then, Out = [[1.7,1.3],[3.6,3.4]]
where 1.7 = 1*0.3+2*(1-0.3),
1.3 = 2*0.3+1*(1-0.3),
3.6 = 3*0.4+4*(1-0.4),
3.4 = 4*0.4+3*(1-0.4)
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(interp, ops::InterpOp, ops::InterpOpMaker);
......@@ -32,6 +32,9 @@ class LookupTableOp : public framework::OperatorWithKernel {
auto table_dims = ctx->GetInputDim("W");
auto ids_dims = ctx->GetInputDim("Ids");
PADDLE_ENFORCE_EQ(ids_dims.size(), 2);
PADDLE_ENFORCE_EQ(ids_dims[1], 1);
ctx->SetOutputDim("Out", {ids_dims[0], table_dims[1]});
ctx->ShareLoD("Ids", /*->*/ "Out");
}
......@@ -53,7 +56,9 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
" which is a learnable parameter.");
AddInput("Ids",
"An input with type int32 or int64"
"contains the ids to be looked up in W.");
"contains the ids to be looked up in W."
"Ids must be a column vector with rank = 2."
"The 2nd dimension size must be 1");
AddOutput("Out", "The lookup results, which have the same type with W.");
AddComment(R"DOC(
This operator is used to perform lookups on the parameter W,
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/lstm_op.h"
namespace paddle {
namespace operators {
class LSTMOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Input"),
"Input(Input) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Hidden"),
"Output(Hidden) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Cell"),
"Output(Cell) of LSTM should not be null.");
auto x_dims = ctx->GetInputDim("Input");
PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank must be 2.");
if (ctx->HasInput("H0")) {
PADDLE_ENFORCE(ctx->HasInput("C0"),
"Input(Cell) and Input(Hidden) of LSTM should not "
"be null at the same time.");
auto h_dims = ctx->GetInputDim("H0");
auto c_dims = ctx->GetInputDim("C0");
PADDLE_ENFORCE(h_dims == c_dims,
"The dimension of Input(H0) and Input(C0) "
"should be the same.");
}
int frame_size = x_dims[1] / 4;
auto w_dims = ctx->GetInputDim("Weight");
PADDLE_ENFORCE_EQ(w_dims.size(), 2,
"The rank of Input(Weight) should be 2.");
PADDLE_ENFORCE_EQ(w_dims[0], frame_size,
"The first dimension of Input(Weight) "
"should be %d.",
frame_size);
PADDLE_ENFORCE_EQ(w_dims[1], 4 * frame_size,
"The second dimension of Input(Weight) "
"should be 4 * %d.",
frame_size);
auto b_dims = ctx->GetInputDim("Bias");
PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2.");
PADDLE_ENFORCE_EQ(b_dims[0], 1,
"The first dimension of Input(Bias) should be 1.");
if (ctx->Attrs().Get<bool>("usePeepholes")) {
PADDLE_ENFORCE_EQ(b_dims[1], 7 * frame_size,
"The second dimension of Input(Bias) should be "
"7 * %d if enable peepholes connection",
frame_size);
} else {
PADDLE_ENFORCE_EQ(b_dims[1], 4 * frame_size,
"The second dimension of Input(Bias) should be "
"4 * %d if disable peepholes connection",
frame_size);
}
ctx->SetOutputDim("Hidden", {x_dims[0], frame_size});
ctx->SetOutputDim("Cell", {x_dims[0], frame_size});
ctx->SetOutputDim("BatchGate", x_dims);
ctx->ShareLoD("Input", "Hidden");
ctx->ShareLoD("Input", "Cell");
}
};
class LSTMOpMaker : public framework::OpProtoAndCheckerMaker {
public:
LSTMOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("Input",
"(LoDTensor) the first input is a LodTensor, which support "
"variable-time length input sequence. The underlying tensor in "
"this LoDTensor is a matrix with shape (T X 4D), where, T is the "
"total time steps in this mini-batch, D is the hidden size.");
AddInput("H0",
"(Tensor, optional) the initial hidden state is an optional "
"input. This is a tensor with shape (N x D), where N is the "
"batch size, D is the hidden size.");
AddInput("C0",
"(Tensor, optional) the initial cell state is an optional "
"input. This is a tensor with shape (N x D), where N is the "
"batch size. `H0` and `C0` can be NULL but only at the same time");
AddInput("Weight",
"(Tensor) the learnable hidden-hidden weights."
" - The shape is (D x 4D), where D is the hidden size. "
" - Weight = {W_ch, W_ih, W_fh, W_oh}");
AddInput("Bias",
"(Tensor) the learnable weights, which contains two parts: "
"input-hidden bias weight and peephole connections weight if "
"setting `usePeepholes` True. "
"1. `usePeepholes = False` "
" - The shape is (1 x 4D). "
" - Bias = {b_c, b_i, b_f, b_o}."
"2. `usePeepholes = True` "
" - The shape is (1 x 7D). "
" - Bias = {b_c, b_i, b_f, b_o, W_ic, W_fc, W_oc}.");
AddOutput("BatchGate",
"(LoDTensor) This LoDTensor contains input gate, forget gate "
"and output gate after the nonlinear computation. This "
"LoDTensor has the same shape with the reorganized input, which "
"was also be called batch input. The LoD size is 2. The first "
"LoD is the batch offsets and the second LoD contains the "
"indexes, which denote the position of reorganized sequence "
"in the raw input.")
.AsIntermediate();
AddOutput("Hidden",
"(LoDTensor) the hidden state lod tensor of LSTM operator. "
"The shape and lod is the same with the `Input`.");
AddOutput("Cell",
"(LoDTensor) the cell state lod tensor of LSTM operator. "
"The shape and lod is the same with the `Input`.");
AddAttr<bool>("usePeepholes",
"(bool, defalut: True) "
"whether to enable diagonal/peephole connections.")
.SetDefault(true);
AddAttr<bool>("isReverse",
"(bool, defalut: False) "
"whether to compute reversed LSTM.")
.SetDefault(false);
AddAttr<std::string>(
"gateActivation",
"(string, default: sigmoid)"
"The activation for input gate, forget gate and output "
"gate, `sigmoid` by default.")
.SetDefault("sigmoid");
AddAttr<std::string>("cellActivation",
"(string, default: tanh)"
"The activation for cell output, `tanh` by defalut.")
.SetDefault("tanh");
AddAttr<std::string>("candidateActivation",
"(string, default: tanh)"
"The activation for candidate hidden state, "
"`tanh` by default.")
.SetDefault("tanh");
AddComment(R"DOC(Long-Short Term Memory (LSTM) Operator
The defalut implementation is diagonal/peephole connection [1], the formula is
as follows
i_t = \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + W_{ic}c_{t-1} + b_i)
f_t = \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + W_{fc}c_{t-1} + b_f)
\tilde{c_t} = act_g(W_{cx}x_t + W_{ch}h_{t-1} + b_c)
o_t = \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + W_{oc}c_t + b_o)
c_t = f_t ⊙ c_{t-1} + i_t ⊙ \tilde{c_t}
h_t = o_t ⊙ act_h(c_t)
where the W terms denote weight matrices (e.g. \f$W_{xi}\f$ is the matrix
of weights from the input gate to the input), \f$W_{ic}, W_{fc}, W_{oc}\f$
are diagonal weight matrices for peephole connections. In our implenmention,
We use vectors to reprenset these diagonal weight matrices. The b terms
denote bias vectors (\f$b_i\f$ is the input gate bias vector), \f$\sigma\f$
is the non-line actications, such as logistic sigmoid function, and
\f$i, f, o\f$ and \f$c\f$ are respectively the input gate, forget gate,
output gate and cell activation vectors, all of which are the same size as
the cell output activation vector \f$h\f$.
The ⊙ is the element-wise product of the vectors, \f$act_g\f$ and \f$act_h\f$
are the cell input and cell output activation functions, `tanh` is usually
used for them. \f$\tilde{c_t}\f$ is also called candidate hidden state,
which is computed based on the current input and the previous hidden state.
Set `usePeepholes` False to disable peephole connection [2]. The formula
is omitted here.
@note These \f$W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}\f$
operations on the input x_{t} were NOT included in this operator.
Users can choose to use fully-connect operator before LSTM operator.
[1] Hasim Sak, Andrew Senior, and Francoise Beaufays. Long short-term memory
recurrent neural network architectures for large scale acoustic modeling.
INTERSPEECH, 2014.
[2] S. Hochreiter and J. Schmidhuber. Long Short-Term Memory.
Neural Computation, 9(8):1735-1780, 1997.
)DOC");
}
};
class LSTMGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Hidden")),
"Input(Hidden@GRAD) should not be null");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Cell")),
"Input(Cell@GRAD) should not be null");
ctx->SetOutputDim(framework::GradVarName("Weight"),
ctx->GetInputDim("Weight"));
ctx->SetOutputDim(framework::GradVarName("Bias"), ctx->GetInputDim("Bias"));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(lstm, ops::LSTMOp, ops::LSTMOpMaker, lstm_grad, ops::LSTMGradOp);
REGISTER_OP_CPU_KERNEL(lstm, ops::LSTMKernel<paddle::platform::CPUPlace, float>,
ops::LSTMKernel<paddle::platform::CPUPlace, double>);
REGISTER_OP_CPU_KERNEL(lstm_grad,
ops::LSTMGradKernel<paddle::platform::CPUPlace, float>,
ops::LSTMGradKernel<paddle::platform::CPUPlace, double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/operators/lstm_op.h"
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(lstm, ops::LSTMKernel<paddle::platform::GPUPlace, float>,
ops::LSTMKernel<paddle::platform::GPUPlace, double>);
REGISTER_OP_GPU_KERNEL(lstm_grad,
ops::LSTMGradKernel<paddle::platform::GPUPlace, float>,
ops::LSTMGradKernel<paddle::platform::GPUPlace, double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/op_registry.h"
#include "paddle/operators/math/lstm_compute.h"
#include "paddle/operators/math/math_function.h"
#include "paddle/operators/math/sequence2batch.h"
namespace paddle {
namespace operators {
using framework::LoDTensor;
using framework::Tensor;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
template <typename Place, typename T>
class LSTMKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<framework::LoDTensor>("Input");
auto* weight = ctx.Input<framework::Tensor>("Weight");
auto* bias = ctx.Input<framework::Tensor>("Bias");
auto* batch_gate = ctx.Output<framework::LoDTensor>("BatchGate");
batch_gate->mutable_data<T>(ctx.GetPlace());
auto* hidden_out = ctx.Output<framework::LoDTensor>("Hidden");
hidden_out->mutable_data<T>(ctx.GetPlace());
auto* cell_out = ctx.Output<framework::LoDTensor>("Cell");
cell_out->mutable_data<T>(ctx.GetPlace());
// Now the function ShareLoD in InferShape is not implemented.
// So copy LoD here.
ctx.ShareLoD("Input", "Hidden");
ctx.ShareLoD("Input", "Cell");
bool is_reverse = ctx.Attr<bool>("isReverse");
math::LoDTensor2BatchFunctor<Place, T> to_batch;
to_batch(ctx.device_context(), *input, *batch_gate, is_reverse);
auto in_dims = input->dims();
int frame_size = static_cast<int>(in_dims[1] / 4);
framework::DDim dims({in_dims[0], frame_size});
if (bias) {
Eigen::array<int, 2> extents({{1, 4 * frame_size}});
Eigen::array<int, 2> offsets({{0, 0}});
auto b = EigenMatrix<T>::From(*bias);
auto gate = EigenMatrix<T>::From(*batch_gate);
gate.device(ctx.GetEigenDevice<Place>()) =
gate +
b.slice(offsets, extents)
.reshape(Eigen::array<int, 2>({{1, frame_size * 4}}))
.broadcast(
Eigen::array<int, 2>({{static_cast<int>(in_dims[0]), 1}}));
}
math::LstmMetaValue<T> lstm_value;
T* bias_data = const_cast<T*>(bias->data<T>());
// the code style in LstmMetaValue will be updated later.
lstm_value.checkIg = bias_data + 4 * frame_size;
lstm_value.checkFg = lstm_value.checkIg + frame_size;
lstm_value.checkOg = lstm_value.checkFg + frame_size;
lstm_value.prevStateValue = nullptr;
framework::LoDTensor batch_out, batch_cell, batch_cell_pre_act;
batch_out.mutable_data<T>(dims, ctx.GetPlace());
batch_cell.mutable_data<T>(dims, ctx.GetPlace());
batch_cell_pre_act.mutable_data<T>(dims, ctx.GetPlace());
auto batch_starts = batch_gate->lod()[0];
size_t num_batch = batch_starts.size() - 1;
auto gate_act = ctx.Attr<std::string>("gateActivation");
auto cell_act = ctx.Attr<std::string>("cellActivation");
auto cand_act = ctx.Attr<std::string>("candidateActivation");
for (size_t n = 0; n < num_batch; n++) {
int bstart = static_cast<int>(batch_starts[n]);
int bend = static_cast<int>(batch_starts[n + 1]);
Tensor gate_t = batch_gate->Slice(bstart, bend);
Tensor out_t = batch_out.Slice(bstart, bend);
Tensor cell_t = batch_cell.Slice(bstart, bend);
Tensor cell_pre_act_t = batch_cell_pre_act.Slice(bstart, bend);
int cur_batch_size = bend - bstart;
if (n != 0) {
int pre_h_start = static_cast<int>(batch_starts[n - 1]);
int pre_h_end = pre_h_start + cur_batch_size;
auto pre_hidden_t = batch_out.Slice(pre_h_start, pre_h_end);
math::matmul<Place, T>(ctx.device_context(), pre_hidden_t, false,
*weight, false, static_cast<T>(1.0), &gate_t,
static_cast<T>(1.0));
}
// else if : FIXME support the initial hidden and cell
lstm_value.gateValue = gate_t.data<T>();
lstm_value.outputValue = out_t.data<T>();
lstm_value.stateValue = cell_t.data<T>();
lstm_value.stateActiveValue = cell_pre_act_t.data<T>();
math::LstmUnitFunctor<Place, T>::compute(ctx.device_context(), lstm_value,
frame_size, cur_batch_size,
gate_act, cell_act, cand_act);
lstm_value.prevStateValue = lstm_value.stateValue;
}
math::Batch2LoDTensorFunctor<Place, T> to_seq;
batch_out.set_lod(batch_gate->lod());
// restore the output hidden in LoDTensor from the batch hidden
to_seq(ctx.device_context(), batch_out, *hidden_out);
batch_cell.set_lod(batch_gate->lod());
// restore the output cell state in LoDTensor from the batch cell
to_seq(ctx.device_context(), batch_cell, *cell_out);
}
};
template <typename Place, typename T>
class LSTMGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {}
};
} // namespace operators
} // namespace paddle
......@@ -19,7 +19,6 @@
namespace paddle {
namespace operators {
using framework::LoDTensor;
using framework::Tensor;
template <typename T>
......
add_subdirectory(detail)
if(WITH_GPU)
nv_library(math_function SRCS math_function.cc math_function.cu im2col.cc im2col.cu DEPS cblas device_context operator)
nv_test(math_function_gpu_test SRCS math_function_test.cu DEPS math_function tensor)
......@@ -8,6 +10,8 @@ if(WITH_GPU)
nv_library(pooling SRCS pooling.cc pooling.cu DEPS device_context)
nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context)
nv_library(sequence_project SRCS sequence_project.cc sequence_project.cu DEPS device_context)
nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context)
nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions)
else()
cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context operator)
cc_library(selected_rows_functor SRCS selected_rows_functor.cc DEPS selected_rows math_function)
......@@ -16,6 +20,8 @@ else()
cc_library(pooling SRCS pooling.cc DEPS device_context)
cc_library(vol2col SRCS vol2col.cc DEPS device_context)
cc_library(sequence_project SRCS sequence_project.cc DEPS device_context)
cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context)
cc_library(lstm_compute SRCS lstm_compute.cc DEPS device_context activation_functions)
endif()
cc_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor)
......
......@@ -22,8 +22,6 @@ namespace {
template <typename T>
__global__ void CrossEntropyKernel(T* Y, const T* X, const int* label,
const int N, const int D) {
// TOOD(qingqing) define CUDA_1D_KERNEL_LOOP macro in a common file.
// CUDA_1D_KERNEL_LOOP(i, N) {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
i += blockDim.x * gridDim.x) {
PADDLE_ASSERT(label[i] >= 0 && label[i] < D);
......
if(WITH_AVX)
cc_library(activation_functions SRCS hl_cpu_functions.cc hl_avx_functions.cc)
else()
cc_library(activation_functions SRCS hl_cpu_functions.cc)
endif()
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef HL_ACTIVATION_FUNCTIONS_H_
#define HL_ACTIVATION_FUNCTIONS_H_
#include "hl_functions.h"
#include "paddle/operators/math/lstm_compute.h"
/**
* Active functions: sigmoid, relu, tanh and linear.
*/
#define FLOAT_ACTIVE_FUNCTION \
{ \
hppl::typef::sigmoid, hppl::typef::relu, hppl::typef::tanh, \
hppl::typef::linear \
}
#define DOUBLE_ACTIVE_FUNCTION \
{ \
hppl::typed::sigmoid, hppl::typed::relu, hppl::typed::tanh, \
hppl::typed::linear \
}
#define AVX_ACTIVE_FUNCTION \
{ hppl::sigmoid, hppl::relu, hppl::tanh, hppl::linear }
namespace hppl {
using activation_mode_t = paddle::operators::math::activation_mode_t;
/**
* Hppl supports sigmoid, relu, tanh, linear active functions
* for neural networks' forward and backward activation.
*/
template <class T>
class Active {
public:
typedef T (*forward)(T);
typedef T (*backward)(T, T);
};
template <typename T>
struct ForwardActType;
template <>
struct ForwardActType<float> {
using type = Active<float>::forward;
};
template <>
struct ForwardActType<double> {
using type = Active<double>::forward;
};
template <typename T>
struct BackwardActType;
template <>
struct BackwardActType<float> {
using type = Active<float>::backward;
};
template <>
struct BackwardActType<double> {
using type = Active<double>::backward;
};
#ifdef __NVCC__
namespace gpu {
static __device__ Active<float>::forward forward[] = FLOAT_ACTIVE_FUNCTION;
static __device__ Active<float>::backward backward[] = FLOAT_ACTIVE_FUNCTION;
static __device__ Active<double>::forward forward_d[] = DOUBLE_ACTIVE_FUNCTION;
static __device__ Active<double>::backward backward_d[] =
DOUBLE_ACTIVE_FUNCTION;
template <typename T>
struct ForwardAct {
__device__ typename ForwardActType<T>::type operator()(
activation_mode_t type);
};
template <>
struct ForwardAct<float> {
__device__ ForwardActType<float>::type operator()(activation_mode_t type) {
return forward[type];
}
};
template <>
struct ForwardAct<double> {
__device__ ForwardActType<double>::type operator()(activation_mode_t type) {
return forward_d[type];
}
};
template <typename T>
struct BackwardAct {
__device__ typename BackwardActType<T>::type operator()(
activation_mode_t type);
};
template <>
struct BackwardAct<float> {
__device__ BackwardActType<float>::type operator()(activation_mode_t type) {
return backward[type];
}
};
template <>
struct BackwardAct<double> {
__device__ BackwardActType<double>::type operator()(activation_mode_t type) {
return backward_d[type];
}
};
} // namespace gpu
#else
namespace cpu {
static Active<float>::forward forward[] = FLOAT_ACTIVE_FUNCTION;
static Active<float>::backward backward[] = FLOAT_ACTIVE_FUNCTION;
static Active<double>::forward forward_d[] = DOUBLE_ACTIVE_FUNCTION;
static Active<double>::backward backward_d[] = DOUBLE_ACTIVE_FUNCTION;
template <typename T>
struct ForwardAct {
typename ForwardActType<T>::type operator()(activation_mode_t type);
};
template <>
struct ForwardAct<float> {
ForwardActType<float>::type operator()(activation_mode_t type) {
return forward[type];
}
};
template <>
struct ForwardAct<double> {
ForwardActType<double>::type operator()(activation_mode_t type) {
return forward_d[type];
}
};
template <typename T>
struct BackwardAct {
typename BackwardActType<T>::type operator()(activation_mode_t type);
};
template <>
struct BackwardAct<float> {
BackwardActType<float>::type operator()(activation_mode_t type) {
return backward[type];
}
};
template <>
struct BackwardAct<double> {
BackwardActType<double>::type operator()(activation_mode_t type) {
return backward_d[type];
}
};
} // namespace cpu
#ifdef __AVX__
namespace avx {
static Active<__m256>::forward forward[] = AVX_ACTIVE_FUNCTION;
static Active<__m256>::backward backward[] = AVX_ACTIVE_FUNCTION;
} // namespace avx
#endif
#endif
} // namespace hppl
#endif // HL_ACTIVATION_FUNCTIONS_H_
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <immintrin.h>
#include "hl_functions.h"
// TODO(qingqing) refine this dependence
#include "paddle/cuda/src/avx_mathfun.h"
namespace hppl {
__m256 exp(__m256 a) { return exp256_ps(a); }
__m256 relu(const __m256 a) {
__m256 tmp = _mm256_set1_ps(0.0f);
return _mm256_max_ps(a, tmp);
}
__m256 sigmoid(const __m256 a) {
__m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX);
__m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN);
__m256 tmp = _mm256_max_ps(a, min);
tmp = _mm256_min_ps(tmp, max);
tmp = _mm256_sub_ps(_mm256_set1_ps(0.0f), tmp);
tmp = exp(tmp);
tmp = _mm256_add_ps(_mm256_set1_ps(1.0f), tmp);
tmp = _mm256_div_ps(_mm256_set1_ps(1.0f), tmp);
return tmp;
}
__m256 tanh(const __m256 a) {
__m256 max = _mm256_set1_ps(EXP_MAX_INPUT);
__m256 tmp = _mm256_mul_ps(_mm256_set1_ps(-2.0f), a);
tmp = _mm256_min_ps(tmp, max);
tmp = exp(tmp);
return _mm256_sub_ps(_mm256_div_ps(_mm256_set1_ps(2.0f),
_mm256_add_ps(_mm256_set1_ps(1.0f), tmp)),
_mm256_set1_ps(1.0f));
}
__m256 linear(const __m256 a) { return a; }
__m256 relu(const __m256 a, const __m256 b) {
return _mm256_mul_ps(
a, _mm256_and_ps(_mm256_cmp_ps(b, _mm256_set1_ps(0.0f), _CMP_GT_OS),
_mm256_set1_ps(1.0f)));
}
__m256 sigmoid(const __m256 a, const __m256 b) {
return _mm256_mul_ps(_mm256_mul_ps(a, b),
_mm256_sub_ps(_mm256_set1_ps(1.0f), b));
}
__m256 tanh(const __m256 a, const __m256 b) {
return _mm256_mul_ps(
a, _mm256_sub_ps(_mm256_set1_ps(1.0f), _mm256_mul_ps(b, b)));
}
__m256 linear(const __m256 a, const __m256 b) { return a; }
} // namespace hppl
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef HL_AVX_FUNCTIONS_H_
#define HL_AVX_FUNCTIONS_H_
#include <immintrin.h>
namespace hppl {
__m256 relu(const __m256 a);
__m256 sigmoid(const __m256 a);
__m256 tanh(const __m256 a);
__m256 linear(const __m256 a);
__m256 relu(const __m256 a, const __m256 b);
__m256 sigmoid(const __m256 a, const __m256 b);
__m256 tanh(const __m256 a, const __m256 b);
__m256 linear(const __m256 a, const __m256 b);
} // namespace hppl
#endif // HL_AVX_FUNCTIONS_H_
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <math.h>
#include "hl_functions.h"
namespace hppl {
namespace typef {
float relu(const float a) {
return a > static_cast<float>(0.0) ? a : static_cast<float>(0.0);
}
float sigmoid(const float a) {
const float min = SIGMOID_THRESHOLD_MIN;
const float max = SIGMOID_THRESHOLD_MAX;
float tmp = (a < min) ? min : ((a > max) ? max : a);
return static_cast<float>(1.0) / (static_cast<float>(1.0) + exp(-tmp));
}
float tanh(const float a) {
float tmp = -2.0 * a;
tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
return (2.0 / (1.0 + exp(tmp))) - 1.0;
}
float linear(const float a) { return a; }
float relu(const float a, const float b) { return a * (b > 0.0 ? 1.0 : 0.0); }
float sigmoid(const float a, const float b) {
return a * b * (static_cast<float>(1) - b);
}
float tanh(const float a, const float b) {
return a * (static_cast<float>(1) - b * b);
}
float linear(const float a, const float b) { return a; }
} // namespace typef
namespace typed {
double relu(const double a) {
return a > static_cast<double>(0.0) ? a : static_cast<double>(0.0);
}
double sigmoid(const double a) {
const double min = SIGMOID_THRESHOLD_MIN;
const double max = SIGMOID_THRESHOLD_MAX;
double tmp = (a < min) ? min : ((a > max) ? max : a);
return static_cast<double>(1.0) / (static_cast<double>(1.0) + exp(-tmp));
}
double tanh(const double a) {
double tmp = -2.0 * a;
tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
return (2.0 / (1.0 + exp(tmp))) - 1.0;
}
double linear(const double a) { return a; }
double relu(const double a, const double b) {
return a * (b > 0.0 ? 1.0 : 0.0);
}
double sigmoid(const double a, const double b) {
return a * b * (static_cast<double>(1) - b);
}
double tanh(const double a, const double b) {
return a * (static_cast<double>(1) - b * b);
}
double linear(const double a, const double b) { return a; }
} // namespace typed
} // namespace hppl
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef HL_FUNCTIONS_H_
#define HL_FUNCTIONS_H_
/**
* sigmoid threshold maximum
*/
#define SIGMOID_THRESHOLD_MIN -40.0
/**
* sigmoid threshold minimum
*/
#define SIGMOID_THRESHOLD_MAX 13.0
/**
* The maximum input value for exp, used to avoid overflow problem.
* currently only used for tanh function.
*/
#define EXP_MAX_INPUT 40.0
#ifndef __NVCC__
namespace hppl {
namespace typef {
float relu(const float a);
float sigmoid(const float a);
float tanh(const float a);
float linear(const float a);
float relu(const float a, const float b);
float sigmoid(const float a, const float b);
float tanh(const float a, const float b);
float linear(const float a, const float b);
} // namespace typef
namespace typed {
double relu(const double a);
double sigmoid(const double a);
double tanh(const double a);
double linear(const double a);
double relu(const double a, const double b);
double sigmoid(const double a, const double b);
double tanh(const double a, const double b);
double linear(const double a, const double b);
} // namespace typed
} // namespace hppl
#ifdef __AVX__
#include "hl_avx_functions.h"
#endif
#else
#include "hl_gpu_functions.h"
#endif
#endif // HL_FUNCTIONS_H_
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef HL_GPU_FUNCTIONS_CUH_
#define HL_GPU_FUNCTIONS_CUH_
#include "hl_base.h"
namespace hppl {
namespace typef {
__device__ static float relu(const float a) { return a > 0.0f ? a : 0.0f; }
__device__ static float sigmoid(const float a) {
const float min = SIGMOID_THRESHOLD_MIN;
const float max = SIGMOID_THRESHOLD_MAX;
float tmp = (a < min) ? min : ((a > max) ? max : a);
return __fdividef(1.0f, 1.0f + __expf(-tmp));
}
__device__ static float tanh(const float a) {
float tmp = -2.0 * a;
tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
return __fdividef(2.0f, (1.0f + __expf(-2.0f * tmp))) - 1.0f;
}
__device__ static float linear(const float a) { return a; }
__device__ static float relu(const float a, const float b) {
return a * (b > 0.0f ? 1.0f : 0.0f);
}
__device__ static float sigmoid(const float a, const float b) {
return a * b * (1.0f - b);
}
__device__ static float tanh(const float a, const float b) {
return a * (1.0f - b * b);
}
__device__ static float linear(const float a, const float b) { return a; }
} // namespace typef
namespace typed {
__device__ static double relu(const double a) { return a > 0.0 ? a : 0.0; }
__device__ static double sigmoid(const double a) {
const double min = SIGMOID_THRESHOLD_MIN;
const double max = SIGMOID_THRESHOLD_MAX;
double tmp = (a < min) ? min : ((a > max) ? max : a);
return 1.0 / (1.0 + exp(-tmp));
}
__device__ static double tanh(const double a) {
double tmp = -2.0 * a;
tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
return (2.0 / (1.0 + exp(-2.0 * a))) - 1.0;
}
__device__ static double linear(const double a) { return a; }
__device__ static double relu(const double a, const double b) {
return a * (b > 0.0 ? 1.0 : 0.0);
}
__device__ static double sigmoid(const double a, const double b) {
return a * b * (1 - b);
}
__device__ static double tanh(const double a, const double b) {
return a * (1.0 - b * b);
}
__device__ static double linear(const double a, const double b) { return a; }
} // namespace typef
} // namespace hppl
#endif // HL_GPU_FUNCTIONS_CUH_
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <type_traits>
#include "paddle/operators/math/detail/hl_activation_functions.h"
#include "paddle/operators/math/lstm_compute.h"
namespace paddle {
namespace operators {
namespace math {
namespace detail {
#ifndef __NVCC__
template <class T, class Op>
void naive_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
int frameSize,
activation_mode_t active_node,
activation_mode_t active_gate,
activation_mode_t active_state) {
T rValueIn;
T rValueIg;
T rValueFg;
T rValueOg;
T rCheckI;
T rCheckF;
T rCheckO;
T rState;
T rPrevState = 0;
T rStateAtv;
T rOut;
T *valueIn = value.gateValue;
T *valueIg = value.gateValue + frameSize;
T *valueFg = value.gateValue + frameSize * 2;
T *valueOg = value.gateValue + frameSize * 3;
for (int i = 0; i < frameSize; i++) {
rValueIn = valueIn[i];
rValueIg = valueIg[i];
rValueFg = valueFg[i];
rValueOg = valueOg[i];
rCheckI = value.checkIg[i];
rCheckF = value.checkFg[i];
rCheckO = value.checkOg[i];
if (value.prevStateValue) {
rPrevState = value.prevStateValue[i];
}
hppl::cpu::ForwardAct<T> act;
op(rValueIn, rValueIg, rValueFg, rValueOg, rPrevState, rState, rStateAtv,
rOut, rCheckI, rCheckF, rCheckO, act(active_node), act(active_gate),
act(active_state));
valueIn[i] = rValueIn;
valueIg[i] = rValueIg;
valueFg[i] = rValueFg;
valueOg[i] = rValueOg;
value.stateValue[i] = rState;
value.stateActiveValue[i] = rStateAtv;
value.outputValue[i] = rOut;
}
}
template <class T, class Op>
void naive_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
LstmMetaGrad<T> grad, int frameSize,
activation_mode_t active_node,
activation_mode_t active_gate,
activation_mode_t active_state) {
T rValueIn;
T rValueIg;
T rValueFg;
T rValueOg;
T rGradIn;
T rGradIg;
T rGradFg;
T rGradOg;
T rPrevState = 0;
T rPrevStateGrad;
T rState;
T rStateGrad;
T rStateAtv;
T rOutputGrad;
T rCheckI;
T rCheckF;
T rCheckO;
T rCheckIGrad;
T rCheckFGrad;
T rCheckOGrad;
T *valueIn = value.gateValue;
T *valueIg = value.gateValue + frameSize;
T *valueFg = value.gateValue + frameSize * 2;
T *valueOg = value.gateValue + frameSize * 3;
T *gradIn = grad.gateGrad;
T *gradIg = grad.gateGrad + frameSize;
T *gradFg = grad.gateGrad + frameSize * 2;
T *gradOg = grad.gateGrad + frameSize * 3;
for (int i = 0; i < frameSize; i++) {
rValueIn = valueIn[i];
rValueIg = valueIg[i];
rValueFg = valueFg[i];
rValueOg = valueOg[i];
rCheckI = value.checkIg[i];
rCheckF = value.checkFg[i];
rCheckO = value.checkOg[i];
rState = value.stateValue[i];
rStateAtv = value.stateActiveValue[i];
rOutputGrad = grad.outputGrad[i];
rStateGrad = grad.stateGrad[i];
if (value.prevStateValue) {
rPrevState = value.prevStateValue[i];
}
hppl::cpu::BackwardAct<T> act;
op(rValueIn, rValueIg, rValueFg, rValueOg, rGradIn, rGradIg, rGradFg,
rGradOg, rPrevState, rPrevStateGrad, rState, rStateGrad, rStateAtv,
rOutputGrad, rCheckI, rCheckF, rCheckO, rCheckIGrad, rCheckFGrad,
rCheckOGrad, act(active_node), act(active_gate), act(active_state));
gradIn[i] = rGradIn;
gradIg[i] = rGradIg;
gradFg[i] = rGradFg;
gradOg[i] = rGradOg;
grad.stateGrad[i] = rStateGrad;
if (grad.prevStateGrad) grad.prevStateGrad[i] = rPrevStateGrad;
if (value.prevStateValue) {
if (grad.checkIgGrad) grad.checkIgGrad[i] += rCheckIGrad;
if (grad.checkFgGrad) grad.checkFgGrad[i] += rCheckFGrad;
}
if (grad.checkOgGrad) grad.checkOgGrad[i] += rCheckOGrad;
}
}
template <class T, class Op>
void avx_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value, int frameSize,
activation_mode_t active_node,
activation_mode_t active_gate,
activation_mode_t active_state) {
#ifdef __AVX__
__m256 rValueIn;
__m256 rValueIg;
__m256 rValueFg;
__m256 rValueOg;
__m256 rCheckI;
__m256 rCheckF;
__m256 rCheckO;
__m256 rState;
__m256 rPrevState = _mm256_set1_ps(0.0f);
__m256 rStateAtv;
__m256 rOut;
__m256 *valueIn = (__m256 *)value.gateValue;
__m256 *valueIg = (__m256 *)(value.gateValue + frameSize);
__m256 *valueFg = (__m256 *)(value.gateValue + frameSize * 2);
__m256 *valueOg = (__m256 *)(value.gateValue + frameSize * 3);
for (int i = 0; i < frameSize / 8; i++) {
rValueIn = valueIn[i];
rValueIg = valueIg[i];
rValueFg = valueFg[i];
rValueOg = valueOg[i];
rCheckI = ((__m256 *)value.checkIg)[i];
rCheckF = ((__m256 *)value.checkFg)[i];
rCheckO = ((__m256 *)value.checkOg)[i];
if (value.prevStateValue) {
rPrevState = ((__m256 *)value.prevStateValue)[i];
}
op(rValueIn, rValueIg, rValueFg, rValueOg, rPrevState, rState, rStateAtv,
rOut, rCheckI, rCheckF, rCheckO, hppl::avx::forward[active_node],
hppl::avx::forward[active_gate], hppl::avx::forward[active_state]);
valueIn[i] = rValueIn;
valueIg[i] = rValueIg;
valueFg[i] = rValueFg;
valueOg[i] = rValueOg;
((__m256 *)value.stateValue)[i] = rState;
((__m256 *)value.stateActiveValue)[i] = rStateAtv;
((__m256 *)value.outputValue)[i] = rOut;
}
#endif
}
template <class T, class Op>
void avx_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
LstmMetaGrad<T> grad, int frameSize,
activation_mode_t active_node,
activation_mode_t active_gate,
activation_mode_t active_state) {
#ifdef __AVX__
__m256 rValueIn;
__m256 rValueIg;
__m256 rValueFg;
__m256 rValueOg;
__m256 rGradIn;
__m256 rGradIg;
__m256 rGradFg;
__m256 rGradOg;
__m256 rPrevState = _mm256_set1_ps(0.0f);
__m256 rPrevStateGrad;
__m256 rStateGrad;
__m256 rState;
__m256 rStateAtv;
__m256 rOutputGrad;
__m256 rCheckI;
__m256 rCheckF;
__m256 rCheckO;
__m256 rCheckIGrad;
__m256 rCheckFGrad;
__m256 rCheckOGrad;
__m256 *valueIn = (__m256 *)value.gateValue;
__m256 *valueIg = (__m256 *)(value.gateValue + frameSize);
__m256 *valueFg = (__m256 *)(value.gateValue + frameSize * 2);
__m256 *valueOg = (__m256 *)(value.gateValue + frameSize * 3);
__m256 *gradIn = (__m256 *)grad.gateGrad;
__m256 *gradIg = (__m256 *)(grad.gateGrad + frameSize);
__m256 *gradFg = (__m256 *)(grad.gateGrad + frameSize * 2);
__m256 *gradOg = (__m256 *)(grad.gateGrad + frameSize * 3);
for (int i = 0; i < frameSize / 8; i++) {
rValueIn = valueIn[i];
rValueIg = valueIg[i];
rValueFg = valueFg[i];
rValueOg = valueOg[i];
rCheckI = ((__m256 *)value.checkIg)[i];
rCheckF = ((__m256 *)value.checkFg)[i];
rCheckO = ((__m256 *)value.checkOg)[i];
rState = ((__m256 *)value.stateValue)[i];
rStateAtv = ((__m256 *)value.stateActiveValue)[i];
rOutputGrad = ((__m256 *)grad.outputGrad)[i];
rStateGrad = ((__m256 *)grad.stateGrad)[i];
if (value.prevStateValue) {
rPrevState = ((__m256 *)value.prevStateValue)[i];
}
op(rValueIn, rValueIg, rValueFg, rValueOg, rGradIn, rGradIg, rGradFg,
rGradOg, rPrevState, rPrevStateGrad, rState, rStateGrad, rStateAtv,
rOutputGrad, rCheckI, rCheckF, rCheckO, rCheckIGrad, rCheckFGrad,
rCheckOGrad, hppl::avx::backward[active_node],
hppl::avx::backward[active_gate], hppl::avx::backward[active_state]);
gradIn[i] = rGradIn;
gradIg[i] = rGradIg;
gradFg[i] = rGradFg;
gradOg[i] = rGradOg;
((__m256 *)grad.stateGrad)[i] = rStateGrad;
if (grad.prevStateGrad) ((__m256 *)grad.prevStateGrad)[i] = rPrevStateGrad;
if (value.prevStateValue) {
if (grad.checkIgGrad) ((__m256 *)grad.checkIgGrad)[i] += rCheckIGrad;
if (grad.checkFgGrad) ((__m256 *)grad.checkFgGrad)[i] += rCheckFGrad;
}
if (grad.checkOgGrad) ((__m256 *)grad.checkOgGrad)[i] += rCheckOGrad;
}
#endif
}
template <class T, class Op>
void cpu_lstm_forward(Op op, LstmMetaValue<T> value, int frameSize,
activation_mode_t active_node,
activation_mode_t active_gate,
activation_mode_t active_state) {
if (Op::avx && !(frameSize & (8 - 1)) && (std::is_same<T, float>::value)) {
avx_lstm_forward_one_sequence<T>(op, value, frameSize, active_node,
active_gate, active_state);
} else {
naive_lstm_forward_one_sequence<T>(op, value, frameSize, active_node,
active_gate, active_state);
}
}
template <class T, class Op>
void cpu_lstm_backward(Op op, LstmMetaValue<T> value, LstmMetaGrad<T> grad,
int frameSize, activation_mode_t active_node,
activation_mode_t active_gate,
activation_mode_t active_state) {
if (Op::avx && !(frameSize & (8 - 1)) && (std::is_same<T, float>::value)) {
avx_lstm_backward_one_sequence<T>(op, value, grad, frameSize, active_node,
active_gate, active_state);
} else {
naive_lstm_backward_one_sequence<T>(op, value, grad, frameSize, active_node,
active_gate, active_state);
}
}
#endif
} // namespace detail
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <type_traits>
#include "paddle/operators/math/detail/hl_activation_functions.h"
#include "paddle/operators/math/lstm_compute.h"
#include "paddle/platform/cuda_helper.h"
#include "paddle/platform/device_context.h"
#include <glog/logging.h>
namespace paddle {
namespace operators {
namespace math {
namespace detail {
/*
* threads(framePerBlock, batchPerBlock)
* grid(frameBlocks, batchBlocks)
*/
template <class T, class Op, bool isBatch>
__global__ void KeLstmForward(Op op, LstmMetaValue<T> value, int frameSize,
int batchSize, activation_mode_t active_node,
activation_mode_t active_gate,
activation_mode_t active_state) {
const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
if (frameIdx >= frameSize) return;
int batchIdx = 0;
if (isBatch) {
batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
if (batchIdx >= batchSize) return;
value.gateValue += batchIdx * frameSize * 4;
value.outputValue += batchIdx * frameSize;
value.stateValue += batchIdx * frameSize;
value.stateActiveValue += batchIdx * frameSize;
}
T rState;
T rPrevState = 0;
T rStateAtv;
T rOut;
T rValueIn;
T rValueIg;
T rValueFg;
T rValueOg;
T rCheckI = value.checkIg[frameIdx];
T rCheckF = value.checkFg[frameIdx];
T rCheckO = value.checkOg[frameIdx];
rValueIn = value.gateValue[frameIdx];
rValueIg = value.gateValue[frameIdx + frameSize];
rValueFg = value.gateValue[frameIdx + frameSize * 2];
rValueOg = value.gateValue[frameIdx + frameSize * 3];
if (value.prevStateValue) {
if (isBatch) value.prevStateValue += batchIdx * frameSize;
rPrevState = value.prevStateValue[frameIdx];
}
hppl::gpu::ForwardAct<T> act;
op(rValueIn, rValueIg, rValueFg, rValueOg, rPrevState, rState, rStateAtv,
rOut, rCheckI, rCheckF, rCheckO, act(active_node), act(active_gate),
act(active_state));
value.gateValue[frameIdx] = rValueIn;
value.gateValue[frameIdx + frameSize] = rValueIg;
value.gateValue[frameIdx + frameSize * 2] = rValueFg;
value.gateValue[frameIdx + frameSize * 3] = rValueOg;
value.stateValue[frameIdx] = rState;
value.stateActiveValue[frameIdx] = rStateAtv;
value.outputValue[frameIdx] = rOut;
}
/*
* threads(framePerBlock, batchPerBlock)
* grid(frameBlocks, batchBlocks)
*/
template <class T, class Op, bool isBatch>
__global__ void KeLstmBackward(Op op, LstmMetaValue<T> value,
LstmMetaGrad<T> grad, int frameSize,
int batchSize, activation_mode_t active_node,
activation_mode_t active_gate,
activation_mode_t active_state) {
const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
if (frameIdx >= frameSize) return;
int batchIdx = 0;
if (isBatch) {
batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
if (batchIdx >= batchSize) return;
value.gateValue += batchIdx * frameSize * 4;
value.stateValue += batchIdx * frameSize;
value.stateActiveValue += batchIdx * frameSize;
grad.gateGrad += batchIdx * frameSize * 4;
grad.stateGrad += batchIdx * frameSize;
grad.outputGrad += batchIdx * frameSize;
}
T rValueIn;
T rValueIg;
T rValueFg;
T rValueOg;
T rGradIn;
T rGradIg;
T rGradFg;
T rGradOg;
T rPrevState = 0;
T rPrevStateGrad;
T rState;
T rStateGrad;
T rStateAtv;
T rOutputGrad;
T rCheckI = value.checkIg[frameIdx];
T rCheckF = value.checkFg[frameIdx];
T rCheckO = value.checkOg[frameIdx];
T rCheckIGrad;
T rCheckFGrad;
T rCheckOGrad;
rValueIn = value.gateValue[frameIdx];
rValueIg = value.gateValue[frameIdx + frameSize];
rValueFg = value.gateValue[frameIdx + frameSize * 2];
rValueOg = value.gateValue[frameIdx + frameSize * 3];
rState = value.stateValue[frameIdx];
rStateAtv = value.stateActiveValue[frameIdx];
rOutputGrad = grad.outputGrad[frameIdx];
rStateGrad = grad.stateGrad[frameIdx];
if (value.prevStateValue) {
if (isBatch) value.prevStateValue += batchIdx * frameSize;
rPrevState = value.prevStateValue[frameIdx];
}
hppl::gpu::BackwardAct<T> act;
op(rValueIn, rValueIg, rValueFg, rValueOg, rGradIn, rGradIg, rGradFg, rGradOg,
rPrevState, rPrevStateGrad, rState, rStateGrad, rStateAtv, rOutputGrad,
rCheckI, rCheckF, rCheckO, rCheckIGrad, rCheckFGrad, rCheckOGrad,
act(active_node), act(active_gate), act(active_state));
grad.gateGrad[frameIdx] = rGradIn;
grad.gateGrad[frameIdx + frameSize] = rGradIg;
grad.gateGrad[frameIdx + frameSize * 2] = rGradFg;
grad.gateGrad[frameIdx + frameSize * 3] = rGradOg;
grad.stateGrad[frameIdx] = rStateGrad;
if (grad.prevStateGrad) {
if (isBatch) grad.prevStateGrad += batchIdx * frameSize;
grad.prevStateGrad[frameIdx] = rPrevStateGrad;
}
if (isBatch) {
if (value.prevStateValue) {
if (grad.checkIgGrad)
paddle::platform::CudaAtomicAdd(grad.checkIgGrad + frameIdx,
rCheckIGrad);
if (grad.checkFgGrad)
paddle::platform::CudaAtomicAdd(grad.checkFgGrad + frameIdx,
rCheckFGrad);
}
if (grad.checkOgGrad)
paddle::platform::CudaAtomicAdd(grad.checkOgGrad + frameIdx, rCheckOGrad);
} else {
if (value.prevStateValue) {
if (grad.checkIgGrad) grad.checkIgGrad[frameIdx] += rCheckIGrad;
if (grad.checkFgGrad) grad.checkFgGrad[frameIdx] += rCheckFGrad;
}
if (grad.checkOgGrad) grad.checkOgGrad[frameIdx] += rCheckOGrad;
}
}
template <class T, class Op>
void gpu_lstm_forward(const platform::DeviceContext& context, Op op,
LstmMetaValue<T> value, int frameSize, int batchSize,
activation_mode_t active_node,
activation_mode_t active_gate,
activation_mode_t active_state) {
dim3 threads;
dim3 grid;
if (batchSize == 1) {
int framePerBlock = frameSize <= 1024 ? frameSize : 1024;
int frameBlocks = (frameSize + 1024 - 1) / 1024;
threads = dim3(framePerBlock, 1);
grid = dim3(frameBlocks, 1);
} else {
/* framePerBlock = 32 batchPerBlock = 32 */
threads = dim3(32, 32);
grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32);
}
auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(context).stream();
if (batchSize == 1) {
KeLstmForward<T, Op,
/* isBatch= */ false><<<grid, threads, 0, stream>>>(
op, value, frameSize, batchSize, active_node, active_gate,
active_state);
} else {
KeLstmForward<T, Op,
/* isBatch= */ true><<<grid, threads, 0, stream>>>(
op, value, frameSize, batchSize, active_node, active_gate,
active_state);
}
}
template <class T, class Op>
void gpu_lstm_backward(const platform::DeviceContext& context, Op op,
LstmMetaValue<T> value, LstmMetaGrad<T> grad,
int frameSize, int batchSize,
activation_mode_t active_node,
activation_mode_t active_gate,
activation_mode_t active_state) {
dim3 threads;
dim3 grid;
if (batchSize == 1) {
int framePerBlock = frameSize <= 1024 ? frameSize : 1024;
int frameBlocks = (frameSize + 1024 - 1) / 1024;
threads = dim3(framePerBlock, 1);
grid = dim3(frameBlocks, 1);
} else {
/* framePerBlock = 32 batchPerBlock = 32 */
threads = dim3(32, 32);
grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32);
}
auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(context).stream();
if (batchSize == 1) {
KeLstmBackward<T, Op,
/* isBatch= */ false><<<grid, threads, 0, stream>>>(
op, value, grad, frameSize, batchSize, active_node, active_gate,
active_state);
} else {
KeLstmBackward<T, Op,
/* isBatch= */ true><<<grid, threads, 0, stream>>>(
op, value, grad, frameSize, batchSize, active_node, active_gate,
active_state);
}
}
} // namespace detail
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/math/detail/hl_activation_functions.h"
#include "paddle/platform/hostdevice.h"
#include <type_traits>
namespace paddle {
namespace operators {
namespace math {
namespace detail {
namespace forward {
template <class T>
class lstm {
public:
HOSTDEVICE void operator()(T &valueIn, T &valueIg, T &valueFg, T &valueOg,
T &prevState, T &state, T &stateAtv, T &output,
T &checkI, T &checkF, T &checkO,
typename hppl::ForwardActType<T>::type actInput,
typename hppl::ForwardActType<T>::type actGate,
typename hppl::ForwardActType<T>::type actState) {
valueIn = actInput(valueIn);
valueIg = actGate(valueIg + prevState * checkI);
valueFg = actGate(valueFg + prevState * checkF);
state = valueIn * valueIg + prevState * valueFg;
valueOg = actGate(valueOg + state * checkO);
stateAtv = actState(state);
output = valueOg * stateAtv;
}
#ifndef __NVCC__
#ifndef __AVX__ // If not compiled with AVX instructs. Disable AVX by default
static const bool avx = false;
#else
// Only float support AVX optimization
static const bool avx = std::is_same<T, float>::value;
HOSTDEVICE void operator()(__m256 &valueIn, __m256 &valueIg, __m256 &valueFg,
__m256 &valueOg, __m256 &prevState, __m256 &state,
__m256 &stateAtv, __m256 &output, __m256 &checkI,
__m256 &checkF, __m256 &checkO,
hppl::Active<__m256>::forward actInput,
hppl::Active<__m256>::forward actGate,
hppl::Active<__m256>::forward actState) {
valueIn = actInput(valueIn);
valueIg = actGate(_mm256_add_ps(valueIg, _mm256_mul_ps(prevState, checkI)));
valueFg = actGate(_mm256_add_ps(valueFg, _mm256_mul_ps(prevState, checkF)));
state = _mm256_add_ps(_mm256_mul_ps(valueIn, valueIg),
_mm256_mul_ps(prevState, valueFg));
valueOg = actGate(_mm256_add_ps(valueOg, _mm256_mul_ps(state, checkO)));
stateAtv = actState(state);
output = _mm256_mul_ps(valueOg, stateAtv);
}
#endif
#endif
};
} // namespace forward
namespace backward {
template <class T>
class lstm {
public:
HOSTDEVICE void operator()(T &valueIn, T &valueIg, T &valueFg, T &valueOg,
T &gradIn, T &gradIg, T &gradFg, T &gradOg,
T &prevState, T &prevStateGrad, T &state,
T &stateGrad, T &stateAtv, T &outputGrad,
T &checkI, T &checkF, T &checkO, T &checkIGrad,
T &checkFGrad, T &checkOGrad,
typename hppl::BackwardActType<T>::type actInput,
typename hppl::BackwardActType<T>::type actGate,
typename hppl::BackwardActType<T>::type actState) {
gradOg = actGate(outputGrad * stateAtv, valueOg);
stateGrad += actState(outputGrad * valueOg, stateAtv) + gradOg * checkO;
gradIn = actInput(stateGrad * valueIg, valueIn);
gradIg = actGate(stateGrad * valueIn, valueIg);
gradFg = actGate(stateGrad * prevState, valueFg);
prevStateGrad = gradIg * checkI + gradFg * checkF + stateGrad * valueFg;
checkIGrad = gradIg * prevState;
checkFGrad = gradFg * prevState;
checkOGrad = gradOg * state;
}
#ifndef __NVCC__
#ifndef __AVX__ // If not compiled with AVX instructs. Disable AVX by default
static const bool avx = false;
#else
// Only float support AVX optimization
static const bool avx = std::is_same<T, float>::value;
HOSTDEVICE void operator()(__m256 &valueIn, __m256 &valueIg, __m256 &valueFg,
__m256 &valueOg, __m256 &gradIn, __m256 &gradIg,
__m256 &gradFg, __m256 &gradOg, __m256 &prevState,
__m256 &prevStateGrad, __m256 &state,
__m256 &stateGrad, __m256 &stateAtv,
__m256 &outputGrad, __m256 &checkI, __m256 &checkF,
__m256 &checkO, __m256 &checkIGrad,
__m256 &checkFGrad, __m256 &checkOGrad,
hppl::Active<__m256>::backward actInput,
hppl::Active<__m256>::backward actGate,
hppl::Active<__m256>::backward actState) {
gradOg = actGate(_mm256_mul_ps(outputGrad, stateAtv), valueOg);
stateGrad = _mm256_add_ps(
actState(_mm256_mul_ps(outputGrad, valueOg), stateAtv), stateGrad);
stateGrad = _mm256_add_ps(_mm256_mul_ps(gradOg, checkO), stateGrad);
gradIn = actInput(_mm256_mul_ps(stateGrad, valueIg), valueIn);
gradIg = actGate(_mm256_mul_ps(stateGrad, valueIn), valueIg);
gradFg = actGate(_mm256_mul_ps(stateGrad, prevState), valueFg);
prevStateGrad = _mm256_add_ps(_mm256_mul_ps(gradIg, checkI),
_mm256_mul_ps(gradFg, checkF));
prevStateGrad =
_mm256_add_ps(_mm256_mul_ps(stateGrad, valueFg), prevStateGrad);
checkIGrad = _mm256_mul_ps(gradIg, prevState);
checkFGrad = _mm256_mul_ps(gradFg, prevState);
checkOGrad = _mm256_mul_ps(gradOg, state);
}
#endif
#endif
};
} // namespace backward
} // namespace detail
} // namespace math
} // namespace operators
} // namespace paddle
......@@ -29,8 +29,8 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
public:
void operator()(const platform::DeviceContext& context,
const framework::Tensor& im, framework::Tensor& col,
int stride_height, int stride_width, int padding_height,
int padding_width) {
int stride_height, int stride_width, int padding_up,
int padding_down, int padding_left, int padding_right) {
PADDLE_ENFORCE(im.dims().size() == 3);
PADDLE_ENFORCE(col.dims().size() == 5);
......@@ -41,6 +41,22 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
int filter_width = col.dims()[2];
int output_height = col.dims()[3];
int output_width = col.dims()[4];
PADDLE_ENFORCE_EQ(
(input_height + padding_up + padding_down - filter_height) /
stride_height +
1,
output_height,
"Output_height and padding(padding_up, padding_down) are "
"inconsistent.");
PADDLE_ENFORCE_EQ(
(input_width + padding_left + padding_right - filter_width) /
stride_width +
1,
output_width,
"output_width and padding(padding_left, padding_right) are "
"inconsistent.");
int channels_col = input_channels * filter_height * filter_width;
const T* im_data = im.data<T>();
......@@ -52,16 +68,14 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
int c_im = c / filter_width / filter_height;
for (int h = 0; h < output_height; ++h) {
for (int w = 0; w < output_width; ++w) {
int im_row_idx = h * stride_height + h_offset;
int im_col_idx = w * stride_width + w_offset;
if ((im_row_idx - padding_height) < 0 ||
(im_row_idx - padding_height) >= input_height ||
(im_col_idx - padding_width) < 0 ||
(im_col_idx - padding_width) >= input_width) {
int im_row_idx = h * stride_height + h_offset - padding_up;
int im_col_idx = w * stride_width + w_offset - padding_left;
if (im_row_idx < 0 || im_row_idx >= input_height || im_col_idx < 0 ||
im_col_idx >= input_width) {
col_data[(c * output_height + h) * output_width + w] = T(0);
} else {
im_row_idx += c_im * input_height - padding_height;
im_col_idx -= padding_width;
im_row_idx += c_im * input_height;
col_data[(c * output_height + h) * output_width + w] =
im_data[im_row_idx * input_width + im_col_idx];
}
......@@ -82,7 +96,8 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
public:
void operator()(const platform::DeviceContext& context, framework::Tensor& im,
const framework::Tensor& col, int stride_height,
int stride_width, int padding_height, int padding_width) {
int stride_width, int padding_up, int padding_down,
int padding_left, int padding_right) {
PADDLE_ENFORCE(im.dims().size() == 3);
PADDLE_ENFORCE(col.dims().size() == 5);
int input_channels = im.dims()[0];
......@@ -92,6 +107,22 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
int filter_width = col.dims()[2];
int output_height = col.dims()[3];
int output_width = col.dims()[4];
PADDLE_ENFORCE_EQ(
(input_height + padding_up + padding_down - filter_height) /
stride_height +
1,
output_height,
"Output_height and padding(padding_up, padding_down) are "
"inconsistent.");
PADDLE_ENFORCE_EQ(
(input_width + padding_left + padding_right - filter_width) /
stride_width +
1,
output_width,
"output_width and padding(padding_left, padding_right) are "
"inconsistent.");
int channels_col = input_channels * filter_height * filter_width;
T* im_data = im.data<T>();
......@@ -103,14 +134,12 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
int c_im = c / filter_width / filter_height;
for (int h = 0; h < output_height; ++h) {
for (int w = 0; w < output_width; ++w) {
int im_row_idx = h * stride_height + h_offset;
int im_col_idx = w * stride_width + w_offset;
if ((im_row_idx - padding_height) >= 0 &&
(im_row_idx - padding_height) < input_height &&
(im_col_idx - padding_width) >= 0 &&
(im_col_idx - padding_width) < input_width) {
im_row_idx += c_im * input_height - padding_height;
im_col_idx -= padding_width;
int im_row_idx = h * stride_height + h_offset - padding_up;
int im_col_idx = w * stride_width + w_offset - padding_left;
if ((im_row_idx) >= 0 && (im_row_idx) < input_height &&
(im_col_idx) >= 0 && (im_col_idx) < input_width) {
im_row_idx += c_im * input_height;
im_data[im_row_idx * input_width + im_col_idx] +=
col_data[(c * output_height + h) * output_width + w];
}
......@@ -140,8 +169,8 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
public:
void operator()(const platform::DeviceContext& context,
const framework::Tensor& im, framework::Tensor& col,
int stride_height, int stride_width, int up_pad,
int down_pad) {
int stride_height, int stride_width, int padding_up,
int padding_down, int padding_left, int padding_right) {
PADDLE_ENFORCE(im.dims().size() == 3);
PADDLE_ENFORCE(col.dims().size() == 5);
int input_channels = im.dims()[0];
......@@ -149,25 +178,28 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
int input_width = im.dims()[2];
int filter_height = col.dims()[3];
int filter_width = col.dims()[4];
// int output_height = col.dims()[0];
int output_height = col.dims()[0];
int output_width = col.dims()[1];
int row_begin, row_end;
int padding_height = std::max(up_pad, down_pad);
int padding_width = 0;
if (up_pad >= down_pad) {
row_begin = 0;
} else {
row_begin = down_pad - up_pad;
}
row_end = row_begin + ((input_height + up_pad + down_pad - filter_height) /
PADDLE_ENFORCE_EQ(
(input_height + padding_up + padding_down - filter_height) /
stride_height +
1);
1,
output_height,
"Output_height and padding(padding_up, padding_down) are "
"inconsistent.");
PADDLE_ENFORCE_EQ(
(input_width + padding_left + padding_right - filter_width) /
stride_width +
1,
output_width,
"output_width and padding(padding_left, padding_right) are "
"inconsistent.");
const T* im_data = im.data<T>();
T* col_data = col.data<T>();
for (int col_row_idx = row_begin; col_row_idx < row_end; ++col_row_idx) {
for (int col_row_idx = 0; col_row_idx < output_height; ++col_row_idx) {
for (int col_col_idx = 0; col_col_idx < output_width; ++col_col_idx) {
for (int channel = 0; channel < input_channels; ++channel) {
for (int filter_row_idx = 0; filter_row_idx < filter_height;
......@@ -175,11 +207,10 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
for (int filter_col_idx = 0; filter_col_idx < filter_width;
++filter_col_idx) {
int im_row_offset =
col_row_idx * stride_height + filter_row_idx - padding_height;
col_row_idx * stride_height + filter_row_idx - padding_up;
int im_col_offset =
col_col_idx * stride_width + filter_col_idx - padding_width;
int col_offset =
((((col_row_idx - row_begin) * output_width + col_col_idx) *
col_col_idx * stride_width + filter_col_idx - padding_left;
int col_offset = ((((col_row_idx)*output_width + col_col_idx) *
input_channels +
channel) *
filter_height +
......@@ -214,7 +245,8 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
public:
void operator()(const platform::DeviceContext& context, framework::Tensor& im,
const framework::Tensor& col, int stride_height,
int stride_width, int up_pad, int down_pad) {
int stride_width, int padding_up, int padding_down,
int padding_left, int padding_right) {
PADDLE_ENFORCE(im.dims().size() == 3);
PADDLE_ENFORCE(col.dims().size() == 5);
int input_channels = im.dims()[0];
......@@ -222,37 +254,39 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
int input_width = im.dims()[2];
int filter_height = col.dims()[3];
int filter_width = col.dims()[4];
// int output_height = col.dims()[0];
int output_height = col.dims()[0];
int output_width = col.dims()[1];
int row_begin, row_end;
int padding_height = std::max(up_pad, down_pad);
int padding_width = 0;
if (up_pad >= down_pad) {
row_begin = 0;
} else {
row_begin = down_pad - up_pad;
}
row_end = row_begin + ((input_height + up_pad + down_pad - filter_height) /
PADDLE_ENFORCE_EQ(
(input_height + padding_up + padding_down - filter_height) /
stride_height +
1);
1,
output_height,
"Output_height and padding(padding_up, padding_down) are "
"inconsistent.");
PADDLE_ENFORCE_EQ(
(input_width + padding_left + padding_right - filter_width) /
stride_width +
1,
output_width,
"output_width and padding(padding_left, padding_right) are "
"inconsistent.");
T* im_data = im.data<T>();
const T* col_data = col.data<T>();
for (int col_row_idx = row_begin; col_row_idx < row_end; ++col_row_idx) {
for (int col_row_idx = 0; col_row_idx < output_height; ++col_row_idx) {
for (int col_col_idx = 0; col_col_idx < output_width; ++col_col_idx) {
for (int channel = 0; channel < input_channels; ++channel) {
for (int filter_row_idx = 0; filter_row_idx < filter_height;
++filter_row_idx) {
for (int filter_col_idx = 0; filter_col_idx < filter_width;
++filter_col_idx) {
int im_row_offset = // change or not ???
col_row_idx * stride_height + filter_row_idx - padding_height;
int im_row_offset =
col_row_idx * stride_height + filter_row_idx - padding_up;
int im_col_offset =
col_col_idx * stride_width + filter_col_idx - padding_width;
int col_offset =
((((col_row_idx - row_begin) * output_width + col_col_idx) *
col_col_idx * stride_width + filter_col_idx - padding_left;
int col_offset = (((col_row_idx * output_width + col_col_idx) *
input_channels +
channel) *
filter_height +
......
......@@ -66,8 +66,8 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
public:
void operator()(const platform::DeviceContext& context,
const framework::Tensor& im, framework::Tensor& col,
int stride_height, int stride_width, int padding_height,
int padding_width) {
int stride_height, int stride_width, int padding_up,
int padding_down, int padding_left, int padding_right) {
PADDLE_ENFORCE(im.dims().size() == 3);
PADDLE_ENFORCE(col.dims().size() == 5);
......@@ -79,6 +79,15 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
int output_height = col.dims()[3];
int output_width = col.dims()[4];
PADDLE_ENFORCE((input_height + padding_up + padding_down - filter_height) /
stride_height +
1 ==
output_height);
PADDLE_ENFORCE((input_width + padding_left + padding_right - filter_width) /
stride_width +
1 ==
output_width);
int num_outputs = input_channels * output_height * output_width;
int blocks = (num_outputs + 1024 - 1) / 1024;
int block_x = 512;
......@@ -89,8 +98,8 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
reinterpret_cast<const platform::CUDADeviceContext&>(context)
.stream()>>>(
im.data<T>(), num_outputs, input_height, input_width, filter_height,
filter_width, stride_height, stride_width, padding_height,
padding_width, output_height, output_width, col.data<T>());
filter_width, stride_height, stride_width, padding_up, padding_left,
output_height, output_width, col.data<T>());
}
};
......@@ -152,7 +161,8 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
public:
void operator()(const platform::DeviceContext& context, framework::Tensor& im,
const framework::Tensor& col, int stride_height,
int stride_width, int padding_height, int padding_width) {
int stride_width, int padding_up, int padding_down,
int padding_left, int padding_right) {
PADDLE_ENFORCE(im.dims().size() == 3);
PADDLE_ENFORCE(col.dims().size() == 5);
......@@ -164,8 +174,18 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
int output_height = col.dims()[3];
int output_width = col.dims()[4];
size_t num_kernels = input_channels * (input_height + 2 * padding_height) *
(input_width + 2 * padding_width);
PADDLE_ENFORCE((input_height + padding_up + padding_down - filter_height) /
stride_height +
1 ==
output_height);
PADDLE_ENFORCE((input_width + padding_left + padding_right - filter_width) /
stride_width +
1 ==
output_width);
size_t num_kernels = input_channels *
(input_height + padding_up + padding_down) *
(input_width + padding_left + padding_right);
size_t blocks = (num_kernels + 1024 - 1) / 1024;
size_t block_x = 512;
......@@ -178,10 +198,10 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
col2im<T><<<grid, threads, 0,
reinterpret_cast<const platform::CUDADeviceContext&>(context)
.stream()>>>(
num_kernels, col.data<T>(), input_height + 2 * padding_height,
input_width + 2 * padding_width, input_channels, filter_height,
filter_width, stride_height, stride_width, padding_height,
padding_width, output_height, output_width, im.data<T>());
num_kernels, col.data<T>(), input_height + padding_up + padding_down,
input_width + padding_left + padding_left, input_channels,
filter_height, filter_width, stride_height, stride_width, padding_up,
padding_left, output_height, output_width, im.data<T>());
}
};
......@@ -199,8 +219,7 @@ __global__ void im2colOCF(const T* im_data, T* col_data, int input_channels,
int input_height, int input_width, int filter_height,
int filter_width, int stride_height, int stride_width,
int padding_height, int padding_width,
int output_height, int output_width, int row_begin,
int row_end) {
int output_height, int output_width) {
int swid = blockIdx.x;
int shid = blockIdx.y;
for (int channelid = threadIdx.z; channelid < input_channels;
......@@ -208,8 +227,7 @@ __global__ void im2colOCF(const T* im_data, T* col_data, int input_channels,
for (int idy = threadIdx.y; idy < filter_height; idy += blockDim.y) {
for (int idx = threadIdx.x; idx < filter_width; idx += blockDim.x) {
int width_offset = idx + swid * stride_width - padding_width;
int height_offset =
idy + (shid + row_begin) * stride_height - padding_height;
int height_offset = idy + shid * stride_height - padding_height;
int im_offset = width_offset + height_offset * input_width +
channelid * input_height * input_width;
......@@ -240,8 +258,8 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
public:
void operator()(const platform::DeviceContext& context,
const framework::Tensor& im, framework::Tensor& col,
int stride_height, int stride_width, int up_pad,
int down_pad) {
int stride_height, int stride_width, int padding_up,
int padding_down, int padding_left, int padding_right) {
PADDLE_ENFORCE(im.dims().size() == 3);
PADDLE_ENFORCE(col.dims().size() == 5);
int input_channels = im.dims()[0];
......@@ -249,21 +267,17 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
int input_width = im.dims()[2];
int filter_height = col.dims()[3];
int filter_width = col.dims()[4];
int output_height = col.dims()[0];
int output_width = col.dims()[1];
int row_begin, row_end;
int padding_height = std::max(up_pad, down_pad);
int padding_width = 0;
if (up_pad >= down_pad) {
row_begin = 0;
} else {
row_begin = down_pad - up_pad;
}
row_end = row_begin + ((input_height + up_pad + down_pad - filter_height) /
PADDLE_ENFORCE((input_height + padding_up + padding_down - filter_height) /
stride_height +
1);
int output_height = row_end - row_begin; // col.dims()[0];
int output_width = col.dims()[1];
1 ==
output_height);
PADDLE_ENFORCE((input_width + padding_left + padding_right - filter_width) /
stride_width +
1 ==
output_width);
int block_dim_x = 0;
int block_dim_y = 0;
......@@ -289,9 +303,8 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
reinterpret_cast<const platform::CUDADeviceContext&>(context)
.stream()>>>(
im.data<T>(), col.data<T>(), input_channels, input_height, input_width,
filter_height, filter_width, stride_height, stride_width,
padding_height, padding_width, output_height, output_width, row_begin,
row_end);
filter_height, filter_width, stride_height, stride_width, padding_up,
padding_left, output_height, output_width);
}
};
......@@ -300,8 +313,7 @@ __global__ void col2imOCF(T* im_data, const T* col_data, int input_channels,
int input_height, int input_width, int filter_height,
int filter_width, int stride_height, int stride_width,
int padding_height, int padding_width,
int output_height, int output_width, int row_begin,
int row_end) {
int output_height, int output_width) {
int swid = blockIdx.x;
int shid = blockIdx.y;
for (int channelid = threadIdx.z; channelid < input_channels;
......@@ -309,8 +321,7 @@ __global__ void col2imOCF(T* im_data, const T* col_data, int input_channels,
for (int idy = threadIdx.y; idy < filter_height; idy += blockDim.y) {
for (int idx = threadIdx.x; idx < filter_width; idx += blockDim.x) {
int width_offset = idx + swid * stride_width - padding_width;
int height_offset =
idy + (shid + row_begin) * stride_height - padding_height;
int height_offset = idy + shid * stride_height - padding_height;
int im_offset = width_offset + height_offset * input_width +
channelid * input_height * input_width;
......@@ -340,7 +351,8 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
public:
void operator()(const platform::DeviceContext& context, framework::Tensor& im,
const framework::Tensor& col, int stride_height,
int stride_width, int up_pad, int down_pad) {
int stride_width, int padding_up, int padding_down,
int padding_left, int padding_right) {
PADDLE_ENFORCE(im.dims().size() == 3);
PADDLE_ENFORCE(col.dims().size() == 5);
int input_channels = im.dims()[0];
......@@ -348,21 +360,17 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
int input_width = im.dims()[2];
int filter_height = col.dims()[3];
int filter_width = col.dims()[4];
int output_height = col.dims()[0];
int output_width = col.dims()[1];
int row_begin, row_end;
int padding_height = std::max(up_pad, down_pad);
int padding_width = 0;
if (up_pad >= down_pad) {
row_begin = 0;
} else {
row_begin = down_pad - up_pad;
}
row_end = row_begin + ((input_height + up_pad + down_pad - filter_height) /
PADDLE_ENFORCE((input_height + padding_up + padding_down - filter_height) /
stride_height +
1);
int output_height = row_end - row_begin; // col.dims()[0];
int output_width = col.dims()[1];
1 ==
output_height);
PADDLE_ENFORCE((input_width + padding_left + padding_right - filter_width) /
stride_width +
1 ==
output_width);
int block_dim_x = 0;
int block_dim_y = 0;
......@@ -388,9 +396,8 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
reinterpret_cast<const platform::CUDADeviceContext&>(context)
.stream()>>>(
im.data<T>(), col.data<T>(), input_channels, input_height, input_width,
filter_height, filter_width, stride_height, stride_width,
padding_height, padding_width, output_height, output_width, row_begin,
row_end);
filter_height, filter_width, stride_height, stride_width, padding_up,
padding_left, output_height, output_width);
}
};
......
......@@ -74,8 +74,8 @@ class Im2ColFunctor {
public:
void operator()(const platform::DeviceContext& context,
const framework::Tensor& im, framework::Tensor& col,
int stride_height, int stride_width, int padding_height,
int padding_width);
int stride_height, int stride_width, int padding_up,
int padding_down, int padding_left, int padding_right);
};
template <ColFormat Format, typename Place, typename T>
......@@ -83,7 +83,8 @@ class Col2ImFunctor {
public:
void operator()(const platform::DeviceContext& context, framework::Tensor& im,
const framework::Tensor& col, int stride_height,
int stride_width, int padding_height, int padding_width);
int stride_width, int padding_up, int padding_down,
int padding_left, int padding_right);
};
} // namespace math
......
......@@ -85,10 +85,10 @@ void testIm2col() {
paddle::operators::math::ColFormat::kOCF, Place, float>
im2col_ocf;
im2col(*context, input, output_cfo, stride, stride, padding, padding);
im2col_ocf(*context, input, output_ocf, /*stride_height*/ stride,
/*stride_width*/ stride, /*up_pad*/ padding,
/*down_pad*/ padding);
im2col(*context, input, output_cfo, stride, stride, padding, padding, padding,
padding);
im2col_ocf(*context, input, output_ocf, stride, stride, padding, padding,
padding, padding);
float out_cfo_data[] = {0, 1, 1, 2, 3, 4, 4, 5};
float out_ocf_data[] = {0, 1, 3, 4, 1, 2, 4, 5};
......@@ -131,7 +131,8 @@ void testIm2col() {
input.CopyFrom(input_tmp, *place, *context);
}
col2im(*context, input, output_cfo, stride, stride, padding, padding);
col2im(*context, input, output_cfo, stride, stride, padding, padding, padding,
padding);
float* in_ptr;
if (paddle::platform::is_cpu_place(*place)) {
......@@ -152,9 +153,8 @@ void testIm2col() {
input.CopyFrom(input_tmp, *place, *context);
}
col2im_ocf(*context, input, output_ocf, /*stride_height*/ stride,
/*stride_width*/ stride, /*up_pad*/ padding,
/*down_pad*/ padding);
col2im_ocf(*context, input, output_ocf, stride, stride, padding, padding,
padding, padding);
if (paddle::platform::is_cpu_place(*place)) {
in_ptr = input.data<float>();
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/math/lstm_compute.h"
#include "paddle/operators/math/detail/lstm_cpu_kernel.h"
#include "paddle/operators/math/detail/lstm_kernel.h"
namespace paddle {
namespace operators {
namespace math {
template <class T>
struct LstmUnitFunctor<platform::CPUPlace, T> {
static void compute(const platform::DeviceContext& context,
LstmMetaValue<T> value, int frame_size, int batch_size,
const std::string& gate_act, const std::string& cell_act,
const std::string& cand_act) {
for (int b = 0; b < batch_size; b++) {
detail::cpu_lstm_forward(detail::forward::lstm<T>(), value, frame_size,
ActiveType(cand_act), ActiveType(gate_act),
ActiveType(cell_act));
value.gateValue += frame_size * 4;
value.stateValue += frame_size;
value.stateActiveValue += frame_size;
value.outputValue += frame_size;
if (value.prevStateValue) {
value.prevStateValue += frame_size;
}
}
}
};
template <class T>
struct LstmUnitGradFunctor<platform::CPUPlace, T> {
static void compute(const platform::DeviceContext& context,
LstmMetaValue<T> value, LstmMetaGrad<T> grad,
int frame_size, int batch_size,
const std::string& gate_act, const std::string& cell_act,
const std::string& cand_act) {
for (int b = 0; b < batch_size; b++) {
detail::cpu_lstm_backward(detail::backward::lstm<T>(), value, grad,
frame_size, ActiveType(cand_act),
ActiveType(gate_act), ActiveType(cell_act));
value.gateValue += frame_size * 4;
value.stateValue += frame_size;
value.stateActiveValue += frame_size;
value.outputValue += frame_size;
if (value.prevStateValue) {
value.prevStateValue += frame_size;
}
grad.gateGrad += frame_size * 4;
grad.stateGrad += frame_size;
grad.stateActiveGrad += frame_size;
grad.outputGrad += frame_size;
if (grad.prevStateGrad) {
grad.prevStateGrad += frame_size;
}
}
}
};
template class LstmUnitFunctor<platform::CPUPlace, float>;
template class LstmUnitFunctor<platform::CPUPlace, double>;
template class LstmUnitGradFunctor<platform::CPUPlace, float>;
template class LstmUnitGradFunctor<platform::CPUPlace, double>;
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/math/detail/lstm_gpu_kernel.h"
#include "paddle/operators/math/detail/lstm_kernel.h"
#include "paddle/operators/math/lstm_compute.h"
namespace paddle {
namespace operators {
namespace math {
template <class T>
struct LstmUnitFunctor<platform::GPUPlace, T> {
static void compute(const platform::DeviceContext& context,
LstmMetaValue<T> value, int frame_size, int batch_size,
const std::string& gate_act, const std::string& cell_act,
const std::string& cand_act) {
detail::gpu_lstm_forward<T>(context, detail::forward::lstm<T>(), value,
frame_size, batch_size, ActiveType(cand_act),
ActiveType(gate_act), ActiveType(cell_act));
}
};
template <class T>
struct LstmUnitGradFunctor<platform::GPUPlace, T> {
static void compute(const platform::DeviceContext& context,
LstmMetaValue<T> value, LstmMetaGrad<T> grad,
int frame_size, int batch_size,
const std::string& gate_act, const std::string& cell_act,
const std::string& cand_act) {
detail::gpu_lstm_backward(context, detail::backward::lstm<T>(), value, grad,
frame_size, batch_size, ActiveType(cand_act),
ActiveType(gate_act), ActiveType(cell_act));
}
};
template class LstmUnitFunctor<platform::GPUPlace, float>;
template class LstmUnitFunctor<platform::GPUPlace, double>;
template class LstmUnitGradFunctor<platform::GPUPlace, float>;
template class LstmUnitGradFunctor<platform::GPUPlace, double>;
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/platform/device_context.h"
#include "paddle/platform/enforce.h"
namespace paddle {
namespace operators {
namespace math {
typedef enum {
HL_ACTIVATION_SIGMOID = 0,
HL_ACTIVATION_RELU = 1,
HL_ACTIVATION_TANH = 2,
HL_ACTIVATION_LINEAR = 3,
HL_ACTIVATION_END
} activation_mode_t;
template <class T>
struct LstmMetaValue {
T *gateValue;
T *prevStateValue;
T *stateValue;
T *stateActiveValue;
T *outputValue;
T *checkIg;
T *checkFg;
T *checkOg;
};
template <class T>
struct LstmMetaGrad {
T *gateGrad;
T *prevStateGrad;
T *stateGrad;
T *stateActiveGrad;
T *outputGrad;
T *checkIgGrad;
T *checkFgGrad;
T *checkOgGrad;
};
inline activation_mode_t ActiveType(const std::string &type) {
if (type == "sigmoid") {
return HL_ACTIVATION_SIGMOID;
} else if (type == "relu") {
return HL_ACTIVATION_RELU;
} else if (type == "tanh") {
return HL_ACTIVATION_TANH;
} else if (type == "linear" || type == "identity" || type == "") {
return HL_ACTIVATION_LINEAR;
} else {
PADDLE_THROW("Do not support activation type.");
}
}
template <typename Place, typename T>
class LstmUnitFunctor {
public:
static void compute(const platform::DeviceContext &context,
LstmMetaValue<T> value, int frame_size, int batch_size,
const std::string &gate_act, const std::string &cell_act,
const std::string &cand_act);
};
template <typename Place, typename T>
class LstmUnitGradFunctor {
public:
static void compute(const platform::DeviceContext &context,
LstmMetaValue<T> value, LstmMetaGrad<T> grad,
int frame_size, int batch_size,
const std::string &gate_act, const std::string &cell_act,
const std::string &cand_act);
};
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/math/sequence2batch.h"
namespace paddle {
namespace operators {
namespace math {
template <typename T>
class CopyMatrixRowsFunctor<platform::CPUPlace, T> {
public:
void operator()(const platform::DeviceContext& context,
const framework::LoDTensor& src, const size_t* index,
framework::LoDTensor& dst, bool is_src_index) {
auto src_dims = src.dims();
auto dst_dims = dst.dims();
PADDLE_ENFORCE_EQ(src_dims.size(), 2UL,
"The src must be matrix with rank 2.");
PADDLE_ENFORCE_EQ(dst_dims.size(), 2UL,
"The dst must be matrix with rank 2.");
PADDLE_ENFORCE_EQ(src_dims[1], dst_dims[1],
"The width of src and dst must be same.");
auto height = dst_dims[0];
auto width = dst_dims[1];
auto* src_data = src.data<T>();
auto* dst_data = dst.data<T>();
for (int i = 0; i < height; ++i) {
if (is_src_index) {
memcpy(dst_data + i * width, src_data + index[i] * width,
width * sizeof(T));
} else {
memcpy(dst_data + index[i] * width, src_data + i * width,
width * sizeof(T));
}
}
}
};
template class CopyMatrixRowsFunctor<platform::CPUPlace, float>;
template class CopyMatrixRowsFunctor<platform::CPUPlace, double>;
template class LoDTensor2BatchFunctor<platform::CPUPlace, float>;
template class LoDTensor2BatchFunctor<platform::CPUPlace, double>;
template class Batch2LoDTensorFunctor<platform::CPUPlace, float>;
template class Batch2LoDTensorFunctor<platform::CPUPlace, double>;
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/math/sequence2batch.h"
namespace paddle {
namespace operators {
namespace math {
template <typename T, int BlockDimX, int BlockDimY, int GridDimX>
__global__ void CopyMatrixRowsKernel(const T* src, T* dst, const size_t* index,
int64_t height, int64_t width,
bool is_src_index) {
int idx = threadIdx.x;
int idy = threadIdx.y;
int id = blockIdx.x + idy * GridDimX;
while (id < height) {
int src_idx = is_src_index ? index[id] : id;
int dst_idx = is_src_index ? id : index[id];
const T* src_data = src + src_idx * width;
T* dst_data = dst + dst_idx * width;
for (int i = idx; i < width; i += BlockDimX) {
dst_data[i] = src_data[i];
}
id += BlockDimY * GridDimX;
}
}
template <typename T>
class CopyMatrixRowsFunctor<platform::GPUPlace, T> {
public:
void operator()(const platform::DeviceContext& context,
const framework::LoDTensor& src, const size_t* index,
framework::LoDTensor& dst, bool is_src_index) {
auto src_dims = src.dims();
auto dst_dims = dst.dims();
PADDLE_ENFORCE_EQ(src_dims.size(), 2,
"The src must be matrix with rank 2.");
PADDLE_ENFORCE_EQ(dst_dims.size(), 2,
"The dst must be matrix with rank 2.");
PADDLE_ENFORCE_EQ(src_dims[1], dst_dims[1],
"The width of src and dst must be same.");
auto height = dst_dims[0];
auto width = dst_dims[1];
auto* src_data = src.data<T>();
auto* dst_data = dst.data<T>();
dim3 threads(128, 8);
dim3 grid(8, 1);
auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(context).stream();
CopyMatrixRowsKernel<T, 128, 8, 8><<<grid, threads, 0, stream>>>(
src_data, dst_data, index, height, width, is_src_index);
}
};
template class CopyMatrixRowsFunctor<platform::GPUPlace, float>;
template class CopyMatrixRowsFunctor<platform::GPUPlace, double>;
template class LoDTensor2BatchFunctor<platform::GPUPlace, float>;
template class LoDTensor2BatchFunctor<platform::GPUPlace, double>;
template class Batch2LoDTensorFunctor<platform::GPUPlace, float>;
template class Batch2LoDTensorFunctor<platform::GPUPlace, double>;
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/lod_tensor.h"
#include "paddle/framework/tensor.h"
#include "paddle/platform/device_context.h"
namespace paddle {
namespace operators {
namespace math {
template <typename Place, typename T>
class CopyMatrixRowsFunctor {
public:
// If is_src_index is true,
// copy the indexed rows of input src to the output dst.
// If is_src_index is false,
// copy the input src to the indexed rows of output dst.
// The indexed rows are based on the input index.
void operator()(const platform::DeviceContext& context,
const framework::LoDTensor& src, const size_t* index,
framework::LoDTensor& dst, bool is_src_index);
};
template <typename Place, typename T>
class LoDTensor2BatchFunctor {
// Calculate the length of each sequence and
// sort sequence index by the length.
// example: sequences = {s0, s1, s2}
// s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2
// seq_info[3] = {(4, 5, 1), (0, 4, 0), (9, 3, 2)}
//
struct SeqInfo {
SeqInfo(int start, int length, int seq_idx)
: start(start), length(length), seq_idx(seq_idx) {}
int start;
int length;
int seq_idx;
};
public:
void operator()(const platform::DeviceContext& context,
const framework::LoDTensor& lod_tensor,
framework::LoDTensor& batch, bool is_reverse) const {
auto lods = lod_tensor.lod();
PADDLE_ENFORCE_EQ(lods.size(), 1UL, "Only support one level sequence now.");
auto lod = lods[0];
std::vector<SeqInfo> seq_info;
for (size_t seq_id = 0; seq_id < lod.size() - 1; ++seq_id) {
int length = lod[seq_id + 1] - lod[seq_id];
seq_info.emplace_back(lod[seq_id], length, seq_id);
}
std::sort(seq_info.begin(), seq_info.end(),
[](SeqInfo a, SeqInfo b) { return a.length > b.length; });
// calculate the start position of each batch
// (numBatch equal the maxLength of sequences)
// example: sequences = {s0, s1, s2}
// s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2
// num_batch = 5,
// batchIndex = {b0, b1, b2, b3, b4}
// b0: 1 0 2, b1: 1 0 2, b2: 1 0 2, b3: 1 0, b4: 1
// batch_start_positions[6] = {0, 3, 6, 9, 11, 12}
// batch_start_positions[0] = len(b0)
// batch_start_positions[1] = len(b0) + len(b1)
// batch_start_positions[2] = len(b0) + len(b1) + len(b2)
// ...
// seq2batch_idx[12] = {4, 0, 9,
// 5, 1, 10,
// 6, 2, 11,
// 7, 3,
// 8}
// The batch number represents batch size after rearranging the
// input LodTensor. It is also the maximum length of input sequence.
paddle::framework::LoD batch_lods;
batch_lods.emplace_back(std::vector<size_t>{0});
batch_lods.emplace_back(std::vector<size_t>{0});
// batch_lods[0] is the start positions for batch LoDTensor
int num_batch = seq_info[0].length;
batch_lods[0].resize(static_cast<size_t>(num_batch + 1));
// batch_lods[1] is the raw index in the input LoDTensor
auto dims = lod_tensor.dims();
batch_lods[1].resize(static_cast<size_t>(dims[0]));
size_t* batch_starts = batch_lods[0].data();
size_t* seq2batch_idx = batch_lods[1].data();
batch_starts[0] = 0;
for (size_t n = 0; n < num_batch; n++) {
auto batch_id = static_cast<int>(batch_starts[n]);
for (size_t i = 0; i < seq_info.size(); ++i) {
size_t seq_len = seq_info[i].length;
int start = seq_info[i].start;
if (n < seq_len) {
seq2batch_idx[batch_id] =
is_reverse ? start + seq_len - 1 - n : start + n;
batch_id++;
} else {
break;
}
}
batch_starts[n + 1] = static_cast<size_t>(batch_id);
}
batch.set_lod(batch_lods);
CopyMatrixRowsFunctor<Place, T> to_batch;
to_batch(context, lod_tensor, seq2batch_idx, batch, true);
}
};
template <typename Place, typename T>
class Batch2LoDTensorFunctor {
public:
void operator()(const platform::DeviceContext& context,
const framework::LoDTensor& batch,
framework::LoDTensor& lod_tensor) const {
auto in_lod = batch.lod();
PADDLE_ENFORCE_EQ(in_lod.size(), 2UL,
"The LoD size of input `batch` should be 2.");
auto out_lod = lod_tensor.lod()[0];
auto num = out_lod[out_lod.size() - 1];
PADDLE_ENFORCE_EQ(num, lod_tensor.dims()[0]);
PADDLE_ENFORCE_EQ(num, in_lod[1].size());
PADDLE_ENFORCE_EQ(num, batch.dims()[0]);
CopyMatrixRowsFunctor<Place, T> to_seq;
size_t* index = in_lod[1].data();
to_seq(context, batch, index, lod_tensor, false);
}
};
} // namespace math
} // namespace operators
} // namespace paddle
......@@ -133,8 +133,8 @@ class SequenceProjectFunctor {
in_t.Resize(framework::make_ddim(input_shape));
im2col_ocf(context, in_t, out_t,
/*stride_height*/ context_stride, /*stride_width*/ 0, up_pad,
down_pad);
/*stride_height*/ context_stride, /*stride_width*/ 1, up_pad,
down_pad, 0, 0);
}
if (padding_trainable) {
......
......@@ -49,7 +49,19 @@ class MulOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE_EQ(
x_mat_dims[1], y_mat_dims[0],
"First matrix's width must be equal with second matrix's height.");
ctx->SetOutputDim("Out", {x_mat_dims[0], y_mat_dims[1]});
std::vector<int64_t> output_dims;
output_dims.reserve(
static_cast<size_t>(x_num_col_dims + y_dims.size() - y_num_col_dims));
for (int i = 0; i < x_num_col_dims; ++i) {
output_dims.push_back(x_dims[i]);
}
for (int i = y_num_col_dims; i < y_dims.size(); ++i) {
output_dims.push_back(y_dims[i]);
}
ctx->SetOutputDim("Out", framework::make_ddim(output_dims));
ctx->ShareLoD("X", /*->*/ "Out");
}
};
......@@ -109,15 +121,6 @@ class MulOpGrad : public framework::OperatorWithKernel {
auto y_mat_dims = framework::flatten_to_2d(
y_dims, ctx->Attrs().Get<int>("y_num_col_dims"));
PADDLE_ENFORCE_EQ(
x_mat_dims[0], out_dims[0],
"The first dimension of Out@GRAD must equal to the first dimension of "
"the first operand.");
PADDLE_ENFORCE_EQ(
y_mat_dims[1], out_dims[1],
"The second dimension of Out@GRAD must equal to the second "
"dimension of the second operand.");
auto x_grad_name = framework::GradVarName("X");
auto y_grad_name = framework::GradVarName("Y");
......
......@@ -46,8 +46,15 @@ class MulKernel : public framework::OpKernel<T> {
: *y;
z->mutable_data<T>(context.GetPlace());
auto z_dim = z->dims();
if (z_dim.size() != 2) {
z->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
}
math::matmul<Place, T>(context.device_context(), x_matrix, false, y_matrix,
false, 1, z, 0);
if (z_dim.size() != 2) {
z->Resize(z_dim);
}
}
};
......@@ -67,6 +74,11 @@ class MulGradKernel : public framework::OpKernel<T> {
: *y;
const Tensor* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
Tensor dout_mat;
dout_mat.ShareDataWith(*dout);
dout_mat.Resize({framework::flatten_to_2d(x->dims(), x_num_col_dims)[0],
framework::flatten_to_2d(y->dims(), y_num_col_dims)[1]});
Tensor* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
Tensor* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
if (dx) {
......@@ -74,9 +86,10 @@ class MulGradKernel : public framework::OpKernel<T> {
Tensor dx_matrix = dx->dims().size() > 2
? framework::ReshapeToMatrix(*dx, x_num_col_dims)
: *dx;
// dx = dout * y'. dx: M x K, dout : M x N, y : K x N
math::matmul<Place, T>(ctx.device_context(), *dout, false, y_matrix, true,
1, &dx_matrix, 0);
math::matmul<Place, T>(ctx.device_context(), dout_mat, false, y_matrix,
true, 1, &dx_matrix, 0);
}
if (dy) {
dy->mutable_data<T>(ctx.GetPlace());
......@@ -84,8 +97,8 @@ class MulGradKernel : public framework::OpKernel<T> {
? framework::ReshapeToMatrix(*dy, y_num_col_dims)
: *dy;
// dy = x' * dout. dy K x N, dout : M x N, x : M x K
math::matmul<Place, T>(ctx.device_context(), x_matrix, true, *dout, false,
1, &dy_matrix, 0);
math::matmul<Place, T>(ctx.device_context(), x_matrix, true, dout_mat,
false, 1, &dy_matrix, 0);
}
}
};
......
......@@ -160,66 +160,6 @@ class ReduceMinOpMaker : public ReduceOpMaker {
}
};
class NormOp : public NetOp {
public:
NormOp(const std::string &type, const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: NetOp(type, inputs, outputs, attrs) {
PADDLE_ENFORCE_NE(Input("X"), framework::kEmptyVarName,
"Input(X) of NormOp should not be null.");
PADDLE_ENFORCE_NE(Output("AbsOut"), framework::kEmptyVarName,
"Output(AbsOut) of NormOp should not be null.");
PADDLE_ENFORCE_NE(Output("PowOut"), framework::kEmptyVarName,
"Output(PowOut) of NormOp should not be null.");
PADDLE_ENFORCE_NE(Output("SumOut"), framework::kEmptyVarName,
"Output(SumOut) of NormOp should not be null.");
PADDLE_ENFORCE_NE(Output("Out"), framework::kEmptyVarName,
"Output(Out) of NormOp should not be null.");
auto dim = Attr<int>("dim");
auto keep_dim = Attr<bool>("keep_dim");
auto p = Attr<float>("p");
PADDLE_ENFORCE_GT(p, 0, "Order of the norm should be positive.");
AppendOp(framework::OpRegistry::CreateOp("abs", {{"X", {Input("X")}}},
{{"Y", {Output("AbsOut")}}}, {}));
AppendOp(framework::OpRegistry::CreateOp("pow", {{"X", {Output("AbsOut")}}},
{{"Y", {Output("PowOut")}}},
{{"factor", p}}));
framework::AttributeMap sum_attr;
sum_attr["dim"] = dim;
sum_attr["keep_dim"] = keep_dim;
AppendOp(framework::OpRegistry::CreateOp(
"reduce_sum", {{"X", {Output("PowOut")}}},
{{"Out", {Output("SumOut")}}}, sum_attr));
AppendOp(framework::OpRegistry::CreateOp(
"pow", {{"X", {Output("SumOut")}}}, {{"Y", {Output("Out")}}},
{{"factor", static_cast<float>(1. / p)}}));
CompleteAddOp(false);
}
};
class NormOpMaker : public ReduceOpMaker {
public:
NormOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
: ReduceOpMaker(proto, op_checker) {
AddOutput("AbsOut",
"(Tensor) The intermediate output of Norm operator, "
"saving the absolute value of the input tensor X.")
.AsIntermediate();
AddOutput("PowOut",
"(Tensor) The intermediate output of Norm operator, "
"saving the p-th power of the output tensor AbsOut.")
.AsIntermediate();
AddOutput("SumOut",
"(Tensor) the intermediate output of Norm operator, "
"saving the sum of PowOut reduced on the given dimension.")
.AsIntermediate();
AddAttr<float>("p", "(float, default 2) The order of Norm.").SetDefault(2);
SetComment("Norm", "vector p-norm");
AddComment(comment_);
}
};
} // namespace operators
} // namespace paddle
......@@ -237,8 +177,6 @@ REGISTER_OP(reduce_max, ops::ReduceOp, ops::ReduceMaxOpMaker, reduce_max_grad,
REGISTER_OP(reduce_min, ops::ReduceOp, ops::ReduceMinOpMaker, reduce_min_grad,
ops::ReduceGradOp);
REGISTER_OP_WITHOUT_GRADIENT(norm, ops::NormOp, ops::NormOpMaker);
#define REGISTER_REDUCE_CPU_KERNEL(reduce_type, functor, grad_functor) \
REGISTER_OP_CPU_KERNEL( \
reduce_type, \
......
......@@ -167,8 +167,8 @@ class SequenceConvGradKernel : public framework::OpKernel<T> {
in_t.Resize(framework::make_ddim(input_shape));
col2im_ocf(context.device_context(), in_t, col_t,
/*stride_height*/ context_stride, /*stride_width*/ 0,
up_pad, down_pad);
/*stride_height*/ context_stride, /*stride_width*/ 1,
up_pad, down_pad, 0, 0);
}
col_t.Resize(framework::make_ddim(
{sequence_height, context_length * sequence_width}));
......
......@@ -62,11 +62,13 @@ class SmoothL1LossOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("InsideWeight",
"Optional input tensor of smooth l1 loss op with the same shape "
"as X. If provided, the result of (X - Y) will be multiplied "
"by this tensor element by element.");
"by this tensor element by element.")
.AsDispensable();
AddInput("OutsideWeight",
"Optinal input of smooth l1 loss op with the same shape as X."
"If provided, the output smooth l1 loss will be multiplied by "
"this tensor element by element.");
"this tensor element by element.")
.AsDispensable();
AddOutput("Diff", "Intermediate variable to cache InsideWeight*(X-Y).")
.AsIntermediate();
AddOutput("Out", "Smooth l1 loss.");
......
......@@ -65,7 +65,7 @@ class UniformRandomOp : public framework::OperatorWithKernel {
protected:
framework::DataType IndicateDataType(
const framework::ExecutionContext& ctx) const override {
return static_cast<framework::DataType>(Attr<int>("data_type"));
return static_cast<framework::DataType>(ctx.Attr<int>("data_type"));
}
};
......
......@@ -25,3 +25,4 @@ nv_test(device_context_test SRCS device_context_test.cc DEPS device_context gpu_
nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place device_context)
nv_test(nccl_test SRCS nccl_test.cu DEPS dynload_cuda gpu_info device_context)
cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags)
nv_library(dynload_cuda SRCS cublas.cc cudnn.cc curand.cc DEPS dynamic_loader)
nv_library(dynload_cuda SRCS cublas.cc cudnn.cc curand.cc nccl.cc
DEPS dynamic_loader nccl)
......@@ -35,6 +35,11 @@ DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so.");
DEFINE_string(lapack_dir, "", "Specify path for loading liblapack.so.");
DEFINE_string(nccl_dir, "",
"Specify path for loading nccl library, such as libcublas, "
"libcurand. For instance, /usr/local/cuda/lib64. If default, "
"dlopen will search cuda from LD_LIBRARY_PATH");
namespace paddle {
namespace platform {
namespace dynload {
......@@ -157,6 +162,14 @@ void GetLapackDsoHandle(void** dso_handle) {
#endif
}
void GetNCCLDsoHandle(void** dso_handle) {
#if defined(__APPLE__) || defined(__OSX__)
GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.dylib", dso_handle);
#else
GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.so", dso_handle);
#endif
}
} // namespace dynload
} // namespace platform
} // namespace paddle
......@@ -58,6 +58,14 @@ void GetWarpCTCDsoHandle(void** dso_handle);
*/
void GetLapackDsoHandle(void** dso_handle);
/**
* @brief load the DSO of NVIDIA nccl
*
* @param **dso_handle dso handler
*
*/
void GetNCCLDsoHandle(void** dso_handle);
} // namespace dynload
} // namespace platform
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/platform/dynload/nccl.h"
namespace paddle {
namespace platform {
namespace dynload {
std::once_flag nccl_dso_flag;
void *nccl_dso_handle;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
NCCL_RAND_ROUTINE_EACH(DEFINE_WRAP);
} // namespace dynload
} // namespace platform
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <dlfcn.h>
#include <nccl.h>
#include <mutex>
#include "paddle/platform/dynload/dynamic_loader.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag nccl_dso_flag;
extern void* nccl_dso_handle;
#ifdef PADDLE_USE_DSO
#define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> decltype(__name(args...)) { \
using nccl_func = decltype(__name(args...)) (*)(Args...); \
std::call_once(nccl_dso_flag, \
paddle::platform::dynload::GetNCCLDsoHandle, \
&nccl_dso_handle); \
void* p_##__name = dlsym(nccl_dso_handle, #__name); \
return reinterpret_cast<nccl_func>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name
#else
#define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
ncclResult_t operator()(Args... args) { \
return __name(args...); \
} \
}; \
extern DynLoad__##__name __name
#endif
#define NCCL_RAND_ROUTINE_EACH(__macro) \
__macro(ncclCommInitAll); \
__macro(ncclGetUniqueId); \
__macro(ncclCommInitRank); \
__macro(ncclCommDestroy); \
__macro(ncclCommCount); \
__macro(ncclCommCuDevice); \
__macro(ncclCommUserRank); \
__macro(ncclAllReduce); \
__macro(ncclBcast); \
__macro(ncclAllGather); \
__macro(ncclReduce); \
__macro(ncclGetErrorString);
NCCL_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
} // namespace dynload
} // namespace platform
} // namespace paddle
......@@ -29,11 +29,14 @@ limitations under the License. */
#include <cxxabi.h> // for __cxa_demangle
#endif
#include <glog/logging.h>
#ifdef PADDLE_WITH_CUDA
#include "paddle/platform/dynload/cublas.h"
#include "paddle/platform/dynload/cudnn.h"
#include "paddle/platform/dynload/curand.h"
#include "paddle/platform/dynload/nccl.h"
#include <cublas_v2.h>
#include <cudnn.h>
......@@ -172,6 +175,17 @@ inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
throw std::runtime_error(err + string::Sprintf(args...));
}
template <typename... Args>
inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
ncclResult_t stat, const Args&... args) {
if (stat == ncclSuccess) {
return;
} else {
throw std::runtime_error(platform::dynload::ncclGetErrorString(stat) +
string::Sprintf(args...));
}
}
#endif // PADDLE_ONLY_CPU
template <typename T>
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "glog/logging.h"
#include "gtest/gtest.h"
#include "paddle/platform/device_context.h"
#include "paddle/platform/dynload/nccl.h"
#include "paddle/platform/enforce.h"
#include "paddle/platform/gpu_info.h"
#include <thrust/device_vector.h>
#include <memory>
#include <vector>
static int dev_count = 0;
namespace paddle {
namespace platform {
TEST(NCCL, init) {
std::vector<ncclComm_t> comms;
comms.resize(dev_count);
auto status = dynload::ncclCommInitAll(comms.data(), dev_count, nullptr);
PADDLE_ENFORCE(status);
for (int i = 0; i < dev_count; ++i) {
dynload::ncclCommDestroy(comms[i]);
}
}
template <typename T>
struct PerThreadData {
thrust::device_vector<T> send_buff;
thrust::device_vector<T> recv_buff;
CUDADeviceContext dev_ctx;
T* SendBuff() { return thrust::raw_pointer_cast(send_buff.data()); }
T* RecvBuff() { return thrust::raw_pointer_cast(recv_buff.data()); }
PerThreadData(int gpu_id, size_t size) : dev_ctx(GPUPlace(gpu_id)) {
send_buff.resize(size);
for (size_t i = 0; i < size; ++i) {
send_buff[i] = static_cast<T>(i);
}
recv_buff.resize(size);
}
};
static constexpr int ELEM_COUNT = 10000;
TEST(NCCL, all_reduce) {
std::vector<ncclComm_t> comms;
comms.resize(dev_count);
VLOG(1) << "Initializing ncclComm";
auto status = dynload::ncclCommInitAll(comms.data(), dev_count, nullptr);
PADDLE_ENFORCE(status);
VLOG(1) << "ncclComm initialized";
VLOG(1) << "Creating thread data";
std::vector<std::unique_ptr<PerThreadData<double>>> data;
data.reserve(dev_count);
for (int i = 0; i < dev_count; ++i) {
VLOG(1) << "Creating thread data for device " << i;
SetDeviceId(i);
data.emplace_back(new PerThreadData<double>(i, ELEM_COUNT));
}
VLOG(1) << "Thread data created";
VLOG(1) << "Check send_buf data";
for (int i = 0; i < dev_count; ++i) {
VLOG(1) << "Check on device " << i;
SetDeviceId(i);
thrust::host_vector<double> tmp = data[i]->send_buff;
for (size_t j = 0; j < tmp.size(); ++j) {
ASSERT_NEAR(static_cast<double>(j), tmp[j], 1e-5);
}
}
VLOG(1) << "Invoking ncclAllReduce";
for (int i = 0; i < dev_count; ++i) {
VLOG(1) << "Invoking ncclAllReduce with device " << i;
SetDeviceId(i);
PADDLE_ENFORCE(dynload::ncclAllReduce(
data[i]->SendBuff(), data[i]->RecvBuff(), ELEM_COUNT, ncclDouble,
ncclSum, comms[i], data[i]->dev_ctx.stream()));
VLOG(1) << "Invoked ncclAllReduce for device " << i;
}
VLOG(1) << "Invoked ncclAllReduce";
VLOG(1) << "Sync devices";
for (int i = 0; i < dev_count; ++i) {
VLOG(1) << "Sync device " << i;
SetDeviceId(i);
data[i]->dev_ctx.Wait();
}
VLOG(1) << "device synced";
for (int i = 0; i < dev_count; ++i) {
SetDeviceId(i);
VLOG(1) << "Checking vector on device " << i;
thrust::host_vector<double> tmp = data[i]->recv_buff;
for (size_t j = 0; j < tmp.size(); ++j) {
auto elem = static_cast<double>(j);
elem *= dev_count;
ASSERT_NEAR(tmp[j], elem, 1e-4);
}
}
for (int i = 0; i < dev_count; ++i) {
dynload::ncclCommDestroy(comms[i]);
}
}
} // namespace platform
} // namespace paddle
int main(int argc, char** argv) {
dev_count = paddle::platform::GetCUDADeviceCount();
if (dev_count <= 1) {
LOG(WARNING)
<< "Cannot test multi-gpu nccl, because the CUDA device count is "
<< dev_count;
return 0;
}
testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
......@@ -35,6 +35,7 @@ struct GPUPlace {
GPUPlace() : GPUPlace(0) {}
explicit GPUPlace(int d) : device(d) {}
inline int GetDeviceId() const { return device; }
// needed for variant equality comparison
inline bool operator==(const GPUPlace &o) const { return device == o.device; }
inline bool operator!=(const GPUPlace &o) const { return !(*this == o); }
......
......@@ -4,3 +4,5 @@ if(WITH_PYTHON)
DEPS pybind python backward proto_desc tensor_array paddle_memory executor
${GLOB_OP_LIB})
endif(WITH_PYTHON)
cc_binary(print_operators_doc SRCS print_operators_doc.cc DEPS ${GLOB_OP_LIB} tensor_array)
#include <iostream>
#include <sstream> // std::stringstream
#include <string>
#include "paddle/framework/op_info.h"
#include "paddle/framework/op_registry.h"
#include "paddle/pybind/pybind.h"
std::string Escape(const std::string& s) {
std::string r;
for (size_t i = 0; i < s.size(); i++) {
switch (s[i]) {
case '\"':
r += "\\\"";
break;
case '\\':
r += "\\\\";
break;
case '\n':
r += "\\n";
break;
case '\t':
r += "\\t";
case '\r':
break;
default:
r += s[i];
break;
}
}
return r;
}
std::string AttrType(paddle::framework::AttrType at) {
switch (at) {
case paddle::framework::INT:
return "int";
case paddle::framework::FLOAT:
return "float";
case paddle::framework::STRING:
return "string";
case paddle::framework::BOOLEAN:
return "bool";
case paddle::framework::INTS:
return "int array";
case paddle::framework::FLOATS:
return "float array";
case paddle::framework::STRINGS:
return "string array";
case paddle::framework::BOOLEANS:
return "bool array";
case paddle::framework::BLOCK:
return "block id";
}
return "UNKNOWN"; // not possible
}
void PrintVar(const paddle::framework::OpProto::Var& v, std::stringstream& ss) {
ss << " { "
<< "\n"
<< " \"name\" : \"" << Escape(v.name()) << "\",\n"
<< " \"comment\" : \"" << Escape(v.comment()) << "\",\n"
<< " \"duplicable\" : " << v.duplicable() << ",\n"
<< " \"intermediate\" : " << v.intermediate() << "\n"
<< " },";
}
void PrintAttr(const paddle::framework::OpProto::Attr& a,
std::stringstream& ss) {
ss << " { "
<< "\n"
<< " \"name\" : \"" << Escape(a.name()) << "\",\n"
<< " \"type\" : \"" << AttrType(a.type()) << "\",\n"
<< " \"comment\" : \"" << Escape(a.comment()) << "\",\n"
<< " \"generated\" : " << a.generated() << "\n"
<< " },";
}
void PrintOpProto(const std::string& type,
const paddle::framework::OpInfo& opinfo,
std::stringstream& ss) {
std::cerr << "Processing " << type << "\n";
const paddle::framework::OpProto* p = opinfo.proto_;
if (p == nullptr) {
return; // It is possible that an operator doesn't have OpProto.
}
ss << "{\n"
<< " \"type\" : \"" << Escape(p->type()) << "\",\n"
<< " \"comment\" : \"" << Escape(p->comment()) << "\",\n";
ss << " \"inputs\" : [ "
<< "\n";
for (int i = 0; i < p->inputs_size(); i++) {
PrintVar(p->inputs(i), ss);
}
ss.seekp(-1, ss.cur); // remove the trailing comma
ss << " ], "
<< "\n";
ss << " \"outputs\" : [ "
<< "\n";
for (int i = 0; i < p->outputs_size(); i++) {
PrintVar(p->outputs(i), ss);
}
ss.seekp(-1, ss.cur); // remove the trailing comma
ss << " ], "
<< "\n";
ss << " \"attrs\" : [ "
<< "\n";
for (int i = 0; i < p->attrs_size(); i++) {
PrintAttr(p->attrs(i), ss);
}
ss.seekp(-1, ss.cur); // remove the trailing comma
ss << " ] "
<< "\n";
ss << "},";
}
int main() {
std::stringstream ss;
ss << "[\n";
for (auto& iter : paddle::framework::OpInfoMap::Instance().map()) {
PrintOpProto(iter.first, iter.second, ss);
}
ss.seekp(-1, ss.cur); // remove the trailing comma
ss << "]\n";
std::cout << ss.str();
}
......@@ -257,6 +257,7 @@ void BindOpDesc(py::module &m) {
.def("block_attr", &OpDescBind::GetBlockAttr)
.def("check_attrs", &OpDescBind::CheckAttrs)
.def("infer_shape", &OpDescBind::InferShape)
.def("infer_var_type", &OpDescBind::InferVarType)
.def("serialize_to_string", [](OpDescBind &op_desc) -> py::bytes {
const OpDesc *desc = op_desc.Proto();
PADDLE_ENFORCE(desc->IsInitialized(),
......
......@@ -225,15 +225,16 @@ All parameter, weight, gradient are variables in Paddle.
//! Python str. If you want a str object, you should cast them in Python.
m.def("get_all_op_protos", []() -> std::vector<py::bytes> {
std::vector<py::bytes> ret_values;
OpInfoMap::Instance().IterAllInfo([&ret_values](const std::string &type,
const OpInfo &info) {
if (!info.HasOpProtoAndChecker()) return;
for (auto &iter : OpInfoMap::Instance().map()) {
auto &info = iter.second;
if (info.HasOpProtoAndChecker()) {
std::string str;
PADDLE_ENFORCE(info.Proto().SerializeToString(&str),
PADDLE_ENFORCE(
info.Proto().SerializeToString(&str),
"Serialize OpProto Error. This could be a bug of Paddle.");
ret_values.emplace_back(str);
});
}
}
return ret_values;
});
m.def_submodule(
......
......@@ -17,7 +17,7 @@ from paddle.trainer_config_helpers import *
################################### Data Configuration ###################################
TrainData(ProtoData(files = "trainer/tests/mnist.list"))
################################### Algorithm Configuration ###################################
settings(batch_size = 256,
settings(batch_size = 128,
learning_method = MomentumOptimizer(momentum=0.5, sparse=False))
################################### Network Configuration ###################################
data = data_layer(name ="input", size=784)
......@@ -44,10 +44,11 @@ a2 = img_conv_layer(input=tmp,
shared_biases=True,
act=ReluActivation())
tmp = concat_layer(input=[a1, a2])
tmp = addto_layer(input=[a1, a2],
act=ReluActivation(),
bias_attr=False)
tmp = img_pool_layer(input=tmp,
num_channels=64,
pool_size=3,
stride=2,
padding=1,
......@@ -55,35 +56,34 @@ tmp = img_pool_layer(input=tmp,
b1 = img_conv_layer(input=tmp,
filter_size=3,
num_filters=64,
num_filters=32,
padding=1,
shared_biases=True,
act=ReluActivation())
b1 = img_pool_layer(input=b1,
pool_size=3,
stride=1,
padding=1,
stride=2,
padding=0,
pool_type=MaxPooling())
b2 = img_conv_layer(input=tmp,
filter_size=5,
filter_size=3,
num_filters=64,
padding=2,
padding=1,
shared_biases=True,
act=ReluActivation())
b2 = img_pool_layer(input=b2,
pool_size=5,
stride=1,
padding=2,
stride=2,
padding=1,
pool_type=MaxPooling())
tmp = addto_layer(input=[b1, b2],
act=ReluActivation(),
bias_attr=False)
tmp = concat_layer(input=[b1, b2])
tmp = img_pool_layer(input=tmp,
num_channels=96,
pool_size=3,
stride=2,
padding=1,
......
......@@ -17,7 +17,7 @@ from paddle.trainer_config_helpers import *
################################### Data Configuration ###################################
TrainData(ProtoData(files = "trainer/tests/mnist.list"))
################################### Algorithm Configuration ###################################
settings(batch_size = 1000,
settings(batch_size = 128,
learning_method = MomentumOptimizer(momentum=0.5, sparse=False))
################################### Network Configuration ###################################
data = data_layer(name ="input", size=784)
......
......@@ -175,7 +175,7 @@ def index_slot(value_range, seq_type=SequenceType.NO_SEQUENCE):
dense_vector = dense_slot
sparse_binary_vector = sparse_non_value_slot
sparse_vector = sparse_value_slot
sparse_float_vector = sparse_value_slot
integer_value = index_slot
# dense_array can be used for variable-length input feature.
......@@ -216,7 +216,7 @@ def sparse_binary_vector_sub_sequence(dim):
return sparse_binary_vector(dim, seq_type=SequenceType.SUB_SEQUENCE)
def sparse_vector_sequence(dim):
def sparse_float_vector_sequence(dim):
"""
Data type of a sequence of sparse vector, which most elements are zero,
others could be any float value.
......@@ -226,11 +226,11 @@ def sparse_vector_sequence(dim):
:return: An input type object
:rtype: InputType
"""
return sparse_vector(dim, seq_type=SequenceType.SEQUENCE)
return sparse_float_vector(dim, seq_type=SequenceType.SEQUENCE)
def sparse_vector_sub_sequence(dim):
return sparse_vector(dim, seq_type=SequenceType.SUB_SEQUENCE)
def sparse_float_vector_sub_sequence(dim):
return sparse_float_vector(dim, seq_type=SequenceType.SUB_SEQUENCE)
def integer_value_sequence(value_range):
......
......@@ -1457,11 +1457,13 @@ def dot_product_attention(encoded_sequence,
expanded = expand_layer(
input=transformed_state,
expanded_as=encoded_sequence,
expand_as=encoded_sequence,
name='%s_expand' % name)
m = linear_comb_layer(
weights=expanded, vectors=encoded_sequence, name='%s_dot-product')
weights=expanded,
vectors=encoded_sequence,
name='%s_dot-product' % name)
attention_weight = fc_layer(
input=m,
......
......@@ -53,8 +53,8 @@ class Variable(object):
if is_new_var:
self.desc.set_data_type(dtype)
else:
old_dtype = self.data_type()
if dtype != old_shape:
old_dtype = self.data_type
if dtype != old_dtype:
raise ValueError("Variable {0} has been created before. "
"The previous data type is {1}; the new "
"data type is {2}. They are not "
......@@ -113,6 +113,10 @@ class Variable(object):
def lod_level(self):
return self.desc.lod_level()
@property
def type(self):
return self.desc.type()
@staticmethod
def _unique_var_name_():
uid = core.unique_integer() # unique during whole process.
......@@ -192,31 +196,32 @@ class Operator(object):
self.desc.set_type(type)
proto = OpProtoHolder.instance().get_op_proto(type)
if inputs is not None:
given = set()
need = set()
for n in inputs:
given.add(n)
for m in proto.inputs:
need.add(m.name)
if not given == need:
raise ValueError(
"Incorrect setting for input(s) of operator \"%s\". Need: [%s] Given: [%s]"
% (type, ", ".join(str(e) for e in need), ", ".join(
str(e) for e in given)))
def find_name(var_list, name):
for var_name in var_list:
if var_name == name:
return True
return False
if inputs is not None:
for in_proto in proto.inputs:
found = find_name(inputs, in_proto.name)
assert found or in_proto.dispensable, "Input {} not found".format(
in_proto.name)
if found:
in_argus = inputs[in_proto.name]
if not isinstance(in_argus, list):
in_argus = [in_argus]
if not in_proto.duplicable and len(in_argus) > 1:
raise ValueError(
"Input %s expects only one input, but %d are given." %
(in_proto.name, len(in_argus)))
"Input %s expects only one input, but %d are given."
% (in_proto.name, len(in_argus)))
in_argu_names = []
for argu in in_argus:
in_argu_names.append(argu.name)
self.desc.set_input(in_proto.name, in_argu_names)
else:
self.desc.set_input(in_proto.name, [])
if outputs is not None:
given = set()
......@@ -250,13 +255,14 @@ class Operator(object):
attr_name = attr.name
if (not attr_name in attrs) or (attrs[attr_name] is None):
continue
if not isinstance(attrs[attr_name], Block):
self.desc.set_attr(attr_name, attrs[attr_name])
else:
if isinstance(attrs[attr_name], Block):
self.desc.set_block_attr(attr_name, attrs[attr_name].desc)
else:
self.desc.set_attr(attr_name, attrs[attr_name])
self.desc.check_attrs()
if type not in {'feed', 'fetch'}:
self.desc.infer_var_type(self.block.desc)
self.desc.infer_shape(self.block.desc)
def __str__(self):
......
from paddle.v2.framework.framework import Variable, OpProtoHolder, g_program, g_init_program
import paddle.v2.framework.core as core
import copy
import itertools
import paddle.v2.framework.core as core
from paddle.v2.framework.framework import Variable, g_program, \
g_init_program
def unique_name(prefix):
uid = core.unique_integer() # unique during whole process.
......@@ -120,10 +123,7 @@ class LayerHelper(object):
if attr['name'] is None:
attr['name'] = unique_name(".".join([self.name, suffix]))
self.init_program.global_block().create_parameter(
name=attr['name'],
dtype=dtype,
shape=shape,
init_attr=attr['init_attr'])
dtype=dtype, shape=shape, **attr)
return self.program.global_block().create_parameter(
name=attr['name'], dtype=dtype, shape=shape)
......@@ -133,6 +133,9 @@ class LayerHelper(object):
dtype=dtype,
persistable=False)
def create_variable(self, *args, **kwargs):
return self.program.current_block().create_var(*args, **kwargs)
def create_global_variable(self, *args, **kwargs):
return self.program.global_block().create_var(
*args, persistable=False, **kwargs)
......
from paddle.v2.framework.layer_helper import LayerHelper
from paddle.v2.framework.layer_helper import LayerHelper, unique_name
import paddle.v2.framework.core as core
from paddle.v2.framework.framework import OpProtoHolder, Variable
from paddle.v2.framework.framework import OpProtoHolder, Variable, Program
import re
__all__ = ['fc', 'data', 'cross_entropy', 'conv2d', 'pool2d']
__all__ = [
'fc', 'data', 'cross_entropy', 'conv2d', 'pool2d', 'embedding', 'concat',
'StaticRNN'
]
def fc(input,
......@@ -24,7 +27,9 @@ def fc(input,
mul_results = []
for input_var, param_attr in helper.iter_inputs_and_params():
input_shape = input_var.shape
param_shape = list(input_shape[num_flatten_dims:]) + [size]
param_shape = [
reduce(lambda a, b: a * b, input_shape[num_flatten_dims:], 1)
] + [size]
w = helper.create_parameter(
attr=param_attr, shape=param_shape, dtype=dtype)
......@@ -36,10 +41,8 @@ def fc(input,
"Y": w,
},
outputs={"Out": tmp},
attrs={
'x_num_col_dims': num_flatten_dims,
'y_num_col_dims': len(input_shape) - num_flatten_dims
})
attrs={'x_num_col_dims': num_flatten_dims,
'y_num_col_dims': 1})
mul_results.append(tmp)
# sum
......@@ -55,6 +58,24 @@ def fc(input,
return helper.append_activation(pre_activation)
def embedding(input,
size,
data_type='float32',
param_attr=None,
program=None,
init_program=None):
helper = LayerHelper('embedding', **locals())
w = helper.create_parameter(
attr=helper.param_attr, shape=size, dtype=data_type)
tmp = helper.create_tmp_variable(data_type)
helper.append_op(
type='lookup_table',
inputs={'Ids': input,
'W': w},
outputs={'Out': tmp})
return tmp
def data(name,
shape,
data_type='float32',
......@@ -122,6 +143,19 @@ _create_op_func_('mean')
_create_op_func_('mul')
def concat(input, axis, program=None, init_program=None):
helper = LayerHelper('concat', **locals())
if not isinstance(input, list) and not isinstance(input, tuple):
input = [input]
out = helper.create_tmp_variable(dtype=input[0].data_type)
helper.append_op(
type='concat',
inputs={'X': input},
outputs={'Out': [out]},
attrs={'axis': axis})
return out
def cross_entropy(input, label, **kwargs):
helper = LayerHelper('cross_entropy', **kwargs)
out = helper.create_tmp_variable(dtype=input.data_type)
......@@ -240,3 +274,170 @@ def pool2d(input,
})
return pool_out
class BlockGuard(object):
"""
BlockGuard used to create sub-block in program by using Python `with`
keyword.
"""
def __init__(self, program):
if not isinstance(program, Program):
raise TypeError("BlockGuard takes a program")
self.program = program
def __enter__(self):
self.program.create_block()
def __exit__(self, exc_type, exc_val, exc_tb):
self.program.rollback()
if exc_type is not None:
return False # re-raise exception
return True
class StaticRNNGuard(BlockGuard):
def __init__(self, rnn):
if not isinstance(rnn, StaticRNN):
raise TypeError("StaticRNNGuard takes an StaticRNN")
super(StaticRNNGuard, self).__init__(rnn.helper.program)
self.rnn = rnn
def __enter__(self):
self.rnn.status = StaticRNN.IN_RNN_BLOCK
return super(StaticRNNGuard, self).__enter__()
def __exit__(self, exc_type, exc_val, exc_tb):
self.rnn.status = StaticRNN.AFTER_RNN_BLOCK
self.rnn.complete_rnn_op()
return super(StaticRNNGuard, self).__exit__(exc_type, exc_val, exc_tb)
class StaticRNNMemoryLink(object):
"""
:param init: the initial variable for Memory
:type init: Variable
:param pre_mem: the memory variable in previous time step
:type pre_mem: Variable
:param mem: the memory variable in current time step
:type mem: Variable
"""
def __init__(self, init, pre_mem, mem=None):
self.init = init
self.pre_mem = pre_mem
self.mem = mem
class StaticRNN(object):
BEFORE_RNN_BLOCK = 0
IN_RNN_BLOCK = 1
AFTER_RNN_BLOCK = 2
def __init__(self, name=None, program=None):
self.helper = LayerHelper("static_rnn", name=name, program=program)
self.memories = {} # memory map, from pre_mem.name --> MemoryLink
self.inputs = [] # input variable list in current block
self.outputs = [] # output variable list in parent block
self.status = StaticRNN.BEFORE_RNN_BLOCK # status flag.
# sequence length, since it is a static RNN, sequence length are fixed.
self.seq_len = None
def step(self):
return StaticRNNGuard(self)
def _assert_in_rnn_block_(self, method):
if self.status != StaticRNN.IN_RNN_BLOCK:
raise ValueError("You must invoke {0} in rnn block".format(method))
def memory(self, init=None, shape=None, dtype=None, init_value=0):
self._assert_in_rnn_block_('memory')
if init is None:
if shape is None or dtype is None:
raise ValueError(
"if init is None, memory at least need shape and dtype")
parent_block = self.parent_block()
var_name = unique_name("@".join([self.helper.name, "memory_boot"]))
boot_var = parent_block.create_var(
name=var_name, shape=shape, dtype=dtype, persistable=False)
parent_block.append_op(
type="fill_constant",
inputs={},
outputs={'Out': [boot_var]},
attrs={
'value': init_value,
'shape': boot_var.shape,
'data_type': boot_var.data_type
})
return self.memory(init=boot_var)
else:
pre_mem = self.helper.create_variable(
name=unique_name("@".join([self.helper.name, "mem"])),
dtype=init.data_type,
shape=init.shape)
self.memories[pre_mem.name] = StaticRNNMemoryLink(
init=init, pre_mem=pre_mem)
return pre_mem
def step_input(self, x):
self._assert_in_rnn_block_('step_input')
if not isinstance(x, Variable):
raise TypeError("step input takes a Variable")
if self.seq_len is None:
self.seq_len = x.shape[1]
elif self.seq_len != x.shape[1]:
raise ValueError("Static RNN only take fix seq_len input")
ipt = self.helper.create_variable(
name=x.name,
dtype=x.data_type,
shape=[-1] + list(x.shape[2:]),
type=x.type)
self.inputs.append(ipt)
return ipt
def step_output(self, o):
self._assert_in_rnn_block_('step_output')
if not isinstance(o, Variable):
raise TypeError("step output takes a Variable")
out_var = self.parent_block().create_var(
name=o.name,
shape=[-1, self.seq_len] + list(o.shape[1:]),
dtype=o.data_type)
self.outputs.append(out_var)
def output(self, *outputs):
for each in outputs:
self.step_output(each)
def update_memory(self, mem, var):
if not isinstance(mem, Variable) or not isinstance(var, Variable):
raise TypeError("update memory should take variables")
self.memories[mem.name].mem = var
def parent_block(self):
prog = self.helper.program
parent_idx = prog.current_block().parent_idx
assert parent_idx >= 0
parent_block = prog.block(parent_idx)
return parent_block
def __call__(self, *args, **kwargs):
if self.status != StaticRNN.AFTER_RNN_BLOCK:
raise ValueError("RNN output can only be retrieved after rnn block")
if len(self.outputs) == 0:
raise ValueError("RNN has no output")
elif len(self.outputs) == 1:
return self.outputs[0]
else:
return self.outputs
def complete_rnn_op(self):
# TODO(yuyang18): Create RNN Op here.
# Implement this method after RNN op complete.
pass
......@@ -4,6 +4,8 @@ import random
import itertools
import paddle.v2.framework.core as core
from paddle.v2.framework.op import Operator
from paddle.v2.framework.executor import Executor
from paddle.v2.framework.framework import Program, OpProtoHolder
def grad_var_name(var_name):
......@@ -177,7 +179,12 @@ def get_backward_op(scope, op, no_grad_set):
return backward_op
def get_gradient(scope, op, inputs, outputs, grad_name, place,
def get_gradient(scope,
op,
inputs,
outputs,
grad_names,
place,
no_grad_set=None):
ctx = core.DeviceContext.create(place)
......@@ -193,8 +200,52 @@ def get_gradient(scope, op, inputs, outputs, grad_name, place,
backward_op.run(scope, ctx)
out = np.array(scope.find_var(grad_name).get_tensor())
return out
return [
np.array(scope.find_var(grad_name).get_tensor())
for grad_name in grad_names
]
def append_input_output(block, op_proto, np_list, is_input):
'''Insert VarDesc and generate Python variable instance'''
proto_list = op_proto.inputs if is_input else op_proto.outputs
def create_var(block, name, np_list, var_proto):
if name not in np_list:
assert var_proto.intermediate, "{} not found".format(name)
shape = None
lod_level = None
else:
np_value = np_list[name]
if isinstance(np_value, tuple):
shape = list(np_value[0].shape)
lod_level = len(np_value[1])
else:
shape = list(np_value.shape)
lod_level = 0
return block.create_var(
dtype="float32", shape=shape, lod_level=lod_level, name=name)
var_dict = {}
for var_proto in proto_list:
var_name = str(var_proto.name)
if is_input:
if (var_name not in np_list) and var_proto.dispensable:
continue
assert (var_name in np_list) or (var_proto.dispensable), \
"Missing {} as input".format(var_name)
if var_proto.duplicable:
assert isinstance(np_list[var_name], list), \
"Duplicable {} should be set as list".format(var_name)
var_list = []
for (name, np_value) in np_list[var_name]:
var_list.append(
create_var(block, name, {name: np_value}, var_proto))
var_dict[var_name] = var_list
else:
var_dict[var_name] = create_var(block, var_name, np_list, var_proto)
return var_dict
class OpTest(unittest.TestCase):
......@@ -213,48 +264,93 @@ class OpTest(unittest.TestCase):
np.random.set_state(cls._np_rand_state)
random.setstate(cls._py_rand_state)
def feed_var(self, input_vars, place):
feed_map = {}
for var_name in input_vars:
if isinstance(input_vars[var_name], list):
for name, np_value in self.inputs[var_name]:
tensor = core.LoDTensor()
tensor.set(np_value, place)
feed_map[name] = tensor
else:
tensor = core.LoDTensor()
if isinstance(self.inputs[var_name], tuple):
tensor.set(self.inputs[var_name][0], place)
tensor.set_lod(self.inputs[var_name][1])
else:
tensor.set(self.inputs[var_name], place)
feed_map[var_name] = tensor
return feed_map
def check_output_with_place(self, place, atol):
self.scope = core.Scope()
op_inputs = self.inputs if hasattr(self, "inputs") else dict()
op_outputs = self.outputs if hasattr(self, "outputs") else dict()
op_attrs = self.attrs if hasattr(self, "attrs") else dict()
self.op = create_op(self.scope, self.op_type, op_inputs, op_outputs,
op_attrs)
if isinstance(place, core.GPUPlace) and not self.op.support_gpu():
return
set_input(self.scope, self.op, self.inputs, place)
ctx = core.DeviceContext.create(place)
self.op.run(self.scope, ctx)
op_proto = OpProtoHolder.instance().get_op_proto(self.op_type)
program = Program()
block = program.global_block()
inputs = append_input_output(block, op_proto, self.inputs, True)
outputs = append_input_output(block, op_proto, self.outputs, False)
op = block.append_op(
type=self.op_type,
inputs=inputs,
outputs=outputs,
attrs=self.attrs if hasattr(self, "attrs") else dict())
fetch_list = []
for var_name, var in outputs.iteritems():
if var_name in self.outputs:
if isinstance(var, list):
for v in var:
fetch_list.append(v)
else:
fetch_list.append(var)
feed_map = self.feed_var(inputs, place)
for out_name, out_dup in Operator.get_op_outputs(self.op.type()):
exe = Executor(place)
outs = exe.run(program, feed=feed_map, fetch_list=fetch_list)
for out_name, out_dup in Operator.get_op_outputs(self.op_type):
if out_name not in self.outputs:
continue
def find_actual(target_name, fetch_list):
found = [
i for i, var in enumerate(fetch_list)
if var.name == target_name
]
self.assertTrue(
len(found) == 1, "Found {} {}".format(
len(found), target_name))
return found[0]
if out_dup:
sub_out = self.outputs[out_name]
if not isinstance(sub_out, list):
raise AssertionError("sub_out type %s is not list",
type(sub_out))
for sub_out_name, expect in sub_out:
actual = np.array(
self.scope.find_var(sub_out_name).get_tensor())
idx = find_actual(sub_out_name, fetch_list)
actual = outs[idx]
self.assertTrue(
np.allclose(
actual, expect, atol=atol),
"output name: " + out_name + " has diff.")
"Output (" + sub_out_name + ") has diff at " +
str(place))
else:
actual = np.array(self.scope.find_var(out_name).get_tensor())
idx = find_actual(out_name, fetch_list)
actual = outs[idx]
expect = self.outputs[out_name]
self.assertTrue(
np.allclose(
actual, expect, atol=atol),
"output name: " + out_name + " has diff.")
"Output (" + out_name + ") has diff at " + str(place))
def check_output(self, atol=1e-5):
places = [core.CPUPlace()]
if core.is_compile_gpu():
if core.is_compile_gpu() and core.op_support_gpu(self.op_type):
places.append(core.GPUPlace(0))
for place in places:
self.check_output_with_place(place, atol)
......@@ -310,11 +406,9 @@ class OpTest(unittest.TestCase):
]
cpu_place = core.CPUPlace()
cpu_analytic_grads = [
get_gradient(self.scope, self.op, self.inputs, self.outputs,
grad_name, cpu_place, no_grad_set)
for grad_name in grad_names
]
cpu_analytic_grads = get_gradient(self.scope, self.op, self.inputs,
self.outputs, grad_names, cpu_place,
no_grad_set)
self.__assert_is_close(numeric_grads, cpu_analytic_grads, grad_names,
max_relative_error,
......@@ -322,11 +416,9 @@ class OpTest(unittest.TestCase):
if core.is_compile_gpu() and self.op.support_gpu():
gpu_place = core.GPUPlace(0)
gpu_analytic_grads = [
get_gradient(self.scope, self.op, self.inputs, self.outputs,
grad_name, gpu_place, no_grad_set)
for grad_name in grad_names
]
gpu_analytic_grads = get_gradient(self.scope, self.op, self.inputs,
self.outputs, grad_names,
gpu_place, no_grad_set)
self.__assert_is_close(numeric_grads, gpu_analytic_grads,
grad_names, max_relative_error,
......
......@@ -16,7 +16,9 @@ class TestAccuracyOp(OpTest):
if ele == label[rowid]:
num_correct += 1
break
self.outputs = {'Accuracy': [num_correct / float(n)]}
self.outputs = {
'Accuracy': np.array([num_correct / float(n)]).astype("float32")
}
def test_check_output(self):
self.check_output()
......
......@@ -172,8 +172,8 @@ class TestBRelu(OpTest):
def setUp(self):
self.op_type = "brelu"
x = np.random.uniform(-1, 1, [4, 4]).astype("float32")
t_min = 1
t_max = 4
t_min = 1.0
t_max = 4.0
# The same with TestAbs
x[np.abs(x - t_min) < 0.005] = t_min + 0.02
x[np.abs(x - t_max) < 0.005] = t_max + 0.02
......@@ -218,7 +218,7 @@ class TestSoftRelu(OpTest):
def setUp(self):
self.op_type = "soft_relu"
x = np.random.uniform(-3, 3, [4, 4]).astype("float32")
threshold = 2
threshold = 2.0
# The same reason with TestAbs
x[np.abs(x - threshold) < 0.005] = threshold + 0.02
x[np.abs(x + threshold) < 0.005] = -threshold + 0.02
......@@ -303,7 +303,7 @@ class TestPow(OpTest):
def setUp(self):
self.op_type = "pow"
self.inputs = {'X': np.random.uniform(1, 2, [11, 17]).astype("float32")}
self.attrs = {'factor': 3}
self.attrs = {'factor': 3.0}
self.outputs = {'Y': np.power(self.inputs['X'], 3)}
def test_check_output(self):
......
......@@ -37,14 +37,14 @@ class TestCase1(TestClipOp):
def initTestCase(self):
self.shape = (8, 16, 8)
self.max = 0.7
self.min = 0
self.min = 0.0
class TestCase2(TestClipOp):
def initTestCase(self):
self.shape = (8, 16)
self.max = 1
self.min = 0
self.max = 1.0
self.min = 0.0
class TestCase3(TestClipOp):
......
import unittest
import numpy as np
from op_test import OpTest
def conv2dtranspose_forward_naive(input_, filter_, conv2dtranspose_param):
# [2, 3, 5, 5]
in_n, in_c, in_h, in_w = input_.shape
# [3, 6, 3, 3]
f_c, out_c, f_h, f_w = filter_.shape
assert in_c == f_c
stride, pad = conv2dtranspose_param['stride'], conv2dtranspose_param['pad']
out_h = (in_h - 1) * stride[0] + f_h
out_w = (in_w - 1) * stride[1] + f_w
out = np.zeros((in_n, out_c, out_h, out_w))
for n in range(in_n):
for i in range(in_h):
for j in range(in_w):
input_masked = input_[n, :, i, j] # (c)
input_masked = np.reshape(input_masked, (in_c, 1, 1))
input_masked = np.tile(input_masked, (1, f_h, f_w))
for k in range(out_c):
tmp_out = np.sum(input_masked * filter_[:, k, :, :], axis=0)
i1, i2 = i * stride[0], i * stride[0] + f_h
j1, j2 = j * stride[0], j * stride[0] + f_w
out[n, k, i1:i2, j1:j2] += tmp_out
return out
class TestConv2dTransposeOp(OpTest):
def setUp(self):
# init as conv transpose
self.init_op_type()
# [2, 3, 5, 5] -> kernel [3, 6, 3, 3] -> output [2, 6, 7, 7]
self.init_test_case()
conv2dtranspose_param = {'stride': self.stride, 'pad': self.pad}
input_ = np.random.random(self.input_size).astype("float32")
filter_ = np.random.random(self.filter_size).astype("float32")
output = conv2dtranspose_forward_naive(input_, filter_,
conv2dtranspose_param)
# print 'deconv output py', output, output.shape
self.inputs = {'Input': input_, 'Filter': filter_}
self.attrs = {
'strides': self.stride,
'paddings': self.pad,
# 'dilations': self.dilations
}
self.outputs = {'Output': output}
def test_check_output(self):
print 'check output here'
self.check_output()
def test_check_grad(self):
self.check_grad(
set(['Input', 'Filter']), 'Output', max_relative_error=0.05)
def test_check_grad_no_filter(self):
self.check_grad(
['Input'],
'Output',
max_relative_error=0.05,
no_grad_set=set(['Filter']))
def test_check_grad_no_input(self):
self.check_grad(
['Filter'],
'Output',
max_relative_error=0.05,
no_grad_set=set(['Input']))
def init_test_case(self):
self.pad = [0, 0]
self.stride = [1, 1]
self.dilations = [1, 1]
self.input_size = [2, 3, 5, 5] # NCHW
f_c = self.input_size[1]
self.filter_size = [f_c, 6, 3, 3]
def init_op_type(self):
self.op_type = "conv2dtranspose"
"""
class TestCudnn(TestConv2dOp):
def init_group(self):
self.groups = 1
def init_op_type(self):
self.op_type = "conv_cudnn"
"""
if __name__ == '__main__':
unittest.main()
import unittest
import numpy as np
from op_test import OpTest
class TestFCOp1(OpTest):
def setUp(self):
x0 = np.random.random((16, 32)).astype("float32")
w0 = np.random.random((32, 10)).astype("float32")
mul_out0 = np.dot(x0, w0)
identity_out = mul_out0
self.op_type = "fc"
self.inputs = {"X": [("X0", x0)], "W": [("W0", w0)]}
self.outputs = {"MulOut": [("MulOut0", mul_out0)], "Out": identity_out}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(["X0", "W0"], "Out", max_relative_error=0.01)
class TestFCOp2(OpTest):
def setUp(self):
x0 = np.random.random((16, 4, 8)).astype("float32")
x1 = np.random.random((4, 4, 32)).astype("float32")
w0 = np.random.random((32, 10)).astype("float32")
w1 = np.random.random((32, 10)).astype("float32")
b = np.random.random(10).astype("float32")
mul_out0 = np.dot(x0.reshape(16, 4 * 8), w0)
mul_out1 = np.dot(x1.reshape(4 * 4, 32), w1)
sum_out = mul_out0 + mul_out1
add_out = np.add(sum_out, b)
sigmoid_out = 1 / (1 + np.exp(-add_out))
self.op_type = "fc"
self.inputs = {
"X": [("X0", x0), ("X1", x1)],
"W": [("W0", w0), ("W1", w1)],
"B": b
}
self.attrs = {"xNumColDims": [1, 2], "activation": "sigmoid"}
self.outputs = {
"MulOut": [("MulOut0", mul_out0), ("MulOut1", mul_out1)],
"SumOut": sum_out,
"AddOut": add_out,
"Out": sigmoid_out
}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(
["X0", "X1", "W0", "W1", "B"], "Out", max_relative_error=0.01)
if __name__ == '__main__':
unittest.main()
import unittest
import numpy as np
from op_test import OpTest
class TestIdentityOp(OpTest):
def setUp(self):
self.op_type = "identity"
self.inputs = {'X': np.random.random((10, 10)).astype("float32")}
self.outputs = {'Y': self.inputs['X']}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(['X'], 'Y')
if __name__ == "__main__":
unittest.main()
import unittest
import numpy as np
from op_test import OpTest
class TestInterpOp(OpTest):
def setUp(self):
self.op_type = "interp"
x = np.random.random((2, 3)).astype("float32")
y = np.random.random((2, 3)).astype("float32")
w = np.random.random(2).astype("float32")
sub_out = x - y
mul_out = sub_out * w.reshape(2, 1)
out = mul_out + y
self.inputs = {'X': x, 'Y': y, 'W': w}
self.outputs = {'Out': out, 'SubOut': sub_out, 'MulOut': mul_out}
def test_check_output(self):
self.check_output()
def test_check_grad_normal(self):
self.check_grad(['X', 'Y'], 'Out')
if __name__ == "__main__":
unittest.main()
......@@ -88,6 +88,77 @@ class TestBook(unittest.TestCase):
print str(program)
def test_word_embedding(self):
program = Program()
dict_size = 10000
embed_size = 32
first_word = layers.data(
name='firstw', shape=[1], data_type='int32', program=program)
second_word = layers.data(
name='secondw', shape=[1], data_type='int32', program=program)
third_word = layers.data(
name='thirdw', shape=[1], data_type='int32', program=program)
forth_word = layers.data(
name='forthw', shape=[1], data_type='int32', program=program)
next_word = layers.data(
name='nextw', shape=[1], data_type='int32', program=program)
embed_param_attr_1 = {
'name': 'shared_w',
'init_attr': {
'max': 1.0,
'type': 'uniform_random',
'min': -1.0
}
}
embed_param_attr_2 = {'name': 'shared_w'}
embed_first = layers.embedding(
input=first_word,
size=[dict_size, embed_size],
data_type='float32',
param_attr=embed_param_attr_1,
program=program)
embed_second = layers.embedding(
input=second_word,
size=[dict_size, embed_size],
data_type='float32',
param_attr=embed_param_attr_2,
program=program)
embed_third = layers.embedding(
input=third_word,
size=[dict_size, embed_size],
data_type='float32',
param_attr=embed_param_attr_2,
program=program)
embed_forth = layers.embedding(
input=forth_word,
size=[dict_size, embed_size],
data_type='float32',
param_attr=embed_param_attr_2,
program=program)
concat_embed = layers.concat(
input=[embed_first, embed_second, embed_third, embed_forth],
axis=1,
program=program)
hidden1 = layers.fc(input=concat_embed,
size=256,
act='sigmoid',
program=program)
predict_word = layers.fc(input=hidden1,
size=dict_size,
act='softmax',
program=program)
cost = layers.cross_entropy(
input=predict_word, label=next_word, program=program)
avg_cost = layers.mean(x=cost, program=program)
self.assertIsNotNone(avg_cost)
print str(program)
if __name__ == '__main__':
unittest.main()
......@@ -8,7 +8,8 @@ class TestLookupTableOp(OpTest):
self.op_type = "lookup_table"
table = np.random.random((17, 31)).astype("float32")
ids = np.random.randint(0, 17, 4).astype("int32")
self.inputs = {'W': table, 'Ids': ids}
ids_expand = np.expand_dims(ids, axis=1)
self.inputs = {'W': table, 'Ids': ids_expand}
self.outputs = {'Out': table[ids]}
def test_check_output(self):
......
import unittest
import numpy as np
from op_test import OpTest
SIGMOID_THRESHOLD_MIN = -40.0
SIGMOID_THRESHOLD_MAX = 13.0
EXP_MAX_INPUT = 40.0
def identity(x):
return x
def sigmoid(x):
y = np.copy(x)
y[x < SIGMOID_THRESHOLD_MIN] = SIGMOID_THRESHOLD_MIN
y[x > SIGMOID_THRESHOLD_MAX] = SIGMOID_THRESHOLD_MAX
return 1. / (1. + np.exp(-y))
def tanh(x):
y = -2. * x
y[y > EXP_MAX_INPUT] = EXP_MAX_INPUT
return (2. / (1. + np.exp(y))) - 1.
def relu(x):
return np.maximum(x, 0)
ACTVATION = {
'identity': identity,
'sigmoid': sigmoid,
'tanh': tanh,
'relu': relu
}
def lstm(
input, # T x 4D
lod, # 1 x N
h0=None, # N x D
c0=None, # N x D
w_h=None, # D x 4D
w_b=None, # 1 x 4D
w_c=None, # 1 x 3D
is_reverse=False,
act_gate=None,
act_cell=None,
act_cand=None):
def _step(x, w_h, w_c, h_pre, c_pre, act_gate, act_cell, act_cand):
g = np.dot(h_pre, w_h) # 1 x 4D
g = g + x
g = np.reshape(g, (1, g.size))
c_tmp, g_i, g_f, g_o = np.split(g, 4, axis=1)
if w_c is None:
g_i = act_gate(g_i) # 1 x D
g_f = act_gate(g_f) # 1 x D
else:
w_ic, w_fc, w_oc = np.split(w_c, 3, axis=1)
g_i = act_gate(g_i + w_ic * c_pre) # 1 x D
g_f = act_gate(g_f + w_fc * c_pre) # 1 x D
c = g_f * c_pre + g_i * act_cand(c_tmp) # 1 x D
if w_c is None:
g_o = act_gate(g_o) # 1 x D
else:
_, _, w_oc = np.split(w_c, 3, axis=1)
g_o = act_gate(g_o + w_oc * c) # 1 x D
h = g_o * act_cell(c)
bg = np.concatenate((act_cand(c_tmp), g_i, g_f, g_o), axis=1)
return h, c, bg
def _reverse(x, lod):
y = np.zeros_like(x)
for i in range(len(lod) - 1):
b, e = lod[i], lod[i + 1]
y[b:e, :] = np.flip(x[b:e, :], 0)
return y
offset = lod[0]
batch_size = len(offset) - 1
hidden = []
cell = []
gate = []
input = _reverse(input, offset) if is_reverse else input
if w_b is not None:
input = input + np.tile(w_b, (offset[-1], 1))
for i in range(batch_size):
# compute one sequence
seq_len = offset[i + 1] - offset[i]
x = input[offset[i]:offset[i + 1], :]
h_pre = h0[i] # 1 x D
c_pre = c0[i] # 1 x D
for j in range(seq_len):
# compute one step
h_pre, c_pre, g_pre = _step(x[j], w_h, w_c, h_pre, c_pre, act_gate,
act_cell, act_cand)
hidden.append(h_pre.flatten())
cell.append(c_pre.flatten())
gate.append(g_pre.flatten())
hidden = np.array(hidden).astype("float64")
cell = np.array(cell).astype("float64")
gate = np.array(gate).astype("float64")
hidden = _reverse(hidden, offset) if is_reverse else hidden
cell = _reverse(cell, offset) if is_reverse else cell
assert gate.shape == input.shape
assert hidden.shape == (input.shape[0], input.shape[1] / 4)
assert cell.shape == (input.shape[0], input.shape[1] / 4)
return hidden, cell, gate
class TestLstmOp(OpTest):
def set_data(self):
self.lod = [[0, 2, 6, 9]]
self.D = 64
self.sort_idx = [2, 6, 0, 3, 7, 1, 4, 8, 5]
self.act_gate = "sigmoid"
self.act_cell = "tanh"
self.act_cand = "tanh"
self.is_reverse = False
def setUp(self):
self.set_data()
self.op_type = "lstm"
T = self.lod[0][-1]
N = len(self.lod[0]) - 1
x = np.random.normal(size=(T, 4 * self.D)).astype("float64")
h0 = np.zeros((N, self.D)).astype("float64")
c0 = np.zeros((N, self.D)).astype("float64")
w = np.random.normal(size=(self.D, 4 * self.D)).astype("float64")
b = np.random.normal(size=(1, 7 * self.D)).astype("float64")
w_b = b[:, 0:4 * self.D]
w_c = b[:, 4 * self.D:]
h, c, g = lstm(x, self.lod, h0, c0, w, w_b, w_c, self.is_reverse,
ACTVATION[self.act_gate], ACTVATION[self.act_cell],
ACTVATION[self.act_cand])
g_sort = np.zeros_like(x)
for i, j in enumerate(self.sort_idx):
g_sort[i, :] = g[j, :]
self.inputs = {
'Input': (x, self.lod),
'H0': h0,
'C0': c0,
'Weight': w,
'Bias': b
}
self.outputs = {'Hidden': h, 'Cell': c, 'BatchGate': g_sort}
self.attrs = {
'usePeepholes': True,
'isReverse': self.is_reverse,
'gateActivation': 'sigmoid',
'cellActivation': 'tanh',
'candidateActivation': 'tanh'
}
def test_check_output(self):
self.check_output()
class TestLstmOpRerverse(TestLstmOp):
def set_data(self):
self.lod = [[0, 2, 6, 9]]
self.D = 64
self.sort_idx = [2, 6, 0, 3, 7, 1, 4, 8, 5]
self.act_gate = "sigmoid"
self.act_cell = "tanh"
self.act_cand = "tanh"
self.is_reverse = True
if __name__ == "__main__":
unittest.main()
......@@ -35,10 +35,10 @@ class TestMulOp2(OpTest):
'Y': np.random.random((4, 30, 8, 2, 9)).astype("float32")
}
self.attrs = {'x_num_col_dims': 2, 'y_num_col_dims': 2}
self.outputs = {
'Out': np.dot(self.inputs['X'].reshape(15 * 4, 12 * 10),
result = np.dot(self.inputs['X'].reshape(15 * 4, 12 * 10),
self.inputs['Y'].reshape(4 * 30, 8 * 2 * 9))
}
result = result.reshape(15, 4, 8, 2, 9)
self.outputs = {'Out': result}
def test_check_output(self):
self.check_output()
......
......@@ -27,7 +27,7 @@ class TestPadOp(OpTest):
def initTestCase(self):
self.shape = (16, 16)
self.paddings = [(0, 1), (2, 3)]
self.pad_value = 0
self.pad_value = 0.0
class TestCase1(TestPadOp):
......@@ -41,7 +41,7 @@ class TestCase2(TestPadOp):
def initTestCase(self):
self.shape = (2, 2, 2)
self.paddings = [(0, 0), (0, 0), (1, 2)]
self.pad_value = 1
self.pad_value = 1.0
class TestCase3(TestPadOp):
......
......@@ -85,33 +85,5 @@ class Test1DReduce(OpTest):
self.check_grad(['X'], 'Out')
class TestNorm(OpTest):
def setUp(self):
# use x away from 0 to avoid errors of numerical gradient when gradient near 0
x = np.random.random((5, 6, 10)).astype("float32") + 0.2
p = 2
dim = 1
keep_dim = False
abs_out = np.absolute(x)
pow_out = np.power(x, p)
sum_out = np.sum(pow_out, axis=dim, keepdims=keep_dim)
out = np.power(sum_out, 1. / p)
self.op_type = "norm"
self.inputs = {'X': x}
self.attrs = {"p": p, "dim": dim, "keep_dim": keep_dim}
self.outputs = {
"AbsOut": abs_out,
"PowOut": pow_out,
"SumOut": sum_out,
"Out": out
}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(['X'], 'Out', max_relative_error=0.01)
if __name__ == '__main__':
unittest.main()
import unittest
from paddle.v2.framework.layers import *
from paddle.v2.framework.framework import g_program
class TestRNN(unittest.TestCase):
def test_rnn(self):
img = data(
shape=[
80, # sequence length
22, # image height
22
], # image width
data_type='float32',
name='image')
hidden = fc(input=img, size=100, act='sigmoid', num_flatten_dims=2)
self.assertEqual((-1, 80, 100), hidden.shape)
hidden = fc(input=hidden, size=100, act='sigmoid', num_flatten_dims=2)
self.assertEqual((-1, 80, 100), hidden.shape)
rnn = StaticRNN()
with rnn.step():
hidden = rnn.step_input(hidden)
self.assertEqual((-1, 100), hidden.shape)
memory = rnn.memory(shape=(-1, 32), dtype='float32', init_value=0.0)
rnn_out = fc(input=[hidden, memory], size=32, act='sigmoid')
self.assertEqual((-1, 32), rnn_out.shape)
rnn.update_memory(memory, rnn_out)
rnn.output(rnn_out)
out = rnn()
self.assertEqual((-1, 80, 32), out.shape)
print g_program
if __name__ == '__main__':
unittest.main()
import paddle.v2 as paddle
import paddle.v2.framework.layers as layers
import paddle.v2.framework.core as core
import paddle.v2.framework.optimizer as optimizer
from paddle.v2.framework.framework import Program, g_program
from paddle.v2.framework.executor import Executor
import numpy as np
init_program = Program()
program = Program()
embed_size = 32
hidden_size = 256
N = 5
batch_size = 32
word_dict = paddle.dataset.imikolov.build_dict()
dict_size = len(word_dict)
first_word = layers.data(
name='firstw',
shape=[1],
data_type='int32',
program=program,
init_program=init_program)
second_word = layers.data(
name='secondw',
shape=[1],
data_type='int32',
program=program,
init_program=init_program)
third_word = layers.data(
name='thirdw',
shape=[1],
data_type='int32',
program=program,
init_program=init_program)
forth_word = layers.data(
name='forthw',
shape=[1],
data_type='int32',
program=program,
init_program=init_program)
next_word = layers.data(
name='nextw',
shape=[1],
data_type='int32',
program=program,
init_program=init_program)
embed_param_attr_1 = {
'name': 'shared_w',
'init_attr': {
'max': 1.0,
'type': 'uniform_random',
'min': -1.0
}
}
embed_param_attr_2 = {'name': 'shared_w'}
embed_first = layers.embedding(
input=first_word,
size=[dict_size, embed_size],
data_type='float32',
param_attr=embed_param_attr_1,
program=program,
init_program=init_program)
embed_second = layers.embedding(
input=second_word,
size=[dict_size, embed_size],
data_type='float32',
param_attr=embed_param_attr_2,
program=program,
init_program=init_program)
embed_third = layers.embedding(
input=third_word,
size=[dict_size, embed_size],
data_type='float32',
param_attr=embed_param_attr_2,
program=program,
init_program=init_program)
embed_forth = layers.embedding(
input=forth_word,
size=[dict_size, embed_size],
data_type='float32',
param_attr=embed_param_attr_2,
program=program,
init_program=init_program)
concat_embed = layers.concat(
input=[embed_first, embed_second, embed_third, embed_forth],
axis=1,
program=program,
init_program=init_program)
hidden1 = layers.fc(input=concat_embed,
size=hidden_size,
act='sigmoid',
program=program,
init_program=init_program)
predict_word = layers.fc(input=hidden1,
size=dict_size,
act='softmax',
program=program,
init_program=init_program)
cost = layers.cross_entropy(
input=predict_word,
label=next_word,
program=program,
init_program=init_program)
avg_cost = layers.mean(x=cost, program=program, init_program=init_program)
sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001)
opts = sgd_optimizer.minimize(avg_cost)
train_reader = paddle.batch(
paddle.dataset.imikolov.train(word_dict, N), batch_size)
place = core.CPUPlace()
exe = Executor(place)
exe.run(init_program, feed={}, fetch_list=[])
PASS_NUM = 100
for pass_id in range(PASS_NUM):
for data in train_reader():
input_data = [[data_idx[idx] for data_idx in data] for idx in xrange(5)]
input_data = map(lambda x: np.array(x).astype("int32"), input_data)
input_data = map(lambda x: np.expand_dims(x, axis=1), input_data)
first_data = input_data[0]
first_tensor = core.LoDTensor()
first_tensor.set(first_data, place)
second_data = input_data[0]
second_tensor = core.LoDTensor()
second_tensor.set(second_data, place)
third_data = input_data[0]
third_tensor = core.LoDTensor()
third_tensor.set(third_data, place)
forth_data = input_data[0]
forth_tensor = core.LoDTensor()
forth_tensor.set(forth_data, place)
next_data = input_data[0]
next_tensor = core.LoDTensor()
next_tensor.set(next_data, place)
outs = exe.run(program,
feed={
'firstw': first_tensor,
'secondw': second_tensor,
'thirdw': third_tensor,
'forthw': forth_tensor,
'nextw': next_tensor
},
fetch_list=[avg_cost])
out = np.array(outs[0])
if out[0] < 10.0:
exit(0) # if avg cost less than 10.0, we think our code is good.
exit(1)
......@@ -101,6 +101,10 @@ class Parameters(object):
self.__param_conf__[param_conf.name] = param_conf
def update_param_conf(self, model_config):
for p in model_config.parameters:
self.__param_conf__[p.name] = p
def keys(self):
"""
keys are the names of each parameter.
......@@ -322,6 +326,17 @@ class Parameters(object):
self.set(name, arr.reshape(self.get_shape(name)))
def to_tar(self, f):
"""
Save parameters to a tar file.
WARNING: You should use `paddle.v2.trainer.SGD.save_parameter_to_tar(f)`
to save parameters most of the time. Otherwise, some settings such
as model average will not take effect.
:param f:
:type f: file
:return:
"""
tar = tarfile.TarFile(fileobj=f, mode='w')
for nm in self.names():
buf = cStringIO.StringIO()
......
......@@ -5,3 +5,4 @@ py_test(test_topology SRCS test_topology.py)
py_test(test_rnn_layer SRCS test_rnn_layer.py)
py_test(test_parameters SRCS test_parameters.py)
py_test(test_data_feeder SRCS test_data_feeder.py)
py_test(test_paramconf_order SRCS test_paramconf_order.py)
......@@ -97,7 +97,7 @@ class DataFeederTest(unittest.TestCase):
each_sample.append(zip(a, b))
data.append(each_sample)
feeder = DataFeeder([('input', data_type.sparse_vector(dim))],
feeder = DataFeeder([('input', data_type.sparse_float_vector(dim))],
{'input': 0})
arg = feeder(data)
output = arg.getSlotValue(0)
......
# Copyright PaddlePaddle contributors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import math
import paddle.v2 as paddle
def wordemb(inlayer):
wordemb = paddle.layer.table_projection(
input=inlayer,
size=5,
param_attr=paddle.attr.Param(
name="_proj", initial_std=0.001, learning_rate=1, l2_rate=0))
return wordemb
def train():
word_dict = paddle.dataset.imikolov.build_dict()
dict_size = len(word_dict)
# Every layer takes integer value of range [0, dict_size)
firstword = paddle.layer.data(
name="firstw", type=paddle.data_type.integer_value(dict_size))
secondword = paddle.layer.data(
name="secondw", type=paddle.data_type.integer_value(dict_size))
thirdword = paddle.layer.data(
name="thirdw", type=paddle.data_type.integer_value(dict_size))
fourthword = paddle.layer.data(
name="fourthw", type=paddle.data_type.integer_value(dict_size))
nextword = paddle.layer.data(
name="fifthw", type=paddle.data_type.integer_value(dict_size))
Efirst = wordemb(firstword)
Esecond = wordemb(secondword)
Ethird = wordemb(thirdword)
Efourth = wordemb(fourthword)
contextemb = paddle.layer.concat(input=[Efirst, Esecond, Ethird, Efourth])
hidden1 = paddle.layer.fc(name="fc1",
input=contextemb,
size=128,
act=paddle.activation.Sigmoid(),
layer_attr=paddle.attr.Extra(drop_rate=0.5),
bias_attr=paddle.attr.Param(learning_rate=2),
param_attr=paddle.attr.Param(
initial_std=1. / math.sqrt(5 * 8),
learning_rate=1,
l2_rate=6e-4))
predictword = paddle.layer.fc(input=hidden1,
size=dict_size,
bias_attr=paddle.attr.Param(learning_rate=2),
act=paddle.activation.Softmax())
return paddle.layer.classification_cost(input=predictword, label=nextword)
class TestParamConfOrder(unittest.TestCase):
def test_param_conf_order(self):
paddle.init()
cost = train()
parameters = paddle.parameters.create(cost)
adagrad = paddle.optimizer.AdaGrad(
learning_rate=3e-3,
regularization=paddle.optimizer.L2Regularization(rate=8e-4))
trainer = paddle.trainer.SGD(cost, parameters, adagrad)
for p in trainer.get_topology_proto().parameters:
if p.name == "_fc1.w0":
self.assertEqual(p.decay_rate, 6e-4)
else:
self.assertEqual(p.decay_rate, 8e-4)
if __name__ == '__main__':
unittest.main()
......@@ -19,6 +19,7 @@ import paddle.trainer_config_helpers as conf_helps
import layer as v2_layer
import config_base
import cPickle
from paddle.trainer import config_parser as cp
__all__ = ['Topology']
......@@ -50,6 +51,35 @@ class Topology(object):
assert isinstance(self.__model_config__, ModelConfig)
def update_from_default(self):
# HACK(typhoonzero): update ParameterConfig(proto) in case of
# optimizers are defined after layers, or between layers.
# Must be called from trainer.__init__()
for parameter in self.__model_config__.parameters:
if parameter.momentum == 0.0 and cp.g_default_momentum:
parameter.momentum = cp.g_default_momentum
if parameter.decay_rate == 0.0 and cp.g_default_decay_rate:
parameter.decay_rate = cp.g_default_decay_rate
if parameter.initial_mean == 0.0:
parameter.initial_mean = cp.g_default_initial_mean
if parameter.initial_std == 0.01:
parameter.initial_std = cp.g_default_initial_std
if parameter.initial_strategy == 0:
parameter.initial_strategy = cp.g_default_initial_strategy
if parameter.initial_smart == False:
parameter.initial_smart = cp.g_default_initial_smart
if parameter.num_batches_regularization == 1 and \
cp.g_default_num_batches_regularization:
parameter.num_batches_regularization = \
cp.g_default_num_batches_regularization
if parameter.gradient_clipping_threshold == 0.0 and \
cp.g_default_gradient_clipping_threshold:
parameter.gradient_clipping_threshold = \
cp.g_default_gradient_clipping_threshold
if parameter.device == -1 and cp.g_default_device:
parameter.device = cp.g_default_device
# FIXME(typhoonzero): ignored: update_hooks, g_default_compact_func
def use_sparse_updater(self):
"""
check if any parameter require to use sparse_update
......
......@@ -64,6 +64,11 @@ class SGD(object):
"paddle.v2.optimizer.Optimizer")
import py_paddle.swig_paddle as api
topology = Topology(cost, extra_layers=extra_layers)
# HACK(typhoonzero): update ParameterConfig(proto) in case of optimizers
# are defined after layers, or between layers.
topology.update_from_default()
parameters.update_param_conf(topology.proto())
self.__optimizer__ = update_equation
self.__topology__ = topology
self.__parameters__ = parameters
......@@ -91,6 +96,9 @@ class SGD(object):
self.__parameters__.append_gradient_machine(gm)
self.__parameter_updater__ = None
def get_topology_proto(self):
return self.__topology_in_proto__
def __use_remote_sparse_updater__(self):
return self.__use_sparse_updater__ and not self.__is_local__
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册