Commit d7e5e1f1 authored by wanghaoshuang

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into average_model

......@@ -53,7 +53,7 @@ option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF)
option(ON_TRAVIS "Exclude special unit test on Travis CI" OFF)
option(WITH_C_API "Compile PaddlePaddle with C-API(Prediction)" OFF)
# TODO: Only compile PaddlePaddle fluid version by WITH_FLUID option.
option(WITH_FLUID "Compile PaddlePaddle fluid only(TODO)" ON)
option(WITH_FLUID "Compile PaddlePaddle fluid only(TODO)" OFF)
option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF)
option(GLIDE_INSTALL "Download and install go dependencies " ON)
option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF)
......
add_subdirectory(cuda)
add_subdirectory(function)
add_subdirectory(utils)
add_subdirectory(math)
add_subdirectory(gserver)
add_subdirectory(parameter)
add_subdirectory(testing)
if(NOT WITH_FLUID)
add_subdirectory(cuda)
add_subdirectory(function)
add_subdirectory(utils)
add_subdirectory(math)
add_subdirectory(gserver)
add_subdirectory(parameter)
if(MOBILE_INFERENCE)
if(MOBILE_INFERENCE)
add_subdirectory(capi)
else()
else()
add_subdirectory(pserver)
add_subdirectory(trainer)
add_subdirectory(scripts)
......@@ -17,11 +17,13 @@ else()
add_subdirectory(capi)
endif()
if(NOT ANDROID AND NOT IOS)
add_subdirectory(fluid)
endif()
if(WITH_SWIG_PY)
add_subdirectory(api)
endif()
endif()
endif()
add_subdirectory(testing)
if(NOT MOBILE_INFERENCE AND NOT ANDROID AND NOT IOS)
add_subdirectory(fluid)
endif()
......@@ -15,23 +15,43 @@ limitations under the License. */
#pragma once
#include <stddef.h> // for size_t
#include <condition_variable>
#include <typeindex>
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace framework {
enum class ChannelAction {
SEND = 0,
RECEIVE = 1,
CLOSE = 2,
};
// Channel is the abstract class of buffered and un-buffered channels.
template <typename T>
class Channel {
public:
virtual bool CanSend() = 0;
virtual bool CanReceive() = 0;
virtual bool Send(T*) = 0;
virtual bool Receive(T*) = 0;
virtual size_t Cap() = 0;
virtual void Lock() = 0;
virtual void Unlock() = 0;
virtual bool IsClosed() = 0;
virtual void Close() = 0;
virtual ~Channel() {}
virtual void AddToSendQ(const void* referrer, T* data,
std::shared_ptr<std::condition_variable_any> cond,
std::function<bool(ChannelAction)> cb) = 0;
virtual void AddToReceiveQ(const void* referrer, T* data,
std::shared_ptr<std::condition_variable_any> cond,
std::function<bool(ChannelAction)> cb) = 0;
virtual void RemoveFromSendQ(const void* referrer) = 0;
virtual void RemoveFromReceiveQ(const void* referrer) = 0;
};
// Forward declaration of channel implementations.
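For orientation, here is a minimal usage sketch of the abstract interface above. It assumes the MakeChannel/CloseChannel helpers that accompany the forward declarations; the helper names are illustrative and not verified against this hunk.

// Minimal sketch: MakeChannel<T>(capacity) and CloseChannel(Channel<T>*) are assumed helpers.
void ChannelUsageSketch() {
  paddle::framework::Channel<int>* ch = paddle::framework::MakeChannel<int>(4);  // buffered, capacity 4
  int v = 42;
  ch->Send(&v);       // blocks until a receiver or a buffer slot is available
  int out = 0;
  ch->Receive(&out);  // returns false once the channel is closed and drained
  paddle::framework::CloseChannel(ch);
  delete ch;
}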
......@@ -80,6 +100,27 @@ class ChannelHolder {
return channel != nullptr ? channel->Receive(data) : false;
}
bool IsClosed() {
if (IsInitialized()) {
return holder_->IsClosed();
}
return false;
}
bool CanSend() {
if (IsInitialized()) {
return holder_->CanSend();
}
return false;
}
bool CanReceive() {
if (IsInitialized()) {
return holder_->CanReceive();
}
return false;
}
void close() {
if (IsInitialized()) holder_->Close();
}
......@@ -97,6 +138,50 @@ class ChannelHolder {
if (IsInitialized()) holder_->Unlock();
}
template <typename T>
void AddToSendQ(const void* referrer, T* data,
std::shared_ptr<std::condition_variable_any> cond,
std::function<bool(ChannelAction)> cb) {
if (IsInitialized()) {
Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
if (channel != nullptr) {
channel->AddToSendQ(referrer, data, cond, cb);
}
}
}
template <typename T>
void AddToReceiveQ(const void* referrer, T* data,
std::shared_ptr<std::condition_variable_any> cond,
std::function<bool(ChannelAction)> cb) {
if (IsInitialized()) {
Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
if (channel != nullptr) {
channel->AddToReceiveQ(referrer, data, cond, cb);
}
}
}
template <typename T>
void RemoveFromSendQ(const void* referrer) {
if (IsInitialized()) {
Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
if (channel != nullptr) {
channel->RemoveFromSendQ(referrer);
}
}
}
template <typename T>
void RemoveFromReceiveQ(const void* referrer) {
if (IsInitialized()) {
Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
if (channel != nullptr) {
channel->RemoveFromReceiveQ(referrer);
}
}
}
inline bool IsInitialized() const { return holder_ != nullptr; }
inline const std::type_index Type() {
......@@ -113,6 +198,9 @@ class ChannelHolder {
virtual ~Placeholder() {}
virtual const std::type_index Type() const = 0;
virtual void* Ptr() const = 0;
virtual bool IsClosed() = 0;
virtual bool CanSend() = 0;
virtual bool CanReceive() = 0;
virtual void Close() = 0;
virtual void Lock() = 0;
virtual void Unlock() = 0;
......@@ -129,6 +217,27 @@ class ChannelHolder {
virtual void* Ptr() const { return static_cast<void*>(channel_.get()); }
virtual bool IsClosed() {
if (channel_) {
return channel_->IsClosed();
}
return false;
}
virtual bool CanSend() {
if (channel_) {
return channel_->CanSend();
}
return false;
}
virtual bool CanReceive() {
if (channel_) {
return channel_->CanReceive();
}
return false;
}
virtual void Close() {
if (channel_) channel_->Close();
}
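ChannelHolder is the type-erased wrapper around Channel<T> via the Placeholder interface above; a rough usage sketch follows. The Reset<T>() creation helper is assumed here and does not appear in this hunk.

// Sketch only: Reset<int>(capacity) is an assumed creation helper on ChannelHolder.
paddle::framework::ChannelHolder holder;
holder.Reset<int>(2);              // hypothetical: allocates the underlying Channel<int>
int v = 7;
holder.Send(&v);                   // forwarded to Channel<int>::Send via Ptr()
bool ready = holder.CanReceive();  // false while uninitialized, or closed and drained
holder.close();                    // forwards to Channel<int>::Close()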
......
......@@ -29,32 +29,50 @@ class ChannelImpl : public paddle::framework::Channel<T> {
friend void paddle::framework::CloseChannel<T>(Channel<T> *);
public:
virtual bool CanSend();
virtual bool CanReceive();
virtual bool Send(T *);
virtual bool Receive(T *);
virtual size_t Cap() { return cap_; }
virtual void Lock();
virtual void Unlock();
virtual bool IsClosed();
virtual void Close();
ChannelImpl(size_t);
virtual ~ChannelImpl();
virtual void AddToSendQ(const void *referrer, T *data,
std::shared_ptr<std::condition_variable_any> cond,
std::function<bool(ChannelAction)> cb);
virtual void AddToReceiveQ(const void *referrer, T *data,
std::shared_ptr<std::condition_variable_any> cond,
std::function<bool(ChannelAction)> cb);
virtual void RemoveFromSendQ(const void *referrer);
virtual void RemoveFromReceiveQ(const void *referrer);
private:
struct QueueMessage {
T *data;
std::condition_variable_any cond;
std::shared_ptr<std::condition_variable_any> cond;
bool chan_closed = false;
bool completed = false;
const void *referrer; // TODO(thuan): figure out better way to do this
std::function<bool(ChannelAction)> callback;
QueueMessage(T *item) : data(item) {}
QueueMessage(T *item)
: data(item), cond(std::make_shared<std::condition_variable_any>()) {}
QueueMessage(T *item, std::shared_ptr<std::condition_variable_any> cond)
: data(item), cond(cond) {}
void Wait(std::unique_lock<std::recursive_mutex> &lock) {
cond.wait(lock, [this]() { return completed; });
cond->wait(lock, [this]() { return completed; });
}
void Notify() {
completed = true;
cond.notify_all();
cond->notify_all();
}
};
......@@ -87,6 +105,18 @@ ChannelImpl<T>::ChannelImpl(size_t capacity)
PADDLE_ENFORCE_GE(capacity, 0);
}
template <typename T>
bool ChannelImpl<T>::CanSend() {
std::lock_guard<std::recursive_mutex> lock{mu_};
return !closed_ && (!recvq.empty() || buf_.size() < cap_);
}
template <typename T>
bool ChannelImpl<T>::CanReceive() {
std::lock_guard<std::recursive_mutex> lock{mu_};
return !(closed_ && buf_.empty()) && (!sendq.empty() || buf_.size() > 0);
}
template <typename T>
bool ChannelImpl<T>::Send(T *item) {
send_ctr++;
......@@ -105,7 +135,24 @@ bool ChannelImpl<T>::Send(T *item) {
std::shared_ptr<QueueMessage> m = recvq.front();
recvq.pop_front();
// Do the data transfer
// We will do this data transfer if either of the following
// cases is true
// 1. callback == nullptr // This means it was a regular channel send
// 2. callback returns true
bool do_send = true;
if (m->callback != nullptr) do_send = m->callback(ChannelAction::SEND);
if (do_send)
*(m->data) = std::move(*item);
else
// We cannot do the data transfer because
// this QueueMessage was added by Select
// and some other case was executed.
// So call the Send function again.
// We do not care about notifying others
// because they would have been notified
// by the executed select case.
return Send(item);
// Wake up the blocked process and unlock
m->Notify();
lock.unlock();
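The callback check above is what lets a select statement multiplex over several channels: each queued waiter carries a callback that decides, at wake-up time, whether its select case may still fire. A hedged sketch of such a callback (the Select machinery itself is not part of this patch):

#include <functional>
#include <memory>

// Illustrative callback for a waiter queued via AddToSendQ/AddToReceiveQ; the
// shared `fired` flag would be owned by all cases of one select statement.
std::function<bool(paddle::framework::ChannelAction)> MakeSelectCaseCallback() {
  auto fired = std::make_shared<bool>(false);
  return [fired](paddle::framework::ChannelAction action) -> bool {
    if (action == paddle::framework::ChannelAction::CLOSE) return false;  // channel closed under us
    if (*fired) return false;  // another case already completed; Send/Receive will retry
    *fired = true;             // claim this case and allow the data transfer
    return true;
  };
}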
......@@ -150,7 +197,25 @@ bool ChannelImpl<T>::Receive(T *item) {
std::shared_ptr<QueueMessage> m = sendq.front();
sendq.pop_front();
// Do the data transfer
// We will do this data transfer if either of the following
// cases is true
// 1. callback == nullptr // This means it was a regular channel receive
// 2. callback returns true
bool do_receive = true;
if (m->callback != nullptr)
do_receive = m->callback(ChannelAction::RECEIVE);
if (do_receive)
*item = std::move(*(m->data));
else
// We cannot do the data transfer because
// this QueueMessage was added by Select
// and some other case was executed.
// So call the Receive function again.
// We do not care about notifying others
// because they would have been notified
// by the executed select case.
return Receive(item);
// Wake up the blocked process and unlock
m->Notify();
lock.unlock();
......@@ -186,6 +251,12 @@ void ChannelImpl<T>::Unlock() {
mu_.unlock();
}
template <typename T>
bool ChannelImpl<T>::IsClosed() {
std::lock_guard<std::recursive_mutex> lock{mu_};
return closed_;
}
template <typename T>
void ChannelImpl<T>::Close() {
std::unique_lock<std::recursive_mutex> lock{mu_};
......@@ -203,6 +274,12 @@ void ChannelImpl<T>::Close() {
std::shared_ptr<QueueMessage> m = recvq.front();
recvq.pop_front();
m->chan_closed = true;
// Execute callback function (if any)
if (m->callback != nullptr) {
m->callback(ChannelAction::CLOSE);
}
m->Notify();
}
......@@ -211,10 +288,72 @@ void ChannelImpl<T>::Close() {
std::shared_ptr<QueueMessage> m = sendq.front();
sendq.pop_front();
m->chan_closed = true;
// Execute callback function (if any)
if (m->callback != nullptr) {
m->callback(ChannelAction::CLOSE);
}
m->Notify();
}
}
template <typename T>
void ChannelImpl<T>::AddToSendQ(
const void *referrer, T *data,
std::shared_ptr<std::condition_variable_any> cond,
std::function<bool(ChannelAction)> cb) {
std::lock_guard<std::recursive_mutex> lock{mu_};
auto m = std::make_shared<QueueMessage>(data, cond);
m->referrer = referrer;
m->callback = cb;
sendq.push_back(m);
}
template <typename T>
void ChannelImpl<T>::AddToReceiveQ(
const void *referrer, T *data,
std::shared_ptr<std::condition_variable_any> cond,
std::function<bool(ChannelAction)> cb) {
std::lock_guard<std::recursive_mutex> lock{mu_};
auto m = std::make_shared<QueueMessage>(data, cond);
m->referrer = referrer;
m->callback = cb;
recvq.push_back(m);
}
template <typename T>
void ChannelImpl<T>::RemoveFromSendQ(const void *referrer) {
std::lock_guard<std::recursive_mutex> lock{mu_};
for (auto it = sendq.begin(); it != sendq.end();) {
std::shared_ptr<QueueMessage> sendMsg = *it;
if (sendMsg->referrer == referrer) {
it = sendq.erase(it);
send_ctr--;
} else {
++it;
}
}
}
template <typename T>
void ChannelImpl<T>::RemoveFromReceiveQ(const void *referrer) {
std::lock_guard<std::recursive_mutex> lock{mu_};
for (auto it = recvq.begin(); it != recvq.end();) {
std::shared_ptr<QueueMessage> recvMsg = *it;
if (recvMsg->referrer == referrer) {
it = recvq.erase(it);
recv_ctr--;
} else {
++it;
}
}
}
template <typename T>
ChannelImpl<T>::~ChannelImpl() {
Close();
......
......@@ -56,6 +56,7 @@ class AssignFunctor {
private:
void copy_tensor(const framework::LoDTensor &lod_tensor,
framework::LoDTensor *out) const {
if (lod_tensor.numel() == 0) return;
auto &out_tensor = *out;
TensorCopy(lod_tensor, lod_tensor.place(), dev_ctx_, &out_tensor);
out_tensor.set_lod(lod_tensor.lod());
......
......@@ -17,11 +17,14 @@ limitations under the License. */
namespace paddle {
namespace operators {
using framework::OpKernelType;
using framework::Tensor;
class MulOpShapeInference : public framework::InferShapeBase {
class MulOp : public framework::OperatorWithKernel {
public:
void operator()(framework::InferShapeContext* ctx) const override {
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of MulOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) of MulOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
......@@ -122,7 +125,7 @@ or not. But the output only shares the LoD information with input $X$.
}
};
class MulOpGrad : public framework::OperatorWithKernel {
class MulGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
......@@ -156,10 +159,7 @@ class MulOpGrad : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(mul, paddle::framework::OperatorWithKernel, ops::MulOpMaker,
ops::MulOpShapeInference,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(mul_grad, ops::MulOpGrad);
REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulGradOp);
REGISTER_OP_CPU_KERNEL(
mul, ops::MulKernel<paddle::platform::CPUDeviceContext, float>);
REGISTER_OP_CPU_KERNEL(
......
......@@ -13,9 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/mul_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
mul, ops::MulKernel<paddle::platform::CUDADeviceContext, float>);
REGISTER_OP_CUDA_KERNEL(
mul_grad, ops::MulGradKernel<paddle::platform::CUDADeviceContext, float>);
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(mul, ops::MulKernel<plat::CUDADeviceContext, float>,
ops::MulKernel<plat::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL(mul_grad,
ops::MulGradKernel<plat::CUDADeviceContext, float>);
......@@ -48,7 +48,7 @@ class MulKernel : public framework::OpKernel<T> {
}
math::matmul<DeviceContext, T>(
context.template device_context<DeviceContext>(), x_matrix, false,
y_matrix, false, 1, z, 0);
y_matrix, false, static_cast<T>(1), z, static_cast<T>(0));
if (z_dim.size() != 2) {
z->Resize(z_dim);
}
......
......@@ -106,6 +106,8 @@ class NCCLReduceKernel : public framework::OpKernel<T> {
T* recvbuffer = nullptr;
if (root == gpu_id) {
recvbuffer = out->mutable_data<T>(ctx.GetPlace());
} else {
out->Resize(framework::make_ddim({0}));
}
VLOG(3) << "gpu : " << gpu_id << " invoke reduce. send " << x->numel()
<< " recv " << out->numel();
......
......@@ -23,24 +23,24 @@ class ScatterOp : public framework::OperatorWithKernel {
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Ref"),
"Input(Ref) of ScatterOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Index"),
"Input(Index) of ScatterOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of ScatterOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Ids"),
"Input(Ids) of ScatterOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Updates"),
"Input(Updates) of ScatterOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of ScatterOp should not be null.");
auto updates_dims = ctx->GetInputDim("Updates");
auto ref_dims = ctx->GetInputDim("Ref");
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Index").size(), 1,
"Update Index should be 1-D.");
auto ref_dims = ctx->GetInputDim("X");
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Ids").size(), 1,
"Update Ids should be 1-D.");
PADDLE_ENFORCE_EQ(ref_dims.size(), updates_dims.size(),
"Reference and Updates should have the same shape size");
"Xerence and Updates should have the same shape size");
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Updates")[0],
ctx->GetInputDim("Index")[0],
"Updates and Index should have same batch-size.");
ctx->GetInputDim("Ids")[0],
"Updates and Ids should have same batch-size.");
framework::DDim data_dim(updates_dims);
for (int i = 1; i < data_dim.size(); ++i) {
PADDLE_ENFORCE_EQ(data_dim[i], updates_dims[i]);
......@@ -52,7 +52,7 @@ class ScatterOp : public framework::OperatorWithKernel {
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("Ref")->type()),
framework::ToDataType(ctx.Input<Tensor>("X")->type()),
ctx.device_context());
}
};
......@@ -64,14 +64,14 @@ class ScatterGradOp : public framework::OperatorWithKernel {
void InferShape(framework::InferShapeContext* ctx) const override {
ctx->SetOutputDim(framework::GradVarName("Updates"),
ctx->GetInputDim("Updates"));
ctx->SetOutputDim(framework::GradVarName("Ref"), ctx->GetInputDim("Ref"));
ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("Ref")->type()),
framework::ToDataType(ctx.Input<Tensor>("X")->type()),
ctx.device_context());
}
};
......@@ -80,9 +80,8 @@ class ScatterOpMaker : public framework::OpProtoAndCheckerMaker {
public:
ScatterOpMaker(OpProto* proto, OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("Ref", "The source input of scatter op");
AddInput("Index",
"The index input of scatter op where Ref will be updated");
AddInput("X", "The source input of scatter op");
AddInput("Ids", "The index input of scatter op where X will be updated");
AddInput("Updates", "The updated value of updates op");
AddOutput("Out", "The output of add op");
AddComment(R"DOC(
......@@ -91,8 +90,8 @@ Scatter Operator.
This operator obtains output by updating the input on selected indices on the first axis:
$$
Out = Ref \\
Out[Index] = Ref[Index] + Updates
Out = X \\
Out[Ids] = X[Ids] + Updates
$$
)DOC");
......
......@@ -25,14 +25,14 @@ class ScatterOpCUDAKernel : public framework::OpKernel<T> {
void Compute(const framework::ExecutionContext &ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"This kernel only runs on GPU device.");
auto *Ref = ctx.Input<Tensor>("Ref");
auto *Index = ctx.Input<Tensor>("Index");
auto *X = ctx.Input<Tensor>("X");
auto *Ids = ctx.Input<Tensor>("Ids");
auto *Updates = ctx.Input<Tensor>("Updates");
auto *Out = ctx.Output<Tensor>("Out");
Out->ShareDataWith(*Ref);
Out->ShareDataWith(*X);
GPUScatterAssign<T>(ctx.device_context(), *Updates, *Index, Out);
GPUScatterAssign<T>(ctx.device_context(), *Updates, *Ids, Out);
}
};
......@@ -42,16 +42,16 @@ class ScatterGradOpCUDAKernel : public framework::OpKernel<T> {
void Compute(const framework::ExecutionContext &ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"This kernel only runs on GPU device.");
auto *dRef = ctx.Output<Tensor>(framework::GradVarName("Ref"));
auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
auto *dUpdates = ctx.Output<Tensor>(framework::GradVarName("Updates"));
auto *Index = ctx.Input<Tensor>("Index");
auto *Ids = ctx.Input<Tensor>("Ids");
auto *dOut = ctx.Input<Tensor>(framework::GradVarName("Out"));
// In place gradient: dRef = dO
dRef->ShareDataWith(*dOut);
// In place gradient: dX = dO
dX->ShareDataWith(*dOut);
dUpdates->mutable_data<T>(ctx.GetPlace());
// Gradient by Gather: dUpdates = dO[Index]
GPUGather<T>(ctx.device_context(), *dOut, *Index, dUpdates);
// Gradient by Gather: dUpdates = dO[Ids]
GPUGather<T>(ctx.device_context(), *dOut, *Ids, dUpdates);
}
};
......
......@@ -29,15 +29,15 @@ class ScatterOpKernel : public framework::OpKernel<T> {
void Compute(const framework::ExecutionContext &ctx) const override {
PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
"This kernel only runs on CPU.");
auto *Ref = ctx.Input<Tensor>("Ref");
auto *Index = ctx.Input<Tensor>("Index");
auto *X = ctx.Input<Tensor>("X");
auto *Ids = ctx.Input<Tensor>("Ids");
auto *Updates = ctx.Input<Tensor>("Updates");
auto *Out = ctx.Output<Tensor>("Out");
// In place output: Out = Ref, Out[Index] += Updates
Out->ShareDataWith(*Ref);
// In place output: Out = X, Out[Ids] += Updates
Out->ShareDataWith(*X);
// Apply ScatterUpdate: Out[index] += Updates[:]
ScatterAssign<T>(ctx.device_context(), *Updates, *Index, Out);
ScatterAssign<T>(ctx.device_context(), *Updates, *Ids, Out);
}
};
......@@ -47,16 +47,16 @@ class ScatterGradientOpKernel : public framework::OpKernel<T> {
void Compute(const framework::ExecutionContext &ctx) const override {
PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
"This kernel only runs on CPU.");
auto *dRef = ctx.Output<Tensor>(framework::GradVarName("Ref"));
auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
auto *dUpdates = ctx.Output<Tensor>(framework::GradVarName("Updates"));
auto *Index = ctx.Input<Tensor>("Index");
auto *Ids = ctx.Input<Tensor>("Ids");
auto *dOut = ctx.Input<Tensor>(framework::GradVarName("Out"));
// In place gradient: dRef = dO
dRef->ShareDataWith(*dOut);
// In place gradient: dX = dO
dX->ShareDataWith(*dOut);
dUpdates->mutable_data<T>(ctx.GetPlace());
// Gradient by Gather: dUpdates += dO[Index]
CPUGather<T>(ctx.device_context(), *dOut, *Index, dUpdates);
// Gradient by Gather: dUpdates += dO[Ids]
CPUGather<T>(ctx.device_context(), *dOut, *Ids, dUpdates);
}
};
......
......@@ -31,6 +31,7 @@ limitations under the License. */
#include "paddle/fluid/operators/cond_op.h"
#include "paddle/fluid/operators/net_op.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/pybind/const_value.h"
......@@ -103,12 +104,14 @@ PYBIND11_PLUGIN(core) {
.def("set", PyCPUTensorSetFromArray<double>)
.def("set", PyCPUTensorSetFromArray<int64_t>)
.def("set", PyCPUTensorSetFromArray<bool>)
.def("set", PyCPUTensorSetFromArray<uint16_t>)
#ifdef PADDLE_WITH_CUDA
.def("set", PyCUDATensorSetFromArray<float>)
.def("set", PyCUDATensorSetFromArray<int>)
.def("set", PyCUDATensorSetFromArray<double>)
.def("set", PyCUDATensorSetFromArray<int64_t>)
.def("set", PyCUDATensorSetFromArray<bool>)
.def("set", PyCUDATensorSetFromArray<uint16_t>)
#endif
.def("shape", [](Tensor &self) { return vectorize(self.dims()); })
.def("set_float_element", TensorSetElement<float>)
......@@ -315,7 +318,6 @@ All parameter, weight, gradient are variables in Paddle.
#endif
});
// clang-format on
#ifdef PADDLE_WITH_CUDA
py::class_<platform::Communicator>(m, "Communicator").def(py::init<>());
#endif
......@@ -423,6 +425,12 @@ All parameter, weight, gradient are variables in Paddle.
m.def("init_devices", &framework::InitDevices);
m.def("is_compiled_with_cuda", IsCompiledWithCUDA);
#ifdef PADDLE_WITH_CUDA
m.def("is_float16_supported", [](const platform::CUDAPlace &place) -> bool {
// Only GPUs with Compute Capability >= 53 support float16
return platform::GetCUDAComputeCapability(place.device) >= 53;
});
#endif
m.def("set_feed_variable", framework::SetFeedVariable);
m.def("get_fetch_variable", framework::GetFetchVariable);
......
......@@ -17,6 +17,7 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/float16.h"
#include "pybind11/numpy.h"
#include "pybind11/pybind11.h"
......@@ -77,21 +78,32 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
} else if (paddle::platform::is_cpu_place(tensor.place())) {
dst_tensor = tensor;
}
if (std::type_index(typeid(CUR_TYPE)) ==
std::type_index(typeid(platform::float16))) {
return py::buffer_info(dst_tensor.data<CUR_TYPE>(), sizeof(CUR_TYPE),
"e", /* np.dtype('e') == np.float16 */
(size_t)framework::arity(dst_tensor.dims()),
dims_outside, strides);
} else {
return py::buffer_info(dst_tensor.data<CUR_TYPE>(), sizeof(CUR_TYPE),
py::format_descriptor<CUR_TYPE>::format(),
(size_t)framework::arity(dst_tensor.dims()),
dims_outside, strides);
}
} else {
constexpr bool less = I + 1 < std::tuple_size<std::tuple<ARGS...>>::value;
return CastToPyBufferImpl<less, I + 1, ARGS...>()(tensor);
}
}
};
} // namespace details
inline py::buffer_info CastToPyBuffer(framework::Tensor &tensor) {
auto buffer_info =
details::CastToPyBufferImpl<true, 0, float, int, double, int64_t, bool>()(
tensor);
details::CastToPyBufferImpl<true, 0, float, int, double, int64_t, bool,
platform::float16>()(tensor);
return buffer_info;
}
......@@ -136,6 +148,22 @@ void PyCPUTensorSetFromArray(
std::memcpy(dst, array.data(), sizeof(T) * array.size());
}
template <>
void PyCPUTensorSetFromArray(
framework::Tensor &self,
py::array_t<uint16_t, py::array::c_style | py::array::forcecast> array,
paddle::platform::CPUPlace &place) {
std::vector<int64_t> dims;
dims.reserve(array.ndim());
for (size_t i = 0; i < array.ndim(); ++i) {
dims.push_back((int)array.shape()[i]);
}
self.Resize(framework::make_ddim(dims));
auto *dst = self.mutable_data<platform::float16>(place);
std::memcpy(dst, array.data(), sizeof(uint16_t) * array.size());
}
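The uint16_t specialization above works because NumPy exposes float16 as the two-byte 'e' dtype while the C++ side stores paddle::platform::float16 with the same 16-bit layout, so the payload is copied byte-for-byte without numeric conversion. A minimal sketch of that layout assumption:

// Sketch of the layout assumption behind the uint16_t <-> float16 reinterpretation.
#include <cstdint>
#include <cstring>
#include "paddle/fluid/platform/float16.h"

static_assert(sizeof(paddle::platform::float16) == sizeof(std::uint16_t),
              "float16 payload must be exactly two bytes");

inline std::uint16_t Float16Bits(paddle::platform::float16 h) {
  std::uint16_t bits;
  std::memcpy(&bits, &h, sizeof(bits));  // reinterpret the bits, no numeric conversion
  return bits;
}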
#ifdef PADDLE_WITH_CUDA
template <typename T>
void PyCUDATensorSetFromArray(
......@@ -157,6 +185,28 @@ void PyCUDATensorSetFromArray(
paddle::platform::GpuMemcpyAsync(dst, array.data(), sizeof(T) * array.size(),
cudaMemcpyHostToDevice, dev_ctx->stream());
}
template <>
void PyCUDATensorSetFromArray(
framework::Tensor &self,
py::array_t<uint16_t, py::array::c_style | py::array::forcecast> array,
paddle::platform::CUDAPlace &place) {
std::vector<int64_t> dims;
dims.reserve(array.ndim());
for (size_t i = 0; i < array.ndim(); ++i) {
dims.push_back((int)array.shape()[i]);
}
self.Resize(framework::make_ddim(dims));
auto *dst = self.mutable_data<platform::float16>(place);
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto dev_ctx =
static_cast<const platform::CUDADeviceContext *>(pool.Get(place));
paddle::platform::GpuMemcpyAsync(dst, array.data(),
sizeof(uint16_t) * array.size(),
cudaMemcpyHostToDevice, dev_ctx->stream());
}
#endif
} // namespace pybind
......
file(GLOB TRAINER_PY_FILES . ./paddle/trainer/*.py)
file(GLOB HELPERS_PY_FILES . ./paddle/trainer_config_helpers/*.py)
file(GLOB UTILS_PY_FILES . ./paddle/utils/*.py)
file(GLOB_RECURSE V2_PY_FILES ./paddle/v2/ *.py)
file(GLOB_RECURSE FLUID_PY_FILES ./paddle/fluid/ *.py)
set(PY_FILES paddle/__init__.py
${TRAINER_PY_FILES}
${HELPERS_PY_FILES}
${UTILS_PY_FILES}
${V2_PY_FILES}
${FLUID_PY_FILES})
add_custom_target(copy_paddle_master)
if(NOT WITH_FLUID)
file(GLOB TRAINER_PY_FILES . ./paddle/trainer/*.py)
file(GLOB HELPERS_PY_FILES . ./paddle/trainer_config_helpers/*.py)
file(GLOB_RECURSE V2_PY_FILES ./paddle/v2/ *.py)
set(PY_FILES ${PY_FILES}
${TRAINER_PY_FILES}
${HELPERS_PY_FILES}
${V2_PY_FILES})
add_custom_target(copy_paddle_master)
SET(COPY_PADDLE_MASTER "")
if(WITH_GOLANG)
SET(COPY_PADDLE_MASTER "")
if(WITH_GOLANG)
SET(COPY_PADDLE_MASTER "copy_paddle_master")
add_custom_command(TARGET ${COPY_PADDLE_MASTER}
COMMAND cp ${paddle_master_LIB_PATH} ${PADDLE_SOURCE_DIR}/python/paddle/v2/master/
)
add_dependencies(copy_paddle_master paddle_master)
endif(WITH_GOLANG)
endif(WITH_GOLANG)
endif()
set(MKL_SHARED_LIBS "")
set(MKL_DEPENDS "")
......@@ -59,23 +61,28 @@ add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_PYTHON_BUILD_DIR}/lib* ${PADDLE_PYTHON_BUILD_DIR}/lib-python
DEPENDS gen_proto_py copy_paddle_pybind framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER})
set(paddle_python_deps ${PADDLE_PYTHON_BUILD_DIR}/.timestamp paddle_pserver_main paddle_trainer paddle_merge_model ${MKL_DEPENDS})
if(WITH_SWIG_PY)
set(paddle_python_deps ${PADDLE_PYTHON_BUILD_DIR}/.timestamp ${MKL_DEPENDS})
if(NOT WITH_FLUID)
set(paddle_python_deps ${paddle_python_deps} paddle_pserver_main paddle_trainer paddle_merge_model)
if(WITH_SWIG_PY)
list(APPEND paddle_python_deps python_api_wheel)
endif()
endif()
add_custom_target(paddle_python ALL DEPENDS ${paddle_python_deps})
set(PADDLE_PYTHON_PACKAGE_DIR ${CMAKE_CURRENT_BINARY_DIR}/dist/)
if (WITH_TESTING)
if(NOT WITH_FLUID)
add_subdirectory(paddle/trainer_config_helpers/tests)
if (WITH_SWIG_PY)
# enable v2 API unittest only when paddle swig api is compiled
add_subdirectory(paddle/v2/tests)
add_subdirectory(paddle/v2/reader/tests)
add_subdirectory(paddle/v2/plot/tests)
add_subdirectory(paddle/fluid/tests)
endif()
endif()
add_subdirectory(paddle/fluid/tests)
endif()
install(DIRECTORY ${PADDLE_PYTHON_PACKAGE_DIR}
DESTINATION opt/paddle/share/wheels
......
......@@ -248,12 +248,15 @@ def _callback_lookup_(op):
if o_argu in self.param_grad_names:
allreduce_out_name = o_argu + "__nccl_all_reduce__"
op_desc = _create_op_desc_(
"ncclAllReduce", {
"ncclReduce",
{
"X": [o_argu],
"Communicator":
['nccl_com__do_not_change_']
}, {"Out": [allreduce_out_name]},
{"reduction": "ncclSum"})
},
{"Out": [allreduce_out_name]},
{"reduction": "ncclSum",
"root": 0}, )
block.desc.append_op().copy_from(op_desc)
op_desc = _create_op_desc_(
......
......@@ -45,31 +45,13 @@ __activations__ = [
]
__all__ = [
'mean',
'mul',
'reshape',
'scale',
'sigmoid_cross_entropy_with_logits',
'elementwise_add',
'elementwise_div',
'elementwise_sub',
'elementwise_mul',
'elementwise_max',
'elementwise_min',
'elementwise_pow',
'clip',
'clip_by_norm',
'softmax',
'sequence_softmax',
'logical_and',
'logical_or',
'logical_xor',
'logical_not',
'uniform_random',
'uniform_random_batch_size_like',
'gaussian_random',
'gaussian_random_batch_size_like',
'cumsum',
'mean', 'mul', 'reshape', 'scale', 'sigmoid_cross_entropy_with_logits',
'elementwise_add', 'elementwise_div', 'elementwise_sub', 'elementwise_mul',
'elementwise_max', 'elementwise_min', 'elementwise_pow', 'clip',
'clip_by_norm', 'softmax', 'sequence_softmax', 'logical_and', 'logical_or',
'logical_xor', 'logical_not', 'uniform_random',
'uniform_random_batch_size_like', 'gaussian_random',
'gaussian_random_batch_size_like', 'cumsum', 'scatter'
] + __activations__
for _OP in set(__all__):
......
......@@ -14,6 +14,7 @@
import unittest
import numpy as np
import paddle.fluid.core as core
from op_test import OpTest
......@@ -69,5 +70,42 @@ class TestMulOp2(OpTest):
['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
class TestFP16MulOp1(OpTest):
def setUp(self):
self.op_type = "mul"
x = np.random.random((32, 84)).astype("float16")
y = np.random.random((84, 100)).astype("float16")
self.inputs = {'X': x.view(np.uint16), 'Y': y.view(np.uint16)}
self.outputs = {'Out': np.dot(x, y)}
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-1)
class TestFP16MulOp2(OpTest):
def setUp(self):
self.op_type = "mul"
x = np.random.random((15, 4, 12, 10)).astype("float16")
y = np.random.random((4, 30, 8, 2, 9)).astype("float16")
self.inputs = {'X': x.view(np.uint16), 'Y': y.view(np.uint16)}
self.attrs = {
'x_num_col_dims': 2,
'y_num_col_dims': 2,
}
result = np.dot(
x.reshape(15 * 4, 12 * 10), y.reshape(4 * 30, 8 * 2 * 9))
result = result.reshape(15, 4, 8, 2, 9)
self.outputs = {'Out': result}
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=2e-1)
if __name__ == "__main__":
unittest.main()
......@@ -25,7 +25,7 @@ class TestScatterOp(OpTest):
updates_np = np.random.random((2, 3)).astype("float32")
output_np = np.copy(ref_np)
output_np[index_np] = updates_np
self.inputs = {'Ref': ref_np, 'Index': index_np, 'Updates': updates_np}
self.inputs = {'X': ref_np, 'Ids': index_np, 'Updates': updates_np}
self.outputs = {'Out': output_np}
def test_check_output(self):
......
......@@ -62,19 +62,21 @@ write_version_py(filename='@PADDLE_SOURCE_DIR@/python/paddle/version.py')
packages=['paddle',
'paddle.proto',
'paddle.utils',
'paddle.fluid',
'paddle.fluid.proto',
'paddle.fluid.proto.profiler',
'paddle.fluid.layers']
if '${WITH_FLUID}'== 'OFF':
packages+=['paddle.proto',
'paddle.trainer',
'paddle.trainer_config_helpers',
'paddle.utils',
'paddle.v2',
'paddle.v2.dataset',
'paddle.v2.reader',
'paddle.v2.master',
'paddle.v2.plot',
'paddle.fluid',
'paddle.fluid.proto',
'paddle.fluid.proto.profiler',
'paddle.fluid.layers',
'py_paddle']
with open('@PADDLE_SOURCE_DIR@/python/requirements.txt') as f:
......@@ -84,12 +86,30 @@ if '${CMAKE_SYSTEM_PROCESSOR}' not in ['arm', 'armv7-a', 'aarch64']:
setup_requires+=['opencv-python']
# the prefix is sys.prefix which should always be usr
paddle_bin_dir = 'opt/paddle/bin'
paddle_bins = ['${PADDLE_BINARY_DIR}/paddle/trainer/paddle_trainer',
paddle_bins = ''
if '${WITH_FLUID}'== 'OFF':
paddle_bin_dir = 'opt/paddle/bin'
paddle_bins = ['${PADDLE_BINARY_DIR}/paddle/trainer/paddle_trainer',
'${PADDLE_BINARY_DIR}/paddle/trainer/paddle_merge_model',
'${PADDLE_BINARY_DIR}/paddle/pserver/paddle_pserver_main',
'${PADDLE_BINARY_DIR}/paddle/scripts/paddle']
package_data={'paddle.fluid': ['core.so']}
if '${WITH_FLUID}'== 'OFF':
package_data['paddle.v2.master']=['libpaddle_master.so']
package_data['py_paddle']=['*.py','_swig_paddle.so']
package_dir={
'': '${CMAKE_CURRENT_SOURCE_DIR}',
# The paddle.fluid.proto will be generated while compiling.
# So that package points to other directory.
'paddle.fluid.proto.profiler': '${PADDLE_BINARY_DIR}/paddle/fluid/platform',
'paddle.fluid.proto': '${PADDLE_BINARY_DIR}/paddle/fluid/framework',
}
if '${WITH_FLUID}'== 'OFF':
package_dir['py_paddle']='${PADDLE_SOURCE_DIR}/paddle/py_paddle'
paddle_rt_lib_dir = 'lib'
paddle_rt_libs = ['${WARPCTC_LIBRARIES}']
if '${MKL_SHARED_LIBS}'!= '':
......@@ -101,19 +121,8 @@ setup(name='${PACKAGE_NAME}',
install_requires=setup_requires,
packages=packages,
ext_modules=[Extension('_foo', ['stub.cc'])],
package_data={
'paddle.v2.master': ['libpaddle_master.so'],
'paddle.fluid': ['core.so'],
'py_paddle':['*.py','_swig_paddle.so']
},
package_dir={
'': '${CMAKE_CURRENT_SOURCE_DIR}',
# The paddle.fluid.proto will be generated while compiling.
# So that package points to other directory.
'paddle.fluid.proto.profiler': '${PADDLE_BINARY_DIR}/paddle/fluid/platform',
'paddle.fluid.proto': '${PADDLE_BINARY_DIR}/paddle/fluid/framework',
'py_paddle': '${PADDLE_SOURCE_DIR}/paddle/py_paddle'
},
package_data=package_data,
package_dir=package_dir,
scripts=paddle_bins,
data_files=[(paddle_rt_lib_dir, paddle_rt_libs)]
)