diff --git a/CMakeLists.txt b/CMakeLists.txt
index c86889c05c8cf0d521dce9adbf3e918ba91729a1..0ec65bac84b0b0d89123473a8941f80c90f1b339 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -53,7 +53,7 @@ option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF)
 option(ON_TRAVIS "Exclude special unit test on Travis CI" OFF)
 option(WITH_C_API "Compile PaddlePaddle with C-API(Prediction)" OFF)
 # TODO: Only compile PaddlePaddle fluid version by WITH_FLUID option.
-option(WITH_FLUID "Compile PaddlePaddle fluid only(TODO)" ON)
+option(WITH_FLUID "Compile PaddlePaddle fluid only(TODO)" OFF)
 option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF)
 option(GLIDE_INSTALL "Download and install go dependencies " ON)
 option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF)
diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt
index a7b249d43bf3ad9924749d5e66618750f19d8bf7..d2a4b1335464f553a361728e64ed5ca177ca53da 100644
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@@ -1,27 +1,29 @@
-add_subdirectory(cuda)
-add_subdirectory(function)
-add_subdirectory(utils)
-add_subdirectory(math)
-add_subdirectory(gserver)
-add_subdirectory(parameter)
-add_subdirectory(testing)
-
-if(MOBILE_INFERENCE)
-  add_subdirectory(capi)
-else()
-  add_subdirectory(pserver)
-  add_subdirectory(trainer)
-  add_subdirectory(scripts)
+if(NOT WITH_FLUID)
+  add_subdirectory(cuda)
+  add_subdirectory(function)
+  add_subdirectory(utils)
+  add_subdirectory(math)
+  add_subdirectory(gserver)
+  add_subdirectory(parameter)
 
-  if(WITH_C_API)
+  if(MOBILE_INFERENCE)
     add_subdirectory(capi)
-  endif()
+  else()
+    add_subdirectory(pserver)
+    add_subdirectory(trainer)
+    add_subdirectory(scripts)
 
-  if(NOT ANDROID AND NOT IOS)
-    add_subdirectory(fluid)
-  endif()
+    if(WITH_C_API)
+      add_subdirectory(capi)
+    endif()
 
-  if(WITH_SWIG_PY)
-    add_subdirectory(api)
+    if(WITH_SWIG_PY)
+      add_subdirectory(api)
+    endif()
   endif()
 endif()
+
+add_subdirectory(testing)
+if(NOT MOBILE_INFERENCE AND NOT ANDROID AND NOT IOS)
+  add_subdirectory(fluid)
+endif()
diff --git a/paddle/fluid/framework/channel.h b/paddle/fluid/framework/channel.h
index 9f8fb12098d622058a86f83c1c42a1feb1cfb2e2..51e2b03f9cb9abf6b3effe4035d4eec2ba4f9fbf 100644
--- a/paddle/fluid/framework/channel.h
+++ b/paddle/fluid/framework/channel.h
@@ -15,23 +15,43 @@ limitations under the License. */
 #pragma once
 
 #include <stddef.h>  // for size_t
+#include <condition_variable>
 #include <typeindex>
 #include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
 namespace framework {
 
+enum class ChannelAction {
+  SEND = 0,
+  RECEIVE = 1,
+  CLOSE = 2,
+};
+
 // Channel is the abstract class of buffered and un-buffered channels.
 template <typename T>
 class Channel {
  public:
+  virtual bool CanSend() = 0;
+  virtual bool CanReceive() = 0;
   virtual bool Send(T*) = 0;
   virtual bool Receive(T*) = 0;
   virtual size_t Cap() = 0;
   virtual void Lock() = 0;
+
   virtual void Unlock() = 0;
+  virtual bool IsClosed() = 0;
   virtual void Close() = 0;
   virtual ~Channel() {}
+
+  virtual void AddToSendQ(const void* referrer, T* data,
+                          std::shared_ptr<std::condition_variable_any> cond,
+                          std::function<bool(ChannelAction)> cb) = 0;
+  virtual void AddToReceiveQ(const void* referrer, T* data,
+                             std::shared_ptr<std::condition_variable_any> cond,
+                             std::function<bool(ChannelAction)> cb) = 0;
+  virtual void RemoveFromSendQ(const void* referrer) = 0;
+  virtual void RemoveFromReceiveQ(const void* referrer) = 0;
 };
 
 // Forward declaration of channel implementations.
@@ -80,6 +100,27 @@ class ChannelHolder {
     return channel != nullptr ? channel->Receive(data) : false;
   }
 
+  bool IsClosed() {
+    if (IsInitialized()) {
+      return holder_->IsClosed();
+    }
+    return false;
+  }
+
+  bool CanSend() {
+    if (IsInitialized()) {
+      return holder_->CanSend();
+    }
+    return false;
+  }
+
+  bool CanReceive() {
+    if (IsInitialized()) {
+      return holder_->CanReceive();
+    }
+    return false;
+  }
+
   void close() {
     if (IsInitialized()) holder_->Close();
   }
@@ -97,6 +138,50 @@ class ChannelHolder {
     if (IsInitialized()) holder_->Unlock();
   }
 
+  template <typename T>
+  void AddToSendQ(const void* referrer, T* data,
+                  std::shared_ptr<std::condition_variable_any> cond,
+                  std::function<bool(ChannelAction)> cb) {
+    if (IsInitialized()) {
+      Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
+      if (channel != nullptr) {
+        channel->AddToSendQ(referrer, data, cond, cb);
+      }
+    }
+  }
+
+  template <typename T>
+  void AddToReceiveQ(const void* referrer, T* data,
+                     std::shared_ptr<std::condition_variable_any> cond,
+                     std::function<bool(ChannelAction)> cb) {
+    if (IsInitialized()) {
+      Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
+      if (channel != nullptr) {
+        channel->AddToReceiveQ(referrer, data, cond, cb);
+      }
+    }
+  }
+
+  template <typename T>
+  void RemoveFromSendQ(const void* referrer) {
+    if (IsInitialized()) {
+      Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
+      if (channel != nullptr) {
+        channel->RemoveFromSendQ(referrer);
+      }
+    }
+  }
+
+  template <typename T>
+  void RemoveFromReceiveQ(const void* referrer) {
+    if (IsInitialized()) {
+      Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
+      if (channel != nullptr) {
+        channel->RemoveFromReceiveQ(referrer);
+      }
+    }
+  }
+
   inline bool IsInitialized() const { return holder_ != nullptr; }
 
   inline const std::type_index Type() {
@@ -113,6 +198,9 @@ class ChannelHolder {
     virtual ~Placeholder() {}
     virtual const std::type_index Type() const = 0;
     virtual void* Ptr() const = 0;
+    virtual bool IsClosed() = 0;
+    virtual bool CanSend() = 0;
+    virtual bool CanReceive() = 0;
     virtual void Close() = 0;
     virtual void Lock() = 0;
     virtual void Unlock() = 0;
@@ -129,6 +217,27 @@ class ChannelHolder {
     virtual void* Ptr() const { return static_cast<void*>(channel_.get()); }
 
+    virtual bool IsClosed() {
+      if (channel_) {
+        return channel_->IsClosed();
+      }
+      return false;
+    }
+
+    virtual bool CanSend() {
+      if (channel_) {
+        return channel_->CanSend();
+      }
+      return false;
+    }
+
+    virtual bool CanReceive() {
+      if (channel_) {
+        return channel_->CanReceive();
+      }
+      return false;
+    }
+
     virtual void Close() {
       if (channel_) channel_->Close();
     }
diff --git a/paddle/fluid/framework/channel_impl.h b/paddle/fluid/framework/channel_impl.h
index a4561031fd8c49613269e7008ce558f25f9765e4..c194c03e264cccfa9ad755ea24bc6372e82bfb00 100644
--- a/paddle/fluid/framework/channel_impl.h
+++ b/paddle/fluid/framework/channel_impl.h
@@ -29,32 +29,50 @@ class ChannelImpl : public paddle::framework::Channel<T> {
   friend void paddle::framework::CloseChannel<T>(Channel<T> *);
 
  public:
+  virtual bool CanSend();
+  virtual bool CanReceive();
   virtual bool Send(T *);
   virtual bool Receive(T *);
   virtual size_t Cap() { return cap_; }
   virtual void Lock();
   virtual void Unlock();
+  virtual bool IsClosed();
   virtual void Close();
-
   ChannelImpl(size_t);
   virtual ~ChannelImpl();
 
+  virtual void AddToSendQ(const void *referrer, T *data,
+                          std::shared_ptr<std::condition_variable_any> cond,
+                          std::function<bool(ChannelAction)> cb);
+  virtual void AddToReceiveQ(const void *referrer, T *data,
+                             std::shared_ptr<std::condition_variable_any> cond,
+                             std::function<bool(ChannelAction)> cb);
+
+  virtual void RemoveFromSendQ(const void *referrer);
+  virtual void RemoveFromReceiveQ(const void *referrer);
+
  private:
   struct QueueMessage {
     T *data;
-    std::condition_variable_any cond;
+    std::shared_ptr<std::condition_variable_any> cond;
     bool chan_closed = false;
     bool completed = false;
+    const void *referrer;  // TODO(thuan): figure out better way to do this
+    std::function<bool(ChannelAction)> callback;
 
-    QueueMessage(T *item) : data(item) {}
+    QueueMessage(T *item)
+        : data(item), cond(std::make_shared<std::condition_variable_any>()) {}
+
+    QueueMessage(T *item, std::shared_ptr<std::condition_variable_any> cond)
+        : data(item), cond(cond) {}
 
     void Wait(std::unique_lock<std::recursive_mutex> &lock) {
-      cond.wait(lock, [this]() { return completed; });
+      cond->wait(lock, [this]() { return completed; });
     }
 
     void Notify() {
       completed = true;
-      cond.notify_all();
+      cond->notify_all();
     }
   };
@@ -87,6 +105,18 @@ ChannelImpl<T>::ChannelImpl(size_t capacity)
   PADDLE_ENFORCE_GE(capacity, 0);
 }
 
+template <typename T>
+bool ChannelImpl<T>::CanSend() {
+  std::lock_guard<std::recursive_mutex> lock{mu_};
+  return !closed_ && (!recvq.empty() || buf_.size() < cap_);
+}
+
+template <typename T>
+bool ChannelImpl<T>::CanReceive() {
+  std::lock_guard<std::recursive_mutex> lock{mu_};
+  return !(closed_ && buf_.empty()) && (!sendq.empty() || buf_.size() > 0);
+}
+
 template <typename T>
 bool ChannelImpl<T>::Send(T *item) {
   send_ctr++;
@@ -105,7 +135,24 @@ bool ChannelImpl<T>::Send(T *item) {
     std::shared_ptr<QueueMessage> m = recvq.front();
     recvq.pop_front();
     // Do the data transfer
-    *(m->data) = std::move(*item);
+    // We will do this data transfer if either of the following
+    // cases are true
+    // 1. callback == nullptr // This means it was a regular channel send
+    // 2. callback returns true
+    bool do_send = true;
+    if (m->callback != nullptr) do_send = m->callback(ChannelAction::SEND);
+    if (do_send)
+      *(m->data) = std::move(*item);
+    else
+      // We cannot do the data transfer because
+      // this QueueMessage was added by Select
+      // and some other case was executed.
+      // So call the Send function again.
+      // We do not care about notifying other
+      // because they would have been notified
+      // by the executed select case.
+      return Send(item);
+
     // Wake up the blocked process and unlock
     m->Notify();
     lock.unlock();
@@ -150,7 +197,25 @@ bool ChannelImpl<T>::Receive(T *item) {
     std::shared_ptr<QueueMessage> m = sendq.front();
     sendq.pop_front();
     // Do the data transfer
-    *item = std::move(*(m->data));
+    // We will do this data transfer if either of the following
+    // cases are true
+    // 1. callback == nullptr // This means it was a regular channel send
+    // 2. callback returns true
+    bool do_receive = true;
+    if (m->callback != nullptr)
+      do_receive = m->callback(ChannelAction::RECEIVE);
+    if (do_receive)
+      *item = std::move(*(m->data));
+    else
+      // We cannot do the data transfer because
+      // this QueueMessage was added by Select
+      // and some other case was executed.
+      // So call the Receive function again.
+      // We do not care about notifying other
+      // because they would have been notified
+      // by the executed select case.
+      return Receive(item);
+
     // Wake up the blocked process and unlock
     m->Notify();
     lock.unlock();
@@ -186,6 +251,12 @@ void ChannelImpl<T>::Unlock() {
   mu_.unlock();
 }
 
+template <typename T>
+bool ChannelImpl<T>::IsClosed() {
+  std::lock_guard<std::recursive_mutex> lock{mu_};
+  return closed_;
+}
+
 template <typename T>
 void ChannelImpl<T>::Close() {
   std::unique_lock<std::recursive_mutex> lock{mu_};
@@ -203,6 +274,12 @@ void ChannelImpl<T>::Close() {
     std::shared_ptr<QueueMessage> m = recvq.front();
     recvq.pop_front();
     m->chan_closed = true;
+
+    // Execute callback function (if any)
+    if (m->callback != nullptr) {
+      m->callback(ChannelAction::CLOSE);
+    }
+
     m->Notify();
   }
 
@@ -211,10 +288,72 @@ void ChannelImpl<T>::Close() {
     std::shared_ptr<QueueMessage> m = sendq.front();
     sendq.pop_front();
     m->chan_closed = true;
+
+    // Execute callback function (if any)
+    if (m->callback != nullptr) {
+      m->callback(ChannelAction::CLOSE);
+    }
+
     m->Notify();
   }
 }
 
+template <typename T>
+void ChannelImpl<T>::AddToSendQ(
+    const void *referrer, T *data,
+    std::shared_ptr<std::condition_variable_any> cond,
+    std::function<bool(ChannelAction)> cb) {
+  std::lock_guard<std::recursive_mutex> lock{mu_};
+  auto m = std::make_shared<QueueMessage>(data, cond);
+  m->referrer = referrer;
+  m->callback = cb;
+  sendq.push_back(m);
+}
+
+template <typename T>
+void ChannelImpl<T>::AddToReceiveQ(
+    const void *referrer, T *data,
+    std::shared_ptr<std::condition_variable_any> cond,
+    std::function<bool(ChannelAction)> cb) {
+  std::lock_guard<std::recursive_mutex> lock{mu_};
+  auto m = std::make_shared<QueueMessage>(data, cond);
+  m->referrer = referrer;
+  m->callback = cb;
+  recvq.push_back(m);
+}
+
+template <typename T>
+void ChannelImpl<T>::RemoveFromSendQ(const void *referrer) {
+  std::lock_guard<std::recursive_mutex> lock{mu_};
+
+  for (auto it = sendq.begin(); it != sendq.end();) {
+    std::shared_ptr<QueueMessage> sendMsg = (std::shared_ptr<QueueMessage>)*it;
+
+    if (sendMsg->referrer == referrer) {
+      it = sendq.erase(it);
+      send_ctr--;
+    } else {
+      ++it;
+    }
+  }
+}
+
+template <typename T>
+void ChannelImpl<T>::RemoveFromReceiveQ(const void *referrer) {
+  std::lock_guard<std::recursive_mutex> lock{mu_};
+
+  for (auto it = recvq.begin(); it != recvq.end();) {
+    std::shared_ptr<QueueMessage> recvMsg = (std::shared_ptr<QueueMessage>)*it;
+
+    if (recvMsg->referrer == referrer) {
+      it = recvq.erase(it);
+      recv_ctr--;
+    } else {
+      ++it;
+    }
+  }
+}
+
 template <typename T>
 ChannelImpl<T>::~ChannelImpl() {
   Close();
diff --git a/paddle/fluid/operators/assign_op.cc b/paddle/fluid/operators/assign_op.cc
index 39ae3c0040d04a6d901f1d6c992d547a6778c28e..d372213e1b6008b0c4227103dd40730f86a84301 100644
--- a/paddle/fluid/operators/assign_op.cc
+++ b/paddle/fluid/operators/assign_op.cc
@@ -56,6 +56,7 @@ class AssignFunctor {
  private:
   void copy_tensor(const framework::LoDTensor &lod_tensor,
                    framework::LoDTensor *out) const {
+    if (lod_tensor.numel() == 0) return;
     auto &out_tensor = *out;
     TensorCopy(lod_tensor, lod_tensor.place(), dev_ctx_, &out_tensor);
     out_tensor.set_lod(lod_tensor.lod());
diff --git a/paddle/fluid/operators/mul_op.cc b/paddle/fluid/operators/mul_op.cc
index e7bed2c39735b66c19e738c91f4977e46571143b..90af1e2d602ac039b4d98a69a889ff8b1b85ffc6 100644
--- a/paddle/fluid/operators/mul_op.cc
+++ b/paddle/fluid/operators/mul_op.cc
@@ -17,11 +17,14 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
+using framework::OpKernelType;
 using framework::Tensor;
 
-class MulOpShapeInference : public framework::InferShapeBase {
+class MulOp : public framework::OperatorWithKernel {
  public:
-  void operator()(framework::InferShapeContext* ctx) const override {
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of MulOp should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) of MulOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
@@ -122,7 +125,7 @@ or not. But the output only shares the LoD information with input $X$.
   }
 };
 
-class MulOpGrad : public framework::OperatorWithKernel {
+class MulGradOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
@@ -156,10 +159,7 @@ class MulOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OPERATOR(mul, paddle::framework::OperatorWithKernel, ops::MulOpMaker,
-                  ops::MulOpShapeInference,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
-REGISTER_OPERATOR(mul_grad, ops::MulOpGrad);
+REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulGradOp);
 REGISTER_OP_CPU_KERNEL(
     mul, ops::MulKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
diff --git a/paddle/fluid/operators/mul_op.cu.cc b/paddle/fluid/operators/mul_op.cu.cc
index 0667530e943856576ae8c9fe4856cb6aa1448e4e..757f9c3ee2665c7ac654659416fe8dd727dca16d 100644
--- a/paddle/fluid/operators/mul_op.cu.cc
+++ b/paddle/fluid/operators/mul_op.cu.cc
@@ -13,9 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/mul_op.h"
+#include "paddle/fluid/platform/float16.h"
 
 namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    mul, ops::MulKernel<paddle::platform::CUDADeviceContext, float>);
-REGISTER_OP_CUDA_KERNEL(
-    mul_grad, ops::MulGradKernel<paddle::platform::CUDADeviceContext, float>);
+namespace plat = paddle::platform;
+REGISTER_OP_CUDA_KERNEL(mul, ops::MulKernel<plat::CUDADeviceContext, float>,
+                        ops::MulKernel<plat::CUDADeviceContext, plat::float16>);
+REGISTER_OP_CUDA_KERNEL(mul_grad,
+                        ops::MulGradKernel<plat::CUDADeviceContext, float>);
diff --git a/paddle/fluid/operators/mul_op.h b/paddle/fluid/operators/mul_op.h
index 38311cf87265ad0f1f815734cbf69bd682d62e62..b1260d36ebe11f65529ac274c959479dcb38ee5f 100644
--- a/paddle/fluid/operators/mul_op.h
+++ b/paddle/fluid/operators/mul_op.h
@@ -48,7 +48,7 @@ class MulKernel : public framework::OpKernel<T> {
     }
     math::matmul<DeviceContext, T>(
         context.template device_context<DeviceContext>(), x_matrix, false,
-        y_matrix, false, 1, z, 0);
+        y_matrix, false, static_cast<T>(1), z, static_cast<T>(0));
     if (z_dim.size() != 2) {
       z->Resize(z_dim);
     }
diff --git a/paddle/fluid/operators/nccl_op.cu.cc b/paddle/fluid/operators/nccl_op.cu.cc
index 4d83a70e7334a84bb98bd52f0172f6b7ecedb58d..ad623e1fe0f8941615b671a0c20bd3637ae6d407 100644
--- a/paddle/fluid/operators/nccl_op.cu.cc
+++ b/paddle/fluid/operators/nccl_op.cu.cc
@@ -106,6 +106,8 @@ class NCCLReduceKernel : public framework::OpKernel<T> {
     T* recvbuffer = nullptr;
     if (root == gpu_id) {
       recvbuffer = out->mutable_data<T>(ctx.GetPlace());
+    } else {
+      out->Resize(framework::make_ddim({0}));
     }
     VLOG(3) << "gpu : " << gpu_id << " invoke reduce. send " << x->numel()
             << " recv " << out->numel();
diff --git a/paddle/fluid/operators/scatter_op.cc b/paddle/fluid/operators/scatter_op.cc
index 3fb8b56d2676f90ff7e1cefa46c459ee37f63ca8..d6fd6214711f4ee66b1daffa4db2e84aa7201e79 100644
--- a/paddle/fluid/operators/scatter_op.cc
+++ b/paddle/fluid/operators/scatter_op.cc
@@ -23,24 +23,24 @@ class ScatterOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Ref"),
-                   "Input(Ref) of ScatterOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Index"),
-                   "Input(Index) of ScatterOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of ScatterOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Ids"),
+                   "Input(Ids) of ScatterOp should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("Updates"),
                    "Input(Updates) of ScatterOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of ScatterOp should not be null.");
 
     auto updates_dims = ctx->GetInputDim("Updates");
-    auto ref_dims = ctx->GetInputDim("Ref");
-    PADDLE_ENFORCE_EQ(ctx->GetInputDim("Index").size(), 1,
-                      "Update Index should be 1-D.");
+    auto ref_dims = ctx->GetInputDim("X");
+    PADDLE_ENFORCE_EQ(ctx->GetInputDim("Ids").size(), 1,
+                      "Update Ids should be 1-D.");
     PADDLE_ENFORCE_EQ(ref_dims.size(), updates_dims.size(),
-                      "Reference and Updates should have the same shape size");
+                      "X and Updates should have the same shape size");
     PADDLE_ENFORCE_EQ(ctx->GetInputDim("Updates")[0],
-                      ctx->GetInputDim("Index")[0],
-                      "Updates and Index should have same batch-size.");
+                      ctx->GetInputDim("Ids")[0],
+                      "Updates and Ids should have same batch-size.");
     framework::DDim data_dim(updates_dims);
     for (int i = 1; i < data_dim.size(); ++i) {
       PADDLE_ENFORCE_EQ(data_dim[i], updates_dims[i]);
@@ -52,7 +52,7 @@ class ScatterOp : public framework::OperatorWithKernel {
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<Tensor>("Ref")->type()),
+        framework::ToDataType(ctx.Input<Tensor>("X")->type()),
         ctx.device_context());
   }
 };
@@ -64,14 +64,14 @@ class ScatterGradOp : public framework::OperatorWithKernel {
   void InferShape(framework::InferShapeContext* ctx) const override {
     ctx->SetOutputDim(framework::GradVarName("Updates"),
                       ctx->GetInputDim("Updates"));
-    ctx->SetOutputDim(framework::GradVarName("Ref"), ctx->GetInputDim("Ref"));
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
   }
 
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<Tensor>("Ref")->type()),
+        framework::ToDataType(ctx.Input<Tensor>("X")->type()),
         ctx.device_context());
   }
 };
@@ -80,9 +80,8 @@ class ScatterOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   ScatterOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("Ref", "The source input of scatter op");
-    AddInput("Index",
-             "The index input of scatter op where Ref will be updated");
+    AddInput("X", "The source input of scatter op");
+    AddInput("Ids", "The index input of scatter op where X will be updated");
     AddInput("Updates", "The updated value of updates op");
     AddOutput("Out", "The output of add op");
     AddComment(R"DOC(
@@ -91,8 +90,8 @@ Scatter Operator.
 This operator obtains output by updating the input on selected indices on the first axis:
 
 $$
-Out = Ref \\
-Out[Index] = Ref[Index] + Updates
+Out = X \\
+Out[Ids] = X[Ids] + Updates
 $$
 
 )DOC");
diff --git a/paddle/fluid/operators/scatter_op.cu b/paddle/fluid/operators/scatter_op.cu
index bdabb29fa680f8f87873b4381acf0dbd2b6195d0..ef7d700659d8d713715a10910baf739954ba0786 100644
--- a/paddle/fluid/operators/scatter_op.cu
+++ b/paddle/fluid/operators/scatter_op.cu
@@ -25,14 +25,14 @@ class ScatterOpCUDAKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext &ctx) const override {
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                    "This kernel only runs on GPU device.");
-    auto *Ref = ctx.Input<Tensor>("Ref");
-    auto *Index = ctx.Input<Tensor>("Index");
+    auto *X = ctx.Input<Tensor>("X");
+    auto *Ids = ctx.Input<Tensor>("Ids");
     auto *Updates = ctx.Input<Tensor>("Updates");
     auto *Out = ctx.Output<Tensor>("Out");
 
-    Out->ShareDataWith(*Ref);
+    Out->ShareDataWith(*X);
 
-    GPUScatterAssign<T>(ctx.device_context(), *Updates, *Index, Out);
+    GPUScatterAssign<T>(ctx.device_context(), *Updates, *Ids, Out);
   }
 };
 
@@ -42,16 +42,16 @@ class ScatterGradOpCUDAKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext &ctx) const override {
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                    "This kernel only runs on GPU device.");
-    auto *dRef = ctx.Output<Tensor>(framework::GradVarName("Ref"));
+    auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
     auto *dUpdates = ctx.Output<Tensor>(framework::GradVarName("Updates"));
-    auto *Index = ctx.Input<Tensor>("Index");
+    auto *Ids = ctx.Input<Tensor>("Ids");
     auto *dOut = ctx.Input<Tensor>(framework::GradVarName("Out"));
 
-    // In place gradient: dRef = dO
-    dRef->ShareDataWith(*dOut);
+    // In place gradient: dX = dO
+    dX->ShareDataWith(*dOut);
     dUpdates->mutable_data<T>(ctx.GetPlace());
-    // Gradient by Gather: dUpdates = dO[Index]
-    GPUGather<T>(ctx.device_context(), *dOut, *Index, dUpdates);
+    // Gradient by Gather: dUpdates = dO[Ids]
+    GPUGather<T>(ctx.device_context(), *dOut, *Ids, dUpdates);
   }
 };
diff --git a/paddle/fluid/operators/scatter_op.h b/paddle/fluid/operators/scatter_op.h
index 3c6e7ece320229e1a311ef6d7a27387d40be3c2a..2151d8a9240fc88966533f4a07d5cf56b6c1c3bc 100644
--- a/paddle/fluid/operators/scatter_op.h
+++ b/paddle/fluid/operators/scatter_op.h
@@ -29,15 +29,15 @@ class ScatterOpKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext &ctx) const override {
     PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
                    "This kernel only runs on CPU.");
-    auto *Ref = ctx.Input<Tensor>("Ref");
-    auto *Index = ctx.Input<Tensor>("Index");
+    auto *X = ctx.Input<Tensor>("X");
+    auto *Ids = ctx.Input<Tensor>("Ids");
     auto *Updates = ctx.Input<Tensor>("Updates");
     auto *Out = ctx.Output<Tensor>("Out");
 
-    // In place output: Out = Ref, Out[Index] += Updates
-    Out->ShareDataWith(*Ref);
+    // In place output: Out = X, Out[Ids] += Updates
+    Out->ShareDataWith(*X);
     // Apply ScatterUpdate: Out[index] += Updates[:]
-    ScatterAssign<T>(ctx.device_context(), *Updates, *Index, Out);
+    ScatterAssign<T>(ctx.device_context(), *Updates, *Ids, Out);
   }
 };
 
@@ -47,16 +47,16 @@ class ScatterGradientOpKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext &ctx) const override {
     PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
                    "This kernel only runs on CPU.");
-    auto *dRef = ctx.Output<Tensor>(framework::GradVarName("Ref"));
+    auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
     auto *dUpdates = ctx.Output<Tensor>(framework::GradVarName("Updates"));
-    auto *Index = ctx.Input<Tensor>("Index");
+    auto *Ids = ctx.Input<Tensor>("Ids");
     auto *dOut = ctx.Input<Tensor>(framework::GradVarName("Out"));
 
-    // In place gradient: dRef = dO
-    dRef->ShareDataWith(*dOut);
+    // In place gradient: dX = dO
+    dX->ShareDataWith(*dOut);
     dUpdates->mutable_data<T>(ctx.GetPlace());
-    // Gradient by Gather: dUpdates += dO[Index]
-    CPUGather<T>(ctx.device_context(), *dOut, *Index, dUpdates);
+    // Gradient by Gather: dUpdates += dO[Ids]
+    CPUGather<T>(ctx.device_context(), *dOut, *Ids, dUpdates);
   }
 };
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index d2e883caccdd34a9d662f06b83cf9a71d3d4a51e..6c05442466f5f3d8e04a8f0a2206443b1007a107 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -31,6 +31,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/cond_op.h"
 #include "paddle/fluid/operators/net_op.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/gpu_info.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler.h"
 #include "paddle/fluid/pybind/const_value.h"
@@ -103,12 +104,14 @@ PYBIND11_PLUGIN(core) {
       .def("set", PyCPUTensorSetFromArray<double>)
       .def("set", PyCPUTensorSetFromArray<int64_t>)
       .def("set", PyCPUTensorSetFromArray<bool>)
+      .def("set", PyCPUTensorSetFromArray<uint16_t>)
 #ifdef PADDLE_WITH_CUDA
       .def("set", PyCUDATensorSetFromArray<float>)
       .def("set", PyCUDATensorSetFromArray<int>)
       .def("set", PyCUDATensorSetFromArray<double>)
       .def("set", PyCUDATensorSetFromArray<int64_t>)
       .def("set", PyCUDATensorSetFromArray<bool>)
+      .def("set", PyCUDATensorSetFromArray<uint16_t>)
 #endif
       .def("shape", [](Tensor &self) { return vectorize(self.dims()); })
       .def("set_float_element", TensorSetElement)
@@ -315,7 +318,6 @@ All parameter, weight, gradient are variables in Paddle.
 #endif
       });
   // clang-format on
-
 #ifdef PADDLE_WITH_CUDA
   py::class_<platform::Communicator>(m, "Communicator").def(py::init<>());
 #endif
@@ -423,6 +425,12 @@ All parameter, weight, gradient are variables in Paddle.
   m.def("init_devices", &framework::InitDevices);
 
   m.def("is_compiled_with_cuda", IsCompiledWithCUDA);
+#ifdef PADDLE_WITH_CUDA
+  m.def("is_float16_supported", [](const platform::CUDAPlace &place) -> bool {
+    // Only GPUs with Compute Capability >= 53 support float16
+    return platform::GetCUDAComputeCapability(place.device) >= 53;
+  });
+#endif
 
   m.def("set_feed_variable", framework::SetFeedVariable);
   m.def("get_fetch_variable", framework::GetFetchVariable);
diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h
index 1b0916ea0370d95a0c7dd149ee3f7b294c5e2351..3b206f2f87abe01363fb7e61c319559c6dd24594 100644
--- a/paddle/fluid/pybind/tensor_py.h
+++ b/paddle/fluid/pybind/tensor_py.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/float16.h"
 #include "pybind11/numpy.h"
 #include "pybind11/pybind11.h"
 
@@ -77,21 +78,32 @@ struct CastToPyBufferImpl {
       } else if (paddle::platform::is_cpu_place(tensor.place())) {
         dst_tensor = tensor;
       }
-      return py::buffer_info(dst_tensor.data<CUR_TYPE>(), sizeof(CUR_TYPE),
-                             py::format_descriptor<CUR_TYPE>::format(),
-                             (size_t)framework::arity(dst_tensor.dims()),
-                             dims_outside, strides);
+
+      if (std::type_index(typeid(CUR_TYPE)) ==
+          std::type_index(typeid(platform::float16))) {
+        return py::buffer_info(dst_tensor.data<CUR_TYPE>(), sizeof(CUR_TYPE),
+                               "e", /* np.dtype('e') == np.float16 */
+                               (size_t)framework::arity(dst_tensor.dims()),
+                               dims_outside, strides);
+      } else {
+        return py::buffer_info(dst_tensor.data<CUR_TYPE>(), sizeof(CUR_TYPE),
+                               py::format_descriptor<CUR_TYPE>::format(),
+                               (size_t)framework::arity(dst_tensor.dims()),
+                               dims_outside, strides);
+      }
     } else {
       constexpr bool less = I + 1 < std::tuple_size<std::tuple<ARGS...>>::value;
       return CastToPyBufferImpl<less, I + 1, ARGS...>()(tensor);
     }
   }
 };
+
 }  // namespace details
+
 inline py::buffer_info CastToPyBuffer(framework::Tensor &tensor) {
   auto buffer_info =
-      details::CastToPyBufferImpl<true, 0, float, int, double, int64_t, bool>()(
-          tensor);
+      details::CastToPyBufferImpl<true, 0, float, int, double, int64_t, bool,
+                                  platform::float16>()(tensor);
   return buffer_info;
 }
 
@@ -136,6 +148,22 @@ void PyCPUTensorSetFromArray(
   std::memcpy(dst, array.data(), sizeof(T) * array.size());
 }
 
+template <>
+void PyCPUTensorSetFromArray(
+    framework::Tensor &self,
+    py::array_t<uint16_t, py::array::c_style | py::array::forcecast> array,
+    paddle::platform::CPUPlace &place) {
+  std::vector<int64_t> dims;
+  dims.reserve(array.ndim());
+  for (size_t i = 0; i < array.ndim(); ++i) {
+    dims.push_back((int)array.shape()[i]);
+  }
+
+  self.Resize(framework::make_ddim(dims));
+  auto *dst = self.mutable_data<platform::float16>(place);
+  std::memcpy(dst, array.data(), sizeof(uint16_t) * array.size());
+}
+
 #ifdef PADDLE_WITH_CUDA
 template <typename T>
 void PyCUDATensorSetFromArray(
@@ -157,6 +185,28 @@ void PyCUDATensorSetFromArray(
   paddle::platform::GpuMemcpyAsync(dst, array.data(), sizeof(T) * array.size(),
                                    cudaMemcpyHostToDevice, dev_ctx->stream());
 }
+
+template <>
+void PyCUDATensorSetFromArray(
+    framework::Tensor &self,
+    py::array_t<uint16_t, py::array::c_style | py::array::forcecast> array,
+    paddle::platform::CUDAPlace &place) {
+  std::vector<int64_t> dims;
+  dims.reserve(array.ndim());
+  for (size_t i = 0; i < array.ndim(); ++i) {
+    dims.push_back((int)array.shape()[i]);
+  }
+
+  self.Resize(framework::make_ddim(dims));
+  auto *dst = self.mutable_data<platform::float16>(place);
+
+  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+  auto dev_ctx =
+      static_cast<const platform::CUDADeviceContext *>(pool.Get(place));
+  paddle::platform::GpuMemcpyAsync(dst, array.data(),
+                                   sizeof(uint16_t) * array.size(),
+                                   cudaMemcpyHostToDevice, dev_ctx->stream());
+}
 #endif
 
 }  // namespace pybind
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 6e24cbdd3f6a4f05c1691dc643d880f6f454429d..90c2dfbba78418fb7b731f5363017d70577b1ae5 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -1,27 +1,29 @@
-
-file(GLOB TRAINER_PY_FILES . ./paddle/trainer/*.py)
-file(GLOB HELPERS_PY_FILES . ./paddle/trainer_config_helpers/*.py)
 file(GLOB UTILS_PY_FILES . ./paddle/utils/*.py)
-file(GLOB_RECURSE V2_PY_FILES ./paddle/v2/ *.py)
 file(GLOB_RECURSE FLUID_PY_FILES ./paddle/fluid/ *.py)
-
 set(PY_FILES paddle/__init__.py
-  ${TRAINER_PY_FILES}
-  ${HELPERS_PY_FILES}
   ${UTILS_PY_FILES}
-  ${V2_PY_FILES}
   ${FLUID_PY_FILES})
 
-add_custom_target(copy_paddle_master)
+if(NOT WITH_FLUID)
+  file(GLOB TRAINER_PY_FILES . ./paddle/trainer/*.py)
+  file(GLOB HELPERS_PY_FILES . ./paddle/trainer_config_helpers/*.py)
+  file(GLOB_RECURSE V2_PY_FILES ./paddle/v2/ *.py)
+  set(PY_FILES ${PY_FILES}
+    ${TRAINER_PY_FILES}
+    ${HELPERS_PY_FILES}
+    ${V2_PY_FILES})
 
-SET(COPY_PADDLE_MASTER "")
-if(WITH_GOLANG)
-  SET(COPY_PADDLE_MASTER "copy_paddle_master")
-  add_custom_command(TARGET ${COPY_PADDLE_MASTER}
-    COMMAND cp ${paddle_master_LIB_PATH} ${PADDLE_SOURCE_DIR}/python/paddle/v2/master/
-  )
-  add_dependencies(copy_paddle_master paddle_master)
-endif(WITH_GOLANG)
+  add_custom_target(copy_paddle_master)
+
+  SET(COPY_PADDLE_MASTER "")
+  if(WITH_GOLANG)
+    SET(COPY_PADDLE_MASTER "copy_paddle_master")
+    add_custom_command(TARGET ${COPY_PADDLE_MASTER}
+      COMMAND cp ${paddle_master_LIB_PATH} ${PADDLE_SOURCE_DIR}/python/paddle/v2/master/
+    )
+    add_dependencies(copy_paddle_master paddle_master)
+  endif(WITH_GOLANG)
+endif()
 
 set(MKL_SHARED_LIBS "")
 set(MKL_DEPENDS "")
@@ -59,23 +61,28 @@ add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
     COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_PYTHON_BUILD_DIR}/lib* ${PADDLE_PYTHON_BUILD_DIR}/lib-python
     DEPENDS gen_proto_py copy_paddle_pybind framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER})
 
-set(paddle_python_deps ${PADDLE_PYTHON_BUILD_DIR}/.timestamp paddle_pserver_main paddle_trainer paddle_merge_model ${MKL_DEPENDS})
-if(WITH_SWIG_PY)
-  list(APPEND paddle_python_deps python_api_wheel)
+set(paddle_python_deps ${PADDLE_PYTHON_BUILD_DIR}/.timestamp ${MKL_DEPENDS})
+if(NOT WITH_FLUID)
+  set(paddle_python_deps ${paddle_python_deps} paddle_pserver_main paddle_trainer paddle_merge_model)
+  if(WITH_SWIG_PY)
+    list(APPEND paddle_python_deps python_api_wheel)
+  endif()
 endif()
 add_custom_target(paddle_python ALL DEPENDS ${paddle_python_deps})
 
 set(PADDLE_PYTHON_PACKAGE_DIR ${CMAKE_CURRENT_BINARY_DIR}/dist/)
 
 if (WITH_TESTING)
-  add_subdirectory(paddle/trainer_config_helpers/tests)
-  if (WITH_SWIG_PY)
-    # enable v2 API unittest only when paddle swig api is compiled
-    add_subdirectory(paddle/v2/tests)
-    add_subdirectory(paddle/v2/reader/tests)
-    add_subdirectory(paddle/v2/plot/tests)
-    add_subdirectory(paddle/fluid/tests)
+  if(NOT WITH_FLUID)
+    add_subdirectory(paddle/trainer_config_helpers/tests)
+    if (WITH_SWIG_PY)
+      # enable v2 API unittest only when paddle swig api is compiled
+      add_subdirectory(paddle/v2/tests)
+      add_subdirectory(paddle/v2/reader/tests)
+      add_subdirectory(paddle/v2/plot/tests)
+    endif()
   endif()
+  add_subdirectory(paddle/fluid/tests)
 endif()
 install(DIRECTORY ${PADDLE_PYTHON_PACKAGE_DIR}
     DESTINATION opt/paddle/share/wheels
diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py
index b6f20daee3a585777a23255355f0a0e31328d23f..7af6ed1463ab737e871da487f2a687301652ef2d 100644
--- a/python/paddle/fluid/backward.py
+++ b/python/paddle/fluid/backward.py
@@ -248,12 +248,15 @@ def _callback_lookup_(op):
                     if o_argu in self.param_grad_names:
                         allreduce_out_name = o_argu + "__nccl_all_reduce__"
                         op_desc = _create_op_desc_(
-                            "ncclAllReduce", {
+                            "ncclReduce",
+                            {
                                 "X": [o_argu],
                                 "Communicator": ['nccl_com__do_not_change_']
-                            }, {"Out": [allreduce_out_name]},
-                            {"reduction": "ncclSum"})
+                            },
+                            {"Out": [allreduce_out_name]},
+                            {"reduction": "ncclSum",
+                             "root": 0}, )
                         block.desc.append_op().copy_from(op_desc)
                         op_desc = _create_op_desc_(
diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py
index 0b88b639629ac73b16ec36aa5930c3d6a9665943..14ad18d5085fb945646818cc679f088a43806a70 100644
--- a/python/paddle/fluid/layers/ops.py
+++ b/python/paddle/fluid/layers/ops.py
@@ -45,31 +45,13 @@ __activations__ = [
 ]
 
 __all__ = [
-    'mean',
-    'mul',
-    'reshape',
-    'scale',
-    'sigmoid_cross_entropy_with_logits',
-    'elementwise_add',
-    'elementwise_div',
-    'elementwise_sub',
-    'elementwise_mul',
-    'elementwise_max',
-    'elementwise_min',
-    'elementwise_pow',
-    'clip',
-    'clip_by_norm',
-    'softmax',
-    'sequence_softmax',
-    'logical_and',
-    'logical_or',
-    'logical_xor',
-    'logical_not',
-    'uniform_random',
-    'uniform_random_batch_size_like',
-    'gaussian_random',
-    'gaussian_random_batch_size_like',
-    'cumsum',
+    'mean', 'mul', 'reshape', 'scale', 'sigmoid_cross_entropy_with_logits',
+    'elementwise_add', 'elementwise_div', 'elementwise_sub', 'elementwise_mul',
+    'elementwise_max', 'elementwise_min', 'elementwise_pow', 'clip',
+    'clip_by_norm', 'softmax', 'sequence_softmax', 'logical_and', 'logical_or',
+    'logical_xor', 'logical_not', 'uniform_random',
+    'uniform_random_batch_size_like', 'gaussian_random',
+    'gaussian_random_batch_size_like', 'cumsum', 'scatter'
 ] + __activations__
 
 for _OP in set(__all__):
diff --git a/python/paddle/fluid/tests/unittests/test_mul_op.py b/python/paddle/fluid/tests/unittests/test_mul_op.py
index 9d1da420c7f70bd2a89d183a5f0a2b145f0ff475..40440bea1267112b84b66002a0bf921be3029265 100644
--- a/python/paddle/fluid/tests/unittests/test_mul_op.py
+++ b/python/paddle/fluid/tests/unittests/test_mul_op.py
@@ -14,6 +14,7 @@
 
 import unittest
 import numpy as np
+import paddle.fluid.core as core
 from op_test import OpTest
 
 
@@ -69,5 +70,42 @@ class TestMulOp2(OpTest):
             ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
 
 
+class TestFP16MulOp1(OpTest):
+    def setUp(self):
+        self.op_type = "mul"
+        x = np.random.random((32, 84)).astype("float16")
+        y = np.random.random((84, 100)).astype("float16")
+        self.inputs = {'X': x.view(np.uint16), 'Y': y.view(np.uint16)}
+        self.outputs = {'Out': np.dot(x, y)}
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-1)
+
+
+class TestFP16MulOp2(OpTest):
+    def setUp(self):
+        self.op_type = "mul"
+        x = np.random.random((15, 4, 12, 10)).astype("float16")
+        y = np.random.random((4, 30, 8, 2, 9)).astype("float16")
+        self.inputs = {'X': x.view(np.uint16), 'Y': y.view(np.uint16)}
+        self.attrs = {
+            'x_num_col_dims': 2,
+            'y_num_col_dims': 2,
+        }
+        result = np.dot(
+            x.reshape(15 * 4, 12 * 10), y.reshape(4 * 30, 8 * 2 * 9))
+        result = result.reshape(15, 4, 8, 2, 9)
+        self.outputs = {'Out': result}
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=2e-1)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_scatter_op.py b/python/paddle/fluid/tests/unittests/test_scatter_op.py
index bb02a40d449860cf6c0576662e79a5e36e6e0635..fb1728743630b3ea908ae835444eff7fd71b72c8 100644
--- a/python/paddle/fluid/tests/unittests/test_scatter_op.py
+++ b/python/paddle/fluid/tests/unittests/test_scatter_op.py
@@ -25,7 +25,7 @@ class TestScatterOp(OpTest):
         updates_np = np.random.random((2, 3)).astype("float32")
         output_np = np.copy(ref_np)
         output_np[index_np] = updates_np
-        self.inputs = {'Ref': ref_np, 'Index': index_np, 'Updates': updates_np}
+        self.inputs = {'X': ref_np, 'Ids': index_np, 'Updates': updates_np}
         self.outputs = {'Out': output_np}
 
     def test_check_output(self):
diff --git a/python/setup.py.in b/python/setup.py.in
index f830039a3af581d593d510326f15139377cb25f1..4cb5409524457b7bc5a99c88a0dbbfc8834923fa 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -62,20 +62,22 @@ write_version_py(filename='@PADDLE_SOURCE_DIR@/python/paddle/version.py')
 
 packages=['paddle',
-          'paddle.proto',
-          'paddle.trainer',
-          'paddle.trainer_config_helpers',
           'paddle.utils',
-          'paddle.v2',
-          'paddle.v2.dataset',
-          'paddle.v2.reader',
-          'paddle.v2.master',
-          'paddle.v2.plot',
           'paddle.fluid',
           'paddle.fluid.proto',
           'paddle.fluid.proto.profiler',
-          'paddle.fluid.layers',
-          'py_paddle']
+          'paddle.fluid.layers']
+
+if '${WITH_FLUID}'== 'OFF':
+    packages+=['paddle.proto',
+               'paddle.trainer',
+               'paddle.trainer_config_helpers',
+               'paddle.v2',
+               'paddle.v2.dataset',
+               'paddle.v2.reader',
+               'paddle.v2.master',
+               'paddle.v2.plot',
+               'py_paddle']
 
 with open('@PADDLE_SOURCE_DIR@/python/requirements.txt') as f:
     setup_requires = f.read().splitlines()
@@ -84,11 +86,29 @@ if '${CMAKE_SYSTEM_PROCESSOR}' not in ['arm', 'armv7-a', 'aarch64']:
     setup_requires+=['opencv-python']
 
 # the prefix is sys.prefix which should always be usr
-paddle_bin_dir = 'opt/paddle/bin'
-paddle_bins = ['${PADDLE_BINARY_DIR}/paddle/trainer/paddle_trainer',
-               '${PADDLE_BINARY_DIR}/paddle/trainer/paddle_merge_model',
-               '${PADDLE_BINARY_DIR}/paddle/pserver/paddle_pserver_main',
-               '${PADDLE_BINARY_DIR}/paddle/scripts/paddle']
+paddle_bins = ''
+if '${WITH_FLUID}'== 'OFF':
+    paddle_bin_dir = 'opt/paddle/bin'
+    paddle_bins = ['${PADDLE_BINARY_DIR}/paddle/trainer/paddle_trainer',
+                   '${PADDLE_BINARY_DIR}/paddle/trainer/paddle_merge_model',
+                   '${PADDLE_BINARY_DIR}/paddle/pserver/paddle_pserver_main',
+                   '${PADDLE_BINARY_DIR}/paddle/scripts/paddle']
+
+package_data={'paddle.fluid': ['core.so']}
+if '${WITH_FLUID}'== 'OFF':
+    package_data['paddle.v2.master']=['libpaddle_master.so']
+    package_data['py_paddle']=['*.py','_swig_paddle.so']
+
+package_dir={
+    '': '${CMAKE_CURRENT_SOURCE_DIR}',
+    # The paddle.fluid.proto will be generated while compiling.
+    # So that package points to other directory.
+    'paddle.fluid.proto.profiler': '${PADDLE_BINARY_DIR}/paddle/fluid/platform',
+    'paddle.fluid.proto': '${PADDLE_BINARY_DIR}/paddle/fluid/framework',
+}
+if '${WITH_FLUID}'== 'OFF':
+    package_dir['py_paddle']='${PADDLE_SOURCE_DIR}/paddle/py_paddle'
+
 paddle_rt_lib_dir = 'lib'
 paddle_rt_libs = ['${WARPCTC_LIBRARIES}']
 
@@ -101,19 +121,8 @@ setup(name='${PACKAGE_NAME}',
       install_requires=setup_requires,
       packages=packages,
       ext_modules=[Extension('_foo', ['stub.cc'])],
-      package_data={
-          'paddle.v2.master': ['libpaddle_master.so'],
-          'paddle.fluid': ['core.so'],
-          'py_paddle':['*.py','_swig_paddle.so']
-      },
-      package_dir={
-          '': '${CMAKE_CURRENT_SOURCE_DIR}',
-          # The paddle.fluid.proto will be generated while compiling.
-          # So that package points to other directory.
-          'paddle.fluid.proto.profiler': '${PADDLE_BINARY_DIR}/paddle/fluid/platform',
-          'paddle.fluid.proto': '${PADDLE_BINARY_DIR}/paddle/fluid/framework',
-          'py_paddle': '${PADDLE_SOURCE_DIR}/paddle/py_paddle'
-      },
+      package_data=package_data,
+      package_dir=package_dir,
       scripts=paddle_bins,
       data_files=[(paddle_rt_lib_dir, paddle_rt_libs)]
 )