Commit d7e5e1f1, authored by: W wanghaoshuang

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into average_model

@@ -53,7 +53,7 @@ option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF)
 option(ON_TRAVIS "Exclude special unit test on Travis CI" OFF)
 option(WITH_C_API "Compile PaddlePaddle with C-API(Prediction)" OFF)
 # TODO: Only compile PaddlePaddle fluid version by WITH_FLUID option.
-option(WITH_FLUID "Compile PaddlePaddle fluid only(TODO)" ON)
+option(WITH_FLUID "Compile PaddlePaddle fluid only(TODO)" OFF)
 option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF)
 option(GLIDE_INSTALL "Download and install go dependencies " ON)
 option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF)
+if(NOT WITH_FLUID)
 add_subdirectory(cuda)
 add_subdirectory(function)
 add_subdirectory(utils)
 add_subdirectory(math)
 add_subdirectory(gserver)
 add_subdirectory(parameter)
-add_subdirectory(testing)

 if(MOBILE_INFERENCE)
   add_subdirectory(capi)
 else()
   add_subdirectory(pserver)
   add_subdirectory(trainer)
   add_subdirectory(scripts)

   if(WITH_C_API)
     add_subdirectory(capi)
   endif()

-  if(NOT ANDROID AND NOT IOS)
-    add_subdirectory(fluid)
-  endif()
-
   if(WITH_SWIG_PY)
     add_subdirectory(api)
   endif()
 endif()
+endif()
+
+add_subdirectory(testing)
+if(NOT MOBILE_INFERENCE AND NOT ANDROID AND NOT IOS)
+  add_subdirectory(fluid)
+endif()
@@ -15,23 +15,43 @@ limitations under the License. */
 #pragma once
 #include <stddef.h>  // for size_t
+#include <condition_variable>
 #include <typeindex>
 #include "paddle/fluid/platform/enforce.h"

 namespace paddle {
 namespace framework {

+enum class ChannelAction {
+  SEND = 0,
+  RECEIVE = 1,
+  CLOSE = 2,
+};
+
 // Channel is the abstract class of buffered and un-buffered channels.
 template <typename T>
 class Channel {
  public:
+  virtual bool CanSend() = 0;
+  virtual bool CanReceive() = 0;
   virtual bool Send(T*) = 0;
   virtual bool Receive(T*) = 0;
   virtual size_t Cap() = 0;
   virtual void Lock() = 0;
   virtual void Unlock() = 0;
+  virtual bool IsClosed() = 0;
   virtual void Close() = 0;
   virtual ~Channel() {}
+
+  virtual void AddToSendQ(const void* referrer, T* data,
+                          std::shared_ptr<std::condition_variable_any> cond,
+                          std::function<bool(ChannelAction)> cb) = 0;
+  virtual void AddToReceiveQ(const void* referrer, T* data,
+                             std::shared_ptr<std::condition_variable_any> cond,
+                             std::function<bool(ChannelAction)> cb) = 0;
+  virtual void RemoveFromSendQ(const void* referrer) = 0;
+  virtual void RemoveFromReceiveQ(const void* referrer) = 0;
 };

 // Forward declaration of channel implementations.
@@ -80,6 +100,27 @@ class ChannelHolder {
     return channel != nullptr ? channel->Receive(data) : false;
   }

+  bool IsClosed() {
+    if (IsInitialized()) {
+      return holder_->IsClosed();
+    }
+    return false;
+  }
+
+  bool CanSend() {
+    if (IsInitialized()) {
+      return holder_->CanSend();
+    }
+    return false;
+  }
+
+  bool CanReceive() {
+    if (IsInitialized()) {
+      return holder_->CanReceive();
+    }
+    return false;
+  }
+
   void close() {
     if (IsInitialized()) holder_->Close();
   }
@@ -97,6 +138,50 @@ class ChannelHolder {
     if (IsInitialized()) holder_->Unlock();
   }

+  template <typename T>
+  void AddToSendQ(const void* referrer, T* data,
+                  std::shared_ptr<std::condition_variable_any> cond,
+                  std::function<bool(ChannelAction)> cb) {
+    if (IsInitialized()) {
+      Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
+      if (channel != nullptr) {
+        channel->AddToSendQ(referrer, data, cond, cb);
+      }
+    }
+  }
+
+  template <typename T>
+  void AddToReceiveQ(const void* referrer, T* data,
+                     std::shared_ptr<std::condition_variable_any> cond,
+                     std::function<bool(ChannelAction)> cb) {
+    if (IsInitialized()) {
+      Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
+      if (channel != nullptr) {
+        channel->AddToReceiveQ(referrer, data, cond, cb);
+      }
+    }
+  }
+
+  template <typename T>
+  void RemoveFromSendQ(const void* referrer) {
+    if (IsInitialized()) {
+      Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
+      if (channel != nullptr) {
+        channel->RemoveFromSendQ(referrer);
+      }
+    }
+  }
+
+  template <typename T>
+  void RemoveFromReceiveQ(const void* referrer) {
+    if (IsInitialized()) {
+      Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
+      if (channel != nullptr) {
+        channel->RemoveFromReceiveQ(referrer);
+      }
+    }
+  }
+
   inline bool IsInitialized() const { return holder_ != nullptr; }

   inline const std::type_index Type() {
@@ -113,6 +198,9 @@ class ChannelHolder {
     virtual ~Placeholder() {}
     virtual const std::type_index Type() const = 0;
     virtual void* Ptr() const = 0;
+    virtual bool IsClosed() = 0;
+    virtual bool CanSend() = 0;
+    virtual bool CanReceive() = 0;
     virtual void Close() = 0;
     virtual void Lock() = 0;
     virtual void Unlock() = 0;
@@ -129,6 +217,27 @@ class ChannelHolder {
     virtual void* Ptr() const { return static_cast<void*>(channel_.get()); }

+    virtual bool IsClosed() {
+      if (channel_) {
+        return channel_->IsClosed();
+      }
+      return false;
+    }
+
+    virtual bool CanSend() {
+      if (channel_) {
+        return channel_->CanSend();
+      }
+      return false;
+    }
+
+    virtual bool CanReceive() {
+      if (channel_) {
+        return channel_->CanReceive();
+      }
+      return false;
+    }
+
     virtual void Close() {
       if (channel_) channel_->Close();
     }
@@ -29,32 +29,50 @@ class ChannelImpl : public paddle::framework::Channel<T> {
   friend void paddle::framework::CloseChannel<T>(Channel<T> *);

  public:
+  virtual bool CanSend();
+  virtual bool CanReceive();
   virtual bool Send(T *);
   virtual bool Receive(T *);
   virtual size_t Cap() { return cap_; }
   virtual void Lock();
   virtual void Unlock();
+  virtual bool IsClosed();
   virtual void Close();
   ChannelImpl(size_t);
   virtual ~ChannelImpl();

+  virtual void AddToSendQ(const void *referrer, T *data,
+                          std::shared_ptr<std::condition_variable_any> cond,
+                          std::function<bool(ChannelAction)> cb);
+  virtual void AddToReceiveQ(const void *referrer, T *data,
+                             std::shared_ptr<std::condition_variable_any> cond,
+                             std::function<bool(ChannelAction)> cb);
+  virtual void RemoveFromSendQ(const void *referrer);
+  virtual void RemoveFromReceiveQ(const void *referrer);
+
  private:
   struct QueueMessage {
     T *data;
-    std::condition_variable_any cond;
+    std::shared_ptr<std::condition_variable_any> cond;
     bool chan_closed = false;
     bool completed = false;
+    const void *referrer;  // TODO(thuan): figure out better way to do this
+    std::function<bool(ChannelAction)> callback;

-    QueueMessage(T *item) : data(item) {}
+    QueueMessage(T *item)
+        : data(item), cond(std::make_shared<std::condition_variable_any>()) {}
+
+    QueueMessage(T *item, std::shared_ptr<std::condition_variable_any> cond)
+        : data(item), cond(cond) {}

     void Wait(std::unique_lock<std::recursive_mutex> &lock) {
-      cond.wait(lock, [this]() { return completed; });
+      cond->wait(lock, [this]() { return completed; });
     }

     void Notify() {
       completed = true;
-      cond.notify_all();
+      cond->notify_all();
     }
   };
@@ -87,6 +105,18 @@ ChannelImpl<T>::ChannelImpl(size_t capacity)
   PADDLE_ENFORCE_GE(capacity, 0);
 }

+template <typename T>
+bool ChannelImpl<T>::CanSend() {
+  std::lock_guard<std::recursive_mutex> lock{mu_};
+  return !closed_ && (!recvq.empty() || buf_.size() < cap_);
+}
+
+template <typename T>
+bool ChannelImpl<T>::CanReceive() {
+  std::lock_guard<std::recursive_mutex> lock{mu_};
+  return !(closed_ && buf_.empty()) && (!sendq.empty() || buf_.size() > 0);
+}
+
 template <typename T>
 bool ChannelImpl<T>::Send(T *item) {
   send_ctr++;
@@ -105,7 +135,24 @@ bool ChannelImpl<T>::Send(T *item) {
     std::shared_ptr<QueueMessage> m = recvq.front();
     recvq.pop_front();
     // Do the data transfer
-    *(m->data) = std::move(*item);
+    // We will do this data transfer if either of the following
+    // cases are true
+    // 1. callback == nullptr // This means it was a regular channel send
+    // 2. callback returns true
+    bool do_send = true;
+    if (m->callback != nullptr) do_send = m->callback(ChannelAction::SEND);
+    if (do_send)
+      *(m->data) = std::move(*item);
+    else
+      // We cannot do the data transfer because
+      // this QueueMessage was added by Select
+      // and some other case was executed.
+      // So call the Send function again.
+      // We do not care about notifying other
+      // because they would have been notified
+      // by the executed select case.
+      return Send(item);
+
     // Wake up the blocked process and unlock
     m->Notify();
     lock.unlock();
@@ -150,7 +197,25 @@ bool ChannelImpl<T>::Receive(T *item) {
     std::shared_ptr<QueueMessage> m = sendq.front();
     sendq.pop_front();
     // Do the data transfer
-    *item = std::move(*(m->data));
+    // We will do this data transfer if either of the following
+    // cases are true
+    // 1. callback == nullptr // This means it was a regular channel send
+    // 2. callback returns true
+    bool do_receive = true;
+    if (m->callback != nullptr)
+      do_receive = m->callback(ChannelAction::RECEIVE);
+    if (do_receive)
+      *item = std::move(*(m->data));
+    else
+      // We cannot do the data transfer because
+      // this QueueMessage was added by Select
+      // and some other case was executed.
+      // So call the Receive function again.
+      // We do not care about notifying other
+      // because they would have been notified
+      // by the executed select case.
+      return Receive(item);
+
     // Wake up the blocked process and unlock
     m->Notify();
     lock.unlock();
@@ -186,6 +251,12 @@ void ChannelImpl<T>::Unlock() {
   mu_.unlock();
 }

+template <typename T>
+bool ChannelImpl<T>::IsClosed() {
+  std::lock_guard<std::recursive_mutex> lock{mu_};
+  return closed_;
+}
+
 template <typename T>
 void ChannelImpl<T>::Close() {
   std::unique_lock<std::recursive_mutex> lock{mu_};
@@ -203,6 +274,12 @@ void ChannelImpl<T>::Close() {
     std::shared_ptr<QueueMessage> m = recvq.front();
     recvq.pop_front();
     m->chan_closed = true;
+
+    // Execute callback function (if any)
+    if (m->callback != nullptr) {
+      m->callback(ChannelAction::CLOSE);
+    }
+
     m->Notify();
   }
@@ -211,10 +288,72 @@ void ChannelImpl<T>::Close() {
     std::shared_ptr<QueueMessage> m = sendq.front();
     sendq.pop_front();
     m->chan_closed = true;
+
+    // Execute callback function (if any)
+    if (m->callback != nullptr) {
+      m->callback(ChannelAction::CLOSE);
+    }
+
     m->Notify();
   }
 }

+template <typename T>
+void ChannelImpl<T>::AddToSendQ(
+    const void *referrer, T *data,
+    std::shared_ptr<std::condition_variable_any> cond,
+    std::function<bool(ChannelAction)> cb) {
+  std::lock_guard<std::recursive_mutex> lock{mu_};
+  auto m = std::make_shared<QueueMessage>(data, cond);
+  m->referrer = referrer;
+  m->callback = cb;
+  sendq.push_back(m);
+}
+
+template <typename T>
+void ChannelImpl<T>::AddToReceiveQ(
+    const void *referrer, T *data,
+    std::shared_ptr<std::condition_variable_any> cond,
+    std::function<bool(ChannelAction)> cb) {
+  std::lock_guard<std::recursive_mutex> lock{mu_};
+  auto m = std::make_shared<QueueMessage>(data, cond);
+  m->referrer = referrer;
+  m->callback = cb;
+  recvq.push_back(m);
+}
+
+template <typename T>
+void ChannelImpl<T>::RemoveFromSendQ(const void *referrer) {
+  std::lock_guard<std::recursive_mutex> lock{mu_};
+  for (auto it = sendq.begin(); it != sendq.end();) {
+    std::shared_ptr<QueueMessage> sendMsg = (std::shared_ptr<QueueMessage>)*it;
+    if (sendMsg->referrer == referrer) {
+      it = sendq.erase(it);
+      send_ctr--;
+    } else {
+      ++it;
+    }
+  }
+}
+
+template <typename T>
+void ChannelImpl<T>::RemoveFromReceiveQ(const void *referrer) {
+  std::lock_guard<std::recursive_mutex> lock{mu_};
+  for (auto it = recvq.begin(); it != recvq.end();) {
+    std::shared_ptr<QueueMessage> recvMsg = (std::shared_ptr<QueueMessage>)*it;
+    if (recvMsg->referrer == referrer) {
+      it = recvq.erase(it);
+      recv_ctr--;
+    } else {
+      ++it;
+    }
+  }
+}
+
 template <typename T>
 ChannelImpl<T>::~ChannelImpl() {
   Close();
@@ -56,6 +56,7 @@ class AssignFunctor {
  private:
   void copy_tensor(const framework::LoDTensor &lod_tensor,
                    framework::LoDTensor *out) const {
+    if (lod_tensor.numel() == 0) return;
     auto &out_tensor = *out;
     TensorCopy(lod_tensor, lod_tensor.place(), dev_ctx_, &out_tensor);
     out_tensor.set_lod(lod_tensor.lod());
@@ -17,11 +17,14 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

+using framework::OpKernelType;
 using framework::Tensor;

-class MulOpShapeInference : public framework::InferShapeBase {
+class MulOp : public framework::OperatorWithKernel {
  public:
-  void operator()(framework::InferShapeContext* ctx) const override {
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of MulOp should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) of MulOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
@@ -122,7 +125,7 @@ or not. But the output only shares the LoD information with input $X$.
   }
 };

-class MulOpGrad : public framework::OperatorWithKernel {
+class MulGradOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
@@ -156,10 +159,7 @@ class MulOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle

 namespace ops = paddle::operators;
-REGISTER_OPERATOR(mul, paddle::framework::OperatorWithKernel, ops::MulOpMaker,
-                  ops::MulOpShapeInference,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
-REGISTER_OPERATOR(mul_grad, ops::MulOpGrad);
+REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulGradOp);
 REGISTER_OP_CPU_KERNEL(
     mul, ops::MulKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
@@ -13,9 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/mul_op.h"
+#include "paddle/fluid/platform/float16.h"

 namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    mul, ops::MulKernel<paddle::platform::CUDADeviceContext, float>);
-REGISTER_OP_CUDA_KERNEL(
-    mul_grad, ops::MulGradKernel<paddle::platform::CUDADeviceContext, float>);
+namespace plat = paddle::platform;
+REGISTER_OP_CUDA_KERNEL(mul, ops::MulKernel<plat::CUDADeviceContext, float>,
+                        ops::MulKernel<plat::CUDADeviceContext, plat::float16>);
+REGISTER_OP_CUDA_KERNEL(mul_grad,
+                        ops::MulGradKernel<plat::CUDADeviceContext, float>);
@@ -48,7 +48,7 @@ class MulKernel : public framework::OpKernel<T> {
     }
     math::matmul<DeviceContext, T>(
         context.template device_context<DeviceContext>(), x_matrix, false,
-        y_matrix, false, 1, z, 0);
+        y_matrix, false, static_cast<T>(1), z, static_cast<T>(0));
     if (z_dim.size() != 2) {
       z->Resize(z_dim);
     }
@@ -106,6 +106,8 @@ class NCCLReduceKernel : public framework::OpKernel<T> {
     T* recvbuffer = nullptr;
     if (root == gpu_id) {
       recvbuffer = out->mutable_data<T>(ctx.GetPlace());
+    } else {
+      out->Resize(framework::make_ddim({0}));
     }
     VLOG(3) << "gpu : " << gpu_id << " invoke reduce. send " << x->numel()
             << " recv " << out->numel();
@@ -23,24 +23,24 @@ class ScatterOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;

   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Ref"),
-                   "Input(Ref) of ScatterOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Index"),
-                   "Input(Index) of ScatterOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of ScatterOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Ids"),
+                   "Input(Ids) of ScatterOp should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("Updates"),
                    "Input(Updates) of ScatterOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of ScatterOp should not be null.");

     auto updates_dims = ctx->GetInputDim("Updates");
-    auto ref_dims = ctx->GetInputDim("Ref");
-    PADDLE_ENFORCE_EQ(ctx->GetInputDim("Index").size(), 1,
-                      "Update Index should be 1-D.");
+    auto ref_dims = ctx->GetInputDim("X");
+    PADDLE_ENFORCE_EQ(ctx->GetInputDim("Ids").size(), 1,
+                      "Update Ids should be 1-D.");
     PADDLE_ENFORCE_EQ(ref_dims.size(), updates_dims.size(),
-                      "Reference and Updates should have the same shape size");
+                      "Xerence and Updates should have the same shape size");
     PADDLE_ENFORCE_EQ(ctx->GetInputDim("Updates")[0],
-                      ctx->GetInputDim("Index")[0],
-                      "Updates and Index should have same batch-size.");
+                      ctx->GetInputDim("Ids")[0],
+                      "Updates and Ids should have same batch-size.");
     framework::DDim data_dim(updates_dims);
     for (int i = 1; i < data_dim.size(); ++i) {
       PADDLE_ENFORCE_EQ(data_dim[i], updates_dims[i]);
@@ -52,7 +52,7 @@ class ScatterOp : public framework::OperatorWithKernel {
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<Tensor>("Ref")->type()),
+        framework::ToDataType(ctx.Input<Tensor>("X")->type()),
         ctx.device_context());
   }
 };
@@ -64,14 +64,14 @@ class ScatterGradOp : public framework::OperatorWithKernel {
   void InferShape(framework::InferShapeContext* ctx) const override {
     ctx->SetOutputDim(framework::GradVarName("Updates"),
                       ctx->GetInputDim("Updates"));
-    ctx->SetOutputDim(framework::GradVarName("Ref"), ctx->GetInputDim("Ref"));
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
   }

  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<Tensor>("Ref")->type()),
+        framework::ToDataType(ctx.Input<Tensor>("X")->type()),
         ctx.device_context());
   }
 };
@@ -80,9 +80,8 @@ class ScatterOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   ScatterOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("Ref", "The source input of scatter op");
-    AddInput("Index",
-             "The index input of scatter op where Ref will be updated");
+    AddInput("X", "The source input of scatter op");
+    AddInput("Ids", "The index input of scatter op where X will be updated");
     AddInput("Updates", "The updated value of updates op");
     AddOutput("Out", "The output of add op");
     AddComment(R"DOC(
@@ -91,8 +90,8 @@ Scatter Operator.
 This operator obtains output by updating the input on selected indices on the first axis:

 $$
-Out = Ref \\
-Out[Index] = Ref[Index] + Updates
+Out = X \\
+Out[Ids] = X[Ids] + Updates
 $$

 )DOC");
@@ -25,14 +25,14 @@ class ScatterOpCUDAKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext &ctx) const override {
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                    "This kernel only runs on GPU device.");
-    auto *Ref = ctx.Input<Tensor>("Ref");
-    auto *Index = ctx.Input<Tensor>("Index");
+    auto *X = ctx.Input<Tensor>("X");
+    auto *Ids = ctx.Input<Tensor>("Ids");
     auto *Updates = ctx.Input<Tensor>("Updates");
     auto *Out = ctx.Output<Tensor>("Out");

-    Out->ShareDataWith(*Ref);
-    GPUScatterAssign<T>(ctx.device_context(), *Updates, *Index, Out);
+    Out->ShareDataWith(*X);
+    GPUScatterAssign<T>(ctx.device_context(), *Updates, *Ids, Out);
   }
 };
@@ -42,16 +42,16 @@ class ScatterGradOpCUDAKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext &ctx) const override {
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                    "This kernel only runs on GPU device.");
-    auto *dRef = ctx.Output<Tensor>(framework::GradVarName("Ref"));
+    auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
     auto *dUpdates = ctx.Output<Tensor>(framework::GradVarName("Updates"));
-    auto *Index = ctx.Input<Tensor>("Index");
+    auto *Ids = ctx.Input<Tensor>("Ids");
     auto *dOut = ctx.Input<Tensor>(framework::GradVarName("Out"));

-    // In place gradient: dRef = dO
-    dRef->ShareDataWith(*dOut);
+    // In place gradient: dX = dO
+    dX->ShareDataWith(*dOut);
     dUpdates->mutable_data<T>(ctx.GetPlace());
-    // Gradient by Gather: dUpdates = dO[Index]
-    GPUGather<T>(ctx.device_context(), *dOut, *Index, dUpdates);
+    // Gradient by Gather: dUpdates = dO[Ids]
+    GPUGather<T>(ctx.device_context(), *dOut, *Ids, dUpdates);
   }
 };
@@ -29,15 +29,15 @@ class ScatterOpKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext &ctx) const override {
     PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
                    "This kernel only runs on CPU.");
-    auto *Ref = ctx.Input<Tensor>("Ref");
-    auto *Index = ctx.Input<Tensor>("Index");
+    auto *X = ctx.Input<Tensor>("X");
+    auto *Ids = ctx.Input<Tensor>("Ids");
     auto *Updates = ctx.Input<Tensor>("Updates");
     auto *Out = ctx.Output<Tensor>("Out");

-    // In place output: Out = Ref, Out[Index] += Updates
-    Out->ShareDataWith(*Ref);
+    // In place output: Out = X, Out[Ids] += Updates
+    Out->ShareDataWith(*X);
     // Apply ScatterUpdate: Out[index] += Updates[:]
-    ScatterAssign<T>(ctx.device_context(), *Updates, *Index, Out);
+    ScatterAssign<T>(ctx.device_context(), *Updates, *Ids, Out);
   }
 };
@@ -47,16 +47,16 @@ class ScatterGradientOpKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext &ctx) const override {
     PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
                    "This kernel only runs on CPU.");
-    auto *dRef = ctx.Output<Tensor>(framework::GradVarName("Ref"));
+    auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
     auto *dUpdates = ctx.Output<Tensor>(framework::GradVarName("Updates"));
-    auto *Index = ctx.Input<Tensor>("Index");
+    auto *Ids = ctx.Input<Tensor>("Ids");
     auto *dOut = ctx.Input<Tensor>(framework::GradVarName("Out"));

-    // In place gradient: dRef = dO
-    dRef->ShareDataWith(*dOut);
+    // In place gradient: dX = dO
+    dX->ShareDataWith(*dOut);
     dUpdates->mutable_data<T>(ctx.GetPlace());
-    // Gradient by Gather: dUpdates += dO[Index]
-    CPUGather<T>(ctx.device_context(), *dOut, *Index, dUpdates);
+    // Gradient by Gather: dUpdates += dO[Ids]
+    CPUGather<T>(ctx.device_context(), *dOut, *Ids, dUpdates);
   }
 };
@@ -31,6 +31,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/cond_op.h"
 #include "paddle/fluid/operators/net_op.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/gpu_info.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler.h"
 #include "paddle/fluid/pybind/const_value.h"
@@ -103,12 +104,14 @@ PYBIND11_PLUGIN(core) {
       .def("set", PyCPUTensorSetFromArray<double>)
       .def("set", PyCPUTensorSetFromArray<int64_t>)
       .def("set", PyCPUTensorSetFromArray<bool>)
+      .def("set", PyCPUTensorSetFromArray<uint16_t>)
 #ifdef PADDLE_WITH_CUDA
       .def("set", PyCUDATensorSetFromArray<float>)
       .def("set", PyCUDATensorSetFromArray<int>)
       .def("set", PyCUDATensorSetFromArray<double>)
       .def("set", PyCUDATensorSetFromArray<int64_t>)
       .def("set", PyCUDATensorSetFromArray<bool>)
+      .def("set", PyCUDATensorSetFromArray<uint16_t>)
 #endif
       .def("shape", [](Tensor &self) { return vectorize(self.dims()); })
       .def("set_float_element", TensorSetElement<float>)
@@ -315,7 +318,6 @@ All parameter, weight, gradient are variables in Paddle.
 #endif
       });
   // clang-format on
-
 #ifdef PADDLE_WITH_CUDA
   py::class_<platform::Communicator>(m, "Communicator").def(py::init<>());
 #endif
@@ -423,6 +425,12 @@ All parameter, weight, gradient are variables in Paddle.
   m.def("init_devices", &framework::InitDevices);

   m.def("is_compiled_with_cuda", IsCompiledWithCUDA);
+#ifdef PADDLE_WITH_CUDA
+  m.def("is_float16_supported", [](const platform::CUDAPlace &place) -> bool {
+    // Only GPUs with Compute Capability >= 53 support float16
+    return platform::GetCUDAComputeCapability(place.device) >= 53;
+  });
+#endif

   m.def("set_feed_variable", framework::SetFeedVariable);
   m.def("get_fetch_variable", framework::GetFetchVariable);
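A minimal usage sketch of the new `is_float16_supported` binding from Python. This is not part of the commit; it only mirrors the guard used by the FP16 unit tests further below, and assumes a CUDA build of `paddle.fluid.core` with at least one visible GPU:

    import numpy as np
    import paddle.fluid.core as core

    # is_float16_supported is only exported in CUDA builds, so check
    # is_compiled_with_cuda() first; only devices with compute capability
    # >= 53 report float16 support.
    if core.is_compiled_with_cuda():
        place = core.CUDAPlace(0)
        if core.is_float16_supported(place):
            x = np.random.random((32, 84)).astype("float16")
            # ... run float16 kernels (e.g. the new mul FP16 kernel) on `place` ...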
@@ -17,6 +17,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/float16.h"
 #include "pybind11/numpy.h"
 #include "pybind11/pybind11.h"
@@ -77,21 +78,32 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
       } else if (paddle::platform::is_cpu_place(tensor.place())) {
         dst_tensor = tensor;
       }
-      return py::buffer_info(dst_tensor.data<CUR_TYPE>(), sizeof(CUR_TYPE),
-                             py::format_descriptor<CUR_TYPE>::format(),
-                             (size_t)framework::arity(dst_tensor.dims()),
-                             dims_outside, strides);
+
+      if (std::type_index(typeid(CUR_TYPE)) ==
+          std::type_index(typeid(platform::float16))) {
+        return py::buffer_info(dst_tensor.data<CUR_TYPE>(), sizeof(CUR_TYPE),
+                               "e", /* np.dtype('e') == np.float16 */
+                               (size_t)framework::arity(dst_tensor.dims()),
+                               dims_outside, strides);
+      } else {
+        return py::buffer_info(dst_tensor.data<CUR_TYPE>(), sizeof(CUR_TYPE),
+                               py::format_descriptor<CUR_TYPE>::format(),
+                               (size_t)framework::arity(dst_tensor.dims()),
+                               dims_outside, strides);
+      }
     } else {
       constexpr bool less = I + 1 < std::tuple_size<std::tuple<ARGS...>>::value;
       return CastToPyBufferImpl<less, I + 1, ARGS...>()(tensor);
     }
   }
 };
 }  // namespace details

 inline py::buffer_info CastToPyBuffer(framework::Tensor &tensor) {
   auto buffer_info =
-      details::CastToPyBufferImpl<true, 0, float, int, double, int64_t, bool>()(
-          tensor);
+      details::CastToPyBufferImpl<true, 0, float, int, double, int64_t, bool,
+                                  platform::float16>()(tensor);
   return buffer_info;
 }
@@ -136,6 +148,22 @@ void PyCPUTensorSetFromArray(
   std::memcpy(dst, array.data(), sizeof(T) * array.size());
 }

+template <>
+void PyCPUTensorSetFromArray(
+    framework::Tensor &self,
+    py::array_t<uint16_t, py::array::c_style | py::array::forcecast> array,
+    paddle::platform::CPUPlace &place) {
+  std::vector<int64_t> dims;
+  dims.reserve(array.ndim());
+  for (size_t i = 0; i < array.ndim(); ++i) {
+    dims.push_back((int)array.shape()[i]);
+  }
+
+  self.Resize(framework::make_ddim(dims));
+  auto *dst = self.mutable_data<platform::float16>(place);
+  std::memcpy(dst, array.data(), sizeof(uint16_t) * array.size());
+}
+
 #ifdef PADDLE_WITH_CUDA
 template <typename T>
 void PyCUDATensorSetFromArray(
@@ -157,6 +185,28 @@ void PyCUDATensorSetFromArray(
   paddle::platform::GpuMemcpyAsync(dst, array.data(), sizeof(T) * array.size(),
                                    cudaMemcpyHostToDevice, dev_ctx->stream());
 }

+template <>
+void PyCUDATensorSetFromArray(
+    framework::Tensor &self,
+    py::array_t<uint16_t, py::array::c_style | py::array::forcecast> array,
+    paddle::platform::CUDAPlace &place) {
+  std::vector<int64_t> dims;
+  dims.reserve(array.ndim());
+  for (size_t i = 0; i < array.ndim(); ++i) {
+    dims.push_back((int)array.shape()[i]);
+  }
+
+  self.Resize(framework::make_ddim(dims));
+  auto *dst = self.mutable_data<platform::float16>(place);
+
+  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+  auto dev_ctx =
+      static_cast<const platform::CUDADeviceContext *>(pool.Get(place));
+  paddle::platform::GpuMemcpyAsync(dst, array.data(),
+                                   sizeof(uint16_t) * array.size(),
+                                   cudaMemcpyHostToDevice, dev_ctx->stream());
+}
 #endif

 }  // namespace pybind
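A minimal sketch of how the new uint16 `set()` overload is reached from Python. It is not part of the commit; it assumes `core.LoDTensor` is the tensor type exposed by the bindings, and it passes float16 data the same way the FP16 tests in this commit do, as the raw `uint16` view of the buffer:

    import numpy as np
    import paddle.fluid.core as core

    t = core.LoDTensor()  # assumed tensor type exposed by paddle.fluid.core
    arr = np.random.random((2, 3)).astype("float16")
    # float16 arrays travel through pybind as their raw uint16 view, which
    # dispatches to the PyCPUTensorSetFromArray<uint16_t> specialization above.
    t.set(arr.view(np.uint16), core.CPUPlace())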
-file(GLOB TRAINER_PY_FILES . ./paddle/trainer/*.py)
-file(GLOB HELPERS_PY_FILES . ./paddle/trainer_config_helpers/*.py)
 file(GLOB UTILS_PY_FILES . ./paddle/utils/*.py)
-file(GLOB_RECURSE V2_PY_FILES ./paddle/v2/ *.py)
 file(GLOB_RECURSE FLUID_PY_FILES ./paddle/fluid/ *.py)

 set(PY_FILES paddle/__init__.py
-    ${TRAINER_PY_FILES}
-    ${HELPERS_PY_FILES}
     ${UTILS_PY_FILES}
-    ${V2_PY_FILES}
     ${FLUID_PY_FILES})

-add_custom_target(copy_paddle_master)
+if(NOT WITH_FLUID)
+  file(GLOB TRAINER_PY_FILES . ./paddle/trainer/*.py)
+  file(GLOB HELPERS_PY_FILES . ./paddle/trainer_config_helpers/*.py)
+  file(GLOB_RECURSE V2_PY_FILES ./paddle/v2/ *.py)
+  set(PY_FILES ${PY_FILES}
+      ${TRAINER_PY_FILES}
+      ${HELPERS_PY_FILES}
+      ${V2_PY_FILES})

-SET(COPY_PADDLE_MASTER "")
-if(WITH_GOLANG)
-  SET(COPY_PADDLE_MASTER "copy_paddle_master")
-  add_custom_command(TARGET ${COPY_PADDLE_MASTER}
-    COMMAND cp ${paddle_master_LIB_PATH} ${PADDLE_SOURCE_DIR}/python/paddle/v2/master/
-  )
-  add_dependencies(copy_paddle_master paddle_master)
-endif(WITH_GOLANG)
+  add_custom_target(copy_paddle_master)
+
+  SET(COPY_PADDLE_MASTER "")
+  if(WITH_GOLANG)
+    SET(COPY_PADDLE_MASTER "copy_paddle_master")
+    add_custom_command(TARGET ${COPY_PADDLE_MASTER}
+      COMMAND cp ${paddle_master_LIB_PATH} ${PADDLE_SOURCE_DIR}/python/paddle/v2/master/
+    )
+    add_dependencies(copy_paddle_master paddle_master)
+  endif(WITH_GOLANG)
+endif()

 set(MKL_SHARED_LIBS "")
 set(MKL_DEPENDS "")
@@ -59,23 +61,28 @@ add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
         COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_PYTHON_BUILD_DIR}/lib* ${PADDLE_PYTHON_BUILD_DIR}/lib-python
         DEPENDS gen_proto_py copy_paddle_pybind framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER})

-set(paddle_python_deps ${PADDLE_PYTHON_BUILD_DIR}/.timestamp paddle_pserver_main paddle_trainer paddle_merge_model ${MKL_DEPENDS})
-if(WITH_SWIG_PY)
-  list(APPEND paddle_python_deps python_api_wheel)
+set(paddle_python_deps ${PADDLE_PYTHON_BUILD_DIR}/.timestamp ${MKL_DEPENDS})
+if(NOT WITH_FLUID)
+  set(paddle_python_deps ${paddle_python_deps} paddle_pserver_main paddle_trainer paddle_merge_model)
+  if(WITH_SWIG_PY)
+    list(APPEND paddle_python_deps python_api_wheel)
+  endif()
 endif()
 add_custom_target(paddle_python ALL DEPENDS ${paddle_python_deps})

 set(PADDLE_PYTHON_PACKAGE_DIR ${CMAKE_CURRENT_BINARY_DIR}/dist/)

 if (WITH_TESTING)
-  add_subdirectory(paddle/trainer_config_helpers/tests)
-  if (WITH_SWIG_PY)
-    # enable v2 API unittest only when paddle swig api is compiled
-    add_subdirectory(paddle/v2/tests)
-    add_subdirectory(paddle/v2/reader/tests)
-    add_subdirectory(paddle/v2/plot/tests)
-    add_subdirectory(paddle/fluid/tests)
+  if(NOT WITH_FLUID)
+    add_subdirectory(paddle/trainer_config_helpers/tests)
+    if (WITH_SWIG_PY)
+      # enable v2 API unittest only when paddle swig api is compiled
+      add_subdirectory(paddle/v2/tests)
+      add_subdirectory(paddle/v2/reader/tests)
+      add_subdirectory(paddle/v2/plot/tests)
+    endif()
   endif()
+  add_subdirectory(paddle/fluid/tests)
 endif()
 install(DIRECTORY ${PADDLE_PYTHON_PACKAGE_DIR}
     DESTINATION opt/paddle/share/wheels
@@ -248,12 +248,15 @@ def _callback_lookup_(op):
                     if o_argu in self.param_grad_names:
                         allreduce_out_name = o_argu + "__nccl_all_reduce__"
                         op_desc = _create_op_desc_(
-                            "ncclAllReduce", {
+                            "ncclReduce",
+                            {
                                 "X": [o_argu],
                                 "Communicator":
                                 ['nccl_com__do_not_change_']
-                            }, {"Out": [allreduce_out_name]},
-                            {"reduction": "ncclSum"})
+                            },
+                            {"Out": [allreduce_out_name]},
+                            {"reduction": "ncclSum",
+                             "root": 0}, )
                         block.desc.append_op().copy_from(op_desc)
                         op_desc = _create_op_desc_(
@@ -45,31 +45,13 @@ __activations__ = [
 ]

 __all__ = [
-    'mean',
-    'mul',
-    'reshape',
-    'scale',
-    'sigmoid_cross_entropy_with_logits',
-    'elementwise_add',
-    'elementwise_div',
-    'elementwise_sub',
-    'elementwise_mul',
-    'elementwise_max',
-    'elementwise_min',
-    'elementwise_pow',
-    'clip',
-    'clip_by_norm',
-    'softmax',
-    'sequence_softmax',
-    'logical_and',
-    'logical_or',
-    'logical_xor',
-    'logical_not',
-    'uniform_random',
-    'uniform_random_batch_size_like',
-    'gaussian_random',
-    'gaussian_random_batch_size_like',
-    'cumsum',
+    'mean', 'mul', 'reshape', 'scale', 'sigmoid_cross_entropy_with_logits',
+    'elementwise_add', 'elementwise_div', 'elementwise_sub', 'elementwise_mul',
+    'elementwise_max', 'elementwise_min', 'elementwise_pow', 'clip',
+    'clip_by_norm', 'softmax', 'sequence_softmax', 'logical_and', 'logical_or',
+    'logical_xor', 'logical_not', 'uniform_random',
+    'uniform_random_batch_size_like', 'gaussian_random',
+    'gaussian_random_batch_size_like', 'cumsum', 'scatter'
 ] + __activations__

 for _OP in set(__all__):
@@ -14,6 +14,7 @@

 import unittest
 import numpy as np
+import paddle.fluid.core as core
 from op_test import OpTest
@@ -69,5 +70,42 @@ class TestMulOp2(OpTest):
             ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))


+class TestFP16MulOp1(OpTest):
+    def setUp(self):
+        self.op_type = "mul"
+        x = np.random.random((32, 84)).astype("float16")
+        y = np.random.random((84, 100)).astype("float16")
+        self.inputs = {'X': x.view(np.uint16), 'Y': y.view(np.uint16)}
+        self.outputs = {'Out': np.dot(x, y)}
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-1)
+
+
+class TestFP16MulOp2(OpTest):
+    def setUp(self):
+        self.op_type = "mul"
+        x = np.random.random((15, 4, 12, 10)).astype("float16")
+        y = np.random.random((4, 30, 8, 2, 9)).astype("float16")
+        self.inputs = {'X': x.view(np.uint16), 'Y': y.view(np.uint16)}
+        self.attrs = {
+            'x_num_col_dims': 2,
+            'y_num_col_dims': 2,
+        }
+        result = np.dot(
+            x.reshape(15 * 4, 12 * 10), y.reshape(4 * 30, 8 * 2 * 9))
+        result = result.reshape(15, 4, 8, 2, 9)
+        self.outputs = {'Out': result}
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=2e-1)
+
+
 if __name__ == "__main__":
     unittest.main()
@@ -25,7 +25,7 @@ class TestScatterOp(OpTest):
         updates_np = np.random.random((2, 3)).astype("float32")
         output_np = np.copy(ref_np)
         output_np[index_np] = updates_np
-        self.inputs = {'Ref': ref_np, 'Index': index_np, 'Updates': updates_np}
+        self.inputs = {'X': ref_np, 'Ids': index_np, 'Updates': updates_np}
         self.outputs = {'Out': output_np}

     def test_check_output(self):
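For reference, a small NumPy-only sketch of the expectation the updated test builds, using the renamed operator inputs (X/Ids/Updates instead of Ref/Index/Updates). It is not part of the commit, and the shapes are illustrative:

    import numpy as np

    x = np.ones((3, 3)).astype("float32")           # scatter input, was "Ref"
    ids = np.array([1, 2]).astype("int32")          # row indices, was "Index"
    updates = np.random.random((2, 3)).astype("float32")

    out = np.copy(x)
    out[ids] = updates   # the expected Out that TestScatterOp above compares against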
@@ -62,20 +62,22 @@ write_version_py(filename='@PADDLE_SOURCE_DIR@/python/paddle/version.py')

 packages=['paddle',
-          'paddle.proto',
-          'paddle.trainer',
-          'paddle.trainer_config_helpers',
           'paddle.utils',
-          'paddle.v2',
-          'paddle.v2.dataset',
-          'paddle.v2.reader',
-          'paddle.v2.master',
-          'paddle.v2.plot',
           'paddle.fluid',
           'paddle.fluid.proto',
           'paddle.fluid.proto.profiler',
-          'paddle.fluid.layers',
-          'py_paddle']
+          'paddle.fluid.layers']
+
+if '${WITH_FLUID}'== 'OFF':
+    packages+=['paddle.proto',
+               'paddle.trainer',
+               'paddle.trainer_config_helpers',
+               'paddle.v2',
+               'paddle.v2.dataset',
+               'paddle.v2.reader',
+               'paddle.v2.master',
+               'paddle.v2.plot',
+               'py_paddle']

 with open('@PADDLE_SOURCE_DIR@/python/requirements.txt') as f:
     setup_requires = f.read().splitlines()
@@ -84,11 +86,29 @@ if '${CMAKE_SYSTEM_PROCESSOR}' not in ['arm', 'armv7-a', 'aarch64']:
     setup_requires+=['opencv-python']

 # the prefix is sys.prefix which should always be usr
-paddle_bin_dir = 'opt/paddle/bin'
-paddle_bins = ['${PADDLE_BINARY_DIR}/paddle/trainer/paddle_trainer',
-               '${PADDLE_BINARY_DIR}/paddle/trainer/paddle_merge_model',
-               '${PADDLE_BINARY_DIR}/paddle/pserver/paddle_pserver_main',
-               '${PADDLE_BINARY_DIR}/paddle/scripts/paddle']
+paddle_bins = ''
+if '${WITH_FLUID}'== 'OFF':
+    paddle_bin_dir = 'opt/paddle/bin'
+    paddle_bins = ['${PADDLE_BINARY_DIR}/paddle/trainer/paddle_trainer',
+                   '${PADDLE_BINARY_DIR}/paddle/trainer/paddle_merge_model',
+                   '${PADDLE_BINARY_DIR}/paddle/pserver/paddle_pserver_main',
+                   '${PADDLE_BINARY_DIR}/paddle/scripts/paddle']
+
+package_data={'paddle.fluid': ['core.so']}
+if '${WITH_FLUID}'== 'OFF':
+    package_data['paddle.v2.master']=['libpaddle_master.so']
+    package_data['py_paddle']=['*.py','_swig_paddle.so']
+
+package_dir={
+    '': '${CMAKE_CURRENT_SOURCE_DIR}',
+    # The paddle.fluid.proto will be generated while compiling.
+    # So that package points to other directory.
+    'paddle.fluid.proto.profiler': '${PADDLE_BINARY_DIR}/paddle/fluid/platform',
+    'paddle.fluid.proto': '${PADDLE_BINARY_DIR}/paddle/fluid/framework',
+}
+if '${WITH_FLUID}'== 'OFF':
+    package_dir['py_paddle']='${PADDLE_SOURCE_DIR}/paddle/py_paddle'
+
 paddle_rt_lib_dir = 'lib'
 paddle_rt_libs = ['${WARPCTC_LIBRARIES}']
@@ -101,19 +121,8 @@ setup(name='${PACKAGE_NAME}',
       install_requires=setup_requires,
       packages=packages,
       ext_modules=[Extension('_foo', ['stub.cc'])],
-      package_data={
-          'paddle.v2.master': ['libpaddle_master.so'],
-          'paddle.fluid': ['core.so'],
-          'py_paddle':['*.py','_swig_paddle.so']
-      },
-      package_dir={
-          '': '${CMAKE_CURRENT_SOURCE_DIR}',
-          # The paddle.fluid.proto will be generated while compiling.
-          # So that package points to other directory.
-          'paddle.fluid.proto.profiler': '${PADDLE_BINARY_DIR}/paddle/fluid/platform',
-          'paddle.fluid.proto': '${PADDLE_BINARY_DIR}/paddle/fluid/framework',
-          'py_paddle': '${PADDLE_SOURCE_DIR}/paddle/py_paddle'
-      },
+      package_data=package_data,
+      package_dir=package_dir,
       scripts=paddle_bins,
       data_files=[(paddle_rt_lib_dir, paddle_rt_libs)]
 )