Commit 8314412b authored by Y yangyaming

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into fix-7717

@@ -26,8 +26,8 @@ glu
:noindex:

-dot_product_attention
-.. autofunction:: paddle.v2.fluid.nets.dot_product_attention
scaled_dot_product_attention
----------------------------
.. autofunction:: paddle.v2.fluid.nets.scaled_dot_product_attention
:noindex:
@@ -152,12 +152,12 @@ for data in train_reader():
`JobDesc` object describe the distributed job resource specification to run on
Cluster environment.

-<img src="src/remote_executor.png"/>
<img src="src/remote_executor.png" width="500" align="center" />

`RemoteExecutor.run` sends the `ProgramDesc` and
[TrainingJob](https://github.com/PaddlePaddle/cloud/blob/develop/doc/autoscale/README.md#training-job-resource)
to a server in the cluster which executes `RemoteExecutor.listen`. This server is responsible
-to start the final Kubernetes Jobs to run the different role of `ProgramDesc`.
to start the final Kubernetes Jobs to run the different role of `ProgramDesc` from `ConfigMap`.

### Placement Algorithm
......
@@ -74,7 +74,8 @@ cc_library(backward SRCS backward.cc DEPS net_op)
cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context fill_constant_op)
cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)
-cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto backward glog lod_rank_table)
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope
framework_proto backward glog lod_rank_table profiler)
cc_library(prune SRCS prune.cc DEPS framework_proto)
cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
......
@@ -22,6 +22,7 @@ limitations under the License. */
#include "paddle/framework/lod_tensor_array.h"
#include "paddle/framework/op_registry.h"
#include "paddle/platform/place.h"
#include "paddle/platform/profiler.h"
DECLARE_bool(do_memory_benchmark);
DEFINE_bool(check_nan_inf, false,
@@ -117,6 +118,10 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
for (auto& op_desc : block.AllOps()) {
auto op = paddle::framework::OpRegistry::CreateOp(*op_desc);
VLOG(4) << op->DebugStringEx(local_scope);
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
platform::RecordEvent record_event(op->Type(), pool.Get(place_));
op->Run(*local_scope, place_);
VLOG(3) << op->DebugStringEx(local_scope);
if (FLAGS_do_memory_benchmark) {
......
@@ -991,8 +991,10 @@ TEST(Layer, SequenceLastInstanceLayer) {
"seqlastins",
"non-seq",
-1);  // hasSubseq seqlastins to non-seq
-testDegradeLayer(
-    true, "seqlastins", "seq", -1);  // hasSubseq seqlastins to seq
testDegradeLayer(true,
"seqlastins",
"seq",
-1);  // hasSubseq seqlastins to seq
}
TEST(Layer, AverageLayer) {
@@ -1001,8 +1003,10 @@ TEST(Layer, AverageLayer) {
"average",
"non-seq",
5);  // seq average to a shorten seq, stride window = 5
-testDegradeLayer(
-    true, "average", "non-seq", -1);  // hasSubseq average to non-seq
testDegradeLayer(true,
"average",
"non-seq",
-1);  // hasSubseq average to non-seq
testDegradeLayer(true, "average", "seq", -1);  // hasSubseq average to seq
}
@@ -1287,8 +1291,9 @@ TEST(Layer, PoolLayer) {
testPoolLayer("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true);
testPoolLayer2("cudnn-max-pool", /* trans= */ false, /* useGpu= */ true);
testPoolLayer2("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true);
-testPoolLayer2(
-    "cudnn-avg-incl-pad-pool", /* trans= */ false, /* useGpu= */ true);
testPoolLayer2("cudnn-avg-incl-pad-pool",
/* trans= */ false,
/* useGpu= */ true);
testPoolLayer("max-pool-with-mask", /* trans= */ false, /* useGpu= */ true);
#endif
}
@@ -2431,18 +2436,21 @@ TEST(Layer, test3DDeConvLayer) {
}
TEST(Layer, ScaleShiftLayer) {
-const size_t batchSize = 16;
-const size_t size = 32;
-TestConfig config;
-config.layerConfig.set_type("scale_shift");
-config.layerConfig.set_size(size);
-config.biasSize = 1;
-config.inputDefs.push_back(
-    {INPUT_DATA, "input", /* dim= */ size, /* paraSize= */ 1});
-config.layerConfig.add_inputs();
-for (auto useGpu : {false, true}) {
-  testLayerGrad(config, "scale_shift", batchSize, false, useGpu, false);
-}
// FIXME: Disable ScaleShiftLayer because it is not stable.
// https://github.com/PaddlePaddle/Paddle/issues/7781
return;
//  const size_t batchSize = 16;
//  const size_t size = 32;
//  TestConfig config;
//  config.layerConfig.set_type("scale_shift");
//  config.layerConfig.set_size(size);
//  config.biasSize = 1;
//  config.inputDefs.push_back(
//      {INPUT_DATA, "input", /* dim= */ size, /* paraSize= */ 1});
//  config.layerConfig.add_inputs();
//  for (auto useGpu : {false, true}) {
//    testLayerGrad(config, "scale_shift", batchSize, false, useGpu, false);
//  }
}
TEST(Layer, ScaleSubRegionLayer) {
......
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "grpc_client.h"
#include "paddle/framework/threadpool.h"
namespace paddle {
namespace operators {
namespace detail {
@@ -22,25 +23,32 @@ bool RPCClient::AsyncSendVariable(const std::string& ep,
const framework::Scope& scope,
const std::string& var_name,
int64_t time_out) {
-sendrecv::VariableMessage req;
-auto* var = scope.FindVar(var_name);
-SerializeToMessage(var_name, var, ctx, &req);
-// varhandle
-VarHandle var_h;
-var_h.ep = ep;
-var_h.scope = &scope;
-var_h.name = var_name;
-var_h.ctx = &ctx;
-// stub context
-auto ch = GetChannel(ep);
-SendProcessor* s = new SendProcessor(ch);
-s->Prepare(var_h, time_out);
-s->response_call_back_ = NULL;
-auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
-rpc->Finish(&s->reply_, &s->status_, (void*)s);
const platform::DeviceContext* p_ctx = &ctx;
const std::string ep_val = ep;
const std::string var_name_val = var_name;
const framework::Scope* p_scope = &scope;
const auto ch = GetChannel(ep_val);
framework::Async([var_name_val, p_ctx, ep_val, p_scope, time_out, ch, this] {
auto* var = p_scope->FindVar(var_name_val);
sendrecv::VariableMessage req;
SerializeToMessage(var_name_val, var, *p_ctx, &req);
// varhandle
VarHandle var_h;
var_h.ep = ep_val;
var_h.scope = p_scope;
var_h.name = var_name_val;
var_h.ctx = p_ctx;
// stub context
SendProcessor* s = new SendProcessor(ch);
s->Prepare(var_h, time_out);
s->response_call_back_ = NULL;
auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
rpc->Finish(&s->reply_, &s->status_, (void*)s);
});
req_count_++;
@@ -50,8 +58,6 @@ bool RPCClient::AsyncSendVariable(const std::string& ep,
void ProcGetResponse(const VarHandle& var_h,
const sendrecv::VariableMessage& ret_msg) {
auto* outvar = var_h.scope->FindVar(var_h.name);
-std::istringstream iss(ret_msg.serialized());
DeserializeFromMessage(ret_msg, *var_h.ctx, outvar);
}
@@ -60,24 +66,31 @@ bool RPCClient::AsyncGetVariable(const std::string& ep,
const framework::Scope& scope,
const std::string& var_name,
int64_t time_out) {
-sendrecv::VariableMessage req;
-req.set_varname(var_name);
-// varhandle
-VarHandle var_h;
-var_h.ep = ep;
-var_h.scope = &scope;
-var_h.name = var_name;
-var_h.ctx = &ctx;
-// stub context
-auto ch = GetChannel(ep);
-GetProcessor* s = new GetProcessor(ch);
-s->Prepare(var_h, time_out);
-s->response_call_back_ = ProcGetResponse;
-auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_);
-rpc->Finish(&s->reply_, &s->status_, (void*)s);
const platform::DeviceContext* p_ctx = &ctx;
const std::string ep_val = ep;
const std::string var_name_val = var_name;
const framework::Scope* p_scope = &scope;
const auto ch = GetChannel(ep_val);
framework::Async([var_name_val, ep_val, p_scope, p_ctx, time_out, ch, this] {
sendrecv::VariableMessage req;
req.set_varname(var_name_val);
// varhandle
VarHandle var_h;
var_h.ep = ep_val;
var_h.scope = p_scope;
var_h.name = var_name_val;
var_h.ctx = p_ctx;
// stub context
GetProcessor* s = new GetProcessor(ch);
s->Prepare(var_h, time_out);
s->response_call_back_ = ProcGetResponse;
auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_);
rpc->Finish(&s->reply_, &s->status_, (void*)s);
});
req_count_++;
@@ -85,19 +98,31 @@ bool RPCClient::AsyncGetVariable(const std::string& ep,
}
bool RPCClient::Wait() {
-bool ok = true;
-while (true) {
-  if (req_count_ <= 0) {
-    break;
-  }
-  if (!Proceed()) {
-    return false;
-  }
-}
-return ok;
if (req_count_ <= 0) {
return true;
}
std::vector<bool> a(req_count_);
std::vector<std::future<void>> waits(req_count_);
for (int i = 0; i < req_count_; i++) {
waits[i] = framework::Async([i, &a, this] { a[i] = Proceed(); });
}
for (int i = 0; i < req_count_; i++) {
waits[i].wait();
}
int last_req_count = req_count_;
req_count_ = 0;
for (int i = 0; i < last_req_count; i++) {
if (!a[i]) {
return false;
}
}
return true;
}
bool RPCClient::Proceed() {
@@ -124,7 +149,6 @@ bool RPCClient::Proceed() {
c->Process();
delete c;
-req_count_--;
return true;
}
......
@@ -79,7 +79,7 @@ class Im2SequenceKernel : public framework::OpKernel<T> {
framework::LoD lod(1);
lod[0].reserve(batch_size + 1);
for (int i = 0, offset = 0; i < batch_size + 1; ++i) {
-lod[0][i] = offset;
lod[0].push_back(offset);
offset += output_height * output_width;
}
out->set_lod(lod);
......
@@ -90,14 +90,10 @@ Reshape Operator.
Reshape Input(X) into the shape specified by Attr(shape).
An example:
-Given a 2-D tensor X with 2 rows and 2 columns
-    [[1, 2], [3, 4]]
Given a 2-D tensor X with 2 rows and 2 columns : [[1, 2], [3, 4]]
and target shape = [1, 4], the reshape operator will transform
-the tensor X into a 2-D tensor:
-    [[1, 2, 3, 4]]
the tensor X into a 2-D tensor: [[1, 2, 3, 4]]
One dimension in the target shape can be set -1, representing that its
size is unknown. In this case, the real dimension will be infered from
......
@@ -47,16 +47,16 @@ inline uint64_t GetTimeInNsec() {
}
Event::Event(EventKind kind, std::string name, uint32_t thread_id,
-             DeviceContext* dev_ctx)
             const DeviceContext* dev_ctx)
    : kind_(kind), name_(name), thread_id_(thread_id), has_cuda_(false) {
#ifdef PADDLE_WITH_CUDA
-auto* cuda_dev_ctx = static_cast<const CUDADeviceContext*>(dev_ctx);
-if (cuda_dev_ctx) {
has_cuda_ = dev_ctx ? platform::is_gpu_place(dev_ctx->GetPlace()) : false;
if (has_cuda_) {
auto* cuda_dev_ctx = static_cast<const CUDADeviceContext*>(dev_ctx);
PADDLE_ENFORCE(cudaGetDevice(&device_));
PADDLE_ENFORCE(cudaEventCreate(&event_));
auto stream = cuda_dev_ctx->stream();
PADDLE_ENFORCE(cudaEventRecord(event_, stream));
-has_cuda_ = true;
}
#endif
cpu_ns_ = GetTimeInNsec();
@@ -114,19 +114,20 @@ inline EventList& GetEventList() {
return *g_event_list;
}
-void Mark(const std::string& name, DeviceContext* dev_ctx) {
void Mark(const std::string& name, const DeviceContext* dev_ctx) {
GetEventList().Record(EventKind::kMark, name, g_thread_id, dev_ctx);
}
-void PushEvent(const std::string& name, DeviceContext* dev_ctx) {
void PushEvent(const std::string& name, const DeviceContext* dev_ctx) {
GetEventList().Record(EventKind::kPushRange, name, g_thread_id, dev_ctx);
}
-void PopEvent(const std::string& name, DeviceContext* dev_ctx) {
void PopEvent(const std::string& name, const DeviceContext* dev_ctx) {
GetEventList().Record(EventKind::kPopRange, name, g_thread_id, dev_ctx);
}
-RecordEvent::RecordEvent(const std::string& name, DeviceContext* dev_ctx) {
RecordEvent::RecordEvent(const std::string& name,
                         const DeviceContext* dev_ctx) {
if (g_state == ProfilerState::kDisabled) return;
dev_ctx_ = dev_ctx;
name_ = name;
@@ -155,6 +156,7 @@ void EnableProfiler(ProfilerState state) {
DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(d));
Mark("_cuda_startup_", dev_ctx);
dev_ctx->Wait();
delete dev_ctx;
});
}
}
@@ -163,14 +165,17 @@ void EnableProfiler(ProfilerState state) {
Mark("_start_profiler_", nullptr);
}
-std::vector<std::vector<Event>> DisableProfiler() {
-  PADDLE_ENFORCE(g_state != ProfilerState::kDisabled,
-                 "Can't disable profiling, since it's not starting.");
-  // Mark the profiling stop.
-  Mark("_stop_profiler_", nullptr);
-  g_state = ProfilerState::kDisabled;
-  std::vector<std::vector<Event>> result;
void ResetProfiler() {
std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end();
++it) {
(*it)->Clear();
}
}
std::vector<std::vector<Event>> GetAllEvents() {
std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
std::vector<std::vector<Event>> result;
for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end();
++it) {
result.emplace_back((*it)->Reduce());
@@ -178,6 +183,18 @@ std::vector<std::vector<Event>> DisableProfiler() {
return result;
}
void DisableProfiler(EventSortingKey sorted_key) {
PADDLE_ENFORCE(g_state != ProfilerState::kDisabled,
               "Can't disable profiling, since it's not starting.");
// Mark the profiling stop.
Mark("_stop_profiler_", nullptr);
g_state = ProfilerState::kDisabled;
std::vector<std::vector<Event>> all_events = GetAllEvents();
ParseEvents(all_events, sorted_key);
ResetProfiler();
}
void ParseEvents(std::vector<std::vector<Event>>& events,
                 EventSortingKey sorted_by) {
if (g_profiler_place == "") return;
@@ -291,12 +308,12 @@ void ParseEvents(std::vector<std::vector<Event>>& events,
}
// Print report
-PrintProfilingReport(events_table, sorted_domain, max_name_width + 4, 12);
PrintProfiler(events_table, sorted_domain, max_name_width + 4, 12);
}
-void PrintProfilingReport(std::vector<std::vector<EventItem>>& events_table,
void PrintProfiler(std::vector<std::vector<EventItem>>& events_table,
                   std::string& sorted_domain, const size_t name_width,
                   const size_t data_width) {
// Output header information
std::cout << "\n------------------------->"
          << " Profiling Report "
......
@@ -29,7 +29,7 @@ class Event {
// The DeviceContext is used to get the cuda stream.
// If CPU profiling mode, can pass nullptr.
Event(EventKind kind, std::string name, uint32_t thread_id,
-      DeviceContext* dev_ctx);
      const DeviceContext* dev_ctx);
std::string kind() const;
std::string name() const { return name_; }
@@ -84,6 +84,8 @@ struct EventList {
return result;
}
void Clear() { event_blocks.clear(); }
std::forward_list<std::vector<Event>> event_blocks;
};
@@ -93,29 +95,26 @@ enum ProfilerState {
kCUDA,  // GPU profiling state
};
-void Mark(const std::string& name, DeviceContext* dev_ctx);
-void PushEvent(const std::string& name, DeviceContext* dev_ctx);
-void PopEvent(const std::string& name, DeviceContext* dev_ctx);
void Mark(const std::string& name, const DeviceContext* dev_ctx);
void PushEvent(const std::string& name, const DeviceContext* dev_ctx);
void PopEvent(const std::string& name, const DeviceContext* dev_ctx);
struct RecordEvent {
-explicit RecordEvent(const std::string& name, DeviceContext* dev_ctx);
explicit RecordEvent(const std::string& name, const DeviceContext* dev_ctx);
~RecordEvent();
// The device context is used by Event to get the current cuda stream.
-DeviceContext* dev_ctx_;
const DeviceContext* dev_ctx_;
// Event name
std::string name_;
};
-// Enable the profiling function.
-void EnableProfiler(ProfilerState state);
// Return the event list of all threads. Asummed the returned value calls
// event_lists, event_lists[i][j] represents the j-th Event of i-th thread.
-std::vector<std::vector<Event>> DisableProfiler();
std::vector<std::vector<Event>> GetAllEvents();
// The information of each event given in the profiling report
struct EventItem {
@@ -130,13 +129,22 @@ struct EventItem {
// Candidate keys to sort the profiling report
enum EventSortingKey { kDefault, kCalls, kTotal, kMin, kMax, kAve };
// Enable the profiling function.
void EnableProfiler(ProfilerState state);
// Clear the g_all_event_lists, which is total event lists of all threads.
void ResetProfiler();
void DisableProfiler(EventSortingKey sorted_key);
// Parse the event list and output the profiling report
void ParseEvents(std::vector<std::vector<Event>>&,
                 EventSortingKey sorted_by = EventSortingKey::kDefault);
// Print results
-void PrintProfilingReport(std::vector<std::vector<EventItem>>& events_table,
void PrintProfiler(std::vector<std::vector<EventItem>>& events_table,
                   std::string& sorted_domain, const size_t name_width,
                   const size_t data_width);
}  // namespace platform
}  // namespace paddle
@@ -103,18 +103,14 @@ TEST(RecordEvent, RecordEvent) {
// Bad Usage:
PushEvent("event_without_pop", dev_ctx);
PopEvent("event_without_push", dev_ctx);
-std::vector<std::vector<Event>> events = paddle::platform::DisableProfiler();
-// Will remove parsing-related code from test later
-ParseEvents(events, EventSortingKey::kTotal);
std::vector<std::vector<Event>> events = paddle::platform::GetAllEvents();
int cuda_startup_count = 0;
int start_profiler_count = 0;
-int stop_profiler_count = 0;
for (size_t i = 0; i < events.size(); ++i) {
for (size_t j = 0; j < events[i].size(); ++j) {
if (events[i][j].name() == "_cuda_startup_") ++cuda_startup_count;
if (events[i][j].name() == "_start_profiler_") ++start_profiler_count;
-if (events[i][j].name() == "_stop_profiler_") ++stop_profiler_count;
if (events[i][j].name() == "push") {
EXPECT_EQ(events[i][j + 1].name(), "pop");
#ifdef PADDLE_WITH_CUDA
@@ -127,5 +123,7 @@ TEST(RecordEvent, RecordEvent) {
}
EXPECT_EQ(cuda_startup_count % 5, 0);
EXPECT_EQ(start_profiler_count, 1);
-EXPECT_EQ(stop_profiler_count, 1);
// Will remove parsing-related code from test later
DisableProfiler(EventSortingKey::kTotal);
}
if(WITH_PYTHON)
cc_library(paddle_pybind SHARED
SRCS pybind.cc exception.cc protobuf.cc const_value.cc
-DEPS pybind python backward proto_desc paddle_memory executor prune init
DEPS pybind python backward proto_desc paddle_memory executor prune init profiler
${GLOB_OP_LIB})
if(NOT APPLE AND NOT ANDROID)
target_link_libraries(paddle_pybind rt)
......
@@ -17,6 +17,7 @@ limitations under the License. */
#include <Python.h>
#include <fstream>
#include <vector>
#include "paddle/platform/variant.h"
#include "pybind11/numpy.h"
#include "pybind11/pybind11.h"
#include "pybind11/stl.h"
......
@@ -30,6 +30,7 @@ limitations under the License. */
#include "paddle/operators/net_op.h"
#include "paddle/platform/enforce.h"
#include "paddle/platform/place.h"
#include "paddle/platform/profiler.h"
#include "paddle/pybind/const_value.h"
#include "paddle/pybind/exception.h"
#include "paddle/pybind/pybind.h"
@@ -52,7 +53,7 @@ static size_t UniqueIntegerGenerator(const std::string &prefix) {
return generators[prefix].fetch_add(1);
}
-bool IsCompileGPU() {
bool IsCompiledWithCUDA() {
#ifndef PADDLE_WITH_CUDA
return false;
#else
@@ -430,7 +431,7 @@ All parameter, weight, gradient are variables in Paddle.
m.def("init_glog", framework::InitGLOG);
m.def("init_devices", &framework::InitDevices);
-m.def("is_compile_gpu", IsCompileGPU);
m.def("is_compiled_with_cuda", IsCompiledWithCUDA);
m.def("set_feed_variable", framework::SetFeedVariable);
m.def("get_fetch_variable", framework::GetFetchVariable);
@@ -476,6 +477,24 @@ All parameter, weight, gradient are variables in Paddle.
m.def("nvprof_stop", platform::CudaProfilerStop);
#endif
py::enum_<platform::ProfilerState>(m, "ProfilerState", py::arithmetic())
.value("kDisabled", platform::ProfilerState::kDisabled)
.value("kCPU", platform::ProfilerState::kCPU)
.value("kCUDA", platform::ProfilerState::kCUDA)
.export_values();
py::enum_<platform::EventSortingKey>(m, "EventSortingKey", py::arithmetic())
.value("kDefault", platform::EventSortingKey::kDefault)
.value("kCalls", platform::EventSortingKey::kCalls)
.value("kTotal", platform::EventSortingKey::kTotal)
.value("kMin", platform::EventSortingKey::kMin)
.value("kMax", platform::EventSortingKey::kMax)
.value("kAve", platform::EventSortingKey::kAve)
.export_values();
m.def("enable_profiler", platform::EnableProfiler);
m.def("disable_profiler", platform::DisableProfiler);
m.def("reset_profiler", platform::ResetProfiler);
return m.ptr();
}
}  // namespace pybind
......
@@ -89,7 +89,7 @@ def __bootstrap__():
read_env_flags = [
'use_pinned_memory', 'check_nan_inf', 'do_memory_benchmark'
]
-if core.is_compile_gpu():
if core.is_compiled_with_cuda():
read_env_flags += ['fraction_of_gpu_memory_to_use', 'op_sync']
core.init_gflags([sys.argv[0]] +
["--tryfromenv=" + ",".join(read_env_flags)])
......
@@ -178,7 +178,7 @@ def _remove_no_grad_branch_(op_descs, no_grad_set):
if _all_in_set_(
filter(lambda name: name.find(core.grad_var_suffix()) != -1,
op_desc.input_arg_names()), no_grad_set):
-no_grad_set.union(out_arg_names)
no_grad_set.update(out_arg_names)
return True
return False
......
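The one-line change above fixes a silent no-op: `set.union` returns a new set and leaves the receiver untouched, so the result was simply discarded, while `set.update` mutates the set in place. A standalone sketch (the variable names below are illustrative, not taken from backward.py):

```python
no_grad_set = set(["fc_0.w_0@GRAD"])
out_arg_names = ["fc_0.b_0@GRAD"]

no_grad_set.union(out_arg_names)    # returns a new set; no_grad_set is unchanged
assert "fc_0.b_0@GRAD" not in no_grad_set

no_grad_set.update(out_arg_names)   # modifies no_grad_set in place
assert "fc_0.b_0@GRAD" in no_grad_set
```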
@@ -15,6 +15,7 @@
import os
import cPickle as pickle
from paddle.v2.fluid.evaluator import Evaluator
from paddle.v2.fluid.framework import Program, Parameter, default_main_program, Variable
from . import core
@@ -187,8 +188,14 @@ def get_inference_program(target_vars, main_program=None):
main_program = default_main_program()
if not isinstance(target_vars, list):
target_vars = [target_vars]
-pruned_program = main_program.prune(targets=target_vars)
vars = []
for var in target_vars:
if isinstance(var, Evaluator):
vars.append(var.states)
vars.append(var.metrics)
else:
vars.append(var)
pruned_program = main_program.prune(targets=vars)
inference_program = pruned_program.inference_optimize()
return inference_program
......
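With the change above, `get_inference_program` accepts `Evaluator` objects alongside ordinary Variables and expands them into their `states` and `metrics` before pruning. A hedged usage sketch (the tiny network is made up for the example, and it assumes the function is reached through `paddle.v2.fluid.io`):

```python
import paddle.v2.fluid as fluid

image = fluid.layers.data(name='x', shape=[784], dtype='float32')
label = fluid.layers.data(name='y', shape=[1], dtype='int64')
predict = fluid.layers.fc(input=image, size=10, act='softmax')
accuracy = fluid.evaluator.Accuracy(input=predict, label=label)

# The Evaluator's state and metric variables are kept in the pruned program.
inference_program = fluid.io.get_inference_program([predict, accuracy])
```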
@@ -111,6 +111,7 @@ class LayerHelper(object):
is_bias=False,
default_initializer=None):
# Deepcopy the attr so that parameters can be shared in program
attr = copy.deepcopy(attr)
assert isinstance(attr, ParamAttr)
suffix = 'b' if is_bias else 'w'
......
This diff is collapsed.
@@ -11,14 +11,13 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import layers
__all__ = [
"simple_img_conv_pool",
"sequence_conv_pool",
"glu",
-"dot_product_attention",
"scaled_dot_product_attention",
]
@@ -160,7 +159,11 @@ def glu(input, dim=-1):
return out
-def dot_product_attention(querys, keys, values):
def scaled_dot_product_attention(queries,
keys,
values,
num_heads=1,
dropout_rate=0.):
"""
The dot-product attention.
@@ -174,39 +177,162 @@ def dot_product_attention(querys, keys, values):
.. math::
Attention(Q, K, V)= softmax(QK^\mathrm{T})V
Refer to `Attention Is All You Need
<https://arxiv.org/pdf/1706.03762.pdf>`_.
Note that batch data containing sequences with different lengths is not
supported by this because of the (batch) matrix multipication.
Args:
-query (Variable): The input variable which is a Tensor or LoDTensor.
-key (Variable): The input variable which is a Tensor or LoDTensor.
-value (Variable): The input variable which is a Tensor or LoDTensor.
queries (Variable): The input variable which should be a 3-D Tensor.
keys (Variable): The input variable which should be a 3-D Tensor.
values (Variable): The input variable which should be a 3-D Tensor.
num_heads (int): Head number to compute the scaled dot product
attention. Default value is 1.
dropout_rate (float): The dropout rate to drop the attention weight.
Default value is 0.
Returns:
-tuple: The Tensor variables representing the output and attention scores.
Variable: A 3-D Tensor computed by multi-head scaled dot product
attention.
Raises:
ValueError: If input queries, keys, values are not 3-D Tensors.
NOTE:
1. When num_heads > 1, three linear projections are learned respectively
to map input queries, keys and values into queries', keys' and values'.
queries', keys' and values' have the same shapes with queries, keys
and values.
1. When num_heads == 1, scaled_dot_product_attention has no learnable
parameters.
Examples:
.. code-block:: python
-# Suppose q, k, v are tensor variables with the following shape:
# Suppose q, k, v are Tensors with the following shape:
# q: [3, 5, 9], k: [3, 6, 9], v: [3, 6, 10]
-out, attn_scores = fluid.nets.dot_product_attention(q, k, v)
-out.shape  # [3, 5, 10]
-attn_scores.shape  # [3, 5, 6]
contexts = fluid.nets.scaled_dot_product_attention(q, k, v)
contexts.shape  # [3, 5, 10]
"""
-assert keys.shape[-2] == values.shape[-2], 'The shapes of keys and values mismatch.'
-assert querys.shape[-1] == keys.shape[-1], 'The shapes of querys and keys mismatch.'
-product = layers.matmul(x=querys, y=keys, transpose_y=True)
-attn_scores = layers.reshape(
-    x=layers.reshape(
-        x=product, shape=[-1, product.shape[-1]], act='softmax'),
-    shape=product.shape)
-out = layers.matmul(attn_scores, values)
-return out, attn_scores
if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3):
raise ValueError(
"Inputs quries, keys and values should all be 3-D tensors.")
if queries.shape[-1] != keys.shape[-1]:
raise ValueError(
"The hidden size of queries and keys should be the same.")
if keys.shape[-2] != values.shape[-2]:
raise ValueError(
"The max sequence length in query batch and in key batch "
"should be the same.")
if keys.shape[-1] % num_heads != 0:
raise ValueError("The hidden size of keys (%d) must be divisible "
"by the number of attention heads (%d)." %
(keys.shape[-1], num_heads))
if values.shape[-1] % num_heads != 0:
raise ValueError("The hidden size of values (%d) must be divisible "
"by the number of attention heads (%d)." %
(values.shape[-1], num_heads))
def __compute_qkv(queries, keys, values, num_heads):
"""
Add linear projection to queries, keys, and values.
Args:
queries(Tensor): a 3-D input Tensor.
keys(Tensor): a 3-D input Tensor.
values(Tensor): a 3-D input Tensor.
num_heads(int): The number of heads. Linearly project the inputs
ONLY when num_heads > 1.
Returns:
Tensor: linearly projected output Tensors: queries', keys' and
values'. They have the same shapes with queries, keys and
values.
"""
if num_heads == 1:
return queries, keys, values
q = layers.fc(input=queries, size=queries.shape[-1], num_flatten_dims=2)
k = layers.fc(input=keys, size=keys.shape[-1], num_flatten_dims=2)
v = layers.fc(input=values, size=values.shape[-1], num_flatten_dims=2)
return q, k, v
def __split_heads(x, num_heads):
"""
Reshape the last dimension of inpunt tensor x so that it becomes two
dimensions.
Args:
x(Tensor): a 3-D input Tensor.
num_heads(int): The number of heads.
Returns:
Tensor: a Tensor with shape [..., n, m/num_heads], where m is size
of the last dimension of x.
"""
if num_heads == 1:
return x
hidden_size = x.shape[-1]
# reshape the 3-D input: [batch_size, max_sequence_length, hidden_dim]
# into a 4-D output:
# [batch_size, max_sequence_length, num_heads, hidden_size_per_head].
reshaped = layers.reshape(
x=x,
shape=list(x.shape[:-1]) + [num_heads, hidden_size // num_heads])
# permuate the dimensions into:
# [batch_size, num_heads, max_sequence_len, hidden_size_per_head]
return layers.transpose(x=reshaped, perm=[0, 2, 1, 3])
def __combine_heads(x):
"""
Reshape the last two dimensions of inpunt tensor x so that it becomes
one dimension.
Args:
x(Tensor): a 4-D input Tensor with shape
[bs, num_heads, max_sequence_length, hidden_dim].
Returns:
Tensor: a Tensor with shape
[bs, max_sequence_length, num_heads * hidden_dim].
"""
if len(x.shape) == 3: return x
if len(x.shape) != 4:
raise ValueError("Input(x) should be a 4-D Tensor.")
trans_x = layers.transpose(x, perm=[0, 2, 1, 3])
return layers.reshape(
x=trans_x,
shape=map(int, [
trans_x.shape[0], trans_x.shape[1],
trans_x.shape[2] * trans_x.shape[3]
]))
q, k, v = __compute_qkv(queries, keys, values, num_heads)
q = __split_heads(q, num_heads)
k = __split_heads(k, num_heads)
v = __split_heads(v, num_heads)
key_dim_per_head = keys.shape[-1] // num_heads
scaled_q = layers.scale(x=q, scale=key_dim_per_head**-0.5)
product = layers.matmul(x=k, y=scaled_q, transpose_y=True)
weights = layers.reshape(
x=layers.reshape(
x=product, shape=[-1, product.shape[-1]], act="softmax"),
shape=product.shape)
if dropout_rate:
weights = layers.dropout(x, dropout_prob=dropout_rate, is_test=False)
ctx_multiheads = layers.matmul(weights, v)
return __combine_heads(ctx_multiheads)
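A usage sketch of the new `scaled_dot_product_attention`, condensed from the unit test added later in this commit (the shapes and argument choices follow that test):

```python
import paddle.v2.fluid as fluid

# 3-D inputs: [batch_size, max_sequence_length, hidden_size]
queries = fluid.layers.data(
    name="queries", shape=[3, 13, 16], dtype="float32",
    append_batch_size=False)
keys = fluid.layers.data(
    name="keys", shape=[3, 13, 16], dtype="float32",
    append_batch_size=False)

# With num_heads > 1 the inputs are linearly projected first; the output keeps
# the [3, 13, 16] shape of the values (here the keys are reused as values).
contexts = fluid.nets.scaled_dot_product_attention(
    queries=queries, keys=keys, values=keys, num_heads=8, dropout_rate=0.)
```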
@@ -63,3 +63,58 @@ def cuda_profiler(output_file, output_mode=None, config=None):
# Disables profiler collection.
core.nvprof_stop()
os.remove(config_file)
def reset_profiler():
"""The profiler clear interface.
reset_profiler will clear the previous time record.
"""
core.reset_profiler()
@contextmanager
def profiler(state, sorted_key=None):
"""The profiler interface.
Different from cuda_profiler, this profiler can be used to profile both CPU
and GPU program. By defalut, it records the CPU and GPU operator kernels,
if you want to profile other program, you can refer the profiling tutorial
to add more records.
Args:
state (string) : The profiling state, which should be 'CPU' or 'GPU',
telling the profiler to use CPU timer or GPU timer for profiling.
Although users may have already specified the execution place
(CPUPlace/CUDAPlace) in the begining, for flexibility the profiler
would not inherit this place.
sorted_key (string) : If None, the profiling results will be printed
in the order of first end time of events. Otherwise, the profiling
results will be sorted by the this flag. This flag should be one
of 'calls', 'total', 'max', 'min' or 'ave'.
The `calls` means sorting by the number of calls.
The `total` means sorting by the total execution time.
The `max` means sorting by the maximum execution time.
The `min` means sorting by the minimum execution time.
The `ave` means sorting by the average execution time.
"""
if state not in ['CPU', 'GPU']:
raise ValueError("The state must be 'CPU' or 'GPU'.")
prof_state = core.ProfilerState.kCUDA if state == "GPU" else core.ProfilerState.kCPU
core.enable_profiler(prof_state)
yield
if sorted_key not in ['calls', 'total', 'max', 'min', 'ave']:
raise ValueError("The state must be in 'calls', 'total', "
"'max', 'min', 'ave'")
sorted_key = 'default' if sorted_key is None else sorted_key
key_map = {
'default': core.EventSortingKey.kDefault,
'calls': core.EventSortingKey.kCalls,
'total': core.EventSortingKey.kTotal,
'max': core.EventSortingKey.kMax,
'min': core.EventSortingKey.kMin,
'ave': core.EventSortingKey.kAve,
}
# TODO(qingqing) : redirect C++ ostream to Python stream.
# with core.ostream_redirect(stdout=True, stderr=True):
core.disable_profiler(key_map[sorted_key])
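A minimal end-to-end sketch of the new Python profiler interface, condensed from the `net_profiler` test below (the tiny network is only for illustration):

```python
import numpy as np
import paddle.v2.fluid as fluid
import paddle.v2.fluid.profiler as profiler

image = fluid.layers.data(name='x', shape=[784], dtype='float32')
hidden = fluid.layers.fc(input=image, size=128, act='relu')
loss = fluid.layers.mean(x=hidden)

place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())

# Profile a few iterations on the CPU and print a report sorted by total time.
with profiler.profiler('CPU', sorted_key='total') as prof:
    for i in range(10):
        if i == 2:
            # Drop the warm-up iterations from the final report.
            profiler.reset_profiler()
        x = np.random.random((32, 784)).astype("float32")
        exe.run(fluid.default_main_program(),
                feed={'x': x},
                fetch_list=[loss])
```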
@@ -334,7 +334,7 @@ class OpTest(unittest.TestCase):
def check_output(self, atol=1e-5):
places = [core.CPUPlace()]
-if core.is_compile_gpu() and core.op_support_gpu(self.op_type):
if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type):
places.append(core.CUDAPlace(0))
for place in places:
self.check_output_with_place(place, atol)
@@ -367,7 +367,7 @@ class OpTest(unittest.TestCase):
max_relative_error=0.005,
user_defined_grads=None):
places = [core.CPUPlace()]
-if core.is_compile_gpu() and core.op_support_gpu(self.op_type):
if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type):
places.append(core.CUDAPlace(0))
for place in places:
self.check_grad_with_place(place, inputs_to_check, output_names,
......
@@ -180,7 +180,7 @@ class TestSparseAdagradOp(unittest.TestCase):
def test_sparse_adagrad(self):
places = [core.CPUPlace()]
-if core.is_compile_gpu():
if core.is_compiled_with_cuda():
places.append(core.CUDAPlace(0))
for place in places:
self.check_with_place(place)
......
@@ -305,7 +305,7 @@ class TestSparseAdamOp(unittest.TestCase):
def test_sparse_sgd(self):
places = [core.CPUPlace()]
-if core.is_compile_gpu():
if core.is_compiled_with_cuda():
places.append(core.CUDAPlace(0))
for place in places:
self.check_with_place(place)
......
@@ -352,7 +352,7 @@ class TestBatchNormOp(OpTest):
print "op test backward passed: ", str(place), data_layout
places = [core.CPUPlace()]
-if core.is_compile_gpu() and core.op_support_gpu("batch_norm"):
if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"):
places.append(core.CUDAPlace(0))
for place in places:
......
@@ -33,7 +33,7 @@ class TestGaussianRandomOp(unittest.TestCase):
self.gaussian_random_test(place=fluid.CPUPlace())
def test_gpu(self):
-if core.is_compile_gpu():
if core.is_compiled_with_cuda():
self.gaussian_random_test(place=fluid.CUDAPlace(0))
def gaussian_random_test(self, place):
......
File mode changed from 100755 to 100644
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import paddle.v2.fluid as fluid
import paddle.v2.fluid.core as core
import numpy as np
class TestMultiheadAttention(unittest.TestCase):
def gen_random_input(self):
"""Generate random input data.
"""
# batch_size, max_sequence_length, hidden dimension
self.input_shape = (3, 13, 16)
self.queries = np.random.random(size=self.input_shape).astype("float32")
self.keys = np.random.random(size=self.input_shape).astype("float32")
def set_program(self):
"""Build the test program.
"""
queries = fluid.layers.data(
name="queries",
shape=self.input_shape,
dtype="float32",
append_batch_size=False)
queries.stop_gradient = False
keys = fluid.layers.data(
name="keys",
shape=self.input_shape,
dtype="float32",
append_batch_size=False)
keys.stop_gradient = False
contexts = fluid.nets.scaled_dot_product_attention(
queries=queries,
keys=keys,
values=keys,
num_heads=8,
dropout_rate=0.)
out = fluid.layers.reduce_sum(contexts, dim=None)
fluid.backward.append_backward(loss=out)
self.fetch_list = [contexts]
def run_program(self):
"""Run the test program.
"""
places = [core.CPUPlace()]
if core.is_compile_gpu():
places.append(core.CUDAPlace(0))
for place in places:
self.set_inputs(place)
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
output = exe.run(fluid.default_main_program(),
feed=self.inputs,
fetch_list=self.fetch_list,
return_numpy=True)
self.op_output = output
def set_inputs(self, place):
"""Set the randomly generated data to the test program.
"""
self.inputs = {}
queries = fluid.Tensor()
queries.set(self.queries, place)
keys = fluid.Tensor()
keys.set(self.keys, place)
self.inputs["keys"] = keys
self.inputs["queries"] = queries
def test_multihead_attention(self):
self.gen_random_input()
self.set_program()
self.run_program()
#fixme(caoying) add more meaningfull unittest.
if __name__ == '__main__':
unittest.main()
@@ -46,7 +46,7 @@ class TestNormalization(unittest.TestCase):
"""Run the test program.
"""
places = [core.CPUPlace()]
-if core.is_compile_gpu():
if core.is_compiled_with_cuda():
places.append(core.CUDAPlace(0))
for place in places:
......
@@ -18,7 +18,8 @@ import paddle.v2.fluid.core as core
class TestOpSupportGPU(unittest.TestCase):
def test_case(self):
-self.assertEqual(core.is_compile_gpu(), core.op_support_gpu("sum"))
self.assertEqual(core.is_compiled_with_cuda(),
core.op_support_gpu("sum"))
if __name__ == '__main__':
......
@@ -53,7 +53,7 @@ class BaseParallelForTest(unittest.TestCase):
fetch=fetch,
place=cpu,
use_parallel=True)
-if fluid.core.is_compile_gpu():
if fluid.core.is_compiled_with_cuda():
gpu = fluid.CUDAPlace(0)
result_gpu = self._run_test_impl_(
callback=callback,
@@ -159,7 +159,7 @@ class ParallelOpTest(BaseParallelForTest):
def test_simple_fc(self):
self.run_test(
-callback=ParallelOpTest.__network__,
callback=self.__network__,
feed={
'img': numpy.random.random(size=(51, 784)).astype('float32')
},
@@ -167,10 +167,35 @@ class ParallelOpTest(BaseParallelForTest):
def test_fc_with_tiny_data(self):
self.run_test(
-callback=ParallelOpTest.__network__,
callback=self.__network__,
feed={'img': numpy.random.random(size=(1, 784)).astype('float32')},
fetch=['fc1.w@GRAD'])
class ParallelOpTestMultipleInput(BaseParallelForTest):
@staticmethod
def __network__():
x = fluid.layers.data(
shape=[784], dtype='float32', name='img1', stop_gradient=False)
y = fluid.layers.data(
shape=[784], dtype='float32', name='img2', stop_gradient=False)
yield [x, y]
x = x + y
hidden1 = fluid.layers.fc(input=x, size=200, param_attr='fc1.w')
hidden2 = fluid.layers.fc(input=hidden1, size=200, param_attr='fc2.w')
hidden3 = fluid.layers.fc(input=hidden2, size=200, param_attr='fc3.w')
loss = fluid.layers.mean(x=hidden3)
yield loss
def test_simple_fc(self):
self.run_test(
callback=self.__network__,
feed={
'img1': numpy.random.random(size=(51, 784)).astype('float32'),
'img2': numpy.random.random(size=(51, 784)).astype('float32')
},
fetch=['fc1.w@GRAD', 'fc2.w@GRAD', 'fc3.w@GRAD'])
if __name__ == '__main__':
unittest.main()
@@ -13,16 +13,17 @@
# limitations under the License.
import unittest
import os
import numpy as np
import paddle.v2.fluid as fluid
import paddle.v2.fluid.profiler as profiler
import paddle.v2.fluid.layers as layers
-import os
import paddle.v2.fluid.core as core
class TestProfiler(unittest.TestCase):
def test_nvprof(self):
-if not fluid.core.is_compile_gpu():
if not fluid.core.is_compiled_with_cuda():
return
epoc = 8
dshape = [4, 3, 28, 28]
@@ -40,6 +41,50 @@ class TestProfiler(unittest.TestCase):
exe.run(fluid.default_main_program(), feed={'data': input})
os.remove(output_file)
def net_profiler(self, state):
if state == 'GPU' and not core.is_compiled_with_cuda():
return
startup_program = fluid.Program()
main_program = fluid.Program()
with fluid.program_guard(main_program, startup_program):
image = fluid.layers.data(name='x', shape=[784], dtype='float32')
hidden1 = fluid.layers.fc(input=image, size=128, act='relu')
hidden2 = fluid.layers.fc(input=hidden1, size=64, act='relu')
predict = fluid.layers.fc(input=hidden2, size=10, act='softmax')
label = fluid.layers.data(name='y', shape=[1], dtype='int64')
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
optimizer = fluid.optimizer.Momentum(learning_rate=0.001, momentum=0.9)
opts = optimizer.minimize(avg_cost, startup_program=startup_program)
place = fluid.CPUPlace() if state == 'CPU' else fluid.CUDAPlace(0)
exe = fluid.Executor(place)
exe.run(startup_program)
accuracy.reset(exe)
with profiler.profiler(state, 'total') as prof:
for iter in range(10):
if iter == 2:
profiler.reset_profiler()
x = np.random.random((32, 784)).astype("float32")
y = np.random.randint(0, 10, (32, 1)).astype("int64")
outs = exe.run(main_program,
feed={'x': x,
'y': y},
fetch_list=[avg_cost] + accuracy.metrics)
acc = np.array(outs[1])
pass_acc = accuracy.eval(exe)
def test_cpu_profiler(self):
self.net_profiler('CPU')
def test_cuda_profiler(self):
self.net_profiler('GPU')
if __name__ == '__main__':
unittest.main()
@@ -45,7 +45,7 @@ class TestReorderLoDTensor(unittest.TestCase):
outputs = []
input_grads = []
places = [core.CPUPlace()]
-if core.is_compile_gpu():
if core.is_compiled_with_cuda():
places.append(core.CUDAPlace(0))
for place in places:
self.set_inputs(place)
......
@@ -91,7 +91,7 @@ class TestSparseSGDOp(unittest.TestCase):
def test_sparse_sgd(self):
places = [core.CPUPlace()]
-if core.is_compile_gpu():
if core.is_compiled_with_cuda():
places.append(core.CUDAPlace(0))
for place in places:
self.check_with_place(place)
......
@@ -21,7 +21,7 @@ from paddle.v2.fluid.op import Operator
class TestSpliteSelectedRows(unittest.TestCase):
def get_places(self):
places = [core.CPUPlace()]
-if core.is_compile_gpu():
if core.is_compiled_with_cuda():
places.append(core.CUDAPlace(0))
return places
......
@@ -36,7 +36,7 @@ class TestUniformRandomOp(unittest.TestCase):
self.uniform_random_test(place=core.CPUPlace())
def test_gpu(self):
-if core.is_compile_gpu():
if core.is_compiled_with_cuda():
self.uniform_random_test(place=core.CUDAPlace(0))
def uniform_random_test(self, place):
......