Commit d8a939d8 authored by Liu Yiqun

Merge branch 'develop' into core_opt_choose_kernel

This diff is collapsed.
...@@ -38,10 +38,10 @@ if(WITH_GPU) ...@@ -38,10 +38,10 @@ if(WITH_GPU)
nv_library(tensor SRCS tensor.cc .tensor_util.cu DEPS place memory data_type device_context) nv_library(tensor SRCS tensor.cc .tensor_util.cu DEPS place memory data_type device_context)
add_dependencies(tensor tensor_util) add_dependencies(tensor tensor_util)
else() else()
nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context ) nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context profiler)
endif(WIN32) endif(WIN32)
else() else()
cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context ) cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context profiler)
endif() endif()
cc_test(tensor_test SRCS tensor_test.cc DEPS tensor) cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
......
...@@ -12,7 +12,9 @@ ...@@ -12,7 +12,9 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h" #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
#include <memory>
#include <string> #include <string>
#include <unordered_map>
#include <vector> #include <vector>
#include "paddle/fluid/framework/details/fetch_op_handle.h" #include "paddle/fluid/framework/details/fetch_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/details/multi_devices_helper.h"
...@@ -55,7 +57,7 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run( ...@@ -55,7 +57,7 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
std::vector<FetchOpHandle *> fetch_ops; std::vector<FetchOpHandle *> fetch_ops;
for (auto &fetch_var_name : fetch_tensors) { for (auto &fetch_var_name : fetch_tensors) {
for (auto &var_map : graph_->Get<details::GraphVars>("vars")) { for (auto &var_map : graph_->Get<details::GraphVars>(details::kGraphVars)) {
auto it = var_map.find(fetch_var_name); auto it = var_map.find(fetch_var_name);
if (it != var_map.end()) { if (it != var_map.end()) {
fetched_vars[fetch_var_name].push_back(*it->second.rbegin()); fetched_vars[fetch_var_name].push_back(*it->second.rbegin());
......
...@@ -14,8 +14,11 @@ ...@@ -14,8 +14,11 @@
#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/tensor_util.h"
#include <algorithm> #include <algorithm>
#include <limits> #include <limits>
#include <memory>
#include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -135,16 +138,19 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, ...@@ -135,16 +138,19 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
else if (platform::is_gpu_place(src_place) && // NOLINT else if (platform::is_gpu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) { platform::is_cpu_place(dst_place)) {
platform::RecordEvent record_event("TensorCopy:GPU->CPU");
auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place); auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
auto dst_cpu_place = boost::get<platform::CPUPlace>(dst_place); auto dst_cpu_place = boost::get<platform::CPUPlace>(dst_place);
memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr); memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr);
} else if (platform::is_cpu_place(src_place) && } else if (platform::is_cpu_place(src_place) &&
platform::is_gpu_place(dst_place)) { platform::is_gpu_place(dst_place)) {
platform::RecordEvent record_event("TensorCopy:CPU->GPU");
auto src_cpu_place = boost::get<platform::CPUPlace>(src_place); auto src_cpu_place = boost::get<platform::CPUPlace>(src_place);
auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place); auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, nullptr); memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, nullptr);
} else if (platform::is_gpu_place(src_place) && } else if (platform::is_gpu_place(src_place) &&
platform::is_gpu_place(dst_place)) { platform::is_gpu_place(dst_place)) {
platform::RecordEvent record_event("TensorCopy:GPU->GPU");
if (src_ptr == dst_ptr && platform::is_same_place(src_place, dst_place)) { if (src_ptr == dst_ptr && platform::is_same_place(src_place, dst_place)) {
VLOG(3) << "Skip copy the same data from " << src_place << " to " VLOG(3) << "Skip copy the same data from " << src_place << " to "
<< dst_place; << dst_place;
...@@ -155,6 +161,7 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, ...@@ -155,6 +161,7 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr); memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr);
} else if (platform::is_cuda_pinned_place(src_place) && } else if (platform::is_cuda_pinned_place(src_place) &&
platform::is_gpu_place(dst_place)) { platform::is_gpu_place(dst_place)) {
platform::RecordEvent record_event("TensorCopy:CUDAPinned->GPU");
auto src_pinned_place = boost::get<platform::CUDAPinnedPlace>(src_place); auto src_pinned_place = boost::get<platform::CUDAPinnedPlace>(src_place);
auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place); auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
memory::Copy(dst_gpu_place, dst_ptr, src_pinned_place, src_ptr, size, memory::Copy(dst_gpu_place, dst_ptr, src_pinned_place, src_ptr, size,
......
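The tensor_util.cc hunk above wraps each branch of TensorCopySync in a platform::RecordEvent so that GPU->CPU, CPU->GPU, GPU->GPU, and pinned->GPU copies show up as named ranges in the profiler. RecordEvent is used here as an RAII scope marker: constructing it opens the range, and its destructor closes it when the branch exits. Below is a minimal sketch of that RAII pattern; the ScopedProfileEvent class is a hypothetical stand-in, not Paddle's actual RecordEvent implementation.

// A minimal sketch of the RAII scope-marker pattern. ScopedProfileEvent is a
// hypothetical stand-in for platform::RecordEvent, not Paddle's real code.
#include <chrono>
#include <iostream>
#include <string>
#include <utility>

class ScopedProfileEvent {
 public:
  explicit ScopedProfileEvent(std::string name)
      : name_(std::move(name)), start_(std::chrono::steady_clock::now()) {}
  ~ScopedProfileEvent() {
    auto end = std::chrono::steady_clock::now();
    auto us =
        std::chrono::duration_cast<std::chrono::microseconds>(end - start_)
            .count();
    // A real tracer would push a record into a profile buffer instead.
    std::cout << name_ << " took " << us << " us\n";
  }

 private:
  std::string name_;
  std::chrono::steady_clock::time_point start_;
};

void CopyGpuToCpu() {
  ScopedProfileEvent record_event("TensorCopy:GPU->CPU");
  // ... issue the memory copy here; the event closes when this scope ends.
}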
add_subdirectory(detail) add_subdirectory(detail)
add_subdirectory(allocation) add_subdirectory(allocation)
cc_library(malloc SRCS malloc.cc DEPS place enforce allocator_facade) cc_library(malloc SRCS malloc.cc DEPS place enforce allocator_facade profiler)
cc_library(memcpy SRCS memcpy.cc DEPS place) cc_library(memcpy SRCS memcpy.cc DEPS place)
cc_library(memory cc_library(memory
......
...@@ -15,6 +15,7 @@ limitations under the License. */ ...@@ -15,6 +15,7 @@ limitations under the License. */
#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memcpy.h"
#include <cstring> // for memcpy #include <cstring> // for memcpy
#include "paddle/fluid/platform/profiler.h"
namespace paddle { namespace paddle {
namespace memory { namespace memory {
...@@ -29,14 +30,23 @@ void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst, ...@@ -29,14 +30,23 @@ void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
static constexpr size_t kMaxGpuAsyncCopyBytes = 64 * 1024; // 64K static constexpr size_t kMaxGpuAsyncCopyBytes = 64 * 1024; // 64K
// NOTE(zcd): Avoid using GpuMemcpySync whenever possible,
// because GpuMemcpySync issues the copying command to the default stream,
// which prevents commands issued to different streams from running concurrently.
// Reference:
// https://devblogs.nvidia.com/gpu-pro-tip-cuda-7-streams-simplify-concurrency/
template <> template <>
void Copy<platform::CPUPlace, platform::CUDAPlace>( void Copy<platform::CPUPlace, platform::CUDAPlace>(
platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place, platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place,
const void* src, size_t num, cudaStream_t stream) { const void* src, size_t num, cudaStream_t stream) {
platform::SetDeviceId(src_place.device); platform::SetDeviceId(src_place.device);
if (stream) { if (stream) {
platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CPU");
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream); platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
} else { } else {
platform::RecordEvent record_event("GpuMemcpySync:GPU->CPU");
platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost); platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost);
// FIXME(zjl): do we really need it? // FIXME(zjl): do we really need it?
if (num <= kMaxGpuAsyncCopyBytes) { if (num <= kMaxGpuAsyncCopyBytes) {
...@@ -51,8 +61,10 @@ void Copy<platform::CUDAPlace, platform::CPUPlace>( ...@@ -51,8 +61,10 @@ void Copy<platform::CUDAPlace, platform::CPUPlace>(
const void* src, size_t num, cudaStream_t stream) { const void* src, size_t num, cudaStream_t stream) {
platform::SetDeviceId(dst_place.device); platform::SetDeviceId(dst_place.device);
if (stream) { if (stream) {
platform::RecordEvent record_event("GpuMemcpyAsync:CPU->GPU");
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream); platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
} else { } else {
platform::RecordEvent record_event("GpuMemcpySync:CPU->GPU");
platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice); platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice);
// FIXME(zjl): do we really need it? // FIXME(zjl): do we really need it?
if (num <= kMaxGpuAsyncCopyBytes) { if (num <= kMaxGpuAsyncCopyBytes) {
...@@ -68,15 +80,19 @@ void Copy<platform::CUDAPlace, platform::CUDAPlace>( ...@@ -68,15 +80,19 @@ void Copy<platform::CUDAPlace, platform::CUDAPlace>(
if (dst_place == src_place) { if (dst_place == src_place) {
platform::SetDeviceId(src_place.device); platform::SetDeviceId(src_place.device);
if (stream) { if (stream) {
platform::RecordEvent record_event("GpuMemcpyAsync(same_gpu):GPU->GPU");
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, stream); platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, stream);
} else { } else {
platform::RecordEvent record_event("GpuMemcpySync(same_gpu):GPU->GPU");
platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToDevice); platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToDevice);
} }
} else { } else {
if (stream) { if (stream) {
platform::RecordEvent record_event("GpuMemcpyPeerAsync:GPU->GPU");
platform::GpuMemcpyPeerAsync(dst, dst_place.device, src, src_place.device, platform::GpuMemcpyPeerAsync(dst, dst_place.device, src, src_place.device,
num, stream); num, stream);
} else { } else {
platform::RecordEvent record_event("GpuMemcpyPeerSync:GPU->GPU");
platform::GpuMemcpyPeerSync(dst, dst_place.device, src, src_place.device, platform::GpuMemcpyPeerSync(dst, dst_place.device, src, src_place.device,
num); num);
} }
...@@ -111,8 +127,10 @@ void Copy<platform::CUDAPinnedPlace, platform::CUDAPlace>( ...@@ -111,8 +127,10 @@ void Copy<platform::CUDAPinnedPlace, platform::CUDAPlace>(
cudaStream_t stream) { cudaStream_t stream) {
platform::SetDeviceId(src_place.device); platform::SetDeviceId(src_place.device);
if (stream) { if (stream) {
platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CUDAPinned");
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream); platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
} else { } else {
platform::RecordEvent record_event("GpuMemcpySync:GPU->CUDAPinned");
platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost); platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost);
} }
} }
...@@ -124,8 +142,10 @@ void Copy<platform::CUDAPlace, platform::CUDAPinnedPlace>( ...@@ -124,8 +142,10 @@ void Copy<platform::CUDAPlace, platform::CUDAPinnedPlace>(
cudaStream_t stream) { cudaStream_t stream) {
platform::SetDeviceId(dst_place.device); platform::SetDeviceId(dst_place.device);
if (stream) { if (stream) {
platform::RecordEvent record_event("GpuMemcpyAsync:CUDAPinned->GPU");
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream); platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
} else { } else {
platform::RecordEvent record_event("GpuMemcpySync:CUDAPinned->GPU");
platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice); platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice);
} }
} }
......
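The NOTE(zcd) in the memcpy.cc hunk above is the reasoning behind tagging each path as GpuMemcpySync or GpuMemcpyAsync: a copy issued to the legacy default stream serializes with work on every other stream. The sketch below is written against the plain CUDA runtime API rather than Paddle's wrappers; the function and buffer names are illustrative only, and overlapping transfers additionally requires the host buffers to be pinned.

#include <cuda_runtime.h>

// Two host-to-device copies issued to two explicitly created streams, so
// neither blocks the other stream, and either can overlap with kernels or
// device-to-host copies in flight on other streams.
void overlapped_copies(void* dst0, const void* src0, void* dst1,
                       const void* src1, size_t bytes) {
  cudaStream_t s0, s1;
  cudaStreamCreate(&s0);
  cudaStreamCreate(&s1);

  cudaMemcpyAsync(dst0, src0, bytes, cudaMemcpyHostToDevice, s0);
  cudaMemcpyAsync(dst1, src1, bytes, cudaMemcpyHostToDevice, s1);

  // A plain cudaMemcpy here would be issued to the legacy default stream and
  // would serialize against s0 and s1, which is what the NOTE warns against.

  cudaStreamSynchronize(s0);
  cudaStreamSynchronize(s1);
  cudaStreamDestroy(s0);
  cudaStreamDestroy(s1);
}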
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#include <algorithm> #include <algorithm>
#include <functional> #include <functional>
#include <memory>
#include <vector> #include <vector>
#include "ngraph/ngraph.hpp" #include "ngraph/ngraph.hpp"
......
...@@ -16,6 +16,7 @@ limitations under the License. */ ...@@ -16,6 +16,7 @@ limitations under the License. */
#include <algorithm> #include <algorithm>
#include <map> #include <map>
#include <memory>
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
......
...@@ -14,7 +14,9 @@ limitations under the License. */ ...@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once #pragma once
#include <memory>
#include <string> #include <string>
#include <unordered_map>
#include <vector> #include <vector>
#include "ngraph/ngraph.hpp" #include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h" #include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
......
...@@ -14,7 +14,9 @@ limitations under the License. */ ...@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once #pragma once
#include <memory>
#include <string> #include <string>
#include <unordered_map>
#include "ngraph/ngraph.hpp" #include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h" #include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
......
...@@ -14,7 +14,9 @@ limitations under the License. */ ...@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once #pragma once
#include <memory>
#include <string> #include <string>
#include <unordered_map>
#include <vector> #include <vector>
#include "ngraph/ngraph.hpp" #include "ngraph/ngraph.hpp"
......
...@@ -14,7 +14,9 @@ limitations under the License. */ ...@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once #pragma once
#include <memory>
#include <string> #include <string>
#include <unordered_map>
#include "ngraph/ngraph.hpp" #include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h" #include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
#include "paddle/fluid/platform/ngraph_helper.h" #include "paddle/fluid/platform/ngraph_helper.h"
......
...@@ -14,7 +14,9 @@ limitations under the License. */ ...@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once #pragma once
#include <memory>
#include <string> #include <string>
#include <unordered_map>
#include <vector> #include <vector>
#include "ngraph/ngraph.hpp" #include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h" #include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
......
...@@ -15,7 +15,9 @@ limitations under the License. */ ...@@ -15,7 +15,9 @@ limitations under the License. */
#pragma once #pragma once
#include <functional> #include <functional>
#include <memory>
#include <string> #include <string>
#include <unordered_map>
#include "ngraph/ngraph.hpp" #include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h" #include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
......
...@@ -14,7 +14,9 @@ limitations under the License. */ ...@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once #pragma once
#include <memory>
#include <string> #include <string>
#include <unordered_map>
#include <vector> #include <vector>
#include "ngraph/ngraph.hpp" #include "ngraph/ngraph.hpp"
......
...@@ -14,7 +14,9 @@ limitations under the License. */ ...@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once #pragma once
#include <memory>
#include <string> #include <string>
#include <unordered_map>
#include <vector> #include <vector>
#include "ngraph/ngraph.hpp" #include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h" #include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
......
...@@ -15,7 +15,9 @@ limitations under the License. */ ...@@ -15,7 +15,9 @@ limitations under the License. */
#pragma once #pragma once
#include <functional> #include <functional>
#include <memory>
#include <string> #include <string>
#include <unordered_map>
#include "ngraph/ngraph.hpp" #include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h" #include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h"
......
...@@ -14,7 +14,9 @@ limitations under the License. */ ...@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once #pragma once
#include <memory>
#include <string> #include <string>
#include <unordered_map>
#include <vector> #include <vector>
#include "ngraph/ngraph.hpp" #include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h" #include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
......
...@@ -14,7 +14,9 @@ limitations under the License. */ ...@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once #pragma once
#include <memory>
#include <string> #include <string>
#include <unordered_map>
#include "ngraph/ngraph.hpp" #include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h" #include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
#include "paddle/fluid/platform/ngraph_helper.h" #include "paddle/fluid/platform/ngraph_helper.h"
......
...@@ -14,7 +14,9 @@ limitations under the License. */ ...@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once #pragma once
#include <memory>
#include <string> #include <string>
#include <unordered_map>
#include <vector> #include <vector>
#include "ngraph/ngraph.hpp" #include "ngraph/ngraph.hpp"
......
...@@ -14,7 +14,9 @@ limitations under the License. */ ...@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once #pragma once
#include <memory>
#include <string> #include <string>
#include <unordered_map>
#include "ngraph/ngraph.hpp" #include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h" #include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h" #include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
......
...@@ -14,7 +14,9 @@ limitations under the License. */ ...@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once #pragma once
#include <memory>
#include <string> #include <string>
#include <unordered_map>
#include <vector> #include <vector>
#include "ngraph/ngraph.hpp" #include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h" #include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h"
......
...@@ -14,7 +14,9 @@ limitations under the License. */ ...@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once #pragma once
#include <memory>
#include <string> #include <string>
#include <unordered_map>
#include "ngraph/ngraph.hpp" #include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h" #include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
#include "paddle/fluid/platform/ngraph_helper.h" #include "paddle/fluid/platform/ngraph_helper.h"
......
...@@ -13,9 +13,11 @@ ...@@ -13,9 +13,11 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/operators/reader/buffered_reader.h" #include "paddle/fluid/operators/reader/buffered_reader.h"
#include <memory>
#include <vector> #include <vector>
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace reader { namespace reader {
...@@ -49,9 +51,10 @@ BufferedReader::BufferedReader( ...@@ -49,9 +51,10 @@ BufferedReader::BufferedReader(
.Get(place_))) .Get(place_)))
->stream(); ->stream();
events.resize(buffer_size); events.resize(buffer_size);
for (auto &event : events) PADDLE_ENFORCE(cudaStreamCreate(&stream));
for (auto &event : events) {
PADDLE_ENFORCE(cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); PADDLE_ENFORCE(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
PADDLE_ENFORCE(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); }
} }
#endif #endif
cpu_buffer_.resize(buffer_size); cpu_buffer_.resize(buffer_size);
...@@ -83,12 +86,15 @@ void BufferedReader::ReadAsync(size_t i) { ...@@ -83,12 +86,15 @@ void BufferedReader::ReadAsync(size_t i) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
// NOTE(liangdun): using async copy instead of TensorCopySync // NOTE(liangdun): using async copy instead of TensorCopySync
// TensorCopySync would block other stream // TensorCopySync would block other streams, because TensorCopySync
// issues the copying command to the default stream, which prevents
// commands issued to different streams from running concurrently.
if (platform::is_gpu_place(place_)) { if (platform::is_gpu_place(place_)) {
platform::SetDeviceId(boost::get<platform::CUDAPlace>(place_).device); platform::SetDeviceId(boost::get<platform::CUDAPlace>(place_).device);
PADDLE_ENFORCE(cudaStreamWaitEvent(stream, events[i], 0)); PADDLE_ENFORCE(cudaStreamWaitEvent(stream, events[i], 0));
TensorVec &gpu = gpu_buffer_[i]; TensorVec &gpu = gpu_buffer_[i];
gpu.resize(cpu.size()); gpu.resize(cpu.size());
platform::RecordEvent record_event("BufferedReader:MemoryCopy");
for (size_t i = 0; i < cpu.size(); ++i) { for (size_t i = 0; i < cpu.size(); ++i) {
gpu[i].Resize(cpu[i].dims()); gpu[i].Resize(cpu[i].dims());
gpu[i].set_layout(cpu[i].layout()); gpu[i].set_layout(cpu[i].layout());
...@@ -97,20 +103,19 @@ void BufferedReader::ReadAsync(size_t i) { ...@@ -97,20 +103,19 @@ void BufferedReader::ReadAsync(size_t i) {
auto gpu_ptr = gpu[i].mutable_data(place_, cpu[i].type()); auto gpu_ptr = gpu[i].mutable_data(place_, cpu[i].type());
auto size = auto size =
cpu[i].numel() * paddle::framework::SizeOfType(cpu[i].type()); cpu[i].numel() * paddle::framework::SizeOfType(cpu[i].type());
if (platform::is_cuda_pinned_place(cpu_place)) if (platform::is_cuda_pinned_place(cpu_place)) {
memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr, memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr,
boost::get<platform::CUDAPinnedPlace>(cpu_place), boost::get<platform::CUDAPinnedPlace>(cpu_place),
cpu_ptr, size, stream); cpu_ptr, size, stream);
else if ((platform::is_gpu_place(cpu_place))) } else if ((platform::is_gpu_place(cpu_place))) {
memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr, memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr,
boost::get<platform::CUDAPlace>(cpu_place), cpu_ptr, boost::get<platform::CUDAPlace>(cpu_place), cpu_ptr,
size, stream); size, stream);
else } else {
// if cpu place is not pinned, async copy is slower than sync copy,
// so we use sync copy instead.
memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr, memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr,
boost::get<platform::CPUPlace>(cpu_place), cpu_ptr, size, boost::get<platform::CPUPlace>(cpu_place), cpu_ptr, size,
0); stream);
}
gpu[i].set_lod(cpu[i].lod()); gpu[i].set_lod(cpu[i].lod());
} }
PADDLE_ENFORCE(cudaStreamSynchronize(stream)); PADDLE_ENFORCE(cudaStreamSynchronize(stream));
......
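In the buffered_reader.cc hunk above, the reader owns one dedicated copy stream plus one cudaEvent per buffer slot, and ReadAsync begins by making the copy stream wait on that slot's event before launching asynchronous copies. A hedged sketch of that wait/record pairing follows, using the raw CUDA runtime API; the struct and method names are hypothetical and only illustrate the ordering pattern, not the reader's full logic.

#include <cuda_runtime.h>
#include <vector>

struct SlotPrefetcher {
  cudaStream_t copy_stream;
  std::vector<cudaEvent_t> slot_events;

  explicit SlotPrefetcher(size_t slots) : slot_events(slots) {
    cudaStreamCreate(&copy_stream);
    for (auto& e : slot_events) {
      // Timing is disabled because the events are used only for ordering,
      // which makes recording and waiting cheaper.
      cudaEventCreateWithFlags(&e, cudaEventDisableTiming);
    }
  }

  void PrefetchSlot(size_t i, void* dst, const void* pinned_src, size_t bytes) {
    // Do not start overwriting slot i until its previous consumer finished.
    cudaStreamWaitEvent(copy_stream, slot_events[i], 0);
    cudaMemcpyAsync(dst, pinned_src, bytes, cudaMemcpyHostToDevice, copy_stream);
  }

  void MarkSlotConsumed(size_t i, cudaStream_t compute_stream) {
    // Recorded on the compute stream after the work that read slot i.
    cudaEventRecord(slot_events[i], compute_stream);
  }
};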
...@@ -30,7 +30,6 @@ limitations under the License. */ ...@@ -30,7 +30,6 @@ limitations under the License. */
#include "glog/logging.h" #include "glog/logging.h"
#include "google/protobuf/text_format.h" #include "google/protobuf/text_format.h"
#include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/printf.h"
namespace paddle { namespace paddle {
...@@ -222,19 +221,24 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer, ...@@ -222,19 +221,24 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
} }
case CUPTI_ACTIVITY_KIND_DRIVER: { case CUPTI_ACTIVITY_KIND_DRIVER: {
auto *api = reinterpret_cast<const CUpti_ActivityAPI *>(record); auto *api = reinterpret_cast<const CUpti_ActivityAPI *>(record);
if (api->start != 0 && api->end != 0) if (api->start != 0 && api->end != 0) {
// -1 device id represents CUDA api call // -1 device id represents ActiveKind api call
tracer->AddCPURecords( tracer->AddActiveKindRecords(
DriverKind(api->cbid), api->start, api->end, -1, DriverKind(api->cbid), api->start, api->end, -1,
GetThreadIdFromSystemThreadId(api->threadId)); GetThreadIdFromSystemThreadId(api->threadId),
api->correlationId);
}
break; break;
} }
case CUPTI_ACTIVITY_KIND_RUNTIME: { case CUPTI_ACTIVITY_KIND_RUNTIME: {
auto *api = reinterpret_cast<const CUpti_ActivityAPI *>(record); auto *api = reinterpret_cast<const CUpti_ActivityAPI *>(record);
if (api->start != 0 && api->end != 0) if (api->start != 0 && api->end != 0) {
tracer->AddCPURecords( // -1 device id represents ActiveKind api call
tracer->AddActiveKindRecords(
RuntimeKind(api->cbid), api->start, api->end, -1, RuntimeKind(api->cbid), api->start, api->end, -1,
GetThreadIdFromSystemThreadId(api->threadId)); GetThreadIdFromSystemThreadId(api->threadId),
api->correlationId);
}
break; break;
} }
default: { break; } default: { break; }
...@@ -313,6 +317,25 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -313,6 +317,25 @@ class DeviceTracerImpl : public DeviceTracer {
stream_id, correlation_id, bytes}); stream_id, correlation_id, bytes});
} }
void AddActiveKindRecords(const std::string &anno, uint64_t start_ns,
uint64_t end_ns, int64_t device_id,
int64_t thread_id, uint32_t correlation_id) {
if (anno.empty()) {
VLOG(1) << "Empty timeline annotation.";
return;
}
thread_local std::forward_list<ActiveKindRecord>
*local_active_kind_records = nullptr;
if (local_active_kind_records == nullptr) {
std::lock_guard<std::mutex> l(trace_mu_);
active_kind_records_.emplace_front();
local_active_kind_records = &active_kind_records_.front();
}
// No lock is needed here: each thread only pushes to its own thread-local list.
local_active_kind_records->push_front(ActiveKindRecord{
anno, start_ns, end_ns, device_id, thread_id, correlation_id});
}
void AddKernelRecords(std::string name, uint64_t start, uint64_t end, void AddKernelRecords(std::string name, uint64_t start, uint64_t end,
int64_t device_id, int64_t stream_id, int64_t device_id, int64_t stream_id,
uint32_t correlation_id) { uint32_t correlation_id) {
...@@ -355,6 +378,7 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -355,6 +378,7 @@ class DeviceTracerImpl : public DeviceTracer {
} }
const std::vector<int> cbids { const std::vector<int> cbids {
CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020, CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020,
CUPTI_RUNTIME_TRACE_CBID_cudaSetupArgument_v3020,
CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyAsync_v3020, CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyAsync_v3020,
CUPTI_RUNTIME_TRACE_CBID_cudaMemset_v3020, CUPTI_RUNTIME_TRACE_CBID_cudaMemset_v3020,
CUPTI_RUNTIME_TRACE_CBID_cudaMemsetAsync_v3020, CUPTI_RUNTIME_TRACE_CBID_cudaMemsetAsync_v3020,
...@@ -385,6 +409,7 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -385,6 +409,7 @@ class DeviceTracerImpl : public DeviceTracer {
correlations_.clear(); correlations_.clear();
for (auto &tmp : correlations_pairs) tmp.clear(); for (auto &tmp : correlations_pairs) tmp.clear();
for (auto &tmp : cpu_records_) tmp.clear(); for (auto &tmp : cpu_records_) tmp.clear();
for (auto &tmp : active_kind_records_) tmp.clear();
} }
void GenEventKernelCudaElapsedTime() { void GenEventKernelCudaElapsedTime() {
...@@ -437,7 +462,7 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -437,7 +462,7 @@ class DeviceTracerImpl : public DeviceTracer {
event->set_device_id(r.device_id); event->set_device_id(r.device_id);
} }
VLOG(1) << "KernelRecord event miss: " << miss << " find: " << find; VLOG(1) << "KernelRecord event miss: " << miss << " find: " << find;
for (auto &tmp : cpu_records_) for (auto &tmp : cpu_records_) {
for (const CPURecord &r : tmp) { for (const CPURecord &r : tmp) {
auto *event = profile_pb.add_events(); auto *event = profile_pb.add_events();
event->set_type(proto::Event::CPU); event->set_type(proto::Event::CPU);
...@@ -447,6 +472,24 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -447,6 +472,24 @@ class DeviceTracerImpl : public DeviceTracer {
event->set_sub_device_id(r.thread_id); event->set_sub_device_id(r.thread_id);
event->set_device_id(r.device_id); event->set_device_id(r.device_id);
} }
}
for (auto &tmp : active_kind_records_) {
for (const ActiveKindRecord &r : tmp) {
auto *event = profile_pb.add_events();
event->set_type(proto::Event::CPU);
auto c = correlations_.find(r.correlation_id);
if (c != correlations_.end() && c->second != nullptr) {
event->set_name(c->second->name());
event->set_detail_info(r.name);
} else {
event->set_name(r.name);
}
event->set_start_ns(r.start_ns);
event->set_end_ns(r.end_ns);
event->set_sub_device_id(r.thread_id);
event->set_device_id(r.device_id);
}
}
miss = find = 0; miss = find = 0;
for (const MemRecord &r : mem_records_) { for (const MemRecord &r : mem_records_) {
auto *event = profile_pb.add_events(); auto *event = profile_pb.add_events();
...@@ -510,6 +553,7 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -510,6 +553,7 @@ class DeviceTracerImpl : public DeviceTracer {
std::forward_list<KernelRecord> kernel_records_; std::forward_list<KernelRecord> kernel_records_;
std::forward_list<MemRecord> mem_records_; std::forward_list<MemRecord> mem_records_;
std::forward_list<std::forward_list<CPURecord>> cpu_records_; std::forward_list<std::forward_list<CPURecord>> cpu_records_;
std::forward_list<std::forward_list<ActiveKindRecord>> active_kind_records_;
std::forward_list<std::forward_list<std::pair<uint32_t, Event *>>> std::forward_list<std::forward_list<std::pair<uint32_t, Event *>>>
correlations_pairs; correlations_pairs;
std::unordered_map<uint32_t, Event *> correlations_; std::unordered_map<uint32_t, Event *> correlations_;
...@@ -613,6 +657,7 @@ void initCuptiCbidStr() { ...@@ -613,6 +657,7 @@ void initCuptiCbidStr() {
REGISTER_RUNTIME_CBID_STR(cudaUnbindTexture_v3020); REGISTER_RUNTIME_CBID_STR(cudaUnbindTexture_v3020);
REGISTER_RUNTIME_CBID_STR(cudaSetupArgument_v3020); REGISTER_RUNTIME_CBID_STR(cudaSetupArgument_v3020);
REGISTER_RUNTIME_CBID_STR(cudaLaunch_v3020); REGISTER_RUNTIME_CBID_STR(cudaLaunch_v3020);
REGISTER_RUNTIME_CBID_STR(cudaDeviceGetPCIBusId_v4010);
#if CUDA_VERSION >= 9000 #if CUDA_VERSION >= 9000
REGISTER_RUNTIME_CBID_STR(cudaLaunchCooperativeKernel_v9000); REGISTER_RUNTIME_CBID_STR(cudaLaunchCooperativeKernel_v9000);
REGISTER_RUNTIME_CBID_STR(cudaLaunchCooperativeKernelMultiDevice_v9000); REGISTER_RUNTIME_CBID_STR(cudaLaunchCooperativeKernelMultiDevice_v9000);
......
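AddActiveKindRecords in the device_tracer.cc hunk above appears to take the same approach as the existing per-thread cpu_records_: the mutex is taken only the first time a thread reports a record, to splice that thread's own forward_list into the shared list-of-lists, and subsequent records are pushed onto the thread_local list without locking. A stripped-down sketch of that pattern is below; the names are illustrative, not Paddle's.

#include <forward_list>
#include <mutex>
#include <string>
#include <utility>

struct Record { std::string name; };

// Assumes a single recorder instance, as with the tracer singleton; the
// thread_local pointer below is shared across instances of this class.
class PerThreadRecorder {
 public:
  void Add(Record r) {
    thread_local std::forward_list<Record>* local = nullptr;
    if (local == nullptr) {
      std::lock_guard<std::mutex> guard(mu_);
      lists_.emplace_front();          // register this thread's list once
      local = &lists_.front();
    }
    local->push_front(std::move(r));   // no lock: only this thread writes here
  }

 private:
  std::mutex mu_;
  std::forward_list<std::forward_list<Record>> lists_;
};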
...@@ -63,7 +63,14 @@ class DeviceTracer { ...@@ -63,7 +63,14 @@ class DeviceTracer {
uint32_t correlation_id; uint32_t correlation_id;
uint64_t bytes; uint64_t bytes;
}; };
struct ActiveKindRecord {
std::string name;
uint64_t start_ns;
uint64_t end_ns;
int64_t device_id;
int64_t thread_id;
uint32_t correlation_id;
};
virtual ~DeviceTracer() {} virtual ~DeviceTracer() {}
// Needs to be called once before use. // Needs to be called once before use.
virtual void Enable() = 0; virtual void Enable() = 0;
...@@ -85,6 +92,10 @@ class DeviceTracer { ...@@ -85,6 +92,10 @@ class DeviceTracer {
virtual void AddCPURecords(const std::string& anno, uint64_t start_ns, virtual void AddCPURecords(const std::string& anno, uint64_t start_ns,
uint64_t end_ns, int64_t device_id, uint64_t end_ns, int64_t device_id,
int64_t thread_id) = 0; int64_t thread_id) = 0;
virtual void AddActiveKindRecords(const std::string& anno, uint64_t start_ns,
uint64_t end_ns, int64_t device_id,
int64_t thread_id,
uint32_t correlation_id) = 0;
// Add a cuda kernel stats. `correlation_id` will be mapped to annotation // Add a cuda kernel stats. `correlation_id` will be mapped to annotation
// added before for human readability. // added before for human readability.
......
...@@ -415,10 +415,11 @@ function assert_api_not_changed() { ...@@ -415,10 +415,11 @@ function assert_api_not_changed() {
source .env/bin/activate source .env/bin/activate
pip install ${PADDLE_ROOT}/build/python/dist/*whl pip install ${PADDLE_ROOT}/build/python/dist/*whl
python ${PADDLE_ROOT}/tools/print_signatures.py paddle.fluid,paddle.reader > new.spec python ${PADDLE_ROOT}/tools/print_signatures.py paddle.fluid,paddle.reader > new.spec
if [ "$1" == "cp35-cp35m" ] || [ "$1" == "cp36-cp36m" ] || [ "$1" == "cp37-cp37m" ]; then if [ "$1" == "cp35-cp35m" ] || [ "$1" == "cp36-cp36m" ] || [ "$1" == "cp37-cp37m" ]; then
# Use sed to keep the python2 and python3 specs the same # Use sed to keep the python2 and python3 specs the same
sed -i 's/arg0: str/arg0: unicode/g' new.spec sed -i 's/arg0: str/arg0: unicode/g' new.spec
sed -i "s/\(.*Transpiler.*\).__init__ ArgSpec(args=\['self'].*/\1.__init__ /g" new.spec sed -i "s/\(.*Transpiler.*\).__init__ (ArgSpec(args=\['self'].*/\1.__init__ /g" new.spec
fi fi
# ComposeNotAligned has significant difference between py2 and py3 # ComposeNotAligned has significant difference between py2 and py3
sed -i '/.*ComposeNotAligned.*/d' new.spec sed -i '/.*ComposeNotAligned.*/d' new.spec
...@@ -452,11 +453,20 @@ function assert_api_spec_approvals() { ...@@ -452,11 +453,20 @@ function assert_api_spec_approvals() {
echo "checking ${API_FILE} change, PR: ${GIT_PR_ID}, changes: ${API_CHANGE}" echo "checking ${API_FILE} change, PR: ${GIT_PR_ID}, changes: ${API_CHANGE}"
if [ ${API_CHANGE} ] && [ "${GIT_PR_ID}" != "" ]; then if [ ${API_CHANGE} ] && [ "${GIT_PR_ID}" != "" ]; then
# NOTE: per_page=10000 should be ok for all cases, a PR review > 10000 is not human readable. # NOTE: per_page=10000 should be ok for all cases, a PR review > 10000 is not human readable.
if [ "$API_FILE" == "paddle/fluid/API.spec" ];then
APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
python ${PADDLE_ROOT}/tools/check_pr_approval.py 2 2887803 35982308`
else
APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \ APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 2887803` python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 2887803`
fi
echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}" echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}"
if [ "${APPROVALS}" == "FALSE" ]; then if [ "${APPROVALS}" == "FALSE" ]; then
if [ "$API_FILE" == "paddle/fluid/API.spec" ];then
echo "You must have panyx0718 and shanyi15 approval for the api change! ${API_FILE}"
else
echo "You must have panyx0718 approval for the api change! ${API_FILE}" echo "You must have panyx0718 approval for the api change! ${API_FILE}"
fi
exit 1 exit 1
fi fi
fi fi
...@@ -472,19 +482,6 @@ function assert_api_spec_approvals() { ...@@ -472,19 +482,6 @@ function assert_api_spec_approvals() {
exit 1 exit 1
fi fi
fi fi
pip install ${PADDLE_ROOT}/build/opt/paddle/share/wheels/*.whl
CHECK_DOCK_MD5=`python ${PADDLE_ROOT}/tools/check_doc_approval.py`
if [ "True" != ${CHECK_DOCK_MD5} ]; then
APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 35982308`
echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}"
if [ "${APPROVALS}" == "FALSE" ]; then
echo "You must have shanyi15 approval for the api doc change! "
exit 1
fi
echo ${CHECK_DOCK_MD5} >/root/.cache/doc_md5.txt
fi
} }
......
...@@ -17,7 +17,6 @@ import os ...@@ -17,7 +17,6 @@ import os
import six import six
import sys import sys
from .. import compat as cpt from .. import compat as cpt
from . import framework
from . import core from . import core
from . import framework from . import framework
...@@ -36,6 +35,30 @@ def _place_obj(place): ...@@ -36,6 +35,30 @@ def _place_obj(place):
return p return p
def _is_pserver_mode(main_program):
main = main_program if main_program \
else default_main_program()
for op in main.global_block().ops:
if op.type in ["send", "recv"]:
return True
return False
def get_available_places(use_cuda):
if use_cuda:
gpus_env = os.getenv("FLAGS_selected_gpus")
if gpus_env:
gpus = [int(s) for s in gpus_env.split(",")]
else:
gpus = [i for i in six.moves.range(core.get_cuda_device_count())]
places = [core.CUDAPlace(i) for i in gpus]
else:
cpu_num = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
places = [core.CPUPlace() for _ in six.moves.range(cpu_num)]
assert places, "no place for execution"
return places
class CompiledProgram(object): class CompiledProgram(object):
""" """
Compiles to Graph for execution. Compiles to Graph for execution.
...@@ -127,8 +150,7 @@ class CompiledProgram(object): ...@@ -127,8 +150,7 @@ class CompiledProgram(object):
self._exec_strategy = ExecutionStrategy() self._exec_strategy = ExecutionStrategy()
if self._build_strategy is None: if self._build_strategy is None:
self._build_strategy = BuildStrategy() self._build_strategy = BuildStrategy()
self._build_strategy.is_distribution = framework.is_pserver_mode( self._build_strategy.is_distribution = _is_pserver_mode(self._program)
self._program)
return self return self
def with_inference_optimize(self, config): def with_inference_optimize(self, config):
...@@ -153,9 +175,9 @@ class CompiledProgram(object): ...@@ -153,9 +175,9 @@ class CompiledProgram(object):
def _with_distributed(self): def _with_distributed(self):
raise NotImplementedError() raise NotImplementedError()
def _compile_data_parallel(self): def _compile_data_parallel(self, use_cuda=False, scope=None):
if self._share_vars_from: if self._share_vars_from:
if self._scope: if scope:
sys.stderr.write("share_vars_from is set, scope is ignored.\n") sys.stderr.write("share_vars_from is set, scope is ignored.\n")
if not self._share_vars_from._is_data_parallel: if not self._share_vars_from._is_data_parallel:
raise ValueError("share_vars_from is not data parallel. Cannot " raise ValueError("share_vars_from is not data parallel. Cannot "
...@@ -166,23 +188,11 @@ class CompiledProgram(object): ...@@ -166,23 +188,11 @@ class CompiledProgram(object):
"var to share.") "var to share.")
self._local_scopes = self._share_vars_from._executor.local_scopes() self._local_scopes = self._share_vars_from._executor.local_scopes()
else: else:
assert scope is not None, ""
self._local_scopes = [] self._local_scopes = []
self._exec_strategy.use_cuda = isinstance(self._place, core.CUDAPlace) self._exec_strategy.use_cuda = use_cuda
if self._exec_strategy.use_cuda: self._places = get_available_places(self._exec_strategy.use_cuda)
gpus_env = os.getenv("FLAGS_selected_gpus")
if gpus_env:
gpus = [int(s) for s in gpus_env.split(",")]
else:
gpus = [
i for i in six.moves.range(core.get_cuda_device_count())
]
self._places = [core.CUDAPlace(i) for i in gpus]
else:
cpu_num = int(
os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
self._places = [core.CPUPlace() for _ in six.moves.range(cpu_num)]
assert self._places, "no place for execution"
if self._exec_strategy.num_threads == 0: if self._exec_strategy.num_threads == 0:
if self._exec_strategy.use_cuda: if self._exec_strategy.use_cuda:
...@@ -197,9 +207,11 @@ class CompiledProgram(object): ...@@ -197,9 +207,11 @@ class CompiledProgram(object):
# FIXME(dzhwinter): enable_inplace should be after memory_optimize # FIXME(dzhwinter): enable_inplace should be after memory_optimize
# if turn on python memory optimize, turn off the inplace_pass. # if turn on python memory optimize, turn off the inplace_pass.
if self._build_strategy.memory_optimize is None: if self._build_strategy.memory_optimize is None:
self._build_strategy.memory_optimize = False if self._program and self._program._is_mem_optimized else True self._build_strategy.memory_optimize = False \
if self._program and self._program._is_mem_optimized else True
if self._build_strategy.enable_inplace is None: if self._build_strategy.enable_inplace is None:
self._build_strategy.enable_inplace = False if self._program and self._program._is_mem_optimized else True self._build_strategy.enable_inplace = False \
if self._program and self._program._is_mem_optimized else True
# TODO(wuyi): trainer endpoings should be passed in through # TODO(wuyi): trainer endpoings should be passed in through
# build_strategy, not program.xxx. # build_strategy, not program.xxx.
...@@ -221,12 +233,12 @@ class CompiledProgram(object): ...@@ -221,12 +233,12 @@ class CompiledProgram(object):
places = list(map(_place_obj, self._places)) places = list(map(_place_obj, self._places))
return core.ParallelExecutor( return core.ParallelExecutor(places,
places,
set(self._persistable_vars), set(self._persistable_vars),
cpt.to_text(self._loss_name) cpt.to_text(self._loss_name)
if self._loss_name else six.u(''), self._scope, self._local_scopes, if self._loss_name else six.u(''), scope,
self._exec_strategy, self._build_strategy, self._graph) self._local_scopes, self._exec_strategy,
self._build_strategy, self._graph)
def _compile_inference(self): def _compile_inference(self):
return core.create_paddle_predictor(self._infer_config) return core.create_paddle_predictor(self._infer_config)
...@@ -253,7 +265,9 @@ class CompiledProgram(object): ...@@ -253,7 +265,9 @@ class CompiledProgram(object):
self._scope = scope self._scope = scope
self._place = place self._place = place
if self._is_data_parallel: if self._is_data_parallel:
self._executor = self._compile_data_parallel() self._executor = self._compile_data_parallel(
use_cuda=isinstance(self._place, core.CUDAPlace),
scope=self._scope)
elif self._is_inference: elif self._is_inference:
self._executor = self._compile_inference() self._executor = self._compile_inference()
else: else:
......
...@@ -261,20 +261,20 @@ def _as_lodtensor(data, place): ...@@ -261,20 +261,20 @@ def _as_lodtensor(data, place):
class Executor(object): class Executor(object):
""" """
An Executor in Python, only support the single-GPU running. For multi-cards, please refer to An Executor in Python, supports single/multiple-GPU running, and single/multiple-CPU running.
ParallelExecutor. Python executor takes a program, adds feed operators and fetch operators to this program according
Python executor takes a program, add feed operators and fetch operators to this program according
to feed map and fetch_list. Feed map provides input data for the program. fetch_list provides to feed map and fetch_list. Feed map provides input data for the program. fetch_list provides
the variables(or names) that user want to get after program run. Note: the executor will run all the variables(or names) that user wants to get after program runs. Note: the executor will run all
operators in the program but not only the operators dependent by the fetch_list. operators in the program but not only the operators dependent by the fetch_list.
It store the global variables into the global scope, and create a local scope for the temporary It stores the global variables into the global scope, and creates a local scope for the temporary
variables. The local scope contents will be discarded after every minibatch forward/backward finished. variables. The contents in local scope may be discarded after every minibatch forward/backward
But the global scope variables will be persistent through different runs. finished. But the global scope variables will be persistent through different runs.
All of ops in program will be running in sequence.
Example: Example:
.. code-block:: python .. code-block:: python
# First create the Executor. # First create the Executor.
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place) exe = fluid.Executor(place)
...@@ -297,9 +297,6 @@ class Executor(object): ...@@ -297,9 +297,6 @@ class Executor(object):
Args: Args:
place(core.CPUPlace|core.CUDAPlace(n)): indicate the executor run on which device place(core.CPUPlace|core.CUDAPlace(n)): indicate the executor run on which device
Note: For debugging complicated network in parallel-GPUs, you can test it on the executor.
They has the exactly same arguments, and expected the same results.
""" """
def __init__(self, place): def __init__(self, place):
...@@ -382,6 +379,12 @@ class Executor(object): ...@@ -382,6 +379,12 @@ class Executor(object):
] ]
return outs return outs
'''
TODO(typhoonzero): Define "no longer use" meaning? Can user create
a new Executor for the same program and run?
TODO(panyx0718): Why ParallelExecutor doesn't have close?
'''
def close(self): def close(self):
""" """
Close this executor. Close this executor.
...@@ -389,9 +392,6 @@ class Executor(object): ...@@ -389,9 +392,6 @@ class Executor(object):
You can no longer use this executor after calling this method. You can no longer use this executor after calling this method.
For the distributed training, this method would free the resource on PServers related to For the distributed training, this method would free the resource on PServers related to
the current Trainer. the current Trainer.
TODO(typhoonzero): Define "no longer use" meaning? Can user create
a new Executor for the same program and run?
TODO(panyx0718): Why ParallelExecutor doesn't have close?
Example: Example:
>>> cpu = core.CPUPlace() >>> cpu = core.CPUPlace()
......
...@@ -87,15 +87,6 @@ def _current_expected_place(): ...@@ -87,15 +87,6 @@ def _current_expected_place():
return _imperative_current_expected_place_ return _imperative_current_expected_place_
def is_pserver_mode(main_program):
main = main_program if main_program \
else default_main_program()
for op in main.global_block().ops:
if op.type in ["send", "recv"]:
return True
return False
class NameScope(object): class NameScope(object):
def __init__(self, name="", parent=None): def __init__(self, name="", parent=None):
self._children = dict() self._children = dict()
......
...@@ -468,9 +468,10 @@ def save_persistables(executor, dirname, main_program=None, filename=None): ...@@ -468,9 +468,10 @@ def save_persistables(executor, dirname, main_program=None, filename=None):
exe = fluid.Executor(fluid.CPUPlace()) exe = fluid.Executor(fluid.CPUPlace())
param_path = "./my_paddle_model" param_path = "./my_paddle_model"
# `prog` can be a program defined by the user
prog = fluid.default_main_program() prog = fluid.default_main_program()
fluid.io.save_persistables(executor=exe, dirname=param_path, fluid.io.save_persistables(executor=exe, dirname=param_path,
main_program=None) main_program=prog)
""" """
if main_program and main_program._is_distributed: if main_program and main_program._is_distributed:
......
...@@ -13,15 +13,11 @@ ...@@ -13,15 +13,11 @@
# limitations under the License. # limitations under the License.
from __future__ import print_function from __future__ import print_function
import multiprocessing
from . import core from . import core
from . import framework from . import framework
from . import executor from . import executor
from .. import compat as cpt from . import compiler
import warnings
import sys import sys
import six
import os
__all__ = ['ParallelExecutor'] __all__ = ['ParallelExecutor']
...@@ -97,99 +93,27 @@ class ParallelExecutor(object): ...@@ -97,99 +93,27 @@ class ParallelExecutor(object):
'Please use CompiledProgram and Executor. CompiledProgram ' 'Please use CompiledProgram and Executor. CompiledProgram '
'is a central place for optimization and Executor is the ' 'is a central place for optimization and Executor is the '
'unified executor. Example can be found in compiler.py.\n') 'unified executor. Example can be found in compiler.py.\n')
# step1: get places, the places are used in run too.
self._places = []
if use_cuda:
gpus_env = os.getenv("FLAGS_selected_gpus")
if gpus_env:
gpus = [int(s) for s in gpus_env.split(",")]
else:
gpus = [
i for i in six.moves.range(core.get_cuda_device_count())
]
self._places = [core.CUDAPlace(i) for i in gpus]
else:
cpu_num = int(
os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
self._places = [core.CPUPlace() for _ in six.moves.range(cpu_num)]
assert self._places, "no place for execution"
# step2: init exec_strategy
if exec_strategy is None:
exec_strategy = ExecutionStrategy()
exec_strategy.use_cuda = use_cuda
if exec_strategy.num_threads == 0:
if use_cuda:
# Experiments on se-resnext shows that too many threads hurt
# performance. Worth tunning for other models in the future.
exec_strategy.num_threads = len(self._places) * 4
else:
cpu_num = int(
os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
exec_strategy.num_threads = cpu_num * 2
# step3: init build_strategy
if build_strategy is None: if build_strategy is None:
build_strategy = BuildStrategy() build_strategy = BuildStrategy()
build_strategy.num_trainers = num_trainers build_strategy.num_trainers = num_trainers
build_strategy.trainer_id = trainer_id build_strategy.trainer_id = trainer_id
# FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode,
# num_trainers is 1, so the current fields of build_strategy doesn't tell if
# it's distributed model.
build_strategy.is_distribution = framework.is_pserver_mode(
main_program) or num_trainers > 1
# step4: get main_program, scope, local_scopes
main = main_program if main_program \
else framework.default_main_program()
# FIXME(dzhwinter): enable_inplace should be after memory_optimize
# if turn on python memory optimize, turn off the inplace_pass.
if build_strategy.memory_optimize is None:
build_strategy.memory_optimize = False if main._is_mem_optimized else True
if build_strategy.enable_inplace is None:
build_strategy.enable_inplace = False if main._is_mem_optimized else True
scope = scope if scope is not None else executor.global_scope()
if share_vars_from and not isinstance(share_vars_from,
ParallelExecutor):
raise TypeError("share_vars_from must be ParallelExecutor.")
local_scopes = share_vars_from.executor.local_scopes()\
if share_vars_from else []
# step5: check trainers_endpoints, it is used for distribution.
trainers_endpoints = main._trainers_endpoints
if num_trainers > 1 and trainers_endpoints:
assert num_trainers == len(
trainers_endpoints), "num_trainers == len(endpoints)"
build_strategy.trainers_endpoints = trainers_endpoints
# step6: get persistable_vars, places. persistable_vars
# need be broadcast to other local_scope.
persistable_vars = set([
cpt.to_text(v.name) for v in [
var for var in main.list_vars()
if var.persistable and var.type != core.VarDesc.VarType.RAW
]
])
def place_obj(place):
p = core.Place()
p.set_place(place)
return p
places = list(map(place_obj, self._places))
# step7: init ParallelExecutor self._places = compiler.get_available_places(use_cuda)
# ParallelExecutor API will be deprecated, don't support parallel graph. self._scope = scope if scope is not None else executor.global_scope()
self._graph = core.Graph(main.desc)
self.executor = core.ParallelExecutor( main_program = main_program if main_program is not None \
places, persistable_vars, else framework.default_main_program()
cpt.to_text(loss_name) if loss_name else six.u(''), scope,
local_scopes, exec_strategy, build_strategy, self._graph)
self.scope = scope self._compiled_program = compiler.CompiledProgram(main_program)
self._compiled_program.with_data_parallel(
loss_name=loss_name,
build_strategy=build_strategy,
exec_strategy=exec_strategy,
share_vars_from=share_vars_from)
self._place = core.CUDAPlace(0) if use_cuda else core.CPUPlace()
self._executor = executor.Executor(self._place)
self._compiled_program._compile(place=self._place, scope=self._scope)
def run(self, fetch_list, feed=None, feed_dict=None, return_numpy=True): def run(self, fetch_list, feed=None, feed_dict=None, return_numpy=True):
""" """
...@@ -256,56 +180,11 @@ class ParallelExecutor(object): ...@@ -256,56 +180,11 @@ class ParallelExecutor(object):
loss = pe.run(feed=feeder.feed(cur_batch), loss = pe.run(feed=feeder.feed(cur_batch),
fetch_list=[avg_cost.name])) fetch_list=[avg_cost.name]))
""" """
if feed is None and feed_dict is not None: return self._executor.run(program=self._compiled_program,
feed = feed_dict scope=self._scope,
print( feed=feed,
"`feed_dict` is deprecated. Please use `feed=`", fetch_list=fetch_list,
file=sys.stderr) return_numpy=return_numpy)
if isinstance(feed, dict):
feed_tensor_dict = dict()
for feed_name in feed:
feed_tensor = feed[feed_name]
if not isinstance(feed_tensor, core.LoDTensor):
feed_tensor = core.LoDTensor()
# always set to CPU place, since the tensor need to be splitted
# it is fast in CPU
feed_tensor.set(feed[feed_name], core.CPUPlace())
feed_tensor_dict[feed_name] = feed_tensor
self.executor.feed_and_split_tensor_into_local_scopes(
feed_tensor_dict)
elif isinstance(feed, list) or isinstance(feed, tuple):
if len(feed) != len(self._places):
raise ValueError(
"Feed a list of tensor, the list should be the same size as places"
)
res = list()
for i, each in enumerate(feed):
if not isinstance(each, dict):
raise TypeError(
"Each element of feed list should be a dict")
res_dict = dict()
for feed_name in each:
tensor = each[feed_name]
if not isinstance(tensor, core.LoDTensor):
tmp = core.LoDTensor()
tmp.set(tensor, self._places[i])
tensor = tmp
res_dict[feed_name] = tensor
res.append(res_dict)
self.executor.feed_tensors_into_local_scopes(res)
fetch_var_name = 'fetch'
self.executor.run(fetch_list, fetch_var_name)
arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array()
if return_numpy:
return executor.as_numpy(arr)
return [arr[i] for i in range(len(arr))]
@property @property
def device_count(self): def device_count(self):
......
...@@ -15,44 +15,139 @@ ...@@ -15,44 +15,139 @@
from __future__ import print_function from __future__ import print_function
import unittest import unittest
import numpy as np
from paddle.fluid.tests.unittests.test_conv2d_op import TestConv2dOp, TestWithPad, TestWithStride, TestWithGroup, TestWith1x1, TestWithInput1x1Filter1x1 import paddle.fluid.core as core
from paddle.fluid.tests.unittests.op_test import OpTest
from paddle.fluid.tests.unittests.test_conv2d_op import TestConv2dOp
class TestMKLDNN(TestConv2dOp): def conv2d_bias_naive(out, bias):
def init_kernel_type(self): _, out_c, _, _ = out.shape
self.use_mkldnn = True
self.data_format = "NCHW"
for l in range(out_c):
out[:, l, :, :] = out[:, l, :, :] + bias[l]
return out
class TestMKLDNNWithPad(TestWithPad):
def init_kernel_type(self):
self.use_mkldnn = True
self.data_format = "NCHW"
def conv2d_residual_naive(out, residual):
assert out.shape == residual.shape
out = np.add(out, residual)
return out
class TestMKLDNNWithStride(TestWithStride):
def init_kernel_type(self):
self.use_mkldnn = True
self.data_format = "NCHW"
class TestConv2dMKLDNNOp(TestConv2dOp):
def init_group(self):
self.groups = 1
class TestMKLDNNWithGroup(TestWithGroup):
def init_kernel_type(self): def init_kernel_type(self):
self.use_mkldnn = True
self.data_format = "NCHW" self.data_format = "NCHW"
self.use_mkldnn = True
self._cpu_only = True
def init_test_case(self):
self.pad = [0, 0]
self.stride = [1, 1]
self.input_size = [2, 3, 5, 5] # NCHW
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] // self.groups
self.filter_size = [6, f_c, 3, 3]
class TestMKLDNNWith1x1(TestWith1x1): def setUp(self):
def init_kernel_type(self): self.fuse_bias = False
self.use_mkldnn = True self.bias_size = None
self.data_format = "NCHW" self.fuse_relu = False
self.fuse_residual_connection = False
self.input_residual_size = None
TestConv2dOp.setUp(self)
output = self.outputs['Output']
class TestMKLDNNWithInput1x1Filter1x1(TestWithInput1x1Filter1x1): #mkldnn only supports either conv-sum-relu, or conv-relu.
def init_kernel_type(self): if self.fuse_bias and self.bias_size is not None:
self.use_mkldnn = True bias = np.random.random(self.bias_size).astype(self.dtype)
self.data_format = "NCHW" output = conv2d_bias_naive(output, bias)
output = output.astype(self.dtype)
self.attrs['fuse_bias'] = self.fuse_bias
self.inputs['Bias'] = OpTest.np_dtype_to_fluid_dtype(bias)
if self.fuse_residual_connection and self.input_residual_size is not None:
input_residual = np.random.random(self.input_residual_size).astype(
self.dtype)
output = conv2d_residual_naive(output, input_residual)
self.attrs[
'fuse_residual_connection'] = self.fuse_residual_connection
self.inputs['ResidualData'] = OpTest.np_dtype_to_fluid_dtype(
input_residual)
if self.fuse_relu:
output = np.maximum(output, 0).astype(self.dsttype)
output = output.astype(self.dtype)
self.attrs['fuse_bias'] = self.fuse_bias
self.attrs['fuse_relu'] = self.fuse_relu
self.attrs['fuse_residual_connection'] = self.fuse_residual_connection
self.outputs['Output'] = output
class TestWithFuse(TestConv2dMKLDNNOp):
def init_test_case(self):
TestConv2dMKLDNNOp.init_test_case(self)
self.pad = [1, 1]
self.fuse_bias = True
self.bias_size = [6]
self.fuse_residual_connection = True
self.input_residual_size = [2, 6, 5, 5]
def test_check_grad(self):
pass
def test_check_grad_no_filter(self):
pass
def test_check_grad_no_input(self):
pass
class TestWithPadWithBias(TestConv2dMKLDNNOp):
def init_test_case(self):
TestConv2dMKLDNNOp.init_test_case(self)
self.pad = [1, 1]
self.input_size = [2, 3, 6, 6]
class TestWithStride(TestConv2dMKLDNNOp):
def init_test_case(self):
TestConv2dMKLDNNOp.init_test_case(self)
self.pad = [1, 1]
self.stride = [2, 2]
self.input_size = [2, 3, 6, 6]
class TestWithGroup(TestConv2dMKLDNNOp):
def init_group(self):
self.groups = 3
class TestWith1x1(TestConv2dMKLDNNOp):
def init_test_case(self):
TestConv2dMKLDNNOp.init_test_case(self)
self.filter_size = [6, 3, 1, 1]
class TestWithInput1x1Filter1x1(TestConv2dMKLDNNOp):
def init_test_case(self):
TestConv2dMKLDNNOp.init_test_case(self)
self.input_size = [2, 3, 1, 1] # NCHW
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] // self.groups
self.filter_size = [6, f_c, 1, 1]
def init_group(self):
self.groups = 3
if __name__ == '__main__': if __name__ == '__main__':
......
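The setUp above builds the reference output in a fixed order: bias add, optional residual add, then ReLU. The same composition written against toy numpy arrays (shapes here are arbitrary examples, not taken from the tests):

import numpy as np

out = np.random.rand(2, 6, 5, 5).astype('float32')        # conv output, NCHW
bias = np.random.rand(6).astype('float32')                 # one value per output channel
residual = np.random.rand(2, 6, 5, 5).astype('float32')    # same shape as out

out = out + bias.reshape(1, -1, 1, 1)   # per-channel add, what conv2d_bias_naive does
out = out + residual                    # element-wise add, what conv2d_residual_naive does
out = np.maximum(out, 0)                # fuse_relu: clamp negatives to zero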
...@@ -18,6 +18,24 @@ import unittest
from paddle.fluid.tests.unittests.test_pool2d_op import TestPool2D_Op, TestCase1, TestCase2, TestCase3, TestCase4, TestCase5


def create_test_mkldnn_use_ceil_class(parent):
    class TestMKLDNNPool2DUseCeilCase(parent):
        def init_kernel_type(self):
            self.use_mkldnn = True

        def init_ceil_mode(self):
            self.ceil_mode = True

    cls_name = "{0}_{1}".format(parent.__name__, "MKLDNNCeilModeCast")
    TestMKLDNNPool2DUseCeilCase.__name__ = cls_name
    globals()[cls_name] = TestMKLDNNPool2DUseCeilCase


create_test_mkldnn_use_ceil_class(TestPool2D_Op)
create_test_mkldnn_use_ceil_class(TestCase1)
create_test_mkldnn_use_ceil_class(TestCase2)


def create_test_mkldnn_class(parent):
    class TestMKLDNNCase(parent):
        def init_kernel_type(self):
......
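create_test_mkldnn_use_ceil_class generates one subclass per parent and registers it in the module globals under a derived name, so normal unittest discovery picks it up. For TestPool2D_Op the call is roughly equivalent to writing by hand (the class name comes from the "{0}_{1}".format(...) call above):

class TestPool2D_Op_MKLDNNCeilModeCast(TestPool2D_Op):
    def init_kernel_type(self):
        self.use_mkldnn = True

    def init_ceil_mode(self):
        self.ceil_mode = True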
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import ast
import hashlib
import importlib
import paddle.fluid

files = [
    "paddle.fluid", "paddle.fluid.average", "paddle.fluid.backward",
    "paddle.fluid.clip", "paddle.fluid.data_feeder", "paddle.fluid.executor",
    "paddle.fluid.initializer", "paddle.fluid.io", "paddle.fluid.layers",
    "paddle.fluid.metrics", "paddle.fluid.nets", "paddle.fluid.optimizer",
    "paddle.fluid.profiler", "paddle.fluid.recordio_writer",
    "paddle.fluid.regularizer", "paddle.fluid.transpiler"
]


def md5(doc):
    hash = hashlib.md5()
    hash.update(str(doc))
    return hash.hexdigest()


def get_module():
    # Record an md5 of every public API's docstring, keyed by its full name.
    for fi in files:
        fi_lib = importlib.import_module(fi)
        doc_function = getattr(fi_lib, "__all__")
        for api in doc_function:
            api_name = fi + "." + api
            try:
                doc_module = getattr(eval(api_name), "__doc__")
            except:
                pass
            doc_md5_code = md5(doc_module)
            doc_dict[api_name] = doc_md5_code


def doc_md5_dict(doc_md5_path):
    # Load the cached {api_name: md5} dict written by a previous run.
    with open(doc_md5_path, "rb") as f:
        doc_md5 = f.read()
        doc_md5_dict = ast.literal_eval(doc_md5)
    return doc_md5_dict


def check_doc_md5():
    # Return True when every current docstring hash matches the cached one,
    # otherwise return the freshly computed dict.
    for k, v in doc_dict.items():
        try:
            if doc_ci_dict[k] != v:
                return doc_dict
        except:
            return doc_dict
    return True


if __name__ == "__main__":
    doc_dict = {}
    doc_ci_dict = {}
    doc_md5_file = "/root/.cache/doc_md5.txt"
    if not os.path.exists(doc_md5_file):
        os.mknod(doc_md5_file)
    else:
        doc_ci_dict = doc_md5_dict(doc_md5_file)
    get_module()
    if not os.path.getsize(doc_md5_file):
        with open(doc_md5_file, 'w') as f:
            f.write(str(doc_dict))
        check_dic = True
        print(check_dic)
    else:
        check_dic = check_doc_md5()
        print(check_dic)
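On the first run the script writes str(doc_dict) into the cache file, i.e. a plain Python dict literal mapping each API name to the md5 of its docstring; later runs read it back with ast.literal_eval and compare. Illustrative cache contents (the API names are real, the hashes are made up):

{'paddle.fluid.layers.fc': '1b2c3d4e5f60718293a4b5c6d7e8f901',
 'paddle.fluid.io.save_params': '0a1b2c3d4e5f60718293a4b5c6d7e8f9'}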
...@@ -26,4 +26,10 @@ for each_diff in result:
    print(each_diff)

if error:
    print(
        '''If you modify/add/delete the API files, including code and comments, please follow these steps in order to pass the CI:
        1. cd ${paddle_path}, compile paddle;
        2. pip install build/python/dist/ (the built whl package);
        3. run "python tools/print_signatures.py paddle.fluid, paddle.reader > paddle/fluid/API.spec"'''
    )
    sys.exit(1)
...@@ -24,12 +24,19 @@ import inspect
import collections
import sys
import pydoc
import hashlib

member_dict = collections.OrderedDict()

experimental_namespace = {"paddle.fluid.imperative"}


def md5(doc):
    hash = hashlib.md5()
    hash.update(str(doc).encode('utf-8'))
    return hash.hexdigest()


def visit_member(parent_name, member):
    cur_name = ".".join([parent_name, member.__name__])
    if inspect.isclass(member):
...@@ -39,7 +46,10 @@ def visit_member(parent_name, member):
                visit_member(cur_name, value)
    elif callable(member):
        try:
            doc = ('document', md5(member.__doc__))
            args = inspect.getargspec(member)
            all = (args, doc)
            member_dict[cur_name] = all
        except TypeError:  # special for PyBind method
            member_dict[cur_name] = "  ".join([
                line.strip() for line in pydoc.render_doc(member).split('\n')
......
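With this change each callable's entry in member_dict pairs its getargspec result with an md5 of its docstring, so editing only a docstring now shows up as a diff against API.spec. Roughly what a recorded entry looks like after the change (the API name, argument list, and hash below are illustrative only):

member_dict['paddle.fluid.layers.relu'] = (
    ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)),
    ('document', 'c6b0c1c0a4f3c0d9e8f7a6b5c4d3e2f1'))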
...@@ -131,7 +131,7 @@ class Timeline(object):
                    if (k, event.device_id, "CPU") not in self._devices:
                        pid = self._allocate_pid()
                        self._devices[(k, event.device_id, "CPU")] = pid
                        # -1 device id represents CUDA API (runtime) calls, e.g. cudaLaunch, cudaMemcpy.
                        if event.device_id == -1:
                            self._chrome_trace.emit_pid("%s:cuda_api" % k, pid)
                        else:
......