Commit 2fb38c10 authored by luotao1

Merge branch 'develop' into runtime_context

This diff is collapsed.
......@@ -38,10 +38,10 @@ if(WITH_GPU)
nv_library(tensor SRCS tensor.cc .tensor_util.cu DEPS place memory data_type device_context)
add_dependencies(tensor tensor_util)
else()
nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context )
nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context profiler)
endif(WIN32)
else()
cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context )
cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context profiler)
endif()
cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
......
......@@ -12,7 +12,9 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/details/fetch_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
......@@ -55,7 +57,7 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
std::vector<FetchOpHandle *> fetch_ops;
for (auto &fetch_var_name : fetch_tensors) {
for (auto &var_map : graph_->Get<details::GraphVars>("vars")) {
for (auto &var_map : graph_->Get<details::GraphVars>(details::kGraphVars)) {
auto it = var_map.find(fetch_var_name);
if (it != var_map.end()) {
fetched_vars[fetch_var_name].push_back(*it->second.rbegin());
......
......@@ -882,7 +882,8 @@ class RuntimeInferShapeContext : public InferShapeContext {
const RuntimeContext& ctx_;
};
static void CheckTensorNANOrInf(const std::string& name,
static void CheckTensorNANOrInf(const std::string& op_type,
const std::string& name,
const framework::Tensor& tensor) {
if (tensor.memory_size() == 0) {
return;
......@@ -892,9 +893,9 @@ static void CheckTensorNANOrInf(const std::string& name,
return;
}
PADDLE_ENFORCE(!framework::TensorContainsInf(tensor),
"Tensor %s contains Inf", name);
"Operator %s output Tensor %s contains Inf", op_type, name);
PADDLE_ENFORCE(!framework::TensorContainsNAN(tensor),
"Tensor %s contains NAN", name);
"Operator %s output Tensor %s contains NAN", op_type, name);
}
void OperatorWithKernel::RuntimeInferShape(const Scope& scope,
......@@ -995,9 +996,10 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
auto* var = exec_scope.FindVar(vname);
if (var == nullptr) continue;
if (var->IsType<framework::LoDTensor>()) {
CheckTensorNANOrInf(vname, var->Get<framework::LoDTensor>());
CheckTensorNANOrInf(type_, vname, var->Get<framework::LoDTensor>());
} else if (var->IsType<framework::SelectedRows>()) {
CheckTensorNANOrInf(vname, var->Get<framework::SelectedRows>().value());
CheckTensorNANOrInf(type_, vname,
var->Get<framework::SelectedRows>().value());
}
}
}
......
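The NaN/Inf check rewritten above is only exercised when the corresponding runtime flag is switched on; the change makes the resulting error name the operator that produced the bad value, not just the tensor. A minimal sketch of turning it on from Python (the flag name check_nan_inf and the environment-variable mechanism are assumptions about this code base, not part of the diff):

# Sketch only: set the gflag through the environment before paddle is imported,
# so the flag is picked up when the framework bootstraps.
import os
os.environ['FLAGS_check_nan_inf'] = '1'

import paddle.fluid as fluid
# ... build and run a program as usual.  If an operator writes NaN/Inf into an
# output LoDTensor or SelectedRows value, the enforce message now reads e.g.
# "Operator softmax output Tensor fc_0.tmp_1 contains NAN" instead of only
# naming the tensor.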
......@@ -14,8 +14,11 @@
#include "paddle/fluid/framework/tensor_util.h"
#include <algorithm>
#include <limits>
#include <memory>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace framework {
......@@ -135,16 +138,19 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
#ifdef PADDLE_WITH_CUDA
else if (platform::is_gpu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
platform::RecordEvent record_event("TensorCopy:GPU->CPU");
auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
auto dst_cpu_place = boost::get<platform::CPUPlace>(dst_place);
memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr);
} else if (platform::is_cpu_place(src_place) &&
platform::is_gpu_place(dst_place)) {
platform::RecordEvent record_event("TensorCopy:CPU->GPU");
auto src_cpu_place = boost::get<platform::CPUPlace>(src_place);
auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, nullptr);
} else if (platform::is_gpu_place(src_place) &&
platform::is_gpu_place(dst_place)) {
platform::RecordEvent record_event("TensorCopy:GPU->GPU");
if (src_ptr == dst_ptr && platform::is_same_place(src_place, dst_place)) {
VLOG(3) << "Skip copy the same data from " << src_place << " to "
<< dst_place;
......@@ -155,6 +161,7 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr);
} else if (platform::is_cuda_pinned_place(src_place) &&
platform::is_gpu_place(dst_place)) {
platform::RecordEvent record_event("TensorCopy:CUDAPinned->GPU");
auto src_pinned_place = boost::get<platform::CUDAPinnedPlace>(src_place);
auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
memory::Copy(dst_gpu_place, dst_ptr, src_pinned_place, src_ptr, size,
......
add_subdirectory(detail)
add_subdirectory(allocation)
cc_library(malloc SRCS malloc.cc DEPS place enforce allocator_facade)
cc_library(malloc SRCS malloc.cc DEPS place enforce allocator_facade profiler)
cc_library(memcpy SRCS memcpy.cc DEPS place)
cc_library(memory
......
......@@ -15,6 +15,7 @@ limitations under the License. */
#include "paddle/fluid/memory/memcpy.h"
#include <cstring> // for memcpy
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace memory {
......@@ -29,14 +30,23 @@ void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
#ifdef PADDLE_WITH_CUDA
static constexpr size_t kMaxGpuAsyncCopyBytes = 64 * 1024; // 64K
// NOTE(zcd): Do not use GpuMemcpySync as much as possible.
// because GpuMemcpySync issues the copying command to the default stream,
// which will make two commands from different streams cannot run concurrently.
// Reference:
// https://devblogs.nvidia.com/gpu-pro-tip-cuda-7-streams-simplify-concurrency/
template <>
void Copy<platform::CPUPlace, platform::CUDAPlace>(
platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place,
const void* src, size_t num, cudaStream_t stream) {
platform::SetDeviceId(src_place.device);
if (stream) {
platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CPU");
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
} else {
platform::RecordEvent record_event("GpuMemcpySync:GPU->CPU");
platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost);
// FIXME(zjl): do we really need it?
if (num <= kMaxGpuAsyncCopyBytes) {
......@@ -51,8 +61,10 @@ void Copy<platform::CUDAPlace, platform::CPUPlace>(
const void* src, size_t num, cudaStream_t stream) {
platform::SetDeviceId(dst_place.device);
if (stream) {
platform::RecordEvent record_event("GpuMemcpyAsync:CPU->GPU");
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
} else {
platform::RecordEvent record_event("GpuMemcpySync:CPU->GPU");
platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice);
// FIXME(zjl): do we really need it?
if (num <= kMaxGpuAsyncCopyBytes) {
......@@ -68,15 +80,19 @@ void Copy<platform::CUDAPlace, platform::CUDAPlace>(
if (dst_place == src_place) {
platform::SetDeviceId(src_place.device);
if (stream) {
platform::RecordEvent record_event("GpuMemcpyAsync(same_gpu):GPU->GPU");
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, stream);
} else {
platform::RecordEvent record_event("GpuMemcpySync(same_gpu):GPU->GPU");
platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToDevice);
}
} else {
if (stream) {
platform::RecordEvent record_event("GpuMemcpyPeerAsync:GPU->GPU");
platform::GpuMemcpyPeerAsync(dst, dst_place.device, src, src_place.device,
num, stream);
} else {
platform::RecordEvent record_event("GpuMemcpyPeerSync:GPU->GPU");
platform::GpuMemcpyPeerSync(dst, dst_place.device, src, src_place.device,
num);
}
......@@ -111,8 +127,10 @@ void Copy<platform::CUDAPinnedPlace, platform::CUDAPlace>(
cudaStream_t stream) {
platform::SetDeviceId(src_place.device);
if (stream) {
platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CUDAPinned");
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
} else {
platform::RecordEvent record_event("GpuMemcpySync:GPU->CUDAPinned");
platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost);
}
}
......@@ -124,8 +142,10 @@ void Copy<platform::CUDAPlace, platform::CUDAPinnedPlace>(
cudaStream_t stream) {
platform::SetDeviceId(dst_place.device);
if (stream) {
platform::RecordEvent record_event("GpuMemcpyAsync:CUDAPinned->GPU");
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
} else {
platform::RecordEvent record_event("GpuMemcpySync:CUDAPinned->GPU");
platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice);
}
}
......
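The RecordEvent scopes added in this hunk (and the TensorCopy ones in tensor_util.cc above) surface host/device copies as ordinary CPU events in the fluid profiler, which is also why malloc and tensor gain a profiler dependency in the CMake changes. A rough, self-contained sketch of observing them, assuming the existing fluid.profiler context manager:

import numpy as np
import paddle.fluid as fluid
import paddle.fluid.profiler as profiler

# Tiny program: one fc layer run a few times under the profiler on GPU, so the
# new copy events ("GpuMemcpySync:CPU->GPU", "TensorCopy:GPU->CPU", ...) show
# up in the summary and in the dumped profile.
x = fluid.layers.data(name='x', shape=[4], dtype='float32')
y = fluid.layers.fc(input=x, size=2)
exe = fluid.Executor(fluid.CUDAPlace(0))
exe.run(fluid.default_startup_program())

with profiler.profiler('All', 'total', '/tmp/profile'):
    for _ in range(10):
        exe.run(feed={'x': np.random.rand(8, 4).astype('float32')},
                fetch_list=[y])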
......@@ -84,13 +84,13 @@ class InterpolateOpMaker : public framework::OpProtoAndCheckerMaker {
.SetDefault("bilinear");
AddAttr<bool>(
"align_corners",
"an optinal bool. Defaults to True. "
"an optional bool. Defaults to True. "
"If True, the centers of 4 corner pixels of the input and output "
"tensors are aligned, preserving the values at the corner pixels, "
"if Flase, are not aligned")
"If False, are not aligned")
.SetDefault(true);
AddAttr<int>("align_mode",
"(int, default \'1\'), optional for bilinear interpolation"
"(int, default \'1\'), optional for bilinear interpolation, "
"can be \'0\' for src_idx = scale*(dst_indx+0.5)-0.5 , "
"can be \'1\' for src_idx = scale*dst_index .")
.SetDefault(1);
......
......@@ -14,6 +14,7 @@ limitations under the License. */
#include <algorithm>
#include <functional>
#include <memory>
#include <vector>
#include "ngraph/ngraph.hpp"
......
......@@ -16,6 +16,7 @@ limitations under the License. */
#include <algorithm>
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
......
......@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
......
......@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
......
......@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "ngraph/ngraph.hpp"
......
......@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
#include "paddle/fluid/platform/ngraph_helper.h"
......
......@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
......
......@@ -15,7 +15,9 @@ limitations under the License. */
#pragma once
#include <functional>
#include <memory>
#include <string>
#include <unordered_map>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
......
......@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "ngraph/ngraph.hpp"
......
......@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
......
......@@ -15,7 +15,9 @@ limitations under the License. */
#pragma once
#include <functional>
#include <memory>
#include <string>
#include <unordered_map>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h"
......
......@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
......
......@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
#include "paddle/fluid/platform/ngraph_helper.h"
......
......@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "ngraph/ngraph.hpp"
......
......@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
......
......@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h"
......
......@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
#include "paddle/fluid/platform/ngraph_helper.h"
......
......@@ -13,9 +13,11 @@
// limitations under the License.
#include "paddle/fluid/operators/reader/buffered_reader.h"
#include <memory>
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace operators {
namespace reader {
......@@ -49,9 +51,10 @@ BufferedReader::BufferedReader(
.Get(place_)))
->stream();
events.resize(buffer_size);
for (auto &event : events)
PADDLE_ENFORCE(cudaStreamCreate(&stream));
for (auto &event : events) {
PADDLE_ENFORCE(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
PADDLE_ENFORCE(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
}
}
#endif
cpu_buffer_.resize(buffer_size);
......@@ -83,12 +86,15 @@ void BufferedReader::ReadAsync(size_t i) {
#ifdef PADDLE_WITH_CUDA
// NOTE(liangdun): using async copy instead of TensorCopySync
// TensorCopySync would block other stream
// TensorCopySync would block other stream, because TensorCopySync
// issues the copying command to the default stream, it will make two
// commands from different streams cannot run concurrently.
if (platform::is_gpu_place(place_)) {
platform::SetDeviceId(boost::get<platform::CUDAPlace>(place_).device);
PADDLE_ENFORCE(cudaStreamWaitEvent(stream, events[i], 0));
TensorVec &gpu = gpu_buffer_[i];
gpu.resize(cpu.size());
platform::RecordEvent record_event("BufferedReader:MemoryCopy");
for (size_t i = 0; i < cpu.size(); ++i) {
gpu[i].Resize(cpu[i].dims());
gpu[i].set_layout(cpu[i].layout());
......@@ -97,20 +103,19 @@ void BufferedReader::ReadAsync(size_t i) {
auto gpu_ptr = gpu[i].mutable_data(place_, cpu[i].type());
auto size =
cpu[i].numel() * paddle::framework::SizeOfType(cpu[i].type());
if (platform::is_cuda_pinned_place(cpu_place))
if (platform::is_cuda_pinned_place(cpu_place)) {
memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr,
boost::get<platform::CUDAPinnedPlace>(cpu_place),
cpu_ptr, size, stream);
else if ((platform::is_gpu_place(cpu_place)))
} else if ((platform::is_gpu_place(cpu_place))) {
memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr,
boost::get<platform::CUDAPlace>(cpu_place), cpu_ptr,
size, stream);
else
// if cpu place is not pinned, async copy is slower than sync copy,
// so we use sync copy instead.
} else {
memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr,
boost::get<platform::CPUPlace>(cpu_place), cpu_ptr, size,
0);
stream);
}
gpu[i].set_lod(cpu[i].lod());
}
PADDLE_ENFORCE(cudaStreamSynchronize(stream));
......
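BufferedReader is the C++ side of the double-buffered reader, and the non-blocking stream plus per-slot events created above are what let its host-to-device copies overlap with computation instead of serializing on the default stream. A hedged sketch of opting into it from Python, assuming py_reader's use_double_buffer flag is what routes data through this reader:

import paddle.fluid as fluid

# Sketch: with use_double_buffer=True (the default) batches are staged through
# the buffered reader, so H2D copies run on the reader's own CUDA stream.
# Shapes and dtypes below are placeholders.
reader = fluid.layers.py_reader(
    capacity=64,
    shapes=[(-1, 3, 224, 224), (-1, 1)],
    dtypes=['float32', 'int64'],
    use_double_buffer=True)
image, label = fluid.layers.read_file(reader)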
......@@ -30,7 +30,6 @@ limitations under the License. */
#include "glog/logging.h"
#include "google/protobuf/text_format.h"
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/string/printf.h"
namespace paddle {
......@@ -222,19 +221,24 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
}
case CUPTI_ACTIVITY_KIND_DRIVER: {
auto *api = reinterpret_cast<const CUpti_ActivityAPI *>(record);
if (api->start != 0 && api->end != 0)
// -1 device id represents CUDA api call
tracer->AddCPURecords(
if (api->start != 0 && api->end != 0) {
// -1 device id represents ActiveKind api call
tracer->AddActiveKindRecords(
DriverKind(api->cbid), api->start, api->end, -1,
GetThreadIdFromSystemThreadId(api->threadId));
GetThreadIdFromSystemThreadId(api->threadId),
api->correlationId);
}
break;
}
case CUPTI_ACTIVITY_KIND_RUNTIME: {
auto *api = reinterpret_cast<const CUpti_ActivityAPI *>(record);
if (api->start != 0 && api->end != 0)
tracer->AddCPURecords(
if (api->start != 0 && api->end != 0) {
// -1 device id represents ActiveKind api call
tracer->AddActiveKindRecords(
RuntimeKind(api->cbid), api->start, api->end, -1,
GetThreadIdFromSystemThreadId(api->threadId));
GetThreadIdFromSystemThreadId(api->threadId),
api->correlationId);
}
break;
}
default: { break; }
......@@ -313,6 +317,25 @@ class DeviceTracerImpl : public DeviceTracer {
stream_id, correlation_id, bytes});
}
void AddActiveKindRecords(const std::string &anno, uint64_t start_ns,
uint64_t end_ns, int64_t device_id,
int64_t thread_id, uint32_t correlation_id) {
if (anno.empty()) {
VLOG(1) << "Empty timeline annotation.";
return;
}
thread_local std::forward_list<ActiveKindRecord>
*local_active_kind_records = nullptr;
if (local_active_kind_records == nullptr) {
std::lock_guard<std::mutex> l(trace_mu_);
active_kind_records_.emplace_front();
local_active_kind_records = &active_kind_records_.front();
}
// lock is not needed, only one thread call this function.
local_active_kind_records->push_front(ActiveKindRecord{
anno, start_ns, end_ns, device_id, thread_id, correlation_id});
}
void AddKernelRecords(std::string name, uint64_t start, uint64_t end,
int64_t device_id, int64_t stream_id,
uint32_t correlation_id) {
......@@ -355,6 +378,7 @@ class DeviceTracerImpl : public DeviceTracer {
}
const std::vector<int> cbids {
CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020,
CUPTI_RUNTIME_TRACE_CBID_cudaSetupArgument_v3020,
CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyAsync_v3020,
CUPTI_RUNTIME_TRACE_CBID_cudaMemset_v3020,
CUPTI_RUNTIME_TRACE_CBID_cudaMemsetAsync_v3020,
......@@ -385,6 +409,7 @@ class DeviceTracerImpl : public DeviceTracer {
correlations_.clear();
for (auto &tmp : correlations_pairs) tmp.clear();
for (auto &tmp : cpu_records_) tmp.clear();
for (auto &tmp : active_kind_records_) tmp.clear();
}
void GenEventKernelCudaElapsedTime() {
......@@ -437,7 +462,7 @@ class DeviceTracerImpl : public DeviceTracer {
event->set_device_id(r.device_id);
}
VLOG(1) << "KernelRecord event miss: " << miss << " find: " << find;
for (auto &tmp : cpu_records_)
for (auto &tmp : cpu_records_) {
for (const CPURecord &r : tmp) {
auto *event = profile_pb.add_events();
event->set_type(proto::Event::CPU);
......@@ -447,6 +472,24 @@ class DeviceTracerImpl : public DeviceTracer {
event->set_sub_device_id(r.thread_id);
event->set_device_id(r.device_id);
}
}
for (auto &tmp : active_kind_records_) {
for (const ActiveKindRecord &r : tmp) {
auto *event = profile_pb.add_events();
event->set_type(proto::Event::CPU);
auto c = correlations_.find(r.correlation_id);
if (c != correlations_.end() && c->second != nullptr) {
event->set_name(c->second->name());
event->set_detail_info(r.name);
} else {
event->set_name(r.name);
}
event->set_start_ns(r.start_ns);
event->set_end_ns(r.end_ns);
event->set_sub_device_id(r.thread_id);
event->set_device_id(r.device_id);
}
}
miss = find = 0;
for (const MemRecord &r : mem_records_) {
auto *event = profile_pb.add_events();
......@@ -510,6 +553,7 @@ class DeviceTracerImpl : public DeviceTracer {
std::forward_list<KernelRecord> kernel_records_;
std::forward_list<MemRecord> mem_records_;
std::forward_list<std::forward_list<CPURecord>> cpu_records_;
std::forward_list<std::forward_list<ActiveKindRecord>> active_kind_records_;
std::forward_list<std::forward_list<std::pair<uint32_t, Event *>>>
correlations_pairs;
std::unordered_map<uint32_t, Event *> correlations_;
......@@ -613,6 +657,7 @@ void initCuptiCbidStr() {
REGISTER_RUNTIME_CBID_STR(cudaUnbindTexture_v3020);
REGISTER_RUNTIME_CBID_STR(cudaSetupArgument_v3020);
REGISTER_RUNTIME_CBID_STR(cudaLaunch_v3020);
REGISTER_RUNTIME_CBID_STR(cudaDeviceGetPCIBusId_v4010);
#if CUDA_VERSION >= 9000
REGISTER_RUNTIME_CBID_STR(cudaLaunchCooperativeKernel_v9000);
REGISTER_RUNTIME_CBID_STR(cudaLaunchCooperativeKernelMultiDevice_v9000);
......
......@@ -63,7 +63,14 @@ class DeviceTracer {
uint32_t correlation_id;
uint64_t bytes;
};
struct ActiveKindRecord {
std::string name;
uint64_t start_ns;
uint64_t end_ns;
int64_t device_id;
int64_t thread_id;
uint32_t correlation_id;
};
virtual ~DeviceTracer() {}
// Needs to be called once before use.
virtual void Enable() = 0;
......@@ -85,6 +92,10 @@ class DeviceTracer {
virtual void AddCPURecords(const std::string& anno, uint64_t start_ns,
uint64_t end_ns, int64_t device_id,
int64_t thread_id) = 0;
virtual void AddActiveKindRecords(const std::string& anno, uint64_t start_ns,
uint64_t end_ns, int64_t device_id,
int64_t thread_id,
uint32_t correlation_id) = 0;
// Add a cuda kernel stats. `correlation_id` will be mapped to annotation
// added before for human readability.
......
......@@ -415,10 +415,11 @@ function assert_api_not_changed() {
source .env/bin/activate
pip install ${PADDLE_ROOT}/build/python/dist/*whl
python ${PADDLE_ROOT}/tools/print_signatures.py paddle.fluid,paddle.reader > new.spec
if [ "$1" == "cp35-cp35m" ] || [ "$1" == "cp36-cp36m" ] || [ "$1" == "cp37-cp37m" ]; then
# Use sed to make python2 and python3 sepc keeps the same
sed -i 's/arg0: str/arg0: unicode/g' new.spec
sed -i "s/\(.*Transpiler.*\).__init__ ArgSpec(args=\['self'].*/\1.__init__ /g" new.spec
sed -i "s/\(.*Transpiler.*\).__init__ (ArgSpec(args=\['self'].*/\1.__init__ /g" new.spec
fi
# ComposeNotAligned has significant difference between py2 and py3
sed -i '/.*ComposeNotAligned.*/d' new.spec
......@@ -452,12 +453,21 @@ function assert_api_spec_approvals() {
echo "checking ${API_FILE} change, PR: ${GIT_PR_ID}, changes: ${API_CHANGE}"
if [ ${API_CHANGE} ] && [ "${GIT_PR_ID}" != "" ]; then
# NOTE: per_page=10000 should be ok for all cases, a PR review > 10000 is not human readable.
APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 2887803`
if [ "$API_FILE" == "paddle/fluid/API.spec" ];then
APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
python ${PADDLE_ROOT}/tools/check_pr_approval.py 2 2887803 35982308`
else
APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 2887803`
fi
echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}"
if [ "${APPROVALS}" == "FALSE" ]; then
if [ "$API_FILE" == "paddle/fluid/API.spec" ];then
echo "You must have panyx0718 and shanyi15 approval for the api change! ${API_FILE}"
else
echo "You must have panyx0718 approval for the api change! ${API_FILE}"
exit 1
fi
exit 1
fi
fi
done
......@@ -472,19 +482,6 @@ function assert_api_spec_approvals() {
exit 1
fi
fi
pip install ${PADDLE_ROOT}/build/opt/paddle/share/wheels/*.whl
CHECK_DOCK_MD5=`python ${PADDLE_ROOT}/tools/check_doc_approval.py`
if [ "True" != ${CHECK_DOCK_MD5} ]; then
APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 35982308`
echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}"
if [ "${APPROVALS}" == "FALSE" ]; then
echo "You must have shanyi15 approval for the api doc change! "
exit 1
fi
echo ${CHECK_DOCK_MD5} >/root/.cache/doc_md5.txt
fi
}
......
......@@ -17,7 +17,6 @@ import os
import six
import sys
from .. import compat as cpt
from . import framework
from . import core
from . import framework
......@@ -36,6 +35,30 @@ def _place_obj(place):
return p
def _is_pserver_mode(main_program):
main = main_program if main_program \
else default_main_program()
for op in main.global_block().ops:
if op.type in ["send", "recv"]:
return True
return False
def get_available_places(use_cuda):
if use_cuda:
gpus_env = os.getenv("FLAGS_selected_gpus")
if gpus_env:
gpus = [int(s) for s in gpus_env.split(",")]
else:
gpus = [i for i in six.moves.range(core.get_cuda_device_count())]
places = [core.CUDAPlace(i) for i in gpus]
else:
cpu_num = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
places = [core.CPUPlace() for _ in six.moves.range(cpu_num)]
assert places, "no place for execution"
return places
class CompiledProgram(object):
"""
Compiles to Graph for execution.
......@@ -127,8 +150,7 @@ class CompiledProgram(object):
self._exec_strategy = ExecutionStrategy()
if self._build_strategy is None:
self._build_strategy = BuildStrategy()
self._build_strategy.is_distribution = framework.is_pserver_mode(
self._program)
self._build_strategy.is_distribution = _is_pserver_mode(self._program)
return self
def with_inference_optimize(self, config):
......@@ -153,9 +175,9 @@ class CompiledProgram(object):
def _with_distributed(self):
raise NotImplementedError()
def _compile_data_parallel(self):
def _compile_data_parallel(self, use_cuda=False, scope=None):
if self._share_vars_from:
if self._scope:
if scope:
sys.stderr.write("share_vars_from is set, scope is ignored.\n")
if not self._share_vars_from._is_data_parallel:
raise ValueError("share_vars_from is not data parallel. Cannot "
......@@ -166,23 +188,11 @@ class CompiledProgram(object):
"var to share.")
self._local_scopes = self._share_vars_from._executor.local_scopes()
else:
assert scope is not None, ""
self._local_scopes = []
self._exec_strategy.use_cuda = isinstance(self._place, core.CUDAPlace)
if self._exec_strategy.use_cuda:
gpus_env = os.getenv("FLAGS_selected_gpus")
if gpus_env:
gpus = [int(s) for s in gpus_env.split(",")]
else:
gpus = [
i for i in six.moves.range(core.get_cuda_device_count())
]
self._places = [core.CUDAPlace(i) for i in gpus]
else:
cpu_num = int(
os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
self._places = [core.CPUPlace() for _ in six.moves.range(cpu_num)]
assert self._places, "no place for execution"
self._exec_strategy.use_cuda = use_cuda
self._places = get_available_places(self._exec_strategy.use_cuda)
if self._exec_strategy.num_threads == 0:
if self._exec_strategy.use_cuda:
......@@ -197,9 +207,11 @@ class CompiledProgram(object):
# FIXME(dzhwinter): enable_inplace should be after memory_optimize
# if turn on python memory optimize, turn off the inplace_pass.
if self._build_strategy.memory_optimize is None:
self._build_strategy.memory_optimize = False if self._program and self._program._is_mem_optimized else True
self._build_strategy.memory_optimize = False \
if self._program and self._program._is_mem_optimized else True
if self._build_strategy.enable_inplace is None:
self._build_strategy.enable_inplace = False if self._program and self._program._is_mem_optimized else True
self._build_strategy.enable_inplace = False \
if self._program and self._program._is_mem_optimized else True
# TODO(wuyi): trainer endpoings should be passed in through
# build_strategy, not program.xxx.
......@@ -221,12 +233,12 @@ class CompiledProgram(object):
places = list(map(_place_obj, self._places))
return core.ParallelExecutor(
places,
set(self._persistable_vars),
cpt.to_text(self._loss_name)
if self._loss_name else six.u(''), self._scope, self._local_scopes,
self._exec_strategy, self._build_strategy, self._graph)
return core.ParallelExecutor(places,
set(self._persistable_vars),
cpt.to_text(self._loss_name)
if self._loss_name else six.u(''), scope,
self._local_scopes, self._exec_strategy,
self._build_strategy, self._graph)
def _compile_inference(self):
return core.create_paddle_predictor(self._infer_config)
......@@ -253,7 +265,9 @@ class CompiledProgram(object):
self._scope = scope
self._place = place
if self._is_data_parallel:
self._executor = self._compile_data_parallel()
self._executor = self._compile_data_parallel(
use_cuda=isinstance(self._place, core.CUDAPlace),
scope=self._scope)
elif self._is_inference:
self._executor = self._compile_inference()
else:
......
......@@ -261,45 +261,42 @@ def _as_lodtensor(data, place):
class Executor(object):
"""
An Executor in Python, only support the single-GPU running. For multi-cards, please refer to
ParallelExecutor.
Python executor takes a program, add feed operators and fetch operators to this program according
An Executor in Python, supports single/multiple-GPU running, and single/multiple-CPU running.
Python executor takes a program, adds feed operators and fetch operators to this program according
to feed map and fetch_list. Feed map provides input data for the program. fetch_list provides
the variables(or names) that user want to get after program run. Note: the executor will run all
the variables(or names) that user wants to get after program runs. Note: the executor will run all
operators in the program but not only the operators dependent by the fetch_list.
It store the global variables into the global scope, and create a local scope for the temporary
variables. The local scope contents will be discarded after every minibatch forward/backward finished.
But the global scope variables will be persistent through different runs.
All of ops in program will be running in sequence.
It stores the global variables into the global scope, and creates a local scope for the temporary
variables. The contents in local scope may be discarded after every minibatch forward/backward
finished. But the global scope variables will be persistent through different runs.
Example:
.. code-block:: python
# First create the Executor.
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
# Run the startup program once and only once.
# Not need to optimize/compile the startup program.
exe.run(fluid.default_startup_program())
# Run the main program directly without compile.
loss, = exe.run(fluid.default_main_program(),
feed=feed_dict,
fetch_list=[loss.name])
# Or, compiled the program and run. See `CompiledProgram` for more detail.
compiled_prog = compiler.CompiledProgram(
fluid.default_main_program()).with_data_parallel(
loss_name=loss.name)
loss, = exe.run(compiled_prog,
feed=feed_dict,
fetch_list=[loss.name])
.. code-block:: python
# First create the Executor.
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
# Run the startup program once and only once.
# Not need to optimize/compile the startup program.
exe.run(fluid.default_startup_program())
# Run the main program directly without compile.
loss, = exe.run(fluid.default_main_program(),
feed=feed_dict,
fetch_list=[loss.name])
# Or, compiled the program and run. See `CompiledProgram` for more detail.
compiled_prog = compiler.CompiledProgram(
fluid.default_main_program()).with_data_parallel(
loss_name=loss.name)
loss, = exe.run(compiled_prog,
feed=feed_dict,
fetch_list=[loss.name])
Args:
place(core.CPUPlace|core.CUDAPlace(n)): indicate the executor run on which device
Note: For debugging complicated network in parallel-GPUs, you can test it on the executor.
They has the exactly same arguments, and expected the same results.
"""
def __init__(self, place):
......@@ -382,6 +379,12 @@ class Executor(object):
]
return outs
'''
TODO(typhoonzero): Define "no longer use" meaning? Can user create
a new Executor for the same program and run?
TODO(panyx0718): Why ParallelExecutor doesn't have close?
'''
def close(self):
"""
Close this executor.
......@@ -389,9 +392,6 @@ class Executor(object):
You can no longer use this executor after calling this method.
For the distributed training, this method would free the resource on PServers related to
the current Trainer.
TODO(typhoonzero): Define "no longer use" meaning? Can user create
a new Executor for the same program and run?
TODO(panyx0718): Why ParallelExecutor doesn't have close?
Example:
>>> cpu = core.CPUPlace()
......
......@@ -87,15 +87,6 @@ def _current_expected_place():
return _imperative_current_expected_place_
def is_pserver_mode(main_program):
main = main_program if main_program \
else default_main_program()
for op in main.global_block().ops:
if op.type in ["send", "recv"]:
return True
return False
class NameScope(object):
def __init__(self, name="", parent=None):
self._children = dict()
......
......@@ -468,9 +468,10 @@ def save_persistables(executor, dirname, main_program=None, filename=None):
exe = fluid.Executor(fluid.CPUPlace())
param_path = "./my_paddle_model"
# `prog` can be a program defined by the user
prog = fluid.default_main_program()
fluid.io.save_persistables(executor=exe, dirname=param_path,
main_program=None)
main_program=prog)
"""
if main_program and main_program._is_distributed:
......
......@@ -6844,56 +6844,58 @@ def image_resize(input,
Example:
For scale:
if align_corners = True && out_size > 1 :
.. code-block:: text
scale_factor = (in_size-1.0)/(out_size-1.0)
else:
For scale:
scale_factor = float(in_size/out_size)
Nearest neighbor interpolation:
if:
align_corners = False
if align_corners = True && out_size > 1 :
input : (N,C,H_in,W_in)
output: (N,C,H_out,W_out) where:
scale_factor = (in_size-1.0)/(out_size-1.0)
else:
scale_factor = float(in_size/out_size)
Nearest neighbor interpolation:
if:
align_corners = False
H_out = \left \lfloor {H_{in} * scale_{}factor}} \right \rfloor
W_out = \left \lfloor {W_{in} * scale_{}factor}} \right \rfloor
input : (N,C,H_in,W_in)
output: (N,C,H_out,W_out) where:
else:
align_corners = True
H_out = floor (H_{in} * scale_{factor})
W_out = floor (W_{in} * scale_{factor})
input : (N,C,H_in,W_in)
output: (N,C,H_out,W_out) where:
else:
align_corners = True
H_out = round(H_{in} * scale_{factor})
W_out = round(W_{in} * scale_{factor})
input : (N,C,H_in,W_in)
output: (N,C,H_out,W_out) where:
Bilinear interpolation:
H_out = round(H_{in} * scale_{factor})
W_out = round(W_{in} * scale_{factor})
if:
align_corners = False , align_mode = 0
input : (N,C,H_in,W_in)
output: (N,C,H_out,W_out) where:
H_out = (H_{in}+0.5) * scale_{factor} - 0.5
W_out = (W_{in}+0.5) * scale_{factor} - 0.5
Bilinear interpolation:
if:
align_corners = False , align_mode = 0
input : (N,C,H_in,W_in)
output: (N,C,H_out,W_out) where:
H_out = (H_{in}+0.5) * scale_{factor} - 0.5
W_out = (W_{in}+0.5) * scale_{factor} - 0.5
else:
input : (N,C,H_in,W_in)
output: (N,C,H_out,W_out) where:
else:
input : (N,C,H_in,W_in)
output: (N,C,H_out,W_out) where:
H_out = H_{in} * scale_{factor}
W_out = W_{in} * scale_{factor}
H_out = H_{in} * scale_{factor}
W_out = W_{in} * scale_{factor}
For details of nearest neighbor interpolation, please refer to Wikipedia:
https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation.
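The align_corners cases above differ only in how scale_factor is derived and in whether the final size is floored or rounded. A tiny worked example (numbers chosen for illustration only, not taken from the diff):

# Given an output size: how scale_factor is derived.
in_size, out_size = 5, 10
scale_aligned = (in_size - 1.0) / (out_size - 1.0)   # align_corners=True  -> 4/9
scale_plain = float(in_size) / out_size              # align_corners=False -> 0.5

# Given a scale: how the nearest-neighbor output size is computed.
H_in, scale = 7, 0.5
H_out_false = int(H_in * scale)    # align_corners=False: floor(3.5) = 3
H_out_true = round(H_in * scale)   # align_corners=True:  round(3.5) = 4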
......@@ -7048,41 +7050,39 @@ def resize_bilinear(input,
Align_corners and align_mode are optinal parameters,the calculation
method of interpolation can be selected by them.
Align_corners and align_mode are optinal parameters,the calculation method
of interpolation can be selected by them.
Example:
For scale:
if align_corners = True && out_size > 1 :
.. code-block:: text
scale_factor = (in_size-1.0)/(out_size-1.0)
else:
For scale:
scale_factor = float(in_size/out_size)
if align_corners = True && out_size > 1 :
Bilinear interpolation:
scale_factor = (in_size-1.0)/(out_size-1.0)
else:
scale_factor = float(in_size/out_size)
if:
align_corners = False , align_mode = 0
input : (N,C,H_in,W_in)
output: (N,C,H_out,W_out) where:
H_out = (H_{in}+0.5) * scale_{factor} - 0.5
W_out = (W_{in}+0.5) * scale_{factor} - 0.5
Bilinear interpolation:
if:
align_corners = False , align_mode = 0
input : (N,C,H_in,W_in)
output: (N,C,H_out,W_out) where:
H_out = (H_{in}+0.5) * scale_{factor} - 0.5
W_out = (W_{in}+0.5) * scale_{factor} - 0.5
else:
else:
input : (N,C,H_in,W_in)
output: (N,C,H_out,W_out) where:
input : (N,C,H_in,W_in)
output: (N,C,H_out,W_out) where:
H_out = H_{in} * scale_{factor}
W_out = W_{in} * scale_{factor}
H_out = H_{in} * scale_{factor}
W_out = W_{in} * scale_{factor}
......@@ -7134,42 +7134,44 @@ def resize_nearest(input,
align_corners=True):
"""
Resize input by performing nearest neighbor interpolation in both the
3rd dimention(in height direction) and the 4th dimention(in width
direction) based on given output shape which specified by actual_shape,
3rd dimension(in height direction) and the 4th dimension(in width
direction) based on given output shape which is specified by actual_shape,
out_shape and scale in priority order.
Example:
For scale:
if align_corners = True && out_size > 1 :
.. code-block:: text
For scale:
if align_corners = True && out_size > 1 :
scale_factor = (in_size-1.0)/(out_size-1.0)
else:
scale_factor = (in_size-1.0)/(out_size-1.0)
else:
scale_factor = float(in_size/out_size)
Nearest neighbor interpolation:
scale_factor = float(in_size/out_size)
Nearest neighbor interpolation:
if:
align_corners = False
if:
align_corners = False
input : (N,C,H_in,W_in)
output: (N,C,H_out,W_out) where:
input : (N,C,H_in,W_in)
output: (N,C,H_out,W_out) where:
H_out = \left \lfloor {H_{in} * scale_{}factor}} \right \rfloor
W_out = \left \lfloor {W_{in} * scale_{}factor}} \right \rfloor
H_out = floor(H_{in} * scale_{factor})
W_out = floor(W_{in} * scale_{factor})
else:
align_corners = True
else:
align_corners = True
input : (N,C,H_in,W_in)
output: (N,C,H_out,W_out) where:
input : (N,C,H_in,W_in)
output: (N,C,H_out,W_out) where:
H_out = round(H_{in} * scale_{factor})
W_out = round(W_{in} * scale_{factor})
H_out = round(H_{in} * scale_{factor})
W_out = round(W_{in} * scale_{factor})
For details of nearest neighbor interpolation, please refer to Wikipedia:
......
......@@ -13,15 +13,11 @@
# limitations under the License.
from __future__ import print_function
import multiprocessing
from . import core
from . import framework
from . import executor
from .. import compat as cpt
import warnings
from . import compiler
import sys
import six
import os
__all__ = ['ParallelExecutor']
......@@ -97,99 +93,27 @@ class ParallelExecutor(object):
'Please use CompiledProgram and Executor. CompiledProgram '
'is a central place for optimization and Executor is the '
'unified executor. Example can be found in compiler.py.\n')
# step1: get places, the places are used in run too.
self._places = []
if use_cuda:
gpus_env = os.getenv("FLAGS_selected_gpus")
if gpus_env:
gpus = [int(s) for s in gpus_env.split(",")]
else:
gpus = [
i for i in six.moves.range(core.get_cuda_device_count())
]
self._places = [core.CUDAPlace(i) for i in gpus]
else:
cpu_num = int(
os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
self._places = [core.CPUPlace() for _ in six.moves.range(cpu_num)]
assert self._places, "no place for execution"
# step2: init exec_strategy
if exec_strategy is None:
exec_strategy = ExecutionStrategy()
exec_strategy.use_cuda = use_cuda
if exec_strategy.num_threads == 0:
if use_cuda:
# Experiments on se-resnext shows that too many threads hurt
# performance. Worth tunning for other models in the future.
exec_strategy.num_threads = len(self._places) * 4
else:
cpu_num = int(
os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
exec_strategy.num_threads = cpu_num * 2
# step3: init build_strategy
if build_strategy is None:
build_strategy = BuildStrategy()
build_strategy.num_trainers = num_trainers
build_strategy.trainer_id = trainer_id
# FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode,
# num_trainers is 1, so the current fields of build_strategy doesn't tell if
# it's distributed model.
build_strategy.is_distribution = framework.is_pserver_mode(
main_program) or num_trainers > 1
# step4: get main_program, scope, local_scopes
main = main_program if main_program \
else framework.default_main_program()
# FIXME(dzhwinter): enable_inplace should be after memory_optimize
# if turn on python memory optimize, turn off the inplace_pass.
if build_strategy.memory_optimize is None:
build_strategy.memory_optimize = False if main._is_mem_optimized else True
if build_strategy.enable_inplace is None:
build_strategy.enable_inplace = False if main._is_mem_optimized else True
scope = scope if scope is not None else executor.global_scope()
if share_vars_from and not isinstance(share_vars_from,
ParallelExecutor):
raise TypeError("share_vars_from must be ParallelExecutor.")
local_scopes = share_vars_from.executor.local_scopes()\
if share_vars_from else []
# step5: check trainers_endpoints, it is used for distribution.
trainers_endpoints = main._trainers_endpoints
if num_trainers > 1 and trainers_endpoints:
assert num_trainers == len(
trainers_endpoints), "num_trainers == len(endpoints)"
build_strategy.trainers_endpoints = trainers_endpoints
# step6: get persistable_vars, places. persistable_vars
# need be broadcast to other local_scope.
persistable_vars = set([
cpt.to_text(v.name) for v in [
var for var in main.list_vars()
if var.persistable and var.type != core.VarDesc.VarType.RAW
]
])
def place_obj(place):
p = core.Place()
p.set_place(place)
return p
places = list(map(place_obj, self._places))
# step7: init ParallelExecutor
# ParallelExecutor API will be deprecated, don't support parallel graph.
self._graph = core.Graph(main.desc)
self._places = compiler.get_available_places(use_cuda)
self._scope = scope if scope is not None else executor.global_scope()
self.executor = core.ParallelExecutor(
places, persistable_vars,
cpt.to_text(loss_name) if loss_name else six.u(''), scope,
local_scopes, exec_strategy, build_strategy, self._graph)
main_program = main_program if main_program is not None \
else framework.default_main_program()
self.scope = scope
self._compiled_program = compiler.CompiledProgram(main_program)
self._compiled_program.with_data_parallel(
loss_name=loss_name,
build_strategy=build_strategy,
exec_strategy=exec_strategy,
share_vars_from=share_vars_from)
self._place = core.CUDAPlace(0) if use_cuda else core.CPUPlace()
self._executor = executor.Executor(self._place)
self._compiled_program._compile(place=self._place, scope=self._scope)
def run(self, fetch_list, feed=None, feed_dict=None, return_numpy=True):
"""
......@@ -256,56 +180,11 @@ class ParallelExecutor(object):
loss = pe.run(feed=feeder.feed(cur_batch),
fetch_list=[avg_cost.name]))
"""
if feed is None and feed_dict is not None:
feed = feed_dict
print(
"`feed_dict` is deprecated. Please use `feed=`",
file=sys.stderr)
if isinstance(feed, dict):
feed_tensor_dict = dict()
for feed_name in feed:
feed_tensor = feed[feed_name]
if not isinstance(feed_tensor, core.LoDTensor):
feed_tensor = core.LoDTensor()
# always set to CPU place, since the tensor need to be splitted
# it is fast in CPU
feed_tensor.set(feed[feed_name], core.CPUPlace())
feed_tensor_dict[feed_name] = feed_tensor
self.executor.feed_and_split_tensor_into_local_scopes(
feed_tensor_dict)
elif isinstance(feed, list) or isinstance(feed, tuple):
if len(feed) != len(self._places):
raise ValueError(
"Feed a list of tensor, the list should be the same size as places"
)
res = list()
for i, each in enumerate(feed):
if not isinstance(each, dict):
raise TypeError(
"Each element of feed list should be a dict")
res_dict = dict()
for feed_name in each:
tensor = each[feed_name]
if not isinstance(tensor, core.LoDTensor):
tmp = core.LoDTensor()
tmp.set(tensor, self._places[i])
tensor = tmp
res_dict[feed_name] = tensor
res.append(res_dict)
self.executor.feed_tensors_into_local_scopes(res)
fetch_var_name = 'fetch'
self.executor.run(fetch_list, fetch_var_name)
arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array()
if return_numpy:
return executor.as_numpy(arr)
return [arr[i] for i in range(len(arr))]
return self._executor.run(program=self._compiled_program,
scope=self._scope,
feed=feed,
fetch_list=fetch_list,
return_numpy=return_numpy)
@property
def device_count(self):
......
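After this rewrite ParallelExecutor is a thin wrapper: it builds a CompiledProgram with data parallelism and runs it through a plain Executor, reusing get_available_places from compiler.py. A rough sketch of the now-equivalent direct usage (use_cuda, loss, feeder and batch are placeholders, mirroring the example in the Executor docstring above):

import paddle.fluid as fluid
from paddle.fluid import compiler

place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())

# Roughly what ParallelExecutor.__init__ / run now do internally.
compiled_prog = compiler.CompiledProgram(
    fluid.default_main_program()).with_data_parallel(loss_name=loss.name)
loss_v, = exe.run(compiled_prog,
                  feed=feeder.feed(batch),
                  fetch_list=[loss.name])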
......@@ -15,44 +15,139 @@
from __future__ import print_function
import unittest
import numpy as np
from paddle.fluid.tests.unittests.test_conv2d_op import TestConv2dOp, TestWithPad, TestWithStride, TestWithGroup, TestWith1x1, TestWithInput1x1Filter1x1
import paddle.fluid.core as core
from paddle.fluid.tests.unittests.op_test import OpTest
from paddle.fluid.tests.unittests.test_conv2d_op import TestConv2dOp
class TestMKLDNN(TestConv2dOp):
def init_kernel_type(self):
self.use_mkldnn = True
self.data_format = "NCHW"
def conv2d_bias_naive(out, bias):
_, out_c, _, _ = out.shape
for l in range(out_c):
out[:, l, :, :] = out[:, l, :, :] + bias[l]
return out
class TestMKLDNNWithPad(TestWithPad):
def init_kernel_type(self):
self.use_mkldnn = True
self.data_format = "NCHW"
def conv2d_residual_naive(out, residual):
assert out.shape == residual.shape
out = np.add(out, residual)
return out
class TestMKLDNNWithStride(TestWithStride):
def init_kernel_type(self):
self.use_mkldnn = True
self.data_format = "NCHW"
class TestConv2dMKLDNNOp(TestConv2dOp):
def init_group(self):
self.groups = 1
class TestMKLDNNWithGroup(TestWithGroup):
def init_kernel_type(self):
self.use_mkldnn = True
self.data_format = "NCHW"
self.use_mkldnn = True
self._cpu_only = True
def init_test_case(self):
self.pad = [0, 0]
self.stride = [1, 1]
self.input_size = [2, 3, 5, 5] # NCHW
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] // self.groups
self.filter_size = [6, f_c, 3, 3]
class TestMKLDNNWith1x1(TestWith1x1):
def init_kernel_type(self):
self.use_mkldnn = True
self.data_format = "NCHW"
def setUp(self):
self.fuse_bias = False
self.bias_size = None
self.fuse_relu = False
self.fuse_residual_connection = False
self.input_residual_size = None
TestConv2dOp.setUp(self)
output = self.outputs['Output']
class TestMKLDNNWithInput1x1Filter1x1(TestWithInput1x1Filter1x1):
def init_kernel_type(self):
self.use_mkldnn = True
self.data_format = "NCHW"
#mkldnn only support either conv-sum-relu, or conv-relu.
if self.fuse_bias and self.bias_size is not None:
bias = np.random.random(self.bias_size).astype(self.dtype)
output = conv2d_bias_naive(output, bias)
output = output.astype(self.dtype)
self.attrs['fuse_bias'] = self.fuse_bias
self.inputs['Bias'] = OpTest.np_dtype_to_fluid_dtype(bias)
if self.fuse_residual_connection and self.input_residual_size is not None:
input_residual = np.random.random(self.input_residual_size).astype(
self.dtype)
output = conv2d_residual_naive(output, input_residual)
self.attrs[
'fuse_residual_connection'] = self.fuse_residual_connection
self.inputs['ResidualData'] = OpTest.np_dtype_to_fluid_dtype(
input_residual)
if self.fuse_relu:
output = np.maximum(output, 0).astype(self.dsttype)
output = output.astype(self.dtype)
self.attrs['fuse_bias'] = self.fuse_bias
self.attrs['fuse_relu'] = self.fuse_relu
self.attrs['fuse_residual_connection'] = self.fuse_residual_connection
self.outputs['Output'] = output
class TestWithFuse(TestConv2dMKLDNNOp):
def init_test_case(self):
TestConv2dMKLDNNOp.init_test_case(self)
self.pad = [1, 1]
self.fuse_bias = True
self.bias_size = [6]
self.fuse_residual_connection = True
self.input_residual_size = [2, 6, 5, 5]
def test_check_grad(self):
pass
def test_check_grad_no_filter(self):
pass
def test_check_grad_no_input(self):
pass
class TestWithPadWithBias(TestConv2dMKLDNNOp):
def init_test_case(self):
TestConv2dMKLDNNOp.init_test_case(self)
self.pad = [1, 1]
self.input_size = [2, 3, 6, 6]
class TestWithStride(TestConv2dMKLDNNOp):
def init_test_case(self):
TestConv2dMKLDNNOp.init_test_case(self)
self.pad = [1, 1]
self.stride = [2, 2]
self.input_size = [2, 3, 6, 6]
class TestWithGroup(TestConv2dMKLDNNOp):
def init_group(self):
self.groups = 3
class TestWith1x1(TestConv2dMKLDNNOp):
def init_test_case(self):
TestConv2dMKLDNNOp.init_test_case(self)
self.filter_size = [6, 3, 1, 1]
class TestWithInput1x1Filter1x1(TestConv2dMKLDNNOp):
def init_test_case(self):
TestConv2dMKLDNNOp.init_test_case(self)
self.input_size = [2, 3, 1, 1] # NCHW
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] // self.groups
self.filter_size = [6, f_c, 1, 1]
def init_group(self):
self.groups = 3
if __name__ == '__main__':
......
......@@ -18,6 +18,24 @@ import unittest
from paddle.fluid.tests.unittests.test_pool2d_op import TestPool2D_Op, TestCase1, TestCase2, TestCase3, TestCase4, TestCase5
def create_test_mkldnn_use_ceil_class(parent):
class TestMKLDNNPool2DUseCeilCase(parent):
def init_kernel_type(self):
self.use_mkldnn = True
def init_ceil_mode(self):
self.ceil_mode = True
cls_name = "{0}_{1}".format(parent.__name__, "MKLDNNCeilModeCast")
TestMKLDNNPool2DUseCeilCase.__name__ = cls_name
globals()[cls_name] = TestMKLDNNPool2DUseCeilCase
create_test_mkldnn_use_ceil_class(TestPool2D_Op)
create_test_mkldnn_use_ceil_class(TestCase1)
create_test_mkldnn_use_ceil_class(TestCase2)
def create_test_mkldnn_class(parent):
class TestMKLDNNCase(parent):
def init_kernel_type(self):
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import ast
import hashlib
import importlib
import paddle.fluid
files = [
"paddle.fluid", "paddle.fluid.average", "paddle.fluid.backward",
"paddle.fluid.clip", "paddle.fluid.data_feeder", "paddle.fluid.executor",
"paddle.fluid.initializer", "paddle.fluid.io", "paddle.fluid.layers",
"paddle.fluid.metrics", "paddle.fluid.nets", "paddle.fluid.optimizer",
"paddle.fluid.profiler", "paddle.fluid.recordio_writer",
"paddle.fluid.regularizer", "paddle.fluid.transpiler"
]
def md5(doc):
hash = hashlib.md5()
hash.update(str(doc))
return hash.hexdigest()
def get_module():
for fi in files:
fi_lib = importlib.import_module(fi)
doc_function = getattr(fi_lib, "__all__")
for api in doc_function:
api_name = fi + "." + api
try:
doc_module = getattr(eval(api_name), "__doc__")
except:
pass
doc_md5_code = md5(doc_module)
doc_dict[api_name] = doc_md5_code
def doc_md5_dict(doc_md5_path):
with open(doc_md5_path, "rb") as f:
doc_md5 = f.read()
doc_md5_dict = ast.literal_eval(doc_md5)
return doc_md5_dict
def check_doc_md5():
for k, v in doc_dict.items():
try:
if doc_ci_dict[k] != v:
return doc_dict
except:
return doc_dict
return True
if __name__ == "__main__":
doc_dict = {}
doc_ci_dict = {}
doc_md5_file = "/root/.cache/doc_md5.txt"
if not os.path.exists(doc_md5_file):
os.mknod(doc_md5_file)
else:
doc_ci_dict = doc_md5_dict(doc_md5_file)
get_module()
if not os.path.getsize(doc_md5_file):
with open(doc_md5_file, 'w') as f:
f.write(str(doc_dict))
check_dic = True
print(check_dic)
else:
check_dic = check_doc_md5()
print(check_dic)
......@@ -26,4 +26,10 @@ for each_diff in result:
print(each_diff)
if error:
print(
'''If you modify/add/delete the API files, including code and comment, please follow these steps in order to pass the CI:
1. cd ${paddle_path}, compile paddle;
2. pip install build/python/dist/(build whl package);
3. run "python tools/print_signatures.py paddle.fluid, paddle.reader > paddle/fluid/API.spec"'''
)
sys.exit(1)
......@@ -24,12 +24,19 @@ import inspect
import collections
import sys
import pydoc
import hashlib
member_dict = collections.OrderedDict()
experimental_namespace = {"paddle.fluid.imperative"}
def md5(doc):
hash = hashlib.md5()
hash.update(str(doc).encode('utf-8'))
return hash.hexdigest()
def visit_member(parent_name, member):
cur_name = ".".join([parent_name, member.__name__])
if inspect.isclass(member):
......@@ -39,7 +46,10 @@ def visit_member(parent_name, member):
visit_member(cur_name, value)
elif callable(member):
try:
member_dict[cur_name] = inspect.getargspec(member)
doc = ('document', md5(member.__doc__))
args = inspect.getargspec(member)
all = (args, doc)
member_dict[cur_name] = all
except TypeError: # special for PyBind method
member_dict[cur_name] = " ".join([
line.strip() for line in pydoc.render_doc(member).split('\n')
......
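With the doc digest stored next to the argspec, each line of the generated API spec becomes a (signature, ('document', md5)) pair, which is what the adjusted sed pattern and the stricter two-approver rule in paddle_build.sh key on. A small stand-alone sketch of the entry shape (the sample function, name, and hash are made up for illustration):

import hashlib
import inspect

def md5(doc):
    h = hashlib.md5()
    h.update(str(doc).encode('utf-8'))
    return h.hexdigest()

def dummy_api(x, axis=0):
    """Stand-in docstring for illustration."""
    return x

# Shape of one member_dict entry / new.spec line after this change.
entry = (inspect.getargspec(dummy_api), ('document', md5(dummy_api.__doc__)))
print('paddle.example.dummy_api', entry)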
......@@ -131,7 +131,7 @@ class Timeline(object):
if (k, event.device_id, "CPU") not in self._devices:
pid = self._allocate_pid()
self._devices[(k, event.device_id, "CPU")] = pid
# -1 device id represents CUDA api call
# -1 device id represents CUDA API(RunTime) call.(e.g. cudaLaunch, cudaMemcpy)
if event.device_id == -1:
self._chrome_trace.emit_pid("%s:cuda_api" % k, pid)
else:
......