提交 6cc78705 编写于 作者: D dzhwinter 提交者: sneaxiy

fix concat synchronization bug

上级 d292ad85
......@@ -177,6 +177,9 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {
dev_ins_data, dev_ins_col_data, static_cast<int>(inputs_col.size()),
out_row, out_col, output->data<T>());
}
// Wait() must be called because `inputs_data` may be destructed before
// kernel ends
context.Wait();
}
};
......@@ -252,6 +255,9 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> {
input.data<T>(), in_row, in_col, dev_outs_col_data,
static_cast<int>(outputs_cols.size()), dev_out_gpu_data);
}
// Wait() must be called because `outputs_data` may be destructed before
// kernel ends
context.Wait();
}
};
......
if (NOT WIN32)
proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto)
py_proto_compile(profiler_py_proto SRCS profiler.proto)
......@@ -10,6 +11,7 @@ add_custom_command(TARGET profiler_py_proto POST_BUILD
COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler
COMMENT "Copy generated python proto into directory paddle/fluid/proto/profiler."
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
endif(NOT WIN32)
if(WITH_GPU)
nv_library(enforce SRCS enforce.cc)
......@@ -58,9 +60,12 @@ cc_test(init_test SRCS init_test.cc DEPS device_context)
nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context)
if (NOT WIN32)
cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS})
cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer)
cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)
endif(NOT WIN32)
nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor)
cc_test(float16_test SRCS float16_test.cc DEPS lod_tensor)
......
......@@ -22,9 +22,13 @@ limitations under the License. */
#ifdef __APPLE__
#include <sys/sysctl.h>
#include <sys/types.h>
#elif defined(_WIN32)
#define NOMINMAX // msvc max/min macro conflict with std::min/max
#include <windows.h>
#else
#include <unistd.h>
#endif
#endif // _WIN32
#include <algorithm>
#include "gflags/gflags.h"
......@@ -32,16 +36,20 @@ limitations under the License. */
DEFINE_double(fraction_of_cpu_memory_to_use, 1,
"Default use 100% of CPU memory for PaddlePaddle,"
"reserve the rest for page tables, etc");
#if !defined(_WIN32)
DEFINE_uint64(initial_cpu_memory_in_mb,
#ifdef PADDLE_WITH_MKLDNN
/* Aligned with mozga-intel, MKLDNN need at least 5000 MB
* to obtain the best performance*/
5000,
5000ul,
#else
500,
500ul,
#endif
"Initial CPU memory for PaddlePaddle, in MD unit.");
#else
DEFINE_uint64(initial_cpu_memory_in_mb, 500ul,
"Initial CPU memory for PaddlePaddle, in MD unit.");
#endif // !defined(_WIN32)
DEFINE_double(
fraction_of_cuda_pinned_memory_to_use, 0.5,
......@@ -60,6 +68,11 @@ inline size_t CpuTotalPhysicalMemory() {
size_t len = sizeof(size);
if (sysctl(mib, 2, &size, &len, NULL, 0) == 0) return (size_t)size;
return 0L;
#elif defined(_WIN32)
MEMORYSTATUSEX sMeminfo;
sMeminfo.dwLength = sizeof(sMeminfo);
GlobalMemoryStatusEx(&sMeminfo);
return sMeminfo.ullTotalPhys;
#else
int64_t pages = sysconf(_SC_PHYS_PAGES);
int64_t page_size = sysconf(_SC_PAGE_SIZE);
......
......@@ -13,7 +13,12 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#if !defined(_WIN32)
#include <sys/time.h>
#else
#include <windows.h>
#endif // !_WIN32
#include <time.h>
#include <chrono> // NOLINT
#include <string>
......@@ -27,12 +32,15 @@ namespace platform {
///////////////////////
// WARN: Under Development. Don't depend on it yet.
//////////////////////
#if !defined(_WIN32)
inline uint64_t PosixInNsec() {
struct timeval tv;
gettimeofday(&tv, nullptr);
return 1000 * (static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec);
}
#else
inline uint64_t PosixInNsec() { return static_cast<uint64_t>(0); }
#endif // !_WIN32
// DeviceTracer performs the following tasks:
// 1. Register cuda callbacks for various events: kernel, memcpy, etc.
......
......@@ -16,7 +16,9 @@ if (CUPTI_FOUND)
list(APPEND CUDA_SRCS cupti.cc)
endif(CUPTI_FOUND)
nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader)
if (NOT WIN32)
cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc)
endif(NOT WIN32)
if (WITH_MKLML)
cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml)
endif()
......
......@@ -13,8 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include <dlfcn.h>
#include <memory>
#include <mutex> // NOLINT
#include <string>
......@@ -23,6 +21,7 @@ limitations under the License. */
#include "glog/logging.h"
#include "paddle/fluid/platform/dynload/cupti_lib_path.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/port.h"
DEFINE_string(cudnn_dir, "",
"Specify path for loading libcudnn.so. For instance, "
......
......@@ -18,6 +18,11 @@ limitations under the License. */
#include <cxxabi.h> // for __cxa_demangle
#endif // __GNUC__
#if defined(_WIN32)
#define NOMINMAX // msvc max/min macro conflict with std::min/max
#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h
#endif
#ifdef PADDLE_WITH_CUDA
#include <cublas_v2.h>
#include <cudnn.h>
......@@ -117,7 +122,12 @@ struct EOFException : public std::exception {
// always forces branch prediction of true.
// This generates faster binary code. __builtin_expect is since C++11.
// For more details, please check https://stackoverflow.com/a/43870188/724872.
#if !defined(_WIN32)
#define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
#else
// there is no equivalent intrinsics in msvc.
#define UNLIKELY(condition) (condition == 0)
#endif
template <typename... Args>
inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
......@@ -230,6 +240,7 @@ inline void throw_on_error(T e) {
throw_on_error(e, "");
}
#if !defined(_WIN32)
#define PADDLE_THROW(...) \
do { \
throw ::paddle::platform::EnforceNotMet( \
......@@ -248,15 +259,28 @@ inline void throw_on_error(T e) {
__FILE__, __LINE__); \
} \
} while (false)
#else
#define PADDLE_ENFORCE(...) ::paddle::platform::throw_on_error(__VA_ARGS__);
#endif
#define PADDLE_THROW_EOF() \
do { \
throw ::paddle::platform::EOFException("There is no next data.", __FILE__, \
__LINE__); \
} while (false)
#else
#define PADDLE_ENFORCE(...) ::paddle::platform::throw_on_error(__VA_ARGS__)
#endif // REPLACE_ENFORCE_GLOG
#else // !_WIN32
// disable enforce, caused by the varardic macro exception error
#define PADDLE_THROW(x) \
do { \
throw std::make_exception_ptr( \
std::runtime_error("Windows disable the enforce.")); \
} while (false)
#define PADDLE_ENFORCE(x, ...) x
#endif // !_WIN32
/*
* Some enforce helpers here, usage:
* int a = 1;
......
......@@ -69,6 +69,7 @@ void PushEvent(const std::string& name, const DeviceContext* dev_ctx);
void PopEvent(const std::string& name, const DeviceContext* dev_ctx);
#if !defined(_WIN32)
struct RecordEvent {
RecordEvent(const std::string& name, const DeviceContext* dev_ctx);
......@@ -94,6 +95,15 @@ struct RecordBlock {
std::string name_;
uint64_t start_ns_;
};
#else
// windows do not support profiler temporarily.
struct RecordEvent {
RecordEvent(const std::string& name, const DeviceContext* dev_ctx) {}
};
struct RecordBlock {
explicit RecordBlock(int block_id) {}
};
#endif
// Return the event list of all threads. Assumed the returned value calls
// event_lists, event_lists[i][j] represents the j-th Event of i-th thread.
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册