提交 6cc78705 编写于 作者: D dzhwinter 提交者: sneaxiy

fix concat synchronization bug

上级 d292ad85
...@@ -177,6 +177,9 @@ class ConcatFunctor<platform::CUDADeviceContext, T> { ...@@ -177,6 +177,9 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {
dev_ins_data, dev_ins_col_data, static_cast<int>(inputs_col.size()), dev_ins_data, dev_ins_col_data, static_cast<int>(inputs_col.size()),
out_row, out_col, output->data<T>()); out_row, out_col, output->data<T>());
} }
// Wait() must be called because `inputs_data` may be destroyed before the
// kernel finishes.
context.Wait();
} }
}; };
...@@ -252,6 +255,9 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> { ...@@ -252,6 +255,9 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> {
input.data<T>(), in_row, in_col, dev_outs_col_data, input.data<T>(), in_row, in_col, dev_outs_col_data,
static_cast<int>(outputs_cols.size()), dev_out_gpu_data); static_cast<int>(outputs_cols.size()), dev_out_gpu_data);
} }
// Wait() must be called because `outputs_data` may be destroyed before the
// kernel finishes.
context.Wait();
} }
}; };
......
if (NOT WIN32)
proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto) proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto)
py_proto_compile(profiler_py_proto SRCS profiler.proto) py_proto_compile(profiler_py_proto SRCS profiler.proto)
...@@ -10,6 +11,7 @@ add_custom_command(TARGET profiler_py_proto POST_BUILD ...@@ -10,6 +11,7 @@ add_custom_command(TARGET profiler_py_proto POST_BUILD
COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler
COMMENT "Copy generated python proto into directory paddle/fluid/proto/profiler." COMMENT "Copy generated python proto into directory paddle/fluid/proto/profiler."
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
endif(NOT WIN32)
if(WITH_GPU) if(WITH_GPU)
nv_library(enforce SRCS enforce.cc) nv_library(enforce SRCS enforce.cc)
...@@ -58,9 +60,12 @@ cc_test(init_test SRCS init_test.cc DEPS device_context) ...@@ -58,9 +60,12 @@ cc_test(init_test SRCS init_test.cc DEPS device_context)
nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda) nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context) nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context)
if (NOT WIN32)
cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS}) cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS})
cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer) cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer)
cc_test(profiler_test SRCS profiler_test.cc DEPS profiler) cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)
endif(NOT WIN32)
nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor) nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor)
cc_test(float16_test SRCS float16_test.cc DEPS lod_tensor) cc_test(float16_test SRCS float16_test.cc DEPS lod_tensor)
......
...@@ -22,9 +22,13 @@ limitations under the License. */ ...@@ -22,9 +22,13 @@ limitations under the License. */
#ifdef __APPLE__ #ifdef __APPLE__
#include <sys/sysctl.h> #include <sys/sysctl.h>
#include <sys/types.h> #include <sys/types.h>
#elif defined(_WIN32)
#define NOMINMAX // msvc max/min macro conflict with std::min/max
#include <windows.h>
#else #else
#include <unistd.h> #include <unistd.h>
#endif #endif // _WIN32
#include <algorithm> #include <algorithm>
#include "gflags/gflags.h" #include "gflags/gflags.h"
...@@ -32,16 +36,20 @@ limitations under the License. */ ...@@ -32,16 +36,20 @@ limitations under the License. */
DEFINE_double(fraction_of_cpu_memory_to_use, 1, DEFINE_double(fraction_of_cpu_memory_to_use, 1,
"Default use 100% of CPU memory for PaddlePaddle," "Default use 100% of CPU memory for PaddlePaddle,"
"reserve the rest for page tables, etc"); "reserve the rest for page tables, etc");
#if !defined(_WIN32)
DEFINE_uint64(initial_cpu_memory_in_mb, DEFINE_uint64(initial_cpu_memory_in_mb,
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
/* Aligned with mozga-intel, MKLDNN need at least 5000 MB /* Aligned with mozga-intel, MKLDNN need at least 5000 MB
* to obtain the best performance*/ * to obtain the best performance*/
5000, 5000ul,
#else #else
500, 500ul,
#endif #endif
"Initial CPU memory for PaddlePaddle, in MD unit."); "Initial CPU memory for PaddlePaddle, in MD unit.");
#else
DEFINE_uint64(initial_cpu_memory_in_mb, 500ul,
"Initial CPU memory for PaddlePaddle, in MD unit.");
#endif // !defined(_WIN32)
DEFINE_double( DEFINE_double(
fraction_of_cuda_pinned_memory_to_use, 0.5, fraction_of_cuda_pinned_memory_to_use, 0.5,
...@@ -60,6 +68,11 @@ inline size_t CpuTotalPhysicalMemory() { ...@@ -60,6 +68,11 @@ inline size_t CpuTotalPhysicalMemory() {
size_t len = sizeof(size); size_t len = sizeof(size);
if (sysctl(mib, 2, &size, &len, NULL, 0) == 0) return (size_t)size; if (sysctl(mib, 2, &size, &len, NULL, 0) == 0) return (size_t)size;
return 0L; return 0L;
#elif defined(_WIN32)
MEMORYSTATUSEX sMeminfo;
sMeminfo.dwLength = sizeof(sMeminfo);
GlobalMemoryStatusEx(&sMeminfo);
return sMeminfo.ullTotalPhys;
#else #else
int64_t pages = sysconf(_SC_PHYS_PAGES); int64_t pages = sysconf(_SC_PHYS_PAGES);
int64_t page_size = sysconf(_SC_PAGE_SIZE); int64_t page_size = sysconf(_SC_PAGE_SIZE);
......
...@@ -13,7 +13,12 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,12 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#if !defined(_WIN32)
#include <sys/time.h> #include <sys/time.h>
#else
#include <windows.h>
#endif // !_WIN32
#include <time.h> #include <time.h>
#include <chrono> // NOLINT #include <chrono> // NOLINT
#include <string> #include <string>
...@@ -27,12 +32,15 @@ namespace platform { ...@@ -27,12 +32,15 @@ namespace platform {
/////////////////////// ///////////////////////
// WARN: Under Development. Don't depend on it yet. // WARN: Under Development. Don't depend on it yet.
////////////////////// //////////////////////
#if !defined(_WIN32)
inline uint64_t PosixInNsec() { inline uint64_t PosixInNsec() {
struct timeval tv; struct timeval tv;
gettimeofday(&tv, nullptr); gettimeofday(&tv, nullptr);
return 1000 * (static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec); return 1000 * (static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec);
} }
#else
// Windows has no gettimeofday(), so the wall-clock time is taken from
// std::chrono::system_clock instead of returning a constant 0 (which made
// every timestamp identical).
// NOTE(review): system_clock counts from the Unix epoch on mainstream
// implementations (guaranteed only from C++20) -- confirm for the toolchain.
//
// Returns the time since the clock's epoch, in nanoseconds.
inline uint64_t PosixInNsec() {
  const auto since_epoch = std::chrono::system_clock::now().time_since_epoch();
  return static_cast<uint64_t>(
      std::chrono::duration_cast<std::chrono::nanoseconds>(since_epoch)
          .count());
}
#endif // !_WIN32
// DeviceTracer performs the following tasks: // DeviceTracer performs the following tasks:
// 1. Register cuda callbacks for various events: kernel, memcpy, etc. // 1. Register cuda callbacks for various events: kernel, memcpy, etc.
......
...@@ -16,7 +16,9 @@ if (CUPTI_FOUND) ...@@ -16,7 +16,9 @@ if (CUPTI_FOUND)
list(APPEND CUDA_SRCS cupti.cc) list(APPEND CUDA_SRCS cupti.cc)
endif(CUPTI_FOUND) endif(CUPTI_FOUND)
nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader) nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader)
if (NOT WIN32)
cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc) cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc)
endif(NOT WIN32)
if (WITH_MKLML) if (WITH_MKLML)
cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml) cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml)
endif() endif()
......
...@@ -13,8 +13,6 @@ See the License for the specific language governing permissions and ...@@ -13,8 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include <dlfcn.h>
#include <memory> #include <memory>
#include <mutex> // NOLINT #include <mutex> // NOLINT
#include <string> #include <string>
...@@ -23,6 +21,7 @@ limitations under the License. */ ...@@ -23,6 +21,7 @@ limitations under the License. */
#include "glog/logging.h" #include "glog/logging.h"
#include "paddle/fluid/platform/dynload/cupti_lib_path.h" #include "paddle/fluid/platform/dynload/cupti_lib_path.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/port.h"
DEFINE_string(cudnn_dir, "", DEFINE_string(cudnn_dir, "",
"Specify path for loading libcudnn.so. For instance, " "Specify path for loading libcudnn.so. For instance, "
......
...@@ -18,6 +18,11 @@ limitations under the License. */ ...@@ -18,6 +18,11 @@ limitations under the License. */
#include <cxxabi.h> // for __cxa_demangle #include <cxxabi.h> // for __cxa_demangle
#endif // __GNUC__ #endif // __GNUC__
#if defined(_WIN32)
#define NOMINMAX // msvc max/min macro conflict with std::min/max
#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h
#endif
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include <cublas_v2.h> #include <cublas_v2.h>
#include <cudnn.h> #include <cudnn.h>
...@@ -117,7 +122,12 @@ struct EOFException : public std::exception { ...@@ -117,7 +122,12 @@ struct EOFException : public std::exception {
// always forces branch prediction of true. // always forces branch prediction of true.
// This generates faster binary code. __builtin_expect is since C++11. // This generates faster binary code. __builtin_expect is since C++11.
// For more details, please check https://stackoverflow.com/a/43870188/724872. // For more details, please check https://stackoverflow.com/a/43870188/724872.
#if !defined(_WIN32)
#define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0) #define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
#else
// MSVC has no equivalent of __builtin_expect, so no branch-prediction hint is
// emitted here. The macro must still evaluate to the truth value of
// `condition`: callers write `if (UNLIKELY(cond))`, and the previous
// `(condition == 0)` form inverted every such test.
#define UNLIKELY(condition) (static_cast<bool>(condition))
#endif
template <typename... Args> template <typename... Args>
inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error( inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
...@@ -230,6 +240,7 @@ inline void throw_on_error(T e) { ...@@ -230,6 +240,7 @@ inline void throw_on_error(T e) {
throw_on_error(e, ""); throw_on_error(e, "");
} }
#if !defined(_WIN32)
#define PADDLE_THROW(...) \ #define PADDLE_THROW(...) \
do { \ do { \
throw ::paddle::platform::EnforceNotMet( \ throw ::paddle::platform::EnforceNotMet( \
...@@ -248,15 +259,28 @@ inline void throw_on_error(T e) { ...@@ -248,15 +259,28 @@ inline void throw_on_error(T e) {
__FILE__, __LINE__); \ __FILE__, __LINE__); \
} \ } \
} while (false) } while (false)
#else
#define PADDLE_ENFORCE(...) ::paddle::platform::throw_on_error(__VA_ARGS__);
#endif
#define PADDLE_THROW_EOF() \ #define PADDLE_THROW_EOF() \
do { \ do { \
throw ::paddle::platform::EOFException("There is no next data.", __FILE__, \ throw ::paddle::platform::EOFException("There is no next data.", __FILE__, \
__LINE__); \ __LINE__); \
} while (false) } while (false)
#else
#define PADDLE_ENFORCE(...) ::paddle::platform::throw_on_error(__VA_ARGS__)
#endif // REPLACE_ENFORCE_GLOG
#else // !_WIN32
// Enforce is disabled on Windows; the original note attributes this to an
// error with the variadic enforce macros on that platform.
//
// PADDLE_THROW(x): ignores its argument and throws a std::runtime_error.
// The exception object itself is thrown (not a std::exception_ptr wrapping
// it), so `catch (const std::exception&)` handlers can catch it.
#define PADDLE_THROW(x)                                        \
  do {                                                         \
    throw std::runtime_error("Windows disable the enforce."); \
  } while (false)
// PADDLE_ENFORCE(x, ...): expands to `x` only. The expression is still
// evaluated, but nothing is checked and the message arguments are dropped.
#define PADDLE_ENFORCE(x, ...) x
#endif // !_WIN32
/* /*
* Some enforce helpers here, usage: * Some enforce helpers here, usage:
* int a = 1; * int a = 1;
......
...@@ -69,6 +69,7 @@ void PushEvent(const std::string& name, const DeviceContext* dev_ctx); ...@@ -69,6 +69,7 @@ void PushEvent(const std::string& name, const DeviceContext* dev_ctx);
void PopEvent(const std::string& name, const DeviceContext* dev_ctx); void PopEvent(const std::string& name, const DeviceContext* dev_ctx);
#if !defined(_WIN32)
struct RecordEvent { struct RecordEvent {
RecordEvent(const std::string& name, const DeviceContext* dev_ctx); RecordEvent(const std::string& name, const DeviceContext* dev_ctx);
...@@ -94,6 +95,15 @@ struct RecordBlock { ...@@ -94,6 +95,15 @@ struct RecordBlock {
std::string name_; std::string name_;
uint64_t start_ns_; uint64_t start_ns_;
}; };
#else
// The profiler is not supported on Windows yet. This stand-in keeps call
// sites compiling; constructing it does nothing.
struct RecordEvent {
  RecordEvent(const std::string& name, const DeviceContext* dev_ctx) {
    // Intentionally empty.
  }
};
// No-op RecordBlock used in the Windows branch: the constructor accepts a
// block id and does nothing with it.
struct RecordBlock {
  explicit RecordBlock(int block_id) {
    // Intentionally empty.
  }
};
#endif
// Return the event list of all threads. Assumed the returned value calls // Return the event list of all threads. Assumed the returned value calls
// event_lists, event_lists[i][j] represents the j-th Event of i-th thread. // event_lists, event_lists[i][j] represents the j-th Event of i-th thread.
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册