diff --git a/CMakeLists.txt b/CMakeLists.txt index b309ff37e52b4fd28b14925bdd7e3740e1e2fa47..5df83499d5dde29b205ee17fba81a63c9a643235 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,8 +16,6 @@ cmake_minimum_required(VERSION 3.0) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake") set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) -SET(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") -SET(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") include(system) @@ -201,6 +199,10 @@ if(WITH_GOLANG) endif(WITH_GOLANG) set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build") + +SET(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") +SET(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") + add_subdirectory(paddle) if(WITH_PYTHON) add_subdirectory(python) diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index 5f826aeb8371b944bdb8db8c842f5555e41c9549..25a0db27688a9984ad401cbd462fe70048b204da 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -59,6 +59,7 @@ cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry cc_library(selected_rows SRCS selected_rows.cc DEPS tensor) cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows) +cc_test(threadpool_test SRCS threadpool_test.cc) cc_library(init SRCS init.cc DEPS gflags device_context place stringpiece) cc_test(init_test SRCS init_test.cc DEPS init) diff --git a/paddle/framework/init.cc b/paddle/framework/init.cc index 4deb4fa903dec04e9b76c5a620f1eb76c9f1db07..3ff2da344627ed3ada3955ec5ee2c886402554f4 100644 --- a/paddle/framework/init.cc +++ b/paddle/framework/init.cc @@ -54,7 +54,7 @@ bool InitDevices(const std::vector &devices) { #ifdef PADDLE_WITH_CUDA auto pos = string::RFind(p, ':', string::Piece::npos); auto number = device.substr(pos + 1); - places.emplace_back(platform::GPUPlace(std::stoi(number))); + places.emplace_back(platform::CUDAPlace(std::stoi(number))); #else LOG(WARNING) << "'GPU' is not supported, Please re-compile with WITH_GPU option"; diff --git a/paddle/framework/lod_tensor.cc b/paddle/framework/lod_tensor.cc index 465f8c62b5fe2efd549f68bb3a9823d299ba5393..d766d3c4163b6b7c6fdc772acb4b7e7b315f8783 100644 --- a/paddle/framework/lod_tensor.cc +++ b/paddle/framework/lod_tensor.cc @@ -224,7 +224,7 @@ void SerializeToStream(std::ostream &os, const LoDTensor &tensor, while (size != 0) { size_t size_to_write = std::min(kBufSize, static_cast(size)); memory::Copy(cpu, buf.get(), - boost::get(tensor.place()), + boost::get(tensor.place()), reinterpret_cast(data), size_to_write, gpu_dev_ctx.stream()); gpu_dev_ctx.Wait(); diff --git a/paddle/framework/lod_tensor_test.cu b/paddle/framework/lod_tensor_test.cu index 5b90fbfca7f6bec4f2c862d0ff18dfd7cf39e181..e8508ad2658ae850e4c98aa798b5db6d007e67d0 100644 --- a/paddle/framework/lod_tensor_test.cu +++ b/paddle/framework/lod_tensor_test.cu @@ -27,7 +27,7 @@ __global__ void test(size_t* a, int size) { TEST(LoDTensor, LoDInGPU) { paddle::framework::LoDTensor lod_tensor; - paddle::platform::GPUPlace place(0); + paddle::platform::CUDAPlace place(0); paddle::framework::LoD src_lod; src_lod.push_back(std::vector{0, 2, 4, 6, 8, 10, 12, 14}); diff --git a/paddle/framework/op_kernel_type_test.cc b/paddle/framework/op_kernel_type_test.cc index 899676b5c1a3799427d6ef03bae47284550c781b..8753d7cc378662ce116e447dc6a340a07e5dd2ca 100644 --- a/paddle/framework/op_kernel_type_test.cc +++ 
b/paddle/framework/op_kernel_type_test.cc @@ -37,13 +37,13 @@ TEST(OpKernelType, Hash) { using OpKernelType = paddle::framework::OpKernelType; using DataType = paddle::framework::proto::DataType; using CPUPlace = paddle::platform::CPUPlace; - using GPUPlace = paddle::platform::GPUPlace; + using CUDAPlace = paddle::platform::CUDAPlace; using DataLayout = paddle::framework::DataLayout; using LibraryType = paddle::framework::LibraryType; OpKernelType op_kernel_type_1(DataType::FP32, CPUPlace(), DataLayout::kNCHW, LibraryType::kCUDNN); - OpKernelType op_kernel_type_2(DataType::FP32, GPUPlace(0), DataLayout::kNCHW, + OpKernelType op_kernel_type_2(DataType::FP32, CUDAPlace(0), DataLayout::kNCHW, LibraryType::kCUDNN); OpKernelType::Hash hasher; diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h index 244c1174655f61cad7176a211b07863dbfbba9aa..9bb2a3b5c2931d03152cc3262c0ad8da17b8aacb 100644 --- a/paddle/framework/op_registry.h +++ b/paddle/framework/op_registry.h @@ -188,7 +188,7 @@ class OpKernelRegistrar : public Registrar { } #define REGISTER_OP_CUDA_KERNEL(op_type, ...) \ - REGISTER_OP_KERNEL(op_type, CUDA, ::paddle::platform::GPUPlace, __VA_ARGS__) + REGISTER_OP_KERNEL(op_type, CUDA, ::paddle::platform::CUDAPlace, __VA_ARGS__) #define REGISTER_OP_CPU_KERNEL(op_type, ...) \ REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__) diff --git a/paddle/framework/tensor.md b/paddle/framework/tensor.md index 7a80816d8e4ffa3a9462f3d9b87eff0f048466aa..0a27ac9bb6b03649d42e12100fda9e80a56e7f56 100644 --- a/paddle/framework/tensor.md +++ b/paddle/framework/tensor.md @@ -71,7 +71,7 @@ private: ``` ```c++ -typedef boost::variant Place; +typedef boost::variant Place; typedef boost::variant, Dim<2>, Dim<3>, Dim<4>, Dim<5>, Dim<6>, Dim<7>, Dim<8>, Dim<9>> DDimVar; typedef boost::variant< diff --git a/paddle/framework/tensor_impl.h b/paddle/framework/tensor_impl.h index aba1f9f09329f890ef190f8820b958c56f017e89..3d93b7808bc96143b5261e1ec41ad98b15c74975 100644 --- a/paddle/framework/tensor_impl.h +++ b/paddle/framework/tensor_impl.h @@ -125,11 +125,11 @@ inline void* Tensor::mutable_data(platform::Place place, std::type_index type) { boost::get(place), size, type)); } else if (platform::is_gpu_place(place)) { #ifndef PADDLE_WITH_CUDA - PADDLE_THROW("'GPUPlace' is not supported in CPU only device."); + PADDLE_THROW("'CUDAPlace' is not supported in CPU only device."); } #else - holder_.reset(new PlaceholderImpl( - boost::get(place), size, type)); + holder_.reset(new PlaceholderImpl( + boost::get(place), size, type)); } #endif offset_ = 0; diff --git a/paddle/framework/tensor_test.cc b/paddle/framework/tensor_test.cc index ceca64365a1a628642eb374a3e3bbdff490c955a..f347981f2e11fdc4770df8d5e7277cd55744ab3f 100644 --- a/paddle/framework/tensor_test.cc +++ b/paddle/framework/tensor_test.cc @@ -80,20 +80,20 @@ TEST(Tensor, MutableData) { float* p1 = nullptr; float* p2 = nullptr; // initialization - p1 = src_tensor.mutable_data(make_ddim({1, 2, 3}), GPUPlace()); + p1 = src_tensor.mutable_data(make_ddim({1, 2, 3}), CUDAPlace()); EXPECT_NE(p1, nullptr); // set src_tensor a new dim with large size // momery is supposed to be re-allocated - p2 = src_tensor.mutable_data(make_ddim({3, 4}), GPUPlace()); + p2 = src_tensor.mutable_data(make_ddim({3, 4}), CUDAPlace()); EXPECT_NE(p2, nullptr); EXPECT_NE(p1, p2); // set src_tensor a new dim with same size // momery block is supposed to be unchanged - p1 = src_tensor.mutable_data(make_ddim({2, 2, 3}), GPUPlace()); + p1 = 
src_tensor.mutable_data(make_ddim({2, 2, 3}), CUDAPlace()); EXPECT_EQ(p1, p2); // set src_tensor a new dim with smaller size // momery block is supposed to be unchanged - p2 = src_tensor.mutable_data(make_ddim({2, 2}), GPUPlace()); + p2 = src_tensor.mutable_data(make_ddim({2, 2}), CUDAPlace()); EXPECT_EQ(p1, p2); } #endif @@ -130,7 +130,7 @@ TEST(Tensor, ShareDataWith) { { Tensor src_tensor; Tensor dst_tensor; - src_tensor.mutable_data(make_ddim({2, 3, 4}), GPUPlace()); + src_tensor.mutable_data(make_ddim({2, 3, 4}), CUDAPlace()); dst_tensor.ShareDataWith(src_tensor); ASSERT_EQ(src_tensor.data(), dst_tensor.data()); } @@ -166,7 +166,7 @@ TEST(Tensor, Slice) { #ifdef PADDLE_WITH_CUDA { Tensor src_tensor; - src_tensor.mutable_data(make_ddim({6, 9}), GPUPlace()); + src_tensor.mutable_data(make_ddim({6, 9}), CUDAPlace()); Tensor slice_tensor = src_tensor.Slice(2, 6); DDim slice_dims = slice_tensor.dims(); ASSERT_EQ(arity(slice_dims), 2); @@ -176,11 +176,11 @@ TEST(Tensor, Slice) { uintptr_t src_data_address = reinterpret_cast(src_tensor.data()); uintptr_t src_mutable_data_address = reinterpret_cast( - src_tensor.mutable_data(src_tensor.dims(), GPUPlace())); + src_tensor.mutable_data(src_tensor.dims(), CUDAPlace())); uintptr_t slice_data_address = reinterpret_cast(slice_tensor.data()); uintptr_t slice_mutable_data_address = reinterpret_cast( - slice_tensor.mutable_data(slice_tensor.dims(), GPUPlace())); + slice_tensor.mutable_data(slice_tensor.dims(), CUDAPlace())); EXPECT_EQ(src_data_address, src_mutable_data_address); EXPECT_EQ(slice_data_address, slice_mutable_data_address); EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address); diff --git a/paddle/framework/tensor_util.h b/paddle/framework/tensor_util.h index 4e34b90d57eed8fea84b83045df61a98483c8849..ebfb0e553877b776afe13d8a4e7a7ffa8710405e 100644 --- a/paddle/framework/tensor_util.h +++ b/paddle/framework/tensor_util.h @@ -47,11 +47,11 @@ inline void CopyFrom(const Tensor& src, const platform::Place& dst_place, #ifdef PADDLE_WITH_CUDA else if (platform::is_gpu_place(src_place) && // NOLINT platform::is_cpu_place(dst_place)) { - auto src_gpu_place = boost::get(src_place); + auto src_gpu_place = boost::get(src_place); auto dst_cpu_place = boost::get(dst_place); auto ctx_place = ctx.GetPlace(); PADDLE_ENFORCE(platform::is_gpu_place(ctx_place)); - auto ctx_gpu_place = boost::get(ctx_place); + auto ctx_gpu_place = boost::get(ctx_place); PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place); memory::Copy( dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, @@ -59,21 +59,21 @@ inline void CopyFrom(const Tensor& src, const platform::Place& dst_place, } else if (platform::is_cpu_place(src_place) && platform::is_gpu_place(dst_place)) { auto src_cpu_place = boost::get(src_place); - auto dst_gpu_place = boost::get(dst_place); + auto dst_gpu_place = boost::get(dst_place); auto ctx_place = ctx.GetPlace(); PADDLE_ENFORCE(platform::is_gpu_place(ctx_place)); - auto ctx_gpu_place = boost::get(ctx_place); + auto ctx_gpu_place = boost::get(ctx_place); PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place); memory::Copy( dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, reinterpret_cast(ctx).stream()); } else if (platform::is_gpu_place(src_place) && platform::is_gpu_place(dst_place)) { - auto src_gpu_place = boost::get(src_place); - auto dst_gpu_place = boost::get(dst_place); + auto src_gpu_place = boost::get(src_place); + auto dst_gpu_place = boost::get(dst_place); auto ctx_place = ctx.GetPlace(); PADDLE_ENFORCE(platform::is_gpu_place(ctx_place)); 
- auto ctx_gpu_place = boost::get(ctx_place); + auto ctx_gpu_place = boost::get(ctx_place); PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place); memory::Copy( dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, @@ -82,6 +82,28 @@ inline void CopyFrom(const Tensor& src, const platform::Place& dst_place, #endif } +/** + * @brief CopyFrom support CPU <-> CPU + */ +inline void CopyFrom(const Tensor& src, const platform::Place& dst_place, + Tensor* dst) { + src.check_memory_size(); + dst->Resize(src.dims()); + + auto src_place = src.place(); + auto src_ptr = src.data(); + + auto dst_ptr = dst->mutable_data(dst_place, src.type()); + + auto size = src.numel() * SizeOfType(src.type()); + + PADDLE_ENFORCE(platform::is_cpu_place(src_place) && + platform::is_cpu_place(dst_place)); + + memory::Copy(boost::get(dst_place), dst_ptr, + boost::get(src_place), src_ptr, size); +} + /** * @brief Copy the content of an external vector to a tensor. * @@ -108,13 +130,28 @@ inline void CopyFromVector(const std::vector& src, #ifdef PADDLE_WITH_CUDA else if (platform::is_gpu_place(dst_place)) { // NOLINT memory::Copy( - boost::get(dst_place), dst_ptr, src_place, src_ptr, + boost::get(dst_place), dst_ptr, src_place, src_ptr, size, reinterpret_cast(ctx).stream()); } #endif } +/** + * @brief CopyFromVector CPU vector -> CPU Tensor + */ +template +inline void CopyFromVector(const std::vector& src, Tensor* dst) { + platform::CPUPlace dst_place = platform::CPUPlace(); + auto src_ptr = static_cast(src.data()); + platform::CPUPlace src_place; + dst->Resize({static_cast(src.size())}); + auto dst_ptr = static_cast(dst->mutable_data(dst_place)); + auto size = src.size() * sizeof(T); + + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); +} + /** * @brief Copy the content of a tensor to a vector * @@ -141,12 +178,30 @@ inline void CopyToVector(const Tensor& src, const platform::DeviceContext& ctx, #ifdef PADDLE_WITH_CUDA else if (platform::is_gpu_place(src.place())) { // NOLINT memory::Copy( - dst_place, dst_ptr, boost::get(src.place()), + dst_place, dst_ptr, boost::get(src.place()), src_ptr, size, reinterpret_cast(ctx).stream()); } #endif } +/** + * @brief CopyToVector CPUTensor <-> CPU Vector + */ +template +inline void CopyToVector(const Tensor& src, std::vector* dst) { + auto src_ptr = static_cast(src.data()); + auto size = src.numel() * sizeof(T); + + platform::CPUPlace dst_place; + dst->resize(src.numel()); + auto dst_ptr = static_cast(dst->data()); + + PADDLE_ENFORCE(platform::is_cpu_place(src.place())); + + memory::Copy(dst_place, dst_ptr, boost::get(src.place()), + src_ptr, size); +} + } // namespace framework } // namespace paddle diff --git a/paddle/framework/tensor_util_test.cc b/paddle/framework/tensor_util_test.cc index 03a70de182d0eb499a81413d38229c81c4378b91..6fc243aaf6ea68d716e7bb6a73289197fde56247 100644 --- a/paddle/framework/tensor_util_test.cc +++ b/paddle/framework/tensor_util_test.cc @@ -17,6 +17,7 @@ namespace paddle { namespace framework { + TEST(CopyFrom, Tensor) { Tensor src_tensor; Tensor dst_tensor; @@ -29,7 +30,7 @@ TEST(CopyFrom, Tensor) { memcpy(src_ptr, arr, 9 * sizeof(int)); auto cpu_place = new platform::CPUPlace(); - CopyFrom(src_tensor, *cpu_place, cpu_ctx, &dst_tensor); + CopyFrom(src_tensor, *cpu_place, &dst_tensor); const int* dst_ptr = dst_tensor.data(); ASSERT_NE(src_ptr, dst_ptr); @@ -58,7 +59,7 @@ TEST(CopyFrom, Tensor) { memcpy(src_ptr, arr, 9 * sizeof(int)); // CPU Tensor to GPU Tensor - auto gpu_place = new platform::GPUPlace(0); + auto gpu_place = new 
platform::CUDAPlace(0);
     platform::CUDADeviceContext gpu_ctx(*gpu_place);
     CopyFrom(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
 
@@ -104,8 +105,7 @@ TEST(CopyFromVector, Tensor) {
     // Copy to CPU Tensor
     cpu_tensor.Resize(make_ddim({3, 3}));
     auto cpu_place = new paddle::platform::CPUPlace();
-    CPUDeviceContext cpu_ctx(*cpu_place);
-    CopyFromVector(src_vec, cpu_ctx, &cpu_tensor);
+    CopyFromVector(src_vec, &cpu_tensor);
 
     // Compare Tensors
     const int* cpu_ptr = cpu_tensor.data<int>();
@@ -117,7 +117,7 @@
 
     src_vec.erase(src_vec.begin(), src_vec.begin() + 5);
     cpu_tensor.Resize(make_ddim({2, 2}));
-    CopyFromVector(src_vec, cpu_ctx, &cpu_tensor);
+    CopyFromVector(src_vec, &cpu_tensor);
     cpu_ptr = cpu_tensor.data<int>();
     src_ptr = src_vec.data();
     ASSERT_NE(src_ptr, cpu_ptr);
@@ -143,7 +143,7 @@ TEST(CopyFromVector, Tensor) {
 
     // Copy to GPUTensor
     gpu_tensor.Resize(make_ddim({3, 3}));
-    auto gpu_place = new paddle::platform::GPUPlace();
+    auto gpu_place = new paddle::platform::CUDAPlace();
     CUDADeviceContext gpu_ctx(*gpu_place);
     CopyFromVector(src_vec, gpu_ctx, &gpu_tensor);
     // Copy from GPU to CPU tensor for comparison
@@ -198,9 +198,8 @@ TEST(CopyToVector, Tensor) {
   }
 
   CPUPlace place;
-  CPUDeviceContext cpu_ctx(place);
   std::vector<int> dst;
-  CopyToVector(src, cpu_ctx, &dst);
+  CopyToVector(src, &dst);
 
   for (int i = 0; i < 3 * 3; ++i) {
     EXPECT_EQ(src_ptr[i], dst[i]);
@@ -210,7 +209,7 @@
   {
     std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9};
     Tensor gpu_tensor;
-    GPUPlace place;
+    CUDAPlace place;
     CUDADeviceContext gpu_ctx(place);
     CopyFromVector(src_vec, gpu_ctx, &gpu_tensor);
 
diff --git a/paddle/framework/threadpool.h b/paddle/framework/threadpool.h
new file mode 100644
index 0000000000000000000000000000000000000000..9a1ece3ae8452399205e71fbaa26977710fcdcac
--- /dev/null
+++ b/paddle/framework/threadpool.h
@@ -0,0 +1,161 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <condition_variable>
+#include <functional>
+#include <memory>
+#include <mutex>
+#include <queue>
+#include <thread>
+#include <vector>
+
+#include "paddle/platform/call_once.h"
+#include "paddle/platform/enforce.h"
+
+namespace paddle {
+namespace framework {
+
+typedef std::function<void()> Task;
+
+class ThreadPool {
+ public:
+  /**
+   * @brief Get an instance of the thread pool; the number of threads
+   * will be set to the number of hardware thread contexts.
+   */
+  static ThreadPool* GetInstance() {
+    std::call_once(init_flag, &ThreadPool::Init);
+    return threadpool.get();
+  }
+
+  ~ThreadPool() {
+    {
+      // notify all threads to stop running
+      running_ = false;
+      scheduled_.notify_all();
+    }
+
+    for (auto& t : threads_) {
+      t->join();
+      t.reset(nullptr);
+    }
+  }
+
+  int GetNumThreads() const { return num_threads_; }
+
+  int GetAvailable() {
+    std::unique_lock<std::mutex> lock(mutex_);
+    return available_;
+  }
+
+  /**
+   * @brief Push a function onto the queue; it will be scheduled and
+   * executed when a thread becomes available.
+   * @param[in] fn The task to be pushed onto the task queue.
+   */
+  void Run(const Task& fn) {
+    std::unique_lock<std::mutex> lock(mutex_);
+    tasks_.push(fn);
+    lock.unlock();
+    scheduled_.notify_one();
+  }
+
+  /**
+   * @brief Wait until all the tasks are completed.
+   */
+  void Wait() {
+    std::unique_lock<std::mutex> lock(mutex_);
+    completed_.wait(lock, [=] { return Done() == true; });
+  }
+
+ private:
+  ThreadPool& operator=(const ThreadPool&) = delete;
+  ThreadPool(const ThreadPool&) = delete;
+
+  ThreadPool(int num_threads)
+      : num_threads_(num_threads), available_(num_threads), running_(true) {
+    threads_.resize(num_threads);
+    for (auto& thread : threads_) {
+      // TODO(Yancey1989): bind each thread to a specific CPU core
+      thread.reset(new std::thread(std::bind(&ThreadPool::TaskLoop, this)));
+    }
+  }
+
+  /**
+   * @brief If the task queue is empty and available_
+   * is equal to the number of threads, it means that
+   * all tasks are completed.
+   *
+   * Note: this function is not thread-safe.
+   *
+   * @return true if all tasks are completed.
+   */
+  bool Done() { return tasks_.empty() && available_ == num_threads_; }
+
+  void TaskLoop() {
+    while (running_) {
+      std::unique_lock<std::mutex> lock(mutex_);
+      scheduled_.wait(lock, [=] { return !tasks_.empty() || !running_; });
+
+      if (!running_) {
+        break;
+      }
+      // pop a task from the task queue
+      auto task = tasks_.front();
+      tasks_.pop();
+
+      --available_;
+      lock.unlock();
+
+      // run the task
+      task();
+
+      {
+        std::unique_lock<std::mutex> lock(mutex_);
+        ++available_;
+        if (Done()) {
+          completed_.notify_all();
+        }
+      }
+    }
+  }
+
+  static void Init() {
+    if (threadpool.get() == nullptr) {
+      // TODO(Yancey1989): specify the maximum number of threads
+      int num_threads = std::thread::hardware_concurrency();
+      PADDLE_ENFORCE_GT(num_threads, 0);
+      threadpool.reset(new ThreadPool(num_threads));
+    }
+  }
+
+ private:
+  static std::unique_ptr<ThreadPool> threadpool;
+  static std::once_flag init_flag;
+
+  int num_threads_;
+  int available_;
+  bool running_;
+  std::queue<Task> tasks_;
+  std::vector<std::unique_ptr<std::thread>> threads_;
+  std::mutex mutex_;
+  std::condition_variable scheduled_;
+  std::condition_variable completed_;
+};
+
+std::unique_ptr<ThreadPool> ThreadPool::threadpool(nullptr);
+std::once_flag ThreadPool::init_flag;
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/threadpool_test.cc b/paddle/framework/threadpool_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..78c762608ed920c1a586c99d68952ffcc653c75e
--- /dev/null
+++ b/paddle/framework/threadpool_test.cc
@@ -0,0 +1,58 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#include "threadpool.h" +#include +#include +#include +#include +#include + +namespace framework = paddle::framework; + +void do_sum(framework::ThreadPool* pool, std::atomic& sum, int cnt) { + for (int i = 0; i < cnt; ++i) { + pool->Run([&sum]() { sum.fetch_add(1); }); + } +} + +TEST(ThreadPool, ConcurrentInit) { + framework::ThreadPool* pool; + int concurrent_cnt = 50; + std::vector threads; + for (int i = 0; i < concurrent_cnt; ++i) { + std::thread t([&pool]() { pool = framework::ThreadPool::GetInstance(); }); + threads.push_back(std::move(t)); + } + for (auto& t : threads) { + t.join(); + } +} + +TEST(ThreadPool, ConcurrentStart) { + framework::ThreadPool* pool = framework::ThreadPool::GetInstance(); + std::atomic sum(0); + std::vector threads; + int concurrent_cnt = 50; + // sum = (n * (n + 1)) / 2 + for (int i = 1; i <= concurrent_cnt; ++i) { + std::thread t(do_sum, pool, std::ref(sum), i); + threads.push_back(std::move(t)); + } + for (auto& t : threads) { + t.join(); + } + pool->Wait(); + EXPECT_EQ(sum, ((concurrent_cnt + 1) * concurrent_cnt) / 2); +} diff --git a/paddle/memory/README.md b/paddle/memory/README.md index 6cb003c50bc7d142d65b0591e7e5235431d2ea42..7cf61d089b39041b7a15184e0ea9211d14a66f5e 100644 --- a/paddle/memory/README.md +++ b/paddle/memory/README.md @@ -12,13 +12,13 @@ p = memory::Alloc(platform::CPUPlace(), 4*1024); To allocate 4KB memory on the 3rd GPU: ```cpp -p = memory::Alloc(platform::GPUPlace(2), 4*1024); +p = memory::Alloc(platform::CUDAPlace(2), 4*1024); ``` To free memory and check the so-far used amount of memory on a place: ```cpp -auto pl = platform::GPUPlace(0); +auto pl = platform::CUDAPlace(0); p = memory::Alloc(pl, 4*1024); cout << memory::Used(pl); memory::Free(pl, p); @@ -36,7 +36,7 @@ template size_t Used(Place); } // namespace memory ``` -These function templates have specializations on either `platform::CPUPlace` or `platform::GPUPlace`: +These function templates have specializations on either `platform::CPUPlace` or `platform::CUDAPlace`: ```cpp template<> @@ -49,7 +49,7 @@ and ```cpp template<> -void Alloc(GPUPlace p, size_t size) { +void Alloc(CUDAPlace p, size_t size) { return GetGPUBuddyAllocator(p.id)->Alloc(size); } ``` @@ -122,7 +122,7 @@ There are two implementations of `Context`: 1. [`CPUContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.h#L105), whose [`New` method](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.h#L131) calls [`g_cpu_allocator.get()->New(size_t)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.cc#L15) to allocate the memory. -1. [`CUDAContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L99), which has a data member [`int gpu_id_`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L202). This looks very similar to class `majel::GPUPlace`, who also has an `int id_` data member. `CUDAContext::New(size_t)` calls [`g_cub_allocator->DeviceAllocate(&ptr, nbytes)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.cu#L355) to allocate the memory. +1. [`CUDAContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L99), which has a data member [`int gpu_id_`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L202). This looks very similar to class `majel::CUDAPlace`, who also has an `int id_` data member. 
`CUDAContext::New(size_t)` calls [`g_cub_allocator->DeviceAllocate(&ptr, nbytes)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.cu#L355) to allocate the memory. ### Majel diff --git a/paddle/memory/memcpy.cc b/paddle/memory/memcpy.cc index 5c629dc3d2aca2705e439df836214c1284b31c8f..b46141aafd7146bd3def12d86108c10f1f143d20 100644 --- a/paddle/memory/memcpy.cc +++ b/paddle/memory/memcpy.cc @@ -28,31 +28,25 @@ void Copy(platform::CPUPlace, void* dst, #ifdef PADDLE_WITH_CUDA template <> -void Copy(platform::CPUPlace dst_place, - void* dst, - platform::GPUPlace src_place, - const void* src, size_t num, - cudaStream_t stream) { +void Copy( + platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place, + const void* src, size_t num, cudaStream_t stream) { platform::SetDeviceId(src_place.device); platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream); } template <> -void Copy(platform::GPUPlace dst_place, - void* dst, - platform::CPUPlace src_place, - const void* src, size_t num, - cudaStream_t stream) { +void Copy( + platform::CUDAPlace dst_place, void* dst, platform::CPUPlace src_place, + const void* src, size_t num, cudaStream_t stream) { platform::SetDeviceId(dst_place.device); platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream); } template <> -void Copy(platform::GPUPlace dst_place, - void* dst, - platform::GPUPlace src_place, - const void* src, size_t num, - cudaStream_t stream) { +void Copy( + platform::CUDAPlace dst_place, void* dst, platform::CUDAPlace src_place, + const void* src, size_t num, cudaStream_t stream) { if (dst_place == src_place) { platform::SetDeviceId(src_place.device); platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, stream); diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc index 9cafdfda75d0511227ef648d50a8635320a81d32..c4bb6baee7ebf2941cee5915ca2723c298689261 100644 --- a/paddle/memory/memory.cc +++ b/paddle/memory/memory.cc @@ -83,12 +83,12 @@ BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { } template <> -size_t Used(platform::GPUPlace place) { +size_t Used(platform::CUDAPlace place) { return GetGPUBuddyAllocator(place.device)->Used(); } template <> -void* Alloc(platform::GPUPlace place, size_t size) { +void* Alloc(platform::CUDAPlace place, size_t size) { auto* buddy_allocator = GetGPUBuddyAllocator(place.device); auto* ptr = buddy_allocator->Alloc(size); if (ptr == nullptr) { @@ -101,14 +101,14 @@ void* Alloc(platform::GPUPlace place, size_t size) { LOG(WARNING) << "total " << total; LOG(WARNING) << "GpuMinChunkSize " << platform::GpuMinChunkSize(); LOG(WARNING) << "GpuMaxChunkSize " << platform::GpuMaxChunkSize(); - LOG(WARNING) << "GPU memory used: " << Used(place); + LOG(WARNING) << "GPU memory used: " << Used(place); platform::SetDeviceId(cur_dev); } return ptr; } template <> -void Free(platform::GPUPlace place, void* p) { +void Free(platform::CUDAPlace place, void* p) { GetGPUBuddyAllocator(place.device)->Free(p); } diff --git a/paddle/memory/memory_test.cc b/paddle/memory/memory_test.cc index 2444931e26774ae80b916fbb7bd46ff93025d9ed..f476bf71264da59a5c546968f4689145e1d8801b 100644 --- a/paddle/memory/memory_test.cc +++ b/paddle/memory/memory_test.cc @@ -82,7 +82,7 @@ TEST(BuddyAllocator, CPUMultAlloc) { #ifdef PADDLE_WITH_CUDA -size_t align(size_t size, paddle::platform::GPUPlace place) { +size_t align(size_t size, paddle::platform::CUDAPlace place) { size += sizeof(paddle::memory::detail::Metadata); size_t alignment = paddle::platform::GpuMinChunkSize(); 
size_t remaining = size % alignment; @@ -94,7 +94,7 @@ TEST(BuddyAllocator, GPUAllocation) { EXPECT_EQ(p, nullptr); - paddle::platform::GPUPlace gpu(0); + paddle::platform::CUDAPlace gpu(0); p = paddle::memory::Alloc(gpu, 4096); EXPECT_NE(p, nullptr); @@ -103,7 +103,7 @@ TEST(BuddyAllocator, GPUAllocation) { } TEST(BuddyAllocator, GPUMultAlloc) { - paddle::platform::GPUPlace gpu; + paddle::platform::CUDAPlace gpu; std::unordered_map ps; diff --git a/paddle/operators/accuracy_op.cu b/paddle/operators/accuracy_op.cu index dd51aad105fecf4e3118f03e2f1868abb5523bc8..0aadd5af41531e54b357756441f92da668d4ec01 100644 --- a/paddle/operators/accuracy_op.cu +++ b/paddle/operators/accuracy_op.cu @@ -56,7 +56,7 @@ class AccuracyOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), - "It must use GPUPlace."); + "It must use CUDAPlace."); auto* inference = ctx.Input("Out"); auto* indices = ctx.Input("Indices"); auto* label = ctx.Input("Label"); diff --git a/paddle/operators/batch_norm_op.cu.cc b/paddle/operators/batch_norm_op.cu.cc index 55d0736a4c8e09eea637e5ab7e49af9a618e7fd8..3d17725ab47682355b2093782848849857f9bf59 100644 --- a/paddle/operators/batch_norm_op.cu.cc +++ b/paddle/operators/batch_norm_op.cu.cc @@ -53,7 +53,7 @@ class BatchNormKernel public: void Compute(const framework::ExecutionContext &ctx) const override { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), - "It must use GPUPlace."); + "It must use CUDAPlace."); double epsilon = static_cast(ctx.Attr("epsilon")); const float momentum = ctx.Attr("momentum"); const bool is_test = ctx.Attr("is_test"); @@ -179,7 +179,7 @@ class BatchNormGradKernel public: void Compute(const framework::ExecutionContext &ctx) const override { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), - "It must use GPUPlace."); + "It must use CUDAPlace."); double epsilon = static_cast(ctx.Attr("epsilon")); const std::string data_layout_str = ctx.Attr("data_layout"); const DataLayout data_layout = diff --git a/paddle/operators/conv_cudnn_op.cu.cc b/paddle/operators/conv_cudnn_op.cu.cc index 3da0a9001aafbb5b2c4b9a91c4527d9437ac38a1..79e020b7556e1e349d40820616798a7a10f3b221 100644 --- a/paddle/operators/conv_cudnn_op.cu.cc +++ b/paddle/operators/conv_cudnn_op.cu.cc @@ -36,7 +36,7 @@ class CudnnConvOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), - "It must use GPUPlace."); + "It must use CUDAPlace."); auto* input = ctx.Input("Input"); auto* filter = ctx.Input("Filter"); auto* output = ctx.Output("Output"); @@ -130,7 +130,7 @@ class CudnnConvOpKernel : public framework::OpKernel { handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, cudnn_output_desc, algo, &workspace_size_in_bytes)); // Allocate on GPU memory - platform::GPUPlace gpu = boost::get(ctx.GetPlace()); + platform::CUDAPlace gpu = boost::get(ctx.GetPlace()); cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes); // ------------------- cudnn conv forward --------------------- T alpha = 1.0f, beta = 0.0f; @@ -151,7 +151,7 @@ class CudnnConvGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), - "It must use GPUPlace."); + "It must use CUDAPlace."); auto input = ctx.Input("Input"); auto filter = ctx.Input("Filter"); 
auto output_grad = ctx.Input(framework::GradVarName("Output")); @@ -277,7 +277,7 @@ class CudnnConvGradOpKernel : public framework::OpKernel { // ------------------- cudnn conv workspace --------------------- // Already on GPU void* cudnn_workspace = nullptr; - platform::GPUPlace gpu = boost::get(ctx.GetPlace()); + platform::CUDAPlace gpu = boost::get(ctx.GetPlace()); cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes); // ------------------- cudnn conv backward data --------------------- T alpha = 1.0f, beta = 0.0f; diff --git a/paddle/operators/conv_transpose_cudnn_op.cu.cc b/paddle/operators/conv_transpose_cudnn_op.cu.cc index f0297f6c40c132c28b50184997d657451f26362b..b3663209ff989c1d8fddd2f393db44de23e1e2a7 100644 --- a/paddle/operators/conv_transpose_cudnn_op.cu.cc +++ b/paddle/operators/conv_transpose_cudnn_op.cu.cc @@ -35,7 +35,7 @@ class CudnnConvTransposeOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), - "It must use GPUPlace."); + "It must use CUDAPlace."); auto* input = ctx.Input("Input"); auto* filter = ctx.Input("Filter"); auto* output = ctx.Output("Output"); @@ -100,7 +100,7 @@ class CudnnConvTransposeOpKernel : public framework::OpKernel { cudnn_output_desc, algo, &workspace_size_in_bytes)); // Allocate on GPU memory - platform::GPUPlace gpu = boost::get(ctx.GetPlace()); + platform::CUDAPlace gpu = boost::get(ctx.GetPlace()); cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes); // ------------------- cudnn conv transpose forward --------------------- @@ -120,7 +120,7 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), - "It must use GPUPlace."); + "It must use CUDAPlace."); auto input = ctx.Input("Input"); auto filter = ctx.Input("Filter"); auto output_grad = ctx.Input(framework::GradVarName("Output")); @@ -201,7 +201,7 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel { // ------------------- cudnn conv workspace --------------------- // Already on GPU void* cudnn_workspace = nullptr; - platform::GPUPlace gpu = boost::get(ctx.GetPlace()); + platform::CUDAPlace gpu = boost::get(ctx.GetPlace()); cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes); // ------------------- cudnn conv backward data --------------------- // FIXME(typhoonzero): template type T may not be the same as cudnn call. 
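The rename above is mechanical: every `platform::GPUPlace` becomes `platform::CUDAPlace` with no behavioural change. A minimal sketch of what caller code looks like once this patch is applied, mirroring the updated example in paddle/memory/README.md; the headers, device index, and sizes here are illustrative assumptions, not part of the patch:

```cpp
// Sketch only -- assumes this patch is applied; not part of the diff itself.
#include <iostream>

#include "paddle/memory/memory.h"
#include "paddle/platform/place.h"

void AllocOnThirdGpu() {
  // Before this patch: paddle::platform::GPUPlace pl(2);
  paddle::platform::CUDAPlace pl(2);              // the 3rd GPU
  void* p = paddle::memory::Alloc(pl, 4 * 1024);  // 4 KB on that device
  std::cout << paddle::memory::Used(pl) << "\n";  // bytes used so far on pl
  paddle::memory::Free(pl, p);
}
```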
diff --git a/paddle/operators/detail/strided_memcpy.h b/paddle/operators/detail/strided_memcpy.h index 068c82f399316a1587d7322d8dab75823656800e..b81bb8ba7e9f7d29a75d56b6f403bd0d7ed86f7c 100644 --- a/paddle/operators/detail/strided_memcpy.h +++ b/paddle/operators/detail/strided_memcpy.h @@ -35,7 +35,7 @@ struct StridedMemcpyFunctor { memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T) * dst_dim.head); } else { #ifdef PADDLE_WITH_CUDA - auto& gpu_place = boost::get(place); + auto& gpu_place = boost::get(place); auto& cuda_ctx = reinterpret_cast(dev_ctx); memory::Copy(gpu_place, dst, gpu_place, src, sizeof(T) * dst_dim.head, diff --git a/paddle/operators/linear_chain_crf_op.h b/paddle/operators/linear_chain_crf_op.h index 694584e79c3a1e818814a4a2145f52d8db7cf10a..19c6715ec877dea6dcf0babc7373333a4d9eed0f 100644 --- a/paddle/operators/linear_chain_crf_op.h +++ b/paddle/operators/linear_chain_crf_op.h @@ -219,8 +219,8 @@ class LinearChainCRFOpKernel : public framework::OpKernel { // operators runs on GPU device. auto copyTensor = [](const platform::DeviceContext& ctx, const Tensor& src, Tensor* dst) { - dst->mutable_data(platform::GPUPlace()); - framework::CopyFrom(src, platform::GPUPlace(), ctx, dst); + dst->mutable_data(platform::CUDAPlace()); + framework::CopyFrom(src, platform::CUDAPlace(), ctx, dst); }; copyTensor(ctx, emission_exps_src, emission_exps_dst); copyTensor(ctx, transition_exps_src, transition_exps_dst); @@ -433,8 +433,8 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel { auto copyTensor = [](const platform::DeviceContext& ctx, const Tensor* src, Tensor* dst) { if (src && dst) { - dst->mutable_data(platform::GPUPlace()); - framework::CopyFrom(*src, platform::GPUPlace(), ctx, dst); + dst->mutable_data(platform::CUDAPlace()); + framework::CopyFrom(*src, platform::CUDAPlace(), ctx, dst); } }; copyTensor(ctx, emission_grad_src, emission_grad_dst); diff --git a/paddle/operators/lookup_table_op.cu b/paddle/operators/lookup_table_op.cu index 9431030a53975acafe9bcb22dc9164492929b07a..a3ab1a729761d59c058de1985b15575c9d50d3c5 100644 --- a/paddle/operators/lookup_table_op.cu +++ b/paddle/operators/lookup_table_op.cu @@ -101,7 +101,7 @@ class LookupTableGradCUDAKernel : public framework::OpKernel { // copy GPU memory to CPU pinned memory framework::Vector new_rows; new_rows.resize(ids_dim[0]); - auto gpu_place = boost::get(context.GetPlace()); + auto gpu_place = boost::get(context.GetPlace()); memory::Copy(platform::CPUPlace(), new_rows.data(), gpu_place, ids_data, ids_dim[0] * sizeof(int64_t), stream); diff --git a/paddle/operators/lstm_unit_op.cu b/paddle/operators/lstm_unit_op.cu index 291f2c295e78288c01c6575df936ceedceba7ce8..4b164d964c3f56e52dc2b6ddcd945a241b39a7f9 100644 --- a/paddle/operators/lstm_unit_op.cu +++ b/paddle/operators/lstm_unit_op.cu @@ -98,7 +98,7 @@ class LstmUnitOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), - "It must use GPUPlace."); + "It must use CUDAPlace."); auto* x_tensor = ctx.Input("X"); auto* c_prev_tensor = ctx.Input("C_prev"); @@ -129,7 +129,7 @@ class LstmUnitGradOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), - "It must use GPUPlace."); + "It must use CUDAPlace."); auto x_tensor = ctx.Input("X"); auto c_prev_tensor = ctx.Input("C_prev"); diff --git 
a/paddle/operators/math/im2col_test.cc b/paddle/operators/math/im2col_test.cc index 256f3bc9bd487d11b0f139ef057f5a98556b4db1..26c038e435827b401d723ee6eef2255a89670f46 100644 --- a/paddle/operators/math/im2col_test.cc +++ b/paddle/operators/math/im2col_test.cc @@ -159,6 +159,7 @@ void testIm2col() { TEST(math, im2col) { testIm2col(); #ifdef PADDLE_WITH_CUDA - testIm2col(); + testIm2col(); #endif } diff --git a/paddle/operators/math/math_function.cu b/paddle/operators/math/math_function.cu index 7852bb53a9035f71f52a51529c8e3cea22b0d4aa..0a818bc5d4575ac25f27c32b583780e1b6946f1e 100644 --- a/paddle/operators/math/math_function.cu +++ b/paddle/operators/math/math_function.cu @@ -105,7 +105,7 @@ void matmul( PADDLE_ENFORCE(platform::is_gpu_place(matrix_a.place()) && platform::is_gpu_place(matrix_b.place()) && platform::is_gpu_place(matrix_out->place()), - "Matrix must all be in GPUPlace"); + "Matrix must all be in CUDAPlace"); int M = dim_out[0]; int N = dim_out[1]; @@ -134,7 +134,7 @@ void matmul( PADDLE_ENFORCE(platform::is_gpu_place(matrix_a.place()) && platform::is_gpu_place(matrix_b.place()) && platform::is_gpu_place(matrix_out->place()), - "Matrix must all be in GPUPlace"); + "Matrix must all be in CUDAPlace"); int M = dim_out[0]; int N = dim_out[1]; @@ -266,7 +266,7 @@ struct TensorSetConstantGPU { }; template <> -void set_constant_with_place( +void set_constant_with_place( const platform::DeviceContext& context, framework::Tensor* tensor, float value) { framework::VisitDataType(framework::ToDataType(tensor->type()), @@ -277,7 +277,7 @@ template <> void set_constant_with_place( const platform::DeviceContext& context, framework::Tensor* tensor, float value) { - set_constant_with_place(context, tensor, value); + set_constant_with_place(context, tensor, value); } template struct RowwiseAdd; diff --git a/paddle/operators/math/math_function_test.cu b/paddle/operators/math/math_function_test.cu index 32e96d948714a8fd1fa2c089057603fdaed85c16..4325a79664f15cfaea48870cd503ce70cc31044f 100644 --- a/paddle/operators/math/math_function_test.cu +++ b/paddle/operators/math/math_function_test.cu @@ -13,7 +13,7 @@ TEST(math_function, notrans_mul_trans) { float arr[6] = {0, 1, 2, 3, 4, 5}; memcpy(input1_ptr, arr, 6 * sizeof(float)); - auto* gpu_place = new paddle::platform::GPUPlace(0); + auto* gpu_place = new paddle::platform::CUDAPlace(0); paddle::platform::CUDADeviceContext context(*gpu_place); paddle::framework::CopyFrom(input1, *gpu_place, context, &input1_gpu); @@ -47,7 +47,7 @@ TEST(math_function, trans_mul_notrans) { float arr[6] = {0, 1, 2, 3, 4, 5}; memcpy(input1_ptr, arr, 6 * sizeof(float)); - auto* gpu_place = new paddle::platform::GPUPlace(0); + auto* gpu_place = new paddle::platform::CUDAPlace(0); paddle::platform::CUDADeviceContext context(*gpu_place); paddle::framework::CopyFrom(input1, *gpu_place, context, &input1_gpu); @@ -96,7 +96,7 @@ TEST(math_function, gemm_notrans_cublas) { float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7}; memcpy(input3_ptr, arr3, 8 * sizeof(float)); - auto* gpu_place = new paddle::platform::GPUPlace(0); + auto* gpu_place = new paddle::platform::CUDAPlace(0); paddle::platform::CUDADeviceContext context(*gpu_place); paddle::framework::CopyFrom(input1, *gpu_place, context, &input1_gpu); @@ -151,7 +151,7 @@ TEST(math_function, gemm_trans_cublas) { float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7}; memcpy(input3_ptr, arr3, 8 * sizeof(float)); - auto* gpu_place = new paddle::platform::GPUPlace(0); + auto* gpu_place = new paddle::platform::CUDAPlace(0); paddle::platform::CUDADeviceContext 
context(*gpu_place); paddle::framework::CopyFrom(input1, *gpu_place, context, &input1_gpu); @@ -189,7 +189,7 @@ void GemvTest(int m, int n, bool trans) { T* data_b = vec_b.mutable_data({trans ? m : n}, *cpu_place); T* data_c = vec_c.mutable_data({trans ? n : m}, *cpu_place); - auto* gpu_place = new paddle::platform::GPUPlace(0); + auto* gpu_place = new paddle::platform::CUDAPlace(0); paddle::framework::Tensor g_mat_a; paddle::framework::Tensor g_vec_b; paddle::framework::Tensor g_vec_c; diff --git a/paddle/operators/math/selected_rows_functor.cu b/paddle/operators/math/selected_rows_functor.cu index c44577e00af5f362ae7e168495e496d60d05de95..9fddd97a36f7fdb6628d6eeb192cb216fdae3e5b 100644 --- a/paddle/operators/math/selected_rows_functor.cu +++ b/paddle/operators/math/selected_rows_functor.cu @@ -58,15 +58,15 @@ struct SelectedRowsAdd { PADDLE_ENFORCE(platform::is_gpu_place(out_place)); memory::Copy( - boost::get(out_place), out_data, - boost::get(in1_place), in1_data, + boost::get(out_place), out_data, + boost::get(in1_place), in1_data, in1_value.numel() * sizeof(T), reinterpret_cast(context).stream()); auto* in2_data = in2_value.data(); - memory::Copy(boost::get(out_place), + memory::Copy(boost::get(out_place), out_data + in1_value.numel(), - boost::get(in2_place), in2_data, + boost::get(in2_place), in2_data, in2_value.numel() * sizeof(T), context.stream()); } }; @@ -160,9 +160,9 @@ struct SelectedRowsAddTo { auto* in1_data = in1_value.data(); auto* in2_data = in2_value->data(); - memory::Copy(boost::get(in2_place), + memory::Copy(boost::get(in2_place), in2_data + input2_offset, - boost::get(in1_place), in1_data, + boost::get(in1_place), in1_data, in1_value.numel() * sizeof(T), context.stream()); } }; diff --git a/paddle/operators/math/selected_rows_functor_test.cu b/paddle/operators/math/selected_rows_functor_test.cu index 777caf5635647d11e8fde05a68fdf7e2c32f48df..0a2e36f68acee04bd6b272d37679c18231cb8760 100644 --- a/paddle/operators/math/selected_rows_functor_test.cu +++ b/paddle/operators/math/selected_rows_functor_test.cu @@ -21,7 +21,7 @@ TEST(selected_rows_functor, gpu_add) { using namespace paddle::platform; using namespace paddle::operators::math; - GPUPlace gpu_place(0); + CUDAPlace gpu_place(0); CPUPlace cpu_place; CUDADeviceContext ctx(gpu_place); SetConstant functor; @@ -119,7 +119,7 @@ TEST(selected_rows_functor, gpu_add_to) { using namespace paddle::platform; using namespace paddle::operators::math; - GPUPlace gpu_place(0); + CUDAPlace gpu_place(0); CPUPlace cpu_place; CUDADeviceContext ctx(gpu_place); SetConstant functor; diff --git a/paddle/operators/math/vol2col_test.cc b/paddle/operators/math/vol2col_test.cc index f46db3c56713399798a45854bf1613d07aee26e6..3794f0e52d200a08253a979991da04ec564cae47 100644 --- a/paddle/operators/math/vol2col_test.cc +++ b/paddle/operators/math/vol2col_test.cc @@ -122,6 +122,6 @@ TEST(math, vol2col) { testVol2col(); #ifdef PADDLE_WITH_CUDA testVol2col(); + paddle::platform::CUDAPlace>(); #endif // PADDLE_WITH_CUDA } diff --git a/paddle/operators/multiplex_op.cu b/paddle/operators/multiplex_op.cu index 47986e9ff86f2e08b0861cde35ac3a44b10caed1..57e6880b4e8b32eb36ce5d7f5c4a00ca0a3c48ff 100644 --- a/paddle/operators/multiplex_op.cu +++ b/paddle/operators/multiplex_op.cu @@ -36,7 +36,7 @@ class MultiplexGPUKernel : public framework::OpKernel { CopyFrom(*ids, platform::CPUPlace(), ctx.device_context(), &index_t_cpu); auto* index = index_t_cpu.data(); auto stream = ctx.cuda_device_context().stream(); - platform::GPUPlace place = 
boost::get(ctx.GetPlace()); + platform::CUDAPlace place = boost::get(ctx.GetPlace()); for (auto i = 0; i < rows; i++) { int32_t k = index[i]; PADDLE_ENFORCE_GE(k, 0, "index must be nonnegative."); @@ -73,7 +73,7 @@ class MultiplexGradGPUKernel : public framework::OpKernel { auto* index = index_t_cpu.data(); auto stream = ctx.cuda_device_context().stream(); - platform::GPUPlace place = boost::get(ctx.GetPlace()); + platform::CUDAPlace place = boost::get(ctx.GetPlace()); for (auto i = 0; i < rows; i++) { size_t k = static_cast(index[i]); if (d_ins[k]) { diff --git a/paddle/operators/nccl_op.cu.cc b/paddle/operators/nccl_op.cu.cc index 6ca6db7253da0e59c742f115cd25a1b8203a3044..1b986a13650de7d77f4828d71798ee00d61c1284 100644 --- a/paddle/operators/nccl_op.cu.cc +++ b/paddle/operators/nccl_op.cu.cc @@ -67,7 +67,7 @@ class NCCLAllReduceKernel : public framework::OpKernel { auto stream = ctx.cuda_device_context().stream(); // device id - int gpu_id = boost::get(ctx.GetPlace()).GetDeviceId(); + int gpu_id = boost::get(ctx.GetPlace()).GetDeviceId(); int idx = comm->GetCommId(gpu_id); for (size_t i = 0; i < ins.size(); ++i) { @@ -120,7 +120,7 @@ class NCCLReduceKernel : public framework::OpKernel { ctx.device_context()) .stream(); // device id - int gpu_id = boost::get(ctx.GetPlace()).GetDeviceId(); + int gpu_id = boost::get(ctx.GetPlace()).GetDeviceId(); int idx = comm->GetCommId(gpu_id); auto ins_names = ctx.Inputs("X"); @@ -164,7 +164,7 @@ class NCCLBcastKernel : public framework::OpKernel { ctx.device_context()) .stream(); // device id - int gpu_id = boost::get(ctx.GetPlace()).GetDeviceId(); + int gpu_id = boost::get(ctx.GetPlace()).GetDeviceId(); int idx = comm->GetCommId(gpu_id); if (idx == root) { diff --git a/paddle/operators/nccl_op_test.cu.cc b/paddle/operators/nccl_op_test.cu.cc index b6e4ccb73f8133a55c17ddfa93b3ab4a21496561..361bfa8d75963bce7344a7263732f10190b078f3 100644 --- a/paddle/operators/nccl_op_test.cu.cc +++ b/paddle/operators/nccl_op_test.cu.cc @@ -52,7 +52,7 @@ class NCCLTester : public ::testing::Test { virtual void SetUp() override { paddle::platform::CPUPlace cpu_place; for (size_t i = 0; i < gpu_list.size(); ++i) { - p::GPUPlace place(i); + p::CUDAPlace place(i); dev_ctxs.emplace_back(new p::CUDADeviceContext(place)); } @@ -87,7 +87,7 @@ class NCCLTester : public ::testing::Test { std::unique_lock lk(mu); const f::OpDesc *op1 = &op_desc; - p::GPUPlace place(gpu_id); + p::CUDAPlace place(gpu_id); auto &ctx = dev_ctxs.at(gpu_id); auto *send_tensor = scope->Var("st")->GetMutable(); @@ -171,7 +171,7 @@ TEST_F(NCCLTester, ncclAllReduceOp) { for (size_t i = 0; i < dev_scopes.size(); ++i) { p::CPUPlace cpu_place; - p::GPUPlace gpu_place(gpu_list[i]); + p::CUDAPlace gpu_place(gpu_list[i]); auto &recv_tensor = dev_scopes[i]->FindVar("rt")->Get(); auto *rt = recv_tensor.data(); @@ -180,7 +180,7 @@ TEST_F(NCCLTester, ncclAllReduceOp) { auto *ct = result_tensor->mutable_data(cpu_place); paddle::memory::Copy( - cpu_place, ct, p::GPUPlace(gpu_list[i]), rt, + cpu_place, ct, p::CUDAPlace(gpu_list[i]), rt, recv_tensor.numel() * sizeof(float), static_cast(dev_ctxs[i])->stream()); @@ -219,7 +219,7 @@ TEST_F(NCCLTester, ncclReduceOp) { float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0); p::CPUPlace cpu_place; - p::GPUPlace gpu_place(gpu_list[kRoot]); + p::CUDAPlace gpu_place(gpu_list[kRoot]); auto &recv_tensor = dev_scopes[kRoot]->FindVar("rt")->Get(); auto *rt = recv_tensor.data(); @@ -229,7 +229,7 @@ TEST_F(NCCLTester, ncclReduceOp) { auto *ct = 
result_tensor->mutable_data(cpu_place); paddle::memory::Copy( - cpu_place, ct, p::GPUPlace(gpu_list[kRoot]), rt, + cpu_place, ct, p::CUDAPlace(gpu_list[kRoot]), rt, recv_tensor.numel() * sizeof(float), static_cast(dev_ctxs[kRoot])->stream()); @@ -268,7 +268,7 @@ TEST_F(NCCLTester, ncclBcastOp) { float result = kRoot; p::CPUPlace cpu_place; - p::GPUPlace gpu_place(gpu_list[idx]); + p::CUDAPlace gpu_place(gpu_list[idx]); auto &recv_tensor = dev_scopes[idx]->FindVar("rt")->Get(); auto *rt = recv_tensor.data(); @@ -277,7 +277,7 @@ TEST_F(NCCLTester, ncclBcastOp) { auto *ct = result_tensor->mutable_data(cpu_place); paddle::memory::Copy( - cpu_place, ct, p::GPUPlace(gpu_list[idx]), rt, + cpu_place, ct, p::CUDAPlace(gpu_list[idx]), rt, recv_tensor.numel() * sizeof(float), static_cast(dev_ctxs[idx])->stream()); @@ -300,7 +300,7 @@ int main(int argc, char **argv) { places.emplace_back(paddle::platform::CPUPlace()); int count = paddle::platform::GetCUDADeviceCount(); for (int i = 0; i < count; ++i) { - places.emplace_back(paddle::platform::GPUPlace(i)); + places.emplace_back(paddle::platform::CUDAPlace(i)); gpu_list.emplace_back(i); } diff --git a/paddle/operators/pool_cudnn_op.cu.cc b/paddle/operators/pool_cudnn_op.cu.cc index fc2b37bd0fbac82005e709779b2939843b839596..2d0001ba1184c99d9fc642f60c97ba89cec97ccd 100644 --- a/paddle/operators/pool_cudnn_op.cu.cc +++ b/paddle/operators/pool_cudnn_op.cu.cc @@ -29,7 +29,7 @@ class PoolCudnnOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), - "It must use GPUPlace."); + "It must use CUDAPlace."); const Tensor *input = ctx.Input("X"); Tensor *output = ctx.Output("Out"); @@ -90,7 +90,7 @@ class PoolCudnnGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), - "It must use GPUPlace."); + "It must use CUDAPlace."); const Tensor *input = ctx.Input("X"); const Tensor *output = ctx.Input("Out"); diff --git a/paddle/operators/reshape_op.cu b/paddle/operators/reshape_op.cu index b7329238c0ea8ebb374d35bd7cddced3dfee1a2c..a5dcd2ec9647eb3085ea8cbb2f9edbff9ab9cf92 100644 --- a/paddle/operators/reshape_op.cu +++ b/paddle/operators/reshape_op.cu @@ -16,7 +16,7 @@ REGISTER_OP_CUDA_KERNEL( reshape, - paddle::operators::ReshapeKernel); + paddle::operators::ReshapeKernel); REGISTER_OP_CUDA_KERNEL( reshape_grad, - paddle::operators::ReshapeGradKernel); + paddle::operators::ReshapeGradKernel); diff --git a/paddle/operators/strided_memcpy_test.cc b/paddle/operators/strided_memcpy_test.cc index 230cc1ab0bbd5ac57eb7494795e3fbcdf02c3cc8..d47fd98d06e52b55056521d08231fa9316289a2b 100644 --- a/paddle/operators/strided_memcpy_test.cc +++ b/paddle/operators/strided_memcpy_test.cc @@ -82,7 +82,7 @@ TEST(StridedMemcpy, GPUCrop) { }; // clang-format on - platform::GPUPlace gpu0(0); + platform::CUDAPlace gpu0(0); platform::CPUPlace cpu; platform::CUDADeviceContext ctx(gpu0); @@ -121,7 +121,7 @@ TEST(StridedMemcpy, GPUConcat) { }; // clang-format on - platform::GPUPlace gpu0(0); + platform::CUDAPlace gpu0(0); platform::CPUPlace cpu; platform::CUDADeviceContext ctx(gpu0); diff --git a/paddle/operators/top_k_op.cu b/paddle/operators/top_k_op.cu index 453bd07267e3a6e33211117368dd9aff10a9e23f..0a70ad87e672080f11439f44f0c9bfcf01114fd6 100644 --- a/paddle/operators/top_k_op.cu +++ b/paddle/operators/top_k_op.cu @@ -283,7 +283,7 @@ class TopkOpCUDAKernel : 
public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), - "It must use GPUPlace."); + "It must use CUDAPlace."); auto* input = ctx.Input("X"); auto* output = ctx.Output("Out"); auto* indices = ctx.Output("Indices"); diff --git a/paddle/platform/cuda_profiler.h b/paddle/platform/cuda_profiler.h index b6311cb23d695c3cd851bcca120c24cced7fdd62..67d5f626d41c0fae280062533d6c1d1074341164 100644 --- a/paddle/platform/cuda_profiler.h +++ b/paddle/platform/cuda_profiler.h @@ -22,23 +22,7 @@ namespace paddle { namespace platform { void CudaProfilerInit(std::string output_file, std::string output_mode, - std::vector config_flags) { - std::array buf; - std::string tmpl = "/tmp/cuda_profile_config.XXXXXX"; - PADDLE_ENFORCE_LT(tmpl.size(), buf.size()); - memcpy(buf.data(), tmpl.data(), tmpl.size()); - auto result = mktemp(buf.data()); - PADDLE_ENFORCE(strlen(result) != 0); - std::string config_file = result; - - { - std::ofstream ofs(config_file, std::ios::out | std::ios::trunc); - PADDLE_ENFORCE(ofs.is_open(), "ofstream: ", ofs.rdstate()); - for (const auto& line : config_flags) { - ofs << line << std::endl; - } - } - + std::string config_file) { PADDLE_ENFORCE(output_mode == "kvp" || output_mode == "csv"); cudaOutputMode_t mode = output_mode == "csv" ? cudaCSV : cudaKeyValuePair; PADDLE_ENFORCE( diff --git a/paddle/platform/device_context.cc b/paddle/platform/device_context.cc index a28e9de716c857145955cced85b99b77ef89b101..8ee0f18e64599869670bd6665df43027cfde9501 100644 --- a/paddle/platform/device_context.cc +++ b/paddle/platform/device_context.cc @@ -58,10 +58,10 @@ DeviceContextPool::DeviceContextPool( #ifdef PADDLE_WITH_CUDA device_contexts_.emplace(places[i], new platform::CUDADeviceContext( - boost::get(places[i]))); + boost::get(places[i]))); #else PADDLE_THROW( - "'GPUPlace' is not supported, Please re-compile with WITH_GPU " + "'CUDAPlace' is not supported, Please re-compile with WITH_GPU " "option"); #endif } @@ -91,7 +91,7 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface { } ~EigenCudaStreamDevice() override {} - void Reinitialize(const cudaStream_t* cuda_stream, GPUPlace place) { + void Reinitialize(const cudaStream_t* cuda_stream, CUDAPlace place) { stream_ = cuda_stream; place_ = place; device_prop_ = &Eigen::m_deviceProperties[place.device]; @@ -130,14 +130,14 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface { } private: - GPUPlace place_; + CUDAPlace place_; const cudaStream_t* stream_; // not owned; const cudaDeviceProp* device_prop_; // not owned; mutable void* scratch_; mutable unsigned int* semaphore_; }; -CUDADeviceContext::CUDADeviceContext(GPUPlace place) : place_(place) { +CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) { SetDeviceId(place_.device); PADDLE_ENFORCE(cudaStreamCreate(&stream_)); eigen_stream_.reset(new EigenCudaStreamDevice()); diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h index 9b958f7c920a32c9208f3dfd3ff54ac9620da9e7..877a66363a1a523692b07e7bef0e11e3caddce9f 100644 --- a/paddle/platform/device_context.h +++ b/paddle/platform/device_context.h @@ -58,7 +58,7 @@ class EigenCudaStreamDevice; class CUDADeviceContext : public DeviceContext { public: - explicit CUDADeviceContext(GPUPlace place); + explicit CUDADeviceContext(CUDAPlace place); virtual ~CUDADeviceContext(); /*! \brief Wait for all operations completion in the stream. 
*/ @@ -80,7 +80,7 @@ class CUDADeviceContext : public DeviceContext { cudaStream_t stream() const; private: - GPUPlace place_; + CUDAPlace place_; std::unique_ptr eigen_device_; std::unique_ptr eigen_stream_; @@ -143,7 +143,7 @@ class DeviceContextPool { size_t operator()(const platform::Place& place) const { int pre_hash = place.which() + (1 << LEFT_SHIFT); if (platform::is_gpu_place(place)) { - pre_hash += boost::get(place).GetDeviceId(); + pre_hash += boost::get(place).GetDeviceId(); } return hash_(pre_hash); } diff --git a/paddle/platform/device_context_test.cu b/paddle/platform/device_context_test.cu index f046c79e0a015023568071a157ae183bfb8df556..186824c0194a2e1d300e60d1e8a84b2f65a20988 100644 --- a/paddle/platform/device_context_test.cu +++ b/paddle/platform/device_context_test.cu @@ -20,11 +20,11 @@ limitations under the License. */ TEST(Device, Init) { using paddle::platform::DeviceContext; using paddle::platform::CUDADeviceContext; - using paddle::platform::GPUPlace; + using paddle::platform::CUDAPlace; int count = paddle::platform::GetCUDADeviceCount(); for (int i = 0; i < count; i++) { - CUDADeviceContext* device_context = new CUDADeviceContext(GPUPlace(i)); + CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i)); Eigen::GpuDevice* gpu_device = device_context->eigen_device(); ASSERT_NE(nullptr, gpu_device); delete device_context; @@ -33,11 +33,11 @@ TEST(Device, Init) { TEST(Device, CUDADeviceContext) { using paddle::platform::CUDADeviceContext; - using paddle::platform::GPUPlace; + using paddle::platform::CUDAPlace; int count = paddle::platform::GetCUDADeviceCount(); for (int i = 0; i < count; i++) { - CUDADeviceContext* device_context = new CUDADeviceContext(GPUPlace(i)); + CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i)); Eigen::GpuDevice* gpu_device = device_context->eigen_device(); ASSERT_NE(nullptr, gpu_device); cudnnHandle_t cudnn_handle = device_context->cudnn_handle(); @@ -70,7 +70,7 @@ TEST(Device, DeviceContextPool) { using paddle::platform::CUDADeviceContext; using paddle::platform::Place; using paddle::platform::CPUPlace; - using paddle::platform::GPUPlace; + using paddle::platform::CUDAPlace; DeviceContextPool& pool = DeviceContextPool::Get(); auto cpu_dev_ctx1 = pool.Borrow(CPUPlace()); @@ -80,14 +80,14 @@ TEST(Device, DeviceContextPool) { std::vector gpu_places; int count = paddle::platform::GetCUDADeviceCount(); for (int i = 0; i < count; ++i) { - gpu_places.emplace_back(GPUPlace(i)); + gpu_places.emplace_back(CUDAPlace(i)); } auto dev_ctxs = pool.Borrow(gpu_places); for (size_t i = 0; i < dev_ctxs.size(); ++i) { auto* dev_ctx = static_cast(dev_ctxs[i]); - // check same as GPUPlace(i) - GPUPlace place = boost::get(dev_ctx->GetPlace()); + // check same as CUDAPlace(i) + CUDAPlace place = boost::get(dev_ctx->GetPlace()); EXPECT_EQ(place.GetDeviceId(), static_cast(i)); } } @@ -106,7 +106,7 @@ int main(int argc, char** argv) { places.emplace_back(paddle::platform::CPUPlace()); int count = paddle::platform::GetCUDADeviceCount(); for (int i = 0; i < count; ++i) { - places.emplace_back(paddle::platform::GPUPlace(i)); + places.emplace_back(paddle::platform::CUDAPlace(i)); } VLOG(0) << " DeviceCount " << count; diff --git a/paddle/platform/nccl_test.cu b/paddle/platform/nccl_test.cu index 6750c8da7db86500e9593cd41d39dbd229abad7a..f57c329402da4854b2028bf5836e5d7a013bf8e2 100644 --- a/paddle/platform/nccl_test.cu +++ b/paddle/platform/nccl_test.cu @@ -50,7 +50,7 @@ struct PerThreadData { T* RecvBuff() { return 
thrust::raw_pointer_cast(recv_buff.data()); } - PerThreadData(int gpu_id, size_t size) : dev_ctx(GPUPlace(gpu_id)) { + PerThreadData(int gpu_id, size_t size) : dev_ctx(CUDAPlace(gpu_id)) { send_buff.resize(size); for (size_t i = 0; i < size; ++i) { send_buff[i] = static_cast(i); @@ -140,7 +140,7 @@ int main(int argc, char** argv) { places.emplace_back(paddle::platform::CPUPlace()); int count = paddle::platform::GetCUDADeviceCount(); for (int i = 0; i < count; ++i) { - places.emplace_back(paddle::platform::GPUPlace(i)); + places.emplace_back(paddle::platform::CUDAPlace(i)); } VLOG(0) << " DeviceCount " << count; diff --git a/paddle/platform/place.cc b/paddle/platform/place.cc index 25fe8d21b49b07a6afe2938245906dc1bdd90398..4d23cfd8865bd83e9702169e3b3d5f88b8a32059 100644 --- a/paddle/platform/place.cc +++ b/paddle/platform/place.cc @@ -24,7 +24,9 @@ class PlacePrinter : public boost::static_visitor<> { explicit PlacePrinter(std::ostream &os) : os_(os) {} void operator()(const CPUPlace &) { os_ << "CPUPlace"; } void operator()(const MKLDNNPlace &) { os_ << "MKLDNNPlace"; } - void operator()(const GPUPlace &p) { os_ << "GPUPlace(" << p.device << ")"; } + void operator()(const CUDAPlace &p) { + os_ << "CUDAPlace(" << p.device << ")"; + } private: std::ostream &os_; @@ -37,12 +39,12 @@ static Place the_default_place; void set_place(const Place &place) { the_default_place = place; } const Place &get_place() { return the_default_place; } -const GPUPlace default_gpu() { return GPUPlace(0); } +const CUDAPlace default_gpu() { return CUDAPlace(0); } const CPUPlace default_cpu() { return CPUPlace(); } const MKLDNNPlace default_mkldnn() { return MKLDNNPlace(); } bool is_gpu_place(const Place &p) { - return boost::apply_visitor(IsGPUPlace(), p); + return boost::apply_visitor(IsCUDAPlace(), p); } bool is_cpu_place(const Place &p) { return !is_gpu_place(p) && !is_mkldnn_place(p); diff --git a/paddle/platform/place.h b/paddle/platform/place.h index daeafbbcd780aaeab20c8fcbbeed60a587e0049b..4eab1a3964bddcce40412450263610b7e0824721 100644 --- a/paddle/platform/place.h +++ b/paddle/platform/place.h @@ -39,43 +39,45 @@ struct MKLDNNPlace { inline bool operator!=(const MKLDNNPlace &) const { return false; } }; -struct GPUPlace { - GPUPlace() : GPUPlace(0) {} - explicit GPUPlace(int d) : device(d) {} +struct CUDAPlace { + CUDAPlace() : CUDAPlace(0) {} + explicit CUDAPlace(int d) : device(d) {} inline int GetDeviceId() const { return device; } // needed for variant equality comparison - inline bool operator==(const GPUPlace &o) const { return device == o.device; } - inline bool operator!=(const GPUPlace &o) const { return !(*this == o); } + inline bool operator==(const CUDAPlace &o) const { + return device == o.device; + } + inline bool operator!=(const CUDAPlace &o) const { return !(*this == o); } int device; }; -struct CUDNNPlace : public GPUPlace { - CUDNNPlace() : GPUPlace() {} - explicit CUDNNPlace(int d) : GPUPlace(d) {} +struct CUDNNPlace : public CUDAPlace { + CUDNNPlace() : CUDAPlace() {} + explicit CUDNNPlace(int d) : CUDAPlace(d) {} }; -struct IsGPUPlace : public boost::static_visitor { +struct IsCUDAPlace : public boost::static_visitor { bool operator()(const CPUPlace &) const { return false; } bool operator()(const MKLDNNPlace &) const { return false; } - bool operator()(const GPUPlace &gpu) const { return true; } + bool operator()(const CUDAPlace &gpu) const { return true; } bool operator()(const CUDNNPlace &) const { return true; } }; struct IsMKLDNNPlace : public boost::static_visitor { bool 
operator()(const MKLDNNPlace &) const { return true; } bool operator()(const CPUPlace &) const { return false; } - bool operator()(const GPUPlace &) const { return false; } + bool operator()(const CUDAPlace &) const { return false; } bool operator()(const CUDNNPlace &) const { return false; } }; -typedef boost::variant Place; +typedef boost::variant Place; void set_place(const Place &); const Place &get_place(); -const GPUPlace default_gpu(); +const CUDAPlace default_gpu(); const CPUPlace default_cpu(); const MKLDNNPlace default_mkldnn(); diff --git a/paddle/platform/place_test.cc b/paddle/platform/place_test.cc index c536b59ed8f71bd078bd09c5bd5afeab74c71b28..21f7d9f21323ec6d494c3c3a2fa53d07a31e9edd 100644 --- a/paddle/platform/place_test.cc +++ b/paddle/platform/place_test.cc @@ -4,7 +4,7 @@ TEST(Place, Equality) { paddle::platform::CPUPlace cpu; - paddle::platform::GPUPlace g0(0), g1(1), gg0(0); + paddle::platform::CUDAPlace g0(0), g1(1), gg0(0); paddle::platform::CUDNNPlace d0(0), d1(1), dd0(0); EXPECT_EQ(cpu, cpu); @@ -41,8 +41,8 @@ TEST(Place, Default) { TEST(Place, Print) { { std::stringstream ss; - ss << paddle::platform::GPUPlace(1); - EXPECT_EQ("GPUPlace(1)", ss.str()); + ss << paddle::platform::CUDAPlace(1); + EXPECT_EQ("CUDAPlace(1)", ss.str()); } { std::stringstream ss; diff --git a/paddle/platform/transform_test.cu b/paddle/platform/transform_test.cu index 464096111e4a85b8d64d9223bfb85a1d1d42fad4..8e2483aa84568a6986a923670f71050a12b55136 100644 --- a/paddle/platform/transform_test.cu +++ b/paddle/platform/transform_test.cu @@ -49,7 +49,7 @@ TEST(Transform, CPUUnary) { TEST(Transform, GPUUnary) { using namespace paddle::platform; using namespace paddle::memory; - GPUPlace gpu0(0); + CUDAPlace gpu0(0); CUDADeviceContext ctx(gpu0); float cpu_buf[4] = {0.1, 0.2, 0.3, 0.4}; float* gpu_buf = static_cast(Alloc(gpu0, sizeof(float) * 4)); @@ -80,7 +80,7 @@ TEST(Transform, GPUBinary) { using namespace paddle::platform; using namespace paddle::memory; int buf[4] = {1, 2, 3, 4}; - GPUPlace gpu0(0); + CUDAPlace gpu0(0); CUDADeviceContext ctx(gpu0); int* gpu_buf = static_cast(Alloc(gpu0, sizeof(buf))); Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf), ctx.stream()); diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index de6b24f70d84a28add0c0a09cac79b8c5b1044de..668a48e816919cbc6a4bb646adedfbbdcb57922a 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -79,7 +79,7 @@ PYBIND11_PLUGIN(core) { self.Resize(make_ddim(dim)); }) .def("alloc_float", - [](Tensor &self, paddle::platform::GPUPlace &place) { + [](Tensor &self, paddle::platform::CUDAPlace &place) { self.mutable_data(place); }) .def("alloc_float", @@ -91,7 +91,7 @@ PYBIND11_PLUGIN(core) { self.mutable_data(place); }) .def("alloc_int", - [](Tensor &self, paddle::platform::GPUPlace &place) { + [](Tensor &self, paddle::platform::CUDAPlace &place) { self.mutable_data(place); }) .def("set", PyCPUTensorSetFromArray) @@ -310,10 +310,10 @@ All parameter, weight, gradient are variables in Paddle. return new paddle::platform::CPUDeviceContext(); }) .def_static("create", - [](paddle::platform::GPUPlace& place) + [](paddle::platform::CUDAPlace& place) -> paddle::platform::DeviceContext* { #ifndef PADDLE_WITH_CUDA - PADDLE_THROW("GPUPlace is not supported in CPU device."); + PADDLE_THROW("CUDAPlace is not supported in CPU device."); #else return new paddle::platform::CUDADeviceContext(place); #endif @@ -323,9 +323,9 @@ All parameter, weight, gradient are variables in Paddle. 
#ifdef PADDLE_WITH_CUDA py::class_(m, "Communicator").def(py::init<>()); #endif - py::class_(m, "GPUPlace") + py::class_(m, "CUDAPlace") .def(py::init()) - .def("__str__", string::to_string); + .def("__str__", string::to_string); py::class_(m, "CPUPlace") .def(py::init<>()) @@ -338,7 +338,7 @@ All parameter, weight, gradient are variables in Paddle. self = cpu_place; }) .def("set_place", - [](platform::Place &self, const platform::GPUPlace &gpu_place) { + [](platform::Place &self, const platform::CUDAPlace &gpu_place) { self = gpu_place; }); @@ -363,7 +363,7 @@ All parameter, weight, gradient are variables in Paddle. const platform::CPUPlace &place) { self.Run(scope, place); }) .def("run", [](OperatorBase &self, const Scope &scope, - const platform::GPUPlace &place) { self.Run(scope, place); }) + const platform::CUDAPlace &place) { self.Run(scope, place); }) .def("type", [](const OperatorBase &op) -> std::string { return op.Type(); }) .def("outputs", diff --git a/paddle/pybind/tensor_py.h b/paddle/pybind/tensor_py.h index 413fd9b046f3f302feb5bd52beb284553a8ae192..7b8c29ff84ff4950a1ecf2d6364793b8567bfd4e 100644 --- a/paddle/pybind/tensor_py.h +++ b/paddle/pybind/tensor_py.h @@ -71,7 +71,7 @@ struct CastToPyBufferImpl { dst_ptr, src_ptr, sizeof(CUR_TYPE) * tensor.numel(), cudaMemcpyDeviceToHost, dev_ctx->stream()); #else - PADDLE_THROW("'GPUPlace' is not supported in CPU only device."); + PADDLE_THROW("'CUDAPlace' is not supported in CPU only device."); #endif } else if (paddle::platform::is_cpu_place(tensor.place())) { dst_tensor = tensor; @@ -127,7 +127,7 @@ template void PyCUDATensorSetFromArray( framework::Tensor &self, py::array_t array, - paddle::platform::GPUPlace &place) { + paddle::platform::CUDAPlace &place) { std::vector dims; dims.reserve(array.ndim()); for (size_t i = 0; i < array.ndim(); ++i) { diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc index 7ba1bf095ab74e4b64a8fb39b84172d6f371a2cf..108ff335bf6b920c648d4bfebbd6a40ffb6fd939 100644 --- a/paddle/testing/paddle_gtest_main.cc +++ b/paddle/testing/paddle_gtest_main.cc @@ -36,7 +36,7 @@ int main(int argc, char** argv) { paddle::memory::Used(paddle::platform::CPUPlace()); std::vector devs = {"CPU"}; #ifdef PADDLE_WITH_CUDA - paddle::memory::Used(paddle::platform::GPUPlace(0)); + paddle::memory::Used(paddle::platform::CUDAPlace(0)); devs.push_back("GPU:0"); #endif paddle::framework::InitDevices(devs); diff --git a/python/paddle/v2/fluid/__init__.py b/python/paddle/v2/fluid/__init__.py index 051b9094aafa74b186776ae2041f95d0fe6d5f77..c72b5730695dbc4f772015f1fb8dec6814cd1837 100644 --- a/python/paddle/v2/fluid/__init__.py +++ b/python/paddle/v2/fluid/__init__.py @@ -15,14 +15,14 @@ import backward import regularizer from param_attr import ParamAttr from data_feeder import DataFeeder -from core import LoDTensor, CPUPlace, GPUPlace +from core import LoDTensor, CPUPlace, CUDAPlace from distribute_transpiler import DistributeTranspiler import clip Tensor = LoDTensor __all__ = framework.__all__ + executor.__all__ + [ 'io', 'initializer', 'layers', 'nets', 'optimizer', 'backward', - 'regularizer', 'LoDTensor', 'CPUPlace', 'GPUPlace', 'Tensor', 'ParamAttr' + 'regularizer', 'LoDTensor', 'CPUPlace', 'CUDAPlace', 'Tensor', 'ParamAttr' 'DataFeeder', 'clip', 'DistributeTranspiler' ] diff --git a/python/paddle/v2/fluid/executor.py b/python/paddle/v2/fluid/executor.py index cdd576294f4f53bd3760b2c95a41b2129004a51a..2c91afb363bf72f2791e60c6df0d9130ccd698c5 100644 --- a/python/paddle/v2/fluid/executor.py +++ 
b/python/paddle/v2/fluid/executor.py @@ -47,7 +47,7 @@ class Executor(object): act_places.append(p) # TODO(dzhwinter) : consider that our fluid tests all written in - # GPUPlace(gpu_id), this will be changed in the future + # CUDAPlace(gpu_id), this will be changed in the future if core.is_compile_gpu(): core.init_devices(["CPU", "GPU:0"]) else: diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py index 2adce99d052639ec7d9063b1c234c623e7cdb9c6..941675ec3e1705896c83fb4f61d3f5aa7afb844e 100644 --- a/python/paddle/v2/fluid/layers/nn.py +++ b/python/paddle/v2/fluid/layers/nn.py @@ -163,8 +163,9 @@ def embedding(input, size, is_sparse=False, param_attr=None, dtype='float32'): Examples: .. code-block:: python + dict_size = len(dataset.ids) data = fluid.layers.data(name='ids', shape=[32, 32], dtype='float32') - fc = fluid.layers.embedding(input=data, size=16) + fc = fluid.layers.embedding(input=data, size=[dict_size, 16]) """ helper = LayerHelper('embedding', **locals()) diff --git a/python/paddle/v2/fluid/profiler.py b/python/paddle/v2/fluid/profiler.py index 2069b713faf41c5c00ceaf47e030864b98c678da..dcecd76224e70d03ed987a5bb104a977a527d218 100644 --- a/python/paddle/v2/fluid/profiler.py +++ b/python/paddle/v2/fluid/profiler.py @@ -1,5 +1,6 @@ import paddle.v2.fluid.core as core from contextlib import contextmanager +import os __all__ = ['CudaProfiler'] @@ -30,17 +31,21 @@ def cuda_profiler(output_file, output_mode=None, config=None): written into this file. output_mode (string) : The output mode has Key-Value pair format and Comma separated values format. It should be 'kvp' or 'csv'. - config (string) : The profiler options and counters can refer to - "Compute Command Line Profiler User Guide". + config (list of string) : The profiler options and counters can refer + to "Compute Command Line Profiler User Guide". """ if output_mode is None: output_mode = 'csv' if output_mode not in ['kvp', 'csv']: raise ValueError("The output mode must be 'kvp' or 'csv'.") config = NVPROF_CONFIG if config is None else config - core.nvprof_init(output_file, output_mode, config) + config_file = 'nvprof_config_file' + with open(config_file, 'wb') as fp: + fp.writelines(["%s\n" % item for item in config]) + core.nvprof_init(output_file, output_mode, config_file) # Enables profiler collection by the active CUDA profiling tool. core.nvprof_start() yield # Disables profiler collection. 
core.nvprof_stop() + os.remove(config_file) diff --git a/python/paddle/v2/fluid/tests/book/test_recommender_system.py b/python/paddle/v2/fluid/tests/book/test_recommender_system.py index b0c11ba3414d6d87d078189ccee0791921040d91..e3cc2a89371233014dec4ba3d730a866722d3eae 100644 --- a/python/paddle/v2/fluid/tests/book/test_recommender_system.py +++ b/python/paddle/v2/fluid/tests/book/test_recommender_system.py @@ -142,7 +142,7 @@ def main(): opts = sgd_optimizer.minimize(cost) if USE_GPU: - place = core.GPUPlace(0) + place = core.CUDAPlace(0) else: place = core.CPUPlace() diff --git a/python/paddle/v2/fluid/tests/op_test.py b/python/paddle/v2/fluid/tests/op_test.py index 087283bfded07e25ddfd446849b9c5ca9d1e7651..8dbfbd547a6677517f028997e6269709aac43b67 100644 --- a/python/paddle/v2/fluid/tests/op_test.py +++ b/python/paddle/v2/fluid/tests/op_test.py @@ -316,7 +316,7 @@ class OpTest(unittest.TestCase): def check_output(self, atol=1e-5): places = [core.CPUPlace()] if core.is_compile_gpu() and core.op_support_gpu(self.op_type): - places.append(core.GPUPlace(0)) + places.append(core.CUDAPlace(0)) for place in places: self.check_output_with_place(place, atol) @@ -379,7 +379,7 @@ class OpTest(unittest.TestCase): "Gradient Check On %s" % str(cpu_place)) if core.is_compile_gpu() and self.op.support_gpu(): - gpu_place = core.GPUPlace(0) + gpu_place = core.CUDAPlace(0) gpu_analytic_grads = self._get_gradient(inputs_to_check, gpu_place, output_names, no_grad_set) diff --git a/python/paddle/v2/fluid/tests/test_adagrad_op.py b/python/paddle/v2/fluid/tests/test_adagrad_op.py index 1ff3932164bed75be71b5c6b7114df362b893f09..7b2d02fbf4256d2c27383a3452d526271af543a3 100644 --- a/python/paddle/v2/fluid/tests/test_adagrad_op.py +++ b/python/paddle/v2/fluid/tests/test_adagrad_op.py @@ -167,7 +167,7 @@ class TestSparseAdagradOp(unittest.TestCase): def test_sparse_adagrad(self): places = [core.CPUPlace()] if core.is_compile_gpu(): - places.append(core.GPUPlace(0)) + places.append(core.CUDAPlace(0)) for place in places: self.check_with_place(place) diff --git a/python/paddle/v2/fluid/tests/test_batch_norm_op.py b/python/paddle/v2/fluid/tests/test_batch_norm_op.py index dfc047e1f0dc9fcf3d72d007b17d4c2de2077fbd..abbd48d2b843cedb77caffc13413d2f9695defa6 100644 --- a/python/paddle/v2/fluid/tests/test_batch_norm_op.py +++ b/python/paddle/v2/fluid/tests/test_batch_norm_op.py @@ -304,7 +304,7 @@ class TestBatchNormOp(OpTest): self.__assert_close(saved_variance_tensor, saved_variance, "saved_variance") self.__assert_close(mean_out_tensor, mean_out, "mean_out") - if isinstance(place, core.GPUPlace): + if isinstance(place, core.CUDAPlace): atol = 5e-2 else: atol = 1e-4 @@ -339,7 +339,7 @@ class TestBatchNormOp(OpTest): places = [core.CPUPlace()] if core.is_compile_gpu() and core.op_support_gpu("batch_norm"): - places.append(core.GPUPlace(0)) + places.append(core.CUDAPlace(0)) core.init_devices(["CPU", "GPU:0"]) else: diff --git a/python/paddle/v2/fluid/tests/test_gaussian_random_op.py b/python/paddle/v2/fluid/tests/test_gaussian_random_op.py index 4afe0c6a6d36b4ab7b88b459ce8d182b287b860e..6f6a60ccb3ff17f6a12eec6974b8b2d73885c29f 100644 --- a/python/paddle/v2/fluid/tests/test_gaussian_random_op.py +++ b/python/paddle/v2/fluid/tests/test_gaussian_random_op.py @@ -20,7 +20,7 @@ class TestGaussianRandomOp(unittest.TestCase): def test_gpu(self): if core.is_compile_gpu(): - self.gaussian_random_test(place=fluid.GPUPlace(0)) + self.gaussian_random_test(place=fluid.CUDAPlace(0)) def gaussian_random_test(self, place): diff --git 
a/python/paddle/v2/fluid/tests/test_profiler.py b/python/paddle/v2/fluid/tests/test_profiler.py index 395d0dc36a3d1d6fbfebb4cdf34395c4edee412d..e3f3ac58ef9b30864849770510f7339749dab84f 100644 --- a/python/paddle/v2/fluid/tests/test_profiler.py +++ b/python/paddle/v2/fluid/tests/test_profiler.py @@ -3,6 +3,7 @@ import numpy as np import paddle.v2.fluid as fluid import paddle.v2.fluid.profiler as profiler import paddle.v2.fluid.layers as layers +import os class TestProfiler(unittest.TestCase): @@ -14,14 +15,16 @@ class TestProfiler(unittest.TestCase): data = layers.data(name='data', shape=[3, 28, 28], dtype='float32') conv = layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1]) - place = fluid.GPUPlace(0) + place = fluid.CUDAPlace(0) exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) - with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof: + output_file = 'cuda_profiler.txt' + with profiler.cuda_profiler(output_file, 'csv') as nvprof: for i in range(epoc): input = np.random.random(dshape).astype('float32') exe.run(fluid.default_main_program(), feed={'data': input}) + os.remove(output_file) if __name__ == '__main__': diff --git a/python/paddle/v2/fluid/tests/test_sgd_op.py b/python/paddle/v2/fluid/tests/test_sgd_op.py index 9c345792beef46f65ec12e111f1d645fb31e69c7..14d41e172a22c677235ab3fa997ef6f0b6e39778 100644 --- a/python/paddle/v2/fluid/tests/test_sgd_op.py +++ b/python/paddle/v2/fluid/tests/test_sgd_op.py @@ -78,7 +78,7 @@ class TestSparseSGDOp(unittest.TestCase): def test_sparse_sgd(self): places = [core.CPUPlace()] if core.is_compile_gpu(): - places.append(core.GPUPlace(0)) + places.append(core.CUDAPlace(0)) for place in places: self.check_with_place(place) diff --git a/python/paddle/v2/fluid/tests/test_uniform_random_op.py b/python/paddle/v2/fluid/tests/test_uniform_random_op.py index d6872c8ba351a13b6fb8622cc23029c8c5cbe2e1..dbe4d6bcd069d2088b3cc1b4efd575d14afd4198 100644 --- a/python/paddle/v2/fluid/tests/test_uniform_random_op.py +++ b/python/paddle/v2/fluid/tests/test_uniform_random_op.py @@ -23,7 +23,7 @@ class TestUniformRandomOp(unittest.TestCase): def test_gpu(self): if core.is_compile_gpu(): - self.uniform_random_test(place=core.GPUPlace(0)) + self.uniform_random_test(place=core.CUDAPlace(0)) def uniform_random_test(self, place): program = fluid.Program()
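The user-visible effect of the rename above is that fluid scripts construct `fluid.CUDAPlace(gpu_id)` where they previously constructed `fluid.GPUPlace(gpu_id)`. A minimal usage sketch, assuming a build from this revision, falling back to `CPUPlace` when GPU support is not compiled in:

```python
import paddle.v2.fluid as fluid
import paddle.v2.fluid.core as core

# CUDAPlace replaces the old GPUPlace spelling; everything else is unchanged.
place = fluid.CUDAPlace(0) if core.is_compile_gpu() else fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
```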
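The `cuda_profiler` changes move the config plumbing out of C++: `profiler.cuda_profiler` now writes the `config` list of option strings to a config file, passes that path to `core.nvprof_init`, and removes the file after profiling stops. A hedged sketch of driving the context manager with an explicit option list; the counter names are illustrative examples from the Compute Command Line Profiler guide, not taken from this patch:

```python
import paddle.v2.fluid as fluid
import paddle.v2.fluid.profiler as profiler

# Requires a WITH_GPU build; the profiler only wraps CUDA work.
place = fluid.CUDAPlace(0)
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())

# Each string in `config` becomes one line of the on-disk config file
# that cuda_profiler() hands to core.nvprof_init().
with profiler.cuda_profiler('cuda_profiler.txt', 'csv',
                            config=['gpustarttimestamp', 'streamid']) as nvprof:
    # Run the fluid program (or any CUDA work) to be profiled here.
    exe.run(fluid.default_main_program())
```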
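The corrected `embedding` docstring implies that `size` is the `[vocabulary_size, embedding_dim]` shape of the lookup table rather than a single integer. A small illustrative sketch; the `int64` id dtype, the input shape, and the concrete sizes are assumptions for the example, not taken from this patch:

```python
import paddle.v2.fluid as fluid

dict_size = 10000  # assumed vocabulary size, for illustration only
ids = fluid.layers.data(name='ids', shape=[1], dtype='int64')
emb = fluid.layers.embedding(input=ids, size=[dict_size, 16])
```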