Commit 0d2235aa authored by dzhwinter, committed by QI JUN

GPUPlace to CUDAPlace (#6960)

Parent 87aae57c
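This commit is a mechanical rename of `platform::GPUPlace` to `platform::CUDAPlace` across the framework, memory, operator, and platform code, and in the design docs and error messages. As a rough illustration (not part of the diff below), calling code that allocates device memory changes as in the following minimal sketch; it assumes the `memory::Alloc`/`Used`/`Free` API described in the memory design doc further down, and the header paths are indicative only, not taken from this commit:

```cpp
#include <iostream>

// Indicative header paths; adjust to the actual tree layout.
#include "paddle/memory/memory.h"
#include "paddle/platform/place.h"

int main() {
  // Before this commit the device was addressed as platform::GPUPlace(0);
  // after it, the same device is a CUDAPlace.
  paddle::platform::CUDAPlace place(0);

  // Allocate 4KB on GPU 0, report usage so far, then release it.
  void* p = paddle::memory::Alloc(place, 4 * 1024);
  std::cout << paddle::memory::Used(place) << std::endl;
  paddle::memory::Free(place, p);
  return 0;
}
```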
@@ -54,7 +54,7 @@ bool InitDevices(const std::vector<std::string> &devices) {
 #ifdef PADDLE_WITH_CUDA
 auto pos = string::RFind(p, ':', string::Piece::npos);
 auto number = device.substr(pos + 1);
-places.emplace_back(platform::GPUPlace(std::stoi(number)));
+places.emplace_back(platform::CUDAPlace(std::stoi(number)));
 #else
 LOG(WARNING)
 << "'GPU' is not supported, Please re-compile with WITH_GPU option";
...
@@ -224,7 +224,7 @@ void SerializeToStream(std::ostream &os, const LoDTensor &tensor,
 while (size != 0) {
 size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
 memory::Copy(cpu, buf.get(),
-boost::get<platform::GPUPlace>(tensor.place()),
+boost::get<platform::CUDAPlace>(tensor.place()),
 reinterpret_cast<const void *>(data), size_to_write,
 gpu_dev_ctx.stream());
 gpu_dev_ctx.Wait();
...
@@ -27,7 +27,7 @@ __global__ void test(size_t* a, int size) {
 TEST(LoDTensor, LoDInGPU) {
 paddle::framework::LoDTensor lod_tensor;
-paddle::platform::GPUPlace place(0);
+paddle::platform::CUDAPlace place(0);
 paddle::framework::LoD src_lod;
 src_lod.push_back(std::vector<size_t>{0, 2, 4, 6, 8, 10, 12, 14});
...
@@ -37,13 +37,13 @@ TEST(OpKernelType, Hash) {
 using OpKernelType = paddle::framework::OpKernelType;
 using DataType = paddle::framework::proto::DataType;
 using CPUPlace = paddle::platform::CPUPlace;
-using GPUPlace = paddle::platform::GPUPlace;
+using CUDAPlace = paddle::platform::CUDAPlace;
 using DataLayout = paddle::framework::DataLayout;
 using LibraryType = paddle::framework::LibraryType;
 OpKernelType op_kernel_type_1(DataType::FP32, CPUPlace(), DataLayout::kNCHW,
 LibraryType::kCUDNN);
-OpKernelType op_kernel_type_2(DataType::FP32, GPUPlace(0), DataLayout::kNCHW,
+OpKernelType op_kernel_type_2(DataType::FP32, CUDAPlace(0), DataLayout::kNCHW,
 LibraryType::kCUDNN);
 OpKernelType::Hash hasher;
...
@@ -188,7 +188,7 @@ class OpKernelRegistrar : public Registrar {
 }
 #define REGISTER_OP_CUDA_KERNEL(op_type, ...) \
-REGISTER_OP_KERNEL(op_type, CUDA, ::paddle::platform::GPUPlace, __VA_ARGS__)
+REGISTER_OP_KERNEL(op_type, CUDA, ::paddle::platform::CUDAPlace, __VA_ARGS__)
 #define REGISTER_OP_CPU_KERNEL(op_type, ...) \
 REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__)
...
@@ -71,7 +71,7 @@ private:
 ```
 ```c++
-typedef boost::variant<GpuPlace, CpuPlace> Place;
+typedef boost::variant<CUDAPlace, CpuPlace> Place;
 typedef boost::variant<Dim<1>, Dim<2>, Dim<3>, Dim<4>, Dim<5>,
 Dim<6>, Dim<7>, Dim<8>, Dim<9>> DDimVar;
 typedef boost::variant<
...
@@ -125,11 +125,11 @@ inline void* Tensor::mutable_data(platform::Place place, std::type_index type) {
 boost::get<platform::CPUPlace>(place), size, type));
 } else if (platform::is_gpu_place(place)) {
 #ifndef PADDLE_WITH_CUDA
-PADDLE_THROW("'GPUPlace' is not supported in CPU only device.");
+PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
 }
 #else
-holder_.reset(new PlaceholderImpl<platform::GPUPlace>(
-boost::get<platform::GPUPlace>(place), size, type));
+holder_.reset(new PlaceholderImpl<platform::CUDAPlace>(
+boost::get<platform::CUDAPlace>(place), size, type));
 }
 #endif
 offset_ = 0;
...
@@ -80,20 +80,20 @@ TEST(Tensor, MutableData) {
 float* p1 = nullptr;
 float* p2 = nullptr;
 // initialization
-p1 = src_tensor.mutable_data<float>(make_ddim({1, 2, 3}), GPUPlace());
+p1 = src_tensor.mutable_data<float>(make_ddim({1, 2, 3}), CUDAPlace());
 EXPECT_NE(p1, nullptr);
 // set src_tensor a new dim with large size
 // momery is supposed to be re-allocated
-p2 = src_tensor.mutable_data<float>(make_ddim({3, 4}), GPUPlace());
+p2 = src_tensor.mutable_data<float>(make_ddim({3, 4}), CUDAPlace());
 EXPECT_NE(p2, nullptr);
 EXPECT_NE(p1, p2);
 // set src_tensor a new dim with same size
 // momery block is supposed to be unchanged
-p1 = src_tensor.mutable_data<float>(make_ddim({2, 2, 3}), GPUPlace());
+p1 = src_tensor.mutable_data<float>(make_ddim({2, 2, 3}), CUDAPlace());
 EXPECT_EQ(p1, p2);
 // set src_tensor a new dim with smaller size
 // momery block is supposed to be unchanged
-p2 = src_tensor.mutable_data<float>(make_ddim({2, 2}), GPUPlace());
+p2 = src_tensor.mutable_data<float>(make_ddim({2, 2}), CUDAPlace());
 EXPECT_EQ(p1, p2);
 }
 #endif
@@ -130,7 +130,7 @@ TEST(Tensor, ShareDataWith) {
 {
 Tensor src_tensor;
 Tensor dst_tensor;
-src_tensor.mutable_data<int>(make_ddim({2, 3, 4}), GPUPlace());
+src_tensor.mutable_data<int>(make_ddim({2, 3, 4}), CUDAPlace());
 dst_tensor.ShareDataWith(src_tensor);
 ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
 }
@@ -166,7 +166,7 @@ TEST(Tensor, Slice) {
 #ifdef PADDLE_WITH_CUDA
 {
 Tensor src_tensor;
-src_tensor.mutable_data<double>(make_ddim({6, 9}), GPUPlace());
+src_tensor.mutable_data<double>(make_ddim({6, 9}), CUDAPlace());
 Tensor slice_tensor = src_tensor.Slice(2, 6);
 DDim slice_dims = slice_tensor.dims();
 ASSERT_EQ(arity(slice_dims), 2);
@@ -176,11 +176,11 @@ TEST(Tensor, Slice) {
 uintptr_t src_data_address =
 reinterpret_cast<uintptr_t>(src_tensor.data<double>());
 uintptr_t src_mutable_data_address = reinterpret_cast<uintptr_t>(
-src_tensor.mutable_data<double>(src_tensor.dims(), GPUPlace()));
+src_tensor.mutable_data<double>(src_tensor.dims(), CUDAPlace()));
 uintptr_t slice_data_address =
 reinterpret_cast<uintptr_t>(slice_tensor.data<double>());
 uintptr_t slice_mutable_data_address = reinterpret_cast<uintptr_t>(
-slice_tensor.mutable_data<double>(slice_tensor.dims(), GPUPlace()));
+slice_tensor.mutable_data<double>(slice_tensor.dims(), CUDAPlace()));
 EXPECT_EQ(src_data_address, src_mutable_data_address);
 EXPECT_EQ(slice_data_address, slice_mutable_data_address);
 EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address);
...
@@ -47,11 +47,11 @@ inline void CopyFrom(const Tensor& src, const platform::Place& dst_place,
 #ifdef PADDLE_WITH_CUDA
 else if (platform::is_gpu_place(src_place) && // NOLINT
 platform::is_cpu_place(dst_place)) {
-auto src_gpu_place = boost::get<platform::GPUPlace>(src_place);
+auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
 auto dst_cpu_place = boost::get<platform::CPUPlace>(dst_place);
 auto ctx_place = ctx.GetPlace();
 PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
-auto ctx_gpu_place = boost::get<platform::GPUPlace>(ctx_place);
+auto ctx_gpu_place = boost::get<platform::CUDAPlace>(ctx_place);
 PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place);
 memory::Copy(
 dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size,
@@ -59,21 +59,21 @@ inline void CopyFrom(const Tensor& src, const platform::Place& dst_place,
 } else if (platform::is_cpu_place(src_place) &&
 platform::is_gpu_place(dst_place)) {
 auto src_cpu_place = boost::get<platform::CPUPlace>(src_place);
-auto dst_gpu_place = boost::get<platform::GPUPlace>(dst_place);
+auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
 auto ctx_place = ctx.GetPlace();
 PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
-auto ctx_gpu_place = boost::get<platform::GPUPlace>(ctx_place);
+auto ctx_gpu_place = boost::get<platform::CUDAPlace>(ctx_place);
 PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place);
 memory::Copy(
 dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size,
 reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
 } else if (platform::is_gpu_place(src_place) &&
 platform::is_gpu_place(dst_place)) {
-auto src_gpu_place = boost::get<platform::GPUPlace>(src_place);
-auto dst_gpu_place = boost::get<platform::GPUPlace>(dst_place);
+auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
+auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
 auto ctx_place = ctx.GetPlace();
 PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
-auto ctx_gpu_place = boost::get<platform::GPUPlace>(ctx_place);
+auto ctx_gpu_place = boost::get<platform::CUDAPlace>(ctx_place);
 PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place);
 memory::Copy(
 dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
@@ -108,7 +108,7 @@ inline void CopyFromVector(const std::vector<T>& src,
 #ifdef PADDLE_WITH_CUDA
 else if (platform::is_gpu_place(dst_place)) { // NOLINT
 memory::Copy(
-boost::get<platform::GPUPlace>(dst_place), dst_ptr, src_place, src_ptr,
+boost::get<platform::CUDAPlace>(dst_place), dst_ptr, src_place, src_ptr,
 size,
 reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
 }
@@ -141,7 +141,7 @@ inline void CopyToVector(const Tensor& src, const platform::DeviceContext& ctx,
 #ifdef PADDLE_WITH_CUDA
 else if (platform::is_gpu_place(src.place())) { // NOLINT
 memory::Copy(
-dst_place, dst_ptr, boost::get<platform::GPUPlace>(src.place()),
+dst_place, dst_ptr, boost::get<platform::CUDAPlace>(src.place()),
 src_ptr, size,
 reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
 }
...
@@ -58,7 +58,7 @@ TEST(CopyFrom, Tensor) {
 memcpy(src_ptr, arr, 9 * sizeof(int));
 // CPU Tensor to GPU Tensor
-auto gpu_place = new platform::GPUPlace(0);
+auto gpu_place = new platform::CUDAPlace(0);
 platform::CUDADeviceContext gpu_ctx(*gpu_place);
 CopyFrom(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
@@ -143,7 +143,7 @@ TEST(CopyFromVector, Tensor) {
 // Copy to GPUTensor
 gpu_tensor.Resize(make_ddim({3, 3}));
-auto gpu_place = new paddle::platform::GPUPlace();
+auto gpu_place = new paddle::platform::CUDAPlace();
 CUDADeviceContext gpu_ctx(*gpu_place);
 CopyFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
 // Copy from GPU to CPU tensor for comparison
@@ -210,7 +210,7 @@ TEST(CopyToVector, Tensor) {
 {
 std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9};
 Tensor gpu_tensor;
-GPUPlace place;
+CUDAPlace place;
 CUDADeviceContext gpu_ctx(place);
 CopyFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
...
@@ -12,13 +12,13 @@ p = memory::Alloc(platform::CPUPlace(), 4*1024);
 To allocate 4KB memory on the 3rd GPU:
 ```cpp
-p = memory::Alloc(platform::GPUPlace(2), 4*1024);
+p = memory::Alloc(platform::CUDAPlace(2), 4*1024);
 ```
 To free memory and check the so-far used amount of memory on a place:
 ```cpp
-auto pl = platform::GPUPlace(0);
+auto pl = platform::CUDAPlace(0);
 p = memory::Alloc(pl, 4*1024);
 cout << memory::Used(pl);
 memory::Free(pl, p);
@@ -36,7 +36,7 @@ template <typename Place> size_t Used(Place);
 } // namespace memory
 ```
-These function templates have specializations on either `platform::CPUPlace` or `platform::GPUPlace`:
+These function templates have specializations on either `platform::CPUPlace` or `platform::CUDAPlace`:
 ```cpp
 template<>
@@ -49,7 +49,7 @@ and
 ```cpp
 template<>
-void Alloc<GPUPlace>(GPUPlace p, size_t size) {
+void Alloc<CUDAPlace>(CUDAPlace p, size_t size) {
 return GetGPUBuddyAllocator(p.id)->Alloc(size);
 }
 ```
@@ -122,7 +122,7 @@ There are two implementations of `Context`:
 1. [`CPUContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.h#L105), whose [`New` method](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.h#L131) calls [`g_cpu_allocator.get()->New(size_t)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.cc#L15) to allocate the memory.
-1. [`CUDAContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L99), which has a data member [`int gpu_id_`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L202). This looks very similar to class `majel::GPUPlace`, who also has an `int id_` data member. `CUDAContext::New(size_t)` calls [`g_cub_allocator->DeviceAllocate(&ptr, nbytes)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.cu#L355) to allocate the memory.
+1. [`CUDAContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L99), which has a data member [`int gpu_id_`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L202). This looks very similar to class `majel::CUDAPlace`, who also has an `int id_` data member. `CUDAContext::New(size_t)` calls [`g_cub_allocator->DeviceAllocate(&ptr, nbytes)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.cu#L355) to allocate the memory.
 ### Majel
...
@@ -28,31 +28,25 @@ void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
 #ifdef PADDLE_WITH_CUDA
 template <>
-void Copy<platform::CPUPlace, platform::GPUPlace>(platform::CPUPlace dst_place,
-void* dst,
-platform::GPUPlace src_place,
-const void* src, size_t num,
-cudaStream_t stream) {
+void Copy<platform::CPUPlace, platform::CUDAPlace>(
+platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place,
+const void* src, size_t num, cudaStream_t stream) {
 platform::SetDeviceId(src_place.device);
 platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
 }
 template <>
-void Copy<platform::GPUPlace, platform::CPUPlace>(platform::GPUPlace dst_place,
-void* dst,
-platform::CPUPlace src_place,
-const void* src, size_t num,
-cudaStream_t stream) {
+void Copy<platform::CUDAPlace, platform::CPUPlace>(
+platform::CUDAPlace dst_place, void* dst, platform::CPUPlace src_place,
+const void* src, size_t num, cudaStream_t stream) {
 platform::SetDeviceId(dst_place.device);
 platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
 }
 template <>
-void Copy<platform::GPUPlace, platform::GPUPlace>(platform::GPUPlace dst_place,
-void* dst,
-platform::GPUPlace src_place,
-const void* src, size_t num,
-cudaStream_t stream) {
+void Copy<platform::CUDAPlace, platform::CUDAPlace>(
+platform::CUDAPlace dst_place, void* dst, platform::CUDAPlace src_place,
+const void* src, size_t num, cudaStream_t stream) {
 if (dst_place == src_place) {
 platform::SetDeviceId(src_place.device);
 platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, stream);
...
@@ -83,12 +83,12 @@ BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
 }
 template <>
-size_t Used<platform::GPUPlace>(platform::GPUPlace place) {
+size_t Used<platform::CUDAPlace>(platform::CUDAPlace place) {
 return GetGPUBuddyAllocator(place.device)->Used();
 }
 template <>
-void* Alloc<platform::GPUPlace>(platform::GPUPlace place, size_t size) {
+void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size) {
 auto* buddy_allocator = GetGPUBuddyAllocator(place.device);
 auto* ptr = buddy_allocator->Alloc(size);
 if (ptr == nullptr) {
@@ -101,14 +101,14 @@ void* Alloc<platform::GPUPlace>(platform::GPUPlace place, size_t size) {
 LOG(WARNING) << "total " << total;
 LOG(WARNING) << "GpuMinChunkSize " << platform::GpuMinChunkSize();
 LOG(WARNING) << "GpuMaxChunkSize " << platform::GpuMaxChunkSize();
-LOG(WARNING) << "GPU memory used: " << Used<platform::GPUPlace>(place);
+LOG(WARNING) << "GPU memory used: " << Used<platform::CUDAPlace>(place);
 platform::SetDeviceId(cur_dev);
 }
 return ptr;
 }
 template <>
-void Free<platform::GPUPlace>(platform::GPUPlace place, void* p) {
+void Free<platform::CUDAPlace>(platform::CUDAPlace place, void* p) {
 GetGPUBuddyAllocator(place.device)->Free(p);
 }
...
@@ -82,7 +82,7 @@ TEST(BuddyAllocator, CPUMultAlloc) {
 #ifdef PADDLE_WITH_CUDA
-size_t align(size_t size, paddle::platform::GPUPlace place) {
+size_t align(size_t size, paddle::platform::CUDAPlace place) {
 size += sizeof(paddle::memory::detail::Metadata);
 size_t alignment = paddle::platform::GpuMinChunkSize();
 size_t remaining = size % alignment;
@@ -94,7 +94,7 @@ TEST(BuddyAllocator, GPUAllocation) {
 EXPECT_EQ(p, nullptr);
-paddle::platform::GPUPlace gpu(0);
+paddle::platform::CUDAPlace gpu(0);
 p = paddle::memory::Alloc(gpu, 4096);
 EXPECT_NE(p, nullptr);
@@ -103,7 +103,7 @@ TEST(BuddyAllocator, GPUAllocation) {
 }
 TEST(BuddyAllocator, GPUMultAlloc) {
-paddle::platform::GPUPlace gpu;
+paddle::platform::CUDAPlace gpu;
 std::unordered_map<void *, size_t> ps;
...
@@ -56,7 +56,7 @@ class AccuracyOpCUDAKernel : public framework::OpKernel<T> {
 public:
 void Compute(const framework::ExecutionContext& ctx) const override {
 PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-"It must use GPUPlace.");
+"It must use CUDAPlace.");
 auto* inference = ctx.Input<Tensor>("Out");
 auto* indices = ctx.Input<Tensor>("Indices");
 auto* label = ctx.Input<Tensor>("Label");
...
@@ -53,7 +53,7 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
 public:
 void Compute(const framework::ExecutionContext &ctx) const override {
 PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-"It must use GPUPlace.");
+"It must use CUDAPlace.");
 double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
 const float momentum = ctx.Attr<float>("momentum");
 const bool is_test = ctx.Attr<bool>("is_test");
@@ -179,7 +179,7 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
 public:
 void Compute(const framework::ExecutionContext &ctx) const override {
 PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-"It must use GPUPlace.");
+"It must use CUDAPlace.");
 double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
 const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
 const DataLayout data_layout =
...
@@ -36,7 +36,7 @@ class CudnnConvOpKernel : public framework::OpKernel<T> {
 public:
 void Compute(const framework::ExecutionContext& ctx) const override {
 PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-"It must use GPUPlace.");
+"It must use CUDAPlace.");
 auto* input = ctx.Input<Tensor>("Input");
 auto* filter = ctx.Input<Tensor>("Filter");
 auto* output = ctx.Output<Tensor>("Output");
@@ -130,7 +130,7 @@ class CudnnConvOpKernel : public framework::OpKernel<T> {
 handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
 cudnn_output_desc, algo, &workspace_size_in_bytes));
 // Allocate on GPU memory
-platform::GPUPlace gpu = boost::get<platform::GPUPlace>(ctx.GetPlace());
+platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
 cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
 // ------------------- cudnn conv forward ---------------------
 T alpha = 1.0f, beta = 0.0f;
@@ -151,7 +151,7 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
 public:
 void Compute(const framework::ExecutionContext& ctx) const override {
 PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-"It must use GPUPlace.");
+"It must use CUDAPlace.");
 auto input = ctx.Input<Tensor>("Input");
 auto filter = ctx.Input<Tensor>("Filter");
 auto output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
@@ -277,7 +277,7 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
 // ------------------- cudnn conv workspace ---------------------
 // Already on GPU
 void* cudnn_workspace = nullptr;
-platform::GPUPlace gpu = boost::get<platform::GPUPlace>(ctx.GetPlace());
+platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
 cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
 // ------------------- cudnn conv backward data ---------------------
 T alpha = 1.0f, beta = 0.0f;
...
@@ -35,7 +35,7 @@ class CudnnConvTransposeOpKernel : public framework::OpKernel<T> {
 public:
 void Compute(const framework::ExecutionContext& ctx) const override {
 PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-"It must use GPUPlace.");
+"It must use CUDAPlace.");
 auto* input = ctx.Input<Tensor>("Input");
 auto* filter = ctx.Input<Tensor>("Filter");
 auto* output = ctx.Output<Tensor>("Output");
@@ -100,7 +100,7 @@ class CudnnConvTransposeOpKernel : public framework::OpKernel<T> {
 cudnn_output_desc, algo, &workspace_size_in_bytes));
 // Allocate on GPU memory
-platform::GPUPlace gpu = boost::get<platform::GPUPlace>(ctx.GetPlace());
+platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
 cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
 // ------------------- cudnn conv transpose forward ---------------------
@@ -120,7 +120,7 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel<T> {
 public:
 void Compute(const framework::ExecutionContext& ctx) const override {
 PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-"It must use GPUPlace.");
+"It must use CUDAPlace.");
 auto input = ctx.Input<Tensor>("Input");
 auto filter = ctx.Input<Tensor>("Filter");
 auto output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
@@ -201,7 +201,7 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel<T> {
 // ------------------- cudnn conv workspace ---------------------
 // Already on GPU
 void* cudnn_workspace = nullptr;
-platform::GPUPlace gpu = boost::get<platform::GPUPlace>(ctx.GetPlace());
+platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
 cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
 // ------------------- cudnn conv backward data ---------------------
 // FIXME(typhoonzero): template type T may not be the same as cudnn call.
...
@@ -35,7 +35,7 @@ struct StridedMemcpyFunctor<T, 1> {
 memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T) * dst_dim.head);
 } else {
 #ifdef PADDLE_WITH_CUDA
-auto& gpu_place = boost::get<platform::GPUPlace>(place);
+auto& gpu_place = boost::get<platform::CUDAPlace>(place);
 auto& cuda_ctx =
 reinterpret_cast<const platform::CUDADeviceContext&>(dev_ctx);
 memory::Copy(gpu_place, dst, gpu_place, src, sizeof(T) * dst_dim.head,
...
@@ -219,8 +219,8 @@ class LinearChainCRFOpKernel : public framework::OpKernel<T> {
 // operators runs on GPU device.
 auto copyTensor = [](const platform::DeviceContext& ctx, const Tensor& src,
 Tensor* dst) {
-dst->mutable_data<T>(platform::GPUPlace());
-framework::CopyFrom(src, platform::GPUPlace(), ctx, dst);
+dst->mutable_data<T>(platform::CUDAPlace());
+framework::CopyFrom(src, platform::CUDAPlace(), ctx, dst);
 };
 copyTensor(ctx, emission_exps_src, emission_exps_dst);
 copyTensor(ctx, transition_exps_src, transition_exps_dst);
@@ -433,8 +433,8 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel<T> {
 auto copyTensor = [](const platform::DeviceContext& ctx, const Tensor* src,
 Tensor* dst) {
 if (src && dst) {
-dst->mutable_data<T>(platform::GPUPlace());
-framework::CopyFrom(*src, platform::GPUPlace(), ctx, dst);
+dst->mutable_data<T>(platform::CUDAPlace());
+framework::CopyFrom(*src, platform::CUDAPlace(), ctx, dst);
 }
 };
 copyTensor(ctx, emission_grad_src, emission_grad_dst);
...
@@ -101,7 +101,7 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
 // copy GPU memory to CPU pinned memory
 framework::Vector<int64_t> new_rows;
 new_rows.resize(ids_dim[0]);
-auto gpu_place = boost::get<platform::GPUPlace>(context.GetPlace());
+auto gpu_place = boost::get<platform::CUDAPlace>(context.GetPlace());
 memory::Copy(platform::CPUPlace(), new_rows.data(), gpu_place, ids_data,
 ids_dim[0] * sizeof(int64_t), stream);
...
@@ -98,7 +98,7 @@ class LstmUnitOpCUDAKernel : public framework::OpKernel<T> {
 public:
 void Compute(const framework::ExecutionContext& ctx) const override {
 PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-"It must use GPUPlace.");
+"It must use CUDAPlace.");
 auto* x_tensor = ctx.Input<framework::Tensor>("X");
 auto* c_prev_tensor = ctx.Input<framework::Tensor>("C_prev");
@@ -129,7 +129,7 @@ class LstmUnitGradOpCUDAKernel : public framework::OpKernel<T> {
 public:
 void Compute(const framework::ExecutionContext& ctx) const override {
 PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-"It must use GPUPlace.");
+"It must use CUDAPlace.");
 auto x_tensor = ctx.Input<Tensor>("X");
 auto c_prev_tensor = ctx.Input<Tensor>("C_prev");
...
@@ -159,6 +159,7 @@ void testIm2col() {
 TEST(math, im2col) {
 testIm2col<paddle::platform::CPUDeviceContext, paddle::platform::CPUPlace>();
 #ifdef PADDLE_WITH_CUDA
-testIm2col<paddle::platform::CUDADeviceContext, paddle::platform::GPUPlace>();
+testIm2col<paddle::platform::CUDADeviceContext,
+paddle::platform::CUDAPlace>();
 #endif
 }
@@ -105,7 +105,7 @@ void matmul<platform::CUDADeviceContext, float>(
 PADDLE_ENFORCE(platform::is_gpu_place(matrix_a.place()) &&
 platform::is_gpu_place(matrix_b.place()) &&
 platform::is_gpu_place(matrix_out->place()),
-"Matrix must all be in GPUPlace");
+"Matrix must all be in CUDAPlace");
 int M = dim_out[0];
 int N = dim_out[1];
@@ -134,7 +134,7 @@ void matmul<platform::CUDADeviceContext, double>(
 PADDLE_ENFORCE(platform::is_gpu_place(matrix_a.place()) &&
 platform::is_gpu_place(matrix_b.place()) &&
 platform::is_gpu_place(matrix_out->place()),
-"Matrix must all be in GPUPlace");
+"Matrix must all be in CUDAPlace");
 int M = dim_out[0];
 int N = dim_out[1];
@@ -266,7 +266,7 @@ struct TensorSetConstantGPU {
 };
 template <>
-void set_constant_with_place<platform::GPUPlace>(
+void set_constant_with_place<platform::CUDAPlace>(
 const platform::DeviceContext& context, framework::Tensor* tensor,
 float value) {
 framework::VisitDataType(framework::ToDataType(tensor->type()),
@@ -277,7 +277,7 @@ template <>
 void set_constant_with_place<platform::CUDNNPlace>(
 const platform::DeviceContext& context, framework::Tensor* tensor,
 float value) {
-set_constant_with_place<platform::GPUPlace>(context, tensor, value);
+set_constant_with_place<platform::CUDAPlace>(context, tensor, value);
 }
 template struct RowwiseAdd<platform::CUDADeviceContext, float>;
...
@@ -13,7 +13,7 @@ TEST(math_function, notrans_mul_trans) {
 float arr[6] = {0, 1, 2, 3, 4, 5};
 memcpy(input1_ptr, arr, 6 * sizeof(float));
-auto* gpu_place = new paddle::platform::GPUPlace(0);
+auto* gpu_place = new paddle::platform::CUDAPlace(0);
 paddle::platform::CUDADeviceContext context(*gpu_place);
 paddle::framework::CopyFrom(input1, *gpu_place, context, &input1_gpu);
@@ -47,7 +47,7 @@ TEST(math_function, trans_mul_notrans) {
 float arr[6] = {0, 1, 2, 3, 4, 5};
 memcpy(input1_ptr, arr, 6 * sizeof(float));
-auto* gpu_place = new paddle::platform::GPUPlace(0);
+auto* gpu_place = new paddle::platform::CUDAPlace(0);
 paddle::platform::CUDADeviceContext context(*gpu_place);
 paddle::framework::CopyFrom(input1, *gpu_place, context, &input1_gpu);
@@ -96,7 +96,7 @@ TEST(math_function, gemm_notrans_cublas) {
 float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7};
 memcpy(input3_ptr, arr3, 8 * sizeof(float));
-auto* gpu_place = new paddle::platform::GPUPlace(0);
+auto* gpu_place = new paddle::platform::CUDAPlace(0);
 paddle::platform::CUDADeviceContext context(*gpu_place);
 paddle::framework::CopyFrom(input1, *gpu_place, context, &input1_gpu);
@@ -151,7 +151,7 @@ TEST(math_function, gemm_trans_cublas) {
 float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7};
 memcpy(input3_ptr, arr3, 8 * sizeof(float));
-auto* gpu_place = new paddle::platform::GPUPlace(0);
+auto* gpu_place = new paddle::platform::CUDAPlace(0);
 paddle::platform::CUDADeviceContext context(*gpu_place);
 paddle::framework::CopyFrom(input1, *gpu_place, context, &input1_gpu);
@@ -189,7 +189,7 @@ void GemvTest(int m, int n, bool trans) {
 T* data_b = vec_b.mutable_data<T>({trans ? m : n}, *cpu_place);
 T* data_c = vec_c.mutable_data<T>({trans ? n : m}, *cpu_place);
-auto* gpu_place = new paddle::platform::GPUPlace(0);
+auto* gpu_place = new paddle::platform::CUDAPlace(0);
 paddle::framework::Tensor g_mat_a;
 paddle::framework::Tensor g_vec_b;
 paddle::framework::Tensor g_vec_c;
...
@@ -58,15 +58,15 @@ struct SelectedRowsAdd<platform::CUDADeviceContext, T> {
 PADDLE_ENFORCE(platform::is_gpu_place(out_place));
 memory::Copy(
-boost::get<platform::GPUPlace>(out_place), out_data,
-boost::get<platform::GPUPlace>(in1_place), in1_data,
+boost::get<platform::CUDAPlace>(out_place), out_data,
+boost::get<platform::CUDAPlace>(in1_place), in1_data,
 in1_value.numel() * sizeof(T),
 reinterpret_cast<const platform::CUDADeviceContext&>(context).stream());
 auto* in2_data = in2_value.data<T>();
-memory::Copy(boost::get<platform::GPUPlace>(out_place),
+memory::Copy(boost::get<platform::CUDAPlace>(out_place),
 out_data + in1_value.numel(),
-boost::get<platform::GPUPlace>(in2_place), in2_data,
+boost::get<platform::CUDAPlace>(in2_place), in2_data,
 in2_value.numel() * sizeof(T), context.stream());
 }
 };
@@ -160,9 +160,9 @@ struct SelectedRowsAddTo<platform::CUDADeviceContext, T> {
 auto* in1_data = in1_value.data<T>();
 auto* in2_data = in2_value->data<T>();
-memory::Copy(boost::get<platform::GPUPlace>(in2_place),
+memory::Copy(boost::get<platform::CUDAPlace>(in2_place),
 in2_data + input2_offset,
-boost::get<platform::GPUPlace>(in1_place), in1_data,
+boost::get<platform::CUDAPlace>(in1_place), in1_data,
 in1_value.numel() * sizeof(T), context.stream());
 }
 };
...
@@ -21,7 +21,7 @@ TEST(selected_rows_functor, gpu_add) {
 using namespace paddle::platform;
 using namespace paddle::operators::math;
-GPUPlace gpu_place(0);
+CUDAPlace gpu_place(0);
 CPUPlace cpu_place;
 CUDADeviceContext ctx(gpu_place);
 SetConstant<CUDADeviceContext, float> functor;
@@ -119,7 +119,7 @@ TEST(selected_rows_functor, gpu_add_to) {
 using namespace paddle::platform;
 using namespace paddle::operators::math;
-GPUPlace gpu_place(0);
+CUDAPlace gpu_place(0);
 CPUPlace cpu_place;
 CUDADeviceContext ctx(gpu_place);
 SetConstant<CUDADeviceContext, float> functor;
...
@@ -122,6 +122,6 @@ TEST(math, vol2col) {
 testVol2col<paddle::platform::CPUDeviceContext, paddle::platform::CPUPlace>();
 #ifdef PADDLE_WITH_CUDA
 testVol2col<paddle::platform::CUDADeviceContext,
-paddle::platform::GPUPlace>();
+paddle::platform::CUDAPlace>();
 #endif // PADDLE_WITH_CUDA
 }
@@ -36,7 +36,7 @@ class MultiplexGPUKernel : public framework::OpKernel<T> {
 CopyFrom(*ids, platform::CPUPlace(), ctx.device_context(), &index_t_cpu);
 auto* index = index_t_cpu.data<int32_t>();
 auto stream = ctx.cuda_device_context().stream();
-platform::GPUPlace place = boost::get<platform::GPUPlace>(ctx.GetPlace());
+platform::CUDAPlace place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
 for (auto i = 0; i < rows; i++) {
 int32_t k = index[i];
 PADDLE_ENFORCE_GE(k, 0, "index must be nonnegative.");
@@ -73,7 +73,7 @@ class MultiplexGradGPUKernel : public framework::OpKernel<T> {
 auto* index = index_t_cpu.data<int32_t>();
 auto stream = ctx.cuda_device_context().stream();
-platform::GPUPlace place = boost::get<platform::GPUPlace>(ctx.GetPlace());
+platform::CUDAPlace place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
 for (auto i = 0; i < rows; i++) {
 size_t k = static_cast<size_t>(index[i]);
 if (d_ins[k]) {
...
@@ -67,7 +67,7 @@ class NCCLAllReduceKernel : public framework::OpKernel<T> {
 auto stream = ctx.cuda_device_context().stream();
 // device id
-int gpu_id = boost::get<platform::GPUPlace>(ctx.GetPlace()).GetDeviceId();
+int gpu_id = boost::get<platform::CUDAPlace>(ctx.GetPlace()).GetDeviceId();
 int idx = comm->GetCommId(gpu_id);
 for (size_t i = 0; i < ins.size(); ++i) {
@@ -120,7 +120,7 @@ class NCCLReduceKernel : public framework::OpKernel<T> {
 ctx.device_context())
 .stream();
 // device id
-int gpu_id = boost::get<platform::GPUPlace>(ctx.GetPlace()).GetDeviceId();
+int gpu_id = boost::get<platform::CUDAPlace>(ctx.GetPlace()).GetDeviceId();
 int idx = comm->GetCommId(gpu_id);
 auto ins_names = ctx.Inputs("X");
@@ -164,7 +164,7 @@ class NCCLBcastKernel : public framework::OpKernel<T> {
 ctx.device_context())
 .stream();
 // device id
-int gpu_id = boost::get<platform::GPUPlace>(ctx.GetPlace()).GetDeviceId();
+int gpu_id = boost::get<platform::CUDAPlace>(ctx.GetPlace()).GetDeviceId();
 int idx = comm->GetCommId(gpu_id);
 if (idx == root) {
...
@@ -52,7 +52,7 @@ class NCCLTester : public ::testing::Test {
 virtual void SetUp() override {
 paddle::platform::CPUPlace cpu_place;
 for (size_t i = 0; i < gpu_list.size(); ++i) {
-p::GPUPlace place(i);
+p::CUDAPlace place(i);
 dev_ctxs.emplace_back(new p::CUDADeviceContext(place));
 }
@@ -87,7 +87,7 @@ class NCCLTester : public ::testing::Test {
 std::unique_lock<std::mutex> lk(mu);
 const f::OpDesc *op1 = &op_desc;
-p::GPUPlace place(gpu_id);
+p::CUDAPlace place(gpu_id);
 auto &ctx = dev_ctxs.at(gpu_id);
 auto *send_tensor = scope->Var("st")->GetMutable<f::LoDTensor>();
@@ -171,7 +171,7 @@ TEST_F(NCCLTester, ncclAllReduceOp) {
 for (size_t i = 0; i < dev_scopes.size(); ++i) {
 p::CPUPlace cpu_place;
-p::GPUPlace gpu_place(gpu_list[i]);
+p::CUDAPlace gpu_place(gpu_list[i]);
 auto &recv_tensor = dev_scopes[i]->FindVar("rt")->Get<f::LoDTensor>();
 auto *rt = recv_tensor.data<float>();
@@ -180,7 +180,7 @@ TEST_F(NCCLTester, ncclAllReduceOp) {
 auto *ct = result_tensor->mutable_data<float>(cpu_place);
 paddle::memory::Copy(
-cpu_place, ct, p::GPUPlace(gpu_list[i]), rt,
+cpu_place, ct, p::CUDAPlace(gpu_list[i]), rt,
 recv_tensor.numel() * sizeof(float),
 static_cast<p::CUDADeviceContext *>(dev_ctxs[i])->stream());
@@ -219,7 +219,7 @@ TEST_F(NCCLTester, ncclReduceOp) {
 float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0);
 p::CPUPlace cpu_place;
-p::GPUPlace gpu_place(gpu_list[kRoot]);
+p::CUDAPlace gpu_place(gpu_list[kRoot]);
 auto &recv_tensor = dev_scopes[kRoot]->FindVar("rt")->Get<f::LoDTensor>();
 auto *rt = recv_tensor.data<float>();
@@ -229,7 +229,7 @@ TEST_F(NCCLTester, ncclReduceOp) {
 auto *ct = result_tensor->mutable_data<float>(cpu_place);
 paddle::memory::Copy(
-cpu_place, ct, p::GPUPlace(gpu_list[kRoot]), rt,
+cpu_place, ct, p::CUDAPlace(gpu_list[kRoot]), rt,
 recv_tensor.numel() * sizeof(float),
 static_cast<p::CUDADeviceContext *>(dev_ctxs[kRoot])->stream());
@@ -268,7 +268,7 @@ TEST_F(NCCLTester, ncclBcastOp) {
 float result = kRoot;
 p::CPUPlace cpu_place;
-p::GPUPlace gpu_place(gpu_list[idx]);
+p::CUDAPlace gpu_place(gpu_list[idx]);
 auto &recv_tensor = dev_scopes[idx]->FindVar("rt")->Get<f::LoDTensor>();
 auto *rt = recv_tensor.data<float>();
@@ -277,7 +277,7 @@ TEST_F(NCCLTester, ncclBcastOp) {
 auto *ct = result_tensor->mutable_data<float>(cpu_place);
 paddle::memory::Copy(
-cpu_place, ct, p::GPUPlace(gpu_list[idx]), rt,
+cpu_place, ct, p::CUDAPlace(gpu_list[idx]), rt,
 recv_tensor.numel() * sizeof(float),
 static_cast<p::CUDADeviceContext *>(dev_ctxs[idx])->stream());
@@ -300,7 +300,7 @@ int main(int argc, char **argv) {
 places.emplace_back(paddle::platform::CPUPlace());
 int count = paddle::platform::GetCUDADeviceCount();
 for (int i = 0; i < count; ++i) {
-places.emplace_back(paddle::platform::GPUPlace(i));
+places.emplace_back(paddle::platform::CUDAPlace(i));
 gpu_list.emplace_back(i);
 }
...
@@ -29,7 +29,7 @@ class PoolCudnnOpKernel : public framework::OpKernel<T> {
 public:
 void Compute(const framework::ExecutionContext &ctx) const override {
 PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-"It must use GPUPlace.");
+"It must use CUDAPlace.");
 const Tensor *input = ctx.Input<Tensor>("X");
 Tensor *output = ctx.Output<Tensor>("Out");
@@ -90,7 +90,7 @@ class PoolCudnnGradOpKernel : public framework::OpKernel<T> {
 public:
 void Compute(const framework::ExecutionContext &ctx) const override {
 PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-"It must use GPUPlace.");
+"It must use CUDAPlace.");
 const Tensor *input = ctx.Input<Tensor>("X");
 const Tensor *output = ctx.Input<Tensor>("Out");
...
@@ -16,7 +16,7 @@
 REGISTER_OP_CUDA_KERNEL(
 reshape,
-paddle::operators::ReshapeKernel<paddle::platform::GPUPlace, float>);
+paddle::operators::ReshapeKernel<paddle::platform::CUDAPlace, float>);
 REGISTER_OP_CUDA_KERNEL(
 reshape_grad,
-paddle::operators::ReshapeGradKernel<paddle::platform::GPUPlace, float>);
+paddle::operators::ReshapeGradKernel<paddle::platform::CUDAPlace, float>);
...@@ -82,7 +82,7 @@ TEST(StridedMemcpy, GPUCrop) { ...@@ -82,7 +82,7 @@ TEST(StridedMemcpy, GPUCrop) {
}; };
// clang-format on // clang-format on
platform::GPUPlace gpu0(0); platform::CUDAPlace gpu0(0);
platform::CPUPlace cpu; platform::CPUPlace cpu;
platform::CUDADeviceContext ctx(gpu0); platform::CUDADeviceContext ctx(gpu0);
...@@ -121,7 +121,7 @@ TEST(StridedMemcpy, GPUConcat) { ...@@ -121,7 +121,7 @@ TEST(StridedMemcpy, GPUConcat) {
}; };
// clang-format on // clang-format on
platform::GPUPlace gpu0(0); platform::CUDAPlace gpu0(0);
platform::CPUPlace cpu; platform::CPUPlace cpu;
platform::CUDADeviceContext ctx(gpu0); platform::CUDADeviceContext ctx(gpu0);
......
...@@ -283,7 +283,7 @@ class TopkOpCUDAKernel : public framework::OpKernel<T> { ...@@ -283,7 +283,7 @@ class TopkOpCUDAKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"It must use GPUPlace."); "It must use CUDAPlace.");
auto* input = ctx.Input<Tensor>("X"); auto* input = ctx.Input<Tensor>("X");
auto* output = ctx.Output<Tensor>("Out"); auto* output = ctx.Output<Tensor>("Out");
auto* indices = ctx.Output<Tensor>("Indices"); auto* indices = ctx.Output<Tensor>("Indices");
......
...@@ -58,10 +58,10 @@ DeviceContextPool::DeviceContextPool( ...@@ -58,10 +58,10 @@ DeviceContextPool::DeviceContextPool(
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
device_contexts_.emplace(places[i], device_contexts_.emplace(places[i],
new platform::CUDADeviceContext( new platform::CUDADeviceContext(
boost::get<platform::GPUPlace>(places[i]))); boost::get<platform::CUDAPlace>(places[i])));
#else #else
PADDLE_THROW( PADDLE_THROW(
"'GPUPlace' is not supported, Please re-compile with WITH_GPU " "'CUDAPlace' is not supported, Please re-compile with WITH_GPU "
"option"); "option");
#endif #endif
} }
...@@ -91,7 +91,7 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface { ...@@ -91,7 +91,7 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
} }
~EigenCudaStreamDevice() override {} ~EigenCudaStreamDevice() override {}
void Reinitialize(const cudaStream_t* cuda_stream, GPUPlace place) { void Reinitialize(const cudaStream_t* cuda_stream, CUDAPlace place) {
stream_ = cuda_stream; stream_ = cuda_stream;
place_ = place; place_ = place;
device_prop_ = &Eigen::m_deviceProperties[place.device]; device_prop_ = &Eigen::m_deviceProperties[place.device];
...@@ -130,14 +130,14 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface { ...@@ -130,14 +130,14 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
} }
private: private:
GPUPlace place_; CUDAPlace place_;
const cudaStream_t* stream_; // not owned; const cudaStream_t* stream_; // not owned;
const cudaDeviceProp* device_prop_; // not owned; const cudaDeviceProp* device_prop_; // not owned;
mutable void* scratch_; mutable void* scratch_;
mutable unsigned int* semaphore_; mutable unsigned int* semaphore_;
}; };
CUDADeviceContext::CUDADeviceContext(GPUPlace place) : place_(place) { CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {
SetDeviceId(place_.device); SetDeviceId(place_.device);
PADDLE_ENFORCE(cudaStreamCreate(&stream_)); PADDLE_ENFORCE(cudaStreamCreate(&stream_));
eigen_stream_.reset(new EigenCudaStreamDevice()); eigen_stream_.reset(new EigenCudaStreamDevice());
......
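The constructor change above means a per-device context is now built from a CUDAPlace. A condensed sketch of the public surface the tests below exercise (eigen_device(), stream(), Wait()); the header path and the Demo wrapper are assumptions.

#include "paddle/platform/device_context.h"  // assumed header path

// Sketch only: construct a context for device 0 and query its handles.
void Demo() {
  paddle::platform::CUDAPlace place(0);
  paddle::platform::CUDADeviceContext ctx(place);

  Eigen::GpuDevice* eigen_dev = ctx.eigen_device();  // Eigen device bound to ctx's stream
  cudaStream_t stream = ctx.stream();                // raw CUDA stream handle
  ctx.Wait();                                        // drain all work queued on the stream
  (void)eigen_dev;
  (void)stream;
}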
...@@ -58,7 +58,7 @@ class EigenCudaStreamDevice; ...@@ -58,7 +58,7 @@ class EigenCudaStreamDevice;
class CUDADeviceContext : public DeviceContext { class CUDADeviceContext : public DeviceContext {
public: public:
explicit CUDADeviceContext(GPUPlace place); explicit CUDADeviceContext(CUDAPlace place);
virtual ~CUDADeviceContext(); virtual ~CUDADeviceContext();
/*! \brief Wait for all operations completion in the stream. */ /*! \brief Wait for all operations completion in the stream. */
...@@ -80,7 +80,7 @@ class CUDADeviceContext : public DeviceContext { ...@@ -80,7 +80,7 @@ class CUDADeviceContext : public DeviceContext {
cudaStream_t stream() const; cudaStream_t stream() const;
private: private:
GPUPlace place_; CUDAPlace place_;
std::unique_ptr<Eigen::GpuDevice> eigen_device_; std::unique_ptr<Eigen::GpuDevice> eigen_device_;
std::unique_ptr<EigenCudaStreamDevice> eigen_stream_; std::unique_ptr<EigenCudaStreamDevice> eigen_stream_;
...@@ -143,7 +143,7 @@ class DeviceContextPool { ...@@ -143,7 +143,7 @@ class DeviceContextPool {
size_t operator()(const platform::Place& place) const { size_t operator()(const platform::Place& place) const {
int pre_hash = place.which() + (1 << LEFT_SHIFT); int pre_hash = place.which() + (1 << LEFT_SHIFT);
if (platform::is_gpu_place(place)) { if (platform::is_gpu_place(place)) {
pre_hash += boost::get<platform::GPUPlace>(place).GetDeviceId(); pre_hash += boost::get<platform::CUDAPlace>(place).GetDeviceId();
} }
return hash_(pre_hash); return hash_(pre_hash);
} }
......
...@@ -20,11 +20,11 @@ limitations under the License. */ ...@@ -20,11 +20,11 @@ limitations under the License. */
TEST(Device, Init) { TEST(Device, Init) {
using paddle::platform::DeviceContext; using paddle::platform::DeviceContext;
using paddle::platform::CUDADeviceContext; using paddle::platform::CUDADeviceContext;
using paddle::platform::GPUPlace; using paddle::platform::CUDAPlace;
int count = paddle::platform::GetCUDADeviceCount(); int count = paddle::platform::GetCUDADeviceCount();
for (int i = 0; i < count; i++) { for (int i = 0; i < count; i++) {
CUDADeviceContext* device_context = new CUDADeviceContext(GPUPlace(i)); CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i));
Eigen::GpuDevice* gpu_device = device_context->eigen_device(); Eigen::GpuDevice* gpu_device = device_context->eigen_device();
ASSERT_NE(nullptr, gpu_device); ASSERT_NE(nullptr, gpu_device);
delete device_context; delete device_context;
...@@ -33,11 +33,11 @@ TEST(Device, Init) { ...@@ -33,11 +33,11 @@ TEST(Device, Init) {
TEST(Device, CUDADeviceContext) { TEST(Device, CUDADeviceContext) {
using paddle::platform::CUDADeviceContext; using paddle::platform::CUDADeviceContext;
using paddle::platform::GPUPlace; using paddle::platform::CUDAPlace;
int count = paddle::platform::GetCUDADeviceCount(); int count = paddle::platform::GetCUDADeviceCount();
for (int i = 0; i < count; i++) { for (int i = 0; i < count; i++) {
CUDADeviceContext* device_context = new CUDADeviceContext(GPUPlace(i)); CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i));
Eigen::GpuDevice* gpu_device = device_context->eigen_device(); Eigen::GpuDevice* gpu_device = device_context->eigen_device();
ASSERT_NE(nullptr, gpu_device); ASSERT_NE(nullptr, gpu_device);
cudnnHandle_t cudnn_handle = device_context->cudnn_handle(); cudnnHandle_t cudnn_handle = device_context->cudnn_handle();
...@@ -70,7 +70,7 @@ TEST(Device, DeviceContextPool) { ...@@ -70,7 +70,7 @@ TEST(Device, DeviceContextPool) {
using paddle::platform::CUDADeviceContext; using paddle::platform::CUDADeviceContext;
using paddle::platform::Place; using paddle::platform::Place;
using paddle::platform::CPUPlace; using paddle::platform::CPUPlace;
using paddle::platform::GPUPlace; using paddle::platform::CUDAPlace;
DeviceContextPool& pool = DeviceContextPool::Get(); DeviceContextPool& pool = DeviceContextPool::Get();
auto cpu_dev_ctx1 = pool.Borrow(CPUPlace()); auto cpu_dev_ctx1 = pool.Borrow(CPUPlace());
...@@ -80,14 +80,14 @@ TEST(Device, DeviceContextPool) { ...@@ -80,14 +80,14 @@ TEST(Device, DeviceContextPool) {
std::vector<Place> gpu_places; std::vector<Place> gpu_places;
int count = paddle::platform::GetCUDADeviceCount(); int count = paddle::platform::GetCUDADeviceCount();
for (int i = 0; i < count; ++i) { for (int i = 0; i < count; ++i) {
gpu_places.emplace_back(GPUPlace(i)); gpu_places.emplace_back(CUDAPlace(i));
} }
auto dev_ctxs = pool.Borrow(gpu_places); auto dev_ctxs = pool.Borrow(gpu_places);
for (size_t i = 0; i < dev_ctxs.size(); ++i) { for (size_t i = 0; i < dev_ctxs.size(); ++i) {
auto* dev_ctx = static_cast<const CUDADeviceContext*>(dev_ctxs[i]); auto* dev_ctx = static_cast<const CUDADeviceContext*>(dev_ctxs[i]);
// check same as GPUPlace(i) // check same as CUDAPlace(i)
GPUPlace place = boost::get<GPUPlace>(dev_ctx->GetPlace()); CUDAPlace place = boost::get<CUDAPlace>(dev_ctx->GetPlace());
EXPECT_EQ(place.GetDeviceId(), static_cast<int>(i)); EXPECT_EQ(place.GetDeviceId(), static_cast<int>(i));
} }
} }
...@@ -106,7 +106,7 @@ int main(int argc, char** argv) { ...@@ -106,7 +106,7 @@ int main(int argc, char** argv) {
places.emplace_back(paddle::platform::CPUPlace()); places.emplace_back(paddle::platform::CPUPlace());
int count = paddle::platform::GetCUDADeviceCount(); int count = paddle::platform::GetCUDADeviceCount();
for (int i = 0; i < count; ++i) { for (int i = 0; i < count; ++i) {
places.emplace_back(paddle::platform::GPUPlace(i)); places.emplace_back(paddle::platform::CUDAPlace(i));
} }
VLOG(0) << " DeviceCount " << count; VLOG(0) << " DeviceCount " << count;
......
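The pool test above keys contexts by place, so lookups now take a CUDAPlace. A sketch of a single-place lookup, assuming the one-argument Borrow() returns a const DeviceContext* like the vector overload used in the test; the header path and Demo are assumptions.

#include "paddle/platform/device_context.h"  // assumed header path

// Sketch only: fetch the cached context for GPU 0 from the global pool.
void Demo() {
  using paddle::platform::CUDADeviceContext;
  using paddle::platform::CUDAPlace;
  using paddle::platform::DeviceContextPool;

  DeviceContextPool& pool = DeviceContextPool::Get();
  auto* dev_ctx =
      static_cast<const CUDADeviceContext*>(pool.Borrow(CUDAPlace(0)));
  CUDAPlace place = boost::get<CUDAPlace>(dev_ctx->GetPlace());
  (void)place;  // place.GetDeviceId() == 0 for the context keyed by CUDAPlace(0)
}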
...@@ -50,7 +50,7 @@ struct PerThreadData { ...@@ -50,7 +50,7 @@ struct PerThreadData {
T* RecvBuff() { return thrust::raw_pointer_cast(recv_buff.data()); } T* RecvBuff() { return thrust::raw_pointer_cast(recv_buff.data()); }
PerThreadData(int gpu_id, size_t size) : dev_ctx(GPUPlace(gpu_id)) { PerThreadData(int gpu_id, size_t size) : dev_ctx(CUDAPlace(gpu_id)) {
send_buff.resize(size); send_buff.resize(size);
for (size_t i = 0; i < size; ++i) { for (size_t i = 0; i < size; ++i) {
send_buff[i] = static_cast<T>(i); send_buff[i] = static_cast<T>(i);
...@@ -140,7 +140,7 @@ int main(int argc, char** argv) { ...@@ -140,7 +140,7 @@ int main(int argc, char** argv) {
places.emplace_back(paddle::platform::CPUPlace()); places.emplace_back(paddle::platform::CPUPlace());
int count = paddle::platform::GetCUDADeviceCount(); int count = paddle::platform::GetCUDADeviceCount();
for (int i = 0; i < count; ++i) { for (int i = 0; i < count; ++i) {
places.emplace_back(paddle::platform::GPUPlace(i)); places.emplace_back(paddle::platform::CUDAPlace(i));
} }
VLOG(0) << " DeviceCount " << count; VLOG(0) << " DeviceCount " << count;
......
...@@ -24,7 +24,9 @@ class PlacePrinter : public boost::static_visitor<> { ...@@ -24,7 +24,9 @@ class PlacePrinter : public boost::static_visitor<> {
explicit PlacePrinter(std::ostream &os) : os_(os) {} explicit PlacePrinter(std::ostream &os) : os_(os) {}
void operator()(const CPUPlace &) { os_ << "CPUPlace"; } void operator()(const CPUPlace &) { os_ << "CPUPlace"; }
void operator()(const MKLDNNPlace &) { os_ << "MKLDNNPlace"; } void operator()(const MKLDNNPlace &) { os_ << "MKLDNNPlace"; }
void operator()(const GPUPlace &p) { os_ << "GPUPlace(" << p.device << ")"; } void operator()(const CUDAPlace &p) {
os_ << "CUDAPlace(" << p.device << ")";
}
private: private:
std::ostream &os_; std::ostream &os_;
...@@ -37,12 +39,12 @@ static Place the_default_place; ...@@ -37,12 +39,12 @@ static Place the_default_place;
void set_place(const Place &place) { the_default_place = place; } void set_place(const Place &place) { the_default_place = place; }
const Place &get_place() { return the_default_place; } const Place &get_place() { return the_default_place; }
const GPUPlace default_gpu() { return GPUPlace(0); } const CUDAPlace default_gpu() { return CUDAPlace(0); }
const CPUPlace default_cpu() { return CPUPlace(); } const CPUPlace default_cpu() { return CPUPlace(); }
const MKLDNNPlace default_mkldnn() { return MKLDNNPlace(); } const MKLDNNPlace default_mkldnn() { return MKLDNNPlace(); }
bool is_gpu_place(const Place &p) { bool is_gpu_place(const Place &p) {
return boost::apply_visitor(IsGPUPlace(), p); return boost::apply_visitor(IsCUDAPlace(), p);
} }
bool is_cpu_place(const Place &p) { bool is_cpu_place(const Place &p) {
return !is_gpu_place(p) && !is_mkldnn_place(p); return !is_gpu_place(p) && !is_mkldnn_place(p);
......
...@@ -39,43 +39,45 @@ struct MKLDNNPlace { ...@@ -39,43 +39,45 @@ struct MKLDNNPlace {
inline bool operator!=(const MKLDNNPlace &) const { return false; } inline bool operator!=(const MKLDNNPlace &) const { return false; }
}; };
struct GPUPlace { struct CUDAPlace {
GPUPlace() : GPUPlace(0) {} CUDAPlace() : CUDAPlace(0) {}
explicit GPUPlace(int d) : device(d) {} explicit CUDAPlace(int d) : device(d) {}
inline int GetDeviceId() const { return device; } inline int GetDeviceId() const { return device; }
// needed for variant equality comparison // needed for variant equality comparison
inline bool operator==(const GPUPlace &o) const { return device == o.device; } inline bool operator==(const CUDAPlace &o) const {
inline bool operator!=(const GPUPlace &o) const { return !(*this == o); } return device == o.device;
}
inline bool operator!=(const CUDAPlace &o) const { return !(*this == o); }
int device; int device;
}; };
struct CUDNNPlace : public GPUPlace { struct CUDNNPlace : public CUDAPlace {
CUDNNPlace() : GPUPlace() {} CUDNNPlace() : CUDAPlace() {}
explicit CUDNNPlace(int d) : GPUPlace(d) {} explicit CUDNNPlace(int d) : CUDAPlace(d) {}
}; };
struct IsGPUPlace : public boost::static_visitor<bool> { struct IsCUDAPlace : public boost::static_visitor<bool> {
bool operator()(const CPUPlace &) const { return false; } bool operator()(const CPUPlace &) const { return false; }
bool operator()(const MKLDNNPlace &) const { return false; } bool operator()(const MKLDNNPlace &) const { return false; }
bool operator()(const GPUPlace &gpu) const { return true; } bool operator()(const CUDAPlace &gpu) const { return true; }
bool operator()(const CUDNNPlace &) const { return true; } bool operator()(const CUDNNPlace &) const { return true; }
}; };
struct IsMKLDNNPlace : public boost::static_visitor<bool> { struct IsMKLDNNPlace : public boost::static_visitor<bool> {
bool operator()(const MKLDNNPlace &) const { return true; } bool operator()(const MKLDNNPlace &) const { return true; }
bool operator()(const CPUPlace &) const { return false; } bool operator()(const CPUPlace &) const { return false; }
bool operator()(const GPUPlace &) const { return false; } bool operator()(const CUDAPlace &) const { return false; }
bool operator()(const CUDNNPlace &) const { return false; } bool operator()(const CUDNNPlace &) const { return false; }
}; };
typedef boost::variant<CUDNNPlace, GPUPlace, CPUPlace, MKLDNNPlace> Place; typedef boost::variant<CUDNNPlace, CUDAPlace, CPUPlace, MKLDNNPlace> Place;
void set_place(const Place &); void set_place(const Place &);
const Place &get_place(); const Place &get_place();
const GPUPlace default_gpu(); const CUDAPlace default_gpu();
const CPUPlace default_cpu(); const CPUPlace default_cpu();
const MKLDNNPlace default_mkldnn(); const MKLDNNPlace default_mkldnn();
......
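With the rename, the Place variant and its visitors are used as below; a sketch assuming only the types declared in the hunk above (the header path and Demo are illustrative).

#include "paddle/platform/place.h"  // assumed header path

// Sketch only: store a CUDAPlace in the variant and dispatch on it.
void Demo() {
  paddle::platform::Place place = paddle::platform::CUDAPlace(1);

  if (paddle::platform::is_gpu_place(place)) {  // applies the IsCUDAPlace visitor
    int dev = boost::get<paddle::platform::CUDAPlace>(place).GetDeviceId();
    (void)dev;  // dev == 1 here
  }
}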
...@@ -4,7 +4,7 @@ ...@@ -4,7 +4,7 @@
TEST(Place, Equality) { TEST(Place, Equality) {
paddle::platform::CPUPlace cpu; paddle::platform::CPUPlace cpu;
paddle::platform::GPUPlace g0(0), g1(1), gg0(0); paddle::platform::CUDAPlace g0(0), g1(1), gg0(0);
paddle::platform::CUDNNPlace d0(0), d1(1), dd0(0); paddle::platform::CUDNNPlace d0(0), d1(1), dd0(0);
EXPECT_EQ(cpu, cpu); EXPECT_EQ(cpu, cpu);
...@@ -41,8 +41,8 @@ TEST(Place, Default) { ...@@ -41,8 +41,8 @@ TEST(Place, Default) {
TEST(Place, Print) { TEST(Place, Print) {
{ {
std::stringstream ss; std::stringstream ss;
ss << paddle::platform::GPUPlace(1); ss << paddle::platform::CUDAPlace(1);
EXPECT_EQ("GPUPlace(1)", ss.str()); EXPECT_EQ("CUDAPlace(1)", ss.str());
} }
{ {
std::stringstream ss; std::stringstream ss;
......
...@@ -49,7 +49,7 @@ TEST(Transform, CPUUnary) { ...@@ -49,7 +49,7 @@ TEST(Transform, CPUUnary) {
TEST(Transform, GPUUnary) { TEST(Transform, GPUUnary) {
using namespace paddle::platform; using namespace paddle::platform;
using namespace paddle::memory; using namespace paddle::memory;
GPUPlace gpu0(0); CUDAPlace gpu0(0);
CUDADeviceContext ctx(gpu0); CUDADeviceContext ctx(gpu0);
float cpu_buf[4] = {0.1, 0.2, 0.3, 0.4}; float cpu_buf[4] = {0.1, 0.2, 0.3, 0.4};
float* gpu_buf = static_cast<float*>(Alloc(gpu0, sizeof(float) * 4)); float* gpu_buf = static_cast<float*>(Alloc(gpu0, sizeof(float) * 4));
...@@ -80,7 +80,7 @@ TEST(Transform, GPUBinary) { ...@@ -80,7 +80,7 @@ TEST(Transform, GPUBinary) {
using namespace paddle::platform; using namespace paddle::platform;
using namespace paddle::memory; using namespace paddle::memory;
int buf[4] = {1, 2, 3, 4}; int buf[4] = {1, 2, 3, 4};
GPUPlace gpu0(0); CUDAPlace gpu0(0);
CUDADeviceContext ctx(gpu0); CUDADeviceContext ctx(gpu0);
int* gpu_buf = static_cast<int*>(Alloc(gpu0, sizeof(buf))); int* gpu_buf = static_cast<int*>(Alloc(gpu0, sizeof(buf)));
Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf), ctx.stream()); Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf), ctx.stream());
......
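The transform tests above allocate device memory directly against a CUDAPlace. A sketch of that allocate/upload/release round trip; Alloc and Copy appear in the hunk, while Free taking the same place argument, the header paths, and Demo are assumptions.

#include "paddle/memory/memcpy.h"            // assumed header path
#include "paddle/memory/memory.h"            // assumed header path
#include "paddle/platform/device_context.h"  // assumed header path

// Sketch only: allocate on device 0, upload a host buffer, then release it.
void Demo() {
  using namespace paddle::platform;
  using namespace paddle::memory;

  CUDAPlace gpu0(0);
  CUDADeviceContext ctx(gpu0);

  float host_buf[4] = {0.1f, 0.2f, 0.3f, 0.4f};
  float* gpu_buf = static_cast<float*>(Alloc(gpu0, sizeof(host_buf)));
  Copy(gpu0, gpu_buf, CPUPlace(), host_buf, sizeof(host_buf), ctx.stream());
  ctx.Wait();  // uploads are asynchronous on ctx's stream
  Free(gpu0, gpu_buf);
}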
...@@ -79,7 +79,7 @@ PYBIND11_PLUGIN(core) { ...@@ -79,7 +79,7 @@ PYBIND11_PLUGIN(core) {
self.Resize(make_ddim(dim)); self.Resize(make_ddim(dim));
}) })
.def("alloc_float", .def("alloc_float",
[](Tensor &self, paddle::platform::GPUPlace &place) { [](Tensor &self, paddle::platform::CUDAPlace &place) {
self.mutable_data<float>(place); self.mutable_data<float>(place);
}) })
.def("alloc_float", .def("alloc_float",
...@@ -91,7 +91,7 @@ PYBIND11_PLUGIN(core) { ...@@ -91,7 +91,7 @@ PYBIND11_PLUGIN(core) {
self.mutable_data<int>(place); self.mutable_data<int>(place);
}) })
.def("alloc_int", .def("alloc_int",
[](Tensor &self, paddle::platform::GPUPlace &place) { [](Tensor &self, paddle::platform::CUDAPlace &place) {
self.mutable_data<int>(place); self.mutable_data<int>(place);
}) })
.def("set", PyCPUTensorSetFromArray<float>) .def("set", PyCPUTensorSetFromArray<float>)
...@@ -310,10 +310,10 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -310,10 +310,10 @@ All parameter, weight, gradient are variables in Paddle.
return new paddle::platform::CPUDeviceContext(); return new paddle::platform::CPUDeviceContext();
}) })
.def_static("create", .def_static("create",
[](paddle::platform::GPUPlace& place) [](paddle::platform::CUDAPlace& place)
-> paddle::platform::DeviceContext* { -> paddle::platform::DeviceContext* {
#ifndef PADDLE_WITH_CUDA #ifndef PADDLE_WITH_CUDA
PADDLE_THROW("GPUPlace is not supported in CPU device."); PADDLE_THROW("CUDAPlace is not supported in CPU device.");
#else #else
return new paddle::platform::CUDADeviceContext(place); return new paddle::platform::CUDADeviceContext(place);
#endif #endif
...@@ -323,9 +323,9 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -323,9 +323,9 @@ All parameter, weight, gradient are variables in Paddle.
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
py::class_<platform::Communicator>(m, "Communicator").def(py::init<>()); py::class_<platform::Communicator>(m, "Communicator").def(py::init<>());
#endif #endif
py::class_<platform::GPUPlace>(m, "GPUPlace") py::class_<platform::CUDAPlace>(m, "CUDAPlace")
.def(py::init<int>()) .def(py::init<int>())
.def("__str__", string::to_string<const platform::GPUPlace &>); .def("__str__", string::to_string<const platform::CUDAPlace &>);
py::class_<paddle::platform::CPUPlace>(m, "CPUPlace") py::class_<paddle::platform::CPUPlace>(m, "CPUPlace")
.def(py::init<>()) .def(py::init<>())
...@@ -338,7 +338,7 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -338,7 +338,7 @@ All parameter, weight, gradient are variables in Paddle.
self = cpu_place; self = cpu_place;
}) })
.def("set_place", .def("set_place",
[](platform::Place &self, const platform::GPUPlace &gpu_place) { [](platform::Place &self, const platform::CUDAPlace &gpu_place) {
self = gpu_place; self = gpu_place;
}); });
...@@ -363,7 +363,7 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -363,7 +363,7 @@ All parameter, weight, gradient are variables in Paddle.
const platform::CPUPlace &place) { self.Run(scope, place); }) const platform::CPUPlace &place) { self.Run(scope, place); })
.def("run", .def("run",
[](OperatorBase &self, const Scope &scope, [](OperatorBase &self, const Scope &scope,
const platform::GPUPlace &place) { self.Run(scope, place); }) const platform::CUDAPlace &place) { self.Run(scope, place); })
.def("type", .def("type",
[](const OperatorBase &op) -> std::string { return op.Type(); }) [](const OperatorBase &op) -> std::string { return op.Type(); })
.def("outputs", .def("outputs",
......
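The binding hunks above are what make the rename visible from Python: core.GPUPlace becomes core.CUDAPlace in every fluid test below. A standalone sketch of an equivalent pybind11 binding, not Paddle's actual module; the module name place_demo, the lambda __str__, and the header path are assumptions.

#include <pybind11/pybind11.h>
#include <sstream>
#include "paddle/platform/place.h"  // assumed header path

namespace py = pybind11;

// Sketch only: expose the renamed place so Python can write CUDAPlace(0).
PYBIND11_MODULE(place_demo, m) {
  py::class_<paddle::platform::CUDAPlace>(m, "CUDAPlace")
      .def(py::init<int>())
      .def("__str__", [](const paddle::platform::CUDAPlace& p) {
        std::ostringstream os;
        os << p;          // streams via the PlacePrinter visitor above
        return os.str();  // "CUDAPlace(<device>)"
      });
}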
...@@ -71,7 +71,7 @@ struct CastToPyBufferImpl<true, I, ARGS...> { ...@@ -71,7 +71,7 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
dst_ptr, src_ptr, sizeof(CUR_TYPE) * tensor.numel(), dst_ptr, src_ptr, sizeof(CUR_TYPE) * tensor.numel(),
cudaMemcpyDeviceToHost, dev_ctx->stream()); cudaMemcpyDeviceToHost, dev_ctx->stream());
#else #else
PADDLE_THROW("'GPUPlace' is not supported in CPU only device."); PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
#endif #endif
} else if (paddle::platform::is_cpu_place(tensor.place())) { } else if (paddle::platform::is_cpu_place(tensor.place())) {
dst_tensor = tensor; dst_tensor = tensor;
...@@ -127,7 +127,7 @@ template <typename T> ...@@ -127,7 +127,7 @@ template <typename T>
void PyCUDATensorSetFromArray( void PyCUDATensorSetFromArray(
framework::Tensor &self, framework::Tensor &self,
py::array_t<T, py::array::c_style | py::array::forcecast> array, py::array_t<T, py::array::c_style | py::array::forcecast> array,
paddle::platform::GPUPlace &place) { paddle::platform::CUDAPlace &place) {
std::vector<int64_t> dims; std::vector<int64_t> dims;
dims.reserve(array.ndim()); dims.reserve(array.ndim());
for (size_t i = 0; i < array.ndim(); ++i) { for (size_t i = 0; i < array.ndim(); ++i) {
......
...@@ -36,7 +36,7 @@ int main(int argc, char** argv) { ...@@ -36,7 +36,7 @@ int main(int argc, char** argv) {
paddle::memory::Used(paddle::platform::CPUPlace()); paddle::memory::Used(paddle::platform::CPUPlace());
std::vector<std::string> devs = {"CPU"}; std::vector<std::string> devs = {"CPU"};
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
paddle::memory::Used(paddle::platform::GPUPlace(0)); paddle::memory::Used(paddle::platform::CUDAPlace(0));
devs.push_back("GPU:0"); devs.push_back("GPU:0");
#endif #endif
paddle::framework::InitDevices(devs); paddle::framework::InitDevices(devs);
......
...@@ -15,14 +15,14 @@ import backward ...@@ -15,14 +15,14 @@ import backward
import regularizer import regularizer
from param_attr import ParamAttr from param_attr import ParamAttr
from data_feeder import DataFeeder from data_feeder import DataFeeder
from core import LoDTensor, CPUPlace, GPUPlace from core import LoDTensor, CPUPlace, CUDAPlace
from distribute_transpiler import DistributeTranspiler from distribute_transpiler import DistributeTranspiler
import clip import clip
Tensor = LoDTensor Tensor = LoDTensor
__all__ = framework.__all__ + executor.__all__ + [ __all__ = framework.__all__ + executor.__all__ + [
'io', 'initializer', 'layers', 'nets', 'optimizer', 'backward', 'io', 'initializer', 'layers', 'nets', 'optimizer', 'backward',
'regularizer', 'LoDTensor', 'CPUPlace', 'GPUPlace', 'Tensor', 'ParamAttr' 'regularizer', 'LoDTensor', 'CPUPlace', 'CUDAPlace', 'Tensor', 'ParamAttr'
'DataFeeder', 'clip', 'DistributeTranspiler' 'DataFeeder', 'clip', 'DistributeTranspiler'
] ]
......
...@@ -47,7 +47,7 @@ class Executor(object): ...@@ -47,7 +47,7 @@ class Executor(object):
act_places.append(p) act_places.append(p)
# TODO(dzhwinter) : consider that our fluid tests all written in # TODO(dzhwinter) : consider that our fluid tests all written in
# GPUPlace(gpu_id), this will be changed in the future # CUDAPlace(gpu_id), this will be changed in the future
if core.is_compile_gpu(): if core.is_compile_gpu():
core.init_devices(["CPU", "GPU:0"]) core.init_devices(["CPU", "GPU:0"])
else: else:
......
...@@ -142,7 +142,7 @@ def main(): ...@@ -142,7 +142,7 @@ def main():
opts = sgd_optimizer.minimize(cost) opts = sgd_optimizer.minimize(cost)
if USE_GPU: if USE_GPU:
place = core.GPUPlace(0) place = core.CUDAPlace(0)
else: else:
place = core.CPUPlace() place = core.CPUPlace()
......
...@@ -316,7 +316,7 @@ class OpTest(unittest.TestCase): ...@@ -316,7 +316,7 @@ class OpTest(unittest.TestCase):
def check_output(self, atol=1e-5): def check_output(self, atol=1e-5):
places = [core.CPUPlace()] places = [core.CPUPlace()]
if core.is_compile_gpu() and core.op_support_gpu(self.op_type): if core.is_compile_gpu() and core.op_support_gpu(self.op_type):
places.append(core.GPUPlace(0)) places.append(core.CUDAPlace(0))
for place in places: for place in places:
self.check_output_with_place(place, atol) self.check_output_with_place(place, atol)
...@@ -379,7 +379,7 @@ class OpTest(unittest.TestCase): ...@@ -379,7 +379,7 @@ class OpTest(unittest.TestCase):
"Gradient Check On %s" % str(cpu_place)) "Gradient Check On %s" % str(cpu_place))
if core.is_compile_gpu() and self.op.support_gpu(): if core.is_compile_gpu() and self.op.support_gpu():
gpu_place = core.GPUPlace(0) gpu_place = core.CUDAPlace(0)
gpu_analytic_grads = self._get_gradient(inputs_to_check, gpu_place, gpu_analytic_grads = self._get_gradient(inputs_to_check, gpu_place,
output_names, no_grad_set) output_names, no_grad_set)
......
...@@ -167,7 +167,7 @@ class TestSparseAdagradOp(unittest.TestCase): ...@@ -167,7 +167,7 @@ class TestSparseAdagradOp(unittest.TestCase):
def test_sparse_adagrad(self): def test_sparse_adagrad(self):
places = [core.CPUPlace()] places = [core.CPUPlace()]
if core.is_compile_gpu(): if core.is_compile_gpu():
places.append(core.GPUPlace(0)) places.append(core.CUDAPlace(0))
for place in places: for place in places:
self.check_with_place(place) self.check_with_place(place)
......
...@@ -304,7 +304,7 @@ class TestBatchNormOp(OpTest): ...@@ -304,7 +304,7 @@ class TestBatchNormOp(OpTest):
self.__assert_close(saved_variance_tensor, saved_variance, self.__assert_close(saved_variance_tensor, saved_variance,
"saved_variance") "saved_variance")
self.__assert_close(mean_out_tensor, mean_out, "mean_out") self.__assert_close(mean_out_tensor, mean_out, "mean_out")
if isinstance(place, core.GPUPlace): if isinstance(place, core.CUDAPlace):
atol = 5e-2 atol = 5e-2
else: else:
atol = 1e-4 atol = 1e-4
...@@ -339,7 +339,7 @@ class TestBatchNormOp(OpTest): ...@@ -339,7 +339,7 @@ class TestBatchNormOp(OpTest):
places = [core.CPUPlace()] places = [core.CPUPlace()]
if core.is_compile_gpu() and core.op_support_gpu("batch_norm"): if core.is_compile_gpu() and core.op_support_gpu("batch_norm"):
places.append(core.GPUPlace(0)) places.append(core.CUDAPlace(0))
core.init_devices(["CPU", "GPU:0"]) core.init_devices(["CPU", "GPU:0"])
else: else:
......
...@@ -20,7 +20,7 @@ class TestGaussianRandomOp(unittest.TestCase): ...@@ -20,7 +20,7 @@ class TestGaussianRandomOp(unittest.TestCase):
def test_gpu(self): def test_gpu(self):
if core.is_compile_gpu(): if core.is_compile_gpu():
self.gaussian_random_test(place=fluid.GPUPlace(0)) self.gaussian_random_test(place=fluid.CUDAPlace(0))
def gaussian_random_test(self, place): def gaussian_random_test(self, place):
......
...@@ -15,7 +15,7 @@ class TestProfiler(unittest.TestCase): ...@@ -15,7 +15,7 @@ class TestProfiler(unittest.TestCase):
data = layers.data(name='data', shape=[3, 28, 28], dtype='float32') data = layers.data(name='data', shape=[3, 28, 28], dtype='float32')
conv = layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1]) conv = layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1])
place = fluid.GPUPlace(0) place = fluid.CUDAPlace(0)
exe = fluid.Executor(place) exe = fluid.Executor(place)
exe.run(fluid.default_startup_program()) exe.run(fluid.default_startup_program())
......
...@@ -78,7 +78,7 @@ class TestSparseSGDOp(unittest.TestCase): ...@@ -78,7 +78,7 @@ class TestSparseSGDOp(unittest.TestCase):
def test_sparse_sgd(self): def test_sparse_sgd(self):
places = [core.CPUPlace()] places = [core.CPUPlace()]
if core.is_compile_gpu(): if core.is_compile_gpu():
places.append(core.GPUPlace(0)) places.append(core.CUDAPlace(0))
for place in places: for place in places:
self.check_with_place(place) self.check_with_place(place)
......
...@@ -23,7 +23,7 @@ class TestUniformRandomOp(unittest.TestCase): ...@@ -23,7 +23,7 @@ class TestUniformRandomOp(unittest.TestCase):
def test_gpu(self): def test_gpu(self):
if core.is_compile_gpu(): if core.is_compile_gpu():
self.uniform_random_test(place=core.GPUPlace(0)) self.uniform_random_test(place=core.CUDAPlace(0))
def uniform_random_test(self, place): def uniform_random_test(self, place):
program = fluid.Program() program = fluid.Program()
......