Unverified commit cced930b, authored by Qi Li, committed by GitHub

[ROCM] update fluid operators for rocm (part1), test=develop (#31077)

Parent 99fd9815
@@ -73,7 +73,7 @@ class ConditionalOp : public framework::OperatorBase {
                           ips[0]->numel()));
     bool res = false;
     if (platform::is_gpu_place(ips[0]->place())) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       framework::LoDTensor cpu_tensor;
       framework::TensorCopy(*ips[0], platform::CPUPlace(), &cpu_tensor);
       platform::DeviceContextPool::Instance().Get(ips[0]->place())->Wait();
......
@@ -26,7 +26,7 @@ namespace imperative {
 class OpBase;
 }  // namespace imperative
 }  // namespace paddle
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include "paddle/fluid/platform/gpu_info.h"
 #endif
@@ -34,7 +34,7 @@ namespace paddle {
 namespace operators {
 static size_t CUDADevCount() {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   return platform::GetCUDADeviceCount();
 #else
   return 0UL;
......
@@ -223,7 +223,7 @@ bool GetCondData(const framework::LoDTensor &cond) {
   }
   // when platform::is_gpu_place(cond.place()) is true
   std::unique_ptr<framework::LoDTensor> cpu_cond{new framework::LoDTensor()};
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   framework::TensorCopySync(cond, platform::CPUPlace(), cpu_cond.get());
 #else
   PADDLE_THROW(platform::errors::PreconditionNotMet(
......
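The three hunks above share one pattern: a boolean condition tensor may live on the GPU, so it is synchronously copied to the host before its scalar value is read, and the preprocessor guard now accepts the HIP build alongside the CUDA one. A minimal standalone sketch of that copy-then-read pattern, with a hypothetical ReadGpuFlag helper and raw CUDA runtime calls standing in for framework::TensorCopySync and the device-context Wait():

#include <cuda_runtime.h>

// Hypothetical sketch: read a single device-side bool on the host.
// A cudaMemcpyDeviceToHost copy is synchronous with respect to the host,
// so it plays the role of TensorCopySync plus the explicit Wait() above.
bool ReadGpuFlag(const bool* d_flag) {
  bool h_flag = false;
  cudaMemcpy(&h_flag, d_flag, sizeof(bool), cudaMemcpyDeviceToHost);
  return h_flag;
}

Under ROCm the same shape compiles with hipMemcpy and hipMemcpyDeviceToHost, which is why a single #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) guard covers both backends.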
@@ -40,10 +40,12 @@ detection_library(box_decoder_and_assign_op SRCS box_decoder_and_assign_op.cc bo
 detection_library(sigmoid_focal_loss_op SRCS sigmoid_focal_loss_op.cc sigmoid_focal_loss_op.cu)
 detection_library(retinanet_detection_output_op SRCS retinanet_detection_output_op.cc)
-if(WITH_GPU)
+if(WITH_GPU OR WITH_ROCM)
   set(TMPDEPS memory)
-  if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
-    set(TMPDEPS memory cub)
+  if(WITH_GPU)
+    if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
+      set(TMPDEPS memory cub)
+    endif()
   endif()
 endif()
 detection_library(generate_proposals_op SRCS generate_proposals_op.cc generate_proposals_op.cu DEPS ${TMPDEPS})
 detection_library(generate_proposals_v2_op SRCS generate_proposals_v2_op.cc generate_proposals_v2_op.cu DEPS ${TMPDEPS})
......
@@ -16,10 +16,16 @@ limitations under the License. */
 #include <cfloat>
 #include <string>
 #include <vector>
+#ifdef __NVCC__
 #include "cub/cub.cuh"
+#include "paddle/fluid/platform/cudnn_helper.h"
+#endif
+#ifdef __HIPCC__
+#include <hipcub/hipcub.hpp>
+#include "paddle/fluid/platform/miopen_helper.h"
+#endif
 #include "paddle/fluid/operators/gather.cu.h"
 #include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/platform/cudnn_helper.h"
 #include "paddle/fluid/platform/for_range.h"

 namespace paddle {
@@ -58,16 +64,27 @@ static void SortDescending(const platform::CUDADeviceContext &ctx,
   // Determine temporary device storage requirements
   size_t temp_storage_bytes = 0;
+#ifdef PADDLE_WITH_HIP
+  hipcub::DeviceRadixSort::SortPairsDescending<T, int>(
+      nullptr, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out, num);
+#else
   cub::DeviceRadixSort::SortPairsDescending<T, int>(
       nullptr, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out, num);
+#endif
   // Allocate temporary storage
   auto place = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace());
   auto d_temp_storage = memory::Alloc(place, temp_storage_bytes);
   // Run sorting operation
+#ifdef PADDLE_WITH_HIP
+  hipcub::DeviceRadixSort::SortPairsDescending<T, int>(
+      d_temp_storage->ptr(), temp_storage_bytes, keys_in, keys_out, idx_in,
+      idx_out, num);
+#else
   cub::DeviceRadixSort::SortPairsDescending<T, int>(
       d_temp_storage->ptr(), temp_storage_bytes, keys_in, keys_out, idx_in,
       idx_out, num);
+#endif
 }

 template <typename T>
......
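CUB's DeviceRadixSort (and its hipCUB port) uses a two-phase calling convention, which is why every sort in this patch appears twice: a first call with a null workspace pointer only reports the required temporary-storage size, and a second, otherwise identical call performs the sort. A self-contained CUDA sketch of that convention, with raw cudaMalloc standing in for Paddle's memory::Alloc:

#include <cub/cub.cuh>
#include <cuda_runtime.h>

// Sketch of the two-phase cub::DeviceRadixSort convention used in
// SortDescending above; under ROCm the same shape works with
// hipcub::DeviceRadixSort and hipMalloc.
void SortScoresDescending(const float* keys_in, float* keys_out,
                          const int* idx_in, int* idx_out, int num) {
  // Phase 1: a nullptr workspace makes CUB report the bytes it needs.
  size_t temp_storage_bytes = 0;
  cub::DeviceRadixSort::SortPairsDescending(
      nullptr, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out, num);

  // Allocate the scratch buffer (Paddle uses memory::Alloc instead).
  void* d_temp_storage = nullptr;
  cudaMalloc(&d_temp_storage, temp_storage_bytes);

  // Phase 2: the same call with a real buffer performs the sort.
  cub::DeviceRadixSort::SortPairsDescending(
      d_temp_storage, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out,
      num);
  cudaFree(d_temp_storage);
}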
@@ -9,8 +9,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include <paddle/fluid/memory/allocation/allocator.h>
+#ifdef __NVCC__
 #include "cub/cub.cuh"
+#endif
+#ifdef __HIPCC__
+#include <hipcub/hipcub.hpp>
+#endif
+#include <paddle/fluid/memory/allocation/allocator.h>
 #include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/memory/memcpy.h"
@@ -135,17 +141,29 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel<T> {
     // Determine temporary device storage requirements
     size_t temp_storage_bytes = 0;
+#ifdef PADDLE_WITH_HIP
+    hipcub::DeviceRadixSort::SortPairsDescending<T, int>(
+        nullptr, temp_storage_bytes, concat_scores.data<T>(), keys_out, idx_in,
+        idx_out, total_roi_num);
+#else
     cub::DeviceRadixSort::SortPairsDescending<T, int>(
         nullptr, temp_storage_bytes, concat_scores.data<T>(), keys_out, idx_in,
         idx_out, total_roi_num);
+#endif
     // Allocate temporary storage
     auto d_temp_storage = memory::Alloc(place, temp_storage_bytes);
     // Run sorting operation
     // sort score to get corresponding index
+#ifdef PADDLE_WITH_HIP
+    hipcub::DeviceRadixSort::SortPairsDescending<T, int>(
+        d_temp_storage->ptr(), temp_storage_bytes, concat_scores.data<T>(),
+        keys_out, idx_in, idx_out, total_roi_num);
+#else
     cub::DeviceRadixSort::SortPairsDescending<T, int>(
         d_temp_storage->ptr(), temp_storage_bytes, concat_scores.data<T>(),
         keys_out, idx_in, idx_out, total_roi_num);
+#endif
     index_out_t.Resize({real_post_num});
     Tensor sorted_rois;
     sorted_rois.mutable_data<T>({real_post_num, kBBoxSize}, dev_ctx.GetPlace());
@@ -167,17 +185,29 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel<T> {
     out_id_t.mutable_data<int>({real_post_num}, dev_ctx.GetPlace());
     // Determine temporary device storage requirements
     temp_storage_bytes = 0;
+#ifdef PADDLE_WITH_HIP
+    hipcub::DeviceRadixSort::SortPairs<int, int>(
+        nullptr, temp_storage_bytes, sorted_batch_id.data<int>(), out_id_data,
+        batch_idx_in, index_out_t.data<int>(), real_post_num);
+#else
     cub::DeviceRadixSort::SortPairs<int, int>(
         nullptr, temp_storage_bytes, sorted_batch_id.data<int>(), out_id_data,
         batch_idx_in, index_out_t.data<int>(), real_post_num);
+#endif
     // Allocate temporary storage
     d_temp_storage = memory::Alloc(place, temp_storage_bytes);
     // Run sorting operation
     // sort batch_id to get corresponding index
+#ifdef PADDLE_WITH_HIP
+    hipcub::DeviceRadixSort::SortPairs<int, int>(
+        d_temp_storage->ptr(), temp_storage_bytes, sorted_batch_id.data<int>(),
+        out_id_data, batch_idx_in, index_out_t.data<int>(), real_post_num);
+#else
     cub::DeviceRadixSort::SortPairs<int, int>(
         d_temp_storage->ptr(), temp_storage_bytes, sorted_batch_id.data<int>(),
         out_id_data, batch_idx_in, index_out_t.data<int>(), real_post_num);
+#endif
     GPUGather<T>(dev_ctx, sorted_rois, index_out_t, fpn_rois);
......
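Every cub call in these kernels is duplicated under #ifdef PADDLE_WITH_HIP, since hipCUB deliberately mirrors CUB's interface. A hypothetical alternative, not used by this patch, is a single namespace alias so call sites need no per-backend guards:

// Hypothetical refactoring: alias the backend namespace once.
#ifdef PADDLE_WITH_HIP
#include <hipcub/hipcub.hpp>
namespace gpuprim = hipcub;
#else
#include "cub/cub.cuh"
namespace gpuprim = cub;
#endif

// Each call site is then written once:
//   gpuprim::DeviceRadixSort::SortPairs(nullptr, temp_storage_bytes,
//                                       keys_in, keys_out, vals_in,
//                                       vals_out, num);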
@@ -12,8 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include <paddle/fluid/memory/allocation/allocator.h>
+#ifdef __NVCC__
 #include "cub/cub.cuh"
+#endif
+#ifdef __HIPCC__
+#include <hipcub/hipcub.hpp>
+#endif
+#include <paddle/fluid/memory/allocation/allocator.h>
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/operators/detection/bbox_util.h"
 #include "paddle/fluid/operators/detection/distribute_fpn_proposals_op.h"
@@ -143,24 +149,42 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
     // Determine temporary device storage requirements
     size_t temp_storage_bytes = 0;
+#ifdef PADDLE_WITH_HIP
+    hipcub::DeviceRadixSort::SortPairs<int, int>(nullptr, temp_storage_bytes,
+                                                 target_lvls_data, keys_out,
+                                                 idx_in, idx_out, roi_num);
+#else
     cub::DeviceRadixSort::SortPairs<int, int>(nullptr, temp_storage_bytes,
                                               target_lvls_data, keys_out,
                                               idx_in, idx_out, roi_num);
+#endif
     // Allocate temporary storage
     auto d_temp_storage = memory::Alloc(place, temp_storage_bytes);
     // Run sorting operation
     // sort target level to get corresponding index
+#ifdef PADDLE_WITH_HIP
+    hipcub::DeviceRadixSort::SortPairs<int, int>(
+        d_temp_storage->ptr(), temp_storage_bytes, target_lvls_data, keys_out,
+        idx_in, idx_out, roi_num);
+#else
     cub::DeviceRadixSort::SortPairs<int, int>(
         d_temp_storage->ptr(), temp_storage_bytes, target_lvls_data, keys_out,
        idx_in, idx_out, roi_num);
+#endif
     int* restore_idx_data =
         restore_index->mutable_data<int>({roi_num, 1}, dev_ctx.GetPlace());
     // sort current index to get restore index
+#ifdef PADDLE_WITH_HIP
+    hipcub::DeviceRadixSort::SortPairs<int, int>(
+        d_temp_storage->ptr(), temp_storage_bytes, idx_out, keys_out, idx_in,
+        restore_idx_data, roi_num);
+#else
     cub::DeviceRadixSort::SortPairs<int, int>(
         d_temp_storage->ptr(), temp_storage_bytes, idx_out, keys_out, idx_in,
         restore_idx_data, roi_num);
+#endif
     int start = 0;
     auto multi_rois_num = ctx.MultiOutput<Tensor>("MultiLevelRoIsNum");
......
@@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "cub/cub.cuh"
 #include "paddle/fluid/operators/detection/sigmoid_focal_loss_op.h"
 #include "paddle/fluid/operators/math.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
......
@@ -107,7 +107,7 @@ class TargetAssignKernel : public framework::OpKernel<T> {
     int64_t k = x->dims()[2];
     auto x_lod = x->lod().back();
-#if defined(PADDLE_WITH_CUDA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     size_t* x_lod_data = x_lod.MutableData(ctx.GetPlace());
 #else
     size_t* x_lod_data = x_lod.data();
@@ -129,7 +129,7 @@ class TargetAssignKernel : public framework::OpKernel<T> {
         "TargetAssignOp input(NegIndices) needs 1 level of LoD"));
     const int* neg_idx_data = neg_indices->data<int>();
     auto neg_lod = neg_indices->lod().back();
-#if defined(PADDLE_WITH_CUDA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     size_t* neg_lod_data = neg_lod.MutableData(ctx.GetPlace());
 #else
     size_t* neg_lod_data = neg_lod.data();
......
@@ -61,7 +61,7 @@ cc_library(parameter_send SRCS parameter_send.cc DEPS sendrecvop_rpc memory)
 cc_library(parameter_recv SRCS parameter_recv.cc DEPS sendrecvop_rpc memory)
 cc_library(communicator SRCS communicator.cc DEPS scope selected_rows tensor variable_helper selected_rows_functor simple_threadpool parameter_send parameter_recv generator)
 cc_test(communicator_test SRCS communicator_test.cc DEPS communicator)
-if(WITH_GPU)
+if(WITH_GPU OR WITH_ROCM)
   cc_test(collective_server_test SRCS collective_server_test.cc
           DEPS sendrecvop_rpc executor ${RPC_DEPS}
           selected_rows_functor scope math_function)
......
@@ -15,6 +15,9 @@ limitations under the License. */
 #ifdef PADDLE_WITH_NCCL
 #include <nccl.h>
 #endif
+#ifdef PADDLE_WITH_RCCL
+#include <rccl.h>
+#endif
 #include <sys/time.h>
 #include <limits>
 #include <memory>
@@ -144,7 +147,7 @@ void SerializeToIOBuf(const std::string& name, framework::Variable* var,
   } else if (var->IsType<framework::SelectedRows>()) {
     request->set_type(::sendrecv::SELECTED_ROWS);
     payload.reset(new TensorPayload(GetSelectedRowsPayload(var, ctx, request)));
-#ifdef PADDLE_WITH_NCCL
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
   } else if (var->IsType<ncclUniqueId>()) {
     request->set_type(::sendrecv::NCCL_ID);
     const ncclUniqueId& uid = var->Get<ncclUniqueId>();
@@ -172,7 +175,7 @@ void SerializeToIOBuf(const std::string& name, framework::Variable* var,
         static_cast<const char*>(payload->ptr()), payload->memory_size());
   } else {
     if (platform::is_gpu_place(ctx.GetPlace())) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       IOBufWriter::AppendZeroCopy(
           name, iobuf, ::sendrecv::VariableMessage::kSerializedFieldNumber,
           static_cast<const char*>(payload->ptr()), payload->memory_size(),
......
@@ -159,7 +159,7 @@ void RunTestLodTensor(platform::Place place) {
 TEST(LodTensor, Run) {
   platform::CPUPlace place;
   RunTestLodTensor(place);
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   platform::CUDAPlace gpu(0);
   RunTestLodTensor(gpu);
 #endif
@@ -168,7 +168,7 @@ TEST(LodTensor, Run) {
 TEST(SelectedRows, Run) {
   platform::CPUPlace place;
   RunSerdeTestSelectedRows(place);
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   platform::CUDAPlace gpu;
   RunSerdeTestSelectedRows(gpu);
 #endif
......
@@ -15,6 +15,9 @@ limitations under the License. */
 #ifdef PADDLE_WITH_NCCL
 #include <nccl.h>
 #endif
+#ifdef PADDLE_WITH_RCCL
+#include <rccl.h>
+#endif
 #include <limits>
 #include <memory>
 #include "grpcpp/impl/codegen/byte_buffer.h"
@@ -75,7 +78,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
   } else if (var->IsType<framework::SelectedRows>()) {
     request.set_type(::sendrecv::SELECTED_ROWS);
     payload = new TensorPayload(GetSelectedRowsPayload(var, ctx, &request));
-#ifdef PADDLE_WITH_NCCL
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
   } else if (var->IsType<ncclUniqueId>()) {
     request.set_type(::sendrecv::NCCL_ID);
 #endif
@@ -91,7 +94,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
   e.WriteRawBytes(std::string(header.data(), header.size()));
   // NCCLID is copied directly to the message, return bytebuffer
   // with only one slice if serializing NCCLID.
-#ifdef PADDLE_WITH_NCCL
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
   if (var->IsType<ncclUniqueId>()) {
     e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber,
                               NCCL_UNIQUE_ID_BYTES);
......
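These serialization hunks rely on RCCL being an API-compatible port of NCCL: once <rccl.h> is included, the same ncclUniqueId type and NCCL_UNIQUE_ID_BYTES constant are available, so the serialization logic itself is unchanged. A minimal sketch under that assumption, with a hypothetical SerializeUniqueId helper:

#if defined(PADDLE_WITH_NCCL)
#include <nccl.h>
#elif defined(PADDLE_WITH_RCCL)
#include <rccl.h>  // RCCL exposes the same ncclUniqueId / NCCL_* symbols
#endif

#include <cstring>
#include <vector>

#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
// Hypothetical helper: copy a communicator id into a byte buffer, the core
// of what SerializeToByteBuffer does for sendrecv::NCCL_ID messages.
std::vector<char> SerializeUniqueId(const ncclUniqueId& uid) {
  std::vector<char> buf(NCCL_UNIQUE_ID_BYTES);
  std::memcpy(buf.data(), uid.internal, NCCL_UNIQUE_ID_BYTES);
  return buf;
}
#endif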
@@ -206,7 +206,7 @@ TEST(LodTensor, Run) {
   platform::CPUPlace place;
   RunTestLodTensor(place);
   RunTestLodTensor(place, 1);
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   platform::CUDAPlace gpu(0);
   RunTestLodTensor(gpu);
   RunTestLodTensor(gpu, 1);
@@ -217,7 +217,7 @@ TEST(SelectedRows, Run) {
   platform::CPUPlace place;
   RunSerdeTestSelectedRows(place);
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   platform::CUDAPlace gpu;
   RunSerdeTestSelectedRows(gpu);
 #endif
......
@@ -281,7 +281,7 @@ void prefetchs(const std::vector<std::string> &id_var_names,
       }
     }
   } else {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     std::vector<float> ids_value_vec(ids_size * vec_dim_1);
     for (auto idx = 0; idx < static_cast<int>(ids_size); idx++) {
       const auto &id = ids[idx];
......
@@ -39,7 +39,7 @@ using VarMsg = sendrecv::VariableMessage;
 static TensorPayload GetCommunicationAllocationFromTensor(
     const platform::DeviceContext& ctx, const framework::Tensor& tensor) {
   if (is_gpu_place(ctx.GetPlace())) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     PADDLE_ENFORCE_EQ(
         is_gpu_place(tensor.place()), true,
         platform::errors::PreconditionNotMet("Please run in gpu place."));
......
@@ -33,7 +33,7 @@ bool VariableResponse::ReadRaw(::google::protobuf::io::CodedInputStream* input,
   int total_written = 0;
   if (platform::is_gpu_place(place)) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     auto& gpu_dev_ctx =
         static_cast<const platform::CUDADeviceContext&>(dev_ctx);
     platform::CPUPlace cpu;
@@ -62,7 +62,7 @@ bool VariableResponse::ReadRaw(::google::protobuf::io::CodedInputStream* input,
     gpu_dev_ctx.Wait();
 #else
     PADDLE_THROW(platform::errors::PreconditionNotMet(
-        "Unexpected branch, please compile with PADDLE_WITH_CUDA"));
+        "Unexpected branch, please compile with WITH_GPU or WITH_ROCM"));
 #endif
     return true;
   } else if (platform::is_xpu_place(place)) {
@@ -221,7 +221,7 @@ bool VariableResponse::ProcSerializedField(
       platform::errors::PreconditionNotMet("meta info should be got first!"));
   if (meta_.type() == sendrecv::NCCL_ID) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     auto* var = scope_->FindVar(meta_.varname());
     if (var != nullptr) {
       ncclUniqueId* id = var->GetMutable<ncclUniqueId>();
......
@@ -43,8 +43,19 @@ __global__ void AccuracyCudaKernel(const int N, const int D,
   total[threadIdx.x] = count;
   __syncthreads();
   // reduce the count with init value 0, and output accuracy.
+#ifdef PADDLE_WITH_CUDA
   int result = thrust::reduce(thrust::device, total, total + BlockSize, 0);
+#else
+  // HIP's thrust::reduce does not support __device__ execution
+  for (int s = BlockSize / 2; s > 0; s >>= 1) {
+    if (threadIdx.x < s) {
+      total[threadIdx.x] += total[threadIdx.x + s];
+    }
+    __syncthreads();
+  }
+  int result = total[0];
+#endif
   if (threadIdx.x == 0) {
     *correct_data = result;
     *accuracy = static_cast<float>(result) / static_cast<float>(N);
......
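The #else branch above replaces the device-side thrust::reduce with a classic shared-memory tree reduction: each round halves the number of active threads, and __syncthreads() keeps the rounds in lockstep. A standalone CUDA sketch of the same reduction, with a hypothetical kernel name and the same power-of-two block-size assumption the patch makes:

#include <cuda_runtime.h>

constexpr int kBlockSize = 256;  // must be a power of two for the halving loop

// Hypothetical standalone version of the tree reduction in the HIP branch:
// sums one int per thread within a single block.
// Launch as: BlockSumKernel<<<1, kBlockSize>>>(d_in, d_out);
__global__ void BlockSumKernel(const int* in, int* out) {
  __shared__ int total[kBlockSize];
  total[threadIdx.x] = in[threadIdx.x];
  __syncthreads();
  // Each round folds the upper half of the array onto the lower half.
  for (int s = kBlockSize / 2; s > 0; s >>= 1) {
    if (threadIdx.x < s) {
      total[threadIdx.x] += total[threadIdx.x + s];
    }
    __syncthreads();  // every thread must finish a round before the next
  }
  if (threadIdx.x == 0) *out = total[0];
}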
@@ -130,6 +130,7 @@ class AucCUDAKernel : public framework::OpKernel<T> {
     auto *pos_in_data = stat_pos_in_tensor->data<int64_t>();
     auto *stat_neg_in_tensor = ctx.Input<Tensor>("StatNeg");
     auto *neg_in_data = stat_neg_in_tensor->data<int64_t>();
+#ifdef PADDLE_WITH_CUDA
     if (stat_pos_in_tensor != stat_pos) {
       cudaMemcpy(origin_stat_pos, pos_in_data,
                  ((1 + slide_steps) * (num_thresholds + 1) +
@@ -144,6 +145,22 @@ class AucCUDAKernel : public framework::OpKernel<T> {
                  sizeof(int64_t),
                  cudaMemcpyDeviceToDevice);
     }
+#else
+    if (stat_pos_in_tensor != stat_pos) {
+      hipMemcpy(origin_stat_pos, pos_in_data,
+                ((1 + slide_steps) * (num_thresholds + 1) +
+                 (slide_steps > 0 ? 1 : 0)) *
+                    sizeof(int64_t),
+                hipMemcpyDeviceToDevice);
+    }
+    if (stat_neg_in_tensor != stat_neg) {
+      hipMemcpy(origin_stat_neg, neg_in_data,
+                ((1 + slide_steps) * (num_thresholds + 1) +
+                 (slide_steps > 0 ? 1 : 0)) *
+                    sizeof(int64_t),
+                hipMemcpyDeviceToDevice);
+    }
+#endif
     statAuc(ctx, label, predict, num_thresholds, slide_steps, origin_stat_pos,
             origin_stat_neg);
......
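The CUDA and HIP branches above are identical except for the runtime call. A hedged sketch of how such duplication could be folded into one helper (a hypothetical GpuMemcpyD2D wrapper, not part of the patch; Paddle's own memory::Copy utilities serve the same purpose):

#include <cstddef>

#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#else
#include <cuda_runtime.h>
#endif

// Hypothetical device-to-device copy wrapper: keeps call sites free of
// per-backend #ifdefs.
inline void GpuMemcpyD2D(void* dst, const void* src, size_t bytes) {
#ifdef PADDLE_WITH_HIP
  hipMemcpy(dst, src, bytes, hipMemcpyDeviceToDevice);
#else
  cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToDevice);
#endif
}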