未验证 提交 798670bb 编写于 作者: D danleifeng 提交者: GitHub
上级 1149a378
...@@ -241,3 +241,6 @@ endif() ...@@ -241,3 +241,6 @@ endif()
if(WITH_CUSTOM_DEVICE AND NOT WIN32) if(WITH_CUSTOM_DEVICE AND NOT WIN32)
add_definitions(-DPADDLE_WITH_CUSTOM_DEVICE) add_definitions(-DPADDLE_WITH_CUSTOM_DEVICE)
endif() endif()
if(WITH_GPU_GRAPH)
add_definitions(-DPADDLE_WITH_GPU_GRAPH)
endif()
...@@ -144,10 +144,8 @@ int32_t GraphBrpcService::add_graph_node(Table *table, ...@@ -144,10 +144,8 @@ int32_t GraphBrpcService::add_graph_node(Table *table,
int idx_ = *(int *)(request.params(0).c_str()); int idx_ = *(int *)(request.params(0).c_str());
size_t node_num = request.params(1).size() / sizeof(int64_t); size_t node_num = request.params(1).size() / sizeof(int64_t);
int64_t *node_data = (int64_t *)(request.params(1).c_str()); uint64_t *node_data = (uint64_t *)(request.params(1).c_str());
// size_t node_num = request.params(0).size() / sizeof(int64_t); std::vector<uint64_t> node_ids(node_data, node_data + node_num);
// int64_t *node_data = (int64_t *)(request.params(0).c_str());
std::vector<int64_t> node_ids(node_data, node_data + node_num);
std::vector<bool> is_weighted_list; std::vector<bool> is_weighted_list;
if (request.params_size() == 3) { if (request.params_size() == 3) {
size_t weight_list_size = request.params(2).size() / sizeof(bool); size_t weight_list_size = request.params(2).size() / sizeof(bool);
...@@ -179,11 +177,9 @@ int32_t GraphBrpcService::remove_graph_node(Table *table, ...@@ -179,11 +177,9 @@ int32_t GraphBrpcService::remove_graph_node(Table *table,
return 0; return 0;
} }
int idx_ = *(int *)(request.params(0).c_str()); int idx_ = *(int *)(request.params(0).c_str());
size_t node_num = request.params(1).size() / sizeof(int64_t); size_t node_num = request.params(1).size() / sizeof(uint64_t);
int64_t *node_data = (int64_t *)(request.params(1).c_str()); uint64_t *node_data = (uint64_t *)(request.params(1).c_str());
// size_t node_num = request.params(0).size() / sizeof(int64_t); std::vector<uint64_t> node_ids(node_data, node_data + node_num);
// int64_t *node_data = (int64_t *)(request.params(0).c_str());
std::vector<int64_t> node_ids(node_data, node_data + node_num);
((GraphTable *)table)->remove_graph_node(idx_, node_ids); ((GraphTable *)table)->remove_graph_node(idx_, node_ids);
return 0; return 0;
...@@ -217,11 +213,6 @@ int32_t GraphBrpcService::Initialize() { ...@@ -217,11 +213,6 @@ int32_t GraphBrpcService::Initialize() {
&GraphBrpcService::graph_set_node_feat; &GraphBrpcService::graph_set_node_feat;
_service_handler_map[PS_GRAPH_SAMPLE_NODES_FROM_ONE_SERVER] = _service_handler_map[PS_GRAPH_SAMPLE_NODES_FROM_ONE_SERVER] =
&GraphBrpcService::sample_neighbors_across_multi_servers; &GraphBrpcService::sample_neighbors_across_multi_servers;
// _service_handler_map[PS_GRAPH_USE_NEIGHBORS_SAMPLE_CACHE] =
// &GraphBrpcService::use_neighbors_sample_cache;
// _service_handler_map[PS_GRAPH_LOAD_GRAPH_SPLIT_CONFIG] =
// &GraphBrpcService::load_graph_split_config;
// shard初始化,server启动后才可从env获取到server_list的shard信息
InitializeShardInfo(); InitializeShardInfo();
return 0; return 0;
...@@ -389,9 +380,6 @@ int32_t GraphBrpcService::pull_graph_list(Table *table, ...@@ -389,9 +380,6 @@ int32_t GraphBrpcService::pull_graph_list(Table *table,
int start = *(int *)(request.params(2).c_str()); int start = *(int *)(request.params(2).c_str());
int size = *(int *)(request.params(3).c_str()); int size = *(int *)(request.params(3).c_str());
int step = *(int *)(request.params(4).c_str()); int step = *(int *)(request.params(4).c_str());
// int start = *(int *)(request.params(0).c_str());
// int size = *(int *)(request.params(1).c_str());
// int step = *(int *)(request.params(2).c_str());
std::unique_ptr<char[]> buffer; std::unique_ptr<char[]> buffer;
int actual_size; int actual_size;
((GraphTable *)table) ((GraphTable *)table)
...@@ -414,14 +402,10 @@ int32_t GraphBrpcService::graph_random_sample_neighbors( ...@@ -414,14 +402,10 @@ int32_t GraphBrpcService::graph_random_sample_neighbors(
return 0; return 0;
} }
int idx_ = *(int *)(request.params(0).c_str()); int idx_ = *(int *)(request.params(0).c_str());
size_t node_num = request.params(1).size() / sizeof(int64_t); size_t node_num = request.params(1).size() / sizeof(uint64_t);
int64_t *node_data = (int64_t *)(request.params(1).c_str()); uint64_t *node_data = (uint64_t *)(request.params(1).c_str());
int sample_size = *(int64_t *)(request.params(2).c_str()); int sample_size = *(int *)(request.params(2).c_str());
bool need_weight = *(bool *)(request.params(3).c_str()); bool need_weight = *(bool *)(request.params(3).c_str());
// size_t node_num = request.params(0).size() / sizeof(int64_t);
// int64_t *node_data = (int64_t *)(request.params(0).c_str());
// int sample_size = *(int64_t *)(request.params(1).c_str());
// bool need_weight = *(bool *)(request.params(2).c_str());
std::vector<std::shared_ptr<char>> buffers(node_num); std::vector<std::shared_ptr<char>> buffers(node_num);
std::vector<int> actual_sizes(node_num, 0); std::vector<int> actual_sizes(node_num, 0);
((GraphTable *)table) ((GraphTable *)table)
...@@ -443,7 +427,7 @@ int32_t GraphBrpcService::graph_random_sample_nodes( ...@@ -443,7 +427,7 @@ int32_t GraphBrpcService::graph_random_sample_nodes(
brpc::Controller *cntl) { brpc::Controller *cntl) {
int type_id = *(int *)(request.params(0).c_str()); int type_id = *(int *)(request.params(0).c_str());
int idx_ = *(int *)(request.params(1).c_str()); int idx_ = *(int *)(request.params(1).c_str());
size_t size = *(int64_t *)(request.params(2).c_str()); size_t size = *(uint64_t *)(request.params(2).c_str());
// size_t size = *(int64_t *)(request.params(0).c_str()); // size_t size = *(int64_t *)(request.params(0).c_str());
std::unique_ptr<char[]> buffer; std::unique_ptr<char[]> buffer;
int actual_size; int actual_size;
...@@ -470,11 +454,9 @@ int32_t GraphBrpcService::graph_get_node_feat(Table *table, ...@@ -470,11 +454,9 @@ int32_t GraphBrpcService::graph_get_node_feat(Table *table,
return 0; return 0;
} }
int idx_ = *(int *)(request.params(0).c_str()); int idx_ = *(int *)(request.params(0).c_str());
size_t node_num = request.params(1).size() / sizeof(int64_t); size_t node_num = request.params(1).size() / sizeof(uint64_t);
int64_t *node_data = (int64_t *)(request.params(1).c_str()); uint64_t *node_data = (uint64_t *)(request.params(1).c_str());
// size_t node_num = request.params(0).size() / sizeof(int64_t); std::vector<uint64_t> node_ids(node_data, node_data + node_num);
// int64_t *node_data = (int64_t *)(request.params(0).c_str());
std::vector<int64_t> node_ids(node_data, node_data + node_num);
std::vector<std::string> feature_names = std::vector<std::string> feature_names =
paddle::string::split_string<std::string>(request.params(2), "\t"); paddle::string::split_string<std::string>(request.params(2), "\t");
...@@ -511,21 +493,14 @@ int32_t GraphBrpcService::sample_neighbors_across_multi_servers( ...@@ -511,21 +493,14 @@ int32_t GraphBrpcService::sample_neighbors_across_multi_servers(
} }
int idx_ = *(int *)(request.params(0).c_str()); int idx_ = *(int *)(request.params(0).c_str());
size_t node_num = request.params(1).size() / sizeof(int64_t); size_t node_num = request.params(1).size() / sizeof(uint64_t);
int64_t *node_data = (int64_t *)(request.params(1).c_str()); uint64_t *node_data = (uint64_t *)(request.params(1).c_str());
int sample_size = *(int64_t *)(request.params(2).c_str()); int sample_size = *(int *)(request.params(2).c_str());
bool need_weight = *(int64_t *)(request.params(3).c_str()); bool need_weight = *(bool *)(request.params(3).c_str());
// size_t node_num = request.params(0).size() / sizeof(int64_t),
// size_of_size_t = sizeof(size_t);
// int64_t *node_data = (int64_t *)(request.params(0).c_str());
// int sample_size = *(int64_t *)(request.params(1).c_str());
// bool need_weight = *(int64_t *)(request.params(2).c_str());
// std::vector<int64_t> res = ((GraphTable
// *)table).filter_out_non_exist_nodes(node_data, sample_size);
std::vector<int> request2server; std::vector<int> request2server;
std::vector<int> server2request(server_size, -1); std::vector<int> server2request(server_size, -1);
std::vector<int64_t> local_id; std::vector<uint64_t> local_id;
std::vector<int> local_query_idx; std::vector<int> local_query_idx;
size_t rank = GetRank(); size_t rank = GetRank();
for (size_t query_idx = 0; query_idx < node_num; ++query_idx) { for (size_t query_idx = 0; query_idx < node_num; ++query_idx) {
...@@ -548,7 +523,7 @@ int32_t GraphBrpcService::sample_neighbors_across_multi_servers( ...@@ -548,7 +523,7 @@ int32_t GraphBrpcService::sample_neighbors_across_multi_servers(
std::vector<std::shared_ptr<char>> local_buffers; std::vector<std::shared_ptr<char>> local_buffers;
std::vector<int> local_actual_sizes; std::vector<int> local_actual_sizes;
std::vector<size_t> seq; std::vector<size_t> seq;
std::vector<std::vector<int64_t>> node_id_buckets(request_call_num); std::vector<std::vector<uint64_t>> node_id_buckets(request_call_num);
std::vector<std::vector<int>> query_idx_buckets(request_call_num); std::vector<std::vector<int>> query_idx_buckets(request_call_num);
for (size_t query_idx = 0; query_idx < node_num; ++query_idx) { for (size_t query_idx = 0; query_idx < node_num; ++query_idx) {
int server_index = int server_index =
...@@ -639,7 +614,7 @@ int32_t GraphBrpcService::sample_neighbors_across_multi_servers( ...@@ -639,7 +614,7 @@ int32_t GraphBrpcService::sample_neighbors_across_multi_servers(
closure->request(request_idx) closure->request(request_idx)
->add_params((char *)node_id_buckets[request_idx].data(), ->add_params((char *)node_id_buckets[request_idx].data(),
sizeof(int64_t) * node_num); sizeof(uint64_t) * node_num);
closure->request(request_idx) closure->request(request_idx)
->add_params((char *)&sample_size, sizeof(int)); ->add_params((char *)&sample_size, sizeof(int));
closure->request(request_idx) closure->request(request_idx)
...@@ -682,11 +657,9 @@ int32_t GraphBrpcService::graph_set_node_feat(Table *table, ...@@ -682,11 +657,9 @@ int32_t GraphBrpcService::graph_set_node_feat(Table *table,
} }
int idx_ = *(int *)(request.params(0).c_str()); int idx_ = *(int *)(request.params(0).c_str());
// size_t node_num = request.params(0).size() / sizeof(int64_t); size_t node_num = request.params(1).size() / sizeof(uint64_t);
// int64_t *node_data = (int64_t *)(request.params(0).c_str()); uint64_t *node_data = (uint64_t *)(request.params(1).c_str());
size_t node_num = request.params(1).size() / sizeof(int64_t); std::vector<uint64_t> node_ids(node_data, node_data + node_num);
int64_t *node_data = (int64_t *)(request.params(1).c_str());
std::vector<int64_t> node_ids(node_data, node_data + node_num);
// std::vector<std::string> feature_names = // std::vector<std::string> feature_names =
// paddle::string::split_string<std::string>(request.params(1), "\t"); // paddle::string::split_string<std::string>(request.params(1), "\t");
......
...@@ -18,7 +18,7 @@ set_source_files_properties( ...@@ -18,7 +18,7 @@ set_source_files_properties(
cc_library( cc_library(
graph_node graph_node
SRCS ${graphDir}/graph_node.cc SRCS ${graphDir}/graph_node.cc
DEPS WeightedSampler) DEPS WeightedSampler enforce)
set_source_files_properties( set_source_files_properties(
memory_dense_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) memory_dense_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties( set_source_files_properties(
......
...@@ -21,12 +21,17 @@ ...@@ -21,12 +21,17 @@
#include <set> #include <set>
#include <sstream> #include <sstream>
#include "gflags/gflags.h"
#include "paddle/fluid/distributed/common/utils.h" #include "paddle/fluid/distributed/common/utils.h"
#include "paddle/fluid/distributed/ps/table/graph/graph_node.h" #include "paddle/fluid/distributed/ps/table/graph/graph_node.h"
#include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/generator.h"
#include "paddle/fluid/framework/io/fs.h"
#include "paddle/fluid/platform/timer.h"
#include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/printf.h"
#include "paddle/fluid/string/string_helper.h" #include "paddle/fluid/string/string_helper.h"
DECLARE_bool(graph_load_in_parallel);
namespace paddle { namespace paddle {
namespace distributed { namespace distributed {
...@@ -47,34 +52,125 @@ int32_t GraphTable::Load_to_ssd(const std::string &path, ...@@ -47,34 +52,125 @@ int32_t GraphTable::Load_to_ssd(const std::string &path,
return 0; return 0;
} }
paddle::framework::GpuPsCommGraph GraphTable::make_gpu_ps_graph( paddle::framework::GpuPsCommGraphFea GraphTable::make_gpu_ps_graph_fea(
int idx, std::vector<int64_t> ids) { std::vector<uint64_t> &node_ids, int slot_num) {
std::vector<std::vector<int64_t>> bags(task_pool_size_); std::vector<std::vector<uint64_t>> bags(task_pool_size_);
for (auto x : ids) { for (int i = 0; i < task_pool_size_; i++) {
auto predsize = node_ids.size() / task_pool_size_;
bags[i].reserve(predsize * 1.2);
}
for (auto x : node_ids) {
int location = x % shard_num % task_pool_size_; int location = x % shard_num % task_pool_size_;
bags[location].push_back(x); bags[location].push_back(x);
} }
std::vector<std::future<int>> tasks; std::vector<std::future<int>> tasks;
std::vector<int64_t> edge_array[task_pool_size_]; std::vector<uint64_t> feature_array[task_pool_size_];
std::vector<paddle::framework::GpuPsGraphNode> node_array[task_pool_size_]; std::vector<uint8_t> slot_id_array[task_pool_size_];
std::vector<uint64_t> node_id_array[task_pool_size_];
std::vector<paddle::framework::GpuPsFeaInfo>
node_fea_info_array[task_pool_size_];
for (size_t i = 0; i < bags.size(); i++) { for (size_t i = 0; i < bags.size(); i++) {
if (bags[i].size() > 0) { if (bags[i].size() > 0) {
tasks.push_back(_shards_task_pool[i]->enqueue([&, i, this]() -> int { tasks.push_back(_shards_task_pool[i]->enqueue([&, i, this]() -> int {
paddle::framework::GpuPsGraphNode x; uint64_t node_id;
paddle::framework::GpuPsFeaInfo x;
std::vector<uint64_t> feature_ids;
for (size_t j = 0; j < bags[i].size(); j++) { for (size_t j = 0; j < bags[i].size(); j++) {
Node *v = find_node(0, idx, bags[i][j]); // TODO use FEATURE_TABLE instead
x.node_id = bags[i][j]; Node *v = find_node(1, bags[i][j]);
node_id = bags[i][j];
if (v == NULL) { if (v == NULL) {
x.neighbor_size = 0; x.feature_size = 0;
x.neighbor_offset = 0; x.feature_offset = 0;
node_array[i].push_back(x); node_fea_info_array[i].push_back(x);
} else { } else {
x.neighbor_size = v->get_neighbor_size(); // x <- v
x.neighbor_offset = edge_array[i].size(); x.feature_offset = feature_array[i].size();
node_array[i].push_back(x); int total_feature_size = 0;
for (size_t k = 0; k < x.neighbor_size; k++) { for (int k = 0; k < slot_num; ++k) {
v->get_feature_ids(k, &feature_ids);
total_feature_size += feature_ids.size();
if (!feature_ids.empty()) {
feature_array[i].insert(feature_array[i].end(),
feature_ids.begin(),
feature_ids.end());
slot_id_array[i].insert(
slot_id_array[i].end(), feature_ids.size(), k);
}
}
x.feature_size = total_feature_size;
node_fea_info_array[i].push_back(x);
}
node_id_array[i].push_back(node_id);
}
return 0;
}));
}
}
for (int i = 0; i < (int)tasks.size(); i++) tasks[i].get();
paddle::framework::GpuPsCommGraphFea res;
uint64_t tot_len = 0;
for (int i = 0; i < task_pool_size_; i++) {
tot_len += feature_array[i].size();
}
VLOG(0) << "Loaded feature table on cpu, feature_list_size[" << tot_len
<< "] node_ids_size[" << node_ids.size() << "]";
res.init_on_cpu(tot_len, (unsigned int)node_ids.size(), slot_num);
unsigned int offset = 0, ind = 0;
for (int i = 0; i < task_pool_size_; i++) {
for (int j = 0; j < (int)node_id_array[i].size(); j++) {
res.node_list[ind] = node_id_array[i][j];
res.fea_info_list[ind] = node_fea_info_array[i][j];
res.fea_info_list[ind++].feature_offset += offset;
}
for (size_t j = 0; j < feature_array[i].size(); j++) {
res.feature_list[offset + j] = feature_array[i][j];
res.slot_id_list[offset + j] = slot_id_array[i][j];
}
offset += feature_array[i].size();
}
return res;
}
paddle::framework::GpuPsCommGraph GraphTable::make_gpu_ps_graph(
int idx, std::vector<uint64_t> ids) {
std::vector<std::vector<uint64_t>> bags(task_pool_size_);
for (int i = 0; i < task_pool_size_; i++) {
auto predsize = ids.size() / task_pool_size_;
bags[i].reserve(predsize * 1.2);
}
for (auto x : ids) {
int location = x % shard_num % task_pool_size_;
bags[location].push_back(x);
}
std::vector<std::future<int>> tasks;
std::vector<uint64_t> node_array[task_pool_size_]; // node id list
std::vector<paddle::framework::GpuPsNodeInfo> info_array[task_pool_size_];
std::vector<uint64_t> edge_array[task_pool_size_]; // edge id list
for (size_t i = 0; i < bags.size(); i++) {
if (bags[i].size() > 0) {
tasks.push_back(_shards_task_pool[i]->enqueue([&, i, this]() -> int {
node_array[i].resize(bags[i].size());
info_array[i].resize(bags[i].size());
edge_array[i].reserve(bags[i].size());
for (size_t j = 0; j < bags[i].size(); j++) {
auto node_id = bags[i][j];
node_array[i][j] = node_id;
Node *v = find_node(0, idx, node_id);
if (v != nullptr) {
info_array[i][j].neighbor_offset = edge_array[i].size();
info_array[i][j].neighbor_size = v->get_neighbor_size();
for (size_t k = 0; k < v->get_neighbor_size(); k++) {
edge_array[i].push_back(v->get_neighbor_id(k)); edge_array[i].push_back(v->get_neighbor_id(k));
} }
} else {
info_array[i][j].neighbor_offset = 0;
info_array[i][j].neighbor_size = 0;
} }
} }
return 0; return 0;
...@@ -82,21 +178,20 @@ paddle::framework::GpuPsCommGraph GraphTable::make_gpu_ps_graph( ...@@ -82,21 +178,20 @@ paddle::framework::GpuPsCommGraph GraphTable::make_gpu_ps_graph(
} }
} }
for (int i = 0; i < (int)tasks.size(); i++) tasks[i].get(); for (int i = 0; i < (int)tasks.size(); i++) tasks[i].get();
paddle::framework::GpuPsCommGraph res;
int64_t tot_len = 0; int64_t tot_len = 0;
for (int i = 0; i < task_pool_size_; i++) { for (int i = 0; i < task_pool_size_; i++) {
tot_len += edge_array[i].size(); tot_len += edge_array[i].size();
} }
// res.neighbor_size = tot_len;
// res.node_size = ids.size(); paddle::framework::GpuPsCommGraph res;
// res.neighbor_list = new int64_t[tot_len];
// res.node_list = new paddle::framework::GpuPsGraphNode[ids.size()];
res.init_on_cpu(tot_len, ids.size()); res.init_on_cpu(tot_len, ids.size());
int64_t offset = 0, ind = 0; int64_t offset = 0, ind = 0;
for (int i = 0; i < task_pool_size_; i++) { for (int i = 0; i < task_pool_size_; i++) {
for (int j = 0; j < (int)node_array[i].size(); j++) { for (int j = 0; j < (int)node_array[i].size(); j++) {
res.node_list[ind] = node_array[i][j]; res.node_list[ind] = node_array[i][j];
res.node_list[ind++].neighbor_offset += offset; res.node_info_list[ind] = info_array[i][j];
res.node_info_list[ind++].neighbor_offset += offset;
} }
for (size_t j = 0; j < edge_array[i].size(); j++) { for (size_t j = 0; j < edge_array[i].size(); j++) {
res.neighbor_list[offset + j] = edge_array[i][j]; res.neighbor_list[offset + j] = edge_array[i][j];
...@@ -107,62 +202,41 @@ paddle::framework::GpuPsCommGraph GraphTable::make_gpu_ps_graph( ...@@ -107,62 +202,41 @@ paddle::framework::GpuPsCommGraph GraphTable::make_gpu_ps_graph(
} }
int32_t GraphTable::add_node_to_ssd( int32_t GraphTable::add_node_to_ssd(
int type_id, int idx, int64_t src_id, char *data, int len) { int type_id, int idx, uint64_t src_id, char *data, int len) {
if (_db != NULL) { if (_db != NULL) {
char ch[sizeof(int) * 2 + sizeof(int64_t)]; char ch[sizeof(int) * 2 + sizeof(uint64_t)];
memcpy(ch, &type_id, sizeof(int)); memcpy(ch, &type_id, sizeof(int));
memcpy(ch + sizeof(int), &idx, sizeof(int)); memcpy(ch + sizeof(int), &idx, sizeof(int));
memcpy(ch + sizeof(int) * 2, &src_id, sizeof(int64_t)); memcpy(ch + sizeof(int) * 2, &src_id, sizeof(uint64_t));
std::string str; std::string str;
if (_db->get(src_id % shard_num % task_pool_size_, if (_db->get(src_id % shard_num % task_pool_size_,
ch, ch,
sizeof(int) * 2 + sizeof(int64_t), sizeof(int) * 2 + sizeof(uint64_t),
str) == 0) { str) == 0) {
int64_t *stored_data = ((int64_t *)str.c_str()); uint64_t *stored_data = ((uint64_t *)str.c_str());
int n = str.size() / sizeof(int64_t); int n = str.size() / sizeof(uint64_t);
char *new_data = new char[n * sizeof(int64_t) + len]; char *new_data = new char[n * sizeof(uint64_t) + len];
memcpy(new_data, stored_data, n * sizeof(int64_t)); memcpy(new_data, stored_data, n * sizeof(uint64_t));
memcpy(new_data + n * sizeof(int64_t), data, len); memcpy(new_data + n * sizeof(uint64_t), data, len);
_db->put(src_id % shard_num % task_pool_size_, _db->put(src_id % shard_num % task_pool_size_,
ch, ch,
sizeof(int) * 2 + sizeof(int64_t), sizeof(int) * 2 + sizeof(uint64_t),
(char *)new_data, (char *)new_data,
n * sizeof(int64_t) + len); n * sizeof(uint64_t) + len);
delete[] new_data; delete[] new_data;
} else { } else {
_db->put(src_id % shard_num % task_pool_size_, _db->put(src_id % shard_num % task_pool_size_,
ch, ch,
sizeof(int) * 2 + sizeof(int64_t), sizeof(int) * 2 + sizeof(uint64_t),
(char *)data, (char *)data,
len); len);
} }
// _db->flush(src_id % shard_num % task_pool_size_);
// std::string x;
// if (_db->get(src_id % shard_num % task_pool_size_, ch, sizeof(int64_t) +
// 2 * sizeof(int), x) ==0){
// VLOG(0)<<"put result";
// for(int i = 0;i < x.size();i+=8){
// VLOG(0)<<"get an id "<<*((int64_t *)(x.c_str() + i));
// }
//}
// if(src_id == 429){
// str = "";
// _db->get(src_id % shard_num % task_pool_size_, ch,
// sizeof(int) * 2 + sizeof(int64_t), str);
// int64_t *stored_data = ((int64_t *)str.c_str());
// int n = str.size() / sizeof(int64_t);
// VLOG(0)<<"429 has "<<n<<"neighbors";
// for(int i =0;i< n;i++){
// VLOG(0)<<"get an id "<<*((int64_t *)(str.c_str() +
// i*sizeof(int64_t)));
// }
// }
} }
return 0; return 0;
} }
char *GraphTable::random_sample_neighbor_from_ssd( char *GraphTable::random_sample_neighbor_from_ssd(
int idx, int idx,
int64_t id, uint64_t id,
int sample_size, int sample_size,
const std::shared_ptr<std::mt19937_64> rng, const std::shared_ptr<std::mt19937_64> rng,
int &actual_size) { int &actual_size) {
...@@ -172,18 +246,18 @@ char *GraphTable::random_sample_neighbor_from_ssd( ...@@ -172,18 +246,18 @@ char *GraphTable::random_sample_neighbor_from_ssd(
} }
std::string str; std::string str;
VLOG(2) << "sample ssd for key " << id; VLOG(2) << "sample ssd for key " << id;
char ch[sizeof(int) * 2 + sizeof(int64_t)]; char ch[sizeof(int) * 2 + sizeof(uint64_t)];
memset(ch, 0, sizeof(int)); memset(ch, 0, sizeof(int));
memcpy(ch + sizeof(int), &idx, sizeof(int)); memcpy(ch + sizeof(int), &idx, sizeof(int));
memcpy(ch + sizeof(int) * 2, &id, sizeof(int64_t)); memcpy(ch + sizeof(int) * 2, &id, sizeof(uint64_t));
if (_db->get(id % shard_num % task_pool_size_, if (_db->get(id % shard_num % task_pool_size_,
ch, ch,
sizeof(int) * 2 + sizeof(int64_t), sizeof(int) * 2 + sizeof(uint64_t),
str) == 0) { str) == 0) {
int64_t *data = ((int64_t *)str.c_str()); uint64_t *data = ((uint64_t *)str.c_str());
int n = str.size() / sizeof(int64_t); int n = str.size() / sizeof(uint64_t);
std::unordered_map<int, int> m; std::unordered_map<int, int> m;
// std::vector<int64_t> res; // std::vector<uint64_t> res;
int sm_size = std::min(n, sample_size); int sm_size = std::min(n, sample_size);
actual_size = sm_size * Node::id_size; actual_size = sm_size * Node::id_size;
char *buff = new char[actual_size]; char *buff = new char[actual_size];
...@@ -207,7 +281,7 @@ char *GraphTable::random_sample_neighbor_from_ssd( ...@@ -207,7 +281,7 @@ char *GraphTable::random_sample_neighbor_from_ssd(
// res.push_back(data[pos]); // res.push_back(data[pos]);
} }
for (int i = 0; i < actual_size; i += 8) { for (int i = 0; i < actual_size; i += 8) {
VLOG(2) << "sampled an neighbor " << *(int64_t *)&buff[i]; VLOG(2) << "sampled an neighbor " << *(uint64_t *)&buff[i];
} }
return buff; return buff;
} }
...@@ -216,8 +290,8 @@ char *GraphTable::random_sample_neighbor_from_ssd( ...@@ -216,8 +290,8 @@ char *GraphTable::random_sample_neighbor_from_ssd(
} }
int64_t GraphTable::load_graph_to_memory_from_ssd(int idx, int64_t GraphTable::load_graph_to_memory_from_ssd(int idx,
std::vector<int64_t> &ids) { std::vector<uint64_t> &ids) {
std::vector<std::vector<int64_t>> bags(task_pool_size_); std::vector<std::vector<uint64_t>> bags(task_pool_size_);
for (auto x : ids) { for (auto x : ids) {
int location = x % shard_num % task_pool_size_; int location = x % shard_num % task_pool_size_;
bags[location].push_back(x); bags[location].push_back(x);
...@@ -227,17 +301,17 @@ int64_t GraphTable::load_graph_to_memory_from_ssd(int idx, ...@@ -227,17 +301,17 @@ int64_t GraphTable::load_graph_to_memory_from_ssd(int idx,
for (size_t i = 0; i < bags.size(); i++) { for (size_t i = 0; i < bags.size(); i++) {
if (bags[i].size() > 0) { if (bags[i].size() > 0) {
tasks.push_back(_shards_task_pool[i]->enqueue([&, i, idx, this]() -> int { tasks.push_back(_shards_task_pool[i]->enqueue([&, i, idx, this]() -> int {
char ch[sizeof(int) * 2 + sizeof(int64_t)]; char ch[sizeof(int) * 2 + sizeof(uint64_t)];
memset(ch, 0, sizeof(int)); memset(ch, 0, sizeof(int));
memcpy(ch + sizeof(int), &idx, sizeof(int)); memcpy(ch + sizeof(int), &idx, sizeof(int));
for (size_t k = 0; k < bags[i].size(); k++) { for (size_t k = 0; k < bags[i].size(); k++) {
auto v = bags[i][k]; auto v = bags[i][k];
memcpy(ch + sizeof(int) * 2, &v, sizeof(int64_t)); memcpy(ch + sizeof(int) * 2, &v, sizeof(uint64_t));
std::string str; std::string str;
if (_db->get(i, ch, sizeof(int) * 2 + sizeof(int64_t), str) == 0) { if (_db->get(i, ch, sizeof(int) * 2 + sizeof(uint64_t), str) == 0) {
count[i] += (int64_t)str.size(); count[i] += (int64_t)str.size();
for (int j = 0; j < str.size(); j += sizeof(int64_t)) { for (size_t j = 0; j < (int)str.size(); j += sizeof(uint64_t)) {
int64_t id = *(int64_t *)(str.c_str() + j); uint64_t id = *(uint64_t *)(str.c_str() + j);
add_comm_edge(idx, v, id); add_comm_edge(idx, v, id);
} }
} }
...@@ -274,7 +348,7 @@ void GraphTable::make_partitions(int idx, int64_t byte_size, int device_len) { ...@@ -274,7 +348,7 @@ void GraphTable::make_partitions(int idx, int64_t byte_size, int device_len) {
std::vector<double> weight_cost(part_len, 0); std::vector<double> weight_cost(part_len, 0);
std::vector<int64_t> memory_remaining(part_len, gb_size_by_discount); std::vector<int64_t> memory_remaining(part_len, gb_size_by_discount);
std::vector<double> score(part_len, 0); std::vector<double> score(part_len, 0);
std::unordered_map<int64_t, int> id_map; std::unordered_map<uint64_t, int> id_map;
std::vector<rocksdb::Iterator *> iters; std::vector<rocksdb::Iterator *> iters;
for (int i = 0; i < task_pool_size_; i++) { for (int i = 0; i < task_pool_size_; i++) {
iters.push_back(_db->get_iterator(i)); iters.push_back(_db->get_iterator(i));
...@@ -282,7 +356,7 @@ void GraphTable::make_partitions(int idx, int64_t byte_size, int device_len) { ...@@ -282,7 +356,7 @@ void GraphTable::make_partitions(int idx, int64_t byte_size, int device_len) {
} }
int next = 0; int next = 0;
while (iters.size()) { while (iters.size()) {
if (next >= iters.size()) { if (next >= (int)iters.size()) {
next = 0; next = 0;
} }
if (!iters[next]->Valid()) { if (!iters[next]->Valid()) {
...@@ -298,7 +372,7 @@ void GraphTable::make_partitions(int idx, int64_t byte_size, int device_len) { ...@@ -298,7 +372,7 @@ void GraphTable::make_partitions(int idx, int64_t byte_size, int device_len) {
continue; continue;
} }
std::string value = iters[next]->value().ToString(); std::string value = iters[next]->value().ToString();
std::int64_t i_key = *(int64_t *)(key.c_str() + sizeof(int) * 2); std::uint64_t i_key = *(uint64_t *)(key.c_str() + sizeof(int) * 2);
for (int i = 0; i < part_len; i++) { for (int i = 0; i < part_len; i++) {
if (memory_remaining[i] < (int64_t)value.size()) { if (memory_remaining[i] < (int64_t)value.size()) {
score[i] = -100000.0; score[i] = -100000.0;
...@@ -306,8 +380,8 @@ void GraphTable::make_partitions(int idx, int64_t byte_size, int device_len) { ...@@ -306,8 +380,8 @@ void GraphTable::make_partitions(int idx, int64_t byte_size, int device_len) {
score[i] = 0; score[i] = 0;
} }
} }
for (int j = 0; j < value.size(); j += sizeof(int64_t)) { for (size_t j = 0; j < (int)value.size(); j += sizeof(uint64_t)) {
int64_t v = *((int64_t *)(value.c_str() + j)); uint64_t v = *((uint64_t *)(value.c_str() + j));
int index = -1; int index = -1;
if (id_map.find(v) != id_map.end()) { if (id_map.find(v) != id_map.end()) {
index = id_map[v]; index = id_map[v];
...@@ -398,7 +472,7 @@ void GraphTable::clear_graph(int idx) { ...@@ -398,7 +472,7 @@ void GraphTable::clear_graph(int idx) {
} }
} }
int32_t GraphTable::load_next_partition(int idx) { int32_t GraphTable::load_next_partition(int idx) {
if (next_partition >= partitions[idx].size()) { if (next_partition >= (int)partitions[idx].size()) {
VLOG(0) << "partition iteration is done"; VLOG(0) << "partition iteration is done";
return -1; return -1;
} }
...@@ -426,8 +500,6 @@ int32_t GraphTable::load_edges_to_ssd(const std::string &path, ...@@ -426,8 +500,6 @@ int32_t GraphTable::load_edges_to_ssd(const std::string &path,
auto paths = paddle::string::split_string<std::string>(path, ";"); auto paths = paddle::string::split_string<std::string>(path, ";");
int64_t count = 0; int64_t count = 0;
std::string sample_type = "random"; std::string sample_type = "random";
bool is_weighted = false;
int valid_count = 0;
for (auto path : paths) { for (auto path : paths) {
std::ifstream file(path); std::ifstream file(path);
std::string line; std::string line;
...@@ -438,16 +510,16 @@ int32_t GraphTable::load_edges_to_ssd(const std::string &path, ...@@ -438,16 +510,16 @@ int32_t GraphTable::load_edges_to_ssd(const std::string &path,
if (values.size() < 2) continue; if (values.size() < 2) continue;
auto src_id = std::stoll(values[0]); auto src_id = std::stoll(values[0]);
auto dist_ids = paddle::string::split_string<std::string>(values[1], ";"); auto dist_ids = paddle::string::split_string<std::string>(values[1], ";");
std::vector<int64_t> dist_data; std::vector<uint64_t> dist_data;
for (auto x : dist_ids) { for (auto x : dist_ids) {
dist_data.push_back(std::stoll(x)); dist_data.push_back(std::stoll(x));
total_memory_cost += sizeof(int64_t); total_memory_cost += sizeof(uint64_t);
} }
add_node_to_ssd(0, add_node_to_ssd(0,
idx, idx,
src_id, src_id,
(char *)dist_data.data(), (char *)dist_data.data(),
(int)(dist_data.size() * sizeof(int64_t))); (int)(dist_data.size() * sizeof(uint64_t)));
} }
} }
VLOG(0) << "total memory cost = " << total_memory_cost << " bytes"; VLOG(0) << "total memory cost = " << total_memory_cost << " bytes";
...@@ -456,9 +528,6 @@ int32_t GraphTable::load_edges_to_ssd(const std::string &path, ...@@ -456,9 +528,6 @@ int32_t GraphTable::load_edges_to_ssd(const std::string &path,
int32_t GraphTable::dump_edges_to_ssd(int idx) { int32_t GraphTable::dump_edges_to_ssd(int idx) {
VLOG(2) << "calling dump edges to ssd"; VLOG(2) << "calling dump edges to ssd";
const int64_t fixed_size = 10000;
// std::vector<int64_t> edge_array[task_pool_size_];
std::vector<std::unordered_map<int64_t, int>> count(task_pool_size_);
std::vector<std::future<int64_t>> tasks; std::vector<std::future<int64_t>> tasks;
auto &shards = edge_shards[idx]; auto &shards = edge_shards[idx];
for (size_t i = 0; i < shards.size(); ++i) { for (size_t i = 0; i < shards.size(); ++i) {
...@@ -466,18 +535,17 @@ int32_t GraphTable::dump_edges_to_ssd(int idx) { ...@@ -466,18 +535,17 @@ int32_t GraphTable::dump_edges_to_ssd(int idx) {
[&, i, this]() -> int64_t { [&, i, this]() -> int64_t {
int64_t cost = 0; int64_t cost = 0;
std::vector<Node *> &v = shards[i]->get_bucket(); std::vector<Node *> &v = shards[i]->get_bucket();
size_t ind = i % this->task_pool_size_;
for (size_t j = 0; j < v.size(); j++) { for (size_t j = 0; j < v.size(); j++) {
std::vector<int64_t> s; std::vector<uint64_t> s;
for (int k = 0; k < v[j]->get_neighbor_size(); k++) { for (size_t k = 0; k < (int)v[j]->get_neighbor_size(); k++) {
s.push_back(v[j]->get_neighbor_id(k)); s.push_back(v[j]->get_neighbor_id(k));
} }
cost += v[j]->get_neighbor_size() * sizeof(int64_t); cost += v[j]->get_neighbor_size() * sizeof(uint64_t);
add_node_to_ssd(0, add_node_to_ssd(0,
idx, idx,
v[j]->get_id(), v[j]->get_id(),
(char *)s.data(), (char *)s.data(),
s.size() * sizeof(int64_t)); s.size() * sizeof(uint64_t));
} }
return cost; return cost;
})); }));
...@@ -489,7 +557,7 @@ int32_t GraphTable::make_complementary_graph(int idx, int64_t byte_size) { ...@@ -489,7 +557,7 @@ int32_t GraphTable::make_complementary_graph(int idx, int64_t byte_size) {
VLOG(0) << "make_complementary_graph"; VLOG(0) << "make_complementary_graph";
const int64_t fixed_size = byte_size / 8; const int64_t fixed_size = byte_size / 8;
// std::vector<int64_t> edge_array[task_pool_size_]; // std::vector<int64_t> edge_array[task_pool_size_];
std::vector<std::unordered_map<int64_t, int>> count(task_pool_size_); std::vector<std::unordered_map<uint64_t, int>> count(task_pool_size_);
std::vector<std::future<int>> tasks; std::vector<std::future<int>> tasks;
auto &shards = edge_shards[idx]; auto &shards = edge_shards[idx];
for (size_t i = 0; i < shards.size(); ++i) { for (size_t i = 0; i < shards.size(); ++i) {
...@@ -499,7 +567,7 @@ int32_t GraphTable::make_complementary_graph(int idx, int64_t byte_size) { ...@@ -499,7 +567,7 @@ int32_t GraphTable::make_complementary_graph(int idx, int64_t byte_size) {
size_t ind = i % this->task_pool_size_; size_t ind = i % this->task_pool_size_;
for (size_t j = 0; j < v.size(); j++) { for (size_t j = 0; j < v.size(); j++) {
// size_t location = v[j]->get_id(); // size_t location = v[j]->get_id();
for (int k = 0; k < v[j]->get_neighbor_size(); k++) { for (size_t k = 0; k < v[j]->get_neighbor_size(); k++) {
count[ind][v[j]->get_neighbor_id(k)]++; count[ind][v[j]->get_neighbor_id(k)]++;
} }
} }
...@@ -507,9 +575,9 @@ int32_t GraphTable::make_complementary_graph(int idx, int64_t byte_size) { ...@@ -507,9 +575,9 @@ int32_t GraphTable::make_complementary_graph(int idx, int64_t byte_size) {
})); }));
} }
for (size_t i = 0; i < tasks.size(); i++) tasks[i].get(); for (size_t i = 0; i < tasks.size(); i++) tasks[i].get();
std::unordered_map<int64_t, int> final_count; std::unordered_map<uint64_t, int> final_count;
std::map<int, std::vector<int64_t>> count_to_id; std::map<int, std::vector<uint64_t>> count_to_id;
std::vector<int64_t> buffer; std::vector<uint64_t> buffer;
clear_graph(idx); clear_graph(idx);
for (int i = 0; i < task_pool_size_; i++) { for (int i = 0; i < task_pool_size_; i++) {
...@@ -546,6 +614,7 @@ int32_t GraphTable::make_complementary_graph(int idx, int64_t byte_size) { ...@@ -546,6 +614,7 @@ int32_t GraphTable::make_complementary_graph(int idx, int64_t byte_size) {
bucket[i]->build_sampler(sample_type); bucket[i]->build_sampler(sample_type);
} }
} }
return 0; return 0;
} }
#endif #endif
...@@ -840,7 +909,7 @@ std::vector<Node *> GraphShard::get_batch(int start, int end, int step) { ...@@ -840,7 +909,7 @@ std::vector<Node *> GraphShard::get_batch(int start, int end, int step) {
size_t GraphShard::get_size() { return bucket.size(); } size_t GraphShard::get_size() { return bucket.size(); }
int32_t GraphTable::add_comm_edge(int idx, int64_t src_id, int64_t dst_id) { int32_t GraphTable::add_comm_edge(int idx, uint64_t src_id, uint64_t dst_id) {
size_t src_shard_id = src_id % shard_num; size_t src_shard_id = src_id % shard_num;
if (src_shard_id >= shard_end || src_shard_id < shard_start) { if (src_shard_id >= shard_end || src_shard_id < shard_start) {
...@@ -852,11 +921,11 @@ int32_t GraphTable::add_comm_edge(int idx, int64_t src_id, int64_t dst_id) { ...@@ -852,11 +921,11 @@ int32_t GraphTable::add_comm_edge(int idx, int64_t src_id, int64_t dst_id) {
return 0; return 0;
} }
int32_t GraphTable::add_graph_node(int idx, int32_t GraphTable::add_graph_node(int idx,
std::vector<int64_t> &id_list, std::vector<uint64_t> &id_list,
std::vector<bool> &is_weight_list) { std::vector<bool> &is_weight_list) {
auto &shards = edge_shards[idx]; auto &shards = edge_shards[idx];
size_t node_size = id_list.size(); size_t node_size = id_list.size();
std::vector<std::vector<std::pair<int64_t, bool>>> batch(task_pool_size_); std::vector<std::vector<std::pair<uint64_t, bool>>> batch(task_pool_size_);
for (size_t i = 0; i < node_size; i++) { for (size_t i = 0; i < node_size; i++) {
size_t shard_id = id_list[i] % shard_num; size_t shard_id = id_list[i] % shard_num;
if (shard_id >= shard_end || shard_id < shard_start) { if (shard_id >= shard_end || shard_id < shard_start) {
...@@ -881,9 +950,9 @@ int32_t GraphTable::add_graph_node(int idx, ...@@ -881,9 +950,9 @@ int32_t GraphTable::add_graph_node(int idx,
return 0; return 0;
} }
int32_t GraphTable::remove_graph_node(int idx, std::vector<int64_t> &id_list) { int32_t GraphTable::remove_graph_node(int idx, std::vector<uint64_t> &id_list) {
size_t node_size = id_list.size(); size_t node_size = id_list.size();
std::vector<std::vector<int64_t>> batch(task_pool_size_); std::vector<std::vector<uint64_t>> batch(task_pool_size_);
for (size_t i = 0; i < node_size; i++) { for (size_t i = 0; i < node_size; i++) {
size_t shard_id = id_list[i] % shard_num; size_t shard_id = id_list[i] % shard_num;
if (shard_id >= shard_end || shard_id < shard_start) continue; if (shard_id >= shard_end || shard_id < shard_start) continue;
...@@ -916,7 +985,7 @@ void GraphShard::clear() { ...@@ -916,7 +985,7 @@ void GraphShard::clear() {
GraphShard::~GraphShard() { clear(); } GraphShard::~GraphShard() { clear(); }
void GraphShard::delete_node(int64_t id) { void GraphShard::delete_node(uint64_t id) {
auto iter = node_location.find(id); auto iter = node_location.find(id);
if (iter == node_location.end()) return; if (iter == node_location.end()) return;
int pos = iter->second; int pos = iter->second;
...@@ -928,7 +997,7 @@ void GraphShard::delete_node(int64_t id) { ...@@ -928,7 +997,7 @@ void GraphShard::delete_node(int64_t id) {
node_location.erase(id); node_location.erase(id);
bucket.pop_back(); bucket.pop_back();
} }
GraphNode *GraphShard::add_graph_node(int64_t id) { GraphNode *GraphShard::add_graph_node(uint64_t id) {
if (node_location.find(id) == node_location.end()) { if (node_location.find(id) == node_location.end()) {
node_location[id] = bucket.size(); node_location[id] = bucket.size();
bucket.push_back(new GraphNode(id)); bucket.push_back(new GraphNode(id));
...@@ -944,19 +1013,25 @@ GraphNode *GraphShard::add_graph_node(Node *node) { ...@@ -944,19 +1013,25 @@ GraphNode *GraphShard::add_graph_node(Node *node) {
} }
return (GraphNode *)bucket[node_location[id]]; return (GraphNode *)bucket[node_location[id]];
} }
FeatureNode *GraphShard::add_feature_node(int64_t id) {
FeatureNode *GraphShard::add_feature_node(uint64_t id, bool is_overlap) {
if (node_location.find(id) == node_location.end()) { if (node_location.find(id) == node_location.end()) {
node_location[id] = bucket.size(); node_location[id] = bucket.size();
bucket.push_back(new FeatureNode(id)); bucket.push_back(new FeatureNode(id));
return (FeatureNode *)bucket[node_location[id]];
} }
if (is_overlap) {
return (FeatureNode *)bucket[node_location[id]]; return (FeatureNode *)bucket[node_location[id]];
}
return NULL;
} }
void GraphShard::add_neighbor(int64_t id, int64_t dst_id, float weight) { void GraphShard::add_neighbor(uint64_t id, uint64_t dst_id, float weight) {
find_node(id)->add_edge(dst_id, weight); find_node(id)->add_edge(dst_id, weight);
} }
Node *GraphShard::find_node(int64_t id) { Node *GraphShard::find_node(uint64_t id) {
auto iter = node_location.find(id); auto iter = node_location.find(id);
return iter == node_location.end() ? nullptr : bucket[iter->second]; return iter == node_location.end() ? nullptr : bucket[iter->second];
} }
...@@ -992,15 +1067,93 @@ int32_t GraphTable::Load(const std::string &path, const std::string &param) { ...@@ -992,15 +1067,93 @@ int32_t GraphTable::Load(const std::string &path, const std::string &param) {
return 0; return 0;
} }
std::string GraphTable::get_inverse_etype(std::string &etype) {
auto etype_split = paddle::string::split_string<std::string>(etype, "2");
std::string res;
if ((int)etype_split.size() == 3) {
res = etype_split[2] + "2" + etype_split[1] + "2" + etype_split[0];
} else {
res = etype_split[1] + "2" + etype_split[0];
}
return res;
}
int32_t GraphTable::load_node_and_edge_file(std::string etype,
std::string ntype,
std::string epath,
std::string npath,
int part_num,
bool reverse) {
auto etypes = paddle::string::split_string<std::string>(etype, ",");
auto ntypes = paddle::string::split_string<std::string>(ntype, ",");
VLOG(0) << "etypes size: " << etypes.size();
VLOG(0) << "whether reverse: " << reverse;
std::string delim = ";";
size_t total_len = etypes.size() + 1; // 1 is for node
std::vector<std::future<int>> tasks;
for (size_t i = 0; i < total_len; i++) {
tasks.push_back(
_shards_task_pool[i % task_pool_size_]->enqueue([&, i, this]() -> int {
if (i < etypes.size()) {
std::string etype_path = epath + "/" + etypes[i];
auto etype_path_list = paddle::framework::localfs_list(etype_path);
std::string etype_path_str;
if (part_num > 0 && part_num < (int)etype_path_list.size()) {
std::vector<std::string> sub_etype_path_list(
etype_path_list.begin(), etype_path_list.begin() + part_num);
etype_path_str =
paddle::string::join_strings(sub_etype_path_list, delim);
} else {
etype_path_str =
paddle::string::join_strings(etype_path_list, delim);
}
this->load_edges(etype_path_str, false, etypes[i]);
if (reverse) {
std::string r_etype = get_inverse_etype(etypes[i]);
this->load_edges(etype_path_str, true, r_etype);
}
} else {
auto npath_list = paddle::framework::localfs_list(npath);
std::string npath_str;
if (part_num > 0 && part_num < (int)npath_list.size()) {
std::vector<std::string> sub_npath_list(
npath_list.begin(), npath_list.begin() + part_num);
npath_str = paddle::string::join_strings(sub_npath_list, delim);
} else {
npath_str = paddle::string::join_strings(npath_list, delim);
}
if (ntypes.size() == 0) {
VLOG(0) << "node_type not specified, nothing will be loaded ";
return 0;
}
if (FLAGS_graph_load_in_parallel) {
this->load_nodes(npath_str, "");
} else {
for (size_t j = 0; j < ntypes.size(); j++) {
this->load_nodes(npath_str, ntypes[j]);
}
}
}
return 0;
}));
}
for (int i = 0; i < (int)tasks.size(); i++) tasks[i].get();
return 0;
}
int32_t GraphTable::get_nodes_ids_by_ranges( int32_t GraphTable::get_nodes_ids_by_ranges(
int type_id, int type_id,
int idx, int idx,
std::vector<std::pair<int, int>> ranges, std::vector<std::pair<int, int>> ranges,
std::vector<int64_t> &res) { std::vector<uint64_t> &res) {
std::mutex mutex;
int start = 0, end, index = 0, total_size = 0; int start = 0, end, index = 0, total_size = 0;
res.clear(); res.clear();
auto &shards = type_id == 0 ? edge_shards[idx] : feature_shards[idx]; auto &shards = type_id == 0 ? edge_shards[idx] : feature_shards[idx];
std::vector<std::future<std::vector<int64_t>>> tasks; std::vector<std::future<size_t>> tasks;
for (size_t i = 0; i < shards.size() && index < (int)ranges.size(); i++) { for (size_t i = 0; i < shards.size() && index < (int)ranges.size(); i++) {
end = total_size + shards[i]->get_size(); end = total_size + shards[i]->get_size();
start = total_size; start = total_size;
...@@ -1016,86 +1169,173 @@ int32_t GraphTable::get_nodes_ids_by_ranges( ...@@ -1016,86 +1169,173 @@ int32_t GraphTable::get_nodes_ids_by_ranges(
first -= total_size; first -= total_size;
second -= total_size; second -= total_size;
tasks.push_back(_shards_task_pool[i % task_pool_size_]->enqueue( tasks.push_back(_shards_task_pool[i % task_pool_size_]->enqueue(
[&shards, this, first, second, i]() -> std::vector<int64_t> { [&shards, this, first, second, i, &res, &mutex]() -> size_t {
return shards[i]->get_ids_by_range(first, second); std::vector<uint64_t> keys;
shards[i]->get_ids_by_range(first, second, &keys);
size_t num = keys.size();
mutex.lock();
res.reserve(res.size() + num);
for (auto &id : keys) {
res.push_back(id);
std::swap(res[rand() % res.size()], res[(int)res.size() - 1]);
}
mutex.unlock();
return num;
})); }));
} }
} }
total_size += shards[i]->get_size(); total_size += shards[i]->get_size();
} }
for (size_t i = 0; i < tasks.size(); i++) { for (size_t i = 0; i < tasks.size(); i++) {
auto vec = tasks[i].get(); tasks[i].get();
for (auto &id : vec) {
res.push_back(id);
std::swap(res[rand() % res.size()], res[(int)res.size() - 1]);
}
} }
return 0; return 0;
} }
int32_t GraphTable::load_nodes(const std::string &path, std::string node_type) { std::pair<uint64_t, uint64_t> GraphTable::parse_node_file(
auto paths = paddle::string::split_string<std::string>(path, ";"); const std::string &path, const std::string &node_type, int idx) {
int64_t count = 0;
int64_t valid_count = 0;
int idx = 0;
if (node_type == "") {
VLOG(0) << "node_type not specified, loading edges to " << id_to_feature[0]
<< " part";
} else {
if (feature_to_id.find(node_type) == feature_to_id.end()) {
VLOG(0) << "node_type " << node_type
<< " is not defined, nothing will be loaded";
return 0;
}
idx = feature_to_id[node_type];
}
for (auto path : paths) {
std::ifstream file(path); std::ifstream file(path);
std::string line; std::string line;
while (std::getline(file, line)) { uint64_t local_count = 0;
auto values = paddle::string::split_string<std::string>(line, "\t"); uint64_t local_valid_count = 0;
if (values.size() < 2) continue;
auto id = std::stoull(values[1]);
int num = 0;
std::vector<paddle::string::str_ptr> vals;
size_t n = node_type.length();
while (std::getline(file, line)) {
if (strncmp(line.c_str(), node_type.c_str(), n) != 0) {
continue;
}
vals.clear();
num = paddle::string::split_string_ptr(
line.c_str() + n + 1, line.length() - n - 1, '\t', &vals);
if (num == 0) {
continue;
}
uint64_t id = std::strtoul(vals[0].ptr, NULL, 10);
size_t shard_id = id % shard_num; size_t shard_id = id % shard_num;
if (shard_id >= shard_end || shard_id < shard_start) { if (shard_id >= shard_end || shard_id < shard_start) {
VLOG(4) << "will not load " << id << " from " << path VLOG(4) << "will not load " << id << " from " << path
<< ", please check id distribution"; << ", please check id distribution";
continue; continue;
} }
local_count++;
if (count % 1000000 == 0) { size_t index = shard_id - shard_start;
VLOG(0) << count << " nodes are loaded from filepath"; auto node = feature_shards[idx][index]->add_feature_node(id, false);
VLOG(0) << line; if (node != NULL) {
node->set_feature_size(feat_name[idx].size());
for (int i = 1; i < num; ++i) {
auto &v = vals[i];
parse_feature(idx, v.ptr, v.len, node);
} }
count++; }
local_valid_count++;
}
VLOG(2) << "node_type[" << node_type << "] loads " << local_count
<< " nodes from filepath->" << path;
return {local_count, local_valid_count};
}
std::pair<uint64_t, uint64_t> GraphTable::parse_node_file(
const std::string &path) {
std::ifstream file(path);
std::string line;
uint64_t local_count = 0;
uint64_t local_valid_count = 0;
int idx = 0;
auto path_split = paddle::string::split_string<std::string>(path, "/");
auto path_name = path_split[path_split.size() - 1];
std::string nt = values[0]; int num = 0;
if (nt != node_type) { std::vector<paddle::string::str_ptr> vals;
while (std::getline(file, line)) {
vals.clear();
num = paddle::string::split_string_ptr(
line.c_str(), line.length(), '\t', &vals);
if (vals.empty()) {
continue;
}
std::string parse_node_type = vals[0].to_string();
auto it = feature_to_id.find(parse_node_type);
if (it == feature_to_id.end()) {
VLOG(0) << parse_node_type << "type error, please check";
continue;
}
idx = it->second;
uint64_t id = std::strtoul(vals[1].ptr, NULL, 10);
size_t shard_id = id % shard_num;
if (shard_id >= shard_end || shard_id < shard_start) {
VLOG(4) << "will not load " << id << " from " << path
<< ", please check id distribution";
continue; continue;
} }
local_count++;
size_t index = shard_id - shard_start; size_t index = shard_id - shard_start;
auto node = feature_shards[idx][index]->add_feature_node(id, false);
if (node != NULL) {
for (int i = 2; i < num; ++i) {
auto &v = vals[i];
parse_feature(idx, v.ptr, v.len, node);
}
}
local_valid_count++;
}
VLOG(2) << local_valid_count << "/" << local_count << " nodes from filepath->"
<< path;
return {local_count, local_valid_count};
}
// auto node = shards[index]->add_feature_node(id); // TODO opt load all node_types in once reading
auto node = feature_shards[idx][index]->add_feature_node(id); int32_t GraphTable::load_nodes(const std::string &path, std::string node_type) {
node->set_feature_size(feat_name[idx].size()); auto paths = paddle::string::split_string<std::string>(path, ";");
uint64_t count = 0;
for (size_t slice = 2; slice < values.size(); slice++) { uint64_t valid_count = 0;
auto feat = this->parse_feature(idx, values[slice]); int idx = 0;
if (feat.first >= 0) { if (FLAGS_graph_load_in_parallel) {
node->set_feature(feat.first, feat.second); if (node_type == "") {
VLOG(0) << "Begin GraphTable::load_nodes(), will load all node_type once";
}
std::vector<std::future<std::pair<uint64_t, uint64_t>>> tasks;
for (size_t i = 0; i < paths.size(); i++) {
tasks.push_back(load_node_edge_task_pool->enqueue(
[&, i, this]() -> std::pair<uint64_t, uint64_t> {
return parse_node_file(paths[i]);
}));
}
for (int i = 0; i < (int)tasks.size(); i++) {
auto res = tasks[i].get();
count += res.first;
valid_count += res.second;
}
} else {
VLOG(0) << "Begin GraphTable::load_nodes() node_type[" << node_type << "]";
if (node_type == "") {
VLOG(0) << "node_type not specified, loading edges to "
<< id_to_feature[0] << " part";
} else { } else {
VLOG(4) << "Node feature: " << values[slice] if (feature_to_id.find(node_type) == feature_to_id.end()) {
<< " not in feature_map."; VLOG(0) << "node_type " << node_type
<< " is not defined, nothing will be loaded";
return 0;
} }
idx = feature_to_id[node_type];
} }
valid_count++; for (auto path : paths) {
VLOG(2) << "Begin GraphTable::load_nodes(), path[" << path << "]";
auto res = parse_node_file(path, node_type, idx);
count += res.first;
valid_count += res.second;
} }
} }
VLOG(0) << valid_count << "/" << count << " nodes in type " << node_type VLOG(0) << valid_count << "/" << count << " nodes in node_type[ " << node_type
<< " are loaded successfully in " << path; << "] are loaded successfully!";
return 0; return 0;
} }
...@@ -1108,85 +1348,114 @@ int32_t GraphTable::build_sampler(int idx, std::string sample_type) { ...@@ -1108,85 +1348,114 @@ int32_t GraphTable::build_sampler(int idx, std::string sample_type) {
} }
return 0; return 0;
} }
int32_t GraphTable::load_edges(const std::string &path,
bool reverse_edge,
const std::string &edge_type) {
#ifdef PADDLE_WITH_HETERPS
// if (gpups_mode) pthread_rwlock_rdlock(rw_lock.get());
if (search_level == 2) total_memory_cost = 0;
const int64_t fixed_load_edges = 1000000;
#endif
int idx = 0;
if (edge_type == "") {
VLOG(0) << "edge_type not specified, loading edges to " << id_to_edge[0]
<< " part";
} else {
if (edge_to_id.find(edge_type) == edge_to_id.end()) {
VLOG(0) << "edge_type " << edge_type
<< " is not defined, nothing will be loaded";
return 0;
}
idx = edge_to_id[edge_type];
}
auto paths = paddle::string::split_string<std::string>(path, ";"); std::pair<uint64_t, uint64_t> GraphTable::parse_edge_file(
int64_t count = 0; const std::string &path, int idx, bool reverse) {
std::string sample_type = "random"; std::string sample_type = "random";
bool is_weighted = false; bool is_weighted = false;
int valid_count = 0;
for (auto path : paths) {
std::ifstream file(path); std::ifstream file(path);
std::string line; std::string line;
uint64_t local_count = 0;
uint64_t local_valid_count = 0;
uint64_t part_num = 0;
if (FLAGS_graph_load_in_parallel) {
auto path_split = paddle::string::split_string<std::string>(path, "/");
auto part_name_split = paddle::string::split_string<std::string>(
path_split[path_split.size() - 1], "-");
part_num = std::stoull(part_name_split[part_name_split.size() - 1]);
}
while (std::getline(file, line)) { while (std::getline(file, line)) {
auto values = paddle::string::split_string<std::string>(line, "\t"); size_t start = line.find_first_of('\t');
count++; if (start == std::string::npos) continue;
if (values.size() < 2) continue; local_count++;
auto src_id = std::stoull(values[0]); uint64_t src_id = std::stoull(&line[0]);
auto dst_id = std::stoull(values[1]); uint64_t dst_id = std::stoull(&line[start + 1]);
if (reverse_edge) { if (reverse) {
std::swap(src_id, dst_id); std::swap(src_id, dst_id);
} }
size_t src_shard_id = src_id % shard_num;
if (FLAGS_graph_load_in_parallel) {
if (src_shard_id != (part_num % shard_num)) {
continue;
}
}
float weight = 1; float weight = 1;
if (values.size() == 3) { size_t last = line.find_last_of('\t');
weight = std::stof(values[2]); if (start != last) {
weight = std::stof(&line[last + 1]);
sample_type = "weighted"; sample_type = "weighted";
is_weighted = true; is_weighted = true;
} }
size_t src_shard_id = src_id % shard_num;
if (src_shard_id >= shard_end || src_shard_id < shard_start) { if (src_shard_id >= shard_end || src_shard_id < shard_start) {
VLOG(4) << "will not load " << src_id << " from " << path VLOG(4) << "will not load " << src_id << " from " << path
<< ", please check id distribution"; << ", please check id distribution";
continue; continue;
} }
size_t index = src_shard_id - shard_start;
auto node = edge_shards[idx][index]->add_graph_node(src_id);
if (node != NULL) {
node->build_edges(is_weighted);
node->add_edge(dst_id, weight);
}
if (count % 1000000 == 0) { local_valid_count++;
VLOG(0) << count << " edges are loaded from filepath";
VLOG(0) << line;
} }
VLOG(2) << local_count << " edges are loaded from filepath->" << path;
return {local_count, local_valid_count};
}
size_t index = src_shard_id - shard_start; int32_t GraphTable::load_edges(const std::string &path,
edge_shards[idx][index]->add_graph_node(src_id)->build_edges(is_weighted); bool reverse_edge,
edge_shards[idx][index]->add_neighbor(src_id, dst_id, weight); const std::string &edge_type) {
valid_count++;
#ifdef PADDLE_WITH_HETERPS #ifdef PADDLE_WITH_HETERPS
// if (gpups_mode) pthread_rwlock_rdlock(rw_lock.get()); if (search_level == 2) total_memory_cost = 0;
if (count > fixed_load_edges && search_level == 2) { const uint64_t fixed_load_edges = 1000000;
dump_edges_to_ssd(idx);
VLOG(0) << "dumping edges to ssd, edge count is reset to 0";
clear_graph(idx);
count = 0;
}
#endif #endif
int idx = 0;
if (edge_type == "") {
VLOG(0) << "edge_type not specified, loading edges to " << id_to_edge[0]
<< " part";
} else {
if (edge_to_id.find(edge_type) == edge_to_id.end()) {
VLOG(0) << "edge_type " << edge_type
<< " is not defined, nothing will be loaded";
return 0;
} }
idx = edge_to_id[edge_type];
} }
VLOG(0) << valid_count << "/" << count << " edges are loaded successfully in "
<< path;
// Build Sampler j auto paths = paddle::string::split_string<std::string>(path, ";");
uint64_t count = 0;
uint64_t valid_count = 0;
VLOG(0) << "Begin GraphTable::load_edges() edge_type[" << edge_type << "]";
if (FLAGS_graph_load_in_parallel) {
std::vector<std::future<std::pair<uint64_t, uint64_t>>> tasks;
for (int i = 0; i < paths.size(); i++) {
tasks.push_back(load_node_edge_task_pool->enqueue(
[&, i, idx, this]() -> std::pair<uint64_t, uint64_t> {
return parse_edge_file(paths[i], idx, reverse_edge);
}));
}
for (int j = 0; j < (int)tasks.size(); j++) {
auto res = tasks[j].get();
count += res.first;
valid_count += res.second;
}
} else {
for (auto path : paths) {
auto res = parse_edge_file(path, idx, reverse_edge);
count += res.first;
valid_count += res.second;
}
}
VLOG(0) << valid_count << "/" << count << " edge_type[" << edge_type
<< "] edges are loaded successfully";
#ifdef PADDLE_WITH_HETERPS #ifdef PADDLE_WITH_HETERPS
// if (gpups_mode) pthread_rwlock_rdlock(rw_lock.get());
if (search_level == 2) { if (search_level == 2) {
if (count > 0) { if (count > 0) {
dump_edges_to_ssd(idx); dump_edges_to_ssd(idx);
...@@ -1197,31 +1466,65 @@ int32_t GraphTable::load_edges(const std::string &path, ...@@ -1197,31 +1466,65 @@ int32_t GraphTable::load_edges(const std::string &path,
return 0; return 0;
} }
#endif #endif
if (!build_sampler_on_cpu) {
// To reduce memory overhead, CPU samplers won't be created in gpugraph.
// In order not to affect the sampler function of other scenario,
// this optimization is only performed in load_edges function.
VLOG(0) << "run in gpugraph mode!";
} else {
std::string sample_type = "random";
VLOG(0) << "build sampler ... ";
for (auto &shard : edge_shards[idx]) { for (auto &shard : edge_shards[idx]) {
auto bucket = shard->get_bucket(); auto bucket = shard->get_bucket();
for (size_t i = 0; i < bucket.size(); i++) { for (size_t i = 0; i < bucket.size(); i++) {
bucket[i]->build_sampler(sample_type); bucket[i]->build_sampler(sample_type);
} }
} }
}
return 0; return 0;
} }
Node *GraphTable::find_node(int type_id, int idx, int64_t id) { Node *GraphTable::find_node(int type_id, uint64_t id) {
size_t shard_id = id % shard_num;
if (shard_id >= shard_end || shard_id < shard_start) {
return nullptr;
}
Node *node = nullptr;
size_t index = shard_id - shard_start;
auto &search_shards = type_id == 0 ? edge_shards : feature_shards;
for (auto &search_shard : search_shards) {
PADDLE_ENFORCE_NOT_NULL(search_shard[index],
paddle::platform::errors::InvalidArgument(
"search_shard[%d] should not be null.", index));
node = search_shard[index]->find_node(id);
if (node != nullptr) {
break;
}
}
return node;
}
Node *GraphTable::find_node(int type_id, int idx, uint64_t id) {
size_t shard_id = id % shard_num; size_t shard_id = id % shard_num;
if (shard_id >= shard_end || shard_id < shard_start) { if (shard_id >= shard_end || shard_id < shard_start) {
return nullptr; return nullptr;
} }
size_t index = shard_id - shard_start; size_t index = shard_id - shard_start;
auto &search_shards = type_id == 0 ? edge_shards[idx] : feature_shards[idx]; auto &search_shards = type_id == 0 ? edge_shards[idx] : feature_shards[idx];
PADDLE_ENFORCE_NOT_NULL(search_shards[index],
paddle::platform::errors::InvalidArgument(
"search_shard[%d] should not be null.", index));
Node *node = search_shards[index]->find_node(id); Node *node = search_shards[index]->find_node(id);
return node; return node;
} }
uint32_t GraphTable::get_thread_pool_index(int64_t node_id) { uint32_t GraphTable::get_thread_pool_index(uint64_t node_id) {
return node_id % shard_num % shard_num_per_server % task_pool_size_; return node_id % shard_num % shard_num_per_server % task_pool_size_;
} }
uint32_t GraphTable::get_thread_pool_index_by_shard_index(int64_t shard_index) { uint32_t GraphTable::get_thread_pool_index_by_shard_index(
uint64_t shard_index) {
return shard_index % shard_num_per_server % task_pool_size_; return shard_index % shard_num_per_server % task_pool_size_;
} }
...@@ -1293,9 +1596,9 @@ int32_t GraphTable::random_sample_nodes(int type_id, ...@@ -1293,9 +1596,9 @@ int32_t GraphTable::random_sample_nodes(int type_id,
} }
} }
for (auto &pair : first_half) second_half.push_back(pair); for (auto &pair : first_half) second_half.push_back(pair);
std::vector<int64_t> res; std::vector<uint64_t> res;
get_nodes_ids_by_ranges(type_id, idx, second_half, res); get_nodes_ids_by_ranges(type_id, idx, second_half, res);
actual_size = res.size() * sizeof(int64_t); actual_size = res.size() * sizeof(uint64_t);
buffer.reset(new char[actual_size]); buffer.reset(new char[actual_size]);
char *pointer = buffer.get(); char *pointer = buffer.get();
memcpy(pointer, res.data(), actual_size); memcpy(pointer, res.data(), actual_size);
...@@ -1303,7 +1606,7 @@ int32_t GraphTable::random_sample_nodes(int type_id, ...@@ -1303,7 +1606,7 @@ int32_t GraphTable::random_sample_nodes(int type_id,
} }
int32_t GraphTable::random_sample_neighbors( int32_t GraphTable::random_sample_neighbors(
int idx, int idx,
int64_t *node_ids, uint64_t *node_ids,
int sample_size, int sample_size,
std::vector<std::shared_ptr<char>> &buffers, std::vector<std::shared_ptr<char>> &buffers,
std::vector<int> &actual_sizes, std::vector<int> &actual_sizes,
...@@ -1323,7 +1626,7 @@ int32_t GraphTable::random_sample_neighbors( ...@@ -1323,7 +1626,7 @@ int32_t GraphTable::random_sample_neighbors(
for (int i = 0; i < (int)seq_id.size(); i++) { for (int i = 0; i < (int)seq_id.size(); i++) {
if (seq_id[i].size() == 0) continue; if (seq_id[i].size() == 0) continue;
tasks.push_back(_shards_task_pool[i]->enqueue([&, i, this]() -> int { tasks.push_back(_shards_task_pool[i]->enqueue([&, i, this]() -> int {
int64_t node_id; uint64_t node_id;
std::vector<std::pair<SampleKey, SampleResult>> r; std::vector<std::pair<SampleKey, SampleResult>> r;
LRUResponse response = LRUResponse::blocked; LRUResponse response = LRUResponse::blocked;
if (use_cache) { if (use_cache) {
...@@ -1369,7 +1672,7 @@ int32_t GraphTable::random_sample_neighbors( ...@@ -1369,7 +1672,7 @@ int32_t GraphTable::random_sample_neighbors(
res.size() * (need_weight ? (Node::id_size + Node::weight_size) res.size() * (need_weight ? (Node::id_size + Node::weight_size)
: Node::id_size); : Node::id_size);
int offset = 0; int offset = 0;
int64_t id; uint64_t id;
float weight; float weight;
char *buffer_addr = new char[actual_size]; char *buffer_addr = new char[actual_size];
if (response == LRUResponse::ok) { if (response == LRUResponse::ok) {
...@@ -1405,13 +1708,13 @@ int32_t GraphTable::random_sample_neighbors( ...@@ -1405,13 +1708,13 @@ int32_t GraphTable::random_sample_neighbors(
} }
int32_t GraphTable::get_node_feat(int idx, int32_t GraphTable::get_node_feat(int idx,
const std::vector<int64_t> &node_ids, const std::vector<uint64_t> &node_ids,
const std::vector<std::string> &feature_names, const std::vector<std::string> &feature_names,
std::vector<std::vector<std::string>> &res) { std::vector<std::vector<std::string>> &res) {
size_t node_num = node_ids.size(); size_t node_num = node_ids.size();
std::vector<std::future<int>> tasks; std::vector<std::future<int>> tasks;
for (size_t idy = 0; idy < node_num; ++idy) { for (size_t idy = 0; idy < node_num; ++idy) {
int64_t node_id = node_ids[idy]; uint64_t node_id = node_ids[idy];
tasks.push_back(_shards_task_pool[get_thread_pool_index(node_id)]->enqueue( tasks.push_back(_shards_task_pool[get_thread_pool_index(node_id)]->enqueue(
[&, idx, idy, node_id]() -> int { [&, idx, idy, node_id]() -> int {
Node *node = find_node(1, idx, node_id); Node *node = find_node(1, idx, node_id);
...@@ -1440,13 +1743,13 @@ int32_t GraphTable::get_node_feat(int idx, ...@@ -1440,13 +1743,13 @@ int32_t GraphTable::get_node_feat(int idx,
int32_t GraphTable::set_node_feat( int32_t GraphTable::set_node_feat(
int idx, int idx,
const std::vector<int64_t> &node_ids, const std::vector<uint64_t> &node_ids,
const std::vector<std::string> &feature_names, const std::vector<std::string> &feature_names,
const std::vector<std::vector<std::string>> &res) { const std::vector<std::vector<std::string>> &res) {
size_t node_num = node_ids.size(); size_t node_num = node_ids.size();
std::vector<std::future<int>> tasks; std::vector<std::future<int>> tasks;
for (size_t idy = 0; idy < node_num; ++idy) { for (size_t idy = 0; idy < node_num; ++idy) {
int64_t node_id = node_ids[idy]; uint64_t node_id = node_ids[idy];
tasks.push_back(_shards_task_pool[get_thread_pool_index(node_id)]->enqueue( tasks.push_back(_shards_task_pool[get_thread_pool_index(node_id)]->enqueue(
[&, idx, idy, node_id]() -> int { [&, idx, idy, node_id]() -> int {
size_t index = node_id % this->shard_num - this->shard_start; size_t index = node_id % this->shard_num - this->shard_start;
...@@ -1469,60 +1772,247 @@ int32_t GraphTable::set_node_feat( ...@@ -1469,60 +1772,247 @@ int32_t GraphTable::set_node_feat(
return 0; return 0;
} }
std::pair<int32_t, std::string> GraphTable::parse_feature( void string_vector_2_string(std::vector<std::string>::iterator strs_begin,
int idx, std::string feat_str) { std::vector<std::string>::iterator strs_end,
char delim,
std::string *output) {
size_t i = 0;
for (std::vector<std::string>::iterator iter = strs_begin; iter != strs_end;
++iter) {
if (i > 0) {
*output += delim;
}
*output += *iter;
++i;
}
}
// Joins the pieces in [strs_begin, strs_end) onto the end of *output,
// placing a single `delim` character between consecutive pieces. Nothing is
// written before the first piece or after the last one, and *output is not
// cleared first.
void string_vector_2_string(
    std::vector<paddle::string::str_ptr>::iterator strs_begin,
    std::vector<paddle::string::str_ptr>::iterator strs_end,
    char delim,
    std::string *output) {
  bool need_delim = false;
  for (auto cur = strs_begin; cur != strs_end; ++cur) {
    if (need_delim) {
      output->push_back(delim);
    }
    // Each piece is a (pointer, length) view; copy exactly `len` bytes.
    output->append(cur->ptr, cur->len);
    need_delim = true;
  }
}
int GraphTable::parse_feature(int idx,
const char *feat_str,
size_t len,
FeatureNode *node) {
// Return (feat_id, btyes) if name are in this->feat_name, else return (-1, // Return (feat_id, btyes) if name are in this->feat_name, else return (-1,
// "") // "")
auto fields = paddle::string::split_string<std::string>(feat_str, " "); thread_local std::vector<paddle::string::str_ptr> fields;
if (feat_id_map[idx].count(fields[0])) { fields.clear();
// if (this->feat_id_map.count(fields[0])) { const char c = feature_separator_.at(0);
int32_t id = this->feat_id_map[idx][fields[0]]; paddle::string::split_string_ptr(feat_str, len, c, &fields);
std::string name = fields[0].to_string();
auto it = feat_id_map[idx].find(name);
if (it != feat_id_map[idx].end()) {
int32_t id = it->second;
std::string *fea_ptr = node->mutable_feature(id);
std::string dtype = this->feat_dtype[idx][id]; std::string dtype = this->feat_dtype[idx][id];
std::vector<std::string> values(fields.begin() + 1, fields.end());
if (dtype == "feasign") { if (dtype == "feasign") {
return std::make_pair<int32_t, std::string>( // string_vector_2_string(fields.begin() + 1, fields.end(), ' ',
int32_t(id), paddle::string::join_strings(values, ' ')); // fea_ptr);
FeatureNode::parse_value_to_bytes<uint64_t>(
fields.begin() + 1, fields.end(), fea_ptr);
return 0;
} else if (dtype == "string") { } else if (dtype == "string") {
return std::make_pair<int32_t, std::string>( string_vector_2_string(fields.begin() + 1, fields.end(), ' ', fea_ptr);
int32_t(id), paddle::string::join_strings(values, ' ')); return 0;
} else if (dtype == "float32") { } else if (dtype == "float32") {
return std::make_pair<int32_t, std::string>( FeatureNode::parse_value_to_bytes<float>(
int32_t(id), FeatureNode::parse_value_to_bytes<float>(values)); fields.begin() + 1, fields.end(), fea_ptr);
return 0;
} else if (dtype == "float64") { } else if (dtype == "float64") {
return std::make_pair<int32_t, std::string>( FeatureNode::parse_value_to_bytes<double>(
int32_t(id), FeatureNode::parse_value_to_bytes<double>(values)); fields.begin() + 1, fields.end(), fea_ptr);
return 0;
} else if (dtype == "int32") { } else if (dtype == "int32") {
return std::make_pair<int32_t, std::string>( FeatureNode::parse_value_to_bytes<int32_t>(
int32_t(id), FeatureNode::parse_value_to_bytes<int32_t>(values)); fields.begin() + 1, fields.end(), fea_ptr);
return 0;
} else if (dtype == "int64") { } else if (dtype == "int64") {
return std::make_pair<int32_t, std::string>( FeatureNode::parse_value_to_bytes<uint64_t>(
int32_t(id), FeatureNode::parse_value_to_bytes<int64_t>(values)); fields.begin() + 1, fields.end(), fea_ptr);
return 0;
} }
} else {
VLOG(2) << "feature_name[" << name << "] is not in feat_id_map, ntype_id["
<< idx << "] feat_id_map_size[" << feat_id_map.size() << "]";
} }
return std::make_pair<int32_t, std::string>(-1, "");
return -1;
}
// Thread-safe merger of per-slice key lists into one shared output.
//
// On construction the caller-owned `output` is resized to `slice_num` slices.
// merge() may then be called from several threads: every slice is protected
// by its own mutex, so appends to different slices do not contend.
// `output` is not owned and must outlive this object.
class MergeShardVector {
 public:
  MergeShardVector(std::vector<std::vector<uint64_t>> *output, int slice_num)
      : _slice_num(slice_num), _mutexs(slice_num), _shard_keys(output) {
    _shard_keys->resize(slice_num);
  }

  // The object guards an external buffer with its own locks; copying or
  // moving it would split that protection, so both are disabled.
  MergeShardVector(const MergeShardVector &) = delete;
  MergeShardVector &operator=(const MergeShardVector &) = delete;

  // Appends shard_keys[i] to slice i of the output for every slice index.
  // Slices missing from `shard_keys` (input shorter than slice_num) are
  // skipped instead of being read out of bounds.
  void merge(const std::vector<std::vector<uint64_t>> &shard_keys) {
    const size_t slices = static_cast<size_t>(_slice_num);
    const size_t limit =
        shard_keys.size() < slices ? shard_keys.size() : slices;
    for (size_t shard_id = 0; shard_id < limit; ++shard_id) {
      const auto &src = shard_keys[shard_id];
      if (src.empty()) {
        continue;  // nothing to append, no need to take the lock
      }
      // lock_guard releases the mutex even if insert() throws.
      std::lock_guard<std::mutex> guard(_mutexs[shard_id]);
      auto &dest = (*_shard_keys)[shard_id];
      dest.insert(dest.end(), src.begin(), src.end());
    }
  }

 private:
  int _slice_num = 0;
  std::vector<std::mutex> _mutexs;  // one mutex per output slice
  std::vector<std::vector<uint64_t>> *_shard_keys;
};
int GraphTable::get_all_id(int type_id,
int slice_num,
std::vector<std::vector<uint64_t>> *output) {
MergeShardVector shard_merge(output, slice_num);
auto &search_shards = type_id == 0 ? edge_shards : feature_shards;
std::vector<std::future<size_t>> tasks;
for (int idx = 0; idx < search_shards.size(); idx++) {
for (int j = 0; j < search_shards[idx].size(); j++) {
tasks.push_back(_shards_task_pool[j % task_pool_size_]->enqueue(
[&search_shards, idx, j, slice_num, &shard_merge]() -> size_t {
std::vector<std::vector<uint64_t>> shard_keys;
size_t num =
search_shards[idx][j]->get_all_id(&shard_keys, slice_num);
// add to shard
shard_merge.merge(shard_keys);
return num;
}));
}
}
for (size_t i = 0; i < tasks.size(); ++i) {
tasks[i].wait();
}
return 0;
} }
std::vector<std::vector<int64_t>> GraphTable::get_all_id(int type_id, int GraphTable::get_all_neighbor_id(
int type_id, int slice_num, std::vector<std::vector<uint64_t>> *output) {
MergeShardVector shard_merge(output, slice_num);
auto &search_shards = type_id == 0 ? edge_shards : feature_shards;
std::vector<std::future<size_t>> tasks;
for (int idx = 0; idx < search_shards.size(); idx++) {
for (int j = 0; j < search_shards[idx].size(); j++) {
tasks.push_back(_shards_task_pool[j % task_pool_size_]->enqueue(
[&search_shards, idx, j, slice_num, &shard_merge]() -> size_t {
std::vector<std::vector<uint64_t>> shard_keys;
size_t num = search_shards[idx][j]->get_all_neighbor_id(&shard_keys,
slice_num);
// add to shard
shard_merge.merge(shard_keys);
return num;
}));
}
}
for (size_t i = 0; i < tasks.size(); ++i) {
tasks[i].wait();
}
return 0;
}
int GraphTable::get_all_id(int type_id,
int idx, int idx,
int slice_num) { int slice_num,
std::vector<std::vector<int64_t>> res(slice_num); std::vector<std::vector<uint64_t>> *output) {
MergeShardVector shard_merge(output, slice_num);
auto &search_shards = type_id == 0 ? edge_shards[idx] : feature_shards[idx]; auto &search_shards = type_id == 0 ? edge_shards[idx] : feature_shards[idx];
std::vector<std::future<std::vector<int64_t>>> tasks; std::vector<std::future<size_t>> tasks;
VLOG(3) << "begin task, task_pool_size_[" << task_pool_size_ << "]";
for (size_t i = 0; i < search_shards.size(); i++) { for (size_t i = 0; i < search_shards.size(); i++) {
tasks.push_back(_shards_task_pool[i % task_pool_size_]->enqueue( tasks.push_back(_shards_task_pool[i % task_pool_size_]->enqueue(
[&search_shards, i]() -> std::vector<int64_t> { [&search_shards, i, slice_num, &shard_merge]() -> size_t {
return search_shards[i]->get_all_id(); std::vector<std::vector<uint64_t>> shard_keys;
size_t num = search_shards[i]->get_all_id(&shard_keys, slice_num);
// add to shard
shard_merge.merge(shard_keys);
return num;
})); }));
} }
for (size_t i = 0; i < tasks.size(); ++i) { for (size_t i = 0; i < tasks.size(); ++i) {
tasks[i].wait(); tasks[i].wait();
} }
for (size_t i = 0; i < tasks.size(); i++) { VLOG(3) << "end task, task_pool_size_[" << task_pool_size_ << "]";
auto ids = tasks[i].get(); return 0;
for (auto &id : ids) res[(uint64_t)(id) % slice_num].push_back(id); }
int GraphTable::get_all_neighbor_id(
int type_id,
int idx,
int slice_num,
std::vector<std::vector<uint64_t>> *output) {
MergeShardVector shard_merge(output, slice_num);
auto &search_shards = type_id == 0 ? edge_shards[idx] : feature_shards[idx];
std::vector<std::future<size_t>> tasks;
VLOG(3) << "begin task, task_pool_size_[" << task_pool_size_ << "]";
for (int i = 0; i < search_shards.size(); i++) {
tasks.push_back(_shards_task_pool[i % task_pool_size_]->enqueue(
[&search_shards, i, slice_num, &shard_merge]() -> size_t {
std::vector<std::vector<uint64_t>> shard_keys;
size_t num =
search_shards[i]->get_all_neighbor_id(&shard_keys, slice_num);
// add to shard
shard_merge.merge(shard_keys);
return num;
}));
} }
return res; for (size_t i = 0; i < tasks.size(); ++i) {
tasks[i].wait();
}
VLOG(3) << "end task, task_pool_size_[" << task_pool_size_ << "]";
return 0;
} }
int GraphTable::get_all_feature_ids(
int type_id,
int idx,
int slice_num,
std::vector<std::vector<uint64_t>> *output) {
MergeShardVector shard_merge(output, slice_num);
auto &search_shards = type_id == 0 ? edge_shards[idx] : feature_shards[idx];
std::vector<std::future<size_t>> tasks;
for (int i = 0; i < search_shards.size(); i++) {
tasks.push_back(_shards_task_pool[i % task_pool_size_]->enqueue(
[&search_shards, i, slice_num, &shard_merge]() -> size_t {
std::vector<std::vector<uint64_t>> shard_keys;
size_t num =
search_shards[i]->get_all_feature_ids(&shard_keys, slice_num);
// add to shard
shard_merge.merge(shard_keys);
return num;
}));
}
for (size_t i = 0; i < tasks.size(); ++i) {
tasks[i].wait();
}
return 0;
}
int32_t GraphTable::pull_graph_list(int type_id, int32_t GraphTable::pull_graph_list(int type_id,
int idx, int idx,
int start, int start,
...@@ -1576,7 +2066,11 @@ int32_t GraphTable::pull_graph_list(int type_id, ...@@ -1576,7 +2066,11 @@ int32_t GraphTable::pull_graph_list(int type_id,
return 0; return 0;
} }
int32_t GraphTable::get_server_index_by_id(int64_t id) { void GraphTable::set_feature_separator(const std::string &ch) {
feature_separator_ = ch;
}
int32_t GraphTable::get_server_index_by_id(uint64_t id) {
return id % shard_num / shard_num_per_server; return id % shard_num / shard_num_per_server;
} }
int32_t GraphTable::Initialize(const TableParameter &config, int32_t GraphTable::Initialize(const TableParameter &config,
...@@ -1617,6 +2111,7 @@ void GraphTable::load_node_weight(int type_id, int idx, std::string path) { ...@@ -1617,6 +2111,7 @@ void GraphTable::load_node_weight(int type_id, int idx, std::string path) {
} }
int32_t GraphTable::Initialize(const GraphParameter &graph) { int32_t GraphTable::Initialize(const GraphParameter &graph) {
task_pool_size_ = graph.task_pool_size(); task_pool_size_ = graph.task_pool_size();
build_sampler_on_cpu = graph.build_sampler_on_cpu();
#ifdef PADDLE_WITH_HETERPS #ifdef PADDLE_WITH_HETERPS
_db = NULL; _db = NULL;
...@@ -1651,6 +2146,8 @@ int32_t GraphTable::Initialize(const GraphParameter &graph) { ...@@ -1651,6 +2146,8 @@ int32_t GraphTable::Initialize(const GraphParameter &graph) {
_shards_task_pool[i].reset(new ::ThreadPool(1)); _shards_task_pool[i].reset(new ::ThreadPool(1));
_shards_task_rng_pool.push_back(paddle::framework::GetCPURandomEngine(0)); _shards_task_rng_pool.push_back(paddle::framework::GetCPURandomEngine(0));
} }
load_node_edge_task_pool.reset(new ::ThreadPool(load_thread_num));
auto graph_feature = graph.graph_feature(); auto graph_feature = graph.graph_feature();
auto node_types = graph.node_types(); auto node_types = graph.node_types();
auto edge_types = graph.edge_types(); auto edge_types = graph.edge_types();
......
...@@ -58,33 +58,80 @@ class GraphShard { ...@@ -58,33 +58,80 @@ class GraphShard {
~GraphShard(); ~GraphShard();
std::vector<Node *> &get_bucket() { return bucket; } std::vector<Node *> &get_bucket() { return bucket; }
std::vector<Node *> get_batch(int start, int end, int step); std::vector<Node *> get_batch(int start, int end, int step);
std::vector<int64_t> get_ids_by_range(int start, int end) { void get_ids_by_range(int start, int end, std::vector<uint64_t> *res) {
std::vector<int64_t> res; res->reserve(res->size() + end - start);
for (int i = start; i < end && i < (int)bucket.size(); i++) { for (int i = start; i < end && i < (int)bucket.size(); i++) {
res.push_back(bucket[i]->get_id()); res->emplace_back(bucket[i]->get_id());
} }
return res;
} }
std::vector<int64_t> get_all_id() { size_t get_all_id(std::vector<std::vector<uint64_t>> *shard_keys,
std::vector<int64_t> res; int slice_num) {
int bucket_num = bucket.size();
shard_keys->resize(slice_num);
for (int i = 0; i < slice_num; ++i) {
(*shard_keys)[i].reserve(bucket_num / slice_num);
}
for (int i = 0; i < bucket_num; i++) {
uint64_t k = bucket[i]->get_id();
(*shard_keys)[k % slice_num].emplace_back(k);
}
return bucket_num;
}
size_t get_all_neighbor_id(std::vector<std::vector<uint64_t>> *total_res,
int slice_num) {
std::vector<uint64_t> keys;
for (size_t i = 0; i < bucket.size(); i++) {
size_t neighbor_size = bucket[i]->get_neighbor_size();
size_t n = keys.size();
keys.resize(n + neighbor_size);
for (size_t j = 0; j < neighbor_size; j++) {
keys[n + j] = bucket[i]->get_neighbor_id(j);
}
}
return dedup2shard_keys(&keys, total_res, slice_num);
}
size_t get_all_feature_ids(std::vector<std::vector<uint64_t>> *total_res,
int slice_num) {
std::vector<uint64_t> keys;
for (int i = 0; i < (int)bucket.size(); i++) { for (int i = 0; i < (int)bucket.size(); i++) {
res.push_back(bucket[i]->get_id()); bucket[i]->get_feature_ids(&keys);
}
return dedup2shard_keys(&keys, total_res, slice_num);
}
size_t dedup2shard_keys(std::vector<uint64_t> *keys,
std::vector<std::vector<uint64_t>> *total_res,
int slice_num) {
size_t num = keys->size();
uint64_t last_key = 0;
// sort key insert to vector
std::sort(keys->begin(), keys->end());
total_res->resize(slice_num);
for (int shard_id = 0; shard_id < slice_num; ++shard_id) {
(*total_res)[shard_id].reserve(num / slice_num);
} }
return res; for (size_t i = 0; i < num; ++i) {
const uint64_t &k = (*keys)[i];
if (i > 0 && last_key == k) {
continue;
} }
GraphNode *add_graph_node(int64_t id); last_key = k;
(*total_res)[k % slice_num].push_back(k);
}
return num;
}
GraphNode *add_graph_node(uint64_t id);
GraphNode *add_graph_node(Node *node); GraphNode *add_graph_node(Node *node);
FeatureNode *add_feature_node(int64_t id); FeatureNode *add_feature_node(uint64_t id, bool is_overlap = true);
Node *find_node(int64_t id); Node *find_node(uint64_t id);
void delete_node(int64_t id); void delete_node(uint64_t id);
void clear(); void clear();
void add_neighbor(int64_t id, int64_t dst_id, float weight); void add_neighbor(uint64_t id, uint64_t dst_id, float weight);
std::unordered_map<int64_t, int> &get_node_location() { std::unordered_map<uint64_t, int> &get_node_location() {
return node_location; return node_location;
} }
private: private:
std::unordered_map<int64_t, int> node_location; std::unordered_map<uint64_t, int> node_location;
std::vector<Node *> bucket; std::vector<Node *> bucket;
}; };
...@@ -92,11 +139,11 @@ enum LRUResponse { ok = 0, blocked = 1, err = 2 }; ...@@ -92,11 +139,11 @@ enum LRUResponse { ok = 0, blocked = 1, err = 2 };
struct SampleKey { struct SampleKey {
int idx; int idx;
int64_t node_key; uint64_t node_key;
size_t sample_size; size_t sample_size;
bool is_weighted; bool is_weighted;
SampleKey(int _idx, SampleKey(int _idx,
int64_t _node_key, uint64_t _node_key,
size_t _sample_size, size_t _sample_size,
bool _is_weighted) { bool _is_weighted) {
idx = _idx; idx = _idx;
...@@ -467,7 +514,7 @@ class GraphTable : public Table { ...@@ -467,7 +514,7 @@ class GraphTable : public Table {
virtual int32_t random_sample_neighbors( virtual int32_t random_sample_neighbors(
int idx, int idx,
int64_t *node_ids, uint64_t *node_ids,
int sample_size, int sample_size,
std::vector<std::shared_ptr<char>> &buffers, std::vector<std::shared_ptr<char>> &buffers,
std::vector<int> &actual_sizes, std::vector<int> &actual_sizes,
...@@ -483,30 +530,62 @@ class GraphTable : public Table { ...@@ -483,30 +530,62 @@ class GraphTable : public Table {
int type_id, int type_id,
int idx, int idx,
std::vector<std::pair<int, int>> ranges, std::vector<std::pair<int, int>> ranges,
std::vector<int64_t> &res); std::vector<uint64_t> &res);
virtual int32_t Initialize() { return 0; } virtual int32_t Initialize() { return 0; }
virtual int32_t Initialize(const TableParameter &config, virtual int32_t Initialize(const TableParameter &config,
const FsClientParameter &fs_config); const FsClientParameter &fs_config);
virtual int32_t Initialize(const GraphParameter &config); virtual int32_t Initialize(const GraphParameter &config);
int32_t Load(const std::string &path, const std::string &param); int32_t Load(const std::string &path, const std::string &param);
int32_t load_node_and_edge_file(std::string etype,
std::string ntype,
std::string epath,
std::string npath,
int part_num,
bool reverse);
std::string get_inverse_etype(std::string &etype);
int32_t load_edges(const std::string &path, int32_t load_edges(const std::string &path,
bool reverse, bool reverse,
const std::string &edge_type); const std::string &edge_type);
std::vector<std::vector<int64_t>> get_all_id(int type, int get_all_id(int type,
int slice_num,
std::vector<std::vector<uint64_t>> *output);
int get_all_neighbor_id(int type,
int slice_num,
std::vector<std::vector<uint64_t>> *output);
int get_all_id(int type,
int idx, int idx,
int slice_num); int slice_num,
int32_t load_nodes(const std::string &path, std::string node_type); std::vector<std::vector<uint64_t>> *output);
int get_all_neighbor_id(int type_id,
int id,
int slice_num,
std::vector<std::vector<uint64_t>> *output);
int get_all_feature_ids(int type,
int idx,
int slice_num,
std::vector<std::vector<uint64_t>> *output);
int32_t load_nodes(const std::string &path,
std::string node_type = std::string());
std::pair<uint64_t, uint64_t> parse_edge_file(const std::string &path,
int idx,
bool reverse);
std::pair<uint64_t, uint64_t> parse_node_file(const std::string &path,
const std::string &node_type,
int idx);
std::pair<uint64_t, uint64_t> parse_node_file(const std::string &path);
int32_t add_graph_node(int idx, int32_t add_graph_node(int idx,
std::vector<int64_t> &id_list, std::vector<uint64_t> &id_list,
std::vector<bool> &is_weight_list); std::vector<bool> &is_weight_list);
int32_t remove_graph_node(int idx, std::vector<int64_t> &id_list); int32_t remove_graph_node(int idx, std::vector<uint64_t> &id_list);
int32_t get_server_index_by_id(int64_t id); int32_t get_server_index_by_id(uint64_t id);
Node *find_node(int type_id, int idx, int64_t id); Node *find_node(int type_id, int idx, uint64_t id);
Node *find_node(int type_id, uint64_t id);
virtual int32_t Pull(TableContext &context) { return 0; } virtual int32_t Pull(TableContext &context) { return 0; }
virtual int32_t Push(TableContext &context) { return 0; } virtual int32_t Push(TableContext &context) { return 0; }
...@@ -531,19 +610,21 @@ class GraphTable : public Table { ...@@ -531,19 +610,21 @@ class GraphTable : public Table {
this->server_num = server_num; this->server_num = server_num;
return 0; return 0;
} }
virtual uint32_t get_thread_pool_index_by_shard_index(int64_t shard_index); virtual uint32_t get_thread_pool_index_by_shard_index(uint64_t shard_index);
virtual uint32_t get_thread_pool_index(int64_t node_id); virtual uint32_t get_thread_pool_index(uint64_t node_id);
virtual std::pair<int32_t, std::string> parse_feature(int idx, virtual int parse_feature(int idx,
std::string feat_str); const char *feat_str,
size_t len,
FeatureNode *node);
virtual int32_t get_node_feat(int idx, virtual int32_t get_node_feat(int idx,
const std::vector<int64_t> &node_ids, const std::vector<uint64_t> &node_ids,
const std::vector<std::string> &feature_names, const std::vector<std::string> &feature_names,
std::vector<std::vector<std::string>> &res); std::vector<std::vector<std::string>> &res);
virtual int32_t set_node_feat( virtual int32_t set_node_feat(
int idx, int idx,
const std::vector<int64_t> &node_ids, const std::vector<uint64_t> &node_ids,
const std::vector<std::string> &feature_names, const std::vector<std::string> &feature_names,
const std::vector<std::vector<std::string>> &res); const std::vector<std::vector<std::string>> &res);
...@@ -578,22 +659,24 @@ class GraphTable : public Table { ...@@ -578,22 +659,24 @@ class GraphTable : public Table {
virtual void export_partition_files(int idx, std::string file_path); virtual void export_partition_files(int idx, std::string file_path);
virtual char *random_sample_neighbor_from_ssd( virtual char *random_sample_neighbor_from_ssd(
int idx, int idx,
int64_t id, uint64_t id,
int sample_size, int sample_size,
const std::shared_ptr<std::mt19937_64> rng, const std::shared_ptr<std::mt19937_64> rng,
int &actual_size); int &actual_size);
virtual int32_t add_node_to_ssd( virtual int32_t add_node_to_ssd(
int type_id, int idx, int64_t src_id, char *data, int len); int type_id, int idx, uint64_t src_id, char *data, int len);
virtual paddle::framework::GpuPsCommGraph make_gpu_ps_graph( virtual paddle::framework::GpuPsCommGraph make_gpu_ps_graph(
int idx, std::vector<int64_t> ids); int idx, std::vector<uint64_t> ids);
virtual paddle::framework::GpuPsCommGraphFea make_gpu_ps_graph_fea(
std::vector<uint64_t> &node_ids, int slot_num);
int32_t Load_to_ssd(const std::string &path, const std::string &param); int32_t Load_to_ssd(const std::string &path, const std::string &param);
int64_t load_graph_to_memory_from_ssd(int idx, std::vector<int64_t> &ids); int64_t load_graph_to_memory_from_ssd(int idx, std::vector<uint64_t> &ids);
int32_t make_complementary_graph(int idx, int64_t byte_size); int32_t make_complementary_graph(int idx, int64_t byte_size);
int32_t dump_edges_to_ssd(int idx); int32_t dump_edges_to_ssd(int idx);
int32_t get_partition_num(int idx) { return partitions[idx].size(); } int32_t get_partition_num(int idx) { return partitions[idx].size(); }
std::vector<int64_t> get_partition(int idx, int index) { std::vector<uint64_t> get_partition(int idx, int index) {
if (idx >= partitions.size() || index >= partitions[idx].size()) if (idx >= (int)partitions.size() || index >= (int)partitions[idx].size())
return std::vector<int64_t>(); return std::vector<uint64_t>();
return partitions[idx][index]; return partitions[idx][index];
} }
int32_t load_edges_to_ssd(const std::string &path, int32_t load_edges_to_ssd(const std::string &path,
...@@ -603,17 +686,20 @@ class GraphTable : public Table { ...@@ -603,17 +686,20 @@ class GraphTable : public Table {
void set_search_level(int search_level) { this->search_level = search_level; } void set_search_level(int search_level) { this->search_level = search_level; }
int search_level; int search_level;
int64_t total_memory_cost; int64_t total_memory_cost;
std::vector<std::vector<std::vector<int64_t>>> partitions; std::vector<std::vector<std::vector<uint64_t>>> partitions;
int next_partition; int next_partition;
#endif #endif
virtual int32_t add_comm_edge(int idx, int64_t src_id, int64_t dst_id); virtual int32_t add_comm_edge(int idx, uint64_t src_id, uint64_t dst_id);
virtual int32_t build_sampler(int idx, std::string sample_type = "random"); virtual int32_t build_sampler(int idx, std::string sample_type = "random");
void set_feature_separator(const std::string &ch);
std::vector<std::vector<GraphShard *>> edge_shards, feature_shards; std::vector<std::vector<GraphShard *>> edge_shards, feature_shards;
size_t shard_start, shard_end, server_num, shard_num_per_server, shard_num; size_t shard_start, shard_end, server_num, shard_num_per_server, shard_num;
int task_pool_size_ = 24; int task_pool_size_ = 24;
int load_thread_num = 160;
const int random_sample_nodes_ranges = 3; const int random_sample_nodes_ranges = 3;
std::vector<std::vector<std::unordered_map<int64_t, double>>> node_weight; std::vector<std::vector<std::unordered_map<uint64_t, double>>> node_weight;
std::vector<std::vector<std::string>> feat_name; std::vector<std::vector<std::string>> feat_name;
std::vector<std::vector<std::string>> feat_dtype; std::vector<std::vector<std::string>> feat_dtype;
std::vector<std::vector<int32_t>> feat_shape; std::vector<std::vector<int32_t>> feat_shape;
...@@ -625,21 +711,24 @@ class GraphTable : public Table { ...@@ -625,21 +711,24 @@ class GraphTable : public Table {
std::vector<std::shared_ptr<::ThreadPool>> _shards_task_pool; std::vector<std::shared_ptr<::ThreadPool>> _shards_task_pool;
std::vector<std::shared_ptr<std::mt19937_64>> _shards_task_rng_pool; std::vector<std::shared_ptr<std::mt19937_64>> _shards_task_rng_pool;
std::shared_ptr<::ThreadPool> load_node_edge_task_pool;
std::shared_ptr<ScaledLRU<SampleKey, SampleResult>> scaled_lru; std::shared_ptr<ScaledLRU<SampleKey, SampleResult>> scaled_lru;
std::unordered_set<int64_t> extra_nodes; std::unordered_set<uint64_t> extra_nodes;
std::unordered_map<int64_t, size_t> extra_nodes_to_thread_index; std::unordered_map<uint64_t, size_t> extra_nodes_to_thread_index;
bool use_cache, use_duplicate_nodes; bool use_cache, use_duplicate_nodes;
int cache_size_limit; int cache_size_limit;
int cache_ttl; int cache_ttl;
mutable std::mutex mutex_; mutable std::mutex mutex_;
bool build_sampler_on_cpu;
std::shared_ptr<pthread_rwlock_t> rw_lock; std::shared_ptr<pthread_rwlock_t> rw_lock;
#ifdef PADDLE_WITH_HETERPS #ifdef PADDLE_WITH_HETERPS
// paddle::framework::GpuPsGraphTable gpu_graph_table; // paddle::framework::GpuPsGraphTable gpu_graph_table;
paddle::distributed::RocksDBHandler *_db; paddle::distributed::RocksDBHandler *_db;
// std::shared_ptr<::ThreadPool> graph_sample_pool; // std::shared_ptr<::ThreadPool> graph_sample_pool;
// std::shared_ptr<GraphSampler> graph_sampler; // std::shared_ptr<GraphSampler> graph_sampler;
// REGISTER_GRAPH_FRIEND_CLASS(2, CompleteGraphSampler, BasicBfsGraphSampler) // REGISTER_GRAPH_FRIEND_CLASS(2, CompleteGraphSampler, BasicBfsGraphSampler)
#endif #endif
std::string feature_separator_ = std::string(" ");
}; };
/* /*
...@@ -657,7 +746,7 @@ class CompleteGraphSampler : public GraphSampler { ...@@ -657,7 +746,7 @@ class CompleteGraphSampler : public GraphSampler {
protected: protected:
GraphTable *graph_table; GraphTable *graph_table;
std::vector<std::vector<paddle::framework::GpuPsGraphNode>> sample_nodes; std::vector<std::vector<paddle::framework::GpuPsGraphNode>> sample_nodes;
std::vector<std::vector<int64_t>> sample_neighbors; std::vector<std::vector<uint64_t>> sample_neighbors;
// std::vector<GpuPsCommGraph> sample_res; // std::vector<GpuPsCommGraph> sample_res;
// std::shared_ptr<std::mt19937_64> random; // std::shared_ptr<std::mt19937_64> random;
int gpu_num; int gpu_num;
...@@ -676,11 +765,11 @@ class BasicBfsGraphSampler : public GraphSampler { ...@@ -676,11 +765,11 @@ class BasicBfsGraphSampler : public GraphSampler {
GraphTable *graph_table; GraphTable *graph_table;
// std::vector<std::vector<GpuPsGraphNode>> sample_nodes; // std::vector<std::vector<GpuPsGraphNode>> sample_nodes;
std::vector<std::vector<paddle::framework::GpuPsGraphNode>> sample_nodes; std::vector<std::vector<paddle::framework::GpuPsGraphNode>> sample_nodes;
std::vector<std::vector<int64_t>> sample_neighbors; std::vector<std::vector<uint64_t>> sample_neighbors;
size_t gpu_num; size_t gpu_num;
int init_search_size, node_num_for_each_shard, edge_num_for_each_node; int init_search_size, node_num_for_each_shard, edge_num_for_each_node;
int rounds, interval; int rounds, interval;
std::vector<std::unordered_map<int64_t, std::vector<int64_t>>> std::vector<std::unordered_map<uint64_t, std::vector<uint64_t>>>
sample_neighbors_map; sample_neighbors_map;
}; };
#endif #endif
......
...@@ -16,10 +16,15 @@ ...@@ -16,10 +16,15 @@
#include <cstring> #include <cstring>
#include <iostream> #include <iostream>
#include <memory> #include <memory>
#include <set>
#include <sstream> #include <sstream>
#include <vector> #include <vector>
#include "glog/logging.h"
#include "paddle/fluid/distributed/ps/table/graph/graph_weighted_sampler.h" #include "paddle/fluid/distributed/ps/table/graph/graph_weighted_sampler.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/string/string_helper.h"
namespace paddle { namespace paddle {
namespace distributed { namespace distributed {
...@@ -30,6 +35,7 @@ class Node { ...@@ -30,6 +35,7 @@ class Node {
virtual ~Node() {} virtual ~Node() {}
static int id_size, int_size, weight_size; static int id_size, int_size, weight_size;
uint64_t get_id() { return id; } uint64_t get_id() { return id; }
int64_t get_py_id() { return (int64_t)id; }
void set_id(uint64_t id) { this->id = id; } void set_id(uint64_t id) { this->id = id; }
virtual void build_edges(bool is_weighted) {} virtual void build_edges(bool is_weighted) {}
...@@ -46,7 +52,11 @@ class Node { ...@@ -46,7 +52,11 @@ class Node {
virtual void to_buffer(char *buffer, bool need_feature); virtual void to_buffer(char *buffer, bool need_feature);
virtual void recover_from_buffer(char *buffer); virtual void recover_from_buffer(char *buffer);
virtual std::string get_feature(int idx) { return std::string(""); } virtual std::string get_feature(int idx) { return std::string(""); }
virtual void set_feature(int idx, std::string str) {} virtual int get_feature_ids(std::vector<uint64_t> *res) const { return 0; }
virtual int get_feature_ids(int slot_idx, std::vector<uint64_t> *res) const {
return 0;
}
virtual void set_feature(int idx, const std::string &str) {}
virtual void set_feature_size(int size) {} virtual void set_feature_size(int size) {}
virtual int get_feature_size() { return 0; } virtual int get_feature_size() { return 0; }
virtual size_t get_neighbor_size() { return 0; } virtual size_t get_neighbor_size() { return 0; }
...@@ -95,7 +105,64 @@ class FeatureNode : public Node { ...@@ -95,7 +105,64 @@ class FeatureNode : public Node {
} }
} }
virtual void set_feature(int idx, std::string str) { virtual int get_feature_ids(std::vector<uint64_t> *res) const {
PADDLE_ENFORCE_NOT_NULL(res,
paddle::platform::errors::InvalidArgument(
"get_feature_ids res should not be null"));
errno = 0;
for (auto &feature_item : feature) {
const uint64_t *feas = (const uint64_t *)(feature_item.c_str());
size_t num = feature_item.length() / sizeof(uint64_t);
CHECK((feature_item.length() % sizeof(uint64_t)) == 0)
<< "bad feature_item: [" << feature_item << "]";
size_t n = res->size();
res->resize(n + num);
for (size_t i = 0; i < num; ++i) {
(*res)[n + i] = feas[i];
}
}
PADDLE_ENFORCE_EQ(
errno,
0,
paddle::platform::errors::InvalidArgument(
"get_feature_ids get errno should be 0, but got %d.", errno));
return 0;
}
// Decodes the feature stored in slot `slot_idx` as a packed array of
// uint64_t ids and writes it to *res (previous contents are discarded).
//
// A slot index outside [0, feature.size()) leaves *res empty. The stored
// bytes must be a whole number of uint64_t values, otherwise CHECK aborts.
// Always returns 0.
virtual int get_feature_ids(int slot_idx, std::vector<uint64_t> *res) const {
  PADDLE_ENFORCE_NOT_NULL(res,
                          paddle::platform::errors::InvalidArgument(
                              "get_feature_ids res should not be null"));
  res->clear();
  errno = 0;
  // Reject negative indices as well: they would pass a plain `<` test and
  // index the vector out of bounds.
  if (slot_idx >= 0 && slot_idx < static_cast<int>(this->feature.size())) {
    const std::string &s = this->feature[slot_idx];
    CHECK((s.length() % sizeof(uint64_t)) == 0)
        << "bad feature_item: [" << s << "]";
    const size_t num = s.length() / sizeof(uint64_t);
    res->resize(num);
    if (num > 0) {
      // Copy bytes instead of reading through a casted uint64_t pointer:
      // the string buffer carries no alignment guarantee for uint64_t.
      std::memcpy(res->data(), s.data(), num * sizeof(uint64_t));
    }
  }
  // NOTE(review): nothing above is expected to set errno; the check is kept
  // from the original implementation -- confirm whether it is still needed.
  PADDLE_ENFORCE_EQ(
      errno,
      0,
      paddle::platform::errors::InvalidArgument(
          "get_feature_ids get errno should be 0, but got %d.", errno));
  return 0;
}
// Returns a pointer to the feature string stored in slot `idx`, growing the
// slot list to idx + 1 entries first when `idx` is past the current end.
// The pointer refers to an element of `feature`.
virtual std::string *mutable_feature(int idx) {
  const int slot_count = (int)this->feature.size();
  if (slot_count <= idx) {
    this->feature.resize(idx + 1);
  }
  std::string &slot = this->feature[idx];
  return &slot;
}
virtual void set_feature(int idx, const std::string &str) {
if (idx >= (int)this->feature.size()) { if (idx >= (int)this->feature.size()) {
this->feature.resize(idx + 1); this->feature.resize(idx + 1);
} }
...@@ -117,6 +184,23 @@ class FeatureNode : public Node { ...@@ -117,6 +184,23 @@ class FeatureNode : public Node {
return std::string(buffer, Tsize); return std::string(buffer, Tsize);
} }
// Parses every string in [feat_str_begin, feat_str_end) as a value of type T
// and stores the raw bytes of the parsed values, back to back, in *output.
//
// *output is overwritten (not appended to) and ends up exactly
// sizeof(T) * (feat_str_end - feat_str_begin) bytes long; an empty range
// yields an empty string.
template <typename T>
static void parse_value_to_bytes(
    std::vector<std::string>::iterator feat_str_begin,
    std::vector<std::string>::iterator feat_str_end,
    std::string *output) {
  const size_t feat_str_size = feat_str_end - feat_str_begin;
  // Build the result in a std::string instead of a runtime-sized stack array.
  std::string bytes(sizeof(T) * feat_str_size, '\0');
  for (size_t i = 0; i < feat_str_size; i++) {
    T v{};
    std::stringstream ss(*(feat_str_begin + i));
    ss >> v;
    std::memcpy(&bytes[sizeof(T) * i], &v, sizeof(T));
  }
  // The payload is binary and may contain zero bytes, so the length must be
  // passed explicitly; assigning from a bare char pointer would stop at the
  // first '\0'.
  output->assign(bytes.data(), bytes.size());
}
template <typename T> template <typename T>
static std::vector<T> parse_bytes_to_array(std::string feat_str) { static std::vector<T> parse_bytes_to_array(std::string feat_str) {
T v; T v;
...@@ -131,8 +215,28 @@ class FeatureNode : public Node { ...@@ -131,8 +215,28 @@ class FeatureNode : public Node {
return out; return out;
} }
// Parses every str_ptr in [feat_str_begin, feat_str_end) as a value of type T
// and appends the raw bytes of the parsed values to *output. Bytes already
// present in *output are kept.
template <typename T>
static void parse_value_to_bytes(
    std::vector<paddle::string::str_ptr>::iterator feat_str_begin,
    std::vector<paddle::string::str_ptr>::iterator feat_str_end,
    std::string *output) {
  const size_t value_count = feat_str_end - feat_str_begin;
  const size_t added_bytes = sizeof(T) * value_count;
  const size_t old_len = output->length();
  output->resize(old_len + added_bytes);
  // Values are parsed straight into the freshly added tail of the string.
  T *dst = (T *)(&(*output)[old_len]);
  thread_local paddle::string::str_ptr_stream ss;
  auto token = feat_str_begin;
  for (size_t i = 0; i < value_count; ++i, ++token) {
    ss.reset(*token);
    ss >> dst[i];
  }
}
protected: protected:
std::vector<std::string> feature; std::vector<std::string> feature;
}; };
} // namespace distributed } // namespace distributed
} // namespace paddle } // namespace paddle
...@@ -41,14 +41,14 @@ namespace paddle { ...@@ -41,14 +41,14 @@ namespace paddle {
namespace distributed { namespace distributed {
int32_t MemorySparseTable::Initialize() { int32_t MemorySparseTable::Initialize() {
_shards_task_pool.resize(_task_pool_size);
for (size_t i = 0; i < _shards_task_pool.size(); ++i) {
_shards_task_pool[i].reset(new ::ThreadPool(1));
}
auto& profiler = CostProfiler::instance(); auto& profiler = CostProfiler::instance();
profiler.register_profiler("pserver_sparse_update_all"); profiler.register_profiler("pserver_sparse_update_all");
profiler.register_profiler("pserver_sparse_select_all"); profiler.register_profiler("pserver_sparse_select_all");
InitializeValue(); InitializeValue();
_shards_task_pool.resize(_task_pool_size);
for (int i = 0; i < _shards_task_pool.size(); ++i) {
_shards_task_pool[i].reset(new ::ThreadPool(1));
}
VLOG(0) << "initalize MemorySparseTable succ"; VLOG(0) << "initalize MemorySparseTable succ";
return 0; return 0;
} }
...@@ -65,9 +65,13 @@ int32_t MemorySparseTable::InitializeValue() { ...@@ -65,9 +65,13 @@ int32_t MemorySparseTable::InitializeValue() {
_real_local_shard_num = _real_local_shard_num =
_real_local_shard_num < 0 ? 0 : _real_local_shard_num; _real_local_shard_num < 0 ? 0 : _real_local_shard_num;
} }
#ifdef PADDLE_WITH_HETERPS
_task_pool_size = _sparse_table_shard_num;
#endif
VLOG(1) << "memory sparse table _avg_local_shard_num: " VLOG(1) << "memory sparse table _avg_local_shard_num: "
<< _avg_local_shard_num << _avg_local_shard_num
<< " _real_local_shard_num: " << _real_local_shard_num; << " _real_local_shard_num: " << _real_local_shard_num
<< " _task_pool_size:" << _task_pool_size;
_local_shards.reset(new shard_type[_real_local_shard_num]); _local_shards.reset(new shard_type[_real_local_shard_num]);
...@@ -336,7 +340,11 @@ int32_t MemorySparseTable::Save(const std::string& dirname, ...@@ -336,7 +340,11 @@ int32_t MemorySparseTable::Save(const std::string& dirname,
size_t file_start_idx = _avg_local_shard_num * _shard_idx; size_t file_start_idx = _avg_local_shard_num * _shard_idx;
#ifdef PADDLE_WITH_GPU_GRAPH
int thread_num = _real_local_shard_num;
#else
int thread_num = _real_local_shard_num < 20 ? _real_local_shard_num : 20; int thread_num = _real_local_shard_num < 20 ? _real_local_shard_num : 20;
#endif
omp_set_num_threads(thread_num); omp_set_num_threads(thread_num);
#pragma omp parallel for schedule(dynamic) #pragma omp parallel for schedule(dynamic)
for (int i = 0; i < _real_local_shard_num; ++i) { for (int i = 0; i < _real_local_shard_num; ++i) {
......
...@@ -112,7 +112,7 @@ class MemorySparseTable : public Table { ...@@ -112,7 +112,7 @@ class MemorySparseTable : public Table {
virtual int32_t LoadPatch(const std::vector<std::string>& file_list, virtual int32_t LoadPatch(const std::vector<std::string>& file_list,
int save_param); int save_param);
const int _task_pool_size = 24; int _task_pool_size = 24;
int _avg_local_shard_num; int _avg_local_shard_num;
int _real_local_shard_num; int _real_local_shard_num;
int _sparse_table_shard_num; int _sparse_table_shard_num;
......
...@@ -126,13 +126,20 @@ message TableParameter { ...@@ -126,13 +126,20 @@ message TableParameter {
message TableAccessorParameter { message TableAccessorParameter {
optional string accessor_class = 1; optional string accessor_class = 1;
optional uint32 fea_dim = 4 [ default = 11 ]; optional uint32 fea_dim = 4 [ default = 11 ]; // field size of one value
optional uint32 embedx_dim = 5 [ default = 8 ]; optional uint32 embedx_dim = 5 [ default = 8 ]; // embedx feature size
optional uint32 embedx_threshold = 6 [ default = 10 ]; optional uint32 embedx_threshold = 6
[ default = 10 ]; // embedx feature create threshold
optional CtrAccessorParameter ctr_accessor_param = 7; optional CtrAccessorParameter ctr_accessor_param = 7;
repeated TableAccessorSaveParameter table_accessor_save_param = 8; repeated TableAccessorSaveParameter table_accessor_save_param = 8;
optional SparseCommonSGDRuleParameter embed_sgd_param = 10; optional SparseCommonSGDRuleParameter embed_sgd_param = 10;
optional SparseCommonSGDRuleParameter embedx_sgd_param = 11; optional SparseCommonSGDRuleParameter embedx_sgd_param = 11;
optional GraphSGDParameter graph_sgd_param = 12;
}
// Settings carried by TableAccessorParameter.graph_sgd_param.
message GraphSGDParameter {
// NOTE(review): presumably the slot id that carries graph node ids - confirm
// against the accessor that reads it. Defaults to 9008 when unset.
optional uint32 nodeid_slot = 1 [ default = 9008 ];
// Defaults to 0.05 when unset.
optional float feature_learning_rate = 2 [ default = 0.05 ];
}
message CtrAccessorParameter { message CtrAccessorParameter {
...@@ -232,6 +239,7 @@ message GraphParameter { ...@@ -232,6 +239,7 @@ message GraphParameter {
optional string table_type = 9 [ default = "" ]; optional string table_type = 9 [ default = "" ];
optional int32 shard_num = 10 [ default = 127 ]; optional int32 shard_num = 10 [ default = 127 ];
optional int32 search_level = 11 [ default = 1 ]; optional int32 search_level = 11 [ default = 1 ];
optional bool build_sampler_on_cpu = 12 [ default = true ];
} }
message GraphFeature { message GraphFeature {
......
...@@ -740,6 +740,19 @@ if(WITH_DISTRIBUTE) ...@@ -740,6 +740,19 @@ if(WITH_DISTRIBUTE)
set_source_files_properties( set_source_files_properties(
heterxpu_trainer.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) heterxpu_trainer.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
elseif(WITH_PSCORE) elseif(WITH_PSCORE)
# cc_library(executor SRCS executor.cc multi_trainer.cc pipeline_trainer.cc dataset_factory.cc
# dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc
# heterxpu_trainer.cc heter_pipeline_trainer.cc
# data_feed.cc device_worker.cc hogwild_worker.cc hetercpu_worker.cc
# downpour_worker.cc downpour_lite_worker.cc downpour_worker_opt.cc data_feed.cu
# pull_dense_worker.cc section_worker.cc heter_section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry
# device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog
# index_sampler index_wrapper sampler index_dataset_proto
# lod_rank_table fs shell fleet_wrapper heter_wrapper box_wrapper metrics lodtensor_printer feed_fetch_method
# graph_to_program_pass variable_helper timer monitor
# heter_service_proto fleet heter_server brpc fleet_executor
# graph_gpu_wrapper)
cc_library( cc_library(
executor executor
SRCS executor.cc SRCS executor.cc
...@@ -1001,6 +1014,25 @@ cc_library( ...@@ -1001,6 +1014,25 @@ cc_library(
DEPS parallel_executor) DEPS parallel_executor)
if(WITH_PSCORE) if(WITH_PSCORE)
get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS) get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS)
if(WITH_HETERPS)
cc_test(
dist_multi_trainer_test
SRCS dist_multi_trainer_test.cc
DEPS conditional_block_op executor gloo_wrapper ${RPC_DEPS}
graph_gpu_wrapper)
cc_test(
heter_pipeline_trainer_test
SRCS heter_pipeline_trainer_test.cc
DEPS conditional_block_op
scale_op
heter_listen_and_serv_op
executor
heter_server
gloo_wrapper
eigen_function
${RPC_DEPS}
graph_gpu_wrapper)
else()
cc_test( cc_test(
dist_multi_trainer_test dist_multi_trainer_test
SRCS dist_multi_trainer_test.cc SRCS dist_multi_trainer_test.cc
...@@ -1016,6 +1048,7 @@ if(WITH_PSCORE) ...@@ -1016,6 +1048,7 @@ if(WITH_PSCORE)
gloo_wrapper gloo_wrapper
eigen_function eigen_function
${RPC_DEPS}) ${RPC_DEPS})
endif()
else() else()
cc_test( cc_test(
dist_multi_trainer_test dist_multi_trainer_test
......
...@@ -2108,6 +2108,9 @@ void SlotRecordInMemoryDataFeed::Init(const DataFeedDesc& data_feed_desc) { ...@@ -2108,6 +2108,9 @@ void SlotRecordInMemoryDataFeed::Init(const DataFeedDesc& data_feed_desc) {
} else { } else {
so_parser_name_.clear(); so_parser_name_.clear();
} }
#if defined(PADDLE_WITH_GPU_GRAPH) && defined(PADDLE_WITH_HETERPS)
gpu_graph_data_generator_.SetConfig(data_feed_desc);
#endif
} }
void SlotRecordInMemoryDataFeed::LoadIntoMemory() { void SlotRecordInMemoryDataFeed::LoadIntoMemory() {
...@@ -2644,6 +2647,9 @@ bool SlotRecordInMemoryDataFeed::Start() { ...@@ -2644,6 +2647,9 @@ bool SlotRecordInMemoryDataFeed::Start() {
#if defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_HETERPS) #if defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_HETERPS)
CHECK(paddle::platform::is_gpu_place(this->place_)); CHECK(paddle::platform::is_gpu_place(this->place_));
pack_ = BatchGpuPackMgr().get(this->GetPlace(), used_slots_info_); pack_ = BatchGpuPackMgr().get(this->GetPlace(), used_slots_info_);
#endif
#if defined(PADDLE_WITH_GPU_GRAPH) && defined(PADDLE_WITH_HETERPS)
gpu_graph_data_generator_.AllocResource(this->place_, feed_vec_);
#endif #endif
return true; return true;
} }
...@@ -2651,7 +2657,7 @@ bool SlotRecordInMemoryDataFeed::Start() { ...@@ -2651,7 +2657,7 @@ bool SlotRecordInMemoryDataFeed::Start() {
int SlotRecordInMemoryDataFeed::Next() { int SlotRecordInMemoryDataFeed::Next() {
#ifdef _LINUX #ifdef _LINUX
this->CheckStart(); this->CheckStart();
if (!gpu_graph_mode_) {
VLOG(3) << "enable heter next: " << offset_index_ VLOG(3) << "enable heter next: " << offset_index_
<< " batch_offsets: " << batch_offsets_.size(); << " batch_offsets: " << batch_offsets_.size();
if (offset_index_ >= batch_offsets_.size()) { if (offset_index_ >= batch_offsets_.size()) {
...@@ -2672,6 +2678,12 @@ int SlotRecordInMemoryDataFeed::Next() { ...@@ -2672,6 +2678,12 @@ int SlotRecordInMemoryDataFeed::Next() {
VLOG(3) << "enable heter next: " << offset_index_ VLOG(3) << "enable heter next: " << offset_index_
<< " batch_offsets: " << batch_offsets_.size() << " batch_offsets: " << batch_offsets_.size()
<< " baych_size: " << this->batch_size_; << " baych_size: " << this->batch_size_;
} else {
VLOG(3) << "datafeed in gpu graph mode";
#if defined(PADDLE_WITH_GPU_GRAPH) && defined(PADDLE_WITH_HETERPS)
this->batch_size_ = gpu_graph_data_generator_.GenerateBatch();
#endif
}
return this->batch_size_; return this->batch_size_;
#else #else
......
...@@ -18,6 +18,15 @@ limitations under the License. */ ...@@ -18,6 +18,15 @@ limitations under the License. */
#if defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_HETERPS) #if defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_HETERPS)
#include "paddle/fluid/framework/data_feed.h" #include "paddle/fluid/framework/data_feed.h"
#include <thrust/device_ptr.h>
#include <thrust/random.h>
#include <thrust/shuffle.h>
#include <sstream>
#include <vector>
#include "cub/cub.cuh"
#include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h"
#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h"
DECLARE_bool(enable_opt_get_features);
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -182,6 +191,1012 @@ void SlotRecordInMemoryDataFeed::CopyForTensor( ...@@ -182,6 +191,1012 @@ void SlotRecordInMemoryDataFeed::CopyForTensor(
cudaStreamSynchronize(stream); cudaStreamSynchronize(stream);
} }
// Writes the value 1 into each of the first `len` entries of `tensor`.
__global__ void GraphFillCVMKernel(int64_t *tensor, int len) {
  CUDA_KERNEL_LOOP(pos, len) {
    tensor[pos] = 1;
  }
}
// Copies each of the `len` keys in `src_tensor` into two consecutive entries
// of `dist_tensor`: key i lands at positions 2 * i and 2 * i + 1.
__global__ void CopyDuplicateKeys(int64_t *dist_tensor,
                                  uint64_t *src_tensor,
                                  int len) {
  CUDA_KERNEL_LOOP(key_pos, len) {
    const auto first = key_pos * 2;
    dist_tensor[first] = src_tensor[key_pos];
    dist_tensor[first + 1] = src_tensor[key_pos];
  }
}
// Advances `state` to its next position, trying GetNextStep(), then
// GetNextCentrolWord(), then GetNextBatch(), and stopping at the first one
// that succeeds. Returns state->len after a successful advance (and calls
// state->Debug()), or 0 when none of the three succeeds.
int GraphDataGenerator::AcquireInstance(BufState *state) {
  // Short-circuit evaluation keeps the original try-order.
  const bool advanced = state->GetNextStep() ||
                        state->GetNextCentrolWord() ||
                        state->GetNextBatch();
  if (!advanced) {
    return 0;
  }
  state->Debug();
  return state->len;
}
// TODO opt
// For every row handled by this block whose walk entries at `src` and
// `src + step` are both non-zero, gathers the slot_num feature values of both
// entries and writes them, compacted, into id_tensor. fill_ins_num is
// increased by the number of rows each block keeps.
//
// Output layout per kept row: 2 * slot_num consecutive values, interleaved as
// feature[src][i], feature[src + step][i] for i = 0 .. slot_num - 1.
__global__ void GraphFillFeatureKernel(uint64_t *id_tensor,
int *fill_ins_num,
uint64_t *walk,
uint64_t *feature,
int *row,
int central_word,
int step,
int len,
int col_num,
int slot_num) {
// NOTE(review): the staging buffer is int32_t while feature and id_tensor are
// uint64_t, so every value is narrowed to 32 bits on the way through -
// confirm that this is intended.
// NOTE(review): each kept row uses 2 * slot_num entries of this buffer, which
// holds CUDA_NUM_THREADS * 16; presumably this assumes slot_num <= 8 and at
// most CUDA_NUM_THREADS threads per block - confirm.
__shared__ int32_t local_key[CUDA_NUM_THREADS * 16];
__shared__ int local_num;
__shared__ int global_num;
size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
// Thread 0 resets the per-block counter before anyone increments it.
if (threadIdx.x == 0) {
local_num = 0;
}
__syncthreads();
if (idx < len) {
int src = row[idx] * col_num + central_word;
if (walk[src] != 0 && walk[src + step] != 0) {
// Reserve the next free position in the block-local staging buffer.
size_t dst = atomicAdd(&local_num, 1);
for (int i = 0; i < slot_num; ++i) {
local_key[dst * 2 * slot_num + i * 2] = feature[src * slot_num + i];
local_key[dst * 2 * slot_num + i * 2 + 1] =
feature[(src + step) * slot_num + i];
}
}
}
__syncthreads();
// Thread 0 reserves local_num output rows for this block; global_num is the
// counter value before the addition, i.e. this block's first output row.
if (threadIdx.x == 0) {
global_num = atomicAdd(fill_ins_num, local_num);
}
__syncthreads();
// The first local_num threads each copy one staged row (2 * slot_num values)
// to the same relative offset in the block's output range.
if (threadIdx.x < local_num) {
for (int i = 0; i < slot_num; ++i) {
id_tensor[(global_num * 2 + 2 * threadIdx.x) * slot_num + i] =
local_key[(2 * threadIdx.x) * slot_num + i];
id_tensor[(global_num * 2 + 2 * threadIdx.x + 1) * slot_num + i] =
local_key[(2 * threadIdx.x + 1) * slot_num + i];
}
}
}
// For every row handled by this block whose walk entries at `src` and
// `src + step` are both non-zero, writes the pair (walk[src],
// walk[src + step]) into id_tensor, compacted so that kept pairs are
// contiguous. fill_ins_num is increased by the number of pairs each block
// keeps.
__global__ void GraphFillIdKernel(uint64_t *id_tensor,
int *fill_ins_num,
uint64_t *walk,
int *row,
int central_word,
int step,
int len,
int col_num) {
// Block-local staging buffer: two ids per kept row.
// NOTE(review): sized for CUDA_NUM_THREADS rows, so presumably the kernel
// must be launched with at most CUDA_NUM_THREADS threads per block - confirm.
__shared__ uint64_t local_key[CUDA_NUM_THREADS * 2];
__shared__ int local_num;
__shared__ int global_num;
size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
// Thread 0 resets the per-block counter before anyone increments it.
if (threadIdx.x == 0) {
local_num = 0;
}
__syncthreads();
// Earlier, non-compacting variant kept for reference:
// int dst = idx * 2;
// id_tensor[dst] = walk[src];
// id_tensor[dst + 1] = walk[src + step];
if (idx < len) {
int src = row[idx] * col_num + central_word;
if (walk[src] != 0 && walk[src + step] != 0) {
// Reserve the next free pair slot in the staging buffer.
size_t dst = atomicAdd(&local_num, 1);
local_key[dst * 2] = walk[src];
local_key[dst * 2 + 1] = walk[src + step];
}
}
__syncthreads();
// Thread 0 reserves local_num output pairs for this block; global_num is the
// counter value before the addition, i.e. this block's first output pair.
if (threadIdx.x == 0) {
global_num = atomicAdd(fill_ins_num, local_num);
}
__syncthreads();
// The first local_num threads each copy one staged pair to the output.
if (threadIdx.x < local_num) {
id_tensor[global_num * 2 + 2 * threadIdx.x] = local_key[2 * threadIdx.x];
id_tensor[global_num * 2 + 2 * threadIdx.x + 1] =
local_key[2 * threadIdx.x + 1];
}
}
// Scatters feature_buf into per-slot output buffers. Entry `slot` of
// id_tensor is read as the address of that slot's output buffer; element
// `ins` of it receives feature_buf[ins * slot_num + slot]. `len` is the total
// number of (slot, instance) combinations to process.
__global__ void GraphFillSlotKernel(uint64_t *id_tensor,
                                    uint64_t *feature_buf,
                                    int len,
                                    int total_ins,
                                    int slot_num) {
  CUDA_KERNEL_LOOP(flat, len) {
    const int slot = flat / total_ins;
    const int ins = flat % total_ins;
    uint64_t *slot_out = (uint64_t *)(id_tensor[slot]);
    slot_out[ins] = feature_buf[ins * slot_num + slot];
  }
}
// Fills per-slot buffers with the sequence 0, 1, 2, ... Entry `slot` of
// id_tensor is read as the address of that slot's buffer; element `ins` of it
// is set to `ins`. `len` is the total number of (slot, element) combinations.
__global__ void GraphFillSlotLodKernelOpt(uint64_t *id_tensor,
                                          int len,
                                          int total_ins) {
  CUDA_KERNEL_LOOP(flat, len) {
    const int slot = flat / total_ins;
    const int ins = flat % total_ins;
    uint64_t *slot_lod = (uint64_t *)(id_tensor[slot]);
    slot_lod[ins] = ins;
  }
}
// Sets id_tensor[i] = i for each of the first `len` entries.
__global__ void GraphFillSlotLodKernel(int64_t *id_tensor, int len) {
  CUDA_KERNEL_LOOP(pos, len) {
    id_tensor[pos] = pos;
  }
}
// Adds id pairs taken from the walk buffer to the instance buffer
// (d_ins_buf_), and, when FLAGS_enable_opt_get_features is off and
// slot_num_ > 0, the matching feature values to d_feature_buf_.
//
// Returns:
//   batch_size_        if the buffer already holds at least one full batch,
//   -1                 if the walk buffer is exhausted and FillWalkBuf()
//                      reports that nothing more can be produced,
//   ins_buf_pair_len_  (the new number of buffered pairs) otherwise.
int GraphDataGenerator::FillInsBuf() {
  if (ins_buf_pair_len_ >= batch_size_) {
    return batch_size_;
  }
  int total_instance = AcquireInstance(&buf_state_);

  VLOG(2) << "total_ins: " << total_instance;
  buf_state_.Debug();

  if (total_instance == 0) {
    // Nothing left in the current walk buffer: produce a new one.
    int res = FillWalkBuf(d_walk_);
    if (!res) {
      // graph iterate complete
      return -1;
    } else {
      total_instance = buf_state_.len;
      VLOG(2) << "total_ins: " << total_instance;
      buf_state_.Debug();
    }

    if (!FLAGS_enable_opt_get_features && slot_num_ > 0) {
      FillFeatureBuf(d_walk_, d_feature_);
      if (debug_mode_) {
        // Dump at most the first 5000 walk entries and their features.
        int len = buf_size_ > 5000 ? 5000 : buf_size_;
        // Heap-backed host copies: the sizes are only known at run time.
        std::vector<uint64_t> h_walk(static_cast<size_t>(len));
        cudaMemcpy(h_walk.data(),
                   d_walk_->ptr(),
                   len * sizeof(uint64_t),
                   cudaMemcpyDeviceToHost);
        std::vector<uint64_t> h_feature(static_cast<size_t>(len * slot_num_));
        cudaMemcpy(h_feature.data(),
                   d_feature_->ptr(),
                   len * slot_num_ * sizeof(uint64_t),
                   cudaMemcpyDeviceToHost);
        for (int i = 0; i < len; ++i) {
          std::stringstream ss;
          for (int j = 0; j < slot_num_; ++j) {
            ss << h_feature[i * slot_num_ + j] << " ";
          }
          VLOG(2) << "aft FillFeatureBuf, gpu[" << gpuid_ << "] walk[" << i
                  << "] = " << (uint64_t)h_walk[i] << " feature["
                  << i * slot_num_ << ".." << (i + 1) * slot_num_
                  << "] = " << ss.str();
        }
      }
    }
  }

  uint64_t *walk = reinterpret_cast<uint64_t *>(d_walk_->ptr());
  uint64_t *ins_buf = reinterpret_cast<uint64_t *>(d_ins_buf_->ptr());
  int *random_row = reinterpret_cast<int *>(d_random_row_->ptr());
  int *d_pair_num = reinterpret_cast<int *>(d_pair_num_->ptr());
  cudaMemsetAsync(d_pair_num, 0, sizeof(int), stream_);
  int len = buf_state_.len;
  // Append the valid id pairs behind the pairs that are already buffered.
  GraphFillIdKernel<<<GET_BLOCKS(len), CUDA_NUM_THREADS, 0, stream_>>>(
      ins_buf + ins_buf_pair_len_ * 2,
      d_pair_num,
      walk,
      random_row + buf_state_.cursor,
      buf_state_.central_word,
      window_step_[buf_state_.step],
      len,
      walk_len_);
  int h_pair_num;
  // Queued on stream_ before the counter is reset again below; the value is
  // only read after the cudaStreamSynchronize() further down.
  cudaMemcpyAsync(
      &h_pair_num, d_pair_num, sizeof(int), cudaMemcpyDeviceToHost, stream_);
  if (!FLAGS_enable_opt_get_features && slot_num_ > 0) {
    uint64_t *feature_buf = reinterpret_cast<uint64_t *>(d_feature_buf_->ptr());
    uint64_t *feature = reinterpret_cast<uint64_t *>(d_feature_->ptr());
    cudaMemsetAsync(d_pair_num, 0, sizeof(int), stream_);
    VLOG(2) << "feature_buf start[" << ins_buf_pair_len_ * 2 * slot_num_
            << "] len[" << len << "]";
    GraphFillFeatureKernel<<<GET_BLOCKS(len), CUDA_NUM_THREADS, 0, stream_>>>(
        feature_buf + ins_buf_pair_len_ * 2 * slot_num_,
        d_pair_num,
        walk,
        feature,
        random_row + buf_state_.cursor,
        buf_state_.central_word,
        window_step_[buf_state_.step],
        len,
        walk_len_,
        slot_num_);
  }

  cudaStreamSynchronize(stream_);
  ins_buf_pair_len_ += h_pair_num;

  if (debug_mode_) {
    // std::vector owns this host copy, so no manual release is needed (the
    // previous code called delete[] on a stack array here).
    std::vector<uint64_t> h_ins_buf(
        static_cast<size_t>(ins_buf_pair_len_ * 2));
    cudaMemcpy(h_ins_buf.data(),
               ins_buf,
               2 * ins_buf_pair_len_ * sizeof(uint64_t),
               cudaMemcpyDeviceToHost);
    VLOG(2) << "h_pair_num = " << h_pair_num
            << ", ins_buf_pair_len = " << ins_buf_pair_len_;
    for (int xx = 0; xx < 2 * ins_buf_pair_len_; xx++) {
      VLOG(2) << "h_ins_buf[" << xx << "]: " << h_ins_buf[xx];
    }
    if (!FLAGS_enable_opt_get_features && slot_num_ > 0) {
      uint64_t *feature_buf =
          reinterpret_cast<uint64_t *>(d_feature_buf_->ptr());
      std::vector<uint64_t> h_feature_buf(
          static_cast<size_t>((batch_size_ * 2 * 2) * slot_num_));
      cudaMemcpy(h_feature_buf.data(),
                 feature_buf,
                 (batch_size_ * 2 * 2) * slot_num_ * sizeof(uint64_t),
                 cudaMemcpyDeviceToHost);
      for (int xx = 0; xx < (batch_size_ * 2 * 2) * slot_num_; xx++) {
        VLOG(2) << "h_feature_buf[" << xx << "]: " << h_feature_buf[xx];
      }
    }
  }
  return ins_buf_pair_len_;
}
// Fills the feed tensors (feed_vec_) for one batch on device gpuid_.
// Returns 0 when no instance could be produced and 1 otherwise.
//
// feed_vec_ usage in this function: [0] ids, [1] show, [2] clk, then for each
// slot i: [3 + 2 * i] slot values and [3 + 2 * i + 1] slot lod.
int GraphDataGenerator::GenerateBatch() {
int total_instance = 0;
platform::CUDADeviceGuard guard(gpuid_);
int res = 0;
if (!gpu_graph_training_) {
// Non-training mode: take up to batch_size_ keys from the first entry of
// h_device_keys_ (starting at cursor_) that still has unread keys.
while (cursor_ < h_device_keys_.size()) {
size_t device_key_size = h_device_keys_[cursor_]->size();
if (infer_node_type_start_[cursor_] >= device_key_size) {
cursor_++;
continue;
}
total_instance =
(infer_node_type_start_[cursor_] + batch_size_ <= device_key_size)
? batch_size_
: device_key_size - infer_node_type_start_[cursor_];
uint64_t *d_type_keys =
reinterpret_cast<uint64_t *>(d_device_keys_[cursor_]->ptr());
d_type_keys += infer_node_type_start_[cursor_];
infer_node_type_start_[cursor_] += total_instance;
VLOG(1) << "in graph_data generator:batch_size = " << batch_size_
<< " instance = " << total_instance;
// Every key is written twice (see CopyDuplicateKeys), so the tensors hold
// twice as many entries as keys were taken.
total_instance *= 2;
id_tensor_ptr_ = feed_vec_[0]->mutable_data<int64_t>({total_instance, 1},
this->place_);
show_tensor_ptr_ =
feed_vec_[1]->mutable_data<int64_t>({total_instance}, this->place_);
clk_tensor_ptr_ =
feed_vec_[2]->mutable_data<int64_t>({total_instance}, this->place_);
CopyDuplicateKeys<<<GET_BLOCKS(total_instance / 2),
CUDA_NUM_THREADS,
0,
stream_>>>(
id_tensor_ptr_, d_type_keys, total_instance / 2);
GraphFillCVMKernel<<<GET_BLOCKS(total_instance),
CUDA_NUM_THREADS,
0,
stream_>>>(show_tensor_ptr_, total_instance);
GraphFillCVMKernel<<<GET_BLOCKS(total_instance),
CUDA_NUM_THREADS,
0,
stream_>>>(clk_tensor_ptr_, total_instance);
break;
}
if (total_instance == 0) {
return 0;
}
} else {
// Training mode: keep filling the pair buffer until it holds a full batch
// or FillInsBuf() reports -1; a partial batch is still emitted.
while (ins_buf_pair_len_ < batch_size_) {
res = FillInsBuf();
if (res == -1) {
if (ins_buf_pair_len_ == 0) {
return 0;
} else {
break;
}
}
}
total_instance =
ins_buf_pair_len_ < batch_size_ ? ins_buf_pair_len_ : batch_size_;
// Each buffered pair contributes two ids.
total_instance *= 2;
id_tensor_ptr_ =
feed_vec_[0]->mutable_data<int64_t>({total_instance, 1}, this->place_);
show_tensor_ptr_ =
feed_vec_[1]->mutable_data<int64_t>({total_instance}, this->place_);
clk_tensor_ptr_ =
feed_vec_[2]->mutable_data<int64_t>({total_instance}, this->place_);
}
// Runtime-sized stack arrays holding the device addresses of the per-slot
// value and lod tensors.
int64_t *slot_tensor_ptr_[slot_num_];
int64_t *slot_lod_tensor_ptr_[slot_num_];
if (slot_num_ > 0) {
for (int i = 0; i < slot_num_; ++i) {
slot_tensor_ptr_[i] = feed_vec_[3 + 2 * i]->mutable_data<int64_t>(
{total_instance, 1}, this->place_);
slot_lod_tensor_ptr_[i] = feed_vec_[3 + 2 * i + 1]->mutable_data<int64_t>(
{total_instance + 1}, this->place_);
}
if (FLAGS_enable_opt_get_features || !gpu_graph_training_) {
// The fill kernels below read the tensor addresses from device memory, so
// the two pointer tables are uploaded first.
cudaMemcpyAsync(d_slot_tensor_ptr_->ptr(),
slot_tensor_ptr_,
sizeof(uint64_t *) * slot_num_,
cudaMemcpyHostToDevice,
stream_);
cudaMemcpyAsync(d_slot_lod_tensor_ptr_->ptr(),
slot_lod_tensor_ptr_,
sizeof(uint64_t *) * slot_num_,
cudaMemcpyHostToDevice,
stream_);
}
}
uint64_t *ins_cursor, *ins_buf;
if (gpu_graph_training_) {
VLOG(2) << "total_instance: " << total_instance
<< ", ins_buf_pair_len = " << ins_buf_pair_len_;
// uint64_t *ins_buf = reinterpret_cast<uint64_t *>(d_ins_buf_->ptr());
// uint64_t *ins_cursor = ins_buf + ins_buf_pair_len_ * 2 - total_instance;
ins_buf = reinterpret_cast<uint64_t *>(d_ins_buf_->ptr());
// The batch is the last total_instance ids currently in the pair buffer.
ins_cursor = ins_buf + ins_buf_pair_len_ * 2 - total_instance;
cudaMemcpyAsync(id_tensor_ptr_,
ins_cursor,
sizeof(uint64_t) * total_instance,
cudaMemcpyDeviceToDevice,
stream_);
GraphFillCVMKernel<<<GET_BLOCKS(total_instance),
CUDA_NUM_THREADS,
0,
stream_>>>(show_tensor_ptr_, total_instance);
GraphFillCVMKernel<<<GET_BLOCKS(total_instance),
CUDA_NUM_THREADS,
0,
stream_>>>(clk_tensor_ptr_, total_instance);
} else {
// Non-training mode: the ids were already written to the id tensor above.
ins_cursor = (uint64_t *)id_tensor_ptr_;
}
if (slot_num_ > 0) {
uint64_t *feature_buf = reinterpret_cast<uint64_t *>(d_feature_buf_->ptr());
if (FLAGS_enable_opt_get_features || !gpu_graph_training_) {
// Look the features up for exactly the ids of this batch, then scatter
// them into the per-slot tensors.
FillFeatureBuf(ins_cursor, feature_buf, total_instance);
// FillFeatureBuf(id_tensor_ptr_, feature_buf, total_instance);
if (debug_mode_) {
uint64_t h_walk[total_instance];
cudaMemcpy(h_walk,
ins_cursor,
total_instance * sizeof(uint64_t),
cudaMemcpyDeviceToHost);
uint64_t h_feature[total_instance * slot_num_];
cudaMemcpy(h_feature,
feature_buf,
total_instance * slot_num_ * sizeof(uint64_t),
cudaMemcpyDeviceToHost);
for (int i = 0; i < total_instance; ++i) {
std::stringstream ss;
for (int j = 0; j < slot_num_; ++j) {
ss << h_feature[i * slot_num_ + j] << " ";
}
VLOG(2) << "aft FillFeatureBuf, gpu[" << gpuid_ << "] walk[" << i
<< "] = " << (uint64_t)h_walk[i] << " feature["
<< i * slot_num_ << ".." << (i + 1) * slot_num_
<< "] = " << ss.str();
}
}
GraphFillSlotKernel<<<GET_BLOCKS(total_instance * slot_num_),
CUDA_NUM_THREADS,
0,
stream_>>>((uint64_t *)d_slot_tensor_ptr_->ptr(),
feature_buf,
total_instance * slot_num_,
total_instance,
slot_num_);
GraphFillSlotLodKernelOpt<<<GET_BLOCKS((total_instance + 1) * slot_num_),
CUDA_NUM_THREADS,
0,
stream_>>>(
(uint64_t *)d_slot_lod_tensor_ptr_->ptr(),
(total_instance + 1) * slot_num_,
total_instance + 1);
} else {
// Features were gathered by FillInsBuf(); feature_buf stores, per pair,
// two values for slot 0, then two for slot 1, and so on. Copy the two
// values of slot i for every pair of this batch.
for (int i = 0; i < slot_num_; ++i) {
int feature_buf_offset =
(ins_buf_pair_len_ * 2 - total_instance) * slot_num_ + i * 2;
for (int j = 0; j < total_instance; j += 2) {
VLOG(2) << "slot_tensor[" << i << "][" << j << "] <- feature_buf["
<< feature_buf_offset + j * slot_num_ << "]";
VLOG(2) << "slot_tensor[" << i << "][" << j + 1 << "] <- feature_buf["
<< feature_buf_offset + j * slot_num_ + 1 << "]";
cudaMemcpyAsync(slot_tensor_ptr_[i] + j,
&feature_buf[feature_buf_offset + j * slot_num_],
sizeof(uint64_t) * 2,
cudaMemcpyDeviceToDevice,
stream_);
}
GraphFillSlotLodKernel<<<GET_BLOCKS(total_instance),
CUDA_NUM_THREADS,
0,
stream_>>>(slot_lod_tensor_ptr_[i],
total_instance + 1);
}
}
}
// The id tensor and every slot value tensor get the lod {0, total_instance}.
offset_.clear();
offset_.push_back(0);
offset_.push_back(total_instance);
LoD lod{offset_};
feed_vec_[0]->set_lod(lod);
if (slot_num_ > 0) {
for (int i = 0; i < slot_num_; ++i) {
feed_vec_[3 + 2 * i]->set_lod(lod);
}
}
cudaStreamSynchronize(stream_);
// Non-training mode stops here, so the debug dump below is training-only.
if (!gpu_graph_training_) return 1;
// Drop the pairs that were just emitted from the buffered count.
ins_buf_pair_len_ -= total_instance / 2;
if (debug_mode_) {
// NOTE(review): these are runtime-sized stack arrays of
// slot_num_ * total_instance elements - confirm they fit the stack for
// large batches.
uint64_t h_slot_tensor[slot_num_][total_instance];
uint64_t h_slot_lod_tensor[slot_num_][total_instance + 1];
for (int i = 0; i < slot_num_; ++i) {
cudaMemcpy(h_slot_tensor[i],
slot_tensor_ptr_[i],
total_instance * sizeof(uint64_t),
cudaMemcpyDeviceToHost);
// Only the first 5000 entries are logged.
int len = total_instance > 5000 ? 5000 : total_instance;
for (int j = 0; j < len; ++j) {
VLOG(2) << "gpu[" << gpuid_ << "] slot_tensor[" << i << "][" << j
<< "] = " << h_slot_tensor[i][j];
}
cudaMemcpy(h_slot_lod_tensor[i],
slot_lod_tensor_ptr_[i],
(total_instance + 1) * sizeof(uint64_t),
cudaMemcpyDeviceToHost);
len = total_instance + 1 > 5000 ? 5000 : total_instance + 1;
for (int j = 0; j < len; ++j) {
VLOG(2) << "gpu[" << gpuid_ << "] slot_lod_tensor[" << i << "][" << j
<< "] = " << h_slot_lod_tensor[i][j];
}
}
}
return 1;
}
// For each of the `len` input rows, copies its actual_sample_size[row]
// sampled neighbors into sample_keys starting at prefix_sum[row], and records
// sampleidx2row[row] + k in tmp_sampleidx2row for the k-th copied neighbor.
// Row r's neighbors start at neighbors[r * cur_degree].
__global__ void GraphFillSampleKeysKernel(uint64_t *neighbors,
                                          uint64_t *sample_keys,
                                          int *prefix_sum,
                                          int *sampleidx2row,
                                          int *tmp_sampleidx2row,
                                          int *actual_sample_size,
                                          int cur_degree,
                                          int len) {
  CUDA_KERNEL_LOOP(src_row, len) {
    int k = 0;
    while (k < actual_sample_size[src_row]) {
      const size_t out_pos = prefix_sum[src_row] + k;
      sample_keys[out_pos] = neighbors[src_row * cur_degree + k];
      tmp_sampleidx2row[out_pos] = sampleidx2row[src_row] + k;
      ++k;
    }
  }
}
// Writes the sampled neighbors into column `step` of the walk matrix (row
// width col_size). The k-th neighbor of input i goes to the walk row given by
// sampleidx2row[d_prefix_sum[i] + k]. id_cnt is not used.
__global__ void GraphDoWalkKernel(uint64_t *neighbors,
                                  uint64_t *walk,
                                  int *d_prefix_sum,
                                  int *actual_sample_size,
                                  int cur_degree,
                                  int step,
                                  int len,
                                  int *id_cnt,
                                  int *sampleidx2row,
                                  int col_size) {
  CUDA_KERNEL_LOOP(i, len) {
    int k = 0;
    while (k < actual_sample_size[i]) {
      const size_t walk_row = sampleidx2row[k + d_prefix_sum[i]];
      const size_t walk_col = step;
      const size_t walk_pos = (walk_row * col_size + walk_col);
      walk[walk_pos] = neighbors[i * cur_degree + k];
      ++k;
    }
  }
}
// Starts one walk row per sampled neighbor: for the k-th neighbor of start
// key `idx`, row = prefix_sum[idx] + k gets the start key in column 0 and the
// neighbor in column 1. The neighbor is also stored in sample_keys[row], and
// sampleidx2row[row] is set to row.
__global__ void GraphFillFirstStepKernel(int *prefix_sum,
                                         int *sampleidx2row,
                                         uint64_t *walk,
                                         uint64_t *keys,
                                         int len,
                                         int walk_degree,
                                         int col_size,
                                         int *actual_sample_size,
                                         uint64_t *neighbors,
                                         uint64_t *sample_keys) {
  CUDA_KERNEL_LOOP(idx, len) {
    int k = 0;
    while (k < actual_sample_size[idx]) {
      const size_t walk_row = prefix_sum[idx] + k;
      sample_keys[walk_row] = neighbors[idx * walk_degree + k];
      sampleidx2row[walk_row] = walk_row;
      const size_t row_begin = col_size * walk_row;
      walk[row_begin] = keys[idx];
      walk[row_begin + 1] = neighbors[idx * walk_degree + k];
      ++k;
    }
  }
}
// Writes the neighbors in `sample_res` into column `step` of `walk`.
//
// d_start_ids  device keys the neighbors were sampled for (used when
//              step == 1 to fill the first walk column)
// walk         device walk matrix with walk_len_ columns per row
// len          number of keys that were sampled
// sample_res   sampling result: per-key sample counts and neighbor values
// cur_degree   neighbor stride per key in sample_res.val (step > 1)
// step         walk column to fill
// len_per_row  forwarded to GraphDoWalkKernel
//
// The two d_sampleidx2rows_ buffers are used alternately: this call reads
// the current one, writes the other, and then flips cur_sampleidx2row_.
void GraphDataGenerator::FillOneStep(uint64_t *d_start_ids,
                                     uint64_t *walk,
                                     int len,
                                     NeighborSampleResult &sample_res,
                                     int cur_degree,
                                     int step,
                                     int *len_per_row) {
  size_t temp_storage_bytes = 0;
  int *d_actual_sample_size = sample_res.actual_sample_size;
  uint64_t *d_neighbors = sample_res.val;
  int *d_prefix_sum = reinterpret_cast<int *>(d_prefix_sum_->ptr());
  uint64_t *d_sample_keys = reinterpret_cast<uint64_t *>(d_sample_keys_->ptr());
  int *d_sampleidx2row =
      reinterpret_cast<int *>(d_sampleidx2rows_[cur_sampleidx2row_]->ptr());
  int *d_tmp_sampleidx2row =
      reinterpret_cast<int *>(d_sampleidx2rows_[1 - cur_sampleidx2row_]->ptr());

  // First call only queries the scratch size; the second call does the scan.
  // The result is written from d_prefix_sum + 1 on, so d_prefix_sum[i] is the
  // output offset of key i's first neighbor.
  // NOTE(review): this relies on d_prefix_sum[0] already being 0 - confirm
  // where the buffer is initialised.
  CUDA_CHECK(cub::DeviceScan::InclusiveSum(nullptr,
                                           temp_storage_bytes,
                                           d_actual_sample_size,
                                           d_prefix_sum + 1,
                                           len,
                                           stream_));
  auto d_temp_storage = memory::Alloc(place_, temp_storage_bytes);

  CUDA_CHECK(cub::DeviceScan::InclusiveSum(d_temp_storage->ptr(),
                                           temp_storage_bytes,
                                           d_actual_sample_size,
                                           d_prefix_sum + 1,
                                           len,
                                           stream_));

  cudaStreamSynchronize(stream_);

  if (step == 1) {
    GraphFillFirstStepKernel<<<GET_BLOCKS(len), CUDA_NUM_THREADS, 0, stream_>>>(
        d_prefix_sum,
        d_tmp_sampleidx2row,
        walk,
        d_start_ids,
        len,
        walk_degree_,
        walk_len_,
        d_actual_sample_size,
        d_neighbors,
        d_sample_keys);

  } else {
    GraphFillSampleKeysKernel<<<GET_BLOCKS(len),
                                CUDA_NUM_THREADS,
                                0,
                                stream_>>>(d_neighbors,
                                           d_sample_keys,
                                           d_prefix_sum,
                                           d_sampleidx2row,
                                           d_tmp_sampleidx2row,
                                           d_actual_sample_size,
                                           cur_degree,
                                           len);

    GraphDoWalkKernel<<<GET_BLOCKS(len), CUDA_NUM_THREADS, 0, stream_>>>(
        d_neighbors,
        walk,
        d_prefix_sum,
        d_actual_sample_size,
        cur_degree,
        step,
        len,
        len_per_row,
        d_tmp_sampleidx2row,
        walk_len_);
  }
  if (debug_mode_) {
    // Host copies for logging only. std::vector releases them automatically;
    // the previous code allocated two buffers it never used and called
    // delete[] on a stack array.
    size_t once_max_sample_keynum = walk_degree_ * once_sample_startid_len_;
    std::vector<int> h_prefix_sum(static_cast<size_t>(len + 1));
    std::vector<int> h_offset2idx(once_max_sample_keynum);

    cudaMemcpy(h_offset2idx.data(),
               d_tmp_sampleidx2row,
               once_max_sample_keynum * sizeof(int),
               cudaMemcpyDeviceToHost);

    cudaMemcpy(h_prefix_sum.data(),
               d_prefix_sum,
               (len + 1) * sizeof(int),
               cudaMemcpyDeviceToHost);
    for (size_t xx = 0; xx < once_max_sample_keynum; xx++) {
      VLOG(2) << "h_offset2idx[" << xx << "]: " << h_offset2idx[xx];
    }
    for (int xx = 0; xx < len + 1; xx++) {
      VLOG(2) << "h_prefix_sum[" << xx << "]: " << h_prefix_sum[xx];
    }
  }
  cudaStreamSynchronize(stream_);
  // The buffer written in this call becomes the one read by the next call.
  cur_sampleidx2row_ = 1 - cur_sampleidx2row_;
}
// Asks the GraphGpuWrapper singleton for the features of `key_num` node ids
// in `d_walk` (slot_num_ slots per id) and has them written to `d_feature`.
// Returns whatever get_feature_of_nodes() returns.
int GraphDataGenerator::FillFeatureBuf(uint64_t *d_walk,
                                       uint64_t *d_feature,
                                       size_t key_num) {
  platform::CUDADeviceGuard guard(gpuid_);
  auto graph_wrapper = GraphGpuWrapper::GetInstance();
  return graph_wrapper->get_feature_of_nodes(
      gpuid_, d_walk, d_feature, key_num, slot_num_);
}
// Allocation-based overload: asks the GraphGpuWrapper singleton for the
// features of the first buf_size_ node ids stored in `d_walk` (slot_num_
// slots per id) and has them written to `d_feature`. Returns whatever
// get_feature_of_nodes() returns.
int GraphDataGenerator::FillFeatureBuf(
    std::shared_ptr<phi::Allocation> d_walk,
    std::shared_ptr<phi::Allocation> d_feature) {
  platform::CUDADeviceGuard guard(gpuid_);
  auto graph_wrapper = GraphGpuWrapper::GetInstance();
  uint64_t *walk_ids = (uint64_t *)d_walk->ptr();
  uint64_t *feature_out = (uint64_t *)d_feature->ptr();
  return graph_wrapper->get_feature_of_nodes(
      gpuid_, walk_ids, feature_out, buf_size_, slot_num_);
}
// Fills the device walk buffer `d_walk` with sampled walks and prepares a
// shuffled row order for later consumption.
//
// Each loop iteration picks a start node type (round-robin over
// first_node_type_ via cursor_), takes up to once_sample_startid_len_ start
// keys of that type, samples neighbours step by step along the matching
// meta path, and lets FillOneStep write the results into the walk buffer.
// After the loop, buf_state_ is reset to the number of rows produced and the
// row indices [0, total_row) are shuffled into d_random_row_.
//
// Returns 1 when at least one row was produced, 0 otherwise.
int GraphDataGenerator::FillWalkBuf(std::shared_ptr<phi::Allocation> d_walk) {
platform::CUDADeviceGuard guard(gpuid_);
size_t once_max_sample_keynum = walk_degree_ * once_sample_startid_len_;
////////
// Host-side scratch buffers, only allocated (and only freed) in debug mode.
// NOTE(review): the pointers are left uninitialised when debug_mode_ is off,
// and only h_walk is actually read below; the other four are allocated and
// deleted without being used in this function.
uint64_t *h_walk;
uint64_t *h_sample_keys;
int *h_offset2idx;
int *h_len_per_row;
uint64_t *h_prefix_sum;
if (debug_mode_) {
h_walk = new uint64_t[buf_size_];
h_sample_keys = new uint64_t[once_max_sample_keynum];
h_offset2idx = new int[once_max_sample_keynum];
h_len_per_row = new int[once_max_sample_keynum];
h_prefix_sum = new uint64_t[once_max_sample_keynum + 1];
}
///////
auto gpu_graph_ptr = GraphGpuWrapper::GetInstance();
uint64_t *walk = reinterpret_cast<uint64_t *>(d_walk->ptr());
int *len_per_row = reinterpret_cast<int *>(d_len_per_row_->ptr());
// NOTE(review): d_sample_keys is not referenced again in this function.
uint64_t *d_sample_keys = reinterpret_cast<uint64_t *>(d_sample_keys_->ptr());
// Clear the whole walk buffer and the per-row length buffer before sampling.
cudaMemsetAsync(walk, 0, buf_size_ * sizeof(uint64_t), stream_);
cudaMemsetAsync(
len_per_row, 0, once_max_sample_keynum * sizeof(int), stream_);
// i is the write offset (in uint64 elements) into the walk buffer;
// total_row counts the rows produced so far.
int i = 0;
int total_row = 0;
size_t node_type_len = first_node_type_.size();
// Largest offset at which one more full sampling round still fits:
// a round writes at most walk_degree_ * once_sample_startid_len_ rows of
// walk_len_ elements each.
int remain_size =
buf_size_ - walk_degree_ * once_sample_startid_len_ * walk_len_;
while (i <= remain_size) {
// Round-robin choice of the start node type and its meta path.
int cur_node_idx = cursor_ % node_type_len;
int node_type = first_node_type_[cur_node_idx];
auto &path = meta_path_[cur_node_idx];
// Position of the next unused start key for this node type.
size_t start = node_type_start_[node_type];
// auto node_query_result = gpu_graph_ptr->query_node_list(
// gpuid_, node_type, start, once_sample_startid_len_);
// int tmp_len = node_query_result.actual_sample_size;
VLOG(2) << "choose start type: " << node_type;
int type_index = type_to_index_[node_type];
size_t device_key_size = h_device_keys_[type_index]->size();
VLOG(2) << "type: " << node_type << " size: " << device_key_size
<< " start: " << start;
uint64_t *d_type_keys =
reinterpret_cast<uint64_t *>(d_device_keys_[type_index]->ptr());
// Number of start keys used this round, clipped at the end of the key list.
int tmp_len = start + once_sample_startid_len_ > device_key_size
? device_key_size - start
: once_sample_startid_len_;
node_type_start_[node_type] = tmp_len + start;
// This node type is exhausted: remember it, stop once every type tracked in
// node_type_start_ is exhausted, otherwise move on to the next type.
if (tmp_len == 0) {
finish_node_type_.insert(node_type);
if (finish_node_type_.size() == node_type_start_.size()) {
break;
}
cursor_ += 1;
continue;
}
// if (tmp_len == 0) {
// break;
//}
VLOG(2) << "i = " << i << " buf_size_ = " << buf_size_
<< " tmp_len = " << tmp_len << " cursor = " << cursor_
<< " once_max_sample_keynum = " << once_max_sample_keynum;
uint64_t *cur_walk = walk + i;
// First step: query with edge type path[0], passing walk_degree_ and the
// tmp_len start keys (presumably walk_degree_ is the per-key sample size —
// confirm against NeighborSampleQuery::initialize).
NeighborSampleQuery q;
q.initialize(gpuid_,
path[0],
(uint64_t)(d_type_keys + start),
walk_degree_,
tmp_len);
auto sample_res = gpu_graph_ptr->graph_neighbor_sample_v3(q, false);
int step = 1;
VLOG(2) << "sample edge type: " << path[0] << " step: " << 1;
// The number of samples from the first step determines how many rows this
// round occupies in the walk buffer (see the i += ... below).
jump_rows_ = sample_res.total_sample_size;
FillOneStep(d_type_keys + start,
cur_walk,
tmp_len,
sample_res,
walk_degree_,
step,
len_per_row);
VLOG(2) << "jump_row: " << jump_rows_;
/////////
// Debug only: copy the whole walk buffer to the host and log every element.
if (debug_mode_) {
cudaMemcpy(
h_walk, walk, buf_size_ * sizeof(uint64_t), cudaMemcpyDeviceToHost);
for (int xx = 0; xx < buf_size_; xx++) {
VLOG(2) << "h_walk[" << xx << "]: " << h_walk[xx];
}
}
/////////
step++;
size_t path_len = path.size();
// Remaining steps: the previous step's sampled values become the query keys,
// one neighbour is requested per key, and the edge type cycles through the
// meta path. Stops early when a step yields no samples.
for (; step < walk_len_; step++) {
if (sample_res.total_sample_size == 0) {
break;
}
auto sample_key_mem = sample_res.actual_val_mem;
uint64_t *sample_keys_ptr =
reinterpret_cast<uint64_t *>(sample_key_mem->ptr());
int edge_type_id = path[(step - 1) % path_len];
VLOG(2) << "sample edge type: " << edge_type_id << " step: " << step;
q.initialize(gpuid_,
edge_type_id,
(uint64_t)sample_keys_ptr,
1,
sample_res.total_sample_size);
sample_res = gpu_graph_ptr->graph_neighbor_sample_v3(q, false);
// Note that the length passed here is the size of the new result.
FillOneStep(d_type_keys + start,
cur_walk,
sample_res.total_sample_size,
sample_res,
1,
step,
len_per_row);
if (debug_mode_) {
cudaMemcpy(
h_walk, walk, buf_size_ * sizeof(uint64_t), cudaMemcpyDeviceToHost);
for (int xx = 0; xx < buf_size_; xx++) {
VLOG(2) << "h_walk[" << xx << "]: " << h_walk[xx];
}
}
}
// cursor_ += tmp_len;
// Advance past the rows written this round (walk_len_ elements per row).
i += jump_rows_ * walk_len_;
total_row += jump_rows_;
cursor_ += 1;
}
buf_state_.Reset(total_row);
// Write a shuffled copy of the row indices 0..total_row-1 into d_random_row_,
// seeded by shuffle_seed_, on stream_.
int *d_random_row = reinterpret_cast<int *>(d_random_row_->ptr());
thrust::random::default_random_engine engine(shuffle_seed_);
const auto &exec_policy = thrust::cuda::par.on(stream_);
thrust::counting_iterator<int> cnt_iter(0);
thrust::shuffle_copy(exec_policy,
cnt_iter,
cnt_iter + total_row,
thrust::device_pointer_cast(d_random_row),
engine);
cudaStreamSynchronize(stream_);
// Derive the seed for the next call from the engine.
shuffle_seed_ = engine();
if (debug_mode_) {
int *h_random_row = new int[total_row + 10];
cudaMemcpy(h_random_row,
d_random_row,
total_row * sizeof(int),
cudaMemcpyDeviceToHost);
for (int xx = 0; xx < total_row; xx++) {
VLOG(2) << "h_random_row[" << xx << "]: " << h_random_row[xx];
}
delete[] h_random_row;
delete[] h_walk;
delete[] h_sample_keys;
delete[] h_offset2idx;
delete[] h_len_per_row;
delete[] h_prefix_sum;
}
return total_row != 0;
}
// Binds the generator to `place` and allocates the device buffers it uses.
//
// Parameters:
//   place    - device to allocate on; its device id becomes gpuid_ and the
//              stream of its CUDA device context becomes stream_.
//   feed_vec - tensors to be fed; stored in feed_vec_ and used to derive
//              slot_num_.
//
// Every buffer size is derived from the values set by SetConfig
// (walk_degree_, walk_len_, once_sample_startid_len_, repeat_time_,
// batch_size_, buf_size_), so SetConfig must have run first. The function
// blocks on stream_ before returning, so all async copies and memsets issued
// here are complete on return.
void GraphDataGenerator::AllocResource(const paddle::platform::Place &place,
                                       std::vector<LoDTensor *> feed_vec) {
  place_ = place;
  gpuid_ = place_.GetDeviceId();
  VLOG(3) << "gpuid " << gpuid_;
  stream_ = dynamic_cast<platform::CUDADeviceContext *>(
                platform::DeviceContextPool::Instance().Get(place))
                ->stream();
  feed_vec_ = feed_vec;
  // NOTE(review): presumably three non-slot tensors come first and each slot
  // contributes two tensors — confirm against the feed layout.
  slot_num_ = (feed_vec_.size() - 3) / 2;

  // Copy every registered host key list to the device, one buffer per list.
  VLOG(2) << "h_device_keys size: " << h_device_keys_.size();
  infer_node_type_start_ = std::vector<int>(h_device_keys_.size(), 0);
  for (size_t i = 0; i < h_device_keys_.size(); i++) {
    for (size_t j = 0; j < h_device_keys_[i]->size(); j++) {
      VLOG(3) << "h_device_keys_[" << i << "][" << j
              << "] = " << (*(h_device_keys_[i]))[j];
    }
    auto buf = memory::AllocShared(
        place_, h_device_keys_[i]->size() * sizeof(uint64_t));
    d_device_keys_.push_back(buf);
    CUDA_CHECK(cudaMemcpyAsync(buf->ptr(),
                               h_device_keys_[i]->data(),
                               h_device_keys_[i]->size() * sizeof(uint64_t),
                               cudaMemcpyHostToDevice,
                               stream_));
  }

  // Upper bound on the number of keys produced by one first-step sample.
  size_t once_max_sample_keynum = walk_degree_ * once_sample_startid_len_;
  d_prefix_sum_ =
      memory::AllocShared(place_, (once_max_sample_keynum + 1) * sizeof(int));
  int *d_prefix_sum_ptr = reinterpret_cast<int *>(d_prefix_sum_->ptr());
  cudaMemsetAsync(
      d_prefix_sum_ptr, 0, (once_max_sample_keynum + 1) * sizeof(int), stream_);

  cursor_ = 0;
  jump_rows_ = 0;

  d_walk_ = memory::AllocShared(place_, buf_size_ * sizeof(uint64_t));
  cudaMemsetAsync(d_walk_->ptr(), 0, buf_size_ * sizeof(uint64_t), stream_);

  if (!FLAGS_enable_opt_get_features && slot_num_ > 0) {
    // Zero the entire feature buffer. The previous code cleared only the
    // first buf_size_ elements of a buf_size_ * slot_num_ element allocation,
    // leaving the rest uninitialised.
    const size_t feature_bytes = buf_size_ * slot_num_ * sizeof(uint64_t);
    d_feature_ = memory::AllocShared(place_, feature_bytes);
    cudaMemsetAsync(d_feature_->ptr(), 0, feature_bytes, stream_);
  }

  d_sample_keys_ =
      memory::AllocShared(place_, once_max_sample_keynum * sizeof(uint64_t));

  // Two buffers of the same size; cur_sampleidx2row_ selects the active one.
  d_sampleidx2rows_.push_back(
      memory::AllocShared(place_, once_max_sample_keynum * sizeof(int)));
  d_sampleidx2rows_.push_back(
      memory::AllocShared(place_, once_max_sample_keynum * sizeof(int)));
  cur_sampleidx2row_ = 0;

  d_len_per_row_ =
      memory::AllocShared(place_, once_max_sample_keynum * sizeof(int));

  // Window offsets: -window_ .. -1 followed by 1 .. window_ (0 is excluded).
  for (int i = -window_; i < 0; i++) {
    window_step_.push_back(i);
  }
  for (int i = 0; i < window_; i++) {
    window_step_.push_back(i + 1);
  }
  buf_state_.Init(batch_size_, walk_len_, &window_step_);

  d_random_row_ = memory::AllocShared(
      place_,
      (once_sample_startid_len_ * walk_degree_ * repeat_time_) * sizeof(int));
  shuffle_seed_ = 0;

  ins_buf_pair_len_ = 0;
  d_ins_buf_ =
      memory::AllocShared(place_, (batch_size_ * 2 * 2) * sizeof(uint64_t));
  if (slot_num_ > 0) {
    d_feature_buf_ = memory::AllocShared(
        place_, (batch_size_ * 2 * 2) * slot_num_ * sizeof(uint64_t));
  }
  d_pair_num_ = memory::AllocShared(place_, sizeof(int));
  if (FLAGS_enable_opt_get_features && slot_num_ > 0) {
    d_slot_tensor_ptr_ =
        memory::AllocShared(place_, slot_num_ * sizeof(uint64_t *));
    d_slot_lod_tensor_ptr_ =
        memory::AllocShared(place_, slot_num_ * sizeof(uint64_t *));
  }

  cudaStreamSynchronize(stream_);
}
// Loads the graph sampling configuration from data_feed_desc.graph_config().
//
// Sets the scalar walk parameters and derives:
//   * batch_size_ - the configured batch_size in debug mode or when
//                   gpu_graph_training is off, otherwise
//                   once_sample_startid_len_;
//   * buf_size_   - once_sample_startid_len_ * walk_len_ * walk_degree_ *
//                   repeat_time_.
// It then resolves the textual `first_node_type` (';'-separated node type
// names) and `meta_path` (';'-separated paths, each a '-'-separated list of
// edge type names) to integer ids through the graph wrapper's feature_to_id
// and edge_to_id maps.
//
// Raises NotFound for an unknown node or edge type name, and InvalidArgument
// when meta_path lists more paths than first_node_type lists node types.
void GraphDataGenerator::SetConfig(
    const paddle::framework::DataFeedDesc &data_feed_desc) {
  const auto &graph_config = data_feed_desc.graph_config();
  walk_degree_ = graph_config.walk_degree();
  walk_len_ = graph_config.walk_len();
  window_ = graph_config.window();
  once_sample_startid_len_ = graph_config.once_sample_startid_len();
  debug_mode_ = graph_config.debug_mode();
  gpu_graph_training_ = graph_config.gpu_graph_training();
  if (debug_mode_ || !gpu_graph_training_) {
    batch_size_ = graph_config.batch_size();
  } else {
    batch_size_ = once_sample_startid_len_;
  }
  repeat_time_ = graph_config.sample_times_one_chunk();
  buf_size_ =
      once_sample_startid_len_ * walk_len_ * walk_degree_ * repeat_time_;
  VLOG(2) << "Confirm GraphConfig, walk_degree : " << walk_degree_
          << ", walk_len : " << walk_len_ << ", window : " << window_
          << ", once_sample_startid_len : " << once_sample_startid_len_
          << ", sample_times_one_chunk : " << repeat_time_
          << ", batch_size: " << batch_size_;

  std::string first_node_type = graph_config.first_node_type();
  std::string meta_path = graph_config.meta_path();
  auto gpu_graph_ptr = GraphGpuWrapper::GetInstance();
  // Only lookups are performed, so reference the maps instead of copying.
  const auto &edge_to_id = gpu_graph_ptr->edge_to_id;
  const auto &node_to_id = gpu_graph_ptr->feature_to_id;

  // parse first_node_type
  auto node_types =
      paddle::string::split_string<std::string>(first_node_type, ";");
  VLOG(2) << "node_types: " << first_node_type;
  // Reset all per-configuration state so that a repeated call does not append
  // to the node types and meta paths of a previous configuration.
  finish_node_type_.clear();
  node_type_start_.clear();
  first_node_type_.clear();
  meta_path_.clear();
  for (auto &type : node_types) {
    auto iter = node_to_id.find(type);
    PADDLE_ENFORCE_NE(
        iter,
        node_to_id.end(),
        platform::errors::NotFound("(%s) is not found in node_to_id.", type));
    VLOG(2) << "node_to_id[" << type << "] = " << iter->second;
    first_node_type_.push_back(iter->second);
    node_type_start_[iter->second] = 0;
  }

  // parse meta_path: one path per start node type, matched by position.
  meta_path_.resize(first_node_type_.size());
  auto meta_paths = paddle::string::split_string<std::string>(meta_path, ";");
  // meta_path_[i] is indexed by position below; more paths than start node
  // types would write past the end of meta_path_.
  PADDLE_ENFORCE_LE(
      meta_paths.size(),
      meta_path_.size(),
      platform::errors::InvalidArgument(
          "meta_path contains %d paths but first_node_type contains only %d "
          "node types.",
          meta_paths.size(),
          meta_path_.size()));
  for (size_t i = 0; i < meta_paths.size(); i++) {
    const auto &path = meta_paths[i];
    auto nodes = paddle::string::split_string<std::string>(path, "-");
    for (auto &node : nodes) {
      auto iter = edge_to_id.find(node);
      PADDLE_ENFORCE_NE(
          iter,
          edge_to_id.end(),
          platform::errors::NotFound("(%s) is not found in edge_to_id.", node));
      VLOG(2) << "edge_to_id[" << node << "] = " << iter->second;
      meta_path_[i].push_back(iter->second);
    }
  }
}
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
#endif #endif
...@@ -23,6 +23,7 @@ limitations under the License. */ ...@@ -23,6 +23,7 @@ limitations under the License. */
#include <future> // NOLINT #include <future> // NOLINT
#include <memory> #include <memory>
#include <mutex> // NOLINT #include <mutex> // NOLINT
#include <random>
#include <sstream> #include <sstream>
#include <string> #include <string>
#include <thread> // NOLINT #include <thread> // NOLINT
...@@ -42,6 +43,7 @@ limitations under the License. */ ...@@ -42,6 +43,7 @@ limitations under the License. */
#include "paddle/fluid/platform/timer.h" #include "paddle/fluid/platform/timer.h"
#include "paddle/fluid/string/string_helper.h" #include "paddle/fluid/string/string_helper.h"
#if defined(PADDLE_WITH_CUDA) #if defined(PADDLE_WITH_CUDA)
#include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_utils.h"
#include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/cuda_device_guard.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h"
#endif #endif
...@@ -56,6 +58,8 @@ namespace framework { ...@@ -56,6 +58,8 @@ namespace framework {
class DataFeedDesc; class DataFeedDesc;
class Scope; class Scope;
class Variable; class Variable;
class NeighborSampleResult;
class NodeQueryResult;
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -420,7 +424,6 @@ struct UsedSlotGpuType { ...@@ -420,7 +424,6 @@ struct UsedSlotGpuType {
}; };
#if defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_HETERPS) #if defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_HETERPS)
#define CUDA_CHECK(val) CHECK(val == gpuSuccess)
template <typename T> template <typename T>
struct CudaBuffer { struct CudaBuffer {
T* cu_buffer; T* cu_buffer;
...@@ -776,6 +779,202 @@ class DLManager { ...@@ -776,6 +779,202 @@ class DLManager {
std::map<std::string, DLHandle> handle_map_; std::map<std::string, DLHandle> handle_map_;
}; };
// Wraps a std::default_random_engine. On non-Windows builds the constructor
// seeds it from three consecutive values of a process-wide atomic counter
// plus the current wall-clock time in milliseconds; on Windows the engine
// keeps its default seed.
struct engine_wrapper_t {
  std::default_random_engine engine;
#if !defined(_WIN32)
  engine_wrapper_t() {
    // Shared by all instances so that wrappers created within the same
    // millisecond still receive different seed sequences.
    static std::atomic<uint64_t> seed_counter(0);

    struct timespec now;
    clock_gettime(CLOCK_REALTIME, &now);
    const double seconds = now.tv_sec + now.tv_nsec * 1e-9;
    const uint64_t millis = static_cast<uint64_t>(seconds * 1000);

    const uint64_t first = seed_counter++;
    const uint64_t second = seed_counter++;
    const uint64_t third = seed_counter++;
    std::seed_seq seeds = {first, second, third, millis};
    engine.seed(seeds);
  }
#endif
};
// Iteration state over a buffer of walk rows.
//
// The state walks through the rows in batches (cursor / len / row_num) and,
// inside a batch, through every central position of a walk (central_word)
// and every offset of a randomly sized sub-window around it (step, bounded
// by left / right, indexing into *window).
struct BufState {
  // Inclusive index range into *window that is active for the current
  // central word, and the index currently being visited.
  int left;
  int right;
  int central_word;
  int step;
  engine_wrapper_t random_engine_;

  // Current batch: starts at row `cursor`, spans `len` rows out of `row_num`.
  int len;
  int cursor;
  int row_num;

  int batch_size;
  int walk_len;
  // Offsets relative to the central word; not owned by this struct.
  std::vector<int>* window;

  BufState() {}
  ~BufState() {}

  // Stores the batch size, walk length and window offsets and puts the state
  // into its "nothing loaded" configuration.
  void Init(int graph_batch_size,
            int graph_walk_len,
            std::vector<int>* graph_window) {
    window = graph_window;
    walk_len = graph_walk_len;
    batch_size = graph_batch_size;

    left = 0;
    right = window->size() - 1;
    central_word = -1;
    step = -1;

    len = 0;
    cursor = 0;
    row_num = 0;

    size_t idx = 0;
    for (int offset : *graph_window) {
      VLOG(2) << "graph_window[" << idx << "] = " << offset;
      ++idx;
    }
  }

  // Restarts iteration over a freshly filled buffer of `total_rows` rows and
  // positions the state on the first central word of the first batch.
  void Reset(int total_rows) {
    row_num = total_rows;
    cursor = 0;
    if (cursor + batch_size > row_num) {
      len = row_num - cursor;
    } else {
      len = batch_size;
    }
    central_word = -1;
    step = -1;
    GetNextCentrolWord();
  }

  // Moves to the next window offset. Returns 1 when that offset is still
  // inside the active sub-window and points before the end of the walk,
  // 0 otherwise.
  int GetNextStep() {
    ++step;
    if (step > right) {
      return 0;
    }
    return central_word + (*window)[step] < walk_len ? 1 : 0;
  }

  void Debug() {
    VLOG(2) << "left: " << left << " right: " << right
            << " central_word: " << central_word << " step: " << step
            << " cursor: " << cursor << " len: " << len
            << " row_num: " << row_num;
  }

  // Moves to the next central word and draws a random sub-window around it.
  // Returns 0 when the walk is exhausted or no offset in the sub-window
  // points at a non-negative position; otherwise leaves `step` on the first
  // such offset and returns 1.
  int GetNextCentrolWord() {
    central_word += 1;
    if (central_word >= walk_len) {
      return 0;
    }

    int half_window = window->size() / 2;
    int radius = random_engine_.engine() % half_window + 1;
    left = half_window - radius;
    right = half_window + radius - 1;
    VLOG(2) << "random window: " << radius << " window[" << left
            << "] = " << (*window)[left] << " window[" << right
            << "] = " << (*window)[right];

    step = left;
    while (step <= right) {
      if (central_word + (*window)[step] >= 0) {
        return 1;
      }
      ++step;
    }
    return 0;
  }

  // Advances to the next batch of rows. Returns 0 when no rows are left,
  // otherwise positions the state on the batch's first central word and
  // returns 1.
  int GetNextBatch() {
    cursor += len;
    int next_len = batch_size;
    if (cursor + batch_size > row_num) {
      next_len = row_num - cursor;
    }
    if (next_len == 0) {
      return 0;
    }
    len = next_len;
    central_word = -1;
    step = -1;
    GetNextCentrolWord();
    // next_len is known to be non-zero here.
    return 1;
  }
};
// Generates training data from graph walks on a GPU.
//
// Usage order visible in the implementation: SetDeviceKeys() registers the
// start keys per node type, SetConfig() reads the walk parameters from the
// DataFeedDesc, AllocResource() allocates the device buffers, and
// FillWalkBuf() fills the walk buffer by repeated neighbour sampling.
class GraphDataGenerator {
public:
GraphDataGenerator(){};
virtual ~GraphDataGenerator(){};
// Reads graph_config from the descriptor and resolves node / edge type names.
void SetConfig(const paddle::framework::DataFeedDesc& data_feed_desc);
// Binds to a device and allocates all device buffers.
void AllocResource(const paddle::platform::Place& place,
std::vector<LoDTensor*> feed_vec);
int AcquireInstance(BufState* state);
int GenerateBatch();
// Fills d_walk with sampled walks; returns non-zero when rows were produced.
int FillWalkBuf(std::shared_ptr<phi::Allocation> d_walk);
// Both overloads forward to GraphGpuWrapper::get_feature_of_nodes.
int FillFeatureBuf(uint64_t* d_walk, uint64_t* d_feature, size_t key_num);
int FillFeatureBuf(std::shared_ptr<phi::Allocation> d_walk,
std::shared_ptr<phi::Allocation> d_feature);
void FillOneStep(uint64_t* start_ids,
uint64_t* walk,
int len,
NeighborSampleResult& sample_res,
int cur_degree,
int step,
int* len_per_row);
int FillInsBuf();
// Registers the host key list for node type `type`. Only the pointer is
// stored, so the vector must outlive this object's use of it.
void SetDeviceKeys(std::vector<uint64_t>* device_keys, int type) {
type_to_index_[type] = h_device_keys_.size();
h_device_keys_.push_back(device_keys);
}
protected:
// Walk parameters copied from GraphConfig in SetConfig().
int walk_degree_;
int walk_len_;
int window_;
int once_sample_startid_len_;
// Device id taken from place_ in AllocResource().
int gpuid_;
// start ids
// int64_t* device_keys_;
// size_t device_key_size_;
// One host key list per registered node type, in registration order.
std::vector<std::vector<uint64_t>*> h_device_keys_;
// Node type id -> index into h_device_keys_ / d_device_keys_.
std::unordered_map<int, int> type_to_index_;
// point to device_keys_
// In FillWalkBuf, cursor_ modulo the number of start node types selects the
// node type of the next sampling round.
size_t cursor_;
// Number of rows produced by the latest first-step sample in FillWalkBuf.
size_t jump_rows_;
int64_t* id_tensor_ptr_;
int64_t* show_tensor_ptr_;
int64_t* clk_tensor_ptr_;
// Stream of the CUDA device context for place_.
cudaStream_t stream_;
paddle::platform::Place place_;
std::vector<LoDTensor*> feed_vec_;
std::vector<size_t> offset_;
// Device buffer of (walk_degree_ * once_sample_startid_len_ + 1) ints.
std::shared_ptr<phi::Allocation> d_prefix_sum_;
// Device copies of h_device_keys_, same order.
std::vector<std::shared_ptr<phi::Allocation>> d_device_keys_;
// Device buffer of buf_size_ uint64 values.
std::shared_ptr<phi::Allocation> d_walk_;
// Only allocated when FLAGS_enable_opt_get_features is off and slot_num_ > 0.
std::shared_ptr<phi::Allocation> d_feature_;
std::shared_ptr<phi::Allocation> d_len_per_row_;
// Receives the shuffled row indices at the end of FillWalkBuf.
std::shared_ptr<phi::Allocation> d_random_row_;
//
// Two equally sized buffers; cur_sampleidx2row_ selects one of them.
std::vector<std::shared_ptr<phi::Allocation>> d_sampleidx2rows_;
int cur_sampleidx2row_;
// record the keys to call graph_neighbor_sample
std::shared_ptr<phi::Allocation> d_sample_keys_;
int sample_keys_len_;
// Node types whose start keys have been fully consumed by FillWalkBuf.
std::set<int> finish_node_type_;
// Node type id -> index of the next unused start key of that type.
std::unordered_map<int, size_t> node_type_start_;
std::vector<int> infer_node_type_start_;
std::shared_ptr<phi::Allocation> d_ins_buf_;
std::shared_ptr<phi::Allocation> d_feature_buf_;
std::shared_ptr<phi::Allocation> d_pair_num_;
// Only allocated when FLAGS_enable_opt_get_features is on and slot_num_ > 0.
std::shared_ptr<phi::Allocation> d_slot_tensor_ptr_;
std::shared_ptr<phi::Allocation> d_slot_lod_tensor_ptr_;
int ins_buf_pair_len_;
// size of a d_walk buf
// Computed in SetConfig as once_sample_startid_len_ * walk_len_ *
// walk_degree_ * repeat_time_.
size_t buf_size_;
// GraphConfig.sample_times_one_chunk.
int repeat_time_;
// Offsets -window_..-1 followed by 1..window_, built in AllocResource().
std::vector<int> window_step_;
BufState buf_state_;
int batch_size_;
// Derived from the number of feed tensors in AllocResource().
int slot_num_;
// Seed for shuffling row indices; replaced after every FillWalkBuf call.
int shuffle_seed_;
int debug_mode_;
// Ids of the start node types, in configuration order.
std::vector<int> first_node_type_;
// meta_path_[i] holds the edge type ids walked from first_node_type_[i].
std::vector<std::vector<int>> meta_path_;
bool gpu_graph_training_;
};
class DataFeed { class DataFeed {
public: public:
DataFeed() { DataFeed() {
...@@ -838,6 +1037,14 @@ class DataFeed { ...@@ -838,6 +1037,14 @@ class DataFeed {
virtual void SetParseLogKey(bool parse_logkey) {} virtual void SetParseLogKey(bool parse_logkey) {}
virtual void SetEnablePvMerge(bool enable_pv_merge) {} virtual void SetEnablePvMerge(bool enable_pv_merge) {}
virtual void SetCurrentPhase(int current_phase) {} virtual void SetCurrentPhase(int current_phase) {}
// Registers the key list for node type `type` with the GPU graph data
// generator. This is a no-op unless the build defines both
// PADDLE_WITH_GPU_GRAPH and PADDLE_WITH_HETERPS.
virtual void SetDeviceKeys(std::vector<uint64_t>* device_keys, int type) {
#if defined(PADDLE_WITH_GPU_GRAPH) && defined(PADDLE_WITH_HETERPS)
gpu_graph_data_generator_.SetDeviceKeys(device_keys, type);
#endif
}
// Stores the GPU graph mode flag in gpu_graph_mode_.
virtual void SetGpuGraphMode(int gpu_graph_mode) {
gpu_graph_mode_ = gpu_graph_mode;
}
virtual void SetFileListMutex(std::mutex* mutex) { virtual void SetFileListMutex(std::mutex* mutex) {
mutex_for_pick_file_ = mutex; mutex_for_pick_file_ = mutex;
} }
...@@ -921,6 +1128,10 @@ class DataFeed { ...@@ -921,6 +1128,10 @@ class DataFeed {
// The input type of pipe reader, 0 for one sample, 1 for one batch // The input type of pipe reader, 0 for one sample, 1 for one batch
int input_type_; int input_type_;
int gpu_graph_mode_ = 0;
#if defined(PADDLE_WITH_GPU_GRAPH) && defined(PADDLE_WITH_HETERPS)
GraphDataGenerator gpu_graph_data_generator_;
#endif
}; };
// PrivateQueueDataFeed is the base virtual class for ohther DataFeeds. // PrivateQueueDataFeed is the base virtual class for ohther DataFeeds.
......
...@@ -27,6 +27,19 @@ message MultiSlotDesc { ...@@ -27,6 +27,19 @@ message MultiSlotDesc {
optional string uid_slot = 2; optional string uid_slot = 2;
} }
// Graph walk / sampling settings, read by GraphDataGenerator::SetConfig.
message GraphConfig {
// Passed to the first neighbour-sampling query of each walk round.
optional int32 walk_degree = 1 [ default = 1 ];
// Number of positions per walk row.
optional int32 walk_len = 2 [ default = 20 ];
// Window radius; expanded to the offsets -window..-1 and 1..window.
optional int32 window = 3 [ default = 5 ];
// Maximum number of start ids consumed per sampling round.
optional int32 once_sample_startid_len = 4 [ default = 8000 ];
// Multiplier applied when sizing the walk buffer.
optional int32 sample_times_one_chunk = 5 [ default = 10 ];
// Used as the batch size only when debug_mode is set or gpu_graph_training
// is false; otherwise once_sample_startid_len is used instead.
optional int32 batch_size = 6 [ default = 1 ];
// Non-zero enables host copies and verbose logging of the walk buffer.
optional int32 debug_mode = 7 [ default = 0 ];
// ';'-separated node type names used as walk start types.
optional string first_node_type = 8;
// ';'-separated meta paths; each path is a '-'-separated list of edge type
// names.
optional string meta_path = 9;
optional bool gpu_graph_training = 10 [ default = true ];
}
message DataFeedDesc { message DataFeedDesc {
optional string name = 1; optional string name = 1;
optional int32 batch_size = 2 [ default = 32 ]; optional int32 batch_size = 2 [ default = 32 ];
...@@ -37,4 +50,5 @@ message DataFeedDesc { ...@@ -37,4 +50,5 @@ message DataFeedDesc {
optional int32 pv_batch_size = 7 [ default = 32 ]; optional int32 pv_batch_size = 7 [ default = 32 ];
optional int32 input_type = 8 [ default = 0 ]; optional int32 input_type = 8 [ default = 0 ];
optional string so_parser_name = 9; optional string so_parser_name = 9;
optional GraphConfig graph_config = 10;
} }
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#include "paddle/fluid/framework/data_set.h" #include "paddle/fluid/framework/data_set.h"
#include "gflags/gflags.h"
#include "google/protobuf/text_format.h" #include "google/protobuf/text_format.h"
#if (defined PADDLE_WITH_DISTRIBUTE) && (defined PADDLE_WITH_PSCORE) #if (defined PADDLE_WITH_DISTRIBUTE) && (defined PADDLE_WITH_PSCORE)
#include "paddle/fluid/distributed/index_dataset/index_sampler.h" #include "paddle/fluid/distributed/index_dataset/index_sampler.h"
...@@ -26,6 +27,7 @@ ...@@ -26,6 +27,7 @@
#ifdef PADDLE_WITH_PSCORE #ifdef PADDLE_WITH_PSCORE
#include "paddle/fluid/distributed/ps/wrapper/fleet.h" #include "paddle/fluid/distributed/ps/wrapper/fleet.h"
#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h"
#endif #endif
#if defined _WIN32 || defined __APPLE__ #if defined _WIN32 || defined __APPLE__
...@@ -34,6 +36,8 @@ ...@@ -34,6 +36,8 @@
#endif #endif
USE_INT_STAT(STAT_total_feasign_num_in_mem); USE_INT_STAT(STAT_total_feasign_num_in_mem);
DECLARE_bool(graph_get_neighbor_id);
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -196,6 +200,16 @@ void DatasetImpl<T>::SetFeaEval(bool fea_eval, int record_candidate_size) { ...@@ -196,6 +200,16 @@ void DatasetImpl<T>::SetFeaEval(bool fea_eval, int record_candidate_size) {
<< " with record candidate size: " << record_candidate_size; << " with record candidate size: " << record_candidate_size;
} }
// Stores the GPU graph mode flag; a non-zero value makes LoadIntoMemory take
// its gpu_graph_mode_ branch.
template <typename T>
void DatasetImpl<T>::SetGpuGraphMode(int is_graph_mode) {
gpu_graph_mode_ = is_graph_mode;
}
// Returns the GPU graph mode flag set by SetGpuGraphMode (0 by default).
template <typename T>
int DatasetImpl<T>::GetGpuGraphMode() {
return gpu_graph_mode_;
}
template <typename T> template <typename T>
std::vector<paddle::framework::DataFeed*> DatasetImpl<T>::GetReaders() { std::vector<paddle::framework::DataFeed*> DatasetImpl<T>::GetReaders() {
std::vector<paddle::framework::DataFeed*> ret; std::vector<paddle::framework::DataFeed*> ret;
...@@ -440,6 +454,84 @@ void DatasetImpl<T>::LoadIntoMemory() { ...@@ -440,6 +454,84 @@ void DatasetImpl<T>::LoadIntoMemory() {
platform::Timer timeline; platform::Timer timeline;
timeline.Start(); timeline.Start();
std::vector<std::thread> load_threads; std::vector<std::thread> load_threads;
if (gpu_graph_mode_) {
VLOG(0) << "in gpu_graph_mode";
#ifdef PADDLE_WITH_HETERPS
graph_all_type_total_keys_.clear();
auto gpu_graph_ptr = GraphGpuWrapper::GetInstance();
auto node_to_id = gpu_graph_ptr->feature_to_id;
auto edge_to_id = gpu_graph_ptr->edge_to_id;
graph_all_type_total_keys_.resize(node_to_id.size());
int cnt = 0;
for (auto& iter : node_to_id) {
int node_idx = iter.second;
std::vector<std::vector<uint64_t>> gpu_graph_device_keys;
gpu_graph_ptr->get_all_id(
1, node_idx, thread_num_, &gpu_graph_device_keys);
auto& type_total_key = graph_all_type_total_keys_[cnt];
type_total_key.resize(thread_num_);
for (size_t i = 0; i < gpu_graph_device_keys.size(); i++) {
VLOG(2) << "node type: " << node_idx << ", gpu_graph_device_keys[" << i
<< "] = " << gpu_graph_device_keys[i].size();
for (size_t j = 0; j < gpu_graph_device_keys[i].size(); j++) {
gpu_graph_total_keys_.push_back(gpu_graph_device_keys[i][j]);
type_total_key[i].push_back(gpu_graph_device_keys[i][j]);
}
}
for (size_t i = 0; i < readers_.size(); i++) {
readers_[i]->SetDeviceKeys(&type_total_key[i], node_idx);
readers_[i]->SetGpuGraphMode(gpu_graph_mode_);
}
cnt++;
}
VLOG(2) << "begin add feature_id into gpu_graph_total_keys_ size["
<< gpu_graph_total_keys_.size() << "]";
for (auto& iter : node_to_id) {
std::vector<std::vector<uint64_t>> gpu_graph_device_keys;
int node_idx = iter.second;
gpu_graph_ptr->get_all_feature_ids(
1, node_idx, thread_num_, &gpu_graph_device_keys);
for (size_t i = 0; i < gpu_graph_device_keys.size(); i++) {
VLOG(2) << "begin node type: " << node_idx << ", gpu_graph_device_keys["
<< i << "] = " << gpu_graph_device_keys[i].size();
for (size_t j = 0; j < gpu_graph_device_keys[i].size(); j++) {
gpu_graph_total_keys_.push_back(gpu_graph_device_keys[i][j]);
}
VLOG(2) << "end node type: " << node_idx << ", gpu_graph_device_keys["
<< i << "] = " << gpu_graph_device_keys[i].size();
}
}
VLOG(2) << "end add feature_id into gpu_graph_total_keys_ size["
<< gpu_graph_total_keys_.size() << "]";
// FIX: trick for iterate edge table
for (auto& iter : edge_to_id) {
int edge_idx = iter.second;
std::vector<std::vector<uint64_t>> gpu_graph_device_keys;
gpu_graph_ptr->get_all_id(
0, edge_idx, thread_num_, &gpu_graph_device_keys);
for (size_t i = 0; i < gpu_graph_device_keys.size(); i++) {
VLOG(1) << "edge type: " << edge_idx << ", gpu_graph_device_keys[" << i
<< "] = " << gpu_graph_device_keys[i].size();
for (size_t j = 0; j < gpu_graph_device_keys[i].size(); j++) {
gpu_graph_total_keys_.push_back(gpu_graph_device_keys[i][j]);
}
}
if (FLAGS_graph_get_neighbor_id) {
std::vector<std::vector<uint64_t>> gpu_graph_neighbor_keys;
gpu_graph_ptr->get_all_neighbor_id(
0, edge_idx, thread_num_, &gpu_graph_neighbor_keys);
for (size_t i = 0; i < gpu_graph_neighbor_keys.size(); i++) {
for (size_t k = 0; k < gpu_graph_neighbor_keys[i].size(); k++) {
gpu_graph_total_keys_.push_back(gpu_graph_neighbor_keys[i][k]);
}
}
}
}
#endif
} else {
for (int64_t i = 0; i < thread_num_; ++i) { for (int64_t i = 0; i < thread_num_; ++i) {
load_threads.push_back(std::thread( load_threads.push_back(std::thread(
&paddle::framework::DataFeed::LoadIntoMemory, readers_[i].get())); &paddle::framework::DataFeed::LoadIntoMemory, readers_[i].get()));
...@@ -447,6 +539,7 @@ void DatasetImpl<T>::LoadIntoMemory() { ...@@ -447,6 +539,7 @@ void DatasetImpl<T>::LoadIntoMemory() {
for (std::thread& t : load_threads) { for (std::thread& t : load_threads) {
t.join(); t.join();
} }
}
input_channel_->Close(); input_channel_->Close();
int64_t in_chan_size = input_channel_->Size(); int64_t in_chan_size = input_channel_->Size();
input_channel_->SetBlockSize(in_chan_size / thread_num_ + 1); input_channel_->SetBlockSize(in_chan_size / thread_num_ + 1);
......
...@@ -165,6 +165,9 @@ class Dataset { ...@@ -165,6 +165,9 @@ class Dataset {
virtual std::vector<std::string> GetSlots() = 0; virtual std::vector<std::string> GetSlots() = 0;
virtual void SetGpuGraphMode(int is_graph_mode) = 0;
virtual int GetGpuGraphMode() = 0;
protected: protected:
virtual int ReceiveFromClient(int msg_type, virtual int ReceiveFromClient(int msg_type,
int client_id, int client_id,
...@@ -213,6 +216,8 @@ class DatasetImpl : public Dataset { ...@@ -213,6 +216,8 @@ class DatasetImpl : public Dataset {
virtual std::pair<std::string, std::string> GetHdfsConfig() { virtual std::pair<std::string, std::string> GetHdfsConfig() {
return std::make_pair(fs_name_, fs_ugi_); return std::make_pair(fs_name_, fs_ugi_);
} }
virtual void SetGpuGraphMode(int is_graph_mode);
virtual int GetGpuGraphMode();
virtual std::string GetDownloadCmd(); virtual std::string GetDownloadCmd();
virtual const paddle::framework::DataFeedDesc& GetDataFeedDesc() { virtual const paddle::framework::DataFeedDesc& GetDataFeedDesc() {
return data_feed_desc_; return data_feed_desc_;
...@@ -272,7 +277,9 @@ class DatasetImpl : public Dataset { ...@@ -272,7 +277,9 @@ class DatasetImpl : public Dataset {
return multi_consume_channel_; return multi_consume_channel_;
} }
} }
// Returns a mutable reference to the flat list of keys collected in GPU graph
// mode (gpu_graph_total_keys_). The reference stays valid only as long as
// this dataset object is alive.
std::vector<uint64_t>& GetGpuGraphTotalKeys() {
return gpu_graph_total_keys_;
}
Channel<T>& GetInputChannelRef() { return input_channel_; } Channel<T>& GetInputChannelRef() { return input_channel_; }
protected: protected:
...@@ -333,6 +340,10 @@ class DatasetImpl : public Dataset { ...@@ -333,6 +340,10 @@ class DatasetImpl : public Dataset {
std::vector<T> input_records_; // only for paddleboxdatafeed std::vector<T> input_records_; // only for paddleboxdatafeed
std::vector<std::string> use_slots_; std::vector<std::string> use_slots_;
bool enable_heterps_ = false; bool enable_heterps_ = false;
int gpu_graph_mode_ = 0;
// std::vector<std::vector<int64_t>> gpu_graph_device_keys_;
std::vector<std::vector<std::vector<uint64_t>>> graph_all_type_total_keys_;
std::vector<uint64_t> gpu_graph_total_keys_;
}; };
// use std::vector<MultiSlotType> or Record as data type // use std::vector<MultiSlotType> or Record as data type
......
...@@ -14,8 +14,8 @@ limitations under the License. */ ...@@ -14,8 +14,8 @@ limitations under the License. */
#include "paddle/fluid/framework/device_worker.h" #include "paddle/fluid/framework/device_worker.h"
#include <chrono>
#include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/convert_utils.h"
namespace phi { namespace phi {
class DenseTensor; class DenseTensor;
} // namespace phi } // namespace phi
...@@ -32,48 +32,179 @@ void DeviceWorker::SetDataFeed(DataFeed* data_feed) { ...@@ -32,48 +32,179 @@ void DeviceWorker::SetDataFeed(DataFeed* data_feed) {
} }
template <typename T> template <typename T>
std::string PrintLodTensorType(Tensor* tensor, int64_t start, int64_t end) { std::string PrintLodTensorType(Tensor* tensor,
int64_t start,
int64_t end,
char separator = ',',
bool need_leading_separator = true) {
auto count = tensor->numel(); auto count = tensor->numel();
if (start < 0 || end > count) { if (start < 0 || end > count) {
VLOG(3) << "access violation"; VLOG(3) << "access violation";
return "access violation"; return "access violation";
} }
if (start >= end) return "";
std::ostringstream os; std::ostringstream os;
if (!need_leading_separator) {
os << tensor->data<T>()[start];
start++;
}
for (int64_t i = start; i < end; i++) { for (int64_t i = start; i < end; i++) {
os << ":" << tensor->data<T>()[i]; // os << ":" << tensor->data<T>()[i];
os << separator << tensor->data<T>()[i];
} }
return os.str(); return os.str();
} }
// Appends the elements [start, end) of `tensor`, formatted with
// std::to_string, to `out_val`.
//
// Every element is preceded by `separator`, except that the first element
// gets no separator when `need_leading_separator` is false. If the range
// falls outside [0, tensor->numel()], "access violation" is appended instead;
// an empty range appends nothing.
template <typename T>
void PrintLodTensorType(Tensor* tensor,
                        int64_t start,
                        int64_t end,
                        std::string& out_val,
                        char separator = ',',
                        bool need_leading_separator = true) {
  const auto element_count = tensor->numel();
  const bool out_of_range = start < 0 || end > element_count;
  if (out_of_range) {
    VLOG(3) << "access violation";
    out_val += "access violation";
    return;
  }
  if (start >= end) {
    return;
  }

  // Only the very first element may be written without a separator.
  bool emit_separator = need_leading_separator;
  for (int64_t pos = start; pos < end; ++pos) {
    if (emit_separator) {
      out_val += separator;
    }
    out_val += std::to_string(tensor->data<T>()[pos]);
    emit_separator = true;
  }
}
std::string PrintLodTensorIntType(Tensor* tensor, int64_t start, int64_t end) { #define FLOAT_EPS 1e-8
// Scratch buffer size for formatting one float with "%.9f". The largest
// finite float has 39 integer digits, so the longest output is
// sign + 39 digits + '.' + 9 fraction digits + NUL = 51 bytes; the previous
// value of 40 could be overrun.
#define MAX_FLOAT_BUFF_SIZE 64
// float specialization: appends the elements [start, end) of `tensor` to
// `out_val` with nine fractional digits, writing values whose magnitude is
// below FLOAT_EPS as a plain "0".
//
// Every element is preceded by `separator`, except that the first element
// gets no separator when `need_leading_separator` is false. If the range
// falls outside [0, tensor->numel()], "access violation" is appended instead;
// an empty range appends nothing.
template <>
void PrintLodTensorType<float>(Tensor* tensor,
                               int64_t start,
                               int64_t end,
                               std::string& out_val,
                               char separator,
                               bool need_leading_separator) {
  char buf[MAX_FLOAT_BUFF_SIZE];
  auto count = tensor->numel();
  if (start < 0 || end > count) {
    VLOG(3) << "access violation";
    out_val += "access violation";
    return;
  }
  if (start >= end) return;
  for (int64_t i = start; i < end; i++) {
    if (i != start || need_leading_separator) out_val += separator;
    const float value = tensor->data<float>()[i];
    if (value > -FLOAT_EPS && value < FLOAT_EPS) {
      out_val += "0";
    } else {
      // Bounded write: never runs past buf, whatever the value is.
      snprintf(buf, sizeof(buf), "%.9f", value);
      out_val += buf;
    }
  }
}
std::string PrintLodTensorIntType(Tensor* tensor,
int64_t start,
int64_t end,
char separator = ',',
bool need_leading_separator = true) {
auto count = tensor->numel(); auto count = tensor->numel();
if (start < 0 || end > count) { if (start < 0 || end > count) {
VLOG(3) << "access violation"; VLOG(3) << "access violation";
return "access violation"; return "access violation";
} }
if (start >= end) return "";
std::ostringstream os; std::ostringstream os;
if (!need_leading_separator) {
os << static_cast<uint64_t>(tensor->data<int64_t>()[start]);
start++;
}
for (int64_t i = start; i < end; i++) { for (int64_t i = start; i < end; i++) {
os << ":" << static_cast<uint64_t>(tensor->data<int64_t>()[i]); // os << ":" << static_cast<uint64_t>(tensor->data<int64_t>()[i]);
os << separator << static_cast<uint64_t>(tensor->data<int64_t>()[i]);
} }
return os.str(); return os.str();
} }
std::string PrintLodTensor(Tensor* tensor, int64_t start, int64_t end) { void PrintLodTensorIntType(Tensor* tensor,
int64_t start,
int64_t end,
std::string& out_val,
char separator = ',',
bool need_leading_separator = true) {
auto count = tensor->numel();
if (start < 0 || end > count) {
VLOG(3) << "access violation";
out_val += "access violation";
return;
}
if (start >= end) return;
if (!need_leading_separator) {
out_val +=
std::to_string(static_cast<uint64_t>(tensor->data<int64_t>()[start]));
start++;
}
for (int64_t i = start; i < end; i++) {
// os << ":" << static_cast<uint64_t>(tensor->data<int64_t>()[i]);
// os << separator << static_cast<uint64_t>(tensor->data<int64_t>()[i]);
out_val += separator;
out_val +=
std::to_string(static_cast<uint64_t>(tensor->data<int64_t>()[i]));
}
// return os.str();
}
// Returns the textual form of elements [start, end) of `tensor`, dispatching
// on the tensor's element type: FP32 and FP64 go through PrintLodTensorType,
// INT64 goes through PrintLodTensorIntType. Any other element type yields the
// literal text "unsupported type". `separator` and `need_leading_separator`
// are forwarded unchanged to the type-specific formatter.
std::string PrintLodTensor(Tensor* tensor,
int64_t start,
int64_t end,
char separator,
bool need_leading_separator) {
std::string out_val; std::string out_val;
if (framework::TransToProtoVarType(tensor->dtype()) == proto::VarType::FP32) { if (framework::TransToProtoVarType(tensor->dtype()) == proto::VarType::FP32) {
out_val = PrintLodTensorType<float>(tensor, start, end); out_val = PrintLodTensorType<float>(
tensor, start, end, separator, need_leading_separator);
} else if (framework::TransToProtoVarType(tensor->dtype()) == } else if (framework::TransToProtoVarType(tensor->dtype()) ==
proto::VarType::INT64) { proto::VarType::INT64) {
out_val = PrintLodTensorIntType(tensor, start, end); out_val = PrintLodTensorIntType(
tensor, start, end, separator, need_leading_separator);
} else if (framework::TransToProtoVarType(tensor->dtype()) == } else if (framework::TransToProtoVarType(tensor->dtype()) ==
proto::VarType::FP64) { proto::VarType::FP64) {
out_val = PrintLodTensorType<double>(tensor, start, end); out_val = PrintLodTensorType<double>(
tensor, start, end, separator, need_leading_separator);
} else { } else {
out_val = "unsupported type"; out_val = "unsupported type";
} }
return out_val; return out_val;
} }
// Appends the textual form of elements [start, end) of `tensor` to `out_val`,
// picking the formatter from the tensor's element type. FP32, INT64 and FP64
// are supported; for any other element type the literal text
// "unsupported type" is appended instead. `separator` and
// `need_leading_separator` are passed through to the formatter untouched.
void PrintLodTensor(Tensor* tensor,
                    int64_t start,
                    int64_t end,
                    std::string& out_val,
                    char separator,
                    bool need_leading_separator) {
  // Resolve the element type once and reuse it for every comparison.
  const auto element_type = framework::TransToProtoVarType(tensor->dtype());

  if (element_type == proto::VarType::FP32) {
    PrintLodTensorType<float>(
        tensor, start, end, out_val, separator, need_leading_separator);
    return;
  }
  if (element_type == proto::VarType::INT64) {
    PrintLodTensorIntType(
        tensor, start, end, out_val, separator, need_leading_separator);
    return;
  }
  if (element_type == proto::VarType::FP64) {
    PrintLodTensorType<double>(
        tensor, start, end, out_val, separator, need_leading_separator);
    return;
  }

  out_val += "unsupported type";
}
std::pair<int64_t, int64_t> GetTensorBound(LoDTensor* tensor, int index) { std::pair<int64_t, int64_t> GetTensorBound(LoDTensor* tensor, int index) {
auto& dims = tensor->dims(); auto& dims = tensor->dims();
if (tensor->lod().size() != 0) { if (tensor->lod().size() != 0) {
...@@ -122,6 +253,11 @@ void DeviceWorker::DumpParam(const Scope& scope, const int batch_id) { ...@@ -122,6 +253,11 @@ void DeviceWorker::DumpParam(const Scope& scope, const int batch_id) {
} }
void DeviceWorker::InitRandomDumpConfig(const TrainerDesc& desc) { void DeviceWorker::InitRandomDumpConfig(const TrainerDesc& desc) {
bool is_dump_in_simple_mode = desc.is_dump_in_simple_mode();
if (is_dump_in_simple_mode) {
dump_mode_ = 3;
return;
}
bool enable_random_dump = desc.enable_random_dump(); bool enable_random_dump = desc.enable_random_dump();
if (!enable_random_dump) { if (!enable_random_dump) {
dump_mode_ = 0; dump_mode_ = 0;
...@@ -140,16 +276,124 @@ void DeviceWorker::DumpField(const Scope& scope, ...@@ -140,16 +276,124 @@ void DeviceWorker::DumpField(const Scope& scope,
int dump_interval) { // dump_mode: 0: no random, int dump_interval) { // dump_mode: 0: no random,
// 1: random with insid hash, // 1: random with insid hash,
// 2: random with random // 2: random with random
// number // 3: simple mode using multi-threads, for gpugraphps-mode
auto start1 = std::chrono::steady_clock::now();
size_t batch_size = device_reader_->GetCurBatchSize(); size_t batch_size = device_reader_->GetCurBatchSize();
auto& ins_id_vec = device_reader_->GetInsIdVec(); auto& ins_id_vec = device_reader_->GetInsIdVec();
auto& ins_content_vec = device_reader_->GetInsContentVec(); auto& ins_content_vec = device_reader_->GetInsContentVec();
if (ins_id_vec.size() > 0) { if (dump_mode_ == 3) {
batch_size = std::string::npos;
bool has_valid_batch = false;
for (auto& field : *dump_fields_) {
Variable* var = scope.FindVar(field);
if (var == nullptr) {
VLOG(0) << "Note: field[" << field
<< "] cannot be find in scope, so it was skipped.";
continue;
}
LoDTensor* tensor = var->GetMutable<LoDTensor>();
if (!tensor->IsInitialized()) {
VLOG(0) << "Note: field[" << field
<< "] is not initialized, so it was skipped.";
continue;
}
auto& dims = tensor->dims();
if (dims.size() == 2 && dims[0] > 0) {
batch_size = std::min(batch_size, static_cast<size_t>(dims[0]));
// VLOG(0)<<"in dump field ---> "<<field<<" dim_size = "<<dims[0]<<"
// "<<dims[1]<<" batch_size = "<<batch_size;
has_valid_batch = true;
}
}
if (!has_valid_batch) return;
} else if (ins_id_vec.size() > 0) {
batch_size = ins_id_vec.size(); batch_size = ins_id_vec.size();
} }
std::vector<std::string> ars(batch_size); std::vector<std::string> ars(batch_size);
std::vector<bool> hit(batch_size, false); if (dump_mode_ == 3) {
if (dump_fields_ == NULL || (*dump_fields_).size() == 0) {
return;
}
auto set_output_str = [&, this](
size_t begin, size_t end, LoDTensor* tensor) {
std::pair<int64_t, int64_t> bound;
auto& dims = tensor->dims();
for (size_t i = begin; i < end; ++i) {
bound = {i * dims[1], (i + 1) * dims[1]};
// auto bound = GetTensorBound(tensor, i);
if (ars[i].size() > 0) ars[i] += "\t";
// ars[i] += '[';
PrintLodTensor(tensor, bound.first, bound.second, ars[i], ' ', false);
// ars[i] += ']';
// ars[i] += "<" + PrintLodTensor(tensor, bound.first, bound.second, '
// ', false) + ">";
}
};
std::vector<std::thread> threads(tensor_iterator_thread_num);
for (auto& field : *dump_fields_) {
Variable* var = scope.FindVar(field);
if (var == nullptr) {
VLOG(0) << "Note: field[" << field
<< "] cannot be find in scope, so it was skipped.";
continue;
}
LoDTensor* tensor = var->GetMutable<LoDTensor>();
if (!tensor->IsInitialized()) {
VLOG(0) << "Note: field[" << field
<< "] is not initialized, so it was skipped.";
continue;
}
framework::LoDTensor cpu_tensor;
if (platform::is_gpu_place(tensor->place())) {
TensorCopySync(*tensor, platform::CPUPlace(), &cpu_tensor);
cpu_tensor.set_lod(tensor->lod());
tensor = &cpu_tensor;
}
auto& dims = tensor->dims();
if (dims.size() != 2 || dims[0] <= 0) {
VLOG(0) << "Note: field[" << field
<< "] cannot pass check, so it was "
"skipped. Maybe the dimension is "
"wrong ";
VLOG(0) << dims.size() << " " << dims[0] << " * " << dims[1];
continue;
}
size_t acutal_thread_num =
std::min((size_t)batch_size, tensor_iterator_thread_num);
for (size_t i = 0; i < acutal_thread_num; i++) {
size_t average_size = batch_size / acutal_thread_num;
size_t begin =
average_size * i + std::min(batch_size % acutal_thread_num, i);
size_t end =
begin + average_size + (i < batch_size % acutal_thread_num ? 1 : 0);
threads[i] = std::thread(set_output_str, begin, end, tensor);
}
for (size_t i = 0; i < acutal_thread_num; i++) threads[i].join();
}
auto end1 = std::chrono::steady_clock::now();
auto tt =
std::chrono::duration_cast<std::chrono::microseconds>(end1 - start1);
VLOG(1) << "writing a batch takes " << tt.count() << " us";
size_t acutal_thread_num =
std::min((size_t)batch_size, tensor_iterator_thread_num);
for (size_t i = 0; i < acutal_thread_num; i++) {
size_t average_size = batch_size / acutal_thread_num;
size_t begin =
average_size * i + std::min(batch_size % acutal_thread_num, i);
size_t end =
begin + average_size + (i < batch_size % acutal_thread_num ? 1 : 0);
for (size_t j = begin + 1; j < end; j++) {
if (ars[begin].size() > 0 && ars[j].size() > 0) ars[begin] += "\n";
ars[begin] += ars[j];
}
if (ars[begin].size() > 0) writer_ << ars[begin];
}
return;
}
std::vector<bool> hit(batch_size, false);
std::default_random_engine engine(0); std::default_random_engine engine(0);
std::uniform_int_distribution<size_t> dist(0U, INT_MAX); std::uniform_int_distribution<size_t> dist(0U, INT_MAX);
for (size_t i = 0; i < batch_size; i++) { for (size_t i = 0; i < batch_size; i++) {
...@@ -206,6 +450,7 @@ void DeviceWorker::DumpField(const Scope& scope, ...@@ -206,6 +450,7 @@ void DeviceWorker::DumpField(const Scope& scope,
ars[i] += PrintLodTensor(tensor, bound.first, bound.second); ars[i] += PrintLodTensor(tensor, bound.first, bound.second);
} }
} }
// #pragma omp parallel for // #pragma omp parallel for
for (size_t i = 0; i < ars.size(); i++) { for (size_t i = 0; i < ars.size(); i++) {
if (ars[i].length() == 0) { if (ars[i].length() == 0) {
......
...@@ -31,6 +31,7 @@ limitations under the License. */ ...@@ -31,6 +31,7 @@ limitations under the License. */
#include "paddle/fluid/distributed/ps/wrapper/fleet.h" #include "paddle/fluid/distributed/ps/wrapper/fleet.h"
#endif #endif
#include <map>
#include "paddle/fluid/framework/data_feed.h" #include "paddle/fluid/framework/data_feed.h"
#include "paddle/fluid/framework/executor_gc_helper.h" #include "paddle/fluid/framework/executor_gc_helper.h"
#include "paddle/fluid/framework/heter_util.h" #include "paddle/fluid/framework/heter_util.h"
...@@ -59,7 +60,17 @@ class Scope; ...@@ -59,7 +60,17 @@ class Scope;
namespace paddle { namespace paddle {
namespace framework { namespace framework {
std::string PrintLodTensor(Tensor* tensor, int64_t start, int64_t end); std::string PrintLodTensor(Tensor* tensor,
int64_t start,
int64_t end,
char separator = ',',
bool need_leading_separator = false);
void PrintLodTensor(Tensor* tensor,
int64_t start,
int64_t end,
std::string& output_str,
char separator = ',',
bool need_leading_separator = false);
std::pair<int64_t, int64_t> GetTensorBound(LoDTensor* tensor, int index); std::pair<int64_t, int64_t> GetTensorBound(LoDTensor* tensor, int index);
bool CheckValidOutput(LoDTensor* tensor, size_t batch_size); bool CheckValidOutput(LoDTensor* tensor, size_t batch_size);
...@@ -230,6 +241,7 @@ class DeviceWorker { ...@@ -230,6 +241,7 @@ class DeviceWorker {
int dump_mode_ = 0; int dump_mode_ = 0;
int dump_interval_ = 10000; int dump_interval_ = 10000;
ChannelWriter<std::string> writer_; ChannelWriter<std::string> writer_;
const size_t tensor_iterator_thread_num = 16;
platform::DeviceContext* dev_ctx_ = nullptr; platform::DeviceContext* dev_ctx_ = nullptr;
}; };
...@@ -772,7 +784,6 @@ class HeterSectionWorker : public DeviceWorker { ...@@ -772,7 +784,6 @@ class HeterSectionWorker : public DeviceWorker {
static uint64_t batch_id_; static uint64_t batch_id_;
uint64_t total_ins_num_ = 0; uint64_t total_ins_num_ = 0;
platform::DeviceContext* dev_ctx_ = nullptr; platform::DeviceContext* dev_ctx_ = nullptr;
bool debug_ = false; bool debug_ = false;
std::vector<double> op_total_time_; std::vector<double> op_total_time_;
std::vector<std::string> op_name_; std::vector<std::string> op_name_;
......
...@@ -29,7 +29,7 @@ TEST(LodTensor, PrintLodTensor) { ...@@ -29,7 +29,7 @@ TEST(LodTensor, PrintLodTensor) {
std::string res = PrintLodTensor(&tensor1, -1, 2); std::string res = PrintLodTensor(&tensor1, -1, 2);
ASSERT_EQ(res, "access violation"); ASSERT_EQ(res, "access violation");
res = PrintLodTensor(&tensor1, 0, 2); res = PrintLodTensor(&tensor1, 0, 2);
ASSERT_EQ(res, ":0.2:0.5"); ASSERT_EQ(res, "0.2,0.5");
LoDTensor tensor2; LoDTensor tensor2;
tensor2.Resize({2}); tensor2.Resize({2});
...@@ -39,7 +39,7 @@ TEST(LodTensor, PrintLodTensor) { ...@@ -39,7 +39,7 @@ TEST(LodTensor, PrintLodTensor) {
res = PrintLodTensor(&tensor2, -1, 2); res = PrintLodTensor(&tensor2, -1, 2);
ASSERT_EQ(res, "access violation"); ASSERT_EQ(res, "access violation");
res = PrintLodTensor(&tensor2, 0, 2); res = PrintLodTensor(&tensor2, 0, 2);
ASSERT_EQ(res, ":1:2"); ASSERT_EQ(res, "1,2");
LoDTensor tensor3; LoDTensor tensor3;
tensor3.Resize({2}); tensor3.Resize({2});
...@@ -47,7 +47,40 @@ TEST(LodTensor, PrintLodTensor) { ...@@ -47,7 +47,40 @@ TEST(LodTensor, PrintLodTensor) {
tensor3.data<double>()[0] = 0.1; tensor3.data<double>()[0] = 0.1;
tensor3.data<double>()[1] = 0.2; tensor3.data<double>()[1] = 0.2;
res = PrintLodTensor(&tensor3, 0, 2); res = PrintLodTensor(&tensor3, 0, 2);
ASSERT_EQ(res, ":0.1:0.2"); ASSERT_EQ(res, "0.1,0.2");
LoDTensor tensor4;
tensor4.Resize({2});
tensor4.mutable_data<double>(platform::CPUPlace());
tensor4.data<double>()[0] = 0.1;
tensor4.data<double>()[1] = 0.2;
res = "";
PrintLodTensor(&tensor4, 0, 2, res);
// ASSERT_EQ(res, "0.1,0.2");
LoDTensor tensor5;
tensor5.Resize({2});
tensor5.mutable_data<int64_t>(platform::CPUPlace());
tensor5.data<int64_t>()[0] = 1;
tensor5.data<int64_t>()[1] = 2;
res = "";
PrintLodTensor(&tensor5, -1, 2, res);
ASSERT_EQ(res, "access violation");
res = "";
PrintLodTensor(&tensor5, 0, 2, res);
ASSERT_EQ(res, "1,2");
LoDTensor tensor6;
tensor6.Resize({2});
tensor6.mutable_data<float>(platform::CPUPlace());
tensor6.data<float>()[0] = 0.2;
tensor6.data<float>()[1] = 0.5;
res = "";
PrintLodTensor(&tensor6, -1, 2, res);
// ASSERT_EQ(res, "access violation");
res = "";
PrintLodTensor(&tensor6, 0, 2, res);
// ASSERT_EQ(res, "0.2,0.5");
} }
TEST(LodTensor, GetTensorBound) { TEST(LodTensor, GetTensorBound) {
......
...@@ -207,6 +207,12 @@ message TableAccessorParameter { ...@@ -207,6 +207,12 @@ message TableAccessorParameter {
repeated TableAccessorSaveParameter table_accessor_save_param = 8; repeated TableAccessorSaveParameter table_accessor_save_param = 8;
optional SGDParameter embed_sgd_param = 10; optional SGDParameter embed_sgd_param = 10;
optional SGDParameter embedx_sgd_param = 11; optional SGDParameter embedx_sgd_param = 11;
optional GraphSGDParameter graph_sgd_param = 12;
}
// SGD settings for graph tables; referenced from TableAccessorParameter via
// its graph_sgd_param field.
message GraphSGDParameter {
// Defaults to 9008. NOTE(review): presumably the slot id that carries graph
// node ids -- confirm against the code that reads this field.
optional uint32 nodeid_slot = 1 [ default = 9008 ];
// Defaults to 0.05. NOTE(review): presumably the learning rate applied to
// feature embeddings -- confirm against the optimizer that reads it.
optional float feature_learning_rate = 2 [ default = 0.05 ];
} }
message SGDParameter { message SGDParameter {
......
...@@ -51,6 +51,8 @@ ...@@ -51,6 +51,8 @@
} }
#endif #endif
DECLARE_bool(gpugraph_enable_hbm_table_collision_stat);
// TODO: can we do this more efficiently? // TODO: can we do this more efficiently?
__inline__ __device__ int8_t atomicCAS(int8_t* address, __inline__ __device__ int8_t atomicCAS(int8_t* address,
int8_t compare, int8_t compare,
...@@ -330,8 +332,7 @@ template <typename Key, ...@@ -330,8 +332,7 @@ template <typename Key,
Key unused_key, Key unused_key,
typename Hasher = default_hash<Key>, typename Hasher = default_hash<Key>,
typename Equality = equal_to<Key>, typename Equality = equal_to<Key>,
typename Allocator = managed_allocator<thrust::pair<Key, Element>>, typename Allocator = managed_allocator<thrust::pair<Key, Element>>>
bool count_collisions = false>
class concurrent_unordered_map : public managed { class concurrent_unordered_map : public managed {
public: public:
using size_type = size_t; using size_type = size_t;
...@@ -363,9 +364,12 @@ class concurrent_unordered_map : public managed { ...@@ -363,9 +364,12 @@ class concurrent_unordered_map : public managed {
m_allocator(a), m_allocator(a),
m_hashtbl_size(n), m_hashtbl_size(n),
m_hashtbl_capacity(n), m_hashtbl_capacity(n),
m_collisions(0), m_unused_element(unused_element),
m_unused_element( m_enable_collision_stat(false),
unused_element) { // allocate the raw data of hash table: m_insert_times(0),
m_insert_collisions(0),
m_query_times(0),
m_query_collisions(0) { // allocate the raw data of hash table:
// m_hashtbl_values,pre-alloc it on current GPU if UM. // m_hashtbl_values,pre-alloc it on current GPU if UM.
m_hashtbl_values = m_allocator.allocate(m_hashtbl_capacity); m_hashtbl_values = m_allocator.allocate(m_hashtbl_capacity);
constexpr int block_size = 128; constexpr int block_size = 128;
...@@ -390,9 +394,9 @@ class concurrent_unordered_map : public managed { ...@@ -390,9 +394,9 @@ class concurrent_unordered_map : public managed {
// Initialize kernel, set all entry to unused <K,V> // Initialize kernel, set all entry to unused <K,V>
init_hashtbl<<<((m_hashtbl_size - 1) / block_size) + 1, block_size>>>( init_hashtbl<<<((m_hashtbl_size - 1) / block_size) + 1, block_size>>>(
m_hashtbl_values, m_hashtbl_size, unused_key, m_unused_element); m_hashtbl_values, m_hashtbl_size, unused_key, m_unused_element);
// CUDA_RT_CALL( cudaGetLastError() );
CUDA_RT_CALL(cudaStreamSynchronize(0)); CUDA_RT_CALL(cudaStreamSynchronize(0));
CUDA_RT_CALL(cudaGetLastError()); CUDA_RT_CALL(cudaGetLastError());
m_enable_collision_stat = FLAGS_gpugraph_enable_hbm_table_collision_stat;
} }
~concurrent_unordered_map() { ~concurrent_unordered_map() {
...@@ -572,11 +576,16 @@ class concurrent_unordered_map : public managed { ...@@ -572,11 +576,16 @@ class concurrent_unordered_map : public managed {
// TODO: How to handle data types less than 32 bits? // TODO: How to handle data types less than 32 bits?
if (keys_equal(unused_key, old_key) || keys_equal(insert_key, old_key)) { if (keys_equal(unused_key, old_key) || keys_equal(insert_key, old_key)) {
update_existing_value(existing_value, x, op); update_existing_value(existing_value, x, op);
insert_success = true; insert_success = true;
if (m_enable_collision_stat) {
atomicAdd(&m_insert_times, 1);
}
break; break;
} }
if (m_enable_collision_stat) {
atomicAdd(&m_insert_collisions, 1);
}
current_index = (current_index + 1) % hashtbl_size; current_index = (current_index + 1) % hashtbl_size;
current_hash_bucket = &(hashtbl_values[current_index]); current_hash_bucket = &(hashtbl_values[current_index]);
} }
...@@ -614,9 +623,9 @@ std::numeric_limits<mapped_type>::is_integer && sizeof(unsigned long long int) ...@@ -614,9 +623,9 @@ std::numeric_limits<mapped_type>::is_integer && sizeof(unsigned long long int)
reinterpret_cast<unsigned long long reinterpret_cast<unsigned long long
int*>(tmp_it), unused, value ); if ( old_val == unused ) { it = tmp_it; int*>(tmp_it), unused, value ); if ( old_val == unused ) { it = tmp_it;
} }
else if ( count_collisions ) else if ( m_enable_collision_stat )
{ {
atomicAdd( &m_collisions, 1 ); atomicAdd( &m_insert_collisions, 1 );
} }
} else { } else {
const key_type old_key = atomicCAS( &(tmp_it->first), unused_key, const key_type old_key = atomicCAS( &(tmp_it->first), unused_key,
...@@ -625,9 +634,9 @@ x.first ); ...@@ -625,9 +634,9 @@ x.first );
(m_hashtbl_values+hash_tbl_idx)->second = x.second; (m_hashtbl_values+hash_tbl_idx)->second = x.second;
it = tmp_it; it = tmp_it;
} }
else if ( count_collisions ) else if ( m_enable_collision_stat )
{ {
atomicAdd( &m_collisions, 1 ); atomicAdd( &m_insert_collisions, 1 );
} }
} }
#else #else
...@@ -648,8 +657,7 @@ x.second ); ...@@ -648,8 +657,7 @@ x.second );
} }
*/ */
__forceinline__ __host__ __device__ const_iterator __forceinline__ __device__ const_iterator find(const key_type& k) {
find(const key_type& k) const {
size_type key_hash = m_hf(k); size_type key_hash = m_hf(k);
size_type hash_tbl_idx = key_hash % m_hashtbl_size; size_type hash_tbl_idx = key_hash % m_hashtbl_size;
...@@ -667,10 +675,17 @@ x.second ); ...@@ -667,10 +675,17 @@ x.second );
begin_ptr = m_hashtbl_values + m_hashtbl_size; begin_ptr = m_hashtbl_values + m_hashtbl_size;
break; break;
} }
if (m_enable_collision_stat) {
atomicAdd(&m_query_collisions, 1);
}
hash_tbl_idx = (hash_tbl_idx + 1) % m_hashtbl_size; hash_tbl_idx = (hash_tbl_idx + 1) % m_hashtbl_size;
++counter; ++counter;
} }
if (m_enable_collision_stat) {
atomicAdd(&m_query_times, 1);
}
return const_iterator( return const_iterator(
m_hashtbl_values, m_hashtbl_values + m_hashtbl_size, begin_ptr); m_hashtbl_values, m_hashtbl_values + m_hashtbl_size, begin_ptr);
} }
...@@ -770,7 +785,7 @@ x.second ); ...@@ -770,7 +785,7 @@ x.second );
int assign_async(const concurrent_unordered_map& other, int assign_async(const concurrent_unordered_map& other,
cudaStream_t stream = 0) { cudaStream_t stream = 0) {
m_collisions = other.m_collisions; m_insert_collisions = other.m_insert_collisions;
if (other.m_hashtbl_size <= m_hashtbl_capacity) { if (other.m_hashtbl_size <= m_hashtbl_capacity) {
m_hashtbl_size = other.m_hashtbl_size; m_hashtbl_size = other.m_hashtbl_size;
} else { } else {
...@@ -795,10 +810,15 @@ x.second ); ...@@ -795,10 +810,15 @@ x.second );
0, 0,
stream>>>( stream>>>(
m_hashtbl_values, m_hashtbl_size, unused_key, m_unused_element); m_hashtbl_values, m_hashtbl_size, unused_key, m_unused_element);
if (count_collisions) m_collisions = 0; if (m_enable_collision_stat) {
m_insert_times = 0;
m_insert_collisions = 0;
m_query_times = 0;
m_query_collisions = 0;
}
} }
// Returns the table's accumulated collision counter.
unsigned long long get_num_collisions() const { return m_collisions; } unsigned long long get_num_collisions() const { return m_insert_collisions; }
void print() { void print() {
for (size_type i = 0; i < 5; ++i) { for (size_type i = 0; i < 5; ++i) {
...@@ -850,6 +870,21 @@ x.second ); ...@@ -850,6 +870,21 @@ x.second );
return it; return it;
} }
// Prints the insert and query collision statistics of this table to stdout.
//
// Does nothing unless collision statistics are enabled for the table.
// Each group is printed as (operations:collisions:collisions-per-operation).
//
// @param id  identifier printed in the message to tell tables apart.
__host__ void print_collision(int id) {
  if (!m_enable_collision_stat) {
    return;
  }
  // Report a ratio of 0 when nothing was counted, rather than dividing by
  // zero and printing nan/inf.
  const double insert_ratio =
      m_insert_times == 0
          ? 0.0
          : m_insert_collisions / static_cast<double>(m_insert_times);
  const double query_ratio =
      m_query_times == 0
          ? 0.0
          : m_query_collisions / static_cast<double>(m_query_times);
  // uint64_t is `unsigned long` on some platforms and `unsigned long long` on
  // others; cast explicitly so the format specifier always matches.
  printf(
      "collision stat for hbm table %d, insert(%llu:%llu:%.2f), "
      "query(%llu:%llu:%.2f)\n",
      id,
      static_cast<unsigned long long>(m_insert_times),
      static_cast<unsigned long long>(m_insert_collisions),
      insert_ratio,
      static_cast<unsigned long long>(m_query_times),
      static_cast<unsigned long long>(m_query_collisions),
      query_ratio);
}
private: private:
const hasher m_hf; const hasher m_hf;
const key_equal m_equal; const key_equal m_equal;
...@@ -862,7 +897,11 @@ x.second ); ...@@ -862,7 +897,11 @@ x.second );
size_type m_hashtbl_capacity; size_type m_hashtbl_capacity;
value_type* m_hashtbl_values; value_type* m_hashtbl_values;
unsigned long long m_collisions; bool m_enable_collision_stat;
uint64_t m_insert_times;
uint64_t m_insert_collisions;
uint64_t m_query_times;
uint64_t m_query_collisions;
}; };
#endif // CONCURRENT_UNORDERED_MAP_CUH #endif // CONCURRENT_UNORDERED_MAP_CUH
...@@ -13,11 +13,16 @@ limitations under the License. */ ...@@ -13,11 +13,16 @@ limitations under the License. */
#ifdef PADDLE_WITH_HETERPS #ifdef PADDLE_WITH_HETERPS
#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" #include "paddle/fluid/framework/fleet/heter_ps/feature_value.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
template <typename FVAccessor> const int CUDA_NUM_THREADS = platform::PADDLE_CUDA_NUM_THREADS;
// Number of thread blocks needed to cover N work items with CUDA_NUM_THREADS
// threads per block (ceiling division). N is parenthesized so that compound
// arguments such as `a + b` or `cond ? x : y` expand with the intended
// precedence.
#define GET_BLOCK(N) (((N) + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS)
// First three kernel launch arguments for N work items: grid size, block size
// and 0 for the third argument. Call sites in this file append the stream as
// the fourth argument.
#define CUDA_BLOCK(N) GET_BLOCK(N), CUDA_NUM_THREADS, 0
template <typename GPUAccessor>
__global__ void PullCopy(float** dest, __global__ void PullCopy(float** dest,
const float* src, const float* src,
const int64_t* len, const int64_t* len,
...@@ -26,7 +31,7 @@ __global__ void PullCopy(float** dest, ...@@ -26,7 +31,7 @@ __global__ void PullCopy(float** dest,
uint64_t** keys, uint64_t** keys,
uint64_t max_val_size, uint64_t max_val_size,
int* gpu_dim, int* gpu_dim,
FVAccessor feature_value_accessor) { GPUAccessor gpu_accessor) {
CUDA_KERNEL_LOOP(i, total_len) { CUDA_KERNEL_LOOP(i, total_len) {
int low = 0; int low = 0;
int high = slot_num - 1; int high = slot_num - 1;
...@@ -42,12 +47,62 @@ __global__ void PullCopy(float** dest, ...@@ -42,12 +47,62 @@ __global__ void PullCopy(float** dest,
float* feature_value_ptr = float* feature_value_ptr =
(float*)((char*)src + uint64_t(i) * uint64_t(max_val_size)); (float*)((char*)src + uint64_t(i) * uint64_t(max_val_size));
int mf_dim = gpu_dim[x] - 3; int mf_dim = gpu_dim[x] - 3;
feature_value_accessor.Select( gpu_accessor.Select(
dest[x] + y * (mf_dim + 3), feature_value_ptr, keys[x] + y, mf_dim); dest[x] + y * (mf_dim + 3), feature_value_ptr, keys[x] + y, mf_dim);
} }
} }
template <typename FVAccessor> template <typename TAccess>
// Copies pulled feature values from the de-duplicated value buffer `src` into
// the per-slot output buffers `dest`.
//
// Every loop index addresses one output float: key i = idx / hidden and
// column off = idx % hidden. For key i, key2slot[i] selects the slot x and
// i - slot_lens[x] the row inside that slot's buffer (presumably slot_lens
// holds each slot's starting offset -- confirm against the caller).
// restore_idx[i] selects the record in `src`; records are max_val_size bytes
// apart.
//
// Column mapping: 0 -> show, 1 -> click, 2 -> embed_w, >= 3 -> embedx_w
// (written as 0 when the record's mf size is 0). Keys equal to 0 produce
// zeros.
//
// N is the total number of output floats to process.
__global__ void PullDedupCopy(const size_t N,
                              const uint64_t* total_keys,
                              float** dest,
                              const float* src,
                              const int64_t* slot_lens,
                              uint64_t max_val_size,
                              const int* slot_dims,
                              const int hidden,
                              const int* key2slot,
                              const uint32_t* restore_idx,
                              TAccess accessor) {
  CUDA_KERNEL_LOOP(idx, N) {
    int i = idx / hidden;
    int off = idx % hidden;

    int x = key2slot[i];
    int y = i - slot_lens[x];

    assert(slot_dims[x] == hidden);
    float* dest_ptr = dest[x] + y * hidden;
    // 0 key fill zero
    if (total_keys[i] == 0) {
      *(dest_ptr + off) = 0;
      // Skip only this element. A `return` here would end the whole thread
      // and drop any further indices the loop macro assigns to it.
      continue;
    }

    float* src_ptr = (float*)((char*)src + uint64_t(restore_idx[i]) *
                                               uint64_t(max_val_size));
    switch (off) {
      case 0:
        *(dest_ptr + off) = src_ptr[accessor.ShowIndex()];
        break;
      case 1:
        *(dest_ptr + off) = src_ptr[accessor.ClickIndex()];
        break;
      case 2:
        *(dest_ptr + off) = src_ptr[accessor.EmbedWIndex()];
        break;
      default:
        // No embedx values are stored while the mf size is 0.
        if (src_ptr[accessor.MfSizeIndex()] == 0) {
          *(dest_ptr + off) = 0;
        } else {
          *(dest_ptr + off) = src_ptr[accessor.EmbedxWIndex() + off - 3];
        }
        break;
    }
  }
}
template <typename GPUAccessor>
__global__ void PushCopyWithPool(float* dest, __global__ void PushCopyWithPool(float* dest,
float** src, float** src,
int64_t* len, int64_t* len,
...@@ -57,7 +112,7 @@ __global__ void PushCopyWithPool(float* dest, ...@@ -57,7 +112,7 @@ __global__ void PushCopyWithPool(float* dest,
int* slot_vector, int* slot_vector,
int* mf_dim_vector, int* mf_dim_vector,
size_t grad_value_size, size_t grad_value_size,
FVAccessor feature_value_accessor) { GPUAccessor gpu_accessor) {
CUDA_KERNEL_LOOP(i, total_len) { CUDA_KERNEL_LOOP(i, total_len) {
int low = 0; int low = 0;
int high = slot_num - 1; int high = slot_num - 1;
...@@ -72,24 +127,167 @@ __global__ void PushCopyWithPool(float* dest, ...@@ -72,24 +127,167 @@ __global__ void PushCopyWithPool(float* dest,
int y = i - (x ? len[low - 1] : 0); int y = i - (x ? len[low - 1] : 0);
float* cur = (float*)((char*)dest + i * grad_value_size); float* cur = (float*)((char*)dest + i * grad_value_size);
cur[feature_value_accessor.common_push_value.SlotIndex()] = cur[gpu_accessor.common_push_value.SlotIndex()] = (float)slot_vector[x];
(float)slot_vector[x];
int mf_dim = mf_dim_vector[x]; int mf_dim = mf_dim_vector[x];
cur[feature_value_accessor.common_push_value.MfDimIndex()] = mf_dim; cur[gpu_accessor.common_push_value.MfDimIndex()] = mf_dim;
cur[feature_value_accessor.common_push_value.ShowIndex()] = cur[gpu_accessor.common_push_value.ShowIndex()] =
*(src[x] + y * (mf_dim + 3)); *(src[x] + y * (mf_dim + 3));
cur[feature_value_accessor.common_push_value.ClickIndex()] = cur[gpu_accessor.common_push_value.ClickIndex()] =
*(src[x] + y * (mf_dim + 3) + 1); *(src[x] + y * (mf_dim + 3) + 1);
cur[feature_value_accessor.common_push_value.EmbedGIndex()] = cur[gpu_accessor.common_push_value.EmbedGIndex()] =
*(src[x] + y * (mf_dim + 3) + 2) * -1. * bs; *(src[x] + y * (mf_dim + 3) + 2) * -1. * bs;
for (int j = 0; j < mf_dim; j++) { for (int j = 0; j < mf_dim; j++) {
cur[feature_value_accessor.common_push_value.EmbedxGIndex() + j] = cur[gpu_accessor.common_push_value.EmbedxGIndex() + j] =
*(src[x] + y * (mf_dim + 3) + 3 + j) * -1. * bs; *(src[x] + y * (mf_dim + 3) + 3 + j) * -1. * bs;
} }
} }
} }
// Accumulates per-occurrence gradients from the per-slot buffers `src` into
// the de-duplicated gradient buffer `dest` using atomic adds.
//
// Every loop index addresses one input float: key occurrence i = idx / hidden
// and column off = idx % hidden. key2slot[i] selects the slot x and
// i - slot_lens[x] the row inside that slot's buffer (presumably slot_lens
// holds each slot's starting offset -- confirm against the caller).
// d_restore_idx[i] selects the destination record; records are
// grad_value_size bytes apart. Occurrences whose key is 0 are skipped.
//
// Column mapping: 0 -> writes slot and mf_dim, adds show; 1 -> adds click;
// 2 -> adds the embed gradient scaled by -bs; >= 3 -> adds the embedx
// gradient scaled by -bs.
//
// The kernel only adds, so `dest` must already hold the starting values; the
// host wrapper in this file zero-fills it before the launch.
template <typename TAccess>
__global__ void PushMergeCopyAtomic(const size_t N,
                                    const uint64_t* total_keys,
                                    float* dest,
                                    float** src,
                                    const int hidden,
                                    const int bs,
                                    const int* slot_vector,
                                    const int* slot_dims,
                                    const int64_t* slot_lens,
                                    const int* key2slot,
                                    const uint32_t* d_restore_idx,
                                    size_t grad_value_size,
                                    TAccess accessor) {
  CUDA_KERNEL_LOOP(idx, N) {
    int i = idx / hidden;
    int off = idx % hidden;
    // filter 0 keys
    if (total_keys[i] == 0) {
      // Skip only this element. A `return` here would end the whole thread
      // and drop any further indices the loop macro assigns to it.
      continue;
    }

    int x = key2slot[i];
    int y = i - slot_lens[x];

    const float* ptr = src[x] + y * hidden;
    float* cur = (float*)((char*)dest + d_restore_idx[i] * grad_value_size);
    int mf_dim = slot_dims[x] - 3;
    switch (off) {
      case 0:
        cur[accessor.SlotIndex()] = (float)slot_vector[x];
        cur[accessor.MfDimIndex()] = mf_dim;
        paddle::platform::CudaAtomicAdd(&cur[accessor.ShowIndex()],
                                        *(ptr + off));
        break;
      case 1:
        paddle::platform::CudaAtomicAdd(&cur[accessor.ClickIndex()],
                                        *(ptr + off));
        break;
      case 2:
        paddle::platform::CudaAtomicAdd(&cur[accessor.EmbedGIndex()],
                                        *(ptr + off) * -1. * bs);
        break;
      default: {
        const int embedx_idx = off - 3;
        // Valid embedx indices are 0 .. mf_dim - 1; columns past this slot's
        // own dimension carry no gradient and are ignored.
        if (embedx_idx < mf_dim) {
          paddle::platform::CudaAtomicAdd(
              &cur[accessor.EmbedxGIndex() + embedx_idx],
              *(ptr + off) * -1. * bs);
        }
        break;
      }
    }
  }
}
// Statement macro: adds into `val` the float at column `off` of every source
// row listed in d_sort_idx[start .. start + count).
//
// It takes no arguments and relies on these names being in scope at the point
// of expansion: count, start, d_sort_idx, key2slot, slot_lens, src, hidden,
// off, y and val. `y` is overwritten on every iteration; `pos` and `x` are
// declared inside the loop body and shadow any outer variables of the same
// name.
#define SUM_GRAD_VALUE \
for (uint32_t j = 0; j < count; ++j) { \
const uint32_t& pos = d_sort_idx[start + j]; \
const int& x = key2slot[pos]; \
y = pos - slot_lens[x]; \
val += *(reinterpret_cast<float*>(src[x] + y * hidden + off)); \
}
// Merges the gradients of all occurrences of each de-duplicated key into one
// record of `dest`, without atomics.
//
// Every loop index addresses one output float: de-duplicated key
// i = idx / hidden and column off = idx % hidden. Records in `dest` are
// grad_value_size bytes apart. For key i, d_sort_offset[i] and d_sort_cnt[i]
// give the range inside d_sort_idx listing the source positions to sum; the
// slot and mf_dim are taken from the first position in that range.
//
// Column mapping: 0 -> writes slot, mf_dim and the summed show; 1 -> summed
// click; 2 -> summed embed gradient scaled by -bs; >= 3 -> summed embedx
// gradient scaled by -bs. Keys equal to 0 get an all-zero record.
template <typename TAccess>
__global__ void PushMergeCopy(const size_t N,
                              const uint64_t* total_keys,
                              float* dest,
                              float** src,
                              const int hidden,
                              const int bs,
                              const int* slot_vector,
                              const int* slot_dims,
                              const int64_t* slot_lens,
                              const int* key2slot,
                              const uint32_t* d_sort_idx,
                              const uint32_t* d_sort_offset,
                              const uint32_t* d_sort_cnt,
                              size_t grad_value_size,
                              TAccess accessor) {
  CUDA_KERNEL_LOOP(idx, N) {
    int i = idx / hidden;
    int off = idx % hidden;
    // filter 0 keys
    float* cur = (float*)((char*)dest + i * grad_value_size);

    if (total_keys[i] == 0) {
      switch (off) {
        case 0:
          cur[accessor.SlotIndex()] = 0;
          cur[accessor.MfDimIndex()] = 0;
          cur[accessor.ShowIndex()] = 0.0;
          break;
        case 1:
          cur[accessor.ClickIndex()] = 0.0;
          break;
        case 2:
          cur[accessor.EmbedGIndex()] = 0.0;
          break;
        default:
          cur[accessor.EmbedxGIndex() + off - 3] = 0.0;
          break;
      }
      // Skip only this element. A `return` here would end the whole thread
      // and drop any further indices the loop macro assigns to it.
      continue;
    }

    const uint32_t& start = d_sort_offset[i];
    const uint32_t& count = d_sort_cnt[i];
    const uint32_t& pos = d_sort_idx[start];

    const int& x = key2slot[pos];
    int y = pos - slot_lens[x];
    int mf_dim = slot_dims[x] - 3;

    // Accumulate in double; SUM_GRAD_VALUE adds into `val` and reuses `y`.
    double val = 0.0;

    switch (off) {
      case 0:
        cur[accessor.SlotIndex()] = (float)slot_vector[x];
        cur[accessor.MfDimIndex()] = mf_dim;
        SUM_GRAD_VALUE
        cur[accessor.ShowIndex()] = val;
        break;
      case 1:
        SUM_GRAD_VALUE
        cur[accessor.ClickIndex()] = val;
        break;
      case 2:
        SUM_GRAD_VALUE
        cur[accessor.EmbedGIndex()] = val * -1. * bs;
        break;
      default: {
        const int embedx_idx = off - 3;
        // Valid embedx indices are 0 .. mf_dim - 1; columns past this slot's
        // own dimension carry no gradient and are written as zero.
        if (embedx_idx >= mf_dim) {
          cur[accessor.EmbedxGIndex() + embedx_idx] = 0.0;
        } else {
          SUM_GRAD_VALUE
          cur[accessor.EmbedxGIndex() + embedx_idx] = val * -1. * bs;
        }
        break;
      }
    }
  }
}
template <typename GPUAccessor> template <typename GPUAccessor>
void AccessorWrapper<GPUAccessor>::CopyForPullImpl( void AccessorWrapper<GPUAccessor>::CopyForPullImpl(
const paddle::platform::Place& place, const paddle::platform::Place& place,
...@@ -183,6 +381,118 @@ void AccessorWrapper<GPUAccessor>::CopyForPushImpl( ...@@ -183,6 +381,118 @@ void AccessorWrapper<GPUAccessor>::CopyForPushImpl(
cudaStreamSynchronize(stream); cudaStreamSynchronize(stream);
} }
// Launches PullDedupCopy on the stream of the device context registered for
// `place`, then blocks until that stream has finished.
//
// One work item is scheduled per output float, i.e.
// total_length * hidden_size items. `pull_value_size` is forwarded as the
// kernel's max_val_size argument.
template <typename GPUAccessor>
void AccessorWrapper<GPUAccessor>::CopyForPullDedupImpl(
    const paddle::platform::Place& place,
    const uint64_t* total_keys,
    float** gpu_values,
    const float* total_values_gpu,
    const int64_t* slot_lens,
    const int* key2slot,
    const int hidden_size,
    const int64_t total_length,
    const int* slot_dims,
    const uint32_t* gpu_restore_idx,
    int pull_value_size) {
  auto stream = dynamic_cast<paddle::platform::CUDADeviceContext*>(
                    paddle::platform::DeviceContextPool::Instance().Get(place))
                    ->stream();
  size_t N = total_length * hidden_size;
  // An empty input would ask for a grid of zero blocks, which the CUDA runtime
  // rejects as an invalid launch configuration; skip the launch in that case.
  if (N > 0) {
    PullDedupCopy<<<CUDA_BLOCK(N), stream>>>(N,
                                             total_keys,
                                             gpu_values,
                                             total_values_gpu,
                                             slot_lens,
                                             pull_value_size,
                                             slot_dims,
                                             hidden_size,
                                             key2slot,
                                             gpu_restore_idx,
                                             gpu_accessor_.common_pull_value);
  }
  // Always synchronize, so callers keep the same ordering guarantee whether or
  // not a kernel was launched.
  cudaStreamSynchronize(stream);
}
// Atomic-add variant of the de-duplicated push copy: zero-fills the first
// dedup_length records of `total_grad_values_gpu`, launches
// PushMergeCopyAtomic on the stream of the device context registered for
// `place`, then blocks until that stream has finished.
//
// One work item is scheduled per input float, i.e.
// total_length * hidden_size items.
template <typename GPUAccessor>
void AccessorWrapper<GPUAccessor>::CopyForPushDedupImpl(
    const paddle::platform::Place& place,
    const uint64_t* total_keys,
    float** grad_values,
    float* total_grad_values_gpu,
    const int* slots,
    const int64_t* slot_lens,
    const int hidden_size,
    const int64_t total_length,
    const int64_t dedup_length,
    const int batch_size,
    const int* slot_dims,
    const int* key2slot,
    const uint32_t* d_restore_idx,
    const size_t grad_value_size) {
  auto stream = dynamic_cast<paddle::platform::CUDADeviceContext*>(
                    paddle::platform::DeviceContextPool::Instance().Get(place))
                    ->stream();
  // The kernel only accumulates, so the destination must start from zero.
  cudaMemsetAsync(
      total_grad_values_gpu, 0, dedup_length * grad_value_size, stream);
  size_t N = total_length * hidden_size;
  // An empty input would ask for a grid of zero blocks, which the CUDA runtime
  // rejects as an invalid launch configuration; skip the launch in that case.
  if (N > 0) {
    PushMergeCopyAtomic<<<CUDA_BLOCK(N), stream>>>(
        N,
        total_keys,
        total_grad_values_gpu,
        grad_values,
        hidden_size,
        batch_size,
        slots,
        slot_dims,
        slot_lens,
        key2slot,
        d_restore_idx,
        grad_value_size,
        gpu_accessor_.common_push_value);
  }
  // Always synchronize, so callers keep the same ordering guarantee whether or
  // not a kernel was launched.
  cudaStreamSynchronize(stream);
}
// Sorted-index variant of the de-duplicated push copy: launches PushMergeCopy
// on the stream of the device context registered for `place`, then blocks
// until that stream has finished.
//
// One work item is scheduled per output float, i.e.
// dedup_length * hidden_size items. gpu_sort_idx, gpu_sort_offset and
// gpu_sort_lens are forwarded as the kernel's d_sort_idx, d_sort_offset and
// d_sort_cnt arguments. `total_length` is not used by this overload.
template <typename GPUAccessor>
void AccessorWrapper<GPUAccessor>::CopyForPushDedupImpl(
    const paddle::platform::Place& place,
    const uint64_t* total_keys,
    float** grad_values,
    float* total_grad_values_gpu,
    const int* slots,
    const int64_t* slot_lens,
    const int hidden_size,
    const int64_t total_length,
    const int64_t dedup_length,
    const int batch_size,
    const int* slot_dims,
    const int* key2slot,
    const uint32_t* gpu_sort_idx,
    const uint32_t* gpu_sort_offset,
    const uint32_t* gpu_sort_lens,
    const size_t grad_value_size) {
  auto stream = dynamic_cast<paddle::platform::CUDADeviceContext*>(
                    paddle::platform::DeviceContextPool::Instance().Get(place))
                    ->stream();
  // merge all grad to one
  size_t N = dedup_length * hidden_size;
  // An empty input would ask for a grid of zero blocks, which the CUDA runtime
  // rejects as an invalid launch configuration; skip the launch in that case.
  if (N > 0) {
    PushMergeCopy<<<CUDA_BLOCK(N), stream>>>(N,
                                             total_keys,
                                             total_grad_values_gpu,
                                             grad_values,
                                             hidden_size,
                                             batch_size,
                                             slots,
                                             slot_dims,
                                             slot_lens,
                                             key2slot,
                                             gpu_sort_idx,
                                             gpu_sort_offset,
                                             gpu_sort_lens,
                                             grad_value_size,
                                             gpu_accessor_.common_push_value);
  }
  // Always synchronize, so callers keep the same ordering guarantee whether or
  // not a kernel was launched.
  cudaStreamSynchronize(stream);
}
#ifdef PADDLE_WITH_PSCORE #ifdef PADDLE_WITH_PSCORE
template class AccessorWrapper<CommonFeatureValueAccessor>; template class AccessorWrapper<CommonFeatureValueAccessor>;
#endif #endif
......
...@@ -36,27 +36,10 @@ typedef uint64_t FeatureKey; ...@@ -36,27 +36,10 @@ typedef uint64_t FeatureKey;
#define TYPEALIGN(ALIGNVAL, LEN) \ #define TYPEALIGN(ALIGNVAL, LEN) \
(((uint64_t)(LEN) + ((ALIGNVAL)-1)) & ~((uint64_t)((ALIGNVAL)-1))) (((uint64_t)(LEN) + ((ALIGNVAL)-1)) & ~((uint64_t)((ALIGNVAL)-1)))
// Abstract base for feature-value accessors. It keeps a string -> float
// configuration map and asks the concrete accessor to derive its state from
// that map through Initialize().
class FeatureValueAccessor {
 public:
  __host__ __device__ FeatureValueAccessor() {}

  // The class is polymorphic (Configure/Initialize are virtual), so the
  // destructor is virtual as well: destroying a derived accessor through a
  // FeatureValueAccessor pointer must run the derived destructor.
  __host__ __device__ virtual ~FeatureValueAccessor() {}

  // Stores a copy of `config` in _config and then calls Initialize().
  // Always returns 0; the value returned by Initialize() is not propagated.
  __host__ __device__ virtual int Configure(
      std::unordered_map<std::string, float> config) {
    _config = config;
    Initialize();
    return 0;
  }

  // Implemented by concrete accessors; invoked by Configure() after _config
  // has been assigned.
  __host__ __device__ virtual int Initialize() = 0;

 protected:
  // Configuration most recently passed to Configure().
  std::unordered_map<std::string, float> _config;
};
// adagrad: embed_sgd_dim=1, embedx_sgd_dim=1,embedx_dim=n // adagrad: embed_sgd_dim=1, embedx_sgd_dim=1,embedx_dim=n
// adam std: embed_sgd_dim=4, embedx_sgd_dim=n*2+2,embedx_dim=n // adam std: embed_sgd_dim=4, embedx_sgd_dim=n*2+2,embedx_dim=n
// adam shared: embed_sgd_dim=4, embedx_sgd_dim=4,embedx_dim=n // adam shared: embed_sgd_dim=4, embedx_sgd_dim=4,embedx_dim=n
class CommonFeatureValueAccessor : public FeatureValueAccessor { class CommonFeatureValueAccessor {
public: public:
struct CommonFeatureValue { struct CommonFeatureValue {
/* /*
...@@ -175,6 +158,30 @@ class CommonFeatureValueAccessor : public FeatureValueAccessor { ...@@ -175,6 +158,30 @@ class CommonFeatureValueAccessor : public FeatureValueAccessor {
int optimizer_type_; int optimizer_type_;
}; };
// Index helper for a pull value stored as a flat float array:
//   [0] show
//   [1] click
//   [2] embed_w
//   [3] mf_size
//   [4 ...] embedx_w (embedx_dim floats)
struct CommonPullValue {
  // Number of floats in a pull value: four header floats plus embedx_dim.
  __host__ __device__ static int Dim(int embedx_dim) {
    return 4 + embedx_dim;
  }

  // Every slot is one float wide; `dim` is not consulted.
  __host__ __device__ int DimSize(size_t dim) { return sizeof(float); }

  // Byte size of a pull value, passed through TYPEALIGN(8, ...).
  __host__ __device__ int Size(int embedx_dim) {
    const size_t raw_bytes = Dim(embedx_dim) * sizeof(float);
    return TYPEALIGN(8, raw_bytes);
  }

  __host__ __device__ int ShowIndex() { return 0; }
  __host__ __device__ int ClickIndex() { return 1; }
  __host__ __device__ int EmbedWIndex() { return 2; }
  // actual mf size (ex. 0)
  __host__ __device__ int MfSizeIndex() { return 3; }
  __host__ __device__ int EmbedxWIndex() { return 4; }
};
struct CommonPushValue { struct CommonPushValue {
/* /*
float slot; float slot;
...@@ -229,43 +236,10 @@ class CommonFeatureValueAccessor : public FeatureValueAccessor { ...@@ -229,43 +236,10 @@ class CommonFeatureValueAccessor : public FeatureValueAccessor {
} }
}; };
// Index helper for a pull value stored as a flat float array:
//   [0] show
//   [1] click
//   [2] embed_w
//   [3 ...] embedx_w (embedx_dim floats)
struct CommonPullValue {
  // Number of floats in a pull value: three header floats plus embedx_dim.
  __host__ __device__ static int Dim(int embedx_dim) {
    return 3 + embedx_dim;
  }

  // Every slot is one float wide; `dim` is not consulted.
  __host__ __device__ int DimSize(size_t dim) { return sizeof(float); }

  // Byte size of a pull value, passed through TYPEALIGN(8, ...).
  __host__ __device__ int Size(int embedx_dim) {
    const size_t raw_bytes = Dim(embedx_dim) * sizeof(float);
    return TYPEALIGN(8, raw_bytes);
  }

  __host__ __device__ int ShowIndex() { return 0; }
  __host__ __device__ int ClickIndex() { return 1; }
  __host__ __device__ int EmbedWIndex() { return 2; }
  __host__ __device__ int EmbedxWIndex() { return 3; }

  // Field accessors over a raw pull-value buffer `val`.
  __host__ __device__ float& Show(float* val) { return val[ShowIndex()]; }
  __host__ __device__ float& Click(float* val) { return val[ClickIndex()]; }
  __host__ __device__ float& EmbedW(float* val) { return val[EmbedWIndex()]; }
  // Pointer to the first embedx_w float inside `val`.
  __host__ __device__ float* EmbedxW(float* val) {
    return &val[EmbedxWIndex()];
  }
};
__host__ __device__ CommonFeatureValueAccessor() {} __host__ __device__ CommonFeatureValueAccessor() {}
__host__ __device__ ~CommonFeatureValueAccessor() {} __host__ __device__ ~CommonFeatureValueAccessor() {}
__host__ __device__ virtual int Initialize() { __host__ int Initialize() {
int optimizer_type = (_config.find("optimizer_type") == _config.end()) int optimizer_type = (_config.find("optimizer_type") == _config.end())
? 1 ? 1
: int(_config["optimizer_type"]); : int(_config["optimizer_type"]);
...@@ -288,6 +262,12 @@ class CommonFeatureValueAccessor : public FeatureValueAccessor { ...@@ -288,6 +262,12 @@ class CommonFeatureValueAccessor : public FeatureValueAccessor {
return 0; return 0;
} }
// Stores a copy of `config` in _config and then runs Initialize().
// `config` is only read, so it is taken by const reference; callers that pass
// a non-const map keep working, and const maps or temporaries are accepted
// too. Always returns 0; the value returned by Initialize() is not
// propagated.
__host__ int Configure(const std::unordered_map<std::string, float>& config) {
  _config = config;
  Initialize();
  return 0;
}
// // build阶段从cpu_val赋值给gpu_val // // build阶段从cpu_val赋值给gpu_val
__host__ void BuildFill( __host__ void BuildFill(
float* gpu_val, float* gpu_val,
...@@ -388,7 +368,7 @@ class CommonFeatureValueAccessor : public FeatureValueAccessor { ...@@ -388,7 +368,7 @@ class CommonFeatureValueAccessor : public FeatureValueAccessor {
#endif #endif
} }
// dy_mf_fill_dvals_kernel, dy_mf_search_kernel 阶段 gpukernel // dy_mf_fill_dvals_kernel 阶段 gpukernel
// 中从src_val赋值给dest_val // 中从src_val赋值给dest_val
__host__ __device__ void FeatureValueFill(float* dest_val, __host__ __device__ void FeatureValueFill(float* dest_val,
float* src_val, float* src_val,
...@@ -422,6 +402,32 @@ class CommonFeatureValueAccessor : public FeatureValueAccessor { ...@@ -422,6 +402,32 @@ class CommonFeatureValueAccessor : public FeatureValueAccessor {
} }
} }
// Used by the GPU kernels of the dy_mf_fill_dvals_kernel and
// dy_mf_search_kernel stages: copies src_val (feature-value layout) into
// dest_val (pull-value layout).
// show, click and embed_w are always copied. When the stored mf size is 0 the
// pull value's mf-size slot is set to 0 and no embedx weights are copied;
// otherwise the slot receives the stored mf dim and that many embedx weights
// are copied.
__host__ __device__ void PullValueFill(float* dest_val, float* src_val) {
  dest_val[common_pull_value.ShowIndex()] =
      src_val[common_feature_value.ShowIndex()];
  dest_val[common_pull_value.ClickIndex()] =
      src_val[common_feature_value.ClickIndex()];
  dest_val[common_pull_value.EmbedWIndex()] =
      src_val[common_feature_value.EmbedWIndex()];

  const int stored_mf_size = int(src_val[common_feature_value.MfSizeIndex()]);
  if (stored_mf_size != 0) {
    // set pull value real dim size
    const int real_dim = int(src_val[common_feature_value.MfDimIndex()]);
    dest_val[common_pull_value.MfSizeIndex()] = real_dim;

    float* pull_embedx = dest_val + common_pull_value.EmbedxWIndex();
    const float* value_embedx = src_val + common_feature_value.EmbedxWIndex();
    for (int d = 0; d < real_dim; ++d) {
      pull_embedx[d] = value_embedx[d];
    }
  } else {
    dest_val[common_pull_value.MfSizeIndex()] = 0;
  }
}
// dy_mf_fill_shard_grads_kernel,update_one 阶段 gpukernel // dy_mf_fill_shard_grads_kernel,update_one 阶段 gpukernel
// 中从src_val赋值给dest_val // 中从src_val赋值给dest_val
__host__ __device__ void PushValueFill(float* dest_val, __host__ __device__ void PushValueFill(float* dest_val,
...@@ -508,8 +514,9 @@ class CommonFeatureValueAccessor : public FeatureValueAccessor { ...@@ -508,8 +514,9 @@ class CommonFeatureValueAccessor : public FeatureValueAccessor {
} }
} else { } else {
for (int j = 0; j < mf_dim; j++) { for (int j = 0; j < mf_dim; j++) {
*(dest_val + common_pull_value.EmbedxWIndex() + j) = // common_pull_value EmbedxWIndex 之前还有 MfSizeIndex,
src_val[common_feature_value.EmbedxWOffsetIndex(src_val) + j]; // 所以这里没有直接使用 common_pull_value.EmbedxWIndex()
*(dest_val + 3 + j) = src_val[common_pull_value.EmbedxWIndex() + j];
} }
} }
} }
...@@ -554,6 +561,7 @@ class CommonFeatureValueAccessor : public FeatureValueAccessor { ...@@ -554,6 +561,7 @@ class CommonFeatureValueAccessor : public FeatureValueAccessor {
} }
public: public:
std::unordered_map<std::string, float> _config;
CommonFeatureValue common_feature_value; CommonFeatureValue common_feature_value;
CommonPushValue common_push_value; CommonPushValue common_push_value;
CommonPullValue common_pull_value; CommonPullValue common_pull_value;
...@@ -638,6 +646,8 @@ class VirtualAccessor { ...@@ -638,6 +646,8 @@ class VirtualAccessor {
virtual size_t GetPushValueSize(int& mf_dim) = 0; virtual size_t GetPushValueSize(int& mf_dim) = 0;
// Returns the size of one pull value for the given mf_dim.
// AccessorWrapper answers with gpu_accessor_.common_pull_value.Size(mf_dim).
virtual size_t GetPullValueSize(int& mf_dim) = 0;
virtual void BuildFill(void* gpu_val, virtual void BuildFill(void* gpu_val,
void* cpu_val, void* cpu_val,
paddle::distributed::ValueAccessor* cpu_table_accessor, paddle::distributed::ValueAccessor* cpu_table_accessor,
...@@ -657,6 +667,18 @@ class VirtualAccessor { ...@@ -657,6 +667,18 @@ class VirtualAccessor {
const int64_t total_length, const int64_t total_length,
int* gpu_dim, int* gpu_dim,
int feature_value_size) = 0; int feature_value_size) = 0;
// Dedup variant of CopyForPull.
// AccessorWrapper implements it by forwarding every argument unchanged to
// CopyForPullDedupImpl.
// NOTE(review): gpu_restore_idx presumably maps each original key position to
// its row in the deduplicated total_values_gpu, and pull_value_size is
// presumably the byte size of one pull value - confirm against the
// PullDedupCopy kernel.
virtual void CopyForPull(const paddle::platform::Place& place,
const uint64_t* total_keys,
float** gpu_values,
const float* total_values_gpu,
const int64_t* slot_lens,
const int* key2slot,
const int hidden_size,
const int64_t total_length,
const int* slot_dims,
const uint32_t* gpu_restore_idx,
int pull_value_size) = 0;
virtual void CopyForPush(const paddle::platform::Place& place, virtual void CopyForPush(const paddle::platform::Place& place,
const std::vector<const float*>& grad_values, const std::vector<const float*>& grad_values,
...@@ -668,6 +690,39 @@ class VirtualAccessor { ...@@ -668,6 +690,39 @@ class VirtualAccessor {
std::vector<int>& slot_vector, std::vector<int>& slot_vector,
std::vector<int>& slot_mf_dim_vector) = 0; std::vector<int>& slot_mf_dim_vector) = 0;
// Dedup variant of CopyForPush driven by a restore index.
// AccessorWrapper implements it by forwarding every argument unchanged to the
// CopyForPushDedupImpl overload that takes d_restore_idx.
// NOTE(review): d_restore_idx presumably maps each original key position to
// its deduplicated row in total_grad_values_gpu - confirm against the
// PushMergeCopyAtomic kernel.
virtual void CopyForPush(const paddle::platform::Place& place,
const uint64_t* total_keys,
float** grad_values,
float* total_grad_values_gpu,
const int* slots,
const int64_t* slot_lens,
const int hidden_size,
const int64_t total_length,
const int64_t dedup_length,
const int batch_size,
const int* slot_dims,
const int* key2slot,
const uint32_t* d_restore_idx,
const size_t grad_value_size) = 0;
// Dedup variant of CopyForPush driven by sorted-index arrays.
// AccessorWrapper implements it by forwarding every argument unchanged to the
// CopyForPushDedupImpl overload that takes gpu_sort_idx / gpu_sort_offset /
// gpu_sort_lens.
// NOTE(review): the three gpu_sort_* arrays presumably describe, for each
// deduplicated key, which original positions contribute to it - confirm
// against the PushMergeCopy kernel.
virtual void CopyForPush(const paddle::platform::Place& place,
const uint64_t* total_keys,
float** grad_values,
float* total_grad_values_gpu,
const int* slots,
const int64_t* slot_lens,
const int hidden_size,
const int64_t total_length,
const int64_t dedup_length,
const int batch_size,
const int* slot_dims,
const int* key2slot,
const uint32_t* gpu_sort_idx,
const uint32_t* gpu_sort_offset,
const uint32_t* gpu_sort_lens,
const size_t grad_value_size) = 0;
virtual std::string ParseToString(const float* v, int param_size) = 0; virtual std::string ParseToString(const float* v, int param_size) = 0;
}; };
...@@ -691,6 +746,12 @@ class AccessorWrapper : public VirtualAccessor { ...@@ -691,6 +746,12 @@ class AccessorWrapper : public VirtualAccessor {
return gpu_accessor_.common_push_value.Size(mf_dim); return gpu_accessor_.common_push_value.Size(mf_dim);
} }
// Returns the size of one pull value for mf_dim, as reported by the wrapped
// accessor's common_pull_value.Size().
virtual size_t GetPullValueSize(int& mf_dim) {
return gpu_accessor_.common_pull_value.Size(mf_dim);
}
// Returns a non-owning pointer to the wrapped accessor member; the pointer is
// valid only while this wrapper object is alive.
GPUAccessor* AccessorPtr() { return &gpu_accessor_; }
virtual void BuildFill(void* gpu_val, virtual void BuildFill(void* gpu_val,
void* cpu_val, void* cpu_val,
paddle::distributed::ValueAccessor* cpu_table_accessor, paddle::distributed::ValueAccessor* cpu_table_accessor,
...@@ -727,6 +788,30 @@ class AccessorWrapper : public VirtualAccessor { ...@@ -727,6 +788,30 @@ class AccessorWrapper : public VirtualAccessor {
feature_value_size); feature_value_size);
} }
// Dedup pull: thin forwarding wrapper that passes every argument, in the same
// order, to CopyForPullDedupImpl.
virtual void CopyForPull(const paddle::platform::Place& place,
const uint64_t* total_keys,
float** gpu_values,
const float* total_values_gpu,
const int64_t* slot_lens,
const int* key2slot,
const int hidden_size,
const int64_t total_length,
const int* slot_dims,
const uint32_t* gpu_restore_idx,
int pull_value_size) {
CopyForPullDedupImpl(place,
total_keys,
gpu_values,
total_values_gpu,
slot_lens,
key2slot,
hidden_size,
total_length,
slot_dims,
gpu_restore_idx,
pull_value_size);
}
virtual void CopyForPush(const paddle::platform::Place& place, virtual void CopyForPush(const paddle::platform::Place& place,
const std::vector<const float*>& grad_values, const std::vector<const float*>& grad_values,
float* total_grad_values_gpu, float* total_grad_values_gpu,
...@@ -747,6 +832,70 @@ class AccessorWrapper : public VirtualAccessor { ...@@ -747,6 +832,70 @@ class AccessorWrapper : public VirtualAccessor {
slot_mf_dim_vector); slot_mf_dim_vector);
} }
// Dedup push (restore-index variant): thin forwarding wrapper that passes
// every argument, in the same order, to the CopyForPushDedupImpl overload
// taking d_restore_idx.
virtual void CopyForPush(const paddle::platform::Place& place,
const uint64_t* total_keys,
float** grad_values,
float* total_grad_values_gpu,
const int* slots,
const int64_t* slot_lens,
const int hidden_size,
const int64_t total_length,
const int64_t dedup_length,
const int batch_size,
const int* slot_dims,
const int* key2slot,
const uint32_t* d_restore_idx,
const size_t grad_value_size) {
CopyForPushDedupImpl(place,
total_keys,
grad_values,
total_grad_values_gpu,
slots,
slot_lens,
hidden_size,
total_length,
dedup_length,
batch_size,
slot_dims,
key2slot,
d_restore_idx,
grad_value_size);
}
// Dedup push (sorted-index variant): thin forwarding wrapper that passes
// every argument, in the same order, to the CopyForPushDedupImpl overload
// taking gpu_sort_idx / gpu_sort_offset / gpu_sort_lens.
virtual void CopyForPush(const paddle::platform::Place& place,
const uint64_t* total_keys,
float** grad_values,
float* total_grad_values_gpu,
const int* slots,
const int64_t* slot_lens,
const int hidden_size,
const int64_t total_length,
const int64_t dedup_length,
const int batch_size,
const int* slot_dims,
const int* key2slot,
const uint32_t* gpu_sort_idx,
const uint32_t* gpu_sort_offset,
const uint32_t* gpu_sort_lens,
const size_t grad_value_size) {
CopyForPushDedupImpl(place,
total_keys,
grad_values,
total_grad_values_gpu,
slots,
slot_lens,
hidden_size,
total_length,
dedup_length,
batch_size,
slot_dims,
key2slot,
gpu_sort_idx,
gpu_sort_offset,
gpu_sort_lens,
grad_value_size);
}
void CopyForPullImpl(const paddle::platform::Place& place, void CopyForPullImpl(const paddle::platform::Place& place,
uint64_t** gpu_keys, uint64_t** gpu_keys,
const std::vector<float*>& values, const std::vector<float*>& values,
...@@ -768,6 +917,49 @@ class AccessorWrapper : public VirtualAccessor { ...@@ -768,6 +917,49 @@ class AccessorWrapper : public VirtualAccessor {
std::vector<int>& slot_vector, std::vector<int>& slot_vector,
std::vector<int>& slot_mf_dim_vector); std::vector<int>& slot_mf_dim_vector);
// Implementation behind the dedup CopyForPull override; declared here and
// defined out of line. The visible tail of the definition launches the
// PullDedupCopy kernel over total_length * hidden_size elements on the CUDA
// stream of `place` and then synchronizes that stream.
void CopyForPullDedupImpl(const paddle::platform::Place& place,
const uint64_t* total_keys,
float** gpu_values,
const float* total_values_gpu,
const int64_t* slot_lens,
const int* key2slot,
const int hidden_size,
const int64_t total_length,
const int* slot_dims,
const uint32_t* gpu_restore_idx,
int pull_value_size);
// Implementation behind the restore-index dedup CopyForPush override;
// declared here and defined out of line. The definition zeroes
// dedup_length * grad_value_size bytes of total_grad_values_gpu, launches
// PushMergeCopyAtomic over total_length * hidden_size elements on the CUDA
// stream of `place`, and then synchronizes that stream.
void CopyForPushDedupImpl(const paddle::platform::Place& place,
const uint64_t* total_keys,
float** grad_values,
float* total_grad_values_gpu,
const int* slots,
const int64_t* slot_lens,
const int hidden_size,
const int64_t total_length,
const int64_t dedup_length,
const int batch_size,
const int* slot_dims,
const int* key2slot,
const uint32_t* d_restore_idx,
const size_t grad_value_size);
// Implementation behind the sorted-index dedup CopyForPush override; declared
// here and defined out of line. The definition launches PushMergeCopy over
// dedup_length * hidden_size elements on the CUDA stream of `place` and then
// synchronizes that stream.
void CopyForPushDedupImpl(const paddle::platform::Place& place,
const uint64_t* total_keys,
float** grad_values,
float* total_grad_values_gpu,
const int* slots,
const int64_t* slot_lens,
const int hidden_size,
const int64_t total_length,
const int64_t dedup_length,
const int batch_size,
const int* slot_dims,
const int* key2slot,
const uint32_t* gpu_sort_idx,
const uint32_t* gpu_sort_offset,
const uint32_t* gpu_sort_lens,
const size_t grad_value_size);
virtual std::string ParseToString(const float* v, int param_size) { virtual std::string ParseToString(const float* v, int param_size) {
return gpu_accessor_.ParseToString(v, param_size); return gpu_accessor_.ParseToString(v, param_size);
} }
...@@ -775,10 +967,10 @@ class AccessorWrapper : public VirtualAccessor { ...@@ -775,10 +967,10 @@ class AccessorWrapper : public VirtualAccessor {
GPUAccessor gpu_accessor_; GPUAccessor gpu_accessor_;
}; };
class GlobalAccessorTransfor { class GlobalAccessorFactory {
public: public:
static GlobalAccessorTransfor& GetInstance() { static GlobalAccessorFactory& GetInstance() {
static GlobalAccessorTransfor ins; static GlobalAccessorFactory ins;
return ins; return ins;
} }
void Init(std::string accessor_type) { void Init(std::string accessor_type) {
...@@ -788,7 +980,7 @@ class GlobalAccessorTransfor { ...@@ -788,7 +980,7 @@ class GlobalAccessorTransfor {
if (accessor_type == "CtrDymfAccessor") { if (accessor_type == "CtrDymfAccessor") {
accessor_wrapper_ptr_ = new AccessorWrapper<CommonFeatureValueAccessor>(); accessor_wrapper_ptr_ = new AccessorWrapper<CommonFeatureValueAccessor>();
} else { } else {
VLOG(0) << "GlobalAccessorTransfor Init not support accessor_type:" VLOG(0) << "GlobalAccessorFactory Init not support accessor_type:"
<< accessor_type; << accessor_type;
accessor_wrapper_ptr_ = new AccessorWrapper<CommonFeatureValueAccessor>(); accessor_wrapper_ptr_ = new AccessorWrapper<CommonFeatureValueAccessor>();
} }
......
...@@ -21,56 +21,75 @@ ...@@ -21,56 +21,75 @@
#include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/memory/memory.h" #include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/cuda_device_guard.h"
#include "paddle/phi/core/enforce.h"
DECLARE_bool(gpugraph_load_node_list_into_hbm);
namespace paddle { namespace paddle {
namespace framework { namespace framework {
struct GpuPsGraphNode { struct GpuPsNodeInfo {
int64_t node_id; uint32_t neighbor_size, neighbor_offset;
int64_t neighbor_size, neighbor_offset; GpuPsNodeInfo() : neighbor_size(0), neighbor_offset(0) {}
// this node's neighbor is stored on [neighbor_offset,neighbor_offset + // this node's neighbor is stored on [neighbor_offset,neighbor_offset +
// neighbor_size) of int64_t *neighbor_list; // neighbor_size) of int64_t *neighbor_list;
}; };
struct GpuPsCommGraph { struct GpuPsCommGraph {
int64_t *neighbor_list; uint64_t *node_list;
GpuPsGraphNode *node_list; // when FLAGS_gpugraph_load_node_list_into_hbm is ture locate on both side
int64_t neighbor_size, node_size; // else only locate on host side
// the size of neighbor array and graph_node_list array int64_t node_size; // the size of node_list
GpuPsNodeInfo *node_info_list; // only locate on host side
uint64_t *neighbor_list; // locate on both side
int64_t neighbor_size; // the size of neighbor_list
GpuPsCommGraph() GpuPsCommGraph()
: neighbor_list(NULL), node_list(NULL), neighbor_size(0), node_size(0) {} : node_list(nullptr),
GpuPsCommGraph(int64_t *neighbor_list_, node_size(0),
GpuPsGraphNode *node_list_, node_info_list(nullptr),
int64_t neighbor_size_, neighbor_list(nullptr),
int64_t node_size_) neighbor_size(0) {}
: neighbor_list(neighbor_list_), GpuPsCommGraph(uint64_t *node_list_,
node_list(node_list_), int64_t node_size_,
neighbor_size(neighbor_size_), GpuPsNodeInfo *node_info_list_,
node_size(node_size_) {} uint64_t *neighbor_list_,
void init_on_cpu(int64_t neighbor_size, int64_t node_size) { int64_t neighbor_size_)
this->neighbor_size = neighbor_size; : node_list(node_list_),
this->node_size = node_size; node_size(node_size_),
this->neighbor_list = new int64_t[neighbor_size]; node_info_list(node_info_list_),
this->node_list = new paddle::framework::GpuPsGraphNode[node_size]; neighbor_list(neighbor_list_),
neighbor_size(neighbor_size_) {}
void init_on_cpu(int64_t neighbor_size_, int64_t node_size_) {
if (node_size_ > 0) {
this->node_size = node_size_;
this->node_list = new uint64_t[node_size_];
this->node_info_list = new paddle::framework::GpuPsNodeInfo[node_size_];
}
if (neighbor_size_) {
this->neighbor_size = neighbor_size_;
this->neighbor_list = new uint64_t[neighbor_size_];
}
} }
void release_on_cpu() { void release_on_cpu() {
delete[] neighbor_list; #define DEL_PTR_ARRAY(p) \
delete[] node_list; if (p != nullptr) { \
delete[] p; \
p = nullptr; \
}
DEL_PTR_ARRAY(node_list);
DEL_PTR_ARRAY(neighbor_list);
DEL_PTR_ARRAY(node_info_list);
node_size = 0;
neighbor_size = 0;
} }
void display_on_cpu() { void display_on_cpu() const {
VLOG(0) << "neighbor_size = " << neighbor_size; VLOG(0) << "neighbor_size = " << neighbor_size;
VLOG(0) << "node_size = " << node_size; VLOG(0) << "node_size = " << node_size;
for (size_t i = 0; i < neighbor_size; i++) { for (int64_t i = 0; i < neighbor_size; i++) {
VLOG(0) << "neighbor " << i << " " << neighbor_list[i]; VLOG(0) << "neighbor " << i << " " << neighbor_list[i];
} }
for (size_t i = 0; i < node_size; i++) { for (int64_t i = 0; i < node_size; i++) {
VLOG(0) << "node i " << node_list[i].node_id auto id = node_list[i];
<< " neighbor_size = " << node_list[i].neighbor_size; auto val = node_info_list[i];
std::string str; VLOG(0) << "node id " << id << "," << val.neighbor_offset << ":"
int offset = node_list[i].neighbor_offset; << val.neighbor_size;
for (size_t j = 0; j < node_list[i].neighbor_size; j++) {
if (j > 0) str += ",";
str += std::to_string(neighbor_list[j + offset]);
}
VLOG(0) << str;
} }
} }
}; };
...@@ -110,37 +129,33 @@ node 9:[14,14] ...@@ -110,37 +129,33 @@ node 9:[14,14]
node 17:[15,15] node 17:[15,15]
... ...
by the above information, by the above information,
we generate a node_list:GpuPsGraphNode *graph_node_list in GpuPsCommGraph we generate a node_list and node_info_list in GpuPsCommGraph,
of size 9, node_list: [0,5,1,2,7,3,8,9,17]
where node_list[i].id = u_id[i] node_info_list: [(2,0),(2,2),(1,4),(1,5),(3,6),(4,9),(1,13),(1,14),(1,15)]
then we have: Here, we design the data in this format to better
node_list[0]-> node_id:0, neighbor_size:2, neighbor_offset:0 adapt to gpu and avoid to convert again.
node_list[1]-> node_id:5, neighbor_size:2, neighbor_offset:2
node_list[2]-> node_id:1, neighbor_size:1, neighbor_offset:4
node_list[3]-> node_id:2, neighbor_size:1, neighbor_offset:5
node_list[4]-> node_id:7, neighbor_size:3, neighbor_offset:6
node_list[5]-> node_id:3, neighbor_size:4, neighbor_offset:9
node_list[6]-> node_id:8, neighbor_size:1, neighbor_offset:13
node_list[7]-> node_id:9, neighbor_size:1, neighbor_offset:14
node_list[8]-> node_id:17, neighbor_size:1, neighbor_offset:15
*/ */
struct NeighborSampleQuery { struct NeighborSampleQuery {
int gpu_id; int gpu_id;
int64_t *key; int table_idx;
int sample_size; uint64_t *src_nodes;
int len; int len;
void initialize(int gpu_id, int64_t key, int sample_size, int len) { int sample_size;
void initialize(
int gpu_id, int table_idx, uint64_t src_nodes, int sample_size, int len) {
this->table_idx = table_idx;
this->gpu_id = gpu_id; this->gpu_id = gpu_id;
this->key = (int64_t *)key; this->src_nodes = (uint64_t *)src_nodes;
this->sample_size = sample_size; this->sample_size = sample_size;
this->len = len; this->len = len;
} }
void display() { void display() {
int64_t *sample_keys = new int64_t[len]; uint64_t *sample_keys = new uint64_t[len];
VLOG(0) << "device_id " << gpu_id << " sample_size = " << sample_size; VLOG(0) << "device_id " << gpu_id << " sample_size = " << sample_size;
VLOG(0) << "there are " << len << " keys "; VLOG(0) << "there are " << len << " keys to sample for graph " << table_idx;
std::string key_str; std::string key_str;
cudaMemcpy(sample_keys, key, len * sizeof(int64_t), cudaMemcpyDeviceToHost); cudaMemcpy(
sample_keys, src_nodes, len * sizeof(uint64_t), cudaMemcpyDeviceToHost);
for (int i = 0; i < len; i++) { for (int i = 0; i < len; i++) {
if (key_str.size() > 0) key_str += ";"; if (key_str.size() > 0) key_str += ";";
...@@ -151,14 +166,14 @@ struct NeighborSampleQuery { ...@@ -151,14 +166,14 @@ struct NeighborSampleQuery {
} }
}; };
struct NeighborSampleResult { struct NeighborSampleResult {
int64_t *val; uint64_t *val;
int64_t *actual_val; uint64_t *actual_val;
int *actual_sample_size, sample_size, key_size; int *actual_sample_size, sample_size, key_size;
int total_sample_size; int total_sample_size;
std::shared_ptr<memory::Allocation> val_mem, actual_sample_size_mem; std::shared_ptr<memory::Allocation> val_mem, actual_sample_size_mem;
std::shared_ptr<memory::Allocation> actual_val_mem; std::shared_ptr<memory::Allocation> actual_val_mem;
int64_t *get_val() { return val; } uint64_t *get_val() { return val; }
int64_t get_actual_val() { return (int64_t)actual_val; } uint64_t get_actual_val() { return (uint64_t)actual_val; }
int *get_actual_sample_size() { return actual_sample_size; } int *get_actual_sample_size() { return actual_sample_size; }
int get_sample_size() { return sample_size; } int get_sample_size() { return sample_size; }
int get_key_size() { return key_size; } int get_key_size() { return key_size; }
...@@ -170,8 +185,8 @@ struct NeighborSampleResult { ...@@ -170,8 +185,8 @@ struct NeighborSampleResult {
platform::CUDADeviceGuard guard(dev_id); platform::CUDADeviceGuard guard(dev_id);
platform::CUDAPlace place = platform::CUDAPlace(dev_id); platform::CUDAPlace place = platform::CUDAPlace(dev_id);
val_mem = val_mem =
memory::AllocShared(place, _sample_size * _key_size * sizeof(int64_t)); memory::AllocShared(place, _sample_size * _key_size * sizeof(uint64_t));
val = (int64_t *)val_mem->ptr(); val = (uint64_t *)val_mem->ptr();
actual_sample_size_mem = actual_sample_size_mem =
memory::AllocShared(place, _key_size * sizeof(int)); memory::AllocShared(place, _key_size * sizeof(int));
actual_sample_size = (int *)actual_sample_size_mem->ptr(); actual_sample_size = (int *)actual_sample_size_mem->ptr();
...@@ -217,13 +232,15 @@ struct NeighborSampleResult { ...@@ -217,13 +232,15 @@ struct NeighborSampleResult {
delete[] ac_size; delete[] ac_size;
VLOG(0) << " ------------------"; VLOG(0) << " ------------------";
} }
std::vector<int64_t> get_sampled_graph(NeighborSampleQuery q) { std::vector<uint64_t> get_sampled_graph(NeighborSampleQuery q) {
std::vector<int64_t> graph; std::vector<uint64_t> graph;
int64_t *sample_keys = new int64_t[q.len]; int64_t *sample_keys = new int64_t[q.len];
std::string key_str; std::string key_str;
cudaMemcpy( cudaMemcpy(sample_keys,
sample_keys, q.key, q.len * sizeof(int64_t), cudaMemcpyDeviceToHost); q.src_nodes,
int64_t *res = new int64_t[sample_size * key_size]; q.len * sizeof(uint64_t),
cudaMemcpyDeviceToHost);
uint64_t *res = new uint64_t[sample_size * key_size];
cudaMemcpy(res, cudaMemcpy(res,
val, val,
sample_size * key_size * sizeof(int64_t), sample_size * key_size * sizeof(int64_t),
...@@ -263,25 +280,25 @@ struct NeighborSampleResult { ...@@ -263,25 +280,25 @@ struct NeighborSampleResult {
}; };
struct NodeQueryResult { struct NodeQueryResult {
int64_t *val; uint64_t *val;
int actual_sample_size; int actual_sample_size;
int64_t get_val() { return (int64_t)val; } uint64_t get_val() { return (uint64_t)val; }
int get_len() { return actual_sample_size; } int get_len() { return actual_sample_size; }
std::shared_ptr<memory::Allocation> val_mem; std::shared_ptr<memory::Allocation> val_mem;
void initialize(int query_size, int dev_id) { void initialize(int query_size, int dev_id) {
platform::CUDADeviceGuard guard(dev_id); platform::CUDADeviceGuard guard(dev_id);
platform::CUDAPlace place = platform::CUDAPlace(dev_id); platform::CUDAPlace place = platform::CUDAPlace(dev_id);
val_mem = memory::AllocShared(place, query_size * sizeof(int64_t)); val_mem = memory::AllocShared(place, query_size * sizeof(uint64_t));
val = (int64_t *)val_mem->ptr(); val = (uint64_t *)val_mem->ptr();
// cudaMalloc((void **)&val, query_size * sizeof(int64_t));
actual_sample_size = 0; actual_sample_size = 0;
} }
void display() { void display() {
VLOG(0) << "in node query result display ------------------"; VLOG(0) << "in node query result display ------------------";
int64_t *res = new int64_t[actual_sample_size]; uint64_t *res = new uint64_t[actual_sample_size];
cudaMemcpy( cudaMemcpy(res,
res, val, actual_sample_size * sizeof(int64_t), cudaMemcpyDeviceToHost); val,
actual_sample_size * sizeof(uint64_t),
cudaMemcpyDeviceToHost);
VLOG(0) << "actual_sample_size =" << actual_sample_size; VLOG(0) << "actual_sample_size =" << actual_sample_size;
std::string str; std::string str;
...@@ -298,7 +315,91 @@ struct NodeQueryResult { ...@@ -298,7 +315,91 @@ struct NodeQueryResult {
actual_sample_size = 0; actual_sample_size = 0;
}; };
~NodeQueryResult() {} ~NodeQueryResult() {}
}; // end of struct NodeQueryResult
struct GpuPsFeaInfo {
uint32_t feature_size, feature_offset;
// this node's features are stored in [feature_offset, feature_offset +
// feature_size) of uint64_t *feature_list;
}; };
} // namespace framework
}; // namespace paddle struct GpuPsCommGraphFea {
uint64_t *node_list; // only locate on host side, the list of node id
uint64_t *feature_list; // locate on both side
uint8_t *slot_id_list; // locate on both side
GpuPsFeaInfo
*fea_info_list; // only locate on host side, the list of fea_info
uint64_t feature_size, node_size;
// the size of feature array and graph_node_list array
// Creates an empty container: every array pointer is null and both sizes
// are zero.
GpuPsCommGraphFea()
    : node_list{nullptr},
      feature_list{nullptr},
      slot_id_list{nullptr},
      fea_info_list{nullptr},
      feature_size{0},
      node_size{0} {}
// Wraps caller-supplied arrays and sizes. The pointers are stored as given;
// nothing is copied or allocated here.
GpuPsCommGraphFea(uint64_t *node_list_,
                  uint64_t *feature_list_,
                  uint8_t *slot_id_list_,
                  GpuPsFeaInfo *fea_info_list_,
                  uint64_t feature_size_,
                  uint64_t node_size_)
    : node_list{node_list_},
      feature_list{feature_list_},
      slot_id_list{slot_id_list_},
      fea_info_list{fea_info_list_},
      feature_size{feature_size_},
      node_size{node_size_} {}
// Allocates the four host arrays with new[]: node_list and fea_info_list get
// node_size entries, feature_list and slot_id_list get feature_size entries.
// The two sizes are recorded in the members of the same name (the parameters
// shadow them, hence the explicit this->).
// slot_num is only validated: it must not exceed 255, and slot ids are kept
// in a uint8_t array.
// NOTE(review): pointers held before the call are overwritten without being
// freed - presumably callers only invoke this on an empty object; confirm.
void init_on_cpu(uint64_t feature_size,
uint64_t node_size,
uint32_t slot_num) {
PADDLE_ENFORCE_LE(
slot_num,
255,
platform::errors::InvalidArgument(
"The number of slot_num should not be greater than 255 "
", but the slot_num is %d ",
slot_num));
this->feature_size = feature_size;
this->node_size = node_size;
this->node_list = new uint64_t[node_size];
this->feature_list = new uint64_t[feature_size];
this->slot_id_list = new uint8_t[feature_size];
this->fea_info_list = new GpuPsFeaInfo[node_size];
}
void release_on_cpu() {
#define DEL_PTR_ARRAY(p) \
if (p != nullptr) { \
delete[] p; \
p = nullptr; \
}
DEL_PTR_ARRAY(node_list);
DEL_PTR_ARRAY(feature_list);
DEL_PTR_ARRAY(slot_id_list);
DEL_PTR_ARRAY(fea_info_list);
}
void display_on_cpu() const {
VLOG(1) << "feature_size = " << feature_size;
VLOG(1) << "node_size = " << node_size;
for (uint64_t i = 0; i < feature_size; i++) {
VLOG(1) << "feature_list[" << i << "] = " << feature_list[i];
}
for (uint64_t i = 0; i < node_size; i++) {
VLOG(1) << "node_id[" << node_list[i]
<< "] feature_size = " << fea_info_list[i].feature_size;
std::string str;
uint32_t offset = fea_info_list[i].feature_offset;
for (uint64_t j = 0; j < fea_info_list[i].feature_size; j++) {
if (j > 0) str += ",";
str += std::to_string(slot_id_list[j + offset]);
str += ":";
str += std::to_string(feature_list[j + offset]);
}
VLOG(1) << str;
}
}
}; // end of struct GpuPsCommGraphFea
} // end of namespace framework
} // end of namespace paddle
#endif #endif
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <stdio.h>
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace framework {
// Evaluates `cmd` once, stores its cudaError_t result, and CHECK-fails with
// the file, line and cudaGetErrorString() text when the result is not
// cudaSuccess. Wrapped in do { } while (0) so it behaves as one statement.
// NOTE(review): `cmd` is not parenthesized in the expansion and the local is
// named `e`; avoid passing expressions that mention a variable called `e`.
#define CUDA_CHECK(cmd) \
do { \
cudaError_t e = cmd; \
CHECK(e == cudaSuccess) << "Cuda failure " << __FILE__ << ":" << __LINE__ \
<< " " << cudaGetErrorString(e) << std::endl; \
} while (0)
class CudaDeviceRestorer {
public:
CudaDeviceRestorer() { cudaGetDevice(&dev_); }
~CudaDeviceRestorer() { cudaSetDevice(dev_); }
private:
int dev_;
};
// Logs (VLOG level 0) the available and total memory of device `gpu_id` in
// GiB together with the used share in percent, tagged with `desc`.
// The previously current device is restored on return by CudaDeviceRestorer.
// Raises through PADDLE_ENFORCE_EQ when cudaMemGetInfo fails.
inline void debug_gpu_memory_info(int gpu_id, const char* desc) {
  CudaDeviceRestorer r;

  size_t avail{0};
  size_t total{0};
  cudaSetDevice(gpu_id);
  auto err = cudaMemGetInfo(&avail, &total);
  PADDLE_ENFORCE_EQ(
      err,
      cudaSuccess,
      platform::errors::InvalidArgument("cudaMemGetInfo failed!"));

  // The label says "%", so scale the used fraction to a percentage; guard the
  // division so a zero total prints 0 instead of NaN.
  const double use_rate =
      total > 0 ? (total - avail) * 100.0 / double(total) : 0.0;
  VLOG(0) << "updatex gpu memory on device " << gpu_id << ", "
          << "avail=" << avail / 1024.0 / 1024.0 / 1024.0 << "g, "
          << "total=" << total / 1024.0 / 1024.0 / 1024.0 << "g, "
          << "use_rate=" << use_rate << "%, "
          << "desc=" << desc;
}
// Logs (VLOG level 0), for every CUDA device reported by cudaGetDeviceCount,
// the available and total memory in GiB together with the used share in
// percent, tagged with `desc`.
// The previously current device is restored on return by CudaDeviceRestorer.
// Raises through PADDLE_ENFORCE_EQ when cudaGetDeviceCount or cudaMemGetInfo
// fails.
inline void debug_gpu_memory_info(const char* desc) {
  CudaDeviceRestorer r;

  int device_num = 0;
  auto count_err = cudaGetDeviceCount(&device_num);
  PADDLE_ENFORCE_EQ(
      count_err,
      cudaSuccess,
      platform::errors::InvalidArgument("cudaGetDeviceCount failed!"));

  size_t avail{0};
  size_t total{0};
  for (int i = 0; i < device_num; ++i) {
    cudaSetDevice(i);
    // Distinct name: the original re-declared `err` here, shadowing the outer
    // one.
    auto mem_err = cudaMemGetInfo(&avail, &total);
    PADDLE_ENFORCE_EQ(
        mem_err,
        cudaSuccess,
        platform::errors::InvalidArgument("cudaMemGetInfo failed!"));

    // The label says "%", so scale the used fraction to a percentage; guard
    // the division so a zero total prints 0 instead of NaN.
    const double use_rate =
        total > 0 ? (total - avail) * 100.0 / double(total) : 0.0;
    VLOG(0) << "update gpu memory on device " << i << ", "
            << "avail=" << avail / 1024.0 / 1024.0 / 1024.0 << "g, "
            << "total=" << total / 1024.0 / 1024.0 / 1024.0 << "g, "
            << "use_rate=" << use_rate << "%, "
            << "desc=" << desc;
  }
}
}; // namespace framework
}; // namespace paddle
...@@ -23,23 +23,48 @@ ...@@ -23,23 +23,48 @@
#include "paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h" #include "paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#ifdef PADDLE_WITH_HETERPS #ifdef PADDLE_WITH_HETERPS
DECLARE_double(gpugraph_hbm_table_load_factor);
namespace paddle { namespace paddle {
namespace framework { namespace framework {
enum GraphTableType { EDGE_TABLE, FEATURE_TABLE };
class GpuPsGraphTable class GpuPsGraphTable
: public HeterComm<uint64_t, int64_t, int, CommonFeatureValueAccessor> { : public HeterComm<uint64_t, uint64_t, int, CommonFeatureValueAccessor> {
public: public:
GpuPsGraphTable(std::shared_ptr<HeterPsResource> resource, int topo_aware) int get_table_offset(int gpu_id, GraphTableType type, int idx) const {
: HeterComm<uint64_t, int64_t, int, CommonFeatureValueAccessor>( int type_id = type;
return gpu_id * (graph_table_num_ + feature_table_num_) +
type_id * graph_table_num_ + idx;
}
GpuPsGraphTable(std::shared_ptr<HeterPsResource> resource,
int topo_aware,
int graph_table_num)
: HeterComm<uint64_t, uint64_t, int, CommonFeatureValueAccessor>(
1, resource) { 1, resource) {
load_factor_ = 0.25; load_factor_ = FLAGS_gpugraph_hbm_table_load_factor;
VLOG(0) << "load_factor = " << load_factor_;
rw_lock.reset(new pthread_rwlock_t()); rw_lock.reset(new pthread_rwlock_t());
this->graph_table_num_ = graph_table_num;
this->feature_table_num_ = 1;
gpu_num = resource_->total_device(); gpu_num = resource_->total_device();
memset(global_device_map, -1, sizeof(global_device_map)); memset(global_device_map, -1, sizeof(global_device_map));
for (auto &table : tables_) {
delete table;
table = NULL;
}
int feature_table_num = 1;
tables_ = std::vector<Table *>(
gpu_num * (graph_table_num + feature_table_num), NULL);
for (int i = 0; i < gpu_num; i++) { for (int i = 0; i < gpu_num; i++) {
gpu_graph_list.push_back(GpuPsCommGraph());
global_device_map[resource_->dev_id(i)] = i; global_device_map[resource_->dev_id(i)] = i;
sample_status.push_back(NULL); for (int j = 0; j < graph_table_num; j++) {
tables_.push_back(NULL); gpu_graph_list_.push_back(GpuPsCommGraph());
}
for (int j = 0; j < feature_table_num; j++) {
gpu_graph_fea_list_.push_back(GpuPsCommGraphFea());
}
} }
cpu_table_status = -1; cpu_table_status = -1;
if (topo_aware) { if (topo_aware) {
...@@ -88,46 +113,56 @@ class GpuPsGraphTable ...@@ -88,46 +113,56 @@ class GpuPsGraphTable
} }
} }
} }
~GpuPsGraphTable() { ~GpuPsGraphTable() {}
// if (cpu_table_status != -1) { void build_graph_on_single_gpu(const GpuPsCommGraph &g, int gpu_id, int idx);
// end_graph_sampling(); void build_graph_fea_on_single_gpu(const GpuPsCommGraphFea &g, int gpu_id);
// } void clear_graph_info(int gpu_id, int index);
} void clear_graph_info(int index);
void build_graph_on_single_gpu(GpuPsCommGraph &g, int gpu_id); void clear_feature_info(int gpu_id, int index);
void clear_graph_info(int gpu_id); void clear_feature_info(int index);
void build_graph_from_cpu(std::vector<GpuPsCommGraph> &cpu_node_list); void build_graph_from_cpu(const std::vector<GpuPsCommGraph> &cpu_node_list,
int idx);
void build_graph_fea_from_cpu(
const std::vector<GpuPsCommGraphFea> &cpu_node_list, int idx);
NodeQueryResult graph_node_sample(int gpu_id, int sample_size); NodeQueryResult graph_node_sample(int gpu_id, int sample_size);
NeighborSampleResult graph_neighbor_sample_v3(NeighborSampleQuery q, NeighborSampleResult graph_neighbor_sample_v3(NeighborSampleQuery q,
bool cpu_switch); bool cpu_switch);
NeighborSampleResult graph_neighbor_sample(int gpu_id, NeighborSampleResult graph_neighbor_sample(int gpu_id,
int64_t *key, uint64_t *key,
int sample_size, int sample_size,
int len); int len);
NeighborSampleResult graph_neighbor_sample_v2(int gpu_id, NeighborSampleResult graph_neighbor_sample_v2(int gpu_id,
int64_t *key, int idx,
uint64_t *key,
int sample_size, int sample_size,
int len, int len,
bool cpu_query_switch); bool cpu_query_switch);
void init_sample_status();
void free_sample_status(); int get_feature_of_nodes(
NodeQueryResult query_node_list(int gpu_id, int start, int query_size); int gpu_id, uint64_t *d_walk, uint64_t *d_offset, int size, int slot_num);
void clear_graph_info();
NodeQueryResult query_node_list(int gpu_id,
int idx,
int start,
int query_size);
void display_sample_res(void *key, void *val, int len, int sample_len); void display_sample_res(void *key, void *val, int len, int sample_len);
void move_neighbor_sample_result_to_source_gpu(int gpu_id, void move_result_to_source_gpu(int gpu_id,
int gpu_num, int gpu_num,
int sample_size, int sample_size,
int *h_left, int *h_left,
int *h_right, int *h_right,
int64_t *src_sample_res, uint64_t *src_sample_res,
int *actual_sample_size); int *actual_sample_size);
int init_cpu_table(const paddle::distributed::GraphParameter &graph); int init_cpu_table(const paddle::distributed::GraphParameter &graph);
int gpu_num; int gpu_num;
std::vector<GpuPsCommGraph> gpu_graph_list; int graph_table_num_, feature_table_num_;
std::vector<GpuPsCommGraph> gpu_graph_list_;
std::vector<GpuPsCommGraphFea> gpu_graph_fea_list_;
int global_device_map[32]; int global_device_map[32];
std::vector<int *> sample_status;
const int parallel_sample_size = 1; const int parallel_sample_size = 1;
const int dim_y = 256; const int dim_y = 256;
std::shared_ptr<paddle::distributed::GraphTable> cpu_graph_table; std::shared_ptr<paddle::distributed::GraphTable> cpu_graph_table_;
std::shared_ptr<pthread_rwlock_t> rw_lock; std::shared_ptr<pthread_rwlock_t> rw_lock;
mutable std::mutex mutex_; mutable std::mutex mutex_;
std::condition_variable cv_; std::condition_variable cv_;
......
...@@ -19,6 +19,7 @@ ...@@ -19,6 +19,7 @@
#include <functional> #include <functional>
#pragma once #pragma once
#ifdef PADDLE_WITH_HETERPS #ifdef PADDLE_WITH_HETERPS
#include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_utils.h"
#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h" #include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -33,9 +34,9 @@ sample_result is to save the neighbor sampling result, its size is len * ...@@ -33,9 +34,9 @@ sample_result is to save the neighbor sampling result, its size is len *
sample_size; sample_size;
*/ */
__global__ void get_cpu_id_index(int64_t* key, __global__ void get_cpu_id_index(uint64_t* key,
int* actual_sample_size, int* actual_sample_size,
int64_t* cpu_key, uint64_t* cpu_key,
int* sum, int* sum,
int* index, int* index,
int len) { int len) {
...@@ -50,13 +51,13 @@ __global__ void get_cpu_id_index(int64_t* key, ...@@ -50,13 +51,13 @@ __global__ void get_cpu_id_index(int64_t* key,
} }
__global__ void get_actual_gpu_ac(int* gpu_ac, int number_on_cpu) { __global__ void get_actual_gpu_ac(int* gpu_ac, int number_on_cpu) {
CUDA_KERNEL_LOOP(i, number_on_cpu) { gpu_ac[i] /= sizeof(int64_t); } CUDA_KERNEL_LOOP(i, number_on_cpu) { gpu_ac[i] /= sizeof(uint64_t); }
} }
template <int WARP_SIZE, int BLOCK_WARPS, int TILE_SIZE> template <int WARP_SIZE, int BLOCK_WARPS, int TILE_SIZE>
__global__ void copy_buffer_ac_to_final_place(int64_t* gpu_buffer, __global__ void copy_buffer_ac_to_final_place(uint64_t* gpu_buffer,
int* gpu_ac, int* gpu_ac,
int64_t* val, uint64_t* val,
int* actual_sample_size, int* actual_sample_size,
int* index, int* index,
int* cumsum_gpu_ac, int* cumsum_gpu_ac,
...@@ -77,11 +78,48 @@ __global__ void copy_buffer_ac_to_final_place(int64_t* gpu_buffer, ...@@ -77,11 +78,48 @@ __global__ void copy_buffer_ac_to_final_place(int64_t* gpu_buffer,
} }
} }
// Expands the per-node (slot id, feature) pairs stored in `graph` into a
// dense row of `slot_num` values per node.
//
// graph:          supplies feature_list and slot_id_list, both indexed from
//                 fea_info_array[idx].feature_offset.
// fea_info_array: one entry per node; feature_size is the number of stored
//                 pairs for that node.
// actual_size:    output, actual_size[idx] is set to slot_num for every node.
// feature:        output, row idx occupies [idx * slot_num, (idx+1) * slot_num);
//                 slots with no stored pair are written as 0.
// slot_num:       width of each output row.
// n:              number of nodes to process.
//
// One node is handled per (blockIdx.x, threadIdx.y) pair; threadIdx.x is not
// consulted. The merge loop only advances when the next stored slot id equals
// the current slot k, so it relies on the stored slot ids being in ascending
// order.
__global__ void get_features_kernel(GpuPsCommGraphFea graph,
                                    GpuPsFeaInfo* fea_info_array,
                                    int* actual_size,
                                    uint64_t* feature,
                                    int slot_num,
                                    int n) {
  const int idx = blockIdx.x * blockDim.y + threadIdx.y;
  if (idx >= n) {
    return;
  }
  // Widen before multiplying: idx * slot_num evaluated in int can overflow
  // when many nodes are combined with a wide slot layout.
  uint64_t* row = feature + static_cast<int64_t>(idx) * slot_num;
  const auto stored = fea_info_array[idx].feature_size;
  if (stored == 0) {
    for (int k = 0; k < slot_num; ++k) {
      row[k] = 0;
    }
    actual_size[idx] = slot_num;
    return;
  }

  uint64_t* feature_start =
      &(graph.feature_list[fea_info_array[idx].feature_offset]);
  uint8_t* slot_id_start =
      &(graph.slot_id_list[fea_info_array[idx].feature_offset]);
  int m = 0;  // next stored pair not yet emitted
  for (int k = 0; k < slot_num; ++k) {
    if (m >= stored || k < slot_id_start[m]) {
      row[k] = 0;
    } else if (k == slot_id_start[m]) {
      row[k] = feature_start[m];
      ++m;
    } else {
      // k > slot_id_start[m]: the stored slot ids are not ascending. Still
      // give the slot a defined value, because when assertions are compiled
      // out the assert below does nothing and the slot would otherwise keep
      // whatever the output buffer held before.
      row[k] = 0;
      assert(0);
    }
  }
  actual_size[idx] = slot_num;
}
template <int WARP_SIZE, int BLOCK_WARPS, int TILE_SIZE> template <int WARP_SIZE, int BLOCK_WARPS, int TILE_SIZE>
__global__ void neighbor_sample_example_v2(GpuPsCommGraph graph, __global__ void neighbor_sample_kernel(GpuPsCommGraph graph,
int64_t* node_index, GpuPsNodeInfo* node_info_list,
int* actual_size, int* actual_size,
int64_t* res, uint64_t* res,
int sample_len, int sample_len,
int n, int n,
int default_value) { int default_value) {
...@@ -92,17 +130,16 @@ __global__ void neighbor_sample_example_v2(GpuPsCommGraph graph, ...@@ -92,17 +130,16 @@ __global__ void neighbor_sample_example_v2(GpuPsCommGraph graph,
const int last_idx = min(static_cast<int>(blockIdx.x + 1) * TILE_SIZE, n); const int last_idx = min(static_cast<int>(blockIdx.x + 1) * TILE_SIZE, n);
curandState rng; curandState rng;
curand_init(blockIdx.x, threadIdx.y * WARP_SIZE + threadIdx.x, 0, &rng); curand_init(blockIdx.x, threadIdx.y * WARP_SIZE + threadIdx.x, 0, &rng);
while (i < last_idx) { while (i < last_idx) {
if (node_index[i] == -1) { if (node_info_list[i].neighbor_size == 0) {
actual_size[i] = default_value; actual_size[i] = default_value;
i += BLOCK_WARPS; i += BLOCK_WARPS;
continue; continue;
} }
int neighbor_len = (int)graph.node_list[node_index[i]].neighbor_size; int neighbor_len = (int)node_info_list[i].neighbor_size;
int64_t data_offset = graph.node_list[node_index[i]].neighbor_offset; uint32_t data_offset = node_info_list[i].neighbor_offset;
int offset = i * sample_len; int offset = i * sample_len;
int64_t* data = graph.neighbor_list; uint64_t* data = graph.neighbor_list;
if (neighbor_len <= sample_len) { if (neighbor_len <= sample_len) {
for (int j = threadIdx.x; j < neighbor_len; j += WARP_SIZE) { for (int j = threadIdx.x; j < neighbor_len; j += WARP_SIZE) {
res[offset + j] = data[data_offset + j]; res[offset + j] = data[data_offset + j];
...@@ -131,89 +168,10 @@ __global__ void neighbor_sample_example_v2(GpuPsCommGraph graph, ...@@ -131,89 +168,10 @@ __global__ void neighbor_sample_example_v2(GpuPsCommGraph graph,
} }
} }
// Samples up to sample_len neighbors for each of n query nodes.
//
// graph:         supplies node_list (neighbor_offset / neighbor_size per node)
//                and the flat neighbor_list.
// node_index:    per query, the position of the node in graph.node_list, or
//                -1 when there is nothing to sample.
// actual_size:   output, number of neighbors written for each query.
// res:           output, query `id` owns res[id * sample_len ...].
// sample_len:    maximum number of neighbors per query.
// sample_status: scratch words addressed by neighbor position; bit `from` of
//                a word marks that position as already picked.
// n:             number of queries.
// from:          bit position used inside sample_status.
//
// One query is handled per (blockIdx.x, threadIdx.y) pair; the threads along
// threadIdx.x stride over the output positions of that query.
__global__ void neighbor_sample_example(GpuPsCommGraph graph,
int64_t* node_index,
int* actual_size,
int64_t* res,
int sample_len,
int* sample_status,
int n,
int from) {
int id = blockIdx.x * blockDim.y + threadIdx.y;
if (id < n) {
// -1 means "no node": report zero samples and stop.
if (node_index[id] == -1) {
actual_size[id] = 0;
return;
}
curandState rng;
curand_init(blockIdx.x, threadIdx.x, threadIdx.y, &rng);
int64_t index = threadIdx.x;
// NOTE(review): id * sample_len is evaluated in int before being widened.
int64_t offset = id * sample_len;
int64_t* data = graph.neighbor_list;
int64_t data_offset = graph.node_list[node_index[id]].neighbor_offset;
int64_t neighbor_len = graph.node_list[node_index[id]].neighbor_size;
// ac_len = min(sample_len, neighbor_len)
int ac_len;
if (sample_len > neighbor_len)
ac_len = neighbor_len;
else {
ac_len = sample_len;
}
// When at least 3/4 of the neighbors are requested, take one contiguous
// window of ac_len neighbors starting at a random position.
if (4 * ac_len >= 3 * neighbor_len) {
// The thread with index 0 draws the window start and publishes it
// through res[offset].
if (index == 0) {
res[offset] = curand(&rng) % (neighbor_len - ac_len + 1);
}
__syncwarp();
// NOTE(review): res[offset] is also the first output slot and is
// overwritten in the loop below; this appears to rely on every thread
// having read `start` before that write happens - confirm.
int start = res[offset];
while (index < ac_len) {
res[offset + index] = data[data_offset + start + index];
index += blockDim.x;
}
actual_size[id] = ac_len;
} else {
// Otherwise draw random neighbor positions and retry until each output
// slot has claimed a position whose `from` bit was not yet set.
while (index < ac_len) {
int num = curand(&rng) % neighbor_len;
int* addr = sample_status + data_offset + num;
int expected = *addr;
if (!(expected & (1 << from))) {
// Claim the position by setting the bit; only a CAS that saw the
// expected word counts as a successful claim.
int old = atomicCAS(addr, expected, expected | (1 << from));
if (old == expected) {
res[offset + index] = num;
index += blockDim.x;
}
}
}
__syncwarp();
// Second pass: clear the claim bit again and replace the stored neighbor
// position with the neighbor value itself.
index = threadIdx.x;
while (index < ac_len) {
int* addr = sample_status + data_offset + res[offset + index];
int expected, old = *addr;
// CAS loop: retry until the bit is cleared against an unchanged word.
do {
expected = old;
old = atomicCAS(addr, expected, expected & (~(1 << from)));
} while (old != expected);
res[offset + index] = data[data_offset + res[offset + index]];
index += blockDim.x;
}
actual_size[id] = ac_len;
}
}
// const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
// if (i < n) {
// auto node_index = index[i];
// actual_size[i] = graph.node_list[node_index].neighbor_size < sample_size
// ? graph.node_list[node_index].neighbor_size
// : sample_size;
// int offset = graph.node_list[node_index].neighbor_offset;
// for (int j = 0; j < actual_size[i]; j++) {
// sample_result[sample_size * i + j] = graph.neighbor_list[offset + j];
// }
// }
}
int GpuPsGraphTable::init_cpu_table( int GpuPsGraphTable::init_cpu_table(
const paddle::distributed::GraphParameter& graph) { const paddle::distributed::GraphParameter& graph) {
cpu_graph_table.reset(new paddle::distributed::GraphTable); cpu_graph_table_.reset(new paddle::distributed::GraphTable);
cpu_table_status = cpu_graph_table->Initialize(graph); cpu_table_status = cpu_graph_table_->Initialize(graph);
// if (cpu_table_status != 0) return cpu_table_status; // if (cpu_table_status != 0) return cpu_table_status;
// std::function<void(std::vector<GpuPsCommGraph>&)> callback = // std::function<void(std::vector<GpuPsCommGraph>&)> callback =
// [this](std::vector<GpuPsCommGraph>& res) { // [this](std::vector<GpuPsCommGraph>& res) {
...@@ -227,17 +185,6 @@ int GpuPsGraphTable::init_cpu_table( ...@@ -227,17 +185,6 @@ int GpuPsGraphTable::init_cpu_table(
return cpu_table_status; return cpu_table_status;
} }
// int GpuPsGraphTable::load(const std::string& path, const std::string& param)
// {
// int status = cpu_graph_table->load(path, param);
// if (status != 0) {
// return status;
// }
// std::unique_lock<std::mutex> lock(mutex_);
// cpu_graph_table->start_graph_sampling();
// cv_.wait(lock);
// return 0;
// }
/* /*
comment 1 comment 1
gpu i triggers a neighbor_sample task, gpu i triggers a neighbor_sample task,
...@@ -263,35 +210,36 @@ void GpuPsGraphTable::display_sample_res(void* key, ...@@ -263,35 +210,36 @@ void GpuPsGraphTable::display_sample_res(void* key,
void* val, void* val,
int len, int len,
int sample_len) { int sample_len) {
char key_buffer[len * sizeof(int64_t)]; char key_buffer[len * sizeof(uint64_t)];
char val_buffer[sample_len * sizeof(int64_t) * len + char val_buffer[sample_len * sizeof(int64_t) * len +
(len + len % 2) * sizeof(int) + len * sizeof(int64_t)]; (len + len % 2) * sizeof(int) + len * sizeof(uint64_t)];
cudaMemcpy(key_buffer, key, sizeof(int64_t) * len, cudaMemcpyDeviceToHost); cudaMemcpy(key_buffer, key, sizeof(uint64_t) * len, cudaMemcpyDeviceToHost);
cudaMemcpy(val_buffer, cudaMemcpy(val_buffer,
val, val,
sample_len * sizeof(int64_t) * len + sample_len * sizeof(int64_t) * len +
(len + len % 2) * sizeof(int) + len * sizeof(int64_t), (len + len % 2) * sizeof(int) + len * sizeof(uint64_t),
cudaMemcpyDeviceToHost); cudaMemcpyDeviceToHost);
int64_t* sample_val = (int64_t*)(val_buffer + (len + len % 2) * sizeof(int) + uint64_t* sample_val =
(uint64_t*)(val_buffer + (len + len % 2) * sizeof(int) +
len * sizeof(int64_t)); len * sizeof(int64_t));
for (int i = 0; i < len; i++) { for (int i = 0; i < len; i++) {
printf("key %lld\n", *(int64_t*)(key_buffer + i * sizeof(int64_t))); printf("key %llu\n", *(int64_t*)(key_buffer + i * sizeof(uint64_t)));
printf("index %lld\n", *(int64_t*)(val_buffer + i * sizeof(int64_t))); printf("index %llu\n", *(int64_t*)(val_buffer + i * sizeof(uint64_t)));
int ac_size = *(int*)(val_buffer + i * sizeof(int) + len * sizeof(int64_t)); int ac_size = *(int*)(val_buffer + i * sizeof(int) + len * sizeof(int64_t));
printf("sampled %d neigbhors\n", ac_size); printf("sampled %d neigbhors\n", ac_size);
for (int j = 0; j < ac_size; j++) { for (int j = 0; j < ac_size; j++) {
printf("%lld ", sample_val[i * sample_len + j]); printf("%llu ", sample_val[i * sample_len + j]);
} }
printf("\n"); printf("\n");
} }
} }
void GpuPsGraphTable::move_neighbor_sample_result_to_source_gpu(
int start_index, void GpuPsGraphTable::move_result_to_source_gpu(int start_index,
int gpu_num, int gpu_num,
int sample_size, int sample_size,
int* h_left, int* h_left,
int* h_right, int* h_right,
int64_t* src_sample_res, uint64_t* src_sample_res,
int* actual_sample_size) { int* actual_sample_size) {
int shard_len[gpu_num]; int shard_len[gpu_num];
for (int i = 0; i < gpu_num; i++) { for (int i = 0; i < gpu_num; i++) {
...@@ -301,144 +249,44 @@ void GpuPsGraphTable::move_neighbor_sample_result_to_source_gpu( ...@@ -301,144 +249,44 @@ void GpuPsGraphTable::move_neighbor_sample_result_to_source_gpu(
shard_len[i] = h_right[i] - h_left[i] + 1; shard_len[i] = h_right[i] - h_left[i] + 1;
int cur_step = (int)path_[start_index][i].nodes_.size() - 1; int cur_step = (int)path_[start_index][i].nodes_.size() - 1;
for (int j = cur_step; j > 0; j--) { for (int j = cur_step; j > 0; j--) {
CUDA_CHECK(
cudaMemcpyAsync(path_[start_index][i].nodes_[j - 1].val_storage, cudaMemcpyAsync(path_[start_index][i].nodes_[j - 1].val_storage,
path_[start_index][i].nodes_[j].val_storage, path_[start_index][i].nodes_[j].val_storage,
path_[start_index][i].nodes_[j - 1].val_bytes_len, path_[start_index][i].nodes_[j - 1].val_bytes_len,
cudaMemcpyDefault, cudaMemcpyDefault,
path_[start_index][i].nodes_[j - 1].out_stream); path_[start_index][i].nodes_[j - 1].out_stream));
} }
auto& node = path_[start_index][i].nodes_.front(); auto& node = path_[start_index][i].nodes_.front();
cudaMemcpyAsync( CUDA_CHECK(cudaMemcpyAsync(
reinterpret_cast<char*>(src_sample_res + h_left[i] * sample_size), reinterpret_cast<char*>(src_sample_res + h_left[i] * sample_size),
node.val_storage + sizeof(int64_t) * shard_len[i] + node.val_storage + sizeof(int64_t) * shard_len[i] +
sizeof(int) * (shard_len[i] + shard_len[i] % 2), sizeof(int) * (shard_len[i] + shard_len[i] % 2),
sizeof(int64_t) * shard_len[i] * sample_size, sizeof(uint64_t) * shard_len[i] * sample_size,
cudaMemcpyDefault, cudaMemcpyDefault,
node.out_stream); node.out_stream));
CUDA_CHECK(
cudaMemcpyAsync(reinterpret_cast<char*>(actual_sample_size + h_left[i]), cudaMemcpyAsync(reinterpret_cast<char*>(actual_sample_size + h_left[i]),
node.val_storage + sizeof(int64_t) * shard_len[i], node.val_storage + sizeof(int64_t) * shard_len[i],
sizeof(int) * shard_len[i], sizeof(int) * shard_len[i],
cudaMemcpyDefault, cudaMemcpyDefault,
node.out_stream); node.out_stream));
} }
for (int i = 0; i < gpu_num; ++i) { for (int i = 0; i < gpu_num; ++i) {
if (h_left[i] == -1 || h_right[i] == -1) { if (h_left[i] == -1 || h_right[i] == -1) {
continue; continue;
} }
auto& node = path_[start_index][i].nodes_.front(); auto& node = path_[start_index][i].nodes_.front();
cudaStreamSynchronize(node.out_stream); CUDA_CHECK(cudaStreamSynchronize(node.out_stream));
// cudaStreamSynchronize(resource_->remote_stream(i, start_index)); // cudaStreamSynchronize(resource_->remote_stream(i, start_index));
} }
/*
std::queue<CopyTask> que;
// auto& node = path_[gpu_id][i].nodes_.front();
// cudaMemcpyAsync(
// reinterpret_cast<char*>(src_sample_res + h_left[i] * sample_size),
// node.val_storage + sizeof(int64_t) * shard_len,
// node.val_bytes_len - sizeof(int64_t) * shard_len, cudaMemcpyDefault,
// node.out_stream);
// cudaMemcpyAsync(reinterpret_cast<char*>(actual_sample_size + h_left[i]),
// node.val_storage + sizeof(int) * shard_len,
// sizeof(int) * shard_len, cudaMemcpyDefault,
// node.out_stream);
int cur_step = path_[start_index][i].nodes_.size() - 1;
auto& node = path_[start_index][i].nodes_[cur_step];
if (cur_step == 0) {
// cudaMemcpyAsync(reinterpret_cast<char*>(src_val + h_left[i]),
// node.val_storage, node.val_bytes_len,
// cudaMemcpyDefault,
// node.out_stream);
// VLOG(0)<<"copy "<<node.gpu_num<<" to "<<start_index;
cudaMemcpyAsync(
reinterpret_cast<char*>(src_sample_res + h_left[i] * sample_size),
node.val_storage + sizeof(int64_t) * shard_len[i],
node.val_bytes_len - sizeof(int64_t) * shard_len[i],
cudaMemcpyDefault,
node.out_stream);
//resource_->remote_stream(i, start_index));
cudaMemcpyAsync(reinterpret_cast<char*>(actual_sample_size + h_left[i]),
node.val_storage + sizeof(int) * shard_len[i],
sizeof(int) * shard_len[i], cudaMemcpyDefault,
node.out_stream);
//resource_->remote_stream(i, start_index));
} else {
CopyTask t(&path_[start_index][i], cur_step - 1);
que.push(t);
// VLOG(0)<<"copy "<<node.gpu_num<<" to
"<<path_[start_index][i].nodes_[cur_step - 1].gpu_num;
cudaMemcpyAsync(path_[start_index][i].nodes_[cur_step - 1].val_storage,
node.val_storage,
path_[start_index][i].nodes_[cur_step - 1].val_bytes_len,
cudaMemcpyDefault,
path_[start_index][i].nodes_[cur_step - 1].out_stream);
//resource_->remote_stream(i, start_index));
}
}
while (!que.empty()) {
CopyTask& cur_task = que.front();
que.pop();
int cur_step = cur_task.step;
if (cur_task.path->nodes_[cur_step].sync) {
cudaStreamSynchronize(cur_task.path->nodes_[cur_step].out_stream);
//cudaStreamSynchronize(resource_->remote_stream(cur_task.path->nodes_.back().gpu_num,
start_index));
}
if (cur_step > 0) {
CopyTask c(cur_task.path, cur_step - 1);
que.push(c);
cudaMemcpyAsync(cur_task.path->nodes_[cur_step - 1].val_storage,
cur_task.path->nodes_[cur_step].val_storage,
cur_task.path->nodes_[cur_step - 1].val_bytes_len,
cudaMemcpyDefault,
cur_task.path->nodes_[cur_step - 1].out_stream);
//resource_->remote_stream(cur_task.path->nodes_.back().gpu_num,
start_index));
} else if (cur_step == 0) {
int end_index = cur_task.path->nodes_.back().gpu_num;
// cudaMemcpyAsync(reinterpret_cast<char*>(src_val + h_left[end_index]),
// cur_task.path->nodes_[cur_step].val_storage,
// cur_task.path->nodes_[cur_step].val_bytes_len,
// cudaMemcpyDefault,
// cur_task.path->nodes_[cur_step].out_stream);
//VLOG(0)<<"copy "<<cur_task.path->nodes_[cur_step].gpu_num<< " to
"<<start_index;
cudaMemcpyAsync(reinterpret_cast<char*>(src_sample_res +
h_left[end_index] * sample_size),
cur_task.path->nodes_[cur_step].val_storage +
sizeof(int64_t) * shard_len[end_index],
cur_task.path->nodes_[cur_step].val_bytes_len -
sizeof(int64_t) * shard_len[end_index],
cudaMemcpyDefault,
cur_task.path->nodes_[cur_step].out_stream);
//resource_->remote_stream(cur_task.path->nodes_.back().gpu_num,
start_index));
cudaMemcpyAsync(
reinterpret_cast<char*>(actual_sample_size + h_left[end_index]),
cur_task.path->nodes_[cur_step].val_storage +
sizeof(int) * shard_len[end_index],
sizeof(int) * shard_len[end_index], cudaMemcpyDefault,
cur_task.path->nodes_[cur_step].out_stream);
//resource_->remote_stream(cur_task.path->nodes_.back().gpu_num,
start_index));
}
}
for (int i = 0; i < gpu_num; ++i) {
if (h_left[i] == -1 || h_right[i] == -1) {
continue;
}
auto& node = path_[start_index][i].nodes_.front();
cudaStreamSynchronize(node.out_stream);
//cudaStreamSynchronize(resource_->remote_stream(i, start_index));
}
*/
} }
/* /*
TODO: TODO:
how to optimize it to eliminate the for loop how to optimize it to eliminate the for loop
*/ */
__global__ void fill_dvalues(int64_t* d_shard_vals, __global__ void fill_dvalues(uint64_t* d_shard_vals,
int64_t* d_vals, uint64_t* d_vals,
int* d_shard_actual_sample_size, int* d_shard_actual_sample_size,
int* d_actual_sample_size, int* d_actual_sample_size,
int* idx, int* idx,
...@@ -453,8 +301,22 @@ __global__ void fill_dvalues(int64_t* d_shard_vals, ...@@ -453,8 +301,22 @@ __global__ void fill_dvalues(int64_t* d_shard_vals,
} }
} }
__global__ void fill_actual_vals(int64_t* vals, __global__ void fill_dvalues(uint64_t* d_shard_vals,
int64_t* actual_vals, uint64_t* d_vals,
int* d_shard_actual_sample_size,
int* idx,
int sample_size,
int len) {
const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < len) {
for (int j = 0; j < sample_size; j++) {
d_vals[idx[i] * sample_size + j] = d_shard_vals[i * sample_size + j];
}
}
}
__global__ void fill_actual_vals(uint64_t* vals,
uint64_t* actual_vals,
int* actual_sample_size, int* actual_sample_size,
int* cumsum_actual_sample_size, int* cumsum_actual_sample_size,
int sample_size, int sample_size,
...@@ -470,39 +332,57 @@ __global__ void fill_actual_vals(int64_t* vals, ...@@ -470,39 +332,57 @@ __global__ void fill_actual_vals(int64_t* vals,
__global__ void node_query_example(GpuPsCommGraph graph, __global__ void node_query_example(GpuPsCommGraph graph,
int start, int start,
int size, int size,
int64_t* res) { uint64_t* res) {
const size_t i = blockIdx.x * blockDim.x + threadIdx.x; const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < size) { if (i < size) {
res[i] = graph.node_list[start + i].node_id; res[i] = graph.node_list[start + i];
} }
} }
void GpuPsGraphTable::clear_graph_info(int gpu_id) { void GpuPsGraphTable::clear_feature_info(int gpu_id) {
if (tables_.size() && tables_[gpu_id] != NULL) { int idx = 0;
delete tables_[gpu_id]; if (idx >= feature_table_num_) return;
int offset = get_table_offset(gpu_id, GraphTableType::FEATURE_TABLE, idx);
if (offset < tables_.size()) {
delete tables_[offset];
tables_[offset] = NULL;
} }
auto& graph = gpu_graph_list[gpu_id];
if (graph.neighbor_list != NULL) { int graph_fea_idx = gpu_id * feature_table_num_ + idx;
cudaFree(graph.neighbor_list); if (graph_fea_idx >= gpu_graph_fea_list_.size()) {
return;
} }
if (graph.node_list != NULL) { auto& graph = gpu_graph_fea_list_[graph_fea_idx];
cudaFree(graph.node_list); if (graph.feature_list != NULL) {
cudaFree(graph.feature_list);
graph.feature_list = NULL;
}
if (graph.slot_id_list != NULL) {
cudaFree(graph.slot_id_list);
graph.slot_id_list = NULL;
} }
} }
void GpuPsGraphTable::clear_graph_info() {
if (tables_.size()) { void GpuPsGraphTable::clear_graph_info(int gpu_id, int idx) {
for (auto table : tables_) delete table; if (idx >= graph_table_num_) return;
int offset = get_table_offset(gpu_id, GraphTableType::EDGE_TABLE, idx);
if (offset < tables_.size()) {
delete tables_[offset];
tables_[offset] = NULL;
} }
tables_.clear(); auto& graph = gpu_graph_list_[gpu_id * graph_table_num_ + idx];
for (auto graph : gpu_graph_list) {
if (graph.neighbor_list != NULL) { if (graph.neighbor_list != NULL) {
cudaFree(graph.neighbor_list); cudaFree(graph.neighbor_list);
graph.neighbor_list = nullptr;
} }
if (graph.node_list != NULL) { if (graph.node_list != NULL) {
cudaFree(graph.node_list); cudaFree(graph.node_list);
graph.node_list = nullptr;
} }
} }
gpu_graph_list.clear(); void GpuPsGraphTable::clear_graph_info(int idx) {
for (int i = 0; i < gpu_num; i++) clear_graph_info(i, idx);
} }
/* /*
the parameter std::vector<GpuPsCommGraph> cpu_graph_list is generated by cpu. the parameter std::vector<GpuPsCommGraph> cpu_graph_list is generated by cpu.
...@@ -512,78 +392,214 @@ for the ith GpuPsCommGraph, any the node's key satisfies that key % gpu_number ...@@ -512,78 +392,214 @@ for the ith GpuPsCommGraph, any the node's key satisfies that key % gpu_number
In this function, memory is allocated on each gpu to save the graphs, In this function, memory is allocated on each gpu to save the graphs,
gpu i saves the ith graph from cpu_graph_list gpu i saves the ith graph from cpu_graph_list
*/ */
void GpuPsGraphTable::build_graph_fea_on_single_gpu(const GpuPsCommGraphFea& g,
int gpu_id) {
clear_feature_info(gpu_id);
int ntype_id = 0;
void GpuPsGraphTable::build_graph_on_single_gpu(GpuPsCommGraph& g, int i) { platform::CUDADeviceGuard guard(resource_->dev_id(gpu_id));
clear_graph_info(i);
platform::CUDADeviceGuard guard(resource_->dev_id(i)); int offset = gpu_id * feature_table_num_ + ntype_id;
// platform::CUDADeviceGuard guard(i); gpu_graph_fea_list_[offset] = GpuPsCommGraphFea();
gpu_graph_list[i] = GpuPsCommGraph();
sample_status[i] = NULL; int table_offset =
tables_[i] = new Table(std::max((int64_t)1, g.node_size) / load_factor_); get_table_offset(gpu_id, GraphTableType::FEATURE_TABLE, ntype_id);
size_t capacity = std::max((uint64_t)1, g.node_size) / load_factor_;
tables_[table_offset] = new Table(capacity);
if (g.node_size > 0) { if (g.node_size > 0) {
std::vector<int64_t> keys; build_ps(gpu_id,
std::vector<int64_t> offset;
cudaMalloc((void**)&gpu_graph_list[i].node_list,
g.node_size * sizeof(GpuPsGraphNode));
cudaMemcpy(gpu_graph_list[i].node_list,
g.node_list, g.node_list,
g.node_size * sizeof(GpuPsGraphNode), (uint64_t*)g.fea_info_list,
g.node_size,
1024,
8,
table_offset);
gpu_graph_fea_list_[offset].node_list = NULL;
gpu_graph_fea_list_[offset].node_size = g.node_size;
} else {
build_ps(gpu_id, NULL, NULL, 0, 1024, 8, table_offset);
gpu_graph_fea_list_[offset].node_list = NULL;
gpu_graph_fea_list_[offset].node_size = 0;
}
if (g.feature_size) {
// TODO
cudaError_t cudaStatus =
cudaMalloc((void**)&gpu_graph_fea_list_[offset].feature_list,
g.feature_size * sizeof(uint64_t));
PADDLE_ENFORCE_EQ(
cudaStatus,
cudaSuccess,
platform::errors::InvalidArgument(
"ailed to allocate memory for graph-feature on gpu "));
VLOG(0) << "sucessfully allocate " << g.feature_size * sizeof(uint64_t)
<< " bytes of memory for graph-feature on gpu "
<< resource_->dev_id(gpu_id);
CUDA_CHECK(cudaMemcpy(gpu_graph_fea_list_[offset].feature_list,
g.feature_list,
g.feature_size * sizeof(uint64_t),
cudaMemcpyHostToDevice));
// TODO
cudaStatus = cudaMalloc((void**)&gpu_graph_fea_list_[offset].slot_id_list,
g.feature_size * sizeof(uint8_t));
PADDLE_ENFORCE_EQ(
cudaStatus,
cudaSuccess,
platform::errors::InvalidArgument(
"ailed to allocate memory for graph-feature on gpu "));
VLOG(0) << "sucessfully allocate " << g.feature_size * sizeof(uint8_t)
<< " bytes of memory for graph-feature on gpu "
<< resource_->dev_id(gpu_id);
cudaMemcpy(gpu_graph_fea_list_[offset].slot_id_list,
g.slot_id_list,
g.feature_size * sizeof(uint8_t),
cudaMemcpyHostToDevice); cudaMemcpyHostToDevice);
for (int64_t j = 0; j < g.node_size; j++) {
keys.push_back(g.node_list[j].node_id); gpu_graph_fea_list_[offset].feature_size = g.feature_size;
offset.push_back(j); } else {
gpu_graph_fea_list_[offset].feature_list = NULL;
gpu_graph_fea_list_[offset].slot_id_list = NULL;
gpu_graph_fea_list_[offset].feature_size = 0;
}
VLOG(0) << "gpu node_feature info card :" << gpu_id << " ,node_size is "
<< gpu_graph_fea_list_[offset].node_size << ", feature_size is "
<< gpu_graph_fea_list_[offset].feature_size;
}
/*
The parameter std::vector<GpuPsCommGraph> cpu_graph_list is generated on the
CPU. It holds the graph that is to be stored on each GPU:
for the i-th GpuPsCommGraph, every node key satisfies
key % gpu_number == i.
In this function, memory is allocated on each GPU to store the graphs;
GPU i stores the i-th graph from cpu_graph_list.
*/
void GpuPsGraphTable::build_graph_on_single_gpu(const GpuPsCommGraph& g,
int i,
int idx) {
clear_graph_info(i, idx);
platform::CUDADeviceGuard guard(resource_->dev_id(i));
int offset = i * graph_table_num_ + idx;
gpu_graph_list_[offset] = GpuPsCommGraph();
int table_offset = get_table_offset(i, GraphTableType::EDGE_TABLE, idx);
size_t capacity = std::max((uint64_t)1, (uint64_t)g.node_size) / load_factor_;
tables_[table_offset] = new Table(capacity);
if (g.node_size > 0) {
if (FLAGS_gpugraph_load_node_list_into_hbm) {
CUDA_CHECK(cudaMalloc((void**)&gpu_graph_list_[offset].node_list,
g.node_size * sizeof(uint64_t)));
CUDA_CHECK(cudaMemcpy(gpu_graph_list_[offset].node_list,
g.node_list,
g.node_size * sizeof(uint64_t),
cudaMemcpyHostToDevice));
} }
build_ps(i, (uint64_t*)keys.data(), offset.data(), keys.size(), 1024, 8);
gpu_graph_list[i].node_size = g.node_size; build_ps(i,
g.node_list,
(uint64_t*)(g.node_info_list),
g.node_size,
1024,
8,
table_offset);
gpu_graph_list_[offset].node_size = g.node_size;
} else { } else {
build_ps(i, NULL, NULL, 0, 1024, 8); build_ps(i, NULL, NULL, 0, 1024, 8, table_offset);
gpu_graph_list[i].node_list = NULL; gpu_graph_list_[offset].node_list = NULL;
gpu_graph_list[i].node_size = 0; gpu_graph_list_[offset].node_size = 0;
} }
if (g.neighbor_size) { if (g.neighbor_size) {
cudaError_t cudaStatus = cudaError_t cudaStatus =
cudaMalloc((void**)&gpu_graph_list[i].neighbor_list, cudaMalloc((void**)&gpu_graph_list_[offset].neighbor_list,
g.neighbor_size * sizeof(int64_t)); g.neighbor_size * sizeof(uint64_t));
PADDLE_ENFORCE_EQ(cudaStatus, PADDLE_ENFORCE_EQ(cudaStatus,
cudaSuccess, cudaSuccess,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"ailed to allocate memory for graph on gpu ")); "ailed to allocate memory for graph on gpu "));
VLOG(0) << "sucessfully allocate " << g.neighbor_size * sizeof(int64_t) VLOG(0) << "sucessfully allocate " << g.neighbor_size * sizeof(uint64_t)
<< " bytes of memory for graph-edges on gpu " << " bytes of memory for graph-edges on gpu "
<< resource_->dev_id(i); << resource_->dev_id(i);
cudaMemcpy(gpu_graph_list[i].neighbor_list, CUDA_CHECK(cudaMemcpy(gpu_graph_list_[offset].neighbor_list,
g.neighbor_list, g.neighbor_list,
g.neighbor_size * sizeof(int64_t), g.neighbor_size * sizeof(uint64_t),
cudaMemcpyHostToDevice); cudaMemcpyHostToDevice));
gpu_graph_list[i].neighbor_size = g.neighbor_size; gpu_graph_list_[offset].neighbor_size = g.neighbor_size;
} else { } else {
gpu_graph_list[i].neighbor_list = NULL; gpu_graph_list_[offset].neighbor_list = NULL;
gpu_graph_list[i].neighbor_size = 0; gpu_graph_list_[offset].neighbor_size = 0;
} }
VLOG(0) << " gpu node_neighbor info card: " << i << " ,node_size is "
<< gpu_graph_list_[offset].node_size << ", neighbor_size is "
<< gpu_graph_list_[offset].neighbor_size;
} }
void GpuPsGraphTable::init_sample_status() { void GpuPsGraphTable::build_graph_fea_from_cpu(
for (int i = 0; i < gpu_num; i++) { const std::vector<GpuPsCommGraphFea>& cpu_graph_fea_list, int ntype_id) {
if (gpu_graph_list[i].neighbor_size) { PADDLE_ENFORCE_EQ(
cpu_graph_fea_list.size(),
resource_->total_device(),
platform::errors::InvalidArgument("the cpu node list size doesn't match "
"the number of gpu on your machine."));
clear_feature_info(ntype_id);
for (int i = 0; i < cpu_graph_fea_list.size(); i++) {
int table_offset =
get_table_offset(i, GraphTableType::FEATURE_TABLE, ntype_id);
int offset = i * feature_table_num_ + ntype_id;
platform::CUDADeviceGuard guard(resource_->dev_id(i)); platform::CUDADeviceGuard guard(resource_->dev_id(i));
int* addr; gpu_graph_fea_list_[offset] = GpuPsCommGraphFea();
cudaMalloc((void**)&addr, gpu_graph_list[i].neighbor_size * sizeof(int)); tables_[table_offset] = new Table(
cudaMemset(addr, 0, gpu_graph_list[i].neighbor_size * sizeof(int)); std::max((uint64_t)1, (uint64_t)cpu_graph_fea_list[i].node_size) /
sample_status[i] = addr; load_factor_);
if (cpu_graph_fea_list[i].node_size > 0) {
build_ps(i,
cpu_graph_fea_list[i].node_list,
(uint64_t*)cpu_graph_fea_list[i].fea_info_list,
cpu_graph_fea_list[i].node_size,
1024,
8,
table_offset);
gpu_graph_fea_list_[offset].node_size = cpu_graph_fea_list[i].node_size;
} else {
build_ps(i, NULL, NULL, 0, 1024, 8, table_offset);
gpu_graph_fea_list_[offset].node_list = NULL;
gpu_graph_fea_list_[offset].node_size = 0;
}
if (cpu_graph_fea_list[i].feature_size) {
// TODO
CUDA_CHECK(
cudaMalloc((void**)&gpu_graph_fea_list_[offset].feature_list,
cpu_graph_fea_list[i].feature_size * sizeof(uint64_t)));
CUDA_CHECK(
cudaMemcpy(gpu_graph_fea_list_[offset].feature_list,
cpu_graph_fea_list[i].feature_list,
cpu_graph_fea_list[i].feature_size * sizeof(uint64_t),
cudaMemcpyHostToDevice));
// TODO
CUDA_CHECK(
cudaMalloc((void**)&gpu_graph_fea_list_[offset].slot_id_list,
cpu_graph_fea_list[i].feature_size * sizeof(uint8_t)));
CUDA_CHECK(
cudaMemcpy(gpu_graph_fea_list_[offset].slot_id_list,
cpu_graph_fea_list[i].slot_id_list,
cpu_graph_fea_list[i].feature_size * sizeof(uint8_t),
cudaMemcpyHostToDevice));
gpu_graph_fea_list_[offset].feature_size =
cpu_graph_fea_list[i].feature_size;
} else {
gpu_graph_fea_list_[offset].feature_list = NULL;
gpu_graph_fea_list_[offset].slot_id_list = NULL;
gpu_graph_fea_list_[offset].feature_size = 0;
} }
} }
cudaDeviceSynchronize();
} }
// Releases the per-GPU sample-status buffers.
//
// For every GPU index in [0, gpu_num) whose sample_status entry is non-null,
// switches to that GPU's device and frees the buffer with cudaFree. The entry
// is then reset to NULL so that calling this function again (or calling it
// after a rebuild that did not re-allocate this slot) does not free the same
// device pointer twice.
void GpuPsGraphTable::free_sample_status() {
  for (int i = 0; i < gpu_num; i++) {
    if (sample_status[i] == NULL) {
      continue;
    }
    // cudaFree must run with the owning device selected.
    platform::CUDADeviceGuard guard(resource_->dev_id(i));
    cudaFree(sample_status[i]);
    // Drop the dangling pointer so a later call is a no-op for this slot.
    sample_status[i] = NULL;
  }
}
void GpuPsGraphTable::build_graph_from_cpu( void GpuPsGraphTable::build_graph_from_cpu(
std::vector<GpuPsCommGraph>& cpu_graph_list) { const std::vector<GpuPsCommGraph>& cpu_graph_list, int idx) {
VLOG(0) << "in build_graph_from_cpu cpu_graph_list size = " VLOG(0) << "in build_graph_from_cpu cpu_graph_list size = "
<< cpu_graph_list.size(); << cpu_graph_list.size();
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
...@@ -591,240 +607,77 @@ void GpuPsGraphTable::build_graph_from_cpu( ...@@ -591,240 +607,77 @@ void GpuPsGraphTable::build_graph_from_cpu(
resource_->total_device(), resource_->total_device(),
platform::errors::InvalidArgument("the cpu node list size doesn't match " platform::errors::InvalidArgument("the cpu node list size doesn't match "
"the number of gpu on your machine.")); "the number of gpu on your machine."));
clear_graph_info(); clear_graph_info(idx);
for (int i = 0; i < cpu_graph_list.size(); i++) { for (int i = 0; i < cpu_graph_list.size(); i++) {
int table_offset = get_table_offset(i, GraphTableType::EDGE_TABLE, idx);
int offset = i * graph_table_num_ + idx;
platform::CUDADeviceGuard guard(resource_->dev_id(i)); platform::CUDADeviceGuard guard(resource_->dev_id(i));
gpu_graph_list[i] = GpuPsCommGraph(); gpu_graph_list_[offset] = GpuPsCommGraph();
sample_status[i] = NULL; tables_[table_offset] =
tables_[i] = new Table(std::max((int64_t)1, cpu_graph_list[i].node_size) / new Table(std::max((uint64_t)1, (uint64_t)cpu_graph_list[i].node_size) /
load_factor_); load_factor_);
if (cpu_graph_list[i].node_size > 0) { if (cpu_graph_list[i].node_size > 0) {
std::vector<int64_t> keys; CUDA_CHECK(cudaMalloc((void**)&gpu_graph_list_[offset].node_list,
std::vector<int64_t> offset; cpu_graph_list[i].node_size * sizeof(uint64_t)));
cudaMalloc((void**)&gpu_graph_list[i].node_list, CUDA_CHECK(cudaMemcpy(gpu_graph_list_[offset].node_list,
cpu_graph_list[i].node_size * sizeof(GpuPsGraphNode));
cudaMemcpy(gpu_graph_list[i].node_list,
cpu_graph_list[i].node_list, cpu_graph_list[i].node_list,
cpu_graph_list[i].node_size * sizeof(GpuPsGraphNode), cpu_graph_list[i].node_size * sizeof(uint64_t),
cudaMemcpyHostToDevice); cudaMemcpyHostToDevice));
for (int64_t j = 0; j < cpu_graph_list[i].node_size; j++) { build_ps(i,
keys.push_back(cpu_graph_list[i].node_list[j].node_id); cpu_graph_list[i].node_list,
offset.push_back(j); (uint64_t*)(cpu_graph_list[i].node_info_list),
} cpu_graph_list[i].node_size,
build_ps( 1024,
i, (uint64_t*)(keys.data()), offset.data(), keys.size(), 1024, 8); 8,
gpu_graph_list[i].node_size = cpu_graph_list[i].node_size; table_offset);
gpu_graph_list_[offset].node_size = cpu_graph_list[i].node_size;
} else { } else {
build_ps(i, NULL, NULL, 0, 1024, 8); build_ps(i, NULL, NULL, 0, 1024, 8, table_offset);
gpu_graph_list[i].node_list = NULL; gpu_graph_list_[offset].node_list = NULL;
gpu_graph_list[i].node_size = 0; gpu_graph_list_[offset].node_size = 0;
} }
if (cpu_graph_list[i].neighbor_size) { if (cpu_graph_list[i].neighbor_size) {
cudaMalloc((void**)&gpu_graph_list[i].neighbor_list, CUDA_CHECK(
cpu_graph_list[i].neighbor_size * sizeof(int64_t)); cudaMalloc((void**)&gpu_graph_list_[offset].neighbor_list,
cpu_graph_list[i].neighbor_size * sizeof(uint64_t)));
cudaMemcpy(gpu_graph_list[i].neighbor_list, CUDA_CHECK(cudaMemcpy(gpu_graph_list_[offset].neighbor_list,
cpu_graph_list[i].neighbor_list, cpu_graph_list[i].neighbor_list,
cpu_graph_list[i].neighbor_size * sizeof(int64_t), cpu_graph_list[i].neighbor_size * sizeof(uint64_t),
cudaMemcpyHostToDevice); cudaMemcpyHostToDevice));
gpu_graph_list[i].neighbor_size = cpu_graph_list[i].neighbor_size; gpu_graph_list_[offset].neighbor_size = cpu_graph_list[i].neighbor_size;
} else { } else {
gpu_graph_list[i].neighbor_list = NULL; gpu_graph_list_[offset].neighbor_list = NULL;
gpu_graph_list[i].neighbor_size = 0; gpu_graph_list_[offset].neighbor_size = 0;
} }
} }
cudaDeviceSynchronize(); CUDA_CHECK(cudaDeviceSynchronize());
} }
NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v3( NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v3(
NeighborSampleQuery q, bool cpu_switch) { NeighborSampleQuery q, bool cpu_switch) {
return graph_neighbor_sample_v2( return graph_neighbor_sample_v2(global_device_map[q.gpu_id],
global_device_map[q.gpu_id], q.key, q.sample_size, q.len, cpu_switch); q.table_idx,
q.src_nodes,
q.sample_size,
q.len,
cpu_switch);
} }
NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample(int gpu_id, NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample(int gpu_id,
int64_t* key, uint64_t* key,
int sample_size, int sample_size,
int len) { int len) {
/* return graph_neighbor_sample_v2(gpu_id, 0, key, sample_size, len, false);
comment 2
this function shares some kernels with heter_comm_inl.h
arguments definitions:
gpu_id:the id of gpu.
len:how many keys are used,(the length of array key)
sample_size:how many neighbors should be sampled for each node in key.
the code below shuffle the key array to make the keys
that belong to a gpu-card stay together,
the shuffled result is saved on d_shard_keys,
if ith element in d_shard_keys_ptr is
from jth element in the original key array, then idx[i] = j,
idx could be used to recover the original array.
if keys in range [a,b] belong to ith-gpu, then h_left[i] = a, h_right[i] =
b,
if no keys are allocated for ith-gpu, then h_left[i] == h_right[i] == -1
for example, suppose key = [0,1,2,3,4,5,6,7,8], gpu_num = 2
when we run this neighbor_sample function,
the key is shuffled to [0,2,4,6,8,1,3,5,7]
the first part (0,2,4,6,8) % 2 == 0,thus should be handled by gpu 0,
the rest part should be handled by gpu1, because (1,3,5,7) % 2 == 1,
h_left = [0,5],h_right = [4,8]
*/
NeighborSampleResult result;
result.initialize(sample_size, len, resource_->dev_id(gpu_id));
if (len == 0) {
return result;
}
platform::CUDAPlace place = platform::CUDAPlace(resource_->dev_id(gpu_id));
platform::CUDADeviceGuard guard(resource_->dev_id(gpu_id));
int* actual_sample_size = result.actual_sample_size;
int64_t* val = result.val;
int total_gpu = resource_->total_device();
auto stream = resource_->local_stream(gpu_id, 0);
int grid_size = (len - 1) / block_size_ + 1;
int h_left[total_gpu]; // NOLINT
int h_right[total_gpu]; // NOLINT
auto d_left = memory::Alloc(place, total_gpu * sizeof(int));
auto d_right = memory::Alloc(place, total_gpu * sizeof(int));
int* d_left_ptr = reinterpret_cast<int*>(d_left->ptr());
int* d_right_ptr = reinterpret_cast<int*>(d_right->ptr());
cudaMemsetAsync(d_left_ptr, -1, total_gpu * sizeof(int), stream);
cudaMemsetAsync(d_right_ptr, -1, total_gpu * sizeof(int), stream);
//
auto d_idx = memory::Alloc(place, len * sizeof(int));
int* d_idx_ptr = reinterpret_cast<int*>(d_idx->ptr());
auto d_shard_keys = memory::Alloc(place, len * sizeof(int64_t));
int64_t* d_shard_keys_ptr = reinterpret_cast<int64_t*>(d_shard_keys->ptr());
auto d_shard_vals = memory::Alloc(place, sample_size * len * sizeof(int64_t));
int64_t* d_shard_vals_ptr = reinterpret_cast<int64_t*>(d_shard_vals->ptr());
auto d_shard_actual_sample_size = memory::Alloc(place, len * sizeof(int));
int* d_shard_actual_sample_size_ptr =
reinterpret_cast<int*>(d_shard_actual_sample_size->ptr());
split_input_to_shard(
(uint64_t*)(key), d_idx_ptr, len, d_left_ptr, d_right_ptr, gpu_id);
heter_comm_kernel_->fill_shard_key(
d_shard_keys_ptr, key, d_idx_ptr, len, stream);
cudaStreamSynchronize(stream);
cudaMemcpy(
h_left, d_left_ptr, total_gpu * sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(
h_right, d_right_ptr, total_gpu * sizeof(int), cudaMemcpyDeviceToHost);
// auto start1 = std::chrono::steady_clock::now();
for (int i = 0; i < total_gpu; ++i) {
int shard_len = h_left[i] == -1 ? 0 : h_right[i] - h_left[i] + 1;
if (shard_len == 0) {
continue;
}
/*
comment 3
shard_len denotes the size of keys on i-th gpu here,
when we sample on i-th gpu, we allocate shard_len * (1 + sample_size)
int64_t units
of memory, we use alloc_mem_i to denote it, the range [0,shard_len) is saved
for the respective nodes' indexes
and actual sample_size.
With the nodes' indexes we can get the nodes to sample.
Since the size of int64_t is 8 bytes while the size of int is 4 bytes,
the range [0,shard_len) contains shard_len * 2 int units;
The values of the first half of this range will be updated by
the k-v map on i-th-gpu.
The second half of this range is saved for actual sample size of each node.
For node x,
its sampling result is saved on the range
[shard_len + sample_size * x,shard_len + sample_size * x +
actual_sample_size_of_x)
of alloc_mem_i, actual_sample_size_of_x equals ((int
*)alloc_mem_i)[shard_len + x]
*/
create_storage(gpu_id,
i,
shard_len * sizeof(int64_t),
shard_len * (1 + sample_size) * sizeof(int64_t) +
sizeof(int) * (shard_len + shard_len % 2));
// auto& node = path_[gpu_id][i].nodes_[0];
}
walk_to_dest(
gpu_id, total_gpu, h_left, h_right, (uint64_t*)(d_shard_keys_ptr), NULL);
for (int i = 0; i < total_gpu; ++i) {
if (h_left[i] == -1) {
continue;
}
int shard_len = h_left[i] == -1 ? 0 : h_right[i] - h_left[i] + 1;
auto& node = path_[gpu_id][i].nodes_.back();
cudaMemsetAsync(
node.val_storage, -1, shard_len * sizeof(int64_t), node.in_stream);
cudaStreamSynchronize(node.in_stream);
platform::CUDADeviceGuard guard(resource_->dev_id(i));
tables_[i]->get(reinterpret_cast<uint64_t*>(node.key_storage),
reinterpret_cast<int64_t*>(node.val_storage),
h_right[i] - h_left[i] + 1,
resource_->remote_stream(i, gpu_id));
// node.in_stream);
auto graph = gpu_graph_list[i];
int64_t* id_array = reinterpret_cast<int64_t*>(node.val_storage);
int* actual_size_array = (int*)(id_array + shard_len);
int64_t* sample_array =
(int64_t*)(actual_size_array + shard_len + shard_len % 2);
int sample_grid_size = (shard_len - 1) / dim_y + 1;
dim3 block(parallel_sample_size, dim_y);
dim3 grid(sample_grid_size);
neighbor_sample_example<<<grid,
block,
0,
resource_->remote_stream(i, gpu_id)>>>(
graph,
id_array,
actual_size_array,
sample_array,
sample_size,
sample_status[i],
shard_len,
gpu_id);
}
for (int i = 0; i < total_gpu; ++i) {
if (h_left[i] == -1) {
continue;
}
cudaStreamSynchronize(resource_->remote_stream(i, gpu_id));
}
move_neighbor_sample_result_to_source_gpu(gpu_id,
total_gpu,
sample_size,
h_left,
h_right,
d_shard_vals_ptr,
d_shard_actual_sample_size_ptr);
fill_dvalues<<<grid_size, block_size_, 0, stream>>>(
d_shard_vals_ptr,
val,
d_shard_actual_sample_size_ptr,
actual_sample_size,
d_idx_ptr,
sample_size,
len);
for (int i = 0; i < total_gpu; ++i) {
int shard_len = h_left[i] == -1 ? 0 : h_right[i] - h_left[i] + 1;
if (shard_len == 0) {
continue;
}
destroy_storage(gpu_id, i);
}
cudaStreamSynchronize(stream);
return result;
} }
NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2( NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2(
int gpu_id, int64_t* key, int sample_size, int len, bool cpu_query_switch) { int gpu_id,
int idx,
uint64_t* key,
int sample_size,
int len,
bool cpu_query_switch) {
NeighborSampleResult result; NeighborSampleResult result;
result.initialize(sample_size, len, resource_->dev_id(gpu_id)); result.initialize(sample_size, len, resource_->dev_id(gpu_id));
...@@ -834,8 +687,9 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2( ...@@ -834,8 +687,9 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2(
platform::CUDAPlace place = platform::CUDAPlace(resource_->dev_id(gpu_id)); platform::CUDAPlace place = platform::CUDAPlace(resource_->dev_id(gpu_id));
platform::CUDADeviceGuard guard(resource_->dev_id(gpu_id)); platform::CUDADeviceGuard guard(resource_->dev_id(gpu_id));
int* actual_sample_size = result.actual_sample_size; int* actual_sample_size = result.actual_sample_size;
int64_t* val = result.val; uint64_t* val = result.val;
int total_gpu = resource_->total_device(); int total_gpu = resource_->total_device();
auto stream = resource_->local_stream(gpu_id, 0); auto stream = resource_->local_stream(gpu_id, 0);
...@@ -853,16 +707,17 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2( ...@@ -853,16 +707,17 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2(
default_value = -1; default_value = -1;
} }
cudaMemsetAsync(d_left_ptr, -1, total_gpu * sizeof(int), stream); CUDA_CHECK(cudaMemsetAsync(d_left_ptr, -1, total_gpu * sizeof(int), stream));
cudaMemsetAsync(d_right_ptr, -1, total_gpu * sizeof(int), stream); CUDA_CHECK(cudaMemsetAsync(d_right_ptr, -1, total_gpu * sizeof(int), stream));
// //
auto d_idx = memory::Alloc(place, len * sizeof(int)); auto d_idx = memory::Alloc(place, len * sizeof(int));
int* d_idx_ptr = reinterpret_cast<int*>(d_idx->ptr()); int* d_idx_ptr = reinterpret_cast<int*>(d_idx->ptr());
auto d_shard_keys = memory::Alloc(place, len * sizeof(int64_t)); auto d_shard_keys = memory::Alloc(place, len * sizeof(uint64_t));
int64_t* d_shard_keys_ptr = reinterpret_cast<int64_t*>(d_shard_keys->ptr()); uint64_t* d_shard_keys_ptr = reinterpret_cast<uint64_t*>(d_shard_keys->ptr());
auto d_shard_vals = memory::Alloc(place, sample_size * len * sizeof(int64_t)); auto d_shard_vals =
int64_t* d_shard_vals_ptr = reinterpret_cast<int64_t*>(d_shard_vals->ptr()); memory::Alloc(place, sample_size * len * sizeof(uint64_t));
uint64_t* d_shard_vals_ptr = reinterpret_cast<uint64_t*>(d_shard_vals->ptr());
auto d_shard_actual_sample_size = memory::Alloc(place, len * sizeof(int)); auto d_shard_actual_sample_size = memory::Alloc(place, len * sizeof(int));
int* d_shard_actual_sample_size_ptr = int* d_shard_actual_sample_size_ptr =
reinterpret_cast<int*>(d_shard_actual_sample_size->ptr()); reinterpret_cast<int*>(d_shard_actual_sample_size->ptr());
...@@ -873,12 +728,12 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2( ...@@ -873,12 +728,12 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2(
heter_comm_kernel_->fill_shard_key( heter_comm_kernel_->fill_shard_key(
d_shard_keys_ptr, key, d_idx_ptr, len, stream); d_shard_keys_ptr, key, d_idx_ptr, len, stream);
cudaStreamSynchronize(stream); CUDA_CHECK(cudaStreamSynchronize(stream));
cudaMemcpy( CUDA_CHECK(cudaMemcpy(
h_left, d_left_ptr, total_gpu * sizeof(int), cudaMemcpyDeviceToHost); h_left, d_left_ptr, total_gpu * sizeof(int), cudaMemcpyDeviceToHost));
cudaMemcpy( CUDA_CHECK(cudaMemcpy(
h_right, d_right_ptr, total_gpu * sizeof(int), cudaMemcpyDeviceToHost); h_right, d_right_ptr, total_gpu * sizeof(int), cudaMemcpyDeviceToHost));
for (int i = 0; i < total_gpu; ++i) { for (int i = 0; i < total_gpu; ++i) {
int shard_len = h_left[i] == -1 ? 0 : h_right[i] - h_left[i] + 1; int shard_len = h_left[i] == -1 ? 0 : h_right[i] - h_left[i] + 1;
if (shard_len == 0) { if (shard_len == 0) {
...@@ -886,8 +741,9 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2( ...@@ -886,8 +741,9 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2(
} }
create_storage(gpu_id, create_storage(gpu_id,
i, i,
shard_len * sizeof(int64_t), shard_len * sizeof(uint64_t),
shard_len * (1 + sample_size) * sizeof(int64_t) + shard_len * sample_size * sizeof(uint64_t) +
shard_len * sizeof(uint64_t) +
sizeof(int) * (shard_len + shard_len % 2)); sizeof(int) * (shard_len + shard_len % 2));
} }
walk_to_dest( walk_to_dest(
...@@ -899,30 +755,35 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2( ...@@ -899,30 +755,35 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2(
} }
int shard_len = h_left[i] == -1 ? 0 : h_right[i] - h_left[i] + 1; int shard_len = h_left[i] == -1 ? 0 : h_right[i] - h_left[i] + 1;
auto& node = path_[gpu_id][i].nodes_.back(); auto& node = path_[gpu_id][i].nodes_.back();
cudaMemsetAsync(
node.val_storage, -1, shard_len * sizeof(int64_t), node.in_stream); CUDA_CHECK(cudaMemsetAsync(
cudaStreamSynchronize(node.in_stream); node.val_storage, 0, shard_len * sizeof(int64_t), node.in_stream));
CUDA_CHECK(cudaStreamSynchronize(node.in_stream));
platform::CUDADeviceGuard guard(resource_->dev_id(i)); platform::CUDADeviceGuard guard(resource_->dev_id(i));
// If not found, val is -1. // If not found, val is -1.
tables_[i]->get(reinterpret_cast<uint64_t*>(node.key_storage), int table_offset = get_table_offset(i, GraphTableType::EDGE_TABLE, idx);
reinterpret_cast<int64_t*>(node.val_storage), int offset = i * graph_table_num_ + idx;
h_right[i] - h_left[i] + 1, tables_[table_offset]->get(reinterpret_cast<uint64_t*>(node.key_storage),
reinterpret_cast<uint64_t*>(node.val_storage),
(size_t)(h_right[i] - h_left[i] + 1),
resource_->remote_stream(i, gpu_id)); resource_->remote_stream(i, gpu_id));
auto graph = gpu_graph_list[i]; auto graph = gpu_graph_list_[offset];
int64_t* id_array = reinterpret_cast<int64_t*>(node.val_storage); GpuPsNodeInfo* node_info_list =
int* actual_size_array = (int*)(id_array + shard_len); reinterpret_cast<GpuPsNodeInfo*>(node.val_storage);
int64_t* sample_array = int* actual_size_array = (int*)(node_info_list + shard_len);
(int64_t*)(actual_size_array + shard_len + shard_len % 2); uint64_t* sample_array =
(uint64_t*)(actual_size_array + shard_len + shard_len % 2);
constexpr int WARP_SIZE = 32; constexpr int WARP_SIZE = 32;
constexpr int BLOCK_WARPS = 128 / WARP_SIZE; constexpr int BLOCK_WARPS = 128 / WARP_SIZE;
constexpr int TILE_SIZE = BLOCK_WARPS * 16; constexpr int TILE_SIZE = BLOCK_WARPS * 16;
const dim3 block(WARP_SIZE, BLOCK_WARPS); const dim3 block(WARP_SIZE, BLOCK_WARPS);
const dim3 grid((shard_len + TILE_SIZE - 1) / TILE_SIZE); const dim3 grid((shard_len + TILE_SIZE - 1) / TILE_SIZE);
neighbor_sample_example_v2<WARP_SIZE, BLOCK_WARPS, TILE_SIZE>
neighbor_sample_kernel<WARP_SIZE, BLOCK_WARPS, TILE_SIZE>
<<<grid, block, 0, resource_->remote_stream(i, gpu_id)>>>( <<<grid, block, 0, resource_->remote_stream(i, gpu_id)>>>(
graph, graph,
id_array, node_info_list,
actual_size_array, actual_size_array,
sample_array, sample_array,
sample_size, sample_size,
...@@ -934,10 +795,9 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2( ...@@ -934,10 +795,9 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2(
if (h_left[i] == -1) { if (h_left[i] == -1) {
continue; continue;
} }
cudaStreamSynchronize(resource_->remote_stream(i, gpu_id)); CUDA_CHECK(cudaStreamSynchronize(resource_->remote_stream(i, gpu_id)));
} }
move_result_to_source_gpu(gpu_id,
move_neighbor_sample_result_to_source_gpu(gpu_id,
total_gpu, total_gpu,
sample_size, sample_size,
h_left, h_left,
...@@ -953,11 +813,11 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2( ...@@ -953,11 +813,11 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2(
sample_size, sample_size,
len); len);
cudaStreamSynchronize(stream); CUDA_CHECK(cudaStreamSynchronize(stream));
if (cpu_query_switch) { if (cpu_query_switch) {
// Get cpu keys and corresponding position. // Get cpu keys and corresponding position.
thrust::device_vector<int64_t> t_cpu_keys(len); thrust::device_vector<uint64_t> t_cpu_keys(len);
thrust::device_vector<int> t_index(len + 1, 0); thrust::device_vector<int> t_index(len + 1, 0);
get_cpu_id_index<<<grid_size, block_size_, 0, stream>>>( get_cpu_id_index<<<grid_size, block_size_, 0, stream>>>(
key, key,
...@@ -967,52 +827,52 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2( ...@@ -967,52 +827,52 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2(
thrust::raw_pointer_cast(t_index.data()) + 1, thrust::raw_pointer_cast(t_index.data()) + 1,
len); len);
cudaStreamSynchronize(stream); CUDA_CHECK(cudaStreamSynchronize(stream));
int number_on_cpu = 0; int number_on_cpu = 0;
cudaMemcpy(&number_on_cpu, CUDA_CHECK(cudaMemcpy(&number_on_cpu,
thrust::raw_pointer_cast(t_index.data()), thrust::raw_pointer_cast(t_index.data()),
sizeof(int), sizeof(int),
cudaMemcpyDeviceToHost); cudaMemcpyDeviceToHost));
if (number_on_cpu > 0) { if (number_on_cpu > 0) {
int64_t* cpu_keys = new int64_t[number_on_cpu]; uint64_t* cpu_keys = new uint64_t[number_on_cpu];
cudaMemcpy(cpu_keys, CUDA_CHECK(cudaMemcpy(cpu_keys,
thrust::raw_pointer_cast(t_cpu_keys.data()), thrust::raw_pointer_cast(t_cpu_keys.data()),
number_on_cpu * sizeof(int64_t), number_on_cpu * sizeof(uint64_t),
cudaMemcpyDeviceToHost); cudaMemcpyDeviceToHost));
std::vector<std::shared_ptr<char>> buffers(number_on_cpu); std::vector<std::shared_ptr<char>> buffers(number_on_cpu);
std::vector<int> ac(number_on_cpu); std::vector<int> ac(number_on_cpu);
auto status = cpu_graph_table->random_sample_neighbors( auto status = cpu_graph_table_->random_sample_neighbors(
0, cpu_keys, sample_size, buffers, ac, false); idx, cpu_keys, sample_size, buffers, ac, false);
int total_cpu_sample_size = std::accumulate(ac.begin(), ac.end(), 0); int total_cpu_sample_size = std::accumulate(ac.begin(), ac.end(), 0);
total_cpu_sample_size /= sizeof(int64_t); total_cpu_sample_size /= sizeof(uint64_t);
// Merge buffers into one int64_t vector. // Merge buffers into one uint64_t vector.
int64_t* merge_buffers = new int64_t[total_cpu_sample_size]; uint64_t* merge_buffers = new uint64_t[total_cpu_sample_size];
int start = 0; int start = 0;
for (int j = 0; j < number_on_cpu; j++) { for (int j = 0; j < number_on_cpu; j++) {
memcpy(merge_buffers + start, (int64_t*)(buffers[j].get()), ac[j]); memcpy(merge_buffers + start, (uint64_t*)(buffers[j].get()), ac[j]);
start += ac[j] / sizeof(int64_t); start += ac[j] / sizeof(uint64_t);
} }
// Copy merge_buffers to gpu. // Copy merge_buffers to gpu.
thrust::device_vector<int64_t> gpu_buffers(total_cpu_sample_size); thrust::device_vector<uint64_t> gpu_buffers(total_cpu_sample_size);
thrust::device_vector<int> gpu_ac(number_on_cpu); thrust::device_vector<int> gpu_ac(number_on_cpu);
int64_t* gpu_buffers_ptr = thrust::raw_pointer_cast(gpu_buffers.data()); uint64_t* gpu_buffers_ptr = thrust::raw_pointer_cast(gpu_buffers.data());
int* gpu_ac_ptr = thrust::raw_pointer_cast(gpu_ac.data()); int* gpu_ac_ptr = thrust::raw_pointer_cast(gpu_ac.data());
cudaMemcpyAsync(gpu_buffers_ptr, CUDA_CHECK(cudaMemcpyAsync(gpu_buffers_ptr,
merge_buffers, merge_buffers,
total_cpu_sample_size * sizeof(int64_t), total_cpu_sample_size * sizeof(uint64_t),
cudaMemcpyHostToDevice, cudaMemcpyHostToDevice,
stream); stream));
cudaMemcpyAsync(gpu_ac_ptr, CUDA_CHECK(cudaMemcpyAsync(gpu_ac_ptr,
ac.data(), ac.data(),
number_on_cpu * sizeof(int), number_on_cpu * sizeof(int),
cudaMemcpyHostToDevice, cudaMemcpyHostToDevice,
stream); stream));
// Copy gpu_buffers and gpu_ac using kernel. // Copy gpu_buffers and gpu_ac using kernel.
// Kernel divide for gpu_ac_ptr. // Kernel divide for gpu_ac_ptr.
...@@ -1020,7 +880,7 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2( ...@@ -1020,7 +880,7 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2(
get_actual_gpu_ac<<<grid_size2, block_size_, 0, stream>>>(gpu_ac_ptr, get_actual_gpu_ac<<<grid_size2, block_size_, 0, stream>>>(gpu_ac_ptr,
number_on_cpu); number_on_cpu);
cudaStreamSynchronize(stream); CUDA_CHECK(cudaStreamSynchronize(stream));
thrust::device_vector<int> cumsum_gpu_ac(number_on_cpu); thrust::device_vector<int> cumsum_gpu_ac(number_on_cpu);
thrust::exclusive_scan( thrust::exclusive_scan(
...@@ -1048,7 +908,7 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2( ...@@ -1048,7 +908,7 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2(
} }
{ {
cudaStreamSynchronize(stream); CUDA_CHECK(cudaStreamSynchronize(stream));
platform::CUDAPlace place = platform::CUDAPlace(resource_->dev_id(gpu_id)); platform::CUDAPlace place = platform::CUDAPlace(resource_->dev_id(gpu_id));
platform::CUDADeviceGuard guard(resource_->dev_id(gpu_id)); platform::CUDADeviceGuard guard(resource_->dev_id(gpu_id));
...@@ -1060,11 +920,10 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2( ...@@ -1060,11 +920,10 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2(
t_actual_sample_size.end()); t_actual_sample_size.end());
result.actual_val_mem = result.actual_val_mem =
memory::AllocShared(place, total_sample_size * sizeof(int64_t)); memory::AllocShared(place, total_sample_size * sizeof(uint64_t));
result.actual_val = (int64_t*)(result.actual_val_mem)->ptr(); result.actual_val = (uint64_t*)(result.actual_val_mem)->ptr();
result.set_total_sample_size(total_sample_size); result.set_total_sample_size(total_sample_size);
thrust::device_vector<int> cumsum_actual_sample_size(len); thrust::device_vector<int> cumsum_actual_sample_size(len);
thrust::exclusive_scan(t_actual_sample_size.begin(), thrust::exclusive_scan(t_actual_sample_size.begin(),
t_actual_sample_size.end(), t_actual_sample_size.end(),
...@@ -1085,7 +944,6 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2( ...@@ -1085,7 +944,6 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2(
} }
destroy_storage(gpu_id, i); destroy_storage(gpu_id, i);
} }
cudaStreamSynchronize(stream); cudaStreamSynchronize(stream);
return result; return result;
} }
...@@ -1096,32 +954,13 @@ NodeQueryResult GpuPsGraphTable::graph_node_sample(int gpu_id, ...@@ -1096,32 +954,13 @@ NodeQueryResult GpuPsGraphTable::graph_node_sample(int gpu_id,
} }
NodeQueryResult GpuPsGraphTable::query_node_list(int gpu_id, NodeQueryResult GpuPsGraphTable::query_node_list(int gpu_id,
int idx,
int start, int start,
int query_size) { int query_size) {
NodeQueryResult result; NodeQueryResult result;
result.actual_sample_size = 0;
if (query_size <= 0) return result; if (query_size <= 0) return result;
int& actual_size = result.actual_sample_size; std::vector<int> gpu_begin_pos, local_begin_pos;
actual_size = 0;
// int dev_id = resource_->dev_id(gpu_id);
// platform::CUDADeviceGuard guard(dev_id);
std::vector<int> idx, gpu_begin_pos, local_begin_pos;
int sample_size;
/*
if idx[i] = a, gpu_begin_pos[i] = p1,
gpu_local_begin_pos[i] = p2;
sample_size[i] = s;
then on gpu a, the nodes of positions [p1,p1 + s) should be returned
and saved from the p2 position on the sample_result array
for example:
suppose
gpu 0 saves [0,2,4,6,8], gpu1 saves [1,3,5,7]
start = 3, query_size = 5
we know [6,8,1,3,5] should be returned;
idx = [0,1]
gpu_begin_pos = [3,0]
local_begin_pos = [0,3]
sample_size = [2,3]
*/
std::function<int(int, int, int, int, int&, int&)> range_check = std::function<int(int, int, int, int, int&, int&)> range_check =
[](int x, int y, int x1, int y1, int& x2, int& y2) { [](int x, int y, int x1, int y1, int& x2, int& y2) {
if (y <= x1 || x >= y1) return 0; if (y <= x1 || x >= y1) return 0;
...@@ -1129,7 +968,9 @@ NodeQueryResult GpuPsGraphTable::query_node_list(int gpu_id, ...@@ -1129,7 +968,9 @@ NodeQueryResult GpuPsGraphTable::query_node_list(int gpu_id,
x2 = max(x1, x); x2 = max(x1, x);
return y2 - x2; return y2 - x2;
}; };
auto graph = gpu_graph_list[gpu_id];
int offset = gpu_id * graph_table_num_ + idx;
const auto& graph = gpu_graph_list_[offset];
if (graph.node_size == 0) { if (graph.node_size == 0) {
return result; return result;
} }
...@@ -1139,69 +980,159 @@ NodeQueryResult GpuPsGraphTable::query_node_list(int gpu_id, ...@@ -1139,69 +980,159 @@ NodeQueryResult GpuPsGraphTable::query_node_list(int gpu_id,
if (len == 0) { if (len == 0) {
return result; return result;
} }
int64_t* val;
sample_size = len;
result.initialize(len, resource_->dev_id(gpu_id)); result.initialize(len, resource_->dev_id(gpu_id));
actual_size = len; result.actual_sample_size = len;
val = result.val; uint64_t* val = result.val;
int dev_id_i = resource_->dev_id(gpu_id); int dev_id_i = resource_->dev_id(gpu_id);
platform::CUDADeviceGuard guard(dev_id_i); platform::CUDADeviceGuard guard(dev_id_i);
// platform::CUDADeviceGuard guard(i);
int grid_size = (len - 1) / block_size_ + 1; int grid_size = (len - 1) / block_size_ + 1;
node_query_example<<<grid_size, node_query_example<<<grid_size,
block_size_, block_size_,
0, 0,
resource_->remote_stream(gpu_id, gpu_id)>>>( resource_->remote_stream(gpu_id, gpu_id)>>>(
gpu_graph_list[gpu_id], x2, len, (int64_t*)val); graph, x2, len, (uint64_t*)val);
cudaStreamSynchronize(resource_->remote_stream(gpu_id, gpu_id)); CUDA_CHECK(cudaStreamSynchronize(resource_->remote_stream(gpu_id, gpu_id)));
return result; return result;
/* }
for (int i = 0; i < gpu_graph_list.size() && query_size != 0; i++) {
auto graph = gpu_graph_list[i]; int GpuPsGraphTable::get_feature_of_nodes(int gpu_id,
if (graph.node_size == 0) { uint64_t* d_nodes,
uint64_t* d_feature,
int node_num,
int slot_num) {
if (node_num == 0) {
return -1;
}
platform::CUDAPlace place = platform::CUDAPlace(resource_->dev_id(gpu_id));
platform::CUDADeviceGuard guard(resource_->dev_id(gpu_id));
int total_gpu = resource_->total_device();
auto stream = resource_->local_stream(gpu_id, 0);
auto d_left = memory::Alloc(place, total_gpu * sizeof(int));
auto d_right = memory::Alloc(place, total_gpu * sizeof(int));
int* d_left_ptr = reinterpret_cast<int*>(d_left->ptr());
int* d_right_ptr = reinterpret_cast<int*>(d_right->ptr());
CUDA_CHECK(cudaMemsetAsync(d_left_ptr, -1, total_gpu * sizeof(int), stream));
CUDA_CHECK(cudaMemsetAsync(d_right_ptr, -1, total_gpu * sizeof(int), stream));
//
auto d_idx = memory::Alloc(place, node_num * sizeof(int));
int* d_idx_ptr = reinterpret_cast<int*>(d_idx->ptr());
auto d_shard_keys = memory::Alloc(place, node_num * sizeof(uint64_t));
uint64_t* d_shard_keys_ptr = reinterpret_cast<uint64_t*>(d_shard_keys->ptr());
auto d_shard_vals =
memory::Alloc(place, slot_num * node_num * sizeof(uint64_t));
uint64_t* d_shard_vals_ptr = reinterpret_cast<uint64_t*>(d_shard_vals->ptr());
auto d_shard_actual_size = memory::Alloc(place, node_num * sizeof(int));
int* d_shard_actual_size_ptr =
reinterpret_cast<int*>(d_shard_actual_size->ptr());
split_input_to_shard(
d_nodes, d_idx_ptr, node_num, d_left_ptr, d_right_ptr, gpu_id);
heter_comm_kernel_->fill_shard_key(
d_shard_keys_ptr, d_nodes, d_idx_ptr, node_num, stream);
CUDA_CHECK(cudaStreamSynchronize(stream));
int h_left[total_gpu]; // NOLINT
CUDA_CHECK(cudaMemcpy(
h_left, d_left_ptr, total_gpu * sizeof(int), cudaMemcpyDeviceToHost));
int h_right[total_gpu]; // NOLINT
CUDA_CHECK(cudaMemcpy(
h_right, d_right_ptr, total_gpu * sizeof(int), cudaMemcpyDeviceToHost));
for (int i = 0; i < total_gpu; ++i) {
int shard_len = h_left[i] == -1 ? 0 : h_right[i] - h_left[i] + 1;
if (shard_len == 0) {
continue; continue;
} }
int x2, y2; create_storage(gpu_id,
int len = range_check(start, start + query_size, size, i,
size + graph.node_size, x2, y2); shard_len * sizeof(uint64_t),
if (len > 0) { shard_len * slot_num * sizeof(uint64_t) +
idx.push_back(i); shard_len * sizeof(uint64_t) +
gpu_begin_pos.emplace_back(x2 - size); sizeof(int) * (shard_len + shard_len % 2));
local_begin_pos.emplace_back(actual_size);
sample_size.push_back(len);
actual_size += len;
create_storage(gpu_id, i, 1, len * sizeof(int64_t));
}
size += graph.node_size;
}
for (int i = 0; i < idx.size(); i++) {
int dev_id_i = resource_->dev_id(idx[i]);
platform::CUDADeviceGuard guard(dev_id_i);
// platform::CUDADeviceGuard guard(i);
auto& node = path_[gpu_id][idx[i]].nodes_.front();
int grid_size = (sample_size[i] - 1) / block_size_ + 1;
node_query_example<<<grid_size, block_size_, 0,
resource_->remote_stream(idx[i], gpu_id)>>>(
gpu_graph_list[idx[i]], gpu_begin_pos[i], sample_size[i],
(int64_t*)node.val_storage);
} }
for (int i = 0; i < idx.size(); i++) { walk_to_dest(
cudaStreamSynchronize(resource_->remote_stream(idx[i], gpu_id)); gpu_id, total_gpu, h_left, h_right, (uint64_t*)(d_shard_keys_ptr), NULL);
auto& node = path_[gpu_id][idx[i]].nodes_.front();
cudaMemcpyAsync(reinterpret_cast<char*>(val + local_begin_pos[i]), for (int i = 0; i < total_gpu; ++i) {
node.val_storage, node.val_bytes_len, cudaMemcpyDefault, if (h_left[i] == -1) {
node.out_stream); continue;
}
int shard_len = h_left[i] == -1 ? 0 : h_right[i] - h_left[i] + 1;
auto& node = path_[gpu_id][i].nodes_.back();
CUDA_CHECK(cudaMemsetAsync(
node.val_storage, 0, shard_len * sizeof(uint64_t), node.in_stream));
CUDA_CHECK(cudaStreamSynchronize(node.in_stream));
platform::CUDADeviceGuard guard(resource_->dev_id(i));
// If not found, val is -1.
int table_offset = get_table_offset(i, GraphTableType::FEATURE_TABLE, 0);
tables_[table_offset]->get(reinterpret_cast<uint64_t*>(node.key_storage),
reinterpret_cast<uint64_t*>(node.val_storage),
(size_t)(h_right[i] - h_left[i] + 1),
resource_->remote_stream(i, gpu_id));
int offset = i * feature_table_num_;
auto graph = gpu_graph_fea_list_[offset];
GpuPsFeaInfo* val_array = reinterpret_cast<GpuPsFeaInfo*>(node.val_storage);
int* actual_size_array = (int*)(val_array + shard_len);
uint64_t* feature_array =
(uint64_t*)(actual_size_array + shard_len + shard_len % 2);
dim3 grid((shard_len - 1) / dim_y + 1);
dim3 block(1, dim_y);
get_features_kernel<<<grid,
block,
0,
resource_->remote_stream(i, gpu_id)>>>(
graph,
val_array,
actual_size_array,
feature_array,
slot_num,
shard_len);
}
for (int i = 0; i < total_gpu; ++i) {
if (h_left[i] == -1) {
continue;
} }
for (int i = 0; i < idx.size(); i++) { CUDA_CHECK(cudaStreamSynchronize(resource_->remote_stream(i, gpu_id)));
auto& node = path_[gpu_id][idx[i]].nodes_.front();
cudaStreamSynchronize(node.out_stream);
} }
for (auto x : idx) {
destroy_storage(gpu_id, x); move_result_to_source_gpu(gpu_id,
total_gpu,
slot_num,
h_left,
h_right,
d_shard_vals_ptr,
d_shard_actual_size_ptr);
int grid_size = (node_num - 1) / block_size_ + 1;
fill_dvalues<<<grid_size, block_size_, 0, stream>>>(d_shard_vals_ptr,
d_feature,
d_shard_actual_size_ptr,
d_idx_ptr,
slot_num,
node_num);
for (int i = 0; i < total_gpu; ++i) {
int shard_len = h_left[i] == -1 ? 0 : h_right[i] - h_left[i] + 1;
if (shard_len == 0) {
continue;
} }
return result; destroy_storage(gpu_id, i);
*/ }
CUDA_CHECK(cudaStreamSynchronize(stream));
return 0;
} }
} // namespace framework } // namespace framework
}; // namespace paddle }; // namespace paddle
......
...@@ -13,6 +13,8 @@ ...@@ -13,6 +13,8 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h" #include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h"
#include <sstream>
#include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_utils.h"
#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h" #include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h"
#include "paddle/fluid/framework/fleet/heter_ps/heter_resource.h" #include "paddle/fluid/framework/fleet/heter_ps/heter_resource.h"
namespace paddle { namespace paddle {
...@@ -25,12 +27,46 @@ void GraphGpuWrapper::set_device(std::vector<int> ids) { ...@@ -25,12 +27,46 @@ void GraphGpuWrapper::set_device(std::vector<int> ids) {
device_id_mapping.push_back(device_id); device_id_mapping.push_back(device_id);
} }
} }
std::vector<std::vector<int64_t>> GraphGpuWrapper::get_all_id(int type,
int GraphGpuWrapper::get_all_id(int type,
int slice_num,
std::vector<std::vector<uint64_t>> *output) {
return ((GpuPsGraphTable *)graph_table)
->cpu_graph_table_->get_all_id(type, slice_num, output);
}
int GraphGpuWrapper::get_all_neighbor_id(
int type, int slice_num, std::vector<std::vector<uint64_t>> *output) {
return ((GpuPsGraphTable *)graph_table)
->cpu_graph_table_->get_all_neighbor_id(type, slice_num, output);
}
int GraphGpuWrapper::get_all_id(int type,
int idx,
int slice_num,
std::vector<std::vector<uint64_t>> *output) {
return ((GpuPsGraphTable *)graph_table)
->cpu_graph_table_->get_all_id(type, idx, slice_num, output);
}
int GraphGpuWrapper::get_all_neighbor_id(
int type,
int idx,
int slice_num,
std::vector<std::vector<uint64_t>> *output) {
return ((GpuPsGraphTable *)graph_table)
->cpu_graph_table_->get_all_neighbor_id(type, idx, slice_num, output);
}
int GraphGpuWrapper::get_all_feature_ids(
int type,
int idx, int idx,
int slice_num) { int slice_num,
std::vector<std::vector<uint64_t>> *output) {
return ((GpuPsGraphTable *)graph_table) return ((GpuPsGraphTable *)graph_table)
->cpu_graph_table->get_all_id(type, idx, slice_num); ->cpu_graph_table_->get_all_feature_ids(type, idx, slice_num, output);
} }
void GraphGpuWrapper::set_up_types(std::vector<std::string> &edge_types, void GraphGpuWrapper::set_up_types(std::vector<std::string> &edge_types,
std::vector<std::string> &node_types) { std::vector<std::string> &node_types) {
id_to_edge = edge_types; id_to_edge = edge_types;
...@@ -49,32 +85,40 @@ void GraphGpuWrapper::set_up_types(std::vector<std::string> &edge_types, ...@@ -49,32 +85,40 @@ void GraphGpuWrapper::set_up_types(std::vector<std::string> &edge_types,
this->table_feat_conf_feat_shape.resize(node_types.size()); this->table_feat_conf_feat_shape.resize(node_types.size());
} }
void GraphGpuWrapper::set_feature_separator(std::string ch) {
feature_separator_ = ch;
if (graph_table != nullptr) {
((GpuPsGraphTable *)graph_table)
->cpu_graph_table_->set_feature_separator(feature_separator_);
}
}
void GraphGpuWrapper::make_partitions(int idx, void GraphGpuWrapper::make_partitions(int idx,
int64_t byte_size, int64_t byte_size,
int device_len) { int device_len) {
((GpuPsGraphTable *)graph_table) ((GpuPsGraphTable *)graph_table)
->cpu_graph_table->make_partitions(idx, byte_size, device_len); ->cpu_graph_table_->make_partitions(idx, byte_size, device_len);
} }
int32_t GraphGpuWrapper::load_next_partition(int idx) { int32_t GraphGpuWrapper::load_next_partition(int idx) {
return ((GpuPsGraphTable *)graph_table) return ((GpuPsGraphTable *)graph_table)
->cpu_graph_table->load_next_partition(idx); ->cpu_graph_table_->load_next_partition(idx);
} }
void GraphGpuWrapper::set_search_level(int level) { void GraphGpuWrapper::set_search_level(int level) {
((GpuPsGraphTable *)graph_table)->cpu_graph_table->set_search_level(level); ((GpuPsGraphTable *)graph_table)->cpu_graph_table_->set_search_level(level);
} }
std::vector<int64_t> GraphGpuWrapper::get_partition(int idx, int num) { std::vector<uint64_t> GraphGpuWrapper::get_partition(int idx, int num) {
return ((GpuPsGraphTable *)graph_table) return ((GpuPsGraphTable *)graph_table)
->cpu_graph_table->get_partition(idx, num); ->cpu_graph_table_->get_partition(idx, num);
} }
int32_t GraphGpuWrapper::get_partition_num(int idx) { int32_t GraphGpuWrapper::get_partition_num(int idx) {
return ((GpuPsGraphTable *)graph_table) return ((GpuPsGraphTable *)graph_table)
->cpu_graph_table->get_partition_num(idx); ->cpu_graph_table_->get_partition_num(idx);
} }
void GraphGpuWrapper::make_complementary_graph(int idx, int64_t byte_size) { void GraphGpuWrapper::make_complementary_graph(int idx, int64_t byte_size) {
((GpuPsGraphTable *)graph_table) ((GpuPsGraphTable *)graph_table)
->cpu_graph_table->make_complementary_graph(idx, byte_size); ->cpu_graph_table_->make_complementary_graph(idx, byte_size);
} }
void GraphGpuWrapper::load_edge_file(std::string name, void GraphGpuWrapper::load_edge_file(std::string name,
std::string filepath, std::string filepath,
...@@ -90,7 +134,7 @@ void GraphGpuWrapper::load_edge_file(std::string name, ...@@ -90,7 +134,7 @@ void GraphGpuWrapper::load_edge_file(std::string name,
} }
if (edge_to_id.find(name) != edge_to_id.end()) { if (edge_to_id.find(name) != edge_to_id.end()) {
((GpuPsGraphTable *)graph_table) ((GpuPsGraphTable *)graph_table)
->cpu_graph_table->Load(std::string(filepath), params); ->cpu_graph_table_->Load(std::string(filepath), params);
} }
} }
...@@ -101,10 +145,21 @@ void GraphGpuWrapper::load_node_file(std::string name, std::string filepath) { ...@@ -101,10 +145,21 @@ void GraphGpuWrapper::load_node_file(std::string name, std::string filepath) {
if (feature_to_id.find(name) != feature_to_id.end()) { if (feature_to_id.find(name) != feature_to_id.end()) {
((GpuPsGraphTable *)graph_table) ((GpuPsGraphTable *)graph_table)
->cpu_graph_table->Load(std::string(filepath), params); ->cpu_graph_table_->Load(std::string(filepath), params);
} }
} }
void GraphGpuWrapper::load_node_and_edge(std::string etype,
std::string ntype,
std::string epath,
std::string npath,
int part_num,
bool reverse) {
((GpuPsGraphTable *)graph_table)
->cpu_graph_table_->load_node_and_edge_file(
etype, ntype, epath, npath, part_num, reverse);
}
void GraphGpuWrapper::add_table_feat_conf(std::string table_name, void GraphGpuWrapper::add_table_feat_conf(std::string table_name,
std::string feat_name, std::string feat_name,
std::string feat_dtype, std::string feat_dtype,
...@@ -137,8 +192,10 @@ void GraphGpuWrapper::init_search_level(int level) { search_level = level; } ...@@ -137,8 +192,10 @@ void GraphGpuWrapper::init_search_level(int level) { search_level = level; }
void GraphGpuWrapper::init_service() { void GraphGpuWrapper::init_service() {
table_proto.set_task_pool_size(24); table_proto.set_task_pool_size(24);
table_proto.set_shard_num(1000);
table_proto.set_build_sampler_on_cpu(false);
table_proto.set_search_level(search_level); table_proto.set_search_level(search_level);
table_proto.set_table_name("cpu_graph_table"); table_proto.set_table_name("cpu_graph_table_");
table_proto.set_use_cache(false); table_proto.set_use_cache(false);
for (int i = 0; i < id_to_edge.size(); i++) for (int i = 0; i < id_to_edge.size(); i++)
table_proto.add_edge_types(id_to_edge[i]); table_proto.add_edge_types(id_to_edge[i]);
...@@ -155,76 +212,122 @@ void GraphGpuWrapper::init_service() { ...@@ -155,76 +212,122 @@ void GraphGpuWrapper::init_service() {
std::shared_ptr<HeterPsResource> resource = std::shared_ptr<HeterPsResource> resource =
std::make_shared<HeterPsResource>(device_id_mapping); std::make_shared<HeterPsResource>(device_id_mapping);
resource->enable_p2p(); resource->enable_p2p();
GpuPsGraphTable *g = new GpuPsGraphTable(resource, 1); GpuPsGraphTable *g = new GpuPsGraphTable(resource, 1, id_to_edge.size());
g->init_cpu_table(table_proto); g->init_cpu_table(table_proto);
g->cpu_graph_table_->set_feature_separator(feature_separator_);
graph_table = (char *)g; graph_table = (char *)g;
upload_task_pool.reset(new ::ThreadPool(upload_num));
}
void GraphGpuWrapper::finalize() {
((GpuPsGraphTable *)graph_table)->show_table_collisions();
} }
void GraphGpuWrapper::upload_batch(int idx, void GraphGpuWrapper::upload_batch(int type,
std::vector<std::vector<int64_t>> &ids) { int idx,
int slice_num,
const std::string &edge_type) {
VLOG(0) << "begin upload edge, type[" << edge_type << "]";
std::vector<std::vector<uint64_t>> ids;
((GpuPsGraphTable *)graph_table)
->cpu_graph_table_->get_all_id(type, idx, slice_num, &ids);
debug_gpu_memory_info("upload_batch node start");
GpuPsGraphTable *g = (GpuPsGraphTable *)graph_table; GpuPsGraphTable *g = (GpuPsGraphTable *)graph_table;
// std::vector<paddle::framework::GpuPsCommGraph> vec; std::vector<std::future<int>> tasks;
for (int i = 0; i < ids.size(); i++) { for (int i = 0; i < ids.size(); i++) {
// vec.push_back(g->cpu_graph_table->make_gpu_ps_graph(idx, ids[i])); tasks.push_back(upload_task_pool->enqueue([&, i, idx, this]() -> int {
VLOG(0) << "begin make_gpu_ps_graph, node_id[" << i << "]_size["
<< ids[i].size() << "]";
GpuPsCommGraph sub_graph = GpuPsCommGraph sub_graph =
g->cpu_graph_table->make_gpu_ps_graph(idx, ids[i]); g->cpu_graph_table_->make_gpu_ps_graph(idx, ids[i]);
g->build_graph_on_single_gpu(sub_graph, i); g->build_graph_on_single_gpu(sub_graph, i, idx);
sub_graph.release_on_cpu(); sub_graph.release_on_cpu();
VLOG(0) << "sub graph on gpu " << i << " is built"; VLOG(0) << "sub graph on gpu " << i << " is built";
return 0;
}));
} }
for (size_t i = 0; i < tasks.size(); i++) tasks[i].get();
debug_gpu_memory_info("upload_batch node end");
}
// feature table
void GraphGpuWrapper::upload_batch(int type, int slice_num, int slot_num) {
std::vector<std::vector<uint64_t>> node_ids;
((GpuPsGraphTable *)graph_table)
->cpu_graph_table_->get_all_id(type, slice_num, &node_ids);
debug_gpu_memory_info("upload_batch feature start");
GpuPsGraphTable *g = (GpuPsGraphTable *)graph_table;
std::vector<std::future<int>> tasks;
for (int i = 0; i < node_ids.size(); i++) {
tasks.push_back(upload_task_pool->enqueue([&, i, this]() -> int {
VLOG(0) << "begin make_gpu_ps_graph_fea, node_ids[" << i << "]_size["
<< node_ids[i].size() << "]";
GpuPsCommGraphFea sub_graph =
g->cpu_graph_table_->make_gpu_ps_graph_fea(node_ids[i], slot_num);
// sub_graph.display_on_cpu();
VLOG(0) << "begin build_graph_fea_on_single_gpu, node_ids[" << i
<< "]_size[" << node_ids[i].size() << "]";
g->build_graph_fea_on_single_gpu(sub_graph, i);
sub_graph.release_on_cpu();
VLOG(0) << "sub graph fea on gpu " << i << " is built";
return 0;
}));
}
for (size_t i = 0; i < tasks.size(); i++) tasks[i].get();
// g->build_graph_from_cpu(vec); // g->build_graph_from_cpu(vec);
debug_gpu_memory_info("upload_batch feature end");
} }
// void GraphGpuWrapper::test() {
// int64_t cpu_key[3] = {0, 1, 2};
// void *key;
// platform::CUDADeviceGuard guard(0);
// cudaMalloc((void **)&key, 3 * sizeof(int64_t));
// cudaMemcpy(key, cpu_key, 3 * sizeof(int64_t), cudaMemcpyHostToDevice);
// auto neighbor_sample_res =
// ((GpuPsGraphTable *)graph_table)
// ->graph_neighbor_sample(0, (int64_t *)key, 2, 3);
// int64_t *res = new int64_t[7];
// cudaMemcpy(res, neighbor_sample_res.val, 3 * 2 * sizeof(int64_t),
// cudaMemcpyDeviceToHost);
// int *actual_sample_size = new int[3];
// cudaMemcpy(actual_sample_size, neighbor_sample_res.actual_sample_size,
// 3 * sizeof(int),
// cudaMemcpyDeviceToHost); // 3, 1, 3
// //{0,9} or {9,0} is expected for key 0
// //{0,2} or {2,0} is expected for key 1
// //{1,3} or {3,1} is expected for key 2
// for (int i = 0; i < 3; i++) {
// VLOG(0) << "actual sample size for " << i << " is "
// << actual_sample_size[i];
// for (int j = 0; j < actual_sample_size[i]; j++) {
// VLOG(0) << "sampled an neighbor for node" << i << " : " << res[i * 2 +
// j];
// }
// }
// }
NeighborSampleResult GraphGpuWrapper::graph_neighbor_sample_v3( NeighborSampleResult GraphGpuWrapper::graph_neighbor_sample_v3(
NeighborSampleQuery q, bool cpu_switch) { NeighborSampleQuery q, bool cpu_switch) {
return ((GpuPsGraphTable *)graph_table) return ((GpuPsGraphTable *)graph_table)
->graph_neighbor_sample_v3(q, cpu_switch); ->graph_neighbor_sample_v3(q, cpu_switch);
} }
int GraphGpuWrapper::get_feature_of_nodes(int gpu_id,
uint64_t *d_walk,
uint64_t *d_offset,
uint32_t size,
int slot_num) {
platform::CUDADeviceGuard guard(gpu_id);
PADDLE_ENFORCE_NOT_NULL(graph_table,
paddle::platform::errors::InvalidArgument(
"graph_table should not be null"));
return ((GpuPsGraphTable *)graph_table)
->get_feature_of_nodes(gpu_id, d_walk, d_offset, size, slot_num);
}
// Device-pointer overload: constructs a CUDADeviceGuard for `gpu_id`, then
// forwards (gpu_id, device_keys, walk_degree, len) to
// GpuPsGraphTable::graph_neighbor_sample and returns the result it produced.
NeighborSampleResult GraphGpuWrapper::graph_neighbor_sample(
    int gpu_id, uint64_t *device_keys, int walk_degree, int len) {
  platform::CUDADeviceGuard guard(gpu_id);
  // graph_table is stored type-erased; recover the concrete table first.
  auto *gpu_table = static_cast<GpuPsGraphTable *>(graph_table);
  auto sampled =
      gpu_table->graph_neighbor_sample(gpu_id, device_keys, walk_degree, len);
  return sampled;
}
// this function is contributed by Liwb5 // this function is contributed by Liwb5
std::vector<int64_t> GraphGpuWrapper::graph_neighbor_sample( std::vector<uint64_t> GraphGpuWrapper::graph_neighbor_sample(
int gpu_id, std::vector<int64_t> &key, int sample_size) { int gpu_id, int idx, std::vector<uint64_t> &key, int sample_size) {
int64_t *cuda_key; std::vector<uint64_t> res;
if (key.size() == 0) {
return res;
}
uint64_t *cuda_key;
platform::CUDADeviceGuard guard(gpu_id); platform::CUDADeviceGuard guard(gpu_id);
cudaMalloc(&cuda_key, key.size() * sizeof(int64_t)); cudaMalloc(&cuda_key, key.size() * sizeof(uint64_t));
cudaMemcpy(cuda_key, cudaMemcpy(cuda_key,
key.data(), key.data(),
key.size() * sizeof(int64_t), key.size() * sizeof(uint64_t),
cudaMemcpyHostToDevice); cudaMemcpyHostToDevice);
VLOG(0) << "key_size: " << key.size();
auto neighbor_sample_res = auto neighbor_sample_res =
((GpuPsGraphTable *)graph_table) ((GpuPsGraphTable *)graph_table)
->graph_neighbor_sample(gpu_id, cuda_key, sample_size, key.size()); ->graph_neighbor_sample_v2(
gpu_id, idx, cuda_key, sample_size, key.size(), false);
int *actual_sample_size = new int[key.size()]; int *actual_sample_size = new int[key.size()];
cudaMemcpy(actual_sample_size, cudaMemcpy(actual_sample_size,
neighbor_sample_res.actual_sample_size, neighbor_sample_res.actual_sample_size,
...@@ -235,12 +338,12 @@ std::vector<int64_t> GraphGpuWrapper::graph_neighbor_sample( ...@@ -235,12 +338,12 @@ std::vector<int64_t> GraphGpuWrapper::graph_neighbor_sample(
cumsum += actual_sample_size[i]; cumsum += actual_sample_size[i];
} }
std::vector<int64_t> cpu_key, res; std::vector<uint64_t> cpu_key;
cpu_key.resize(key.size() * sample_size); cpu_key.resize(key.size() * sample_size);
cudaMemcpy(cpu_key.data(), cudaMemcpy(cpu_key.data(),
neighbor_sample_res.val, neighbor_sample_res.val,
key.size() * sample_size * sizeof(int64_t), key.size() * sample_size * sizeof(uint64_t),
cudaMemcpyDeviceToHost); cudaMemcpyDeviceToHost);
for (int i = 0; i < key.size(); i++) { for (int i = 0; i < key.size(); i++) {
for (int j = 0; j < actual_sample_size[i]; j++) { for (int j = 0; j < actual_sample_size[i]; j++) {
...@@ -256,27 +359,26 @@ std::vector<int64_t> GraphGpuWrapper::graph_neighbor_sample( ...@@ -256,27 +359,26 @@ std::vector<int64_t> GraphGpuWrapper::graph_neighbor_sample(
return res; return res;
} }
void GraphGpuWrapper::init_sample_status() {
((GpuPsGraphTable *)graph_table)->init_sample_status();
}
void GraphGpuWrapper::free_sample_status() {
((GpuPsGraphTable *)graph_table)->free_sample_status();
}
NodeQueryResult GraphGpuWrapper::query_node_list(int gpu_id, NodeQueryResult GraphGpuWrapper::query_node_list(int gpu_id,
int idx,
int start, int start,
int query_size) { int query_size) {
PADDLE_ENFORCE_EQ(FLAGS_gpugraph_load_node_list_into_hbm,
true,
paddle::platform::errors::PreconditionNotMet(
"when use query_node_list should set "
"gpugraph_load_node_list_into_hbm true"));
return ((GpuPsGraphTable *)graph_table) return ((GpuPsGraphTable *)graph_table)
->query_node_list(gpu_id, start, query_size); ->query_node_list(gpu_id, idx, start, query_size);
} }
void GraphGpuWrapper::load_node_weight(int type_id, int idx, std::string path) { void GraphGpuWrapper::load_node_weight(int type_id, int idx, std::string path) {
return ((GpuPsGraphTable *)graph_table) return ((GpuPsGraphTable *)graph_table)
->cpu_graph_table->load_node_weight(type_id, idx, path); ->cpu_graph_table_->load_node_weight(type_id, idx, path);
} }
void GraphGpuWrapper::export_partition_files(int idx, std::string file_path) { void GraphGpuWrapper::export_partition_files(int idx, std::string file_path) {
return ((GpuPsGraphTable *)graph_table) return ((GpuPsGraphTable *)graph_table)
->cpu_graph_table->export_partition_files(idx, file_path); ->cpu_graph_table_->export_partition_files(idx, file_path);
} }
#endif #endif
} // namespace framework } // namespace framework
......
...@@ -32,39 +32,76 @@ class GraphGpuWrapper { ...@@ -32,39 +32,76 @@ class GraphGpuWrapper {
} }
static std::shared_ptr<GraphGpuWrapper> s_instance_; static std::shared_ptr<GraphGpuWrapper> s_instance_;
void initialize(); void initialize();
void test(); void finalize();
void set_device(std::vector<int> ids); void set_device(std::vector<int> ids);
void init_service(); void init_service();
void set_up_types(std::vector<std::string>& edge_type, void set_up_types(std::vector<std::string>& edge_type,
std::vector<std::string>& node_type); std::vector<std::string>& node_type);
void upload_batch(int idx, std::vector<std::vector<int64_t>>& ids); void upload_batch(int type,
int idx,
int slice_num,
const std::string& edge_type);
void upload_batch(int type, int slice_num, int slot_num);
void add_table_feat_conf(std::string table_name, void add_table_feat_conf(std::string table_name,
std::string feat_name, std::string feat_name,
std::string feat_dtype, std::string feat_dtype,
int feat_shape); int feat_shape);
void load_edge_file(std::string name, std::string filepath, bool reverse); void load_edge_file(std::string name, std::string filepath, bool reverse);
void load_node_file(std::string name, std::string filepath); void load_node_file(std::string name, std::string filepath);
void load_node_and_edge(std::string etype,
std::string ntype,
std::string epath,
std::string npath,
int part_num,
bool reverse);
int32_t load_next_partition(int idx); int32_t load_next_partition(int idx);
int32_t get_partition_num(int idx); int32_t get_partition_num(int idx);
void load_node_weight(int type_id, int idx, std::string path); void load_node_weight(int type_id, int idx, std::string path);
void export_partition_files(int idx, std::string file_path); void export_partition_files(int idx, std::string file_path);
std::vector<int64_t> get_partition(int idx, int num); std::vector<uint64_t> get_partition(int idx, int num);
void make_partitions(int idx, int64_t byte_size, int device_len); void make_partitions(int idx, int64_t byte_size, int device_len);
void make_complementary_graph(int idx, int64_t byte_size); void make_complementary_graph(int idx, int64_t byte_size);
void set_search_level(int level); void set_search_level(int level);
void init_search_level(int level); void init_search_level(int level);
std::vector<std::vector<int64_t>> get_all_id(int type, int get_all_id(int type,
int slice_num,
std::vector<std::vector<uint64_t>>* output);
int get_all_neighbor_id(int type,
int slice_num,
std::vector<std::vector<uint64_t>>* output);
int get_all_id(int type,
int idx,
int slice_num,
std::vector<std::vector<uint64_t>>* output);
int get_all_neighbor_id(int type,
int idx,
int slice_num,
std::vector<std::vector<uint64_t>>* output);
int get_all_feature_ids(int type,
int idx, int idx,
int slice_num); int slice_num,
NodeQueryResult query_node_list(int gpu_id, int start, int query_size); std::vector<std::vector<uint64_t>>* output);
NodeQueryResult query_node_list(int gpu_id,
int idx,
int start,
int query_size);
NeighborSampleResult graph_neighbor_sample_v3(NeighborSampleQuery q, NeighborSampleResult graph_neighbor_sample_v3(NeighborSampleQuery q,
bool cpu_switch); bool cpu_switch);
std::vector<int64_t> graph_neighbor_sample(int gpu_id, NeighborSampleResult graph_neighbor_sample(int gpu_id,
std::vector<int64_t>& key, uint64_t* device_keys,
int walk_degree,
int len);
std::vector<uint64_t> graph_neighbor_sample(int gpu_id,
int idx,
std::vector<uint64_t>& key,
int sample_size); int sample_size);
void set_feature_separator(std::string ch);
int get_feature_of_nodes(int gpu_id,
uint64_t* d_walk,
uint64_t* d_offset,
uint32_t size,
int slot_num);
void init_sample_status();
void free_sample_status();
std::unordered_map<std::string, int> edge_to_id, feature_to_id; std::unordered_map<std::string, int> edge_to_id, feature_to_id;
std::vector<std::string> id_to_feature, id_to_edge; std::vector<std::string> id_to_feature, id_to_edge;
std::vector<std::unordered_map<std::string, int>> table_feat_mapping; std::vector<std::unordered_map<std::string, int>> table_feat_mapping;
...@@ -75,6 +112,9 @@ class GraphGpuWrapper { ...@@ -75,6 +112,9 @@ class GraphGpuWrapper {
std::vector<int> device_id_mapping; std::vector<int> device_id_mapping;
int search_level = 1; int search_level = 1;
void* graph_table; void* graph_table;
int upload_num = 8;
std::shared_ptr<::ThreadPool> upload_task_pool;
std::string feature_separator_ = std::string(" ");
}; };
#endif #endif
} // namespace framework } // namespace framework
......
...@@ -83,10 +83,10 @@ class CommonGraphSampler : public GraphSampler { ...@@ -83,10 +83,10 @@ class CommonGraphSampler : public GraphSampler {
virtual void init(GpuPsGraphTable *g, std::vector<std::string> args); virtual void init(GpuPsGraphTable *g, std::vector<std::string> args);
GpuPsGraphTable *gpu_table; GpuPsGraphTable *gpu_table;
paddle::distributed::GraphTable *table; paddle::distributed::GraphTable *table;
std::vector<int64_t> gpu_edges_count; std::vector<uint64_t> gpu_edges_count;
int64_t cpu_edges_count; uint64_t cpu_edges_count;
int64_t gpu_edges_limit, cpu_edges_limit, gpu_edges_each_limit; uint64_t gpu_edges_limit, cpu_edges_limit, gpu_edges_each_limit;
std::vector<std::unordered_set<int64_t>> gpu_set; std::vector<std::unordered_set<uint64_t>> gpu_set;
int gpu_num; int gpu_num;
}; };
...@@ -102,8 +102,9 @@ class AllInGpuGraphSampler : public GraphSampler { ...@@ -102,8 +102,9 @@ class AllInGpuGraphSampler : public GraphSampler {
protected: protected:
paddle::distributed::GraphTable *graph_table; paddle::distributed::GraphTable *graph_table;
GpuPsGraphTable *gpu_table; GpuPsGraphTable *gpu_table;
std::vector<std::vector<paddle::framework::GpuPsGraphNode>> sample_nodes; std::vector<std::vector<uint64_t>> sample_node_ids;
std::vector<std::vector<int64_t>> sample_neighbors; std::vector<std::vector<paddle::framework::GpuPsNodeInfo>> sample_node_infos;
std::vector<std::vector<uint64_t>> sample_neighbors;
std::vector<GpuPsCommGraph> sample_res; std::vector<GpuPsCommGraph> sample_res;
// std::shared_ptr<std::mt19937_64> random; // std::shared_ptr<std::mt19937_64> random;
int gpu_num; int gpu_num;
......
...@@ -24,7 +24,7 @@ int CommonGraphSampler::load_from_ssd(std::string path) { ...@@ -24,7 +24,7 @@ int CommonGraphSampler::load_from_ssd(std::string path) {
std::cout << values.size(); std::cout << values.size();
if (values.size() < 2) continue; if (values.size() < 2) continue;
auto neighbors = paddle::string::split_string<std::string>(values[1], ";"); auto neighbors = paddle::string::split_string<std::string>(values[1], ";");
std::vector<int64_t> neighbor_data; std::vector<uint64_t> neighbor_data;
for (auto x : neighbors) { for (auto x : neighbors) {
neighbor_data.push_back(std::stoll(x)); neighbor_data.push_back(std::stoll(x));
} }
...@@ -33,7 +33,7 @@ int CommonGraphSampler::load_from_ssd(std::string path) { ...@@ -33,7 +33,7 @@ int CommonGraphSampler::load_from_ssd(std::string path) {
(char *)&src_id, (char *)&src_id,
sizeof(uint64_t), sizeof(uint64_t),
(char *)neighbor_data.data(), (char *)neighbor_data.data(),
sizeof(int64_t) * neighbor_data.size()); sizeof(uint64_t) * neighbor_data.size());
int gpu_shard = src_id % gpu_num; int gpu_shard = src_id % gpu_num;
if (gpu_edges_count[gpu_shard] + neighbor_data.size() <= if (gpu_edges_count[gpu_shard] + neighbor_data.size() <=
gpu_edges_each_limit) { gpu_edges_each_limit) {
...@@ -52,7 +52,7 @@ int CommonGraphSampler::load_from_ssd(std::string path) { ...@@ -52,7 +52,7 @@ int CommonGraphSampler::load_from_ssd(std::string path) {
} }
std::vector<paddle::framework::GpuPsCommGraph> graph_list; std::vector<paddle::framework::GpuPsCommGraph> graph_list;
for (int i = 0; i < gpu_num; i++) { for (int i = 0; i < gpu_num; i++) {
std::vector<int64_t> ids(gpu_set[i].begin(), gpu_set[i].end()); std::vector<uint64_t> ids(gpu_set[i].begin(), gpu_set[i].end());
graph_list.push_back(table->make_gpu_ps_graph(ids)); graph_list.push_back(table->make_gpu_ps_graph(ids));
} }
gpu_table->build_graph_from_cpu(graph_list); gpu_table->build_graph_from_cpu(graph_list);
...@@ -72,26 +72,29 @@ void CommonGraphSampler::init(GpuPsGraphTable *g, ...@@ -72,26 +72,29 @@ void CommonGraphSampler::init(GpuPsGraphTable *g,
gpu_edges_each_limit = gpu_edges_limit / gpu_num; gpu_edges_each_limit = gpu_edges_limit / gpu_num;
if (gpu_edges_each_limit > INT_MAX) gpu_edges_each_limit = INT_MAX; if (gpu_edges_each_limit > INT_MAX) gpu_edges_each_limit = INT_MAX;
table = g->cpu_graph_table.get(); table = g->cpu_graph_table.get();
gpu_edges_count = std::vector<int64_t>(gpu_num, 0); gpu_edges_count = std::vector<uint64_t>(gpu_num, 0);
cpu_edges_count = 0; cpu_edges_count = 0;
gpu_set = std::vector<std::unordered_set<int64_t>>(gpu_num); gpu_set = std::vector<std::unordered_set<uint64_t>>(gpu_num);
} }
int AllInGpuGraphSampler::run_graph_sampling() { return 0; } int AllInGpuGraphSampler::run_graph_sampling() { return 0; }
int AllInGpuGraphSampler::load_from_ssd(std::string path) { int AllInGpuGraphSampler::load_from_ssd(std::string path) {
graph_table->load_edges(path, false); graph_table->load_edges(path, false);
sample_nodes.clear(); sample_node_ids.clear() sample_node_infos.clear() sample_neighbors.clear();
sample_neighbors.clear();
sample_res.clear(); sample_res.clear();
sample_nodes.resize(gpu_num); sample_node_ids.resize(gpu_num);
sample_node_infos.resize(gpu_num);
sample_neighbors.resize(gpu_num); sample_neighbors.resize(gpu_num);
sample_res.resize(gpu_num); sample_res.resize(gpu_num);
std::vector<std::vector<std::vector<paddle::framework::GpuPsGraphNode>>> std::vector<std::vector<std::vector<uint64_t>>> sample_node_ids_ex(
sample_nodes_ex(graph_table->task_pool_size_); graph_table->task_pool_size_);
std::vector<std::vector<std::vector<int64_t>>> sample_neighbors_ex( std::vector<std::vector<std::vector<paddle::framework::GpuPsNodeInfo>>>
sample_node_infos_ex(graph_table->task_pool_size_);
std::vector<std::vector<std::vector<uint64_t>>> sample_neighbors_ex(
graph_table->task_pool_size_); graph_table->task_pool_size_);
for (int i = 0; i < graph_table->task_pool_size_; i++) { for (int i = 0; i < graph_table->task_pool_size_; i++) {
sample_nodes_ex[i].resize(gpu_num); sample_node_ids_ex[i].resize(gpu_num);
sample_node_infos_ex[i].resize(gpu_num);
sample_neighbors_ex[i].resize(gpu_num); sample_neighbors_ex[i].resize(gpu_num);
} }
std::vector<std::future<int>> tasks; std::vector<std::future<int>> tasks;
...@@ -100,17 +103,16 @@ int AllInGpuGraphSampler::load_from_ssd(std::string path) { ...@@ -100,17 +103,16 @@ int AllInGpuGraphSampler::load_from_ssd(std::string path) {
graph_table->_shards_task_pool[i % graph_table->task_pool_size_] graph_table->_shards_task_pool[i % graph_table->task_pool_size_]
->enqueue([&, i, this]() -> int { ->enqueue([&, i, this]() -> int {
if (this->status == GraphSamplerStatus::terminating) return 0; if (this->status == GraphSamplerStatus::terminating) return 0;
paddle::framework::GpuPsGraphNode node; paddle::framework::GpuPsNodeInfo info;
std::vector<paddle::distributed::Node *> &v = std::vector<paddle::distributed::Node *> &v =
this->graph_table->shards[i]->get_bucket(); this->graph_table->shards[i]->get_bucket();
size_t ind = i % this->graph_table->task_pool_size_; size_t ind = i % this->graph_table->task_pool_size_;
for (size_t j = 0; j < v.size(); j++) { for (size_t j = 0; j < v.size(); j++) {
size_t location = v[j]->get_id() % this->gpu_num; info.neighbor_size = v[j]->get_neighbor_size();
node.node_id = v[j]->get_id(); info.neighbor_offset =
node.neighbor_size = v[j]->get_neighbor_size(); sample_neighbors_ex[ind][location].size();
node.neighbor_offset = sample_node_infos_ex[ind][location].emplace_back(info);
(int)sample_neighbors_ex[ind][location].size(); sample_node_ids_ex[ind][location].emplace_back(v[j]->get_id());
sample_nodes_ex[ind][location].emplace_back(node);
for (int k = 0; k < node.neighbor_size; k++) for (int k = 0; k < node.neighbor_size; k++)
sample_neighbors_ex[ind][location].push_back( sample_neighbors_ex[ind][location].push_back(
v[j]->get_neighbor_id(k)); v[j]->get_neighbor_id(k));
...@@ -128,9 +130,11 @@ int AllInGpuGraphSampler::load_from_ssd(std::string path) { ...@@ -128,9 +130,11 @@ int AllInGpuGraphSampler::load_from_ssd(std::string path) {
int total_offset = 0; int total_offset = 0;
size_t ind = i; size_t ind = i;
for (int j = 0; j < this->graph_table->task_pool_size_; j++) { for (int j = 0; j < this->graph_table->task_pool_size_; j++) {
for (size_t k = 0; k < sample_nodes_ex[j][ind].size(); k++) { for (size_t k = 0; k < sample_node_ids_ex[j][ind].size(); k++) {
sample_nodes[ind].push_back(sample_nodes_ex[j][ind][k]); sample_node_ids[ind].push_back(sample_node_ids_ex[j][ind][k]);
sample_nodes[ind].back().neighbor_offset += total_offset; sample_node_infos[ind].push_back(
sample_node_infos_ex[j][ind][k]);
sample_node_infos[ind].back().neighbor_offset += total_offset;
} }
size_t neighbor_size = sample_neighbors_ex[j][ind].size(); size_t neighbor_size = sample_neighbors_ex[j][ind].size();
total_offset += neighbor_size; total_offset += neighbor_size;
...@@ -144,9 +148,10 @@ int AllInGpuGraphSampler::load_from_ssd(std::string path) { ...@@ -144,9 +148,10 @@ int AllInGpuGraphSampler::load_from_ssd(std::string path) {
} }
for (size_t i = 0; i < tasks.size(); i++) tasks[i].get(); for (size_t i = 0; i < tasks.size(); i++) tasks[i].get();
for (size_t i = 0; i < gpu_num; i++) { for (size_t i = 0; i < gpu_num; i++) {
sample_res[i].node_list = sample_nodes[i].data(); sample_res[i].node_list = sample_node_ids[i].data();
sample_res[i].node_info_list = sample_node_infos[i].data();
sample_res[i].neighbor_list = sample_neighbors[i].data(); sample_res[i].neighbor_list = sample_neighbors[i].data();
sample_res[i].node_size = sample_nodes[i].size(); sample_res[i].node_size = sample_node_ids[i].size();
sample_res[i].neighbor_size = sample_neighbors[i].size(); sample_res[i].neighbor_size = sample_neighbors[i].size();
} }
......
...@@ -76,6 +76,7 @@ class XPUCacheArray { ...@@ -76,6 +76,7 @@ class XPUCacheArray {
} }
void print() {} void print() {}
void print_collision(int i) {}
#if defined(__xpu__) #if defined(__xpu__)
__device__ ValType* find(const KeyType& key) { __device__ ValType* find(const KeyType& key) {
...@@ -137,12 +138,12 @@ class HashTable { ...@@ -137,12 +138,12 @@ class HashTable {
size_t len, size_t len,
StreamType stream); StreamType stream);
template <typename StreamType, typename FVAccessor> template <typename StreamType, typename GPUAccessor>
void get(const KeyType* d_keys, void get(const KeyType* d_keys,
char* d_vals, char* d_vals,
size_t len, size_t len,
StreamType stream, StreamType stream,
FVAccessor& fv_accessor); GPUAccessor& fv_accessor);
void show(); void show();
...@@ -193,6 +194,8 @@ class HashTable { ...@@ -193,6 +194,8 @@ class HashTable {
<< " push value size: " << push_grad_value_size_; << " push value size: " << push_grad_value_size_;
} }
void show_collision(int id) { return container_->print_collision(id); }
std::unique_ptr<phi::RWLock> rwlock_{nullptr}; std::unique_ptr<phi::RWLock> rwlock_{nullptr};
private: private:
......
...@@ -83,25 +83,22 @@ __global__ void search_kernel(Table* table, ...@@ -83,25 +83,22 @@ __global__ void search_kernel(Table* table,
} }
} }
template <typename Table, typename FVAccessor> template <typename Table, typename GPUAccessor>
__global__ void dy_mf_search_kernel(Table* table, __global__ void dy_mf_search_kernel(Table* table,
const typename Table::key_type* const keys, const typename Table::key_type* const keys,
char* vals, char* vals,
size_t len, size_t len,
size_t pull_feature_value_size, size_t pull_feature_value_size,
FVAccessor feature_value_accessor) { GPUAccessor gpu_accessor) {
const size_t i = blockIdx.x * blockDim.x + threadIdx.x; const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
// return;
if (i < len) { if (i < len) {
auto it = table->find(keys[i]); auto it = table->find(keys[i]);
if (it != table->end()) { if (it != table->end()) {
uint64_t offset = i * pull_feature_value_size; uint64_t offset = i * pull_feature_value_size;
float* cur = (float*)(vals + offset); float* cur = (float*)(vals + offset);
float* input = it->second; float* input = it->second;
int mf_dim = gpu_accessor.PullValueFill(cur, input);
int(input[feature_value_accessor.common_feature_value.MfDimIndex()]);
feature_value_accessor.FeatureValueFill(cur, input, mf_dim);
} }
} }
} }
...@@ -137,9 +134,7 @@ __global__ void dy_mf_update_kernel(Table* table, ...@@ -137,9 +134,7 @@ __global__ void dy_mf_update_kernel(Table* table,
float* cur = (float*)(grads + i * grad_value_size); float* cur = (float*)(grads + i * grad_value_size);
sgd.dy_mf_update_value(optimizer_config, (it.getter())->second, cur); sgd.dy_mf_update_value(optimizer_config, (it.getter())->second, cur);
} else { } else {
if (keys[i] != 0) { printf("warning: push miss key: %lu", keys[i]);
printf("warning::push miss key: %llu", keys[i]);
}
} }
} }
} }
...@@ -147,11 +142,12 @@ __global__ void dy_mf_update_kernel(Table* table, ...@@ -147,11 +142,12 @@ __global__ void dy_mf_update_kernel(Table* table,
template <typename KeyType, typename ValType> template <typename KeyType, typename ValType>
HashTable<KeyType, ValType>::HashTable(size_t capacity) { HashTable<KeyType, ValType>::HashTable(size_t capacity) {
container_ = new TableContainer<KeyType, ValType>(capacity); container_ = new TableContainer<KeyType, ValType>(capacity);
cudaMalloc((void**)&device_optimizer_config_, sizeof(OptimizerConfig)); CUDA_RT_CALL(
cudaMemcpy((void*)device_optimizer_config_, cudaMalloc((void**)&device_optimizer_config_, sizeof(OptimizerConfig)));
CUDA_RT_CALL(cudaMemcpy((void*)device_optimizer_config_,
&host_optimizer_config_, &host_optimizer_config_,
sizeof(OptimizerConfig), sizeof(OptimizerConfig),
cudaMemcpyHostToDevice); cudaMemcpyHostToDevice));
rwlock_.reset(new phi::RWLock); rwlock_.reset(new phi::RWLock);
} }
...@@ -201,12 +197,12 @@ void HashTable<KeyType, ValType>::get(const KeyType* d_keys, ...@@ -201,12 +197,12 @@ void HashTable<KeyType, ValType>::get(const KeyType* d_keys,
} }
template <typename KeyType, typename ValType> template <typename KeyType, typename ValType>
template <typename StreamType, typename FVAccessor> template <typename StreamType, typename GPUAccessor>
void HashTable<KeyType, ValType>::get(const KeyType* d_keys, void HashTable<KeyType, ValType>::get(const KeyType* d_keys,
char* d_vals, char* d_vals,
size_t len, size_t len,
StreamType stream, StreamType stream,
FVAccessor& fv_accessor) { GPUAccessor& fv_accessor) {
if (len == 0) { if (len == 0) {
return; return;
} }
...@@ -345,6 +341,7 @@ template class HashTable<unsigned long, float*>; ...@@ -345,6 +341,7 @@ template class HashTable<unsigned long, float*>;
template class HashTable<long, int>; template class HashTable<long, int>;
template class HashTable<unsigned long, int>; template class HashTable<unsigned long, int>;
template class HashTable<unsigned long, unsigned long>; template class HashTable<unsigned long, unsigned long>;
template class HashTable<unsigned long, unsigned long*>;
template class HashTable<unsigned long, long>; template class HashTable<unsigned long, long>;
template class HashTable<unsigned long, long*>; template class HashTable<unsigned long, long*>;
template class HashTable<long, long>; template class HashTable<long, long>;
...@@ -377,7 +374,8 @@ template void HashTable<unsigned long, unsigned long>::get<cudaStream_t>( ...@@ -377,7 +374,8 @@ template void HashTable<unsigned long, unsigned long>::get<cudaStream_t>(
unsigned long* d_vals, unsigned long* d_vals,
size_t len, size_t len,
cudaStream_t stream); cudaStream_t stream);
template void HashTable<unsigned long, long>::get<cudaStream_t>(
const unsigned long* d_keys, long* d_vals, size_t len, cudaStream_t stream);
template void HashTable<long, unsigned long>::get<cudaStream_t>( template void HashTable<long, unsigned long>::get<cudaStream_t>(
const long* d_keys, unsigned long* d_vals, size_t len, cudaStream_t stream); const long* d_keys, unsigned long* d_vals, size_t len, cudaStream_t stream);
template void HashTable<long, long>::get<cudaStream_t>(const long* d_keys, template void HashTable<long, long>::get<cudaStream_t>(const long* d_keys,
...@@ -386,8 +384,6 @@ template void HashTable<long, long>::get<cudaStream_t>(const long* d_keys, ...@@ -386,8 +384,6 @@ template void HashTable<long, long>::get<cudaStream_t>(const long* d_keys,
cudaStream_t stream); cudaStream_t stream);
template void HashTable<long, unsigned int>::get<cudaStream_t>( template void HashTable<long, unsigned int>::get<cudaStream_t>(
const long* d_keys, unsigned int* d_vals, size_t len, cudaStream_t stream); const long* d_keys, unsigned int* d_vals, size_t len, cudaStream_t stream);
template void HashTable<unsigned long, long>::get<cudaStream_t>(
const unsigned long* d_keys, long* d_vals, size_t len, cudaStream_t stream);
// template void // template void
// HashTable<unsigned long, paddle::framework::FeatureValue>::get<cudaStream_t>( // HashTable<unsigned long, paddle::framework::FeatureValue>::get<cudaStream_t>(
// const unsigned long* d_keys, char* d_vals, size_t len, cudaStream_t // const unsigned long* d_keys, char* d_vals, size_t len, cudaStream_t
...@@ -421,6 +417,13 @@ template void HashTable<unsigned long, int>::insert<cudaStream_t>( ...@@ -421,6 +417,13 @@ template void HashTable<unsigned long, int>::insert<cudaStream_t>(
const int* d_vals, const int* d_vals,
size_t len, size_t len,
cudaStream_t stream); cudaStream_t stream);
template void HashTable<unsigned long, long>::insert<cudaStream_t>(
const unsigned long* d_keys,
const long* d_vals,
size_t len,
cudaStream_t stream);
template void HashTable<long, unsigned long>::insert<cudaStream_t>( template void HashTable<long, unsigned long>::insert<cudaStream_t>(
const long* d_keys, const long* d_keys,
const unsigned long* d_vals, const unsigned long* d_vals,
...@@ -433,12 +436,6 @@ template void HashTable<long, unsigned int>::insert<cudaStream_t>( ...@@ -433,12 +436,6 @@ template void HashTable<long, unsigned int>::insert<cudaStream_t>(
size_t len, size_t len,
cudaStream_t stream); cudaStream_t stream);
template void HashTable<unsigned long, long>::insert<cudaStream_t>(
const unsigned long* d_keys,
const long* d_vals,
size_t len,
cudaStream_t stream);
template void HashTable<unsigned long, unsigned long>::insert<cudaStream_t>( template void HashTable<unsigned long, unsigned long>::insert<cudaStream_t>(
const unsigned long* d_keys, const unsigned long* d_keys,
const unsigned long* d_vals, const unsigned long* d_vals,
...@@ -448,26 +445,26 @@ template void HashTable<unsigned long, unsigned long>::insert<cudaStream_t>( ...@@ -448,26 +445,26 @@ template void HashTable<unsigned long, unsigned long>::insert<cudaStream_t>(
template void HashTable<unsigned long, float*>::dump_to_cpu<cudaStream_t>( template void HashTable<unsigned long, float*>::dump_to_cpu<cudaStream_t>(
int devid, cudaStream_t stream); int devid, cudaStream_t stream);
template void template void HashTable<unsigned long, float*>::update<
HashTable<unsigned long, float*>::update<SparseAdagradOptimizer, cudaStream_t>( SparseAdagradOptimizer<CommonFeatureValueAccessor>,
const unsigned long* d_keys, cudaStream_t>(const unsigned long* d_keys,
const char* d_grads, const char* d_grads,
size_t len, size_t len,
SparseAdagradOptimizer sgd, SparseAdagradOptimizer<CommonFeatureValueAccessor> sgd,
cudaStream_t stream); cudaStream_t stream);
template void template void HashTable<unsigned long, float*>::update<
HashTable<unsigned long, float*>::update<SparseAdamOptimizer, cudaStream_t>( SparseAdamOptimizer<CommonFeatureValueAccessor>,
const unsigned long* d_keys, cudaStream_t>(const unsigned long* d_keys,
const char* d_grads, const char* d_grads,
size_t len, size_t len,
SparseAdamOptimizer sgd, SparseAdamOptimizer<CommonFeatureValueAccessor> sgd,
cudaStream_t stream); cudaStream_t stream);
template void HashTable<unsigned long, float*>::update< template void HashTable<unsigned long, float*>::update<
SparseAdamSharedOptimizer, SparseAdamSharedOptimizer<CommonFeatureValueAccessor>,
cudaStream_t>(const unsigned long* d_keys, cudaStream_t>(const unsigned long* d_keys,
const char* d_grads, const char* d_grads,
size_t len, size_t len,
SparseAdamSharedOptimizer sgd, SparseAdamSharedOptimizer<CommonFeatureValueAccessor> sgd,
cudaStream_t stream); cudaStream_t stream);
// template void HashTable<unsigned long, // template void HashTable<unsigned long,
......
...@@ -25,7 +25,6 @@ limitations under the License. */ ...@@ -25,7 +25,6 @@ limitations under the License. */
#include "paddle/fluid/platform/timer.h" #include "paddle/fluid/platform/timer.h"
#include "thrust/pair.h" #include "thrust/pair.h"
#elif defined(PADDLE_WITH_XPU_KP) #elif defined(PADDLE_WITH_XPU_KP)
// #include "paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h"
#include <xpu/runtime.h> #include <xpu/runtime.h>
#include "paddle/fluid/platform/device/xpu/enforce_xpu.h" #include "paddle/fluid/platform/device/xpu/enforce_xpu.h"
...@@ -49,14 +48,46 @@ namespace framework { ...@@ -49,14 +48,46 @@ namespace framework {
template <typename KeyType, template <typename KeyType,
typename ValType, typename ValType,
typename GradType, typename GradType,
typename FVAccessor> typename GPUAccessor>
class HeterComm { class HeterComm {
public: public:
HeterComm(size_t capacity, std::shared_ptr<HeterPsResource> resource); HeterComm(size_t capacity, std::shared_ptr<HeterPsResource> resource);
HeterComm(size_t capacity,
std::shared_ptr<HeterPsResource> resource,
GPUAccessor& gpu_accessor);
virtual ~HeterComm(); virtual ~HeterComm();
HeterComm(const HeterComm&) = delete; HeterComm(const HeterComm&) = delete;
HeterComm& operator=(const HeterComm&) = delete; HeterComm& operator=(const HeterComm&) = delete;
void merge_keys(int gpu_num,
const KeyType* d_keys,
size_t len,
KeyType* d_sorted_keys,
KeyType* d_merged_keys,
uint32_t* d_restore_idx,
size_t& uniq_len);
void dynamic_merge_grad(int gpu_num,
KeyType* d_keys,
float* d_grads,
size_t len,
int& uniq_len,
size_t& segment_len,
bool enable_segment_merge_grad);
void segment_merge_grad(int gpu_num,
KeyType* d_keys,
float* d_grads,
const uint32_t* d_index,
size_t len,
const uint32_t* d_fea_num_info,
size_t uniq_len,
size_t& segment_len);
void build_ps(int num,
KeyType* h_keys,
ValType* h_vals,
size_t len,
size_t chunk_size,
int stream_num,
int offset = -1);
void split_input_to_shard(KeyType* d_keys, void split_input_to_shard(KeyType* d_keys,
int* d_idx_ptr, int* d_idx_ptr,
size_t len, size_t len,
...@@ -71,12 +102,6 @@ class HeterComm { ...@@ -71,12 +102,6 @@ class HeterComm {
void dynamic_merge_grad( void dynamic_merge_grad(
int gpu_num, KeyType* d_keys, float* d_grads, size_t len, int& uniq_len); int gpu_num, KeyType* d_keys, float* d_grads, size_t len, int& uniq_len);
void pull_sparse(int num, KeyType* d_keys, float* d_vals, size_t len); void pull_sparse(int num, KeyType* d_keys, float* d_vals, size_t len);
void build_ps(int num,
KeyType* h_keys,
ValType* h_vals,
size_t len,
size_t chunk_size,
int stream_num);
void build_ps(int num, void build_ps(int num,
KeyType* h_keys, KeyType* h_keys,
char* pool, char* pool,
...@@ -86,6 +111,7 @@ class HeterComm { ...@@ -86,6 +111,7 @@ class HeterComm {
int stream_num); int stream_num);
void dump(); void dump();
void show_one_table(int gpu_num); void show_one_table(int gpu_num);
void show_table_collisions();
int get_index_by_devid(int devid); int get_index_by_devid(int devid);
#if defined(PADDLE_WITH_CUDA) #if defined(PADDLE_WITH_CUDA)
...@@ -150,12 +176,6 @@ class HeterComm { ...@@ -150,12 +176,6 @@ class HeterComm {
max_mf_dim_ = max_mf_dim; max_mf_dim_ = max_mf_dim;
} }
void set_accessor(FVAccessor& accessor) {
feature_value_accessor_ = accessor;
// for (auto& ptr_table: ptr_tables_) {
// ptr_table->set_accessor(feature_value_accessor_);
// }
}
#endif #endif
bool need_transfer(int send_id, int receive_id) { bool need_transfer(int send_id, int receive_id) {
...@@ -167,6 +187,19 @@ class HeterComm { ...@@ -167,6 +187,19 @@ class HeterComm {
int get_transfer_devid(int send_id) { return (send_id + 4) % 8; } int get_transfer_devid(int send_id) { return (send_id + 4) % 8; }
void end_pass(); void end_pass();
#if defined(PADDLE_WITH_CUDA)
// dedup
int dedup_keys_and_fillidx(const int gpu_id,
const int total_fea_num,
const KeyType* d_keys, // input
KeyType* d_merged_keys, // output
KeyType* d_sorted_keys,
uint32_t* d_restore_idx,
uint32_t* d_sorted_idx,
uint32_t* d_offset,
uint32_t* d_merged_cnts,
bool filter_zero);
#endif
struct Node { struct Node {
ppStream in_stream; ppStream in_stream;
...@@ -262,7 +295,10 @@ class HeterComm { ...@@ -262,7 +295,10 @@ class HeterComm {
#endif #endif
} }
void create_storage(int start_index, int end_index, int keylen, int vallen); void create_storage(int start_index,
int end_index,
size_t keylen,
size_t vallen);
void destroy_storage(int start_index, int end_index); void destroy_storage(int start_index, int end_index);
void walk_to_dest(int start_index, void walk_to_dest(int start_index,
int gpu_num, int gpu_num,
...@@ -289,9 +325,10 @@ class HeterComm { ...@@ -289,9 +325,10 @@ class HeterComm {
char* src_val, char* src_val,
size_t val_size); size_t val_size);
FVAccessor feature_value_accessor_;
protected: protected:
void pull_merge_sparse(int num, KeyType* d_keys, float* d_vals, size_t len);
void pull_normal_sparse(int num, KeyType* d_keys, float* d_vals, size_t len);
using Table = HashTable<KeyType, ValType>; using Table = HashTable<KeyType, ValType>;
using PtrTable = HashTable<KeyType, float*>; using PtrTable = HashTable<KeyType, float*>;
std::vector<Table*> tables_; std::vector<Table*> tables_;
...@@ -302,6 +339,8 @@ class HeterComm { ...@@ -302,6 +339,8 @@ class HeterComm {
int block_size_{256}; int block_size_{256};
std::unique_ptr<HeterCommKernel> heter_comm_kernel_; std::unique_ptr<HeterCommKernel> heter_comm_kernel_;
GPUAccessor gpu_accessor_;
private: private:
int topo_aware_{0}; int topo_aware_{0};
std::vector<LocalStorage> storage_; std::vector<LocalStorage> storage_;
......
...@@ -16,25 +16,34 @@ limitations under the License. */ ...@@ -16,25 +16,34 @@ limitations under the License. */
#include <queue> #include <queue>
#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" #include "paddle/fluid/framework/fleet/heter_ps/feature_value.h"
#include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_utils.h"
#include "paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h" #include "paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#ifdef PADDLE_WITH_XPU_KP #ifdef PADDLE_WITH_XPU_KP
#include "paddle/fluid/platform/device/xpu/xpu_info.h" #include "paddle/fluid/platform/device/xpu/xpu_info.h"
#endif #endif
DECLARE_double(gpugraph_hbm_table_load_factor);
DECLARE_bool(gpugraph_enable_gpu_direct_access);
DECLARE_bool(gpugraph_enable_segment_merge_grads);
DECLARE_uint64(gpugraph_merge_grads_segment_size);
DECLARE_int32(gpugraph_dedup_pull_push_mode);
namespace paddle { namespace paddle {
namespace framework { namespace framework {
template <typename KeyType, template <typename KeyType,
typename ValType, typename ValType,
typename GradType, typename GradType,
typename FVAccessor> typename GPUAccessor>
HeterComm<KeyType, ValType, GradType, FVAccessor>::HeterComm( HeterComm<KeyType, ValType, GradType, GPUAccessor>::HeterComm(
size_t capacity, std::shared_ptr<HeterPsResource> resource) { size_t capacity, std::shared_ptr<HeterPsResource> resource) {
VLOG(1) << "Construct new HeterComm"; VLOG(1) << "Construct new HeterComm";
resource_ = resource; resource_ = resource;
storage_.resize(resource_->total_device()); storage_.resize(resource_->total_device());
multi_mf_dim_ = resource->multi_mf(); multi_mf_dim_ = resource->multi_mf();
load_factor_ = FLAGS_gpugraph_hbm_table_load_factor;
VLOG(0) << "load_factor = " << load_factor_;
for (int i = 0; i < resource_->total_device(); ++i) { for (int i = 0; i < resource_->total_device(); ++i) {
#if defined(PADDLE_WITH_CUDA) #if defined(PADDLE_WITH_CUDA)
platform::CUDADeviceGuard guard(resource_->dev_id(i)); platform::CUDADeviceGuard guard(resource_->dev_id(i));
...@@ -47,15 +56,19 @@ HeterComm<KeyType, ValType, GradType, FVAccessor>::HeterComm( ...@@ -47,15 +56,19 @@ HeterComm<KeyType, ValType, GradType, FVAccessor>::HeterComm(
} else { } else {
max_mf_dim_ = resource_->max_mf_dim(); max_mf_dim_ = resource_->max_mf_dim();
auto accessor_wrapper_ptr = auto accessor_wrapper_ptr =
GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); GlobalAccessorFactory::GetInstance().GetAccessorWrapper();
size_t val_type_size = size_t val_type_size =
accessor_wrapper_ptr->GetFeatureValueSize(max_mf_dim_); accessor_wrapper_ptr->GetFeatureValueSize(max_mf_dim_);
size_t grad_type_size = size_t grad_type_size =
accessor_wrapper_ptr->GetPushValueSize(max_mf_dim_); accessor_wrapper_ptr->GetPushValueSize(max_mf_dim_);
size_t pull_type_size =
accessor_wrapper_ptr->GetPullValueSize(max_mf_dim_);
VLOG(0) << " HeterComm init, max feature_value_size:" << val_type_size VLOG(0) << " HeterComm init, max feature_value_size:" << val_type_size
<< ", feature_value_push_size:" << grad_type_size; << ", feature_value_push_size:" << grad_type_size
<< ", feature_pull_type_size:" << pull_type_size;
auto ptr_table = new PtrTable(capacity / load_factor_); auto ptr_table = new PtrTable(capacity / load_factor_);
ptr_table->set_feature_value_size(val_type_size, grad_type_size); ptr_table->set_feature_value_size(pull_type_size, grad_type_size);
ptr_tables_.push_back(ptr_table); ptr_tables_.push_back(ptr_table);
} }
if (multi_node_) { if (multi_node_) {
...@@ -69,8 +82,58 @@ HeterComm<KeyType, ValType, GradType, FVAccessor>::HeterComm( ...@@ -69,8 +82,58 @@ HeterComm<KeyType, ValType, GradType, FVAccessor>::HeterComm(
template <typename KeyType, template <typename KeyType,
typename ValType, typename ValType,
typename GradType, typename GradType,
typename FVAccessor> typename GPUAccessor>
void HeterComm<KeyType, ValType, GradType, FVAccessor>::init_path() { HeterComm<KeyType, ValType, GradType, GPUAccessor>::HeterComm(
size_t capacity,
std::shared_ptr<HeterPsResource> resource,
GPUAccessor& gpu_accessor) {
VLOG(1) << "Construct new HeterComm";
resource_ = resource;
storage_.resize(resource_->total_device());
multi_mf_dim_ = resource->multi_mf();
gpu_accessor_ = gpu_accessor;
load_factor_ = FLAGS_gpugraph_hbm_table_load_factor;
VLOG(0) << "load_factor = " << load_factor_;
for (int i = 0; i < resource_->total_device(); ++i) {
#if defined(PADDLE_WITH_CUDA)
platform::CUDADeviceGuard guard(resource_->dev_id(i));
allocators_.push_back(std::make_shared<cub::CachingDeviceAllocator>(
8, 1, (unsigned int)-1, (size_t)-1, false, false)); // NOLINT
#endif
if (!multi_mf_dim_) {
auto table = new Table(capacity / load_factor_);
tables_.push_back(table);
} else {
max_mf_dim_ = resource_->max_mf_dim();
auto accessor_wrapper_ptr =
GlobalAccessorFactory::GetInstance().GetAccessorWrapper();
size_t val_type_size =
accessor_wrapper_ptr->GetFeatureValueSize(max_mf_dim_);
size_t grad_type_size =
accessor_wrapper_ptr->GetPushValueSize(max_mf_dim_);
size_t pull_type_size =
accessor_wrapper_ptr->GetPullValueSize(max_mf_dim_);
VLOG(0) << " HeterComm init, max feature_value_size:" << val_type_size
<< ", feature_value_push_size:" << grad_type_size
<< ", feature_pull_type_size:" << pull_type_size;
auto ptr_table = new PtrTable(capacity / load_factor_);
ptr_table->set_feature_value_size(pull_type_size, grad_type_size);
ptr_tables_.push_back(ptr_table);
}
if (multi_node_) {
storage_[i].init(feanum_, resource_->dev_id(i));
}
}
heter_comm_kernel_ = std::make_unique<HeterCommKernel>(block_size_);
init_path();
}
template <typename KeyType,
typename ValType,
typename GradType,
typename GPUAccessor>
void HeterComm<KeyType, ValType, GradType, GPUAccessor>::init_path() {
int total_device = resource_->total_device(); int total_device = resource_->total_device();
path_.resize(total_device); path_.resize(total_device);
if (!topo_aware_) { if (!topo_aware_) {
...@@ -125,9 +188,9 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::init_path() { ...@@ -125,9 +188,9 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::init_path() {
template <typename KeyType, template <typename KeyType,
typename ValType, typename ValType,
typename GradType, typename GradType,
typename FVAccessor> typename GPUAccessor>
template <typename DstPlace, typename SrcPlace, typename StreamType> template <typename DstPlace, typename SrcPlace, typename StreamType>
void HeterComm<KeyType, ValType, GradType, FVAccessor>::memory_copy( void HeterComm<KeyType, ValType, GradType, GPUAccessor>::memory_copy(
DstPlace dst_place, DstPlace dst_place,
void* dst, void* dst,
SrcPlace src_place, SrcPlace src_place,
...@@ -135,9 +198,9 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::memory_copy( ...@@ -135,9 +198,9 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::memory_copy(
size_t count, size_t count,
StreamType stream) { StreamType stream) {
#if defined(PADDLE_WITH_CUDA) #if defined(PADDLE_WITH_CUDA)
cudaMemcpyAsync(dst, src, count, cudaMemcpyDefault, stream); CUDA_CHECK(cudaMemcpyAsync(dst, src, count, cudaMemcpyDefault, stream));
if (stream == 0) { if (stream == 0) {
cudaStreamSynchronize(0); CUDA_CHECK(cudaStreamSynchronize(0));
} }
#elif defined(PADDLE_WITH_XPU_KP) #elif defined(PADDLE_WITH_XPU_KP)
memory::Copy(dst_place, dst, src_place, src, count); memory::Copy(dst_place, dst, src_place, src, count);
...@@ -147,24 +210,24 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::memory_copy( ...@@ -147,24 +210,24 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::memory_copy(
template <typename KeyType, template <typename KeyType,
typename ValType, typename ValType,
typename GradType, typename GradType,
typename FVAccessor> typename GPUAccessor>
void HeterComm<KeyType, ValType, GradType, FVAccessor>::create_storage( void HeterComm<KeyType, ValType, GradType, GPUAccessor>::create_storage(
int start_index, int end_index, int keylen, int vallen) { int start_index, int end_index, size_t keylen, size_t vallen) {
#if defined(PADDLE_WITH_CUDA) #if defined(PADDLE_WITH_CUDA)
auto& allocator = allocators_[start_index]; auto& allocator = allocators_[start_index];
auto& nodes = path_[start_index][end_index].nodes_; auto& nodes = path_[start_index][end_index].nodes_;
for (size_t i = 0; i < nodes.size(); ++i) { for (size_t i = 0; i < nodes.size(); ++i) {
platform::CUDADeviceGuard guard(resource_->dev_id(nodes[i].dev_num)); platform::CUDADeviceGuard guard(resource_->dev_id(nodes[i].dev_num));
allocator->DeviceAllocate( PADDLE_ENFORCE_GPU_SUCCESS(allocator->DeviceAllocate(
resource_->dev_id(nodes[i].dev_num), resource_->dev_id(nodes[i].dev_num),
(void**)&(nodes[i].key_storage), // NOLINT (void**)&(nodes[i].key_storage), // NOLINT
keylen, keylen,
resource_->remote_stream(nodes[i].dev_num, start_index)); resource_->remote_stream(nodes[i].dev_num, start_index)));
allocator->DeviceAllocate( PADDLE_ENFORCE_GPU_SUCCESS(allocator->DeviceAllocate(
resource_->dev_id(nodes[i].dev_num), resource_->dev_id(nodes[i].dev_num),
(void**)&(nodes[i].val_storage), // NOLINT (void**)&(nodes[i].val_storage), // NOLINT
vallen, vallen,
resource_->remote_stream(nodes[i].dev_num, start_index)); resource_->remote_stream(nodes[i].dev_num, start_index)));
nodes[i].key_bytes_len = keylen; nodes[i].key_bytes_len = keylen;
nodes[i].val_bytes_len = vallen; nodes[i].val_bytes_len = vallen;
} }
...@@ -186,8 +249,8 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::create_storage( ...@@ -186,8 +249,8 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::create_storage(
template <typename KeyType, template <typename KeyType,
typename ValType, typename ValType,
typename GradType, typename GradType,
typename FVAccessor> typename GPUAccessor>
void HeterComm<KeyType, ValType, GradType, FVAccessor>::destroy_storage( void HeterComm<KeyType, ValType, GradType, GPUAccessor>::destroy_storage(
int start_index, int end_index) { int start_index, int end_index) {
#if defined(PADDLE_WITH_CUDA) #if defined(PADDLE_WITH_CUDA)
auto& allocator = allocators_[start_index]; auto& allocator = allocators_[start_index];
...@@ -195,10 +258,10 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::destroy_storage( ...@@ -195,10 +258,10 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::destroy_storage(
for (size_t i = 0; i < nodes.size(); ++i) { for (size_t i = 0; i < nodes.size(); ++i) {
platform::CUDADeviceGuard guard(resource_->dev_id(nodes[i].dev_num)); platform::CUDADeviceGuard guard(resource_->dev_id(nodes[i].dev_num));
allocator->DeviceFree(resource_->dev_id(nodes[i].dev_num), PADDLE_ENFORCE_GPU_SUCCESS(allocator->DeviceFree(
nodes[i].key_storage); resource_->dev_id(nodes[i].dev_num), nodes[i].key_storage));
allocator->DeviceFree(resource_->dev_id(nodes[i].dev_num), PADDLE_ENFORCE_GPU_SUCCESS(allocator->DeviceFree(
nodes[i].val_storage); resource_->dev_id(nodes[i].dev_num), nodes[i].val_storage));
} }
#endif #endif
} }
...@@ -206,8 +269,8 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::destroy_storage( ...@@ -206,8 +269,8 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::destroy_storage(
template <typename KeyType, template <typename KeyType,
typename ValType, typename ValType,
typename GradType, typename GradType,
typename FVAccessor> typename GPUAccessor>
void HeterComm<KeyType, ValType, GradType, FVAccessor>::walk_to_dest( void HeterComm<KeyType, ValType, GradType, GPUAccessor>::walk_to_dest(
int start_index, int start_index,
int num, int num,
int* h_left, int* h_left,
...@@ -293,8 +356,8 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::walk_to_dest( ...@@ -293,8 +356,8 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::walk_to_dest(
template <typename KeyType, template <typename KeyType,
typename ValType, typename ValType,
typename GradType, typename GradType,
typename FVAccessor> typename GPUAccessor>
void HeterComm<KeyType, ValType, GradType, FVAccessor>::walk_to_dest( void HeterComm<KeyType, ValType, GradType, GPUAccessor>::walk_to_dest(
int start_index, int start_index,
int gpu_num, int gpu_num,
int* h_left, int* h_left,
...@@ -315,40 +378,44 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::walk_to_dest( ...@@ -315,40 +378,44 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::walk_to_dest(
auto& node = path_[start_index][i].nodes_[0]; auto& node = path_[start_index][i].nodes_[0];
CopyTask t(&path_[start_index][i], 0); CopyTask t(&path_[start_index][i], 0);
que.push(t); que.push(t);
cudaMemcpyAsync(node.key_storage, CUDA_CHECK(cudaMemcpyAsync(node.key_storage,
reinterpret_cast<char*>(src_key + h_left[i]), reinterpret_cast<char*>(src_key + h_left[i]),
node.key_bytes_len, node.key_bytes_len,
cudaMemcpyDefault, cudaMemcpyDefault,
node.in_stream); node.in_stream));
if (need_copy_val) { if (need_copy_val) {
CUDA_CHECK(
cudaMemcpyAsync(node.val_storage, cudaMemcpyAsync(node.val_storage,
src_val + uint64_t(h_left[i]) * uint64_t(val_size), src_val + uint64_t(h_left[i]) * uint64_t(val_size),
node.val_bytes_len, node.val_bytes_len,
cudaMemcpyDefault, cudaMemcpyDefault,
node.in_stream); node.in_stream));
} }
} }
while (!que.empty()) { while (!que.empty()) {
CopyTask& cur_task = que.front(); CopyTask& cur_task = que.front();
que.pop(); que.pop();
if (cur_task.path->nodes_[cur_task.step].sync) { if (cur_task.path->nodes_[cur_task.step].sync) {
cudaStreamSynchronize(cur_task.path->nodes_[cur_task.step].in_stream); CUDA_CHECK(cudaStreamSynchronize(
cur_task.path->nodes_[cur_task.step].in_stream));
} }
if (cur_task.step != cur_task.path->nodes_.size() - 1) { if (cur_task.step != cur_task.path->nodes_.size() - 1) {
int cur_step = cur_task.step; int cur_step = cur_task.step;
CopyTask c(cur_task.path, cur_step + 1); CopyTask c(cur_task.path, cur_step + 1);
que.push(c); que.push(c);
CUDA_CHECK(
cudaMemcpyAsync(cur_task.path->nodes_[cur_step + 1].key_storage, cudaMemcpyAsync(cur_task.path->nodes_[cur_step + 1].key_storage,
cur_task.path->nodes_[cur_step].key_storage, cur_task.path->nodes_[cur_step].key_storage,
cur_task.path->nodes_[cur_step + 1].key_bytes_len, cur_task.path->nodes_[cur_step + 1].key_bytes_len,
cudaMemcpyDefault, cudaMemcpyDefault,
cur_task.path->nodes_[cur_step + 1].in_stream); cur_task.path->nodes_[cur_step + 1].in_stream));
if (need_copy_val) { if (need_copy_val) {
CUDA_CHECK(
cudaMemcpyAsync(cur_task.path->nodes_[cur_step + 1].val_storage, cudaMemcpyAsync(cur_task.path->nodes_[cur_step + 1].val_storage,
cur_task.path->nodes_[cur_step].val_storage, cur_task.path->nodes_[cur_step].val_storage,
cur_task.path->nodes_[cur_step + 1].val_bytes_len, cur_task.path->nodes_[cur_step + 1].val_bytes_len,
cudaMemcpyDefault, cudaMemcpyDefault,
cur_task.path->nodes_[cur_step + 1].in_stream); cur_task.path->nodes_[cur_step + 1].in_stream));
} }
} }
} }
...@@ -357,8 +424,8 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::walk_to_dest( ...@@ -357,8 +424,8 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::walk_to_dest(
template <typename KeyType, template <typename KeyType,
typename ValType, typename ValType,
typename GradType, typename GradType,
typename FVAccessor> typename GPUAccessor>
void HeterComm<KeyType, ValType, GradType, FVAccessor>::walk_to_src( void HeterComm<KeyType, ValType, GradType, GPUAccessor>::walk_to_src(
int start_index, int start_index,
int gpu_num, int gpu_num,
int* h_left, int* h_left,
...@@ -373,19 +440,20 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::walk_to_src( ...@@ -373,19 +440,20 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::walk_to_src(
int cur_step = path_[start_index][i].nodes_.size() - 1; int cur_step = path_[start_index][i].nodes_.size() - 1;
auto& node = path_[start_index][i].nodes_[cur_step]; auto& node = path_[start_index][i].nodes_[cur_step];
if (cur_step == 0) { if (cur_step == 0) {
cudaMemcpyAsync(src_val + uint64_t(h_left[i]) * val_size, CUDA_CHECK(cudaMemcpyAsync(src_val + uint64_t(h_left[i]) * val_size,
node.val_storage, node.val_storage,
node.val_bytes_len, node.val_bytes_len,
cudaMemcpyDefault, cudaMemcpyDefault,
node.out_stream); node.out_stream));
} else { } else {
CopyTask t(&path_[start_index][i], cur_step - 1); CopyTask t(&path_[start_index][i], cur_step - 1);
que.push(t); que.push(t);
cudaMemcpyAsync(path_[start_index][i].nodes_[cur_step - 1].val_storage, CUDA_CHECK(cudaMemcpyAsync(
path_[start_index][i].nodes_[cur_step - 1].val_storage,
node.val_storage, node.val_storage,
path_[start_index][i].nodes_[cur_step - 1].val_bytes_len, path_[start_index][i].nodes_[cur_step - 1].val_bytes_len,
cudaMemcpyDefault, cudaMemcpyDefault,
path_[start_index][i].nodes_[cur_step - 1].out_stream); path_[start_index][i].nodes_[cur_step - 1].out_stream));
} }
} }
while (!que.empty()) { while (!que.empty()) {
...@@ -398,18 +466,20 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::walk_to_src( ...@@ -398,18 +466,20 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::walk_to_src(
if (cur_step > 0) { if (cur_step > 0) {
CopyTask c(cur_task.path, cur_step - 1); CopyTask c(cur_task.path, cur_step - 1);
que.push(c); que.push(c);
CUDA_CHECK(
cudaMemcpyAsync(cur_task.path->nodes_[cur_step - 1].val_storage, cudaMemcpyAsync(cur_task.path->nodes_[cur_step - 1].val_storage,
cur_task.path->nodes_[cur_step].val_storage, cur_task.path->nodes_[cur_step].val_storage,
cur_task.path->nodes_[cur_step - 1].val_bytes_len, cur_task.path->nodes_[cur_step - 1].val_bytes_len,
cudaMemcpyDefault, cudaMemcpyDefault,
cur_task.path->nodes_[cur_step - 1].out_stream); cur_task.path->nodes_[cur_step - 1].out_stream));
} else if (cur_step == 0) { } else if (cur_step == 0) {
int end_index = cur_task.path->nodes_.back().dev_num; int end_index = cur_task.path->nodes_.back().dev_num;
CUDA_CHECK(
cudaMemcpyAsync(src_val + uint64_t(h_left[end_index]) * val_size, cudaMemcpyAsync(src_val + uint64_t(h_left[end_index]) * val_size,
cur_task.path->nodes_[cur_step].val_storage, cur_task.path->nodes_[cur_step].val_storage,
cur_task.path->nodes_[cur_step].val_bytes_len, cur_task.path->nodes_[cur_step].val_bytes_len,
cudaMemcpyDefault, cudaMemcpyDefault,
cur_task.path->nodes_[cur_step].out_stream); cur_task.path->nodes_[cur_step].out_stream));
} }
} }
} }
...@@ -417,8 +487,8 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::walk_to_src( ...@@ -417,8 +487,8 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::walk_to_src(
template <typename KeyType, template <typename KeyType,
typename ValType, typename ValType,
typename GradType, typename GradType,
typename FVAccessor> typename GPUAccessor>
HeterComm<KeyType, ValType, GradType, FVAccessor>::~HeterComm() { HeterComm<KeyType, ValType, GradType, GPUAccessor>::~HeterComm() {
if (!multi_mf_dim_) { if (!multi_mf_dim_) {
for (auto& table : tables_) { for (auto& table : tables_) {
delete table; delete table;
...@@ -439,8 +509,8 @@ HeterComm<KeyType, ValType, GradType, FVAccessor>::~HeterComm() { ...@@ -439,8 +509,8 @@ HeterComm<KeyType, ValType, GradType, FVAccessor>::~HeterComm() {
template <typename KeyType, template <typename KeyType,
typename ValType, typename ValType,
typename GradType, typename GradType,
typename FVAccessor> typename GPUAccessor>
void HeterComm<KeyType, ValType, GradType, FVAccessor>::show_one_table( void HeterComm<KeyType, ValType, GradType, GPUAccessor>::show_one_table(
int gpu_num) { int gpu_num) {
if (!multi_mf_dim_) { if (!multi_mf_dim_) {
tables_[gpu_num]->show(); tables_[gpu_num]->show();
...@@ -450,8 +520,28 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::show_one_table( ...@@ -450,8 +520,28 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::show_one_table(
// Asks every non-null hash table owned by this HeterComm to report its
// collision statistics. Both table collections (tables_ and ptr_tables_) are
// visited; the number handed to show_collision() is a running count of the
// non-null tables seen so far within that collection, restarting at 0 for
// ptr_tables_.
template <typename KeyType,
          typename ValType,
          typename GradType,
          typename GPUAccessor>
void HeterComm<KeyType, ValType, GradType, GPUAccessor>::
    show_table_collisions() {
  size_t seq = 0;
  for (auto& tbl : tables_) {
    if (tbl == nullptr) {
      continue;  // skipped slots do not consume a sequence number
    }
    tbl->show_collision(seq);
    ++seq;
  }

  seq = 0;
  for (auto& tbl : ptr_tables_) {
    if (tbl == nullptr) {
      continue;
    }
    tbl->show_collision(seq);
    ++seq;
  }
}
template <typename KeyType,
typename ValType,
typename GradType,
typename GPUAccessor>
int HeterComm<KeyType, ValType, GradType, GPUAccessor>::log2i(int x) {
unsigned res = 0; unsigned res = 0;
while (x >>= 1) { while (x >>= 1) {
++res; ++res;
...@@ -462,8 +552,8 @@ int HeterComm<KeyType, ValType, GradType, FVAccessor>::log2i(int x) { ...@@ -462,8 +552,8 @@ int HeterComm<KeyType, ValType, GradType, FVAccessor>::log2i(int x) {
// Translates a device id into the index used by this communicator by
// delegating to the underlying resource object.
template <typename KeyType,
          typename ValType,
          typename GradType,
          typename GPUAccessor>
int HeterComm<KeyType, ValType, GradType, GPUAccessor>::get_index_by_devid(
    int devid) {
  const auto dev_index = resource_->get_index_by_devid(devid);
  return dev_index;
}
...@@ -471,8 +561,8 @@ int HeterComm<KeyType, ValType, GradType, FVAccessor>::get_index_by_devid( ...@@ -471,8 +561,8 @@ int HeterComm<KeyType, ValType, GradType, FVAccessor>::get_index_by_devid(
template <typename KeyType, template <typename KeyType,
typename ValType, typename ValType,
typename GradType, typename GradType,
typename FVAccessor> typename GPUAccessor>
void HeterComm<KeyType, ValType, GradType, FVAccessor>::set_sparse_sgd( void HeterComm<KeyType, ValType, GradType, GPUAccessor>::set_sparse_sgd(
const OptimizerConfig& optimizer_config) { const OptimizerConfig& optimizer_config) {
for (int i = 0; i < resource_->total_device(); ++i) { for (int i = 0; i < resource_->total_device(); ++i) {
AnyDeviceGuard guard(resource_->dev_id(i)); AnyDeviceGuard guard(resource_->dev_id(i));
...@@ -487,8 +577,8 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::set_sparse_sgd( ...@@ -487,8 +577,8 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::set_sparse_sgd(
template <typename KeyType, template <typename KeyType,
typename ValType, typename ValType,
typename GradType, typename GradType,
typename FVAccessor> typename GPUAccessor>
void HeterComm<KeyType, ValType, GradType, FVAccessor>::set_embedx_sgd( void HeterComm<KeyType, ValType, GradType, GPUAccessor>::set_embedx_sgd(
const OptimizerConfig& optimizer_config) { const OptimizerConfig& optimizer_config) {
for (int i = 0; i < resource_->total_device(); ++i) { for (int i = 0; i < resource_->total_device(); ++i) {
AnyDeviceGuard guard(resource_->dev_id(i)); AnyDeviceGuard guard(resource_->dev_id(i));
...@@ -503,14 +593,15 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::set_embedx_sgd( ...@@ -503,14 +593,15 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::set_embedx_sgd(
template <typename KeyType, template <typename KeyType,
typename ValType, typename ValType,
typename GradType, typename GradType,
typename FVAccessor> typename GPUAccessor>
void HeterComm<KeyType, ValType, GradType, FVAccessor>::build_ps( void HeterComm<KeyType, ValType, GradType, GPUAccessor>::build_ps(
int dev_num, int dev_num,
KeyType* h_keys, KeyType* h_keys,
ValType* h_vals, ValType* h_vals,
size_t len, size_t len,
size_t chunk_size, size_t chunk_size,
int stream_num) { int stream_num,
int offset) {
if (len <= 0) { if (len <= 0) {
return; return;
} }
...@@ -557,11 +648,11 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::build_ps( ...@@ -557,11 +648,11 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::build_ps(
h_vals + cur_len, h_vals + cur_len,
sizeof(ValType) * tmp_len, sizeof(ValType) * tmp_len,
cur_use_stream); cur_use_stream);
if (offset == -1) offset = dev_num;
tables_[dev_num]->insert( tables_[offset]->insert(
reinterpret_cast<KeyType*>(d_key_bufs[cur_stream]->ptr()), reinterpret_cast<KeyType*>(d_key_bufs[cur_stream]->ptr()),
reinterpret_cast<ValType*>(d_val_bufs[cur_stream]->ptr()), reinterpret_cast<ValType*>(d_val_bufs[cur_stream]->ptr()),
tmp_len, (size_t)tmp_len,
cur_use_stream); cur_use_stream);
cur_stream += 1; cur_stream += 1;
...@@ -576,8 +667,8 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::build_ps( ...@@ -576,8 +667,8 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::build_ps(
template <typename KeyType, template <typename KeyType,
typename ValType, typename ValType,
typename GradType, typename GradType,
typename FVAccessor> typename GPUAccessor>
void HeterComm<KeyType, ValType, GradType, FVAccessor>::build_ps( void HeterComm<KeyType, ValType, GradType, GPUAccessor>::build_ps(
int num, int num,
KeyType* h_keys, KeyType* h_keys,
char* pool, char* pool,
...@@ -642,8 +733,8 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::build_ps( ...@@ -642,8 +733,8 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::build_ps(
template <typename KeyType, template <typename KeyType,
typename ValType, typename ValType,
typename GradType, typename GradType,
typename FVAccessor> typename GPUAccessor>
void HeterComm<KeyType, ValType, GradType, FVAccessor>::merge_grad( void HeterComm<KeyType, ValType, GradType, GPUAccessor>::merge_grad(
int dev_num, int dev_num,
KeyType* d_keys, KeyType* d_keys,
GradType* d_grads, GradType* d_grads,
...@@ -719,34 +810,36 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::merge_grad( ...@@ -719,34 +810,36 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::merge_grad(
template <typename KeyType, template <typename KeyType,
typename ValType, typename ValType,
typename GradType, typename GradType,
typename FVAccessor> typename GPUAccessor>
void HeterComm<KeyType, ValType, GradType, FVAccessor>::dynamic_merge_grad( void HeterComm<KeyType, ValType, GradType, GPUAccessor>::dynamic_merge_grad(
int gpu_num, KeyType* d_keys, float* d_grads, size_t len, int& uniq_len) { int gpu_num,
KeyType* d_keys,
float* d_grads,
size_t len,
int& uniq_len,
size_t& segment_len,
bool enable_segment_merge_grad) {
int dev_id = resource_->dev_id(gpu_num); int dev_id = resource_->dev_id(gpu_num);
platform::CUDAPlace place = platform::CUDAPlace(dev_id); platform::CUDAPlace place = platform::CUDAPlace(dev_id);
platform::CUDADeviceGuard guard(dev_id); platform::CUDADeviceGuard guard(dev_id);
auto stream = resource_->local_stream(gpu_num, 0); auto stream = resource_->local_stream(gpu_num, 0);
size_t temp_storage_bytes; size_t temp_storage_bytes;
size_t grad_dim = max_mf_dim_;
auto accessor_wrapper_ptr = auto accessor_wrapper_ptr =
GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); GlobalAccessorFactory::GetInstance().GetAccessorWrapper();
size_t grad_value_size = accessor_wrapper_ptr->GetPushValueSize(max_mf_dim_); size_t grad_value_size = accessor_wrapper_ptr->GetPushValueSize(max_mf_dim_);
auto d_merge_keys = memory::Alloc(place, len * sizeof(KeyType)); auto d_merge_keys = memory::Alloc(place, len * sizeof(KeyType));
KeyType* d_merge_keys_ptr = reinterpret_cast<KeyType*>(d_merge_keys->ptr()); KeyType* d_merge_keys_ptr = reinterpret_cast<KeyType*>(d_merge_keys->ptr());
auto d_merge_grads = memory::Alloc(place, len * grad_value_size);
float* d_merge_grads_ptr = reinterpret_cast<float*>(d_merge_grads->ptr());
auto d_fea_num_info = memory::Alloc(place, sizeof(uint32_t) * (len * 3 + 1)); auto d_fea_num_info = memory::Alloc(place, sizeof(uint32_t) * (len * 3 + 1));
uint32_t* d_fea_num_info_ptr = uint32_t* d_fea_num_info_ptr =
reinterpret_cast<uint32_t*>(d_fea_num_info->ptr()); reinterpret_cast<uint32_t*>(d_fea_num_info->ptr());
uint32_t* d_index = (uint32_t*)&d_fea_num_info_ptr[len]; uint32_t* d_index = (uint32_t*)&d_fea_num_info_ptr[len];
uint32_t* d_idx = (uint32_t*)&d_index[len]; uint32_t* d_idx = (uint32_t*)&d_index[len];
int* d_merged_size = (int*)&d_idx[len]; int* d_merged_size = (int*)&d_idx[len];
int grid_size = (len - 1) / block_size_ + 1;
heter_comm_kernel_->fill_idx(d_idx, len, stream); heter_comm_kernel_->fill_idx(d_idx, len, stream);
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
cub::DeviceRadixSort::SortPairs(NULL, cub::DeviceRadixSort::SortPairs(NULL,
temp_storage_bytes, temp_storage_bytes,
...@@ -758,7 +851,6 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::dynamic_merge_grad( ...@@ -758,7 +851,6 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::dynamic_merge_grad(
0, 0,
8 * sizeof(KeyType), 8 * sizeof(KeyType),
stream)); stream));
void* d_buff = NULL;
auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); auto d_temp_storage = memory::Alloc(place, temp_storage_bytes);
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
cub::DeviceRadixSort::SortPairs(d_temp_storage->ptr(), cub::DeviceRadixSort::SortPairs(d_temp_storage->ptr(),
...@@ -772,6 +864,7 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::dynamic_merge_grad( ...@@ -772,6 +864,7 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::dynamic_merge_grad(
8 * sizeof(KeyType), 8 * sizeof(KeyType),
stream)); stream));
PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream));
temp_storage_bytes = 0; temp_storage_bytes = 0;
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
cub::DeviceRunLengthEncode::Encode(NULL, cub::DeviceRunLengthEncode::Encode(NULL,
...@@ -824,30 +917,204 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::dynamic_merge_grad( ...@@ -824,30 +917,204 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::dynamic_merge_grad(
uniq_len, uniq_len,
stream)); stream));
PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream));
heter_comm_kernel_->merge_gradient(d_offset,
if (enable_segment_merge_grad) {
segment_merge_grad(gpu_num,
d_merge_keys_ptr,
d_grads,
d_index,
len,
d_fea_num_info_ptr,
uniq_len,
segment_len);
PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(d_keys,
d_merge_keys_ptr,
sizeof(KeyType) * segment_len,
cudaMemcpyDeviceToDevice,
stream));
PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream));
} else {
auto d_merge_grads = memory::Alloc(place, len * grad_value_size);
float* d_merge_grads_ptr = reinterpret_cast<float*>(d_merge_grads->ptr());
heter_comm_kernel_->merge_gradient(d_keys,
d_offset,
d_fea_num_info_ptr, d_fea_num_info_ptr,
d_index, d_index,
(char*)d_grads, (char*)d_grads,
(char*)d_merge_grads_ptr, (char*)d_merge_grads_ptr,
uniq_len, uniq_len,
grad_dim,
grad_value_size, grad_value_size,
merger_, merger_,
stream, stream,
feature_value_accessor_); gpu_accessor_);
PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream));
PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(d_grads, PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(d_grads,
d_merge_grads_ptr, d_merge_grads_ptr,
grad_value_size * uniq_len, grad_value_size * uniq_len,
cudaMemcpyDeviceToDevice, cudaMemcpyDeviceToDevice,
stream)); stream));
PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream));
}
}
// Merges gradients of already-sorted keys segment by segment on device
// `gpu_num`, using local stream 0 of that device.
//
// The segment size is read from FLAGS_gpugraph_merge_grads_segment_size. The
// function computes how many segments each unique key contributes, expands
// the per-key counts into per-segment counts/offsets, picks one key per
// segment, merges the gradients of every segment, and finally copies the
// per-segment keys and gradients back to the front of d_keys / d_grads.
// NOTE(review): the exact splitting rule is implemented by
// heter_comm_kernel_->split_segments / expand_segments, which are not visible
// here -- confirm against the kernel implementation.
//
// On return `segments_num` holds the number of segments produced; the first
// `segments_num` entries of d_keys and d_grads are valid.
template <typename KeyType,
          typename ValType,
          typename GradType,
          typename GPUAccessor>
void HeterComm<KeyType, ValType, GradType, GPUAccessor>::segment_merge_grad(
    int gpu_num,  // the device number
    KeyType*
        d_keys,  // the sorted keys list, which will be modified after merged
    float* d_grads,  // the raw grads list, which will be modified after merged
    const uint32_t*
        d_index,  // the storage position of d_keys, its length is len.
    size_t len,   // the number of raw input keys
    const uint32_t*
        d_fea_num_info,  // prefix sum array, its length is uniq_len+1
    size_t uniq_len,     // the number of unique keys
    size_t& segments_num) {  // the number of segment merged keys
  int dev_id = resource_->dev_id(gpu_num);
  platform::CUDAPlace place = platform::CUDAPlace(dev_id);
  platform::CUDADeviceGuard guard(dev_id);
  auto stream = resource_->local_stream(gpu_num, 0);

  auto grad_dim = max_mf_dim_;
  auto accessor_wrapper_ptr =
      GlobalAccessorFactory::GetInstance().GetAccessorWrapper();
  size_t grad_value_size = accessor_wrapper_ptr->GetPushValueSize(max_mf_dim_);

  // Scratch buffers, each sized for the worst case of `len` entries.
  auto d_buffer1 = memory::Alloc(place, sizeof(uint32_t) * len);
  auto d_segments = reinterpret_cast<uint32_t*>(d_buffer1->ptr());
  auto d_buffer2 = memory::Alloc(place, sizeof(uint32_t) * len);
  auto d_segments_offset = reinterpret_cast<uint32_t*>(d_buffer2->ptr());
  auto d_buffer3 = memory::Alloc(place, sizeof(uint32_t) * len);
  auto d_segments_fea_num_info = reinterpret_cast<uint32_t*>(d_buffer3->ptr());
  auto d_buffer4 = memory::Alloc(place, sizeof(uint32_t) * len);
  auto d_segments_fea_num_offset =
      reinterpret_cast<uint32_t*>(d_buffer4->ptr());
  auto d_buffer5 = memory::Alloc(place, sizeof(uint32_t));
  auto d_segments_num = reinterpret_cast<uint32_t*>(d_buffer5->ptr());
  CUDA_CHECK(cudaMemsetAsync(d_segments_num, 0, sizeof(uint32_t), stream));

  // Step 1: per unique key, compute how many segments it is split into.
  uint32_t segment_size = FLAGS_gpugraph_merge_grads_segment_size;
  heter_comm_kernel_->split_segments(d_fea_num_info,
                                     uniq_len,
                                     d_segments,
                                     d_segments_num,
                                     segment_size,
                                     stream);
  PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream));

  // Step 2: total number of segments = sum of d_segments[0..uniq_len).
  size_t temp_storage_bytes = 0;
  PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceReduce::Sum(nullptr,
                                                    temp_storage_bytes,
                                                    d_segments,
                                                    d_segments_num,
                                                    uniq_len,
                                                    stream));
  auto d_temp_storage = memory::Alloc(place, temp_storage_bytes);
  PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceReduce::Sum(d_temp_storage->ptr(),
                                                    temp_storage_bytes,
                                                    d_segments,
                                                    d_segments_num,
                                                    uniq_len,
                                                    stream));
  // The device counter is a uint32_t while the out-parameter is a size_t.
  // Copy into a host variable of the matching width first so that every byte
  // of segments_num is well defined, regardless of its previous content.
  uint32_t h_segments_num = 0;
  CUDA_CHECK(cudaMemcpyAsync(&h_segments_num,
                             d_segments_num,
                             sizeof(uint32_t),
                             cudaMemcpyDeviceToHost,
                             stream));
  PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream));
  segments_num = h_segments_num;

  // Step 3: exclusive prefix sum of the per-key segment counts.
  temp_storage_bytes = 0;
  PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceScan::ExclusiveSum(nullptr,
                                                           temp_storage_bytes,
                                                           d_segments,
                                                           d_segments_offset,
                                                           uniq_len,
                                                           stream));
  if (d_temp_storage->size() < temp_storage_bytes) {
    // Release the old buffer before asking for a larger one.
    d_temp_storage = nullptr;
    d_temp_storage = memory::Alloc(place, temp_storage_bytes);
  }
  PADDLE_ENFORCE_GPU_SUCCESS(
      cub::DeviceScan::ExclusiveSum(d_temp_storage->ptr(),
                                    temp_storage_bytes,
                                    d_segments,
                                    d_segments_offset,
                                    uniq_len,
                                    stream));
  PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream));

  // Step 4: expand per-key info into per-segment feature counts.
  heter_comm_kernel_->expand_segments(d_fea_num_info,
                                      d_segments_offset,
                                      uniq_len,
                                      d_segments_fea_num_info,
                                      segment_size,
                                      stream);
  PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream));

  // Step 5: exclusive prefix sum of the per-segment feature counts.
  PADDLE_ENFORCE_GPU_SUCCESS(
      cub::DeviceScan::ExclusiveSum(nullptr,
                                    temp_storage_bytes,
                                    d_segments_fea_num_info,
                                    d_segments_fea_num_offset,
                                    segments_num,
                                    stream));
  if (d_temp_storage->size() < temp_storage_bytes) {
    d_temp_storage = nullptr;
    d_temp_storage = memory::Alloc(place, temp_storage_bytes);
  }
  PADDLE_ENFORCE_GPU_SUCCESS(
      cub::DeviceScan::ExclusiveSum(d_temp_storage->ptr(),
                                    temp_storage_bytes,
                                    d_segments_fea_num_info,
                                    d_segments_fea_num_offset,
                                    segments_num,
                                    stream));
  PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream));

  // Step 6: gather one key per segment.
  auto d_segments_keys = memory::Alloc(place, sizeof(KeyType) * segments_num);
  auto d_segments_keys_ptr = reinterpret_cast<KeyType*>(d_segments_keys->ptr());
  heter_comm_kernel_->shrink_keys(d_keys,
                                  d_segments_fea_num_offset,
                                  d_segments_keys_ptr,
                                  segments_num,
                                  stream);
  PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream));

  // Step 7: merge the gradients of each segment into a temporary buffer.
  auto d_segment_grads = memory::Alloc(place, segments_num * grad_value_size);
  auto d_segment_grads_ptr = reinterpret_cast<float*>(d_segment_grads->ptr());
  heter_comm_kernel_->merge_gradient(
      d_segments_keys_ptr,
      d_segments_fea_num_offset,
      d_segments_fea_num_info,
      d_index,
      reinterpret_cast<char*>(d_grads),
      reinterpret_cast<char*>(d_segment_grads_ptr),
      segments_num,
      grad_dim,
      grad_value_size,
      merger_,
      stream,
      gpu_accessor_);
  PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream));

  // Step 8: publish the merged keys and gradients back to the caller buffers.
  PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(d_keys,
                                             d_segments_keys_ptr,
                                             sizeof(KeyType) * segments_num,
                                             cudaMemcpyDeviceToDevice,
                                             stream));
  PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(d_grads,
                                             d_segment_grads_ptr,
                                             grad_value_size * segments_num,
                                             cudaMemcpyDeviceToDevice,
                                             stream));
  PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream));
}
template <typename KeyType, template <typename KeyType,
typename ValType, typename ValType,
typename GradType, typename GradType,
typename FVAccessor> typename GPUAccessor>
void HeterComm<KeyType, ValType, GradType, FVAccessor>::split_input_to_shard( void HeterComm<KeyType, ValType, GradType, GPUAccessor>::split_input_to_shard(
KeyType* d_keys, KeyType* d_keys,
int* d_idx_ptr, int* d_idx_ptr,
size_t len, size_t len,
...@@ -869,15 +1136,12 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::split_input_to_shard( ...@@ -869,15 +1136,12 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::split_input_to_shard(
auto d_shard_index_tmp = memory::Alloc(place, len * sizeof(int)); auto d_shard_index_tmp = memory::Alloc(place, len * sizeof(int));
int* d_shard_index_tmp_ptr = reinterpret_cast<int*>(d_shard_index_tmp->ptr()); int* d_shard_index_tmp_ptr = reinterpret_cast<int*>(d_shard_index_tmp->ptr());
// int grid_size = (len - 1) / block_size_ + 1;
heter_comm_kernel_->fill_idx(d_idx_tmp_ptr, len, stream); heter_comm_kernel_->fill_idx(d_idx_tmp_ptr, len, stream);
heter_comm_kernel_->calc_shard_index( heter_comm_kernel_->calc_shard_index(
d_keys, len, d_shard_index_tmp_ptr, total_device, stream); d_keys, len, d_shard_index_tmp_ptr, total_device, stream);
size_t temp_storage_bytes; size_t temp_storage_bytes;
const int num_bits = 1 + log2i(total_device); const int num_bits = 1 + log2i(total_device);
heter_comm_kernel_->sort_pairs(NULL, heter_comm_kernel_->sort_pairs(NULL,
temp_storage_bytes, temp_storage_bytes,
d_shard_index_tmp_ptr, d_shard_index_tmp_ptr,
...@@ -890,7 +1154,6 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::split_input_to_shard( ...@@ -890,7 +1154,6 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::split_input_to_shard(
stream); stream);
auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); auto d_temp_storage = memory::Alloc(place, temp_storage_bytes);
heter_comm_kernel_->sort_pairs(d_temp_storage->ptr(), heter_comm_kernel_->sort_pairs(d_temp_storage->ptr(),
temp_storage_bytes, temp_storage_bytes,
d_shard_index_tmp_ptr, d_shard_index_tmp_ptr,
...@@ -910,13 +1173,309 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::split_input_to_shard( ...@@ -910,13 +1173,309 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::split_input_to_shard(
// Sorts and de-duplicates `len` device keys on device `gpu_num` (local
// stream 0) and builds an index for restoring the original order.
//
// Outputs (all device buffers provided by the caller, except uniq_len):
//   d_sorted_keys  - the input keys after the radix sort.
//   d_merged_keys  - the unique keys produced by run-length encoding the
//                    sorted keys.
//   d_restore_idx  - filled by heter_comm_kernel_->fill_restore_idx;
//                    presumably maps every original position to its slot in
//                    d_merged_keys -- confirm against the kernel.
//   uniq_len       - number of unique keys (host value).
template <typename KeyType,
          typename ValType,
          typename GradType,
          typename GPUAccessor>
void HeterComm<KeyType, ValType, GradType, GPUAccessor>::merge_keys(
    int gpu_num,
    const KeyType* d_keys,
    size_t len,               // input
    KeyType* d_sorted_keys,   // output
    KeyType* d_merged_keys,   // output
    uint32_t* d_restore_idx,  // output
    size_t& uniq_len) {       // output
  int dev_id = resource_->dev_id(gpu_num);
  platform::CUDAPlace place = platform::CUDAPlace(dev_id);
  platform::CUDADeviceGuard guard(dev_id);
  auto stream = resource_->local_stream(gpu_num, 0);

  // One allocation carved into four arrays of `len` entries plus a single
  // trailing counter:
  //   [fea_num_info | idx | index | offset | merged_size]
  auto d_fea_num_info = memory::Alloc(place, sizeof(uint32_t) * (len * 4 + 1));
  uint32_t* d_fea_num_info_ptr =
      reinterpret_cast<uint32_t*>(d_fea_num_info->ptr());
  uint32_t* d_idx = d_fea_num_info_ptr + len;
  uint32_t* d_index = d_idx + len;
  uint32_t* d_offset = d_index + len;
  uint32_t* d_merged_size = d_offset + len;

  // Sort (key, original position) pairs by key.
  heter_comm_kernel_->fill_idx(d_idx, len, stream);
  size_t temp_storage_bytes;
  PADDLE_ENFORCE_GPU_SUCCESS(
      cub::DeviceRadixSort::SortPairs(nullptr,
                                      temp_storage_bytes,
                                      d_keys,
                                      d_sorted_keys,
                                      d_idx,
                                      d_index,
                                      len,
                                      0,
                                      8 * sizeof(KeyType),
                                      stream));
  auto d_temp_storage = memory::Alloc(place, temp_storage_bytes);
  PADDLE_ENFORCE_GPU_SUCCESS(
      cub::DeviceRadixSort::SortPairs(d_temp_storage->ptr(),
                                      temp_storage_bytes,
                                      d_keys,
                                      d_sorted_keys,
                                      d_idx,
                                      d_index,
                                      len,
                                      0,
                                      8 * sizeof(KeyType),
                                      stream));
  PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream));

  // Collapse runs of equal keys: unique keys, run lengths and run count.
  temp_storage_bytes = 0;
  PADDLE_ENFORCE_GPU_SUCCESS(
      cub::DeviceRunLengthEncode::Encode(nullptr,
                                         temp_storage_bytes,
                                         d_sorted_keys,
                                         d_merged_keys,
                                         d_fea_num_info_ptr,
                                         d_merged_size,
                                         len,
                                         stream));
  if (d_temp_storage->size() < temp_storage_bytes) {
    // Release the old buffer before asking for a larger one.
    d_temp_storage = nullptr;
    d_temp_storage = memory::Alloc(place, temp_storage_bytes);
  }
  PADDLE_ENFORCE_GPU_SUCCESS(
      cub::DeviceRunLengthEncode::Encode(d_temp_storage->ptr(),
                                         temp_storage_bytes,
                                         d_sorted_keys,
                                         d_merged_keys,
                                         d_fea_num_info_ptr,
                                         d_merged_size,
                                         len,
                                         stream));
  // The device counter is 32 bits wide while uniq_len is a size_t. Stage the
  // value in a host variable of the matching width so the whole of uniq_len
  // is well defined, and check the copy like every other CUDA call here.
  uint32_t h_merged_size = 0;
  PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(&h_merged_size,
                                             d_merged_size,
                                             sizeof(uint32_t),
                                             cudaMemcpyDeviceToHost,
                                             stream));
  PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream));
  uniq_len = h_merged_size;

  // Exclusive prefix sum of the run lengths gives each unique key's start
  // offset within the sorted sequence.
  temp_storage_bytes = 0;
  PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceScan::ExclusiveSum(nullptr,
                                                           temp_storage_bytes,
                                                           d_fea_num_info_ptr,
                                                           d_offset,
                                                           uniq_len,
                                                           stream));
  if (d_temp_storage->size() < temp_storage_bytes) {
    d_temp_storage = nullptr;
    d_temp_storage = memory::Alloc(place, temp_storage_bytes);
  }
  PADDLE_ENFORCE_GPU_SUCCESS(
      cub::DeviceScan::ExclusiveSum(d_temp_storage->ptr(),
                                    temp_storage_bytes,
                                    d_fea_num_info_ptr,
                                    d_offset,
                                    uniq_len,
                                    stream));
  PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream));

  // NOTE(review): the meaning of the leading `true` flag is defined by
  // fill_restore_idx, which is not visible here.
  heter_comm_kernel_->fill_restore_idx(true,
                                       len,
                                       uniq_len,
                                       d_merged_keys,
                                       d_index,
                                       d_offset,
                                       d_fea_num_info_ptr,
                                       d_restore_idx,
                                       stream);
  PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream));
}
// Pulls the values of `len` keys on device slot `num`, querying the tables
// once per merged key instead of once per input key.
//
// Visible flow:
//   1. merge_keys() fills the sorted/merged key buffers, a restore index and
//      `uniq_len` (presumably the number of distinct keys - confirm against
//      merge_keys).
//   2. split_input_to_shard() partitions the merged keys across devices and
//      reports, per device i, an inclusive range [h_left[i], h_right[i]] in
//      the shard buffers; -1 marks a device that received no keys.
//   3. Each non-empty shard is looked up in ptr_tables_[i] under a read lock,
//      either through per-path staging storage or, when
//      FLAGS_gpugraph_enable_gpu_direct_access is set, directly on the local
//      shard buffers.
//   4. dy_mf_fill_dvals() reorders the shard values into merged order and
//      unpack_merged_vals() expands them to one value per input key.
//
// Params:
//   num    - index of the calling device slot (not the raw device id).
//   d_keys - device pointer to `len` input keys.
//   d_vals - device pointer receiving `len` pulled values, each
//            GetPullValueSize(max_mf_dim_) bytes wide.
//   len    - number of input keys; 0 is a no-op.
template <typename KeyType,
          typename ValType,
          typename GradType,
          typename GPUAccessor>
void HeterComm<KeyType, ValType, GradType, GPUAccessor>::pull_merge_sparse(
    int num, KeyType* d_keys, float* d_vals, size_t len) {
  // Nothing to pull; also keeps the allocations below from being zero-sized.
  if (len == 0) {
    return;
  }

  int total_device = resource_->total_device();
  int dev_id = resource_->dev_id(num);
  DevPlace place = DevPlace(dev_id);
  AnyDeviceGuard guard(dev_id);
  auto stream = resource_->local_stream(num, 0);

  // Host copies of the per-device shard boundaries.
  int h_left[total_device];   // NOLINT
  int h_right[total_device];  // NOLINT

  auto d_left = memory::Alloc(place, total_device * sizeof(int));
  auto d_right = memory::Alloc(place, total_device * sizeof(int));
  int* d_left_ptr = reinterpret_cast<int*>(d_left->ptr());
  int* d_right_ptr = reinterpret_cast<int*>(d_right->ptr());

  // Initialise every boundary to -1 ("no keys for this device").
#if defined(PADDLE_WITH_CUDA)
  // The memset is byte-wise: 0xFF in every byte reads back as int -1.
  PADDLE_ENFORCE_GPU_SUCCESS(
      cudaMemsetAsync(d_left_ptr, -1, total_device * sizeof(int), stream));
  PADDLE_ENFORCE_GPU_SUCCESS(
      cudaMemsetAsync(d_right_ptr, -1, total_device * sizeof(int), stream));
#elif defined(PADDLE_WITH_XPU_KP)
  // get XPUDeviceContext according to xpu place
  paddle::platform::XPUDeviceContext xpu_dev_ctx(place);
  auto xpu_context = xpu_dev_ctx.x_context();

  int r = xpu::constant<int>(xpu_context, d_left_ptr, total_device, -1);
  PADDLE_ENFORCE_EQ(r,
                    XPU_SUCCESS,
                    platform::errors::External(
                        "XPU constant kernel return wrong value[%d %s]",
                        r,
                        XPUAPIErrorMsg[r]));
  int r2 = xpu::constant<int>(xpu_context, d_right_ptr, total_device, -1);
  PADDLE_ENFORCE_EQ(r2,
                    XPU_SUCCESS,
                    platform::errors::External(
                        "XPU constant kernel return wrong value[%d %s]",
                        r2,
                        XPUAPIErrorMsg[r2]));
#endif

  auto accessor_wrapper_ptr =
      GlobalAccessorFactory::GetInstance().GetAccessorWrapper();
  size_t val_type_size = accessor_wrapper_ptr->GetPullValueSize(max_mf_dim_);
  VLOG(3) << "pull_sparse len:" << len << " val_type_size: " << val_type_size;

  // Scratch buffers, all sized for the worst case of `len` distinct keys.
  auto d_sorted_keys = memory::Alloc(place, len * sizeof(KeyType));
  auto d_sorted_keys_ptr = reinterpret_cast<KeyType*>(d_sorted_keys->ptr());
  auto d_merged_keys = memory::Alloc(place, len * sizeof(KeyType));
  auto d_merged_keys_ptr = reinterpret_cast<KeyType*>(d_merged_keys->ptr());
  auto d_restore_idx = memory::Alloc(place, len * sizeof(uint32_t));
  auto d_restore_idx_ptr = reinterpret_cast<uint32_t*>(d_restore_idx->ptr());
  auto d_shard_keys = memory::Alloc(place, len * sizeof(KeyType));
  auto d_shard_keys_ptr = reinterpret_cast<KeyType*>(d_shard_keys->ptr());
  auto d_shard_vals = memory::Alloc(place, len * val_type_size);
  auto d_shard_vals_ptr = reinterpret_cast<float*>(d_shard_vals->ptr());

  // Step 1: merge the input keys; uniq_len is written by merge_keys.
  size_t uniq_len = 0;
  merge_keys(num,
             d_keys,
             len,
             d_sorted_keys_ptr,
             d_merged_keys_ptr,
             d_restore_idx_ptr,
             uniq_len);
  sync_stream(stream);

  // Step 2: assign merged keys to devices and gather them shard by shard.
  auto d_idx = memory::Alloc(place, uniq_len * sizeof(int));
  auto d_idx_ptr = reinterpret_cast<int*>(d_idx->ptr());
  split_input_to_shard(
      d_merged_keys_ptr, d_idx_ptr, uniq_len, d_left_ptr, d_right_ptr, num);
  heter_comm_kernel_->fill_shard_key(
      d_shard_keys_ptr, d_merged_keys_ptr, d_idx_ptr, uniq_len, stream);
  sync_stream(stream);

  // Bring the shard boundaries to the host.
  // NOTE(review): h_left/h_right are read right after these calls, which
  // relies on memory_copy having completed when it returns - confirm.
  auto dst_place = platform::CPUPlace();
  auto src_place = place;
  memory_copy(dst_place,
              h_left,
              src_place,
              d_left_ptr,
              total_device * sizeof(int),
              stream);
  memory_copy(dst_place,
              h_right,
              src_place,
              d_right_ptr,
              total_device * sizeof(int),
              stream);

  // Staged mode only: reserve per-path storage and ship the shard keys.
  if (!FLAGS_gpugraph_enable_gpu_direct_access) {
    for (int i = 0; i < total_device; ++i) {
      int shard_len = h_right[i] - h_left[i] + 1;
      if (h_left[i] == -1 || h_right[i] == -1) {
        continue;
      }
      create_storage(
          num, i, shard_len * sizeof(KeyType), shard_len * val_type_size);
    }
    walk_to_dest(num, total_device, h_left, h_right, d_shard_keys_ptr, NULL);
  }

  // Step 3: issue one table lookup per non-empty shard. The read lock taken
  // here is released in the following loop, after the remote stream is synced.
  for (int i = 0; i < total_device; ++i) {
    if (h_left[i] == -1) {
      continue;
    }
    auto& node = path_[num][i].nodes_.back();
    if (!FLAGS_gpugraph_enable_gpu_direct_access) {
      // Wait until the keys have arrived in the staging storage.
      sync_stream(node.in_stream);
    }
    AnyDeviceGuard guard(resource_->dev_id(i));
    ptr_tables_[i]->rwlock_->RDLock();
    if (!FLAGS_gpugraph_enable_gpu_direct_access) {
      ptr_tables_[i]->get(reinterpret_cast<KeyType*>(node.key_storage),
                          node.val_storage,
                          h_right[i] - h_left[i] + 1,
                          resource_->remote_stream(i, num),
                          gpu_accessor_);
    } else {
      // Direct access: table i reads and writes this device's shard buffers.
      ptr_tables_[i]->get(
          d_shard_keys_ptr + h_left[i],
          reinterpret_cast<char*>(d_shard_vals_ptr) + h_left[i] * val_type_size,
          h_right[i] - h_left[i] + 1,
          resource_->remote_stream(i, num),
          gpu_accessor_);
    }
  }

  // Wait for every lookup, then drop the locks taken above (same h_left test).
  for (int i = 0; i < total_device; ++i) {
    sync_stream(resource_->remote_stream(i, num));
    if (h_left[i] == -1) {
      continue;
    }
    ptr_tables_[i]->rwlock_->UNLock();
  }

  // Staged mode only: bring the looked-up values back into d_shard_vals.
  if (!FLAGS_gpugraph_enable_gpu_direct_access) {
    walk_to_src(num,
                total_device,
                h_left,
                h_right,
                reinterpret_cast<char*>(d_shard_vals_ptr),
                val_type_size);
    for (int i = 0; i < total_device; ++i) {
      auto& node = path_[num][i].nodes_.front();
      sync_stream(node.out_stream);
    }
  }

  // Step 4a: shard order -> merged-key order.
  auto d_merged_vals = memory::Alloc(place, uniq_len * val_type_size);
  auto d_merged_vals_ptr = reinterpret_cast<float*>(d_merged_vals->ptr());
  heter_comm_kernel_->dy_mf_fill_dvals(d_shard_vals_ptr,
                                       d_merged_vals_ptr,
                                       d_idx_ptr,
                                       uniq_len,
                                       val_type_size,
                                       stream);
  sync_stream(stream);

  // Step 4b: merged-key order -> one value per original input key.
  heter_comm_kernel_->unpack_merged_vals(len,
                                         d_keys,
                                         d_merged_vals_ptr,
                                         d_restore_idx_ptr,
                                         d_vals,
                                         val_type_size,
                                         stream);
  sync_stream(stream);

  // Release the staging storage reserved by create_storage above.
  if (!FLAGS_gpugraph_enable_gpu_direct_access) {
    for (int i = 0; i < total_device; ++i) {
      if (h_left[i] == -1 || h_right[i] == -1) {
        continue;
      }
      destroy_storage(num, i);
    }
  }
}
template <typename KeyType,
typename ValType,
typename GradType,
typename GPUAccessor>
void HeterComm<KeyType, ValType, GradType, GPUAccessor>::pull_normal_sparse(
int num, KeyType* d_keys, float* d_vals, size_t len) {
int total_device = resource_->total_device(); int total_device = resource_->total_device();
int dev_id = resource_->dev_id(num); int dev_id = resource_->dev_id(num);
DevPlace place = DevPlace(dev_id); DevPlace place = DevPlace(dev_id);
...@@ -960,8 +1519,8 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::pull_sparse( ...@@ -960,8 +1519,8 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::pull_sparse(
int* d_idx_ptr = reinterpret_cast<int*>(d_idx->ptr()); int* d_idx_ptr = reinterpret_cast<int*>(d_idx->ptr());
auto accessor_wrapper_ptr = auto accessor_wrapper_ptr =
GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); GlobalAccessorFactory::GetInstance().GetAccessorWrapper();
size_t val_type_size = accessor_wrapper_ptr->GetFeatureValueSize(max_mf_dim_); size_t val_type_size = accessor_wrapper_ptr->GetPullValueSize(max_mf_dim_);
VLOG(3) << "pull_sparse len:" << len << " val_type_size: " << val_type_size; VLOG(3) << "pull_sparse len:" << len << " val_type_size: " << val_type_size;
auto d_shard_keys = memory::Alloc(place, len * sizeof(KeyType)); auto d_shard_keys = memory::Alloc(place, len * sizeof(KeyType));
KeyType* d_shard_keys_ptr = reinterpret_cast<KeyType*>(d_shard_keys->ptr()); KeyType* d_shard_keys_ptr = reinterpret_cast<KeyType*>(d_shard_keys->ptr());
...@@ -991,6 +1550,7 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::pull_sparse( ...@@ -991,6 +1550,7 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::pull_sparse(
total_device * sizeof(int), total_device * sizeof(int),
stream); stream);
if (!FLAGS_gpugraph_enable_gpu_direct_access) {
for (int i = 0; i < total_device; ++i) { for (int i = 0; i < total_device; ++i) {
int shard_len = h_right[i] - h_left[i] + 1; int shard_len = h_right[i] - h_left[i] + 1;
if (h_left[i] == -1 || h_right[i] == -1) { if (h_left[i] == -1 || h_right[i] == -1) {
...@@ -1000,20 +1560,31 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::pull_sparse( ...@@ -1000,20 +1560,31 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::pull_sparse(
num, i, shard_len * sizeof(KeyType), shard_len * val_type_size); num, i, shard_len * sizeof(KeyType), shard_len * val_type_size);
} }
walk_to_dest(num, total_device, h_left, h_right, d_shard_keys_ptr, NULL); walk_to_dest(num, total_device, h_left, h_right, d_shard_keys_ptr, NULL);
}
for (int i = 0; i < total_device; ++i) { for (int i = 0; i < total_device; ++i) {
if (h_left[i] == -1) { if (h_left[i] == -1) {
continue; continue;
} }
auto& node = path_[num][i].nodes_.back(); auto& node = path_[num][i].nodes_.back();
if (!FLAGS_gpugraph_enable_gpu_direct_access) {
sync_stream(node.in_stream); sync_stream(node.in_stream);
}
AnyDeviceGuard guard(resource_->dev_id(i)); AnyDeviceGuard guard(resource_->dev_id(i));
ptr_tables_[i]->rwlock_->RDLock(); ptr_tables_[i]->rwlock_->RDLock();
if (!FLAGS_gpugraph_enable_gpu_direct_access) {
ptr_tables_[i]->get(reinterpret_cast<KeyType*>(node.key_storage), ptr_tables_[i]->get(reinterpret_cast<KeyType*>(node.key_storage),
node.val_storage, node.val_storage,
h_right[i] - h_left[i] + 1, h_right[i] - h_left[i] + 1,
resource_->remote_stream(i, num), resource_->remote_stream(i, num),
feature_value_accessor_); gpu_accessor_);
} else {
ptr_tables_[i]->get(
d_shard_keys_ptr + h_left[i],
reinterpret_cast<char*>(d_shard_vals_ptr) + h_left[i] * val_type_size,
h_right[i] - h_left[i] + 1,
resource_->remote_stream(i, num),
gpu_accessor_);
}
} }
for (int i = 0; i < total_device; ++i) { for (int i = 0; i < total_device; ++i) {
...@@ -1023,6 +1594,7 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::pull_sparse( ...@@ -1023,6 +1594,7 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::pull_sparse(
} }
ptr_tables_[i]->rwlock_->UNLock(); ptr_tables_[i]->rwlock_->UNLock();
} }
if (!FLAGS_gpugraph_enable_gpu_direct_access) {
walk_to_src(num, walk_to_src(num,
total_device, total_device,
h_left, h_left,
...@@ -1033,31 +1605,45 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::pull_sparse( ...@@ -1033,31 +1605,45 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::pull_sparse(
auto& node = path_[num][i].nodes_.front(); auto& node = path_[num][i].nodes_.front();
sync_stream(node.out_stream); sync_stream(node.out_stream);
} }
heter_comm_kernel_->dy_mf_fill_dvals(d_shard_vals_ptr, }
d_vals, heter_comm_kernel_->dy_mf_fill_dvals(
d_idx_ptr, d_shard_vals_ptr, d_vals, d_idx_ptr, len, val_type_size, stream);
len,
val_type_size,
stream,
feature_value_accessor_);
sync_stream(stream); sync_stream(stream);
if (!FLAGS_gpugraph_enable_gpu_direct_access) {
for (int i = 0; i < total_device; ++i) { for (int i = 0; i < total_device; ++i) {
if (h_left[i] == -1 || h_right[i] == -1) { if (h_left[i] == -1 || h_right[i] == -1) {
continue; continue;
} }
destroy_storage(num, i); destroy_storage(num, i);
} }
}
}
// Public pull entry point: selects the pull strategy for `len` keys on device
// slot `num` and writes the pulled values to `d_vals`.
//
// An empty request returns immediately. When
// FLAGS_gpugraph_dedup_pull_push_mode is set the keys go straight to
// pull_normal_sparse(); otherwise pull_merge_sparse() is used.
template <typename KeyType,
          typename ValType,
          typename GradType,
          typename GPUAccessor>
void HeterComm<KeyType, ValType, GradType, GPUAccessor>::pull_sparse(
    int num, KeyType* d_keys, float* d_vals, size_t len) {
  if (len == 0) {
    return;
  }
  if (FLAGS_gpugraph_dedup_pull_push_mode) {
    pull_normal_sparse(num, d_keys, d_vals, len);
    return;
  }
  pull_merge_sparse(num, d_keys, d_vals, len);
}
#if defined(PADDLE_WITH_CUDA) #if defined(PADDLE_WITH_CUDA)
template <typename KeyType, template <typename KeyType,
typename ValType, typename ValType,
typename GradType, typename GradType,
typename FVAccessor> typename GPUAccessor>
template <typename Sgd> template <typename Sgd>
void HeterComm<KeyType, ValType, GradType, FVAccessor>::push_sparse( void HeterComm<KeyType, ValType, GradType, GPUAccessor>::push_sparse(
int dev_num, int dev_num,
KeyType* d_keys, KeyType* d_keys,
float* d_grads, float* d_grads,
...@@ -1071,7 +1657,7 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::push_sparse( ...@@ -1071,7 +1657,7 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::push_sparse(
int dev_id = resource_->dev_id(dev_num); int dev_id = resource_->dev_id(dev_num);
auto accessor_wrapper_ptr = auto accessor_wrapper_ptr =
GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); GlobalAccessorFactory::GetInstance().GetAccessorWrapper();
size_t grad_value_size = accessor_wrapper_ptr->GetPushValueSize(max_mf_dim_); size_t grad_value_size = accessor_wrapper_ptr->GetPushValueSize(max_mf_dim_);
DevPlace place = DevPlace(dev_id); DevPlace place = DevPlace(dev_id);
AnyDeviceGuard guard(dev_id); AnyDeviceGuard guard(dev_id);
...@@ -1116,13 +1702,30 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::push_sparse( ...@@ -1116,13 +1702,30 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::push_sparse(
auto d_shard_keys = memory::Alloc(place, len * sizeof(KeyType)); auto d_shard_keys = memory::Alloc(place, len * sizeof(KeyType));
KeyType* d_shard_keys_ptr = reinterpret_cast<KeyType*>(d_shard_keys->ptr()); KeyType* d_shard_keys_ptr = reinterpret_cast<KeyType*>(d_shard_keys->ptr());
float* d_shard_grads_ptr;
auto d_shard_grads = memory::Alloc(place, len * grad_value_size); auto d_shard_grads = memory::Alloc(place, len * grad_value_size);
float* d_shard_grads_ptr = reinterpret_cast<float*>(d_shard_grads->ptr()); d_shard_grads_ptr = reinterpret_cast<float*>(d_shard_grads->ptr());
int uniq_len = len; int uniq_len = len;
dynamic_merge_grad(dev_num, d_keys, d_grads, len, uniq_len); if (!FLAGS_gpugraph_dedup_pull_push_mode) {
size_t segment_len = 0;
int grid_size = (uniq_len - 1) / block_size_ + 1; if (FLAGS_gpugraph_enable_segment_merge_grads) {
// do two gradient merge
// 1st. do segmented gradient merge
// 2nd. do global gradient merge
dynamic_merge_grad(
dev_num, d_keys, d_grads, len, uniq_len, segment_len, true);
len = segment_len;
uniq_len = 0;
segment_len = 0;
dynamic_merge_grad(
dev_num, d_keys, d_grads, len, uniq_len, segment_len, false);
} else {
// Perform gradient merge only once
dynamic_merge_grad(
dev_num, d_keys, d_grads, len, uniq_len, segment_len, false);
}
}
split_input_to_shard( split_input_to_shard(
d_keys, d_idx_ptr, uniq_len, d_left_ptr, d_right_ptr, dev_num); d_keys, d_idx_ptr, uniq_len, d_left_ptr, d_right_ptr, dev_num);
...@@ -1135,7 +1738,7 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::push_sparse( ...@@ -1135,7 +1738,7 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::push_sparse(
uniq_len, uniq_len,
grad_value_size, grad_value_size,
stream, stream,
feature_value_accessor_); gpu_accessor_);
sync_stream(stream); sync_stream(stream);
...@@ -1154,6 +1757,7 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::push_sparse( ...@@ -1154,6 +1757,7 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::push_sparse(
total_device * sizeof(int), total_device * sizeof(int),
stream); stream);
if (!FLAGS_gpugraph_enable_gpu_direct_access) {
for (int i = 0; i < total_device; ++i) { for (int i = 0; i < total_device; ++i) {
int shard_len = h_right[i] - h_left[i] + 1; int shard_len = h_right[i] - h_left[i] + 1;
if (h_left[i] == -1 || h_right[i] == -1) { if (h_left[i] == -1 || h_right[i] == -1) {
...@@ -1170,21 +1774,33 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::push_sparse( ...@@ -1170,21 +1774,33 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::push_sparse(
d_shard_keys_ptr, d_shard_keys_ptr,
reinterpret_cast<char*>(d_shard_grads_ptr), reinterpret_cast<char*>(d_shard_grads_ptr),
grad_value_size); grad_value_size);
}
for (int i = 0; i < total_device; ++i) { for (int i = 0; i < total_device; ++i) {
if (h_left[i] == -1 || h_right[i] == -1) { if (h_left[i] == -1 || h_right[i] == -1) {
continue; continue;
} }
auto& node = path_[dev_num][i].nodes_.back(); auto& node = path_[dev_num][i].nodes_.back();
if (!FLAGS_gpugraph_enable_gpu_direct_access) {
sync_stream(node.in_stream); sync_stream(node.in_stream);
}
AnyDeviceGuard guard(resource_->dev_id(i)); AnyDeviceGuard guard(resource_->dev_id(i));
ptr_tables_[i]->rwlock_->WRLock(); ptr_tables_[i]->rwlock_->WRLock();
if (!FLAGS_gpugraph_enable_gpu_direct_access) {
ptr_tables_[i]->update(reinterpret_cast<KeyType*>(node.key_storage), ptr_tables_[i]->update(reinterpret_cast<KeyType*>(node.key_storage),
node.val_storage, node.val_storage,
h_right[i] - h_left[i] + 1, h_right[i] - h_left[i] + 1,
sgd, sgd,
resource_->remote_stream(i, dev_num)); resource_->remote_stream(i, dev_num));
} else {
ptr_tables_[i]->update(d_shard_keys_ptr + h_left[i],
reinterpret_cast<char*>(d_shard_grads_ptr) +
grad_value_size * h_left[i],
h_right[i] - h_left[i] + 1,
sgd,
resource_->remote_stream(i, dev_num));
}
} }
for (int i = 0; i < total_device; ++i) { for (int i = 0; i < total_device; ++i) {
...@@ -1198,20 +1814,22 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::push_sparse( ...@@ -1198,20 +1814,22 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::push_sparse(
} }
} }
if (!FLAGS_gpugraph_enable_gpu_direct_access) {
for (int i = 0; i < total_device; ++i) { for (int i = 0; i < total_device; ++i) {
if (h_left[i] == -1 || h_right[i] == -1) { if (h_left[i] == -1 || h_right[i] == -1) {
continue; continue;
} }
destroy_storage(dev_num, i); destroy_storage(dev_num, i);
} }
}
} }
#elif defined(PADDLE_WITH_XPU_KP) #elif defined(PADDLE_WITH_XPU_KP)
template <typename KeyType, template <typename KeyType,
typename ValType, typename ValType,
typename GradType, typename GradType,
typename FVAccessor> typename GPUAccessor>
void HeterComm<KeyType, ValType, GradType, FVAccessor>::push_sparse( void HeterComm<KeyType, ValType, GradType, GPUAccessor>::push_sparse(
int dev_num, KeyType* d_keys, GradType* d_grads, size_t len) { int dev_num, KeyType* d_keys, GradType* d_grads, size_t len) {
if (len == 0) { if (len == 0) {
return; return;
...@@ -1269,8 +1887,6 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::push_sparse( ...@@ -1269,8 +1887,6 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::push_sparse(
int uniq_len = len; int uniq_len = len;
merge_grad(dev_num, d_keys, d_grads, len, uniq_len); merge_grad(dev_num, d_keys, d_grads, len, uniq_len);
// int grid_size = (uniq_len - 1) / block_size_ + 1;
split_input_to_shard( split_input_to_shard(
d_keys, d_idx_ptr, uniq_len, d_left_ptr, d_right_ptr, dev_num); d_keys, d_idx_ptr, uniq_len, d_left_ptr, d_right_ptr, dev_num);
...@@ -1351,9 +1967,9 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::push_sparse( ...@@ -1351,9 +1967,9 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::push_sparse(
template <typename KeyType, template <typename KeyType,
typename ValType, typename ValType,
typename GradType, typename GradType,
typename FVAccessor> typename GPUAccessor>
template <typename Sgd> template <typename Sgd>
void HeterComm<KeyType, ValType, GradType, FVAccessor>::update_one_table( void HeterComm<KeyType, ValType, GradType, GPUAccessor>::update_one_table(
int gpu_num, int gpu_num,
KeyType* d_keys, KeyType* d_keys,
GradType* d_grads, GradType* d_grads,
...@@ -1375,9 +1991,9 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::update_one_table( ...@@ -1375,9 +1991,9 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::update_one_table(
template <typename KeyType, template <typename KeyType,
typename ValType, typename ValType,
typename GradType, typename GradType,
typename FVAccessor> typename GPUAccessor>
template <typename Sgd> template <typename Sgd>
void HeterComm<KeyType, ValType, GradType, FVAccessor>::push_sparse_multi_node( void HeterComm<KeyType, ValType, GradType, GPUAccessor>::push_sparse_multi_node(
int gpu_num, int gpu_num,
KeyType* d_keys, KeyType* d_keys,
GradType* d_grads, GradType* d_grads,
...@@ -1407,8 +2023,8 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::push_sparse_multi_node( ...@@ -1407,8 +2023,8 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::push_sparse_multi_node(
template <typename KeyType, template <typename KeyType,
typename ValType, typename ValType,
typename GradType, typename GradType,
typename FVAccessor> typename GPUAccessor>
int HeterComm<KeyType, ValType, GradType, FVAccessor>::gather_one_node_grad( int HeterComm<KeyType, ValType, GradType, GPUAccessor>::gather_one_node_grad(
int gpu_num, KeyType* d_keys, GradType* d_grads, int len) { int gpu_num, KeyType* d_keys, GradType* d_grads, int len) {
int total_gpu = resource_->total_device(); int total_gpu = resource_->total_device();
int dev_id = resource_->dev_id(gpu_num); int dev_id = resource_->dev_id(gpu_num);
...@@ -1493,7 +2109,6 @@ int HeterComm<KeyType, ValType, GradType, FVAccessor>::gather_one_node_grad( ...@@ -1493,7 +2109,6 @@ int HeterComm<KeyType, ValType, GradType, FVAccessor>::gather_one_node_grad(
cudaMemcpy( cudaMemcpy(
h_right, d_right_ptr, total_gpu * sizeof(int), cudaMemcpyDeviceToHost); h_right, d_right_ptr, total_gpu * sizeof(int), cudaMemcpyDeviceToHost);
// int grid_size = (h_node_len[i] - 1) / block_size_ + 1;
heter_comm_kernel_->fill_shard_grads(storage.local_keys + merge_num, heter_comm_kernel_->fill_shard_grads(storage.local_keys + merge_num,
storage.all_keys + index, storage.all_keys + index,
storage.local_grads + merge_num, storage.local_grads + merge_num,
...@@ -1512,8 +2127,8 @@ int HeterComm<KeyType, ValType, GradType, FVAccessor>::gather_one_node_grad( ...@@ -1512,8 +2127,8 @@ int HeterComm<KeyType, ValType, GradType, FVAccessor>::gather_one_node_grad(
template <typename KeyType, template <typename KeyType,
typename ValType, typename ValType,
typename GradType, typename GradType,
typename FVAccessor> typename GPUAccessor>
int HeterComm<KeyType, ValType, GradType, FVAccessor>::gather_multi_node_grad( int HeterComm<KeyType, ValType, GradType, GPUAccessor>::gather_multi_node_grad(
int gpu_num, KeyType* d_keys, GradType* d_grads, int len) { int gpu_num, KeyType* d_keys, GradType* d_grads, int len) {
int dev_id = resource_->dev_id(gpu_num); int dev_id = resource_->dev_id(gpu_num);
auto& storage = storage_[gpu_num]; auto& storage = storage_[gpu_num];
...@@ -1586,8 +2201,8 @@ int HeterComm<KeyType, ValType, GradType, FVAccessor>::gather_multi_node_grad( ...@@ -1586,8 +2201,8 @@ int HeterComm<KeyType, ValType, GradType, FVAccessor>::gather_multi_node_grad(
template <typename KeyType, template <typename KeyType,
typename ValType, typename ValType,
typename GradType, typename GradType,
typename FVAccessor> typename GPUAccessor>
void HeterComm<KeyType, ValType, GradType, FVAccessor>::end_pass() { void HeterComm<KeyType, ValType, GradType, GPUAccessor>::end_pass() {
int total_device = resource_->total_device(); int total_device = resource_->total_device();
std::vector<std::thread> threads; std::vector<std::thread> threads;
...@@ -1608,10 +2223,127 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::end_pass() { ...@@ -1608,10 +2223,127 @@ void HeterComm<KeyType, ValType, GradType, FVAccessor>::end_pass() {
} }
} }
#if defined(PADDLE_WITH_CUDA)
// Deduplicates `total_fea_num` keys on device slot `gpu_id` and builds the
// index needed to map every input position back to its merged key.
//
// Pipeline (all work is queued on the slot's local stream 0):
//   1. fill_idx() initialises an index array of total_fea_num entries.
//   2. Radix-sort (key, index) pairs -> d_sorted_keys / d_sorted_idx.
//   3. Run-length encode the sorted keys -> d_merged_keys (one per run),
//      d_merged_cnts (run lengths) and the run count.
//   4. Exclusive prefix sum of the run lengths -> d_offset.
//   5. fill_restore_idx() writes d_restore_idx from the arrays above.
//
// Params:
//   gpu_id        - device slot index (translated via resource_->dev_id).
//   total_fea_num - number of input keys; must be > 0 (asserted).
//   d_keys        - input keys (device).
//   d_merged_keys - output: one key per run (device).
//   d_sorted_keys - output: keys in sorted order (device).
//   d_restore_idx - output: restore index written by fill_restore_idx.
//   d_sorted_idx  - output: original positions in sorted-key order.
//   d_offset      - output: exclusive prefix sum of d_merged_cnts.
//   d_merged_cnts - output: occurrences of each merged key.
//   filter_zero   - forwarded to fill_restore_idx; when true d_restore_idx is
//                   zero-filled first.
// Returns the number of merged (distinct) keys.
template <typename KeyType,
          typename ValType,
          typename GradType,
          typename GPUAccessor>
int HeterComm<KeyType, ValType, GradType, GPUAccessor>::dedup_keys_and_fillidx(
    const int gpu_id,
    const int total_fea_num,
    const KeyType* d_keys,   // input
    KeyType* d_merged_keys,  // output
    KeyType* d_sorted_keys,
    uint32_t* d_restore_idx,
    uint32_t* d_sorted_idx,
    uint32_t* d_offset,
    uint32_t* d_merged_cnts,
    bool filter_zero) {
  int dev_id = resource_->dev_id(gpu_id);
  platform::CUDAPlace place = platform::CUDAPlace(dev_id);
  platform::CUDADeviceGuard guard(dev_id);
  auto stream = resource_->local_stream(gpu_id, 0);

  assert(total_fea_num > 0);
  int merged_size = 0;
  // One extra uint32_t slot at the end holds the device-side run count.
  size_t byte_size = sizeof(uint32_t) * (total_fea_num + 1);

  auto d_index_ptr = memory::Alloc(place, byte_size);
  uint32_t* d_index_in = reinterpret_cast<uint32_t*>(d_index_ptr->ptr());
  int* d_merged_size = reinterpret_cast<int*>(&d_index_in[total_fea_num]);

  heter_comm_kernel_->fill_idx(d_index_in, total_fea_num, stream);

  void* d_buf = nullptr;
  size_t temp_storage_bytes = 0;
  // CUB two-phase call: a null temp buffer only reports the required size.
  PADDLE_ENFORCE_GPU_SUCCESS(
      cub::DeviceRadixSort::SortPairs(nullptr,
                                      temp_storage_bytes,
                                      d_keys,
                                      d_sorted_keys,
                                      d_index_in,
                                      d_sorted_idx,
                                      total_fea_num,
                                      0,
                                      8 * sizeof(KeyType),
                                      stream,
                                      false));
  auto d_cache_ptr = memory::Alloc(place, temp_storage_bytes);
  d_buf = reinterpret_cast<int*>(d_cache_ptr->ptr());
  PADDLE_ENFORCE_GPU_SUCCESS(
      cub::DeviceRadixSort::SortPairs(d_buf,
                                      temp_storage_bytes,
                                      d_keys,
                                      d_sorted_keys,
                                      d_index_in,
                                      d_sorted_idx,
                                      total_fea_num,
                                      0,
                                      8 * sizeof(KeyType),
                                      stream,
                                      false));

  // Size query for the run-length encode.
  PADDLE_ENFORCE_GPU_SUCCESS(
      cub::DeviceRunLengthEncode::Encode(nullptr,
                                         temp_storage_bytes,
                                         d_sorted_keys,
                                         d_merged_keys,
                                         d_merged_cnts,
                                         d_merged_size,
                                         total_fea_num,
                                         stream));
  // Reuse the scratch buffer when it is large enough; otherwise drop it
  // before allocating the bigger one.
  if (d_cache_ptr->size() < temp_storage_bytes) {
    d_cache_ptr = nullptr;
    d_cache_ptr = memory::Alloc(place, temp_storage_bytes);
  }
  d_buf = reinterpret_cast<int*>(d_cache_ptr->ptr());
  PADDLE_ENFORCE_GPU_SUCCESS(
      cub::DeviceRunLengthEncode::Encode(d_buf,
                                         temp_storage_bytes,
                                         d_sorted_keys,
                                         d_merged_keys,
                                         d_merged_cnts,
                                         d_merged_size,
                                         total_fea_num,
                                         stream));

  // The run count is needed on the host to size the prefix sum below.
  PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(&merged_size,
                                             d_merged_size,
                                             sizeof(int),
                                             cudaMemcpyDeviceToHost,
                                             stream));
  PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream));

  // Size query, then the actual exclusive prefix sum over the run lengths.
  PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceScan::ExclusiveSum(
      nullptr, temp_storage_bytes, d_merged_cnts, d_offset, merged_size, stream));
  if (d_cache_ptr->size() < temp_storage_bytes) {
    d_cache_ptr = nullptr;
    d_cache_ptr = memory::Alloc(place, temp_storage_bytes);
  }
  d_buf = reinterpret_cast<int*>(d_cache_ptr->ptr());
  PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceScan::ExclusiveSum(
      d_buf, temp_storage_bytes, d_merged_cnts, d_offset, merged_size, stream));

  if (filter_zero) {
    // Checked like every other CUDA call in this function.
    PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(
        d_restore_idx, 0, total_fea_num * sizeof(uint32_t), stream));
  }
  // fill restore idx [1,3,5,2,4,6] = [1,2,1,3,2,1]
  heter_comm_kernel_->fill_restore_idx(filter_zero,
                                       total_fea_num,
                                       merged_size,
                                       d_merged_keys,
                                       d_sorted_idx,
                                       d_offset,
                                       d_merged_cnts,
                                       d_restore_idx,
                                       stream);
  PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream));

  return merged_size;
}
#endif
// template <typename KeyType, typename ValType, typename GradType>
// void HeterComm<KeyType, ValType, GradType>::dump_to_cpu(int index) {
// auto stream = resource_->local_stream(index, 0); // auto stream = resource_->local_stream(index, 0);
// int dev_id = resource_->dev_id(index); // int dev_id = resource_->dev_id(index);
// platform::CUDADeviceGuard guard(dev_id); // platform::CUDADeviceGuard guard(dev_id);
......
...@@ -128,69 +128,177 @@ __global__ void fill_dvals_kernel(ValType* d_shard_vals, ...@@ -128,69 +128,177 @@ __global__ void fill_dvals_kernel(ValType* d_shard_vals,
} }
} }
template <typename KeyType, typename T, typename FVAccessor> template <typename KeyType, typename GPUAccessor>
__global__ void dy_mf_fill_shard_grads_kernel( __global__ void merge_gradients_basic_kernel(const KeyType* d_keys,
KeyType* d_shard_keys, const uint32_t* offset,
KeyType* d_keys, const uint32_t* fea_num,
float* d_shard_grads, const uint32_t* index,
float* d_grads, const char* input,
T* idx, char* output,
size_t len, int n,
size_t grad_value_size, size_t grad_value_size,
FVAccessor feature_value_accessor) { DynamicGradMerger& merger,
GPUAccessor& gpu_accessor) {
const size_t i = blockIdx.x * blockDim.x + threadIdx.x; const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < len) {
d_shard_keys[i] = d_keys[idx[i]];
float* cur = (float*)((char*)d_shard_grads + i * grad_value_size);
float* shard_val =
(float*)((char*)d_grads + uint64_t(idx[i]) * grad_value_size);
feature_value_accessor.PushValueFill(cur, shard_val); if (i < n) {
uint32_t start = offset[i];
uint32_t num = fea_num[i];
int ori_index = index[start];
float* out = (float*)(output + i * grad_value_size);
float* in = (float*)(input + size_t(ori_index) * grad_value_size);
merger.update_basic(out, in, gpu_accessor);
KeyType key = d_keys[i];
if (key != 0) {
for (int j = 1; j < num; ++j) {
ori_index = index[start + j];
in = (float*)(input + size_t(ori_index) * grad_value_size);
merger.merge_basic(out, in, gpu_accessor);
}
}
} }
} }
template <typename FVAccessor> template <typename KeyType, typename GPUAccessor>
__global__ void merge_gradients_kernel(const uint32_t* offset, __global__ void merge_gradients_embedx_kernel(const KeyType* d_keys,
const uint32_t* offset,
const uint32_t* fea_num, const uint32_t* fea_num,
const uint32_t* index, const uint32_t* index,
const char* input, const char* input,
char* output, char* output,
int n, int n,
size_t grad_dim,
size_t grad_value_size, size_t grad_value_size,
DynamicGradMerger& merger, DynamicGradMerger& merger,
FVAccessor& feature_value_accessor) { GPUAccessor& gpu_accessor) {
const size_t i = blockIdx.x * blockDim.x + threadIdx.x; const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < n) { if (i < n) {
uint32_t start = offset[i]; size_t value_idx = i / grad_dim;
uint32_t num = fea_num[i]; size_t field_idx = i % grad_dim;
uint32_t start = offset[value_idx];
uint32_t num = fea_num[value_idx];
int ori_index = index[start]; int ori_index = index[start];
float* out = (float*)(output + i * grad_value_size);
float* in = (float*)(input + size_t(ori_index) * grad_value_size); float* in = (float*)(input + size_t(ori_index) * grad_value_size);
merger.update_one(out, in, feature_value_accessor); float* out = (float*)(output + value_idx * grad_value_size);
merger.update_embedx(out, in, field_idx, gpu_accessor);
KeyType key = d_keys[value_idx];
if (key != 0) {
for (int j = 1; j < num; ++j) { for (int j = 1; j < num; ++j) {
ori_index = index[start + j]; int ori_index = index[start + j];
in = (float*)(input + size_t(ori_index) * grad_value_size); float* in = (float*)(input + size_t(ori_index) * grad_value_size);
merger.merge_one(out, in, feature_value_accessor); merger.merge_embedx(out, in, field_idx, gpu_accessor);
}
} }
} }
} }
template <typename T, typename FVAccessor> __global__ void split_segments_kernel(const uint32_t* d_fea_num_info,
__global__ void dy_mf_fill_dvals_kernel(float* d_shard_vals, size_t n,
float* d_vals, uint32_t* d_segments,
uint32_t* d_segments_num,
uint32_t segment_size) {
const size_t tx = blockIdx.x * blockDim.x + threadIdx.x;
if (tx >= n) {
return;
}
auto fea_num = d_fea_num_info[tx];
auto seg_num = (uint32_t)((fea_num - 1) / segment_size + 1);
d_segments[tx] = seg_num;
}
// CUDA kernel, one thread per input entry tx < n.
//
// Splits the count d_fea_num_info[tx] into chunks of at most `segment_size`
// and writes the chunk sizes consecutively into d_segments_fea_num_info,
// starting at index d_segments_offset[tx]. Every chunk is `segment_size`
// except possibly the last, which carries the remainder.
__global__ void expand_segments_kernel(const uint32_t* d_fea_num_info,
                                       const uint32_t* d_segments_offset,
                                       size_t n,
                                       uint32_t* d_segments_fea_num_info,
                                       uint32_t segment_size) {
  const size_t tx = blockIdx.x * blockDim.x + threadIdx.x;
  if (tx < n) {
    const uint32_t total = d_fea_num_info[tx];
    // ceil(total / segment_size)
    const uint32_t segment_count = (total - 1) / segment_size + 1;
    uint32_t* out = d_segments_fea_num_info + d_segments_offset[tx];

    uint32_t left = total;
    for (size_t s = 0; s < segment_count; ++s) {
      const uint32_t chunk = (left < segment_size) ? left : segment_size;
      out[s] = chunk;
      left -= chunk;
    }
  }
}
// CUDA kernel, one thread per output entry tx < n.
//
// Writes to d_segments_keys[tx] the key found in d_keys at position
// d_segments_offset[tx].
template <typename KeyType>
__global__ void shrink_keys_kernel(const KeyType* d_keys,
                                   const uint32_t* d_segments_offset,
                                   KeyType* d_segments_keys,
                                   size_t n) {
  const size_t tx = blockIdx.x * blockDim.x + threadIdx.x;
  if (tx < n) {
    const uint32_t src_pos = d_segments_offset[tx];
    d_segments_keys[tx] = d_keys[src_pos];
  }
}
// CUDA kernel, one thread per output entry tx < n.
//
// Copies one value record of `val_size` bytes from the merged value buffer
// into slot tx of d_out. The source record is d_restored_idx[tx] for non-zero
// keys; a key equal to 0 always reads record 0 and d_restored_idx is not
// consulted. The copy proceeds float by float, so only
// val_size / sizeof(float) whole floats are transferred.
template <typename KeyType>
__global__ void unpack_merged_vals_kernel(const KeyType* d_keys,
                                          const float* d_merged_vals,
                                          const uint32_t* d_restored_idx,
                                          float* d_out,
                                          size_t val_size,
                                          const size_t n) {
  const size_t tx = blockIdx.x * blockDim.x + threadIdx.x;
  if (tx < n) {
    const size_t merged_row = (d_keys[tx] != 0) ? d_restored_idx[tx] : 0;

    // Records are addressed in bytes because val_size is a byte count.
    const char* src_bytes = reinterpret_cast<const char*>(d_merged_vals) +
                            uint64_t(merged_row) * val_size;
    char* dst_bytes = reinterpret_cast<char*>(d_out) + uint64_t(tx) * val_size;

    const float* src = reinterpret_cast<const float*>(src_bytes);
    float* dst = reinterpret_cast<float*>(dst_bytes);

    const size_t float_count = val_size / sizeof(float);
    for (size_t k = 0; k < float_count; ++k) {
      dst[k] = src[k];
    }
  }
}
template <typename TUnit, typename T>
__global__ void scatter_dvals_by_unit_kernel(TUnit* d_dest_vals,
const TUnit* d_src_vals,
T* idx, T* idx,
size_t len, size_t len,
size_t val_size, size_t val_size_unit) {
FVAccessor feature_value_accessor) {
const size_t i = blockIdx.x * blockDim.x + threadIdx.x; const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < len) { if (i < len) {
uint64_t new_offset = uint64_t(idx[i]) * val_size; size_t pos = idx[i / val_size_unit] * val_size_unit + (i % val_size_unit);
float* cur = (float*)((char*)d_vals + new_offset); d_dest_vals[i] = d_src_vals[pos];
float* shard_val = (float*)((char*)d_shard_vals + uint64_t(i) * val_size); }
int mf_dim = int( }
shard_val[feature_value_accessor.common_feature_value.MfDimIndex()]);
feature_value_accessor.FeatureValueFill(cur, shard_val, mf_dim); template <typename TUnit, typename T>
__global__ void gather_dvals_by_unit_kernel(TUnit* d_dest_vals,
const TUnit* d_src_vals,
T* idx,
size_t len,
const size_t val_size_unit) {
const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < len) {
size_t pos = idx[i / val_size_unit] * val_size_unit + (i % val_size_unit);
d_dest_vals[pos] = d_src_vals[i];
} }
} }
...@@ -325,9 +433,8 @@ void HeterCommKernel::reduce_by_key(void* d_temp_storage, ...@@ -325,9 +433,8 @@ void HeterCommKernel::reduce_by_key(void* d_temp_storage,
template <typename KeyType, template <typename KeyType,
typename T, typename T,
typename StreamType, typename StreamType,
typename FVAccessor> typename GPUAccessor>
void HeterCommKernel::dy_mf_fill_shard_grads( void HeterCommKernel::dy_mf_fill_shard_grads(KeyType* d_shard_keys,
KeyType* d_shard_keys,
KeyType* d_keys, KeyType* d_keys,
float* d_shard_grads, float* d_shard_grads,
float* d_grads, float* d_grads,
...@@ -335,33 +442,38 @@ void HeterCommKernel::dy_mf_fill_shard_grads( ...@@ -335,33 +442,38 @@ void HeterCommKernel::dy_mf_fill_shard_grads(
long long len, long long len,
size_t grad_value_size, size_t grad_value_size,
const StreamType& stream, const StreamType& stream,
FVAccessor& feature_value_accessor) { GPUAccessor& gpu_accessor) {
int grid_size = (len - 1) / block_size_ + 1; int grid_size = (len - 1) / block_size_ + 1;
size_t c_len = (size_t)len; size_t c_len = (size_t)len;
dy_mf_fill_shard_grads_kernel<<<grid_size, block_size_, 0, stream>>>(
d_shard_keys, const size_t grad_value_size_float = grad_value_size / sizeof(float);
d_keys, // d_keys to d_shard_keys
d_shard_grads, fill_shard_key_kernel<<<grid_size, block_size_, 0, stream>>>(
d_grads, d_shard_keys, d_keys, idx, c_len);
idx,
c_len, CHECK((grad_value_size % sizeof(float)) == 0);
grad_value_size, size_t N = len * grad_value_size_float;
feature_value_accessor); grid_size = (N - 1) / block_size_ + 1;
scatter_dvals_by_unit_kernel<<<grid_size, block_size_, 0, stream>>>(
d_shard_grads, d_grads, idx, N, grad_value_size_float);
} }
template <typename StreamType, typename FVAccessor> template <typename KeyType, typename StreamType, typename GPUAccessor>
void HeterCommKernel::merge_gradient(const uint32_t* offset, void HeterCommKernel::merge_gradient(const KeyType* d_keys,
const uint32_t* offset,
const uint32_t* fea_num, const uint32_t* fea_num,
const uint32_t* index, const uint32_t* index,
const char* input, const char* input,
char* output, char* output,
int n, int n,
size_t grad_dim,
size_t grad_value_size, size_t grad_value_size,
DynamicGradMerger& merger_, DynamicGradMerger& merger,
const StreamType& stream, const StreamType& stream,
FVAccessor& feature_value_accessor) { GPUAccessor& gpu_accessor) {
int grid_size = (n - 1) / block_size_ + 1; int grid_size1 = (n - 1) / block_size_ + 1;
merge_gradients_kernel<<<grid_size, block_size_, 0, stream>>>( merge_gradients_basic_kernel<<<grid_size1, block_size_, 0, stream>>>(
d_keys,
offset, offset,
fea_num, fea_num,
index, index,
...@@ -369,22 +481,189 @@ void HeterCommKernel::merge_gradient(const uint32_t* offset, ...@@ -369,22 +481,189 @@ void HeterCommKernel::merge_gradient(const uint32_t* offset,
output, output,
n, n,
grad_value_size, grad_value_size,
merger_, merger,
feature_value_accessor); gpu_accessor);
if (grad_dim > 0) {
int grid_size2 = (n * grad_dim - 1) / block_size_ + 1;
merge_gradients_embedx_kernel<<<grid_size2, block_size_, 0, stream>>>(
d_keys,
offset,
fea_num,
index,
input,
output,
n * grad_dim,
grad_dim,
grad_value_size,
merger,
gpu_accessor);
}
} }
template <typename T, typename StreamType, typename FVAccessor> template <typename T, typename StreamType>
void HeterCommKernel::dy_mf_fill_dvals(float* d_shard_vals, void HeterCommKernel::dy_mf_fill_dvals(float* d_shard_vals,
float* d_vals, float* d_vals,
T* idx, T* idx,
long long len, long long len,
size_t val_size, size_t val_size,
const StreamType& stream, const StreamType& stream) {
FVAccessor& feature_value_accessor) { const size_t val_size_float = val_size / sizeof(float);
int grid_size = (len - 1) / block_size_ + 1; CHECK((val_size % sizeof(float)) == 0);
size_t c_len = (size_t)len; size_t N = len * val_size_float;
dy_mf_fill_dvals_kernel<<<grid_size, block_size_, 0, stream>>>( const int grid_size = (N - 1) / block_size_ + 1;
d_shard_vals, d_vals, idx, c_len, val_size, feature_value_accessor); // fill by float, d_shard_vals to d_vals
gather_dvals_by_unit_kernel<<<grid_size, block_size_, 0, stream>>>(
d_vals, d_shard_vals, idx, N, val_size_float);
}
// Launches split_segments_kernel on `stream` over the `n` entries of
// d_fea_num_info. d_segments, d_segments_num and segment_size are forwarded to
// the kernel unchanged. Blocks hold block_size_ threads and the block count is
// rounded up so that at least `n` threads are started.
template <typename StreamType>
void HeterCommKernel::split_segments(const uint32_t* d_fea_num_info,
                                     size_t n,
                                     uint32_t* d_segments,
                                     uint32_t* d_segments_num,
                                     size_t segment_size,
                                     const StreamType& stream) {
  // n is unsigned, so (n - 1) wraps around for an empty input and the derived
  // grid size is not a usable launch configuration. There is nothing to
  // process in that case, so skip the launch entirely.
  if (n == 0) {
    return;
  }
  const int grid_size = (n - 1) / block_size_ + 1;
  split_segments_kernel<<<grid_size, block_size_, 0, stream>>>(
      d_fea_num_info, n, d_segments, d_segments_num, segment_size);
}
// Launches expand_segments_kernel on `stream` over `n` entries, forwarding
// d_fea_num_info, d_segments_offset, d_segments_fea_num_info and segment_size
// to the kernel unchanged. Blocks hold block_size_ threads and the block count
// is rounded up so that at least `n` threads are started.
template <typename StreamType>
void HeterCommKernel::expand_segments(const uint32_t* d_fea_num_info,
                                      const uint32_t* d_segments_offset,
                                      size_t n,
                                      uint32_t* d_segments_fea_num_info,
                                      uint32_t segment_size,
                                      const StreamType& stream) {
  // n is unsigned, so (n - 1) wraps around for an empty input and the derived
  // grid size is not a usable launch configuration. Nothing has to be
  // expanded in that case, so skip the launch entirely.
  if (n == 0) {
    return;
  }
  const int grid_size = (n - 1) / block_size_ + 1;
  expand_segments_kernel<<<grid_size, block_size_, 0, stream>>>(
      d_fea_num_info,
      d_segments_offset,
      n,
      d_segments_fea_num_info,
      segment_size);
}
// Launches shrink_keys_kernel on `stream` over `n` entries, forwarding d_keys,
// d_segments_offset and d_segments_keys to the kernel unchanged. Blocks hold
// block_size_ threads and the block count is rounded up so that at least `n`
// threads are started.
template <typename KeyType, typename StreamType>
void HeterCommKernel::shrink_keys(const KeyType* d_keys,
                                  const uint32_t* d_segments_offset,
                                  KeyType* d_segments_keys,
                                  size_t n,
                                  const StreamType& stream) {
  // n is unsigned, so (n - 1) wraps around for an empty input and the derived
  // grid size is not a usable launch configuration. There are no keys to
  // write in that case, so skip the launch entirely.
  if (n == 0) {
    return;
  }
  const int grid_size = (n - 1) / block_size_ + 1;
  shrink_keys_kernel<<<grid_size, block_size_, 0, stream>>>(
      d_keys, d_segments_offset, d_segments_keys, n);
}
// One thread per group: thread `group` (< N) reads the group's start
// d_offset[group] and its element count d_merged_cnts[group], then stores the
// group index into d_restore_idx at every position listed in
// d_sorted_idx[start .. start + count).
template <typename T>
__global__ void kernel_fill_restore_idx(const size_t N,
                                        const T* d_sorted_idx,
                                        const T* d_offset,
                                        const T* d_merged_cnts,
                                        T* d_restore_idx) {
  const size_t group = blockIdx.x * blockDim.x + threadIdx.x;
  // Threads past the last group have nothing to do.
  if (group >= N) {
    return;
  }
  const T& start = d_offset[group];
  const T& count = d_merged_cnts[group];
  size_t step = 0;
  while (step < count) {
    d_restore_idx[d_sorted_idx[start + step]] = group;
    ++step;
  }
}
// Same per-group fill as kernel_fill_restore_idx, except that a group whose
// key d_keys[group] equals 0 is skipped and leaves d_restore_idx untouched for
// its elements.
template <typename KeyType, typename T>
__global__ void kernel_fill_restore_idx_filter_zero(const size_t N,
                                                    const KeyType* d_keys,
                                                    const T* d_sorted_idx,
                                                    const T* d_offset,
                                                    const T* d_merged_cnts,
                                                    T* d_restore_idx) {
  const size_t group = blockIdx.x * blockDim.x + threadIdx.x;
  // The range check comes first so d_keys is never read out of bounds.
  if (group >= N || d_keys[group] == 0) {
    return;
  }
  const T& start = d_offset[group];
  const T& count = d_merged_cnts[group];
  size_t step = 0;
  while (step < count) {
    d_restore_idx[d_sorted_idx[start + step]] = group;
    ++step;
  }
}
// One thread per sorted element: thread `i` (< N) determines which group the
// element at sorted position `i` belongs to and stores that group index into
// d_restore_idx[d_sorted_idx[i]].
//
// The group is the largest g in [0, merge_num) with d_offset[g] <= i, located
// by binary search. This assumes d_offset is ascending with d_offset[0] == 0
// -- TODO confirm against the code that fills d_offset.
template <typename T>
__global__ void kernel_fill_restore_idx_by_search(const size_t N,
                                                  const T* d_sorted_idx,
                                                  const size_t merge_num,
                                                  const T* d_offset,
                                                  T* d_restore_idx) {
  const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i >= N) {
    return;
  }
  // With fewer than two groups every element belongs to group 0. The search
  // below never indexes d_offset beyond merge_num - 1, so d_offset[1] is
  // presumably not a valid entry when merge_num == 1; the short-circuit keeps
  // it from being read and from producing a bogus group index of 1.
  if (merge_num < 2 || i < d_offset[1]) {
    d_restore_idx[d_sorted_idx[i]] = 0;
    return;
  }
  // Invariant: the answer lies in [low, high]. size_t bounds avoid the
  // narrowing that `int` would impose on large merge_num values.
  size_t low = 1;
  size_t high = merge_num - 1;
  while (low < high) {
    const size_t mid = low + (high - low) / 2;
    if (i < d_offset[mid + 1]) {
      high = mid;
    } else {
      low = mid + 1;
    }
  }
  d_restore_idx[d_sorted_idx[i]] = low;
}
// Fills d_restore_idx so that every input position maps to the index of its
// merged group, choosing between two launch strategies:
//   * merge_size * 3 > total_num: one thread per group
//     (kernel_fill_restore_idx, or the *_filter_zero variant when filter_zero
//     is set, which skips groups whose key is 0);
//   * otherwise: one thread per input element, each locating its group with a
//     binary search over d_offset (kernel_fill_restore_idx_by_search).
// All kernels are launched on `stream` with blocks of block_size_ threads.
template <typename KeyType, typename StreamType>
void HeterCommKernel::fill_restore_idx(bool filter_zero,
                                       const size_t total_num,
                                       const size_t merge_size,
                                       const KeyType* d_keys,
                                       const uint32_t* d_sorted_idx,
                                       const uint32_t* d_offset,
                                       const uint32_t* d_merged_cnts,
                                       uint32_t* d_restore_idx,
                                       const StreamType& stream) {
  // With no input elements there is nothing to restore. Without this guard
  // the search branch would compute (total_num - 1) on an unsigned zero and
  // derive an unusable grid size from the wrapped value.
  if (total_num == 0) {
    return;
  }
  // fill restore idx [1,3,5,2,4,6] = [1,2,1,3,2,1]
  if (merge_size * 3 > total_num) {
    // repetition rate is not very high
    size_t grid_size = (merge_size - 1) / block_size_ + 1;
    if (filter_zero) {
      kernel_fill_restore_idx_filter_zero<<<grid_size,
                                            block_size_,
                                            0,
                                            stream>>>(merge_size,
                                                      d_keys,
                                                      d_sorted_idx,
                                                      d_offset,
                                                      d_merged_cnts,
                                                      d_restore_idx);
    } else {
      kernel_fill_restore_idx<<<grid_size, block_size_, 0, stream>>>(
          merge_size, d_sorted_idx, d_offset, d_merged_cnts, d_restore_idx);
    }
  } else {
    size_t grid_size = (total_num - 1) / block_size_ + 1;
    // mid search
    kernel_fill_restore_idx_by_search<<<grid_size, block_size_, 0, stream>>>(
        total_num, d_sorted_idx, merge_size, d_offset, d_restore_idx);
  }
}
template <typename KeyType, typename StreamType>
void HeterCommKernel::unpack_merged_vals(size_t n,
const KeyType* d_keys,
const void* d_merged_vals,
const uint32_t* d_restore_idx,
void* d_vals,
size_t val_size,
const StreamType& stream) {
int grid_size = (n - 1) / block_size_ + 1;
unpack_merged_vals_kernel<<<grid_size, block_size_, 0, stream>>>(
d_keys,
(const float*)d_merged_vals,
d_restore_idx,
(float*)d_vals,
val_size,
n);
} }
template void HeterCommKernel::fill_idx<int, cudaStream_t>( template void HeterCommKernel::fill_idx<int, cudaStream_t>(
...@@ -491,12 +770,11 @@ template void HeterCommKernel::reduce_by_key< ...@@ -491,12 +770,11 @@ template void HeterCommKernel::reduce_by_key<
cudaStream_t stream, cudaStream_t stream,
bool debug_synchronous); bool debug_synchronous);
template void template void HeterCommKernel::dy_mf_fill_shard_grads<
HeterCommKernel::dy_mf_fill_shard_grads<unsigned long, unsigned long,
int, int,
cudaStream_t, cudaStream_t,
CommonFeatureValueAccessor>( CommonFeatureValueAccessor>(unsigned long* d_shard_keys,
unsigned long* d_shard_keys,
unsigned long* d_keys, unsigned long* d_keys,
float* d_shard_grads, float* d_shard_grads,
float* d_grads, float* d_grads,
...@@ -504,30 +782,115 @@ HeterCommKernel::dy_mf_fill_shard_grads<unsigned long, ...@@ -504,30 +782,115 @@ HeterCommKernel::dy_mf_fill_shard_grads<unsigned long,
long long len, long long len,
size_t grad_value_size, size_t grad_value_size,
const cudaStream_t& stream, const cudaStream_t& stream,
CommonFeatureValueAccessor& feature_value_accessor); CommonFeatureValueAccessor& gpu_accessor);
template void template void HeterCommKernel::
HeterCommKernel::merge_gradient<cudaStream_t, CommonFeatureValueAccessor>( merge_gradient<uint32_t, cudaStream_t, CommonFeatureValueAccessor>(
const uint32_t* d_keys,
const uint32_t* offset, const uint32_t* offset,
const uint32_t* fea_num, const uint32_t* fea_num,
const uint32_t* index, const uint32_t* index,
const char* input, const char* input,
char* output, char* output,
int n, int n,
size_t grad_dim,
size_t grad_value_size, size_t grad_value_size,
DynamicGradMerger& merger_, DynamicGradMerger& merger_,
const cudaStream_t& stream, const cudaStream_t& stream,
CommonFeatureValueAccessor& feature_value_accessor); CommonFeatureValueAccessor& gpu_accessor);
template void HeterCommKernel:: template void HeterCommKernel::
dy_mf_fill_dvals<int, cudaStream_t, CommonFeatureValueAccessor>( merge_gradient<uint64_t, cudaStream_t, CommonFeatureValueAccessor>(
const uint64_t* d_keys,
const uint32_t* offset,
const uint32_t* fea_num,
const uint32_t* index,
const char* input,
char* output,
int n,
size_t grad_dim,
size_t grad_value_size,
DynamicGradMerger& merger_,
const cudaStream_t& stream,
CommonFeatureValueAccessor& gpu_accessor);
template void HeterCommKernel::dy_mf_fill_dvals<int, cudaStream_t>(
float* d_shard_vals, float* d_shard_vals,
float* d_vals, float* d_vals,
int* idx, int* idx,
long long len, long long len,
size_t val_size, size_t val_size,
const cudaStream_t& stream, const cudaStream_t& stream);
CommonFeatureValueAccessor& feature_value_accessor);
// Explicit instantiations of the segment/dedup helper templates of
// HeterCommKernel. All of them are instantiated for cudaStream_t; the
// key-typed helpers are instantiated once for uint32_t keys and once for
// uint64_t keys.

// split_segments / expand_segments: no key type involved.
template void HeterCommKernel::split_segments<cudaStream_t>(
const uint32_t* d_fea_num_info,
size_t n,
uint32_t* d_segment,
uint32_t* d_segments_num,
size_t segment_size,
const cudaStream_t& stream);
template void HeterCommKernel::expand_segments<cudaStream_t>(
const uint32_t* d_fea_num_info,
const uint32_t* d_segments_offset,
size_t n,
uint32_t* d_segments_fea_num_info,
uint32_t segment_size,
const cudaStream_t& stream);
// shrink_keys: uint32_t and uint64_t keys.
template void HeterCommKernel::shrink_keys<uint32_t, cudaStream_t>(
const uint32_t* d_keys,
const uint32_t* d_segments_offset,
uint32_t* d_segments_keys,
size_t segment_num,
const cudaStream_t& stream);
template void HeterCommKernel::shrink_keys<uint64_t, cudaStream_t>(
const uint64_t* d_keys,
const uint32_t* d_segments,
uint64_t* d_segments_keys,
size_t total_segment_num,
const cudaStream_t& stream);
// fill_restore_idx: uint64_t and uint32_t keys.
template void HeterCommKernel::fill_restore_idx<uint64_t, cudaStream_t>(
bool filter_zero,
const size_t total_num,
const size_t merge_size,
const uint64_t* d_keys,
const uint32_t* d_sorted_idx,
const uint32_t* d_offset,
const uint32_t* d_merged_cnts,
uint32_t* d_restore_idx,
const cudaStream_t& stream);
template void HeterCommKernel::fill_restore_idx<uint32_t, cudaStream_t>(
bool filter_zero,
const size_t total_num,
const size_t merge_size,
const uint32_t* d_keys,
const uint32_t* d_sorted_idx,
const uint32_t* d_offset,
const uint32_t* d_merged_cnts,
uint32_t* d_restore_idx,
const cudaStream_t& stream);
// unpack_merged_vals: uint64_t and uint32_t keys.
template void HeterCommKernel::unpack_merged_vals<uint64_t, cudaStream_t>(
size_t n,
const uint64_t* d_keys,
const void* d_merged_vals,
const uint32_t* d_restore_idx,
void* d_vals,
size_t val_size,
const cudaStream_t& stream);
template void HeterCommKernel::unpack_merged_vals<uint32_t, cudaStream_t>(
size_t n,
const uint32_t* d_keys,
const void* d_merged_vals,
const uint32_t* d_restore_idx,
void* d_vals,
size_t val_size,
const cudaStream_t& stream);
#endif #endif
} // namespace framework } // namespace framework
......
...@@ -41,16 +41,54 @@ struct DynamicGradMerger { ...@@ -41,16 +41,54 @@ struct DynamicGradMerger {
return out; return out;
} }
template <typename FVAccessor> template <typename GPUAccessor>
__device__ __forceinline__ void update_one( __device__ __forceinline__ void update_one(float* output,
float* output, const float* input, FVAccessor& feature_value_accessor) { const float* input,
feature_value_accessor.PushValueFill(output, input); GPUAccessor& gpu_accessor) {
gpu_accessor.PushValueFill(output, input);
} }
template <typename FVAccessor> template <typename GPUAccessor>
__device__ __forceinline__ void merge_one( __device__ __forceinline__ void merge_one(float* output,
float* output, const float* input, FVAccessor& feature_value_accessor) { const float* input,
feature_value_accessor.MergePushValue(output, input); GPUAccessor& gpu_accessor) {
gpu_accessor.MergePushValue(output, input);
}
// Delegates to fv_accessor.PushValueFillBasic(output, input).
// NOTE(review): presumably this initializes the non-embedx ("basic") part of
// the push value in `output` from `input` -- confirm against the accessor.
template <typename GPUAccessor>
__device__ __forceinline__ void update_basic(
    float* output, const float* input, GPUAccessor& fv_accessor) {
  fv_accessor.PushValueFillBasic(output, input);
}
// Delegates to fv_accessor.MergePushValueBasic(output, input).
// NOTE(review): presumably this accumulates the non-embedx ("basic") part of
// `input` into `output` -- confirm against the accessor.
template <typename GPUAccessor>
__device__ __forceinline__ void merge_basic(
    float* output, const float* input, GPUAccessor& fv_accessor) {
  fv_accessor.MergePushValueBasic(output, input);
}
// Copies a single embedx gradient component from `input` to `output`.
// The component sits at offset EmbedxGIndex() + embedx_idx in both buffers and
// is copied only while embedx_idx is below the value stored in
// output[MfDimIndex()]; otherwise `output` is left untouched.
template <typename GPUAccessor>
__device__ __forceinline__ void update_embedx(float* output,
                                              const float* input,
                                              size_t embedx_idx,
                                              GPUAccessor& fv_accessor) {
  auto& layout = fv_accessor.common_push_value;
  if (embedx_idx < output[layout.MfDimIndex()]) {
    const float* src = input + layout.EmbedxGIndex() + embedx_idx;
    float* dst = output + layout.EmbedxGIndex() + embedx_idx;
    *dst = *src;
  }
}
template <typename GPUAccessor>
__device__ __forceinline__ void merge_embedx(float* output,
const float* input,
size_t embedx_idx,
GPUAccessor& fv_accessor) {
if (embedx_idx < output[fv_accessor.common_push_value.MfDimIndex()]) {
output[fv_accessor.common_push_value.EmbedxGIndex() + embedx_idx] +=
input[fv_accessor.common_push_value.EmbedxGIndex() + embedx_idx];
}
} }
}; };
...@@ -139,7 +177,7 @@ class HeterCommKernel { ...@@ -139,7 +177,7 @@ class HeterCommKernel {
template <typename KeyType, template <typename KeyType,
typename T, typename T,
typename StreamType, typename StreamType,
typename FVAccessor> typename GPUAccessor>
void dy_mf_fill_shard_grads(KeyType* d_shard_keys, void dy_mf_fill_shard_grads(KeyType* d_shard_keys,
KeyType* d_keys, KeyType* d_keys,
float* d_shard_grads, float* d_shard_grads,
...@@ -148,28 +186,72 @@ class HeterCommKernel { ...@@ -148,28 +186,72 @@ class HeterCommKernel {
long long len, long long len,
size_t grad_value_size, size_t grad_value_size,
const StreamType& stream, const StreamType& stream,
FVAccessor& feature_value_accessor); GPUAccessor& gpu_accessor);
template <typename StreamType, typename FVAccessor> template <typename KeyType, typename StreamType, typename GPUAccessor>
void merge_gradient(const uint32_t* offset, void merge_gradient(const KeyType* d_shard_keys,
const uint32_t* offset,
const uint32_t* fea_num, const uint32_t* fea_num,
const uint32_t* index, const uint32_t* index,
const char* input, const char* input,
char* output, char* output,
int n, int n,
size_t grad_dim,
size_t grad_value_size, size_t grad_value_size,
DynamicGradMerger& merger_, DynamicGradMerger& merger,
const StreamType& stream, const StreamType& stream,
FVAccessor& feature_value_accessor); GPUAccessor& gpu_accessor);
template <typename T, typename StreamType, typename FVAccessor> template <typename T, typename StreamType>
void dy_mf_fill_dvals(float* d_shard_vals, void dy_mf_fill_dvals(float* d_shard_vals,
float* d_vals, float* d_vals,
T* idx, T* idx,
long long len, long long len,
size_t val_size, size_t val_size,
const StreamType& stream, const StreamType& stream);
FVAccessor& feature_value_accessor);
// Launches split_segments_kernel on `stream` over the `len` entries of
// d_fea_num_info; d_segments, d_segments_num and segment_size are forwarded
// to the kernel unchanged.
template <typename StreamType>
void split_segments(const uint32_t* d_fea_num_info,
size_t len,
uint32_t* d_segments,
uint32_t* d_segments_num,
size_t segment_size,
const StreamType& stream);
// Launches expand_segments_kernel on `stream` over `segments_num` entries;
// d_fea_num_info, d_segments_offset, d_segments_fea_num_info and segment_size
// are forwarded to the kernel unchanged.
template <typename StreamType>
void expand_segments(const uint32_t* d_fea_num_info,
const uint32_t* d_segments_offset,
size_t segments_num,
uint32_t* d_segments_fea_num_info,
uint32_t segment_size,
const StreamType& stream);
// Launches shrink_keys_kernel on `stream` over `segments_num` entries;
// d_keys, d_segments_offset and d_segments_keys are forwarded to the kernel
// unchanged.
template <typename KeyType, typename StreamType>
void shrink_keys(const KeyType* d_keys,
const uint32_t* d_segments_offset,
KeyType* d_segments_keys,
size_t segments_num,
const StreamType& stream);
// Fills d_restore_idx with the merged-group index of every input position.
// When merge_size * 3 > total_num one thread per group is launched (skipping
// groups whose key is 0 if filter_zero is set); otherwise one thread per input
// element is launched and each finds its group by binary search over d_offset.
template <typename KeyType, typename StreamType>
void fill_restore_idx(bool filter_zero,
const size_t total_num,
const size_t merge_size,
const KeyType* d_keys,
const uint32_t* d_sorted_idx,
const uint32_t* d_offset,
const uint32_t* d_merged_cnts,
uint32_t* d_restore_idx,
const StreamType& stream);
// Launches unpack_merged_vals_kernel on `stream` over `n` entries.
// d_merged_vals and d_vals are reinterpreted as float buffers before being
// passed to the kernel together with d_keys, d_restore_idx and val_size.
template <typename KeyType, typename StreamType>
void unpack_merged_vals(size_t n,
const KeyType* d_keys,
const void* d_merged_vals,
const uint32_t* d_restore_idx,
void* d_vals,
size_t val_size,
const StreamType& stream);
private: private:
int block_size_{256}; int block_size_{256};
......
...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/framework/fleet/heter_ps/heter_ps.h" #include "paddle/fluid/framework/fleet/heter_ps/heter_ps.h"
#include <vector> #include <vector>
#ifdef PADDLE_WITH_HETERPS #ifdef PADDLE_WITH_HETERPS
...@@ -27,55 +26,80 @@ HeterPsBase* HeterPsBase::get_instance( ...@@ -27,55 +26,80 @@ HeterPsBase* HeterPsBase::get_instance(
std::unordered_map<std::string, float> fleet_config, std::unordered_map<std::string, float> fleet_config,
std::string accessor_type, std::string accessor_type,
int optimizer_type) { int optimizer_type) {
if (accessor_type == "CtrDymfAccessor" && if (accessor_type == "CtrDymfAccessor") {
(optimizer_type == 1 || optimizer_type == 3 || optimizer_type == 4)) { auto* accessor_wrapper_ptr =
return new HeterPs<CommonFeatureValueAccessor>( GlobalAccessorFactory::GetInstance().GetAccessorWrapper();
capacity, resource, accessor_type, fleet_config, optimizer_type); CommonFeatureValueAccessor* gpu_accessor =
((AccessorWrapper<CommonFeatureValueAccessor>*)accessor_wrapper_ptr)
->AccessorPtr();
if (optimizer_type == 1) {
return new HeterPs<CommonFeatureValueAccessor, SparseAdagradOptimizer>(
capacity, resource, *gpu_accessor);
} else if (optimizer_type == 3) {
return new HeterPs<CommonFeatureValueAccessor, SparseAdamOptimizer>(
capacity, resource, *gpu_accessor);
} else if (optimizer_type == 4) {
return new HeterPs<CommonFeatureValueAccessor, SparseAdamSharedOptimizer>(
capacity, resource, *gpu_accessor);
}
} else { } else {
VLOG(0) << " HeterPsBase get_instance Warning: now only support " VLOG(0) << " HeterPsBase get_instance Warning: now only support "
"CtrDymfAccessor, but get " "CtrDymfAccessor, but get "
<< accessor_type_; << accessor_type;
return new HeterPs<CommonFeatureValueAccessor>( return new HeterPs<CommonFeatureValueAccessor, SparseAdagradOptimizer>(
capacity, resource, accessor_type, fleet_config, optimizer_type); capacity, resource, fleet_config, accessor_type, optimizer_type);
} }
} }
HeterPs::HeterPs(size_t capacity, template <typename GPUAccessor, template <typename T> class GPUOptimizer>
HeterPs<GPUAccessor, GPUOptimizer>::HeterPs(
size_t capacity,
std::shared_ptr<HeterPsResource> resource, std::shared_ptr<HeterPsResource> resource,
std::unordered_map<std::string, float> fleet_config, GPUAccessor& gpu_accessor) {
std::string accessor_type, comm_ = std::make_shared<HeterComm<FeatureKey, float*, float*, GPUAccessor>>(
int optimizer_type) {
comm_ = std::make_shared<HeterComm<FeatureKey, float*, float*, FVAccessor>>(
capacity, resource); capacity, resource);
optimizer_type_ = optimizer_type; opt_ = GPUOptimizer<GPUAccessor>(gpu_accessor);
} }
HeterPs::~HeterPs() {} template <typename GPUAccessor, template <typename T> class GPUOptimizer>
HeterPs<GPUAccessor, GPUOptimizer>::~HeterPs() {}
void HeterPs::pull_sparse(int num, template <typename GPUAccessor, template <typename T> class GPUOptimizer>
void HeterPs<GPUAccessor, GPUOptimizer>::pull_sparse(int num,
FeatureKey* d_keys, FeatureKey* d_keys,
float* d_vals, float* d_vals,
size_t len) { size_t len) {
comm_->pull_sparse(num, d_keys, d_vals, len); comm_->pull_sparse(num, d_keys, d_vals, len);
} }
int HeterPs::get_index_by_devid(int devid) { template <typename GPUAccessor, template <typename T> class GPUOptimizer>
int HeterPs<GPUAccessor, GPUOptimizer>::get_index_by_devid(int devid) {
return comm_->get_index_by_devid(devid); return comm_->get_index_by_devid(devid);
} }
void HeterPs::set_sparse_sgd(const OptimizerConfig& optimizer_config) { template <typename GPUAccessor, template <typename T> class GPUOptimizer>
void HeterPs<GPUAccessor, GPUOptimizer>::set_sparse_sgd(
const OptimizerConfig& optimizer_config) {
comm_->set_sparse_sgd(optimizer_config); comm_->set_sparse_sgd(optimizer_config);
} }
void HeterPs::set_embedx_sgd(const OptimizerConfig& optimizer_config) { void HeterPs<GPUAccessor, GPUOptimizer>::set_embedx_sgd(
const OptimizerConfig& optimizer_config) {
comm_->set_embedx_sgd(optimizer_config); comm_->set_embedx_sgd(optimizer_config);
} }
void HeterPs::end_pass() { comm_->end_pass(); } template <typename GPUAccessor, template <typename T> class GPUOptimizer>
void HeterPs<GPUAccessor, GPUOptimizer>::end_pass() {
comm_->end_pass();
}
void HeterPs::show_one_table(int gpu_num) { comm_->show_one_table(gpu_num); } template <typename GPUAccessor, template <typename T> class GPUOptimizer>
void HeterPs<GPUAccessor, GPUOptimizer>::show_one_table(int gpu_num) {
comm_->show_one_table(gpu_num);
}
void HeterPs::push_sparse(int num, template <typename GPUAccessor, template <typename T> class GPUOptimizer>
void HeterPs<GPUAccessor, GPUOptimizer>::push_sparse(int num,
FeatureKey* d_keys, FeatureKey* d_keys,
float* d_grads, float* d_grads,
size_t len) { size_t len) {
......
...@@ -27,47 +27,52 @@ HeterPsBase* HeterPsBase::get_instance( ...@@ -27,47 +27,52 @@ HeterPsBase* HeterPsBase::get_instance(
std::unordered_map<std::string, float> fleet_config, std::unordered_map<std::string, float> fleet_config,
std::string accessor_type, std::string accessor_type,
int optimizer_type) { int optimizer_type) {
if (accessor_type == "CtrDymfAccessor" && if (accessor_type == "CtrDymfAccessor") {
(optimizer_type == 1 || optimizer_type == 3 || optimizer_type == 4)) { auto* accessor_wrapper_ptr =
return new HeterPs<CommonFeatureValueAccessor>( GlobalAccessorFactory::GetInstance().GetAccessorWrapper();
capacity, resource, fleet_config, accessor_type, optimizer_type); CommonFeatureValueAccessor* gpu_accessor =
((AccessorWrapper<CommonFeatureValueAccessor>*)accessor_wrapper_ptr)
->AccessorPtr();
if (optimizer_type == 1) {
return new HeterPs<CommonFeatureValueAccessor, SparseAdagradOptimizer>(
capacity, resource, *gpu_accessor);
} else if (optimizer_type == 3) {
return new HeterPs<CommonFeatureValueAccessor, SparseAdamOptimizer>(
capacity, resource, *gpu_accessor);
} else if (optimizer_type == 4) {
return new HeterPs<CommonFeatureValueAccessor, SparseAdamSharedOptimizer>(
capacity, resource, *gpu_accessor);
}
} else { } else {
VLOG(0) << " HeterPsBase get_instance Warning: now only support " VLOG(0) << " HeterPsBase get_instance Warning: now only support "
"CtrDymfAccessor, but get " "CtrDymfAccessor, but get "
<< accessor_type; << accessor_type;
return new HeterPs<CommonFeatureValueAccessor>(
capacity, resource, fleet_config, accessor_type, optimizer_type);
} }
} }
template <typename FVAccessor> template <typename GPUAccessor, template <typename T> class GPUOptimizer>
HeterPs<FVAccessor>::HeterPs( HeterPs<GPUAccessor, GPUOptimizer>::HeterPs(
size_t capacity, size_t capacity,
std::shared_ptr<HeterPsResource> resource, std::shared_ptr<HeterPsResource> resource,
std::unordered_map<std::string, float> fleet_config, GPUAccessor& gpu_accessor) {
std::string accessor_type, comm_ = std::make_shared<HeterComm<FeatureKey, float*, float*, GPUAccessor>>(
int optimizer_type) { capacity, resource, gpu_accessor);
comm_ = std::make_shared<HeterComm<FeatureKey, float*, float*, FVAccessor>>( opt_ = GPUOptimizer<GPUAccessor>(gpu_accessor);
capacity, resource);
feature_value_accessor_.Configure(fleet_config);
set_accessor(feature_value_accessor_);
accessor_type_ = accessor_type;
optimizer_type_ = optimizer_type;
} }
template <typename FVAccessor> template <typename GPUAccessor, template <typename T> class GPUOptimizer>
HeterPs<FVAccessor>::~HeterPs() {} HeterPs<GPUAccessor, GPUOptimizer>::~HeterPs() {}
template <typename FVAccessor> template <typename GPUAccessor, template <typename T> class GPUOptimizer>
void HeterPs<FVAccessor>::pull_sparse(int num, void HeterPs<GPUAccessor, GPUOptimizer>::pull_sparse(int num,
FeatureKey* d_keys, FeatureKey* d_keys,
float* d_vals, float* d_vals,
size_t len) { size_t len) {
comm_->pull_sparse(num, d_keys, d_vals, len); comm_->pull_sparse(num, d_keys, d_vals, len);
} }
template <typename FVAccessor> template <typename GPUAccessor, template <typename T> class GPUOptimizer>
void HeterPs<FVAccessor>::build_ps(int num, void HeterPs<GPUAccessor, GPUOptimizer>::build_ps(int num,
FeatureKey* h_keys, FeatureKey* h_keys,
char* pool, char* pool,
size_t len, size_t len,
...@@ -78,81 +83,82 @@ void HeterPs<FVAccessor>::build_ps(int num, ...@@ -78,81 +83,82 @@ void HeterPs<FVAccessor>::build_ps(int num,
num, h_keys, pool, len, feature_value_size, chunk_size, stream_num); num, h_keys, pool, len, feature_value_size, chunk_size, stream_num);
} }
template <typename FVAccessor> template <typename GPUAccessor, template <typename T> class GPUOptimizer>
int HeterPs<FVAccessor>::get_index_by_devid(int devid) { int HeterPs<GPUAccessor, GPUOptimizer>::get_index_by_devid(int devid) {
return comm_->get_index_by_devid(devid); return comm_->get_index_by_devid(devid);
} }
template <typename FVAccessor> template <typename GPUAccessor, template <typename T> class GPUOptimizer>
void HeterPs<FVAccessor>::set_sparse_sgd( void HeterPs<GPUAccessor, GPUOptimizer>::set_sparse_sgd(
const OptimizerConfig& optimizer_config) { const OptimizerConfig& optimizer_config) {
comm_->set_sparse_sgd(optimizer_config); comm_->set_sparse_sgd(optimizer_config);
} }
template <typename FVAccessor> template <typename GPUAccessor, template <typename T> class GPUOptimizer>
void HeterPs<FVAccessor>::set_embedx_sgd( void HeterPs<GPUAccessor, GPUOptimizer>::set_embedx_sgd(
const OptimizerConfig& optimizer_config) { const OptimizerConfig& optimizer_config) {
comm_->set_embedx_sgd(optimizer_config); comm_->set_embedx_sgd(optimizer_config);
} }
template <typename FVAccessor> template <typename GPUAccessor, template <typename T> class GPUOptimizer>
void HeterPs<FVAccessor>::end_pass() { void HeterPs<GPUAccessor, GPUOptimizer>::end_pass() {
comm_->end_pass(); comm_->end_pass();
} }
template <typename FVAccessor> template <typename GPUAccessor, template <typename T> class GPUOptimizer>
void HeterPs<FVAccessor>::show_one_table(int gpu_num) { void HeterPs<GPUAccessor, GPUOptimizer>::show_one_table(int gpu_num) {
comm_->show_one_table(gpu_num); comm_->show_one_table(gpu_num);
} }
template <typename FVAccessor> template <typename GPUAccessor, template <typename T> class GPUOptimizer>
void HeterPs<FVAccessor>::push_sparse(int num, void HeterPs<GPUAccessor, GPUOptimizer>::push_sparse(int num,
FeatureKey* d_keys, FeatureKey* d_keys,
float* d_grads, float* d_grads,
size_t len) { size_t len) {
if (accessor_type_ == "CtrDymfAccessor") { comm_->push_sparse(num, d_keys, d_grads, len, opt_);
if (optimizer_type_ == 3) { // adam
auto optimizer = SparseAdamOptimizer(feature_value_accessor_);
VLOG(5) << "INTO push_sparse SparseAdamOptimizer, EmbedDim():"
<< optimizer.EmbedDim();
comm_->push_sparse(num, d_keys, d_grads, len, optimizer);
} else if (optimizer_type_ == 4) { // shared_adam
auto optimizer = SparseAdamSharedOptimizer(feature_value_accessor_);
VLOG(5) << "INTO push_sparse SparseAdamSharedOptimizer, EmbedDim():"
<< optimizer.EmbedDim();
comm_->push_sparse(num, d_keys, d_grads, len, optimizer);
} else if (optimizer_type_ == 1) { // adagrad {
auto optimizer = SparseAdagradOptimizer(feature_value_accessor_);
VLOG(5) << "INTO push_sparse SparseAdagradOptimizer, EmbedDim():"
<< optimizer.EmbedDim();
comm_->push_sparse(num, d_keys, d_grads, len, optimizer);
} else {
VLOG(0) << " push sparse Error: CtrDymfAccessor only support adagrad(1),"
"adam(3) or shared_adam(4), bug get optimizer type:"
<< optimizer_type_;
}
} else {
VLOG(0) << " push sparse Error: now only support CtrDymfAccessor, but get "
<< accessor_type_;
}
} }
template <typename FVAccessor> template <typename GPUAccessor, template <typename T> class GPUOptimizer>
void HeterPs<FVAccessor>::set_nccl_comm_and_size( void HeterPs<GPUAccessor, GPUOptimizer>::set_nccl_comm_and_size(
const std::vector<ncclComm_t>& inner_comms, const std::vector<ncclComm_t>& inner_comms,
const std::vector<ncclComm_t>& inter_comms, const std::vector<ncclComm_t>& inter_comms,
int comm_size) { int comm_size) {
comm_->set_nccl_comm_and_size(inner_comms, inter_comms, comm_size); comm_->set_nccl_comm_and_size(inner_comms, inter_comms, comm_size);
} }
template <typename FVAccessor> template <typename GPUAccessor, template <typename T> class GPUOptimizer>
void HeterPs<FVAccessor>::set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) { void HeterPs<GPUAccessor, GPUOptimizer>::set_multi_mf_dim(int multi_mf_dim,
int max_mf_dim) {
comm_->set_multi_mf_dim(multi_mf_dim, max_mf_dim); comm_->set_multi_mf_dim(multi_mf_dim, max_mf_dim);
} }
template <typename FVAccessor> template <typename GPUAccessor, template <typename T> class GPUOptimizer>
void HeterPs<FVAccessor>::set_accessor(FVAccessor& accessor) { void HeterPs<GPUAccessor, GPUOptimizer>::show_table_collisions() {
comm_->set_accessor(accessor); comm_->show_table_collisions();
}
template <typename GPUAccessor, template <typename T> class GPUOptimizer>
int HeterPs<GPUAccessor, GPUOptimizer>::dedup_keys_and_fillidx(
const int gpu_id,
const int total_fea_num,
const FeatureKey* d_keys, // input
FeatureKey* d_merged_keys, // output
FeatureKey* d_sorted_keys,
uint32_t* d_restore_idx,
uint32_t* d_sorted_idx,
uint32_t* d_offset,
uint32_t* d_merged_cnts,
bool filter_zero) {
return comm_->dedup_keys_and_fillidx(gpu_id,
total_fea_num,
d_keys, // input
d_merged_keys, // output
d_sorted_keys,
d_restore_idx,
d_sorted_idx,
d_offset,
d_merged_cnts,
filter_zero);
} }
} // end namespace framework } // end namespace framework
......
...@@ -26,15 +26,13 @@ limitations under the License. */ ...@@ -26,15 +26,13 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
template <typename FVAccessor> template <typename GPUAccessor, template <typename T> class GPUOptimizer>
class HeterPs : public HeterPsBase { class HeterPs : public HeterPsBase {
public: public:
HeterPs() {} HeterPs() {}
HeterPs(size_t capacity, HeterPs(size_t capacity,
std::shared_ptr<HeterPsResource> resource, std::shared_ptr<HeterPsResource> resource,
std::unordered_map<std::string, float> fleet_config, GPUAccessor& gpu_accessor);
std::string accessor_type,
int optimizer_type);
virtual ~HeterPs(); virtual ~HeterPs();
HeterPs(const HeterPs&) = delete; HeterPs(const HeterPs&) = delete;
HeterPs& operator=(const HeterPs&) = delete; HeterPs& operator=(const HeterPs&) = delete;
...@@ -43,6 +41,8 @@ class HeterPs : public HeterPsBase { ...@@ -43,6 +41,8 @@ class HeterPs : public HeterPsBase {
FeatureKey* d_keys, FeatureKey* d_keys,
float* d_vals, float* d_vals,
size_t len) override; size_t len) override;
// void build_ps(int num, FeatureKey* h_keys, float* h_vals, size_t len,
// size_t chunk_size, int stream_num) override;
void build_ps(int num, void build_ps(int num,
FeatureKey* h_keys, FeatureKey* h_keys,
char* pool, char* pool,
...@@ -56,7 +56,6 @@ class HeterPs : public HeterPsBase { ...@@ -56,7 +56,6 @@ class HeterPs : public HeterPsBase {
int comm_size) override; int comm_size) override;
void set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) override; void set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) override;
void set_accessor(FVAccessor& accessor);
#endif #endif
void set_sparse_sgd(const OptimizerConfig& optimizer_config) override; void set_sparse_sgd(const OptimizerConfig& optimizer_config) override;
...@@ -65,17 +64,25 @@ class HeterPs : public HeterPsBase { ...@@ -65,17 +64,25 @@ class HeterPs : public HeterPsBase {
void end_pass() override; void end_pass() override;
int get_index_by_devid(int devid) override; int get_index_by_devid(int devid) override;
void show_one_table(int gpu_num) override; void show_one_table(int gpu_num) override;
void push_sparse(int num, void push_sparse(int num, FeatureKey* d_keys, float* d_grads, size_t len);
FeatureKey* d_keys, void show_table_collisions() override;
float* d_grads, #if defined(PADDLE_WITH_CUDA)
size_t len) override; // dedup
int dedup_keys_and_fillidx(const int gpu_id,
const int total_fea_num,
const FeatureKey* d_keys, // input
FeatureKey* d_merged_keys, // output
FeatureKey* d_sorted_keys,
uint32_t* d_restore_idx,
uint32_t* d_sorted_idx,
uint32_t* d_offset,
uint32_t* d_merged_cnts,
bool filter_zero);
#endif
private: private:
std::shared_ptr<HeterComm<FeatureKey, float*, float*, FVAccessor>> comm_; std::shared_ptr<HeterComm<FeatureKey, float*, float*, GPUAccessor>> comm_;
#if defined(PADDLE_WITH_CUDA) #if defined(PADDLE_WITH_CUDA)
FVAccessor feature_value_accessor_; GPUOptimizer<GPUAccessor> opt_;
std::string accessor_type_;
int optimizer_type_;
#endif #endif
}; };
......
...@@ -54,6 +54,7 @@ class HeterPsBase { ...@@ -54,6 +54,7 @@ class HeterPsBase {
#endif #endif
virtual void end_pass() = 0; virtual void end_pass() = 0;
virtual void show_one_table(int gpu_num) = 0; virtual void show_one_table(int gpu_num) = 0;
virtual void show_table_collisions() = 0;
virtual void push_sparse(int num, virtual void push_sparse(int num,
FeatureKey* d_keys, FeatureKey* d_keys,
float* d_grads, float* d_grads,
...@@ -65,10 +66,22 @@ class HeterPsBase { ...@@ -65,10 +66,22 @@ class HeterPsBase {
static HeterPsBase* get_instance( static HeterPsBase* get_instance(
size_t capacity, size_t capacity,
std::shared_ptr<HeterPsResource> resource, std::shared_ptr<HeterPsResource> resource,
// CommonFeatureValueAccessor feature_value_accessor,
std::unordered_map<std::string, float> fleet_config, std::unordered_map<std::string, float> fleet_config,
std::string accessor_type, std::string accessor_type,
int optimizer_type); int optimizer_type);
#if defined(PADDLE_WITH_CUDA)
// dedup
virtual int dedup_keys_and_fillidx(const int gpu_id,
const int total_fea_num,
const FeatureKey* d_keys, // input
FeatureKey* d_merged_keys, // output
FeatureKey* d_sorted_keys,
uint32_t* d_restore_idx,
uint32_t* d_sorted_idx,
uint32_t* d_offset,
uint32_t* d_merged_cnts,
bool filter_zero) = 0;
#endif
}; };
} // end namespace framework } // end namespace framework
......
...@@ -20,6 +20,7 @@ limitations under the License. */ ...@@ -20,6 +20,7 @@ limitations under the License. */
#include <iostream> #include <iostream>
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/framework/fleet/heter_ps/cudf/managed.cuh" #include "paddle/fluid/framework/fleet/heter_ps/cudf/managed.cuh"
#include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_utils.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -60,9 +61,9 @@ class HBMMemoryPool : public managed { ...@@ -60,9 +61,9 @@ class HBMMemoryPool : public managed {
block_size_ = mem_pool->block_size(); block_size_ = mem_pool->block_size();
VLOG(3) << "hbm memory pool with capacity" << capacity_ VLOG(3) << "hbm memory pool with capacity" << capacity_
<< " bs: " << block_size_; << " bs: " << block_size_;
cudaMalloc(&mem_, block_size_ * capacity_); CUDA_CHECK(cudaMalloc(&mem_, block_size_ * capacity_));
cudaMemcpy( CUDA_CHECK(cudaMemcpy(
mem_, mem_pool->mem(), mem_pool->byte_size(), cudaMemcpyHostToDevice); mem_, mem_pool->mem(), mem_pool->byte_size(), cudaMemcpyHostToDevice));
} }
~HBMMemoryPool() { ~HBMMemoryPool() {
...@@ -78,8 +79,8 @@ class HBMMemoryPool : public managed { ...@@ -78,8 +79,8 @@ class HBMMemoryPool : public managed {
cudaFree(mem_); cudaFree(mem_);
mem_ = NULL; mem_ = NULL;
capacity_ = capacity; capacity_ = capacity;
cudaMalloc(&mem_, (block_size_ * capacity / 8 + 1) * 8); CUDA_CHECK(cudaMalloc(&mem_, (block_size_ * capacity / 8 + 1) * 8));
cudaMemset(mem_, 0, block_size_ * capacity); CUDA_CHECK(cudaMemset(mem_, 0, block_size_ * capacity));
} }
char* mem() { return mem_; } char* mem() { return mem_; }
......
...@@ -19,7 +19,6 @@ limitations under the License. */ ...@@ -19,7 +19,6 @@ limitations under the License. */
#include <curand_kernel.h> #include <curand_kernel.h>
#endif #endif
#include <vector> #include <vector>
#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" #include "paddle/fluid/framework/fleet/heter_ps/feature_value.h"
#include "paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h" #include "paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h"
...@@ -28,49 +27,34 @@ namespace framework { ...@@ -28,49 +27,34 @@ namespace framework {
#if defined(PADDLE_WITH_CUDA) #if defined(PADDLE_WITH_CUDA)
class Optimizer { template <typename GPUAccessor>
public: class SparseAdagradOptimizer {
__host__ Optimizer(CommonFeatureValueAccessor feature_value_accessor) {
feature_value_accessor_ = feature_value_accessor;
}
__host__ ~Optimizer() {}
__device__ void update_value(const OptimizerConfig& optimizer_config,
float& val, // NOLINT
const float& grad) {
printf(
"Warning: update_value will not used. Please use dy_mf_update_value\n");
}
__device__ void dy_mf_update_value(const OptimizerConfig& optimizer_config,
float* ptr,
const float* grad) {}
CommonFeatureValueAccessor feature_value_accessor_;
size_t _embedding_dim;
size_t _lr_embedding_dim;
};
class SparseAdagradOptimizer : public Optimizer {
public: public:
__host__ SparseAdagradOptimizer( SparseAdagradOptimizer() {}
CommonFeatureValueAccessor feature_value_accessor) SparseAdagradOptimizer(GPUAccessor gpu_accessor) {
: Optimizer(feature_value_accessor) { gpu_accessor_ = gpu_accessor;
_lr_embedding_dim = 1; _lr_embedding_dim = 1;
_embedding_dim = feature_value_accessor_.common_feature_value.EmbedWDim(); _embedding_dim = gpu_accessor_.common_feature_value.EmbedWDim();
} }
~SparseAdagradOptimizer() {}
__device__ void update_value_work(const OptimizerConfig& optimizer_config, __device__ void update_value_work(const OptimizerConfig& optimizer_config,
int n, int n,
float* w, float* w,
float* sgd, // NOLINT float* sgd, // NOLINT
const float* g, const float* g,
float scale) { float scale,
float slot) {
float& g2sum = sgd[G2SumIndex()]; float& g2sum = sgd[G2SumIndex()];
double add_g2sum = 0; double add_g2sum = 0;
double ratio = optimizer_config.mf_learning_rate *
sqrt(optimizer_config.mf_initial_g2sum / float learning_rate = optimizer_config.mf_learning_rate;
if (slot != optimizer_config.nodeid_slot) {
learning_rate = optimizer_config.feature_learning_rate;
}
double ratio =
learning_rate * sqrt(optimizer_config.mf_initial_g2sum /
(optimizer_config.mf_initial_g2sum + g2sum)); (optimizer_config.mf_initial_g2sum + g2sum));
for (int i = 0; i < n; ++i) { for (int i = 0; i < n; ++i) {
double scaled_grad = g[i] / scale; double scaled_grad = g[i] / scale;
...@@ -96,47 +80,43 @@ class SparseAdagradOptimizer : public Optimizer { ...@@ -96,47 +80,43 @@ class SparseAdagradOptimizer : public Optimizer {
__device__ void dy_mf_update_value(const OptimizerConfig& optimizer_config, __device__ void dy_mf_update_value(const OptimizerConfig& optimizer_config,
float* ptr, float* ptr,
const float* grad) { const float* grad) {
float g_show = grad[feature_value_accessor_.common_push_value.ShowIndex()]; float g_show = grad[gpu_accessor_.common_push_value.ShowIndex()];
float g_click = float g_click = grad[gpu_accessor_.common_push_value.ClickIndex()];
grad[feature_value_accessor_.common_push_value.ClickIndex()];
ptr[gpu_accessor_.common_feature_value.SlotIndex()] =
ptr[feature_value_accessor_.common_feature_value.SlotIndex()] = grad[gpu_accessor_.common_push_value.SlotIndex()];
grad[feature_value_accessor_.common_push_value.SlotIndex()]; ptr[gpu_accessor_.common_feature_value.ShowIndex()] += g_show;
ptr[feature_value_accessor_.common_feature_value.ShowIndex()] += g_show; ptr[gpu_accessor_.common_feature_value.ClickIndex()] += g_click;
ptr[feature_value_accessor_.common_feature_value.ClickIndex()] += g_click; ptr[gpu_accessor_.common_feature_value.DeltaScoreIndex()] +=
ptr[feature_value_accessor_.common_feature_value.DeltaScoreIndex()] +=
optimizer_config.nonclk_coeff * (g_show - g_click) + optimizer_config.nonclk_coeff * (g_show - g_click) +
optimizer_config.clk_coeff * g_click; optimizer_config.clk_coeff * g_click;
float slot = ptr[gpu_accessor_.common_feature_value.SlotIndex()];
update_value_work( update_value_work(
optimizer_config, optimizer_config,
1, 1,
ptr + feature_value_accessor_.common_feature_value.EmbedWIndex(), ptr + gpu_accessor_.common_feature_value.EmbedWIndex(),
ptr + feature_value_accessor_.common_feature_value.EmbedG2SumIndex(), ptr + gpu_accessor_.common_feature_value.EmbedG2SumIndex(),
grad + feature_value_accessor_.common_push_value.EmbedGIndex(), grad + gpu_accessor_.common_push_value.EmbedGIndex(),
g_show); g_show,
slot);
int mf_dim =
int(ptr[feature_value_accessor_.common_feature_value.MfDimIndex()]); int mf_dim = int(ptr[gpu_accessor_.common_feature_value.MfDimIndex()]);
if (ptr[feature_value_accessor_.common_feature_value.MfSizeIndex()] == 0) { if (ptr[gpu_accessor_.common_feature_value.MfSizeIndex()] == 0) {
if (optimizer_config.mf_create_thresholds <= if (optimizer_config.mf_create_thresholds <=
optimizer_config.nonclk_coeff * optimizer_config.nonclk_coeff *
(ptr[feature_value_accessor_.common_feature_value (ptr[gpu_accessor_.common_feature_value.ShowIndex()] -
.ShowIndex()] - ptr[gpu_accessor_.common_feature_value.ClickIndex()]) +
ptr[feature_value_accessor_.common_feature_value
.ClickIndex()]) +
optimizer_config.clk_coeff * optimizer_config.clk_coeff *
ptr[feature_value_accessor_.common_feature_value ptr[gpu_accessor_.common_feature_value.ClickIndex()]) {
.ClickIndex()]) { ptr[gpu_accessor_.common_feature_value.MfSizeIndex()] =
ptr[feature_value_accessor_.common_feature_value.MfSizeIndex()] = gpu_accessor_.common_feature_value.MFSize(mf_dim) / sizeof(float);
feature_value_accessor_.common_feature_value.MFSize(mf_dim) /
sizeof(float);
int tid_x = blockIdx.x * blockDim.x + threadIdx.x; int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
curandState state; curandState state;
curand_init(clock64(), tid_x, 0, &state); curand_init(clock64(), tid_x, 0, &state);
for (int i = 0; i < mf_dim; ++i) { for (int i = 0; i < mf_dim; ++i) {
ptr[feature_value_accessor_.common_feature_value.EmbedxWIndex() + i] = ptr[gpu_accessor_.common_feature_value.EmbedxWIndex() + i] =
(curand_uniform(&state)) * optimizer_config.mf_initial_range; (curand_uniform(&state)) * optimizer_config.mf_initial_range;
} }
} }
...@@ -144,10 +124,11 @@ class SparseAdagradOptimizer : public Optimizer { ...@@ -144,10 +124,11 @@ class SparseAdagradOptimizer : public Optimizer {
update_value_work( update_value_work(
optimizer_config, optimizer_config,
mf_dim, mf_dim,
ptr + feature_value_accessor_.common_feature_value.EmbedxWIndex(), ptr + gpu_accessor_.common_feature_value.EmbedxWIndex(),
ptr + feature_value_accessor_.common_feature_value.EmbedxG2SumIndex(), ptr + gpu_accessor_.common_feature_value.EmbedxG2SumIndex(),
grad + feature_value_accessor_.common_push_value.EmbedxGIndex(), grad + gpu_accessor_.common_push_value.EmbedxGIndex(),
g_show); g_show,
slot);
} }
} }
...@@ -156,17 +137,25 @@ class SparseAdagradOptimizer : public Optimizer { ...@@ -156,17 +137,25 @@ class SparseAdagradOptimizer : public Optimizer {
__host__ __device__ size_t EmbedxDim() { return _embedding_dim; } __host__ __device__ size_t EmbedxDim() { return _embedding_dim; }
__host__ __device__ size_t G2SumIndex() { return 0; } __host__ __device__ size_t G2SumIndex() { return 0; }
__host__ __device__ size_t EmbedxG2SumIndex() { return 0; } __host__ __device__ size_t EmbedxG2SumIndex() { return 0; }
private:
GPUAccessor gpu_accessor_;
size_t _embedding_dim;
size_t _lr_embedding_dim;
}; };
class SparseAdamOptimizer : public Optimizer { template <typename GPUAccessor>
class SparseAdamOptimizer {
public: public:
__host__ SparseAdamOptimizer( SparseAdamOptimizer() {}
CommonFeatureValueAccessor feature_value_accessor) SparseAdamOptimizer(GPUAccessor gpu_accessor) {
: Optimizer(feature_value_accessor) { gpu_accessor_ = gpu_accessor;
_lr_embedding_dim = 1; _lr_embedding_dim = 1;
_embedding_dim = feature_value_accessor_.common_feature_value.EmbedWDim(); _embedding_dim = gpu_accessor_.common_feature_value.EmbedWDim();
} }
~SparseAdamOptimizer() {}
__device__ void update_lr(const OptimizerConfig& optimizer_config, __device__ void update_lr(const OptimizerConfig& optimizer_config,
int n, int n,
float* w, float* w,
...@@ -256,65 +245,57 @@ class SparseAdamOptimizer : public Optimizer { ...@@ -256,65 +245,57 @@ class SparseAdamOptimizer : public Optimizer {
// Applies one pushed gradient record to one stored feature value record.
//   optimizer_config: supplies the show/click coefficients, the mf creation
//                     threshold, the mf init range and the beta decay rates.
//   ptr:  feature value record, indexed through common_feature_value.
//   grad: push value record, indexed through common_push_value.
// The slot is copied from grad, show/click are accumulated, the delta score
// is advanced, the 1-dim embed weight is updated, and the mf (embedx) part is
// either created or updated.
__device__ void dy_mf_update_value(const OptimizerConfig& optimizer_config, __device__ void dy_mf_update_value(const OptimizerConfig& optimizer_config,
float* ptr, float* ptr,
const float* grad) { const float* grad) {
float g_show = grad[feature_value_accessor_.common_push_value.ShowIndex()]; float g_show = grad[gpu_accessor_.common_push_value.ShowIndex()];
float g_click = float g_click = grad[gpu_accessor_.common_push_value.ClickIndex()];
grad[feature_value_accessor_.common_push_value.ClickIndex()];
ptr[gpu_accessor_.common_feature_value.SlotIndex()] =
ptr[feature_value_accessor_.common_feature_value.SlotIndex()] = grad[gpu_accessor_.common_push_value.SlotIndex()];
grad[feature_value_accessor_.common_push_value.SlotIndex()]; ptr[gpu_accessor_.common_feature_value.ShowIndex()] += g_show;
ptr[feature_value_accessor_.common_feature_value.ShowIndex()] += g_show; ptr[gpu_accessor_.common_feature_value.ClickIndex()] += g_click;
ptr[feature_value_accessor_.common_feature_value.ClickIndex()] += g_click; ptr[gpu_accessor_.common_feature_value.DeltaScoreIndex()] +=
ptr[feature_value_accessor_.common_feature_value.DeltaScoreIndex()] +=
optimizer_config.nonclk_coeff * (g_show - g_click) + optimizer_config.nonclk_coeff * (g_show - g_click) +
optimizer_config.clk_coeff * g_click; optimizer_config.clk_coeff * g_click;
// Update the single embed weight (n == 1). g_show is forwarded as the last
// argument -- presumably the gradient scale; confirm against update_lr.
update_lr( update_lr(optimizer_config,
optimizer_config,
1, 1,
ptr + feature_value_accessor_.common_feature_value.EmbedWIndex(), ptr + gpu_accessor_.common_feature_value.EmbedWIndex(),
ptr + feature_value_accessor_.common_feature_value.EmbedG2SumIndex(), ptr + gpu_accessor_.common_feature_value.EmbedG2SumIndex(),
grad + feature_value_accessor_.common_push_value.EmbedGIndex(), grad + gpu_accessor_.common_push_value.EmbedGIndex(),
g_show); g_show);
// mf part: a stored MfSize of 0 means the embedx vector does not exist yet.
// It is created only once the accumulated show/click score reaches
// mf_create_thresholds; otherwise the record is left without an mf part.
int mf_dim = int mf_dim = int(ptr[gpu_accessor_.common_feature_value.MfDimIndex()]);
int(ptr[feature_value_accessor_.common_feature_value.MfDimIndex()]); if (ptr[gpu_accessor_.common_feature_value.MfSizeIndex()] == 0) {
if (ptr[feature_value_accessor_.common_feature_value.MfSizeIndex()] == 0) {
if (optimizer_config.mf_create_thresholds <= if (optimizer_config.mf_create_thresholds <=
optimizer_config.nonclk_coeff * optimizer_config.nonclk_coeff *
(ptr[feature_value_accessor_.common_feature_value (ptr[gpu_accessor_.common_feature_value.ShowIndex()] -
.ShowIndex()] - ptr[gpu_accessor_.common_feature_value.ClickIndex()]) +
ptr[feature_value_accessor_.common_feature_value
.ClickIndex()]) +
optimizer_config.clk_coeff * optimizer_config.clk_coeff *
ptr[feature_value_accessor_.common_feature_value ptr[gpu_accessor_.common_feature_value.ClickIndex()]) {
.ClickIndex()]) { ptr[gpu_accessor_.common_feature_value.MfSizeIndex()] =
ptr[feature_value_accessor_.common_feature_value.MfSizeIndex()] = gpu_accessor_.common_feature_value.MFSize(mf_dim) / sizeof(float);
feature_value_accessor_.common_feature_value.MFSize(mf_dim) /
sizeof(float);
// Random init of the new embedx weights. The generator is seeded with
// clock64(), so the initial values differ from run to run.
int tid_x = blockIdx.x * blockDim.x + threadIdx.x; int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
curandState state; curandState state;
curand_init(clock64(), tid_x, 0, &state); curand_init(clock64(), tid_x, 0, &state);
for (int i = 0; i < mf_dim; ++i) { for (int i = 0; i < mf_dim; ++i) {
ptr[feature_value_accessor_.common_feature_value.EmbedxWIndex() + i] = ptr[gpu_accessor_.common_feature_value.EmbedxWIndex() + i] =
(curand_uniform(&state)) * optimizer_config.mf_initial_range; (curand_uniform(&state)) * optimizer_config.mf_initial_range;
} }
// Seed the beta1/beta2 power slots of the embedx optimizer state with the
// configured decay rates.
ptr[feature_value_accessor_.common_feature_value.EmbedxG2SumIndex() + ptr[gpu_accessor_.common_feature_value.EmbedxG2SumIndex() +
EmbedxBeta1PowIndex()] = optimizer_config.beta1_decay_rate; EmbedxBeta1PowIndex()] = optimizer_config.beta1_decay_rate;
ptr[feature_value_accessor_.common_feature_value.EmbedxG2SumIndex() + ptr[gpu_accessor_.common_feature_value.EmbedxG2SumIndex() +
EmbedxBeta2PowIndex()] = optimizer_config.beta2_decay_rate; EmbedxBeta2PowIndex()] = optimizer_config.beta2_decay_rate;
} }
} else { } else {
// The mf part already exists: apply the embedx gradient to all mf_dim weights.
update_mf( update_mf(optimizer_config,
optimizer_config,
mf_dim, mf_dim,
ptr + feature_value_accessor_.common_feature_value.EmbedxWIndex(), ptr + gpu_accessor_.common_feature_value.EmbedxWIndex(),
ptr + feature_value_accessor_.common_feature_value.EmbedxG2SumIndex(), ptr + gpu_accessor_.common_feature_value.EmbedxG2SumIndex(),
grad + feature_value_accessor_.common_push_value.EmbedxGIndex(), grad + gpu_accessor_.common_push_value.EmbedxGIndex(),
g_show); g_show);
} }
// printf("EmbedxGIndex: %f, mf_gsum: %f, ", // printf("EmbedxGIndex: %f, mf_gsum: %f, ",
// feature_value_accessor_.common_push_value.EmbedxGIndex(), // gpu_accessor_.common_push_value.EmbedxGIndex(),
// ptr[feature_value_accessor_.common_feature_value.EmbedxG2SumIndex()]); // ptr[gpu_accessor_.common_feature_value.EmbedxG2SumIndex()]);
} }
__host__ __device__ size_t Dim() { return EmbedDim() + EmbedxDim(); } __host__ __device__ size_t Dim() { return EmbedDim() + EmbedxDim(); }
...@@ -338,17 +319,25 @@ class SparseAdamOptimizer : public Optimizer { ...@@ -338,17 +319,25 @@ class SparseAdamOptimizer : public Optimizer {
__host__ __device__ size_t EmbedxBeta2PowIndex() { __host__ __device__ size_t EmbedxBeta2PowIndex() {
return EmbedxBeta1PowIndex() + 1; return EmbedxBeta1PowIndex() + 1;
} }
private:
GPUAccessor gpu_accessor_;
size_t _embedding_dim;
size_t _lr_embedding_dim;
}; };
class SparseAdamSharedOptimizer : public Optimizer { template <typename GPUAccessor>
class SparseAdamSharedOptimizer {
public: public:
__host__ SparseAdamSharedOptimizer( SparseAdamSharedOptimizer() {}
CommonFeatureValueAccessor feature_value_accessor) SparseAdamSharedOptimizer(GPUAccessor gpu_accessor) {
: Optimizer(feature_value_accessor) { gpu_accessor_ = gpu_accessor;
_lr_embedding_dim = 1; _lr_embedding_dim = 1;
_embedding_dim = feature_value_accessor_.common_feature_value.EmbedWDim(); _embedding_dim = gpu_accessor_.common_feature_value.EmbedWDim();
} }
~SparseAdamSharedOptimizer() {}
__device__ void update_value_work(const OptimizerConfig& optimizer_config, __device__ void update_value_work(const OptimizerConfig& optimizer_config,
int n, int n,
float* w, float* w,
...@@ -406,60 +395,54 @@ class SparseAdamSharedOptimizer : public Optimizer { ...@@ -406,60 +395,54 @@ class SparseAdamSharedOptimizer : public Optimizer {
// Applies one pushed gradient record to one stored feature value record.
//   optimizer_config: supplies the show/click coefficients, the mf creation
//                     threshold, the mf init range and the beta decay rates.
//   ptr:  feature value record, indexed through common_feature_value.
//   grad: push value record, indexed through common_push_value.
// Both the 1-dim embed weight and the mf (embedx) weights go through
// update_value_work; the mf part is created first if it does not exist yet.
__device__ void dy_mf_update_value(const OptimizerConfig& optimizer_config, __device__ void dy_mf_update_value(const OptimizerConfig& optimizer_config,
float* ptr, float* ptr,
const float* grad) { const float* grad) {
float g_show = grad[feature_value_accessor_.common_push_value.ShowIndex()]; float g_show = grad[gpu_accessor_.common_push_value.ShowIndex()];
float g_click = float g_click = grad[gpu_accessor_.common_push_value.ClickIndex()];
grad[feature_value_accessor_.common_push_value.ClickIndex()];
ptr[gpu_accessor_.common_feature_value.SlotIndex()] =
ptr[feature_value_accessor_.common_feature_value.SlotIndex()] = grad[gpu_accessor_.common_push_value.SlotIndex()];
grad[feature_value_accessor_.common_push_value.SlotIndex()]; ptr[gpu_accessor_.common_feature_value.ShowIndex()] += g_show;
ptr[feature_value_accessor_.common_feature_value.ShowIndex()] += g_show; ptr[gpu_accessor_.common_feature_value.ClickIndex()] += g_click;
ptr[feature_value_accessor_.common_feature_value.ClickIndex()] += g_click; ptr[gpu_accessor_.common_feature_value.DeltaScoreIndex()] +=
ptr[feature_value_accessor_.common_feature_value.DeltaScoreIndex()] +=
optimizer_config.nonclk_coeff * (g_show - g_click) + optimizer_config.nonclk_coeff * (g_show - g_click) +
optimizer_config.clk_coeff * g_click; optimizer_config.clk_coeff * g_click;
// Update the single embed weight (n == 1). g_show is forwarded as the last
// argument -- presumably the gradient scale; confirm against update_value_work.
update_value_work( update_value_work(
optimizer_config, optimizer_config,
1, 1,
ptr + feature_value_accessor_.common_feature_value.EmbedWIndex(), ptr + gpu_accessor_.common_feature_value.EmbedWIndex(),
ptr + feature_value_accessor_.common_feature_value.EmbedG2SumIndex(), ptr + gpu_accessor_.common_feature_value.EmbedG2SumIndex(),
grad + feature_value_accessor_.common_push_value.EmbedGIndex(), grad + gpu_accessor_.common_push_value.EmbedGIndex(),
g_show); g_show);
// mf part: a stored MfSize of 0 means the embedx vector does not exist yet.
// It is created only once the accumulated show/click score reaches
// mf_create_thresholds; otherwise the record is left without an mf part.
int mf_dim = int mf_dim = int(ptr[gpu_accessor_.common_feature_value.MfDimIndex()]);
int(ptr[feature_value_accessor_.common_feature_value.MfDimIndex()]); if (ptr[gpu_accessor_.common_feature_value.MfSizeIndex()] == 0) {
if (ptr[feature_value_accessor_.common_feature_value.MfSizeIndex()] == 0) {
if (optimizer_config.mf_create_thresholds <= if (optimizer_config.mf_create_thresholds <=
optimizer_config.nonclk_coeff * optimizer_config.nonclk_coeff *
(ptr[feature_value_accessor_.common_feature_value (ptr[gpu_accessor_.common_feature_value.ShowIndex()] -
.ShowIndex()] - ptr[gpu_accessor_.common_feature_value.ClickIndex()]) +
ptr[feature_value_accessor_.common_feature_value
.ClickIndex()]) +
optimizer_config.clk_coeff * optimizer_config.clk_coeff *
ptr[feature_value_accessor_.common_feature_value ptr[gpu_accessor_.common_feature_value.ClickIndex()]) {
.ClickIndex()]) { ptr[gpu_accessor_.common_feature_value.MfSizeIndex()] =
ptr[feature_value_accessor_.common_feature_value.MfSizeIndex()] = gpu_accessor_.common_feature_value.MFSize(mf_dim) / sizeof(float);
feature_value_accessor_.common_feature_value.MFSize(mf_dim) /
sizeof(float);
// Random init of the new embedx weights. The generator is seeded with
// clock64(), so the initial values differ from run to run.
int tid_x = blockIdx.x * blockDim.x + threadIdx.x; int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
curandState state; curandState state;
curand_init(clock64(), tid_x, 0, &state); curand_init(clock64(), tid_x, 0, &state);
for (int i = 0; i < mf_dim; ++i) { for (int i = 0; i < mf_dim; ++i) {
ptr[feature_value_accessor_.common_feature_value.EmbedxWIndex() + i] = ptr[gpu_accessor_.common_feature_value.EmbedxWIndex() + i] =
(curand_uniform(&state)) * optimizer_config.mf_initial_range; (curand_uniform(&state)) * optimizer_config.mf_initial_range;
} }
// Seed the beta1/beta2 power slots of the embedx optimizer state with the
// configured decay rates.
ptr[feature_value_accessor_.common_feature_value.EmbedxG2SumIndex() + ptr[gpu_accessor_.common_feature_value.EmbedxG2SumIndex() +
EmbedxBeta1PowIndex()] = optimizer_config.beta1_decay_rate; EmbedxBeta1PowIndex()] = optimizer_config.beta1_decay_rate;
ptr[feature_value_accessor_.common_feature_value.EmbedxG2SumIndex() + ptr[gpu_accessor_.common_feature_value.EmbedxG2SumIndex() +
EmbedxBeta2PowIndex()] = optimizer_config.beta2_decay_rate; EmbedxBeta2PowIndex()] = optimizer_config.beta2_decay_rate;
} }
} else { } else {
// The mf part already exists: apply the embedx gradient to all mf_dim weights.
update_value_work( update_value_work(
optimizer_config, optimizer_config,
mf_dim, mf_dim,
ptr + feature_value_accessor_.common_feature_value.EmbedxWIndex(), ptr + gpu_accessor_.common_feature_value.EmbedxWIndex(),
ptr + feature_value_accessor_.common_feature_value.EmbedxG2SumIndex(), ptr + gpu_accessor_.common_feature_value.EmbedxG2SumIndex(),
grad + feature_value_accessor_.common_push_value.EmbedxGIndex(), grad + gpu_accessor_.common_push_value.EmbedxGIndex(),
g_show); g_show);
} }
} }
...@@ -481,6 +464,11 @@ class SparseAdamSharedOptimizer : public Optimizer { ...@@ -481,6 +464,11 @@ class SparseAdamSharedOptimizer : public Optimizer {
__host__ __device__ size_t EmbedxBeta2PowIndex() { __host__ __device__ size_t EmbedxBeta2PowIndex() {
return EmbedxBeta1PowIndex() + 1; return EmbedxBeta1PowIndex() + 1;
} }
private:
GPUAccessor gpu_accessor_;
size_t _embedding_dim;
size_t _lr_embedding_dim;
}; };
#endif #endif
......
...@@ -41,6 +41,9 @@ class OptimizerConfig { ...@@ -41,6 +41,9 @@ class OptimizerConfig {
float mf_max_bound = 10; float mf_max_bound = 10;
float mf_ada_epsilon = 1e-8; float mf_ada_epsilon = 1e-8;
float nodeid_slot = 9008;
float feature_learning_rate = 0.05;
void set_sparse_sgd(float nonclk_coeff, void set_sparse_sgd(float nonclk_coeff,
float clk_coeff, float clk_coeff,
float min_bound, float min_bound,
...@@ -84,7 +87,9 @@ class OptimizerConfig { ...@@ -84,7 +87,9 @@ class OptimizerConfig {
float mf_max_bound, float mf_max_bound,
float mf_beta1_decay_rate, float mf_beta1_decay_rate,
float mf_beta2_decay_rate, float mf_beta2_decay_rate,
float mf_ada_epsilon) { float mf_ada_epsilon,
float nodeid_slot,
float feature_learning_rate) {
this->mf_create_thresholds = mf_create_thresholds; this->mf_create_thresholds = mf_create_thresholds;
this->mf_learning_rate = mf_learning_rate; this->mf_learning_rate = mf_learning_rate;
this->mf_initial_g2sum = mf_initial_g2sum; this->mf_initial_g2sum = mf_initial_g2sum;
...@@ -94,6 +99,9 @@ class OptimizerConfig { ...@@ -94,6 +99,9 @@ class OptimizerConfig {
this->mf_beta1_decay_rate = mf_beta1_decay_rate; this->mf_beta1_decay_rate = mf_beta1_decay_rate;
this->mf_beta2_decay_rate = mf_beta2_decay_rate; this->mf_beta2_decay_rate = mf_beta2_decay_rate;
this->mf_ada_epsilon = mf_ada_epsilon; this->mf_ada_epsilon = mf_ada_epsilon;
this->nodeid_slot = nodeid_slot;
this->feature_learning_rate = feature_learning_rate;
} }
void set_embedx_sgd(const OptimizerConfig& optimizer_config) { void set_embedx_sgd(const OptimizerConfig& optimizer_config) {
...@@ -106,6 +114,9 @@ class OptimizerConfig { ...@@ -106,6 +114,9 @@ class OptimizerConfig {
this->mf_beta1_decay_rate = optimizer_config.mf_beta1_decay_rate; this->mf_beta1_decay_rate = optimizer_config.mf_beta1_decay_rate;
this->mf_beta2_decay_rate = optimizer_config.mf_beta2_decay_rate; this->mf_beta2_decay_rate = optimizer_config.mf_beta2_decay_rate;
this->mf_ada_epsilon = optimizer_config.mf_ada_epsilon; this->mf_ada_epsilon = optimizer_config.mf_ada_epsilon;
// Copy both fields from the source config. The unqualified names resolve to
// this object's own members here, which made the assignments no-ops.
this->nodeid_slot = optimizer_config.nodeid_slot;
this->feature_learning_rate = optimizer_config.feature_learning_rate;
} }
}; };
......
...@@ -27,9 +27,6 @@ ...@@ -27,9 +27,6 @@
using namespace paddle::framework; using namespace paddle::framework;
namespace platform = paddle::platform; namespace platform = paddle::platform;
// paddle::framework::GpuPsCommGraph GraphTable::make_gpu_ps_graph
// paddle::framework::GpuPsCommGraph GraphTable::make_gpu_ps_graph(
// std::vector<int64_t> ids)
std::string edges[] = { std::string edges[] = {
std::string("0\t1"), std::string("0\t1"),
...@@ -121,13 +118,13 @@ TEST(TEST_FLEET, test_cpu_cache) { ...@@ -121,13 +118,13 @@ TEST(TEST_FLEET, test_cpu_cache) {
std::make_shared<HeterPsResource>(device_id_mapping); std::make_shared<HeterPsResource>(device_id_mapping);
resource->enable_p2p(); resource->enable_p2p();
int use_nv = 1; int use_nv = 1;
GpuPsGraphTable g(resource, use_nv); GpuPsGraphTable g(resource, 1, 2);
g.init_cpu_table(table_proto); g.init_cpu_table(table_proto);
g.cpu_graph_table->Load(node_file_name, "nuser"); g.cpu_graph_table_->Load(node_file_name, "nuser");
g.cpu_graph_table->Load(node_file_name, "nitem"); g.cpu_graph_table_->Load(node_file_name, "nitem");
std::remove(node_file_name); std::remove(node_file_name);
std::vector<paddle::framework::GpuPsCommGraph> vec; std::vector<paddle::framework::GpuPsCommGraph> vec;
std::vector<int64_t> node_ids; std::vector<uint64_t> node_ids;
node_ids.push_back(37); node_ids.push_back(37);
node_ids.push_back(96); node_ids.push_back(96);
std::vector<std::vector<std::string>> node_feat(2, std::vector<std::vector<std::string>> node_feat(2,
...@@ -135,38 +132,29 @@ TEST(TEST_FLEET, test_cpu_cache) { ...@@ -135,38 +132,29 @@ TEST(TEST_FLEET, test_cpu_cache) {
std::vector<std::string> feature_names; std::vector<std::string> feature_names;
feature_names.push_back(std::string("c")); feature_names.push_back(std::string("c"));
feature_names.push_back(std::string("d")); feature_names.push_back(std::string("d"));
g.cpu_graph_table->get_node_feat(0, node_ids, feature_names, node_feat); g.cpu_graph_table_->get_node_feat(0, node_ids, feature_names, node_feat);
VLOG(0) << "get_node_feat: " << node_feat[0][0]; VLOG(0) << "get_node_feat: " << node_feat[0][0];
VLOG(0) << "get_node_feat: " << node_feat[0][1]; VLOG(0) << "get_node_feat: " << node_feat[0][1];
VLOG(0) << "get_node_feat: " << node_feat[1][0]; VLOG(0) << "get_node_feat: " << node_feat[1][0];
VLOG(0) << "get_node_feat: " << node_feat[1][1]; VLOG(0) << "get_node_feat: " << node_feat[1][1];
int n = 10; int n = 10;
std::vector<int64_t> ids0, ids1; std::vector<uint64_t> ids0, ids1;
for (int i = 0; i < n; i++) { for (int i = 0; i < n; i++) {
g.cpu_graph_table->add_comm_edge(0, i, (i + 1) % n); g.cpu_graph_table_->add_comm_edge(0, i, (i + 1) % n);
g.cpu_graph_table->add_comm_edge(0, i, (i - 1 + n) % n); g.cpu_graph_table_->add_comm_edge(0, i, (i - 1 + n) % n);
if (i % 2 == 0) ids0.push_back(i); if (i % 2 == 0) ids0.push_back(i);
} }
g.cpu_graph_table->build_sampler(0); g.cpu_graph_table_->build_sampler(0);
ids1.push_back(5); ids1.push_back(5);
ids1.push_back(7); ids1.push_back(7);
vec.push_back(g.cpu_graph_table->make_gpu_ps_graph(0, ids0)); vec.push_back(g.cpu_graph_table_->make_gpu_ps_graph(0, ids0));
vec.push_back(g.cpu_graph_table->make_gpu_ps_graph(0, ids1)); vec.push_back(g.cpu_graph_table_->make_gpu_ps_graph(0, ids1));
vec[0].display_on_cpu(); vec[0].display_on_cpu();
vec[1].display_on_cpu(); vec[1].display_on_cpu();
// g.build_graph_from_cpu(vec); // g.build_graph_from_cpu(vec);
g.build_graph_on_single_gpu(vec[0], 0); g.build_graph_on_single_gpu(vec[0], 0, 0);
g.build_graph_on_single_gpu(vec[1], 1); g.build_graph_on_single_gpu(vec[1], 1, 0);
int64_t cpu_key[3] = {0, 1, 2}; uint64_t cpu_key[3] = {0, 1, 2};
/*
std::vector<std::shared_ptr<char>> buffers(3);
std::vector<int> actual_sizes(3,0);
g.cpu_graph_table->random_sample_neighbors(cpu_key,2,buffers,actual_sizes,false);
for(int i = 0;i < 3;i++){
VLOG(0)<<"sample from cpu key->"<<cpu_key[i]<<" actual sample size =
"<<actual_sizes[i]/sizeof(int64_t);
}
*/
void *key; void *key;
int device_len = 2; int device_len = 2;
for (int i = 0; i < 2; i++) { for (int i = 0; i < 2; i++) {
...@@ -178,7 +166,7 @@ TEST(TEST_FLEET, test_cpu_cache) { ...@@ -178,7 +166,7 @@ TEST(TEST_FLEET, test_cpu_cache) {
int step = 2; int step = 2;
int cur = 0; int cur = 0;
while (true) { while (true) {
auto node_query_res = g.query_node_list(i, cur, step); auto node_query_res = g.query_node_list(i, 0, cur, step);
node_query_res.display(); node_query_res.display();
if (node_query_res.get_len() == 0) { if (node_query_res.get_len() == 0) {
VLOG(0) << "no more ids,break"; VLOG(0) << "no more ids,break";
...@@ -187,19 +175,20 @@ TEST(TEST_FLEET, test_cpu_cache) { ...@@ -187,19 +175,20 @@ TEST(TEST_FLEET, test_cpu_cache) {
cur += node_query_res.get_len(); cur += node_query_res.get_len();
NeighborSampleQuery query; NeighborSampleQuery query;
query.initialize( query.initialize(
i, node_query_res.get_val(), 1, node_query_res.get_len()); i, 0, node_query_res.get_val(), 1, node_query_res.get_len());
query.display(); query.display();
auto c = g.graph_neighbor_sample_v3(query, false); auto c = g.graph_neighbor_sample_v3(query, false);
c.display(); c.display();
} }
} }
g.cpu_graph_table->set_search_level(2); g.cpu_graph_table_->clear_graph(0);
// g.cpu_graph_table->Load_to_ssd(edge_file_name,"e>u2u"); g.cpu_graph_table_->set_search_level(2);
g.cpu_graph_table->Load(edge_file_name, "e>u2u"); g.cpu_graph_table_->Load(edge_file_name, "e>u2u");
g.cpu_graph_table->make_partitions(0, 64, 2); g.cpu_graph_table_->make_partitions(0, 64, 2);
int index = 0; int index = 0;
while (g.cpu_graph_table->load_next_partition(0) != -1) { /*
auto all_ids = g.cpu_graph_table->get_all_id(0, 0, device_len); while (g.cpu_graph_table_->load_next_partition(0) != -1) {
auto all_ids = g.cpu_graph_table_->get_all_id(0, 0, device_len);
for (auto x : all_ids) { for (auto x : all_ids) {
for (auto y : x) { for (auto y : x) {
VLOG(0) << "part " << index << " " << y; VLOG(0) << "part " << index << " " << y;
...@@ -207,19 +196,19 @@ TEST(TEST_FLEET, test_cpu_cache) { ...@@ -207,19 +196,19 @@ TEST(TEST_FLEET, test_cpu_cache) {
} }
for (int i = 0; i < all_ids.size(); i++) { for (int i = 0; i < all_ids.size(); i++) {
GpuPsCommGraph sub_graph = GpuPsCommGraph sub_graph =
g.cpu_graph_table->make_gpu_ps_graph(0, all_ids[i]); g.cpu_graph_table_->make_gpu_ps_graph(0, all_ids[i]);
g.build_graph_on_single_gpu(sub_graph, i); g.build_graph_on_single_gpu(sub_graph, i, 0);
VLOG(2) << "sub graph on gpu " << i << " is built"; VLOG(2) << "sub graph on gpu " << i << " is built";
} }
VLOG(0) << "start to iterate gpu graph node"; VLOG(0) << "start to iterate gpu graph node";
g.cpu_graph_table->make_complementary_graph(0, 64); g.cpu_graph_table_->make_complementary_graph(0, 64);
for (int i = 0; i < 2; i++) { for (int i = 0; i < 2; i++) {
// platform::CUDADeviceGuard guard(i); // platform::CUDADeviceGuard guard(i);
LOG(0) << "query on card " << i; LOG(0) << "query on card " << i;
int step = 2; int step = 2;
int cur = 0; int cur = 0;
while (true) { while (true) {
auto node_query_res = g.query_node_list(i, cur, step); auto node_query_res = g.query_node_list(i, 0, cur, step);
node_query_res.display(); node_query_res.display();
if (node_query_res.get_len() == 0) { if (node_query_res.get_len() == 0) {
VLOG(0) << "no more ids,break"; VLOG(0) << "no more ids,break";
...@@ -227,23 +216,23 @@ TEST(TEST_FLEET, test_cpu_cache) { ...@@ -227,23 +216,23 @@ TEST(TEST_FLEET, test_cpu_cache) {
} }
cur += node_query_res.get_len(); cur += node_query_res.get_len();
NeighborSampleQuery query, q1; NeighborSampleQuery query, q1;
query.initialize( query.initialize(i, 0, node_query_res.get_val(), 4,
i, node_query_res.get_val(), 4, node_query_res.get_len()); node_query_res.get_len());
query.display(); query.display();
auto c = g.graph_neighbor_sample_v3(query, true); auto c = g.graph_neighbor_sample_v3(query, true);
c.display(); c.display();
platform::CUDADeviceGuard guard(i); platform::CUDADeviceGuard guard(i);
int64_t *key; uint64_t *key;
VLOG(0) << "sample key 1 globally"; VLOG(0) << "sample key 1 globally";
g.cpu_graph_table->set_search_level(2); g.cpu_graph_table_->set_search_level(2);
cudaMalloc((void **)&key, sizeof(int64_t)); cudaMalloc((void **)&key, sizeof(uint64_t));
int64_t t_key = 1; uint64_t t_key = 1;
cudaMemcpy(key, &t_key, sizeof(int64_t), cudaMemcpyHostToDevice); cudaMemcpy(key, &t_key, sizeof(uint64_t), cudaMemcpyHostToDevice);
q1.initialize(i, (int64_t)key, 2, 1); q1.initialize(i, 0, (uint64_t)key, 2, 1);
auto d = g.graph_neighbor_sample_v3(q1, true); auto d = g.graph_neighbor_sample_v3(q1, true);
d.display(); d.display();
cudaFree(key); cudaFree(key);
g.cpu_graph_table->set_search_level(1); g.cpu_graph_table_->set_search_level(1);
} }
} }
index++; index++;
...@@ -253,4 +242,5 @@ TEST(TEST_FLEET, test_cpu_cache) { ...@@ -253,4 +242,5 @@ TEST(TEST_FLEET, test_cpu_cache) {
device.push_back(0); device.push_back(0);
device.push_back(1); device.push_back(1);
iter->set_device(device); iter->set_device(device);
*/
} }
...@@ -50,15 +50,16 @@ TEST(TEST_FLEET, graph_comm) { ...@@ -50,15 +50,16 @@ TEST(TEST_FLEET, graph_comm) {
} }
std::vector<int> neighbor_offset(gpu_count, 0), node_index(gpu_count, 0); std::vector<int> neighbor_offset(gpu_count, 0), node_index(gpu_count, 0);
for (int i = 0; i < graph_list.size(); i++) { for (int i = 0; i < graph_list.size(); i++) {
graph_list[i].node_list = new GpuPsGraphNode[graph_list[i].node_size]; graph_list[i].node_list = new uint64_t[graph_list[i].node_size];
graph_list[i].node_info_list = new GpuPsNodeInfo[graph_list[i].node_size];
graph_list[i].neighbor_list = new int64_t[graph_list[i].neighbor_size]; graph_list[i].neighbor_list = new int64_t[graph_list[i].neighbor_size];
} }
for (int i = 0; i < node_count; i++) { for (int i = 0; i < node_count; i++) {
ind = i % gpu_count; ind = i % gpu_count;
graph_list[ind].node_list[node_index[ind]].node_id = i; graph_list[ind].node_list[node_index[ind]] = i;
graph_list[ind].node_list[node_index[ind]].neighbor_offset = graph_list[ind].node_info_list[node_index[ind]].neighbor_offset =
neighbor_offset[ind]; neighbor_offset[ind];
graph_list[ind].node_list[node_index[ind]].neighbor_size = graph_list[ind].node_info_list[node_index[ind]].neighbor_size =
neighbors[i].size(); neighbors[i].size();
for (auto x : neighbors[i]) { for (auto x : neighbors[i]) {
graph_list[ind].neighbor_list[neighbor_offset[ind]++] = x; graph_list[ind].neighbor_list[neighbor_offset[ind]++] = x;
......
...@@ -25,7 +25,6 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -25,7 +25,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef PADDLE_WITH_HETERPS #ifdef PADDLE_WITH_HETERPS
#include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h" #include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h"
...@@ -34,11 +33,14 @@ limitations under the License. */ ...@@ -34,11 +33,14 @@ limitations under the License. */
#include <deque> #include <deque>
#include "paddle/fluid/framework/data_set.h" #include "paddle/fluid/framework/data_set.h"
#include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_utils.h"
#include "paddle/fluid/platform/timer.h" #include "paddle/fluid/platform/timer.h"
#if defined(PADDLE_WITH_PSCORE) #if defined(PADDLE_WITH_PSCORE)
#include "paddle/fluid/distributed/ps/table/depends/feature_value.h" #include "paddle/fluid/distributed/ps/table/depends/feature_value.h"
#endif #endif
DECLARE_int32(gpugraph_dedup_pull_push_mode);
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -117,7 +119,6 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr<HeterContext> gpu_task) { ...@@ -117,7 +119,6 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr<HeterContext> gpu_task) {
gpu_task->init(thread_keys_shard_num_, device_num, multi_mf_dim_); gpu_task->init(thread_keys_shard_num_, device_num, multi_mf_dim_);
std::vector<std::thread> threads; std::vector<std::thread> threads;
// data should be in input channel // data should be in input channel
thread_dim_keys_.resize(thread_keys_thread_num_); thread_dim_keys_.resize(thread_keys_thread_num_);
...@@ -135,16 +136,21 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr<HeterContext> gpu_task) { ...@@ -135,16 +136,21 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr<HeterContext> gpu_task) {
std::string data_set_name = std::string(typeid(*dataset_).name()); std::string data_set_name = std::string(typeid(*dataset_).name());
VLOG(0) << "gpu_graph_mode_:" << gpu_graph_mode_;
if (!gpu_graph_mode_) {
if (data_set_name.find("SlotRecordDataset") != std::string::npos) { if (data_set_name.find("SlotRecordDataset") != std::string::npos) {
VLOG(0) << "ps_gpu_wrapper use SlotRecordDataset";
SlotRecordDataset* dataset = (SlotRecordDataset*)(dataset_); SlotRecordDataset* dataset = (SlotRecordDataset*)(dataset_);
auto input_channel = dataset->GetInputChannel(); auto input_channel = dataset->GetInputChannel();
VLOG(0) << "psgpu wrapperinputslotchannle size: " << input_channel->Size(); VLOG(0) << "psgpu wrapperinputslotchannle size: "
<< input_channel->Size();
const std::deque<SlotRecord>& vec_data = input_channel->GetData(); const std::deque<SlotRecord>& vec_data = input_channel->GetData();
total_len = vec_data.size(); total_len = vec_data.size();
len_per_thread = total_len / thread_keys_thread_num_; len_per_thread = total_len / thread_keys_thread_num_;
remain = total_len % thread_keys_thread_num_; remain = total_len % thread_keys_thread_num_;
VLOG(0) << "total len: " << total_len; VLOG(0) << "total len: " << total_len;
auto gen_dynamic_mf_func = [this](const std::deque<SlotRecord>& total_data, auto gen_dynamic_mf_func = [this](
const std::deque<SlotRecord>& total_data,
int begin_index, int begin_index,
int end_index, int end_index,
int i) { int i) {
...@@ -162,7 +168,8 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr<HeterContext> gpu_task) { ...@@ -162,7 +168,8 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr<HeterContext> gpu_task) {
int shard_id = feasign_v[j] % thread_keys_shard_num_; int shard_id = feasign_v[j] % thread_keys_shard_num_;
int dim_id = slot_index_vec_[slot_idx]; int dim_id = slot_index_vec_[slot_idx];
if (feasign_v[j] != 0) { if (feasign_v[j] != 0) {
this->thread_dim_keys_[i][shard_id][dim_id].insert(feasign_v[j]); this->thread_dim_keys_[i][shard_id][dim_id].insert(
feasign_v[j]);
} }
} }
} }
...@@ -182,7 +189,8 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr<HeterContext> gpu_task) { ...@@ -182,7 +189,8 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr<HeterContext> gpu_task) {
t.join(); t.join();
} }
timeline.Pause(); timeline.Pause();
VLOG(0) << "GpuPs build task cost " << timeline.ElapsedSec() << " seconds."; VLOG(0) << "GpuPs build task cost " << timeline.ElapsedSec()
<< " seconds.";
} else { } else {
CHECK(data_set_name.find("MultiSlotDataset") != std::string::npos); CHECK(data_set_name.find("MultiSlotDataset") != std::string::npos);
VLOG(0) << "ps_gpu_wrapper use MultiSlotDataset"; VLOG(0) << "ps_gpu_wrapper use MultiSlotDataset";
...@@ -222,7 +230,67 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr<HeterContext> gpu_task) { ...@@ -222,7 +230,67 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr<HeterContext> gpu_task) {
t.join(); t.join();
} }
timeline.Pause(); timeline.Pause();
VLOG(0) << "GpuPs build task cost " << timeline.ElapsedSec() << " seconds."; VLOG(0) << "GpuPs build task cost " << timeline.ElapsedSec()
<< " seconds.";
}
} else {
VLOG(0) << "PreBuild in GpuGraph mode";
SlotRecordDataset* dataset = (SlotRecordDataset*)(dataset_);
const std::vector<uint64_t>& vec_data = dataset->GetGpuGraphTotalKeys();
total_len = vec_data.size();
len_per_thread = total_len / thread_keys_thread_num_;
VLOG(0) << "GpuGraphTotalKeys: " << total_len;
remain = total_len % thread_keys_thread_num_;
auto gen_graph_data_func = [this](const std::vector<uint64_t>& total_data,
int begin_index,
int end_index,
int i) {
for (auto iter = total_data.begin() + begin_index;
iter != total_data.begin() + end_index;
iter++) {
uint64_t cur_key = *iter;
int shard_id = cur_key % thread_keys_shard_num_;
this->thread_keys_[i][shard_id].insert(cur_key);
}
};
auto gen_graph_dynamic_mf_func =
[this](const std::vector<uint64_t>& total_data,
int begin_index,
int end_index,
int i) {
for (auto iter = total_data.begin() + begin_index;
iter != total_data.begin() + end_index;
iter++) {
uint64_t cur_key = *iter;
int shard_id = cur_key % thread_keys_shard_num_;
// TODO: feasign <-> slot <-> multi_dim
this->thread_dim_keys_[i][shard_id][0].insert(cur_key);
}
};
for (int i = 0; i < thread_keys_thread_num_; i++) {
if (!multi_mf_dim_) {
VLOG(1) << "psgpu graph wrapper genfunc";
threads.push_back(
std::thread(gen_graph_data_func,
std::ref(vec_data),
begin,
begin + len_per_thread + (i < remain ? 1 : 0),
i));
} else {
VLOG(1) << "psgpu graph wrapper genfunc with dynamic mf";
threads.push_back(
std::thread(gen_graph_dynamic_mf_func,
std::ref(vec_data),
begin,
begin + len_per_thread + (i < remain ? 1 : 0),
i));
}
begin += len_per_thread + (i < remain ? 1 : 0);
}
for (std::thread& t : threads) {
t.join();
}
} }
timeline.Start(); timeline.Start();
...@@ -255,6 +323,9 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr<HeterContext> gpu_task) { ...@@ -255,6 +323,9 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr<HeterContext> gpu_task) {
VLOG(0) << "GpuPs task unique cost " << timeline.ElapsedSec() << " seconds."; VLOG(0) << "GpuPs task unique cost " << timeline.ElapsedSec() << " seconds.";
for (int i = 0; i < thread_keys_shard_num_; i++) { for (int i = 0; i < thread_keys_shard_num_; i++) {
for (int j = 0; j < multi_mf_dim_; j++) { for (int j = 0; j < multi_mf_dim_; j++) {
if (i == 0 && j == multi_mf_dim_ - 1) {
gpu_task->feature_dim_keys_[i][j].push_back(0);
}
VLOG(0) << "GpuPs shard: " << i << "mf dim: " << index_dim_vec_[j] VLOG(0) << "GpuPs shard: " << i << "mf dim: " << index_dim_vec_[j]
<< " key len: " << gpu_task->feature_dim_keys_[i][j].size(); << " key len: " << gpu_task->feature_dim_keys_[i][j].size();
gpu_task->value_dim_ptr_[i][j].resize( gpu_task->value_dim_ptr_[i][j].resize(
...@@ -640,7 +711,7 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr<HeterContext> gpu_task) { ...@@ -640,7 +711,7 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr<HeterContext> gpu_task) {
} }
std::vector<std::thread> threads(device_num); std::vector<std::thread> threads(device_num);
auto accessor_wrapper_ptr = auto accessor_wrapper_ptr =
GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); GlobalAccessorFactory::GetInstance().GetAccessorWrapper();
HeterPs_ = HeterPsBase::get_instance( HeterPs_ = HeterPsBase::get_instance(
size_max, resource_, fleet_config_, accessor_class_, optimizer_type_); size_max, resource_, fleet_config_, accessor_class_, optimizer_type_);
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
...@@ -824,6 +895,7 @@ void PSGPUWrapper::LoadIntoMemory(bool is_shuffle) { ...@@ -824,6 +895,7 @@ void PSGPUWrapper::LoadIntoMemory(bool is_shuffle) {
dataset_->LocalShuffle(); dataset_->LocalShuffle();
} }
InitSlotInfo(); InitSlotInfo();
gpu_graph_mode_ = dataset_->GetGpuGraphMode();
std::shared_ptr<HeterContext> gpu_task = gpu_task_pool_.Get(); std::shared_ptr<HeterContext> gpu_task = gpu_task_pool_.Get();
gpu_task->Reset(); gpu_task->Reset();
...@@ -890,15 +962,22 @@ void PSGPUWrapper::BeginPass() { ...@@ -890,15 +962,22 @@ void PSGPUWrapper::BeginPass() {
platform::errors::Fatal("[BeginPass] current task is not ended.")); platform::errors::Fatal("[BeginPass] current task is not ended."));
} }
debug_gpu_memory_info("befor build task");
build_task(); build_task();
debug_gpu_memory_info("after build task");
timer.Pause(); timer.Pause();
if (current_task_ == nullptr) { if (current_task_ == nullptr) {
PADDLE_THROW(platform::errors::Fatal( PADDLE_THROW(platform::errors::Fatal(
"[BeginPass] after build_task, current task is not null.")); "[BeginPass] after build_task, current task is not null."));
} }
if (FLAGS_gpugraph_dedup_pull_push_mode) {
VLOG(0) << "BeginPass end, cost time: " << timer.ElapsedSec()
<< "s, enable pull push dedup mode="
<< FLAGS_gpugraph_dedup_pull_push_mode;
} else {
VLOG(0) << "BeginPass end, cost time: " << timer.ElapsedSec() << "s"; VLOG(0) << "BeginPass end, cost time: " << timer.ElapsedSec() << "s";
}
} }
void PSGPUWrapper::EndPass() { void PSGPUWrapper::EndPass() {
...@@ -919,7 +998,7 @@ void PSGPUWrapper::EndPass() { ...@@ -919,7 +998,7 @@ void PSGPUWrapper::EndPass() {
} }
int thread_num = 8; int thread_num = 8;
auto accessor_wrapper_ptr = auto accessor_wrapper_ptr =
GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); GlobalAccessorFactory::GetInstance().GetAccessorWrapper();
auto dump_pool_to_cpu_func = [this, thread_num, &accessor_wrapper_ptr]( auto dump_pool_to_cpu_func = [this, thread_num, &accessor_wrapper_ptr](
int i, int j, int z) { int i, int j, int z) {
PADDLE_ENFORCE_GPU_SUCCESS(cudaSetDevice(this->resource_->dev_id(i))); PADDLE_ENFORCE_GPU_SUCCESS(cudaSetDevice(this->resource_->dev_id(i)));
...@@ -961,30 +1040,7 @@ void PSGPUWrapper::EndPass() { ...@@ -961,30 +1040,7 @@ void PSGPUWrapper::EndPass() {
size_t local_offset = (i - left) * feature_value_size; size_t local_offset = (i - left) * feature_value_size;
float* gpu_val = (float*)(test_build_values + local_offset); float* gpu_val = (float*)(test_build_values + local_offset);
#ifdef PADDLE_WITH_PSLIB #ifdef PADDLE_WITH_PSLIB
auto* downpour_value = // TODO: PSLIB DumpFill
(paddle::ps::DownpourFixedFeatureValue*)(gpu_val->cpu_ptr);
int downpour_value_size = downpour_value->size();
if (gpu_val->mf_size > 0 && downpour_value_size == 8) {
downpour_value->resize(gpu_val->mf_dim + 1 + downpour_value_size);
}
float* cpu_val = downpour_value->data();
cpu_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue::
delta_score_index()] = gpu_val->delta_score;
cpu_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue::
show_index()] = gpu_val->show;
cpu_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue::
click_index()] = gpu_val->clk;
cpu_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue::
embed_w_index()] = gpu_val->lr;
cpu_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue::
embed_g2sum_index()] = gpu_val->lr_g2sum;
cpu_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue::
slot_index()] = gpu_val->slot;
if (gpu_val->mf_size > 0) {
for (int x = 0; x < gpu_val->mf_dim + 1; x++) {
cpu_val[x + 8] = gpu_val->mf[x];
}
}
#endif #endif
#ifdef PADDLE_WITH_PSCORE #ifdef PADDLE_WITH_PSCORE
accessor_wrapper_ptr->DumpFill(gpu_val, cpu_table_accessor_, mf_dim); accessor_wrapper_ptr->DumpFill(gpu_val, cpu_table_accessor_, mf_dim);
...@@ -1043,39 +1099,150 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, ...@@ -1043,39 +1099,150 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place,
platform::Timer all_timer; platform::Timer all_timer;
platform::Timer pull_gpups_timer; platform::Timer pull_gpups_timer;
all_timer.Start(); all_timer.Start();
size_t total_length =
std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL);
size_t feature_value_size = 0;
auto accessor_wrapper_ptr = auto accessor_wrapper_ptr =
GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); GlobalAccessorFactory::GetInstance().GetAccessorWrapper();
feature_value_size = accessor_wrapper_ptr->GetFeatureValueSize(max_mf_dim_); size_t feature_value_size =
accessor_wrapper_ptr->GetPullValueSize(max_mf_dim_);
VLOG(3) << "PullSparse max_dim:" << max_mf_dim_ VLOG(3) << "PullSparse max_dim:" << max_mf_dim_
<< " feature_value_size:" << feature_value_size; << " pull_feature_value_size:" << pull_type_size_;
#ifdef PADDLE_WITH_CUDA
VLOG(3) << "Begine Gpu Ps PullSparse";
auto buf = memory::Alloc(place, total_length * feature_value_size);
float* total_values_gpu = reinterpret_cast<float*>(buf->ptr());
#endif
#ifdef PADDLE_WITH_XPU_KP
VLOG(3) << "Begine Xpu Ps PullSparse";
FeatureValue* total_values_gpu = nullptr;
xpu_malloc(reinterpret_cast<void**>(&total_values_gpu),
total_length * feature_value_size);
#endif
if (platform::is_cpu_place(place)) { if (platform::is_cpu_place(place)) {
PADDLE_THROW(platform::errors::Unimplemented( PADDLE_THROW(platform::errors::Unimplemented(
"Warning:: CPUPlace is not supported in GpuPs now.")); "Warning:: CPUPlace is not supported in GpuPs now."));
} else if (platform::is_gpu_place(place)) { } else if (platform::is_gpu_place(place)) {
VLOG(3) << "Begin copy keys, key_num[" << total_length << "]"; #ifdef PADDLE_WITH_CUDA
int device_id = place.GetDeviceId(); int device_id = place.GetDeviceId();
int devid_2_index = HeterPs_->get_index_by_devid(device_id); int devid_2_index = HeterPs_->get_index_by_devid(device_id);
if (FLAGS_gpugraph_dedup_pull_push_mode > 0) {
auto& dev = device_caches_[devid_2_index];
int slot_num = static_cast<int>(slot_lengths.size());
std::vector<int64_t> slot_lengths_lod;
slot_lengths_lod.reserve(slot_num + 1);
slot_lengths_lod.push_back(0);
int64_t total_length = 0;
for (int i = 0; i < slot_num; ++i) {
total_length += slot_lengths[i];
slot_lengths_lod.push_back(total_length);
}
dev.total_key_length = total_length;
VLOG(3) << "[" << device_id << "]Begin copy keys, key_num["
<< total_length << "] dedup mode";
auto stream = dynamic_cast<platform::CUDADeviceContext*>(
platform::DeviceContextPool::Instance().Get(place))
->stream();
uint64_t* total_keys = dev.keys_tensor.mutable_data<uint64_t>(
(total_length * 3) * sizeof(uint64_t), place);
int* gpu_slot_dims = dev.dims_tensor.mutable_data<int>(
slot_dim.size() * sizeof(int), place);
uint64_t** gpu_keys = dev.keys_ptr_tensor.mutable_data<uint64_t*>(
keys.size() * sizeof(uint64_t*), place);
int64_t* slot_lens = dev.slot_lens.mutable_data<int64_t>(
(slot_num + 1) * sizeof(int64_t), place);
cudaMemcpyAsync(gpu_keys,
keys.data(),
keys.size() * sizeof(uint64_t*),
cudaMemcpyHostToDevice,
stream);
cudaMemcpyAsync(slot_lens,
slot_lengths_lod.data(),
slot_lengths_lod.size() * sizeof(int64_t),
cudaMemcpyHostToDevice,
stream);
cudaMemcpyAsync(gpu_slot_dims,
slot_dim.data(),
slot_dim.size() * sizeof(int),
cudaMemcpyHostToDevice,
stream);
float** gpu_values = dev.values_ptr_tensor.mutable_data<float*>(
values.size() * sizeof(float*), place);
cudaMemcpyAsync(gpu_values,
values.data(),
values.size() * sizeof(float*),
cudaMemcpyHostToDevice,
stream);
int* key2slot = dev.keys2slot.mutable_data<int>(
(total_length * 5) * sizeof(int), place);
this->CopyKeys(place,
gpu_keys,
total_keys,
slot_lens,
slot_num,
static_cast<int>(total_length),
key2slot);
uint32_t* d_restore_idx =
reinterpret_cast<uint32_t*>(&key2slot[total_length]);
uint32_t* d_sorted_idx =
reinterpret_cast<uint32_t*>(&d_restore_idx[total_length]);
uint32_t* d_offset =
reinterpret_cast<uint32_t*>(&d_sorted_idx[total_length]);
uint32_t* d_merged_cnts =
reinterpret_cast<uint32_t*>(&d_offset[total_length]);
uint64_t* d_merged_keys =
reinterpret_cast<uint64_t*>(&total_keys[total_length]);
uint64_t* d_sorted_keys =
reinterpret_cast<uint64_t*>(&d_merged_keys[total_length]);
int dedup_size = HeterPs_->dedup_keys_and_fillidx(
devid_2_index,
static_cast<int>(total_length),
total_keys, // input
d_merged_keys, // output
d_sorted_keys, // sort keys
d_restore_idx, // pull fill idx
d_sorted_idx, // sort old idx
d_offset, // offset
d_merged_cnts,
FLAGS_gpugraph_dedup_pull_push_mode & 0x02);
// printf("device %d, end dedup_keys_and_fillidx total %d, "
// "dedup_size %d, slot num: %d, value size: %d\n",
// device_id, int(total_length), dedup_size, slot_num,
// int(feature_value_size));
PADDLE_ENFORCE_GT(dedup_size,
0,
platform::errors::PreconditionNotMet(
"dedup keys need more than zero failed in BoxPS."));
dev.dedup_key_length = dedup_size;
int64_t total_bytes = dedup_size * feature_value_size;
float* total_values_gpu =
dev.pull_push_tensor.mutable_data<float>(total_bytes, place);
pull_gpups_timer.Start();
HeterPs_->pull_sparse(
devid_2_index, d_merged_keys, total_values_gpu, dedup_size);
// values.size() not sure equal slot_num
accessor_wrapper_ptr->CopyForPull(place,
total_keys,
gpu_values,
total_values_gpu,
slot_lens,
key2slot,
max_mf_dim_ + 3,
total_length,
gpu_slot_dims,
d_restore_idx,
feature_value_size);
} else {
size_t total_length =
std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL);
auto buf = memory::Alloc(place, total_length * feature_value_size);
float* total_values_gpu = reinterpret_cast<float*>(buf->ptr());
VLOG(3) << "Begin copy keys, key_num[" << total_length << "]";
LoDTensor& total_keys_tensor = keys_tensor[devid_2_index]; LoDTensor& total_keys_tensor = keys_tensor[devid_2_index];
uint64_t* total_keys = uint64_t* total_keys =
reinterpret_cast<uint64_t*>(total_keys_tensor.mutable_data<int64_t>( reinterpret_cast<uint64_t*>(total_keys_tensor.mutable_data<int64_t>(
{int64_t(total_length), 1}, place)); {int64_t(total_length), 1}, place));
// construct slot_level lod info // construct slot_level lod info
auto slot_lengths_lod = slot_lengths; auto slot_lengths_lod = slot_lengths;
for (size_t i = 1; i < slot_lengths_lod.size(); i++) { for (size_t i = 1; i < slot_lengths_lod.size(); i++) {
...@@ -1127,18 +1294,25 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, ...@@ -1127,18 +1294,25 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place,
hidden_size, hidden_size,
total_length, total_length,
gpu_dim, gpu_dim,
val_type_size_); feature_value_size);
}
pull_gpups_timer.Pause(); pull_gpups_timer.Pause();
#endif
} else if (platform::is_xpu_place(place)) { } else if (platform::is_xpu_place(place)) {
#ifdef PADDLE_WITH_XPU_KP #ifdef PADDLE_WITH_XPU_KP
VLOG(3) << "Begine Xpu Ps PullSparse";
size_t total_length =
std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL);
FeatureValue* total_values_gpu = nullptr;
xpu_malloc(reinterpret_cast<void**>(&total_values_gpu),
total_length * feature_value_size);
VLOG(3) << "Begin copy keys, key_num[" << total_length << "]"; VLOG(3) << "Begin copy keys, key_num[" << total_length << "]";
int device_id = place.GetDeviceId(); int device_id = place.GetDeviceId();
int devid_2_index = HeterPs_->get_index_by_devid(device_id); int devid_2_index = HeterPs_->get_index_by_devid(device_id);
LoDTensor& total_keys_tensor = keys_tensor[devid_2_index]; LoDTensor& total_keys_tensor = keys_tensor[devid_2_index];
uint64_t* total_keys = reinterpret_cast<uint64_t*>( uint64_t* total_keys =
total_keys_tensor.mutable_data<int64_t>({total_length, 1}, place)); reinterpret_cast<uint64_t*>(total_keys_tensor.mutable_data<int64_t>(
{int64_t(total_length), 1}, place));
// construct slot_level lod info // construct slot_level lod info
auto slot_lengths_lod = slot_lengths; auto slot_lengths_lod = slot_lengths;
...@@ -1185,7 +1359,7 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, ...@@ -1185,7 +1359,7 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place,
static_cast<int>(slot_lengths.size()), static_cast<int>(slot_lengths.size()),
hidden_size, hidden_size,
total_length, total_length,
val_type_size_); feature_value_size);
#endif #endif
} else { } else {
PADDLE_THROW(platform::errors::PreconditionNotMet( PADDLE_THROW(platform::errors::PreconditionNotMet(
...@@ -1208,17 +1382,10 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place, ...@@ -1208,17 +1382,10 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place,
platform::Timer all_timer; platform::Timer all_timer;
platform::Timer push_gpups_timer; platform::Timer push_gpups_timer;
all_timer.Start(); all_timer.Start();
int64_t total_length =
std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL);
// #ifdef PADDLE_WITH_CUDA
VLOG(3) << "Begin GPUPS PushSparseGrad";
auto accessor_wrapper_ptr = auto accessor_wrapper_ptr =
GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); GlobalAccessorFactory::GetInstance().GetAccessorWrapper();
size_t grad_value_size = accessor_wrapper_ptr->GetPushValueSize(max_mf_dim_); size_t grad_value_size = accessor_wrapper_ptr->GetPushValueSize(max_mf_dim_);
auto buf = memory::Alloc(place, total_length * grad_value_size);
VLOG(3) << "Push Sparse Max mf dimention: " << max_mf_dim_
<< "grad_value_size:" << grad_value_size;
float* total_grad_values_gpu = reinterpret_cast<float*>(buf->ptr());
if (platform::is_cpu_place(place)) { if (platform::is_cpu_place(place)) {
PADDLE_THROW(platform::errors::Unimplemented( PADDLE_THROW(platform::errors::Unimplemented(
"Warning:: CPUPlace is not supported in GPUPS now.")); "Warning:: CPUPlace is not supported in GPUPS now."));
...@@ -1226,10 +1393,107 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place, ...@@ -1226,10 +1393,107 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place,
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
int device_id = place.GetDeviceId(); int device_id = place.GetDeviceId();
int devid_2_index = HeterPs_->get_index_by_devid(device_id); int devid_2_index = HeterPs_->get_index_by_devid(device_id);
LoDTensor& cached_total_keys_tensor = keys_tensor[devid_2_index]; if (FLAGS_gpugraph_dedup_pull_push_mode > 0) {
auto& dev = device_caches_[devid_2_index];
int64_t total_length = dev.total_key_length;
VLOG(3) << "Begin push sparse, key_num[" << total_length
<< "] dedup mode, device:" << device_id << ", index"
<< devid_2_index;
auto stream = dynamic_cast<platform::CUDADeviceContext*>(
platform::DeviceContextPool::Instance().Get(place))
->stream();
uint64_t* total_keys = dev.keys_tensor.data<uint64_t>();
int* slot_dims = dev.dims_tensor.data<int>();
int slot_num = static_cast<int>(slot_lengths.size());
if (!dev.d_slot_vector.IsInitialized()) {
int* buf_slot_vector =
dev.d_slot_vector.mutable_data<int>(slot_num * sizeof(int), place);
cudaMemcpyAsync(buf_slot_vector,
slot_vector_.data(),
slot_num * sizeof(int),
cudaMemcpyHostToDevice,
stream);
}
const int64_t* slot_lens = dev.slot_lens.data<int64_t>();
const int* d_slot_vector = dev.d_slot_vector.data<int>();
const int* key2slot = dev.keys2slot.data<int>();
float** gpu_values = dev.values_ptr_tensor.data<float*>();
cudaMemcpyAsync(gpu_values,
grad_values.data(),
grad_values.size() * sizeof(float*),
cudaMemcpyHostToDevice,
stream);
uint64_t* d_merged_keys = &total_keys[total_length];
int64_t dedup_size = dev.dedup_key_length;
int64_t total_bytes = dedup_size * grad_value_size;
float* total_grad_values_gpu =
dev.pull_push_tensor.mutable_data<float>(total_bytes, place);
// dedup rate more than 3
if (total_length > dedup_size * 3) {
const uint32_t* d_restore_idx =
reinterpret_cast<const uint32_t*>(&key2slot[total_length]);
accessor_wrapper_ptr->CopyForPush(place,
total_keys,
gpu_values,
total_grad_values_gpu,
d_slot_vector,
slot_lens,
max_mf_dim_ + 3,
total_length,
dedup_size,
batch_size,
slot_dims,
key2slot,
d_restore_idx,
grad_value_size);
} else {
const uint32_t* d_sorted_idx =
reinterpret_cast<const uint32_t*>(&key2slot[total_length * 2]);
const uint32_t* d_offset =
reinterpret_cast<const uint32_t*>(&d_sorted_idx[total_length]);
const uint32_t* d_merged_cnts =
reinterpret_cast<const uint32_t*>(&d_offset[total_length]);
accessor_wrapper_ptr->CopyForPush(place,
d_merged_keys,
gpu_values,
total_grad_values_gpu,
d_slot_vector,
slot_lens,
max_mf_dim_ + 3,
total_length,
dedup_size,
batch_size,
slot_dims,
key2slot,
d_sorted_idx,
d_offset,
d_merged_cnts,
grad_value_size);
}
push_gpups_timer.Start();
HeterPs_->push_sparse(devid_2_index,
d_merged_keys,
total_grad_values_gpu,
static_cast<int>(dedup_size));
} else {
int64_t total_length =
std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL);
VLOG(3) << "Begin GPUPS PushSparseGrad";
auto buf = memory::Alloc(place, total_length * grad_value_size);
VLOG(3) << "Push Sparse Max mf dimention: " << max_mf_dim_
<< "grad_value_size:" << grad_value_size;
float* total_grad_values_gpu = reinterpret_cast<float*>(buf->ptr());
LoDTensor& total_keys_tensor = keys_tensor[devid_2_index];
uint64_t* total_keys = uint64_t* total_keys =
reinterpret_cast<uint64_t*>(cached_total_keys_tensor.data<int64_t>()); reinterpret_cast<uint64_t*>(total_keys_tensor.data<int64_t>());
VLOG(3) << "Begin copy grad tensor to gpups struct"; VLOG(3) << "Begin copy grad tensor to gpups struct";
accessor_wrapper_ptr->CopyForPush(place, accessor_wrapper_ptr->CopyForPush(place,
grad_values, grad_values,
total_grad_values_gpu, total_grad_values_gpu,
...@@ -1247,15 +1511,24 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place, ...@@ -1247,15 +1511,24 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place,
total_keys, total_keys,
total_grad_values_gpu, total_grad_values_gpu,
static_cast<int>(total_length)); static_cast<int>(total_length));
}
push_gpups_timer.Pause(); push_gpups_timer.Pause();
#endif #endif
} else if (platform::is_xpu_place(place)) { } else if (platform::is_xpu_place(place)) {
#ifdef PADDLE_WITH_XPU_KP #ifdef PADDLE_WITH_XPU_KP
int device_id = place.GetDeviceId(); int device_id = place.GetDeviceId();
int devid_2_index = HeterPs_->get_index_by_devid(device_id); int devid_2_index = HeterPs_->get_index_by_devid(device_id);
LoDTensor& cached_total_keys_tensor = keys_tensor[devid_2_index]; int64_t total_length =
std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL);
VLOG(3) << "Begin GPUPS PushSparseGrad";
auto buf = memory::Alloc(place, total_length * grad_value_size);
VLOG(3) << "Push Sparse Max mf dimention: " << max_mf_dim_
<< "grad_value_size:" << grad_value_size;
float* total_grad_values_gpu = reinterpret_cast<float*>(buf->ptr());
LoDTensor& total_keys_tensor = keys_tensor[devid_2_index];
uint64_t* total_keys = uint64_t* total_keys =
reinterpret_cast<uint64_t*>(cached_total_keys_tensor.data<int64_t>()); reinterpret_cast<uint64_t*>(total_keys_tensor.data<int64_t>());
VLOG(3) << "Begin copy grad tensor to xpups struct"; VLOG(3) << "Begin copy grad tensor to xpups struct";
accessor_wrapper_ptr->CopyForPush(place, accessor_wrapper_ptr->CopyForPush(place,
grad_values, grad_values,
...@@ -1288,6 +1561,6 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place, ...@@ -1288,6 +1561,6 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place,
VLOG(3) << "End PushSparseGrad"; VLOG(3) << "End PushSparseGrad";
} }
} // end namespace framework } // namespace framework
} // end namespace paddle } // end namespace paddle
#endif #endif
...@@ -22,10 +22,15 @@ limitations under the License. */ ...@@ -22,10 +22,15 @@ limitations under the License. */
#include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h" #include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
// Threads per block used by the kernel launches in this file.
const int CUDA_NUM_THREADS = platform::PADDLE_CUDA_NUM_THREADS;
// Number of blocks needed to cover N elements (ceiling division). The
// argument is parenthesized so that expression arguments such as `a + b` or
// `c ? x : y` expand with the intended precedence.
#define GET_BLOCK(N) (((N) + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS)
// Expands to the first three launch-configuration arguments
// (grid size, block size, 0); the caller appends the stream.
#define CUDA_BLOCK(N) GET_BLOCK(N), CUDA_NUM_THREADS, 0
__global__ void CopyKeysKernel(uint64_t** src_keys, __global__ void CopyKeysKernel(uint64_t** src_keys,
uint64_t* dest_total_keys, uint64_t* dest_total_keys,
const int64_t* len, const int64_t* len,
...@@ -93,6 +98,44 @@ void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place, ...@@ -93,6 +98,44 @@ void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place,
cudaStreamSynchronize(stream); cudaStreamSynchronize(stream);
} }
// Gathers the keys of all slots into one flat array and records, for every
// flat position, the slot it came from.
//
// For each flat index i in [0, total_len) the kernel binary-searches
// `slot_lens` for the slot that contains i, writes that slot index to
// key2slots[i], and copies src_keys[slot][i - slot_lens[slot]] to
// dest_total_keys[i].
//
// NOTE(review): the indexing implies slot_lens[k] is the flat start offset of
// slot k (a prefix sum) rather than a per-slot length -- confirm with callers.
__global__ void CopyKeysKernel2(const int total_len,
                                uint64_t** src_keys,
                                uint64_t* dest_total_keys,
                                const int slot_num,
                                const int64_t* slot_lens,
                                int* key2slots) {
  CUDA_KERNEL_LOOP(i, total_len) {
    // Narrow [first, last] until it holds the single candidate slot.
    int first = 0;
    int last = slot_num - 1;
    while (first < last) {
      const int probe = (first + last) / 2;
      if (i < slot_lens[probe + 1]) {
        // i lies before the start of slot probe + 1: keep the left half.
        last = probe;
      } else {
        first = probe + 1;
      }
    }
    key2slots[i] = first;
    // Position of the key inside its own slot.
    int offset_in_slot = i - slot_lens[first];
    dest_total_keys[i] = src_keys[first][offset_in_slot];
  }
}
// Flattens the per-slot key arrays in `origin_keys` into `total_keys` and
// fills `key2slot` with the slot index of every flattened key, by launching
// CopyKeysKernel2 on the CUDA stream of `place` and waiting for it to finish.
//
// @param place        device whose CUDADeviceContext stream is used
// @param origin_keys  per-slot key arrays, indexed as origin_keys[slot][pos]
// @param total_keys   output, one entry per flattened key
// @param slot_lens    array the kernel searches to map a flat index to a slot
// @param slot_num     number of slots
// @param total_len    number of flattened keys
// @param key2slot     output, slot index of each flattened key
void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place,
                            uint64_t** origin_keys,
                            uint64_t* total_keys,
                            const int64_t* slot_lens,
                            int slot_num,
                            int total_len,
                            int* key2slot) {
  auto stream = dynamic_cast<platform::CUDADeviceContext*>(
                    platform::DeviceContextPool::Instance().Get(place))
                    ->stream();
  // A non-positive total_len would produce a grid size of zero (or a wrapped
  // negative value), which is an invalid launch configuration; there is
  // nothing to copy in that case, so skip the launch.
  if (total_len > 0) {
    CopyKeysKernel2<<<CUDA_BLOCK(total_len), stream>>>(
        total_len, origin_keys, total_keys, slot_num, slot_lens, key2slot);
  }
  // Synchronize unconditionally so callers see the same stream ordering
  // whether or not a kernel was launched.
  cudaStreamSynchronize(stream);
}
void PSGPUWrapper::SetSparseSGD(float nonclk_coeff, void PSGPUWrapper::SetSparseSGD(float nonclk_coeff,
float clk_coeff, float clk_coeff,
float min_bound, float min_bound,
...@@ -123,7 +166,9 @@ void PSGPUWrapper::SetEmbedxSGD(float mf_create_thresholds, ...@@ -123,7 +166,9 @@ void PSGPUWrapper::SetEmbedxSGD(float mf_create_thresholds,
float mf_max_bound, float mf_max_bound,
float mf_beta1_decay_rate, float mf_beta1_decay_rate,
float mf_beta2_decay_rate, float mf_beta2_decay_rate,
float mf_ada_epsilon) { float mf_ada_epsilon,
float nodeid_slot,
float feature_learning_rate) {
optimizer_config_.set_embedx_sgd(mf_create_thresholds, optimizer_config_.set_embedx_sgd(mf_create_thresholds,
mf_learning_rate, mf_learning_rate,
mf_initial_g2sum, mf_initial_g2sum,
...@@ -132,7 +177,9 @@ void PSGPUWrapper::SetEmbedxSGD(float mf_create_thresholds, ...@@ -132,7 +177,9 @@ void PSGPUWrapper::SetEmbedxSGD(float mf_create_thresholds,
mf_max_bound, mf_max_bound,
mf_beta1_decay_rate, mf_beta1_decay_rate,
mf_beta2_decay_rate, mf_beta2_decay_rate,
mf_ada_epsilon); mf_ada_epsilon,
nodeid_slot,
feature_learning_rate);
} }
} // end namespace framework } // end namespace framework
......
...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#ifdef PADDLE_WITH_HETERPS #ifdef PADDLE_WITH_HETERPS
#include <atomic> #include <atomic>
...@@ -98,20 +97,61 @@ class AfsWrapper { ...@@ -98,20 +97,61 @@ class AfsWrapper {
#endif #endif
class PSGPUWrapper { class PSGPUWrapper {
// Grow-only buffer holder: keeps one shared allocation and replaces it only
// when a request asks for more bytes than the current allocation provides.
class DCacheBuffer {
 public:
  DCacheBuffer() : buf_(nullptr) {}
  ~DCacheBuffer() {}

  /**
   * @Brief Returns a pointer to a buffer of at least `total_bytes` bytes on
   * `place`, allocating (or re-allocating) when needed. Existing contents
   * are not carried over to a new allocation.
   */
  template <typename T>
  T* mutable_data(const size_t total_bytes,
                  const paddle::platform::Place& place) {
    const bool too_small = (buf_ != nullptr) && (buf_->size() < total_bytes);
    if (too_small) {
      // Drop the old allocation before requesting the larger one.
      buf_.reset();
    }
    if (buf_ == nullptr) {
      buf_ = memory::AllocShared(place, total_bytes);
    }
    return reinterpret_cast<T*>(buf_->ptr());
  }

  /**
   * @Brief Returns the current buffer as T*. The buffer must already have
   * been allocated; no null check is performed here.
   */
  template <typename T>
  T* data() {
    return reinterpret_cast<T*>(buf_->ptr());
  }

  // Size in bytes of the current allocation, or 0 when none exists.
  size_t memory_size() { return buf_ ? buf_->size() : 0; }

  // True once an allocation has been made.
  bool IsInitialized(void) { return static_cast<bool>(buf_); }

 private:
  std::shared_ptr<memory::Allocation> buf_ = nullptr;
};
// Bundle of reusable device buffers plus two key counters. InitializeGPU
// allocates one PSDeviceData per device (device_caches_ is an array sized by
// resource_->total_device()).
// NOTE(review): the per-member notes below are inferred from the member
// names only -- confirm against the pull/push implementations.
struct PSDeviceData {
  DCacheBuffer keys_tensor;        // presumably the flattened keys
  DCacheBuffer dims_tensor;        // presumably per-key or per-slot dims
  DCacheBuffer keys_ptr_tensor;    // presumably device array of key pointers
  DCacheBuffer values_ptr_tensor;  // presumably device array of value pointers
  DCacheBuffer pull_push_tensor;   // presumably pull/push value scratch space
  DCacheBuffer slot_lens;          // presumably per-slot offsets/lengths
  DCacheBuffer d_slot_vector;      // presumably device copy of the slot ids
  DCacheBuffer keys2slot;          // presumably key -> slot index mapping
  int64_t total_key_length = 0;    // starts at 0
  int64_t dedup_key_length = 0;    // starts at 0
};
PSDeviceData* device_caches_ = nullptr;
public: public:
~PSGPUWrapper(); ~PSGPUWrapper();
PSGPUWrapper() { PSGPUWrapper() {
HeterPs_ = NULL; HeterPs_ = NULL;
sleep_seconds_before_fail_exit_ = 300; sleep_seconds_before_fail_exit_ = 300;
pull_thread_pool_.resize(thread_keys_shard_num_);
for (size_t i = 0; i < pull_thread_pool_.size(); i++) {
pull_thread_pool_[i].reset(new ::ThreadPool(1));
}
hbm_thread_pool_.resize(thread_keys_shard_num_);
for (size_t i = 0; i < hbm_thread_pool_.size(); i++) {
hbm_thread_pool_[i].reset(new ::ThreadPool(1));
}
} }
void PullSparse(const paddle::platform::Place& place, void PullSparse(const paddle::platform::Place& place,
...@@ -140,6 +180,13 @@ class PSGPUWrapper { ...@@ -140,6 +180,13 @@ class PSGPUWrapper {
const int64_t* gpu_len, const int64_t* gpu_len,
int slot_num, int slot_num,
int total_len); int total_len);
void CopyKeys(const paddle::platform::Place& place,
uint64_t** origin_keys,
uint64_t* total_keys,
const int64_t* gpu_len,
int slot_num,
int total_len,
int* key2slot);
void BuildGPUTask(std::shared_ptr<HeterContext> gpu_task); void BuildGPUTask(std::shared_ptr<HeterContext> gpu_task);
void PreBuildTask(std::shared_ptr<HeterContext> gpu_task); void PreBuildTask(std::shared_ptr<HeterContext> gpu_task);
...@@ -164,6 +211,11 @@ class PSGPUWrapper { ...@@ -164,6 +211,11 @@ class PSGPUWrapper {
pre_build_threads_.join(); pre_build_threads_.join();
s_instance_ = nullptr; s_instance_ = nullptr;
VLOG(3) << "PSGPUWrapper Finalize Finished."; VLOG(3) << "PSGPUWrapper Finalize Finished.";
HeterPs_->show_table_collisions();
if (device_caches_ != nullptr) {
delete[] device_caches_;
device_caches_ = nullptr;
}
} }
void InitializeGPU(const std::vector<int>& dev_ids) { void InitializeGPU(const std::vector<int>& dev_ids) {
...@@ -173,6 +225,7 @@ class PSGPUWrapper { ...@@ -173,6 +225,7 @@ class PSGPUWrapper {
resource_ = std::make_shared<HeterPsResource>(dev_ids); resource_ = std::make_shared<HeterPsResource>(dev_ids);
resource_->enable_p2p(); resource_->enable_p2p();
keys_tensor.resize(resource_->total_device()); keys_tensor.resize(resource_->total_device());
device_caches_ = new PSDeviceData[resource_->total_device()];
#ifdef PADDLE_WITH_GLOO #ifdef PADDLE_WITH_GLOO
auto gloo = paddle::framework::GlooWrapper::GetInstance(); auto gloo = paddle::framework::GlooWrapper::GetInstance();
if (gloo->Size() > 1) { if (gloo->Size() > 1) {
...@@ -256,7 +309,9 @@ class PSGPUWrapper { ...@@ -256,7 +309,9 @@ class PSGPUWrapper {
float mf_max_bound, float mf_max_bound,
float mf_beta1_decay_rate, float mf_beta1_decay_rate,
float mf_beta2_decay_rate, float mf_beta2_decay_rate,
float mf_ada_epsilon); float mf_ada_epsilon,
float nodeid_slot,
float feature_learning_rate);
#ifdef PADDLE_WITH_PSCORE #ifdef PADDLE_WITH_PSCORE
void add_sparse_optimizer( void add_sparse_optimizer(
...@@ -308,6 +363,21 @@ class PSGPUWrapper { ...@@ -308,6 +363,21 @@ class PSGPUWrapper {
void InitializeGPUServer(paddle::distributed::PSParameter ps_param) { void InitializeGPUServer(paddle::distributed::PSParameter ps_param) {
auto sparse_table = auto sparse_table =
ps_param.server_param().downpour_server_param().downpour_table_param(0); ps_param.server_param().downpour_server_param().downpour_table_param(0);
// set build thread_num and shard_num
thread_keys_thread_num_ = sparse_table.shard_num();
thread_keys_shard_num_ = sparse_table.shard_num();
VLOG(1) << "ps_gpu build phase thread_num:" << thread_keys_thread_num_
<< " shard_num:" << thread_keys_shard_num_;
pull_thread_pool_.resize(thread_keys_shard_num_);
for (size_t i = 0; i < pull_thread_pool_.size(); i++) {
pull_thread_pool_[i].reset(new ::ThreadPool(1));
}
hbm_thread_pool_.resize(thread_keys_shard_num_);
for (size_t i = 0; i < hbm_thread_pool_.size(); i++) {
hbm_thread_pool_[i].reset(new ::ThreadPool(1));
}
auto sparse_table_accessor = sparse_table.accessor(); auto sparse_table_accessor = sparse_table.accessor();
auto sparse_table_accessor_parameter = auto sparse_table_accessor_parameter =
sparse_table_accessor.ctr_accessor_param(); sparse_table_accessor.ctr_accessor_param();
...@@ -319,6 +389,11 @@ class PSGPUWrapper { ...@@ -319,6 +389,11 @@ class PSGPUWrapper {
config["clk_coeff"] = sparse_table_accessor_parameter.click_coeff(); config["clk_coeff"] = sparse_table_accessor_parameter.click_coeff();
config["mf_create_thresholds"] = sparse_table_accessor.embedx_threshold(); config["mf_create_thresholds"] = sparse_table_accessor.embedx_threshold();
config["nodeid_slot"] =
sparse_table_accessor.graph_sgd_param().nodeid_slot();
config["feature_learning_rate"] =
sparse_table_accessor.graph_sgd_param().feature_learning_rate();
if (accessor_class_ == "CtrDymfAccessor") { if (accessor_class_ == "CtrDymfAccessor") {
// optimizer config for embed_w and embedx // optimizer config for embed_w and embedx
add_sparse_optimizer(config, sparse_table_accessor.embed_sgd_param()); add_sparse_optimizer(config, sparse_table_accessor.embed_sgd_param());
...@@ -327,8 +402,8 @@ class PSGPUWrapper { ...@@ -327,8 +402,8 @@ class PSGPUWrapper {
} }
fleet_config_ = config; fleet_config_ = config;
GlobalAccessorTransfor::GetInstance().Init(accessor_class_); GlobalAccessorFactory::GetInstance().Init(accessor_class_);
GlobalAccessorTransfor::GetInstance().GetAccessorWrapper()->Configure( GlobalAccessorFactory::GetInstance().GetAccessorWrapper()->Configure(
config); config);
InitializeGPUServer(config); InitializeGPUServer(config);
} }
...@@ -394,6 +469,16 @@ class PSGPUWrapper { ...@@ -394,6 +469,16 @@ class PSGPUWrapper {
float mf_ada_epsilon = (config.find("mf_ada_epsilon") == config.end()) float mf_ada_epsilon = (config.find("mf_ada_epsilon") == config.end())
? 1e-8 ? 1e-8
: config["mf_ada_epsilon"]; : config["mf_ada_epsilon"];
float feature_learning_rate =
(config.find("feature_learning_rate") == config.end())
? 0.05
: config["feature_learning_rate"];
float nodeid_slot = (config.find("nodeid_slot") == config.end())
? 9008
: config["nodeid_slot"];
this->SetSparseSGD(nonclk_coeff, this->SetSparseSGD(nonclk_coeff,
clk_coeff, clk_coeff,
min_bound, min_bound,
...@@ -412,12 +497,18 @@ class PSGPUWrapper { ...@@ -412,12 +497,18 @@ class PSGPUWrapper {
mf_max_bound, mf_max_bound,
mf_beta1_decay_rate, mf_beta1_decay_rate,
mf_beta2_decay_rate, mf_beta2_decay_rate,
mf_ada_epsilon); mf_ada_epsilon,
nodeid_slot,
feature_learning_rate);
// set optimizer type(naive,adagrad,std_adagrad,adam,share_adam) // set optimizer type(naive,adagrad,std_adagrad,adam,share_adam)
optimizer_type_ = (config.find("optimizer_type") == config.end()) optimizer_type_ = (config.find("optimizer_type") == config.end())
? 1 ? 1
: static_cast<int>(config["optimizer_type"]); : int(config["optimizer_type"]);
VLOG(0) << "InitializeGPUServer optimizer_type_:" << optimizer_type_
<< " nodeid_slot:" << nodeid_slot
<< " feature_learning_rate:" << feature_learning_rate;
} }
void SetDate(int year, int month, int day) { void SetDate(int year, int month, int day) {
...@@ -508,11 +599,13 @@ class PSGPUWrapper { ...@@ -508,11 +599,13 @@ class PSGPUWrapper {
} }
auto accessor_wrapper_ptr = auto accessor_wrapper_ptr =
GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); GlobalAccessorFactory::GetInstance().GetAccessorWrapper();
val_type_size_ = accessor_wrapper_ptr->GetFeatureValueSize(max_mf_dim_); val_type_size_ = accessor_wrapper_ptr->GetFeatureValueSize(max_mf_dim_);
grad_type_size_ = accessor_wrapper_ptr->GetPushValueSize(max_mf_dim_); grad_type_size_ = accessor_wrapper_ptr->GetPushValueSize(max_mf_dim_);
pull_type_size_ = accessor_wrapper_ptr->GetPullValueSize(max_mf_dim_);
VLOG(0) << "InitSlotInfo: val_type_size_" << val_type_size_ VLOG(0) << "InitSlotInfo: val_type_size_" << val_type_size_
<< " grad_type_size_:" << grad_type_size_; << " grad_type_size_:" << grad_type_size_
<< " pull_type_size_:" << pull_type_size_;
slot_info_initialized_ = true; slot_info_initialized_ = true;
} }
#endif #endif
...@@ -564,6 +657,7 @@ class PSGPUWrapper { ...@@ -564,6 +657,7 @@ class PSGPUWrapper {
int max_mf_dim_{0}; int max_mf_dim_{0};
size_t val_type_size_{0}; size_t val_type_size_{0};
size_t grad_type_size_{0}; size_t grad_type_size_{0};
size_t pull_type_size_{0};
double time_1 = 0.0; double time_1 = 0.0;
double time_2 = 0.0; double time_2 = 0.0;
...@@ -573,6 +667,7 @@ class PSGPUWrapper { ...@@ -573,6 +667,7 @@ class PSGPUWrapper {
int multi_node_{0}; int multi_node_{0};
int node_size_; int node_size_;
uint64_t table_id_; uint64_t table_id_;
int gpu_graph_mode_ = 0;
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
std::vector<ncclComm_t> inner_comms_; std::vector<ncclComm_t> inner_comms_;
std::vector<ncclComm_t> inter_comms_; std::vector<ncclComm_t> inter_comms_;
......
...@@ -220,52 +220,6 @@ void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place, ...@@ -220,52 +220,6 @@ void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place,
xpu_wait(stream); xpu_wait(stream);
} }
void PSGPUWrapper::SetSparseSGD(float nonclk_coeff,
float clk_coeff,
float min_bound,
float max_bound,
float learning_rate,
float initial_g2sum,
float initial_range,
float beta1_decay_rate,
float beta2_decay_rate,
float ada_epsilon) {
OptimizerConfig optimizer_config;
optimizer_config.set_sparse_sgd(nonclk_coeff,
clk_coeff,
min_bound,
max_bound,
learning_rate,
initial_g2sum,
initial_range,
beta1_decay_rate,
beta2_decay_rate,
ada_epsilon);
HeterPs_->set_sparse_sgd(optimizer_config);
}
void PSGPUWrapper::SetEmbedxSGD(float mf_create_thresholds,
float mf_learning_rate,
float mf_initial_g2sum,
float mf_initial_range,
float mf_min_bound,
float mf_max_bound,
float mf_beta1_decay_rate,
float mf_beta2_decay_rate,
float mf_ada_epsilon) {
OptimizerConfig optimizer_config;
optimizer_config.set_embedx_sgd(mf_create_thresholds,
mf_learning_rate,
mf_initial_g2sum,
mf_initial_range,
mf_min_bound,
mf_max_bound,
mf_beta1_decay_rate,
mf_beta2_decay_rate,
mf_ada_epsilon);
HeterPs_->set_embedx_sgd(optimizer_config);
}
} // end namespace framework } // end namespace framework
} // end namespace paddle } // end namespace paddle
#endif #endif
...@@ -119,6 +119,12 @@ void HogwildWorker::CreateDeviceResource(const ProgramDesc &main_prog) { ...@@ -119,6 +119,12 @@ void HogwildWorker::CreateDeviceResource(const ProgramDesc &main_prog) {
void HogwildWorker::TrainFilesWithProfiler() { void HogwildWorker::TrainFilesWithProfiler() {
platform::SetNumThreads(1); platform::SetNumThreads(1);
#if defined(PADDLE_WITH_HETERPS) && \
(defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL))
platform::SetDeviceId(thread_id_);
#elif defined(PADDLE_WITH_HETERPS) && defined(PADDLE_WITH_XPU_BKCL)
platform::SetXPUDeviceId(thread_id_);
#endif
device_reader_->Start(); device_reader_->Start();
std::vector<double> op_total_time; std::vector<double> op_total_time;
std::vector<std::string> op_name; std::vector<std::string> op_name;
...@@ -175,8 +181,6 @@ void HogwildWorker::TrainFilesWithProfiler() { ...@@ -175,8 +181,6 @@ void HogwildWorker::TrainFilesWithProfiler() {
PrintFetchVars(); PrintFetchVars();
#ifdef PADDLE_WITH_HETERPS #ifdef PADDLE_WITH_HETERPS
dev_ctx_->Wait(); dev_ctx_->Wait();
VLOG(1) << "GpuPs worker " << thread_id_ << " train cost " << total_time
<< " seconds, ins_num: " << total_inst;
for (size_t i = 0; i < op_name.size(); ++i) { for (size_t i = 0; i < op_name.size(); ++i) {
VLOG(1) << "card:" << thread_id_ << ", op: " << op_name[i] VLOG(1) << "card:" << thread_id_ << ", op: " << op_name[i]
<< ", mean time: " << op_total_time[i] / total_inst << ", mean time: " << op_total_time[i] / total_inst
...@@ -201,6 +205,9 @@ void HogwildWorker::TrainFilesWithProfiler() { ...@@ -201,6 +205,9 @@ void HogwildWorker::TrainFilesWithProfiler() {
thread_scope_->DropKids(); thread_scope_->DropKids();
timeline.Start(); timeline.Start();
} }
VLOG(0) << "GpuPs worker " << thread_id_ << " train cost " << total_time
<< " seconds, ins_num: " << total_inst << " read time: " << read_time
<< "seconds ";
if (need_dump_field_ || need_dump_param_) { if (need_dump_field_ || need_dump_param_) {
writer_.Flush(); writer_.Flush();
...@@ -217,16 +224,19 @@ void HogwildWorker::TrainFiles() { ...@@ -217,16 +224,19 @@ void HogwildWorker::TrainFiles() {
platform::SetNumThreads(1); platform::SetNumThreads(1);
platform::Timer timeline; platform::Timer timeline;
timeline.Start(); timeline.Start();
#if defined(PADDLE_WITH_HETERPS) && \
(defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL))
platform::SetDeviceId(thread_id_);
#elif defined(PADDLE_WITH_HETERPS) && defined(PADDLE_WITH_XPU_BKCL)
platform::SetXPUDeviceId(thread_id_);
#endif
int total_ins_num = 0; int total_batch_num = 0;
// how to accumulate fetched values here // how to accumulate fetched values here
device_reader_->Start(); device_reader_->Start();
int cur_batch; int cur_batch;
int batch_cnt = 0; int batch_cnt = 0;
#if defined(PADDLE_WITH_HETERPS) && defined(PADDLE_WITH_CUDA)
platform::SetDeviceId(thread_id_);
#endif
while ((cur_batch = device_reader_->Next()) > 0) { while ((cur_batch = device_reader_->Next()) > 0) {
for (auto &op : ops_) { for (auto &op : ops_) {
bool need_skip = false; bool need_skip = false;
...@@ -248,7 +258,7 @@ void HogwildWorker::TrainFiles() { ...@@ -248,7 +258,7 @@ void HogwildWorker::TrainFiles() {
DumpParam(*thread_scope_, batch_cnt); DumpParam(*thread_scope_, batch_cnt);
} }
total_ins_num += cur_batch; total_batch_num += cur_batch;
++batch_cnt; ++batch_cnt;
PrintFetchVars(); PrintFetchVars();
thread_scope_->DropKids(); thread_scope_->DropKids();
...@@ -257,8 +267,8 @@ void HogwildWorker::TrainFiles() { ...@@ -257,8 +267,8 @@ void HogwildWorker::TrainFiles() {
#endif #endif
} }
timeline.Pause(); timeline.Pause();
VLOG(1) << "worker " << thread_id_ << " train cost " << timeline.ElapsedSec() VLOG(0) << "worker " << thread_id_ << " train cost " << timeline.ElapsedSec()
<< " seconds, ins_num: " << total_ins_num; << " seconds, batch_num: " << total_batch_num;
if (need_dump_field_ || need_dump_param_) { if (need_dump_field_ || need_dump_param_) {
writer_.Flush(); writer_.Flush();
......
...@@ -157,7 +157,7 @@ std::vector<std::string> localfs_list(const std::string& path) { ...@@ -157,7 +157,7 @@ std::vector<std::string> localfs_list(const std::string& path) {
std::shared_ptr<FILE> pipe; std::shared_ptr<FILE> pipe;
int err_no = 0; int err_no = 0;
pipe = shell_popen( pipe = shell_popen(
string::format_string("find %s -type f -maxdepth 1", path.c_str()), string::format_string("find %s -type f -maxdepth 1 | sort", path.c_str()),
"r", "r",
&err_no); &err_no);
string::LineFileReader reader; string::LineFileReader reader;
......
...@@ -128,16 +128,16 @@ void PSGPUWorker::TrainFiles() { ...@@ -128,16 +128,16 @@ void PSGPUWorker::TrainFiles() {
timeline.Start(); timeline.Start();
int total_ins_num = 0; int total_ins_num = 0;
// how to accumulate fetched values here
device_reader_->Start();
int cur_batch;
int batch_cnt = 0;
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
platform::SetDeviceId(thread_id_); platform::SetDeviceId(thread_id_);
#elif defined(PADDLE_WITH_XPU_BKCL) #elif defined(PADDLE_WITH_XPU_BKCL)
platform::SetXPUDeviceId(thread_id_); platform::SetXPUDeviceId(thread_id_);
#endif #endif
// how to accumulate fetched values here
device_reader_->Start();
int cur_batch;
int batch_cnt = 0;
while ((cur_batch = device_reader_->Next()) > 0) { while ((cur_batch = device_reader_->Next()) > 0) {
total_ins_num += cur_batch; total_ins_num += cur_batch;
for (auto& op : ops_) { for (auto& op : ops_) {
......
...@@ -58,7 +58,6 @@ void TrainerBase::DumpWork(int tid) { ...@@ -58,7 +58,6 @@ void TrainerBase::DumpWork(int tid) {
int err_no = 0; int err_no = 0;
// GetDumpPath is implemented in each Trainer // GetDumpPath is implemented in each Trainer
std::string path = GetDumpPath(tid); std::string path = GetDumpPath(tid);
std::shared_ptr<FILE> fp = fs_open_write(path, &err_no, dump_converter_); std::shared_ptr<FILE> fp = fs_open_write(path, &err_no, dump_converter_);
while (1) { while (1) {
std::string out_str; std::string out_str;
......
...@@ -68,7 +68,7 @@ message TrainerDesc { ...@@ -68,7 +68,7 @@ message TrainerDesc {
// add for gpu // add for gpu
optional string fleet_desc = 37; optional string fleet_desc = 37;
optional bool is_dump_in_simple_mode = 38 [ default = false ];
// device worker parameters // device worker parameters
optional HogwildWorkerParameter hogwild_param = 101; optional HogwildWorkerParameter hogwild_param = 101;
optional DownpourWorkerParameter downpour_param = 103; optional DownpourWorkerParameter downpour_param = 103;
......
...@@ -32,7 +32,7 @@ cc_library( ...@@ -32,7 +32,7 @@ cc_library(
if(WITH_TESTING AND NOT WIN32) if(WITH_TESTING AND NOT WIN32)
add_custom_target( add_custom_target(
jit_download_program jit_download_program
COMMAND wget -nc -q COMMAND wget -nc -q --no-check-certificate
https://paddle-ci.gz.bcebos.com/dy2st/multi_program_load.tar.gz https://paddle-ci.gz.bcebos.com/dy2st/multi_program_load.tar.gz
COMMAND tar zxf multi_program_load.tar.gz) COMMAND tar zxf multi_program_load.tar.gz)
set(JIT_DEPS set(JIT_DEPS
......
...@@ -170,7 +170,7 @@ if(WITH_TESTING) ...@@ -170,7 +170,7 @@ if(WITH_TESTING)
if(NOT WIN32) if(NOT WIN32)
add_custom_target( add_custom_target(
download_data download_data
COMMAND wget -nc COMMAND wget -nc --no-check-certificate
https://paddle-ci.cdn.bcebos.com/buddy_allocator_test_data.tar https://paddle-ci.cdn.bcebos.com/buddy_allocator_test_data.tar
COMMAND tar -xf buddy_allocator_test_data.tar) COMMAND tar -xf buddy_allocator_test_data.tar)
add_dependencies(buddy_allocator_test download_data) add_dependencies(buddy_allocator_test download_data)
......
...@@ -68,6 +68,20 @@ PADDLE_DEFINE_EXPORTED_bool( ...@@ -68,6 +68,20 @@ PADDLE_DEFINE_EXPORTED_bool(
"Checking whether operator produce NAN/INF or not. It will be " "Checking whether operator produce NAN/INF or not. It will be "
"extremely slow so please use this flag wisely."); "extremely slow so please use this flag wisely.");
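/**
* Operator related FLAG
* Name: FLAGS_enable_opt_get_features
* Since Version: TODO(review) confirm; this header was copied from
*                FLAGS_check_nan_inf (0.13.0)
* Value Range: bool, default=false
* Example:
* Note: TODO(review): describe what this flag enables. The description in the
*       definition below was copied from FLAGS_check_nan_inf and does not
*       describe this flag.
*/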
PADDLE_DEFINE_EXPORTED_bool(
enable_opt_get_features,
false,
"Checking whether operator produce NAN/INF or not. It will be "
"extremely slow so please use this flag wisely.");
// NOTE(zhiqiu): better to share the flags, otherwise we will have too many // NOTE(zhiqiu): better to share the flags, otherwise we will have too many
// flags. // flags.
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
...@@ -785,6 +799,34 @@ PADDLE_DEFINE_EXPORTED_bool( ...@@ -785,6 +799,34 @@ PADDLE_DEFINE_EXPORTED_bool(
false, false,
"It controls whether to apply IR pass to program when using Fleet APIs"); "It controls whether to apply IR pass to program when using Fleet APIs");
/**
* Distributed related FLAG
* Name: FLAGS_graph_load_in_parallel
* Since Version: 2.2.0
* Value Range: bool, default=false
* Example:
* Note: Controls whether graph nodes and edges are loaded in parallel with
*       multiple threads. If it is not set, graph data is loaded with a
*       single thread.
*/
PADDLE_DEFINE_EXPORTED_bool(graph_load_in_parallel,
false,
"It controls whether load graph node and edge with "
"mutli threads parallely.");
/**
* Distributed related FLAG
* Name: FLAGS_graph_get_neighbor_id
* Since Version: 2.2.0
* Value Range: bool, default=false
* Example:
* Note: Controls whether all neighbor ids are fetched when running on a
*       sub-part of the graph. If it is not set, neighbor ids do not need to
*       be fetched, as when running on the whole graph.
*/
PADDLE_DEFINE_EXPORTED_bool(
graph_get_neighbor_id,
false,
"It controls get all neighbor id when running sub part graph.");
/** /**
* KP kernel related FLAG * KP kernel related FLAG
* Name: FLAGS_run_kp_kernel * Name: FLAGS_run_kp_kernel
...@@ -893,7 +935,33 @@ DEFINE_bool(enable_slotrecord_reset_shrink, ...@@ -893,7 +935,33 @@ DEFINE_bool(enable_slotrecord_reset_shrink,
"enable slotrecord obejct reset shrink memory, default false"); "enable slotrecord obejct reset shrink memory, default false");
DEFINE_bool(enable_ins_parser_file, DEFINE_bool(enable_ins_parser_file,
false, false,
"enable parser ins file , default false"); "enable parser ins file, default false");
PADDLE_DEFINE_EXPORTED_bool(
gpugraph_enable_hbm_table_collision_stat,
false,
"enable hash collisions stat for hbm table, default false");
PADDLE_DEFINE_EXPORTED_double(gpugraph_hbm_table_load_factor,
0.75,
"the load factor of hbm table, default 0.75");
PADDLE_DEFINE_EXPORTED_bool(
gpugraph_enable_gpu_direct_access,
false,
"enable direct access bwtween multi gpu cards, default false");
PADDLE_DEFINE_EXPORTED_bool(
gpugraph_enable_segment_merge_grads,
false,
"enable segment merge gradients while push sparse, default false");
PADDLE_DEFINE_EXPORTED_uint64(
gpugraph_merge_grads_segment_size,
128,
"segment size with segment gradient merge, default 128");
PADDLE_DEFINE_EXPORTED_int32(
gpugraph_dedup_pull_push_mode,
0,
"enable dedup keys while pull push sparse, default 0");
PADDLE_DEFINE_EXPORTED_bool(gpugraph_load_node_list_into_hbm,
true,
"enable load_node_list_into_hbm, default true");
/** /**
* ProcessGroupNCCL related FLAG * ProcessGroupNCCL related FLAG
......
...@@ -365,6 +365,9 @@ void BindDataset(py::module *m) { ...@@ -365,6 +365,9 @@ void BindDataset(py::module *m) {
py::call_guard<py::gil_scoped_release>()) py::call_guard<py::gil_scoped_release>())
.def("enable_pv_merge", .def("enable_pv_merge",
&framework::Dataset::EnablePvMerge, &framework::Dataset::EnablePvMerge,
py::call_guard<py::gil_scoped_release>())
.def("set_gpu_graph_mode",
&framework::Dataset::SetGpuGraphMode,
py::call_guard<py::gil_scoped_release>()); py::call_guard<py::gil_scoped_release>());
py::class_<IterableDatasetWrapper>(*m, "IterableDatasetWrapper") py::class_<IterableDatasetWrapper>(*m, "IterableDatasetWrapper")
......
...@@ -199,13 +199,13 @@ void BindHeterClient(py::module* m) { ...@@ -199,13 +199,13 @@ void BindHeterClient(py::module* m) {
void BindGraphNode(py::module* m) { void BindGraphNode(py::module* m) {
py::class_<GraphNode>(*m, "GraphNode") py::class_<GraphNode>(*m, "GraphNode")
.def(py::init<>()) .def(py::init<>())
.def("get_id", &GraphNode::get_id) .def("get_id", &GraphNode::get_py_id)
.def("get_feature", &GraphNode::get_feature); .def("get_feature", &GraphNode::get_feature);
} }
void BindGraphPyFeatureNode(py::module* m) { void BindGraphPyFeatureNode(py::module* m) {
py::class_<FeatureNode>(*m, "FeatureNode") py::class_<FeatureNode>(*m, "FeatureNode")
.def(py::init<>()) .def(py::init<>())
.def("get_id", &GraphNode::get_id) .def("get_id", &GraphNode::get_py_id)
.def("get_feature", &GraphNode::get_feature); .def("get_feature", &GraphNode::get_feature);
} }
...@@ -359,17 +359,32 @@ void BindGraphGpuWrapper(py::module* m) { ...@@ -359,17 +359,32 @@ void BindGraphGpuWrapper(py::module* m) {
*m, "GraphGpuWrapper") *m, "GraphGpuWrapper")
.def(py::init([]() { return GraphGpuWrapper::GetInstance(); })) .def(py::init([]() { return GraphGpuWrapper::GetInstance(); }))
.def("neighbor_sample", &GraphGpuWrapper::graph_neighbor_sample_v3) .def("neighbor_sample", &GraphGpuWrapper::graph_neighbor_sample_v3)
.def("graph_neighbor_sample", &GraphGpuWrapper::graph_neighbor_sample) .def("graph_neighbor_sample",
py::overload_cast<int, uint64_t*, int, int>(
&GraphGpuWrapper::graph_neighbor_sample))
.def("graph_neighbor_sample",
py::overload_cast<int, int, std::vector<uint64_t>&, int>(
&GraphGpuWrapper::graph_neighbor_sample))
.def("set_device", &GraphGpuWrapper::set_device) .def("set_device", &GraphGpuWrapper::set_device)
.def("set_feature_separator", &GraphGpuWrapper::set_feature_separator)
.def("init_service", &GraphGpuWrapper::init_service) .def("init_service", &GraphGpuWrapper::init_service)
.def("set_up_types", &GraphGpuWrapper::set_up_types) .def("set_up_types", &GraphGpuWrapper::set_up_types)
.def("query_node_list", &GraphGpuWrapper::query_node_list) .def("query_node_list", &GraphGpuWrapper::query_node_list)
.def("add_table_feat_conf", &GraphGpuWrapper::add_table_feat_conf) .def("add_table_feat_conf", &GraphGpuWrapper::add_table_feat_conf)
.def("load_edge_file", &GraphGpuWrapper::load_edge_file) .def("load_edge_file", &GraphGpuWrapper::load_edge_file)
.def("upload_batch", &GraphGpuWrapper::upload_batch) .def("load_node_and_edge", &GraphGpuWrapper::load_node_and_edge)
.def("get_all_id", &GraphGpuWrapper::get_all_id) .def("upload_batch",
.def("init_sample_status", &GraphGpuWrapper::init_sample_status) py::overload_cast<int, int, int, const std::string&>(
.def("free_sample_status", &GraphGpuWrapper::free_sample_status) &GraphGpuWrapper::upload_batch))
.def("upload_batch",
py::overload_cast<int, int, int>(&GraphGpuWrapper::upload_batch))
.def(
"get_all_id",
py::overload_cast<int, int, int, std::vector<std::vector<uint64_t>>*>(
&GraphGpuWrapper::get_all_id))
.def("get_all_id",
py::overload_cast<int, int, std::vector<std::vector<uint64_t>>*>(
&GraphGpuWrapper::get_all_id))
.def("load_next_partition", &GraphGpuWrapper::load_next_partition) .def("load_next_partition", &GraphGpuWrapper::load_next_partition)
.def("make_partitions", &GraphGpuWrapper::make_partitions) .def("make_partitions", &GraphGpuWrapper::make_partitions)
.def("make_complementary_graph", .def("make_complementary_graph",
...@@ -380,7 +395,8 @@ void BindGraphGpuWrapper(py::module* m) { ...@@ -380,7 +395,8 @@ void BindGraphGpuWrapper(py::module* m) {
.def("get_partition", &GraphGpuWrapper::get_partition) .def("get_partition", &GraphGpuWrapper::get_partition)
.def("load_node_weight", &GraphGpuWrapper::load_node_weight) .def("load_node_weight", &GraphGpuWrapper::load_node_weight)
.def("export_partition_files", &GraphGpuWrapper::export_partition_files) .def("export_partition_files", &GraphGpuWrapper::export_partition_files)
.def("load_node_file", &GraphGpuWrapper::load_node_file); .def("load_node_file", &GraphGpuWrapper::load_node_file)
.def("finalize", &GraphGpuWrapper::finalize);
} }
#endif #endif
......
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
#include <ctype.h> #include <ctype.h>
#include <stdio.h> #include <stdio.h>
#include <algorithm>
#include <cstring> #include <cstring>
#include <sstream> #include <sstream>
#include <string> #include <string>
...@@ -221,6 +222,117 @@ std::string join_strings(const Container& strs, ...@@ -221,6 +222,117 @@ std::string join_strings(const Container& strs,
return ss.str(); return ss.str();
} }
// A (pointer, length) pair describing a run of characters. The struct only
// stores the pointer it is given; it has no destructor and never releases
// the characters it refers to.
struct str_ptr {
  const char* ptr;
  size_t len;

  str_ptr(const char* p, size_t n) : ptr(p), len(n) {}

  // Both members are plain values, so member-wise copy/move is exactly what
  // the hand-written constructors did. Taking `const str_ptr&` additionally
  // allows copying from const views, and the defaulted assignment operators
  // make the type usable with algorithms that assign elements.
  str_ptr(const str_ptr& other) = default;
  str_ptr(str_ptr&& other) noexcept = default;
  str_ptr& operator=(const str_ptr& other) = default;
  str_ptr& operator=(str_ptr&& other) noexcept = default;

  // Returns the index of the first character equal to `c` within the first
  // `len` characters, or static_cast<size_t>(-1) when there is none.
  size_t find_ptr(const char c) const {
    for (size_t i = 0; i < len; ++i) {
      if (ptr[i] == c) {
        return i;
      }
    }
    return static_cast<size_t>(-1);
  }

  // Returns an owning std::string holding a copy of the `len` characters.
  std::string to_string(void) const { return std::string(ptr, len); }
};
// Cursor over the character range [ptr, end). Values are read through the
// operator>> overloads, which move the cursor forward.
struct str_ptr_stream {
  char* ptr = nullptr;  // current read position
  char* end = nullptr;  // one past the last character of the range

  str_ptr_stream() {}
  str_ptr_stream(const str_ptr& p) { reset(p.ptr, p.len); }

  // Points the stream at the characters described by `p`.
  void reset(const str_ptr& p) { reset(p.ptr, p.len); }

  // Points the stream at `len` characters starting at `p`. Constness is
  // cast away so the cursor can be handed to the strto* family, which
  // reports its stop position through a `char**`.
  void reset(const char* p, size_t len) {
    char* const begin = const_cast<char*>(p);
    ptr = begin;
    end = begin + len;
  }

  char* cursor(void) { return ptr; }
  char* finish(void) { return end; }
  void set_cursor(char* p) { ptr = p; }

  // True once the cursor has reached the end of the range.
  bool is_finish(void) { return ptr == end; }

  // Generic extraction entry point.
  // NOTE(review): `*this >> x` goes through overload resolution again; this
  // presumably relies on a non-template operator>> for the concrete T being
  // preferred over this template. For a T without such an overload the call
  // would select this template again -- confirm before adding new types.
  template <typename T>
  str_ptr_stream& operator>>(T& x) {
    *this >> x;
    return *this;
  }
};
// Reads a float at the cursor with strtof, then moves the cursor one
// character past the position where parsing stopped, clamped to finish().
inline str_ptr_stream& operator>>(str_ptr_stream& ar, float& c) {
  char* parsed_end = nullptr;
  c = strtof(ar.cursor(), &parsed_end);
  char* const resume = parsed_end + 1;
  ar.set_cursor(resume < ar.finish() ? resume : ar.finish());
  return ar;
}
// Reads a double at the cursor with strtod, then moves the cursor one
// character past the position where parsing stopped, clamped to finish().
inline str_ptr_stream& operator>>(str_ptr_stream& ar, double& c) {
  char* parsed_end = nullptr;
  c = strtod(ar.cursor(), &parsed_end);
  char* const resume = parsed_end + 1;
  ar.set_cursor(resume < ar.finish() ? resume : ar.finish());
  return ar;
}
// Reads a base-10 integer at the cursor with strtol into an int32_t, then
// moves the cursor one character past the position where parsing stopped,
// clamped to finish().
inline str_ptr_stream& operator>>(str_ptr_stream& ar, int32_t& c) {
  char* parsed_end = nullptr;
  c = strtol(ar.cursor(), &parsed_end, 10);
  char* const resume = parsed_end + 1;
  ar.set_cursor(resume < ar.finish() ? resume : ar.finish());
  return ar;
}
// Reads a base-10 integer at the cursor with strtoul into a uint32_t, then
// moves the cursor one character past the position where parsing stopped,
// clamped to finish().
inline str_ptr_stream& operator>>(str_ptr_stream& ar, uint32_t& c) {
  char* parsed_end = nullptr;
  c = strtoul(ar.cursor(), &parsed_end, 10);
  char* const resume = parsed_end + 1;
  ar.set_cursor(resume < ar.finish() ? resume : ar.finish());
  return ar;
}
// Reads a base-10 unsigned 64-bit integer at the cursor, then moves the
// cursor one character past the position where parsing stopped, clamped to
// finish().
inline str_ptr_stream& operator>>(str_ptr_stream& ar, uint64_t& c) {
  char* next = nullptr;
  // strtoull returns unsigned long long, which is at least 64 bits wide.
  // strtoul returns unsigned long, which is only 32 bits on LLP64 targets
  // and would clamp large values to ULONG_MAX there.
  c = strtoull(ar.cursor(), &next, 10);
  ar.set_cursor(std::min(++next, ar.finish()));
  return ar;
}
// Reads a base-10 integer at the cursor with strtoll into an int64_t, then
// moves the cursor one character past the position where parsing stopped,
// clamped to finish().
inline str_ptr_stream& operator>>(str_ptr_stream& ar, int64_t& c) {
  char* parsed_end = nullptr;
  c = strtoll(ar.cursor(), &parsed_end, 10);
  char* const resume = parsed_end + 1;
  ar.set_cursor(resume < ar.finish() ? resume : ar.finish());
  return ar;
}
// Splits the first `len` characters of `str` on `delim` and appends one
// str_ptr per field to `*values`. The appended entries point into `str`; no
// characters are copied.
//
// A run of consecutive delimiters following a field counts as a single
// separator, and delimiters at the end of the range produce no trailing
// empty field. A delimiter at the very start of the range yields an empty
// first field.
//
// Returns the number of fields appended.
inline int split_string_ptr(const char* str,
                            size_t len,
                            char delim,
                            std::vector<str_ptr>* values) {
  if (len == 0) {
    return 0;
  }
  int num = 0;
  const char* p = str;
  const char* const end = str + len;
  const char* last = str;
  while (p < end) {
    if (*p != delim) {
      ++p;
      continue;
    }
    values->emplace_back(last, static_cast<size_t>(p - last));
    ++num;
    ++p;
    // Skip the rest of a delimiter run. The bound check keeps the scan
    // inside [str, end) when the range ends in a delimiter; the input is not
    // required to be NUL-terminated at `end`.
    while (p < end && *p == delim) {
      ++p;
    }
    last = p;
  }
  if (p > last) {
    values->emplace_back(last, static_cast<size_t>(p - last));
    ++num;
  }
  return num;
}
// A helper class for reading lines from file. A line buffer is maintained. It // A helper class for reading lines from file. A line buffer is maintained. It
// doesn't need to know the maximum possible length of a line. // doesn't need to know the maximum possible length of a line.
......
...@@ -530,7 +530,7 @@ class DistributedStrategy(object): ...@@ -530,7 +530,7 @@ class DistributedStrategy(object):
'embed_sparse_initial_range', 'embed_sparse_initial_g2sum', 'embed_sparse_beta1_decay_rate', \ 'embed_sparse_initial_range', 'embed_sparse_initial_g2sum', 'embed_sparse_beta1_decay_rate', \
'embed_sparse_beta2_decay_rate', 'embedx_sparse_optimizer', 'embedx_sparse_learning_rate', \ 'embed_sparse_beta2_decay_rate', 'embedx_sparse_optimizer', 'embedx_sparse_learning_rate', \
'embedx_sparse_weight_bounds', 'embedx_sparse_initial_range', 'embedx_sparse_initial_g2sum', \ 'embedx_sparse_weight_bounds', 'embedx_sparse_initial_range', 'embedx_sparse_initial_g2sum', \
'embedx_sparse_beta1_decay_rate', 'embedx_sparse_beta2_decay_rate'] 'embedx_sparse_beta1_decay_rate', 'embedx_sparse_beta2_decay_rate', 'feature_learning_rate', 'nodeid_slot']
support_sparse_table_class = ['DownpourSparseTable'] support_sparse_table_class = ['DownpourSparseTable']
support_sparse_accessor_class = [ support_sparse_accessor_class = [
'DownpourSparseValueAccessor', 'DownpourCtrAccessor', 'DownpourSparseValueAccessor', 'DownpourCtrAccessor',
...@@ -540,6 +540,11 @@ class DistributedStrategy(object): ...@@ -540,6 +540,11 @@ class DistributedStrategy(object):
from google.protobuf.descriptor import FieldDescriptor from google.protobuf.descriptor import FieldDescriptor
table_param = self.strategy.downpour_table_param table_param = self.strategy.downpour_table_param
def add_graph_config(graph, strategy):
    """Copy the graph settings from ``strategy`` onto ``graph``.

    Keys missing from ``strategy`` fall back to 0.05 for
    ``feature_learning_rate`` and 9008 for ``nodeid_slot``.
    """
    defaults = (('feature_learning_rate', 0.05), ('nodeid_slot', 9008))
    for key, fallback in defaults:
        setattr(graph, key, strategy.get(key, fallback))
def sparse_optimizer_config(sgd, strategy, prefix): def sparse_optimizer_config(sgd, strategy, prefix):
optimizer_name = strategy.get(prefix + "sparse_optimizer", optimizer_name = strategy.get(prefix + "sparse_optimizer",
"adagrad") "adagrad")
...@@ -691,6 +696,7 @@ class DistributedStrategy(object): ...@@ -691,6 +696,7 @@ class DistributedStrategy(object):
config, 'embed_') config, 'embed_')
sparse_optimizer_config(table_data.accessor.embedx_sgd_param, sparse_optimizer_config(table_data.accessor.embedx_sgd_param,
config, 'embedx_') config, 'embedx_')
add_graph_config(table_data.accessor.graph_sgd_param, config)
if not configs: if not configs:
print("fleet desc config is empty") print("fleet desc config is empty")
......
...@@ -155,6 +155,12 @@ class Accessor: ...@@ -155,6 +155,12 @@ class Accessor:
if not accessor_proto.HasField("embedx_threshold"): if not accessor_proto.HasField("embedx_threshold"):
accessor_proto.embedx_threshold = 0 accessor_proto.embedx_threshold = 0
graph_sgd_param = accessor_proto.graph_sgd_param
if not graph_sgd_param.HasField("nodeid_slot"):
graph_sgd_param.nodeid_slot = 9008
if not graph_sgd_param.HasField("feature_learning_rate"):
graph_sgd_param.feature_learning_rate = 0.05
ctr_accessor_param = accessor_proto.ctr_accessor_param ctr_accessor_param = accessor_proto.ctr_accessor_param
if not ctr_accessor_param.HasField("nonclk_coeff"): if not ctr_accessor_param.HasField("nonclk_coeff"):
ctr_accessor_param.nonclk_coeff = 0.1 ctr_accessor_param.nonclk_coeff = 0.1
......
...@@ -933,7 +933,7 @@ def shuffle_batch(x, seed=None): ...@@ -933,7 +933,7 @@ def shuffle_batch(x, seed=None):
seed = helper.create_variable( seed = helper.create_variable(
name=unique_name.generate("shuffle_batch_seed"), name=unique_name.generate("shuffle_batch_seed"),
dtype="int64", dtype="int64",
persistable=True) persistable=False)
helper.append_op(type='shuffle_batch', helper.append_op(type='shuffle_batch',
inputs={ inputs={
'X': x, 'X': x,
......
...@@ -1037,6 +1037,51 @@ class InMemoryDataset(DatasetBase): ...@@ -1037,6 +1037,51 @@ class InMemoryDataset(DatasetBase):
""" """
self.dataset.set_heter_ps(enable_heter_ps) self.dataset.set_heter_ps(enable_heter_ps)
def set_graph_config(self, config):
    """
    Set graph config, user can set graph config in gpu graph mode.

    Each recognised key of ``config`` is written to the matching field of
    ``self.proto_desc.graph_config``; a missing key uses the default shown
    below. The dataset is then switched to gpu graph mode.

    Args:
        config(dict): config dict. Recognised keys and their defaults:
            walk_degree (1), walk_len (20), window (5),
            once_sample_startid_len (8000), sample_times_one_chunk (10),
            batch_size (1), debug_mode (0), first_node_type (""),
            meta_path (""), gpu_graph_training (True).

    Returns:
        None.

    Examples:
        .. code-block:: python

          # required: skiptest
          import paddle.fluid as fluid
          from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
          dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
          graph_config = {"walk_len": 24,
                  "walk_degree": 10,
                  "once_sample_startid_len": 80000,
                  "sample_times_one_chunk": 5,
                  "window": 3,
                  "debug_mode": 0,
                  "batch_size": 800,
                  "meta_path": "cuid2clk-clk2cuid;cuid2conv-conv2cuid;clk2cuid-cuid2clk;clk2cuid-cuid2conv",
                  "gpu_graph_training": 1}
          dataset.set_graph_config(graph_config)

    """
    # (field name, default) pairs, applied in this order.
    field_defaults = (
        ("walk_degree", 1),
        ("walk_len", 20),
        ("window", 5),
        ("once_sample_startid_len", 8000),
        ("sample_times_one_chunk", 10),
        ("batch_size", 1),
        ("debug_mode", 0),
        ("first_node_type", ""),
        ("meta_path", ""),
        ("gpu_graph_training", True),
    )
    for field, fallback in field_defaults:
        setattr(self.proto_desc.graph_config, field,
                config.get(field, fallback))
    self.dataset.set_gpu_graph_mode(True)
class QueueDataset(DatasetBase): class QueueDataset(DatasetBase):
""" """
......
...@@ -744,6 +744,65 @@ class TestDataset(unittest.TestCase): ...@@ -744,6 +744,65 @@ class TestDataset(unittest.TestCase):
temp_dir.cleanup() temp_dir.cleanup()
def test_run_with_inmemory_dataset_train_debug_mode(self):
    """
    Testcase for InMemoryDataset from create to run.

    Writes two small slot-record files, loads them into an
    InMemoryDataset and runs train_from_dataset twice with debug=True.
    """
    temp_dir = tempfile.TemporaryDirectory()
    # try/finally so the temporary directory is removed even when a step
    # below raises or an assertion fails.
    try:
        dump_a_path = os.path.join(temp_dir.name,
                                   'test_run_with_dump_a.txt')
        dump_b_path = os.path.join(temp_dir.name,
                                   'test_run_with_dump_b.txt')

        with open(dump_a_path, "w") as f:
            data = "1 a 1 a 1 1 2 3 3 4 5 5 5 5 1 1\n"
            data += "1 b 1 b 1 2 2 3 4 4 6 6 6 6 1 2\n"
            data += "1 c 1 c 1 3 2 3 5 4 7 7 7 7 1 3\n"
            f.write(data)
        with open(dump_b_path, "w") as f:
            data = "1 d 1 d 1 4 2 3 3 4 5 5 5 5 1 4\n"
            data += "1 e 1 e 1 5 2 3 4 4 6 6 6 6 1 5\n"
            data += "1 f 1 f 1 6 2 3 5 4 7 7 7 7 1 6\n"
            data += "1 g 1 g 1 7 2 3 6 4 8 8 8 8 1 7\n"
            f.write(data)

        slots = ["slot1", "slot2", "slot3", "slot4"]
        slots_vars = []
        for slot in slots:
            var = fluid.layers.data(name=slot,
                                    shape=[1],
                                    dtype="int64",
                                    lod_level=1)
            slots_vars.append(var)

        dataset = paddle.distributed.InMemoryDataset()
        dataset.init(batch_size=32,
                     thread_num=1,
                     pipe_command="cat",
                     data_feed_type="SlotRecordInMemoryDataFeed",
                     use_var=slots_vars)
        dataset._init_distributed_settings(parse_ins_id=True,
                                           parse_content=True,
                                           fea_eval=True,
                                           candidate_size=10000)
        dataset.set_filelist([dump_a_path, dump_b_path])
        dataset.load_into_memory()

        paddle.enable_static()

        exe = paddle.static.Executor(paddle.CPUPlace())
        startup_program = paddle.static.Program()
        main_program = paddle.static.Program()
        exe.run(startup_program)
        for _ in range(2):
            try:
                exe.train_from_dataset(main_program, dataset, debug=True)
            except ImportError:
                # Deliberately tolerated, as in the original test.
                # NOTE(review): presumably an optional module needed by
                # debug mode may be missing -- confirm.
                pass
            except Exception as e:
                # Fail with the underlying error so the cause is visible
                # in the test report.
                self.fail("train_from_dataset raised %s: %s" %
                          (type(e).__name__, e))
    finally:
        temp_dir.cleanup()
class TestDatasetWithDataLoader(TestDataset): class TestDatasetWithDataLoader(TestDataset):
""" """
......
...@@ -45,6 +45,17 @@ class TestTrainerDesc(unittest.TestCase): ...@@ -45,6 +45,17 @@ class TestTrainerDesc(unittest.TestCase):
self.assertEqual(mpi_rank, 1) self.assertEqual(mpi_rank, 1)
self.assertEqual(dump_fields_path, "path") self.assertEqual(dump_fields_path, "path")
def test_config_dump_simple(self):
    """
    Testcase for dump_in_simple_mode: the flag passed to
    _set_is_dump_in_simple_mode must be readable back from proto_desc.
    """
    desc = fluid.trainer_desc.TrainerDesc()
    desc._set_dump_fields(["a", "b"])
    desc._set_is_dump_in_simple_mode(True)

    self.assertEqual(desc.proto_desc.is_dump_in_simple_mode, 1)
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -156,6 +156,9 @@ class TrainerDesc(object): ...@@ -156,6 +156,9 @@ class TrainerDesc(object):
for field in dump_fields: for field in dump_fields:
self.proto_desc.dump_fields.append(field) self.proto_desc.dump_fields.append(field)
def _set_is_dump_in_simple_mode(self, is_dump_in_simple_mode):
self.proto_desc.is_dump_in_simple_mode = is_dump_in_simple_mode
def _set_dump_fields_path(self, path): def _set_dump_fields_path(self, path):
self.proto_desc.dump_fields_path = path self.proto_desc.dump_fields_path = path
......
...@@ -84,6 +84,9 @@ class TrainerFactory(object): ...@@ -84,6 +84,9 @@ class TrainerFactory(object):
trainer._set_worker_places(opt_info["worker_places"]) trainer._set_worker_places(opt_info["worker_places"])
if opt_info.get("use_ps_gpu") is not None: if opt_info.get("use_ps_gpu") is not None:
trainer._set_use_ps_gpu(opt_info["use_ps_gpu"]) trainer._set_use_ps_gpu(opt_info["use_ps_gpu"])
if opt_info.get("is_dump_in_simple_mode") is not None:
trainer._set_is_dump_in_simple_mode(
opt_info["is_dump_in_simple_mode"])
if opt_info.get("enable_random_dump") is not None: if opt_info.get("enable_random_dump") is not None:
trainer._set_enable_random_dump( trainer._set_enable_random_dump(
opt_info["enable_random_dump"]) opt_info["enable_random_dump"])
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册