Unverified commit cb525d4e, authored by zmxdream, committed by GitHub

[Pglbox2.0] merge gpugraph to develop (#49946)

* add set slot_num for psgpuwrapper (#177)

* add set slot_num_for_pull_feature for psgpuwrapper

* Add get_epoch_finish python interface (#182)

* add get_epoch_finish interface

* add return

* delete return

* add unzip op (#183)

* fix miss key for error dataset (#186)

* fix miss key for error dataset

* fix miss key for error dataset
Co-authored-by: yangjunchao <yangjunchao@baidu.com>

* add excluded_train_pair and infer_node_type (#187)

* support return of degree (#188)

* fix task stuck in barrier (#189)
Co-authored-by: yangjunchao <yangjunchao@baidu.com>

* check node/feature format when loading (#190)

* check node&feature format when loading

* check node&feature format when loading (2)

* degrade log (#191)

* [PGLBOX]fix conflict

* [PGLBOX]fix conflict

* [PGLBOX]replace LodTensor with phi::DenseTensor

* [PGLBOX]fix gpu_primitives.h include path

* [PGLBOX]from platform::PADDLE_CUDA_NUM_THREADS to phi::PADDLE_CUDA_NUM_THREADS

* [PGLBOX]fix unzip example code

* [PGLBOX]fix unzip example code

* [PGLBOX]fix unzip example code

* [PGLBOX]fix unzip example code

* [PGLBOX]fix unzip ut

* [PGLBOX]fix unzip ut

* [PGLBOX]fix code style

* [PGLBOX]fix code style

* [PGLBOX]fix code style

* fix code style

* fix code style

* fix unzip ut

* fix unzip ut

* fix unzip ut

* fix unzip

* fix code style

* add ut

* add c++ ut & fix train_mode_ set

* fix load into memory

* fix c++ ut

* fix c++ ut

* fix c++ ut

* fix c++ ut

* fix code style

* fix collective

* fix unzip_op.cc

* fix barrier

* fix code style

* fix barrier

* fix barrier

* fix code style

* fix unzip

* add unzip.py

* add unzip.py

* fix unzip.py

---------
Co-authored-by: chao9527 <33347532+chao9527@users.noreply.github.com>
Co-authored-by: Siming Dai <908660116@qq.com>
Co-authored-by: huwei02 <53012141+huwei02@users.noreply.github.com>
Co-authored-by: yangjunchao <yangjunchao@baidu.com>
Parent 382e9a06
@@ -306,10 +306,10 @@ int32_t PsLocalClient::Initialize() {
                                      size_t threshold) {
   auto* table_ptr = GetTable(table_id);
   std::pair<int64_t, int64_t> ret = table_ptr->PrintTableStat();
-  VLOG(0) << "table id: " << table_id << ", feasign size: " << ret.first
+  VLOG(1) << "table id: " << table_id << ", feasign size: " << ret.first
           << ", mf size: " << ret.second;
   if (ret.first > (int64_t)threshold) {
-    VLOG(0) << "run cache table";
+    VLOG(1) << "run cache table";
     table_ptr->CacheTable(pass_id);
   }
   return done();
......
@@ -124,11 +124,13 @@ paddle::framework::GpuPsCommGraphFea GraphTable::make_gpu_ps_graph_fea(
   }
   for (size_t i = 0; i < tasks.size(); i++) tasks[i].get();
-  std::stringstream ss;
-  for (int k = 0; k < slot_num; ++k) {
-    ss << slot_feature_num_map_[k] << " ";
+  if (FLAGS_v > 0) {
+    std::stringstream ss;
+    for (int k = 0; k < slot_num; ++k) {
+      ss << slot_feature_num_map_[k] << " ";
+    }
+    VLOG(1) << "slot_feature_num_map: " << ss.str();
   }
-  VLOG(0) << "slot_feature_num_map: " << ss.str();
   tasks.clear();
@@ -137,7 +139,7 @@ paddle::framework::GpuPsCommGraphFea GraphTable::make_gpu_ps_graph_fea(
   for (size_t i = 0; i < shard_num; i++) {
     tot_len += feature_array[i].size();
   }
-  VLOG(0) << "Loaded feature table on cpu, feature_list_size[" << tot_len
+  VLOG(1) << "Loaded feature table on cpu, feature_list_size[" << tot_len
           << "] node_ids_size[" << node_ids.size() << "]";
   res.init_on_cpu(tot_len, (unsigned int)node_ids.size(), slot_num);
   unsigned int offset = 0, ind = 0;
@@ -494,6 +496,8 @@ void GraphTable::export_partition_files(int idx, std::string file_path) {
   for (size_t i = 0; i < tasks.size(); i++) tasks[i].get();
 }
+#endif
+
 void GraphTable::clear_graph(int idx) {
   for (auto p : edge_shards[idx]) {
     p->clear();
@@ -506,6 +510,7 @@ void GraphTable::clear_graph(int idx) {
   }
 }
+#ifdef PADDLE_WITH_HETERPS
 void GraphTable::release_graph() {
   // Before releasing graph, prepare for sampling ids and embedding keys.
   build_graph_type_keys();
@@ -545,6 +550,7 @@ void GraphTable::release_graph_node() {
     feature_shrink_to_fit();
   }
 }
+#endif
 void GraphTable::clear_edge_shard() {
   VLOG(0) << "begin clear edge shard";
@@ -590,6 +596,7 @@ void GraphTable::clear_feature_shard() {
   VLOG(0) << "finish clear feature shard";
 }
+#ifdef PADDLE_WITH_HETERPS
 void GraphTable::feature_shrink_to_fit() {
   std::vector<std::future<int>> tasks;
   for (auto &type_shards : feature_shards) {
@@ -619,6 +626,8 @@ void GraphTable::merge_feature_shard() {
   feature_shards.resize(1);
 }
+#endif
+
 void GraphTable::clear_graph() {
   VLOG(0) << "begin clear_graph";
   clear_edge_shard();
@@ -626,6 +635,7 @@ void GraphTable::clear_graph() {
   VLOG(0) << "finish clear_graph";
 }
+#ifdef PADDLE_WITH_HETERPS
 int32_t GraphTable::load_next_partition(int idx) {
   if (next_partition >= static_cast<int>(partitions[idx].size())) {
     VLOG(0) << "partition iteration is done";
@@ -1203,11 +1213,21 @@ int32_t GraphTable::Load(const std::string &path, const std::string &param) {
   if (load_edge) {
     bool reverse_edge = (param[1] == '<');
     std::string edge_type = param.substr(2);
-    return this->load_edges(path, reverse_edge, edge_type);
+    int ret = this->load_edges(path, reverse_edge, edge_type);
+    if (ret != 0) {
+      VLOG(0) << "Fail to load edges, path[" << path << "] edge_type["
+              << edge_type << "]";
+      return -1;
+    }
   }
   if (load_node) {
     std::string node_type = param.substr(1);
-    return this->load_nodes(path, node_type);
+    int ret = this->load_nodes(path, node_type);
+    if (ret != 0) {
+      VLOG(0) << "Fail to load nodes, path[" << path << "] node_type["
+              << node_type << "]";
+      return -1;
+    }
   }
   return 0;
 }
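A minimal caller sketch of the error contract this hunk establishes (the path and the "e>u2item" parameter string are invented examples; per the parsing above, param starts with 'e' or 'n', uses '<' to request reversed edges, and the type name follows):

  // Load() now returns -1 if any edge or node file fails to load or parse,
  // instead of propagating the raw loader result.
  if (table->Load("/data/graph/edges/part-0", "e>u2item") != 0) {
    LOG(ERROR) << "graph load failed, aborting this pass";
  }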
@@ -1319,10 +1339,19 @@ int32_t GraphTable::parse_node_and_load(std::string ntype2files,
     return 0;
   }
   if (FLAGS_graph_load_in_parallel) {
-    this->load_nodes(npath_str, "");
+    int ret = this->load_nodes(npath_str, "");
+    if (ret != 0) {
+      VLOG(0) << "Fail to load nodes, path[" << npath << "]";
+      return -1;
+    }
   } else {
     for (size_t j = 0; j < ntypes.size(); j++) {
-      this->load_nodes(npath_str, ntypes[j]);
+      int ret = this->load_nodes(npath_str, ntypes[j]);
+      if (ret != 0) {
+        VLOG(0) << "Fail to load nodes, path[" << npath << "], ntypes["
+                << ntypes[j] << "]";
+        return -1;
+      }
     }
   }
   return 0;
@@ -1397,10 +1426,19 @@ int32_t GraphTable::load_node_and_edge_file(std::string etype2files,
       return 0;
     }
     if (FLAGS_graph_load_in_parallel) {
-      this->load_nodes(npath_str, "");
+      int ret = this->load_nodes(npath_str, "");
+      if (ret != 0) {
+        VLOG(0) << "Fail to load nodes, path[" << npath_str << "]";
+        return -1;
+      }
     } else {
       for (size_t j = 0; j < ntypes.size(); j++) {
-        this->load_nodes(npath_str, ntypes[j]);
+        int ret = this->load_nodes(npath_str, ntypes[j]);
+        if (ret != 0) {
+          VLOG(0) << "Fail to load nodes, path[" << npath_str
+                  << "], ntypes[" << ntypes[j] << "]";
+          return -1;
+        }
       }
     }
   }
@@ -1408,6 +1446,10 @@ int32_t GraphTable::load_node_and_edge_file(std::string etype2files,
     }));
   }
   for (size_t i = 0; i < tasks.size(); i++) tasks[i].get();
+  if (is_parse_node_fail_) {
+    VLOG(0) << "Fail to load node_and_edge_file";
+    return -1;
+  }
   return 0;
 }
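The cross-thread failure signal here is the new is_parse_node_fail_ member: parser tasks set it, and the orchestrator checks it once after joining all futures. A self-contained sketch of the same pattern (this sketch uses std::atomic<bool>; the patch itself uses a plain bool member):

  #include <atomic>
  #include <future>
  #include <vector>

  // Each loader returns 0 on success; any failure raises the shared flag,
  // which is inspected only after every task has been joined.
  int run_loaders(std::vector<std::future<int>>* tasks,
                  std::atomic<bool>* parse_failed) {
    for (auto& t : *tasks) {
      if (t.get() != 0) parse_failed->store(true);
    }
    return parse_failed->load() ? -1 : 0;
  }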
@@ -1499,7 +1541,12 @@ std::pair<uint64_t, uint64_t> GraphTable::parse_node_file(
         node->set_feature_size(feat_name[idx].size());
         for (int i = 1; i < num; ++i) {
           auto &v = vals[i];
-          parse_feature(idx, v.ptr, v.len, node);
+          int ret = parse_feature(idx, v.ptr, v.len, node);
+          if (ret != 0) {
+            VLOG(0) << "Fail to parse feature, node_id[" << id << "]";
+            is_parse_node_fail_ = true;
+            return {0, 0};
+          }
         }
       }
       local_valid_count++;
@@ -1551,7 +1598,12 @@ std::pair<uint64_t, uint64_t> GraphTable::parse_node_file(
     if (node != NULL) {
       for (int i = 2; i < num; ++i) {
         auto &v = vals[i];
-        parse_feature(idx, v.ptr, v.len, node);
+        int ret = parse_feature(idx, v.ptr, v.len, node);
+        if (ret != 0) {
+          VLOG(0) << "Fail to parse feature, node_id[" << id << "]";
+          is_parse_node_fail_ = true;
+          return {0, 0};
+        }
       }
     }
     local_valid_count++;
@@ -1603,6 +1655,11 @@ int32_t GraphTable::load_nodes(const std::string &path, std::string node_type) {
       valid_count += res.second;
     }
   }
+  if (is_parse_node_fail_) {
+    VLOG(0) << "Fail to load nodes, path[" << paths[0] << ".."
+            << paths[paths.size() - 1] << "] node_type[" << node_type << "]";
+    return -1;
+  }
   VLOG(0) << valid_count << "/" << count << " nodes in node_type[ " << node_type
           << "] are loaded successfully!";
@@ -2103,28 +2160,48 @@ int GraphTable::parse_feature(int idx,
     if (dtype == "feasign") {
       //      string_vector_2_string(fields.begin() + 1, fields.end(), ' ',
       //      fea_ptr);
-      FeatureNode::parse_value_to_bytes<uint64_t>(
+      int ret = FeatureNode::parse_value_to_bytes<uint64_t>(
           fea_fields.begin(), fea_fields.end(), fea_ptr);
+      if (ret != 0) {
+        VLOG(0) << "Fail to parse value";
+        return -1;
+      }
       return 0;
     } else if (dtype == "string") {
       string_vector_2_string(
           fea_fields.begin(), fea_fields.end(), ' ', fea_ptr);
       return 0;
     } else if (dtype == "float32") {
-      FeatureNode::parse_value_to_bytes<float>(
+      int ret = FeatureNode::parse_value_to_bytes<float>(
           fea_fields.begin(), fea_fields.end(), fea_ptr);
+      if (ret != 0) {
+        VLOG(0) << "Fail to parse value";
+        return -1;
+      }
       return 0;
     } else if (dtype == "float64") {
-      FeatureNode::parse_value_to_bytes<double>(
+      int ret = FeatureNode::parse_value_to_bytes<double>(
          fea_fields.begin(), fea_fields.end(), fea_ptr);
+      if (ret != 0) {
+        VLOG(0) << "Fail to parse value";
+        return -1;
+      }
       return 0;
     } else if (dtype == "int32") {
-      FeatureNode::parse_value_to_bytes<int32_t>(
+      int ret = FeatureNode::parse_value_to_bytes<int32_t>(
          fea_fields.begin(), fea_fields.end(), fea_ptr);
+      if (ret != 0) {
+        VLOG(0) << "Fail to parse value";
+        return -1;
+      }
       return 0;
     } else if (dtype == "int64") {
-      FeatureNode::parse_value_to_bytes<uint64_t>(
+      int ret = FeatureNode::parse_value_to_bytes<uint64_t>(
          fea_fields.begin(), fea_fields.end(), fea_ptr);
+      if (ret != 0) {
+        VLOG(0) << "Fail to parse value";
+        return -1;
+      }
       return 0;
     }
   } else {
@@ -2132,7 +2209,7 @@ int GraphTable::parse_feature(int idx,
             << idx << "] feat_id_map_size[" << feat_id_map.size() << "]";
   }
-  return -1;
+  return 0;
 }
 // thread safe shard vector merge
 class MergeShardVector {
......
@@ -789,6 +789,7 @@ class GraphTable : public Table {
   std::string slot_feature_separator_ = std::string(" ");
   std::string feature_separator_ = std::string(" ");
   std::vector<int> slot_feature_num_map_;
+  bool is_parse_node_fail_ = false;
 };
 /*
......
@@ -255,7 +255,7 @@ class FeatureNode : public Node {
   }
   template <typename T>
-  static void parse_value_to_bytes(
+  static int parse_value_to_bytes(
       std::vector<paddle::string::str_ptr>::iterator feat_str_begin,
       std::vector<paddle::string::str_ptr>::iterator feat_str_end,
       std::string *output) {
@@ -269,8 +269,14 @@ class FeatureNode : public Node {
     thread_local paddle::string::str_ptr_stream ss;
     for (size_t i = 0; i < feat_str_size; i++) {
       ss.reset(*(feat_str_begin + i));
+      int len = ss.end - ss.ptr;
+      char *old_ptr = ss.ptr;
       ss >> fea_ptrs[i];
+      if (ss.ptr - old_ptr != len) {
+        return -1;
+      }
     }
+    return 0;
   }
  protected:
......
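The added length check rejects tokens that the stream only partially consumes (for example "12ab" in a numeric slot), which previously slipped through silently truncated. A standalone illustration of the same idea with std::istringstream (a hypothetical helper, not part of the patch):

  #include <cstdint>
  #include <sstream>
  #include <string>

  // Returns 0 only if the whole token parsed as a number, mirroring the
  // "pointer advanced exactly len bytes" test added above.
  int parse_u64_strict(const std::string& token, uint64_t* out) {
    std::istringstream ss(token);
    ss >> *out;
    return (!ss.fail() && ss.eof()) ? 0 : -1;
  }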
@@ -866,7 +866,8 @@ if(WITH_DISTRIBUTE)
         fleet
         heter_server
         brpc
-        fleet_executor)
+        fleet_executor
+        flags)
   set(DISTRIBUTE_COMPILE_FLAGS
       "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor -Wno-error=parentheses"
   )
......
@@ -14,6 +14,11 @@
 #pragma once
+#if defined _WIN32 || defined __APPLE__
+#else
+#define __LINUX__
+#endif
+
 #ifdef __LINUX__
 #include <pthread.h>
 #include <semaphore.h>
@@ -48,7 +53,7 @@ class Barrier {
   void wait() {
 #ifdef __LINUX__
     int err = pthread_barrier_wait(&_barrier);
-    if (err != 0 && err != PTHREAD_BARRIER_SERIAL_THREAD)) {
+    if (err != 0 && err != PTHREAD_BARRIER_SERIAL_THREAD) {
       CHECK_EQ(1, 0);
     }
 #endif
......
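For context on the fixed condition (the old line had an unbalanced parenthesis): pthread_barrier_wait() returns PTHREAD_BARRIER_SERIAL_THREAD to exactly one of the waiting threads, so that thread can perform one-time cleanup, and 0 to all others; any other value is an error. A minimal usage sketch:

  #include <pthread.h>
  #include <cstdio>

  // Each participant calls this after the barrier is initialized with
  // pthread_barrier_init(&barrier, nullptr, n_participants).
  void barrier_wait_checked(pthread_barrier_t* barrier) {
    int err = pthread_barrier_wait(barrier);
    if (err != 0 && err != PTHREAD_BARRIER_SERIAL_THREAD) {
      std::fprintf(stderr, "pthread_barrier_wait failed: %d\n", err);
    }
  }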
@@ -2112,15 +2112,24 @@ void SlotRecordInMemoryDataFeed::Init(const DataFeedDesc& data_feed_desc) {
 #if defined(PADDLE_WITH_GPU_GRAPH) && defined(PADDLE_WITH_HETERPS)
   gpu_graph_data_generator_.SetConfig(data_feed_desc);
 #endif
+  if (gpu_graph_mode_) {
+    train_mode_ = true;
+  } else {
+    train_mode_ = data_feed_desc.graph_config().gpu_graph_training();
+  }
 }
 #if defined(PADDLE_WITH_GPU_GRAPH) && defined(PADDLE_WITH_HETERPS)
 void SlotRecordInMemoryDataFeed::InitGraphResource() {
+#if defined(PADDLE_WITH_GPU_GRAPH) && defined(PADDLE_WITH_HETERPS)
   gpu_graph_data_generator_.AllocResource(thread_id_, feed_vec_);
+#endif
 }
 void SlotRecordInMemoryDataFeed::InitGraphTrainResource() {
+#if defined(PADDLE_WITH_GPU_GRAPH) && defined(PADDLE_WITH_HETERPS)
   gpu_graph_data_generator_.AllocTrainResource(thread_id_);
+#endif
 }
 #endif
......
This diff is collapsed.
@@ -914,7 +914,9 @@ class GraphDataGenerator {
   int FillFeatureBuf(std::shared_ptr<phi::Allocation> d_walk,
                      std::shared_ptr<phi::Allocation> d_feature);
   void FillOneStep(uint64_t* start_ids,
+                   int etype_id,
                    uint64_t* walk,
+                   uint8_t* walk_ntype,
                    int len,
                    NeighborSampleResult& sample_res,  // NOLINT
                    int cur_degree,
@@ -966,6 +968,7 @@ class GraphDataGenerator {
                        int len,
                        int* uniq_len,
                        std::shared_ptr<phi::Allocation>& inverse);  // NOLINT
+  std::shared_ptr<phi::Allocation> GetNodeDegree(uint64_t* node_ids, int len);
   int InsertTable(const uint64_t* d_keys,
                   uint64_t len,
                   std::shared_ptr<phi::Allocation> d_uniq_node_num);
@@ -988,6 +991,7 @@ class GraphDataGenerator {
   int* index_tensor_ptr_;
   int64_t* show_tensor_ptr_;
   int64_t* clk_tensor_ptr_;
+  int* degree_tensor_ptr_;
   cudaStream_t train_stream_;
   cudaStream_t sample_stream_;
@@ -999,6 +1003,8 @@ class GraphDataGenerator {
   std::shared_ptr<phi::Allocation> d_train_metapath_keys_;
   std::shared_ptr<phi::Allocation> d_walk_;
+  std::shared_ptr<phi::Allocation> d_walk_ntype_;
+  std::shared_ptr<phi::Allocation> d_excluded_train_pair_;
   std::shared_ptr<phi::Allocation> d_feature_list_;
   std::shared_ptr<phi::Allocation> d_feature_;
   std::shared_ptr<phi::Allocation> d_len_per_row_;
@@ -1033,11 +1039,13 @@ class GraphDataGenerator {
   // sage mode batch data
   std::vector<std::shared_ptr<phi::Allocation>> inverse_vec_;
   std::vector<std::shared_ptr<phi::Allocation>> final_sage_nodes_vec_;
+  std::vector<std::shared_ptr<phi::Allocation>> node_degree_vec_;
   std::vector<int> uniq_instance_vec_;
   std::vector<int> total_instance_vec_;
   std::vector<std::vector<std::shared_ptr<phi::Allocation>>> graph_edges_vec_;
   std::vector<std::vector<std::vector<int>>> edges_split_num_vec_;
+  int excluded_train_pair_len_;
   int64_t reindex_table_size_;
   int sage_batch_count_;
   int sage_batch_num_;
@@ -1067,6 +1075,9 @@ class GraphDataGenerator {
   int total_row_;
   size_t infer_node_start_;
   size_t infer_node_end_;
+  std::set<int> infer_node_type_index_set_;
+  std::string infer_node_type_;
+  bool get_degree_;
 };
 class DataFeed {
......
@@ -42,6 +42,9 @@ message GraphConfig {
   optional string samples = 12;
   optional int64 train_table_cap = 13 [ default = 80000 ];
   optional int64 infer_table_cap = 14 [ default = 80000 ];
+  optional string excluded_train_pair = 15;
+  optional string infer_node_type = 16;
+  optional bool get_degree = 17 [ default = false ];
 }
 message DataFeedDesc {
......
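A hedged example of wiring the three new fields into the text-format graph_config string that the tests in this PR build by hand; the type names "u", "t" and the pair list "u2t;t2u" are placeholders, and the pair syntax (pairs separated by ';', endpoint types joined by '2') follows the parsing in GraphGpuWrapper::init_conf:

  std::string str;
  str += "graph_config {\n";
  str += "gpu_graph_training: true\n";
  str += "excluded_train_pair: \"u2t;t2u\"\n";  // pairs to skip when training
  str += "infer_node_type: \"u\"\n";            // restrict inference to one type
  str += "get_degree: true\n";                  // also emit node degrees
  str += "}";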
@@ -457,7 +457,7 @@ void DatasetImpl<T>::LoadIntoMemory() {
   timeline.Start();
   std::vector<std::thread> load_threads;
   if (gpu_graph_mode_) {
-    VLOG(0) << "in gpu_graph_mode";
+    VLOG(1) << "in gpu_graph_mode";
 #if defined(PADDLE_WITH_GPU_GRAPH) && defined(PADDLE_WITH_HETERPS)
     for (size_t i = 0; i < readers_.size(); i++) {
       readers_[i]->SetGpuGraphMode(gpu_graph_mode_);
@@ -470,7 +470,6 @@ void DatasetImpl<T>::LoadIntoMemory() {
       readers_[i]->ResetPathNum();
       readers_[i]->ResetEpochFinish();
     }
-    return;
   }
   for (int64_t i = 0; i < thread_num_; ++i) {
......
@@ -30,7 +30,7 @@ limitations under the License. */
 #if defined(PADDLE_WITH_PSCORE)
 #include "paddle/fluid/distributed/ps/wrapper/fleet.h"
 #endif
+#include "paddle/fluid/framework/barrier.h"
 #include "paddle/fluid/framework/data_feed.h"
 #include "paddle/fluid/framework/executor_gc_helper.h"
 #include "paddle/fluid/framework/heter_util.h"
@@ -212,6 +212,9 @@ class DeviceWorker {
   virtual void SetDeviceContext(platform::DeviceContext* dev_ctx) {
     dev_ctx_ = dev_ctx;
   }
+
+  virtual void SetThreadNum(int thread_num) { thread_num_ = thread_num; }
+
   virtual Scope* GetThreadScope() { return thread_scope_; }
   DataFeed* device_reader_ = nullptr;
@@ -290,7 +293,8 @@ class HogwildWorker : public CPUWorkerBase {
   HogwildWorkerParameter param_;
   std::vector<std::string> skip_ops_;
   std::map<std::string, int> stat_var_name_map_;
-  static std::atomic<uint64_t> worker_num_stat_;
+  static std::atomic<bool> quit_flag_;
+  // static bool quit_flag_2;
 };
 class DownpourWorker : public HogwildWorker {
@@ -724,7 +728,7 @@ class HeterSectionWorker : public DeviceWorker {
   const platform::Place& place() const { return place_; }
   void SetDeviceIndex(int tid) override { thread_id_ = tid; }
-  void SetThreadNum(int thread_num) { thread_num_ = thread_num; }
+  // void SetThreadNum(int thread_num) { thread_num_ = thread_num; }
   void SetMicrobatchNum(int num) { num_microbatches_ = num; }
   void SetPipelineStageNum(int num) { num_pipeline_stages_ = num; }
   void SetPipelineStage(int stage) { pipeline_stage_ = stage; }
@@ -767,7 +771,7 @@ class HeterSectionWorker : public DeviceWorker {
  protected:
   int trainer_id_;
   int trainers_;
-  int thread_num_;
+  // int thread_num_;
   int thread_id_;
   int num_microbatches_;
   int num_pipeline_stages_;
......
@@ -14,6 +14,7 @@
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/trainer.h"
+#include "paddle/phi/core/flags.h"
 #ifdef PADDLE_WITH_GLOO
 #include "paddle/fluid/framework/fleet/gloo_wrapper.h"
 #endif
@@ -21,6 +22,7 @@
 #else
 #define _LINUX
 #endif
+DECLARE_bool(enable_exit_when_partial_worker);
 namespace paddle {
 namespace framework {
@@ -82,5 +84,92 @@ TEST(DisMultiTrainerTest, testforgpugraph) {
 #endif
 }
+
+TEST(DisMultiTrainerTest, test2) {
+#ifdef _LINUX
+  FLAGS_enable_exit_when_partial_worker = true;
+  std::shared_ptr<MultiTrainer> tmp1 = std::make_shared<MultiTrainer>();
+  TrainerDesc t;
+  t.set_class_name("MultiTrainer");
+  t.set_device_worker_name("HogwildWorker");
+  t.set_thread_num(1);
+  auto* m = t.mutable_downpour_param()->add_program_config();
+  m->set_program_id("123");
+  std::string str;
+  //  str += "name: \"MultiSlotDataFeed\"\nbatch_size: 2\nmulti_slot_desc {\n";
+  str +=
+      "name: \"SlotRecordInMemoryDataFeed\"\nbatch_size: 2\nmulti_slot_desc "
+      "{\n";
+  str += "slots {\nname: \"words\"\ntype: \"uint64\"\nis_dense: false\n";
+  str += "is_used: true\n}\nslots {\nname: \"label\"\ntype: \"uint64\"\n";
+  str += "is_dense: false\nis_used: true\n}\n}\n";
+  str += "graph_config {\n";
+  str += "gpu_graph_training: true\n}";
+  //  std::shared_ptr<MultiSlotDataset> dataset =
+  //      std::make_shared<MultiSlotDataset>();
+  std::shared_ptr<SlotRecordDataset> dataset =
+      std::make_shared<SlotRecordDataset>();
+  dataset->SetFileList(std::vector<std::string>());
+  dataset->SetThreadNum(1);
+  dataset->SetTrainerNum(1);
+  dataset->SetDataFeedDesc(str);
+  dataset->CreateChannel();
+  dataset->CreateReaders();
+  Scope root_scope;
+  tmp1->SetScope(&root_scope);
+  tmp1->Initialize(t, dataset.get());
+  tmp1->SetDebug(false);
+  ProgramDesc p;
+  tmp1->InitOtherEnv(p);
+  tmp1->Run();
+  tmp1->Finalize();
+#endif
+}
+
+TEST(DisMultiTrainerTest, test3) {
+#ifdef _LINUX
+  FLAGS_enable_exit_when_partial_worker = true;
+  std::shared_ptr<MultiTrainer> tmp1 = std::make_shared<MultiTrainer>();
+  TrainerDesc t;
+  t.set_class_name("MultiTrainer");
+  t.set_device_worker_name("HogwildWorker");
+  t.set_thread_num(1);
+  auto* m = t.mutable_downpour_param()->add_program_config();
+  m->set_program_id("123");
+  std::string str;
+  //  str += "name: \"MultiSlotDataFeed\"\nbatch_size: 2\nmulti_slot_desc {\n";
+  str +=
+      "name: \"SlotRecordInMemoryDataFeed\"\nbatch_size: 2\nmulti_slot_desc "
+      "{\n";
+  str += "slots {\nname: \"words\"\ntype: \"uint64\"\nis_dense: false\n";
+  str += "is_used: true\n}\nslots {\nname: \"label\"\ntype: \"uint64\"\n";
+  str += "is_dense: false\nis_used: true\n}\n}\n";
+  str += "graph_config {\n";
+  str += "gpu_graph_training: true\n}";
+  //  std::shared_ptr<MultiSlotDataset> dataset =
+  //      std::make_shared<MultiSlotDataset>();
+  std::shared_ptr<SlotRecordDataset> dataset =
+      std::make_shared<SlotRecordDataset>();
+  dataset->SetFileList(std::vector<std::string>());
+  dataset->SetThreadNum(1);
+  dataset->SetTrainerNum(1);
+  dataset->SetDataFeedDesc(str);
+  dataset->CreateChannel();
+  dataset->SetGpuGraphMode(true);
+  dataset->CreateReaders();
+  auto readers = dataset->GetReaders();
+  readers[0]->SetGpuGraphMode(true);
+  Scope root_scope;
+  tmp1->SetScope(&root_scope);
+  tmp1->Initialize(t, dataset.get());
+  tmp1->SetDebug(true);
+  ProgramDesc p;
+  tmp1->InitOtherEnv(p);
+  tmp1->Run();
+  tmp1->Finalize();
+#endif
+}
 }  // namespace framework
 }  // namespace paddle
@@ -109,6 +109,11 @@ class GpuPsGraphTable
       std::vector<std::shared_ptr<phi::Allocation>> edge_type_graphs);
   std::vector<std::shared_ptr<phi::Allocation>> get_edge_type_graph(
       int gpu_id, int edge_type_len);
+  void get_node_degree(int gpu_id,
+                       int edge_idx,
+                       uint64_t *key,
+                       int len,
+                       std::shared_ptr<phi::Allocation> node_degree);
   int get_feature_of_nodes(int gpu_id,
                            uint64_t *d_walk,
                            uint64_t *d_offset,
@@ -146,6 +151,8 @@ class GpuPsGraphTable
                                 uint32_t *actual_feature_size,
                                 uint64_t *feature_list,
                                 uint8_t *slot_list);
+  void move_degree_to_source_gpu(
+      int gpu_id, int gpu_num, int *h_left, int *h_right, int *node_degree);
   void move_result_to_source_gpu_all_edge_type(int gpu_id,
                                                int gpu_num,
                                                int sample_size,
......
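A hedged usage sketch for the new degree query, based on the implementation later in this diff: the caller provides one int per (edge type, key) pair, and each call fills the slice starting at edge_idx * len; table, place, stream, gpu_id, d_keys, len, and edge_type_len are assumed context:

  auto node_degree =
      memory::Alloc(place,
                    edge_type_len * len * sizeof(int),
                    phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
  for (int edge_idx = 0; edge_idx < edge_type_len; ++edge_idx) {
    table->get_node_degree(gpu_id, edge_idx, d_keys, len, node_degree);
  }
  // node_degree now holds edge_type_len blocks of len degrees, each block
  // ordered like the caller's original d_keys.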
@@ -155,6 +155,15 @@ __global__ void get_features_kernel(GpuPsCommGraphFea graph,
   }
 }
+__global__ void get_node_degree_kernel(GpuPsNodeInfo* node_info_list,
+                                       int* node_degree,
+                                       int n) {
+  const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < n) {
+    node_degree[i] = node_info_list[i].neighbor_size;
+  }
+}
+
 template <int WARP_SIZE, int BLOCK_WARPS, int TILE_SIZE>
 __global__ void neighbor_sample_kernel_walking(GpuPsCommGraph graph,
                                                GpuPsNodeInfo* node_info_list,
@@ -455,6 +464,41 @@ void GpuPsGraphTable::move_result_to_source_gpu(int start_index,
   }
 }
+void GpuPsGraphTable::move_degree_to_source_gpu(
+    int start_index, int gpu_num, int* h_left, int* h_right, int* node_degree) {
+  int shard_len[gpu_num];
+  for (int i = 0; i < gpu_num; i++) {
+    if (h_left[i] == -1 || h_right[i] == -1) {
+      continue;
+    }
+    shard_len[i] = h_right[i] - h_left[i] + 1;
+    int cur_step = (int)path_[start_index][i].nodes_.size() - 1;
+    for (int j = cur_step; j > 0; j--) {
+      CUDA_CHECK(
+          cudaMemcpyAsync(path_[start_index][i].nodes_[j - 1].val_storage,
+                          path_[start_index][i].nodes_[j].val_storage,
+                          path_[start_index][i].nodes_[j - 1].val_bytes_len,
+                          cudaMemcpyDefault,
+                          path_[start_index][i].nodes_[j - 1].out_stream));
+    }
+    auto& node = path_[start_index][i].nodes_.front();
+    CUDA_CHECK(
+        cudaMemcpyAsync(reinterpret_cast<char*>(node_degree + h_left[i]),
+                        node.val_storage + sizeof(int64_t) * shard_len[i],
+                        sizeof(int) * shard_len[i],
+                        cudaMemcpyDefault,
+                        node.out_stream));
+  }
+  for (int i = 0; i < gpu_num; ++i) {
+    if (h_left[i] == -1 || h_right[i] == -1) {
+      continue;
+    }
+    auto& node = path_[start_index][i].nodes_.front();
+    CUDA_CHECK(cudaStreamSynchronize(node.out_stream));
+  }
+}
+
 void GpuPsGraphTable::move_result_to_source_gpu_all_edge_type(
     int start_index,
     int gpu_num,
@@ -570,6 +614,16 @@ __global__ void fill_dvalues(uint64_t* d_shard_vals,
   }
 }
+__global__ void fill_dvalues(int* d_shard_degree,
+                             int* d_degree,
+                             int* idx,
+                             int len) {
+  const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) {
+    d_degree[idx[i]] = d_shard_degree[i];
+  }
+}
+
 __global__ void fill_dvalues_with_edge_type(uint64_t* d_shard_vals,
                                             uint64_t* d_vals,
                                             int* d_shard_actual_sample_size,
@@ -756,7 +810,7 @@ void GpuPsGraphTable::build_graph_fea_on_single_gpu(const GpuPsCommGraphFea& g,
   } else {
     gpu_graph_fea_list_[offset].feature_size = 0;
   }
-  VLOG(0) << "gpu node_feature info card :" << gpu_id << " ,node_size is "
+  VLOG(1) << "gpu node_feature info card :" << gpu_id << " ,node_size is "
           << gpu_graph_fea_list_[offset].node_size << ", feature_size is "
           << gpu_graph_fea_list_[offset].feature_size;
 }
@@ -1340,6 +1394,7 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2(
   return result;
 }
+// only for graphsage
 NeighborSampleResultV2 GpuPsGraphTable::graph_neighbor_sample_all_edge_type(
     int gpu_id,
     int edge_type_len,
@@ -1537,6 +1592,125 @@ NeighborSampleResultV2 GpuPsGraphTable::graph_neighbor_sample_all_edge_type(
   return result;
 }
+void GpuPsGraphTable::get_node_degree(
+    int gpu_id,
+    int edge_idx,
+    uint64_t* key,
+    int len,
+    std::shared_ptr<phi::Allocation> node_degree) {
+  int* node_degree_ptr =
+      reinterpret_cast<int*>(node_degree->ptr()) + edge_idx * len;
+  int total_gpu = resource_->total_device();
+  platform::CUDAPlace place = platform::CUDAPlace(resource_->dev_id(gpu_id));
+  platform::CUDADeviceGuard guard(resource_->dev_id(gpu_id));
+  auto stream = resource_->local_stream(gpu_id, 0);
+  int grid_size = (len - 1) / block_size_ + 1;
+  int h_left[total_gpu];   // NOLINT
+  int h_right[total_gpu];  // NOLINT
+  auto d_left =
+      memory::Alloc(place,
+                    total_gpu * sizeof(int),
+                    phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
+  auto d_right =
+      memory::Alloc(place,
+                    total_gpu * sizeof(int),
+                    phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
+  int* d_left_ptr = reinterpret_cast<int*>(d_left->ptr());
+  int* d_right_ptr = reinterpret_cast<int*>(d_right->ptr());
+  CUDA_CHECK(cudaMemsetAsync(d_left_ptr, -1, total_gpu * sizeof(int), stream));
+  CUDA_CHECK(cudaMemsetAsync(d_right_ptr, -1, total_gpu * sizeof(int), stream));
+  auto d_idx =
+      memory::Alloc(place,
+                    len * sizeof(int),
+                    phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
+  int* d_idx_ptr = reinterpret_cast<int*>(d_idx->ptr());
+  auto d_shard_keys =
+      memory::Alloc(place,
+                    len * sizeof(uint64_t),
+                    phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
+  uint64_t* d_shard_keys_ptr = reinterpret_cast<uint64_t*>(d_shard_keys->ptr());
+  auto d_shard_degree =
+      memory::Alloc(place,
+                    len * sizeof(int),
+                    phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
+  int* d_shard_degree_ptr = reinterpret_cast<int*>(d_shard_degree->ptr());
+  split_input_to_shard(
+      (uint64_t*)(key), d_idx_ptr, len, d_left_ptr, d_right_ptr, gpu_id);
+  heter_comm_kernel_->fill_shard_key(
+      d_shard_keys_ptr, key, d_idx_ptr, len, stream);
+  CUDA_CHECK(cudaStreamSynchronize(stream));
+  CUDA_CHECK(cudaMemcpyAsync(h_left,
+                             d_left_ptr,
+                             total_gpu * sizeof(int),
+                             cudaMemcpyDeviceToHost,
+                             stream));
+  CUDA_CHECK(cudaMemcpyAsync(h_right,
+                             d_right_ptr,
+                             total_gpu * sizeof(int),
+                             cudaMemcpyDeviceToHost,
+                             stream));
+  CUDA_CHECK(cudaStreamSynchronize(stream));
+  device_mutex_[gpu_id]->lock();
+  for (int i = 0; i < total_gpu; ++i) {
+    int shard_len = h_left[i] == -1 ? 0 : h_right[i] - h_left[i] + 1;
+    if (shard_len == 0) {
+      continue;
+    }
+    create_storage(
+        gpu_id,
+        i,
+        shard_len * sizeof(uint64_t),
+        shard_len * sizeof(uint64_t) + sizeof(int) * shard_len + shard_len % 2);
+  }
+  walk_to_dest(
+      gpu_id, total_gpu, h_left, h_right, (uint64_t*)(d_shard_keys_ptr), NULL);
+  for (int i = 0; i < total_gpu; ++i) {
+    if (h_left[i] == -1) {
+      continue;
+    }
+    int shard_len = h_left[i] == -1 ? 0 : h_right[i] - h_left[i] + 1;
+    auto& node = path_[gpu_id][i].nodes_.back();
+    CUDA_CHECK(cudaMemsetAsync(
+        node.val_storage, 0, shard_len * sizeof(uint64_t), node.in_stream));
+    CUDA_CHECK(cudaStreamSynchronize(node.in_stream));
+    platform::CUDADeviceGuard guard(resource_->dev_id(i));
+    int table_offset =
+        get_table_offset(i, GraphTableType::EDGE_TABLE, edge_idx);
+    tables_[table_offset]->get(reinterpret_cast<uint64_t*>(node.key_storage),
+                               reinterpret_cast<uint64_t*>(node.val_storage),
+                               (size_t)(h_right[i] - h_left[i] + 1),
+                               resource_->remote_stream(i, gpu_id));
+    GpuPsNodeInfo* node_info_list =
+        reinterpret_cast<GpuPsNodeInfo*>(node.val_storage);
+    int* node_degree_array = (int*)(node_info_list + shard_len);
+    int grid_size_ = (shard_len - 1) / block_size_ + 1;
+    get_node_degree_kernel<<<grid_size_,
+                             block_size_,
+                             0,
+                             resource_->remote_stream(i, gpu_id)>>>(
+        node_info_list, node_degree_array, shard_len);
+  }
+  for (int i = 0; i < total_gpu; ++i) {
+    if (h_left[i] == -1) {
+      continue;
+    }
+    CUDA_CHECK(cudaStreamSynchronize(resource_->remote_stream(i, gpu_id)));
+  }
+  move_degree_to_source_gpu(
+      gpu_id, total_gpu, h_left, h_right, d_shard_degree_ptr);
+  fill_dvalues<<<grid_size, block_size_, 0, stream>>>(
+      d_shard_degree_ptr, node_degree_ptr, d_idx_ptr, len);
+  CUDA_CHECK(cudaStreamSynchronize(stream));
+  for (int i = 0; i < total_gpu; i++) {
+    int shard_len = h_left[i] == -1 ? 0 : h_right[i] - h_left[i] + 1;
+    if (shard_len == 0) {
+      continue;
+    }
+    destroy_storage(gpu_id, i);
+  }
+  device_mutex_[gpu_id]->unlock();
+}
+
 NodeQueryResult GpuPsGraphTable::graph_node_sample(int gpu_id,
                                                    int sample_size) {
   return NodeQueryResult();
......
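The new fill_dvalues overload undoes the shard permutation: degrees computed in shard order are scattered back to the caller's original key order through idx. A CPU rendering of the same scatter, for reference only:

  #include <vector>

  // idx[i] is the original position of the i-th shard-ordered key, so the
  // shard-ordered value d_shard_degree[i] lands at degree[idx[i]].
  void fill_degrees_cpu(const std::vector<int>& d_shard_degree,
                        const std::vector<int>& idx,
                        std::vector<int>* degree) {
    for (size_t i = 0; i < d_shard_degree.size(); ++i) {
      (*degree)[idx[i]] = d_shard_degree[i];
    }
  }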
@@ -32,7 +32,8 @@ void GraphGpuWrapper::set_device(std::vector<int> ids) {
 }
 void GraphGpuWrapper::init_conf(const std::string &first_node_type,
-                                const std::string &meta_path) {
+                                const std::string &meta_path,
+                                const std::string &excluded_train_pair) {
   static std::mutex mutex;
   {
     std::lock_guard<std::mutex> lock(mutex);
@@ -45,12 +46,12 @@ void GraphGpuWrapper::init_conf(const std::string &first_node_type,
       paddle::string::split_string<std::string>(first_node_type, ";");
   VLOG(2) << "node_types: " << first_node_type;
   for (auto &type : node_types) {
-    auto iter = feature_to_id.find(type);
-    PADDLE_ENFORCE_NE(iter,
-                      feature_to_id.end(),
-                      platform::errors::NotFound(
-                          "(%s) is not found in feature_to_id.", type));
-    VLOG(2) << "feature_to_id[" << type << "] = " << iter->second;
+    auto iter = node_to_id.find(type);
+    PADDLE_ENFORCE_NE(
+        iter,
+        node_to_id.end(),
+        platform::errors::NotFound("(%s) is not found in node_to_id.", type));
+    VLOG(2) << "node_to_id[" << type << "] = " << iter->second;
     first_node_type_.push_back(iter->second);
   }
   meta_path_.resize(first_node_type_.size());
@@ -58,17 +59,40 @@ void GraphGpuWrapper::init_conf(const std::string &first_node_type,
   for (size_t i = 0; i < meta_paths.size(); i++) {
     auto path = meta_paths[i];
-    auto nodes = paddle::string::split_string<std::string>(path, "-");
+    auto edges = paddle::string::split_string<std::string>(path, "-");
+    for (auto &edge : edges) {
+      auto iter = edge_to_id.find(edge);
+      PADDLE_ENFORCE_NE(iter,
+                        edge_to_id.end(),
+                        platform::errors::NotFound(
+                            "(%s) is not found in edge_to_id.", edge));
+      VLOG(2) << "edge_to_id[" << edge << "] = " << iter->second;
+      meta_path_[i].push_back(iter->second);
+      if (edge_to_node_map_.find(iter->second) == edge_to_node_map_.end()) {
+        auto nodes = paddle::string::split_string<std::string>(edge, "2");
+        uint64_t src_node_id = node_to_id.find(nodes[0])->second;
+        uint64_t dst_node_id = node_to_id.find(nodes[1])->second;
+        edge_to_node_map_[iter->second] = src_node_id << 32 | dst_node_id;
+      }
+    }
+  }
+  auto paths =
+      paddle::string::split_string<std::string>(excluded_train_pair, ";");
+  VLOG(2) << "excluded_train_pair[" << excluded_train_pair << "]";
+  for (auto &path : paths) {
+    auto nodes = paddle::string::split_string<std::string>(path, "2");
     for (auto &node : nodes) {
-      auto iter = edge_to_id.find(node);
+      auto iter = node_to_id.find(node);
       PADDLE_ENFORCE_NE(iter,
                         edge_to_id.end(),
                         platform::errors::NotFound(
                             "(%s) is not found in edge_to_id.", node));
       VLOG(2) << "edge_to_id[" << node << "] = " << iter->second;
-      meta_path_[i].push_back(iter->second);
+      excluded_train_pair_.push_back(iter->second);
     }
   }
   int max_dev_id = 0;
   for (size_t i = 0; i < device_id_mapping.size(); i++) {
     if (device_id_mapping[i] > max_dev_id) {
@@ -85,11 +109,11 @@ void GraphGpuWrapper::init_conf(const std::string &first_node_type,
     auto &finish_node_type = finish_node_type_[i];
     finish_node_type.clear();
-    for (size_t idx = 0; idx < feature_to_id.size(); idx++) {
+    for (size_t idx = 0; idx < node_to_id.size(); idx++) {
       infer_node_type_start[idx] = 0;
     }
     for (auto &type : node_types) {
-      auto iter = feature_to_id.find(type);
+      auto iter = node_to_id.find(type);
       node_type_start[iter->second] = 0;
       infer_node_type_start[iter->second] = 0;
     }
@@ -188,7 +212,7 @@ void GraphGpuWrapper::init_metapath(std::string cur_metapath,
   int first_node_idx;
   std::string first_node =
       paddle::string::split_string<std::string>(cur_metapath_, "2")[0];
-  auto it = feature_to_id.find(first_node);
+  auto it = node_to_id.find(first_node);
   first_node_idx = it->second;
   d_graph_train_total_keys_.resize(thread_num);
   h_graph_train_keys_len_.resize(thread_num);
@@ -309,8 +333,8 @@ void GraphGpuWrapper::set_up_types(const std::vector<std::string> &edge_types,
   }
   id_to_feature = node_types;
   for (size_t table_id = 0; table_id < node_types.size(); table_id++) {
-    int res = feature_to_id.size();
-    feature_to_id[node_types[table_id]] = res;
+    int res = node_to_id.size();
+    node_to_id[node_types[table_id]] = res;
   }
   table_feat_mapping.resize(node_types.size());
   this->table_feat_conf_feat_name.resize(node_types.size());
@@ -389,21 +413,22 @@ void GraphGpuWrapper::load_edge_file(std::string etype2files,
       etype2files, graph_data_local_path, part_num, reverse);
 }
-void GraphGpuWrapper::load_node_file(std::string name, std::string filepath) {
+int GraphGpuWrapper::load_node_file(std::string name, std::string filepath) {
   // 'n' means load nodes and 'node_type' follows
   std::string params = "n" + name;
-  if (feature_to_id.find(name) != feature_to_id.end()) {
-    reinterpret_cast<GpuPsGraphTable *>(graph_table)
+  if (node_to_id.find(name) != node_to_id.end()) {
+    return reinterpret_cast<GpuPsGraphTable *>(graph_table)
         ->cpu_graph_table_->Load(std::string(filepath), params);
   }
+  return 0;
 }
-void GraphGpuWrapper::load_node_file(std::string ntype2files,
-                                     std::string graph_data_local_path,
-                                     int part_num) {
-  reinterpret_cast<GpuPsGraphTable *>(graph_table)
+int GraphGpuWrapper::load_node_file(std::string ntype2files,
+                                    std::string graph_data_local_path,
+                                    int part_num) {
+  return reinterpret_cast<GpuPsGraphTable *>(graph_table)
       ->cpu_graph_table_->parse_node_and_load(
           ntype2files, graph_data_local_path, part_num);
 }
@@ -422,8 +447,8 @@ void GraphGpuWrapper::add_table_feat_conf(std::string table_name,
                                           std::string feat_name,
                                           std::string feat_dtype,
                                           int feat_shape) {
-  if (feature_to_id.find(table_name) != feature_to_id.end()) {
-    int idx = feature_to_id[table_name];
+  if (node_to_id.find(table_name) != node_to_id.end()) {
+    int idx = node_to_id[table_name];
     if (table_feat_mapping[idx].find(feat_name) ==
         table_feat_mapping[idx].end()) {
       int res = table_feat_mapping[idx].size();
@@ -512,7 +537,7 @@ void GraphGpuWrapper::upload_batch(int type,
           g->cpu_graph_table_->make_gpu_ps_graph(idx, ids[i]);
       g->build_graph_on_single_gpu(sub_graph, i, idx);
       sub_graph.release_on_cpu();
-      VLOG(0) << "sub graph on gpu " << i << " is built";
+      VLOG(1) << "sub graph on gpu " << i << " is built";
       return 0;
     }));
   }
@@ -579,7 +604,7 @@ void GraphGpuWrapper::build_gpu_graph_fea(GpuPsCommGraphFea &sub_graph_fea,
     GpuPsGraphTable *g = reinterpret_cast<GpuPsGraphTable *>(graph_table);
     g->build_graph_fea_on_single_gpu(sub_graph_fea, i);
     sub_graph_fea.release_on_cpu();
-    VLOG(0) << "sub graph fea on gpu " << i << " is built";
+    VLOG(1) << "sub graph fea on gpu " << i << " is built";
     return;
   }
@@ -607,6 +632,16 @@ GraphGpuWrapper::get_edge_type_graph(int gpu_id, int edge_type_len) {
       ->get_edge_type_graph(gpu_id, edge_type_len);
 }
+void GraphGpuWrapper::get_node_degree(
+    int gpu_id,
+    int edge_idx,
+    uint64_t *key,
+    int len,
+    std::shared_ptr<phi::Allocation> node_degree) {
+  return ((GpuPsGraphTable *)graph_table)
+      ->get_node_degree(gpu_id, edge_idx, key, len, node_degree);
+}
+
 int GraphGpuWrapper::get_feature_info_of_nodes(
     int gpu_id,
     uint64_t *d_nodes,
@@ -776,7 +811,7 @@ std::string &GraphGpuWrapper::get_node_type_size(std::string first_node_type) {
   auto &type_to_index = get_graph_type_to_index();
   std::vector<std::string> node_type_size;
   for (auto node : uniq_first_node_) {
-    auto it = feature_to_id.find(node);
+    auto it = node_to_id.find(node);
     auto first_node_idx = it->second;
     size_t f_idx = type_to_index[first_node_idx];
     int type_total_key_size = graph_all_type_total_keys[f_idx].size();
......
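The new edge_to_node_map_ packs both endpoint node types of an edge such as "u2t" into a single 64-bit value. A small sketch of the encoding and its inverse (the unpack helpers are illustrative; the patch itself only stores the packed form):

  #include <cstdint>

  // Matches edge_to_node_map_[edge_id] = src_node_id << 32 | dst_node_id.
  uint64_t pack_edge_endpoints(uint64_t src_node_id, uint64_t dst_node_id) {
    return (src_node_id << 32) | dst_node_id;
  }
  uint32_t unpack_src(uint64_t packed) { return packed >> 32; }
  uint32_t unpack_dst(uint64_t packed) { return packed & 0xffffffffu; }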
@@ -41,7 +41,8 @@ class GraphGpuWrapper {
   }
   static std::shared_ptr<GraphGpuWrapper> s_instance_;
   void init_conf(const std::string& first_node_type,
-                 const std::string& meta_path);
+                 const std::string& meta_path,
+                 const std::string& excluded_train_pair);
   void initialize();
   void finalize();
   void set_device(std::vector<int> ids);
@@ -66,10 +67,10 @@ class GraphGpuWrapper {
                       int part_num,
                       bool reverse);
-  void load_node_file(std::string name, std::string filepath);
-  void load_node_file(std::string ntype2files,
-                      std::string graph_data_local_path,
-                      int part_num);
+  int load_node_file(std::string name, std::string filepath);
+  int load_node_file(std::string ntype2files,
+                     std::string graph_data_local_path,
+                     int part_num);
   void load_node_and_edge(std::string etype2files,
                           std::string ntype2files,
                           std::string graph_data_local_path,
@@ -120,6 +121,11 @@ class GraphGpuWrapper {
       int sample_size,
       int len,
       std::vector<std::shared_ptr<phi::Allocation>> edge_type_graphs);
+  void get_node_degree(int gpu_id,
+                       int edge_idx,
+                       uint64_t* key,
+                       int len,
+                       std::shared_ptr<phi::Allocation> node_degree);
   gpuStream_t get_local_stream(int gpuid);
   std::vector<uint64_t> graph_neighbor_sample(
       int gpu_id,
@@ -160,7 +166,7 @@ class GraphGpuWrapper {
   std::string& get_node_type_size(std::string first_node_type);
   std::string& get_edge_type_size();
-  std::unordered_map<std::string, int> edge_to_id, feature_to_id;
+  std::unordered_map<std::string, int> edge_to_id, node_to_id;
   std::vector<std::string> id_to_feature, id_to_edge;
   std::vector<std::unordered_map<std::string, int>> table_feat_mapping;
   std::vector<std::vector<std::string>> table_feat_conf_feat_name;
@@ -175,6 +181,7 @@ class GraphGpuWrapper {
   std::string feature_separator_ = std::string(" ");
   bool conf_initialized_ = false;
   std::vector<int> first_node_type_;
+  std::vector<uint8_t> excluded_train_pair_;
   std::vector<std::vector<int>> meta_path_;
   std::vector<std::set<int>> finish_node_type_;
@@ -187,6 +194,11 @@ class GraphGpuWrapper {
   std::vector<size_t> h_graph_train_keys_len_;
   std::vector<std::vector<std::shared_ptr<phi::Allocation>>>
       d_graph_all_type_total_keys_;
+  std::map<uint64_t,  // edge_id
+           uint64_t   // src_node_id << 32 | dst_node_id
+           >
+      edge_to_node_map_;
   std::vector<std::vector<uint64_t>> h_graph_all_type_keys_len_;
   std::string slot_feature_separator_ = std::string(" ");
......
...@@ -193,17 +193,17 @@ void PSGPUWrapper::add_key_to_gputask(std::shared_ptr<HeterContext> gpu_task) { ...@@ -193,17 +193,17 @@ void PSGPUWrapper::add_key_to_gputask(std::shared_ptr<HeterContext> gpu_task) {
} }
timeline.Pause(); timeline.Pause();
VLOG(0) << "GpuPs task add keys cost " << timeline.ElapsedSec() VLOG(1) << "GpuPs task add keys cost " << timeline.ElapsedSec()
<< " seconds."; << " seconds.";
timeline.Start(); timeline.Start();
size_t slot_num = slot_vector_.size() - 1; size_t slot_num = (size_t)slot_num_for_pull_feature_;
// no slot_fea mode and whole_hbm mode, only keep one unique_sort action // no slot_fea mode and whole_hbm mode, only keep one unique_sort action
if (slot_num > 0 && FLAGS_gpugraph_storage_mode != if (slot_num > 0 && FLAGS_gpugraph_storage_mode !=
paddle::framework::GpuGraphStorageMode::WHOLE_HBM) { paddle::framework::GpuGraphStorageMode::WHOLE_HBM) {
gpu_task->UniqueKeys(); gpu_task->UniqueKeys();
} }
timeline.Pause(); timeline.Pause();
VLOG(0) << "GpuPs task unique cost " << timeline.ElapsedSec() << " seconds."; VLOG(1) << "GpuPs task unique cost " << timeline.ElapsedSec() << " seconds.";
} }
void PSGPUWrapper::resize_gputask(std::shared_ptr<HeterContext> gpu_task) { void PSGPUWrapper::resize_gputask(std::shared_ptr<HeterContext> gpu_task) {
...@@ -218,7 +218,8 @@ void PSGPUWrapper::resize_gputask(std::shared_ptr<HeterContext> gpu_task) { ...@@ -218,7 +218,8 @@ void PSGPUWrapper::resize_gputask(std::shared_ptr<HeterContext> gpu_task) {
} }
} }
void PSGPUWrapper::PreBuildTask(std::shared_ptr<HeterContext> gpu_task) { void PSGPUWrapper::PreBuildTask(std::shared_ptr<HeterContext> gpu_task,
Dataset* dataset_for_pull) {
VLOG(3) << "PSGPUWrapper::BuildGPUPSTask begin"; VLOG(3) << "PSGPUWrapper::BuildGPUPSTask begin";
platform::Timer timeline; platform::Timer timeline;
timeline.Start(); timeline.Start();
...@@ -341,12 +342,13 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr<HeterContext> gpu_task) { ...@@ -341,12 +342,13 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr<HeterContext> gpu_task) {
<< " seconds."; << " seconds.";
} }
} else { } else {
SlotRecordDataset* dataset = reinterpret_cast<SlotRecordDataset*>(dataset_); SlotRecordDataset* dataset =
reinterpret_cast<SlotRecordDataset*>(dataset_for_pull);
const std::vector<uint64_t>& vec_data = dataset->GetGpuGraphTotalKeys(); const std::vector<uint64_t>& vec_data = dataset->GetGpuGraphTotalKeys();
timeline.Start(); timeline.Start();
add_key_to_local(vec_data); add_key_to_local(vec_data);
timeline.Pause(); timeline.Pause();
VLOG(0) << "GpuGraphTotalKeys: " << vec_data.size() VLOG(1) << "GpuGraphTotalKeys: " << vec_data.size()
<< ", add_key_to_local cost " << timeline.ElapsedSec() << ", add_key_to_local cost " << timeline.ElapsedSec()
<< " seconds."; << " seconds.";
} }
...@@ -361,7 +363,8 @@ void PSGPUWrapper::add_slot_feature(std::shared_ptr<HeterContext> gpu_task) { ...@@ -361,7 +363,8 @@ void PSGPUWrapper::add_slot_feature(std::shared_ptr<HeterContext> gpu_task) {
// shard the data across the 8 GPU cards // shard the data across the 8 GPU cards
size_t device_num = heter_devices_.size(); size_t device_num = heter_devices_.size();
std::vector<std::thread> threads; std::vector<std::thread> threads;
size_t slot_num = slot_vector_.size() - 1; // node slot 9008 in slot_vector size_t slot_num =
(size_t)slot_num_for_pull_feature_; // node slot 9008 in slot_vector
auto& local_dim_keys = gpu_task->feature_dim_keys_; // [shard_num, 0, keys] auto& local_dim_keys = gpu_task->feature_dim_keys_; // [shard_num, 0, keys]
double divide_nodeid_cost = 0; double divide_nodeid_cost = 0;
double get_feature_id_cost = 0; double get_feature_id_cost = 0;
...@@ -532,8 +535,7 @@ void PSGPUWrapper::add_slot_feature(std::shared_ptr<HeterContext> gpu_task) { ...@@ -532,8 +535,7 @@ void PSGPUWrapper::add_slot_feature(std::shared_ptr<HeterContext> gpu_task) {
for (size_t i = 0; i < device_num; i++) { for (size_t i = 0; i < device_num; i++) {
feature_num += feature_list_size[i]; feature_num += feature_list_size[i];
} }
VLOG(0) << "feature_num is " << feature_num << " node_num num is " VLOG(1) << "feature_num is " << feature_num << " node_num is " << node_num;
<< node_num;
size_t set_num = thread_keys_shard_num_; size_t set_num = thread_keys_shard_num_;
std::vector<std::unordered_set<uint64_t>> feature_id_set(set_num); std::vector<std::unordered_set<uint64_t>> feature_id_set(set_num);
...@@ -635,7 +637,7 @@ void PSGPUWrapper::add_slot_feature(std::shared_ptr<HeterContext> gpu_task) { ...@@ -635,7 +637,7 @@ void PSGPUWrapper::add_slot_feature(std::shared_ptr<HeterContext> gpu_task) {
add_feature_to_key_cost = time_stage.ElapsedSec(); add_feature_to_key_cost = time_stage.ElapsedSec();
threads.clear(); threads.clear();
timeline.Pause(); timeline.Pause();
VLOG(0) << " add_slot_feature costs: " << timeline.ElapsedSec() << " s." VLOG(1) << " add_slot_feature costs: " << timeline.ElapsedSec() << " s."
<< " divide_nodeid_cost " << divide_nodeid_cost << " divide_nodeid_cost " << divide_nodeid_cost
<< " get_feature_id_cost " << get_feature_id_cost << " get_feature_id_cost " << get_feature_id_cost
<< " add_feature_to_set_cost " << add_feature_to_set_cost << " add_feature_to_set_cost " << add_feature_to_set_cost
...@@ -644,9 +646,9 @@ void PSGPUWrapper::add_slot_feature(std::shared_ptr<HeterContext> gpu_task) { ...@@ -644,9 +646,9 @@ void PSGPUWrapper::add_slot_feature(std::shared_ptr<HeterContext> gpu_task) {
void PSGPUWrapper::BuildPull(std::shared_ptr<HeterContext> gpu_task) { void PSGPUWrapper::BuildPull(std::shared_ptr<HeterContext> gpu_task) {
platform::Timer timeline; platform::Timer timeline;
size_t slot_num = slot_vector_.size() - 1; // node slot 9008 in slot_vector if (slot_num_for_pull_feature_ > 0 &&
if (slot_num > 0 && FLAGS_gpugraph_storage_mode != FLAGS_gpugraph_storage_mode !=
paddle::framework::GpuGraphStorageMode::WHOLE_HBM) { paddle::framework::GpuGraphStorageMode::WHOLE_HBM) {
add_slot_feature(gpu_task); add_slot_feature(gpu_task);
} }
...@@ -656,7 +658,7 @@ void PSGPUWrapper::BuildPull(std::shared_ptr<HeterContext> gpu_task) { ...@@ -656,7 +658,7 @@ void PSGPUWrapper::BuildPull(std::shared_ptr<HeterContext> gpu_task) {
time_stage.Start(); time_stage.Start();
gpu_task->UniqueKeys(); gpu_task->UniqueKeys();
time_stage.Pause(); time_stage.Pause();
VLOG(0) << "BuildPull slot feature uniq and sort cost time: " VLOG(1) << "BuildPull slot feature uniq and sort cost time: "
<< time_stage.ElapsedSec(); << time_stage.ElapsedSec();
auto& local_dim_keys = gpu_task->feature_dim_keys_; auto& local_dim_keys = gpu_task->feature_dim_keys_;
...@@ -795,7 +797,7 @@ void PSGPUWrapper::BuildPull(std::shared_ptr<HeterContext> gpu_task) { ...@@ -795,7 +797,7 @@ void PSGPUWrapper::BuildPull(std::shared_ptr<HeterContext> gpu_task) {
} }
task_futures.clear(); task_futures.clear();
timeline.Pause(); timeline.Pause();
VLOG(0) << "pull sparse from CpuPS into GpuPS total keys " << total_key VLOG(1) << "pull sparse from CpuPS into GpuPS total keys " << total_key
<< ", cost " << timeline.ElapsedSec() << " seconds."; << ", cost " << timeline.ElapsedSec() << " seconds.";
if (multi_node_) { if (multi_node_) {
auto gloo_wrapper = paddle::framework::GlooWrapper::GetInstance(); auto gloo_wrapper = paddle::framework::GlooWrapper::GetInstance();
...@@ -879,7 +881,7 @@ void PSGPUWrapper::divide_to_device(std::shared_ptr<HeterContext> gpu_task) { ...@@ -879,7 +881,7 @@ void PSGPUWrapper::divide_to_device(std::shared_ptr<HeterContext> gpu_task) {
} }
} }
timeline.Pause(); timeline.Pause();
VLOG(0) << "GpuPs prepare for build hbm cost " << timeline.ElapsedSec() VLOG(1) << "GpuPs prepare for build hbm cost " << timeline.ElapsedSec()
<< " seconds."; << " seconds.";
} }
...@@ -1052,7 +1054,7 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr<HeterContext> gpu_task) { ...@@ -1052,7 +1054,7 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr<HeterContext> gpu_task) {
<< " dim index: " << j << " contains feasign nums: " << " dim index: " << j << " contains feasign nums: "
<< gpu_task->device_dim_ptr_[i][j].size(); << gpu_task->device_dim_ptr_[i][j].size();
} }
VLOG(0) << i << " card with dynamic mf contains feasign nums total: " VLOG(1) << i << " card with dynamic mf contains feasign nums total: "
<< feature_keys_count[i]; << feature_keys_count[i];
size_max = std::max(size_max, feature_keys_count[i]); size_max = std::max(size_max, feature_keys_count[i]);
} }
...@@ -1074,7 +1076,7 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr<HeterContext> gpu_task) { ...@@ -1074,7 +1076,7 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr<HeterContext> gpu_task) {
#endif #endif
} }
stagetime.Pause(); stagetime.Pause();
VLOG(0) << "card: " VLOG(1) << "card: "
<< " BuildGPUTask create HeterPs_ costs: " << stagetime.ElapsedSec() << " BuildGPUTask create HeterPs_ costs: " << stagetime.ElapsedSec()
<< " s."; << " s.";
stagetime.Start(); stagetime.Start();
...@@ -1191,8 +1193,7 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr<HeterContext> gpu_task) { ...@@ -1191,8 +1193,7 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr<HeterContext> gpu_task) {
threads[j] = std::thread(build_ps_thread, i, j, len, feature_value_size); threads[j] = std::thread(build_ps_thread, i, j, len, feature_value_size);
} }
// build feature table // build feature table
size_t slot_num = slot_vector_.size() - 1; // node slot 9008 in slot_vector if (slot_num_for_pull_feature_ > 0 &&
if (slot_num > 0 &&
(FLAGS_gpugraph_storage_mode == paddle::framework::GpuGraphStorageMode:: (FLAGS_gpugraph_storage_mode == paddle::framework::GpuGraphStorageMode::
MEM_EMB_FEATURE_AND_GPU_GRAPH || MEM_EMB_FEATURE_AND_GPU_GRAPH ||
FLAGS_gpugraph_storage_mode == FLAGS_gpugraph_storage_mode ==
...@@ -1228,7 +1229,7 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr<HeterContext> gpu_task) { ...@@ -1228,7 +1229,7 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr<HeterContext> gpu_task) {
t.join(); t.join();
} }
stagetime.Pause(); stagetime.Pause();
VLOG(0) << "card: " << i VLOG(1) << "card: " << i
<< " BuildGPUTask build_ps async costs: " << stagetime.ElapsedSec() << " BuildGPUTask build_ps async costs: " << stagetime.ElapsedSec()
<< " s."; << " s.";
}; };
...@@ -1262,7 +1263,7 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr<HeterContext> gpu_task) { ...@@ -1262,7 +1263,7 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr<HeterContext> gpu_task) {
} }
cpu_task_futures.clear(); cpu_task_futures.clear();
stagetime.Pause(); stagetime.Pause();
VLOG(0) << " BuildGPUTask build_dynamic_mf_func " VLOG(1) << " BuildGPUTask build_dynamic_mf_func "
<< " cost " << stagetime.ElapsedSec() << " s."; << " cost " << stagetime.ElapsedSec() << " s.";
for (int i = 0; i < device_num; i++) { for (int i = 0; i < device_num; i++) {
cpu_reday_channels_[i]->Close(); cpu_reday_channels_[i]->Close();
...@@ -1282,7 +1283,7 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr<HeterContext> gpu_task) { ...@@ -1282,7 +1283,7 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr<HeterContext> gpu_task) {
gpu_task->sub_graph_feas = NULL; gpu_task->sub_graph_feas = NULL;
} }
stagetime.Pause(); stagetime.Pause();
VLOG(0) << " build_dymf_hbm_pool " VLOG(1) << " build_dymf_hbm_pool "
<< " cost " << stagetime.ElapsedSec() << " s."; << " cost " << stagetime.ElapsedSec() << " s.";
} }
...@@ -1292,7 +1293,7 @@ void PSGPUWrapper::LoadIntoMemory(bool is_shuffle) { ...@@ -1292,7 +1293,7 @@ void PSGPUWrapper::LoadIntoMemory(bool is_shuffle) {
timer.Start(); timer.Start();
dataset_->LoadIntoMemory(); dataset_->LoadIntoMemory();
timer.Pause(); timer.Pause();
VLOG(0) << "LoadIntoMemory cost: " << timer.ElapsedSec() << "s"; VLOG(1) << "LoadIntoMemory cost: " << timer.ElapsedSec() << "s";
gpu_graph_mode_ = dataset_->GetGpuGraphMode(); gpu_graph_mode_ = dataset_->GetGpuGraphMode();
if (dataset_->GetMemoryDataSize() == 0) { if (dataset_->GetMemoryDataSize() == 0) {
VLOG(0) << "GetMemoryDataSize == 0"; VLOG(0) << "GetMemoryDataSize == 0";
...@@ -1309,7 +1310,7 @@ void PSGPUWrapper::LoadIntoMemory(bool is_shuffle) { ...@@ -1309,7 +1310,7 @@ void PSGPUWrapper::LoadIntoMemory(bool is_shuffle) {
std::shared_ptr<HeterContext> gpu_task = gpu_task_pool_.Get(); std::shared_ptr<HeterContext> gpu_task = gpu_task_pool_.Get();
gpu_task->Reset(); gpu_task->Reset();
gpu_task->pass_id_ = (uint16_t)(dataset_->GetPassID()); gpu_task->pass_id_ = (uint16_t)(dataset_->GetPassID());
data_ready_channel_->Put(gpu_task); data_ready_channel_->Put(std::make_pair(gpu_task, dataset_));
} else if (hbm_sparse_table_initialized_ == false) { } else if (hbm_sparse_table_initialized_ == false) {
SparseTableToHbm(); SparseTableToHbm();
} }
...@@ -1317,7 +1318,7 @@ void PSGPUWrapper::LoadIntoMemory(bool is_shuffle) { ...@@ -1317,7 +1318,7 @@ void PSGPUWrapper::LoadIntoMemory(bool is_shuffle) {
std::shared_ptr<HeterContext> gpu_task = gpu_task_pool_.Get(); std::shared_ptr<HeterContext> gpu_task = gpu_task_pool_.Get();
gpu_task->Reset(); gpu_task->Reset();
gpu_task->pass_id_ = (uint16_t)(dataset_->GetPassID()); gpu_task->pass_id_ = (uint16_t)(dataset_->GetPassID());
data_ready_channel_->Put(gpu_task); data_ready_channel_->Put(std::make_pair(gpu_task, dataset_));
#endif #endif
VLOG(3) << "End LoadIntoMemory(), dataset[" << dataset_ << "]"; VLOG(3) << "End LoadIntoMemory(), dataset[" << dataset_ << "]";
} }
...@@ -1332,17 +1333,20 @@ void PSGPUWrapper::start_build_thread() { ...@@ -1332,17 +1333,20 @@ void PSGPUWrapper::start_build_thread() {
void PSGPUWrapper::pre_build_thread() { void PSGPUWrapper::pre_build_thread() {
// prebuild: process load_data // prebuild: process load_data
while (running_) { while (running_) {
std::pair<std::shared_ptr<HeterContext>, Dataset*> task =
std::make_pair(nullptr, nullptr);
std::shared_ptr<HeterContext> gpu_task = nullptr; std::shared_ptr<HeterContext> gpu_task = nullptr;
if (!data_ready_channel_->Get(gpu_task)) { if (!data_ready_channel_->Get(task)) {
continue; continue;
} }
gpu_task = task.first;
VLOG(3) << "thread PreBuildTask start."; VLOG(3) << "thread PreBuildTask start.";
platform::Timer timer; platform::Timer timer;
timer.Start(); timer.Start();
// build cpu ps data process // build cpu ps data process
PreBuildTask(gpu_task); PreBuildTask(gpu_task, task.second);
timer.Pause(); timer.Pause();
VLOG(0) << "thread PreBuildTask end, cost time: " << timer.ElapsedSec() VLOG(1) << "thread PreBuildTask end, cost time: " << timer.ElapsedSec()
<< " s"; << " s";
buildcpu_ready_channel_->Put(gpu_task); buildcpu_ready_channel_->Put(gpu_task);
} }
...@@ -1382,7 +1386,7 @@ void PSGPUWrapper::build_task() { ...@@ -1382,7 +1386,7 @@ void PSGPUWrapper::build_task() {
return; return;
} }
VLOG(0) << "PrepareGPUTask start."; VLOG(1) << "PrepareGPUTask start.";
platform::Timer timer; platform::Timer timer;
timer.Start(); timer.Start();
if (!multi_mf_dim_) { if (!multi_mf_dim_) {
...@@ -1390,7 +1394,7 @@ void PSGPUWrapper::build_task() { ...@@ -1390,7 +1394,7 @@ void PSGPUWrapper::build_task() {
} }
BuildGPUTask(gpu_task); BuildGPUTask(gpu_task);
timer.Pause(); timer.Pause();
VLOG(0) << "PrepareGPUTask + BuildGPUTask end, cost time: " VLOG(1) << "PrepareGPUTask + BuildGPUTask end, cost time: "
<< timer.ElapsedSec() << "s"; << timer.ElapsedSec() << "s";
current_task_ = gpu_task; current_task_ = gpu_task;
...@@ -1419,11 +1423,11 @@ void PSGPUWrapper::BeginPass() { ...@@ -1419,11 +1423,11 @@ void PSGPUWrapper::BeginPass() {
"[BeginPass] after build_task, current task is not null.")); "[BeginPass] after build_task, current task is not null."));
} }
if (FLAGS_gpugraph_dedup_pull_push_mode) { if (FLAGS_gpugraph_dedup_pull_push_mode) {
VLOG(0) << "BeginPass end, cost time: " << timer.ElapsedSec() VLOG(1) << "BeginPass end, cost time: " << timer.ElapsedSec()
<< "s, enable pull push dedup mode=" << "s, enable pull push dedup mode="
<< FLAGS_gpugraph_dedup_pull_push_mode; << FLAGS_gpugraph_dedup_pull_push_mode;
} else { } else {
VLOG(0) << "BeginPass end, cost time: " << timer.ElapsedSec() << "s"; VLOG(1) << "BeginPass end, cost time: " << timer.ElapsedSec() << "s";
} }
} }
...@@ -1433,11 +1437,14 @@ void PSGPUWrapper::EndPass() { ...@@ -1433,11 +1437,14 @@ void PSGPUWrapper::EndPass() {
return; return;
} }
#endif #endif
if (current_task_ == nullptr) {
return;
}
platform::Timer stagetime; platform::Timer stagetime;
stagetime.Start(); stagetime.Start();
HbmToSparseTable(); HbmToSparseTable();
stagetime.Pause(); stagetime.Pause();
VLOG(0) << "EndPass HbmToSparseTable cost time: " << stagetime.ElapsedSec() VLOG(1) << "EndPass HbmToSparseTable cost time: " << stagetime.ElapsedSec()
<< "s"; << "s";
gpu_task_pool_.Push(current_task_); gpu_task_pool_.Push(current_task_);
...@@ -1453,7 +1460,7 @@ void PSGPUWrapper::SparseTableToHbm() { ...@@ -1453,7 +1460,7 @@ void PSGPUWrapper::SparseTableToHbm() {
gpu_task->init(thread_keys_shard_num_, device_num, multi_mf_dim_); gpu_task->init(thread_keys_shard_num_, device_num, multi_mf_dim_);
gpu_task->pass_id_ = (uint16_t)(dataset_->GetPassID()); gpu_task->pass_id_ = (uint16_t)(dataset_->GetPassID());
auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); auto gpu_graph_ptr = GraphGpuWrapper::GetInstance();
auto node_to_id = gpu_graph_ptr->feature_to_id; auto node_to_id = gpu_graph_ptr->node_to_id;
auto edge_to_id = gpu_graph_ptr->edge_to_id; auto edge_to_id = gpu_graph_ptr->edge_to_id;
std::vector<uint64_t> vec_data = gpu_graph_ptr->get_graph_total_keys(); std::vector<uint64_t> vec_data = gpu_graph_ptr->get_graph_total_keys();
...@@ -1569,7 +1576,7 @@ void PSGPUWrapper::HbmToSparseTable() { ...@@ -1569,7 +1576,7 @@ void PSGPUWrapper::HbmToSparseTable() {
size_t thread_num = 16; size_t thread_num = 16;
size_t device_num = heter_devices_.size(); size_t device_num = heter_devices_.size();
if (multi_mf_dim_) { if (multi_mf_dim_) {
VLOG(0) << "psgpu wrapper dump pool: multi_mf_dim_: " << multi_mf_dim_; VLOG(1) << "psgpu wrapper dump pool: multi_mf_dim_: " << multi_mf_dim_;
for (size_t i = 0; i < device_num; i++) { for (size_t i = 0; i < device_num; i++) {
cpu_reday_channels_[i]->Open(); cpu_reday_channels_[i]->Open();
for (int j = 0; j < multi_mf_dim_; j++) { for (int j = 0; j < multi_mf_dim_; j++) {
...@@ -1593,7 +1600,7 @@ void PSGPUWrapper::HbmToSparseTable() { ...@@ -1593,7 +1600,7 @@ void PSGPUWrapper::HbmToSparseTable() {
f.wait(); f.wait();
} }
timer.Pause(); timer.Pause();
VLOG(0) << " EndPass dump_pool_to_cpu_func " VLOG(1) << " EndPass dump_pool_to_cpu_func "
<< " cost " << timer.ElapsedSec() << " s."; << " cost " << timer.ElapsedSec() << " s.";
for (size_t i = 0; i < device_num; i++) { for (size_t i = 0; i < device_num; i++) {
cpu_reday_channels_[i]->Close(); cpu_reday_channels_[i]->Close();
...@@ -1605,7 +1612,7 @@ void PSGPUWrapper::HbmToSparseTable() { ...@@ -1605,7 +1612,7 @@ void PSGPUWrapper::HbmToSparseTable() {
} }
cpu_task_futures.clear(); cpu_task_futures.clear();
timer.Pause(); timer.Pause();
VLOG(0) << " EndPass cpu_func " VLOG(1) << " EndPass cpu_func "
<< " cost " << timer.ElapsedSec() << " s."; << " cost " << timer.ElapsedSec() << " s.";
if (keysize_max != 0) { if (keysize_max != 0) {
HeterPs_->end_pass(); HeterPs_->end_pass();
......
...@@ -24,6 +24,7 @@ limitations under the License. */ ...@@ -24,6 +24,7 @@ limitations under the License. */
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include <unordered_set> #include <unordered_set>
#include <utility>
#include <vector> #include <vector>
#ifdef PADDLE_WITH_GLOO #ifdef PADDLE_WITH_GLOO
#include <gloo/broadcast.h> #include <gloo/broadcast.h>
...@@ -202,7 +203,8 @@ class PSGPUWrapper { ...@@ -202,7 +203,8 @@ class PSGPUWrapper {
void divide_to_device(std::shared_ptr<HeterContext> gpu_task); void divide_to_device(std::shared_ptr<HeterContext> gpu_task);
void add_slot_feature(std::shared_ptr<HeterContext> gpu_task); void add_slot_feature(std::shared_ptr<HeterContext> gpu_task);
void BuildGPUTask(std::shared_ptr<HeterContext> gpu_task); void BuildGPUTask(std::shared_ptr<HeterContext> gpu_task);
void PreBuildTask(std::shared_ptr<HeterContext> gpu_task); void PreBuildTask(std::shared_ptr<HeterContext> gpu_task,
Dataset* dataset_for_pull);
void BuildPull(std::shared_ptr<HeterContext> gpu_task); void BuildPull(std::shared_ptr<HeterContext> gpu_task);
void PrepareGPUTask(std::shared_ptr<HeterContext> gpu_task); void PrepareGPUTask(std::shared_ptr<HeterContext> gpu_task);
void LoadIntoMemory(bool is_shuffle); void LoadIntoMemory(bool is_shuffle);
...@@ -251,8 +253,8 @@ class PSGPUWrapper { ...@@ -251,8 +253,8 @@ class PSGPUWrapper {
buildpull_threads_.join(); buildpull_threads_.join();
s_instance_ = nullptr; s_instance_ = nullptr;
VLOG(3) << "PSGPUWrapper Finalize Finished."; VLOG(3) << "PSGPUWrapper Finalize Finished.";
HeterPs_->show_table_collisions();
if (HeterPs_ != NULL) { if (HeterPs_ != NULL) {
HeterPs_->show_table_collisions();
delete HeterPs_; delete HeterPs_;
HeterPs_ = NULL; HeterPs_ = NULL;
} }
...@@ -605,7 +607,10 @@ class PSGPUWrapper { ...@@ -605,7 +607,10 @@ class PSGPUWrapper {
slot_vector_ = slot_vector; slot_vector_ = slot_vector;
VLOG(0) << "slot_vector size is " << slot_vector_.size(); VLOG(0) << "slot_vector size is " << slot_vector_.size();
} }
void SetPullFeatureSlotNum(int slot_num) {
slot_num_for_pull_feature_ = slot_num;
VLOG(0) << "slot_num_for_pull_feature_ is " << slot_num_for_pull_feature_;
}
void SetSlotOffsetVector(const std::vector<int>& slot_offset_vector) { void SetSlotOffsetVector(const std::vector<int>& slot_offset_vector) {
slot_offset_vector_ = slot_offset_vector; slot_offset_vector_ = slot_offset_vector;
std::cout << "yxf set: "; std::cout << "yxf set: ";
...@@ -734,6 +739,7 @@ class PSGPUWrapper { ...@@ -734,6 +739,7 @@ class PSGPUWrapper {
std::vector<int> index_dim_vec_; std::vector<int> index_dim_vec_;
int multi_mf_dim_{0}; int multi_mf_dim_{0};
int max_mf_dim_{0}; int max_mf_dim_{0};
int slot_num_for_pull_feature_{0};
size_t val_type_size_{0}; size_t val_type_size_{0};
size_t grad_type_size_{0}; size_t grad_type_size_{0};
size_t pull_type_size_{0}; size_t pull_type_size_{0};
...@@ -781,10 +787,10 @@ class PSGPUWrapper { ...@@ -781,10 +787,10 @@ class PSGPUWrapper {
// hbm pools of total dims number // hbm pools of total dims number
#endif #endif
std::shared_ptr< std::shared_ptr<paddle::framework::ChannelObject<
paddle::framework::ChannelObject<std::shared_ptr<HeterContext>>> std::pair<std::shared_ptr<HeterContext>, Dataset*>>>
data_ready_channel_ = data_ready_channel_ = paddle::framework::MakeChannel<
paddle::framework::MakeChannel<std::shared_ptr<HeterContext>>(); std::pair<std::shared_ptr<HeterContext>, Dataset*>>();
std::shared_ptr< std::shared_ptr<
paddle::framework::ChannelObject<std::shared_ptr<HeterContext>>> paddle::framework::ChannelObject<std::shared_ptr<HeterContext>>>
buildcpu_ready_channel_ = buildcpu_ready_channel_ =
......
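The hand-off between LoadIntoMemory and pre_build_thread now travels as a (gpu_task, dataset) pair, so PreBuildTask pulls keys from the dataset that produced the task rather than from the global dataset_. A hedged sketch of the pattern, with queue.Queue standing in for paddle::framework::ChannelObject and placeholder task/dataset objects:

# Hedged sketch; Queue replaces Paddle's channel, and the objects are stand-ins.
import queue

data_ready_channel = queue.Queue()

def load_into_memory(gpu_task, dataset):
    data_ready_channel.put((gpu_task, dataset))  # was: put(gpu_task)

def pre_build_thread():
    gpu_task, dataset_for_pull = data_ready_channel.get()
    # PreBuildTask(gpu_task, dataset_for_pull) then reads keys from exactly
    # the dataset that was paired with this task.
    ...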
...@@ -31,7 +31,7 @@ DECLARE_bool(enable_exit_when_partial_worker); ...@@ -31,7 +31,7 @@ DECLARE_bool(enable_exit_when_partial_worker);
namespace paddle { namespace paddle {
namespace framework { namespace framework {
std::atomic<uint64_t> HogwildWorker::worker_num_stat_(0); std::atomic<bool> HogwildWorker::quit_flag_(false);
Barrier g_barrier; Barrier g_barrier;
void HogwildWorker::Initialize(const TrainerDesc &desc) { void HogwildWorker::Initialize(const TrainerDesc &desc) {
...@@ -148,7 +148,7 @@ void HogwildWorker::TrainFilesWithProfiler() { ...@@ -148,7 +148,7 @@ void HogwildWorker::TrainFilesWithProfiler() {
int cur_batch; int cur_batch;
int batch_cnt = 0; int batch_cnt = 0;
if (thread_id_ == 0) { if (thread_id_ == 0) {
worker_num_stat_.store(0); quit_flag_.store(false);
} }
g_barrier.wait(); g_barrier.wait();
bool train_mode = device_reader_->IsTrainMode(); bool train_mode = device_reader_->IsTrainMode();
...@@ -160,11 +160,11 @@ void HogwildWorker::TrainFilesWithProfiler() { ...@@ -160,11 +160,11 @@ void HogwildWorker::TrainFilesWithProfiler() {
while (1) { while (1) {
cur_batch = device_reader_->Next(); cur_batch = device_reader_->Next();
if (FLAGS_enable_exit_when_partial_worker && train_mode) { if (FLAGS_enable_exit_when_partial_worker && train_mode) {
if (cur_batch > 0) { if (cur_batch <= 0) {
worker_num_stat_.fetch_add(1, std::memory_order_relaxed); quit_flag_.store(true, std::memory_order_relaxed);
} }
g_barrier.wait(); g_barrier.wait();
if (worker_num_stat_.load(std::memory_order_relaxed) % thread_num_ != 0) { if (quit_flag_.load(std::memory_order_relaxed) == true) {
break; break;
} }
} }
...@@ -265,7 +265,8 @@ void HogwildWorker::TrainFiles() { ...@@ -265,7 +265,8 @@ void HogwildWorker::TrainFiles() {
int cur_batch; int cur_batch;
int batch_cnt = 0; int batch_cnt = 0;
if (thread_id_ == 0) { if (thread_id_ == 0) {
worker_num_stat_.store(0); quit_flag_.store(false);
} }
g_barrier.wait(); g_barrier.wait();
...@@ -280,11 +281,11 @@ void HogwildWorker::TrainFiles() { ...@@ -280,11 +281,11 @@ void HogwildWorker::TrainFiles() {
while (1) { while (1) {
cur_batch = device_reader_->Next(); cur_batch = device_reader_->Next();
if (FLAGS_enable_exit_when_partial_worker && train_mode) { if (FLAGS_enable_exit_when_partial_worker && train_mode) {
if (cur_batch > 0) { if (cur_batch <= 0) {
worker_num_stat_.fetch_add(1, std::memory_order_relaxed); quit_flag_.store(true, std::memory_order_relaxed);
} }
g_barrier.wait(); g_barrier.wait();
if (worker_num_stat_.load(std::memory_order_relaxed) % thread_num_ != 0) { if (quit_flag_.load(std::memory_order_relaxed) == true) {
break; break;
} }
} }
...@@ -320,7 +321,7 @@ void HogwildWorker::TrainFiles() { ...@@ -320,7 +321,7 @@ void HogwildWorker::TrainFiles() {
#endif #endif
} }
timeline.Pause(); timeline.Pause();
VLOG(0) << "worker " << thread_id_ << " train cost " << timeline.ElapsedSec() VLOG(1) << "worker " << thread_id_ << " train cost " << timeline.ElapsedSec()
<< " seconds, batch_num: " << total_batch_num; << " seconds, batch_num: " << total_batch_num;
if (need_dump_field_ || need_dump_param_) { if (need_dump_field_ || need_dump_param_) {
......
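The worker changes above replace the per-worker batch counter with a single quit flag: the first worker whose reader returns no batch raises the flag, and after the shared barrier every worker breaks out of the training loop together. A runnable sketch of the protocol; the thread count and per-worker batch counts are invented:

# Hedged sketch of the quit_flag_/g_barrier early-exit protocol.
import threading

THREADS = 4
batches = [5, 3, 7, 6]                 # uneven data per worker
quit_flag = threading.Event()          # plays the role of quit_flag_
barrier = threading.Barrier(THREADS)   # plays the role of g_barrier

def worker(tid):
    remaining = batches[tid]
    while True:
        cur_batch = 1 if remaining > 0 else 0  # stand-in for device_reader_->Next()
        remaining -= 1
        if cur_batch <= 0:
            quit_flag.set()            # flag is set before the barrier ...
        barrier.wait()                 # ... so every worker sees the same state
        if quit_flag.is_set():
            break
        # ... run one training step on cur_batch ...
    print(f"worker {tid} exits once the shortest reader is drained")

threads = [threading.Thread(target=worker, args=(i,)) for i in range(THREADS)]
for t in threads:
    t.start()
for t in threads:
    t.join()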
...@@ -25,6 +25,8 @@ limitations under the License. */ ...@@ -25,6 +25,8 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
extern Barrier g_barrier;
void MultiTrainer::Initialize(const TrainerDesc& trainer_desc, void MultiTrainer::Initialize(const TrainerDesc& trainer_desc,
Dataset* dataset) { Dataset* dataset) {
thread_num_ = trainer_desc.thread_num(); thread_num_ = trainer_desc.thread_num();
...@@ -62,7 +64,7 @@ void MultiTrainer::Initialize(const TrainerDesc& trainer_desc, ...@@ -62,7 +64,7 @@ void MultiTrainer::Initialize(const TrainerDesc& trainer_desc,
thread_num_); thread_num_);
} }
#endif #endif
g_barrier.reset(thread_num_);
for (int i = 0; i < thread_num_; ++i) { for (int i = 0; i < thread_num_; ++i) {
workers_[i] = DeviceWorkerFactory::CreateDeviceWorker( workers_[i] = DeviceWorkerFactory::CreateDeviceWorker(
trainer_desc.device_worker_name()); trainer_desc.device_worker_name());
...@@ -74,6 +76,7 @@ void MultiTrainer::Initialize(const TrainerDesc& trainer_desc, ...@@ -74,6 +76,7 @@ void MultiTrainer::Initialize(const TrainerDesc& trainer_desc,
workers_[i]->Initialize(trainer_desc); workers_[i]->Initialize(trainer_desc);
workers_[i]->SetDeviceIndex(i); workers_[i]->SetDeviceIndex(i);
workers_[i]->SetDataFeed(readers[i]); workers_[i]->SetDataFeed(readers[i]);
workers_[i]->SetThreadNum(thread_num_);
} }
// set debug here // set debug here
...@@ -177,7 +180,7 @@ void MultiTrainer::InitOtherEnv(const ProgramDesc& main_program) { ...@@ -177,7 +180,7 @@ void MultiTrainer::InitOtherEnv(const ProgramDesc& main_program) {
// for unittest which call train_from_dataset but does not call // for unittest which call train_from_dataset but does not call
// fleet.init_worker() first // fleet.init_worker() first
if (communicator == nullptr) { if (communicator == nullptr) {
VLOG(0) << "MultiTrainer::InitOtherEnv Communicator is null!"; VLOG(1) << "MultiTrainer::InitOtherEnv Communicator is null!";
} else { } else {
auto& recv_ctx = communicator->GetRecvCtxMap(); auto& recv_ctx = communicator->GetRecvCtxMap();
communicator->PullDense(recv_ctx); communicator->PullDense(recv_ctx);
...@@ -299,13 +302,13 @@ void MultiTrainer::Finalize() { ...@@ -299,13 +302,13 @@ void MultiTrainer::Finalize() {
auto communicator = paddle::distributed::Communicator::GetInstance(); auto communicator = paddle::distributed::Communicator::GetInstance();
// for unittest which does not call fleet.init_worker() first // for unittest which does not call fleet.init_worker() first
if (communicator == nullptr) { if (communicator == nullptr) {
VLOG(0) << "MultiTrainer::Finalize communicator is null!"; VLOG(1) << "MultiTrainer::Finalize communicator is null!";
} else { } else {
if (communicator->_worker_ptr != nullptr) { if (communicator->_worker_ptr != nullptr) {
communicator->_worker_ptr->Flush(); communicator->_worker_ptr->Flush();
VLOG(1) << "MultiTrainer::Finalize ps client flush done"; VLOG(1) << "MultiTrainer::Finalize ps client flush done";
} else { } else {
VLOG(0) << "communicator->_worker_ptr is null"; VLOG(1) << "communicator->_worker_ptr is null";
} }
} }
#endif #endif
......
...@@ -64,6 +64,7 @@ register_unity_group( ...@@ -64,6 +64,7 @@ register_unity_group(
cudnn_lstm_op.cc cudnn_lstm_op.cc
cumsum_op.cc cumsum_op.cc
cvm_op.cc cvm_op.cc
unzip_op.cc
data_norm_op.cc data_norm_op.cc
deformable_conv_op.cc deformable_conv_op.cc
deformable_conv_v1_op.cc deformable_conv_v1_op.cc
...@@ -402,6 +403,7 @@ register_unity_group( ...@@ -402,6 +403,7 @@ register_unity_group(
ctc_align_op.cu ctc_align_op.cu
cumsum_op.cu cumsum_op.cu
cvm_op.cu cvm_op.cu
unzip_op.cu
data_norm_op.cu data_norm_op.cu
deformable_conv_op.cu deformable_conv_op.cu
deformable_conv_v1_op.cu deformable_conv_v1_op.cu
...@@ -579,3 +581,5 @@ register_unity_group(cu expand_op.cu) ...@@ -579,3 +581,5 @@ register_unity_group(cu expand_op.cu)
register_unity_group(cu matmul_v2_op.cu) register_unity_group(cu matmul_v2_op.cu)
register_unity_group(cu top_k_v2_op.cu) register_unity_group(cu top_k_v2_op.cu)
register_unity_group(cu set_value_op.cu) register_unity_group(cu set_value_op.cu)
register_unity_group(cu unzip.cu)
register_unity_group(cc unzip.cc)
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/unzip_op.h"
#include <memory>
#include "paddle/phi/kernels/funcs/math_function.h"
namespace paddle {
namespace operators {
class unzipOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "lod");
OP_INOUT_CHECK(ctx->HasOutput("Y"), "Output", "Y", "lod");
auto x_dims = ctx->GetInputDim("X");
PADDLE_ENFORCE_EQ(
x_dims.size(),
2UL,
platform::errors::InvalidArgument(
"Input(X)'s rank should be 2, but got %d", x_dims.size()));
auto lod_dims = ctx->GetInputDim("lod");
PADDLE_ENFORCE_EQ(
lod_dims.size(),
1UL,
platform::errors::InvalidArgument(
"Input(X)'s rank should be 1, but got %d", lod_dims.size()));
ctx->SetOutputDim("Y", {lod_dims[0] - 1, x_dims[1]});
}
protected:
// Explicitly set that the data type of computation kernel of
// unzip
// is determined by its input "X".
phi::KernelKey GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"),
ctx.device_context().GetPlace());
}
};
class unzipGradientOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "unzipGradient");
OP_INOUT_CHECK(ctx->HasInput("lod"), "Input", "unzip", "unzipGradient");
OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Y")),
"Input",
framework::GradVarName("Y"),
"unzipGradient");
OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")),
"Output",
framework::GradVarName("X"),
"unzipGradient");
auto x_dims = ctx->GetInputDim("X");
auto lod_dims = ctx->GetInputDim("lod");
auto dy_dims = ctx->GetInputDim(framework::GradVarName("Y"));
PADDLE_ENFORCE_EQ(
x_dims.size(),
2,
platform::errors::InvalidArgument(
"Expect Input(X)'s rank == 2, but got %d", x_dims.size()));
PADDLE_ENFORCE_EQ(
dy_dims.size(),
2,
platform::errors::InvalidArgument(
"Expect Input(X)'s rank == 2, but got %d", dy_dims.size()));
PADDLE_ENFORCE_EQ(
lod_dims.size(),
1,
platform::errors::InvalidArgument(
"Expect Input(X)'s rank == 1, but got %d", lod_dims.size()));
PADDLE_ENFORCE_EQ(
x_dims[1],
dy_dims[1],
platform::errors::InvalidArgument(
"The 1st dimension of Input(X) and Input(Y@Grad) should "
"be equal, X is %d, Y@Grad is %d",
x_dims[1],
dy_dims[1]));
ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
ctx->ShareLoD("X", framework::GradVarName("X"));
}
protected:
// Explicitly set that the data type of computation kernel of
// unzip
// is determined by its input "X".
phi::KernelKey GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(
ctx, framework::GradVarName("Y")),
ctx.device_context().GetPlace());
}
};
class unzipOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"(LodTensor, default LodTensor<float>), a 2-D tensor with shape "
"[M x N],"
" where N is the batch size and D is the emebdding dim. ");
AddInput("lod", "(Tensor), a 1-D Tensor with shape [K]");
AddOutput("Y",
"(LodTensor, default LodTensor<float>), a 2-D tensor with shape "
"[K-1 x N].");
AddComment(R"DOC(
unzip Operator.
)DOC");
}
};
template <typename T>
class unzipGradOpMaker : public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
void Apply(GradOpPtr<T> op) const override {
op->SetType("unzip_grad");
op->SetInput("X", this->Input("X"));
op->SetInput("lod", this->Input("lod"));
op->SetInput(framework::GradVarName("Y"), this->OutputGrad("Y"));
op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
op->SetAttrMap(this->Attrs());
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(unzip,
ops::unzipOp,
ops::unzipOpMaker,
ops::unzipGradOpMaker<paddle::framework::OpDesc>,
ops::unzipGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(unzip_grad, ops::unzipGradientOp);
REGISTER_OP_CPU_KERNEL(unzip, ops::unzipOpKernel<int64_t>);
REGISTER_OP_CPU_KERNEL(unzip_grad, ops::unzipGradOpKernel<int64_t>);
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/operators/unzip_op.h"
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace paddle {
namespace operators {
using phi::PADDLE_CUDA_NUM_THREADS;
template <typename T, typename LodType>
__global__ void unzipKernel(
const T* X, const LodType* lod, T* Y, size_t col_size, size_t n) {
CUDA_KERNEL_LOOP(i, n) {
int lod_idx = i / col_size;
if ((lod[lod_idx + 1] - lod[lod_idx]) > 0) {
assert((lod[lod_idx + 1] - lod[lod_idx]) == col_size);
int x_idx = 0;
for (int j = 0; j < lod_idx; ++j) {
if ((lod[j + 1] - lod[j]) > 0) {
x_idx++;
}
}
Y[i] = X[x_idx * col_size + (i % col_size)];
} else {
Y[i] = 0;
}
}
}
template <typename T, typename LodType>
class unzipCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const auto* x = context.Input<phi::DenseTensor>("X");
const T* x_data = x->data<T>();
const auto* lod = context.Input<phi::DenseTensor>("lod");
const LodType* lod_data = lod->data<LodType>();
auto col_size = x->dims()[1];
auto row_size = lod->dims()[0] - 1;
auto y_numel = col_size * row_size;
auto* y = context.Output<phi::DenseTensor>("Y");
T* y_data = y->mutable_data<T>(context.GetPlace());
// Input X itself carries no LoD information; the explicit lod tensor drives the unzip.
auto stream = context.template device_context<phi::GPUContext>().stream();
unzipKernel<<<(y_numel + PADDLE_CUDA_NUM_THREADS - 1) /
PADDLE_CUDA_NUM_THREADS,
PADDLE_CUDA_NUM_THREADS,
0,
stream>>>(x_data, lod_data, y_data, col_size, y_numel);
}
};
template <typename T>
class unzipGradCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
PADDLE_THROW(phi::errors::Unimplemented("unzip_grad is unimplemented"));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
unzip,
ops::unzipCUDAKernel<float, int>,
ops::unzipCUDAKernel<double, int>,
ops::unzipCUDAKernel<paddle::platform::float16, int>,
ops::unzipCUDAKernel<int, int>,
ops::unzipCUDAKernel<bool, int>,
ops::unzipCUDAKernel<int64_t, int>,
ops::unzipCUDAKernel<float, int64_t>,
ops::unzipCUDAKernel<double, int64_t>,
ops::unzipCUDAKernel<paddle::platform::float16, int64_t>,
ops::unzipCUDAKernel<int, int64_t>,
ops::unzipCUDAKernel<bool, int64_t>,
ops::unzipCUDAKernel<int64_t, int64_t>);
REGISTER_OP_CUDA_KERNEL(unzip_grad,
ops::unzipGradCUDAKernel<float>,
ops::unzipGradCUDAKernel<double>,
ops::unzipGradCUDAKernel<paddle::platform::float16>,
ops::unzipGradCUDAKernel<int>,
ops::unzipGradCUDAKernel<bool>,
ops::unzipGradCUDAKernel<int64_t>);
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
template <typename T>
class unzipOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
PADDLE_THROW(phi::errors::Unimplemented("unzip is unimplemented"));
}
};
template <typename T>
class unzipGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
PADDLE_THROW(phi::errors::Unimplemented("unzip_grad is unimplemented"));
}
};
} // namespace operators
} // namespace paddle
...@@ -290,6 +290,9 @@ void BindDataset(py::module *m) { ...@@ -290,6 +290,9 @@ void BindDataset(py::module *m) {
.def("get_memory_data_size", .def("get_memory_data_size",
&framework::Dataset::GetMemoryDataSize, &framework::Dataset::GetMemoryDataSize,
py::call_guard<py::gil_scoped_release>()) py::call_guard<py::gil_scoped_release>())
.def("get_epoch_finish",
&framework::Dataset::GetEpochFinish,
py::call_guard<py::gil_scoped_release>())
.def("get_pv_data_size", .def("get_pv_data_size",
&framework::Dataset::GetPvDataSize, &framework::Dataset::GetPvDataSize,
py::call_guard<py::gil_scoped_release>()) py::call_guard<py::gil_scoped_release>())
......
...@@ -41,6 +41,9 @@ void BindPSGPUWrapper(py::module* m) { ...@@ -41,6 +41,9 @@ void BindPSGPUWrapper(py::module* m) {
.def("set_slot_vector", .def("set_slot_vector",
&framework::PSGPUWrapper::SetSlotVector, &framework::PSGPUWrapper::SetSlotVector,
py::call_guard<py::gil_scoped_release>()) py::call_guard<py::gil_scoped_release>())
.def("set_slot_num_for_pull_feature",
&framework::PSGPUWrapper::SetPullFeatureSlotNum,
py::call_guard<py::gil_scoped_release>())
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
.def("set_slot_dim_vector", .def("set_slot_dim_vector",
&framework::PSGPUWrapper::SetSlotDimVector, &framework::PSGPUWrapper::SetSlotDimVector,
......
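The new set_slot_num_for_pull_feature binding lets Python pass the node-feature slot count explicitly instead of the C++ side deriving it as slot_vector_.size() - 1. A hedged usage sketch, assuming the wrapper is exposed to Python as core.PSGPU as elsewhere in Paddle; the slot ids are invented:

# Hedged sketch; slot ids are illustrative only.
import paddle.fluid.core as core

slots = [2, 3, 4, 9008]
psgpu = core.PSGPU()
psgpu.set_slot_vector(slots)
psgpu.set_slot_num_for_pull_feature(len(slots) - 1)  # replaces the old size-1 derivation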
...@@ -19,10 +19,10 @@ from _collections import defaultdict ...@@ -19,10 +19,10 @@ from _collections import defaultdict
import paddle import paddle
import paddle.fluid.framework as framework import paddle.fluid.framework as framework
from paddle.distributed.passes.pass_base import PassBase, register_pass from paddle.distributed.passes.pass_base import PassBase, register_pass
from paddle.fluid.transpiler.collective import SingleProcessMultiThread
from paddle.framework import core from paddle.framework import core
from paddle.static import Parameter, Program from paddle.static import Parameter, Program
from ..ps.utils.collective_transpiler import SingleProcessMultiThread
from ..ps.utils.public import * # noqa: F403 from ..ps.utils.public import * # noqa: F403
......
...@@ -967,6 +967,9 @@ class InMemoryDataset(DatasetBase): ...@@ -967,6 +967,9 @@ class InMemoryDataset(DatasetBase):
""" """
return self.dataset.get_pv_data_size() return self.dataset.get_pv_data_size()
def get_epoch_finish(self):
return self.dataset.get_epoch_finish()
@deprecated( @deprecated(
since="2.0.0", since="2.0.0",
update_to="paddle.distributed.InMemoryDataset.get_memory_data_size", update_to="paddle.distributed.InMemoryDataset.get_memory_data_size",
...@@ -1121,6 +1124,15 @@ class InMemoryDataset(DatasetBase): ...@@ -1121,6 +1124,15 @@ class InMemoryDataset(DatasetBase):
self.proto_desc.graph_config.infer_table_cap = config.get( self.proto_desc.graph_config.infer_table_cap = config.get(
"infer_table_cap", 800000 "infer_table_cap", 800000
) )
self.proto_desc.graph_config.excluded_train_pair = config.get(
"excluded_train_pair", ""
)
self.proto_desc.graph_config.infer_node_type = config.get(
"infer_node_type", ""
)
self.proto_desc.graph_config.get_degree = config.get(
"get_degree", False
)
self.dataset.set_gpu_graph_mode(True) self.dataset.set_gpu_graph_mode(True)
def set_pass_id(self, pass_id): def set_pass_id(self, pass_id):
......
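The three new graph_config keys and the get_epoch_finish binding combine into a pass loop along the lines of the following hedged sketch; the edge/node type names are invented and the actual training call is elided:

# Hedged sketch; config values are placeholders.
import paddle.fluid as fluid

dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
dataset.set_graph_config({
    "excluded_train_pair": "a2b;b2a",  # edge-type pairs to skip during training walks
    "infer_node_type": "a",            # node type to start inference from
    "get_degree": True,                # also return node degrees when sampling
})
dataset.set_pass_id(0)
dataset.load_into_memory()
while not dataset.get_epoch_finish():  # new interface bound above
    pass  # exe.train_from_dataset(...) would run one pass here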
...@@ -313,6 +313,7 @@ class TestDataset(unittest.TestCase): ...@@ -313,6 +313,7 @@ class TestDataset(unittest.TestCase):
dataset.set_graph_config(graph_config) dataset.set_graph_config(graph_config)
dataset.set_pass_id(0) dataset.set_pass_id(0)
dataset.get_pass_id() dataset.get_pass_id()
dataset.get_epoch_finish()
def test_in_memory_dataset_masterpatch(self): def test_in_memory_dataset_masterpatch(self):
""" """
......
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
class TestUnzipOp(unittest.TestCase):
def test_result(self):
"""
For unzip op
"""
paddle.enable_static()
if core.is_compiled_with_cuda():
place = fluid.CUDAPlace(0)
x = fluid.data(name='X', shape=[3, 4], dtype='float64')
lod = fluid.data(name='lod', shape=[11], dtype='int64')
output = paddle.incubate.unzip(x, lod)
input = [
[1.0, 2.0, 3.0, 4.0],
[10.0, 20.0, 30.0, 40.0],
[100.0, 200.0, 300.0, 400.0],
]
lod = [0, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12]
feed = {
'X': np.array(input).astype("float64"),
'lod': np.array(lod).astype("int64"),
}
exe = fluid.Executor(place=place)
exe.run(fluid.default_startup_program())
res = exe.run(feed=feed, fetch_list=[output])
out = [
[1.0, 2.0, 3.0, 4.0],
[0.0, 0.0, 0.0, 0.0],
[10.0, 20.0, 30.0, 40.0],
[0.0, 0.0, 0.0, 0.0],
[0.0, 0.0, 0.0, 0.0],
[0.0, 0.0, 0.0, 0.0],
[100.0, 200.0, 300.0, 400.0],
[0.0, 0.0, 0.0, 0.0],
[0.0, 0.0, 0.0, 0.0],
[0.0, 0.0, 0.0, 0.0],
]
out_np = np.array(out, dtype="float64")
assert (res == out_np).all(), "output is not right"
if __name__ == '__main__':
unittest.main()
...@@ -23,6 +23,7 @@ from .operators import graph_send_recv ...@@ -23,6 +23,7 @@ from .operators import graph_send_recv
from .operators import graph_khop_sampler from .operators import graph_khop_sampler
from .operators import graph_sample_neighbors from .operators import graph_sample_neighbors
from .operators import graph_reindex from .operators import graph_reindex
from .operators import unzip
from .tensor import segment_sum from .tensor import segment_sum
from .tensor import segment_mean from .tensor import segment_mean
from .tensor import segment_max from .tensor import segment_max
...@@ -55,4 +56,5 @@ __all__ = [ ...@@ -55,4 +56,5 @@ __all__ = [
'segment_max', 'segment_max',
'segment_min', 'segment_min',
'identity_loss', 'identity_loss',
'unzip',
] ]
...@@ -21,3 +21,4 @@ from .graph_send_recv import graph_send_recv # noqa: F401 ...@@ -21,3 +21,4 @@ from .graph_send_recv import graph_send_recv # noqa: F401
from .graph_khop_sampler import graph_khop_sampler # noqa: F401 from .graph_khop_sampler import graph_khop_sampler # noqa: F401
from .graph_sample_neighbors import graph_sample_neighbors # noqa: F401 from .graph_sample_neighbors import graph_sample_neighbors # noqa: F401
from .graph_reindex import graph_reindex # noqa: F401 from .graph_reindex import graph_reindex # noqa: F401
from .unzip import unzip # noqa: F401
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.fluid.data_feeder import check_variable_and_dtype
from paddle.fluid.layer_helper import LayerHelper
def unzip(input, lod):
r"""
**unzip operator**
Unzip 'input' according to 'lod'.
Args:
input (Variable): The zipped input, 2-D LodTensor with shape [N, M].
lod (Variable): The original lod of unzipped input, 1-D LodTensor with shape [K].
Returns:
Variable: The original unzipped tensor, 2-D LodTensor with shape [K-1, M].
Examples:
.. code-block:: python
import numpy as np
import paddle
import paddle.fluid as fluid
paddle.enable_static()
input_np = np.array([
[1.0, 2.0, 3.0, 4.0],
[10.0, 20.0, 30.0, 40.0],
[100.0, 200.0, 300.0, 400.0]
])
lod_np = np.array([0, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12])
input = paddle.to_tensor(input_np, "int64")
lod = paddle.to_tensor(lod_np, "int64")
unzipped_input = paddle.incubate.unzip(input, lod)
'''
unzipped_input is [
[1.0, 2.0, 3.0, 4.0],
[0.0, 0.0, 0.0, 0.0],
[10.0, 20.0, 30.0, 40.0],
[0.0, 0.0, 0.0, 0.0],
[0.0, 0.0, 0.0, 0.0],
[0.0, 0.0, 0.0, 0.0],
[100.0, 200.0, 300.0, 400.0],
[0.0, 0.0, 0.0, 0.0],
[0.0, 0.0, 0.0, 0.0],
[0.0, 0.0, 0.0, 0.0]
]
'''
"""
helper = LayerHelper('unzip', **locals())
out = helper.create_variable(dtype=input.dtype)
check_variable_and_dtype(
input,
'input',
['float16', 'float32', 'float64', 'int', 'bool', 'int64'],
'unzip',
)
check_variable_and_dtype(lod, 'lod', ['int', 'int64'], 'unzip')
helper.append_op(
type='unzip', inputs={'X': [input], 'lod': [lod]}, outputs={'Y': [out]}
)
return out