memory sparse table (#36909)

703487c6 · zhaocaibei123 · GitHub · 41a09113 · 703487c6 · 703487c6
9 changed files
--- a/paddle/fluid/distributed/table/CMakeLists.txt
+++ b/paddle/fluid/distributed/table/CMakeLists.txt
@@ -37,7 +37,9 @@ set_source_files_properties(table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPI
 set_source_files_properties(sparse_sgd_rule.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 set_source_files_properties(ctr_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+set_source_files_properties(memory_sparse_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 cc_library(sparse_sgd_rule SRCS sparse_sgd_rule.cc DEPS ${TABLE_DEPS} ps_framework_proto)
 cc_library(ctr_accessor SRCS ctr_accessor.cc DEPS ${TABLE_DEPS} ps_framework_proto sparse_sgd_rule)
+cc_library(memory_sparse_table SRCS memory_sparse_table.cc DEPS ps_framework_proto ${TABLE_DEPS} fs afs_wrapper ctr_accessor common_table)
-cc_library(table SRCS table.cc DEPS common_table tensor_accessor tensor_table ps_framework_proto string_helper device_context gflags glog boost ctr_accessor)
+cc_library(table SRCS table.cc DEPS memory_sparse_table common_table tensor_accessor tensor_table ps_framework_proto string_helper device_context gflags glog boost)
--- a/paddle/fluid/distributed/table/depends/dense.h
+++ b/paddle/fluid/distributed/table/depends/dense.h
@@ -221,15 +221,6 @@ class DAdamD2Sum : public DenseOptimizer {
  void update(const float* update_values, size_t num, int begin,
              int end) override {
    auto update_numel = end - begin;
-    /*
-    // for debug
-    std::cout << "before update:\n";
-    for (int i = 0; i < 3; ++ i) {
-      std::cout << "param: " << i << " " << *(param+begin+i) <<
-                   "grad: " << *(update_values+begin+i) << "\n";
-    }*/
    std::vector<float> grad, grad2, scale;
    grad.resize(update_numel);
    grad2.resize(update_numel);
@@ -240,57 +231,21 @@ class DAdamD2Sum : public DenseOptimizer {
    blas.VCOPY(update_numel, update_values + begin, grad.data());
    blas.VCOPY(update_numel, update_values + begin, grad2.data());
-    /*
-    for (int i = 0; i < end-begin; ++ i) {
-      std::cout << "copy grad: " << i << " " << *(grad.data()+begin+i) <<
-                   "copy grad2: " << *(grad2.data()+begin+i) << "\n";
-    }
-    for (int i = 0; i < 3; ++ i) {
-      std::cout << "d2sum before: " << i << " " << *(ada_d2sum+begin+i) << "\n";
-    }*/
    // d2sum
    blas.SCAL(update_numel, ada_decay_rate[0], ada_d2sum + begin);
    ADD<float>(update_numel, ada_d2sum + begin, 1, ada_d2sum + begin);
-    /*
-    for (int i = 0; i < end-begin; ++ i) {
-      std::cout << "d2sum update: " << i << " " << *(ada_d2sum+begin+i) << "\n";
-    }
-    for (int i = 0; i < 3; ++ i) {
-      std::cout << "g2sum before: " << i << " " << *(ada_g2sum+begin+i) << "\n";
-    }*/
    // g2sum
    blas.SCAL(update_numel, ada_decay_rate[0], ada_g2sum + begin);
    blas.VSQUARE(update_numel, grad2.data(), grad2.data());
    blas.VADD(update_numel, ada_g2sum + begin, grad2.data(), ada_g2sum + begin);
-    /*
-    for (int i = 0; i < end-begin; ++ i) {
-      std::cout << "g2sum update: " << i << " " << *(ada_g2sum+begin+i) << "\n";
-    }
-    for (int i = 0; i < 3; ++ i) {
-      std::cout << "mom before: " << i << " " << *(mom_velocity+begin+i) <<
-    "\n";
-    }*/
    // mom
    blas.SCAL(update_numel, mom_decay_rate[0], mom_velocity + begin);
    blas.SCAL(update_numel, 1 - mom_decay_rate[0], grad.data());
    blas.VADD(update_numel, mom_velocity + begin, grad.data(),
              mom_velocity + begin);
-    /*
-    for (int i = 0; i < end-begin; ++ i) {
-      std::cout << "mom update: " << i << " " << *(mom_velocity+begin+i) <<
-    "\n";
-    }
-    for (int i = 0; i < 3; ++ i) {
-      std::cout << "scale before: " << i << " " << *(scale.data()+begin+i) <<
-    "\n";
-    }*/
    // scale
    float* scale_ = scale.data();
    blas.VDIV(update_numel, ada_g2sum + begin, ada_d2sum + begin, scale_);
@@ -298,30 +253,13 @@ class DAdamD2Sum : public DenseOptimizer {
    DIV<float>(update_numel, 1 + ada_epsilon[0], scale_, scale_);
    SQRT<float>(update_numel, scale_, scale_);
-    /*
-    for (int i = 0; i < 3; ++ i) {
-      std::cout << "scale update: " << i << " " << *(scale.data()+begin+i) <<
-    "\n";
-    }*/
    blas.SCAL(update_numel, learning_rate[0], scale_);
    // TODO(zhaocaibei123): check if there exists elementwise_multiply in blas
    // TODO(zhaocaibei123): blas.VMUL
    ELE_MUL<float>(update_numel, scale_, mom_velocity + begin, scale_);
-    /*
-    for (int i = 0; i < 3; ++ i) {
-      std::cout << "scale update2: " << i << " " << *(scale.data()+begin+i) <<
-    "\n";
-    }*/
    blas.VSUB(update_numel, param + begin, scale_, param + begin);
-    /*
-    for (int i = 0; i < end-begin; ++ i) {
-      std::cout << "param update " << i << " " << *(param+begin+i) << "\n";
-    }*/
  }
  float* learning_rate;

--- a/paddle/fluid/distributed/table/memory_sparse_table.cc
+++ b/paddle/fluid/distributed/table/memory_sparse_table.cc
--- a/paddle/fluid/distributed/table/memory_sparse_table.h
+++ b/paddle/fluid/distributed/table/memory_sparse_table.h
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <ThreadPool.h>
+#include <assert.h>
+#include <pthread.h>
+#include <memory>
+#include <mutex>  // NOLINT
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+#include "Eigen/Dense"
+#include "paddle/fluid/distributed/table/accessor.h"
+#include "paddle/fluid/distributed/table/common_table.h"
+#include "paddle/fluid/distributed/table/depends/feature_value.h"
+#include "paddle/fluid/string/string_helper.h"
+#define PSERVER_SAVE_SUFFIX ".shard"
+namespace paddle {
+namespace distributed {
+class MemorySparseTable : public SparseTable {  // SparseTable subclass; methods declared without a body are presumably defined in memory_sparse_table.cc
+ public:
+  MemorySparseTable() {}  // empty body: only task_pool_size_ has an in-class initializer
+  virtual ~MemorySparseTable() {}
+  // Unused methods begin (the dense pull/push entry points): each is stubbed to return 0 without touching its arguments.
+  virtual int32_t pull_dense(float* pull_values, size_t num) { return 0; }
+  virtual int32_t push_dense_param(const float* values, size_t num) {
+    return 0;
+  }
+  virtual int32_t push_dense(const float* values, size_t num) { return 0; }
+  // Unused methods end.
+  virtual int32_t initialize();
+  virtual int32_t initialize_shard() { return 0; }  // stub: always returns 0
+  virtual int32_t initialize_value();
+  virtual int32_t load(const std::string& path, const std::string& param);
+  virtual int32_t save(const std::string& path, const std::string& param);
+  int32_t load_local_fs(const std::string& path, const std::string& param);  // non-virtual
+  int32_t save_local_fs(const std::string& path, const std::string& param,
+                        const std::string& prefix);  // non-virtual; takes an extra prefix argument compared with save()
+  virtual std::pair<int64_t, int64_t> print_table_stat();
+  virtual int32_t pull_sparse(float* values, const PullSparseValue& pull_value);
+  virtual int32_t pull_sparse_ptr(char** pull_values, const uint64_t* keys,
+                                  size_t num);
+  virtual int32_t push_sparse(const uint64_t* keys, const float* values,
+                              size_t num);  // overload taking a single float buffer
+  virtual int32_t push_sparse(const uint64_t* keys, const float** values,
+                              size_t num);  // overload taking an array of float pointers
+  virtual int32_t flush();
+  virtual int32_t shrink(const std::string& param);
+  virtual void clear();
+ protected:
+  virtual int32_t _push_sparse(const uint64_t* keys, const float** values,
+                               size_t num);  // NOTE(review): presumably the shared implementation behind the pointer-array push_sparse -- confirm in the .cc
+ protected:
+  const int task_pool_size_ = 24;  // NOTE(review): presumably the number of pools kept in shards_task_pool_ -- confirm in the .cc
+  size_t avg_local_shard_num_;  // NOTE(review): no initializer here or in the constructor; presumably assigned in initialize() -- confirm
+  size_t real_local_shard_num_;  // NOTE(review): same as above, not initialized at construction
+  size_t sparse_table_shard_num_;  // NOTE(review): same as above, not initialized at construction
+  std::vector<std::shared_ptr<::ThreadPool>> shards_task_pool_;  // shared-owned ThreadPool instances
+  std::vector<std::shared_ptr<SparseTableShard>> shard_values_;  // shared-owned SparseTableShard instances
+};
+}  // namespace distributed
+}  // namespace paddle
--- a/paddle/fluid/distributed/table/table.cc
+++ b/paddle/fluid/distributed/table/table.cc
@@ -24,6 +24,8 @@
 #ifdef PADDLE_WITH_HETERPS
 #include "paddle/fluid/distributed/table/ssd_sparse_table.h"
 #endif
+#include "paddle/fluid/distributed/table/ctr_accessor.h"
+#include "paddle/fluid/distributed/table/memory_sparse_table.h"
 #include "paddle/fluid/distributed/table/tensor_accessor.h"
 #include "paddle/fluid/distributed/table/tensor_table.h"
@@ -40,7 +42,13 @@ REGISTER_PSCORE_CLASS(Table, BarrierTable);
 REGISTER_PSCORE_CLASS(Table, TensorTable);
 REGISTER_PSCORE_CLASS(Table, DenseTensorTable);
 REGISTER_PSCORE_CLASS(Table, GlobalStepTable);
+REGISTER_PSCORE_CLASS(Table, MemorySparseTable);
 REGISTER_PSCORE_CLASS(ValueAccessor, CommMergeAccessor);
+REGISTER_PSCORE_CLASS(ValueAccessor, CtrCommonAccessor);
+REGISTER_PSCORE_CLASS(SparseValueSGDRule, StdAdaGradSGDRule);
+REGISTER_PSCORE_CLASS(SparseValueSGDRule, SparseAdamSGDRule);
+REGISTER_PSCORE_CLASS(SparseValueSGDRule, SparseNaiveSGDRule);
+REGISTER_PSCORE_CLASS(SparseValueSGDRule, SparseAdaGradSGDRule);
 int32_t TableManager::initialize() {
  static bool initialized = false;
@@ -58,6 +66,11 @@ int32_t Table::initialize(const TableParameter &config,
    LOG(WARNING) << "Table accessor initialize failed";
    return -1;
  }
+  if (_afs_client.initialize(fs_config) != 0) {
+    LOG(WARNING) << "Table fs_client initialize failed";
+    // return -1;
+  }
  return initialize();
 }
@@ -67,6 +80,9 @@ int32_t Table::initialize_accessor() {
               << _config.table_id();
    return -1;
  }
+  LOG(INFO) << "accessor initializing: table_id: " << _config.table_id()
+            << ", accessor_name: " << _config.accessor().accessor_class();
  auto *accessor = CREATE_PSCORE_CLASS(
      ValueAccessor,
      _config.accessor().accessor_class()) if (accessor == NULL) {

--- a/paddle/fluid/distributed/table/table.h
+++ b/paddle/fluid/distributed/table/table.h
@@ -20,6 +20,7 @@
 #include <memory>
 #include <string>
 #include <utility>
+#include "paddle/fluid/distributed/common/afs_warpper.h"
 #include "paddle/fluid/distributed/table/accessor.h"
 #include "paddle/fluid/distributed/table/depends/sparse_utils.h"
 #include "paddle/fluid/distributed/table/graph/graph_node.h"
@@ -103,10 +104,10 @@ class Table {
  virtual int32_t flush() = 0;
  virtual int32_t shrink(const std::string &param) = 0;
-  //指定加载路径
+  // 指定加载路径
  virtual int32_t load(const std::string &path,
                       const std::string &converter) = 0;
-  //指定保存路径
+  // 指定保存路径
  virtual int32_t save(const std::string &path,
                       const std::string &converter) = 0;
@@ -137,6 +138,7 @@ class Table {
  TableParameter _config;
  float *_global_lr = nullptr;
  std::shared_ptr<ValueAccessor> _value_accesor;
+  AfsClient _afs_client;
 };
 REGISTER_PSCORE_REGISTERER(Table);

--- a/paddle/fluid/distributed/test/CMakeLists.txt
+++ b/paddle/fluid/distributed/test/CMakeLists.txt
@@ -29,3 +29,6 @@ cc_test(sparse_sgd_rule_test SRCS sparse_sgd_rule_test.cc DEPS ${COMMON_DEPS} bo
 set_source_files_properties(ctr_accessor_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 cc_test(ctr_accessor_test SRCS ctr_accessor_test.cc DEPS ${COMMON_DEPS} boost table)
+set_source_files_properties(memory_sparse_table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+cc_test(memory_sparse_table_test SRCS memory_sparse_table_test.cc DEPS ${COMMON_DEPS} boost table)
--- a/paddle/fluid/distributed/test/memory_sparse_table_test.cc
+++ b/paddle/fluid/distributed/test/memory_sparse_table_test.cc
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <ThreadPool.h>
+#include <unistd.h>
+#include <string>
+#include <thread>  // NOLINT
+#include "google/protobuf/text_format.h"
+#include "gtest/gtest.h"
+#include "paddle/fluid/distributed/ps.pb.h"
+#include "paddle/fluid/distributed/table/memory_sparse_table.h"
+#include "paddle/fluid/distributed/table/table.h"
+namespace paddle {
+namespace distributed {
+TEST(MemorySparseTable, SGD) {  // Flow: configure table -> pull 5 keys -> push gradients from 2 pool threads -> pull again -> log -> save_local_fs
+  int emb_dim = 8;
+  int trainers = 2;
+  TableParameter table_config;
+  table_config.set_table_class("MemorySparseTable");
+  table_config.set_shard_num(10);
+  FsClientParameter fs_config;  // left default-constructed; passed to initialize() below
+  Table *table = new MemorySparseTable();  // NOTE(review): never deleted anywhere in this test
+  table->set_shard(0, 1);  // NOTE(review): presumably (shard index, shard count) -- confirm against Table::set_shard
+  TableAccessorParameter *accessor_config = table_config.mutable_accessor();
+  accessor_config->set_accessor_class("CtrCommonAccessor");
+  accessor_config->set_fea_dim(11);
+  accessor_config->set_embedx_dim(8);  // same value as emb_dim above
+  accessor_config->set_embedx_threshold(5);
+  accessor_config->mutable_ctr_accessor_param()->set_nonclk_coeff(0.2);
+  accessor_config->mutable_ctr_accessor_param()->set_click_coeff(1);
+  accessor_config->mutable_ctr_accessor_param()->set_base_threshold(0.5);
+  accessor_config->mutable_ctr_accessor_param()->set_delta_threshold(0.2);
+  accessor_config->mutable_ctr_accessor_param()->set_delta_keep_days(16);
+  accessor_config->mutable_ctr_accessor_param()->set_show_click_decay_rate(
+      0.99);
+  accessor_config->mutable_embed_sgd_param()->set_name("SparseNaiveSGDRule");
+  auto *naive_param =
+      accessor_config->mutable_embed_sgd_param()->mutable_naive();
+  naive_param->set_learning_rate(0.1);
+  naive_param->set_initial_range(0.3);
+  naive_param->add_weight_bounds(-10.0);
+  naive_param->add_weight_bounds(10.0);
+  accessor_config->mutable_embedx_sgd_param()->set_name("SparseNaiveSGDRule");
+  naive_param = accessor_config->mutable_embedx_sgd_param()->mutable_naive();  // embedx rule gets the same settings as the embed rule
+  naive_param->set_learning_rate(0.1);
+  naive_param->set_initial_range(0.3);
+  naive_param->add_weight_bounds(-10.0);
+  naive_param->add_weight_bounds(10.0);
+  auto ret = table->initialize(table_config, fs_config);
+  ASSERT_EQ(ret, 0);  // the only assertion in this test
+  // First pull of keys 0..4: per the original note this creates the entries; the pulled values are kept as the baseline.
+  std::vector<uint64_t> init_keys = {0, 1, 2, 3, 4};
+  std::vector<uint32_t> init_fres = {1, 1, 1, 1, 1};
+  std::vector<float> init_values;
+  init_values.resize(init_keys.size() * (emb_dim + 1));  // emb_dim + 1 floats reserved per key
+  auto value = PullSparseValue(init_keys, init_fres, emb_dim);
+  table->pull_sparse(init_values.data(), value);  // return value is not checked
+  // Accumulator for the sum of all pushed gradients, used by the logging loop at the end.
+  std::vector<float> total_gradients;
+  total_gradients.resize(init_keys.size() * (4 + emb_dim));  // emb_dim + 4 floats per key
+  memset(total_gradients.data(), 0, sizeof(float) * total_gradients.size());
+  // Build one key list and one flat gradient buffer per trainer, then push them concurrently.
+  std::vector<std::vector<uint64_t>> trainer_keys;
+  std::vector<std::vector<float>> trainer_gradient_values;
+  trainer_keys.resize(trainers);
+  trainer_gradient_values.resize(trainers);
+  float start = 0.0;
+  for (int i = 0; i < trainers; i++) {
+    start = 0.0;  // reset per trainer, so every trainer pushes the same ramp 0.0, 0.1, 0.2, ...
+    trainer_keys[i] = init_keys;
+    for (size_t j = 0; j < trainer_keys[i].size(); j++) {
+      auto id = trainer_keys[i][j];
+      for (int k = 0; k < emb_dim + 4; k++) {
+        trainer_gradient_values[i].push_back(start);
+        total_gradients[id * (emb_dim + 4) + k] += start;  // indexed by key value; in range because the keys are 0..4
+        start += 0.1;
+      }
+    }
+  }
+  std::shared_ptr<::ThreadPool> pool_ =
+      std::make_shared<::ThreadPool>(trainers);  // one worker per trainer
+  std::vector<std::future<void>> task_status;
+  for (int i = 0; i < trainers; i++) {
+    auto &push_keys = trainer_keys[i];
+    auto &push_values = trainer_gradient_values[i];
+    auto task = [table, &push_keys, &push_values] {  // by-reference captures refer to vectors that outlive the wait loop below
+      table->push_sparse(push_keys.data(), push_values.data(),
+                         push_keys.size());
+    };
+    task_status.push_back(pool_->enqueue(std::move(task)));
+  }
+  for (auto &status : task_status) {
+    status.wait();  // block until every push task has finished
+  }
+  std::vector<float> pull_values;
+  pull_values.resize(init_keys.size() * (emb_dim + 1));
+  table->pull_sparse(pull_values.data(), value);  // second pull, after both pushes completed
+  for (size_t i = 0; i < init_keys.size(); ++i) {  // NOTE(review): values are only logged at VLOG(3); nothing is asserted
+    for (size_t j = 0; j < emb_dim + 1; ++j) {  // NOTE(review): size_t compared against an int expression
+      auto update_val = init_values[i * (emb_dim + 1) + j] -
+                        0.1 * total_gradients[3 + i * (emb_dim + 4) + j];  // 0.1 equals the learning_rate set above; offset 3 skips the first three gradient slots of each key
+      VLOG(3) << total_gradients[i * (emb_dim + 4) + j + 3] << ":"
+              << init_values[i * (emb_dim + 1) + j];
+      VLOG(3) << update_val << ": " << pull_values[i * (emb_dim + 1) + j];
+    }
+  }
+  MemorySparseTable *ctr_table = dynamic_cast<MemorySparseTable *>(table);  // NOTE(review): result is not null-checked before use
+  ctr_table->save_local_fs("./work/table.save", "0", "test");  // return value is ignored
+}
+}  // namespace distributed
+}  // namespace paddle
--- a/paddle/fluid/framework/io/CMakeLists.txt
+++ b/paddle/fluid/framework/io/CMakeLists.txt
-cc_library(fs SRCS fs.cc DEPS string_helper glog boost enforce)
 cc_library(shell SRCS shell.cc DEPS string_helper glog timer enforce)
+cc_library(fs SRCS fs.cc DEPS string_helper glog boost enforce shell)
 cc_test(test_fs SRCS test_fs.cc DEPS fs shell)
 if (WITH_CRYPTO)