box_wrapper.cc 13.2 KB
Newer Older
H
hutuxian 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

H
hutuxian 已提交
15
#ifdef PADDLE_WITH_BOX_PS
H
hutuxian 已提交
16
#include "paddle/fluid/framework/fleet/box_wrapper.h"
H
hutuxian 已提交
17
#include <algorithm>
H
hutuxian 已提交
18 19 20 21 22 23 24 25 26 27
#include <ctime>
#include <memory>
#include <numeric>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/platform/gpu_info.h"

namespace paddle {
namespace framework {

std::shared_ptr<BoxWrapper> BoxWrapper::s_instance_ = nullptr;
H
hutuxian 已提交
28 29
cudaStream_t BoxWrapper::stream_list_[8];
std::shared_ptr<boxps::BoxPSBase> BoxWrapper::boxps_ptr_ = nullptr;
H
hutuxian 已提交
30

H
hutuxian 已提交
31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56
// Derives the aggregate metrics (_auc, _mae, _rmse, _actual_ctr,
// _predicted_ctr, _size) from the two bucket tables and the running error
// sums.
//
// _table[0][i] is accumulated into `fp` and _table[1][i] into `tp`; buckets
// are walked from the highest index down to 0 and the area under the curve is
// integrated with the trapezoid rule.
//
// NOTE(review): the divisions by (fp + tp) below are not guarded, so when both
// tables sum to zero the resulting _mae/_rmse/_actual_ctr/_predicted_ctr are
// not finite. Only _auc has a dedicated fallback value.
void BasicAucCalculator::compute() {
  double* table[2] = {&_table[0][0], &_table[1][0]};

  double area = 0;
  double fp = 0;
  double tp = 0;

  // Walk the buckets from the last one to the first, adding one trapezoid per
  // bucket: width = increase in fp, height = mean of old and new tp.
  for (int i = _table_size - 1; i >= 0; i--) {
    double newfp = fp + table[0][i];
    double newtp = tp + table[1][i];
    area += (newfp - fp) * (tp + newtp) / 2;
    fp = newfp;
    tp = newtp;
  }

  if (fp < 1e-3 || tp < 1e-3) {
    _auc = -0.5;  // which means all nonclick or click
  } else {
    // Normalise the accumulated area by the full fp x tp rectangle.
    _auc = area / (fp * tp);
  }

  // fp + tp is the total number of accumulated samples.
  _mae = _local_abserr / (fp + tp);
  _rmse = sqrt(_local_sqrerr / (fp + tp));
  _actual_ctr = tp / (fp + tp);
  _predicted_ctr = _local_pred / (fp + tp);
  _size = fp + tp;
}

H
hutuxian 已提交
59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110
void BasicAucCalculator::calculate_bucket_error() {
  double last_ctr = -1;
  double impression_sum = 0;
  double ctr_sum = 0.0;
  double click_sum = 0.0;
  double error_sum = 0.0;
  double error_count = 0;
  double* table[2] = {&_table[0][0], &_table[1][0]};
  for (int i = 0; i < _table_size; i++) {
    double click = table[1][i];
    double show = table[0][i] + table[1][i];
    double ctr = static_cast<double>(i) / _table_size;
    if (fabs(ctr - last_ctr) > kMaxSpan) {
      last_ctr = ctr;
      impression_sum = 0.0;
      ctr_sum = 0.0;
      click_sum = 0.0;
    }
    impression_sum += show;
    ctr_sum += ctr * show;
    click_sum += click;
    double adjust_ctr = ctr_sum / impression_sum;
    double relative_error =
        sqrt((1 - adjust_ctr) / (adjust_ctr * impression_sum));
    if (relative_error < kRelativeErrorBound) {
      double actual_ctr = click_sum / impression_sum;
      double relative_ctr_error = fabs(actual_ctr / adjust_ctr - 1);
      error_sum += relative_ctr_error * impression_sum;
      error_count += impression_sum;
      last_ctr = -1;
    }
  }
  _bucket_error = error_count > 0 ? error_sum / error_count : 0.0;
}

// Forwards `date` and the feature-sign list to the underlying BoxPS instance
// and raises PreconditionNotMet when BoxPS reports a non-zero status.
void BoxWrapper::FeedPass(int date,
                          const std::vector<uint64_t>& feasgin_to_box) const {
  int status = boxps_ptr_->FeedPass(date, feasgin_to_box);
  PADDLE_ENFORCE_EQ(
      status, 0,
      platform::errors::PreconditionNotMet("FeedPass failed in BoxPS."));
}

// Starts a feed pass in BoxPS for `date`. The agent pointer that `agent`
// refers to is handed to BoxPS (dereferenced here, so `agent` must not be
// null). Raises PreconditionNotMet when BoxPS reports a non-zero status.
void BoxWrapper::BeginFeedPass(int date, boxps::PSAgentBase** agent) const {
  int status = boxps_ptr_->BeginFeedPass(date, *agent);
  PADDLE_ENFORCE_EQ(
      status, 0,
      platform::errors::PreconditionNotMet("BeginFeedPass failed in BoxPS."));
}

// Ends the feed pass associated with `agent` in BoxPS and raises
// PreconditionNotMet when BoxPS reports a non-zero status.
void BoxWrapper::EndFeedPass(boxps::PSAgentBase* agent) const {
  int ret = boxps_ptr_->EndFeedPass(agent);
  PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet(
                                "EndFeedPass failed in BoxPS."));
}

void BoxWrapper::BeginPass() const {
  int ret = boxps_ptr_->BeginPass();
H
hutuxian 已提交
115 116
  PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet(
                                "BeginPass failed in BoxPS."));
H
hutuxian 已提交
117 118 119 120
}

void BoxWrapper::EndPass() const {
  int ret = boxps_ptr_->EndPass();
H
hutuxian 已提交
121 122
  PADDLE_ENFORCE_EQ(
      ret, 0, platform::errors::PreconditionNotMet("EndPass failed in BoxPS."));
H
hutuxian 已提交
123 124 125 126 127 128 129
}

// Pulls the sparse feature values for every key of every slot from BoxPS and
// scatters them into the per-slot output buffers.
//
// place        - where the buffers live; CPUPlace and CUDAPlace are handled,
//                anything else throws PreconditionNotMet.
// keys         - one pointer per slot; keys[i] is read for slot_lengths[i]
//                uint64 keys.
// values       - one output pointer per slot; hidden_size floats are written
//                per key (values[i] + j * hidden_size for the j-th key).
// slot_lengths - number of keys in each slot; their sum is the total number
//                of keys pulled.
// hidden_size  - number of floats copied per key.
//
// NOTE(review): keys, values and slot_lengths are indexed in lock-step without
// a size check, so they are assumed to have the same length -- confirm at the
// call sites.
//
// Elapsed times (total and the BoxPS call alone) are logged at VLOG(1).
void BoxWrapper::PullSparse(const paddle::platform::Place& place,
                            const std::vector<const uint64_t*>& keys,
                            const std::vector<float*>& values,
                            const std::vector<int64_t>& slot_lengths,
                            const int hidden_size) {
  VLOG(3) << "Begin PullSparse";
  platform::Timer all_timer;
  platform::Timer pull_boxps_timer;
  all_timer.Start();

  // Total number of keys across all slots.
  int64_t total_length =
      std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL);
  // Staging buffer that BoxPS fills with one FeatureValueGpu per key.
  auto buf =
      memory::AllocShared(place, total_length * sizeof(boxps::FeatureValueGpu));
  boxps::FeatureValueGpu* total_values_gpu =
      reinterpret_cast<boxps::FeatureValueGpu*>(buf->ptr());

  if (platform::is_cpu_place(place)) {
    // Note: Only GPU is supported in paddlebox now, and the following code has
    // not been fully tested yet.
    LoDTensor total_keys_tensor;
    uint64_t* total_keys = reinterpret_cast<uint64_t*>(
        total_keys_tensor.mutable_data<int64_t>({total_length, 1}, place));
    int64_t offset = 0;
    VLOG(3) << "Begin copy keys, key_num[" << total_length << "]";
    // Concatenate the per-slot key arrays into one contiguous key buffer.
    for (size_t i = 0; i < keys.size(); ++i) {
      memory::Copy(boost::get<platform::CPUPlace>(place), total_keys + offset,
                   boost::get<platform::CPUPlace>(place), keys[i],
                   slot_lengths[i] * sizeof(uint64_t));
      offset += slot_lengths[i];
    }

    VLOG(3) << "Begin call PullSparseCPU in BoxPS";
    pull_boxps_timer.Start();
    // TODO(hutuxian): should use boxps::FeatureValue in the future
    int ret = boxps_ptr_->PullSparseCPU(total_keys, total_values_gpu,
                                        static_cast<int>(total_length));
    PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet(
                                  "PullSparseCPU failed in BoxPS."));
    pull_boxps_timer.Pause();

    VLOG(3) << "Begin Copy result to tensor, total_length[" << total_length
            << "]";
    offset = 0;
    for (size_t i = 0; i < values.size(); ++i) {
      int64_t fea_num = slot_lengths[i];
      VLOG(3) << "Begin Copy slot[" << i << "] fea_num[" << fea_num << "]";
      // 64-bit counter: keeps the comparison with fea_num and the
      // j * hidden_size offset out of 32-bit int arithmetic.
      for (int64_t j = 0; j < fea_num; ++j) {
        // Copy the emb from BoxPS to the paddle tensor. Since 'show', 'click'
        // and 'emb' are contiguous in memory, we copy starting from the
        // address of 'show'.
        memory::Copy(
            boost::get<platform::CPUPlace>(place), values[i] + j * hidden_size,
            boost::get<platform::CPUPlace>(place),
            reinterpret_cast<float*>(&((total_values_gpu + offset)->show)),
            sizeof(float) * hidden_size);
        ++offset;
      }
    }
  } else if (platform::is_gpu_place(place)) {
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
    VLOG(3) << "Begin copy keys, key_num[" << total_length << "]";
    int device_id = boost::get<platform::CUDAPlace>(place).GetDeviceId();
    // The concatenated keys are kept in the per-device member tensor; the GPU
    // branch of PushSparseGrad reads the same tensor back.
    LoDTensor& total_keys_tensor = keys_tensor[device_id];
    uint64_t* total_keys = reinterpret_cast<uint64_t*>(
        total_keys_tensor.mutable_data<int64_t>({total_length, 1}, place));

    // construct slot_level lod info: inclusive prefix sum of the slot lengths
    auto slot_lengths_lod = slot_lengths;
    for (size_t i = 1; i < slot_lengths_lod.size(); i++) {
      slot_lengths_lod[i] += slot_lengths_lod[i - 1];
    }
    auto buf_key = memory::AllocShared(place, keys.size() * sizeof(uint64_t*));
    auto buf_length =
        memory::AllocShared(place, slot_lengths.size() * sizeof(int64_t));
    uint64_t** gpu_keys = reinterpret_cast<uint64_t**>(buf_key->ptr());
    int64_t* gpu_len = reinterpret_cast<int64_t*>(buf_length->ptr());
    // Upload the table of per-slot key pointers and the prefix sums.
    // NOTE(review): the cudaMemcpy return codes are not checked here.
    cudaMemcpy(gpu_keys, keys.data(), keys.size() * sizeof(uint64_t*),
               cudaMemcpyHostToDevice);
    cudaMemcpy(gpu_len, slot_lengths_lod.data(),
               slot_lengths.size() * sizeof(int64_t), cudaMemcpyHostToDevice);

    this->CopyKeys(place, gpu_keys, total_keys, gpu_len,
                   static_cast<int>(slot_lengths.size()),
                   static_cast<int>(total_length));
    VLOG(3) << "Begin call PullSparseGPU in BoxPS";
    pull_boxps_timer.Start();
    int ret =
        boxps_ptr_->PullSparseGPU(total_keys, total_values_gpu,
                                  static_cast<int>(total_length), device_id);
    PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet(
                                  "PullSparseGPU failed in BoxPS."));
    pull_boxps_timer.Pause();

    VLOG(3) << "Begin Copy result to tensor, total_length[" << total_length
            << "]";
    this->CopyForPull(place, gpu_keys, values, total_values_gpu, gpu_len,
                      static_cast<int>(slot_lengths.size()), hidden_size,
                      total_length);
#else
    PADDLE_THROW(platform::errors::PreconditionNotMet(
        "Please compile WITH_GPU option, because NCCL doesn't support "
        "windows."));
#endif
  } else {
    PADDLE_THROW(platform::errors::PreconditionNotMet(
        "PaddleBox: PullSparse Only Support CPUPlace or CUDAPlace Now."));
  }
  all_timer.Pause();
  VLOG(1) << "PullSparse total costs: " << all_timer.ElapsedSec()
          << " s, of which BoxPS costs: " << pull_boxps_timer.ElapsedSec()
          << " s";
  VLOG(3) << "End PullSparse";
}

// Packs the per-slot gradient buffers into BoxPS push structs and pushes them
// to BoxPS together with the matching keys.
//
// place        - where the buffers live; CPUPlace and CUDAPlace are handled,
//                anything else throws PreconditionNotMet.
// keys         - one pointer per slot; only read in the CPU branch, where
//                keys[i] supplies slot_lengths[i] uint64 keys.
// grad_values  - one gradient pointer per slot; hidden_size floats are read
//                per key (grad_values[i] + j * hidden_size for the j-th key).
// slot_lengths - number of keys in each slot.
// hidden_size  - number of floats copied per key.
// batch_size   - only forwarded to CopyForPush in the GPU branch.
//
// The GPU branch does not re-copy `keys`: it reads the key buffer held in
// keys_tensor[device_id], which the GPU branch of PullSparse fills. It
// therefore presumably relies on PullSparse having run on the same device with
// the same keys beforehand -- confirm against the callers.
//
// Elapsed times (total and the BoxPS call alone) are logged at VLOG(1).
void BoxWrapper::PushSparseGrad(const paddle::platform::Place& place,
                                const std::vector<const uint64_t*>& keys,
                                const std::vector<const float*>& grad_values,
                                const std::vector<int64_t>& slot_lengths,
                                const int hidden_size, const int batch_size) {
  VLOG(3) << "Begin PushSparseGrad";
  platform::Timer all_timer;
  platform::Timer push_boxps_timer;
  all_timer.Start();
  // Total number of keys across all slots.
  int64_t total_length =
      std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL);
  // Staging buffer holding one FeaturePushValueGpu per key.
  auto buf = memory::AllocShared(
      place, total_length * sizeof(boxps::FeaturePushValueGpu));
  boxps::FeaturePushValueGpu* total_grad_values_gpu =
      reinterpret_cast<boxps::FeaturePushValueGpu*>(buf->ptr());
  if (platform::is_cpu_place(place)) {
    // Note: only GPU is supported in paddlebox now, and the following code has
    // not been fully tested yet.
    LoDTensor total_keys_tensor;
    uint64_t* total_keys = reinterpret_cast<uint64_t*>(
        total_keys_tensor.mutable_data<int64_t>({total_length, 1}, place));
    int64_t offset = 0;
    VLOG(3) << "Begin copy keys, key_num[" << total_length << "]";
    // Concatenate the per-slot key arrays into one contiguous key buffer.
    for (size_t i = 0; i < keys.size(); ++i) {
      memory::Copy(boost::get<platform::CPUPlace>(place), total_keys + offset,
                   boost::get<platform::CPUPlace>(place), keys[i],
                   slot_lengths[i] * sizeof(uint64_t));
      offset += slot_lengths[i];
    }
    offset = 0;
    VLOG(3) << "Begin copy grad tensor to BoxPS struct";
    for (size_t i = 0; i < grad_values.size(); ++i) {
      int64_t fea_num = slot_lengths[i];
      // 64-bit counter: keeps the comparison with fea_num and the
      // j * hidden_size offset out of 32-bit int arithmetic.
      for (int64_t j = 0; j < fea_num; ++j) {
        // Copy the emb grad from the paddle tensor to BoxPS. Since 'show',
        // 'click' and 'emb' are contiguous in memory, we copy starting from
        // the address of 'show'.
        memory::Copy(
            boost::get<platform::CPUPlace>(place),
            reinterpret_cast<float*>(&((total_grad_values_gpu + offset)->show)),
            boost::get<platform::CPUPlace>(place),
            grad_values[i] + j * hidden_size, sizeof(float) * hidden_size);
        ++offset;
      }
    }

    VLOG(3) << "Begin call PushSparseCPU in BoxPS";
    push_boxps_timer.Start();
    int ret = boxps_ptr_->PushSparseCPU(total_keys, total_grad_values_gpu,
                                        static_cast<int>(total_length));
    PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet(
                                  "PushSparseCPU failed in BoxPS."));
    push_boxps_timer.Pause();
  } else if (platform::is_gpu_place(place)) {
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
    int device_id = boost::get<platform::CUDAPlace>(place).GetDeviceId();
    // Read back the per-device key buffer instead of copying `keys` again.
    LoDTensor& cached_total_keys_tensor = keys_tensor[device_id];
    uint64_t* total_keys =
        reinterpret_cast<uint64_t*>(cached_total_keys_tensor.data<int64_t>());
    VLOG(3) << "Begin copy grad tensor to boxps struct";
    this->CopyForPush(place, grad_values, total_grad_values_gpu, slot_lengths,
                      hidden_size, total_length, batch_size);

    VLOG(3) << "Begin call PushSparseGPU in BoxPS";
    push_boxps_timer.Start();
    int ret = boxps_ptr_->PushSparseGPU(total_keys, total_grad_values_gpu,
                                        static_cast<int>(total_length),
                                        device_id);
    PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet(
                                  "PushSparseGPU failed in BoxPS."));
    push_boxps_timer.Pause();
#else
    PADDLE_THROW(platform::errors::PreconditionNotMet(
        "Please compile WITH_GPU option, because NCCL doesn't support "
        "windows."));
#endif
  } else {
    PADDLE_THROW(platform::errors::PreconditionNotMet(
        "PaddleBox: PushSparseGrad Only Support CPUPlace or CUDAPlace Now."));
  }
  all_timer.Pause();
  VLOG(1) << "PushSparseGrad total cost: " << all_timer.ElapsedSec()
          << " s, of which BoxPS cost: " << push_boxps_timer.ElapsedSec()
          << " s";
  VLOG(3) << "End PushSparseGrad";
}
}  // end namespace framework
}  // end namespace paddle
H
hutuxian 已提交
328
#endif