/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include <cmath>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <memory>
#include <ostream>
#include <utility>

#include <hl_gpu.h>

#include "BaseMatrix.h"
#include "MemoryHandle.h"
#include "paddle/utils/Common.h"
#include "paddle/utils/Thread.h"

namespace paddle {

// Forward declarations of the concrete vector types. VectorT names them in its
// friend declarations and copyTo() signatures before they are defined.
template <class T>
class GpuVectorT;
template <class T>
class CpuVectorT;

template <class T>
class BaseVector;

// Used by VectorT::createParallelVector() and VectorT::exec().
class SyncThreadPool;

// Used as the return type of VectorT::toOneHotSparseMatrix().
class Matrix;

41
template <class T>
42 43 44
class BaseVector : public BaseMatrixT<T> {
public:
  BaseVector(size_t size, T* data, bool useGpu)
45
      : BaseMatrixT<T>(1, size, data, false, useGpu), size_(this->width_) {}
46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113

  ~BaseVector() {}

protected:
  size_t& size_;
};

/**
 * Abstract vector of elements of type T. CpuVectorT and GpuVectorT derive
 * from it and pass useGpu = false / true respectively.
 *
 * Copy construction will share the data as opposed to making a copy of the
 * original data. To make a copy of the original data, use copyFrom() instead.
 */
template <class T>
class VectorT : public BaseVector<T> {
protected:
  /**
   * Build a vector of `size` elements that starts `offset` elements into the
   * buffer of `memoryHandle`. The handle is kept in memoryHandle_.
   */
  VectorT(size_t size, MemoryHandlePtr memoryHandle, size_t offset, bool useGpu)
      : BaseVector<T>(size,
                      reinterpret_cast<T*>(memoryHandle->getBuf()) + offset,
                      useGpu),
        memoryHandle_(memoryHandle) {}

  // data is still owned by the caller.
  // data should be valid during the life of this vector.
  // Caller is responsible for releasing the memory.
  VectorT(size_t size, T* data, bool useGpu)
      : BaseVector<T>(size, data, useGpu) {}

public:
  virtual ~VectorT() {}

  static std::shared_ptr<VectorT<T>> create(size_t size, bool useGpu);

  static std::shared_ptr<VectorT<T>> create(T* data, size_t size, bool useGpu);

  static std::shared_ptr<VectorT<T>> create(size_t size,
                                            MemoryHandlePtr memoryHandle,
                                            size_t offset = 0);

  // owner can set SyncThreadPool,
  // if not set, will use globalSyncThreadPool,
  // which can be used in main thread only.
  static std::shared_ptr<VectorT<T>> createParallelVector(
      size_t size, bool useGpu, SyncThreadPool* pool = nullptr);

  /// Number of elements.
  size_t getSize() const { return this->size_; }
  /// Raw element pointer (read-only / mutable).
  const T* getData() const { return this->data_; }
  T* getData() { return this->data_; }

  virtual void zeroMem() = 0;
  // set all elements to value
  virtual void reset(const T& value) = 0;
  // fill data by 0, 1, 2, ...
  virtual void fillSequence() = 0;

  MemoryHandlePtr getMemoryHandle() const { return memoryHandle_; }

  /**
   * Resizing to a bigger vector will not preserve old values: when there is
   * no memory handle, or the handle's allocation is too small, a fresh buffer
   * is obtained from newMemory() and data_ is pointed at its start.
   *
   * NOTE(review): the capacity test compares against the whole allocation and
   * does not account for data_ starting at a non-zero offset inside the
   * buffer -- confirm that vectors created with an offset are never grown
   * through this function.
   */
  void resize(size_t newSize) {
    if (!memoryHandle_ || newSize * sizeof(T) > memoryHandle_->getAllocSize()) {
      memoryHandle_ = newMemory(newSize * sizeof(T));
      this->data_ = reinterpret_cast<T*>(memoryHandle_->getBuf());
    }
    this->size_ = newSize;
  }

  /**
   * Resize `vec` when it is non-null, otherwise create a new vector of `size`
   * elements and store it in `vec`.
   */
  static void resizeOrCreate(std::shared_ptr<VectorT<T>>& vec,
                             size_t size,
                             bool useGpu) {
    if (vec) {
      vec->resize(size);
    } else {
      vec = create(size, useGpu);
    }
  }

  /// Allocate a memory handle of `size` bytes (see resize()).
  virtual MemoryHandlePtr newMemory(size_t size) = 0;

  /**
   * form sub vector from *src*, shallow copy
   *
   * Requires start < src.size and start + size <= src.size.
   */
  void subVecFrom(const VectorT<T>& src, size_t start, size_t size) {
    CHECK_EQ(BaseVector<T>::useGpu_, src.useGpu_);
    CHECK_LT(start, src.size_);
    // Compare against the remaining length so that start + size cannot wrap
    // around and slip past the bounds check.
    CHECK_LE(size, src.size_ - start);

    BaseVector<T>::size_ = size;
    BaseVector<T>::data_ = const_cast<T*>(src.data_) + start;
  }

  /**
   * Create a new vector over elements [start, start + size) of this vector.
   * The new vector is built from a raw pointer into this vector's data.
   */
  std::shared_ptr<VectorT<T>> subVec(size_t start, size_t size) {
    const size_t total = getSize();
    // Same condition as start + size <= total, written so it cannot overflow.
    CHECK_LE(start, total);
    CHECK_LE(size, total - start);
    return VectorT<T>::create(getData() + start, size, BaseVector<T>::useGpu_);
  }

  /**
   * form sub vector from *src*, shallow copy
   *
   * No bounds can be checked here because only a raw pointer is available.
   */
  void subVecFrom(const T* src, size_t start, size_t size) {
    BaseVector<T>::size_ = size;
    BaseVector<T>::data_ = const_cast<T*>(src) + start;
  }

  /**
   * form sub vector from *src*, shallow copy
   * in *interval* [interval.first, interval.second)
   */
  void subVecFrom(const VectorT<T>& src, std::pair<size_t, size_t> interval) {
    // An inverted interval would underflow the unsigned subtraction below.
    CHECK_LE(interval.first, interval.second);
    subVecFrom(src, interval.first, interval.second - interval.first);
  }

  /**
   * convert the vector to a sparse one_hot matrix of width idRange
   * only applies to IVector
   */
  std::shared_ptr<Matrix> toOneHotSparseMatrix(size_t idRange, bool useGpu);

  /**
   * This function will crash if the size of src and dest is different.
   */
  virtual void copyFrom(const VectorT<T>& src) = 0;

  /**
   * If use_gpu, this function will push the copy-task to the specified stream
   * and return immediately.
   *
   * If not use GPU, this function is same as
   * the copyFrom(const VectorT<T>& src), which use stream HPPL_STREAM_DEFAULT.
   */
  virtual void copyFrom(const VectorT<T>& src, hl_stream_t stream) = 0;

  /**
   * copy size elements from src
   *
   * If this is GpuVector, src can be cpu or gpu memory
   *
   * If this is CpuVector, src is assumed to be cpu memory
   */
  virtual void copyFrom(const T* src, size_t size) = 0;

  /**
   * copy size elements from src
   *
   * If this is GpuVector, src can be cpu or gpu memory
   *
   * If this is CpuVector, src is assumed to be cpu memory,
   */
  virtual void copyFrom(const T* src, size_t size, hl_stream_t stream) = 0;

  /**
   * exec a func in single/multi thread
   *
   * This default implementation calls func(0, 1) on the calling thread.
   */
  virtual void exec(SyncThreadPool::JobFunc func) { func(0, 1); }

  /// Get the buffer point with beginPos
  virtual T* getPoint(const uint64_t beginPos) = 0;

  /// Get the value for the i'th element
  virtual T getElement(size_t i) const = 0;
  virtual void setElement(size_t i, const T& value) = 0;

  //----------  math operations ----------------

  // sum of the absolute value of each elements
  virtual T getAbsSum() = 0;

  virtual T getSum() = 0;
  virtual T getMax() = 0;
  virtual T getAbsMax() = 0;
  virtual T getMin() = 0;

  /// element-wise calc:  this = (b == value)
  virtual void isEqualTo(const VectorT<T>& b, const T& value) = 0;

  /// select elements indexed by *ids* from vector *src*
  virtual void selectFrom(const VectorT<T>& src, const VectorT<int>& ids) = 0;

  enum HistogramType {
    HISTOGRAM_EXPONENT = 0,
  };

  /**
   * @brief  print histogram of vector values
   *
   * @note   only exponent histogram supported currently
   */
  virtual void histogram(std::ostream& os, int type = HISTOGRAM_EXPONENT) = 0;

  /// generate uniform random value for each element
  virtual void rand() = 0;
  /**
   * generate uniform random value for each element,
   * data range is from 0 to (classes - 1).
   */
  virtual void rand(size_t classes) = 0;

  /**
   * Debug use only. Very inefficient for GPU vector.
   * get the value at pos.
   */
  virtual T get(size_t pos) = 0;

  /**
   * generate univariate Gaussian distributed random numbers
   * with given mean and standardDeviation.
   */
  virtual void randnorm(real mean, real standardDeviation) = 0;

  /**
   * generate uniform distributed random numbers
   * with given range.
   */
  virtual void uniform(real left, real right) = 0;

  /// print the first "num" elements of the Vector
  virtual void print(std::ostream& os, size_t num) const = 0;

  /// print the "idx" element of the Vector
  virtual void printOneElement(std::ostream& os, size_t idx) const = 0;

  /**
   * Evaluate the expression `expr` into this vector, using TensorGpuApply
   * when useGpu_ is set and TensorCpuApply otherwise.
   */
  template <typename ExpressionType>
  void operator=(const ExpressionType& expr) {
    if (BaseVector<T>::useGpu_) {
      TensorGpuApply<T>(*this, expr);
    } else {
      TensorCpuApply<T>(*this, expr);
    }
  }

protected:
  friend class GpuVectorT<T>;
  friend class CpuVectorT<T>;
  virtual void copyTo(CpuVectorT<T>* dest) const = 0;
  virtual void copyTo(GpuVectorT<T>* dest) const = 0;
  /// Handle of the buffer backing data_; empty when the vector was built
  /// from a caller-owned raw pointer.
  MemoryHandlePtr memoryHandle_;
};

/// Stream every element of `vec` to `os` through VectorT::print().
template <class T>
std::ostream& operator<<(std::ostream& os, const VectorT<T>& vec) {
  const size_t elementCount = vec.getSize();
  vec.print(os, elementCount);
  return os;
}

/**
 * VectorT implementation that constructs its base with useGpu = true and
 * allocates through GpuMemoryHandle.
 */
template <class T>
class GpuVectorT : public VectorT<T> {
public:
  explicit GpuVectorT(size_t size);
  GpuVectorT(size_t size, GpuMemHandlePtr memHandle, size_t offset)
      : VectorT<T>(size, memHandle, offset, true) {}

  // data is still owned by the caller.
  // data should be valid during the life of this vector.
  // Caller is responsible for releasing the memory.
  GpuVectorT(size_t size, T* data) : VectorT<T>(size, data, true) {}

  /// Allocate a GpuMemoryHandle of `size` bytes.
  MemoryHandlePtr newMemory(size_t size) override {
    return std::make_shared<GpuMemoryHandle>(size);
  }
  void zeroMem() override;
  void reset(const T& value) override;
  void fillSequence() override;

  void copyFrom(const T* src, size_t size) override;
  void copyFrom(const T* src, size_t size, hl_stream_t stream) override;
  void copyFrom(const VectorT<T>& src) override;
  void copyFrom(const VectorT<T>& src, hl_stream_t stream) override;
  T getElement(size_t i) const override;
  void setElement(size_t i, const T& value) override;
  T* getPoint(const uint64_t beginPos) override;

  T getAbsSum() override;
  T getSum() override;
  T getMax() override;
  T getAbsMax() override;
  T getMin() override;
  void isEqualTo(const VectorT<T>& b, const T& value) override;
  void selectFrom(const VectorT<T>& src, const VectorT<int>& ids) override;
  void histogram(std::ostream& os, int type) override;
  void rand() override;
  void rand(size_t classes) override;
  void randnorm(real mean, real standardDeviation) override;
  void uniform(real left, real right) override;
  T get(size_t pos) override;
  void print(std::ostream& os, size_t num) const override;
  void printOneElement(std::ostream& os, size_t idx) const override;

  /// Evaluate the expression `expr` into this vector with TensorGpuApply.
  template <typename ExpressionType>
  void operator=(const ExpressionType& expr) {
    TensorGpuApply<T>(*this, expr);
  }

protected:
  void copyTo(CpuVectorT<T>* dest) const override;
  void copyTo(GpuVectorT<T>* dest) const override;
};

/**
 * VectorT implementation that constructs its base with useGpu = false and
 * allocates through CpuMemoryHandle.
 */
template <class T>
class CpuVectorT : public VectorT<T> {
public:
  explicit CpuVectorT(size_t size);
  CpuVectorT(size_t size, MemoryHandlePtr memoryHandle, size_t offset)
      : VectorT<T>(size, memoryHandle, offset, false) {}

  // data is still owned by the caller.
  // data should be valid during the life of this vector.
  // Caller is responsible for releasing the memory.
  CpuVectorT(size_t size, T* data) : VectorT<T>(size, data, false) {}

  /**
   * If src is a CpuVector, the new CpuVector will share the data with src
   *
   * If src is a GpuVector, the new CpuVector will copy data from src
   */
  explicit CpuVectorT(const VectorT<T>& src);

  /// Allocate a CpuMemoryHandle of `size` bytes.
  MemoryHandlePtr newMemory(size_t size) override {
    return std::make_shared<CpuMemoryHandle>(size);
  }

  void zeroMem() override;
  void reset(const T& value) override;
  void fillSequence() override;
  void copyFrom(const T* src, size_t size) override;
  void copyFrom(const T* src, size_t size, hl_stream_t stream) override;
  void copyFrom(const VectorT<T>& src) override;
  void copyFrom(const VectorT<T>& src, hl_stream_t stream) override;
  void copyTo(CpuVectorT<T>* dest) const override;
  void copyTo(GpuVectorT<T>* dest) const override;

  /// Get the buffer point with beginPos
  T* getPoint(const uint64_t beginPos) override {
    return this->getData() + beginPos;
  }

  /// Direct, unchecked element access on the host buffer.
  T getElement(size_t i) const override { return this->getData()[i]; }
  void setElement(size_t i, const T& value) override {
    this->getData()[i] = value;
  }

  T getAbsSum() override;
  T getSum() override;
  T getMax() override;
  T getAbsMax() override;
  T getMin() override;
  void isEqualTo(const VectorT<T>& b, const T& value) override;
  void selectFrom(const VectorT<T>& src, const VectorT<int>& ids) override;
  void histogram(std::ostream& os, int type) override;
  void rand() override;
  void rand(size_t classes) override;
  void randnorm(real mean, real standardDeviation) override;
  void uniform(real left, real right) override;
  T get(size_t pos) override;
  void print(std::ostream& os, size_t num) const override;
  void printOneElement(std::ostream& os, size_t idx) const override;

  /// Evaluate the expression `expr` into this vector with TensorCpuApply.
  template <typename ExpressionType>
  void operator=(const ExpressionType& expr) {
    TensorCpuApply<T>(*this, expr);
  }
};

/**
 * CpuVectorT whose zeroMem(), randnorm() and uniform() are routed through
 * parallelExec(), and whose exec() is overridden.
 *
 * parallelExec() and exec() are defined outside this header; presumably they
 * split the work across pool_ -- confirm against the implementation file.
 */
template <class T>
class ParallelCpuVectorT : public CpuVectorT<T> {
public:
  ParallelCpuVectorT(size_t size, SyncThreadPool* pool)
      : CpuVectorT<T>(size), pool_(pool) {}

  // Each callback names the CpuVectorT implementation explicitly, so the
  // call is not dispatched virtually.
  void zeroMem() override {
    parallelExec([](CpuVectorT<T>& vec) { vec.CpuVectorT<T>::zeroMem(); });
  }
  void randnorm(real mean, real standardDeviation) override {
    parallelExec([=](CpuVectorT<T>& vec) {
      vec.CpuVectorT<T>::randnorm(mean, standardDeviation);
    });
  }
  void uniform(real left, real right) override {
    parallelExec(
        [=](CpuVectorT<T>& vec) { vec.CpuVectorT<T>::uniform(left, right); });
  }

  void exec(SyncThreadPool::JobFunc jobFunc) override;

private:
  /// Callback type handed to parallelExec().
  typedef std::function<void(CpuVectorT<T>& vec)> ExecFunc;
  void parallelExec(ExecFunc func);
  /// Thread pool supplied at construction; not owned by this class.
  SyncThreadPool* pool_;
};

/**
 * A class to do conversion between CpuVector and GpuVector automatically.
 */
template <class T>
class CpuGpuVectorT {
public:
  /**
   * @brief An enum type of SyncedFlag using to
   *        mark data memory is in CPU or GPU.
   *
   * DATA_AT_CPU: data is located in CPU.
   *
   * DATA_AT_GPU: data is located in GPU.
   *
   * SYNCED: data is located in CPU and GPU simultaneously.
   */
452
  enum SyncedFlag { DATA_AT_CPU = 0, DATA_AT_GPU = 1, SYNCED = 2 };
453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485

  /**
   * @brief A constructor, create cpuVectorT_ or gpuVectorT_.
   *
   * @param[in] size    data size.
   * @param[in] useGpu  use gpu or not.
   */
  explicit CpuGpuVectorT(size_t size, bool useGpu);

  /**
   * @brief A constructor, create CpuGpuVectorT by VectorT.
   *
   * If src is CpuVector, cpuVectorT_ is shared data with src.
   *
   * If src is GpuVector, gpuVectorT_ is shared data with src.
   */
  explicit CpuGpuVectorT(const std::shared_ptr<VectorT<T>>& src);

  /**
   * @brief A constructor.
   *
   * If useGpu is true, data should be located in device and
   * create gpuVectorT_ with data.
   *
   * If useGpu is false, data should be located in host and
   * create cpuVectorT_ with data.
   *
   * @note Data is owned by the caller and should be valid during
   *       the life of this vector.
   *       Caller is responsible for release the memory.
   */
  CpuGpuVectorT(size_t size, T* data, bool useGpu);

486
  CpuGpuVectorT(CpuGpuVectorT<T>& src, size_t offset, size_t size);
487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504

  virtual ~CpuGpuVectorT() {}

  static std::shared_ptr<CpuGpuVectorT<T>> create(size_t size, bool useGpu);

  /**
   * @brief resize vector.
   *
   * If useGpu is true, resize gpuVectorT_ and set syncFlag_ to DATA_AT_GPU,
   *
   * otherwise resize cpuVectorT_ and set syncFlag_ to DATA_AT_CPU.
   */
  void resize(size_t size, bool useGpu);

  /**
   * @brief resize or create CpuGpuVectorT.
   */
  static void resizeOrCreate(std::shared_ptr<CpuGpuVectorT<T>>& vec,
505 506
                             size_t size,
                             bool useGpu);
507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537

  /**
   * @brief return a const cpuVectorT_ or gpuVectorT_.
   *
   * If useGpu is true, return gpuVectorT_.
   *
   * If useGpu is false, return cpuVectorT_.
   *
   * @note Caller should not change the data.
   *       If caller changes const attribute,
   *       should set syncFlag_.
   */
  std::shared_ptr<const VectorT<T>> getVector(bool useGpu) const;

  /**
   * @brief return a const cpuVectorT_ or gpuVectorT_.
   *
   * @note: This interface will change syncFlag_, so if you will
   *        not change the data, you should call getVector.
   */
  std::shared_ptr<VectorT<T>>& getMutableVector(bool useGpu);

  /**
   * @brief return const T* data.
   *
   * If useGpu is true, return device data.
   *
   * If useGpu is false, return host data.
   */
  const T* getData(bool useGpu) const;

538 539 540 541
  // TODO(yuyang18): Make getData more c++ style.
  //  inline T* getData(bool useGpu) {
  //    return getMutableData(useGpu);
  //  }
542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630

  T* getMutableData(bool useGpu);

  /**
   * If useGpu is true, gpuVectorT_->Op().
   *
   * If useGpu is false, cpuVectorT_->Op().
   *
   * Op is zeroMem, fillSequence, ...
   */
  void zeroMem(bool useGpu);
  void fillSequence(bool useGpu);
  void setElement(size_t i, const T& value, bool useGpu);

  /**
   * @brief return i-th element.
   */
  T getElement(size_t i) const;

  /**
   * @brief return vector size.
   */
  size_t getSize() const {
    size_t size = 0;
    switch (*sync_) {
      case SYNCED:
      case DATA_AT_CPU:
        size = cpuVectorT_->getSize();
        break;
      case DATA_AT_GPU:
        size = gpuVectorT_->getSize();
        break;
      default:
        LOG(FATAL) << "Not support";
        break;
    }
    return size;
  }

  /// copy data to cpuVectorT_.
  inline void copyToCpu(const T* data, size_t size) {
    this->resizeOrCreate(size, false);
    cpuVectorT_->copyFrom(data, size);
    setSync(DATA_AT_CPU);
  }
  /// copy data to cpuVectorT_ using specifed-stream.
  inline void copyToCpu(const T* data, size_t size, hl_stream_t stream) {
    this->resizeOrCreate(size, false);
    cpuVectorT_->copyFrom(data, size, stream);
    setSync(DATA_AT_CPU);
  }

  /// copy data to gpuVectorT_.
  inline void copyToGpu(const T* data, size_t size) {
    this->resizeOrCreate(size, true);
    gpuVectorT_->copyFrom(data, size);
    setSync(DATA_AT_GPU);
  }
  /// copy data to gpuVectorT_ using specifed-stream.
  inline void copyToGpu(const T* data, size_t size, hl_stream_t stream) {
    this->resizeOrCreate(size, true);
    gpuVectorT_->copyFrom(data, size, stream);
    setSync(DATA_AT_GPU);
  }

  /**
   * @brief copy from src using specifed-stream.
   *
   * If src is CpuVectorT, copy to cpuVectorT_.
   *
   * If src is GpuVectorT, copy to gpuVectorT_.
   */
  void copyFrom(const VectorT<T>& src, hl_stream_t stream);

  /**
   * @brief copy data.
   *
   * If useGpu is false, copy host data to cpuVectorT_.
   *
   * If useGpu is true, copy device data to gpuVectorT_.
   *
   * @note  data address should consistent with useGpu.
   */
  void copyFrom(const T* data, size_t size, bool useGpu);
  void copyFrom(const T* data, size_t size, hl_stream_t stream, bool useGpu);

  /**
   * @brief copy from (src + offset) using specifed-stream.
   */
631 632 633 634 635
  void copyFrom(CpuGpuVectorT<T>& src,
                size_t offset,
                size_t size,
                bool useGpu,
                hl_stream_t stream);
636 637 638 639 640 641 642 643 644

  /**
   * @brief copy from src using specifed-stream.
   */
  void copyFrom(CpuGpuVectorT<T>& src, hl_stream_t stream);

  /**
   * @brief return sync_.
   */
645
  inline SyncedFlag* getSync() const { return sync_; }
646 647 648 649

  /**
   * @brief set sync_.
   */
650
  inline void setSync(SyncedFlag* sync) { sync_ = sync; }
651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719

  inline void setSync(SyncedFlag syncFlag) {
    if (sync_) {
      *sync_ = syncFlag;
    } else {
      syncFlag_ = syncFlag;
      sync_ = &syncFlag_;
    }
  }

  inline void setSync(bool useGpu) {
    SyncedFlag flag = useGpu ? DATA_AT_GPU : DATA_AT_CPU;
    setSync(flag);
  }

protected:
  void resizeOrCreate(size_t size, bool useGpu);

  /**
   * @brief copy between cpuVectorT_ and gpuVectorT_.
   *
   * If syncFlag_ is DATA_AT_CPU and SYNCED, do nothing.
   *
   * If syncFlag_ is DATA_AT_GPU, copy gpuVectorT_ to cpuVectorT_
   *   and set syncFlag_ to SYNCED.
   */
  void copyToCpu();

  /**
   * @brief copy between cpuVectorT_ and gpuVectorT_.
   *
   * If syncFlag_ is DATA_AT_GPU and SYNCED, do nothing.
   *
   * If syncFlag_ is DATA_AT_CPU, copy cpuVectorT_ to gpuVectorT_
   *   and set syncFlag_ to SYNCED.
   */
  void copyToGpu();

  /// host pointer.
  std::shared_ptr<VectorT<T>> cpuVectorT_;
  /// device pointer.
  std::shared_ptr<VectorT<T>> gpuVectorT_;
  /// specify current data address.
  SyncedFlag syncFlag_;
  SyncedFlag* sync_;
};

typedef VectorT<real> Vector;
typedef CpuVectorT<real> CpuVector;
typedef GpuVectorT<real> GpuVector;

typedef VectorT<int> IVector;
typedef CpuVectorT<int> CpuIVector;
typedef GpuVectorT<int> GpuIVector;

typedef std::shared_ptr<Vector> VectorPtr;
typedef std::shared_ptr<CpuVector> CpuVectorPtr;
typedef std::shared_ptr<GpuVector> GpuVectorPtr;

typedef std::shared_ptr<IVector> IVectorPtr;
typedef std::shared_ptr<CpuIVector> CpuIVectorPtr;
typedef std::shared_ptr<GpuIVector> GpuIVectorPtr;

typedef CpuGpuVectorT<real> CpuGpuVector;
typedef CpuGpuVectorT<int> ICpuGpuVector;
typedef std::shared_ptr<CpuGpuVector> CpuGpuVectorPtr;
typedef std::shared_ptr<ICpuGpuVector> ICpuGpuVectorPtr;

}  // namespace paddle