/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/utils/Util.h"

#include "Vector.h"

#include <cstring>
#include <memory>
#include "paddle/utils/Logging.h"
#include "paddle/utils/ThreadLocal.h"
#include "paddle/utils/Thread.h"
#include "paddle/utils/Flags.h"
#include "Matrix.h"
#include "hl_gpu.h"
#include "hl_table_apply.h"

namespace paddle {

// ---------------------------------------------------------------------------
// VectorT factories
// ---------------------------------------------------------------------------

// Creates a vector of `size` elements: a GpuVectorT when `useGpu` is true,
// otherwise a CpuVectorT.
template <class T>
std::shared_ptr<VectorT<T>> VectorT<T>::create(size_t size, bool useGpu) {
  if (useGpu) {
    return std::make_shared<GpuVectorT<T>>(size);
  } else {
    return std::make_shared<CpuVectorT<T>>(size);
  }
}

// Creates a ParallelCpuVectorT when running on CPU with more than one
// trainer and `size` reaches the FLAGS_enable_parallel_vector threshold
// (a zero flag disables it); otherwise falls back to create(size, useGpu).
// When `pool` is null the global sync thread pool is used.
template <class T>
std::shared_ptr<VectorT<T>> VectorT<T>::createParallelVector(
    size_t size, bool useGpu, SyncThreadPool* pool) {
  if (!useGpu && FLAGS_trainer_count > 1 && FLAGS_enable_parallel_vector &&
      size >= (size_t)FLAGS_enable_parallel_vector) {
    return std::make_shared<ParallelCpuVectorT<T>>(
        size, pool ? pool : getGlobalSyncThreadPool());
  } else {
    return create(size, useGpu);
  }
}

// Creates a vector around the caller-supplied buffer `data`.
template <class T>
std::shared_ptr<VectorT<T>> VectorT<T>::create(T* data,
                                               size_t size,
                                               bool useGpu) {
  if (useGpu) {
    return std::make_shared<GpuVectorT<T>>(size, data);
  } else {
    return std::make_shared<CpuVectorT<T>>(size, data);
  }
}

// Creates a vector on top of an existing memory handle. The concrete vector
// type follows the dynamic type of `memoryHandle`; any other handle type is
// a fatal error.
template <class T>
std::shared_ptr<VectorT<T>> VectorT<T>::create(size_t size,
                                               MemoryHandlePtr memoryHandle,
                                               size_t offset) {
  if (auto cpuMemHandle =
          std::dynamic_pointer_cast<CpuMemoryHandle>(memoryHandle)) {
    return std::make_shared<CpuVectorT<T>>(size, cpuMemHandle, offset);
  } else if (auto gpuMemHandle =
                 std::dynamic_pointer_cast<GpuMemoryHandle>(memoryHandle)) {
    return std::make_shared<GpuVectorT<T>>(size, gpuMemHandle, offset);
  } else {
    LOG(FATAL) << "Wrong";
    return nullptr;
  }
}

// One-hot conversion is only defined for integer id vectors.
template <>
MatrixPtr VectorT<real>::toOneHotSparseMatrix(size_t idRange, bool useGpu) {
  LOG(FATAL) << "Wrong for real vector";
  return nullptr;
}

// Builds a (getSize() x idRange) NO_VALUE CSR sparse matrix in which row i
// has a single entry at column ids[i]. Ids are first copied into a CPU
// vector so this works for either storage location.
template <>
MatrixPtr VectorT<int>::toOneHotSparseMatrix(size_t idRange, bool useGpu) {
  size_t height = getSize();
  size_t width = idRange;
  MatrixPtr mat = Matrix::createSparseMatrix(
      height, idRange, height, NO_VALUE, SPARSE_CSR, false, useGpu);

  CpuIVector cpuIds(height);
  cpuIds.copyFrom(*this);
  int* idData = cpuIds.getData();

  for (decltype(height) i = 0; i < height; i++) {
    // A negative id converts to a large unsigned value and therefore also
    // fails the range check below.
    const unsigned int id = idData[i];
    CHECK_LT(id, width);
    mat->setRow(i, 1, &id, nullptr);
  }
  return mat;
}

// ---------------------------------------------------------------------------
// GpuVectorT
// ---------------------------------------------------------------------------

// Allocates a new GpuMemoryHandle of sizeof(T) * size bytes.
template <class T>
GpuVectorT<T>::GpuVectorT(size_t size)
    : VectorT<T>(size,
                 std::make_shared<GpuMemoryHandle>(sizeof(T) * size),
                 0, /* offset = 0 */
                 true /* useGpu = true */) {}

// Fetches element i with hl_memcpy_device2host. No bounds check.
template <class T>
T GpuVectorT<T>::getElement(size_t i) const {
  T elem = 0;
  hl_memcpy_device2host(&elem, const_cast<T*>(&this->getData()[i]), sizeof(T));
  return elem;
}

// Stores `value` at element i with hl_memcpy_host2device. No bounds check.
template <class T>
void GpuVectorT<T>::setElement(size_t i, const T& value) {
  hl_memcpy_host2device(&this->getData()[i], const_cast<T*>(&value), sizeof(T));
}

template <class T>
T* GpuVectorT<T>::getPoint(const uint64_t beginPos) {
  LOG(FATAL) << "Not implemented" << beginPos;
  return nullptr;
}

template <>
int GpuVectorT<int>::getAbsSum() {
  LOG(FATAL) << "Not implemented";
  return 0;
}

template <>
int GpuVectorT<int>::getSum() {
  LOG(FATAL) << "Not implemented";
  return 0;
}

// Sum of absolute values, computed by hl_vector_abs_sum.
template <>
real GpuVectorT<real>::getAbsSum() {
  real* A = this->getData();
  real sum = 0;
  hl_vector_abs_sum(A, &sum, this->getSize());
  return sum;
}

// Sum of all elements, computed by hl_vector_sum.
template <>
real GpuVectorT<real>::getSum() {
  real* A = this->getData();
  real sum = 0;
  hl_vector_sum(A, &sum, this->getSize());
  return sum;
}

// Integer max: copy into a CPU vector and reduce there.
template <>
int GpuVectorT<int>::getMax() {
  CpuIVector cpuIVec = CpuIVector(this->getSize());
  copyTo(&cpuIVec);
  return cpuIVec.getMax();
}

// Integer abs-max: copy into a CPU vector and reduce there.
template <>
int GpuVectorT<int>::getAbsMax() {
  CpuIVector cpuIVec = CpuIVector(this->getSize());
  copyTo(&cpuIVec);
  return cpuIVec.getAbsMax();
}

// Delegates to BaseMatrixT<T>::isEqualTo, viewing `b` as a BaseMatrixT.
template <class T>
void GpuVectorT<T>::isEqualTo(const VectorT<T>& b, const T& value) {
  BaseMatrixT<T>::isEqualTo((BaseMatrixT<T>&)b, value);
}

// Gathers from `src` by `ids` via hl_vector_select_from. Compiles to a no-op
// when PADDLE_ONLY_CPU is defined.
template <class T>
void GpuVectorT<T>::selectFrom(const VectorT<T>& src, const VectorT<int>& ids) {
#ifndef PADDLE_ONLY_CPU
  hl_vector_select_from<T>(this->getData(),
                           this->getSize(),
                           src.getData(),
                           src.getSize(),
                           ids.getData(),
                           ids.getSize());
#endif
}

// Runs the row-reduction `f` over `v` treated as a single row of
// v.getSize() columns, writing the scalar result into a thread-local
// one-element CpuVector that is created lazily on first use.
template <class Func>
real gpuRowFunc(Func f, GpuVector& v) {
  static ThreadLocal<std::unique_ptr<CpuVectorT<real>>> local;
  if (!*local) {
    (*local).reset(new CpuVector(1));
  }
  real* A = v.getData();
  f(A, (*local)->getData(), 1, v.getSize());
  return (*local)->getData()[0];
}

template <>
real GpuVectorT<real>::getMax() {
  return gpuRowFunc(hl_matrix_row_max, *this);
}

// max(|x|) == max(max(x), -min(x)).
template <>
real GpuVectorT<real>::getAbsMax() {
  return std::max(gpuRowFunc(hl_matrix_row_max, *this),
                  -gpuRowFunc(hl_matrix_row_min, *this));
}

template <>
int GpuVectorT<int>::getMin() {
  LOG(FATAL) << "Not implemented";
  return 0;
}

template <>
real GpuVectorT<real>::getMin() {
  return gpuRowFunc(hl_matrix_row_min, *this);
}

// Fetches the element at `pos` with hl_memcpy_device2host. No bounds check.
template <class T>
T GpuVectorT<T>::get(size_t pos) {
  T val = (T)0;
  hl_memcpy_device2host((void*)&val, (void*)(this->getData() + pos), sizeof(T));
  return val;
}

template <class T>
void GpuVectorT<T>::histogram(std::ostream& os, int type) {
  LOG(FATAL) << "Not implemented";
}

template <class T>
void GpuVectorT<T>::zeroMem() {
  BaseMatrixT<T>::zero();
}

template <class T>
void GpuVectorT<T>::reset(const T& value) {
  BaseMatrixT<T>::assign(value);
}

template <class T>
void GpuVectorT<T>::fillSequence() {
  LOG(FATAL) << "not implemented";
}

// Double dispatch: `src` selects the right copyTo overload for its storage.
template <class T>
void GpuVectorT<T>::copyFrom(const VectorT<T>& src) {
  src.copyTo(this);
}

// Asynchronous copy of the whole vector on `stream`; sizes must match.
template <class T>
void GpuVectorT<T>::copyFrom(const VectorT<T>& src, hl_stream_t stream) {
  CHECK_EQ(src.getSize(), this->getSize());
  hl_memcpy_async((void*)this->getData(),
                  (void*)src.getData(),
                  sizeof(T) * this->getSize(),
                  stream);
}

// Copies `size` elements from `gpuSrc` into the front of this vector.
template <class T>
void GpuVectorT<T>::copyFrom(const T* gpuSrc, size_t size) {
  CHECK(gpuSrc != nullptr);
  CHECK_LE(size, this->size_);

  hl_memcpy((void*)this->getData(), (void*)gpuSrc, sizeof(T) * size);
}

// Asynchronous variant of copyFrom(gpuSrc, size), issued on `stream`.
template <class T>
void GpuVectorT<T>::copyFrom(const T* gpuSrc, size_t size, hl_stream_t stream) {
  CHECK(gpuSrc != nullptr);
  CHECK_LE(size, this->size_);

  hl_memcpy_async(
      (void*)this->getData(), (void*)gpuSrc, sizeof(T) * size, stream);
}

// Copies the whole vector into `dest` (sizes must match) using
// hl_memcpy_device2host.
template <class T>
void GpuVectorT<T>::copyTo(CpuVectorT<T>* dest) const {
  CHECK_EQ(this->getSize(), dest->getSize());

  hl_memcpy_device2host((void*)dest->getData(),
                        (void*)this->getData(),
                        sizeof(T) * this->getSize());
}

// Copies the whole vector into `dest` (sizes must match) using
// hl_memcpy_device2device.
template <class T>
void GpuVectorT<T>::copyTo(GpuVectorT<T>* dest) const {
  CHECK_EQ(this->getSize(), dest->getSize());

  hl_memcpy_device2device((void*)dest->getData(),
                          (void*)this->getData(),
                          sizeof(T) * this->getSize());
}

template <>
void GpuVectorT<int>::rand() {
  LOG(FATAL) << "Not implemented";
}

// Prints up to `num` elements by staging the data in a CPU IVector.
template <>
void GpuVectorT<int>::print(std::ostream& os, size_t num) const {
  IVectorPtr dest = IVector::create(this->size_, false);
  hl_memcpy_device2host((void*)dest->getData(),
                        (void*)this->getData(),
                        sizeof(int) * this->getSize());
  dest->print(os, num);
}

// Prints up to `num` elements by staging the data in a CPU Vector.
template <>
void GpuVectorT<real>::print(std::ostream& os, size_t num) const {
  VectorPtr dest = Vector::create(this->size_, false);
  // The element type here is `real`; sizing the copy with sizeof(int) would
  // transfer only part of the buffer whenever real is wider than int.
  hl_memcpy_device2host((void*)dest->getData(),
                        (void*)this->getData(),
                        sizeof(real) * this->getSize());
  dest->print(os, num);
}

template <>
void GpuVectorT<int>::printOneElement(std::ostream& os, size_t idx) const {
  LOG(FATAL) << "Not implemented";
}

template <>
void GpuVectorT<real>::printOneElement(std::ostream& os, size_t idx) const {
  LOG(FATAL) << "Not implemented";
}

// ---------------------------------------------------------------------------
// Random fills
// ---------------------------------------------------------------------------

template <>
void CpuVectorT<int>::rand() {
  LOG(FATAL) << "Not implemented";
}

template <>
void GpuVectorT<real>::rand(size_t classNum) {
  LOG(FATAL) << "Not implemented";
}

template <>
void CpuVectorT<real>::rand(size_t classNum) {
  LOG(FATAL) << "Not implemented";
}

// Generates the values in a temporary CPU vector, then uploads them.
template <>
void GpuVectorT<real>::rand() {
  VectorPtr cPtr = Vector::create(this->size_, false);
  cPtr->rand();

  hl_memcpy_host2device(data_, cPtr->getData(), this->size_ * sizeof(real));
}

// Generates the ids in a temporary CPU vector, then uploads them.
template <>
void GpuVectorT<int>::rand(size_t classNum) {
  IVectorPtr cPtr = IVector::create(this->size_, false);
  cPtr->rand(classNum);

  hl_memcpy_host2device(data_, cPtr->getData(), this->size_ * sizeof(int));
}

// Fills with integers drawn from ::rand(), scaled into [0, classNum) and
// clamped to classNum - 1.
template <>
void CpuVectorT<int>::rand(size_t classNum) {
  size_t size = this->getSize();
  int* data = this->getData();
  for (size_t i = 0; i < size; i++) {
    data[i] =
        std::min(classNum - 1,
                 size_t(::rand() * (1. / ((double)RAND_MAX + 1)) * classNum));
  }
}

// Fills with ::rand() scaled into [0, 1].
template <>
void CpuVectorT<real>::rand() {
  size_t size = this->getSize();
  real* data = this->getData();
  for (size_t i = 0; i < size; i++) {
    data[i] = ::rand() * (1. / (double)RAND_MAX);
    // data[ii] = ((temp > RAND_MAX/2)? 1 : -1) *
    // sqrt( abs((temp-RAND_MAX/2))/(double(RAND_MAX))/2048 );
  }
}

template <class T>
void CpuVectorT<T>::randnorm(real, real) {
  LOG(FATAL) << "Not implemented";
}

template <class T>
void CpuVectorT<T>::uniform(real, real) {
  LOG(FATAL) << "Not implemented";
}

template <class T>
void GpuVectorT<T>::randnorm(real, real) {
  LOG(FATAL) << "Not implemented";
}

template <class T>
void GpuVectorT<T>::uniform(real, real) {
  LOG(FATAL) << "Not implemented";
}

// Fills with samples computed as mean + std * sqrt(-2 ln r1) * {cos,sin}(2 pi
// r2), where r1 and r2 come from rand_r on the thread-local seed.
template <>
void CpuVectorT<real>::randnorm(real mean, real std) {
  size_t size = this->getSize();
  if (size == 0) {
    // Nothing to fill. Without this guard `size - 1` below would wrap around
    // and the code would write far outside the buffer.
    return;
  }
  real* data = this->getData();
  unsigned int* seed = ThreadLocalRand::getSeed();
  // Uniform sample in (0, 1]; never 0, so std::log() stays finite.
  auto rand1 = [&]() { return (1. + ::rand_r(seed)) * (1. / (1. + RAND_MAX)); };
  // Each iteration produces two outputs from one (r1, r2) pair.
  for (size_t i = 0; i + 1 < size; i += 2) {
    real r1 = rand1();
    r1 = std::sqrt(-2 * std::log(r1));
    real r2 = rand1();
    data[i] = mean + std * r1 * cos(2 * M_PI * r2);
    data[i + 1] = mean + std * r1 * sin(2 * M_PI * r2);
  }
  // The last element is always (re)drawn; this covers odd sizes and keeps
  // the number of rand_r calls identical to the previous implementation.
  real r1 = rand1();
  r1 = std::sqrt(-2 * std::log(r1));
  real r2 = rand1();
  data[size - 1] = mean + std * r1 * cos(2 * M_PI * r2);
}

// Fills with samples from [left, right) using rand_r on the thread-local
// seed.
template <>
void CpuVectorT<real>::uniform(real left, real right) {
  size_t size = this->getSize();
  real* data = this->getData();
  real range = right - left;
  unsigned int* seed = ThreadLocalRand::getSeed();
  auto rand1 = [&]() { return ::rand_r(seed) * (1. / (1. + RAND_MAX)); };
  for (size_t i = 0; i < size; ++i) {
    data[i] = rand1() * range + left;
  }
}

// Samples in a temporary CPU vector, then uploads the result.
template <>
void GpuVectorT<real>::randnorm(real mean, real std) {
  CpuVector cpuVec = CpuVector(this->getSize());
  cpuVec.randnorm(mean, std);

  hl_memcpy_host2device(
      data_, cpuVec.getData(), this->getSize() * sizeof(real));
}

// Samples in a temporary CPU vector, then uploads the result.
template <>
void GpuVectorT<real>::uniform(real left, real right) {
  CpuVector cpuVec = CpuVector(this->getSize());
  cpuVec.uniform(left, right);

  hl_memcpy_host2device(
      data_, cpuVec.getData(), this->getSize() * sizeof(real));
}

// ---------------------------------------------------------------------------
// CpuVectorT
// ---------------------------------------------------------------------------

// Allocates a new CpuMemoryHandle of sizeof(T) * size bytes.
template <class T>
CpuVectorT<T>::CpuVectorT(size_t size)
    : VectorT<T>(size,
                 std::make_shared<CpuMemoryHandle>(sizeof(T) * size),
                 0, /* offset = 0 */
                 false /* useGpu = false */) {}

// Constructs a CPU vector from `src`. The base is initialised with src's
// memory handle; if that handle is not a CpuMemoryHandle a fresh CPU buffer
// is allocated in its place. The contents are then copied with
// src.copyTo(this).
template <class T>
CpuVectorT<T>::CpuVectorT(const VectorT<T>& src)
    : VectorT<T>(src.getSize(),
                 src.getMemoryHandle(),
                 0, /* offset = 0 */
                 false /* useGpu = false */) {
  if (typeid(*this->memoryHandle_.get()) != typeid(CpuMemoryHandle)) {
    this->memoryHandle_ =
        std::make_shared<CpuMemoryHandle>(sizeof(T) * this->getSize());
    this->data_ = reinterpret_cast<T*>(this->memoryHandle_->getBuf());
  }
  src.copyTo(this);
}

// Sum of absolute values, accumulated in T.
template <class T>
T CpuVectorT<T>::getAbsSum() {
  const T* A = this->getData();
  size_t size = this->getSize();
  T sum = 0;
  for (size_t i = 0; i < size; i++) {
    sum += (A[i] > 0) ? A[i] : -A[i];
  }
  return sum;
}

// cannot use above version, due to precision issue of float
template <>
real CpuVectorT<real>::getAbsSum() {
  const real* A = this->getData();
  size_t size = this->getSize();
  double sum = 0;
  for (size_t i = 0; i < size; i++) {
    sum += (A[i] > 0) ? A[i] : -A[i];
  }
  return sum;
}

// Sum of all elements, accumulated in T.
template <class T>
T CpuVectorT<T>::getSum() {
  const T* A = this->getData();
  size_t size = this->getSize();
  T sum = 0;
  for (size_t i = 0; i < size; i++) {
    sum += A[i];
  }
  return sum;
}

// Accumulates in double for the same precision reason as getAbsSum().
template <>
real CpuVectorT<real>::getSum() {
  const real* A = this->getData();
  size_t size = this->getSize();
  double sum = 0;
  for (size_t i = 0; i < size; i++) {
    sum += A[i];
  }
  return sum;
}

// Returns the element at `pos`. No bounds check.
template <class T>
T CpuVectorT<T>::get(size_t pos) {
  return this->getData()[pos];
}

// Largest element. The vector must be non-empty.
template <class T>
T CpuVectorT<T>::getMax() {
  const T* A = this->getData();
  size_t size = this->getSize();
  CHECK(size > 0) << "getMax() requires a non-empty vector";
  T res = A[0];
  for (size_t i = 1; i < size; i++) {
    if (res < A[i]) res = A[i];
  }
  return res;
}

// Largest absolute value. The vector must be non-empty.
template <class T>
T CpuVectorT<T>::getAbsMax() {
  const T* A = this->getData();
  size_t size = this->getSize();
  CHECK(size > 0) << "getAbsMax() requires a non-empty vector";
  T res = std::abs(A[0]);
  for (size_t i = 1; i < size; i++) {
    if (res < std::abs(A[i])) res = std::abs(A[i]);
  }
  return res;
}

// Smallest element. The vector must be non-empty.
template <class T>
T CpuVectorT<T>::getMin() {
  const T* A = this->getData();
  size_t size = this->getSize();
  CHECK(size > 0) << "getMin() requires a non-empty vector";
  T res = A[0];
  for (size_t i = 1; i < size; i++) {
    if (res > A[i]) res = A[i];
  }
  return res;
}

// Sets this[i] = (b[i] == value) for every element; sizes must match.
template <class T>
void CpuVectorT<T>::isEqualTo(const VectorT<T>& b, const T& value) {
  size_t size = this->getSize();
  CHECK_EQ(b.getSize(), size);
  const T* B = b.getData();
  T* A = this->getData();
  for (size_t i = 0; i < size; i++) {
    A[i] = (B[i] == value);
  }
}

// Gather: this[i] = src[ids[i]]. `ids` must have the same size as this
// vector and every index must lie in [0, src.getSize()).
template <class T>
void CpuVectorT<T>::selectFrom(const VectorT<T>& src, const VectorT<int>& ids) {
  size_t size = this->getSize();
  CHECK_EQ(ids.getSize(), size);
  const int* indices = ids.getData();
  const T* B = src.getData();
  T* A = this->getData();
  for (size_t i = 0; i < size; i++) {
    int index = indices[i];
    // Reject negative indices as well; they would read before the buffer.
    CHECK(index >= 0) << "negative index in selectFrom";
    CHECK_LT(index, (int)src.getSize());
    A[i] = B[index];
  }
}

// Returns the top 9 bits (sign bit followed by the 8 exponent bits) of the
// binary representation of `a`, i.e. a value in [0, 512).
static int getSignAndExponentOfFloat(float a) {
  static_assert(sizeof(float) == sizeof(uint32_t),
                "float is expected to be 32 bits wide");
  // memcpy is the aliasing-safe way to inspect the bits of a float.
  uint32_t bits = 0;
  memcpy(&bits, &a, sizeof(bits));
  return bits >> 23;
}

template <class T>
void CpuVectorT<T>::histogram(std::ostream& os, int type) {
  LOG(FATAL) << "Not implemented";
}

// Writes a histogram of the elements bucketed by sign and binary exponent.
// Exact zeros are counted separately; percentages are relative to the number
// of non-zero elements. `type` is unused.
template <>
void CpuVectorT<real>::histogram(std::ostream& os, int type) {
  // Buckets [0, 256): sign bit clear; buckets [256, 512): sign bit set.
  int counters[512];
  memset(counters, 0, sizeof(counters));
  int counterZero = 0;

  const real* A = this->getData();
  size_t size = this->getSize();
  for (size_t i = 0; i < size; i++) {
    if (A[i] == 0.0f) {
      ++counterZero;
    } else {
      ++counters[getSignAndExponentOfFloat(A[i])];
    }
  }

  int64_t sum = 0;
  float sizeNonZero = size - counterZero;
  os << "zero:" << counterZero;
  for (int i = 0; i < 256; i++) {
    int counter = counters[i];
    if (counter) {
      // 127 is subtracted from the stored exponent before printing.
      os << " 2^" << i - 127 << ":" << counter / sizeNonZero * 100 << "%";
      sum += counter * (i - 127);
    }
  }
  for (int i = 0; i < 256; i++) {
    int counter = counters[i + 256];
    if (counter) {
      os << " -2^" << i - 127 << ":" << counter / sizeNonZero * 100 << "%";
      sum += counter * (i - 127);
    }
  }
  os << ", nonzero_exponent_avg=" << sum / sizeNonZero;
}

template <class T>
void CpuVectorT<T>::zeroMem() {
  memset(this->getData(), 0, sizeof(T) * this->getSize());
}

// Sets every element to `value`.
template <class T>
void CpuVectorT<T>::reset(const T& value) {
  T* A = this->getData();
  size_t size = this->getSize();
  for (size_t i = 0; i < size; i++) {
    A[i] = value;
  }
}

// Fills with 0, 1, 2, ..., size - 1.
template <class T>
void CpuVectorT<T>::fillSequence() {
  T* A = this->getData();
  size_t size = this->getSize();
  for (size_t i = 0; i < size; i++) {
    A[i] = i;
  }
}

// Double dispatch: `src` selects the right copyTo overload for its storage.
template <class T>
void CpuVectorT<T>::copyFrom(const VectorT<T>& src) {
  src.copyTo(this);
}

// Copies the whole of `src`. A GpuVectorT source is copied with
// hl_memcpy_async on `stream`; any other source is copied synchronously
// through src.copyTo(this).
template <class T>
void CpuVectorT<T>::copyFrom(const VectorT<T>& src, hl_stream_t stream) {
  if (typeid(src) == typeid(GpuVectorT<T>)) {
    // Same size requirement as every other whole-vector copy in this file.
    CHECK_EQ(src.getSize(), this->getSize());
    hl_memcpy_async((void*)this->getData(),
                    (void*)src.getData(),
                    sizeof(T) * this->getSize(),
                    stream);
  } else {
    src.copyTo(this);
  }
}

// Copies `size` elements from `hostSrc` into the front of this vector.
template <class T>
void CpuVectorT<T>::copyFrom(const T* hostSrc, size_t size) {
  CHECK(hostSrc != nullptr);
  CHECK_LE(size, this->size_);
  memcpy(this->data_, hostSrc, sizeof(T) * size);
}

// Same as copyFrom(hostSrc, size); `stream` is ignored for host copies.
template <class T>
void CpuVectorT<T>::copyFrom(const T* hostSrc,
                             size_t size,
                             hl_stream_t stream) {
  (void)stream;

  CHECK(hostSrc != nullptr);
  CHECK_LE(size, this->size_);
  memcpy(this->data_, hostSrc, sizeof(T) * size);
}

// Copies the whole vector into `dest` with memcpy; sizes must match.
template <class T>
void CpuVectorT<T>::copyTo(CpuVectorT<T>* dest) const {
  CHECK_EQ(this->getSize(), dest->getSize());
  memcpy(dest->getData(), this->getData(), sizeof(T) * this->getSize());
}

// Copies the whole vector into `dest` with hl_memcpy_host2device; sizes
// must match.
template <class T>
void CpuVectorT<T>::copyTo(GpuVectorT<T>* dest) const {
  CHECK_EQ(this->getSize(), dest->getSize());
  hl_memcpy_host2device((void*)dest->getData(),
                        (void*)this->getData(),
                        sizeof(T) * this->getSize());
}

// Prints at most `num` elements as "[a b c ]" followed by a newline.
template <>
void CpuVectorT<real>::print(std::ostream& os, size_t num) const {
  size_t w = size_ < num ? size_ : num;
  os << "[";
  for (size_t i = 0; i < w; ++i) {
    os << data_[i] << " ";
  }
  os << "]" << std::endl;
}

// Prints at most `num` elements as "[a b c ]" followed by a newline.
template <>
void CpuVectorT<int>::print(std::ostream& os, size_t num) const {
  size_t w = size_ < num ? size_ : num;
  os << "[";
  for (size_t i = 0; i < w; ++i) {
    os << (int)data_[i] << " ";
  }
  os << "]" << std::endl;
}

// Prints element `idx` followed by ';'.
template <>
void CpuVectorT<real>::printOneElement(std::ostream& os, size_t idx) const {
  CHECK_LT(idx, size_);
  os << data_[idx] << ";";
}

// Prints element `idx` followed by ';'.
template <>
void CpuVectorT<int>::printOneElement(std::ostream& os, size_t idx) const {
  CHECK_LT(idx, size_);
  os << (int)data_[idx] << ";";
}

// ---------------------------------------------------------------------------
// ParallelCpuVectorT
// ---------------------------------------------------------------------------

template <class T>
void ParallelCpuVectorT<T>::parallelExec(ExecFunc func) {
  LOG(FATAL) << "Not implemented";
}

// Runs `func` on every pool thread, handing each thread the sub-vector
// given by calcSplitArrayInterval for its thread id.
template <>
void ParallelCpuVectorT<real>::parallelExec(ExecFunc func) {
  pool_->exec([this, func](int tid, size_t numThreads) {
    auto interval = calcSplitArrayInterval(
        this->getSize(), (size_t)tid, numThreads, 8LU /*for avx*/);
    // setup sub bufs
    CpuVector subVec(0, nullptr);
    subVec.subVecFrom(*this, interval);
    func(subVec);
  });
}

template <class T>
void ParallelCpuVectorT<T>::exec(SyncThreadPool::JobFunc func) {
  LOG(FATAL) << "Not implemented";
}

// Forwards the job to the thread pool unchanged.
template <>
void ParallelCpuVectorT<real>::exec(SyncThreadPool::JobFunc func) {
  pool_->exec(func);
}

// ---------------------------------------------------------------------------
// CpuGpuVectorT
// ---------------------------------------------------------------------------

// Allocates only the side selected by `useGpu` and marks it as holding the
// data.
template <class T>
CpuGpuVectorT<T>::CpuGpuVectorT(size_t size, bool useGpu) : sync_(nullptr) {
  if (!useGpu) {
    cpuVectorT_ = std::make_shared<CpuVectorT<T>>(size);
  } else {
    gpuVectorT_ = std::make_shared<GpuVectorT<T>>(size);
  }
  setSync(useGpu);
}

// Adopts an existing vector as the CPU or GPU side, according to
// src->useGpu(). `src` must not be null.
template <class T>
CpuGpuVectorT<T>::CpuGpuVectorT(const std::shared_ptr<VectorT<T>>& src)
    : sync_(nullptr) {
  CHECK(src) << "src is null";
  bool useGpu = src->useGpu();
  if (useGpu) {
    gpuVectorT_ = src;
  } else {
    cpuVectorT_ = src;
  }
  setSync(useGpu);
}

// Builds the selected side around the caller-supplied buffer `data`.
template <class T>
CpuGpuVectorT<T>::CpuGpuVectorT(size_t size, T* data, bool useGpu)
    : sync_(nullptr) {
  if (!useGpu) {
    cpuVectorT_ = std::make_shared<CpuVectorT<T>>(size, data);
    setSync(DATA_AT_CPU);
  } else {
    gpuVectorT_ = std::make_shared<GpuVectorT<T>>(size, data);
    setSync(DATA_AT_GPU);
  }
}

template <class T>
std::shared_ptr<CpuGpuVectorT<T>> CpuGpuVectorT<T>::create(size_t size,
                                                           bool useGpu) {
  return std::make_shared<CpuGpuVectorT<T>>(size, useGpu);
}

// Resizes the side selected by `useGpu` (which must already exist) and marks
// it as holding the data.
template <class T>
void CpuGpuVectorT<T>::resize(size_t size, bool useGpu) {
  if (useGpu) {
    CHECK(gpuVectorT_) << "gpuVectorT_ is null";
    // If memoryHandle_ is nullptr,
    // the data may be owned by the caller when it was constructed.
    // It should not resize for this case.
    if (gpuVectorT_->getMemoryHandle()) {
      gpuVectorT_->resize(size);
    } else {
      CHECK_EQ(gpuVectorT_->getSize(), size);
    }
  } else {
    CHECK(cpuVectorT_) << "cpuVectorT_ is null";
    // If memoryHandle_ is nullptr,
    // the data may be owned by the caller when it was constructed.
    // It should not resize for this case.
    if (cpuVectorT_->getMemoryHandle()) {
      cpuVectorT_->resize(size);
    } else {
      CHECK_EQ(cpuVectorT_->getSize(), size);
    }
  }
  setSync(useGpu);
}

// Resizes `vec` if it exists, otherwise creates it.
template <class T>
void CpuGpuVectorT<T>::resizeOrCreate(std::shared_ptr<CpuGpuVectorT<T>>& vec,
                                      size_t size,
                                      bool useGpu) {
  if (vec) {
    vec->resize(size, useGpu);
  } else {
    vec = create(size, useGpu);
  }
}

// Ensures the side selected by `useGpu` exists with `size` elements. Note
// that the sync flag is only updated on the resize path, not when the side
// is newly created here.
template <class T>
void CpuGpuVectorT<T>::resizeOrCreate(size_t size, bool useGpu) {
  if (useGpu && (!gpuVectorT_)) {
    gpuVectorT_ = VectorT<T>::create(size, true);
  } else if ((!useGpu) && (!cpuVectorT_)) {
    cpuVectorT_ = VectorT<T>::create(size, false);
  } else {
    CHECK((useGpu && gpuVectorT_) || (!useGpu && cpuVectorT_));
    this->resize(size, useGpu);
  }
}

// Sub-vector constructor: builds CPU (and, unless PADDLE_ONLY_CPU, GPU)
// vectors over src's memory handles at [offset, offset + size). In GPU
// builds `src` is first brought to the SYNCED state so both handles exist.
template <class T>
CpuGpuVectorT<T>::CpuGpuVectorT(CpuGpuVectorT<T>& src,
                                size_t offset,
                                size_t size)
    : sync_(nullptr) {
  CHECK_LE(offset + size, static_cast<size_t>(src.getSize()));
#ifndef PADDLE_ONLY_CPU
  SyncedFlag* flag = src.getSync();
  if (*flag == DATA_AT_CPU) {
    src.copyToGpu();  // will set synchronous data between CPU and GPU
  } else if (*flag == DATA_AT_GPU) {
    src.copyToCpu();  // will set synchronous data between CPU and GPU
  }
#endif
  auto cMemHandle = (src.getVector(false))->getMemoryHandle();
  cpuVectorT_ = std::make_shared<CpuVectorT<T>>(
      size, std::dynamic_pointer_cast<CpuMemoryHandle>(cMemHandle), offset);
#ifndef PADDLE_ONLY_CPU
  auto gMemHandle = (src.getVector(true))->getMemoryHandle();
  gpuVectorT_ = std::make_shared<GpuVectorT<T>>(
      size, std::dynamic_pointer_cast<GpuMemoryHandle>(gMemHandle), offset);
  src.setSync(SYNCED);
#endif
  setSync(src.getSync());
}

// Read-only access to one side. Although const, this may copy data to the
// requested side and update the sync flag (hence the const_cast).
template <class T>
std::shared_ptr<const VectorT<T>> CpuGpuVectorT<T>::getVector(
    bool useGpu) const {
  auto* self = const_cast<CpuGpuVectorT<T>*>(this);
  if (useGpu) {
    self->copyToGpu();
    return std::const_pointer_cast<const VectorT<T>>(gpuVectorT_);
  } else {
    self->copyToCpu();
    return std::const_pointer_cast<const VectorT<T>>(cpuVectorT_);
  }
}

// Mutable access to one side; that side is marked as holding the data.
template <class T>
std::shared_ptr<VectorT<T>>& CpuGpuVectorT<T>::getMutableVector(bool useGpu) {
  setSync(useGpu);
  if (useGpu) {
    copyToGpu();
    return gpuVectorT_;
  } else {
    copyToCpu();
    return cpuVectorT_;
  }
}

// Read-only data pointer of one side. Like getVector(), this may copy data
// to the requested side first.
template <class T>
const T* CpuGpuVectorT<T>::getData(bool useGpu) const {
  auto self = const_cast<CpuGpuVectorT<T>*>(this);
  if (useGpu) {
    self->copyToGpu();
    return gpuVectorT_->getData();
  } else {
    self->copyToCpu();
    return cpuVectorT_->getData();
  }
}

// Operation will change data and need to reset sync_ & syncFlag_.
#define MUTABLE_VECTOR_OP(OP, useGpu, args...) \
  do {                                         \
    setSync(useGpu);                           \
    if (useGpu) {                              \
      copyToGpu();                             \
      return gpuVectorT_->OP(args);            \
    } else {                                   \
      copyToCpu();                             \
      return cpuVectorT_->OP(args);            \
    }                                          \
  } while (0)

template <class T>
T* CpuGpuVectorT<T>::getMutableData(bool useGpu) {
  MUTABLE_VECTOR_OP(getData, useGpu);
}

template <class T>
void CpuGpuVectorT<T>::zeroMem(bool useGpu) {
  MUTABLE_VECTOR_OP(zeroMem, useGpu);
}

template <class T>
void CpuGpuVectorT<T>::fillSequence(bool useGpu) {
  MUTABLE_VECTOR_OP(fillSequence, useGpu);
}

template <class T>
void CpuGpuVectorT<T>::setElement(size_t i, const T& value, bool useGpu) {
  MUTABLE_VECTOR_OP(setElement, useGpu, i, value);
}

// Reads element i from the CPU side when the state is SYNCED or
// DATA_AT_CPU, and from the GPU side when it is DATA_AT_GPU.
template <class T>
T CpuGpuVectorT<T>::getElement(size_t i) const {
  switch (*this->getSync()) {
    case SYNCED:
    case DATA_AT_CPU:
      return cpuVectorT_->getElement(i);
      break;
    case DATA_AT_GPU:
      return gpuVectorT_->getElement(i);
      break;
    default:
      LOG(FATAL) << "Not support";
      break;
  }
  // Only reached if LOG(FATAL) returns; keeps every path returning a value.
  return static_cast<T>(0);
}

// Copies from a plain CPU or GPU vector into the matching side.
template <class T>
void CpuGpuVectorT<T>::copyFrom(const VectorT<T>& src, hl_stream_t stream) {
  auto cVec = dynamic_cast<const CpuVectorT<T>*>(&src);
  auto gVec = dynamic_cast<const GpuVectorT<T>*>(&src);
  if (cVec) {
    copyToCpu(cVec->getData(), cVec->getSize(), stream);
  } else if (gVec) {
    copyToGpu(gVec->getData(), gVec->getSize(), stream);
  } else {
    LOG(FATAL) << "Invalid type of src";
  }
}

// Copies `size` elements from the raw buffer `data` into the side selected
// by `useGpu`.
template <class T>
void CpuGpuVectorT<T>::copyFrom(const T* data, size_t size, bool useGpu) {
  if (useGpu) {
    copyToGpu(data, size);
  } else {
    copyToCpu(data, size);
  }
}

// Stream variant of copyFrom(data, size, useGpu).
template <class T>
void CpuGpuVectorT<T>::copyFrom(const T* data,
                                size_t size,
                                hl_stream_t stream,
                                bool useGpu) {
  if (useGpu) {
    copyToGpu(data, size, stream);
  } else {
    copyToCpu(data, size, stream);
  }
}

// Copies src[offset, offset + size) into the side selected by `useGpu`,
// creating or resizing that side to `size` elements first.
template <class T>
void CpuGpuVectorT<T>::copyFrom(CpuGpuVectorT<T>& src,
                                size_t offset,
                                size_t size,
                                bool useGpu,
                                hl_stream_t stream) {
  // Same range requirement as the sub-vector constructor.
  CHECK_LE(offset + size, static_cast<size_t>(src.getSize()));
  if (useGpu) {
    VectorT<T>::resizeOrCreate(gpuVectorT_, size, true);
    gpuVectorT_->copyFrom(src.getData(true) + offset, size, stream);
  } else {
    VectorT<T>::resizeOrCreate(cpuVectorT_, size, false);
    cpuVectorT_->copyFrom(src.getData(false) + offset, size, stream);
  }
  setSync(useGpu);
}

// Copies from another CpuGpuVectorT, mirroring its sync state: only the
// side that holds src's data is copied, or both sides when src is SYNCED.
template <class T>
void CpuGpuVectorT<T>::copyFrom(CpuGpuVectorT<T>& src, hl_stream_t stream) {
  switch (*src.getSync()) {
    case DATA_AT_CPU:
      copyFrom(*(src.getVector(false)), stream);
      break;
    case DATA_AT_GPU:
      copyFrom(*(src.getVector(true)), stream);
      break;
    case SYNCED:
      copyFrom(*(src.getVector(false)), stream);
      copyFrom(*(src.getVector(true)), stream);
      setSync(SYNCED);
      break;
    default:
      LOG(FATAL) << "Not support";
      break;
  }
}

// Makes the CPU side current. When the data lives only on the GPU, the CPU
// side is created or resized, filled from the GPU side, and the state
// becomes SYNCED; otherwise only the presence of the CPU side is checked.
template <class T>
void CpuGpuVectorT<T>::copyToCpu() {
  switch (*this->getSync()) {
    case DATA_AT_GPU:
      CHECK(gpuVectorT_);
      this->resizeOrCreate(gpuVectorT_->getSize(), false);
      cpuVectorT_->copyFrom(*gpuVectorT_, HPPL_STREAM_DEFAULT);
      setSync(SYNCED);
      break;
    case DATA_AT_CPU:
    case SYNCED:
      CHECK(cpuVectorT_);
      break;
    default:
      LOG(FATAL) << "Not support";
      break;
  }
}

// Makes the GPU side current; mirror image of copyToCpu().
template <class T>
void CpuGpuVectorT<T>::copyToGpu() {
  switch (*this->getSync()) {
    case DATA_AT_CPU:
      CHECK(cpuVectorT_);
      this->resizeOrCreate(cpuVectorT_->getSize(), true);
      gpuVectorT_->copyFrom(*cpuVectorT_, HPPL_STREAM_DEFAULT);
      setSync(SYNCED);
      break;
    case DATA_AT_GPU:
    case SYNCED:
      CHECK(gpuVectorT_);
      break;
    default:
      LOG(FATAL) << "Not support";
      break;
  }
}

// Explicit instantiations for the two element types used by the library.
template class VectorT<real>;
template class VectorT<int>;
template class CpuVectorT<real>;
template class CpuVectorT<int>;
template class GpuVectorT<real>;
template class GpuVectorT<int>;
template class CpuGpuVectorT<real>;
template class CpuGpuVectorT<int>;

}  // namespace paddle