From c532b967411b9e4aa89ebb5878a0a44e7f117431 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Thu, 19 Oct 2017 16:03:53 -0700 Subject: [PATCH] Remove template parameter for Tensor methods (#4937) * Remove template parameter for Tensor methods * Also check the type is correct when data() * Simplize holder_ * Fix accuracy_op * Register Code --- paddle/framework/data_type.h | 2 + paddle/framework/feed_fetch_method.h | 2 +- paddle/framework/tensor.h | 46 ++++---- paddle/framework/tensor_array.cc | 34 +++--- paddle/framework/tensor_array_test.cc | 2 +- paddle/framework/tensor_impl.h | 100 ++++++++++++++---- paddle/framework/tensor_test.cc | 32 +++--- paddle/operators/accuracy_op.cc | 7 +- paddle/operators/accuracy_op.cu | 18 ++-- paddle/operators/conv2d_op.h | 32 +++--- paddle/operators/dynamic_recurrent_op.cc | 15 ++- paddle/operators/feed_op.cc | 2 +- paddle/operators/fetch_op.cc | 2 +- paddle/operators/math/im2col_test.cc | 8 +- paddle/operators/math/math_function_test.cu | 28 ++--- .../math/selected_rows_functor_test.cu | 4 +- paddle/operators/math/vol2col_test.cc | 8 +- paddle/operators/matmul_op.h | 6 +- paddle/operators/mul_op.h | 28 ++--- paddle/operators/multiplex_op.cu | 6 +- paddle/operators/recurrent_op.cc | 4 +- paddle/operators/reshape_op.h | 4 +- paddle/operators/rnn/recurrent_op_utils.cc | 8 +- paddle/operators/scatter_op.cu | 4 +- paddle/operators/scatter_op.h | 4 +- paddle/operators/sequence_concat_op.h | 16 +-- paddle/operators/sequence_pool_op.h | 12 +-- paddle/operators/sequence_softmax_op.h | 10 +- .../softmax_with_cross_entropy_op.cu | 2 +- .../operators/softmax_with_cross_entropy_op.h | 2 +- paddle/pybind/pybind.cc | 2 + 31 files changed, 248 insertions(+), 202 deletions(-) diff --git a/paddle/framework/data_type.h b/paddle/framework/data_type.h index 649899d4257..c25a62c2b11 100644 --- a/paddle/framework/data_type.h +++ b/paddle/framework/data_type.h @@ -26,6 +26,8 @@ inline DataType ToDataType(std::type_index type) { return DataType::FP64; } else if (typeid(int).hash_code() == type.hash_code()) { return DataType::INT32; + } else if (typeid(int64_t).hash_code() == type.hash_code()) { + return DataType::INT64; } else { PADDLE_THROW("Not supported"); } diff --git a/paddle/framework/feed_fetch_method.h b/paddle/framework/feed_fetch_method.h index 9b23ad271cb..d58736dcb1c 100644 --- a/paddle/framework/feed_fetch_method.h +++ b/paddle/framework/feed_fetch_method.h @@ -34,7 +34,7 @@ void SetFeedVariable(const LoDTensor& input, const std::string& var_name, feed_inputs.resize(index + 1); } // shared data with input tensor - feed_inputs[index].ShareDataWith(input); + feed_inputs[index].ShareDataWith(input); // set lod feed_inputs[index].set_lod(input.lod()); } diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h index bc430852de6..3a2bdaf0863 100644 --- a/paddle/framework/tensor.h +++ b/paddle/framework/tensor.h @@ -60,6 +60,10 @@ class Tensor { template inline T* mutable_data(platform::Place place); + inline void* mutable_data(platform::Place place, std::type_index type); + + inline void* mutable_data(platform::Place place); + /** * @brief Return a pointer to mutable memory block. * @@ -81,7 +85,6 @@ class Tensor { inline Tensor& Resize(const DDim& dims); /*! The internal of two tensors share the same memory block. */ - template inline Tensor& ShareDataWith(const Tensor& src); /** @@ -96,26 +99,9 @@ class Tensor { // TODO(qijun): https://github.com/PaddlePaddle/Paddle/issues/4647 // Remove `CopyFrom` and `CopyFromVector` from Tensor interface // and make them global functions - template inline void CopyFrom(const Tensor& src, const platform::Place& dst_place, const platform::DeviceContext& ctx); - // FIXME(yuyang18): CopyFrom should without template T, use the replace - // `CopyFrom` with `CopyFromTensor` - inline void CopyFromTensor(const Tensor& src, - const platform::Place& dst_place, - const platform::DeviceContext& ctx) { - // NOLINTNEXTLINES_8 cpplint.py will recognize below lines as functions. - // That is a bug of cpplint.py. Just ignore lint these lines. - if (src.type() == std::type_index(typeid(double))) { - CopyFrom(src, dst_place, ctx); - } else if (src.type() == std::type_index(typeid(float))) { - CopyFrom(src, dst_place, ctx); - } else if (src.type() == std::type_index(typeid(int))) { - CopyFrom(src, dst_place, ctx); - } - } - /** * @brief Copy the content of an external vector to a tensor. * @@ -135,7 +121,6 @@ class Tensor { * @param[in] begin_idx The begin index of the slice. * @param[in] end_idx The end index of the slice. */ - template inline Tensor Slice(const int& begin_idx, const int& end_idx) const; platform::Place place() const { @@ -146,7 +131,6 @@ class Tensor { std::type_index type() const { return holder_->type(); } private: - template inline void check_memory_size() const; private: @@ -155,20 +139,22 @@ class Tensor { * parameter of Variable. */ struct Placeholder { - virtual ~Placeholder() {} + virtual ~Placeholder() = default; virtual void* ptr() const = 0; virtual size_t size() const = 0; virtual std::type_index type() const = 0; virtual platform::Place place() const = 0; + virtual void set_type(std::type_index type) = 0; }; - template + template struct PlaceholderImpl : public Placeholder { - PlaceholderImpl(Place place, size_t size) - : ptr_(static_cast(memory::Alloc(place, size)), - memory::PODDeleter(place)), + PlaceholderImpl(Place place, size_t size, std::type_index type) + : ptr_(static_cast(memory::Alloc(place, size)), + memory::PODDeleter(place)), place_(place), - size_(size) { + size_(size), + type_(type) { PADDLE_ENFORCE_NOT_NULL(ptr_, "Insufficient %s memory to allocation.", (is_cpu_place(place_) ? "CPU" : "GPU")); } @@ -176,16 +162,20 @@ class Tensor { virtual size_t size() const { return size_; } virtual platform::Place place() const { return place_; } virtual void* ptr() const { return static_cast(ptr_.get()); } - virtual std::type_index type() const { return std::type_index(typeid(T)); } + virtual std::type_index type() const { return type_; } + virtual void set_type(std::type_index type) { type_ = type; } /*! the pointer of memory block. */ - std::unique_ptr> ptr_; + std::unique_ptr> ptr_; /*! the place of memory block. */ platform::Place place_; /*! the size of memory block. */ size_t size_; + + /* the current type of memory */ + std::type_index type_; }; /*! holds the memory block if allocated. */ diff --git a/paddle/framework/tensor_array.cc b/paddle/framework/tensor_array.cc index 06459cbfd7b..4c82c363835 100644 --- a/paddle/framework/tensor_array.cc +++ b/paddle/framework/tensor_array.cc @@ -106,8 +106,8 @@ void TensorArray::Write(size_t index, const LoDTensor& value) { values_[index].Resize(value.dims()); values_[index].mutable_data(platform::CPUPlace()); - values_[index].CopyFrom(value, platform::CPUPlace(), - platform::CPUDeviceContext()); + values_[index].CopyFrom(value, platform::CPUPlace(), + platform::CPUDeviceContext()); } void TensorArray::WriteShared(size_t index, const LoDTensor& value) { @@ -116,7 +116,7 @@ void TensorArray::WriteShared(size_t index, const LoDTensor& value) { values_.resize(index + 1); } - values_[index].ShareDataWith(value); + values_[index].ShareDataWith(value); } LoDTensor TensorArray::Pack(size_t level, const std::vector& meta, @@ -163,9 +163,9 @@ LoDTensor TensorArray::Stack() const { result.mutable_data(platform::CPUPlace()); for (size_t idx = 0; idx < size(); idx++) { - result.Slice(idx, idx + 1) - .CopyFrom(Read(idx), platform::CPUPlace(), - platform::CPUDeviceContext()); + result.Slice(idx, idx + 1) + .CopyFrom(Read(idx), platform::CPUPlace(), + platform::CPUDeviceContext()); } return result; } @@ -191,13 +191,12 @@ void TensorArray::Unstack(const LoDTensor& source, bool data_shared) const { auto& value = values_[elem]; if (data_shared) { // share memory - value.ShareDataWith(source.Slice(elem, elem + 1)); + value.ShareDataWith(source.Slice(elem, elem + 1)); } else { // copy value.Resize(value_dims); - value.CopyFrom(source.Slice(elem, elem + 1), - platform::CPUPlace(), - platform::CPUDeviceContext()); + value.CopyFrom(source.Slice(elem, elem + 1), platform::CPUPlace(), + platform::CPUDeviceContext()); } } } @@ -242,11 +241,10 @@ LoDTensor DynamicBatchUnpacker::GetBatch(size_t index) { for (size_t i = 0; i < indice.size(); i++) { auto index = indice[i]; - auto target = result.Slice(i, i + 1); - auto slice = source->Slice(index, index + 1); + auto target = result.Slice(i, i + 1); + auto slice = source->Slice(index, index + 1); - target.CopyFrom(slice, platform::CPUPlace(), - platform::CPUDeviceContext()); + target.CopyFrom(slice, platform::CPUPlace(), platform::CPUDeviceContext()); } return result; @@ -277,10 +275,10 @@ LoDTensor PackDynamicBatch(const std::vector& source, // target is result[index] auto index = seq_meta.begin + batch_id; if (index >= seq_meta.end) break; - auto source_ = source[batch_id].Slice(seq_id, seq_id + 1); - auto target = result.Slice(index, index + 1); - target.CopyFrom(source_, platform::CPUPlace(), - platform::CPUDeviceContext()); + auto source_ = source[batch_id].Slice(seq_id, seq_id + 1); + auto target = result.Slice(index, index + 1); + target.CopyFrom(source_, platform::CPUPlace(), + platform::CPUDeviceContext()); } } diff --git a/paddle/framework/tensor_array_test.cc b/paddle/framework/tensor_array_test.cc index d9f52509cdd..9470ac5e6ed 100644 --- a/paddle/framework/tensor_array_test.cc +++ b/paddle/framework/tensor_array_test.cc @@ -91,7 +91,7 @@ class TensorArrayPackTester : public ::testing::Test { size_t begin = level[i]; size_t end = level[i + 1]; for (size_t j = begin; j < end; j++) { - auto record = source.Slice(j, j + 1); + auto record = source.Slice(j, j + 1); for (int dim = 0; dim < 128; dim++) { record.mutable_data(platform::CPUPlace())[dim] = j - begin; } diff --git a/paddle/framework/tensor_impl.h b/paddle/framework/tensor_impl.h index ce73e0a9edb..f6e801bbb4a 100644 --- a/paddle/framework/tensor_impl.h +++ b/paddle/framework/tensor_impl.h @@ -19,12 +19,50 @@ limitations under the License. */ namespace paddle { namespace framework { +template +struct SizeOfTypeFunctor; + template +struct SizeOfTypeFunctor { + size_t operator()(std::type_index type) const { + if (typeid(T).hash_code() == type.hash_code()) { + return sizeof(T); + } else { + return 0UL; + } + } +}; + +template <> +struct SizeOfTypeFunctor<> { + size_t operator()(std::type_index type) const { return 0UL; } +}; + +template +struct SizeOfTypeFunctor { + size_t operator()(std::type_index type) const { + SizeOfTypeFunctor head; + size_t head_size = head(type); + if (head_size != 0) { + return head_size; + } + SizeOfTypeFunctor tail; + return tail(type); + } +}; + +static inline size_t SizeOfType(std::type_index type) { + SizeOfTypeFunctor functor; + size_t size = functor(type); + PADDLE_ENFORCE(size != 0UL, "Cannot get size of type %s", type.name()); + return size; +} + inline void Tensor::check_memory_size() const { PADDLE_ENFORCE_NOT_NULL( holder_, "Tensor holds no memory. Call Tensor::mutable_data first."); PADDLE_ENFORCE_GE( - holder_->size(), numel() * sizeof(T) + offset_, + holder_->size(), numel() * SizeOfType(type()) + offset_, "Tensor's dims_ is out of bound. Call Tensor::mutable_data " "first to re-allocate memory.\n" "or maybe the required data-type mismatches the data already stored."); @@ -32,14 +70,23 @@ inline void Tensor::check_memory_size() const { template inline const T* Tensor::data() const { - check_memory_size(); + check_memory_size(); + PADDLE_ENFORCE(std::is_same::value || + holder_->type().hash_code() == typeid(T).hash_code(), + "Tensor holds the wrong type, it holds %s", + this->holder_->type().name()); + return reinterpret_cast( reinterpret_cast(holder_->ptr()) + offset_); } template inline T* Tensor::data() { - check_memory_size(); + check_memory_size(); + PADDLE_ENFORCE(std::is_same::value || + holder_->type().hash_code() == typeid(T).hash_code(), + "Tensor holds the wrong type, it holds %s", + this->holder_->type().name()); return reinterpret_cast(reinterpret_cast(holder_->ptr()) + offset_); } @@ -54,51 +101,62 @@ inline T* Tensor::mutable_data(DDim dims, platform::Place place) { template inline T* Tensor::mutable_data(platform::Place place) { static_assert(std::is_pod::value, "T must be POD"); + return reinterpret_cast(mutable_data(place, typeid(T))); +} + +inline void* Tensor::mutable_data(platform::Place place, std::type_index type) { + if (holder_ != nullptr) { + holder_->set_type(type); + } PADDLE_ENFORCE_GT(numel(), 0, "Tensor's numel must be larger than zero to call " "Tensor::mutable_data. Call Tensor::set_dim first."); + int64_t size = numel() * SizeOfType(type); /* some versions of boost::variant don't have operator!= */ - int64_t size = numel() * sizeof(T); if (holder_ == nullptr || !(holder_->place() == place) || holder_->size() < size + offset_) { if (platform::is_cpu_place(place)) { - holder_.reset(new PlaceholderImpl( - boost::get(place), size)); + holder_.reset(new PlaceholderImpl( + boost::get(place), size, type)); } else if (platform::is_gpu_place(place)) { #ifndef PADDLE_WITH_CUDA PADDLE_THROW("'GPUPlace' is not supported in CPU only device."); } #else - holder_.reset(new PlaceholderImpl( - boost::get(place), size)); + holder_.reset(new PlaceholderImpl( + boost::get(place), size, type)); } #endif offset_ = 0; } - return reinterpret_cast(reinterpret_cast(holder_->ptr()) + - offset_); + return reinterpret_cast(reinterpret_cast(holder_->ptr()) + + offset_); +} + +inline void* Tensor::mutable_data(platform::Place place) { + PADDLE_ENFORCE(this->holder_ != nullptr, + "Cannot invoke mutable data if current hold nothing"); + return mutable_data(place, holder_->type()); } -template inline Tensor& Tensor::ShareDataWith(const Tensor& src) { - src.check_memory_size(); + src.check_memory_size(); *this = src; return *this; } -template inline void Tensor::CopyFrom(const Tensor& src, const platform::Place& dst_place, const platform::DeviceContext& ctx) { - src.check_memory_size(); + src.check_memory_size(); Resize(src.dims()); auto src_place = src.holder_->place(); - auto src_ptr = static_cast(src.data()); + auto src_ptr = src.data(); - auto dst_ptr = static_cast(mutable_data(dst_place)); + auto dst_ptr = mutable_data(dst_place, src.type()); - auto size = src.numel() * sizeof(T); + auto size = src.numel() * SizeOfType(src.type()); if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) { memory::Copy(boost::get(dst_place), dst_ptr, @@ -165,9 +223,8 @@ inline void Tensor::CopyFromVector(const std::vector& src, #endif } -template inline Tensor Tensor::Slice(const int& begin_idx, const int& end_idx) const { - check_memory_size(); + check_memory_size(); PADDLE_ENFORCE_GE(begin_idx, 0, "Slice begin index is less than zero."); PADDLE_ENFORCE_LE(end_idx, dims_[0], "Slice end index is out of bound."); PADDLE_ENFORCE_LT(begin_idx, end_idx, @@ -182,7 +239,7 @@ inline Tensor Tensor::Slice(const int& begin_idx, const int& end_idx) const { DDim dst_dims = dims_; dst_dims[0] = end_idx - begin_idx; dst.Resize(dst_dims); - dst.offset_ = offset_ + begin_idx * base * sizeof(T); + dst.offset_ = offset_ + begin_idx * base * SizeOfType(type()); return dst; } } @@ -196,10 +253,9 @@ inline const DDim& Tensor::dims() const { return dims_; } inline int64_t Tensor::numel() const { return product(dims_); } -template inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) { Tensor res; - res.ShareDataWith(src); + res.ShareDataWith(src); res.Resize(flatten_to_2d(src.dims(), num_col_dims)); return res; } diff --git a/paddle/framework/tensor_test.cc b/paddle/framework/tensor_test.cc index 0b62fe08ce9..1bb0fb71b07 100644 --- a/paddle/framework/tensor_test.cc +++ b/paddle/framework/tensor_test.cc @@ -108,7 +108,7 @@ TEST(Tensor, ShareDataWith) { // Try to share data form uninitialized tensor bool caught = false; try { - dst_tensor.ShareDataWith(src_tensor); + dst_tensor.ShareDataWith(src_tensor); } catch (paddle::platform::EnforceNotMet err) { caught = true; std::string msg = @@ -122,7 +122,7 @@ TEST(Tensor, ShareDataWith) { ASSERT_TRUE(caught); src_tensor.mutable_data(make_ddim({2, 3, 4}), CPUPlace()); - dst_tensor.ShareDataWith(src_tensor); + dst_tensor.ShareDataWith(src_tensor); ASSERT_EQ(src_tensor.data(), dst_tensor.data()); } @@ -131,7 +131,7 @@ TEST(Tensor, ShareDataWith) { Tensor src_tensor; Tensor dst_tensor; src_tensor.mutable_data(make_ddim({2, 3, 4}), GPUPlace()); - dst_tensor.ShareDataWith(src_tensor); + dst_tensor.ShareDataWith(src_tensor); ASSERT_EQ(src_tensor.data(), dst_tensor.data()); } #endif @@ -143,7 +143,7 @@ TEST(Tensor, Slice) { { Tensor src_tensor; src_tensor.mutable_data(make_ddim({5, 3, 4}), CPUPlace()); - Tensor slice_tensor = src_tensor.Slice(1, 3); + Tensor slice_tensor = src_tensor.Slice(1, 3); DDim slice_dims = slice_tensor.dims(); ASSERT_EQ(arity(slice_dims), 3); EXPECT_EQ(slice_dims[0], 2); @@ -167,7 +167,7 @@ TEST(Tensor, Slice) { { Tensor src_tensor; src_tensor.mutable_data(make_ddim({6, 9}), GPUPlace()); - Tensor slice_tensor = src_tensor.Slice(2, 6); + Tensor slice_tensor = src_tensor.Slice(2, 6); DDim slice_dims = slice_tensor.dims(); ASSERT_EQ(arity(slice_dims), 2); EXPECT_EQ(slice_dims[0], 4); @@ -202,7 +202,7 @@ TEST(Tensor, CopyFrom) { memcpy(src_ptr, arr, 9 * sizeof(int)); auto cpu_place = new paddle::platform::CPUPlace(); - dst_tensor.CopyFrom(src_tensor, *cpu_place, cpu_ctx); + dst_tensor.CopyFrom(src_tensor, *cpu_place, cpu_ctx); const int* dst_ptr = dst_tensor.data(); ASSERT_NE(src_ptr, dst_ptr); @@ -210,8 +210,8 @@ TEST(Tensor, CopyFrom) { EXPECT_EQ(src_ptr[i], dst_ptr[i]); } - Tensor slice_tensor = src_tensor.Slice(1, 2); - dst_tensor.CopyFrom(slice_tensor, *cpu_place, cpu_ctx); + Tensor slice_tensor = src_tensor.Slice(1, 2); + dst_tensor.CopyFrom(slice_tensor, *cpu_place, cpu_ctx); const int* slice_ptr = slice_tensor.data(); dst_ptr = dst_tensor.data(); ASSERT_NE(dst_ptr, slice_ptr); @@ -233,11 +233,11 @@ TEST(Tensor, CopyFrom) { // CPU Tensor to GPU Tensor auto gpu_place = new paddle::platform::GPUPlace(0); CUDADeviceContext gpu_ctx(*gpu_place); - gpu_tensor.CopyFrom(src_tensor, *gpu_place, gpu_ctx); + gpu_tensor.CopyFrom(src_tensor, *gpu_place, gpu_ctx); // GPU Tensor to CPU Tensor auto cpu_place = new paddle::platform::CPUPlace(); - dst_tensor.CopyFrom(gpu_tensor, *cpu_place, gpu_ctx); + dst_tensor.CopyFrom(gpu_tensor, *cpu_place, gpu_ctx); // Sync before Compare Tensors gpu_ctx.Wait(); @@ -247,13 +247,13 @@ TEST(Tensor, CopyFrom) { EXPECT_EQ(src_ptr[i], dst_ptr[i]); } - Tensor slice_tensor = src_tensor.Slice(1, 2); + Tensor slice_tensor = src_tensor.Slice(1, 2); // CPU Slice Tensor to GPU Tensor - gpu_tensor.CopyFrom(slice_tensor, *gpu_place, gpu_ctx); + gpu_tensor.CopyFrom(slice_tensor, *gpu_place, gpu_ctx); // GPU Tensor to CPU Tensor - dst_tensor.CopyFrom(gpu_tensor, *cpu_place, gpu_ctx); + dst_tensor.CopyFrom(gpu_tensor, *cpu_place, gpu_ctx); // Sync before Compare Slice Tensors gpu_ctx.Wait(); @@ -320,7 +320,7 @@ TEST(Tensor, CopyFromVector) { CUDADeviceContext gpu_ctx(*gpu_place); gpu_tensor.CopyFromVector(src_vec, gpu_ctx); // Copy from GPU to CPU tensor for comparison - dst_tensor.CopyFrom(gpu_tensor, *cpu_place, gpu_ctx); + dst_tensor.CopyFrom(gpu_tensor, *cpu_place, gpu_ctx); // Sync before Compare Tensors gpu_ctx.Wait(); @@ -340,7 +340,7 @@ TEST(Tensor, CopyFromVector) { cpu_tensor.CopyFromVector(src_vec, cpu_ctx); gpu_tensor.Resize(make_ddim({2, 2})); gpu_tensor.CopyFromVector(src_vec, gpu_ctx); - dst_tensor.CopyFrom(gpu_tensor, *cpu_place, gpu_ctx); + dst_tensor.CopyFrom(gpu_tensor, *cpu_place, gpu_ctx); // Sync before Compare Tensors gpu_ctx.Wait(); @@ -368,7 +368,7 @@ TEST(Tensor, ReshapeToMatrix) { for (int i = 0; i < 2 * 3 * 4 * 9; ++i) { src_ptr[i] = i; } - Tensor res = ReshapeToMatrix(src, 2); + Tensor res = ReshapeToMatrix(src, 2); ASSERT_EQ(res.dims()[0], 2 * 3); ASSERT_EQ(res.dims()[1], 4 * 9); } diff --git a/paddle/operators/accuracy_op.cc b/paddle/operators/accuracy_op.cc index 037bb49abc6..e0a00ecaf04 100644 --- a/paddle/operators/accuracy_op.cc +++ b/paddle/operators/accuracy_op.cc @@ -69,5 +69,8 @@ information, or not. But the output only shares the LoD with input `Inference`. namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(accuracy, ops::AccuracyOp, ops::AccuracyOpMaker); -REGISTER_OP_CPU_KERNEL(accuracy, - ops::AccuracyKernel); +REGISTER_OP_CPU_KERNEL( + accuracy, ops::AccuracyKernel, + ops::AccuracyKernel, + ops::AccuracyKernel, + ops::AccuracyKernel); diff --git a/paddle/operators/accuracy_op.cu b/paddle/operators/accuracy_op.cu index 0ca9ef941d4..54e6ab99dc8 100644 --- a/paddle/operators/accuracy_op.cu +++ b/paddle/operators/accuracy_op.cu @@ -21,9 +21,9 @@ namespace paddle { namespace operators { using platform::PADDLE_CUDA_NUM_THREADS; -template -__global__ void AccuracyCudaKernel(const int N, const int D, const int* Xdata, - const int* labeldata, float* accuracy) { +template +__global__ void AccuracyCudaKernel(const int N, const int D, const T* Xdata, + const T* labeldata, float* accuracy) { int count = 0; __shared__ int total[BlockSize]; @@ -57,8 +57,8 @@ class AccuracyOpCUDAKernel : public framework::OpKernel { auto* accuracy = ctx.Output("Accuracy"); // FIXME(typhoonzero): only support indices currently // if add support for output values, how to detect the data type? - const int* inference_data = inference->data(); - const int* label_data = label->data(); + const T* inference_data = inference->data(); + const T* label_data = label->data(); float* accuracy_data = accuracy->mutable_data(ctx.GetPlace()); size_t num_samples = inference->dims()[0]; @@ -69,7 +69,7 @@ class AccuracyOpCUDAKernel : public framework::OpKernel { return; } - AccuracyCudaKernel<<< + AccuracyCudaKernel<<< 1, PADDLE_CUDA_NUM_THREADS, 0, reinterpret_cast( ctx.device_context()) @@ -81,5 +81,7 @@ class AccuracyOpCUDAKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -REGISTER_OP_GPU_KERNEL(accuracy, - paddle::operators::AccuracyOpCUDAKernel); +REGISTER_OP_GPU_KERNEL(accuracy, paddle::operators::AccuracyOpCUDAKernel, + paddle::operators::AccuracyOpCUDAKernel, + paddle::operators::AccuracyOpCUDAKernel, + paddle::operators::AccuracyOpCUDAKernel); diff --git a/paddle/operators/conv2d_op.h b/paddle/operators/conv2d_op.h index bd1734879ef..f629728f68d 100644 --- a/paddle/operators/conv2d_op.h +++ b/paddle/operators/conv2d_op.h @@ -108,17 +108,17 @@ class GemmConv2DKernel : public framework::OpKernel { int in_step = input_channels / groups; int out_step = output_channels / groups; for (int i = 0; i < batch_size; i++) { - Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); - Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); + Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); + Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); for (int g = 0; g < groups; g++) { // im2col - Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); + Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); im2col(context.device_context(), in_slice, col, strides[0], strides[1], paddings[0], paddings[1]); // gemm - Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); - Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); + Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); + Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); math::matmul(context.device_context(), filter_slice, false, col_matrix, false, T(1.0), &out_slice, T(0.0)); } @@ -198,22 +198,20 @@ class GemmConvGrad2DKernel : public framework::OpKernel { for (int i = 0; i < batch_size; i++) { Tensor out_grad_batch = - output_grad->Slice(i, i + 1).Resize(output_matrix_shape); - Tensor in_grad_batch = - input_grad->Slice(i, i + 1).Resize(input_shape); + output_grad->Slice(i, i + 1).Resize(output_matrix_shape); + Tensor in_grad_batch = input_grad->Slice(i, i + 1).Resize(input_shape); for (int g = 0; g < groups; g++) { // gemm Tensor out_grad_slice = - out_grad_batch.Slice(g * out_step, (g + 1) * out_step); - Tensor filter_slice = - filter.Slice(g * out_step, (g + 1) * out_step); + out_grad_batch.Slice(g * out_step, (g + 1) * out_step); + Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); math::matmul(context.device_context(), filter_slice, true, out_grad_slice, false, T(1.0), &col_matrix, T(0.0)); // col2im Tensor in_grad_slice = - in_grad_batch.Slice(g * in_step, (g + 1) * in_step); + in_grad_batch.Slice(g * in_step, (g + 1) * in_step); col2im(context.device_context(), in_grad_slice, col, strides[0], strides[1], paddings[0], paddings[1]); } @@ -229,19 +227,19 @@ class GemmConvGrad2DKernel : public framework::OpKernel { for (int i = 0; i < batch_size; i++) { Tensor out_grad_batch = - output_grad->Slice(i, i + 1).Resize(output_matrix_shape); - Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); + output_grad->Slice(i, i + 1).Resize(output_matrix_shape); + Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); for (int g = 0; g < groups; g++) { // im2col Tensor out_grad_slice = - out_grad_batch.Slice(g * out_step, (g + 1) * out_step); - Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); + out_grad_batch.Slice(g * out_step, (g + 1) * out_step); + Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); im2col(context.device_context(), in_slice, col, strides[0], strides[1], paddings[0], paddings[1]); // gemm Tensor filter_grad_slice = - filter_grad_.Slice(g * out_step, (g + 1) * out_step); + filter_grad_.Slice(g * out_step, (g + 1) * out_step); math::matmul(context.device_context(), out_grad_slice, false, col_matrix, true, T(1.0), &filter_grad_slice, T(1.0)); diff --git a/paddle/operators/dynamic_recurrent_op.cc b/paddle/operators/dynamic_recurrent_op.cc index 03f33e28d49..62962be205c 100644 --- a/paddle/operators/dynamic_recurrent_op.cc +++ b/paddle/operators/dynamic_recurrent_op.cc @@ -48,12 +48,11 @@ inline void ReorderBootState(const DySeqMetaBatch& metas, const LoDTensor& boot_state, LoDTensor* tensor, const platform::Place& dst_place) { for (size_t seq_id = 0; seq_id < metas.size(); seq_id++) { - auto slice = tensor->Slice(seq_id, seq_id + 1); + auto slice = tensor->Slice(seq_id, seq_id + 1); auto boot_slice = - boot_state.Slice(metas[seq_id].ori_idx, metas[seq_id].ori_idx + 1); + boot_state.Slice(metas[seq_id].ori_idx, metas[seq_id].ori_idx + 1); // TODO(superjom) pass in device context as an argument - slice.template CopyFrom(boot_slice, dst_place, - platform::CPUDeviceContext()); + slice.CopyFrom(boot_slice, dst_place, platform::CPUDeviceContext()); } } @@ -138,7 +137,7 @@ void DynamicRecurrentOp::WriteStepInputs() const { if (var == nullptr) { var = step_scope.Var(item.first); } - var->GetMutable()->ShareDataWith(tensor); + var->GetMutable()->ShareDataWith(tensor); } } } @@ -206,7 +205,7 @@ void DynamicRecurrentOp::ConcatOutputs() const { for (auto& item : step_outputs_) { auto tensor = item.second.Pack(level, some_meta, some_lod); auto* output = cache_.outlinks[item.first]->GetMutable(); - const_cast(output)->ShareDataWith(tensor); + const_cast(output)->ShareDataWith(tensor); } } @@ -260,8 +259,8 @@ void DynamicRecurrentOp::LinkState(const rnn::MemoryAttr& memory, } // shink and share from previous state - auto shrinked_pre_state = pre_state->Slice(0, num_instances); - state_pre.ShareDataWith(shrinked_pre_state); + auto shrinked_pre_state = pre_state->Slice(0, num_instances); + state_pre.ShareDataWith(shrinked_pre_state); } void DynamicRecurrentOp::ArgCache::Init( diff --git a/paddle/operators/feed_op.cc b/paddle/operators/feed_op.cc index bf453c85966..0f1722a5383 100644 --- a/paddle/operators/feed_op.cc +++ b/paddle/operators/feed_op.cc @@ -47,7 +47,7 @@ class FeedOp : public framework::OperatorBase { auto &feed_list = feed_var->Get(); auto &feed_item = feed_list.at(static_cast(col)); auto *out_item = out_var->GetMutable(); - out_item->CopyFromTensor(feed_item, dev_ctx.GetPlace(), dev_ctx); + out_item->CopyFrom(feed_item, dev_ctx.GetPlace(), dev_ctx); out_item->set_lod(feed_item.lod()); } }; diff --git a/paddle/operators/fetch_op.cc b/paddle/operators/fetch_op.cc index 524e77d6ad3..c1b3d66bac4 100644 --- a/paddle/operators/fetch_op.cc +++ b/paddle/operators/fetch_op.cc @@ -51,7 +51,7 @@ class FetchOp : public framework::OperatorBase { // FIXME(yuyang18): Should we assume the fetch operator always generate // CPU outputs? - dst_item.CopyFromTensor(src_item, platform::CPUPlace(), dev_ctx); + dst_item.CopyFrom(src_item, platform::CPUPlace(), dev_ctx); VLOG(3) << "Fetch variable " << fetch_var_name << " to " << out_name; } diff --git a/paddle/operators/math/im2col_test.cc b/paddle/operators/math/im2col_test.cc index 9c506ae89bd..443c94b83f0 100644 --- a/paddle/operators/math/im2col_test.cc +++ b/paddle/operators/math/im2col_test.cc @@ -64,7 +64,7 @@ void testIm2col() { if (paddle::platform::is_cpu_place(*place)) { input = input_tmp; } else { - input.CopyFrom(input_tmp, *place, *context); + input.CopyFrom(input_tmp, *place, *context); } output_cfo.mutable_data( {1, filter_size, filter_size, output_height, output_width}, *place); @@ -85,8 +85,7 @@ void testIm2col() { if (paddle::platform::is_cpu_place(*place)) { out_cfo_ptr = output_cfo.data(); } else { - output_tmp.CopyFrom(output_cfo, paddle::platform::CPUPlace(), - *context); + output_tmp.CopyFrom(output_cfo, paddle::platform::CPUPlace(), *context); out_cfo_ptr = output_tmp.data(); } EXPECT_EQ(out_cfo_ptr[0], 0); @@ -102,8 +101,7 @@ void testIm2col() { if (paddle::platform::is_cpu_place(*place)) { out_ocf_ptr = output_ocf.data(); } else { - output_tmp.CopyFrom(output_ocf, paddle::platform::CPUPlace(), - *context); + output_tmp.CopyFrom(output_ocf, paddle::platform::CPUPlace(), *context); out_ocf_ptr = output_tmp.data(); } EXPECT_EQ(out_ocf_ptr[0], 0); diff --git a/paddle/operators/math/math_function_test.cu b/paddle/operators/math/math_function_test.cu index 14359d835bb..8b22c71552a 100644 --- a/paddle/operators/math/math_function_test.cu +++ b/paddle/operators/math/math_function_test.cu @@ -16,15 +16,15 @@ TEST(math_function, notrans_mul_trans) { auto* gpu_place = new paddle::platform::GPUPlace(0); paddle::platform::CUDADeviceContext context(*gpu_place); - input1_gpu.CopyFrom(input1, *gpu_place, context); - input2_gpu.CopyFrom(input1, *gpu_place, context); + input1_gpu.CopyFrom(input1, *gpu_place, context); + input2_gpu.CopyFrom(input1, *gpu_place, context); out_gpu.mutable_data({2, 2}, *gpu_place); paddle::operators::math::matmul( context, input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0); - out.CopyFrom(out_gpu, *cpu_place, context); + out.CopyFrom(out_gpu, *cpu_place, context); float* out_ptr = out.data(); context.Wait(); @@ -50,15 +50,15 @@ TEST(math_function, trans_mul_notrans) { auto* gpu_place = new paddle::platform::GPUPlace(0); paddle::platform::CUDADeviceContext context(*gpu_place); - input1_gpu.CopyFrom(input1, *gpu_place, context); - input2_gpu.CopyFrom(input1, *gpu_place, context); + input1_gpu.CopyFrom(input1, *gpu_place, context); + input2_gpu.CopyFrom(input1, *gpu_place, context); out_gpu.mutable_data({3, 3}, *gpu_place); paddle::operators::math::matmul( context, input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0); - out.CopyFrom(out_gpu, *cpu_place, context); + out.CopyFrom(out_gpu, *cpu_place, context); float* out_ptr = out.data(); context.Wait(); @@ -99,9 +99,9 @@ TEST(math_function, gemm_notrans_cublas) { auto* gpu_place = new paddle::platform::GPUPlace(0); paddle::platform::CUDADeviceContext context(*gpu_place); - input1_gpu.CopyFrom(input1, *gpu_place, context); - input2_gpu.CopyFrom(input2, *gpu_place, context); - input3_gpu.CopyFrom(input3, *gpu_place, context); + input1_gpu.CopyFrom(input1, *gpu_place, context); + input2_gpu.CopyFrom(input2, *gpu_place, context); + input3_gpu.CopyFrom(input3, *gpu_place, context); float* a = input1_gpu.data(); float* b = input2_gpu.data(); float* c = input3_gpu.mutable_data(*gpu_place); @@ -109,7 +109,7 @@ TEST(math_function, gemm_notrans_cublas) { paddle::operators::math::gemm( context, false, false, m, n, k, 1, a, 3, b + 1, 4, 1, c + 1, 4); - input3.CopyFrom(input3_gpu, *cpu_place, context); + input3.CopyFrom(input3_gpu, *cpu_place, context); // numpy code: // a = np.arange(6).reshape(2, 3) @@ -154,9 +154,9 @@ TEST(math_function, gemm_trans_cublas) { auto* gpu_place = new paddle::platform::GPUPlace(0); paddle::platform::CUDADeviceContext context(*gpu_place); - input1_gpu.CopyFrom(input1, *gpu_place, context); - input2_gpu.CopyFrom(input2, *gpu_place, context); - input3_gpu.CopyFrom(input3, *gpu_place, context); + input1_gpu.CopyFrom(input1, *gpu_place, context); + input2_gpu.CopyFrom(input2, *gpu_place, context); + input3_gpu.CopyFrom(input3, *gpu_place, context); float* a = input1_gpu.data(); float* b = input2_gpu.data(); float* c = input3_gpu.mutable_data(*gpu_place); @@ -164,7 +164,7 @@ TEST(math_function, gemm_trans_cublas) { paddle::operators::math::gemm( context, false, true, m, n, k, 1, a, 3, b + 3, 3, 1, c + 1, 4); - input3.CopyFrom(input3_gpu, *cpu_place, context); + input3.CopyFrom(input3_gpu, *cpu_place, context); context.Wait(); EXPECT_EQ(input3_ptr[0], 0); diff --git a/paddle/operators/math/selected_rows_functor_test.cu b/paddle/operators/math/selected_rows_functor_test.cu index 8a9f25b9826..69607c5afc4 100644 --- a/paddle/operators/math/selected_rows_functor_test.cu +++ b/paddle/operators/math/selected_rows_functor_test.cu @@ -67,7 +67,7 @@ TEST(selected_rows_functor, gpu_add) { EXPECT_EQ(out_rows[6], 9); Tensor out_cpu; - out_cpu.CopyFrom(*out_value, cpu_place, ctx); + out_cpu.CopyFrom(*out_value, cpu_place, ctx); ctx.Wait(); auto* out_cpu_data = out_cpu.data(); @@ -94,7 +94,7 @@ TEST(selected_rows_functor, gpu_add) { add_tensor_functor(ctx, *output, *tensor1, tensor2.get()); Tensor tensor2_cpu; - tensor2_cpu.CopyFrom(*tensor2, cpu_place, ctx); + tensor2_cpu.CopyFrom(*tensor2, cpu_place, ctx); ctx.Wait(); auto* tensor2_cpu_data = tensor2_cpu.data(); diff --git a/paddle/operators/math/vol2col_test.cc b/paddle/operators/math/vol2col_test.cc index 2d69218843a..74590d17cd0 100644 --- a/paddle/operators/math/vol2col_test.cc +++ b/paddle/operators/math/vol2col_test.cc @@ -78,7 +78,7 @@ void testVol2col() { if (paddle::platform::is_cpu_place(*place)) { input = input_tmp; } else { - input.CopyFrom(input_tmp, *place, *context); + input.CopyFrom(input_tmp, *place, *context); } output.mutable_data({1, filter_size, filter_size, filter_size, output_depth, output_height, output_width}, @@ -93,7 +93,7 @@ void testVol2col() { if (paddle::platform::is_cpu_place(*place)) { out_cfo_ptr = output.data(); } else { - output_tmp.CopyFrom(output, paddle::platform::CPUPlace(), *context); + output_tmp.CopyFrom(output, paddle::platform::CPUPlace(), *context); out_cfo_ptr = output_tmp.data(); } @@ -107,7 +107,7 @@ void testVol2col() { if (paddle::platform::is_cpu_place(*place)) { input = input_tmp; } else { - input.CopyFrom(input_tmp, *place, *context); + input.CopyFrom(input_tmp, *place, *context); } paddle::operators::math::Col2VolFunctor col2vol; @@ -118,7 +118,7 @@ void testVol2col() { if (paddle::platform::is_cpu_place(*place)) { in_ptr = input.data(); } else { - input_tmp.CopyFrom(input, paddle::platform::CPUPlace(), *context); + input_tmp.CopyFrom(input, paddle::platform::CPUPlace(), *context); in_ptr = input_tmp.data(); } diff --git a/paddle/operators/matmul_op.h b/paddle/operators/matmul_op.h index 8ae54e1eec3..5ce30740c90 100644 --- a/paddle/operators/matmul_op.h +++ b/paddle/operators/matmul_op.h @@ -46,7 +46,7 @@ class MatMulKernel : public framework::OpKernel { template inline Tensor Reshape(const Tensor& input, const DDim& dims) { Tensor output; - output.ShareDataWith(input); + output.ShareDataWith(input); output.Resize(dims); return output; } @@ -56,7 +56,7 @@ inline Tensor Reshape(const Tensor& input, const DDim& dims) { template Tensor CombineBatchAndM(const Tensor& input) { Tensor output; - output.ShareDataWith(input); + output.ShareDataWith(input); auto in_dims = input.dims(); if (in_dims.size() == 3) { std::vector out_dims = {in_dims[0] * in_dims[1], in_dims[2]}; @@ -80,7 +80,7 @@ Tensor CombineBatchAndN(const framework::ExecutionContext& context, std::vector out_dims = {in_dims[1], in_dims[0] * in_dims[2]}; output.Resize(make_ddim(out_dims)); } else { - output.ShareDataWith(input); + output.ShareDataWith(input); } return output; } diff --git a/paddle/operators/mul_op.h b/paddle/operators/mul_op.h index 684b1ea0c0c..3f3e77595b7 100644 --- a/paddle/operators/mul_op.h +++ b/paddle/operators/mul_op.h @@ -36,12 +36,12 @@ class MulKernel : public framework::OpKernel { Tensor* z = context.Output("Out"); const Tensor x_matrix = x->dims().size() > 2 - ? framework::ReshapeToMatrix( + ? framework::ReshapeToMatrix( *x, context.template Attr("x_num_col_dims")) : *x; const Tensor y_matrix = y->dims().size() > 2 - ? framework::ReshapeToMatrix( + ? framework::ReshapeToMatrix( *y, context.template Attr("y_num_col_dims")) : *y; @@ -59,30 +59,30 @@ class MulGradKernel : public framework::OpKernel { int y_num_col_dims = ctx.template Attr("y_num_col_dims"); const Tensor* x = ctx.Input("X"); const Tensor* y = ctx.Input("Y"); - const Tensor x_matrix = - x->dims().size() > 2 ? framework::ReshapeToMatrix(*x, x_num_col_dims) - : *x; - const Tensor y_matrix = - y->dims().size() > 2 ? framework::ReshapeToMatrix(*y, y_num_col_dims) - : *y; + const Tensor x_matrix = x->dims().size() > 2 + ? framework::ReshapeToMatrix(*x, x_num_col_dims) + : *x; + const Tensor y_matrix = y->dims().size() > 2 + ? framework::ReshapeToMatrix(*y, y_num_col_dims) + : *y; const Tensor* dout = ctx.Input(framework::GradVarName("Out")); Tensor* dx = ctx.Output(framework::GradVarName("X")); Tensor* dy = ctx.Output(framework::GradVarName("Y")); if (dx) { dx->mutable_data(ctx.GetPlace()); - Tensor dx_matrix = dx->dims().size() > 2 ? framework::ReshapeToMatrix( - *dx, x_num_col_dims) - : *dx; + Tensor dx_matrix = dx->dims().size() > 2 + ? framework::ReshapeToMatrix(*dx, x_num_col_dims) + : *dx; // dx = dout * y'. dx: M x K, dout : M x N, y : K x N math::matmul(ctx.device_context(), *dout, false, y_matrix, true, 1, &dx_matrix, 0); } if (dy) { dy->mutable_data(ctx.GetPlace()); - Tensor dy_matrix = dy->dims().size() > 2 ? framework::ReshapeToMatrix( - *dy, y_num_col_dims) - : *dy; + Tensor dy_matrix = dy->dims().size() > 2 + ? framework::ReshapeToMatrix(*dy, y_num_col_dims) + : *dy; // dy = x' * dout. dy K x N, dout : M x N, x : M x K math::matmul(ctx.device_context(), x_matrix, true, *dout, false, 1, &dy_matrix, 0); diff --git a/paddle/operators/multiplex_op.cu b/paddle/operators/multiplex_op.cu index 10cb0e005f4..143a14fef57 100644 --- a/paddle/operators/multiplex_op.cu +++ b/paddle/operators/multiplex_op.cu @@ -33,8 +33,7 @@ class MultiplexGPUKernel : public framework::OpKernel { auto cols = ins[0]->numel() / rows; // copy index to cpu Tensor index_t_cpu; - index_t_cpu.CopyFrom(*ids, platform::CPUPlace(), - ctx.device_context()); + index_t_cpu.CopyFrom(*ids, platform::CPUPlace(), ctx.device_context()); auto* index = index_t_cpu.data(); auto stream = reinterpret_cast( ctx.device_context()) @@ -71,8 +70,7 @@ class MultiplexGradGPUKernel : public framework::OpKernel { auto cols = ins[0]->numel() / rows; // copy index to cpu Tensor index_t_cpu; - index_t_cpu.CopyFrom(*ids, platform::CPUPlace(), - ctx.device_context()); + index_t_cpu.CopyFrom(*ids, platform::CPUPlace(), ctx.device_context()); auto* index = index_t_cpu.data(); auto stream = reinterpret_cast( diff --git a/paddle/operators/recurrent_op.cc b/paddle/operators/recurrent_op.cc index e3d08378c2f..dcc90e5d87c 100644 --- a/paddle/operators/recurrent_op.cc +++ b/paddle/operators/recurrent_op.cc @@ -95,7 +95,7 @@ void RecurrentAlgorithm::InitMemories(Scope* step_scope) const { step_scope->FindVar(attr.boot_var)->GetMutable(); pre_mem->Resize(boot_mem->dims()); PADDLE_ENFORCE_EQ(pre_mem->dims().size(), 2); - pre_mem->ShareDataWith(*boot_mem); + pre_mem->ShareDataWith(*boot_mem); } } @@ -171,7 +171,7 @@ void RecurrentGradientAlgorithm::LinkBootMemoryGradients( auto* boot_mem_grad = step_scope->Var(attr.boot_var)->GetMutable(); boot_mem_grad->Resize(mem_grad->dims()); - boot_mem_grad->ShareDataWith(*mem_grad); + boot_mem_grad->ShareDataWith(*mem_grad); } } diff --git a/paddle/operators/reshape_op.h b/paddle/operators/reshape_op.h index 3ba4611458f..c89cdf8cab9 100644 --- a/paddle/operators/reshape_op.h +++ b/paddle/operators/reshape_op.h @@ -33,7 +33,7 @@ class ReshapeKernel : public framework::OpKernel { std::transform(shape.begin(), shape.end(), shape_int64.begin(), [](int a) { return static_cast(a); }); auto out_dims = framework::make_ddim(shape_int64); - out->CopyFrom(*in, ctx.GetPlace(), ctx.device_context()); + out->CopyFrom(*in, ctx.GetPlace(), ctx.device_context()); out->Resize(out_dims); } }; @@ -47,7 +47,7 @@ class ReshapeGradKernel : public framework::OpKernel { d_x->mutable_data(ctx.GetPlace()); auto in_dims = d_x->dims(); - d_x->CopyFrom(*d_out, ctx.GetPlace(), ctx.device_context()); + d_x->CopyFrom(*d_out, ctx.GetPlace(), ctx.device_context()); d_x->Resize(in_dims); } }; diff --git a/paddle/operators/rnn/recurrent_op_utils.cc b/paddle/operators/rnn/recurrent_op_utils.cc index 30b8ddeb5bc..d0725f50230 100644 --- a/paddle/operators/rnn/recurrent_op_utils.cc +++ b/paddle/operators/rnn/recurrent_op_utils.cc @@ -43,7 +43,7 @@ void SegmentInputs(const std::vector& step_scopes, step_scopes[j]->Var(inlinks[i])->GetMutable(); // The input of operators of each step is Tensor here. // Maybe need to modify Slice function. - *step_input = input->Slice(j, j + 1); + *step_input = input->Slice(j, j + 1); step_input->Resize(step_dims); } } @@ -71,8 +71,8 @@ void ConcatOutputs(const std::vector& step_scopes, step_scopes[j]->FindVar(outlinks[i])->GetMutable(); // TODO(luotao02) data type and platform::DeviceContext() should set // correctly - (output->Slice(j, j + 1)) - .CopyFrom(*step_output, platform::CPUPlace(), ctx); + (output->Slice(j, j + 1)) + .CopyFrom(*step_output, platform::CPUPlace(), ctx); } } } @@ -95,7 +95,7 @@ void LinkMemories(const std::vector& scopes, auto* mem = scope->FindVar(attr.pre_var)->GetMutable(); auto* linked_mem = linked_scope->FindVar(attr.var)->GetMutable(); mem->Resize(linked_mem->dims()); - mem->ShareDataWith(*linked_mem); + mem->ShareDataWith(*linked_mem); } } diff --git a/paddle/operators/scatter_op.cu b/paddle/operators/scatter_op.cu index 06f4d759447..3b32ae2fb77 100644 --- a/paddle/operators/scatter_op.cu +++ b/paddle/operators/scatter_op.cu @@ -30,7 +30,7 @@ class ScatterOpCUDAKernel : public framework::OpKernel { auto *Updates = ctx.Input("Updates"); auto *Out = ctx.Output("Out"); - Out->ShareDataWith(*Ref); + Out->ShareDataWith(*Ref); GPUScatterAssign(ctx.device_context(), *Updates, *Index, Out); } @@ -48,7 +48,7 @@ class ScatterGradOpCUDAKernel : public framework::OpKernel { auto *dOut = ctx.Input(framework::GradVarName("Out")); // In place gradient: dRef = dO - dRef->ShareDataWith(*dOut); + dRef->ShareDataWith(*dOut); dUpdates->mutable_data(ctx.GetPlace()); // Gradient by Gather: dUpdates = dO[Index] GPUGather(ctx.device_context(), *dOut, *Index, dUpdates); diff --git a/paddle/operators/scatter_op.h b/paddle/operators/scatter_op.h index 61012190064..1a4f6f99bfe 100644 --- a/paddle/operators/scatter_op.h +++ b/paddle/operators/scatter_op.h @@ -35,7 +35,7 @@ class ScatterOpKernel : public framework::OpKernel { auto *Out = ctx.Output("Out"); // In place output: Out = Ref, Out[Index] += Updates - Out->ShareDataWith(*Ref); + Out->ShareDataWith(*Ref); // Apply ScatterUpdate: Out[index] += Updates[:] ScatterAssign(ctx.device_context(), *Updates, *Index, Out); } @@ -53,7 +53,7 @@ class ScatterGradientOpKernel : public framework::OpKernel { auto *dOut = ctx.Input(framework::GradVarName("Out")); // In place gradient: dRef = dO - dRef->ShareDataWith(*dOut); + dRef->ShareDataWith(*dOut); dUpdates->mutable_data(ctx.GetPlace()); // Gradient by Gather: dUpdates += dO[Index] CPUGather(ctx.device_context(), *dOut, *Index, dUpdates); diff --git a/paddle/operators/sequence_concat_op.h b/paddle/operators/sequence_concat_op.h index a197a05bbb8..6adf96120c9 100644 --- a/paddle/operators/sequence_concat_op.h +++ b/paddle/operators/sequence_concat_op.h @@ -87,16 +87,16 @@ class SequenceConcatOpKernel : public framework::OpKernel { auto out_lod_level = out_lod[level]; for (size_t i = 0; i < out_lod_level.size() - 1; ++i) { - Tensor out_t = out->Slice(static_cast(out_lod_level[i]), - static_cast(out_lod_level[i + 1])); + Tensor out_t = out->Slice(static_cast(out_lod_level[i]), + static_cast(out_lod_level[i + 1])); auto out_stride = framework::stride(out_t.dims()); size_t offset = 0; for (size_t j = 0; j < n; ++j) { auto in_lod_level = ins[j]->lod()[level]; auto in_stride = framework::stride(ins[j]->dims()); - Tensor in_t = ins[j]->Slice(static_cast(in_lod_level[i]), - static_cast(in_lod_level[i + 1])); + Tensor in_t = ins[j]->Slice(static_cast(in_lod_level[i]), + static_cast(in_lod_level[i + 1])); size_t axis_dim = in_t.dims()[axis]; StridedMemcpy(ctx.device_context(), in_t.data(), in_stride, in_t.dims(), out_stride, out_t.data() + offset); @@ -130,8 +130,8 @@ class SequenceConcatGradOpKernel : public framework::OpKernel { for (size_t i = 0; i < out_lod_level.size() - 1; ++i) { Tensor out_grad_t = - out_grad->Slice(static_cast(out_lod_level[i]), - static_cast(out_lod_level[i + 1])); + out_grad->Slice(static_cast(out_lod_level[i]), + static_cast(out_lod_level[i + 1])); auto out_grad_stride = framework::stride(out_grad_t.dims()); size_t offset = 0; @@ -139,8 +139,8 @@ class SequenceConcatGradOpKernel : public framework::OpKernel { auto x_grad_lod_level = x_grads[j]->lod()[level]; auto x_grad_stride = framework::stride(x_grads[j]->dims()); Tensor x_grad_t = - x_grads[j]->Slice(static_cast(x_grad_lod_level[i]), - static_cast(x_grad_lod_level[i + 1])); + x_grads[j]->Slice(static_cast(x_grad_lod_level[i]), + static_cast(x_grad_lod_level[i + 1])); size_t axis_dim = x_grad_t.dims()[axis]; StridedMemcpy(ctx.device_context(), out_grad_t.data() + offset, out_grad_stride, out_grad_t.dims(), x_grad_stride, diff --git a/paddle/operators/sequence_pool_op.h b/paddle/operators/sequence_pool_op.h index a5569d1aace..0de6cafe9ca 100644 --- a/paddle/operators/sequence_pool_op.h +++ b/paddle/operators/sequence_pool_op.h @@ -64,9 +64,9 @@ class SequencePoolKernel : public framework::OpKernel { out->mutable_data(context.GetPlace()); auto place = context.GetEigenDevice(); for (int i = 0; i < static_cast(lod_level_0.size()) - 1; ++i) { - Tensor in_t = in->Slice(static_cast(lod_level_0[i]), - static_cast(lod_level_0[i + 1])); - Tensor out_t = out->Slice(i, i + 1); + Tensor in_t = in->Slice(static_cast(lod_level_0[i]), + static_cast(lod_level_0[i + 1])); + Tensor out_t = out->Slice(i, i + 1); int64_t h = static_cast(lod_level_0[i + 1] - lod_level_0[i]); auto in_e = EigenMatrix::From(in_t, framework::make_ddim({h, w})); auto out_e = EigenVector::Flatten(out_t); @@ -116,9 +116,9 @@ class SequencePoolGradKernel : public framework::OpKernel { } auto place = context.GetEigenDevice(); for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { - auto in_g_t = in_g->Slice(static_cast(lod[i]), - static_cast(lod[i + 1])); - auto out_g_t = out_g->Slice(i, i + 1); + auto in_g_t = + in_g->Slice(static_cast(lod[i]), static_cast(lod[i + 1])); + auto out_g_t = out_g->Slice(i, i + 1); int64_t h = static_cast(lod[i + 1] - lod[i]); auto in_g_e = EigenMatrix::From(in_g_t, {h, w}); auto out_g_e = EigenMatrix::From(out_g_t, {1, w}); diff --git a/paddle/operators/sequence_softmax_op.h b/paddle/operators/sequence_softmax_op.h index 96d87c404d2..3eb1e2844df 100644 --- a/paddle/operators/sequence_softmax_op.h +++ b/paddle/operators/sequence_softmax_op.h @@ -46,8 +46,8 @@ class SequenceSoftmaxKernel : public framework::OpKernel { for (int i = 0; i < static_cast(lod[level].size()) - 1; ++i) { int start_pos = static_cast(lod[level][i]); int end_pos = static_cast(lod[level][i + 1]); - Tensor x_i = x->Slice(start_pos, end_pos); - Tensor out_i = out->Slice(start_pos, end_pos); + Tensor x_i = x->Slice(start_pos, end_pos); + Tensor out_i = out->Slice(start_pos, end_pos); // Reshape from (end_pos - start_pos) x 1UL to 1UL x (end_pos - start_pos) framework::DDim dims_i = framework::make_ddim({1UL, end_pos - start_pos}); @@ -75,9 +75,9 @@ class SequenceSoftmaxGradKernel : public framework::OpKernel { int start_pos = static_cast(lod[level][i]); int end_pos = static_cast(lod[level][i + 1]); - Tensor out_i = out->Slice(start_pos, end_pos); - Tensor out_grad_i = out_grad->Slice(start_pos, end_pos); - Tensor x_grad_i = x_grad->Slice(start_pos, end_pos); + Tensor out_i = out->Slice(start_pos, end_pos); + Tensor out_grad_i = out_grad->Slice(start_pos, end_pos); + Tensor x_grad_i = x_grad->Slice(start_pos, end_pos); // Reshape from (end_pos - start_pos) x 1UL to 1UL x (end_pos - start_pos) framework::DDim dims_i = framework::make_ddim({1UL, end_pos - start_pos}); diff --git a/paddle/operators/softmax_with_cross_entropy_op.cu b/paddle/operators/softmax_with_cross_entropy_op.cu index d03a1a76585..68ac2b0ea36 100644 --- a/paddle/operators/softmax_with_cross_entropy_op.cu +++ b/paddle/operators/softmax_with_cross_entropy_op.cu @@ -85,7 +85,7 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { context.Input(framework::GradVarName("Loss"))->data(); Tensor* logit_grad = context.Output(framework::GradVarName("Logits")); - logit_grad->ShareDataWith(*context.Input("Softmax")); + logit_grad->ShareDataWith(*context.Input("Softmax")); T* logit_grad_data = logit_grad->data(); const int batch_size = logit_grad->dims()[0]; diff --git a/paddle/operators/softmax_with_cross_entropy_op.h b/paddle/operators/softmax_with_cross_entropy_op.h index 66d7bc1569e..01027cf63fc 100644 --- a/paddle/operators/softmax_with_cross_entropy_op.h +++ b/paddle/operators/softmax_with_cross_entropy_op.h @@ -57,7 +57,7 @@ class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel { const Tensor* labels = context.Input("Label"); Tensor* logit_grad = context.Output(framework::GradVarName("Logits")); - logit_grad->ShareDataWith(*context.Input("Softmax")); + logit_grad->ShareDataWith(*context.Input("Softmax")); const int class_num = logit_grad->dims()[1]; if (context.Attr("soft_label")) { diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index 84ebe3c2b84..3455c82e675 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -84,10 +84,12 @@ PYBIND11_PLUGIN(core) { .def("set", PyCPUTensorSetFromArray) .def("set", PyCPUTensorSetFromArray) .def("set", PyCPUTensorSetFromArray) + .def("set", PyCPUTensorSetFromArray) #ifdef PADDLE_WITH_CUDA .def("set", PyCUDATensorSetFromArray) .def("set", PyCUDATensorSetFromArray) .def("set", PyCUDATensorSetFromArray) + .def("set", PyCUDATensorSetFromArray) #endif .def("shape", [](Tensor &self) { return vectorize(self.dims()); }) .def("set_float_element", TensorSetElement) -- GitLab