Unverified commit 155ebbb9, authored by Yu Yang, committed by GitHub

Merge pull request #13449 from chengduoZH/speed_up_lod_tensor_to_array

Speed up lod_tensor to array and array to lod_tensor
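
Both ops are rewritten around the same idea: rather than issuing one `framework::TensorCopy` per sub-sequence range inside the loop, they now collect all source slices first and hand them to a single batched `math::ConcatFunctor` (array to LoDTensor) or `math::ConcatGradFunctor` (the split back into an array), so the device executes one fused copy instead of many small ones. A minimal sketch of that gather-then-concat restructuring, with simplified stand-in types (`View`, `Concat`, and `CopyRanges` are illustrative only, not Paddle APIs):

```cpp
// Sketch only: the loop merely records views of the source; a single Concat
// then moves all the data, which on GPU corresponds to one kernel launch
// instead of one launch (or memcpy) per range.
#include <cstddef>
#include <utility>
#include <vector>

struct View {
  const float *data;
  size_t len;
};

void Concat(const std::vector<View> &ins, std::vector<float> *out) {
  for (const auto &v : ins) out->insert(out->end(), v.data, v.data + v.len);
}

std::vector<float> CopyRanges(
    const std::vector<float> &src,
    const std::vector<std::pair<size_t, size_t>> &ranges) {
  std::vector<View> gathered;
  for (const auto &r : ranges) {
    // Gather phase: no data is copied here, only a view is recorded.
    gathered.push_back({src.data() + r.first, r.second - r.first});
  }
  std::vector<float> out;
  Concat(gathered, &out);  // all copying happens in one batched call
  return out;
}
```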
@@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#include <paddle/fluid/operators/math/concat.h>
 #include <numeric>
 #include "paddle/fluid/framework/lod_rank_table.h"
@@ -24,6 +25,50 @@ namespace operators {
 
 using LoD = framework::LoD;
 
+class ArrayToLoDFunctor;
+
+template <typename DeviceContext>
+struct ArrayToLoDFunctorImpl {
+  const ArrayToLoDFunctor *prev_functor_;
+  DeviceContext *dev_ctx_;
+
+  template <typename T>
+  void apply();
+};
+
+struct ArrayToLoDFunctor : public boost::static_visitor<void> {
+  std::vector<framework::Tensor> in;
+  mutable framework::Tensor *out;
+
+  template <typename Place>
+  void operator()(Place place) const {
+    auto &pool = platform::DeviceContextPool::Instance();
+    if (std::is_same<Place, platform::CPUPlace>::value) {
+      Apply(static_cast<platform::CPUDeviceContext *>(pool.Get(place)));
+    } else {
+#ifdef PADDLE_WITH_CUDA
+      Apply(static_cast<platform::CUDADeviceContext *>(pool.Get(place)));
+#else
+      PADDLE_THROW("Fluid is not compiled with CUDA");
+#endif
+    }
+  }
+
+  template <typename DeviceContext>
+  void Apply(DeviceContext *dev_ctx) const {
+    ArrayToLoDFunctorImpl<DeviceContext> functor;
+    functor.dev_ctx_ = dev_ctx;
+    functor.prev_functor_ = this;
+    framework::VisitDataType(framework::ToDataType(out->type()), functor);
+  }
+};
+
+template <typename DeviceContext>
+template <typename T>
+void ArrayToLoDFunctorImpl<DeviceContext>::apply() {
+  math::ConcatFunctor<DeviceContext, T> func;
+  func(*dev_ctx_, prev_functor_->in, 0, prev_functor_->out);
+}
+
 class ArrayToLoDTensorOp : public framework::OperatorBase {
  public:
   ArrayToLoDTensorOp(const std::string &type,
@@ -47,14 +92,18 @@ class ArrayToLoDTensorOp : public framework::OperatorBase {
     int rank = x[0].dims().size();
     platform::Place place = x[0].place();
     std::type_index data_type = x[0].type();
-    framework::DDim ins_dims = framework::slice_ddim(x[0].dims(), 1, rank);
     int64_t batch_size = x[0].dims()[0];
+    framework::DDim ins_dims = rank > 1
+                                   ? framework::slice_ddim(x[0].dims(), 1, rank)
+                                   : framework::make_ddim({0});
     for (size_t i = 1; i < x.size(); ++i) {
-      PADDLE_ENFORCE_EQ(framework::slice_ddim(x[i].dims(), 1, rank), ins_dims,
+      auto ins_i_dims = rank > 1 ? framework::slice_ddim(x[i].dims(), 1, rank)
+                                 : framework::make_ddim({0});
+      PADDLE_ENFORCE_EQ(ins_i_dims, ins_dims,
                         "The dimension of the %zu'th element in LoDTensorArray "
                         "differs from previous ones.",
                         i);
-      PADDLE_ENFORCE(platform::places_are_same_class(x[i].place(), place),
+      PADDLE_ENFORCE(x[i].place() == place,
                      "The place class of the %zu'th element in LoDTensorArray "
                      "differs from previous ones.",
                      i);
@@ -82,13 +131,14 @@ class ArrayToLoDTensorOp : public framework::OperatorBase {
     // Build LoDTensor `out`
     framework::LoD *out_lod = out->mutable_lod();
     out_lod->clear();
-    size_t out_offset = 0;
     auto prefix_lod = rank_table.coarse_lod();
     prefix_lod.emplace_back();
     auto &cur_level_lod = prefix_lod.back();
     cur_level_lod.push_back(0);
+    ArrayToLoDFunctor functor;
     for (size_t idx : table_item_idx) {
       cur_level_lod.push_back(cur_level_lod.back() + table_items[idx].length);
+      PADDLE_ENFORCE_LE(table_items[idx].length, x.size());
       for (size_t x_idx = 0; x_idx < table_items[idx].length; ++x_idx) {
         auto lod_and_offset = framework::GetSubLoDAndAbsoluteOffset(
             x[x_idx].lod(), idx, idx + 1, 0);
@@ -106,17 +156,11 @@ class ArrayToLoDTensorOp : public framework::OperatorBase {
         if (len == 0) {
           continue;
         }
-        auto slice = out->Slice(out_offset, out_offset + len);
-        platform::DeviceContextPool &pool =
-            platform::DeviceContextPool::Instance();
-        auto &dev_ctx = *pool.Get(place);
-        framework::TensorCopy(x[x_idx].Slice(start_offset, end_offset), place,
-                              dev_ctx, &slice);
-        out_offset += len;
+        functor.in.emplace_back(x[x_idx].Slice(start_offset, end_offset));
       }
     }
+    functor.out = out;
+    platform::VisitPlace(place, functor);
     out_lod->insert(out_lod->begin(), prefix_lod.begin(), prefix_lod.end());
   }
 };
......
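
The `ArrayToLoDFunctor` introduced above performs a two-level dispatch: `platform::VisitPlace` resolves the runtime `Place` to a concrete device context (CPU or CUDA), and `framework::VisitDataType` then resolves the element type so the fully typed `math::ConcatFunctor<DeviceContext, T>` can run. A self-contained sketch of that pattern, with simplified stand-ins (the type list and `VisitDataType` here are illustrative, not Paddle's real definitions):

```cpp
#include <iostream>
#include <typeindex>
#include <typeinfo>

struct CPUDeviceContext {};  // stand-in for platform::CPUDeviceContext

// Stage-2 visitor: once both DeviceContext and T are known at compile time,
// the typed concat (math::ConcatFunctor<DeviceContext, T> in Paddle) can run.
template <typename DeviceContext>
struct ConcatDispatcher {
  DeviceContext *dev_ctx_;
  template <typename T>
  void apply() {
    std::cout << "would run ConcatFunctor<Ctx, " << typeid(T).name() << ">\n";
  }
};

// Stand-in for framework::VisitDataType: maps a runtime type id to a
// compile-time T by calling visitor.apply<T>().
template <typename Visitor>
void VisitDataType(std::type_index type, Visitor visitor) {
  if (type == std::type_index(typeid(float))) {
    visitor.template apply<float>();
  } else if (type == std::type_index(typeid(double))) {
    visitor.template apply<double>();
  }
}

int main() {
  CPUDeviceContext ctx;  // stage 1 (place -> context) already resolved
  VisitDataType(typeid(float), ConcatDispatcher<CPUDeviceContext>{&ctx});
}
```

The next hunk is a separate fix to the cross-entropy gradient functor bundled into the same merge.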
@@ -86,8 +86,8 @@ class XeGradFunctor {
     auto x_is_true_offset = sample_id * num_classes_ + label_[sample_id];
     for (size_t x_offset = sample_id * num_classes_;
          x_offset < (sample_id + 1) * num_classes_; ++x_offset) {
-      dx_[x_offset] =
-          (x_offset != x_is_true_offset || label_[sample_id] == ignore_index_)
+      dx_[x_offset] = (x_offset != x_is_true_offset ||
+                       label_[sample_id] == static_cast<int64_t>(ignore_index_))
               ? static_cast<T>(0)
               : -dy_[sample_id] / x_[x_offset];
     }
......
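
The hunk above adds `static_cast<int64_t>(ignore_index_)` so the `int64_t` label is compared against `ignore_index_` without a mixed-type implicit conversion, which can misfire when the two integer types differ in width or signedness. A minimal single-row sketch of the guarded gradient this functor computes (`XeGradRow` is a hypothetical helper, not Paddle code):

```cpp
#include <cstdint>
#include <vector>

// dx[j] = -dy / x[j] if j is the true class and the sample is not ignored,
// and 0 otherwise, matching the rule in the XeGradFunctor loop above.
std::vector<float> XeGradRow(const std::vector<float> &x, float dy,
                             int64_t label, int64_t ignore_index) {
  std::vector<float> dx(x.size(), 0.0f);
  if (label == ignore_index) return dx;  // ignored sample: zero gradient
  dx[static_cast<size_t>(label)] = -dy / x[static_cast<size_t>(label)];
  return dx;
}
```

The remaining hunks rework `LoDTensorToArrayOp` in the same gather-then-batch style.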
@@ -11,10 +11,13 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#include <algorithm>
+#include <map>
 #include "paddle/fluid/framework/lod_rank_table.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/detail/safe_ref.h"
+#include "paddle/fluid/operators/math/concat.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/port.h"
@@ -26,6 +29,61 @@ struct CopyRange {
   size_t end;
 };
 
+struct LoDTensorToArrayFunctor;
+
+template <typename DeviceContext>
+struct LoDTensorToArrayFunctorImpl {
+  const LoDTensorToArrayFunctor *prev_functor_;
+  DeviceContext *dev_ctx_;
+
+  template <typename T>
+  void apply();
+};
+
+struct LoDTensorToArrayFunctor : public boost::static_visitor<void> {
+  std::vector<const framework::Tensor *> ref_inputs_;
+  mutable std::vector<framework::Tensor *> outputs_;
+  const framework::Tensor &input_;
+
+  explicit LoDTensorToArrayFunctor(const framework::Tensor &input)
+      : input_(input) {}
+
+  void AddOutput(framework::Tensor *t) {
+    outputs_.emplace_back(t);
+    ref_inputs_.emplace_back(t);
+  }
+
+  template <typename Place>
+  void operator()(Place place) const {
+    auto &pool = platform::DeviceContextPool::Instance();
+    auto *dev_ctx = pool.Get(place);
+    if (std::is_same<Place, platform::CPUPlace>::value) {
+      Apply(static_cast<platform::CPUDeviceContext *>(dev_ctx));
+    } else {
+#ifdef PADDLE_WITH_CUDA
+      Apply(static_cast<platform::CUDADeviceContext *>(dev_ctx));
+#else
+      PADDLE_THROW("Not compiled with cuda");
+#endif
+    }
+  }
+
+  template <typename DeviceContext>
+  void Apply(DeviceContext *dev_ctx) const {
+    LoDTensorToArrayFunctorImpl<DeviceContext> func;
+    func.prev_functor_ = this;
+    func.dev_ctx_ = dev_ctx;
+    framework::VisitDataType(framework::ToDataType(input_.type()), func);
+  }
+};
+
+template <typename DeviceContext>
+template <typename T>
+void LoDTensorToArrayFunctorImpl<DeviceContext>::apply() {
+  math::ConcatGradFunctor<DeviceContext, T> func;
+  func(*dev_ctx_, prev_functor_->input_, prev_functor_->ref_inputs_, 0,
+       &prev_functor_->outputs_);
+}
+
 class LoDTensorToArrayOp : public framework::OperatorBase {
  public:
   LoDTensorToArrayOp(const std::string &type,
@@ -72,6 +130,11 @@ class LoDTensorToArrayOp : public framework::OperatorBase {
         copy_ranges[t].emplace_back(CopyRange{start_offset, end_offset});
       }
     }
+
+    auto &outputs = *const_cast<framework::Scope &>(scope)
+                         .Var()
+                         ->GetMutable<std::map<size_t, framework::Tensor>>();
+
     for (size_t i = 0; i < max_seq_len; ++i) {
       auto &ranges = copy_ranges[i];
       size_t height = std::accumulate(
@@ -90,17 +153,16 @@ class LoDTensorToArrayOp : public framework::OperatorBase {
         // out[i][offset: offset+len] = x[each_range.begin: each_range.end]
         auto slice = out[i].Slice(static_cast<int>(offset),
                                   static_cast<int>(offset + len));
-        platform::DeviceContextPool &pool =
-            platform::DeviceContextPool::Instance();
-        auto &dev_ctx = *pool.Get(place);
-        framework::TensorCopy(x.Slice(static_cast<int>(each_range.begin),
-                                      static_cast<int>(each_range.end)),
-                              x.place(), dev_ctx, &slice);
+        outputs.insert({each_range.begin, slice});
         offset += len;
       }
     }
+
+    LoDTensorToArrayFunctor functor(x);
+    for (auto &out_pair : outputs) {
+      functor.AddOutput(&out_pair.second);
+    }
+    platform::VisitPlace(place, functor);
   }
 };
......
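
`LoDTensorToArrayOp` now runs the opposite data movement in one batch: `math::ConcatGradFunctor` scatters contiguous row ranges of the fused input back into the per-step output slices. Keying the temporary `outputs` map by `each_range.begin` means iteration visits slices in source order, which is exactly what a split along axis 0 needs. A simplified sketch of that split (stand-in types, not the real functor signature):

```cpp
#include <cstddef>
#include <vector>

// Scatter rows of `in` into consecutive output buffers; each output is
// pre-sized to the element count it should receive, mirroring how the
// ref_inputs_ shapes drive ConcatGradFunctor.
void SplitRows(const std::vector<float> &in,
               std::vector<std::vector<float>> *outs) {
  size_t offset = 0;
  for (auto &out : *outs) {
    for (auto &v : out) v = in[offset++];  // one pass, in source order
  }
}
```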