diff --git a/paddle/framework/lod_tensor.cc b/paddle/framework/lod_tensor.cc
index 3f2c2e64eb55e69a0aa8d9b17fd024a03fb35558..b29f528f3f749efa3463125c774c2f4d4ebcbc7c 100644
--- a/paddle/framework/lod_tensor.cc
+++ b/paddle/framework/lod_tensor.cc
@@ -289,14 +289,15 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor,
 std::vector<LoDTensor> LoDTensor::SplitLoDTensor(
     const std::vector<platform::Place> places) const {
   check_memory_size();
-  PADDLE_ENFORCE(lod().empty(), "Disable parallel lod for now");
-  size_t result_size = std::min(static_cast<size_t>(dims()[0]), places.size());
-  size_t remainder = dims()[0] % places.size();
+  int batch_size =
+      lod().empty() ? dims()[0] : static_cast<int>(lod()[0].size()) - 1;
+  size_t result_size = std::min(static_cast<size_t>(batch_size), places.size());
+  size_t remainder = batch_size % places.size();
 
   std::vector<LoDTensor> results;
   results.reserve(result_size);
 
-  int step_width = static_cast<int>(dims()[0] / result_size);
+  int step_width = static_cast<int>(batch_size / result_size);
   for (size_t i = 0; i < result_size; ++i) {
     int begin = static_cast<int>(i * step_width);
     int end = static_cast<int>((i + 1) * step_width);
@@ -307,14 +308,14 @@ std::vector<LoDTensor> LoDTensor::SplitLoDTensor(
     LoDTensor dst;
     if (lod().empty()) {
       auto src = Slice(begin, end);
-      auto &dst_place = places[place_idx];
+      auto &dst_place = places[i];
       framework::Copy(src, dst_place, &dst);
     } else {
       auto lod_and_offset = GetSubLoDAndAbsoluteOffset(lod(), begin, end, 0);
 
       auto &offset = lod_and_offset.second;
       auto src = Slice(offset.first, offset.second);
-      auto &dst_place = places[place_idx];
+      auto &dst_place = places[i];
       framework::Copy(src, dst_place, &dst);
 
       LoD my_lod;
@@ -327,7 +328,7 @@ std::vector<LoDTensor> LoDTensor::SplitLoDTensor(
       }
       dst.set_lod(my_lod);
     }
-    lods.emplace_back(dst);
+    results.emplace_back(dst);
   }
 
   return results;
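
Note on the logic this diff introduces, with a standalone sketch. With the PADDLE_ENFORCE removed, the unit of splitting becomes the batch element: for a plain tensor that is a row (dims()[0]), but for a LoD tensor it is a top-level sequence, of which there are lod()[0].size() - 1 (the level stores offsets with a leading zero). Each of the result_size places receives step_width consecutive elements, the last place also absorbs the remainder, and a split chunk's sub-LoD must be rebased to start at zero, which is what the v.push_back(ll + v.back()) loop does, accumulating the length-form sub-LoD from GetSubLoDAndAbsoluteOffset back into offsets. The sketch below mirrors that arithmetic without any Paddle dependency; the helper names (BatchSize, LengthsToOffsets) and the toy LoD values are illustrative assumptions, not Paddle API.

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

using LoDLevel = std::vector<size_t>;  // offset-based level, e.g. {0, 2, 5, 7}

// Number of batch elements: rows for a plain tensor, top-level sequences
// (offset count minus the leading zero) when a LoD is present.
int BatchSize(const std::vector<LoDLevel> &lod, int dim0) {
  return lod.empty() ? dim0 : static_cast<int>(lod[0].size()) - 1;
}

// Turn per-sequence lengths into offsets that start at zero, mirroring the
// `v.push_back(ll + v.back())` loop in the diff.
LoDLevel LengthsToOffsets(const std::vector<size_t> &lengths) {
  LoDLevel v{0};
  for (size_t len : lengths) v.push_back(len + v.back());
  return v;
}

int main() {
  // Toy level-0 LoD: three sequences occupying rows [0,2), [2,5), [5,7).
  std::vector<LoDLevel> lod = {{0, 2, 5, 7}};
  int dim0 = 7;           // rows in the underlying tensor
  size_t num_places = 2;  // pretend we split across two devices

  int batch_size = BatchSize(lod, dim0);  // 3 sequences, not 7 rows
  size_t result_size = std::min(static_cast<size_t>(batch_size), num_places);
  size_t remainder = batch_size % num_places;
  int step_width = static_cast<int>(batch_size / result_size);

  for (size_t i = 0; i < result_size; ++i) {
    int begin = static_cast<int>(i * step_width);
    int end = static_cast<int>((i + 1) * step_width);
    if (i + 1 == num_places) end += remainder;  // last chunk absorbs the rest

    // Absolute row range for sequences [begin, end), and their lengths.
    size_t row_begin = lod[0][begin], row_end = lod[0][end];
    std::vector<size_t> lengths;
    for (int s = begin; s < end; ++s) {
      lengths.push_back(lod[0][s + 1] - lod[0][s]);
    }

    LoDLevel sub = LengthsToOffsets(lengths);  // rebased to start at 0
    std::cout << "chunk " << i << ": rows [" << row_begin << ", " << row_end
              << "), lod = {";
    for (size_t off : sub) std::cout << off << " ";
    std::cout << "}\n";
  }
  return 0;
}

Running the sketch prints chunk 0 covering rows [0, 2) with lod {0 2} and chunk 1 covering rows [2, 7) with lod {0 3 5}: the remainder sequence lands on the last place, and each chunk's LoD again starts at zero, which is exactly the splitting behavior the replaced PADDLE_ENFORCE used to forbid.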