From 9ed6c895f1663fb33dacb9c751f1d929dfe0f5f3 Mon Sep 17 00:00:00 2001
From: jiangcheng
Date: Thu, 11 Mar 2021 14:38:58 +0800
Subject: [PATCH] optimize range op by place parameters on cpu rather than gpu,
 test=develop (#30811)

---
 paddle/fluid/operators/range_op.cu   | 28 +++++++++++++++++++++-------
 python/paddle/fluid/layers/tensor.py |  6 +++---
 2 files changed, 24 insertions(+), 10 deletions(-)

diff --git a/paddle/fluid/operators/range_op.cu b/paddle/fluid/operators/range_op.cu
index c527bc74eee..f2c78e0f70b 100644
--- a/paddle/fluid/operators/range_op.cu
+++ b/paddle/fluid/operators/range_op.cu
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <algorithm>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/range_op.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
@@ -33,13 +34,26 @@ class CUDARangeKernel : public framework::OpKernel<T> {
     auto* step_t = context.Input<framework::Tensor>("Step");
     auto* out = context.Output<framework::Tensor>("Out");
 
+    T start, end, step;
     framework::Tensor n;
-    framework::TensorCopy(*start_t, platform::CPUPlace(), &n);
-    T start = n.data<T>()[0];
-    framework::TensorCopy(*end_t, platform::CPUPlace(), &n);
-    T end = n.data<T>()[0];
-    framework::TensorCopy(*step_t, platform::CPUPlace(), &n);
-    T step = n.data<T>()[0];
+    if (::paddle::platform::is_cpu_place(start_t->place())) {
+      start = start_t->data<T>()[0];
+    } else {
+      framework::TensorCopy(*start_t, platform::CPUPlace(), &n);
+      start = n.data<T>()[0];
+    }
+    if (::paddle::platform::is_cpu_place(end_t->place())) {
+      end = end_t->data<T>()[0];
+    } else {
+      framework::TensorCopy(*end_t, platform::CPUPlace(), &n);
+      end = n.data<T>()[0];
+    }
+    if (::paddle::platform::is_cpu_place(step_t->place())) {
+      step = step_t->data<T>()[0];
+    } else {
+      framework::TensorCopy(*step_t, platform::CPUPlace(), &n);
+      step = n.data<T>()[0];
+    }
 
     int64_t size = 0;
     GetSize(start, end, step, &size);
@@ -47,7 +61,7 @@ class CUDARangeKernel : public framework::OpKernel<T> {
 
     T* out_data = out->mutable_data<T>(context.GetPlace());
     auto stream = context.cuda_device_context().stream();
-    int block = 512;
+    int block = std::min(size, static_cast<int64_t>(256));
     int grid = (size + block - 1) / block;
     RangeKernel<T><<<grid, block, 0, stream>>>(start, step, size, out_data);
   }
diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py
index cd0d652af84..84f99962e84 100644
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -1374,19 +1374,19 @@ def range(start, end, step, dtype, name=None):
 
     if not isinstance(start, Variable):
         with device_guard("cpu"):
-            start = fill_constant([1], dtype, start)
+            start = fill_constant([1], dtype, start, force_cpu=True)
     elif start.dtype != dtype:
         start = cast(start, dtype)
 
     if not isinstance(end, Variable):
         with device_guard("cpu"):
-            end = fill_constant([1], dtype, end)
+            end = fill_constant([1], dtype, end, force_cpu=True)
     elif end.dtype != dtype:
         end = cast(end, dtype)
 
     if not isinstance(step, Variable):
         with device_guard("cpu"):
-            step = fill_constant([1], dtype, step)
+            step = fill_constant([1], dtype, step, force_cpu=True)
     elif step.dtype != dtype:
         step = cast(step, dtype)
 
-- 
GitLab