diff --git a/paddle/operators/seq_expand_op.cc b/paddle/operators/seq_expand_op.cc
index b9633721e296c0889c6ac7d359570e9d59153e6a..7add3d60f6f42e922bc6885a8c4f3b0be0149b82 100644
--- a/paddle/operators/seq_expand_op.cc
+++ b/paddle/operators/seq_expand_op.cc
@@ -40,7 +40,7 @@ class SeqExpandOp : public framework::OperatorWithKernel {
       out_dim[0] = out_dim[0] * repeat;
     }
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of PadOp should not be null.");
+                   "Output(Out) of SeqExpandOp should not be null.");
     ctx->SetOutputDim("Out", out_dim);
   }
 };
diff --git a/paddle/operators/seq_expand_op.h b/paddle/operators/seq_expand_op.h
index e990f125127e507ed84d67a2e49443324edf37b1..d1dcc979207351a45eb2d7fd5201d282d98983d9 100644
--- a/paddle/operators/seq_expand_op.h
+++ b/paddle/operators/seq_expand_op.h
@@ -75,15 +75,37 @@ class SeqExpandKernel : public framework::OpKernel {
     T* out_data = out->mutable_data<T>(context.GetPlace());
 
     // copy data
-    Place place = boost::get<Place>(context.GetPlace());
+    auto place = context.GetPlace();
     size_t count = 0;
-    for (size_t i = 0; i < scales.size(); ++i) {
-      count = element_len * (x_lod[0][i + 1] - x_lod[0][i]);
-      for (size_t j = 0; j < scales[i]; ++j) {
-        memory::Copy(place, out_data, place, x_data, sizeof(T) * count);
-        out_data += count;
+    if (platform::is_cpu_place(place)) {
+      auto& cpu_place = boost::get<platform::CPUPlace>(place);
+      for (size_t i = 0; i < scales.size(); ++i) {
+        count = element_len * (x_lod[0][i + 1] - x_lod[0][i]);
+        for (size_t j = 0; j < scales[i]; ++j) {
+          memory::Copy(cpu_place, out_data, cpu_place, x_data,
+                       sizeof(T) * count);
+          out_data += count;
+        }
+        x_data += count;
       }
-      x_data += count;
+    } else {
+#ifdef PADDLE_WITH_CUDA
+      auto& gpu_place = boost::get<platform::GPUPlace>(place);
+      auto stream = reinterpret_cast<const platform::CUDADeviceContext&>(
+                        context.device_context())
+                        .stream();
+      for (size_t i = 0; i < scales.size(); ++i) {
+        count = element_len * (x_lod[0][i + 1] - x_lod[0][i]);
+        for (size_t j = 0; j < scales[i]; ++j) {
+          memory::Copy(gpu_place, out_data, gpu_place, x_data,
+                       sizeof(T) * count, stream);
+          out_data += count;
+        }
+        x_data += count;
+      }
+#else
+      PADDLE_THROW("Paddle is not compiled with GPU");
+#endif
     }
 
     out->set_lod(out_lod);
@@ -113,7 +135,7 @@ class SeqExpandGradKernel : public framework::OpKernel {
       Eigen::TensorMap<Eigen::Tensor<T, 1>> d_x_t(
           d_x_data, static_cast<int>((ele_count * element_len) / repeat));
       auto place = context.GetEigenDevice<Place>();
-      d_x_t.device(place) = d_out_t.sum(Eigen::array<int, 1>({0}));
+      d_x_t.device(place) = d_out_t.sum(Eigen::array<int, 1>({{0}}));
       d_out_data += (ele_count * element_len);
       d_x_data += ((ele_count * element_len) / repeat);
     }