提交 0be1e09f 编写于 作者: D dzhwinter

"fix ci"

上级 5447046a
......@@ -84,13 +84,12 @@ class SequenceExpandOp : public framework::OperatorWithKernel {
}
}
out_dims[0] = out_first_dim;
ctx->SetOutputDim("Out", out_dims);
} else {
out_dims[0] = -1;
}
ctx->SetOutputDim("Out", out_dims);
ctx->ShareLoD("X", /*->*/ "Out");
}
}
};
class SequenceExpandOpMaker : public framework::OpProtoAndCheckerMaker {
......
......@@ -24,123 +24,128 @@ namespace operators {
using LoDTensor = framework::LoDTensor;
template <typename T>
__global__ void sequence_expand_kernel(const T* x_data, T* out_data,
const size_t* lod,
const size_t* out_offset,
size_t lod_size, size_t element_len,
size_t x_size) {
int bid_x = blockIdx.x;
if (bid_x > lod_size) return;
int repeats = lod[bid_x];
int offset = out_offset[bid_x];
for (int tid_y = threadIdx.y; tid_y < repeats; tid_y += blockDim.y) {
for (int tid_x = threadIdx.x; tid_x < element_len; tid_x += blockDim.x) {
out_data[(offset + tid_y) * element_len + tid_x] =
x_data[bid_x * element_len + tid_x];
__global__ void sequence_expand_kernel(const T* x_data, const size_t* x_lod,
const size_t* ref_lod,
const size_t lod_size,
/* default=1,
the instance length*/
const int x_item_length, T* out_data) {
constexpr int N = 1024;
__shared__ int mem[N];
int offset = 0;
for (int i = 0; i < lod_size; ++i) {
mem[i] = offset;
if (i < lod_size - 1) {
offset += (ref_lod[i + 1] - ref_lod[i]) * (x_lod[i + 1] - x_lod[i]);
}
}
}
__syncthreads();
template <typename T>
__global__ void sequence_expand_grad_kernel(const T* dout_data, T* dx_data,
const size_t* lod,
const size_t* out_offset,
size_t lod_size, size_t element_len,
size_t dout_size, size_t dx_size) {
// Stage both buffers in shared memory to reduce global memory accesses.
// dout_shm = shm[0, dout_size), dx_shm = shm[dout_size, dout_size + dx_size)
if (blockIdx.x == 0 && blockIdx.y == 0 && threadIdx.x == 0 &&
threadIdx.y == 0) {
printf("lod_size=%ld, element_size=%ld, dout_size=%ld, dx_size=%ld\n",
lod_size, element_len, dout_size, dx_size);
}
extern __shared__ T shm[];
T* dout_shm = shm;
T* dx_shm = &shm[dout_size];
// int idx = threadIdx.x + blockIdx.x * blockDim.x;
for (int idx = 0; idx < dout_size; ++idx) {
if (idx < dx_size) {
dx_shm[idx] = 0.0;
int bid = blockIdx.x;
if (bid >= lod_size - 1) return;
int x_item_count = x_lod[bid + 1] - x_lod[bid];
int repeats = ref_lod[bid + 1] - ref_lod[bid];
int out_offset = mem[bid];
int x_offset = x_lod[bid];
for (int tid_z = threadIdx.z; tid_z < repeats; tid_z += blockDim.z) {
for (int tid_y = threadIdx.y; tid_y < x_item_count; tid_y += blockDim.y) {
for (int tid_x = threadIdx.x; tid_x < x_item_length;
tid_x += blockDim.x) {
out_data[(out_offset + tid_z * x_item_count + tid_y) * x_item_length +
tid_x] = x_data[(x_offset + tid_y) * x_item_length + tid_x];
}
if (idx < dout_size) {
dout_shm[idx] = dout_data[idx];
}
}
}
int bid_x = blockIdx.x;
if (bid_x > lod_size) return;
int repeats = lod[bid_x];
int offset = out_offset[bid_x];
if (threadIdx.x == 0) {
printf("repeats=%d, offset=%ld\n", repeats, offset);
}
for (int tid_y = threadIdx.y; tid_y < repeats; tid_y += blockDim.y) {
for (int tid_x = threadIdx.x; tid_x < element_len; tid_x += blockDim.x) {
T val = dout_shm[(offset + tid_y) * element_len + tid_x];
platform::CudaAtomicAdd(&dx_shm[bid_x * element_len + tid_x], val);
int dx_idx = bid_x * element_len + tid_x;
int dout_idx = (offset + tid_y) * element_len + tid_x;
printf("dx_idx=%d, dout_idx=%d, dx_data=%f, dout_data=%f, val=%f \n",
dx_idx, dout_idx, dx_shm[dx_idx], dout_shm[dout_idx], val);
template <typename T>
__global__ void sequence_expand_grad_kernel(const T* dout_data,
const size_t* ref_lod,
const size_t* dx_lod,
const size_t lod_size,
/* default=1,
the instance length*/
const int x_item_length,
T* dx_data) {
// TODO(dzhwinter): too many atomicAdd calls (one per dout element);
// accumulate in shared memory first to reduce global memory accesses.
constexpr int N = 1024;
__shared__ int mem[N];
int offset = 0;
for (int i = 0; i < lod_size; ++i) {
mem[i] = offset;
if (i < lod_size - 1) {
offset += (ref_lod[i + 1] - ref_lod[i]) * (dx_lod[i + 1] - dx_lod[i]);
}
}
__syncthreads();
// copy shared memory back to dx
for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < dx_size;
idx += blockDim.x * gridDim.x) {
dx_data[idx] = dx_shm[idx];
int bid = blockIdx.x;
if (bid >= lod_size - 1) return;
int x_item_count = dx_lod[bid + 1] - dx_lod[bid];
int repeats = ref_lod[bid + 1] - ref_lod[bid];
int out_offset = mem[bid];
int x_offset = dx_lod[bid];
for (int tid_z = threadIdx.z; tid_z < repeats; tid_z += blockDim.z) {
for (int tid_y = threadIdx.y; tid_y < x_item_count; tid_y += blockDim.y) {
for (int tid_x = threadIdx.x; tid_x < x_item_length;
tid_x += blockDim.x) {
platform::CudaAtomicAdd(
&dx_data[(x_offset + tid_y) * x_item_length + tid_x],
dout_data[(out_offset + tid_z * x_item_count + tid_y) *
x_item_length +
tid_x]);
}
}
}
}
template <typename T>
struct SequenceExpandFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& context,
const LoDTensor& x, LoDTensor* out) {
auto x_dims = x.dims();
size_t element_len = framework::product(x_dims) / x_dims[0];
auto lod = out->lod().back();
framework::Vector<size_t> out_lod;
for (size_t i = 0; i < lod.size() - 1; ++i) {
out_lod.push_back(lod[i + 1] - lod[i]);
}
int thread_x = std::max(static_cast<int>(element_len), 32);
int block_x = static_cast<int>(out_lod.size());
dim3 block_size(thread_x, 1024 / thread_x);
void operator()(
const platform::CUDADeviceContext& context, const LoDTensor& x,
const framework::Vector<size_t>& x_lod, /*expand source lod*/
const framework::Vector<size_t>& ref_lod, /*expand referenced lod*/
LoDTensor* out) {
int x_item_length = 1;
x_item_length = x.numel() / x.dims()[0];
VLOG(0) << "x_item_length" << x_item_length;
int thread_x = std::max(static_cast<int>(ref_lod.size()), 32);
int thread_y = std::max(1024 / thread_x, 16);
int thread_z = std::min(1024 / thread_x / thread_y, 16);
int block_x = static_cast<int>(ref_lod.size());
dim3 block_size(thread_x, thread_y, thread_z);
dim3 grid_size(block_x, 1);
sequence_expand_kernel<<<grid_size, block_size, 0, context.stream()>>>(
x.data<T>(), out->mutable_data<T>(context.GetPlace()),
out_lod.CUDAData(context.GetPlace()), lod.CUDAData(context.GetPlace()),
out_lod.size(), element_len, framework::product(x_dims));
x.data<T>(), x_lod.CUDAData(context.GetPlace()),
ref_lod.CUDAData(context.GetPlace()), x_lod.size(), x_item_length,
out->mutable_data<T>(context.GetPlace()));
}
};
template <typename T>
struct SequenceExpandGradFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& context,
const LoDTensor& x, const LoDTensor& out,
const LoDTensor& dout, LoDTensor* dx) {
auto x_dims = x.dims();
size_t element_len = framework::product(x_dims) / x_dims[0];
auto lod = out.lod().back();
framework::Vector<size_t> out_lod;
for (size_t i = 0; i < lod.size() - 1; ++i) {
out_lod.push_back(lod[i + 1] - lod[i]);
}
size_t dout_size = framework::product(dout.dims());
size_t dx_size = framework::product(dx->dims());
int thread_x = std::max(static_cast<int>(element_len), 32);
dim3 block_size(thread_x, 1024 / thread_x);
int block_x = static_cast<int>(out_lod.size());
const LoDTensor& dout,
const framework::Vector<size_t>& x_lod, /*expand source lod*/
const framework::Vector<size_t>& ref_lod, /*expand based lod*/
LoDTensor* dx) {
int x_item_length = 1;
x_item_length = framework::product(dx->dims()) / dx->dims()[0];
int thread_x = std::max(static_cast<int>(ref_lod.size()), 32);
int thread_y = std::max(1024 / thread_x, 16);
int thread_z = std::min(1024 / thread_x / thread_y, 16);
int block_x = static_cast<int>(ref_lod.size());
dim3 block_size(thread_x, thread_y, thread_z);
dim3 grid_size(block_x, 1);
sequence_expand_grad_kernel<<<grid_size, block_size,
(dout_size + dx_size) * sizeof(T),
context.stream()>>>(
dout.data<T>(), dx->mutable_data<T>(context.GetPlace()),
out_lod.CUDAData(context.GetPlace()), lod.CUDAData(context.GetPlace()),
out_lod.size(), element_len, dout_size, dx_size);
sequence_expand_grad_kernel<<<grid_size, block_size, 0, context.stream()>>>(
dout.data<T>(), ref_lod.CUDAData(context.GetPlace()),
x_lod.CUDAData(context.GetPlace()), ref_lod.size(), x_item_length,
dx->mutable_data<T>(context.GetPlace()));
}
};
......
......@@ -13,8 +13,10 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <numeric> // std::itoa
#include <numeric> // std::iota
#include <glog/logging.h>
#include <sstream>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/operators/math/math_function.h"
......@@ -29,40 +31,42 @@ using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
template <typename DeviceContext, typename T>
struct SequenceExpandFunctor {
void operator()(const DeviceContext& ctx, const LoDTensor& x, LoDTensor* out);
void operator()(
const DeviceContext& ctx, const LoDTensor& x,
const framework::Vector<size_t>& x_lod, /*expand source lod*/
const framework::Vector<size_t>& ref_lod, /*expand referenced lod*/
LoDTensor* out);
};
template <typename DeviceContext, typename T>
struct SequenceExpandGradFunctor {
void operator()(const DeviceContext& ctx, const LoDTensor& x,
const LoDTensor& out, const LoDTensor& dout, LoDTensor* dx);
void operator()(
const DeviceContext& ctx, const LoDTensor& dout,
const framework::Vector<size_t>& x_lod, /*expand source lod*/
const framework::Vector<size_t>& ref_lod, /*expand referenced lod*/
LoDTensor* dx);
};
template <typename T>
struct SequenceExpandFunctor<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& context, const LoDTensor& x,
void operator()(
const platform::CPUDeviceContext& context, const LoDTensor& x,
const framework::Vector<size_t>& x_lod, /*expand source lod*/
const framework::Vector<size_t>& ref_lod, /*expand referenced lod*/
LoDTensor* out) {
auto& out_lod = out->lod()[0];
framework::Vector<size_t> x_lod;
if (x.lod() == 1) {
x_lod = x.lod()[0];
} else {
x_lod.reserve(out_lod.size());
std::itoa(x_lod.begin(), x_lod.end(), 0); // fill 0 ~ out_lod.size()-1
}
int out_offset = 0;
auto& eigen_place = *context.eigen_device();
for (size_t i = 1; i < out_lod.size(); ++i) {
int repeat_num = y_lod[ref_level][i] - y_lod[ref_level][i - 1];
for (size_t i = 1; i < ref_lod.size(); ++i) {
int repeat_num = ref_lod[i] - ref_lod[i - 1];
int x_start = x_lod[i - 1];
int x_end = x_lod[i];
int x_seq_len = x_end - x_start;
if (repeat_num > 0) {
auto x_sub_tensor = x->Slice(x_start, x_end);
auto x_sub_tensor = x.Slice(x_start, x_end);
x_sub_tensor.Resize({1, x_sub_tensor.numel()});
int out_start = out_offset;
if (x_lod.size() == 1) {
out_start = out_lod[0][out_offset];
if (out->lod().size() == 1) {
out_start = out->lod()[0][out_offset];
}
auto out_sub_tensor =
out->Slice(out_start, out_start + x_seq_len * repeat_num);
......@@ -71,6 +75,7 @@ struct SequenceExpandFunctor<platform::CPUDeviceContext, T> {
EigenMatrix<T>::From(x_sub_tensor)
.broadcast(Eigen::array<int, 2>({{repeat_num, 1}}));
}
out_offset += repeat_num;
}
}
};
......@@ -96,13 +101,10 @@ class SequenceExpandKernel : public framework::OpKernel<T> {
return;
}
auto& out_lod = *out->mutable_lod();
// x lod level is at most 1.
if (x_lod.size() == 0) {
out_lod = y_lod[ref_level];
} else if (x_lod.size() == 1) {
out_lod.resize(1);
out_lod[0] = {0};
framework::Vector<size_t> out_lod;
if (x_lod.size() == 1) {
out_lod.push_back(0);
int out_offset = 0;
for (size_t i = 1; i < y_lod[ref_level].size(); ++i) {
int repeat_num = y_lod[ref_level][i] - y_lod[ref_level][i - 1];
......@@ -110,14 +112,25 @@ class SequenceExpandKernel : public framework::OpKernel<T> {
int x_end = x_lod[0][i];
int x_seq_len = x_end - x_start;
for (int j = 0; j < repeat_num; ++j) {
out_lod[0].push_back(out_lod[0].back() + x_seq_len);
out_lod.push_back(out_lod.back() + x_seq_len);
out_offset++;
}
}
// write lod to out if x has lod
auto& ref_lod = *out->mutable_lod();
ref_lod[0] = out_lod;
}
framework::Vector<size_t> ref_x_lod;
if (x->lod().size() == 1) {
ref_x_lod = x->lod()[0];
} else {
// x has no LoD: use a fake level-0 LoD [0, 1, ..., x->dims()[0]]
ref_x_lod.resize(x->dims()[0] + 1);
std::iota(ref_x_lod.begin(), ref_x_lod.end(), 0);
}
SequenceExpandFunctor<DeviceContext, T> functor;
functor(context.template device_context<DeviceContext>(), *x, out);
functor(context.template device_context<DeviceContext>(), *x, ref_x_lod,
y_lod[ref_level], out);
}
};
......@@ -135,32 +148,29 @@ class SequenceExpandKernel : public framework::OpKernel<T> {
* */
template <typename T>
struct SequenceExpandGradFunctor<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& context, const LoDTensor& x,
const LoDTensor& out, const LoDTensor& dout, LoDTensor* dx) {
auto& dev_ctx = context.template device_context<DeviceContext>();
math::SetConstant<DeviceContext, T> set_zero;
set_zero(dev_ctx, g_x, static_cast<T>(0));
int g_out_offset = 0;
for (size_t i = 1; i < y_lod[ref_level].size(); ++i) {
int repeat_num = y_lod[ref_level][i] - y_lod[ref_level][i - 1];
void operator()(
const platform::CPUDeviceContext& context, const LoDTensor& dout,
const framework::Vector<size_t>& x_lod, /*expand source lod*/
const framework::Vector<size_t>& ref_lod, /*expand referenced lod*/
LoDTensor* dx) {
math::SetConstant<platform::CPUDeviceContext, T> set_zero;
set_zero(context, dx, static_cast<T>(0));
int dout_offset = 0;
for (size_t i = 1; i < ref_lod.size(); ++i) {
int repeat_num = ref_lod[i] - ref_lod[i - 1];
if (repeat_num > 0) {
int x_start = i - 1;
int x_end = i;
if (x_lod.size() == 1) {
x_start = x_lod[0][i - 1];
x_end = x_lod[0][i];
}
int x_start = x_lod[i - 1];
int x_end = x_lod[i];
int x_seq_len = x_end - x_start;
auto g_x_sub = g_x->Slice(x_start, x_end);
g_x_sub.Resize(flatten_to_1d(g_x_sub.dims()));
int g_out_end = g_out_offset + repeat_num * x_seq_len;
auto g_out_sub = g_out->Slice(g_out_offset, g_out_end);
g_out_sub.Resize({repeat_num, g_x_sub.dims()[0]});
math::ColwiseSum<DeviceContext, T> col_sum;
col_sum(dev_ctx, g_out_sub, &g_x_sub);
g_out_offset += repeat_num * x_seq_len;
auto dx_sub = dx->Slice(x_start, x_end);
dx_sub.Resize(flatten_to_1d(dx_sub.dims()));
int dout_end = dout_offset + repeat_num * x_seq_len;
auto dout_sub = dout.Slice(dout_offset, dout_end);
dout_sub.Resize({repeat_num, dx_sub.dims()[0]});
math::ColwiseSum<platform::CPUDeviceContext, T> col_sum;
col_sum(context, dout_sub, &dx_sub);
dout_offset += repeat_num * x_seq_len;
}
}
}
......@@ -179,20 +189,26 @@ class SequenceExpandGradKernel : public framework::OpKernel<T> {
g_x->mutable_data<T>(context.GetPlace());
g_x->set_lod(x->lod());
auto& x_lod = x->lod();
auto& y_lod = y->lod();
if (ref_level == -1) ref_level = y_lod.size() - 1;
// just copy the gradient
if (y_lod[ref_level].size() <= 1) {
framework::TensorCopy(*g_out, context.GetPlace(), g_x);
return;
}
framework::Vector<size_t> ref_x_lod;
framework::Vector<size_t> ref_lod = y_lod[ref_level];
if (x->lod().size() == 1) {
ref_x_lod = x->lod()[0];
} else {
// x has no LoD: use a fake level-0 LoD [0, 1, ..., x->dims()[0]]
ref_x_lod.resize(x->dims()[0] + 1);
std::iota(ref_x_lod.begin(), ref_x_lod.end(), 0);
}
SequenceExpandGradFunctor<DeviceContext, T> functor;
functor(context.template device_context<DeviceContext>(), *x, *y, *g_out,
g_x);
functor(context.template device_context<DeviceContext>(), *g_out, ref_x_lod,
ref_lod, g_x);
}
};
......
......@@ -19,14 +19,8 @@ from op_test import OpTest
class TestSequenceExpand(OpTest):
def set_data(self):
x = [i / 10.0 for i in range(3)]
y = [i / 10.0 for i in range(8)]
x_data = np.array(x).reshape(3, 1).astype('float32')
y_data = np.array(y).reshape(8, 1).astype('float32')
print(x_data)
print(y_data)
# x_data = np.random.uniform(0.1, 1, [3, 1]).astype('float32')
# y_data = np.random.uniform(0.1, 1, [8, 1]).astype('float32')
x_data = np.random.uniform(0.1, 1, [3, 1]).astype('float32')
y_data = np.random.uniform(0.1, 1, [8, 1]).astype('float32')
y_lod = [[0, 1, 4, 8]]
self.inputs = {'X': x_data, 'Y': (y_data, y_lod)}
......@@ -53,8 +47,10 @@ class TestSequenceExpand(OpTest):
x_len = x_idx[i] - x_idx[i - 1]
if repeat_num > 0:
x_sub = x_data[x_idx[i - 1]:x_idx[i], :]
x_sub = np.repeat(x_sub, repeat_num, axis=0)
out = np.vstack((out, x_sub))
stacked_x_sub = x_sub
for r in range(repeat_num - 1):
stacked_x_sub = np.vstack((stacked_x_sub, x_sub))
out = np.vstack((out, stacked_x_sub))
if x_lod is not None:
for j in xrange(repeat_num):
out_lod[0].append(out_lod[0][-1] + x_len)
......@@ -107,11 +103,11 @@ class TestSequenceExpandCase3(TestSequenceExpand):
class TestSequenceExpandCase4(TestSequenceExpand):
def set_data(self):
data = [0.1, 0.3, 0.2, 0.15, 0.25, 0.2, 0.15, 0.25, 0.1, 0.3]
data = np.random.uniform(0.1, 1, [5 * 2, 1])
x_data = np.array(data).reshape([5, 2]).astype('float32')
x_lod = [[0, 2, 5]]
y_data = np.random.uniform(0.1, 1, [2, 1]).astype('float32')
y_lod = [[0, 1, 2], [0, 1, 2]]
y_data = np.random.uniform(0.1, 1, [3, 1]).astype('float32')
y_lod = [[0, 1, 3], [0, 1, 3]]
self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册