提交 53c8c36a 编写于 作者: D dzhwinter

"debug the process"

上级 e4c35d83
...@@ -44,7 +44,7 @@ struct ExecutorPrepareContext { ...@@ -44,7 +44,7 @@ struct ExecutorPrepareContext {
ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id) ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id)
: prog_(prog), block_id_(block_id) {} : prog_(prog), block_id_(block_id) {}
const framework::ProgramDesc& prog_; const framework::ProgramDesc prog_;
size_t block_id_; size_t block_id_;
std::vector<std::unique_ptr<OperatorBase>> ops_; std::vector<std::unique_ptr<OperatorBase>> ops_;
}; };
......
...@@ -13,7 +13,10 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,10 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#define EIGEN_USE_GPU #define EIGEN_USE_GPU
#include <stdio.h>
#include <algorithm>
#include "paddle/fluid/operators/sequence_expand_op.h" #include "paddle/fluid/operators/sequence_expand_op.h"
#include "paddle/fluid/platform/cuda_helper.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -22,47 +25,71 @@ using LoDTensor = framework::LoDTensor; ...@@ -22,47 +25,71 @@ using LoDTensor = framework::LoDTensor;
template <typename T> template <typename T>
__global__ void sequence_expand_kernel(const T* x_data, T* out_data, __global__ void sequence_expand_kernel(const T* x_data, T* out_data,
const size_t* lod, size_t lod_size, const size_t* lod,
size_t element_len) { const size_t* out_offset,
int tid_x = blockIdx.x * blockDim.x + threadIdx.x; size_t lod_size, size_t element_len,
for (; tid_x < static_cast<int>(lod_size - 1); size_t x_size) {
tid_x += blockDim.x * gridDim.x) { int bid_x = blockIdx.x;
int scale = lod[tid_x + 1] - lod[tid_x]; if (bid_x > lod_size) return;
int tid_y = blockIdx.y * blockDim.y + threadIdx.y; int repeats = lod[bid_x];
for (; tid_y < scale; tid_y += blockDim.y * gridDim.y) { int offset = out_offset[bid_x];
int tid_z = blockIdx.z * blockDim.z + threadIdx.z; for (int tid_y = threadIdx.y; tid_y < repeats; tid_y += blockDim.y) {
int item_start = tid_x / element_len; for (int tid_x = threadIdx.x; tid_x < element_len; tid_x += blockDim.x) {
for (; tid_z < element_len; tid_z += blockDim.z * gridDim.z) { out_data[(offset + tid_y) * element_len + tid_x] =
out_data[item_start * scale + tid_z] = x_data[item_start + tid_z]; x_data[bid_x * element_len + tid_x];
}
} }
} }
} }
template <typename T> template <typename T>
__global__ void sequence_expand_grad_kernel(const T* dout_data, T* dx_data, __global__ void sequence_expand_grad_kernel(const T* dout_data, T* dx_data,
const size_t* lod, size_t lod_size, const size_t* lod,
size_t element_len, const size_t* out_offset,
size_t dout_size) { size_t lod_size, size_t element_len,
size_t dout_size, size_t dx_size) {
// reduce visit memory time.
// dout_shm = [0 - dout_size-1], dx_shm = [dout_size-1, dout_size + dx_size-1]
if (blockIdx.x == 0 && blockIdx.y == 0 && threadIdx.x == 0 &&
threadIdx.y == 0) {
printf("lod_size=%ld, element_size=%ld, dout_size=%ld, dx_size=%ld\n",
lod_size, element_len, dout_size, dx_size);
}
extern __shared__ T shm[]; extern __shared__ T shm[];
int tid_x = blockIdx.x * blockDim.x + threadIdx.x; T* dout_shm = shm;
for (; tid_x < static_cast<int>(lod_size - 1); T* dx_shm = &shm[dout_size];
tid_x += blockDim.x * gridDim.x) {
int scale = lod[tid_x + 1] - lod[tid_x]; // int idx = threadIdx.x + blockIdx.x * blockDim.x;
int tid_y = blockIdx.y * blockDim.y + threadIdx.y; for (int idx = 0; idx < dout_size; ++idx) {
for (; tid_y < scale; tid_y += blockDim.y * gridDim.y) { if (idx < dx_size) {
int tid_z = blockIdx.z * blockDim.z + threadIdx.z; dx_shm[idx] = 0.0;
int item_start = tid_x / element_len; }
for (; tid_z < element_len; tid_z += blockDim.z * gridDim.z) { if (idx < dout_size) {
shm[item_start + tid_z] += dout_data[item_start * scale + tid_z]; dout_shm[idx] = dout_data[idx];
} }
}
int bid_x = blockIdx.x;
if (bid_x > lod_size) return;
int repeats = lod[bid_x];
int offset = out_offset[bid_x];
if (threadIdx.x == 0) {
printf("repeats=%d, offset=%ld\n", repeats, offset);
}
for (int tid_y = threadIdx.y; tid_y < repeats; tid_y += blockDim.y) {
for (int tid_x = threadIdx.x; tid_x < element_len; tid_x += blockDim.x) {
T val = dout_shm[(offset + tid_y) * element_len + tid_x];
platform::CudaAtomicAdd(&dx_shm[bid_x * element_len + tid_x], val);
int dx_idx = bid_x * element_len + tid_x;
int dout_idx = (offset + tid_y) * element_len + tid_x;
printf("dx_idx=%d, dout_idx=%d, dx_data=%f, dout_data=%f, val=%f \n",
dx_idx, dout_idx, dx_shm[dx_idx], dout_shm[dout_idx], val);
} }
} }
// synchronize before write to dx
__syncthreads(); __syncthreads();
for (int idx = blockDim.x * blockIdx.x + threadIdx.x; // copy shared memory back to dx
idx < static_cast<int>(dout_size); idx += blockDim.x * gridDim.x) { for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < dx_size;
dx_data[idx] = shm[idx]; idx += blockDim.x * gridDim.x) {
dx_data[idx] = dx_shm[idx];
} }
} }
...@@ -72,15 +99,20 @@ struct SequenceExpandFunctor<platform::CUDADeviceContext, T> { ...@@ -72,15 +99,20 @@ struct SequenceExpandFunctor<platform::CUDADeviceContext, T> {
const LoDTensor& x, LoDTensor* out) { const LoDTensor& x, LoDTensor* out) {
auto x_dims = x.dims(); auto x_dims = x.dims();
size_t element_len = framework::product(x_dims) / x_dims[0]; size_t element_len = framework::product(x_dims) / x_dims[0];
T* out_data = out->mutable_data<T>(context.GetPlace()); auto lod = out->lod().back();
auto out_starts = out->lod().back(); framework::Vector<size_t> out_lod;
for (size_t i = 0; i < lod.size() - 1; ++i) {
out_lod.push_back(lod[i + 1] - lod[i]);
}
dim3 block_size(16, 32, element_len); int thread_x = std::max(static_cast<int>(element_len), 32);
dim3 grid_size(10, 10); int block_x = static_cast<int>(out_lod.size());
dim3 block_size(thread_x, 1024 / thread_x);
dim3 grid_size(block_x, 1);
sequence_expand_kernel<<<grid_size, block_size, 0, context.stream()>>>( sequence_expand_kernel<<<grid_size, block_size, 0, context.stream()>>>(
x.data<T>(), out->mutable_data<T>(context.GetPlace()), x.data<T>(), out->mutable_data<T>(context.GetPlace()),
out_starts.CUDAData(context.GetPlace()), out_starts.size(), out_lod.CUDAData(context.GetPlace()), lod.CUDAData(context.GetPlace()),
element_len); out_lod.size(), element_len, framework::product(x_dims));
} }
}; };
...@@ -91,16 +123,24 @@ struct SequenceExpandGradFunctor<platform::CUDADeviceContext, T> { ...@@ -91,16 +123,24 @@ struct SequenceExpandGradFunctor<platform::CUDADeviceContext, T> {
const LoDTensor& dout, LoDTensor* dx) { const LoDTensor& dout, LoDTensor* dx) {
auto x_dims = x.dims(); auto x_dims = x.dims();
size_t element_len = framework::product(x_dims) / x_dims[0]; size_t element_len = framework::product(x_dims) / x_dims[0];
auto out_starts = out.lod().back(); auto lod = out.lod().back();
framework::Vector<size_t> out_lod;
for (size_t i = 0; i < lod.size() - 1; ++i) {
out_lod.push_back(lod[i + 1] - lod[i]);
}
size_t dout_size = framework::product(dout.dims());
size_t dx_size = framework::product(dx->dims());
dim3 block_size(16, 32, element_len); int thread_x = std::max(static_cast<int>(element_len), 32);
dim3 grid_size(10, 10); dim3 block_size(thread_x, 1024 / thread_x);
size_t out_size = framework::product(dx->dims()); int block_x = static_cast<int>(out_lod.size());
sequence_expand_grad_kernel<<<grid_size, block_size, out_size * sizeof(T), dim3 grid_size(block_x, 1);
sequence_expand_grad_kernel<<<grid_size, block_size,
(dout_size + dx_size) * sizeof(T),
context.stream()>>>( context.stream()>>>(
dout.data<T>(), dx->mutable_data<T>(context.GetPlace()), dout.data<T>(), dx->mutable_data<T>(context.GetPlace()),
out_starts.CUDAData(context.GetPlace()), out_starts.size(), element_len, out_lod.CUDAData(context.GetPlace()), lod.CUDAData(context.GetPlace()),
out_size); out_lod.size(), element_len, dout_size, dx_size);
} }
}; };
......
...@@ -362,6 +362,9 @@ class OpTest(unittest.TestCase): ...@@ -362,6 +362,9 @@ class OpTest(unittest.TestCase):
for a, b, name in itertools.izip(numeric_grads, analytic_grads, names): for a, b, name in itertools.izip(numeric_grads, analytic_grads, names):
abs_a = np.abs(a) abs_a = np.abs(a)
abs_a[abs_a < 1e-3] = 1 abs_a[abs_a < 1e-3] = 1
print("actual", a)
print("*****")
print("expected", b)
diff_mat = np.abs(a - b) / abs_a diff_mat = np.abs(a - b) / abs_a
max_diff = np.max(diff_mat) max_diff = np.max(diff_mat)
......
...@@ -19,8 +19,14 @@ from op_test import OpTest ...@@ -19,8 +19,14 @@ from op_test import OpTest
class TestSequenceExpand(OpTest): class TestSequenceExpand(OpTest):
def set_data(self): def set_data(self):
x_data = np.random.uniform(0.1, 1, [3, 1]).astype('float32') x = [i / 10.0 for i in range(3)]
y_data = np.random.uniform(0.1, 1, [8, 1]).astype('float32') y = [i / 10.0 for i in range(8)]
x_data = np.array(x).reshape(3, 1).astype('float32')
y_data = np.array(y).reshape(8, 1).astype('float32')
print(x_data)
print(y_data)
# x_data = np.random.uniform(0.1, 1, [3, 1]).astype('float32')
# y_data = np.random.uniform(0.1, 1, [8, 1]).astype('float32')
y_lod = [[0, 1, 4, 8]] y_lod = [[0, 1, 4, 8]]
self.inputs = {'X': x_data, 'Y': (y_data, y_lod)} self.inputs = {'X': x_data, 'Y': (y_data, y_lod)}
...@@ -45,47 +51,43 @@ class TestSequenceExpand(OpTest): ...@@ -45,47 +51,43 @@ class TestSequenceExpand(OpTest):
def test_check_grad(self): def test_check_grad(self):
self.check_grad(["X"], "Out") self.check_grad(["X"], "Out")
# class TestSequenceExpandCase1(TestSequenceExpand):
class TestSequenceExpandCase1(TestSequenceExpand): # def set_data(self):
def set_data(self): # x_data = np.random.uniform(0.1, 1, [5, 1]).astype('float32')
x_data = np.random.uniform(0.1, 1, [5, 1]).astype('float32') # x_lod = [[0, 2, 5]]
x_lod = [[0, 2, 5]] # y_data = np.random.uniform(0.1, 1, [13, 1]).astype('float32')
y_data = np.random.uniform(0.1, 1, [13, 1]).astype('float32') # y_lod = [[0, 2, 5], [0, 2, 4, 7, 10, 13]]
y_lod = [[0, 2, 5], [0, 2, 4, 7, 10, 13]] # self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)}
self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)}
# class TestSequenceExpandCase2(TestSequenceExpand):
# def set_data(self):
class TestSequenceExpandCase2(TestSequenceExpand): # x_data = np.random.uniform(0.1, 1, [1, 2, 2]).astype('float32')
def set_data(self): # x_lod = [[0, 1]]
x_data = np.random.uniform(0.1, 1, [1, 2, 2]).astype('float32') # y_data = np.random.uniform(0.1, 1, [2, 2, 2]).astype('float32')
x_lod = [[0, 1]] # y_lod = [[0, 2]]
y_data = np.random.uniform(0.1, 1, [2, 2, 2]).astype('float32') # self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)}
y_lod = [[0, 2]]
self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)} # class TestSequenceExpandCase3(TestSequenceExpand):
# def set_data(self):
# x_data = np.random.uniform(0.1, 1, [4, 1]).astype('float32')
class TestSequenceExpandCase3(TestSequenceExpand): # x_lod = [[0, 1, 2, 3, 4]]
def set_data(self): # y_data = np.random.uniform(0.1, 1, [6, 1]).astype('float32')
x_data = np.random.uniform(0.1, 1, [4, 1]).astype('float32') # y_lod = [[0, 2, 4, 4, 6]]
x_lod = [[0, 1, 2, 3, 4]] # self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)}
y_data = np.random.uniform(0.1, 1, [6, 1]).astype('float32')
y_lod = [[0, 2, 4, 4, 6]] # class TestSequenceExpandCase4(TestSequenceExpand):
self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)} # def set_data(self):
# x_data = np.array(
# [0.1, 0.3, 0.2, 0.15, 0.25, 0.2, 0.15, 0.25, 0.1, 0.3]).reshape(
class TestSequenceExpandCase4(TestSequenceExpand): # [2, 5]).astype('float32')
def set_data(self): # x_lod = [[
x_data = np.array( # 0,
[0.1, 0.3, 0.2, 0.15, 0.25, 0.2, 0.15, 0.25, 0.1, 0.3]).reshape( # 1,
[2, 5]).astype('float32') # 2,
x_lod = [[ # ]]
0, # y_data = np.random.uniform(0.1, 1, [2, 1]).astype('float32')
1, # y_lod = [[0, 1, 2], [0, 1, 2]]
2, # self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)}
]]
y_data = np.random.uniform(0.1, 1, [2, 1]).astype('float32')
y_lod = [[0, 1, 2], [0, 1, 2]]
self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)}
if __name__ == '__main__': if __name__ == '__main__':
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册