提交 6b3e0b78 编写于 作者: Z zchen0211

gather function with test passed

上级 eef55ca7
......@@ -41,6 +41,11 @@ function(op_library TARGET)
endif()
endfunction()
# Gather op: library built from gather_func.cc, plus a unit test that depends on it.
op_library(gather SRCS gather_func.cc)
cc_test(gather_test SRCS gather_test.cc DEPS gather)
# Scatter op: library only; no test target is declared for it in these lines.
op_library(scatter SRCS scatter_func.cc)
# Add op: built from both a .cc and a .cu source, with its own unit test.
op_library(add_op SRCS add_op.cc add_op.cu)
cc_test(add_op_test SRCS add_op_test.cc DEPS add_op)
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/gather_func.h"
#include <cstring>
#include "paddle/framework/ddim.h"
#include "paddle/framework/tensor.h"
#include "paddle/platform/place.h"
......@@ -13,51 +13,18 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <memory.h>
#include <cstring>
#include "paddle/framework/ddim.h"
#include "paddle/framework/tensor.h"
#include "paddle/platform/place.h"
/**
 * Return a new tensor from source tensor, gathered according to index
 * input[src]: type-T source Tensor
 * input[index]: type-int index Tensor (1-D)
 * return: output tensor
 *
 * NOTE(review): this definition refers to several names that are not declared
 * anywhere inside it (`dims_src`, `place`, `d`, `new_tensor`), and it uses
 * `New_tensor` both as a pointer and with member-access syntax. It does not
 * look compilable as written; the individual spots are flagged inline below.
 */
template <typename Place, typename T>
// NOTE(review): `Tensor` is named in the signature before the using-declaration
// below brings it into scope -- presumably it is visible from elsewhere; confirm.
Tensor* Gather(Tensor* src, Tensor* index) {
// The index tensor must be one-dimensional.
PADDLE_ENFORCE(index->dims().size() == 1);
// Number of entries in the index tensor, i.e. number of slices to gather.
int index_size = index->dims()[0];
using paddle::framework::Tensor;
using paddle::framework::DDim;
// Source shape
auto src_dims = src->dims();
// NOTE(review): `dims_src` is not declared here; the variable declared on the
// previous line is `src_dims`.
DDim output_dims(dims_src);
// Create a tensor of shape [index_size, dim_src[1:]]
output_dims[0] = index_size;
// NOTE(review): this pointer is never assigned before it is used and returned.
Tensor* New_tensor;
// NOTE(review): `output` is assigned in both branches below but never read.
float* output = nullptr;
/* slice size */
// NOTE(review): the loop starts at i = 0, so the product includes the first
// dimension as well, although the shape comment above describes dims [1:].
int slice_size = 1;
for (size_t i = 0; i < src_dims.size(); ++i) slice_size *= src_dims[i];
/* Gathering */
// NOTE(review): `place` is not a parameter or local of this function; `Place`
// is only a template type parameter.
if (place == CPUPlace()) {
// init for CPU
// NOTE(review): `New_tensor` is a pointer but is accessed with `.` here, and
// the call below uses a differently-cased, undeclared name `new_tensor`.
output = New_tensor.mutable_data<T>(output_dims, CPUPlace());
CPUGather(
src->data(), index->data(), slice_size, new_tensor->mutable_data());
} else { // GPU
// init for GPU
output = New_tensor.mutable_data<T>(output_dims, GPUPlace());
/* how to specialize device??*/
// NOTE(review): `d` (the device argument) is not declared in this function.
GPUGather(
d, src->data(), index->data(), slice_size, new_tensor->mutable_data());
}
return New_tensor;
}
namespace paddle {
namespace operators {
/* Implementation of CPU copy */
template <typename T>
......@@ -70,9 +37,8 @@ void CPUGather(const T* params,
for (size_t i = 0; i < index_size; ++i) {
int index_ = indices[i];
/* copy src[index_] to output[i] */
memcpy(
output + i * slice_bytes, params + index_ * slice_bytes, slice_bytes);
// copy src[index_] to output[i]
memcpy(output + i * slice_size, params + index_ * slice_size, slice_bytes);
}
}
......@@ -81,37 +47,51 @@ void CPUGather(const T* params,
d = cuda_stream(gpu_id_, stream_id_);
*/
template <typename T>
void GPUGather(const GPUDevice& d,
const T* src,
void GPUGather(const T* src,
const int* index,
const int slice_size,
const int index_size,
T* output) {
int block_count = slice_size * index_size;
int thread_per_block = 1024;
T* output);
GatherOpKernel<T><<<block_count, thread_per_block, 0, d.stream()>>>(
src, index, output, slice_size, indices_size, slice_size, out_size);
}
template <typename T>
__global__ void GatherOpKernel(const T* params,
const int* indices,
T* out,
int64 indices_size,
int64 slice_size,
int64 out_size) {
/* I suppose we have the following macro,
which I strongly suggest that we should put in cuda:
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
i += blockDim.x * gridDim.x)
/**
* Return a new tensor from source tensor, gathered according to index
* input[src]: type-T source Tensor
* input[index]: type-int index Tensor (1-D)
* return: output tensor
*/
CUDA_1D_KERNEL_LOOP(i, out_size) {
int indices_i = i / slice_size;
int slice_i = i - indices_i * slice_size; // offset inside the slice
int gather_i = indices[indices_i];
int params_i = gather_i * slice_size + slice_i;
out[i] = *(params + params_i);
/**
 * Gather slices of `src` along its first dimension into `output`.
 *
 * For every position i of the 1-D `index` tensor, slice `index[i]` of `src`
 * (one slice = the product of src dims [1:] elements) is copied to slice i of
 * `output` by CPUGather.
 *
 * @param place   Device on which the tensors live. Only CPU places are
 *                implemented; any other place fails the enforce below instead
 *                of returning with `output` left unwritten.
 * @param src     Source tensor holding elements of type T.
 * @param index   1-D tensor of int slice indices into the first dim of `src`.
 * @param output  Destination tensor. It is read through `data<T>()`, so it is
 *                presumably expected to be allocated by the caller beforehand
 *                with room for index_size slices -- confirm against callers.
 */
template <typename T>
void Gather(const platform::Place& place,
            const paddle::framework::Tensor* src,
            const paddle::framework::Tensor* index,
            paddle::framework::Tensor* output) {
  PADDLE_ENFORCE(src != nullptr && index != nullptr && output != nullptr,
                 "Gather requires non-null src, index and output tensors.");
  // check index of shape 1-D
  PADDLE_ENFORCE(index->dims().size() == 1);
  // TODO: GPU gather is not wired up yet (open question: how to pass the
  // device/stream to GPUGather). Fail loudly rather than silently skip the
  // copy and hand back an unwritten output.
  PADDLE_ENFORCE(platform::is_cpu_place(place),
                 "Gather is only implemented for CPU places.");

  const int index_size = index->dims()[0];

  // slice size: number of elements in one slice, i.e. product of src dims [1:]
  auto src_dims = src->dims();
  const int rank = static_cast<int>(src_dims.size());
  int slice_size = 1;
  for (int i = 1; i < rank; ++i) slice_size *= src_dims[i];

  // Gathering
  CPUGather<T>(src->data<T>(),
               index->data<int>(),
               slice_size,
               index_size,
               output->data<T>());
}
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/framework/ddim.h"
#include "paddle/framework/tensor.h"
#include "paddle/operators/gather_func.h"
#include "paddle/platform/place.h"
#include <gtest/gtest.h>
#include <iostream>
#include <string>
// Gathers rows {1, 0} of a 3x4 int tensor filled with 0..11 and checks that
// the 2x4 output holds row 1 (values 4..7) followed by row 0 (values 0..3).
// NOTE(review): GoogleTest advises against underscores in test suite names;
// `_abc_` is kept here only so existing --gtest_filter patterns keep matching.
TEST(_abc_, GatherData) {
  using namespace paddle::framework;
  using namespace paddle::platform;
  using namespace paddle::operators;

  // Automatic storage: the tensors are released when the test returns, so
  // nothing leaks even if an expectation fails.
  Tensor src;
  Tensor index;
  Tensor output;

  int* p_src = src.mutable_data<int>(make_ddim({3, 4}), CPUPlace());
  int* p_index = index.mutable_data<int>(make_ddim({2}), CPUPlace());

  for (int i = 0; i < 12; ++i) p_src[i] = i;
  p_index[0] = 1;
  p_index[1] = 0;

  // gather: the output buffer is allocated up front because Gather writes
  // through output->data<T>().
  int* p_output = output.mutable_data<int>(make_ddim({2, 4}), CPUPlace());
  Gather<int>(CPUPlace(), &src, &index, &output);

  // int loop counters keep EXPECT_EQ comparing int with int.
  for (int i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], i + 4);
  for (int i = 4; i < 8; ++i) EXPECT_EQ(p_output[i], i - 4);
}
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册