提交 6b3e0b78 编写于 作者: Z zchen0211

gather function with test passed

上级 eef55ca7
...@@ -41,6 +41,11 @@ function(op_library TARGET) ...@@ -41,6 +41,11 @@ function(op_library TARGET)
endif() endif()
endfunction() endfunction()
# Declare the `gather` target from gather_func.cc through the op_library()
# helper defined earlier in this file.
op_library(gather SRCS gather_func.cc)
# Declare the `gather_test` test from gather_test.cc; it depends on `gather`.
cc_test(gather_test SRCS gather_test.cc DEPS gather)
# Declare the `scatter` target from scatter_func.cc, same helper as above.
op_library(scatter SRCS scatter_func.cc)
op_library(add_op SRCS add_op.cc add_op.cu) op_library(add_op SRCS add_op.cc add_op.cu)
cc_test(add_op_test SRCS add_op_test.cc DEPS add_op) cc_test(add_op_test SRCS add_op_test.cc DEPS add_op)
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/gather_func.h"
#include <cstring>
#include "paddle/framework/ddim.h"
#include "paddle/framework/tensor.h"
#include "paddle/platform/place.h"
...@@ -13,51 +13,18 @@ See the License for the specific language governing permissions and ...@@ -13,51 +13,18 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <memory.h>
#include <cstring> #include <cstring>
#include "paddle/framework/ddim.h" #include "paddle/framework/ddim.h"
#include "paddle/framework/tensor.h" #include "paddle/framework/tensor.h"
#include "paddle/platform/place.h" #include "paddle/platform/place.h"
/** using paddle::framework::Tensor;
* Return a new tensor from source tensor, gathered according to index using paddle::framework::DDim;
* input[src]: type-T source Tensor
* input[index]: type-int index Tensor (1-D)
* return: output tensor
*/
template <typename Place, typename T>
Tensor* Gather(Tensor* src, Tensor* index) {
// check index of shape 1-D
PADDLE_ENFORCE(index->dims().size() == 1);
int index_size = index->dims()[0];
// Source shape namespace paddle {
auto src_dims = src->dims(); namespace operators {
DDim output_dims(dims_src);
// Create a tensor of shape [index_size, dim_src[1:]]
output_dims[0] = index_size;
Tensor* New_tensor;
float* output = nullptr;
/* slice size */
int slice_size = 1;
for (size_t i = 0; i < src_dims.size(); ++i) slice_size *= src_dims[i];
/* Gathering */
if (place == CPUPlace()) {
// init for CPU
output = New_tensor.mutable_data<T>(output_dims, CPUPlace());
CPUGather(
src->data(), index->data(), slice_size, new_tensor->mutable_data());
} else { // GPU
// init for GPU
output = New_tensor.mutable_data<T>(output_dims, GPUPlace());
/* how to specialize device??*/
GPUGather(
d, src->data(), index->data(), slice_size, new_tensor->mutable_data());
}
return New_tensor;
}
/* Implementation of CPU copy */ /* Implementation of CPU copy */
template <typename T> template <typename T>
...@@ -70,9 +37,8 @@ void CPUGather(const T* params, ...@@ -70,9 +37,8 @@ void CPUGather(const T* params,
for (size_t i = 0; i < index_size; ++i) { for (size_t i = 0; i < index_size; ++i) {
int index_ = indices[i]; int index_ = indices[i];
/* copy src[index_] to output[i] */ // copy src[index_] to output[i]
memcpy( memcpy(output + i * slice_size, params + index_ * slice_size, slice_bytes);
output + i * slice_bytes, params + index_ * slice_bytes, slice_bytes);
} }
} }
...@@ -81,37 +47,51 @@ void CPUGather(const T* params, ...@@ -81,37 +47,51 @@ void CPUGather(const T* params,
d = cuda_stream(gpu_id_, stream_id_); d = cuda_stream(gpu_id_, stream_id_);
*/ */
template <typename T> template <typename T>
void GPUGather(const GPUDevice& d, void GPUGather(const T* src,
const T* src,
const int* index, const int* index,
const int slice_size, const int slice_size,
const int index_size, const int index_size,
T* output) { T* output);
int block_count = slice_size * index_size;
int thread_per_block = 1024;
GatherOpKernel<T><<<block_count, thread_per_block, 0, d.stream()>>>( /**
src, index, output, slice_size, indices_size, slice_size, out_size); * Return a new tensor from source tensor, gathered according to index
} * input[src]: type-T source Tensor
* input[index]: type-int index Tensor (1-D)
template <typename T> * return: output tensor
__global__ void GatherOpKernel(const T* params,
const int* indices,
T* out,
int64 indices_size,
int64 slice_size,
int64 out_size) {
/* I suppose we have the following macro,
which I strongly suggest that we should put in cuda:
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
i += blockDim.x * gridDim.x)
*/ */
CUDA_1D_KERNEL_LOOP(i, out_size) { template <typename T>
int indices_i = i / slice_size; void Gather(const platform::Place& place,
int slice_i = i - indices_i * slice_size; // offset inside the slice const paddle::framework::Tensor* src,
int gather_i = indices[indices_i]; const paddle::framework::Tensor* index,
int params_i = gather_i * slice_size + slice_i; paddle::framework::Tensor* output) {
out[i] = *(params + params_i); // check index of shape 1-D
PADDLE_ENFORCE(index->dims().size() == 1);
int index_size = index->dims()[0];
auto src_dims = src->dims();
DDim output_dims(src_dims);
output_dims[0] = index_size;
// slice size
int slice_size = 1;
for (size_t i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i];
// Gathering
if (platform::is_cpu_place(place)) {
CPUGather<T>(src->data<T>(),
index->data<int>(),
slice_size,
index_size,
output->data<T>());
} else {
// init for GPU
// output_arr = output->mutable_data<T>(output_dims, platform::GPUPlace());
// how to specialize device??
// GPUGather(
// d, src->data(), index->data(), slice_size,
// new_tensor->mutable_data());
} }
} }
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/framework/ddim.h"
#include "paddle/framework/tensor.h"
#include "paddle/operators/gather_func.h"
#include "paddle/platform/place.h"
#include <gtest/gtest.h>
#include <iostream>
#include <string>
// Verifies the CPU path of Gather<int>: with a {3, 4} int source filled with
// 0..11 and a 1-D index tensor {1, 0}, the first four output elements must be
// 4..7 and the next four must be 0..3.
TEST(_abc_, GatherData) {
  using namespace paddle::framework;
  using namespace paddle::platform;
  using namespace paddle::operators;

  // Tensors live on the stack so they are released when the test returns;
  // allocating them with `new` and never deleting them leaked all three.
  Tensor src;
  Tensor index;
  Tensor output;

  // Allocate CPU storage and fill the inputs.
  int* p_src = src.mutable_data<int>(make_ddim({3, 4}), CPUPlace());
  int* p_index = index.mutable_data<int>(make_ddim({2}), CPUPlace());
  for (int i = 0; i < 12; ++i) p_src[i] = i;
  p_index[0] = 1;
  p_index[1] = 0;

  // The output buffer must be allocated by the caller before Gather runs:
  // Gather reads it through output->data<T>() and does not allocate it.
  int* p_output = output.mutable_data<int>(make_ddim({2, 4}), CPUPlace());
  Gather<int>(CPUPlace(), &src, &index, &output);

  // Loop counters are int so EXPECT_EQ compares int with int rather than
  // mixing signed and unsigned operands.
  for (int i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], i + 4);
  for (int i = 4; i < 8; ++i) EXPECT_EQ(p_output[i], i - 4);
}
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册