提交 6b3e0b78 编写于 作者: Z zchen0211

gather function with test passed

上级 eef55ca7
...@@ -41,6 +41,11 @@ function(op_library TARGET) ...@@ -41,6 +41,11 @@ function(op_library TARGET)
endif() endif()
endfunction() endfunction()
op_library(gather SRCS gather_func.cc)
cc_test(gather_test SRCS gather_test.cc DEPS gather)
op_library(scatter SRCS scatter_func.cc)
op_library(add_op SRCS add_op.cc add_op.cu) op_library(add_op SRCS add_op.cc add_op.cu)
cc_test(add_op_test SRCS add_op_test.cc DEPS add_op) cc_test(add_op_test SRCS add_op_test.cc DEPS add_op)
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/gather_func.h"
#include <cstring>
#include "paddle/framework/ddim.h"
#include "paddle/framework/tensor.h"
#include "paddle/platform/place.h"
...@@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License"); ...@@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
You may obtain a copy of the License at You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, distributed under the License is distributed on an "AS IS" BASIS,
...@@ -13,51 +13,18 @@ See the License for the specific language governing permissions and ...@@ -13,51 +13,18 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <memory.h>
#include <cstring> #include <cstring>
#include "paddle/framework/ddim.h" #include "paddle/framework/ddim.h"
#include "paddle/framework/tensor.h" #include "paddle/framework/tensor.h"
#include "paddle/platform/place.h" #include "paddle/platform/place.h"
/** using paddle::framework::Tensor;
* Return a new tensor from source tensor, gathered according to index using paddle::framework::DDim;
* input[src]: type-T source Tensor
* input[index]: type-int index Tensor (1-D)
* return: output tensor
*/
template <typename Place, typename T>
Tensor* Gather(Tensor* src, Tensor* index) {
// check index of shape 1-D
PADDLE_ENFORCE(index->dims().size() == 1);
int index_size = index->dims()[0];
// Source shape
auto src_dims = src->dims();
DDim output_dims(dims_src);
// Create a tensor of shape [index_size, dim_src[1:]]
output_dims[0] = index_size;
Tensor* New_tensor;
float* output = nullptr;
/* slice size */
int slice_size = 1;
for (size_t i = 0; i < src_dims.size(); ++i) slice_size *= src_dims[i];
/* Gathering */ namespace paddle {
if (place == CPUPlace()) { namespace operators {
// init for CPU
output = New_tensor.mutable_data<T>(output_dims, CPUPlace());
CPUGather(
src->data(), index->data(), slice_size, new_tensor->mutable_data());
} else { // GPU
// init for GPU
output = New_tensor.mutable_data<T>(output_dims, GPUPlace());
/* how to specialize device??*/
GPUGather(
d, src->data(), index->data(), slice_size, new_tensor->mutable_data());
}
return New_tensor;
}
/* Implementation of CPU copy */ /* Implementation of CPU copy */
template <typename T> template <typename T>
...@@ -70,48 +37,61 @@ void CPUGather(const T* params, ...@@ -70,48 +37,61 @@ void CPUGather(const T* params,
for (size_t i = 0; i < index_size; ++i) { for (size_t i = 0; i < index_size; ++i) {
int index_ = indices[i]; int index_ = indices[i];
/* copy src[index_] to output[i] */ // copy src[index_] to output[i]
memcpy( memcpy(output + i * slice_size, params + index_ * slice_size, slice_bytes);
output + i * slice_bytes, params + index_ * slice_bytes, slice_bytes);
} }
} }
/* Implementation of GPU copy: /* Implementation of GPU copy:
I suppose the GPUDevice& d, contains gpu_id and thread_id I suppose the GPUDevice& d, contains gpu_id and thread_id
d = cuda_stream(gpu_id_, stream_id_); d = cuda_stream(gpu_id_, stream_id_);
*/ */
template <typename T> template <typename T>
void GPUGather(const GPUDevice& d, void GPUGather(const T* src,
const T* src,
const int* index, const int* index,
const int slice_size, const int slice_size,
const int index_size, const int index_size,
T* output) { T* output);
int block_count = slice_size * index_size;
int thread_per_block = 1024;
GatherOpKernel<T><<<block_count, thread_per_block, 0, d.stream()>>>(
src, index, output, slice_size, indices_size, slice_size, out_size);
}
/**
* Return a new tensor from source tensor, gathered according to index
* input[src]: type-T source Tensor
* input[index]: type-int index Tensor (1-D)
* return: output tensor
*/
template <typename T> template <typename T>
__global__ void GatherOpKernel(const T* params, void Gather(const platform::Place& place,
const int* indices, const paddle::framework::Tensor* src,
T* out, const paddle::framework::Tensor* index,
int64 indices_size, paddle::framework::Tensor* output) {
int64 slice_size, // check index of shape 1-D
int64 out_size) { PADDLE_ENFORCE(index->dims().size() == 1);
/* I suppose we have the following macro, int index_size = index->dims()[0];
which I strongly suggest that we should put in cuda:
#define CUDA_1D_KERNEL_LOOP(i, n) \ auto src_dims = src->dims();
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \ DDim output_dims(src_dims);
i += blockDim.x * gridDim.x) output_dims[0] = index_size;
*/
CUDA_1D_KERNEL_LOOP(i, out_size) { // slice size
int indices_i = i / slice_size; int slice_size = 1;
int slice_i = i - indices_i * slice_size; // offset inside the slice for (size_t i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i];
int gather_i = indices[indices_i];
int params_i = gather_i * slice_size + slice_i; // Gathering
out[i] = *(params + params_i); if (platform::is_cpu_place(place)) {
CPUGather<T>(src->data<T>(),
index->data<int>(),
slice_size,
index_size,
output->data<T>());
} else {
// init for GPU
// output_arr = output->mutable_data<T>(output_dims, platform::GPUPlace());
// how to specialize device??
// GPUGather(
// d, src->data(), index->data(), slice_size,
// new_tensor->mutable_data());
} }
} }
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/framework/ddim.h"
#include "paddle/framework/tensor.h"
#include "paddle/operators/gather_func.h"
#include "paddle/platform/place.h"
#include <gtest/gtest.h>
#include <iostream>
#include <string>
// Checks Gather<int> on a CPU place: a 3x4 int source tensor holding 0..11 is
// gathered with the index list {1, 0}, and the 2x4 output is expected to hold
// source row 1 (values 4..7) followed by source row 0 (values 0..3).
//
// NOTE(review): the GoogleTest FAQ advises against underscores in test suite
// names; `_abc_` is kept so existing --gtest_filter patterns still match, but
// it should probably be renamed (e.g. to `Gather`) in a follow-up.
TEST(_abc_, GatherData) {
  using namespace paddle::framework;
  using namespace paddle::platform;
  using namespace paddle::operators;

  constexpr int kSrcRows = 3;
  constexpr int kCols = 4;
  constexpr int kNumIndices = 2;

  // Automatic storage: the tensors are released when the test returns, so
  // nothing is leaked (the previous version used `new` with no `delete`).
  Tensor src;
  Tensor index;
  Tensor output;

  // Fill the source with 0, 1, ..., 11 so every element identifies its origin.
  int* p_src = src.mutable_data<int>(make_ddim({kSrcRows, kCols}), CPUPlace());
  for (int i = 0; i < kSrcRows * kCols; ++i) p_src[i] = i;

  // Ask for row 1 first, then row 0.
  int* p_index = index.mutable_data<int>(make_ddim({kNumIndices}), CPUPlace());
  p_index[0] = 1;
  p_index[1] = 0;

  // The output buffer is allocated up front; Gather is handed the tensor
  // only after its storage exists.
  int* p_output =
      output.mutable_data<int>(make_ddim({kNumIndices, kCols}), CPUPlace());
  Gather<int>(CPUPlace(), &src, &index, &output);

  // Signed loop counter keeps both sides of EXPECT_EQ the same type (int).
  for (int col = 0; col < kCols; ++col) {
    EXPECT_EQ(p_output[col], kCols + col);  // output row 0 == source row 1
    EXPECT_EQ(p_output[kCols + col], col);  // output row 1 == source row 0
  }
}
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册