Unverified commit 73858547, authored by Abhinav Arora, committed by GitHub

Fix more CPPLint issues in fluid/operators/math (#10276)

* Fix CPPLint issues in lstm_cpu_kernel.h

* Fix CPPLint issues in math/math_function_test

* Fix CPPLint issues in math/math_function_test

* Fix CPPLint issues in math/concat.cc

* Fix CPPLint issues in math/concat.cc

* Fix CPPLint issues in math/concat.cc

* Fix CPPLint issues in math/gru_cpu_kernel

* Fix CPPLint issues in math/selected_rows_functor_test.cu

* Fix compile error

* Fix compile error
Parent fb7ca48c
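The lint fixes below fall into three recurring patterns: mutable output parameters become pointers instead of non-const references (the concat functors), C-style casts on `__m256` pointers become `reinterpret_cast` (the GRU and LSTM CPU kernels), and the CUDA tests drop `using namespace` directives in favor of fully qualified names. As a minimal sketch of the first pattern, the hypothetical `SplitRows` helper below stands in for the Paddle functor; it is illustrative only and not code from this patch.

```cpp
#include <cstddef>
#include <vector>

// Illustrative only: mirrors the ConcatGradFunctor change in this commit.
// The mutable output collection is passed as a pointer rather than a
// non-const reference, so the caller writes `&outputs` and the in-place
// mutation is visible at the call site (the convention cpplint enforces
// under runtime/references).
void SplitRows(const std::vector<float> &input, int parts,
               std::vector<std::vector<float>> *outputs) {
  const std::size_t chunk = input.size() / parts;
  outputs->resize(parts);
  for (int i = 0; i < parts; ++i) {
    outputs->at(i).assign(input.begin() + i * chunk,
                          input.begin() + (i + 1) * chunk);
  }
}

int main() {
  std::vector<float> data{0, 1, 2, 3, 4, 5};
  std::vector<std::vector<float>> outputs;
  SplitRows(data, 3, &outputs);  // `&outputs` makes the in-place write explicit
  return static_cast<int>(outputs.size()) == 3 ? 0 : 1;
}
```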
@@ -87,7 +87,7 @@ class ConcatGradKernel : public framework::OpKernel<T> {
       auto& dev_ctx = ctx.template device_context<DeviceContext>();
       paddle::operators::math::ConcatGradFunctor<DeviceContext, T>
           concat_grad_functor;
-      concat_grad_functor(dev_ctx, *in, static_cast<int>(axis), outputs);
+      concat_grad_functor(dev_ctx, *in, static_cast<int>(axis), &outputs);
     }
   }
 };
......
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/math/concat.h"
+#include <vector>
 namespace paddle {
 namespace operators {
@@ -70,20 +71,20 @@ class ConcatGradFunctor<platform::CPUDeviceContext, T> {
  public:
   void operator()(const platform::CPUDeviceContext& context,
                   const framework::Tensor& input, const int axis,
-                  std::vector<framework::Tensor>& outputs) {
+                  std::vector<framework::Tensor>* outputs) {
     // TODO(zcd): Add input data validity checking
-    int num = outputs.size();
+    int num = outputs->size();
     int input_rows = 1;
-    auto dim_0 = outputs[0].dims();
+    auto dim_0 = outputs->at(0).dims();
     for (int i = 0; i < axis; ++i) {
       input_rows *= dim_0[i];
     }
     int input_cols = 0;
-    std::vector<int64_t> output_cols(outputs.size());
+    std::vector<int64_t> output_cols(outputs->size());
     for (int i = 0; i < num; ++i) {
-      int t_cols = outputs[i].numel() / input_rows;
+      int t_cols = outputs->at(i).numel() / input_rows;
       input_cols += t_cols;
       output_cols[i] = t_cols;
     }
@@ -95,7 +96,7 @@ class ConcatGradFunctor<platform::CPUDeviceContext, T> {
       int col_idx = 0;
       for (int j = 0; j < num; ++j) {
         int col_len = output_cols[j];
-        T* dst_ptr = outputs[j].data<T>() + k * col_len;
+        T* dst_ptr = outputs->at(j).data<T>() + k * col_len;
         memory::Copy(cpu_place, dst_ptr, cpu_place, src_ptr + col_idx,
                      sizeof(T) * col_len);
         col_idx += col_len;
......
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#include <algorithm>
+#include <vector>
 #include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/fluid/operators/math/concat.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
@@ -202,16 +204,16 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> {
  public:
   void operator()(const platform::CUDADeviceContext& context,
                   const framework::Tensor& input, const int axis,
-                  std::vector<framework::Tensor>& outputs) {
+                  std::vector<framework::Tensor>* outputs) {
     // TODO(zcd): Add input data validity checking
-    int o_num = outputs.size();
+    int o_num = outputs->size();
     int out_row = 1;
-    auto dim_0 = outputs[0].dims();
+    auto dim_0 = outputs->at(0).dims();
     for (int i = 0; i < axis; ++i) {
       out_row *= dim_0[i];
     }
-    int out_col = outputs[0].numel() / out_row;
+    int out_col = outputs->at(0).numel() / out_row;
     int in_col = 0, in_row = out_row;
     bool sameShape = true;
@@ -221,13 +223,13 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> {
     outputs_cols[0] = 0;
     for (int i = 0; i < o_num; ++i) {
-      int t_col = outputs[i].numel() / out_row;
+      int t_col = outputs->at(i).numel() / out_row;
       if (sameShape) {
         if (t_col != out_col) sameShape = false;
       }
       in_col += t_col;
       outputs_cols[i + 1] = in_col;
-      outputs_ptr[i] = outputs[i].data<T>();
+      outputs_ptr[i] = outputs->at(i).data<T>();
     }
     T** dev_out_gpu_data =
......
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
+#include <vector>
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/tensor.h"
@@ -56,7 +57,7 @@ template <typename DeviceContext, typename T>
 class ConcatGradFunctor {
  public:
   void operator()(const DeviceContext& context, const framework::Tensor& input,
-                  const int axis, std::vector<framework::Tensor>& outputs);
+                  const int axis, std::vector<framework::Tensor>* outputs);
 };
 }  // namespace math
......
@@ -89,14 +89,14 @@ void hl_avx_gru_forward_reset_output(OpResetOutput op_reset_output,
   __m256 r_value_reset_gate;
   __m256 r_value_reset_output;
   __m256 r_prev_out = _mm256_set1_ps(0.0f);
-  __m256 *update_gate = (__m256 *)gate_value;
-  __m256 *reset_gate = (__m256 *)(gate_value + frame_size);
+  __m256 *update_gate = reinterpret_cast<__m256 *>(gate_value);
+  __m256 *reset_gate = reinterpret_cast<__m256 *>(gate_value + frame_size);
   for (int i = 0; i < frame_size / 8; i++) {
     r_value_update_gate = update_gate[i];
     r_value_reset_gate = reset_gate[i];
     if (prev_output_value) {
-      r_prev_out = ((__m256 *)prev_output_value)[i];
+      r_prev_out = (reinterpret_cast<__m256 *>(prev_output_value))[i];
     }
     op_reset_output(r_value_update_gate, r_value_reset_gate, r_prev_out,
@@ -104,7 +104,7 @@ void hl_avx_gru_forward_reset_output(OpResetOutput op_reset_output,
     update_gate[i] = r_value_update_gate;
     reset_gate[i] = r_value_reset_gate;
-    ((__m256 *)reset_output_value)[i] = r_value_reset_output;
+    (reinterpret_cast<__m256 *>(reset_output_value))[i] = r_value_reset_output;
   }
 #endif
 }
@@ -119,21 +119,21 @@ void hl_avx_gru_forward_final_output(OpFinalOutput op_final_output,
   __m256 r_value_frame_state;
   __m256 r_prev_out = _mm256_set1_ps(0.0f);
   __m256 r_output;
-  __m256 *update_gate = (__m256 *)gate_value;
-  __m256 *frame_state = (__m256 *)(gate_value + frame_size * 2);
+  __m256 *update_gate = reinterpret_cast<__m256 *>(gate_value);
+  __m256 *frame_state = reinterpret_cast<__m256 *>(gate_value + frame_size * 2);
   for (int i = 0; i < frame_size / 8; i++) {
     r_value_update_gate = update_gate[i];
     r_value_frame_state = frame_state[i];
     if (prev_output_value) {
-      r_prev_out = ((__m256 *)prev_output_value)[i];
+      r_prev_out = (reinterpret_cast<__m256 *>(prev_output_value))[i];
     }
     op_final_output(r_value_update_gate, r_value_frame_state, r_prev_out,
                     r_output, active_node);
     frame_state[i] = r_value_frame_state;
-    ((__m256 *)output_value)[i] = r_output;
+    (reinterpret_cast<__m256 *>(output_value))[i] = r_output;
   }
 #endif
 }
@@ -284,20 +284,22 @@ void hl_avx_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value,
   __m256 r_out_grad;
   __m256 r_prev_out_value = _mm256_set1_ps(0.0f);
   __m256 r_prev_out_grad = _mm256_set1_ps(0.0f);
-  __m256 *update_gate_value = (__m256 *)gate_value;
-  __m256 *update_gate_grad = (__m256 *)gate_grad;
-  __m256 *frame_state_value = (__m256 *)(gate_value + frame_size * 2);
-  __m256 *frame_state_grad = (__m256 *)(gate_grad + frame_size * 2);
+  __m256 *update_gate_value = reinterpret_cast<__m256 *>(gate_value);
+  __m256 *update_gate_grad = reinterpret_cast<__m256 *>(gate_grad);
+  __m256 *frame_state_value =
+      reinterpret_cast<__m256 *>(gate_value + frame_size * 2);
+  __m256 *frame_state_grad =
+      reinterpret_cast<__m256 *>(gate_grad + frame_size * 2);
   for (int i = 0; i < frame_size / 8; i++) {
     r_update_gate_value = update_gate_value[i];
     r_frame_state_value = frame_state_value[i];
-    r_out_grad = ((__m256 *)output_grad)[i];
+    r_out_grad = (reinterpret_cast<__m256 *>(output_grad))[i];
     if (prev_out_value) {
-      r_prev_out_value = ((__m256 *)prev_out_value)[i];
+      r_prev_out_value = (reinterpret_cast<__m256 *>(prev_out_value))[i];
    }
    if (prev_out_grad) {
-      r_prev_out_grad = ((__m256 *)prev_out_grad)[i];
+      r_prev_out_grad = (reinterpret_cast<__m256 *>(prev_out_grad))[i];
    }
    op_state_grad(r_update_gate_value, r_update_gate_grad, r_frame_state_value,
@@ -307,7 +309,7 @@ void hl_avx_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value,
     update_gate_grad[i] = r_update_gate_grad;
     frame_state_grad[i] = r_frame_state_grad;
     if (prev_out_grad) {
-      ((__m256 *)prev_out_grad)[i] = r_prev_out_grad;
+      (reinterpret_cast<__m256 *>(prev_out_grad))[i] = r_prev_out_grad;
     }
   }
 #endif
@@ -327,10 +329,11 @@ void hl_avx_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value,
   __m256 r_reset_output_grad = _mm256_set1_ps(0.0f);
   __m256 r_prev_out_value = _mm256_set1_ps(0.0f);
   __m256 r_prev_out_grad = _mm256_set1_ps(0.0f);
-  __m256 *update_gate_value = (__m256 *)gate_value;
-  __m256 *update_gate_grad = (__m256 *)gate_grad;
-  __m256 *reset_gate_value = (__m256 *)(gate_value + frame_size);
-  __m256 *reset_gate_grad = (__m256 *)(gate_grad + frame_size);
+  __m256 *update_gate_value = reinterpret_cast<__m256 *>(gate_value);
+  __m256 *update_gate_grad = reinterpret_cast<__m256 *>(gate_grad);
+  __m256 *reset_gate_value =
+      reinterpret_cast<__m256 *>(gate_value + frame_size);
+  __m256 *reset_gate_grad = reinterpret_cast<__m256 *>(gate_grad + frame_size);
   for (int i = 0; i < frame_size / 8; i++) {
     r_update_gate_value = update_gate_value[i];
@@ -338,13 +341,13 @@ void hl_avx_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value,
     r_reset_gate_value = reset_gate_value[i];
     if (prev_out_value && prev_out_grad) {
-      r_reset_output_grad = ((__m256 *)reset_output_grad)[i];
+      r_reset_output_grad = (reinterpret_cast<__m256 *>(reset_output_grad))[i];
     }
     if (prev_out_value) {
-      r_prev_out_value = ((__m256 *)prev_out_value)[i];
+      r_prev_out_value = (reinterpret_cast<__m256 *>(prev_out_value))[i];
     }
     if (prev_out_grad) {
-      r_prev_out_grad = ((__m256 *)prev_out_grad)[i];
+      r_prev_out_grad = (reinterpret_cast<__m256 *>(prev_out_grad))[i];
     }
     op_reset_grad(r_update_gate_value, r_update_gate_grad, r_reset_gate_value,
@@ -354,7 +357,7 @@ void hl_avx_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value,
     update_gate_grad[i] = r_update_gate_grad;
     reset_gate_grad[i] = r_reset_gate_grad;
     if (prev_out_grad) {
-      ((__m256 *)prev_out_grad)[i] = r_prev_out_grad;
+      (reinterpret_cast<__m256 *>(prev_out_grad))[i] = r_prev_out_grad;
     }
   }
 #endif
......
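For reference, the cast change in the AVX GRU kernels above (and in the LSTM kernels that follow) is purely syntactic: the same raw buffer is still reinterpreted as packed `__m256` lanes and processed eight floats at a time, only the C-style cast is now spelled as `reinterpret_cast`. A small stand-alone sketch of that pattern, assuming an AVX-capable build (e.g. compiled with `-mavx`); `add_one_avx` is illustrative, not code from the patch:

```cpp
#include <immintrin.h>
#include <cstdio>

// Reinterpret a float buffer as __m256 lanes and process 8 floats per step,
// the same access pattern the GRU/LSTM kernels use. The buffer length must
// be a multiple of eight floats, ideally 32-byte aligned.
void add_one_avx(float *data, int frame_size) {
  __m256 *lanes = reinterpret_cast<__m256 *>(data);  // was: (__m256 *)data
  const __m256 ones = _mm256_set1_ps(1.0f);
  for (int i = 0; i < frame_size / 8; i++) {
    lanes[i] = _mm256_add_ps(lanes[i], ones);
  }
}

int main() {
  alignas(32) float buf[16];
  for (int i = 0; i < 16; ++i) buf[i] = static_cast<float>(i);
  add_one_avx(buf, 16);
  std::printf("%f %f\n", buf[0], buf[15]);  // prints 1.000000 16.000000
  return 0;
}
```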
@@ -164,10 +164,12 @@ void avx_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
   __m256 r_state_atv;
   __m256 r_out;
-  __m256 *value_in = (__m256 *)value.gate_value;
-  __m256 *value_ig = (__m256 *)(value.gate_value + frame_size);
-  __m256 *value_fg = (__m256 *)(value.gate_value + frame_size * 2);
-  __m256 *value_og = (__m256 *)(value.gate_value + frame_size * 3);
+  __m256 *value_in = reinterpret_cast<__m256 *>(value.gate_value);
+  __m256 *value_ig = reinterpret_cast<__m256 *>(value.gate_value + frame_size);
+  __m256 *value_fg =
+      reinterpret_cast<__m256 *>(value.gate_value + frame_size * 2);
+  __m256 *value_og =
+      reinterpret_cast<__m256 *>(value.gate_value + frame_size * 3);
   for (int i = 0; i < frame_size / 8; i++) {
     r_value_in = value_in[i];
@@ -175,13 +177,13 @@ void avx_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
     r_value_fg = value_fg[i];
     r_value_og = value_og[i];
     if (value.check_ig) {
-      r_checkI = ((__m256 *)value.check_ig)[i];
-      r_checkF = ((__m256 *)value.check_fg)[i];
-      r_checkO = ((__m256 *)value.check_og)[i];
+      r_checkI = (reinterpret_cast<__m256 *>(value.check_ig))[i];
+      r_checkF = (reinterpret_cast<__m256 *>(value.check_fg))[i];
+      r_checkO = (reinterpret_cast<__m256 *>(value.check_og))[i];
     }
     if (value.prev_state_value) {
-      r_prev_state = ((__m256 *)value.prev_state_value)[i];
+      r_prev_state = (reinterpret_cast<__m256 *>(value.prev_state_value))[i];
     }
     op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_prev_state, r_state,
@@ -192,9 +194,9 @@ void avx_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
     value_ig[i] = r_value_ig;
     value_fg[i] = r_value_fg;
     value_og[i] = r_value_og;
-    ((__m256 *)value.state_value)[i] = r_state;
-    ((__m256 *)value.state_active_value)[i] = r_state_atv;
-    ((__m256 *)value.output_value)[i] = r_out;
+    (reinterpret_cast<__m256 *>(value.state_value))[i] = r_state;
+    (reinterpret_cast<__m256 *>(value.state_active_value))[i] = r_state_atv;
+    (reinterpret_cast<__m256 *>(value.output_value))[i] = r_out;
   }
 #endif
 }
@@ -227,14 +229,16 @@ void avx_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
   __m256 r_checkFGrad;
   __m256 r_checkOGrad;
-  __m256 *value_in = (__m256 *)value.gate_value;
-  __m256 *value_ig = (__m256 *)(value.gate_value + frame_size);
-  __m256 *value_fg = (__m256 *)(value.gate_value + frame_size * 2);
-  __m256 *value_og = (__m256 *)(value.gate_value + frame_size * 3);
-  __m256 *grad_in = (__m256 *)grad.gate_grad;
-  __m256 *grad_ig = (__m256 *)(grad.gate_grad + frame_size);
-  __m256 *grad_fg = (__m256 *)(grad.gate_grad + frame_size * 2);
-  __m256 *grad_og = (__m256 *)(grad.gate_grad + frame_size * 3);
+  __m256 *value_in = reinterpret_cast<__m256 *>(value.gate_value);
+  __m256 *value_ig = reinterpret_cast<__m256 *>(value.gate_value + frame_size);
+  __m256 *value_fg =
+      reinterpret_cast<__m256 *>(value.gate_value + frame_size * 2);
+  __m256 *value_og =
+      reinterpret_cast<__m256 *>(value.gate_value + frame_size * 3);
+  __m256 *grad_in = reinterpret_cast<__m256 *>(grad.gate_grad);
+  __m256 *grad_ig = reinterpret_cast<__m256 *>(grad.gate_grad + frame_size);
+  __m256 *grad_fg = reinterpret_cast<__m256 *>(grad.gate_grad + frame_size * 2);
+  __m256 *grad_og = reinterpret_cast<__m256 *>(grad.gate_grad + frame_size * 3);
   for (int i = 0; i < frame_size / 8; i++) {
     r_value_in = value_in[i];
@@ -242,16 +246,16 @@ void avx_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
     r_value_fg = value_fg[i];
     r_value_og = value_og[i];
     if (value.check_ig) {
-      r_checkI = ((__m256 *)value.check_ig)[i];
-      r_checkF = ((__m256 *)value.check_fg)[i];
-      r_checkO = ((__m256 *)value.check_og)[i];
+      r_checkI = (reinterpret_cast<__m256 *>(value.check_ig))[i];
+      r_checkF = (reinterpret_cast<__m256 *>(value.check_fg))[i];
+      r_checkO = (reinterpret_cast<__m256 *>(value.check_og))[i];
     }
-    r_state = ((__m256 *)value.state_value)[i];
-    r_state_atv = ((__m256 *)value.state_active_value)[i];
-    r_output_grad = ((__m256 *)grad.output_grad)[i];
-    r_state_grad = ((__m256 *)grad.state_grad)[i];
+    r_state = (reinterpret_cast<__m256 *>(value.state_value))[i];
+    r_state_atv = (reinterpret_cast<__m256 *>(value.state_active_value))[i];
+    r_output_grad = (reinterpret_cast<__m256 *>(grad.output_grad))[i];
+    r_state_grad = (reinterpret_cast<__m256 *>(grad.state_grad))[i];
     if (value.prev_state_value) {
-      r_prev_state = ((__m256 *)value.prev_state_value)[i];
+      r_prev_state = (reinterpret_cast<__m256 *>(value.prev_state_value))[i];
     }
     op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_grad_in, r_grad_ig,
@@ -264,15 +268,18 @@ void avx_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
     grad_ig[i] = r_grad_ig;
     grad_fg[i] = r_grad_fg;
     grad_og[i] = r_grad_og;
-    ((__m256 *)grad.state_grad)[i] = r_state_grad;
+    (reinterpret_cast<__m256 *>(grad.state_grad))[i] = r_state_grad;
     if (grad.prev_state_grad)
-      ((__m256 *)grad.prev_state_grad)[i] = r_prev_state_grad;
+      (reinterpret_cast<__m256 *>(grad.prev_state_grad))[i] = r_prev_state_grad;
     if (value.prev_state_value) {
-      if (grad.check_ig_grad) ((__m256 *)grad.check_ig_grad)[i] += r_checkIGrad;
-      if (grad.check_fg_grad) ((__m256 *)grad.check_fg_grad)[i] += r_checkFGrad;
+      if (grad.check_ig_grad)
+        (reinterpret_cast<__m256 *>(grad.check_ig_grad))[i] += r_checkIGrad;
+      if (grad.check_fg_grad)
+        (reinterpret_cast<__m256 *>(grad.check_fg_grad))[i] += r_checkFGrad;
     }
-    if (grad.check_og_grad) ((__m256 *)grad.check_og_grad)[i] += r_checkOGrad;
+    if (grad.check_og_grad)
+      (reinterpret_cast<__m256 *>(grad.check_og_grad))[i] += r_checkOGrad;
   }
 #endif
 }
......
@@ -23,32 +23,29 @@ void fill_fp16_data(paddle::platform::float16* in_ptr, size_t size,
 }
 TEST(math_function, notrans_mul_trans_fp32) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-  Tensor input1;
-  Tensor input1_gpu;
-  Tensor input2_gpu;
-  Tensor out_gpu;
-  Tensor out;
-  CPUPlace cpu_place;
-  CUDAPlace gpu_place(0);
-  CUDADeviceContext context(gpu_place);
+  paddle::framework::Tensor input1;
+  paddle::framework::Tensor input1_gpu;
+  paddle::framework::Tensor input2_gpu;
+  paddle::framework::Tensor out_gpu;
+  paddle::framework::Tensor out;
+  paddle::platform::CPUPlace cpu_place;
+  paddle::platform::CUDAPlace gpu_place(0);
+  paddle::platform::CUDADeviceContext context(gpu_place);
   float* input1_ptr = input1.mutable_data<float>({2, 3}, cpu_place);
   float arr[6] = {0, 1, 2, 3, 4, 5};
   memcpy(input1_ptr, arr, 6 * sizeof(float));
-  TensorCopySync(input1, gpu_place, &input1_gpu);
-  TensorCopySync(input1, gpu_place, &input2_gpu);
+  paddle::framework::TensorCopySync(input1, gpu_place, &input1_gpu);
+  paddle::framework::TensorCopySync(input1, gpu_place, &input2_gpu);
   out_gpu.mutable_data<float>({2, 2}, gpu_place);
-  paddle::operators::math::matmul<CUDADeviceContext, float>(
+  paddle::operators::math::matmul<paddle::platform::CUDADeviceContext, float>(
       context, input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0);
-  TensorCopySync(out_gpu, cpu_place, &out);
+  paddle::framework::TensorCopySync(out_gpu, cpu_place, &out);
   float* out_ptr = out.data<float>();
   context.Wait();
@@ -59,39 +56,38 @@ TEST(math_function, notrans_mul_trans_fp32) {
 }
 TEST(math_function, notrans_mul_trans_fp16) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-  Tensor input1;
-  Tensor input1_gpu;
-  Tensor input2_gpu;
-  Tensor out_gpu;
-  Tensor out;
-  CPUPlace cpu_place;
-  CUDAPlace gpu_place(0);
-  CUDADeviceContext context(gpu_place);
+  paddle::framework::Tensor input1;
+  paddle::framework::Tensor input1_gpu;
+  paddle::framework::Tensor input2_gpu;
+  paddle::framework::Tensor out_gpu;
+  paddle::framework::Tensor out;
+  paddle::platform::CPUPlace cpu_place;
+  paddle::platform::CUDAPlace gpu_place(0);
+  paddle::platform::CUDADeviceContext context(gpu_place);
   // fp16 GEMM in cublas requires GPU compute capability >= 53
   if (context.GetComputeCapability() < 53) {
     return;
   }
-  float16* input1_ptr = input1.mutable_data<float16>({2, 3}, cpu_place);
+  paddle::platform::float16* input1_ptr =
+      input1.mutable_data<paddle::platform::float16>({2, 3}, cpu_place);
   fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5});
-  TensorCopySync(input1, gpu_place, &input1_gpu);
-  TensorCopySync(input1, gpu_place, &input2_gpu);
-  out_gpu.mutable_data<float16>({2, 2}, gpu_place);
-  paddle::operators::math::matmul<CUDADeviceContext, float16>(
-      context, input1_gpu, false, input2_gpu, true, float16(1), &out_gpu,
-      float16(0));
-  TensorCopySync(out_gpu, cpu_place, &out);
-  float16* out_ptr = out.data<float16>();
+  paddle::framework::TensorCopySync(input1, gpu_place, &input1_gpu);
+  paddle::framework::TensorCopySync(input1, gpu_place, &input2_gpu);
+  out_gpu.mutable_data<paddle::platform::float16>({2, 2}, gpu_place);
+  paddle::operators::math::matmul<paddle::platform::CUDADeviceContext,
+                                  paddle::platform::float16>(
+      context, input1_gpu, false, input2_gpu, true,
+      paddle::platform::float16(1), &out_gpu, paddle::platform::float16(0));
+  paddle::framework::TensorCopySync(out_gpu, cpu_place, &out);
+  paddle::platform::float16* out_ptr = out.data<paddle::platform::float16>();
   context.Wait();
   EXPECT_EQ(static_cast<float>(out_ptr[0]), 5);
   EXPECT_EQ(static_cast<float>(out_ptr[1]), 14);
@@ -100,32 +96,29 @@ TEST(math_function, notrans_mul_trans_fp16) {
 }
 TEST(math_function, trans_mul_notrans_fp32) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-  Tensor input1;
-  Tensor input1_gpu;
-  Tensor input2_gpu;
-  Tensor out_gpu;
-  Tensor out;
-  CPUPlace cpu_place;
-  CUDAPlace gpu_place(0);
-  CUDADeviceContext context(gpu_place);
+  paddle::framework::Tensor input1;
+  paddle::framework::Tensor input1_gpu;
+  paddle::framework::Tensor input2_gpu;
+  paddle::framework::Tensor out_gpu;
+  paddle::framework::Tensor out;
+  paddle::platform::CPUPlace cpu_place;
+  paddle::platform::CUDAPlace gpu_place(0);
+  paddle::platform::CUDADeviceContext context(gpu_place);
   float* input1_ptr = input1.mutable_data<float>({2, 3}, cpu_place);
   float arr[6] = {0, 1, 2, 3, 4, 5};
   memcpy(input1_ptr, arr, 6 * sizeof(float));
-  TensorCopySync(input1, gpu_place, &input1_gpu);
-  TensorCopySync(input1, gpu_place, &input2_gpu);
+  paddle::framework::TensorCopySync(input1, gpu_place, &input1_gpu);
+  paddle::framework::TensorCopySync(input1, gpu_place, &input2_gpu);
   out_gpu.mutable_data<float>({3, 3}, gpu_place);
   paddle::operators::math::matmul<paddle::platform::CUDADeviceContext, float>(
       context, input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0);
-  TensorCopySync(out_gpu, cpu_place, &out);
+  paddle::framework::TensorCopySync(out_gpu, cpu_place, &out);
   float* out_ptr = out.data<float>();
   context.Wait();
@@ -141,39 +134,38 @@ TEST(math_function, trans_mul_notrans_fp32) {
 }
 TEST(math_function, trans_mul_notrans_fp16) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-  Tensor input1;
-  Tensor input1_gpu;
-  Tensor input2_gpu;
-  Tensor out_gpu;
-  Tensor out;
-  CPUPlace cpu_place;
-  CUDAPlace gpu_place(0);
-  CUDADeviceContext context(gpu_place);
+  paddle::framework::Tensor input1;
+  paddle::framework::Tensor input1_gpu;
+  paddle::framework::Tensor input2_gpu;
+  paddle::framework::Tensor out_gpu;
+  paddle::framework::Tensor out;
+  paddle::platform::CPUPlace cpu_place;
+  paddle::platform::CUDAPlace gpu_place(0);
+  paddle::platform::CUDADeviceContext context(gpu_place);
   // fp16 GEMM in cublas requires GPU compute capability >= 53
   if (context.GetComputeCapability() < 53) {
     return;
   }
-  float16* input1_ptr = input1.mutable_data<float16>({2, 3}, cpu_place);
+  paddle::platform::float16* input1_ptr =
+      input1.mutable_data<paddle::platform::float16>({2, 3}, cpu_place);
   fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5});
-  TensorCopySync(input1, gpu_place, &input1_gpu);
-  TensorCopySync(input1, gpu_place, &input2_gpu);
-  out_gpu.mutable_data<float16>({3, 3}, gpu_place);
-  paddle::operators::math::matmul<paddle::platform::CUDADeviceContext, float16>(
-      context, input1_gpu, true, input2_gpu, false, float16(1), &out_gpu,
-      float16(0));
-  TensorCopySync(out_gpu, cpu_place, &out);
-  float16* out_ptr = out.data<float16>();
+  paddle::framework::TensorCopySync(input1, gpu_place, &input1_gpu);
+  paddle::framework::TensorCopySync(input1, gpu_place, &input2_gpu);
+  out_gpu.mutable_data<paddle::platform::float16>({3, 3}, gpu_place);
+  paddle::operators::math::matmul<paddle::platform::CUDADeviceContext,
+                                  paddle::platform::float16>(
+      context, input1_gpu, true, input2_gpu, false,
+      paddle::platform::float16(1), &out_gpu, paddle::platform::float16(0));
+  paddle::framework::TensorCopySync(out_gpu, cpu_place, &out);
+  paddle::platform::float16* out_ptr = out.data<paddle::platform::float16>();
   context.Wait();
   EXPECT_EQ(static_cast<float>(out_ptr[0]), 9);
   EXPECT_EQ(static_cast<float>(out_ptr[1]), 12);
@@ -187,19 +179,16 @@ TEST(math_function, trans_mul_notrans_fp16) {
 }
 TEST(math_function, gemm_notrans_cublas_fp32) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-  Tensor input1;
-  Tensor input2;
-  Tensor input3;
-  Tensor input1_gpu;
-  Tensor input2_gpu;
-  Tensor input3_gpu;
-  CPUPlace cpu_place;
-  CUDAPlace gpu_place(0);
-  CUDADeviceContext context(gpu_place);
+  paddle::framework::Tensor input1;
+  paddle::framework::Tensor input2;
+  paddle::framework::Tensor input3;
+  paddle::framework::Tensor input1_gpu;
+  paddle::framework::Tensor input2_gpu;
+  paddle::framework::Tensor input3_gpu;
+  paddle::platform::CPUPlace cpu_place;
+  paddle::platform::CUDAPlace gpu_place(0);
+  paddle::platform::CUDADeviceContext context(gpu_place);
   int m = 2;
   int n = 3;
@@ -214,9 +203,9 @@ TEST(math_function, gemm_notrans_cublas_fp32) {
   float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7};
   memcpy(input3_ptr, arr3, 8 * sizeof(float));
-  TensorCopySync(input1, gpu_place, &input1_gpu);
-  TensorCopySync(input2, gpu_place, &input2_gpu);
-  TensorCopySync(input3, gpu_place, &input3_gpu);
+  paddle::framework::TensorCopySync(input1, gpu_place, &input1_gpu);
+  paddle::framework::TensorCopySync(input2, gpu_place, &input2_gpu);
+  paddle::framework::TensorCopySync(input3, gpu_place, &input3_gpu);
   float* a = input1_gpu.data<float>();
   float* b = input2_gpu.data<float>();
   float* c = input3_gpu.mutable_data<float>(gpu_place);
@@ -224,7 +213,7 @@ TEST(math_function, gemm_notrans_cublas_fp32) {
   paddle::operators::math::gemm<paddle::platform::CUDADeviceContext, float>(
       context, false, false, m, n, k, 1, a, 3, b + 1, 4, 1, c + 1, 4);
-  TensorCopySync(input3_gpu, cpu_place, &input3);
+  paddle::framework::TensorCopySync(input3_gpu, cpu_place, &input3);
   // numpy code:
   // a = np.arange(6).reshape(2, 3)
@@ -244,19 +233,16 @@ TEST(math_function, gemm_notrans_cublas_fp32) {
 }
 TEST(math_function, gemm_notrans_cublas_fp16) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-  Tensor input1;
-  Tensor input2;
-  Tensor input3;
-  Tensor input1_gpu;
-  Tensor input2_gpu;
-  Tensor input3_gpu;
-  CPUPlace cpu_place;
-  CUDAPlace gpu_place(0);
-  CUDADeviceContext context(gpu_place);
+  paddle::framework::Tensor input1;
+  paddle::framework::Tensor input2;
+  paddle::framework::Tensor input3;
+  paddle::framework::Tensor input1_gpu;
+  paddle::framework::Tensor input2_gpu;
+  paddle::framework::Tensor input3_gpu;
+  paddle::platform::CPUPlace cpu_place;
+  paddle::platform::CUDAPlace gpu_place(0);
+  paddle::platform::CUDADeviceContext context(gpu_place);
   // fp16 GEMM in cublas requires GPU compute capability >= 53
   if (context.GetComputeCapability() < 53) {
@@ -266,26 +252,31 @@ TEST(math_function, gemm_notrans_cublas_fp16) {
   int m = 2;
   int n = 3;
   int k = 3;
-  float16* input1_ptr = input1.mutable_data<float16>({2, 3}, cpu_place);
+  paddle::platform::float16* input1_ptr =
+      input1.mutable_data<paddle::platform::float16>({2, 3}, cpu_place);
   fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5});
-  float16* input2_ptr = input2.mutable_data<float16>({3, 4}, cpu_place);
+  paddle::platform::float16* input2_ptr =
+      input2.mutable_data<paddle::platform::float16>({3, 4}, cpu_place);
   fill_fp16_data(input2_ptr, input2.numel(),
                  {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11});
-  float16* input3_ptr = input3.mutable_data<float16>({2, 4}, cpu_place);
+  paddle::platform::float16* input3_ptr =
+      input3.mutable_data<paddle::platform::float16>({2, 4}, cpu_place);
   fill_fp16_data(input3_ptr, input3.numel(), {0, 1, 2, 3, 4, 5, 6, 7});
-  TensorCopySync(input1, gpu_place, &input1_gpu);
-  TensorCopySync(input2, gpu_place, &input2_gpu);
-  TensorCopySync(input3, gpu_place, &input3_gpu);
-  float16* a = input1_gpu.data<float16>();
-  float16* b = input2_gpu.data<float16>();
-  float16* c = input3_gpu.mutable_data<float16>(gpu_place);
+  paddle::framework::TensorCopySync(input1, gpu_place, &input1_gpu);
+  paddle::framework::TensorCopySync(input2, gpu_place, &input2_gpu);
+  paddle::framework::TensorCopySync(input3, gpu_place, &input3_gpu);
+  paddle::platform::float16* a = input1_gpu.data<paddle::platform::float16>();
+  paddle::platform::float16* b = input2_gpu.data<paddle::platform::float16>();
+  paddle::platform::float16* c =
+      input3_gpu.mutable_data<paddle::platform::float16>(gpu_place);
-  paddle::operators::math::gemm<paddle::platform::CUDADeviceContext, float16>(
-      context, false, false, m, n, k, float16(1), a, 3, b + 1, 4, float16(1),
-      c + 1, 4);
+  paddle::operators::math::gemm<paddle::platform::CUDADeviceContext,
+                                paddle::platform::float16>(
+      context, false, false, m, n, k, paddle::platform::float16(1), a, 3, b + 1,
+      4, paddle::platform::float16(1), c + 1, 4);
-  TensorCopySync(input3_gpu, cpu_place, &input3);
+  paddle::framework::TensorCopySync(input3_gpu, cpu_place, &input3);
   // numpy code:
   // a = np.arange(6).reshape(2, 3)
@@ -305,19 +296,16 @@ TEST(math_function, gemm_notrans_cublas_fp16) {
 }
 TEST(math_function, gemm_trans_cublas_fp32) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-  Tensor input1;
-  Tensor input2;
-  Tensor input3;
-  Tensor input1_gpu;
-  Tensor input2_gpu;
-  Tensor input3_gpu;
-  CPUPlace cpu_place;
-  CUDAPlace gpu_place(0);
-  CUDADeviceContext context(gpu_place);
+  paddle::framework::Tensor input1;
+  paddle::framework::Tensor input2;
+  paddle::framework::Tensor input3;
+  paddle::framework::Tensor input1_gpu;
+  paddle::framework::Tensor input2_gpu;
+  paddle::framework::Tensor input3_gpu;
+  paddle::platform::CPUPlace cpu_place;
+  paddle::platform::CUDAPlace gpu_place(0);
+  paddle::platform::CUDADeviceContext context(gpu_place);
   int m = 2;
   int n = 3;
@@ -332,9 +320,9 @@ TEST(math_function, gemm_trans_cublas_fp32) {
   float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7};
   memcpy(input3_ptr, arr3, 8 * sizeof(float));
-  TensorCopySync(input1, gpu_place, &input1_gpu);
-  TensorCopySync(input2, gpu_place, &input2_gpu);
-  TensorCopySync(input3, gpu_place, &input3_gpu);
+  paddle::framework::TensorCopySync(input1, gpu_place, &input1_gpu);
+  paddle::framework::TensorCopySync(input2, gpu_place, &input2_gpu);
+  paddle::framework::TensorCopySync(input3, gpu_place, &input3_gpu);
   float* a = input1_gpu.data<float>();
   float* b = input2_gpu.data<float>();
   float* c = input3_gpu.mutable_data<float>(gpu_place);
@@ -342,7 +330,7 @@ TEST(math_function, gemm_trans_cublas_fp32) {
   paddle::operators::math::gemm<paddle::platform::CUDADeviceContext, float>(
       context, false, true, m, n, k, 1, a, 3, b + 3, 3, 1, c + 1, 4);
-  TensorCopySync(input3_gpu, cpu_place, &input3);
+  paddle::framework::TensorCopySync(input3_gpu, cpu_place, &input3);
   context.Wait();
   EXPECT_EQ(input3_ptr[0], 0);
@@ -356,19 +344,16 @@ TEST(math_function, gemm_trans_cublas_fp32) {
 }
 TEST(math_function, gemm_trans_cublas_fp16) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-  Tensor input1;
-  Tensor input2;
-  Tensor input3;
-  Tensor input1_gpu;
-  Tensor input2_gpu;
-  Tensor input3_gpu;
-  CPUPlace cpu_place;
-  CUDAPlace gpu_place(0);
-  CUDADeviceContext context(gpu_place);
+  paddle::framework::Tensor input1;
+  paddle::framework::Tensor input2;
+  paddle::framework::Tensor input3;
+  paddle::framework::Tensor input1_gpu;
+  paddle::framework::Tensor input2_gpu;
+  paddle::framework::Tensor input3_gpu;
+  paddle::platform::CPUPlace cpu_place;
+  paddle::platform::CUDAPlace gpu_place(0);
+  paddle::platform::CUDADeviceContext context(gpu_place);
   // fp16 GEMM in cublas requires GPU compute capability >= 53
   if (context.GetComputeCapability() < 53) {
@@ -378,26 +363,31 @@ TEST(math_function, gemm_trans_cublas_fp16) {
   int m = 2;
   int n = 3;
   int k = 3;
-  float16* input1_ptr = input1.mutable_data<float16>({2, 3}, cpu_place);
+  paddle::platform::float16* input1_ptr =
+      input1.mutable_data<paddle::platform::float16>({2, 3}, cpu_place);
   fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5});
-  float16* input2_ptr = input2.mutable_data<float16>({4, 3}, cpu_place);
+  paddle::platform::float16* input2_ptr =
+      input2.mutable_data<paddle::platform::float16>({4, 3}, cpu_place);
   fill_fp16_data(input2_ptr, input2.numel(),
                  {0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11});
-  float16* input3_ptr = input3.mutable_data<float16>({2, 4}, cpu_place);
+  paddle::platform::float16* input3_ptr =
+      input3.mutable_data<paddle::platform::float16>({2, 4}, cpu_place);
   fill_fp16_data(input3_ptr, input3.numel(), {0, 1, 2, 3, 4, 5, 6, 7});
-  TensorCopySync(input1, gpu_place, &input1_gpu);
-  TensorCopySync(input2, gpu_place, &input2_gpu);
-  TensorCopySync(input3, gpu_place, &input3_gpu);
-  float16* a = input1_gpu.data<float16>();
-  float16* b = input2_gpu.data<float16>();
-  float16* c = input3_gpu.mutable_data<float16>(gpu_place);
+  paddle::framework::TensorCopySync(input1, gpu_place, &input1_gpu);
+  paddle::framework::TensorCopySync(input2, gpu_place, &input2_gpu);
+  paddle::framework::TensorCopySync(input3, gpu_place, &input3_gpu);
+  paddle::platform::float16* a = input1_gpu.data<paddle::platform::float16>();
+  paddle::platform::float16* b = input2_gpu.data<paddle::platform::float16>();
+  paddle::platform::float16* c =
+      input3_gpu.mutable_data<paddle::platform::float16>(gpu_place);
-  paddle::operators::math::gemm<paddle::platform::CUDADeviceContext, float16>(
-      context, false, true, m, n, k, float16(1), a, 3, b + 3, 3, float16(1),
-      c + 1, 4);
+  paddle::operators::math::gemm<paddle::platform::CUDADeviceContext,
+                                paddle::platform::float16>(
+      context, false, true, m, n, k, paddle::platform::float16(1), a, 3, b + 3,
+      3, paddle::platform::float16(1), c + 1, 4);
-  TensorCopySync(input3_gpu, cpu_place, &input3);
+  paddle::framework::TensorCopySync(input3_gpu, cpu_place, &input3);
   context.Wait();
   EXPECT_EQ(static_cast<float>(input3_ptr[0]), 0);
@@ -412,24 +402,21 @@ TEST(math_function, gemm_trans_cublas_fp16) {
 template <typename T>
 void GemvTest(int m, int n, bool trans) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-  Tensor mat_a;
-  Tensor vec_b;
-  Tensor vec_c;
-  CPUPlace cpu_place;
-  CUDAPlace gpu_place(0);
-  CUDADeviceContext context(gpu_place);
+  paddle::framework::Tensor mat_a;
+  paddle::framework::Tensor vec_b;
+  paddle::framework::Tensor vec_c;
+  paddle::platform::CPUPlace cpu_place;
+  paddle::platform::CUDAPlace gpu_place(0);
+  paddle::platform::CUDADeviceContext context(gpu_place);
   T* data_a = mat_a.mutable_data<T>({m, n}, cpu_place);
   T* data_b = vec_b.mutable_data<T>({trans ? m : n}, cpu_place);
   T* data_c = vec_c.mutable_data<T>({trans ? n : m}, cpu_place);
-  Tensor g_mat_a;
-  Tensor g_vec_b;
-  Tensor g_vec_c;
+  paddle::framework::Tensor g_mat_a;
+  paddle::framework::Tensor g_vec_b;
+  paddle::framework::Tensor g_vec_c;
   T* g_data_a = g_mat_a.mutable_data<T>(mat_a.dims(), gpu_place);
   T* g_data_b = g_vec_b.mutable_data<T>(vec_b.dims(), gpu_place);
   T* g_data_c = g_vec_c.mutable_data<T>(vec_c.dims(), gpu_place);
@@ -441,14 +428,14 @@ void GemvTest(int m, int n, bool trans) {
     data_b[i] = static_cast<T>(i);
   }
-  TensorCopySync(mat_a, gpu_place, &g_mat_a);
-  TensorCopySync(vec_b, gpu_place, &g_vec_b);
+  paddle::framework::TensorCopySync(mat_a, gpu_place, &g_mat_a);
+  paddle::framework::TensorCopySync(vec_b, gpu_place, &g_vec_b);
-  paddle::operators::math::gemv<CUDADeviceContext, T>(
+  paddle::operators::math::gemv<paddle::platform::CUDADeviceContext, T>(
       context, trans, static_cast<int>(m), static_cast<int>(n), 1., g_data_a,
      g_data_b, 0., g_data_c);
-  TensorCopySync(g_vec_c, cpu_place, &vec_c);
+  paddle::framework::TensorCopySync(g_vec_c, cpu_place, &vec_c);
   if (!trans) {
     for (int i = 0; i < m; ++i) {
......
@@ -12,43 +12,52 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#include <vector>
 #include "gtest/gtest.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
 TEST(selected_rows_functor, gpu_add) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-  using namespace paddle::operators::math;
-  CUDAPlace gpu_place(0);
-  CPUPlace cpu_place;
-  CUDADeviceContext ctx(gpu_place);
-  SetConstant<CUDADeviceContext, float> functor;
+  paddle::platform::CUDAPlace gpu_place(0);
+  paddle::platform::CPUPlace cpu_place;
+  paddle::platform::CUDADeviceContext ctx(gpu_place);
+  paddle::operators::math::SetConstant<paddle::platform::CUDADeviceContext,
+                                       float>
+      functor;
   int64_t height = 10;
   int64_t row_numel = 10;
   std::vector<int64_t> rows1{0, 4, 7};
-  std::unique_ptr<SelectedRows> selected_rows1{new SelectedRows(rows1, height)};
+  std::unique_ptr<paddle::framework::SelectedRows> selected_rows1{
+      new paddle::framework::SelectedRows(rows1, height)};
   auto* in1_value = selected_rows1->mutable_value();
   in1_value->mutable_data<float>(
-      make_ddim({static_cast<int64_t>(rows1.size()), row_numel}), gpu_place);
+      paddle::framework::make_ddim(
+          {static_cast<int64_t>(rows1.size()), row_numel}),
+      gpu_place);
   functor(ctx, in1_value, 1.0);
   std::vector<int64_t> rows2{0, 5, 7, 9};
-  std::unique_ptr<SelectedRows> selected_rows2{new SelectedRows(rows2, height)};
+  std::unique_ptr<paddle::framework::SelectedRows> selected_rows2{
+      new paddle::framework::SelectedRows(rows2, height)};
   auto* in2_value = selected_rows2->mutable_value();
   in2_value->mutable_data<float>(
-      make_ddim({static_cast<int64_t>(rows2.size()), row_numel}), gpu_place);
+      paddle::framework::make_ddim(
+          {static_cast<int64_t>(rows2.size()), row_numel}),
+      gpu_place);
   functor(ctx, in2_value, 2.0);
-  std::unique_ptr<SelectedRows> output{new SelectedRows()};
+  std::unique_ptr<paddle::framework::SelectedRows> output{
+      new paddle::framework::SelectedRows()};
   auto* out_value = output->mutable_value();
-  // simplely concat two SelectedRows
-  out_value->mutable_data<float>(make_ddim({7, 10}), gpu_place);
+  // simply concat two SelectedRows
+  out_value->mutable_data<float>(paddle::framework::make_ddim({7, 10}),
+                                 gpu_place);
-  SelectedRowsAdd<CUDADeviceContext, float> add_functor;
+  paddle::operators::math::SelectedRowsAdd<paddle::platform::CUDADeviceContext,
+                                           float>
+      add_functor;
   add_functor(ctx, *selected_rows1, *selected_rows2, output.get());
   auto out_height = output->height();
@@ -66,8 +75,8 @@ TEST(selected_rows_functor, gpu_add) {
   EXPECT_EQ(out_rows[5], 7);
   EXPECT_EQ(out_rows[6], 9);
-  Tensor out_cpu;
-  TensorCopy(*out_value, cpu_place, ctx, &out_cpu);
+  paddle::framework::Tensor out_cpu;
+  paddle::framework::TensorCopy(*out_value, cpu_place, ctx, &out_cpu);
   ctx.Wait();
   auto* out_cpu_data = out_cpu.data<float>();
@@ -83,18 +92,24 @@ TEST(selected_rows_functor, gpu_add) {
   EXPECT_EQ(out_cpu_data[5 * row_numel + 7], 2.0);
   EXPECT_EQ(out_cpu_data[6 * row_numel + 9], 2.0);
-  std::unique_ptr<Tensor> tensor1{new Tensor()};
-  tensor1->mutable_data<float>(make_ddim({height, row_numel}), gpu_place);
+  std::unique_ptr<paddle::framework::Tensor> tensor1{
+      new paddle::framework::Tensor()};
+  tensor1->mutable_data<float>(
+      paddle::framework::make_ddim({height, row_numel}), gpu_place);
   functor(ctx, tensor1.get(), 3.0);
-  std::unique_ptr<Tensor> tensor2{new Tensor()};
-  tensor2->mutable_data<float>(make_ddim({height, row_numel}), gpu_place);
+  std::unique_ptr<paddle::framework::Tensor> tensor2{
+      new paddle::framework::Tensor()};
+  tensor2->mutable_data<float>(
+      paddle::framework::make_ddim({height, row_numel}), gpu_place);
-  SelectedRowsAddTensor<CUDADeviceContext, float> add_tensor_functor;
+  paddle::operators::math::SelectedRowsAddTensor<
+      paddle::platform::CUDADeviceContext, float>
+      add_tensor_functor;
   add_tensor_functor(ctx, *output, *tensor1, tensor2.get());
-  Tensor tensor2_cpu;
-  TensorCopy(*tensor2, cpu_place, ctx, &tensor2_cpu);
+  paddle::framework::Tensor tensor2_cpu;
+  paddle::framework::TensorCopy(*tensor2, cpu_place, ctx, &tensor2_cpu);
   ctx.Wait();
   auto* tensor2_cpu_data = tensor2_cpu.data<float>();
@@ -115,39 +130,47 @@ TEST(selected_rows_functor, gpu_add) {
 }
 TEST(selected_rows_functor, gpu_add_to) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-  using namespace paddle::operators::math;
-  CUDAPlace gpu_place(0);
-  CPUPlace cpu_place;
-  CUDADeviceContext ctx(gpu_place);
-  SetConstant<CUDADeviceContext, float> functor;
+  paddle::platform::CUDAPlace gpu_place(0);
+  paddle::platform::CPUPlace cpu_place;
+  paddle::platform::CUDADeviceContext ctx(gpu_place);
+  paddle::operators::math::SetConstant<paddle::platform::CUDADeviceContext,
+                                       float>
+      functor;
   int64_t height = 10;
   int64_t row_numel = 10;
   std::vector<int64_t> rows1{0, 4, 7};
-  std::unique_ptr<SelectedRows> selected_rows1{new SelectedRows(rows1, height)};
+  std::unique_ptr<paddle::framework::SelectedRows> selected_rows1{
+      new paddle::framework::SelectedRows(rows1, height)};
   auto* in1_value = selected_rows1->mutable_value();
   in1_value->mutable_data<float>(
-      make_ddim({static_cast<int64_t>(rows1.size()), row_numel}), gpu_place);
+      paddle::framework::make_ddim(
+          {static_cast<int64_t>(rows1.size()), row_numel}),
+      gpu_place);
   functor(ctx, in1_value, 1.0);
   std::vector<int64_t> rows2{0, 5, 7, 9};
-  std::unique_ptr<SelectedRows> selected_rows2{new SelectedRows(rows2, height)};
+  std::unique_ptr<paddle::framework::SelectedRows> selected_rows2{
+      new paddle::framework::SelectedRows(rows2, height)};
   auto* in2_value = selected_rows2->mutable_value();
   in2_value->mutable_data<float>(
-      make_ddim({static_cast<int64_t>(rows2.size()), row_numel}), gpu_place);
+      paddle::framework::make_ddim(
+          {static_cast<int64_t>(rows2.size()), row_numel}),
+      gpu_place);
   functor(ctx, in2_value, 2.0);
-  std::unique_ptr<SelectedRows> output{new SelectedRows()};
+  std::unique_ptr<paddle::framework::SelectedRows> output{
+      new paddle::framework::SelectedRows()};
   output->set_height(height);
   auto* out_value = output->mutable_value();
-  // simplely concat two SelectedRows
-  out_value->mutable_data<float>(make_ddim({7, 10}), gpu_place);
+  // simply concat two SelectedRows
+  out_value->mutable_data<float>(paddle::framework::make_ddim({7, 10}),
+                                 gpu_place);
-  SelectedRowsAddTo<CUDADeviceContext, float> add_to_functor;
+  paddle::operators::math::SelectedRowsAddTo<
+      paddle::platform::CUDADeviceContext, float>
+      add_to_functor;
   add_to_functor(ctx, *selected_rows1, 0, output.get());
   add_to_functor(ctx, *selected_rows2, in1_value->numel(), output.get());
@@ -166,8 +189,8 @@ TEST(selected_rows_functor, gpu_add_to) {
   EXPECT_EQ(out_rows[5], 7);
   EXPECT_EQ(out_rows[6], 9);
-  Tensor out_cpu;
-  TensorCopy(*out_value, cpu_place, ctx, &out_cpu);
+  paddle::framework::Tensor out_cpu;
+  paddle::framework::TensorCopy(*out_value, cpu_place, ctx, &out_cpu);
   ctx.Wait();
   auto* out_cpu_data = out_cpu.data<float>();
@@ -183,15 +206,19 @@ TEST(selected_rows_functor, gpu_add_to) {
   EXPECT_EQ(out_cpu_data[5 * row_numel + 7], 2.0);
   EXPECT_EQ(out_cpu_data[6 * row_numel + 9], 2.0);
-  std::unique_ptr<Tensor> tensor1{new Tensor()};
-  tensor1->mutable_data<float>(make_ddim({height, row_numel}), gpu_place);
+  std::unique_ptr<paddle::framework::Tensor> tensor1{
+      new paddle::framework::Tensor()};
+  tensor1->mutable_data<float>(
+      paddle::framework::make_ddim({height, row_numel}), gpu_place);
   functor(ctx, tensor1.get(), 3.0);
-  SelectedRowsAddToTensor<CUDADeviceContext, float> add_to_tensor_functor;
+  paddle::operators::math::SelectedRowsAddToTensor<
+      paddle::platform::CUDADeviceContext, float>
+      add_to_tensor_functor;
   add_to_tensor_functor(ctx, *output, tensor1.get());
-  Tensor tensor1_cpu;
-  TensorCopy(*tensor1, cpu_place, ctx, &tensor1_cpu);
+  paddle::framework::Tensor tensor1_cpu;
+  paddle::framework::TensorCopy(*tensor1, cpu_place, ctx, &tensor1_cpu);
   ctx.Wait();
   auto* tensor1_cpu_data = tensor1_cpu.data<float>();
......
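The test-file changes above are mechanical as well: each `using namespace paddle::framework;` / `using namespace paddle::platform;` / `using namespace paddle::operators::math;` directive is removed and every name is written out fully qualified, which is what cpplint's build/namespaces check requires. A generic GTest illustration of the same transformation, to be linked against gtest_main; the `demo` namespace and `Tensor` struct are invented for the example:

```cpp
#include <vector>

#include "gtest/gtest.h"

namespace demo {
namespace framework {
struct Tensor {
  std::vector<float> data;
};
}  // namespace framework
}  // namespace demo

// Before the cleanup this test body would start with
//   using namespace demo::framework;
// and then declare `Tensor t;`. cpplint (build/namespaces) forbids the
// using-directive, so the type is named in full instead.
TEST(namespace_style, fully_qualified_names) {
  demo::framework::Tensor t;
  t.data = {0.0f, 1.0f, 2.0f};
  EXPECT_EQ(t.data.size(), 3u);
}
```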