From 7044cfa7c7a134b0ba8cb0b9ab85b8e5b8888b1a Mon Sep 17 00:00:00 2001
From: tensor-tang
Date: Mon, 25 Feb 2019 05:34:04 +0000
Subject: [PATCH] add sgd jitcode and op test

test=develop

---
 paddle/fluid/operators/jit/gen/CMakeLists.txt |   1 +
 paddle/fluid/operators/jit/gen/sgd.cc         | 130 ++++++++++++++++++
 paddle/fluid/operators/jit/gen/sgd.h          |  60 ++++++++
 .../fluid/tests/unittests/test_sgd_op.py      |  29 +++-
 4 files changed, 215 insertions(+), 5 deletions(-)
 create mode 100644 paddle/fluid/operators/jit/gen/sgd.cc
 create mode 100644 paddle/fluid/operators/jit/gen/sgd.h

diff --git a/paddle/fluid/operators/jit/gen/CMakeLists.txt b/paddle/fluid/operators/jit/gen/CMakeLists.txt
index 294f73d964..eb0c03568d 100644
--- a/paddle/fluid/operators/jit/gen/CMakeLists.txt
+++ b/paddle/fluid/operators/jit/gen/CMakeLists.txt
@@ -32,3 +32,4 @@ USE_JITKERNEL_GEN(kSeqPool)
 USE_JITKERNEL_GEN(kHMax)
 USE_JITKERNEL_GEN(kHSum)
 USE_JITKERNEL_GEN(kEmbSeqPool)
+USE_JITKERNEL_GEN(kSgd)
diff --git a/paddle/fluid/operators/jit/gen/sgd.cc b/paddle/fluid/operators/jit/gen/sgd.cc
new file mode 100644
index 0000000000..a745a27f95
--- /dev/null
+++ b/paddle/fluid/operators/jit/gen/sgd.cc
@@ -0,0 +1,130 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. */
+
+#include "paddle/fluid/operators/jit/gen/sgd.h"
+#include <stddef.h>  // offsetof
+#include <vector>
+#include "paddle/fluid/operators/jit/registry.h"
+#include "paddle/fluid/platform/cpu_info.h"
+
+namespace paddle {
+namespace operators {
+namespace jit {
+namespace gen {
+
+void SgdJitCode::genCode() {
+  preCode();
+  constexpr int block = YMM_FLOAT_BLOCK;
+  constexpr int max_num_regs = 7;
+  const int num_block = w_ / block;
+  const int num_groups = num_block / max_num_regs;
+  const size_t block_size = sizeof(float) * block;
+  const size_t width_size = w_ * sizeof(float);
+  std::vector<int> groups(num_groups, max_num_regs);
+  int rest_num_regs = num_block % max_num_regs;
+  if (rest_num_regs > 0) {
+    groups.push_back(rest_num_regs);
+  }
+
+  vbroadcastss(ymm_lr, ptr[param_lr]);
+  // protect rdx
+  mov(reg_ptr_grad_i, param_grad);
+  mov(reg_ptr_rows_i, param_rows);
+
+  mov(reg_rows_size_in_byte,
+      qword[param_attr + offsetof(sgd_attr_t, selected_rows_size)]);
+  mov(rax, sizeof(int64_t));
+  mul(reg_rows_size_in_byte);
+  mov(reg_rows_size_in_byte, rax);
+  add(reg_rows_size_in_byte, reg_ptr_rows_i);
+
+  Label l_next_row;
+  L(l_next_row);
+  {
+    mov(reg_row, qword[reg_ptr_rows_i]);
+    mov(rax, width_size);
+    mul(reg_row);
+    mov(reg_row, rax);
+
+    mov(reg_ptr_param_i, param_param);
+    mov(reg_ptr_out_i, param_out);
+    add(reg_ptr_param_i, reg_row);
+    add(reg_ptr_out_i, reg_row);
+
+    size_t w_offset = 0;
+    for (int num_regs : groups) {
+      // load grad
+      size_t inner_offset = w_offset;
+      for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
+        vmovups(ymm_t(reg_i), ptr[reg_ptr_grad_i + inner_offset]);
+        inner_offset += block_size;
+      }
+
+      // load param
+      inner_offset = w_offset;
+      for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
+        vmovups(ymm_t(reg_i + num_regs), ptr[reg_ptr_param_i + inner_offset]);
+        inner_offset += block_size;
+      }
+
+      // compute out
+      for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
+        vmulps(ymm_t(reg_i), ymm_t(reg_i), ymm_lr);
+        vsubps(ymm_t(reg_i + num_regs), ymm_t(reg_i + num_regs), ymm_t(reg_i));
+      }
+
+      // save out
+      inner_offset = w_offset;
+      for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
+        vmovups(ptr[reg_ptr_out_i + inner_offset], ymm_t(reg_i + num_regs));
+        inner_offset += block_size;
+      }
+      w_offset += (block_size * num_regs);
+    }
+
+    add(reg_ptr_grad_i, width_size);
+    add(reg_ptr_rows_i, sizeof(int64_t));
+    cmp(reg_ptr_rows_i, reg_rows_size_in_byte);
+    jl(l_next_row, T_NEAR);
+  }
+
+  postCode();
+}
+
+class SgdCreator : public JitCodeCreator<sgd_attr_t> {
+ public:
+  bool UseMe(const sgd_attr_t& attr) const override {
+    return platform::MayIUse(platform::avx) &&
+           attr.grad_width % YMM_FLOAT_BLOCK == 0;
+  }
+  size_t CodeSize(const sgd_attr_t& attr) const override {
+    return 96 + (attr.grad_width / YMM_FLOAT_BLOCK) * 32 * 8;
+  }
+  std::unique_ptr<GenBase> CreateJitCode(
+      const sgd_attr_t& attr) const override {
+    PADDLE_ENFORCE_EQ(attr.param_width, attr.grad_width);
+    PADDLE_ENFORCE_LE(attr.selected_rows_size, attr.grad_height);
+    PADDLE_ENFORCE_GE(attr.selected_rows_size, 0);
+    return make_unique<SgdJitCode>(attr, CodeSize(attr));
+  }
+};
+
+}  // namespace gen
+}  // namespace jit
+}  // namespace operators
+}  // namespace paddle
+
+namespace gen = paddle::operators::jit::gen;
+
+REGISTER_JITKERNEL_GEN(kSgd, gen::SgdCreator);
diff --git a/paddle/fluid/operators/jit/gen/sgd.h b/paddle/fluid/operators/jit/gen/sgd.h
new file mode 100644
index 0000000000..317edcd2bc
--- /dev/null
+++ b/paddle/fluid/operators/jit/gen/sgd.h
@@ -0,0 +1,60 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include "glog/logging.h"
+#include "paddle/fluid/operators/jit/gen/jitcode.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace operators {
+namespace jit {
+namespace gen {
+
+class SgdJitCode : public JitCode {
+ public:
+  explicit SgdJitCode(const sgd_attr_t& attr, size_t code_size = 256 * 1024,
+                      void* code_ptr = nullptr)
+      : JitCode(code_size, code_ptr), w_(attr.grad_width) {
+    this->genCode();
+  }
+
+  DECLARE_JIT_CODE(SgdJitCode);
+  void genCode() override;
+
+ private:
+  int w_;
+  reg64_t param_lr{abi_param1};
+  reg64_t param_param{abi_param2};
+  reg64_t param_grad{abi_param3};
+  reg64_t param_rows{abi_param4};
+  reg64_t param_out{abi_param5};
+  reg64_t param_attr{abi_param6};
+
+  ymm_t ymm_lr = ymm_t(15);
+
+  reg64_t reg_ptr_grad_i{r10};
+  reg64_t reg_ptr_rows_i{r11};
+  reg64_t reg_rows_size_in_byte{r12};
+  reg64_t reg_row{r13};
+  reg64_t reg_ptr_param_i{r14};
+  reg64_t reg_ptr_out_i{r15};
+};
+
+}  // namespace gen
+}  // namespace jit
+}  // namespace operators
+}  // namespace paddle
diff --git a/python/paddle/fluid/tests/unittests/test_sgd_op.py b/python/paddle/fluid/tests/unittests/test_sgd_op.py
index b46e4bfb86..162e6d1938 100644
--- a/python/paddle/fluid/tests/unittests/test_sgd_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sgd_op.py
@@ -24,17 +24,28 @@ from op_test import OpTest
 
 class TestSGDOp(OpTest):
     def setUp(self):
         self.op_type = "sgd"
-        w = np.random.random((102, 105)).astype("float32")
-        g = np.random.random((102, 105)).astype("float32")
+        self.conf()
+        w = np.random.random((self.h, self.w)).astype("float32")
+        g = np.random.random((self.h, self.w)).astype("float32")
         lr = np.array([0.1]).astype("float32")
         self.inputs = {'Param': w, 'Grad': g, 'LearningRate': lr}
         self.outputs = {'ParamOut': w - lr * g}
 
+    def conf(self):
+        self.h = 102
+        self.w = 105
+
     def test_check_output(self):
         self.check_output()
 
 
+class TestSGDOpCase8X(TestSGDOp):
+    def conf(self):
+        self.h = 10
+        self.w = 64
+
+
 class TestSparseSGDOp(unittest.TestCase):
     def check_with_place(self, place):
         scope = core.Scope()
@@ -42,12 +53,12 @@ class TestSparseSGDOp(unittest.TestCase):
         # create and initialize Grad Variable
         height = 10
         rows = [0, 4, 7]
-        row_numel = 12
+        self.conf()
 
         grad_selected_rows = scope.var('Grad').get_selected_rows()
         grad_selected_rows.set_height(height)
         grad_selected_rows.set_rows(rows)
-        np_array = np.ones((len(rows), row_numel)).astype("float32")
+        np_array = np.ones((len(rows), self.row_numel)).astype("float32")
         np_array[0, 0] = 2.0
         np_array[2, 8] = 4.0
 
@@ -56,7 +67,7 @@
 
         # create and initialize Param Variable
         param = scope.var('Param').get_tensor()
-        param_array = np.full((height, row_numel), 5.0).astype("float32")
+        param_array = np.full((height, self.row_numel), 5.0).astype("float32")
         param.set(param_array, place)
 
         # create and initialize LeraningRate Variable
@@ -98,6 +109,14 @@ class TestSparseSGDOp(unittest.TestCase):
         for place in places:
             self.check_with_place(place)
 
+    def conf(self):
+        self.row_numel = 12
+
+
+class TestSparseSGDOpCase8X(TestSparseSGDOp):
+    def conf(self):
+        self.row_numel = 16
+
 
 class TestSGDOpOptimizeSelectedRows(unittest.TestCase):
     def check_with_place(self, place):
-- 
GitLab
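
For reference, the kernel above JIT-compiles the same selected-rows SGD rule that the sparse tests check: for each selected row index r = rows[i], out[r] = param[r] - lr * grad[i], while rows not listed in rows are left unchanged. Below is a minimal NumPy sketch of that rule; the helper name sgd_ref and the sample shapes are illustrative assumptions, not code from the patch.

import numpy as np


def sgd_ref(param, grad, rows, lr):
    # param/out are dense (height, width); grad holds one row per entry in rows.
    out = param.copy()
    for i, r in enumerate(rows):
        out[r] = param[r] - lr * grad[i]
    return out


# Shapes mirroring TestSparseSGDOpCase8X: height 10, row width 16, rows [0, 4, 7].
param = np.full((10, 16), 5.0).astype("float32")
grad = np.ones((3, 16)).astype("float32")
out = sgd_ref(param, grad, rows=[0, 4, 7], lr=np.float32(0.1))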