// Copyright (c) 2021 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/cinn/optim/vectorize_loops.h"

#include <absl/container/flat_hash_map.h>
#include <glog/logging.h>

#include <algorithm>
#include <map>
#include <string>
#include <unordered_map>
#include <vector>

#include "paddle/cinn/common/cas.h"
#include "paddle/cinn/common/ir_util.h"
#include "paddle/cinn/ir/collect_ir_nodes.h"
#include "paddle/cinn/ir/ir_operators.h"
#include "paddle/cinn/ir/ir_printer.h"
#include "paddle/cinn/optim/ir_copy.h"
#include "paddle/cinn/optim/ir_replace.h"
#include "paddle/cinn/optim/ir_simplify.h"
#include "paddle/cinn/optim/tensor_write_tell.h"
#include "paddle/cinn/optim/unroll_loops.h"
#include "paddle/cinn/utils/functional.h"

namespace cinn {
namespace optim {
using namespace ir;  // NOLINT
using common::make_const;
using common::make_one;
using common::make_zero;

//! Widen an expression to the given number of lanes.
Expr Widen(Expr e, int lanes) {
  if (e.type().lanes() == lanes) return e;
  if (const ir::Broadcast *op = e.As<ir::Broadcast>()) {
    if (lanes % op->lanes == 0) {
      return ir::Broadcast::Make(op->value, lanes);
    }
  }

  CHECK_EQ(e.type().lanes(), 1)
      << "Cannot broadcast lanes from " << e.type().lanes() << " to " << lanes;
  return ir::Broadcast::Make(e, lanes);
}

// Tell whether a tensor can be vectorized or not on CUDA by collecting the
// names of tensors that meet all check predicates of vectorizing.
class TensorVectorizeTeller : public ir::IRMutator<const Expr *> {
 public:
  TensorVectorizeTeller(
      const Var &iter_var,
      const int factor,
      const absl::flat_hash_map<std::string, common::CasInterval>
          *var_intervals)
      : iter_var_(iter_var), factor_(factor), var_intervals_(var_intervals) {}

  void Collect(const Expr *op) { IRMutator::Visit(op, op); }

  // return true if the input tensor can be vectorized
  bool CanBeVectorized(const std::string &tensor_name) const {
    auto it = tensor2flag_.find(tensor_name);
    return it != tensor2flag_.end() && it->second;
  }

 private:
  const Var iter_var_;  // loop var of the new for-loop split from the
                        // vectorized loop
  const int factor_;
  const absl::flat_hash_map<std::string, common::CasInterval> *var_intervals_;
  // save (tensor name) -> (bool flag) to identify whether tensors can be
  // vectorized or not
  std::unordered_map<std::string, bool> tensor2flag_;

  void Visit(const ir::Store *expr, const Expr *op) override {
    auto *node = op->As<ir::Store>();
    CHECK(node);
    IRMutator::Visit(&node->value, &node->value);
    auto *tensor = node->tensor.As<ir::_Tensor_>();
    CHECK(tensor);

    // a tensor should pass all pre-condition checks every time it appears
    if (!tensor2flag_.count(tensor->name) || tensor2flag_.at(tensor->name)) {
      bool flag = MeetConditions(node->tensor, node->indices);
      tensor2flag_[tensor->name] = flag;
    }
  }

  void Visit(const ir::Load *expr, const Expr *op) override {
    auto *node = op->As<ir::Load>();
    CHECK(node);
    auto *tensor = node->tensor.As<ir::_Tensor_>();
    CHECK(tensor);

    // a tensor should pass all pre-condition checks every time it appears
    if (!tensor2flag_.count(tensor->name) || tensor2flag_.at(tensor->name)) {
      bool flag = MeetConditions(node->tensor, node->indices);
      tensor2flag_[tensor->name] = flag;
    }
  }
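  // Illustrative note, not from the original source: with iter_var_ = k and
  // factor_ = 4, an access such as A[i, j * 4 + k] on a tensor whose last
  // dimension is a multiple of 4 passes every check in MeetConditions below,
  // whereas A[k, j] (iter var outside the last index) or A[i, 2 * k]
  // (stride-2, not sequential) is rejected.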
  // return true if the tensor meets all conditions of vectorizing
  bool MeetConditions(const Expr &expr, const std::vector<Expr> &indices) {
    const ir::_Tensor_ *tensor = expr.As<ir::_Tensor_>();
    auto find_matched_var_fn = [&](const Expr *x) {
      return x->As<_Var_>() && x->As<_Var_>()->name == iter_var_->name;
    };

    // the size of the last dim should be divisible by factor
    if (tensor->shape.empty() || !tensor->shape.back().As<IntImm>() ||
        tensor->shape.back().as_int32() % factor_ != 0) {
      VLOG(5) << "Size of the last dim of tensor:" << tensor->name
              << " isn't divisible by factor:" << factor_
              << ", shape:" << utils::Join(tensor->shape, ",");
      return false;
    }

    // the iter var must appear in the last index
    if (indices.empty() ||
        ir::CollectIRNodes(indices.back(), find_matched_var_fn).empty()) {
      VLOG(5) << "Loop var:" << iter_var_->name
              << " is not used in the last index";
      return false;
    }

    // the iter var can't appear in multiple indices
    for (int i = 0; i < indices.size() - 1; ++i) {
      auto repeat_found = ir::CollectIRNodes(indices[i], find_matched_var_fn);
      if (!repeat_found.empty()) {
        VLOG(5) << "Loop var:" << iter_var_->name
                << " is used in more than the last index, current:" << i;
        return false;
      }
    }

    // check that the tensor is accessed sequentially by comparing the last
    // index at consecutive iterations one by one
    Expr first_idx = optim::IRCopy(indices.back());
    optim::IrReplace(&first_idx, Expr(iter_var_), Expr(0));
    const auto &interval = var_intervals_->at(iter_var_->name);
    for (int i = 1; i < interval.r; ++i) {
      Expr next_idx = optim::IRCopy(indices.back());
      optim::IrReplace(&next_idx, Expr(iter_var_), Expr(i));
      auto gap = common::AutoSimplify(Expr(next_idx - first_idx));
      if (!gap.As<IntImm>() || gap.as_int32() != i) {
        VLOG(5) << "Tensor:" << tensor->name
                << " is not accessed sequentially, next:" << next_idx
                << ", first:" << first_idx << ", gap:" << gap;
        return false;
      }
      VLOG(5) << "Tensor:" << tensor->name
              << " is accessed sequentially, next:" << next_idx
              << ", first:" << first_idx << ", gap:" << gap;
    }

    auto dtype = expr->type().ElementOf();
    bool type_supported = dtype.is_float(32) || dtype.is_int(32) ||
                          dtype.is_uint(32) || dtype.is_float16() ||
                          dtype.is_bfloat16();
    if (!type_supported) {
      VLOG(5) << "Only support vectorizing int,uint,float,float16,bfloat16, "
                 "but got "
              << dtype;
      return false;
    }
    return true;
  }
};
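// Illustrative sketch of the rewrite performed below, assuming a float32
// tensor and factor = 4 (identifiers follow the "vectorized_" naming used in
// AppendCast; the exact emitted code may differ):
//   before: for (k, 0, 4) { C[k] = A[k] + B[k]; }
//   after:  const float4 vectorized_A = ((const float4 *)(&A[0]))[0];
//           const float4 vectorized_B = ((const float4 *)(&B[0]))[0];
//           float4 *vectorized_C_ptr = (float4 *)(&C[0]);
//           float4 vectorized_C = 0;
//           for (k, 0, 4) {
//             vectorized_C[k] = vectorized_A[k] + vectorized_B[k];
//           }
//           vectorized_C_ptr[0] = vectorized_C;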
// Find tensors accessed sequentially in a for-loop to be vectorized,
// and substitute the corresponding CUDA built-in vector for them.
class CudaVectorizer : public IRMutator<Expr *> {
  const Var iter_var_;  // the loop var of the vectorized loop
  const int factor_;    // the factor for vectorizing

  TensorWriteTeller write_teller_;
  TensorVectorizeTeller vectorized_teller_;

  absl::flat_hash_map<std::string, Var> tensor2vectorized_vars_;
  std::vector<Expr> vectorized_cast_exprs_;
  std::vector<Expr> vectorized_store_exprs_;

 public:
  static constexpr int CudaVectorTypeMaxLanes = 8;

  CudaVectorizer(const Var &iter_var,
                 const int factor,
                 const absl::flat_hash_map<std::string, common::CasInterval>
                     *var_intervals)
      : iter_var_(iter_var),
        factor_(factor),
        vectorized_teller_(iter_var, factor, var_intervals) {
    CHECK(factor <= CudaVectorTypeMaxLanes)
        << "The maximum lanes of valid CUDA vector types: "
        << CudaVectorTypeMaxLanes << ", but factor: " << factor;
  }

  // return all cast statements collected through vectorizing
  std::vector<Expr> VectorizedTypeCastExprs() { return vectorized_cast_exprs_; }

  // return all store statements collected through vectorizing
  std::vector<Expr> VectorizedTypeStoreExprs() {
    return vectorized_store_exprs_;
  }

  void Visit(Expr *expr) {
    write_teller_.Collect(expr);
    vectorized_teller_.Collect(expr);
    IRMutator<Expr *>::Visit(expr, expr);
  }

  void Visit(const Load *op, Expr *expr) override {
    auto *node = expr->As<Load>();
    auto *tensor = node->tensor.As<ir::_Tensor_>();
    if (node->is_addr_tensor() &&
        vectorized_teller_.CanBeVectorized(tensor->name)) {
      TensorVectorized(node, &node->indices, false);
    }
  }

  void Visit(const Store *op, Expr *expr) override {
    auto *node = expr->As<Store>();
    auto *tensor = node->tensor.As<ir::_Tensor_>();
    CHECK(tensor);
    if (vectorized_teller_.CanBeVectorized(tensor->name)) {
      TensorVectorized(node, &node->indices, true);
    }
    IRMutator::Visit(&node->value, &node->value);
  }

 private:
  void TensorVectorized(ir::LoadStoreAddrMnger *node,
                        std::vector<Expr> *indices,
                        bool is_store) {
    auto *tensor = node->tensor.As<ir::_Tensor_>();
    VLOG(5) << "Vectorizing tensor:" << tensor->name;

    // save the tensor and its corresponding vector name when it first appears
    if (!tensor2vectorized_vars_.count(tensor->name)) {
      AppendCast(node->tensor, *indices, is_store);
    }

    auto vectorized_var = tensor2vectorized_vars_.at(tensor->name);
    // substitute a new tensor with the vector name and dtype
    auto t = vectorized_var->type().is_cpp_handle()
                 ? node->tensor->type().PointerOf()
                 : node->tensor->type();
    node->tensor = ir::Tensor(vectorized_var->name,
                              t,
                              {Expr(factor_)},
                              {Expr(factor_)},
                              tensor->operation);
    // keep only the last iterative index
    indices->assign({iter_var_});
  }

  std::string GetVectorTypeName(Type type) {
    std::string name_prefix = common::customized_type::kcuda_builtin_vector_t;
#define GET_CUDA_VECTOR_TYPE_NAME(pred_expr, scalar_name)       \
  if (pred_expr) {                                              \
    return name_prefix + scalar_name + std::to_string(factor_); \
  }

    GET_CUDA_VECTOR_TYPE_NAME(type.is_int(32), "int");
    GET_CUDA_VECTOR_TYPE_NAME(type.is_uint(32), "uint");
    GET_CUDA_VECTOR_TYPE_NAME(type.is_float(32), "float");
    GET_CUDA_VECTOR_TYPE_NAME(type.is_float16(), "half");
    GET_CUDA_VECTOR_TYPE_NAME(type.is_bfloat16(), "bfloat16");
#undef GET_CUDA_VECTOR_TYPE_NAME

    // others are not implemented yet
    CINN_NOT_IMPLEMENTED
    return "";
  }
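  // For example (illustrative): with factor_ == 4, a float32 scalar type maps
  // to the builtin vector type name suffixed "float4" and float16 to "half4";
  // unsupported types such as float64 fall through to CINN_NOT_IMPLEMENTED.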
<< "Append a vectorized expr:" << let1; VLOG(5) << "Append a vectorized expr:" << let2; auto t = ir::Tensor(vectorized_ptr->name, node->type().PointerOf(), {Expr(factor_)}, {Expr(factor_)}, node->operation); auto store = Store::Make(t, vectorized_var, {make_const(0)}); vectorized_store_exprs_.emplace_back(store); VLOG(5) << "Append a vectorized expr:" << store; } } }; //! Substitutes a vector for a scalar var in a Stmt. class Vectorizer : public IRMutator { //! The name of the variable to be vectorized. Var var; int lanes_{-1}; bool need_scalarize_{false}; bool to_vectorize_{false}; Expr ramp_; absl::flat_hash_map var_intervals_; //! A suffix to attach to widened variables. std::string widen_suffix; public: Vectorizer(const Var &var, int lanes, const absl::flat_hash_map &var_intervals = {}) : var(var), lanes_(lanes), var_intervals_(var_intervals) { // the identity ramp. ramp_ = Ramp::Make(make_zero(), make_one(), lanes_); } void Visit(Expr *expr) { CHECK(!need_scalarize_); IRMutator::Visit(expr, expr); if (need_scalarize_) { need_scalarize_ = false; Scalarize(expr); } } void Visit(const Cast *op, Expr *expr) override { auto *node = expr->As(); auto v0 = node->v(); Visit(&node->v()); if (v0.same_as(node->v())) return; Type t = op->type().with_lanes(node->v().type().lanes()); node->set_type(t); } void Visit(const _Var_ *op, Expr *expr) override { if (op->name == var->name) { *expr = Expr(ramp_); } } void Visit(const Add *op, Expr *expr) override { MutateAddSubOperator(op, expr); } void Visit(const Sub *op, Expr *expr) override { MutateAddSubOperator(op, expr); } void Visit(const Mul *op, Expr *expr) override { MutateMulDivOperator(op, expr); } void Visit(const Div *op, Expr *expr) override { MutateMulDivOperator(op, expr); } void Visit(const Mod *op, Expr *expr) override { MutateMulDivOperator(op, expr); } void Visit(const Min *op, Expr *expr) override { BinaryOperatorVec(op, expr); } void Visit(const Max *op, Expr *expr) override { BinaryOperatorVec(op, expr); } void Visit(const EQ *op, Expr *expr) override { BinaryOperatorVec(op, expr); } void Visit(const NE *op, Expr *expr) override { BinaryOperatorVec(op, expr); } void Visit(const LT *op, Expr *expr) override { BinaryOperatorVec(op, expr); } void Visit(const LE *op, Expr *expr) override { BinaryOperatorVec(op, expr); } void Visit(const GT *op, Expr *expr) override { BinaryOperatorVec(op, expr); } void Visit(const GE *op, Expr *expr) override { BinaryOperatorVec(op, expr); } void Visit(const And *op, Expr *expr) override { BinaryOperatorVec(op, expr); } void Visit(const Or *op, Expr *expr) override { BinaryOperatorVec(op, expr); } void Visit(const Ramp *op, Expr *expr) override {} void Visit(const Select *op, Expr *expr) override { auto *node = expr->As