// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <cmath>
#include <vector>

#include "paddle/cinn/common/target.h"
#include "paddle/cinn/ir/ir_schedule.h"

namespace cinn {
namespace auto_schedule {

/* Loop feature enums */
enum class ForOptimizeFeatureEnum : int {
  kNone,
  kGpuBind,
  kParallel,
  kUnroll,
  kVectorize
};

/* Function to scale feature numbers */
inline float slog(float x) {
  return x < 0 ? std::log2(-x + 1) : std::log2(x + 1);
}

class LoopBlockFeature {
 public:
  // TODO(zhhsplendid): distinguish more types such as float16, float32,
  // float64, etc. However, the speed gap between float and int is larger
  // than the gap between different bit widths, so we only distinguish int
  // and float here.

  /* Arithmetic features */
  int float_add_or_sub = 0;
  int float_mul = 0;
  int float_div_or_mod = 0;
  int float_cmp = 0;
  int float_math_func = 0;
  int float_other_call = 0;  // like simple assign, cast, etc.

  int int_add_or_sub = 0;
  int int_mul = 0;
  int int_div_or_mod = 0;
  int int_cmp = 0;
  int int_math_func = 0;
  int int_other_call = 0;  // like simple assign, cast, etc.

  int bool_op = 0;
  int select_op = 0;

  static constexpr int kArithSize = 6 * 2 + 2;

  /**
   * Buffer memory features, which are the numbers of memory operations.
   * Note that memory operations of different sizes can have different
   * speeds, but the speed difference would be small at the OS level. A
   * meticulous TODO would be to collect operand sizes (like alloc size,
   * write size, and so on).
   */
  int mem_alloc = 0;
  int mem_free = 0;
  int mem_read = 0;
  int mem_write = 0;

  static constexpr int kMemSize = 4;

  /**
   * Reduce and Broadcast features
   */
  int float_reduce_sum_or_sub = 0;
  int float_reduce_mul = 0;
  int float_reduce_div = 0;
  int float_reduce_max_or_min = 0;
  int float_broadcast = 0;

  int int_reduce_sum_or_sub = 0;
  int int_reduce_mul = 0;
  int int_reduce_div = 0;
  int int_reduce_max_or_min = 0;
  int int_broadcast = 0;

  static constexpr int kReduceBroadcastSize = 10;

  /* Loop type features */
  // A TODO: maybe add a loop position (Inner, Outer, Middle) feature
  ForOptimizeFeatureEnum loop_opt_type = ForOptimizeFeatureEnum::kNone;

  static constexpr int kOptApplySize = 5;

  /* Thread features if the loop is optimized by GPU or CPU parallelism.
   * Useless in other cases. */
  int len_blockIdx_x = 0;
  int len_blockIdx_y = 0;
  int len_blockIdx_z = 0;
  int len_threadIdx_x = 0;
  int len_threadIdx_y = 0;
  int len_threadIdx_z = 0;
  int len_vthread = 0;  // length of virtual thread
  int vectorize_factor = 0;

  static constexpr int kThreadFeatureSize = 8;

  static constexpr int kTotalSize = kArithSize + kMemSize +
                                    kReduceBroadcastSize + kOptApplySize +
                                    kThreadFeatureSize;

  /* Non-feature attributes, maintained during feature extraction */

  // Number of sub loop blocks directly nested inside the current one
  int num_sub_loops = 0;

  // Number of repeats of this loop, -1 represents unknown
  int loop_length = 1;
};
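
// Illustrative sketch (not part of the original header): for a simple loop
// such as
//
//   for (int i = 0; i < 16; ++i) C[i] = A[i] * B[i] + 1.0f;
//
// a feature extractor filling a LoopBlockFeature would be expected to record
// roughly
//
//   LoopBlockFeature f;
//   f.float_mul        += 1;   // A[i] * B[i]
//   f.float_add_or_sub += 1;   // ... + 1.0f
//   f.mem_read         += 2;   // loads of A[i] and B[i]
//   f.mem_write        += 1;   // store to C[i]
//   f.loop_length       = 16;  // the loop repeats 16 times
//
// The slog helper above scales such feature numbers, e.g. slog(0) = 0,
// slog(1) = 1, slog(3) = 2, and slog(-3) = 2, so large counts do not
// dominate the feature vector.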

/**
 * Feature of an Expr. It is used in the CostModel.
 */
class Feature {
 public:
  Feature();

  Feature(const common::Target& target);

  // Convert the variable-length loop block features to a fixed-size vector
  std::vector<float> ToFixedSizeVector();

  // Call when visiting a loop block to start collecting its LoopBlockFeature
  void IntoLoopBlock();
  // Call when exiting a loop block to finish collecting its LoopBlockFeature
  void ExitLoopBlock();
  // The current loop block on which we should collect features
  LoopBlockFeature& CurrentLoopBlock();
  // The current loop block on which we should collect features
  const LoopBlockFeature& CurrentLoopBlock() const;

 private:
  // We treat the computation features as a variable-length vector to encode.
  // The root compute block is not a loop, but we treat it as a size-1 loop.
  // Blocks are encoded like a stack. Each LoopBlockFeature contains
  // num_sub_loops to indicate how many next-level sub loop blocks it
  // contains.
  //
  // For example, code like:
  //
  //   some_compute_0
  //   loop1 {
  //     some_compute_1
  //     loop2 {
  //       some_compute_2
  //     }
  //   }
  //
  //   loop3 {
  //     some_compute_3
  //   }
  //
  // We go through the code and push loops onto a stack, so the features are
  // encoded as
  //   [loop_block_feature_0, loop_block_feature_1, loop_block_feature_2,
  //    loop_block_feature_3]
  // where loop_block_feature_i stores the features of some_compute_i (such
  // as the number of arithmetic operations), and
  //
  //   loop_block_feature_0.num_sub_loops = 2
  //   loop_block_feature_1.num_sub_loops = 1
  //   loop_block_feature_2.num_sub_loops = 0
  //   loop_block_feature_3.num_sub_loops = 0
  std::vector<LoopBlockFeature> stack_encoded_feature_;
  int current_loop_block_index_;
  std::vector<int> parent_indices_;

  common::Target target_;
};

}  // namespace auto_schedule
}  // namespace cinn
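
// Illustrative usage sketch (hypothetical driver; the real traversal lives in
// the feature extractor and may differ). For the nested-loop example in the
// comment above, the calls would look roughly like:
//
//   cinn::auto_schedule::Feature feature;           // root block, a size-1 loop
//   feature.CurrentLoopBlock().float_add_or_sub++;  // some_compute_0
//   feature.IntoLoopBlock();                        // enter loop1
//   feature.CurrentLoopBlock().float_mul++;         // some_compute_1
//   feature.IntoLoopBlock();                        // enter loop2
//   feature.CurrentLoopBlock().mem_write++;         // some_compute_2
//   feature.ExitLoopBlock();                        // leave loop2
//   feature.ExitLoopBlock();                        // leave loop1
//   feature.IntoLoopBlock();                        // enter loop3
//   feature.CurrentLoopBlock().mem_read++;          // some_compute_3
//   feature.ExitLoopBlock();                        // leave loop3
//   std::vector<float> encoded = feature.ToFixedSizeVector();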