// Copyright (c) 2021 CINN Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #pragma once #include #include #include #include "paddle/cinn/common/target.h" #include "paddle/cinn/hlir/framework/node.h" #include "paddle/cinn/hlir/pe/schedule_param.pb.h" #include "paddle/cinn/ir/ir.h" #include "paddle/cinn/lang/compute.h" #include "paddle/cinn/poly/stage.h" namespace cinn { namespace hlir { namespace pe { class ScheduleParam { public: ~ScheduleParam(); ScheduleParam(const ScheduleParam &) = delete; ScheduleParam &operator=(const ScheduleParam &) = delete; static ScheduleParam &get_cuda_instance() { static ScheduleParam instance{common::Target::Arch::NVGPU}; return instance; } static ScheduleParam &get_x86_instance() { static ScheduleParam instance{common::Target::Arch::X86}; return instance; } absl::flat_hash_map>> &GetParam() { return param_data; } absl::flat_hash_map> &operator[]( const std::string &key) { return param_data[key]; } int Count(const std::string &key) { return param_data.count(key); } private: ScheduleParam(common::Target::Arch arch); absl::flat_hash_map>> param_data; }; int GetInnerSplitter(int origin, int other_axis); int GetVectorizeFactor(int shape, int split_factor); int SplitEven(int origin); int GetBasicFactor(const Type &type, const common::Target &target); int GetBetterSplitFactor(int shape, int split_factor); int GetArrayPackingFactor(int shape, const Type &type, const common::Target &target); void ScheduleInjectiveCPU(poly::Stage *stage, const std::vector &output_shape, const common::Target &target, bool vectorizable = true); // to deprecate void ScheduleInjectiveCPU1(poly::Stage *stage, const std::vector &output_shape, const common::Target &target, bool vectorizable = true); void MatmulScheduleCUDA(poly::StageMap stages, const ir::Tensor &output, const common::Target &target); void MatmulScheduleCPU(poly::StageMap stage, const ir::Tensor &output, const ir::Tensor &packedB, const common::Target &target); void MulScheduleCPU(poly::StageMap stage, const ir::Tensor &output, const ir::Tensor &input_tensor, const common::Target &target); void SoftmaxScheduleCPU(poly::StageMap stage, const ir::Tensor &output, const ir::Tensor &temp, int axis = -1); void GetConv2dFactors(absl::flat_hash_map *factors, int oc, int ic, int fc, int oh, int ow, const Type &type, const common::Target &target, const std::string &key = "", bool import_params = true); void GetConv2d1x1Factors(absl::flat_hash_map *factors, int oc, int ic, int oh, int ow, const Type &type, const common::Target &target); void Conv2d_NCHWc_Schedule_CPU(poly::StageMap stages, const ir::Tensor &res, ir::Tensor &packed_out, const ir::Tensor &input_pad, const ir::Tensor &weights_dilation, const ir::Tensor &data, const common::Target &target, const std::string &key, bool do_padding); void GlobalPoolScheduleGPU(poly::StageMap stages, const std::vector &output, const common::Target &target); void PoolScheduleCPU(poly::StageMap stages, const ir::Tensor &output, const common::Target &target); void PoolScheduleGPU(poly::StageMap stages, ir::Tensor &output, const common::Target &target); void Conv2d_NCHWc_Schedule_CPU_Nofuse(poly::StageMap stages, const ir::Tensor &res, ir::Tensor &packed_out, const ir::Tensor &input_pad, const ir::Tensor &weights_dilation, const ir::Tensor &data, const common::Target &target); void Conv2d_NCHWc_1X1_Schedule_CPU(poly::StageMap stages, const ir::Tensor &res, ir::Tensor &packed_out, const ir::Tensor &input_pad, const ir::Tensor &weights_dilation, const ir::Tensor &data, const common::Target &target, const std::string &key, bool do_padding); void Conv2d_NCHWc_1X1_Schedule_CPU_Nofuse(poly::StageMap stages, const ir::Tensor &res, ir::Tensor &packed_out, const ir::Tensor &input_pad, const ir::Tensor &weights_dilation, const ir::Tensor &data, const common::Target &target); void Depthwise_Conv2d_NCHWc_Schedule_CPU_Nofuse( poly::StageMap stages, const ir::Tensor &res, ir::Tensor &packed_out, const ir::Tensor &input_pad, const ir::Tensor &weights_dilation, const ir::Tensor &data, const common::Target &target, bool do_padding); void CudaScheduleMul(poly::StageMap stages, ir::Tensor output, const std::vector &output_shape, const common::Target &target); // reduce shedules. void CudaReduceSchedule(poly::StageMap stages, ir::Tensor output, int last_dimension_num, const common::Target &target); void CudaWarpReduceSchedule(poly::StageMap stages, ir::Tensor tmp_out, ir::Tensor out, const common::Target &target); void CudaBlockReduceInternalSchedule(poly::StageMap stages, ir::Tensor tmp_out, ir::Tensor out, const common::Target &target); void CudaBlockReduceSchedule(poly::StageMap stages, ir::Tensor reduce_tmp_out, ir::Tensor tmp_out, ir::Tensor out, const common::Target &target); void CudaBlockShuffleReduceSchedule(poly::StageMap stages, ir::Tensor reduce_reshape, ir::Tensor reduce_internal, ir::Tensor reduce_out, const common::Target &target); void CudaTwoStepReduceSchedule(poly::StageMap stages, ir::Tensor reshape, ir::Tensor internal, ir::Tensor tmp_out, ir::Tensor out, const common::Target &target); void CudaScheduleDepthwiseConv(poly::StageMap stages, ir::Tensor &output, const common::Target &target); void CudaScheduleConv(poly::StageMap stages, ir::Tensor &input_pad, ir::Tensor &weights, ir::Tensor &output, const common::Target &target); void CudaScheduleWinogradConv(poly::StageMap wino_stages, std::vector &all_tensors, const common::Target &target); void CudaScheduleConv2(poly::StageMap stages, ir::Tensor &input_pad, ir::Tensor &weights, ir::Tensor &output, const common::Target &target, const std::string &key); void CudaScheduleInjective(poly::Stage *stage, const std::vector &output_shape, const common::Target &target); void CudaSplitSchedule(common::CINNValuePack *arg_pack, const std::vector> &output_shapes, int axis, const common::Target &target); void CreateCudaSerialData(const std::string &file_name = "default_serial.log"); std::string GenerateX86ConvKey(const std::vector &input_shape, const std::vector &weight_shape, const std::vector &strides, const std::vector &paddings, const std::vector &dilations, const int &index = 0, const std::string &model_name = ""); std::string GenerateX86ConvKey(const std::vector &input_shape, const std::vector &weight_shape, const std::vector &strides, const std::vector &paddings, const std::vector &dilations, const int &index = 0, const std::string &model_name = ""); void CreateX86SerialData(const std::string &file_name = "default_serial.log"); void LoadSerialData( absl::flat_hash_map>> *params, const std::string &file_name = "default_serial.log"); void SaveSerialData( const absl::flat_hash_map< std::string, absl::flat_hash_map>> &model_data, const std::string &file_name = "default_serial.log"); int GetMaxSplitter(int a, int b); absl::flat_hash_map>> CreateCudaParams(); } // namespace pe } // namespace hlir } // namespace cinn