// Copyright (c) 2021 CINN Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #pragma once #include #include #include #include "paddle/cinn/ir/ir.h" #include "paddle/cinn/ir/lowered_func.h" #include "paddle/cinn/poly/isl_utils.h" #include "paddle/cinn/poly/stage.h" namespace cinn { namespace optim { void OptimizeExprGPU(Expr* expr); /* // replace 'for' loop to gpu 'block/thread' // update buffer index to save memory size. // re-compute buffer size. */ /** * Remove the forloops of block and thread axis, add the kernel launch thread dimension information to the outermost * LoweredFunc. * * For example, input the code: * \code * // Note here, the outermost expression should be a LoweredFunc * _LoweredFunc_: * for (blockIdx.x, 0, 10) * for (threadIdx.x, 0, 20) * A(blockIdx.x, threadIdx.x) * \endcode * * will be modified to * \code * _LoweredFunc_: * A(blockIdx.x, threadIdx.x) * \endcode * * \note For that the dimensions of each threadIdx or blockIdx should be constant, so this only takes For nodes, not * \note PolyFor nodes is allowed to be GPU related. */ void RemoveGpuForloopsAxis(Expr* expr); /** * Add __syncthreads() to shared memory producer. */ void CudaSyncThreadsDropIfThenElse(Expr* expr); } // namespace optim } // namespace cinn