// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <memory>
#include <string>
#include <unordered_map>
20
#include <unordered_set>
21 22 23
#include "cinn/hlir/framework/graph_compiler.h"
#include "cinn/hlir/framework/scope.h"
#include "cinn/runtime/cinn_runtime.h"
24
#include "cinn/runtime/flags.h"
25
#include "paddle/fluid/framework/data_type.h"
26 27 28 29 30 31 32
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h"

namespace paddle {
namespace operators {

33 34 35
// Name of the operator input slot; the kernel uses it to query the input
// variable names and input tensors from the execution context.
constexpr char kX[] = "X";
// Name of the operator output slot; the kernel uses it to query the output
// variable names from the execution context.
constexpr char kOutputs[] = "Out";
// Name of the string attribute whose value is handed to CinnCompiler as the
// key of the graph to compile.
constexpr char kCompilationKey[] = "compilation_key";
36 37

// Short aliases for the Paddle and CINN types used throughout this header.
// Paddle-side tensor type.
using LoDTensor = framework::LoDTensor;
// CINN-side tensor and variable scope types.
using CinnTensor = ::cinn::hlir::framework::Tensor;
using CinnScope = ::cinn::hlir::framework::Scope;
// Paddle-to-CINN compiler and the compilation result it produces.
using CinnCompiler = framework::paddle2cinn::CinnCompiler;
using CinnCompiledObject = framework::paddle2cinn::CinnCompiledObject;

namespace details {

45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64
// Helper that prepares the execution arguments of one launch of a CINN
// compiled program: it maps variable names to cinn_pod_value_t arguments and
// keeps the cinn_buffer_t objects those arguments refer to.
class CinnLaunchContext {
 public:
  explicit CinnLaunchContext(const CinnCompiledObject& compiled_obj);

  // Return whether a Paddle variable is used by the compiled kernels
  bool IsVariableUsed(const std::string& var_name);

  // Allocate buffer to a Paddle tensor with assignment information from CINN
  void MutableTensorData(const std::string& var_name,
                         const platform::Place& place, LoDTensor* paddle_tensor,
                         bool is_internal_var = false);

  // Assign tensor buffer to input or output variables
  void AssignExternalVariable(const std::string& var_name, LoDTensor* tensor);

  // Assign tensor buffer to internal variables
  void AssignInternalVariable(const std::string& var_name, LoDTensor* tensor);

  // Extract internal variable names from CinnScope
  // by excluding used input and output variables
  std::unordered_set<std::string> GetInternalVariableNames();

  // Finalize all execution arguments and return them
  const std::map<std::string, cinn_pod_value_t>& FinalizeArguments() const;

  // Transfer ownership of the held cinn_buffer_t objects to the caller.
  // The internal storage is moved from, so this context no longer keeps
  // the buffers alive afterwards.
  std::vector<std::unique_ptr<cinn_buffer_t>> HandoverBuffers() {
    return std::move(hold_buffers_);
  }

 private:
  // Get CinnTensor with CINN variable name
  CinnTensor GetCinnTensor(const std::string& var_name);

  // Check whether tensors from Paddle and CINN of the same variable
  // are equivalent in type and dimension
  void CheckTensorEquivalent(const std::string& var_name,
                             const LoDTensor& paddle_tensor,
                             const CinnTensor& cinn_tensor);

  // Share the buffer of a Paddle tensor to CINN by delivering memory address
  // to a cinn_buffer_t object
  std::unique_ptr<cinn_buffer_t> ShareTensorWithCinnBuffer(LoDTensor* tensor);

  // Set an argument with (cinn name)->(paddle tensor) pair
  void SetArgument(const std::string& cinn_name, LoDTensor* paddle_tensor);

 private:
  // a variable name map from paddle to cinn
  // NOTE(review): stored as a reference, so the referenced map must outlive
  // this context -- confirm against the constructor.
  const std::unordered_map<std::string, std::string>& paddle2cinn_varmap_;
  // the variable scope of cinn
  const std::shared_ptr<CinnScope> cinn_scope_;

  // all variables used by compiled executable program
  std::unordered_set<std::string> cinn_variable_names_;

  // because a cinn_pod_value_t does not own the cinn_buffer_t object,
  // an extra storage is necessary to keep the object and it can
  // not be released until the runtime program finishes execution.
  std::vector<std::unique_ptr<cinn_buffer_t>> hold_buffers_;

  // name to execution argument
  std::map<std::string, cinn_pod_value_t> name2argument_;
};

109 110 111 112 113 114
// Transform a Paddle place to the corresponding CINN target.
// The result is returned by const reference.
const ::cinn::common::Target& PlaceToCinnTarget(const platform::Place& place);

// Print the detailed compilation result of a graph for debugging.
void DebugCinnCompiledResult(const CinnCompiledObject& result);

115 116
// Launch cinn to execute compiled executable program and wait done
void LaunchCinnExecution(const CinnCompiledObject& compiled_obj,
117
                         const CinnLaunchContext& context, void* stream);
118 119 120

// Set CINN runtime FLAGS (such as FLAGS_cinn_cudnn_deterministic) from
// Paddle's FLAGS. Called by the kernel before launching execution.
void SetCinnRuntimeFlags();
121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145

template <typename DeviceContext>
void ReleaseResource(const std::vector<void*>& resources, void* stream) {
  auto* temp_scope = static_cast<framework::Scope*>(resources[0]);
  auto* buffers =
      static_cast<std::vector<std::unique_ptr<cinn_buffer_t>>*>(resources[1]);
  delete temp_scope;
  delete buffers;
}

// Generic stream getter: returns a null stream handle and does not read
// `ctx`. A CUDADeviceContext specialization is declared separately when
// PADDLE_WITH_CUDA is defined.
template <typename DeviceContext>
void* GetStream(const framework::ExecutionContext& ctx) {
  return nullptr;
}

#ifdef PADDLE_WITH_CUDA
// Explicit specializations for CUDADeviceContext. They are only declared
// here; the definitions are expected in a separate translation unit.
template <>
void ReleaseResource<platform::CUDADeviceContext>(
    const std::vector<void*>& resources, void* stream);

template <>
void* GetStream<platform::CUDADeviceContext>(
    const framework::ExecutionContext& ctx);
#endif

146
}  // namespace details
147 148 149 150 151

// Kernel of CinnLaunchOp. Compute() compiles (or fetches) the CINN program
// identified by the `compilation_key` attribute, binds Paddle tensors to the
// arguments of the compiled program, launches it, and finally hands the
// temporary scope and the cinn buffers over to ReleaseResource().
template <typename DeviceContext, typename T>
class CinnLaunchOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    const auto& scope = ctx.scope();
    const auto& place = ctx.GetPlace();
    void* stream = details::GetStream<DeviceContext>(ctx);
    // Step 1. Find graph object and prepare input
    PADDLE_ENFORCE_EQ(ctx.HasAttr(kCompilationKey), true,
                      platform::errors::NotFound(
                          "No Attribute(%s) found for CinnLaunchOp operator.",
                          kCompilationKey));
    const auto& compilation_key =
        ctx.template Attr<std::string>(kCompilationKey);
    VLOG(4) << "CinnLaunchOp attribute(" << kCompilationKey << ") "
            << "value:\n"
            << CinnCompiler::GetInstance()->ReadableKey(compilation_key);

    // Bind by const reference: valid whether the names are returned by
    // value or by reference, and avoids a needless copy in the latter case.
    const auto& input_variable_names = ctx.InputNames(kX);
    const auto& input_tensors = ctx.MultiInput<LoDTensor>(kX);
    // Pair every input name with its tensor (both sequences are walked in
    // lock-step, so they are expected to have the same length).
    std::map<std::string, const LoDTensor*> inputs_name2tensor;
    std::transform(input_variable_names.begin(), input_variable_names.end(),
                   input_tensors.begin(),
                   std::inserter(inputs_name2tensor, inputs_name2tensor.end()),
                   [](const std::string& name, const LoDTensor* tensor) {
                     return std::make_pair(name, tensor);
                   });

    // Step 2. Get compilation result of the graph
    // PlaceToCinnTarget returns a const reference, so do not copy the target.
    const auto& target = details::PlaceToCinnTarget(place);
    const auto& cinn_compiled_object = CinnCompiler::GetInstance()->Compile(
        compilation_key, inputs_name2tensor, target, stream);
    details::DebugCinnCompiledResult(cinn_compiled_object);

    auto launch_context =
        std::make_unique<details::CinnLaunchContext>(cinn_compiled_object);

    // Step 3. Prepare arguments needed for the compiled executable program.
    VLOG(4) << "CinnLaunchOp prepare arguments";

    // 3.1 Prepare input variables: tensors of input variables have
    //     been initialized before graph compiled, just check the
    //     equality between tensors of paddle and cinn.
    for (const auto& var_name : input_variable_names) {
      if (!launch_context->IsVariableUsed(var_name)) {
        // some input variables are not needed by cinn because they are
        // eliminated by optimization passes or some cinn operators use
        // fewer variables
        VLOG(4) << "Input variable(" << var_name << ") not used by cinn";
        continue;
      }

      launch_context->AssignExternalVariable(
          var_name, scope.GetVar(var_name)->GetMutable<LoDTensor>());
    }

    // 3.2 Prepare output variables: all output variables should
    //     be initialized and allocated buffer before
    //     the runtime program start execution, the compilation result
    //     includes details of their buffer assignment and we use that to
    //     allocate space in Paddle. For those variables already allocated,
    //     like persistable parameters, just check the equality between
    //     Paddle allocation and CINN buffer assignment.
    const auto& output_variable_names = ctx.OutputNames(kOutputs);
    for (const auto& var_name : output_variable_names) {
      PADDLE_ENFORCE_EQ(launch_context->IsVariableUsed(var_name), true,
                        platform::errors::InvalidArgument(
                            "Output variable(%s) not used by cinn", var_name));

      auto* tensor = scope.GetVar(var_name)->GetMutable<LoDTensor>();
      if (!tensor->IsInitialized()) {
        launch_context->MutableTensorData(var_name, place, tensor);
      }
      launch_context->AssignExternalVariable(var_name, tensor);
    }

    // 3.3 Prepare internal or temporary variables: Create a temporary
    //     scope to keep internal variables within graph or temporary
    //     variables needed by the compiled runtime program in addition.
    //     Here we directly use the names from CinnScope as Paddle variable
    //     names, because they will not be used outside the graph
    //     and should be destructed after computation finished.
    auto internal_variable_names = launch_context->GetInternalVariableNames();
    // Keep the temporary scope in its owning smart pointer until it is
    // handed over in Step 6, so it is not leaked if any call below throws.
    auto temp_scope = scope.NewTmpScope();
    for (const auto& var_name : internal_variable_names) {
      auto* tensor = temp_scope->Var(var_name)->GetMutable<LoDTensor>();
      launch_context->MutableTensorData(var_name, place, tensor, true);
      launch_context->AssignInternalVariable(var_name, tensor);
    }

    // Step 4. Set CINN runtime FLAGS, such as FLAGS_cinn_cudnn_deterministic.
    details::SetCinnRuntimeFlags();

    // Step 5. Launch CINN to execute the compiled executable program
    VLOG(4) << "Run Cinn compiled executable program with stream: " << stream;
    details::LaunchCinnExecution(cinn_compiled_object, *launch_context, stream);
    VLOG(4) << "CinnLaunchOp launch execution done.";

    // Step 6. Release some resources, such as `temp_scope` and cinn_buffers.
    //         Ownership of both objects passes to ReleaseResource, which
    //         receives them as {scope, buffers} in that order.
    auto buffers_holder =
        std::make_unique<std::vector<std::unique_ptr<cinn_buffer_t>>>(
            launch_context->HandoverBuffers());
    details::ReleaseResource<DeviceContext>(
        {temp_scope.release(), buffers_holder.release()}, stream);
  }
};

}  // namespace operators
}  // namespace paddle