dlpack.h

/*!
 *  Copyright (c) 2017 by Contributors
 * \file dlpack.h
 * \brief The common header of DLPack.
 */
#ifndef DLPACK_DLPACK_H_
#define DLPACK_DLPACK_H_

/**
 * \brief Compatibility with C++
 */
#ifdef __cplusplus
#define DLPACK_EXTERN_C extern "C"
#else
#define DLPACK_EXTERN_C
#endif

/*! \brief The current version of dlpack */
#define DLPACK_VERSION 70

/*! \brief The current ABI version of dlpack */
#define DLPACK_ABI_VERSION 1

/*! \brief DLPACK_DLL prefix for windows */
#ifdef _WIN32
#ifdef DLPACK_EXPORTS
#define DLPACK_DLL __declspec(dllexport)
#else
#define DLPACK_DLL __declspec(dllimport)
#endif
#else
#define DLPACK_DLL
#endif

#include <stddef.h>
#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif
/*!
 * \brief The device type in DLDevice.
 */
#ifdef __cplusplus
typedef enum : int32_t {
#else
typedef enum {
#endif
    /*! \brief CPU device */
    kDLCPU = 1,
    /*! \brief CUDA GPU device */
    kDLCUDA = 2,
    /*!
     * \brief Pinned CUDA CPU memory by cudaMallocHost
     */
    kDLCUDAHost = 3,
    /*! \brief OpenCL devices. */
    kDLOpenCL = 4,
    /*! \brief Vulkan buffer for next generation graphics. */
    kDLVulkan = 7,
    /*! \brief Metal for Apple GPU. */
    kDLMetal = 8,
    /*! \brief Verilog simulator buffer */
    kDLVPI = 9,
    /*! \brief ROCm GPUs for AMD GPUs */
    kDLROCM = 10,
    /*!
     * \brief Pinned ROCm CPU memory allocated by hipMallocHost
     */
    kDLROCMHost = 11,
    /*!
     * \brief Reserved extension device type,
     * used for quickly test extension device
     * The semantics can differ depending on the implementation.
     */
    kDLExtDev = 12,
    /*!
     * \brief CUDA managed/unified memory allocated by cudaMallocManaged
     */
    kDLCUDAManaged = 13,
    /*!
     * \brief Unified shared memory allocated on a oneAPI non-partititioned
     * device. Call to oneAPI runtime is required to determine the device
     * type, the USM allocation type and the sycl context it is bound to.
     *
     */
    kDLOneAPI = 14,
    /*! \brief GPU support for next generation WebGPU standard. */
    kDLWebGPU = 15,
} DLDeviceType;

/*!
 * \brief A Device for Tensor and operator.
 */
// NB: This is the only difference from
// https://github.com/dmlc/dlpack/blob/v0.7/include/dlpack/dlpack.h Required to
// allow forward declaration of DLDevice.
typedef struct DLDevice_ {
    /*! \brief The device type used in the device. */
    DLDeviceType device_type;
    /*!
     * \brief The device index.
     * For vanilla CPU memory, pinned memory, or managed memory, this is set to 0.
     */
    int32_t device_id;
} DLDevice;

/*!
 * \brief The type code options DLDataType.
 */
typedef enum {
    /*! \brief signed integer */
    kDLInt = 0U,
    /*! \brief unsigned integer */
    kDLUInt = 1U,
    /*! \brief IEEE floating point */
    kDLFloat = 2U,
    /*!
     * \brief Opaque handle type, reserved for testing purposes.
     * Frameworks need to agree on the handle data type for the exchange to be
     * well-defined.
     */
    kDLOpaqueHandle = 3U,
    /*! \brief bfloat16 */
    kDLBfloat = 4U,
    /*!
     * \brief complex number
     * (C/C++/Python layout: compact struct per complex number)
     */
    kDLComplex = 5U,
} DLDataTypeCode;

/*!
 * \brief The data type the tensor can hold. The data type is assumed to follow
 * the native endian-ness. An explicit error message should be raised when
 * attempting to export an array with non-native endianness
 *
 *  Examples
 *   - float: type_code = 2, bits = 32, lanes=1
 *   - float4(vectorized 4 float): type_code = 2, bits = 32, lanes=4
 *   - int8: type_code = 0, bits = 8, lanes=1
 *   - std::complex<float>: type_code = 5, bits = 64, lanes = 1
 */
typedef struct {
    /*!
     * \brief Type code of base types.
     * We keep it uint8_t instead of DLDataTypeCode for minimal memory
     * footprint, but the value should be one of DLDataTypeCode enum values.
     * */
    uint8_t code;
    /*!
     * \brief Number of bits, common choices are 8, 16, 32.
     */
    uint8_t bits;
    /*! \brief Number of lanes in the type, used for vector types. */
    uint16_t lanes;
} DLDataType;

/*!
 * \brief Plain C Tensor object, does not manage memory.
 */
typedef struct {
    /*!
     * \brief The data pointer points to the allocated data. This will be CUDA
     * device pointer or cl_mem handle in OpenCL. It may be opaque on some device
     * types. This pointer is always aligned to 256 bytes as in CUDA. The
     * `byte_offset` field should be used to point to the beginning of the data.
     *
     * Note that as of Nov 2021, multiply libraries (CuPy, PyTorch, TensorFlow,
     * TVM, perhaps others) do not adhere to this 256 byte aligment requirement
     * on CPU/CUDA/ROCm, and always use `byte_offset=0`.  This must be fixed
     * (after which this note will be updated); at the moment it is recommended
     * to not rely on the data pointer being correctly aligned.
     *
     * For given DLTensor, the size of memory required to store the contents of
     * data is calculated as follows:
     *
     * \code{.c}
     * static inline size_t GetDataSize(const DLTensor* t) {
     *   size_t size = 1;
     *   for (tvm_index_t i = 0; i < t->ndim; ++i) {
     *     size *= t->shape[i];
     *   }
     *   size *= (t->dtype.bits * t->dtype.lanes + 7) / 8;
     *   return size;
     * }
     * \endcode
     */
    void* data;
    /*! \brief The device of the tensor */
    DLDevice device;
    /*! \brief Number of dimensions */
    int32_t ndim;
    /*! \brief The data type of the pointer*/
    DLDataType dtype;
    /*! \brief The shape of the tensor */
    int64_t* shape;
    /*!
     * \brief strides of the tensor (in number of elements, not bytes)
     *  can be NULL, indicating tensor is compact and row-majored.
     */
    int64_t* strides;
    /*! \brief The offset in bytes to the beginning pointer to data */
    uint64_t byte_offset;
} DLTensor;

/*!
 * \brief C Tensor object, manage memory of DLTensor. This data structure is
 *  intended to facilitate the borrowing of DLTensor by another framework. It is
 *  not meant to transfer the tensor. When the borrowing framework doesn't need
 *  the tensor, it should call the deleter to notify the host that the resource
 *  is no longer needed.
 */
typedef struct DLManagedTensor {
    /*! \brief DLTensor which is being memory managed */
    DLTensor dl_tensor;
    /*! \brief the context of the original host framework of DLManagedTensor in
     *   which DLManagedTensor is used in the framework. It can also be NULL.
     */
    void* manager_ctx;
    /*! \brief Destructor signature void (*)(void*) - this should be called
     *   to destruct manager_ctx which holds the DLManagedTensor. It can be NULL
     *   if there is no way for the caller to provide a reasonable destructor.
     *   The destructors deletes the argument self as well.
     */
    void (*deleter)(struct DLManagedTensor* self);
} DLManagedTensor;
#ifdef __cplusplus
}  // DLPACK_EXTERN_C
#endif
#endif  // DLPACK_DLPACK_H_