Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
MegEngine 天元
MegEngine
提交
a5dea703
MegEngine
项目概览
MegEngine 天元
/
MegEngine
接近 2 年 前同步成功
通知
414
Star
4708
Fork
583
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
MegEngine
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
提交
a5dea703
编写于
9月 09, 2022
作者:
M
Megvii Engine Team
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
build(mge): add boost (part 1)
GitOrigin-RevId: 93e202dc5decbd153482edb5f605523d837800bc
上级
1dd27b39
变更
6
展开全部
隐藏空白更改
内联
并排
Showing
6 changed files
with
2484 additions
and
1 deletion
+2484
-1
ci/compatibility/fbs/V2-backup/dtype.fbs
ci/compatibility/fbs/V2-backup/dtype.fbs
+43
-0
ci/compatibility/fbs/V2-backup/mgb_cpp_opr.fbs
ci/compatibility/fbs/V2-backup/mgb_cpp_opr.fbs
+62
-0
ci/compatibility/fbs/V2-backup/mgb_opr_param_defs.fbs
ci/compatibility/fbs/V2-backup/mgb_opr_param_defs.fbs
+237
-0
ci/compatibility/fbs/V2-backup/opr_param_defs.fbs
ci/compatibility/fbs/V2-backup/opr_param_defs.fbs
+1912
-0
ci/compatibility/fbs/V2-backup/schema_v2.fbs
ci/compatibility/fbs/V2-backup/schema_v2.fbs
+228
-0
imperative/CMakeLists.txt
imperative/CMakeLists.txt
+2
-1
未找到文件。
ci/compatibility/fbs/V2-backup/dtype.fbs
0 → 100644
浏览文件 @
a5dea703
namespace mgb.serialization.fbs;
// Keep in sync with dnn/include/megdnn/dtype.h
// Please only add new dtypes at the end of this list: the underlying enum
// value is part of the serialized model format, so inserting or reordering
// entries would silently change the meaning of previously dumped models.
enum DTypeEnum : byte {
Float32,
Uint8,
Int8,
Int16,
Int32,
IntB1,
IntB2,
IntB4,
Byte,
Float16,
UintB4,
Quantized8Asymm,
QuantizedS32,
QuantizedS8,
Quantized4Asymm,
QuantizedS4,
QuantizedS16,
BFloat16,
Bool,
Uint16,
QuantizedS1,
}
// Parameters of a linearly quantized dtype.
// NOTE(review): presumably real_value = scale * (stored_value - zero_point);
// confirm against dnn/include/megdnn/dtype.h.
table LinearQuantizationParam {
scale:float;
// Won't be set for symmetric quantization types
zero_point:ubyte;
}
// Per-dtype parameter payload; currently only linear quantization params exist.
union DTypeParam {
LinearQuantizationParam,
}
// A serialized dtype: its enum tag plus optional quantization parameters.
table DType {
type:DTypeEnum;
param:DTypeParam;
}
ci/compatibility/fbs/V2-backup/mgb_cpp_opr.fbs
0 → 100644
浏览文件 @
a5dea703
include "dtype.fbs";
namespace mgb.serialization.fbs.param;
struct PersistentDTypeScalar {
dtype:DTypeEnum;
storage:[ubyte:4];
}
table MGBAddUpdate {
alpha:PersistentDTypeScalar;
beta:PersistentDTypeScalar;
bias:PersistentDTypeScalar;
}
table Host2DeviceCopy {
enable_value_infer:bool = true;
dump_default_value:bool = false;
allow_cpu_mem_fwd:bool = true;
}
table Dimshuffle {
pattern:[int];
ndim:uint;
}
enum AxisDescMethod : byte {
ADD_1,
REMOVE,
}
struct AxisDesc {
method:AxisDescMethod;
axis:int;
}
table AxisAddRemove {
desc:[AxisDesc];
}
table MGBSleep {
device:bool = true;
host:bool = false;
seconds:double;
}
struct IndexDescMaskItem {
axis:byte;
begin:bool;
end:bool;
step:bool;
idx:bool;
}
table IndexDescMaskDump {
items:[IndexDescMaskItem];
}
table NMSKeep {
iou_thresh:float;
max_output:uint;
}
ci/compatibility/fbs/V2-backup/mgb_opr_param_defs.fbs
0 → 100644
浏览文件 @
a5dea703
// generated by gen_param_defs.py for c23d51f3c4f33119fd74f58f04d112ccea8f64f1249ab372300975ab7e710e9a
// NOTE(review): this is a frozen V2-backup copy used by the compatibility CI;
// do not regenerate it in place.
include "dtype.fbs";
namespace mgb.serialization.fbs.param;
/// mode of collective communication
enum CollectiveCommMode : uint {
/// reduce by sum to output computing node
REDUCE_SUM = 0,
/// copy input value to each output computing node
BROADCAST = 1,
/// each output comp node gets the concatenated value of all inputs
ALL_GATHER = 2,
/// reduce inputs by sum and each output gets one part of it
REDUCE_SCATTER_SUM = 3,
/// every output gets the sum of all inputs
ALL_REDUCE_SUM = 4,
/// every output gets the max of all inputs
ALL_REDUCE_MAX = 5,
/// every output gets the min of all inputs
ALL_REDUCE_MIN = 6,
/// every output gets the prod of all inputs
ALL_REDUCE_PROD = 7,
/// concat inputs to one node
GATHER = 8,
/// scatter input to each output computing node
SCATTER = 9,
/// scatter inputs and gather them on each computing node
ALL_TO_ALL = 10,
}
/// mode for computing the gradient
enum CondExecMarkGradMode : uint {
/// normal gradient mode: sum all the activated components
SUM = 0,
/// use :attr:`CondExecMerge.SUM_COND_OUT` mode so oprs that depend on the
/// gradient opr would not be executed if the forward var is not used.
SUM_COND_OUT = 1,
}
/// static inference option. **Note:** This is a workaround: since
/// currently static inference in MegBrain does not take conditional
/// execution into account, this option can be used to bypass static
/// inference errors. This is currently only used by automatically
/// generated gradient oprs.
enum CondExecMarkStaticInfer : uint {
/// enable both shape and value inference
SHAPE_VALUE = 0,
/// only enable shape inference (disable value inference)
SHAPE_ONLY = 1,
/// disable both shape and value inference
NONE = 2,
}
enum CondExecMergeMode : uint {
/// copy the var whose mask is activated to the output, requiring that
/// exactly one branch is active
EXACT_ONE = 0,
/// like :attr:`EXACT_ONE` with the requirement that all branches have the
/// same shape, so shape inference can be easier
EXACT_ONE_SAME_SHAPE = 1,
/// sum all the active branches into output var; require all branches to
/// have the same shape. Extra shape vars are needed in this mode, so the
/// outputs can be initialized to zero when no input is active (and their
/// shapes are probably unknown).
SUM = 2,
/// like :attr:`SUM` but also add an ExecutionMask to the readers of output
/// vars, so they would be skipped if no branch is taken
SUM_COND_OUT = 3,
}
/// how to compare predicate var with branch keys
enum CondExecPredMode : uint {
/// The outputs correspond to branch keys, and the one which equals
/// predicate would be activated. This behaves like a case-statement in many
/// languages.
CASE = 0,
/// like :attr:`CASE`, but add an extra output that would be activated if no
/// branch is matched
CASE_FALLBACK = 1,
/// One more outputs would be produced than the number of branch keys,
/// representing the interval in which the predicate var fits in. The
/// intervals are defined as :math:`(-\\infty, k_0), [k_0, k_1), \\ldots,
/// [k_{n-2}, k_{n-1}), [k_{n-1}, \\infty)`. The keys must be given in
/// ascending order.
PIECEWISE = 2,
}
enum CondExecPredLogicalMode : uint {
/// logical or
OR = 0,
/// logical and
AND = 1,
/// exclusive-or
XOR = 2,
/// not or(inputs)
NOR = 3,
/// not and(inputs)
NAND = 4,
/// not xor(inputs)
XNOR = 5,
}
enum ExecutionPolicyStrategy : uint (bit_flags) {
/// use heuristic to choose the fastest algorithm
HEURISTIC = 0,
/// run possible algorithms on real device to find the best
PROFILE = 1,
/// when using profile or heuristic algo selection, require the chosen
/// algos to be reproducible
REPRODUCIBLE = 2,
/// when profiling, only consider algos that are optimized for fast
/// profiling
OPTIMIZED = 3,
}
enum ExecutionPolicyV0Strategy : uint {
/// use heuristic to choose the fastest algorithm
HEURISTIC = 0,
/// use heuristic to choose the fastest algorithm, and the chosen algorithm
/// is reproducible
HEURISTIC_REPRODUCIBLE = 1,
/// run possible algorithms on real device to find the best
PROFILE = 2,
/// the fastest of profile result that is also reproducible
PROFILE_REPRODUCIBLE = 3,
/// use profile result and heuristic to choose the fastest algorithm
PROFILE_HEURISTIC = 4,
}
/// wraps a single DTypeEnum value as an operator parameter
table DType {
dtype:DTypeEnum = Byte;
}
table PersistentOutputStorage {
/// This is used for controlling memory sharing. Multiple
/// ``PersistentOutputStorage'' oprs with the same ``share_key'' would share
/// underlying tensor storage. Note that the value ``-1'' is treated
/// specially: storage of oprs with this key would be private and would not
/// be shared with any other opr.
share_key:int = -1;
}
/// optional axis: axis == -1 means no axis
table OptionalAxis {
axis:int = -1;
}
/// optional axis: axis == MAX_NDIM (7) means no axis
table OptionalAxisV1 {
axis:int = 7;
}
table ExecutionPolicyV0 {
strategy:ExecutionPolicyV0Strategy = HEURISTIC;
/// workspace limit in bytes
workspace_limit:ulong = 18446744073709551615;
}
/// specify how to select an algorithm for an operator
table ExecutionPolicy {
// default is the HEURISTIC bit flag
strategy:ExecutionPolicyStrategy = 1;
/// workspace limit in bytes
workspace_limit:ulong = 18446744073709551615;
}
table AssertEqual {
/// max allowed error; error is defined as the minimal of absolute and
/// relative error
maxerr:float = 0.0001;
/// whether to print maxerr to stdout during opr exec
verbose:bool = false;
}
/// parameters of an FPGA convolution opr
table FpgaConv {
need_output_quantize:bool = false;
need_output_threshold:bool = false;
stride:int = 1;
input_bit_width:int = 2;
output_bit_width:int = 2;
weight_bit_width:int = 2;
thres0:int = 0;
thres1:int = 1;
unpool_size:uint = 4;
direct_size:uint = 4;
}
/// collective communication between multiple computing nodes on localhost
table CollectiveComm {
/// mode of collective communication
mode:CollectiveCommMode = REDUCE_SUM;
}
/// HACK: The tag of this param def is actually used for another non-generated
/// param def SerializedDType, the sole purpose of this param def is to provide
/// a spare tag. Do not use.
table FakeSerializedDType {
}
/// evaluate a predicate and branch keys to setup ExecutionMask objects with
/// associated predicate proxy vars (PPVs)
table CondExecPred {
/// how to compare predicate var with branch keys
mode:CondExecPredMode = CASE;
/// threshold for checking equality of float point values
eps:float = 0.0001;
}
/// compute a logical function over a set of PPVs
table CondExecPredLogical {
mode:CondExecPredLogicalMode = OR;
}
/// add ExecutionMask of the input PPV to this opr and readers of the outputs of
/// this opr
table CondExecMark {
/// mode for computing the gradient
grad_mode:CondExecMarkGradMode = SUM;
/// static inference option. **Note:** This is a workaround: since
/// currently static inference in MegBrain does not take conditional
/// execution into account, this option can be used to bypass static
/// inference errors. This is currently only used by automatically
/// generated gradient oprs.
static_infer:CondExecMarkStaticInfer = SHAPE_VALUE;
}
/// merge multiple conditional execution branches
table CondExecMerge {
/// number of output vars (i.e. vars per branch)
nr_output:uint = 1;
mode:CondExecMergeMode = EXACT_ONE;
}
/// opr implementing the NVIDIA Optical Flow SDK
table NvOf {
precision:uint = 1;
}
ci/compatibility/fbs/V2-backup/opr_param_defs.fbs
0 → 100644
浏览文件 @
a5dea703
此差异已折叠。
点击以展开。
ci/compatibility/fbs/V2-backup/schema_v2.fbs
0 → 100644
浏览文件 @
a5dea703
include "dtype.fbs";
include "opr_param_defs.fbs";
include "mgb_opr_param_defs.fbs";
include "mgb_cpp_opr.fbs";
namespace mgb.serialization.fbs.v2;
file_identifier "mge2";
table CompNode {
logical_locator:string;
}
table DefaultTensorFormat{}
table Image2DPackedTensorFormat{
align_axis: ubyte;
}
table LowbitsAlignedTensorFormat{
size_nbits: ubyte;
align_size_in_bits: ubyte;
}
/// The Tensor Format
union TensorFormat {
DefaultTensorFormat = 1,
Image2DPackedTensorFormat = 2,
LowbitsAlignedTensorFormat = 3,
}
/// Opaque byte buffer defined by operator implementation
table Blob {
data:[ubyte];
}
table Tensor {
name:string;
shape:[uint];
comp_node:CompNode;
dtype:DType;
format:TensorFormat;
/// The tensor raw data
data:[ubyte];
}
// Placeholder table reserving a union tag.
table Reserved0 {}
// Placeholder for a removed parameter type whose tag must stay allocated.
table DeprecatedParam {}
/// Parameter payload of an Operator. The explicit tag numbers are part of
/// the serialized format: never reuse or renumber a tag; append new members
/// with fresh tags at the end.
union OperatorParam {
param.Empty = 1,
param.Axis = 2,
param.Convolution = 3,
param.MaskPropagate = 4,
param.ConvPooling = 5,
param.ConvBias = 6,
param.SeparableConv = 7,
param.Images2Neibs = 8,
param.Pooling = 9,
param.LRN = 10,
param.BN = 11,
param.ROIPooling = 12,
param.WarpPerspective = 13,
param.SpatialTfGridGenerator = 14,
param.SpatialTfSampler = 15,
param.MGBAddUpdate = 16,
param.Elemwise = 17,
param.ElemwiseMultiType = 18,
param.PowC = 19,
param.MatrixMul = 20,
// Reserved for param.Winograd = 21 (removed; tag kept via DeprecatedParam)
DeprecatedParam = 21,
param.SVD = 22,
param.Reduce = 23,
param.Cumsum = 24,
param.CondTake = 25,
param.Argsort = 26,
param.IndexingRemap = 27,
param.MGBSleep = 28,
param.Linspace = 29,
param.LinspaceFull = 30,
param.Eye = 31,
param.UniformRNG = 32,
param.GaussianRNG = 33,
param.Flip = 34,
param.Rotate = 35,
param.ROICopy = 36,
param.CvtColor = 37,
param.WarpAffine = 38,
param.GaussianBlur = 39,
param.Resize = 40,
param.Convolution3D = 41,
param.Conv3DBias = 42,
param.SeparableConv3D = 43,
param.TopK = 44,
param.RelayoutFormat = 45,
param.SeparableFilter = 46,
param.LocalShare = 47,
param.ROIAlign = 48,
param.DeformablePSROIPooling = 49,
param.BatchConvBias = 50,
param.DType = 51,
param.PersistentOutputStorage = 52,
param.OptionalAxis = 53,
param.OptionalAxisV1 = 54,
param.ExecutionPolicy = 55,
param.AssertEqual = 56,
param.FpgaConv = 57,
param.CollectiveComm = 58,
param.CondExecPred = 59,
param.CondExecPredLogical = 60,
param.CondExecMark = 61,
param.CondExecMerge = 62,
param.Host2DeviceCopy = 63,
param.Dimshuffle = 64,
param.AxisAddRemove = 65,
param.IndexDescMaskDump = 66,
// the serialization DType table from dtype.fbs (distinct from param.DType)
DType = 67,
param.Remap = 68,
param.NMSKeep = 69,
param.AdaptivePooling = 70,
param.NvOf = 71,
param.DctChannelSelect = 72,
param.FakeQuant = 73,
param.TQT = 74,
param.Correlation = 75,
param.LSQ = 76,
param.GammaRNG = 77,
param.PoissonRNG = 78,
param.PermutationRNG = 79,
param.BetaRNG = 80,
param.SlidingWindowTranspose = 81,
param.Padding = 82,
param.ShuffleRNG = 83,
param.CheckNonFinite = 84,
param.LayerNorm = 85,
param.Dropout = 86,
param.RNNCell = 87,
param.RNN = 88,
param.LSTM = 89,
param.Softmax = 90,
param.Diag = 91,
}
/// A single operator node in the serialized graph.
table Operator {
/// the Operator type name
type:string;
/// the type name may be absent in some dumps, so a numeric type_id is also
/// stored
type_id:ulong;
name:string;
/// Operator parameter
param:OperatorParam;
/// Operator may want to save more than one OperatorParam
additional_params:[OperatorParam];
/// ID of the input tensor in the middle_tensors of a model
inputs:[uint];
/// ID of the output tensor in the middle_tensors of a model
outputs:[uint];
comp_node:[CompNode];
output_dtype:DType;
/// the const value in tensor format of the Operator
tensors:[Tensor];
/// opr version; as MegEngine evolves, some oprs may have multiple versions
opr_version:uint;
/// the order of the Operator in the graph
priority:int = 0;
/// custom oprs may want to save big, opaque byte buffers
custom_data:[Blob];
}
/// Optional model-level metadata and user-supplied info.
table Metadata {
is_valid:bool;
graph_modified:bool;
optimize_options:ulong;
user_info:string;
}
/// Description of an intermediate tensor (no data, only layout info).
table MiddleTensor {
name:string;
shape:[uint];
comp_node:CompNode;
dtype:DType;
format:TensorFormat;
}
table OutputVar {
/// the id of the middle tensor in graph, the same as the inputs in Operator
compact_id:uint;
original_id:uint;
}
/// Maps an output var id to a user-visible name.
table OutputAlias {
id:uint;
name:string;
}
table Model {
/// the megengine version when serialize the model
mge_version:uint;
/// model version, now model support:
/// version v1: the original fbs serialization version
/// version v2: support backward and poor forward compatibility
model_version:uint;
oprs:[Operator];
/// the tensors produce and consume by the Operators, not the input or
/// output tensor
middle_tensors:[MiddleTensor];
output_vars_idx:[OutputVar];
output_alias:[OutputAlias];
nr_shared_tensor:uint;
/// the Metadata to storage the custom data or some flags
metadata:Metadata;
}
root_type Model;
imperative/CMakeLists.txt
浏览文件 @
a5dea703
...
...
@@ -66,7 +66,8 @@ target_link_libraries(${MODULE_NAME} PRIVATE nlohmann_json::nlohmann_json)
target_include_directories
(
${
MODULE_NAME
}
PUBLIC src/include
PRIVATE
${
PYTHON_INCLUDE_DIRS
}
${
NUMPY_INCLUDE_DIR
}
${
CPP_REDIS_INCLUDES
}
)
PRIVATE
${
PROJECT_SOURCE_DIR
}
/third_party/boost_subset/boost
${
PYTHON_INCLUDE_DIRS
}
${
NUMPY_INCLUDE_DIR
}
${
CPP_REDIS_INCLUDES
}
)
target_link_libraries
(
${
MODULE_NAME
}
PRIVATE mgb_opdef_inc
)
target_compile_definitions
(
${
MODULE_NAME
}
PRIVATE MODULE_NAME=
${
MODULE_NAME
}
)
target_compile_options
(
${
MODULE_NAME
}
PRIVATE -Wno-unused-parameter
)
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录