Unverified commit c73708ce authored by shijie001, committed by GitHub

Fixed GELU, added LayerNorm; added TIM-VX versions of GELU and LayerNorm (#1415)

* Fixed gelu save_graph error

Added SaveTmGeluOp()

* Added gelu timvx

* Added layernorm operator

* Added layernorm timvx
Parent cb3b6e6a
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* License); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*
* Copyright (c) 2021, OPEN AI LAB
* Author: Shijie Chen
*/
#include "layernorm_param.h"
#include "graph/tensor.h"
#include "graph/node.h"
#include "graph/graph.h"
#include "utility/sys_port.h"
#include "utility/float.h"
#include "utility/log.h"
#include "device/cpu/cpu_node.h"
#include "device/cpu/cpu_graph.h"
#include "device/cpu/cpu_module.h"
#include <math.h>
static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
{
return 0;
}
static int release_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
{
return 0;
}
static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
{
return 0;
}
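/* Reference LayerNorm (fp32): each of `count` rows is normalized over its last
 * dimension of size `norm_size` as
 *     y = (x - mean) / sqrt(var + eps) * gamma + beta
 * with mean and var computed per row (var = E[x^2] - mean^2). The loop below
 * folds this into y = (x * a + b) * gamma + beta, where a = 1/sqrt(var + eps)
 * and b = -mean * a. */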
static int ref_layernorm_fp32(struct tensor* input_tensor, struct tensor* output_tensor,
struct tensor* gamma_tensor, struct tensor* beta_tensor, float eps)
{
#if 1
// TIM-VX
int norm_size = input_tensor->dims[input_tensor->dim_num - 1];
int count = 1;
for (int i = 0; i < input_tensor->dim_num - 1; i++)
{
count *= input_tensor->dims[i];
}
#else
// PyTorch
int norm_size = gamma_tensor->elem_num;
int count = input_tensor->elem_num / gamma_tensor->elem_num;
#endif
const float* input_data = (const float*)input_tensor->data;
float* output_data = (float*)output_tensor->data;
const float* gamma_data = (const float*)gamma_tensor->data;
const float* beta_data = (const float*)beta_tensor->data;
for (int i = 0; i < count; i++)
{
float sum = 0.f;
float sqsum = 0.f;
for (int j = 0; j < norm_size; j++)
{
float x = input_data[i * norm_size + j];
sum += x;
sqsum += x * x;
}
float mean = sum / norm_size;
float var = sqsum / norm_size - mean * mean;
float a = 1.0f / sqrtf(var + eps);
float b = -mean * a;
for (int j = 0; j < norm_size; j++)
{
int offset = i * norm_size + j;
output_data[offset] = (input_data[offset] * a + b) * gamma_data[j] + beta_data[j];
}
}
return 0;
}
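/* uint8 path: dequantize the whole input into a temporary fp32 buffer with
 * x = (q - zero_point) * scale, run the same fp32 LayerNorm as above, then
 * requantize the result with q = clamp(round(y / scale + zero_point), 0, 255). */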
static int ref_layernorm_uint8(struct tensor* input_tensor, struct tensor* output_tensor,
struct tensor* gamma_tensor, struct tensor* beta_tensor, float eps)
{
#if 1
// TIM-VX
int norm_size = input_tensor->dims[input_tensor->dim_num - 1];
int count = 1;
for (int i = 0; i < input_tensor->dim_num - 1; i++)
{
count *= input_tensor->dims[i];
}
#else
// PyTorch
int norm_size = gamma_tensor->elem_num;
int count = input_tensor->elem_num / gamma_tensor->elem_num;
#endif
int total_size = input_tensor->elem_num;
float* input_data = (float*)sys_malloc(total_size * sizeof(float));
float* output_data = (float*)sys_malloc(total_size * sizeof(float));
// dequant
{
const uint8_t* input_uint8 = (const uint8_t*)input_tensor->data;
float input_scale = input_tensor->scale;
int input_zero = input_tensor->zero_point;
for (int i = 0; i < total_size; i++)
input_data[i] = ((float)input_uint8[i] - (float)input_zero) * input_scale;
}
const float* gamma_data = (const float*)gamma_tensor->data;
const float* beta_data = (const float*)beta_tensor->data;
for (int i = 0; i < count; i++)
{
float sum = 0.f;
float sqsum = 0.f;
for (int j = 0; j < norm_size; j++)
{
float x = input_data[i * norm_size + j];
sum += x;
sqsum += x * x;
}
float mean = sum / norm_size;
float var = sqsum / norm_size - mean * mean;
float a = 1.0f / sqrtf(var + eps);
float b = -mean * a;
for (int j = 0; j < norm_size; j++)
{
int offset = i * norm_size + j;
output_data[offset] = (input_data[offset] * a + b) * gamma_data[j] + beta_data[j];
}
}
// quant
{
uint8_t* output_uint8 = (uint8_t*)output_tensor->data;
float output_scale = output_tensor->scale;
int output_zero = output_tensor->zero_point;
for (int i = 0; i < total_size; i++)
{
int udata = (int)roundf(output_data[i] / output_scale + output_zero);
if (udata > 255)
udata = 255;
else if (udata < 0)
udata = 0;
output_uint8[i] = udata;
}
}
sys_free(input_data);
sys_free(output_data);
return 0;
}
static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
{
struct node* node = exec_node->ir_node;
struct graph* graph = node->graph;
struct tensor* input_tensor = get_ir_graph_tensor(graph, node->input_tensors[0]);
struct tensor* gamma_tensor = get_ir_graph_tensor(graph, node->input_tensors[1]);
struct tensor* beta_tensor = get_ir_graph_tensor(graph, node->input_tensors[2]);
struct tensor* output_tensor = get_ir_graph_tensor(graph, node->output_tensors[0]);
struct layernorm_Param* param = (struct layernorm_Param*)node->op.param_mem;
float eps = param->eps;
int ret = -1;
if (input_tensor->data_type == TENGINE_DT_FP32)
ret = ref_layernorm_fp32(input_tensor, output_tensor, gamma_tensor, beta_tensor, eps);
else if (input_tensor->data_type == TENGINE_DT_UINT8)
ret = ref_layernorm_uint8(input_tensor, output_tensor, gamma_tensor, beta_tensor, eps);
return ret;
}
static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struct node* exec_node)
{
return OPS_SCORE_BEST;
}
static struct node_ops hcl_node_ops = {.prerun = NULL,
.run = run,
.reshape = NULL,
.postrun = NULL,
.init_node = init_node,
.release_node = release_node,
.score = score};
int register_layernorm_ref_op()
{
return register_builtin_node_ops(OP_LAYERNORM, &hcl_node_ops);
}
int unregister_layernorm_ref_op()
{
return unregister_builtin_node_ops(OP_LAYERNORM, &hcl_node_ops);
}
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* License); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*
* Copyright (c) 2021, Open AI Lab
* Author: Shijie Chen
*/
#include "timvx_executor.hpp"
extern "C"
{
#include "operator/op.h"
}
bool VXEngine::AddGeluNode(struct node* ir_node)
{
struct graph* ir_graph = ir_node->graph;
struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
auto gelu = graph->CreateOperation<tim::vx::ops::Gelu>();
(*gelu)
.BindInputs({ this->vx_tensor_map[input_tensor->index] })
.BindOutputs({ this->vx_tensor_map[output_tensor->index] });
return true;
}
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* License); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*
* Copyright (c) 2021, Open AI Lab
* Author: Shijie Chen
*/
#include "timvx_executor.hpp"
extern "C"
{
#include "operator/op.h"
#include "layernorm_param.h"
}
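// Note: Tengine stores the LayerNorm inputs as (input, gamma, beta); the
// in_set remap below feeds them to TIM-VX as (input, beta, gamma), which
// appears to be the order tim::vx::ops::LayerNormalization expects.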
bool VXEngine::AddLayerNormNode(struct node* ir_node)
{
struct graph* ir_graph = ir_node->graph;
std::vector<std::shared_ptr<tim::vx::Tensor> > bn_in_tensor(ir_node->input_num);
int in_set[3] = {0, 2, 1};
for (int i = 0; i < ir_node->input_num; i++)
{
int idx = in_set[i];
struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[idx]);
bn_in_tensor[i] = this->vx_tensor_map[input_tensor->index];
}
struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
struct layernorm_Param* param = (struct layernorm_Param*)ir_node->op.param_mem;
auto layernorm = graph->CreateOperation<tim::vx::ops::LayerNormalization>(0, param->eps);
(*layernorm)
.BindInputs({ bn_in_tensor })
.BindOutputs({ this->vx_tensor_map[output_tensor->index] });
return true;
}
@@ -365,6 +365,12 @@ int VXEngine::Build(struct subgraph* subgraph)
case OP_L2NORMALIZATION:
this->AddL2normalizationNode(ir_node);
break;
case OP_GELU:
this->AddGeluNode(ir_node);
break;
case OP_LAYERNORM:
this->AddLayerNormNode(ir_node);
break;
default:
fprintf(stderr, "Tengine TIM-VX: Cannot support OP(%d).\n", ir_node->index);
break;
......
@@ -79,6 +79,7 @@ extern "C" {
#include "tim/vx/ops/transpose.h"
#include "tim/vx/ops/spatial_transformer.h"
#include "tim/vx/ops/l2normalization.h"
#include "tim/vx/ops/layernormalization.h"
#define SPEC_TYPE_CONV 1
#define SPEC_TYPE_CONV_BIAS 2
@@ -145,6 +146,8 @@ private:
bool AddUpsampleNode(struct node* ir_node);
bool AddSpatialtransformerNode(struct node* ir_node);
bool AddL2normalizationNode(struct node* ir_node);
bool AddGeluNode(struct node* ir_node);
bool AddLayerNormNode(struct node* ir_node);
public:
std::shared_ptr<tim::vx::Context> context;
......
@@ -131,5 +131,7 @@ const int timvx_supported_ops[] = {
// OP_WHERE,
// OP_SOFTPLUS,
// OP_RECIPROCAL,
OP_GELU,
OP_LAYERNORM,
// OP_BUILTIN_LAST
};
@@ -140,6 +140,7 @@ enum
OP_SPATIALTRANSFORMER,
OP_EXPAND,
OP_GELU,
OP_LAYERNORM,
OP_BUILTIN_LAST
};
......
@@ -127,3 +127,4 @@
#define OP_SPATIALTRANSFORMER_NAME "SpatialTransformer"
#define OP_EXPAND_NAME "Expand"
#define OP_GELU_NAME "Gelu"
#define OP_LAYERNORM_NAME "LayerNorm"
\ No newline at end of file
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* License); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*
* Copyright (c) 2021, OPEN AI LAB
* Author: Shijie Chen
*/
#include "layernorm_param.h"
#include "api/c_api.h"
#include "graph/tensor.h"
#include "graph/node.h"
#include "graph/graph.h"
#include "module/module.h"
#include "utility/sys_port.h"
#include "utility/log.h"
#include <string.h>
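/* LayerNorm is shape-preserving: infer_shape simply copies the input dims to
 * the output tensor. */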
static int infer_shape(struct node* node)
{
struct graph* graph = node->graph;
struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[0]);
struct tensor* output = get_ir_graph_tensor(graph, node->output_tensors[0]);
set_ir_tensor_shape(output, input->dims, input->dim_num);
return 0;
}
static int init_op(struct op* op)
{
struct layernorm_Param* param = (struct layernorm_Param*)sys_malloc(sizeof(struct layernorm_Param));
if (param == NULL)
{
return -1;
}
/*set the param default value */
memset(param, 0, sizeof(struct layernorm_Param));
op->param_mem = param;
op->param_size = sizeof(struct layernorm_Param);
op->same_shape = 0;
op->infer_shape = infer_shape;
return 0;
}
static void release_op(struct op* op)
{
sys_free(op->param_mem);
}
int register_layernorm_op()
{
struct method m;
m.version = 1;
m.init = init_op;
m.release = release_op;
return register_op(OP_LAYERNORM, OP_LAYERNORM_NAME, &m);
}
int unregister_layernorm_op()
{
return unregister_op(OP_LAYERNORM, 1);
}
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* License); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*
* Copyright (c) 2021, OPEN AI LAB
* Author: Shijie Chen
*/
#ifndef __LAYERNORM_PARAM_H__
#define __LAYERNORM_PARAM_H__
struct layernorm_Param
{
float eps;
};
#endif
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* License); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*
* Copyright (c) 2021, OPEN AI LAB
* Author: Shijie Chen
*/
#include "layernorm_param.h"
#include "graph/tensor.h"
#include "graph/node.h"
#include "graph/graph.h"
#include "graph/subgraph.h"
#include "module/module.h"
#include "serializer/serializer.h"
#include "tmfile/tm2_serializer.h"
#include "device/device.h"
#include "utility/log.h"
static int layernorm_op_map(int op)
{
return OP_LAYERNORM;
}
static int tm2_load_layernorm(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
const TM2_Operator* tm_op)
{
struct layernorm_Param* layernorm_param = (struct layernorm_Param*)ir_node->op.param_mem;
const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
const char* mem_base = tm2_priv->base;
const TM2_LayerNormParam* tm_param = (TM2_LayerNormParam*)(mem_base + tm_op->offset_t_param);
layernorm_param->eps = tm_param->eps;
return 0;
}
int register_tm2_layernorm_op()
{
struct serializer* tm2_s = find_serializer_via_name("tengine");
if (tm2_s == NULL)
{
TLOG_ERR("tengine serializer has not been registered yet\n");
return -1;
}
tm2_s->register_op_loader(tm2_s, TM2_OPTYPE_LAYERNORM, 1, tm2_load_layernorm, layernorm_op_map, NULL);
return 0;
}
int unregister_tm2_layernorm_op()
{
struct serializer* tm2_s = find_serializer_via_name("tengine");
tm2_s->unregister_op_loader(tm2_s, TM2_OPTYPE_LAYERNORM, 1, tm2_load_layernorm);
return 0;
}
@@ -151,6 +151,8 @@ typedef uint8_t tm_bool_t; /* bool is 1-byte unsigned integer */
#define TM2_OPSTR_SPATIALTRANSFORMER "SpatialTransformer"
#define TM2_OPSTR_EXPAND "Expand"
#define TM2_OPSTR_GELU "Gelu"
#define TM2_OPSTR_LAYERNORM "LayerNorm"
/* Operator types */
#define TM2_OPTYPE_ACCURACY 0 /* No Param */
#define TM2_OPTYPE_BATCHNORMALIZATION 1 /* TM2_BatchNormParam */
@@ -258,7 +260,8 @@ typedef uint8_t tm_bool_t; /* bool is 1-byte unsigned integer */
#define TM2_OPTYPE_RECIPROCAL 103
#define TM2_OPTYPE_SPATIALTRANSFORMER 105
#define TM2_OPTYPE_GELU 106
#define TM2_OPTYPE_NUM 107
#define TM2_OPTYPE_LAYERNORM 107
#define TM2_OPTYPE_NUM 108
/* --------------------- -------- TM objects -------------------------------- */
typedef struct
@@ -1006,6 +1009,11 @@ typedef struct
int dim_num;
} TM2_ExpandParam;
typedef struct
{
float eps;
} TM2_LayerNormParam;
#ifdef __cplusplus
}
#endif
......
@@ -2255,6 +2255,14 @@ static int load_gru(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node)
return 0;
}
static int load_layer_norm(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node)
{
struct layernorm_Param* layernorm_param = (struct layernorm_Param*)node->op.param_mem;
layernorm_param->eps = GetAttributeOrDefault<float>(onnx_node, "epsilon", 1e-5);
return 0;
}
/*
* OPERATOR REGISTER FUNCTION DEFINE FOR ONNX SERIALIZER START
*/
@@ -2342,6 +2350,7 @@ void onnx_serializer::register_op_load()
op_load_map["Unsqueeze"] = std::pair<int, op_load_t>(OP_UNSQUEEZE, load_unsqueeze);
op_load_map["Where"] = std::pair<int, op_load_t>(OP_WHERE, load_no_param);
op_load_map["Gelu"] = std::pair<int, op_load_t>(OP_GELU, load_no_param);
op_load_map["LayerNorm"] = std::pair<int, op_load_t>(OP_LAYERNORM, load_layer_norm);
}
/*
* OPERATOR REGISTER FUNCTION DEFINE FOR ONNX SERIALIZER END
......
@@ -103,6 +103,7 @@ extern "C" {
#include "tile_param.h"
#include "expand_param.h"
#include "spatialtransformer_param.h"
#include "layernorm_param.h"
#ifdef __cplusplus
}
......
@@ -1422,6 +1422,23 @@ tm_uoffset_t SaveTmReciprocalOp(void* const start_ptr, tm_uoffset_t* cur_pos, ir_node_t* node)
return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator));
}
tm_uoffset_t SaveTmGeluOp(void* const start_ptr, tm_uoffset_t* cur_pos, ir_node_t* node)
{
TM2_Operator tm_op;
SetTmOperator(&tm_op, TM2_OPTYPE_GELU, TM2_NOT_SET);
return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator));
}
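/* Unlike Gelu (which has no parameters and uses TM2_NOT_SET), LayerNorm writes
 * a TM2_LayerNormParam payload first and stores its offset in the TM2_Operator
 * record, matching how other parameterized ops are serialized. */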
tm_uoffset_t SaveTmLayerNormOp(void* const start_ptr, tm_uoffset_t* cur_pos, ir_node_t* node)
{
struct layernorm_Param* p = (struct layernorm_Param*)node->op.param_mem;
TM2_LayerNormParam tm_param;
tm_param.eps = p->eps;
TM2_Operator tm_op;
SetTmOperator(&tm_op, TM2_OPTYPE_LAYERNORM, WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_LayerNormParam)));
return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator));
}
op_save_t SaveTmOpFunc(uint32_t op_type)
{
switch (op_type)
@@ -1606,6 +1623,10 @@ op_save_t SaveTmOpFunc(uint32_t op_type)
return SaveTmMaximumOp;
case OP_MINIMUM:
return SaveTmMinimumOp;
case OP_GELU:
return SaveTmGeluOp;
case OP_LAYERNORM:
return SaveTmLayerNormOp;
default:
// fprintf(stderr, "Operator #%d not supported in tengine model yet\n", op_type);
return nullptr;
......