diff --git a/lite/example/cpp_example/example.h b/lite/example/cpp_example/example.h
index 929fff9328917453ebeae71583d61f2992c3de51..aaafdd387b6f5edcfe05895bd70cc819b3542906 100644
--- a/lite/example/cpp_example/example.h
+++ b/lite/example/cpp_example/example.h
@@ -67,7 +67,8 @@ bool config_user_allocator(const Args& args);
 bool register_cryption_method(const Args& args);
 bool update_cryption_key(const Args& args);
 bool async_forward(const Args& args);
-
+bool set_input_callback(const Args& args);
+bool set_output_callback(const Args& args);
 #if LITE_WITH_CUDA
 bool device_input(const Args& args);
 bool device_input_output(const Args& args);
diff --git a/lite/example/cpp_example/main.cpp b/lite/example/cpp_example/main.cpp
index cffb245bf705fd253e3aa47245e93e3efb7ad490..0100b230c016f1d860b73bfc5290fa1c9dbcac39 100644
--- a/lite/example/cpp_example/main.cpp
+++ b/lite/example/cpp_example/main.cpp
@@ -160,6 +160,8 @@ REGIST_EXAMPLE("reset_input", reset_input);
 REGIST_EXAMPLE("reset_input_output", reset_input_output);
 REGIST_EXAMPLE("config_user_allocator", config_user_allocator);
 REGIST_EXAMPLE("async_forward", async_forward);
+REGIST_EXAMPLE("set_input_callback", set_input_callback);
+REGIST_EXAMPLE("set_output_callback", set_output_callback);
 REGIST_EXAMPLE("basic_c_interface", basic_c_interface);
 REGIST_EXAMPLE("device_io_c_interface", device_io_c_interface);
diff --git a/lite/example/cpp_example/mge/basic.cpp b/lite/example/cpp_example/mge/basic.cpp
index 6dfafc2e72b04ba860bc392a5abb48ef7f4f5410..55e20270f9c0cf03b6519d71693025cf5b1c2686 100644
--- a/lite/example/cpp_example/mge/basic.cpp
+++ b/lite/example/cpp_example/mge/basic.cpp
@@ -365,6 +365,144 @@ bool lite::example::async_forward(const Args& args) {
     printf("max=%e, sum=%e\n", max, sum);
     return true;
 }
+
+bool lite::example::set_input_callback(const Args& args) {
+    std::string network_path = args.model_path;
+    std::string input_path = args.input_path;
+    Config config;
+    config.options.var_sanity_check_first_run = false;
+
+    //! create and load the network
+    std::shared_ptr<Network> network = std::make_shared<Network>(config);
+
+    network->load_model(network_path);
+
+    //! set input data to input tensor
+    std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
+    //! copy or forward data to network
+    size_t length = input_tensor->get_tensor_total_size_in_byte();
+    void* dst_ptr = input_tensor->get_memory_ptr();
+    auto src_tensor = parse_npy(input_path);
+    void* src = src_tensor->get_memory_ptr();
+    memcpy(dst_ptr, src, length);
+
+    //! set input callback
+    volatile bool finished = false;
+    network->set_start_callback(
+            [&finished](const std::unordered_map<
+                        std::string, std::pair<IO, std::shared_ptr<Tensor>>>& inputs) {
+#if !__DEPLOY_ON_XP_SP2__
+                std::cout << "worker thread_id:" << std::this_thread::get_id()
+                          << std::endl;
+#endif
+                for (auto&& item : inputs) {
+                    std::cout << "input name: " << item.first
+                              << " input dim: " << item.second.second->get_layout().ndim
+                              << std::endl;
+                }
+                finished = true;
+            });
+
+#if !__DEPLOY_ON_XP_SP2__
+    std::cout << "out thread_id:" << std::this_thread::get_id() << std::endl;
+#endif
+
+    //! forward
+    network->forward();
+    size_t count = 0;
+    while (finished == false) {
+        count++;
+    }
+    printf("Forward finish, count is %zu\n", count);
+    //! wait until forward is done before reading the output
+    network->wait();
+
+    //! get the output data or read tensor set in network_in
+    std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
+    void* out_data = output_tensor->get_memory_ptr();
+    size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
+                        output_tensor->get_layout().get_elem_size();
+    printf("length=%zu\n", out_length);
+    float max = -1.0f;
+    float sum = 0.0f;
+    for (size_t i = 0; i < out_length; i++) {
+        float data = static_cast<float*>(out_data)[i];
+        sum += data;
+        if (max < data)
+            max = data;
+    }
+    printf("max=%e, sum=%e\n", max, sum);
+    return true;
+}
+
+bool lite::example::set_output_callback(const Args& args) {
+    std::string network_path = args.model_path;
+    std::string input_path = args.input_path;
+    Config config;
+    config.options.var_sanity_check_first_run = false;
+
+    //! create and load the network
+    std::shared_ptr<Network> network = std::make_shared<Network>(config);
+
+    network->load_model(network_path);
+
+    //! set input data to input tensor
+    std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
+    //! copy or forward data to network
+    size_t length = input_tensor->get_tensor_total_size_in_byte();
+    void* dst_ptr = input_tensor->get_memory_ptr();
+    auto src_tensor = parse_npy(input_path);
+    void* src = src_tensor->get_memory_ptr();
+    memcpy(dst_ptr, src, length);
+
+    //! set output callback
+    volatile bool finished = false;
+    network->set_finish_callback(
+            [&finished](const std::unordered_map<
+                        std::string, std::pair<IO, std::shared_ptr<Tensor>>>& outputs) {
+#if !__DEPLOY_ON_XP_SP2__
+                std::cout << "worker thread_id:" << std::this_thread::get_id()
+                          << std::endl;
+#endif
+                for (auto&& item : outputs) {
+                    std::cout << "output name: " << item.first
+                              << " output dim: " << item.second.second->get_layout().ndim
+                              << std::endl;
+                }
+                finished = true;
+            });
+
+#if !__DEPLOY_ON_XP_SP2__
+    std::cout << "out thread_id:" << std::this_thread::get_id() << std::endl;
+#endif
+
+    //! forward
+    network->forward();
+    network->wait();
+    size_t count = 0;
+    while (finished == false) {
+        count++;
+    }
+    printf("Forward finish, count is %zu\n", count);
+
+    //! get the output data or read tensor set in network_in
+    std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
+    void* out_data = output_tensor->get_memory_ptr();
+    size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
+                        output_tensor->get_layout().get_elem_size();
+    printf("length=%zu\n", out_length);
+    float max = -1.0f;
+    float sum = 0.0f;
+    for (size_t i = 0; i < out_length; i++) {
+        float data = static_cast<float*>(out_data)[i];
+        sum += data;
+        if (max < data)
+            max = data;
+    }
+    printf("max=%e, sum=%e\n", max, sum);
+    return true;
+}
+
 #endif
 
 // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
diff --git a/lite/lite-c/include/lite-c/network_c.h b/lite/lite-c/include/lite-c/network_c.h
index 64ca7a377f7358eebb86afff97acc7ffde0623b6..ff838d0e815e2bfd53182c63fc6b24f08aff9246 100644
--- a/lite/lite-c/include/lite-c/network_c.h
+++ b/lite/lite-c/include/lite-c/network_c.h
@@ -184,6 +184,8 @@ typedef int (*LiteThreadAffinityCallback)(int thread_id);
 
 typedef int (*LiteAsyncCallback)();
 
+typedef int (*LiteAsyncCallbackWithData)(void* user_data);
+
 /*!
 * \brief the start/finish callback function
 * \param unordered_map map from the io tensor name to the pair of which is the
@@ -193,9 +195,17 @@ typedef int (*LiteAsyncCallback)();
 typedef int (*LiteStartCallback)(
         const LiteIO* inputs, const LiteTensor* input_tensors, size_t size);
 
+typedef int (*LiteStartCallbackWithData)(
+        const LiteIO* inputs, const LiteTensor* input_tensors, size_t size,
+        void* user_data);
+
 typedef int (*LiteFinishCallback)(
         const LiteIO* outputs, const LiteTensor* output_tensors, size_t size);
 
+typedef int (*LiteFinishCallbackWithData)(
+        const LiteIO* outputs, const LiteTensor* output_tensors, size_t size,
+        void* user_data);
+
 /*!
 * \brief The network is construct form a model, implement model load, init,
 * forward, and display some model information
@@ -442,6 +452,19 @@ LITE_API int LITE_set_network_algo_workspace_limit(
 LITE_API int LITE_set_async_callback(
         LiteNetwork network, const LiteAsyncCallback async_callback);
 
+/**
+ * \brief set the network to forward in async mode and set the async callback
+ * function
+ * \param[in] network The loaded model
+ * \param[in] async_callback the callback invoked when the network finishes
+ * forwarding
+ * \param[in] user_data user-defined data passed back to the callback when
+ * forward finishes
+ */
+LITE_API int LITE_set_async_callback_with_userdata(
+        LiteNetwork network, const LiteAsyncCallbackWithData async_callback,
+        void* user_data);
+
 /**
  * \brief set the start forward callback function, which will be execute beform
  * forward, this can be used to check network input or dump model inputs
@@ -453,6 +476,20 @@ LITE_API int LITE_set_async_callback(
 LITE_API int LITE_set_start_callback(
         LiteNetwork network, const LiteStartCallback start_callback);
 
+/**
+ * \brief set the start forward callback function, which will be executed
+ * before forward; this can be used to check network inputs or dump model
+ * inputs for debugging
+ * \param[in] network The loaded model
+ * \param[in] start_callback the callback invoked when the network starts
+ * forwarding
+ * \param[in] user_data user-defined data passed back to the callback when
+ * forward starts
+ */
+LITE_API int LITE_set_start_callback_with_userdata(
+        LiteNetwork network, const LiteStartCallbackWithData start_callback,
+        void* user_data);
+
 /**
  * \brief set the finish forward callback function, which will be execute after
  * forward, this can be used to dump model outputs for debug
@@ -463,6 +500,19 @@ LITE_API int LITE_set_start_callback(
 LITE_API int LITE_set_finish_callback(
         LiteNetwork network, const LiteFinishCallback finish_callback);
 
+/**
+ * \brief set the finish forward callback function, which will be executed
+ * after forward; this can be used to dump model outputs for debugging
+ * \param[in] network The loaded model
+ * \param[in] finish_callback the callback invoked when the network finishes
+ * forwarding
+ * \param[in] user_data user-defined data passed back to the callback when
+ * forward finishes
+ */
+LITE_API int LITE_set_finish_callback_with_userdata(
+        LiteNetwork network, const LiteFinishCallbackWithData finish_callback,
+        void* user_data);
+
 /**
  * \brief set threads affinity callback
  * \param[in] network The loaded model
diff --git a/lite/lite-c/src/network.cpp b/lite/lite-c/src/network.cpp
index d9419df6028967cc0cc90419102bb947dc2f1e93..51df08cf99c10316d36628fff4013afb1af11542 100644
--- a/lite/lite-c/src/network.cpp
+++ b/lite/lite-c/src/network.cpp
@@ -355,6 +355,22 @@ int LITE_set_async_callback(
     LITE_CAPI_END();
 }
 
+int LITE_set_async_callback_with_userdata(
+        LiteNetwork network, LiteAsyncCallbackWithData async_callback,
+        void* user_data) {
+    LITE_CAPI_BEGIN();
+    LITE_ASSERT(network, "The network pass to LITE api is null");
+    LITE_ASSERT(async_callback, "The ptr pass to LITE api is null");
+
+    auto lite_async_callback = [async_callback, user_data]() -> void {
+        async_callback(user_data);
+    };
+    static_cast<lite::Network*>(network)->set_async_callback(
+            std::move(lite_async_callback));
+
+    LITE_CAPI_END();
+}
+
 int LITE_set_start_callback(
         LiteNetwork network, const LiteStartCallback start_callback) {
     LITE_CAPI_BEGIN();
@@ -381,6 +397,34 @@ int LITE_set_start_callback(
     LITE_CAPI_END();
 }
 
+int LITE_set_start_callback_with_userdata(
+        LiteNetwork network, const LiteStartCallbackWithData start_callback,
+        void* user_data) {
+    LITE_CAPI_BEGIN();
+    LITE_ASSERT(network, "The network pass to LITE api is null");
+    auto lite_start_callback =
+            [start_callback,
+             user_data](const std::unordered_map<
+                        std::string,
+                        std::pair<IO, std::shared_ptr<Tensor>>>& inputs_map)
+            -> void {
+        std::vector<LiteIO> ios;
+        std::vector<LiteTensor> io_tensors;
+        size_t nr_io = 0;
+        for (const auto& io : inputs_map) {
+            nr_io++;
+            auto&& lite_io = io.second.first;
+            ios.push_back(
+                    {lite_io.name.c_str(), lite_io.is_host, lite_io.io_type,
+                     convert_to_clayout(lite_io.config_layout)});
+            io_tensors.push_back(io.second.second.get());
+        }
+        start_callback(ios.data(), io_tensors.data(), nr_io, user_data);
+    };
+    static_cast<lite::Network*>(network)->set_start_callback(lite_start_callback);
+    LITE_CAPI_END();
+}
+
 int LITE_set_finish_callback(
         LiteNetwork network, const LiteFinishCallback finish_callback) {
     LITE_CAPI_BEGIN();
@@ -407,6 +451,34 @@ int LITE_set_finish_callback(
     LITE_CAPI_END();
 }
 
+int LITE_set_finish_callback_with_userdata(
+        LiteNetwork network, const LiteFinishCallbackWithData finish_callback,
+        void* user_data) {
+    LITE_CAPI_BEGIN();
+    LITE_ASSERT(network, "The network pass to LITE api is null");
+    auto lite_finish_callback =
+            [finish_callback,
+             user_data](const std::unordered_map<
+                        std::string,
+                        std::pair<IO, std::shared_ptr<Tensor>>>&
+                                outputs_map) -> void {
+        std::vector<LiteIO> ios;
+        std::vector<LiteTensor> io_tensors;
+        size_t nr_io = 0;
+        for (const auto& io : outputs_map) {
+            nr_io++;
+            auto&& lite_io = io.second.first;
+            ios.push_back(
+                    {lite_io.name.c_str(), lite_io.is_host, lite_io.io_type,
+                     convert_to_clayout(lite_io.config_layout)});
+            io_tensors.push_back(io.second.second.get());
+        }
+        finish_callback(ios.data(), io_tensors.data(), nr_io, user_data);
+    };
+    static_cast<lite::Network*>(network)->set_finish_callback(lite_finish_callback);
+    LITE_CAPI_END();
+}
+
 int LITE_enable_profile_performance(
         LiteNetwork network, const char* profile_json_file_path) {
     LITE_CAPI_BEGIN();
diff --git a/lite/test/test_network_c.cpp b/lite/test/test_network_c.cpp
index 21b79f63182beb82bf5c13660a0cd774a93b4ad7..0c16857b9f25821aa869103359942fb9cb05cbe9 100644
--- a/lite/test/test_network_c.cpp
+++ b/lite/test/test_network_c.cpp
@@ -74,11 +74,21 @@ int multi_thread_affinity(int id) {
 };
 
 volatile bool finished = false;
-int finish_callback() {
+int async_callback() {
     finished = true;
     return 0;
 }
 
+volatile bool finished_with_data = false;
+int async_callback_with_data(void* user_data) {
+    if (user_data != NULL) {
+        std::cout << "async_callback user_data addr=" << std::hex << user_data
+                  << std::endl;
+    }
+    finished_with_data = true;
+    return 0;
+}
+
 volatile bool start_checked = false;
 int start_callback(const LiteIO* inputs, const LiteTensor* input_tensors, size_t size) {
     start_checked = true;
@@ -96,6 +106,29 @@ int start_callback(const LiteIO* inputs, const LiteTensor* input_tensors, size_t size) {
     return 0;
 }
 
+volatile bool start_checked_with_data = false;
+int start_callback_with_data(
+        const LiteIO* inputs, const LiteTensor* input_tensors, size_t size,
+        void* user_data) {
+    start_checked_with_data = true;
+    auto check_func = [&]() {
+        if (user_data != NULL) {
+            std::cout << "start_callback user_data addr=" << std::hex << user_data
+                      << std::endl;
+        }
+        ASSERT_EQ(size, 1);
+        ASSERT_EQ(std::string(inputs->name), "data");
+        LiteLayout layout;
+        LITE_get_tensor_layout(*input_tensors, &layout);
+        ASSERT_EQ(layout.ndim, 4);
+        ASSERT_EQ(layout.shapes[1], 3);
+        ASSERT_EQ(layout.shapes[2], 224);
+        ASSERT_EQ(layout.shapes[3], 224);
+    };
+    check_func();
+    return 0;
+}
+
 volatile bool finish_checked = false;
 int finish_callback(
         const LiteIO* outputs, const LiteTensor* output_tensors, size_t size) {
@@ -113,6 +146,28 @@ int finish_callback(
     return 0;
 }
 
+volatile bool finish_checked_with_data = false;
+int finish_callback_with_data(
+        const LiteIO* outputs, const LiteTensor* output_tensors, size_t size,
+        void* user_data) {
+    finish_checked_with_data = true;
+    auto check_func = [&]() {
+        if (user_data != NULL) {
+            std::cout << "finish_callback user_data addr=" << std::hex << user_data
+                      << std::endl;
+        }
+        ASSERT_EQ(size, 1);
+        ASSERT_EQ(
+                std::string(outputs->name),
+                "TRUE_DIV(EXP[12065],reduce0[12067])[12077]");
+        LiteLayout layout;
+        LITE_get_tensor_layout(*output_tensors, &layout);
+        ASSERT_EQ(layout.shapes[1], 1000);
+    };
+    check_func();
+    return 0;
+}
+
 }  // namespace
 
 #define LITE_CAPI_CHECK(_expr) \
@@ -671,6 +726,21 @@ TEST(TestCapiNetWork, StartCallBack) {
     LITE_CAPI_CHECK(LITE_destroy_network(c_network));
 }
 
+TEST(TestCapiNetWork, StartCallBackWithData) {
+    ForwardMgb;
+    MakeNetwork;
+    LoadNetwork;
+    size_t user_data = 1;
+    LITE_CAPI_CHECK(LITE_set_start_callback_with_userdata(
+            c_network, start_callback_with_data, &user_data));
+    SetInput;
+    ForwardNetwork;
+    GetOutput;
+    CompareResult;
+    ASSERT_TRUE(start_checked_with_data);
+    LITE_CAPI_CHECK(LITE_destroy_network(c_network));
+}
+
 TEST(TestCapiNetWork, FinishCallBack) {
     ForwardMgb;
     MakeNetwork;
@@ -684,6 +754,21 @@ TEST(TestCapiNetWork, FinishCallBack) {
     LITE_CAPI_CHECK(LITE_destroy_network(c_network));
 }
 
+TEST(TestCapiNetWork, FinishCallBackWithData) {
+    ForwardMgb;
+    MakeNetwork;
+    LoadNetwork;
+    size_t user_data = 1;
+    LITE_CAPI_CHECK(LITE_set_finish_callback_with_userdata(
+            c_network, finish_callback_with_data, &user_data));
+    SetInput;
+    ForwardNetwork;
+    GetOutput;
+    CompareResult;
+    ASSERT_TRUE(finish_checked_with_data);
+    LITE_CAPI_CHECK(LITE_destroy_network(c_network));
+}
+
 TEST(TestCapiNetWork, BasicCryptAes) {
     ForwardMgb;
 
@@ -723,7 +808,7 @@ TEST(TestCapiNetWork, AsyncExec) {
     LiteConfig c_config = *default_config();
     c_config.options.var_sanity_check_first_run = false;
     LITE_CAPI_CHECK(LITE_make_network(&c_network, c_config, *default_network_io()));
-    LITE_CAPI_CHECK(LITE_set_async_callback(c_network, finish_callback));
+    LITE_CAPI_CHECK(LITE_set_async_callback(c_network, async_callback));
     LoadNetwork;
     SetInput;
 
@@ -740,6 +825,32 @@ TEST(TestCapiNetWork, AsyncExec) {
     LITE_CAPI_CHECK(LITE_destroy_network(c_network));
 }
 
+TEST(TestCapiNetWork, AsyncExecWithData) {
+    finished_with_data = false;
+    ForwardMgb;
+    LiteNetwork c_network;
+    LiteConfig c_config = *default_config();
+    c_config.options.var_sanity_check_first_run = false;
+    LITE_CAPI_CHECK(LITE_make_network(&c_network, c_config, *default_network_io()));
+    size_t user_data = 1;
+    LITE_CAPI_CHECK(LITE_set_async_callback_with_userdata(
+            c_network, async_callback_with_data, &user_data));
+    LoadNetwork;
+    SetInput;
+
+    LITE_forward(c_network);
+    size_t count = 0;
+    while (finished_with_data == false) {
+        count++;
+    }
+    ASSERT_GT(count, 0);
+    finished_with_data = false;
+
+    GetOutput;
+    CompareResult;
+    LITE_CAPI_CHECK(LITE_destroy_network(c_network));
+}
+
 TEST(TestCapiNetWork, OutputShapeOnly) {
     ForwardMgb;
     LiteNetwork c_network;