提交 f84e8fa0 编写于 作者: X xuanzhuo

alinux: add tcprt framework to kernel

to #26353046

TcpRT: Instrument and Diagnostic Analysis System for Service Quality
of Cloud Databases at Massive Scale in Real-time.

It can also provide information for all request/response services. Such as
HTTP request.

This is the kernel framework for tcprt, more work needs tcprt module
support.

The TcpRt module should call tcp_unregister_rt() before rmmod.

TcpRt hooks are called when a socket is initialized, when data is
received or sent, when packets are acked, and when the socket is
destroyed. The private data is saved in icsk->icsk_tcp_rt_priv.
Reviewed-by: Cambda Zhu <cambda@linux.alibaba.com>
Acked-by: Dust Li <dust.li@linux.alibaba.com>
Signed-off-by: xuanzhuo <xuanzhuo@linux.alibaba.com>
上级 4dc24f04
...@@ -136,6 +136,11 @@ Maintainers List (try to look for most precise areas first) ...@@ -136,6 +136,11 @@ Maintainers List (try to look for most precise areas first)
----------------------------------- -----------------------------------
TCP RT
M: xuanzhuo <xuanzhuo@linux.alibaba.com>
S: Maintained
F: net/ipv4/tcp_rt.c
3C59X NETWORK DRIVER 3C59X NETWORK DRIVER
M: Steffen Klassert <klassert@kernel.org> M: Steffen Klassert <klassert@kernel.org>
L: netdev@vger.kernel.org L: netdev@vger.kernel.org
......
...@@ -139,6 +139,11 @@ struct inet_connection_sock { ...@@ -139,6 +139,11 @@ struct inet_connection_sock {
} icsk_mtup; } icsk_mtup;
u32 icsk_user_timeout; u32 icsk_user_timeout;
#ifdef CONFIG_TCP_RT
const struct tcp_rt_ops *icsk_tcp_rt_ops;
void *icsk_tcp_rt_priv;
#endif
u64 icsk_ca_priv[88 / sizeof(u64)]; u64 icsk_ca_priv[88 / sizeof(u64)];
#define ICSK_CA_PRIV_SIZE (11 * sizeof(u64)) #define ICSK_CA_PRIV_SIZE (11 * sizeof(u64))
}; };
......
...@@ -41,6 +41,7 @@ ...@@ -41,6 +41,7 @@
#include <net/snmp.h> #include <net/snmp.h>
#include <net/ip.h> #include <net/ip.h>
#include <net/tcp_states.h> #include <net/tcp_states.h>
#include <net/tcp_rt.h>
#include <net/inet_ecn.h> #include <net/inet_ecn.h>
#include <net/dst.h> #include <net/dst.h>
......
/* SPDX-License-Identifier: GPL-2.0
*
* TcpRT Instrument and Diagnostic Analysis System for Service Quality
* of Cloud Databases at Massive Scale in Real-time.
*
* It can also provide information for all request/response
* services. Such as HTTP request.
*
* This is the kernel framework, more work needs tcprt module
* support.
*/
#ifndef _TCP_RT_H
#define _TCP_RT_H
#ifdef CONFIG_TCP_RT
/*
 * struct tcp_rt_ops - per-connection hooks supplied by a tcprt module.
 *
 * init and release are mandatory (tcp_register_rt() rejects ops that lack
 * either one); the remaining hooks are optional and are skipped by
 * tcp_rt_call() when left NULL.
 */
struct tcp_rt_ops {
/*
 * Initialize private data for this connection (required).
 * Called from tcp_init_rt().
 *
 * ret:
 * 0: success, private data was set up for this connection.
 * non-zero (e.g. -1): allocation failed or the module does not care
 * about this connection. The connection then holds no reference to
 * tcp_rt and no further hooks are called for it.
 */
int (*init)(struct sock *sk);
/* Clean up private data (required). Called from tcp_cleanup_rt(). */
void (*release)(struct sock *sk);
/* Optional: data was received (invoked from tcp_event_data_recv()). */
void (*recv_data)(struct sock *sk);
/* Optional: data was sent (invoked from tcp_write_xmit()). */
void (*send_data)(struct sock *sk);
/* Optional: packet ack accounting (invoked from tcp_clean_rtx_queue()). */
void (*pkts_acked)(struct sock *sk);
/*
 * Module providing these ops. A reference is taken on registration and
 * one more for every socket that uses the ops.
 */
struct module *owner;
};
/*
* tcp_register_rt() - register tcp_rt ops to kernel
* @rt: tcp_rt_ops
*
* ret:
* -EINVAL: the init or release op is not set.
* -EBUSY: failed to get a reference on the module.
* -EEXIST: a tcp_rt ops is already registered.
* 0: success
*/
int tcp_register_rt(const struct tcp_rt_ops *rt);
/*
* tcp_unregister_rt() - unregister the tcp_rt ops from kernel.
*
* After this call, new connections will not reference the tcp_rt, but
* existing connections still do. You must wait for all existing
* connections to be released before you try to rmmod the module.
* So this function cannot be called from inside module_exit.
* Call it through some other interface such as debugfs, for example:
*
* ----
* static ssize_t tcp_rt_inactive(struct file *file, const char __user *buff,
* size_t count, loff_t *offset)
* {
* tcp_unregister_rt(&rt_ops);
* return count;
* }
*
* static struct file_operations fops = {
* .owner = THIS_MODULE,
* .write = tcp_rt_inactive,
* };
*
* static int __init rt_register(void)
* {
* if (!debugfs_create_file("tcp-rt-no-active", 0600, NULL, NULL, &fops)) {
* return -1;
* }
*
* ret = tcp_register_rt(&rt_ops);
* if (ret) {
* pr_err("tcp-rt register rt failed!\n");
* tcp_rt_base_released();
* return ret;
* }
*
* return 0;
* }
*
* static void __exit rt_unregister(void)
* {
* pr_info("tcp-rt: released\n");
* }
*
* module_init(rt_register);
* module_exit(rt_unregister);
* ----
*
* run this cmd before you want to rmmod module:
* echo 0 > /sys/kernel/debug/tcp-rt-no-active
*
*/
void tcp_unregister_rt(struct tcp_rt_ops *rt);
void tcp_init_rt(struct sock *sk);
void tcp_cleanup_rt(struct sock *sk);
/*
 * tcp_rt_call() - invoke the optional tcp_rt hook @fun for socket @sk.
 * @sk:  struct sock pointer; evaluated exactly once.
 * @fun: name of a struct tcp_rt_ops member taking a single struct sock *.
 *
 * Does nothing when the socket has no tcp_rt ops attached or when the
 * ops leave @fun unset. The socket and ops pointers are cached in locals
 * so that an argument expression with side effects is not expanded
 * several times.
 */
#define tcp_rt_call(sk, fun)						\
do {									\
	struct sock *__rt_sk = (sk);					\
	const struct tcp_rt_ops *__rt_ops =				\
		inet_csk(__rt_sk)->icsk_tcp_rt_ops;			\
	if (__rt_ops && __rt_ops->fun)					\
		__rt_ops->fun(__rt_sk);					\
} while (0)
#else
/* CONFIG_TCP_RT is disabled: all tcp_rt hooks expand to nothing. */
#define tcp_cleanup_rt(sk)
#define tcp_init_rt(sk)
#define tcp_rt_call(sk, fun)
#endif
#endif /* _TCP_RT_H */
...@@ -753,3 +753,16 @@ config TCP_MD5SIG ...@@ -753,3 +753,16 @@ config TCP_MD5SIG
on the Internet. on the Internet.
If unsure, say N. If unsure, say N.
config TCP_RT
bool "TCP RT"
default n
help
TcpRT: Instrument and Diagnostic Analysis System for Service Quality
of Cloud Databases at Massive Scale in Real-time.
It can also provide information for all request/response services. Such as
HTTP request.
This only enables the kernel framework for tcprt; the actual work
needs tcprt module support.
...@@ -65,5 +65,7 @@ obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o ...@@ -65,5 +65,7 @@ obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o
obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o
obj-$(CONFIG_NETLABEL) += cipso_ipv4.o obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
obj-$(CONFIG_TCP_RT) += tcp_rt.o
obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
xfrm4_output.o xfrm4_protocol.o xfrm4_output.o xfrm4_protocol.o
...@@ -467,6 +467,7 @@ void tcp_init_transfer(struct sock *sk, int bpf_op) ...@@ -467,6 +467,7 @@ void tcp_init_transfer(struct sock *sk, int bpf_op)
tcp_call_bpf(sk, bpf_op, 0, NULL); tcp_call_bpf(sk, bpf_op, 0, NULL);
tcp_init_congestion_control(sk); tcp_init_congestion_control(sk);
tcp_init_buffer_space(sk); tcp_init_buffer_space(sk);
tcp_init_rt(sk);
} }
static void tcp_tx_timestamp(struct sock *sk, u16 tsflags) static void tcp_tx_timestamp(struct sock *sk, u16 tsflags)
......
...@@ -677,6 +677,8 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb) ...@@ -677,6 +677,8 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
now = tcp_jiffies32; now = tcp_jiffies32;
tcp_rt_call(sk, recv_data);
if (!icsk->icsk_ack.ato) { if (!icsk->icsk_ack.ato) {
/* The _first_ data packet received, initialize /* The _first_ data packet received, initialize
* delayed ACK engine. * delayed ACK engine.
...@@ -3217,6 +3219,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, ...@@ -3217,6 +3219,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
flag |= FLAG_SET_XMIT_TIMER; /* set TLP or RTO timer */ flag |= FLAG_SET_XMIT_TIMER; /* set TLP or RTO timer */
} }
tcp_rt_call(sk, pkts_acked);
if (icsk->icsk_ca_ops->pkts_acked) { if (icsk->icsk_ca_ops->pkts_acked) {
struct ack_sample sample = { .pkts_acked = pkts_acked, struct ack_sample sample = { .pkts_acked = pkts_acked,
.rtt_us = sack->rate->rtt_us, .rtt_us = sack->rate->rtt_us,
......
...@@ -1971,6 +1971,8 @@ void tcp_v4_destroy_sock(struct sock *sk) ...@@ -1971,6 +1971,8 @@ void tcp_v4_destroy_sock(struct sock *sk)
tcp_cleanup_congestion_control(sk); tcp_cleanup_congestion_control(sk);
tcp_cleanup_rt(sk);
tcp_cleanup_ulp(sk); tcp_cleanup_ulp(sk);
/* Cleanup up the write buffer. */ /* Cleanup up the write buffer. */
......
...@@ -2406,6 +2406,9 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, ...@@ -2406,6 +2406,9 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
tcp_schedule_loss_probe(sk, false); tcp_schedule_loss_probe(sk, false);
is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tp->snd_cwnd); is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tp->snd_cwnd);
tcp_cwnd_validate(sk, is_cwnd_limited); tcp_cwnd_validate(sk, is_cwnd_limited);
tcp_rt_call(sk, send_data);
return false; return false;
} }
return !tp->packets_out && !tcp_write_queue_empty(sk); return !tp->packets_out && !tcp_write_queue_empty(sk);
......
// SPDX-License-Identifier: GPL-2.0
#include <net/tcp.h>
#include <linux/module.h>
/*
 * The single registered tcp_rt ops, or NULL when none is registered.
 * Read under rcu_read_lock() in tcp_init_rt(); updated under tcp_rt_lock.
 */
static const struct tcp_rt_ops __rcu *tcp_rt;
/* Serializes updates of tcp_rt in tcp_register_rt()/tcp_unregister_rt(). */
static DEFINE_SPINLOCK(tcp_rt_lock);
int tcp_register_rt(const struct tcp_rt_ops *rt)
{
const struct tcp_rt_ops *ort;
int ret = 0;
if (!rt->init || !rt->release) {
pr_err("tcp_rt does not implement required ops\n");
return -EINVAL;
}
spin_lock(&tcp_rt_lock);
ret = try_module_get(rt->owner);
if (unlikely(!ret)) {
ret = -EBUSY;
} else {
ort = rcu_dereference_protected(tcp_rt, true);
if (ort) {
ret = -EEXIST;
pr_err("tcp_rt already registered\n");
module_put(rt->owner);
} else {
rcu_assign_pointer(tcp_rt, rt);
ret = 0;
}
}
spin_unlock(&tcp_rt_lock);
return ret;
}
EXPORT_SYMBOL_GPL(tcp_register_rt);
/*
 * tcp_unregister_rt() - remove @rt as the global tcp_rt ops.
 * @rt: the ops previously passed to tcp_register_rt().
 *
 * The ops is only removed, and the module reference taken at registration
 * only dropped, when @rt is the ops that is currently installed. A call
 * with any other pointer (or after the ops was already unregistered)
 * changes nothing, so a caller can never unregister somebody else's ops
 * or drop a reference on a module it does not own.
 *
 * Sockets that already attached to the ops keep using it (and keep their
 * own module reference) until tcp_cleanup_rt() runs for them.
 */
void tcp_unregister_rt(struct tcp_rt_ops *rt)
{
	const struct tcp_rt_ops *cur;

	spin_lock(&tcp_rt_lock);
	cur = rcu_dereference_protected(tcp_rt, true);
	if (cur && cur == rt) {
		rcu_assign_pointer(tcp_rt, NULL);
		module_put(cur->owner);
	}
	spin_unlock(&tcp_rt_lock);

	/* Wait for readers in tcp_init_rt() that may still see the old ops. */
	synchronize_rcu();
}
EXPORT_SYMBOL_GPL(tcp_unregister_rt);
void tcp_init_rt(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
const struct tcp_rt_ops *ops;
rcu_read_lock();
ops = rcu_dereference(tcp_rt);
if (ops)
if (unlikely(!try_module_get(ops->owner)))
ops = NULL;
icsk->icsk_tcp_rt_ops = ops;
rcu_read_unlock();
ops = icsk->icsk_tcp_rt_ops;
if (!ops)
return;
if (ops->init(sk)) {
module_put(ops->owner);
icsk->icsk_tcp_rt_ops = NULL;
}
}
/*
 * tcp_cleanup_rt() - detach tcp_rt from @sk.
 * @sk: socket being destroyed.
 *
 * If the socket has tcp_rt ops attached, runs the release hook, drops the
 * module reference taken in tcp_init_rt() and clears the ops pointer.
 * Does nothing for sockets without tcp_rt ops.
 */
void tcp_cleanup_rt(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	if (!icsk->icsk_tcp_rt_ops)
		return;

	icsk->icsk_tcp_rt_ops->release(sk);
	module_put(icsk->icsk_tcp_rt_ops->owner);
	icsk->icsk_tcp_rt_ops = NULL;
}
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册