diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 63ef386afd0ab251588003f17bf1f0283e20698b..70c71c9206e2bf4806fc0cfd8bf6464464f27623 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -70,6 +70,7 @@ Contents: lapb-module mac80211-injection mpls-sysctl + mptcp-sysctl multiqueue netconsole netdev-features diff --git a/Documentation/networking/mptcp-sysctl.rst b/Documentation/networking/mptcp-sysctl.rst new file mode 100644 index 0000000000000000000000000000000000000000..6af0196c4297cc84a0299f46a8836722e1be3a04 --- /dev/null +++ b/Documentation/networking/mptcp-sysctl.rst @@ -0,0 +1,26 @@ +.. SPDX-License-Identifier: GPL-2.0 + +===================== +MPTCP Sysfs variables +===================== + +/proc/sys/net/mptcp/* Variables +=============================== + +enabled - INTEGER + Control whether MPTCP sockets can be created. + + MPTCP sockets can be created if the value is nonzero. This is + a per-namespace sysctl. + + Default: 1 + +add_addr_timeout - INTEGER (seconds) + Set the timeout after which an ADD_ADDR control message will be + resent to an MPTCP peer that has not acknowledged a previous + ADD_ADDR message. + + The default value matches TCP_RTO_MAX. This is a per-namespace + sysctl. + + Default: 120 diff --git a/MAINTAINERS b/MAINTAINERS index 17f5571788c9a06b74927ec58057f14195811f48..badaaa815aa36dc613cbcaf043b8d5f1a8b311d5 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12265,6 +12265,7 @@ L: mptcp@lists.01.org S: Maintained W: https://github.com/multipath-tcp/mptcp_net-next/wiki B: https://github.com/multipath-tcp/mptcp_net-next/issues +F: Documentation/networking/mptcp-sysctl.rst F: include/net/mptcp.h F: include/uapi/linux/mptcp.h F: net/mptcp/ diff --git a/include/net/mptcp.h b/include/net/mptcp.h index 753ba7e755d6d1427634110ee975ab61474f3420..6e706d838e4ebd593f5f896dae16d553de670ef0 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -29,7 +29,8 @@ struct mptcp_ext { use_ack:1, ack64:1, mpc_map:1, - __unused:2; + frozen:1, + __unused:1; /* one byte hole */ }; @@ -106,6 +107,19 @@ static inline void mptcp_skb_ext_move(struct sk_buff *to, from->active_extensions = 0; } +static inline void mptcp_skb_ext_copy(struct sk_buff *to, + struct sk_buff *from) +{ + struct mptcp_ext *from_ext; + + from_ext = skb_ext_find(from, SKB_EXT_MPTCP); + if (!from_ext) + return; + + from_ext->frozen = 1; + skb_ext_copy(to, from); +} + static inline bool mptcp_ext_matches(const struct mptcp_ext *to_ext, const struct mptcp_ext *from_ext) { @@ -193,6 +207,11 @@ static inline void mptcp_skb_ext_move(struct sk_buff *to, { } +static inline void mptcp_skb_ext_copy(struct sk_buff *to, + struct sk_buff *from) +{ +} + static inline bool mptcp_skb_can_collapse(const struct sk_buff *to, const struct sk_buff *from) { diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index bf48cd73e96787a14b6f9af8beddb1067a7cb8dc..9abf5a0358d54869aa91a1e1a3125c1072af7305 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1569,6 +1569,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, if (!buff) return -ENOMEM; /* We'll just try again later. */ skb_copy_decrypted(buff, skb); + mptcp_skb_ext_copy(buff, skb); sk_wmem_queued_add(sk, buff->truesize); sk_mem_charge(sk, buff->truesize); @@ -2123,6 +2124,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, if (unlikely(!buff)) return -ENOMEM; skb_copy_decrypted(buff, skb); + mptcp_skb_ext_copy(buff, skb); sk_wmem_queued_add(sk, buff->truesize); sk_mem_charge(sk, buff->truesize); @@ -2393,6 +2395,7 @@ static int tcp_mtu_probe(struct sock *sk) skb = tcp_send_head(sk); skb_copy_decrypted(nskb, skb); + mptcp_skb_ext_copy(nskb, skb); TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq; TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size; diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c index 54b888f940093ce434447c74c5e605be03a1a848..96ba616f59bfeea8dc9da378e93a4bf06635fc8b 100644 --- a/net/mptcp/ctrl.c +++ b/net/mptcp/ctrl.c @@ -18,6 +18,7 @@ struct mptcp_pernet { struct ctl_table_header *ctl_table_hdr; int mptcp_enabled; + unsigned int add_addr_timeout; }; static struct mptcp_pernet *mptcp_get_pernet(struct net *net) @@ -30,6 +31,11 @@ int mptcp_is_enabled(struct net *net) return mptcp_get_pernet(net)->mptcp_enabled; } +unsigned int mptcp_get_add_addr_timeout(struct net *net) +{ + return mptcp_get_pernet(net)->add_addr_timeout; +} + static struct ctl_table mptcp_sysctl_table[] = { { .procname = "enabled", @@ -40,12 +46,19 @@ static struct ctl_table mptcp_sysctl_table[] = { */ .proc_handler = proc_dointvec, }, + { + .procname = "add_addr_timeout", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, {} }; static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet) { pernet->mptcp_enabled = 1; + pernet->add_addr_timeout = TCP_RTO_MAX; } static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet) @@ -61,6 +74,7 @@ static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet) } table[0].data = &pernet->mptcp_enabled; + table[1].data = &pernet->add_addr_timeout; hdr = register_net_sysctl(net, MPTCP_SYSCTL_PATH, table); if (!hdr) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 0d6f3d912891a11e082ee565d1c9ea2944e08b51..ed60538df7b27fd771954283999cecc8d883786e 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -206,6 +206,7 @@ static void mptcp_pm_add_timer(struct timer_list *timer) struct mptcp_pm_add_entry *entry = from_timer(entry, timer, add_timer); struct mptcp_sock *msk = entry->sock; struct sock *sk = (struct sock *)msk; + struct net *net = sock_net(sk); pr_debug("msk=%p", msk); @@ -232,7 +233,8 @@ static void mptcp_pm_add_timer(struct timer_list *timer) } if (entry->retrans_times < ADD_ADDR_RETRANS_MAX) - sk_reset_timer(sk, timer, jiffies + TCP_RTO_MAX); + sk_reset_timer(sk, timer, + jiffies + mptcp_get_add_addr_timeout(net)); spin_unlock_bh(&msk->pm.lock); @@ -264,6 +266,7 @@ static bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, { struct mptcp_pm_add_entry *add_entry = NULL; struct sock *sk = (struct sock *)msk; + struct net *net = sock_net(sk); if (lookup_anno_list_by_saddr(msk, &entry->addr)) return false; @@ -279,7 +282,8 @@ static bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, add_entry->retrans_times = 0; timer_setup(&add_entry->add_timer, mptcp_pm_add_timer, 0); - sk_reset_timer(sk, &add_entry->add_timer, jiffies + TCP_RTO_MAX); + sk_reset_timer(sk, &add_entry->add_timer, + jiffies + mptcp_get_add_addr_timeout(net)); return true; } diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index e7419fd15d84c95e87c8834aab4339f2dc883f76..b84b84adc9addc58c811f6f5b29af39130ae80bd 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -466,6 +466,18 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, struct tcp_sock *tp; u32 old_copied_seq; bool done = false; + int sk_rbuf; + + sk_rbuf = READ_ONCE(sk->sk_rcvbuf); + + if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { + int ssk_rbuf = READ_ONCE(ssk->sk_rcvbuf); + + if (unlikely(ssk_rbuf > sk_rbuf)) { + WRITE_ONCE(sk->sk_rcvbuf, ssk_rbuf); + sk_rbuf = ssk_rbuf; + } + } pr_debug("msk=%p ssk=%p", msk, ssk); tp = tcp_sk(ssk); @@ -528,7 +540,7 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, WRITE_ONCE(tp->copied_seq, seq); more_data_avail = mptcp_subflow_data_available(ssk); - if (atomic_read(&sk->sk_rmem_alloc) > READ_ONCE(sk->sk_rcvbuf)) { + if (atomic_read(&sk->sk_rmem_alloc) > sk_rbuf) { done = true; break; } @@ -622,6 +634,7 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct mptcp_sock *msk = mptcp_sk(sk); + int sk_rbuf, ssk_rbuf; bool wake; /* move_skbs_to_msk below can legitly clear the data_avail flag, @@ -632,12 +645,16 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk) if (wake) set_bit(MPTCP_DATA_READY, &msk->flags); - if (atomic_read(&sk->sk_rmem_alloc) < READ_ONCE(sk->sk_rcvbuf) && - move_skbs_to_msk(msk, ssk)) + ssk_rbuf = READ_ONCE(ssk->sk_rcvbuf); + sk_rbuf = READ_ONCE(sk->sk_rcvbuf); + if (unlikely(ssk_rbuf > sk_rbuf)) + sk_rbuf = ssk_rbuf; + + /* over limit? can't append more skbs to msk */ + if (atomic_read(&sk->sk_rmem_alloc) > sk_rbuf) goto wake; - /* don't schedule if mptcp sk is (still) over limit */ - if (atomic_read(&sk->sk_rmem_alloc) > READ_ONCE(sk->sk_rcvbuf)) + if (move_skbs_to_msk(msk, ssk)) goto wake; /* mptcp socket is owned, release_cb should retry */ @@ -754,8 +771,11 @@ static bool mptcp_skb_can_collapse_to(u64 write_seq, if (!tcp_skb_can_collapse_to(skb)) return false; - /* can collapse only if MPTCP level sequence is in order */ - return mpext && mpext->data_seq + mpext->data_len == write_seq; + /* can collapse only if MPTCP level sequence is in order and this + * mapping has not been xmitted yet + */ + return mpext && mpext->data_seq + mpext->data_len == write_seq && + !mpext->frozen; } static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk, @@ -833,19 +853,25 @@ static void mptcp_clean_una(struct sock *sk) } out: - if (cleaned) { + if (cleaned) sk_mem_reclaim_partial(sk); +} - /* Only wake up writers if a subflow is ready */ - if (mptcp_is_writeable(msk)) { - set_bit(MPTCP_SEND_SPACE, &mptcp_sk(sk)->flags); - smp_mb__after_atomic(); +static void mptcp_clean_una_wakeup(struct sock *sk) +{ + struct mptcp_sock *msk = mptcp_sk(sk); - /* set SEND_SPACE before sk_stream_write_space clears - * NOSPACE - */ - sk_stream_write_space(sk); - } + mptcp_clean_una(sk); + + /* Only wake up writers if a subflow is ready */ + if (mptcp_is_writeable(msk)) { + set_bit(MPTCP_SEND_SPACE, &msk->flags); + smp_mb__after_atomic(); + + /* set SEND_SPACE before sk_stream_write_space clears + * NOSPACE + */ + sk_stream_write_space(sk); } } @@ -1476,13 +1502,14 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk) __mptcp_flush_join_list(msk); do { struct sock *ssk = mptcp_subflow_recv_lookup(msk); + bool slowpath; if (!ssk) break; - lock_sock(ssk); + slowpath = lock_sock_fast(ssk); done = __mptcp_move_skbs_from_subflow(msk, ssk, &moved); - release_sock(ssk); + unlock_sock_fast(ssk, slowpath); } while (!done); if (mptcp_ofo_queue(msk) || moved > 0) { @@ -1748,7 +1775,7 @@ static void mptcp_worker(struct work_struct *work) long timeo = 0; lock_sock(sk); - mptcp_clean_una(sk); + mptcp_clean_una_wakeup(sk); mptcp_check_data_fin_ack(sk); __mptcp_flush_join_list(msk); if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 13ab89dc19141de0fb8416c87424c9d87adaceed..278c88c405e847ad51c94fe95b7b6c67369e730d 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -362,6 +362,7 @@ mptcp_subflow_get_mapped_dsn(const struct mptcp_subflow_context *subflow) } int mptcp_is_enabled(struct net *net); +unsigned int mptcp_get_add_addr_timeout(struct net *net); void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow, struct mptcp_options_received *mp_opt); bool mptcp_subflow_data_available(struct sock *sk); diff --git a/tools/testing/selftests/net/mptcp/config b/tools/testing/selftests/net/mptcp/config index 741a1c4f4ae8f167387d945c8b28e38d7581123d..0faaccd21447e4a0458201f8a9d7ab2921a25656 100644 --- a/tools/testing/selftests/net/mptcp/config +++ b/tools/testing/selftests/net/mptcp/config @@ -5,3 +5,13 @@ CONFIG_INET_DIAG=m CONFIG_INET_MPTCP_DIAG=m CONFIG_VETH=y CONFIG_NET_SCH_NETEM=m +CONFIG_NETFILTER=y +CONFIG_NETFILTER_ADVANCED=y +CONFIG_NETFILTER_NETLINK=m +CONFIG_NF_TABLES=m +CONFIG_NFT_COUNTER=m +CONFIG_NFT_COMPAT=m +CONFIG_NETFILTER_XTABLES=m +CONFIG_NETFILTER_XT_MATCH_BPF=m +CONFIG_NF_TABLES_IPV4=y +CONFIG_NF_TABLES_IPV6=y diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 08f53d86dedcba331adc47b3bc54c51849e5d00d..0d93b243695f258bdaf84ae3e7e0f67e5120775e 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -13,6 +13,24 @@ capture=0 TEST_COUNT=0 +# generated using "nfbpf_compile '(ip && (ip[54] & 0xf0) == 0x30) || +# (ip6 && (ip6[74] & 0xf0) == 0x30)'" +CBPF_MPTCP_SUBOPTION_ADD_ADDR="14, + 48 0 0 0, + 84 0 0 240, + 21 0 3 64, + 48 0 0 54, + 84 0 0 240, + 21 6 7 48, + 48 0 0 0, + 84 0 0 240, + 21 0 4 96, + 48 0 0 74, + 84 0 0 240, + 21 0 1 48, + 6 0 0 65535, + 6 0 0 0" + init() { capout=$(mktemp) @@ -82,6 +100,26 @@ reset_with_cookies() done } +reset_with_add_addr_timeout() +{ + local ip="${1:-4}" + local tables + + tables="iptables" + if [ $ip -eq 6 ]; then + tables="ip6tables" + fi + + reset + + ip netns exec $ns1 sysctl -q net.mptcp.add_addr_timeout=1 + ip netns exec $ns2 $tables -A OUTPUT -p tcp \ + -m tcp --tcp-option 30 \ + -m bpf --bytecode \ + "$CBPF_MPTCP_SUBOPTION_ADD_ADDR" \ + -j DROP +} + for arg in "$@"; do if [ "$arg" = "-c" ]; then capture=1 @@ -94,6 +132,17 @@ if [ $? -ne 0 ];then exit $ksft_skip fi +iptables -V > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run all tests without iptables tool" + exit $ksft_skip +fi + +ip6tables -V > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run all tests without ip6tables tool" + exit $ksft_skip +fi check_transfer() { @@ -135,6 +184,7 @@ do_transfer() connect_addr="$5" rm_nr_ns1="$6" rm_nr_ns2="$7" + speed="$8" port=$((10000+$TEST_COUNT)) TEST_COUNT=$((TEST_COUNT+1)) @@ -159,7 +209,7 @@ do_transfer() sleep 1 fi - if [[ $rm_nr_ns1 -eq 0 && $rm_nr_ns2 -eq 0 ]]; then + if [ $speed = "fast" ]; then mptcp_connect="./mptcp_connect -j" else mptcp_connect="./mptcp_connect -r" @@ -250,26 +300,13 @@ run_tests() listener_ns="$1" connector_ns="$2" connect_addr="$3" + rm_nr_ns1="${4:-0}" + rm_nr_ns2="${5:-0}" + speed="${6:-fast}" lret=0 - do_transfer ${listener_ns} ${connector_ns} MPTCP MPTCP ${connect_addr} 0 0 - lret=$? - if [ $lret -ne 0 ]; then - ret=$lret - return - fi -} - -run_remove_tests() -{ - listener_ns="$1" - connector_ns="$2" - connect_addr="$3" - rm_nr_ns1="$4" - rm_nr_ns2="$5" - lret=0 - - do_transfer ${listener_ns} ${connector_ns} MPTCP MPTCP ${connect_addr} ${rm_nr_ns1} ${rm_nr_ns2} + do_transfer ${listener_ns} ${connector_ns} MPTCP MPTCP ${connect_addr} \ + ${rm_nr_ns1} ${rm_nr_ns2} ${speed} lret=$? if [ $lret -ne 0 ]; then ret=$lret @@ -491,12 +528,21 @@ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr "multiple subflows and signal" 3 3 3 chk_add_nr 1 1 +# add_addr timeout +reset_with_add_addr_timeout +ip netns exec $ns1 ./pm_nl_ctl limits 0 1 +ip netns exec $ns2 ./pm_nl_ctl limits 1 1 +ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal +run_tests $ns1 $ns2 10.0.1.1 0 0 slow +chk_join_nr "signal address, ADD_ADDR timeout" 1 1 1 +chk_add_nr 4 0 + # single subflow, remove reset ip netns exec $ns1 ./pm_nl_ctl limits 0 1 ip netns exec $ns2 ./pm_nl_ctl limits 0 1 ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow -run_remove_tests $ns1 $ns2 10.0.1.1 0 1 +run_tests $ns1 $ns2 10.0.1.1 0 1 slow chk_join_nr "remove single subflow" 1 1 1 chk_rm_nr 1 1 @@ -506,7 +552,7 @@ ip netns exec $ns1 ./pm_nl_ctl limits 0 2 ip netns exec $ns2 ./pm_nl_ctl limits 0 2 ip netns exec $ns2 ./pm_nl_ctl add 10.0.2.2 flags subflow ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow -run_remove_tests $ns1 $ns2 10.0.1.1 0 2 +run_tests $ns1 $ns2 10.0.1.1 0 2 slow chk_join_nr "remove multiple subflows" 2 2 2 chk_rm_nr 2 2 @@ -515,7 +561,7 @@ reset ip netns exec $ns1 ./pm_nl_ctl limits 0 1 ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal ip netns exec $ns2 ./pm_nl_ctl limits 1 1 -run_remove_tests $ns1 $ns2 10.0.1.1 1 0 +run_tests $ns1 $ns2 10.0.1.1 1 0 slow chk_join_nr "remove single address" 1 1 1 chk_add_nr 1 1 chk_rm_nr 0 0 @@ -526,7 +572,7 @@ ip netns exec $ns1 ./pm_nl_ctl limits 0 2 ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal ip netns exec $ns2 ./pm_nl_ctl limits 1 2 ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow -run_remove_tests $ns1 $ns2 10.0.1.1 1 1 +run_tests $ns1 $ns2 10.0.1.1 1 1 slow chk_join_nr "remove subflow and signal" 2 2 2 chk_add_nr 1 1 chk_rm_nr 1 1 @@ -538,7 +584,7 @@ ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal ip netns exec $ns2 ./pm_nl_ctl limits 1 3 ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow ip netns exec $ns2 ./pm_nl_ctl add 10.0.4.2 flags subflow -run_remove_tests $ns1 $ns2 10.0.1.1 1 2 +run_tests $ns1 $ns2 10.0.1.1 1 2 slow chk_join_nr "remove subflows and signal" 3 3 3 chk_add_nr 1 1 chk_rm_nr 2 2