提交 837b9955 编写于 作者: D David S. Miller

Merge branch 'ip_frag_next'

Florian Westphal says:

====================
net: force refragmentation for DF reassembed skbs

output path tests:

    if (skb->len > mtu) ip_fragment()

This breaks connectivity in one corner case:
 If the skb was reassembled, but has the DF bit set and ..
 .. its reassembled size is <= outdev mtu ..
 .. we will forward a DF packet larger than what the sender
    transmitted on wire.

If a router later in the path can't forward this packet, it will send an
icmp error in response to an mtu that the original sender never exceeded.

This changes ipv4 defrag/output path to

a) force refragmentation for DF reassembled skbs and
b) set DF bit on all fragments when refragmenting if it was set on original
frags.

tested via:
from scapy.all import *
dip="10.23.42.2"
payload="A"*1400
packet=IP(dst=dip,id=12345,flags='DF')/UDP(sport=42,dport=42)/payload
frags=fragment(packet,fragsize=1200)
for fragment in frags:
    send(fragment)

Without this patch, we generate fragments without df bit set based
on the outgoing device mtu when fragmenting after forwarding, ie.

IP (ttl 64, id 12345, offset 0, flags [+, DF], proto UDP (17), length 1204)
    192.168.7.1.42 > 10.23.42.2.42: UDP, length 1400
IP (ttl 64, id 12345, offset 1184, flags [DF], proto UDP (17), length 244)
    192.168.7.1 > 10.23.42.2: ip-proto-17

on ingress will either turn into

IP (ttl 63, id 12345, offset 0, flags [+], proto UDP (17), length 1396)
    192.168.7.1.42 > 10.23.42.2.42: UDP, length 1400
IP (ttl 63, id 12345, offset 1376, flags [none], proto UDP (17), length 52)

(mtu 1400: We strip df and send larger fragment), or

IP (ttl 63, id 12345, offset 0, flags [DF], proto UDP (17), length 1428)
    192.168.7.1.42 > 10.23.42.2.42: [udp sum ok] UDP, length 1400

if mtu is 1500.  And in this case things break; router with a smaller mtu
will send icmp error, but original sender only sent packets <= 1204 byte.

With patch, we keep intent of such fragments and will emit DF-fragments
that won't exceed 1204 byte in size.

Joint work with Hannes Frederic Sowa.

Changes since v2:
 - split unrelated patches from series
 - rework changelog of patch #2 to better illustrate breakage
====================
Signed-off-by: NDavid S. Miller <davem@davemloft.net>
......@@ -43,7 +43,7 @@ enum {
* @len: total length of the original datagram
* @meat: length of received fragments so far
* @flags: fragment queue flags
* @max_size: (ipv4 only) maximum received fragment size with IP_DF set
* @max_size: maximum received fragment size
* @net: namespace that this frag belongs to
*/
struct inet_frag_queue {
......
......@@ -45,6 +45,7 @@ struct inet_skb_parm {
#define IPSKB_FRAG_COMPLETE BIT(3)
#define IPSKB_REROUTED BIT(4)
#define IPSKB_DOREDIRECT BIT(5)
#define IPSKB_FRAG_PMTU BIT(6)
u16 frag_max_size;
};
......
......@@ -75,6 +75,7 @@ struct ipq {
__be16 id;
u8 protocol;
u8 ecn; /* RFC3168 support */
u16 max_df_size; /* largest frag with DF set seen */
int iif;
unsigned int rid;
struct inet_peer *peer;
......@@ -326,6 +327,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
{
struct sk_buff *prev, *next;
struct net_device *dev;
unsigned int fragsize;
int flags, offset;
int ihl, end;
int err = -ENOENT;
......@@ -481,9 +483,14 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
if (offset == 0)
qp->q.flags |= INET_FRAG_FIRST_IN;
fragsize = skb->len + ihl;
if (fragsize > qp->q.max_size)
qp->q.max_size = fragsize;
if (ip_hdr(skb)->frag_off & htons(IP_DF) &&
skb->len + ihl > qp->q.max_size)
qp->q.max_size = skb->len + ihl;
fragsize > qp->max_df_size)
qp->max_df_size = fragsize;
if (qp->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
qp->q.meat == qp->q.len) {
......@@ -613,13 +620,27 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
head->next = NULL;
head->dev = dev;
head->tstamp = qp->q.stamp;
IPCB(head)->frag_max_size = qp->q.max_size;
IPCB(head)->frag_max_size = max(qp->max_df_size, qp->q.max_size);
iph = ip_hdr(head);
/* max_size != 0 implies at least one fragment had IP_DF set */
iph->frag_off = qp->q.max_size ? htons(IP_DF) : 0;
iph->tot_len = htons(len);
iph->tos |= ecn;
/* When we set IP_DF on a refragmented skb we must also force a
* call to ip_fragment to avoid forwarding a DF-skb of size s while
* original sender only sent fragments of size f (where f < s).
*
* We only set DF/IPSKB_FRAG_PMTU if such DF fragment was the largest
* frag seen to avoid sending tiny DF-fragments in case skb was built
* from one very small df-fragment and one large non-df frag.
*/
if (qp->max_df_size == qp->q.max_size) {
IPCB(head)->flags |= IPSKB_FRAG_PMTU;
iph->frag_off = htons(IP_DF);
} else {
iph->frag_off = 0;
}
IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS);
qp->q.fragments = NULL;
qp->q.fragments_tail = NULL;
......
......@@ -84,6 +84,7 @@ int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
EXPORT_SYMBOL(sysctl_ip_default_ttl);
static int ip_fragment(struct sock *sk, struct sk_buff *skb,
unsigned int mtu,
int (*output)(struct sock *, struct sk_buff *));
/* Generate a checksum for an outgoing IP datagram. */
......@@ -219,7 +220,8 @@ static inline int ip_finish_output2(struct sock *sk, struct sk_buff *skb)
return -EINVAL;
}
static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb)
static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb,
unsigned int mtu)
{
netdev_features_t features;
struct sk_buff *segs;
......@@ -227,7 +229,7 @@ static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb)
/* common case: locally created skb or seglen is <= mtu */
if (((IPCB(skb)->flags & IPSKB_FORWARDED) == 0) ||
skb_gso_network_seglen(skb) <= ip_skb_dst_mtu(skb))
skb_gso_network_seglen(skb) <= mtu)
return ip_finish_output2(sk, skb);
/* Slowpath - GSO segment length is exceeding the dst MTU.
......@@ -251,7 +253,7 @@ static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb)
int err;
segs->next = NULL;
err = ip_fragment(sk, segs, ip_finish_output2);
err = ip_fragment(sk, segs, mtu, ip_finish_output2);
if (err && ret == 0)
ret = err;
......@@ -263,6 +265,8 @@ static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb)
static int ip_finish_output(struct sock *sk, struct sk_buff *skb)
{
unsigned int mtu;
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
/* Policy lookup after SNAT yielded a new policy */
if (skb_dst(skb)->xfrm) {
......@@ -270,11 +274,12 @@ static int ip_finish_output(struct sock *sk, struct sk_buff *skb)
return dst_output_sk(sk, skb);
}
#endif
mtu = ip_skb_dst_mtu(skb);
if (skb_is_gso(skb))
return ip_finish_output_gso(sk, skb);
return ip_finish_output_gso(sk, skb, mtu);
if (skb->len > ip_skb_dst_mtu(skb))
return ip_fragment(sk, skb, ip_finish_output2);
if (skb->len > mtu || (IPCB(skb)->flags & IPSKB_FRAG_PMTU))
return ip_fragment(sk, skb, mtu, ip_finish_output2);
return ip_finish_output2(sk, skb);
}
......@@ -482,12 +487,15 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
}
static int ip_fragment(struct sock *sk, struct sk_buff *skb,
unsigned int mtu,
int (*output)(struct sock *, struct sk_buff *))
{
struct iphdr *iph = ip_hdr(skb);
unsigned int mtu = ip_skb_dst_mtu(skb);
if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) ||
if ((iph->frag_off & htons(IP_DF)) == 0)
return ip_do_fragment(sk, skb, output);
if (unlikely(!skb->ignore_df ||
(IPCB(skb)->frag_max_size &&
IPCB(skb)->frag_max_size > mtu))) {
struct rtable *rt = skb_rtable(skb);
......@@ -532,6 +540,8 @@ int ip_do_fragment(struct sock *sk, struct sk_buff *skb,
iph = ip_hdr(skb);
mtu = ip_skb_dst_mtu(skb);
if (IPCB(skb)->frag_max_size && IPCB(skb)->frag_max_size < mtu)
mtu = IPCB(skb)->frag_max_size;
/*
* Setup starting values.
......@@ -727,6 +737,9 @@ int ip_do_fragment(struct sock *sk, struct sk_buff *skb,
iph = ip_hdr(skb2);
iph->frag_off = htons((offset >> 3));
if (IPCB(skb)->flags & IPSKB_FRAG_PMTU)
iph->frag_off |= htons(IP_DF);
/* ANK: dirty, but effective trick. Upgrade options only if
* the segment to be fragmented was THE FIRST (otherwise,
* options are already fixed) and make it ONCE
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册