diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index d84932650fd3e926cc11d5fa0bcaee6cc8a8cd33..c7712787933c1734c98a0191db23d9cb2886d0f8 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -27,6 +27,12 @@ min_adv_mss - INTEGER The advertised MSS depends on the first hop route MTU, but will never be lower than this setting. +rt_cache_rebuild_count - INTEGER + The per net-namespace route cache emergency rebuild threshold. + Any net-namespace having its route cache rebuilt due to + a hash bucket chain being too long more than this many times + will have its route caching disabled + IP Fragmentation: ipfrag_high_thresh - INTEGER diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index ece1c926b5d18204855d484349148bb442823c6e..977f482d97a95144f0f777df1b0bbbacb7f1b11e 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -49,6 +49,8 @@ struct netns_ipv4 { int sysctl_icmp_ratelimit; int sysctl_icmp_ratemask; int sysctl_icmp_errors_use_inbound_ifaddr; + int sysctl_rt_cache_rebuild_count; + int current_rt_cache_rebuild_count; struct timer_list rt_secret_timer; atomic_t rt_genid; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 2ea6dcc3e2ccd62b6faedb97b31e2206f121764a..21ce7e1b22848a2a335fa8acbb8b120dc3dbf83d 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -129,6 +129,7 @@ static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ; static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20; static int ip_rt_min_advmss __read_mostly = 256; static int ip_rt_secret_interval __read_mostly = 10 * 60 * HZ; +static int rt_chain_length_max __read_mostly = 20; static void rt_worker_func(struct work_struct *work); static DECLARE_DELAYED_WORK(expires_work, rt_worker_func); @@ -145,6 +146,7 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst); static void ipv4_link_failure(struct sk_buff *skb); static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu); static int rt_garbage_collect(struct dst_ops *ops); +static void rt_emergency_hash_rebuild(struct net *net); static struct dst_ops ipv4_dst_ops = { @@ -201,6 +203,7 @@ const __u8 ip_tos2prio[16] = { struct rt_hash_bucket { struct rtable *chain; }; + #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \ defined(CONFIG_PROVE_LOCKING) /* @@ -674,6 +677,20 @@ static inline u32 rt_score(struct rtable *rt) return score; } +static inline bool rt_caching(const struct net *net) +{ + return net->ipv4.current_rt_cache_rebuild_count <= + net->ipv4.sysctl_rt_cache_rebuild_count; +} + +static inline bool compare_hash_inputs(const struct flowi *fl1, + const struct flowi *fl2) +{ + return (__force u32)(((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) | + (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr) | + (fl1->iif ^ fl2->iif)) == 0); +} + static inline int compare_keys(struct flowi *fl1, struct flowi *fl2) { return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) | @@ -753,11 +770,24 @@ static void rt_do_flush(int process_context) } } +/* + * While freeing expired entries, we compute average chain length + * and standard deviation, using fixed-point arithmetic. + * This to have an estimation of rt_chain_length_max + * rt_chain_length_max = max(elasticity, AVG + 4*SD) + * We use 3 bits for frational part, and 29 (or 61) for magnitude. + */ + +#define FRACT_BITS 3 +#define ONE (1UL << FRACT_BITS) + static void rt_check_expire(void) { static unsigned int rover; unsigned int i = rover, goal; struct rtable *rth, **rthp; + unsigned long length = 0, samples = 0; + unsigned long sum = 0, sum2 = 0; u64 mult; mult = ((u64)ip_rt_gc_interval) << rt_hash_log; @@ -766,6 +796,7 @@ static void rt_check_expire(void) goal = (unsigned int)mult; if (goal > rt_hash_mask) goal = rt_hash_mask + 1; + length = 0; for (; goal > 0; goal--) { unsigned long tmo = ip_rt_gc_timeout; @@ -775,6 +806,8 @@ static void rt_check_expire(void) if (need_resched()) cond_resched(); + samples++; + if (*rthp == NULL) continue; spin_lock_bh(rt_hash_lock_addr(i)); @@ -789,11 +822,29 @@ static void rt_check_expire(void) if (time_before_eq(jiffies, rth->u.dst.expires)) { tmo >>= 1; rthp = &rth->u.dst.rt_next; + /* + * Only bump our length if the hash + * inputs on entries n and n+1 are not + * the same, we only count entries on + * a chain with equal hash inputs once + * so that entries for different QOS + * levels, and other non-hash input + * attributes don't unfairly skew + * the length computation + */ + if ((*rthp == NULL) || + !compare_hash_inputs(&(*rthp)->fl, + &rth->fl)) + length += ONE; continue; } } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) { tmo >>= 1; rthp = &rth->u.dst.rt_next; + if ((*rthp == NULL) || + !compare_hash_inputs(&(*rthp)->fl, + &rth->fl)) + length += ONE; continue; } @@ -802,6 +853,15 @@ static void rt_check_expire(void) rt_free(rth); } spin_unlock_bh(rt_hash_lock_addr(i)); + sum += length; + sum2 += length*length; + } + if (samples) { + unsigned long avg = sum / samples; + unsigned long sd = int_sqrt(sum2 / samples - avg*avg); + rt_chain_length_max = max_t(unsigned long, + ip_rt_gc_elasticity, + (avg + 4*sd) >> FRACT_BITS); } rover = i; } @@ -851,6 +911,26 @@ static void rt_secret_rebuild(unsigned long __net) mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval); } +static void rt_secret_rebuild_oneshot(struct net *net) +{ + del_timer_sync(&net->ipv4.rt_secret_timer); + rt_cache_invalidate(net); + if (ip_rt_secret_interval) { + net->ipv4.rt_secret_timer.expires += ip_rt_secret_interval; + add_timer(&net->ipv4.rt_secret_timer); + } +} + +static void rt_emergency_hash_rebuild(struct net *net) +{ + if (net_ratelimit()) { + printk(KERN_WARNING "Route hash chain too long!\n"); + printk(KERN_WARNING "Adjust your secret_interval!\n"); + } + + rt_secret_rebuild_oneshot(net); +} + /* Short description of GC goals. @@ -989,6 +1069,7 @@ out: return 0; static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp) { struct rtable *rth, **rthp; + struct rtable *rthi; unsigned long now; struct rtable *cand, **candp; u32 min_score; @@ -1002,7 +1083,13 @@ static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp) candp = NULL; now = jiffies; + if (!rt_caching(dev_net(rt->u.dst.dev))) { + rt_drop(rt); + return 0; + } + rthp = &rt_hash_table[hash].chain; + rthi = NULL; spin_lock_bh(rt_hash_lock_addr(hash)); while ((rth = *rthp) != NULL) { @@ -1048,6 +1135,17 @@ static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp) chain_length++; rthp = &rth->u.dst.rt_next; + + /* + * check to see if the next entry in the chain + * contains the same hash input values as rt. If it does + * This is where we will insert into the list, instead of + * at the head. This groups entries that differ by aspects not + * relvant to the hash function together, which we use to adjust + * our chain length + */ + if (*rthp && compare_hash_inputs(&(*rthp)->fl, &rt->fl)) + rthi = rth; } if (cand) { @@ -1061,6 +1159,16 @@ static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp) *candp = cand->u.dst.rt_next; rt_free(cand); } + } else { + if (chain_length > rt_chain_length_max) { + struct net *net = dev_net(rt->u.dst.dev); + int num = ++net->ipv4.current_rt_cache_rebuild_count; + if (!rt_caching(dev_net(rt->u.dst.dev))) { + printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n", + rt->u.dst.dev->name, num); + } + rt_emergency_hash_rebuild(dev_net(rt->u.dst.dev)); + } } /* Try to bind route to arp only if it is output @@ -1098,7 +1206,11 @@ static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp) } } - rt->u.dst.rt_next = rt_hash_table[hash].chain; + if (rthi) + rt->u.dst.rt_next = rthi->u.dst.rt_next; + else + rt->u.dst.rt_next = rt_hash_table[hash].chain; + #if RT_CACHE_DEBUG >= 2 if (rt->u.dst.rt_next) { struct rtable *trt; @@ -1114,7 +1226,11 @@ static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp) * previous writes to rt are comitted to memory * before making rt visible to other CPUS. */ - rcu_assign_pointer(rt_hash_table[hash].chain, rt); + if (rthi) + rcu_assign_pointer(rthi->u.dst.rt_next, rt); + else + rcu_assign_pointer(rt_hash_table[hash].chain, rt); + spin_unlock_bh(rt_hash_lock_addr(hash)); *rp = rt; return 0; @@ -1217,6 +1333,9 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, || ipv4_is_zeronet(new_gw)) goto reject_redirect; + if (!rt_caching(net)) + goto reject_redirect; + if (!IN_DEV_SHARED_MEDIA(in_dev)) { if (!inet_addr_onlink(in_dev, new_gw, old_gw)) goto reject_redirect; @@ -2130,6 +2249,10 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, struct net *net; net = dev_net(dev); + + if (!rt_caching(net)) + goto skip_cache; + tos &= IPTOS_RT_MASK; hash = rt_hash(daddr, saddr, iif, rt_genid(net)); @@ -2154,6 +2277,7 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, } rcu_read_unlock(); +skip_cache: /* Multicast recognition logic is moved from route cache to here. The problem was that too many Ethernet cards have broken/missing hardware multicast filters :-( As result the host on multicasting @@ -2539,6 +2663,9 @@ int __ip_route_output_key(struct net *net, struct rtable **rp, unsigned hash; struct rtable *rth; + if (!rt_caching(net)) + goto slow_output; + hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net)); rcu_read_lock_bh(); @@ -2563,6 +2690,7 @@ int __ip_route_output_key(struct net *net, struct rtable **rp, } rcu_read_unlock_bh(); +slow_output: return ip_route_output_slow(net, rp, flp); } diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 1bb10df8ce7dd94b89a0b43b170307557cf521f0..0cc8d31f9ac0edad1012664bab4180d51589013e 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -795,6 +795,14 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = &proc_dointvec }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "rt_cache_rebuild_count", + .data = &init_net.ipv4.sysctl_rt_cache_rebuild_count, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, { } }; @@ -827,8 +835,12 @@ static __net_init int ipv4_sysctl_init_net(struct net *net) &net->ipv4.sysctl_icmp_ratelimit; table[5].data = &net->ipv4.sysctl_icmp_ratemask; + table[6].data = + &net->ipv4.sysctl_rt_cache_rebuild_count; } + net->ipv4.sysctl_rt_cache_rebuild_count = 4; + net->ipv4.ipv4_hdr = register_net_sysctl_table(net, net_ipv4_ctl_path, table); if (net->ipv4.ipv4_hdr == NULL)