diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index f35bfe43bf7abac26690328c5332d154023fd818..29a93518bf18acc7d9a0a3ac443abd03662e3792 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -101,19 +101,17 @@ ipfrag_high_thresh - INTEGER Maximum memory used to reassemble IP fragments. When ipfrag_high_thresh bytes of memory is allocated for this purpose, the fragment handler will toss packets until ipfrag_low_thresh - is reached. + is reached. This also serves as a maximum limit to namespaces + different from the initial one. ipfrag_low_thresh - INTEGER - See ipfrag_high_thresh + Maximum memory used to reassemble IP fragments before the kernel + begins to remove incomplete fragment queues to free up resources. + The kernel still accepts new fragments for defragmentation. ipfrag_time - INTEGER Time in seconds to keep an IP fragment in memory. -ipfrag_secret_interval - INTEGER - Regeneration interval (in seconds) of the hash secret (or lifetime - for the hash secret) for IP fragments. - Default: 600 - ipfrag_max_dist - INTEGER ipfrag_max_dist is a non-negative integer value which defines the maximum "disorder" which is allowed among fragments which share a @@ -1162,11 +1160,6 @@ ip6frag_low_thresh - INTEGER ip6frag_time - INTEGER Time in seconds to keep an IPv6 fragment in memory. -ip6frag_secret_interval - INTEGER - Regeneration interval (in seconds) of the hash secret (or lifetime - for the hash secret) for IPv6 fragments. - Default: 600 - conf/default/*: Change the interface-specific default settings. diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index 6f59de98dabde3be0daa202f8be4528d49676f7f..6f4930a0b6600fbda1521c7e2a3bd028905d3935 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -4,10 +4,6 @@ #include struct netns_frags { - int nqueues; - struct list_head lru_list; - spinlock_t lru_lock; - /* The percpu_counter "mem" need to be cacheline aligned. * mem.count must not share cacheline with other writers */ @@ -22,7 +18,6 @@ struct netns_frags { struct inet_frag_queue { spinlock_t lock; struct timer_list timer; /* when will this queue expire? */ - struct list_head lru_list; /* lru list member */ struct hlist_node list; atomic_t refcnt; struct sk_buff *fragments; /* list of received fragments */ @@ -32,6 +27,7 @@ struct inet_frag_queue { int meat; __u8 last_in; /* first/last segment arrived? */ +#define INET_FRAG_EVICTED 8 #define INET_FRAG_COMPLETE 4 #define INET_FRAG_FIRST_IN 2 #define INET_FRAG_LAST_IN 1 @@ -48,7 +44,7 @@ struct inet_frag_queue { * rounded up (SKB_TRUELEN(0) + sizeof(struct ipq or * struct frag_queue)) */ -#define INETFRAGS_MAXDEPTH 128 +#define INETFRAGS_MAXDEPTH 128 struct inet_frag_bucket { struct hlist_head chain; @@ -57,24 +53,27 @@ struct inet_frag_bucket { struct inet_frags { struct inet_frag_bucket hash[INETFRAGS_HASHSZ]; - /* This rwlock is a global lock (seperate per IPv4, IPv6 and - * netfilter). Important to keep this on a seperate cacheline. - * Its primarily a rebuild protection rwlock. - */ - rwlock_t lock ____cacheline_aligned_in_smp; - int secret_interval; - struct timer_list secret_timer; + + struct work_struct frags_work; + unsigned int next_bucket; + unsigned long last_rebuild_jiffies; + bool rebuild; /* The first call to hashfn is responsible to initialize * rnd. This is best done with net_get_random_once. + * + * rnd_seqlock is used to let hash insertion detect + * when it needs to re-lookup the hash chain to use. */ u32 rnd; + seqlock_t rnd_seqlock; int qsize; - unsigned int (*hashfn)(struct inet_frag_queue *); - bool (*match)(struct inet_frag_queue *q, void *arg); + unsigned int (*hashfn)(const struct inet_frag_queue *); + bool (*match)(const struct inet_frag_queue *q, + const void *arg); void (*constructor)(struct inet_frag_queue *q, - void *arg); + const void *arg); void (*destructor)(struct inet_frag_queue *); void (*skb_free)(struct sk_buff *); void (*frag_expire)(unsigned long data); @@ -87,19 +86,17 @@ void inet_frags_init_net(struct netns_frags *nf); void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f); void inet_frag_kill(struct inet_frag_queue *q, struct inet_frags *f); -void inet_frag_destroy(struct inet_frag_queue *q, - struct inet_frags *f, int *work); -int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force); +void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f); struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, - struct inet_frags *f, void *key, unsigned int hash) - __releases(&f->lock); + struct inet_frags *f, void *key, unsigned int hash); + void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q, const char *prefix); static inline void inet_frag_put(struct inet_frag_queue *q, struct inet_frags *f) { if (atomic_dec_and_test(&q->refcnt)) - inet_frag_destroy(q, f, NULL); + inet_frag_destroy(q, f); } /* Memory Tracking Functions. */ @@ -131,9 +128,9 @@ static inline void init_frag_mem_limit(struct netns_frags *nf) percpu_counter_init(&nf->mem, 0); } -static inline int sum_frag_mem_limit(struct netns_frags *nf) +static inline unsigned int sum_frag_mem_limit(struct netns_frags *nf) { - int res; + unsigned int res; local_bh_disable(); res = percpu_counter_sum_positive(&nf->mem); @@ -142,31 +139,6 @@ static inline int sum_frag_mem_limit(struct netns_frags *nf) return res; } -static inline void inet_frag_lru_move(struct inet_frag_queue *q) -{ - spin_lock(&q->net->lru_lock); - if (!list_empty(&q->lru_list)) - list_move_tail(&q->lru_list, &q->net->lru_list); - spin_unlock(&q->net->lru_lock); -} - -static inline void inet_frag_lru_del(struct inet_frag_queue *q) -{ - spin_lock(&q->net->lru_lock); - list_del_init(&q->lru_list); - q->net->nqueues--; - spin_unlock(&q->net->lru_lock); -} - -static inline void inet_frag_lru_add(struct netns_frags *nf, - struct inet_frag_queue *q) -{ - spin_lock(&nf->lru_lock); - list_add_tail(&q->lru_list, &nf->lru_list); - q->net->nqueues++; - spin_unlock(&nf->lru_lock); -} - /* RFC 3168 support : * We want to check ECN values of all fragments, do detect invalid combinations. * In ipq->ecn, we store the OR value of each ip4_frag_ecn() fragment value. diff --git a/include/net/ip.h b/include/net/ip.h index 2e8f055989c3c982df11c0160ae409e98814bcab..ca14799545fdc00521f52fd957396ce6e2e628af 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -495,7 +495,6 @@ static inline struct sk_buff *ip_check_defrag(struct sk_buff *skb, u32 user) } #endif int ip_frag_mem(struct net *net); -int ip_frag_nqueues(struct net *net); /* * Functions provided by ip_forward.c diff --git a/include/net/ipv6.h b/include/net/ipv6.h index a25017247457cbf84510ffee6190052ae650c765..a2db816e8461cff706f23c725cf69372ab8bfcaa 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -299,11 +299,6 @@ static inline bool ipv6_accept_ra(struct inet6_dev *idev) } #if IS_ENABLED(CONFIG_IPV6) -static inline int ip6_frag_nqueues(struct net *net) -{ - return net->ipv6.frags.nqueues; -} - static inline int ip6_frag_mem(struct net *net) { return sum_frag_mem_limit(&net->ipv6.frags); @@ -496,8 +491,8 @@ struct ip6_create_arg { u8 ecn; }; -void ip6_frag_init(struct inet_frag_queue *q, void *a); -bool ip6_frag_match(struct inet_frag_queue *q, void *a); +void ip6_frag_init(struct inet_frag_queue *q, const void *a); +bool ip6_frag_match(const struct inet_frag_queue *q, const void *a); /* * Equivalent of ipv4 struct ip diff --git a/net/ieee802154/reassembly.c b/net/ieee802154/reassembly.c index b85bd3f7048e7bdb7e44d06c0b49b4d2a22724f5..f13d4f32e207e124b5f07ffd7dce4da8a2d1f2c6 100644 --- a/net/ieee802154/reassembly.c +++ b/net/ieee802154/reassembly.c @@ -50,29 +50,25 @@ static unsigned int lowpan_hash_frag(__be16 tag, u16 d_size, const struct ieee802154_addr *saddr, const struct ieee802154_addr *daddr) { - u32 c; - net_get_random_once(&lowpan_frags.rnd, sizeof(lowpan_frags.rnd)); - c = jhash_3words(ieee802154_addr_hash(saddr), - ieee802154_addr_hash(daddr), - (__force u32)(tag + (d_size << 16)), - lowpan_frags.rnd); - - return c & (INETFRAGS_HASHSZ - 1); + return jhash_3words(ieee802154_addr_hash(saddr), + ieee802154_addr_hash(daddr), + (__force u32)(tag + (d_size << 16)), + lowpan_frags.rnd); } -static unsigned int lowpan_hashfn(struct inet_frag_queue *q) +static unsigned int lowpan_hashfn(const struct inet_frag_queue *q) { - struct lowpan_frag_queue *fq; + const struct lowpan_frag_queue *fq; fq = container_of(q, struct lowpan_frag_queue, q); return lowpan_hash_frag(fq->tag, fq->d_size, &fq->saddr, &fq->daddr); } -static bool lowpan_frag_match(struct inet_frag_queue *q, void *a) +static bool lowpan_frag_match(const struct inet_frag_queue *q, const void *a) { - struct lowpan_frag_queue *fq; - struct lowpan_create_arg *arg = a; + const struct lowpan_frag_queue *fq; + const struct lowpan_create_arg *arg = a; fq = container_of(q, struct lowpan_frag_queue, q); return fq->tag == arg->tag && fq->d_size == arg->d_size && @@ -80,10 +76,10 @@ static bool lowpan_frag_match(struct inet_frag_queue *q, void *a) ieee802154_addr_equal(&fq->daddr, arg->dst); } -static void lowpan_frag_init(struct inet_frag_queue *q, void *a) +static void lowpan_frag_init(struct inet_frag_queue *q, const void *a) { + const struct lowpan_create_arg *arg = a; struct lowpan_frag_queue *fq; - struct lowpan_create_arg *arg = a; fq = container_of(q, struct lowpan_frag_queue, q); @@ -128,7 +124,6 @@ fq_find(struct net *net, const struct lowpan_frag_info *frag_info, arg.src = src; arg.dst = dst; - read_lock(&lowpan_frags.lock); hash = lowpan_hash_frag(frag_info->d_tag, frag_info->d_size, src, dst); q = inet_frag_find(&ieee802154_lowpan->frags, @@ -223,7 +218,6 @@ static int lowpan_frag_queue(struct lowpan_frag_queue *fq, return res; } - inet_frag_lru_move(&fq->q); return -1; err: kfree_skb(skb); @@ -373,8 +367,6 @@ int lowpan_frag_rcv(struct sk_buff *skb, const u8 frag_type) if (frag_info->d_size > ieee802154_lowpan->max_dsize) goto err; - inet_frag_evictor(&ieee802154_lowpan->frags, &lowpan_frags, false); - fq = fq_find(net, frag_info, &source, &dest); if (fq != NULL) { int ret; @@ -394,20 +386,25 @@ int lowpan_frag_rcv(struct sk_buff *skb, const u8 frag_type) EXPORT_SYMBOL(lowpan_frag_rcv); #ifdef CONFIG_SYSCTL +static int zero; + static struct ctl_table lowpan_frags_ns_ctl_table[] = { { .procname = "6lowpanfrag_high_thresh", .data = &init_net.ieee802154_lowpan.frags.high_thresh, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dointvec_minmax, + .extra1 = &init_net.ieee802154_lowpan.frags.low_thresh }, { .procname = "6lowpanfrag_low_thresh", .data = &init_net.ieee802154_lowpan.frags.low_thresh, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &init_net.ieee802154_lowpan.frags.high_thresh }, { .procname = "6lowpanfrag_time", @@ -426,10 +423,12 @@ static struct ctl_table lowpan_frags_ns_ctl_table[] = { { } }; +/* secret interval has been deprecated */ +static int lowpan_frags_secret_interval_unused; static struct ctl_table lowpan_frags_ctl_table[] = { { .procname = "6lowpanfrag_secret_interval", - .data = &lowpan_frags.secret_interval, + .data = &lowpan_frags_secret_interval_unused, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, @@ -452,7 +451,10 @@ static int __net_init lowpan_frags_ns_sysctl_register(struct net *net) goto err_alloc; table[0].data = &ieee802154_lowpan->frags.high_thresh; + table[0].extra1 = &ieee802154_lowpan->frags.low_thresh; + table[0].extra2 = &init_net.ieee802154_lowpan.frags.high_thresh; table[1].data = &ieee802154_lowpan->frags.low_thresh; + table[1].extra2 = &ieee802154_lowpan->frags.high_thresh; table[2].data = &ieee802154_lowpan->frags.timeout; table[3].data = &ieee802154_lowpan->max_dsize; @@ -569,7 +571,6 @@ int __init lowpan_net_frag_init(void) lowpan_frags.qsize = sizeof(struct frag_queue); lowpan_frags.match = lowpan_frag_match; lowpan_frags.frag_expire = lowpan_frag_expire; - lowpan_frags.secret_interval = 10 * 60 * HZ; inet_frags_init(&lowpan_frags); return ret; diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 3b01959bf4bb0bbc208d8d564c8595c67eedd179..62b1f73749dc9b5b0b6b174e43edb178f0ea6a93 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -25,6 +25,12 @@ #include #include +#define INETFRAGS_EVICT_BUCKETS 128 +#define INETFRAGS_EVICT_MAX 512 + +/* don't rebuild inetfrag table with new secret more often than this */ +#define INETFRAGS_MIN_REBUILD_INTERVAL (5 * HZ) + /* Given the OR values of all fragments, apply RFC 3168 5.3 requirements * Value : 0xff if frame should be dropped. * 0 or INET_ECN_CE value, to be ORed in to final iph->tos field @@ -46,24 +52,39 @@ const u8 ip_frag_ecn_table[16] = { }; EXPORT_SYMBOL(ip_frag_ecn_table); -static void inet_frag_secret_rebuild(unsigned long dummy) +static unsigned int +inet_frag_hashfn(const struct inet_frags *f, const struct inet_frag_queue *q) +{ + return f->hashfn(q) & (INETFRAGS_HASHSZ - 1); +} + +static bool inet_frag_may_rebuild(struct inet_frags *f) +{ + return time_after(jiffies, + f->last_rebuild_jiffies + INETFRAGS_MIN_REBUILD_INTERVAL); +} + +static void inet_frag_secret_rebuild(struct inet_frags *f) { - struct inet_frags *f = (struct inet_frags *)dummy; - unsigned long now = jiffies; int i; - /* Per bucket lock NOT needed here, due to write lock protection */ - write_lock(&f->lock); + write_seqlock_bh(&f->rnd_seqlock); + + if (!inet_frag_may_rebuild(f)) + goto out; get_random_bytes(&f->rnd, sizeof(u32)); + for (i = 0; i < INETFRAGS_HASHSZ; i++) { struct inet_frag_bucket *hb; struct inet_frag_queue *q; struct hlist_node *n; hb = &f->hash[i]; + spin_lock(&hb->chain_lock); + hlist_for_each_entry_safe(q, n, &hb->chain, list) { - unsigned int hval = f->hashfn(q); + unsigned int hval = inet_frag_hashfn(f, q); if (hval != i) { struct inet_frag_bucket *hb_dest; @@ -72,76 +93,195 @@ static void inet_frag_secret_rebuild(unsigned long dummy) /* Relink to new hash chain. */ hb_dest = &f->hash[hval]; + + /* This is the only place where we take + * another chain_lock while already holding + * one. As this will not run concurrently, + * we cannot deadlock on hb_dest lock below, if its + * already locked it will be released soon since + * other caller cannot be waiting for hb lock + * that we've taken above. + */ + spin_lock_nested(&hb_dest->chain_lock, + SINGLE_DEPTH_NESTING); hlist_add_head(&q->list, &hb_dest->chain); + spin_unlock(&hb_dest->chain_lock); } } + spin_unlock(&hb->chain_lock); } - write_unlock(&f->lock); - mod_timer(&f->secret_timer, now + f->secret_interval); + f->rebuild = false; + f->last_rebuild_jiffies = jiffies; +out: + write_sequnlock_bh(&f->rnd_seqlock); +} + +static bool inet_fragq_should_evict(const struct inet_frag_queue *q) +{ + return q->net->low_thresh == 0 || + frag_mem_limit(q->net) >= q->net->low_thresh; +} + +static unsigned int +inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb) +{ + struct inet_frag_queue *fq; + struct hlist_node *n; + unsigned int evicted = 0; + HLIST_HEAD(expired); + +evict_again: + spin_lock(&hb->chain_lock); + + hlist_for_each_entry_safe(fq, n, &hb->chain, list) { + if (!inet_fragq_should_evict(fq)) + continue; + + if (!del_timer(&fq->timer)) { + /* q expiring right now thus increment its refcount so + * it won't be freed under us and wait until the timer + * has finished executing then destroy it + */ + atomic_inc(&fq->refcnt); + spin_unlock(&hb->chain_lock); + del_timer_sync(&fq->timer); + WARN_ON(atomic_read(&fq->refcnt) != 1); + inet_frag_put(fq, f); + goto evict_again; + } + + /* suppress xmit of (icmp) error packet */ + fq->last_in &= ~INET_FRAG_FIRST_IN; + fq->last_in |= INET_FRAG_EVICTED; + hlist_del(&fq->list); + hlist_add_head(&fq->list, &expired); + ++evicted; + } + + spin_unlock(&hb->chain_lock); + + hlist_for_each_entry_safe(fq, n, &expired, list) + f->frag_expire((unsigned long) fq); + + return evicted; +} + +static void inet_frag_worker(struct work_struct *work) +{ + unsigned int budget = INETFRAGS_EVICT_BUCKETS; + unsigned int i, evicted = 0; + struct inet_frags *f; + + f = container_of(work, struct inet_frags, frags_work); + + BUILD_BUG_ON(INETFRAGS_EVICT_BUCKETS >= INETFRAGS_HASHSZ); + + local_bh_disable(); + + for (i = ACCESS_ONCE(f->next_bucket); budget; --budget) { + evicted += inet_evict_bucket(f, &f->hash[i]); + i = (i + 1) & (INETFRAGS_HASHSZ - 1); + if (evicted > INETFRAGS_EVICT_MAX) + break; + } + + f->next_bucket = i; + + local_bh_enable(); + + if (f->rebuild && inet_frag_may_rebuild(f)) + inet_frag_secret_rebuild(f); +} + +static void inet_frag_schedule_worker(struct inet_frags *f) +{ + if (unlikely(!work_pending(&f->frags_work))) + schedule_work(&f->frags_work); } void inet_frags_init(struct inet_frags *f) { int i; + INIT_WORK(&f->frags_work, inet_frag_worker); + for (i = 0; i < INETFRAGS_HASHSZ; i++) { struct inet_frag_bucket *hb = &f->hash[i]; spin_lock_init(&hb->chain_lock); INIT_HLIST_HEAD(&hb->chain); } - rwlock_init(&f->lock); - setup_timer(&f->secret_timer, inet_frag_secret_rebuild, - (unsigned long)f); - f->secret_timer.expires = jiffies + f->secret_interval; - add_timer(&f->secret_timer); + seqlock_init(&f->rnd_seqlock); + f->last_rebuild_jiffies = 0; } EXPORT_SYMBOL(inet_frags_init); void inet_frags_init_net(struct netns_frags *nf) { - nf->nqueues = 0; init_frag_mem_limit(nf); - INIT_LIST_HEAD(&nf->lru_list); - spin_lock_init(&nf->lru_lock); } EXPORT_SYMBOL(inet_frags_init_net); void inet_frags_fini(struct inet_frags *f) { - del_timer(&f->secret_timer); + cancel_work_sync(&f->frags_work); } EXPORT_SYMBOL(inet_frags_fini); void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f) { - nf->low_thresh = 0; + unsigned int seq; + int i; + nf->low_thresh = 0; local_bh_disable(); - inet_frag_evictor(nf, f, true); + +evict_again: + seq = read_seqbegin(&f->rnd_seqlock); + + for (i = 0; i < INETFRAGS_HASHSZ ; i++) + inet_evict_bucket(f, &f->hash[i]); + + if (read_seqretry(&f->rnd_seqlock, seq)) + goto evict_again; + local_bh_enable(); percpu_counter_destroy(&nf->mem); } EXPORT_SYMBOL(inet_frags_exit_net); -static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f) +static struct inet_frag_bucket * +get_frag_bucket_locked(struct inet_frag_queue *fq, struct inet_frags *f) +__acquires(hb->chain_lock) { struct inet_frag_bucket *hb; - unsigned int hash; + unsigned int seq, hash; + + restart: + seq = read_seqbegin(&f->rnd_seqlock); - read_lock(&f->lock); - hash = f->hashfn(fq); + hash = inet_frag_hashfn(f, fq); hb = &f->hash[hash]; spin_lock(&hb->chain_lock); + if (read_seqretry(&f->rnd_seqlock, seq)) { + spin_unlock(&hb->chain_lock); + goto restart; + } + + return hb; +} + +static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f) +{ + struct inet_frag_bucket *hb; + + hb = get_frag_bucket_locked(fq, f); hlist_del(&fq->list); spin_unlock(&hb->chain_lock); - - read_unlock(&f->lock); - inet_frag_lru_del(fq); } void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f) @@ -165,8 +305,7 @@ static inline void frag_kfree_skb(struct netns_frags *nf, struct inet_frags *f, kfree_skb(skb); } -void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f, - int *work) +void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f) { struct sk_buff *fp; struct netns_frags *nf; @@ -186,86 +325,30 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f, fp = xp; } sum = sum_truesize + f->qsize; - if (work) - *work -= sum; sub_frag_mem_limit(q, sum); if (f->destructor) f->destructor(q); kfree(q); - } EXPORT_SYMBOL(inet_frag_destroy); -int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force) -{ - struct inet_frag_queue *q; - int work, evicted = 0; - - if (!force) { - if (frag_mem_limit(nf) <= nf->high_thresh) - return 0; - } - - work = frag_mem_limit(nf) - nf->low_thresh; - while (work > 0 || force) { - spin_lock(&nf->lru_lock); - - if (list_empty(&nf->lru_list)) { - spin_unlock(&nf->lru_lock); - break; - } - - q = list_first_entry(&nf->lru_list, - struct inet_frag_queue, lru_list); - atomic_inc(&q->refcnt); - /* Remove q from list to avoid several CPUs grabbing it */ - list_del_init(&q->lru_list); - - spin_unlock(&nf->lru_lock); - - spin_lock(&q->lock); - if (!(q->last_in & INET_FRAG_COMPLETE)) - inet_frag_kill(q, f); - spin_unlock(&q->lock); - - if (atomic_dec_and_test(&q->refcnt)) - inet_frag_destroy(q, f, &work); - evicted++; - } - - return evicted; -} -EXPORT_SYMBOL(inet_frag_evictor); - static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf, struct inet_frag_queue *qp_in, struct inet_frags *f, void *arg) { - struct inet_frag_bucket *hb; + struct inet_frag_bucket *hb = get_frag_bucket_locked(qp_in, f); struct inet_frag_queue *qp; - unsigned int hash; - - read_lock(&f->lock); /* Protects against hash rebuild */ - /* - * While we stayed w/o the lock other CPU could update - * the rnd seed, so we need to re-calculate the hash - * chain. Fortunatelly the qp_in can be used to get one. - */ - hash = f->hashfn(qp_in); - hb = &f->hash[hash]; - spin_lock(&hb->chain_lock); #ifdef CONFIG_SMP /* With SMP race we have to recheck hash table, because - * such entry could be created on other cpu, while we - * released the hash bucket lock. + * such entry could have been created on other cpu before + * we acquired hash bucket lock. */ hlist_for_each_entry(qp, &hb->chain, list) { if (qp->net == nf && f->match(qp, arg)) { atomic_inc(&qp->refcnt); spin_unlock(&hb->chain_lock); - read_unlock(&f->lock); qp_in->last_in |= INET_FRAG_COMPLETE; inet_frag_put(qp_in, f); return qp; @@ -278,9 +361,8 @@ static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf, atomic_inc(&qp->refcnt); hlist_add_head(&qp->list, &hb->chain); - inet_frag_lru_add(nf, qp); + spin_unlock(&hb->chain_lock); - read_unlock(&f->lock); return qp; } @@ -290,6 +372,11 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, { struct inet_frag_queue *q; + if (frag_mem_limit(nf) > nf->high_thresh) { + inet_frag_schedule_worker(f); + return NULL; + } + q = kzalloc(f->qsize, GFP_ATOMIC); if (q == NULL) return NULL; @@ -301,7 +388,6 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, setup_timer(&q->timer, f->frag_expire, (unsigned long)q); spin_lock_init(&q->lock); atomic_set(&q->refcnt, 1); - INIT_LIST_HEAD(&q->lru_list); return q; } @@ -320,12 +406,15 @@ static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf, struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, struct inet_frags *f, void *key, unsigned int hash) - __releases(&f->lock) { struct inet_frag_bucket *hb; struct inet_frag_queue *q; int depth = 0; + if (frag_mem_limit(nf) > nf->low_thresh) + inet_frag_schedule_worker(f); + + hash &= (INETFRAGS_HASHSZ - 1); hb = &f->hash[hash]; spin_lock(&hb->chain_lock); @@ -333,18 +422,22 @@ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, if (q->net == nf && f->match(q, key)) { atomic_inc(&q->refcnt); spin_unlock(&hb->chain_lock); - read_unlock(&f->lock); return q; } depth++; } spin_unlock(&hb->chain_lock); - read_unlock(&f->lock); if (depth <= INETFRAGS_MAXDEPTH) return inet_frag_create(nf, f, key); - else - return ERR_PTR(-ENOBUFS); + + if (inet_frag_may_rebuild(f)) { + if (!f->rebuild) + f->rebuild = true; + inet_frag_schedule_worker(f); + } + + return ERR_PTR(-ENOBUFS); } EXPORT_SYMBOL(inet_frag_find); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index ed32313e307c43202a4710c6f5b74e14c19a4c20..634fc31aa24309698eba0718efbbe2304512a7cc 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -86,11 +86,6 @@ static inline u8 ip4_frag_ecn(u8 tos) static struct inet_frags ip4_frags; -int ip_frag_nqueues(struct net *net) -{ - return net->ipv4.frags.nqueues; -} - int ip_frag_mem(struct net *net) { return sum_frag_mem_limit(&net->ipv4.frags); @@ -109,21 +104,21 @@ static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot) net_get_random_once(&ip4_frags.rnd, sizeof(ip4_frags.rnd)); return jhash_3words((__force u32)id << 16 | prot, (__force u32)saddr, (__force u32)daddr, - ip4_frags.rnd) & (INETFRAGS_HASHSZ - 1); + ip4_frags.rnd); } -static unsigned int ip4_hashfn(struct inet_frag_queue *q) +static unsigned int ip4_hashfn(const struct inet_frag_queue *q) { - struct ipq *ipq; + const struct ipq *ipq; ipq = container_of(q, struct ipq, q); return ipqhashfn(ipq->id, ipq->saddr, ipq->daddr, ipq->protocol); } -static bool ip4_frag_match(struct inet_frag_queue *q, void *a) +static bool ip4_frag_match(const struct inet_frag_queue *q, const void *a) { - struct ipq *qp; - struct ip4_create_arg *arg = a; + const struct ipq *qp; + const struct ip4_create_arg *arg = a; qp = container_of(q, struct ipq, q); return qp->id == arg->iph->id && @@ -133,14 +128,14 @@ static bool ip4_frag_match(struct inet_frag_queue *q, void *a) qp->user == arg->user; } -static void ip4_frag_init(struct inet_frag_queue *q, void *a) +static void ip4_frag_init(struct inet_frag_queue *q, const void *a) { struct ipq *qp = container_of(q, struct ipq, q); struct netns_ipv4 *ipv4 = container_of(q->net, struct netns_ipv4, frags); struct net *net = container_of(ipv4, struct net, ipv4); - struct ip4_create_arg *arg = a; + const struct ip4_create_arg *arg = a; qp->protocol = arg->iph->protocol; qp->id = arg->iph->id; @@ -177,18 +172,6 @@ static void ipq_kill(struct ipq *ipq) inet_frag_kill(&ipq->q, &ip4_frags); } -/* Memory limiting on fragments. Evictor trashes the oldest - * fragment queue until we are back under the threshold. - */ -static void ip_evictor(struct net *net) -{ - int evicted; - - evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags, false); - if (evicted) - IP_ADD_STATS_BH(net, IPSTATS_MIB_REASMFAILS, evicted); -} - /* * Oops, a fragment queue timed out. Kill it and send an ICMP reply. */ @@ -207,7 +190,8 @@ static void ip_expire(unsigned long arg) ipq_kill(qp); - IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT); + if (!(qp->q.last_in & INET_FRAG_EVICTED)) + IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT); IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS); if ((qp->q.last_in & INET_FRAG_FIRST_IN) && qp->q.fragments != NULL) { @@ -260,7 +244,6 @@ static inline struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user) arg.iph = iph; arg.user = user; - read_lock(&ip4_frags.lock); hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol); q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash); @@ -505,7 +488,6 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) } skb_dst_drop(skb); - inet_frag_lru_move(&qp->q); return -EINPROGRESS; err: @@ -655,9 +637,6 @@ int ip_defrag(struct sk_buff *skb, u32 user) net = skb->dev ? dev_net(skb->dev) : dev_net(skb_dst(skb)->dev); IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS); - /* Start by cleaning up the memory. */ - ip_evictor(net); - /* Lookup (or create) queue header */ if ((qp = ip_find(net, ip_hdr(skb), user)) != NULL) { int ret; @@ -721,14 +700,17 @@ static struct ctl_table ip4_frags_ns_ctl_table[] = { .data = &init_net.ipv4.frags.high_thresh, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dointvec_minmax, + .extra1 = &init_net.ipv4.frags.low_thresh }, { .procname = "ipfrag_low_thresh", .data = &init_net.ipv4.frags.low_thresh, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &init_net.ipv4.frags.high_thresh }, { .procname = "ipfrag_time", @@ -740,10 +722,12 @@ static struct ctl_table ip4_frags_ns_ctl_table[] = { { } }; +/* secret interval has been deprecated */ +static int ip4_frags_secret_interval_unused; static struct ctl_table ip4_frags_ctl_table[] = { { .procname = "ipfrag_secret_interval", - .data = &ip4_frags.secret_interval, + .data = &ip4_frags_secret_interval_unused, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, @@ -771,7 +755,10 @@ static int __net_init ip4_frags_ns_ctl_register(struct net *net) goto err_alloc; table[0].data = &net->ipv4.frags.high_thresh; + table[0].extra1 = &net->ipv4.frags.low_thresh; + table[0].extra2 = &init_net.ipv4.frags.high_thresh; table[1].data = &net->ipv4.frags.low_thresh; + table[1].extra2 = &net->ipv4.frags.high_thresh; table[2].data = &net->ipv4.frags.timeout; /* Don't export sysctls to unprivileged users */ @@ -873,6 +860,5 @@ void __init ipfrag_init(void) ip4_frags.qsize = sizeof(struct ipq); ip4_frags.match = ip4_frag_match; ip4_frags.frag_expire = ip_expire; - ip4_frags.secret_interval = 10 * 60 * HZ; inet_frags_init(&ip4_frags); } diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index ae0af9386f7ccf4cabafbcc8aff6c37309e820ea..8e3eb39f84e72c9e0f487bcfc8950b544b882a11 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -52,6 +52,7 @@ static int sockstat_seq_show(struct seq_file *seq, void *v) { struct net *net = seq->private; + unsigned int frag_mem; int orphans, sockets; local_bh_disable(); @@ -71,8 +72,8 @@ static int sockstat_seq_show(struct seq_file *seq, void *v) sock_prot_inuse_get(net, &udplite_prot)); seq_printf(seq, "RAW: inuse %d\n", sock_prot_inuse_get(net, &raw_prot)); - seq_printf(seq, "FRAG: inuse %d memory %d\n", - ip_frag_nqueues(net), ip_frag_mem(net)); + frag_mem = ip_frag_mem(net); + seq_printf(seq, "FRAG: inuse %u memory %u\n", !!frag_mem, frag_mem); return 0; } diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 0d5279fd852a48643b5b0b54834b2ee68a893116..3d4bccf6d67d38cd84b04cda51fd9234b20f3d2e 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -63,6 +63,8 @@ struct nf_ct_frag6_skb_cb static struct inet_frags nf_frags; #ifdef CONFIG_SYSCTL +static int zero; + static struct ctl_table nf_ct_frag6_sysctl_table[] = { { .procname = "nf_conntrack_frag6_timeout", @@ -76,14 +78,17 @@ static struct ctl_table nf_ct_frag6_sysctl_table[] = { .data = &init_net.nf_frag.frags.low_thresh, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &init_net.nf_frag.frags.high_thresh }, { .procname = "nf_conntrack_frag6_high_thresh", .data = &init_net.nf_frag.frags.high_thresh, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &init_net.nf_frag.frags.low_thresh }, { } }; @@ -102,7 +107,10 @@ static int nf_ct_frag6_sysctl_register(struct net *net) table[0].data = &net->nf_frag.frags.timeout; table[1].data = &net->nf_frag.frags.low_thresh; + table[1].extra2 = &net->nf_frag.frags.high_thresh; table[2].data = &net->nf_frag.frags.high_thresh; + table[2].extra1 = &net->nf_frag.frags.low_thresh; + table[2].extra2 = &init_net.nf_frag.frags.high_thresh; } hdr = register_net_sysctl(net, "net/netfilter", table); @@ -147,16 +155,13 @@ static inline u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h) static unsigned int nf_hash_frag(__be32 id, const struct in6_addr *saddr, const struct in6_addr *daddr) { - u32 c; - net_get_random_once(&nf_frags.rnd, sizeof(nf_frags.rnd)); - c = jhash_3words(ipv6_addr_hash(saddr), ipv6_addr_hash(daddr), - (__force u32)id, nf_frags.rnd); - return c & (INETFRAGS_HASHSZ - 1); + return jhash_3words(ipv6_addr_hash(saddr), ipv6_addr_hash(daddr), + (__force u32)id, nf_frags.rnd); } -static unsigned int nf_hashfn(struct inet_frag_queue *q) +static unsigned int nf_hashfn(const struct inet_frag_queue *q) { const struct frag_queue *nq; @@ -196,7 +201,7 @@ static inline struct frag_queue *fq_find(struct net *net, __be32 id, arg.dst = dst; arg.ecn = ecn; - read_lock_bh(&nf_frags.lock); + local_bh_disable(); hash = nf_hash_frag(id, src, dst); q = inet_frag_find(&net->nf_frag.frags, &nf_frags, &arg, hash); @@ -352,7 +357,6 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, fq->q.last_in |= INET_FRAG_FIRST_IN; } - inet_frag_lru_move(&fq->q); return 0; discard_fq: @@ -597,10 +601,6 @@ struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb, u32 user) hdr = ipv6_hdr(clone); fhdr = (struct frag_hdr *)skb_transport_header(clone); - local_bh_disable(); - inet_frag_evictor(&net->nf_frag.frags, &nf_frags, false); - local_bh_enable(); - fq = fq_find(net, fhdr->identification, user, &hdr->saddr, &hdr->daddr, ip6_frag_ecn(hdr)); if (fq == NULL) { @@ -677,7 +677,6 @@ int nf_ct_frag6_init(void) nf_frags.qsize = sizeof(struct frag_queue); nf_frags.match = ip6_frag_match; nf_frags.frag_expire = nf_ct_frag6_expire; - nf_frags.secret_interval = 10 * 60 * HZ; inet_frags_init(&nf_frags); ret = register_pernet_subsys(&nf_ct_net_ops); diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index 3317440ea34174032a7b0bee73d8f5b36cfb08ee..2d6f860e5c1e77cb087b2220ba2db9592a770159 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -33,6 +33,7 @@ static int sockstat6_seq_show(struct seq_file *seq, void *v) { struct net *net = seq->private; + unsigned int frag_mem = ip6_frag_mem(net); seq_printf(seq, "TCP6: inuse %d\n", sock_prot_inuse_get(net, &tcpv6_prot)); @@ -42,8 +43,7 @@ static int sockstat6_seq_show(struct seq_file *seq, void *v) sock_prot_inuse_get(net, &udplitev6_prot)); seq_printf(seq, "RAW6: inuse %d\n", sock_prot_inuse_get(net, &rawv6_prot)); - seq_printf(seq, "FRAG6: inuse %d memory %d\n", - ip6_frag_nqueues(net), ip6_frag_mem(net)); + seq_printf(seq, "FRAG6: inuse %u memory %u\n", !!frag_mem, frag_mem); return 0; } diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index cc85a9ba50101c5abf86cb3e6d493bfb9b680c74..f1709c4a289a5802c1c1baa03bd9d87193b3b589 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -85,27 +85,23 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, static unsigned int inet6_hash_frag(__be32 id, const struct in6_addr *saddr, const struct in6_addr *daddr) { - u32 c; - net_get_random_once(&ip6_frags.rnd, sizeof(ip6_frags.rnd)); - c = jhash_3words(ipv6_addr_hash(saddr), ipv6_addr_hash(daddr), - (__force u32)id, ip6_frags.rnd); - - return c & (INETFRAGS_HASHSZ - 1); + return jhash_3words(ipv6_addr_hash(saddr), ipv6_addr_hash(daddr), + (__force u32)id, ip6_frags.rnd); } -static unsigned int ip6_hashfn(struct inet_frag_queue *q) +static unsigned int ip6_hashfn(const struct inet_frag_queue *q) { - struct frag_queue *fq; + const struct frag_queue *fq; fq = container_of(q, struct frag_queue, q); return inet6_hash_frag(fq->id, &fq->saddr, &fq->daddr); } -bool ip6_frag_match(struct inet_frag_queue *q, void *a) +bool ip6_frag_match(const struct inet_frag_queue *q, const void *a) { - struct frag_queue *fq; - struct ip6_create_arg *arg = a; + const struct frag_queue *fq; + const struct ip6_create_arg *arg = a; fq = container_of(q, struct frag_queue, q); return fq->id == arg->id && @@ -115,10 +111,10 @@ bool ip6_frag_match(struct inet_frag_queue *q, void *a) } EXPORT_SYMBOL(ip6_frag_match); -void ip6_frag_init(struct inet_frag_queue *q, void *a) +void ip6_frag_init(struct inet_frag_queue *q, const void *a) { struct frag_queue *fq = container_of(q, struct frag_queue, q); - struct ip6_create_arg *arg = a; + const struct ip6_create_arg *arg = a; fq->id = arg->id; fq->user = arg->user; @@ -145,7 +141,9 @@ void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq, if (!dev) goto out_rcu_unlock; - IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT); + if (!(fq->q.last_in & INET_FRAG_EVICTED)) + IP6_INC_STATS_BH(net, __in6_dev_get(dev), + IPSTATS_MIB_REASMTIMEOUT); IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS); /* Don't send error if the first segment did not arrive. */ @@ -192,7 +190,6 @@ fq_find(struct net *net, __be32 id, const struct in6_addr *src, arg.dst = dst; arg.ecn = ecn; - read_lock(&ip6_frags.lock); hash = inet6_hash_frag(id, src, dst); q = inet_frag_find(&net->ipv6.frags, &ip6_frags, &arg, hash); @@ -353,7 +350,6 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, } skb_dst_drop(skb); - inet_frag_lru_move(&fq->q); return -1; discard_fq: @@ -523,7 +519,6 @@ static int ipv6_frag_rcv(struct sk_buff *skb) struct frag_queue *fq; const struct ipv6hdr *hdr = ipv6_hdr(skb); struct net *net = dev_net(skb_dst(skb)->dev); - int evicted; if (IP6CB(skb)->flags & IP6SKB_FRAGMENTED) goto fail_hdr; @@ -552,11 +547,6 @@ static int ipv6_frag_rcv(struct sk_buff *skb) return 1; } - evicted = inet_frag_evictor(&net->ipv6.frags, &ip6_frags, false); - if (evicted) - IP6_ADD_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_REASMFAILS, evicted); - fq = fq_find(net, fhdr->identification, &hdr->saddr, &hdr->daddr, ip6_frag_ecn(hdr)); if (fq != NULL) { @@ -588,20 +578,25 @@ static const struct inet6_protocol frag_protocol = }; #ifdef CONFIG_SYSCTL +static int zero; + static struct ctl_table ip6_frags_ns_ctl_table[] = { { .procname = "ip6frag_high_thresh", .data = &init_net.ipv6.frags.high_thresh, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dointvec_minmax, + .extra1 = &init_net.ipv6.frags.low_thresh }, { .procname = "ip6frag_low_thresh", .data = &init_net.ipv6.frags.low_thresh, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &init_net.ipv6.frags.high_thresh }, { .procname = "ip6frag_time", @@ -613,10 +608,12 @@ static struct ctl_table ip6_frags_ns_ctl_table[] = { { } }; +/* secret interval has been deprecated */ +static int ip6_frags_secret_interval_unused; static struct ctl_table ip6_frags_ctl_table[] = { { .procname = "ip6frag_secret_interval", - .data = &ip6_frags.secret_interval, + .data = &ip6_frags_secret_interval_unused, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, @@ -636,7 +633,10 @@ static int __net_init ip6_frags_ns_sysctl_register(struct net *net) goto err_alloc; table[0].data = &net->ipv6.frags.high_thresh; + table[0].extra1 = &net->ipv6.frags.low_thresh; + table[0].extra2 = &init_net.ipv6.frags.high_thresh; table[1].data = &net->ipv6.frags.low_thresh; + table[1].extra2 = &net->ipv6.frags.high_thresh; table[2].data = &net->ipv6.frags.timeout; /* Don't export sysctls to unprivileged users */ @@ -746,7 +746,6 @@ int __init ipv6_frag_init(void) ip6_frags.qsize = sizeof(struct frag_queue); ip6_frags.match = ip6_frag_match; ip6_frags.frag_expire = ip6_frag_expire; - ip6_frags.secret_interval = 10 * 60 * HZ; inet_frags_init(&ip6_frags); out: return ret;