diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 3c2e9639bba62c777bd05d2121d8e09a668f6982..bba15630469599b7a4d93d32baf03f86b93d1e20 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -68,19 +68,14 @@ struct arpt_table_info { unsigned int initial_entries; unsigned int hook_entry[NF_ARP_NUMHOOKS]; unsigned int underflow[NF_ARP_NUMHOOKS]; - char entries[0] __attribute__((aligned(SMP_CACHE_BYTES))); + void *entries[NR_CPUS]; }; static LIST_HEAD(arpt_target); static LIST_HEAD(arpt_tables); +#define SET_COUNTER(c,b,p) do { (c).bcnt = (b); (c).pcnt = (p); } while(0) #define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0) -#ifdef CONFIG_SMP -#define TABLE_OFFSET(t,p) (SMP_ALIGN((t)->size)*(p)) -#else -#define TABLE_OFFSET(t,p) 0 -#endif - static inline int arp_devaddr_compare(const struct arpt_devaddr_info *ap, char *hdr_addr, int len) { @@ -269,9 +264,7 @@ unsigned int arpt_do_table(struct sk_buff **pskb, outdev = out ? out->name : nulldevname; read_lock_bh(&table->lock); - table_base = (void *)table->private->entries - + TABLE_OFFSET(table->private, - smp_processor_id()); + table_base = (void *)table->private->entries[smp_processor_id()]; e = get_entry(table_base, table->private->hook_entry[hook]); back = get_entry(table_base, table->private->underflow[hook]); @@ -462,7 +455,8 @@ static inline int unconditional(const struct arpt_arp *arp) /* Figures out from what hook each rule can be called: returns 0 if * there are loops. Puts hook bitmask in comefrom. */ -static int mark_source_chains(struct arpt_table_info *newinfo, unsigned int valid_hooks) +static int mark_source_chains(struct arpt_table_info *newinfo, + unsigned int valid_hooks, void *entry0) { unsigned int hook; @@ -472,7 +466,7 @@ static int mark_source_chains(struct arpt_table_info *newinfo, unsigned int vali for (hook = 0; hook < NF_ARP_NUMHOOKS; hook++) { unsigned int pos = newinfo->hook_entry[hook]; struct arpt_entry *e - = (struct arpt_entry *)(newinfo->entries + pos); + = (struct arpt_entry *)(entry0 + pos); if (!(valid_hooks & (1 << hook))) continue; @@ -514,13 +508,13 @@ static int mark_source_chains(struct arpt_table_info *newinfo, unsigned int vali goto next; e = (struct arpt_entry *) - (newinfo->entries + pos); + (entry0 + pos); } while (oldpos == pos + e->next_offset); /* Move along one */ size = e->next_offset; e = (struct arpt_entry *) - (newinfo->entries + pos + size); + (entry0 + pos + size); e->counters.pcnt = pos; pos += size; } else { @@ -537,7 +531,7 @@ static int mark_source_chains(struct arpt_table_info *newinfo, unsigned int vali newpos = pos + e->next_offset; } e = (struct arpt_entry *) - (newinfo->entries + newpos); + (entry0 + newpos); e->counters.pcnt = pos; pos = newpos; } @@ -689,6 +683,7 @@ static inline int cleanup_entry(struct arpt_entry *e, unsigned int *i) static int translate_table(const char *name, unsigned int valid_hooks, struct arpt_table_info *newinfo, + void *entry0, unsigned int size, unsigned int number, const unsigned int *hook_entries, @@ -710,11 +705,11 @@ static int translate_table(const char *name, i = 0; /* Walk through entries, checking offsets. */ - ret = ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, + ret = ARPT_ENTRY_ITERATE(entry0, newinfo->size, check_entry_size_and_hooks, newinfo, - newinfo->entries, - newinfo->entries + size, + entry0, + entry0 + size, hook_entries, underflows, &i); duprintf("translate_table: ARPT_ENTRY_ITERATE gives %d\n", ret); if (ret != 0) @@ -743,29 +738,26 @@ static int translate_table(const char *name, } } - if (!mark_source_chains(newinfo, valid_hooks)) { + if (!mark_source_chains(newinfo, valid_hooks, entry0)) { duprintf("Looping hook\n"); return -ELOOP; } /* Finally, each sanity check must pass */ i = 0; - ret = ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, + ret = ARPT_ENTRY_ITERATE(entry0, newinfo->size, check_entry, name, size, &i); if (ret != 0) { - ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, + ARPT_ENTRY_ITERATE(entry0, newinfo->size, cleanup_entry, &i); return ret; } /* And one copy for every other CPU */ for_each_cpu(i) { - if (i == 0) - continue; - memcpy(newinfo->entries + SMP_ALIGN(newinfo->size) * i, - newinfo->entries, - SMP_ALIGN(newinfo->size)); + if (newinfo->entries[i] && newinfo->entries[i] != entry0) + memcpy(newinfo->entries[i], entry0, newinfo->size); } return ret; @@ -807,15 +799,42 @@ static inline int add_entry_to_counter(const struct arpt_entry *e, return 0; } +static inline int set_entry_to_counter(const struct arpt_entry *e, + struct arpt_counters total[], + unsigned int *i) +{ + SET_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt); + + (*i)++; + return 0; +} + static void get_counters(const struct arpt_table_info *t, struct arpt_counters counters[]) { unsigned int cpu; unsigned int i; + unsigned int curcpu; + + /* Instead of clearing (by a previous call to memset()) + * the counters and using adds, we set the counters + * with data used by 'current' CPU + * We dont care about preemption here. + */ + curcpu = raw_smp_processor_id(); + + i = 0; + ARPT_ENTRY_ITERATE(t->entries[curcpu], + t->size, + set_entry_to_counter, + counters, + &i); for_each_cpu(cpu) { + if (cpu == curcpu) + continue; i = 0; - ARPT_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu), + ARPT_ENTRY_ITERATE(t->entries[cpu], t->size, add_entry_to_counter, counters, @@ -831,6 +850,7 @@ static int copy_entries_to_user(unsigned int total_size, struct arpt_entry *e; struct arpt_counters *counters; int ret = 0; + void *loc_cpu_entry; /* We need atomic snapshot of counters: rest doesn't change * (other than comefrom, which userspace doesn't care @@ -843,13 +863,13 @@ static int copy_entries_to_user(unsigned int total_size, return -ENOMEM; /* First, sum counters... */ - memset(counters, 0, countersize); write_lock_bh(&table->lock); get_counters(table->private, counters); write_unlock_bh(&table->lock); - /* ... then copy entire thing from CPU 0... */ - if (copy_to_user(userptr, table->private->entries, total_size) != 0) { + loc_cpu_entry = table->private->entries[raw_smp_processor_id()]; + /* ... then copy entire thing ... */ + if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) { ret = -EFAULT; goto free_counters; } @@ -859,7 +879,7 @@ static int copy_entries_to_user(unsigned int total_size, for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){ struct arpt_entry_target *t; - e = (struct arpt_entry *)(table->private->entries + off); + e = (struct arpt_entry *)(loc_cpu_entry + off); if (copy_to_user(userptr + off + offsetof(struct arpt_entry, counters), &counters[num], @@ -911,6 +931,47 @@ static int get_entries(const struct arpt_get_entries *entries, return ret; } +static void free_table_info(struct arpt_table_info *info) +{ + int cpu; + for_each_cpu(cpu) { + if (info->size <= PAGE_SIZE) + kfree(info->entries[cpu]); + else + vfree(info->entries[cpu]); + } + kfree(info); +} + +static struct arpt_table_info *alloc_table_info(unsigned int size) +{ + struct arpt_table_info *newinfo; + int cpu; + + newinfo = kzalloc(sizeof(struct arpt_table_info), GFP_KERNEL); + if (!newinfo) + return NULL; + + newinfo->size = size; + + for_each_cpu(cpu) { + if (size <= PAGE_SIZE) + newinfo->entries[cpu] = kmalloc_node(size, + GFP_KERNEL, + cpu_to_node(cpu)); + else + newinfo->entries[cpu] = vmalloc_node(size, + cpu_to_node(cpu)); + + if (newinfo->entries[cpu] == NULL) { + free_table_info(newinfo); + return NULL; + } + } + + return newinfo; +} + static int do_replace(void __user *user, unsigned int len) { int ret; @@ -918,6 +979,7 @@ static int do_replace(void __user *user, unsigned int len) struct arpt_table *t; struct arpt_table_info *newinfo, *oldinfo; struct arpt_counters *counters; + void *loc_cpu_entry, *loc_cpu_old_entry; if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) return -EFAULT; @@ -930,13 +992,13 @@ static int do_replace(void __user *user, unsigned int len) if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages) return -ENOMEM; - newinfo = vmalloc(sizeof(struct arpt_table_info) - + SMP_ALIGN(tmp.size) * - (highest_possible_processor_id()+1)); + newinfo = alloc_table_info(tmp.size); if (!newinfo) return -ENOMEM; - if (copy_from_user(newinfo->entries, user + sizeof(tmp), + /* choose the copy that is on our node/cpu */ + loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), tmp.size) != 0) { ret = -EFAULT; goto free_newinfo; @@ -947,10 +1009,9 @@ static int do_replace(void __user *user, unsigned int len) ret = -ENOMEM; goto free_newinfo; } - memset(counters, 0, tmp.num_counters * sizeof(struct arpt_counters)); ret = translate_table(tmp.name, tmp.valid_hooks, - newinfo, tmp.size, tmp.num_entries, + newinfo, loc_cpu_entry, tmp.size, tmp.num_entries, tmp.hook_entry, tmp.underflow); if (ret != 0) goto free_newinfo_counters; @@ -989,8 +1050,10 @@ static int do_replace(void __user *user, unsigned int len) /* Get the old counters. */ get_counters(oldinfo, counters); /* Decrease module usage counts and free resource */ - ARPT_ENTRY_ITERATE(oldinfo->entries, oldinfo->size, cleanup_entry,NULL); - vfree(oldinfo); + loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; + ARPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,NULL); + + free_table_info(oldinfo); if (copy_to_user(tmp.counters, counters, sizeof(struct arpt_counters) * tmp.num_counters) != 0) ret = -EFAULT; @@ -1002,11 +1065,11 @@ static int do_replace(void __user *user, unsigned int len) module_put(t->me); up(&arpt_mutex); free_newinfo_counters_untrans: - ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry, NULL); + ARPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, NULL); free_newinfo_counters: vfree(counters); free_newinfo: - vfree(newinfo); + free_table_info(newinfo); return ret; } @@ -1030,6 +1093,7 @@ static int do_add_counters(void __user *user, unsigned int len) struct arpt_counters_info tmp, *paddc; struct arpt_table *t; int ret = 0; + void *loc_cpu_entry; if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) return -EFAULT; @@ -1059,7 +1123,9 @@ static int do_add_counters(void __user *user, unsigned int len) } i = 0; - ARPT_ENTRY_ITERATE(t->private->entries, + /* Choose the copy that is on our node */ + loc_cpu_entry = t->private->entries[smp_processor_id()]; + ARPT_ENTRY_ITERATE(loc_cpu_entry, t->private->size, add_counter_to_entry, paddc->counters, @@ -1220,30 +1286,32 @@ int arpt_register_table(struct arpt_table *table, struct arpt_table_info *newinfo; static struct arpt_table_info bootstrap = { 0, 0, 0, { 0 }, { 0 }, { } }; + void *loc_cpu_entry; - newinfo = vmalloc(sizeof(struct arpt_table_info) - + SMP_ALIGN(repl->size) * - (highest_possible_processor_id()+1)); + newinfo = alloc_table_info(repl->size); if (!newinfo) { ret = -ENOMEM; return ret; } - memcpy(newinfo->entries, repl->entries, repl->size); + + /* choose the copy on our node/cpu */ + loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + memcpy(loc_cpu_entry, repl->entries, repl->size); ret = translate_table(table->name, table->valid_hooks, - newinfo, repl->size, + newinfo, loc_cpu_entry, repl->size, repl->num_entries, repl->hook_entry, repl->underflow); duprintf("arpt_register_table: translate table gives %d\n", ret); if (ret != 0) { - vfree(newinfo); + free_table_info(newinfo); return ret; } ret = down_interruptible(&arpt_mutex); if (ret != 0) { - vfree(newinfo); + free_table_info(newinfo); return ret; } @@ -1272,20 +1340,23 @@ int arpt_register_table(struct arpt_table *table, return ret; free_unlock: - vfree(newinfo); + free_table_info(newinfo); goto unlock; } void arpt_unregister_table(struct arpt_table *table) { + void *loc_cpu_entry; + down(&arpt_mutex); LIST_DELETE(&arpt_tables, table); up(&arpt_mutex); /* Decrease module usage counts and free resources */ - ARPT_ENTRY_ITERATE(table->private->entries, table->private->size, + loc_cpu_entry = table->private->entries[raw_smp_processor_id()]; + ARPT_ENTRY_ITERATE(loc_cpu_entry, table->private->size, cleanup_entry, NULL); - vfree(table->private); + free_table_info(table->private); } /* The built-in targets: standard (NULL) and error. */ diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 45886c8475e84dbd25ab1da3eebb1a5562a20263..2a26d167e1495774e3c7941ced08f39db0398397 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -83,11 +83,6 @@ static DECLARE_MUTEX(ipt_mutex); context stops packets coming through and allows user context to read the counters or update the rules. - To be cache friendly on SMP, we arrange them like so: - [ n-entries ] - ... cache-align padding ... - [ n-entries ] - Hence the start of any table is given by get_table() below. */ /* The table itself */ @@ -105,20 +100,15 @@ struct ipt_table_info unsigned int underflow[NF_IP_NUMHOOKS]; /* ipt_entry tables: one per CPU */ - char entries[0] ____cacheline_aligned; + void *entries[NR_CPUS]; }; static LIST_HEAD(ipt_target); static LIST_HEAD(ipt_match); static LIST_HEAD(ipt_tables); +#define SET_COUNTER(c,b,p) do { (c).bcnt = (b); (c).pcnt = (p); } while(0) #define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0) -#ifdef CONFIG_SMP -#define TABLE_OFFSET(t,p) (SMP_ALIGN((t)->size)*(p)) -#else -#define TABLE_OFFSET(t,p) 0 -#endif - #if 0 #define down(x) do { printk("DOWN:%u:" #x "\n", __LINE__); down(x); } while(0) #define down_interruptible(x) ({ int __r; printk("DOWNi:%u:" #x "\n", __LINE__); __r = down_interruptible(x); if (__r != 0) printk("ABORT-DOWNi:%u\n", __LINE__); __r; }) @@ -290,8 +280,7 @@ ipt_do_table(struct sk_buff **pskb, read_lock_bh(&table->lock); IP_NF_ASSERT(table->valid_hooks & (1 << hook)); - table_base = (void *)table->private->entries - + TABLE_OFFSET(table->private, smp_processor_id()); + table_base = (void *)table->private->entries[smp_processor_id()]; e = get_entry(table_base, table->private->hook_entry[hook]); #ifdef CONFIG_NETFILTER_DEBUG @@ -563,7 +552,8 @@ unconditional(const struct ipt_ip *ip) /* Figures out from what hook each rule can be called: returns 0 if there are loops. Puts hook bitmask in comefrom. */ static int -mark_source_chains(struct ipt_table_info *newinfo, unsigned int valid_hooks) +mark_source_chains(struct ipt_table_info *newinfo, + unsigned int valid_hooks, void *entry0) { unsigned int hook; @@ -572,7 +562,7 @@ mark_source_chains(struct ipt_table_info *newinfo, unsigned int valid_hooks) for (hook = 0; hook < NF_IP_NUMHOOKS; hook++) { unsigned int pos = newinfo->hook_entry[hook]; struct ipt_entry *e - = (struct ipt_entry *)(newinfo->entries + pos); + = (struct ipt_entry *)(entry0 + pos); if (!(valid_hooks & (1 << hook))) continue; @@ -622,13 +612,13 @@ mark_source_chains(struct ipt_table_info *newinfo, unsigned int valid_hooks) goto next; e = (struct ipt_entry *) - (newinfo->entries + pos); + (entry0 + pos); } while (oldpos == pos + e->next_offset); /* Move along one */ size = e->next_offset; e = (struct ipt_entry *) - (newinfo->entries + pos + size); + (entry0 + pos + size); e->counters.pcnt = pos; pos += size; } else { @@ -645,7 +635,7 @@ mark_source_chains(struct ipt_table_info *newinfo, unsigned int valid_hooks) newpos = pos + e->next_offset; } e = (struct ipt_entry *) - (newinfo->entries + newpos); + (entry0 + newpos); e->counters.pcnt = pos; pos = newpos; } @@ -855,6 +845,7 @@ static int translate_table(const char *name, unsigned int valid_hooks, struct ipt_table_info *newinfo, + void *entry0, unsigned int size, unsigned int number, const unsigned int *hook_entries, @@ -875,11 +866,11 @@ translate_table(const char *name, duprintf("translate_table: size %u\n", newinfo->size); i = 0; /* Walk through entries, checking offsets. */ - ret = IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, + ret = IPT_ENTRY_ITERATE(entry0, newinfo->size, check_entry_size_and_hooks, newinfo, - newinfo->entries, - newinfo->entries + size, + entry0, + entry0 + size, hook_entries, underflows, &i); if (ret != 0) return ret; @@ -907,27 +898,24 @@ translate_table(const char *name, } } - if (!mark_source_chains(newinfo, valid_hooks)) + if (!mark_source_chains(newinfo, valid_hooks, entry0)) return -ELOOP; /* Finally, each sanity check must pass */ i = 0; - ret = IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, + ret = IPT_ENTRY_ITERATE(entry0, newinfo->size, check_entry, name, size, &i); if (ret != 0) { - IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, + IPT_ENTRY_ITERATE(entry0, newinfo->size, cleanup_entry, &i); return ret; } /* And one copy for every other CPU */ for_each_cpu(i) { - if (i == 0) - continue; - memcpy(newinfo->entries + SMP_ALIGN(newinfo->size) * i, - newinfo->entries, - SMP_ALIGN(newinfo->size)); + if (newinfo->entries[i] && newinfo->entries[i] != entry0) + memcpy(newinfo->entries[i], entry0, newinfo->size); } return ret; @@ -943,15 +931,12 @@ replace_table(struct ipt_table *table, #ifdef CONFIG_NETFILTER_DEBUG { - struct ipt_entry *table_base; - unsigned int i; + int cpu; - for_each_cpu(i) { - table_base = - (void *)newinfo->entries - + TABLE_OFFSET(newinfo, i); - - table_base->comefrom = 0xdead57ac; + for_each_cpu(cpu) { + struct ipt_entry *table_base = newinfo->entries[cpu]; + if (table_base) + table_base->comefrom = 0xdead57ac; } } #endif @@ -986,16 +971,44 @@ add_entry_to_counter(const struct ipt_entry *e, return 0; } +static inline int +set_entry_to_counter(const struct ipt_entry *e, + struct ipt_counters total[], + unsigned int *i) +{ + SET_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt); + + (*i)++; + return 0; +} + static void get_counters(const struct ipt_table_info *t, struct ipt_counters counters[]) { unsigned int cpu; unsigned int i; + unsigned int curcpu; + + /* Instead of clearing (by a previous call to memset()) + * the counters and using adds, we set the counters + * with data used by 'current' CPU + * We dont care about preemption here. + */ + curcpu = raw_smp_processor_id(); + + i = 0; + IPT_ENTRY_ITERATE(t->entries[curcpu], + t->size, + set_entry_to_counter, + counters, + &i); for_each_cpu(cpu) { + if (cpu == curcpu) + continue; i = 0; - IPT_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu), + IPT_ENTRY_ITERATE(t->entries[cpu], t->size, add_entry_to_counter, counters, @@ -1012,24 +1025,29 @@ copy_entries_to_user(unsigned int total_size, struct ipt_entry *e; struct ipt_counters *counters; int ret = 0; + void *loc_cpu_entry; /* We need atomic snapshot of counters: rest doesn't change (other than comefrom, which userspace doesn't care about). */ countersize = sizeof(struct ipt_counters) * table->private->number; - counters = vmalloc(countersize); + counters = vmalloc_node(countersize, numa_node_id()); if (counters == NULL) return -ENOMEM; /* First, sum counters... */ - memset(counters, 0, countersize); write_lock_bh(&table->lock); get_counters(table->private, counters); write_unlock_bh(&table->lock); - /* ... then copy entire thing from CPU 0... */ - if (copy_to_user(userptr, table->private->entries, total_size) != 0) { + /* choose the copy that is on our node/cpu, ... + * This choice is lazy (because current thread is + * allowed to migrate to another cpu) + */ + loc_cpu_entry = table->private->entries[raw_smp_processor_id()]; + /* ... then copy entire thing ... */ + if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) { ret = -EFAULT; goto free_counters; } @@ -1041,7 +1059,7 @@ copy_entries_to_user(unsigned int total_size, struct ipt_entry_match *m; struct ipt_entry_target *t; - e = (struct ipt_entry *)(table->private->entries + off); + e = (struct ipt_entry *)(loc_cpu_entry + off); if (copy_to_user(userptr + off + offsetof(struct ipt_entry, counters), &counters[num], @@ -1110,6 +1128,45 @@ get_entries(const struct ipt_get_entries *entries, return ret; } +static void free_table_info(struct ipt_table_info *info) +{ + int cpu; + for_each_cpu(cpu) { + if (info->size <= PAGE_SIZE) + kfree(info->entries[cpu]); + else + vfree(info->entries[cpu]); + } + kfree(info); +} + +static struct ipt_table_info *alloc_table_info(unsigned int size) +{ + struct ipt_table_info *newinfo; + int cpu; + + newinfo = kzalloc(sizeof(struct ipt_table_info), GFP_KERNEL); + if (!newinfo) + return NULL; + + newinfo->size = size; + + for_each_cpu(cpu) { + if (size <= PAGE_SIZE) + newinfo->entries[cpu] = kmalloc_node(size, + GFP_KERNEL, + cpu_to_node(cpu)); + else + newinfo->entries[cpu] = vmalloc_node(size, cpu_to_node(cpu)); + if (newinfo->entries[cpu] == 0) { + free_table_info(newinfo); + return NULL; + } + } + + return newinfo; +} + static int do_replace(void __user *user, unsigned int len) { @@ -1118,6 +1175,7 @@ do_replace(void __user *user, unsigned int len) struct ipt_table *t; struct ipt_table_info *newinfo, *oldinfo; struct ipt_counters *counters; + void *loc_cpu_entry, *loc_cpu_old_entry; if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) return -EFAULT; @@ -1130,13 +1188,13 @@ do_replace(void __user *user, unsigned int len) if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages) return -ENOMEM; - newinfo = vmalloc(sizeof(struct ipt_table_info) - + SMP_ALIGN(tmp.size) * - (highest_possible_processor_id()+1)); + newinfo = alloc_table_info(tmp.size); if (!newinfo) return -ENOMEM; - if (copy_from_user(newinfo->entries, user + sizeof(tmp), + /* choose the copy that is our node/cpu */ + loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), tmp.size) != 0) { ret = -EFAULT; goto free_newinfo; @@ -1147,10 +1205,9 @@ do_replace(void __user *user, unsigned int len) ret = -ENOMEM; goto free_newinfo; } - memset(counters, 0, tmp.num_counters * sizeof(struct ipt_counters)); ret = translate_table(tmp.name, tmp.valid_hooks, - newinfo, tmp.size, tmp.num_entries, + newinfo, loc_cpu_entry, tmp.size, tmp.num_entries, tmp.hook_entry, tmp.underflow); if (ret != 0) goto free_newinfo_counters; @@ -1189,8 +1246,9 @@ do_replace(void __user *user, unsigned int len) /* Get the old counters. */ get_counters(oldinfo, counters); /* Decrease module usage counts and free resource */ - IPT_ENTRY_ITERATE(oldinfo->entries, oldinfo->size, cleanup_entry,NULL); - vfree(oldinfo); + loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; + IPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,NULL); + free_table_info(oldinfo); if (copy_to_user(tmp.counters, counters, sizeof(struct ipt_counters) * tmp.num_counters) != 0) ret = -EFAULT; @@ -1202,11 +1260,11 @@ do_replace(void __user *user, unsigned int len) module_put(t->me); up(&ipt_mutex); free_newinfo_counters_untrans: - IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry,NULL); + IPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry,NULL); free_newinfo_counters: vfree(counters); free_newinfo: - vfree(newinfo); + free_table_info(newinfo); return ret; } @@ -1239,6 +1297,7 @@ do_add_counters(void __user *user, unsigned int len) struct ipt_counters_info tmp, *paddc; struct ipt_table *t; int ret = 0; + void *loc_cpu_entry; if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) return -EFAULT; @@ -1246,7 +1305,7 @@ do_add_counters(void __user *user, unsigned int len) if (len != sizeof(tmp) + tmp.num_counters*sizeof(struct ipt_counters)) return -EINVAL; - paddc = vmalloc(len); + paddc = vmalloc_node(len, numa_node_id()); if (!paddc) return -ENOMEM; @@ -1268,7 +1327,9 @@ do_add_counters(void __user *user, unsigned int len) } i = 0; - IPT_ENTRY_ITERATE(t->private->entries, + /* Choose the copy that is on our node */ + loc_cpu_entry = t->private->entries[raw_smp_processor_id()]; + IPT_ENTRY_ITERATE(loc_cpu_entry, t->private->size, add_counter_to_entry, paddc->counters, @@ -1460,28 +1521,31 @@ int ipt_register_table(struct ipt_table *table, const struct ipt_replace *repl) struct ipt_table_info *newinfo; static struct ipt_table_info bootstrap = { 0, 0, 0, { 0 }, { 0 }, { } }; + void *loc_cpu_entry; - newinfo = vmalloc(sizeof(struct ipt_table_info) - + SMP_ALIGN(repl->size) * - (highest_possible_processor_id()+1)); + newinfo = alloc_table_info(repl->size); if (!newinfo) return -ENOMEM; - memcpy(newinfo->entries, repl->entries, repl->size); + /* choose the copy on our node/cpu + * but dont care of preemption + */ + loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + memcpy(loc_cpu_entry, repl->entries, repl->size); ret = translate_table(table->name, table->valid_hooks, - newinfo, repl->size, + newinfo, loc_cpu_entry, repl->size, repl->num_entries, repl->hook_entry, repl->underflow); if (ret != 0) { - vfree(newinfo); + free_table_info(newinfo); return ret; } ret = down_interruptible(&ipt_mutex); if (ret != 0) { - vfree(newinfo); + free_table_info(newinfo); return ret; } @@ -1510,20 +1574,23 @@ int ipt_register_table(struct ipt_table *table, const struct ipt_replace *repl) return ret; free_unlock: - vfree(newinfo); + free_table_info(newinfo); goto unlock; } void ipt_unregister_table(struct ipt_table *table) { + void *loc_cpu_entry; + down(&ipt_mutex); LIST_DELETE(&ipt_tables, table); up(&ipt_mutex); /* Decrease module usage counts and free resources */ - IPT_ENTRY_ITERATE(table->private->entries, table->private->size, + loc_cpu_entry = table->private->entries[raw_smp_processor_id()]; + IPT_ENTRY_ITERATE(loc_cpu_entry, table->private->size, cleanup_entry, NULL); - vfree(table->private); + free_table_info(table->private); } /* Returns 1 if the port is matched by the range, 0 otherwise */ diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 95d469271c4d1879a75e6d9d3235b786f76f8691..dd80020d874035cfbcd058e035d5641b018d2fe2 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -86,11 +86,6 @@ static DECLARE_MUTEX(ip6t_mutex); context stops packets coming through and allows user context to read the counters or update the rules. - To be cache friendly on SMP, we arrange them like so: - [ n-entries ] - ... cache-align padding ... - [ n-entries ] - Hence the start of any table is given by get_table() below. */ /* The table itself */ @@ -108,20 +103,15 @@ struct ip6t_table_info unsigned int underflow[NF_IP6_NUMHOOKS]; /* ip6t_entry tables: one per CPU */ - char entries[0] ____cacheline_aligned; + void *entries[NR_CPUS]; }; static LIST_HEAD(ip6t_target); static LIST_HEAD(ip6t_match); static LIST_HEAD(ip6t_tables); +#define SET_COUNTER(c,b,p) do { (c).bcnt = (b); (c).pcnt = (p); } while(0) #define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0) -#ifdef CONFIG_SMP -#define TABLE_OFFSET(t,p) (SMP_ALIGN((t)->size)*(p)) -#else -#define TABLE_OFFSET(t,p) 0 -#endif - #if 0 #define down(x) do { printk("DOWN:%u:" #x "\n", __LINE__); down(x); } while(0) #define down_interruptible(x) ({ int __r; printk("DOWNi:%u:" #x "\n", __LINE__); __r = down_interruptible(x); if (__r != 0) printk("ABORT-DOWNi:%u\n", __LINE__); __r; }) @@ -376,8 +366,7 @@ ip6t_do_table(struct sk_buff **pskb, read_lock_bh(&table->lock); IP_NF_ASSERT(table->valid_hooks & (1 << hook)); - table_base = (void *)table->private->entries - + TABLE_OFFSET(table->private, smp_processor_id()); + table_base = (void *)table->private->entries[smp_processor_id()]; e = get_entry(table_base, table->private->hook_entry[hook]); #ifdef CONFIG_NETFILTER_DEBUG @@ -649,7 +638,8 @@ unconditional(const struct ip6t_ip6 *ipv6) /* Figures out from what hook each rule can be called: returns 0 if there are loops. Puts hook bitmask in comefrom. */ static int -mark_source_chains(struct ip6t_table_info *newinfo, unsigned int valid_hooks) +mark_source_chains(struct ip6t_table_info *newinfo, + unsigned int valid_hooks, void *entry0) { unsigned int hook; @@ -658,7 +648,7 @@ mark_source_chains(struct ip6t_table_info *newinfo, unsigned int valid_hooks) for (hook = 0; hook < NF_IP6_NUMHOOKS; hook++) { unsigned int pos = newinfo->hook_entry[hook]; struct ip6t_entry *e - = (struct ip6t_entry *)(newinfo->entries + pos); + = (struct ip6t_entry *)(entry0 + pos); if (!(valid_hooks & (1 << hook))) continue; @@ -708,13 +698,13 @@ mark_source_chains(struct ip6t_table_info *newinfo, unsigned int valid_hooks) goto next; e = (struct ip6t_entry *) - (newinfo->entries + pos); + (entry0 + pos); } while (oldpos == pos + e->next_offset); /* Move along one */ size = e->next_offset; e = (struct ip6t_entry *) - (newinfo->entries + pos + size); + (entry0 + pos + size); e->counters.pcnt = pos; pos += size; } else { @@ -731,7 +721,7 @@ mark_source_chains(struct ip6t_table_info *newinfo, unsigned int valid_hooks) newpos = pos + e->next_offset; } e = (struct ip6t_entry *) - (newinfo->entries + newpos); + (entry0 + newpos); e->counters.pcnt = pos; pos = newpos; } @@ -941,6 +931,7 @@ static int translate_table(const char *name, unsigned int valid_hooks, struct ip6t_table_info *newinfo, + void *entry0, unsigned int size, unsigned int number, const unsigned int *hook_entries, @@ -961,11 +952,11 @@ translate_table(const char *name, duprintf("translate_table: size %u\n", newinfo->size); i = 0; /* Walk through entries, checking offsets. */ - ret = IP6T_ENTRY_ITERATE(newinfo->entries, newinfo->size, + ret = IP6T_ENTRY_ITERATE(entry0, newinfo->size, check_entry_size_and_hooks, newinfo, - newinfo->entries, - newinfo->entries + size, + entry0, + entry0 + size, hook_entries, underflows, &i); if (ret != 0) return ret; @@ -993,27 +984,24 @@ translate_table(const char *name, } } - if (!mark_source_chains(newinfo, valid_hooks)) + if (!mark_source_chains(newinfo, valid_hooks, entry0)) return -ELOOP; /* Finally, each sanity check must pass */ i = 0; - ret = IP6T_ENTRY_ITERATE(newinfo->entries, newinfo->size, + ret = IP6T_ENTRY_ITERATE(entry0, newinfo->size, check_entry, name, size, &i); if (ret != 0) { - IP6T_ENTRY_ITERATE(newinfo->entries, newinfo->size, + IP6T_ENTRY_ITERATE(entry0, newinfo->size, cleanup_entry, &i); return ret; } /* And one copy for every other CPU */ for_each_cpu(i) { - if (i == 0) - continue; - memcpy(newinfo->entries + SMP_ALIGN(newinfo->size) * i, - newinfo->entries, - SMP_ALIGN(newinfo->size)); + if (newinfo->entries[i] && newinfo->entries[i] != entry0) + memcpy(newinfo->entries[i], entry0, newinfo->size); } return ret; @@ -1029,15 +1017,12 @@ replace_table(struct ip6t_table *table, #ifdef CONFIG_NETFILTER_DEBUG { - struct ip6t_entry *table_base; - unsigned int i; + int cpu; - for_each_cpu(i) { - table_base = - (void *)newinfo->entries - + TABLE_OFFSET(newinfo, i); - - table_base->comefrom = 0xdead57ac; + for_each_cpu(cpu) { + struct ip6t_entry *table_base = newinfo->entries[cpu]; + if (table_base) + table_base->comefrom = 0xdead57ac; } } #endif @@ -1072,16 +1057,44 @@ add_entry_to_counter(const struct ip6t_entry *e, return 0; } +static inline int +set_entry_to_counter(const struct ip6t_entry *e, + struct ip6t_counters total[], + unsigned int *i) +{ + SET_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt); + + (*i)++; + return 0; +} + static void get_counters(const struct ip6t_table_info *t, struct ip6t_counters counters[]) { unsigned int cpu; unsigned int i; + unsigned int curcpu; + + /* Instead of clearing (by a previous call to memset()) + * the counters and using adds, we set the counters + * with data used by 'current' CPU + * We dont care about preemption here. + */ + curcpu = raw_smp_processor_id(); + + i = 0; + IP6T_ENTRY_ITERATE(t->entries[curcpu], + t->size, + set_entry_to_counter, + counters, + &i); for_each_cpu(cpu) { + if (cpu == curcpu) + continue; i = 0; - IP6T_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu), + IP6T_ENTRY_ITERATE(t->entries[cpu], t->size, add_entry_to_counter, counters, @@ -1098,6 +1111,7 @@ copy_entries_to_user(unsigned int total_size, struct ip6t_entry *e; struct ip6t_counters *counters; int ret = 0; + void *loc_cpu_entry; /* We need atomic snapshot of counters: rest doesn't change (other than comefrom, which userspace doesn't care @@ -1109,13 +1123,13 @@ copy_entries_to_user(unsigned int total_size, return -ENOMEM; /* First, sum counters... */ - memset(counters, 0, countersize); write_lock_bh(&table->lock); get_counters(table->private, counters); write_unlock_bh(&table->lock); - /* ... then copy entire thing from CPU 0... */ - if (copy_to_user(userptr, table->private->entries, total_size) != 0) { + /* choose the copy that is on ourc node/cpu */ + loc_cpu_entry = table->private->entries[raw_smp_processor_id()]; + if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) { ret = -EFAULT; goto free_counters; } @@ -1127,7 +1141,7 @@ copy_entries_to_user(unsigned int total_size, struct ip6t_entry_match *m; struct ip6t_entry_target *t; - e = (struct ip6t_entry *)(table->private->entries + off); + e = (struct ip6t_entry *)(loc_cpu_entry + off); if (copy_to_user(userptr + off + offsetof(struct ip6t_entry, counters), &counters[num], @@ -1196,6 +1210,46 @@ get_entries(const struct ip6t_get_entries *entries, return ret; } +static void free_table_info(struct ip6t_table_info *info) +{ + int cpu; + for_each_cpu(cpu) { + if (info->size <= PAGE_SIZE) + kfree(info->entries[cpu]); + else + vfree(info->entries[cpu]); + } + kfree(info); +} + +static struct ip6t_table_info *alloc_table_info(unsigned int size) +{ + struct ip6t_table_info *newinfo; + int cpu; + + newinfo = kzalloc(sizeof(struct ip6t_table_info), GFP_KERNEL); + if (!newinfo) + return NULL; + + newinfo->size = size; + + for_each_cpu(cpu) { + if (size <= PAGE_SIZE) + newinfo->entries[cpu] = kmalloc_node(size, + GFP_KERNEL, + cpu_to_node(cpu)); + else + newinfo->entries[cpu] = vmalloc_node(size, + cpu_to_node(cpu)); + if (newinfo->entries[cpu] == NULL) { + free_table_info(newinfo); + return NULL; + } + } + + return newinfo; +} + static int do_replace(void __user *user, unsigned int len) { @@ -1204,6 +1258,7 @@ do_replace(void __user *user, unsigned int len) struct ip6t_table *t; struct ip6t_table_info *newinfo, *oldinfo; struct ip6t_counters *counters; + void *loc_cpu_entry, *loc_cpu_old_entry; if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) return -EFAULT; @@ -1212,13 +1267,13 @@ do_replace(void __user *user, unsigned int len) if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages) return -ENOMEM; - newinfo = vmalloc(sizeof(struct ip6t_table_info) - + SMP_ALIGN(tmp.size) * - (highest_possible_processor_id()+1)); + newinfo = alloc_table_info(tmp.size); if (!newinfo) return -ENOMEM; - if (copy_from_user(newinfo->entries, user + sizeof(tmp), + /* choose the copy that is on our node/cpu */ + loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), tmp.size) != 0) { ret = -EFAULT; goto free_newinfo; @@ -1229,10 +1284,9 @@ do_replace(void __user *user, unsigned int len) ret = -ENOMEM; goto free_newinfo; } - memset(counters, 0, tmp.num_counters * sizeof(struct ip6t_counters)); ret = translate_table(tmp.name, tmp.valid_hooks, - newinfo, tmp.size, tmp.num_entries, + newinfo, loc_cpu_entry, tmp.size, tmp.num_entries, tmp.hook_entry, tmp.underflow); if (ret != 0) goto free_newinfo_counters; @@ -1271,8 +1325,9 @@ do_replace(void __user *user, unsigned int len) /* Get the old counters. */ get_counters(oldinfo, counters); /* Decrease module usage counts and free resource */ - IP6T_ENTRY_ITERATE(oldinfo->entries, oldinfo->size, cleanup_entry,NULL); - vfree(oldinfo); + loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; + IP6T_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,NULL); + free_table_info(oldinfo); if (copy_to_user(tmp.counters, counters, sizeof(struct ip6t_counters) * tmp.num_counters) != 0) ret = -EFAULT; @@ -1284,11 +1339,11 @@ do_replace(void __user *user, unsigned int len) module_put(t->me); up(&ip6t_mutex); free_newinfo_counters_untrans: - IP6T_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry,NULL); + IP6T_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry,NULL); free_newinfo_counters: vfree(counters); free_newinfo: - vfree(newinfo); + free_table_info(newinfo); return ret; } @@ -1321,6 +1376,7 @@ do_add_counters(void __user *user, unsigned int len) struct ip6t_counters_info tmp, *paddc; struct ip6t_table *t; int ret = 0; + void *loc_cpu_entry; if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) return -EFAULT; @@ -1350,7 +1406,9 @@ do_add_counters(void __user *user, unsigned int len) } i = 0; - IP6T_ENTRY_ITERATE(t->private->entries, + /* Choose the copy that is on our node */ + loc_cpu_entry = t->private->entries[smp_processor_id()]; + IP6T_ENTRY_ITERATE(loc_cpu_entry, t->private->size, add_counter_to_entry, paddc->counters, @@ -1543,28 +1601,29 @@ int ip6t_register_table(struct ip6t_table *table, struct ip6t_table_info *newinfo; static struct ip6t_table_info bootstrap = { 0, 0, 0, { 0 }, { 0 }, { } }; + void *loc_cpu_entry; - newinfo = vmalloc(sizeof(struct ip6t_table_info) - + SMP_ALIGN(repl->size) * - (highest_possible_processor_id()+1)); + newinfo = alloc_table_info(repl->size); if (!newinfo) return -ENOMEM; - memcpy(newinfo->entries, repl->entries, repl->size); + /* choose the copy on our node/cpu */ + loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + memcpy(loc_cpu_entry, repl->entries, repl->size); ret = translate_table(table->name, table->valid_hooks, - newinfo, repl->size, + newinfo, loc_cpu_entry, repl->size, repl->num_entries, repl->hook_entry, repl->underflow); if (ret != 0) { - vfree(newinfo); + free_table_info(newinfo); return ret; } ret = down_interruptible(&ip6t_mutex); if (ret != 0) { - vfree(newinfo); + free_table_info(newinfo); return ret; } @@ -1593,20 +1652,23 @@ int ip6t_register_table(struct ip6t_table *table, return ret; free_unlock: - vfree(newinfo); + free_table_info(newinfo); goto unlock; } void ip6t_unregister_table(struct ip6t_table *table) { + void *loc_cpu_entry; + down(&ip6t_mutex); LIST_DELETE(&ip6t_tables, table); up(&ip6t_mutex); /* Decrease module usage counts and free resources */ - IP6T_ENTRY_ITERATE(table->private->entries, table->private->size, + loc_cpu_entry = table->private->entries[raw_smp_processor_id()]; + IP6T_ENTRY_ITERATE(loc_cpu_entry, table->private->size, cleanup_entry, NULL); - vfree(table->private); + free_table_info(table->private); } /* Returns 1 if the port is matched by the range, 0 otherwise */