提交 fd37ce34 编写于 作者: L Linus Torvalds

Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net

Pull networking update from David S. Miller:
 "I think Eric Dumazet and I have dealt with all of the known routing
  cache removal fallout.  Some other minor fixes all around.

  1) Fix RCU of cached routes, particular of output routes which require
     liberation via call_rcu() instead of call_rcu_bh().  From Eric
     Dumazet.

  2) Make sure we purge net device references in cached routes properly.

  3) TG3 driver bug fixes from Michael Chan.

  4) Fix reported 'expires' value in ipv6 routes, from Li Wei.

  5) TUN driver ioctl leaks kernel bytes to userspace, from Mathias
     Krause."

* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net: (22 commits)
  ipv4: Properly purge netdev references on uncached routes.
  ipv4: Cache routes in nexthop exception entries.
  ipv4: percpu nh_rth_output cache
  ipv4: Restore old dst_free() behavior.
  bridge: make port attributes const
  ipv4: remove rt_cache_rebuild_count
  net: ipv4: fix RCU races on dst refcounts
  net: TCP early demux cleanup
  tun: Fix formatting.
  net/tun: fix ioctl() based info leaks
  tg3: Update version to 3.124
  tg3: Fix race condition in tg3_get_stats64()
  tg3: Add New 5719 Read DMA workaround
  tg3: Fix Read DMA workaround for 5719 A0.
  tg3: Request APE_LOCK_PHY before PHY access
  ipv6: fix incorrect route 'expires' value passed to userspace
  mISDN: Bugfix only few bytes are transfered on a connection
  seeq: use PTR_RET at init_module of driver
  bnx2x: remove cast around the kmalloc in bnx2x_prev_mark_path
  ipv4: clean up put_child
  ...
......@@ -48,12 +48,6 @@ min_adv_mss - INTEGER
The advertised MSS depends on the first hop route MTU, but will
never be lower than this setting.
rt_cache_rebuild_count - INTEGER
The per net-namespace route cache emergency rebuild threshold.
Any net-namespace having its route cache rebuilt due to
a hash bucket chain being too long more than this many times
will have its route caching disabled
IP Fragmentation:
ipfrag_high_thresh - INTEGER
......
......@@ -449,7 +449,8 @@ hdlc_fill_fifo(struct bchannel *bch)
{
struct fritzcard *fc = bch->hw;
struct hdlc_hw *hdlc;
int count, fs, cnt = 0, idx, fillempty = 0;
int count, fs, cnt = 0, idx;
bool fillempty = false;
u8 *p;
u32 *ptr, val, addr;
......@@ -462,7 +463,7 @@ hdlc_fill_fifo(struct bchannel *bch)
return;
count = fs;
p = bch->fill;
fillempty = 1;
fillempty = true;
} else {
count = bch->tx_skb->len - bch->tx_idx;
if (count <= 0)
......@@ -477,7 +478,7 @@ hdlc_fill_fifo(struct bchannel *bch)
hdlc->ctrl.sr.cmd |= HDLC_CMD_XME;
}
ptr = (u32 *)p;
if (fillempty) {
if (!fillempty) {
pr_debug("%s.B%d: %d/%d/%d", fc->name, bch->nr, count,
bch->tx_idx, bch->tx_skb->len);
bch->tx_idx += count;
......
......@@ -9360,8 +9360,7 @@ static int __devinit bnx2x_prev_mark_path(struct bnx2x *bp)
struct bnx2x_prev_path_list *tmp_list;
int rc;
tmp_list = (struct bnx2x_prev_path_list *)
kmalloc(sizeof(struct bnx2x_prev_path_list), GFP_KERNEL);
tmp_list = kmalloc(sizeof(struct bnx2x_prev_path_list), GFP_KERNEL);
if (!tmp_list) {
BNX2X_ERR("Failed to allocate 'bnx2x_prev_path_list'\n");
return -ENOMEM;
......
......@@ -92,7 +92,7 @@ static inline void _tg3_flag_clear(enum TG3_FLAGS flag, unsigned long *bits)
#define DRV_MODULE_NAME "tg3"
#define TG3_MAJ_NUM 3
#define TG3_MIN_NUM 123
#define TG3_MIN_NUM 124
#define DRV_MODULE_VERSION \
__stringify(TG3_MAJ_NUM) "." __stringify(TG3_MIN_NUM)
#define DRV_MODULE_RELDATE "March 21, 2012"
......@@ -672,6 +672,12 @@ static int tg3_ape_lock(struct tg3 *tp, int locknum)
else
bit = 1 << tp->pci_fn;
break;
case TG3_APE_LOCK_PHY0:
case TG3_APE_LOCK_PHY1:
case TG3_APE_LOCK_PHY2:
case TG3_APE_LOCK_PHY3:
bit = APE_LOCK_REQ_DRIVER;
break;
default:
return -EINVAL;
}
......@@ -723,6 +729,12 @@ static void tg3_ape_unlock(struct tg3 *tp, int locknum)
else
bit = 1 << tp->pci_fn;
break;
case TG3_APE_LOCK_PHY0:
case TG3_APE_LOCK_PHY1:
case TG3_APE_LOCK_PHY2:
case TG3_APE_LOCK_PHY3:
bit = APE_LOCK_GRANT_DRIVER;
break;
default:
return;
}
......@@ -1052,6 +1064,8 @@ static int tg3_readphy(struct tg3 *tp, int reg, u32 *val)
udelay(80);
}
tg3_ape_lock(tp, tp->phy_ape_lock);
*val = 0x0;
frame_val = ((tp->phy_addr << MI_COM_PHY_ADDR_SHIFT) &
......@@ -1086,6 +1100,8 @@ static int tg3_readphy(struct tg3 *tp, int reg, u32 *val)
udelay(80);
}
tg3_ape_unlock(tp, tp->phy_ape_lock);
return ret;
}
......@@ -1105,6 +1121,8 @@ static int tg3_writephy(struct tg3 *tp, int reg, u32 val)
udelay(80);
}
tg3_ape_lock(tp, tp->phy_ape_lock);
frame_val = ((tp->phy_addr << MI_COM_PHY_ADDR_SHIFT) &
MI_COM_PHY_ADDR_MASK);
frame_val |= ((reg << MI_COM_REG_ADDR_SHIFT) &
......@@ -1135,6 +1153,8 @@ static int tg3_writephy(struct tg3 *tp, int reg, u32 val)
udelay(80);
}
tg3_ape_unlock(tp, tp->phy_ape_lock);
return ret;
}
......@@ -9066,8 +9086,7 @@ static int tg3_reset_hw(struct tg3 *tp, int reset_phy)
GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_57780 ||
tg3_flag(tp, 57765_PLUS)) {
val = tr32(TG3_RDMA_RSRVCTRL_REG);
if (GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5719 ||
GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5720) {
if (tp->pci_chip_rev_id == CHIPREV_ID_5719_A0) {
val &= ~(TG3_RDMA_RSRVCTRL_TXMRGN_MASK |
TG3_RDMA_RSRVCTRL_FIFO_LWM_MASK |
TG3_RDMA_RSRVCTRL_FIFO_HWM_MASK);
......@@ -9257,6 +9276,19 @@ static int tg3_reset_hw(struct tg3 *tp, int reset_phy)
tw32_f(RDMAC_MODE, rdmac_mode);
udelay(40);
if (GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5719) {
for (i = 0; i < TG3_NUM_RDMA_CHANNELS; i++) {
if (tr32(TG3_RDMA_LENGTH + (i << 2)) > TG3_MAX_MTU(tp))
break;
}
if (i < TG3_NUM_RDMA_CHANNELS) {
val = tr32(TG3_LSO_RD_DMA_CRPTEN_CTRL);
val |= TG3_LSO_RD_DMA_TX_LENGTH_WA;
tw32(TG3_LSO_RD_DMA_CRPTEN_CTRL, val);
tg3_flag_set(tp, 5719_RDMA_BUG);
}
}
tw32(RCVDCC_MODE, RCVDCC_MODE_ENABLE | RCVDCC_MODE_ATTN_ENABLE);
if (!tg3_flag(tp, 5705_PLUS))
tw32(MBFREE_MODE, MBFREE_MODE_ENABLE);
......@@ -9616,6 +9648,16 @@ static void tg3_periodic_fetch_stats(struct tg3 *tp)
TG3_STAT_ADD32(&sp->tx_ucast_packets, MAC_TX_STATS_UCAST);
TG3_STAT_ADD32(&sp->tx_mcast_packets, MAC_TX_STATS_MCAST);
TG3_STAT_ADD32(&sp->tx_bcast_packets, MAC_TX_STATS_BCAST);
if (unlikely(tg3_flag(tp, 5719_RDMA_BUG) &&
(sp->tx_ucast_packets.low + sp->tx_mcast_packets.low +
sp->tx_bcast_packets.low) > TG3_NUM_RDMA_CHANNELS)) {
u32 val;
val = tr32(TG3_LSO_RD_DMA_CRPTEN_CTRL);
val &= ~TG3_LSO_RD_DMA_TX_LENGTH_WA;
tw32(TG3_LSO_RD_DMA_CRPTEN_CTRL, val);
tg3_flag_clear(tp, 5719_RDMA_BUG);
}
TG3_STAT_ADD32(&sp->rx_octets, MAC_RX_STATS_OCTETS);
TG3_STAT_ADD32(&sp->rx_fragments, MAC_RX_STATS_FRAGMENTS);
......@@ -12482,10 +12524,12 @@ static struct rtnl_link_stats64 *tg3_get_stats64(struct net_device *dev,
{
struct tg3 *tp = netdev_priv(dev);
if (!tp->hw_stats)
spin_lock_bh(&tp->lock);
if (!tp->hw_stats) {
spin_unlock_bh(&tp->lock);
return &tp->net_stats_prev;
}
spin_lock_bh(&tp->lock);
tg3_get_nstats(tp, stats);
spin_unlock_bh(&tp->lock);
......@@ -13648,6 +13692,23 @@ static int __devinit tg3_phy_probe(struct tg3 *tp)
tg3_flag_set(tp, PAUSE_AUTONEG);
tp->link_config.flowctrl = FLOW_CTRL_TX | FLOW_CTRL_RX;
if (tg3_flag(tp, ENABLE_APE)) {
switch (tp->pci_fn) {
case 0:
tp->phy_ape_lock = TG3_APE_LOCK_PHY0;
break;
case 1:
tp->phy_ape_lock = TG3_APE_LOCK_PHY1;
break;
case 2:
tp->phy_ape_lock = TG3_APE_LOCK_PHY2;
break;
case 3:
tp->phy_ape_lock = TG3_APE_LOCK_PHY3;
break;
}
}
if (tg3_flag(tp, USE_PHYLIB))
return tg3_phy_init(tp);
......
......@@ -1376,7 +1376,11 @@
#define TG3_LSO_RD_DMA_CRPTEN_CTRL 0x00004910
#define TG3_LSO_RD_DMA_CRPTEN_CTRL_BLEN_BD_4K 0x00030000
#define TG3_LSO_RD_DMA_CRPTEN_CTRL_BLEN_LSO_4K 0x000c0000
/* 0x4914 --> 0x4c00 unused */
#define TG3_LSO_RD_DMA_TX_LENGTH_WA 0x02000000
/* 0x4914 --> 0x4be0 unused */
#define TG3_NUM_RDMA_CHANNELS 4
#define TG3_RDMA_LENGTH 0x00004be0
/* Write DMA control registers */
#define WDMAC_MODE 0x00004c00
......@@ -2959,6 +2963,7 @@ enum TG3_FLAGS {
TG3_FLAG_L1PLLPD_EN,
TG3_FLAG_APE_HAS_NCSI,
TG3_FLAG_4K_FIFO_LIMIT,
TG3_FLAG_5719_RDMA_BUG,
TG3_FLAG_RESET_TASK_PENDING,
TG3_FLAG_5705_PLUS,
TG3_FLAG_IS_5788,
......@@ -3107,6 +3112,7 @@ struct tg3 {
int old_link;
u8 phy_addr;
u8 phy_ape_lock;
/* PHY info */
u32 phy_id;
......
......@@ -4682,6 +4682,7 @@ static int __devinit qlge_probe(struct pci_dev *pdev,
NETIF_F_HW_VLAN_TX | NETIF_F_RXCSUM;
ndev->features = ndev->hw_features |
NETIF_F_HW_VLAN_RX | NETIF_F_HW_VLAN_FILTER;
ndev->vlan_features = ndev->hw_features;
if (test_bit(QL_DMA64, &qdev->flags))
ndev->features |= NETIF_F_HIGHDMA;
......
......@@ -736,9 +736,7 @@ MODULE_PARM_DESC(irq, "SEEQ 8005 IRQ number");
int __init init_module(void)
{
dev_seeq = seeq8005_probe(-1);
if (IS_ERR(dev_seeq))
return PTR_ERR(dev_seeq);
return 0;
return PTR_RET(dev_seeq);
}
void __exit cleanup_module(void)
......
......@@ -1379,10 +1379,12 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
int vnet_hdr_sz;
int ret;
if (cmd == TUNSETIFF || _IOC_TYPE(cmd) == 0x89)
if (cmd == TUNSETIFF || _IOC_TYPE(cmd) == 0x89) {
if (copy_from_user(&ifr, argp, ifreq_len))
return -EFAULT;
} else {
memset(&ifr, 0, sizeof(ifr));
}
if (cmd == TUNGETFEATURES) {
/* Currently this just means: "what IFF flags are valid?".
* This is needed because we never checked for invalid flags on
......
......@@ -249,4 +249,13 @@ static inline __u8 inet_sk_flowi_flags(const struct sock *sk)
return flags;
}
static inline void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
dst_hold(dst);
sk->sk_rx_dst = dst;
inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
}
#endif /* _INET_SOCK_H */
......@@ -21,6 +21,7 @@
#include <linux/rcupdate.h>
#include <net/fib_rules.h>
#include <net/inetpeer.h>
#include <linux/percpu.h>
struct fib_config {
u8 fc_dst_len;
......@@ -54,6 +55,7 @@ struct fib_nh_exception {
u32 fnhe_pmtu;
__be32 fnhe_gw;
unsigned long fnhe_expires;
struct rtable __rcu *fnhe_rth;
unsigned long fnhe_stamp;
};
......@@ -81,8 +83,8 @@ struct fib_nh {
__be32 nh_gw;
__be32 nh_saddr;
int nh_saddr_genid;
struct rtable *nh_rth_output;
struct rtable *nh_rth_input;
struct rtable __rcu * __percpu *nh_pcpu_rth_output;
struct rtable __rcu *nh_rth_input;
struct fnhe_hash_bucket *nh_exceptions;
};
......
......@@ -61,8 +61,6 @@ struct netns_ipv4 {
int sysctl_icmp_ratelimit;
int sysctl_icmp_ratemask;
int sysctl_icmp_errors_use_inbound_ifaddr;
int sysctl_rt_cache_rebuild_count;
int current_rt_cache_rebuild_count;
unsigned int sysctl_ping_group_range[2];
long sysctl_tcp_mem[3];
......
......@@ -57,6 +57,8 @@ struct rtable {
/* Miscellaneous cached information */
u32 rt_pmtu;
struct list_head rt_uncached;
};
static inline bool rt_is_input_route(const struct rtable *rt)
......@@ -107,6 +109,7 @@ extern struct ip_rt_acct __percpu *ip_rt_acct;
struct in_device;
extern int ip_rt_init(void);
extern void rt_cache_flush(struct net *net, int how);
extern void rt_flush_dev(struct net_device *dev);
extern struct rtable *__ip_route_output_key(struct net *, struct flowi4 *flp);
extern struct rtable *ip_route_output_flow(struct net *, struct flowi4 *flp,
struct sock *sk);
......
......@@ -27,7 +27,7 @@ struct brport_attribute {
};
#define BRPORT_ATTR(_name,_mode,_show,_store) \
struct brport_attribute brport_attr_##_name = { \
const struct brport_attribute brport_attr_##_name = { \
.attr = {.name = __stringify(_name), \
.mode = _mode }, \
.show = _show, \
......@@ -164,7 +164,7 @@ static BRPORT_ATTR(multicast_router, S_IRUGO | S_IWUSR, show_multicast_router,
store_multicast_router);
#endif
static struct brport_attribute *brport_attrs[] = {
static const struct brport_attribute *brport_attrs[] = {
&brport_attr_path_cost,
&brport_attr_priority,
&brport_attr_port_id,
......@@ -241,7 +241,7 @@ const struct sysfs_ops brport_sysfs_ops = {
int br_sysfs_addif(struct net_bridge_port *p)
{
struct net_bridge *br = p->br;
struct brport_attribute **a;
const struct brport_attribute **a;
int err;
err = sysfs_create_link(&p->kobj, &br->dev->dev.kobj,
......
......@@ -625,9 +625,13 @@ int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id,
.rta_id = id,
};
if (expires)
ci.rta_expires = jiffies_to_clock_t(expires);
if (expires) {
unsigned long clock;
clock = jiffies_to_clock_t(abs(expires));
clock = min_t(unsigned long, clock, INT_MAX);
ci.rta_expires = (expires > 0) ? clock : -clock;
}
return nla_put(skb, RTA_CACHEINFO, sizeof(ci), &ci);
}
EXPORT_SYMBOL_GPL(rtnl_put_cacheinfo);
......
......@@ -1046,6 +1046,7 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
if (event == NETDEV_UNREGISTER) {
fib_disable_ip(dev, 2, -1);
rt_flush_dev(dev);
return NOTIFY_DONE;
}
......
......@@ -140,6 +140,21 @@ const struct fib_prop fib_props[RTN_MAX + 1] = {
},
};
static void rt_fibinfo_free(struct rtable __rcu **rtp)
{
struct rtable *rt = rcu_dereference_protected(*rtp, 1);
if (!rt)
return;
/* Not even needed : RCU_INIT_POINTER(*rtp, NULL);
* because we waited an RCU grace period before calling
* free_fib_info_rcu()
*/
dst_free(&rt->dst);
}
static void free_nh_exceptions(struct fib_nh *nh)
{
struct fnhe_hash_bucket *hash = nh->nh_exceptions;
......@@ -153,6 +168,9 @@ static void free_nh_exceptions(struct fib_nh *nh)
struct fib_nh_exception *next;
next = rcu_dereference_protected(fnhe->fnhe_next, 1);
rt_fibinfo_free(&fnhe->fnhe_rth);
kfree(fnhe);
fnhe = next;
......@@ -161,6 +179,23 @@ static void free_nh_exceptions(struct fib_nh *nh)
kfree(hash);
}
static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp)
{
int cpu;
if (!rtp)
return;
for_each_possible_cpu(cpu) {
struct rtable *rt;
rt = rcu_dereference_protected(*per_cpu_ptr(rtp, cpu), 1);
if (rt)
dst_free(&rt->dst);
}
free_percpu(rtp);
}
/* Release a nexthop info record */
static void free_fib_info_rcu(struct rcu_head *head)
{
......@@ -171,10 +206,8 @@ static void free_fib_info_rcu(struct rcu_head *head)
dev_put(nexthop_nh->nh_dev);
if (nexthop_nh->nh_exceptions)
free_nh_exceptions(nexthop_nh);
if (nexthop_nh->nh_rth_output)
dst_free(&nexthop_nh->nh_rth_output->dst);
if (nexthop_nh->nh_rth_input)
dst_free(&nexthop_nh->nh_rth_input->dst);
rt_fibinfo_free_cpus(nexthop_nh->nh_pcpu_rth_output);
rt_fibinfo_free(&nexthop_nh->nh_rth_input);
} endfor_nexthops(fi);
release_net(fi->fib_net);
......@@ -804,6 +837,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
fi->fib_nhs = nhs;
change_nexthops(fi) {
nexthop_nh->nh_parent = fi;
nexthop_nh->nh_pcpu_rth_output = alloc_percpu(struct rtable __rcu *);
} endfor_nexthops(fi)
if (cfg->fc_mx) {
......
......@@ -159,7 +159,6 @@ struct trie {
#endif
};
static void put_child(struct trie *t, struct tnode *tn, int i, struct rt_trie_node *n);
static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
int wasfull);
static struct rt_trie_node *resize(struct trie *t, struct tnode *tn);
......@@ -473,7 +472,7 @@ static struct tnode *tnode_new(t_key key, int pos, int bits)
}
pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
sizeof(struct rt_trie_node) << bits);
sizeof(struct rt_trie_node *) << bits);
return tn;
}
......@@ -490,7 +489,7 @@ static inline int tnode_full(const struct tnode *tn, const struct rt_trie_node *
return ((struct tnode *) n)->pos == tn->pos + tn->bits;
}
static inline void put_child(struct trie *t, struct tnode *tn, int i,
static inline void put_child(struct tnode *tn, int i,
struct rt_trie_node *n)
{
tnode_put_child_reorg(tn, i, n, -1);
......@@ -754,8 +753,8 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
goto nomem;
}
put_child(t, tn, 2*i, (struct rt_trie_node *) left);
put_child(t, tn, 2*i+1, (struct rt_trie_node *) right);
put_child(tn, 2*i, (struct rt_trie_node *) left);
put_child(tn, 2*i+1, (struct rt_trie_node *) right);
}
}
......@@ -776,9 +775,9 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
if (tkey_extract_bits(node->key,
oldtnode->pos + oldtnode->bits,
1) == 0)
put_child(t, tn, 2*i, node);
put_child(tn, 2*i, node);
else
put_child(t, tn, 2*i+1, node);
put_child(tn, 2*i+1, node);
continue;
}
......@@ -786,8 +785,8 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
inode = (struct tnode *) node;
if (inode->bits == 1) {
put_child(t, tn, 2*i, rtnl_dereference(inode->child[0]));
put_child(t, tn, 2*i+1, rtnl_dereference(inode->child[1]));
put_child(tn, 2*i, rtnl_dereference(inode->child[0]));
put_child(tn, 2*i+1, rtnl_dereference(inode->child[1]));
tnode_free_safe(inode);
continue;
......@@ -817,22 +816,22 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
*/
left = (struct tnode *) tnode_get_child(tn, 2*i);
put_child(t, tn, 2*i, NULL);
put_child(tn, 2*i, NULL);
BUG_ON(!left);
right = (struct tnode *) tnode_get_child(tn, 2*i+1);
put_child(t, tn, 2*i+1, NULL);
put_child(tn, 2*i+1, NULL);
BUG_ON(!right);
size = tnode_child_length(left);
for (j = 0; j < size; j++) {
put_child(t, left, j, rtnl_dereference(inode->child[j]));
put_child(t, right, j, rtnl_dereference(inode->child[j + size]));
put_child(left, j, rtnl_dereference(inode->child[j]));
put_child(right, j, rtnl_dereference(inode->child[j + size]));
}
put_child(t, tn, 2*i, resize(t, left));
put_child(t, tn, 2*i+1, resize(t, right));
put_child(tn, 2*i, resize(t, left));
put_child(tn, 2*i+1, resize(t, right));
tnode_free_safe(inode);
}
......@@ -877,7 +876,7 @@ static struct tnode *halve(struct trie *t, struct tnode *tn)
if (!newn)
goto nomem;
put_child(t, tn, i/2, (struct rt_trie_node *)newn);
put_child(tn, i/2, (struct rt_trie_node *)newn);
}
}
......@@ -892,21 +891,21 @@ static struct tnode *halve(struct trie *t, struct tnode *tn)
if (left == NULL) {
if (right == NULL) /* Both are empty */
continue;
put_child(t, tn, i/2, right);
put_child(tn, i/2, right);
continue;
}
if (right == NULL) {
put_child(t, tn, i/2, left);
put_child(tn, i/2, left);
continue;
}
/* Two nonempty children */
newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
put_child(t, tn, i/2, NULL);
put_child(t, newBinNode, 0, left);
put_child(t, newBinNode, 1, right);
put_child(t, tn, i/2, resize(t, newBinNode));
put_child(tn, i/2, NULL);
put_child(newBinNode, 0, left);
put_child(newBinNode, 1, right);
put_child(tn, i/2, resize(t, newBinNode));
}
tnode_free_safe(oldtnode);
return tn;
......@@ -1125,7 +1124,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
node_set_parent((struct rt_trie_node *)l, tp);
cindex = tkey_extract_bits(key, tp->pos, tp->bits);
put_child(t, tp, cindex, (struct rt_trie_node *)l);
put_child(tp, cindex, (struct rt_trie_node *)l);
} else {
/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
/*
......@@ -1155,12 +1154,12 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
node_set_parent((struct rt_trie_node *)tn, tp);
missbit = tkey_extract_bits(key, newpos, 1);
put_child(t, tn, missbit, (struct rt_trie_node *)l);
put_child(t, tn, 1-missbit, n);
put_child(tn, missbit, (struct rt_trie_node *)l);
put_child(tn, 1-missbit, n);
if (tp) {
cindex = tkey_extract_bits(key, tp->pos, tp->bits);
put_child(t, tp, cindex, (struct rt_trie_node *)tn);
put_child(tp, cindex, (struct rt_trie_node *)tn);
} else {
rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
tp = tn;
......@@ -1619,7 +1618,7 @@ static void trie_leaf_remove(struct trie *t, struct leaf *l)
if (tp) {
t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
put_child(t, tp, cindex, NULL);
put_child(tp, cindex, NULL);
trie_rebalance(t, tp);
} else
RCU_INIT_POINTER(t->trie, NULL);
......
......@@ -325,14 +325,12 @@ static int ip_rcv_finish(struct sk_buff *skb)
const struct net_protocol *ipprot;
int protocol = iph->protocol;
rcu_read_lock();
ipprot = rcu_dereference(inet_protos[protocol]);
if (ipprot && ipprot->early_demux) {
ipprot->early_demux(skb);
/* must reload iph, skb->head might have changed */
iph = ip_hdr(skb);
}
rcu_read_unlock();
}
/*
......
......@@ -147,6 +147,7 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
struct sk_buff *skb, u32 mtu);
static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
struct sk_buff *skb);
static void ipv4_dst_destroy(struct dst_entry *dst);
static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
int how)
......@@ -170,6 +171,7 @@ static struct dst_ops ipv4_dst_ops = {
.default_advmss = ipv4_default_advmss,
.mtu = ipv4_mtu,
.cow_metrics = ipv4_cow_metrics,
.destroy = ipv4_dst_destroy,
.ifdown = ipv4_dst_ifdown,
.negative_advice = ipv4_negative_advice,
.link_failure = ipv4_link_failure,
......@@ -587,11 +589,17 @@ static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
build_sk_flow_key(fl4, sk);
}
static DEFINE_SEQLOCK(fnhe_seqlock);
static inline void rt_free(struct rtable *rt)
{
call_rcu(&rt->dst.rcu_head, dst_rcu_free);
}
static DEFINE_SPINLOCK(fnhe_lock);
static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
struct fib_nh_exception *fnhe, *oldest;
struct rtable *orig;
oldest = rcu_dereference(hash->chain);
for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
......@@ -599,6 +607,11 @@ static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
oldest = fnhe;
}
orig = rcu_dereference(oldest->fnhe_rth);
if (orig) {
RCU_INIT_POINTER(oldest->fnhe_rth, NULL);
rt_free(orig);
}
return oldest;
}
......@@ -620,7 +633,7 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
int depth;
u32 hval = fnhe_hashfun(daddr);
write_seqlock_bh(&fnhe_seqlock);
spin_lock_bh(&fnhe_lock);
hash = nh->nh_exceptions;
if (!hash) {
......@@ -667,7 +680,7 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
fnhe->fnhe_stamp = jiffies;
out_unlock:
write_sequnlock_bh(&fnhe_seqlock);
spin_unlock_bh(&fnhe_lock);
return;
}
......@@ -1164,53 +1177,62 @@ static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
return NULL;
}
static void rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
__be32 daddr)
{
__be32 fnhe_daddr, gw;
unsigned long expires;
unsigned int seq;
u32 pmtu;
restart:
seq = read_seqbegin(&fnhe_seqlock);
fnhe_daddr = fnhe->fnhe_daddr;
gw = fnhe->fnhe_gw;
pmtu = fnhe->fnhe_pmtu;
expires = fnhe->fnhe_expires;
if (read_seqretry(&fnhe_seqlock, seq))
goto restart;
if (daddr != fnhe_daddr)
return;
bool ret = false;
spin_lock_bh(&fnhe_lock);
if (pmtu) {
unsigned long diff = expires - jiffies;
if (daddr == fnhe->fnhe_daddr) {
struct rtable *orig;
if (time_before(jiffies, expires)) {
rt->rt_pmtu = pmtu;
dst_set_expires(&rt->dst, diff);
if (fnhe->fnhe_pmtu) {
unsigned long expires = fnhe->fnhe_expires;
unsigned long diff = expires - jiffies;
if (time_before(jiffies, expires)) {
rt->rt_pmtu = fnhe->fnhe_pmtu;
dst_set_expires(&rt->dst, diff);
}
}
if (fnhe->fnhe_gw) {
rt->rt_flags |= RTCF_REDIRECTED;
rt->rt_gateway = fnhe->fnhe_gw;
}
orig = rcu_dereference(fnhe->fnhe_rth);
rcu_assign_pointer(fnhe->fnhe_rth, rt);
if (orig)
rt_free(orig);
fnhe->fnhe_stamp = jiffies;
ret = true;
} else {
/* Routes we intend to cache in nexthop exception have
* the DST_NOCACHE bit clear. However, if we are
* unsuccessful at storing this route into the cache
* we really need to set it.
*/
rt->dst.flags |= DST_NOCACHE;
}
if (gw) {
rt->rt_flags |= RTCF_REDIRECTED;
rt->rt_gateway = gw;
}
fnhe->fnhe_stamp = jiffies;
}
spin_unlock_bh(&fnhe_lock);
static inline void rt_free(struct rtable *rt)
{
call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
return ret;
}
static void rt_cache_route(struct fib_nh *nh, struct rtable *rt)
static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
{
struct rtable *orig, *prev, **p = &nh->nh_rth_output;
if (rt_is_input_route(rt))
p = &nh->nh_rth_input;
struct rtable *orig, *prev, **p;
bool ret = true;
if (rt_is_input_route(rt)) {
p = (struct rtable **)&nh->nh_rth_input;
} else {
if (!nh->nh_pcpu_rth_output)
goto nocache;
p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
}
orig = *p;
prev = cmpxchg(p, orig, rt);
......@@ -1223,7 +1245,50 @@ static void rt_cache_route(struct fib_nh *nh, struct rtable *rt)
* unsuccessful at storing this route into the cache
* we really need to set it.
*/
nocache:
rt->dst.flags |= DST_NOCACHE;
ret = false;
}
return ret;
}
static DEFINE_SPINLOCK(rt_uncached_lock);
static LIST_HEAD(rt_uncached_list);
static void rt_add_uncached_list(struct rtable *rt)
{
spin_lock_bh(&rt_uncached_lock);
list_add_tail(&rt->rt_uncached, &rt_uncached_list);
spin_unlock_bh(&rt_uncached_lock);
}
static void ipv4_dst_destroy(struct dst_entry *dst)
{
struct rtable *rt = (struct rtable *) dst;
if (dst->flags & DST_NOCACHE) {
spin_lock_bh(&rt_uncached_lock);
list_del(&rt->rt_uncached);
spin_unlock_bh(&rt_uncached_lock);
}
}
void rt_flush_dev(struct net_device *dev)
{
if (!list_empty(&rt_uncached_list)) {
struct net *net = dev_net(dev);
struct rtable *rt;
spin_lock_bh(&rt_uncached_lock);
list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
if (rt->dst.dev != dev)
continue;
rt->dst.dev = net->loopback_dev;
dev_hold(rt->dst.dev);
dev_put(dev);
}
spin_unlock_bh(&rt_uncached_lock);
}
}
......@@ -1239,20 +1304,24 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
struct fib_nh_exception *fnhe,
struct fib_info *fi, u16 type, u32 itag)
{
bool cached = false;
if (fi) {
struct fib_nh *nh = &FIB_RES_NH(*res);
if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK)
rt->rt_gateway = nh->nh_gw;
if (unlikely(fnhe))
rt_bind_exception(rt, fnhe, daddr);
dst_init_metrics(&rt->dst, fi->fib_metrics, true);
#ifdef CONFIG_IP_ROUTE_CLASSID
rt->dst.tclassid = nh->nh_tclassid;
#endif
if (!(rt->dst.flags & DST_NOCACHE))
rt_cache_route(nh, rt);
if (unlikely(fnhe))
cached = rt_bind_exception(rt, fnhe, daddr);
else if (!(rt->dst.flags & DST_NOCACHE))
cached = rt_cache_route(nh, rt);
}
if (unlikely(!cached))
rt_add_uncached_list(rt);
#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
......@@ -1319,6 +1388,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
rth->rt_iif = 0;
rth->rt_pmtu = 0;
rth->rt_gateway = 0;
INIT_LIST_HEAD(&rth->rt_uncached);
if (our) {
rth->dst.input= ip_local_deliver;
rth->rt_flags |= RTCF_LOCAL;
......@@ -1420,7 +1490,7 @@ static int __mkroute_input(struct sk_buff *skb,
do_cache = false;
if (res->fi) {
if (!itag) {
rth = FIB_RES_NH(*res).nh_rth_input;
rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
if (rt_cache_valid(rth)) {
skb_dst_set_noref(skb, &rth->dst);
goto out;
......@@ -1444,6 +1514,7 @@ static int __mkroute_input(struct sk_buff *skb,
rth->rt_iif = 0;
rth->rt_pmtu = 0;
rth->rt_gateway = 0;
INIT_LIST_HEAD(&rth->rt_uncached);
rth->dst.input = ip_forward;
rth->dst.output = ip_output;
......@@ -1582,7 +1653,7 @@ out: return err;
do_cache = false;
if (res.fi) {
if (!itag) {
rth = FIB_RES_NH(res).nh_rth_input;
rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
if (rt_cache_valid(rth)) {
skb_dst_set_noref(skb, &rth->dst);
err = 0;
......@@ -1610,6 +1681,7 @@ out: return err;
rth->rt_iif = 0;
rth->rt_pmtu = 0;
rth->rt_gateway = 0;
INIT_LIST_HEAD(&rth->rt_uncached);
if (res.type == RTN_UNREACHABLE) {
rth->dst.input= ip_error;
rth->dst.error= -err;
......@@ -1748,19 +1820,23 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
fnhe = NULL;
if (fi) {
struct rtable __rcu **prth;
fnhe = find_exception(&FIB_RES_NH(*res), fl4->daddr);
if (!fnhe) {
rth = FIB_RES_NH(*res).nh_rth_output;
if (rt_cache_valid(rth)) {
dst_hold(&rth->dst);
return rth;
}
if (fnhe)
prth = &fnhe->fnhe_rth;
else
prth = __this_cpu_ptr(FIB_RES_NH(*res).nh_pcpu_rth_output);
rth = rcu_dereference(*prth);
if (rt_cache_valid(rth)) {
dst_hold(&rth->dst);
return rth;
}
}
rth = rt_dst_alloc(dev_out,
IN_DEV_CONF_GET(in_dev, NOPOLICY),
IN_DEV_CONF_GET(in_dev, NOXFRM),
fi && !fnhe);
fi);
if (!rth)
return ERR_PTR(-ENOBUFS);
......@@ -1773,6 +1849,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
rth->rt_iif = orig_oif ? : 0;
rth->rt_pmtu = 0;
rth->rt_gateway = 0;
INIT_LIST_HEAD(&rth->rt_uncached);
RT_CACHE_STAT_INC(out_slow_tot);
......@@ -2052,6 +2129,8 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
rt->rt_type = ort->rt_type;
rt->rt_gateway = ort->rt_gateway;
INIT_LIST_HEAD(&rt->rt_uncached);
dst_free(new);
}
......
......@@ -783,13 +783,6 @@ static struct ctl_table ipv4_net_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec
},
{
.procname = "rt_cache_rebuild_count",
.data = &init_net.ipv4.sysctl_rt_cache_rebuild_count,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec
},
{
.procname = "ping_group_range",
.data = &init_net.ipv4.sysctl_ping_group_range,
......@@ -829,8 +822,6 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
table[5].data =
&net->ipv4.sysctl_icmp_ratemask;
table[6].data =
&net->ipv4.sysctl_rt_cache_rebuild_count;
table[7].data =
&net->ipv4.sysctl_ping_group_range;
}
......@@ -842,8 +833,6 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
net->ipv4.sysctl_ping_group_range[0] = 1;
net->ipv4.sysctl_ping_group_range[1] = 0;
net->ipv4.sysctl_rt_cache_rebuild_count = 4;
tcp_init_mem(net);
net->ipv4.ipv4_hdr = register_net_sysctl(net, "net/ipv4", table);
......
......@@ -5604,8 +5604,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
tcp_set_state(sk, TCP_ESTABLISHED);
if (skb != NULL) {
sk->sk_rx_dst = dst_clone(skb_dst(skb));
inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
inet_sk_rx_dst_set(sk, skb);
security_inet_conn_established(sk, skb);
}
......
......@@ -1617,19 +1617,19 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
#endif
if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
struct dst_entry *dst = sk->sk_rx_dst;
sock_rps_save_rxhash(sk, skb);
if (sk->sk_rx_dst) {
struct dst_entry *dst = sk->sk_rx_dst;
if (dst) {
if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
dst->ops->check(dst, 0) == NULL) {
dst_release(dst);
sk->sk_rx_dst = NULL;
}
}
if (unlikely(sk->sk_rx_dst == NULL)) {
sk->sk_rx_dst = dst_clone(skb_dst(skb));
inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
}
if (unlikely(sk->sk_rx_dst == NULL))
inet_sk_rx_dst_set(sk, skb);
if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
rsk = sk;
goto reset;
......
......@@ -387,8 +387,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
struct tcp_sock *oldtp = tcp_sk(sk);
struct tcp_cookie_values *oldcvp = oldtp->cookie_values;
newsk->sk_rx_dst = dst_clone(skb_dst(skb));
inet_sk(newsk)->rx_dst_ifindex = skb->skb_iif;
inet_sk_rx_dst_set(newsk, skb);
/* TCP Cookie Transactions require space for the cookie pair,
* as it differs for each connection. There is no need to
......
......@@ -92,6 +92,7 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
xdst->u.rt.rt_type = rt->rt_type;
xdst->u.rt.rt_gateway = rt->rt_gateway;
xdst->u.rt.rt_pmtu = rt->rt_pmtu;
INIT_LIST_HEAD(&xdst->u.rt.rt_uncached);
return 0;
}
......
......@@ -52,11 +52,9 @@ int ip6_rcv_finish(struct sk_buff *skb)
if (sysctl_ip_early_demux && !skb_dst(skb)) {
const struct inet6_protocol *ipprot;
rcu_read_lock();
ipprot = rcu_dereference(inet6_protos[ipv6_hdr(skb)->nexthdr]);
if (ipprot && ipprot->early_demux)
ipprot->early_demux(skb);
rcu_read_unlock();
}
if (!skb_dst(skb))
ip6_route_input(skb);
......
......@@ -2480,12 +2480,8 @@ static int rt6_fill_node(struct net *net,
goto nla_put_failure;
if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
goto nla_put_failure;
if (!(rt->rt6i_flags & RTF_EXPIRES))
expires = 0;
else if (rt->dst.expires - jiffies < INT_MAX)
expires = rt->dst.expires - jiffies;
else
expires = INT_MAX;
expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
goto nla_put_failure;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册