提交 c8d68e6b 编写于 作者: J Jason Wang 提交者: David S. Miller

tuntap: multiqueue support

This patch converts tun/tap to a multiqueue device and exposes the
queues as multiple file descriptors to userspace. Internally, each tun_file is
abstracted as a queue, and an array of pointers to tun_file structures is
stored in the tun_struct device, so multiple tun_files can be
attached to the device as multiple queues.

When choosing the txq, we first try to identify the flow through its rxhash; if
the packet has no rxhash, we fall back to the recorded rxq and use it to choose
the transmit queue. This policy may be changed in the future.
Signed-off-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
上级 bbb00994
...@@ -109,6 +109,12 @@ struct tap_filter { ...@@ -109,6 +109,12 @@ struct tap_filter {
unsigned char addr[FLT_EXACT_COUNT][ETH_ALEN]; unsigned char addr[FLT_EXACT_COUNT][ETH_ALEN];
}; };
/* Upper bound on the number of queues (attached tun_files) per tuntap device.
 * 1024 is probably a high enough limit: modern hypervisors seem to support on
 * the order of 100-200 CPUs, so this leaves us some breathing space if we want
 * to match a queue per guest CPU.
 */
#define MAX_TAP_QUEUES 1024
/* A tun_file connects an open character device to a tuntap netdevice. It /* A tun_file connects an open character device to a tuntap netdevice. It
* also contains all socket related strctures (except sock_fprog and tap_filter) * also contains all socket related strctures (except sock_fprog and tap_filter)
* to serve as one transmit queue for tuntap device. The sock_fprog and * to serve as one transmit queue for tuntap device. The sock_fprog and
...@@ -129,6 +135,7 @@ struct tun_file { ...@@ -129,6 +135,7 @@ struct tun_file {
struct fasync_struct *fasync; struct fasync_struct *fasync;
/* only used for fasnyc */ /* only used for fasnyc */
unsigned int flags; unsigned int flags;
u16 queue_index;
}; };
/* Since the socket were moved to tun_file, to preserve the behavior of persist /* Since the socket were moved to tun_file, to preserve the behavior of persist
...@@ -136,7 +143,8 @@ struct tun_file { ...@@ -136,7 +143,8 @@ struct tun_file {
* file were attached to a persist device. * file were attached to a persist device.
*/ */
struct tun_struct { struct tun_struct {
struct tun_file __rcu *tfile; struct tun_file __rcu *tfiles[MAX_TAP_QUEUES];
unsigned int numqueues;
unsigned int flags; unsigned int flags;
kuid_t owner; kuid_t owner;
kgid_t group; kgid_t group;
...@@ -157,56 +165,157 @@ struct tun_struct { ...@@ -157,56 +165,157 @@ struct tun_struct {
#endif #endif
}; };
/* Pick the transmit queue for an outgoing skb.
 *
 * We try to identify a flow through its rxhash first. The reason that
 * we do not check the rxq number first is because some cards (e.g. 82599)
 * choose the rxq based on the txq where the last packet of the flow came
 * from. As the userspace application moves between processors, we may get
 * a different rxq number here. Only if we could not get an rxhash do we
 * hope the recorded rxq number may help.
 *
 * Returns a queue index below tun->numqueues, or 0 when no queue is
 * attached or the skb carries neither an rxhash nor a recorded rxq.
 */
static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb)
{
	struct tun_struct *tun = netdev_priv(dev);
	u32 txq = 0;
	u32 numqueues = 0;

	rcu_read_lock();
	numqueues = tun->numqueues;

	txq = skb_get_rxhash(skb);
	if (txq) {
		/* use multiply and shift instead of expensive divide */
		txq = ((u64)txq * numqueues) >> 32;
	} else if (numqueues && likely(skb_rx_queue_recorded(skb))) {
		/* numqueues drops to 0 once every queue has been detached;
		 * without the check above the reduction loop below would
		 * subtract 0 forever.
		 */
		txq = skb_get_rx_queue(skb);
		while (unlikely(txq >= numqueues))
			txq -= numqueues;
	}

	rcu_read_unlock();
	return txq;
}
/* Report the current number of attached queues to the networking core as
 * both the real tx queue count and the real rx queue count of the device.
 */
static void tun_set_real_num_queues(struct tun_struct *tun)
{
	struct net_device *dev = tun->dev;

	netif_set_real_num_tx_queues(dev, tun->numqueues);
	netif_set_real_num_rx_queues(dev, tun->numqueues);
}
/* Detach one queue (tfile) from the tun device it is attached to, if any,
 * and optionally release the socket embedded in the tfile.
 *
 * @tfile: the queue to detach.
 * @clean: when true, also release tfile->sk with sk_release_kernel().
 *
 * The lockdep expressions below expect the RTNL lock to be held;
 * tun_detach() is the wrapper that takes it.
 */
static void __tun_detach(struct tun_file *tfile, bool clean)
{
struct tun_file *ntfile;
struct tun_struct *tun;
struct net_device *dev;
tun = rcu_dereference_protected(tfile->tun,
lockdep_rtnl_is_held());
if (tun) {
u16 index = tfile->queue_index;
BUG_ON(index >= tun->numqueues);
dev = tun->dev;
/* Move the last active queue into the slot being vacated so the
 * active entries stay packed in tfiles[0..numqueues-1]. When tfile is
 * itself the last queue this is a self-assignment.
 */
rcu_assign_pointer(tun->tfiles[index],
tun->tfiles[tun->numqueues - 1]);
rcu_assign_pointer(tfile->tun, NULL);
ntfile = rcu_dereference_protected(tun->tfiles[index],
lockdep_rtnl_is_held());
/* The moved queue now lives at 'index'. */
ntfile->queue_index = index;
--tun->numqueues;
/* NOTE(review): presumably balances the sock_hold() taken in
 * tun_attach() - confirm.
 */
sock_put(&tfile->sk);
/* Let concurrent RCU readers that may still see the old pointers
 * finish before the receive queue is purged.
 */
synchronize_net();
/* Drop read queue */
skb_queue_purge(&tfile->sk.sk_receive_queue);
tun_set_real_num_queues(tun);
/* Last queue gone on a non-persistent device: tear the netdevice down
 * if it is still registered.
 */
if (tun->numqueues == 0 && !(tun->flags & TUN_PERSIST))
if (dev->reg_state == NETREG_REGISTERED)
unregister_netdevice(dev);
}
if (clean) {
/* The socket must carry SOCK_EXTERNALLY_ALLOCATED before its sk is
 * released here.
 */
BUG_ON(!test_bit(SOCK_EXTERNALLY_ALLOCATED,
&tfile->socket.flags));
sk_release_kernel(&tfile->sk);
}
}
/* Locked wrapper around __tun_detach(): takes the RTNL lock, detaches
 * tfile (releasing its socket as well when clean is true), then drops
 * the lock.
 */
static void tun_detach(struct tun_file *tfile, bool clean)
{
rtnl_lock();
__tun_detach(tfile, clean);
rtnl_unlock();
}
/* Detach every queue from the device in two passes.
 *
 * Pass 1 wakes the waiters on each queue, clears the queue's back-pointer
 * to the device and counts tun->numqueues down to zero. After
 * synchronize_net(), pass 2 purges each queue's receive queue and drops a
 * reference on its socket. The lockdep expressions expect RTNL to be held.
 */
static void tun_detach_all(struct net_device *dev)
{
	struct tun_struct *tun = netdev_priv(dev);
	int nqueues = tun->numqueues;
	int q;

	for (q = 0; q < nqueues; q++) {
		struct tun_file *tfile =
			rcu_dereference_protected(tun->tfiles[q],
						  lockdep_rtnl_is_held());

		BUG_ON(!tfile);
		wake_up_all(&tfile->wq.wait);
		rcu_assign_pointer(tfile->tun, NULL);
		--tun->numqueues;
	}
	BUG_ON(tun->numqueues != 0);

	synchronize_net();

	for (q = 0; q < nqueues; q++) {
		struct tun_file *tfile =
			rcu_dereference_protected(tun->tfiles[q],
						  lockdep_rtnl_is_held());

		/* Drop read queue */
		skb_queue_purge(&tfile->sk.sk_receive_queue);
		sock_put(&tfile->sk);
	}
}
static int tun_attach(struct tun_struct *tun, struct file *file) static int tun_attach(struct tun_struct *tun, struct file *file)
{ {
struct tun_file *tfile = file->private_data; struct tun_file *tfile = file->private_data;
int err; int err;
ASSERT_RTNL();
netif_tx_lock_bh(tun->dev);
err = -EINVAL; err = -EINVAL;
if (tfile->tun) if (rcu_dereference_protected(tfile->tun, lockdep_rtnl_is_held()))
goto out; goto out;
err = -EBUSY; err = -EBUSY;
if (tun->tfile) if (!(tun->flags & TUN_TAP_MQ) && tun->numqueues == 1)
goto out;
err = -E2BIG;
if (tun->numqueues == MAX_TAP_QUEUES)
goto out; goto out;
err = 0; err = 0;
/* Re-attach filter when attaching to a persist device */ /* Re-attach the filter to presist device */
if (tun->filter_attached == true) { if (tun->filter_attached == true) {
err = sk_attach_filter(&tun->fprog, tfile->socket.sk); err = sk_attach_filter(&tun->fprog, tfile->socket.sk);
if (!err) if (!err)
goto out; goto out;
} }
tfile->queue_index = tun->numqueues;
rcu_assign_pointer(tfile->tun, tun); rcu_assign_pointer(tfile->tun, tun);
tfile->socket.sk->sk_sndbuf = tun->sndbuf; rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile);
rcu_assign_pointer(tun->tfile, tfile);
netif_carrier_on(tun->dev);
sock_hold(&tfile->sk); sock_hold(&tfile->sk);
tun->numqueues++;
out: tun_set_real_num_queues(tun);
netif_tx_unlock_bh(tun->dev);
return err;
}
static void __tun_detach(struct tun_struct *tun) if (tun->numqueues == 1)
{ netif_carrier_on(tun->dev);
struct tun_file *tfile = rcu_dereference_protected(tun->tfile,
lockdep_rtnl_is_held());
/* Detach from net device */
netif_carrier_off(tun->dev);
rcu_assign_pointer(tun->tfile, NULL);
if (tfile) {
rcu_assign_pointer(tfile->tun, NULL);
synchronize_net(); /* device is allowed to go away first, so no need to hold extra
/* Drop read queue */ * refcnt.
skb_queue_purge(&tfile->socket.sk->sk_receive_queue); */
}
out:
return err;
} }
static struct tun_struct *__tun_get(struct tun_file *tfile) static struct tun_struct *__tun_get(struct tun_file *tfile)
...@@ -349,30 +458,20 @@ static const struct ethtool_ops tun_ethtool_ops; ...@@ -349,30 +458,20 @@ static const struct ethtool_ops tun_ethtool_ops;
/* Net device detach from fd. */ /* Net device detach from fd. */
static void tun_net_uninit(struct net_device *dev) static void tun_net_uninit(struct net_device *dev)
{ {
struct tun_struct *tun = netdev_priv(dev); tun_detach_all(dev);
struct tun_file *tfile = rcu_dereference_protected(tun->tfile,
lockdep_rtnl_is_held());
/* Inform the methods they need to stop using the dev.
*/
if (tfile) {
wake_up_all(&tfile->wq.wait);
__tun_detach(tun);
synchronize_net();
}
} }
/* Net device open. */ /* Net device open. */
static int tun_net_open(struct net_device *dev) static int tun_net_open(struct net_device *dev)
{ {
netif_start_queue(dev); netif_tx_start_all_queues(dev);
return 0; return 0;
} }
/* Net device close. */ /* Net device close. */
static int tun_net_close(struct net_device *dev) static int tun_net_close(struct net_device *dev)
{ {
netif_stop_queue(dev); netif_tx_stop_all_queues(dev);
return 0; return 0;
} }
...@@ -380,16 +479,20 @@ static int tun_net_close(struct net_device *dev) ...@@ -380,16 +479,20 @@ static int tun_net_close(struct net_device *dev)
static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
{ {
struct tun_struct *tun = netdev_priv(dev); struct tun_struct *tun = netdev_priv(dev);
int txq = skb->queue_mapping;
struct tun_file *tfile; struct tun_file *tfile;
rcu_read_lock(); rcu_read_lock();
tfile = rcu_dereference(tun->tfile); tfile = rcu_dereference(tun->tfiles[txq]);
/* Drop packet if interface is not attached */ /* Drop packet if interface is not attached */
if (!tfile) if (txq >= tun->numqueues)
goto drop; goto drop;
tun_debug(KERN_INFO, tun, "tun_net_xmit %d\n", skb->len); tun_debug(KERN_INFO, tun, "tun_net_xmit %d\n", skb->len);
BUG_ON(!tfile);
/* Drop if the filter does not like it. /* Drop if the filter does not like it.
* This is a noop if the filter is disabled. * This is a noop if the filter is disabled.
* Filter can be enabled only for the TAP devices. */ * Filter can be enabled only for the TAP devices. */
...@@ -400,12 +503,15 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) ...@@ -400,12 +503,15 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
sk_filter(tfile->socket.sk, skb)) sk_filter(tfile->socket.sk, skb))
goto drop; goto drop;
/* Limit the number of packets queued by dividing txq length by the
* number of queues.
*/
if (skb_queue_len(&tfile->socket.sk->sk_receive_queue) if (skb_queue_len(&tfile->socket.sk->sk_receive_queue)
>= dev->tx_queue_len) { >= dev->tx_queue_len / tun->numqueues){
if (!(tun->flags & TUN_ONE_QUEUE)) { if (!(tun->flags & TUN_ONE_QUEUE)) {
/* Normal queueing mode. */ /* Normal queueing mode. */
/* Packet scheduler handles dropping of further packets. */ /* Packet scheduler handles dropping of further packets. */
netif_stop_queue(dev); netif_stop_subqueue(dev, txq);
/* We won't see all dropped packets individually, so overrun /* We won't see all dropped packets individually, so overrun
* error is more appropriate. */ * error is more appropriate. */
...@@ -494,6 +600,7 @@ static const struct net_device_ops tun_netdev_ops = { ...@@ -494,6 +600,7 @@ static const struct net_device_ops tun_netdev_ops = {
.ndo_start_xmit = tun_net_xmit, .ndo_start_xmit = tun_net_xmit,
.ndo_change_mtu = tun_net_change_mtu, .ndo_change_mtu = tun_net_change_mtu,
.ndo_fix_features = tun_net_fix_features, .ndo_fix_features = tun_net_fix_features,
.ndo_select_queue = tun_select_queue,
#ifdef CONFIG_NET_POLL_CONTROLLER #ifdef CONFIG_NET_POLL_CONTROLLER
.ndo_poll_controller = tun_poll_controller, .ndo_poll_controller = tun_poll_controller,
#endif #endif
...@@ -509,6 +616,7 @@ static const struct net_device_ops tap_netdev_ops = { ...@@ -509,6 +616,7 @@ static const struct net_device_ops tap_netdev_ops = {
.ndo_set_rx_mode = tun_net_mclist, .ndo_set_rx_mode = tun_net_mclist,
.ndo_set_mac_address = eth_mac_addr, .ndo_set_mac_address = eth_mac_addr,
.ndo_validate_addr = eth_validate_addr, .ndo_validate_addr = eth_validate_addr,
.ndo_select_queue = tun_select_queue,
#ifdef CONFIG_NET_POLL_CONTROLLER #ifdef CONFIG_NET_POLL_CONTROLLER
.ndo_poll_controller = tun_poll_controller, .ndo_poll_controller = tun_poll_controller,
#endif #endif
...@@ -550,7 +658,7 @@ static void tun_net_init(struct net_device *dev) ...@@ -550,7 +658,7 @@ static void tun_net_init(struct net_device *dev)
/* Character device part */ /* Character device part */
/* Poll */ /* Poll */
static unsigned int tun_chr_poll(struct file *file, poll_table * wait) static unsigned int tun_chr_poll(struct file *file, poll_table *wait)
{ {
struct tun_file *tfile = file->private_data; struct tun_file *tfile = file->private_data;
struct tun_struct *tun = __tun_get(tfile); struct tun_struct *tun = __tun_get(tfile);
...@@ -995,7 +1103,7 @@ static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile, ...@@ -995,7 +1103,7 @@ static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
schedule(); schedule();
continue; continue;
} }
netif_wake_queue(tun->dev); netif_wake_subqueue(tun->dev, tfile->queue_index);
ret = tun_put_user(tun, tfile, skb, iv, len); ret = tun_put_user(tun, tfile, skb, iv, len);
kfree_skb(skb); kfree_skb(skb);
...@@ -1156,6 +1264,9 @@ static int tun_flags(struct tun_struct *tun) ...@@ -1156,6 +1264,9 @@ static int tun_flags(struct tun_struct *tun)
if (tun->flags & TUN_VNET_HDR) if (tun->flags & TUN_VNET_HDR)
flags |= IFF_VNET_HDR; flags |= IFF_VNET_HDR;
if (tun->flags & TUN_TAP_MQ)
flags |= IFF_MULTI_QUEUE;
return flags; return flags;
} }
...@@ -1247,8 +1358,9 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) ...@@ -1247,8 +1358,9 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
if (*ifr->ifr_name) if (*ifr->ifr_name)
name = ifr->ifr_name; name = ifr->ifr_name;
dev = alloc_netdev(sizeof(struct tun_struct), name, dev = alloc_netdev_mqs(sizeof(struct tun_struct), name,
tun_setup); tun_setup,
MAX_TAP_QUEUES, MAX_TAP_QUEUES);
if (!dev) if (!dev)
return -ENOMEM; return -ENOMEM;
...@@ -1283,7 +1395,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) ...@@ -1283,7 +1395,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
err = tun_attach(tun, file); err = tun_attach(tun, file);
if (err < 0) if (err < 0)
goto failed; goto err_free_dev;
} }
tun_debug(KERN_INFO, tun, "tun_set_iff\n"); tun_debug(KERN_INFO, tun, "tun_set_iff\n");
...@@ -1303,18 +1415,22 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) ...@@ -1303,18 +1415,22 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
else else
tun->flags &= ~TUN_VNET_HDR; tun->flags &= ~TUN_VNET_HDR;
if (ifr->ifr_flags & IFF_MULTI_QUEUE)
tun->flags |= TUN_TAP_MQ;
else
tun->flags &= ~TUN_TAP_MQ;
/* Make sure persistent devices do not get stuck in /* Make sure persistent devices do not get stuck in
* xoff state. * xoff state.
*/ */
if (netif_running(tun->dev)) if (netif_running(tun->dev))
netif_wake_queue(tun->dev); netif_tx_wake_all_queues(tun->dev);
strcpy(ifr->ifr_name, tun->dev->name); strcpy(ifr->ifr_name, tun->dev->name);
return 0; return 0;
err_free_dev: err_free_dev:
free_netdev(dev); free_netdev(dev);
failed:
return err; return err;
} }
...@@ -1369,6 +1485,51 @@ static int set_offload(struct tun_struct *tun, unsigned long arg) ...@@ -1369,6 +1485,51 @@ static int set_offload(struct tun_struct *tun, unsigned long arg)
return 0; return 0;
} }
/* Detach the socket filter from the first n queues of the device and mark
 * the device as having no filter attached.
 */
static void tun_detach_filter(struct tun_struct *tun, int n)
{
	int q = 0;

	while (q < n) {
		struct tun_file *tfile =
			rcu_dereference_protected(tun->tfiles[q],
						  lockdep_rtnl_is_held());

		sk_detach_filter(tfile->socket.sk);
		q++;
	}

	tun->filter_attached = false;
}
/* Attach tun->fprog to the socket of every queue currently on the device.
 *
 * If attaching fails on some queue, the filter is detached again from the
 * queues that already got it and the error is returned. On success the
 * device is marked as having a filter attached and 0 is returned.
 */
static int tun_attach_filter(struct tun_struct *tun)
{
	int q;

	for (q = 0; q < tun->numqueues; q++) {
		struct tun_file *tfile =
			rcu_dereference_protected(tun->tfiles[q],
						  lockdep_rtnl_is_held());
		int err = sk_attach_filter(&tun->fprog, tfile->socket.sk);

		if (!err)
			continue;

		/* Roll back the queues handled so far. */
		tun_detach_filter(tun, q);
		return err;
	}

	tun->filter_attached = true;
	return 0;
}
/* Copy the device-wide send buffer size (tun->sndbuf) to the socket of
 * every queue currently attached to the device.
 */
static void tun_set_sndbuf(struct tun_struct *tun)
{
	int q = 0;

	while (q < tun->numqueues) {
		struct tun_file *tfile =
			rcu_dereference_protected(tun->tfiles[q],
						  lockdep_rtnl_is_held());

		tfile->socket.sk->sk_sndbuf = tun->sndbuf;
		q++;
	}
}
static long __tun_chr_ioctl(struct file *file, unsigned int cmd, static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
unsigned long arg, int ifreq_len) unsigned long arg, int ifreq_len)
{ {
...@@ -1397,6 +1558,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, ...@@ -1397,6 +1558,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
(unsigned int __user*)argp); (unsigned int __user*)argp);
} }
ret = 0;
rtnl_lock(); rtnl_lock();
tun = __tun_get(tfile); tun = __tun_get(tfile);
...@@ -1537,7 +1699,8 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, ...@@ -1537,7 +1699,8 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
break; break;
} }
tun->sndbuf = tfile->socket.sk->sk_sndbuf = sndbuf; tun->sndbuf = sndbuf;
tun_set_sndbuf(tun);
break; break;
case TUNGETVNETHDRSZ: case TUNGETVNETHDRSZ:
...@@ -1568,9 +1731,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, ...@@ -1568,9 +1731,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
if (copy_from_user(&tun->fprog, argp, sizeof(tun->fprog))) if (copy_from_user(&tun->fprog, argp, sizeof(tun->fprog)))
break; break;
ret = sk_attach_filter(&tun->fprog, tfile->socket.sk); ret = tun_attach_filter(tun);
if (!ret)
tun->filter_attached = true;
break; break;
case TUNDETACHFILTER: case TUNDETACHFILTER:
...@@ -1578,9 +1739,8 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, ...@@ -1578,9 +1739,8 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
ret = -EINVAL; ret = -EINVAL;
if ((tun->flags & TUN_TYPE_MASK) != TUN_TAP_DEV) if ((tun->flags & TUN_TYPE_MASK) != TUN_TAP_DEV)
break; break;
ret = sk_detach_filter(tfile->socket.sk); ret = 0;
if (!ret) tun_detach_filter(tun, tun->numqueues);
tun->filter_attached = false;
break; break;
default: default:
...@@ -1685,37 +1845,9 @@ static int tun_chr_open(struct inode *inode, struct file * file) ...@@ -1685,37 +1845,9 @@ static int tun_chr_open(struct inode *inode, struct file * file)
static int tun_chr_close(struct inode *inode, struct file *file) static int tun_chr_close(struct inode *inode, struct file *file)
{ {
struct tun_file *tfile = file->private_data; struct tun_file *tfile = file->private_data;
struct tun_struct *tun;
struct net *net = tfile->net; struct net *net = tfile->net;
rtnl_lock(); tun_detach(tfile, true);
tun = rcu_dereference_protected(tfile->tun, lockdep_rtnl_is_held());
if (tun) {
struct net_device *dev = tun->dev;
tun_debug(KERN_INFO, tun, "tun_chr_close\n");
__tun_detach(tun);
synchronize_net();
/* If desirable, unregister the netdevice. */
if (!(tun->flags & TUN_PERSIST)) {
if (dev->reg_state == NETREG_REGISTERED)
unregister_netdevice(dev);
}
/* drop the reference that netdevice holds */
sock_put(&tfile->sk);
}
rtnl_unlock();
/* drop the reference that file holds */
BUG_ON(!test_bit(SOCK_EXTERNALLY_ALLOCATED,
&tfile->socket.flags));
sk_release_kernel(&tfile->sk);
put_net(net); put_net(net);
return 0; return 0;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册