提交 ccdfcc39 编写于 作者: P Patrick McHardy 提交者: David S. Miller

netlink: mmaped netlink: ring setup

Add support for mmap'ed RX and TX ring setup and teardown based on the
af_packet.c code. The following patches will use this to add the real
mmap'ed receive and transmit functionality.
Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
上级 cf0a018a
#ifndef _UAPI__LINUX_NETLINK_H #ifndef _UAPI__LINUX_NETLINK_H
#define _UAPI__LINUX_NETLINK_H #define _UAPI__LINUX_NETLINK_H
#include <linux/kernel.h>
#include <linux/socket.h> /* for __kernel_sa_family_t */ #include <linux/socket.h> /* for __kernel_sa_family_t */
#include <linux/types.h> #include <linux/types.h>
...@@ -105,11 +106,42 @@ struct nlmsgerr { ...@@ -105,11 +106,42 @@ struct nlmsgerr {
#define NETLINK_PKTINFO 3 #define NETLINK_PKTINFO 3
#define NETLINK_BROADCAST_ERROR 4 #define NETLINK_BROADCAST_ERROR 4
#define NETLINK_NO_ENOBUFS 5 #define NETLINK_NO_ENOBUFS 5
#define NETLINK_RX_RING 6
#define NETLINK_TX_RING 7
struct nl_pktinfo { struct nl_pktinfo {
__u32 group; __u32 group;
}; };
/*
 * Ring geometry supplied by user space through the NETLINK_RX_RING and
 * NETLINK_TX_RING socket options. The constraints noted below are the ones
 * enforced by netlink_set_ring().
 */
struct nl_mmap_req {
/* Size of one block in bytes; must be positive and a multiple of the page size. */
unsigned int nm_block_size;
/* Number of blocks; 0 requests release of the ring (nm_frame_nr must then be 0 too). */
unsigned int nm_block_nr;
/* Size of one frame in bytes; at least NL_MMAP_HDRLEN and a multiple of NL_MMAP_MSG_ALIGNMENT. */
unsigned int nm_frame_size;
/* Total number of frames; must equal (nm_block_size / nm_frame_size) * nm_block_nr. */
unsigned int nm_frame_nr;
};
/*
 * Header whose aligned size defines NL_MMAP_HDRLEN, the minimum frame size
 * accepted when a ring is set up.
 * NOTE(review): presumably stored at the start of every ring frame; the code
 * that reads and writes it is not part of this change - confirm.
 */
struct nl_mmap_hdr {
/* NOTE(review): presumably one of enum nl_mmap_status - confirm. */
unsigned int nm_status;
/* NOTE(review): presumably the length of the message in the frame - confirm. */
unsigned int nm_len;
/* NOTE(review): presumably the netlink multicast group of the message - confirm. */
__u32 nm_group;
/* Credentials (pid/uid/gid) associated with the message. */
__u32 nm_pid;
__u32 nm_uid;
__u32 nm_gid;
};
/*
 * Status values for mmap'ed netlink frames.
 * NOTE(review): presumably the values carried in nl_mmap_hdr.nm_status to
 * hand frames back and forth between kernel and user space; the code that
 * interprets them is not part of this change, so the exact meaning of each
 * state must be confirmed against its users.
 */
enum nl_mmap_status {
NL_MMAP_STATUS_UNUSED,
NL_MMAP_STATUS_RESERVED,
NL_MMAP_STATUS_VALID,
NL_MMAP_STATUS_COPY,
NL_MMAP_STATUS_SKIP,
};
/* Frame sizes must be a multiple of this value; it mirrors NLMSG_ALIGNTO. */
#define NL_MMAP_MSG_ALIGNMENT NLMSG_ALIGNTO
/* Round sz up to the next multiple of NL_MMAP_MSG_ALIGNMENT. */
#define NL_MMAP_MSG_ALIGN(sz) __ALIGN_KERNEL(sz, NL_MMAP_MSG_ALIGNMENT)
/* Size of struct nl_mmap_hdr after alignment; also the smallest valid frame size. */
#define NL_MMAP_HDRLEN NL_MMAP_MSG_ALIGN(sizeof(struct nl_mmap_hdr))
#define NET_MAJOR 36 /* Major 36 is reserved for networking */ #define NET_MAJOR 36 /* Major 36 is reserved for networking */
enum { enum {
......
...@@ -23,6 +23,15 @@ menuconfig NET ...@@ -23,6 +23,15 @@ menuconfig NET
if NET if NET
config NETLINK_MMAP
bool "Netlink: mmaped IO"
help
This option enables support for memory mapped netlink IO. This
reduces overhead by avoiding copying data between kernel- and
userspace.
If unsure, say N.
config WANT_COMPAT_NETLINK_MESSAGES config WANT_COMPAT_NETLINK_MESSAGES
bool bool
help help
......
...@@ -55,6 +55,7 @@ ...@@ -55,6 +55,7 @@
#include <linux/types.h> #include <linux/types.h>
#include <linux/audit.h> #include <linux/audit.h>
#include <linux/mutex.h> #include <linux/mutex.h>
#include <linux/vmalloc.h>
#include <net/net_namespace.h> #include <net/net_namespace.h>
#include <net/sock.h> #include <net/sock.h>
...@@ -107,6 +108,234 @@ static inline struct hlist_head *nl_portid_hashfn(struct nl_portid_hash *hash, u ...@@ -107,6 +108,234 @@ static inline struct hlist_head *nl_portid_hashfn(struct nl_portid_hash *hash, u
return &hash->table[jhash_1word(portid, hash->rnd) & hash->mask]; return &hash->table[jhash_1word(portid, hash->rnd) & hash->mask];
} }
#ifdef CONFIG_NETLINK_MMAP
/*
 * Translate a kernel virtual address inside a ring block into its
 * struct page. Blocks are either vmalloc-backed or obtained from the page
 * allocator, so the lookup helper is chosen by the kind of address.
 */
static __pure struct page *pgvec_to_page(const void *addr)
{
	return is_vmalloc_addr(addr) ? vmalloc_to_page(addr) :
				       virt_to_page(addr);
}
/*
 * Release a block vector and every block it still references.
 *
 * @pg_vec: array of block addresses; NULL entries are skipped
 * @order:  page allocation order used for blocks that came from the page
 *          allocator
 * @len:    number of entries in @pg_vec
 *
 * Each block is returned with the release routine matching the way it was
 * obtained (vfree() for vmalloc addresses, free_pages() otherwise), then the
 * array itself is freed.
 */
static void free_pg_vec(void **pg_vec, unsigned int order, unsigned int len)
{
	unsigned int idx;

	for (idx = 0; idx < len; idx++) {
		void *block = pg_vec[idx];

		if (block == NULL)
			continue;

		if (is_vmalloc_addr(block))
			vfree(block);
		else
			free_pages((unsigned long)block, order);
	}

	kfree(pg_vec);
}
/*
 * Allocate one zeroed ring block of (1 << order) pages.
 *
 * Three attempts are made in sequence, stopping at the first success:
 *   1. the page allocator with __GFP_NORETRY set,
 *   2. vzalloc(),
 *   3. the page allocator again with __GFP_NORETRY cleared.
 *
 * Returns the block address, or NULL if all three attempts fail.
 */
static void *alloc_one_pg_vec_page(unsigned long order)
{
	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO |
			  __GFP_NOWARN | __GFP_NORETRY;
	void *block;

	block = (void *)__get_free_pages(gfp_flags, order);

	if (block == NULL)
		block = vzalloc((1 << order) * PAGE_SIZE);

	if (block == NULL)
		block = (void *)__get_free_pages(gfp_flags & ~__GFP_NORETRY,
						 order);

	return block;
}
/*
 * Allocate the block vector for a ring.
 *
 * @nlk:   owning netlink socket (currently unused, kept for the callers)
 * @req:   ring request; only nm_block_nr is read here
 * @order: page allocation order of every block
 *
 * Returns a kcalloc()'ed array of req->nm_block_nr block addresses, or NULL
 * on allocation failure. On failure everything allocated so far is released:
 * the array is zero-initialised, so free_pg_vec() skips the slots that were
 * never filled.
 */
static void **alloc_pg_vec(struct netlink_sock *nlk,
			   struct nl_mmap_req *req, unsigned int order)
{
	unsigned int block_nr = req->nm_block_nr;
	unsigned int i;
	void **pg_vec;

	pg_vec = kcalloc(block_nr, sizeof(void *), GFP_KERNEL);
	if (pg_vec == NULL)
		return NULL;

	for (i = 0; i < block_nr; i++) {
		pg_vec[i] = alloc_one_pg_vec_page(order);
		if (pg_vec[i] == NULL)
			goto err1;
	}

	return pg_vec;
err1:
	free_pg_vec(pg_vec, order, block_nr);
	return NULL;
}
/*
 * Install, replace or tear down the RX or TX ring of a netlink socket.
 *
 * @sk:      socket owning the ring
 * @req:     requested geometry; nm_block_nr == 0 releases the ring.
 *           Note that req->nm_block_nr is swapped with the ring's previous
 *           block count, so the caller's copy is modified.
 * @closing: true when called from socket destruction; skips the "mapped" and
 *           "pending" busy checks
 * @tx_ring: true to operate on the TX ring, false for the RX ring
 *
 * Returns 0 on success, -EBUSY if the ring is mapped, has pending frames or
 * already exists, -EINVAL for an inconsistent geometry, -ENOMEM if the blocks
 * cannot be allocated.
 */
static int netlink_set_ring(struct sock *sk, struct nl_mmap_req *req,
			    bool closing, bool tx_ring)
{
	struct netlink_sock *nlk = nlk_sk(sk);
	struct netlink_ring *ring;
	struct sk_buff_head *queue;
	void **pg_vec = NULL;
	unsigned int order = 0;
	int err;

	ring  = tx_ring ? &nlk->tx_ring : &nlk->rx_ring;
	queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;

	if (!closing) {
		if (atomic_read(&nlk->mapped))
			return -EBUSY;
		if (atomic_read(&ring->pending))
			return -EBUSY;
	}

	if (req->nm_block_nr) {
		if (ring->pg_vec != NULL)
			return -EBUSY;

		if ((int)req->nm_block_size <= 0)
			return -EINVAL;
		if (!IS_ALIGNED(req->nm_block_size, PAGE_SIZE))
			return -EINVAL;
		if (req->nm_frame_size < NL_MMAP_HDRLEN)
			return -EINVAL;
		if (!IS_ALIGNED(req->nm_frame_size, NL_MMAP_MSG_ALIGNMENT))
			return -EINVAL;

		ring->frames_per_block = req->nm_block_size /
					 req->nm_frame_size;
		if (ring->frames_per_block == 0)
			return -EINVAL;
		/*
		 * The frame count below is computed in unsigned int. Reject
		 * geometries whose product would wrap, otherwise a wrapped
		 * value could happen to match nm_frame_nr and an inconsistent
		 * ring description would be accepted. nm_block_nr is known to
		 * be non-zero in this branch.
		 */
		if (ring->frames_per_block > ~0U / req->nm_block_nr)
			return -EINVAL;
		if (ring->frames_per_block * req->nm_block_nr !=
		    req->nm_frame_nr)
			return -EINVAL;

		order = get_order(req->nm_block_size);
		pg_vec = alloc_pg_vec(nlk, req, order);
		if (pg_vec == NULL)
			return -ENOMEM;
	} else {
		if (req->nm_frame_nr)
			return -EINVAL;
	}

	err = -EBUSY;
	mutex_lock(&nlk->pg_vec_lock);
	if (closing || atomic_read(&nlk->mapped) == 0) {
		err = 0;
		spin_lock_bh(&queue->lock);

		ring->frame_max		= req->nm_frame_nr - 1;
		ring->head		= 0;
		ring->frame_size	= req->nm_frame_size;
		ring->pg_vec_pages	= req->nm_block_size / PAGE_SIZE;

		/*
		 * Exchange new and old ring state: after the swaps the locals
		 * (and req->nm_block_nr) describe the previous ring, which is
		 * released below once the locks are dropped.
		 */
		swap(ring->pg_vec_len, req->nm_block_nr);
		swap(ring->pg_vec_order, order);
		swap(ring->pg_vec, pg_vec);

		__skb_queue_purge(queue);
		spin_unlock_bh(&queue->lock);

		WARN_ON(atomic_read(&nlk->mapped));
	}
	mutex_unlock(&nlk->pg_vec_lock);

	/*
	 * Frees the old ring on success, or the freshly allocated one if the
	 * socket turned out to be mapped and nothing was swapped.
	 */
	if (pg_vec)
		free_pg_vec(pg_vec, order, req->nm_block_nr);
	return err;
}
static void netlink_mm_open(struct vm_area_struct *vma)
{
struct file *file = vma->vm_file;
struct socket *sock = file->private_data;
struct sock *sk = sock->sk;
if (sk)
atomic_inc(&nlk_sk(sk)->mapped);
}
static void netlink_mm_close(struct vm_area_struct *vma)
{
struct file *file = vma->vm_file;
struct socket *sock = file->private_data;
struct sock *sk = sock->sk;
if (sk)
atomic_dec(&nlk_sk(sk)->mapped);
}
/*
 * VMA callbacks installed by netlink_mmap(); they keep the socket's "mapped"
 * counter in step with the number of live mappings.
 */
static const struct vm_operations_struct netlink_mmap_ops = {
	.close	= netlink_mm_close,
	.open	= netlink_mm_open,
};
/*
 * mmap() handler for netlink sockets: map the configured rings into @vma.
 *
 * @file: file backing the socket (unused here)
 * @sock: socket whose rings are mapped
 * @vma:  target VMA; must start at page offset 0 and be exactly as large as
 *        all configured rings together
 *
 * The RX ring (if any) is mapped first, immediately followed by the TX ring
 * (if any). On success the socket's "mapped" counter is incremented and the
 * netlink VMA operations are installed.
 *
 * Returns 0 on success, -EINVAL if no ring is configured, the offset is
 * non-zero or the size does not match, or the error from vm_insert_page().
 */
static int netlink_mmap(struct file *file, struct socket *sock,
			struct vm_area_struct *vma)
{
	struct sock *sk = sock->sk;
	struct netlink_sock *nlk = nlk_sk(sk);
	struct netlink_ring *ring;
	unsigned long start, size, expected;
	unsigned int i;
	int err = -EINVAL;

	if (vma->vm_pgoff)
		return -EINVAL;

	mutex_lock(&nlk->pg_vec_lock);

	/*
	 * Walk rx_ring then tx_ring; this relies on the two rings being
	 * adjacent members of struct netlink_sock, in that order.
	 */
	expected = 0;
	for (ring = &nlk->rx_ring; ring <= &nlk->tx_ring; ring++) {
		if (ring->pg_vec == NULL)
			continue;
		expected += ring->pg_vec_len * ring->pg_vec_pages * PAGE_SIZE;
	}

	if (expected == 0)
		goto out;

	size = vma->vm_end - vma->vm_start;
	if (size != expected)
		goto out;

	start = vma->vm_start;
	for (ring = &nlk->rx_ring; ring <= &nlk->tx_ring; ring++) {
		if (ring->pg_vec == NULL)
			continue;

		for (i = 0; i < ring->pg_vec_len; i++) {
			struct page *page;
			void *kaddr = ring->pg_vec[i];
			unsigned int pg_num;

			/* Insert the block page by page at consecutive addresses. */
			for (pg_num = 0; pg_num < ring->pg_vec_pages; pg_num++) {
				page = pgvec_to_page(kaddr);
				err = vm_insert_page(vma, start, page);
				if (err < 0)
					goto out;
				start += PAGE_SIZE;
				kaddr += PAGE_SIZE;
			}
		}
	}

	atomic_inc(&nlk->mapped);
	vma->vm_ops = &netlink_mmap_ops;
	err = 0;
out:
	mutex_unlock(&nlk->pg_vec_lock);
	/* Propagate the failure; returning 0 here would report a broken mapping as success. */
	return err;
}
#else /* CONFIG_NETLINK_MMAP */
#define netlink_mmap sock_no_mmap
#endif /* CONFIG_NETLINK_MMAP */
static void netlink_destroy_callback(struct netlink_callback *cb) static void netlink_destroy_callback(struct netlink_callback *cb)
{ {
kfree_skb(cb->skb); kfree_skb(cb->skb);
...@@ -146,6 +375,18 @@ static void netlink_sock_destruct(struct sock *sk) ...@@ -146,6 +375,18 @@ static void netlink_sock_destruct(struct sock *sk)
} }
skb_queue_purge(&sk->sk_receive_queue); skb_queue_purge(&sk->sk_receive_queue);
#ifdef CONFIG_NETLINK_MMAP
if (1) {
struct nl_mmap_req req;
memset(&req, 0, sizeof(req));
if (nlk->rx_ring.pg_vec)
netlink_set_ring(sk, &req, true, false);
memset(&req, 0, sizeof(req));
if (nlk->tx_ring.pg_vec)
netlink_set_ring(sk, &req, true, true);
}
#endif /* CONFIG_NETLINK_MMAP */
if (!sock_flag(sk, SOCK_DEAD)) { if (!sock_flag(sk, SOCK_DEAD)) {
printk(KERN_ERR "Freeing alive netlink socket %p\n", sk); printk(KERN_ERR "Freeing alive netlink socket %p\n", sk);
...@@ -409,6 +650,9 @@ static int __netlink_create(struct net *net, struct socket *sock, ...@@ -409,6 +650,9 @@ static int __netlink_create(struct net *net, struct socket *sock,
mutex_init(nlk->cb_mutex); mutex_init(nlk->cb_mutex);
} }
init_waitqueue_head(&nlk->wait); init_waitqueue_head(&nlk->wait);
#ifdef CONFIG_NETLINK_MMAP
mutex_init(&nlk->pg_vec_lock);
#endif
sk->sk_destruct = netlink_sock_destruct; sk->sk_destruct = netlink_sock_destruct;
sk->sk_protocol = protocol; sk->sk_protocol = protocol;
...@@ -1211,7 +1455,8 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, ...@@ -1211,7 +1455,8 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname,
if (level != SOL_NETLINK) if (level != SOL_NETLINK)
return -ENOPROTOOPT; return -ENOPROTOOPT;
if (optlen >= sizeof(int) && if (optname != NETLINK_RX_RING && optname != NETLINK_TX_RING &&
optlen >= sizeof(int) &&
get_user(val, (unsigned int __user *)optval)) get_user(val, (unsigned int __user *)optval))
return -EFAULT; return -EFAULT;
...@@ -1260,6 +1505,25 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, ...@@ -1260,6 +1505,25 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname,
} }
err = 0; err = 0;
break; break;
#ifdef CONFIG_NETLINK_MMAP
case NETLINK_RX_RING:
case NETLINK_TX_RING: {
struct nl_mmap_req req;
/* Rings might consume more memory than queue limits, require
* CAP_NET_ADMIN.
*/
if (!capable(CAP_NET_ADMIN))
return -EPERM;
if (optlen < sizeof(req))
return -EINVAL;
if (copy_from_user(&req, optval, sizeof(req)))
return -EFAULT;
err = netlink_set_ring(sk, &req, false,
optname == NETLINK_TX_RING);
break;
}
#endif /* CONFIG_NETLINK_MMAP */
default: default:
err = -ENOPROTOOPT; err = -ENOPROTOOPT;
} }
...@@ -2093,7 +2357,7 @@ static const struct proto_ops netlink_ops = { ...@@ -2093,7 +2357,7 @@ static const struct proto_ops netlink_ops = {
.getsockopt = netlink_getsockopt, .getsockopt = netlink_getsockopt,
.sendmsg = netlink_sendmsg, .sendmsg = netlink_sendmsg,
.recvmsg = netlink_recvmsg, .recvmsg = netlink_recvmsg,
.mmap = sock_no_mmap, .mmap = netlink_mmap,
.sendpage = sock_no_sendpage, .sendpage = sock_no_sendpage,
}; };
......
...@@ -6,6 +6,20 @@ ...@@ -6,6 +6,20 @@
#define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8) #define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8)
#define NLGRPLONGS(x) (NLGRPSZ(x)/sizeof(unsigned long)) #define NLGRPLONGS(x) (NLGRPSZ(x)/sizeof(unsigned long))
/*
 * State of one mmap'ed ring (RX or TX) of a netlink socket. The geometry
 * fields are filled in by netlink_set_ring() from a struct nl_mmap_req.
 */
struct netlink_ring {
/* Array of block addresses; NULL while no ring is configured. */
void **pg_vec;
/* Reset to 0 whenever the ring is (re)configured. */
unsigned int head;
/* nm_block_size / nm_frame_size of the request. */
unsigned int frames_per_block;
/* Size of one frame in bytes (nm_frame_size). */
unsigned int frame_size;
/* nm_frame_nr - 1 of the request. */
unsigned int frame_max;
/* Page allocation order of each block, as passed to free_pg_vec(). */
unsigned int pg_vec_order;
/* Pages per block (nm_block_size / PAGE_SIZE). */
unsigned int pg_vec_pages;
/* Number of entries in pg_vec (nm_block_nr). */
unsigned int pg_vec_len;
/* While non-zero, the ring cannot be reconfigured unless the socket is closing. */
atomic_t pending;
};
struct netlink_sock { struct netlink_sock {
/* struct sock has to be the first member of netlink_sock */ /* struct sock has to be the first member of netlink_sock */
struct sock sk; struct sock sk;
...@@ -24,6 +38,12 @@ struct netlink_sock { ...@@ -24,6 +38,12 @@ struct netlink_sock {
void (*netlink_rcv)(struct sk_buff *skb); void (*netlink_rcv)(struct sk_buff *skb);
void (*netlink_bind)(int group); void (*netlink_bind)(int group);
struct module *module; struct module *module;
#ifdef CONFIG_NETLINK_MMAP
struct mutex pg_vec_lock;
struct netlink_ring rx_ring;
struct netlink_ring tx_ring;
atomic_t mapped;
#endif /* CONFIG_NETLINK_MMAP */
}; };
static inline struct netlink_sock *nlk_sk(struct sock *sk) static inline struct netlink_sock *nlk_sk(struct sock *sk)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册