提交 b9d37bbb 编写于 作者: D David S. Miller

Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf

Alexei Starovoitov says:

====================
pull-request: bpf 2020-06-17

The following pull-request contains BPF updates for your *net* tree.

We've added 10 non-merge commits during the last 2 day(s) which contain
a total of 14 files changed, 158 insertions(+), 59 deletions(-).

The main changes are:

1) Important fix for bpf_probe_read_kernel_str() return value, from Andrii.

2) [gs]etsockopt fix for large optlen, from Stanislav.

3) devmap allocation fix, from Toke.
====================
Signed-off-by: NDavid S. Miller <davem@davemloft.net>
......@@ -86,6 +86,20 @@ then the next program in the chain (A) will see those changes,
*not* the original input ``setsockopt`` arguments. The potentially
modified values will be then passed down to the kernel.
Large optval
============
When the ``optval`` is greater than the ``PAGE_SIZE``, the BPF program
can access only the first ``PAGE_SIZE`` of that data. So it has to options:
* Set ``optlen`` to zero, which indicates that the kernel should
use the original buffer from the userspace. Any modifications
done by the BPF program to the ``optval`` are ignored.
* Set ``optlen`` to the value less than ``PAGE_SIZE``, which
indicates that the kernel should use BPF's trimmed ``optval``.
When the BPF program returns with the ``optlen`` greater than
``PAGE_SIZE``, the userspace will receive ``EFAULT`` errno.
Example
=======
......
......@@ -3168,7 +3168,7 @@ union bpf_attr {
* Return
* The id is returned or 0 in case the id could not be retrieved.
*
* void *bpf_ringbuf_output(void *ringbuf, void *data, u64 size, u64 flags)
* int bpf_ringbuf_output(void *ringbuf, void *data, u64 size, u64 flags)
* Description
* Copy *size* bytes from *data* into a ring buffer *ringbuf*.
* If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of
......
......@@ -1276,16 +1276,23 @@ static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp,
static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
{
if (unlikely(max_optlen > PAGE_SIZE) || max_optlen < 0)
if (unlikely(max_optlen < 0))
return -EINVAL;
if (unlikely(max_optlen > PAGE_SIZE)) {
/* We don't expose optvals that are greater than PAGE_SIZE
* to the BPF program.
*/
max_optlen = PAGE_SIZE;
}
ctx->optval = kzalloc(max_optlen, GFP_USER);
if (!ctx->optval)
return -ENOMEM;
ctx->optval_end = ctx->optval + max_optlen;
return 0;
return max_optlen;
}
static void sockopt_free_buf(struct bpf_sockopt_kern *ctx)
......@@ -1319,13 +1326,13 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
*/
max_optlen = max_t(int, 16, *optlen);
ret = sockopt_alloc_buf(&ctx, max_optlen);
if (ret)
return ret;
max_optlen = sockopt_alloc_buf(&ctx, max_optlen);
if (max_optlen < 0)
return max_optlen;
ctx.optlen = *optlen;
if (copy_from_user(ctx.optval, optval, *optlen) != 0) {
if (copy_from_user(ctx.optval, optval, min(*optlen, max_optlen)) != 0) {
ret = -EFAULT;
goto out;
}
......@@ -1353,8 +1360,14 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
/* export any potential modifications */
*level = ctx.level;
*optname = ctx.optname;
*optlen = ctx.optlen;
*kernel_optval = ctx.optval;
/* optlen == 0 from BPF indicates that we should
* use original userspace data.
*/
if (ctx.optlen != 0) {
*optlen = ctx.optlen;
*kernel_optval = ctx.optval;
}
}
out:
......@@ -1385,12 +1398,12 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
__cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT))
return retval;
ret = sockopt_alloc_buf(&ctx, max_optlen);
if (ret)
return ret;
ctx.optlen = max_optlen;
max_optlen = sockopt_alloc_buf(&ctx, max_optlen);
if (max_optlen < 0)
return max_optlen;
if (!retval) {
/* If kernel getsockopt finished successfully,
* copy whatever was returned to the user back
......@@ -1404,10 +1417,8 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
goto out;
}
if (ctx.optlen > max_optlen)
ctx.optlen = max_optlen;
if (copy_from_user(ctx.optval, optval, ctx.optlen) != 0) {
if (copy_from_user(ctx.optval, optval,
min(ctx.optlen, max_optlen)) != 0) {
ret = -EFAULT;
goto out;
}
......@@ -1436,10 +1447,12 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
goto out;
}
if (copy_to_user(optval, ctx.optval, ctx.optlen) ||
put_user(ctx.optlen, optlen)) {
ret = -EFAULT;
goto out;
if (ctx.optlen != 0) {
if (copy_to_user(optval, ctx.optval, ctx.optlen) ||
put_user(ctx.optlen, optlen)) {
ret = -EFAULT;
goto out;
}
}
ret = ctx.retval;
......
......@@ -86,12 +86,13 @@ static DEFINE_PER_CPU(struct list_head, dev_flush_list);
static DEFINE_SPINLOCK(dev_map_lock);
static LIST_HEAD(dev_map_list);
static struct hlist_head *dev_map_create_hash(unsigned int entries)
static struct hlist_head *dev_map_create_hash(unsigned int entries,
int numa_node)
{
int i;
struct hlist_head *hash;
hash = kmalloc_array(entries, sizeof(*hash), GFP_KERNEL);
hash = bpf_map_area_alloc(entries * sizeof(*hash), numa_node);
if (hash != NULL)
for (i = 0; i < entries; i++)
INIT_HLIST_HEAD(&hash[i]);
......@@ -145,7 +146,8 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
return -EINVAL;
if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets);
dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets,
dtab->map.numa_node);
if (!dtab->dev_index_head)
goto free_charge;
......@@ -232,7 +234,7 @@ static void dev_map_free(struct bpf_map *map)
}
}
kfree(dtab->dev_index_head);
bpf_map_area_free(dtab->dev_index_head);
} else {
for (i = 0; i < dtab->map.max_entries; i++) {
struct bpf_dtab_netdev *dev;
......
......@@ -241,7 +241,7 @@ bpf_probe_read_kernel_str_common(void *dst, u32 size, const void *unsafe_ptr)
if (unlikely(ret < 0))
goto fail;
return 0;
return ret;
fail:
memset(dst, 0, size);
return ret;
......
......@@ -462,6 +462,7 @@ struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp)
xdpf->len = totsize - metasize;
xdpf->headroom = 0;
xdpf->metasize = metasize;
xdpf->frame_sz = PAGE_SIZE;
xdpf->mem.type = MEM_TYPE_PAGE_ORDER0;
xsk_buff_free(xdp);
......
......@@ -509,11 +509,8 @@ static void *alloc_rec_per_cpu(int record_size)
{
unsigned int nr_cpus = bpf_num_possible_cpus();
void *array;
size_t size;
size = record_size * nr_cpus;
array = malloc(size);
memset(array, 0, size);
array = calloc(nr_cpus, record_size);
if (!array) {
fprintf(stderr, "Mem alloc error (nr_cpus:%u)\n", nr_cpus);
exit(EXIT_FAIL_MEM);
......@@ -528,8 +525,7 @@ static struct stats_record *alloc_stats_record(void)
int i;
/* Alloc main stats_record structure */
rec = malloc(sizeof(*rec));
memset(rec, 0, sizeof(*rec));
rec = calloc(1, sizeof(*rec));
if (!rec) {
fprintf(stderr, "Mem alloc error\n");
exit(EXIT_FAIL_MEM);
......
......@@ -207,11 +207,8 @@ static struct datarec *alloc_record_per_cpu(void)
{
unsigned int nr_cpus = bpf_num_possible_cpus();
struct datarec *array;
size_t size;
size = sizeof(struct datarec) * nr_cpus;
array = malloc(size);
memset(array, 0, size);
array = calloc(nr_cpus, sizeof(struct datarec));
if (!array) {
fprintf(stderr, "Mem alloc error (nr_cpus:%u)\n", nr_cpus);
exit(EXIT_FAIL_MEM);
......@@ -226,11 +223,11 @@ static struct stats_record *alloc_stats_record(void)
size = sizeof(*rec) + n_cpus * sizeof(struct record);
rec = malloc(size);
memset(rec, 0, size);
if (!rec) {
fprintf(stderr, "Mem alloc error\n");
exit(EXIT_FAIL_MEM);
}
memset(rec, 0, size);
rec->rx_cnt.cpu = alloc_record_per_cpu();
rec->redir_err.cpu = alloc_record_per_cpu();
rec->kthread.cpu = alloc_record_per_cpu();
......
......@@ -198,11 +198,8 @@ static struct datarec *alloc_record_per_cpu(void)
{
unsigned int nr_cpus = bpf_num_possible_cpus();
struct datarec *array;
size_t size;
size = sizeof(struct datarec) * nr_cpus;
array = malloc(size);
memset(array, 0, size);
array = calloc(nr_cpus, sizeof(struct datarec));
if (!array) {
fprintf(stderr, "Mem alloc error (nr_cpus:%u)\n", nr_cpus);
exit(EXIT_FAIL_MEM);
......@@ -214,11 +211,8 @@ static struct record *alloc_record_per_rxq(void)
{
unsigned int nr_rxqs = bpf_map__def(rx_queue_index_map)->max_entries;
struct record *array;
size_t size;
size = sizeof(struct record) * nr_rxqs;
array = malloc(size);
memset(array, 0, size);
array = calloc(nr_rxqs, sizeof(struct record));
if (!array) {
fprintf(stderr, "Mem alloc error (nr_rxqs:%u)\n", nr_rxqs);
exit(EXIT_FAIL_MEM);
......@@ -232,8 +226,7 @@ static struct stats_record *alloc_stats_record(void)
struct stats_record *rec;
int i;
rec = malloc(sizeof(*rec));
memset(rec, 0, sizeof(*rec));
rec = calloc(1, sizeof(struct stats_record));
if (!rec) {
fprintf(stderr, "Mem alloc error\n");
exit(EXIT_FAIL_MEM);
......
......@@ -49,7 +49,7 @@ MAP COMMANDS
| | **lru_percpu_hash** | **lpm_trie** | **array_of_maps** | **hash_of_maps**
| | **devmap** | **devmap_hash** | **sockmap** | **cpumap** | **xskmap** | **sockhash**
| | **cgroup_storage** | **reuseport_sockarray** | **percpu_cgroup_storage**
| | **queue** | **stack** | **sk_storage** | **struct_ops** }
| | **queue** | **stack** | **sk_storage** | **struct_ops** | **ringbuf** }
DESCRIPTION
===========
......
......@@ -49,6 +49,7 @@ const char * const map_type_name[] = {
[BPF_MAP_TYPE_STACK] = "stack",
[BPF_MAP_TYPE_SK_STORAGE] = "sk_storage",
[BPF_MAP_TYPE_STRUCT_OPS] = "struct_ops",
[BPF_MAP_TYPE_RINGBUF] = "ringbuf",
};
const size_t map_type_name_size = ARRAY_SIZE(map_type_name);
......@@ -1590,7 +1591,7 @@ static int do_help(int argc, char **argv)
" lru_percpu_hash | lpm_trie | array_of_maps | hash_of_maps |\n"
" devmap | devmap_hash | sockmap | cpumap | xskmap | sockhash |\n"
" cgroup_storage | reuseport_sockarray | percpu_cgroup_storage |\n"
" queue | stack | sk_storage | struct_ops }\n"
" queue | stack | sk_storage | struct_ops | ringbuf }\n"
" " HELP_SPEC_OPTIONS "\n"
"",
bin_name, argv[-2]);
......
......@@ -3168,7 +3168,7 @@ union bpf_attr {
* Return
* The id is returned or 0 in case the id could not be retrieved.
*
* void *bpf_ringbuf_output(void *ringbuf, void *data, u64 size, u64 flags)
* int bpf_ringbuf_output(void *ringbuf, void *data, u64 size, u64 flags)
* Description
* Copy *size* bytes from *data* into a ring buffer *ringbuf*.
* If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of
......
......@@ -13,6 +13,7 @@ static int getsetsockopt(void)
char cc[16]; /* TCP_CA_NAME_MAX */
} buf = {};
socklen_t optlen;
char *big_buf = NULL;
fd = socket(AF_INET, SOCK_STREAM, 0);
if (fd < 0) {
......@@ -22,24 +23,31 @@ static int getsetsockopt(void)
/* IP_TOS - BPF bypass */
buf.u8[0] = 0x08;
err = setsockopt(fd, SOL_IP, IP_TOS, &buf, 1);
optlen = getpagesize() * 2;
big_buf = calloc(1, optlen);
if (!big_buf) {
log_err("Couldn't allocate two pages");
goto err;
}
*(int *)big_buf = 0x08;
err = setsockopt(fd, SOL_IP, IP_TOS, big_buf, optlen);
if (err) {
log_err("Failed to call setsockopt(IP_TOS)");
goto err;
}
buf.u8[0] = 0x00;
memset(big_buf, 0, optlen);
optlen = 1;
err = getsockopt(fd, SOL_IP, IP_TOS, &buf, &optlen);
err = getsockopt(fd, SOL_IP, IP_TOS, big_buf, &optlen);
if (err) {
log_err("Failed to call getsockopt(IP_TOS)");
goto err;
}
if (buf.u8[0] != 0x08) {
log_err("Unexpected getsockopt(IP_TOS) buf[0] 0x%02x != 0x08",
buf.u8[0]);
if (*(int *)big_buf != 0x08) {
log_err("Unexpected getsockopt(IP_TOS) optval 0x%x != 0x08",
*(int *)big_buf);
goto err;
}
......@@ -78,6 +86,28 @@ static int getsetsockopt(void)
goto err;
}
/* IP_FREEBIND - BPF can't access optval past PAGE_SIZE */
optlen = getpagesize() * 2;
memset(big_buf, 0, optlen);
err = setsockopt(fd, SOL_IP, IP_FREEBIND, big_buf, optlen);
if (err != 0) {
log_err("Failed to call setsockopt, ret=%d", err);
goto err;
}
err = getsockopt(fd, SOL_IP, IP_FREEBIND, big_buf, &optlen);
if (err != 0) {
log_err("Failed to call getsockopt, ret=%d", err);
goto err;
}
if (optlen != 1 || *(__u8 *)big_buf != 0x55) {
log_err("Unexpected IP_FREEBIND getsockopt, optlen=%d, optval=0x%x",
optlen, *(__u8 *)big_buf);
}
/* SO_SNDBUF is overwritten */
buf.u32 = 0x01010101;
......@@ -124,9 +154,11 @@ static int getsetsockopt(void)
goto err;
}
free(big_buf);
close(fd);
return 0;
err:
free(big_buf);
close(fd);
return -1;
}
......
......@@ -8,6 +8,10 @@
char _license[] SEC("license") = "GPL";
__u32 _version SEC("version") = 1;
#ifndef PAGE_SIZE
#define PAGE_SIZE 4096
#endif
#define SOL_CUSTOM 0xdeadbeef
struct sockopt_sk {
......@@ -28,12 +32,14 @@ int _getsockopt(struct bpf_sockopt *ctx)
__u8 *optval = ctx->optval;
struct sockopt_sk *storage;
if (ctx->level == SOL_IP && ctx->optname == IP_TOS)
if (ctx->level == SOL_IP && ctx->optname == IP_TOS) {
/* Not interested in SOL_IP:IP_TOS;
* let next BPF program in the cgroup chain or kernel
* handle it.
*/
ctx->optlen = 0; /* bypass optval>PAGE_SIZE */
return 1;
}
if (ctx->level == SOL_SOCKET && ctx->optname == SO_SNDBUF) {
/* Not interested in SOL_SOCKET:SO_SNDBUF;
......@@ -51,6 +57,26 @@ int _getsockopt(struct bpf_sockopt *ctx)
return 1;
}
if (ctx->level == SOL_IP && ctx->optname == IP_FREEBIND) {
if (optval + 1 > optval_end)
return 0; /* EPERM, bounds check */
ctx->retval = 0; /* Reset system call return value to zero */
/* Always export 0x55 */
optval[0] = 0x55;
ctx->optlen = 1;
/* Userspace buffer is PAGE_SIZE * 2, but BPF
* program can only see the first PAGE_SIZE
* bytes of data.
*/
if (optval_end - optval != PAGE_SIZE)
return 0; /* EPERM, unexpected data size */
return 1;
}
if (ctx->level != SOL_CUSTOM)
return 0; /* EPERM, deny everything except custom level */
......@@ -81,12 +107,14 @@ int _setsockopt(struct bpf_sockopt *ctx)
__u8 *optval = ctx->optval;
struct sockopt_sk *storage;
if (ctx->level == SOL_IP && ctx->optname == IP_TOS)
if (ctx->level == SOL_IP && ctx->optname == IP_TOS) {
/* Not interested in SOL_IP:IP_TOS;
* let next BPF program in the cgroup chain or kernel
* handle it.
*/
ctx->optlen = 0; /* bypass optval>PAGE_SIZE */
return 1;
}
if (ctx->level == SOL_SOCKET && ctx->optname == SO_SNDBUF) {
/* Overwrite SO_SNDBUF value */
......@@ -112,6 +140,28 @@ int _setsockopt(struct bpf_sockopt *ctx)
return 1;
}
if (ctx->level == SOL_IP && ctx->optname == IP_FREEBIND) {
/* Original optlen is larger than PAGE_SIZE. */
if (ctx->optlen != PAGE_SIZE * 2)
return 0; /* EPERM, unexpected data size */
if (optval + 1 > optval_end)
return 0; /* EPERM, bounds check */
/* Make sure we can trim the buffer. */
optval[0] = 0;
ctx->optlen = 1;
/* Usepace buffer is PAGE_SIZE * 2, but BPF
* program can only see the first PAGE_SIZE
* bytes of data.
*/
if (optval_end - optval != PAGE_SIZE)
return 0; /* EPERM, unexpected data size */
return 1;
}
if (ctx->level != SOL_CUSTOM)
return 0; /* EPERM, deny everything except custom level */
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册