提交 85782e03 编写于 作者: D Daniel Borkmann 提交者: Alexei Starovoitov

bpf: undo prog rejection on read-only lock failure

Partially undo commit 9facc336 ("bpf: reject any prog that failed
read-only lock") since it caused a regression, that is, syzkaller was
able to manage to cause a panic via fault injection deep in set_memory_ro()
path by letting an allocation fail: In x86's __change_page_attr_set_clr()
it was able to change the attributes of the primary mapping but not in
the alias mapping via cpa_process_alias(), so the second, inner call
to the __change_page_attr() via __change_page_attr_set_clr() had to split
a larger page and failed in the alloc_pages() with the artifically triggered
allocation error which is then propagated down to the call site.

Thus, for set_memory_ro() this means that it returned with an error, but
from debugging a probe_kernel_write() revealed EFAULT on that memory since
the primary mapping succeeded to get changed. Therefore the subsequent
hdr->locked = 0 reset triggered the panic as it was performed on read-only
memory, so call-site assumptions were infact wrong to assume that it would
either succeed /or/ not succeed at all since there's no such rollback in
set_memory_*() calls from partial change of mappings, in other words, we're
left in a state that is "half done". A later undo via set_memory_rw() is
succeeding though due to matching permissions on that part (aka due to the
try_preserve_large_page() succeeding). While reproducing locally with
explicitly triggering this error, the initial splitting only happens on
rare occasions and in real world it would additionally need oom conditions,
but that said, it could partially fail. Therefore, it is definitely wrong
to bail out on set_memory_ro() error and reject the program with the
set_memory_*() semantics we have today. Shouldn't have gone the extra mile
since no other user in tree today infact checks for any set_memory_*()
errors, e.g. neither module_enable_ro() / module_disable_ro() for module
RO/NX handling which is mostly default these days nor kprobes core with
alloc_insn_page() / free_insn_page() as examples that could be invoked long
after bootup and original 314beb9b ("x86: bpf_jit_comp: secure bpf jit
against spraying attacks") did neither when it got first introduced to BPF
so "improving" with bailing out was clearly not right when set_memory_*()
cannot handle it today.

Kees suggested that if set_memory_*() can fail, we should annotate it with
__must_check, and all callers need to deal with it gracefully given those
set_memory_*() markings aren't "advisory", but they're expected to actually
do what they say. This might be an option worth to move forward in future
but would at the same time require that set_memory_*() calls from supporting
archs are guaranteed to be "atomic" in that they provide rollback if part
of the range fails, once that happened, the transition from RW -> RO could
be made more robust that way, while subsequent RO -> RW transition /must/
continue guaranteeing to always succeed the undo part.

Reported-by: syzbot+a4eb8c7766952a1ca872@syzkaller.appspotmail.com
Reported-by: syzbot+d866d1925855328eac3b@syzkaller.appspotmail.com
Fixes: 9facc336 ("bpf: reject any prog that failed read-only lock")
Cc: Laura Abbott <labbott@redhat.com>
Cc: Kees Cook <keescook@chromium.org>
Signed-off-by: NDaniel Borkmann <daniel@iogearbox.net>
Acked-by: NAlexei Starovoitov <ast@kernel.org>
Signed-off-by: NAlexei Starovoitov <ast@kernel.org>
上级 f605ce5e
......@@ -470,9 +470,7 @@ struct sock_fprog_kern {
};
struct bpf_binary_header {
u16 pages;
u16 locked:1;
u32 pages;
/* Some arches need word alignment for their instructions */
u8 image[] __aligned(4);
};
......@@ -481,7 +479,7 @@ struct bpf_prog {
u16 pages; /* Number of allocated pages */
u16 jited:1, /* Is our filter JIT'ed? */
jit_requested:1,/* archs need to JIT the prog */
locked:1, /* Program image locked? */
undo_set_mem:1, /* Passed set_memory_ro() checkpoint */
gpl_compatible:1, /* Is filter GPL compatible? */
cb_access:1, /* Is control block accessed? */
dst_needed:1, /* Do we need dst entry? */
......@@ -677,46 +675,24 @@ bpf_ctx_narrow_access_ok(u32 off, u32 size, u32 size_default)
static inline void bpf_prog_lock_ro(struct bpf_prog *fp)
{
#ifdef CONFIG_ARCH_HAS_SET_MEMORY
fp->locked = 1;
if (set_memory_ro((unsigned long)fp, fp->pages))
fp->locked = 0;
#endif
fp->undo_set_mem = 1;
set_memory_ro((unsigned long)fp, fp->pages);
}
static inline void bpf_prog_unlock_ro(struct bpf_prog *fp)
{
#ifdef CONFIG_ARCH_HAS_SET_MEMORY
if (fp->locked) {
WARN_ON_ONCE(set_memory_rw((unsigned long)fp, fp->pages));
/* In case set_memory_rw() fails, we want to be the first
* to crash here instead of some random place later on.
*/
fp->locked = 0;
}
#endif
if (fp->undo_set_mem)
set_memory_rw((unsigned long)fp, fp->pages);
}
static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr)
{
#ifdef CONFIG_ARCH_HAS_SET_MEMORY
hdr->locked = 1;
if (set_memory_ro((unsigned long)hdr, hdr->pages))
hdr->locked = 0;
#endif
set_memory_ro((unsigned long)hdr, hdr->pages);
}
static inline void bpf_jit_binary_unlock_ro(struct bpf_binary_header *hdr)
{
#ifdef CONFIG_ARCH_HAS_SET_MEMORY
if (hdr->locked) {
WARN_ON_ONCE(set_memory_rw((unsigned long)hdr, hdr->pages));
/* In case set_memory_rw() fails, we want to be the first
* to crash here instead of some random place later on.
*/
hdr->locked = 0;
}
#endif
set_memory_rw((unsigned long)hdr, hdr->pages);
}
static inline struct bpf_binary_header *
......@@ -728,22 +704,6 @@ bpf_jit_binary_hdr(const struct bpf_prog *fp)
return (void *)addr;
}
#ifdef CONFIG_ARCH_HAS_SET_MEMORY
static inline int bpf_prog_check_pages_ro_single(const struct bpf_prog *fp)
{
if (!fp->locked)
return -ENOLCK;
if (fp->jited) {
const struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp);
if (!hdr->locked)
return -ENOLCK;
}
return 0;
}
#endif
int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap);
static inline int sk_filter(struct sock *sk, struct sk_buff *skb)
{
......
......@@ -598,8 +598,6 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
bpf_fill_ill_insns(hdr, size);
hdr->pages = size / PAGE_SIZE;
hdr->locked = 0;
hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
PAGE_SIZE - sizeof(*hdr));
start = (get_random_int() % hole) & ~(alignment - 1);
......@@ -1450,22 +1448,6 @@ static int bpf_check_tail_call(const struct bpf_prog *fp)
return 0;
}
static int bpf_prog_check_pages_ro_locked(const struct bpf_prog *fp)
{
#ifdef CONFIG_ARCH_HAS_SET_MEMORY
int i, err;
for (i = 0; i < fp->aux->func_cnt; i++) {
err = bpf_prog_check_pages_ro_single(fp->aux->func[i]);
if (err)
return err;
}
return bpf_prog_check_pages_ro_single(fp);
#endif
return 0;
}
static void bpf_prog_select_func(struct bpf_prog *fp)
{
#ifndef CONFIG_BPF_JIT_ALWAYS_ON
......@@ -1524,17 +1506,7 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
* all eBPF JITs might immediately support all features.
*/
*err = bpf_check_tail_call(fp);
if (*err)
return fp;
/* Checkpoint: at this point onwards any cBPF -> eBPF or
* native eBPF program is read-only. If we failed to change
* the page attributes (e.g. allocation failure from
* splitting large pages), then reject the whole program
* in order to guarantee not ending up with any W+X pages
* from BPF side in kernel.
*/
*err = bpf_prog_check_pages_ro_locked(fp);
return fp;
}
EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册