Commit d00f26b6 authored by David S. Miller

Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next

Alexei Starovoitov says:

====================
pull-request: bpf-next 2020-05-14

The following pull-request contains BPF updates for your *net-next* tree.

The main changes are:

1) Merged tag 'perf-for-bpf-2020-05-06' from tip tree that includes CAP_PERFMON.

2) support for narrow loads in bpf_sock_addr progs and additional
   helpers in cg-skb progs, from Andrey.

3) bpf benchmark runner, from Andrii.

4) arm and riscv JIT optimizations, from Luke.

5) bpf iterator infrastructure, from Yonghong.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
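
For context before the diff: a minimal sketch of the BPF-side iterator programs enabled by item 5, written against the bpf_iter__bpf_map context and the bpf_seq_printf() helper added in this series. The SEC("iter/bpf_map") section name, the bpf_iter.h header carrying kernel-struct definitions, and the BPF_SEQ_PRINTF wrapper follow the selftests of this series and are assumptions here, not part of the diff below.

	#include "bpf_iter.h"		/* assumed: kernel-struct defs + BPF_SEQ_PRINTF */
	#include <bpf/bpf_helpers.h>

	char _license[] SEC("license") = "GPL";

	SEC("iter/bpf_map")
	int dump_bpf_map(struct bpf_iter__bpf_map *ctx)
	{
		struct seq_file *seq = ctx->meta->seq;
		struct bpf_map *map = ctx->map;

		/* seq_ops->stop() calls the program once more with map == NULL */
		if (!map)
			return 0;

		/* text lands in the seq_file buffer and is returned by read() */
		BPF_SEQ_PRINTF(seq, "map id: %u\n", map->id);
		return 0;
	}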
@@ -795,6 +795,9 @@ static inline void emit_a32_alu_i(const s8 dst, const u32 val,
 	case BPF_RSH:
 		emit(ARM_LSR_I(rd, rd, val), ctx);
 		break;
+	case BPF_ARSH:
+		emit(ARM_ASR_I(rd, rd, val), ctx);
+		break;
 	case BPF_NEG:
 		emit(ARM_RSB_I(rd, rd, val), ctx);
 		break;
@@ -860,8 +863,8 @@ static inline void emit_a32_arsh_r64(const s8 dst[], const s8 src[],
 	emit(ARM_SUBS_I(tmp2[0], rt, 32), ctx);
 	emit(ARM_MOV_SR(ARM_LR, rd[1], SRTYPE_LSR, rt), ctx);
 	emit(ARM_ORR_SR(ARM_LR, ARM_LR, rd[0], SRTYPE_ASL, ARM_IP), ctx);
-	_emit(ARM_COND_MI, ARM_B(0), ctx);
-	emit(ARM_ORR_SR(ARM_LR, ARM_LR, rd[0], SRTYPE_ASR, tmp2[0]), ctx);
+	_emit(ARM_COND_PL,
+	      ARM_ORR_SR(ARM_LR, ARM_LR, rd[0], SRTYPE_ASR, tmp2[0]), ctx);
 	emit(ARM_MOV_SR(ARM_IP, rd[0], SRTYPE_ASR, rt), ctx);
 	arm_bpf_put_reg32(dst_lo, ARM_LR, ctx);
@@ -1408,7 +1411,6 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx)
 	case BPF_ALU | BPF_MUL | BPF_X:
 	case BPF_ALU | BPF_LSH | BPF_X:
 	case BPF_ALU | BPF_RSH | BPF_X:
-	case BPF_ALU | BPF_ARSH | BPF_K:
 	case BPF_ALU | BPF_ARSH | BPF_X:
 	case BPF_ALU64 | BPF_ADD | BPF_K:
 	case BPF_ALU64 | BPF_ADD | BPF_X:
@@ -1465,10 +1467,12 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx)
 	case BPF_ALU64 | BPF_MOD | BPF_K:
 	case BPF_ALU64 | BPF_MOD | BPF_X:
 		goto notyet;
-	/* dst = dst >> imm */
 	/* dst = dst << imm */
-	case BPF_ALU | BPF_RSH | BPF_K:
+	/* dst = dst >> imm */
+	/* dst = dst >> imm (signed) */
 	case BPF_ALU | BPF_LSH | BPF_K:
+	case BPF_ALU | BPF_RSH | BPF_K:
+	case BPF_ALU | BPF_ARSH | BPF_K:
 		if (unlikely(imm > 31))
 			return -EINVAL;
 		if (imm)
...
@@ -94,6 +94,9 @@
 #define ARM_INST_LSR_I		0x01a00020
 #define ARM_INST_LSR_R		0x01a00030
+#define ARM_INST_ASR_I		0x01a00040
+#define ARM_INST_ASR_R		0x01a00050
 #define ARM_INST_MOV_R		0x01a00000
 #define ARM_INST_MOVS_R		0x01b00000
 #define ARM_INST_MOV_I		0x03a00000
...
@@ -515,7 +515,7 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
 	case BPF_ALU | BPF_LSH | BPF_X:
 	case BPF_ALU64 | BPF_LSH | BPF_X:
 		emit(is64 ? rv_sll(rd, rd, rs) : rv_sllw(rd, rd, rs), ctx);
-		if (!is64)
+		if (!is64 && !aux->verifier_zext)
 			emit_zext_32(rd, ctx);
 		break;
 	case BPF_ALU | BPF_RSH | BPF_X:
@@ -542,13 +542,21 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
 	/* dst = BSWAP##imm(dst) */
 	case BPF_ALU | BPF_END | BPF_FROM_LE:
-	{
-		int shift = 64 - imm;
-
-		emit(rv_slli(rd, rd, shift), ctx);
-		emit(rv_srli(rd, rd, shift), ctx);
+		switch (imm) {
+		case 16:
+			emit(rv_slli(rd, rd, 48), ctx);
+			emit(rv_srli(rd, rd, 48), ctx);
+			break;
+		case 32:
+			if (!aux->verifier_zext)
+				emit_zext_32(rd, ctx);
+			break;
+		case 64:
+			/* Do nothing */
+			break;
+		}
 		break;
-	}
 	case BPF_ALU | BPF_END | BPF_FROM_BE:
 		emit(rv_addi(RV_REG_T2, RV_REG_ZERO, 0), ctx);
@@ -692,19 +700,19 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
 	case BPF_ALU | BPF_LSH | BPF_K:
 	case BPF_ALU64 | BPF_LSH | BPF_K:
 		emit(is64 ? rv_slli(rd, rd, imm) : rv_slliw(rd, rd, imm), ctx);
-		if (!is64)
+		if (!is64 && !aux->verifier_zext)
 			emit_zext_32(rd, ctx);
 		break;
 	case BPF_ALU | BPF_RSH | BPF_K:
 	case BPF_ALU64 | BPF_RSH | BPF_K:
 		emit(is64 ? rv_srli(rd, rd, imm) : rv_srliw(rd, rd, imm), ctx);
-		if (!is64)
+		if (!is64 && !aux->verifier_zext)
 			emit_zext_32(rd, ctx);
 		break;
 	case BPF_ALU | BPF_ARSH | BPF_K:
 	case BPF_ALU64 | BPF_ARSH | BPF_K:
 		emit(is64 ? rv_srai(rd, rd, imm) : rv_sraiw(rd, rd, imm), ctx);
-		if (!is64)
+		if (!is64 && !aux->verifier_zext)
 			emit_zext_32(rd, ctx);
 		break;
@@ -784,11 +792,15 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
 	case BPF_JMP32 | BPF_JSGE | BPF_K:
 	case BPF_JMP | BPF_JSLE | BPF_K:
 	case BPF_JMP32 | BPF_JSLE | BPF_K:
-	case BPF_JMP | BPF_JSET | BPF_K:
-	case BPF_JMP32 | BPF_JSET | BPF_K:
 		rvoff = rv_offset(i, off, ctx);
 		s = ctx->ninsns;
-		emit_imm(RV_REG_T1, imm, ctx);
+		if (imm) {
+			emit_imm(RV_REG_T1, imm, ctx);
+			rs = RV_REG_T1;
+		} else {
+			/* If imm is 0, simply use zero register. */
+			rs = RV_REG_ZERO;
+		}
 		if (!is64) {
 			if (is_signed_bpf_cond(BPF_OP(code)))
 				emit_sext_32_rd(&rd, ctx);
@@ -799,16 +811,28 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
 		/* Adjust for extra insns */
 		rvoff -= (e - s) << 2;
+		emit_branch(BPF_OP(code), rd, rs, rvoff, ctx);
+		break;
 
-		if (BPF_OP(code) == BPF_JSET) {
-			/* Adjust for and */
-			rvoff -= 4;
-			emit(rv_and(RV_REG_T1, rd, RV_REG_T1), ctx);
-			emit_branch(BPF_JNE, RV_REG_T1, RV_REG_ZERO, rvoff,
-				    ctx);
-		} else {
-			emit_branch(BPF_OP(code), rd, RV_REG_T1, rvoff, ctx);
-		}
+	case BPF_JMP | BPF_JSET | BPF_K:
+	case BPF_JMP32 | BPF_JSET | BPF_K:
+		rvoff = rv_offset(i, off, ctx);
+		s = ctx->ninsns;
+		if (is_12b_int(imm)) {
+			emit(rv_andi(RV_REG_T1, rd, imm), ctx);
+		} else {
+			emit_imm(RV_REG_T1, imm, ctx);
+			emit(rv_and(RV_REG_T1, rd, RV_REG_T1), ctx);
+		}
+		/* For jset32, we should clear the upper 32 bits of t1, but
+		 * sign-extension is sufficient here and saves one instruction,
+		 * as t1 is used only in comparison against zero.
+		 */
+		if (!is64 && imm < 0)
+			emit(rv_addiw(RV_REG_T1, RV_REG_T1, 0), ctx);
+		e = ctx->ninsns;
+		rvoff -= (e - s) << 2;
+		emit_branch(BPF_JNE, RV_REG_T1, RV_REG_ZERO, rvoff, ctx);
 		break;

 	/* function call */
...
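
The effect of the new BPF_JSET | BPF_K path is easiest to see in the emitted instructions; a sketch for an immediate that fits in 12 bits (mnemonics matching the rv_andi/rv_and emitters above):

	/*
	 * BPF:    if (dst & imm) goto off;        with is_12b_int(imm)
	 *
	 * before: lui/addi ...                    materialize imm into t1
	 *         and   t1, rd, t1
	 *         bne   t1, zero, off
	 *
	 * after:  andi  t1, rd, imm               one I-type instruction
	 *         bne   t1, zero, off
	 */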
@@ -1475,8 +1475,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 	for (i = 0; i < insn_cnt; i++, insn++) {
 		const s32 imm32 = insn->imm;
 		const bool is64 = BPF_CLASS(insn->code) == BPF_ALU64;
-		const bool dstk = insn->dst_reg == BPF_REG_AX ? false : true;
-		const bool sstk = insn->src_reg == BPF_REG_AX ? false : true;
+		const bool dstk = insn->dst_reg != BPF_REG_AX;
+		const bool sstk = insn->src_reg != BPF_REG_AX;
 		const u8 code = insn->code;
 		const u8 *dst = bpf2ia32[insn->dst_reg];
 		const u8 *src = bpf2ia32[insn->src_reg];
...
@@ -98,6 +98,25 @@ static const struct proc_ops proc_net_seq_ops = {
 	.proc_release	= seq_release_net,
 };
 
+int bpf_iter_init_seq_net(void *priv_data)
+{
+#ifdef CONFIG_NET_NS
+	struct seq_net_private *p = priv_data;
+
+	p->net = get_net(current->nsproxy->net_ns);
+#endif
+	return 0;
+}
+
+void bpf_iter_fini_seq_net(void *priv_data)
+{
+#ifdef CONFIG_NET_NS
+	struct seq_net_private *p = priv_data;
+
+	put_net(p->net);
+#endif
+}
+
 struct proc_dir_entry *proc_create_net_data(const char *name, umode_t mode,
 		struct proc_dir_entry *parent, const struct seq_operations *ops,
 		unsigned int state_size, void *data)
...
@@ -31,6 +31,7 @@ struct seq_file;
 struct btf;
 struct btf_type;
 struct exception_table_entry;
+struct seq_operations;
 
 extern struct idr btf_idr;
 extern spinlock_t btf_idr_lock;
@@ -319,6 +320,7 @@ enum bpf_reg_type {
 	PTR_TO_TP_BUFFER,	 /* reg points to a writable raw tp's buffer */
 	PTR_TO_XDP_SOCK,	 /* reg points to struct xdp_sock */
 	PTR_TO_BTF_ID,		 /* reg points to kernel struct */
+	PTR_TO_BTF_ID_OR_NULL,	 /* reg points to kernel struct or NULL */
 };
 
 /* The information passed from prog-specific *_is_valid_access
@@ -641,6 +643,12 @@ struct bpf_jit_poke_descriptor {
 	u16 reason;
 };
 
+/* reg_type info for ctx arguments */
+struct bpf_ctx_arg_aux {
+	u32 offset;
+	enum bpf_reg_type reg_type;
+};
+
 struct bpf_prog_aux {
 	atomic64_t refcnt;
 	u32 used_map_cnt;
@@ -652,6 +660,8 @@ struct bpf_prog_aux {
 	u32 func_cnt; /* used by non-func prog as the number of func progs */
 	u32 func_idx; /* 0 for non-func prog, the index in func array for func prog */
 	u32 attach_btf_id; /* in-kernel BTF type id to attach to */
+	u32 ctx_arg_info_size;
+	const struct bpf_ctx_arg_aux *ctx_arg_info;
 	struct bpf_prog *linked_prog;
 	bool verifier_zext; /* Zero extensions has been inserted by verifier. */
 	bool offload_requested;
@@ -1021,6 +1031,7 @@ static inline void bpf_enable_instrumentation(void)
 
 extern const struct file_operations bpf_map_fops;
 extern const struct file_operations bpf_prog_fops;
+extern const struct file_operations bpf_iter_fops;
 
 #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \
 	extern const struct bpf_prog_ops _name ## _prog_ops; \
@@ -1080,6 +1091,7 @@ int generic_map_update_batch(struct bpf_map *map,
 int generic_map_delete_batch(struct bpf_map *map,
 			     const union bpf_attr *attr,
 			     union bpf_attr __user *uattr);
+struct bpf_map *bpf_map_get_curr_or_next(u32 *id);
 
 extern int sysctl_unprivileged_bpf_disabled;
 
@@ -1126,6 +1138,40 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd);
 int bpf_obj_pin_user(u32 ufd, const char __user *pathname);
 int bpf_obj_get_user(const char __user *pathname, int flags);
 
+#define BPF_ITER_FUNC_PREFIX "bpf_iter_"
+#define DEFINE_BPF_ITER_FUNC(target, args...)			\
+	extern int bpf_iter_ ## target(args);			\
+	int __init bpf_iter_ ## target(args) { return 0; }
+
+typedef int (*bpf_iter_init_seq_priv_t)(void *private_data);
+typedef void (*bpf_iter_fini_seq_priv_t)(void *private_data);
+
+#define BPF_ITER_CTX_ARG_MAX 2
+struct bpf_iter_reg {
+	const char *target;
+	const struct seq_operations *seq_ops;
+	bpf_iter_init_seq_priv_t init_seq_private;
+	bpf_iter_fini_seq_priv_t fini_seq_private;
+	u32 seq_priv_size;
+	u32 ctx_arg_info_size;
+	struct bpf_ctx_arg_aux ctx_arg_info[BPF_ITER_CTX_ARG_MAX];
+};
+
+struct bpf_iter_meta {
+	__bpf_md_ptr(struct seq_file *, seq);
+	u64 session_id;
+	u64 seq_num;
+};
+
+int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info);
+void bpf_iter_unreg_target(const struct bpf_iter_reg *reg_info);
+bool bpf_iter_prog_supported(struct bpf_prog *prog);
+int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog);
+int bpf_iter_new_fd(struct bpf_link *link);
+bool bpf_link_is_iter(struct bpf_link *link);
+struct bpf_prog *bpf_iter_get_info(struct bpf_iter_meta *meta, bool in_stop);
+int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx);
+
 int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value);
 int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value);
 int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
...
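
To make the registration side concrete: a target wires its seq_ops into the interface above roughly as follows, a sketch modeled on the bpf_map target in map_iter.c further down (field values and ordering are illustrative):

	static const struct seq_operations bpf_map_seq_ops = {
		.start	= bpf_map_seq_start,
		.next	= bpf_map_seq_next,
		.stop	= bpf_map_seq_stop,
		.show	= bpf_map_seq_show,
	};

	static const struct bpf_iter_reg bpf_map_reg_info = {
		.target		= "bpf_map",
		.seq_ops	= &bpf_map_seq_ops,
		.seq_priv_size	= sizeof(struct bpf_iter_seq_map_info),
	};

	static int __init bpf_map_iter_init(void)
	{
		return bpf_iter_reg_target(&bpf_map_reg_info);
	}
	late_initcall(bpf_map_iter_init);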
@@ -124,3 +124,4 @@ BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing)
 #ifdef CONFIG_CGROUP_BPF
 BPF_LINK_TYPE(BPF_LINK_TYPE_CGROUP, cgroup)
 #endif
+BPF_LINK_TYPE(BPF_LINK_TYPE_ITER, iter)
@@ -251,6 +251,10 @@ extern bool privileged_wrt_inode_uidgid(struct user_namespace *ns, const struct
 extern bool capable_wrt_inode_uidgid(const struct inode *inode, int cap);
 extern bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap);
 extern bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns);
+static inline bool perfmon_capable(void)
+{
+	return capable(CAP_PERFMON) || capable(CAP_SYS_ADMIN);
+}
 
 /* audit system wants to get cap info from files as well */
 extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps);
...
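
A sketch of the intended call-site pattern (the function name is illustrative): subsystems that used to demand CAP_SYS_ADMIN for observability features check the narrower capability instead, and CAP_SYS_ADMIN keeps working through the fallback inside perfmon_capable():

	static int perf_style_open(void)
	{
		if (!perfmon_capable())	/* CAP_PERFMON or CAP_SYS_ADMIN */
			return -EACCES;
		return 0;
	}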
@@ -545,10 +545,8 @@ struct bpf_prog {
 	unsigned int		(*bpf_func)(const void *ctx,
 					    const struct bpf_insn *insn);
 	/* Instructions for interpreter */
-	union {
-		struct sock_filter	insns[0];
-		struct bpf_insn		insnsi[0];
-	};
+	struct sock_filter	insns[0];
+	struct bpf_insn		insnsi[];
 };
 
 struct sk_filter {
...
@@ -105,6 +105,9 @@ struct proc_dir_entry *proc_create_net_single_write(const char *name, umode_t mo
 					void *data);
 extern struct pid *tgid_pidfd_to_pid(const struct file *file);
 
+extern int bpf_iter_init_seq_net(void *priv_data);
+extern void bpf_iter_fini_seq_net(void *priv_data);
+
 #ifdef CONFIG_PROC_PID_ARCH_STATUS
 /*
  * The architecture which selects CONFIG_PROC_PID_ARCH_STATUS must
...
@@ -35,8 +35,14 @@ int inet_shutdown(struct socket *sock, int how);
 int inet_listen(struct socket *sock, int backlog);
 void inet_sock_destruct(struct sock *sk);
 int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len);
+
+/* Don't allocate port at this moment, defer to connect. */
+#define BIND_FORCE_ADDRESS_NO_PORT	(1 << 0)
+/* Grab and release socket lock. */
+#define BIND_WITH_LOCK			(1 << 1)
+/* Called from BPF program. */
+#define BIND_FROM_BPF			(1 << 2)
 int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
-		bool force_bind_address_no_port, bool with_lock);
+		u32 flags);
 int inet_getname(struct socket *sock, struct sockaddr *uaddr,
 		 int peer);
 int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
...
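
The two booleans fold into one flags word, so a former call like __inet_bind(sk, uaddr, addr_len, false, true) becomes, sketched:

	err = __inet_bind(sk, uaddr, addr_len, BIND_WITH_LOCK);
	/* and with port allocation deferred to connect(): */
	err = __inet_bind(sk, uaddr, addr_len,
			  BIND_FORCE_ADDRESS_NO_PORT | BIND_WITH_LOCK);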
@@ -544,6 +544,13 @@ static inline bool fib6_metric_locked(struct fib6_info *f6i, int metric)
 	return !!(f6i->fib6_metrics->metrics[RTAX_LOCK - 1] & (1 << metric));
 }
 
+#if IS_BUILTIN(CONFIG_IPV6) && defined(CONFIG_BPF_SYSCALL)
+struct bpf_iter__ipv6_route {
+	__bpf_md_ptr(struct bpf_iter_meta *, meta);
+	__bpf_md_ptr(struct fib6_info *, rt);
+};
+#endif
+
 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
 static inline bool fib6_has_custom_rules(const struct net *net)
 {
...
@@ -63,7 +63,7 @@ extern const struct ipv6_stub *ipv6_stub __read_mostly;
 /* A stub used by bpf helpers. Similarly ugly as ipv6_stub */
 struct ipv6_bpf_stub {
 	int (*inet6_bind)(struct sock *sk, struct sockaddr *uaddr, int addr_len,
-			  bool force_bind_address_no_port, bool with_lock);
+			  u32 flags);
 	struct sock *(*udp6_lib_lookup)(struct net *net,
 				     const struct in6_addr *saddr, __be16 sport,
 				     const struct in6_addr *daddr, __be16 dport,
...
@@ -50,7 +50,6 @@ struct xdp_umem {
 	u32 headroom;
 	u32 chunk_size_nohr;
 	struct user_struct *user;
-	unsigned long address;
 	refcount_t users;
 	struct work_struct work;
 	struct page **pgs;
@@ -62,8 +61,8 @@ struct xdp_umem {
 	struct net_device *dev;
 	struct xdp_umem_fq_reuse *fq_reuse;
 	bool zc;
-	spinlock_t xsk_list_lock;
-	struct list_head xsk_list;
+	spinlock_t xsk_tx_list_lock;
+	struct list_head xsk_tx_list;
 };
 
 /* Nodes are linked in the struct xdp_sock map_list field, and used to
...
@@ -116,6 +116,7 @@ enum bpf_cmd {
 	BPF_LINK_GET_FD_BY_ID,
 	BPF_LINK_GET_NEXT_ID,
 	BPF_ENABLE_STATS,
+	BPF_ITER_CREATE,
 };
 
 enum bpf_map_type {
@@ -218,6 +219,7 @@ enum bpf_attach_type {
 	BPF_TRACE_FEXIT,
 	BPF_MODIFY_RETURN,
 	BPF_LSM_MAC,
+	BPF_TRACE_ITER,
 	__MAX_BPF_ATTACH_TYPE
 };
@@ -228,6 +230,7 @@ enum bpf_link_type {
 	BPF_LINK_TYPE_RAW_TRACEPOINT = 1,
 	BPF_LINK_TYPE_TRACING = 2,
 	BPF_LINK_TYPE_CGROUP = 3,
+	BPF_LINK_TYPE_ITER = 4,
 
 	MAX_BPF_LINK_TYPE,
 };
@@ -612,6 +615,11 @@ union bpf_attr {
 		__u32		type;
 	} enable_stats;
 
+	struct { /* struct used by BPF_ITER_CREATE command */
+		__u32		link_fd;
+		__u32		flags;
+	} iter_create;
+
 } __attribute__((aligned(8)));
 
 /* The description below is an attempt at providing documentation to eBPF
@@ -667,8 +675,8 @@ union bpf_attr {
 *		For tracing programs, safely attempt to read *size* bytes from
 *		kernel space address *unsafe_ptr* and store the data in *dst*.
 *
- *		Generally, use bpf_probe_read_user() or bpf_probe_read_kernel()
- *		instead.
+ *		Generally, use **bpf_probe_read_user**\ () or
+ *		**bpf_probe_read_kernel**\ () instead.
 *	Return
 *		0 on success, or a negative error in case of failure.
 *
@@ -676,7 +684,7 @@ union bpf_attr {
 *	Description
 *		Return the time elapsed since system boot, in nanoseconds.
 *		Does not include time the system was suspended.
- *		See: clock_gettime(CLOCK_MONOTONIC)
+ *		See: **clock_gettime**\ (**CLOCK_MONOTONIC**)
 *	Return
 *		Current *ktime*.
 *
@@ -1535,11 +1543,11 @@ union bpf_attr {
 * int bpf_probe_read_str(void *dst, u32 size, const void *unsafe_ptr)
 *	Description
 *		Copy a NUL terminated string from an unsafe kernel address
- *		*unsafe_ptr* to *dst*. See bpf_probe_read_kernel_str() for
+ *		*unsafe_ptr* to *dst*. See **bpf_probe_read_kernel_str**\ () for
 *		more details.
 *
- *		Generally, use bpf_probe_read_user_str() or bpf_probe_read_kernel_str()
- *		instead.
+ *		Generally, use **bpf_probe_read_user_str**\ () or
+ *		**bpf_probe_read_kernel_str**\ () instead.
 *	Return
 *		On success, the strictly positive length of the string,
 *		including the trailing NUL character. On error, a negative
@@ -1567,7 +1575,7 @@ union bpf_attr {
 *
 * u64 bpf_get_socket_cookie(struct bpf_sock_ops *ctx)
 *	Description
- *		Equivalent to bpf_get_socket_cookie() helper that accepts
+ *		Equivalent to **bpf_get_socket_cookie**\ () helper that accepts
 *		*skb*, but gets socket from **struct bpf_sock_ops** context.
 *	Return
 *		A 8-byte long non-decreasing number.
@@ -1596,6 +1604,7 @@ union bpf_attr {
 *		The option value of length *optlen* is pointed by *optval*.
 *
 *		*bpf_socket* should be one of the following:
+ *
 *		* **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**.
 *		* **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**
 *		  and **BPF_CGROUP_INET6_CONNECT**.
@@ -1664,12 +1673,12 @@ union bpf_attr {
 *
 *		The lower two bits of *flags* are used as the return code if
 *		the map lookup fails. This is so that the return value can be
- *		one of the XDP program return codes up to XDP_TX, as chosen by
- *		the caller. Any higher bits in the *flags* argument must be
- *		unset.
+ *		one of the XDP program return codes up to **XDP_TX**, as chosen
+ *		by the caller. Any higher bits in the *flags* argument must be
+ *		unset.
 *
- *		See also bpf_redirect(), which only supports redirecting to an
- *		ifindex, but doesn't require a map to do so.
+ *		See also **bpf_redirect**\ (), which only supports redirecting
+ *		to an ifindex, but doesn't require a map to do so.
 *	Return
 *		**XDP_REDIRECT** on success, or the value of the two lower bits
 *		of the *flags* argument on error.
@@ -1777,7 +1786,7 @@ union bpf_attr {
 *		the time running for event since last normalization. The
 *		enabled and running times are accumulated since the perf event
 *		open. To achieve scaling factor between two invocations of an
- *		eBPF program, users can can use CPU id as the key (which is
+ *		eBPF program, users can use CPU id as the key (which is
 *		typical for perf array usage model) to remember the previous
 *		value and do the calculation inside the eBPF program.
 *	Return
@@ -1804,6 +1813,7 @@ union bpf_attr {
 *		*opval* and of length *optlen*.
 *
 *		*bpf_socket* should be one of the following:
+ *
 *		* **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**.
 *		* **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**
 *		  and **BPF_CGROUP_INET6_CONNECT**.
@@ -1825,7 +1835,7 @@ union bpf_attr {
 *		The first argument is the context *regs* on which the kprobe
 *		works.
 *
- *		This helper works by setting setting the PC (program counter)
+ *		This helper works by setting the PC (program counter)
 *		to an override function which is run in place of the original
 *		probed function. This means the probed function is not run at
 *		all. The replacement function just returns with the required
@@ -1994,10 +2004,11 @@ union bpf_attr {
 *
 *		This helper works for IPv4 and IPv6, TCP and UDP sockets. The
 *		domain (*addr*\ **->sa_family**) must be **AF_INET** (or
- *		**AF_INET6**). Looking for a free port to bind to can be
- *		expensive, therefore binding to port is not permitted by the
- *		helper: *addr*\ **->sin_port** (or **sin6_port**, respectively)
- *		must be set to zero.
+ *		**AF_INET6**). It's advised to pass zero port (**sin_port**
+ *		or **sin6_port**) which triggers IP_BIND_ADDRESS_NO_PORT-like
+ *		behavior and lets the kernel efficiently pick up an unused
+ *		port as long as 4-tuple is unique. Passing non-zero port might
+ *		lead to degraded performance.
 *	Return
 *		0 on success, or a negative error in case of failure.
 *
@@ -2291,7 +2302,7 @@ union bpf_attr {
 *		**bpf_rc_keydown**\ () again with the same values, or calling
 *		**bpf_rc_repeat**\ ().
 *
- *		Some protocols include a toggle bit, in case the button  was
+ *		Some protocols include a toggle bit, in case the button was
 *		released and pressed again between consecutive scancodes.
 *
 *		The *ctx* should point to the lirc sample as passed into
@@ -2637,7 +2648,6 @@ union bpf_attr {
 *
 *		*th* points to the start of the TCP header, while *th_len*
 *		contains **sizeof**\ (**struct tcphdr**).
- *
 *	Return
 *		0 if *iph* and *th* are a valid SYN cookie ACK, or a negative
 *		error otherwise.
@@ -2820,7 +2830,6 @@ union bpf_attr {
 *
 *		*th* points to the start of the TCP header, while *th_len*
 *		contains the length of the TCP header.
- *
 *	Return
 *		On success, lower 32 bits hold the generated SYN cookie in
 *		followed by 16 bits which hold the MSS value for that cookie,
@@ -2903,7 +2912,7 @@ union bpf_attr {
 *			// size, after checking its boundaries.
 *		}
 *
- *		In comparison, using **bpf_probe_read_user()** helper here
+ *		In comparison, using **bpf_probe_read_user**\ () helper here
 *		instead to read the string would require to estimate the length
 *		at compile time, and would often result in copying more memory
 *		than necessary.
@@ -2921,14 +2930,14 @@ union bpf_attr {
 * int bpf_probe_read_kernel_str(void *dst, u32 size, const void *unsafe_ptr)
 *	Description
 *		Copy a NUL terminated string from an unsafe kernel address *unsafe_ptr*
- *		to *dst*. Same semantics as with bpf_probe_read_user_str() apply.
+ *		to *dst*. Same semantics as with **bpf_probe_read_user_str**\ () apply.
 *	Return
 *		On success, the strictly positive length of the string, including
 *		the trailing NUL character. On error, a negative value.
 *
 * int bpf_tcp_send_ack(void *tp, u32 rcv_nxt)
 *	Description
- *		Send out a tcp-ack. *tp* is the in-kernel struct tcp_sock.
+ *		Send out a tcp-ack. *tp* is the in-kernel struct **tcp_sock**.
 *		*rcv_nxt* is the ack_seq to be sent out.
 *	Return
 *		0 on success, or a negative error in case of failure.
@@ -2956,19 +2965,19 @@ union bpf_attr {
 * int bpf_read_branch_records(struct bpf_perf_event_data *ctx, void *buf, u32 size, u64 flags)
 *	Description
 *		For an eBPF program attached to a perf event, retrieve the
- *		branch records (struct perf_branch_entry) associated to *ctx*
+ *		branch records (**struct perf_branch_entry**) associated to *ctx*
 *		and store it in the buffer pointed by *buf* up to size
 *		*size* bytes.
 *	Return
 *		On success, number of bytes written to *buf*. On error, a
 *		negative value.
 *
 *		The *flags* can be set to **BPF_F_GET_BRANCH_RECORDS_SIZE** to
 *		instead return the number of bytes required to store all the
 *		branch entries. If this flag is set, *buf* may be NULL.
 *
 *		**-EINVAL** if arguments invalid or **size** not a multiple
- *		of sizeof(struct perf_branch_entry).
+ *		of **sizeof**\ (**struct perf_branch_entry**\ ).
 *
 *		**-ENOENT** if architecture does not support branch records.
 *
@@ -2976,8 +2985,8 @@ union bpf_attr {
 *	Description
 *		Returns 0 on success, values for *pid* and *tgid* as seen from the current
 *		*namespace* will be returned in *nsdata*.
- *
- *		On failure, the returned value is one of the following:
+ *	Return
+ *		0 on success, or one of the following in case of failure:
 *
 *		**-EINVAL** if dev and inum supplied don't match dev_t and inode number
 *		with nsfs of current task, or if dev conversion to dev_t lost high bits.
@@ -3016,8 +3025,8 @@ union bpf_attr {
 *		a global identifier that can be assumed unique. If *ctx* is
 *		NULL, then the helper returns the cookie for the initial
 *		network namespace. The cookie itself is very similar to that
- *		of bpf_get_socket_cookie() helper, but for network namespaces
- *		instead of sockets.
+ *		of **bpf_get_socket_cookie**\ () helper, but for network
+ *		namespaces instead of sockets.
 *	Return
 *		A 8-byte long opaque number.
 *
@@ -3052,22 +3061,98 @@ union bpf_attr {
 *
 *		The *flags* argument must be zero.
 *	Return
- *		0 on success, or a negative errno in case of failure.
- *
- *		* **-EINVAL**		Unsupported flags specified.
- *		* **-ENOENT**		Socket is unavailable for assignment.
- *		* **-ENETUNREACH**	Socket is unreachable (wrong netns).
- *		* **-EOPNOTSUPP**	Unsupported operation, for example a
- *					call from outside of TC ingress.
- *		* **-ESOCKTNOSUPPORT**	Socket type not supported (reuseport).
+ *		0 on success, or a negative error in case of failure:
+ *
+ *		**-EINVAL** if specified *flags* are not supported.
+ *
+ *		**-ENOENT** if the socket is unavailable for assignment.
+ *
+ *		**-ENETUNREACH** if the socket is unreachable (wrong netns).
+ *
+ *		**-EOPNOTSUPP** if the operation is not supported, for example
+ *		a call from outside of TC ingress.
+ *
+ *		**-ESOCKTNOSUPPORT** if the socket type is not supported
+ *		(reuseport).
 *
 * u64 bpf_ktime_get_boot_ns(void)
 *	Description
 *		Return the time elapsed since system boot, in nanoseconds.
 *		Does include the time the system was suspended.
- *		See: clock_gettime(CLOCK_BOOTTIME)
+ *		See: **clock_gettime**\ (**CLOCK_BOOTTIME**)
 *	Return
 *		Current *ktime*.
+ *
+ * int bpf_seq_printf(struct seq_file *m, const char *fmt, u32 fmt_size, const void *data, u32 data_len)
+ *	Description
+ *		**bpf_seq_printf**\ () uses seq_file **seq_printf**\ () to print
+ *		out the format string.
+ *		The *m* represents the seq_file. The *fmt* and *fmt_size* are for
+ *		the format string itself. The *data* and *data_len* are format string
+ *		arguments. The *data* are a **u64** array and corresponding format string
+ *		values are stored in the array. For strings and pointers where pointees
+ *		are accessed, only the pointer values are stored in the *data* array.
+ *		The *data_len* is the size of *data* in bytes.
+ *
+ *		Formats **%s**, **%p{i,I}{4,6}** requires to read kernel memory.
+ *		Reading kernel memory may fail due to either invalid address or
+ *		valid address but requiring a major memory fault. If reading kernel memory
+ *		fails, the string for **%s** will be an empty string, and the ip
+ *		address for **%p{i,I}{4,6}** will be 0. Not returning error to
+ *		bpf program is consistent with what **bpf_trace_printk**\ () does for now.
+ *	Return
+ *		0 on success, or a negative error in case of failure:
+ *
+ *		**-EBUSY** if per-CPU memory copy buffer is busy, can try again
+ *		by returning 1 from bpf program.
+ *
+ *		**-EINVAL** if arguments are invalid, or if *fmt* is invalid/unsupported.
+ *
+ *		**-E2BIG** if *fmt* contains too many format specifiers.
+ *
+ *		**-EOVERFLOW** if an overflow happened: The same object will be tried again.
+ *
+ * int bpf_seq_write(struct seq_file *m, const void *data, u32 len)
+ *	Description
+ *		**bpf_seq_write**\ () uses seq_file **seq_write**\ () to write the data.
+ *		The *m* represents the seq_file. The *data* and *len* represent the
+ *		data to write in bytes.
+ *	Return
+ *		0 on success, or a negative error in case of failure:
+ *
+ *		**-EOVERFLOW** if an overflow happened: The same object will be tried again.
+ *
+ * u64 bpf_sk_cgroup_id(struct bpf_sock *sk)
+ *	Description
+ *		Return the cgroup v2 id of the socket *sk*.
+ *
+ *		*sk* must be a non-**NULL** pointer to a full socket, e.g. one
+ *		returned from **bpf_sk_lookup_xxx**\ (),
+ *		**bpf_sk_fullsock**\ (), etc. The format of returned id is
+ *		same as in **bpf_skb_cgroup_id**\ ().
+ *
+ *		This helper is available only if the kernel was compiled with
+ *		the **CONFIG_SOCK_CGROUP_DATA** configuration option.
+ *	Return
+ *		The id is returned or 0 in case the id could not be retrieved.
+ *
+ * u64 bpf_sk_ancestor_cgroup_id(struct bpf_sock *sk, int ancestor_level)
+ *	Description
+ *		Return id of cgroup v2 that is ancestor of cgroup associated
+ *		with the *sk* at the *ancestor_level*. The root cgroup is at
+ *		*ancestor_level* zero and each step down the hierarchy
+ *		increments the level. If *ancestor_level* == level of cgroup
+ *		associated with *sk*, then return value will be same as that
+ *		of **bpf_sk_cgroup_id**\ ().
+ *
+ *		The helper is useful to implement policies based on cgroups
+ *		that are upper in hierarchy than immediate cgroup associated
+ *		with *sk*.
+ *
+ *		The format of returned id and helper limitations are same as in
+ *		**bpf_sk_cgroup_id**\ ().
+ *	Return
+ *		The id is returned or 0 in case the id could not be retrieved.
 */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -3195,7 +3280,11 @@ union bpf_attr {
 	FN(get_netns_cookie),		\
 	FN(get_current_ancestor_cgroup_id),	\
 	FN(sk_assign),			\
-	FN(ktime_get_boot_ns),
+	FN(ktime_get_boot_ns),		\
+	FN(seq_printf),			\
+	FN(seq_write),			\
+	FN(sk_cgroup_id),		\
+	FN(sk_ancestor_cgroup_id),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -3673,7 +3762,7 @@ struct bpf_sock_addr {
 	__u32 user_ip6[4];	/* Allows 1,2,4,8-byte read and 4,8-byte write.
 				 * Stored in network byte order.
 				 */
-	__u32 user_port;	/* Allows 4-byte read and write.
+	__u32 user_port;	/* Allows 1,2,4-byte read and 4-byte write.
 				 * Stored in network byte order
 				 */
 	__u32 family;		/* Allows 4-byte read, but no write */
...
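
The user_port comment change reflects the new narrow-load support in bpf_sock_addr programs; a sketch of what the verifier now accepts (the SEC name and bpf_htons() come from libbpf conventions and are assumptions; on little-endian hosts this 2-byte load yields the port in network byte order, and byte-order handling is otherwise the program's responsibility):

	SEC("cgroup/connect4")
	int connect_v4(struct bpf_sock_addr *ctx)
	{
		/* 2-byte read of a 4-byte field: previously rejected, now allowed */
		__u16 port = *(__u16 *)&ctx->user_port;

		if (port == bpf_htons(25))
			return 0;	/* reject connects to port 25 */
		return 1;		/* permit */
	}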
@@ -367,8 +367,14 @@ struct vfs_ns_cap_data {
 
 #define CAP_AUDIT_READ		37
 
+/*
+ * Allow system performance and observability privileged operations
+ * using perf_events, i915_perf and other kernel subsystems
+ */
+#define CAP_PERFMON		38
 
-#define CAP_LAST_CAP         CAP_AUDIT_READ
+#define CAP_LAST_CAP         CAP_PERFMON
 
 #define cap_valid(x) ((x) >= 0 && (x) <= CAP_LAST_CAP)
...
@@ -2,7 +2,7 @@
 obj-y := core.o
 CFLAGS_core.o += $(call cc-disable-warning, override-init)
 
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o
 obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
 obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o
 obj-$(CONFIG_BPF_SYSCALL) += disasm.o
...
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2020 Facebook */
#include <linux/fs.h>
#include <linux/anon_inodes.h>
#include <linux/filter.h>
#include <linux/bpf.h>
struct bpf_iter_target_info {
struct list_head list;
const struct bpf_iter_reg *reg_info;
u32 btf_id; /* cached value */
};
struct bpf_iter_link {
struct bpf_link link;
struct bpf_iter_target_info *tinfo;
};
struct bpf_iter_priv_data {
struct bpf_iter_target_info *tinfo;
struct bpf_prog *prog;
u64 session_id;
u64 seq_num;
bool done_stop;
u8 target_private[] __aligned(8);
};
static struct list_head targets = LIST_HEAD_INIT(targets);
static DEFINE_MUTEX(targets_mutex);
/* protect bpf_iter_link changes */
static DEFINE_MUTEX(link_mutex);
/* incremented on every opened seq_file */
static atomic64_t session_id;
static int prepare_seq_file(struct file *file, struct bpf_iter_link *link);
static void bpf_iter_inc_seq_num(struct seq_file *seq)
{
struct bpf_iter_priv_data *iter_priv;
iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
target_private);
iter_priv->seq_num++;
}
static void bpf_iter_dec_seq_num(struct seq_file *seq)
{
struct bpf_iter_priv_data *iter_priv;
iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
target_private);
iter_priv->seq_num--;
}
static void bpf_iter_done_stop(struct seq_file *seq)
{
struct bpf_iter_priv_data *iter_priv;
iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
target_private);
iter_priv->done_stop = true;
}
/* bpf_seq_read, a customized and simpler version for bpf iterator.
* no_llseek is assumed for this file.
* The following are differences from seq_read():
* . fixed buffer size (PAGE_SIZE)
* . assuming no_llseek
* . stop() may call bpf program, handling potential overflow there
*/
static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size,
loff_t *ppos)
{
struct seq_file *seq = file->private_data;
size_t n, offs, copied = 0;
int err = 0;
void *p;
mutex_lock(&seq->lock);
if (!seq->buf) {
seq->size = PAGE_SIZE;
seq->buf = kmalloc(seq->size, GFP_KERNEL);
if (!seq->buf) {
err = -ENOMEM;
goto done;
}
}
if (seq->count) {
n = min(seq->count, size);
err = copy_to_user(buf, seq->buf + seq->from, n);
if (err) {
err = -EFAULT;
goto done;
}
seq->count -= n;
seq->from += n;
copied = n;
goto done;
}
seq->from = 0;
p = seq->op->start(seq, &seq->index);
if (!p)
goto stop;
if (IS_ERR(p)) {
err = PTR_ERR(p);
seq->op->stop(seq, p);
seq->count = 0;
goto done;
}
err = seq->op->show(seq, p);
if (err > 0) {
/* object is skipped, decrease seq_num, so next
* valid object can reuse the same seq_num.
*/
bpf_iter_dec_seq_num(seq);
seq->count = 0;
} else if (err < 0 || seq_has_overflowed(seq)) {
if (!err)
err = -E2BIG;
seq->op->stop(seq, p);
seq->count = 0;
goto done;
}
while (1) {
loff_t pos = seq->index;
offs = seq->count;
p = seq->op->next(seq, p, &seq->index);
if (pos == seq->index) {
pr_info_ratelimited("buggy seq_file .next function %ps "
"did not updated position index\n",
seq->op->next);
seq->index++;
}
if (IS_ERR_OR_NULL(p))
break;
/* got a valid next object, increase seq_num */
bpf_iter_inc_seq_num(seq);
if (seq->count >= size)
break;
err = seq->op->show(seq, p);
if (err > 0) {
bpf_iter_dec_seq_num(seq);
seq->count = offs;
} else if (err < 0 || seq_has_overflowed(seq)) {
seq->count = offs;
if (offs == 0) {
if (!err)
err = -E2BIG;
seq->op->stop(seq, p);
goto done;
}
break;
}
}
stop:
offs = seq->count;
/* bpf program called if !p */
seq->op->stop(seq, p);
if (!p) {
if (!seq_has_overflowed(seq)) {
bpf_iter_done_stop(seq);
} else {
seq->count = offs;
if (offs == 0) {
err = -E2BIG;
goto done;
}
}
}
n = min(seq->count, size);
err = copy_to_user(buf, seq->buf, n);
if (err) {
err = -EFAULT;
goto done;
}
copied = n;
seq->count -= n;
seq->from = n;
done:
if (!copied)
copied = err;
else
*ppos += copied;
mutex_unlock(&seq->lock);
return copied;
}
static int iter_open(struct inode *inode, struct file *file)
{
struct bpf_iter_link *link = inode->i_private;
return prepare_seq_file(file, link);
}
static int iter_release(struct inode *inode, struct file *file)
{
struct bpf_iter_priv_data *iter_priv;
struct seq_file *seq;
seq = file->private_data;
if (!seq)
return 0;
iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
target_private);
if (iter_priv->tinfo->reg_info->fini_seq_private)
iter_priv->tinfo->reg_info->fini_seq_private(seq->private);
bpf_prog_put(iter_priv->prog);
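/* __seq_open_private() allocated the enclosing bpf_iter_priv_data, but
 * prepare_seq_file() repointed seq->private at target_private; restore
 * it so seq_release_private() frees the original allocation.
 */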
seq->private = iter_priv;
return seq_release_private(inode, file);
}
const struct file_operations bpf_iter_fops = {
.open = iter_open,
.llseek = no_llseek,
.read = bpf_seq_read,
.release = iter_release,
};
/* The argument reg_info will be cached in bpf_iter_target_info.
* The common practice is to declare target reg_info as
* a const static variable and passed as an argument to
* bpf_iter_reg_target().
*/
int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info)
{
struct bpf_iter_target_info *tinfo;
tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL);
if (!tinfo)
return -ENOMEM;
tinfo->reg_info = reg_info;
INIT_LIST_HEAD(&tinfo->list);
mutex_lock(&targets_mutex);
list_add(&tinfo->list, &targets);
mutex_unlock(&targets_mutex);
return 0;
}
void bpf_iter_unreg_target(const struct bpf_iter_reg *reg_info)
{
struct bpf_iter_target_info *tinfo;
bool found = false;
mutex_lock(&targets_mutex);
list_for_each_entry(tinfo, &targets, list) {
if (reg_info == tinfo->reg_info) {
list_del(&tinfo->list);
kfree(tinfo);
found = true;
break;
}
}
mutex_unlock(&targets_mutex);
WARN_ON(found == false);
}
static void cache_btf_id(struct bpf_iter_target_info *tinfo,
struct bpf_prog *prog)
{
tinfo->btf_id = prog->aux->attach_btf_id;
}
bool bpf_iter_prog_supported(struct bpf_prog *prog)
{
const char *attach_fname = prog->aux->attach_func_name;
u32 prog_btf_id = prog->aux->attach_btf_id;
const char *prefix = BPF_ITER_FUNC_PREFIX;
struct bpf_iter_target_info *tinfo;
int prefix_len = strlen(prefix);
bool supported = false;
if (strncmp(attach_fname, prefix, prefix_len))
return false;
mutex_lock(&targets_mutex);
list_for_each_entry(tinfo, &targets, list) {
if (tinfo->btf_id && tinfo->btf_id == prog_btf_id) {
supported = true;
break;
}
if (!strcmp(attach_fname + prefix_len, tinfo->reg_info->target)) {
cache_btf_id(tinfo, prog);
supported = true;
break;
}
}
mutex_unlock(&targets_mutex);
if (supported) {
prog->aux->ctx_arg_info_size = tinfo->reg_info->ctx_arg_info_size;
prog->aux->ctx_arg_info = tinfo->reg_info->ctx_arg_info;
}
return supported;
}
static void bpf_iter_link_release(struct bpf_link *link)
{
}
static void bpf_iter_link_dealloc(struct bpf_link *link)
{
struct bpf_iter_link *iter_link =
container_of(link, struct bpf_iter_link, link);
kfree(iter_link);
}
static int bpf_iter_link_replace(struct bpf_link *link,
struct bpf_prog *new_prog,
struct bpf_prog *old_prog)
{
int ret = 0;
mutex_lock(&link_mutex);
if (old_prog && link->prog != old_prog) {
ret = -EPERM;
goto out_unlock;
}
if (link->prog->type != new_prog->type ||
link->prog->expected_attach_type != new_prog->expected_attach_type ||
link->prog->aux->attach_btf_id != new_prog->aux->attach_btf_id) {
ret = -EINVAL;
goto out_unlock;
}
old_prog = xchg(&link->prog, new_prog);
bpf_prog_put(old_prog);
out_unlock:
mutex_unlock(&link_mutex);
return ret;
}
static const struct bpf_link_ops bpf_iter_link_lops = {
.release = bpf_iter_link_release,
.dealloc = bpf_iter_link_dealloc,
.update_prog = bpf_iter_link_replace,
};
bool bpf_link_is_iter(struct bpf_link *link)
{
return link->ops == &bpf_iter_link_lops;
}
int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
{
struct bpf_link_primer link_primer;
struct bpf_iter_target_info *tinfo;
struct bpf_iter_link *link;
bool existed = false;
u32 prog_btf_id;
int err;
if (attr->link_create.target_fd || attr->link_create.flags)
return -EINVAL;
prog_btf_id = prog->aux->attach_btf_id;
mutex_lock(&targets_mutex);
list_for_each_entry(tinfo, &targets, list) {
if (tinfo->btf_id == prog_btf_id) {
existed = true;
break;
}
}
mutex_unlock(&targets_mutex);
if (!existed)
return -ENOENT;
link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN);
if (!link)
return -ENOMEM;
bpf_link_init(&link->link, BPF_LINK_TYPE_ITER, &bpf_iter_link_lops, prog);
link->tinfo = tinfo;
err = bpf_link_prime(&link->link, &link_primer);
if (err) {
kfree(link);
return err;
}
return bpf_link_settle(&link_primer);
}
static void init_seq_meta(struct bpf_iter_priv_data *priv_data,
struct bpf_iter_target_info *tinfo,
struct bpf_prog *prog)
{
priv_data->tinfo = tinfo;
priv_data->prog = prog;
priv_data->session_id = atomic64_inc_return(&session_id);
priv_data->seq_num = 0;
priv_data->done_stop = false;
}
static int prepare_seq_file(struct file *file, struct bpf_iter_link *link)
{
struct bpf_iter_priv_data *priv_data;
struct bpf_iter_target_info *tinfo;
struct bpf_prog *prog;
u32 total_priv_dsize;
struct seq_file *seq;
int err = 0;
mutex_lock(&link_mutex);
prog = link->link.prog;
bpf_prog_inc(prog);
mutex_unlock(&link_mutex);
tinfo = link->tinfo;
total_priv_dsize = offsetof(struct bpf_iter_priv_data, target_private) +
tinfo->reg_info->seq_priv_size;
priv_data = __seq_open_private(file, tinfo->reg_info->seq_ops,
total_priv_dsize);
if (!priv_data) {
err = -ENOMEM;
goto release_prog;
}
if (tinfo->reg_info->init_seq_private) {
err = tinfo->reg_info->init_seq_private(priv_data->target_private);
if (err)
goto release_seq_file;
}
init_seq_meta(priv_data, tinfo, prog);
seq = file->private_data;
seq->private = priv_data->target_private;
return 0;
release_seq_file:
seq_release_private(file->f_inode, file);
file->private_data = NULL;
release_prog:
bpf_prog_put(prog);
return err;
}
int bpf_iter_new_fd(struct bpf_link *link)
{
struct file *file;
unsigned int flags;
int err, fd;
if (link->ops != &bpf_iter_link_lops)
return -EINVAL;
flags = O_RDONLY | O_CLOEXEC;
fd = get_unused_fd_flags(flags);
if (fd < 0)
return fd;
file = anon_inode_getfile("bpf_iter", &bpf_iter_fops, NULL, flags);
if (IS_ERR(file)) {
err = PTR_ERR(file);
goto free_fd;
}
err = prepare_seq_file(file,
container_of(link, struct bpf_iter_link, link));
if (err)
goto free_file;
fd_install(fd, file);
return fd;
free_file:
fput(file);
free_fd:
put_unused_fd(fd);
return err;
}
struct bpf_prog *bpf_iter_get_info(struct bpf_iter_meta *meta, bool in_stop)
{
struct bpf_iter_priv_data *iter_priv;
struct seq_file *seq;
void *seq_priv;
seq = meta->seq;
if (seq->file->f_op != &bpf_iter_fops)
return NULL;
seq_priv = seq->private;
iter_priv = container_of(seq_priv, struct bpf_iter_priv_data,
target_private);
if (in_stop && iter_priv->done_stop)
return NULL;
meta->session_id = iter_priv->session_id;
meta->seq_num = iter_priv->seq_num;
return iter_priv->prog;
}
int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx)
{
int ret;
rcu_read_lock();
migrate_disable();
ret = BPF_PROG_RUN(prog, ctx);
migrate_enable();
rcu_read_unlock();
/* bpf program can only return 0 or 1:
* 0 : okay
* 1 : retry the same object
* The bpf_iter_run_prog() return value
* will be seq_ops->show() return value.
*/
return ret == 0 ? 0 : -EAGAIN;
}
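
From userspace, the pieces in this file combine as follows; a raw-syscall sketch (error handling elided; prog_fd is assumed to be an already-loaded program attached with expected_attach_type BPF_TRACE_ITER):

	union bpf_attr attr = {};
	char buf[4096];
	ssize_t n;

	/* attach the iter program, yielding a bpf_link fd */
	attr.link_create.prog_fd = prog_fd;
	attr.link_create.attach_type = BPF_TRACE_ITER;
	int link_fd = syscall(__NR_bpf, BPF_LINK_CREATE, &attr, sizeof(attr));

	/* instantiate a seq_file-backed iterator fd from the link */
	memset(&attr, 0, sizeof(attr));
	attr.iter_create.link_fd = link_fd;
	int iter_fd = syscall(__NR_bpf, BPF_ITER_CREATE, &attr, sizeof(attr));

	/* each read() drives bpf_seq_read() above, which runs the program */
	while ((n = read(iter_fd, buf, sizeof(buf))) > 0)
		write(STDOUT_FILENO, buf, n);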
...@@ -3694,7 +3694,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, ...@@ -3694,7 +3694,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
struct bpf_verifier_log *log = info->log; struct bpf_verifier_log *log = info->log;
const struct btf_param *args; const struct btf_param *args;
u32 nr_args, arg; u32 nr_args, arg;
int ret; int i, ret;
if (off % 8) { if (off % 8) {
bpf_log(log, "func '%s' offset %d is not multiple of 8\n", bpf_log(log, "func '%s' offset %d is not multiple of 8\n",
...@@ -3791,6 +3791,14 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, ...@@ -3791,6 +3791,14 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
/* this is a pointer to another type */ /* this is a pointer to another type */
info->reg_type = PTR_TO_BTF_ID; info->reg_type = PTR_TO_BTF_ID;
for (i = 0; i < prog->aux->ctx_arg_info_size; i++) {
const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i];
if (ctx_arg_info->offset == off) {
info->reg_type = ctx_arg_info->reg_type;
break;
}
}
if (tgt_prog) {
ret = btf_translate_to_vmlinux(log, btf, t, tgt_prog->type, arg);
@@ -3830,6 +3838,7 @@ int btf_struct_access(struct bpf_verifier_log *log,
const struct btf_type *mtype, *elem_type = NULL;
const struct btf_member *member;
const char *tname, *mname;
+u32 vlen;
again:
tname = __btf_name_by_offset(btf_vmlinux, t->name_off);
@@ -3838,7 +3847,43 @@ int btf_struct_access(struct bpf_verifier_log *log,
return -EINVAL;
}
+vlen = btf_type_vlen(t);
if (off + size > t->size) {
/* If the last element is a variable size array, we may
* need to relax the rule.
*/
struct btf_array *array_elem;
if (vlen == 0)
goto error;
member = btf_type_member(t) + vlen - 1;
mtype = btf_type_skip_modifiers(btf_vmlinux, member->type,
NULL);
if (!btf_type_is_array(mtype))
goto error;
array_elem = (struct btf_array *)(mtype + 1);
if (array_elem->nelems != 0)
goto error;
moff = btf_member_bit_offset(t, member) / 8;
if (off < moff)
goto error;
/* Only allow structures for now; this can be relaxed for
 * other types later.
 */
elem_type = btf_type_skip_modifiers(btf_vmlinux,
array_elem->type, NULL);
if (!btf_type_is_struct(elem_type))
goto error;
off = (off - moff) % elem_type->size;
return btf_struct_access(log, elem_type, off, size, atype,
next_btf_id);
error:
bpf_log(log, "access beyond struct %s at off %u size %u\n", bpf_log(log, "access beyond struct %s at off %u size %u\n",
tname, off, size); tname, off, size);
return -EACCES; return -EACCES;
......
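Concretely, the relaxed bound above targets structs whose last member is a flexible array: an offset past sizeof(t) is folded back into a single element and re-checked against the element type. A hypothetical layout to illustrate:

struct inner {
	int a;
	int b;
};

struct outer {
	int count;
	struct inner elems[];	/* BTF array with nelems == 0 */
};

/* An access at offsetof(struct outer, elems) + 8 * n + 4, size 4, lies
 * beyond sizeof(struct outer); the code above reduces it to offset 4
 * within struct inner, i.e. elems[n].b, and accepts it. */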
@@ -358,8 +358,11 @@ static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg)
static int bpf_mklink(struct dentry *dentry, umode_t mode, void *arg)
{
+struct bpf_link *link = arg;
return bpf_mkobj_ops(dentry, mode, arg, &bpf_link_iops,
-&bpffs_obj_fops);
+bpf_link_is_iter(link) ?
+&bpf_iter_fops : &bpffs_obj_fops);
}
static struct dentry *
......
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2020 Facebook */
#include <linux/bpf.h>
#include <linux/fs.h>
#include <linux/filter.h>
#include <linux/kernel.h>
struct bpf_iter_seq_map_info {
u32 mid;
};
static void *bpf_map_seq_start(struct seq_file *seq, loff_t *pos)
{
struct bpf_iter_seq_map_info *info = seq->private;
struct bpf_map *map;
map = bpf_map_get_curr_or_next(&info->mid);
if (!map)
return NULL;
++*pos;
return map;
}
static void *bpf_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct bpf_iter_seq_map_info *info = seq->private;
struct bpf_map *map;
++*pos;
++info->mid;
bpf_map_put((struct bpf_map *)v);
map = bpf_map_get_curr_or_next(&info->mid);
if (!map)
return NULL;
return map;
}
struct bpf_iter__bpf_map {
__bpf_md_ptr(struct bpf_iter_meta *, meta);
__bpf_md_ptr(struct bpf_map *, map);
};
DEFINE_BPF_ITER_FUNC(bpf_map, struct bpf_iter_meta *meta, struct bpf_map *map)
static int __bpf_map_seq_show(struct seq_file *seq, void *v, bool in_stop)
{
struct bpf_iter__bpf_map ctx;
struct bpf_iter_meta meta;
struct bpf_prog *prog;
int ret = 0;
ctx.meta = &meta;
ctx.map = v;
meta.seq = seq;
prog = bpf_iter_get_info(&meta, in_stop);
if (prog)
ret = bpf_iter_run_prog(prog, &ctx);
return ret;
}
static int bpf_map_seq_show(struct seq_file *seq, void *v)
{
return __bpf_map_seq_show(seq, v, false);
}
static void bpf_map_seq_stop(struct seq_file *seq, void *v)
{
if (!v)
(void)__bpf_map_seq_show(seq, v, true);
else
bpf_map_put((struct bpf_map *)v);
}
static const struct seq_operations bpf_map_seq_ops = {
.start = bpf_map_seq_start,
.next = bpf_map_seq_next,
.stop = bpf_map_seq_stop,
.show = bpf_map_seq_show,
};
static const struct bpf_iter_reg bpf_map_reg_info = {
.target = "bpf_map",
.seq_ops = &bpf_map_seq_ops,
.init_seq_private = NULL,
.fini_seq_private = NULL,
.seq_priv_size = sizeof(struct bpf_iter_seq_map_info),
.ctx_arg_info_size = 1,
.ctx_arg_info = {
{ offsetof(struct bpf_iter__bpf_map, map),
PTR_TO_BTF_ID_OR_NULL },
},
};
static int __init bpf_map_iter_init(void)
{
return bpf_iter_reg_target(&bpf_map_reg_info);
}
late_initcall(bpf_map_iter_init);
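On the BPF side, a program for this target attaches with expected_attach_type BPF_TRACE_ITER. A minimal sketch, assuming a BTF-derived vmlinux.h and libbpf's SEC() convention:

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

char LICENSE[] SEC("license") = "GPL";

SEC("iter/bpf_map")
int dump_bpf_map(struct bpf_iter__bpf_map *ctx)
{
	struct seq_file *seq = ctx->meta->seq;
	struct bpf_map *map = ctx->map;
	char fmt[] = "map id: %u\n";
	__u64 id;

	if (!map)	/* NULL on the final stop() pass, per ctx_arg_info above */
		return 0;

	id = map->id;
	bpf_seq_printf(seq, fmt, sizeof(fmt), &id, sizeof(id));
	return 0;
}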
@@ -19,7 +19,7 @@ struct bpf_queue_stack {
u32 head, tail;
u32 size; /* max_entries + 1 */
-char elements[0] __aligned(8);
+char elements[] __aligned(8);
};
static struct bpf_queue_stack *bpf_queue_stack(struct bpf_map *map)
......
@@ -2729,6 +2729,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
case BPF_CGROUP_GETSOCKOPT:
case BPF_CGROUP_SETSOCKOPT:
return BPF_PROG_TYPE_CGROUP_SOCKOPT;
+case BPF_TRACE_ITER:
+return BPF_PROG_TYPE_TRACING;
default:
return BPF_PROG_TYPE_UNSPEC;
}
@@ -2932,6 +2934,25 @@ static int bpf_obj_get_next_id(const union bpf_attr *attr,
return err;
}
struct bpf_map *bpf_map_get_curr_or_next(u32 *id)
{
struct bpf_map *map;
spin_lock_bh(&map_idr_lock);
again:
map = idr_get_next(&map_idr, id);
if (map) {
map = __bpf_map_inc_not_zero(map, false);
if (IS_ERR(map)) {
(*id)++;
goto again;
}
}
spin_unlock_bh(&map_idr_lock);
return map;
}
#define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id
struct bpf_prog *bpf_prog_by_id(u32 id)
@@ -3729,6 +3750,15 @@ static int bpf_map_do_batch(const union bpf_attr *attr,
return err;
}
static int tracing_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
{
if (attr->link_create.attach_type == BPF_TRACE_ITER &&
prog->expected_attach_type == BPF_TRACE_ITER)
return bpf_iter_link_attach(attr, prog);
return -EINVAL;
}
#define BPF_LINK_CREATE_LAST_FIELD link_create.flags
static int link_create(union bpf_attr *attr)
{
@@ -3765,6 +3795,9 @@ static int link_create(union bpf_attr *attr)
case BPF_PROG_TYPE_CGROUP_SOCKOPT:
ret = cgroup_bpf_link_attach(attr, prog);
break;
case BPF_PROG_TYPE_TRACING:
ret = tracing_bpf_link_attach(attr, prog);
break;
default:
ret = -EINVAL;
}
@@ -3927,6 +3960,29 @@ static int bpf_enable_stats(union bpf_attr *attr)
return -EINVAL;
}
#define BPF_ITER_CREATE_LAST_FIELD iter_create.flags
static int bpf_iter_create(union bpf_attr *attr)
{
struct bpf_link *link;
int err;
if (CHECK_ATTR(BPF_ITER_CREATE))
return -EINVAL;
if (attr->iter_create.flags)
return -EINVAL;
link = bpf_link_get_from_fd(attr->iter_create.link_fd);
if (IS_ERR(link))
return PTR_ERR(link);
err = bpf_iter_new_fd(link);
bpf_link_put(link);
return err;
}
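From userspace the flow is: load the program, create a link with BPF_LINK_CREATE and attach_type BPF_TRACE_ITER, then use this command to materialize a readable fd from the link. A hedged raw-syscall sketch, assuming uapi headers that already carry the iter_create field:

#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int iter_create(int link_fd)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.iter_create.link_fd = link_fd;
	/* iter_create.flags stays zero, as enforced above */

	return syscall(__NR_bpf, BPF_ITER_CREATE, &attr, sizeof(attr));
}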
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
union bpf_attr attr;
@@ -4054,6 +4110,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
case BPF_ENABLE_STATS:
err = bpf_enable_stats(&attr);
break;
case BPF_ITER_CREATE:
err = bpf_iter_create(&attr);
break;
default:
err = -EINVAL;
break;
......
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2020 Facebook */
#include <linux/init.h>
#include <linux/namei.h>
#include <linux/pid_namespace.h>
#include <linux/fs.h>
#include <linux/fdtable.h>
#include <linux/filter.h>
struct bpf_iter_seq_task_common {
struct pid_namespace *ns;
};
struct bpf_iter_seq_task_info {
/* The first field must be struct bpf_iter_seq_task_common.
 * This is assumed by the {init, fini}_seq_pidns() callback functions.
 */
struct bpf_iter_seq_task_common common;
u32 tid;
};
static struct task_struct *task_seq_get_next(struct pid_namespace *ns,
u32 *tid)
{
struct task_struct *task = NULL;
struct pid *pid;
rcu_read_lock();
retry:
pid = idr_get_next(&ns->idr, tid);
if (pid) {
task = get_pid_task(pid, PIDTYPE_PID);
if (!task) {
++*tid;
goto retry;
}
}
rcu_read_unlock();
return task;
}
static void *task_seq_start(struct seq_file *seq, loff_t *pos)
{
struct bpf_iter_seq_task_info *info = seq->private;
struct task_struct *task;
task = task_seq_get_next(info->common.ns, &info->tid);
if (!task)
return NULL;
++*pos;
return task;
}
static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct bpf_iter_seq_task_info *info = seq->private;
struct task_struct *task;
++*pos;
++info->tid;
put_task_struct((struct task_struct *)v);
task = task_seq_get_next(info->common.ns, &info->tid);
if (!task)
return NULL;
return task;
}
struct bpf_iter__task {
__bpf_md_ptr(struct bpf_iter_meta *, meta);
__bpf_md_ptr(struct task_struct *, task);
};
DEFINE_BPF_ITER_FUNC(task, struct bpf_iter_meta *meta, struct task_struct *task)
static int __task_seq_show(struct seq_file *seq, struct task_struct *task,
bool in_stop)
{
struct bpf_iter_meta meta;
struct bpf_iter__task ctx;
struct bpf_prog *prog;
meta.seq = seq;
prog = bpf_iter_get_info(&meta, in_stop);
if (!prog)
return 0;
meta.seq = seq;
ctx.meta = &meta;
ctx.task = task;
return bpf_iter_run_prog(prog, &ctx);
}
static int task_seq_show(struct seq_file *seq, void *v)
{
return __task_seq_show(seq, v, false);
}
static void task_seq_stop(struct seq_file *seq, void *v)
{
if (!v)
(void)__task_seq_show(seq, v, true);
else
put_task_struct((struct task_struct *)v);
}
static const struct seq_operations task_seq_ops = {
.start = task_seq_start,
.next = task_seq_next,
.stop = task_seq_stop,
.show = task_seq_show,
};
struct bpf_iter_seq_task_file_info {
/* The first field must be struct bpf_iter_seq_task_common.
 * This is assumed by the {init, fini}_seq_pidns() callback functions.
 */
struct bpf_iter_seq_task_common common;
struct task_struct *task;
struct files_struct *files;
u32 tid;
u32 fd;
};
static struct file *
task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info,
struct task_struct **task, struct files_struct **fstruct)
{
struct pid_namespace *ns = info->common.ns;
u32 curr_tid = info->tid, max_fds;
struct files_struct *curr_files;
struct task_struct *curr_task;
int curr_fd = info->fd;
/* If this function returns a non-NULL file object,
 * it holds a reference to the task/files_struct/file.
 * Otherwise, it does not hold any reference.
 */
again:
if (*task) {
curr_task = *task;
curr_files = *fstruct;
curr_fd = info->fd;
} else {
curr_task = task_seq_get_next(ns, &curr_tid);
if (!curr_task)
return NULL;
curr_files = get_files_struct(curr_task);
if (!curr_files) {
put_task_struct(curr_task);
curr_tid = ++(info->tid);
info->fd = 0;
goto again;
}
/* set *fstruct, *task and info->tid */
*fstruct = curr_files;
*task = curr_task;
if (curr_tid == info->tid) {
curr_fd = info->fd;
} else {
info->tid = curr_tid;
curr_fd = 0;
}
}
rcu_read_lock();
max_fds = files_fdtable(curr_files)->max_fds;
for (; curr_fd < max_fds; curr_fd++) {
struct file *f;
f = fcheck_files(curr_files, curr_fd);
if (!f)
continue;
/* set info->fd */
info->fd = curr_fd;
get_file(f);
rcu_read_unlock();
return f;
}
/* the current task is done, go to the next task */
rcu_read_unlock();
put_files_struct(curr_files);
put_task_struct(curr_task);
*task = NULL;
*fstruct = NULL;
info->fd = 0;
curr_tid = ++(info->tid);
goto again;
}
static void *task_file_seq_start(struct seq_file *seq, loff_t *pos)
{
struct bpf_iter_seq_task_file_info *info = seq->private;
struct files_struct *files = NULL;
struct task_struct *task = NULL;
struct file *file;
file = task_file_seq_get_next(info, &task, &files);
if (!file) {
info->files = NULL;
info->task = NULL;
return NULL;
}
++*pos;
info->task = task;
info->files = files;
return file;
}
static void *task_file_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct bpf_iter_seq_task_file_info *info = seq->private;
struct files_struct *files = info->files;
struct task_struct *task = info->task;
struct file *file;
++*pos;
++info->fd;
fput((struct file *)v);
file = task_file_seq_get_next(info, &task, &files);
if (!file) {
info->files = NULL;
info->task = NULL;
return NULL;
}
info->task = task;
info->files = files;
return file;
}
struct bpf_iter__task_file {
__bpf_md_ptr(struct bpf_iter_meta *, meta);
__bpf_md_ptr(struct task_struct *, task);
u32 fd __aligned(8);
__bpf_md_ptr(struct file *, file);
};
DEFINE_BPF_ITER_FUNC(task_file, struct bpf_iter_meta *meta,
struct task_struct *task, u32 fd,
struct file *file)
static int __task_file_seq_show(struct seq_file *seq, struct file *file,
bool in_stop)
{
struct bpf_iter_seq_task_file_info *info = seq->private;
struct bpf_iter__task_file ctx;
struct bpf_iter_meta meta;
struct bpf_prog *prog;
meta.seq = seq;
prog = bpf_iter_get_info(&meta, in_stop);
if (!prog)
return 0;
ctx.meta = &meta;
ctx.task = info->task;
ctx.fd = info->fd;
ctx.file = file;
return bpf_iter_run_prog(prog, &ctx);
}
static int task_file_seq_show(struct seq_file *seq, void *v)
{
return __task_file_seq_show(seq, v, false);
}
static void task_file_seq_stop(struct seq_file *seq, void *v)
{
struct bpf_iter_seq_task_file_info *info = seq->private;
if (!v) {
(void)__task_file_seq_show(seq, v, true);
} else {
fput((struct file *)v);
put_files_struct(info->files);
put_task_struct(info->task);
info->files = NULL;
info->task = NULL;
}
}
static int init_seq_pidns(void *priv_data)
{
struct bpf_iter_seq_task_common *common = priv_data;
common->ns = get_pid_ns(task_active_pid_ns(current));
return 0;
}
static void fini_seq_pidns(void *priv_data)
{
struct bpf_iter_seq_task_common *common = priv_data;
put_pid_ns(common->ns);
}
static const struct seq_operations task_file_seq_ops = {
.start = task_file_seq_start,
.next = task_file_seq_next,
.stop = task_file_seq_stop,
.show = task_file_seq_show,
};
static const struct bpf_iter_reg task_reg_info = {
.target = "task",
.seq_ops = &task_seq_ops,
.init_seq_private = init_seq_pidns,
.fini_seq_private = fini_seq_pidns,
.seq_priv_size = sizeof(struct bpf_iter_seq_task_info),
.ctx_arg_info_size = 1,
.ctx_arg_info = {
{ offsetof(struct bpf_iter__task, task),
PTR_TO_BTF_ID_OR_NULL },
},
};
static const struct bpf_iter_reg task_file_reg_info = {
.target = "task_file",
.seq_ops = &task_file_seq_ops,
.init_seq_private = init_seq_pidns,
.fini_seq_private = fini_seq_pidns,
.seq_priv_size = sizeof(struct bpf_iter_seq_task_file_info),
.ctx_arg_info_size = 2,
.ctx_arg_info = {
{ offsetof(struct bpf_iter__task_file, task),
PTR_TO_BTF_ID_OR_NULL },
{ offsetof(struct bpf_iter__task_file, file),
PTR_TO_BTF_ID_OR_NULL },
},
};
static int __init task_iter_init(void)
{
int ret;
ret = bpf_iter_reg_target(&task_reg_info);
if (ret)
return ret;
return bpf_iter_reg_target(&task_file_reg_info);
}
late_initcall(task_iter_init);
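A matching BPF-side sketch for the task target registered here (vmlinux.h and libbpf assumed; a task_file program would additionally read ctx->fd and ctx->file):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

char LICENSE[] SEC("license") = "GPL";

SEC("iter/task")
int dump_task(struct bpf_iter__task *ctx)
{
	struct seq_file *seq = ctx->meta->seq;
	struct task_struct *task = ctx->task;
	char fmt[] = "pid: %d\n";
	__u64 pid;

	if (!task)	/* NULL once iteration is done */
		return 0;

	pid = task->pid;
	bpf_seq_printf(seq, fmt, sizeof(fmt), &pid, sizeof(pid));
	return 0;
}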
@@ -398,7 +398,8 @@ static bool reg_type_may_be_null(enum bpf_reg_type type)
return type == PTR_TO_MAP_VALUE_OR_NULL ||
type == PTR_TO_SOCKET_OR_NULL ||
type == PTR_TO_SOCK_COMMON_OR_NULL ||
-type == PTR_TO_TCP_SOCK_OR_NULL;
+type == PTR_TO_TCP_SOCK_OR_NULL ||
+type == PTR_TO_BTF_ID_OR_NULL;
}
static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
@@ -483,6 +484,7 @@ static const char * const reg_type_str[] = {
[PTR_TO_TP_BUFFER] = "tp_buffer",
[PTR_TO_XDP_SOCK] = "xdp_sock",
[PTR_TO_BTF_ID] = "ptr_",
+[PTR_TO_BTF_ID_OR_NULL] = "ptr_or_null_",
};
static char slot_type_char[] = {
@@ -543,7 +545,7 @@ static void print_verifier_state(struct bpf_verifier_env *env,
/* reg->off should be 0 for SCALAR_VALUE */
verbose(env, "%lld", reg->var_off.value + reg->off);
} else {
-if (t == PTR_TO_BTF_ID)
+if (t == PTR_TO_BTF_ID || t == PTR_TO_BTF_ID_OR_NULL)
verbose(env, "%s", kernel_type_name(reg->btf_id));
verbose(env, "(id=%d", reg->id);
if (reg_type_may_be_refcounted_or_null(t))
@@ -2139,6 +2141,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
case PTR_TO_TCP_SOCK_OR_NULL:
case PTR_TO_XDP_SOCK:
case PTR_TO_BTF_ID:
+case PTR_TO_BTF_ID_OR_NULL:
return true;
default:
return false;
@@ -2659,7 +2662,7 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
*/
*reg_type = info.reg_type;
-if (*reg_type == PTR_TO_BTF_ID)
+if (*reg_type == PTR_TO_BTF_ID || *reg_type == PTR_TO_BTF_ID_OR_NULL)
*btf_id = info.btf_id;
else
env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size;
@@ -3243,7 +3246,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
* a sub-register.
*/
regs[value_regno].subreg_def = DEF_NOT_SUBREG;
-if (reg_type == PTR_TO_BTF_ID)
+if (reg_type == PTR_TO_BTF_ID ||
+reg_type == PTR_TO_BTF_ID_OR_NULL)
regs[value_regno].btf_id = btf_id;
}
regs[value_regno].type = reg_type;
@@ -3490,6 +3494,11 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
*stype = STACK_MISC;
goto mark;
}
+if (state->stack[spi].slot_type[0] == STACK_SPILL &&
+state->stack[spi].spilled_ptr.type == PTR_TO_BTF_ID)
+goto mark;
if (state->stack[spi].slot_type[0] == STACK_SPILL &&
state->stack[spi].spilled_ptr.type == SCALAR_VALUE) {
__mark_reg_unknown(env, &state->stack[spi].spilled_ptr);
@@ -6572,6 +6581,8 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
reg->type = PTR_TO_SOCK_COMMON;
} else if (reg->type == PTR_TO_TCP_SOCK_OR_NULL) {
reg->type = PTR_TO_TCP_SOCK;
+} else if (reg->type == PTR_TO_BTF_ID_OR_NULL) {
+reg->type = PTR_TO_BTF_ID;
}
if (is_null) {
/* We don't need id and ref_obj_id from this point
@@ -7101,6 +7112,10 @@ static int check_return_code(struct bpf_verifier_env *env)
return 0;
range = tnum_const(0);
break;
+case BPF_PROG_TYPE_TRACING:
+if (env->prog->expected_attach_type != BPF_TRACE_ITER)
+return 0;
+break;
default:
return 0;
}
@@ -8425,6 +8440,7 @@ static bool reg_type_mismatch_ok(enum bpf_reg_type type)
case PTR_TO_TCP_SOCK_OR_NULL:
case PTR_TO_XDP_SOCK:
case PTR_TO_BTF_ID:
+case PTR_TO_BTF_ID_OR_NULL:
return false;
default:
return true;
@@ -10481,6 +10497,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
struct bpf_prog *tgt_prog = prog->aux->linked_prog;
u32 btf_id = prog->aux->attach_btf_id;
const char prefix[] = "btf_trace_";
+struct btf_func_model fmodel;
int ret = 0, subprog = -1, i;
struct bpf_trampoline *tr;
const struct btf_type *t;
@@ -10622,6 +10639,22 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
prog->aux->attach_func_proto = t;
prog->aux->attach_btf_trace = true;
return 0;
+case BPF_TRACE_ITER:
+if (!btf_type_is_func(t)) {
+verbose(env, "attach_btf_id %u is not a function\n",
+btf_id);
+return -EINVAL;
+}
+t = btf_type_by_id(btf, t->type);
+if (!btf_type_is_func_proto(t))
+return -EINVAL;
+prog->aux->attach_func_name = tname;
+prog->aux->attach_func_proto = t;
+if (!bpf_iter_prog_supported(prog))
+return -EINVAL;
+ret = btf_distill_func_proto(&env->log, btf, t,
+tname, &fmodel);
+return ret;
default:
if (!prog_extension)
return -EINVAL;
......
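For iterator programs the practical consequence of PTR_TO_BTF_ID_OR_NULL is that every ctx pointer flagged in ctx_arg_info must be NULL-checked before use; mark_ptr_or_null_reg() above downgrades the register to plain PTR_TO_BTF_ID only on the non-NULL branch. A hedged sketch:

SEC("iter/task")
int needs_null_check(struct bpf_iter__task *ctx)
{
	struct task_struct *task = ctx->task;	/* PTR_TO_BTF_ID_OR_NULL */
	__u32 pid;

	if (!task)	/* after this branch the pointer is PTR_TO_BTF_ID */
		return 0;

	pid = task->pid;	/* BTF-typed load, rejected before the check */
	return pid == 1;	/* BPF_TRACE_ITER return range is [0, 1] */
}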
@@ -201,7 +201,7 @@ static int max_extfrag_threshold = 1000;
#endif /* CONFIG_SYSCTL */
-#ifdef CONFIG_BPF_SYSCALL
+#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_SYSCTL)
static int bpf_stats_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
......
@@ -457,6 +457,212 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
return &bpf_trace_printk_proto;
}
#define MAX_SEQ_PRINTF_VARARGS 12
#define MAX_SEQ_PRINTF_MAX_MEMCPY 6
#define MAX_SEQ_PRINTF_STR_LEN 128
struct bpf_seq_printf_buf {
char buf[MAX_SEQ_PRINTF_MAX_MEMCPY][MAX_SEQ_PRINTF_STR_LEN];
};
static DEFINE_PER_CPU(struct bpf_seq_printf_buf, bpf_seq_printf_buf);
static DEFINE_PER_CPU(int, bpf_seq_printf_buf_used);
BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size,
const void *, data, u32, data_len)
{
int err = -EINVAL, fmt_cnt = 0, memcpy_cnt = 0;
int i, buf_used, copy_size, num_args;
u64 params[MAX_SEQ_PRINTF_VARARGS];
struct bpf_seq_printf_buf *bufs;
const u64 *args = data;
buf_used = this_cpu_inc_return(bpf_seq_printf_buf_used);
if (WARN_ON_ONCE(buf_used > 1)) {
err = -EBUSY;
goto out;
}
bufs = this_cpu_ptr(&bpf_seq_printf_buf);
/*
* bpf_check()->check_func_arg()->check_stack_boundary()
* guarantees that fmt points to bpf program stack,
* fmt_size bytes of it were initialized and fmt_size > 0
*/
if (fmt[--fmt_size] != 0)
goto out;
if (data_len & 7)
goto out;
for (i = 0; i < fmt_size; i++) {
if (fmt[i] == '%') {
if (fmt[i + 1] == '%')
i++;
else if (!data || !data_len)
goto out;
}
}
num_args = data_len / 8;
/* check format string for allowed specifiers */
for (i = 0; i < fmt_size; i++) {
/* only printable ascii for now. */
if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) {
err = -EINVAL;
goto out;
}
if (fmt[i] != '%')
continue;
if (fmt[i + 1] == '%') {
i++;
continue;
}
if (fmt_cnt >= MAX_SEQ_PRINTF_VARARGS) {
err = -E2BIG;
goto out;
}
if (fmt_cnt >= num_args) {
err = -EINVAL;
goto out;
}
/* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */
i++;
/* skip optional "[0 +-][num]" width formatting field */
while (fmt[i] == '0' || fmt[i] == '+' || fmt[i] == '-' ||
fmt[i] == ' ')
i++;
if (fmt[i] >= '1' && fmt[i] <= '9') {
i++;
while (fmt[i] >= '0' && fmt[i] <= '9')
i++;
}
if (fmt[i] == 's') {
/* try our best to copy */
if (memcpy_cnt >= MAX_SEQ_PRINTF_MAX_MEMCPY) {
err = -E2BIG;
goto out;
}
err = strncpy_from_unsafe(bufs->buf[memcpy_cnt],
(void *) (long) args[fmt_cnt],
MAX_SEQ_PRINTF_STR_LEN);
if (err < 0)
bufs->buf[memcpy_cnt][0] = '\0';
params[fmt_cnt] = (u64)(long)bufs->buf[memcpy_cnt];
fmt_cnt++;
memcpy_cnt++;
continue;
}
if (fmt[i] == 'p') {
if (fmt[i + 1] == 0 ||
fmt[i + 1] == 'K' ||
fmt[i + 1] == 'x') {
/* just kernel pointers */
params[fmt_cnt] = args[fmt_cnt];
fmt_cnt++;
continue;
}
/* only support "%pI4", "%pi4", "%pI6" and "%pi6". */
if (fmt[i + 1] != 'i' && fmt[i + 1] != 'I') {
err = -EINVAL;
goto out;
}
if (fmt[i + 2] != '4' && fmt[i + 2] != '6') {
err = -EINVAL;
goto out;
}
if (memcpy_cnt >= MAX_SEQ_PRINTF_MAX_MEMCPY) {
err = -E2BIG;
goto out;
}
copy_size = (fmt[i + 2] == '4') ? 4 : 16;
err = probe_kernel_read(bufs->buf[memcpy_cnt],
(void *) (long) args[fmt_cnt],
copy_size);
if (err < 0)
memset(bufs->buf[memcpy_cnt], 0, copy_size);
params[fmt_cnt] = (u64)(long)bufs->buf[memcpy_cnt];
i += 2;
fmt_cnt++;
memcpy_cnt++;
continue;
}
if (fmt[i] == 'l') {
i++;
if (fmt[i] == 'l')
i++;
}
if (fmt[i] != 'i' && fmt[i] != 'd' &&
fmt[i] != 'u' && fmt[i] != 'x') {
err = -EINVAL;
goto out;
}
params[fmt_cnt] = args[fmt_cnt];
fmt_cnt++;
}
/* We can have at most MAX_SEQ_PRINTF_VARARGS parameters; just give
 * all of them to seq_printf().
 */
seq_printf(m, fmt, params[0], params[1], params[2], params[3],
params[4], params[5], params[6], params[7], params[8],
params[9], params[10], params[11]);
err = seq_has_overflowed(m) ? -EOVERFLOW : 0;
out:
this_cpu_dec(bpf_seq_printf_buf_used);
return err;
}
static int bpf_seq_printf_btf_ids[5];
static const struct bpf_func_proto bpf_seq_printf_proto = {
.func = bpf_seq_printf,
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_BTF_ID,
.arg2_type = ARG_PTR_TO_MEM,
.arg3_type = ARG_CONST_SIZE,
.arg4_type = ARG_PTR_TO_MEM_OR_NULL,
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
.btf_id = bpf_seq_printf_btf_ids,
};
BPF_CALL_3(bpf_seq_write, struct seq_file *, m, const void *, data, u32, len)
{
return seq_write(m, data, len) ? -EOVERFLOW : 0;
}
static int bpf_seq_write_btf_ids[5];
static const struct bpf_func_proto bpf_seq_write_proto = {
.func = bpf_seq_write,
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_BTF_ID,
.arg2_type = ARG_PTR_TO_MEM,
.arg3_type = ARG_CONST_SIZE_OR_ZERO,
.btf_id = bpf_seq_write_btf_ids,
};
static __always_inline int
get_map_perf_counter(struct bpf_map *map, u64 flags,
u64 *value, u64 *enabled, u64 *running)
@@ -1226,6 +1432,14 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_xdp_output:
return &bpf_xdp_output_proto;
#endif
case BPF_FUNC_seq_printf:
return prog->expected_attach_type == BPF_TRACE_ITER ?
&bpf_seq_printf_proto :
NULL;
case BPF_FUNC_seq_write:
return prog->expected_attach_type == BPF_TRACE_ITER ?
&bpf_seq_write_proto :
NULL;
default:
return raw_tp_prog_func_proto(func_id, prog);
}
......
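Called from a BPF_TRACE_ITER program, the helper takes the format string plus an array of u64 arguments; the check_stack_boundary() change above is what lets that array hold spilled PTR_TO_BTF_ID pointers for %s and %p. A hedged fragment exercising the %s and %pI4 paths (task and ip4 assumed in scope):

/* Fragment from inside an iterator program; seq = ctx->meta->seq. */
char fmt[] = "comm: %s peer: %pI4\n";
__u64 args[2];

args[0] = (__u64)(long)task->comm;	/* %s: strncpy_from_unsafe(), max 128 bytes */
args[1] = (__u64)(long)&ip4;		/* %pI4: 4 bytes via probe_kernel_read() */
bpf_seq_printf(seq, fmt, sizeof(fmt), args, sizeof(args));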
@@ -4003,16 +4003,22 @@ static const struct bpf_func_proto bpf_skb_under_cgroup_proto = {
};
#ifdef CONFIG_SOCK_CGROUP_DATA
static inline u64 __bpf_sk_cgroup_id(struct sock *sk)
{
struct cgroup *cgrp;
cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
return cgroup_id(cgrp);
}
BPF_CALL_1(bpf_skb_cgroup_id, const struct sk_buff *, skb)
{
struct sock *sk = skb_to_full_sk(skb);
-struct cgroup *cgrp;
if (!sk || !sk_fullsock(sk))
return 0;
-cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
-return cgroup_id(cgrp);
+return __bpf_sk_cgroup_id(sk);
}
static const struct bpf_func_proto bpf_skb_cgroup_id_proto = {
@@ -4022,16 +4028,12 @@ static const struct bpf_func_proto bpf_skb_cgroup_id_proto = {
.arg1_type = ARG_PTR_TO_CTX,
};
-BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int,
-ancestor_level)
+static inline u64 __bpf_sk_ancestor_cgroup_id(struct sock *sk,
+int ancestor_level)
{
-struct sock *sk = skb_to_full_sk(skb);
struct cgroup *ancestor;
struct cgroup *cgrp;
-if (!sk || !sk_fullsock(sk))
-return 0;
cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
ancestor = cgroup_ancestor(cgrp, ancestor_level);
if (!ancestor)
@@ -4040,6 +4042,17 @@ BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int,
return cgroup_id(ancestor);
}
BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int,
ancestor_level)
{
struct sock *sk = skb_to_full_sk(skb);
if (!sk || !sk_fullsock(sk))
return 0;
return __bpf_sk_ancestor_cgroup_id(sk, ancestor_level);
}
static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = {
.func = bpf_skb_ancestor_cgroup_id,
.gpl_only = false,
@@ -4047,6 +4060,31 @@ static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = {
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_ANYTHING,
};
BPF_CALL_1(bpf_sk_cgroup_id, struct sock *, sk)
{
return __bpf_sk_cgroup_id(sk);
}
static const struct bpf_func_proto bpf_sk_cgroup_id_proto = {
.func = bpf_sk_cgroup_id,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_SOCKET,
};
BPF_CALL_2(bpf_sk_ancestor_cgroup_id, struct sock *, sk, int, ancestor_level)
{
return __bpf_sk_ancestor_cgroup_id(sk, ancestor_level);
}
static const struct bpf_func_proto bpf_sk_ancestor_cgroup_id_proto = {
.func = bpf_sk_ancestor_cgroup_id,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_SOCKET,
.arg2_type = ARG_ANYTHING,
};
#endif
static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff,
@@ -4525,30 +4563,28 @@ BPF_CALL_3(bpf_bind, struct bpf_sock_addr_kern *, ctx, struct sockaddr *, addr,
{
#ifdef CONFIG_INET
struct sock *sk = ctx->sk;
+u32 flags = BIND_FROM_BPF;
int err;
-/* Binding to port can be expensive so it's prohibited in the helper.
- * Only binding to IP is supported.
- */
err = -EINVAL;
if (addr_len < offsetofend(struct sockaddr, sa_family))
return err;
if (addr->sa_family == AF_INET) {
if (addr_len < sizeof(struct sockaddr_in))
return err;
-if (((struct sockaddr_in *)addr)->sin_port != htons(0))
-return err;
-return __inet_bind(sk, addr, addr_len, true, false);
+if (((struct sockaddr_in *)addr)->sin_port == htons(0))
+flags |= BIND_FORCE_ADDRESS_NO_PORT;
+return __inet_bind(sk, addr, addr_len, flags);
#if IS_ENABLED(CONFIG_IPV6)
} else if (addr->sa_family == AF_INET6) {
if (addr_len < SIN6_LEN_RFC2133)
return err;
-if (((struct sockaddr_in6 *)addr)->sin6_port != htons(0))
-return err;
+if (((struct sockaddr_in6 *)addr)->sin6_port == htons(0))
+flags |= BIND_FORCE_ADDRESS_NO_PORT;
/* ipv6_bpf_stub cannot be NULL, since it's called from
 * bpf_cgroup_inet6_connect hook and ipv6 is already loaded
 */
-return ipv6_bpf_stub->inet6_bind(sk, addr, addr_len, true, false);
+return ipv6_bpf_stub->inet6_bind(sk, addr, addr_len, flags);
#endif /* CONFIG_IPV6 */
}
#endif /* CONFIG_INET */
...@@ -6159,8 +6195,22 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) ...@@ -6159,8 +6195,22 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
#ifdef CONFIG_SOCK_CGROUP_DATA
case BPF_FUNC_skb_cgroup_id:
return &bpf_skb_cgroup_id_proto;
case BPF_FUNC_skb_ancestor_cgroup_id:
return &bpf_skb_ancestor_cgroup_id_proto;
case BPF_FUNC_sk_cgroup_id:
return &bpf_sk_cgroup_id_proto;
case BPF_FUNC_sk_ancestor_cgroup_id:
return &bpf_sk_ancestor_cgroup_id_proto;
#endif
#ifdef CONFIG_INET
case BPF_FUNC_sk_lookup_tcp:
return &bpf_sk_lookup_tcp_proto;
case BPF_FUNC_sk_lookup_udp:
return &bpf_sk_lookup_udp_proto;
case BPF_FUNC_sk_release:
return &bpf_sk_release_proto;
case BPF_FUNC_skc_lookup_tcp:
return &bpf_skc_lookup_tcp_proto;
case BPF_FUNC_tcp_sock:
return &bpf_tcp_sock_proto;
case BPF_FUNC_get_listener_sock:
@@ -7031,6 +7081,7 @@ static bool sock_addr_is_valid_access(int off, int size,
case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4):
case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0],
msg_src_ip6[3]):
case bpf_ctx_range(struct bpf_sock_addr, user_port):
if (type == BPF_READ) {
bpf_ctx_record_field_size(info, size_default);
@@ -7061,10 +7112,6 @@ static bool sock_addr_is_valid_access(int off, int size,
return false;
}
break;
-case bpf_ctx_range(struct bpf_sock_addr, user_port):
-if (size != size_default)
-return false;
-break;
case offsetof(struct bpf_sock_addr, sk):
if (type != BPF_READ)
return false;
@@ -7960,8 +8007,8 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type,
struct bpf_insn *insn_buf,
struct bpf_prog *prog, u32 *target_size)
{
int off, port_size = sizeof_field(struct sockaddr_in6, sin6_port);
struct bpf_insn *insn = insn_buf;
-int off;
switch (si->off) {
case offsetof(struct bpf_sock_addr, user_family):
@@ -7996,9 +8043,11 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type,
offsetof(struct sockaddr_in6, sin6_port));
BUILD_BUG_ON(sizeof_field(struct sockaddr_in, sin_port) !=
sizeof_field(struct sockaddr_in6, sin6_port));
-SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD(struct bpf_sock_addr_kern,
-struct sockaddr_in6, uaddr,
-sin6_port, tmp_reg);
+/* Account for sin6_port being smaller than user_port. */
+port_size = min(port_size, BPF_LDST_BYTES(si));
+SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
+struct bpf_sock_addr_kern, struct sockaddr_in6, uaddr,
+sin6_port, bytes_to_bpf_size(port_size), 0, tmp_reg);
break;
case offsetof(struct bpf_sock_addr, family):
......
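With user_port moved into the narrow-load group, 1- and 2-byte reads are accepted and sock_addr_convert_ctx_access() emits a correspondingly sized load from sin_port/sin6_port. A hedged sketch:

SEC("cgroup/connect4")
int narrow_port_read(struct bpf_sock_addr *ctx)
{
	/* 2-byte read of the 4-byte user_port field; previously only a
	 * full-width load passed sock_addr_is_valid_access(). */
	__u16 port = *(__u16 *)&ctx->user_port;

	return port != bpf_htons(25);	/* e.g. reject SMTP connects */
}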
@@ -450,12 +450,12 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
if (err)
return err;
-return __inet_bind(sk, uaddr, addr_len, false, true);
+return __inet_bind(sk, uaddr, addr_len, BIND_WITH_LOCK);
}
EXPORT_SYMBOL(inet_bind);
int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
-bool force_bind_address_no_port, bool with_lock)
+u32 flags)
{
struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
struct inet_sock *inet = inet_sk(sk);
@@ -506,7 +506,7 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
* would be illegal to use them (multicast/broadcast) in
* which case the sending device address is used.
*/
-if (with_lock)
+if (flags & BIND_WITH_LOCK)
lock_sock(sk);
/* Check these errors (active socket, double bind). */
@@ -520,16 +520,18 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
/* Make sure we are allowed to bind here. */
if (snum || !(inet->bind_address_no_port ||
-force_bind_address_no_port)) {
+(flags & BIND_FORCE_ADDRESS_NO_PORT))) {
if (sk->sk_prot->get_port(sk, snum)) {
inet->inet_saddr = inet->inet_rcv_saddr = 0;
err = -EADDRINUSE;
goto out_release_sock;
}
-err = BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk);
-if (err) {
-inet->inet_saddr = inet->inet_rcv_saddr = 0;
-goto out_release_sock;
+if (!(flags & BIND_FROM_BPF)) {
+err = BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk);
+if (err) {
+inet->inet_saddr = inet->inet_rcv_saddr = 0;
+goto out_release_sock;
+}
}
}
@@ -543,7 +545,7 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
sk_dst_reset(sk);
err = 0;
out_release_sock:
-if (with_lock)
+if (flags & BIND_WITH_LOCK)
release_sock(sk);
out:
return err;
......
@@ -273,7 +273,7 @@ static int inet6_create(struct net *net, struct socket *sock, int protocol,
}
static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
-bool force_bind_address_no_port, bool with_lock)
+u32 flags)
{
struct sockaddr_in6 *addr = (struct sockaddr_in6 *)uaddr;
struct inet_sock *inet = inet_sk(sk);
@@ -297,7 +297,7 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
!ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
return -EACCES;
-if (with_lock)
+if (flags & BIND_WITH_LOCK)
lock_sock(sk);
/* Check these errors (active socket, double bind). */
@@ -400,18 +400,20 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
/* Make sure we are allowed to bind here. */
if (snum || !(inet->bind_address_no_port ||
-force_bind_address_no_port)) {
+(flags & BIND_FORCE_ADDRESS_NO_PORT))) {
if (sk->sk_prot->get_port(sk, snum)) {
sk->sk_ipv6only = saved_ipv6only;
inet_reset_saddr(sk);
err = -EADDRINUSE;
goto out;
}
-err = BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk);
-if (err) {
-sk->sk_ipv6only = saved_ipv6only;
-inet_reset_saddr(sk);
-goto out;
+if (!(flags & BIND_FROM_BPF)) {
+err = BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk);
+if (err) {
+sk->sk_ipv6only = saved_ipv6only;
+inet_reset_saddr(sk);
+goto out;
+}
}
}
@@ -423,7 +425,7 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
inet->inet_dport = 0;
inet->inet_daddr = 0;
out:
-if (with_lock)
+if (flags & BIND_WITH_LOCK)
release_sock(sk);
return err;
out_unlock:
@@ -451,7 +453,7 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
if (err)
return err;
-return __inet6_bind(sk, uaddr, addr_len, false, true);
+return __inet6_bind(sk, uaddr, addr_len, BIND_WITH_LOCK);
}
EXPORT_SYMBOL(inet6_bind);
......
@@ -2467,7 +2467,7 @@ void fib6_gc_cleanup(void)
}
#ifdef CONFIG_PROC_FS
-static int ipv6_route_seq_show(struct seq_file *seq, void *v)
+static int ipv6_route_native_seq_show(struct seq_file *seq, void *v)
{
struct fib6_info *rt = v;
struct ipv6_route_iter *iter = seq->private;
@@ -2625,7 +2625,7 @@ static bool ipv6_route_iter_active(struct ipv6_route_iter *iter)
return w->node && !(w->state == FWS_U && w->node == w->root);
}
-static void ipv6_route_seq_stop(struct seq_file *seq, void *v)
+static void ipv6_route_native_seq_stop(struct seq_file *seq, void *v)
__releases(RCU_BH)
{
struct net *net = seq_file_net(seq);
@@ -2637,6 +2637,62 @@ static void ipv6_route_seq_stop(struct seq_file *seq, void *v)
rcu_read_unlock_bh();
}
#if IS_BUILTIN(CONFIG_IPV6) && defined(CONFIG_BPF_SYSCALL)
static int ipv6_route_prog_seq_show(struct bpf_prog *prog,
struct bpf_iter_meta *meta,
void *v)
{
struct bpf_iter__ipv6_route ctx;
ctx.meta = meta;
ctx.rt = v;
return bpf_iter_run_prog(prog, &ctx);
}
static int ipv6_route_seq_show(struct seq_file *seq, void *v)
{
struct ipv6_route_iter *iter = seq->private;
struct bpf_iter_meta meta;
struct bpf_prog *prog;
int ret;
meta.seq = seq;
prog = bpf_iter_get_info(&meta, false);
if (!prog)
return ipv6_route_native_seq_show(seq, v);
ret = ipv6_route_prog_seq_show(prog, &meta, v);
iter->w.leaf = NULL;
return ret;
}
static void ipv6_route_seq_stop(struct seq_file *seq, void *v)
{
struct bpf_iter_meta meta;
struct bpf_prog *prog;
if (!v) {
meta.seq = seq;
prog = bpf_iter_get_info(&meta, true);
if (prog)
(void)ipv6_route_prog_seq_show(prog, &meta, v);
}
ipv6_route_native_seq_stop(seq, v);
}
#else
static int ipv6_route_seq_show(struct seq_file *seq, void *v)
{
return ipv6_route_native_seq_show(seq, v);
}
static void ipv6_route_seq_stop(struct seq_file *seq, void *v)
{
ipv6_route_native_seq_stop(seq, v);
}
#endif
const struct seq_operations ipv6_route_seq_ops = {
.start = ipv6_route_seq_start,
.next = ipv6_route_seq_next,
......
@@ -6421,6 +6421,35 @@ void __init ip6_route_init_special_entries(void)
#endif
}
#if IS_BUILTIN(CONFIG_IPV6)
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
DEFINE_BPF_ITER_FUNC(ipv6_route, struct bpf_iter_meta *meta, struct fib6_info *rt)
static const struct bpf_iter_reg ipv6_route_reg_info = {
.target = "ipv6_route",
.seq_ops = &ipv6_route_seq_ops,
.init_seq_private = bpf_iter_init_seq_net,
.fini_seq_private = bpf_iter_fini_seq_net,
.seq_priv_size = sizeof(struct ipv6_route_iter),
.ctx_arg_info_size = 1,
.ctx_arg_info = {
{ offsetof(struct bpf_iter__ipv6_route, rt),
PTR_TO_BTF_ID_OR_NULL },
},
};
static int __init bpf_iter_register(void)
{
return bpf_iter_reg_target(&ipv6_route_reg_info);
}
static void bpf_iter_unregister(void)
{
bpf_iter_unreg_target(&ipv6_route_reg_info);
}
#endif
#endif
int __init ip6_route_init(void)
{
int ret;
@@ -6483,6 +6512,14 @@ int __init ip6_route_init(void)
if (ret)
goto out_register_late_subsys;
#if IS_BUILTIN(CONFIG_IPV6)
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
ret = bpf_iter_register();
if (ret)
goto out_register_late_subsys;
#endif
#endif
for_each_possible_cpu(cpu) {
struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
@@ -6515,6 +6552,11 @@ int __init ip6_route_init(void)
void ip6_route_cleanup(void)
{
#if IS_BUILTIN(CONFIG_IPV6)
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
bpf_iter_unregister();
#endif
#endif
unregister_netdevice_notifier(&ip6_route_dev_notifier);
unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_cleanup();
......
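The BPF side for this target, sketched under the same assumptions as the earlier iterator programs (rt is PTR_TO_BTF_ID_OR_NULL, as registered above):

SEC("iter/ipv6_route")
int dump_ipv6_route(struct bpf_iter__ipv6_route *ctx)
{
	struct seq_file *seq = ctx->meta->seq;
	struct fib6_info *rt = ctx->rt;
	char fmt[] = "metric: %u\n";
	__u64 metric;

	if (!rt)
		return 0;

	metric = rt->fib6_metric;
	bpf_seq_printf(seq, fmt, sizeof(fmt), &metric, sizeof(metric));
	return 0;
}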
@@ -2596,7 +2596,7 @@ static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos)
return __netlink_seq_next(seq);
}
-static void netlink_seq_stop(struct seq_file *seq, void *v)
+static void netlink_native_seq_stop(struct seq_file *seq, void *v)
{
struct nl_seq_iter *iter = seq->private;
@@ -2607,7 +2607,7 @@ static void netlink_seq_stop(struct seq_file *seq, void *v)
}
-static int netlink_seq_show(struct seq_file *seq, void *v)
+static int netlink_native_seq_show(struct seq_file *seq, void *v)
{
if (v == SEQ_START_TOKEN) {
seq_puts(seq,
@@ -2634,6 +2634,68 @@ static int netlink_seq_show(struct seq_file *seq, void *v)
return 0;
}
#ifdef CONFIG_BPF_SYSCALL
struct bpf_iter__netlink {
__bpf_md_ptr(struct bpf_iter_meta *, meta);
__bpf_md_ptr(struct netlink_sock *, sk);
};
DEFINE_BPF_ITER_FUNC(netlink, struct bpf_iter_meta *meta, struct netlink_sock *sk)
static int netlink_prog_seq_show(struct bpf_prog *prog,
struct bpf_iter_meta *meta,
void *v)
{
struct bpf_iter__netlink ctx;
meta->seq_num--; /* skip SEQ_START_TOKEN */
ctx.meta = meta;
ctx.sk = nlk_sk((struct sock *)v);
return bpf_iter_run_prog(prog, &ctx);
}
static int netlink_seq_show(struct seq_file *seq, void *v)
{
struct bpf_iter_meta meta;
struct bpf_prog *prog;
meta.seq = seq;
prog = bpf_iter_get_info(&meta, false);
if (!prog)
return netlink_native_seq_show(seq, v);
if (v != SEQ_START_TOKEN)
return netlink_prog_seq_show(prog, &meta, v);
return 0;
}
static void netlink_seq_stop(struct seq_file *seq, void *v)
{
struct bpf_iter_meta meta;
struct bpf_prog *prog;
if (!v) {
meta.seq = seq;
prog = bpf_iter_get_info(&meta, true);
if (prog)
(void)netlink_prog_seq_show(prog, &meta, v);
}
netlink_native_seq_stop(seq, v);
}
#else
static int netlink_seq_show(struct seq_file *seq, void *v)
{
return netlink_native_seq_show(seq, v);
}
static void netlink_seq_stop(struct seq_file *seq, void *v)
{
netlink_native_seq_stop(seq, v);
}
#endif
static const struct seq_operations netlink_seq_ops = {
.start = netlink_seq_start,
.next = netlink_seq_next,
@@ -2740,6 +2802,26 @@ static const struct rhashtable_params netlink_rhashtable_params = {
.automatic_shrinking = true,
};
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
static const struct bpf_iter_reg netlink_reg_info = {
.target = "netlink",
.seq_ops = &netlink_seq_ops,
.init_seq_private = bpf_iter_init_seq_net,
.fini_seq_private = bpf_iter_fini_seq_net,
.seq_priv_size = sizeof(struct nl_seq_iter),
.ctx_arg_info_size = 1,
.ctx_arg_info = {
{ offsetof(struct bpf_iter__netlink, sk),
PTR_TO_BTF_ID_OR_NULL },
},
};
static int __init bpf_iter_register(void)
{
return bpf_iter_reg_target(&netlink_reg_info);
}
#endif
static int __init netlink_proto_init(void)
{
int i;
@@ -2748,6 +2830,12 @@ static int __init netlink_proto_init(void)
if (err != 0)
goto out;
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
err = bpf_iter_register();
if (err)
goto out;
#endif
BUILD_BUG_ON(sizeof(struct netlink_skb_parms) > sizeof_field(struct sk_buff, cb));
nl_table = kcalloc(MAX_LINKS, sizeof(*nl_table), GFP_KERNEL);
......
@@ -30,9 +30,9 @@ void xdp_add_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs)
if (!xs->tx)
return;
-spin_lock_irqsave(&umem->xsk_list_lock, flags);
-list_add_rcu(&xs->list, &umem->xsk_list);
-spin_unlock_irqrestore(&umem->xsk_list_lock, flags);
+spin_lock_irqsave(&umem->xsk_tx_list_lock, flags);
+list_add_rcu(&xs->list, &umem->xsk_tx_list);
+spin_unlock_irqrestore(&umem->xsk_tx_list_lock, flags);
}
void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs)
@@ -42,9 +42,9 @@ void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs)
if (!xs->tx)
return;
-spin_lock_irqsave(&umem->xsk_list_lock, flags);
+spin_lock_irqsave(&umem->xsk_tx_list_lock, flags);
list_del_rcu(&xs->list);
-spin_unlock_irqrestore(&umem->xsk_list_lock, flags);
+spin_unlock_irqrestore(&umem->xsk_tx_list_lock, flags);
}
/* The umem is stored both in the _rx struct and the _tx struct as we do
@@ -279,7 +279,7 @@ void xdp_put_umem(struct xdp_umem *umem)
}
}
-static int xdp_umem_pin_pages(struct xdp_umem *umem)
+static int xdp_umem_pin_pages(struct xdp_umem *umem, unsigned long address)
{
unsigned int gup_flags = FOLL_WRITE;
long npgs;
@@ -291,7 +291,7 @@ static int xdp_umem_pin_pages(struct xdp_umem *umem)
return -ENOMEM;
down_read(&current->mm->mmap_sem);
-npgs = pin_user_pages(umem->address, umem->npgs,
+npgs = pin_user_pages(address, umem->npgs,
gup_flags | FOLL_LONGTERM, &umem->pgs[0], NULL);
up_read(&current->mm->mmap_sem);
@@ -385,7 +385,6 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
if (headroom >= chunk_size - XDP_PACKET_HEADROOM)
return -EINVAL;
-umem->address = (unsigned long)addr;
umem->chunk_mask = unaligned_chunks ? XSK_UNALIGNED_BUF_ADDR_MASK
: ~((u64)chunk_size - 1);
umem->size = size;
@@ -395,8 +394,8 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
umem->pgs = NULL;
umem->user = NULL;
umem->flags = mr->flags;
-INIT_LIST_HEAD(&umem->xsk_list);
-spin_lock_init(&umem->xsk_list_lock);
+INIT_LIST_HEAD(&umem->xsk_tx_list);
+spin_lock_init(&umem->xsk_tx_list_lock);
refcount_set(&umem->users, 1);
@@ -404,7 +403,7 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
if (err)
return err;
-err = xdp_umem_pin_pages(umem);
+err = xdp_umem_pin_pages(umem, (unsigned long)addr);
if (err)
goto out_account;
......
@@ -75,7 +75,7 @@ void xsk_set_tx_need_wakeup(struct xdp_umem *umem)
return;
rcu_read_lock();
-list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
+list_for_each_entry_rcu(xs, &umem->xsk_tx_list, list) {
xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
}
rcu_read_unlock();
@@ -102,7 +102,7 @@ void xsk_clear_tx_need_wakeup(struct xdp_umem *umem)
return;
rcu_read_lock();
-list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
+list_for_each_entry_rcu(xs, &umem->xsk_tx_list, list) {
xs->tx->ring->flags &= ~XDP_RING_NEED_WAKEUP;
}
rcu_read_unlock();
@@ -305,7 +305,7 @@ void xsk_umem_consume_tx_done(struct xdp_umem *umem)
struct xdp_sock *xs;
rcu_read_lock();
-list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
+list_for_each_entry_rcu(xs, &umem->xsk_tx_list, list) {
__xskq_cons_release(xs->tx);
xs->sk.sk_write_space(&xs->sk);
}
@@ -318,7 +318,7 @@ bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc)
struct xdp_sock *xs;
rcu_read_lock();
-list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
+list_for_each_entry_rcu(xs, &umem->xsk_tx_list, list) {
if (!xskq_cons_peek_desc(xs->tx, desc, umem))
continue;
......
...@@ -9,12 +9,12 @@ ...@@ -9,12 +9,12 @@
#include "xsk_queue.h" #include "xsk_queue.h"
void xskq_set_umem(struct xsk_queue *q, u64 size, u64 chunk_mask) void xskq_set_umem(struct xsk_queue *q, u64 umem_size, u64 chunk_mask)
{ {
if (!q) if (!q)
return; return;
q->size = size; q->umem_size = umem_size;
q->chunk_mask = chunk_mask; q->chunk_mask = chunk_mask;
} }
......
...@@ -30,7 +30,7 @@ struct xdp_umem_ring { ...@@ -30,7 +30,7 @@ struct xdp_umem_ring {
struct xsk_queue { struct xsk_queue {
u64 chunk_mask; u64 chunk_mask;
u64 size; u64 umem_size;
u32 ring_mask; u32 ring_mask;
u32 nentries; u32 nentries;
u32 cached_prod; u32 cached_prod;
...@@ -123,7 +123,7 @@ static inline bool xskq_cons_is_valid_unaligned(struct xsk_queue *q, ...@@ -123,7 +123,7 @@ static inline bool xskq_cons_is_valid_unaligned(struct xsk_queue *q,
u64 base_addr = xsk_umem_extract_addr(addr); u64 base_addr = xsk_umem_extract_addr(addr);
addr = xsk_umem_add_offset_to_addr(addr); addr = xsk_umem_add_offset_to_addr(addr);
if (base_addr >= q->size || addr >= q->size || if (base_addr >= q->umem_size || addr >= q->umem_size ||
xskq_cons_crosses_non_contig_pg(umem, addr, length)) { xskq_cons_crosses_non_contig_pg(umem, addr, length)) {
q->invalid_descs++; q->invalid_descs++;
return false; return false;
...@@ -134,7 +134,7 @@ static inline bool xskq_cons_is_valid_unaligned(struct xsk_queue *q, ...@@ -134,7 +134,7 @@ static inline bool xskq_cons_is_valid_unaligned(struct xsk_queue *q,
static inline bool xskq_cons_is_valid_addr(struct xsk_queue *q, u64 addr) static inline bool xskq_cons_is_valid_addr(struct xsk_queue *q, u64 addr)
{ {
if (addr >= q->size) { if (addr >= q->umem_size) {
q->invalid_descs++; q->invalid_descs++;
return false; return false;
} }
...@@ -379,7 +379,7 @@ static inline u64 xskq_nb_invalid_descs(struct xsk_queue *q) ...@@ -379,7 +379,7 @@ static inline u64 xskq_nb_invalid_descs(struct xsk_queue *q)
return q ? q->invalid_descs : 0; return q ? q->invalid_descs : 0;
} }
void xskq_set_umem(struct xsk_queue *q, u64 size, u64 chunk_mask); void xskq_set_umem(struct xsk_queue *q, u64 umem_size, u64 chunk_mask);
struct xsk_queue *xskq_create(u32 nentries, bool umem_queue); struct xsk_queue *xskq_create(u32 nentries, bool umem_queue);
void xskq_destroy(struct xsk_queue *q_ops); void xskq_destroy(struct xsk_queue *q_ops);
......
...@@ -5,12 +5,12 @@ ...@@ -5,12 +5,12 @@
* License as published by the Free Software Foundation. * License as published by the Free Software Foundation.
*/ */
#include <uapi/linux/bpf.h> #include <uapi/linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <uapi/linux/ptrace.h> #include <uapi/linux/ptrace.h>
#include <uapi/linux/perf_event.h> #include <uapi/linux/perf_event.h>
#include <linux/version.h> #include <linux/version.h>
#include <linux/sched.h> #include <linux/sched.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#define _(P) ({typeof(P) val; bpf_probe_read(&val, sizeof(val), &P); val;}) #define _(P) ({typeof(P) val; bpf_probe_read(&val, sizeof(val), &P); val;})
......
#include <uapi/linux/bpf.h> #include <uapi/linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include "bpf_legacy.h"
#include <uapi/linux/in.h> #include <uapi/linux/in.h>
#include <uapi/linux/if.h> #include <uapi/linux/if.h>
#include <uapi/linux/if_ether.h> #include <uapi/linux/if_ether.h>
#include <uapi/linux/ip.h> #include <uapi/linux/ip.h>
#include <uapi/linux/ipv6.h> #include <uapi/linux/ipv6.h>
#include <uapi/linux/if_tunnel.h> #include <uapi/linux/if_tunnel.h>
#include <bpf/bpf_helpers.h>
#include "bpf_legacy.h"
#define IP_MF 0x2000 #define IP_MF 0x2000
#define IP_OFFSET 0x1FFF #define IP_OFFSET 0x1FFF
......
...@@ -5,8 +5,6 @@ ...@@ -5,8 +5,6 @@
* License as published by the Free Software Foundation. * License as published by the Free Software Foundation.
*/ */
#include <uapi/linux/bpf.h> #include <uapi/linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include "bpf_legacy.h"
#include <uapi/linux/in.h> #include <uapi/linux/in.h>
#include <uapi/linux/if.h> #include <uapi/linux/if.h>
#include <uapi/linux/if_ether.h> #include <uapi/linux/if_ether.h>
...@@ -14,6 +12,8 @@ ...@@ -14,6 +12,8 @@
#include <uapi/linux/ipv6.h> #include <uapi/linux/ipv6.h>
#include <uapi/linux/if_tunnel.h> #include <uapi/linux/if_tunnel.h>
#include <uapi/linux/mpls.h> #include <uapi/linux/mpls.h>
#include <bpf/bpf_helpers.h>
#include "bpf_legacy.h"
#define IP_MF 0x2000 #define IP_MF 0x2000
#define IP_OFFSET 0x1FFF #define IP_OFFSET 0x1FFF
......
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
#include <bpf/bpf_helpers.h> #include <bpf/bpf_helpers.h>
#include "hash_func01.h" #include "hash_func01.h"
#define MAX_CPUS 64 /* WARNING - sync with _user.c */ #define MAX_CPUS NR_CPUS
/* Special map type that can XDP_REDIRECT frames to another CPU */ /* Special map type that can XDP_REDIRECT frames to another CPU */
struct { struct {
......
...@@ -13,6 +13,7 @@ static const char *__doc__ = ...@@ -13,6 +13,7 @@ static const char *__doc__ =
#include <unistd.h> #include <unistd.h>
#include <locale.h> #include <locale.h>
#include <sys/resource.h> #include <sys/resource.h>
#include <sys/sysinfo.h>
#include <getopt.h> #include <getopt.h>
#include <net/if.h> #include <net/if.h>
#include <time.h> #include <time.h>
...@@ -24,8 +25,6 @@ static const char *__doc__ = ...@@ -24,8 +25,6 @@ static const char *__doc__ =
#include <arpa/inet.h> #include <arpa/inet.h>
#include <linux/if_link.h> #include <linux/if_link.h>
#define MAX_CPUS 64 /* WARNING - sync with _kern.c */
/* How many xdp_progs are defined in _kern.c */ /* How many xdp_progs are defined in _kern.c */
#define MAX_PROG 6 #define MAX_PROG 6
...@@ -40,6 +39,7 @@ static char *ifname; ...@@ -40,6 +39,7 @@ static char *ifname;
static __u32 prog_id; static __u32 prog_id;
static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
static int n_cpus;
static int cpu_map_fd; static int cpu_map_fd;
static int rx_cnt_map_fd; static int rx_cnt_map_fd;
static int redirect_err_cnt_map_fd; static int redirect_err_cnt_map_fd;
...@@ -170,7 +170,7 @@ struct stats_record { ...@@ -170,7 +170,7 @@ struct stats_record {
struct record redir_err; struct record redir_err;
struct record kthread; struct record kthread;
struct record exception; struct record exception;
struct record enq[MAX_CPUS]; struct record enq[];
}; };
static bool map_collect_percpu(int fd, __u32 key, struct record *rec) static bool map_collect_percpu(int fd, __u32 key, struct record *rec)
...@@ -225,10 +225,11 @@ static struct datarec *alloc_record_per_cpu(void) ...@@ -225,10 +225,11 @@ static struct datarec *alloc_record_per_cpu(void)
static struct stats_record *alloc_stats_record(void) static struct stats_record *alloc_stats_record(void)
{ {
struct stats_record *rec; struct stats_record *rec;
int i; int i, size;
rec = malloc(sizeof(*rec)); size = sizeof(*rec) + n_cpus * sizeof(struct record);
memset(rec, 0, sizeof(*rec)); rec = calloc(1, size);
if (!rec) { if (!rec) {
fprintf(stderr, "Mem alloc error\n"); fprintf(stderr, "Mem alloc error\n");
exit(EXIT_FAIL_MEM); exit(EXIT_FAIL_MEM);
...@@ -237,7 +238,7 @@ static struct stats_record *alloc_stats_record(void) ...@@ -237,7 +238,7 @@ static struct stats_record *alloc_stats_record(void)
rec->redir_err.cpu = alloc_record_per_cpu(); rec->redir_err.cpu = alloc_record_per_cpu();
rec->kthread.cpu = alloc_record_per_cpu(); rec->kthread.cpu = alloc_record_per_cpu();
rec->exception.cpu = alloc_record_per_cpu(); rec->exception.cpu = alloc_record_per_cpu();
for (i = 0; i < MAX_CPUS; i++) for (i = 0; i < n_cpus; i++)
rec->enq[i].cpu = alloc_record_per_cpu(); rec->enq[i].cpu = alloc_record_per_cpu();
return rec; return rec;
...@@ -247,7 +248,7 @@ static void free_stats_record(struct stats_record *r) ...@@ -247,7 +248,7 @@ static void free_stats_record(struct stats_record *r)
{ {
int i; int i;
for (i = 0; i < MAX_CPUS; i++) for (i = 0; i < n_cpus; i++)
free(r->enq[i].cpu); free(r->enq[i].cpu);
free(r->exception.cpu); free(r->exception.cpu);
free(r->kthread.cpu); free(r->kthread.cpu);
...@@ -350,7 +351,7 @@ static void stats_print(struct stats_record *stats_rec, ...@@ -350,7 +351,7 @@ static void stats_print(struct stats_record *stats_rec,
} }
/* cpumap enqueue stats */ /* cpumap enqueue stats */
for (to_cpu = 0; to_cpu < MAX_CPUS; to_cpu++) { for (to_cpu = 0; to_cpu < n_cpus; to_cpu++) {
char *fmt = "%-15s %3d:%-3d %'-14.0f %'-11.0f %'-10.2f %s\n"; char *fmt = "%-15s %3d:%-3d %'-14.0f %'-11.0f %'-10.2f %s\n";
char *fm2 = "%-15s %3s:%-3d %'-14.0f %'-11.0f %'-10.2f %s\n"; char *fm2 = "%-15s %3s:%-3d %'-14.0f %'-11.0f %'-10.2f %s\n";
char *errstr = ""; char *errstr = "";
...@@ -475,7 +476,7 @@ static void stats_collect(struct stats_record *rec) ...@@ -475,7 +476,7 @@ static void stats_collect(struct stats_record *rec)
map_collect_percpu(fd, 1, &rec->redir_err); map_collect_percpu(fd, 1, &rec->redir_err);
fd = cpumap_enqueue_cnt_map_fd; fd = cpumap_enqueue_cnt_map_fd;
for (i = 0; i < MAX_CPUS; i++) for (i = 0; i < n_cpus; i++)
map_collect_percpu(fd, i, &rec->enq[i]); map_collect_percpu(fd, i, &rec->enq[i]);
fd = cpumap_kthread_cnt_map_fd; fd = cpumap_kthread_cnt_map_fd;
...@@ -549,10 +550,10 @@ static int create_cpu_entry(__u32 cpu, __u32 queue_size, ...@@ -549,10 +550,10 @@ static int create_cpu_entry(__u32 cpu, __u32 queue_size,
*/ */
static void mark_cpus_unavailable(void) static void mark_cpus_unavailable(void)
{ {
__u32 invalid_cpu = MAX_CPUS; __u32 invalid_cpu = n_cpus;
int ret, i; int ret, i;
for (i = 0; i < MAX_CPUS; i++) { for (i = 0; i < n_cpus; i++) {
ret = bpf_map_update_elem(cpus_available_map_fd, &i, ret = bpf_map_update_elem(cpus_available_map_fd, &i,
&invalid_cpu, 0); &invalid_cpu, 0);
if (ret) { if (ret) {
...@@ -688,6 +689,8 @@ int main(int argc, char **argv) ...@@ -688,6 +689,8 @@ int main(int argc, char **argv)
int prog_fd; int prog_fd;
__u32 qsize; __u32 qsize;
n_cpus = get_nprocs_conf();
/* Notice: choosing the queue size is very important with the /* Notice: choosing the queue size is very important with the
* ixgbe driver, because its driver page recycling trick is * ixgbe driver, because its driver page recycling trick is
* dependent on pages being returned quickly. The number of * dependent on pages being returned quickly. The number of
...@@ -757,7 +760,7 @@ int main(int argc, char **argv) ...@@ -757,7 +760,7 @@ int main(int argc, char **argv)
case 'c': case 'c':
/* Add multiple CPUs */ /* Add multiple CPUs */
add_cpu = strtoul(optarg, NULL, 0); add_cpu = strtoul(optarg, NULL, 0);
if (add_cpu >= MAX_CPUS) { if (add_cpu >= n_cpus) {
fprintf(stderr, fprintf(stderr,
"--cpu nr too large for cpumap err(%d):%s\n", "--cpu nr too large for cpumap err(%d):%s\n",
errno, strerror(errno)); errno, strerror(errno));
......
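With MAX_CPUS gone, the sample sizes its records at runtime: struct stats_record now ends in a C99 flexible array member and the allocation size is computed from get_nprocs_conf(). A standalone sketch of the pattern, with the struct trimmed to one fixed member (note the NULL check before the memset):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/sysinfo.h>

struct record { unsigned long processed; };

struct stats_record {
	struct record kthread;    /* fixed members come first */
	struct record enq[];      /* flexible array member, sized at runtime */
};

int main(void)
{
	int n_cpus = get_nprocs_conf();    /* configured CPUs, as in the sample */
	size_t size = sizeof(struct stats_record) +
		      n_cpus * sizeof(struct record);
	struct stats_record *rec = malloc(size);

	if (!rec)
		return 1;
	memset(rec, 0, size);
	printf("%d CPUs -> %zu bytes\n", n_cpus, size);
	free(rec);
	return 0;
}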
...@@ -318,6 +318,11 @@ may be interested in: ...@@ -318,6 +318,11 @@ may be interested in:
of eBPF maps are used with a given helper function. of eBPF maps are used with a given helper function.
* *kernel/bpf/* directory contains other files in which additional helpers are * *kernel/bpf/* directory contains other files in which additional helpers are
defined (for cgroups, sockmaps, etc.). defined (for cgroups, sockmaps, etc.).
* The bpftool utility can be used to probe the availability of helper functions
on the system (as well as supported program and map types, and a number of
other parameters). To do so, run **bpftool feature probe** (see
**bpftool-feature**\ (8) for details). Add the **unprivileged** keyword to
list features available to unprivileged users.
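For example (output omitted), the two probes described above might be run as::

    # bpftool feature probe
    # bpftool feature probe unprivileged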
Compatibility between helper functions and program types can generally be found Compatibility between helper functions and program types can generally be found
in the files where helper functions are defined. Look for the **struct in the files where helper functions are defined. Look for the **struct
...@@ -338,6 +343,7 @@ SEE ALSO ...@@ -338,6 +343,7 @@ SEE ALSO
======== ========
**bpf**\ (2), **bpf**\ (2),
**bpftool**\ (8),
**cgroups**\ (7), **cgroups**\ (7),
**ip**\ (8), **ip**\ (8),
**perf_event_open**\ (2), **perf_event_open**\ (2),
...@@ -414,6 +420,7 @@ class PrinterHelpers(Printer): ...@@ -414,6 +420,7 @@ class PrinterHelpers(Printer):
'struct sk_reuseport_md', 'struct sk_reuseport_md',
'struct sockaddr', 'struct sockaddr',
'struct tcphdr', 'struct tcphdr',
'struct seq_file',
'struct __sk_buff', 'struct __sk_buff',
'struct sk_msg_md', 'struct sk_msg_md',
...@@ -450,6 +457,7 @@ class PrinterHelpers(Printer): ...@@ -450,6 +457,7 @@ class PrinterHelpers(Printer):
'struct sk_reuseport_md', 'struct sk_reuseport_md',
'struct sockaddr', 'struct sockaddr',
'struct tcphdr', 'struct tcphdr',
'struct seq_file',
} }
mapped_types = { mapped_types = {
'u8': '__u8', 'u8': '__u8',
......
...@@ -27,9 +27,9 @@ ...@@ -27,9 +27,9 @@
"audit_control", "setfcap" "audit_control", "setfcap"
#define COMMON_CAP2_PERMS "mac_override", "mac_admin", "syslog", \ #define COMMON_CAP2_PERMS "mac_override", "mac_admin", "syslog", \
"wake_alarm", "block_suspend", "audit_read" "wake_alarm", "block_suspend", "audit_read", "perfmon"
#if CAP_LAST_CAP > CAP_AUDIT_READ #if CAP_LAST_CAP > CAP_PERFMON
#error New capability defined, please update COMMON_CAP2_PERMS. #error New capability defined, please update COMMON_CAP2_PERMS.
#endif #endif
......
...@@ -230,9 +230,14 @@ SEE ALSO ...@@ -230,9 +230,14 @@ SEE ALSO
**bpf**\ (2), **bpf**\ (2),
**bpf-helpers**\ (7), **bpf-helpers**\ (7),
**bpftool**\ (8), **bpftool**\ (8),
**bpftool-map**\ (8), **bpftool-btf**\ (8),
**bpftool-prog**\ (8),
**bpftool-cgroup**\ (8), **bpftool-cgroup**\ (8),
**bpftool-feature**\ (8), **bpftool-feature**\ (8),
**bpftool-gen**\ (8),
**bpftool-iter**\ (8),
**bpftool-link**\ (8),
**bpftool-map**\ (8),
**bpftool-net**\ (8), **bpftool-net**\ (8),
**bpftool-perf**\ (8) **bpftool-perf**\ (8),
**bpftool-prog**\ (8),
**bpftool-struct_ops**\ (8)
...@@ -20,7 +20,7 @@ SYNOPSIS ...@@ -20,7 +20,7 @@ SYNOPSIS
CGROUP COMMANDS CGROUP COMMANDS
=============== ===============
| **bpftool** **cgroup { show | list }** *CGROUP* [**effective**] | **bpftool** **cgroup** { **show** | **list** } *CGROUP* [**effective**]
| **bpftool** **cgroup tree** [*CGROUP_ROOT*] [**effective**] | **bpftool** **cgroup tree** [*CGROUP_ROOT*] [**effective**]
| **bpftool** **cgroup attach** *CGROUP* *ATTACH_TYPE* *PROG* [*ATTACH_FLAGS*] | **bpftool** **cgroup attach** *CGROUP* *ATTACH_TYPE* *PROG* [*ATTACH_FLAGS*]
| **bpftool** **cgroup detach** *CGROUP* *ATTACH_TYPE* *PROG* | **bpftool** **cgroup detach** *CGROUP* *ATTACH_TYPE* *PROG*
...@@ -160,9 +160,13 @@ SEE ALSO ...@@ -160,9 +160,13 @@ SEE ALSO
**bpf**\ (2), **bpf**\ (2),
**bpf-helpers**\ (7), **bpf-helpers**\ (7),
**bpftool**\ (8), **bpftool**\ (8),
**bpftool-prog**\ (8), **bpftool-btf**\ (8),
**bpftool-map**\ (8),
**bpftool-feature**\ (8), **bpftool-feature**\ (8),
**bpftool-gen**\ (8),
**bpftool-iter**\ (8),
**bpftool-link**\ (8),
**bpftool-map**\ (8),
**bpftool-net**\ (8), **bpftool-net**\ (8),
**bpftool-perf**\ (8), **bpftool-perf**\ (8),
**bpftool-btf**\ (8) **bpftool-prog**\ (8),
**bpftool-struct_ops**\ (8)
...@@ -28,7 +28,7 @@ DESCRIPTION ...@@ -28,7 +28,7 @@ DESCRIPTION
=========== ===========
**bpftool feature probe** [**kernel**] [**full**] [**macros** [**prefix** *PREFIX*]] **bpftool feature probe** [**kernel**] [**full**] [**macros** [**prefix** *PREFIX*]]
Probe the running kernel and dump a number of eBPF-related Probe the running kernel and dump a number of eBPF-related
parameters, such as availability of the **bpf()** system call, parameters, such as availability of the **bpf**\ () system call,
JIT status, eBPF program types availability, eBPF helper JIT status, eBPF program types availability, eBPF helper
functions availability, and more. functions availability, and more.
...@@ -93,9 +93,13 @@ SEE ALSO ...@@ -93,9 +93,13 @@ SEE ALSO
**bpf**\ (2), **bpf**\ (2),
**bpf-helpers**\ (7), **bpf-helpers**\ (7),
**bpftool**\ (8), **bpftool**\ (8),
**bpftool-prog**\ (8), **bpftool-btf**\ (8),
**bpftool-map**\ (8),
**bpftool-cgroup**\ (8), **bpftool-cgroup**\ (8),
**bpftool-gen**\ (8),
**bpftool-iter**\ (8),
**bpftool-link**\ (8),
**bpftool-map**\ (8),
**bpftool-net**\ (8), **bpftool-net**\ (8),
**bpftool-perf**\ (8), **bpftool-perf**\ (8),
**bpftool-btf**\ (8) **bpftool-prog**\ (8),
**bpftool-struct_ops**\ (8)
...@@ -14,7 +14,7 @@ SYNOPSIS ...@@ -14,7 +14,7 @@ SYNOPSIS
*OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] } *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] }
*COMMAND* := { **skeleton | **help** } *COMMAND* := { **skeleton** | **help** }
GEN COMMANDS GEN COMMANDS
============= =============
...@@ -36,12 +36,12 @@ DESCRIPTION ...@@ -36,12 +36,12 @@ DESCRIPTION
etc. Skeleton eliminates the need to look up the mentioned etc. Skeleton eliminates the need to look up the mentioned
components by name. Instead, if skeleton instantiation components by name. Instead, if skeleton instantiation
succeeds, they are populated in skeleton structure as valid succeeds, they are populated in skeleton structure as valid
libbpf types (e.g., struct bpf_map pointer) and can be libbpf types (e.g., **struct bpf_map** pointer) and can be
passed to existing generic libbpf APIs. passed to existing generic libbpf APIs.
In addition to simple and reliable access to maps and In addition to simple and reliable access to maps and
programs, skeleton provides storage for BPF links (struct programs, skeleton provides storage for BPF links (**struct
bpf_link) for each BPF program within the BPF object. When bpf_link**) for each BPF program within the BPF object. When
requested, supported BPF programs will be automatically requested, supported BPF programs will be automatically
attached and resulting BPF links stored for further use by attached and resulting BPF links stored for further use by
user in pre-allocated fields in skeleton struct. For BPF user in pre-allocated fields in skeleton struct. For BPF
...@@ -82,14 +82,14 @@ DESCRIPTION ...@@ -82,14 +82,14 @@ DESCRIPTION
- **example__open** and **example__open_opts**. - **example__open** and **example__open_opts**.
These functions are used to instantiate the skeleton. They These functions are used to instantiate the skeleton. They
correspond to libbpf's **bpf_object__open()** API. correspond to libbpf's **bpf_object__open**\ () API.
**_opts** variants accept extra **bpf_object_open_opts** **_opts** variants accept extra **bpf_object_open_opts**
options. options.
- **example__load**. - **example__load**.
This function creates maps, loads and verifies BPF This function creates maps, loads and verifies BPF
programs, initializes global data maps. It corresponds to programs, initializes global data maps. It corresponds to
libbpf's **bpf_object__load** API. libbpf's **bpf_object__load**\ () API.
- **example__open_and_load** combines **example__open** and - **example__open_and_load** combines **example__open** and
**example__load** invocations in one commonly used **example__load** invocations in one commonly used
...@@ -296,10 +296,13 @@ SEE ALSO ...@@ -296,10 +296,13 @@ SEE ALSO
**bpf**\ (2), **bpf**\ (2),
**bpf-helpers**\ (7), **bpf-helpers**\ (7),
**bpftool**\ (8), **bpftool**\ (8),
**bpftool-map**\ (8), **bpftool-btf**\ (8),
**bpftool-prog**\ (8),
**bpftool-cgroup**\ (8), **bpftool-cgroup**\ (8),
**bpftool-feature**\ (8), **bpftool-feature**\ (8),
**bpftool-iter**\ (8),
**bpftool-link**\ (8),
**bpftool-map**\ (8),
**bpftool-net**\ (8), **bpftool-net**\ (8),
**bpftool-perf**\ (8), **bpftool-perf**\ (8),
**bpftool-btf**\ (8) **bpftool-prog**\ (8),
**bpftool-struct_ops**\ (8)
============
bpftool-iter
============
-------------------------------------------------------------------------------
tool to create BPF iterators
-------------------------------------------------------------------------------
:Manual section: 8
SYNOPSIS
========
**bpftool** [*OPTIONS*] **iter** *COMMAND*
*COMMANDS* := { **pin** | **help** }
ITER COMMANDS
===================
| **bpftool** **iter pin** *OBJ* *PATH*
| **bpftool** **iter help**
|
| *OBJ* := /a/file/of/bpf_iter_target.o
DESCRIPTION
===========
**bpftool iter pin** *OBJ* *PATH*
A bpf iterator combines a kernel iterator over particular
kernel data (e.g., tasks, bpf_maps, etc.) with a bpf program
called for each kernel data object (e.g., one task, one
bpf_map, etc.). User space can *read* the kernel iterator
output through the *read()* syscall.
The *pin* command creates a bpf iterator from *OBJ*,
and pins it to *PATH*. The *PATH* should be located
in a *bpffs* mount. It must not contain a dot
character ('.'), which is reserved for future extensions
of *bpffs*.
The user can then *cat PATH* to see the bpf iterator output.
**bpftool iter help**
Print short help message.
OPTIONS
=======
-h, --help
Print short generic help message (similar to **bpftool help**).
-V, --version
Print version number (similar to **bpftool version**).
-d, --debug
Print all logs available, even debug-level information. This
includes logs from libbpf as well as from the verifier, when
attempting to load programs.
EXAMPLES
========
**# bpftool iter pin bpf_iter_netlink.o /sys/fs/bpf/my_netlink**
::
Create a file-based bpf iterator from bpf_iter_netlink.o and pin it
to /sys/fs/bpf/my_netlink
SEE ALSO
========
**bpf**\ (2),
**bpf-helpers**\ (7),
**bpftool**\ (8),
**bpftool-btf**\ (8),
**bpftool-cgroup**\ (8),
**bpftool-feature**\ (8),
**bpftool-gen**\ (8),
**bpftool-link**\ (8),
**bpftool-map**\ (8),
**bpftool-net**\ (8),
**bpftool-perf**\ (8),
**bpftool-prog**\ (8),
**bpftool-struct_ops**\ (8)
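To complement the pin example above, here is a minimal user-space reader for a pinned iterator; the pin path matches the EXAMPLES section and is otherwise an assumption:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	ssize_t n;
	int fd = open("/sys/fs/bpf/my_netlink", O_RDONLY);  /* pinned iterator */

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* each read() re-runs the iterator program over the kernel objects */
	while ((n = read(fd, buf, sizeof(buf))) > 0)
		fwrite(buf, 1, n, stdout);
	close(fd);
	return 0;
}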
...@@ -109,10 +109,13 @@ SEE ALSO ...@@ -109,10 +109,13 @@ SEE ALSO
**bpf**\ (2), **bpf**\ (2),
**bpf-helpers**\ (7), **bpf-helpers**\ (7),
**bpftool**\ (8), **bpftool**\ (8),
**bpftool-prog\ (8), **bpftool-btf**\ (8),
**bpftool-map**\ (8),
**bpftool-cgroup**\ (8), **bpftool-cgroup**\ (8),
**bpftool-feature**\ (8), **bpftool-feature**\ (8),
**bpftool-gen**\ (8),
**bpftool-iter**\ (8),
**bpftool-map**\ (8),
**bpftool-net**\ (8), **bpftool-net**\ (8),
**bpftool-perf**\ (8), **bpftool-perf**\ (8),
**bpftool-btf**\ (8) **bpftool-prog**\ (8),
**bpftool-struct_ops**\ (8)
...@@ -21,7 +21,7 @@ SYNOPSIS ...@@ -21,7 +21,7 @@ SYNOPSIS
MAP COMMANDS MAP COMMANDS
============= =============
| **bpftool** **map { show | list }** [*MAP*] | **bpftool** **map** { **show** | **list** } [*MAP*]
| **bpftool** **map create** *FILE* **type** *TYPE* **key** *KEY_SIZE* **value** *VALUE_SIZE* \ | **bpftool** **map create** *FILE* **type** *TYPE* **key** *KEY_SIZE* **value** *VALUE_SIZE* \
| **entries** *MAX_ENTRIES* **name** *NAME* [**flags** *FLAGS*] [**dev** *NAME*] | **entries** *MAX_ENTRIES* **name** *NAME* [**flags** *FLAGS*] [**dev** *NAME*]
| **bpftool** **map dump** *MAP* | **bpftool** **map dump** *MAP*
...@@ -49,7 +49,7 @@ MAP COMMANDS ...@@ -49,7 +49,7 @@ MAP COMMANDS
| | **lru_percpu_hash** | **lpm_trie** | **array_of_maps** | **hash_of_maps** | | **lru_percpu_hash** | **lpm_trie** | **array_of_maps** | **hash_of_maps**
| | **devmap** | **devmap_hash** | **sockmap** | **cpumap** | **xskmap** | **sockhash** | | **devmap** | **devmap_hash** | **sockmap** | **cpumap** | **xskmap** | **sockhash**
| | **cgroup_storage** | **reuseport_sockarray** | **percpu_cgroup_storage** | | **cgroup_storage** | **reuseport_sockarray** | **percpu_cgroup_storage**
| | **queue** | **stack** } | | **queue** | **stack** | **sk_storage** | **struct_ops** }
DESCRIPTION DESCRIPTION
=========== ===========
...@@ -66,6 +66,13 @@ DESCRIPTION ...@@ -66,6 +66,13 @@ DESCRIPTION
Create a new map with given parameters and pin it to *bpffs* Create a new map with given parameters and pin it to *bpffs*
as *FILE*. as *FILE*.
*FLAGS* should be an integer which is the combination of
desired flags, e.g. 1024 for **BPF_F_MMAPABLE** (see bpf.h
UAPI header for existing flags).
Keyword **dev** expects a network interface name, and is used
to request hardware offload for the map.
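To make the flags arithmetic concrete: the 1024 above is just **BPF_F_MMAPABLE**, i.e. (1U << 10), and several flags combine by bitwise OR. A hedged fragment, assuming a uapi bpf.h that defines these flags:

#include <linux/bpf.h>
#include <stdio.h>

int main(void)
{
	/* BPF_F_MMAPABLE is (1U << 10) == 1024; BPF_F_NO_PREALLOC is 1 */
	unsigned int flags = BPF_F_MMAPABLE | BPF_F_NO_PREALLOC;

	printf("%u\n", flags);   /* prints 1025 */
	return 0;
}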
**bpftool map dump** *MAP* **bpftool map dump** *MAP*
Dump all entries in a given *MAP*. In case of **name**, Dump all entries in a given *MAP*. In case of **name**,
*MAP* may match several maps which will all be dumped. *MAP* may match several maps which will all be dumped.
...@@ -78,7 +85,7 @@ DESCRIPTION ...@@ -78,7 +85,7 @@ DESCRIPTION
exists; **noexist** update only if entry doesn't exist. exists; **noexist** update only if entry doesn't exist.
If the **hex** keyword is provided in front of the bytes If the **hex** keyword is provided in front of the bytes
sequence, the bytes are parsed as hexadeximal values, even if sequence, the bytes are parsed as hexadecimal values, even if
no "0x" prefix is added. If the keyword is not provided, then no "0x" prefix is added. If the keyword is not provided, then
the bytes are parsed as decimal values, unless a "0x" prefix the bytes are parsed as decimal values, unless a "0x" prefix
(for hexadecimal) or a "0" prefix (for octal) is provided. (for hexadecimal) or a "0" prefix (for octal) is provided.
...@@ -100,10 +107,10 @@ DESCRIPTION ...@@ -100,10 +107,10 @@ DESCRIPTION
extensions of *bpffs*. extensions of *bpffs*.
**bpftool** **map event_pipe** *MAP* [**cpu** *N* **index** *M*] **bpftool** **map event_pipe** *MAP* [**cpu** *N* **index** *M*]
Read events from a BPF_MAP_TYPE_PERF_EVENT_ARRAY map. Read events from a **BPF_MAP_TYPE_PERF_EVENT_ARRAY** map.
Install perf rings into a perf event array map and dump Install perf rings into a perf event array map and dump
output of any bpf_perf_event_output() call in the kernel. output of any **bpf_perf_event_output**\ () call in the kernel.
By default read the number of CPUs on the system and By default read the number of CPUs on the system and
install perf ring for each CPU in the corresponding index install perf ring for each CPU in the corresponding index
in the array. in the array.
...@@ -116,24 +123,24 @@ DESCRIPTION ...@@ -116,24 +123,24 @@ DESCRIPTION
receiving events if it installed its rings earlier. receiving events if it installed its rings earlier.
**bpftool map peek** *MAP* **bpftool map peek** *MAP*
Peek next **value** in the queue or stack. Peek next value in the queue or stack.
**bpftool map push** *MAP* **value** *VALUE* **bpftool map push** *MAP* **value** *VALUE*
Push **value** onto the stack. Push *VALUE* onto the stack.
**bpftool map pop** *MAP* **bpftool map pop** *MAP*
Pop and print **value** from the stack. Pop and print value from the stack.
**bpftool map enqueue** *MAP* **value** *VALUE* **bpftool map enqueue** *MAP* **value** *VALUE*
Enqueue **value** into the queue. Enqueue *VALUE* into the queue.
**bpftool map dequeue** *MAP* **bpftool map dequeue** *MAP*
Dequeue and print **value** from the queue. Dequeue and print value from the queue.
**bpftool map freeze** *MAP* **bpftool map freeze** *MAP*
Freeze the map as read-only from user space. Entries from a Freeze the map as read-only from user space. Entries from a
frozen map can no longer be updated or deleted with the frozen map can no longer be updated or deleted with the
**bpf\ ()** system call. This operation is not reversible, **bpf**\ () system call. This operation is not reversible,
and the map remains immutable from user space until its and the map remains immutable from user space until its
destruction. However, read and write permissions for BPF destruction. However, read and write permissions for BPF
programs to the map remain unchanged. programs to the map remain unchanged.
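The same freeze operation is reachable from C through libbpf's **bpf_map_freeze**\ () wrapper around the **bpf**\ () **BPF_MAP_FREEZE** command; a hedged sketch, with a made-up pin path:

#include <stdio.h>
#include <bpf/bpf.h>

int main(void)
{
	int fd = bpf_obj_get("/sys/fs/bpf/my_map");  /* assumed pinned map */

	if (fd < 0) {
		perror("bpf_obj_get");
		return 1;
	}
	if (bpf_map_freeze(fd))   /* same effect as `bpftool map freeze` */
		perror("bpf_map_freeze");
	return 0;
}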
...@@ -269,9 +276,13 @@ SEE ALSO ...@@ -269,9 +276,13 @@ SEE ALSO
**bpf**\ (2), **bpf**\ (2),
**bpf-helpers**\ (7), **bpf-helpers**\ (7),
**bpftool**\ (8), **bpftool**\ (8),
**bpftool-prog**\ (8), **bpftool-btf**\ (8),
**bpftool-cgroup**\ (8), **bpftool-cgroup**\ (8),
**bpftool-feature**\ (8), **bpftool-feature**\ (8),
**bpftool-gen**\ (8),
**bpftool-iter**\ (8),
**bpftool-link**\ (8),
**bpftool-net**\ (8), **bpftool-net**\ (8),
**bpftool-perf**\ (8), **bpftool-perf**\ (8),
**bpftool-btf**\ (8) **bpftool-prog**\ (8),
**bpftool-struct_ops**\ (8)
...@@ -20,7 +20,7 @@ SYNOPSIS ...@@ -20,7 +20,7 @@ SYNOPSIS
NET COMMANDS NET COMMANDS
============ ============
| **bpftool** **net { show | list }** [ **dev** *NAME* ] | **bpftool** **net** { **show** | **list** } [ **dev** *NAME* ]
| **bpftool** **net attach** *ATTACH_TYPE* *PROG* **dev** *NAME* [ **overwrite** ] | **bpftool** **net attach** *ATTACH_TYPE* *PROG* **dev** *NAME* [ **overwrite** ]
| **bpftool** **net detach** *ATTACH_TYPE* **dev** *NAME* | **bpftool** **net detach** *ATTACH_TYPE* **dev** *NAME*
| **bpftool** **net help** | **bpftool** **net help**
...@@ -194,9 +194,13 @@ SEE ALSO ...@@ -194,9 +194,13 @@ SEE ALSO
**bpf**\ (2), **bpf**\ (2),
**bpf-helpers**\ (7), **bpf-helpers**\ (7),
**bpftool**\ (8), **bpftool**\ (8),
**bpftool-prog**\ (8), **bpftool-btf**\ (8),
**bpftool-map**\ (8),
**bpftool-cgroup**\ (8), **bpftool-cgroup**\ (8),
**bpftool-feature**\ (8), **bpftool-feature**\ (8),
**bpftool-gen**\ (8),
**bpftool-iter**\ (8),
**bpftool-link**\ (8),
**bpftool-map**\ (8),
**bpftool-perf**\ (8), **bpftool-perf**\ (8),
**bpftool-btf**\ (8) **bpftool-prog**\ (8),
**bpftool-struct_ops**\ (8)
...@@ -20,7 +20,7 @@ SYNOPSIS ...@@ -20,7 +20,7 @@ SYNOPSIS
PERF COMMANDS PERF COMMANDS
============= =============
| **bpftool** **perf { show | list }** | **bpftool** **perf** { **show** | **list** }
| **bpftool** **perf help** | **bpftool** **perf help**
DESCRIPTION DESCRIPTION
...@@ -85,9 +85,13 @@ SEE ALSO ...@@ -85,9 +85,13 @@ SEE ALSO
**bpf**\ (2), **bpf**\ (2),
**bpf-helpers**\ (7), **bpf-helpers**\ (7),
**bpftool**\ (8), **bpftool**\ (8),
**bpftool-prog**\ (8), **bpftool-btf**\ (8),
**bpftool-map**\ (8),
**bpftool-cgroup**\ (8), **bpftool-cgroup**\ (8),
**bpftool-feature**\ (8), **bpftool-feature**\ (8),
**bpftool-gen**\ (8),
**bpftool-iter**\ (8),
**bpftool-link**\ (8),
**bpftool-map**\ (8),
**bpftool-net**\ (8), **bpftool-net**\ (8),
**bpftool-btf**\ (8) **bpftool-prog**\ (8),
**bpftool-struct_ops**\ (8)
...@@ -21,11 +21,11 @@ SYNOPSIS ...@@ -21,11 +21,11 @@ SYNOPSIS
PROG COMMANDS PROG COMMANDS
============= =============
| **bpftool** **prog { show | list }** [*PROG*] | **bpftool** **prog** { **show** | **list** } [*PROG*]
| **bpftool** **prog dump xlated** *PROG* [{**file** *FILE* | **opcodes** | **visual** | **linum**}] | **bpftool** **prog dump xlated** *PROG* [{**file** *FILE* | **opcodes** | **visual** | **linum**}]
| **bpftool** **prog dump jited** *PROG* [{**file** *FILE* | **opcodes** | **linum**}] | **bpftool** **prog dump jited** *PROG* [{**file** *FILE* | **opcodes** | **linum**}]
| **bpftool** **prog pin** *PROG* *FILE* | **bpftool** **prog pin** *PROG* *FILE*
| **bpftool** **prog { load | loadall }** *OBJ* *PATH* [**type** *TYPE*] [**map** {**idx** *IDX* | **name** *NAME*} *MAP*] [**dev** *NAME*] [**pinmaps** *MAP_DIR*] | **bpftool** **prog** { **load** | **loadall** } *OBJ* *PATH* [**type** *TYPE*] [**map** {**idx** *IDX* | **name** *NAME*} *MAP*] [**dev** *NAME*] [**pinmaps** *MAP_DIR*]
| **bpftool** **prog attach** *PROG* *ATTACH_TYPE* [*MAP*] | **bpftool** **prog attach** *PROG* *ATTACH_TYPE* [*MAP*]
| **bpftool** **prog detach** *PROG* *ATTACH_TYPE* [*MAP*] | **bpftool** **prog detach** *PROG* *ATTACH_TYPE* [*MAP*]
| **bpftool** **prog tracelog** | **bpftool** **prog tracelog**
...@@ -49,7 +49,7 @@ PROG COMMANDS ...@@ -49,7 +49,7 @@ PROG COMMANDS
| *ATTACH_TYPE* := { | *ATTACH_TYPE* := {
| **msg_verdict** | **stream_verdict** | **stream_parser** | **flow_dissector** | **msg_verdict** | **stream_verdict** | **stream_parser** | **flow_dissector**
| } | }
| *METRIC* := { | *METRICs* := {
| **cycles** | **instructions** | **l1d_loads** | **llc_misses** | **cycles** | **instructions** | **l1d_loads** | **llc_misses**
| } | }
...@@ -155,7 +155,7 @@ DESCRIPTION ...@@ -155,7 +155,7 @@ DESCRIPTION
**bpftool prog tracelog** **bpftool prog tracelog**
Dump the trace pipe of the system to the console (stdout). Dump the trace pipe of the system to the console (stdout).
Hit <Ctrl+C> to stop printing. BPF programs can write to this Hit <Ctrl+C> to stop printing. BPF programs can write to this
trace pipe at runtime with the **bpf_trace_printk()** helper. trace pipe at runtime with the **bpf_trace_printk**\ () helper.
This should be used only for debugging purposes. For This should be used only for debugging purposes. For
streaming data from BPF programs to user space, one can use streaming data from BPF programs to user space, one can use
perf events (see also **bpftool-map**\ (8)). perf events (see also **bpftool-map**\ (8)).
...@@ -195,9 +195,9 @@ DESCRIPTION ...@@ -195,9 +195,9 @@ DESCRIPTION
**bpftool prog profile** *PROG* [**duration** *DURATION*] *METRICs* **bpftool prog profile** *PROG* [**duration** *DURATION*] *METRICs*
Profile *METRICs* for bpf program *PROG* for *DURATION* Profile *METRICs* for bpf program *PROG* for *DURATION*
seconds or until user hits Ctrl-C. *DURATION* is optional. seconds or until user hits <Ctrl+C>. *DURATION* is optional.
If *DURATION* is not specified, the profiling will run up to If *DURATION* is not specified, the profiling will run up to
UINT_MAX seconds. **UINT_MAX** seconds.
**bpftool prog help** **bpftool prog help**
Print short help message. Print short help message.
...@@ -267,7 +267,7 @@ EXAMPLES ...@@ -267,7 +267,7 @@ EXAMPLES
| |
| **# bpftool prog dump xlated id 10 file /tmp/t** | **# bpftool prog dump xlated id 10 file /tmp/t**
| **# ls -l /tmp/t** | **$ ls -l /tmp/t**
:: ::
...@@ -325,6 +325,7 @@ EXAMPLES ...@@ -325,6 +325,7 @@ EXAMPLES
| **# bpftool prog profile id 337 duration 10 cycles instructions llc_misses** | **# bpftool prog profile id 337 duration 10 cycles instructions llc_misses**
:: ::
51397 run_cnt 51397 run_cnt
40176203 cycles (83.05%) 40176203 cycles (83.05%)
42518139 instructions # 1.06 insns per cycle (83.39%) 42518139 instructions # 1.06 insns per cycle (83.39%)
...@@ -335,9 +336,13 @@ SEE ALSO ...@@ -335,9 +336,13 @@ SEE ALSO
**bpf**\ (2), **bpf**\ (2),
**bpf-helpers**\ (7), **bpf-helpers**\ (7),
**bpftool**\ (8), **bpftool**\ (8),
**bpftool-map**\ (8), **bpftool-btf**\ (8),
**bpftool-cgroup**\ (8), **bpftool-cgroup**\ (8),
**bpftool-feature**\ (8), **bpftool-feature**\ (8),
**bpftool-gen**\ (8),
**bpftool-iter**\ (8),
**bpftool-link**\ (8),
**bpftool-map**\ (8),
**bpftool-net**\ (8), **bpftool-net**\ (8),
**bpftool-perf**\ (8), **bpftool-perf**\ (8),
**bpftool-btf**\ (8) **bpftool-struct_ops**\ (8)
...@@ -105,12 +105,13 @@ SEE ALSO ...@@ -105,12 +105,13 @@ SEE ALSO
**bpf**\ (2), **bpf**\ (2),
**bpf-helpers**\ (7), **bpf-helpers**\ (7),
**bpftool**\ (8), **bpftool**\ (8),
**bpftool-prog**\ (8), **bpftool-btf**\ (8),
**bpftool-map**\ (8),
**bpftool-cgroup**\ (8), **bpftool-cgroup**\ (8),
**bpftool-feature**\ (8), **bpftool-feature**\ (8),
**bpftool-gen**\ (8),
**bpftool-iter**\ (8),
**bpftool-link**\ (8),
**bpftool-map**\ (8),
**bpftool-net**\ (8), **bpftool-net**\ (8),
**bpftool-perf**\ (8), **bpftool-perf**\ (8),
**bpftool-btf**\ (8) **bpftool-prog**\ (8)
**bpftool-gen**\ (8)
...@@ -75,11 +75,14 @@ SEE ALSO ...@@ -75,11 +75,14 @@ SEE ALSO
======== ========
**bpf**\ (2), **bpf**\ (2),
**bpf-helpers**\ (7), **bpf-helpers**\ (7),
**bpftool-prog**\ (8), **bpftool-btf**\ (8),
**bpftool-map**\ (8),
**bpftool-cgroup**\ (8), **bpftool-cgroup**\ (8),
**bpftool-feature**\ (8), **bpftool-feature**\ (8),
**bpftool-gen**\ (8),
**bpftool-iter**\ (8),
**bpftool-link**\ (8),
**bpftool-map**\ (8),
**bpftool-net**\ (8), **bpftool-net**\ (8),
**bpftool-perf**\ (8), **bpftool-perf**\ (8),
**bpftool-btf**\ (8), **bpftool-prog**\ (8),
**bpftool-gen**\ (8), **bpftool-struct_ops**\ (8)
...@@ -610,6 +610,19 @@ _bpftool() ...@@ -610,6 +610,19 @@ _bpftool()
;; ;;
esac esac
;; ;;
iter)
case $command in
pin)
_filedir
return 0
;;
*)
[[ $prev == $object ]] && \
COMPREPLY=( $( compgen -W 'pin help' \
-- "$cur" ) )
;;
esac
;;
map) map)
local MAP_TYPE='id pinned name' local MAP_TYPE='id pinned name'
case $command in case $command in
......
...@@ -271,8 +271,8 @@ static void btf_int128_print(json_writer_t *jw, const void *data, ...@@ -271,8 +271,8 @@ static void btf_int128_print(json_writer_t *jw, const void *data,
} }
} }
static void btf_int128_shift(__u64 *print_num, u16 left_shift_bits, static void btf_int128_shift(__u64 *print_num, __u16 left_shift_bits,
u16 right_shift_bits) __u16 right_shift_bits)
{ {
__u64 upper_num, lower_num; __u64 upper_num, lower_num;
......
...@@ -157,7 +157,7 @@ static bool cfg_partition_funcs(struct cfg *cfg, struct bpf_insn *cur, ...@@ -157,7 +157,7 @@ static bool cfg_partition_funcs(struct cfg *cfg, struct bpf_insn *cur,
return false; return false;
} }
static bool is_jmp_insn(u8 code) static bool is_jmp_insn(__u8 code)
{ {
return BPF_CLASS(code) == BPF_JMP || BPF_CLASS(code) == BPF_JMP32; return BPF_CLASS(code) == BPF_JMP || BPF_CLASS(code) == BPF_JMP32;
} }
...@@ -176,7 +176,7 @@ static bool func_partition_bb_head(struct func_node *func) ...@@ -176,7 +176,7 @@ static bool func_partition_bb_head(struct func_node *func)
for (; cur <= end; cur++) { for (; cur <= end; cur++) {
if (is_jmp_insn(cur->code)) { if (is_jmp_insn(cur->code)) {
u8 opcode = BPF_OP(cur->code); __u8 opcode = BPF_OP(cur->code);
if (opcode == BPF_EXIT || opcode == BPF_CALL) if (opcode == BPF_EXIT || opcode == BPF_CALL)
continue; continue;
......
// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
// Copyright (C) 2020 Facebook
#define _GNU_SOURCE
#include <linux/err.h>
#include <bpf/libbpf.h>
#include "main.h"
static int do_pin(int argc, char **argv)
{
const char *objfile, *path;
struct bpf_program *prog;
struct bpf_object *obj;
struct bpf_link *link;
int err;
if (!REQ_ARGS(2))
usage();
objfile = GET_ARG();
path = GET_ARG();
obj = bpf_object__open(objfile);
if (IS_ERR(obj)) {
p_err("can't open objfile %s", objfile);
return -1;
}
err = bpf_object__load(obj);
if (err) {
p_err("can't load objfile %s", objfile);
goto close_obj;
}
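/* pin the first (and typically only) program found in the object */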
prog = bpf_program__next(NULL, obj);
if (!prog) {
p_err("can't find bpf program in objfile %s", objfile);
err = -1;
goto close_obj;
}
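/* attach the program as an iterator; NULL selects default attach options */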
link = bpf_program__attach_iter(prog, NULL);
if (IS_ERR(link)) {
err = PTR_ERR(link);
p_err("attach_iter failed for program %s",
bpf_program__name(prog));
goto close_obj;
}
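/* make sure the pin path sits on a bpffs mount before pinning */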
err = mount_bpffs_for_pin(path);
if (err)
goto close_link;
err = bpf_link__pin(link, path);
if (err) {
p_err("pin_iter failed for program %s to path %s",
bpf_program__name(prog), path);
goto close_link;
}
close_link:
bpf_link__destroy(link);
close_obj:
bpf_object__close(obj);
return err;
}
static int do_help(int argc, char **argv)
{
fprintf(stderr,
"Usage: %s %s pin OBJ PATH\n"
" %s %s help\n"
"\n",
bin_name, argv[-2], bin_name, argv[-2]);
return 0;
}
static const struct cmd cmds[] = {
{ "help", do_help },
{ "pin", do_pin },
{ 0 }
};
int do_iter(int argc, char **argv)
{
return cmd_select(cmds, argc, argv, do_help);
}
...@@ -16,6 +16,7 @@ static const char * const link_type_name[] = { ...@@ -16,6 +16,7 @@ static const char * const link_type_name[] = {
[BPF_LINK_TYPE_RAW_TRACEPOINT] = "raw_tracepoint", [BPF_LINK_TYPE_RAW_TRACEPOINT] = "raw_tracepoint",
[BPF_LINK_TYPE_TRACING] = "tracing", [BPF_LINK_TYPE_TRACING] = "tracing",
[BPF_LINK_TYPE_CGROUP] = "cgroup", [BPF_LINK_TYPE_CGROUP] = "cgroup",
[BPF_LINK_TYPE_ITER] = "iter",
}; };
static int link_parse_fd(int *argc, char ***argv) static int link_parse_fd(int *argc, char ***argv)
......
...@@ -59,7 +59,7 @@ static int do_help(int argc, char **argv) ...@@ -59,7 +59,7 @@ static int do_help(int argc, char **argv)
" %s batch file FILE\n" " %s batch file FILE\n"
" %s version\n" " %s version\n"
"\n" "\n"
" OBJECT := { prog | map | link | cgroup | perf | net | feature | btf | gen | struct_ops }\n" " OBJECT := { prog | map | link | cgroup | perf | net | feature | btf | gen | struct_ops | iter }\n"
" " HELP_SPEC_OPTIONS "\n" " " HELP_SPEC_OPTIONS "\n"
"", "",
bin_name, bin_name, bin_name); bin_name, bin_name, bin_name);
...@@ -224,6 +224,7 @@ static const struct cmd cmds[] = { ...@@ -224,6 +224,7 @@ static const struct cmd cmds[] = {
{ "btf", do_btf }, { "btf", do_btf },
{ "gen", do_gen }, { "gen", do_gen },
{ "struct_ops", do_struct_ops }, { "struct_ops", do_struct_ops },
{ "iter", do_iter },
{ "version", do_version }, { "version", do_version },
{ 0 } { 0 }
}; };
......
...@@ -18,6 +18,9 @@ ...@@ -18,6 +18,9 @@
#include "json_writer.h" #include "json_writer.h"
/* Make sure we do not use kernel-only integer typedefs */
#pragma GCC poison u8 u16 u32 u64 s8 s16 s32 s64
#define ptr_to_u64(ptr) ((__u64)(unsigned long)(ptr)) #define ptr_to_u64(ptr) ((__u64)(unsigned long)(ptr))
#define NEXT_ARG() ({ argc--; argv++; if (argc < 0) usage(); }) #define NEXT_ARG() ({ argc--; argv++; if (argc < 0) usage(); })
...@@ -199,6 +202,7 @@ int do_feature(int argc, char **argv); ...@@ -199,6 +202,7 @@ int do_feature(int argc, char **argv);
int do_btf(int argc, char **argv); int do_btf(int argc, char **argv);
int do_gen(int argc, char **argv); int do_gen(int argc, char **argv);
int do_struct_ops(int argc, char **argv); int do_struct_ops(int argc, char **argv);
int do_iter(int argc, char **argv);
int parse_u32_arg(int *argc, char ***argv, __u32 *val, const char *what); int parse_u32_arg(int *argc, char ***argv, __u32 *val, const char *what);
int prog_parse_fd(int *argc, char ***argv); int prog_parse_fd(int *argc, char ***argv);
......
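The poison pragma above is what forced the u8/u16/u32 conversions to __u8/__u16/__u32 earlier in this series: any later use of a poisoned identifier is a hard compile error. A standalone illustration:

#include <linux/types.h>

#pragma GCC poison u8 u16 u32 u64 s8 s16 s32 s64

__u32 ok = 1;                /* UAPI-style type: still allowed */
/* u32 bad = 1; */           /* would fail: "attempt to use poisoned \"u32\"" */

int main(void)
{
	return ok ? 0 : 1;
}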
...@@ -1589,7 +1589,8 @@ static int do_help(int argc, char **argv) ...@@ -1589,7 +1589,8 @@ static int do_help(int argc, char **argv)
" percpu_array | stack_trace | cgroup_array | lru_hash |\n" " percpu_array | stack_trace | cgroup_array | lru_hash |\n"
" lru_percpu_hash | lpm_trie | array_of_maps | hash_of_maps |\n" " lru_percpu_hash | lpm_trie | array_of_maps | hash_of_maps |\n"
" devmap | devmap_hash | sockmap | cpumap | xskmap | sockhash |\n" " devmap | devmap_hash | sockmap | cpumap | xskmap | sockhash |\n"
" cgroup_storage | reuseport_sockarray | percpu_cgroup_storage }\n" " cgroup_storage | reuseport_sockarray | percpu_cgroup_storage |\n"
" queue | stack | sk_storage | struct_ops }\n"
" " HELP_SPEC_OPTIONS "\n" " " HELP_SPEC_OPTIONS "\n"
"", "",
bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2],
......
...@@ -39,7 +39,7 @@ struct event_ring_info { ...@@ -39,7 +39,7 @@ struct event_ring_info {
struct perf_event_sample { struct perf_event_sample {
struct perf_event_header header; struct perf_event_header header;
u64 time; __u64 time;
__u32 size; __u32 size;
unsigned char data[]; unsigned char data[];
}; };
......
...@@ -238,7 +238,7 @@ int prog_parse_fd(int *argc, char ***argv) ...@@ -238,7 +238,7 @@ int prog_parse_fd(int *argc, char ***argv)
return fd; return fd;
} }
static void show_prog_maps(int fd, u32 num_maps) static void show_prog_maps(int fd, __u32 num_maps)
{ {
struct bpf_prog_info info = {}; struct bpf_prog_info info = {};
__u32 len = sizeof(info); __u32 len = sizeof(info);
......
...@@ -8,7 +8,8 @@ BPFTOOL ?= $(DEFAULT_BPFTOOL) ...@@ -8,7 +8,8 @@ BPFTOOL ?= $(DEFAULT_BPFTOOL)
LIBBPF_SRC := $(abspath ../../lib/bpf) LIBBPF_SRC := $(abspath ../../lib/bpf)
BPFOBJ := $(OUTPUT)/libbpf.a BPFOBJ := $(OUTPUT)/libbpf.a
BPF_INCLUDE := $(OUTPUT) BPF_INCLUDE := $(OUTPUT)
INCLUDES := -I$(OUTPUT) -I$(BPF_INCLUDE) -I$(abspath ../../lib) INCLUDES := -I$(OUTPUT) -I$(BPF_INCLUDE) -I$(abspath ../../lib) \
-I$(abspath ../../include/uapi)
CFLAGS := -g -Wall CFLAGS := -g -Wall
# Try to detect best kernel BTF source # Try to detect best kernel BTF source
......
...@@ -116,6 +116,7 @@ enum bpf_cmd { ...@@ -116,6 +116,7 @@ enum bpf_cmd {
BPF_LINK_GET_FD_BY_ID, BPF_LINK_GET_FD_BY_ID,
BPF_LINK_GET_NEXT_ID, BPF_LINK_GET_NEXT_ID,
BPF_ENABLE_STATS, BPF_ENABLE_STATS,
BPF_ITER_CREATE,
}; };
enum bpf_map_type { enum bpf_map_type {
...@@ -218,6 +219,7 @@ enum bpf_attach_type { ...@@ -218,6 +219,7 @@ enum bpf_attach_type {
BPF_TRACE_FEXIT, BPF_TRACE_FEXIT,
BPF_MODIFY_RETURN, BPF_MODIFY_RETURN,
BPF_LSM_MAC, BPF_LSM_MAC,
BPF_TRACE_ITER,
__MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE
}; };
...@@ -228,6 +230,7 @@ enum bpf_link_type { ...@@ -228,6 +230,7 @@ enum bpf_link_type {
BPF_LINK_TYPE_RAW_TRACEPOINT = 1, BPF_LINK_TYPE_RAW_TRACEPOINT = 1,
BPF_LINK_TYPE_TRACING = 2, BPF_LINK_TYPE_TRACING = 2,
BPF_LINK_TYPE_CGROUP = 3, BPF_LINK_TYPE_CGROUP = 3,
BPF_LINK_TYPE_ITER = 4,
MAX_BPF_LINK_TYPE, MAX_BPF_LINK_TYPE,
}; };
...@@ -612,6 +615,11 @@ union bpf_attr { ...@@ -612,6 +615,11 @@ union bpf_attr {
__u32 type; __u32 type;
} enable_stats; } enable_stats;
struct { /* struct used by BPF_ITER_CREATE command */
__u32 link_fd;
__u32 flags;
} iter_create;
} __attribute__((aligned(8))); } __attribute__((aligned(8)));
/* The description below is an attempt at providing documentation to eBPF /* The description below is an attempt at providing documentation to eBPF
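A hedged fragment showing how the new command would be driven directly through the **bpf**\ () syscall; it assumes uapi headers that already carry **BPF_ITER_CREATE** and a *link_fd* obtained by attaching a **BPF_TRACE_ITER** program:

#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

static int bpf_iter_create_fd(int link_fd)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.iter_create.link_fd = link_fd;
	attr.iter_create.flags = 0;

	/* returns a readable iterator fd on success, -1 with errno on failure */
	return syscall(__NR_bpf, BPF_ITER_CREATE, &attr, sizeof(attr));
}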
...@@ -667,8 +675,8 @@ union bpf_attr { ...@@ -667,8 +675,8 @@ union bpf_attr {
* For tracing programs, safely attempt to read *size* bytes from * For tracing programs, safely attempt to read *size* bytes from
* kernel space address *unsafe_ptr* and store the data in *dst*. * kernel space address *unsafe_ptr* and store the data in *dst*.
* *
* Generally, use bpf_probe_read_user() or bpf_probe_read_kernel() * Generally, use **bpf_probe_read_user**\ () or
* instead. * **bpf_probe_read_kernel**\ () instead.
* Return * Return
* 0 on success, or a negative error in case of failure. * 0 on success, or a negative error in case of failure.
* *
...@@ -676,7 +684,7 @@ union bpf_attr { ...@@ -676,7 +684,7 @@ union bpf_attr {
* Description * Description
* Return the time elapsed since system boot, in nanoseconds. * Return the time elapsed since system boot, in nanoseconds.
* Does not include time the system was suspended. * Does not include time the system was suspended.
* See: clock_gettime(CLOCK_MONOTONIC) * See: **clock_gettime**\ (**CLOCK_MONOTONIC**)
* Return * Return
* Current *ktime*. * Current *ktime*.
* *
...@@ -1535,11 +1543,11 @@ union bpf_attr { ...@@ -1535,11 +1543,11 @@ union bpf_attr {
* int bpf_probe_read_str(void *dst, u32 size, const void *unsafe_ptr) * int bpf_probe_read_str(void *dst, u32 size, const void *unsafe_ptr)
* Description * Description
* Copy a NUL terminated string from an unsafe kernel address * Copy a NUL terminated string from an unsafe kernel address
* *unsafe_ptr* to *dst*. See bpf_probe_read_kernel_str() for * *unsafe_ptr* to *dst*. See **bpf_probe_read_kernel_str**\ () for
* more details. * more details.
* *
* Generally, use bpf_probe_read_user_str() or bpf_probe_read_kernel_str() * Generally, use **bpf_probe_read_user_str**\ () or
* instead. * **bpf_probe_read_kernel_str**\ () instead.
* Return * Return
* On success, the strictly positive length of the string, * On success, the strictly positive length of the string,
* including the trailing NUL character. On error, a negative * including the trailing NUL character. On error, a negative
...@@ -1567,7 +1575,7 @@ union bpf_attr { ...@@ -1567,7 +1575,7 @@ union bpf_attr {
* *
* u64 bpf_get_socket_cookie(struct bpf_sock_ops *ctx) * u64 bpf_get_socket_cookie(struct bpf_sock_ops *ctx)
* Description * Description
* Equivalent to bpf_get_socket_cookie() helper that accepts * Equivalent to **bpf_get_socket_cookie**\ () helper that accepts
* *skb*, but gets socket from **struct bpf_sock_ops** context. * *skb*, but gets socket from **struct bpf_sock_ops** context.
* Return * Return
* A 8-byte long non-decreasing number. * A 8-byte long non-decreasing number.
...@@ -1596,6 +1604,7 @@ union bpf_attr { ...@@ -1596,6 +1604,7 @@ union bpf_attr {
* The option value of length *optlen* is pointed by *optval*. * The option value of length *optlen* is pointed by *optval*.
* *
* *bpf_socket* should be one of the following: * *bpf_socket* should be one of the following:
*
* * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**.
* * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**
* and **BPF_CGROUP_INET6_CONNECT**. * and **BPF_CGROUP_INET6_CONNECT**.
...@@ -1664,12 +1673,12 @@ union bpf_attr { ...@@ -1664,12 +1673,12 @@ union bpf_attr {
* *
* The lower two bits of *flags* are used as the return code if * The lower two bits of *flags* are used as the return code if
* the map lookup fails. This is so that the return value can be * the map lookup fails. This is so that the return value can be
* one of the XDP program return codes up to XDP_TX, as chosen by * one of the XDP program return codes up to **XDP_TX**, as chosen
* the caller. Any higher bits in the *flags* argument must be * by the caller. Any higher bits in the *flags* argument must be
* unset. * unset.
* *
* See also bpf_redirect(), which only supports redirecting to an * See also **bpf_redirect**\ (), which only supports redirecting
* ifindex, but doesn't require a map to do so. * to an ifindex, but doesn't require a map to do so.
* Return * Return
* **XDP_REDIRECT** on success, or the value of the two lower bits * **XDP_REDIRECT** on success, or the value of the two lower bits
* of the *flags* argument on error. * of the *flags* argument on error.
...@@ -1777,7 +1786,7 @@ union bpf_attr { ...@@ -1777,7 +1786,7 @@ union bpf_attr {
* the time running for event since last normalization. The * the time running for event since last normalization. The
* enabled and running times are accumulated since the perf event * enabled and running times are accumulated since the perf event
* open. To achieve scaling factor between two invocations of an * open. To achieve scaling factor between two invocations of an
* eBPF program, users can can use CPU id as the key (which is * eBPF program, users can use CPU id as the key (which is
* typical for perf array usage model) to remember the previous * typical for perf array usage model) to remember the previous
* value and do the calculation inside the eBPF program. * value and do the calculation inside the eBPF program.
* Return * Return
...@@ -1804,6 +1813,7 @@ union bpf_attr { ...@@ -1804,6 +1813,7 @@ union bpf_attr {
* *opval* and of length *optlen*. * *opval* and of length *optlen*.
* *
* *bpf_socket* should be one of the following: * *bpf_socket* should be one of the following:
*
* * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**.
* * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**
* and **BPF_CGROUP_INET6_CONNECT**. * and **BPF_CGROUP_INET6_CONNECT**.
...@@ -1825,7 +1835,7 @@ union bpf_attr { ...@@ -1825,7 +1835,7 @@ union bpf_attr {
* The first argument is the context *regs* on which the kprobe * The first argument is the context *regs* on which the kprobe
* works. * works.
* *
* This helper works by setting setting the PC (program counter) * This helper works by setting the PC (program counter)
* to an override function which is run in place of the original * to an override function which is run in place of the original
* probed function. This means the probed function is not run at * probed function. This means the probed function is not run at
* all. The replacement function just returns with the required * all. The replacement function just returns with the required
...@@ -1994,10 +2004,11 @@ union bpf_attr { ...@@ -1994,10 +2004,11 @@ union bpf_attr {
* *
* This helper works for IPv4 and IPv6, TCP and UDP sockets. The * This helper works for IPv4 and IPv6, TCP and UDP sockets. The
* domain (*addr*\ **->sa_family**) must be **AF_INET** (or * domain (*addr*\ **->sa_family**) must be **AF_INET** (or
* **AF_INET6**). Looking for a free port to bind to can be * **AF_INET6**). It's advised to pass zero port (**sin_port**
* expensive, therefore binding to port is not permitted by the * or **sin6_port**) which triggers IP_BIND_ADDRESS_NO_PORT-like
* helper: *addr*\ **->sin_port** (or **sin6_port**, respectively) * behavior and lets the kernel efficiently pick up an unused
* must be set to zero. * port as long as 4-tuple is unique. Passing non-zero port might
* lead to degraded performance.
* Return * Return
* 0 on success, or a negative error in case of failure. * 0 on success, or a negative error in case of failure.
* *
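To make the zero-port advice concrete, a hedged connect4-hook fragment in the style of the kernel selftests; the section name, address, and permissive return value are illustrative:

#include <sys/socket.h>
#include <linux/bpf.h>
#include <linux/in.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

SEC("cgroup/connect4")
int connect_v4(struct bpf_sock_addr *ctx)
{
	struct sockaddr_in sa = {};

	sa.sin_family = AF_INET;
	sa.sin_port = 0;                            /* let the kernel pick */
	sa.sin_addr.s_addr = bpf_htonl(0x7f000001); /* 127.0.0.1, made up */

	/* zero port triggers the IP_BIND_ADDRESS_NO_PORT-like behavior */
	bpf_bind(ctx, (struct sockaddr *)&sa, sizeof(sa));
	return 1;
}

char _license[] SEC("license") = "GPL";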
...@@ -2291,7 +2302,7 @@ union bpf_attr { ...@@ -2291,7 +2302,7 @@ union bpf_attr {
* **bpf_rc_keydown**\ () again with the same values, or calling * **bpf_rc_keydown**\ () again with the same values, or calling
* **bpf_rc_repeat**\ (). * **bpf_rc_repeat**\ ().
* *
* Some protocols include a toggle bit, in case the button was * Some protocols include a toggle bit, in case the button was
* released and pressed again between consecutive scancodes. * released and pressed again between consecutive scancodes.
* *
* The *ctx* should point to the lirc sample as passed into * The *ctx* should point to the lirc sample as passed into
...@@ -2637,7 +2648,6 @@ union bpf_attr { ...@@ -2637,7 +2648,6 @@ union bpf_attr {
* *
* *th* points to the start of the TCP header, while *th_len* * *th* points to the start of the TCP header, while *th_len*
* contains **sizeof**\ (**struct tcphdr**). * contains **sizeof**\ (**struct tcphdr**).
*
* Return * Return
* 0 if *iph* and *th* are a valid SYN cookie ACK, or a negative * 0 if *iph* and *th* are a valid SYN cookie ACK, or a negative
* error otherwise. * error otherwise.
...@@ -2820,7 +2830,6 @@ union bpf_attr { ...@@ -2820,7 +2830,6 @@ union bpf_attr {
* *
* *th* points to the start of the TCP header, while *th_len* * *th* points to the start of the TCP header, while *th_len*
* contains the length of the TCP header. * contains the length of the TCP header.
*
* Return * Return
* On success, lower 32 bits hold the generated SYN cookie in * On success, lower 32 bits hold the generated SYN cookie in
* followed by 16 bits which hold the MSS value for that cookie, * followed by 16 bits which hold the MSS value for that cookie,
...@@ -2903,7 +2912,7 @@ union bpf_attr { ...@@ -2903,7 +2912,7 @@ union bpf_attr {
* // size, after checking its boundaries. * // size, after checking its boundaries.
* } * }
* *
* In comparison, using **bpf_probe_read_user()** helper here * In comparison, using **bpf_probe_read_user**\ () helper here
* instead to read the string would require to estimate the length * instead to read the string would require to estimate the length
* at compile time, and would often result in copying more memory * at compile time, and would often result in copying more memory
* than necessary. * than necessary.
...@@ -2921,14 +2930,14 @@ union bpf_attr { ...@@ -2921,14 +2930,14 @@ union bpf_attr {
* int bpf_probe_read_kernel_str(void *dst, u32 size, const void *unsafe_ptr) * int bpf_probe_read_kernel_str(void *dst, u32 size, const void *unsafe_ptr)
* Description * Description
* Copy a NUL terminated string from an unsafe kernel address *unsafe_ptr* * Copy a NUL terminated string from an unsafe kernel address *unsafe_ptr*
* to *dst*. Same semantics as with bpf_probe_read_user_str() apply. * to *dst*. Same semantics as with **bpf_probe_read_user_str**\ () apply.
* Return * Return
* On success, the strictly positive length of the string, including * On success, the strictly positive length of the string, including
* the trailing NUL character. On error, a negative value. * the trailing NUL character. On error, a negative value.
* *
* int bpf_tcp_send_ack(void *tp, u32 rcv_nxt) * int bpf_tcp_send_ack(void *tp, u32 rcv_nxt)
* Description * Description
* Send out a tcp-ack. *tp* is the in-kernel struct tcp_sock. * Send out a tcp-ack. *tp* is the in-kernel struct **tcp_sock**.
* *rcv_nxt* is the ack_seq to be sent out. * *rcv_nxt* is the ack_seq to be sent out.
* Return * Return
* 0 on success, or a negative error in case of failure. * 0 on success, or a negative error in case of failure.
...@@ -2956,19 +2965,19 @@ union bpf_attr { ...@@ -2956,19 +2965,19 @@ union bpf_attr {
* int bpf_read_branch_records(struct bpf_perf_event_data *ctx, void *buf, u32 size, u64 flags) * int bpf_read_branch_records(struct bpf_perf_event_data *ctx, void *buf, u32 size, u64 flags)
* Description * Description
* For an eBPF program attached to a perf event, retrieve the * For an eBPF program attached to a perf event, retrieve the
* branch records (struct perf_branch_entry) associated to *ctx* * branch records (**struct perf_branch_entry**) associated to *ctx*
* and store it in the buffer pointed by *buf* up to size * and store it in the buffer pointed by *buf* up to size
* *size* bytes. * *size* bytes.
* Return * Return
* On success, number of bytes written to *buf*. On error, a * On success, number of bytes written to *buf*. On error, a
* negative value. * negative value.
* *
* The *flags* can be set to **BPF_F_GET_BRANCH_RECORDS_SIZE** to * The *flags* can be set to **BPF_F_GET_BRANCH_RECORDS_SIZE** to
* instead return the number of bytes required to store all the * instead return the number of bytes required to store all the
* branch entries. If this flag is set, *buf* may be NULL. * branch entries. If this flag is set, *buf* may be NULL.
* *
* **-EINVAL** if arguments invalid or **size** not a multiple * **-EINVAL** if arguments invalid or **size** not a multiple
* of sizeof(struct perf_branch_entry). * of **sizeof**\ (**struct perf_branch_entry**\ ).
* *
* **-ENOENT** if architecture does not support branch records. * **-ENOENT** if architecture does not support branch records.
* *
...@@ -2976,8 +2985,8 @@ union bpf_attr { ...@@ -2976,8 +2985,8 @@ union bpf_attr {
* Description * Description
 * Obtain values for *pid* and *tgid* as seen from the current * Obtain values for *pid* and *tgid* as seen from the current
 * *namespace*; they will be returned in *nsdata*. * *namespace*; they will be returned in *nsdata*.
* * Return
* On failure, the returned value is one of the following: * 0 on success, or one of the following in case of failure:
* *
* **-EINVAL** if dev and inum supplied don't match dev_t and inode number * **-EINVAL** if dev and inum supplied don't match dev_t and inode number
* with nsfs of current task, or if dev conversion to dev_t lost high bits. * with nsfs of current task, or if dev conversion to dev_t lost high bits.
...@@ -3016,8 +3025,8 @@ union bpf_attr { ...@@ -3016,8 +3025,8 @@ union bpf_attr {
* a global identifier that can be assumed unique. If *ctx* is * a global identifier that can be assumed unique. If *ctx* is
* NULL, then the helper returns the cookie for the initial * NULL, then the helper returns the cookie for the initial
* network namespace. The cookie itself is very similar to that * network namespace. The cookie itself is very similar to that
* of bpf_get_socket_cookie() helper, but for network namespaces * of **bpf_get_socket_cookie**\ () helper, but for network
* instead of sockets. * namespaces instead of sockets.
* Return * Return
 * An 8-byte long opaque number. * An 8-byte long opaque number.
* *
...@@ -3052,22 +3061,98 @@ union bpf_attr { ...@@ -3052,22 +3061,98 @@ union bpf_attr {
* *
* The *flags* argument must be zero. * The *flags* argument must be zero.
* Return * Return
* 0 on success, or a negative errno in case of failure. * 0 on success, or a negative error in case of failure:
*
* **-EINVAL** if specified *flags* are not supported.
*
* **-ENOENT** if the socket is unavailable for assignment.
* *
* * **-EINVAL** Unsupported flags specified. * **-ENETUNREACH** if the socket is unreachable (wrong netns).
* * **-ENOENT** Socket is unavailable for assignment. *
* * **-ENETUNREACH** Socket is unreachable (wrong netns). * **-EOPNOTSUPP** if the operation is not supported, for example
* * **-EOPNOTSUPP** Unsupported operation, for example a * a call from outside of TC ingress.
* call from outside of TC ingress. *
* * **-ESOCKTNOSUPPORT** Socket type not supported (reuseport). * **-ESOCKTNOSUPPORT** if the socket type is not supported
* (reuseport).
* *
* u64 bpf_ktime_get_boot_ns(void) * u64 bpf_ktime_get_boot_ns(void)
* Description * Description
* Return the time elapsed since system boot, in nanoseconds. * Return the time elapsed since system boot, in nanoseconds.
* Does include the time the system was suspended. * Does include the time the system was suspended.
* See: clock_gettime(CLOCK_BOOTTIME) * See: **clock_gettime**\ (**CLOCK_BOOTTIME**)
* Return * Return
* Current *ktime*. * Current *ktime*.
*
* int bpf_seq_printf(struct seq_file *m, const char *fmt, u32 fmt_size, const void *data, u32 data_len)
* Description
* **bpf_seq_printf**\ () uses seq_file **seq_printf**\ () to print
* out the format string.
 * The *m* represents the seq_file. The *fmt* and *fmt_size* are for
 * the format string itself. The *data* and *data_len* are format string
 * arguments. The *data* is a **u64** array in which the corresponding
 * format string values are stored. For strings and pointers whose
 * pointees are accessed, only the pointer values are stored in the
 * *data* array. The *data_len* is the size of *data* in bytes.
*
 * The **%s** and **%p{i,I}{4,6}** formats require reading kernel
 * memory. Reading kernel memory may fail either because the address
 * is invalid or because servicing it would require a major memory
 * fault. If reading kernel memory fails, the string for **%s** will be
 * an empty string, and the ip address for **%p{i,I}{4,6}** will be 0.
 * Not returning an error to the bpf program is consistent with what
 * **bpf_trace_printk**\ () does for now.
* Return
* 0 on success, or a negative error in case of failure:
*
 * **-EBUSY** if the per-CPU memory copy buffer is busy; the bpf
 * program can retry by returning 1.
*
* **-EINVAL** if arguments are invalid, or if *fmt* is invalid/unsupported.
*
* **-E2BIG** if *fmt* contains too many format specifiers.
*
 * **-EOVERFLOW** if an overflow happened; the same object will be tried again.
*
* int bpf_seq_write(struct seq_file *m, const void *data, u32 len)
* Description
* **bpf_seq_write**\ () uses seq_file **seq_write**\ () to write the data.
 * The *m* represents the seq_file. *data* points to the data to
 * write, and *len* is its length in bytes.
* Return
* 0 on success, or a negative error in case of failure:
*
 * **-EOVERFLOW** if an overflow happened; the same object will be tried again.
*
* u64 bpf_sk_cgroup_id(struct bpf_sock *sk)
* Description
* Return the cgroup v2 id of the socket *sk*.
*
* *sk* must be a non-**NULL** pointer to a full socket, e.g. one
* returned from **bpf_sk_lookup_xxx**\ (),
 * **bpf_sk_fullsock**\ (), etc. The format of the returned id is
 * the same as in **bpf_skb_cgroup_id**\ ().
*
* This helper is available only if the kernel was compiled with
* the **CONFIG_SOCK_CGROUP_DATA** configuration option.
* Return
 * The id is returned, or 0 if the id could not be retrieved.
*
* u64 bpf_sk_ancestor_cgroup_id(struct bpf_sock *sk, int ancestor_level)
* Description
 * Return the id of the cgroup v2 ancestor, at *ancestor_level*,
 * of the cgroup associated with *sk*. The root cgroup is at
 * *ancestor_level* zero, and each step down the hierarchy
 * increments the level. If *ancestor_level* equals the level of
 * the cgroup associated with *sk*, the return value is the same
 * as that of **bpf_sk_cgroup_id**\ ().
*
 * The helper is useful for implementing policies based on
 * cgroups that are higher in the hierarchy than the immediate
 * cgroup associated with *sk*.
*
 * The format of the returned id and the helper's limitations are
 * the same as in **bpf_sk_cgroup_id**\ ().
* Return
 * The id is returned, or 0 if the id could not be retrieved.
*/ */
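As a rough sketch of how the two cgroup helpers documented above combine
(not part of this diff; ANCESTOR_LEVEL and ALLOWED_CGROUP_ID are
illustrative placeholders), a cgroup-skb program could admit only packets
from sockets under a known ancestor cgroup:

SEC("cgroup_skb/egress")
int allow_known_cgroup(struct __sk_buff *skb)
{
	struct bpf_sock *sk = skb->sk;

	if (!sk)
		return 0;	/* no socket: drop */
	sk = bpf_sk_fullsock(sk);
	if (!sk)
		return 0;
	/* returning 1 allows the packet, 0 drops it */
	return bpf_sk_ancestor_cgroup_id(sk, ANCESTOR_LEVEL) ==
	       ALLOWED_CGROUP_ID;
}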
#define __BPF_FUNC_MAPPER(FN) \ #define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \ FN(unspec), \
...@@ -3195,7 +3280,11 @@ union bpf_attr { ...@@ -3195,7 +3280,11 @@ union bpf_attr {
FN(get_netns_cookie), \ FN(get_netns_cookie), \
FN(get_current_ancestor_cgroup_id), \ FN(get_current_ancestor_cgroup_id), \
FN(sk_assign), \ FN(sk_assign), \
FN(ktime_get_boot_ns), FN(ktime_get_boot_ns), \
FN(seq_printf), \
FN(seq_write), \
FN(sk_cgroup_id), \
FN(sk_ancestor_cgroup_id),
/* integer value in 'imm' field of BPF_CALL instruction selects which helper /* integer value in 'imm' field of BPF_CALL instruction selects which helper
* function eBPF program intends to call * function eBPF program intends to call
...@@ -3673,7 +3762,7 @@ struct bpf_sock_addr { ...@@ -3673,7 +3762,7 @@ struct bpf_sock_addr {
__u32 user_ip6[4]; /* Allows 1,2,4,8-byte read and 4,8-byte write. __u32 user_ip6[4]; /* Allows 1,2,4,8-byte read and 4,8-byte write.
* Stored in network byte order. * Stored in network byte order.
*/ */
__u32 user_port; /* Allows 4-byte read and write. __u32 user_port; /* Allows 1,2,4-byte read and 4-byte write.
* Stored in network byte order * Stored in network byte order
*/ */
__u32 family; /* Allows 4-byte read, but no write */ __u32 family; /* Allows 4-byte read, but no write */
......
...@@ -619,6 +619,16 @@ int bpf_link_update(int link_fd, int new_prog_fd, ...@@ -619,6 +619,16 @@ int bpf_link_update(int link_fd, int new_prog_fd,
return sys_bpf(BPF_LINK_UPDATE, &attr, sizeof(attr)); return sys_bpf(BPF_LINK_UPDATE, &attr, sizeof(attr));
} }
int bpf_iter_create(int link_fd)
{
union bpf_attr attr;
memset(&attr, 0, sizeof(attr));
attr.iter_create.link_fd = link_fd;
return sys_bpf(BPF_ITER_CREATE, &attr, sizeof(attr));
}
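A minimal consumer of this API looks roughly as follows (a sketch, assuming
*prog* is an already-loaded program in an "iter/" section; error handling
trimmed):

	struct bpf_link *link;
	char buf[4096];
	int iter_fd, len;

	link = bpf_program__attach_iter(prog, NULL);
	if (IS_ERR(link))
		return PTR_ERR(link);
	iter_fd = bpf_iter_create(bpf_link__fd(link));
	/* each read() runs the iterator program until the walk completes */
	while (iter_fd >= 0 &&
	       (len = read(iter_fd, buf, sizeof(buf))) > 0)
		write(STDOUT_FILENO, buf, len);
	if (iter_fd >= 0)
		close(iter_fd);
	bpf_link__destroy(link);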
int bpf_prog_query(int target_fd, enum bpf_attach_type type, __u32 query_flags, int bpf_prog_query(int target_fd, enum bpf_attach_type type, __u32 query_flags,
__u32 *attach_flags, __u32 *prog_ids, __u32 *prog_cnt) __u32 *attach_flags, __u32 *prog_ids, __u32 *prog_cnt)
{ {
......
...@@ -187,6 +187,8 @@ struct bpf_link_update_opts { ...@@ -187,6 +187,8 @@ struct bpf_link_update_opts {
LIBBPF_API int bpf_link_update(int link_fd, int new_prog_fd, LIBBPF_API int bpf_link_update(int link_fd, int new_prog_fd,
const struct bpf_link_update_opts *opts); const struct bpf_link_update_opts *opts);
LIBBPF_API int bpf_iter_create(int link_fd);
struct bpf_prog_test_run_attr { struct bpf_prog_test_run_attr {
int prog_fd; int prog_fd;
int repeat; int repeat;
......
...@@ -36,6 +36,20 @@ ...@@ -36,6 +36,20 @@
#define __weak __attribute__((weak)) #define __weak __attribute__((weak))
#endif #endif
/*
 * Helper macros to manipulate data structures
*/
#ifndef offsetof
#define offsetof(TYPE, MEMBER) ((size_t)&((TYPE *)0)->MEMBER)
#endif
#ifndef container_of
#define container_of(ptr, type, member) \
({ \
void *__mptr = (void *)(ptr); \
((type *)(__mptr - offsetof(type, member))); \
})
#endif
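For reference, container_of() maps a pointer to an embedded member back to
its enclosing structure; a small illustration (the types are hypothetical):

	struct conn {
		int fd;
		struct list_head node;	/* embedded member */
	};

	/* given a struct list_head *n known to live inside a struct conn: */
	struct conn *c = container_of(n, struct conn, node);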
/* /*
* Helper structure used by eBPF C program * Helper structure used by eBPF C program
* to describe BPF map attributes to libbpf loader * to describe BPF map attributes to libbpf loader
......
...@@ -413,4 +413,20 @@ typeof(name(0)) name(struct pt_regs *ctx) \ ...@@ -413,4 +413,20 @@ typeof(name(0)) name(struct pt_regs *ctx) \
} \ } \
static __always_inline typeof(name(0)) ____##name(struct pt_regs *ctx, ##args) static __always_inline typeof(name(0)) ____##name(struct pt_regs *ctx, ##args)
/*
 * BPF_SEQ_PRINTF wraps bpf_seq_printf(): it gathers the to-be-printed
 * values into a u64 array before invoking the helper.
*/
#define BPF_SEQ_PRINTF(seq, fmt, args...) \
({ \
_Pragma("GCC diagnostic push") \
_Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \
static const char ___fmt[] = fmt; \
unsigned long long ___param[] = { args }; \
_Pragma("GCC diagnostic pop") \
int ___ret = bpf_seq_printf(seq, ___fmt, sizeof(___fmt), \
___param, sizeof(___param)); \
___ret; \
})
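Usage mirrors bpf_printk(); a sketch of an iterator program that prints one
line per task (section name and context type as introduced by the iterator
patches in this series):

	SEC("iter/task")
	int dump_task(struct bpf_iter__task *ctx)
	{
		struct seq_file *seq = ctx->meta->seq;
		struct task_struct *task = ctx->task;

		if (task == NULL)
			return 0;
		/* pid and comm are packed into a u64 array for bpf_seq_printf() */
		BPF_SEQ_PRINTF(seq, "%8d %s\n", task->pid, task->comm);
		return 0;
	}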
#endif #endif
...@@ -3237,7 +3237,7 @@ int bpf_map__resize(struct bpf_map *map, __u32 max_entries) ...@@ -3237,7 +3237,7 @@ int bpf_map__resize(struct bpf_map *map, __u32 max_entries)
} }
static int static int
bpf_object__probe_name(struct bpf_object *obj) bpf_object__probe_loading(struct bpf_object *obj)
{ {
struct bpf_load_program_attr attr; struct bpf_load_program_attr attr;
char *cp, errmsg[STRERR_BUFSIZE]; char *cp, errmsg[STRERR_BUFSIZE];
...@@ -3257,15 +3257,36 @@ bpf_object__probe_name(struct bpf_object *obj) ...@@ -3257,15 +3257,36 @@ bpf_object__probe_name(struct bpf_object *obj)
ret = bpf_load_program_xattr(&attr, NULL, 0); ret = bpf_load_program_xattr(&attr, NULL, 0);
if (ret < 0) { if (ret < 0) {
cp = libbpf_strerror_r(errno, errmsg, sizeof(errmsg)); ret = errno;
pr_warn("Error in %s():%s(%d). Couldn't load basic 'r0 = 0' BPF program.\n", cp = libbpf_strerror_r(ret, errmsg, sizeof(errmsg));
__func__, cp, errno); pr_warn("Error in %s():%s(%d). Couldn't load trivial BPF "
return -errno; "program. Make sure your kernel supports BPF "
"(CONFIG_BPF_SYSCALL=y) and/or that RLIMIT_MEMLOCK is "
"set to big enough value.\n", __func__, cp, ret);
return -ret;
} }
close(ret); close(ret);
/* now try the same program, but with the name */ return 0;
}
static int
bpf_object__probe_name(struct bpf_object *obj)
{
struct bpf_load_program_attr attr;
struct bpf_insn insns[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
};
int ret;
/* make sure loading with name works */
memset(&attr, 0, sizeof(attr));
attr.prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
attr.insns = insns;
attr.insns_cnt = ARRAY_SIZE(insns);
attr.license = "GPL";
attr.name = "test"; attr.name = "test";
ret = bpf_load_program_xattr(&attr, NULL, 0); ret = bpf_load_program_xattr(&attr, NULL, 0);
if (ret >= 0) { if (ret >= 0) {
...@@ -5636,7 +5657,8 @@ int bpf_object__load_xattr(struct bpf_object_load_attr *attr) ...@@ -5636,7 +5657,8 @@ int bpf_object__load_xattr(struct bpf_object_load_attr *attr)
obj->loaded = true; obj->loaded = true;
err = bpf_object__probe_caps(obj); err = bpf_object__probe_loading(obj);
err = err ? : bpf_object__probe_caps(obj);
err = err ? : bpf_object__resolve_externs(obj, obj->kconfig); err = err ? : bpf_object__resolve_externs(obj, obj->kconfig);
err = err ? : bpf_object__sanitize_and_load_btf(obj); err = err ? : bpf_object__sanitize_and_load_btf(obj);
err = err ? : bpf_object__sanitize_maps(obj); err = err ? : bpf_object__sanitize_maps(obj);
...@@ -6586,6 +6608,8 @@ static struct bpf_link *attach_trace(const struct bpf_sec_def *sec, ...@@ -6586,6 +6608,8 @@ static struct bpf_link *attach_trace(const struct bpf_sec_def *sec,
struct bpf_program *prog); struct bpf_program *prog);
static struct bpf_link *attach_lsm(const struct bpf_sec_def *sec, static struct bpf_link *attach_lsm(const struct bpf_sec_def *sec,
struct bpf_program *prog); struct bpf_program *prog);
static struct bpf_link *attach_iter(const struct bpf_sec_def *sec,
struct bpf_program *prog);
static const struct bpf_sec_def section_defs[] = { static const struct bpf_sec_def section_defs[] = {
BPF_PROG_SEC("socket", BPF_PROG_TYPE_SOCKET_FILTER), BPF_PROG_SEC("socket", BPF_PROG_TYPE_SOCKET_FILTER),
...@@ -6629,6 +6653,10 @@ static const struct bpf_sec_def section_defs[] = { ...@@ -6629,6 +6653,10 @@ static const struct bpf_sec_def section_defs[] = {
.is_attach_btf = true, .is_attach_btf = true,
.expected_attach_type = BPF_LSM_MAC, .expected_attach_type = BPF_LSM_MAC,
.attach_fn = attach_lsm), .attach_fn = attach_lsm),
SEC_DEF("iter/", TRACING,
.expected_attach_type = BPF_TRACE_ITER,
.is_attach_btf = true,
.attach_fn = attach_iter),
BPF_PROG_SEC("xdp", BPF_PROG_TYPE_XDP), BPF_PROG_SEC("xdp", BPF_PROG_TYPE_XDP),
BPF_PROG_SEC("perf_event", BPF_PROG_TYPE_PERF_EVENT), BPF_PROG_SEC("perf_event", BPF_PROG_TYPE_PERF_EVENT),
BPF_PROG_SEC("lwt_in", BPF_PROG_TYPE_LWT_IN), BPF_PROG_SEC("lwt_in", BPF_PROG_TYPE_LWT_IN),
...@@ -6891,6 +6919,7 @@ static int bpf_object__collect_st_ops_relos(struct bpf_object *obj, ...@@ -6891,6 +6919,7 @@ static int bpf_object__collect_st_ops_relos(struct bpf_object *obj,
#define BTF_TRACE_PREFIX "btf_trace_" #define BTF_TRACE_PREFIX "btf_trace_"
#define BTF_LSM_PREFIX "bpf_lsm_" #define BTF_LSM_PREFIX "bpf_lsm_"
#define BTF_ITER_PREFIX "bpf_iter_"
#define BTF_MAX_NAME_SIZE 128 #define BTF_MAX_NAME_SIZE 128
static int find_btf_by_prefix_kind(const struct btf *btf, const char *prefix, static int find_btf_by_prefix_kind(const struct btf *btf, const char *prefix,
...@@ -6921,6 +6950,9 @@ static inline int __find_vmlinux_btf_id(struct btf *btf, const char *name, ...@@ -6921,6 +6950,9 @@ static inline int __find_vmlinux_btf_id(struct btf *btf, const char *name,
else if (attach_type == BPF_LSM_MAC) else if (attach_type == BPF_LSM_MAC)
err = find_btf_by_prefix_kind(btf, BTF_LSM_PREFIX, name, err = find_btf_by_prefix_kind(btf, BTF_LSM_PREFIX, name,
BTF_KIND_FUNC); BTF_KIND_FUNC);
else if (attach_type == BPF_TRACE_ITER)
err = find_btf_by_prefix_kind(btf, BTF_ITER_PREFIX, name,
BTF_KIND_FUNC);
else else
err = btf__find_by_name_kind(btf, name, BTF_KIND_FUNC); err = btf__find_by_name_kind(btf, name, BTF_KIND_FUNC);
...@@ -7848,6 +7880,12 @@ static struct bpf_link *attach_lsm(const struct bpf_sec_def *sec, ...@@ -7848,6 +7880,12 @@ static struct bpf_link *attach_lsm(const struct bpf_sec_def *sec,
return bpf_program__attach_lsm(prog); return bpf_program__attach_lsm(prog);
} }
static struct bpf_link *attach_iter(const struct bpf_sec_def *sec,
struct bpf_program *prog)
{
return bpf_program__attach_iter(prog, NULL);
}
struct bpf_link * struct bpf_link *
bpf_program__attach_cgroup(struct bpf_program *prog, int cgroup_fd) bpf_program__attach_cgroup(struct bpf_program *prog, int cgroup_fd)
{ {
...@@ -7882,6 +7920,42 @@ bpf_program__attach_cgroup(struct bpf_program *prog, int cgroup_fd) ...@@ -7882,6 +7920,42 @@ bpf_program__attach_cgroup(struct bpf_program *prog, int cgroup_fd)
return link; return link;
} }
struct bpf_link *
bpf_program__attach_iter(struct bpf_program *prog,
const struct bpf_iter_attach_opts *opts)
{
char errmsg[STRERR_BUFSIZE];
struct bpf_link *link;
int prog_fd, link_fd;
if (!OPTS_VALID(opts, bpf_iter_attach_opts))
return ERR_PTR(-EINVAL);
prog_fd = bpf_program__fd(prog);
if (prog_fd < 0) {
pr_warn("program '%s': can't attach before loaded\n",
bpf_program__title(prog, false));
return ERR_PTR(-EINVAL);
}
link = calloc(1, sizeof(*link));
if (!link)
return ERR_PTR(-ENOMEM);
link->detach = &bpf_link__detach_fd;
link_fd = bpf_link_create(prog_fd, 0, BPF_TRACE_ITER, NULL);
if (link_fd < 0) {
link_fd = -errno;
free(link);
pr_warn("program '%s': failed to attach to iterator: %s\n",
bpf_program__title(prog, false),
libbpf_strerror_r(link_fd, errmsg, sizeof(errmsg)));
return ERR_PTR(link_fd);
}
link->fd = link_fd;
return link;
}
struct bpf_link *bpf_program__attach(struct bpf_program *prog) struct bpf_link *bpf_program__attach(struct bpf_program *prog)
{ {
const struct bpf_sec_def *sec_def; const struct bpf_sec_def *sec_def;
...@@ -8300,7 +8374,7 @@ static struct perf_buffer *__perf_buffer__new(int map_fd, size_t page_cnt, ...@@ -8300,7 +8374,7 @@ static struct perf_buffer *__perf_buffer__new(int map_fd, size_t page_cnt,
struct perf_sample_raw { struct perf_sample_raw {
struct perf_event_header header; struct perf_event_header header;
uint32_t size; uint32_t size;
char data[0]; char data[];
}; };
struct perf_sample_lost { struct perf_sample_lost {
......
...@@ -258,6 +258,15 @@ struct bpf_map; ...@@ -258,6 +258,15 @@ struct bpf_map;
LIBBPF_API struct bpf_link *bpf_map__attach_struct_ops(struct bpf_map *map); LIBBPF_API struct bpf_link *bpf_map__attach_struct_ops(struct bpf_map *map);
struct bpf_iter_attach_opts {
size_t sz; /* size of this struct for forward/backward compatibility */
};
#define bpf_iter_attach_opts__last_field sz
LIBBPF_API struct bpf_link *
bpf_program__attach_iter(struct bpf_program *prog,
const struct bpf_iter_attach_opts *opts);
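Following the usual libbpf opts convention, callers either pass NULL or
declare the struct with DECLARE_LIBBPF_OPTS() so that sz is filled in for
compatibility checking; a sketch:

	DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts);
	struct bpf_link *link;

	link = bpf_program__attach_iter(prog, &opts);
	if (IS_ERR(link))
		/* handle the error */;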
struct bpf_insn; struct bpf_insn;
/* /*
......
...@@ -258,6 +258,8 @@ LIBBPF_0.0.8 { ...@@ -258,6 +258,8 @@ LIBBPF_0.0.8 {
LIBBPF_0.0.9 { LIBBPF_0.0.9 {
global: global:
bpf_enable_stats; bpf_enable_stats;
bpf_iter_create;
bpf_link_get_fd_by_id; bpf_link_get_fd_by_id;
bpf_link_get_next_id; bpf_link_get_next_id;
bpf_program__attach_iter;
} LIBBPF_0.0.8; } LIBBPF_0.0.8;
...@@ -153,7 +153,7 @@ struct btf_ext_info_sec { ...@@ -153,7 +153,7 @@ struct btf_ext_info_sec {
__u32 sec_name_off; __u32 sec_name_off;
__u32 num_info; __u32 num_info;
/* Followed by num_info * record_size number of bytes */ /* Followed by num_info * record_size number of bytes */
__u8 data[0]; __u8 data[];
}; };
/* The minimum bpf_func_info checked by the loader */ /* The minimum bpf_func_info checked by the loader */
......
...@@ -686,8 +686,11 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx) ...@@ -686,8 +686,11 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
break; break;
} }
} }
if (child_pid != -1) if (child_pid != -1) {
if (timeout)
kill(child_pid, SIGTERM);
wait4(child_pid, &status, 0, &stat_config.ru_data); wait4(child_pid, &status, 0, &stat_config.ru_data);
}
if (workload_exec_errno) { if (workload_exec_errno) {
const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg)); const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
......
...@@ -1821,6 +1821,24 @@ static int symbol__disassemble_bpf(struct symbol *sym __maybe_unused, ...@@ -1821,6 +1821,24 @@ static int symbol__disassemble_bpf(struct symbol *sym __maybe_unused,
} }
#endif // defined(HAVE_LIBBFD_SUPPORT) && defined(HAVE_LIBBPF_SUPPORT) #endif // defined(HAVE_LIBBFD_SUPPORT) && defined(HAVE_LIBBPF_SUPPORT)
static int
symbol__disassemble_bpf_image(struct symbol *sym,
struct annotate_args *args)
{
struct annotation *notes = symbol__annotation(sym);
struct disasm_line *dl;
args->offset = -1;
args->line = strdup("to be implemented");
args->line_nr = 0;
dl = disasm_line__new(args);
if (dl)
annotation_line__add(&dl->al, &notes->src->source);
free(args->line);
return 0;
}
/* /*
* Possibly create a new version of line with tabs expanded. Returns the * Possibly create a new version of line with tabs expanded. Returns the
* existing or new line, storage is updated if a new line is allocated. If * existing or new line, storage is updated if a new line is allocated. If
...@@ -1920,6 +1938,8 @@ static int symbol__disassemble(struct symbol *sym, struct annotate_args *args) ...@@ -1920,6 +1938,8 @@ static int symbol__disassemble(struct symbol *sym, struct annotate_args *args)
if (dso->binary_type == DSO_BINARY_TYPE__BPF_PROG_INFO) { if (dso->binary_type == DSO_BINARY_TYPE__BPF_PROG_INFO) {
return symbol__disassemble_bpf(sym, args); return symbol__disassemble_bpf(sym, args);
} else if (dso->binary_type == DSO_BINARY_TYPE__BPF_IMAGE) {
return symbol__disassemble_bpf_image(sym, args);
} else if (dso__is_kcore(dso)) { } else if (dso__is_kcore(dso)) {
kce.kcore_filename = symfs_filename; kce.kcore_filename = symfs_filename;
kce.addr = map__rip_2objdump(map, sym->start); kce.addr = map__rip_2objdump(map, sym->start);
......
...@@ -6,6 +6,9 @@ ...@@ -6,6 +6,9 @@
#include <bpf/libbpf.h> #include <bpf/libbpf.h>
#include <linux/btf.h> #include <linux/btf.h>
#include <linux/err.h> #include <linux/err.h>
#include <linux/string.h>
#include <internal/lib.h>
#include <symbol/kallsyms.h>
#include "bpf-event.h" #include "bpf-event.h"
#include "debug.h" #include "debug.h"
#include "dso.h" #include "dso.h"
...@@ -290,11 +293,82 @@ static int perf_event__synthesize_one_bpf_prog(struct perf_session *session, ...@@ -290,11 +293,82 @@ static int perf_event__synthesize_one_bpf_prog(struct perf_session *session,
return err ? -1 : 0; return err ? -1 : 0;
} }
struct kallsyms_parse {
union perf_event *event;
perf_event__handler_t process;
struct machine *machine;
struct perf_tool *tool;
};
static int
process_bpf_image(char *name, u64 addr, struct kallsyms_parse *data)
{
struct machine *machine = data->machine;
union perf_event *event = data->event;
struct perf_record_ksymbol *ksymbol;
int len;
ksymbol = &event->ksymbol;
*ksymbol = (struct perf_record_ksymbol) {
.header = {
.type = PERF_RECORD_KSYMBOL,
.size = offsetof(struct perf_record_ksymbol, name),
},
.addr = addr,
.len = page_size,
.ksym_type = PERF_RECORD_KSYMBOL_TYPE_BPF,
.flags = 0,
};
len = scnprintf(ksymbol->name, KSYM_NAME_LEN, "%s", name);
ksymbol->header.size += PERF_ALIGN(len + 1, sizeof(u64));
memset((void *) event + event->header.size, 0, machine->id_hdr_size);
event->header.size += machine->id_hdr_size;
return perf_tool__process_synth_event(data->tool, event, machine,
data->process);
}
static int
kallsyms_process_symbol(void *data, const char *_name,
char type __maybe_unused, u64 start)
{
char disp[KSYM_NAME_LEN];
char *module, *name;
unsigned long id;
int err = 0;
module = strchr(_name, '\t');
if (!module)
return 0;
/* We are going after [bpf] module ... */
if (strcmp(module + 1, "[bpf]"))
return 0;
name = memdup(_name, (module - _name) + 1);
if (!name)
return -ENOMEM;
name[module - _name] = 0;
/* .. and only for trampolines and dispatchers */
if ((sscanf(name, "bpf_trampoline_%lu", &id) == 1) ||
(sscanf(name, "bpf_dispatcher_%s", disp) == 1))
err = process_bpf_image(name, start, data);
free(name);
return err;
}
int perf_event__synthesize_bpf_events(struct perf_session *session, int perf_event__synthesize_bpf_events(struct perf_session *session,
perf_event__handler_t process, perf_event__handler_t process,
struct machine *machine, struct machine *machine,
struct record_opts *opts) struct record_opts *opts)
{ {
const char *kallsyms_filename = "/proc/kallsyms";
struct kallsyms_parse arg;
union perf_event *event; union perf_event *event;
__u32 id = 0; __u32 id = 0;
int err; int err;
...@@ -303,6 +377,8 @@ int perf_event__synthesize_bpf_events(struct perf_session *session, ...@@ -303,6 +377,8 @@ int perf_event__synthesize_bpf_events(struct perf_session *session,
event = malloc(sizeof(event->bpf) + KSYM_NAME_LEN + machine->id_hdr_size); event = malloc(sizeof(event->bpf) + KSYM_NAME_LEN + machine->id_hdr_size);
if (!event) if (!event)
return -1; return -1;
/* Synthesize all the bpf programs in system. */
while (true) { while (true) {
err = bpf_prog_get_next_id(id, &id); err = bpf_prog_get_next_id(id, &id);
if (err) { if (err) {
...@@ -335,6 +411,23 @@ int perf_event__synthesize_bpf_events(struct perf_session *session, ...@@ -335,6 +411,23 @@ int perf_event__synthesize_bpf_events(struct perf_session *session,
break; break;
} }
} }
/* Synthesize all the bpf images - trampolines/dispatchers. */
if (symbol_conf.kallsyms_name != NULL)
kallsyms_filename = symbol_conf.kallsyms_name;
arg = (struct kallsyms_parse) {
.event = event,
.process = process,
.machine = machine,
.tool = session->tool,
};
if (kallsyms__parse(kallsyms_filename, &arg, kallsyms_process_symbol)) {
pr_err("%s: failed to synthesize bpf images: %s\n",
__func__, strerror(errno));
}
free(event); free(event);
return err; return err;
} }
......
...@@ -191,6 +191,7 @@ int dso__read_binary_type_filename(const struct dso *dso, ...@@ -191,6 +191,7 @@ int dso__read_binary_type_filename(const struct dso *dso,
case DSO_BINARY_TYPE__GUEST_KALLSYMS: case DSO_BINARY_TYPE__GUEST_KALLSYMS:
case DSO_BINARY_TYPE__JAVA_JIT: case DSO_BINARY_TYPE__JAVA_JIT:
case DSO_BINARY_TYPE__BPF_PROG_INFO: case DSO_BINARY_TYPE__BPF_PROG_INFO:
case DSO_BINARY_TYPE__BPF_IMAGE:
case DSO_BINARY_TYPE__NOT_FOUND: case DSO_BINARY_TYPE__NOT_FOUND:
ret = -1; ret = -1;
break; break;
......
...@@ -40,6 +40,7 @@ enum dso_binary_type { ...@@ -40,6 +40,7 @@ enum dso_binary_type {
DSO_BINARY_TYPE__GUEST_KCORE, DSO_BINARY_TYPE__GUEST_KCORE,
DSO_BINARY_TYPE__OPENEMBEDDED_DEBUGINFO, DSO_BINARY_TYPE__OPENEMBEDDED_DEBUGINFO,
DSO_BINARY_TYPE__BPF_PROG_INFO, DSO_BINARY_TYPE__BPF_PROG_INFO,
DSO_BINARY_TYPE__BPF_IMAGE,
DSO_BINARY_TYPE__NOT_FOUND, DSO_BINARY_TYPE__NOT_FOUND,
}; };
......
...@@ -736,6 +736,12 @@ int machine__process_switch_event(struct machine *machine __maybe_unused, ...@@ -736,6 +736,12 @@ int machine__process_switch_event(struct machine *machine __maybe_unused,
return 0; return 0;
} }
static int is_bpf_image(const char *name)
{
	/* true when name starts with either bpf image prefix */
	return strncmp(name, "bpf_trampoline_", sizeof("bpf_trampoline_") - 1) == 0 ||
	       strncmp(name, "bpf_dispatcher_", sizeof("bpf_dispatcher_") - 1) == 0;
}
static int machine__process_ksymbol_register(struct machine *machine, static int machine__process_ksymbol_register(struct machine *machine,
union perf_event *event, union perf_event *event,
struct perf_sample *sample __maybe_unused) struct perf_sample *sample __maybe_unused)
...@@ -759,6 +765,12 @@ static int machine__process_ksymbol_register(struct machine *machine, ...@@ -759,6 +765,12 @@ static int machine__process_ksymbol_register(struct machine *machine,
map->start = event->ksymbol.addr; map->start = event->ksymbol.addr;
map->end = map->start + event->ksymbol.len; map->end = map->start + event->ksymbol.len;
maps__insert(&machine->kmaps, map); maps__insert(&machine->kmaps, map);
dso__set_loaded(dso);
if (is_bpf_image(event->ksymbol.name)) {
dso->binary_type = DSO_BINARY_TYPE__BPF_IMAGE;
dso__set_long_name(dso, "", false);
}
} }
sym = symbol__new(map->map_ip(map, map->start), sym = symbol__new(map->map_ip(map, map->start),
......
...@@ -1544,6 +1544,7 @@ static bool dso__is_compatible_symtab_type(struct dso *dso, bool kmod, ...@@ -1544,6 +1544,7 @@ static bool dso__is_compatible_symtab_type(struct dso *dso, bool kmod,
return true; return true;
case DSO_BINARY_TYPE__BPF_PROG_INFO: case DSO_BINARY_TYPE__BPF_PROG_INFO:
case DSO_BINARY_TYPE__BPF_IMAGE:
case DSO_BINARY_TYPE__NOT_FOUND: case DSO_BINARY_TYPE__NOT_FOUND:
default: default:
return false; return false;
......
...@@ -38,3 +38,4 @@ test_cpp ...@@ -38,3 +38,4 @@ test_cpp
/bpf_gcc /bpf_gcc
/tools /tools
/runqslower /runqslower
/bench
...@@ -77,7 +77,7 @@ TEST_PROGS_EXTENDED := with_addr.sh \ ...@@ -77,7 +77,7 @@ TEST_PROGS_EXTENDED := with_addr.sh \
# Compile but not part of 'make run_tests' # Compile but not part of 'make run_tests'
TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \ TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \
flow_dissector_load test_flow_dissector test_tcp_check_syncookie_user \ flow_dissector_load test_flow_dissector test_tcp_check_syncookie_user \
test_lirc_mode2_user xdping test_cpp runqslower test_lirc_mode2_user xdping test_cpp runqslower bench
TEST_CUSTOM_PROGS = urandom_read TEST_CUSTOM_PROGS = urandom_read
...@@ -265,6 +265,7 @@ TRUNNER_BPF_OBJS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.o, $$(TRUNNER_BPF_SRCS) ...@@ -265,6 +265,7 @@ TRUNNER_BPF_OBJS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.o, $$(TRUNNER_BPF_SRCS)
TRUNNER_BPF_SKELS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.skel.h, \ TRUNNER_BPF_SKELS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.skel.h, \
$$(filter-out $(SKEL_BLACKLIST), \ $$(filter-out $(SKEL_BLACKLIST), \
$$(TRUNNER_BPF_SRCS))) $$(TRUNNER_BPF_SRCS)))
TEST_GEN_FILES += $$(TRUNNER_BPF_OBJS)
# Evaluate rules now with extra TRUNNER_XXX variables above already defined # Evaluate rules now with extra TRUNNER_XXX variables above already defined
$$(eval $$(call DEFINE_TEST_RUNNER_RULES,$1,$2)) $$(eval $$(call DEFINE_TEST_RUNNER_RULES,$1,$2))
...@@ -354,6 +355,7 @@ endef ...@@ -354,6 +355,7 @@ endef
TRUNNER_TESTS_DIR := prog_tests TRUNNER_TESTS_DIR := prog_tests
TRUNNER_BPF_PROGS_DIR := progs TRUNNER_BPF_PROGS_DIR := progs
TRUNNER_EXTRA_SOURCES := test_progs.c cgroup_helpers.c trace_helpers.c \ TRUNNER_EXTRA_SOURCES := test_progs.c cgroup_helpers.c trace_helpers.c \
network_helpers.c testing_helpers.c \
flow_dissector_load.h flow_dissector_load.h
TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read \ TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read \
$(wildcard progs/btf_dump_test_case_*.c) $(wildcard progs/btf_dump_test_case_*.c)
...@@ -405,6 +407,21 @@ $(OUTPUT)/test_cpp: test_cpp.cpp $(OUTPUT)/test_core_extern.skel.h $(BPFOBJ) ...@@ -405,6 +407,21 @@ $(OUTPUT)/test_cpp: test_cpp.cpp $(OUTPUT)/test_core_extern.skel.h $(BPFOBJ)
$(call msg,CXX,,$@) $(call msg,CXX,,$@)
$(CXX) $(CFLAGS) $^ $(LDLIBS) -o $@ $(CXX) $(CFLAGS) $^ $(LDLIBS) -o $@
# Benchmark runner
$(OUTPUT)/bench_%.o: benchs/bench_%.c bench.h
$(call msg,CC,,$@)
$(CC) $(CFLAGS) -c $(filter %.c,$^) $(LDLIBS) -o $@
$(OUTPUT)/bench_rename.o: $(OUTPUT)/test_overhead.skel.h
$(OUTPUT)/bench_trigger.o: $(OUTPUT)/trigger_bench.skel.h
$(OUTPUT)/bench.o: bench.h testing_helpers.h
$(OUTPUT)/bench: LDLIBS += -lm
$(OUTPUT)/bench: $(OUTPUT)/bench.o $(OUTPUT)/testing_helpers.o \
$(OUTPUT)/bench_count.o \
$(OUTPUT)/bench_rename.o \
$(OUTPUT)/bench_trigger.o
$(call msg,BINARY,,$@)
$(CC) $(LDFLAGS) -o $@ $(filter %.a %.o,$^) $(LDLIBS)
EXTRA_CLEAN := $(TEST_CUSTOM_PROGS) $(SCRATCH_DIR) \ EXTRA_CLEAN := $(TEST_CUSTOM_PROGS) $(SCRATCH_DIR) \
prog_tests/tests.h map_tests/tests.h verifier/tests.h \ prog_tests/tests.h map_tests/tests.h verifier/tests.h \
feature \ feature \
......
==================
BPF Selftest Notes
==================
Additional information about selftest failures is
documented here.
bpf_iter test failures with clang/llvm 10.0.0
=============================================
With clang/llvm 10.0.0, the following two bpf_iter tests failed:
* ``bpf_iter/ipv6_route``
* ``bpf_iter/netlink``
The symptom for ``bpf_iter/ipv6_route`` looks like
.. code-block:: c
2: (79) r8 = *(u64 *)(r1 +8)
...
14: (bf) r2 = r8
15: (0f) r2 += r1
; BPF_SEQ_PRINTF(seq, "%pi6 %02x ", &rt->fib6_dst.addr, rt->fib6_dst.plen);
16: (7b) *(u64 *)(r8 +64) = r2
only read is supported
The symptom for ``bpf_iter/netlink`` looks like
.. code-block:: c
; struct netlink_sock *nlk = ctx->sk;
2: (79) r7 = *(u64 *)(r1 +8)
...
15: (bf) r2 = r7
16: (0f) r2 += r1
; BPF_SEQ_PRINTF(seq, "%pK %-3d ", s, s->sk_protocol);
17: (7b) *(u64 *)(r7 +0) = r2
only read is supported
This is due to an llvm BPF backend bug. The fix,
https://reviews.llvm.org/D78466,
has been pushed to the llvm 10.x release branch and will be
available in 10.0.1. The fix is already present in llvm 11.0.0 trunk.
This diff is collapsed.
/* SPDX-License-Identifier: GPL-2.0 */
#pragma once
#include <stdlib.h>
#include <stdbool.h>
#include <linux/err.h>
#include <errno.h>
#include <unistd.h>
#include <bpf/bpf.h>
#include <bpf/libbpf.h>
#include <math.h>
#include <time.h>
#include <sys/syscall.h>
struct cpu_set {
bool *cpus;
int cpus_len;
int next_cpu;
};
struct env {
char *bench_name;
int duration_sec;
int warmup_sec;
bool verbose;
bool list;
bool affinity;
int consumer_cnt;
int producer_cnt;
struct cpu_set prod_cpus;
struct cpu_set cons_cpus;
};
struct bench_res {
long hits;
long drops;
};
struct bench {
const char *name;
void (*validate)();
void (*setup)();
void *(*producer_thread)(void *ctx);
void *(*consumer_thread)(void *ctx);
void (*measure)(struct bench_res* res);
void (*report_progress)(int iter, struct bench_res* res, long delta_ns);
void (*report_final)(struct bench_res res[], int res_cnt);
};
struct counter {
long value;
} __attribute__((aligned(128)));
extern struct env env;
extern const struct bench *bench;
void setup_libbpf();
void hits_drops_report_progress(int iter, struct bench_res *res, long delta_ns);
void hits_drops_report_final(struct bench_res res[], int res_cnt);
static inline __u64 get_time_ns() {
struct timespec t;
clock_gettime(CLOCK_MONOTONIC, &t);
	return (__u64)t.tv_sec * 1000000000 + t.tv_nsec;
}
static inline void atomic_inc(long *value)
{
(void)__atomic_add_fetch(value, 1, __ATOMIC_RELAXED);
}
static inline void atomic_add(long *value, long n)
{
(void)__atomic_add_fetch(value, n, __ATOMIC_RELAXED);
}
static inline long atomic_swap(long *value, long n)
{
return __atomic_exchange_n(value, n, __ATOMIC_RELAXED);
}
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2020 Facebook */
#include "bench.h"
/* COUNT-GLOBAL benchmark */
static struct count_global_ctx {
struct counter hits;
} count_global_ctx;
static void *count_global_producer(void *input)
{
struct count_global_ctx *ctx = &count_global_ctx;
while (true) {
atomic_inc(&ctx->hits.value);
}
return NULL;
}
static void *count_global_consumer(void *input)
{
return NULL;
}
static void count_global_measure(struct bench_res *res)
{
struct count_global_ctx *ctx = &count_global_ctx;
res->hits = atomic_swap(&ctx->hits.value, 0);
}
/* COUNT-local benchmark */
static struct count_local_ctx {
struct counter *hits;
} count_local_ctx;
static void count_local_setup()
{
struct count_local_ctx *ctx = &count_local_ctx;
	/* one counter per producer; each producer indexes this array by id */
	ctx->hits = calloc(env.producer_cnt, sizeof(*ctx->hits));
if (!ctx->hits)
exit(1);
}
static void *count_local_producer(void *input)
{
struct count_local_ctx *ctx = &count_local_ctx;
int idx = (long)input;
while (true) {
atomic_inc(&ctx->hits[idx].value);
}
return NULL;
}
static void *count_local_consumer(void *input)
{
return NULL;
}
static void count_local_measure(struct bench_res *res)
{
struct count_local_ctx *ctx = &count_local_ctx;
int i;
for (i = 0; i < env.producer_cnt; i++) {
res->hits += atomic_swap(&ctx->hits[i].value, 0);
}
}
const struct bench bench_count_global = {
.name = "count-global",
.producer_thread = count_global_producer,
.consumer_thread = count_global_consumer,
.measure = count_global_measure,
.report_progress = hits_drops_report_progress,
.report_final = hits_drops_report_final,
};
const struct bench bench_count_local = {
.name = "count-local",
.setup = count_local_setup,
.producer_thread = count_local_producer,
.consumer_thread = count_local_consumer,
.measure = count_local_measure,
.report_progress = hits_drops_report_progress,
.report_final = hits_drops_report_final,
};
This diff is collapsed.
This diff is collapsed.
#!/bin/bash
set -eufo pipefail
for i in base kprobe kretprobe rawtp fentry fexit fmodret
do
summary=$(sudo ./bench -w2 -d5 -a rename-$i | tail -n1 | cut -d'(' -f1 | cut -d' ' -f3-)
printf "%-10s: %s\n" $i "$summary"
done
#!/bin/bash
set -eufo pipefail
for i in base tp rawtp kprobe fentry fmodret
do
summary=$(sudo ./bench -w2 -d5 -a trig-$i | tail -n1 | cut -d'(' -f1 | cut -d' ' -f3-)
printf "%-10s: %s\n" $i "$summary"
done
This diff is collapsed.
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __NETWORK_HELPERS_H
#define __NETWORK_HELPERS_H
#include <sys/socket.h>
#include <sys/types.h>
#include <linux/types.h>
typedef __u16 __sum16;
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <netinet/tcp.h>
#include <bpf/bpf_endian.h>
#define MAGIC_VAL 0x1234
#define NUM_ITER 100000
#define VIP_NUM 5
#define MAGIC_BYTES 123
/* ipv4 test vector */
struct ipv4_packet {
struct ethhdr eth;
struct iphdr iph;
struct tcphdr tcp;
} __packed;
extern struct ipv4_packet pkt_v4;
/* ipv6 test vector */
struct ipv6_packet {
struct ethhdr eth;
struct ipv6hdr iph;
struct tcphdr tcp;
} __packed;
extern struct ipv6_packet pkt_v6;
int start_server(int family, int type);
int connect_to_fd(int family, int type, int server_fd);
int connect_fd_to_fd(int client_fd, int server_fd);
int connect_wait(int client_fd);
#endif
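A typical selftest pairs these helpers roughly as follows (sketch; error
checks elided):

	int server_fd, client_fd;

	server_fd = start_server(AF_INET, SOCK_STREAM);
	client_fd = connect_to_fd(AF_INET, SOCK_STREAM, server_fd);
	/* ... exercise the BPF program under test ... */
	close(client_fd);
	close(server_fd);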
This diff is collapsed.
// SPDX-License-Identifier: GPL-2.0 // SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2019 Facebook */ /* Copyright (c) 2019 Facebook */
#include <test_progs.h> #include <test_progs.h>
#include <network_helpers.h>
static void test_fexit_bpf2bpf_common(const char *obj_file, static void test_fexit_bpf2bpf_common(const char *obj_file,
const char *target_obj_file, const char *target_obj_file,
......
// SPDX-License-Identifier: GPL-2.0 // SPDX-License-Identifier: GPL-2.0
#include <test_progs.h> #include <test_progs.h>
#include <network_helpers.h>
#include <error.h> #include <error.h>
#include <linux/if.h> #include <linux/if.h>
#include <linux/if_tun.h> #include <linux/if_tun.h>
......
These 14 file diffs are collapsed.