diff --git a/Documentation/networking/packet_mmap.txt b/Documentation/networking/packet_mmap.txt index daa015af16a092a8d4b7fd1df73f2bbe282251d9..f3b9e507ab05b26eb6517ac2f43992863a606ba1 100644 --- a/Documentation/networking/packet_mmap.txt +++ b/Documentation/networking/packet_mmap.txt @@ -565,7 +565,7 @@ TPACKET_V1 --> TPACKET_V2: (void *)hdr + TPACKET_ALIGN(sizeof(struct tpacket_hdr)) TPACKET_V2 --> TPACKET_V3: - - Flexible buffer implementation: + - Flexible buffer implementation for RX_RING: 1. Blocks can be configured with non-static frame-size 2. Read/poll is at a block-level (as opposed to packet-level) 3. Added poll timeout to avoid indefinite user-space wait @@ -574,7 +574,12 @@ TPACKET_V2 --> TPACKET_V3: 4.1 block::timeout 4.2 tpkt_hdr::sk_rxhash - RX Hash data available in user space - - Currently only RX_RING available + - TX_RING semantics are conceptually similar to TPACKET_V2; + use tpacket3_hdr instead of tpacket2_hdr, and TPACKET3_HDRLEN + instead of TPACKET2_HDRLEN. In the current implementation, + the tp_next_offset field in the tpacket3_hdr MUST be set to + zero, indicating that the ring does not hold variable sized frames. + Packets with non-zero values of tp_next_offset will be dropped. ------------------------------------------------------------------------------- + AF_PACKET fanout mode diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index b9e1a13b4ba36a0bc7edf6a8c2c116c7d48c970c..7e39087d519b85105077ec265cbcf4c2ef906938 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -409,6 +409,9 @@ static void __packet_set_status(struct packet_sock *po, void *frame, int status) flush_dcache_page(pgv_to_page(&h.h2->tp_status)); break; case TPACKET_V3: + h.h3->tp_status = status; + flush_dcache_page(pgv_to_page(&h.h3->tp_status)); + break; default: WARN(1, "TPACKET version not supported.\n"); BUG(); @@ -432,6 +435,8 @@ static int __packet_get_status(struct packet_sock *po, void *frame) flush_dcache_page(pgv_to_page(&h.h2->tp_status)); return h.h2->tp_status; case TPACKET_V3: + flush_dcache_page(pgv_to_page(&h.h3->tp_status)); + return h.h3->tp_status; default: WARN(1, "TPACKET version not supported.\n"); BUG(); @@ -2497,6 +2502,13 @@ static int tpacket_parse_header(struct packet_sock *po, void *frame, ph.raw = frame; switch (po->tp_version) { + case TPACKET_V3: + if (ph.h3->tp_next_offset != 0) { + pr_warn_once("variable sized slot not supported"); + return -EINVAL; + } + tp_len = ph.h3->tp_len; + break; case TPACKET_V2: tp_len = ph.h2->tp_len; break; @@ -2516,6 +2528,9 @@ static int tpacket_parse_header(struct packet_sock *po, void *frame, off_max = po->tx_ring.frame_size - tp_len; if (po->sk.sk_type == SOCK_DGRAM) { switch (po->tp_version) { + case TPACKET_V3: + off = ph.h3->tp_net; + break; case TPACKET_V2: off = ph.h2->tp_net; break; @@ -2525,6 +2540,9 @@ static int tpacket_parse_header(struct packet_sock *po, void *frame, } } else { switch (po->tp_version) { + case TPACKET_V3: + off = ph.h3->tp_mac; + break; case TPACKET_V2: off = ph.h2->tp_mac; break; @@ -4113,11 +4131,6 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, struct tpacket_req *req = &req_u->req; lock_sock(sk); - /* Opening a Tx-ring is NOT supported in TPACKET_V3 */ - if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) { - net_warn_ratelimited("Tx-ring is not supported.\n"); - goto out; - } rb = tx_ring ? &po->tx_ring : &po->rx_ring; rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue; @@ -4177,11 +4190,19 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, goto out; switch (po->tp_version) { case TPACKET_V3: - /* Transmit path is not supported. We checked - * it above but just being paranoid - */ - if (!tx_ring) + /* Block transmit is not supported yet */ + if (!tx_ring) { init_prb_bdqc(po, rb, pg_vec, req_u); + } else { + struct tpacket_req3 *req3 = &req_u->req3; + + if (req3->tp_retire_blk_tov || + req3->tp_sizeof_priv || + req3->tp_feature_req_word) { + err = -EINVAL; + goto out; + } + } break; default: break; diff --git a/tools/testing/selftests/net/psock_tpacket.c b/tools/testing/selftests/net/psock_tpacket.c index 24adf709bd9ddadfa02e6d308511340fe4eb52e1..4a1bc648b35e92667d0cf756b81a62f7a2a4b482 100644 --- a/tools/testing/selftests/net/psock_tpacket.c +++ b/tools/testing/selftests/net/psock_tpacket.c @@ -311,20 +311,33 @@ static inline void __v2_tx_user_ready(struct tpacket2_hdr *hdr) __sync_synchronize(); } -static inline int __v1_v2_tx_kernel_ready(void *base, int version) +static inline int __v3_tx_kernel_ready(struct tpacket3_hdr *hdr) +{ + return !(hdr->tp_status & (TP_STATUS_SEND_REQUEST | TP_STATUS_SENDING)); +} + +static inline void __v3_tx_user_ready(struct tpacket3_hdr *hdr) +{ + hdr->tp_status = TP_STATUS_SEND_REQUEST; + __sync_synchronize(); +} + +static inline int __tx_kernel_ready(void *base, int version) { switch (version) { case TPACKET_V1: return __v1_tx_kernel_ready(base); case TPACKET_V2: return __v2_tx_kernel_ready(base); + case TPACKET_V3: + return __v3_tx_kernel_ready(base); default: bug_on(1); return 0; } } -static inline void __v1_v2_tx_user_ready(void *base, int version) +static inline void __tx_user_ready(void *base, int version) { switch (version) { case TPACKET_V1: @@ -333,6 +346,9 @@ static inline void __v1_v2_tx_user_ready(void *base, int version) case TPACKET_V2: __v2_tx_user_ready(base); break; + case TPACKET_V3: + __v3_tx_user_ready(base); + break; } } @@ -348,7 +364,22 @@ static void __v1_v2_set_packet_loss_discard(int sock) } } -static void walk_v1_v2_tx(int sock, struct ring *ring) +static inline void *get_next_frame(struct ring *ring, int n) +{ + uint8_t *f0 = ring->rd[0].iov_base; + + switch (ring->version) { + case TPACKET_V1: + case TPACKET_V2: + return ring->rd[n].iov_base; + case TPACKET_V3: + return f0 + (n * ring->req3.tp_frame_size); + default: + bug_on(1); + } +} + +static void walk_tx(int sock, struct ring *ring) { struct pollfd pfd; int rcv_sock, ret; @@ -360,9 +391,19 @@ static void walk_v1_v2_tx(int sock, struct ring *ring) .sll_family = PF_PACKET, .sll_halen = ETH_ALEN, }; + int nframes; + + /* TPACKET_V{1,2} sets up the ring->rd* related variables based + * on frames (e.g., rd_num is tp_frame_nr) whereas V3 sets these + * up based on blocks (e.g, rd_num is tp_block_nr) + */ + if (ring->version <= TPACKET_V2) + nframes = ring->rd_num; + else + nframes = ring->req3.tp_frame_nr; bug_on(ring->type != PACKET_TX_RING); - bug_on(ring->rd_num < NUM_PACKETS); + bug_on(nframes < NUM_PACKETS); rcv_sock = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL)); if (rcv_sock == -1) { @@ -388,10 +429,11 @@ static void walk_v1_v2_tx(int sock, struct ring *ring) create_payload(packet, &packet_len); while (total_packets > 0) { - while (__v1_v2_tx_kernel_ready(ring->rd[frame_num].iov_base, - ring->version) && + void *next = get_next_frame(ring, frame_num); + + while (__tx_kernel_ready(next, ring->version) && total_packets > 0) { - ppd.raw = ring->rd[frame_num].iov_base; + ppd.raw = next; switch (ring->version) { case TPACKET_V1: @@ -413,14 +455,27 @@ static void walk_v1_v2_tx(int sock, struct ring *ring) packet_len); total_bytes += ppd.v2->tp_h.tp_snaplen; break; + case TPACKET_V3: { + struct tpacket3_hdr *tx = next; + + tx->tp_snaplen = packet_len; + tx->tp_len = packet_len; + tx->tp_next_offset = 0; + + memcpy((uint8_t *)tx + TPACKET3_HDRLEN - + sizeof(struct sockaddr_ll), packet, + packet_len); + total_bytes += tx->tp_snaplen; + break; + } } status_bar_update(); total_packets--; - __v1_v2_tx_user_ready(ppd.raw, ring->version); + __tx_user_ready(next, ring->version); - frame_num = (frame_num + 1) % ring->rd_num; + frame_num = (frame_num + 1) % nframes; } poll(&pfd, 1, 1); @@ -460,7 +515,7 @@ static void walk_v1_v2(int sock, struct ring *ring) if (ring->type == PACKET_RX_RING) walk_v1_v2_rx(sock, ring); else - walk_v1_v2_tx(sock, ring); + walk_tx(sock, ring); } static uint64_t __v3_prev_block_seq_num = 0; @@ -583,7 +638,7 @@ static void walk_v3(int sock, struct ring *ring) if (ring->type == PACKET_RX_RING) walk_v3_rx(sock, ring); else - bug_on(1); + walk_tx(sock, ring); } static void __v1_v2_fill(struct ring *ring, unsigned int blocks) @@ -602,12 +657,13 @@ static void __v1_v2_fill(struct ring *ring, unsigned int blocks) ring->flen = ring->req.tp_frame_size; } -static void __v3_fill(struct ring *ring, unsigned int blocks) +static void __v3_fill(struct ring *ring, unsigned int blocks, int type) { - ring->req3.tp_retire_blk_tov = 64; - ring->req3.tp_sizeof_priv = 0; - ring->req3.tp_feature_req_word = TP_FT_REQ_FILL_RXHASH; - + if (type == PACKET_RX_RING) { + ring->req3.tp_retire_blk_tov = 64; + ring->req3.tp_sizeof_priv = 0; + ring->req3.tp_feature_req_word = TP_FT_REQ_FILL_RXHASH; + } ring->req3.tp_block_size = getpagesize() << 2; ring->req3.tp_frame_size = TPACKET_ALIGNMENT << 7; ring->req3.tp_block_nr = blocks; @@ -641,7 +697,7 @@ static void setup_ring(int sock, struct ring *ring, int version, int type) break; case TPACKET_V3: - __v3_fill(ring, blocks); + __v3_fill(ring, blocks, type); ret = setsockopt(sock, SOL_PACKET, type, &ring->req3, sizeof(ring->req3)); break; @@ -796,6 +852,7 @@ int main(void) ret |= test_tpacket(TPACKET_V2, PACKET_TX_RING); ret |= test_tpacket(TPACKET_V3, PACKET_RX_RING); + ret |= test_tpacket(TPACKET_V3, PACKET_TX_RING); if (ret) return 1;