Skip to content

Commit

Permalink
Merge branch 'bpf-fix-wrong-copied_seq-calculation-and-add-tests'
Browse files Browse the repository at this point in the history
Jiayuan Chen says:

====================
A previous commit described in this topic
http://lore.kernel.org/bpf/[email protected]
directly updated 'sk->copied_seq' in the tcp_eat_skb() function when the
action of a BPF program was SK_REDIRECT. For other actions, like SK_PASS,
the update logic for 'sk->copied_seq' was moved to
tcp_bpf_recvmsg_parser() to ensure the accuracy of the 'fionread' feature.

That commit works for a single stream_verdict scenario, as it also
modified 'sk_data_ready->sk_psock_verdict_data_ready->tcp_read_skb'
to remove updating 'sk->copied_seq'.

However, for programs where both stream_parser and stream_verdict are
active (strparser purpose), tcp_read_sock() was used instead of
tcp_read_skb() (sk_data_ready->strp_data_ready->tcp_read_sock).
tcp_read_sock() now still updates 'sk->copied_seq', leading to duplicated
updates.

In summary, for strparser + SK_PASS, copied_seq is redundantly calculated
in both tcp_read_sock() and tcp_bpf_recvmsg_parser().

The issue causes incorrect copied_seq calculations, which prevent
correct data reads from the recv() interface in user-land.

Also we added test cases for bpf + strparser and separated them from
sockmap_basic, as strparser has more encapsulation and parsing
capabilities compared to sockmap.
====================

Link: https://patch.msgid.link/[email protected]
Signed-off-by: Martin KaFai Lau <[email protected]>
  • Loading branch information
Martin KaFai Lau committed Jan 29, 2025
2 parents bc27c52 + 6fcfe96 commit 9bf412d
Show file tree
Hide file tree
Showing 12 changed files with 610 additions and 65 deletions.
9 changes: 8 additions & 1 deletion Documentation/networking/strparser.rst
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ Functions
Callbacks
=========

There are six callbacks:
There are seven callbacks:

::

Expand Down Expand Up @@ -182,6 +182,13 @@ There are six callbacks:
the length of the message. skb->len - offset may be greater
then full_len since strparser does not trim the skb.

::

int (*read_sock)(struct strparser *strp, read_descriptor_t *desc,
sk_read_actor_t recv_actor);
The read_sock callback is used by strparser instead of
sock->ops->read_sock, if provided.
::

int (*read_sock_done)(struct strparser *strp, int err);
Expand Down
2 changes: 2 additions & 0 deletions include/linux/skmsg.h
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,8 @@ struct sk_psock {
struct sk_psock_progs progs;
#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
struct strparser strp;
u32 copied_seq;
u32 ingress_bytes;
#endif
struct sk_buff_head ingress_skb;
struct list_head ingress_msg;
Expand Down
2 changes: 2 additions & 0 deletions include/net/strparser.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ struct strparser;
struct strp_callbacks {
int (*parse_msg)(struct strparser *strp, struct sk_buff *skb);
void (*rcv_msg)(struct strparser *strp, struct sk_buff *skb);
int (*read_sock)(struct strparser *strp, read_descriptor_t *desc,
sk_read_actor_t recv_actor);
int (*read_sock_done)(struct strparser *strp, int err);
void (*abort_parser)(struct strparser *strp, int err);
void (*lock)(struct strparser *strp);
Expand Down
8 changes: 8 additions & 0 deletions include/net/tcp.h
Original file line number Diff line number Diff line change
Expand Up @@ -729,6 +729,9 @@ void tcp_get_info(struct sock *, struct tcp_info *);
/* Read 'sendfile()'-style from a TCP socket */
int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
sk_read_actor_t recv_actor);
int tcp_read_sock_noack(struct sock *sk, read_descriptor_t *desc,
sk_read_actor_t recv_actor, bool noack,
u32 *copied_seq);
int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off);
void tcp_read_done(struct sock *sk, size_t len);
Expand Down Expand Up @@ -2599,6 +2602,11 @@ struct sk_psock;
#ifdef CONFIG_BPF_SYSCALL
int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore);
void tcp_bpf_clone(const struct sock *sk, struct sock *newsk);
#ifdef CONFIG_BPF_STREAM_PARSER
struct strparser;
int tcp_bpf_strp_read_sock(struct strparser *strp, read_descriptor_t *desc,
sk_read_actor_t recv_actor);
#endif /* CONFIG_BPF_STREAM_PARSER */
#endif /* CONFIG_BPF_SYSCALL */

#ifdef CONFIG_INET
Expand Down
7 changes: 7 additions & 0 deletions net/core/skmsg.c
Original file line number Diff line number Diff line change
Expand Up @@ -549,6 +549,9 @@ static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb,
return num_sge;
}

#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
psock->ingress_bytes += len;
#endif
copied = len;
msg->sg.start = 0;
msg->sg.size = copied;
Expand Down Expand Up @@ -1144,6 +1147,10 @@ int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock)
if (!ret)
sk_psock_set_state(psock, SK_PSOCK_RX_STRP_ENABLED);

if (sk_is_tcp(sk)) {
psock->strp.cb.read_sock = tcp_bpf_strp_read_sock;
psock->copied_seq = tcp_sk(sk)->copied_seq;
}
return ret;
}

Expand Down
5 changes: 4 additions & 1 deletion net/core/sock_map.c
Original file line number Diff line number Diff line change
Expand Up @@ -303,7 +303,10 @@ static int sock_map_link(struct bpf_map *map, struct sock *sk)

write_lock_bh(&sk->sk_callback_lock);
if (stream_parser && stream_verdict && !psock->saved_data_ready) {
ret = sk_psock_init_strp(sk, psock);
if (sk_is_tcp(sk))
ret = sk_psock_init_strp(sk, psock);
else
ret = -EOPNOTSUPP;
if (ret) {
write_unlock_bh(&sk->sk_callback_lock);
sk_psock_put(sk, psock);
Expand Down
29 changes: 24 additions & 5 deletions net/ipv4/tcp.c
Original file line number Diff line number Diff line change
Expand Up @@ -1565,12 +1565,13 @@ EXPORT_SYMBOL(tcp_recv_skb);
* or for 'peeking' the socket using this routine
* (although both would be easy to implement).
*/
int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
sk_read_actor_t recv_actor)
static int __tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
sk_read_actor_t recv_actor, bool noack,
u32 *copied_seq)
{
struct sk_buff *skb;
struct tcp_sock *tp = tcp_sk(sk);
u32 seq = tp->copied_seq;
u32 seq = *copied_seq;
u32 offset;
int copied = 0;

Expand Down Expand Up @@ -1624,9 +1625,12 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
tcp_eat_recv_skb(sk, skb);
if (!desc->count)
break;
WRITE_ONCE(tp->copied_seq, seq);
WRITE_ONCE(*copied_seq, seq);
}
WRITE_ONCE(tp->copied_seq, seq);
WRITE_ONCE(*copied_seq, seq);

if (noack)
goto out;

tcp_rcv_space_adjust(sk);

Expand All @@ -1635,10 +1639,25 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
tcp_recv_skb(sk, seq, &offset);
tcp_cleanup_rbuf(sk, copied);
}
out:
return copied;
}

int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
sk_read_actor_t recv_actor)
{
return __tcp_read_sock(sk, desc, recv_actor, false,
&tcp_sk(sk)->copied_seq);
}
EXPORT_SYMBOL(tcp_read_sock);

int tcp_read_sock_noack(struct sock *sk, read_descriptor_t *desc,
sk_read_actor_t recv_actor, bool noack,
u32 *copied_seq)
{
return __tcp_read_sock(sk, desc, recv_actor, noack, copied_seq);
}

int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
{
struct sk_buff *skb;
Expand Down
36 changes: 36 additions & 0 deletions net/ipv4/tcp_bpf.c
Original file line number Diff line number Diff line change
Expand Up @@ -646,6 +646,42 @@ static int tcp_bpf_assert_proto_ops(struct proto *ops)
ops->sendmsg == tcp_sendmsg ? 0 : -ENOTSUPP;
}

#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
int tcp_bpf_strp_read_sock(struct strparser *strp, read_descriptor_t *desc,
sk_read_actor_t recv_actor)
{
struct sock *sk = strp->sk;
struct sk_psock *psock;
struct tcp_sock *tp;
int copied = 0;

tp = tcp_sk(sk);
rcu_read_lock();
psock = sk_psock(sk);
if (WARN_ON_ONCE(!psock)) {
desc->error = -EINVAL;
goto out;
}

psock->ingress_bytes = 0;
copied = tcp_read_sock_noack(sk, desc, recv_actor, true,
&psock->copied_seq);
if (copied < 0)
goto out;
/* recv_actor may redirect skb to another socket (SK_REDIRECT) or
* just put skb into ingress queue of current socket (SK_PASS).
* For SK_REDIRECT, we need to ack the frame immediately but for
* SK_PASS, we want to delay the ack until tcp_bpf_recvmsg_parser().
*/
tp->copied_seq = psock->copied_seq - psock->ingress_bytes;
tcp_rcv_space_adjust(sk);
__tcp_cleanup_rbuf(sk, copied - psock->ingress_bytes);
out:
rcu_read_unlock();
return copied;
}
#endif /* CONFIG_BPF_STREAM_PARSER */

int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore)
{
int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4;
Expand Down
11 changes: 9 additions & 2 deletions net/strparser/strparser.c
Original file line number Diff line number Diff line change
Expand Up @@ -347,15 +347,21 @@ static int strp_read_sock(struct strparser *strp)
struct socket *sock = strp->sk->sk_socket;
read_descriptor_t desc;

if (unlikely(!sock || !sock->ops || !sock->ops->read_sock))
if (unlikely(!sock || !sock->ops))
return -EBUSY;

if (unlikely(!strp->cb.read_sock && !sock->ops->read_sock))
return -EBUSY;

desc.arg.data = strp;
desc.error = 0;
desc.count = 1; /* give more than one skb per call */

/* sk should be locked here, so okay to do read_sock */
sock->ops->read_sock(strp->sk, &desc, strp_recv);
if (strp->cb.read_sock)
strp->cb.read_sock(strp, &desc, strp_recv);
else
sock->ops->read_sock(strp->sk, &desc, strp_recv);

desc.error = strp->cb.read_sock_done(strp, desc.error);

Expand Down Expand Up @@ -468,6 +474,7 @@ int strp_init(struct strparser *strp, struct sock *sk,
strp->cb.unlock = cb->unlock ? : strp_sock_unlock;
strp->cb.rcv_msg = cb->rcv_msg;
strp->cb.parse_msg = cb->parse_msg;
strp->cb.read_sock = cb->read_sock;
strp->cb.read_sock_done = cb->read_sock_done ? : default_read_sock_done;
strp->cb.abort_parser = cb->abort_parser ? : strp_abort_strp;

Expand Down
59 changes: 3 additions & 56 deletions tools/testing/selftests/bpf/prog_tests/sockmap_basic.c
Original file line number Diff line number Diff line change
Expand Up @@ -522,66 +522,15 @@ static void test_sockmap_skb_verdict_shutdown(void)
if (!ASSERT_EQ(err, 1, "epoll_wait(fd)"))
goto out_close;

n = recv(c1, &b, 1, SOCK_NONBLOCK);
ASSERT_EQ(n, 0, "recv_timeout(fin)");
n = recv(c1, &b, 1, MSG_DONTWAIT);
ASSERT_EQ(n, 0, "recv(fin)");
out_close:
close(c1);
close(p1);
out:
test_sockmap_pass_prog__destroy(skel);
}

static void test_sockmap_stream_pass(void)
{
int zero = 0, sent, recvd;
int verdict, parser;
int err, map;
int c = -1, p = -1;
struct test_sockmap_pass_prog *pass = NULL;
char snd[256] = "0123456789";
char rcv[256] = "0";

pass = test_sockmap_pass_prog__open_and_load();
verdict = bpf_program__fd(pass->progs.prog_skb_verdict);
parser = bpf_program__fd(pass->progs.prog_skb_parser);
map = bpf_map__fd(pass->maps.sock_map_rx);

err = bpf_prog_attach(parser, map, BPF_SK_SKB_STREAM_PARSER, 0);
if (!ASSERT_OK(err, "bpf_prog_attach stream parser"))
goto out;

err = bpf_prog_attach(verdict, map, BPF_SK_SKB_STREAM_VERDICT, 0);
if (!ASSERT_OK(err, "bpf_prog_attach stream verdict"))
goto out;

err = create_pair(AF_INET, SOCK_STREAM, &c, &p);
if (err)
goto out;

/* sk_data_ready of 'p' will be replaced by strparser handler */
err = bpf_map_update_elem(map, &zero, &p, BPF_NOEXIST);
if (!ASSERT_OK(err, "bpf_map_update_elem(p)"))
goto out_close;

/*
* as 'prog_skb_parser' return the original skb len and
* 'prog_skb_verdict' return SK_PASS, the kernel will just
* pass it through to original socket 'p'
*/
sent = xsend(c, snd, sizeof(snd), 0);
ASSERT_EQ(sent, sizeof(snd), "xsend(c)");

recvd = recv_timeout(p, rcv, sizeof(rcv), SOCK_NONBLOCK,
IO_TIMEOUT_SEC);
ASSERT_EQ(recvd, sizeof(rcv), "recv_timeout(p)");

out_close:
close(c);
close(p);

out:
test_sockmap_pass_prog__destroy(pass);
}

static void test_sockmap_skb_verdict_fionread(bool pass_prog)
{
Expand Down Expand Up @@ -628,7 +577,7 @@ static void test_sockmap_skb_verdict_fionread(bool pass_prog)
ASSERT_EQ(avail, expected, "ioctl(FIONREAD)");
/* On DROP test there will be no data to read */
if (pass_prog) {
recvd = recv_timeout(c1, &buf, sizeof(buf), SOCK_NONBLOCK, IO_TIMEOUT_SEC);
recvd = recv_timeout(c1, &buf, sizeof(buf), MSG_DONTWAIT, IO_TIMEOUT_SEC);
ASSERT_EQ(recvd, sizeof(buf), "recv_timeout(c0)");
}

Expand Down Expand Up @@ -1101,8 +1050,6 @@ void test_sockmap_basic(void)
test_sockmap_progs_query(BPF_SK_SKB_VERDICT);
if (test__start_subtest("sockmap skb_verdict shutdown"))
test_sockmap_skb_verdict_shutdown();
if (test__start_subtest("sockmap stream parser and verdict pass"))
test_sockmap_stream_pass();
if (test__start_subtest("sockmap skb_verdict fionread"))
test_sockmap_skb_verdict_fionread(true);
if (test__start_subtest("sockmap skb_verdict fionread on drop"))
Expand Down
Loading

0 comments on commit 9bf412d

Please sign in to comment.