From 8f20fd98db14404ea7c396ca91c5b29f3b20769f Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Mon, 30 Jun 2014 14:57:42 -0700 Subject: [PATCH] netlink-socket: Work around upstream kernel Netlink bug. The upstream kernel net/netlink/af_netlink.c netlink_recvmsg() contains the following code to refill the Netlink socket buffer with more dump skbs while a dump is in progress: if (nlk->cb && atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2) { ret = netlink_dump(sk); if (ret) { sk->sk_err = ret; sk->sk_error_report(sk); } } The netlink_dump() function that this calls returns a negative number on error, the convention used throughout the kernel, and thus sk->sk_err receives a negative value on error. However, sk->sk_err is supposed to contain either 0 or a positive errno value, as one can see from a quick "grep" through net for 'sk_err =', e.g.: ipv4/tcp.c:2067: sk->sk_err = ECONNRESET; ipv4/tcp.c:2069: sk->sk_err = ECONNRESET; ipv4/tcp_input.c:4106: sk->sk_err = ECONNREFUSED; ipv4/tcp_input.c:4109: sk->sk_err = EPIPE; ipv4/tcp_input.c:4114: sk->sk_err = ECONNRESET; netlink/af_netlink.c:741: sk->sk_err = ENOBUFS; netlink/af_netlink.c:1796: sk->sk_err = ENOBUFS; packet/af_packet.c:2476: sk->sk_err = ENETDOWN; unix/af_unix.c:341: other->sk_err = ECONNRESET; unix/af_unix.c:407: skpair->sk_err = ECONNRESET; The result is that the next attempt to receive from the socket will return the error to userspace with the wrong sign. (The root of the error in this case is that multiple threads are attempting to read a single flow dump from a shared fd. That should work, but the kernel has an internal race that can result in one or more of those threads hitting the EINVAL case at the start of netlink_dump(). The EINVAL is harmless in this case and userspace should be able to ignore it, but reporting the EINVAL as if it were a 22-byte message received in userspace throws a real wrench in the works.) This bug makes me think that there are probably not many programs doing multithreaded Netlink dumps. Maybe it is good that we are considering other approaches. VMware-BZ: #1255704 Reported-by: Mihir Gangar Signed-off-by: Ben Pfaff Acked-by: Alex Wang --- AUTHORS | 1 + lib/netlink-socket.c | 21 ++++++++++++++++----- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/AUTHORS b/AUTHORS index 64ede5401..829667324 100644 --- a/AUTHORS +++ b/AUTHORS @@ -232,6 +232,7 @@ Michael A. Collins mike.a.collins@ark-net.org Michael Hu mhu@nicira.com Michael Mao mmao@nicira.com Michael Shigorin mike@osdn.org.ua +Mihir Gangar gangarm@vmware.com Mike Bursell mike.bursell@citrix.com Mike Kruze mkruze@nicira.com Min Chen ustcer.tonychan@gmail.com diff --git a/lib/netlink-socket.c b/lib/netlink-socket.c index 12d284a92..c5f380269 100644 --- a/lib/netlink-socket.c +++ b/lib/netlink-socket.c @@ -310,6 +310,7 @@ nl_sock_recv__(struct nl_sock *sock, struct ofpbuf *buf, bool wait) struct iovec iov[2]; struct msghdr msg; ssize_t retval; + int error; ovs_assert(buf->allocated >= sizeof *nlmsghdr); ofpbuf_clear(buf); @@ -323,12 +324,23 @@ nl_sock_recv__(struct nl_sock *sock, struct ofpbuf *buf, bool wait) msg.msg_iov = iov; msg.msg_iovlen = 2; + /* Receive a Netlink message from the kernel. + * + * This works around a kernel bug in which the kernel returns an error code + * as if it were the number of bytes read. It doesn't actually modify + * anything in the receive buffer in that case, so we can initialize the + * Netlink header with an impossible message length and then, upon success, + * check whether it changed. */ + nlmsghdr = ofpbuf_base(buf); do { + nlmsghdr->nlmsg_len = UINT32_MAX; retval = recvmsg(sock->fd, &msg, wait ? 0 : MSG_DONTWAIT); - } while (retval < 0 && errno == EINTR); - - if (retval < 0) { - int error = errno; + error = (retval < 0 ? errno + : retval == 0 ? ECONNRESET /* not possible? */ + : nlmsghdr->nlmsg_len != UINT32_MAX ? 0 + : -retval); + } while (error == EINTR); + if (error) { if (error == ENOBUFS) { /* Socket receive buffer overflow dropped one or more messages that * the kernel tried to send to us. */ @@ -343,7 +355,6 @@ nl_sock_recv__(struct nl_sock *sock, struct ofpbuf *buf, bool wait) return E2BIG; } - nlmsghdr = ofpbuf_data(buf); if (retval < sizeof *nlmsghdr || nlmsghdr->nlmsg_len < sizeof *nlmsghdr || nlmsghdr->nlmsg_len > retval) { -- 2.20.1