lib/netlink-socket.h

   1 /*
   2  * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013, 2014 Nicira, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at:
   7  *
   8  *     http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 #ifndef NETLINK_SOCKET_H
  18 #define NETLINK_SOCKET_H 1
  19
  20 /* Netlink socket definitions.
  21  *
  22  * This header file defines functions for working with Netlink sockets.  Only
  23  * Linux natively supports Netlink sockets, but Netlink is well suited as a
  24  * basis for extensible low-level protocols, so it can make sense to implement
  25  * a Netlink layer on other systems.  This doesn't have to be done in exactly
  26  * the same way as on Linux, as long as the implementation can support the
  27  * semantics that are important to Open vSwitch.  See "Usage concepts" below
  28  * for more information.
  29  *
  30  * For Netlink protocol definitions, see netlink-protocol.h.  For helper
  31  * functions for working with Netlink messages, see netlink.h.
  32  *
  33  *
  34  * Usage concepts
  35  * ==============
  36  *
  37  * Netlink is a datagram-based network protocol primarily for communication
  38  * between user processes and the kernel.  Netlink is specified in RFC 3549,
  39  * "Linux Netlink as an IP Services Protocol".
  40  *
  41  * Netlink is not suitable for use in physical networks of heterogeneous
  42  * machines because host byte order is used throughout.
  43  *
  44  * The AF_NETLINK socket namespace is subdivided into statically numbered
  45  * protocols, e.g. NETLINK_ROUTE, NETLINK_NETFILTER, provided as the third
  46  * argument to the socket() function.  Maintaining the assigned numbers became
  47  * a bit of a problem, so the "Generic Netlink" NETLINK_GENERIC protocol was
  48  * introduced to map between human-readable names and dynamically assigned
  49  * numbers.  All recently introduced Netlink protocol messages in Linux
  50  * (including all of the Open vSwitch specific messages) fall under
  51  * NETLINK_GENERIC.  The Netlink library provides the nl_lookup_genl_family()
  52  * function for translating a Generic Netlink name to a number.  On Linux, this
  53  * queries the kernel Generic Netlink implementation, but on other systems it
  54  * might be easier to statically assign each of the names used by Open vSwitch
  55  * and then implement this function entirely in userspace.
  56  *
  57  * Each Netlink socket is distinguished by its Netlink PID, a 32-bit integer
  58  * that is analogous to a TCP or UDP port number.  The kernel has PID 0.
  59  *
  60  * Most Netlink messages manage a kernel table of some kind, e.g. the kernel
  61  * routing table, ARP table, etc.  Open vSwitch specific messages manage tables
  62  * of datapaths, ports within datapaths ("vports"), and flows within
  63  * datapaths.  Open vSwitch also has messages related to network packets
  64  * received on vports, which aren't really a table.
  65  *
  66  * Datagram protocols over a physical network are typically unreliable: in UDP,
  67  * for example, messages can be dropped, delivered more than once, or delivered
  68  * out of order.  In Linux, Netlink does not deliver messages out of order or
  69  * multiple times.  In some cases it can drop messages, but the kernel
  70  * indicates when a message has been dropped.  The description below of each
  71  * way Open vSwitch uses Netlink also explains how to work around dropped
  72  * messages.
  73  *
  74  * Open vSwitch uses Netlink in four characteristic ways:
  75  *
  76  *    1. Transactions.  A transaction is analogous to a system call, an ioctl,
  77  *       or an RPC: userspace sends a request to the kernel, which processes
  78  *       the request synchronously and returns a reply to userspace.
  79  *       (Sometimes there is no explicit reply, but even in that case userspace
  80  *       will receive an immediate reply if there is an error.)
  81  *
  82  *       nl_transact() is the primary interface for transactions over Netlink.
  83  *       This function doesn't take a socket as a parameter because sockets do
  84  *       not have any state related to transactions.
  85  *
  86  *       Netlink uses 16-bit "length" fields extensively, which effectively
  87  *       limits requests and replies to 64 kB.  "Dumps" (see below) are one way
  88  *       to work around this limit for replies.
  89  *
  90  *       In the Linux implementation of Netlink transactions, replies can
  91  *       sometimes be lost.  When this happens, nl_transact() automatically
  92  *       executes the transaction again.  This means that it is important that
  93  *       transactions be idempotent, or that the client be prepared to tolerate
  94  *       that a transaction might actually execute more than once.
  95  *
  96  *       The Linux implementation can execute several transactions at the same
  97  *       time more efficiently than individually.  nl_transact_multiple()
  98  *       allows for this.  The semantics are no different from executing each
  99  *       of the transactions individually with nl_transact().
 100  *
 101  *    2. Dumps.  A dump asks the kernel to provide all of the information in a
 102  *       table.  It consists of a request and a reply, where the reply consists
 103  *       of an arbitrary number of messages.  Each message in the reply is
 104  *       limited to 64 kB, as is the request, but the total size of the reply
 105  *       can be many times larger.
 106  *
 107  *       The reply to a dump is usually generated piece by piece, not
 108  *       atomically.  The reply can represent an inconsistent snapshot of the
 109  *       table.  This is especially likely if entries in the table were being
 110  *       added or deleted or changing during the dump.
 111  *
 112  *       nl_dump_start() begins a dump based on the caller-provided request and
 113  *       initializes a "struct nl_dump" to identify the dump.  Subsequent calls
 114  *       to nl_dump_next() then obtain the reply, one message at a time.
 115  *       Usually, each message gives information about some entry in a table,
 116  *       e.g. one flow in the Open vSwitch flow table, or one route in a
 117  *       routing table.  nl_dump_done() ends the dump.
 118  *
 119  *       Linux implements dumps so that messages in a reply do not get lost.
 120  *
 121  *    3. Multicast subscriptions.  Most kernel Netlink implementations allow a
 122  *       process to monitor changes to its table, by subscribing to a Netlink
 123  *       multicast group dedicated to that table.  Whenever the table's content
 124  *       changes (e.g. an entry is added or deleted or modified), the Netlink
 125  *       implementation sends a message to all sockets that subscribe to its
  126  *       multicast group notifying them of details of the change.  (This
  127  *       doesn't require much extra work by the Netlink implementer because the
  128  *       message is generally identical to the one sent as a reply to the
  129  *       request that changed the table.)
 130  *
 131  *       nl_sock_join_mcgroup() subscribes a socket to a multicast group, and
 132  *       nl_sock_recv() reads notifications.
 133  *
 134  *       If userspace doesn't read messages from a socket subscribed to a
 135  *       multicast group quickly enough, then notification messages can pile up
 136  *       in the socket's receive buffer.  If this continues long enough, the
 137  *       receive buffer will fill up and notifications will be lost.  In that
 138  *       case, nl_sock_recv() will return ENOBUFS.  The client can then use a
 139  *       dump to resynchronize with the table state.  (A simple implementation
 140  *       of multicast groups might take advantage of this by simply returning
 141  *       ENOBUFS whenever a table changes, without implementing actual
 142  *       notifications.  This would cause lots of extra dumps, so it may not be
 143  *       suitable as a production implementation.)
 144  *
 145  *    4. Unicast subscriptions (Open vSwitch specific).  Userspace can assign
 146  *       one or more Netlink PIDs to a vport as "upcall PIDs".  When a packet
 147  *       received on the vport does not match any flow in its datapath's flow
 148  *       table, the kernel hashes some of the packet's headers, uses the hash
 149  *       to select one of the PIDs, and sends the packet (encapsulated in an
 150  *       Open vSwitch Netlink message) to the socket with the selected PID.
 151  *
 152  *       nl_sock_recv() reads notifications sent this way.
 153  *
  154  *       On the Windows platform specifically, the datapath needs to allocate
  155  *       a queue for packets, and it does so only when userspace "subscribes"
  156  *       to packets on that Netlink socket.  Before closing the Netlink socket,
  157  *       userspace needs to "unsubscribe" from packets on that socket.
 158  *
 159  *       nl_sock_subscribe_packets() and nl_sock_unsubscribe_packets() are
 160  *       Windows specific.
 161  *
 162  *       Messages received this way can overflow, just like multicast
 163  *       subscription messages, and they are reported the same way.  Because
 164  *       packet notification messages do not report the state of a table, there
 165  *       is no way to recover the dropped packets; they are simply lost.
 166  *
 167  *       The main reason to support multiple PIDs per vport is to increase
 168  *       fairness, that is, to make it harder for a single high-flow-rate
 169  *       sender to drown out lower rate sources.  Multiple PIDs per vport might
 170  *       also improve packet handling latency or flow setup rate, but that is
 171  *       not the main goal.
 172  *
 173  *       Old versions of the Linux kernel module supported only one PID per
 174  *       vport, and userspace still copes with this, so a simple or early
 175  *       implementation might only support one PID per vport too.
 176  *
 177  *
 178  * Thread-safety
 179  * =============
 180  *
  181  * Most of the Netlink functions are not fully thread-safe: only a single
  182  * thread may use a given nl_sock or nl_dump at one time.  The exceptions are:
 183  *
 184  *    - nl_sock_recv() is conditionally thread-safe: it may be called from
 185  *      different threads with the same nl_sock, but each caller must provide
 186  *      an independent receive buffer.
 187  *
 188  *    - nl_dump_next() is conditionally thread-safe: it may be called from
 189  *      different threads with the same nl_dump, but each caller must provide
 190  *      independent buffers.
 191  */
 192
 193 #include <stdbool.h>
 194 #include <stddef.h>
 195 #include <stdint.h>
 196 #include "ofpbuf.h"
 197 #include "ovs-atomic.h"
 198 #include "ovs-thread.h"
 199
  200 struct nl_sock;             /* Incomplete here; used only via pointers. */
 201
  202 #ifndef HAVE_NETLINK            /* Neither HAVE_NETLINK nor _WIN32: reject. */
  203 #ifndef _WIN32
  204 #error "netlink-socket.h is only for hosts that support Netlink sockets"
  205 #endif /* !_WIN32 */
  206 #endif /* !HAVE_NETLINK */
 207
  208 /* Netlink sockets: lifecycle, multicast membership, I/O, and accessors. */
  209 int nl_sock_create(int protocol, struct nl_sock **);
  210 int nl_sock_clone(const struct nl_sock *, struct nl_sock **);
  211 void nl_sock_destroy(struct nl_sock *);
  212
  213 int nl_sock_join_mcgroup(struct nl_sock *, unsigned int multicast_group);
  214 int nl_sock_leave_mcgroup(struct nl_sock *, unsigned int multicast_group);
  215
  216 #ifdef _WIN32                   /* Packet subscription is Windows specific. */
  217 int nl_sock_subscribe_packets(struct nl_sock *sock);
  218 int nl_sock_unsubscribe_packets(struct nl_sock *sock);
  219 #endif /* _WIN32 */
  220
  221 int nl_sock_send(struct nl_sock *, const struct ofpbuf *, bool wait);
  222 int nl_sock_send_seq(struct nl_sock *, const struct ofpbuf *,
  223                      uint32_t nlmsg_seq, bool wait);
  224 int nl_sock_recv(struct nl_sock *, struct ofpbuf *, bool wait);
  225
  226 int nl_sock_drain(struct nl_sock *);
  227
  228 void nl_sock_wait(const struct nl_sock *, short int events);
  229 #ifndef _WIN32                  /* nl_sock_fd() is not declared on Windows. */
  230 int nl_sock_fd(const struct nl_sock *);
  231 #endif /* !_WIN32 */
  232
  233 uint32_t nl_sock_pid(const struct nl_sock *);
 234
  235 /* Batching transactions: one entry for nl_transact_multiple(). */
  236 struct nl_transaction {
  237     /* Filled in by client. */
  238     struct ofpbuf *request;     /* Request to send. */
  239
  240     /* The client must initialize 'reply' to one of:
  241      *
  242      *   - NULL, if it does not care to examine the reply.
  243      *
  244      *   - Otherwise, to an ofpbuf with a memory allocation of at least
  245      *     NLMSG_HDRLEN bytes.
  246      */
  247     struct ofpbuf *reply;       /* Reply (empty if reply was an error code). */
  248     int error;                  /* Positive errno value, 0 if no error. */
  249 };
 250
  251 /* Transactions without an allocated socket (see "Transactions" above). */
  252 int nl_transact(int protocol, const struct ofpbuf *request,
  253                 struct ofpbuf **replyp);
  254 void nl_transact_multiple(int protocol, struct nl_transaction **, size_t n);
 255
  256 /* Table dumping (see "Dumps" above). */
  257 #define NL_DUMP_BUFSIZE         4096    /* NOTE(review): presumably bytes. */
 258
  259 struct nl_dump {                /* Identifies a dump; see nl_dump_start(). */
  260     /* These members are immutable during the lifetime of the nl_dump. */
  261     struct nl_sock *sock;       /* Socket being dumped. */
  262     uint32_t nl_seq;            /* Expected nlmsg_seq for replies. */
  263
  264     /* 'mutex' protects 'status' and serializes access to 'sock'. */
  265     struct ovs_mutex mutex;     /* Protects 'status', synchronizes recv(). */
  266     int status OVS_GUARDED;     /* 0: dump in progress,
  267                                  * positive errno: dump completed with error,
  268                                  * EOF: dump completed successfully. */
  269 };
 270
  271 void nl_dump_start(struct nl_dump *, int protocol,     /* Begins a dump. */
  272                    const struct ofpbuf *request);
  273 bool nl_dump_next(struct nl_dump *, struct ofpbuf *reply, struct ofpbuf *buf);
  274 int nl_dump_done(struct nl_dump *);                    /* Ends the dump. */
 275
  276 /* Miscellaneous: Generic Netlink name-to-number lookups. */
  277 int nl_lookup_genl_family(const char *name, int *number);
  278 int nl_lookup_genl_mcgroup(const char *family_name, const char *group_name,
  279                            unsigned int *multicast_group);
 280
 281 #endif /* netlink-socket.h */