2 * Copyright (c) 2009, 2010 Nicira Networks.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
21 #include <arpa/inet.h>
23 #include <linux/gen_stats.h>
24 #include <linux/if_tun.h>
26 #include <linux/types.h>
27 #include <linux/ethtool.h>
28 #include <linux/pkt_sched.h>
29 #include <linux/rtnetlink.h>
30 #include <linux/sockios.h>
31 #include <linux/version.h>
32 #include <sys/types.h>
33 #include <sys/ioctl.h>
34 #include <sys/socket.h>
35 #include <netpacket/packet.h>
36 #include <net/ethernet.h>
38 #include <linux/if_tunnel.h>
39 #include <net/if_arp.h>
40 #include <net/if_packet.h>
41 #include <net/route.h>
42 #include <netinet/in.h>
49 #include "dynamic-string.h"
50 #include "fatal-signal.h"
53 #include "netdev-provider.h"
54 #include "netdev-vport.h"
56 #include "netlink-socket.h"
58 #include "openflow/openflow.h"
60 #include "poll-loop.h"
61 #include "rtnetlink.h"
62 #include "rtnetlink-link.h"
63 #include "socket-util.h"
68 VLOG_DEFINE_THIS_MODULE(netdev_linux);
70 COVERAGE_DEFINE(netdev_get_vlan_vid);
71 COVERAGE_DEFINE(netdev_set_policing);
72 COVERAGE_DEFINE(netdev_arp_lookup);
73 COVERAGE_DEFINE(netdev_get_ifindex);
74 COVERAGE_DEFINE(netdev_get_hwaddr);
75 COVERAGE_DEFINE(netdev_set_hwaddr);
76 COVERAGE_DEFINE(netdev_ethtool);
78 /* These were introduced in Linux 2.6.14, so they might be missing if we have
80 #ifndef ADVERTISED_Pause
81 #define ADVERTISED_Pause (1 << 13)
83 #ifndef ADVERTISED_Asym_Pause
84 #define ADVERTISED_Asym_Pause (1 << 14)
87 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
90 #define TC_RTAB_SIZE 1024
93 static struct rtnetlink_notifier netdev_linux_cache_notifier;
94 static int cache_notifier_refcount;
97 VALID_IFINDEX = 1 << 0,
98 VALID_ETHERADDR = 1 << 1,
102 VALID_CARRIER = 1 << 5,
103 VALID_IS_PSEUDO = 1 << 6, /* Represents is_internal and is_tap. */
104 VALID_POLICING = 1 << 7,
105 VALID_HAVE_VPORT_STATS = 1 << 8
113 /* Traffic control. */
115 /* An instance of a traffic control class. Always associated with a particular
118 * Each TC implementation subclasses this with whatever additional data it
121 const struct tc_ops *ops;
122 struct hmap queues; /* Contains "struct tc_queue"s.
123 * Read by generic TC layer.
124 * Written only by TC implementation. */
127 /* One traffic control queue.
129 * Each TC implementation subclasses this with whatever additional data it
132 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
133 unsigned int queue_id; /* OpenFlow queue ID. */
136 /* A particular kind of traffic control. Each implementation generally maps to
137 * one particular Linux qdisc class.
139 * The functions below return 0 if successful or a positive errno value on
140 * failure, except where otherwise noted. All of them must be provided, except
141 * where otherwise noted. */
143 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
144 * This is null for tc_ops_default and tc_ops_other, for which there are no
145 * appropriate values. */
146 const char *linux_name;
148 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
149 const char *ovs_name;
151 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
152 * queues. The queues are numbered 0 through n_queues - 1. */
153 unsigned int n_queues;
155 /* Called to install this TC class on 'netdev'. The implementation should
156 * make the Netlink calls required to set up 'netdev' with the right qdisc
157 * and configure it according to 'details'. The implementation may assume
158 * that the current qdisc is the default; that is, there is no need for it
159 * to delete the current qdisc before installing itself.
161 * The contents of 'details' should be documented as valid for 'ovs_name'
162 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
163 * (which is built as ovs-vswitchd.conf.db(8)).
165 * This function must return 0 if and only if it sets 'netdev->tc' to an
166 * initialized 'struct tc'.
168 * (This function is null for tc_ops_other, which cannot be installed. For
169 * other TC classes it should always be nonnull.) */
170 int (*tc_install)(struct netdev *netdev, const struct shash *details);
172 /* Called when the netdev code determines (through a Netlink query) that
173 * this TC class's qdisc is installed on 'netdev', but we didn't install
174 * it ourselves and so don't know any of the details.
176 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
177 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
178 * implementation should parse the other attributes of 'nlmsg' as
179 * necessary to determine its configuration. If necessary it should also
180 * use Netlink queries to determine the configuration of queues on
183 * This function must return 0 if and only if it sets 'netdev->tc' to an
184 * initialized 'struct tc'. */
185 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
187 /* Destroys the data structures allocated by the implementation as part of
188 * 'tc'. (This includes destroying 'tc->queues' by calling
191 * The implementation should not need to perform any Netlink calls. If
192 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
193 * (But it may not be desirable.)
195 * This function may be null if 'tc' is trivial. */
196 void (*tc_destroy)(struct tc *tc);
198 /* Retrieves details of 'netdev->tc' configuration into 'details'.
200 * The implementation should not need to perform any Netlink calls, because
201 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
202 * cached the configuration.
204 * The contents of 'details' should be documented as valid for 'ovs_name'
205 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
206 * (which is built as ovs-vswitchd.conf.db(8)).
208 * This function may be null if 'tc' is not configurable.
210 int (*qdisc_get)(const struct netdev *netdev, struct shash *details);
212 /* Reconfigures 'netdev->tc' according to 'details', performing any
213 * required Netlink calls to complete the reconfiguration.
215 * The contents of 'details' should be documented as valid for 'ovs_name'
216 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
217 * (which is built as ovs-vswitchd.conf.db(8)).
219 * This function may be null if 'tc' is not configurable.
221 int (*qdisc_set)(struct netdev *, const struct shash *details);
223 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
224 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
226 * The contents of 'details' should be documented as valid for 'ovs_name'
227 * in the "other_config" column in the "Queue" table in
228 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
230 * The implementation should not need to perform any Netlink calls, because
231 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
232 * cached the queue configuration.
234 * This function may be null if 'tc' does not have queues ('n_queues' is
236 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
237 struct shash *details);
239 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
240 * 'details', performing any required Netlink calls to complete the
241 * reconfiguration. The caller ensures that 'queue_id' is less than
244 * The contents of 'details' should be documented as valid for 'ovs_name'
245 * in the "other_config" column in the "Queue" table in
246 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
248 * This function may be null if 'tc' does not have queues or its queues are
249 * not configurable. */
250 int (*class_set)(struct netdev *, unsigned int queue_id,
251 const struct shash *details);
253 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
254 * tc_queue's within 'netdev->tc->queues'.
256 * This function may be null if 'tc' does not have queues or its queues
257 * cannot be deleted. */
258 int (*class_delete)(struct netdev *, struct tc_queue *queue);
260 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
261 * 'struct tc_queue's within 'netdev->tc->queues'.
263 * On success, initializes '*stats'.
265 * This function may be null if 'tc' does not have queues or if it cannot
266 * report queue statistics. */
267 int (*class_get_stats)(const struct netdev *netdev,
268 const struct tc_queue *queue,
269 struct netdev_queue_stats *stats);
271 /* Extracts queue stats from 'nlmsg', which is a response to a
272 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
274 * This function may be null if 'tc' does not have queues or if it cannot
275 * report queue statistics. */
276 int (*class_dump_stats)(const struct netdev *netdev,
277 const struct ofpbuf *nlmsg,
278 netdev_dump_queue_stats_cb *cb, void *aux);
282 tc_init(struct tc *tc, const struct tc_ops *ops)
285 hmap_init(&tc->queues);
289 tc_destroy(struct tc *tc)
291 hmap_destroy(&tc->queues);
294 static const struct tc_ops tc_ops_htb;
295 static const struct tc_ops tc_ops_hfsc;
296 static const struct tc_ops tc_ops_default;
297 static const struct tc_ops tc_ops_other;
299 static const struct tc_ops *tcs[] = {
300 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
301 &tc_ops_hfsc, /* Hierarchical fair service curve. */
302 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
303 &tc_ops_other, /* Some other qdisc. */
307 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
308 static unsigned int tc_get_major(unsigned int handle);
309 static unsigned int tc_get_minor(unsigned int handle);
311 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
312 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
313 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
315 static struct tcmsg *tc_make_request(const struct netdev *, int type,
316 unsigned int flags, struct ofpbuf *);
317 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
319 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
320 struct nlattr **options);
321 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
322 struct nlattr **options,
323 struct netdev_queue_stats *);
324 static int tc_query_class(const struct netdev *,
325 unsigned int handle, unsigned int parent,
326 struct ofpbuf **replyp);
327 static int tc_delete_class(const struct netdev *, unsigned int handle);
329 static int tc_del_qdisc(struct netdev *netdev);
330 static int tc_query_qdisc(const struct netdev *netdev);
332 static int tc_calc_cell_log(unsigned int mtu);
333 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
334 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
335 const struct tc_ratespec *rate);
336 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
338 struct netdev_dev_linux {
339 struct netdev_dev netdev_dev;
341 struct shash_node *shash_node;
342 unsigned int cache_valid;
344 /* The following are figured out "on demand" only. They are only valid
345 * when the corresponding VALID_* bit in 'cache_valid' is set. */
347 uint8_t etheraddr[ETH_ADDR_LEN];
348 struct in_addr address, netmask;
352 bool is_internal; /* Is this an openvswitch internal device? */
353 bool is_tap; /* Is this a tuntap device? */
354 uint32_t kbits_rate; /* Policing data. */
355 uint32_t kbits_burst;
356 bool have_vport_stats;
360 struct tap_state tap;
364 struct netdev_linux {
365 struct netdev netdev;
369 /* An AF_INET socket (used for ioctl operations). */
370 static int af_inet_sock = -1;
372 /* A Netlink routing socket that is not subscribed to any multicast groups. */
373 static struct nl_sock *rtnl_sock;
375 struct netdev_linux_notifier {
376 struct netdev_notifier notifier;
380 static struct shash netdev_linux_notifiers =
381 SHASH_INITIALIZER(&netdev_linux_notifiers);
382 static struct rtnetlink_notifier netdev_linux_poll_notifier;
384 /* This is set pretty low because we probably won't learn anything from the
385 * additional log messages. */
386 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
388 static int netdev_linux_init(void);
390 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
391 int cmd, const char *cmd_name);
392 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
393 const char *cmd_name);
394 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
395 int cmd, const char *cmd_name);
396 static int get_flags(const struct netdev *, int *flagsp);
397 static int set_flags(struct netdev *, int flags);
398 static int do_get_ifindex(const char *netdev_name);
399 static int get_ifindex(const struct netdev *, int *ifindexp);
400 static int do_set_addr(struct netdev *netdev,
401 int ioctl_nr, const char *ioctl_name,
402 struct in_addr addr);
403 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
404 static int set_etheraddr(const char *netdev_name, int hwaddr_family,
405 const uint8_t[ETH_ADDR_LEN]);
406 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
407 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
410 is_netdev_linux_class(const struct netdev_class *netdev_class)
412 return netdev_class->init == netdev_linux_init;
415 static struct netdev_dev_linux *
416 netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
418 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
419 assert(is_netdev_linux_class(netdev_class));
421 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
424 static struct netdev_linux *
425 netdev_linux_cast(const struct netdev *netdev)
427 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
428 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
429 assert(is_netdev_linux_class(netdev_class));
431 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
435 netdev_linux_init(void)
437 static int status = -1;
439 /* Create AF_INET socket. */
440 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
441 status = af_inet_sock >= 0 ? 0 : errno;
443 VLOG_ERR("failed to create inet socket: %s", strerror(status));
446 /* Create rtnetlink socket. */
448 status = nl_sock_create(NETLINK_ROUTE, 0, 0, 0, &rtnl_sock);
450 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
459 netdev_linux_run(void)
461 rtnetlink_link_notifier_run();
465 netdev_linux_wait(void)
467 rtnetlink_link_notifier_wait();
471 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
472 void *aux OVS_UNUSED)
474 struct netdev_dev_linux *dev;
476 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
478 const struct netdev_class *netdev_class =
479 netdev_dev_get_class(base_dev);
481 if (is_netdev_linux_class(netdev_class)) {
482 dev = netdev_dev_linux_cast(base_dev);
483 dev->cache_valid = 0;
487 struct shash device_shash;
488 struct shash_node *node;
490 shash_init(&device_shash);
491 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
492 SHASH_FOR_EACH (node, &device_shash) {
494 dev->cache_valid = 0;
496 shash_destroy(&device_shash);
500 /* Creates system and internal devices. */
502 netdev_linux_create(const struct netdev_class *class,
503 const char *name, const struct shash *args,
504 struct netdev_dev **netdev_devp)
506 struct netdev_dev_linux *netdev_dev;
509 if (!shash_is_empty(args)) {
510 VLOG_WARN("%s: arguments for %s devices should be empty",
514 if (!cache_notifier_refcount) {
515 error = rtnetlink_link_notifier_register(&netdev_linux_cache_notifier,
516 netdev_linux_cache_cb, NULL);
521 cache_notifier_refcount++;
523 netdev_dev = xzalloc(sizeof *netdev_dev);
524 netdev_dev_init(&netdev_dev->netdev_dev, name, class);
526 *netdev_devp = &netdev_dev->netdev_dev;
530 /* For most types of netdevs we open the device for each call of
531 * netdev_open(). However, this is not the case with tap devices,
532 * since it is only possible to open the device once. In this
533 * situation we share a single file descriptor, and consequently
534 * buffers, across all readers. Therefore once data is read it will
535 * be unavailable to other readers of the same tap device. */
537 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
538 const char *name, const struct shash *args,
539 struct netdev_dev **netdev_devp)
541 struct netdev_dev_linux *netdev_dev;
542 struct tap_state *state;
543 static const char tap_dev[] = "/dev/net/tun";
547 if (!shash_is_empty(args)) {
548 VLOG_WARN("%s: arguments for TAP devices should be empty", name);
551 netdev_dev = xzalloc(sizeof *netdev_dev);
552 state = &netdev_dev->state.tap;
554 /* Open tap device. */
555 state->fd = open(tap_dev, O_RDWR);
558 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
562 /* Create tap device. */
563 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
564 strncpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
565 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
566 VLOG_WARN("%s: creating tap device failed: %s", name,
572 /* Make non-blocking. */
573 error = set_nonblocking(state->fd);
578 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_class);
579 *netdev_devp = &netdev_dev->netdev_dev;
588 destroy_tap(struct netdev_dev_linux *netdev_dev)
590 struct tap_state *state = &netdev_dev->state.tap;
592 if (state->fd >= 0) {
597 /* Destroys the netdev device 'netdev_dev_'. */
599 netdev_linux_destroy(struct netdev_dev *netdev_dev_)
601 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
602 const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
604 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
605 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
608 if (class == &netdev_linux_class || class == &netdev_internal_class) {
609 cache_notifier_refcount--;
611 if (!cache_notifier_refcount) {
612 rtnetlink_link_notifier_unregister(&netdev_linux_cache_notifier);
614 } else if (class == &netdev_tap_class) {
615 destroy_tap(netdev_dev);
624 netdev_linux_open(struct netdev_dev *netdev_dev_, int ethertype,
625 struct netdev **netdevp)
627 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
628 struct netdev_linux *netdev;
629 enum netdev_flags flags;
632 /* Allocate network device. */
633 netdev = xzalloc(sizeof *netdev);
635 netdev_init(&netdev->netdev, netdev_dev_);
637 /* Verify that the device really exists, by attempting to read its flags.
638 * (The flags might be cached, in which case this won't actually do an
641 * Don't do this for "internal" netdevs, though, because those have to be
642 * created as netdev objects before they exist in the kernel, because
643 * creating them in the kernel happens by passing a netdev object to
644 * dpif_port_add(). */
645 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
646 error = netdev_get_flags(&netdev->netdev, &flags);
647 if (error == ENODEV) {
652 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
653 !netdev_dev->state.tap.opened) {
655 /* We assume that the first user of the tap device is the primary user
656 * and give them the tap FD. Subsequent users probably just expect
657 * this to be a system device so open it normally to avoid send/receive
658 * directions appearing to be reversed. */
659 netdev->fd = netdev_dev->state.tap.fd;
660 netdev_dev->state.tap.opened = true;
661 } else if (ethertype != NETDEV_ETH_TYPE_NONE) {
662 struct sockaddr_ll sll;
666 /* Create file descriptor. */
667 protocol = (ethertype == NETDEV_ETH_TYPE_ANY ? ETH_P_ALL
668 : ethertype == NETDEV_ETH_TYPE_802_2 ? ETH_P_802_2
670 netdev->fd = socket(PF_PACKET, SOCK_RAW, htons(protocol));
671 if (netdev->fd < 0) {
676 /* Set non-blocking mode. */
677 error = set_nonblocking(netdev->fd);
682 /* Get ethernet device index. */
683 error = get_ifindex(&netdev->netdev, &ifindex);
688 /* Bind to specific ethernet device. */
689 memset(&sll, 0, sizeof sll);
690 sll.sll_family = AF_PACKET;
691 sll.sll_ifindex = ifindex;
693 (struct sockaddr *) &sll, sizeof sll) < 0) {
695 VLOG_ERR("bind to %s failed: %s", netdev_dev_get_name(netdev_dev_),
700 /* Between the socket() and bind() calls above, the socket receives all
701 * packets of the requested type on all system interfaces. We do not
702 * want to receive that data, but there is no way to avoid it. So we
703 * must now drain out the receive queue. */
704 error = drain_rcvbuf(netdev->fd);
710 *netdevp = &netdev->netdev;
714 netdev_uninit(&netdev->netdev, true);
718 /* Closes and destroys 'netdev'. */
720 netdev_linux_close(struct netdev *netdev_)
722 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* Any non-negative descriptor is an open one -- 0 included -- which matches
 * the "fd < 0 means no descriptor" tests used by the recv, drain and send
 * paths.  Devices of type "tap" are deliberately excluded from this branch.
 * NOTE(review): presumably destroy_tap() is what releases the tap
 * descriptor -- confirm. */
724 if (netdev->fd >= 0 && strcmp(netdev_get_type(netdev_), "tap")) {
730 /* Initializes 'svec' with a list of the names of all known network devices. */
732 netdev_linux_enumerate(struct svec *svec)
734 struct if_nameindex *names;
736 names = if_nameindex();
740 for (i = 0; names[i].if_name != NULL; i++) {
741 svec_add(svec, names[i].if_name);
743 if_freenameindex(names);
746 VLOG_WARN("could not obtain list of network device names: %s",
/* Receives a packet from 'netdev_' by reading up to 'size' bytes from the
 * device's file descriptor into 'data'.  A negative descriptor means the
 * device was opened with NETDEV_ETH_TYPE_NONE, so there is nothing to read
 * from.  EINTR and EAGAIN from read() are not logged. */
753 netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
755 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
757 if (netdev->fd < 0) {
758 /* Device was opened with NETDEV_ETH_TYPE_NONE. */
763 ssize_t retval = read(netdev->fd, data, size);
766 } else if (errno != EINTR) {
767 if (errno != EAGAIN) {
/* The format reads "... on <device>: <error>", so the device name has to
 * come before the strerror() text, the same order the warning in
 * netdev_linux_send() uses. */
768 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
769 netdev_get_name(netdev_), strerror(errno));
776 /* Registers with the poll loop to wake up from the next call to poll_block()
777 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
779 netdev_linux_recv_wait(struct netdev *netdev_)
781 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
782 if (netdev->fd >= 0) {
783 poll_fd_wait(netdev->fd, POLLIN);
787 /* Discards all packets waiting to be received from 'netdev'. */
789 netdev_linux_drain(struct netdev *netdev_)
791 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
792 if (netdev->fd < 0) {
794 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
796 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
797 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
801 drain_fd(netdev->fd, ifr.ifr_qlen);
804 return drain_rcvbuf(netdev->fd);
808 /* Sends the 'size' bytes in 'data' on 'netdev'. Returns 0 if successful,
809 * otherwise a positive errno value. Returns EAGAIN without blocking if the
810 * packet cannot be queued immediately. Returns EMSGSIZE if a partial packet
811 * was transmitted or if the packet is too big or too small for the device.
813 * The caller retains ownership of 'data' in all cases.
815 * The kernel maintains a packet transmission queue, so the caller is not
816 * expected to do additional queuing of packets. */
818 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
820 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
822 /* XXX should support sending even if 'ethertype' was NETDEV_ETH_TYPE_NONE.
824 if (netdev->fd < 0) {
829 ssize_t retval = write(netdev->fd, data, size);
831 /* The Linux AF_PACKET implementation never blocks waiting for room
832 * for packets, instead returning ENOBUFS. Translate this into
833 * EAGAIN for the caller. */
834 if (errno == ENOBUFS) {
836 } else if (errno == EINTR) {
838 } else if (errno != EAGAIN) {
839 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
840 netdev_get_name(netdev_), strerror(errno));
843 } else if (retval != size) {
844 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
845 "%zu) on %s", retval, size, netdev_get_name(netdev_));
853 /* Registers with the poll loop to wake up from the next call to poll_block()
854 * when the packet transmission queue has sufficient room to transmit a packet
855 * with netdev_send().
857 * The kernel maintains a packet transmission queue, so the client is not
858 * expected to do additional queuing of packets. Thus, this function is
859 * unlikely to ever be used. It is included for completeness. */
861 netdev_linux_send_wait(struct netdev *netdev_)
863 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
864 if (netdev->fd < 0) {
866 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
867 poll_fd_wait(netdev->fd, POLLOUT);
869 /* TAP device always accepts packets.*/
870 poll_immediate_wake();
874 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
875 * otherwise a positive errno value. */
877 netdev_linux_set_etheraddr(struct netdev *netdev_,
878 const uint8_t mac[ETH_ADDR_LEN])
880 struct netdev_dev_linux *netdev_dev =
881 netdev_dev_linux_cast(netdev_get_dev(netdev_));
884 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)
885 || !eth_addr_equals(netdev_dev->etheraddr, mac)) {
886 error = set_etheraddr(netdev_get_name(netdev_), ARPHRD_ETHER, mac);
888 netdev_dev->cache_valid |= VALID_ETHERADDR;
889 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
897 /* Copies 'netdev''s MAC address into the caller-provided buffer 'mac', which
898 * must have room for ETH_ADDR_LEN bytes. */
900 netdev_linux_get_etheraddr(const struct netdev *netdev_,
901 uint8_t mac[ETH_ADDR_LEN])
903 struct netdev_dev_linux *netdev_dev =
904 netdev_dev_linux_cast(netdev_get_dev(netdev_));
905 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
906 int error = get_etheraddr(netdev_get_name(netdev_),
907 netdev_dev->etheraddr);
911 netdev_dev->cache_valid |= VALID_ETHERADDR;
913 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
917 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
918 * in bytes, not including the hardware header; thus, this is typically 1500
919 * bytes for Ethernet devices. */
921 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
923 struct netdev_dev_linux *netdev_dev =
924 netdev_dev_linux_cast(netdev_get_dev(netdev_));
925 if (!(netdev_dev->cache_valid & VALID_MTU)) {
929 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
930 SIOCGIFMTU, "SIOCGIFMTU");
934 netdev_dev->mtu = ifr.ifr_mtu;
935 netdev_dev->cache_valid |= VALID_MTU;
937 *mtup = netdev_dev->mtu;
941 /* Returns the ifindex of 'netdev', if successful, as a positive number.
942 * On failure, returns a negative errno value. */
944 netdev_linux_get_ifindex(const struct netdev *netdev)
948 error = get_ifindex(netdev, &ifindex);
949 return error ? -error : ifindex;
953 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
955 struct netdev_dev_linux *netdev_dev =
956 netdev_dev_linux_cast(netdev_get_dev(netdev_));
961 if (!(netdev_dev->cache_valid & VALID_CARRIER)) {
965 fn = xasprintf("/sys/class/net/%s/carrier",
966 netdev_get_name(netdev_));
967 fd = open(fn, O_RDONLY);
970 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(error));
974 retval = read(fd, line, sizeof line);
977 if (error == EINVAL) {
978 /* This is the normal return value when we try to check carrier
979 * if the network device is not up. */
981 VLOG_WARN_RL(&rl, "%s: read failed: %s", fn, strerror(error));
984 } else if (retval == 0) {
986 VLOG_WARN_RL(&rl, "%s: unexpected end of file", fn);
990 if (line[0] != '0' && line[0] != '1') {
992 VLOG_WARN_RL(&rl, "%s: value is %c (expected 0 or 1)",
996 netdev_dev->carrier = line[0] != '0';
997 netdev_dev->cache_valid |= VALID_CARRIER;
999 *carrier = netdev_dev->carrier;
1010 /* Check whether we can use RTM_GETLINK to get network device statistics.
1011 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1014 check_for_working_netlink_stats(void)
1016 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1017 * preferable, so if that works, we'll use it. */
1018 int ifindex = do_get_ifindex("lo");
1020 VLOG_WARN("failed to get ifindex for lo, "
1021 "obtaining netdev stats from proc");
1024 struct netdev_stats stats;
1025 int error = get_stats_via_netlink(ifindex, &stats);
1027 VLOG_DBG("obtaining netdev stats via rtnetlink");
1030 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1031 "via proc (you are probably running a pre-2.6.19 "
1032 "kernel)", strerror(error));
1038 /* Brings the 'is_internal' and 'is_tap' members of 'netdev_dev' up-to-date. */
1040 netdev_linux_update_is_pseudo(struct netdev_dev_linux *netdev_dev)
1042 if (!(netdev_dev->cache_valid & VALID_IS_PSEUDO)) {
1043 const char *name = netdev_dev_get_name(&netdev_dev->netdev_dev);
1044 const char *type = netdev_dev_get_type(&netdev_dev->netdev_dev);
1046 netdev_dev->is_tap = !strcmp(type, "tap");
1047 netdev_dev->is_internal = false;
1048 if (!netdev_dev->is_tap) {
1049 struct ethtool_drvinfo drvinfo;
1052 memset(&drvinfo, 0, sizeof drvinfo);
1053 error = netdev_linux_do_ethtool(name,
1054 (struct ethtool_cmd *)&drvinfo,
1056 "ETHTOOL_GDRVINFO");
1058 if (!error && !strcmp(drvinfo.driver, "openvswitch")) {
1059 netdev_dev->is_internal = true;
1063 netdev_dev->cache_valid |= VALID_IS_PSEUDO;
1068 swap_uint64(uint64_t *a, uint64_t *b)
1075 /* Retrieves current device stats for 'netdev'. */
1077 netdev_linux_get_stats(const struct netdev *netdev_,
1078 struct netdev_stats *stats)
1080 struct netdev_dev_linux *netdev_dev =
1081 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1082 static int use_netlink_stats = -1;
1085 if (netdev_dev->have_vport_stats ||
1086 !(netdev_dev->cache_valid & VALID_HAVE_VPORT_STATS)) {
1088 error = netdev_vport_get_stats(netdev_, stats);
1089 netdev_dev->have_vport_stats = !error;
1090 netdev_dev->cache_valid |= VALID_HAVE_VPORT_STATS;
1093 if (!netdev_dev->have_vport_stats) {
1094 if (use_netlink_stats < 0) {
1095 use_netlink_stats = check_for_working_netlink_stats();
1097 if (use_netlink_stats) {
1100 error = get_ifindex(netdev_, &ifindex);
1102 error = get_stats_via_netlink(ifindex, stats);
1105 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1109 /* If this port is an internal port then the transmit and receive stats
1110 * will appear to be swapped relative to the other ports since we are the
1111 * one sending the data, not a remote computer. For consistency, we swap
1112 * them back here. This does not apply if we are getting stats from the
1113 * vport layer because it always tracks stats from the perspective of the
1115 netdev_linux_update_is_pseudo(netdev_dev);
1116 if (!error && !netdev_dev->have_vport_stats &&
1117 (netdev_dev->is_internal || netdev_dev->is_tap)) {
1118 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1119 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1120 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1121 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1122 stats->rx_length_errors = 0;
1123 stats->rx_over_errors = 0;
1124 stats->rx_crc_errors = 0;
1125 stats->rx_frame_errors = 0;
1126 stats->rx_fifo_errors = 0;
1127 stats->rx_missed_errors = 0;
1128 stats->tx_aborted_errors = 0;
1129 stats->tx_carrier_errors = 0;
1130 stats->tx_fifo_errors = 0;
1131 stats->tx_heartbeat_errors = 0;
1132 stats->tx_window_errors = 0;
1138 /* Stores the features supported by 'netdev' into each of '*current',
1139 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1140 * bitmap of "enum ofp_port_features" bits, in host byte order. Returns 0 if
1141 * successful, otherwise a positive errno value. */
1143 netdev_linux_get_features(struct netdev *netdev,
1144 uint32_t *current, uint32_t *advertised,
1145 uint32_t *supported, uint32_t *peer)
1147 struct ethtool_cmd ecmd;
1150 memset(&ecmd, 0, sizeof ecmd);
1151 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1152 ETHTOOL_GSET, "ETHTOOL_GSET");
1157 /* Supported features. */
1159 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1160 *supported |= OFPPF_10MB_HD;
1162 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1163 *supported |= OFPPF_10MB_FD;
1165 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1166 *supported |= OFPPF_100MB_HD;
1168 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1169 *supported |= OFPPF_100MB_FD;
1171 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1172 *supported |= OFPPF_1GB_HD;
1174 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1175 *supported |= OFPPF_1GB_FD;
1177 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1178 *supported |= OFPPF_10GB_FD;
1180 if (ecmd.supported & SUPPORTED_TP) {
1181 *supported |= OFPPF_COPPER;
1183 if (ecmd.supported & SUPPORTED_FIBRE) {
1184 *supported |= OFPPF_FIBER;
1186 if (ecmd.supported & SUPPORTED_Autoneg) {
1187 *supported |= OFPPF_AUTONEG;
1189 if (ecmd.supported & SUPPORTED_Pause) {
1190 *supported |= OFPPF_PAUSE;
1192 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1193 *supported |= OFPPF_PAUSE_ASYM;
1196 /* Advertised features. */
1198 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1199 *advertised |= OFPPF_10MB_HD;
1201 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1202 *advertised |= OFPPF_10MB_FD;
1204 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1205 *advertised |= OFPPF_100MB_HD;
1207 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1208 *advertised |= OFPPF_100MB_FD;
1210 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1211 *advertised |= OFPPF_1GB_HD;
1213 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1214 *advertised |= OFPPF_1GB_FD;
1216 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1217 *advertised |= OFPPF_10GB_FD;
1219 if (ecmd.advertising & ADVERTISED_TP) {
1220 *advertised |= OFPPF_COPPER;
1222 if (ecmd.advertising & ADVERTISED_FIBRE) {
1223 *advertised |= OFPPF_FIBER;
1225 if (ecmd.advertising & ADVERTISED_Autoneg) {
1226 *advertised |= OFPPF_AUTONEG;
1228 if (ecmd.advertising & ADVERTISED_Pause) {
1229 *advertised |= OFPPF_PAUSE;
1231 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1232 *advertised |= OFPPF_PAUSE_ASYM;
1235 /* Current settings. */
1236 if (ecmd.speed == SPEED_10) {
1237 *current = ecmd.duplex ? OFPPF_10MB_FD : OFPPF_10MB_HD;
1238 } else if (ecmd.speed == SPEED_100) {
1239 *current = ecmd.duplex ? OFPPF_100MB_FD : OFPPF_100MB_HD;
1240 } else if (ecmd.speed == SPEED_1000) {
1241 *current = ecmd.duplex ? OFPPF_1GB_FD : OFPPF_1GB_HD;
1242 } else if (ecmd.speed == SPEED_10000) {
1243 *current = OFPPF_10GB_FD;
1248 if (ecmd.port == PORT_TP) {
1249 *current |= OFPPF_COPPER;
1250 } else if (ecmd.port == PORT_FIBRE) {
1251 *current |= OFPPF_FIBER;
1255 *current |= OFPPF_AUTONEG;
1258 /* Peer advertisements. */
1259 *peer = 0; /* XXX */
1264 /* Set the features advertised by 'netdev' to 'advertise'. */
1266 netdev_linux_set_advertisements(struct netdev *netdev, uint32_t advertise)
1268 struct ethtool_cmd ecmd;
1271 memset(&ecmd, 0, sizeof ecmd);
1272 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1273 ETHTOOL_GSET, "ETHTOOL_GSET");
1278 ecmd.advertising = 0;
1279 if (advertise & OFPPF_10MB_HD) {
1280 ecmd.advertising |= ADVERTISED_10baseT_Half;
1282 if (advertise & OFPPF_10MB_FD) {
1283 ecmd.advertising |= ADVERTISED_10baseT_Full;
1285 if (advertise & OFPPF_100MB_HD) {
1286 ecmd.advertising |= ADVERTISED_100baseT_Half;
1288 if (advertise & OFPPF_100MB_FD) {
1289 ecmd.advertising |= ADVERTISED_100baseT_Full;
1291 if (advertise & OFPPF_1GB_HD) {
1292 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1294 if (advertise & OFPPF_1GB_FD) {
1295 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1297 if (advertise & OFPPF_10GB_FD) {
1298 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1300 if (advertise & OFPPF_COPPER) {
1301 ecmd.advertising |= ADVERTISED_TP;
1303 if (advertise & OFPPF_FIBER) {
1304 ecmd.advertising |= ADVERTISED_FIBRE;
1306 if (advertise & OFPPF_AUTONEG) {
1307 ecmd.advertising |= ADVERTISED_Autoneg;
1309 if (advertise & OFPPF_PAUSE) {
1310 ecmd.advertising |= ADVERTISED_Pause;
1312 if (advertise & OFPPF_PAUSE_ASYM) {
1313 ecmd.advertising |= ADVERTISED_Asym_Pause;
1315 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1316 ETHTOOL_SSET, "ETHTOOL_SSET");
1319 /* If 'netdev_name' is the name of a VLAN network device (e.g. one created with
1320 * vconfig(8)), sets '*vlan_vid' to the VLAN VID associated with that device
1321 * and returns 0. Otherwise returns a errno value (specifically ENOENT if
1322 * 'netdev_name' is the name of a network device that is not a VLAN device) and
1323 * sets '*vlan_vid' to -1. */
1325 netdev_linux_get_vlan_vid(const struct netdev *netdev, int *vlan_vid)
1327 const char *netdev_name = netdev_get_name(netdev);
1328 struct ds line = DS_EMPTY_INITIALIZER;
1329 FILE *stream = NULL;
1333 COVERAGE_INC(netdev_get_vlan_vid);
1334 fn = xasprintf("/proc/net/vlan/%s", netdev_name);
1335 stream = fopen(fn, "r");
1341 if (ds_get_line(&line, stream)) {
1342 if (ferror(stream)) {
1344 VLOG_ERR_RL(&rl, "error reading \"%s\": %s", fn, strerror(errno));
1347 VLOG_ERR_RL(&rl, "unexpected end of file reading \"%s\"", fn);
1352 if (!sscanf(ds_cstr(&line), "%*s VID: %d", vlan_vid)) {
1354 VLOG_ERR_RL(&rl, "parse error reading \"%s\" line 1: \"%s\"",
1355 fn, ds_cstr(&line));
/* Shell command templates that netdev_linux_set_policing() expands with
 * snprintf() and runs through system().  POLICE_ADD_CMD takes the device
 * name; POLICE_CONFIG_CMD takes the device name, the policing rate and the
 * burst size, in that order. */
1373 #define POLICE_ADD_CMD "/sbin/tc qdisc add dev %s handle ffff: ingress"
1374 #define POLICE_CONFIG_CMD "/sbin/tc filter add dev %s parent ffff: protocol ip prio 50 u32 match ip src 0.0.0.0/0 police rate %dkbit burst %dk mtu 65535 drop flowid :1"
1376 /* Remove ingress policing from 'netdev'. Returns 0 if successful, otherwise a
1377 * positive errno value.
1379 * This function is equivalent to running
1380 * /sbin/tc qdisc del dev %s handle ffff: ingress
1381 * but it is much, much faster.
1384 netdev_linux_remove_policing(struct netdev *netdev)
1386 struct netdev_dev_linux *netdev_dev =
1387 netdev_dev_linux_cast(netdev_get_dev(netdev));
1388 const char *netdev_name = netdev_get_name(netdev);
1390 struct ofpbuf request;
1391 struct tcmsg *tcmsg;
1394 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
1398 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
1399 tcmsg->tcm_parent = TC_H_INGRESS;
1400 nl_msg_put_string(&request, TCA_KIND, "ingress");
1401 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
1403 error = tc_transact(&request, NULL);
1404 if (error && error != ENOENT && error != EINVAL) {
1405 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1406 netdev_name, strerror(error));
1410 netdev_dev->kbits_rate = 0;
1411 netdev_dev->kbits_burst = 0;
1412 netdev_dev->cache_valid |= VALID_POLICING;
1416 /* Attempts to set input rate limiting (policing) policy. */
1418 netdev_linux_set_policing(struct netdev *netdev,
1419 uint32_t kbits_rate, uint32_t kbits_burst)
1421 struct netdev_dev_linux *netdev_dev =
1422 netdev_dev_linux_cast(netdev_get_dev(netdev));
1423 const char *netdev_name = netdev_get_name(netdev);
1426 COVERAGE_INC(netdev_set_policing);
1428 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1429 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1430 : kbits_burst); /* Stick with user-specified value. */
1432 if (netdev_dev->cache_valid & VALID_POLICING
1433 && netdev_dev->kbits_rate == kbits_rate
1434 && netdev_dev->kbits_burst == kbits_burst) {
1435 /* Assume that settings haven't changed since we last set them. */
1439 netdev_linux_remove_policing(netdev);
1441 snprintf(command, sizeof(command), POLICE_ADD_CMD, netdev_name);
1442 if (system(command) != 0) {
1443 VLOG_WARN_RL(&rl, "%s: problem adding policing", netdev_name);
1447 snprintf(command, sizeof(command), POLICE_CONFIG_CMD, netdev_name,
1448 kbits_rate, kbits_burst);
1449 if (system(command) != 0) {
1450 VLOG_WARN_RL(&rl, "%s: problem configuring policing",
1455 netdev_dev->kbits_rate = kbits_rate;
1456 netdev_dev->kbits_burst = kbits_burst;
1457 netdev_dev->cache_valid |= VALID_POLICING;
1464 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1467 const struct tc_ops **opsp;
1469 for (opsp = tcs; *opsp != NULL; opsp++) {
1470 const struct tc_ops *ops = *opsp;
1471 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1472 svec_add(types, ops->ovs_name);
1478 static const struct tc_ops *
1479 tc_lookup_ovs_name(const char *name)
1481 const struct tc_ops **opsp;
1483 for (opsp = tcs; *opsp != NULL; opsp++) {
1484 const struct tc_ops *ops = *opsp;
1485 if (!strcmp(name, ops->ovs_name)) {
1492 static const struct tc_ops *
1493 tc_lookup_linux_name(const char *name)
1495 const struct tc_ops **opsp;
1497 for (opsp = tcs; *opsp != NULL; opsp++) {
1498 const struct tc_ops *ops = *opsp;
1499 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
1506 static struct tc_queue *
1507 tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1510 struct netdev_dev_linux *netdev_dev =
1511 netdev_dev_linux_cast(netdev_get_dev(netdev));
1512 struct tc_queue *queue;
1514 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
1515 if (queue->queue_id == queue_id) {
/* Like tc_find_queue__(), but computes the hash of 'queue_id' itself. */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    size_t hash = hash_int(queue_id, 0);

    return tc_find_queue__(netdev, queue_id, hash);
}
1529 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1531 struct netdev_qos_capabilities *caps)
1533 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1537 caps->n_queues = ops->n_queues;
1542 netdev_linux_get_qos(const struct netdev *netdev,
1543 const char **typep, struct shash *details)
1545 struct netdev_dev_linux *netdev_dev =
1546 netdev_dev_linux_cast(netdev_get_dev(netdev));
1549 error = tc_query_qdisc(netdev);
1554 *typep = netdev_dev->tc->ops->ovs_name;
1555 return (netdev_dev->tc->ops->qdisc_get
1556 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
1561 netdev_linux_set_qos(struct netdev *netdev,
1562 const char *type, const struct shash *details)
1564 struct netdev_dev_linux *netdev_dev =
1565 netdev_dev_linux_cast(netdev_get_dev(netdev));
1566 const struct tc_ops *new_ops;
1569 new_ops = tc_lookup_ovs_name(type);
1570 if (!new_ops || !new_ops->tc_install) {
1574 error = tc_query_qdisc(netdev);
1579 if (new_ops == netdev_dev->tc->ops) {
1580 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1582 /* Delete existing qdisc. */
1583 error = tc_del_qdisc(netdev);
1587 assert(netdev_dev->tc == NULL);
1589 /* Install new qdisc. */
1590 error = new_ops->tc_install(netdev, details);
1591 assert((error == 0) == (netdev_dev->tc != NULL));
1598 netdev_linux_get_queue(const struct netdev *netdev,
1599 unsigned int queue_id, struct shash *details)
1601 struct netdev_dev_linux *netdev_dev =
1602 netdev_dev_linux_cast(netdev_get_dev(netdev));
1605 error = tc_query_qdisc(netdev);
1609 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1611 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
1617 netdev_linux_set_queue(struct netdev *netdev,
1618 unsigned int queue_id, const struct shash *details)
1620 struct netdev_dev_linux *netdev_dev =
1621 netdev_dev_linux_cast(netdev_get_dev(netdev));
1624 error = tc_query_qdisc(netdev);
1627 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1628 || !netdev_dev->tc->ops->class_set) {
1632 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
1636 netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1638 struct netdev_dev_linux *netdev_dev =
1639 netdev_dev_linux_cast(netdev_get_dev(netdev));
1642 error = tc_query_qdisc(netdev);
1645 } else if (!netdev_dev->tc->ops->class_delete) {
1648 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1650 ? netdev_dev->tc->ops->class_delete(netdev, queue)
1656 netdev_linux_get_queue_stats(const struct netdev *netdev,
1657 unsigned int queue_id,
1658 struct netdev_queue_stats *stats)
1660 struct netdev_dev_linux *netdev_dev =
1661 netdev_dev_linux_cast(netdev_get_dev(netdev));
1664 error = tc_query_qdisc(netdev);
1667 } else if (!netdev_dev->tc->ops->class_get_stats) {
1670 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1672 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
1678 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1680 struct ofpbuf request;
1681 struct tcmsg *tcmsg;
1683 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
1687 tcmsg->tcm_parent = 0;
1688 nl_dump_start(dump, rtnl_sock, &request);
1689 ofpbuf_uninit(&request);
1694 netdev_linux_dump_queues(const struct netdev *netdev,
1695 netdev_dump_queues_cb *cb, void *aux)
1697 struct netdev_dev_linux *netdev_dev =
1698 netdev_dev_linux_cast(netdev_get_dev(netdev));
1699 struct tc_queue *queue;
1700 struct shash details;
1704 error = tc_query_qdisc(netdev);
1707 } else if (!netdev_dev->tc->ops->class_get) {
1712 shash_init(&details);
1713 HMAP_FOR_EACH (queue, hmap_node, &netdev_dev->tc->queues) {
1714 shash_clear(&details);
1716 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
1718 (*cb)(queue->queue_id, &details, aux);
1723 shash_destroy(&details);
1729 netdev_linux_dump_queue_stats(const struct netdev *netdev,
1730 netdev_dump_queue_stats_cb *cb, void *aux)
1732 struct netdev_dev_linux *netdev_dev =
1733 netdev_dev_linux_cast(netdev_get_dev(netdev));
1734 struct nl_dump dump;
1739 error = tc_query_qdisc(netdev);
1742 } else if (!netdev_dev->tc->ops->class_dump_stats) {
1747 if (!start_queue_dump(netdev, &dump)) {
1750 while (nl_dump_next(&dump, &msg)) {
1751 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
1757 error = nl_dump_done(&dump);
1758 return error ? error : last_error;
1762 netdev_linux_get_in4(const struct netdev *netdev_,
1763 struct in_addr *address, struct in_addr *netmask)
1765 struct netdev_dev_linux *netdev_dev =
1766 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1768 if (!(netdev_dev->cache_valid & VALID_IN4)) {
1771 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
1772 SIOCGIFADDR, "SIOCGIFADDR");
1777 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
1778 SIOCGIFNETMASK, "SIOCGIFNETMASK");
1783 netdev_dev->cache_valid |= VALID_IN4;
1785 *address = netdev_dev->address;
1786 *netmask = netdev_dev->netmask;
1787 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
1791 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
1792 struct in_addr netmask)
1794 struct netdev_dev_linux *netdev_dev =
1795 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1798 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
1800 netdev_dev->cache_valid |= VALID_IN4;
1801 netdev_dev->address = address;
1802 netdev_dev->netmask = netmask;
1803 if (address.s_addr != INADDR_ANY) {
1804 error = do_set_addr(netdev_, SIOCSIFNETMASK,
1805 "SIOCSIFNETMASK", netmask);
/* Parses 'line', one line in the format of /proc/net/if_inet6: 32 hex digits
 * of IPv6 address, four hex fields that are skipped, and an interface name of
 * at most 16 characters.  Stores the address bytes in '*in6' and the name in
 * 'ifname'.  Returns true only if all 17 fields were converted. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    uint8_t *b = in6->s6_addr;
    int n_converted;

#define X8 "%2"SCNx8
    n_converted = sscanf(line,
                         " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                         "%*x %*x %*x %*x %16s\n",
                         &b[0], &b[1], &b[2], &b[3],
                         &b[4], &b[5], &b[6], &b[7],
                         &b[8], &b[9], &b[10], &b[11],
                         &b[12], &b[13], &b[14], &b[15],
                         ifname);

    /* 16 address bytes plus the interface name. */
    return n_converted == 17;
}
1827 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
1828 * 'in6' is non-null) and returns true. Otherwise, returns false. */
1830 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
1832 struct netdev_dev_linux *netdev_dev =
1833 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1834 if (!(netdev_dev->cache_valid & VALID_IN6)) {
1838 netdev_dev->in6 = in6addr_any;
1840 file = fopen("/proc/net/if_inet6", "r");
1842 const char *name = netdev_get_name(netdev_);
1843 while (fgets(line, sizeof line, file)) {
1844 struct in6_addr in6_tmp;
1845 char ifname[16 + 1];
1846 if (parse_if_inet6_line(line, &in6_tmp, ifname)
1847 && !strcmp(name, ifname))
1849 netdev_dev->in6 = in6_tmp;
1855 netdev_dev->cache_valid |= VALID_IN6;
1857 *in6 = netdev_dev->in6;
/* Overwrites '*sa' with an AF_INET socket address for 'addr' with port 0.
 * All of '*sa' is zeroed first, so no stale bytes survive. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in in4;

    memset(&in4, 0, sizeof in4);
    in4.sin_port = 0;
    in4.sin_addr = addr;
    in4.sin_family = AF_INET;

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &in4, sizeof in4);
}
1875 do_set_addr(struct netdev *netdev,
1876 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
1879 strncpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
1880 make_in4_sockaddr(&ifr.ifr_addr, addr);
1882 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
1886 /* Adds 'router' as a default IP gateway. */
1888 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
1890 struct in_addr any = { INADDR_ANY };
1894 memset(&rt, 0, sizeof rt);
1895 make_in4_sockaddr(&rt.rt_dst, any);
1896 make_in4_sockaddr(&rt.rt_gateway, router);
1897 make_in4_sockaddr(&rt.rt_genmask, any);
1898 rt.rt_flags = RTF_UP | RTF_GATEWAY;
1899 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
1901 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
1907 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
1910 static const char fn[] = "/proc/net/route";
1915 *netdev_name = NULL;
1916 stream = fopen(fn, "r");
1917 if (stream == NULL) {
1918 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
1923 while (fgets(line, sizeof line, stream)) {
1926 uint32_t dest, gateway, mask;
1927 int refcnt, metric, mtu;
1928 unsigned int flags, use, window, irtt;
1931 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
1933 iface, &dest, &gateway, &flags, &refcnt,
1934 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
1936 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
1940 if (!(flags & RTF_UP)) {
1941 /* Skip routes that aren't up. */
1945 /* The output of 'dest', 'mask', and 'gateway' were given in
1946 * network byte order, so we don't need need any endian
1947 * conversions here. */
1948 if ((dest & mask) == (host->s_addr & mask)) {
1950 /* The host is directly reachable. */
1951 next_hop->s_addr = 0;
1953 /* To reach the host, we must go through a gateway. */
1954 next_hop->s_addr = gateway;
1956 *netdev_name = xstrdup(iface);
1967 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
1968 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
1969 * returns 0. Otherwise, it returns a positive errno value; in particular,
1970 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
1972 netdev_linux_arp_lookup(const struct netdev *netdev,
1973 uint32_t ip, uint8_t mac[ETH_ADDR_LEN])
1976 struct sockaddr_in sin;
1979 memset(&r, 0, sizeof r);
1980 sin.sin_family = AF_INET;
1981 sin.sin_addr.s_addr = ip;
1983 memcpy(&r.arp_pa, &sin, sizeof sin);
1984 r.arp_ha.sa_family = ARPHRD_ETHER;
1986 strncpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
1987 COVERAGE_INC(netdev_arp_lookup);
1988 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
1990 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
1991 } else if (retval != ENXIO) {
1992 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
1993 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
1999 nd_to_iff_flags(enum netdev_flags nd)
2002 if (nd & NETDEV_UP) {
2005 if (nd & NETDEV_PROMISC) {
2012 iff_to_nd_flags(int iff)
2014 enum netdev_flags nd = 0;
2018 if (iff & IFF_PROMISC) {
2019 nd |= NETDEV_PROMISC;
2025 netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2026 enum netdev_flags on, enum netdev_flags *old_flagsp)
2028 int old_flags, new_flags;
2031 error = get_flags(netdev, &old_flags);
2033 *old_flagsp = iff_to_nd_flags(old_flags);
2034 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2035 if (new_flags != old_flags) {
2036 error = set_flags(netdev, new_flags);
2043 poll_notify(struct list *list)
2045 struct netdev_linux_notifier *notifier;
2046 LIST_FOR_EACH (notifier, node, list) {
2047 struct netdev_notifier *n = ¬ifier->notifier;
2053 netdev_linux_poll_cb(const struct rtnetlink_link_change *change,
2054 void *aux OVS_UNUSED)
2057 struct list *list = shash_find_data(&netdev_linux_notifiers,
2063 struct shash_node *node;
2064 SHASH_FOR_EACH (node, &netdev_linux_notifiers) {
2065 poll_notify(node->data);
2071 netdev_linux_poll_add(struct netdev *netdev,
2072 void (*cb)(struct netdev_notifier *), void *aux,
2073 struct netdev_notifier **notifierp)
2075 const char *netdev_name = netdev_get_name(netdev);
2076 struct netdev_linux_notifier *notifier;
2079 if (shash_is_empty(&netdev_linux_notifiers)) {
2081 error = rtnetlink_link_notifier_register(&netdev_linux_poll_notifier,
2082 netdev_linux_poll_cb, NULL);
2088 list = shash_find_data(&netdev_linux_notifiers, netdev_name);
2090 list = xmalloc(sizeof *list);
2092 shash_add(&netdev_linux_notifiers, netdev_name, list);
2095 notifier = xmalloc(sizeof *notifier);
2096 netdev_notifier_init(¬ifier->notifier, netdev, cb, aux);
2097 list_push_back(list, ¬ifier->node);
2098 *notifierp = ¬ifier->notifier;
2103 netdev_linux_poll_remove(struct netdev_notifier *notifier_)
2105 struct netdev_linux_notifier *notifier =
2106 CONTAINER_OF(notifier_, struct netdev_linux_notifier, notifier);
2109 /* Remove 'notifier' from its list. */
2110 list = list_remove(¬ifier->node);
2111 if (list_is_empty(list)) {
2112 /* The list is now empty. Remove it from the hash and free it. */
2113 const char *netdev_name = netdev_get_name(notifier->notifier.netdev);
2114 shash_delete(&netdev_linux_notifiers,
2115 shash_find(&netdev_linux_notifiers, netdev_name));
2120 /* If that was the last notifier, unregister. */
2121 if (shash_is_empty(&netdev_linux_notifiers)) {
2122 rtnetlink_link_notifier_unregister(&netdev_linux_poll_notifier);
/* Expands to a positional list of netdev provider callbacks shared by the
 * netdev class definitions that follow.  NAME, CREATE, ENUMERATE and
 * SET_STATS are the entries that differ between classes.  The entries are
 * positional, so their order has to match the member order of the structure
 * being initialized (presumably 'struct netdev_class' -- confirm against
 * netdev-provider.h). */
2126 #define NETDEV_LINUX_CLASS(NAME, CREATE, ENUMERATE, SET_STATS) \
2130 netdev_linux_init, \
2132 netdev_linux_wait, \
2135 netdev_linux_destroy, \
2136 NULL, /* reconfigure */ \
2138 netdev_linux_open, \
2139 netdev_linux_close, \
2143 netdev_linux_recv, \
2144 netdev_linux_recv_wait, \
2145 netdev_linux_drain, \
2147 netdev_linux_send, \
2148 netdev_linux_send_wait, \
2150 netdev_linux_set_etheraddr, \
2151 netdev_linux_get_etheraddr, \
2152 netdev_linux_get_mtu, \
2153 netdev_linux_get_ifindex, \
2154 netdev_linux_get_carrier, \
2155 netdev_linux_get_stats, \
2158 netdev_linux_get_features, \
2159 netdev_linux_set_advertisements, \
2160 netdev_linux_get_vlan_vid, \
2162 netdev_linux_set_policing, \
2163 netdev_linux_get_qos_types, \
2164 netdev_linux_get_qos_capabilities, \
2165 netdev_linux_get_qos, \
2166 netdev_linux_set_qos, \
2167 netdev_linux_get_queue, \
2168 netdev_linux_set_queue, \
2169 netdev_linux_delete_queue, \
2170 netdev_linux_get_queue_stats, \
2171 netdev_linux_dump_queues, \
2172 netdev_linux_dump_queue_stats, \
2174 netdev_linux_get_in4, \
2175 netdev_linux_set_in4, \
2176 netdev_linux_get_in6, \
2177 netdev_linux_add_router, \
2178 netdev_linux_get_next_hop, \
2179 NULL, /* get_status */ \
2180 netdev_linux_arp_lookup, \
2182 netdev_linux_update_flags, \
2184 netdev_linux_poll_add, \
2185 netdev_linux_poll_remove \
/* Netdev class created with netdev_linux_create() and enumerable through
 * netdev_linux_enumerate(); it provides no set_stats callback. */
2188 const struct netdev_class netdev_linux_class =
2191 netdev_linux_create,
2192 netdev_linux_enumerate,
2193 NULL); /* set_stats */
/* Netdev class created with netdev_linux_create_tap(); it provides neither an
 * enumerate nor a set_stats callback. */
2195 const struct netdev_class netdev_tap_class =
2198 netdev_linux_create_tap,
2199 NULL, /* enumerate */
2200 NULL); /* set_stats */
/* Netdev class created with netdev_linux_create(); it provides no enumerate
 * callback and uses netdev_vport_set_stats() as its set_stats callback. */
2202 const struct netdev_class netdev_internal_class =
2205 netdev_linux_create,
2206 NULL, /* enumerate */
2207 netdev_vport_set_stats);
2209 /* HTB traffic control class. */
/* Largest class minor number that htb_parse_tcmsg__() accepts as a queue:
 * minors 1 through HTB_N_QUEUES map to queue IDs 0 through
 * HTB_N_QUEUES - 1. */
2211 #define HTB_N_QUEUES 0xf000
/* Qdisc-wide rate ceiling; htb_parse_class_details__() clamps per-class rates
 * to it. */
2215 unsigned int max_rate; /* In bytes/s. */
/* Per-queue state.  'tc_queue' is the embedded generic queue that
 * htb_class_cast__() converts back from with CONTAINER_OF. */
2219 struct tc_queue tc_queue;
2220 unsigned int min_rate; /* In bytes/s. */
2221 unsigned int max_rate; /* In bytes/s. */
2222 unsigned int burst; /* In bytes. */
2223 unsigned int priority; /* Lower values are higher priorities. */
2227 htb_get__(const struct netdev *netdev)
2229 struct netdev_dev_linux *netdev_dev =
2230 netdev_dev_linux_cast(netdev_get_dev(netdev));
2231 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
2235 htb_install__(struct netdev *netdev, uint64_t max_rate)
2237 struct netdev_dev_linux *netdev_dev =
2238 netdev_dev_linux_cast(netdev_get_dev(netdev));
2241 htb = xmalloc(sizeof *htb);
2242 tc_init(&htb->tc, &tc_ops_htb);
2243 htb->max_rate = max_rate;
2245 netdev_dev->tc = &htb->tc;
2250 /* Create an HTB qdisc.
2252 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2254 htb_setup_qdisc__(struct netdev *netdev)
2257 struct tc_htb_glob opt;
2258 struct ofpbuf request;
2259 struct tcmsg *tcmsg;
2261 tc_del_qdisc(netdev);
2263 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2264 NLM_F_EXCL | NLM_F_CREATE, &request);
2268 tcmsg->tcm_handle = tc_make_handle(1, 0);
2269 tcmsg->tcm_parent = TC_H_ROOT;
2271 nl_msg_put_string(&request, TCA_KIND, "htb");
2273 memset(&opt, 0, sizeof opt);
2274 opt.rate2quantum = 10;
2278 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2279 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2280 nl_msg_end_nested(&request, opt_offset);
2282 return tc_transact(&request, NULL);
2285 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2286 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2288 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2289 unsigned int parent, struct htb_class *class)
2292 struct tc_htb_opt opt;
2293 struct ofpbuf request;
2294 struct tcmsg *tcmsg;
2298 netdev_get_mtu(netdev, &mtu);
2300 memset(&opt, 0, sizeof opt);
2301 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2302 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2303 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2304 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2305 opt.prio = class->priority;
2307 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2311 tcmsg->tcm_handle = handle;
2312 tcmsg->tcm_parent = parent;
2314 nl_msg_put_string(&request, TCA_KIND, "htb");
2315 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2316 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2317 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2318 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2319 nl_msg_end_nested(&request, opt_offset);
2321 error = tc_transact(&request, NULL);
2323 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2324 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2325 netdev_get_name(netdev),
2326 tc_get_major(handle), tc_get_minor(handle),
2327 tc_get_major(parent), tc_get_minor(parent),
2328 class->min_rate, class->max_rate,
2329 class->burst, class->priority, strerror(error));
2334 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2335 * description of them into 'details'. The description complies with the
2336 * specification given in the vswitch database documentation for linux-htb
2339 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2341 static const struct nl_policy tca_htb_policy[] = {
2342 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2343 .min_len = sizeof(struct tc_htb_opt) },
2346 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2347 const struct tc_htb_opt *htb;
2349 if (!nl_parse_nested(nl_options, tca_htb_policy,
2350 attrs, ARRAY_SIZE(tca_htb_policy))) {
2351 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2355 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2356 class->min_rate = htb->rate.rate;
2357 class->max_rate = htb->ceil.rate;
2358 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2359 class->priority = htb->prio;
2364 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2365 struct htb_class *options,
2366 struct netdev_queue_stats *stats)
2368 struct nlattr *nl_options;
2369 unsigned int handle;
2372 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2373 if (!error && queue_id) {
2374 unsigned int major = tc_get_major(handle);
2375 unsigned int minor = tc_get_minor(handle);
2376 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2377 *queue_id = minor - 1;
2382 if (!error && options) {
2383 error = htb_parse_tca_options__(nl_options, options);
2389 htb_parse_qdisc_details__(struct netdev *netdev,
2390 const struct shash *details, struct htb_class *hc)
2392 const char *max_rate_s;
2394 max_rate_s = shash_find_data(details, "max-rate");
2395 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2396 if (!hc->max_rate) {
2399 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2400 hc->max_rate = netdev_features_to_bps(current) / 8;
2402 hc->min_rate = hc->max_rate;
2408 htb_parse_class_details__(struct netdev *netdev,
2409 const struct shash *details, struct htb_class *hc)
2411 const struct htb *htb = htb_get__(netdev);
2412 const char *min_rate_s = shash_find_data(details, "min-rate");
2413 const char *max_rate_s = shash_find_data(details, "max-rate");
2414 const char *burst_s = shash_find_data(details, "burst");
2415 const char *priority_s = shash_find_data(details, "priority");
2418 /* min-rate. Don't allow a min-rate below 1500 bytes/s. */
2420 /* min-rate is required. */
2423 hc->min_rate = strtoull(min_rate_s, NULL, 10) / 8;
2424 hc->min_rate = MAX(hc->min_rate, 1500);
2425 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2428 hc->max_rate = (max_rate_s
2429 ? strtoull(max_rate_s, NULL, 10) / 8
2431 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2432 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2436 * According to hints in the documentation that I've read, it is important
2437 * that 'burst' be at least as big as the largest frame that might be
2438 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2439 * but having it a bit too small is a problem. Since netdev_get_mtu()
2440 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2441 * the MTU. We actually add 64, instead of 14, as a guard against
2442 * additional headers get tacked on somewhere that we're not aware of. */
2443 netdev_get_mtu(netdev, &mtu);
2444 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2445 hc->burst = MAX(hc->burst, mtu + 64);
2448 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Queries the kernel for class 'handle' under 'parent' on 'netdev' and parses
 * the reply into '*options' and '*stats', as htb_parse_tcmsg__() would.
 * Returns 0 if successful, otherwise the error from the query or the
 * parse. */
static int
htb_query_class__(const struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *options,
                  struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error = tc_query_class(netdev, handle, parent, &reply);

    if (error) {
        return error;
    }

    error = htb_parse_tcmsg__(reply, NULL, options, stats);
    ofpbuf_delete(reply);
    return error;
}
2470 htb_tc_install(struct netdev *netdev, const struct shash *details)
2474 error = htb_setup_qdisc__(netdev);
2476 struct htb_class hc;
2478 htb_parse_qdisc_details__(netdev, details, &hc);
2479 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2480 tc_make_handle(1, 0), &hc);
2482 htb_install__(netdev, hc.max_rate);
2488 static struct htb_class *
2489 htb_class_cast__(const struct tc_queue *queue)
2491 return CONTAINER_OF(queue, struct htb_class, tc_queue);
2495 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2496 const struct htb_class *hc)
2498 struct htb *htb = htb_get__(netdev);
2499 size_t hash = hash_int(queue_id, 0);
2500 struct tc_queue *queue;
2501 struct htb_class *hcp;
2503 queue = tc_find_queue__(netdev, queue_id, hash);
2505 hcp = htb_class_cast__(queue);
2507 hcp = xmalloc(sizeof *hcp);
2508 queue = &hcp->tc_queue;
2509 queue->queue_id = queue_id;
2510 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2513 hcp->min_rate = hc->min_rate;
2514 hcp->max_rate = hc->max_rate;
2515 hcp->burst = hc->burst;
2516 hcp->priority = hc->priority;
2520 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2523 struct nl_dump dump;
2524 struct htb_class hc;
2527 /* Get qdisc options. */
2529 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2530 htb = htb_install__(netdev, hc.max_rate);
2533 if (!start_queue_dump(netdev, &dump)) {
2536 while (nl_dump_next(&dump, &msg)) {
2537 unsigned int queue_id;
2539 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2540 htb_update_queue__(netdev, queue_id, &hc);
2543 nl_dump_done(&dump);
2549 htb_tc_destroy(struct tc *tc)
2551 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2552 struct htb_class *hc, *next;
2554 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2555 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2563 htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2565 const struct htb *htb = htb_get__(netdev);
2566 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
2571 htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2573 struct htb_class hc;
2576 htb_parse_qdisc_details__(netdev, details, &hc);
2577 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2578 tc_make_handle(1, 0), &hc);
2580 htb_get__(netdev)->max_rate = hc.max_rate;
2586 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2587 const struct tc_queue *queue, struct shash *details)
2589 const struct htb_class *hc = htb_class_cast__(queue);
2591 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2592 if (hc->min_rate != hc->max_rate) {
2593 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2595 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2597 shash_add(details, "priority", xasprintf("%u", hc->priority));
2603 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2604 const struct shash *details)
2606 struct htb_class hc;
2609 error = htb_parse_class_details__(netdev, details, &hc);
2614 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2615 tc_make_handle(1, 0xfffe), &hc);
2620 htb_update_queue__(netdev, queue_id, &hc);
2625 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2627 struct htb_class *hc = htb_class_cast__(queue);
2628 struct htb *htb = htb_get__(netdev);
2631 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2633 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2640 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2641 struct netdev_queue_stats *stats)
2643 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2644 tc_make_handle(1, 0xfffe), NULL, stats);
2648 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2649 const struct ofpbuf *nlmsg,
2650 netdev_dump_queue_stats_cb *cb, void *aux)
2652 struct netdev_queue_stats stats;
2653 unsigned int handle, major, minor;
2656 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2661 major = tc_get_major(handle);
2662 minor = tc_get_minor(handle);
2663 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2664 (*cb)(minor - 1, &stats, aux);
2669 static const struct tc_ops tc_ops_htb = {
2670 "htb", /* linux_name */
2671 "linux-htb", /* ovs_name */
2672 HTB_N_QUEUES, /* n_queues */
2681 htb_class_get_stats,
2682 htb_class_dump_stats
2685 /* "linux-hfsc" traffic control class. */
2687 #define HFSC_N_QUEUES 0xf000
2695 struct tc_queue tc_queue;
2700 static struct hfsc *
2701 hfsc_get__(const struct netdev *netdev)
2703 struct netdev_dev_linux *netdev_dev;
2704 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2705 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
2708 static struct hfsc_class *
2709 hfsc_class_cast__(const struct tc_queue *queue)
2711 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
2714 static struct hfsc *
2715 hfsc_install__(struct netdev *netdev, uint32_t max_rate)
2717 struct netdev_dev_linux * netdev_dev;
2720 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2721 hfsc = xmalloc(sizeof *hfsc);
2722 tc_init(&hfsc->tc, &tc_ops_hfsc);
2723 hfsc->max_rate = max_rate;
2724 netdev_dev->tc = &hfsc->tc;
2730 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
2731 const struct hfsc_class *hc)
2735 struct hfsc_class *hcp;
2736 struct tc_queue *queue;
2738 hfsc = hfsc_get__(netdev);
2739 hash = hash_int(queue_id, 0);
2741 queue = tc_find_queue__(netdev, queue_id, hash);
2743 hcp = hfsc_class_cast__(queue);
2745 hcp = xmalloc(sizeof *hcp);
2746 queue = &hcp->tc_queue;
2747 queue->queue_id = queue_id;
2748 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
2751 hcp->min_rate = hc->min_rate;
2752 hcp->max_rate = hc->max_rate;
2756 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
2758 const struct tc_service_curve *rsc, *fsc, *usc;
2759 static const struct nl_policy tca_hfsc_policy[] = {
2761 .type = NL_A_UNSPEC,
2763 .min_len = sizeof(struct tc_service_curve),
2766 .type = NL_A_UNSPEC,
2768 .min_len = sizeof(struct tc_service_curve),
2771 .type = NL_A_UNSPEC,
2773 .min_len = sizeof(struct tc_service_curve),
2776 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
2778 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
2779 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
2780 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
2784 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
2785 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
2786 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
2788 if (rsc->m1 != 0 || rsc->d != 0 ||
2789 fsc->m1 != 0 || fsc->d != 0 ||
2790 usc->m1 != 0 || usc->d != 0) {
2791 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2792 "Non-linear service curves are not supported.");
2796 if (rsc->m2 != fsc->m2) {
2797 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2798 "Real-time service curves are not supported ");
2802 if (rsc->m2 > usc->m2) {
2803 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2804 "Min-rate service curve is greater than "
2805 "the max-rate service curve.");
2809 class->min_rate = fsc->m2;
2810 class->max_rate = usc->m2;
2815 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2816 struct hfsc_class *options,
2817 struct netdev_queue_stats *stats)
2820 unsigned int handle;
2821 struct nlattr *nl_options;
2823 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2829 unsigned int major, minor;
2831 major = tc_get_major(handle);
2832 minor = tc_get_minor(handle);
2833 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
2834 *queue_id = minor - 1;
2841 error = hfsc_parse_tca_options__(nl_options, options);
2848 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
2849 unsigned int parent, struct hfsc_class *options,
2850 struct netdev_queue_stats *stats)
2853 struct ofpbuf *reply;
2855 error = tc_query_class(netdev, handle, parent, &reply);
2860 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
2861 ofpbuf_delete(reply);
2866 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct shash *details,
2867 struct hfsc_class *class)
2870 const char *max_rate_s;
2872 max_rate_s = shash_find_data(details, "max-rate");
2873 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2878 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2879 max_rate = netdev_features_to_bps(current) / 8;
2882 class->min_rate = max_rate;
2883 class->max_rate = max_rate;
2887 hfsc_parse_class_details__(struct netdev *netdev,
2888 const struct shash *details,
2889 struct hfsc_class * class)
2891 const struct hfsc *hfsc;
2892 uint32_t min_rate, max_rate;
2893 const char *min_rate_s, *max_rate_s;
2895 hfsc = hfsc_get__(netdev);
2896 min_rate_s = shash_find_data(details, "min-rate");
2897 max_rate_s = shash_find_data(details, "max-rate");
2903 min_rate = strtoull(min_rate_s, NULL, 10) / 8;
2904 min_rate = MAX(min_rate, 1500);
2905 min_rate = MIN(min_rate, hfsc->max_rate);
2907 max_rate = (max_rate_s
2908 ? strtoull(max_rate_s, NULL, 10) / 8
2910 max_rate = MAX(max_rate, min_rate);
2911 max_rate = MIN(max_rate, hfsc->max_rate);
2913 class->min_rate = min_rate;
2914 class->max_rate = max_rate;
2919 /* Create an HFSC qdisc.
2921 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
2923 hfsc_setup_qdisc__(struct netdev * netdev)
2925 struct tcmsg *tcmsg;
2926 struct ofpbuf request;
2927 struct tc_hfsc_qopt opt;
2929 tc_del_qdisc(netdev);
2931 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2932 NLM_F_EXCL | NLM_F_CREATE, &request);
2938 tcmsg->tcm_handle = tc_make_handle(1, 0);
2939 tcmsg->tcm_parent = TC_H_ROOT;
2941 memset(&opt, 0, sizeof opt);
2944 nl_msg_put_string(&request, TCA_KIND, "hfsc");
2945 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
2947 return tc_transact(&request, NULL);
2950 /* Create an HFSC class.
2952 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
2953 * sc rate <min_rate> ul rate <max_rate>" */
2955 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
2956 unsigned int parent, struct hfsc_class *class)
2960 struct tcmsg *tcmsg;
2961 struct ofpbuf request;
2962 struct tc_service_curve min, max;
2964 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2970 tcmsg->tcm_handle = handle;
2971 tcmsg->tcm_parent = parent;
2975 min.m2 = class->min_rate;
2979 max.m2 = class->max_rate;
2981 nl_msg_put_string(&request, TCA_KIND, "hfsc");
2982 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2983 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
2984 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
2985 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
2986 nl_msg_end_nested(&request, opt_offset);
2988 error = tc_transact(&request, NULL);
2990 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2991 "min-rate %ubps, max-rate %ubps (%s)",
2992 netdev_get_name(netdev),
2993 tc_get_major(handle), tc_get_minor(handle),
2994 tc_get_major(parent), tc_get_minor(parent),
2995 class->min_rate, class->max_rate, strerror(error));
3002 hfsc_tc_install(struct netdev *netdev, const struct shash *details)
3005 struct hfsc_class class;
3007 error = hfsc_setup_qdisc__(netdev);
3013 hfsc_parse_qdisc_details__(netdev, details, &class);
3014 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3015 tc_make_handle(1, 0), &class);
3021 hfsc_install__(netdev, class.max_rate);
3026 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3030 struct nl_dump dump;
3031 struct hfsc_class hc;
3034 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3035 hfsc = hfsc_install__(netdev, hc.max_rate);
3037 if (!start_queue_dump(netdev, &dump)) {
3041 while (nl_dump_next(&dump, &msg)) {
3042 unsigned int queue_id;
3044 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3045 hfsc_update_queue__(netdev, queue_id, &hc);
3049 nl_dump_done(&dump);
3054 hfsc_tc_destroy(struct tc *tc)
3057 struct hfsc_class *hc, *next;
3059 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3061 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3062 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3071 hfsc_qdisc_get(const struct netdev *netdev, struct shash *details)
3073 const struct hfsc *hfsc;
3074 hfsc = hfsc_get__(netdev);
3075 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hfsc->max_rate));
3080 hfsc_qdisc_set(struct netdev *netdev, const struct shash *details)
3083 struct hfsc_class class;
3085 hfsc_parse_qdisc_details__(netdev, details, &class);
3086 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3087 tc_make_handle(1, 0), &class);
3090 hfsc_get__(netdev)->max_rate = class.max_rate;
3097 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3098 const struct tc_queue *queue, struct shash *details)
3100 const struct hfsc_class *hc;
3102 hc = hfsc_class_cast__(queue);
3103 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
3104 if (hc->min_rate != hc->max_rate) {
3105 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
3111 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3112 const struct shash *details)
3115 struct hfsc_class class;
3117 error = hfsc_parse_class_details__(netdev, details, &class);
3122 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3123 tc_make_handle(1, 0xfffe), &class);
3128 hfsc_update_queue__(netdev, queue_id, &class);
3133 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3137 struct hfsc_class *hc;
3139 hc = hfsc_class_cast__(queue);
3140 hfsc = hfsc_get__(netdev);
3142 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3144 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3151 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3152 struct netdev_queue_stats *stats)
3154 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3155 tc_make_handle(1, 0xfffe), NULL, stats);
3159 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3160 const struct ofpbuf *nlmsg,
3161 netdev_dump_queue_stats_cb *cb, void *aux)
3163 struct netdev_queue_stats stats;
3164 unsigned int handle, major, minor;
3167 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3172 major = tc_get_major(handle);
3173 minor = tc_get_minor(handle);
3174 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3175 (*cb)(minor - 1, &stats, aux);
3180 static const struct tc_ops tc_ops_hfsc = {
3181 "hfsc", /* linux_name */
3182 "linux-hfsc", /* ovs_name */
3183 HFSC_N_QUEUES, /* n_queues */
3184 hfsc_tc_install, /* tc_install */
3185 hfsc_tc_load, /* tc_load */
3186 hfsc_tc_destroy, /* tc_destroy */
3187 hfsc_qdisc_get, /* qdisc_get */
3188 hfsc_qdisc_set, /* qdisc_set */
3189 hfsc_class_get, /* class_get */
3190 hfsc_class_set, /* class_set */
3191 hfsc_class_delete, /* class_delete */
3192 hfsc_class_get_stats, /* class_get_stats */
3193 hfsc_class_dump_stats /* class_dump_stats */
3196 /* "linux-default" traffic control class.
3198 * This class represents the default, unnamed Linux qdisc. It corresponds to
3199 * the "" (empty string) QoS type in the OVS database. */
3202 default_install__(struct netdev *netdev)
3204 struct netdev_dev_linux *netdev_dev =
3205 netdev_dev_linux_cast(netdev_get_dev(netdev));
3206 static struct tc *tc;
3209 tc = xmalloc(sizeof *tc);
3210 tc_init(tc, &tc_ops_default);
3212 netdev_dev->tc = tc;
3216 default_tc_install(struct netdev *netdev,
3217 const struct shash *details OVS_UNUSED)
3219 default_install__(netdev);
3224 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3226 default_install__(netdev);
3230 static const struct tc_ops tc_ops_default = {
3231 NULL, /* linux_name */
3236 NULL, /* tc_destroy */
3237 NULL, /* qdisc_get */
3238 NULL, /* qdisc_set */
3239 NULL, /* class_get */
3240 NULL, /* class_set */
3241 NULL, /* class_delete */
3242 NULL, /* class_get_stats */
3243 NULL /* class_dump_stats */
3246 /* "linux-other" traffic control class.
3251 other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3253 struct netdev_dev_linux *netdev_dev =
3254 netdev_dev_linux_cast(netdev_get_dev(netdev));
3255 static struct tc *tc;
3258 tc = xmalloc(sizeof *tc);
3259 tc_init(tc, &tc_ops_other);
3261 netdev_dev->tc = tc;
3265 static const struct tc_ops tc_ops_other = {
3266 NULL, /* linux_name */
3267 "linux-other", /* ovs_name */
3269 NULL, /* tc_install */
3271 NULL, /* tc_destroy */
3272 NULL, /* qdisc_get */
3273 NULL, /* qdisc_set */
3274 NULL, /* class_get */
3275 NULL, /* class_set */
3276 NULL, /* class_delete */
3277 NULL, /* class_get_stats */
3278 NULL /* class_dump_stats */
/* Traffic control. */

/* Number of kernel "tc" ticks per second. */
static double ticks_per_s;

/* Number of kernel "jiffies" per second.  This is used for the purpose of
 * computing buffer sizes.  Generally kernel qdiscs need to be able to buffer
 * one jiffy's worth of data.
 *
 * There are two possibilities here:
 *
 *    - 'buffer_hz' is the kernel's real timer tick rate, a small number in
 *      the approximate range of 100 to 1024.  That means that we really need
 *      to make sure that the qdisc can buffer that much data.
 *
 *    - 'buffer_hz' is an absurdly large number.  That means that the kernel
 *      has finely granular timers and there's no need to fudge additional
 *      room for buffers.  (There's no extra effort needed to implement that:
 *      the large 'buffer_hz' is used as a divisor, so practically any number
 *      will come out as 0 in the division.  Small integer results in the
 *      case of really high dividends won't have any real effect anyhow.)
 *
 * A value of 0 means that the kernel parameters have not been read yet. */
static unsigned int buffer_hz;
/* Returns tc handle 'major':'minor', that is, 'major' in the upper 16 bits
 * and 'minor' in the lower 16 bits. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    unsigned int major_bits = major << 16;

    return TC_H_MAKE(major_bits, minor);
}
/* Returns the major number from 'handle', i.e. its upper 16 bits. */
static unsigned int
tc_get_major(unsigned int handle)
{
    unsigned int major_bits = TC_H_MAJ(handle);

    return major_bits >> 16;
}
/* Returns the minor number from 'handle', i.e. its lower 16 bits. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    unsigned int minor = TC_H_MIN(handle);

    return minor;
}
3326 static struct tcmsg *
3327 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3328 struct ofpbuf *request)
3330 struct tcmsg *tcmsg;
3334 error = get_ifindex(netdev, &ifindex);
3339 ofpbuf_init(request, 512);
3340 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3341 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3342 tcmsg->tcm_family = AF_UNSPEC;
3343 tcmsg->tcm_ifindex = ifindex;
3344 /* Caller should fill in tcmsg->tcm_handle. */
3345 /* Caller should fill in tcmsg->tcm_parent. */
3351 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3353 int error = nl_sock_transact(rtnl_sock, request, replyp);
3354 ofpbuf_uninit(request);
3361 /* The values in psched are not individually very meaningful, but they are
3362 * important. The tables below show some values seen in the wild.
3366 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3367 * (Before that, there are hints that it was 1000000000.)
3369 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3373 * -----------------------------------
3374 * [1] 000c8000 000f4240 000f4240 00000064
3375 * [2] 000003e8 00000400 000f4240 3b9aca00
3376 * [3] 000003e8 00000400 000f4240 3b9aca00
3377 * [4] 000003e8 00000400 000f4240 00000064
3378 * [5] 000003e8 00000040 000f4240 3b9aca00
3379 * [6] 000003e8 00000040 000f4240 000000f9
3381 * a b c d ticks_per_s buffer_hz
3382 * ------- --------- ---------- ------------- ----------- -------------
3383 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3384 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3385 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3386 * [4] 1,000 1,024 1,000,000 100 976,562 100
3387 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3388 * [6] 1,000 64 1,000,000 249 15,625,000 249
3390 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3391 * [2] 2.6.26-1-686-bigmem from Debian lenny
3392 * [3] 2.6.26-2-sparc64 from Debian lenny
3393 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3394 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3395 * [6] 2.6.34 from kernel.org on KVM
3397 static const char fn[] = "/proc/net/psched";
3398 unsigned int a, b, c, d;
3404 stream = fopen(fn, "r");
3406 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3410 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3411 VLOG_WARN("%s: read failed", fn);
3415 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3419 VLOG_WARN("%s: invalid scheduler parameters", fn);
3423 ticks_per_s = (double) a * c / b;
3427 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3430 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3433 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3434 * rate of 'rate' bytes per second. */
3436 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3441 return (rate * ticks) / ticks_per_s;
3444 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3445 * rate of 'rate' bytes per second. */
3447 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3452 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3455 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3456 * a transmission rate of 'rate' bytes per second. */
3458 tc_buffer_per_jiffy(unsigned int rate)
3463 return rate / buffer_hz;
3466 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3467 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3468 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3469 * stores NULL into it if it is absent.
3471 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3474 * Returns 0 if successful, otherwise a positive errno value. */
3476 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3477 struct nlattr **options)
3479 static const struct nl_policy tca_policy[] = {
3480 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3481 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3483 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3485 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3486 tca_policy, ta, ARRAY_SIZE(ta))) {
3487 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3492 *kind = nl_attr_get_string(ta[TCA_KIND]);
3496 *options = ta[TCA_OPTIONS];
3511 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
3512 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
3513 * into '*options', and its queue statistics into '*stats'. Any of the output
3514 * arguments may be null.
3516 * Returns 0 if successful, otherwise a positive errno value. */
3518 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3519 struct nlattr **options, struct netdev_queue_stats *stats)
3521 static const struct nl_policy tca_policy[] = {
3522 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3523 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3525 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3527 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3528 tca_policy, ta, ARRAY_SIZE(ta))) {
3529 VLOG_WARN_RL(&rl, "failed to parse class message");
3534 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3535 *handlep = tc->tcm_handle;
3539 *options = ta[TCA_OPTIONS];
3543 const struct gnet_stats_queue *gsq;
3544 struct gnet_stats_basic gsb;
3546 static const struct nl_policy stats_policy[] = {
3547 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3548 .min_len = sizeof gsb },
3549 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3550 .min_len = sizeof *gsq },
3552 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3554 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3555 sa, ARRAY_SIZE(sa))) {
3556 VLOG_WARN_RL(&rl, "failed to parse class stats");
3560 /* Alignment issues screw up the length of struct gnet_stats_basic on
3561 * some arch/bitsize combinations. Newer versions of Linux have a
3562 * struct gnet_stats_basic_packed, but we can't depend on that. The
3563 * easiest thing to do is just to make a copy. */
3564 memset(&gsb, 0, sizeof gsb);
3565 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3566 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3567 stats->tx_bytes = gsb.bytes;
3568 stats->tx_packets = gsb.packets;
3570 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3571 stats->tx_errors = gsq->drops;
3581 memset(stats, 0, sizeof *stats);
3586 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
3589 tc_query_class(const struct netdev *netdev,
3590 unsigned int handle, unsigned int parent,
3591 struct ofpbuf **replyp)
3593 struct ofpbuf request;
3594 struct tcmsg *tcmsg;
3597 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3601 tcmsg->tcm_handle = handle;
3602 tcmsg->tcm_parent = parent;
3604 error = tc_transact(&request, replyp);
3606 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3607 netdev_get_name(netdev),
3608 tc_get_major(handle), tc_get_minor(handle),
3609 tc_get_major(parent), tc_get_minor(parent),
3615 /* Equivalent to "tc class del dev <name> handle <handle>". */
3617 tc_delete_class(const struct netdev *netdev, unsigned int handle)
3619 struct ofpbuf request;
3620 struct tcmsg *tcmsg;
3623 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
3627 tcmsg->tcm_handle = handle;
3628 tcmsg->tcm_parent = 0;
3630 error = tc_transact(&request, NULL);
3632 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3633 netdev_get_name(netdev),
3634 tc_get_major(handle), tc_get_minor(handle),
3640 /* Equivalent to "tc qdisc del dev <name> root". */
3642 tc_del_qdisc(struct netdev *netdev)
3644 struct netdev_dev_linux *netdev_dev =
3645 netdev_dev_linux_cast(netdev_get_dev(netdev));
3646 struct ofpbuf request;
3647 struct tcmsg *tcmsg;
3650 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
3654 tcmsg->tcm_handle = tc_make_handle(1, 0);
3655 tcmsg->tcm_parent = TC_H_ROOT;
3657 error = tc_transact(&request, NULL);
3658 if (error == EINVAL) {
3659 /* EINVAL probably means that the default qdisc was in use, in which
3660 * case we've accomplished our purpose. */
3663 if (!error && netdev_dev->tc) {
3664 if (netdev_dev->tc->ops->tc_destroy) {
3665 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
3667 netdev_dev->tc = NULL;
3672 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
3673 * kernel to determine what they are. Returns 0 if successful, otherwise a
3674 * positive errno value. */
3676 tc_query_qdisc(const struct netdev *netdev)
3678 struct netdev_dev_linux *netdev_dev =
3679 netdev_dev_linux_cast(netdev_get_dev(netdev));
3680 struct ofpbuf request, *qdisc;
3681 const struct tc_ops *ops;
3682 struct tcmsg *tcmsg;
3686 if (netdev_dev->tc) {
3690 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
3691 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
3692 * 2.6.35 without that fix backported to it.
3694 * To avoid the OOPS, we must not make a request that would attempt to dump
3695 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
3696 * few others. There are a few ways that I can see to do this, but most of
3697 * them seem to be racy (and if you lose the race the kernel OOPSes). The
3698 * technique chosen here is to assume that any non-default qdisc that we
3699 * create will have a class with handle 1:0. The built-in qdiscs only have
3700 * a class with handle 0:0.
3702 * We could check for Linux 2.6.35+ and use a more straightforward method
3704 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
3708 tcmsg->tcm_handle = tc_make_handle(1, 0);
3709 tcmsg->tcm_parent = 0;
3711 /* Figure out what tc class to instantiate. */
3712 error = tc_transact(&request, &qdisc);
3716 error = tc_parse_qdisc(qdisc, &kind, NULL);
3718 ops = &tc_ops_other;
3720 ops = tc_lookup_linux_name(kind);
3722 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
3723 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
3725 ops = &tc_ops_other;
3728 } else if (error == ENOENT) {
3729 /* Either it's a built-in qdisc, or it's a qdisc set up by some
3730 * other entity that doesn't have a handle 1:0. We will assume
3731 * that it's the system default qdisc. */
3732 ops = &tc_ops_default;
3735 /* Who knows? Maybe the device got deleted. */
3736 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
3737 netdev_get_name(netdev), strerror(error));
3738 ops = &tc_ops_other;
3741 /* Instantiate it. */
3742 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
3743 assert((load_error == 0) == (netdev_dev->tc != NULL));
3744 ofpbuf_delete(qdisc);
3746 return error ? error : load_error;
3749 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
3750 approximate the time to transmit packets of various lengths. For an MTU of
3751 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
3752 represents two possible packet lengths; for a MTU of 513 through 1024, four
3753 possible lengths; and so on.
3755 Returns, for the specified 'mtu', the number of bits that packet lengths
3756 need to be shifted right to fit within such a 256-entry table. */
3758 tc_calc_cell_log(unsigned int mtu)
3763 mtu = ETH_PAYLOAD_MAX;
3765 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
3767 for (cell_log = 0; mtu >= 256; cell_log++) {
3774 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
3777 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
3779 memset(rate, 0, sizeof *rate);
3780 rate->cell_log = tc_calc_cell_log(mtu);
3781 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
3782 /* rate->cell_align = 0; */ /* distro headers. */
3783 rate->mpu = ETH_TOTAL_MIN;
3787 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
3788 * attribute of the specified "type".
3790 * See tc_calc_cell_log() above for a description of "rtab"s. */
3792 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
3797 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
3798 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
3799 unsigned packet_size = (i + 1) << rate->cell_log;
3800 if (packet_size < rate->mpu) {
3801 packet_size = rate->mpu;
3803 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * 0 is fine.)
 *
 * The burst is never smaller than one jiffy's worth of data plus 'mtu'. */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int jiffy_bytes = tc_buffer_per_jiffy(Bps);
    unsigned int min_burst = jiffy_bytes + mtu;

    return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
}
3819 /* Utility functions. */
3822 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
3824 /* Policy for RTNLGRP_LINK messages.
3826 * There are *many* more fields in these messages, but currently we only
3827 * care about these fields. */
3828 static const struct nl_policy rtnlgrp_link_policy[] = {
3829 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
3830 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
3831 .min_len = sizeof(struct rtnl_link_stats) },
/* Asks the kernel, through an RTM_GETLINK request on 'rtnl_sock', for the
 * link whose index is 'ifindex' and copies the counters found in the reply's
 * IFLA_STATS attribute into 'stats'. */
3834 struct ofpbuf request;
3835 struct ofpbuf *reply;
3836 struct ifinfomsg *ifi;
3837 const struct rtnl_link_stats *rtnl_stats;
3838 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
/* Build the request: a netlink header followed by a zeroed ifinfomsg that
 * selects the device by interface index. */
3841 ofpbuf_init(&request, 0);
3842 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
3843 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
3844 ifi->ifi_family = PF_UNSPEC;
3845 ifi->ifi_index = ifindex;
/* Hand the request to nl_sock_transact(); the request buffer is released
 * right afterwards whatever the outcome. */
3846 error = nl_sock_transact(rtnl_sock, &request, &reply);
3847 ofpbuf_uninit(&request);
/* Parse the attributes located after the netlink header and the ifinfomsg
 * against rtnlgrp_link_policy.  The reply is freed if parsing fails. */
3852 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
3853 rtnlgrp_link_policy,
3854 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
3855 ofpbuf_delete(reply);
/* IFLA_STATS is optional in the policy, so a successfully parsed reply may
 * still lack it; that case is logged and the reply is freed. */
3859 if (!attrs[IFLA_STATS]) {
3860 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
3861 ofpbuf_delete(reply);
/* Copy each counter, field by field, from the rtnl_link_stats payload into
 * the caller's netdev_stats. */
3865 rtnl_stats = nl_attr_get(attrs[IFLA_STATS]);
3866 stats->rx_packets = rtnl_stats->rx_packets;
3867 stats->tx_packets = rtnl_stats->tx_packets;
3868 stats->rx_bytes = rtnl_stats->rx_bytes;
3869 stats->tx_bytes = rtnl_stats->tx_bytes;
3870 stats->rx_errors = rtnl_stats->rx_errors;
3871 stats->tx_errors = rtnl_stats->tx_errors;
3872 stats->rx_dropped = rtnl_stats->rx_dropped;
3873 stats->tx_dropped = rtnl_stats->tx_dropped;
3874 stats->multicast = rtnl_stats->multicast;
3875 stats->collisions = rtnl_stats->collisions;
3876 stats->rx_length_errors = rtnl_stats->rx_length_errors;
3877 stats->rx_over_errors = rtnl_stats->rx_over_errors;
3878 stats->rx_crc_errors = rtnl_stats->rx_crc_errors;
3879 stats->rx_frame_errors = rtnl_stats->rx_frame_errors;
3880 stats->rx_fifo_errors = rtnl_stats->rx_fifo_errors;
3881 stats->rx_missed_errors = rtnl_stats->rx_missed_errors;
3882 stats->tx_aborted_errors = rtnl_stats->tx_aborted_errors;
3883 stats->tx_carrier_errors = rtnl_stats->tx_carrier_errors;
3884 stats->tx_fifo_errors = rtnl_stats->tx_fifo_errors;
3885 stats->tx_heartbeat_errors = rtnl_stats->tx_heartbeat_errors;
3886 stats->tx_window_errors = rtnl_stats->tx_window_errors;
/* 'rtnl_stats' points into the reply, so the reply is freed only after the
 * copy above is finished. */
3888 ofpbuf_delete(reply);
/* Fills 'stats' for the device called 'netdev_name' from the per-device
 * counters listed in /proc/net/dev. */
3894 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
3896 static const char fn[] = "/proc/net/dev";
3901 stream = fopen(fn, "r");
3903 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
/* Examine the file one line at a time. */
3908 while (fgets(line, sizeof line, stream)) {
/* X64 is the scanf conversion for one uint64_t counter.  The "%*u" fields
 * are matched but their values are discarded. */
3911 #define X64 "%"SCNu64
3914 X64 X64 X64 X64 X64 X64 X64 "%*u"
3915 X64 X64 X64 X64 X64 X64 X64 "%*u",
3921 &stats->rx_fifo_errors,
3922 &stats->rx_frame_errors,
3928 &stats->tx_fifo_errors,
3930 &stats->tx_carrier_errors) != 15) {
/* Fewer than 15 successful conversions: report the line as unparseable. */
3931 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
3932 } else if (!strcmp(devname, netdev_name)) {
/* The line describes the requested device.  The counters below are set to
 * UINT64_MAX rather than parsed.
 * NOTE(review): presumably because /proc/net/dev does not report them
 * individually -- confirm. */
3933 stats->rx_length_errors = UINT64_MAX;
3934 stats->rx_over_errors = UINT64_MAX;
3935 stats->rx_crc_errors = UINT64_MAX;
3936 stats->rx_missed_errors = UINT64_MAX;
3937 stats->tx_aborted_errors = UINT64_MAX;
3938 stats->tx_heartbeat_errors = UINT64_MAX;
3939 stats->tx_window_errors = UINT64_MAX;
3945 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
/* Queries the interface flags of 'netdev' with the SIOCGIFFLAGS ioctl, issued
 * through netdev_linux_do_ioctl(), and stores ifr.ifr_flags in '*flags'. */
3951 get_flags(const struct netdev *netdev, int *flags)
3956 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCGIFFLAGS,
3958 *flags = ifr.ifr_flags;
/* Sets the interface flags of 'netdev' to 'flags' with the SIOCSIFFLAGS
 * ioctl.  The return value is whatever netdev_linux_do_ioctl() returns. */
3963 set_flags(struct netdev *netdev, int flags)
3967 ifr.ifr_flags = flags;
3968 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
/* Looks up the interface index of the device called 'netdev_name' with the
 * SIOCGIFINDEX ioctl on 'af_inet_sock'.  On success the value returned is
 * ifr.ifr_ifindex; an ioctl failure is reported with a rate-limited warning.
 *
 * NOTE(review): strncpy() does not NUL-terminate ifr.ifr_name when
 * 'netdev_name' is at least as long as the field -- confirm that callers
 * never pass such a name. */
3973 do_get_ifindex(const char *netdev_name)
3977 strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
3978 COVERAGE_INC(netdev_get_ifindex);
3979 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
3980 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
3981 netdev_name, strerror(errno));
3984 return ifr.ifr_ifindex;
/* Stores the interface index of 'netdev_' in '*ifindexp'.
 *
 * The index is cached in the underlying netdev_dev_linux: do_get_ifindex() is
 * called only while VALID_IFINDEX is not yet set in 'cache_valid', and its
 * result is then remembered for later calls. */
3988 get_ifindex(const struct netdev *netdev_, int *ifindexp)
3990 struct netdev_dev_linux *netdev_dev =
3991 netdev_dev_linux_cast(netdev_get_dev(netdev_));
3993 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
3994 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
/* Record the freshly obtained index and mark it as valid. */
3998 netdev_dev->cache_valid |= VALID_IFINDEX;
3999 netdev_dev->ifindex = ifindex;
4001 *ifindexp = netdev_dev->ifindex;
/* Retrieves the hardware address of the device called 'netdev_name' with the
 * SIOCGIFHWADDR ioctl on 'af_inet_sock' and copies its first ETH_ADDR_LEN
 * bytes into 'ea'.  An ioctl failure is logged at error level. */
4006 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4011 memset(&ifr, 0, sizeof ifr);
4012 strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4013 COVERAGE_INC(netdev_get_hwaddr);
4014 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4015 VLOG_ERR("ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4016 netdev_name, strerror(errno));
/* Any address family other than AF_UNSPEC or ARPHRD_ETHER is reported as
 * unknown. */
4019 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4020 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4021 VLOG_WARN("%s device has unknown hardware address family %d",
4022 netdev_name, hwaddr_family);
4024 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the hardware address of the device called 'netdev_name' to the
 * ETH_ADDR_LEN bytes in 'mac', tagged with address family 'hwaddr_family',
 * using the SIOCSIFHWADDR ioctl on 'af_inet_sock'.  An ioctl failure is
 * logged at error level. */
4029 set_etheraddr(const char *netdev_name, int hwaddr_family,
4030 const uint8_t mac[ETH_ADDR_LEN])
/* Start from a zeroed ifreq, then fill in the device name, the address
 * family and the address bytes. */
4034 memset(&ifr, 0, sizeof ifr);
4035 strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4036 ifr.ifr_hwaddr.sa_family = hwaddr_family;
4037 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4038 COVERAGE_INC(netdev_set_hwaddr);
4039 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4040 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4041 netdev_name, strerror(errno));
/* Issues an ethtool request for the device called 'name' through the
 * SIOCETHTOOL ioctl on 'af_inet_sock', with 'ecmd' attached as the ifreq
 * data.  'cmd_name' identifies the command in the warning logged on
 * failure. */
4048 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4049 int cmd, const char *cmd_name)
4053 memset(&ifr, 0, sizeof ifr);
4054 strncpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4055 ifr.ifr_data = (caddr_t) ecmd;
4058 COVERAGE_INC(netdev_ethtool);
4059 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
/* Only errors other than EOPNOTSUPP produce a (rate-limited) warning. */
4062 if (errno != EOPNOTSUPP) {
4063 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4064 "failed: %s", cmd_name, name, strerror(errno));
4066 /* The device doesn't support this operation. That's pretty
4067 * common, so there's no point in logging anything. */
/* Copies 'name' into ifr->ifr_name and issues ioctl 'cmd' with 'ifr' on
 * 'af_inet_sock'.  When the ioctl returns -1 a rate-limited debug message is
 * logged that mentions 'name' and 'cmd_name'.  The remaining fields of 'ifr'
 * are passed to the ioctl as the caller left them. */
4074 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4075 const char *cmd_name)
4077 strncpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4078 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4079 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
/* Runs ioctl 'cmd' (named 'cmd_name' for logging) on 'netdev' through
 * netdev_linux_do_ioctl(), with the ifreq address family preset to AF_INET,
 * and stores the IPv4 address found in ifr.ifr_addr in '*ip'. */
4087 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4088 int cmd, const char *cmd_name)
4093 ifr.ifr_addr.sa_family = AF_INET;
4094 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
/* Reinterpret the generic sockaddr as a sockaddr_in to reach the address. */
4096 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4097 *ip = sin->sin_addr;