2 * Copyright (c) 2009, 2010, 2011 Nicira Networks.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
24 #include <arpa/inet.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_sched.h>
33 #include <linux/rtnetlink.h>
34 #include <linux/sockios.h>
35 #include <linux/version.h>
36 #include <sys/types.h>
37 #include <sys/ioctl.h>
38 #include <sys/socket.h>
39 #include <netpacket/packet.h>
40 #include <net/ethernet.h>
42 #include <linux/if_tunnel.h>
43 #include <net/if_arp.h>
44 #include <net/if_packet.h>
45 #include <net/route.h>
46 #include <netinet/in.h>
53 #include "dpif-linux.h"
54 #include "dynamic-string.h"
55 #include "fatal-signal.h"
58 #include "netdev-provider.h"
59 #include "netdev-vport.h"
61 #include "netlink-socket.h"
63 #include "openflow/openflow.h"
65 #include "poll-loop.h"
66 #include "rtnetlink.h"
67 #include "rtnetlink-link.h"
68 #include "socket-util.h"
73 VLOG_DEFINE_THIS_MODULE(netdev_linux);
75 COVERAGE_DEFINE(netdev_get_vlan_vid);
76 COVERAGE_DEFINE(netdev_set_policing);
77 COVERAGE_DEFINE(netdev_arp_lookup);
78 COVERAGE_DEFINE(netdev_get_ifindex);
79 COVERAGE_DEFINE(netdev_get_hwaddr);
80 COVERAGE_DEFINE(netdev_set_hwaddr);
81 COVERAGE_DEFINE(netdev_ethtool);
83 /* These were introduced in Linux 2.6.14, so they might be missing if we have
85 #ifndef ADVERTISED_Pause
86 #define ADVERTISED_Pause (1 << 13)
88 #ifndef ADVERTISED_Asym_Pause
89 #define ADVERTISED_Asym_Pause (1 << 14)
92 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
95 #define TC_RTAB_SIZE 1024
98 static struct rtnetlink_notifier netdev_linux_cache_notifier;
99 static int cache_notifier_refcount;
102 VALID_IFINDEX = 1 << 0,
103 VALID_ETHERADDR = 1 << 1,
107 VALID_CARRIER = 1 << 5,
108 VALID_IS_PSEUDO = 1 << 6, /* Represents is_internal and is_tap. */
109 VALID_POLICING = 1 << 7,
110 VALID_HAVE_VPORT_STATS = 1 << 8
118 /* Traffic control. */
120 /* An instance of a traffic control class. Always associated with a particular
123 * Each TC implementation subclasses this with whatever additional data it
126 const struct tc_ops *ops;
127 struct hmap queues; /* Contains "struct tc_queue"s.
128 * Read by generic TC layer.
129 * Written only by TC implementation. */
132 /* One traffic control queue.
134 * Each TC implementation subclasses this with whatever additional data it
137 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
138 unsigned int queue_id; /* OpenFlow queue ID. */
141 /* A particular kind of traffic control. Each implementation generally maps to
142 * one particular Linux qdisc class.
144 * The functions below return 0 if successful or a positive errno value on
145 * failure, except where otherwise noted. All of them must be provided, except
146 * where otherwise noted. */
148 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
149 * This is null for tc_ops_default and tc_ops_other, for which there are no
150 * appropriate values. */
151 const char *linux_name;
153 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
154 const char *ovs_name;
156 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
157 * queues. The queues are numbered 0 through n_queues - 1. */
158 unsigned int n_queues;
160 /* Called to install this TC class on 'netdev'. The implementation should
161 * make the Netlink calls required to set up 'netdev' with the right qdisc
162 * and configure it according to 'details'. The implementation may assume
163 * that the current qdisc is the default; that is, there is no need for it
164 * to delete the current qdisc before installing itself.
166 * The contents of 'details' should be documented as valid for 'ovs_name'
167 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
168 * (which is built as ovs-vswitchd.conf.db(8)).
170 * This function must return 0 if and only if it sets 'netdev->tc' to an
171 * initialized 'struct tc'.
173 * (This function is null for tc_ops_other, which cannot be installed. For
174 * other TC classes it should always be nonnull.) */
175 int (*tc_install)(struct netdev *netdev, const struct shash *details);
177 /* Called when the netdev code determines (through a Netlink query) that
178 * this TC class's qdisc is installed on 'netdev', but we didn't install
179 * it ourselves and so don't know any of the details.
181 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
182 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
183 * implementation should parse the other attributes of 'nlmsg' as
184 * necessary to determine its configuration. If necessary it should also
185 * use Netlink queries to determine the configuration of queues on
188 * This function must return 0 if and only if it sets 'netdev->tc' to an
189 * initialized 'struct tc'. */
190 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
192 /* Destroys the data structures allocated by the implementation as part of
193 * 'tc'. (This includes destroying 'tc->queues' by calling
196 * The implementation should not need to perform any Netlink calls. If
197 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
198 * (But it may not be desirable.)
200 * This function may be null if 'tc' is trivial. */
201 void (*tc_destroy)(struct tc *tc);
203 /* Retrieves details of 'netdev->tc' configuration into 'details'.
205 * The implementation should not need to perform any Netlink calls, because
206 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
207 * cached the configuration.
209 * The contents of 'details' should be documented as valid for 'ovs_name'
210 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
211 * (which is built as ovs-vswitchd.conf.db(8)).
213 * This function may be null if 'tc' is not configurable.
215 int (*qdisc_get)(const struct netdev *netdev, struct shash *details);
217 /* Reconfigures 'netdev->tc' according to 'details', performing any
218 * required Netlink calls to complete the reconfiguration.
220 * The contents of 'details' should be documented as valid for 'ovs_name'
221 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
222 * (which is built as ovs-vswitchd.conf.db(8)).
224 * This function may be null if 'tc' is not configurable.
226 int (*qdisc_set)(struct netdev *, const struct shash *details);
228 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
229 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
231 * The contents of 'details' should be documented as valid for 'ovs_name'
232 * in the "other_config" column in the "Queue" table in
233 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
235 * The implementation should not need to perform any Netlink calls, because
236 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
237 * cached the queue configuration.
239 * This function may be null if 'tc' does not have queues ('n_queues' is
241 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
242 struct shash *details);
244 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
245 * 'details', performing any required Netlink calls to complete the
246 * reconfiguration. The caller ensures that 'queue_id' is less than
249 * The contents of 'details' should be documented as valid for 'ovs_name'
250 * in the "other_config" column in the "Queue" table in
251 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
253 * This function may be null if 'tc' does not have queues or its queues are
254 * not configurable. */
255 int (*class_set)(struct netdev *, unsigned int queue_id,
256 const struct shash *details);
258 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
259 * tc_queue's within 'netdev->tc->queues'.
261 * This function may be null if 'tc' does not have queues or its queues
262 * cannot be deleted. */
263 int (*class_delete)(struct netdev *, struct tc_queue *queue);
265 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
266 * 'struct tc_queue's within 'netdev->tc->queues'.
268 * On success, initializes '*stats'.
270 * This function may be null if 'tc' does not have queues or if it cannot
271 * report queue statistics. */
272 int (*class_get_stats)(const struct netdev *netdev,
273 const struct tc_queue *queue,
274 struct netdev_queue_stats *stats);
276 /* Extracts queue stats from 'nlmsg', which is a response to a
277 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
279 * This function may be null if 'tc' does not have queues or if it cannot
280 * report queue statistics. */
281 int (*class_dump_stats)(const struct netdev *netdev,
282 const struct ofpbuf *nlmsg,
283 netdev_dump_queue_stats_cb *cb, void *aux);
/* Initializes the generic part of 'tc' by setting up its 'queues' hmap.
 * NOTE(review): 'ops' is presumably recorded in 'tc' on a line that is not
 * visible in this excerpt -- confirm. */
287 tc_init(struct tc *tc, const struct tc_ops *ops)
290 hmap_init(&tc->queues);
/* Releases the generic part of 'tc' by destroying its 'queues' hmap. */
294 tc_destroy(struct tc *tc)
296 hmap_destroy(&tc->queues);
299 static const struct tc_ops tc_ops_htb;
300 static const struct tc_ops tc_ops_hfsc;
301 static const struct tc_ops tc_ops_default;
302 static const struct tc_ops tc_ops_other;
304 static const struct tc_ops *tcs[] = {
305 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
306 &tc_ops_hfsc, /* Hierarchical fair service curve. */
307 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
308 &tc_ops_other, /* Some other qdisc. */
312 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
313 static unsigned int tc_get_major(unsigned int handle);
314 static unsigned int tc_get_minor(unsigned int handle);
316 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
317 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
318 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
320 static struct tcmsg *tc_make_request(const struct netdev *, int type,
321 unsigned int flags, struct ofpbuf *);
322 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
324 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
325 struct nlattr **options);
326 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
327 struct nlattr **options,
328 struct netdev_queue_stats *);
329 static int tc_query_class(const struct netdev *,
330 unsigned int handle, unsigned int parent,
331 struct ofpbuf **replyp);
332 static int tc_delete_class(const struct netdev *, unsigned int handle);
334 static int tc_del_qdisc(struct netdev *netdev);
335 static int tc_query_qdisc(const struct netdev *netdev);
337 static int tc_calc_cell_log(unsigned int mtu);
338 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
339 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
340 const struct tc_ratespec *rate);
341 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
343 struct netdev_dev_linux {
344 struct netdev_dev netdev_dev;
346 struct shash_node *shash_node;
347 unsigned int cache_valid;
349 /* The following are figured out "on demand" only. They are only valid
350 * when the corresponding VALID_* bit in 'cache_valid' is set. */
352 uint8_t etheraddr[ETH_ADDR_LEN];
353 struct in_addr address, netmask;
357 bool is_internal; /* Is this an openvswitch internal device? */
358 bool is_tap; /* Is this a tuntap device? */
359 uint32_t kbits_rate; /* Policing data. */
360 uint32_t kbits_burst;
361 bool have_vport_stats;
365 struct tap_state tap;
369 struct netdev_linux {
370 struct netdev netdev;
374 /* Sockets used for ioctl operations. */
375 static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
376 static int af_packet_sock = -1; /* AF_PACKET, SOCK_RAW. */
378 /* A Netlink routing socket that is not subscribed to any multicast groups. */
379 static struct nl_sock *rtnl_sock;
381 struct netdev_linux_notifier {
382 struct netdev_notifier notifier;
386 static struct shash netdev_linux_notifiers =
387 SHASH_INITIALIZER(&netdev_linux_notifiers);
388 static struct rtnetlink_notifier netdev_linux_poll_notifier;
390 /* This is set pretty low because we probably won't learn anything from the
391 * additional log messages. */
392 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
394 static int netdev_linux_init(void);
396 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
397 int cmd, const char *cmd_name);
398 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
399 const char *cmd_name);
400 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
401 int cmd, const char *cmd_name);
402 static int get_flags(const struct netdev *, int *flagsp);
403 static int set_flags(struct netdev *, int flags);
404 static int do_get_ifindex(const char *netdev_name);
405 static int get_ifindex(const struct netdev *, int *ifindexp);
406 static int do_set_addr(struct netdev *netdev,
407 int ioctl_nr, const char *ioctl_name,
408 struct in_addr addr);
409 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
410 static int set_etheraddr(const char *netdev_name, int hwaddr_family,
411 const uint8_t[ETH_ADDR_LEN]);
412 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
413 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
/* Tests whether 'netdev_class' uses netdev_linux_init() as its 'init'
 * callback.  The cast helpers below assert on this before downcasting. */
416 is_netdev_linux_class(const struct netdev_class *netdev_class)
418 return netdev_class->init == netdev_linux_init;
/* Converts the generic 'netdev_dev' into the 'struct netdev_dev_linux' that
 * embeds it.  Asserts that the device's class passes
 * is_netdev_linux_class(). */
421 static struct netdev_dev_linux *
422 netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
424 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
425 assert(is_netdev_linux_class(netdev_class));
427 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
/* Converts the generic 'netdev' into the 'struct netdev_linux' that embeds
 * it.  Asserts that the class of the underlying netdev_dev passes
 * is_netdev_linux_class(). */
430 static struct netdev_linux *
431 netdev_linux_cast(const struct netdev *netdev)
433 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
434 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
435 assert(is_netdev_linux_class(netdev_class));
437 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
/* Sets up the file-scope sockets used by this file: 'af_inet_sock' (AF_INET,
 * SOCK_DGRAM), 'af_packet_sock' (AF_PACKET, SOCK_RAW, made non-blocking) and
 * the rtnetlink socket 'rtnl_sock'.  Each failure is logged with its error
 * string.
 *
 * 'status' is static, so its value persists across calls.
 * NOTE(review): presumably this makes the setup run only once; the guarding
 * condition is not visible in this excerpt -- confirm. */
441 netdev_linux_init(void)
443 static int status = -1;
445 /* Create AF_INET socket. */
446 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
447 status = af_inet_sock >= 0 ? 0 : errno;
449 VLOG_ERR("failed to create inet socket: %s", strerror(status));
451 /* Create AF_PACKET socket. */
452 af_packet_sock = socket(AF_PACKET, SOCK_RAW, 0);
453 status = af_packet_sock >= 0 ? 0 : errno;
455 set_nonblocking(af_packet_sock);
457 VLOG_ERR("failed to create packet socket: %s",
462 /* Create rtnetlink socket. */
464 status = nl_sock_create(NETLINK_ROUTE, &rtnl_sock);
466 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
/* Periodic work for this netdev provider: delegates to
 * rtnetlink_link_notifier_run(). */
475 netdev_linux_run(void)
477 rtnetlink_link_notifier_run();
/* Wait-side counterpart of netdev_linux_run(): delegates to
 * rtnetlink_link_notifier_wait(). */
481 netdev_linux_wait(void)
483 rtnetlink_link_notifier_wait();
/* Link-change callback that netdev_linux_create() registers through
 * rtnetlink_link_notifier_register().  It invalidates cached device state by
 * zeroing 'cache_valid':
 *
 *   - for the device named 'change->ifname', when that device's class passes
 *     is_netdev_linux_class();
 *
 *   - on a second path, for each device that netdev_dev_get_devices() returns
 *     for netdev_linux_class.
 *
 * NOTE(review): the condition that selects between the two paths is not
 * visible in this excerpt; presumably the second one handles a missing
 * 'change' -- confirm. */
487 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
488 void *aux OVS_UNUSED)
490 struct netdev_dev_linux *dev;
492 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
494 const struct netdev_class *netdev_class =
495 netdev_dev_get_class(base_dev);
497 if (is_netdev_linux_class(netdev_class)) {
498 dev = netdev_dev_linux_cast(base_dev);
499 dev->cache_valid = 0;
503 struct shash device_shash;
504 struct shash_node *node;
506 shash_init(&device_shash);
507 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
508 SHASH_FOR_EACH (node, &device_shash) {
510 dev->cache_valid = 0;
512 shash_destroy(&device_shash);
516 /* Creates system and internal devices. */
/* Warns if 'args' is non-empty.  When no device currently holds a reference
 * ('cache_notifier_refcount' is zero), registers netdev_linux_cache_cb() as
 * the link-change notifier; every created device then takes one reference.
 * The new zero-filled device is initialized with netdev_dev_init() and
 * returned through '*netdev_devp'. */
518 netdev_linux_create(const struct netdev_class *class,
519 const char *name, const struct shash *args,
520 struct netdev_dev **netdev_devp)
522 struct netdev_dev_linux *netdev_dev;
525 if (!shash_is_empty(args)) {
526 VLOG_WARN("%s: arguments for %s devices should be empty",
530 if (!cache_notifier_refcount) {
531 error = rtnetlink_link_notifier_register(&netdev_linux_cache_notifier,
532 netdev_linux_cache_cb, NULL);
537 cache_notifier_refcount++;
539 netdev_dev = xzalloc(sizeof *netdev_dev);
540 netdev_dev_init(&netdev_dev->netdev_dev, name, args, class);
542 *netdev_devp = &netdev_dev->netdev_dev;
546 /* For most types of netdevs we open the device for each call of
547 * netdev_open(). However, this is not the case with tap devices,
548 * since it is only possible to open the device once. In this
549 * situation we share a single file descriptor, and consequently
550 * buffers, across all readers. Therefore once data is read it will
551 * be unavailable to other reads for tap devices. */
/* Creates the tap device 'name': opens "/dev/net/tun" read-write, issues
 * TUNSETIFF with IFF_TAP | IFF_NO_PI, and switches the descriptor to
 * non-blocking mode.  The descriptor is kept in the device's tap state.  The
 * device is always initialized as netdev_tap_class ('class' is unused) and
 * returned through '*netdev_devp'.  Warns if 'args' is non-empty. */
553 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
554 const char *name, const struct shash *args,
555 struct netdev_dev **netdev_devp)
557 struct netdev_dev_linux *netdev_dev;
558 struct tap_state *state;
559 static const char tap_dev[] = "/dev/net/tun";
563 if (!shash_is_empty(args)) {
564 VLOG_WARN("%s: arguments for TAP devices should be empty", name);
567 netdev_dev = xzalloc(sizeof *netdev_dev);
568 state = &netdev_dev->state.tap;
570 /* Open tap device. */
571 state->fd = open(tap_dev, O_RDWR);
574 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
578 /* Create tap device. */
579 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
580 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
581 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
582 VLOG_WARN("%s: creating tap device failed: %s", name,
588 /* Make non-blocking. */
589 error = set_nonblocking(state->fd);
594 netdev_dev_init(&netdev_dev->netdev_dev, name, args, &netdev_tap_class);
595 *netdev_devp = &netdev_dev->netdev_dev;
/* Tears down the tap state of 'netdev_dev'.  Acts only when the tap file
 * descriptor is valid (>= 0).
 * NOTE(review): presumably closes that descriptor; the statement is not
 * visible in this excerpt -- confirm. */
604 destroy_tap(struct netdev_dev_linux *netdev_dev)
606 struct tap_state *state = &netdev_dev->state.tap;
608 if (state->fd >= 0) {
613 /* Destroys the netdev device 'netdev_dev_'. */
615 netdev_linux_destroy(struct netdev_dev *netdev_dev_)
617 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
618 const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
/* Let the traffic-control implementation free its state, if it provides a
 * tc_destroy hook. */
620 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
621 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
/* System and internal devices share the cache notifier: each drops its
 * reference and the last one unregisters it.  Tap devices release their tap
 * state instead. */
624 if (class == &netdev_linux_class || class == &netdev_internal_class) {
625 cache_notifier_refcount--;
627 if (!cache_notifier_refcount) {
628 rtnetlink_link_notifier_unregister(&netdev_linux_cache_notifier);
630 } else if (class == &netdev_tap_class) {
631 destroy_tap(netdev_dev);
/* Opens an instance of 'netdev_dev_' and stores it in '*netdevp'.
 *
 * The first opener of a "tap" device is handed the shared tap file
 * descriptor.  Otherwise, unless 'ethertype' is NETDEV_ETH_TYPE_NONE, a
 * PF_PACKET/SOCK_RAW socket is created for the requested protocol, switched
 * to non-blocking mode, bound to the device's ifindex, and drained of
 * anything received before the bind. */
640 netdev_linux_open(struct netdev_dev *netdev_dev_, int ethertype,
641 struct netdev **netdevp)
643 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
644 struct netdev_linux *netdev;
645 enum netdev_flags flags;
648 /* Allocate network device. */
649 netdev = xzalloc(sizeof *netdev);
651 netdev_init(&netdev->netdev, netdev_dev_);
653 /* Verify that the device really exists, by attempting to read its flags.
654 * (The flags might be cached, in which case this won't actually do an
657 * Don't do this for "internal" netdevs, though, because those have to be
658 * created as netdev objects before they exist in the kernel, because
659 * creating them in the kernel happens by passing a netdev object to
660 * dpif_port_add(). */
661 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
662 error = netdev_get_flags(&netdev->netdev, &flags);
663 if (error == ENODEV) {
668 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
669 !netdev_dev->state.tap.opened) {
671 /* We assume that the first user of the tap device is the primary user
672 * and give them the tap FD. Subsequent users probably just expect
673 * this to be a system device so open it normally to avoid send/receive
674 * directions appearing to be reversed. */
675 netdev->fd = netdev_dev->state.tap.fd;
676 netdev_dev->state.tap.opened = true;
677 } else if (ethertype != NETDEV_ETH_TYPE_NONE) {
678 struct sockaddr_ll sll;
682 /* Create file descriptor. */
683 protocol = (ethertype == NETDEV_ETH_TYPE_ANY ? ETH_P_ALL
684 : ethertype == NETDEV_ETH_TYPE_802_2 ? ETH_P_802_2
686 netdev->fd = socket(PF_PACKET, SOCK_RAW, htons(protocol));
687 if (netdev->fd < 0) {
692 /* Set non-blocking mode. */
693 error = set_nonblocking(netdev->fd);
698 /* Get ethernet device index. */
699 error = get_ifindex(&netdev->netdev, &ifindex);
704 /* Bind to specific ethernet device. */
705 memset(&sll, 0, sizeof sll);
706 sll.sll_family = AF_PACKET;
707 sll.sll_ifindex = ifindex;
709 (struct sockaddr *) &sll, sizeof sll) < 0) {
711 VLOG_ERR("bind to %s failed: %s", netdev_dev_get_name(netdev_dev_),
716 /* Between the socket() and bind() calls above, the socket receives all
717 * packets of the requested type on all system interfaces. We do not
718 * want to receive that data, but there is no way to avoid it. So we
719 * must now drain out the receive queue. */
720 error = drain_rcvbuf(netdev->fd);
726 *netdevp = &netdev->netdev;
730 netdev_uninit(&netdev->netdev, true);
734 /* Closes and destroys 'netdev'. */
736 netdev_linux_close(struct netdev *netdev_)
738 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* A negative fd means "no descriptor" everywhere else in this file, so 0 must
 * count as a valid descriptor here too.  Devices of type "tap" are excluded
 * by this check. */
740 if (netdev->fd >= 0 && strcmp(netdev_get_type(netdev_), "tap")) {
746 /* Initializes 'sset' with a list of the names of all known network devices. */
/* The names come from if_nameindex(); each one is added with sset_add() and
 * the array is released with if_freenameindex().  A warning is logged when
 * the list cannot be obtained. */
748 netdev_linux_enumerate(struct sset *sset)
750 struct if_nameindex *names;
752 names = if_nameindex();
756 for (i = 0; names[i].if_name != NULL; i++) {
757 sset_add(sset, names[i].if_name);
759 if_freenameindex(names);
762 VLOG_WARN("could not obtain list of network device names: %s",
/* Receives a packet from 'netdev_' into 'data' (at most 'size' bytes) by
 * calling read() on the netdev's file descriptor.  A netdev without a file
 * descriptor (fd < 0) cannot receive.  read() failures other than EINTR and
 * EAGAIN are logged, rate-limited, as "<device name>: <error string>". */
769 netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
771 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
773 if (netdev->fd < 0) {
774 /* Device was opened with NETDEV_ETH_TYPE_NONE. */
779 ssize_t retval = read(netdev->fd, data, size);
782 } else if (errno != EINTR) {
783 if (errno != EAGAIN) {
784 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
785 netdev_get_name(netdev_), strerror(errno));
792 /* Registers with the poll loop to wake up from the next call to poll_block()
793 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
/* Nothing is registered when the netdev has no file descriptor (fd < 0). */
795 netdev_linux_recv_wait(struct netdev *netdev_)
797 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
798 if (netdev->fd >= 0) {
799 poll_fd_wait(netdev->fd, POLLIN);
803 /* Discards all packets waiting to be received from 'netdev'. */
/* Devices of type "tap" are drained with drain_fd(), using the interface's
 * transmit queue length (queried with SIOCGIFTXQLEN) as the count.  Other
 * devices that have a file descriptor are drained with drain_rcvbuf(). */
805 netdev_linux_drain(struct netdev *netdev_)
807 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
808 if (netdev->fd < 0) {
810 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
812 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
813 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
817 drain_fd(netdev->fd, ifr.ifr_qlen);
820 return drain_rcvbuf(netdev->fd);
824 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
825 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
826 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
827 * the packet is too big or too small to transmit on the device.
829 * The caller retains ownership of 'buffer' in all cases.
831 * The kernel maintains a packet transmission queue, so the caller is not
832 * expected to do additional queuing of packets. */
/* The packet to send is the 'size' bytes at 'data'.  When the netdev has no
 * file descriptor of its own (fd < 0), the packet goes out through the shared
 * 'af_packet_sock' with sendmsg(), addressed by the device's ifindex;
 * otherwise it is written to the netdev's own descriptor. */
834 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
836 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
840 if (netdev->fd < 0) {
841 /* Use our AF_PACKET socket to send to this device. */
842 struct sockaddr_ll sll;
848 error = get_ifindex(netdev_, &ifindex);
853 /* We don't bother setting most fields in sockaddr_ll because the
854 * kernel ignores them for SOCK_RAW. */
855 memset(&sll, 0, sizeof sll);
856 sll.sll_family = AF_PACKET;
857 sll.sll_ifindex = ifindex;
859 iov.iov_base = (void *) data;
863 msg.msg_namelen = sizeof sll;
866 msg.msg_control = NULL;
867 msg.msg_controllen = 0;
870 retval = sendmsg(af_packet_sock, &msg, 0);
872 /* Use the netdev's own fd to send to this device. This is
873 * essential for tap devices, because packets sent to a tap device
874 * with an AF_PACKET socket will loop back to be *received* again
875 * on the tap device. */
876 retval = write(netdev->fd, data, size);
880 /* The Linux AF_PACKET implementation never blocks waiting for room
881 * for packets, instead returning ENOBUFS. Translate this into
882 * EAGAIN for the caller. */
883 if (errno == ENOBUFS) {
885 } else if (errno == EINTR) {
887 } else if (errno != EAGAIN) {
888 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
889 netdev_get_name(netdev_), strerror(errno));
892 } else if (retval != size) {
893 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
894 "%zu) on %s", retval, size, netdev_get_name(netdev_));
902 /* Registers with the poll loop to wake up from the next call to poll_block()
903 * when the packet transmission queue has sufficient room to transmit a packet
904 * with netdev_send().
906 * The kernel maintains a packet transmission queue, so the client is not
907 * expected to do additional queuing of packets. Thus, this function is
908 * unlikely to ever be used. It is included for completeness. */
/* Devices with a file descriptor whose type is not "tap" wait for POLLOUT on
 * that descriptor. */
910 netdev_linux_send_wait(struct netdev *netdev_)
912 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
913 if (netdev->fd < 0) {
915 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
916 poll_fd_wait(netdev->fd, POLLOUT);
918 /* TAP device always accepts packets.*/
919 poll_immediate_wake();
923 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
924 * otherwise a positive errno value. */
926 netdev_linux_set_etheraddr(struct netdev *netdev_,
927 const uint8_t mac[ETH_ADDR_LEN])
929 struct netdev_dev_linux *netdev_dev =
930 netdev_dev_linux_cast(netdev_get_dev(netdev_));
/* Skip the set_etheraddr() call when the cached address is valid and already
 * equals 'mac'. */
933 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)
934 || !eth_addr_equals(netdev_dev->etheraddr, mac)) {
935 error = set_etheraddr(netdev_get_name(netdev_), ARPHRD_ETHER, mac);
937 netdev_dev->cache_valid |= VALID_ETHERADDR;
938 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
946 /* Copies 'netdev''s MAC address into the caller-provided buffer 'mac'. The
947 * address is fetched with get_etheraddr() only when it is not already cached. */
949 netdev_linux_get_etheraddr(const struct netdev *netdev_,
950 uint8_t mac[ETH_ADDR_LEN])
952 struct netdev_dev_linux *netdev_dev =
953 netdev_dev_linux_cast(netdev_get_dev(netdev_));
954 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
955 int error = get_etheraddr(netdev_get_name(netdev_),
956 netdev_dev->etheraddr);
960 netdev_dev->cache_valid |= VALID_ETHERADDR;
962 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
966 /* Stores in '*mtup' the maximum size of transmitted (and received) packets on
967 * 'netdev', in bytes, not including the hardware header; thus, this is
968 * typically 1500 bytes for Ethernet devices. */
/* The value is queried with SIOCGIFMTU only when it is not already cached
 * (VALID_MTU clear in 'cache_valid'). */
970 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
972 struct netdev_dev_linux *netdev_dev =
973 netdev_dev_linux_cast(netdev_get_dev(netdev_));
974 if (!(netdev_dev->cache_valid & VALID_MTU)) {
978 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
979 SIOCGIFMTU, "SIOCGIFMTU");
983 netdev_dev->mtu = ifr.ifr_mtu;
984 netdev_dev->cache_valid |= VALID_MTU;
986 *mtup = netdev_dev->mtu;
990 /* Returns the ifindex of 'netdev', if successful, as a positive number.
991 * On failure, returns a negative errno value. */
993 netdev_linux_get_ifindex(const struct netdev *netdev)
997 error = get_ifindex(netdev, &ifindex);
/* Negate the error code so that a failure cannot be mistaken for a valid
 * (positive) ifindex. */
998 return error ? -error : ifindex;
/* Stores in '*carrier' whether 'netdev_' has carrier.  The state is read from
 * the first character of "/sys/class/net/<name>/carrier", which must be '0'
 * or '1', and is cached under VALID_CARRIER so that later calls skip the file
 * read.  Open and read failures, end of file and unexpected values are
 * logged, rate-limited. */
1002 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1004 struct netdev_dev_linux *netdev_dev =
1005 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1010 if (!(netdev_dev->cache_valid & VALID_CARRIER)) {
1014 fn = xasprintf("/sys/class/net/%s/carrier",
1015 netdev_get_name(netdev_));
1016 fd = open(fn, O_RDONLY);
1019 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(error));
1023 retval = read(fd, line, sizeof line);
1026 if (error == EINVAL) {
1027 /* This is the normal return value when we try to check carrier
1028 * if the network device is not up. */
1030 VLOG_WARN_RL(&rl, "%s: read failed: %s", fn, strerror(error));
1033 } else if (retval == 0) {
1035 VLOG_WARN_RL(&rl, "%s: unexpected end of file", fn);
1039 if (line[0] != '0' && line[0] != '1') {
1041 VLOG_WARN_RL(&rl, "%s: value is %c (expected 0 or 1)",
1045 netdev_dev->carrier = line[0] != '0';
1046 netdev_dev->cache_valid |= VALID_CARRIER;
1048 *carrier = netdev_dev->carrier;
/* Issues MII ioctl 'cmd' (named 'cmd_name' for logging) on 'netdev'.  '*data'
 * is copied into the 'ifr_data' area of a zeroed ifreq before the call and
 * copied back out of it afterwards, so the caller sees whatever the ioctl
 * wrote there. */
1060 netdev_linux_do_miimon(const struct netdev *netdev, int cmd,
1061 const char *cmd_name, struct mii_ioctl_data *data)
1066 memset(&ifr, 0, sizeof ifr);
1067 memcpy(&ifr.ifr_data, data, sizeof *data);
1068 error = netdev_linux_do_ioctl(netdev_get_name(netdev),
1069 &ifr, cmd, cmd_name);
1070 memcpy(data, &ifr.ifr_data, sizeof *data);
/* Stores the link status of 'netdev' in '*miimon'.  The MII path issues
 * SIOCGMIIPHY followed by SIOCGMIIREG on register MII_BMSR and reports the
 * BMSR_LSTATUS bit.  The fallback path queries ETHTOOL_GLINK and reports
 * whether the returned 'data' value is nonzero. */
1076 netdev_linux_get_miimon(const struct netdev *netdev, bool *miimon)
1078 const char *name = netdev_get_name(netdev);
1079 struct mii_ioctl_data data;
1084 memset(&data, 0, sizeof data);
1085 error = netdev_linux_do_miimon(netdev, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1087 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1088 data.reg_num = MII_BMSR;
1089 error = netdev_linux_do_miimon(netdev, SIOCGMIIREG, "SIOCGMIIREG",
1093 *miimon = !!(data.val_out & BMSR_LSTATUS);
1095 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1098 struct ethtool_cmd ecmd;
1100 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1103 memset(&ecmd, 0, sizeof ecmd);
1104 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1107 struct ethtool_value eval;
1109 memcpy(&eval, &ecmd, sizeof eval);
1110 *miimon = !!eval.data;
1112 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1119 /* Checks whether we can use RTM_GETLINK to get network device statistics.
1120 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1123 check_for_working_netlink_stats(void)
1125 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1126 * preferable, so if that works, we'll use it. */
1127 int ifindex = do_get_ifindex("lo");
1129 VLOG_WARN("failed to get ifindex for lo, "
1130 "obtaining netdev stats from proc");
1133 struct netdev_stats stats;
1134 int error = get_stats_via_netlink(ifindex, &stats);
1136 VLOG_DBG("obtaining netdev stats via rtnetlink");
1139 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1140 "via proc (you are probably running a pre-2.6.19 "
1141 "kernel)", strerror(error));
1147 /* Brings the 'is_internal' and 'is_tap' members of 'netdev_dev' up-to-date. */
1149 netdev_linux_update_is_pseudo(struct netdev_dev_linux *netdev_dev)
1151 if (!(netdev_dev->cache_valid & VALID_IS_PSEUDO)) {
1152 const char *name = netdev_dev_get_name(&netdev_dev->netdev_dev);
1153 const char *type = netdev_dev_get_type(&netdev_dev->netdev_dev);
1155 netdev_dev->is_tap = !strcmp(type, "tap");
/* A tap device is never classified as internal:
 * dpif_linux_is_internal_device() is consulted only for non-tap devices. */
1156 netdev_dev->is_internal = (!netdev_dev->is_tap
1157 && dpif_linux_is_internal_device(name));
1158 netdev_dev->cache_valid |= VALID_IS_PSEUDO;
/* NOTE(review): presumably exchanges the values of '*a' and '*b'; the body is
 * not visible in this excerpt -- confirm.  netdev_linux_get_stats() uses it on
 * pairs of rx/tx counters. */
1163 swap_uint64(uint64_t *a, uint64_t *b)
1170 /* Retrieves current device stats for 'netdev'. */
1172 netdev_linux_get_stats(const struct netdev *netdev_,
1173 struct netdev_stats *stats)
1175 struct netdev_dev_linux *netdev_dev =
1176 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1177 static int use_netlink_stats = -1;
/* Ask the vport layer when it worked last time or has never been tried; the
 * outcome is remembered in 'have_vport_stats'. */
1180 if (netdev_dev->have_vport_stats ||
1181 !(netdev_dev->cache_valid & VALID_HAVE_VPORT_STATS)) {
1183 error = netdev_vport_get_stats(netdev_, stats);
1184 netdev_dev->have_vport_stats = !error;
1185 netdev_dev->cache_valid |= VALID_HAVE_VPORT_STATS;
/* Without vport stats, use rtnetlink or the proc-based reader.  The probe
 * result is kept in the static 'use_netlink_stats', so
 * check_for_working_netlink_stats() runs only while it is still negative. */
1188 if (!netdev_dev->have_vport_stats) {
1189 if (use_netlink_stats < 0) {
1190 use_netlink_stats = check_for_working_netlink_stats();
1192 if (use_netlink_stats) {
1195 error = get_ifindex(netdev_, &ifindex);
1197 error = get_stats_via_netlink(ifindex, stats);
1200 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1204 /* If this port is an internal port then the transmit and receive stats
1205 * will appear to be swapped relative to the other ports since we are the
1206 * one sending the data, not a remote computer. For consistency, we swap
1207 * them back here. This does not apply if we are getting stats from the
1208 * vport layer because it always tracks stats from the perspective of the
1210 netdev_linux_update_is_pseudo(netdev_dev);
1211 if (!error && !netdev_dev->have_vport_stats &&
1212 (netdev_dev->is_internal || netdev_dev->is_tap)) {
1213 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1214 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1215 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1216 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1217 stats->rx_length_errors = 0;
1218 stats->rx_over_errors = 0;
1219 stats->rx_crc_errors = 0;
1220 stats->rx_frame_errors = 0;
1221 stats->rx_fifo_errors = 0;
1222 stats->rx_missed_errors = 0;
1223 stats->tx_aborted_errors = 0;
1224 stats->tx_carrier_errors = 0;
1225 stats->tx_fifo_errors = 0;
1226 stats->tx_heartbeat_errors = 0;
1227 stats->tx_window_errors = 0;
1233 /* Stores the features supported by 'netdev' into each of '*current',
1234 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1235 * bitmap of "enum ofp_port_features" bits, in host byte order. Returns 0 if
1236 * successful, otherwise a positive errno value. */
1238 netdev_linux_get_features(const struct netdev *netdev,
1239 uint32_t *current, uint32_t *advertised,
1240 uint32_t *supported, uint32_t *peer)
1242 struct ethtool_cmd ecmd;
1245 memset(&ecmd, 0, sizeof ecmd);
1246 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1247 ETHTOOL_GSET, "ETHTOOL_GSET");
1252 /* Supported features. */
1254 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1255 *supported |= OFPPF_10MB_HD;
1257 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1258 *supported |= OFPPF_10MB_FD;
1260 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1261 *supported |= OFPPF_100MB_HD;
1263 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1264 *supported |= OFPPF_100MB_FD;
1266 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1267 *supported |= OFPPF_1GB_HD;
1269 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1270 *supported |= OFPPF_1GB_FD;
1272 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1273 *supported |= OFPPF_10GB_FD;
1275 if (ecmd.supported & SUPPORTED_TP) {
1276 *supported |= OFPPF_COPPER;
1278 if (ecmd.supported & SUPPORTED_FIBRE) {
1279 *supported |= OFPPF_FIBER;
1281 if (ecmd.supported & SUPPORTED_Autoneg) {
1282 *supported |= OFPPF_AUTONEG;
1284 if (ecmd.supported & SUPPORTED_Pause) {
1285 *supported |= OFPPF_PAUSE;
1287 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1288 *supported |= OFPPF_PAUSE_ASYM;
1291 /* Advertised features. */
1293 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1294 *advertised |= OFPPF_10MB_HD;
1296 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1297 *advertised |= OFPPF_10MB_FD;
1299 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1300 *advertised |= OFPPF_100MB_HD;
1302 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1303 *advertised |= OFPPF_100MB_FD;
1305 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1306 *advertised |= OFPPF_1GB_HD;
1308 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1309 *advertised |= OFPPF_1GB_FD;
1311 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1312 *advertised |= OFPPF_10GB_FD;
1314 if (ecmd.advertising & ADVERTISED_TP) {
1315 *advertised |= OFPPF_COPPER;
1317 if (ecmd.advertising & ADVERTISED_FIBRE) {
1318 *advertised |= OFPPF_FIBER;
1320 if (ecmd.advertising & ADVERTISED_Autoneg) {
1321 *advertised |= OFPPF_AUTONEG;
1323 if (ecmd.advertising & ADVERTISED_Pause) {
1324 *advertised |= OFPPF_PAUSE;
1326 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1327 *advertised |= OFPPF_PAUSE_ASYM;
1330 /* Current settings. */
1331 if (ecmd.speed == SPEED_10) {
1332 *current = ecmd.duplex ? OFPPF_10MB_FD : OFPPF_10MB_HD;
1333 } else if (ecmd.speed == SPEED_100) {
1334 *current = ecmd.duplex ? OFPPF_100MB_FD : OFPPF_100MB_HD;
1335 } else if (ecmd.speed == SPEED_1000) {
1336 *current = ecmd.duplex ? OFPPF_1GB_FD : OFPPF_1GB_HD;
1337 } else if (ecmd.speed == SPEED_10000) {
1338 *current = OFPPF_10GB_FD;
1343 if (ecmd.port == PORT_TP) {
1344 *current |= OFPPF_COPPER;
1345 } else if (ecmd.port == PORT_FIBRE) {
1346 *current |= OFPPF_FIBER;
1350 *current |= OFPPF_AUTONEG;
1353 /* Peer advertisements. */
1354 *peer = 0; /* XXX */
1359 /* Set the features advertised by 'netdev' to 'advertise'. */
1361 netdev_linux_set_advertisements(struct netdev *netdev, uint32_t advertise)
1363 struct ethtool_cmd ecmd;
1366 memset(&ecmd, 0, sizeof ecmd);
1367 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1368 ETHTOOL_GSET, "ETHTOOL_GSET");
1373 ecmd.advertising = 0;
1374 if (advertise & OFPPF_10MB_HD) {
1375 ecmd.advertising |= ADVERTISED_10baseT_Half;
1377 if (advertise & OFPPF_10MB_FD) {
1378 ecmd.advertising |= ADVERTISED_10baseT_Full;
1380 if (advertise & OFPPF_100MB_HD) {
1381 ecmd.advertising |= ADVERTISED_100baseT_Half;
1383 if (advertise & OFPPF_100MB_FD) {
1384 ecmd.advertising |= ADVERTISED_100baseT_Full;
1386 if (advertise & OFPPF_1GB_HD) {
1387 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1389 if (advertise & OFPPF_1GB_FD) {
1390 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1392 if (advertise & OFPPF_10GB_FD) {
1393 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1395 if (advertise & OFPPF_COPPER) {
1396 ecmd.advertising |= ADVERTISED_TP;
1398 if (advertise & OFPPF_FIBER) {
1399 ecmd.advertising |= ADVERTISED_FIBRE;
1401 if (advertise & OFPPF_AUTONEG) {
1402 ecmd.advertising |= ADVERTISED_Autoneg;
1404 if (advertise & OFPPF_PAUSE) {
1405 ecmd.advertising |= ADVERTISED_Pause;
1407 if (advertise & OFPPF_PAUSE_ASYM) {
1408 ecmd.advertising |= ADVERTISED_Asym_Pause;
1410 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1411 ETHTOOL_SSET, "ETHTOOL_SSET");
1414 /* If 'netdev_name' is the name of a VLAN network device (e.g. one created with
1415 * vconfig(8)), sets '*vlan_vid' to the VLAN VID associated with that device
1416 * and returns 0. Otherwise returns a errno value (specifically ENOENT if
1417 * 'netdev_name' is the name of a network device that is not a VLAN device) and
1418 * sets '*vlan_vid' to -1. */
1420 netdev_linux_get_vlan_vid(const struct netdev *netdev, int *vlan_vid)
1422 const char *netdev_name = netdev_get_name(netdev);
1423 struct ds line = DS_EMPTY_INITIALIZER;
1424 FILE *stream = NULL;
1428 COVERAGE_INC(netdev_get_vlan_vid);
1429 fn = xasprintf("/proc/net/vlan/%s", netdev_name);
1430 stream = fopen(fn, "r");
1436 if (ds_get_line(&line, stream)) {
1437 if (ferror(stream)) {
1439 VLOG_ERR_RL(&rl, "error reading \"%s\": %s", fn, strerror(errno));
1442 VLOG_ERR_RL(&rl, "unexpected end of file reading \"%s\"", fn);
1447 if (!sscanf(ds_cstr(&line), "%*s VID: %d", vlan_vid)) {
1449 VLOG_ERR_RL(&rl, "parse error reading \"%s\" line 1: \"%s\"",
1450 fn, ds_cstr(&line));
1468 #define POLICE_ADD_CMD "/sbin/tc qdisc add dev %s handle ffff: ingress"
1469 #define POLICE_CONFIG_CMD "/sbin/tc filter add dev %s parent ffff: protocol ip prio 50 u32 match ip src 0.0.0.0/0 police rate %dkbit burst %dk mtu 65535 drop flowid :1"
1471 /* Remove ingress policing from 'netdev'. Returns 0 if successful, otherwise a
1472 * positive errno value.
1474 * This function is equivalent to running
1475 * /sbin/tc qdisc del dev %s handle ffff: ingress
1476 * but it is much, much faster.
1479 netdev_linux_remove_policing(struct netdev *netdev)
1481 struct netdev_dev_linux *netdev_dev =
1482 netdev_dev_linux_cast(netdev_get_dev(netdev));
1483 const char *netdev_name = netdev_get_name(netdev);
1485 struct ofpbuf request;
1486 struct tcmsg *tcmsg;
1489 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
1493 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
1494 tcmsg->tcm_parent = TC_H_INGRESS;
1495 nl_msg_put_string(&request, TCA_KIND, "ingress");
1496 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
1498 error = tc_transact(&request, NULL);
1499 if (error && error != ENOENT && error != EINVAL) {
1500 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1501 netdev_name, strerror(error));
1505 netdev_dev->kbits_rate = 0;
1506 netdev_dev->kbits_burst = 0;
1507 netdev_dev->cache_valid |= VALID_POLICING;
1511 /* Attempts to set input rate limiting (policing) policy. */
1513 netdev_linux_set_policing(struct netdev *netdev,
1514 uint32_t kbits_rate, uint32_t kbits_burst)
1516 struct netdev_dev_linux *netdev_dev =
1517 netdev_dev_linux_cast(netdev_get_dev(netdev));
1518 const char *netdev_name = netdev_get_name(netdev);
1521 COVERAGE_INC(netdev_set_policing);
1523 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1524 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1525 : kbits_burst); /* Stick with user-specified value. */
1527 if (netdev_dev->cache_valid & VALID_POLICING
1528 && netdev_dev->kbits_rate == kbits_rate
1529 && netdev_dev->kbits_burst == kbits_burst) {
1530 /* Assume that settings haven't changed since we last set them. */
1534 netdev_linux_remove_policing(netdev);
1536 snprintf(command, sizeof(command), POLICE_ADD_CMD, netdev_name);
1537 if (system(command) != 0) {
1538 VLOG_WARN_RL(&rl, "%s: problem adding policing", netdev_name);
1542 snprintf(command, sizeof(command), POLICE_CONFIG_CMD, netdev_name,
1543 kbits_rate, kbits_burst);
1544 if (system(command) != 0) {
1545 VLOG_WARN_RL(&rl, "%s: problem configuring policing",
1550 netdev_dev->kbits_rate = kbits_rate;
1551 netdev_dev->kbits_burst = kbits_burst;
1552 netdev_dev->cache_valid |= VALID_POLICING;
1559 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1562 const struct tc_ops **opsp;
1564 for (opsp = tcs; *opsp != NULL; opsp++) {
1565 const struct tc_ops *ops = *opsp;
1566 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1567 sset_add(types, ops->ovs_name);
1573 static const struct tc_ops *
1574 tc_lookup_ovs_name(const char *name)
1576 const struct tc_ops **opsp;
1578 for (opsp = tcs; *opsp != NULL; opsp++) {
1579 const struct tc_ops *ops = *opsp;
1580 if (!strcmp(name, ops->ovs_name)) {
1587 static const struct tc_ops *
1588 tc_lookup_linux_name(const char *name)
1590 const struct tc_ops **opsp;
1592 for (opsp = tcs; *opsp != NULL; opsp++) {
1593 const struct tc_ops *ops = *opsp;
1594 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
1601 static struct tc_queue *
1602 tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1605 struct netdev_dev_linux *netdev_dev =
1606 netdev_dev_linux_cast(netdev_get_dev(netdev));
1607 struct tc_queue *queue;
1609 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
1610 if (queue->queue_id == queue_id) {
/* Looks up the queue numbered 'queue_id' on 'netdev' by hashing the id with
 * hash_int() and delegating to tc_find_queue__(), whose result is returned
 * unchanged. */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    uint32_t hash = hash_int(queue_id, 0);

    return tc_find_queue__(netdev, queue_id, hash);
}
1624 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1626 struct netdev_qos_capabilities *caps)
1628 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1632 caps->n_queues = ops->n_queues;
1637 netdev_linux_get_qos(const struct netdev *netdev,
1638 const char **typep, struct shash *details)
1640 struct netdev_dev_linux *netdev_dev =
1641 netdev_dev_linux_cast(netdev_get_dev(netdev));
1644 error = tc_query_qdisc(netdev);
1649 *typep = netdev_dev->tc->ops->ovs_name;
1650 return (netdev_dev->tc->ops->qdisc_get
1651 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
1656 netdev_linux_set_qos(struct netdev *netdev,
1657 const char *type, const struct shash *details)
1659 struct netdev_dev_linux *netdev_dev =
1660 netdev_dev_linux_cast(netdev_get_dev(netdev));
1661 const struct tc_ops *new_ops;
1664 new_ops = tc_lookup_ovs_name(type);
1665 if (!new_ops || !new_ops->tc_install) {
1669 error = tc_query_qdisc(netdev);
1674 if (new_ops == netdev_dev->tc->ops) {
1675 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1677 /* Delete existing qdisc. */
1678 error = tc_del_qdisc(netdev);
1682 assert(netdev_dev->tc == NULL);
1684 /* Install new qdisc. */
1685 error = new_ops->tc_install(netdev, details);
1686 assert((error == 0) == (netdev_dev->tc != NULL));
1693 netdev_linux_get_queue(const struct netdev *netdev,
1694 unsigned int queue_id, struct shash *details)
1696 struct netdev_dev_linux *netdev_dev =
1697 netdev_dev_linux_cast(netdev_get_dev(netdev));
1700 error = tc_query_qdisc(netdev);
1704 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1706 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
1712 netdev_linux_set_queue(struct netdev *netdev,
1713 unsigned int queue_id, const struct shash *details)
1715 struct netdev_dev_linux *netdev_dev =
1716 netdev_dev_linux_cast(netdev_get_dev(netdev));
1719 error = tc_query_qdisc(netdev);
1722 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1723 || !netdev_dev->tc->ops->class_set) {
1727 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
1731 netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1733 struct netdev_dev_linux *netdev_dev =
1734 netdev_dev_linux_cast(netdev_get_dev(netdev));
1737 error = tc_query_qdisc(netdev);
1740 } else if (!netdev_dev->tc->ops->class_delete) {
1743 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1745 ? netdev_dev->tc->ops->class_delete(netdev, queue)
1751 netdev_linux_get_queue_stats(const struct netdev *netdev,
1752 unsigned int queue_id,
1753 struct netdev_queue_stats *stats)
1755 struct netdev_dev_linux *netdev_dev =
1756 netdev_dev_linux_cast(netdev_get_dev(netdev));
1759 error = tc_query_qdisc(netdev);
1762 } else if (!netdev_dev->tc->ops->class_get_stats) {
1765 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1767 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
1773 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1775 struct ofpbuf request;
1776 struct tcmsg *tcmsg;
1778 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
1782 tcmsg->tcm_parent = 0;
1783 nl_dump_start(dump, rtnl_sock, &request);
1784 ofpbuf_uninit(&request);
1789 netdev_linux_dump_queues(const struct netdev *netdev,
1790 netdev_dump_queues_cb *cb, void *aux)
1792 struct netdev_dev_linux *netdev_dev =
1793 netdev_dev_linux_cast(netdev_get_dev(netdev));
1794 struct tc_queue *queue;
1795 struct shash details;
1799 error = tc_query_qdisc(netdev);
1802 } else if (!netdev_dev->tc->ops->class_get) {
1807 shash_init(&details);
1808 HMAP_FOR_EACH (queue, hmap_node, &netdev_dev->tc->queues) {
1809 shash_clear(&details);
1811 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
1813 (*cb)(queue->queue_id, &details, aux);
1818 shash_destroy(&details);
1824 netdev_linux_dump_queue_stats(const struct netdev *netdev,
1825 netdev_dump_queue_stats_cb *cb, void *aux)
1827 struct netdev_dev_linux *netdev_dev =
1828 netdev_dev_linux_cast(netdev_get_dev(netdev));
1829 struct nl_dump dump;
1834 error = tc_query_qdisc(netdev);
1837 } else if (!netdev_dev->tc->ops->class_dump_stats) {
1842 if (!start_queue_dump(netdev, &dump)) {
1845 while (nl_dump_next(&dump, &msg)) {
1846 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
1852 error = nl_dump_done(&dump);
1853 return error ? error : last_error;
1857 netdev_linux_get_in4(const struct netdev *netdev_,
1858 struct in_addr *address, struct in_addr *netmask)
1860 struct netdev_dev_linux *netdev_dev =
1861 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1863 if (!(netdev_dev->cache_valid & VALID_IN4)) {
1866 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
1867 SIOCGIFADDR, "SIOCGIFADDR");
1872 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
1873 SIOCGIFNETMASK, "SIOCGIFNETMASK");
1878 netdev_dev->cache_valid |= VALID_IN4;
1880 *address = netdev_dev->address;
1881 *netmask = netdev_dev->netmask;
1882 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
1886 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
1887 struct in_addr netmask)
1889 struct netdev_dev_linux *netdev_dev =
1890 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1893 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
1895 netdev_dev->cache_valid |= VALID_IN4;
1896 netdev_dev->address = address;
1897 netdev_dev->netmask = netmask;
1898 if (address.s_addr != INADDR_ANY) {
1899 error = do_set_addr(netdev_, SIOCSIFNETMASK,
1900 "SIOCSIFNETMASK", netmask);
1907 parse_if_inet6_line(const char *line,
1908 struct in6_addr *in6, char ifname[16 + 1])
1910 uint8_t *s6 = in6->s6_addr;
1911 #define X8 "%2"SCNx8
1913 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
1914 "%*x %*x %*x %*x %16s\n",
1915 &s6[0], &s6[1], &s6[2], &s6[3],
1916 &s6[4], &s6[5], &s6[6], &s6[7],
1917 &s6[8], &s6[9], &s6[10], &s6[11],
1918 &s6[12], &s6[13], &s6[14], &s6[15],
1922 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
1923 * 'in6' is non-null) and returns true. Otherwise, returns false. */
1925 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
1927 struct netdev_dev_linux *netdev_dev =
1928 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1929 if (!(netdev_dev->cache_valid & VALID_IN6)) {
1933 netdev_dev->in6 = in6addr_any;
1935 file = fopen("/proc/net/if_inet6", "r");
1937 const char *name = netdev_get_name(netdev_);
1938 while (fgets(line, sizeof line, file)) {
1939 struct in6_addr in6_tmp;
1940 char ifname[16 + 1];
1941 if (parse_if_inet6_line(line, &in6_tmp, ifname)
1942 && !strcmp(name, ifname))
1944 netdev_dev->in6 = in6_tmp;
1950 netdev_dev->cache_valid |= VALID_IN6;
1952 *in6 = netdev_dev->in6;
/* Fills '*sa' with an AF_INET socket address for 'addr', with port 0.  All
 * bytes of '*sa' that the IPv4 address does not cover are zeroed. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in in4;

    memset(&in4, 0, sizeof in4);
    in4.sin_addr = addr;
    in4.sin_port = 0;
    in4.sin_family = AF_INET;

    /* Clear the generic structure first, then overlay the IPv4 form. */
    memset(sa, 0, sizeof *sa);
    memcpy(sa, &in4, sizeof in4);
}
1970 do_set_addr(struct netdev *netdev,
1971 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
1974 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
1975 make_in4_sockaddr(&ifr.ifr_addr, addr);
1977 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
1981 /* Adds 'router' as a default IP gateway. */
1983 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
1985 struct in_addr any = { INADDR_ANY };
1989 memset(&rt, 0, sizeof rt);
1990 make_in4_sockaddr(&rt.rt_dst, any);
1991 make_in4_sockaddr(&rt.rt_gateway, router);
1992 make_in4_sockaddr(&rt.rt_genmask, any);
1993 rt.rt_flags = RTF_UP | RTF_GATEWAY;
1994 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
1996 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
2002 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2005 static const char fn[] = "/proc/net/route";
2010 *netdev_name = NULL;
2011 stream = fopen(fn, "r");
2012 if (stream == NULL) {
2013 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
2018 while (fgets(line, sizeof line, stream)) {
2021 uint32_t dest, gateway, mask;
2022 int refcnt, metric, mtu;
2023 unsigned int flags, use, window, irtt;
2026 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2028 iface, &dest, &gateway, &flags, &refcnt,
2029 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2031 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2035 if (!(flags & RTF_UP)) {
2036 /* Skip routes that aren't up. */
2040 /* The output of 'dest', 'mask', and 'gateway' were given in
2041 * network byte order, so we don't need any endian
2042 * conversions here. */
2043 if ((dest & mask) == (host->s_addr & mask)) {
2045 /* The host is directly reachable. */
2046 next_hop->s_addr = 0;
2048 /* To reach the host, we must go through a gateway. */
2049 next_hop->s_addr = gateway;
2051 *netdev_name = xstrdup(iface);
2063 netdev_linux_get_status(const struct netdev *netdev, struct shash *sh)
2065 struct ethtool_drvinfo drvinfo;
2068 memset(&drvinfo, 0, sizeof drvinfo);
2069 error = netdev_linux_do_ethtool(netdev_get_name(netdev),
2070 (struct ethtool_cmd *)&drvinfo,
2072 "ETHTOOL_GDRVINFO");
2074 shash_add(sh, "driver_name", xstrdup(drvinfo.driver));
2075 shash_add(sh, "driver_version", xstrdup(drvinfo.version));
2076 shash_add(sh, "firmware_version", xstrdup(drvinfo.fw_version));
2082 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2083 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2084 * returns 0. Otherwise, it returns a positive errno value; in particular,
2085 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2087 netdev_linux_arp_lookup(const struct netdev *netdev,
2088 uint32_t ip, uint8_t mac[ETH_ADDR_LEN])
2091 struct sockaddr_in sin;
2094 memset(&r, 0, sizeof r);
2095 memset(&sin, 0, sizeof sin);
2096 sin.sin_family = AF_INET;
2097 sin.sin_addr.s_addr = ip;
2099 memcpy(&r.arp_pa, &sin, sizeof sin);
2100 r.arp_ha.sa_family = ARPHRD_ETHER;
2102 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2103 COVERAGE_INC(netdev_arp_lookup);
2104 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2106 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2107 } else if (retval != ENXIO) {
2108 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2109 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
2115 nd_to_iff_flags(enum netdev_flags nd)
2118 if (nd & NETDEV_UP) {
2121 if (nd & NETDEV_PROMISC) {
2128 iff_to_nd_flags(int iff)
2130 enum netdev_flags nd = 0;
2134 if (iff & IFF_PROMISC) {
2135 nd |= NETDEV_PROMISC;
2141 netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2142 enum netdev_flags on, enum netdev_flags *old_flagsp)
2144 int old_flags, new_flags;
2147 error = get_flags(netdev, &old_flags);
2149 *old_flagsp = iff_to_nd_flags(old_flags);
2150 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2151 if (new_flags != old_flags) {
2152 error = set_flags(netdev, new_flags);
2159 poll_notify(struct list *list)
2161 struct netdev_linux_notifier *notifier;
2162 LIST_FOR_EACH (notifier, node, list) {
2163 struct netdev_notifier *n = ¬ifier->notifier;
2169 netdev_linux_poll_cb(const struct rtnetlink_link_change *change,
2170 void *aux OVS_UNUSED)
2173 struct list *list = shash_find_data(&netdev_linux_notifiers,
2179 struct shash_node *node;
2180 SHASH_FOR_EACH (node, &netdev_linux_notifiers) {
2181 poll_notify(node->data);
2187 netdev_linux_poll_add(struct netdev *netdev,
2188 void (*cb)(struct netdev_notifier *), void *aux,
2189 struct netdev_notifier **notifierp)
2191 const char *netdev_name = netdev_get_name(netdev);
2192 struct netdev_linux_notifier *notifier;
2195 if (shash_is_empty(&netdev_linux_notifiers)) {
2197 error = rtnetlink_link_notifier_register(&netdev_linux_poll_notifier,
2198 netdev_linux_poll_cb, NULL);
2204 list = shash_find_data(&netdev_linux_notifiers, netdev_name);
2206 list = xmalloc(sizeof *list);
2208 shash_add(&netdev_linux_notifiers, netdev_name, list);
2211 notifier = xmalloc(sizeof *notifier);
2212 netdev_notifier_init(¬ifier->notifier, netdev, cb, aux);
2213 list_push_back(list, ¬ifier->node);
2214 *notifierp = ¬ifier->notifier;
2219 netdev_linux_poll_remove(struct netdev_notifier *notifier_)
2221 struct netdev_linux_notifier *notifier =
2222 CONTAINER_OF(notifier_, struct netdev_linux_notifier, notifier);
2225 /* Remove 'notifier' from its list. */
2226 list = list_remove(¬ifier->node);
2227 if (list_is_empty(list)) {
2228 /* The list is now empty. Remove it from the hash and free it. */
2229 const char *netdev_name = netdev_get_name(notifier->notifier.netdev);
2230 shash_delete(&netdev_linux_notifiers,
2231 shash_find(&netdev_linux_notifiers, netdev_name));
2236 /* If that was the last notifier, unregister. */
2237 if (shash_is_empty(&netdev_linux_notifiers)) {
2238 rtnetlink_link_notifier_unregister(&netdev_linux_poll_notifier);
2242 #define NETDEV_LINUX_CLASS(NAME, CREATE, ENUMERATE, SET_STATS) \
2246 netdev_linux_init, \
2248 netdev_linux_wait, \
2251 netdev_linux_destroy, \
2252 NULL, /* set_config */ \
2254 netdev_linux_open, \
2255 netdev_linux_close, \
2259 netdev_linux_recv, \
2260 netdev_linux_recv_wait, \
2261 netdev_linux_drain, \
2263 netdev_linux_send, \
2264 netdev_linux_send_wait, \
2266 netdev_linux_set_etheraddr, \
2267 netdev_linux_get_etheraddr, \
2268 netdev_linux_get_mtu, \
2269 netdev_linux_get_ifindex, \
2270 netdev_linux_get_carrier, \
2271 netdev_linux_get_miimon, \
2272 netdev_linux_get_stats, \
2275 netdev_linux_get_features, \
2276 netdev_linux_set_advertisements, \
2277 netdev_linux_get_vlan_vid, \
2279 netdev_linux_set_policing, \
2280 netdev_linux_get_qos_types, \
2281 netdev_linux_get_qos_capabilities, \
2282 netdev_linux_get_qos, \
2283 netdev_linux_set_qos, \
2284 netdev_linux_get_queue, \
2285 netdev_linux_set_queue, \
2286 netdev_linux_delete_queue, \
2287 netdev_linux_get_queue_stats, \
2288 netdev_linux_dump_queues, \
2289 netdev_linux_dump_queue_stats, \
2291 netdev_linux_get_in4, \
2292 netdev_linux_set_in4, \
2293 netdev_linux_get_in6, \
2294 netdev_linux_add_router, \
2295 netdev_linux_get_next_hop, \
2296 netdev_linux_get_status, \
2297 netdev_linux_arp_lookup, \
2299 netdev_linux_update_flags, \
2301 netdev_linux_poll_add, \
2302 netdev_linux_poll_remove \
2305 const struct netdev_class netdev_linux_class =
2308 netdev_linux_create,
2309 netdev_linux_enumerate,
2310 NULL); /* set_stats */
2312 const struct netdev_class netdev_tap_class =
2315 netdev_linux_create_tap,
2316 NULL, /* enumerate */
2317 NULL); /* set_stats */
2319 const struct netdev_class netdev_internal_class =
2322 netdev_linux_create,
2323 NULL, /* enumerate */
2324 netdev_vport_set_stats);
2326 /* HTB traffic control class. */
2328 #define HTB_N_QUEUES 0xf000
2332 unsigned int max_rate; /* In bytes/s. */
2336 struct tc_queue tc_queue;
2337 unsigned int min_rate; /* In bytes/s. */
2338 unsigned int max_rate; /* In bytes/s. */
2339 unsigned int burst; /* In bytes. */
2340 unsigned int priority; /* Lower values are higher priorities. */
2344 htb_get__(const struct netdev *netdev)
2346 struct netdev_dev_linux *netdev_dev =
2347 netdev_dev_linux_cast(netdev_get_dev(netdev));
2348 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
2352 htb_install__(struct netdev *netdev, uint64_t max_rate)
2354 struct netdev_dev_linux *netdev_dev =
2355 netdev_dev_linux_cast(netdev_get_dev(netdev));
2358 htb = xmalloc(sizeof *htb);
2359 tc_init(&htb->tc, &tc_ops_htb);
2360 htb->max_rate = max_rate;
2362 netdev_dev->tc = &htb->tc;
2365 /* Create an HTB qdisc.
2367 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2369 htb_setup_qdisc__(struct netdev *netdev)
2372 struct tc_htb_glob opt;
2373 struct ofpbuf request;
2374 struct tcmsg *tcmsg;
2376 tc_del_qdisc(netdev);
2378 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2379 NLM_F_EXCL | NLM_F_CREATE, &request);
2383 tcmsg->tcm_handle = tc_make_handle(1, 0);
2384 tcmsg->tcm_parent = TC_H_ROOT;
2386 nl_msg_put_string(&request, TCA_KIND, "htb");
2388 memset(&opt, 0, sizeof opt);
2389 opt.rate2quantum = 10;
2393 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2394 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2395 nl_msg_end_nested(&request, opt_offset);
2397 return tc_transact(&request, NULL);
2400 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2401 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2403 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2404 unsigned int parent, struct htb_class *class)
2407 struct tc_htb_opt opt;
2408 struct ofpbuf request;
2409 struct tcmsg *tcmsg;
2413 netdev_get_mtu(netdev, &mtu);
2414 if (mtu == INT_MAX) {
2415 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2416 netdev_get_name(netdev));
2420 memset(&opt, 0, sizeof opt);
2421 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2422 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2423 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2424 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2425 opt.prio = class->priority;
2427 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2431 tcmsg->tcm_handle = handle;
2432 tcmsg->tcm_parent = parent;
2434 nl_msg_put_string(&request, TCA_KIND, "htb");
2435 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2436 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2437 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2438 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2439 nl_msg_end_nested(&request, opt_offset);
2441 error = tc_transact(&request, NULL);
2443 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2444 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2445 netdev_get_name(netdev),
2446 tc_get_major(handle), tc_get_minor(handle),
2447 tc_get_major(parent), tc_get_minor(parent),
2448 class->min_rate, class->max_rate,
2449 class->burst, class->priority, strerror(error));
2454 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2455 * description of them into 'class'. The description complies with the
2456 * specification given in the vswitch database documentation for linux-htb
2459 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2461 static const struct nl_policy tca_htb_policy[] = {
2462 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2463 .min_len = sizeof(struct tc_htb_opt) },
2466 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2467 const struct tc_htb_opt *htb;
2469 if (!nl_parse_nested(nl_options, tca_htb_policy,
2470 attrs, ARRAY_SIZE(tca_htb_policy))) {
2471 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2475 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2476 class->min_rate = htb->rate.rate;
2477 class->max_rate = htb->ceil.rate;
2478 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2479 class->priority = htb->prio;
2484 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2485 struct htb_class *options,
2486 struct netdev_queue_stats *stats)
2488 struct nlattr *nl_options;
2489 unsigned int handle;
2492 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2493 if (!error && queue_id) {
2494 unsigned int major = tc_get_major(handle);
2495 unsigned int minor = tc_get_minor(handle);
2496 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2497 *queue_id = minor - 1;
2502 if (!error && options) {
2503 error = htb_parse_tca_options__(nl_options, options);
2509 htb_parse_qdisc_details__(struct netdev *netdev,
2510 const struct shash *details, struct htb_class *hc)
2512 const char *max_rate_s;
2514 max_rate_s = shash_find_data(details, "max-rate");
2515 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2516 if (!hc->max_rate) {
2519 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2520 hc->max_rate = netdev_features_to_bps(current) / 8;
2522 hc->min_rate = hc->max_rate;
/* Fills in 'hc' from the per-class 'details' keys "min-rate", "max-rate",
 * "burst" and "priority".
 *
 * The three size/rate values are parsed with strtoull() and divided by 8;
 * "priority" is parsed with strtoul().  No end-pointer or range checking is
 * done, so a malformed string is read as whatever numeric prefix it has.
 * Rates are clamped against the qdisc-wide htb->max_rate. */
2528 htb_parse_class_details__(struct netdev *netdev,
2529 const struct shash *details, struct htb_class *hc)
2531 const struct htb *htb = htb_get__(netdev);
2532 const char *min_rate_s = shash_find_data(details, "min-rate");
2533 const char *max_rate_s = shash_find_data(details, "max-rate");
2534 const char *burst_s = shash_find_data(details, "burst");
2535 const char *priority_s = shash_find_data(details, "priority");
2538 netdev_get_mtu(netdev, &mtu);
2539 if (mtu == INT_MAX) {
2540 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2541 netdev_get_name(netdev));
2545 /* HTB requires at least an mtu sized min-rate to send any traffic even
2546 * on uncongested links. */
2547 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2548 hc->min_rate = MAX(hc->min_rate, mtu);
2549 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2552 hc->max_rate = (max_rate_s
2553 ? strtoull(max_rate_s, NULL, 10) / 8
2555 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2556 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2560 * According to hints in the documentation that I've read, it is important
2561 * that 'burst' be at least as big as the largest frame that might be
2562 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2563 * but having it a bit too small is a problem. Since netdev_get_mtu()
2564 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2565 * the MTU. We actually add 64, instead of 14, as a guard against
2566 * additional headers get tacked on somewhere that we're not aware of. */
2567 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2568 hc->burst = MAX(hc->burst, mtu + 64);
2571 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Asks the kernel (tc_query_class()) for class 'handle' under 'parent' on
 * 'netdev' and decodes the reply into 'options' and 'stats' with
 * htb_parse_tcmsg__().  No queue id is requested from the parser (NULL), and
 * the reply buffer is released here with ofpbuf_delete(). */
2577 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2578 unsigned int parent, struct htb_class *options,
2579 struct netdev_queue_stats *stats)
2581 struct ofpbuf *reply;
2584 error = tc_query_class(netdev, handle, parent, &reply);
2586 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2587 ofpbuf_delete(reply);
/* Installs HTB on 'netdev': sets up the qdisc with htb_setup_qdisc__(),
 * derives the qdisc-wide class parameters from 'details', programs them as
 * class 1:fffe with parent 1:0, and records the resulting 'max_rate' through
 * htb_install__(). */
2593 htb_tc_install(struct netdev *netdev, const struct shash *details)
2597 error = htb_setup_qdisc__(netdev);
2599 struct htb_class hc;
2601 htb_parse_qdisc_details__(netdev, details, &hc);
2602 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2603 tc_make_handle(1, 0), &hc);
2605 htb_install__(netdev, hc.max_rate);
/* Returns the struct htb_class that contains 'queue' as its 'tc_queue'
 * member. */
2611 static struct htb_class *
2612 htb_class_cast__(const struct tc_queue *queue)
2614 return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Copies the settings in 'hc' (min_rate, max_rate, burst, priority) into the
 * htb_class that 'netdev''s HTB state keeps for 'queue_id'.
 *
 * The queue is looked up with tc_find_queue__() using hash_int(queue_id, 0);
 * the xmalloc()/hmap_insert() lines below allocate a new htb_class and
 * register it in htb->tc.queues under the same hash. */
2618 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2619 const struct htb_class *hc)
2621 struct htb *htb = htb_get__(netdev);
2622 size_t hash = hash_int(queue_id, 0);
2623 struct tc_queue *queue;
2624 struct htb_class *hcp;
2626 queue = tc_find_queue__(netdev, queue_id, hash);
2628 hcp = htb_class_cast__(queue);
2630 hcp = xmalloc(sizeof *hcp);
2631 queue = &hcp->tc_queue;
2632 queue->queue_id = queue_id;
2633 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2636 hcp->min_rate = hc->min_rate;
2637 hcp->max_rate = hc->max_rate;
2638 hcp->burst = hc->burst;
2639 hcp->priority = hc->priority;
/* Builds the in-memory HTB state for 'netdev' from what the kernel reports.
 *
 * Class 1:fffe is queried to obtain the qdisc-wide max rate (the result code
 * of that query is not examined), the state is created with htb_install__(),
 * and then every class in the queue dump that htb_parse_tcmsg__() accepts is
 * recorded with htb_update_queue__().  'nlmsg' is unused. */
2643 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2646 struct nl_dump dump;
2647 struct htb_class hc;
2649 /* Get qdisc options. */
2651 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2652 htb_install__(netdev, hc.max_rate);
2655 if (!start_queue_dump(netdev, &dump)) {
2658 while (nl_dump_next(&dump, &msg)) {
2659 unsigned int queue_id;
2661 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2662 htb_update_queue__(netdev, queue_id, &hc);
2665 nl_dump_done(&dump);
/* Tears down the HTB state that embeds 'tc'.  Every htb_class is unlinked
 * from the queue map; the _SAFE iterator is used because entries are removed
 * while iterating. */
2671 htb_tc_destroy(struct tc *tc)
2673 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2674 struct htb_class *hc, *next;
2676 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2677 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Reports the qdisc-wide setting into 'details': "max-rate" is the stored
 * htb->max_rate multiplied by 8 (the parsing side divides by 8), formatted as
 * a decimal string by xasprintf(). */
2685 htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2687 const struct htb *htb = htb_get__(netdev);
2688 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
/* Applies new qdisc-wide 'details' to an HTB already attached to 'netdev':
 * the details are parsed into 'hc', class 1:fffe (parent 1:0) is set up with
 * them, and the cached htb->max_rate is updated from the parsed value. */
2693 htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2695 struct htb_class hc;
2698 htb_parse_qdisc_details__(netdev, details, &hc);
2699 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2700 tc_make_handle(1, 0), &hc);
2702 htb_get__(netdev)->max_rate = hc.max_rate;
/* Reports the settings of 'queue' into 'details' as decimal strings.
 * min-rate, max-rate and burst are multiplied by 8 on the way out (the
 * parsing side divides by 8); the "max-rate" line follows a check that it
 * differs from min-rate.  'netdev' is unused. */
2708 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2709 const struct tc_queue *queue, struct shash *details)
2711 const struct htb_class *hc = htb_class_cast__(queue);
2713 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2714 if (hc->min_rate != hc->max_rate) {
2715 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2717 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2719 shash_add(details, "priority", xasprintf("%u", hc->priority));
/* Configures queue 'queue_id' on 'netdev' from 'details': the details are
 * parsed into 'hc', programmed into the kernel as class 1:(queue_id + 1) with
 * parent 1:fffe, and mirrored into the local queue table with
 * htb_update_queue__(). */
2725 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2726 const struct shash *details)
2728 struct htb_class hc;
2731 error = htb_parse_class_details__(netdev, details, &hc);
2736 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2737 tc_make_handle(1, 0xfffe), &hc);
2742 htb_update_queue__(netdev, queue_id, &hc);
/* Deletes the kernel class 1:(queue_id + 1) that backs 'queue' and unlinks
 * the corresponding htb_class from the HTB queue map. */
2747 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2749 struct htb_class *hc = htb_class_cast__(queue);
2750 struct htb *htb = htb_get__(netdev);
2753 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2755 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Retrieves statistics for 'queue' into 'stats' by querying class
 * 1:(queue_id + 1) with parent 1:fffe.  Class options are not requested
 * (NULL is passed for them).  Returns htb_query_class__()'s result. */
2762 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2763 struct netdev_queue_stats *stats)
2765 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2766 tc_make_handle(1, 0xfffe), NULL, stats);
/* Handles one class message 'nlmsg' from a dump.  The handle and statistics
 * are extracted with tc_parse_class() (options are not requested); for a
 * handle 1:N with 1 <= N <= HTB_N_QUEUES, 'cb' is invoked with queue id
 * N - 1, the parsed statistics, and 'aux'.  'netdev' is unused. */
2770 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2771 const struct ofpbuf *nlmsg,
2772 netdev_dump_queue_stats_cb *cb, void *aux)
2774 struct netdev_queue_stats stats;
2775 unsigned int handle, major, minor;
2778 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2783 major = tc_get_major(handle);
2784 minor = tc_get_minor(handle);
2785 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2786 (*cb)(minor - 1, &stats, aux);
2791 static const struct tc_ops tc_ops_htb = {
2792 "htb", /* linux_name */
2793 "linux-htb", /* ovs_name */
2794 HTB_N_QUEUES, /* n_queues */
2803 htb_class_get_stats,
2804 htb_class_dump_stats
2807 /* "linux-hfsc" traffic control class. */
2809 #define HFSC_N_QUEUES 0xf000
2817 struct tc_queue tc_queue;
/* Returns the struct hfsc that contains the tc currently attached to
 * 'netdev''s netdev_dev_linux.  NOTE(review): this is a plain CONTAINER_OF
 * on netdev_dev->tc, so it is only meaningful while that tc is an HFSC one
 * -- confirm callers guarantee this. */
2822 static struct hfsc *
2823 hfsc_get__(const struct netdev *netdev)
2825 struct netdev_dev_linux *netdev_dev;
2826 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2827 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
/* Returns the struct hfsc_class that contains 'queue' as its 'tc_queue'
 * member. */
2830 static struct hfsc_class *
2831 hfsc_class_cast__(const struct tc_queue *queue)
2833 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Creates the in-memory HFSC state for 'netdev': allocates a struct hfsc,
 * initializes its embedded tc with tc_ops_hfsc, stores 'max_rate', and makes
 * it the netdev_dev's current tc. */
2837 hfsc_install__(struct netdev *netdev, uint32_t max_rate)
2839 struct netdev_dev_linux * netdev_dev;
2842 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2843 hfsc = xmalloc(sizeof *hfsc);
2844 tc_init(&hfsc->tc, &tc_ops_hfsc);
2845 hfsc->max_rate = max_rate;
2846 netdev_dev->tc = &hfsc->tc;
/* Copies 'hc''s min_rate and max_rate into the hfsc_class that 'netdev''s
 * HFSC state keeps for 'queue_id'.
 *
 * The queue is looked up with tc_find_queue__() using hash_int(queue_id, 0);
 * the xmalloc()/hmap_insert() lines below allocate a new hfsc_class and
 * register it in hfsc->tc.queues under the same hash. */
2850 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
2851 const struct hfsc_class *hc)
2855 struct hfsc_class *hcp;
2856 struct tc_queue *queue;
2858 hfsc = hfsc_get__(netdev);
2859 hash = hash_int(queue_id, 0);
2861 queue = tc_find_queue__(netdev, queue_id, hash);
2863 hcp = hfsc_class_cast__(queue);
2865 hcp = xmalloc(sizeof *hcp);
2866 queue = &hcp->tc_queue;
2867 queue->queue_id = queue_id;
2868 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
2871 hcp->min_rate = hc->min_rate;
2872 hcp->max_rate = hc->max_rate;
/* Decodes the nested HFSC class options 'nl_options' into 'class'.
 *
 * Three service curves (RSC, FSC, USC) are read, each required to be at
 * least sizeof(struct tc_service_curve) long.  The checks below warn about:
 *
 *   - any curve with a nonzero 'm1' or 'd' (non-linear curve),
 *   - RSC and FSC having different 'm2' values,
 *   - RSC 'm2' exceeding USC 'm2'.
 *
 * On the accepting path 'min_rate' is taken from FSC 'm2' and 'max_rate'
 * from USC 'm2'. */
2876 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
2878 const struct tc_service_curve *rsc, *fsc, *usc;
2879 static const struct nl_policy tca_hfsc_policy[] = {
2881 .type = NL_A_UNSPEC,
2883 .min_len = sizeof(struct tc_service_curve),
2886 .type = NL_A_UNSPEC,
2888 .min_len = sizeof(struct tc_service_curve),
2891 .type = NL_A_UNSPEC,
2893 .min_len = sizeof(struct tc_service_curve),
2896 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
2898 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
2899 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
2900 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
2904 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
2905 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
2906 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
2908 if (rsc->m1 != 0 || rsc->d != 0 ||
2909 fsc->m1 != 0 || fsc->d != 0 ||
2910 usc->m1 != 0 || usc->d != 0) {
2911 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2912 "Non-linear service curves are not supported.");
2916 if (rsc->m2 != fsc->m2) {
2917 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2918 "Real-time service curves are not supported ");
2922 if (rsc->m2 > usc->m2) {
2923 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2924 "Min-rate service curve is greater than "
2925 "the max-rate service curve.");
2929 class->min_rate = fsc->m2;
2930 class->max_rate = usc->m2;
/* Decodes the class message 'tcmsg' with tc_parse_class(), handing 'stats'
 * straight through to it.  A handle 1:N with 1 <= N <= HFSC_N_QUEUES maps to
 * the zero-based queue id N - 1 stored in '*queue_id', and the class's
 * options attribute is decoded into '*options' by
 * hfsc_parse_tca_options__(). */
2935 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2936 struct hfsc_class *options,
2937 struct netdev_queue_stats *stats)
2940 unsigned int handle;
2941 struct nlattr *nl_options;
2943 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2949 unsigned int major, minor;
2951 major = tc_get_major(handle);
2952 minor = tc_get_minor(handle);
2953 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
2954 *queue_id = minor - 1;
2961 error = hfsc_parse_tca_options__(nl_options, options);
/* Asks the kernel (tc_query_class()) for class 'handle' under 'parent' on
 * 'netdev' and decodes the reply into 'options' and 'stats' with
 * hfsc_parse_tcmsg__().  No queue id is requested from the parser (NULL),
 * and the reply buffer is released here with ofpbuf_delete(). */
2968 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
2969 unsigned int parent, struct hfsc_class *options,
2970 struct netdev_queue_stats *stats)
2973 struct ofpbuf *reply;
2975 error = tc_query_class(netdev, handle, parent, &reply);
2980 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
2981 ofpbuf_delete(reply);
/* Fills in 'class' from the qdisc-level 'details'.
 *
 * "max-rate" is parsed as a decimal number and divided by 8 (0 when the key
 * is absent).  The netdev_get_features() lines below derive a rate from
 * 'netdev''s current features instead, again divided by 8.  Both 'min_rate'
 * and 'max_rate' of 'class' are set to the resulting value. */
2986 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct shash *details,
2987 struct hfsc_class *class)
2990 const char *max_rate_s;
2992 max_rate_s = shash_find_data(details, "max-rate");
2993 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
/* Only the "current" feature set is requested; the other outputs are NULL. */
2998 netdev_get_features(netdev, &current, NULL, NULL, NULL);
2999 max_rate = netdev_features_to_bps(current) / 8;
3002 class->min_rate = max_rate;
3003 class->max_rate = max_rate;
/* Fills in 'class' from the per-class 'details' keys "min-rate" and
 * "max-rate".
 *
 * Both values are parsed with strtoull() (no validation) and divided by 8.
 * 'min_rate' is forced to at least 1 and at most the qdisc-wide
 * hfsc->max_rate; 'max_rate' is forced to at least 'min_rate' and at most
 * hfsc->max_rate.  The locals are uint32_t, so larger parsed values are
 * truncated on assignment. */
3007 hfsc_parse_class_details__(struct netdev *netdev,
3008 const struct shash *details,
3009 struct hfsc_class * class)
3011 const struct hfsc *hfsc;
3012 uint32_t min_rate, max_rate;
3013 const char *min_rate_s, *max_rate_s;
3015 hfsc = hfsc_get__(netdev);
3016 min_rate_s = shash_find_data(details, "min-rate");
3017 max_rate_s = shash_find_data(details, "max-rate");
3019 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3020 min_rate = MAX(min_rate, 1);
3021 min_rate = MIN(min_rate, hfsc->max_rate);
3023 max_rate = (max_rate_s
3024 ? strtoull(max_rate_s, NULL, 10) / 8
3026 max_rate = MAX(max_rate, min_rate);
3027 max_rate = MIN(max_rate, hfsc->max_rate);
3029 class->min_rate = min_rate;
3030 class->max_rate = max_rate;
3035 /* Create an HFSC qdisc.
3037 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
/* tc_del_qdisc() is called first, before the new qdisc is requested.  The
 * request is an RTM_NEWQDISC with NLM_F_EXCL | NLM_F_CREATE, handle 1:0 and
 * parent TC_H_ROOT; 'opt' starts out zeroed.  Returns tc_transact()'s
 * result. */
3039 hfsc_setup_qdisc__(struct netdev * netdev)
3041 struct tcmsg *tcmsg;
3042 struct ofpbuf request;
3043 struct tc_hfsc_qopt opt;
3045 tc_del_qdisc(netdev);
3047 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3048 NLM_F_EXCL | NLM_F_CREATE, &request);
3054 tcmsg->tcm_handle = tc_make_handle(1, 0);
3055 tcmsg->tcm_parent = TC_H_ROOT;
3057 memset(&opt, 0, sizeof opt);
3060 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3061 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3063 return tc_transact(&request, NULL);
3066 /* Create an HFSC class.
3068 * Equivalent to "tc class add dev <dev> parent <parent> classid <handle> hfsc
3069 * sc rate <min_rate> ul rate <max_rate>" */
/* The request is an RTM_NEWTCLASS with NLM_F_CREATE.  The same 'min' curve
 * (m2 = class->min_rate) is sent as both TCA_HFSC_RSC and TCA_HFSC_FSC, and
 * 'max' (m2 = class->max_rate) as TCA_HFSC_USC, all nested inside
 * TCA_OPTIONS.  A failed transaction is logged with the class and parent
 * handles and both rates. */
3071 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3072 unsigned int parent, struct hfsc_class *class)
3076 struct tcmsg *tcmsg;
3077 struct ofpbuf request;
3078 struct tc_service_curve min, max;
3080 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3086 tcmsg->tcm_handle = handle;
3087 tcmsg->tcm_parent = parent;
3091 min.m2 = class->min_rate;
3095 max.m2 = class->max_rate;
3097 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3098 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3099 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3100 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3101 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3102 nl_msg_end_nested(&request, opt_offset);
3104 error = tc_transact(&request, NULL);
3106 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3107 "min-rate %ubps, max-rate %ubps (%s)",
3108 netdev_get_name(netdev),
3109 tc_get_major(handle), tc_get_minor(handle),
3110 tc_get_major(parent), tc_get_minor(parent),
3111 class->min_rate, class->max_rate, strerror(error));
/* tc_install hook for "linux-hfsc": sets up the HFSC qdisc, derives the
 * qdisc-wide class parameters from 'details', programs them as class 1:fffe
 * with parent 1:0, and records the resulting 'max_rate' through
 * hfsc_install__(). */
3118 hfsc_tc_install(struct netdev *netdev, const struct shash *details)
3121 struct hfsc_class class;
3123 error = hfsc_setup_qdisc__(netdev);
3129 hfsc_parse_qdisc_details__(netdev, details, &class);
3130 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3131 tc_make_handle(1, 0), &class);
3137 hfsc_install__(netdev, class.max_rate);
/* tc_load hook for "linux-hfsc": builds the in-memory HFSC state for
 * 'netdev' from what the kernel reports.
 *
 * Class 1:fffe is queried to obtain the qdisc-wide max rate (the result code
 * of that query is not examined), the state is created with
 * hfsc_install__(), and then every class in the queue dump that
 * hfsc_parse_tcmsg__() accepts is recorded with hfsc_update_queue__().
 * 'nlmsg' is unused. */
3142 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3145 struct nl_dump dump;
3146 struct hfsc_class hc;
3149 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3150 hfsc_install__(netdev, hc.max_rate);
3152 if (!start_queue_dump(netdev, &dump)) {
3156 while (nl_dump_next(&dump, &msg)) {
3157 unsigned int queue_id;
3159 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3160 hfsc_update_queue__(netdev, queue_id, &hc);
3164 nl_dump_done(&dump);
/* tc_destroy hook for "linux-hfsc".  Every hfsc_class is unlinked from the
 * queue map of the struct hfsc that embeds 'tc'; the _SAFE iterator is used
 * because entries are removed while iterating. */
3169 hfsc_tc_destroy(struct tc *tc)
3172 struct hfsc_class *hc, *next;
3174 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3176 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3177 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* qdisc_get hook for "linux-hfsc": adds "max-rate" to 'details' as the
 * stored hfsc->max_rate multiplied by 8 (the parsing side divides by 8),
 * formatted as a decimal string by xasprintf(). */
3186 hfsc_qdisc_get(const struct netdev *netdev, struct shash *details)
3188 const struct hfsc *hfsc;
3189 hfsc = hfsc_get__(netdev);
3190 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hfsc->max_rate));
/* qdisc_set hook for "linux-hfsc": parses the new qdisc-wide 'details',
 * sets up class 1:fffe (parent 1:0) with them, and updates the cached
 * hfsc->max_rate from the parsed value. */
3195 hfsc_qdisc_set(struct netdev *netdev, const struct shash *details)
3198 struct hfsc_class class;
3200 hfsc_parse_qdisc_details__(netdev, details, &class);
3201 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3202 tc_make_handle(1, 0), &class);
3205 hfsc_get__(netdev)->max_rate = class.max_rate;
/* class_get hook for "linux-hfsc": reports 'queue''s rates into 'details' as
 * decimal strings, multiplied by 8 (the parsing side divides by 8).
 * "max-rate" is added only when it differs from "min-rate".  'netdev' is
 * unused. */
3212 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3213 const struct tc_queue *queue, struct shash *details)
3215 const struct hfsc_class *hc;
3217 hc = hfsc_class_cast__(queue);
3218 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
3219 if (hc->min_rate != hc->max_rate) {
3220 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
/* class_set hook for "linux-hfsc": parses 'details' into 'class', programs
 * it into the kernel as class 1:(queue_id + 1) with parent 1:fffe, and
 * mirrors it into the local queue table with hfsc_update_queue__(). */
3226 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3227 const struct shash *details)
3230 struct hfsc_class class;
3232 error = hfsc_parse_class_details__(netdev, details, &class);
3237 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3238 tc_make_handle(1, 0xfffe), &class);
3243 hfsc_update_queue__(netdev, queue_id, &class);
/* class_delete hook for "linux-hfsc": deletes the kernel class
 * 1:(queue_id + 1) that backs 'queue' and unlinks the corresponding
 * hfsc_class from the HFSC queue map. */
3248 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3252 struct hfsc_class *hc;
3254 hc = hfsc_class_cast__(queue);
3255 hfsc = hfsc_get__(netdev);
3257 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3259 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* class_get_stats hook for "linux-hfsc": retrieves statistics for 'queue'
 * into 'stats' by querying class 1:(queue_id + 1) with parent 1:fffe.  Class
 * options are not requested (NULL).  Returns hfsc_query_class__()'s
 * result. */
3266 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3267 struct netdev_queue_stats *stats)
3269 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3270 tc_make_handle(1, 0xfffe), NULL, stats);
/* class_dump_stats hook for "linux-hfsc": handles one class message 'nlmsg'
 * from a dump.  The handle and statistics are extracted with
 * tc_parse_class() (options are not requested); for a handle 1:N with
 * 1 <= N <= HFSC_N_QUEUES, 'cb' is invoked with queue id N - 1, the parsed
 * statistics, and 'aux'.  'netdev' is unused. */
3274 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3275 const struct ofpbuf *nlmsg,
3276 netdev_dump_queue_stats_cb *cb, void *aux)
3278 struct netdev_queue_stats stats;
3279 unsigned int handle, major, minor;
3282 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3287 major = tc_get_major(handle);
3288 minor = tc_get_minor(handle);
3289 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3290 (*cb)(minor - 1, &stats, aux);
3295 static const struct tc_ops tc_ops_hfsc = {
3296 "hfsc", /* linux_name */
3297 "linux-hfsc", /* ovs_name */
3298 HFSC_N_QUEUES, /* n_queues */
3299 hfsc_tc_install, /* tc_install */
3300 hfsc_tc_load, /* tc_load */
3301 hfsc_tc_destroy, /* tc_destroy */
3302 hfsc_qdisc_get, /* qdisc_get */
3303 hfsc_qdisc_set, /* qdisc_set */
3304 hfsc_class_get, /* class_get */
3305 hfsc_class_set, /* class_set */
3306 hfsc_class_delete, /* class_delete */
3307 hfsc_class_get_stats, /* class_get_stats */
3308 hfsc_class_dump_stats /* class_dump_stats */
3311 /* "linux-default" traffic control class.
3313 * This class represents the default, unnamed Linux qdisc. It corresponds to
3314 * the "" (empty string) QoS type in the OVS database. */
/* Attaches a tc initialized with tc_ops_default to 'netdev''s netdev_dev.
 * 'tc' has static storage duration, so the pointer persists across calls
 * rather than being a fresh local on each call. */
3317 default_install__(struct netdev *netdev)
3319 struct netdev_dev_linux *netdev_dev =
3320 netdev_dev_linux_cast(netdev_get_dev(netdev));
3321 static struct tc *tc;
3324 tc = xmalloc(sizeof *tc);
3325 tc_init(tc, &tc_ops_default);
3327 netdev_dev->tc = tc;
3331 default_tc_install(struct netdev *netdev,
3332 const struct shash *details OVS_UNUSED)
3334 default_install__(netdev);
3339 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3341 default_install__(netdev);
3345 static const struct tc_ops tc_ops_default = {
3346 NULL, /* linux_name */
3351 NULL, /* tc_destroy */
3352 NULL, /* qdisc_get */
3353 NULL, /* qdisc_set */
3354 NULL, /* class_get */
3355 NULL, /* class_set */
3356 NULL, /* class_delete */
3357 NULL, /* class_get_stats */
3358 NULL /* class_dump_stats */
3361 /* "linux-other" traffic control class.
3366 other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3368 struct netdev_dev_linux *netdev_dev =
3369 netdev_dev_linux_cast(netdev_get_dev(netdev));
3370 static struct tc *tc;
3373 tc = xmalloc(sizeof *tc);
3374 tc_init(tc, &tc_ops_other);
3376 netdev_dev->tc = tc;
3380 static const struct tc_ops tc_ops_other = {
3381 NULL, /* linux_name */
3382 "linux-other", /* ovs_name */
3384 NULL, /* tc_install */
3386 NULL, /* tc_destroy */
3387 NULL, /* qdisc_get */
3388 NULL, /* qdisc_set */
3389 NULL, /* class_get */
3390 NULL, /* class_set */
3391 NULL, /* class_delete */
3392 NULL, /* class_get_stats */
3393 NULL /* class_dump_stats */
3396 /* Traffic control. */
3398 /* Number of kernel "tc" ticks per second. */
3399 static double ticks_per_s;
3401 /* Number of kernel "jiffies" per second. This is used for the purpose of
3402 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3403 * one jiffy's worth of data.
3405 * There are two possibilities here:
3407 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3408 * approximate range of 100 to 1024. That means that we really need to
3409 * make sure that the qdisc can buffer that much data.
3411 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3412 * has finely granular timers and there's no need to fudge additional room
3413 * for buffers. (There's no extra effort needed to implement that: the
3414 * large 'buffer_hz' is used as a divisor, so practically any number will
3415 * come out as 0 in the division. Small integer results in the case of
3416 * really high dividends won't have any real effect anyhow.)
3418 static unsigned int buffer_hz;
3420 /* Returns tc handle 'major':'minor'. */
/* 'major' is shifted left by 16 bits before being combined with 'minor' by
 * TC_H_MAKE. */
3422 tc_make_handle(unsigned int major, unsigned int minor)
3424 return TC_H_MAKE(major << 16, minor);
3427 /* Returns the major number from 'handle'. */
3429 tc_get_major(unsigned int handle)
3431 return TC_H_MAJ(handle) >> 16;
3434 /* Returns the minor number from 'handle'. */
3436 tc_get_minor(unsigned int handle)
3438 return TC_H_MIN(handle);
/* Starts a traffic-control Netlink request in 'request' for 'netdev'.
 *
 * 'request' is initialized here (512-byte initial size) and receives a
 * Netlink header of the given 'type' with NLM_F_REQUEST | 'flags', followed
 * by a zero-filled struct tcmsg whose family is AF_UNSPEC and whose ifindex
 * is the one get_ifindex() reports for 'netdev'.  The handle and parent
 * fields are left for the caller, as noted below. */
3441 static struct tcmsg *
3442 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3443 struct ofpbuf *request)
3445 struct tcmsg *tcmsg;
3449 error = get_ifindex(netdev, &ifindex);
3454 ofpbuf_init(request, 512);
3455 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3456 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3457 tcmsg->tcm_family = AF_UNSPEC;
3458 tcmsg->tcm_ifindex = ifindex;
3459 /* Caller should fill in tcmsg->tcm_handle. */
3460 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' on 'rtnl_sock' with nl_sock_transact(), passing 'replyp'
 * through for the reply.  'request' is uninitialized afterwards whatever the
 * outcome, so callers must not reuse or free it. */
3466 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3468 int error = nl_sock_transact(rtnl_sock, request, replyp);
3469 ofpbuf_uninit(request);
3476 /* The values in psched are not individually very meaningful, but they are
3477 * important. The tables below show some values seen in the wild.
3481 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3482 * (Before that, there are hints that it was 1000000000.)
3484 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3488 * -----------------------------------
3489 * [1] 000c8000 000f4240 000f4240 00000064
3490 * [2] 000003e8 00000400 000f4240 3b9aca00
3491 * [3] 000003e8 00000400 000f4240 3b9aca00
3492 * [4] 000003e8 00000400 000f4240 00000064
3493 * [5] 000003e8 00000040 000f4240 3b9aca00
3494 * [6] 000003e8 00000040 000f4240 000000f9
3496 * a b c d ticks_per_s buffer_hz
3497 * ------- --------- ---------- ------------- ----------- -------------
3498 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3499 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3500 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3501 * [4] 1,000 1,024 1,000,000 100 976,562 100
3502 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3503 * [6] 1,000 64 1,000,000 249 15,625,000 249
3505 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3506 * [2] 2.6.26-1-686-bigmem from Debian lenny
3507 * [3] 2.6.26-2-sparc64 from Debian lenny
3508 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3509 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3510 * [6] 2.6.34 from kernel.org on KVM
3512 static const char fn[] = "/proc/net/psched";
3513 unsigned int a, b, c, d;
3519 stream = fopen(fn, "r");
3521 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3525 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3526 VLOG_WARN("%s: read failed", fn);
3530 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3534 VLOG_WARN("%s: invalid scheduler parameters", fn);
3538 ticks_per_s = (double) a * c / b;
3542 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3545 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3548 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3549 * rate of 'rate' bytes per second. */
/* NOTE(review): 'rate * ticks' is evaluated in unsigned int and only then
 * divided by the double 'ticks_per_s', so a large product wraps around
 * before the division.  Widening one operand first would avoid that. */
3551 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3556 return (rate * ticks) / ticks_per_s;
3559 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3560 * rate of 'rate' bytes per second. */
/* 'ticks_per_s' is truncated to unsigned long long before the multiply, so
 * the product is computed in 64-bit integer arithmetic.  A 'rate' of 0 yields
 * 0 instead of dividing by zero. */
3562 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3567 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3570 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3571 * a transmission rate of 'rate' bytes per second. */
/* This is an unsigned integer division, so the result is 0 whenever
 * 'buffer_hz' is larger than 'rate'. */
3573 tc_buffer_per_jiffy(unsigned int rate)
3578 return rate / buffer_hz;
3581 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3582 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3583 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3584 * stores NULL into it if it is absent.
3586 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3589 * Returns 0 if successful, otherwise a positive errno value. */
3591 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3592 struct nlattr **options)
3594 static const struct nl_policy tca_policy[] = {
3595 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3596 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3598 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3600 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3601 tca_policy, ta, ARRAY_SIZE(ta))) {
3602 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3607 *kind = nl_attr_get_string(ta[TCA_KIND]);
3611 *options = ta[TCA_OPTIONS];
3626 /* Given Netlink 'msg' that describes a class, extracts its handle (class ID)
3627 * into '*handlep', its TCA_OPTIONS attribute
3628 * into '*options', and its queue statistics into '*stats'. Any of the output
3629 * arguments may be null.
3631 * Returns 0 if successful, otherwise a positive errno value. */
/* Both TCA_OPTIONS and TCA_STATS2 are mandatory in the policy below, so a
 * message lacking either one fails to parse even when the caller did not
 * ask for that output. */
3633 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3634 struct nlattr **options, struct netdev_queue_stats *stats)
3636 static const struct nl_policy tca_policy[] = {
3637 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3638 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3640 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3642 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3643 tca_policy, ta, ARRAY_SIZE(ta))) {
3644 VLOG_WARN_RL(&rl, "failed to parse class message");
3649 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3650 *handlep = tc->tcm_handle;
3654 *options = ta[TCA_OPTIONS];
3658 const struct gnet_stats_queue *gsq;
3659 struct gnet_stats_basic gsb;
3661 static const struct nl_policy stats_policy[] = {
3662 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3663 .min_len = sizeof gsb },
3664 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3665 .min_len = sizeof *gsq },
3667 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3669 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3670 sa, ARRAY_SIZE(sa))) {
3671 VLOG_WARN_RL(&rl, "failed to parse class stats");
3675 /* Alignment issues screw up the length of struct gnet_stats_basic on
3676 * some arch/bitsize combinations. Newer versions of Linux have a
3677 * struct gnet_stats_basic_packed, but we can't depend on that. The
3678 * easiest thing to do is just to make a copy. */
3679 memset(&gsb, 0, sizeof gsb);
3680 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3681 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3682 stats->tx_bytes = gsb.bytes;
3683 stats->tx_packets = gsb.packets;
3685 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3686 stats->tx_errors = gsq->drops;
3696 memset(stats, 0, sizeof *stats);
3701 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
3704 tc_query_class(const struct netdev *netdev,
3705 unsigned int handle, unsigned int parent,
3706 struct ofpbuf **replyp)
3708 struct ofpbuf request;
3709 struct tcmsg *tcmsg;
3712 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3716 tcmsg->tcm_handle = handle;
3717 tcmsg->tcm_parent = parent;
3719 error = tc_transact(&request, replyp);
3721 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3722 netdev_get_name(netdev),
3723 tc_get_major(handle), tc_get_minor(handle),
3724 tc_get_major(parent), tc_get_minor(parent),
3730 /* Equivalent to "tc class del dev <name> handle <handle>". */
3732 tc_delete_class(const struct netdev *netdev, unsigned int handle)
3734 struct ofpbuf request;
3735 struct tcmsg *tcmsg;
3738 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
3742 tcmsg->tcm_handle = handle;
3743 tcmsg->tcm_parent = 0;
3745 error = tc_transact(&request, NULL);
3747 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3748 netdev_get_name(netdev),
3749 tc_get_major(handle), tc_get_minor(handle),
3755 /* Equivalent to "tc qdisc del dev <name> root". */
/* The request is an RTM_DELQDISC for handle 1:0 with parent TC_H_ROOT.  On
 * success, any tc object still attached to the netdev_dev is torn down
 * through its ops->tc_destroy hook (when one is set) and the pointer is
 * cleared. */
3757 tc_del_qdisc(struct netdev *netdev)
3759 struct netdev_dev_linux *netdev_dev =
3760 netdev_dev_linux_cast(netdev_get_dev(netdev));
3761 struct ofpbuf request;
3762 struct tcmsg *tcmsg;
3765 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
3769 tcmsg->tcm_handle = tc_make_handle(1, 0);
3770 tcmsg->tcm_parent = TC_H_ROOT;
3772 error = tc_transact(&request, NULL);
3773 if (error == EINVAL) {
3774 /* EINVAL probably means that the default qdisc was in use, in which
3775 * case we've accomplished our purpose. */
3778 if (!error && netdev_dev->tc) {
3779 if (netdev_dev->tc->ops->tc_destroy) {
3780 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
3782 netdev_dev->tc = NULL;
/* Overview of the selection logic in tc_query_qdisc() below: the qdisc is
 * requested by handle 1:0.  When the reply parses, its 'kind' is mapped to a
 * tc_ops with tc_lookup_linux_name(), and an unknown kind is logged and
 * handled by tc_ops_other.  ENOENT selects tc_ops_default; the remaining
 * failure path logs a warning and uses tc_ops_other.  The chosen
 * ops->tc_load() is then run, and the function returns 'error' if nonzero,
 * otherwise 'load_error'. */
3787 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
3788 * kernel to determine what they are. Returns 0 if successful, otherwise a
3789 * positive errno value. */
3791 tc_query_qdisc(const struct netdev *netdev)
3793 struct netdev_dev_linux *netdev_dev =
3794 netdev_dev_linux_cast(netdev_get_dev(netdev));
3795 struct ofpbuf request, *qdisc;
3796 const struct tc_ops *ops;
3797 struct tcmsg *tcmsg;
3801 if (netdev_dev->tc) {
3805 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
3806 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
3807 * 2.6.35 without that fix backported to it.
3809 * To avoid the OOPS, we must not make a request that would attempt to dump
3810 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
3811 * few others. There are a few ways that I can see to do this, but most of
3812 * them seem to be racy (and if you lose the race the kernel OOPSes). The
3813 * technique chosen here is to assume that any non-default qdisc that we
3814 * create will have a class with handle 1:0. The built-in qdiscs only have
3815 * a class with handle 0:0.
3817 * We could check for Linux 2.6.35+ and use a more straightforward method
3819 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
3823 tcmsg->tcm_handle = tc_make_handle(1, 0);
3824 tcmsg->tcm_parent = 0;
3826 /* Figure out what tc class to instantiate. */
3827 error = tc_transact(&request, &qdisc);
3831 error = tc_parse_qdisc(qdisc, &kind, NULL);
3833 ops = &tc_ops_other;
3835 ops = tc_lookup_linux_name(kind);
3837 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
3838 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
3840 ops = &tc_ops_other;
3843 } else if (error == ENOENT) {
3844 /* Either it's a built-in qdisc, or it's a qdisc set up by some
3845 * other entity that doesn't have a handle 1:0. We will assume
3846 * that it's the system default qdisc. */
3847 ops = &tc_ops_default;
3850 /* Who knows? Maybe the device got deleted. */
3851 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
3852 netdev_get_name(netdev), strerror(error));
3853 ops = &tc_ops_other;
3856 /* Instantiate it. */
3857 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
3858 assert((load_error == 0) == (netdev_dev->tc != NULL));
3859 ofpbuf_delete(qdisc);
3861 return error ? error : load_error;
3864 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
3865 approximate the time to transmit packets of various lengths. For an MTU of
3866 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
3867 represents two possible packet lengths; for an MTU of 513 through 1024, four
3868 possible lengths; and so on.
3870 Returns, for the specified 'mtu', the number of bits that packet lengths
3871 need to be shifted right to fit within such a 256-entry table. */
/* The shift is computed for the frame size, i.e. 'mtu' plus the Ethernet and
 * VLAN header lengths, not for the bare 'mtu'. */
3873 tc_calc_cell_log(unsigned int mtu)
3878 mtu = ETH_PAYLOAD_MAX;
3880 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
3882 for (cell_log = 0; mtu >= 256; cell_log++) {
3889 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
3892 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
3894 memset(rate, 0, sizeof *rate);
3895 rate->cell_log = tc_calc_cell_log(mtu);
3896 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
3897 /* rate->cell_align = 0; */ /* distro headers. */
3898 rate->mpu = ETH_TOTAL_MIN;
3902 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
3903 * attribute of the specified "type".
3905 * See tc_calc_cell_log() above for a description of "rtab"s. */
/* Entry i holds the tick count for a packet of (i + 1) << rate->cell_log
 * bytes, with the size raised to rate->mpu when it would be smaller.  The
 * table storage is reserved uninitialized in 'msg' (TC_RTAB_SIZE bytes) and
 * then filled in place. */
3907 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
3912 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
3913 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
3914 unsigned packet_size = (i + 1) << rate->cell_log;
3915 if (packet_size < rate->mpu) {
3916 packet_size = rate->mpu;
3918 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * 0 is fine.)
 *
 * The burst actually used is never smaller than tc_buffer_per_jiffy(Bps) plus
 * 'mtu'; the result is that burst converted by tc_bytes_to_ticks(). */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;

    return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
}
/* Public utility functions. */
/* Copies every statistics counter listed below from '*src' to '*dst'.
 *
 * The expansion refers to the identifiers 'dst' and 'src' directly, so both
 * must be in scope at the point of use as pointers to structures that have
 * members with these names.  The member types may differ between the two
 * structures; ordinary assignment conversion applies.
 *
 * The body is wrapped in do { } while (0) so that "COPY_NETDEV_STATS;" acts
 * as a single statement, including under an unbraced 'if' or 'else'. */
#define COPY_NETDEV_STATS                                   \
    do {                                                    \
        dst->rx_packets = src->rx_packets;                  \
        dst->tx_packets = src->tx_packets;                  \
        dst->rx_bytes = src->rx_bytes;                      \
        dst->tx_bytes = src->tx_bytes;                      \
        dst->rx_errors = src->rx_errors;                    \
        dst->tx_errors = src->tx_errors;                    \
        dst->rx_dropped = src->rx_dropped;                  \
        dst->tx_dropped = src->tx_dropped;                  \
        dst->multicast = src->multicast;                    \
        dst->collisions = src->collisions;                  \
        dst->rx_length_errors = src->rx_length_errors;      \
        dst->rx_over_errors = src->rx_over_errors;          \
        dst->rx_crc_errors = src->rx_crc_errors;            \
        dst->rx_frame_errors = src->rx_frame_errors;        \
        dst->rx_fifo_errors = src->rx_fifo_errors;          \
        dst->rx_missed_errors = src->rx_missed_errors;      \
        dst->tx_aborted_errors = src->tx_aborted_errors;    \
        dst->tx_carrier_errors = src->tx_carrier_errors;    \
        dst->tx_fifo_errors = src->tx_fifo_errors;          \
        dst->tx_heartbeat_errors = src->tx_heartbeat_errors; \
        dst->tx_window_errors = src->tx_window_errors;      \
    } while (0)
3958 /* Copies 'src' into 'dst', performing format conversion in the process. */
3960 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
3961 const struct rtnl_link_stats *src)
3966 /* Copies 'src' into 'dst', performing format conversion in the process. */
3968 netdev_stats_from_rtnl_link_stats64(struct netdev_stats *dst,
3969 const struct rtnl_link_stats64 *src)
3974 /* Copies 'src' into 'dst', performing format conversion in the process. */
3976 netdev_stats_to_rtnl_link_stats64(struct rtnl_link_stats64 *dst,
3977 const struct netdev_stats *src)
/* Utility functions. */
3985 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
3987 /* Policy for RTNLGRP_LINK messages.
3989 * There are *many* more fields in these messages, but currently we only
3990 * care about these fields. */
3991 static const struct nl_policy rtnlgrp_link_policy[] = {
3992 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
3993 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
3994 .min_len = sizeof(struct rtnl_link_stats) },
3997 struct ofpbuf request;
3998 struct ofpbuf *reply;
3999 struct ifinfomsg *ifi;
4000 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
4003 ofpbuf_init(&request, 0);
4004 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4005 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4006 ifi->ifi_family = PF_UNSPEC;
4007 ifi->ifi_index = ifindex;
4008 error = nl_sock_transact(rtnl_sock, &request, &reply);
4009 ofpbuf_uninit(&request);
4014 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4015 rtnlgrp_link_policy,
4016 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4017 ofpbuf_delete(reply);
4021 if (!attrs[IFLA_STATS]) {
4022 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4023 ofpbuf_delete(reply);
4027 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4029 ofpbuf_delete(reply);
4035 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4037 static const char fn[] = "/proc/net/dev";
4042 stream = fopen(fn, "r");
4044 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
4049 while (fgets(line, sizeof line, stream)) {
4052 #define X64 "%"SCNu64
4055 X64 X64 X64 X64 X64 X64 X64 "%*u"
4056 X64 X64 X64 X64 X64 X64 X64 "%*u",
4062 &stats->rx_fifo_errors,
4063 &stats->rx_frame_errors,
4069 &stats->tx_fifo_errors,
4071 &stats->tx_carrier_errors) != 15) {
4072 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4073 } else if (!strcmp(devname, netdev_name)) {
4074 stats->rx_length_errors = UINT64_MAX;
4075 stats->rx_over_errors = UINT64_MAX;
4076 stats->rx_crc_errors = UINT64_MAX;
4077 stats->rx_missed_errors = UINT64_MAX;
4078 stats->tx_aborted_errors = UINT64_MAX;
4079 stats->tx_heartbeat_errors = UINT64_MAX;
4080 stats->tx_window_errors = UINT64_MAX;
4086 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
4092 get_flags(const struct netdev *netdev, int *flags)
4097 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCGIFFLAGS,
4099 *flags = ifr.ifr_flags;
4104 set_flags(struct netdev *netdev, int flags)
4108 ifr.ifr_flags = flags;
4109 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
4114 do_get_ifindex(const char *netdev_name)
4118 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4119 COVERAGE_INC(netdev_get_ifindex);
4120 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4121 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4122 netdev_name, strerror(errno));
4125 return ifr.ifr_ifindex;
4129 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4131 struct netdev_dev_linux *netdev_dev =
4132 netdev_dev_linux_cast(netdev_get_dev(netdev_));
4134 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
4135 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4139 netdev_dev->cache_valid |= VALID_IFINDEX;
4140 netdev_dev->ifindex = ifindex;
4142 *ifindexp = netdev_dev->ifindex;
4147 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4152 memset(&ifr, 0, sizeof ifr);
4153 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4154 COVERAGE_INC(netdev_get_hwaddr);
4155 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4156 VLOG_ERR("ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4157 netdev_name, strerror(errno));
4160 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4161 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4162 VLOG_WARN("%s device has unknown hardware address family %d",
4163 netdev_name, hwaddr_family);
4165 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
4170 set_etheraddr(const char *netdev_name, int hwaddr_family,
4171 const uint8_t mac[ETH_ADDR_LEN])
4175 memset(&ifr, 0, sizeof ifr);
4176 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4177 ifr.ifr_hwaddr.sa_family = hwaddr_family;
4178 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4179 COVERAGE_INC(netdev_set_hwaddr);
4180 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4181 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4182 netdev_name, strerror(errno));
4189 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4190 int cmd, const char *cmd_name)
4194 memset(&ifr, 0, sizeof ifr);
4195 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4196 ifr.ifr_data = (caddr_t) ecmd;
4199 COVERAGE_INC(netdev_ethtool);
4200 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4203 if (errno != EOPNOTSUPP) {
4204 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4205 "failed: %s", cmd_name, name, strerror(errno));
4207 /* The device doesn't support this operation. That's pretty
4208 * common, so there's no point in logging anything. */
4215 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4216 const char *cmd_name)
4218 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4219 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4220 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
4228 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4229 int cmd, const char *cmd_name)
4234 ifr.ifr_addr.sa_family = AF_INET;
4235 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
4237 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4238 *ip = sin->sin_addr;