/*
 * Copyright (c) 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
19 #include "netdev-linux.h"
23 #include <arpa/inet.h>
25 #include <linux/filter.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <linux/version.h>
37 #include <sys/types.h>
38 #include <sys/ioctl.h>
39 #include <sys/socket.h>
40 #include <netpacket/packet.h>
42 #include <net/if_arp.h>
43 #include <net/if_packet.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
52 #include "dpif-linux.h"
53 #include "dynamic-string.h"
54 #include "fatal-signal.h"
57 #include "netdev-provider.h"
58 #include "netdev-vport.h"
59 #include "netlink-notifier.h"
60 #include "netlink-socket.h"
63 #include "openflow/openflow.h"
65 #include "poll-loop.h"
66 #include "rtnetlink-link.h"
68 #include "socket-util.h"
71 #include "unaligned.h"
74 VLOG_DEFINE_THIS_MODULE(netdev_linux);
76 COVERAGE_DEFINE(netdev_set_policing);
77 COVERAGE_DEFINE(netdev_arp_lookup);
78 COVERAGE_DEFINE(netdev_get_ifindex);
79 COVERAGE_DEFINE(netdev_get_hwaddr);
80 COVERAGE_DEFINE(netdev_set_hwaddr);
81 COVERAGE_DEFINE(netdev_get_ethtool);
82 COVERAGE_DEFINE(netdev_set_ethtool);
/* These were introduced in Linux 2.6.14, so they might be missing if we have
 * old headers. */
#ifndef ADVERTISED_Pause
#define ADVERTISED_Pause                (1 << 13)
#endif
#ifndef ADVERTISED_Asym_Pause
#define ADVERTISED_Asym_Pause           (1 << 14)
#endif

/* These were introduced in Linux 2.6.24, so they might be missing if we
 * have old headers. */
#ifndef ETHTOOL_GFLAGS
#define ETHTOOL_GFLAGS       0x00000025 /* Get flags bitmap(ethtool_value) */
#endif
#ifndef ETHTOOL_SFLAGS
#define ETHTOOL_SFLAGS       0x00000026 /* Set flags bitmap(ethtool_value) */
#endif

/* This was introduced in Linux 2.6.25, so it might be missing if we have old
 * headers. */
#ifndef TC_RTAB_SIZE
#define TC_RTAB_SIZE 1024
#endif
/* Bits in 'netdev_linux''s 'cache_valid' member: each bit marks one of the
 * on-demand cached fields below as currently valid. */
enum {
    VALID_IFINDEX           = 1 << 0,
    VALID_ETHERADDR         = 1 << 1,
    /* NOTE(review): bits 2-4 were lost in this listing; reconstructed from
     * the fields cached below (IPv4 address/netmask, IPv6, MTU) -- confirm
     * against upstream. */
    VALID_IN4               = 1 << 2,
    VALID_IN6               = 1 << 3,
    VALID_MTU               = 1 << 4,
    VALID_POLICING          = 1 << 5,
    VALID_VPORT_STAT_ERROR  = 1 << 6,
    VALID_DRVINFO           = 1 << 7,
    VALID_FEATURES          = 1 << 8,
};
121 /* Traffic control. */
123 /* An instance of a traffic control class. Always associated with a particular
126 * Each TC implementation subclasses this with whatever additional data it
129 const struct tc_ops *ops;
130 struct hmap queues; /* Contains "struct tc_queue"s.
131 * Read by generic TC layer.
132 * Written only by TC implementation. */
135 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
137 /* One traffic control queue.
139 * Each TC implementation subclasses this with whatever additional data it
142 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
143 unsigned int queue_id; /* OpenFlow queue ID. */
144 long long int created; /* Time queue was created, in msecs. */
147 /* A particular kind of traffic control. Each implementation generally maps to
148 * one particular Linux qdisc class.
150 * The functions below return 0 if successful or a positive errno value on
151 * failure, except where otherwise noted. All of them must be provided, except
152 * where otherwise noted. */
154 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
155 * This is null for tc_ops_default and tc_ops_other, for which there are no
156 * appropriate values. */
157 const char *linux_name;
159 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
160 const char *ovs_name;
162 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
163 * queues. The queues are numbered 0 through n_queues - 1. */
164 unsigned int n_queues;
166 /* Called to install this TC class on 'netdev'. The implementation should
167 * make the Netlink calls required to set up 'netdev' with the right qdisc
168 * and configure it according to 'details'. The implementation may assume
169 * that the current qdisc is the default; that is, there is no need for it
170 * to delete the current qdisc before installing itself.
172 * The contents of 'details' should be documented as valid for 'ovs_name'
173 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
174 * (which is built as ovs-vswitchd.conf.db(8)).
176 * This function must return 0 if and only if it sets 'netdev->tc' to an
177 * initialized 'struct tc'.
179 * (This function is null for tc_ops_other, which cannot be installed. For
180 * other TC classes it should always be nonnull.) */
181 int (*tc_install)(struct netdev *netdev, const struct smap *details);
183 /* Called when the netdev code determines (through a Netlink query) that
184 * this TC class's qdisc is installed on 'netdev', but we didn't install
185 * it ourselves and so don't know any of the details.
187 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
188 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
189 * implementation should parse the other attributes of 'nlmsg' as
190 * necessary to determine its configuration. If necessary it should also
191 * use Netlink queries to determine the configuration of queues on
194 * This function must return 0 if and only if it sets 'netdev->tc' to an
195 * initialized 'struct tc'. */
196 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
198 /* Destroys the data structures allocated by the implementation as part of
199 * 'tc'. (This includes destroying 'tc->queues' by calling
202 * The implementation should not need to perform any Netlink calls. If
203 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
204 * (But it may not be desirable.)
206 * This function may be null if 'tc' is trivial. */
207 void (*tc_destroy)(struct tc *tc);
209 /* Retrieves details of 'netdev->tc' configuration into 'details'.
211 * The implementation should not need to perform any Netlink calls, because
212 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
213 * cached the configuration.
215 * The contents of 'details' should be documented as valid for 'ovs_name'
216 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
217 * (which is built as ovs-vswitchd.conf.db(8)).
219 * This function may be null if 'tc' is not configurable.
221 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
223 /* Reconfigures 'netdev->tc' according to 'details', performing any
224 * required Netlink calls to complete the reconfiguration.
226 * The contents of 'details' should be documented as valid for 'ovs_name'
227 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
228 * (which is built as ovs-vswitchd.conf.db(8)).
230 * This function may be null if 'tc' is not configurable.
232 int (*qdisc_set)(struct netdev *, const struct smap *details);
234 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
235 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
237 * The contents of 'details' should be documented as valid for 'ovs_name'
238 * in the "other_config" column in the "Queue" table in
239 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
241 * The implementation should not need to perform any Netlink calls, because
242 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
243 * cached the queue configuration.
245 * This function may be null if 'tc' does not have queues ('n_queues' is
247 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
248 struct smap *details);
250 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
251 * 'details', perfoming any required Netlink calls to complete the
252 * reconfiguration. The caller ensures that 'queue_id' is less than
255 * The contents of 'details' should be documented as valid for 'ovs_name'
256 * in the "other_config" column in the "Queue" table in
257 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
259 * This function may be null if 'tc' does not have queues or its queues are
260 * not configurable. */
261 int (*class_set)(struct netdev *, unsigned int queue_id,
262 const struct smap *details);
264 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
265 * tc_queue's within 'netdev->tc->queues'.
267 * This function may be null if 'tc' does not have queues or its queues
268 * cannot be deleted. */
269 int (*class_delete)(struct netdev *, struct tc_queue *queue);
271 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
272 * 'struct tc_queue's within 'netdev->tc->queues'.
274 * On success, initializes '*stats'.
276 * This function may be null if 'tc' does not have queues or if it cannot
277 * report queue statistics. */
278 int (*class_get_stats)(const struct netdev *netdev,
279 const struct tc_queue *queue,
280 struct netdev_queue_stats *stats);
282 /* Extracts queue stats from 'nlmsg', which is a response to a
283 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
285 * This function may be null if 'tc' does not have queues or if it cannot
286 * report queue statistics. */
287 int (*class_dump_stats)(const struct netdev *netdev,
288 const struct ofpbuf *nlmsg,
289 netdev_dump_queue_stats_cb *cb, void *aux);
293 tc_init(struct tc *tc, const struct tc_ops *ops)
296 hmap_init(&tc->queues);
300 tc_destroy(struct tc *tc)
302 hmap_destroy(&tc->queues);
305 static const struct tc_ops tc_ops_htb;
306 static const struct tc_ops tc_ops_hfsc;
307 static const struct tc_ops tc_ops_default;
308 static const struct tc_ops tc_ops_other;
310 static const struct tc_ops *const tcs[] = {
311 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
312 &tc_ops_hfsc, /* Hierarchical fair service curve. */
313 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
314 &tc_ops_other, /* Some other qdisc. */
318 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
319 static unsigned int tc_get_major(unsigned int handle);
320 static unsigned int tc_get_minor(unsigned int handle);
322 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
323 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
324 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
326 static struct tcmsg *tc_make_request(const struct netdev *, int type,
327 unsigned int flags, struct ofpbuf *);
328 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
329 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
330 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
333 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
334 struct nlattr **options);
335 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
336 struct nlattr **options,
337 struct netdev_queue_stats *);
338 static int tc_query_class(const struct netdev *,
339 unsigned int handle, unsigned int parent,
340 struct ofpbuf **replyp);
341 static int tc_delete_class(const struct netdev *, unsigned int handle);
343 static int tc_del_qdisc(struct netdev *netdev);
344 static int tc_query_qdisc(const struct netdev *netdev);
346 static int tc_calc_cell_log(unsigned int mtu);
347 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
348 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
349 const struct tc_ratespec *rate);
350 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
352 struct netdev_linux {
355 /* Protects all members below. */
356 struct ovs_mutex mutex;
358 unsigned int cache_valid;
359 unsigned int change_seq;
361 bool miimon; /* Link status of last poll. */
362 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
363 struct timer miimon_timer;
365 /* The following are figured out "on demand" only. They are only valid
366 * when the corresponding VALID_* bit in 'cache_valid' is set. */
368 uint8_t etheraddr[ETH_ADDR_LEN];
369 struct in_addr address, netmask;
372 unsigned int ifi_flags;
373 long long int carrier_resets;
374 uint32_t kbits_rate; /* Policing data. */
375 uint32_t kbits_burst;
376 int vport_stats_error; /* Cached error code from vport_get_stats().
377 0 or an errno value. */
378 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
379 int ether_addr_error; /* Cached error code from set/get etheraddr. */
380 int netdev_policing_error; /* Cached error code from set policing. */
381 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
382 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
384 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
385 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
386 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
388 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
391 /* For devices of class netdev_tap_class only. */
395 struct netdev_rx_linux {
401 /* This is set pretty low because we probably won't learn anything from the
402 * additional log messages. */
403 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
405 static void netdev_linux_run(void);
407 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
408 int cmd, const char *cmd_name);
409 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
410 int cmd, const char *cmd_name);
411 static int get_flags(const struct netdev *, unsigned int *flags);
412 static int set_flags(const char *, unsigned int flags);
413 static int do_get_ifindex(const char *netdev_name);
414 static int get_ifindex(const struct netdev *, int *ifindexp);
415 static int do_set_addr(struct netdev *netdev,
416 int ioctl_nr, const char *ioctl_name,
417 struct in_addr addr);
418 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
419 static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
420 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
421 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
422 static int af_packet_sock(void);
423 static void netdev_linux_miimon_run(void);
424 static void netdev_linux_miimon_wait(void);
427 is_netdev_linux_class(const struct netdev_class *netdev_class)
429 return netdev_class->run == netdev_linux_run;
433 is_tap_netdev(const struct netdev *netdev)
435 return netdev_get_class(netdev) == &netdev_tap_class;
438 static struct netdev_linux *
439 netdev_linux_cast(const struct netdev *netdev)
441 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
443 return CONTAINER_OF(netdev, struct netdev_linux, up);
446 static struct netdev_rx_linux *
447 netdev_rx_linux_cast(const struct netdev_rx *rx)
449 ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
450 return CONTAINER_OF(rx, struct netdev_rx_linux, up);
453 static void netdev_linux_update(struct netdev_linux *netdev,
454 const struct rtnetlink_link_change *)
455 OVS_REQUIRES(netdev->mutex);
456 static void netdev_linux_changed(struct netdev_linux *netdev,
457 unsigned int ifi_flags, unsigned int mask)
458 OVS_REQUIRES(netdev->mutex);
460 /* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK changes, or NULL
461 * if no such socket could be created. */
462 static struct nl_sock *
463 netdev_linux_notify_sock(void)
465 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
466 static struct nl_sock *sock;
468 if (ovsthread_once_start(&once)) {
471 error = nl_sock_create(NETLINK_ROUTE, &sock);
473 error = nl_sock_join_mcgroup(sock, RTNLGRP_LINK);
475 nl_sock_destroy(sock);
479 ovsthread_once_done(&once);
486 netdev_linux_run(void)
488 struct nl_sock *sock;
491 netdev_linux_miimon_run();
493 sock = netdev_linux_notify_sock();
499 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
500 uint64_t buf_stub[4096 / 8];
503 ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
504 error = nl_sock_recv(sock, &buf, false);
506 struct rtnetlink_link_change change;
508 if (rtnetlink_link_parse(&buf, &change)) {
509 struct netdev *netdev_ = netdev_from_name(change.ifname);
510 if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
511 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
513 ovs_mutex_lock(&netdev->mutex);
514 netdev_linux_update(netdev, &change);
515 ovs_mutex_unlock(&netdev->mutex);
517 netdev_close(netdev_);
519 } else if (error == ENOBUFS) {
520 struct shash device_shash;
521 struct shash_node *node;
525 shash_init(&device_shash);
526 netdev_get_devices(&netdev_linux_class, &device_shash);
527 SHASH_FOR_EACH (node, &device_shash) {
528 struct netdev *netdev_ = node->data;
529 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
532 ovs_mutex_lock(&netdev->mutex);
533 get_flags(netdev_, &flags);
534 netdev_linux_changed(netdev, flags, 0);
535 ovs_mutex_unlock(&netdev->mutex);
537 netdev_close(netdev_);
539 shash_destroy(&device_shash);
540 } else if (error != EAGAIN) {
541 VLOG_WARN_RL(&rl, "error reading or parsing netlink (%s)",
542 ovs_strerror(error));
/* Arranges for the next poll_block() to wake up when netdev_linux_run() has
 * work to do: a miimon timer expiry or readable rtnetlink traffic. */
static void
netdev_linux_wait(void)
{
    struct nl_sock *sock;

    netdev_linux_miimon_wait();
    sock = netdev_linux_notify_sock();
    if (sock) {
        nl_sock_wait(sock, POLLIN);
    }
}
561 netdev_linux_changed(struct netdev_linux *dev,
562 unsigned int ifi_flags, unsigned int mask)
563 OVS_REQUIRES(dev->mutex)
566 if (!dev->change_seq) {
570 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
571 dev->carrier_resets++;
573 dev->ifi_flags = ifi_flags;
575 dev->cache_valid &= mask;
579 netdev_linux_update(struct netdev_linux *dev,
580 const struct rtnetlink_link_change *change)
581 OVS_REQUIRES(dev->mutex)
583 if (change->nlmsg_type == RTM_NEWLINK) {
585 netdev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
587 /* Update netdev from rtnl-change msg. */
589 dev->mtu = change->mtu;
590 dev->cache_valid |= VALID_MTU;
591 dev->netdev_mtu_error = 0;
594 if (!eth_addr_is_zero(change->addr)) {
595 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
596 dev->cache_valid |= VALID_ETHERADDR;
597 dev->ether_addr_error = 0;
600 dev->ifindex = change->ifi_index;
601 dev->cache_valid |= VALID_IFINDEX;
602 dev->get_ifindex_error = 0;
605 netdev_linux_changed(dev, change->ifi_flags, 0);
609 static struct netdev *
610 netdev_linux_alloc(void)
612 struct netdev_linux *netdev = xzalloc(sizeof *netdev);
617 netdev_linux_common_construct(struct netdev_linux *netdev)
619 ovs_mutex_init(&netdev->mutex, PTHREAD_MUTEX_NORMAL);
620 netdev->change_seq = 1;
623 /* Creates system and internal devices. */
625 netdev_linux_construct(struct netdev *netdev_)
627 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
630 netdev_linux_common_construct(netdev);
632 error = get_flags(&netdev->up, &netdev->ifi_flags);
633 if (error == ENODEV) {
634 if (netdev->up.netdev_class != &netdev_internal_class) {
635 /* The device does not exist, so don't allow it to be opened. */
638 /* "Internal" netdevs have to be created as netdev objects before
639 * they exist in the kernel, because creating them in the kernel
640 * happens by passing a netdev object to dpif_port_add().
641 * Therefore, ignore the error. */
648 /* For most types of netdevs we open the device for each call of
649 * netdev_open(). However, this is not the case with tap devices,
650 * since it is only possible to open the device once. In this
651 * situation we share a single file descriptor, and consequently
652 * buffers, across all readers. Therefore once data is read it will
653 * be unavailable to other reads for tap devices. */
655 netdev_linux_construct_tap(struct netdev *netdev_)
657 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
658 static const char tap_dev[] = "/dev/net/tun";
659 const char *name = netdev_->name;
663 netdev_linux_common_construct(netdev);
665 /* Open tap device. */
666 netdev->tap_fd = open(tap_dev, O_RDWR);
667 if (netdev->tap_fd < 0) {
669 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
673 /* Create tap device. */
674 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
675 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
676 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
677 VLOG_WARN("%s: creating tap device failed: %s", name,
678 ovs_strerror(errno));
683 /* Make non-blocking. */
684 error = set_nonblocking(netdev->tap_fd);
692 close(netdev->tap_fd);
697 netdev_linux_destruct(struct netdev *netdev_)
699 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
701 if (netdev->tc && netdev->tc->ops->tc_destroy) {
702 netdev->tc->ops->tc_destroy(netdev->tc);
705 if (netdev_get_class(netdev_) == &netdev_tap_class
706 && netdev->tap_fd >= 0)
708 close(netdev->tap_fd);
711 ovs_mutex_destroy(&netdev->mutex);
/* Frees the storage allocated by netdev_linux_alloc(). */
static void
netdev_linux_dealloc(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    free(netdev);
}
721 static struct netdev_rx *
722 netdev_linux_rx_alloc(void)
724 struct netdev_rx_linux *rx = xzalloc(sizeof *rx);
729 netdev_linux_rx_construct(struct netdev_rx *rx_)
731 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
732 struct netdev *netdev_ = rx->up.netdev;
733 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
736 ovs_mutex_lock(&netdev->mutex);
737 rx->is_tap = is_tap_netdev(netdev_);
739 rx->fd = netdev->tap_fd;
741 struct sockaddr_ll sll;
743 /* Result of tcpdump -dd inbound */
744 static const struct sock_filter filt[] = {
745 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
746 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
747 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
748 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
750 static const struct sock_fprog fprog = {
751 ARRAY_SIZE(filt), (struct sock_filter *) filt
754 /* Create file descriptor. */
755 rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
758 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
762 /* Set non-blocking mode. */
763 error = set_nonblocking(rx->fd);
768 /* Get ethernet device index. */
769 error = get_ifindex(&netdev->up, &ifindex);
774 /* Bind to specific ethernet device. */
775 memset(&sll, 0, sizeof sll);
776 sll.sll_family = AF_PACKET;
777 sll.sll_ifindex = ifindex;
778 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
779 if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
781 VLOG_ERR("%s: failed to bind raw socket (%s)",
782 netdev_get_name(netdev_), ovs_strerror(error));
786 /* Filter for only inbound packets. */
787 error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
791 VLOG_ERR("%s: failed to attach filter (%s)",
792 netdev_get_name(netdev_), ovs_strerror(error));
796 ovs_mutex_unlock(&netdev->mutex);
804 ovs_mutex_unlock(&netdev->mutex);
809 netdev_linux_rx_destruct(struct netdev_rx *rx_)
811 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
/* Frees the storage allocated by netdev_linux_rx_alloc(). */
static void
netdev_linux_rx_dealloc(struct netdev_rx *rx_)
{
    struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);

    free(rx);
}
827 netdev_linux_rx_recv(struct netdev_rx *rx_, void *data, size_t size)
829 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
834 ? read(rx->fd, data, size)
835 : recv(rx->fd, data, size, MSG_TRUNC));
836 } while (retval < 0 && errno == EINTR);
839 return retval > size ? -EMSGSIZE : retval;
841 if (errno != EAGAIN) {
842 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
843 ovs_strerror(errno), netdev_rx_get_name(rx_));
850 netdev_linux_rx_wait(struct netdev_rx *rx_)
852 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
853 poll_fd_wait(rx->fd, POLLIN);
857 netdev_linux_rx_drain(struct netdev_rx *rx_)
859 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
862 int error = af_inet_ifreq_ioctl(netdev_rx_get_name(rx_), &ifr,
863 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
867 drain_fd(rx->fd, ifr.ifr_qlen);
870 return drain_rcvbuf(rx->fd);
874 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
875 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
876 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
877 * the packet is too big or too small to transmit on the device.
879 * The caller retains ownership of 'buffer' in all cases.
881 * The kernel maintains a packet transmission queue, so the caller is not
882 * expected to do additional queuing of packets. */
884 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
889 if (!is_tap_netdev(netdev_)) {
890 /* Use our AF_PACKET socket to send to this device. */
891 struct sockaddr_ll sll;
897 sock = af_packet_sock();
902 ifindex = netdev_get_ifindex(netdev_);
907 /* We don't bother setting most fields in sockaddr_ll because the
908 * kernel ignores them for SOCK_RAW. */
909 memset(&sll, 0, sizeof sll);
910 sll.sll_family = AF_PACKET;
911 sll.sll_ifindex = ifindex;
913 iov.iov_base = CONST_CAST(void *, data);
917 msg.msg_namelen = sizeof sll;
920 msg.msg_control = NULL;
921 msg.msg_controllen = 0;
924 retval = sendmsg(sock, &msg, 0);
926 /* Use the tap fd to send to this device. This is essential for
927 * tap devices, because packets sent to a tap device with an
928 * AF_PACKET socket will loop back to be *received* again on the
929 * tap device. This doesn't occur on other interface types
930 * because we attach a socket filter to the rx socket. */
931 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
933 retval = write(netdev->tap_fd, data, size);
937 /* The Linux AF_PACKET implementation never blocks waiting for room
938 * for packets, instead returning ENOBUFS. Translate this into
939 * EAGAIN for the caller. */
940 if (errno == ENOBUFS) {
942 } else if (errno == EINTR) {
944 } else if (errno != EAGAIN) {
945 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
946 netdev_get_name(netdev_), ovs_strerror(errno));
949 } else if (retval != size) {
950 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
951 "%zu) on %s", retval, size, netdev_get_name(netdev_));
/* Registers with the poll loop to wake up from the next call to poll_block()
 * when the packet transmission queue has sufficient room to transmit a packet
 * with netdev_send().
 *
 * The kernel maintains a packet transmission queue, so the client is not
 * expected to do additional queuing of packets.  Thus, this function is
 * unlikely to ever be used.  It is included for completeness. */
static void
netdev_linux_send_wait(struct netdev *netdev)
{
    if (is_tap_netdev(netdev)) {
        /* TAP device always accepts packets.*/
        poll_immediate_wake();
    }
}
975 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
976 * otherwise a positive errno value. */
978 netdev_linux_set_etheraddr(struct netdev *netdev_,
979 const uint8_t mac[ETH_ADDR_LEN])
981 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
982 struct netdev_saved_flags *sf = NULL;
985 ovs_mutex_lock(&netdev->mutex);
987 if (netdev->cache_valid & VALID_ETHERADDR) {
988 error = netdev->ether_addr_error;
989 if (error || eth_addr_equals(netdev->etheraddr, mac)) {
992 netdev->cache_valid &= ~VALID_ETHERADDR;
995 /* Tap devices must be brought down before setting the address. */
996 if (is_tap_netdev(netdev_)) {
997 netdev_turn_flags_off(netdev_, NETDEV_UP, &sf);
999 error = set_etheraddr(netdev_get_name(netdev_), mac);
1000 if (!error || error == ENODEV) {
1001 netdev->ether_addr_error = error;
1002 netdev->cache_valid |= VALID_ETHERADDR;
1004 memcpy(netdev->etheraddr, mac, ETH_ADDR_LEN);
1008 netdev_restore_flags(sf);
1011 ovs_mutex_unlock(&netdev->mutex);
1015 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1017 netdev_linux_get_etheraddr(const struct netdev *netdev_,
1018 uint8_t mac[ETH_ADDR_LEN])
1020 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1023 ovs_mutex_lock(&netdev->mutex);
1024 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1025 netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
1027 netdev->cache_valid |= VALID_ETHERADDR;
1030 error = netdev->ether_addr_error;
1032 memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN);
1034 ovs_mutex_unlock(&netdev->mutex);
1039 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1040 * in bytes, not including the hardware header; thus, this is typically 1500
1041 * bytes for Ethernet devices. */
1043 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1045 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1048 ovs_mutex_lock(&netdev->mutex);
1049 if (!(netdev->cache_valid & VALID_MTU)) {
1052 netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
1053 netdev_get_name(netdev_), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
1054 netdev->mtu = ifr.ifr_mtu;
1055 netdev->cache_valid |= VALID_MTU;
1058 error = netdev->netdev_mtu_error;
1060 *mtup = netdev->mtu;
1062 ovs_mutex_unlock(&netdev->mutex);
1067 /* Sets the maximum size of transmitted (MTU) for given device using linux
1068 * networking ioctl interface.
1071 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1073 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1077 ovs_mutex_lock(&netdev->mutex);
1078 if (netdev->cache_valid & VALID_MTU) {
1079 error = netdev->netdev_mtu_error;
1080 if (error || netdev->mtu == mtu) {
1083 netdev->cache_valid &= ~VALID_MTU;
1086 error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
1087 SIOCSIFMTU, "SIOCSIFMTU");
1088 if (!error || error == ENODEV) {
1089 netdev->netdev_mtu_error = error;
1090 netdev->mtu = ifr.ifr_mtu;
1091 netdev->cache_valid |= VALID_MTU;
1094 ovs_mutex_unlock(&netdev->mutex);
1098 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1099 * On failure, returns a negative errno value. */
1101 netdev_linux_get_ifindex(const struct netdev *netdev_)
1103 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1106 ovs_mutex_lock(&netdev->mutex);
1107 error = get_ifindex(netdev_, &ifindex);
1108 ovs_mutex_unlock(&netdev->mutex);
1110 return error ? -error : ifindex;
1114 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1116 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1118 ovs_mutex_lock(&netdev->mutex);
1119 if (netdev->miimon_interval > 0) {
1120 *carrier = netdev->miimon;
1122 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1124 ovs_mutex_unlock(&netdev->mutex);
1129 static long long int
1130 netdev_linux_get_carrier_resets(const struct netdev *netdev_)
1132 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1133 long long int carrier_resets;
1135 ovs_mutex_lock(&netdev->mutex);
1136 carrier_resets = netdev->carrier_resets;
1137 ovs_mutex_unlock(&netdev->mutex);
1139 return carrier_resets;
1143 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1144 struct mii_ioctl_data *data)
1149 memset(&ifr, 0, sizeof ifr);
1150 memcpy(&ifr.ifr_data, data, sizeof *data);
1151 error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
1152 memcpy(data, &ifr.ifr_data, sizeof *data);
/* Queries link status for device 'name', storing true in '*miimon' when the
 * link is up.  Tries the MII registers first; if the MII ioctls fail, falls
 * back to the ethtool ETHTOOL_GLINK query. */
1158 netdev_linux_get_miimon(const char *name, bool *miimon)
1160 struct mii_ioctl_data data;
1165 memset(&data, 0, sizeof data);
/* SIOCGMIIPHY discovers the PHY address to use for register reads. */
1166 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1168 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1169 data.reg_num = MII_BMSR;
1170 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
/* BMSR_LSTATUS is the link-up bit of the basic-mode status register. */
1174 *miimon = !!(data.val_out & BMSR_LSTATUS);
1176 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1179 struct ethtool_cmd ecmd;
1181 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1184 COVERAGE_INC(netdev_get_ethtool);
1185 memset(&ecmd, 0, sizeof ecmd);
1186 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
/* ETHTOOL_GLINK actually returns a struct ethtool_value; reinterpret
 * the ethtool_cmd buffer accordingly. */
1189 struct ethtool_value eval;
1191 memcpy(&eval, &ecmd, sizeof eval);
1192 *miimon = !!eval.data;
1194 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
/* Sets the MII monitoring interval of 'netdev_' to 'interval' milliseconds.
 * An interval <= 0 disables monitoring; positive values are clamped to a
 * minimum of 100 ms. */
1202 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1203 long long int interval)
1205 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1207 ovs_mutex_lock(&netdev->mutex);
1208 interval = interval > 0 ? MAX(interval, 100) : 0;
1209 if (netdev->miimon_interval != interval) {
1210 netdev->miimon_interval = interval;
/* Force the timer expired so the next miimon_run() polls immediately
 * under the new interval. */
1211 timer_set_expired(&netdev->miimon_timer);
1213 ovs_mutex_unlock(&netdev->mutex);
/* Periodic housekeeping: polls MII link status for every netdev-linux device
 * whose miimon timer has expired and records any link-state change. */
1219 netdev_linux_miimon_run(void)
1221 struct shash device_shash;
1222 struct shash_node *node;
1224 shash_init(&device_shash);
1225 netdev_get_devices(&netdev_linux_class, &device_shash);
1226 SHASH_FOR_EACH (node, &device_shash) {
1227 struct netdev *netdev = node->data;
1228 struct netdev_linux *dev = netdev_linux_cast(netdev);
1231 ovs_mutex_lock(&dev->mutex);
1232 if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
1233 netdev_linux_get_miimon(dev->up.name, &miimon);
1234 if (miimon != dev->miimon) {
1235 dev->miimon = miimon;
/* Link state flipped; notify via netdev_linux_changed() (its body is
 * not visible here -- presumably it bumps the change sequence). */
1236 netdev_linux_changed(dev, dev->ifi_flags, 0);
/* Re-arm the timer for the next poll. */
1239 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1241 ovs_mutex_unlock(&dev->mutex);
/* Drop the reference that netdev_get_devices() took on each device. */
1242 netdev_close(netdev);
1245 shash_destroy(&device_shash);
/* Registers a poll-loop wakeup for every netdev-linux device with MII
 * monitoring enabled, so miimon_run() is called when its timer fires. */
1249 netdev_linux_miimon_wait(void)
1251 struct shash device_shash;
1252 struct shash_node *node;
1254 shash_init(&device_shash);
1255 netdev_get_devices(&netdev_linux_class, &device_shash);
1256 SHASH_FOR_EACH (node, &device_shash) {
1257 struct netdev *netdev = node->data;
1258 struct netdev_linux *dev = netdev_linux_cast(netdev);
1260 ovs_mutex_lock(&dev->mutex);
1261 if (dev->miimon_interval > 0) {
1262 timer_wait(&dev->miimon_timer);
1264 ovs_mutex_unlock(&dev->mutex);
/* Release the reference taken by netdev_get_devices(). */
1265 netdev_close(netdev);
1267 shash_destroy(&device_shash);
1270 /* Check whether we can use RTM_GETLINK to get network device statistics.
1271 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1274 check_for_working_netlink_stats(void)
1276 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1277 * preferable, so if that works, we'll use it. */
/* Probe with the loopback device, which is always present. */
1278 int ifindex = do_get_ifindex("lo");
1280 VLOG_WARN("failed to get ifindex for lo, "
1281 "obtaining netdev stats from proc");
1284 struct netdev_stats stats;
1285 int error = get_stats_via_netlink(ifindex, &stats);
1287 VLOG_DBG("obtaining netdev stats via rtnetlink");
1290 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1291 "via proc (you are probably running a pre-2.6.19 "
1292 "kernel)", ovs_strerror(error));
1299 swap_uint64(uint64_t *a, uint64_t *b)
1306 /* Copies 'src' into 'dst', performing format conversion in the process.
1308 * 'src' is allowed to be misaligned. */
1310 netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1311 const struct ovs_vport_stats *src)
/* get_unaligned_u64() honors the "may be misaligned" promise above. */
1313 dst->rx_packets = get_unaligned_u64(&src->rx_packets);
1314 dst->tx_packets = get_unaligned_u64(&src->tx_packets);
1315 dst->rx_bytes = get_unaligned_u64(&src->rx_bytes);
1316 dst->tx_bytes = get_unaligned_u64(&src->tx_bytes);
1317 dst->rx_errors = get_unaligned_u64(&src->rx_errors);
1318 dst->tx_errors = get_unaligned_u64(&src->tx_errors);
1319 dst->rx_dropped = get_unaligned_u64(&src->rx_dropped);
1320 dst->tx_dropped = get_unaligned_u64(&src->tx_dropped);
/* ovs_vport_stats has no detailed error breakdown; zero the rest. */
1322 dst->collisions = 0;
1323 dst->rx_length_errors = 0;
1324 dst->rx_over_errors = 0;
1325 dst->rx_crc_errors = 0;
1326 dst->rx_frame_errors = 0;
1327 dst->rx_fifo_errors = 0;
1328 dst->rx_missed_errors = 0;
1329 dst->tx_aborted_errors = 0;
1330 dst->tx_carrier_errors = 0;
1331 dst->tx_fifo_errors = 0;
1332 dst->tx_heartbeat_errors = 0;
1333 dst->tx_window_errors = 0;
/* Fetches vport-layer stats for 'netdev' into '*stats'.  Queries the Linux
 * datapath via dpif_linux_vport_get(); converts the reply's stats with
 * netdev_stats_from_ovs_vport_stats().  Returns a positive errno value on
 * failure (intervening error-handling lines are missing from this extract). */
1337 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1339 struct dpif_linux_vport reply;
1343 error = dpif_linux_vport_get(netdev_get_name(netdev), &reply, &buf);
1346 } else if (!reply.stats) {
1351 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
/* Wrapper around get_stats_via_vport__() that caches the error outcome in
 * netdev->vport_stats_error (validity tracked by VALID_VPORT_STAT_ERROR),
 * and rate-limit-logs failures other than ENOENT. */
1359 get_stats_via_vport(const struct netdev *netdev_,
1360 struct netdev_stats *stats)
1362 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* Retry only while the last attempt succeeded or no attempt is cached. */
1364 if (!netdev->vport_stats_error ||
1365 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1368 error = get_stats_via_vport__(netdev_, stats);
/* ENOENT just means the port is not attached to the datapath. */
1369 if (error && error != ENOENT) {
1370 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1372 netdev_get_name(netdev_), ovs_strerror(error));
1374 netdev->vport_stats_error = error;
1375 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
/* Retrieves kernel-side stats for 'netdev_' into '*stats', choosing once
 * (thread-safely, via ovsthread_once) between the rtnetlink and /proc
 * implementations based on check_for_working_netlink_stats(). */
1380 netdev_linux_sys_get_stats(const struct netdev *netdev_,
1381 struct netdev_stats *stats)
1383 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
1384 static int use_netlink_stats;
1387 if (ovsthread_once_start(&once)) {
1388 use_netlink_stats = check_for_working_netlink_stats();
1389 ovsthread_once_done(&once);
1392 if (use_netlink_stats) {
1395 error = get_ifindex(netdev_, &ifindex);
1397 error = get_stats_via_netlink(ifindex, stats);
1400 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1404 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1405 netdev_get_name(netdev_), error);
1411 /* Retrieves current device stats for 'netdev-linux'. */
1413 netdev_linux_get_stats(const struct netdev *netdev_,
1414 struct netdev_stats *stats)
1416 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1417 struct netdev_stats dev_stats;
1420 ovs_mutex_lock(&netdev->mutex);
/* Prefer vport-layer stats; also gather kernel stats to merge below. */
1421 get_stats_via_vport(netdev_, stats);
1422 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
/* NOTE(review): lines between the branches are missing from this extract;
 * an outer test (likely on 'error') appears to precede this 'else if' --
 * confirm against the full source before restructuring. */
1424 if (!netdev->vport_stats_error) {
1427 } else if (netdev->vport_stats_error) {
1428 /* Stats not available from OVS, so use the kernel (ioctl) stats. */
/* Vport stats succeeded: fold the kernel's error counters, which the
 * vport layer does not track, into the vport byte/packet counts. */
1431 stats->rx_errors += dev_stats.rx_errors;
1432 stats->tx_errors += dev_stats.tx_errors;
1433 stats->rx_dropped += dev_stats.rx_dropped;
1434 stats->tx_dropped += dev_stats.tx_dropped;
1435 stats->multicast += dev_stats.multicast;
1436 stats->collisions += dev_stats.collisions;
1437 stats->rx_length_errors += dev_stats.rx_length_errors;
1438 stats->rx_over_errors += dev_stats.rx_over_errors;
1439 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1440 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1441 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1442 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1443 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1444 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1445 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1446 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1447 stats->tx_window_errors += dev_stats.tx_window_errors;
1449 ovs_mutex_unlock(&netdev->mutex);
1454 /* Retrieves current device stats for 'netdev-tap' netdev or
1455 * netdev-internal. */
1457 netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
1459 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1460 struct netdev_stats dev_stats;
1463 ovs_mutex_lock(&netdev->mutex);
1464 get_stats_via_vport(netdev_, stats);
1465 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
/* NOTE(review): intervening lines are missing from this extract; an outer
 * test likely precedes this 'else if' -- confirm against the full source. */
1467 if (!netdev->vport_stats_error) {
1470 } else if (netdev->vport_stats_error) {
1471 /* Transmit and receive stats will appear to be swapped relative to the
1472 * other ports since we are the one sending the data, not a remote
1473 * computer. For consistency, we swap them back here. This does not
1474 * apply if we are getting stats from the vport layer because it always
1475 * tracks stats from the perspective of the switch. */
1478 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1479 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1480 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1481 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
/* The kernel's detailed error breakdown is meaningless from the
 * switch's perspective for a tap device; clear it. */
1482 stats->rx_length_errors = 0;
1483 stats->rx_over_errors = 0;
1484 stats->rx_crc_errors = 0;
1485 stats->rx_frame_errors = 0;
1486 stats->rx_fifo_errors = 0;
1487 stats->rx_missed_errors = 0;
1488 stats->tx_aborted_errors = 0;
1489 stats->tx_carrier_errors = 0;
1490 stats->tx_fifo_errors = 0;
1491 stats->tx_heartbeat_errors = 0;
1492 stats->tx_window_errors = 0;
/* Vport stats succeeded: merge kernel counters, crossing rx/tx
 * deliberately for the same perspective reason documented above. */
1494 stats->rx_dropped += dev_stats.tx_dropped;
1495 stats->tx_dropped += dev_stats.rx_dropped;
1497 stats->rx_errors += dev_stats.tx_errors;
1498 stats->tx_errors += dev_stats.rx_errors;
1500 stats->multicast += dev_stats.multicast;
1501 stats->collisions += dev_stats.collisions;
1503 ovs_mutex_unlock(&netdev->mutex);
/* Retrieves stats for an internal device.  Internal devices get their stats
 * exclusively from the vport layer; the cached vport error is returned. */
1509 netdev_internal_get_stats(const struct netdev *netdev_,
1510 struct netdev_stats *stats)
1512 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1515 ovs_mutex_lock(&netdev->mutex);
1516 get_stats_via_vport(netdev_, stats);
1517 error = netdev->vport_stats_error;
1518 ovs_mutex_unlock(&netdev->mutex);
/* Pushes 'stats' down to the vport layer for 'netdev' with an
 * OVS_VPORT_CMD_SET transaction.  ENODEV (vport not attached) is treated
 * as success -- see the comment near the bottom. */
1524 netdev_internal_set_stats(struct netdev *netdev,
1525 const struct netdev_stats *stats)
1527 struct ovs_vport_stats vport_stats;
1528 struct dpif_linux_vport vport;
/* Translate only the fields that ovs_vport_stats can carry. */
1531 vport_stats.rx_packets = stats->rx_packets;
1532 vport_stats.tx_packets = stats->tx_packets;
1533 vport_stats.rx_bytes = stats->rx_bytes;
1534 vport_stats.tx_bytes = stats->tx_bytes;
1535 vport_stats.rx_errors = stats->rx_errors;
1536 vport_stats.tx_errors = stats->tx_errors;
1537 vport_stats.rx_dropped = stats->rx_dropped;
1538 vport_stats.tx_dropped = stats->tx_dropped;
1540 dpif_linux_vport_init(&vport);
1541 vport.cmd = OVS_VPORT_CMD_SET;
1542 vport.name = netdev_get_name(netdev);
1543 vport.stats = &vport_stats;
1545 err = dpif_linux_vport_transact(&vport, NULL, NULL);
1547 /* If the vport layer doesn't know about the device, that doesn't mean it
1548 * doesn't exist (after all we were able to open it when netdev_open() was
1549 * called), it just means that it isn't attached and we'll be getting
1550 * stats a different way. */
1551 if (err == ENODEV) {
1559 netdev_linux_read_features(struct netdev_linux *netdev)
1560 OVS_REQUIRES(netdev->mutex)
1562 struct ethtool_cmd ecmd;
1566 if (netdev->cache_valid & VALID_FEATURES) {
1570 COVERAGE_INC(netdev_get_ethtool);
1571 memset(&ecmd, 0, sizeof ecmd);
1572 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
1573 ETHTOOL_GSET, "ETHTOOL_GSET");
1578 /* Supported features. */
1579 netdev->supported = 0;
1580 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1581 netdev->supported |= NETDEV_F_10MB_HD;
1583 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1584 netdev->supported |= NETDEV_F_10MB_FD;
1586 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1587 netdev->supported |= NETDEV_F_100MB_HD;
1589 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1590 netdev->supported |= NETDEV_F_100MB_FD;
1592 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1593 netdev->supported |= NETDEV_F_1GB_HD;
1595 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1596 netdev->supported |= NETDEV_F_1GB_FD;
1598 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1599 netdev->supported |= NETDEV_F_10GB_FD;
1601 if (ecmd.supported & SUPPORTED_TP) {
1602 netdev->supported |= NETDEV_F_COPPER;
1604 if (ecmd.supported & SUPPORTED_FIBRE) {
1605 netdev->supported |= NETDEV_F_FIBER;
1607 if (ecmd.supported & SUPPORTED_Autoneg) {
1608 netdev->supported |= NETDEV_F_AUTONEG;
1610 if (ecmd.supported & SUPPORTED_Pause) {
1611 netdev->supported |= NETDEV_F_PAUSE;
1613 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1614 netdev->supported |= NETDEV_F_PAUSE_ASYM;
1617 /* Advertised features. */
1618 netdev->advertised = 0;
1619 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1620 netdev->advertised |= NETDEV_F_10MB_HD;
1622 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1623 netdev->advertised |= NETDEV_F_10MB_FD;
1625 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1626 netdev->advertised |= NETDEV_F_100MB_HD;
1628 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1629 netdev->advertised |= NETDEV_F_100MB_FD;
1631 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1632 netdev->advertised |= NETDEV_F_1GB_HD;
1634 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1635 netdev->advertised |= NETDEV_F_1GB_FD;
1637 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1638 netdev->advertised |= NETDEV_F_10GB_FD;
1640 if (ecmd.advertising & ADVERTISED_TP) {
1641 netdev->advertised |= NETDEV_F_COPPER;
1643 if (ecmd.advertising & ADVERTISED_FIBRE) {
1644 netdev->advertised |= NETDEV_F_FIBER;
1646 if (ecmd.advertising & ADVERTISED_Autoneg) {
1647 netdev->advertised |= NETDEV_F_AUTONEG;
1649 if (ecmd.advertising & ADVERTISED_Pause) {
1650 netdev->advertised |= NETDEV_F_PAUSE;
1652 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1653 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
1656 /* Current settings. */
1658 if (speed == SPEED_10) {
1659 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1660 } else if (speed == SPEED_100) {
1661 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1662 } else if (speed == SPEED_1000) {
1663 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1664 } else if (speed == SPEED_10000) {
1665 netdev->current = NETDEV_F_10GB_FD;
1666 } else if (speed == 40000) {
1667 netdev->current = NETDEV_F_40GB_FD;
1668 } else if (speed == 100000) {
1669 netdev->current = NETDEV_F_100GB_FD;
1670 } else if (speed == 1000000) {
1671 netdev->current = NETDEV_F_1TB_FD;
1673 netdev->current = 0;
1676 if (ecmd.port == PORT_TP) {
1677 netdev->current |= NETDEV_F_COPPER;
1678 } else if (ecmd.port == PORT_FIBRE) {
1679 netdev->current |= NETDEV_F_FIBER;
1683 netdev->current |= NETDEV_F_AUTONEG;
1687 netdev->cache_valid |= VALID_FEATURES;
1688 netdev->get_features_error = error;
1691 /* Stores the features supported by 'netdev' into of '*current', '*advertised',
1692 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
1693 * Returns 0 if successful, otherwise a positive errno value. */
1695 netdev_linux_get_features(const struct netdev *netdev_,
1696 enum netdev_features *current,
1697 enum netdev_features *advertised,
1698 enum netdev_features *supported,
1699 enum netdev_features *peer)
1701 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1704 ovs_mutex_lock(&netdev->mutex);
1705 netdev_linux_read_features(netdev);
1706 if (!netdev->get_features_error) {
1707 *current = netdev->current;
1708 *advertised = netdev->advertised;
1709 *supported = netdev->supported;
1710 *peer = 0; /* XXX */
1712 error = netdev->get_features_error;
1713 ovs_mutex_unlock(&netdev->mutex);
1718 /* Set the features advertised by 'netdev' to 'advertise'. */
1720 netdev_linux_set_advertisements(struct netdev *netdev_,
1721 enum netdev_features advertise)
1723 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1724 struct ethtool_cmd ecmd;
1727 ovs_mutex_lock(&netdev->mutex);
1729 COVERAGE_INC(netdev_get_ethtool);
1730 memset(&ecmd, 0, sizeof ecmd);
1731 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1732 ETHTOOL_GSET, "ETHTOOL_GSET");
1737 ecmd.advertising = 0;
1738 if (advertise & NETDEV_F_10MB_HD) {
1739 ecmd.advertising |= ADVERTISED_10baseT_Half;
1741 if (advertise & NETDEV_F_10MB_FD) {
1742 ecmd.advertising |= ADVERTISED_10baseT_Full;
1744 if (advertise & NETDEV_F_100MB_HD) {
1745 ecmd.advertising |= ADVERTISED_100baseT_Half;
1747 if (advertise & NETDEV_F_100MB_FD) {
1748 ecmd.advertising |= ADVERTISED_100baseT_Full;
1750 if (advertise & NETDEV_F_1GB_HD) {
1751 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1753 if (advertise & NETDEV_F_1GB_FD) {
1754 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1756 if (advertise & NETDEV_F_10GB_FD) {
1757 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1759 if (advertise & NETDEV_F_COPPER) {
1760 ecmd.advertising |= ADVERTISED_TP;
1762 if (advertise & NETDEV_F_FIBER) {
1763 ecmd.advertising |= ADVERTISED_FIBRE;
1765 if (advertise & NETDEV_F_AUTONEG) {
1766 ecmd.advertising |= ADVERTISED_Autoneg;
1768 if (advertise & NETDEV_F_PAUSE) {
1769 ecmd.advertising |= ADVERTISED_Pause;
1771 if (advertise & NETDEV_F_PAUSE_ASYM) {
1772 ecmd.advertising |= ADVERTISED_Asym_Pause;
1774 COVERAGE_INC(netdev_set_ethtool);
1775 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1776 ETHTOOL_SSET, "ETHTOOL_SSET");
1779 ovs_mutex_unlock(&netdev->mutex);
1783 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1784 * successful, otherwise a positive errno value. */
1786 netdev_linux_set_policing(struct netdev *netdev_,
1787 uint32_t kbits_rate, uint32_t kbits_burst)
1789 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1790 const char *netdev_name = netdev_get_name(netdev_);
/* Normalize the burst parameter before comparing against the cache. */
1793 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1794 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1795 : kbits_burst); /* Stick with user-specified value. */
1797 ovs_mutex_lock(&netdev->mutex);
/* Fast path: if the cached settings match, skip the tc round-trips. */
1798 if (netdev->cache_valid & VALID_POLICING) {
1799 error = netdev->netdev_policing_error;
1800 if (error || (netdev->kbits_rate == kbits_rate &&
1801 netdev->kbits_burst == kbits_burst)) {
1802 /* Assume that settings haven't changed since we last set them. */
/* Invalidate the cache while we reconfigure; revalidated below. */
1805 netdev->cache_valid &= ~VALID_POLICING;
1808 COVERAGE_INC(netdev_set_policing);
1809 /* Remove any existing ingress qdisc. */
1810 error = tc_add_del_ingress_qdisc(netdev_, false);
1812 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1813 netdev_name, ovs_strerror(error));
/* Re-add the ingress qdisc, then attach the policer to it. */
1818 error = tc_add_del_ingress_qdisc(netdev_, true);
1820 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1821 netdev_name, ovs_strerror(error));
1825 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
1827 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1828 netdev_name, ovs_strerror(error));
1833 netdev->kbits_rate = kbits_rate;
1834 netdev->kbits_burst = kbits_burst;
/* Cache the outcome; ENODEV is cached too since a vanished device
 * will keep failing the same way. */
1837 if (!error || error == ENODEV) {
1838 netdev->netdev_policing_error = error;
1839 netdev->cache_valid |= VALID_POLICING;
1841 ovs_mutex_unlock(&netdev->mutex);
1846 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1849 const struct tc_ops *const *opsp;
1851 for (opsp = tcs; *opsp != NULL; opsp++) {
1852 const struct tc_ops *ops = *opsp;
1853 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1854 sset_add(types, ops->ovs_name);
/* Linearly searches the 'tcs' table for the tc_ops whose OVS-visible name
 * ('ovs_name') equals 'name'. */
1860 static const struct tc_ops *
1861 tc_lookup_ovs_name(const char *name)
1863 const struct tc_ops *const *opsp;
1865 for (opsp = tcs; *opsp != NULL; opsp++) {
1866 const struct tc_ops *ops = *opsp;
1867 if (!strcmp(name, ops->ovs_name)) {
/* Linearly searches the 'tcs' table for the tc_ops whose kernel-side name
 * ('linux_name') equals 'name'.  Entries with a NULL linux_name are
 * skipped by the short-circuit test below. */
1874 static const struct tc_ops *
1875 tc_lookup_linux_name(const char *name)
1877 const struct tc_ops *const *opsp;
1879 for (opsp = tcs; *opsp != NULL; opsp++) {
1880 const struct tc_ops *ops = *opsp;
1881 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Searches the device's queue hmap bucket for 'hash' for the queue whose id
 * is 'queue_id'.  Caller supplies the precomputed hash. */
1888 static struct tc_queue *
1889 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
1892 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1893 struct tc_queue *queue;
1895 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
1896 if (queue->queue_id == queue_id) {
/* Convenience wrapper: hashes 'queue_id' and defers to tc_find_queue__(). */
1903 static struct tc_queue *
1904 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
1906 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
1910 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1912 struct netdev_qos_capabilities *caps)
1914 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1918 caps->n_queues = ops->n_queues;
1923 netdev_linux_get_qos(const struct netdev *netdev_,
1924 const char **typep, struct smap *details)
1926 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1929 ovs_mutex_lock(&netdev->mutex);
1930 error = tc_query_qdisc(netdev_);
1932 *typep = netdev->tc->ops->ovs_name;
1933 error = (netdev->tc->ops->qdisc_get
1934 ? netdev->tc->ops->qdisc_get(netdev_, details)
1937 ovs_mutex_unlock(&netdev->mutex);
1943 netdev_linux_set_qos(struct netdev *netdev_,
1944 const char *type, const struct smap *details)
1946 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1947 const struct tc_ops *new_ops;
1950 new_ops = tc_lookup_ovs_name(type);
1951 if (!new_ops || !new_ops->tc_install) {
1955 ovs_mutex_lock(&netdev->mutex);
1956 error = tc_query_qdisc(netdev_);
1961 if (new_ops == netdev->tc->ops) {
1962 error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
1964 /* Delete existing qdisc. */
1965 error = tc_del_qdisc(netdev_);
1969 ovs_assert(netdev->tc == NULL);
1971 /* Install new qdisc. */
1972 error = new_ops->tc_install(netdev_, details);
1973 ovs_assert((error == 0) == (netdev->tc != NULL));
1977 ovs_mutex_unlock(&netdev->mutex);
1982 netdev_linux_get_queue(const struct netdev *netdev_,
1983 unsigned int queue_id, struct smap *details)
1985 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1988 ovs_mutex_lock(&netdev->mutex);
1989 error = tc_query_qdisc(netdev_);
1991 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
1993 ? netdev->tc->ops->class_get(netdev_, queue, details)
1996 ovs_mutex_unlock(&netdev->mutex);
2002 netdev_linux_set_queue(struct netdev *netdev_,
2003 unsigned int queue_id, const struct smap *details)
2005 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2008 ovs_mutex_lock(&netdev->mutex);
2009 error = tc_query_qdisc(netdev_);
2011 error = (queue_id < netdev->tc->ops->n_queues
2012 && netdev->tc->ops->class_set
2013 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
2016 ovs_mutex_unlock(&netdev->mutex);
2022 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
2024 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2027 ovs_mutex_lock(&netdev->mutex);
2028 error = tc_query_qdisc(netdev_);
2030 if (netdev->tc->ops->class_delete) {
2031 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2033 ? netdev->tc->ops->class_delete(netdev_, queue)
2039 ovs_mutex_unlock(&netdev->mutex);
2045 netdev_linux_get_queue_stats(const struct netdev *netdev_,
2046 unsigned int queue_id,
2047 struct netdev_queue_stats *stats)
2049 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2052 ovs_mutex_lock(&netdev->mutex);
2053 error = tc_query_qdisc(netdev_);
2055 if (netdev->tc->ops->class_get_stats) {
2056 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2058 stats->created = queue->created;
2059 error = netdev->tc->ops->class_get_stats(netdev_, queue,
2068 ovs_mutex_unlock(&netdev->mutex);
2074 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
2076 struct ofpbuf request;
2077 struct tcmsg *tcmsg;
2079 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
2083 tcmsg->tcm_parent = 0;
2084 nl_dump_start(dump, NETLINK_ROUTE, &request);
2085 ofpbuf_uninit(&request);
2090 netdev_linux_dump_queues(const struct netdev *netdev_,
2091 netdev_dump_queues_cb *cb, void *aux)
2093 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2096 ovs_mutex_lock(&netdev->mutex);
2097 error = tc_query_qdisc(netdev_);
2099 if (netdev->tc->ops->class_get) {
2100 struct tc_queue *queue, *next_queue;
2101 struct smap details;
2103 smap_init(&details);
2104 HMAP_FOR_EACH_SAFE (queue, next_queue, hmap_node,
2105 &netdev->tc->queues) {
2108 smap_clear(&details);
2110 retval = netdev->tc->ops->class_get(netdev_, queue, &details);
2112 (*cb)(queue->queue_id, &details, aux);
2117 smap_destroy(&details);
2122 ovs_mutex_unlock(&netdev->mutex);
2128 netdev_linux_dump_queue_stats(const struct netdev *netdev_,
2129 netdev_dump_queue_stats_cb *cb, void *aux)
2131 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2134 ovs_mutex_lock(&netdev->mutex);
2135 error = tc_query_qdisc(netdev_);
2137 struct nl_dump dump;
2139 if (!netdev->tc->ops->class_dump_stats) {
2141 } else if (!start_queue_dump(netdev_, &dump)) {
2147 while (nl_dump_next(&dump, &msg)) {
2148 retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
2155 retval = nl_dump_done(&dump);
2161 ovs_mutex_unlock(&netdev->mutex);
/* Retrieves the IPv4 address and netmask of 'netdev_' into '*address' and
 * '*netmask', caching them under the VALID_IN4 flag.  Returns EADDRNOTAVAIL
 * when the device has no address assigned (address == INADDR_ANY). */
2167 netdev_linux_get_in4(const struct netdev *netdev_,
2168 struct in_addr *address, struct in_addr *netmask)
2170 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2173 ovs_mutex_lock(&netdev->mutex);
2174 if (!(netdev->cache_valid & VALID_IN4)) {
/* Populate the cache with two ioctls: address, then netmask. */
2175 error = netdev_linux_get_ipv4(netdev_, &netdev->address,
2176 SIOCGIFADDR, "SIOCGIFADDR");
2178 error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
2179 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2181 netdev->cache_valid |= VALID_IN4;
2189 if (netdev->address.s_addr != INADDR_ANY) {
2190 *address = netdev->address;
2191 *netmask = netdev->netmask;
2193 error = EADDRNOTAVAIL;
2196 ovs_mutex_unlock(&netdev->mutex);
/* Assigns 'address' and 'netmask' to 'netdev_' via SIOCSIFADDR and
 * SIOCSIFNETMASK, refreshing the VALID_IN4 cache on success. */
2202 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2203 struct in_addr netmask)
2205 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2208 ovs_mutex_lock(&netdev->mutex);
2209 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2211 netdev->cache_valid |= VALID_IN4;
2212 netdev->address = address;
2213 netdev->netmask = netmask;
/* Only push a netmask when a real (non-zero) address was set. */
2214 if (address.s_addr != INADDR_ANY) {
2215 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2216 "SIOCSIFNETMASK", netmask);
2219 ovs_mutex_unlock(&netdev->mutex);
2225 parse_if_inet6_line(const char *line,
2226 struct in6_addr *in6, char ifname[16 + 1])
2228 uint8_t *s6 = in6->s6_addr;
2229 #define X8 "%2"SCNx8
2231 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
2232 "%*x %*x %*x %*x %16s\n",
2233 &s6[0], &s6[1], &s6[2], &s6[3],
2234 &s6[4], &s6[5], &s6[6], &s6[7],
2235 &s6[8], &s6[9], &s6[10], &s6[11],
2236 &s6[12], &s6[13], &s6[14], &s6[15],
2240 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2241 * 'in6' is non-null) and returns true. Otherwise, returns false. */
2243 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2245 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2247 ovs_mutex_lock(&netdev->mutex);
2248 if (!(netdev->cache_valid & VALID_IN6)) {
2252 netdev->in6 = in6addr_any;
2254 file = fopen("/proc/net/if_inet6", "r");
2256 const char *name = netdev_get_name(netdev_);
2257 while (fgets(line, sizeof line, file)) {
2258 struct in6_addr in6_tmp;
2259 char ifname[16 + 1];
2260 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2261 && !strcmp(name, ifname))
2263 netdev->in6 = in6_tmp;
2269 netdev->cache_valid |= VALID_IN6;
2272 ovs_mutex_unlock(&netdev->mutex);
/* Fills generic 'sa' with an AF_INET sockaddr for 'addr'; the port and all
 * remaining bytes are zeroed by the memsets below. */
2278 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
2280 struct sockaddr_in sin;
2281 memset(&sin, 0, sizeof sin);
2282 sin.sin_family = AF_INET;
2283 sin.sin_addr = addr;
/* Build in a local sockaddr_in and memcpy into 'sa' rather than casting,
 * zeroing any trailing bytes of the (possibly larger) sockaddr first. */
2286 memset(sa, 0, sizeof *sa);
2287 memcpy(sa, &sin, sizeof sin);
/* Issues address-setting ioctl 'ioctl_nr' (named 'ioctl_name' for logging)
 * on 'netdev', with 'addr' packed into the request's ifr_addr. */
2291 do_set_addr(struct netdev *netdev,
2292 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2296 make_in4_sockaddr(&ifr.ifr_addr, addr);
2297 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2301 /* Adds 'router' as a default IP gateway. */
2303 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2305 struct in_addr any = { INADDR_ANY };
2309 memset(&rt, 0, sizeof rt);
/* Destination 0.0.0.0 with mask 0.0.0.0 via 'router': the default route. */
2310 make_in4_sockaddr(&rt.rt_dst, any);
2311 make_in4_sockaddr(&rt.rt_gateway, router);
2312 make_in4_sockaddr(&rt.rt_genmask, any);
2313 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2314 error = af_inet_ioctl(SIOCADDRT, &rt);
2316 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
/* Finds the route to 'host' by scanning the kernel's routing table in
 * /proc/net/route.  Stores the gateway in '*next_hop' (0.0.0.0 when the host
 * is directly reachable) and the malloc'd output interface name in
 * '*netdev_name'. */
2322 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2325 static const char fn[] = "/proc/net/route";
2330 *netdev_name = NULL;
2331 stream = fopen(fn, "r");
2332 if (stream == NULL) {
2333 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
2338 while (fgets(line, sizeof line, stream)) {
2341 ovs_be32 dest, gateway, mask;
2342 int refcnt, metric, mtu;
2343 unsigned int flags, use, window, irtt;
/* Field order and formats follow the kernel's /proc/net/route layout. */
2346 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2348 iface, &dest, &gateway, &flags, &refcnt,
2349 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2351 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2355 if (!(flags & RTF_UP)) {
2356 /* Skip routes that aren't up. */
2360 /* The output of 'dest', 'mask', and 'gateway' were given in
2361 * network byte order, so we don't need any endian
2362 * conversions here. */
2363 if ((dest & mask) == (host->s_addr & mask)) {
2365 /* The host is directly reachable. */
2366 next_hop->s_addr = 0;
2368 /* To reach the host, we must go through a gateway. */
2369 next_hop->s_addr = gateway;
2371 *netdev_name = xstrdup(iface);
2383 netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
2385 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2388 ovs_mutex_lock(&netdev->mutex);
2389 if (!(netdev->cache_valid & VALID_DRVINFO)) {
2390 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
2392 COVERAGE_INC(netdev_get_ethtool);
2393 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2394 error = netdev_linux_do_ethtool(netdev->up.name,
2397 "ETHTOOL_GDRVINFO");
2399 netdev->cache_valid |= VALID_DRVINFO;
2404 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2405 smap_add(smap, "driver_version", netdev->drvinfo.version);
2406 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
2408 ovs_mutex_unlock(&netdev->mutex);
2414 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2417 smap_add(smap, "driver_name", "openvswitch");
2421 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2422 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2423 * returns 0. Otherwise, it returns a positive errno value; in particular,
2424 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
2426 netdev_linux_arp_lookup(const struct netdev *netdev,
2427 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2430 struct sockaddr_in sin;
2433 memset(&r, 0, sizeof r);
2434 memset(&sin, 0, sizeof sin);
2435 sin.sin_family = AF_INET;
2436 sin.sin_addr.s_addr = ip;
/* arp_pa is a generic struct sockaddr; overlay the sockaddr_in onto it. */
2438 memcpy(&r.arp_pa, &sin, sizeof sin);
2439 r.arp_ha.sa_family = ARPHRD_ETHER;
2441 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2442 COVERAGE_INC(netdev_arp_lookup);
2443 retval = af_inet_ioctl(SIOCGARP, &r);
2445 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
/* ENXIO simply means "no entry" (documented above); don't log it. */
2446 } else if (retval != ENXIO) {
2447 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2448 netdev_get_name(netdev), IP_ARGS(ip),
2449 ovs_strerror(retval));
2455 nd_to_iff_flags(enum netdev_flags nd)
2458 if (nd & NETDEV_UP) {
2461 if (nd & NETDEV_PROMISC) {
2468 iff_to_nd_flags(int iff)
2470 enum netdev_flags nd = 0;
2474 if (iff & IFF_PROMISC) {
2475 nd |= NETDEV_PROMISC;
/* Clears the NETDEV_* flags in 'off' and sets those in 'on' for 'netdev_',
 * storing the previous flag set (translated back to NETDEV_* form) in
 * '*old_flagsp'. */
2481 netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2482 enum netdev_flags on, enum netdev_flags *old_flagsp)
2484 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2485 int old_flags, new_flags;
2488 ovs_mutex_lock(&netdev->mutex);
2489 old_flags = netdev->ifi_flags;
2490 *old_flagsp = iff_to_nd_flags(old_flags);
2491 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2492 if (new_flags != old_flags) {
2493 error = set_flags(netdev_get_name(netdev_), new_flags);
/* Re-read whatever the kernel actually applied; get_flags()'s own
 * return value is ignored here -- NOTE(review): apparently deliberate,
 * but worth confirming. */
2494 get_flags(netdev_, &netdev->ifi_flags);
2496 ovs_mutex_unlock(&netdev->mutex);
/* Returns the device's change sequence number, read under the mutex so the
 * value is consistent with concurrent updates. */
2502 netdev_linux_change_seq(const struct netdev *netdev_)
2504 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2505 unsigned int change_seq;
2507 ovs_mutex_lock(&netdev->mutex);
2508 change_seq = netdev->change_seq;
2509 ovs_mutex_unlock(&netdev->mutex);
/* Template for the struct netdev_class vtables instantiated below.  The
 * per-class hooks (NAME, constructor, stats get/set, features, status) are
 * macro parameters; every other operation is the shared netdev_linux_*
 * implementation.  NOTE(review): gap-sampled listing -- several initializer
 * rows (e.g. run/init, open, construct) are in the missing numbered lines. */
2514 #define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS, SET_STATS, \
2515 GET_FEATURES, GET_STATUS) \
2521 netdev_linux_wait, \
2523 netdev_linux_alloc, \
2525 netdev_linux_destruct, \
2526 netdev_linux_dealloc, \
2527 NULL, /* get_config */ \
2528 NULL, /* set_config */ \
2529 NULL, /* get_tunnel_config */ \
2531 netdev_linux_send, \
2532 netdev_linux_send_wait, \
2534 netdev_linux_set_etheraddr, \
2535 netdev_linux_get_etheraddr, \
2536 netdev_linux_get_mtu, \
2537 netdev_linux_set_mtu, \
2538 netdev_linux_get_ifindex, \
2539 netdev_linux_get_carrier, \
2540 netdev_linux_get_carrier_resets, \
2541 netdev_linux_set_miimon_interval, \
2546 netdev_linux_set_advertisements, \
2548 netdev_linux_set_policing, \
2549 netdev_linux_get_qos_types, \
2550 netdev_linux_get_qos_capabilities, \
2551 netdev_linux_get_qos, \
2552 netdev_linux_set_qos, \
2553 netdev_linux_get_queue, \
2554 netdev_linux_set_queue, \
2555 netdev_linux_delete_queue, \
2556 netdev_linux_get_queue_stats, \
2557 netdev_linux_dump_queues, \
2558 netdev_linux_dump_queue_stats, \
2560 netdev_linux_get_in4, \
2561 netdev_linux_set_in4, \
2562 netdev_linux_get_in6, \
2563 netdev_linux_add_router, \
2564 netdev_linux_get_next_hop, \
2566 netdev_linux_arp_lookup, \
2568 netdev_linux_update_flags, \
2570 netdev_linux_change_seq, \
2572 netdev_linux_rx_alloc, \
2573 netdev_linux_rx_construct, \
2574 netdev_linux_rx_destruct, \
2575 netdev_linux_rx_dealloc, \
2576 netdev_linux_rx_recv, \
2577 netdev_linux_rx_wait, \
2578 netdev_linux_rx_drain, \
/* "system" devices: ordinary kernel network devices. */
2581 const struct netdev_class netdev_linux_class =
2584 netdev_linux_construct,
2585 netdev_linux_get_stats,
2586 NULL, /* set_stats */
2587 netdev_linux_get_features,
2588 netdev_linux_get_status);
/* "tap" devices: userspace TUN/TAP devices; only the constructor and the
 * stats getter differ from the system class. */
2590 const struct netdev_class netdev_tap_class =
2593 netdev_linux_construct_tap,
2594 netdev_tap_get_stats,
2595 NULL, /* set_stats */
2596 netdev_linux_get_features,
2597 netdev_linux_get_status);
/* "internal" devices: datapath-internal ports; stats can also be pushed
 * into the kernel, and link features are not meaningful (NULL). */
2599 const struct netdev_class netdev_internal_class =
2602 netdev_linux_construct,
2603 netdev_internal_get_stats,
2604 netdev_internal_set_stats,
2605 NULL, /* get_features */
2606 netdev_internal_get_status);
2608 /* HTB traffic control class. */
2610 #define HTB_N_QUEUES 0xf000
/* struct htb: qdisc-level state (struct declaration line is missing from
 * this gap-sampled listing; it embeds a 'struct tc tc' member, see
 * htb_get__ below). */
2614 unsigned int max_rate; /* In bytes/s. */
/* struct htb_class: per-queue HTB class parameters. */
2618 struct tc_queue tc_queue;
2619 unsigned int min_rate; /* In bytes/s. */
2620 unsigned int max_rate; /* In bytes/s. */
2621 unsigned int burst; /* In bytes. */
2622 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the struct htb embedded in 'netdev_''s generic tc pointer. */
2626 htb_get__(const struct netdev *netdev_)
2628 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2629 return CONTAINER_OF(netdev->tc, struct htb, tc);
/* Allocates and installs a new struct htb for 'netdev_' with the given
 * root 'max_rate' (bytes/s). */
2633 htb_install__(struct netdev *netdev_, uint64_t max_rate)
2635 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2638 htb = xmalloc(sizeof *htb);
2639 tc_init(&htb->tc, &tc_ops_htb);
2640 htb->max_rate = max_rate;
2642 netdev->tc = &htb->tc;
2645 /* Create an HTB qdisc.
2647 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2649 htb_setup_qdisc__(struct netdev *netdev)
2652 struct tc_htb_glob opt;
2653 struct ofpbuf request;
2654 struct tcmsg *tcmsg;
/* Remove any existing root qdisc first so NLM_F_EXCL|NLM_F_CREATE below
 * cannot fail with EEXIST. */
2656 tc_del_qdisc(netdev);
2658 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2659 NLM_F_EXCL | NLM_F_CREATE, &request);
2663 tcmsg->tcm_handle = tc_make_handle(1, 0);
2664 tcmsg->tcm_parent = TC_H_ROOT;
2666 nl_msg_put_string(&request, TCA_KIND, "htb");
2668 memset(&opt, 0, sizeof opt);
/* rate2quantum: DRR quantum = rate / rate2quantum; 10 is the common
 * userspace tc default. */
2669 opt.rate2quantum = 10;
2673 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2674 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2675 nl_msg_end_nested(&request, opt_offset);
2677 return tc_transact(&request, NULL);
2680 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2681 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2683 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2684 unsigned int parent, struct htb_class *class)
2687 struct tc_htb_opt opt;
2688 struct ofpbuf request;
2689 struct tcmsg *tcmsg;
/* HTB rate tables are computed from the device MTU, so setup cannot
 * proceed without one. */
2693 error = netdev_get_mtu(netdev, &mtu);
2695 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2696 netdev_get_name(netdev));
2700 memset(&opt, 0, sizeof opt);
2701 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2702 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
/* buffer/cbuffer: token-bucket depths for the guaranteed rate and the
 * ceiling rate, derived from the class burst. */
2703 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2704 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2705 opt.prio = class->priority;
2707 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2711 tcmsg->tcm_handle = handle;
2712 tcmsg->tcm_parent = parent;
2714 nl_msg_put_string(&request, TCA_KIND, "htb");
2715 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2716 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
/* The kernel expects precomputed rate tables alongside the parameters. */
2717 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2718 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2719 nl_msg_end_nested(&request, opt_offset);
2721 error = tc_transact(&request, NULL);
2723 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2724 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2725 netdev_get_name(netdev),
2726 tc_get_major(handle), tc_get_minor(handle),
2727 tc_get_major(parent), tc_get_minor(parent),
2728 class->min_rate, class->max_rate,
2729 class->burst, class->priority, ovs_strerror(error));
2734 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2735 * description of them into 'details'. The description complies with the
2736 * specification given in the vswitch database documentation for linux-htb
2739 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2741 static const struct nl_policy tca_htb_policy[] = {
2742 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2743 .min_len = sizeof(struct tc_htb_opt) },
2746 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2747 const struct tc_htb_opt *htb;
2749 if (!nl_parse_nested(nl_options, tca_htb_policy,
2750 attrs, ARRAY_SIZE(tca_htb_policy))) {
2751 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2755 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2756 class->min_rate = htb->rate.rate;
2757 class->max_rate = htb->ceil.rate;
/* Kernel reports the burst as a buffer in ticks; convert back to bytes. */
2758 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2759 class->priority = htb->prio;
/* Parses a Netlink RTM_NEWTCLASS-style message describing one HTB class.
 * Fills '*queue_id' (derived from the handle's minor number), '*options',
 * and '*stats' for whichever out-parameters are non-null. */
2764 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2765 struct htb_class *options,
2766 struct netdev_queue_stats *stats)
2768 struct nlattr *nl_options;
2769 unsigned int handle;
2772 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2773 if (!error && queue_id) {
2774 unsigned int major = tc_get_major(handle);
2775 unsigned int minor = tc_get_minor(handle);
/* OVS queue N is HTB class 1:N+1; anything outside that range is not one
 * of our queues (the error path is in the missing lines). */
2776 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2777 *queue_id = minor - 1;
2782 if (!error && options) {
2783 error = htb_parse_tca_options__(nl_options, options);
/* Fills 'hc' with the qdisc-level HTB configuration from 'details'
 * ("max-rate" in bits/s; converted here to bytes/s).  If no max-rate is
 * given, falls back to the link's current feature-derived speed. */
2789 htb_parse_qdisc_details__(struct netdev *netdev,
2790 const struct smap *details, struct htb_class *hc)
2792 const char *max_rate_s;
2794 max_rate_s = smap_get(details, "max-rate");
2795 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2796 if (!hc->max_rate) {
2797 enum netdev_features current;
2799 netdev_get_features(netdev, &current, NULL, NULL, NULL);
2800 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
2802 hc->min_rate = hc->max_rate;
/* Fills 'hc' with per-class HTB configuration from 'details', clamping
 * each value to a sane range.  All *-rate values arrive in bits/s and are
 * stored in bytes/s. */
2808 htb_parse_class_details__(struct netdev *netdev,
2809 const struct smap *details, struct htb_class *hc)
2811 const struct htb *htb = htb_get__(netdev);
2812 const char *min_rate_s = smap_get(details, "min-rate");
2813 const char *max_rate_s = smap_get(details, "max-rate");
2814 const char *burst_s = smap_get(details, "burst");
2815 const char *priority_s = smap_get(details, "priority");
2818 error = netdev_get_mtu(netdev, &mtu);
2820 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2821 netdev_get_name(netdev));
2825 /* HTB requires at least an mtu sized min-rate to send any traffic even
2826 * on uncongested links. */
2827 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2828 hc->min_rate = MAX(hc->min_rate, mtu);
2829 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
/* max-rate defaults to the qdisc max and is clamped to [min_rate, qdisc
 * max] (the default expression is in the missing line 2834). */
2832 hc->max_rate = (max_rate_s
2833 ? strtoull(max_rate_s, NULL, 10) / 8
2835 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2836 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2840 * According to hints in the documentation that I've read, it is important
2841 * that 'burst' be at least as big as the largest frame that might be
2842 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2843 * but having it a bit too small is a problem. Since netdev_get_mtu()
2844 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2845 * the MTU. We actually add 64, instead of 14, as a guard against
2846 * additional headers get tacked on somewhere that we're not aware of. */
2847 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2848 hc->burst = MAX(hc->burst, mtu + 64);
2851 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Queries the kernel for the HTB class 'handle' under 'parent' and parses
 * the reply into '*options' and '*stats' (either may be NULL). */
2857 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2858 unsigned int parent, struct htb_class *options,
2859 struct netdev_queue_stats *stats)
2861 struct ofpbuf *reply;
2864 error = tc_query_class(netdev, handle, parent, &reply);
2866 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2867 ofpbuf_delete(reply);
/* tc_install hook: creates the root HTB qdisc plus the default class
 * 1:fffe, then records the qdisc state via htb_install__(). */
2873 htb_tc_install(struct netdev *netdev, const struct smap *details)
2877 error = htb_setup_qdisc__(netdev);
2879 struct htb_class hc;
2881 htb_parse_qdisc_details__(netdev, details, &hc);
2882 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2883 tc_make_handle(1, 0), &hc);
2885 htb_install__(netdev, hc.max_rate);
/* Downcasts a generic tc_queue to its enclosing htb_class. */
2891 static struct htb_class *
2892 htb_class_cast__(const struct tc_queue *queue)
2894 return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Creates or updates the in-memory record for 'queue_id' to match 'hc'. */
2898 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2899 const struct htb_class *hc)
2901 struct htb *htb = htb_get__(netdev);
2902 size_t hash = hash_int(queue_id, 0);
2903 struct tc_queue *queue;
2904 struct htb_class *hcp;
2906 queue = tc_find_queue__(netdev, queue_id, hash);
2908 hcp = htb_class_cast__(queue);
/* Not yet known: allocate a fresh record and add it to the queue map. */
2910 hcp = xmalloc(sizeof *hcp);
2911 queue = &hcp->tc_queue;
2912 queue->queue_id = queue_id;
2913 queue->created = time_msec();
2914 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2917 hcp->min_rate = hc->min_rate;
2918 hcp->max_rate = hc->max_rate;
2919 hcp->burst = hc->burst;
2920 hcp->priority = hc->priority;
/* tc_load hook: reconstructs in-memory HTB state from a kernel dump --
 * first the default class for the qdisc max-rate, then one record per
 * parsable class. */
2924 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2927 struct nl_dump dump;
2928 struct htb_class hc;
2930 /* Get qdisc options. */
2932 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2933 htb_install__(netdev, hc.max_rate);
2936 if (!start_queue_dump(netdev, &dump)) {
2939 while (nl_dump_next(&dump, &msg)) {
2940 unsigned int queue_id;
2942 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2943 htb_update_queue__(netdev, queue_id, &hc);
2946 nl_dump_done(&dump);
/* tc_destroy hook: frees every queued htb_class record (the free() calls
 * and the final free of 'htb' are in missing lines). */
2952 htb_tc_destroy(struct tc *tc)
2954 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2955 struct htb_class *hc, *next;
2957 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2958 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* qdisc_get hook: reports max-rate in bits/s (stored in bytes/s). */
2966 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
2968 const struct htb *htb = htb_get__(netdev);
2969 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
/* qdisc_set hook: reconfigures the default class 1:fffe and records the
 * new max-rate on success. */
2974 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
2976 struct htb_class hc;
2979 htb_parse_qdisc_details__(netdev, details, &hc);
2980 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2981 tc_make_handle(1, 0), &hc);
2983 htb_get__(netdev)->max_rate = hc.max_rate;
/* class_get hook: exports one queue's settings, again converting bytes/s
 * back to bits/s.  max-rate is omitted when it equals min-rate. */
2989 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2990 const struct tc_queue *queue, struct smap *details)
2992 const struct htb_class *hc = htb_class_cast__(queue);
2994 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
2995 if (hc->min_rate != hc->max_rate) {
2996 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
2998 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
3000 smap_add_format(details, "priority", "%u", hc->priority);
/* class_set hook: validates 'details', pushes class 1:queue_id+1 under
 * the default class, and mirrors it into the in-memory map. */
3006 htb_class_set(struct netdev *netdev, unsigned int queue_id,
3007 const struct smap *details)
3009 struct htb_class hc;
3012 error = htb_parse_class_details__(netdev, details, &hc);
3017 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3018 tc_make_handle(1, 0xfffe), &hc);
3023 htb_update_queue__(netdev, queue_id, &hc);
/* class_delete hook: removes the kernel class, then the local record. */
3028 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
3030 struct htb_class *hc = htb_class_cast__(queue);
3031 struct htb *htb = htb_get__(netdev);
3034 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3036 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* class_get_stats hook: per-queue statistics straight from the kernel. */
3043 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3044 struct netdev_queue_stats *stats)
3046 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3047 tc_make_handle(1, 0xfffe), NULL, stats);
/* class_dump_stats hook: invoked per dumped class; translates the class
 * handle back to an OVS queue id before calling 'cb'. */
3051 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3052 const struct ofpbuf *nlmsg,
3053 netdev_dump_queue_stats_cb *cb, void *aux)
3055 struct netdev_queue_stats stats;
3056 unsigned int handle, major, minor;
3059 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3064 major = tc_get_major(handle);
3065 minor = tc_get_minor(handle);
3066 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3067 (*cb)(minor - 1, &stats, aux);
/* Vtable binding the hooks above to the "linux-htb" QoS type. */
3072 static const struct tc_ops tc_ops_htb = {
3073 "htb", /* linux_name */
3074 "linux-htb", /* ovs_name */
3075 HTB_N_QUEUES, /* n_queues */
3084 htb_class_get_stats,
3085 htb_class_dump_stats
3088 /* "linux-hfsc" traffic control class. */
3090 #define HFSC_N_QUEUES 0xf000
/* struct hfsc_class: per-queue state (the enclosing struct hfsc and the
 * min_rate/max_rate members fall in missing lines of this listing). */
3098 struct tc_queue tc_queue;
/* Returns the struct hfsc embedded in 'netdev_''s generic tc pointer. */
3103 static struct hfsc *
3104 hfsc_get__(const struct netdev *netdev_)
3106 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3107 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
/* Downcasts a generic tc_queue to its enclosing hfsc_class. */
3110 static struct hfsc_class *
3111 hfsc_class_cast__(const struct tc_queue *queue)
3113 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Allocates and installs a new struct hfsc for 'netdev_'. */
3117 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
3119 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3122 hfsc = xmalloc(sizeof *hfsc);
3123 tc_init(&hfsc->tc, &tc_ops_hfsc);
3124 hfsc->max_rate = max_rate;
3125 netdev->tc = &hfsc->tc;
/* Creates or updates the in-memory record for 'queue_id' (mirrors
 * htb_update_queue__). */
3129 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
3130 const struct hfsc_class *hc)
3134 struct hfsc_class *hcp;
3135 struct tc_queue *queue;
3137 hfsc = hfsc_get__(netdev);
3138 hash = hash_int(queue_id, 0);
3140 queue = tc_find_queue__(netdev, queue_id, hash);
3142 hcp = hfsc_class_cast__(queue);
3144 hcp = xmalloc(sizeof *hcp);
3145 queue = &hcp->tc_queue;
3146 queue->queue_id = queue_id;
3147 queue->created = time_msec();
3148 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
3151 hcp->min_rate = hc->min_rate;
3152 hcp->max_rate = hc->max_rate;
/* Parses the kernel's HFSC class attributes into 'class'.  Only linear
 * service curves with matching real-time/link-share rates are accepted;
 * everything else is rejected with a rate-limited warning. */
3156 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
3158 const struct tc_service_curve *rsc, *fsc, *usc;
3159 static const struct nl_policy tca_hfsc_policy[] = {
/* Attribute keys (TCA_HFSC_RSC/FSC/USC) fall in the missing lines. */
3161 .type = NL_A_UNSPEC,
3163 .min_len = sizeof(struct tc_service_curve),
3166 .type = NL_A_UNSPEC,
3168 .min_len = sizeof(struct tc_service_curve),
3171 .type = NL_A_UNSPEC,
3173 .min_len = sizeof(struct tc_service_curve),
3176 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3178 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3179 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3180 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3184 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3185 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3186 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
/* m1/d nonzero would mean a two-piece (non-linear) service curve. */
3188 if (rsc->m1 != 0 || rsc->d != 0 ||
3189 fsc->m1 != 0 || fsc->d != 0 ||
3190 usc->m1 != 0 || usc->d != 0) {
3191 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3192 "Non-linear service curves are not supported.");
3196 if (rsc->m2 != fsc->m2) {
3197 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3198 "Real-time service curves are not supported ");
3202 if (rsc->m2 > usc->m2) {
3203 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3204 "Min-rate service curve is greater than "
3205 "the max-rate service curve.");
3209 class->min_rate = fsc->m2;
3210 class->max_rate = usc->m2;
/* Parses one dumped HFSC class message into queue id, options, and stats
 * (each out-parameter optional), mirroring htb_parse_tcmsg__. */
3215 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3216 struct hfsc_class *options,
3217 struct netdev_queue_stats *stats)
3220 unsigned int handle;
3221 struct nlattr *nl_options;
3223 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3229 unsigned int major, minor;
3231 major = tc_get_major(handle);
3232 minor = tc_get_minor(handle);
3233 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3234 *queue_id = minor - 1;
3241 error = hfsc_parse_tca_options__(nl_options, options);
/* Queries the kernel for one HFSC class and parses the reply. */
3248 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
3249 unsigned int parent, struct hfsc_class *options,
3250 struct netdev_queue_stats *stats)
3253 struct ofpbuf *reply;
3255 error = tc_query_class(netdev, handle, parent, &reply);
3260 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
3261 ofpbuf_delete(reply);
/* Fills 'class' with the qdisc-level HFSC configuration from 'details'
 * ("max-rate" in bits/s, stored as bytes/s); defaults to the link's
 * feature-derived speed when unset. */
3266 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct smap *details,
3267 struct hfsc_class *class)
3270 const char *max_rate_s;
3272 max_rate_s = smap_get(details, "max-rate");
3273 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3276 enum netdev_features current;
3278 netdev_get_features(netdev, &current, NULL, NULL, NULL);
3279 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3282 class->min_rate = max_rate;
3283 class->max_rate = max_rate;
/* Fills 'class' with per-queue HFSC configuration, clamping min-rate to
 * [1, qdisc max] and max-rate to [min_rate, qdisc max]. */
3287 hfsc_parse_class_details__(struct netdev *netdev,
3288 const struct smap *details,
3289 struct hfsc_class * class)
3291 const struct hfsc *hfsc;
3292 uint32_t min_rate, max_rate;
3293 const char *min_rate_s, *max_rate_s;
3295 hfsc = hfsc_get__(netdev);
3296 min_rate_s = smap_get(details, "min-rate");
3297 max_rate_s = smap_get(details, "max-rate");
3299 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3300 min_rate = MAX(min_rate, 1);
3301 min_rate = MIN(min_rate, hfsc->max_rate);
/* Default when max-rate absent is in the missing line 3305. */
3303 max_rate = (max_rate_s
3304 ? strtoull(max_rate_s, NULL, 10) / 8
3306 max_rate = MAX(max_rate, min_rate);
3307 max_rate = MIN(max_rate, hfsc->max_rate);
3309 class->min_rate = min_rate;
3310 class->max_rate = max_rate;
3315 /* Create an HFSC qdisc.
3317 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3319 hfsc_setup_qdisc__(struct netdev * netdev)
3321 struct tcmsg *tcmsg;
3322 struct ofpbuf request;
3323 struct tc_hfsc_qopt opt;
/* Remove any existing root qdisc so the exclusive-create below succeeds. */
3325 tc_del_qdisc(netdev);
3327 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3328 NLM_F_EXCL | NLM_F_CREATE, &request);
3334 tcmsg->tcm_handle = tc_make_handle(1, 0);
3335 tcmsg->tcm_parent = TC_H_ROOT;
3337 memset(&opt, 0, sizeof opt);
3340 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3341 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3343 return tc_transact(&request, NULL);
3346 /* Create an HFSC class.
3348 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3349 * sc rate <min_rate> ul rate <max_rate>" */
3351 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3352 unsigned int parent, struct hfsc_class *class)
3356 struct tcmsg *tcmsg;
3357 struct ofpbuf request;
3358 struct tc_service_curve min, max;
3360 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3366 tcmsg->tcm_handle = handle;
3367 tcmsg->tcm_parent = parent;
/* Linear service curves: the memset/m1/d zeroing lines are missing from
 * this listing; only the m2 (steady-state rate) assignments remain. */
3371 min.m2 = class->min_rate;
3375 max.m2 = class->max_rate;
3377 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3378 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
/* 'min' doubles as both the real-time (RSC) and link-share (FSC) curve;
 * 'max' is the upper-limit (USC) curve. */
3379 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3380 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3381 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3382 nl_msg_end_nested(&request, opt_offset);
3384 error = tc_transact(&request, NULL);
3386 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3387 "min-rate %ubps, max-rate %ubps (%s)",
3388 netdev_get_name(netdev),
3389 tc_get_major(handle), tc_get_minor(handle),
3390 tc_get_major(parent), tc_get_minor(parent),
3391 class->min_rate, class->max_rate, ovs_strerror(error));
/* tc_install hook: creates the root HFSC qdisc and default class 1:fffe,
 * then records qdisc state (mirrors htb_tc_install). */
3398 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
3401 struct hfsc_class class;
3403 error = hfsc_setup_qdisc__(netdev);
3409 hfsc_parse_qdisc_details__(netdev, details, &class);
3410 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3411 tc_make_handle(1, 0), &class);
3417 hfsc_install__(netdev, class.max_rate);
/* tc_load hook: rebuilds in-memory HFSC state from a kernel class dump. */
3422 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3425 struct nl_dump dump;
3426 struct hfsc_class hc;
3429 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3430 hfsc_install__(netdev, hc.max_rate);
3432 if (!start_queue_dump(netdev, &dump)) {
3436 while (nl_dump_next(&dump, &msg)) {
3437 unsigned int queue_id;
3439 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3440 hfsc_update_queue__(netdev, queue_id, &hc);
3444 nl_dump_done(&dump);
/* tc_destroy hook: frees every queued hfsc_class record (the free() calls
 * are in missing lines). */
3449 hfsc_tc_destroy(struct tc *tc)
3452 struct hfsc_class *hc, *next;
3454 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3456 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3457 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* qdisc_get hook: reports max-rate in bits/s (stored as bytes/s). */
3466 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
3468 const struct hfsc *hfsc;
3469 hfsc = hfsc_get__(netdev);
3470 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
/* qdisc_set hook: reconfigures default class 1:fffe; records new rate on
 * success. */
3475 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
3478 struct hfsc_class class;
3480 hfsc_parse_qdisc_details__(netdev, details, &class);
3481 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3482 tc_make_handle(1, 0), &class);
3485 hfsc_get__(netdev)->max_rate = class.max_rate;
/* class_get hook: exports one queue's min/max-rate in bits/s. */
3492 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3493 const struct tc_queue *queue, struct smap *details)
3495 const struct hfsc_class *hc;
3497 hc = hfsc_class_cast__(queue);
3498 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3499 if (hc->min_rate != hc->max_rate) {
3500 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
/* class_set hook: validates details, pushes class 1:queue_id+1, mirrors
 * it locally. */
3506 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3507 const struct smap *details)
3510 struct hfsc_class class;
3512 error = hfsc_parse_class_details__(netdev, details, &class);
3517 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3518 tc_make_handle(1, 0xfffe), &class);
3523 hfsc_update_queue__(netdev, queue_id, &class);
/* class_delete hook: removes the kernel class, then the local record. */
3528 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3532 struct hfsc_class *hc;
3534 hc = hfsc_class_cast__(queue);
3535 hfsc = hfsc_get__(netdev);
3537 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3539 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* class_get_stats hook: per-queue statistics from the kernel. */
3546 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3547 struct netdev_queue_stats *stats)
3549 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3550 tc_make_handle(1, 0xfffe), NULL, stats);
/* class_dump_stats hook: translates a dumped class handle back to an OVS
 * queue id before invoking 'cb'. */
3554 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3555 const struct ofpbuf *nlmsg,
3556 netdev_dump_queue_stats_cb *cb, void *aux)
3558 struct netdev_queue_stats stats;
3559 unsigned int handle, major, minor;
3562 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3567 major = tc_get_major(handle);
3568 minor = tc_get_minor(handle);
3569 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3570 (*cb)(minor - 1, &stats, aux);
/* Vtable binding the hfsc_* hooks to the "linux-hfsc" QoS type. */
3575 static const struct tc_ops tc_ops_hfsc = {
3576 "hfsc", /* linux_name */
3577 "linux-hfsc", /* ovs_name */
3578 HFSC_N_QUEUES, /* n_queues */
3579 hfsc_tc_install, /* tc_install */
3580 hfsc_tc_load, /* tc_load */
3581 hfsc_tc_destroy, /* tc_destroy */
3582 hfsc_qdisc_get, /* qdisc_get */
3583 hfsc_qdisc_set, /* qdisc_set */
3584 hfsc_class_get, /* class_get */
3585 hfsc_class_set, /* class_set */
3586 hfsc_class_delete, /* class_delete */
3587 hfsc_class_get_stats, /* class_get_stats */
3588 hfsc_class_dump_stats /* class_dump_stats */
3591 /* "linux-default" traffic control class.
3593 * This class represents the default, unnamed Linux qdisc. It corresponds to
3594 * the "" (empty string) QoS type in the OVS database. */
/* Points the netdev at a shared, immutable tc object -- no per-device
 * state is needed for the default qdisc. */
3597 default_install__(struct netdev *netdev_)
3599 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3600 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
3602 /* Nothing but a tc class implementation is allowed to write to a tc. This
3603 * class never does that, so we can legitimately use a const tc object. */
3604 netdev->tc = CONST_CAST(struct tc *, &tc);
/* tc_install hook: nothing to configure; details are ignored. */
3608 default_tc_install(struct netdev *netdev,
3609 const struct smap *details OVS_UNUSED)
3611 default_install__(netdev);
/* tc_load hook: same -- the default qdisc carries no state to load. */
3616 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3618 default_install__(netdev);
/* Vtable for the "" QoS type: no queues, no per-class operations. */
3622 static const struct tc_ops tc_ops_default = {
3623 NULL, /* linux_name */
3628 NULL, /* tc_destroy */
3629 NULL, /* qdisc_get */
3630 NULL, /* qdisc_set */
3631 NULL, /* class_get */
3632 NULL, /* class_set */
3633 NULL, /* class_delete */
3634 NULL, /* class_get_stats */
3635 NULL /* class_dump_stats */
3638 /* "linux-other" traffic control class.
/* tc_load hook: used when the kernel reports a qdisc OVS does not
 * recognize; installs a shared read-only placeholder tc object. */
3643 other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
3645 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3646 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
3648 /* Nothing but a tc class implementation is allowed to write to a tc. This
3649 * class never does that, so we can legitimately use a const tc object. */
3650 netdev->tc = CONST_CAST(struct tc *, &tc);
/* Vtable for "linux-other": load-only; it cannot be installed or
 * reconfigured from OVS. */
3654 static const struct tc_ops tc_ops_other = {
3655 NULL, /* linux_name */
3656 "linux-other", /* ovs_name */
3658 NULL, /* tc_install */
3660 NULL, /* tc_destroy */
3661 NULL, /* qdisc_get */
3662 NULL, /* qdisc_set */
3663 NULL, /* class_get */
3664 NULL, /* class_set */
3665 NULL, /* class_delete */
3666 NULL, /* class_get_stats */
3667 NULL /* class_dump_stats */
3670 /* Traffic control. */
3672 /* Number of kernel "tc" ticks per second. */
3673 static double ticks_per_s;
3675 /* Number of kernel "jiffies" per second. This is used for the purpose of
3676 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3677 * one jiffy's worth of data.
3679 * There are two possibilities here:
3681 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3682 * approximate range of 100 to 1024. That means that we really need to
3683 * make sure that the qdisc can buffer that much data.
3685 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3686 * has finely granular timers and there's no need to fudge additional room
3687 * for buffers. (There's no extra effort needed to implement that: the
3688 * large 'buffer_hz' is used as a divisor, so practically any number will
3689 * come out as 0 in the division. Small integer results in the case of
3690 * really high dividends won't have any real effect anyhow.)
3692 static unsigned int buffer_hz;
/* Builds a tc handle from its two 16-bit halves, 'major':'minor'. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    return TC_H_MAKE(major << 16, minor);
}

/* Extracts the major (upper 16 bits) number from tc handle 'handle'. */
static unsigned int
tc_get_major(unsigned int handle)
{
    return TC_H_MAJ(handle) >> 16;
}

/* Extracts the minor (lower 16 bits) number from tc handle 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    return TC_H_MIN(handle);
}
/* Allocates and initializes a rtnetlink tc request of the given 'type'
 * (e.g. RTM_NEWQDISC) for 'netdev' in '*request', returning the embedded
 * tcmsg for the caller to complete.  NOTE(review): the error path when
 * get_ifindex() fails is in the missing lines -- presumably returns NULL;
 * verify against the full source. */
3715 static struct tcmsg *
3716 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3717 struct ofpbuf *request)
3719 struct tcmsg *tcmsg;
3723 error = get_ifindex(netdev, &ifindex);
3728 ofpbuf_init(request, 512);
3729 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3730 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3731 tcmsg->tcm_family = AF_UNSPEC;
3732 tcmsg->tcm_ifindex = ifindex;
3733 /* Caller should fill in tcmsg->tcm_handle. */
3734 /* Caller should fill in tcmsg->tcm_parent. */
/* Executes 'request' on the NETLINK_ROUTE socket and releases it; the
 * reply (if requested via 'replyp') becomes the caller's to free. */
3740 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3742 int error = nl_transact(NETLINK_ROUTE, request, replyp);
3743 ofpbuf_uninit(request);
3747 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3748 * policing configuration.
3750 * This function is equivalent to running the following when 'add' is true:
3751 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3753 * This function is equivalent to running the following when 'add' is false:
3754 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3756 * The configuration and stats may be seen with the following command:
3757 * /sbin/tc -s qdisc show dev <devname>
3759 * Returns 0 if successful, otherwise a positive errno value.
3762 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3764 struct ofpbuf request;
3765 struct tcmsg *tcmsg;
3767 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3768 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3770 tcmsg = tc_make_request(netdev, type, flags, &request);
3774 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3775 tcmsg->tcm_parent = TC_H_INGRESS;
3776 nl_msg_put_string(&request, TCA_KIND, "ingress");
3777 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3779 error = tc_transact(&request, NULL);
3781 /* If we're deleting the qdisc, don't worry about some of the
3782 * error conditions. */
/* ENOENT/EINVAL on delete mean "no ingress qdisc was present" -- that is
 * the desired end state, so it is treated as success. */
3783 if (!add && (error == ENOENT || error == EINVAL)) {
3792 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
3795 * This function is equivalent to running:
3796 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3797 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3800 * The configuration and stats may be seen with the following command:
3801 * /sbin/tc -s filter show <devname> eth0 parent ffff:
3803 * Returns 0 if successful, otherwise a positive errno value.
3806 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3808 struct tc_police tc_police;
3809 struct ofpbuf request;
3810 struct tcmsg *tcmsg;
3811 size_t basic_offset;
3812 size_t police_offset;
3816 memset(&tc_police, 0, sizeof tc_police);
3817 tc_police.action = TC_POLICE_SHOT;
/* NOTE(review): 'mtu' is declared/initialized in the missing lines
 * (3813-3815) -- confirm its value against the full source. */
3818 tc_police.mtu = mtu;
/* kbits -> bytes/s for the rate; kbits -> bytes (x1024/8 = x128) for the
 * burst, expressed in ticks as the kernel expects. */
3819 tc_fill_rate(&tc_police.rate, (kbits_rate * 1000)/8, mtu);
3820 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3821 kbits_burst * 1024);
3823 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3824 NLM_F_EXCL | NLM_F_CREATE, &request);
3828 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
/* tcm_info for a filter encodes (priority, protocol): prio 49, all
 * protocols. */
3829 tcmsg->tcm_info = tc_make_handle(49,
3830 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
3832 nl_msg_put_string(&request, TCA_KIND, "basic");
3833 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3834 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
3835 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
3836 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
3837 nl_msg_end_nested(&request, police_offset);
3838 nl_msg_end_nested(&request, basic_offset);
3840 error = tc_transact(&request, NULL);
3851 /* The values in psched are not individually very meaningful, but they are
3852 * important. The tables below show some values seen in the wild.
3856 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3857 * (Before that, there are hints that it was 1000000000.)
3859 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3863 * -----------------------------------
3864 * [1] 000c8000 000f4240 000f4240 00000064
3865 * [2] 000003e8 00000400 000f4240 3b9aca00
3866 * [3] 000003e8 00000400 000f4240 3b9aca00
3867 * [4] 000003e8 00000400 000f4240 00000064
3868 * [5] 000003e8 00000040 000f4240 3b9aca00
3869 * [6] 000003e8 00000040 000f4240 000000f9
3871 * a b c d ticks_per_s buffer_hz
3872 * ------- --------- ---------- ------------- ----------- -------------
3873 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3874 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3875 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3876 * [4] 1,000 1,024 1,000,000 100 976,562 100
3877 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3878 * [6] 1,000 64 1,000,000 249 15,625,000 249
3880 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3881 * [2] 2.6.26-1-686-bigmem from Debian lenny
3882 * [3] 2.6.26-2-sparc64 from Debian lenny
3883 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3884 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3885 * [6] 2.6.34 from kernel.org on KVM
/* Function header missing from this gap-sampled listing -- presumably
 * "static void read_psched(void)": parses /proc/net/psched exactly once
 * (guarded by ovsthread_once) to derive 'ticks_per_s' and 'buffer_hz'. */
3887 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
3888 static const char fn[] = "/proc/net/psched";
3889 unsigned int a, b, c, d;
/* Already initialized by another thread: nothing to do. */
3892 if (!ovsthread_once_start(&once)) {
3899 stream = fopen(fn, "r");
3901 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
3905 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3906 VLOG_WARN("%s: read failed", fn);
3910 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3914 VLOG_WARN("%s: invalid scheduler parameters", fn);
/* ticks_per_s = a * c / b, per the table in the comment above. */
3918 ticks_per_s = (double) a * c / b;
3922 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3925 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3928 ovsthread_once_done(&once);
3931 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3932 * rate of 'rate' bytes per second. */
3934 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3937 return (rate * ticks) / ticks_per_s;
3940 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3941 * rate of 'rate' bytes per second. */
3943 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3946 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3949 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3950 * a transmission rate of 'rate' bytes per second. */
3952 tc_buffer_per_jiffy(unsigned int rate)
3955 return rate / buffer_hz;
3958 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3959 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3960 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3961 * stores NULL into it if it is absent.
3963 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3966 * Returns 0 if successful, otherwise a positive errno value. */
3968 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3969 struct nlattr **options)
3971 static const struct nl_policy tca_policy[] = {
3972 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3973 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3975 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3977 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3978 tca_policy, ta, ARRAY_SIZE(ta))) {
3979 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3984 *kind = nl_attr_get_string(ta[TCA_KIND]);
3988 *options = ta[TCA_OPTIONS];
4003 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
4004 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
4005 * into '*options', and its queue statistics into '*stats'. Any of the output
4006 * arguments may be null.
4008 * Returns 0 if successful, otherwise a positive errno value. */
4010 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
4011 struct nlattr **options, struct netdev_queue_stats *stats)
4013 static const struct nl_policy tca_policy[] = {
4014 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
4015 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
4017 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4019 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4020 tca_policy, ta, ARRAY_SIZE(ta))) {
4021 VLOG_WARN_RL(&rl, "failed to parse class message");
4026 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
4027 *handlep = tc->tcm_handle;
4031 *options = ta[TCA_OPTIONS];
4035 const struct gnet_stats_queue *gsq;
4036 struct gnet_stats_basic gsb;
4038 static const struct nl_policy stats_policy[] = {
4039 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
4040 .min_len = sizeof gsb },
4041 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
4042 .min_len = sizeof *gsq },
4044 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
4046 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
4047 sa, ARRAY_SIZE(sa))) {
4048 VLOG_WARN_RL(&rl, "failed to parse class stats");
4052 /* Alignment issues screw up the length of struct gnet_stats_basic on
4053 * some arch/bitsize combinations. Newer versions of Linux have a
4054 * struct gnet_stats_basic_packed, but we can't depend on that. The
4055 * easiest thing to do is just to make a copy. */
4056 memset(&gsb, 0, sizeof gsb);
4057 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
4058 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
4059 stats->tx_bytes = gsb.bytes;
4060 stats->tx_packets = gsb.packets;
4062 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
4063 stats->tx_errors = gsq->drops;
4073 memset(stats, 0, sizeof *stats);
4078 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
4081 tc_query_class(const struct netdev *netdev,
4082 unsigned int handle, unsigned int parent,
4083 struct ofpbuf **replyp)
4085 struct ofpbuf request;
4086 struct tcmsg *tcmsg;
4089 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
4093 tcmsg->tcm_handle = handle;
4094 tcmsg->tcm_parent = parent;
4096 error = tc_transact(&request, replyp);
4098 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
4099 netdev_get_name(netdev),
4100 tc_get_major(handle), tc_get_minor(handle),
4101 tc_get_major(parent), tc_get_minor(parent),
4102 ovs_strerror(error));
4107 /* Equivalent to "tc class del dev <name> handle <handle>". */
4109 tc_delete_class(const struct netdev *netdev, unsigned int handle)
4111 struct ofpbuf request;
4112 struct tcmsg *tcmsg;
4115 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
4119 tcmsg->tcm_handle = handle;
4120 tcmsg->tcm_parent = 0;
4122 error = tc_transact(&request, NULL);
4124 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
4125 netdev_get_name(netdev),
4126 tc_get_major(handle), tc_get_minor(handle),
4127 ovs_strerror(error));
4132 /* Equivalent to "tc qdisc del dev <name> root". */
4134 tc_del_qdisc(struct netdev *netdev_)
4136 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4137 struct ofpbuf request;
4138 struct tcmsg *tcmsg;
4141 tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
4145 tcmsg->tcm_handle = tc_make_handle(1, 0);
4146 tcmsg->tcm_parent = TC_H_ROOT;
4148 error = tc_transact(&request, NULL);
4149 if (error == EINVAL) {
4150 /* EINVAL probably means that the default qdisc was in use, in which
4151 * case we've accomplished our purpose. */
4154 if (!error && netdev->tc) {
4155 if (netdev->tc->ops->tc_destroy) {
4156 netdev->tc->ops->tc_destroy(netdev->tc);
4163 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
4164 * kernel to determine what they are. Returns 0 if successful, otherwise a
4165 * positive errno value. */
4167 tc_query_qdisc(const struct netdev *netdev_)
4169 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4170 struct ofpbuf request, *qdisc;
4171 const struct tc_ops *ops;
4172 struct tcmsg *tcmsg;
4180 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4181 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4182 * 2.6.35 without that fix backported to it.
4184 * To avoid the OOPS, we must not make a request that would attempt to dump
4185 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4186 * few others. There are a few ways that I can see to do this, but most of
4187 * them seem to be racy (and if you lose the race the kernel OOPSes). The
4188 * technique chosen here is to assume that any non-default qdisc that we
4189 * create will have a class with handle 1:0. The built-in qdiscs only have
4190 * a class with handle 0:0.
4192 * We could check for Linux 2.6.35+ and use a more straightforward method
4194 tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
4198 tcmsg->tcm_handle = tc_make_handle(1, 0);
4199 tcmsg->tcm_parent = 0;
4201 /* Figure out what tc class to instantiate. */
4202 error = tc_transact(&request, &qdisc);
4206 error = tc_parse_qdisc(qdisc, &kind, NULL);
4208 ops = &tc_ops_other;
4210 ops = tc_lookup_linux_name(kind);
4212 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4213 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
4215 ops = &tc_ops_other;
4218 } else if (error == ENOENT) {
4219 /* Either it's a built-in qdisc, or it's a qdisc set up by some
4220 * other entity that doesn't have a handle 1:0. We will assume
4221 * that it's the system default qdisc. */
4222 ops = &tc_ops_default;
4225 /* Who knows? Maybe the device got deleted. */
4226 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
4227 netdev_get_name(netdev_), ovs_strerror(error));
4228 ops = &tc_ops_other;
4231 /* Instantiate it. */
4232 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
4233 ovs_assert((load_error == 0) == (netdev->tc != NULL));
4234 ofpbuf_delete(qdisc);
4236 return error ? error : load_error;
4239 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4240 approximate the time to transmit packets of various lengths. For an MTU of
4241 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4242 represents two possible packet lengths; for a MTU of 513 through 1024, four
4243 possible lengths; and so on.
4245 Returns, for the specified 'mtu', the number of bits that packet lengths
4246 need to be shifted right to fit within such a 256-entry table. */
4248 tc_calc_cell_log(unsigned int mtu)
4253 mtu = ETH_PAYLOAD_MAX;
4255 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
4257 for (cell_log = 0; mtu >= 256; cell_log++) {
4264 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
4267 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4269 memset(rate, 0, sizeof *rate);
4270 rate->cell_log = tc_calc_cell_log(mtu);
4271 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4272 /* rate->cell_align = 0; */ /* distro headers. */
4273 rate->mpu = ETH_TOTAL_MIN;
/* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
 * attribute of the specified "type".
 *
 * See tc_calc_cell_log() above for a description of "rtab"s. */
static void
tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
{
    uint32_t *rtab;
    unsigned int i;

    rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
    for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
        /* Entry 'i' covers packet sizes up to (i + 1) << cell_log bytes,
         * but never less than the minimum policed unit. */
        unsigned int packet_size = (i + 1) << rate->cell_log;
        if (packet_size < rate->mpu) {
            packet_size = rate->mpu;
        }
        rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
    }
}
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * 0 is fine.)
 *
 * The burst is never allowed to fall below one jiffy's worth of traffic plus
 * one MTU, since HTB cannot operate with less buffering than that. */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
    return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
}
4308 /* Linux-only functions declared in netdev-linux.h */
4310 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4311 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
4313 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4314 const char *flag_name, bool enable)
4316 const char *netdev_name = netdev_get_name(netdev);
4317 struct ethtool_value evalue;
4321 COVERAGE_INC(netdev_get_ethtool);
4322 memset(&evalue, 0, sizeof evalue);
4323 error = netdev_linux_do_ethtool(netdev_name,
4324 (struct ethtool_cmd *)&evalue,
4325 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4330 COVERAGE_INC(netdev_set_ethtool);
4331 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4332 error = netdev_linux_do_ethtool(netdev_name,
4333 (struct ethtool_cmd *)&evalue,
4334 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
4339 COVERAGE_INC(netdev_get_ethtool);
4340 memset(&evalue, 0, sizeof evalue);
4341 error = netdev_linux_do_ethtool(netdev_name,
4342 (struct ethtool_cmd *)&evalue,
4343 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4348 if (new_flags != evalue.data) {
4349 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4350 "device %s failed", enable ? "enable" : "disable",
4351 flag_name, netdev_name);
4358 /* Utility functions. */
4360 /* Copies 'src' into 'dst', performing format conversion in the process. */
4362 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4363 const struct rtnl_link_stats *src)
4365 dst->rx_packets = src->rx_packets;
4366 dst->tx_packets = src->tx_packets;
4367 dst->rx_bytes = src->rx_bytes;
4368 dst->tx_bytes = src->tx_bytes;
4369 dst->rx_errors = src->rx_errors;
4370 dst->tx_errors = src->tx_errors;
4371 dst->rx_dropped = src->rx_dropped;
4372 dst->tx_dropped = src->tx_dropped;
4373 dst->multicast = src->multicast;
4374 dst->collisions = src->collisions;
4375 dst->rx_length_errors = src->rx_length_errors;
4376 dst->rx_over_errors = src->rx_over_errors;
4377 dst->rx_crc_errors = src->rx_crc_errors;
4378 dst->rx_frame_errors = src->rx_frame_errors;
4379 dst->rx_fifo_errors = src->rx_fifo_errors;
4380 dst->rx_missed_errors = src->rx_missed_errors;
4381 dst->tx_aborted_errors = src->tx_aborted_errors;
4382 dst->tx_carrier_errors = src->tx_carrier_errors;
4383 dst->tx_fifo_errors = src->tx_fifo_errors;
4384 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4385 dst->tx_window_errors = src->tx_window_errors;
4389 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4391 /* Policy for RTNLGRP_LINK messages.
4393 * There are *many* more fields in these messages, but currently we only
4394 * care about these fields. */
4395 static const struct nl_policy rtnlgrp_link_policy[] = {
4396 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4397 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4398 .min_len = sizeof(struct rtnl_link_stats) },
4401 struct ofpbuf request;
4402 struct ofpbuf *reply;
4403 struct ifinfomsg *ifi;
4404 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
4407 ofpbuf_init(&request, 0);
4408 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4409 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4410 ifi->ifi_family = PF_UNSPEC;
4411 ifi->ifi_index = ifindex;
4412 error = nl_transact(NETLINK_ROUTE, &request, &reply);
4413 ofpbuf_uninit(&request);
4418 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4419 rtnlgrp_link_policy,
4420 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4421 ofpbuf_delete(reply);
4425 if (!attrs[IFLA_STATS]) {
4426 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4427 ofpbuf_delete(reply);
4431 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4433 ofpbuf_delete(reply);
4439 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4441 static const char fn[] = "/proc/net/dev";
4446 stream = fopen(fn, "r");
4448 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
4453 while (fgets(line, sizeof line, stream)) {
4456 #define X64 "%"SCNu64
4459 X64 X64 X64 X64 X64 X64 X64 "%*u"
4460 X64 X64 X64 X64 X64 X64 X64 "%*u",
4466 &stats->rx_fifo_errors,
4467 &stats->rx_frame_errors,
4473 &stats->tx_fifo_errors,
4475 &stats->tx_carrier_errors) != 15) {
4476 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4477 } else if (!strcmp(devname, netdev_name)) {
4478 stats->rx_length_errors = UINT64_MAX;
4479 stats->rx_over_errors = UINT64_MAX;
4480 stats->rx_crc_errors = UINT64_MAX;
4481 stats->rx_missed_errors = UINT64_MAX;
4482 stats->tx_aborted_errors = UINT64_MAX;
4483 stats->tx_heartbeat_errors = UINT64_MAX;
4484 stats->tx_window_errors = UINT64_MAX;
4490 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
4496 get_flags(const struct netdev *dev, unsigned int *flags)
4502 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
4504 *flags = ifr.ifr_flags;
4510 set_flags(const char *name, unsigned int flags)
4514 ifr.ifr_flags = flags;
4515 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
4519 do_get_ifindex(const char *netdev_name)
4524 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4525 COVERAGE_INC(netdev_get_ifindex);
4527 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
4529 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4530 netdev_name, ovs_strerror(error));
4533 return ifr.ifr_ifindex;
4537 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4539 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4541 if (!(netdev->cache_valid & VALID_IFINDEX)) {
4542 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4545 netdev->get_ifindex_error = -ifindex;
4546 netdev->ifindex = 0;
4548 netdev->get_ifindex_error = 0;
4549 netdev->ifindex = ifindex;
4551 netdev->cache_valid |= VALID_IFINDEX;
4554 *ifindexp = netdev->ifindex;
4555 return netdev->get_ifindex_error;
4559 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4565 memset(&ifr, 0, sizeof ifr);
4566 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4567 COVERAGE_INC(netdev_get_hwaddr);
4568 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
4570 /* ENODEV probably means that a vif disappeared asynchronously and
4571 * hasn't been removed from the database yet, so reduce the log level
4572 * to INFO for that case. */
4573 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
4574 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4575 netdev_name, ovs_strerror(error));
4578 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4579 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4580 VLOG_WARN("%s device has unknown hardware address family %d",
4581 netdev_name, hwaddr_family);
4583 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
4588 set_etheraddr(const char *netdev_name,
4589 const uint8_t mac[ETH_ADDR_LEN])
4594 memset(&ifr, 0, sizeof ifr);
4595 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4596 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
4597 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4598 COVERAGE_INC(netdev_set_hwaddr);
4599 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
4601 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4602 netdev_name, ovs_strerror(error));
4608 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4609 int cmd, const char *cmd_name)
4614 memset(&ifr, 0, sizeof ifr);
4615 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4616 ifr.ifr_data = (caddr_t) ecmd;
4619 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
4621 if (error != EOPNOTSUPP) {
4622 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4623 "failed: %s", cmd_name, name, ovs_strerror(error));
4625 /* The device doesn't support this operation. That's pretty
4626 * common, so there's no point in logging anything. */
4633 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4634 int cmd, const char *cmd_name)
4639 ifr.ifr_addr.sa_family = AF_INET;
4640 error = af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
4642 const struct sockaddr_in *sin = ALIGNED_CAST(struct sockaddr_in *,
4644 *ip = sin->sin_addr;
4649 /* Returns an AF_PACKET raw socket or a negative errno value. */
4651 af_packet_sock(void)
4653 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4656 if (ovsthread_once_start(&once)) {
4657 sock = socket(AF_PACKET, SOCK_RAW, 0);
4659 int error = set_nonblocking(sock);
4666 VLOG_ERR("failed to create packet socket: %s",
4667 ovs_strerror(errno));
4669 ovsthread_once_done(&once);