/*
 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2015 Nicira, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
19 #include "netdev-linux.h"
23 #include <arpa/inet.h>
25 #include <linux/filter.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <sys/types.h>
37 #include <sys/ioctl.h>
38 #include <sys/socket.h>
39 #include <sys/utsname.h>
40 #include <netpacket/packet.h>
42 #include <net/if_arp.h>
43 #include <net/if_packet.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
52 #include "dp-packet.h"
53 #include "dpif-netlink.h"
54 #include "dpif-netdev.h"
55 #include "dynamic-string.h"
56 #include "fatal-signal.h"
59 #include "netdev-provider.h"
60 #include "netdev-vport.h"
61 #include "netlink-notifier.h"
62 #include "netlink-socket.h"
65 #include "openflow/openflow.h"
66 #include "ovs-atomic.h"
68 #include "poll-loop.h"
69 #include "rtnetlink.h"
71 #include "socket-util.h"
74 #include "unaligned.h"
75 #include "openvswitch/vlog.h"
77 VLOG_DEFINE_THIS_MODULE(netdev_linux);
79 COVERAGE_DEFINE(netdev_set_policing);
80 COVERAGE_DEFINE(netdev_arp_lookup);
81 COVERAGE_DEFINE(netdev_get_ifindex);
82 COVERAGE_DEFINE(netdev_get_hwaddr);
83 COVERAGE_DEFINE(netdev_set_hwaddr);
84 COVERAGE_DEFINE(netdev_get_ethtool);
85 COVERAGE_DEFINE(netdev_set_ethtool);
/* These were introduced in Linux 2.6.14, so they might be missing if we have
 * old headers.  Defining them here keeps the rest of the file independent of
 * the kernel-header version. */
#ifndef ADVERTISED_Pause
#define ADVERTISED_Pause (1 << 13)
#endif
#ifndef ADVERTISED_Asym_Pause
#define ADVERTISED_Asym_Pause (1 << 14)
#endif
/* These were introduced in Linux 2.6.24, so they might be missing if we
 * have old headers. */
#ifndef ETHTOOL_GFLAGS
#define ETHTOOL_GFLAGS       0x00000025 /* Get flags bitmap(ethtool_value) */
#endif
#ifndef ETHTOOL_SFLAGS
#define ETHTOOL_SFLAGS       0x00000026 /* Set flags bitmap(ethtool_value) */
#endif
/* This was introduced in Linux 2.6.25, so it might be missing if we have old
 * headers. */
#ifndef TC_RTAB_SIZE
#define TC_RTAB_SIZE 1024
#endif
112 /* Linux 2.6.21 introduced struct tpacket_auxdata.
113 * Linux 2.6.27 added the tp_vlan_tci member.
114 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
115 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
116 * TP_STATUS_VLAN_TPID_VALID.
118 * With all this churn it's easiest to unconditionally define a replacement
119 * structure that has everything we want.
121 #ifndef PACKET_AUXDATA
122 #define PACKET_AUXDATA 8
124 #ifndef TP_STATUS_VLAN_VALID
125 #define TP_STATUS_VLAN_VALID (1 << 4)
127 #ifndef TP_STATUS_VLAN_TPID_VALID
128 #define TP_STATUS_VLAN_TPID_VALID (1 << 6)
130 #undef tpacket_auxdata
131 #define tpacket_auxdata rpl_tpacket_auxdata
132 struct tpacket_auxdata {
138 uint16_t tp_vlan_tci;
139 uint16_t tp_vlan_tpid;
142 /* Linux 2.6.27 introduced ethtool_cmd_speed
144 * To avoid revisiting problems reported with using configure to detect
145 * compatibility (see report at
146 * http://openvswitch.org/pipermail/dev/2014-October/047978.html)
147 * unconditionally replace ethtool_cmd_speed. */
148 #define ethtool_cmd_speed rpl_ethtool_cmd_speed
149 static inline uint32_t rpl_ethtool_cmd_speed(const struct ethtool_cmd *ep)
151 return ep->speed | (ep->speed_hi << 16);
154 /* Linux 2.6.35 introduced IFLA_STATS64 and rtnl_link_stats64.
156 * Tests for rtnl_link_stats64 don't seem to consistently work, e.g. on
157 * 2.6.32-431.29.2.el6.x86_64 (see report at
158 * http://openvswitch.org/pipermail/dev/2014-October/047978.html). Maybe
159 * if_link.h is not self-contained on those kernels. It is easiest to
160 * unconditionally define a replacement. */
162 #define IFLA_STATS64 23
164 #define rtnl_link_stats64 rpl_rtnl_link_stats64
165 struct rtnl_link_stats64 {
177 uint64_t rx_length_errors;
178 uint64_t rx_over_errors;
179 uint64_t rx_crc_errors;
180 uint64_t rx_frame_errors;
181 uint64_t rx_fifo_errors;
182 uint64_t rx_missed_errors;
184 uint64_t tx_aborted_errors;
185 uint64_t tx_carrier_errors;
186 uint64_t tx_fifo_errors;
187 uint64_t tx_heartbeat_errors;
188 uint64_t tx_window_errors;
190 uint64_t rx_compressed;
191 uint64_t tx_compressed;
/* Bits for 'cache_valid' in struct netdev_linux: each bit records that the
 * corresponding on-demand field has been fetched and cached. */
enum {
    VALID_IFINDEX           = 1 << 0,
    VALID_ETHERADDR         = 1 << 1,
    VALID_IN4               = 1 << 2,
    VALID_IN6               = 1 << 3,
    VALID_MTU               = 1 << 4,
    VALID_POLICING          = 1 << 5,
    VALID_VPORT_STAT_ERROR  = 1 << 6,
    VALID_DRVINFO           = 1 << 7,
    VALID_FEATURES          = 1 << 8,
};
206 /* Traffic control. */
208 /* An instance of a traffic control class. Always associated with a particular
211 * Each TC implementation subclasses this with whatever additional data it
214 const struct tc_ops *ops;
215 struct hmap queues; /* Contains "struct tc_queue"s.
216 * Read by generic TC layer.
217 * Written only by TC implementation. */
220 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
222 /* One traffic control queue.
224 * Each TC implementation subclasses this with whatever additional data it
227 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
228 unsigned int queue_id; /* OpenFlow queue ID. */
229 long long int created; /* Time queue was created, in msecs. */
232 /* A particular kind of traffic control. Each implementation generally maps to
233 * one particular Linux qdisc class.
235 * The functions below return 0 if successful or a positive errno value on
236 * failure, except where otherwise noted. All of them must be provided, except
237 * where otherwise noted. */
239 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
240 * This is null for tc_ops_default and tc_ops_other, for which there are no
241 * appropriate values. */
242 const char *linux_name;
244 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
245 const char *ovs_name;
247 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
248 * queues. The queues are numbered 0 through n_queues - 1. */
249 unsigned int n_queues;
251 /* Called to install this TC class on 'netdev'. The implementation should
252 * make the Netlink calls required to set up 'netdev' with the right qdisc
253 * and configure it according to 'details'. The implementation may assume
254 * that the current qdisc is the default; that is, there is no need for it
255 * to delete the current qdisc before installing itself.
257 * The contents of 'details' should be documented as valid for 'ovs_name'
258 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
259 * (which is built as ovs-vswitchd.conf.db(8)).
261 * This function must return 0 if and only if it sets 'netdev->tc' to an
262 * initialized 'struct tc'.
264 * (This function is null for tc_ops_other, which cannot be installed. For
265 * other TC classes it should always be nonnull.) */
266 int (*tc_install)(struct netdev *netdev, const struct smap *details);
268 /* Called when the netdev code determines (through a Netlink query) that
269 * this TC class's qdisc is installed on 'netdev', but we didn't install
270 * it ourselves and so don't know any of the details.
272 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
273 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
274 * implementation should parse the other attributes of 'nlmsg' as
275 * necessary to determine its configuration. If necessary it should also
276 * use Netlink queries to determine the configuration of queues on
279 * This function must return 0 if and only if it sets 'netdev->tc' to an
280 * initialized 'struct tc'. */
281 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
283 /* Destroys the data structures allocated by the implementation as part of
284 * 'tc'. (This includes destroying 'tc->queues' by calling
287 * The implementation should not need to perform any Netlink calls. If
288 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
289 * (But it may not be desirable.)
291 * This function may be null if 'tc' is trivial. */
292 void (*tc_destroy)(struct tc *tc);
294 /* Retrieves details of 'netdev->tc' configuration into 'details'.
296 * The implementation should not need to perform any Netlink calls, because
297 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
298 * cached the configuration.
300 * The contents of 'details' should be documented as valid for 'ovs_name'
301 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
302 * (which is built as ovs-vswitchd.conf.db(8)).
304 * This function may be null if 'tc' is not configurable.
306 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
308 /* Reconfigures 'netdev->tc' according to 'details', performing any
309 * required Netlink calls to complete the reconfiguration.
311 * The contents of 'details' should be documented as valid for 'ovs_name'
312 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
313 * (which is built as ovs-vswitchd.conf.db(8)).
315 * This function may be null if 'tc' is not configurable.
317 int (*qdisc_set)(struct netdev *, const struct smap *details);
319 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
320 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
322 * The contents of 'details' should be documented as valid for 'ovs_name'
323 * in the "other_config" column in the "Queue" table in
324 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
326 * The implementation should not need to perform any Netlink calls, because
327 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
328 * cached the queue configuration.
330 * This function may be null if 'tc' does not have queues ('n_queues' is
332 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
333 struct smap *details);
335 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
336 * 'details', perfoming any required Netlink calls to complete the
337 * reconfiguration. The caller ensures that 'queue_id' is less than
340 * The contents of 'details' should be documented as valid for 'ovs_name'
341 * in the "other_config" column in the "Queue" table in
342 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
344 * This function may be null if 'tc' does not have queues or its queues are
345 * not configurable. */
346 int (*class_set)(struct netdev *, unsigned int queue_id,
347 const struct smap *details);
349 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
350 * tc_queue's within 'netdev->tc->queues'.
352 * This function may be null if 'tc' does not have queues or its queues
353 * cannot be deleted. */
354 int (*class_delete)(struct netdev *, struct tc_queue *queue);
356 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
357 * 'struct tc_queue's within 'netdev->tc->queues'.
359 * On success, initializes '*stats'.
361 * This function may be null if 'tc' does not have queues or if it cannot
362 * report queue statistics. */
363 int (*class_get_stats)(const struct netdev *netdev,
364 const struct tc_queue *queue,
365 struct netdev_queue_stats *stats);
367 /* Extracts queue stats from 'nlmsg', which is a response to a
368 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
370 * This function may be null if 'tc' does not have queues or if it cannot
371 * report queue statistics. */
372 int (*class_dump_stats)(const struct netdev *netdev,
373 const struct ofpbuf *nlmsg,
374 netdev_dump_queue_stats_cb *cb, void *aux);
378 tc_init(struct tc *tc, const struct tc_ops *ops)
381 hmap_init(&tc->queues);
385 tc_destroy(struct tc *tc)
387 hmap_destroy(&tc->queues);
390 static const struct tc_ops tc_ops_htb;
391 static const struct tc_ops tc_ops_hfsc;
392 static const struct tc_ops tc_ops_codel;
393 static const struct tc_ops tc_ops_fqcodel;
394 static const struct tc_ops tc_ops_sfq;
395 static const struct tc_ops tc_ops_default;
396 static const struct tc_ops tc_ops_other;
398 static const struct tc_ops *const tcs[] = {
399 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
400 &tc_ops_hfsc, /* Hierarchical fair service curve. */
401 &tc_ops_codel, /* Controlled delay */
402 &tc_ops_fqcodel, /* Fair queue controlled delay */
403 &tc_ops_sfq, /* Stochastic fair queueing */
404 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
405 &tc_ops_other, /* Some other qdisc. */
409 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
410 static unsigned int tc_get_major(unsigned int handle);
411 static unsigned int tc_get_minor(unsigned int handle);
413 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
414 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
415 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
417 static struct tcmsg *tc_make_request(const struct netdev *, int type,
418 unsigned int flags, struct ofpbuf *);
419 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
420 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
421 static int tc_add_policer(struct netdev *,
422 uint32_t kbits_rate, uint32_t kbits_burst);
424 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
425 struct nlattr **options);
426 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
427 struct nlattr **options,
428 struct netdev_queue_stats *);
429 static int tc_query_class(const struct netdev *,
430 unsigned int handle, unsigned int parent,
431 struct ofpbuf **replyp);
432 static int tc_delete_class(const struct netdev *, unsigned int handle);
434 static int tc_del_qdisc(struct netdev *netdev);
435 static int tc_query_qdisc(const struct netdev *netdev);
437 static int tc_calc_cell_log(unsigned int mtu);
438 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
439 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
440 const struct tc_ratespec *rate);
441 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
443 struct netdev_linux {
446 /* Protects all members below. */
447 struct ovs_mutex mutex;
449 unsigned int cache_valid;
451 bool miimon; /* Link status of last poll. */
452 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
453 struct timer miimon_timer;
455 /* The following are figured out "on demand" only. They are only valid
456 * when the corresponding VALID_* bit in 'cache_valid' is set. */
458 struct eth_addr etheraddr;
459 struct in_addr address, netmask;
462 unsigned int ifi_flags;
463 long long int carrier_resets;
464 uint32_t kbits_rate; /* Policing data. */
465 uint32_t kbits_burst;
466 int vport_stats_error; /* Cached error code from vport_get_stats().
467 0 or an errno value. */
468 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
469 int ether_addr_error; /* Cached error code from set/get etheraddr. */
470 int netdev_policing_error; /* Cached error code from set policing. */
471 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
472 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
473 int in4_error; /* Cached error code from reading in4 addr. */
474 int in6_error; /* Cached error code from reading in6 addr. */
476 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
477 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
478 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
480 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
483 /* For devices of class netdev_tap_class only. */
487 struct netdev_rxq_linux {
488 struct netdev_rxq up;
493 /* This is set pretty low because we probably won't learn anything from the
494 * additional log messages. */
495 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
497 /* Polling miimon status for all ports causes performance degradation when
498 * handling a large number of ports. If there are no devices using miimon, then
499 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
501 * Readers do not depend on this variable synchronizing with the related
502 * changes in the device miimon status, so we can use atomic_count. */
503 static atomic_count miimon_cnt = ATOMIC_COUNT_INIT(0);
505 static void netdev_linux_run(void);
507 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
508 int cmd, const char *cmd_name);
509 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
510 int cmd, const char *cmd_name);
511 static int get_flags(const struct netdev *, unsigned int *flags);
512 static int set_flags(const char *, unsigned int flags);
513 static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
514 enum netdev_flags on, enum netdev_flags *old_flagsp)
515 OVS_REQUIRES(netdev->mutex);
516 static int do_get_ifindex(const char *netdev_name);
517 static int get_ifindex(const struct netdev *, int *ifindexp);
518 static int do_set_addr(struct netdev *netdev,
519 int ioctl_nr, const char *ioctl_name,
520 struct in_addr addr);
521 static int get_etheraddr(const char *netdev_name, struct eth_addr *ea);
522 static int set_etheraddr(const char *netdev_name, const struct eth_addr);
523 static int get_stats_via_netlink(const struct netdev *, struct netdev_stats *);
524 static int af_packet_sock(void);
525 static bool netdev_linux_miimon_enabled(void);
526 static void netdev_linux_miimon_run(void);
527 static void netdev_linux_miimon_wait(void);
528 static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup);
531 is_netdev_linux_class(const struct netdev_class *netdev_class)
533 return netdev_class->run == netdev_linux_run;
537 is_tap_netdev(const struct netdev *netdev)
539 return netdev_get_class(netdev) == &netdev_tap_class;
542 static struct netdev_linux *
543 netdev_linux_cast(const struct netdev *netdev)
545 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
547 return CONTAINER_OF(netdev, struct netdev_linux, up);
550 static struct netdev_rxq_linux *
551 netdev_rxq_linux_cast(const struct netdev_rxq *rx)
553 ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
554 return CONTAINER_OF(rx, struct netdev_rxq_linux, up);
557 static void netdev_linux_update(struct netdev_linux *netdev,
558 const struct rtnetlink_change *)
559 OVS_REQUIRES(netdev->mutex);
560 static void netdev_linux_changed(struct netdev_linux *netdev,
561 unsigned int ifi_flags, unsigned int mask)
562 OVS_REQUIRES(netdev->mutex);
564 /* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK,
565 * RTNLGRP_IPV4_IFADDR and RTNLGRP_IPV6_IFADDR changes, or NULL
566 * if no such socket could be created. */
567 static struct nl_sock *
568 netdev_linux_notify_sock(void)
570 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
571 static struct nl_sock *sock;
572 unsigned int mcgroups[3] = {RTNLGRP_LINK, RTNLGRP_IPV4_IFADDR,
573 RTNLGRP_IPV6_IFADDR};
575 if (ovsthread_once_start(&once)) {
578 error = nl_sock_create(NETLINK_ROUTE, &sock);
582 for (i = 0; i < ARRAY_SIZE(mcgroups); i++) {
583 error = nl_sock_join_mcgroup(sock, mcgroups[i]);
585 nl_sock_destroy(sock);
591 ovsthread_once_done(&once);
598 netdev_linux_miimon_enabled(void)
600 return atomic_count_get(&miimon_cnt) > 0;
604 netdev_linux_run(void)
606 struct nl_sock *sock;
609 if (netdev_linux_miimon_enabled()) {
610 netdev_linux_miimon_run();
613 sock = netdev_linux_notify_sock();
619 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
620 uint64_t buf_stub[4096 / 8];
623 ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
624 error = nl_sock_recv(sock, &buf, false);
626 struct rtnetlink_change change;
628 if (rtnetlink_parse(&buf, &change)) {
629 struct netdev *netdev_ = netdev_from_name(change.ifname);
630 if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
631 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
633 ovs_mutex_lock(&netdev->mutex);
634 netdev_linux_update(netdev, &change);
635 ovs_mutex_unlock(&netdev->mutex);
637 netdev_close(netdev_);
639 } else if (error == ENOBUFS) {
640 struct shash device_shash;
641 struct shash_node *node;
645 shash_init(&device_shash);
646 netdev_get_devices(&netdev_linux_class, &device_shash);
647 SHASH_FOR_EACH (node, &device_shash) {
648 struct netdev *netdev_ = node->data;
649 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
652 ovs_mutex_lock(&netdev->mutex);
653 get_flags(netdev_, &flags);
654 netdev_linux_changed(netdev, flags, 0);
655 ovs_mutex_unlock(&netdev->mutex);
657 netdev_close(netdev_);
659 shash_destroy(&device_shash);
660 } else if (error != EAGAIN) {
661 VLOG_WARN_RL(&rl, "error reading or parsing netlink (%s)",
662 ovs_strerror(error));
669 netdev_linux_wait(void)
671 struct nl_sock *sock;
673 if (netdev_linux_miimon_enabled()) {
674 netdev_linux_miimon_wait();
676 sock = netdev_linux_notify_sock();
678 nl_sock_wait(sock, POLLIN);
683 netdev_linux_changed(struct netdev_linux *dev,
684 unsigned int ifi_flags, unsigned int mask)
685 OVS_REQUIRES(dev->mutex)
687 netdev_change_seq_changed(&dev->up);
689 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
690 dev->carrier_resets++;
692 dev->ifi_flags = ifi_flags;
694 dev->cache_valid &= mask;
698 netdev_linux_update(struct netdev_linux *dev,
699 const struct rtnetlink_change *change)
700 OVS_REQUIRES(dev->mutex)
702 if (rtnetlink_type_is_rtnlgrp_link(change->nlmsg_type)){
703 if (change->nlmsg_type == RTM_NEWLINK) {
704 /* Keep drv-info, in4, in6. */
705 netdev_linux_changed(dev, change->ifi_flags,
706 VALID_DRVINFO | VALID_IN4 | VALID_IN6);
708 /* Update netdev from rtnl-change msg. */
710 dev->mtu = change->mtu;
711 dev->cache_valid |= VALID_MTU;
712 dev->netdev_mtu_error = 0;
715 if (!eth_addr_is_zero(change->mac)) {
716 dev->etheraddr = change->mac;
717 dev->cache_valid |= VALID_ETHERADDR;
718 dev->ether_addr_error = 0;
721 dev->ifindex = change->if_index;
722 dev->cache_valid |= VALID_IFINDEX;
723 dev->get_ifindex_error = 0;
725 netdev_linux_changed(dev, change->ifi_flags, 0);
727 } else if (rtnetlink_type_is_rtnlgrp_addr(change->nlmsg_type)) {
728 /* Invalidates in4, in6. */
729 netdev_linux_changed(dev, dev->ifi_flags,
730 ~(VALID_IN4 | VALID_IN6));
736 static struct netdev *
737 netdev_linux_alloc(void)
739 struct netdev_linux *netdev = xzalloc(sizeof *netdev);
744 netdev_linux_common_construct(struct netdev_linux *netdev)
746 ovs_mutex_init(&netdev->mutex);
749 /* Creates system and internal devices. */
751 netdev_linux_construct(struct netdev *netdev_)
753 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
756 netdev_linux_common_construct(netdev);
758 error = get_flags(&netdev->up, &netdev->ifi_flags);
759 if (error == ENODEV) {
760 if (netdev->up.netdev_class != &netdev_internal_class) {
761 /* The device does not exist, so don't allow it to be opened. */
764 /* "Internal" netdevs have to be created as netdev objects before
765 * they exist in the kernel, because creating them in the kernel
766 * happens by passing a netdev object to dpif_port_add().
767 * Therefore, ignore the error. */
774 /* For most types of netdevs we open the device for each call of
775 * netdev_open(). However, this is not the case with tap devices,
776 * since it is only possible to open the device once. In this
777 * situation we share a single file descriptor, and consequently
778 * buffers, across all readers. Therefore once data is read it will
779 * be unavailable to other reads for tap devices. */
781 netdev_linux_construct_tap(struct netdev *netdev_)
783 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
784 static const char tap_dev[] = "/dev/net/tun";
785 const char *name = netdev_->name;
789 netdev_linux_common_construct(netdev);
791 /* Open tap device. */
792 netdev->tap_fd = open(tap_dev, O_RDWR);
793 if (netdev->tap_fd < 0) {
795 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
799 /* Create tap device. */
800 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
801 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
802 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
803 VLOG_WARN("%s: creating tap device failed: %s", name,
804 ovs_strerror(errno));
809 /* Make non-blocking. */
810 error = set_nonblocking(netdev->tap_fd);
818 close(netdev->tap_fd);
823 netdev_linux_destruct(struct netdev *netdev_)
825 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
827 if (netdev->tc && netdev->tc->ops->tc_destroy) {
828 netdev->tc->ops->tc_destroy(netdev->tc);
831 if (netdev_get_class(netdev_) == &netdev_tap_class
832 && netdev->tap_fd >= 0)
834 close(netdev->tap_fd);
837 if (netdev->miimon_interval > 0) {
838 atomic_count_dec(&miimon_cnt);
841 ovs_mutex_destroy(&netdev->mutex);
/* netdev class 'dealloc' callback: frees the storage obtained by
 * netdev_linux_alloc(). */
static void
netdev_linux_dealloc(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    free(netdev);
}
851 static struct netdev_rxq *
852 netdev_linux_rxq_alloc(void)
854 struct netdev_rxq_linux *rx = xzalloc(sizeof *rx);
859 netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
861 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
862 struct netdev *netdev_ = rx->up.netdev;
863 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
866 ovs_mutex_lock(&netdev->mutex);
867 rx->is_tap = is_tap_netdev(netdev_);
869 rx->fd = netdev->tap_fd;
871 struct sockaddr_ll sll;
873 /* Result of tcpdump -dd inbound */
874 static const struct sock_filter filt[] = {
875 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
876 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
877 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
878 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
880 static const struct sock_fprog fprog = {
881 ARRAY_SIZE(filt), (struct sock_filter *) filt
884 /* Create file descriptor. */
885 rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
888 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
893 if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) {
895 VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
896 netdev_get_name(netdev_), ovs_strerror(error));
900 /* Set non-blocking mode. */
901 error = set_nonblocking(rx->fd);
906 /* Get ethernet device index. */
907 error = get_ifindex(&netdev->up, &ifindex);
912 /* Bind to specific ethernet device. */
913 memset(&sll, 0, sizeof sll);
914 sll.sll_family = AF_PACKET;
915 sll.sll_ifindex = ifindex;
916 sll.sll_protocol = htons(ETH_P_ALL);
917 if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
919 VLOG_ERR("%s: failed to bind raw socket (%s)",
920 netdev_get_name(netdev_), ovs_strerror(error));
924 /* Filter for only inbound packets. */
925 error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
929 VLOG_ERR("%s: failed to attach filter (%s)",
930 netdev_get_name(netdev_), ovs_strerror(error));
934 ovs_mutex_unlock(&netdev->mutex);
942 ovs_mutex_unlock(&netdev->mutex);
947 netdev_linux_rxq_destruct(struct netdev_rxq *rxq_)
949 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
/* rxq 'dealloc' callback: frees the storage obtained by
 * netdev_linux_rxq_alloc(). */
static void
netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_)
{
    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
    free(rx);
}
965 auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux)
967 if (aux->tp_status & TP_STATUS_VLAN_TPID_VALID) {
968 return htons(aux->tp_vlan_tpid);
970 return htons(ETH_TYPE_VLAN);
975 auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
977 return aux->tp_vlan_tci || aux->tp_status & TP_STATUS_VLAN_VALID;
981 netdev_linux_rxq_recv_sock(int fd, struct dp_packet *buffer)
986 struct cmsghdr *cmsg;
989 char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
993 /* Reserve headroom for a single VLAN tag */
994 dp_packet_reserve(buffer, VLAN_HEADER_LEN);
995 size = dp_packet_tailroom(buffer);
997 iov.iov_base = dp_packet_data(buffer);
999 msgh.msg_name = NULL;
1000 msgh.msg_namelen = 0;
1001 msgh.msg_iov = &iov;
1002 msgh.msg_iovlen = 1;
1003 msgh.msg_control = &cmsg_buffer;
1004 msgh.msg_controllen = sizeof cmsg_buffer;
1008 retval = recvmsg(fd, &msgh, MSG_TRUNC);
1009 } while (retval < 0 && errno == EINTR);
1013 } else if (retval > size) {
1017 dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);
1019 for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg; cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
1020 const struct tpacket_auxdata *aux;
1022 if (cmsg->cmsg_level != SOL_PACKET
1023 || cmsg->cmsg_type != PACKET_AUXDATA
1024 || cmsg->cmsg_len < CMSG_LEN(sizeof(struct tpacket_auxdata))) {
1028 aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg));
1029 if (auxdata_has_vlan_tci(aux)) {
1030 if (retval < ETH_HEADER_LEN) {
1034 eth_push_vlan(buffer, auxdata_to_vlan_tpid(aux),
1035 htons(aux->tp_vlan_tci));
/* Receives one packet from the tap file descriptor 'fd' into the tailroom of
 * 'buffer', retrying on EINTR.  Returns 0 on success or a positive errno
 * value (e.g. EAGAIN when no packet is ready) on failure. */
static int
netdev_linux_rxq_recv_tap(int fd, struct dp_packet *buffer)
{
    ssize_t retval;
    size_t size = dp_packet_tailroom(buffer);

    do {
        retval = read(fd, dp_packet_data(buffer), size);
    } while (retval < 0 && errno == EINTR);

    if (retval < 0) {
        return errno;
    }

    dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);
    return 0;
}
1062 netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet **packets,
1065 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1066 struct netdev *netdev = rx->up.netdev;
1067 struct dp_packet *buffer;
1071 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
1072 mtu = ETH_PAYLOAD_MAX;
1075 buffer = dp_packet_new_with_headroom(VLAN_ETH_HEADER_LEN + mtu,
1076 DP_NETDEV_HEADROOM);
1077 retval = (rx->is_tap
1078 ? netdev_linux_rxq_recv_tap(rx->fd, buffer)
1079 : netdev_linux_rxq_recv_sock(rx->fd, buffer));
1082 if (retval != EAGAIN && retval != EMSGSIZE) {
1083 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
1084 ovs_strerror(errno), netdev_rxq_get_name(rxq_));
1086 dp_packet_delete(buffer);
1088 dp_packet_pad(buffer);
1089 dp_packet_rss_invalidate(buffer);
1090 packets[0] = buffer;
1098 netdev_linux_rxq_wait(struct netdev_rxq *rxq_)
1100 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1101 poll_fd_wait(rx->fd, POLLIN);
1105 netdev_linux_rxq_drain(struct netdev_rxq *rxq_)
1107 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1110 int error = af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_), &ifr,
1111 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
1115 drain_fd(rx->fd, ifr.ifr_qlen);
1118 return drain_rcvbuf(rx->fd);
1122 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
1123 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
1124 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
1125 * the packet is too big or too small to transmit on the device.
1127 * The caller retains ownership of 'buffer' in all cases.
1129 * The kernel maintains a packet transmission queue, so the caller is not
1130 * expected to do additional queuing of packets. */
1132 netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED,
1133 struct dp_packet **pkts, int cnt, bool may_steal)
1138 /* 'i' is incremented only if there's no error */
1139 for (i = 0; i < cnt;) {
1140 const void *data = dp_packet_data(pkts[i]);
1141 size_t size = dp_packet_size(pkts[i]);
1144 if (!is_tap_netdev(netdev_)) {
1145 /* Use our AF_PACKET socket to send to this device. */
1146 struct sockaddr_ll sll;
1152 sock = af_packet_sock();
1157 ifindex = netdev_get_ifindex(netdev_);
1162 /* We don't bother setting most fields in sockaddr_ll because the
1163 * kernel ignores them for SOCK_RAW. */
1164 memset(&sll, 0, sizeof sll);
1165 sll.sll_family = AF_PACKET;
1166 sll.sll_ifindex = ifindex;
1168 iov.iov_base = CONST_CAST(void *, data);
1171 msg.msg_name = &sll;
1172 msg.msg_namelen = sizeof sll;
1175 msg.msg_control = NULL;
1176 msg.msg_controllen = 0;
1179 retval = sendmsg(sock, &msg, 0);
1181 /* Use the tap fd to send to this device. This is essential for
1182 * tap devices, because packets sent to a tap device with an
1183 * AF_PACKET socket will loop back to be *received* again on the
1184 * tap device. This doesn't occur on other interface types
1185 * because we attach a socket filter to the rx socket. */
1186 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1188 retval = write(netdev->tap_fd, data, size);
1192 /* The Linux AF_PACKET implementation never blocks waiting for room
1193 * for packets, instead returning ENOBUFS. Translate this into
1194 * EAGAIN for the caller. */
1195 error = errno == ENOBUFS ? EAGAIN : errno;
1196 if (error == EINTR) {
1197 /* continue without incrementing 'i', i.e. retry this packet */
1201 } else if (retval != size) {
1202 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE" bytes"
1203 " of %"PRIuSIZE") on %s", retval, size,
1204 netdev_get_name(netdev_));
1209 /* Process the next packet in the batch */
1214 for (i = 0; i < cnt; i++) {
1215 dp_packet_delete(pkts[i]);
1219 if (error && error != EAGAIN) {
1220 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
1221 netdev_get_name(netdev_), ovs_strerror(error));
1228 /* Registers with the poll loop to wake up from the next call to poll_block()
1229 * when the packet transmission queue has sufficient room to transmit a packet
1230 * with netdev_send().
1232 * The kernel maintains a packet transmission queue, so the client is not
1233 * expected to do additional queuing of packets. Thus, this function is
1234 * unlikely to ever be used. It is included for completeness. */
1236 netdev_linux_send_wait(struct netdev *netdev, int qid OVS_UNUSED)
1238 if (is_tap_netdev(netdev)) {
1239 /* TAP device always accepts packets.*/
1240 poll_immediate_wake();
1244 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1245 * otherwise a positive errno value. */
1247 netdev_linux_set_etheraddr(struct netdev *netdev_, const struct eth_addr mac)
1249 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1250 enum netdev_flags old_flags = 0;
1253 ovs_mutex_lock(&netdev->mutex);
1255 if (netdev->cache_valid & VALID_ETHERADDR) {
1256 error = netdev->ether_addr_error;
1257 if (error || eth_addr_equals(netdev->etheraddr, mac)) {
1260 netdev->cache_valid &= ~VALID_ETHERADDR;
1263 /* Tap devices must be brought down before setting the address. */
1264 if (is_tap_netdev(netdev_)) {
1265 update_flags(netdev, NETDEV_UP, 0, &old_flags);
1267 error = set_etheraddr(netdev_get_name(netdev_), mac);
1268 if (!error || error == ENODEV) {
1269 netdev->ether_addr_error = error;
1270 netdev->cache_valid |= VALID_ETHERADDR;
1272 netdev->etheraddr = mac;
1276 if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
1277 update_flags(netdev, 0, NETDEV_UP, &old_flags);
1281 ovs_mutex_unlock(&netdev->mutex);
1285 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1287 netdev_linux_get_etheraddr(const struct netdev *netdev_, struct eth_addr *mac)
1289 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1292 ovs_mutex_lock(&netdev->mutex);
1293 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1294 netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
1295 &netdev->etheraddr);
1296 netdev->cache_valid |= VALID_ETHERADDR;
1299 error = netdev->ether_addr_error;
1301 *mac = netdev->etheraddr;
1303 ovs_mutex_unlock(&netdev->mutex);
1309 netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
1313 if (!(netdev->cache_valid & VALID_MTU)) {
1316 netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
1317 netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
1318 netdev->mtu = ifr.ifr_mtu;
1319 netdev->cache_valid |= VALID_MTU;
1322 error = netdev->netdev_mtu_error;
1324 *mtup = netdev->mtu;
1330 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1331 * in bytes, not including the hardware header; thus, this is typically 1500
1332 * bytes for Ethernet devices. */
1334 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1336 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1339 ovs_mutex_lock(&netdev->mutex);
1340 error = netdev_linux_get_mtu__(netdev, mtup);
1341 ovs_mutex_unlock(&netdev->mutex);
1346 /* Sets the maximum size of transmitted (MTU) for given device using linux
1347 * networking ioctl interface.
1350 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1352 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1356 ovs_mutex_lock(&netdev->mutex);
1357 if (netdev->cache_valid & VALID_MTU) {
1358 error = netdev->netdev_mtu_error;
1359 if (error || netdev->mtu == mtu) {
1362 netdev->cache_valid &= ~VALID_MTU;
1365 error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
1366 SIOCSIFMTU, "SIOCSIFMTU");
1367 if (!error || error == ENODEV) {
1368 netdev->netdev_mtu_error = error;
1369 netdev->mtu = ifr.ifr_mtu;
1370 netdev->cache_valid |= VALID_MTU;
1373 ovs_mutex_unlock(&netdev->mutex);
1377 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1378 * On failure, returns a negative errno value. */
1380 netdev_linux_get_ifindex(const struct netdev *netdev_)
1382 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1385 ovs_mutex_lock(&netdev->mutex);
1386 error = get_ifindex(netdev_, &ifindex);
1387 ovs_mutex_unlock(&netdev->mutex);
1389 return error ? -error : ifindex;
1393 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1395 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1397 ovs_mutex_lock(&netdev->mutex);
1398 if (netdev->miimon_interval > 0) {
1399 *carrier = netdev->miimon;
1401 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1403 ovs_mutex_unlock(&netdev->mutex);
1408 static long long int
1409 netdev_linux_get_carrier_resets(const struct netdev *netdev_)
1411 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1412 long long int carrier_resets;
1414 ovs_mutex_lock(&netdev->mutex);
1415 carrier_resets = netdev->carrier_resets;
1416 ovs_mutex_unlock(&netdev->mutex);
1418 return carrier_resets;
1422 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1423 struct mii_ioctl_data *data)
1428 memset(&ifr, 0, sizeof ifr);
1429 memcpy(&ifr.ifr_data, data, sizeof *data);
1430 error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
1431 memcpy(data, &ifr.ifr_data, sizeof *data);
1437 netdev_linux_get_miimon(const char *name, bool *miimon)
1439 struct mii_ioctl_data data;
1444 memset(&data, 0, sizeof data);
1445 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1447 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1448 data.reg_num = MII_BMSR;
1449 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1453 *miimon = !!(data.val_out & BMSR_LSTATUS);
1455 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1458 struct ethtool_cmd ecmd;
1460 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1463 COVERAGE_INC(netdev_get_ethtool);
1464 memset(&ecmd, 0, sizeof ecmd);
1465 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1468 struct ethtool_value eval;
1470 memcpy(&eval, &ecmd, sizeof eval);
1471 *miimon = !!eval.data;
1473 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1481 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1482 long long int interval)
1484 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1486 ovs_mutex_lock(&netdev->mutex);
1487 interval = interval > 0 ? MAX(interval, 100) : 0;
1488 if (netdev->miimon_interval != interval) {
1489 if (interval && !netdev->miimon_interval) {
1490 atomic_count_inc(&miimon_cnt);
1491 } else if (!interval && netdev->miimon_interval) {
1492 atomic_count_dec(&miimon_cnt);
1495 netdev->miimon_interval = interval;
1496 timer_set_expired(&netdev->miimon_timer);
1498 ovs_mutex_unlock(&netdev->mutex);
1504 netdev_linux_miimon_run(void)
1506 struct shash device_shash;
1507 struct shash_node *node;
1509 shash_init(&device_shash);
1510 netdev_get_devices(&netdev_linux_class, &device_shash);
1511 SHASH_FOR_EACH (node, &device_shash) {
1512 struct netdev *netdev = node->data;
1513 struct netdev_linux *dev = netdev_linux_cast(netdev);
1516 ovs_mutex_lock(&dev->mutex);
1517 if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
1518 netdev_linux_get_miimon(dev->up.name, &miimon);
1519 if (miimon != dev->miimon) {
1520 dev->miimon = miimon;
1521 netdev_linux_changed(dev, dev->ifi_flags, 0);
1524 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1526 ovs_mutex_unlock(&dev->mutex);
1527 netdev_close(netdev);
1530 shash_destroy(&device_shash);
1534 netdev_linux_miimon_wait(void)
1536 struct shash device_shash;
1537 struct shash_node *node;
1539 shash_init(&device_shash);
1540 netdev_get_devices(&netdev_linux_class, &device_shash);
1541 SHASH_FOR_EACH (node, &device_shash) {
1542 struct netdev *netdev = node->data;
1543 struct netdev_linux *dev = netdev_linux_cast(netdev);
1545 ovs_mutex_lock(&dev->mutex);
1546 if (dev->miimon_interval > 0) {
1547 timer_wait(&dev->miimon_timer);
1549 ovs_mutex_unlock(&dev->mutex);
1550 netdev_close(netdev);
1552 shash_destroy(&device_shash);
1556 swap_uint64(uint64_t *a, uint64_t *b)
1563 /* Copies 'src' into 'dst', performing format conversion in the process.
1565 * 'src' is allowed to be misaligned. */
1567 netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1568 const struct ovs_vport_stats *src)
1570 dst->rx_packets = get_32aligned_u64(&src->rx_packets);
1571 dst->tx_packets = get_32aligned_u64(&src->tx_packets);
1572 dst->rx_bytes = get_32aligned_u64(&src->rx_bytes);
1573 dst->tx_bytes = get_32aligned_u64(&src->tx_bytes);
1574 dst->rx_errors = get_32aligned_u64(&src->rx_errors);
1575 dst->tx_errors = get_32aligned_u64(&src->tx_errors);
1576 dst->rx_dropped = get_32aligned_u64(&src->rx_dropped);
1577 dst->tx_dropped = get_32aligned_u64(&src->tx_dropped);
1579 dst->collisions = 0;
1580 dst->rx_length_errors = 0;
1581 dst->rx_over_errors = 0;
1582 dst->rx_crc_errors = 0;
1583 dst->rx_frame_errors = 0;
1584 dst->rx_fifo_errors = 0;
1585 dst->rx_missed_errors = 0;
1586 dst->tx_aborted_errors = 0;
1587 dst->tx_carrier_errors = 0;
1588 dst->tx_fifo_errors = 0;
1589 dst->tx_heartbeat_errors = 0;
1590 dst->tx_window_errors = 0;
1594 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1596 struct dpif_netlink_vport reply;
1600 error = dpif_netlink_vport_get(netdev_get_name(netdev), &reply, &buf);
1603 } else if (!reply.stats) {
1608 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
1616 get_stats_via_vport(const struct netdev *netdev_,
1617 struct netdev_stats *stats)
1619 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1621 if (!netdev->vport_stats_error ||
1622 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1625 error = get_stats_via_vport__(netdev_, stats);
1626 if (error && error != ENOENT && error != ENODEV) {
1627 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1629 netdev_get_name(netdev_), ovs_strerror(error));
1631 netdev->vport_stats_error = error;
1632 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
1636 /* Retrieves current device stats for 'netdev-linux'. */
1638 netdev_linux_get_stats(const struct netdev *netdev_,
1639 struct netdev_stats *stats)
1641 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1642 struct netdev_stats dev_stats;
1645 ovs_mutex_lock(&netdev->mutex);
1646 get_stats_via_vport(netdev_, stats);
1647 error = get_stats_via_netlink(netdev_, &dev_stats);
1649 if (!netdev->vport_stats_error) {
1652 } else if (netdev->vport_stats_error) {
1653 /* stats not available from OVS then use netdev stats. */
1656 /* Use kernel netdev's packet and byte counts since vport's counters
1657 * do not reflect packet counts on the wire when GSO, TSO or GRO are
1659 stats->rx_packets = dev_stats.rx_packets;
1660 stats->rx_bytes = dev_stats.rx_bytes;
1661 stats->tx_packets = dev_stats.tx_packets;
1662 stats->tx_bytes = dev_stats.tx_bytes;
1664 stats->rx_errors += dev_stats.rx_errors;
1665 stats->tx_errors += dev_stats.tx_errors;
1666 stats->rx_dropped += dev_stats.rx_dropped;
1667 stats->tx_dropped += dev_stats.tx_dropped;
1668 stats->multicast += dev_stats.multicast;
1669 stats->collisions += dev_stats.collisions;
1670 stats->rx_length_errors += dev_stats.rx_length_errors;
1671 stats->rx_over_errors += dev_stats.rx_over_errors;
1672 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1673 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1674 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1675 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1676 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1677 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1678 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1679 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1680 stats->tx_window_errors += dev_stats.tx_window_errors;
1682 ovs_mutex_unlock(&netdev->mutex);
1687 /* Retrieves current device stats for 'netdev-tap' netdev or
1688 * netdev-internal. */
1690 netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
1692 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1693 struct netdev_stats dev_stats;
1696 ovs_mutex_lock(&netdev->mutex);
1697 get_stats_via_vport(netdev_, stats);
1698 error = get_stats_via_netlink(netdev_, &dev_stats);
1700 if (!netdev->vport_stats_error) {
1703 } else if (netdev->vport_stats_error) {
1704 /* Transmit and receive stats will appear to be swapped relative to the
1705 * other ports since we are the one sending the data, not a remote
1706 * computer. For consistency, we swap them back here. This does not
1707 * apply if we are getting stats from the vport layer because it always
1708 * tracks stats from the perspective of the switch. */
1711 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1712 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1713 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1714 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1715 stats->rx_length_errors = 0;
1716 stats->rx_over_errors = 0;
1717 stats->rx_crc_errors = 0;
1718 stats->rx_frame_errors = 0;
1719 stats->rx_fifo_errors = 0;
1720 stats->rx_missed_errors = 0;
1721 stats->tx_aborted_errors = 0;
1722 stats->tx_carrier_errors = 0;
1723 stats->tx_fifo_errors = 0;
1724 stats->tx_heartbeat_errors = 0;
1725 stats->tx_window_errors = 0;
1727 /* Use kernel netdev's packet and byte counts since vport counters
1728 * do not reflect packet counts on the wire when GSO, TSO or GRO
1730 stats->rx_packets = dev_stats.tx_packets;
1731 stats->rx_bytes = dev_stats.tx_bytes;
1732 stats->tx_packets = dev_stats.rx_packets;
1733 stats->tx_bytes = dev_stats.rx_bytes;
1735 stats->rx_dropped += dev_stats.tx_dropped;
1736 stats->tx_dropped += dev_stats.rx_dropped;
1738 stats->rx_errors += dev_stats.tx_errors;
1739 stats->tx_errors += dev_stats.rx_errors;
1741 stats->multicast += dev_stats.multicast;
1742 stats->collisions += dev_stats.collisions;
1744 ovs_mutex_unlock(&netdev->mutex);
1750 netdev_internal_get_stats(const struct netdev *netdev_,
1751 struct netdev_stats *stats)
1753 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1756 ovs_mutex_lock(&netdev->mutex);
1757 get_stats_via_vport(netdev_, stats);
1758 error = netdev->vport_stats_error;
1759 ovs_mutex_unlock(&netdev->mutex);
1765 netdev_linux_read_features(struct netdev_linux *netdev)
1767 struct ethtool_cmd ecmd;
1771 if (netdev->cache_valid & VALID_FEATURES) {
1775 COVERAGE_INC(netdev_get_ethtool);
1776 memset(&ecmd, 0, sizeof ecmd);
1777 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
1778 ETHTOOL_GSET, "ETHTOOL_GSET");
1783 /* Supported features. */
1784 netdev->supported = 0;
1785 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1786 netdev->supported |= NETDEV_F_10MB_HD;
1788 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1789 netdev->supported |= NETDEV_F_10MB_FD;
1791 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1792 netdev->supported |= NETDEV_F_100MB_HD;
1794 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1795 netdev->supported |= NETDEV_F_100MB_FD;
1797 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1798 netdev->supported |= NETDEV_F_1GB_HD;
1800 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1801 netdev->supported |= NETDEV_F_1GB_FD;
1803 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1804 netdev->supported |= NETDEV_F_10GB_FD;
1806 if (ecmd.supported & SUPPORTED_TP) {
1807 netdev->supported |= NETDEV_F_COPPER;
1809 if (ecmd.supported & SUPPORTED_FIBRE) {
1810 netdev->supported |= NETDEV_F_FIBER;
1812 if (ecmd.supported & SUPPORTED_Autoneg) {
1813 netdev->supported |= NETDEV_F_AUTONEG;
1815 if (ecmd.supported & SUPPORTED_Pause) {
1816 netdev->supported |= NETDEV_F_PAUSE;
1818 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1819 netdev->supported |= NETDEV_F_PAUSE_ASYM;
1822 /* Advertised features. */
1823 netdev->advertised = 0;
1824 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1825 netdev->advertised |= NETDEV_F_10MB_HD;
1827 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1828 netdev->advertised |= NETDEV_F_10MB_FD;
1830 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1831 netdev->advertised |= NETDEV_F_100MB_HD;
1833 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1834 netdev->advertised |= NETDEV_F_100MB_FD;
1836 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1837 netdev->advertised |= NETDEV_F_1GB_HD;
1839 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1840 netdev->advertised |= NETDEV_F_1GB_FD;
1842 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1843 netdev->advertised |= NETDEV_F_10GB_FD;
1845 if (ecmd.advertising & ADVERTISED_TP) {
1846 netdev->advertised |= NETDEV_F_COPPER;
1848 if (ecmd.advertising & ADVERTISED_FIBRE) {
1849 netdev->advertised |= NETDEV_F_FIBER;
1851 if (ecmd.advertising & ADVERTISED_Autoneg) {
1852 netdev->advertised |= NETDEV_F_AUTONEG;
1854 if (ecmd.advertising & ADVERTISED_Pause) {
1855 netdev->advertised |= NETDEV_F_PAUSE;
1857 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1858 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
1861 /* Current settings. */
1862 speed = ethtool_cmd_speed(&ecmd);
1863 if (speed == SPEED_10) {
1864 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1865 } else if (speed == SPEED_100) {
1866 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1867 } else if (speed == SPEED_1000) {
1868 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1869 } else if (speed == SPEED_10000) {
1870 netdev->current = NETDEV_F_10GB_FD;
1871 } else if (speed == 40000) {
1872 netdev->current = NETDEV_F_40GB_FD;
1873 } else if (speed == 100000) {
1874 netdev->current = NETDEV_F_100GB_FD;
1875 } else if (speed == 1000000) {
1876 netdev->current = NETDEV_F_1TB_FD;
1878 netdev->current = 0;
1881 if (ecmd.port == PORT_TP) {
1882 netdev->current |= NETDEV_F_COPPER;
1883 } else if (ecmd.port == PORT_FIBRE) {
1884 netdev->current |= NETDEV_F_FIBER;
1888 netdev->current |= NETDEV_F_AUTONEG;
1892 netdev->cache_valid |= VALID_FEATURES;
1893 netdev->get_features_error = error;
1896 /* Stores the features supported by 'netdev' into of '*current', '*advertised',
1897 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
1898 * Returns 0 if successful, otherwise a positive errno value. */
1900 netdev_linux_get_features(const struct netdev *netdev_,
1901 enum netdev_features *current,
1902 enum netdev_features *advertised,
1903 enum netdev_features *supported,
1904 enum netdev_features *peer)
1906 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1909 ovs_mutex_lock(&netdev->mutex);
1910 netdev_linux_read_features(netdev);
1911 if (!netdev->get_features_error) {
1912 *current = netdev->current;
1913 *advertised = netdev->advertised;
1914 *supported = netdev->supported;
1915 *peer = 0; /* XXX */
1917 error = netdev->get_features_error;
1918 ovs_mutex_unlock(&netdev->mutex);
1923 /* Set the features advertised by 'netdev' to 'advertise'. */
1925 netdev_linux_set_advertisements(struct netdev *netdev_,
1926 enum netdev_features advertise)
1928 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1929 struct ethtool_cmd ecmd;
1932 ovs_mutex_lock(&netdev->mutex);
1934 COVERAGE_INC(netdev_get_ethtool);
1935 memset(&ecmd, 0, sizeof ecmd);
1936 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1937 ETHTOOL_GSET, "ETHTOOL_GSET");
1942 ecmd.advertising = 0;
1943 if (advertise & NETDEV_F_10MB_HD) {
1944 ecmd.advertising |= ADVERTISED_10baseT_Half;
1946 if (advertise & NETDEV_F_10MB_FD) {
1947 ecmd.advertising |= ADVERTISED_10baseT_Full;
1949 if (advertise & NETDEV_F_100MB_HD) {
1950 ecmd.advertising |= ADVERTISED_100baseT_Half;
1952 if (advertise & NETDEV_F_100MB_FD) {
1953 ecmd.advertising |= ADVERTISED_100baseT_Full;
1955 if (advertise & NETDEV_F_1GB_HD) {
1956 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1958 if (advertise & NETDEV_F_1GB_FD) {
1959 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1961 if (advertise & NETDEV_F_10GB_FD) {
1962 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1964 if (advertise & NETDEV_F_COPPER) {
1965 ecmd.advertising |= ADVERTISED_TP;
1967 if (advertise & NETDEV_F_FIBER) {
1968 ecmd.advertising |= ADVERTISED_FIBRE;
1970 if (advertise & NETDEV_F_AUTONEG) {
1971 ecmd.advertising |= ADVERTISED_Autoneg;
1973 if (advertise & NETDEV_F_PAUSE) {
1974 ecmd.advertising |= ADVERTISED_Pause;
1976 if (advertise & NETDEV_F_PAUSE_ASYM) {
1977 ecmd.advertising |= ADVERTISED_Asym_Pause;
1979 COVERAGE_INC(netdev_set_ethtool);
1980 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1981 ETHTOOL_SSET, "ETHTOOL_SSET");
1984 ovs_mutex_unlock(&netdev->mutex);
1988 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1989 * successful, otherwise a positive errno value. */
1991 netdev_linux_set_policing(struct netdev *netdev_,
1992 uint32_t kbits_rate, uint32_t kbits_burst)
1994 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1995 const char *netdev_name = netdev_get_name(netdev_);
1998 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1999 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
2000 : kbits_burst); /* Stick with user-specified value. */
2002 ovs_mutex_lock(&netdev->mutex);
2003 if (netdev->cache_valid & VALID_POLICING) {
2004 error = netdev->netdev_policing_error;
2005 if (error || (netdev->kbits_rate == kbits_rate &&
2006 netdev->kbits_burst == kbits_burst)) {
2007 /* Assume that settings haven't changed since we last set them. */
2010 netdev->cache_valid &= ~VALID_POLICING;
2013 COVERAGE_INC(netdev_set_policing);
2014 /* Remove any existing ingress qdisc. */
2015 error = tc_add_del_ingress_qdisc(netdev_, false);
2017 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
2018 netdev_name, ovs_strerror(error));
2023 error = tc_add_del_ingress_qdisc(netdev_, true);
2025 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
2026 netdev_name, ovs_strerror(error));
2030 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
2032 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
2033 netdev_name, ovs_strerror(error));
2038 netdev->kbits_rate = kbits_rate;
2039 netdev->kbits_burst = kbits_burst;
2042 if (!error || error == ENODEV) {
2043 netdev->netdev_policing_error = error;
2044 netdev->cache_valid |= VALID_POLICING;
2046 ovs_mutex_unlock(&netdev->mutex);
2051 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
2054 const struct tc_ops *const *opsp;
2056 for (opsp = tcs; *opsp != NULL; opsp++) {
2057 const struct tc_ops *ops = *opsp;
2058 if (ops->tc_install && ops->ovs_name[0] != '\0') {
2059 sset_add(types, ops->ovs_name);
2065 static const struct tc_ops *
2066 tc_lookup_ovs_name(const char *name)
2068 const struct tc_ops *const *opsp;
2070 for (opsp = tcs; *opsp != NULL; opsp++) {
2071 const struct tc_ops *ops = *opsp;
2072 if (!strcmp(name, ops->ovs_name)) {
2079 static const struct tc_ops *
2080 tc_lookup_linux_name(const char *name)
2082 const struct tc_ops *const *opsp;
2084 for (opsp = tcs; *opsp != NULL; opsp++) {
2085 const struct tc_ops *ops = *opsp;
2086 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
2093 static struct tc_queue *
2094 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
2097 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2098 struct tc_queue *queue;
2100 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
2101 if (queue->queue_id == queue_id) {
/* Convenience wrapper for tc_find_queue__() that computes the hash. */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
}
2115 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
2117 struct netdev_qos_capabilities *caps)
2119 const struct tc_ops *ops = tc_lookup_ovs_name(type);
2123 caps->n_queues = ops->n_queues;
2128 netdev_linux_get_qos(const struct netdev *netdev_,
2129 const char **typep, struct smap *details)
2131 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2134 ovs_mutex_lock(&netdev->mutex);
2135 error = tc_query_qdisc(netdev_);
2137 *typep = netdev->tc->ops->ovs_name;
2138 error = (netdev->tc->ops->qdisc_get
2139 ? netdev->tc->ops->qdisc_get(netdev_, details)
2142 ovs_mutex_unlock(&netdev->mutex);
2148 netdev_linux_set_qos(struct netdev *netdev_,
2149 const char *type, const struct smap *details)
2151 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2152 const struct tc_ops *new_ops;
2155 new_ops = tc_lookup_ovs_name(type);
2156 if (!new_ops || !new_ops->tc_install) {
2160 ovs_mutex_lock(&netdev->mutex);
2161 error = tc_query_qdisc(netdev_);
2166 if (new_ops == netdev->tc->ops) {
2167 error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
2169 /* Delete existing qdisc. */
2170 error = tc_del_qdisc(netdev_);
2174 ovs_assert(netdev->tc == NULL);
2176 /* Install new qdisc. */
2177 error = new_ops->tc_install(netdev_, details);
2178 ovs_assert((error == 0) == (netdev->tc != NULL));
2182 ovs_mutex_unlock(&netdev->mutex);
2187 netdev_linux_get_queue(const struct netdev *netdev_,
2188 unsigned int queue_id, struct smap *details)
2190 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2193 ovs_mutex_lock(&netdev->mutex);
2194 error = tc_query_qdisc(netdev_);
2196 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2198 ? netdev->tc->ops->class_get(netdev_, queue, details)
2201 ovs_mutex_unlock(&netdev->mutex);
2207 netdev_linux_set_queue(struct netdev *netdev_,
2208 unsigned int queue_id, const struct smap *details)
2210 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2213 ovs_mutex_lock(&netdev->mutex);
2214 error = tc_query_qdisc(netdev_);
2216 error = (queue_id < netdev->tc->ops->n_queues
2217 && netdev->tc->ops->class_set
2218 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
2221 ovs_mutex_unlock(&netdev->mutex);
2227 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
2229 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2232 ovs_mutex_lock(&netdev->mutex);
2233 error = tc_query_qdisc(netdev_);
2235 if (netdev->tc->ops->class_delete) {
2236 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2238 ? netdev->tc->ops->class_delete(netdev_, queue)
2244 ovs_mutex_unlock(&netdev->mutex);
2250 netdev_linux_get_queue_stats(const struct netdev *netdev_,
2251 unsigned int queue_id,
2252 struct netdev_queue_stats *stats)
2254 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2257 ovs_mutex_lock(&netdev->mutex);
2258 error = tc_query_qdisc(netdev_);
2260 if (netdev->tc->ops->class_get_stats) {
2261 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2263 stats->created = queue->created;
2264 error = netdev->tc->ops->class_get_stats(netdev_, queue,
2273 ovs_mutex_unlock(&netdev->mutex);
2278 struct queue_dump_state {
2279 struct nl_dump dump;
2284 start_queue_dump(const struct netdev *netdev, struct queue_dump_state *state)
2286 struct ofpbuf request;
2287 struct tcmsg *tcmsg;
2289 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
2293 tcmsg->tcm_parent = 0;
2294 nl_dump_start(&state->dump, NETLINK_ROUTE, &request);
2295 ofpbuf_uninit(&request);
2297 ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
2302 finish_queue_dump(struct queue_dump_state *state)
2304 ofpbuf_uninit(&state->buf);
2305 return nl_dump_done(&state->dump);
2308 struct netdev_linux_queue_state {
2309 unsigned int *queues;
2315 netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
2317 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2320 ovs_mutex_lock(&netdev->mutex);
2321 error = tc_query_qdisc(netdev_);
2323 if (netdev->tc->ops->class_get) {
2324 struct netdev_linux_queue_state *state;
2325 struct tc_queue *queue;
2328 *statep = state = xmalloc(sizeof *state);
2329 state->n_queues = hmap_count(&netdev->tc->queues);
2330 state->cur_queue = 0;
2331 state->queues = xmalloc(state->n_queues * sizeof *state->queues);
2334 HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
2335 state->queues[i++] = queue->queue_id;
2341 ovs_mutex_unlock(&netdev->mutex);
2347 netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
2348 unsigned int *queue_idp, struct smap *details)
2350 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2351 struct netdev_linux_queue_state *state = state_;
2354 ovs_mutex_lock(&netdev->mutex);
2355 while (state->cur_queue < state->n_queues) {
2356 unsigned int queue_id = state->queues[state->cur_queue++];
2357 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2360 *queue_idp = queue_id;
2361 error = netdev->tc->ops->class_get(netdev_, queue, details);
2365 ovs_mutex_unlock(&netdev->mutex);
2371 netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
2374 struct netdev_linux_queue_state *state = state_;
2376 free(state->queues);
2382 netdev_linux_dump_queue_stats(const struct netdev *netdev_,
2383 netdev_dump_queue_stats_cb *cb, void *aux)
2385 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2388 ovs_mutex_lock(&netdev->mutex);
2389 error = tc_query_qdisc(netdev_);
2391 struct queue_dump_state state;
2393 if (!netdev->tc->ops->class_dump_stats) {
2395 } else if (!start_queue_dump(netdev_, &state)) {
2401 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
2402 retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
2409 retval = finish_queue_dump(&state);
2415 ovs_mutex_unlock(&netdev->mutex);
2421 netdev_linux_get_in4(const struct netdev *netdev_,
2422 struct in_addr *address, struct in_addr *netmask)
2424 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2427 ovs_mutex_lock(&netdev->mutex);
2428 if (!(netdev->cache_valid & VALID_IN4)) {
2429 error = netdev_linux_get_ipv4(netdev_, &netdev->address,
2430 SIOCGIFADDR, "SIOCGIFADDR");
2432 error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
2433 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2435 netdev->in4_error = error;
2436 netdev->cache_valid |= VALID_IN4;
2438 error = netdev->in4_error;
2442 if (netdev->address.s_addr != INADDR_ANY) {
2443 *address = netdev->address;
2444 *netmask = netdev->netmask;
2446 error = EADDRNOTAVAIL;
2449 ovs_mutex_unlock(&netdev->mutex);
2455 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2456 struct in_addr netmask)
2458 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2461 ovs_mutex_lock(&netdev->mutex);
2462 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2464 netdev->address = address;
2465 netdev->netmask = netmask;
2466 if (address.s_addr != INADDR_ANY) {
2467 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2468 "SIOCSIFNETMASK", netmask);
2473 netdev->cache_valid |= VALID_IN4;
2474 netdev->in4_error = 0;
2476 netdev->cache_valid &= ~VALID_IN4;
2478 ovs_mutex_unlock(&netdev->mutex);
/* Parses one line of /proc/net/if_inet6 into a binary IPv6 address
 * ('in6') and the interface name ('ifname', at most 16 chars + NUL).
 * The first 32 hex digits are the address, byte by byte; the next four
 * fields (index, prefix length, scope, flags) are skipped. */
2484 parse_if_inet6_line(const char *line,
2485                     struct in6_addr *in6, char ifname[16 + 1])
2487     uint8_t *s6 = in6->s6_addr;
2488 #define X8 "%2"SCNx8
2489     return ovs_scan(line,
2490                     " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
2491                     "%*x %*x %*x %*x %16s\n",
2492                     &s6[0], &s6[1], &s6[2], &s6[3],
2493                     &s6[4], &s6[5], &s6[6], &s6[7],
2494                     &s6[8], &s6[9], &s6[10], &s6[11],
2495                     &s6[12], &s6[13], &s6[14], &s6[15],
2499 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address.
2500  * Otherwise, sets '*in6' to 'in6addr_any' and returns the corresponding
 * Scans /proc/net/if_inet6 for a line matching this device's name and
 * caches the result (VALID_IN6).  EOPNOTSUPP is recorded if the proc
 * file cannot be opened; EADDRNOTAVAIL if no matching line is found. */
2503 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2505     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2508     ovs_mutex_lock(&netdev->mutex);
2509     if (!(netdev->cache_valid & VALID_IN6)) {
2513         netdev->in6 = in6addr_any;
2514         netdev->in6_error = EADDRNOTAVAIL;
2516         file = fopen("/proc/net/if_inet6", "r");
2518             const char *name = netdev_get_name(netdev_);
2519             while (fgets(line, sizeof line, file)) {
2520                 struct in6_addr in6_tmp;
2521                 char ifname[16 + 1];
2522                 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2523                     && !strcmp(name, ifname))
2525                     netdev->in6 = in6_tmp;
2526                     netdev->in6_error = 0;
2532             netdev->in6_error = EOPNOTSUPP;
2534         netdev->cache_valid |= VALID_IN6;
2537     error = netdev->in6_error;
2538     ovs_mutex_unlock(&netdev->mutex);
/* Fills '*sa' with an AF_INET sockaddr holding IPv4 address 'addr' and
 * port 0, zeroing any bytes of '*sa' beyond the sockaddr_in payload. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in sin = {
        .sin_family = AF_INET,
        .sin_addr = addr,
        .sin_port = 0,
    };

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &sin, sizeof sin);
}
/* Issues the given SIOCSIF* 'ioctl_nr' on 'netdev' with 'addr' packed
 * into ifr_addr.  'ioctl_name' is used only for error reporting. */
2557 do_set_addr(struct netdev *netdev,
2558             int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2562     make_in4_sockaddr(&ifr.ifr_addr, addr);
2563     return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2567 /* Adds 'router' as a default IP gateway. */
/* Installs a default route (dst 0.0.0.0/0) via 'router' with SIOCADDRT.
 * The 'netdev' argument is unused: the kernel picks the output device
 * from the gateway address. */
2569 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2571     struct in_addr any = { INADDR_ANY };
2575     memset(&rt, 0, sizeof rt);
2576     make_in4_sockaddr(&rt.rt_dst, any);
2577     make_in4_sockaddr(&rt.rt_gateway, router);
2578     make_in4_sockaddr(&rt.rt_genmask, any);
2579     rt.rt_flags = RTF_UP | RTF_GATEWAY;
2580     error = af_inet_ioctl(SIOCADDRT, &rt);
2582         VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
/* Resolves the next hop toward 'host' by scanning the kernel routing
 * table in /proc/net/route.  On a match, stores the gateway (or 0 if the
 * host is directly reachable) in '*next_hop' and a malloc'd device name
 * in '*netdev_name' (caller frees).
 * NOTE(review): the fields in that proc file are hex in network byte
 * order except flags/refcnt/use/metric/mtu/window/irtt. */
2588 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2591     static const char fn[] = "/proc/net/route";
2596     *netdev_name = NULL;
2597     stream = fopen(fn, "r");
2598     if (stream == NULL) {
2599         VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
2604     while (fgets(line, sizeof line, stream)) {
2607         ovs_be32 dest, gateway, mask;
2608         int refcnt, metric, mtu;
2609         unsigned int flags, use, window, irtt;
2612                       "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2614                      iface, &dest, &gateway, &flags, &refcnt,
2615                      &use, &metric, &mask, &mtu, &window, &irtt)) {
2616             VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2620         if (!(flags & RTF_UP)) {
2621             /* Skip routes that aren't up. */
2625         /* The output of 'dest', 'mask', and 'gateway' were given in
2626          * network byte order, so we don't need need any endian
2627          * conversions here. */
2628         if ((dest & mask) == (host->s_addr & mask)) {
2630             /* The host is directly reachable. */
2631             next_hop->s_addr = 0;
2633             /* To reach the host, we must go through a gateway. */
2634             next_hop->s_addr = gateway;
2636             *netdev_name = xstrdup(iface);
/* Populates 'smap' with driver name/version and firmware version taken
 * from an ETHTOOL_GDRVINFO query, cached under VALID_DRVINFO. */
2648 netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
2650     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2653     ovs_mutex_lock(&netdev->mutex);
2654     if (!(netdev->cache_valid & VALID_DRVINFO)) {
     /* drvinfo is passed through the generic ethtool helper, which takes
      * an ethtool_cmd; the cast reinterprets the same storage. */
2655         struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
2657         COVERAGE_INC(netdev_get_ethtool);
2658         memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2659         error = netdev_linux_do_ethtool(netdev->up.name,
2662                                         "ETHTOOL_GDRVINFO");
2664             netdev->cache_valid |= VALID_DRVINFO;
2669         smap_add(smap, "driver_name", netdev->drvinfo.driver);
2670         smap_add(smap, "driver_version", netdev->drvinfo.version);
2671         smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
2673     ovs_mutex_unlock(&netdev->mutex);
/* get_status for internal devices: they have no real driver, so report a
 * fixed driver name. */
2679 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2682     smap_add(smap, "driver_name", "openvswitch");
2686 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2687  * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2688  * returns 0. Otherwise, it returns a positive errno value; in particular,
2689  * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2691 netdev_linux_arp_lookup(const struct netdev *netdev,
2692                         ovs_be32 ip, struct eth_addr *mac)
2695     struct sockaddr_in sin;
2698     memset(&r, 0, sizeof r);
2699     memset(&sin, 0, sizeof sin);
2700     sin.sin_family = AF_INET;
2701     sin.sin_addr.s_addr = ip;
     /* arp_pa is a generic sockaddr; copy the sockaddr_in into it. */
2703     memcpy(&r.arp_pa, &sin, sizeof sin);
2704     r.arp_ha.sa_family = ARPHRD_ETHER;
2706     ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2707     COVERAGE_INC(netdev_arp_lookup);
2708     retval = af_inet_ioctl(SIOCGARP, &r);
2710         memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
     /* ENXIO merely means "no entry", which the caller handles; anything
      * else is worth logging. */
2711     } else if (retval != ENXIO) {
2712         VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2713                      netdev_get_name(netdev), IP_ARGS(ip),
2714                      ovs_strerror(retval));
/* Converts generic NETDEV_* flag bits to Linux IFF_* bits. */
2720 nd_to_iff_flags(enum netdev_flags nd)
2723     if (nd & NETDEV_UP) {
2726     if (nd & NETDEV_PROMISC) {
2729     if (nd & NETDEV_LOOPBACK) {
2730         iff |= IFF_LOOPBACK;
/* Converts Linux IFF_* flag bits back to generic NETDEV_* bits. */
2736 iff_to_nd_flags(int iff)
2738     enum netdev_flags nd = 0;
2742     if (iff & IFF_PROMISC) {
2743         nd |= NETDEV_PROMISC;
2745     if (iff & IFF_LOOPBACK) {
2746         nd |= NETDEV_LOOPBACK;
/* Clears 'off' and sets 'on' in 'netdev''s interface flags, reporting the
 * previous flags in '*old_flagsp'.  The kernel is only touched when the
 * flags actually change; afterward the cached ifi_flags are re-read so
 * the cache matches whatever the kernel accepted. */
2752 update_flags(struct netdev_linux *netdev, enum netdev_flags off,
2753              enum netdev_flags on, enum netdev_flags *old_flagsp)
2754     OVS_REQUIRES(netdev->mutex)
2756     int old_flags, new_flags;
2759     old_flags = netdev->ifi_flags;
2760     *old_flagsp = iff_to_nd_flags(old_flags);
2761     new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2762     if (new_flags != old_flags) {
2763         error = set_flags(netdev_get_name(&netdev->up), new_flags);
2764         get_flags(&netdev->up, &netdev->ifi_flags);
/* Mutex-taking wrapper around update_flags() for the netdev_class
 * update_flags hook. */
2771 netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2772                           enum netdev_flags on, enum netdev_flags *old_flagsp)
2774     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2777     ovs_mutex_lock(&netdev->mutex);
2778     error = update_flags(netdev, off, on, old_flagsp);
2779     ovs_mutex_unlock(&netdev->mutex);
/* Expands to a netdev_class initializer for a Linux-backed device type.
 * The parameterized slots (CONSTRUCT, GET_STATS, GET_FEATURES, GET_STATUS)
 * are the only members that differ between the system, tap, and internal
 * device classes below; all other vtable entries are shared.
 * NOTE(review): several member lines were elided in this extract, so the
 * initializer is shown only partially here. */
2784 #define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS,          \
2785                            GET_FEATURES, GET_STATUS)            \
2791     netdev_linux_wait,                                          \
2793     netdev_linux_alloc,                                         \
2795     netdev_linux_destruct,                                      \
2796     netdev_linux_dealloc,                                       \
2797     NULL,                       /* get_config */                \
2798     NULL,                       /* set_config */                \
2799     NULL,                       /* get_tunnel_config */         \
2800     NULL,                       /* build header */              \
2801     NULL,                       /* push header */               \
2802     NULL,                       /* pop header */                \
2803     NULL,                       /* get_numa_id */               \
2804     NULL,                       /* set_multiq */                \
2806     netdev_linux_send,                                          \
2807     netdev_linux_send_wait,                                     \
2809     netdev_linux_set_etheraddr,                                 \
2810     netdev_linux_get_etheraddr,                                 \
2811     netdev_linux_get_mtu,                                       \
2812     netdev_linux_set_mtu,                                       \
2813     netdev_linux_get_ifindex,                                   \
2814     netdev_linux_get_carrier,                                   \
2815     netdev_linux_get_carrier_resets,                            \
2816     netdev_linux_set_miimon_interval,                           \
2820     netdev_linux_set_advertisements,                            \
2822     netdev_linux_set_policing,                                  \
2823     netdev_linux_get_qos_types,                                 \
2824     netdev_linux_get_qos_capabilities,                          \
2825     netdev_linux_get_qos,                                       \
2826     netdev_linux_set_qos,                                       \
2827     netdev_linux_get_queue,                                     \
2828     netdev_linux_set_queue,                                     \
2829     netdev_linux_delete_queue,                                  \
2830     netdev_linux_get_queue_stats,                               \
2831     netdev_linux_queue_dump_start,                              \
2832     netdev_linux_queue_dump_next,                               \
2833     netdev_linux_queue_dump_done,                               \
2834     netdev_linux_dump_queue_stats,                              \
2836     netdev_linux_get_in4,                                       \
2837     netdev_linux_set_in4,                                       \
2838     netdev_linux_get_in6,                                       \
2839     netdev_linux_add_router,                                    \
2840     netdev_linux_get_next_hop,                                  \
2842     netdev_linux_arp_lookup,                                    \
2844     netdev_linux_update_flags,                                  \
2846     netdev_linux_rxq_alloc,                                     \
2847     netdev_linux_rxq_construct,                                 \
2848     netdev_linux_rxq_destruct,                                  \
2849     netdev_linux_rxq_dealloc,                                   \
2850     netdev_linux_rxq_recv,                                      \
2851     netdev_linux_rxq_wait,                                      \
2852     netdev_linux_rxq_drain,                                     \
/* Class for ordinary kernel network devices ("system"). */
2855 const struct netdev_class netdev_linux_class =
2858         netdev_linux_construct,
2859         netdev_linux_get_stats,
2860         netdev_linux_get_features,
2861         netdev_linux_get_status);
/* Class for tap devices; differs only in construct and stats hooks. */
2863 const struct netdev_class netdev_tap_class =
2866         netdev_linux_construct_tap,
2867         netdev_tap_get_stats,
2868         netdev_linux_get_features,
2869         netdev_linux_get_status);
/* Class for OVS internal ports; no meaningful link features. */
2871 const struct netdev_class netdev_internal_class =
2874         netdev_linux_construct,
2875         netdev_internal_get_stats,
2876         NULL,                  /* get_features */
2877         netdev_internal_get_status);
/* CoDel traffic control class (classless qdisc: zero queues). */
2880 #define CODEL_N_QUEUES 0x0000
2882 /* In sufficiently new kernel headers these are defined as enums in
2883  * <linux/pkt_sched.h>.  Define them here as macros to help out with older
2884  * kernels.  (This overrides any enum definition in the header file but that's
2886 #define TCA_CODEL_TARGET 1
2887 #define TCA_CODEL_LIMIT 2
2888 #define TCA_CODEL_INTERVAL 3
/* Downcasts 'netdev_''s cached tc to the codel wrapper. */
2897 static struct codel *
2898 codel_get__(const struct netdev *netdev_)
2900     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2901     return CONTAINER_OF(netdev->tc, struct codel, tc);
/* Records a codel configuration in 'netdev_''s tc cache (no kernel I/O). */
2905 codel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
2908     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2909     struct codel *codel;
2911     codel = xmalloc(sizeof *codel);
2912     tc_init(&codel->tc, &tc_ops_codel);
2913     codel->target = target;
2914     codel->limit = limit;
2915     codel->interval = interval;
2917     netdev->tc = &codel->tc;
/* Replaces 'netdev''s root qdisc with codel via RTM_NEWQDISC.  Zero
 * parameters fall back to 5000 us target, 10240 packet limit, and
 * 100000 us interval. */
2921 codel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
2925     struct ofpbuf request;
2926     struct tcmsg *tcmsg;
2927     uint32_t otarget, olimit, ointerval;
2930     tc_del_qdisc(netdev);
2932     tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2933                             NLM_F_EXCL | NLM_F_CREATE, &request);
2937     tcmsg->tcm_handle = tc_make_handle(1, 0);
2938     tcmsg->tcm_parent = TC_H_ROOT;
2940     otarget = target ? target : 5000;
2941     olimit = limit ? limit : 10240;
2942     ointerval = interval ? interval : 100000;
2944     nl_msg_put_string(&request, TCA_KIND, "codel");
2945     opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2946     nl_msg_put_u32(&request, TCA_CODEL_TARGET, otarget);
2947     nl_msg_put_u32(&request, TCA_CODEL_LIMIT, olimit);
2948     nl_msg_put_u32(&request, TCA_CODEL_INTERVAL, ointerval);
2949     nl_msg_end_nested(&request, opt_offset);
2951     error = tc_transact(&request, NULL);
2953         VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
2954                      "target %u, limit %u, interval %u error %d(%s)",
2955                      netdev_get_name(netdev),
2956                      otarget, olimit, ointerval,
2957                      error, ovs_strerror(error));
/* Parses the database QoS 'details' into 'codel', applying the same
 * defaults as codel_setup_qdisc__() for missing/zero values. */
2963 codel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
2964                             const struct smap *details, struct codel *codel)
2966     const char *target_s;
2967     const char *limit_s;
2968     const char *interval_s;
2970     target_s = smap_get(details, "target");
2971     limit_s = smap_get(details, "limit");
2972     interval_s = smap_get(details, "interval");
2974     codel->target = target_s ? strtoull(target_s, NULL, 10) : 0;
2975     codel->limit = limit_s ? strtoull(limit_s, NULL, 10) : 0;
2976     codel->interval = interval_s ? strtoull(interval_s, NULL, 10) : 0;
2978     if (!codel->target) {
2979         codel->target = 5000;
2981     if (!codel->limit) {
2982         codel->limit = 10240;
2984     if (!codel->interval) {
2985         codel->interval = 100000;
/* tc_install hook: pushes the parsed config into the kernel, then caches
 * it on success. */
2990 codel_tc_install(struct netdev *netdev, const struct smap *details)
2995     codel_parse_qdisc_details__(netdev, details, &codel);
2996     error = codel_setup_qdisc__(netdev, codel.target, codel.limit,
2999         codel_install__(netdev, codel.target, codel.limit, codel.interval);
/* Extracts target/limit/interval from the kernel's nested TCA_OPTIONS. */
3005 codel_parse_tca_options__(struct nlattr *nl_options, struct codel *codel)
3007     static const struct nl_policy tca_codel_policy[] = {
3008         [TCA_CODEL_TARGET] = { .type = NL_A_U32 },
3009         [TCA_CODEL_LIMIT] = { .type = NL_A_U32 },
3010         [TCA_CODEL_INTERVAL] = { .type = NL_A_U32 }
3013     struct nlattr *attrs[ARRAY_SIZE(tca_codel_policy)];
3015     if (!nl_parse_nested(nl_options, tca_codel_policy,
3016                          attrs, ARRAY_SIZE(tca_codel_policy))) {
3017         VLOG_WARN_RL(&rl, "failed to parse CoDel class options");
3021     codel->target = nl_attr_get_u32(attrs[TCA_CODEL_TARGET]);
3022     codel->limit = nl_attr_get_u32(attrs[TCA_CODEL_LIMIT]);
3023     codel->interval = nl_attr_get_u32(attrs[TCA_CODEL_INTERVAL]);
/* tc_load hook: caches the configuration reported by the kernel. */
3028 codel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3030     struct nlattr *nlattr;
3035     error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3040     error = codel_parse_tca_options__(nlattr, &codel);
3045     codel_install__(netdev, codel.target, codel.limit, codel.interval);
/* tc_destroy hook: frees the codel wrapper allocated by codel_install__. */
3051 codel_tc_destroy(struct tc *tc)
3053     struct codel *codel = CONTAINER_OF(tc, struct codel, tc);
/* qdisc_get hook: reports the cached config back as database details. */
3059 codel_qdisc_get(const struct netdev *netdev, struct smap *details)
3061     const struct codel *codel = codel_get__(netdev);
3062     smap_add_format(details, "target", "%u", codel->target);
3063     smap_add_format(details, "limit", "%u", codel->limit);
3064     smap_add_format(details, "interval", "%u", codel->interval);
/* qdisc_set hook: re-parses 'details' and updates both the cached tc and
 * the codel_get__() view of it. */
3069 codel_qdisc_set(struct netdev *netdev, const struct smap *details)
3073     codel_parse_qdisc_details__(netdev, details, &codel);
3074     codel_install__(netdev, codel.target, codel.limit, codel.interval);
3075     codel_get__(netdev)->target = codel.target;
3076     codel_get__(netdev)->limit = codel.limit;
3077     codel_get__(netdev)->interval = codel.interval;
/* vtable binding the codel hooks above to the "linux-codel" QoS type. */
3081 static const struct tc_ops tc_ops_codel = {
3082     "codel",                      /* linux_name */
3083     "linux-codel",                /* ovs_name */
3084     CODEL_N_QUEUES,               /* n_queues */
3097 /* FQ-CoDel traffic control class. */
3099 #define FQCODEL_N_QUEUES 0x0000
3101 /* In sufficiently new kernel headers these are defined as enums in
3102  * <linux/pkt_sched.h>.  Define them here as macros to help out with older
3103  * kernels.  (This overrides any enum definition in the header file but that's
3105 #define TCA_FQ_CODEL_TARGET 1
3106 #define TCA_FQ_CODEL_LIMIT 2
3107 #define TCA_FQ_CODEL_INTERVAL 3
3108 #define TCA_FQ_CODEL_ECN 4
3109 #define TCA_FQ_CODEL_FLOWS 5
3110 #define TCA_FQ_CODEL_QUANTUM 6
/* Downcasts 'netdev_''s cached tc to the fqcodel wrapper. */
3121 static struct fqcodel *
3122 fqcodel_get__(const struct netdev *netdev_)
3124     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3125     return CONTAINER_OF(netdev->tc, struct fqcodel, tc);
/* Records an fq_codel configuration in 'netdev_''s tc cache. */
3129 fqcodel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
3130                   uint32_t interval, uint32_t flows, uint32_t quantum)
3132     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3133     struct fqcodel *fqcodel;
3135     fqcodel = xmalloc(sizeof *fqcodel);
3136     tc_init(&fqcodel->tc, &tc_ops_fqcodel);
3137     fqcodel->target = target;
3138     fqcodel->limit = limit;
3139     fqcodel->interval = interval;
3140     fqcodel->flows = flows;
3141     fqcodel->quantum = quantum;
3143     netdev->tc = &fqcodel->tc;
/* Replaces 'netdev''s root qdisc with fq_codel via RTM_NEWQDISC; zero
 * parameters fall back to the defaults named below. */
3147 fqcodel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
3148                       uint32_t interval, uint32_t flows, uint32_t quantum)
3151     struct ofpbuf request;
3152     struct tcmsg *tcmsg;
3153     uint32_t otarget, olimit, ointerval, oflows,  oquantum;
3156     tc_del_qdisc(netdev);
3158     tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3159                             NLM_F_EXCL | NLM_F_CREATE, &request);
3163     tcmsg->tcm_handle = tc_make_handle(1, 0);
3164     tcmsg->tcm_parent = TC_H_ROOT;
3166     otarget = target ? target : 5000;
3167     olimit = limit ? limit : 10240;
3168     ointerval = interval ? interval : 100000;
3169     oflows = flows ? flows : 1024;
3170     oquantum = quantum ? quantum : 1514; /* fq_codel default quantum is 1514
3173     nl_msg_put_string(&request, TCA_KIND, "fq_codel");
3174     opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3175     nl_msg_put_u32(&request, TCA_FQ_CODEL_TARGET, otarget);
3176     nl_msg_put_u32(&request, TCA_FQ_CODEL_LIMIT, olimit);
3177     nl_msg_put_u32(&request, TCA_FQ_CODEL_INTERVAL, ointerval);
3178     nl_msg_put_u32(&request, TCA_FQ_CODEL_FLOWS, oflows);
3179     nl_msg_put_u32(&request, TCA_FQ_CODEL_QUANTUM, oquantum);
3180     nl_msg_end_nested(&request, opt_offset);
3182     error = tc_transact(&request, NULL);
3184         VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3185                      "target %u, limit %u, interval %u, flows %u, quantum %u error %d(%s)",
3186                      netdev_get_name(netdev),
3187                      otarget, olimit, ointerval, oflows, oquantum,
3188                      error, ovs_strerror(error));
3194 fqcodel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
3195 const struct smap *details, struct fqcodel *fqcodel)
3197 const char *target_s;
3198 const char *limit_s;
3199 const char *interval_s;
3200 const char *flows_s;
3201 const char *quantum_s;
3203 target_s = smap_get(details, "target");
3204 limit_s = smap_get(details, "limit");
3205 interval_s = smap_get(details, "interval");
3206 flows_s = smap_get(details, "flows");
3207 quantum_s = smap_get(details, "quantum");
3208 fqcodel->target = target_s ? strtoull(target_s, NULL, 10) : 0;
3209 fqcodel->limit = limit_s ? strtoull(limit_s, NULL, 10) : 0;
3210 fqcodel->interval = interval_s ? strtoull(interval_s, NULL, 10) : 0;
3211 fqcodel->flows = flows_s ? strtoull(flows_s, NULL, 10) : 0;
3212 fqcodel->quantum = quantum_s ? strtoull(quantum_s, NULL, 10) : 0;
3213 if (!fqcodel->target) {
3214 fqcodel->target = 5000;
3216 if (!fqcodel->limit) {
3217 fqcodel->limit = 10240;
3219 if (!fqcodel->interval) {
3220 fqcodel->interval = 1000000;
3222 if (!fqcodel->flows) {
3223 fqcodel->flows = 1024;
3225 if (!fqcodel->quantum) {
3226 fqcodel->quantum = 1514;
/* tc_install hook: pushes the parsed fq_codel config into the kernel,
 * then caches it on success. */
3231 fqcodel_tc_install(struct netdev *netdev, const struct smap *details)
3234     struct fqcodel fqcodel;
3236     fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3237     error = fqcodel_setup_qdisc__(netdev, fqcodel.target, fqcodel.limit,
3238                                   fqcodel.interval, fqcodel.flows,
3241         fqcodel_install__(netdev, fqcodel.target, fqcodel.limit,
3242                           fqcodel.interval, fqcodel.flows, fqcodel.quantum);
/* Extracts the five fq_codel parameters from the kernel's TCA_OPTIONS. */
3248 fqcodel_parse_tca_options__(struct nlattr *nl_options, struct fqcodel *fqcodel)
3250     static const struct nl_policy tca_fqcodel_policy[] = {
3251         [TCA_FQ_CODEL_TARGET] = { .type = NL_A_U32 },
3252         [TCA_FQ_CODEL_LIMIT] = { .type = NL_A_U32 },
3253         [TCA_FQ_CODEL_INTERVAL] = { .type = NL_A_U32 },
3254         [TCA_FQ_CODEL_FLOWS] = { .type = NL_A_U32 },
3255         [TCA_FQ_CODEL_QUANTUM] = { .type = NL_A_U32 }
3258     struct nlattr *attrs[ARRAY_SIZE(tca_fqcodel_policy)];
3260     if (!nl_parse_nested(nl_options, tca_fqcodel_policy,
3261                          attrs, ARRAY_SIZE(tca_fqcodel_policy))) {
3262         VLOG_WARN_RL(&rl, "failed to parse FQ_CoDel class options");
3266     fqcodel->target = nl_attr_get_u32(attrs[TCA_FQ_CODEL_TARGET]);
3267     fqcodel->limit = nl_attr_get_u32(attrs[TCA_FQ_CODEL_LIMIT]);
3268     fqcodel->interval =nl_attr_get_u32(attrs[TCA_FQ_CODEL_INTERVAL]);
3269     fqcodel->flows = nl_attr_get_u32(attrs[TCA_FQ_CODEL_FLOWS]);
3270     fqcodel->quantum = nl_attr_get_u32(attrs[TCA_FQ_CODEL_QUANTUM]);
/* tc_load hook: caches the fq_codel configuration reported by the kernel. */
3275 fqcodel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3277     struct nlattr *nlattr;
3280     struct fqcodel fqcodel;
3282     error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3287     error = fqcodel_parse_tca_options__(nlattr, &fqcodel);
3292     fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3293                       fqcodel.flows, fqcodel.quantum);
/* tc_destroy hook: frees the wrapper allocated by fqcodel_install__. */
3298 fqcodel_tc_destroy(struct tc *tc)
3300     struct fqcodel *fqcodel = CONTAINER_OF(tc, struct fqcodel, tc);
/* qdisc_get hook: reports the cached config as database details. */
3306 fqcodel_qdisc_get(const struct netdev *netdev, struct smap *details)
3308     const struct fqcodel *fqcodel = fqcodel_get__(netdev);
3309     smap_add_format(details, "target", "%u", fqcodel->target);
3310     smap_add_format(details, "limit", "%u", fqcodel->limit);
3311     smap_add_format(details, "interval", "%u", fqcodel->interval);
3312     smap_add_format(details, "flows", "%u", fqcodel->flows);
3313     smap_add_format(details, "quantum", "%u", fqcodel->quantum);
/* qdisc_set hook: re-parses 'details' and updates the cached tc. */
3318 fqcodel_qdisc_set(struct netdev *netdev, const struct smap *details)
3320     struct fqcodel fqcodel;
3322     fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3323     fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3324                       fqcodel.flows, fqcodel.quantum);
3325     fqcodel_get__(netdev)->target = fqcodel.target;
3326     fqcodel_get__(netdev)->limit = fqcodel.limit;
3327     fqcodel_get__(netdev)->interval = fqcodel.interval;
3328     fqcodel_get__(netdev)->flows = fqcodel.flows;
3329     fqcodel_get__(netdev)->quantum = fqcodel.quantum;
/* vtable binding the fq_codel hooks to the "linux-fq_codel" QoS type. */
3333 static const struct tc_ops tc_ops_fqcodel = {
3334     "fq_codel",                      /* linux_name */
3335     "linux-fq_codel",                /* ovs_name */
3336     FQCODEL_N_QUEUES,                /* n_queues */
3349 /* SFQ traffic control class. */
3351 #define SFQ_N_QUEUES 0x0000
/* Downcasts 'netdev_''s cached tc to the sfq wrapper. */
3360 sfq_get__(const struct netdev *netdev_)
3362     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3363     return CONTAINER_OF(netdev->tc, struct sfq, tc);
/* Records an sfq configuration in 'netdev_''s tc cache.
 * Argument order is (quantum, perturb). */
3367 sfq_install__(struct netdev *netdev_, uint32_t quantum, uint32_t perturb)
3369     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3372     sfq = xmalloc(sizeof *sfq);
3373     tc_init(&sfq->tc, &tc_ops_sfq);
3374     sfq->perturb = perturb;
3375     sfq->quantum = quantum;
3377     netdev->tc = &sfq->tc;
/* Replaces 'netdev''s root qdisc with sfq; a zero quantum falls back to
 * the device MTU, a zero perturb falls back to 10 seconds. */
3381 sfq_setup_qdisc__(struct netdev *netdev, uint32_t quantum, uint32_t perturb)
3383     struct tc_sfq_qopt opt;
3384     struct ofpbuf request;
3385     struct tcmsg *tcmsg;
3387     int mtu_error, error;
3388     mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3390     tc_del_qdisc(netdev);
3392     tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3393                             NLM_F_EXCL | NLM_F_CREATE, &request);
3397     tcmsg->tcm_handle = tc_make_handle(1, 0);
3398     tcmsg->tcm_parent = TC_H_ROOT;
3400     memset(&opt, 0, sizeof opt);
3403         opt.quantum = mtu; /* if we cannot find mtu, use default */
3406         opt.quantum = quantum;
3410         opt.perturb_period = 10;
3412         opt.perturb_period = perturb;
3415     nl_msg_put_string(&request, TCA_KIND, "sfq");
3416     nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3418     error = tc_transact(&request, NULL);
3420         VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3421                      "quantum %u, perturb %u error %d(%s)",
3422                      netdev_get_name(netdev),
3423                      opt.quantum, opt.perturb_period,
3424                      error, ovs_strerror(error));
/* Parses the database QoS 'details' into 'sfq'; a missing quantum
 * defaults to the device MTU (warning if the MTU is unavailable). */
3430 sfq_parse_qdisc_details__(struct netdev *netdev,
3431                           const struct smap *details, struct sfq *sfq)
3433     const char *perturb_s;
3434     const char *quantum_s;
3438     perturb_s = smap_get(details, "perturb");
3439     quantum_s = smap_get(details, "quantum");
3440     sfq->perturb = perturb_s ? strtoull(perturb_s, NULL, 10) : 0;
3441     sfq->quantum = quantum_s ? strtoull(quantum_s, NULL, 10) : 0;
3442     if (!sfq->perturb) {
3446     if (!sfq->quantum) {
3447         mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3451             VLOG_WARN_RL(&rl, "when using SFQ, you must specify quantum on a "
3452                          "device without mtu");
/* tc_install hook: pushes the parsed sfq config into the kernel, then
 * caches it on success. */
3459 sfq_tc_install(struct netdev *netdev, const struct smap *details)
3464     sfq_parse_qdisc_details__(netdev, details, &sfq);
3465     error = sfq_setup_qdisc__(netdev, sfq.quantum, sfq.perturb);
3467         sfq_install__(netdev, sfq.quantum, sfq.perturb);
3473 sfq_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3475 const struct tc_sfq_qopt *sfq;
3476 struct nlattr *nlattr;
3480 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3482 sfq = nl_attr_get(nlattr);
3483 sfq_install__(netdev, sfq->perturb_period, sfq->quantum);
/* tc_destroy hook: frees the sfq wrapper allocated by sfq_install__. */
3491 sfq_tc_destroy(struct tc *tc)
3493     struct sfq *sfq = CONTAINER_OF(tc, struct sfq, tc);
/* qdisc_get hook: reports the cached quantum/perturb as details. */
3499 sfq_qdisc_get(const struct netdev *netdev, struct smap *details)
3501     const struct sfq *sfq = sfq_get__(netdev);
3502     smap_add_format(details, "quantum", "%u", sfq->quantum);
3503     smap_add_format(details, "perturb", "%u", sfq->perturb);
/* qdisc_set hook: re-parses 'details' and updates the cached tc. */
3508 sfq_qdisc_set(struct netdev *netdev, const struct smap *details)
3512     sfq_parse_qdisc_details__(netdev, details, &sfq);
3513     sfq_install__(netdev, sfq.quantum, sfq.perturb);
3514     sfq_get__(netdev)->quantum = sfq.quantum;
3515     sfq_get__(netdev)->perturb = sfq.perturb;
/* vtable binding the sfq hooks to the "linux-sfq" QoS type. */
3519 static const struct tc_ops tc_ops_sfq = {
3520     "sfq",                       /* linux_name */
3521     "linux-sfq",                 /* ovs_name */
3522     SFQ_N_QUEUES,                /* n_queues */
3535 /* HTB traffic control class. */
3537 #define HTB_N_QUEUES 0xf000
3538 #define HTB_RATE2QUANTUM 10
/* Qdisc-level state: overall rate cap. */
3542     unsigned int max_rate;      /* In bytes/s. */
/* Per-class (per-queue) HTB parameters. */
3546     struct tc_queue tc_queue;
3547     unsigned int min_rate;      /* In bytes/s. */
3548     unsigned int max_rate;      /* In bytes/s. */
3549     unsigned int burst;         /* In bytes. */
3550     unsigned int priority;      /* Lower values are higher priorities. */
/* Downcasts 'netdev_''s cached tc to the htb wrapper. */
3554 htb_get__(const struct netdev *netdev_)
3556     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3557     return CONTAINER_OF(netdev->tc, struct htb, tc);
/* Records an HTB configuration in 'netdev_''s tc cache. */
3561 htb_install__(struct netdev *netdev_, uint64_t max_rate)
3563     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3566     htb = xmalloc(sizeof *htb);
3567     tc_init(&htb->tc, &tc_ops_htb);
3568     htb->max_rate = max_rate;
3570     netdev->tc = &htb->tc;
3573 /* Create an HTB qdisc.
3575  * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
3577 htb_setup_qdisc__(struct netdev *netdev)
3580     struct tc_htb_glob opt;
3581     struct ofpbuf request;
3582     struct tcmsg *tcmsg;
3584     tc_del_qdisc(netdev);
3586     tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3587                             NLM_F_EXCL | NLM_F_CREATE, &request);
3591     tcmsg->tcm_handle = tc_make_handle(1, 0);
3592     tcmsg->tcm_parent = TC_H_ROOT;
3594     nl_msg_put_string(&request, TCA_KIND, "htb");
3596     memset(&opt, 0, sizeof opt);
3597     opt.rate2quantum = HTB_RATE2QUANTUM;
3601     opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3602     nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
3603     nl_msg_end_nested(&request, opt_offset);
3605     return tc_transact(&request, NULL);
3608 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
3609  * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
3611 htb_setup_class__(struct netdev *netdev, unsigned int handle,
3612                   unsigned int parent, struct htb_class *class)
3615     struct tc_htb_opt opt;
3616     struct ofpbuf request;
3617     struct tcmsg *tcmsg;
     /* MTU is needed to size the rate tables; HTB cannot be configured
      * without it. */
3621     error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3623         VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
3624                      netdev_get_name(netdev));
3628     memset(&opt, 0, sizeof opt);
3629     tc_fill_rate(&opt.rate, class->min_rate, mtu);
3630     tc_fill_rate(&opt.ceil, class->max_rate, mtu);
3631     /* Makes sure the quantum is at least MTU.  Setting quantum will
3632      * make htb ignore the r2q for this class. */
3633     if ((class->min_rate / HTB_RATE2QUANTUM) < mtu) {
3636     opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
3637     opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
3638     opt.prio = class->priority;
3640     tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3644     tcmsg->tcm_handle = handle;
3645     tcmsg->tcm_parent = parent;
3647     nl_msg_put_string(&request, TCA_KIND, "htb");
3648     opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3649     nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
     /* HTB also wants precomputed rate tables for rate and ceil. */
3650     tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
3651     tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
3652     nl_msg_end_nested(&request, opt_offset);
3654     error = tc_transact(&request, NULL);
3656         VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3657                      "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
3658                      netdev_get_name(netdev),
3659                      tc_get_major(handle), tc_get_minor(handle),
3660                      tc_get_major(parent), tc_get_minor(parent),
3661                      class->min_rate, class->max_rate,
3662                      class->burst, class->priority, ovs_strerror(error));
3667 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
3668  * description of them into 'details'.  The description complies with the
3669  * specification given in the vswitch database documentation for linux-htb
3672 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
3674     static const struct nl_policy tca_htb_policy[] = {
3675         [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
3676                             .min_len = sizeof(struct tc_htb_opt) },
3679     struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
3680     const struct tc_htb_opt *htb;
3682     if (!nl_parse_nested(nl_options, tca_htb_policy,
3683                          attrs, ARRAY_SIZE(tca_htb_policy))) {
3684         VLOG_WARN_RL(&rl, "failed to parse HTB class options");
3688     htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
3689     class->min_rate = htb->rate.rate;
3690     class->max_rate = htb->ceil.rate;
     /* 'buffer' is stored in ticks; convert back to bytes at 'rate'. */
3691     class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
3692     class->priority = htb->prio;
/* Parses a TC class message: extracts the queue id from the class handle
 * (minor - 1, for 1:minor handles), plus options and/or stats when the
 * caller asks for them. */
3697 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3698                   struct htb_class *options,
3699                   struct netdev_queue_stats *stats)
3701     struct nlattr *nl_options;
3702     unsigned int handle;
3705     error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3706     if (!error && queue_id) {
3707         unsigned int major = tc_get_major(handle);
3708         unsigned int minor = tc_get_minor(handle);
3709         if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3710             *queue_id = minor - 1;
3715     if (!error && options) {
3716         error = htb_parse_tca_options__(nl_options, options);
/* Parses qdisc-level 'details': max-rate in bits/s (converted to bytes/s),
 * defaulting to the link speed (or 100 Mbps if unknown).  min_rate is set
 * equal to max_rate for the default class. */
3722 htb_parse_qdisc_details__(struct netdev *netdev_,
3723                           const struct smap *details, struct htb_class *hc)
3725     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3726     const char *max_rate_s;
3728     max_rate_s = smap_get(details, "max-rate");
3729     hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3730     if (!hc->max_rate) {
3731         enum netdev_features current;
3733         netdev_linux_read_features(netdev);
3734         current = !netdev->get_features_error ? netdev->current : 0;
3735         hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3737     hc->min_rate = hc->max_rate;
/* Parses per-queue 'details' (min-rate, max-rate, burst in bits; priority)
 * into 'hc', clamping rates into [mtu, qdisc max_rate] and burst to at
 * least mtu + 64.  Requires the device MTU. */
3743 htb_parse_class_details__(struct netdev *netdev,
3744                           const struct smap *details, struct htb_class *hc)
3746     const struct htb *htb = htb_get__(netdev);
3747     const char *min_rate_s = smap_get(details, "min-rate");
3748     const char *max_rate_s = smap_get(details, "max-rate");
3749     const char *burst_s = smap_get(details, "burst");
3750     const char *priority_s = smap_get(details, "priority");
3753     error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3755         VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
3756                      netdev_get_name(netdev));
3760     /* HTB requires at least an mtu sized min-rate to send any traffic even
3761      * on uncongested links. */
3762     hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3763     hc->min_rate = MAX(hc->min_rate, mtu);
3764     hc->min_rate = MIN(hc->min_rate, htb->max_rate);
3767     hc->max_rate = (max_rate_s
3768                     ? strtoull(max_rate_s, NULL, 10) / 8
3770     hc->max_rate = MAX(hc->max_rate, hc->min_rate);
3771     hc->max_rate = MIN(hc->max_rate, htb->max_rate);
3775      * According to hints in the documentation that I've read, it is important
3776      * that 'burst' be at least as big as the largest frame that might be
3777      * transmitted.  Also, making 'burst' a bit bigger than necessary is OK,
3778      * but having it a bit too small is a problem.  Since netdev_get_mtu()
3779      * doesn't include the Ethernet header, we need to add at least 14 (18?) to
3780      * the MTU.  We actually add 64, instead of 14, as a guard against
3781      * additional headers get tacked on somewhere that we're not aware of. */
3782     hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
3783     hc->burst = MAX(hc->burst, mtu + 64);
3786     hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Queries the kernel for one HTB class ('handle' under 'parent') and
 * parses its options and/or stats out of the reply. */
3792 htb_query_class__(const struct netdev *netdev, unsigned int handle,
3793                   unsigned int parent, struct htb_class *options,
3794                   struct netdev_queue_stats *stats)
3796     struct ofpbuf *reply;
3799     error = tc_query_class(netdev, handle, parent, &reply);
3801         error = htb_parse_tcmsg__(reply, NULL, options, stats);
3802         ofpbuf_delete(reply);
/* tc_install hook: creates the root HTB qdisc and its default class
 * 1:0xfffe, then caches the qdisc-level max_rate. */
3808 htb_tc_install(struct netdev *netdev, const struct smap *details)
3812     error = htb_setup_qdisc__(netdev);
3814         struct htb_class hc;
3816         htb_parse_qdisc_details__(netdev, details, &hc);
3817         error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3818                                   tc_make_handle(1, 0), &hc);
3820             htb_install__(netdev, hc.max_rate);
/* Downcasts a generic tc_queue to its containing htb_class. */
3826 static struct htb_class *
3827 htb_class_cast__(const struct tc_queue *queue)
3829     return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Creates or updates the cached htb_class for 'queue_id' with the values
 * in 'hc' (kernel state is not touched here). */
3833 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
3834                    const struct htb_class *hc)
3836     struct htb *htb = htb_get__(netdev);
3837     size_t hash = hash_int(queue_id, 0);
3838     struct tc_queue *queue;
3839     struct htb_class *hcp;
3841     queue = tc_find_queue__(netdev, queue_id, hash);
3843         hcp = htb_class_cast__(queue);
3845         hcp = xmalloc(sizeof *hcp);
3846         queue = &hcp->tc_queue;
3847         queue->queue_id = queue_id;
3848         queue->created = time_msec();
3849         hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
3852     hcp->min_rate = hc->min_rate;
3853     hcp->max_rate = hc->max_rate;
3854     hcp->burst = hc->burst;
3855     hcp->priority = hc->priority;
3859 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3862 struct queue_dump_state state;
3863 struct htb_class hc;
3865 /* Get qdisc options. */
3867 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3868 htb_install__(netdev, hc.max_rate);
3871 if (!start_queue_dump(netdev, &state)) {
3874 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
3875 unsigned int queue_id;
3877 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3878 htb_update_queue__(netdev, queue_id, &hc);
3881 finish_queue_dump(&state);
3887 htb_tc_destroy(struct tc *tc)
3889 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
3890 struct htb_class *hc, *next;
3892 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
3893 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
3901 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
3903 const struct htb *htb = htb_get__(netdev);
3904 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
3909 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
3911 struct htb_class hc;
3914 htb_parse_qdisc_details__(netdev, details, &hc);
3915 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3916 tc_make_handle(1, 0), &hc);
3918 htb_get__(netdev)->max_rate = hc.max_rate;
3924 htb_class_get(const struct netdev *netdev OVS_UNUSED,
3925 const struct tc_queue *queue, struct smap *details)
3927 const struct htb_class *hc = htb_class_cast__(queue);
3929 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3930 if (hc->min_rate != hc->max_rate) {
3931 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3933 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
3935 smap_add_format(details, "priority", "%u", hc->priority);
3941 htb_class_set(struct netdev *netdev, unsigned int queue_id,
3942 const struct smap *details)
3944 struct htb_class hc;
3947 error = htb_parse_class_details__(netdev, details, &hc);
3952 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3953 tc_make_handle(1, 0xfffe), &hc);
3958 htb_update_queue__(netdev, queue_id, &hc);
3963 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
3965 struct htb_class *hc = htb_class_cast__(queue);
3966 struct htb *htb = htb_get__(netdev);
3969 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3971 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
3978 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3979 struct netdev_queue_stats *stats)
3981 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3982 tc_make_handle(1, 0xfffe), NULL, stats);
3986 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3987 const struct ofpbuf *nlmsg,
3988 netdev_dump_queue_stats_cb *cb, void *aux)
3990 struct netdev_queue_stats stats;
3991 unsigned int handle, major, minor;
3994 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3999 major = tc_get_major(handle);
4000 minor = tc_get_minor(handle);
4001 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
4002 (*cb)(minor - 1, &stats, aux);
4007 static const struct tc_ops tc_ops_htb = {
4008 "htb", /* linux_name */
4009 "linux-htb", /* ovs_name */
4010 HTB_N_QUEUES, /* n_queues */
4019 htb_class_get_stats,
4020 htb_class_dump_stats
4023 /* "linux-hfsc" traffic control class. */
4025 #define HFSC_N_QUEUES 0xf000
4033 struct tc_queue tc_queue;
4038 static struct hfsc *
4039 hfsc_get__(const struct netdev *netdev_)
4041 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4042 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
4045 static struct hfsc_class *
4046 hfsc_class_cast__(const struct tc_queue *queue)
4048 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
4052 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
4054 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4057 hfsc = xmalloc(sizeof *hfsc);
4058 tc_init(&hfsc->tc, &tc_ops_hfsc);
4059 hfsc->max_rate = max_rate;
4060 netdev->tc = &hfsc->tc;
4064 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
4065 const struct hfsc_class *hc)
4069 struct hfsc_class *hcp;
4070 struct tc_queue *queue;
4072 hfsc = hfsc_get__(netdev);
4073 hash = hash_int(queue_id, 0);
4075 queue = tc_find_queue__(netdev, queue_id, hash);
4077 hcp = hfsc_class_cast__(queue);
4079 hcp = xmalloc(sizeof *hcp);
4080 queue = &hcp->tc_queue;
4081 queue->queue_id = queue_id;
4082 queue->created = time_msec();
4083 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
4086 hcp->min_rate = hc->min_rate;
4087 hcp->max_rate = hc->max_rate;
4091 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
4093 const struct tc_service_curve *rsc, *fsc, *usc;
4094 static const struct nl_policy tca_hfsc_policy[] = {
4096 .type = NL_A_UNSPEC,
4098 .min_len = sizeof(struct tc_service_curve),
4101 .type = NL_A_UNSPEC,
4103 .min_len = sizeof(struct tc_service_curve),
4106 .type = NL_A_UNSPEC,
4108 .min_len = sizeof(struct tc_service_curve),
4111 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
4113 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
4114 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
4115 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
4119 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
4120 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
4121 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
4123 if (rsc->m1 != 0 || rsc->d != 0 ||
4124 fsc->m1 != 0 || fsc->d != 0 ||
4125 usc->m1 != 0 || usc->d != 0) {
4126 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4127 "Non-linear service curves are not supported.");
4131 if (rsc->m2 != fsc->m2) {
4132 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4133 "Real-time service curves are not supported ");
4137 if (rsc->m2 > usc->m2) {
4138 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4139 "Min-rate service curve is greater than "
4140 "the max-rate service curve.");
4144 class->min_rate = fsc->m2;
4145 class->max_rate = usc->m2;
4150 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
4151 struct hfsc_class *options,
4152 struct netdev_queue_stats *stats)
4155 unsigned int handle;
4156 struct nlattr *nl_options;
4158 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
4164 unsigned int major, minor;
4166 major = tc_get_major(handle);
4167 minor = tc_get_minor(handle);
4168 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
4169 *queue_id = minor - 1;
4176 error = hfsc_parse_tca_options__(nl_options, options);
4183 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
4184 unsigned int parent, struct hfsc_class *options,
4185 struct netdev_queue_stats *stats)
4188 struct ofpbuf *reply;
4190 error = tc_query_class(netdev, handle, parent, &reply);
4195 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
4196 ofpbuf_delete(reply);
4201 hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
4202 struct hfsc_class *class)
4204 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4206 const char *max_rate_s;
4208 max_rate_s = smap_get(details, "max-rate");
4209 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
4212 enum netdev_features current;
4214 netdev_linux_read_features(netdev);
4215 current = !netdev->get_features_error ? netdev->current : 0;
4216 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
4219 class->min_rate = max_rate;
4220 class->max_rate = max_rate;
4224 hfsc_parse_class_details__(struct netdev *netdev,
4225 const struct smap *details,
4226 struct hfsc_class * class)
4228 const struct hfsc *hfsc;
4229 uint32_t min_rate, max_rate;
4230 const char *min_rate_s, *max_rate_s;
4232 hfsc = hfsc_get__(netdev);
4233 min_rate_s = smap_get(details, "min-rate");
4234 max_rate_s = smap_get(details, "max-rate");
4236 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
4237 min_rate = MAX(min_rate, 1);
4238 min_rate = MIN(min_rate, hfsc->max_rate);
4240 max_rate = (max_rate_s
4241 ? strtoull(max_rate_s, NULL, 10) / 8
4243 max_rate = MAX(max_rate, min_rate);
4244 max_rate = MIN(max_rate, hfsc->max_rate);
4246 class->min_rate = min_rate;
4247 class->max_rate = max_rate;
4252 /* Create an HFSC qdisc.
4254 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
4256 hfsc_setup_qdisc__(struct netdev * netdev)
4258 struct tcmsg *tcmsg;
4259 struct ofpbuf request;
4260 struct tc_hfsc_qopt opt;
4262 tc_del_qdisc(netdev);
4264 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
4265 NLM_F_EXCL | NLM_F_CREATE, &request);
4271 tcmsg->tcm_handle = tc_make_handle(1, 0);
4272 tcmsg->tcm_parent = TC_H_ROOT;
4274 memset(&opt, 0, sizeof opt);
4277 nl_msg_put_string(&request, TCA_KIND, "hfsc");
4278 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
4280 return tc_transact(&request, NULL);
4283 /* Create an HFSC class.
4285 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
4286 * sc rate <min_rate> ul rate <max_rate>" */
4288 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
4289 unsigned int parent, struct hfsc_class *class)
4293 struct tcmsg *tcmsg;
4294 struct ofpbuf request;
4295 struct tc_service_curve min, max;
4297 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
4303 tcmsg->tcm_handle = handle;
4304 tcmsg->tcm_parent = parent;
4308 min.m2 = class->min_rate;
4312 max.m2 = class->max_rate;
4314 nl_msg_put_string(&request, TCA_KIND, "hfsc");
4315 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4316 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
4317 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
4318 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
4319 nl_msg_end_nested(&request, opt_offset);
4321 error = tc_transact(&request, NULL);
4323 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
4324 "min-rate %ubps, max-rate %ubps (%s)",
4325 netdev_get_name(netdev),
4326 tc_get_major(handle), tc_get_minor(handle),
4327 tc_get_major(parent), tc_get_minor(parent),
4328 class->min_rate, class->max_rate, ovs_strerror(error));
4335 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
4338 struct hfsc_class class;
4340 error = hfsc_setup_qdisc__(netdev);
4346 hfsc_parse_qdisc_details__(netdev, details, &class);
4347 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4348 tc_make_handle(1, 0), &class);
4354 hfsc_install__(netdev, class.max_rate);
4359 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4362 struct queue_dump_state state;
4363 struct hfsc_class hc;
4366 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
4367 hfsc_install__(netdev, hc.max_rate);
4369 if (!start_queue_dump(netdev, &state)) {
4373 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
4374 unsigned int queue_id;
4376 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
4377 hfsc_update_queue__(netdev, queue_id, &hc);
4381 finish_queue_dump(&state);
4386 hfsc_tc_destroy(struct tc *tc)
4389 struct hfsc_class *hc, *next;
4391 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
4393 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
4394 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
4403 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
4405 const struct hfsc *hfsc;
4406 hfsc = hfsc_get__(netdev);
4407 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
4412 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
4415 struct hfsc_class class;
4417 hfsc_parse_qdisc_details__(netdev, details, &class);
4418 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4419 tc_make_handle(1, 0), &class);
4422 hfsc_get__(netdev)->max_rate = class.max_rate;
4429 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
4430 const struct tc_queue *queue, struct smap *details)
4432 const struct hfsc_class *hc;
4434 hc = hfsc_class_cast__(queue);
4435 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
4436 if (hc->min_rate != hc->max_rate) {
4437 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
4443 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
4444 const struct smap *details)
4447 struct hfsc_class class;
4449 error = hfsc_parse_class_details__(netdev, details, &class);
4454 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
4455 tc_make_handle(1, 0xfffe), &class);
4460 hfsc_update_queue__(netdev, queue_id, &class);
4465 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
4469 struct hfsc_class *hc;
4471 hc = hfsc_class_cast__(queue);
4472 hfsc = hfsc_get__(netdev);
4474 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
4476 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
4483 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
4484 struct netdev_queue_stats *stats)
4486 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
4487 tc_make_handle(1, 0xfffe), NULL, stats);
4491 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
4492 const struct ofpbuf *nlmsg,
4493 netdev_dump_queue_stats_cb *cb, void *aux)
4495 struct netdev_queue_stats stats;
4496 unsigned int handle, major, minor;
4499 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
4504 major = tc_get_major(handle);
4505 minor = tc_get_minor(handle);
4506 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
4507 (*cb)(minor - 1, &stats, aux);
4512 static const struct tc_ops tc_ops_hfsc = {
4513 "hfsc", /* linux_name */
4514 "linux-hfsc", /* ovs_name */
4515 HFSC_N_QUEUES, /* n_queues */
4516 hfsc_tc_install, /* tc_install */
4517 hfsc_tc_load, /* tc_load */
4518 hfsc_tc_destroy, /* tc_destroy */
4519 hfsc_qdisc_get, /* qdisc_get */
4520 hfsc_qdisc_set, /* qdisc_set */
4521 hfsc_class_get, /* class_get */
4522 hfsc_class_set, /* class_set */
4523 hfsc_class_delete, /* class_delete */
4524 hfsc_class_get_stats, /* class_get_stats */
4525 hfsc_class_dump_stats /* class_dump_stats */
4528 /* "linux-default" traffic control class.
4530 * This class represents the default, unnamed Linux qdisc. It corresponds to
4531 * the "" (empty string) QoS type in the OVS database. */
4534 default_install__(struct netdev *netdev_)
4536 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4537 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
4539 /* Nothing but a tc class implementation is allowed to write to a tc. This
4540 * class never does that, so we can legitimately use a const tc object. */
4541 netdev->tc = CONST_CAST(struct tc *, &tc);
4545 default_tc_install(struct netdev *netdev,
4546 const struct smap *details OVS_UNUSED)
4548 default_install__(netdev);
4553 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4555 default_install__(netdev);
4559 static const struct tc_ops tc_ops_default = {
4560 NULL, /* linux_name */
4565 NULL, /* tc_destroy */
4566 NULL, /* qdisc_get */
4567 NULL, /* qdisc_set */
4568 NULL, /* class_get */
4569 NULL, /* class_set */
4570 NULL, /* class_delete */
4571 NULL, /* class_get_stats */
4572 NULL /* class_dump_stats */
4575 /* "linux-other" traffic control class.
4580 other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
4582 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4583 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
4585 /* Nothing but a tc class implementation is allowed to write to a tc. This
4586 * class never does that, so we can legitimately use a const tc object. */
4587 netdev->tc = CONST_CAST(struct tc *, &tc);
4591 static const struct tc_ops tc_ops_other = {
4592 NULL, /* linux_name */
4593 "linux-other", /* ovs_name */
4595 NULL, /* tc_install */
4597 NULL, /* tc_destroy */
4598 NULL, /* qdisc_get */
4599 NULL, /* qdisc_set */
4600 NULL, /* class_get */
4601 NULL, /* class_set */
4602 NULL, /* class_delete */
4603 NULL, /* class_get_stats */
4604 NULL /* class_dump_stats */
/* Traffic control. */

/* Number of kernel "tc" ticks per second. */
static double ticks_per_s;

/* Number of kernel "jiffies" per second.  This is used for the purpose of
 * computing buffer sizes.  Generally kernel qdiscs need to be able to buffer
 * one jiffy's worth of data.
 *
 * There are two possibilities here:
 *
 *    - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
 *      approximate range of 100 to 1024.  That means that we really need to
 *      make sure that the qdisc can buffer that much data.
 *
 *    - 'buffer_hz' is an absurdly large number.  That means that the kernel
 *      has finely granular timers and there's no need to fudge additional room
 *      for buffers.  (There's no extra effort needed to implement that: the
 *      large 'buffer_hz' is used as a divisor, so practically any number will
 *      come out as 0 in the division.  Small integer results in the case of
 *      really high dividends won't have any real effect anyhow.)
 */
static unsigned int buffer_hz;
/* Returns tc handle 'major':'minor'. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    unsigned int maj_field = major << 16;

    return TC_H_MAKE(maj_field, minor);
}
/* Returns the major number from 'handle'. */
static unsigned int
tc_get_major(unsigned int handle)
{
    unsigned int maj_field = TC_H_MAJ(handle);

    return maj_field >> 16;
}
/* Returns the minor number from 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    return TC_H_MIN(handle);
}
4652 static struct tcmsg *
4653 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
4654 struct ofpbuf *request)
4656 struct tcmsg *tcmsg;
4660 error = get_ifindex(netdev, &ifindex);
4665 ofpbuf_init(request, 512);
4666 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
4667 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
4668 tcmsg->tcm_family = AF_UNSPEC;
4669 tcmsg->tcm_ifindex = ifindex;
4670 /* Caller should fill in tcmsg->tcm_handle. */
4671 /* Caller should fill in tcmsg->tcm_parent. */
4677 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
4679 int error = nl_transact(NETLINK_ROUTE, request, replyp);
4680 ofpbuf_uninit(request);
4684 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
4685 * policing configuration.
4687 * This function is equivalent to running the following when 'add' is true:
4688 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
4690 * This function is equivalent to running the following when 'add' is false:
4691 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
4693 * The configuration and stats may be seen with the following command:
4694 * /sbin/tc -s qdisc show dev <devname>
4696 * Returns 0 if successful, otherwise a positive errno value.
4699 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
4701 struct ofpbuf request;
4702 struct tcmsg *tcmsg;
4704 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
4705 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
4707 tcmsg = tc_make_request(netdev, type, flags, &request);
4711 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
4712 tcmsg->tcm_parent = TC_H_INGRESS;
4713 nl_msg_put_string(&request, TCA_KIND, "ingress");
4714 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
4716 error = tc_transact(&request, NULL);
4718 /* If we're deleting the qdisc, don't worry about some of the
4719 * error conditions. */
4720 if (!add && (error == ENOENT || error == EINVAL)) {
4729 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
4732 * This function is equivalent to running:
4733 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
4734 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
4737 * The configuration and stats may be seen with the following command:
4738 * /sbin/tc -s filter show dev <devname> parent ffff:
4740 * Returns 0 if successful, otherwise a positive errno value.
4743 tc_add_policer(struct netdev *netdev,
4744 uint32_t kbits_rate, uint32_t kbits_burst)
4746 struct tc_police tc_police;
4747 struct ofpbuf request;
4748 struct tcmsg *tcmsg;
4749 size_t basic_offset;
4750 size_t police_offset;
4754 memset(&tc_police, 0, sizeof tc_police);
4755 tc_police.action = TC_POLICE_SHOT;
4756 tc_police.mtu = mtu;
4757 tc_fill_rate(&tc_police.rate, ((uint64_t) kbits_rate * 1000)/8, mtu);
4759 /* The following appears wrong in two ways:
4761 * - tc_bytes_to_ticks() should take "bytes" as quantity for both of its
4762 * arguments (or at least consistently "bytes" as both or "bits" as
4763 * both), but this supplies bytes for the first argument and bits for the
4766 * - In networking a kilobit is usually 1000 bits but this uses 1024 bits.
4768 * However if you "fix" those problems then "tc filter show ..." shows
4769 * "125000b", meaning 125,000 bits, when OVS configures it for 1000 kbit ==
4770 * 1,000,000 bits, whereas this actually ends up doing the right thing from
4771 * tc's point of view. Whatever. */
4772 tc_police.burst = tc_bytes_to_ticks(
4773 tc_police.rate.rate, MIN(UINT32_MAX / 1024, kbits_burst) * 1024);
4775 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
4776 NLM_F_EXCL | NLM_F_CREATE, &request);
4780 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
4781 tcmsg->tcm_info = tc_make_handle(49,
4782 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
4784 nl_msg_put_string(&request, TCA_KIND, "basic");
4785 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4786 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
4787 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
4788 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
4789 nl_msg_end_nested(&request, police_offset);
4790 nl_msg_end_nested(&request, basic_offset);
4792 error = tc_transact(&request, NULL);
4803 /* The values in psched are not individually very meaningful, but they are
4804 * important. The tables below show some values seen in the wild.
4808 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
4809 * (Before that, there are hints that it was 1000000000.)
4811 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
4815 * -----------------------------------
4816 * [1] 000c8000 000f4240 000f4240 00000064
4817 * [2] 000003e8 00000400 000f4240 3b9aca00
4818 * [3] 000003e8 00000400 000f4240 3b9aca00
4819 * [4] 000003e8 00000400 000f4240 00000064
4820 * [5] 000003e8 00000040 000f4240 3b9aca00
4821 * [6] 000003e8 00000040 000f4240 000000f9
4823 * a b c d ticks_per_s buffer_hz
4824 * ------- --------- ---------- ------------- ----------- -------------
4825 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
4826 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4827 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4828 * [4] 1,000 1,024 1,000,000 100 976,562 100
4829 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
4830 * [6] 1,000 64 1,000,000 249 15,625,000 249
4832 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
4833 * [2] 2.6.26-1-686-bigmem from Debian lenny
4834 * [3] 2.6.26-2-sparc64 from Debian lenny
4835 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
4836 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
4837 * [6] 2.6.34 from kernel.org on KVM
4839 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4840 static const char fn[] = "/proc/net/psched";
4841 unsigned int a, b, c, d;
4844 if (!ovsthread_once_start(&once)) {
4851 stream = fopen(fn, "r");
4853 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
4857 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
4858 VLOG_WARN("%s: read failed", fn);
4862 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
4866 VLOG_WARN("%s: invalid scheduler parameters", fn);
4870 ticks_per_s = (double) a * c / b;
4874 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
4877 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
4880 ovsthread_once_done(&once);
4883 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
4884 * rate of 'rate' bytes per second. */
4886 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
4889 return (rate * ticks) / ticks_per_s;
4892 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
4893 * rate of 'rate' bytes per second. */
4895 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
4898 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
4901 /* Returns the number of bytes that need to be reserved for qdisc buffering at
4902 * a transmission rate of 'rate' bytes per second. */
4904 tc_buffer_per_jiffy(unsigned int rate)
4907 return rate / buffer_hz;
4910 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
4911 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
4912 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
4913 * stores NULL into it if it is absent.
4915 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
4918 * Returns 0 if successful, otherwise a positive errno value. */
4920 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
4921 struct nlattr **options)
4923 static const struct nl_policy tca_policy[] = {
4924 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
4925 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
4927 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4929 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4930 tca_policy, ta, ARRAY_SIZE(ta))) {
4931 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
4936 *kind = nl_attr_get_string(ta[TCA_KIND]);
4940 *options = ta[TCA_OPTIONS];
4955 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
4956 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
4957 * into '*options', and its queue statistics into '*stats'. Any of the output
4958 * arguments may be null.
4960 * Returns 0 if successful, otherwise a positive errno value. */
4962 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
4963 struct nlattr **options, struct netdev_queue_stats *stats)
4965 static const struct nl_policy tca_policy[] = {
4966 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
4967 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
4969 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4971 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4972 tca_policy, ta, ARRAY_SIZE(ta))) {
4973 VLOG_WARN_RL(&rl, "failed to parse class message");
4978 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
4979 *handlep = tc->tcm_handle;
4983 *options = ta[TCA_OPTIONS];
4987 const struct gnet_stats_queue *gsq;
4988 struct gnet_stats_basic gsb;
4990 static const struct nl_policy stats_policy[] = {
4991 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
4992 .min_len = sizeof gsb },
4993 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
4994 .min_len = sizeof *gsq },
4996 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
4998 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
4999 sa, ARRAY_SIZE(sa))) {
5000 VLOG_WARN_RL(&rl, "failed to parse class stats");
5004 /* Alignment issues screw up the length of struct gnet_stats_basic on
5005 * some arch/bitsize combinations. Newer versions of Linux have a
5006 * struct gnet_stats_basic_packed, but we can't depend on that. The
5007 * easiest thing to do is just to make a copy. */
5008 memset(&gsb, 0, sizeof gsb);
5009 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
5010 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
5011 stats->tx_bytes = gsb.bytes;
5012 stats->tx_packets = gsb.packets;
5014 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
5015 stats->tx_errors = gsq->drops;
5025 memset(stats, 0, sizeof *stats);
5030 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
5033 tc_query_class(const struct netdev *netdev,
5034 unsigned int handle, unsigned int parent,
5035 struct ofpbuf **replyp)
5037 struct ofpbuf request;
5038 struct tcmsg *tcmsg;
5041 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
5045 tcmsg->tcm_handle = handle;
5046 tcmsg->tcm_parent = parent;
5048 error = tc_transact(&request, replyp);
5050 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
5051 netdev_get_name(netdev),
5052 tc_get_major(handle), tc_get_minor(handle),
5053 tc_get_major(parent), tc_get_minor(parent),
5054 ovs_strerror(error));
5059 /* Equivalent to "tc class del dev <name> handle <handle>". */
5061 tc_delete_class(const struct netdev *netdev, unsigned int handle)
5063 struct ofpbuf request;
5064 struct tcmsg *tcmsg;
5067 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
5071 tcmsg->tcm_handle = handle;
5072 tcmsg->tcm_parent = 0;
5074 error = tc_transact(&request, NULL);
5076 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
5077 netdev_get_name(netdev),
5078 tc_get_major(handle), tc_get_minor(handle),
5079 ovs_strerror(error));
5084 /* Equivalent to "tc qdisc del dev <name> root". */
5086 tc_del_qdisc(struct netdev *netdev_)
5088 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5089 struct ofpbuf request;
5090 struct tcmsg *tcmsg;
5093 tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
5097 tcmsg->tcm_handle = tc_make_handle(1, 0);
5098 tcmsg->tcm_parent = TC_H_ROOT;
5100 error = tc_transact(&request, NULL);
5101 if (error == EINVAL) {
5102 /* EINVAL probably means that the default qdisc was in use, in which
5103 * case we've accomplished our purpose. */
5106 if (!error && netdev->tc) {
5107 if (netdev->tc->ops->tc_destroy) {
5108 netdev->tc->ops->tc_destroy(netdev->tc);
5116 getqdisc_is_safe(void)
5118 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5119 static bool safe = false;
5121 if (ovsthread_once_start(&once)) {
5122 struct utsname utsname;
5125 if (uname(&utsname) == -1) {
5126 VLOG_WARN("uname failed (%s)", ovs_strerror(errno));
5127 } else if (!ovs_scan(utsname.release, "%d.%d", &major, &minor)) {
5128 VLOG_WARN("uname reported bad OS release (%s)", utsname.release);
5129 } else if (major < 2 || (major == 2 && minor < 35)) {
5130 VLOG_INFO("disabling unsafe RTM_GETQDISC in Linux kernel %s",
5135 ovsthread_once_done(&once);
5140 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
5141 * kernel to determine what they are. Returns 0 if successful, otherwise a
5142 * positive errno value. */
5144 tc_query_qdisc(const struct netdev *netdev_)
5146 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5147 struct ofpbuf request, *qdisc;
5148 const struct tc_ops *ops;
5149 struct tcmsg *tcmsg;
5157 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
5158 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
5159 * 2.6.35 without that fix backported to it.
5161 * To avoid the OOPS, we must not make a request that would attempt to dump
5162 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
5163 * few others. There are a few ways that I can see to do this, but most of
5164 * them seem to be racy (and if you lose the race the kernel OOPSes). The
5165 * technique chosen here is to assume that any non-default qdisc that we
5166 * create will have a class with handle 1:0. The built-in qdiscs only have
5167 * a class with handle 0:0.
5169 * On Linux 2.6.35+ we use the straightforward method because it allows us
5170 * to handle non-builtin qdiscs without handle 1:0 (e.g. codel). However,
5171 * in such a case we get no response at all from the kernel (!) if a
5172 * builtin qdisc is in use (which is later caught by "!error &&
5173 * !qdisc->size"). */
5174 tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
/* Safe kernels query handle 0:0 under TC_H_ROOT; unsafe ones probe for the
 * 1:0 handle that OVS-created qdiscs are assumed to carry (see above). */
5178 tcmsg->tcm_handle = tc_make_handle(getqdisc_is_safe() ? 0 : 1, 0);
5179 tcmsg->tcm_parent = getqdisc_is_safe() ? TC_H_ROOT : 0;
5181 /* Figure out what tc class to instantiate. */
5182 error = tc_transact(&request, &qdisc);
5183 if (!error && qdisc->size) {
/* Got a reply describing a qdisc: map its kind string to a tc_ops. */
5186 error = tc_parse_qdisc(qdisc, &kind, NULL);
5188 ops = &tc_ops_other;
5190 ops = tc_lookup_linux_name(kind);
/* Kind not known to OVS: fall back to the opaque "other" ops. */
5192 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
5193 VLOG_DBG_RL(&rl2, "unknown qdisc \"%s\"", kind);
5195 ops = &tc_ops_other;
5198 } else if ((!error && !qdisc->size) || error == ENOENT) {
5199 /* Either it's a built-in qdisc, or (on Linux pre-2.6.35) it's a qdisc
5200 * set up by some other entity that doesn't have a handle 1:0. We will
5201 * assume that it's the system default qdisc. */
5202 ops = &tc_ops_default;
5205 /* Who knows? Maybe the device got deleted. */
5206 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
5207 netdev_get_name(netdev_), ovs_strerror(error));
5208 ops = &tc_ops_other;
5211 /* Instantiate it. */
5212 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
/* tc_load must set netdev->tc exactly when it succeeds. */
5213 ovs_assert((load_error == 0) == (netdev->tc != NULL));
5214 ofpbuf_delete(qdisc);
5216 return error ? error : load_error;
5219 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
5220 approximate the time to transmit packets of various lengths. For an MTU of
5221 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
5222 represents two possible packet lengths; for a MTU of 513 through 1024, four
5223 possible lengths; and so on.
5225 Returns, for the specified 'mtu', the number of bits that packet lengths
5226 need to be shifted right to fit within such a 256-entry table. */
5228 tc_calc_cell_log(unsigned int mtu)
5233 mtu = ETH_PAYLOAD_MAX;
5235 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
5237 for (cell_log = 0; mtu >= 256; cell_log++) {
5244 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
5247 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
5249 memset(rate, 0, sizeof *rate);
5250 rate->cell_log = tc_calc_cell_log(mtu);
5251 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
5252 /* rate->cell_align = 0; */ /* distro headers. */
5253 rate->mpu = ETH_TOTAL_MIN;
5257 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
5258 * attribute of the specified "type".
5260 * See tc_calc_cell_log() above for a description of "rtab"s. */
5262 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
5267 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
5268 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
5269 unsigned packet_size = (i + 1) << rate->cell_log;
5270 if (packet_size < rate->mpu) {
5271 packet_size = rate->mpu;
5273 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes'
 * of 0 is fine.) */
static unsigned int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    /* The burst must cover at least one jiffy's worth of traffic plus one
     * full packet, or the class cannot sustain its configured rate. */
    unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;

    return tc_bytes_to_ticks(Bps,
                             burst_bytes > min_burst ? burst_bytes
                                                     : min_burst);
}
5288 /* Linux-only functions declared in netdev-linux.h */
5290 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
5291 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
5293 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
5294 const char *flag_name, bool enable)
5296 const char *netdev_name = netdev_get_name(netdev);
5297 struct ethtool_value evalue;
5301 COVERAGE_INC(netdev_get_ethtool);
5302 memset(&evalue, 0, sizeof evalue);
5303 error = netdev_linux_do_ethtool(netdev_name,
5304 (struct ethtool_cmd *)&evalue,
5305 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5310 COVERAGE_INC(netdev_set_ethtool);
5311 new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
5312 if (new_flags == evalue.data) {
5315 evalue.data = new_flags;
5316 error = netdev_linux_do_ethtool(netdev_name,
5317 (struct ethtool_cmd *)&evalue,
5318 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
5323 COVERAGE_INC(netdev_get_ethtool);
5324 memset(&evalue, 0, sizeof evalue);
5325 error = netdev_linux_do_ethtool(netdev_name,
5326 (struct ethtool_cmd *)&evalue,
5327 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5332 if (new_flags != evalue.data) {
5333 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
5334 "device %s failed", enable ? "enable" : "disable",
5335 flag_name, netdev_name);
5342 /* Utility functions. */
5344 /* Copies 'src' into 'dst', performing format conversion in the process. */
5346 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
5347 const struct rtnl_link_stats *src)
5349 dst->rx_packets = src->rx_packets;
5350 dst->tx_packets = src->tx_packets;
5351 dst->rx_bytes = src->rx_bytes;
5352 dst->tx_bytes = src->tx_bytes;
5353 dst->rx_errors = src->rx_errors;
5354 dst->tx_errors = src->tx_errors;
5355 dst->rx_dropped = src->rx_dropped;
5356 dst->tx_dropped = src->tx_dropped;
5357 dst->multicast = src->multicast;
5358 dst->collisions = src->collisions;
5359 dst->rx_length_errors = src->rx_length_errors;
5360 dst->rx_over_errors = src->rx_over_errors;
5361 dst->rx_crc_errors = src->rx_crc_errors;
5362 dst->rx_frame_errors = src->rx_frame_errors;
5363 dst->rx_fifo_errors = src->rx_fifo_errors;
5364 dst->rx_missed_errors = src->rx_missed_errors;
5365 dst->tx_aborted_errors = src->tx_aborted_errors;
5366 dst->tx_carrier_errors = src->tx_carrier_errors;
5367 dst->tx_fifo_errors = src->tx_fifo_errors;
5368 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
5369 dst->tx_window_errors = src->tx_window_errors;
5372 /* Copies 'src' into 'dst', performing format conversion in the process. */
5374 netdev_stats_from_rtnl_link_stats64(struct netdev_stats *dst,
5375 const struct rtnl_link_stats64 *src)
5377 dst->rx_packets = src->rx_packets;
5378 dst->tx_packets = src->tx_packets;
5379 dst->rx_bytes = src->rx_bytes;
5380 dst->tx_bytes = src->tx_bytes;
5381 dst->rx_errors = src->rx_errors;
5382 dst->tx_errors = src->tx_errors;
5383 dst->rx_dropped = src->rx_dropped;
5384 dst->tx_dropped = src->tx_dropped;
5385 dst->multicast = src->multicast;
5386 dst->collisions = src->collisions;
5387 dst->rx_length_errors = src->rx_length_errors;
5388 dst->rx_over_errors = src->rx_over_errors;
5389 dst->rx_crc_errors = src->rx_crc_errors;
5390 dst->rx_frame_errors = src->rx_frame_errors;
5391 dst->rx_fifo_errors = src->rx_fifo_errors;
5392 dst->rx_missed_errors = src->rx_missed_errors;
5393 dst->tx_aborted_errors = src->tx_aborted_errors;
5394 dst->tx_carrier_errors = src->tx_carrier_errors;
5395 dst->tx_fifo_errors = src->tx_fifo_errors;
5396 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
5397 dst->tx_window_errors = src->tx_window_errors;
/* Fetches interface counters for 'netdev_' with an RTM_GETLINK netlink
 * request and converts them into '*stats'.  The 64-bit IFLA_STATS64
 * attribute is preferred when the kernel supplies it; the legacy 32-bit
 * IFLA_STATS attribute is the fallback.  Returns 0 if successful, otherwise
 * (judging by the visible warnings) a positive errno value.
 *
 * NOTE(review): the error-path returns and closing braces of this function
 * are missing from this excerpt; comments below cover only visible code. */
5401 get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
5403 struct ofpbuf request;
5404 struct ofpbuf *reply;
/* Build the RTM_GETLINK request: an ifinfomsg followed by the interface
 * name as an IFLA_IFNAME attribute. */
5407 ofpbuf_init(&request, 0);
5408 nl_msg_put_nlmsghdr(&request,
5409 sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
5410 RTM_GETLINK, NLM_F_REQUEST);
5411 ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
5412 nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
5413 error = nl_transact(NETLINK_ROUTE, &request, &reply);
5414 ofpbuf_uninit(&request);
/* Skip past the netlink and ifinfomsg headers to reach the attributes. */
5419 if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
5420 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS64);
5421 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats64)) {
5422 netdev_stats_from_rtnl_link_stats64(stats, nl_attr_get(a));
/* No usable 64-bit counters: fall back to the 32-bit IFLA_STATS. */
5425 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS);
5426 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
5427 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
5430 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
5435 VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");
5440 ofpbuf_delete(reply);
5445 get_flags(const struct netdev *dev, unsigned int *flags)
5451 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
5453 *flags = ifr.ifr_flags;
5459 set_flags(const char *name, unsigned int flags)
5463 ifr.ifr_flags = flags;
5464 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
5468 do_get_ifindex(const char *netdev_name)
5473 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5474 COVERAGE_INC(netdev_get_ifindex);
5476 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
5478 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
5479 netdev_name, ovs_strerror(error));
5482 return ifr.ifr_ifindex;
5486 get_ifindex(const struct netdev *netdev_, int *ifindexp)
5488 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5490 if (!(netdev->cache_valid & VALID_IFINDEX)) {
5491 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
5494 netdev->get_ifindex_error = -ifindex;
5495 netdev->ifindex = 0;
5497 netdev->get_ifindex_error = 0;
5498 netdev->ifindex = ifindex;
5500 netdev->cache_valid |= VALID_IFINDEX;
5503 *ifindexp = netdev->ifindex;
5504 return netdev->get_ifindex_error;
5508 get_etheraddr(const char *netdev_name, struct eth_addr *ea)
5514 memset(&ifr, 0, sizeof ifr);
5515 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5516 COVERAGE_INC(netdev_get_hwaddr);
5517 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
5519 /* ENODEV probably means that a vif disappeared asynchronously and
5520 * hasn't been removed from the database yet, so reduce the log level
5521 * to INFO for that case. */
5522 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
5523 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
5524 netdev_name, ovs_strerror(error));
5527 hwaddr_family = ifr.ifr_hwaddr.sa_family;
5528 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
5529 VLOG_INFO("%s device has unknown hardware address family %d",
5530 netdev_name, hwaddr_family);
5533 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
5538 set_etheraddr(const char *netdev_name, const struct eth_addr mac)
5543 memset(&ifr, 0, sizeof ifr);
5544 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5545 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
5546 memcpy(ifr.ifr_hwaddr.sa_data, &mac, ETH_ADDR_LEN);
5547 COVERAGE_INC(netdev_set_hwaddr);
5548 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
5550 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
5551 netdev_name, ovs_strerror(error));
5557 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
5558 int cmd, const char *cmd_name)
5563 memset(&ifr, 0, sizeof ifr);
5564 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
5565 ifr.ifr_data = (caddr_t) ecmd;
5568 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
5570 if (error != EOPNOTSUPP) {
5571 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
5572 "failed: %s", cmd_name, name, ovs_strerror(error));
5574 /* The device doesn't support this operation. That's pretty
5575 * common, so there's no point in logging anything. */
5582 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
5583 int cmd, const char *cmd_name)
5588 ifr.ifr_addr.sa_family = AF_INET;
5589 error = af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
5591 const struct sockaddr_in *sin = ALIGNED_CAST(struct sockaddr_in *,
5593 *ip = sin->sin_addr;
5598 /* Returns an AF_PACKET raw socket or a negative errno value. */
5600 af_packet_sock(void)
5602 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5605 if (ovsthread_once_start(&once)) {
5606 sock = socket(AF_PACKET, SOCK_RAW, 0);
5608 int error = set_nonblocking(sock);
5615 VLOG_ERR("failed to create packet socket: %s",
5616 ovs_strerror(errno));
5618 ovsthread_once_done(&once);