/*
 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2015 Nicira, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
19 #include "netdev-linux.h"
23 #include <arpa/inet.h>
25 #include <linux/filter.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <sys/types.h>
37 #include <sys/ioctl.h>
38 #include <sys/socket.h>
39 #include <sys/utsname.h>
40 #include <netpacket/packet.h>
42 #include <net/if_arp.h>
43 #include <net/if_packet.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
52 #include "dp-packet.h"
53 #include "dpif-netlink.h"
54 #include "dpif-netdev.h"
55 #include "openvswitch/dynamic-string.h"
56 #include "fatal-signal.h"
59 #include "netdev-provider.h"
60 #include "netdev-vport.h"
61 #include "netlink-notifier.h"
62 #include "netlink-socket.h"
65 #include "openflow/openflow.h"
66 #include "ovs-atomic.h"
68 #include "poll-loop.h"
69 #include "rtnetlink.h"
71 #include "socket-util.h"
74 #include "unaligned.h"
75 #include "openvswitch/vlog.h"
77 VLOG_DEFINE_THIS_MODULE(netdev_linux);
79 COVERAGE_DEFINE(netdev_set_policing);
80 COVERAGE_DEFINE(netdev_arp_lookup);
81 COVERAGE_DEFINE(netdev_get_ifindex);
82 COVERAGE_DEFINE(netdev_get_hwaddr);
83 COVERAGE_DEFINE(netdev_set_hwaddr);
84 COVERAGE_DEFINE(netdev_get_ethtool);
85 COVERAGE_DEFINE(netdev_set_ethtool);
88 /* These were introduced in Linux 2.6.14, so they might be missing if we have
90 #ifndef ADVERTISED_Pause
91 #define ADVERTISED_Pause (1 << 13)
93 #ifndef ADVERTISED_Asym_Pause
94 #define ADVERTISED_Asym_Pause (1 << 14)
97 /* These were introduced in Linux 2.6.24, so they might be missing if we
98 * have old headers. */
99 #ifndef ETHTOOL_GFLAGS
100 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
102 #ifndef ETHTOOL_SFLAGS
103 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
106 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
109 #define TC_RTAB_SIZE 1024
112 /* Linux 2.6.21 introduced struct tpacket_auxdata.
113 * Linux 2.6.27 added the tp_vlan_tci member.
114 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
115 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
116 * TP_STATUS_VLAN_TPID_VALID.
118 * With all this churn it's easiest to unconditionally define a replacement
119 * structure that has everything we want.
121 #ifndef PACKET_AUXDATA
122 #define PACKET_AUXDATA 8
124 #ifndef TP_STATUS_VLAN_VALID
125 #define TP_STATUS_VLAN_VALID (1 << 4)
127 #ifndef TP_STATUS_VLAN_TPID_VALID
128 #define TP_STATUS_VLAN_TPID_VALID (1 << 6)
130 #undef tpacket_auxdata
131 #define tpacket_auxdata rpl_tpacket_auxdata
132 struct tpacket_auxdata {
138 uint16_t tp_vlan_tci;
139 uint16_t tp_vlan_tpid;
/* Linux 2.6.27 introduced ethtool_cmd_speed
 *
 * To avoid revisiting problems reported with using configure to detect
 * compatibility (see report at
 * http://openvswitch.org/pipermail/dev/2014-October/047978.html)
 * unconditionally replace ethtool_cmd_speed. */
#define ethtool_cmd_speed rpl_ethtool_cmd_speed
static inline uint32_t rpl_ethtool_cmd_speed(const struct ethtool_cmd *ep)
{
    /* The link speed is split across two 16-bit fields ('speed' holds the
     * low bits, 'speed_hi' the high bits); reassemble the full value. */
    return ep->speed | (ep->speed_hi << 16);
}
154 /* Linux 2.6.30 introduced supported and advertised flags for
155 * 1G base KX, and 10G base KX4, KR and R. */
156 #ifndef SUPPORTED_1000baseKX_Full
157 #define SUPPORTED_1000baseKX_Full (1 << 17)
158 #define SUPPORTED_10000baseKX4_Full (1 << 18)
159 #define SUPPORTED_10000baseKR_Full (1 << 19)
160 #define SUPPORTED_10000baseR_FEC (1 << 20)
161 #define ADVERTISED_1000baseKX_Full (1 << 17)
162 #define ADVERTISED_10000baseKX4_Full (1 << 18)
163 #define ADVERTISED_10000baseKR_Full (1 << 19)
164 #define ADVERTISED_10000baseR_FEC (1 << 20)
167 /* Linux 3.5 introduced supported and advertised flags for
168 * 40G base KR4, CR4, SR4 and LR4. */
169 #ifndef SUPPORTED_40000baseKR4_Full
170 #define SUPPORTED_40000baseKR4_Full (1 << 23)
171 #define SUPPORTED_40000baseCR4_Full (1 << 24)
172 #define SUPPORTED_40000baseSR4_Full (1 << 25)
173 #define SUPPORTED_40000baseLR4_Full (1 << 26)
174 #define ADVERTISED_40000baseKR4_Full (1 << 23)
175 #define ADVERTISED_40000baseCR4_Full (1 << 24)
176 #define ADVERTISED_40000baseSR4_Full (1 << 25)
177 #define ADVERTISED_40000baseLR4_Full (1 << 26)
180 /* Linux 2.6.35 introduced IFLA_STATS64 and rtnl_link_stats64.
182 * Tests for rtnl_link_stats64 don't seem to consistently work, e.g. on
183 * 2.6.32-431.29.2.el6.x86_64 (see report at
184 * http://openvswitch.org/pipermail/dev/2014-October/047978.html). Maybe
185 * if_link.h is not self-contained on those kernels. It is easiest to
186 * unconditionally define a replacement. */
188 #define IFLA_STATS64 23
190 #define rtnl_link_stats64 rpl_rtnl_link_stats64
191 struct rtnl_link_stats64 {
203 uint64_t rx_length_errors;
204 uint64_t rx_over_errors;
205 uint64_t rx_crc_errors;
206 uint64_t rx_frame_errors;
207 uint64_t rx_fifo_errors;
208 uint64_t rx_missed_errors;
210 uint64_t tx_aborted_errors;
211 uint64_t tx_carrier_errors;
212 uint64_t tx_fifo_errors;
213 uint64_t tx_heartbeat_errors;
214 uint64_t tx_window_errors;
216 uint64_t rx_compressed;
217 uint64_t tx_compressed;
221 VALID_IFINDEX = 1 << 0,
222 VALID_ETHERADDR = 1 << 1,
225 VALID_POLICING = 1 << 4,
226 VALID_VPORT_STAT_ERROR = 1 << 5,
227 VALID_DRVINFO = 1 << 6,
228 VALID_FEATURES = 1 << 7,
231 /* Traffic control. */
233 /* An instance of a traffic control class. Always associated with a particular
236 * Each TC implementation subclasses this with whatever additional data it
239 const struct tc_ops *ops;
240 struct hmap queues; /* Contains "struct tc_queue"s.
241 * Read by generic TC layer.
242 * Written only by TC implementation. */
245 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
247 /* One traffic control queue.
249 * Each TC implementation subclasses this with whatever additional data it
252 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
253 unsigned int queue_id; /* OpenFlow queue ID. */
254 long long int created; /* Time queue was created, in msecs. */
257 /* A particular kind of traffic control. Each implementation generally maps to
258 * one particular Linux qdisc class.
260 * The functions below return 0 if successful or a positive errno value on
261 * failure, except where otherwise noted. All of them must be provided, except
262 * where otherwise noted. */
264 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
265 * This is null for tc_ops_default and tc_ops_other, for which there are no
266 * appropriate values. */
267 const char *linux_name;
269 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
270 const char *ovs_name;
272 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
273 * queues. The queues are numbered 0 through n_queues - 1. */
274 unsigned int n_queues;
276 /* Called to install this TC class on 'netdev'. The implementation should
277 * make the Netlink calls required to set up 'netdev' with the right qdisc
278 * and configure it according to 'details'. The implementation may assume
279 * that the current qdisc is the default; that is, there is no need for it
280 * to delete the current qdisc before installing itself.
282 * The contents of 'details' should be documented as valid for 'ovs_name'
283 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
284 * (which is built as ovs-vswitchd.conf.db(8)).
286 * This function must return 0 if and only if it sets 'netdev->tc' to an
287 * initialized 'struct tc'.
289 * (This function is null for tc_ops_other, which cannot be installed. For
290 * other TC classes it should always be nonnull.) */
291 int (*tc_install)(struct netdev *netdev, const struct smap *details);
293 /* Called when the netdev code determines (through a Netlink query) that
294 * this TC class's qdisc is installed on 'netdev', but we didn't install
295 * it ourselves and so don't know any of the details.
297 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
298 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
299 * implementation should parse the other attributes of 'nlmsg' as
300 * necessary to determine its configuration. If necessary it should also
301 * use Netlink queries to determine the configuration of queues on
304 * This function must return 0 if and only if it sets 'netdev->tc' to an
305 * initialized 'struct tc'. */
306 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
308 /* Destroys the data structures allocated by the implementation as part of
309 * 'tc'. (This includes destroying 'tc->queues' by calling
312 * The implementation should not need to perform any Netlink calls. If
313 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
314 * (But it may not be desirable.)
316 * This function may be null if 'tc' is trivial. */
317 void (*tc_destroy)(struct tc *tc);
319 /* Retrieves details of 'netdev->tc' configuration into 'details'.
321 * The implementation should not need to perform any Netlink calls, because
322 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
323 * cached the configuration.
325 * The contents of 'details' should be documented as valid for 'ovs_name'
326 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
327 * (which is built as ovs-vswitchd.conf.db(8)).
329 * This function may be null if 'tc' is not configurable.
331 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
333 /* Reconfigures 'netdev->tc' according to 'details', performing any
334 * required Netlink calls to complete the reconfiguration.
336 * The contents of 'details' should be documented as valid for 'ovs_name'
337 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
338 * (which is built as ovs-vswitchd.conf.db(8)).
340 * This function may be null if 'tc' is not configurable.
342 int (*qdisc_set)(struct netdev *, const struct smap *details);
344 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
345 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
347 * The contents of 'details' should be documented as valid for 'ovs_name'
348 * in the "other_config" column in the "Queue" table in
349 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
351 * The implementation should not need to perform any Netlink calls, because
352 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
353 * cached the queue configuration.
355 * This function may be null if 'tc' does not have queues ('n_queues' is
357 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
358 struct smap *details);
360 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
361 * 'details', perfoming any required Netlink calls to complete the
362 * reconfiguration. The caller ensures that 'queue_id' is less than
365 * The contents of 'details' should be documented as valid for 'ovs_name'
366 * in the "other_config" column in the "Queue" table in
367 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
369 * This function may be null if 'tc' does not have queues or its queues are
370 * not configurable. */
371 int (*class_set)(struct netdev *, unsigned int queue_id,
372 const struct smap *details);
374 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
375 * tc_queue's within 'netdev->tc->queues'.
377 * This function may be null if 'tc' does not have queues or its queues
378 * cannot be deleted. */
379 int (*class_delete)(struct netdev *, struct tc_queue *queue);
381 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
382 * 'struct tc_queue's within 'netdev->tc->queues'.
384 * On success, initializes '*stats'.
386 * This function may be null if 'tc' does not have queues or if it cannot
387 * report queue statistics. */
388 int (*class_get_stats)(const struct netdev *netdev,
389 const struct tc_queue *queue,
390 struct netdev_queue_stats *stats);
392 /* Extracts queue stats from 'nlmsg', which is a response to a
393 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
395 * This function may be null if 'tc' does not have queues or if it cannot
396 * report queue statistics. */
397 int (*class_dump_stats)(const struct netdev *netdev,
398 const struct ofpbuf *nlmsg,
399 netdev_dump_queue_stats_cb *cb, void *aux);
403 tc_init(struct tc *tc, const struct tc_ops *ops)
406 hmap_init(&tc->queues);
410 tc_destroy(struct tc *tc)
412 hmap_destroy(&tc->queues);
415 static const struct tc_ops tc_ops_htb;
416 static const struct tc_ops tc_ops_hfsc;
417 static const struct tc_ops tc_ops_codel;
418 static const struct tc_ops tc_ops_fqcodel;
419 static const struct tc_ops tc_ops_sfq;
420 static const struct tc_ops tc_ops_default;
421 static const struct tc_ops tc_ops_other;
423 static const struct tc_ops *const tcs[] = {
424 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
425 &tc_ops_hfsc, /* Hierarchical fair service curve. */
426 &tc_ops_codel, /* Controlled delay */
427 &tc_ops_fqcodel, /* Fair queue controlled delay */
428 &tc_ops_sfq, /* Stochastic fair queueing */
429 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
430 &tc_ops_other, /* Some other qdisc. */
434 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
435 static unsigned int tc_get_major(unsigned int handle);
436 static unsigned int tc_get_minor(unsigned int handle);
438 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
439 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
440 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
442 static struct tcmsg *tc_make_request(const struct netdev *, int type,
443 unsigned int flags, struct ofpbuf *);
444 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
445 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
446 static int tc_add_policer(struct netdev *,
447 uint32_t kbits_rate, uint32_t kbits_burst);
449 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
450 struct nlattr **options);
451 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
452 struct nlattr **options,
453 struct netdev_queue_stats *);
454 static int tc_query_class(const struct netdev *,
455 unsigned int handle, unsigned int parent,
456 struct ofpbuf **replyp);
457 static int tc_delete_class(const struct netdev *, unsigned int handle);
459 static int tc_del_qdisc(struct netdev *netdev);
460 static int tc_query_qdisc(const struct netdev *netdev);
462 static int tc_calc_cell_log(unsigned int mtu);
463 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
464 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
465 const struct tc_ratespec *rate);
466 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
468 struct netdev_linux {
471 /* Protects all members below. */
472 struct ovs_mutex mutex;
474 unsigned int cache_valid;
476 bool miimon; /* Link status of last poll. */
477 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
478 struct timer miimon_timer;
480 /* The following are figured out "on demand" only. They are only valid
481 * when the corresponding VALID_* bit in 'cache_valid' is set. */
483 struct eth_addr etheraddr;
485 unsigned int ifi_flags;
486 long long int carrier_resets;
487 uint32_t kbits_rate; /* Policing data. */
488 uint32_t kbits_burst;
489 int vport_stats_error; /* Cached error code from vport_get_stats().
490 0 or an errno value. */
491 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
492 int ether_addr_error; /* Cached error code from set/get etheraddr. */
493 int netdev_policing_error; /* Cached error code from set policing. */
494 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
495 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
497 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
498 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
499 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
501 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
504 /* For devices of class netdev_tap_class only. */
508 struct netdev_rxq_linux {
509 struct netdev_rxq up;
514 /* This is set pretty low because we probably won't learn anything from the
515 * additional log messages. */
516 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
518 /* Polling miimon status for all ports causes performance degradation when
519 * handling a large number of ports. If there are no devices using miimon, then
520 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
522 * Readers do not depend on this variable synchronizing with the related
523 * changes in the device miimon status, so we can use atomic_count. */
524 static atomic_count miimon_cnt = ATOMIC_COUNT_INIT(0);
526 static void netdev_linux_run(void);
528 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
529 int cmd, const char *cmd_name);
530 static int get_flags(const struct netdev *, unsigned int *flags);
531 static int set_flags(const char *, unsigned int flags);
532 static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
533 enum netdev_flags on, enum netdev_flags *old_flagsp)
534 OVS_REQUIRES(netdev->mutex);
535 static int do_get_ifindex(const char *netdev_name);
536 static int get_ifindex(const struct netdev *, int *ifindexp);
537 static int do_set_addr(struct netdev *netdev,
538 int ioctl_nr, const char *ioctl_name,
539 struct in_addr addr);
540 static int get_etheraddr(const char *netdev_name, struct eth_addr *ea);
541 static int set_etheraddr(const char *netdev_name, const struct eth_addr);
542 static int get_stats_via_netlink(const struct netdev *, struct netdev_stats *);
543 static int af_packet_sock(void);
544 static bool netdev_linux_miimon_enabled(void);
545 static void netdev_linux_miimon_run(void);
546 static void netdev_linux_miimon_wait(void);
547 static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup);
550 is_netdev_linux_class(const struct netdev_class *netdev_class)
552 return netdev_class->run == netdev_linux_run;
556 is_tap_netdev(const struct netdev *netdev)
558 return netdev_get_class(netdev) == &netdev_tap_class;
561 static struct netdev_linux *
562 netdev_linux_cast(const struct netdev *netdev)
564 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
566 return CONTAINER_OF(netdev, struct netdev_linux, up);
569 static struct netdev_rxq_linux *
570 netdev_rxq_linux_cast(const struct netdev_rxq *rx)
572 ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
573 return CONTAINER_OF(rx, struct netdev_rxq_linux, up);
576 static void netdev_linux_update(struct netdev_linux *netdev,
577 const struct rtnetlink_change *)
578 OVS_REQUIRES(netdev->mutex);
579 static void netdev_linux_changed(struct netdev_linux *netdev,
580 unsigned int ifi_flags, unsigned int mask)
581 OVS_REQUIRES(netdev->mutex);
583 /* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK,
584 * RTNLGRP_IPV4_IFADDR and RTNLGRP_IPV6_IFADDR changes, or NULL
585 * if no such socket could be created. */
586 static struct nl_sock *
587 netdev_linux_notify_sock(void)
589 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
590 static struct nl_sock *sock;
591 unsigned int mcgroups[] = {RTNLGRP_LINK, RTNLGRP_IPV4_IFADDR,
592 RTNLGRP_IPV6_IFADDR, RTNLGRP_IPV6_IFINFO};
594 if (ovsthread_once_start(&once)) {
597 error = nl_sock_create(NETLINK_ROUTE, &sock);
601 for (i = 0; i < ARRAY_SIZE(mcgroups); i++) {
602 error = nl_sock_join_mcgroup(sock, mcgroups[i]);
604 nl_sock_destroy(sock);
610 ovsthread_once_done(&once);
617 netdev_linux_miimon_enabled(void)
619 return atomic_count_get(&miimon_cnt) > 0;
623 netdev_linux_run(void)
625 struct nl_sock *sock;
628 if (netdev_linux_miimon_enabled()) {
629 netdev_linux_miimon_run();
632 sock = netdev_linux_notify_sock();
638 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
639 uint64_t buf_stub[4096 / 8];
642 ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
643 error = nl_sock_recv(sock, &buf, false);
645 struct rtnetlink_change change;
647 if (rtnetlink_parse(&buf, &change)) {
648 struct netdev *netdev_ = NULL;
649 char dev_name[IFNAMSIZ];
651 if (!change.ifname) {
652 change.ifname = if_indextoname(change.if_index, dev_name);
656 netdev_ = netdev_from_name(change.ifname);
658 if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
659 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
661 ovs_mutex_lock(&netdev->mutex);
662 netdev_linux_update(netdev, &change);
663 ovs_mutex_unlock(&netdev->mutex);
665 netdev_close(netdev_);
667 } else if (error == ENOBUFS) {
668 struct shash device_shash;
669 struct shash_node *node;
673 shash_init(&device_shash);
674 netdev_get_devices(&netdev_linux_class, &device_shash);
675 SHASH_FOR_EACH (node, &device_shash) {
676 struct netdev *netdev_ = node->data;
677 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
680 ovs_mutex_lock(&netdev->mutex);
681 get_flags(netdev_, &flags);
682 netdev_linux_changed(netdev, flags, 0);
683 ovs_mutex_unlock(&netdev->mutex);
685 netdev_close(netdev_);
687 shash_destroy(&device_shash);
688 } else if (error != EAGAIN) {
689 VLOG_WARN_RL(&rl, "error reading or parsing netlink (%s)",
690 ovs_strerror(error));
/* 'wait' hook paired with netdev_linux_run(): arranges for the poll loop to
 * wake when miimon polling is due or the rtnetlink socket becomes readable. */
static void
netdev_linux_wait(void)
{
    struct nl_sock *sock;

    if (netdev_linux_miimon_enabled()) {
        netdev_linux_miimon_wait();
    }
    sock = netdev_linux_notify_sock();
    if (sock) {
        nl_sock_wait(sock, POLLIN);
    }
}
711 netdev_linux_changed(struct netdev_linux *dev,
712 unsigned int ifi_flags, unsigned int mask)
713 OVS_REQUIRES(dev->mutex)
715 netdev_change_seq_changed(&dev->up);
717 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
718 dev->carrier_resets++;
720 dev->ifi_flags = ifi_flags;
722 dev->cache_valid &= mask;
723 if (!(mask & VALID_IN)) {
724 netdev_get_addrs_list_flush();
729 netdev_linux_update(struct netdev_linux *dev,
730 const struct rtnetlink_change *change)
731 OVS_REQUIRES(dev->mutex)
733 if (rtnetlink_type_is_rtnlgrp_link(change->nlmsg_type)){
734 if (change->nlmsg_type == RTM_NEWLINK) {
735 /* Keep drv-info, and ip addresses. */
736 netdev_linux_changed(dev, change->ifi_flags,
737 VALID_DRVINFO | VALID_IN);
739 /* Update netdev from rtnl-change msg. */
741 dev->mtu = change->mtu;
742 dev->cache_valid |= VALID_MTU;
743 dev->netdev_mtu_error = 0;
746 if (!eth_addr_is_zero(change->mac)) {
747 dev->etheraddr = change->mac;
748 dev->cache_valid |= VALID_ETHERADDR;
749 dev->ether_addr_error = 0;
752 dev->ifindex = change->if_index;
753 dev->cache_valid |= VALID_IFINDEX;
754 dev->get_ifindex_error = 0;
756 netdev_linux_changed(dev, change->ifi_flags, 0);
758 } else if (rtnetlink_type_is_rtnlgrp_addr(change->nlmsg_type)) {
759 /* Invalidates in4, in6. */
760 netdev_linux_changed(dev, dev->ifi_flags, ~VALID_IN);
766 static struct netdev *
767 netdev_linux_alloc(void)
769 struct netdev_linux *netdev = xzalloc(sizeof *netdev);
774 netdev_linux_common_construct(struct netdev_linux *netdev)
776 ovs_mutex_init(&netdev->mutex);
779 /* Creates system and internal devices. */
781 netdev_linux_construct(struct netdev *netdev_)
783 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
786 netdev_linux_common_construct(netdev);
788 error = get_flags(&netdev->up, &netdev->ifi_flags);
789 if (error == ENODEV) {
790 if (netdev->up.netdev_class != &netdev_internal_class) {
791 /* The device does not exist, so don't allow it to be opened. */
794 /* "Internal" netdevs have to be created as netdev objects before
795 * they exist in the kernel, because creating them in the kernel
796 * happens by passing a netdev object to dpif_port_add().
797 * Therefore, ignore the error. */
804 /* For most types of netdevs we open the device for each call of
805 * netdev_open(). However, this is not the case with tap devices,
806 * since it is only possible to open the device once. In this
807 * situation we share a single file descriptor, and consequently
808 * buffers, across all readers. Therefore once data is read it will
809 * be unavailable to other reads for tap devices. */
811 netdev_linux_construct_tap(struct netdev *netdev_)
813 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
814 static const char tap_dev[] = "/dev/net/tun";
815 const char *name = netdev_->name;
819 netdev_linux_common_construct(netdev);
821 /* Open tap device. */
822 netdev->tap_fd = open(tap_dev, O_RDWR);
823 if (netdev->tap_fd < 0) {
825 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
829 /* Create tap device. */
830 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
831 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
832 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
833 VLOG_WARN("%s: creating tap device failed: %s", name,
834 ovs_strerror(errno));
839 /* Make non-blocking. */
840 error = set_nonblocking(netdev->tap_fd);
848 close(netdev->tap_fd);
853 netdev_linux_destruct(struct netdev *netdev_)
855 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
857 if (netdev->tc && netdev->tc->ops->tc_destroy) {
858 netdev->tc->ops->tc_destroy(netdev->tc);
861 if (netdev_get_class(netdev_) == &netdev_tap_class
862 && netdev->tap_fd >= 0)
864 close(netdev->tap_fd);
867 if (netdev->miimon_interval > 0) {
868 atomic_count_dec(&miimon_cnt);
871 ovs_mutex_destroy(&netdev->mutex);
/* 'dealloc' hook: frees the memory allocated by netdev_linux_alloc(). */
static void
netdev_linux_dealloc(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    free(netdev);
}
881 static struct netdev_rxq *
882 netdev_linux_rxq_alloc(void)
884 struct netdev_rxq_linux *rx = xzalloc(sizeof *rx);
889 netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
891 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
892 struct netdev *netdev_ = rx->up.netdev;
893 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
896 ovs_mutex_lock(&netdev->mutex);
897 rx->is_tap = is_tap_netdev(netdev_);
899 rx->fd = netdev->tap_fd;
901 struct sockaddr_ll sll;
903 /* Result of tcpdump -dd inbound */
904 static const struct sock_filter filt[] = {
905 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
906 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
907 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
908 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
910 static const struct sock_fprog fprog = {
911 ARRAY_SIZE(filt), (struct sock_filter *) filt
914 /* Create file descriptor. */
915 rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
918 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
923 if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) {
925 VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
926 netdev_get_name(netdev_), ovs_strerror(error));
930 /* Set non-blocking mode. */
931 error = set_nonblocking(rx->fd);
936 /* Get ethernet device index. */
937 error = get_ifindex(&netdev->up, &ifindex);
942 /* Bind to specific ethernet device. */
943 memset(&sll, 0, sizeof sll);
944 sll.sll_family = AF_PACKET;
945 sll.sll_ifindex = ifindex;
946 sll.sll_protocol = htons(ETH_P_ALL);
947 if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
949 VLOG_ERR("%s: failed to bind raw socket (%s)",
950 netdev_get_name(netdev_), ovs_strerror(error));
954 /* Filter for only inbound packets. */
955 error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
959 VLOG_ERR("%s: failed to attach filter (%s)",
960 netdev_get_name(netdev_), ovs_strerror(error));
964 ovs_mutex_unlock(&netdev->mutex);
972 ovs_mutex_unlock(&netdev->mutex);
977 netdev_linux_rxq_destruct(struct netdev_rxq *rxq_)
979 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
/* 'rxq_dealloc' hook: frees the memory allocated by netdev_linux_rxq_alloc(). */
static void
netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_)
{
    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);

    free(rx);
}
995 auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux)
997 if (aux->tp_status & TP_STATUS_VLAN_TPID_VALID) {
998 return htons(aux->tp_vlan_tpid);
1000 return htons(ETH_TYPE_VLAN);
1005 auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
1007 return aux->tp_vlan_tci || aux->tp_status & TP_STATUS_VLAN_VALID;
1011 netdev_linux_rxq_recv_sock(int fd, struct dp_packet *buffer)
1016 struct cmsghdr *cmsg;
1018 struct cmsghdr cmsg;
1019 char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
1023 /* Reserve headroom for a single VLAN tag */
1024 dp_packet_reserve(buffer, VLAN_HEADER_LEN);
1025 size = dp_packet_tailroom(buffer);
1027 iov.iov_base = dp_packet_data(buffer);
1029 msgh.msg_name = NULL;
1030 msgh.msg_namelen = 0;
1031 msgh.msg_iov = &iov;
1032 msgh.msg_iovlen = 1;
1033 msgh.msg_control = &cmsg_buffer;
1034 msgh.msg_controllen = sizeof cmsg_buffer;
1038 retval = recvmsg(fd, &msgh, MSG_TRUNC);
1039 } while (retval < 0 && errno == EINTR);
1043 } else if (retval > size) {
1047 dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);
1049 for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg; cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
1050 const struct tpacket_auxdata *aux;
1052 if (cmsg->cmsg_level != SOL_PACKET
1053 || cmsg->cmsg_type != PACKET_AUXDATA
1054 || cmsg->cmsg_len < CMSG_LEN(sizeof(struct tpacket_auxdata))) {
1058 aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg));
1059 if (auxdata_has_vlan_tci(aux)) {
1060 if (retval < ETH_HEADER_LEN) {
1064 eth_push_vlan(buffer, auxdata_to_vlan_tpid(aux),
1065 htons(aux->tp_vlan_tci));
/* Receives one packet from tap fd 'fd' into 'buffer'.
 *
 * Returns 0 on success, otherwise a positive errno value from read(). */
static int
netdev_linux_rxq_recv_tap(int fd, struct dp_packet *buffer)
{
    ssize_t retval;
    size_t size = dp_packet_tailroom(buffer);

    do {
        retval = read(fd, dp_packet_data(buffer), size);
    } while (retval < 0 && errno == EINTR);

    if (retval < 0) {
        return errno;
    }

    dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);
    return 0;
}
1092 netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet **packets,
1095 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1096 struct netdev *netdev = rx->up.netdev;
1097 struct dp_packet *buffer;
1101 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
1102 mtu = ETH_PAYLOAD_MAX;
1105 buffer = dp_packet_new_with_headroom(VLAN_ETH_HEADER_LEN + mtu,
1106 DP_NETDEV_HEADROOM);
1107 retval = (rx->is_tap
1108 ? netdev_linux_rxq_recv_tap(rx->fd, buffer)
1109 : netdev_linux_rxq_recv_sock(rx->fd, buffer));
1112 if (retval != EAGAIN && retval != EMSGSIZE) {
1113 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
1114 netdev_rxq_get_name(rxq_), ovs_strerror(errno));
1116 dp_packet_delete(buffer);
1118 dp_packet_pad(buffer);
1119 dp_packet_rss_invalidate(buffer);
1120 packets[0] = buffer;
1128 netdev_linux_rxq_wait(struct netdev_rxq *rxq_)
1130 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1131 poll_fd_wait(rx->fd, POLLIN);
1135 netdev_linux_rxq_drain(struct netdev_rxq *rxq_)
1137 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1140 int error = af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_), &ifr,
1141 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
1145 drain_fd(rx->fd, ifr.ifr_qlen);
1148 return drain_rcvbuf(rx->fd);
1152 /* Sends 'buffer' on 'netdev'.  Returns 0 if successful, otherwise a positive
1153  * errno value.  Returns EAGAIN without blocking if the packet cannot be queued
1154  * immediately.  Returns EMSGSIZE if a partial packet was transmitted or if
1155  * the packet is too big or too small to transmit on the device.
1157  * The caller retains ownership of 'buffer' in all cases.
1159  * The kernel maintains a packet transmission queue, so the caller is not
1160  * expected to do additional queuing of packets. */
1162 netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED,
1163 struct dp_packet **pkts, int cnt, bool may_steal)
1168 /* 'i' is incremented only if there's no error */
1169 for (i = 0; i < cnt;) {
1170 const void *data = dp_packet_data(pkts[i]);
1171 size_t size = dp_packet_size(pkts[i]);
1174 if (!is_tap_netdev(netdev_)) {
1175 /* Use our AF_PACKET socket to send to this device. */
1176 struct sockaddr_ll sll;
1182 sock = af_packet_sock();
1187 ifindex = netdev_get_ifindex(netdev_);
1192 /* We don't bother setting most fields in sockaddr_ll because the
1193  * kernel ignores them for SOCK_RAW. */
1194 memset(&sll, 0, sizeof sll);
1195 sll.sll_family = AF_PACKET;
1196 sll.sll_ifindex = ifindex;
1198 iov.iov_base = CONST_CAST(void *, data);
1201 msg.msg_name = &sll;
1202 msg.msg_namelen = sizeof sll;
1205 msg.msg_control = NULL;
1206 msg.msg_controllen = 0;
1209 retval = sendmsg(sock, &msg, 0);
1211 /* Use the tap fd to send to this device.  This is essential for
1212  * tap devices, because packets sent to a tap device with an
1213  * AF_PACKET socket will loop back to be *received* again on the
1214  * tap device.  This doesn't occur on other interface types
1215  * because we attach a socket filter to the rx socket. */
1216 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1218 retval = write(netdev->tap_fd, data, size);
1222 /* The Linux AF_PACKET implementation never blocks waiting for room
1223  * for packets, instead returning ENOBUFS.  Translate this into
1224  * EAGAIN for the caller. */
1225 error = errno == ENOBUFS ? EAGAIN : errno;
1226 if (error == EINTR) {
1227 /* continue without incrementing 'i', i.e. retry this packet */
/* A short write counts as a failure: log how much actually went out.
 * NOTE(review): 'retval' looks like ssize_t printed with PRIuSIZE and
 * compared against the unsigned 'size' — confirm the declaration above
 * (not visible here) before relying on this comparison. */
1231 } else if (retval != size) {
1232 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE" bytes"
1233 " of %"PRIuSIZE") on %s", retval, size,
1234 netdev_get_name(netdev_));
1239 /* Process the next packet in the batch */
/* Caller passed ownership when 'may_steal' path is taken: free the batch. */
1244 for (i = 0; i < cnt; i++) {
1245 dp_packet_delete(pkts[i]);
/* EAGAIN is an expected backpressure condition; only log real errors. */
1249 if (error && error != EAGAIN) {
1250 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
1251 netdev_get_name(netdev_), ovs_strerror(error));
1258 /* Registers with the poll loop to wake up from the next call to poll_block()
1259  * when the packet transmission queue has sufficient room to transmit a packet
1260  * with netdev_send().
1262  * The kernel maintains a packet transmission queue, so the client is not
1263  * expected to do additional queuing of packets.  Thus, this function is
1264  * unlikely to ever be used.  It is included for completeness. */
1266 netdev_linux_send_wait(struct netdev *netdev, int qid OVS_UNUSED)
1268 if (is_tap_netdev(netdev)) {
1269 /* TAP device always accepts packets. */
1270 poll_immediate_wake();
1274 /* Attempts to set 'netdev''s MAC address to 'mac'.  Returns 0 if successful,
1275  * otherwise a positive errno value. */
1277 netdev_linux_set_etheraddr(struct netdev *netdev_, const struct eth_addr mac)
1279 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1280 enum netdev_flags old_flags = 0;
1283 ovs_mutex_lock(&netdev->mutex);
/* Short-circuit when the cache already shows this address (or a sticky
 * error); otherwise invalidate the cache before attempting the change. */
1285 if (netdev->cache_valid & VALID_ETHERADDR) {
1286 error = netdev->ether_addr_error;
1287 if (error || eth_addr_equals(netdev->etheraddr, mac)) {
1290 netdev->cache_valid &= ~VALID_ETHERADDR;
1293 /* Tap devices must be brought down before setting the address. */
1294 if (is_tap_netdev(netdev_)) {
1295 update_flags(netdev, NETDEV_UP, 0, &old_flags);
1297 error = set_etheraddr(netdev_get_name(netdev_), mac);
/* Cache the result even on ENODEV so repeated calls on a vanished device
 * don't keep issuing ioctls. */
1298 if (!error || error == ENODEV) {
1299 netdev->ether_addr_error = error;
1300 netdev->cache_valid |= VALID_ETHERADDR;
1302 netdev->etheraddr = mac;
/* Restore the interface's UP flag if we brought the tap down above. */
1306 if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
1307 update_flags(netdev, 0, NETDEV_UP, &old_flags);
1311 ovs_mutex_unlock(&netdev->mutex);
1315 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1317 netdev_linux_get_etheraddr(const struct netdev *netdev_, struct eth_addr *mac)
1319 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1322 ovs_mutex_lock(&netdev->mutex);
/* Populate the cached address (and its error status) on first use. */
1323 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1324 netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
1325 &netdev->etheraddr);
1326 netdev->cache_valid |= VALID_ETHERADDR;
1329 error = netdev->ether_addr_error;
1331 *mac = netdev->etheraddr;
1333 ovs_mutex_unlock(&netdev->mutex);
/* Internal MTU lookup: fills the per-device MTU cache via SIOCGIFMTU on
 * first use, then serves '*mtup' from the cache.  Caller holds
 * netdev->mutex (enforced by callers visible below). */
1339 netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
1343 if (!(netdev->cache_valid & VALID_MTU)) {
1346 netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
1347 netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
1348 netdev->mtu = ifr.ifr_mtu;
1349 netdev->cache_valid |= VALID_MTU;
1352 error = netdev->netdev_mtu_error;
1354 *mtup = netdev->mtu;
1360 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1361  * in bytes, not including the hardware header; thus, this is typically 1500
1362  * bytes for Ethernet devices. */
1364 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1366 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* Thin locking wrapper around netdev_linux_get_mtu__(). */
1369 ovs_mutex_lock(&netdev->mutex);
1370 error = netdev_linux_get_mtu__(netdev, mtup);
1371 ovs_mutex_unlock(&netdev->mutex);
1376 /* Sets the maximum size of transmitted (MTU) for given device using linux
1377  * networking ioctl interface (SIOCSIFMTU).  Returns 0 on success, otherwise
1378  * a positive errno value. */
1380 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1382 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1386 ovs_mutex_lock(&netdev->mutex);
/* Skip the ioctl when the cached MTU already matches (or a sticky error
 * was recorded); invalidate the cache before changing it. */
1387 if (netdev->cache_valid & VALID_MTU) {
1388 error = netdev->netdev_mtu_error;
1389 if (error || netdev->mtu == mtu) {
1392 netdev->cache_valid &= ~VALID_MTU;
1395 error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
1396 SIOCSIFMTU, "SIOCSIFMTU");
/* Cache the outcome even on ENODEV, mirroring set_etheraddr(). */
1397 if (!error || error == ENODEV) {
1398 netdev->netdev_mtu_error = error;
1399 netdev->mtu = ifr.ifr_mtu;
1400 netdev->cache_valid |= VALID_MTU;
1403 ovs_mutex_unlock(&netdev->mutex);
1407 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1408  * On failure, returns a negative errno value. */
1410 netdev_linux_get_ifindex(const struct netdev *netdev_)
1412 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1415 ovs_mutex_lock(&netdev->mutex);
1416 error = get_ifindex(netdev_, &ifindex);
1417 ovs_mutex_unlock(&netdev->mutex);
/* Encode errors as negative so one int carries both outcomes. */
1419 return error ? -error : ifindex;
/* Reports link state in '*carrier': from the MII-monitor result when
 * miimon polling is enabled, otherwise from the IFF_RUNNING flag. */
1423 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1425 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1427 ovs_mutex_lock(&netdev->mutex);
1428 if (netdev->miimon_interval > 0) {
1429 *carrier = netdev->miimon;
1431 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1433 ovs_mutex_unlock(&netdev->mutex);
/* Returns the count of carrier transitions recorded for 'netdev_'. */
1438 static long long int
1439 netdev_linux_get_carrier_resets(const struct netdev *netdev_)
1441 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1442 long long int carrier_resets;
1444 ovs_mutex_lock(&netdev->mutex);
1445 carrier_resets = netdev->carrier_resets;
1446 ovs_mutex_unlock(&netdev->mutex);
1448 return carrier_resets;
/* Issues MII ioctl 'cmd' ("SIOCGMIIPHY"/"SIOCGMIIREG"-style) on device
 * 'name', marshalling 'data' through ifr.ifr_data in both directions. */
1452 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1453 struct mii_ioctl_data *data)
1458 memset(&ifr, 0, sizeof ifr);
1459 memcpy(&ifr.ifr_data, data, sizeof *data);
1460 error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
/* Copy results back out regardless of error; caller checks 'error'. */
1461 memcpy(data, &ifr.ifr_data, sizeof *data);
/* Queries link state for device 'name' into '*miimon'.  Tries MII
 * (SIOCGMIIPHY + SIOCGMIIREG reading the BMSR link-status bit) first;
 * if MII is unsupported, falls back to ethtool ETHTOOL_GLINK. */
1467 netdev_linux_get_miimon(const char *name, bool *miimon)
1469 struct mii_ioctl_data data;
1474 memset(&data, 0, sizeof data);
1475 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1477 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1478 data.reg_num = MII_BMSR;
1479 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
/* BMSR_LSTATUS set means the PHY reports link up. */
1483 *miimon = !!(data.val_out & BMSR_LSTATUS);
1485 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1488 struct ethtool_cmd ecmd;
1490 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1493 COVERAGE_INC(netdev_get_ethtool);
1494 memset(&ecmd, 0, sizeof ecmd);
1495 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
/* ETHTOOL_GLINK replies with a struct ethtool_value overlaid on ecmd. */
1498 struct ethtool_value eval;
1500 memcpy(&eval, &ecmd, sizeof eval);
1501 *miimon = !!eval.data;
1503 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
/* Enables or disables MII-based link monitoring for 'netdev_'.  A positive
 * 'interval' (clamped to at least 100 ms) turns polling on; 0 or negative
 * turns it off.  Maintains the global miimon device count. */
1511 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1512 long long int interval)
1514 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1516 ovs_mutex_lock(&netdev->mutex);
1517 interval = interval > 0 ? MAX(interval, 100) : 0;
1518 if (netdev->miimon_interval != interval) {
/* Track the number of devices with miimon enabled for the run/wait
 * loops elsewhere in this file. */
1519 if (interval && !netdev->miimon_interval) {
1520 atomic_count_inc(&miimon_cnt);
1521 } else if (!interval && netdev->miimon_interval) {
1522 atomic_count_dec(&miimon_cnt);
1525 netdev->miimon_interval = interval;
/* Force an immediate poll on the next netdev_linux_miimon_run(). */
1526 timer_set_expired(&netdev->miimon_timer);
1528 ovs_mutex_unlock(&netdev->mutex);
/* Periodic worker: polls MII link state on every netdev-linux device whose
 * miimon timer has expired, and signals a change notification when the
 * observed state differs from the cached one. */
1534 netdev_linux_miimon_run(void)
1536 struct shash device_shash;
1537 struct shash_node *node;
1539 shash_init(&device_shash);
1540 netdev_get_devices(&netdev_linux_class, &device_shash);
1541 SHASH_FOR_EACH (node, &device_shash) {
1542 struct netdev *netdev = node->data;
1543 struct netdev_linux *dev = netdev_linux_cast(netdev);
1546 ovs_mutex_lock(&dev->mutex);
1547 if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
1548 netdev_linux_get_miimon(dev->up.name, &miimon);
1549 if (miimon != dev->miimon) {
1550 dev->miimon = miimon;
1551 netdev_linux_changed(dev, dev->ifi_flags, 0);
/* Re-arm the timer for the next polling interval. */
1554 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1556 ovs_mutex_unlock(&dev->mutex);
/* netdev_get_devices() took a reference on each device; drop it. */
1557 netdev_close(netdev);
1560 shash_destroy(&device_shash);
/* Registers a poll-loop wakeup for the earliest pending miimon timer
 * across all netdev-linux devices with monitoring enabled. */
1564 netdev_linux_miimon_wait(void)
1566 struct shash device_shash;
1567 struct shash_node *node;
1569 shash_init(&device_shash);
1570 netdev_get_devices(&netdev_linux_class, &device_shash);
1571 SHASH_FOR_EACH (node, &device_shash) {
1572 struct netdev *netdev = node->data;
1573 struct netdev_linux *dev = netdev_linux_cast(netdev);
1575 ovs_mutex_lock(&dev->mutex);
1576 if (dev->miimon_interval > 0) {
1577 timer_wait(&dev->miimon_timer);
1579 ovs_mutex_unlock(&dev->mutex);
/* Drop the reference taken by netdev_get_devices(). */
1580 netdev_close(netdev);
1582 shash_destroy(&device_shash);
1586 swap_uint64(uint64_t *a, uint64_t *b)
1593 /* Copies 'src' into 'dst', performing format conversion in the process.
1595  * 'src' is allowed to be misaligned (hence get_32aligned_u64). */
1597 netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1598 const struct ovs_vport_stats *src)
1600 dst->rx_packets = get_32aligned_u64(&src->rx_packets);
1601 dst->tx_packets = get_32aligned_u64(&src->tx_packets);
1602 dst->rx_bytes = get_32aligned_u64(&src->rx_bytes);
1603 dst->tx_bytes = get_32aligned_u64(&src->tx_bytes);
1604 dst->rx_errors = get_32aligned_u64(&src->rx_errors);
1605 dst->tx_errors = get_32aligned_u64(&src->tx_errors);
1606 dst->rx_dropped = get_32aligned_u64(&src->rx_dropped);
1607 dst->tx_dropped = get_32aligned_u64(&src->tx_dropped);
/* ovs_vport_stats has no counterpart for the remaining counters, so zero
 * them rather than leave them uninitialized. */
1609 dst->collisions = 0;
1610 dst->rx_length_errors = 0;
1611 dst->rx_over_errors = 0;
1612 dst->rx_crc_errors = 0;
1613 dst->rx_frame_errors = 0;
1614 dst->rx_fifo_errors = 0;
1615 dst->rx_missed_errors = 0;
1616 dst->tx_aborted_errors = 0;
1617 dst->tx_carrier_errors = 0;
1618 dst->tx_fifo_errors = 0;
1619 dst->tx_heartbeat_errors = 0;
1620 dst->tx_window_errors = 0;
/* Fetches datapath vport stats for 'netdev' over netlink and converts them
 * into '*stats'.  A reply without a stats attribute is treated as failure. */
1624 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1626 struct dpif_netlink_vport reply;
1630 error = dpif_netlink_vport_get(netdev_get_name(netdev), &reply, &buf);
1633 } else if (!reply.stats) {
1638 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
/* Attempts to refresh '*stats' from the datapath vport layer, caching the
 * resulting error in netdev->vport_stats_error so repeated failures on a
 * non-vport device are not retried every call. */
1646 get_stats_via_vport(const struct netdev *netdev_,
1647 struct netdev_stats *stats)
1649 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* Retry if the last attempt succeeded or if we have never tried. */
1651 if (!netdev->vport_stats_error ||
1652 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1655 error = get_stats_via_vport__(netdev_, stats);
/* ENOENT/ENODEV merely mean "not a datapath vport"; don't warn. */
1656 if (error && error != ENOENT && error != ENODEV) {
1657 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1659 netdev_get_name(netdev_), ovs_strerror(error));
1661 netdev->vport_stats_error = error;
1662 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
1666 /* Retrieves current device stats for 'netdev-linux': merges datapath vport
1667  * counters with kernel netlink counters (see inline comments for why). */
1668 netdev_linux_get_stats(const struct netdev *netdev_,
1669 struct netdev_stats *stats)
1671 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1672 struct netdev_stats dev_stats;
1675 ovs_mutex_lock(&netdev->mutex);
1676 get_stats_via_vport(netdev_, stats);
1677 error = get_stats_via_netlink(netdev_, &dev_stats);
1679 if (!netdev->vport_stats_error) {
1682 } else if (netdev->vport_stats_error) {
1683 /* Stats are not available from OVS, so use kernel netdev stats. */
1686 /* Use kernel netdev's packet and byte counts since vport's counters
1687  * do not reflect packet counts on the wire when GSO, TSO or GRO are
1688  * enabled. */
1689 stats->rx_packets = dev_stats.rx_packets;
1690 stats->rx_bytes = dev_stats.rx_bytes;
1691 stats->tx_packets = dev_stats.tx_packets;
1692 stats->tx_bytes = dev_stats.tx_bytes;
/* Error/drop counters are accumulated from both sources. */
1694 stats->rx_errors += dev_stats.rx_errors;
1695 stats->tx_errors += dev_stats.tx_errors;
1696 stats->rx_dropped += dev_stats.rx_dropped;
1697 stats->tx_dropped += dev_stats.tx_dropped;
1698 stats->multicast += dev_stats.multicast;
1699 stats->collisions += dev_stats.collisions;
1700 stats->rx_length_errors += dev_stats.rx_length_errors;
1701 stats->rx_over_errors += dev_stats.rx_over_errors;
1702 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1703 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1704 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1705 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1706 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1707 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1708 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1709 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1710 stats->tx_window_errors += dev_stats.tx_window_errors;
1712 ovs_mutex_unlock(&netdev->mutex);
1717 /* Retrieves current device stats for 'netdev-tap' netdev or
1718  * netdev-internal. */
1720 netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
1722 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1723 struct netdev_stats dev_stats;
1726 ovs_mutex_lock(&netdev->mutex);
1727 get_stats_via_vport(netdev_, stats);
1728 error = get_stats_via_netlink(netdev_, &dev_stats);
1730 if (!netdev->vport_stats_error) {
1733 } else if (netdev->vport_stats_error) {
1734 /* Transmit and receive stats will appear to be swapped relative to the
1735  * other ports since we are the one sending the data, not a remote
1736  * computer.  For consistency, we swap them back here.  This does not
1737  * apply if we are getting stats from the vport layer because it always
1738  * tracks stats from the perspective of the switch. */
1741 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1742 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1743 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1744 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
/* Counters with no meaningful tap-side equivalent are cleared. */
1745 stats->rx_length_errors = 0;
1746 stats->rx_over_errors = 0;
1747 stats->rx_crc_errors = 0;
1748 stats->rx_frame_errors = 0;
1749 stats->rx_fifo_errors = 0;
1750 stats->rx_missed_errors = 0;
1751 stats->tx_aborted_errors = 0;
1752 stats->tx_carrier_errors = 0;
1753 stats->tx_fifo_errors = 0;
1754 stats->tx_heartbeat_errors = 0;
1755 stats->tx_window_errors = 0;
1757 /* Use kernel netdev's packet and byte counts since vport counters
1758  * do not reflect packet counts on the wire when GSO, TSO or GRO
1759  * are enabled.  Note rx/tx are crossed here, matching the swap above. */
1760 stats->rx_packets = dev_stats.tx_packets;
1761 stats->rx_bytes = dev_stats.tx_bytes;
1762 stats->tx_packets = dev_stats.rx_packets;
1763 stats->tx_bytes = dev_stats.rx_bytes;
1765 stats->rx_dropped += dev_stats.tx_dropped;
1766 stats->tx_dropped += dev_stats.rx_dropped;
1768 stats->rx_errors += dev_stats.tx_errors;
1769 stats->tx_errors += dev_stats.rx_errors;
1771 stats->multicast += dev_stats.multicast;
1772 stats->collisions += dev_stats.collisions;
1774 ovs_mutex_unlock(&netdev->mutex);
/* Stats for internal devices come solely from the datapath vport layer;
 * returns the cached vport-stats error status. */
1780 netdev_internal_get_stats(const struct netdev *netdev_,
1781 struct netdev_stats *stats)
1783 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1786 ovs_mutex_lock(&netdev->mutex);
1787 get_stats_via_vport(netdev_, stats);
1788 error = netdev->vport_stats_error;
1789 ovs_mutex_unlock(&netdev->mutex);
/* Populates netdev->supported / ->advertised / ->current from an ethtool
 * ETHTOOL_GSET query, translating SUPPORTED_* / ADVERTISED_* link-mode
 * bits into NETDEV_F_* feature flags.  Results (and any error) are cached
 * under VALID_FEATURES so the ioctl runs only once per invalidation. */
1795 netdev_linux_read_features(struct netdev_linux *netdev)
1797 struct ethtool_cmd ecmd;
1801 if (netdev->cache_valid & VALID_FEATURES) {
1805 COVERAGE_INC(netdev_get_ethtool);
1806 memset(&ecmd, 0, sizeof ecmd);
1807 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
1808 ETHTOOL_GSET, "ETHTOOL_GSET");
1813 /* Supported features. */
1814 netdev->supported = 0;
1815 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1816 netdev->supported |= NETDEV_F_10MB_HD;
1818 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1819 netdev->supported |= NETDEV_F_10MB_FD;
1821 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1822 netdev->supported |= NETDEV_F_100MB_HD;
1824 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1825 netdev->supported |= NETDEV_F_100MB_FD;
1827 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1828 netdev->supported |= NETDEV_F_1GB_HD;
1830 if ((ecmd.supported & SUPPORTED_1000baseT_Full) ||
1831 (ecmd.supported & SUPPORTED_1000baseKX_Full)) {
1832 netdev->supported |= NETDEV_F_1GB_FD;
1834 if ((ecmd.supported & SUPPORTED_10000baseT_Full) ||
1835 (ecmd.supported & SUPPORTED_10000baseKX4_Full) ||
1836 (ecmd.supported & SUPPORTED_10000baseKR_Full) ||
1837 (ecmd.supported & SUPPORTED_10000baseR_FEC)) {
1838 netdev->supported |= NETDEV_F_10GB_FD;
1840 if ((ecmd.supported & SUPPORTED_40000baseKR4_Full) ||
1841 (ecmd.supported & SUPPORTED_40000baseCR4_Full) ||
1842 (ecmd.supported & SUPPORTED_40000baseSR4_Full) ||
1843 (ecmd.supported & SUPPORTED_40000baseLR4_Full)) {
1844 netdev->supported |= NETDEV_F_40GB_FD;
1846 if (ecmd.supported & SUPPORTED_TP) {
1847 netdev->supported |= NETDEV_F_COPPER;
1849 if (ecmd.supported & SUPPORTED_FIBRE) {
1850 netdev->supported |= NETDEV_F_FIBER;
1852 if (ecmd.supported & SUPPORTED_Autoneg) {
1853 netdev->supported |= NETDEV_F_AUTONEG;
1855 if (ecmd.supported & SUPPORTED_Pause) {
1856 netdev->supported |= NETDEV_F_PAUSE;
1858 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1859 netdev->supported |= NETDEV_F_PAUSE_ASYM;
1862 /* Advertised features (mirror of the SUPPORTED_* mapping above). */
1863 netdev->advertised = 0;
1864 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1865 netdev->advertised |= NETDEV_F_10MB_HD;
1867 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1868 netdev->advertised |= NETDEV_F_10MB_FD;
1870 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1871 netdev->advertised |= NETDEV_F_100MB_HD;
1873 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1874 netdev->advertised |= NETDEV_F_100MB_FD;
1876 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1877 netdev->advertised |= NETDEV_F_1GB_HD;
1879 if ((ecmd.advertising & ADVERTISED_1000baseT_Full) ||
1880 (ecmd.advertising & ADVERTISED_1000baseKX_Full)) {
1881 netdev->advertised |= NETDEV_F_1GB_FD;
1883 if ((ecmd.advertising & ADVERTISED_10000baseT_Full) ||
1884 (ecmd.advertising & ADVERTISED_10000baseKX4_Full) ||
1885 (ecmd.advertising & ADVERTISED_10000baseKR_Full) ||
1886 (ecmd.advertising & ADVERTISED_10000baseR_FEC)) {
1887 netdev->advertised |= NETDEV_F_10GB_FD;
1889 if ((ecmd.advertising & ADVERTISED_40000baseKR4_Full) ||
1890 (ecmd.advertising & ADVERTISED_40000baseCR4_Full) ||
1891 (ecmd.advertising & ADVERTISED_40000baseSR4_Full) ||
1892 (ecmd.advertising & ADVERTISED_40000baseLR4_Full)) {
1893 netdev->advertised |= NETDEV_F_40GB_FD;
1895 if (ecmd.advertising & ADVERTISED_TP) {
1896 netdev->advertised |= NETDEV_F_COPPER;
1898 if (ecmd.advertising & ADVERTISED_FIBRE) {
1899 netdev->advertised |= NETDEV_F_FIBER;
1901 if (ecmd.advertising & ADVERTISED_Autoneg) {
1902 netdev->advertised |= NETDEV_F_AUTONEG;
1904 if (ecmd.advertising & ADVERTISED_Pause) {
1905 netdev->advertised |= NETDEV_F_PAUSE;
1907 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1908 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
1911 /* Current settings. */
1912 speed = ethtool_cmd_speed(&ecmd);
1913 if (speed == SPEED_10) {
1914 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1915 } else if (speed == SPEED_100) {
1916 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1917 } else if (speed == SPEED_1000) {
1918 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1919 } else if (speed == SPEED_10000) {
1920 netdev->current = NETDEV_F_10GB_FD;
/* NOTE(review): raw Mbps literals below presumably predate the
 * SPEED_40000/SPEED_100000 macros in older kernel headers -- confirm
 * header availability before replacing them with named constants. */
1921 } else if (speed == 40000) {
1922 netdev->current = NETDEV_F_40GB_FD;
1923 } else if (speed == 100000) {
1924 netdev->current = NETDEV_F_100GB_FD;
1925 } else if (speed == 1000000) {
1926 netdev->current = NETDEV_F_1TB_FD;
1928 netdev->current = 0;
1931 if (ecmd.port == PORT_TP) {
1932 netdev->current |= NETDEV_F_COPPER;
1933 } else if (ecmd.port == PORT_FIBRE) {
1934 netdev->current |= NETDEV_F_FIBER;
1938 netdev->current |= NETDEV_F_AUTONEG;
/* Cache outcome, including the error, so callers can report it later. */
1942 netdev->cache_valid |= VALID_FEATURES;
1943 netdev->get_features_error = error;
1946 /* Stores the features supported by 'netdev' into of '*current', '*advertised',
1947  * '*supported', and '*peer'.  Each value is a bitmap of NETDEV_* bits.
1948  * Returns 0 if successful, otherwise a positive errno value. */
1950 netdev_linux_get_features(const struct netdev *netdev_,
1951 enum netdev_features *current,
1952 enum netdev_features *advertised,
1953 enum netdev_features *supported,
1954 enum netdev_features *peer)
1956 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1959 ovs_mutex_lock(&netdev->mutex);
/* Refresh (or reuse) the cached ethtool feature data. */
1960 netdev_linux_read_features(netdev);
1961 if (!netdev->get_features_error) {
1962 *current = netdev->current;
1963 *advertised = netdev->advertised;
1964 *supported = netdev->supported;
1965 *peer = 0; /* XXX peer features are not obtainable via ethtool here. */
1967 error = netdev->get_features_error;
1968 ovs_mutex_unlock(&netdev->mutex);
1973 /* Set the features advertised by 'netdev' to 'advertise': reads the current
1974  * ethtool settings (ETHTOOL_GSET), rewrites the advertising mask from the
1975  * NETDEV_F_* bits, then writes it back with ETHTOOL_SSET. */
1975 netdev_linux_set_advertisements(struct netdev *netdev_,
1976 enum netdev_features advertise)
1978 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1979 struct ethtool_cmd ecmd;
1982 ovs_mutex_lock(&netdev->mutex);
1984 COVERAGE_INC(netdev_get_ethtool);
1985 memset(&ecmd, 0, sizeof ecmd);
1986 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1987 ETHTOOL_GSET, "ETHTOOL_GSET");
/* Rebuild the advertising mask from scratch. */
1992 ecmd.advertising = 0;
1993 if (advertise & NETDEV_F_10MB_HD) {
1994 ecmd.advertising |= ADVERTISED_10baseT_Half;
1996 if (advertise & NETDEV_F_10MB_FD) {
1997 ecmd.advertising |= ADVERTISED_10baseT_Full;
1999 if (advertise & NETDEV_F_100MB_HD) {
2000 ecmd.advertising |= ADVERTISED_100baseT_Half;
2002 if (advertise & NETDEV_F_100MB_FD) {
2003 ecmd.advertising |= ADVERTISED_100baseT_Full;
2005 if (advertise & NETDEV_F_1GB_HD) {
2006 ecmd.advertising |= ADVERTISED_1000baseT_Half;
2008 if (advertise & NETDEV_F_1GB_FD) {
2009 ecmd.advertising |= ADVERTISED_1000baseT_Full;
2011 if (advertise & NETDEV_F_10GB_FD) {
2012 ecmd.advertising |= ADVERTISED_10000baseT_Full;
2014 if (advertise & NETDEV_F_COPPER) {
2015 ecmd.advertising |= ADVERTISED_TP;
2017 if (advertise & NETDEV_F_FIBER) {
2018 ecmd.advertising |= ADVERTISED_FIBRE;
2020 if (advertise & NETDEV_F_AUTONEG) {
2021 ecmd.advertising |= ADVERTISED_Autoneg;
2023 if (advertise & NETDEV_F_PAUSE) {
2024 ecmd.advertising |= ADVERTISED_Pause;
2026 if (advertise & NETDEV_F_PAUSE_ASYM) {
2027 ecmd.advertising |= ADVERTISED_Asym_Pause;
2029 COVERAGE_INC(netdev_set_ethtool);
2030 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
2031 ETHTOOL_SSET, "ETHTOOL_SSET");
2034 ovs_mutex_unlock(&netdev->mutex);
2038 /* Attempts to set input rate limiting (policing) policy.  Returns 0 if
2039  * successful, otherwise a positive errno value. */
2041 netdev_linux_set_policing(struct netdev *netdev_,
2042 uint32_t kbits_rate, uint32_t kbits_burst)
2044 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2045 const char *netdev_name = netdev_get_name(netdev_);
2048 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
2049 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
2050 : kbits_burst); /* Stick with user-specified value. */
2052 ovs_mutex_lock(&netdev->mutex);
2053 if (netdev->cache_valid & VALID_POLICING) {
2054 error = netdev->netdev_policing_error;
2055 if (error || (netdev->kbits_rate == kbits_rate &&
2056 netdev->kbits_burst == kbits_burst)) {
2057 /* Assume that settings haven't changed since we last set them. */
2060 netdev->cache_valid &= ~VALID_POLICING;
2063 COVERAGE_INC(netdev_set_policing);
2064 /* Remove any existing ingress qdisc. */
2065 error = tc_add_del_ingress_qdisc(netdev_, false);
2067 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
2068 netdev_name, ovs_strerror(error));
/* Re-add the ingress qdisc and attach a policer to it. */
2073 error = tc_add_del_ingress_qdisc(netdev_, true);
2075 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
2076 netdev_name, ovs_strerror(error));
2080 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
2082 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
2083 netdev_name, ovs_strerror(error));
2088 netdev->kbits_rate = kbits_rate;
2089 netdev->kbits_burst = kbits_burst;
/* Cache the outcome even on ENODEV, matching the other setters here. */
2092 if (!error || error == ENODEV) {
2093 netdev->netdev_policing_error = error;
2094 netdev->cache_valid |= VALID_POLICING;
2096 ovs_mutex_unlock(&netdev->mutex);
/* Adds the OVS name of every installable QoS type in the 'tcs' table to
 * 'types' (empty ovs_name entries are internal-only and skipped). */
2101 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
2104 const struct tc_ops *const *opsp;
2106 for (opsp = tcs; *opsp != NULL; opsp++) {
2107 const struct tc_ops *ops = *opsp;
2108 if (ops->tc_install && ops->ovs_name[0] != '\0') {
2109 sset_add(types, ops->ovs_name);
/* Looks up a tc_ops entry in 'tcs' by its OVS-facing QoS type name. */
2115 static const struct tc_ops *
2116 tc_lookup_ovs_name(const char *name)
2118 const struct tc_ops *const *opsp;
2120 for (opsp = tcs; *opsp != NULL; opsp++) {
2121 const struct tc_ops *ops = *opsp;
2122 if (!strcmp(name, ops->ovs_name)) {
/* Looks up a tc_ops entry in 'tcs' by its Linux qdisc name; entries with a
 * NULL linux_name have no kernel counterpart and are skipped. */
2129 static const struct tc_ops *
2130 tc_lookup_linux_name(const char *name)
2132 const struct tc_ops *const *opsp;
2134 for (opsp = tcs; *opsp != NULL; opsp++) {
2135 const struct tc_ops *ops = *opsp;
2136 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Finds the queue with 'queue_id' in netdev's tc queue hash; 'hash' must be
 * the precomputed hash of 'queue_id' (see tc_find_queue()). */
2143 static struct tc_queue *
2144 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
2147 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2148 struct tc_queue *queue;
2150 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
2151 if (queue->queue_id == queue_id) {
/* Convenience wrapper: hashes 'queue_id' and delegates to tc_find_queue__(). */
2158 static struct tc_queue *
2159 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
2161 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Reports the queue capacity of QoS type 'type' in '*caps'. */
2165 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
2167 struct netdev_qos_capabilities *caps)
2169 const struct tc_ops *ops = tc_lookup_ovs_name(type);
2173 caps->n_queues = ops->n_queues;
/* Returns the installed QoS type name in '*typep' and, when the type has a
 * qdisc_get hook, its configuration in 'details'. */
2178 netdev_linux_get_qos(const struct netdev *netdev_,
2179 const char **typep, struct smap *details)
2181 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2184 ovs_mutex_lock(&netdev->mutex);
/* Make sure netdev->tc reflects the kernel's current qdisc. */
2185 error = tc_query_qdisc(netdev_);
2187 *typep = netdev->tc->ops->ovs_name;
2188 error = (netdev->tc->ops->qdisc_get
2189 ? netdev->tc->ops->qdisc_get(netdev_, details)
2192 ovs_mutex_unlock(&netdev->mutex);
/* Installs QoS discipline 'type' with 'details' on 'netdev_'.  When the
 * requested type is already installed, only reconfigures it; otherwise
 * deletes the current qdisc and installs the new one. */
2198 netdev_linux_set_qos(struct netdev *netdev_,
2199 const char *type, const struct smap *details)
2201 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2202 const struct tc_ops *new_ops;
2205 new_ops = tc_lookup_ovs_name(type);
2206 if (!new_ops || !new_ops->tc_install) {
2210 ovs_mutex_lock(&netdev->mutex);
2211 error = tc_query_qdisc(netdev_);
2216 if (new_ops == netdev->tc->ops) {
2217 error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
2219 /* Delete existing qdisc. */
2220 error = tc_del_qdisc(netdev_);
2224 ovs_assert(netdev->tc == NULL);
2226 /* Install new qdisc. */
2227 error = new_ops->tc_install(netdev_, details);
2228 ovs_assert((error == 0) == (netdev->tc != NULL));
2232 ovs_mutex_unlock(&netdev->mutex);
/* Retrieves configuration of queue 'queue_id' into 'details' via the
 * installed QoS type's class_get hook. */
2237 netdev_linux_get_queue(const struct netdev *netdev_,
2238 unsigned int queue_id, struct smap *details)
2240 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2243 ovs_mutex_lock(&netdev->mutex);
2244 error = tc_query_qdisc(netdev_);
2246 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2248 ? netdev->tc->ops->class_get(netdev_, queue, details)
2251 ovs_mutex_unlock(&netdev->mutex);
/* Configures queue 'queue_id' from 'details' via the installed QoS type's
 * class_set hook; 'queue_id' must be below the type's queue capacity. */
2257 netdev_linux_set_queue(struct netdev *netdev_,
2258 unsigned int queue_id, const struct smap *details)
2260 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2263 ovs_mutex_lock(&netdev->mutex);
2264 error = tc_query_qdisc(netdev_);
2266 error = (queue_id < netdev->tc->ops->n_queues
2267 && netdev->tc->ops->class_set
2268 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
2271 ovs_mutex_unlock(&netdev->mutex);
/* Deletes queue 'queue_id' via the installed QoS type's class_delete hook,
 * if the type supports deletion and the queue exists. */
2277 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
2279 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2282 ovs_mutex_lock(&netdev->mutex);
2283 error = tc_query_qdisc(netdev_);
2285 if (netdev->tc->ops->class_delete) {
2286 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2288 ? netdev->tc->ops->class_delete(netdev_, queue)
2294 ovs_mutex_unlock(&netdev->mutex);
/* Retrieves statistics for queue 'queue_id' into '*stats' via the installed
 * QoS type's class_get_stats hook; also reports the queue creation time. */
2300 netdev_linux_get_queue_stats(const struct netdev *netdev_,
2301 unsigned int queue_id,
2302 struct netdev_queue_stats *stats)
2304 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2307 ovs_mutex_lock(&netdev->mutex);
2308 error = tc_query_qdisc(netdev_);
2310 if (netdev->tc->ops->class_get_stats) {
2311 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2313 stats->created = queue->created;
2314 error = netdev->tc->ops->class_get_stats(netdev_, queue,
2323 ovs_mutex_unlock(&netdev->mutex);
/* State for iterating kernel traffic classes via an RTM_GETTCLASS netlink
 * dump (see start_queue_dump()/finish_queue_dump() below). */
2328 struct queue_dump_state {
2329 struct nl_dump dump;
/* Begins an RTM_GETTCLASS netlink dump of 'netdev''s traffic classes into
 * 'state'.  tcm_parent == 0 requests classes of the root qdisc. */
2334 start_queue_dump(const struct netdev *netdev, struct queue_dump_state *state)
2336 struct ofpbuf request;
2337 struct tcmsg *tcmsg;
2339 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
2343 tcmsg->tcm_parent = 0;
2344 nl_dump_start(&state->dump, NETLINK_ROUTE, &request);
2345 ofpbuf_uninit(&request);
2347 ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
/* Releases 'state' and returns the netlink dump's completion status. */
2352 finish_queue_dump(struct queue_dump_state *state)
2354 ofpbuf_uninit(&state->buf);
2355 return nl_dump_done(&state->dump);
/* Snapshot of queue ids taken under the device mutex; consumed by the
 * queue_dump_start/next/done callbacks below. */
2358 struct netdev_linux_queue_state {
2359 unsigned int *queues;
/* Begins a queue dump: snapshots every queue id of the installed QoS type
 * into a freshly allocated netdev_linux_queue_state stored in '*statep'. */
2365 netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
2367 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2370 ovs_mutex_lock(&netdev->mutex);
2371 error = tc_query_qdisc(netdev_);
2373 if (netdev->tc->ops->class_get) {
2374 struct netdev_linux_queue_state *state;
2375 struct tc_queue *queue;
2378 *statep = state = xmalloc(sizeof *state);
2379 state->n_queues = hmap_count(&netdev->tc->queues);
2380 state->cur_queue = 0;
2381 state->queues = xmalloc(state->n_queues * sizeof *state->queues);
/* Copy the ids now so iteration later needs no lock on the hmap. */
2384 HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
2385 state->queues[i++] = queue->queue_id;
2391 ovs_mutex_unlock(&netdev->mutex);
/* Advances the dump: returns the next snapshotted queue that still exists,
 * filling '*queue_idp' and 'details'.  Queues deleted since the snapshot
 * are silently skipped. */
2397 netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
2398 unsigned int *queue_idp, struct smap *details)
2400 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2401 struct netdev_linux_queue_state *state = state_;
2404 ovs_mutex_lock(&netdev->mutex);
2405 while (state->cur_queue < state->n_queues) {
2406 unsigned int queue_id = state->queues[state->cur_queue++];
2407 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2410 *queue_idp = queue_id;
2411 error = netdev->tc->ops->class_get(netdev_, queue, details);
2415 ovs_mutex_unlock(&netdev->mutex);
/* Frees the queue-id snapshot allocated by queue_dump_start(). */
2421 netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
2424 struct netdev_linux_queue_state *state = state_;
2426 free(state->queues);
/* Dumps per-queue statistics for 'netdev_' by walking an RTM_GETTCLASS
 * netlink dump and handing each message to the qdisc's class_dump_stats
 * callback, which invokes 'cb' with 'aux'. */
2432 netdev_linux_dump_queue_stats(const struct netdev *netdev_,
2433 netdev_dump_queue_stats_cb *cb, void *aux)
2435 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2438 ovs_mutex_lock(&netdev->mutex);
2439 error = tc_query_qdisc(netdev_);
2441 struct queue_dump_state state;
2443 if (!netdev->tc->ops->class_dump_stats) {
2445 } else if (!start_queue_dump(netdev_, &state)) {
2451 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
2452 retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
2459 retval = finish_queue_dump(&state);
2465 ovs_mutex_unlock(&netdev->mutex);
/* Assigns 'address'/'netmask' to 'netdev_' via SIOCSIFADDR/SIOCSIFNETMASK
 * ioctls.  The netmask is only set when the address is not INADDR_ANY. */
2471 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2472 struct in_addr netmask)
2474 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2477 ovs_mutex_lock(&netdev->mutex);
2478 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2480 if (address.s_addr != INADDR_ANY) {
2481 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2482 "SIOCSIFNETMASK", netmask);
2486 ovs_mutex_unlock(&netdev->mutex);
2491 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address.
2492 * Otherwise, sets '*in6' to 'in6addr_any' and returns the corresponding
/* NOTE(review): the comment above appears stale for this function — it
 * describes a single-IPv6 getter, whereas the function below retrieves all
 * addresses/masks for the device via netdev_get_addrs(). */
2495 netdev_linux_get_addr_list(const struct netdev *netdev_,
2496 struct in6_addr **addr, struct in6_addr **mask, int *n_cnt)
2498 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2501 ovs_mutex_lock(&netdev->mutex);
2502 error = netdev_get_addrs(netdev_get_name(netdev_), addr, mask, n_cnt);
2503 ovs_mutex_unlock(&netdev->mutex);
/* Writes an AF_INET sockaddr holding 'addr' into '*sa', zeroing the rest of
 * the structure first (required since sockaddr may be larger than
 * sockaddr_in). */
2509 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
2511 struct sockaddr_in sin;
2512 memset(&sin, 0, sizeof sin);
2513 sin.sin_family = AF_INET;
2514 sin.sin_addr = addr;
2517 memset(sa, 0, sizeof *sa);
2518 memcpy(sa, &sin, sizeof sin);
/* Issues the address-setting ioctl 'ioctl_nr' ('ioctl_name' is for error
 * reporting) on 'netdev' with 'addr' packed into an ifreq. */
2522 do_set_addr(struct netdev *netdev,
2523 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2527 make_in4_sockaddr(&ifr.ifr_addr, addr);
2528 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2532 /* Adds 'router' as a default IP gateway. */
/* Installs a default route (dst/genmask = INADDR_ANY) through 'router'
 * using the SIOCADDRT ioctl; logs a warning on failure. */
2534 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2536 struct in_addr any = { INADDR_ANY };
2540 memset(&rt, 0, sizeof rt);
2541 make_in4_sockaddr(&rt.rt_dst, any);
2542 make_in4_sockaddr(&rt.rt_gateway, router);
2543 make_in4_sockaddr(&rt.rt_genmask, any);
2544 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2545 error = af_inet_ioctl(SIOCADDRT, &rt);
2547 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
/* Looks up the route to '*host' by parsing /proc/net/route line by line.
 * On a match, stores the next hop in '*next_hop' (0 if directly reachable)
 * and the owning device name (xstrdup'd; caller frees) in '*netdev_name'.
 * NOTE(review): elided extract — the fclose() and final return paths are
 * not visible here. */
2553 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2556 static const char fn[] = "/proc/net/route";
2561 *netdev_name = NULL;
2562 stream = fopen(fn, "r");
2563 if (stream == NULL) {
2564 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
2569 while (fgets(line, sizeof line, stream)) {
2572 ovs_be32 dest, gateway, mask;
2573 int refcnt, metric, mtu;
2574 unsigned int flags, use, window, irtt;
2577 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2579 iface, &dest, &gateway, &flags, &refcnt,
2580 &use, &metric, &mask, &mtu, &window, &irtt)) {
2581 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2585 if (!(flags & RTF_UP)) {
2586 /* Skip routes that aren't up. */
2590 /* The output of 'dest', 'mask', and 'gateway' were given in
2591 * network byte order, so we don't need need any endian
2592 * conversions here. */
2593 if ((dest & mask) == (host->s_addr & mask)) {
2595 /* The host is directly reachable. */
2596 next_hop->s_addr = 0;
2598 /* To reach the host, we must go through a gateway. */
2599 next_hop->s_addr = gateway;
2601 *netdev_name = xstrdup(iface);
/* Fills 'smap' with driver name/version and firmware version from
 * ETHTOOL_GDRVINFO, caching the result under VALID_DRVINFO so the ioctl is
 * issued at most once per cache invalidation. */
2613 netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
2615 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2618 ovs_mutex_lock(&netdev->mutex);
2619 if (!(netdev->cache_valid & VALID_DRVINFO)) {
2620 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
2622 COVERAGE_INC(netdev_get_ethtool);
2623 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2624 error = netdev_linux_do_ethtool(netdev->up.name,
2627 "ETHTOOL_GDRVINFO");
2629 netdev->cache_valid |= VALID_DRVINFO;
2634 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2635 smap_add(smap, "driver_version", netdev->drvinfo.version);
2636 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
2638 ovs_mutex_unlock(&netdev->mutex);
/* Status callback for internal devices: reports a fixed driver name. */
2644 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2647 smap_add(smap, "driver_name", "openvswitch");
2651 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2652 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2653 * returns 0. Otherwise, it returns a positive errno value; in particular,
2654 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2656 netdev_linux_arp_lookup(const struct netdev *netdev,
2657 ovs_be32 ip, struct eth_addr *mac)
2660 struct sockaddr_in sin;
2663 memset(&r, 0, sizeof r);
2664 memset(&sin, 0, sizeof sin);
2665 sin.sin_family = AF_INET;
2666 sin.sin_addr.s_addr = ip;
2668 memcpy(&r.arp_pa, &sin, sizeof sin);
2669 r.arp_ha.sa_family = ARPHRD_ETHER;
2671 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2672 COVERAGE_INC(netdev_arp_lookup);
2673 retval = af_inet_ioctl(SIOCGARP, &r);
/* ENXIO ("no such entry") is an expected outcome, so only other errors are
 * logged. */
2675 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2676 } else if (retval != ENXIO) {
2677 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2678 netdev_get_name(netdev), IP_ARGS(ip),
2679 ovs_strerror(retval));
/* Translates netdev_flags bits into the corresponding kernel IFF_* bits. */
2685 nd_to_iff_flags(enum netdev_flags nd)
2688 if (nd & NETDEV_UP) {
2691 if (nd & NETDEV_PROMISC) {
2694 if (nd & NETDEV_LOOPBACK) {
2695 iff |= IFF_LOOPBACK;
/* Translates kernel IFF_* bits into the corresponding netdev_flags bits
 * (inverse of nd_to_iff_flags()). */
2701 iff_to_nd_flags(int iff)
2703 enum netdev_flags nd = 0;
2707 if (iff & IFF_PROMISC) {
2708 nd |= NETDEV_PROMISC;
2710 if (iff & IFF_LOOPBACK) {
2711 nd |= NETDEV_LOOPBACK;
/* Clears 'off' and sets 'on' in 'netdev''s interface flags, reporting the
 * previous flags in '*old_flagsp'.  Only touches the kernel when the
 * computed flags actually differ; re-reads the flags afterwards to keep the
 * cached ifi_flags in sync.  Caller holds netdev->mutex. */
2717 update_flags(struct netdev_linux *netdev, enum netdev_flags off,
2718 enum netdev_flags on, enum netdev_flags *old_flagsp)
2719 OVS_REQUIRES(netdev->mutex)
2721 int old_flags, new_flags;
2724 old_flags = netdev->ifi_flags;
2725 *old_flagsp = iff_to_nd_flags(old_flags);
2726 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2727 if (new_flags != old_flags) {
2728 error = set_flags(netdev_get_name(&netdev->up), new_flags);
2729 get_flags(&netdev->up, &netdev->ifi_flags);
/* Public update_flags entry point: takes the mutex and delegates to
 * update_flags(). */
2736 netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2737 enum netdev_flags on, enum netdev_flags *old_flagsp)
2739 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2742 ovs_mutex_lock(&netdev->mutex);
2743 error = update_flags(netdev, off, on, old_flagsp);
2744 ovs_mutex_unlock(&netdev->mutex);
/* Expands to a struct netdev_class initializer sharing the common Linux
 * callbacks, parameterized by the per-variant constructor, stats,
 * features, and status callbacks.  Used below for the "system", "tap", and
 * "internal" device classes. */
2749 #define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS, \
2750 GET_FEATURES, GET_STATUS) \
2753 false, /* is_pmd */ \
2757 netdev_linux_wait, \
2759 netdev_linux_alloc, \
2761 netdev_linux_destruct, \
2762 netdev_linux_dealloc, \
2763 NULL, /* get_config */ \
2764 NULL, /* set_config */ \
2765 NULL, /* get_tunnel_config */ \
2766 NULL, /* build header */ \
2767 NULL, /* push header */ \
2768 NULL, /* pop header */ \
2769 NULL, /* get_numa_id */ \
2770 NULL, /* set_multiq */ \
2772 netdev_linux_send, \
2773 netdev_linux_send_wait, \
2775 netdev_linux_set_etheraddr, \
2776 netdev_linux_get_etheraddr, \
2777 netdev_linux_get_mtu, \
2778 netdev_linux_set_mtu, \
2779 netdev_linux_get_ifindex, \
2780 netdev_linux_get_carrier, \
2781 netdev_linux_get_carrier_resets, \
2782 netdev_linux_set_miimon_interval, \
2786 netdev_linux_set_advertisements, \
2788 netdev_linux_set_policing, \
2789 netdev_linux_get_qos_types, \
2790 netdev_linux_get_qos_capabilities, \
2791 netdev_linux_get_qos, \
2792 netdev_linux_set_qos, \
2793 netdev_linux_get_queue, \
2794 netdev_linux_set_queue, \
2795 netdev_linux_delete_queue, \
2796 netdev_linux_get_queue_stats, \
2797 netdev_linux_queue_dump_start, \
2798 netdev_linux_queue_dump_next, \
2799 netdev_linux_queue_dump_done, \
2800 netdev_linux_dump_queue_stats, \
2802 netdev_linux_set_in4, \
2803 netdev_linux_get_addr_list, \
2804 netdev_linux_add_router, \
2805 netdev_linux_get_next_hop, \
2807 netdev_linux_arp_lookup, \
2809 netdev_linux_update_flags, \
2811 netdev_linux_rxq_alloc, \
2812 netdev_linux_rxq_construct, \
2813 netdev_linux_rxq_destruct, \
2814 netdev_linux_rxq_dealloc, \
2815 netdev_linux_rxq_recv, \
2816 netdev_linux_rxq_wait, \
2817 netdev_linux_rxq_drain, \
/* The three Linux netdev classes, instantiated from the common macro; they
 * differ only in constructor, stats, features, and status callbacks. */
2820 const struct netdev_class netdev_linux_class =
2823 netdev_linux_construct,
2824 netdev_linux_get_stats,
2825 netdev_linux_get_features,
2826 netdev_linux_get_status);
2828 const struct netdev_class netdev_tap_class =
2831 netdev_linux_construct_tap,
2832 netdev_tap_get_stats,
2833 netdev_linux_get_features,
2834 netdev_linux_get_status);
2836 const struct netdev_class netdev_internal_class =
2839 netdev_linux_construct,
2840 netdev_internal_get_stats,
2841 NULL, /* get_features */
2842 netdev_internal_get_status);
/* CoDel traffic control class: classless qdisc, hence zero queues. */
2845 #define CODEL_N_QUEUES 0x0000
2847 /* In sufficiently new kernel headers these are defined as enums in
2848 * <linux/pkt_sched.h>. Define them here as macros to help out with older
2849 * kernels. (This overrides any enum definition in the header file but that's
2851 #define TCA_CODEL_TARGET 1
2852 #define TCA_CODEL_LIMIT 2
2853 #define TCA_CODEL_INTERVAL 3
/* Returns the struct codel embedding 'netdev_''s installed tc state. */
2862 static struct codel *
2863 codel_get__(const struct netdev *netdev_)
2865 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2866 return CONTAINER_OF(netdev->tc, struct codel, tc);
/* Records a CoDel tc instance with the given parameters as 'netdev_''s
 * current qdisc state (bookkeeping only; no kernel interaction). */
2870 codel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
2873 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2874 struct codel *codel;
2876 codel = xmalloc(sizeof *codel);
2877 tc_init(&codel->tc, &tc_ops_codel);
2878 codel->target = target;
2879 codel->limit = limit;
2880 codel->interval = interval;
2882 netdev->tc = &codel->tc;
/* Replaces 'netdev''s root qdisc with a kernel "codel" qdisc.  Zero
 * parameters fall back to defaults (target 5000, limit 10240, interval
 * 100000 — microsecond/packet units per tc-codel). */
2886 codel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
2890 struct ofpbuf request;
2891 struct tcmsg *tcmsg;
2892 uint32_t otarget, olimit, ointerval;
2895 tc_del_qdisc(netdev);
2897 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2898 NLM_F_EXCL | NLM_F_CREATE, &request);
2902 tcmsg->tcm_handle = tc_make_handle(1, 0);
2903 tcmsg->tcm_parent = TC_H_ROOT;
2905 otarget = target ? target : 5000;
2906 olimit = limit ? limit : 10240;
2907 ointerval = interval ? interval : 100000;
2909 nl_msg_put_string(&request, TCA_KIND, "codel");
2910 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2911 nl_msg_put_u32(&request, TCA_CODEL_TARGET, otarget);
2912 nl_msg_put_u32(&request, TCA_CODEL_LIMIT, olimit);
2913 nl_msg_put_u32(&request, TCA_CODEL_INTERVAL, ointerval);
2914 nl_msg_end_nested(&request, opt_offset);
2916 error = tc_transact(&request, NULL);
2918 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
2919 "target %u, limit %u, interval %u error %d(%s)",
2920 netdev_get_name(netdev),
2921 otarget, olimit, ointerval,
2922 error, ovs_strerror(error));
/* Parses "target"/"limit"/"interval" from 'details' into 'codel', applying
 * the same defaults (5000/10240/100000) used by codel_setup_qdisc__(). */
2928 codel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
2929 const struct smap *details, struct codel *codel)
2931 const char *target_s;
2932 const char *limit_s;
2933 const char *interval_s;
2935 target_s = smap_get(details, "target");
2936 limit_s = smap_get(details, "limit");
2937 interval_s = smap_get(details, "interval");
2939 codel->target = target_s ? strtoull(target_s, NULL, 10) : 0;
2940 codel->limit = limit_s ? strtoull(limit_s, NULL, 10) : 0;
2941 codel->interval = interval_s ? strtoull(interval_s, NULL, 10) : 0;
2943 if (!codel->target) {
2944 codel->target = 5000;
2946 if (!codel->limit) {
2947 codel->limit = 10240;
2949 if (!codel->interval) {
2950 codel->interval = 100000;
/* tc_ops "install" callback: parses 'details', programs the kernel qdisc,
 * and on success records the new state via codel_install__(). */
2955 codel_tc_install(struct netdev *netdev, const struct smap *details)
2960 codel_parse_qdisc_details__(netdev, details, &codel);
2961 error = codel_setup_qdisc__(netdev, codel.target, codel.limit,
2964 codel_install__(netdev, codel.target, codel.limit, codel.interval);
/* Extracts CoDel target/limit/interval from the nested TCA_OPTIONS
 * attribute 'nl_options' into 'codel'. */
2970 codel_parse_tca_options__(struct nlattr *nl_options, struct codel *codel)
2972 static const struct nl_policy tca_codel_policy[] = {
2973 [TCA_CODEL_TARGET] = { .type = NL_A_U32 },
2974 [TCA_CODEL_LIMIT] = { .type = NL_A_U32 },
2975 [TCA_CODEL_INTERVAL] = { .type = NL_A_U32 }
2978 struct nlattr *attrs[ARRAY_SIZE(tca_codel_policy)];
2980 if (!nl_parse_nested(nl_options, tca_codel_policy,
2981 attrs, ARRAY_SIZE(tca_codel_policy))) {
2982 VLOG_WARN_RL(&rl, "failed to parse CoDel class options");
2986 codel->target = nl_attr_get_u32(attrs[TCA_CODEL_TARGET]);
2987 codel->limit = nl_attr_get_u32(attrs[TCA_CODEL_LIMIT]);
2988 codel->interval = nl_attr_get_u32(attrs[TCA_CODEL_INTERVAL]);
/* tc_ops "load" callback: reconstructs in-memory CoDel state from an
 * existing kernel qdisc described by netlink message 'nlmsg'. */
2993 codel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
2995 struct nlattr *nlattr;
3000 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3005 error = codel_parse_tca_options__(nlattr, &codel);
3010 codel_install__(netdev, codel.target, codel.limit, codel.interval);
/* tc_ops "destroy" callback: frees the CoDel state embedding 'tc'. */
3016 codel_tc_destroy(struct tc *tc)
3018 struct codel *codel = CONTAINER_OF(tc, struct codel, tc);
/* tc_ops "qdisc_get" callback: reports current CoDel parameters. */
3024 codel_qdisc_get(const struct netdev *netdev, struct smap *details)
3026 const struct codel *codel = codel_get__(netdev);
3027 smap_add_format(details, "target", "%u", codel->target);
3028 smap_add_format(details, "limit", "%u", codel->limit);
3029 smap_add_format(details, "interval", "%u", codel->interval);
/* tc_ops "qdisc_set" callback: parses 'details' and updates the cached
 * CoDel parameters in place.  NOTE(review): elided extract — whether the
 * kernel qdisc is also reprogrammed here is not visible. */
3034 codel_qdisc_set(struct netdev *netdev, const struct smap *details)
3038 codel_parse_qdisc_details__(netdev, details, &codel);
3039 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3040 codel_get__(netdev)->target = codel.target;
3041 codel_get__(netdev)->limit = codel.limit;
3042 codel_get__(netdev)->interval = codel.interval;
/* tc_ops vtable for "linux-codel".  NOTE(review): remaining callback
 * entries are elided in this extract. */
3046 static const struct tc_ops tc_ops_codel = {
3047 "codel", /* linux_name */
3048 "linux-codel", /* ovs_name */
3049 CODEL_N_QUEUES, /* n_queues */
3062 /* FQ-CoDel traffic control class. */
3064 #define FQCODEL_N_QUEUES 0x0000
3066 /* In sufficiently new kernel headers these are defined as enums in
3067 * <linux/pkt_sched.h>. Define them here as macros to help out with older
3068 * kernels. (This overrides any enum definition in the header file but that's
3070 #define TCA_FQ_CODEL_TARGET 1
3071 #define TCA_FQ_CODEL_LIMIT 2
3072 #define TCA_FQ_CODEL_INTERVAL 3
3073 #define TCA_FQ_CODEL_ECN 4
3074 #define TCA_FQ_CODEL_FLOWS 5
3075 #define TCA_FQ_CODEL_QUANTUM 6
/* Returns the struct fqcodel embedding 'netdev_''s installed tc state. */
3086 static struct fqcodel *
3087 fqcodel_get__(const struct netdev *netdev_)
3089 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3090 return CONTAINER_OF(netdev->tc, struct fqcodel, tc);
/* Records an FQ-CoDel tc instance with the given parameters as 'netdev_''s
 * current qdisc state (bookkeeping only; no kernel interaction). */
3094 fqcodel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
3095 uint32_t interval, uint32_t flows, uint32_t quantum)
3097 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3098 struct fqcodel *fqcodel;
3100 fqcodel = xmalloc(sizeof *fqcodel);
3101 tc_init(&fqcodel->tc, &tc_ops_fqcodel);
3102 fqcodel->target = target;
3103 fqcodel->limit = limit;
3104 fqcodel->interval = interval;
3105 fqcodel->flows = flows;
3106 fqcodel->quantum = quantum;
3108 netdev->tc = &fqcodel->tc;
/* Replaces 'netdev''s root qdisc with a kernel "fq_codel" qdisc.  Zero
 * parameters fall back to defaults (target 5000, limit 10240, interval
 * 100000, flows 1024, quantum 1514). */
3112 fqcodel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
3113 uint32_t interval, uint32_t flows, uint32_t quantum)
3116 struct ofpbuf request;
3117 struct tcmsg *tcmsg;
3118 uint32_t otarget, olimit, ointerval, oflows, oquantum;
3121 tc_del_qdisc(netdev);
3123 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3124 NLM_F_EXCL | NLM_F_CREATE, &request);
3128 tcmsg->tcm_handle = tc_make_handle(1, 0);
3129 tcmsg->tcm_parent = TC_H_ROOT;
3131 otarget = target ? target : 5000;
3132 olimit = limit ? limit : 10240;
3133 ointerval = interval ? interval : 100000;
3134 oflows = flows ? flows : 1024;
3135 oquantum = quantum ? quantum : 1514; /* fq_codel default quantum is 1514
3138 nl_msg_put_string(&request, TCA_KIND, "fq_codel");
3139 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3140 nl_msg_put_u32(&request, TCA_FQ_CODEL_TARGET, otarget);
3141 nl_msg_put_u32(&request, TCA_FQ_CODEL_LIMIT, olimit);
3142 nl_msg_put_u32(&request, TCA_FQ_CODEL_INTERVAL, ointerval);
3143 nl_msg_put_u32(&request, TCA_FQ_CODEL_FLOWS, oflows);
3144 nl_msg_put_u32(&request, TCA_FQ_CODEL_QUANTUM, oquantum);
3145 nl_msg_end_nested(&request, opt_offset);
3147 error = tc_transact(&request, NULL);
3149 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3150 "target %u, limit %u, interval %u, flows %u, quantum %u error %d(%s)",
3151 netdev_get_name(netdev),
3152 otarget, olimit, ointerval, oflows, oquantum,
3153 error, ovs_strerror(error));
/* Parses FQ-CoDel parameters from 'details' into 'fqcodel', applying
 * defaults for any that are absent or zero.
 * NOTE(review): the interval default here is 1000000, whereas
 * fqcodel_setup_qdisc__()'s own fallback and CoDel's default are 100000 —
 * looks inconsistent; confirm which value is intended before changing. */
3159 fqcodel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
3160 const struct smap *details, struct fqcodel *fqcodel)
3162 const char *target_s;
3163 const char *limit_s;
3164 const char *interval_s;
3165 const char *flows_s;
3166 const char *quantum_s;
3168 target_s = smap_get(details, "target");
3169 limit_s = smap_get(details, "limit");
3170 interval_s = smap_get(details, "interval");
3171 flows_s = smap_get(details, "flows");
3172 quantum_s = smap_get(details, "quantum");
3173 fqcodel->target = target_s ? strtoull(target_s, NULL, 10) : 0;
3174 fqcodel->limit = limit_s ? strtoull(limit_s, NULL, 10) : 0;
3175 fqcodel->interval = interval_s ? strtoull(interval_s, NULL, 10) : 0;
3176 fqcodel->flows = flows_s ? strtoull(flows_s, NULL, 10) : 0;
3177 fqcodel->quantum = quantum_s ? strtoull(quantum_s, NULL, 10) : 0;
3178 if (!fqcodel->target) {
3179 fqcodel->target = 5000;
3181 if (!fqcodel->limit) {
3182 fqcodel->limit = 10240;
3184 if (!fqcodel->interval) {
3185 fqcodel->interval = 1000000;
3187 if (!fqcodel->flows) {
3188 fqcodel->flows = 1024;
3190 if (!fqcodel->quantum) {
3191 fqcodel->quantum = 1514;
/* tc_ops "install" callback: parses 'details', programs the kernel qdisc,
 * and on success records the new state via fqcodel_install__(). */
3196 fqcodel_tc_install(struct netdev *netdev, const struct smap *details)
3199 struct fqcodel fqcodel;
3201 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3202 error = fqcodel_setup_qdisc__(netdev, fqcodel.target, fqcodel.limit,
3203 fqcodel.interval, fqcodel.flows,
3206 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit,
3207 fqcodel.interval, fqcodel.flows, fqcodel.quantum);
/* Extracts FQ-CoDel parameters from the nested TCA_OPTIONS attribute
 * 'nl_options' into 'fqcodel'. */
3213 fqcodel_parse_tca_options__(struct nlattr *nl_options, struct fqcodel *fqcodel)
3215 static const struct nl_policy tca_fqcodel_policy[] = {
3216 [TCA_FQ_CODEL_TARGET] = { .type = NL_A_U32 },
3217 [TCA_FQ_CODEL_LIMIT] = { .type = NL_A_U32 },
3218 [TCA_FQ_CODEL_INTERVAL] = { .type = NL_A_U32 },
3219 [TCA_FQ_CODEL_FLOWS] = { .type = NL_A_U32 },
3220 [TCA_FQ_CODEL_QUANTUM] = { .type = NL_A_U32 }
3223 struct nlattr *attrs[ARRAY_SIZE(tca_fqcodel_policy)];
3225 if (!nl_parse_nested(nl_options, tca_fqcodel_policy,
3226 attrs, ARRAY_SIZE(tca_fqcodel_policy))) {
3227 VLOG_WARN_RL(&rl, "failed to parse FQ_CoDel class options");
3231 fqcodel->target = nl_attr_get_u32(attrs[TCA_FQ_CODEL_TARGET]);
3232 fqcodel->limit = nl_attr_get_u32(attrs[TCA_FQ_CODEL_LIMIT]);
3233 fqcodel->interval =nl_attr_get_u32(attrs[TCA_FQ_CODEL_INTERVAL]);
3234 fqcodel->flows = nl_attr_get_u32(attrs[TCA_FQ_CODEL_FLOWS]);
3235 fqcodel->quantum = nl_attr_get_u32(attrs[TCA_FQ_CODEL_QUANTUM]);
/* tc_ops "load" callback: reconstructs in-memory FQ-CoDel state from an
 * existing kernel qdisc described by netlink message 'nlmsg'. */
3240 fqcodel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3242 struct nlattr *nlattr;
3245 struct fqcodel fqcodel;
3247 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3252 error = fqcodel_parse_tca_options__(nlattr, &fqcodel);
3257 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3258 fqcodel.flows, fqcodel.quantum);
/* tc_ops "destroy" callback: frees the FQ-CoDel state embedding 'tc'. */
3263 fqcodel_tc_destroy(struct tc *tc)
3265 struct fqcodel *fqcodel = CONTAINER_OF(tc, struct fqcodel, tc);
/* tc_ops "qdisc_get" callback: reports current FQ-CoDel parameters. */
3271 fqcodel_qdisc_get(const struct netdev *netdev, struct smap *details)
3273 const struct fqcodel *fqcodel = fqcodel_get__(netdev);
3274 smap_add_format(details, "target", "%u", fqcodel->target);
3275 smap_add_format(details, "limit", "%u", fqcodel->limit);
3276 smap_add_format(details, "interval", "%u", fqcodel->interval);
3277 smap_add_format(details, "flows", "%u", fqcodel->flows);
3278 smap_add_format(details, "quantum", "%u", fqcodel->quantum);
/* tc_ops "qdisc_set" callback: parses 'details' and updates the cached
 * FQ-CoDel parameters in place. */
3283 fqcodel_qdisc_set(struct netdev *netdev, const struct smap *details)
3285 struct fqcodel fqcodel;
3287 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3288 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3289 fqcodel.flows, fqcodel.quantum);
3290 fqcodel_get__(netdev)->target = fqcodel.target;
3291 fqcodel_get__(netdev)->limit = fqcodel.limit;
3292 fqcodel_get__(netdev)->interval = fqcodel.interval;
3293 fqcodel_get__(netdev)->flows = fqcodel.flows;
3294 fqcodel_get__(netdev)->quantum = fqcodel.quantum;
/* tc_ops vtable for "linux-fq_codel".  NOTE(review): remaining callback
 * entries are elided in this extract. */
3298 static const struct tc_ops tc_ops_fqcodel = {
3299 "fq_codel", /* linux_name */
3300 "linux-fq_codel", /* ovs_name */
3301 FQCODEL_N_QUEUES, /* n_queues */
3314 /* SFQ traffic control class. */
3316 #define SFQ_N_QUEUES 0x0000
/* Returns the struct sfq embedding 'netdev_''s installed tc state. */
3325 sfq_get__(const struct netdev *netdev_)
3327 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3328 return CONTAINER_OF(netdev->tc, struct sfq, tc);
/* Records an SFQ tc instance with the given parameters as 'netdev_''s
 * current qdisc state (bookkeeping only).  Note the parameter order:
 * quantum first, perturb second. */
3332 sfq_install__(struct netdev *netdev_, uint32_t quantum, uint32_t perturb)
3334 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3337 sfq = xmalloc(sizeof *sfq);
3338 tc_init(&sfq->tc, &tc_ops_sfq);
3339 sfq->perturb = perturb;
3340 sfq->quantum = quantum;
3342 netdev->tc = &sfq->tc;
/* Replaces 'netdev''s root qdisc with a kernel "sfq" qdisc.  A zero
 * 'quantum' falls back to the device MTU; a zero 'perturb' falls back to a
 * 10-second perturbation period. */
3346 sfq_setup_qdisc__(struct netdev *netdev, uint32_t quantum, uint32_t perturb)
3348 struct tc_sfq_qopt opt;
3349 struct ofpbuf request;
3350 struct tcmsg *tcmsg;
3352 int mtu_error, error;
3353 mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3355 tc_del_qdisc(netdev);
3357 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3358 NLM_F_EXCL | NLM_F_CREATE, &request);
3362 tcmsg->tcm_handle = tc_make_handle(1, 0);
3363 tcmsg->tcm_parent = TC_H_ROOT;
3365 memset(&opt, 0, sizeof opt);
3368 opt.quantum = mtu; /* if we cannot find mtu, use default */
3371 opt.quantum = quantum;
3375 opt.perturb_period = 10;
3377 opt.perturb_period = perturb;
3380 nl_msg_put_string(&request, TCA_KIND, "sfq");
3381 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3383 error = tc_transact(&request, NULL);
3385 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3386 "quantum %u, perturb %u error %d(%s)",
3387 netdev_get_name(netdev),
3388 opt.quantum, opt.perturb_period,
3389 error, ovs_strerror(error));
/* Parses "quantum"/"perturb" from 'details' into 'sfq'.  A missing quantum
 * falls back to the device MTU; a device with no retrievable MTU requires
 * an explicit quantum (warning logged otherwise). */
3395 sfq_parse_qdisc_details__(struct netdev *netdev,
3396 const struct smap *details, struct sfq *sfq)
3398 const char *perturb_s;
3399 const char *quantum_s;
3403 perturb_s = smap_get(details, "perturb");
3404 quantum_s = smap_get(details, "quantum");
3405 sfq->perturb = perturb_s ? strtoull(perturb_s, NULL, 10) : 0;
3406 sfq->quantum = quantum_s ? strtoull(quantum_s, NULL, 10) : 0;
3407 if (!sfq->perturb) {
3411 if (!sfq->quantum) {
3412 mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3416 VLOG_WARN_RL(&rl, "when using SFQ, you must specify quantum on a "
3417 "device without mtu");
/* tc_ops "install" callback: parses 'details', programs the kernel qdisc,
 * and on success records the new state via sfq_install__(). */
3424 sfq_tc_install(struct netdev *netdev, const struct smap *details)
3429 sfq_parse_qdisc_details__(netdev, details, &sfq);
3430 error = sfq_setup_qdisc__(netdev, sfq.quantum, sfq.perturb);
3432 sfq_install__(netdev, sfq.quantum, sfq.perturb);
/* tc_ops "load" callback: reconstructs in-memory SFQ state from an
 * existing kernel qdisc described by netlink message 'nlmsg'. */
3438 sfq_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3440 const struct tc_sfq_qopt *sfq;
3441 struct nlattr *nlattr;
3445 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3447 sfq = nl_attr_get(nlattr);
/* Fix: sfq_install__() is declared as (netdev, quantum, perturb); the
 * original call passed perturb_period and quantum in swapped order, so
 * loaded state had the two parameters exchanged. */
3448 sfq_install__(netdev, sfq->quantum, sfq->perturb_period);
/* tc_ops "destroy" callback: frees the SFQ state embedding 'tc'. */
3456 sfq_tc_destroy(struct tc *tc)
3458 struct sfq *sfq = CONTAINER_OF(tc, struct sfq, tc);
/* tc_ops "qdisc_get" callback: reports current SFQ parameters. */
3464 sfq_qdisc_get(const struct netdev *netdev, struct smap *details)
3466 const struct sfq *sfq = sfq_get__(netdev);
3467 smap_add_format(details, "quantum", "%u", sfq->quantum);
3468 smap_add_format(details, "perturb", "%u", sfq->perturb);
/* tc_ops "qdisc_set" callback: parses 'details' and updates the cached
 * SFQ parameters in place. */
3473 sfq_qdisc_set(struct netdev *netdev, const struct smap *details)
3477 sfq_parse_qdisc_details__(netdev, details, &sfq);
3478 sfq_install__(netdev, sfq.quantum, sfq.perturb);
3479 sfq_get__(netdev)->quantum = sfq.quantum;
3480 sfq_get__(netdev)->perturb = sfq.perturb;
/* tc_ops vtable for "linux-sfq".  NOTE(review): remaining callback entries
 * are elided in this extract. */
3484 static const struct tc_ops tc_ops_sfq = {
3485 "sfq", /* linux_name */
3486 "linux-sfq", /* ovs_name */
3487 SFQ_N_QUEUES, /* n_queues */
3500 /* HTB traffic control class. */
3502 #define HTB_N_QUEUES 0xf000
3503 #define HTB_RATE2QUANTUM 10
/* NOTE(review): elided extract — the 'struct htb' wrapper declaration
 * around this max_rate member is not visible here. */
3507 unsigned int max_rate; /* In bytes/s. */
/* Per-queue HTB class: embedded tc_queue plus the four class parameters. */
3511 struct tc_queue tc_queue;
3512 unsigned int min_rate; /* In bytes/s. */
3513 unsigned int max_rate; /* In bytes/s. */
3514 unsigned int burst; /* In bytes. */
3515 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the struct htb embedding 'netdev_''s installed tc state. */
3519 htb_get__(const struct netdev *netdev_)
3521 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3522 return CONTAINER_OF(netdev->tc, struct htb, tc);
/* Records an HTB tc instance with 'max_rate' as 'netdev_''s current qdisc
 * state (bookkeeping only; no kernel interaction). */
3526 htb_install__(struct netdev *netdev_, uint64_t max_rate)
3528 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3531 htb = xmalloc(sizeof *htb);
3532 tc_init(&htb->tc, &tc_ops_htb);
3533 htb->max_rate = max_rate;
3535 netdev->tc = &htb->tc;
3538 /* Create an HTB qdisc.
3540 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
3542 htb_setup_qdisc__(struct netdev *netdev)
3545 struct tc_htb_glob opt;
3546 struct ofpbuf request;
3547 struct tcmsg *tcmsg;
3549 tc_del_qdisc(netdev);
3551 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3552 NLM_F_EXCL | NLM_F_CREATE, &request);
3556 tcmsg->tcm_handle = tc_make_handle(1, 0);
3557 tcmsg->tcm_parent = TC_H_ROOT;
3559 nl_msg_put_string(&request, TCA_KIND, "htb");
3561 memset(&opt, 0, sizeof opt);
3562 opt.rate2quantum = HTB_RATE2QUANTUM;
3566 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3567 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
3568 nl_msg_end_nested(&request, opt_offset);
3570 return tc_transact(&request, NULL);
3573 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
3574 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
3576 htb_setup_class__(struct netdev *netdev, unsigned int handle,
3577 unsigned int parent, struct htb_class *class)
3580 struct tc_htb_opt opt;
3581 struct ofpbuf request;
3582 struct tcmsg *tcmsg;
3586 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3588 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
3589 netdev_get_name(netdev));
3593 memset(&opt, 0, sizeof opt);
3594 tc_fill_rate(&opt.rate, class->min_rate, mtu);
3595 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
3596 /* Makes sure the quantum is at least MTU. Setting quantum will
3597 * make htb ignore the r2q for this class. */
3598 if ((class->min_rate / HTB_RATE2QUANTUM) < mtu) {
3601 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
3602 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
3603 opt.prio = class->priority;
3605 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3609 tcmsg->tcm_handle = handle;
3610 tcmsg->tcm_parent = parent;
3612 nl_msg_put_string(&request, TCA_KIND, "htb");
3613 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3614 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
3615 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
3616 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
3617 nl_msg_end_nested(&request, opt_offset);
3619 error = tc_transact(&request, NULL);
3621 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3622 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
3623 netdev_get_name(netdev),
3624 tc_get_major(handle), tc_get_minor(handle),
3625 tc_get_major(parent), tc_get_minor(parent),
3626 class->min_rate, class->max_rate,
3627 class->burst, class->priority, ovs_strerror(error));
3632 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
3633 * description of them into 'details'. The description complies with the
3634 * specification given in the vswitch database documentation for linux-htb
3637 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
3639 static const struct nl_policy tca_htb_policy[] = {
3640 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
3641 .min_len = sizeof(struct tc_htb_opt) },
3644 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
3645 const struct tc_htb_opt *htb;
3647 if (!nl_parse_nested(nl_options, tca_htb_policy,
3648 attrs, ARRAY_SIZE(tca_htb_policy))) {
3649 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
/* The burst is stored in the kernel in ticks; convert back to bytes. */
3653 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
3654 class->min_rate = htb->rate.rate;
3655 class->max_rate = htb->ceil.rate;
3656 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
3657 class->priority = htb->prio;
/* Parses an RTM_NEWTCLASS message into a queue ID (handle 1:N maps to
 * queue N-1), class options, and/or stats — each output is optional
 * (NULL to skip). */
3662 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3663 struct htb_class *options,
3664 struct netdev_queue_stats *stats)
3666 struct nlattr *nl_options;
3667 unsigned int handle;
3670 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3671 if (!error && queue_id) {
3672 unsigned int major = tc_get_major(handle);
3673 unsigned int minor = tc_get_minor(handle);
3674 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3675 *queue_id = minor - 1;
3680 if (!error && options) {
3681 error = htb_parse_tca_options__(nl_options, options);
3687 htb_parse_qdisc_details__(struct netdev *netdev_,
3688 const struct smap *details, struct htb_class *hc)
3690 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3691 const char *max_rate_s;
3693 max_rate_s = smap_get(details, "max-rate");
3694 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3695 if (!hc->max_rate) {
3696 enum netdev_features current;
3698 netdev_linux_read_features(netdev);
3699 current = !netdev->get_features_error ? netdev->current : 0;
3700 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3702 hc->min_rate = hc->max_rate;
/* Parses per-class "min-rate"/"max-rate"/"burst"/"priority" from 'details'
 * into 'hc', clamping rates into [mtu, qdisc max_rate] and burst to at
 * least MTU + 64 bytes of headroom. */
3708 htb_parse_class_details__(struct netdev *netdev,
3709 const struct smap *details, struct htb_class *hc)
3711 const struct htb *htb = htb_get__(netdev);
3712 const char *min_rate_s = smap_get(details, "min-rate");
3713 const char *max_rate_s = smap_get(details, "max-rate");
3714 const char *burst_s = smap_get(details, "burst");
3715 const char *priority_s = smap_get(details, "priority");
3718 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3720 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
3721 netdev_get_name(netdev));
3725 /* HTB requires at least an mtu sized min-rate to send any traffic even
3726 * on uncongested links. */
3727 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3728 hc->min_rate = MAX(hc->min_rate, mtu);
3729 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
3732 hc->max_rate = (max_rate_s
3733 ? strtoull(max_rate_s, NULL, 10) / 8
3735 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
3736 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
3740 * According to hints in the documentation that I've read, it is important
3741 * that 'burst' be at least as big as the largest frame that might be
3742 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
3743 * but having it a bit too small is a problem. Since netdev_get_mtu()
3744 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
3745 * the MTU. We actually add 64, instead of 14, as a guard against
3746 * additional headers get tacked on somewhere that we're not aware of. */
3747 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
3748 hc->burst = MAX(hc->burst, mtu + 64);
3751 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Queries the kernel for the HTB class 'handle' under 'parent' and parses
 * its options/stats; frees the netlink reply before returning. */
3757 htb_query_class__(const struct netdev *netdev, unsigned int handle,
3758 unsigned int parent, struct htb_class *options,
3759 struct netdev_queue_stats *stats)
3761 struct ofpbuf *reply;
3764 error = tc_query_class(netdev, handle, parent, &reply);
3766 error = htb_parse_tcmsg__(reply, NULL, options, stats);
3767 ofpbuf_delete(reply);
/* tc_ops "install" callback: creates the root HTB qdisc plus the default
 * class 1:fffe, then records the state via htb_install__(). */
3773 htb_tc_install(struct netdev *netdev, const struct smap *details)
3777 error = htb_setup_qdisc__(netdev);
3779 struct htb_class hc;
3781 htb_parse_qdisc_details__(netdev, details, &hc);
3782 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3783 tc_make_handle(1, 0), &hc);
3785 htb_install__(netdev, hc.max_rate);
/* Downcasts a generic 'tc_queue' embedded in an htb_class back to the
 * containing htb_class. */
3791 static struct htb_class *
3792 htb_class_cast__(const struct tc_queue *queue)
3794 return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Creates or updates the cached htb_class for 'queue_id' on 'netdev',
 * copying the rates/burst/priority from 'hc' into the cache entry.
 * NOTE(review): the if/else around "found vs. newly allocated" is elided
 * from this listing — the xmalloc path clearly runs only when no existing
 * queue is found. */
3798 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
3799 const struct htb_class *hc)
3801 struct htb *htb = htb_get__(netdev);
3802 size_t hash = hash_int(queue_id, 0);
3803 struct tc_queue *queue;
3804 struct htb_class *hcp;
3806 queue = tc_find_queue__(netdev, queue_id, hash);
3808 hcp = htb_class_cast__(queue);
/* Not previously cached: allocate and insert a new entry. */
3810 hcp = xmalloc(sizeof *hcp);
3811 queue = &hcp->tc_queue;
3812 queue->queue_id = queue_id;
3813 queue->created = time_msec();
3814 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
3817 hcp->min_rate = hc->min_rate;
3818 hcp->max_rate = hc->max_rate;
3819 hcp->burst = hc->burst;
3820 hcp->priority = hc->priority;
/* tc_load callback for "linux-htb": reconstructs OVS's view of an HTB qdisc
 * that already exists in the kernel.  Queries the root class 1:fffe for the
 * max-rate, then dumps all classes and caches each one.
 * NOTE(review): loop/brace lines are elided in this listing. */
3824 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3827 struct queue_dump_state state;
3828 struct htb_class hc;
3830 /* Get qdisc options. */
3832 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3833 htb_install__(netdev, hc.max_rate);
/* Walk every class the kernel reports and mirror it into the queue cache. */
3836 if (!start_queue_dump(netdev, &state)) {
3839 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
3840 unsigned int queue_id;
3842 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3843 htb_update_queue__(netdev, queue_id, &hc);
3846 finish_queue_dump(&state);
/* tc_destroy callback: frees every cached htb_class and the htb itself.
 * NOTE(review): the free() calls are elided from this listing. */
3852 htb_tc_destroy(struct tc *tc)
3854 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
3855 struct htb_class *hc, *next;
3857 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
3858 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* qdisc_get callback: reports max-rate in bits/s (stored as bytes/s,
 * hence the 8ULL multiplier). */
3866 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
3868 const struct htb *htb = htb_get__(netdev);
3869 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
/* qdisc_set callback: reconfigures the root class 1:fffe and, on success,
 * updates the cached max_rate. */
3874 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
3876 struct htb_class hc;
3879 htb_parse_qdisc_details__(netdev, details, &hc);
3880 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3881 tc_make_handle(1, 0), &hc);
3883 htb_get__(netdev)->max_rate = hc.max_rate;
/* class_get callback: exports a queue's cached parameters (bits/s; max-rate
 * only when it differs from min-rate). */
3889 htb_class_get(const struct netdev *netdev OVS_UNUSED,
3890 const struct tc_queue *queue, struct smap *details)
3892 const struct htb_class *hc = htb_class_cast__(queue);
3894 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3895 if (hc->min_rate != hc->max_rate) {
3896 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3898 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
3900 smap_add_format(details, "priority", "%u", hc->priority);
/* class_set callback: queue N maps to tc class 1:(N+1) under parent 1:fffe.
 * On success the local cache is refreshed. */
3906 htb_class_set(struct netdev *netdev, unsigned int queue_id,
3907 const struct smap *details)
3909 struct htb_class hc;
3912 error = htb_parse_class_details__(netdev, details, &hc);
3917 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3918 tc_make_handle(1, 0xfffe), &hc);
3923 htb_update_queue__(netdev, queue_id, &hc);
/* class_delete callback: removes the kernel class and, on success, drops the
 * cache entry.  NOTE(review): the free(hc) line is elided here. */
3928 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
3930 struct htb_class *hc = htb_class_cast__(queue);
3931 struct htb *htb = htb_get__(netdev);
3934 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3936 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* class_get_stats callback: straight pass-through to htb_query_class__,
 * asking only for stats. */
3943 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3944 struct netdev_queue_stats *stats)
3946 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3947 tc_make_handle(1, 0xfffe), NULL, stats);
/* class_dump_stats callback: parses one netlink class message and invokes
 * 'cb' for it, translating class minor N back to queue id N-1.  Classes
 * outside major 1 / the queue range (e.g. the root 1:fffe) are skipped. */
3951 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3952 const struct ofpbuf *nlmsg,
3953 netdev_dump_queue_stats_cb *cb, void *aux)
3955 struct netdev_queue_stats stats;
3956 unsigned int handle, major, minor;
3959 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3964 major = tc_get_major(handle);
3965 minor = tc_get_minor(handle);
3966 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3967 (*cb)(minor - 1, &stats, aux);
/* vtable wiring HTB into the generic tc layer.
 * NOTE(review): several middle entries are elided in this listing. */
3972 static const struct tc_ops tc_ops_htb = {
3973 "htb", /* linux_name */
3974 "linux-htb", /* ovs_name */
3975 HTB_N_QUEUES, /* n_queues */
3984 htb_class_get_stats,
3985 htb_class_dump_stats
3988 /* "linux-hfsc" traffic control class. */
3990 #define HFSC_N_QUEUES 0xf000
3998 struct tc_queue tc_queue;
/* Downcasts 'netdev''s generic tc state to the containing hfsc. */
4003 static struct hfsc *
4004 hfsc_get__(const struct netdev *netdev_)
4006 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4007 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
/* Downcasts a generic tc_queue to the containing hfsc_class. */
4010 static struct hfsc_class *
4011 hfsc_class_cast__(const struct tc_queue *queue)
4013 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Allocates and registers fresh HFSC state on 'netdev_' with the given
 * 'max_rate' (bytes/s, by analogy with the HTB code above). */
4017 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
4019 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4022 hfsc = xmalloc(sizeof *hfsc);
4023 tc_init(&hfsc->tc, &tc_ops_hfsc);
4024 hfsc->max_rate = max_rate;
4025 netdev->tc = &hfsc->tc;
/* Creates or updates the cached hfsc_class for 'queue_id', mirroring
 * htb_update_queue__() above.
 * NOTE(review): the found/not-found branch lines are elided here. */
4029 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
4030 const struct hfsc_class *hc)
4034 struct hfsc_class *hcp;
4035 struct tc_queue *queue;
4037 hfsc = hfsc_get__(netdev);
4038 hash = hash_int(queue_id, 0);
4040 queue = tc_find_queue__(netdev, queue_id, hash);
4042 hcp = hfsc_class_cast__(queue);
/* Not previously cached: allocate and insert a new entry. */
4044 hcp = xmalloc(sizeof *hcp);
4045 queue = &hcp->tc_queue;
4046 queue->queue_id = queue_id;
4047 queue->created = time_msec();
4048 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
4051 hcp->min_rate = hc->min_rate;
4052 hcp->max_rate = hc->max_rate;
/* Parses the nested TCA_OPTIONS of an HFSC class into 'class'.  Only linear
 * service curves (m1 == 0, d == 0) with identical RT/LS curves are accepted;
 * anything else is rejected with a rate-limited warning.
 * NOTE(review): attribute-index initializers and error returns are elided
 * from this listing. */
4056 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
4058 const struct tc_service_curve *rsc, *fsc, *usc;
4059 static const struct nl_policy tca_hfsc_policy[] = {
4061 .type = NL_A_UNSPEC,
4063 .min_len = sizeof(struct tc_service_curve),
4066 .type = NL_A_UNSPEC,
4068 .min_len = sizeof(struct tc_service_curve),
4071 .type = NL_A_UNSPEC,
4073 .min_len = sizeof(struct tc_service_curve),
4076 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
4078 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
4079 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
4080 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
/* RSC = real-time, FSC = link-sharing, USC = upper-limit service curve. */
4084 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
4085 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
4086 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
4088 if (rsc->m1 != 0 || rsc->d != 0 ||
4089 fsc->m1 != 0 || fsc->d != 0 ||
4090 usc->m1 != 0 || usc->d != 0) {
4091 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4092 "Non-linear service curves are not supported.");
4096 if (rsc->m2 != fsc->m2) {
4097 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4098 "Real-time service curves are not supported ");
4102 if (rsc->m2 > usc->m2) {
4103 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4104 "Min-rate service curve is greater than "
4105 "the max-rate service curve.");
/* min-rate comes from the link-sharing curve, max-rate from the limit. */
4109 class->min_rate = fsc->m2;
4110 class->max_rate = usc->m2;
/* Parses one netlink class message: extracts the queue id (class minor - 1,
 * only for major 1 and minors in range), the service-curve options, and the
 * queue stats.  Any output pointer may be NULL.
 * NOTE(review): the null-guard branches are elided in this listing. */
4115 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
4116 struct hfsc_class *options,
4117 struct netdev_queue_stats *stats)
4120 unsigned int handle;
4121 struct nlattr *nl_options;
4123 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
4129 unsigned int major, minor;
4131 major = tc_get_major(handle);
4132 minor = tc_get_minor(handle);
4133 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
4134 *queue_id = minor - 1;
4141 error = hfsc_parse_tca_options__(nl_options, options);
/* Queries the kernel for HFSC class 'handle' under 'parent' and parses the
 * reply into '*options'/'*stats'.  Returns 0 or a positive errno value.
 * NOTE(review): the success check around the parse is elided here. */
4148 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
4149 unsigned int parent, struct hfsc_class *options,
4150 struct netdev_queue_stats *stats)
4153 struct ofpbuf *reply;
4155 error = tc_query_class(netdev, handle, parent, &reply);
4160 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
4161 ofpbuf_delete(reply);
/* Fills 'class' for the root class from qdisc 'details': max-rate in bits/s
 * converted to bytes/s; when absent, fall back to the link speed (default
 * 100 Mbps when features are unknown).  Root min == max. */
4166 hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
4167 struct hfsc_class *class)
4169 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4171 const char *max_rate_s;
4173 max_rate_s = smap_get(details, "max-rate");
4174 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
4177 enum netdev_features current;
4179 netdev_linux_read_features(netdev);
4180 current = !netdev->get_features_error ? netdev->current : 0;
4181 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
4184 class->min_rate = max_rate;
4185 class->max_rate = max_rate;
/* Fills 'class' for a leaf queue from 'details', clamping:
 * min-rate to [1, qdisc max-rate], max-rate to [min-rate, qdisc max-rate].
 * All rates are supplied in bits/s and stored in bytes/s. */
4189 hfsc_parse_class_details__(struct netdev *netdev,
4190 const struct smap *details,
4191 struct hfsc_class * class)
4193 const struct hfsc *hfsc;
4194 uint32_t min_rate, max_rate;
4195 const char *min_rate_s, *max_rate_s;
4197 hfsc = hfsc_get__(netdev);
4198 min_rate_s = smap_get(details, "min-rate");
4199 max_rate_s = smap_get(details, "max-rate");
4201 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
4202 min_rate = MAX(min_rate, 1);
4203 min_rate = MIN(min_rate, hfsc->max_rate);
4205 max_rate = (max_rate_s
4206 ? strtoull(max_rate_s, NULL, 10) / 8
4208 max_rate = MAX(max_rate, min_rate);
4209 max_rate = MIN(max_rate, hfsc->max_rate);
4211 class->min_rate = min_rate;
4212 class->max_rate = max_rate;
4217 /* Create an HFSC qdisc.
4219 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
4221 hfsc_setup_qdisc__(struct netdev * netdev)
4223 struct tcmsg *tcmsg;
4224 struct ofpbuf request;
4225 struct tc_hfsc_qopt opt;
/* Remove any existing root qdisc first so NLM_F_EXCL cannot fail. */
4227 tc_del_qdisc(netdev);
4229 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
4230 NLM_F_EXCL | NLM_F_CREATE, &request);
4236 tcmsg->tcm_handle = tc_make_handle(1, 0);
4237 tcmsg->tcm_parent = TC_H_ROOT;
4239 memset(&opt, 0, sizeof opt);
4242 nl_msg_put_string(&request, TCA_KIND, "hfsc");
4243 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
4245 return tc_transact(&request, NULL);
4248 /* Create an HFSC class.
4250 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
4251 * sc rate <min_rate> ul rate <max_rate>" */
4253 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
4254 unsigned int parent, struct hfsc_class *class)
4258 struct tcmsg *tcmsg;
4259 struct ofpbuf request;
4260 struct tc_service_curve min, max;
4262 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
4268 tcmsg->tcm_handle = handle;
4269 tcmsg->tcm_parent = parent;
/* Linear service curves: only the m2 slope is set (m1/d zeroing lines are
 * elided from this listing). */
4273 min.m2 = class->min_rate;
4277 max.m2 = class->max_rate;
4279 nl_msg_put_string(&request, TCA_KIND, "hfsc");
4280 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
/* 'min' is deliberately used for both RSC (real-time) and FSC
 * (link-sharing); 'max' supplies the upper limit. */
4281 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
4282 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
4283 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
4284 nl_msg_end_nested(&request, opt_offset);
4286 error = tc_transact(&request, NULL);
4288 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
4289 "min-rate %ubps, max-rate %ubps (%s)",
4290 netdev_get_name(netdev),
4291 tc_get_major(handle), tc_get_minor(handle),
4292 tc_get_major(parent), tc_get_minor(parent),
4293 class->min_rate, class->max_rate, ovs_strerror(error));
/* tc_install callback for "linux-hfsc": creates the qdisc, configures root
 * class 1:fffe from 'details', and records state via hfsc_install__().
 * NOTE(review): error-propagation lines are elided in this listing. */
4300 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
4303 struct hfsc_class class;
4305 error = hfsc_setup_qdisc__(netdev);
4311 hfsc_parse_qdisc_details__(netdev, details, &class);
4312 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4313 tc_make_handle(1, 0), &class);
4319 hfsc_install__(netdev, class.max_rate);
/* tc_load callback: mirrors an existing kernel HFSC qdisc into OVS state,
 * same pattern as htb_tc_load() above. */
4324 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4327 struct queue_dump_state state;
4328 struct hfsc_class hc;
4331 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
4332 hfsc_install__(netdev, hc.max_rate);
4334 if (!start_queue_dump(netdev, &state)) {
4338 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
4339 unsigned int queue_id;
4341 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
4342 hfsc_update_queue__(netdev, queue_id, &hc);
4346 finish_queue_dump(&state);
/* tc_destroy callback: frees every cached class and the hfsc itself.
 * NOTE(review): the free() calls are elided from this listing. */
4351 hfsc_tc_destroy(struct tc *tc)
4354 struct hfsc_class *hc, *next;
4356 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
4358 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
4359 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* qdisc_get callback: reports max-rate in bits/s. */
4368 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
4370 const struct hfsc *hfsc;
4371 hfsc = hfsc_get__(netdev);
4372 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
/* qdisc_set callback: reconfigures root class 1:fffe; on success updates
 * the cached max_rate. */
4377 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
4380 struct hfsc_class class;
4382 hfsc_parse_qdisc_details__(netdev, details, &class);
4383 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4384 tc_make_handle(1, 0), &class);
4387 hfsc_get__(netdev)->max_rate = class.max_rate;
/* class_get callback: exports a queue's cached rates in bits/s; max-rate
 * only when distinct from min-rate. */
4394 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
4395 const struct tc_queue *queue, struct smap *details)
4397 const struct hfsc_class *hc;
4399 hc = hfsc_class_cast__(queue);
4400 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
4401 if (hc->min_rate != hc->max_rate) {
4402 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
/* class_set callback: queue N maps to class 1:(N+1) under parent 1:fffe. */
4408 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
4409 const struct smap *details)
4412 struct hfsc_class class;
4414 error = hfsc_parse_class_details__(netdev, details, &class);
4419 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
4420 tc_make_handle(1, 0xfffe), &class);
4425 hfsc_update_queue__(netdev, queue_id, &class);
/* class_delete callback: removes the kernel class and, on success, the
 * cached entry.  NOTE(review): the free(hc) line is elided here. */
4430 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
4434 struct hfsc_class *hc;
4436 hc = hfsc_class_cast__(queue);
4437 hfsc = hfsc_get__(netdev);
4439 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
4441 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* class_get_stats callback: pass-through query for stats only. */
4448 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
4449 struct netdev_queue_stats *stats)
4451 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
4452 tc_make_handle(1, 0xfffe), NULL, stats);
/* class_dump_stats callback: invokes 'cb' for one dumped class, translating
 * class minor N to queue id N-1 and skipping out-of-range classes. */
4456 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
4457 const struct ofpbuf *nlmsg,
4458 netdev_dump_queue_stats_cb *cb, void *aux)
4460 struct netdev_queue_stats stats;
4461 unsigned int handle, major, minor;
4464 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
4469 major = tc_get_major(handle);
4470 minor = tc_get_minor(handle);
4471 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
4472 (*cb)(minor - 1, &stats, aux);
/* vtable wiring HFSC into the generic tc layer. */
4477 static const struct tc_ops tc_ops_hfsc = {
4478 "hfsc", /* linux_name */
4479 "linux-hfsc", /* ovs_name */
4480 HFSC_N_QUEUES, /* n_queues */
4481 hfsc_tc_install, /* tc_install */
4482 hfsc_tc_load, /* tc_load */
4483 hfsc_tc_destroy, /* tc_destroy */
4484 hfsc_qdisc_get, /* qdisc_get */
4485 hfsc_qdisc_set, /* qdisc_set */
4486 hfsc_class_get, /* class_get */
4487 hfsc_class_set, /* class_set */
4488 hfsc_class_delete, /* class_delete */
4489 hfsc_class_get_stats, /* class_get_stats */
4490 hfsc_class_dump_stats /* class_dump_stats */
4493 /* "linux-default" traffic control class.
4495 * This class represents the default, unnamed Linux qdisc. It corresponds to
4496 * the "" (empty string) QoS type in the OVS database. */
/* Installs the shared, immutable "default qdisc" tc state on 'netdev_'.
 * A single static const object is reused for every netdev. */
4499 default_install__(struct netdev *netdev_)
4501 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4502 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
4504 /* Nothing but a tc class implementation is allowed to write to a tc. This
4505 * class never does that, so we can legitimately use a const tc object. */
4506 netdev->tc = CONST_CAST(struct tc *, &tc);
/* tc_install callback for the "" (default) QoS type: nothing to configure
 * in the kernel; just record the default state. */
4510 default_tc_install(struct netdev *netdev,
4511 const struct smap *details OVS_UNUSED)
4513 default_install__(netdev);
/* tc_load callback: same — no kernel state to read back. */
4518 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4520 default_install__(netdev);
/* vtable for the default qdisc: no linux name, and no per-class operations.
 * NOTE(review): a few middle entries are elided in this listing. */
4524 static const struct tc_ops tc_ops_default = {
4525 NULL, /* linux_name */
4530 NULL, /* tc_destroy */
4531 NULL, /* qdisc_get */
4532 NULL, /* qdisc_set */
4533 NULL, /* class_get */
4534 NULL, /* class_set */
4535 NULL, /* class_delete */
4536 NULL, /* class_get_stats */
4537 NULL /* class_dump_stats */
4540 /* "linux-other" traffic control class.
/* tc_load callback for "linux-other": used when an unrecognized qdisc is
 * already installed; records shared immutable state, same pattern as
 * default_install__(). */
4545 other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
4547 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4548 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
4550 /* Nothing but a tc class implementation is allowed to write to a tc. This
4551 * class never does that, so we can legitimately use a const tc object. */
4552 netdev->tc = CONST_CAST(struct tc *, &tc);
/* vtable for unrecognized qdiscs: read-only, no configuration possible. */
4556 static const struct tc_ops tc_ops_other = {
4557 NULL, /* linux_name */
4558 "linux-other", /* ovs_name */
4560 NULL, /* tc_install */
4562 NULL, /* tc_destroy */
4563 NULL, /* qdisc_get */
4564 NULL, /* qdisc_set */
4565 NULL, /* class_get */
4566 NULL, /* class_set */
4567 NULL, /* class_delete */
4568 NULL, /* class_get_stats */
4569 NULL /* class_dump_stats */
4572 /* Traffic control. */
4574 /* Number of kernel "tc" ticks per second. */
4575 static double ticks_per_s;
4577 /* Number of kernel "jiffies" per second. This is used for the purpose of
4578 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
4579 * one jiffy's worth of data.
4581 * There are two possibilities here:
4583 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
4584 * approximate range of 100 to 1024. That means that we really need to
4585 * make sure that the qdisc can buffer that much data.
4587 * - 'buffer_hz' is an absurdly large number. That means that the kernel
4588 * has finely granular timers and there's no need to fudge additional room
4589 * for buffers. (There's no extra effort needed to implement that: the
4590 * large 'buffer_hz' is used as a divisor, so practically any number will
4591 * come out as 0 in the division. Small integer results in the case of
4592 * really high dividends won't have any real effect anyhow.)
4594 static unsigned int buffer_hz;
4596 /* Returns tc handle 'major':'minor'. */
4598 tc_make_handle(unsigned int major, unsigned int minor)
4600 return TC_H_MAKE(major << 16, minor);
4603 /* Returns the major number from 'handle'. */
4605 tc_get_major(unsigned int handle)
4607 return TC_H_MAJ(handle) >> 16;
4610 /* Returns the minor number from 'handle'. */
4612 tc_get_minor(unsigned int handle)
4614 return TC_H_MIN(handle);
/* Builds an rtnetlink tc request of 'type' for 'netdev' into 'request',
 * returning a pointer to the tcmsg header for the caller to finish filling
 * in (tcm_handle / tcm_parent).
 * NOTE(review): the get_ifindex error-handling lines are elided here. */
4617 static struct tcmsg *
4618 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
4619 struct ofpbuf *request)
4621 struct tcmsg *tcmsg;
4625 error = get_ifindex(netdev, &ifindex);
4630 ofpbuf_init(request, 512);
4631 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
4632 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
4633 tcmsg->tcm_family = AF_UNSPEC;
4634 tcmsg->tcm_ifindex = ifindex;
4635 /* Caller should fill in tcmsg->tcm_handle. */
4636 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' on the NETLINK_ROUTE channel and uninitializes it;
 * the reply (if requested via 'replyp') is owned by the caller. */
4642 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
4644 int error = nl_transact(NETLINK_ROUTE, request, replyp);
4645 ofpbuf_uninit(request);
4649 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
4650 * policing configuration.
4652 * This function is equivalent to running the following when 'add' is true:
4653 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
4655 * This function is equivalent to running the following when 'add' is false:
4656 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
4658 * The configuration and stats may be seen with the following command:
4659 * /sbin/tc -s qdisc show dev <devname>
4661 * Returns 0 if successful, otherwise a positive errno value.
4664 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
4666 struct ofpbuf request;
4667 struct tcmsg *tcmsg;
4669 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
4670 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
4672 tcmsg = tc_make_request(netdev, type, flags, &request);
4676 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
4677 tcmsg->tcm_parent = TC_H_INGRESS;
4678 nl_msg_put_string(&request, TCA_KIND, "ingress");
4679 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
4681 error = tc_transact(&request, NULL);
4683 /* If we're deleting the qdisc, don't worry about some of the
4684 * error conditions. */
/* ENOENT/EINVAL on delete just means no ingress qdisc existed. */
4685 if (!add && (error == ENOENT || error == EINVAL)) {
4694 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
4697 * This function is equivalent to running:
4698 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
4699 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
4702 * The configuration and stats may be seen with the following command:
4703 * /sbin/tc -s filter show dev <devname> parent ffff:
4705 * Returns 0 if successful, otherwise a positive errno value.
4708 tc_add_policer(struct netdev *netdev,
4709 uint32_t kbits_rate, uint32_t kbits_burst)
4711 struct tc_police tc_police;
4712 struct ofpbuf request;
4713 struct tcmsg *tcmsg;
4714 size_t basic_offset;
4715 size_t police_offset;
4719 memset(&tc_police, 0, sizeof tc_police);
/* TC_POLICE_SHOT: drop packets that exceed the policer rate. */
4720 tc_police.action = TC_POLICE_SHOT;
4721 tc_police.mtu = mtu;
4722 tc_fill_rate(&tc_police.rate, ((uint64_t) kbits_rate * 1000)/8, mtu);
4724 /* The following appears wrong in two ways:
4726 * - tc_bytes_to_ticks() should take "bytes" as quantity for both of its
4727 * arguments (or at least consistently "bytes" as both or "bits" as
4728 * both), but this supplies bytes for the first argument and bits for the
4731 * - In networking a kilobit is usually 1000 bits but this uses 1024 bits.
4733 * However if you "fix" those problems then "tc filter show ..." shows
4734 * "125000b", meaning 125,000 bits, when OVS configures it for 1000 kbit ==
4735 * 1,000,000 bits, whereas this actually ends up doing the right thing from
4736 * tc's point of view. Whatever. */
4737 tc_police.burst = tc_bytes_to_ticks(
4738 tc_police.rate.rate, MIN(UINT32_MAX / 1024, kbits_burst) * 1024);
4740 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
4741 NLM_F_EXCL | NLM_F_CREATE, &request);
/* Attach to the ffff: ingress qdisc, priority 49, matching all protocols. */
4745 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
4746 tcmsg->tcm_info = tc_make_handle(49,
4747 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
4749 nl_msg_put_string(&request, TCA_KIND, "basic");
4750 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4751 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
4752 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
4753 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
4754 nl_msg_end_nested(&request, police_offset);
4755 nl_msg_end_nested(&request, basic_offset);
4757 error = tc_transact(&request, NULL);
/* NOTE(review): the function signature for the /proc/net/psched reader
 * (presumably "static void read_psched(void)") is elided from this listing;
 * the comment and body below belong to it.  It runs once and initializes
 * 'ticks_per_s' and 'buffer_hz' from the four hex psched parameters. */
4768 /* The values in psched are not individually very meaningful, but they are
4769 * important. The tables below show some values seen in the wild.
4773 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
4774 * (Before that, there are hints that it was 1000000000.)
4776 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
4780 * -----------------------------------
4781 * [1] 000c8000 000f4240 000f4240 00000064
4782 * [2] 000003e8 00000400 000f4240 3b9aca00
4783 * [3] 000003e8 00000400 000f4240 3b9aca00
4784 * [4] 000003e8 00000400 000f4240 00000064
4785 * [5] 000003e8 00000040 000f4240 3b9aca00
4786 * [6] 000003e8 00000040 000f4240 000000f9
4788 * a b c d ticks_per_s buffer_hz
4789 * ------- --------- ---------- ------------- ----------- -------------
4790 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
4791 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4792 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4793 * [4] 1,000 1,024 1,000,000 100 976,562 100
4794 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
4795 * [6] 1,000 64 1,000,000 249 15,625,000 249
4797 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
4798 * [2] 2.6.26-1-686-bigmem from Debian lenny
4799 * [3] 2.6.26-2-sparc64 from Debian lenny
4800 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
4801 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
4802 * [6] 2.6.34 from kernel.org on KVM
4804 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4805 static const char fn[] = "/proc/net/psched";
4806 unsigned int a, b, c, d;
/* Run the body at most once across all threads. */
4809 if (!ovsthread_once_start(&once)) {
4816 stream = fopen(fn, "r");
4818 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
4822 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
4823 VLOG_WARN("%s: read failed", fn);
4827 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
4831 VLOG_WARN("%s: invalid scheduler parameters", fn);
4835 ticks_per_s = (double) a * c / b;
4839 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
4842 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
4845 ovsthread_once_done(&once);
4848 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
4849 * rate of 'rate' bytes per second. */
4851 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
4854 return (rate * ticks) / ticks_per_s;
4857 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
4858 * rate of 'rate' bytes per second. */
4860 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
/* Guard against division by zero when 'rate' is 0. */
4863 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
4866 /* Returns the number of bytes that need to be reserved for qdisc buffering at
4867 * a transmission rate of 'rate' bytes per second. */
4869 tc_buffer_per_jiffy(unsigned int rate)
4872 return rate / buffer_hz;
4875 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
4876 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
4877 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
4878 * stores NULL into it if it is absent.
4880 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
4883 * Returns 0 if successful, otherwise a positive errno value. */
4885 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
4886 struct nlattr **options)
4888 static const struct nl_policy tca_policy[] = {
4889 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
4890 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
4892 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4894 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4895 tca_policy, ta, ARRAY_SIZE(ta))) {
4896 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
/* Null-guards for 'kind'/'options' are elided from this listing. */
4901 *kind = nl_attr_get_string(ta[TCA_KIND]);
4905 *options = ta[TCA_OPTIONS];
4920 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
4921 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
4922 * into '*options', and its queue statistics into '*stats'. Any of the output
4923 * arguments may be null.
4925 * Returns 0 if successful, otherwise a positive errno value. */
4927 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
4928 struct nlattr **options, struct netdev_queue_stats *stats)
4930 static const struct nl_policy tca_policy[] = {
4931 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
4932 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
4934 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4936 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4937 tca_policy, ta, ARRAY_SIZE(ta))) {
4938 VLOG_WARN_RL(&rl, "failed to parse class message");
/* The class handle lives in the tcmsg header, not in an attribute. */
4943 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
4944 *handlep = tc->tcm_handle;
4948 *options = ta[TCA_OPTIONS];
4952 const struct gnet_stats_queue *gsq;
4953 struct gnet_stats_basic gsb;
4955 static const struct nl_policy stats_policy[] = {
4956 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
4957 .min_len = sizeof gsb },
4958 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
4959 .min_len = sizeof *gsq },
4961 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
4963 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
4964 sa, ARRAY_SIZE(sa))) {
4965 VLOG_WARN_RL(&rl, "failed to parse class stats");
4969 /* Alignment issues screw up the length of struct gnet_stats_basic on
4970 * some arch/bitsize combinations. Newer versions of Linux have a
4971 * struct gnet_stats_basic_packed, but we can't depend on that. The
4972 * easiest thing to do is just to make a copy. */
4973 memset(&gsb, 0, sizeof gsb);
4974 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
4975 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
4976 stats->tx_bytes = gsb.bytes;
4977 stats->tx_packets = gsb.packets;
4979 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
4980 stats->tx_errors = gsq->drops;
/* On a stats-parse failure, 'stats' is zeroed rather than left stale. */
4990 memset(stats, 0, sizeof *stats);
4995 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
/* on 'netdev'; the NLM_F_ECHO reply (owned by the caller via '*replyp') is
 * logged with a rate-limited warning on failure. */
4998 tc_query_class(const struct netdev *netdev,
4999 unsigned int handle, unsigned int parent,
5000 struct ofpbuf **replyp)
5002 struct ofpbuf request;
5003 struct tcmsg *tcmsg;
5006 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
5010 tcmsg->tcm_handle = handle;
5011 tcmsg->tcm_parent = parent;
5013 error = tc_transact(&request, replyp);
5015 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
5016 netdev_get_name(netdev),
5017 tc_get_major(handle), tc_get_minor(handle),
5018 tc_get_major(parent), tc_get_minor(parent),
5019 ovs_strerror(error));
5024 /* Equivalent to "tc class del dev <name> handle <handle>". */
5026 tc_delete_class(const struct netdev *netdev, unsigned int handle)
5028 struct ofpbuf request;
5029 struct tcmsg *tcmsg;
5032 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
5036 tcmsg->tcm_handle = handle;
5037 tcmsg->tcm_parent = 0;
5039 error = tc_transact(&request, NULL);
5041 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
5042 netdev_get_name(netdev),
5043 tc_get_major(handle), tc_get_minor(handle),
5044 ovs_strerror(error));
5049 /* Equivalent to "tc qdisc del dev <name> root". */
5051 tc_del_qdisc(struct netdev_)
5053 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5054 struct ofpbuf request;
5055 struct tcmsg *tcmsg;
5058 tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
5062 tcmsg->tcm_handle = tc_make_handle(1, 0);
5063 tcmsg->tcm_parent = TC_H_ROOT;
5065 error = tc_transact(&request, NULL);
5066 if (error == EINVAL) {
5067 /* EINVAL probably means that the default qdisc was in use, in which
5068 * case we've accomplished our purpose. */
/* On success, tear down and forget any cached tc state. */
5071 if (!error && netdev->tc) {
5072 if (netdev->tc->ops->tc_destroy) {
5073 netdev->tc->ops->tc_destroy(netdev->tc);
/* Returns true if this kernel (>= 2.6.35) can safely handle RTM_GETQDISC
 * for built-in qdiscs; computed once from uname(). */
5081 getqdisc_is_safe(void)
5083 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5084 static bool safe = false;
5086 if (ovsthread_once_start(&once)) {
5087 struct utsname utsname;
5090 if (uname(&utsname) == -1) {
5091 VLOG_WARN("uname failed (%s)", ovs_strerror(errno));
5092 } else if (!ovs_scan(utsname.release, "%d.%d", &major, &minor)) {
5093 VLOG_WARN("uname reported bad OS release (%s)", utsname.release);
5094 } else if (major < 2 || (major == 2 && minor < 35)) {
5095 VLOG_INFO("disabling unsafe RTM_GETQDISC in Linux kernel %s",
/* NOTE(review): the "safe = true" path is elided from this listing. */
5100 ovsthread_once_done(&once);
5105 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
5106 * kernel to determine what they are. Returns 0 if successful, otherwise a
5107 * positive errno value. */
5109 tc_query_qdisc(const struct netdev *netdev_)
5111 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5112 struct ofpbuf request, *qdisc;
5113 const struct tc_ops *ops;
5114 struct tcmsg *tcmsg;
5122 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
5123 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
5124 * 2.6.35 without that fix backported to it.
5126 * To avoid the OOPS, we must not make a request that would attempt to dump
5127 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
5128 * few others. There are a few ways that I can see to do this, but most of
5129 * them seem to be racy (and if you lose the race the kernel OOPSes). The
5130 * technique chosen here is to assume that any non-default qdisc that we
5131 * create will have a class with handle 1:0. The built-in qdiscs only have
5132 * a class with handle 0:0.
5134 * On Linux 2.6.35+ we use the straightforward method because it allows us
5135 * to handle non-builtin qdiscs without handle 1:0 (e.g. codel). However,
5136 * in such a case we get no response at all from the kernel (!) if a
5137 * builtin qdisc is in use (which is later caught by "!error &&
5138 * !qdisc->size"). */
5139 tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
5143 tcmsg->tcm_handle = tc_make_handle(getqdisc_is_safe() ? 0 : 1, 0);
5144 tcmsg->tcm_parent = getqdisc_is_safe() ? TC_H_ROOT : 0;
5146 /* Figure out what tc class to instantiate. */
5147 error = tc_transact(&request, &qdisc);
5148 if (!error && qdisc->size) {
5151 error = tc_parse_qdisc(qdisc, &kind, NULL);
/* Unparseable reply: fall back to the read-only "linux-other" class. */
5153 ops = &tc_ops_other;
5155 ops = tc_lookup_linux_name(kind);
5157 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
5158 VLOG_DBG_RL(&rl2, "unknown qdisc \"%s\"", kind);
5160 ops = &tc_ops_other;
5163 } else if ((!error && !qdisc->size) || error == ENOENT) {
5164 /* Either it's a built-in qdisc, or (on Linux pre-2.6.35) it's a qdisc
5165 * set up by some other entity that doesn't have a handle 1:0. We will
5166 * assume that it's the system default qdisc. */
5167 ops = &tc_ops_default;
5170 /* Who knows? Maybe the device got deleted. */
5171 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
5172 netdev_get_name(netdev_), ovs_strerror(error));
5173 ops = &tc_ops_other;
5176 /* Instantiate it. */
5177 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
5178 ovs_assert((load_error == 0) == (netdev->tc != NULL));
5179 ofpbuf_delete(qdisc);
5181 return error ? error : load_error;
5184 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
5185 approximate the time to transmit packets of various lengths. For an MTU of
5186 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
5187 represents two possible packet lengths; for a MTU of 513 through 1024, four
5188 possible lengths; and so on.
5190 Returns, for the specified 'mtu', the number of bits that packet lengths
5191 need to be shifted right to fit within such a 256-entry table. */
/* NOTE(review): this listing elides the return type, the 'cell_log'
 * declaration, the 'if (!mtu)' guard line for the default below, the
 * loop body (presumably 'mtu >>= 1;') and the return — confirm against
 * upstream. */
5193 tc_calc_cell_log(unsigned int mtu)
/* Zero means "unknown MTU": default to the maximum Ethernet payload. */
5198 mtu = ETH_PAYLOAD_MAX;
/* Include L2 framing so the table covers full on-wire frame lengths. */
5200 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
/* Find the smallest shift that makes every length fit in 256 entries. */
5202 for (cell_log = 0; mtu >= 256; cell_log++) {
5209 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
/* (Comment continuation and the static-void line are elided here; the
 * elided tail presumably also assigns the 'Bps' rate itself — confirm.) */
5212 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
5214 memset(rate, 0, sizeof *rate);
/* cell_log sizes the 256-entry rtab so 'mtu'-byte packets fit; see
 * tc_calc_cell_log() above. */
5215 rate->cell_log = tc_calc_cell_log(mtu);
5216 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
5217 /* rate->cell_align = 0; */ /* distro headers. */
/* Minimum packet unit: bill every frame at least one minimum-size
 * Ethernet frame. */
5218 rate->mpu = ETH_TOTAL_MIN;
5222 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
5223 * attribute of the specified "type".
5225 * See tc_calc_cell_log() above for a description of "rtab"s. */
/* NOTE(review): return type, the 'rtab'/'i' declarations and closing
 * braces are elided in this listing. */
5227 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
/* Reserve the attribute payload in place, then fill it entry by entry. */
5232 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
5233 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
/* Entry i covers packet lengths up to (i + 1) << cell_log bytes. */
5234 unsigned packet_size = (i + 1) << rate->cell_log;
/* Never bill below the minimum packet unit. */
5235 if (packet_size < rate->mpu) {
5236 packet_size = rate->mpu;
/* Each entry is the transmission time, in kernel ticks, of a packet of
 * that size at 'rate'. */
5238 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
5242 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
5243 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
5244 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
/* (Comment tail and return-type line elided in this listing.) */
5247 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
/* The burst must cover at least one scheduler jiffy's worth of traffic
 * plus one MTU, or HTB cannot sustain the configured rate. */
5249 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
/* Convert the larger of the requested and minimum burst into ticks. */
5250 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
5253 /* Linux-only functions declared in netdev-linux.h */
5255 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
5256 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
/* Read-modify-write of the ethtool flags word, followed by a re-read to
 * detect drivers that silently ignore ETHTOOL_SFLAGS.
 * NOTE(review): the return type, 'new_flags'/'error'/'rl' declarations,
 * the early-return/error-exit lines between the ioctl calls, and the
 * closing braces are elided in this listing. */
5258 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
5259 const char *flag_name, bool enable)
5261 const char *netdev_name = netdev_get_name(netdev);
5262 struct ethtool_value evalue;
/* Step 1: read the current flags word. */
5266 COVERAGE_INC(netdev_get_ethtool);
5267 memset(&evalue, 0, sizeof evalue);
5268 error = netdev_linux_do_ethtool(netdev_name,
5269 (struct ethtool_cmd *)&evalue,
5270 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
/* Step 2: compute the desired word; skip the write if nothing changes. */
5275 COVERAGE_INC(netdev_set_ethtool);
5276 new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
5277 if (new_flags == evalue.data) {
5280 evalue.data = new_flags;
5281 error = netdev_linux_do_ethtool(netdev_name,
5282 (struct ethtool_cmd *)&evalue,
5283 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
/* Step 3: read back and verify, since some drivers accept ETHTOOL_SFLAGS
 * without actually applying the change. */
5288 COVERAGE_INC(netdev_get_ethtool);
5289 memset(&evalue, 0, sizeof evalue);
5290 error = netdev_linux_do_ethtool(netdev_name,
5291 (struct ethtool_cmd *)&evalue,
5292 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5297 if (new_flags != evalue.data) {
5298 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
5299 "device %s failed", enable ? "enable" : "disable",
5300 flag_name, netdev_name);
5307 /* Utility functions. */
5309 /* Copies 'src' into 'dst', performing format conversion in the process. */
/* Field-by-field copy from the kernel's 32-bit rtnl_link_stats layout
 * into OVS's netdev_stats; widening of 32-bit kernel counters happens
 * implicitly on assignment. (Return type and braces elided in this
 * listing.) */
5311 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
5312 const struct rtnl_link_stats *src)
5314 dst->rx_packets = src->rx_packets;
5315 dst->tx_packets = src->tx_packets;
5316 dst->rx_bytes = src->rx_bytes;
5317 dst->tx_bytes = src->tx_bytes;
5318 dst->rx_errors = src->rx_errors;
5319 dst->tx_errors = src->tx_errors;
5320 dst->rx_dropped = src->rx_dropped;
5321 dst->tx_dropped = src->tx_dropped;
5322 dst->multicast = src->multicast;
5323 dst->collisions = src->collisions;
5324 dst->rx_length_errors = src->rx_length_errors;
5325 dst->rx_over_errors = src->rx_over_errors;
5326 dst->rx_crc_errors = src->rx_crc_errors;
5327 dst->rx_frame_errors = src->rx_frame_errors;
5328 dst->rx_fifo_errors = src->rx_fifo_errors;
5329 dst->rx_missed_errors = src->rx_missed_errors;
5330 dst->tx_aborted_errors = src->tx_aborted_errors;
5331 dst->tx_carrier_errors = src->tx_carrier_errors;
5332 dst->tx_fifo_errors = src->tx_fifo_errors;
5333 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
5334 dst->tx_window_errors = src->tx_window_errors;
5337 /* Copies 'src' into 'dst', performing format conversion in the process. */
/* 64-bit counterpart of netdev_stats_from_rtnl_link_stats(); preferred
 * when the kernel supplies IFLA_STATS64 (see get_stats_via_netlink()
 * below). (Return type and braces elided in this listing.) */
5339 netdev_stats_from_rtnl_link_stats64(struct netdev_stats *dst,
5340 const struct rtnl_link_stats64 *src)
5342 dst->rx_packets = src->rx_packets;
5343 dst->tx_packets = src->tx_packets;
5344 dst->rx_bytes = src->rx_bytes;
5345 dst->tx_bytes = src->tx_bytes;
5346 dst->rx_errors = src->rx_errors;
5347 dst->tx_errors = src->tx_errors;
5348 dst->rx_dropped = src->rx_dropped;
5349 dst->tx_dropped = src->tx_dropped;
5350 dst->multicast = src->multicast;
5351 dst->collisions = src->collisions;
5352 dst->rx_length_errors = src->rx_length_errors;
5353 dst->rx_over_errors = src->rx_over_errors;
5354 dst->rx_crc_errors = src->rx_crc_errors;
5355 dst->rx_frame_errors = src->rx_frame_errors;
5356 dst->rx_fifo_errors = src->rx_fifo_errors;
5357 dst->rx_missed_errors = src->rx_missed_errors;
5358 dst->tx_aborted_errors = src->tx_aborted_errors;
5359 dst->tx_carrier_errors = src->tx_carrier_errors;
5360 dst->tx_fifo_errors = src->tx_fifo_errors;
5361 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
5362 dst->tx_window_errors = src->tx_window_errors;
/* Fetches interface statistics for 'netdev_' into 'stats' with an
 * RTM_GETLINK request keyed by interface name, preferring the 64-bit
 * IFLA_STATS64 attribute and falling back to the 32-bit IFLA_STATS.
 * NOTE(review): the return type, 'error' declaration, the early return
 * on nl_transact() failure, the else/error branches' braces, and the
 * final return are elided in this listing. */
5366 get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
5368 struct ofpbuf request;
5369 struct ofpbuf *reply;
/* Build an RTM_GETLINK request addressed by IFLA_IFNAME. */
5372 ofpbuf_init(&request, 0);
5373 nl_msg_put_nlmsghdr(&request,
5374 sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
5375 RTM_GETLINK, NLM_F_REQUEST);
5376 ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
5377 nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
5378 error = nl_transact(NETLINK_ROUTE, &request, &reply);
5379 ofpbuf_uninit(&request);
/* Skip the netlink and ifinfomsg headers to reach the attributes. */
5384 if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
/* Prefer 64-bit counters when the kernel provides them... */
5385 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS64);
5386 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats64)) {
5387 netdev_stats_from_rtnl_link_stats64(stats, nl_attr_get(a));
/* ...otherwise fall back to the legacy 32-bit counters. */
5390 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS);
5391 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
5392 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
5395 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
5400 VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");
5405 ofpbuf_delete(reply);
/* Reads 'dev''s interface flags (IFF_*) into '*flags' via SIOCGIFFLAGS.
 * Presumably returns 0 or a positive errno like its neighbors — return
 * type, 'ifr'/'error' declarations and the return line are elided in
 * this listing. */
5410 get_flags(const struct netdev *dev, unsigned int *flags)
5416 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
5418 *flags = ifr.ifr_flags;
/* Sets interface flags 'flags' on device 'name' via SIOCSIFFLAGS;
 * returns the helper's 0-or-errno result. (Return type, 'ifr'
 * declaration and braces elided in this listing.) */
5424 set_flags(const char *name, unsigned int flags)
5428 ifr.ifr_flags = flags;
5429 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
/* Looks up the kernel ifindex for 'netdev_name' via SIOCGIFINDEX.
 * Returns the (non-negative) ifindex on success; on failure the elided
 * error path presumably returns -errno, which get_ifindex() below
 * negates back. (Return type, 'ifr'/'error' declarations, memset and
 * the error-return line are elided in this listing.) */
5433 do_get_ifindex(const char *netdev_name)
5438 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5439 COVERAGE_INC(netdev_get_ifindex);
5441 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
5443 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
5444 netdev_name, ovs_strerror(error));
5447 return ifr.ifr_ifindex;
/* Caching wrapper around do_get_ifindex(): stores the ifindex (or the
 * lookup error) in the netdev on first use, marks VALID_IFINDEX, and
 * serves subsequent calls from the cache. Returns 0 and sets
 * '*ifindexp' on success, otherwise the cached positive errno.
 * (Return type and some braces elided in this listing.) */
5451 get_ifindex(const struct netdev *netdev_, int *ifindexp)
5453 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5455 if (!(netdev->cache_valid & VALID_IFINDEX)) {
5456 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
/* Negative result encodes -errno from do_get_ifindex(); cache it as a
 * positive errno with ifindex 0. */
5459 netdev->get_ifindex_error = -ifindex;
5460 netdev->ifindex = 0;
5462 netdev->get_ifindex_error = 0;
5463 netdev->ifindex = ifindex;
5465 netdev->cache_valid |= VALID_IFINDEX;
5468 *ifindexp = netdev->ifindex;
5469 return netdev->get_ifindex_error;
/* Reads the Ethernet hardware address of 'netdev_name' into '*ea' via
 * SIOCGIFHWADDR, rejecting address families other than AF_UNSPEC and
 * ARPHRD_ETHER. NOTE(review): the return type, 'ifr'/'error'/
 * 'hwaddr_family' declarations, the error returns (presumably 'error',
 * EINVAL for an unknown family, and 0 at the end) and closing braces
 * are elided in this listing. */
5473 get_etheraddr(const char *netdev_name, struct eth_addr *ea)
5479 memset(&ifr, 0, sizeof ifr);
5480 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5481 COVERAGE_INC(netdev_get_hwaddr);
5482 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
5484 /* ENODEV probably means that a vif disappeared asynchronously and
5485 * hasn't been removed from the database yet, so reduce the log level
5486 * to INFO for that case. */
5487 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
5488 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
5489 netdev_name, ovs_strerror(error));
/* Only plain Ethernet (or unspecified) address families are usable. */
5492 hwaddr_family = ifr.ifr_hwaddr.sa_family;
5493 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
5494 VLOG_INFO("%s device has unknown hardware address family %d",
5495 netdev_name, hwaddr_family);
5498 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the Ethernet hardware address of 'netdev_name' to 'mac' via
 * SIOCSIFHWADDR. Presumably returns 0 or a positive errno — the return
 * type, declarations, error-branch braces and return line are elided in
 * this listing. */
5503 set_etheraddr(const char *netdev_name, const struct eth_addr mac)
5508 memset(&ifr, 0, sizeof ifr);
5509 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
/* The kernel requires the address family to accompany the new address. */
5510 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
5511 memcpy(ifr.ifr_hwaddr.sa_data, &mac, ETH_ADDR_LEN);
5512 COVERAGE_INC(netdev_set_hwaddr);
5513 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
5515 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
5516 netdev_name, ovs_strerror(error));
/* Issues the ethtool command 'cmd' (ETHTOOL_G*/ETHTOOL_S*) on device
 * 'name', using 'ecmd' as the in/out command buffer; 'cmd_name' is only
 * used for log messages. EOPNOTSUPP is deliberately not logged because
 * many drivers simply lack the operation. NOTE(review): the return
 * type, declarations, the 'ecmd->cmd = cmd;' assignment, the success
 * branch and the return line are elided in this listing. */
5522 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
5523 int cmd, const char *cmd_name)
5528 memset(&ifr, 0, sizeof ifr);
5529 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
/* SIOCETHTOOL passes the command struct through ifr_data. */
5530 ifr.ifr_data = (caddr_t) ecmd;
5533 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
5535 if (error != EOPNOTSUPP) {
5536 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
5537 "failed: %s", cmd_name, name, ovs_strerror(error));
5539 /* The device doesn't support this operation. That's pretty
5540 * common, so there's no point in logging anything. */
5546 /* Returns an AF_PACKET raw socket or a negative errno value. */
5548 af_packet_sock(void)
5550 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5553 if (ovsthread_once_start(&once)) {
5554 sock = socket(AF_PACKET, SOCK_RAW, 0);
5556 int error = set_nonblocking(sock);
5563 VLOG_ERR("failed to create packet socket: %s",
5564 ovs_strerror(errno));
5566 ovsthread_once_done(&once);