2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2015 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
23 #include <arpa/inet.h>
25 #include <linux/filter.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <sys/types.h>
37 #include <sys/ioctl.h>
38 #include <sys/socket.h>
39 #include <sys/utsname.h>
40 #include <netpacket/packet.h>
42 #include <net/if_arp.h>
43 #include <net/if_packet.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
52 #include "dp-packet.h"
53 #include "dpif-netlink.h"
54 #include "dpif-netdev.h"
55 #include "openvswitch/dynamic-string.h"
56 #include "fatal-signal.h"
59 #include "netdev-provider.h"
60 #include "netdev-vport.h"
61 #include "netlink-notifier.h"
62 #include "netlink-socket.h"
64 #include "openvswitch/ofpbuf.h"
65 #include "openflow/openflow.h"
66 #include "ovs-atomic.h"
68 #include "poll-loop.h"
69 #include "rtnetlink.h"
71 #include "socket-util.h"
74 #include "unaligned.h"
75 #include "openvswitch/vlog.h"
77 VLOG_DEFINE_THIS_MODULE(netdev_linux);
79 COVERAGE_DEFINE(netdev_set_policing);
80 COVERAGE_DEFINE(netdev_arp_lookup);
81 COVERAGE_DEFINE(netdev_get_ifindex);
82 COVERAGE_DEFINE(netdev_get_hwaddr);
83 COVERAGE_DEFINE(netdev_set_hwaddr);
84 COVERAGE_DEFINE(netdev_get_ethtool);
85 COVERAGE_DEFINE(netdev_set_ethtool);
88 /* These were introduced in Linux 2.6.14, so they might be missing if we have
90 #ifndef ADVERTISED_Pause
91 #define ADVERTISED_Pause (1 << 13)
93 #ifndef ADVERTISED_Asym_Pause
94 #define ADVERTISED_Asym_Pause (1 << 14)
97 /* These were introduced in Linux 2.6.24, so they might be missing if we
98 * have old headers. */
99 #ifndef ETHTOOL_GFLAGS
100 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
102 #ifndef ETHTOOL_SFLAGS
103 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
106 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
109 #define TC_RTAB_SIZE 1024
112 /* Linux 2.6.21 introduced struct tpacket_auxdata.
113 * Linux 2.6.27 added the tp_vlan_tci member.
114 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
115 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
116 * TP_STATUS_VLAN_TPID_VALID.
118 * With all this churn it's easiest to unconditionally define a replacement
119 * structure that has everything we want.
121 #ifndef PACKET_AUXDATA
122 #define PACKET_AUXDATA 8
124 #ifndef TP_STATUS_VLAN_VALID
125 #define TP_STATUS_VLAN_VALID (1 << 4)
127 #ifndef TP_STATUS_VLAN_TPID_VALID
128 #define TP_STATUS_VLAN_TPID_VALID (1 << 6)
130 #undef tpacket_auxdata
131 #define tpacket_auxdata rpl_tpacket_auxdata
132 struct tpacket_auxdata {
138 uint16_t tp_vlan_tci;
139 uint16_t tp_vlan_tpid;
142 /* Linux 2.6.27 introduced ethtool_cmd_speed
144 * To avoid revisiting problems reported with using configure to detect
145 * compatibility (see report at
146 * http://openvswitch.org/pipermail/dev/2014-October/047978.html)
147 * unconditionally replace ethtool_cmd_speed. */
#define ethtool_cmd_speed rpl_ethtool_cmd_speed

/* Returns the link speed, in Mbps, encoded in 'ep'.  The value is split
 * across two 16-bit fields for historical reasons: 'speed' holds the low
 * 16 bits and 'speed_hi' the high 16 bits. */
static inline uint32_t rpl_ethtool_cmd_speed(const struct ethtool_cmd *ep)
{
    return ep->speed | (ep->speed_hi << 16);
}
154 /* Linux 2.6.30 introduced supported and advertised flags for
155 * 1G base KX, and 10G base KX4, KR and R. */
156 #ifndef SUPPORTED_1000baseKX_Full
157 #define SUPPORTED_1000baseKX_Full (1 << 17)
158 #define SUPPORTED_10000baseKX4_Full (1 << 18)
159 #define SUPPORTED_10000baseKR_Full (1 << 19)
160 #define SUPPORTED_10000baseR_FEC (1 << 20)
161 #define ADVERTISED_1000baseKX_Full (1 << 17)
162 #define ADVERTISED_10000baseKX4_Full (1 << 18)
163 #define ADVERTISED_10000baseKR_Full (1 << 19)
164 #define ADVERTISED_10000baseR_FEC (1 << 20)
167 /* Linux 3.5 introduced supported and advertised flags for
168 * 40G base KR4, CR4, SR4 and LR4. */
169 #ifndef SUPPORTED_40000baseKR4_Full
170 #define SUPPORTED_40000baseKR4_Full (1 << 23)
171 #define SUPPORTED_40000baseCR4_Full (1 << 24)
172 #define SUPPORTED_40000baseSR4_Full (1 << 25)
173 #define SUPPORTED_40000baseLR4_Full (1 << 26)
174 #define ADVERTISED_40000baseKR4_Full (1 << 23)
175 #define ADVERTISED_40000baseCR4_Full (1 << 24)
176 #define ADVERTISED_40000baseSR4_Full (1 << 25)
177 #define ADVERTISED_40000baseLR4_Full (1 << 26)
180 /* Linux 2.6.35 introduced IFLA_STATS64 and rtnl_link_stats64.
182 * Tests for rtnl_link_stats64 don't seem to consistently work, e.g. on
183 * 2.6.32-431.29.2.el6.x86_64 (see report at
184 * http://openvswitch.org/pipermail/dev/2014-October/047978.html). Maybe
185 * if_link.h is not self-contained on those kernels. It is easiest to
186 * unconditionally define a replacement. */
188 #define IFLA_STATS64 23
190 #define rtnl_link_stats64 rpl_rtnl_link_stats64
191 struct rtnl_link_stats64 {
203 uint64_t rx_length_errors;
204 uint64_t rx_over_errors;
205 uint64_t rx_crc_errors;
206 uint64_t rx_frame_errors;
207 uint64_t rx_fifo_errors;
208 uint64_t rx_missed_errors;
210 uint64_t tx_aborted_errors;
211 uint64_t tx_carrier_errors;
212 uint64_t tx_fifo_errors;
213 uint64_t tx_heartbeat_errors;
214 uint64_t tx_window_errors;
216 uint64_t rx_compressed;
217 uint64_t tx_compressed;
221 VALID_IFINDEX = 1 << 0,
222 VALID_ETHERADDR = 1 << 1,
225 VALID_POLICING = 1 << 4,
226 VALID_VPORT_STAT_ERROR = 1 << 5,
227 VALID_DRVINFO = 1 << 6,
228 VALID_FEATURES = 1 << 7,
231 /* Traffic control. */
233 /* An instance of a traffic control class. Always associated with a particular
236 * Each TC implementation subclasses this with whatever additional data it
239 const struct tc_ops *ops;
240 struct hmap queues; /* Contains "struct tc_queue"s.
241 * Read by generic TC layer.
242 * Written only by TC implementation. */
245 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
247 /* One traffic control queue.
249 * Each TC implementation subclasses this with whatever additional data it
252 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
253 unsigned int queue_id; /* OpenFlow queue ID. */
254 long long int created; /* Time queue was created, in msecs. */
257 /* A particular kind of traffic control. Each implementation generally maps to
258 * one particular Linux qdisc class.
260 * The functions below return 0 if successful or a positive errno value on
261 * failure, except where otherwise noted. All of them must be provided, except
262 * where otherwise noted. */
264 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
265 * This is null for tc_ops_default and tc_ops_other, for which there are no
266 * appropriate values. */
267 const char *linux_name;
269 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
270 const char *ovs_name;
272 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
273 * queues. The queues are numbered 0 through n_queues - 1. */
274 unsigned int n_queues;
276 /* Called to install this TC class on 'netdev'. The implementation should
277 * make the Netlink calls required to set up 'netdev' with the right qdisc
278 * and configure it according to 'details'. The implementation may assume
279 * that the current qdisc is the default; that is, there is no need for it
280 * to delete the current qdisc before installing itself.
282 * The contents of 'details' should be documented as valid for 'ovs_name'
283 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
284 * (which is built as ovs-vswitchd.conf.db(8)).
286 * This function must return 0 if and only if it sets 'netdev->tc' to an
287 * initialized 'struct tc'.
289 * (This function is null for tc_ops_other, which cannot be installed. For
290 * other TC classes it should always be nonnull.) */
291 int (*tc_install)(struct netdev *netdev, const struct smap *details);
293 /* Called when the netdev code determines (through a Netlink query) that
294 * this TC class's qdisc is installed on 'netdev', but we didn't install
295 * it ourselves and so don't know any of the details.
297 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
298 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
299 * implementation should parse the other attributes of 'nlmsg' as
300 * necessary to determine its configuration. If necessary it should also
301 * use Netlink queries to determine the configuration of queues on
304 * This function must return 0 if and only if it sets 'netdev->tc' to an
305 * initialized 'struct tc'. */
306 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
308 /* Destroys the data structures allocated by the implementation as part of
309 * 'tc'. (This includes destroying 'tc->queues' by calling
312 * The implementation should not need to perform any Netlink calls. If
313 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
314 * (But it may not be desirable.)
316 * This function may be null if 'tc' is trivial. */
317 void (*tc_destroy)(struct tc *tc);
319 /* Retrieves details of 'netdev->tc' configuration into 'details'.
321 * The implementation should not need to perform any Netlink calls, because
322 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
323 * cached the configuration.
325 * The contents of 'details' should be documented as valid for 'ovs_name'
326 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
327 * (which is built as ovs-vswitchd.conf.db(8)).
329 * This function may be null if 'tc' is not configurable.
331 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
333 /* Reconfigures 'netdev->tc' according to 'details', performing any
334 * required Netlink calls to complete the reconfiguration.
336 * The contents of 'details' should be documented as valid for 'ovs_name'
337 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
338 * (which is built as ovs-vswitchd.conf.db(8)).
340 * This function may be null if 'tc' is not configurable.
342 int (*qdisc_set)(struct netdev *, const struct smap *details);
344 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
345 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
347 * The contents of 'details' should be documented as valid for 'ovs_name'
348 * in the "other_config" column in the "Queue" table in
349 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
351 * The implementation should not need to perform any Netlink calls, because
352 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
353 * cached the queue configuration.
355 * This function may be null if 'tc' does not have queues ('n_queues' is
357 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
358 struct smap *details);
360 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
361 * 'details', perfoming any required Netlink calls to complete the
362 * reconfiguration. The caller ensures that 'queue_id' is less than
365 * The contents of 'details' should be documented as valid for 'ovs_name'
366 * in the "other_config" column in the "Queue" table in
367 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
369 * This function may be null if 'tc' does not have queues or its queues are
370 * not configurable. */
371 int (*class_set)(struct netdev *, unsigned int queue_id,
372 const struct smap *details);
374 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
375 * tc_queue's within 'netdev->tc->queues'.
377 * This function may be null if 'tc' does not have queues or its queues
378 * cannot be deleted. */
379 int (*class_delete)(struct netdev *, struct tc_queue *queue);
381 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
382 * 'struct tc_queue's within 'netdev->tc->queues'.
384 * On success, initializes '*stats'.
386 * This function may be null if 'tc' does not have queues or if it cannot
387 * report queue statistics. */
388 int (*class_get_stats)(const struct netdev *netdev,
389 const struct tc_queue *queue,
390 struct netdev_queue_stats *stats);
392 /* Extracts queue stats from 'nlmsg', which is a response to a
393 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
395 * This function may be null if 'tc' does not have queues or if it cannot
396 * report queue statistics. */
397 int (*class_dump_stats)(const struct netdev *netdev,
398 const struct ofpbuf *nlmsg,
399 netdev_dump_queue_stats_cb *cb, void *aux);
403 tc_init(struct tc *tc, const struct tc_ops *ops)
406 hmap_init(&tc->queues);
410 tc_destroy(struct tc *tc)
412 hmap_destroy(&tc->queues);
415 static const struct tc_ops tc_ops_htb;
416 static const struct tc_ops tc_ops_hfsc;
417 static const struct tc_ops tc_ops_codel;
418 static const struct tc_ops tc_ops_fqcodel;
419 static const struct tc_ops tc_ops_sfq;
420 static const struct tc_ops tc_ops_default;
421 static const struct tc_ops tc_ops_other;
423 static const struct tc_ops *const tcs[] = {
424 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
425 &tc_ops_hfsc, /* Hierarchical fair service curve. */
426 &tc_ops_codel, /* Controlled delay */
427 &tc_ops_fqcodel, /* Fair queue controlled delay */
428 &tc_ops_sfq, /* Stochastic fair queueing */
429 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
430 &tc_ops_other, /* Some other qdisc. */
434 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
435 static unsigned int tc_get_major(unsigned int handle);
436 static unsigned int tc_get_minor(unsigned int handle);
438 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
439 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
440 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
442 static struct tcmsg *tc_make_request(const struct netdev *, int type,
443 unsigned int flags, struct ofpbuf *);
444 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
445 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
446 static int tc_add_policer(struct netdev *,
447 uint32_t kbits_rate, uint32_t kbits_burst);
449 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
450 struct nlattr **options);
451 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
452 struct nlattr **options,
453 struct netdev_queue_stats *);
454 static int tc_query_class(const struct netdev *,
455 unsigned int handle, unsigned int parent,
456 struct ofpbuf **replyp);
457 static int tc_delete_class(const struct netdev *, unsigned int handle);
459 static int tc_del_qdisc(struct netdev *netdev);
460 static int tc_query_qdisc(const struct netdev *netdev);
462 static int tc_calc_cell_log(unsigned int mtu);
463 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
464 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
465 const struct tc_ratespec *rate);
466 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
468 struct netdev_linux {
471 /* Protects all members below. */
472 struct ovs_mutex mutex;
474 unsigned int cache_valid;
476 bool miimon; /* Link status of last poll. */
477 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
478 struct timer miimon_timer;
480 /* The following are figured out "on demand" only. They are only valid
481 * when the corresponding VALID_* bit in 'cache_valid' is set. */
483 struct eth_addr etheraddr;
485 unsigned int ifi_flags;
486 long long int carrier_resets;
487 uint32_t kbits_rate; /* Policing data. */
488 uint32_t kbits_burst;
489 int vport_stats_error; /* Cached error code from vport_get_stats().
490 0 or an errno value. */
491 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
492 int ether_addr_error; /* Cached error code from set/get etheraddr. */
493 int netdev_policing_error; /* Cached error code from set policing. */
494 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
495 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
497 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
498 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
499 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
501 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
504 /* For devices of class netdev_tap_class only. */
508 struct netdev_rxq_linux {
509 struct netdev_rxq up;
514 /* This is set pretty low because we probably won't learn anything from the
515 * additional log messages. */
516 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
518 /* Polling miimon status for all ports causes performance degradation when
519 * handling a large number of ports. If there are no devices using miimon, then
520 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
522 * Readers do not depend on this variable synchronizing with the related
523 * changes in the device miimon status, so we can use atomic_count. */
524 static atomic_count miimon_cnt = ATOMIC_COUNT_INIT(0);
526 static void netdev_linux_run(void);
528 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
529 int cmd, const char *cmd_name);
530 static int get_flags(const struct netdev *, unsigned int *flags);
531 static int set_flags(const char *, unsigned int flags);
532 static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
533 enum netdev_flags on, enum netdev_flags *old_flagsp)
534 OVS_REQUIRES(netdev->mutex);
535 static int do_get_ifindex(const char *netdev_name);
536 static int get_ifindex(const struct netdev *, int *ifindexp);
537 static int do_set_addr(struct netdev *netdev,
538 int ioctl_nr, const char *ioctl_name,
539 struct in_addr addr);
540 static int get_etheraddr(const char *netdev_name, struct eth_addr *ea);
541 static int set_etheraddr(const char *netdev_name, const struct eth_addr);
542 static int get_stats_via_netlink(const struct netdev *, struct netdev_stats *);
543 static int af_packet_sock(void);
544 static bool netdev_linux_miimon_enabled(void);
545 static void netdev_linux_miimon_run(void);
546 static void netdev_linux_miimon_wait(void);
547 static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup);
550 is_netdev_linux_class(const struct netdev_class *netdev_class)
552 return netdev_class->run == netdev_linux_run;
556 is_tap_netdev(const struct netdev *netdev)
558 return netdev_get_class(netdev) == &netdev_tap_class;
561 static struct netdev_linux *
562 netdev_linux_cast(const struct netdev *netdev)
564 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
566 return CONTAINER_OF(netdev, struct netdev_linux, up);
569 static struct netdev_rxq_linux *
570 netdev_rxq_linux_cast(const struct netdev_rxq *rx)
572 ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
573 return CONTAINER_OF(rx, struct netdev_rxq_linux, up);
576 static void netdev_linux_update(struct netdev_linux *netdev,
577 const struct rtnetlink_change *)
578 OVS_REQUIRES(netdev->mutex);
579 static void netdev_linux_changed(struct netdev_linux *netdev,
580 unsigned int ifi_flags, unsigned int mask)
581 OVS_REQUIRES(netdev->mutex);
583 /* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK,
584 * RTNLGRP_IPV4_IFADDR and RTNLGRP_IPV6_IFADDR changes, or NULL
585 * if no such socket could be created. */
/* NOTE(review): truncated fragment -- several interior lines (error
 * handling between the visible calls) are missing from this extraction.
 * Comments below describe only what is visible. */
586 static struct nl_sock *
587 netdev_linux_notify_sock(void)
/* One-time initialization: create the NETLINK_ROUTE socket, then join
 * every rtnetlink multicast group of interest; on a join failure the
 * socket is destroyed (so later callers see NULL). */
589 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
590 static struct nl_sock *sock;
591 unsigned int mcgroups[] = {RTNLGRP_LINK, RTNLGRP_IPV4_IFADDR,
592 RTNLGRP_IPV6_IFADDR, RTNLGRP_IPV6_IFINFO};
594 if (ovsthread_once_start(&once)) {
597 error = nl_sock_create(NETLINK_ROUTE, &sock);
601 for (i = 0; i < ARRAY_SIZE(mcgroups); i++) {
602 error = nl_sock_join_mcgroup(sock, mcgroups[i]);
604 nl_sock_destroy(sock);
610 ovsthread_once_done(&once);
617 netdev_linux_miimon_enabled(void)
619 return atomic_count_get(&miimon_cnt) > 0;
/* The netdev 'run' callback shared by the Linux netdev classes: runs
 * miimon polling if enabled, then drains the rtnetlink notification
 * socket and applies each link/address change to the matching netdev.
 * NOTE(review): truncated fragment -- loop structure, buffer declarations
 * and several error-path lines are missing from this extraction. */
623 netdev_linux_run(void)
625 struct nl_sock *sock;
628 if (netdev_linux_miimon_enabled()) {
629 netdev_linux_miimon_run();
632 sock = netdev_linux_notify_sock();
638 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
639 uint64_t buf_stub[4096 / 8];
642 ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
643 error = nl_sock_recv(sock, &buf, false);
645 struct rtnetlink_change change;
647 if (rtnetlink_parse(&buf, &change)) {
648 struct netdev *netdev_ = NULL;
649 char dev_name[IFNAMSIZ];
/* The kernel does not always supply an interface name; fall back to
 * resolving it from the interface index. */
651 if (!change.ifname) {
652 change.ifname = if_indextoname(change.if_index, dev_name);
656 netdev_ = netdev_from_name(change.ifname);
658 if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
659 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
661 ovs_mutex_lock(&netdev->mutex);
662 netdev_linux_update(netdev, &change);
663 ovs_mutex_unlock(&netdev->mutex);
665 netdev_close(netdev_);
/* ENOBUFS: notifications were dropped, so we do not know which devices
 * changed; refresh the flags of every Linux netdev. */
667 } else if (error == ENOBUFS) {
668 struct shash device_shash;
669 struct shash_node *node;
673 shash_init(&device_shash);
674 netdev_get_devices(&netdev_linux_class, &device_shash);
675 SHASH_FOR_EACH (node, &device_shash) {
676 struct netdev *netdev_ = node->data;
677 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
680 ovs_mutex_lock(&netdev->mutex);
681 get_flags(netdev_, &flags);
682 netdev_linux_changed(netdev, flags, 0);
683 ovs_mutex_unlock(&netdev->mutex);
685 netdev_close(netdev_);
687 shash_destroy(&device_shash);
688 } else if (error != EAGAIN) {
689 VLOG_WARN_RL(&rl, "error reading or parsing netlink (%s)",
690 ovs_strerror(error));
/* The netdev 'wait' callback: registers with the poll loop so the thread
 * wakes up when miimon polling is due or when the rtnetlink notification
 * socket becomes readable. */
static void
netdev_linux_wait(void)
{
    struct nl_sock *sock;

    if (netdev_linux_miimon_enabled()) {
        netdev_linux_miimon_wait();
    }
    sock = netdev_linux_notify_sock();
    if (sock) {
        nl_sock_wait(sock, POLLIN);
    }
}
711 netdev_linux_changed(struct netdev_linux *dev,
712 unsigned int ifi_flags, unsigned int mask)
713 OVS_REQUIRES(dev->mutex)
715 netdev_change_seq_changed(&dev->up);
717 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
718 dev->carrier_resets++;
720 dev->ifi_flags = ifi_flags;
722 dev->cache_valid &= mask;
723 if (!(mask & VALID_IN)) {
724 netdev_get_addrs_list_flush();
729 netdev_linux_update(struct netdev_linux *dev,
730 const struct rtnetlink_change *change)
731 OVS_REQUIRES(dev->mutex)
733 if (rtnetlink_type_is_rtnlgrp_link(change->nlmsg_type)){
734 if (change->nlmsg_type == RTM_NEWLINK) {
735 /* Keep drv-info, and ip addresses. */
736 netdev_linux_changed(dev, change->ifi_flags,
737 VALID_DRVINFO | VALID_IN);
739 /* Update netdev from rtnl-change msg. */
741 dev->mtu = change->mtu;
742 dev->cache_valid |= VALID_MTU;
743 dev->netdev_mtu_error = 0;
746 if (!eth_addr_is_zero(change->mac)) {
747 dev->etheraddr = change->mac;
748 dev->cache_valid |= VALID_ETHERADDR;
749 dev->ether_addr_error = 0;
752 dev->ifindex = change->if_index;
753 dev->cache_valid |= VALID_IFINDEX;
754 dev->get_ifindex_error = 0;
756 netdev_linux_changed(dev, change->ifi_flags, 0);
758 } else if (rtnetlink_type_is_rtnlgrp_addr(change->nlmsg_type)) {
759 /* Invalidates in4, in6. */
760 netdev_linux_changed(dev, dev->ifi_flags, ~VALID_IN);
766 static struct netdev *
767 netdev_linux_alloc(void)
769 struct netdev_linux *netdev = xzalloc(sizeof *netdev);
774 netdev_linux_common_construct(struct netdev_linux *netdev)
776 ovs_mutex_init(&netdev->mutex);
779 /* Creates system and internal devices. */
/* netdev 'construct' callback for "system" and "internal" devices:
 * initializes common state, then probes the kernel interface flags.
 * NOTE(review): truncated fragment -- the return statements and
 * remaining error handling are missing from this extraction. */
781 netdev_linux_construct(struct netdev *netdev_)
783 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
786 netdev_linux_common_construct(netdev);
/* get_flags() doubles as an existence check for the kernel device. */
788 error = get_flags(&netdev->up, &netdev->ifi_flags);
789 if (error == ENODEV) {
790 if (netdev->up.netdev_class != &netdev_internal_class) {
791 /* The device does not exist, so don't allow it to be opened. */
794 /* "Internal" netdevs have to be created as netdev objects before
795 * they exist in the kernel, because creating them in the kernel
796 * happens by passing a netdev object to dpif_port_add().
797 * Therefore, ignore the error. */
804 /* For most types of netdevs we open the device for each call of
805 * netdev_open(). However, this is not the case with tap devices,
806 * since it is only possible to open the device once. In this
807 * situation we share a single file descriptor, and consequently
808 * buffers, across all readers. Therefore once data is read it will
809 * be unavailable to other reads for tap devices. */
/* netdev 'construct' callback for tap devices: opens /dev/net/tun,
 * creates the named tap interface via TUNSETIFF, and makes its fd
 * non-blocking.  NOTE(review): truncated fragment -- error returns and
 * the cleanup path between the visible lines are missing. */
811 netdev_linux_construct_tap(struct netdev *netdev_)
813 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
814 static const char tap_dev[] = "/dev/net/tun";
815 const char *name = netdev_->name;
819 netdev_linux_common_construct(netdev);
821 /* Open tap device. */
822 netdev->tap_fd = open(tap_dev, O_RDWR);
823 if (netdev->tap_fd < 0) {
825 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
/* IFF_NO_PI: no extra packet-information header on reads/writes. */
829 /* Create tap device. */
830 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
831 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
832 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
833 VLOG_WARN("%s: creating tap device failed: %s", name,
834 ovs_strerror(errno));
839 /* Make non-blocking. */
840 error = set_nonblocking(netdev->tap_fd);
/* Error path: close the fd opened above. */
848 close(netdev->tap_fd);
853 netdev_linux_destruct(struct netdev *netdev_)
855 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
857 if (netdev->tc && netdev->tc->ops->tc_destroy) {
858 netdev->tc->ops->tc_destroy(netdev->tc);
861 if (netdev_get_class(netdev_) == &netdev_tap_class
862 && netdev->tap_fd >= 0)
864 close(netdev->tap_fd);
867 if (netdev->miimon_interval > 0) {
868 atomic_count_dec(&miimon_cnt);
871 ovs_mutex_destroy(&netdev->mutex);
/* netdev 'dealloc' callback: frees the memory obtained in
 * netdev_linux_alloc(). */
static void
netdev_linux_dealloc(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    free(netdev);
}
881 static struct netdev_rxq *
882 netdev_linux_rxq_alloc(void)
884 struct netdev_rxq_linux *rx = xzalloc(sizeof *rx);
/* rxq 'construct' callback.  For tap devices, reuses the shared tap fd.
 * Otherwise, creates an AF_PACKET raw socket bound to the device, enables
 * PACKET_AUXDATA (for VLAN reconstruction) and attaches a BPF filter that
 * accepts only inbound packets.  NOTE(review): truncated fragment -- the
 * if/else skeleton, error returns and 'goto error' cleanup lines are
 * missing from this extraction. */
889 netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
891 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
892 struct netdev *netdev_ = rx->up.netdev;
893 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
896 ovs_mutex_lock(&netdev->mutex);
897 rx->is_tap = is_tap_netdev(netdev_);
899 rx->fd = netdev->tap_fd;
901 struct sockaddr_ll sll;
/* Classic BPF program that drops outgoing packets (packet type
 * PACKET_OUTGOING == 4) and accepts everything else. */
903 /* Result of tcpdump -dd inbound */
904 static const struct sock_filter filt[] = {
905 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
906 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
907 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
908 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
910 static const struct sock_fprog fprog = {
911 ARRAY_SIZE(filt), (struct sock_filter *) filt
914 /* Create file descriptor. */
915 rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
918 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
/* PACKET_AUXDATA delivers VLAN TCI/TPID via cmsg on recvmsg(). */
923 if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) {
925 VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
926 netdev_get_name(netdev_), ovs_strerror(error));
930 /* Set non-blocking mode. */
931 error = set_nonblocking(rx->fd);
936 /* Get ethernet device index. */
937 error = get_ifindex(&netdev->up, &ifindex);
942 /* Bind to specific ethernet device. */
943 memset(&sll, 0, sizeof sll);
944 sll.sll_family = AF_PACKET;
945 sll.sll_ifindex = ifindex;
946 sll.sll_protocol = htons(ETH_P_ALL);
947 if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
949 VLOG_ERR("%s: failed to bind raw socket (%s)",
950 netdev_get_name(netdev_), ovs_strerror(error));
954 /* Filter for only inbound packets. */
955 error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
959 VLOG_ERR("%s: failed to attach filter (%s)",
960 netdev_get_name(netdev_), ovs_strerror(error));
/* Success path unlock; the second unlock below is on the error path. */
964 ovs_mutex_unlock(&netdev->mutex);
972 ovs_mutex_unlock(&netdev->mutex);
977 netdev_linux_rxq_destruct(struct netdev_rxq *rxq_)
979 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
/* rxq 'dealloc' callback: frees the memory obtained in
 * netdev_linux_rxq_alloc(). */
static void
netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_)
{
    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);

    free(rx);
}
995 auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux)
997 if (aux->tp_status & TP_STATUS_VLAN_TPID_VALID) {
998 return htons(aux->tp_vlan_tpid);
1000 return htons(ETH_TYPE_VLAN);
/* Returns true if 'aux' indicates the received packet carried a VLAN tag:
 * either the TCI itself is nonzero, or the kernel set TP_STATUS_VLAN_VALID
 * (needed, on Linux 3.0+, to distinguish a genuine all-zero TCI). */
static inline bool
auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
{
    return aux->tp_vlan_tci || aux->tp_status & TP_STATUS_VLAN_VALID;
}
/* Receives one packet from AF_PACKET socket 'fd' into 'buffer', then walks
 * the PACKET_AUXDATA control messages to re-insert the VLAN tag that the
 * kernel stripped.  NOTE(review): truncated fragment -- variable
 * declarations (msgh, iov, size, retval), the do/if skeleton and several
 * return statements are missing from this extraction. */
1011 netdev_linux_rxq_recv_sock(int fd, struct dp_packet *buffer)
1016 struct cmsghdr *cmsg;
/* Union guarantees alignment of the control-message buffer. */
1018 struct cmsghdr cmsg;
1019 char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
1023 /* Reserve headroom for a single VLAN tag */
1024 dp_packet_reserve(buffer, VLAN_HEADER_LEN);
1025 size = dp_packet_tailroom(buffer);
1027 iov.iov_base = dp_packet_data(buffer);
1029 msgh.msg_name = NULL;
1030 msgh.msg_namelen = 0;
1031 msgh.msg_iov = &iov;
1032 msgh.msg_iovlen = 1;
1033 msgh.msg_control = &cmsg_buffer;
1034 msgh.msg_controllen = sizeof cmsg_buffer;
/* MSG_TRUNC makes recvmsg report the full packet length even if it was
 * larger than the buffer, so oversize packets can be detected. */
1038 retval = recvmsg(fd, &msgh, MSG_TRUNC);
1039 } while (retval < 0 && errno == EINTR);
1043 } else if (retval > size) {
1047 dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);
1049 for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg; cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
1050 const struct tpacket_auxdata *aux;
1052 if (cmsg->cmsg_level != SOL_PACKET
1053 || cmsg->cmsg_type != PACKET_AUXDATA
1054 || cmsg->cmsg_len < CMSG_LEN(sizeof(struct tpacket_auxdata))) {
1058 aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg));
1059 if (auxdata_has_vlan_tci(aux)) {
1060 if (retval < ETH_HEADER_LEN) {
/* Re-insert the stripped VLAN tag into the packet data. */
1064 eth_push_vlan(buffer, auxdata_to_vlan_tpid(aux),
1065 htons(aux->tp_vlan_tci));
/* Receives one packet from tap file descriptor 'fd' into 'buffer',
 * retrying on EINTR.  Returns 0 if successful, otherwise a positive errno
 * value. */
static int
netdev_linux_rxq_recv_tap(int fd, struct dp_packet *buffer)
{
    ssize_t retval;
    size_t size = dp_packet_tailroom(buffer);

    do {
        retval = read(fd, dp_packet_data(buffer), size);
    } while (retval < 0 && errno == EINTR);

    if (retval < 0) {
        return errno;
    }

    dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);
    return 0;
}
/* rxq 'recv' callback: allocates a buffer sized for the device MTU plus a
 * VLAN+Ethernet header, receives one packet via the tap- or socket-specific
 * helper, and on success pads and hands the packet to the caller.
 * NOTE(review): truncated fragment -- the parameter list continuation,
 * error/success skeleton and return statements are missing. */
1092 netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet **packets,
1095 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1096 struct netdev *netdev = rx->up.netdev;
1097 struct dp_packet *buffer;
/* If the MTU cannot be determined, fall back to the Ethernet maximum. */
1101 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
1102 mtu = ETH_PAYLOAD_MAX;
1105 buffer = dp_packet_new_with_headroom(VLAN_ETH_HEADER_LEN + mtu,
1106 DP_NETDEV_HEADROOM);
1107 retval = (rx->is_tap
1108 ? netdev_linux_rxq_recv_tap(rx->fd, buffer)
1109 : netdev_linux_rxq_recv_sock(rx->fd, buffer));
/* EAGAIN (nothing to read) and EMSGSIZE (oversized frame) are expected
 * and not worth logging. */
1112 if (retval != EAGAIN && retval != EMSGSIZE) {
1113 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
1114 netdev_rxq_get_name(rxq_), ovs_strerror(errno));
1116 dp_packet_delete(buffer);
1118 dp_packet_pad(buffer);
1119 packets[0] = buffer;
1127 netdev_linux_rxq_wait(struct netdev_rxq *rxq_)
1129 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1130 poll_fd_wait(rx->fd, POLLIN);
1134 netdev_linux_rxq_drain(struct netdev_rxq *rxq_)
1136 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1139 int error = af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_), &ifr,
1140 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
1144 drain_fd(rx->fd, ifr.ifr_qlen);
1147 return drain_rcvbuf(rx->fd);
1151 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
1152 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
1153 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
1154 * the packet is too big or too small to transmit on the device.
1156 * The caller retains ownership of 'buffer' in all cases.
1158 * The kernel maintains a packet transmission queue, so the caller is not
1159 * expected to do additional queuing of packets. */
/* Transmits the 'cnt' packets in 'pkts' on 'netdev_' (see the contract in
 * the comment block above).  Non-tap devices are sent to via a shared
 * AF_PACKET socket; tap devices are written to directly via their fd. */
1161 netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED,
1162 struct dp_packet **pkts, int cnt, bool may_steal)
1167 /* 'i' is incremented only if there's no error */
1168 for (i = 0; i < cnt;) {
1169 const void *data = dp_packet_data(pkts[i]);
1170 size_t size = dp_packet_size(pkts[i]);
1173 if (!is_tap_netdev(netdev_)) {
1174 /* Use our AF_PACKET socket to send to this device. */
1175 struct sockaddr_ll sll;
1181 sock = af_packet_sock();
1186 ifindex = netdev_get_ifindex(netdev_);
1191 /* We don't bother setting most fields in sockaddr_ll because the
1192 * kernel ignores them for SOCK_RAW. */
1193 memset(&sll, 0, sizeof sll);
1194 sll.sll_family = AF_PACKET;
1195 sll.sll_ifindex = ifindex;
1197 iov.iov_base = CONST_CAST(void *, data);
1200 msg.msg_name = &sll;
1201 msg.msg_namelen = sizeof sll;
1204 msg.msg_control = NULL;
1205 msg.msg_controllen = 0;
1208 retval = sendmsg(sock, &msg, 0);
1210 /* Use the tap fd to send to this device. This is essential for
1211 * tap devices, because packets sent to a tap device with an
1212 * AF_PACKET socket will loop back to be *received* again on the
1213 * tap device. This doesn't occur on other interface types
1214 * because we attach a socket filter to the rx socket. */
1215 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1217 retval = write(netdev->tap_fd, data, size);
1221 /* The Linux AF_PACKET implementation never blocks waiting for room
1222 * for packets, instead returning ENOBUFS. Translate this into
1223 * EAGAIN for the caller. */
1224 error = errno == ENOBUFS ? EAGAIN : errno;
1225 if (error == EINTR) {
1226 /* continue without incrementing 'i', i.e. retry this packet */
/* NOTE(review): 'retval' (signed) is compared against 'size' (size_t) and
 * printed with PRIuSIZE here — confirm 'retval' is non-negative on this
 * path; lines between are elided in this listing. */
1230 } else if (retval != size) {
1231 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE" bytes"
1232 " of %"PRIuSIZE") on %s", retval, size,
1233 netdev_get_name(netdev_));
1238 /* Process the next packet in the batch */
/* Per the netdev contract, consume the packets when 'may_steal' applies
 * (surrounding condition elided in this listing). */
1243 for (i = 0; i < cnt; i++) {
1244 dp_packet_delete(pkts[i]);
1248 if (error && error != EAGAIN) {
1249 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
1250 netdev_get_name(netdev_), ovs_strerror(error));
1257 /* Registers with the poll loop to wake up from the next call to poll_block()
1258 * when the packet transmission queue has sufficient room to transmit a packet
1259 * with netdev_send().
1261 * The kernel maintains a packet transmission queue, so the client is not
1262 * expected to do additional queuing of packets. Thus, this function is
1263 * unlikely to ever be used. It is included for completeness. */
1265 netdev_linux_send_wait(struct netdev *netdev, int qid OVS_UNUSED)
1267 if (is_tap_netdev(netdev)) {
1268 /* TAP device always accepts packets, so wake up immediately. */
1269 poll_immediate_wake();
1273 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1274 * otherwise a positive errno value. */
/* Sets 'netdev_''s MAC address to 'mac', caching the result.  If the cached
 * address already equals 'mac' (or a previous attempt failed), returns the
 * cached status without touching the device. */
1276 netdev_linux_set_etheraddr(struct netdev *netdev_, const struct eth_addr mac)
1278 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1279 enum netdev_flags old_flags = 0;
1282 ovs_mutex_lock(&netdev->mutex);
1284 if (netdev->cache_valid & VALID_ETHERADDR) {
1285 error = netdev->ether_addr_error;
1286 if (error || eth_addr_equals(netdev->etheraddr, mac)) {
/* Cache is stale relative to the requested address; invalidate it. */
1289 netdev->cache_valid &= ~VALID_ETHERADDR;
1292 /* Tap devices must be brought down before setting the address. */
1293 if (is_tap_netdev(netdev_)) {
1294 update_flags(netdev, NETDEV_UP, 0, &old_flags);
1296 error = set_etheraddr(netdev_get_name(netdev_), mac);
/* Cache the outcome even for ENODEV so a vanished device is not re-probed. */
1297 if (!error || error == ENODEV) {
1298 netdev->ether_addr_error = error;
1299 netdev->cache_valid |= VALID_ETHERADDR;
1301 netdev->etheraddr = mac;
/* Restore the tap device to UP if we brought it down above. */
1305 if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
1306 update_flags(netdev, 0, NETDEV_UP, &old_flags);
1310 ovs_mutex_unlock(&netdev->mutex);
1314 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
/* Copies 'netdev_''s MAC address into '*mac', querying the device only on a
 * cache miss and caching both the address and any error. */
1316 netdev_linux_get_etheraddr(const struct netdev *netdev_, struct eth_addr *mac)
1318 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1321 ovs_mutex_lock(&netdev->mutex);
1322 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1323 netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
1324 &netdev->etheraddr);
1325 netdev->cache_valid |= VALID_ETHERADDR;
1328 error = netdev->ether_addr_error;
1330 *mac = netdev->etheraddr;
1332 ovs_mutex_unlock(&netdev->mutex);
/* Internal MTU getter: fills '*mtup' from the cache, issuing SIOCGIFMTU
 * on a cache miss.  Caller must hold 'netdev->mutex'. */
1338 netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
1342 if (!(netdev->cache_valid & VALID_MTU)) {
1345 netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
1346 netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
1347 netdev->mtu = ifr.ifr_mtu;
1348 netdev->cache_valid |= VALID_MTU;
1351 error = netdev->netdev_mtu_error;
1353 *mtup = netdev->mtu;
1359 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1360 * in bytes, not including the hardware header; thus, this is typically 1500
1361 * bytes for Ethernet devices. */
/* Public MTU getter: locking wrapper around netdev_linux_get_mtu__(). */
1363 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1365 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1368 ovs_mutex_lock(&netdev->mutex);
1369 error = netdev_linux_get_mtu__(netdev, mtup);
1370 ovs_mutex_unlock(&netdev->mutex);
1375 /* Sets the maximum size of transmitted (MTU) for given device using linux
1376 * networking ioctl interface.
/* Sets the device MTU via SIOCSIFMTU, short-circuiting when the cached MTU
 * already matches 'mtu' (or a previous attempt failed). */
1379 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1381 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1385 ovs_mutex_lock(&netdev->mutex);
1386 if (netdev->cache_valid & VALID_MTU) {
1387 error = netdev->netdev_mtu_error;
1388 if (error || netdev->mtu == mtu) {
/* Requested value differs from cache; invalidate before the ioctl. */
1391 netdev->cache_valid &= ~VALID_MTU;
1394 error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
1395 SIOCSIFMTU, "SIOCSIFMTU");
/* Cache the outcome even for ENODEV so a vanished device is not re-probed. */
1396 if (!error || error == ENODEV) {
1397 netdev->netdev_mtu_error = error;
1398 netdev->mtu = ifr.ifr_mtu;
1399 netdev->cache_valid |= VALID_MTU;
1402 ovs_mutex_unlock(&netdev->mutex);
1406 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1407 * On failure, returns a negative errno value. */
1409 netdev_linux_get_ifindex(const struct netdev *netdev_)
1411 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1414 ovs_mutex_lock(&netdev->mutex);
1415 error = get_ifindex(netdev_, &ifindex);
1416 ovs_mutex_unlock(&netdev->mutex);
/* Encode failure as a negative errno, per the contract in the comment
 * above this function. */
1418 return error ? -error : ifindex;
/* Reports link state in '*carrier': from MII monitoring results when miimon
 * is enabled, otherwise from the interface's IFF_RUNNING flag. */
1422 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1424 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1426 ovs_mutex_lock(&netdev->mutex);
1427 if (netdev->miimon_interval > 0) {
1428 *carrier = netdev->miimon;
1430 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1432 ovs_mutex_unlock(&netdev->mutex);
/* Returns the number of carrier transitions recorded for 'netdev_'. */
1437 static long long int
1438 netdev_linux_get_carrier_resets(const struct netdev *netdev_)
1440 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1441 long long int carrier_resets;
1443 ovs_mutex_lock(&netdev->mutex);
1444 carrier_resets = netdev->carrier_resets;
1445 ovs_mutex_unlock(&netdev->mutex);
1447 return carrier_resets;
/* Issues MII ioctl 'cmd' ('cmd_name' used for logging) on device 'name',
 * marshalling 'data' in and out through ifr_data.  Returns 0 or a positive
 * errno value. */
1451 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1452 struct mii_ioctl_data *data)
1457 memset(&ifr, 0, sizeof ifr);
/* The MII data is copied into/out of the ifreq's data area rather than
 * passed by pointer. */
1458 memcpy(&ifr.ifr_data, data, sizeof *data);
1459 error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
1460 memcpy(data, &ifr.ifr_data, sizeof *data);
/* Determines link status for 'name' via MII (SIOCGMIIPHY + SIOCGMIIREG,
 * checking BMSR_LSTATUS), falling back to ethtool's ETHTOOL_GLINK when the
 * MII ioctls are unsupported.  Stores the result in '*miimon'. */
1466 netdev_linux_get_miimon(const char *name, bool *miimon)
1468 struct mii_ioctl_data data;
1473 memset(&data, 0, sizeof data);
1474 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1476 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1477 data.reg_num = MII_BMSR;
1478 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
/* Basic Mode Status Register: link-up bit. */
1482 *miimon = !!(data.val_out & BMSR_LSTATUS);
1484 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
/* MII unsupported: fall back to the generic ethtool link query. */
1487 struct ethtool_cmd ecmd;
1489 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1492 COVERAGE_INC(netdev_get_ethtool);
1493 memset(&ecmd, 0, sizeof ecmd);
1494 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1497 struct ethtool_value eval;
/* ETHTOOL_GLINK returns a struct ethtool_value overlaid on ecmd. */
1499 memcpy(&eval, &ecmd, sizeof eval);
1500 *miimon = !!eval.data;
1502 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
/* Enables or disables MII monitoring on 'netdev_'.  A positive 'interval'
 * is clamped to at least 100 ms; zero (or negative) disables monitoring.
 * Maintains the global 'miimon_cnt' count of monitored devices. */
1510 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1511 long long int interval)
1513 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1515 ovs_mutex_lock(&netdev->mutex);
1516 interval = interval > 0 ? MAX(interval, 100) : 0;
1517 if (netdev->miimon_interval != interval) {
/* Track the enable/disable transitions in the global counter. */
1518 if (interval && !netdev->miimon_interval) {
1519 atomic_count_inc(&miimon_cnt);
1520 } else if (!interval && netdev->miimon_interval) {
1521 atomic_count_dec(&miimon_cnt);
1524 netdev->miimon_interval = interval;
/* Force an immediate poll under the new interval. */
1525 timer_set_expired(&netdev->miimon_timer);
1527 ovs_mutex_unlock(&netdev->mutex);
/* Polls MII link status for every netdev-linux device whose miimon timer
 * has expired, recording state changes and re-arming each timer. */
1533 netdev_linux_miimon_run(void)
1535 struct shash device_shash;
1536 struct shash_node *node;
1538 shash_init(&device_shash);
1539 netdev_get_devices(&netdev_linux_class, &device_shash);
1540 SHASH_FOR_EACH (node, &device_shash) {
1541 struct netdev *netdev = node->data;
1542 struct netdev_linux *dev = netdev_linux_cast(netdev);
1545 ovs_mutex_lock(&dev->mutex);
1546 if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
1547 netdev_linux_get_miimon(dev->up.name, &miimon);
1548 if (miimon != dev->miimon) {
1549 dev->miimon = miimon;
/* Notify watchers that the link state changed. */
1550 netdev_linux_changed(dev, dev->ifi_flags, 0);
1553 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1555 ovs_mutex_unlock(&dev->mutex);
/* netdev_get_devices() takes a reference on each device; drop it. */
1556 netdev_close(netdev);
1559 shash_destroy(&device_shash);
/* Registers a wakeup for the earliest pending miimon timer across all
 * netdev-linux devices. */
1563 netdev_linux_miimon_wait(void)
1565 struct shash device_shash;
1566 struct shash_node *node;
1568 shash_init(&device_shash);
1569 netdev_get_devices(&netdev_linux_class, &device_shash);
1570 SHASH_FOR_EACH (node, &device_shash) {
1571 struct netdev *netdev = node->data;
1572 struct netdev_linux *dev = netdev_linux_cast(netdev);
1574 ovs_mutex_lock(&dev->mutex);
1575 if (dev->miimon_interval > 0) {
1576 timer_wait(&dev->miimon_timer);
1578 ovs_mutex_unlock(&dev->mutex);
/* Drop the reference taken by netdev_get_devices(). */
1579 netdev_close(netdev);
1581 shash_destroy(&device_shash);
/* Exchanges the values pointed to by 'a' and 'b' (body elided in this
 * listing). */
1585 swap_uint64(uint64_t *a, uint64_t *b)
1592 /* Copies 'src' into 'dst', performing format conversion in the process.
1594 * 'src' is allowed to be misaligned. */
1596 netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1597 const struct ovs_vport_stats *src)
/* get_32aligned_u64() safely reads 64-bit fields from the possibly
 * misaligned netlink-sourced 'src'. */
1599 dst->rx_packets = get_32aligned_u64(&src->rx_packets);
1600 dst->tx_packets = get_32aligned_u64(&src->tx_packets);
1601 dst->rx_bytes = get_32aligned_u64(&src->rx_bytes);
1602 dst->tx_bytes = get_32aligned_u64(&src->tx_bytes);
1603 dst->rx_errors = get_32aligned_u64(&src->rx_errors);
1604 dst->tx_errors = get_32aligned_u64(&src->tx_errors);
1605 dst->rx_dropped = get_32aligned_u64(&src->rx_dropped);
1606 dst->tx_dropped = get_32aligned_u64(&src->tx_dropped);
/* The vport layer does not track these fine-grained error counters, so
 * zero them rather than leave garbage. */
1608 dst->collisions = 0;
1609 dst->rx_length_errors = 0;
1610 dst->rx_over_errors = 0;
1611 dst->rx_crc_errors = 0;
1612 dst->rx_frame_errors = 0;
1613 dst->rx_fifo_errors = 0;
1614 dst->rx_missed_errors = 0;
1615 dst->tx_aborted_errors = 0;
1616 dst->tx_carrier_errors = 0;
1617 dst->tx_fifo_errors = 0;
1618 dst->tx_heartbeat_errors = 0;
1619 dst->tx_window_errors = 0;
/* Queries the datapath vport layer for 'netdev''s stats and converts them
 * into '*stats'.  Error/cleanup lines are elided in this listing. */
1623 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1625 struct dpif_netlink_vport reply;
1629 error = dpif_netlink_vport_get(netdev_get_name(netdev), &reply, &buf);
/* A reply without a stats attribute means the vport has no counters. */
1632 } else if (!reply.stats) {
1637 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
/* Caching wrapper around get_stats_via_vport__(): retries only while the
 * previous attempt succeeded or no attempt has been cached yet. */
1645 get_stats_via_vport(const struct netdev *netdev_,
1646 struct netdev_stats *stats)
1648 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1650 if (!netdev->vport_stats_error ||
1651 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1654 error = get_stats_via_vport__(netdev_, stats);
/* ENOENT/ENODEV simply mean the device is not a vport; stay quiet. */
1655 if (error && error != ENOENT && error != ENODEV) {
1656 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1658 netdev_get_name(netdev_), ovs_strerror(error));
1660 netdev->vport_stats_error = error;
1661 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
1665 /* Retrieves current device stats for 'netdev-linux', merging vport-layer
 * counters with kernel netdev counters obtained via netlink. */
1667 netdev_linux_get_stats(const struct netdev *netdev_,
1668 struct netdev_stats *stats)
1670 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1671 struct netdev_stats dev_stats;
1674 ovs_mutex_lock(&netdev->mutex);
1675 get_stats_via_vport(netdev_, stats);
1676 error = get_stats_via_netlink(netdev_, &dev_stats);
/* Branches below pick a source: netlink failed -> keep vport stats; vport
 * failed -> use netlink stats wholesale; otherwise merge the two. */
1678 if (!netdev->vport_stats_error) {
1681 } else if (netdev->vport_stats_error) {
1682 /* stats not available from OVS then use netdev stats. */
1685 /* Use kernel netdev's packet and byte counts since vport's counters
1686 * do not reflect packet counts on the wire when GSO, TSO or GRO are
1688 stats->rx_packets = dev_stats.rx_packets;
1689 stats->rx_bytes = dev_stats.rx_bytes;
1690 stats->tx_packets = dev_stats.tx_packets;
1691 stats->tx_bytes = dev_stats.tx_bytes;
/* Error and drop counters from both layers are additive. */
1693 stats->rx_errors += dev_stats.rx_errors;
1694 stats->tx_errors += dev_stats.tx_errors;
1695 stats->rx_dropped += dev_stats.rx_dropped;
1696 stats->tx_dropped += dev_stats.tx_dropped;
1697 stats->multicast += dev_stats.multicast;
1698 stats->collisions += dev_stats.collisions;
1699 stats->rx_length_errors += dev_stats.rx_length_errors;
1700 stats->rx_over_errors += dev_stats.rx_over_errors;
1701 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1702 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1703 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1704 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1705 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1706 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1707 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1708 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1709 stats->tx_window_errors += dev_stats.tx_window_errors;
1711 ovs_mutex_unlock(&netdev->mutex);
1716 /* Retrieves current device stats for 'netdev-tap' netdev or
1717 * netdev-internal. */
1719 netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
1721 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1722 struct netdev_stats dev_stats;
1725 ovs_mutex_lock(&netdev->mutex);
1726 get_stats_via_vport(netdev_, stats);
1727 error = get_stats_via_netlink(netdev_, &dev_stats);
1729 if (!netdev->vport_stats_error) {
1732 } else if (netdev->vport_stats_error) {
1733 /* Transmit and receive stats will appear to be swapped relative to the
1734 * other ports since we are the one sending the data, not a remote
1735 * computer. For consistency, we swap them back here. This does not
1736 * apply if we are getting stats from the vport layer because it always
1737 * tracks stats from the perspective of the switch. */
1740 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1741 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1742 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1743 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
/* The fine-grained error counters have no meaningful host-side
 * interpretation for a tap device; zero them. */
1744 stats->rx_length_errors = 0;
1745 stats->rx_over_errors = 0;
1746 stats->rx_crc_errors = 0;
1747 stats->rx_frame_errors = 0;
1748 stats->rx_fifo_errors = 0;
1749 stats->rx_missed_errors = 0;
1750 stats->tx_aborted_errors = 0;
1751 stats->tx_carrier_errors = 0;
1752 stats->tx_fifo_errors = 0;
1753 stats->tx_heartbeat_errors = 0;
1754 stats->tx_window_errors = 0;
1756 /* Use kernel netdev's packet and byte counts since vport counters
1757 * do not reflect packet counts on the wire when GSO, TSO or GRO
1759 stats->rx_packets = dev_stats.tx_packets;
1760 stats->rx_bytes = dev_stats.tx_bytes;
1761 stats->tx_packets = dev_stats.rx_packets;
1762 stats->tx_bytes = dev_stats.rx_bytes;
/* Note the deliberate rx/tx cross-assignment: the kernel's view of a tap
 * device is mirrored relative to the switch's view. */
1764 stats->rx_dropped += dev_stats.tx_dropped;
1765 stats->tx_dropped += dev_stats.rx_dropped;
1767 stats->rx_errors += dev_stats.tx_errors;
1768 stats->tx_errors += dev_stats.rx_errors;
1770 stats->multicast += dev_stats.multicast;
1771 stats->collisions += dev_stats.collisions;
1773 ovs_mutex_unlock(&netdev->mutex);
/* Internal-device stats come solely from the vport layer. */
1779 netdev_internal_get_stats(const struct netdev *netdev_,
1780 struct netdev_stats *stats)
1782 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1785 ovs_mutex_lock(&netdev->mutex);
1786 get_stats_via_vport(netdev_, stats);
1787 error = netdev->vport_stats_error;
1788 ovs_mutex_unlock(&netdev->mutex);
/* Reads link features for 'netdev' via ETHTOOL_GSET and translates them into
 * the NETDEV_F_* bitmaps cached in 'netdev->supported', '->advertised' and
 * '->current'.  No-op when VALID_FEATURES is already cached.  Caller must
 * hold 'netdev->mutex'. */
1794 netdev_linux_read_features(struct netdev_linux *netdev)
1796 struct ethtool_cmd ecmd;
1800 if (netdev->cache_valid & VALID_FEATURES) {
1804 COVERAGE_INC(netdev_get_ethtool);
1805 memset(&ecmd, 0, sizeof ecmd);
1806 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
1807 ETHTOOL_GSET, "ETHTOOL_GSET");
1812 /* Supported features. */
1813 netdev->supported = 0;
1814 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1815 netdev->supported |= NETDEV_F_10MB_HD;
1817 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1818 netdev->supported |= NETDEV_F_10MB_FD;
1820 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1821 netdev->supported |= NETDEV_F_100MB_HD;
1823 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1824 netdev->supported |= NETDEV_F_100MB_FD;
1826 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1827 netdev->supported |= NETDEV_F_1GB_HD;
1829 if ((ecmd.supported & SUPPORTED_1000baseT_Full) ||
1830 (ecmd.supported & SUPPORTED_1000baseKX_Full)) {
1831 netdev->supported |= NETDEV_F_1GB_FD;
1833 if ((ecmd.supported & SUPPORTED_10000baseT_Full) ||
1834 (ecmd.supported & SUPPORTED_10000baseKX4_Full) ||
1835 (ecmd.supported & SUPPORTED_10000baseKR_Full) ||
1836 (ecmd.supported & SUPPORTED_10000baseR_FEC)) {
1837 netdev->supported |= NETDEV_F_10GB_FD;
1839 if ((ecmd.supported & SUPPORTED_40000baseKR4_Full) ||
1840 (ecmd.supported & SUPPORTED_40000baseCR4_Full) ||
1841 (ecmd.supported & SUPPORTED_40000baseSR4_Full) ||
1842 (ecmd.supported & SUPPORTED_40000baseLR4_Full)) {
1843 netdev->supported |= NETDEV_F_40GB_FD;
1845 if (ecmd.supported & SUPPORTED_TP) {
1846 netdev->supported |= NETDEV_F_COPPER;
1848 if (ecmd.supported & SUPPORTED_FIBRE) {
1849 netdev->supported |= NETDEV_F_FIBER;
1851 if (ecmd.supported & SUPPORTED_Autoneg) {
1852 netdev->supported |= NETDEV_F_AUTONEG;
1854 if (ecmd.supported & SUPPORTED_Pause) {
1855 netdev->supported |= NETDEV_F_PAUSE;
1857 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1858 netdev->supported |= NETDEV_F_PAUSE_ASYM;
1861 /* Advertised features. */
1862 netdev->advertised = 0;
1863 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1864 netdev->advertised |= NETDEV_F_10MB_HD;
1866 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1867 netdev->advertised |= NETDEV_F_10MB_FD;
1869 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1870 netdev->advertised |= NETDEV_F_100MB_HD;
1872 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1873 netdev->advertised |= NETDEV_F_100MB_FD;
1875 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1876 netdev->advertised |= NETDEV_F_1GB_HD;
1878 if ((ecmd.advertising & ADVERTISED_1000baseT_Full) ||
1879 (ecmd.advertising & ADVERTISED_1000baseKX_Full)) {
1880 netdev->advertised |= NETDEV_F_1GB_FD;
1882 if ((ecmd.advertising & ADVERTISED_10000baseT_Full) ||
1883 (ecmd.advertising & ADVERTISED_10000baseKX4_Full) ||
1884 (ecmd.advertising & ADVERTISED_10000baseKR_Full) ||
1885 (ecmd.advertising & ADVERTISED_10000baseR_FEC) {
1886 netdev->advertised |= NETDEV_F_10GB_FD;
1888 if ((ecmd.advertising & ADVERTISED_40000baseKR4_Full) ||
1889 (ecmd.advertising & ADVERTISED_40000baseCR4_Full) ||
1890 (ecmd.advertising & ADVERTISED_40000baseSR4_Full) ||
1891 (ecmd.advertising & ADVERTISED_40000baseLR4_Full)) {
1892 netdev->advertised |= NETDEV_F_40GB_FD;
1894 if (ecmd.advertising & ADVERTISED_TP) {
1895 netdev->advertised |= NETDEV_F_COPPER;
1897 if (ecmd.advertising & ADVERTISED_FIBRE) {
1898 netdev->advertised |= NETDEV_F_FIBER;
1900 if (ecmd.advertising & ADVERTISED_Autoneg) {
1901 netdev->advertised |= NETDEV_F_AUTONEG;
1903 if (ecmd.advertising & ADVERTISED_Pause) {
1904 netdev->advertised |= NETDEV_F_PAUSE;
1906 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1907 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
1910 /* Current settings. */
1911 speed = ethtool_cmd_speed(&ecmd);
1912 if (speed == SPEED_10) {
1913 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1914 } else if (speed == SPEED_100) {
1915 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1916 } else if (speed == SPEED_1000) {
1917 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1918 } else if (speed == SPEED_10000) {
1919 netdev->current = NETDEV_F_10GB_FD;
/* 40G/100G/1T have no SPEED_* constant on older kernel headers, hence the
 * raw numeric comparisons. */
1920 } else if (speed == 40000) {
1921 netdev->current = NETDEV_F_40GB_FD;
1922 } else if (speed == 100000) {
1923 netdev->current = NETDEV_F_100GB_FD;
1924 } else if (speed == 1000000) {
1925 netdev->current = NETDEV_F_1TB_FD;
1927 netdev->current = 0;
1930 if (ecmd.port == PORT_TP) {
1931 netdev->current |= NETDEV_F_COPPER;
1932 } else if (ecmd.port == PORT_FIBRE) {
1933 netdev->current |= NETDEV_F_FIBER;
1937 netdev->current |= NETDEV_F_AUTONEG;
/* Cache both the feature bitmaps and the error status together. */
1941 netdev->cache_valid |= VALID_FEATURES;
1942 netdev->get_features_error = error;
1945 /* Stores the features supported by 'netdev' into of '*current', '*advertised',
1946 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
1947 * Returns 0 if successful, otherwise a positive errno value. */
1949 netdev_linux_get_features(const struct netdev *netdev_,
1950 enum netdev_features *current,
1951 enum netdev_features *advertised,
1952 enum netdev_features *supported,
1953 enum netdev_features *peer)
1955 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1958 ovs_mutex_lock(&netdev->mutex);
/* Populate (or reuse) the cached feature bitmaps. */
1959 netdev_linux_read_features(netdev);
1960 if (!netdev->get_features_error) {
1961 *current = netdev->current;
1962 *advertised = netdev->advertised;
1963 *supported = netdev->supported;
/* Peer features are not obtainable via ethtool. */
1964 *peer = 0; /* XXX */
1966 error = netdev->get_features_error;
1967 ovs_mutex_unlock(&netdev->mutex);
1972 /* Set the features advertised by 'netdev' to 'advertise'. */
/* Reads the current ethtool settings, rewrites the 'advertising' mask from
 * the NETDEV_F_* bits in 'advertise', and writes them back via ETHTOOL_SSET.
 * Error-handling lines between the two ethtool calls are elided in this
 * listing. */
1974 netdev_linux_set_advertisements(struct netdev *netdev_,
1975 enum netdev_features advertise)
1977 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1978 struct ethtool_cmd ecmd;
1981 ovs_mutex_lock(&netdev->mutex);
1983 COVERAGE_INC(netdev_get_ethtool);
1984 memset(&ecmd, 0, sizeof ecmd);
/* GSET first so that fields other than 'advertising' are preserved. */
1985 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1986 ETHTOOL_GSET, "ETHTOOL_GSET");
1991 ecmd.advertising = 0;
1992 if (advertise & NETDEV_F_10MB_HD) {
1993 ecmd.advertising |= ADVERTISED_10baseT_Half;
1995 if (advertise & NETDEV_F_10MB_FD) {
1996 ecmd.advertising |= ADVERTISED_10baseT_Full;
1998 if (advertise & NETDEV_F_100MB_HD) {
1999 ecmd.advertising |= ADVERTISED_100baseT_Half;
2001 if (advertise & NETDEV_F_100MB_FD) {
2002 ecmd.advertising |= ADVERTISED_100baseT_Full;
2004 if (advertise & NETDEV_F_1GB_HD) {
2005 ecmd.advertising |= ADVERTISED_1000baseT_Half;
2007 if (advertise & NETDEV_F_1GB_FD) {
2008 ecmd.advertising |= ADVERTISED_1000baseT_Full;
2010 if (advertise & NETDEV_F_10GB_FD) {
2011 ecmd.advertising |= ADVERTISED_10000baseT_Full;
2013 if (advertise & NETDEV_F_COPPER) {
2014 ecmd.advertising |= ADVERTISED_TP;
2016 if (advertise & NETDEV_F_FIBER) {
2017 ecmd.advertising |= ADVERTISED_FIBRE;
2019 if (advertise & NETDEV_F_AUTONEG) {
2020 ecmd.advertising |= ADVERTISED_Autoneg;
2022 if (advertise & NETDEV_F_PAUSE) {
2023 ecmd.advertising |= ADVERTISED_Pause;
2025 if (advertise & NETDEV_F_PAUSE_ASYM) {
2026 ecmd.advertising |= ADVERTISED_Asym_Pause;
2028 COVERAGE_INC(netdev_set_ethtool);
2029 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
2030 ETHTOOL_SSET, "ETHTOOL_SSET");
2033 ovs_mutex_unlock(&netdev->mutex);
2037 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
2038 * successful, otherwise a positive errno value. */
/* Configures ingress policing on 'netdev_': removes any existing ingress
 * qdisc, then (if 'kbits_rate' is nonzero) re-adds one with a tc policer.
 * Results are cached so unchanged settings skip the tc round-trips. */
2040 netdev_linux_set_policing(struct netdev *netdev_,
2041 uint32_t kbits_rate, uint32_t kbits_burst)
2043 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2044 const char *netdev_name = netdev_get_name(netdev_);
2047 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
2048 : !kbits_burst ? 8000 /* Default to 8000 kbits if 0. */
2049 : kbits_burst); /* Stick with user-specified value. */
2051 ovs_mutex_lock(&netdev->mutex);
2052 if (netdev->cache_valid & VALID_POLICING) {
2053 error = netdev->netdev_policing_error;
2054 if (error || (netdev->kbits_rate == kbits_rate &&
2055 netdev->kbits_burst == kbits_burst)) {
2056 /* Assume that settings haven't changed since we last set them. */
2059 netdev->cache_valid &= ~VALID_POLICING;
2062 COVERAGE_INC(netdev_set_policing);
2063 /* Remove any existing ingress qdisc. */
2064 error = tc_add_del_ingress_qdisc(netdev_, false);
2066 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
2067 netdev_name, ovs_strerror(error));
/* Only install a new ingress qdisc + policer when a rate is requested
 * (condition line elided in this listing). */
2072 error = tc_add_del_ingress_qdisc(netdev_, true);
2074 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
2075 netdev_name, ovs_strerror(error));
2079 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
2081 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
2082 netdev_name, ovs_strerror(error));
2087 netdev->kbits_rate = kbits_rate;
2088 netdev->kbits_burst = kbits_burst;
/* Cache the outcome even for ENODEV so a vanished device is not re-probed. */
2091 if (!error || error == ENODEV) {
2092 netdev->netdev_policing_error = error;
2093 netdev->cache_valid |= VALID_POLICING;
2095 ovs_mutex_unlock(&netdev->mutex);
/* Adds to 'types' the OVS name of every installable QoS discipline in the
 * global 'tcs' table.  Entries with an empty ovs_name (the default qdisc)
 * are not user-selectable and are skipped. */
2100 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
2103 const struct tc_ops *const *opsp;
2105 for (opsp = tcs; *opsp != NULL; opsp++) {
2106 const struct tc_ops *ops = *opsp;
2107 if (ops->tc_install && ops->ovs_name[0] != '\0') {
2108 sset_add(types, ops->ovs_name);
/* Returns the tc_ops whose OVS-facing name is 'name', or (per the elided
 * tail of this function) a not-found result. */
2114 static const struct tc_ops *
2115 tc_lookup_ovs_name(const char *name)
2117 const struct tc_ops *const *opsp;
2119 for (opsp = tcs; *opsp != NULL; opsp++) {
2120 const struct tc_ops *ops = *opsp;
2121 if (!strcmp(name, ops->ovs_name)) {
/* Returns the tc_ops whose Linux qdisc name is 'name'; linux_name may be
 * NULL for pseudo-disciplines, hence the extra check. */
2128 static const struct tc_ops *
2129 tc_lookup_linux_name(const char *name)
2131 const struct tc_ops *const *opsp;
2133 for (opsp = tcs; *opsp != NULL; opsp++) {
2134 const struct tc_ops *ops = *opsp;
2135 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Looks up the queue with 'queue_id' in 'netdev_''s qdisc queue hash using
 * precomputed 'hash'.  Caller must have already queried the qdisc. */
2142 static struct tc_queue *
2143 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
2146 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2147 struct tc_queue *queue;
2149 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
2150 if (queue->queue_id == queue_id) {
/* Convenience wrapper: hashes 'queue_id' and delegates to tc_find_queue__(). */
2157 static struct tc_queue *
2158 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
2160 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Reports in 'caps' the capabilities of QoS type 'type' (currently just the
 * queue count supported by the matching tc_ops). */
2164 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
2166 struct netdev_qos_capabilities *caps)
2168 const struct tc_ops *ops = tc_lookup_ovs_name(type);
2172 caps->n_queues = ops->n_queues;
/* Reports the currently installed QoS type and its details for 'netdev_'. */
2177 netdev_linux_get_qos(const struct netdev *netdev_,
2178 const char **typep, struct smap *details)
2180 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2183 ovs_mutex_lock(&netdev->mutex);
/* Ensure netdev->tc reflects the kernel's current qdisc. */
2184 error = tc_query_qdisc(netdev_);
2186 *typep = netdev->tc->ops->ovs_name;
/* qdisc_get is optional; disciplines without parameters omit it. */
2187 error = (netdev->tc->ops->qdisc_get
2188 ? netdev->tc->ops->qdisc_get(netdev_, details)
2191 ovs_mutex_unlock(&netdev->mutex);
/* Installs QoS discipline 'type' with 'details' on 'netdev_'.  If 'type'
 * matches the current discipline, only its parameters are updated; otherwise
 * the old qdisc is deleted and the new one installed. */
2197 netdev_linux_set_qos(struct netdev *netdev_,
2198 const char *type, const struct smap *details)
2200 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2201 const struct tc_ops *new_ops;
2204 new_ops = tc_lookup_ovs_name(type);
/* Reject unknown or non-installable QoS types up front. */
2205 if (!new_ops || !new_ops->tc_install) {
2209 ovs_mutex_lock(&netdev->mutex);
2210 error = tc_query_qdisc(netdev_);
2215 if (new_ops == netdev->tc->ops) {
2216 error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
2218 /* Delete existing qdisc. */
2219 error = tc_del_qdisc(netdev_);
2223 ovs_assert(netdev->tc == NULL);
2225 /* Install new qdisc. */
2226 error = new_ops->tc_install(netdev_, details);
2227 ovs_assert((error == 0) == (netdev->tc != NULL));
2231 ovs_mutex_unlock(&netdev->mutex);
/* Retrieves configuration details for queue 'queue_id' on 'netdev_'. */
2236 netdev_linux_get_queue(const struct netdev *netdev_,
2237 unsigned int queue_id, struct smap *details)
2239 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2242 ovs_mutex_lock(&netdev->mutex);
2243 error = tc_query_qdisc(netdev_);
2245 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2247 ? netdev->tc->ops->class_get(netdev_, queue, details)
2250 ovs_mutex_unlock(&netdev->mutex);
/* Configures queue 'queue_id' on 'netdev_' with 'details', provided the id
 * is within range and the discipline supports per-class configuration. */
2256 netdev_linux_set_queue(struct netdev *netdev_,
2257 unsigned int queue_id, const struct smap *details)
2259 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2262 ovs_mutex_lock(&netdev->mutex);
2263 error = tc_query_qdisc(netdev_);
2265 error = (queue_id < netdev->tc->ops->n_queues
2266 && netdev->tc->ops->class_set
2267 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
2270 ovs_mutex_unlock(&netdev->mutex);
/* Deletes queue 'queue_id' from 'netdev_' if the installed discipline
 * supports class deletion and the queue exists. */
2276 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
2278 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2281 ovs_mutex_lock(&netdev->mutex);
2282 error = tc_query_qdisc(netdev_);
2284 if (netdev->tc->ops->class_delete) {
2285 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2287 ? netdev->tc->ops->class_delete(netdev_, queue)
2293 ovs_mutex_unlock(&netdev->mutex);
/* Retrieves statistics for queue 'queue_id' on 'netdev_', including the
 * queue creation time, via the discipline's class_get_stats hook. */
2299 netdev_linux_get_queue_stats(const struct netdev *netdev_,
2300 unsigned int queue_id,
2301 struct netdev_queue_stats *stats)
2303 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2306 ovs_mutex_lock(&netdev->mutex);
2307 error = tc_query_qdisc(netdev_);
2309 if (netdev->tc->ops->class_get_stats) {
2310 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2312 stats->created = queue->created;
2313 error = netdev->tc->ops->class_get_stats(netdev_, queue,
2322 ovs_mutex_unlock(&netdev->mutex);
/* State for iterating a netlink RTM_GETTCLASS dump (members partially
 * elided in this listing; also holds a reply buffer). */
2327 struct queue_dump_state {
2328 struct nl_dump dump;
/* Begins a netlink dump of 'netdev''s traffic classes into 'state'.
 * The failure path for tc_make_request() is elided in this listing. */
2333 start_queue_dump(const struct netdev *netdev, struct queue_dump_state *state)
2335 struct ofpbuf request;
2336 struct tcmsg *tcmsg;
2338 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
/* tcm_parent == 0 requests classes of every parent. */
2342 tcmsg->tcm_parent = 0;
2343 nl_dump_start(&state->dump, NETLINK_ROUTE, &request);
2344 ofpbuf_uninit(&request);
2346 ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
/* Releases dump resources and returns the dump's completion status. */
2351 finish_queue_dump(struct queue_dump_state *state)
2353 ofpbuf_uninit(&state->buf);
2354 return nl_dump_done(&state->dump);
/* Iterator state for the queue-dump API: a snapshot of queue ids plus a
 * cursor (remaining members elided in this listing). */
2357 struct netdev_linux_queue_state {
2358 unsigned int *queues;
/* Starts a queue dump by snapshotting the ids of all queues currently in
 * 'netdev_''s qdisc into a freshly allocated state stored in '*statep'. */
2364 netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
2366 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2369 ovs_mutex_lock(&netdev->mutex);
2370 error = tc_query_qdisc(netdev_);
2372 if (netdev->tc->ops->class_get) {
2373 struct netdev_linux_queue_state *state;
2374 struct tc_queue *queue;
2377 *statep = state = xmalloc(sizeof *state);
2378 state->n_queues = hmap_count(&netdev->tc->queues);
2379 state->cur_queue = 0;
2380 state->queues = xmalloc(state->n_queues * sizeof *state->queues);
/* Snapshot ids so the dump is stable even if queues change later. */
2383 HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
2384 state->queues[i++] = queue->queue_id;
2390 ovs_mutex_unlock(&netdev->mutex);
/* Advances the dump: returns details of the next snapshotted queue that
 * still exists, skipping queue ids that have since been deleted. */
2396 netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
2397 unsigned int *queue_idp, struct smap *details)
2399 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2400 struct netdev_linux_queue_state *state = state_;
2403 ovs_mutex_lock(&netdev->mutex);
2404 while (state->cur_queue < state->n_queues) {
2405 unsigned int queue_id = state->queues[state->cur_queue++];
2406 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2409 *queue_idp = queue_id;
2410 error = netdev->tc->ops->class_get(netdev_, queue, details);
2414 ovs_mutex_unlock(&netdev->mutex);
/* Frees the queue-dump iterator state allocated by queue_dump_start(). */
2420 netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
2423 struct netdev_linux_queue_state *state = state_;
2425 free(state->queues);
/* Invokes 'cb' with statistics for each of 'netdev_'s queues, feeding
 * every dumped tc class through the qdisc's class_dump_stats callback. */
2431 netdev_linux_dump_queue_stats(const struct netdev *netdev_,
2432 netdev_dump_queue_stats_cb *cb, void *aux)
2434 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2437 ovs_mutex_lock(&netdev->mutex);
2438 error = tc_query_qdisc(netdev_);
2440 struct queue_dump_state state;
2442 if (!netdev->tc->ops->class_dump_stats) {
2444 } else if (!start_queue_dump(netdev_, &state)) {
2450 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
2451 retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
/* Always run the dump to completion so the netlink state is cleaned
 * up, regardless of per-class callback results. */
2458 retval = finish_queue_dump(&state);
2464 ovs_mutex_unlock(&netdev->mutex);
/* Assigns 'address' and 'netmask' as 'netdev_'s IPv4 address via the
 * SIOCSIFADDR and SIOCSIFNETMASK ioctls.  The netmask is only applied
 * when the address is not INADDR_ANY. */
2470 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2471 struct in_addr netmask)
2473 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2476 ovs_mutex_lock(&netdev->mutex);
2477 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2479 if (address.s_addr != INADDR_ANY) {
2480 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2481 "SIOCSIFNETMASK", netmask);
2485 ovs_mutex_unlock(&netdev->mutex);
/* NOTE(review): the comment below looks stale relative to the function
 * it precedes -- netdev_linux_get_addr_list() returns all addresses and
 * masks via 'addr'/'mask'/'n_cnt', not a single in6 -- confirm and
 * update upstream. */
2490 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address.
2491 * Otherwise, sets '*in6' to 'in6addr_any' and returns the corresponding
2494 netdev_linux_get_addr_list(const struct netdev *netdev_,
2495 struct in6_addr **addr, struct in6_addr **mask, int *n_cnt)
2497 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2500 ovs_mutex_lock(&netdev->mutex);
2501 error = netdev_get_addrs(netdev_get_name(netdev_), addr, mask, n_cnt);
2502 ovs_mutex_unlock(&netdev->mutex);
/* Fills generic 'sa' with an AF_INET sockaddr_in carrying 'addr'. */
2508 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
2510 struct sockaddr_in sin;
2511 memset(&sin, 0, sizeof sin);
2512 sin.sin_family = AF_INET;
2513 sin.sin_addr = addr;
/* Zero first so any tail beyond sizeof(sin) is deterministic. */
2516 memset(sa, 0, sizeof *sa);
2517 memcpy(sa, &sin, sizeof sin);
/* Issues address-setting ioctl 'ioctl_nr' ('ioctl_name' is for error
 * reporting) with 'addr' on 'netdev's interface. */
2521 do_set_addr(struct netdev *netdev,
2522 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2526 make_in4_sockaddr(&ifr.ifr_addr, addr);
2527 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2531 /* Adds 'router' as a default IP gateway. */
2533 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2535 struct in_addr any = { INADDR_ANY };
2539 memset(&rt, 0, sizeof rt);
/* Destination 0.0.0.0/0 with gateway 'router' == default route. */
2540 make_in4_sockaddr(&rt.rt_dst, any);
2541 make_in4_sockaddr(&rt.rt_gateway, router);
2542 make_in4_sockaddr(&rt.rt_genmask, any);
2543 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2544 error = af_inet_ioctl(SIOCADDRT, &rt);
2546 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
/* Looks up the next hop toward 'host' by scanning /proc/net/route,
 * storing the gateway (or 0 if directly reachable) in '*next_hop' and a
 * malloc'd device name in '*netdev_name'. */
2552 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2555 static const char fn[] = "/proc/net/route";
2560 *netdev_name = NULL;
2561 stream = fopen(fn, "r");
2562 if (stream == NULL) {
2563 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
/* One route per line; parse the kernel's fixed column layout. */
2568 while (fgets(line, sizeof line, stream)) {
2571 ovs_be32 dest, gateway, mask;
2572 int refcnt, metric, mtu;
2573 unsigned int flags, use, window, irtt;
2576 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2578 iface, &dest, &gateway, &flags, &refcnt,
2579 &use, &metric, &mask, &mtu, &window, &irtt)) {
2580 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2584 if (!(flags & RTF_UP)) {
2585 /* Skip routes that aren't up. */
2589 /* The output of 'dest', 'mask', and 'gateway' were given in
2590 * network byte order, so we don't need need any endian
2591 * conversions here. */
2592 if ((dest & mask) == (host->s_addr & mask)) {
2594 /* The host is directly reachable. */
2595 next_hop->s_addr = 0;
2597 /* To reach the host, we must go through a gateway. */
2598 next_hop->s_addr = gateway;
/* Caller owns the returned name. */
2600 *netdev_name = xstrdup(iface);
/* Populates 'smap' with driver name/version and firmware version,
 * fetching ETHTOOL_GDRVINFO once and caching it (VALID_DRVINFO). */
2612 netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
2614 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2617 ovs_mutex_lock(&netdev->mutex);
2618 if (!(netdev->cache_valid & VALID_DRVINFO)) {
2619 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
2621 COVERAGE_INC(netdev_get_ethtool);
2622 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2623 error = netdev_linux_do_ethtool(netdev->up.name,
2626 "ETHTOOL_GDRVINFO");
2628 netdev->cache_valid |= VALID_DRVINFO;
2633 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2634 smap_add(smap, "driver_version", netdev->drvinfo.version);
2635 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
2637 ovs_mutex_unlock(&netdev->mutex);
/* Internal devices have no hardware driver; report a fixed name. */
2643 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2646 smap_add(smap, "driver_name", "openvswitch");
2650 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2651 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2652 * returns 0. Otherwise, it returns a positive errno value; in particular,
2653 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2655 netdev_linux_arp_lookup(const struct netdev *netdev,
2656 ovs_be32 ip, struct eth_addr *mac)
2659 struct sockaddr_in sin;
2662 memset(&r, 0, sizeof r);
2663 memset(&sin, 0, sizeof sin);
2664 sin.sin_family = AF_INET;
2665 sin.sin_addr.s_addr = ip;
/* Protocol address for the SIOCGARP request. */
2667 memcpy(&r.arp_pa, &sin, sizeof sin);
2668 r.arp_ha.sa_family = ARPHRD_ETHER;
2670 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2671 COVERAGE_INC(netdev_arp_lookup);
2672 retval = af_inet_ioctl(SIOCGARP, &r);
2674 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
/* ENXIO ("no such entry") is expected and not worth logging. */
2675 } else if (retval != ENXIO) {
2676 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2677 netdev_get_name(netdev), IP_ARGS(ip),
2678 ovs_strerror(retval));
/* Converts netdev flag bits (NETDEV_*) to kernel IFF_* bits. */
2684 nd_to_iff_flags(enum netdev_flags nd)
2687 if (nd & NETDEV_UP) {
2690 if (nd & NETDEV_PROMISC) {
2693 if (nd & NETDEV_LOOPBACK) {
2694 iff |= IFF_LOOPBACK;
/* Converts kernel IFF_* bits back to netdev flag bits (NETDEV_*). */
2700 iff_to_nd_flags(int iff)
2702 enum netdev_flags nd = 0;
2706 if (iff & IFF_PROMISC) {
2707 nd |= NETDEV_PROMISC;
2709 if (iff & IFF_LOOPBACK) {
2710 nd |= NETDEV_LOOPBACK;
/* Turns 'off' flags off and 'on' flags on for 'netdev', reporting the
 * previous flags in '*old_flagsp'.  Caller must hold netdev->mutex. */
2716 update_flags(struct netdev_linux *netdev, enum netdev_flags off,
2717 enum netdev_flags on, enum netdev_flags *old_flagsp)
2718 OVS_REQUIRES(netdev->mutex)
2720 int old_flags, new_flags;
2723 old_flags = netdev->ifi_flags;
2724 *old_flagsp = iff_to_nd_flags(old_flags);
2725 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
/* Only touch the kernel when something actually changes; re-read the
 * flags afterward to keep the cached copy honest. */
2726 if (new_flags != old_flags) {
2727 error = set_flags(netdev_get_name(&netdev->up), new_flags);
2728 get_flags(&netdev->up, &netdev->ifi_flags);
/* Public wrapper: takes the device mutex around update_flags(). */
2735 netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2736 enum netdev_flags on, enum netdev_flags *old_flagsp)
2738 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2741 ovs_mutex_lock(&netdev->mutex);
2742 error = update_flags(netdev, off, on, old_flagsp);
2743 ovs_mutex_unlock(&netdev->mutex);
/* Builds a struct netdev_class initializer shared by the linux, tap and
 * internal device classes; only the constructor, stats, features and
 * status hooks vary per class.  (No comments inside the macro body:
 * they would break the backslash continuations.) */
2748 #define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS, \
2749 GET_FEATURES, GET_STATUS) \
2752 false, /* is_pmd */ \
2756 netdev_linux_wait, \
2758 netdev_linux_alloc, \
2760 netdev_linux_destruct, \
2761 netdev_linux_dealloc, \
2762 NULL, /* get_config */ \
2763 NULL, /* set_config */ \
2764 NULL, /* get_tunnel_config */ \
2765 NULL, /* build header */ \
2766 NULL, /* push header */ \
2767 NULL, /* pop header */ \
2768 NULL, /* get_numa_id */ \
2769 NULL, /* set_tx_multiq */ \
2771 netdev_linux_send, \
2772 netdev_linux_send_wait, \
2774 netdev_linux_set_etheraddr, \
2775 netdev_linux_get_etheraddr, \
2776 netdev_linux_get_mtu, \
2777 netdev_linux_set_mtu, \
2778 netdev_linux_get_ifindex, \
2779 netdev_linux_get_carrier, \
2780 netdev_linux_get_carrier_resets, \
2781 netdev_linux_set_miimon_interval, \
2785 netdev_linux_set_advertisements, \
2787 netdev_linux_set_policing, \
2788 netdev_linux_get_qos_types, \
2789 netdev_linux_get_qos_capabilities, \
2790 netdev_linux_get_qos, \
2791 netdev_linux_set_qos, \
2792 netdev_linux_get_queue, \
2793 netdev_linux_set_queue, \
2794 netdev_linux_delete_queue, \
2795 netdev_linux_get_queue_stats, \
2796 netdev_linux_queue_dump_start, \
2797 netdev_linux_queue_dump_next, \
2798 netdev_linux_queue_dump_done, \
2799 netdev_linux_dump_queue_stats, \
2801 netdev_linux_set_in4, \
2802 netdev_linux_get_addr_list, \
2803 netdev_linux_add_router, \
2804 netdev_linux_get_next_hop, \
2806 netdev_linux_arp_lookup, \
2808 netdev_linux_update_flags, \
2809 NULL, /* reconfigure */ \
2811 netdev_linux_rxq_alloc, \
2812 netdev_linux_rxq_construct, \
2813 netdev_linux_rxq_destruct, \
2814 netdev_linux_rxq_dealloc, \
2815 netdev_linux_rxq_recv, \
2816 netdev_linux_rxq_wait, \
2817 netdev_linux_rxq_drain, \
/* Ordinary Linux network device ("system"). */
2820 const struct netdev_class netdev_linux_class =
2823 netdev_linux_construct,
2824 netdev_linux_get_stats,
2825 netdev_linux_get_features,
2826 netdev_linux_get_status);
/* TAP device: separate constructor and stats (tx/rx are swapped from
 * the kernel's point of view on a tap). */
2828 const struct netdev_class netdev_tap_class =
2831 netdev_linux_construct_tap,
2832 netdev_tap_get_stats,
2833 netdev_linux_get_features,
2834 netdev_linux_get_status);
/* OVS internal device: no meaningful link features. */
2836 const struct netdev_class netdev_internal_class =
2839 netdev_linux_construct,
2840 netdev_internal_get_stats,
2841 NULL, /* get_features */
2842 netdev_internal_get_status);
/* CoDel traffic control class (classless: no per-queue classes). */
2845 #define CODEL_N_QUEUES 0x0000
2847 /* In sufficiently new kernel headers these are defined as enums in
2848 * <linux/pkt_sched.h>. Define them here as macros to help out with older
2849 * kernels. (This overrides any enum definition in the header file but that's
2851 #define TCA_CODEL_TARGET 1
2852 #define TCA_CODEL_LIMIT 2
2853 #define TCA_CODEL_INTERVAL 3
/* Downcasts the netdev's generic tc state to the codel wrapper. */
2862 static struct codel *
2863 codel_get__(const struct netdev *netdev_)
2865 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2866 return CONTAINER_OF(netdev->tc, struct codel, tc);
/* Records a freshly configured codel qdisc as 'netdev_'s tc state. */
2870 codel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
2873 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2874 struct codel *codel;
2876 codel = xmalloc(sizeof *codel);
2877 tc_init(&codel->tc, &tc_ops_codel);
2878 codel->target = target;
2879 codel->limit = limit;
2880 codel->interval = interval;
2882 netdev->tc = &codel->tc;
/* Replaces 'netdev's root qdisc with a codel qdisc configured from the
 * given parameters (0 selects the defaults below). */
2886 codel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
2890 struct ofpbuf request;
2891 struct tcmsg *tcmsg;
2892 uint32_t otarget, olimit, ointerval;
2895 tc_del_qdisc(netdev);
2897 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2898 NLM_F_EXCL | NLM_F_CREATE, &request);
2902 tcmsg->tcm_handle = tc_make_handle(1, 0);
2903 tcmsg->tcm_parent = TC_H_ROOT;
/* Defaults: target 5000, limit 10240, interval 100000 (units per the
 * kernel codel qdisc). */
2905 otarget = target ? target : 5000;
2906 olimit = limit ? limit : 10240;
2907 ointerval = interval ? interval : 100000;
2909 nl_msg_put_string(&request, TCA_KIND, "codel");
2910 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2911 nl_msg_put_u32(&request, TCA_CODEL_TARGET, otarget);
2912 nl_msg_put_u32(&request, TCA_CODEL_LIMIT, olimit);
2913 nl_msg_put_u32(&request, TCA_CODEL_INTERVAL, ointerval);
2914 nl_msg_end_nested(&request, opt_offset);
2916 error = tc_transact(&request, NULL);
2918 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
2919 "target %u, limit %u, interval %u error %d(%s)",
2920 netdev_get_name(netdev),
2921 otarget, olimit, ointerval,
2922 error, ovs_strerror(error));
/* Parses "target"/"limit"/"interval" from 'details', substituting the
 * same defaults used by codel_setup_qdisc__() when absent or zero. */
2928 codel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
2929 const struct smap *details, struct codel *codel)
2931 const char *target_s;
2932 const char *limit_s;
2933 const char *interval_s;
2935 target_s = smap_get(details, "target");
2936 limit_s = smap_get(details, "limit");
2937 interval_s = smap_get(details, "interval");
2939 codel->target = target_s ? strtoull(target_s, NULL, 10) : 0;
2940 codel->limit = limit_s ? strtoull(limit_s, NULL, 10) : 0;
2941 codel->interval = interval_s ? strtoull(interval_s, NULL, 10) : 0;
2943 if (!codel->target) {
2944 codel->target = 5000;
2946 if (!codel->limit) {
2947 codel->limit = 10240;
2949 if (!codel->interval) {
2950 codel->interval = 100000;
/* tc_ops 'tc_install' hook: configure the kernel qdisc, then record the
 * resulting state on success. */
2955 codel_tc_install(struct netdev *netdev, const struct smap *details)
2960 codel_parse_qdisc_details__(netdev, details, &codel);
2961 error = codel_setup_qdisc__(netdev, codel.target, codel.limit,
2964 codel_install__(netdev, codel.target, codel.limit, codel.interval);
/* Extracts target/limit/interval from nested TCA_OPTIONS attributes. */
2970 codel_parse_tca_options__(struct nlattr *nl_options, struct codel *codel)
2972 static const struct nl_policy tca_codel_policy[] = {
2973 [TCA_CODEL_TARGET] = { .type = NL_A_U32 },
2974 [TCA_CODEL_LIMIT] = { .type = NL_A_U32 },
2975 [TCA_CODEL_INTERVAL] = { .type = NL_A_U32 }
2978 struct nlattr *attrs[ARRAY_SIZE(tca_codel_policy)];
2980 if (!nl_parse_nested(nl_options, tca_codel_policy,
2981 attrs, ARRAY_SIZE(tca_codel_policy))) {
2982 VLOG_WARN_RL(&rl, "failed to parse CoDel class options");
2986 codel->target = nl_attr_get_u32(attrs[TCA_CODEL_TARGET]);
2987 codel->limit = nl_attr_get_u32(attrs[TCA_CODEL_LIMIT]);
2988 codel->interval = nl_attr_get_u32(attrs[TCA_CODEL_INTERVAL]);
/* tc_ops 'tc_load' hook: adopt an existing kernel codel qdisc. */
2993 codel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
2995 struct nlattr *nlattr;
3000 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3005 error = codel_parse_tca_options__(nlattr, &codel);
3010 codel_install__(netdev, codel.target, codel.limit, codel.interval);
/* tc_ops 'tc_destroy' hook: frees the codel wrapper. */
3016 codel_tc_destroy(struct tc *tc)
3018 struct codel *codel = CONTAINER_OF(tc, struct codel, tc);
/* tc_ops 'qdisc_get' hook: exports current parameters into 'details'. */
3024 codel_qdisc_get(const struct netdev *netdev, struct smap *details)
3026 const struct codel *codel = codel_get__(netdev);
3027 smap_add_format(details, "target", "%u", codel->target);
3028 smap_add_format(details, "limit", "%u", codel->limit);
3029 smap_add_format(details, "interval", "%u", codel->interval);
/* tc_ops 'qdisc_set' hook: updates cached state from 'details'.
 * NOTE(review): visible lines only update the cache; whether the kernel
 * qdisc is reconfigured here is not shown in this fragment. */
3034 codel_qdisc_set(struct netdev *netdev, const struct smap *details)
3038 codel_parse_qdisc_details__(netdev, details, &codel);
3039 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3040 codel_get__(netdev)->target = codel.target;
3041 codel_get__(netdev)->limit = codel.limit;
3042 codel_get__(netdev)->interval = codel.interval;
/* tc_ops vtable binding the hooks above to OVS QoS type "linux-codel". */
3046 static const struct tc_ops tc_ops_codel = {
3047 "codel", /* linux_name */
3048 "linux-codel", /* ovs_name */
3049 CODEL_N_QUEUES, /* n_queues */
3062 /* FQ-CoDel traffic control class. */
3064 #define FQCODEL_N_QUEUES 0x0000
3066 /* In sufficiently new kernel headers these are defined as enums in
3067 * <linux/pkt_sched.h>. Define them here as macros to help out with older
3068 * kernels. (This overrides any enum definition in the header file but that's
3070 #define TCA_FQ_CODEL_TARGET 1
3071 #define TCA_FQ_CODEL_LIMIT 2
3072 #define TCA_FQ_CODEL_INTERVAL 3
3073 #define TCA_FQ_CODEL_ECN 4
3074 #define TCA_FQ_CODEL_FLOWS 5
3075 #define TCA_FQ_CODEL_QUANTUM 6
/* Downcasts the netdev's generic tc state to the fqcodel wrapper. */
3086 static struct fqcodel *
3087 fqcodel_get__(const struct netdev *netdev_)
3089 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3090 return CONTAINER_OF(netdev->tc, struct fqcodel, tc);
/* Records a configured fq_codel qdisc as 'netdev_'s tc state. */
3094 fqcodel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
3095 uint32_t interval, uint32_t flows, uint32_t quantum)
3097 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3098 struct fqcodel *fqcodel;
3100 fqcodel = xmalloc(sizeof *fqcodel);
3101 tc_init(&fqcodel->tc, &tc_ops_fqcodel);
3102 fqcodel->target = target;
3103 fqcodel->limit = limit;
3104 fqcodel->interval = interval;
3105 fqcodel->flows = flows;
3106 fqcodel->quantum = quantum;
3108 netdev->tc = &fqcodel->tc;
/* Replaces 'netdev's root qdisc with fq_codel; 0 parameters select the
 * defaults below. */
3112 fqcodel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
3113 uint32_t interval, uint32_t flows, uint32_t quantum)
3116 struct ofpbuf request;
3117 struct tcmsg *tcmsg;
3118 uint32_t otarget, olimit, ointerval, oflows, oquantum;
3121 tc_del_qdisc(netdev);
3123 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3124 NLM_F_EXCL | NLM_F_CREATE, &request);
3128 tcmsg->tcm_handle = tc_make_handle(1, 0);
3129 tcmsg->tcm_parent = TC_H_ROOT;
3131 otarget = target ? target : 5000;
3132 olimit = limit ? limit : 10240;
3133 ointerval = interval ? interval : 100000;
3134 oflows = flows ? flows : 1024;
3135 oquantum = quantum ? quantum : 1514; /* fq_codel default quantum is 1514
3138 nl_msg_put_string(&request, TCA_KIND, "fq_codel");
3139 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3140 nl_msg_put_u32(&request, TCA_FQ_CODEL_TARGET, otarget);
3141 nl_msg_put_u32(&request, TCA_FQ_CODEL_LIMIT, olimit);
3142 nl_msg_put_u32(&request, TCA_FQ_CODEL_INTERVAL, ointerval);
3143 nl_msg_put_u32(&request, TCA_FQ_CODEL_FLOWS, oflows);
3144 nl_msg_put_u32(&request, TCA_FQ_CODEL_QUANTUM, oquantum);
3145 nl_msg_end_nested(&request, opt_offset);
3147 error = tc_transact(&request, NULL);
3149 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3150 "target %u, limit %u, interval %u, flows %u, quantum %u error %d(%s)",
3151 netdev_get_name(netdev),
3152 otarget, olimit, ointerval, oflows, oquantum,
3153 error, ovs_strerror(error));
/* Parses fq_codel parameters from 'details', applying defaults when
 * absent or zero. */
3159 fqcodel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
3160 const struct smap *details, struct fqcodel *fqcodel)
3162 const char *target_s;
3163 const char *limit_s;
3164 const char *interval_s;
3165 const char *flows_s;
3166 const char *quantum_s;
3168 target_s = smap_get(details, "target");
3169 limit_s = smap_get(details, "limit");
3170 interval_s = smap_get(details, "interval");
3171 flows_s = smap_get(details, "flows");
3172 quantum_s = smap_get(details, "quantum");
3173 fqcodel->target = target_s ? strtoull(target_s, NULL, 10) : 0;
3174 fqcodel->limit = limit_s ? strtoull(limit_s, NULL, 10) : 0;
3175 fqcodel->interval = interval_s ? strtoull(interval_s, NULL, 10) : 0;
3176 fqcodel->flows = flows_s ? strtoull(flows_s, NULL, 10) : 0;
3177 fqcodel->quantum = quantum_s ? strtoull(quantum_s, NULL, 10) : 0;
3178 if (!fqcodel->target) {
3179 fqcodel->target = 5000;
3181 if (!fqcodel->limit) {
3182 fqcodel->limit = 10240;
3184 if (!fqcodel->interval) {
/* NOTE(review): default interval here is 1000000 while
 * fqcodel_setup_qdisc__() defaults to 100000 -- confirm which value is
 * intended; the two code paths disagree. */
3185 fqcodel->interval = 1000000;
3187 if (!fqcodel->flows) {
3188 fqcodel->flows = 1024;
3190 if (!fqcodel->quantum) {
3191 fqcodel->quantum = 1514;
/* tc_ops 'tc_install' hook for fq_codel. */
3196 fqcodel_tc_install(struct netdev *netdev, const struct smap *details)
3199 struct fqcodel fqcodel;
3201 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3202 error = fqcodel_setup_qdisc__(netdev, fqcodel.target, fqcodel.limit,
3203 fqcodel.interval, fqcodel.flows,
3206 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit,
3207 fqcodel.interval, fqcodel.flows, fqcodel.quantum);
/* Extracts fq_codel parameters from nested TCA_OPTIONS attributes. */
3213 fqcodel_parse_tca_options__(struct nlattr *nl_options, struct fqcodel *fqcodel)
3215 static const struct nl_policy tca_fqcodel_policy[] = {
3216 [TCA_FQ_CODEL_TARGET] = { .type = NL_A_U32 },
3217 [TCA_FQ_CODEL_LIMIT] = { .type = NL_A_U32 },
3218 [TCA_FQ_CODEL_INTERVAL] = { .type = NL_A_U32 },
3219 [TCA_FQ_CODEL_FLOWS] = { .type = NL_A_U32 },
3220 [TCA_FQ_CODEL_QUANTUM] = { .type = NL_A_U32 }
3223 struct nlattr *attrs[ARRAY_SIZE(tca_fqcodel_policy)];
3225 if (!nl_parse_nested(nl_options, tca_fqcodel_policy,
3226 attrs, ARRAY_SIZE(tca_fqcodel_policy))) {
3227 VLOG_WARN_RL(&rl, "failed to parse FQ_CoDel class options");
3231 fqcodel->target = nl_attr_get_u32(attrs[TCA_FQ_CODEL_TARGET]);
3232 fqcodel->limit = nl_attr_get_u32(attrs[TCA_FQ_CODEL_LIMIT]);
3233 fqcodel->interval =nl_attr_get_u32(attrs[TCA_FQ_CODEL_INTERVAL]);
3234 fqcodel->flows = nl_attr_get_u32(attrs[TCA_FQ_CODEL_FLOWS]);
3235 fqcodel->quantum = nl_attr_get_u32(attrs[TCA_FQ_CODEL_QUANTUM]);
/* tc_ops 'tc_load' hook: adopt an existing kernel fq_codel qdisc. */
3240 fqcodel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3242 struct nlattr *nlattr;
3245 struct fqcodel fqcodel;
3247 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3252 error = fqcodel_parse_tca_options__(nlattr, &fqcodel);
3257 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3258 fqcodel.flows, fqcodel.quantum);
/* tc_ops 'tc_destroy' hook: frees the fqcodel wrapper. */
3263 fqcodel_tc_destroy(struct tc *tc)
3265 struct fqcodel *fqcodel = CONTAINER_OF(tc, struct fqcodel, tc);
/* tc_ops 'qdisc_get' hook: exports current parameters into 'details'. */
3271 fqcodel_qdisc_get(const struct netdev *netdev, struct smap *details)
3273 const struct fqcodel *fqcodel = fqcodel_get__(netdev);
3274 smap_add_format(details, "target", "%u", fqcodel->target);
3275 smap_add_format(details, "limit", "%u", fqcodel->limit);
3276 smap_add_format(details, "interval", "%u", fqcodel->interval);
3277 smap_add_format(details, "flows", "%u", fqcodel->flows);
3278 smap_add_format(details, "quantum", "%u", fqcodel->quantum);
/* tc_ops 'qdisc_set' hook: re-parses 'details' and refreshes the cached
 * parameter copy. */
3283 fqcodel_qdisc_set(struct netdev *netdev, const struct smap *details)
3285 struct fqcodel fqcodel;
3287 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3288 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3289 fqcodel.flows, fqcodel.quantum);
3290 fqcodel_get__(netdev)->target = fqcodel.target;
3291 fqcodel_get__(netdev)->limit = fqcodel.limit;
3292 fqcodel_get__(netdev)->interval = fqcodel.interval;
3293 fqcodel_get__(netdev)->flows = fqcodel.flows;
3294 fqcodel_get__(netdev)->quantum = fqcodel.quantum;
/* tc_ops vtable for OVS QoS type "linux-fq_codel". */
3298 static const struct tc_ops tc_ops_fqcodel = {
3299 "fq_codel", /* linux_name */
3300 "linux-fq_codel", /* ovs_name */
3301 FQCODEL_N_QUEUES, /* n_queues */
3314 /* SFQ traffic control class. */
3316 #define SFQ_N_QUEUES 0x0000
/* Downcasts the netdev's generic tc state to the sfq wrapper. */
3325 sfq_get__(const struct netdev *netdev_)
3327 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3328 return CONTAINER_OF(netdev->tc, struct sfq, tc);
/* Records a configured sfq qdisc as 'netdev_'s tc state.
 * Parameter order is (quantum, perturb) -- callers must match it. */
3332 sfq_install__(struct netdev *netdev_, uint32_t quantum, uint32_t perturb)
3334 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3337 sfq = xmalloc(sizeof *sfq);
3338 tc_init(&sfq->tc, &tc_ops_sfq);
3339 sfq->perturb = perturb;
3340 sfq->quantum = quantum;
3342 netdev->tc = &sfq->tc;
/* Replaces 'netdev's root qdisc with sfq; quantum defaults to the
 * device MTU and perturb_period to 10 when the caller passes 0. */
3346 sfq_setup_qdisc__(struct netdev *netdev, uint32_t quantum, uint32_t perturb)
3348 struct tc_sfq_qopt opt;
3349 struct ofpbuf request;
3350 struct tcmsg *tcmsg;
3352 int mtu_error, error;
3353 mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3355 tc_del_qdisc(netdev);
3357 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3358 NLM_F_EXCL | NLM_F_CREATE, &request);
3362 tcmsg->tcm_handle = tc_make_handle(1, 0);
3363 tcmsg->tcm_parent = TC_H_ROOT;
3365 memset(&opt, 0, sizeof opt);
3368 opt.quantum = mtu; /* if we cannot find mtu, use default */
3371 opt.quantum = quantum;
3375 opt.perturb_period = 10;
3377 opt.perturb_period = perturb;
3380 nl_msg_put_string(&request, TCA_KIND, "sfq");
3381 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3383 error = tc_transact(&request, NULL);
3385 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3386 "quantum %u, perturb %u error %d(%s)",
3387 netdev_get_name(netdev),
3388 opt.quantum, opt.perturb_period,
3389 error, ovs_strerror(error));
/* Parses "perturb"/"quantum" from 'details'; quantum falls back to the
 * device MTU, which must be available if quantum is unspecified. */
3395 sfq_parse_qdisc_details__(struct netdev *netdev,
3396 const struct smap *details, struct sfq *sfq)
3398 const char *perturb_s;
3399 const char *quantum_s;
3403 perturb_s = smap_get(details, "perturb");
3404 quantum_s = smap_get(details, "quantum");
3405 sfq->perturb = perturb_s ? strtoull(perturb_s, NULL, 10) : 0;
3406 sfq->quantum = quantum_s ? strtoull(quantum_s, NULL, 10) : 0;
3407 if (!sfq->perturb) {
3411 if (!sfq->quantum) {
3412 mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3416 VLOG_WARN_RL(&rl, "when using SFQ, you must specify quantum on a "
3417 "device without mtu");
/* tc_ops 'tc_install' hook for sfq. */
3424 sfq_tc_install(struct netdev *netdev, const struct smap *details)
3429 sfq_parse_qdisc_details__(netdev, details, &sfq);
3430 error = sfq_setup_qdisc__(netdev, sfq.quantum, sfq.perturb);
3432 sfq_install__(netdev, sfq.quantum, sfq.perturb);
/* tc_ops 'tc_load' hook: adopts an existing kernel sfq qdisc by parsing
 * its tc_sfq_qopt options and caching them.
 *
 * Bug fix: sfq_install__() is declared (netdev_, quantum, perturb) --
 * see sfq_install__() above -- but this call previously passed
 * 'sfq->perturb_period' first and 'sfq->quantum' second, swapping the
 * two values in the cached state.  Pass them in declaration order. */
3438 sfq_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3440 const struct tc_sfq_qopt *sfq;
3441 struct nlattr *nlattr;
3445 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3447 sfq = nl_attr_get(nlattr);
3448 sfq_install__(netdev, sfq->quantum, sfq->perturb_period);
/* tc_ops 'tc_destroy' hook: frees the sfq wrapper. */
3456 sfq_tc_destroy(struct tc *tc)
3458 struct sfq *sfq = CONTAINER_OF(tc, struct sfq, tc);
/* tc_ops 'qdisc_get' hook: exports quantum/perturb into 'details'. */
3464 sfq_qdisc_get(const struct netdev *netdev, struct smap *details)
3466 const struct sfq *sfq = sfq_get__(netdev);
3467 smap_add_format(details, "quantum", "%u", sfq->quantum);
3468 smap_add_format(details, "perturb", "%u", sfq->perturb);
/* tc_ops 'qdisc_set' hook: re-parses 'details' and refreshes the cached
 * quantum/perturb values. */
3473 sfq_qdisc_set(struct netdev *netdev, const struct smap *details)
3477 sfq_parse_qdisc_details__(netdev, details, &sfq);
3478 sfq_install__(netdev, sfq.quantum, sfq.perturb);
3479 sfq_get__(netdev)->quantum = sfq.quantum;
3480 sfq_get__(netdev)->perturb = sfq.perturb;
/* tc_ops vtable for OVS QoS type "linux-sfq". */
3484 static const struct tc_ops tc_ops_sfq = {
3485 "sfq", /* linux_name */
3486 "linux-sfq", /* ovs_name */
3487 SFQ_N_QUEUES, /* n_queues */
3500 /* HTB traffic control class. */
3502 #define HTB_N_QUEUES 0xf000
3503 #define HTB_RATE2QUANTUM 10
3507 unsigned int max_rate; /* In bytes/s. */
/* Per-queue HTB class parameters, embedded in the generic tc_queue. */
3511 struct tc_queue tc_queue;
3512 unsigned int min_rate; /* In bytes/s. */
3513 unsigned int max_rate; /* In bytes/s. */
3514 unsigned int burst; /* In bytes. */
3515 unsigned int priority; /* Lower values are higher priorities. */
/* Downcasts the netdev's generic tc state to the htb wrapper. */
3519 htb_get__(const struct netdev *netdev_)
3521 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3522 return CONTAINER_OF(netdev->tc, struct htb, tc);
/* Records a configured htb qdisc (with 'max_rate') as 'netdev_'s tc
 * state. */
3526 htb_install__(struct netdev *netdev_, uint64_t max_rate)
3528 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3531 htb = xmalloc(sizeof *htb);
3532 tc_init(&htb->tc, &tc_ops_htb);
3533 htb->max_rate = max_rate;
3535 netdev->tc = &htb->tc;
3538 /* Create an HTB qdisc.
3540 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
3542 htb_setup_qdisc__(struct netdev *netdev)
3545 struct tc_htb_glob opt;
3546 struct ofpbuf request;
3547 struct tcmsg *tcmsg;
3549 tc_del_qdisc(netdev);
3551 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3552 NLM_F_EXCL | NLM_F_CREATE, &request);
3556 tcmsg->tcm_handle = tc_make_handle(1, 0);
3557 tcmsg->tcm_parent = TC_H_ROOT;
3559 nl_msg_put_string(&request, TCA_KIND, "htb");
3561 memset(&opt, 0, sizeof opt);
3562 opt.rate2quantum = HTB_RATE2QUANTUM;
3566 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3567 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
3568 nl_msg_end_nested(&request, opt_offset);
3570 return tc_transact(&request, NULL);
3573 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
3574 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
3576 htb_setup_class__(struct netdev *netdev, unsigned int handle,
3577 unsigned int parent, struct htb_class *class)
3580 struct tc_htb_opt opt;
3581 struct ofpbuf request;
3582 struct tcmsg *tcmsg;
/* HTB rate tables are computed from the MTU; without it we cannot
 * build a valid class. */
3586 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3588 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
3589 netdev_get_name(netdev));
3593 memset(&opt, 0, sizeof opt);
3594 tc_fill_rate(&opt.rate, class->min_rate, mtu);
3595 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
3596 /* Makes sure the quantum is at least MTU. Setting quantum will
3597 * make htb ignore the r2q for this class. */
3598 if ((class->min_rate / HTB_RATE2QUANTUM) < mtu) {
3601 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
3602 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
3603 opt.prio = class->priority;
3605 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3609 tcmsg->tcm_handle = handle;
3610 tcmsg->tcm_parent = parent;
3612 nl_msg_put_string(&request, TCA_KIND, "htb");
3613 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3614 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
3615 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
3616 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
3617 nl_msg_end_nested(&request, opt_offset);
3619 error = tc_transact(&request, NULL);
3621 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3622 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
3623 netdev_get_name(netdev),
3624 tc_get_major(handle), tc_get_minor(handle),
3625 tc_get_major(parent), tc_get_minor(parent),
3626 class->min_rate, class->max_rate,
3627 class->burst, class->priority, ovs_strerror(error));
3632 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
3633 * description of them into 'details'. The description complies with the
3634 * specification given in the vswitch database documentation for linux-htb
3637 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
3639 static const struct nl_policy tca_htb_policy[] = {
3640 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
3641 .min_len = sizeof(struct tc_htb_opt) },
3644 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
3645 const struct tc_htb_opt *htb;
3647 if (!nl_parse_nested(nl_options, tca_htb_policy,
3648 attrs, ARRAY_SIZE(tca_htb_policy))) {
3649 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
3653 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
3654 class->min_rate = htb->rate.rate;
3655 class->max_rate = htb->ceil.rate;
/* The kernel stores burst as a time ('buffer'); convert back to bytes
 * using the class rate. */
3656 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
3657 class->priority = htb->prio;
/* Parses one dumped HTB class message: decodes the class handle into a
 * 0-based queue id (minor - 1 under major 1) and optionally fills
 * 'options' and 'stats'. */
3662 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3663 struct htb_class *options,
3664 struct netdev_queue_stats *stats)
3666 struct nlattr *nl_options;
3667 unsigned int handle;
3670 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3671 if (!error && queue_id) {
3672 unsigned int major = tc_get_major(handle);
3673 unsigned int minor = tc_get_minor(handle);
3674 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3675 *queue_id = minor - 1;
3680 if (!error && options) {
3681 error = htb_parse_tca_options__(nl_options, options);
/* Parses qdisc-level "max-rate" (bits/s in the database, stored here as
 * bytes/s); defaults to the link speed, assuming 100 Mbps when link
 * features are unavailable. */
3687 htb_parse_qdisc_details__(struct netdev *netdev_,
3688 const struct smap *details, struct htb_class *hc)
3690 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3691 const char *max_rate_s;
3693 max_rate_s = smap_get(details, "max-rate");
3694 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3695 if (!hc->max_rate) {
3696 enum netdev_features current;
3698 netdev_linux_read_features(netdev);
3699 current = !netdev->get_features_error ? netdev->current : 0;
3700 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3702 hc->min_rate = hc->max_rate;
/* Parses per-class min-rate/max-rate/burst/priority from 'details',
 * clamping rates into [mtu, qdisc max_rate]. */
3708 htb_parse_class_details__(struct netdev *netdev,
3709 const struct smap *details, struct htb_class *hc)
3711 const struct htb *htb = htb_get__(netdev);
3712 const char *min_rate_s = smap_get(details, "min-rate");
3713 const char *max_rate_s = smap_get(details, "max-rate");
3714 const char *burst_s = smap_get(details, "burst");
3715 const char *priority_s = smap_get(details, "priority");
3718 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3720 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
3721 netdev_get_name(netdev));
3725 /* HTB requires at least an mtu sized min-rate to send any traffic even
3726 * on uncongested links. */
3727 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3728 hc->min_rate = MAX(hc->min_rate, mtu);
3729 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
3732 hc->max_rate = (max_rate_s
3733 ? strtoull(max_rate_s, NULL, 10) / 8
3735 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
3736 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
3740 * According to hints in the documentation that I've read, it is important
3741 * that 'burst' be at least as big as the largest frame that might be
3742 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
3743 * but having it a bit too small is a problem. Since netdev_get_mtu()
3744 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
3745 * the MTU. We actually add 64, instead of 14, as a guard against
3746 * additional headers get tacked on somewhere that we're not aware of. */
3747 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
3748 hc->burst = MAX(hc->burst, mtu + 64);
3751 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
3757 htb_query_class__(const struct netdev *netdev, unsigned int handle,
3758 unsigned int parent, struct htb_class *options,
3759 struct netdev_queue_stats *stats)
3761 struct ofpbuf *reply;
3764 error = tc_query_class(netdev, handle, parent, &reply);
3766 error = htb_parse_tcmsg__(reply, NULL, options, stats);
3767 ofpbuf_delete(reply);
/* Installs an HTB qdisc on 'netdev' configured per 'details' and records the
 * new configuration in 'netdev''s tc state.  Returns 0 on success. */
static int
htb_tc_install(struct netdev *netdev, const struct smap *details)
{
    int error;

    error = htb_setup_qdisc__(netdev);
    if (!error) {
        struct htb_class hc;

        /* Class 1:0xfffe is the root class that all queues hang off of. */
        htb_parse_qdisc_details__(netdev, details, &hc);
        error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
                                  tc_make_handle(1, 0), &hc);
        if (!error) {
            htb_install__(netdev, hc.max_rate);
        }
    }
    return error;
}
/* Returns the htb_class that embeds 'queue'. */
static struct htb_class *
htb_class_cast__(const struct tc_queue *queue)
{
    return CONTAINER_OF(queue, struct htb_class, tc_queue);
}
/* Creates or updates the cached htb_class for 'queue_id' on 'netdev' so that
 * it mirrors the kernel configuration in 'hc'. */
static void
htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
                   const struct htb_class *hc)
{
    struct htb *htb = htb_get__(netdev);
    size_t hash = hash_int(queue_id, 0);
    struct tc_queue *queue;
    struct htb_class *hcp;

    queue = tc_find_queue__(netdev, queue_id, hash);
    if (queue) {
        hcp = htb_class_cast__(queue);
    } else {
        /* First sighting of this queue: allocate and index it. */
        hcp = xmalloc(sizeof *hcp);
        queue = &hcp->tc_queue;
        queue->queue_id = queue_id;
        queue->created = time_msec();
        hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
    }

    hcp->min_rate = hc->min_rate;
    hcp->max_rate = hc->max_rate;
    hcp->burst = hc->burst;
    hcp->priority = hc->priority;
}
/* Loads an existing kernel HTB configuration into 'netdev''s tc state:
 * reads the root class for the max-rate, then dumps and caches all queues. */
static int
htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
{
    struct ofpbuf msg;
    struct queue_dump_state state;
    struct htb_class hc;

    /* Get qdisc options. */
    hc.max_rate = 0;
    htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
    htb_install__(netdev, hc.max_rate);

    /* Get queues. */
    if (!start_queue_dump(netdev, &state)) {
        return ENODEV;
    }
    while (nl_dump_next(&state.dump, &msg, &state.buf)) {
        unsigned int queue_id;

        if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
            htb_update_queue__(netdev, queue_id, &hc);
        }
    }
    finish_queue_dump(&state);

    return 0;
}
/* Frees the local HTB state embedded in 'tc', including all cached queues.
 * Does not touch kernel configuration. */
static void
htb_tc_destroy(struct tc *tc)
{
    struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
    struct htb_class *hc;

    HMAP_FOR_EACH_POP (hc, tc_queue.hmap_node, &htb->tc.queues) {
        free(hc);
    }
    tc_destroy(tc);
    free(htb);
}
/* Reports the cached qdisc configuration; rates are stored internally in
 * bytes/s, so multiply by 8 to report bits/s as the database expects. */
static int
htb_qdisc_get(const struct netdev *netdev, struct smap *details)
{
    const struct htb *htb = htb_get__(netdev);
    smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
    return 0;
}
/* Reconfigures the root HTB class from 'details' and, on success, updates
 * the cached max_rate.  Returns 0 on success. */
static int
htb_qdisc_set(struct netdev *netdev, const struct smap *details)
{
    struct htb_class hc;
    int error;

    htb_parse_qdisc_details__(netdev, details, &hc);
    error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
                              tc_make_handle(1, 0), &hc);
    if (!error) {
        htb_get__(netdev)->max_rate = hc.max_rate;
    }
    return error;
}
/* Fills 'details' with the configuration of 'queue'.  Internal rates are
 * bytes/s; the database speaks bits/s, hence the "* 8". */
static int
htb_class_get(const struct netdev *netdev OVS_UNUSED,
              const struct tc_queue *queue, struct smap *details)
{
    const struct htb_class *hc = htb_class_cast__(queue);

    smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
    if (hc->min_rate != hc->max_rate) {
        smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
    }
    smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
    if (hc->priority) {
        smap_add_format(details, "priority", "%u", hc->priority);
    }
    return 0;
}
/* Creates or reconfigures queue 'queue_id' from 'details'.  Queue N maps to
 * kernel class handle 1:(N+1), parented on the root class 1:0xfffe. */
static int
htb_class_set(struct netdev *netdev, unsigned int queue_id,
              const struct smap *details)
{
    struct htb_class hc;
    int error;

    error = htb_parse_class_details__(netdev, details, &hc);
    if (error) {
        return error;
    }

    error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
                              tc_make_handle(1, 0xfffe), &hc);
    if (error) {
        return error;
    }

    htb_update_queue__(netdev, queue_id, &hc);
    return 0;
}
/* Deletes 'queue' from the kernel and, on success, removes and frees the
 * cached class. */
static int
htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
{
    struct htb_class *hc = htb_class_cast__(queue);
    struct htb *htb = htb_get__(netdev);
    int error;

    error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
    if (!error) {
        hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
        free(hc);
    }
    return error;
}
/* Retrieves kernel statistics for 'queue' into 'stats'. */
static int
htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
                    struct netdev_queue_stats *stats)
{
    return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
                             tc_make_handle(1, 0xfffe), NULL, stats);
}
/* Parses one class message from a queue-stats dump and invokes 'cb' for it,
 * skipping handles that do not correspond to an OVS queue (major != 1 or
 * minor outside 1..HTB_N_QUEUES). */
static int
htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
                     const struct ofpbuf *nlmsg,
                     netdev_dump_queue_stats_cb *cb, void *aux)
{
    struct netdev_queue_stats stats;
    unsigned int handle, major, minor;
    int error;

    error = tc_parse_class(nlmsg, &handle, NULL, &stats);
    if (error) {
        return error;
    }

    major = tc_get_major(handle);
    minor = tc_get_minor(handle);
    if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
        (*cb)(minor - 1, &stats, aux);
    }
    return 0;
}
/* tc_ops vtable for the HTB qdisc, exposed as QoS type "linux-htb". */
static const struct tc_ops tc_ops_htb = {
    "htb",                      /* linux_name */
    "linux-htb",                /* ovs_name */
    HTB_N_QUEUES,               /* n_queues */
    htb_tc_install,             /* tc_install */
    htb_tc_load,                /* tc_load */
    htb_tc_destroy,             /* tc_destroy */
    htb_qdisc_get,              /* qdisc_get */
    htb_qdisc_set,              /* qdisc_set */
    htb_class_get,              /* class_get */
    htb_class_set,              /* class_set */
    htb_class_delete,           /* class_delete */
    htb_class_get_stats,        /* class_get_stats */
    htb_class_dump_stats        /* class_dump_stats */
};
3987 /* "linux-hfsc" traffic control class. */
3989 #define HFSC_N_QUEUES 0xf000
3997 struct tc_queue tc_queue;
/* Returns 'netdev_''s tc state downcast to its HFSC representation.  Only
 * valid while the netdev's qdisc is linux-hfsc. */
static struct hfsc *
hfsc_get__(const struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    return CONTAINER_OF(netdev->tc, struct hfsc, tc);
}
/* Returns the hfsc_class that embeds 'queue'. */
static struct hfsc_class *
hfsc_class_cast__(const struct tc_queue *queue)
{
    return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
}
/* Allocates and installs fresh HFSC state with the given 'max_rate'
 * (bytes/s) as 'netdev_''s tc implementation. */
static void
hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct hfsc *hfsc;

    hfsc = xmalloc(sizeof *hfsc);
    tc_init(&hfsc->tc, &tc_ops_hfsc);
    hfsc->max_rate = max_rate;
    netdev->tc = &hfsc->tc;
}
/* Creates or updates the cached hfsc_class for 'queue_id' on 'netdev' so that
 * it mirrors the kernel configuration in 'hc'. */
static void
hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
                    const struct hfsc_class *hc)
{
    size_t hash;
    struct hfsc *hfsc;
    struct hfsc_class *hcp;
    struct tc_queue *queue;

    hfsc = hfsc_get__(netdev);
    hash = hash_int(queue_id, 0);

    queue = tc_find_queue__(netdev, queue_id, hash);
    if (queue) {
        hcp = hfsc_class_cast__(queue);
    } else {
        /* First sighting of this queue: allocate and index it. */
        hcp = xmalloc(sizeof *hcp);
        queue = &hcp->tc_queue;
        queue->queue_id = queue_id;
        queue->created = time_msec();
        hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
    }

    hcp->min_rate = hc->min_rate;
    hcp->max_rate = hc->max_rate;
}
/* Extracts min-rate/max-rate from the HFSC service curves in 'nl_options'
 * into 'class'.  OVS only configures linear curves (m1 == d == 0) with
 * identical real-time and link-share curves, so anything else is rejected
 * as unsupported.  Returns 0 on success, otherwise EPROTO. */
static int
hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
{
    const struct tc_service_curve *rsc, *fsc, *usc;
    static const struct nl_policy tca_hfsc_policy[] = {
        [TCA_HFSC_RSC] = {
            .type = NL_A_UNSPEC,
            .optional = false,
            .min_len = sizeof(struct tc_service_curve),
        },
        [TCA_HFSC_FSC] = {
            .type = NL_A_UNSPEC,
            .optional = false,
            .min_len = sizeof(struct tc_service_curve),
        },
        [TCA_HFSC_USC] = {
            .type = NL_A_UNSPEC,
            .optional = false,
            .min_len = sizeof(struct tc_service_curve),
        },
    };
    struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];

    if (!nl_parse_nested(nl_options, tca_hfsc_policy,
                         attrs, ARRAY_SIZE(tca_hfsc_policy))) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
        return EPROTO;
    }

    rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
    fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
    usc = nl_attr_get(attrs[TCA_HFSC_USC]);

    /* Non-zero m1 or d would mean a two-piece (non-linear) curve. */
    if (rsc->m1 != 0 || rsc->d != 0 ||
        fsc->m1 != 0 || fsc->d != 0 ||
        usc->m1 != 0 || usc->d != 0) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
                     "Non-linear service curves are not supported.");
        return EPROTO;
    }

    if (rsc->m2 != fsc->m2) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
                     "Real-time service curves are not supported ");
        return EPROTO;
    }

    if (rsc->m2 > usc->m2) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
                     "Min-rate service curve is greater than "
                     "the max-rate service curve.");
        return EPROTO;
    }

    class->min_rate = fsc->m2;
    class->max_rate = usc->m2;
    return 0;
}
/* Parses a class netlink message 'tcmsg', extracting the OVS queue id into
 * '*queue_id', the service-curve options into 'options', and statistics into
 * 'stats'.  Any of the outputs may be NULL.  Returns 0 on success, otherwise
 * a positive errno value (EPROTO for handles outside the OVS queue range). */
static int
hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
                   struct hfsc_class *options,
                   struct netdev_queue_stats *stats)
{
    int error;
    unsigned int handle;
    struct nlattr *nl_options;

    error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
    if (error) {
        return error;
    }

    if (queue_id) {
        unsigned int major, minor;

        major = tc_get_major(handle);
        minor = tc_get_minor(handle);
        if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
            *queue_id = minor - 1;
        } else {
            return EPROTO;
        }
    }

    if (options) {
        error = hfsc_parse_tca_options__(nl_options, options);
    }

    return error;
}
/* Queries the kernel for the HFSC class with the given 'handle' and 'parent'
 * on 'netdev' and, on success, fills in 'options' and/or 'stats' (either may
 * be NULL).  Returns 0 on success, otherwise a positive errno value. */
static int
hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
                   unsigned int parent, struct hfsc_class *options,
                   struct netdev_queue_stats *stats)
{
    int error;
    struct ofpbuf *reply;

    error = tc_query_class(netdev, handle, parent, &reply);
    if (error) {
        return error;
    }

    error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
    ofpbuf_delete(reply);
    return error;
}
/* Parses qdisc-level "max-rate" (bits/s in the database, converted to
 * bytes/s here) from 'details' into 'class'.  If absent or zero, falls back
 * to the link speed reported by the device, defaulting to 100 Mbps. */
static void
hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
                           struct hfsc_class *class)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    uint32_t max_rate;
    const char *max_rate_s;

    max_rate_s = smap_get(details, "max-rate");
    max_rate   = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;

    if (!max_rate) {
        enum netdev_features current;

        netdev_linux_read_features(netdev);
        current = !netdev->get_features_error ? netdev->current : 0;
        max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
    }

    class->min_rate = max_rate;
    class->max_rate = max_rate;
}
/* Parses per-queue "min-rate" and "max-rate" from 'details' into 'class',
 * clamping both to [1, qdisc max_rate] and ensuring max >= min.  Rates are
 * bits/s in the database, bytes/s internally. */
static int
hfsc_parse_class_details__(struct netdev *netdev,
                           const struct smap *details,
                           struct hfsc_class * class)
{
    const struct hfsc *hfsc;
    uint32_t min_rate, max_rate;
    const char *min_rate_s, *max_rate_s;

    hfsc       = hfsc_get__(netdev);
    min_rate_s = smap_get(details, "min-rate");
    max_rate_s = smap_get(details, "max-rate");

    min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
    min_rate = MAX(min_rate, 1);
    min_rate = MIN(min_rate, hfsc->max_rate);

    max_rate = (max_rate_s
                ? strtoull(max_rate_s, NULL, 10) / 8
                : hfsc->max_rate);
    max_rate = MAX(max_rate, min_rate);
    max_rate = MIN(max_rate, hfsc->max_rate);

    class->min_rate = min_rate;
    class->max_rate = max_rate;

    return 0;
}
/* Create an HFSC qdisc.
 *
 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
static int
hfsc_setup_qdisc__(struct netdev * netdev)
{
    struct tcmsg *tcmsg;
    struct ofpbuf request;
    struct tc_hfsc_qopt opt;

    /* Replace whatever root qdisc is currently installed. */
    tc_del_qdisc(netdev);

    tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
                            NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }

    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    memset(&opt, 0, sizeof opt);
    opt.defcls = 1;             /* Unclassified traffic goes to class 1:1. */

    nl_msg_put_string(&request, TCA_KIND, "hfsc");
    nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);

    return tc_transact(&request, NULL);
}
/* Create an HFSC class.
 *
 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
 * sc rate <min_rate> ul rate <max_rate>" */
static int
hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
                   unsigned int parent, struct hfsc_class *class)
{
    int error;
    size_t opt_offset;
    struct tcmsg *tcmsg;
    struct ofpbuf request;
    struct tc_service_curve min, max;

    tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }

    tcmsg->tcm_handle = handle;
    tcmsg->tcm_parent = parent;

    /* Linear service curves: zero initial slope and offset. */
    min.m1 = 0;
    min.d  = 0;
    min.m2 = class->min_rate;

    max.m1 = 0;
    max.d  = 0;
    max.m2 = class->max_rate;

    nl_msg_put_string(&request, TCA_KIND, "hfsc");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    /* Real-time and link-share both carry the min-rate curve; the upper
     * limit carries the max-rate curve. */
    nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
    nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
    nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
                     "min-rate %ubps, max-rate %ubps (%s)",
                     netdev_get_name(netdev),
                     tc_get_major(handle), tc_get_minor(handle),
                     tc_get_major(parent), tc_get_minor(parent),
                     class->min_rate, class->max_rate, ovs_strerror(error));
    }

    return error;
}
/* Installs an HFSC qdisc on 'netdev' configured per 'details' and records the
 * new configuration in 'netdev''s tc state.  Returns 0 on success. */
static int
hfsc_tc_install(struct netdev *netdev, const struct smap *details)
{
    int error;
    struct hfsc_class class;

    error = hfsc_setup_qdisc__(netdev);
    if (error) {
        return error;
    }

    hfsc_parse_qdisc_details__(netdev, details, &class);
    error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
                               tc_make_handle(1, 0), &class);
    if (error) {
        return error;
    }

    hfsc_install__(netdev, class.max_rate);
    return 0;
}
/* Loads an existing kernel HFSC configuration into 'netdev''s tc state:
 * reads the root class for the max-rate, then dumps and caches all queues. */
static int
hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
{
    struct ofpbuf msg;
    struct queue_dump_state state;
    struct hfsc_class hc;

    hc.max_rate = 0;
    hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
    hfsc_install__(netdev, hc.max_rate);

    if (!start_queue_dump(netdev, &state)) {
        return ENODEV;
    }

    while (nl_dump_next(&state.dump, &msg, &state.buf)) {
        unsigned int queue_id;

        if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
            hfsc_update_queue__(netdev, queue_id, &hc);
        }
    }

    finish_queue_dump(&state);
    return 0;
}
/* Frees the local HFSC state embedded in 'tc', including all cached queues.
 * Does not touch kernel configuration. */
static void
hfsc_tc_destroy(struct tc *tc)
{
    struct hfsc *hfsc;
    struct hfsc_class *hc, *next;

    hfsc = CONTAINER_OF(tc, struct hfsc, tc);

    HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
        hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
        free(hc);
    }

    tc_destroy(tc);
    free(hfsc);
}
/* Reports the cached qdisc configuration; rates are stored internally in
 * bytes/s, so multiply by 8 to report bits/s as the database expects. */
static int
hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
{
    const struct hfsc *hfsc;

    hfsc = hfsc_get__(netdev);
    smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
    return 0;
}
/* Reconfigures the root HFSC class from 'details' and, on success, updates
 * the cached max_rate.  Returns 0 on success. */
static int
hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
{
    int error;
    struct hfsc_class class;

    hfsc_parse_qdisc_details__(netdev, details, &class);
    error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
                               tc_make_handle(1, 0), &class);
    if (!error) {
        hfsc_get__(netdev)->max_rate = class.max_rate;
    }

    return error;
}
/* Fills 'details' with the configuration of 'queue'.  Internal rates are
 * bytes/s; the database speaks bits/s, hence the "* 8". */
static int
hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
               const struct tc_queue *queue, struct smap *details)
{
    const struct hfsc_class *hc;

    hc = hfsc_class_cast__(queue);
    smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
    if (hc->min_rate != hc->max_rate) {
        smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
    }

    return 0;
}
/* Creates or reconfigures queue 'queue_id' from 'details'.  Queue N maps to
 * kernel class handle 1:(N+1), parented on the root class 1:0xfffe. */
static int
hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
               const struct smap *details)
{
    int error;
    struct hfsc_class class;

    error = hfsc_parse_class_details__(netdev, details, &class);
    if (error) {
        return error;
    }

    error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
                               tc_make_handle(1, 0xfffe), &class);
    if (error) {
        return error;
    }

    hfsc_update_queue__(netdev, queue_id, &class);
    return 0;
}
/* Deletes 'queue' from the kernel and, on success, removes and frees the
 * cached class. */
static int
hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
{
    int error;
    struct hfsc *hfsc;
    struct hfsc_class *hc;

    hc = hfsc_class_cast__(queue);
    hfsc = hfsc_get__(netdev);

    error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
    if (!error) {
        hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
        free(hc);
    }
    return error;
}
/* Retrieves kernel statistics for 'queue' into 'stats'. */
static int
hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
                     struct netdev_queue_stats *stats)
{
    return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
                              tc_make_handle(1, 0xfffe), NULL, stats);
}
/* Parses one class message from a queue-stats dump and invokes 'cb' for it,
 * skipping handles that do not correspond to an OVS queue (major != 1 or
 * minor outside 1..HFSC_N_QUEUES). */
static int
hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
                      const struct ofpbuf *nlmsg,
                      netdev_dump_queue_stats_cb *cb, void *aux)
{
    struct netdev_queue_stats stats;
    unsigned int handle, major, minor;
    int error;

    error = tc_parse_class(nlmsg, &handle, NULL, &stats);
    if (error) {
        return error;
    }

    major = tc_get_major(handle);
    minor = tc_get_minor(handle);
    if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
        (*cb)(minor - 1, &stats, aux);
    }
    return 0;
}
/* tc_ops vtable for the HFSC qdisc, exposed as QoS type "linux-hfsc". */
static const struct tc_ops tc_ops_hfsc = {
    "hfsc",                     /* linux_name */
    "linux-hfsc",               /* ovs_name */
    HFSC_N_QUEUES,              /* n_queues */
    hfsc_tc_install,            /* tc_install */
    hfsc_tc_load,               /* tc_load */
    hfsc_tc_destroy,            /* tc_destroy */
    hfsc_qdisc_get,             /* qdisc_get */
    hfsc_qdisc_set,             /* qdisc_set */
    hfsc_class_get,             /* class_get */
    hfsc_class_set,             /* class_set */
    hfsc_class_delete,          /* class_delete */
    hfsc_class_get_stats,       /* class_get_stats */
    hfsc_class_dump_stats       /* class_dump_stats */
};
4492 /* "linux-default" traffic control class.
4494 * This class represents the default, unnamed Linux qdisc. It corresponds to
4495 * the "" (empty string) QoS type in the OVS database. */
/* Installs the shared, immutable "default" tc object on 'netdev_'. */
static void
default_install__(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);

    /* Nothing but a tc class implementation is allowed to write to a tc.  This
     * class never does that, so we can legitimately use a const tc object. */
    netdev->tc = CONST_CAST(struct tc *, &tc);
}
/* tc_install callback: 'details' are ignored; the default qdisc has no
 * configuration. */
static int
default_tc_install(struct netdev *netdev,
                   const struct smap *details OVS_UNUSED)
{
    default_install__(netdev);
    return 0;
}
/* tc_load callback: nothing to parse for the default qdisc. */
static int
default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
{
    default_install__(netdev);
    return 0;
}
/* tc_ops vtable for the default qdisc; no queues and no configuration, so
 * most operations are absent. */
static const struct tc_ops tc_ops_default = {
    NULL,                       /* linux_name */
    "",                         /* ovs_name */
    0,                          /* n_queues */
    default_tc_install,         /* tc_install */
    default_tc_load,            /* tc_load */
    NULL,                       /* tc_destroy */
    NULL,                       /* qdisc_get */
    NULL,                       /* qdisc_set */
    NULL,                       /* class_get */
    NULL,                       /* class_set */
    NULL,                       /* class_delete */
    NULL,                       /* class_get_stats */
    NULL                        /* class_dump_stats */
};
4539 /* "linux-other" traffic control class.
/* tc_load callback for unrecognized qdiscs: installs the shared, immutable
 * "other" tc object on 'netdev_'. */
static int
other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);

    /* Nothing but a tc class implementation is allowed to write to a tc.  This
     * class never does that, so we can legitimately use a const tc object. */
    netdev->tc = CONST_CAST(struct tc *, &tc);
    return 0;
}
/* tc_ops vtable for qdiscs OVS does not understand ("linux-other"); OVS can
 * only observe, not configure, such a qdisc. */
static const struct tc_ops tc_ops_other = {
    NULL,                       /* linux_name */
    "linux-other",              /* ovs_name */
    0,                          /* n_queues */
    NULL,                       /* tc_install */
    other_tc_load,              /* tc_load */
    NULL,                       /* tc_destroy */
    NULL,                       /* qdisc_get */
    NULL,                       /* qdisc_set */
    NULL,                       /* class_get */
    NULL,                       /* class_set */
    NULL,                       /* class_delete */
    NULL,                       /* class_get_stats */
    NULL                        /* class_dump_stats */
};
4571 /* Traffic control. */
4573 /* Number of kernel "tc" ticks per second. */
4574 static double ticks_per_s;
4576 /* Number of kernel "jiffies" per second. This is used for the purpose of
4577 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
4578 * one jiffy's worth of data.
4580 * There are two possibilities here:
4582 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
4583 * approximate range of 100 to 1024. That means that we really need to
4584 * make sure that the qdisc can buffer that much data.
4586 * - 'buffer_hz' is an absurdly large number. That means that the kernel
4587 * has finely granular timers and there's no need to fudge additional room
4588 * for buffers. (There's no extra effort needed to implement that: the
4589 * large 'buffer_hz' is used as a divisor, so practically any number will
4590 * come out as 0 in the division. Small integer results in the case of
4591 * really high dividends won't have any real effect anyhow.)
4593 static unsigned int buffer_hz;
/* Composes the tc handle 'major':'minor' (major in the upper 16 bits,
 * minor in the lower 16 bits). */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    unsigned int handle = TC_H_MAKE(major << 16, minor);
    return handle;
}
/* Extracts the major number (upper 16 bits) from 'handle'. */
static unsigned int
tc_get_major(unsigned int handle)
{
    unsigned int major_bits = TC_H_MAJ(handle);
    return major_bits >> 16;
}
/* Extracts the minor number (lower 16 bits) from 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    return handle & TC_H_MIN_MASK;
}
/* Initializes 'request' as an rtnetlink tc request of the given 'type'
 * (e.g. RTM_NEWQDISC) and 'flags' for 'netdev', returning a pointer to the
 * tcmsg header inside 'request' or NULL if the ifindex lookup fails.  The
 * caller must fill in tcm_handle and tcm_parent and eventually uninit or
 * transact 'request'. */
static struct tcmsg *
tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
                struct ofpbuf *request)
{
    struct tcmsg *tcmsg;
    int ifindex;
    int error;

    error = get_ifindex(netdev, &ifindex);
    if (error) {
        return NULL;
    }

    ofpbuf_init(request, 512);
    nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
    tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
    tcmsg->tcm_family = AF_UNSPEC;
    tcmsg->tcm_ifindex = ifindex;
    /* Caller should fill in tcmsg->tcm_handle. */
    /* Caller should fill in tcmsg->tcm_parent. */

    return tcmsg;
}
/* Executes 'request' on the NETLINK_ROUTE socket and uninitializes it.
 * On success a reply, if requested, is stored in '*replyp' (owned by the
 * caller).  Returns 0 on success, otherwise a positive errno value. */
static int
tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
{
    int error = nl_transact(NETLINK_ROUTE, request, replyp);
    ofpbuf_uninit(request);
    return error;
}
/* Adds or deletes a root ingress qdisc on 'netdev'.  We use this for
 * policing configuration.
 *
 * This function is equivalent to running the following when 'add' is true:
 *     /sbin/tc qdisc add dev <devname> handle ffff: ingress
 *
 * This function is equivalent to running the following when 'add' is false:
 *     /sbin/tc qdisc del dev <devname> handle ffff: ingress
 *
 * The configuration and stats may be seen with the following command:
 *     /sbin/tc -s qdisc show dev <devname>
 *
 * Returns 0 if successful, otherwise a positive errno value.
 */
static int
tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
{
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int error;
    int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
    int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;

    tcmsg = tc_make_request(netdev, type, flags, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
    tcmsg->tcm_parent = TC_H_INGRESS;
    nl_msg_put_string(&request, TCA_KIND, "ingress");
    nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);

    error = tc_transact(&request, NULL);
    if (error) {
        /* If we're deleting the qdisc, don't worry about some of the
         * error conditions. */
        if (!add && (error == ENOENT || error == EINVAL)) {
            return 0;
        }
        return error;
    }

    return 0;
}
/* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
 * of 'kbits_burst'.
 *
 * This function is equivalent to running:
 *     /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
 *              basic police rate <kbits_rate>kbit burst <kbits_burst>k
 *              mtu 65535 drop
 *
 * The configuration and stats may be seen with the following command:
 *     /sbin/tc -s filter show dev <devname> parent ffff:
 *
 * Returns 0 if successful, otherwise a positive errno value.
 */
static int
tc_add_policer(struct netdev *netdev,
               uint32_t kbits_rate, uint32_t kbits_burst)
{
    struct tc_police tc_police;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    size_t basic_offset;
    size_t police_offset;
    int error;
    int mtu = 65535;

    memset(&tc_police, 0, sizeof tc_police);
    tc_police.action = TC_POLICE_SHOT;  /* Drop packets over the rate. */
    tc_police.mtu = mtu;
    tc_fill_rate(&tc_police.rate, ((uint64_t) kbits_rate * 1000)/8, mtu);

    /* The following appears wrong in one way: In networking a kilobit is
     * usually 1000 bits but this uses 1024 bits.
     *
     * However if you "fix" those problems then "tc filter show ..." shows
     * "125000b", meaning 125,000 bits, when OVS configures it for 1000 kbit ==
     * 1,000,000 bits, whereas this actually ends up doing the right thing from
     * tc's point of view.  Whatever. */
    tc_police.burst = tc_bytes_to_ticks(
        tc_police.rate.rate, MIN(UINT32_MAX / 1024, kbits_burst) * 1024 / 8);

    tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
                            NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
    tcmsg->tcm_info = tc_make_handle(49,
                                     (OVS_FORCE uint16_t) htons(ETH_P_ALL));

    nl_msg_put_string(&request, TCA_KIND, "basic");
    basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
    nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
    tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
    nl_msg_end_nested(&request, police_offset);
    nl_msg_end_nested(&request, basic_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        return error;
    }

    return 0;
}
4761 /* The values in psched are not individually very meaningful, but they are
4762 * important. The tables below show some values seen in the wild.
4766 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
4767 * (Before that, there are hints that it was 1000000000.)
4769 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
4773 * -----------------------------------
4774 * [1] 000c8000 000f4240 000f4240 00000064
4775 * [2] 000003e8 00000400 000f4240 3b9aca00
4776 * [3] 000003e8 00000400 000f4240 3b9aca00
4777 * [4] 000003e8 00000400 000f4240 00000064
4778 * [5] 000003e8 00000040 000f4240 3b9aca00
4779 * [6] 000003e8 00000040 000f4240 000000f9
4781 * a b c d ticks_per_s buffer_hz
4782 * ------- --------- ---------- ------------- ----------- -------------
4783 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
4784 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4785 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4786 * [4] 1,000 1,024 1,000,000 100 976,562 100
4787 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
4788 * [6] 1,000 64 1,000,000 249 15,625,000 249
4790 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
4791 * [2] 2.6.26-1-686-bigmem from Debian lenny
4792 * [3] 2.6.26-2-sparc64 from Debian lenny
4793 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
4794 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
4795 * [6] 2.6.34 from kernel.org on KVM
4797 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4798 static const char fn[] = "/proc/net/psched";
4799 unsigned int a, b, c, d;
4802 if (!ovsthread_once_start(&once)) {
4809 stream = fopen(fn, "r");
4811 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
4815 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
4816 VLOG_WARN("%s: read failed", fn);
4820 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
4824 VLOG_WARN("%s: invalid scheduler parameters", fn);
4828 ticks_per_s = (double) a * c / b;
4832 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
4835 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
4838 ovsthread_once_done(&once);
/* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
 * rate of 'rate' bytes per second. */
static unsigned int
tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
{
    read_psched();              /* Ensure ticks_per_s is initialized. */
    return (rate * ticks) / ticks_per_s;
}
/* Returns the number of ticks that it would take to transmit 'size' bytes at a
 * rate of 'rate' bytes per second.  Returns 0 for a zero 'rate' to avoid
 * division by zero. */
static unsigned int
tc_bytes_to_ticks(unsigned int rate, unsigned int size)
{
    read_psched();              /* Ensure ticks_per_s is initialized. */
    return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
}
/* Returns the number of bytes that need to be reserved for qdisc buffering at
 * a transmission rate of 'rate' bytes per second (one jiffy's worth of
 * data; see the comment on 'buffer_hz' above). */
static unsigned int
tc_buffer_per_jiffy(unsigned int rate)
{
    read_psched();              /* Ensure buffer_hz is initialized. */
    return rate / buffer_hz;
}
/* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
 * e.g. "htb", into '*kind' (if it is nonnull).  If 'options' is nonnull,
 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
 * stores NULL into it if it is absent.
 *
 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
 * 'msg'.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
               struct nlattr **options)
{
    static const struct nl_policy tca_policy[] = {
        [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
        [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
    };
    struct nlattr *ta[ARRAY_SIZE(tca_policy)];

    if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
                         tca_policy, ta, ARRAY_SIZE(ta))) {
        VLOG_WARN_RL(&rl, "failed to parse qdisc message");
        goto error;
    }

    if (kind) {
        *kind = nl_attr_get_string(ta[TCA_KIND]);
    }

    if (options) {
        if (ta[TCA_OPTIONS]) {
            *options = ta[TCA_OPTIONS];
        } else {
            *options = NULL;
        }
    }

    return 0;

error:
    /* Leave the outputs in a well-defined state on failure. */
    if (kind) {
        *kind = NULL;
    }
    if (options) {
        *options = NULL;
    }
    return EPROTO;
}
/* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
 * into '*options', and its queue statistics into '*stats'.  Any of the output
 * arguments may be null.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
               struct nlattr **options, struct netdev_queue_stats *stats)
{
    static const struct nl_policy tca_policy[] = {
        [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
        [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
    };
    struct nlattr *ta[ARRAY_SIZE(tca_policy)];

    if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
                         tca_policy, ta, ARRAY_SIZE(ta))) {
        VLOG_WARN_RL(&rl, "failed to parse class message");
        goto error;
    }

    if (handlep) {
        struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
        *handlep = tc->tcm_handle;
    }

    if (options) {
        *options = ta[TCA_OPTIONS];
    }

    if (stats) {
        const struct gnet_stats_queue *gsq;
        struct gnet_stats_basic gsb;

        static const struct nl_policy stats_policy[] = {
            [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
                                  .min_len = sizeof gsb },
            [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
                                  .min_len = sizeof *gsq },
        };
        struct nlattr *sa[ARRAY_SIZE(stats_policy)];

        if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
                             sa, ARRAY_SIZE(sa))) {
            VLOG_WARN_RL(&rl, "failed to parse class stats");
            goto error;
        }

        /* Alignment issues screw up the length of struct gnet_stats_basic on
         * some arch/bitsize combinations.  Newer versions of Linux have a
         * struct gnet_stats_basic_packed, but we can't depend on that.  The
         * easiest thing to do is just to make a copy. */
        memset(&gsb, 0, sizeof gsb);
        memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
               MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
        stats->tx_bytes = gsb.bytes;
        stats->tx_packets = gsb.packets;

        gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
        stats->tx_errors = gsq->drops;
    }

    return 0;

error:
    /* Leave the outputs in a well-defined state on failure. */
    if (options) {
        *options = NULL;
    }
    if (stats) {
        memset(stats, 0, sizeof *stats);
    }
    return EPROTO;
}
/* Queries the kernel for the class with identifier 'handle' and parent
 * 'parent' on 'netdev' and stores the reply in '*replyp' (owned by the
 * caller).  Returns 0 if successful, otherwise a positive errno value. */
static int
tc_query_class(const struct netdev *netdev,
               unsigned int handle, unsigned int parent,
               struct ofpbuf **replyp)
{
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int error;

    tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = handle;
    tcmsg->tcm_parent = parent;

    error = tc_transact(&request, replyp);
    if (error) {
        VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
                     netdev_get_name(netdev),
                     tc_get_major(handle), tc_get_minor(handle),
                     tc_get_major(parent), tc_get_minor(parent),
                     ovs_strerror(error));
    }
    return error;
}
/* Equivalent to "tc class del dev <name> handle <handle>". */
static int
tc_delete_class(const struct netdev *netdev, unsigned int handle)
{
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int error;

    tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = handle;
    tcmsg->tcm_parent = 0;

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
                     netdev_get_name(netdev),
                     tc_get_major(handle), tc_get_minor(handle),
                     ovs_strerror(error));
    }
    return error;
}
/* Equivalent to "tc qdisc del dev <name> root".  Also destroys the local tc
 * state associated with the deleted qdisc, if any. */
static int
tc_del_qdisc(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int error;

    tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    error = tc_transact(&request, NULL);
    if (error == EINVAL) {
        /* EINVAL probably means that the default qdisc was in use, in which
         * case we've accomplished our purpose. */
        error = 0;
    }
    if (!error && netdev->tc) {
        if (netdev->tc->ops->tc_destroy) {
            netdev->tc->ops->tc_destroy(netdev->tc);
        }
        netdev->tc = NULL;
    }
    return error;
}
/* Returns true if it is safe to issue RTM_GETQDISC against a built-in qdisc,
 * which requires Linux 2.6.35 or later (see the comment in tc_query_qdisc()).
 * The kernel version is probed once and cached. */
static bool
getqdisc_is_safe(void)
{
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
    static bool safe = false;

    if (ovsthread_once_start(&once)) {
        struct utsname utsname;
        int major, minor;

        if (uname(&utsname) == -1) {
            VLOG_WARN("uname failed (%s)", ovs_strerror(errno));
        } else if (!ovs_scan(utsname.release, "%d.%d", &major, &minor)) {
            VLOG_WARN("uname reported bad OS release (%s)", utsname.release);
        } else if (major < 2 || (major == 2 && minor < 35)) {
            VLOG_INFO("disabling unsafe RTM_GETQDISC in Linux kernel %s",
                      utsname.release);
        } else {
            safe = true;
        }
        ovsthread_once_done(&once);
    }
    return safe;
}
/* If 'netdev''s qdisc type and parameters are not yet known, queries the
 * kernel to determine what they are.  Returns 0 if successful, otherwise a
 * positive errno value. */
static int
tc_query_qdisc(const struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct ofpbuf request, *qdisc;
    const struct tc_ops *ops;
    struct tcmsg *tcmsg;
    int load_error;
    int error;

    if (netdev->tc) {
        return 0;
    }

    /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
     * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
     * 2.6.35 without that fix backported to it.
     *
     * To avoid the OOPS, we must not make a request that would attempt to dump
     * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
     * few others.  There are a few ways that I can see to do this, but most of
     * them seem to be racy (and if you lose the race the kernel OOPSes).  The
     * technique chosen here is to assume that any non-default qdisc that we
     * create will have a class with handle 1:0.  The built-in qdiscs only have
     * a class with handle 0:0.
     *
     * On Linux 2.6.35+ we use the straightforward method because it allows us
     * to handle non-builtin qdiscs without handle 1:0 (e.g. codel).  However,
     * in such a case we get no response at all from the kernel (!) if a
     * builtin qdisc is in use (which is later caught by "!error &&
     * !qdisc->size"). */
    tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(getqdisc_is_safe() ? 0 : 1, 0);
    tcmsg->tcm_parent = getqdisc_is_safe() ? TC_H_ROOT : 0;

    /* Figure out what tc class to instantiate. */
    error = tc_transact(&request, &qdisc);
    if (!error && qdisc->size) {
        const char *kind;

        error = tc_parse_qdisc(qdisc, &kind, NULL);
        if (error) {
            ops = &tc_ops_other;
        } else {
            ops = tc_lookup_linux_name(kind);
            if (!ops) {
                static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
                VLOG_DBG_RL(&rl2, "unknown qdisc \"%s\"", kind);

                ops = &tc_ops_other;
            }
        }
    } else if ((!error && !qdisc->size) || error == ENOENT) {
        /* Either it's a built-in qdisc, or (on Linux pre-2.6.35) it's a qdisc
         * set up by some other entity that doesn't have a handle 1:0.  We will
         * assume that it's the system default qdisc. */
        ops = &tc_ops_default;
        error = 0;
    } else {
        /* Who knows?  Maybe the device got deleted. */
        VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
        ops = &tc_ops_other;
    }

    /* Instantiate it. */
    load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
    ovs_assert((load_error == 0) == (netdev->tc != NULL));
    ofpbuf_delete(qdisc);

    return error ? error : load_error;
}
5177 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
5178 approximate the time to transmit packets of various lengths. For an MTU of
5179 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
5180 represents two possible packet lengths; for a MTU of 513 through 1024, four
5181 possible lengths; and so on.
5183 Returns, for the specified 'mtu', the number of bits that packet lengths
5184 need to be shifted right to fit within such a 256-entry table. */
5186 tc_calc_cell_log(unsigned int mtu)
/* NOTE(review): the guard that triggers this default is elided from this
 * excerpt; presumably it fires when 'mtu' is 0. */
5191 mtu = ETH_PAYLOAD_MAX;
/* Account for link-layer framing: Ethernet header plus one VLAN tag. */
5193 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
/* 'cell_log' counts the right-shifts needed to bring 'mtu' below the
 * 256-entry table size (the loop body that halves 'mtu' is elided). */
5195 for (cell_log = 0; mtu >= 256; cell_log++) {
5202 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
5205 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
5207 memset(rate, 0, sizeof *rate);
/* Choose a cell log so that packet lengths up to 'mtu' index the 256-entry
 * rtab (see tc_calc_cell_log() above). */
5208 rate->cell_log = tc_calc_cell_log(mtu);
5209 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
5210 /* rate->cell_align = 0; */ /* distro headers. */
/* Minimum packet unit: never account for less than a minimum-size
 * Ethernet frame. */
5211 rate->mpu = ETH_TOTAL_MIN;
5215 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
5216 * attribute of the specified "type".
5218 * See tc_calc_cell_log() above for a description of "rtab"s. */
5220 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
/* Reserve TC_RTAB_SIZE bytes for the table directly in the message, then
 * fill the entries in place. */
5225 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
5226 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
/* Entry i covers packet lengths up to (i + 1) << cell_log bytes... */
5227 unsigned packet_size = (i + 1) << rate->cell_log;
/* ...but never less than the minimum packet unit. */
5228 if (packet_size < rate->mpu) {
5229 packet_size = rate->mpu;
/* Each entry holds the time, in ticks, to transmit 'packet_size' bytes
 * at 'rate->rate' bytes per second. */
5231 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
5235 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
5236 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
5237 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
5240 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
/* Enforce a floor of one jiffy's worth of traffic plus one MTU; a smaller
 * buffer could not sustain 'Bps' between scheduler ticks. */
5242 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
5243 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
5246 /* Linux-only functions declared in netdev-linux.h */
5248 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
5249 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
5251 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
5252 const char *flag_name, bool enable)
5254 const char *netdev_name = netdev_get_name(netdev);
5255 struct ethtool_value evalue;
/* Step 1: read the device's current ethtool flags (ETHTOOL_GFLAGS). */
5259 COVERAGE_INC(netdev_get_ethtool);
5260 memset(&evalue, 0, sizeof evalue);
5261 error = netdev_linux_do_ethtool(netdev_name,
5262 (struct ethtool_cmd *)&evalue,
5263 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
/* Step 2: compute the new flag word and write it back, but only if the
 * requested bit actually changes anything. */
5268 COVERAGE_INC(netdev_set_ethtool);
5269 new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
5270 if (new_flags == evalue.data) {
5273 evalue.data = new_flags;
5274 error = netdev_linux_do_ethtool(netdev_name,
5275 (struct ethtool_cmd *)&evalue,
5276 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
/* Step 3: re-read the flags to verify the device accepted the change
 * (presumably some drivers ignore ETHTOOL_SFLAGS without reporting an
 * error), warning if the readback does not match. */
5281 COVERAGE_INC(netdev_get_ethtool);
5282 memset(&evalue, 0, sizeof evalue);
5283 error = netdev_linux_do_ethtool(netdev_name,
5284 (struct ethtool_cmd *)&evalue,
5285 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5290 if (new_flags != evalue.data) {
5291 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
5292 "device %s failed", enable ? "enable" : "disable",
5293 flag_name, netdev_name);
5300 /* Utility functions. */
5302 /* Copies 'src' into 'dst', performing format conversion in the process. */
/* Field-by-field copy from the kernel's 32-bit rtnl_link_stats layout
 * (Linux rtnetlink UAPI) into our netdev_stats structure; each assignment
 * converts one counter to the destination field's type. */
5304 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
5305 const struct rtnl_link_stats *src)
5307 dst->rx_packets = src->rx_packets;
5308 dst->tx_packets = src->tx_packets;
5309 dst->rx_bytes = src->rx_bytes;
5310 dst->tx_bytes = src->tx_bytes;
5311 dst->rx_errors = src->rx_errors;
5312 dst->tx_errors = src->tx_errors;
5313 dst->rx_dropped = src->rx_dropped;
5314 dst->tx_dropped = src->tx_dropped;
5315 dst->multicast = src->multicast;
5316 dst->collisions = src->collisions;
5317 dst->rx_length_errors = src->rx_length_errors;
5318 dst->rx_over_errors = src->rx_over_errors;
5319 dst->rx_crc_errors = src->rx_crc_errors;
5320 dst->rx_frame_errors = src->rx_frame_errors;
5321 dst->rx_fifo_errors = src->rx_fifo_errors;
5322 dst->rx_missed_errors = src->rx_missed_errors;
5323 dst->tx_aborted_errors = src->tx_aborted_errors;
5324 dst->tx_carrier_errors = src->tx_carrier_errors;
5325 dst->tx_fifo_errors = src->tx_fifo_errors;
5326 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
5327 dst->tx_window_errors = src->tx_window_errors;
5330 /* Copies 'src' into 'dst', performing format conversion in the process. */
/* Same field-by-field copy as netdev_stats_from_rtnl_link_stats() above,
 * but from the kernel's 64-bit rtnl_link_stats64 layout (the IFLA_STATS64
 * attribute), so no counter truncation has occurred on the kernel side. */
5332 netdev_stats_from_rtnl_link_stats64(struct netdev_stats *dst,
5333 const struct rtnl_link_stats64 *src)
5335 dst->rx_packets = src->rx_packets;
5336 dst->tx_packets = src->tx_packets;
5337 dst->rx_bytes = src->rx_bytes;
5338 dst->tx_bytes = src->tx_bytes;
5339 dst->rx_errors = src->rx_errors;
5340 dst->tx_errors = src->tx_errors;
5341 dst->rx_dropped = src->rx_dropped;
5342 dst->tx_dropped = src->tx_dropped;
5343 dst->multicast = src->multicast;
5344 dst->collisions = src->collisions;
5345 dst->rx_length_errors = src->rx_length_errors;
5346 dst->rx_over_errors = src->rx_over_errors;
5347 dst->rx_crc_errors = src->rx_crc_errors;
5348 dst->rx_frame_errors = src->rx_frame_errors;
5349 dst->rx_fifo_errors = src->rx_fifo_errors;
5350 dst->rx_missed_errors = src->rx_missed_errors;
5351 dst->tx_aborted_errors = src->tx_aborted_errors;
5352 dst->tx_carrier_errors = src->tx_carrier_errors;
5353 dst->tx_fifo_errors = src->tx_fifo_errors;
5354 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
5355 dst->tx_window_errors = src->tx_window_errors;
/* Retrieves interface statistics for 'netdev_' into 'stats' by sending an
 * RTM_GETLINK request over a NETLINK_ROUTE socket and parsing the stats
 * attribute out of the reply.  The nl_transact() result is kept in 'error'
 * (return statements are elided from this excerpt). */
5359 get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
5361 struct ofpbuf request;
5362 struct ofpbuf *reply;
5365 /* Filtering all counters by default */
/* Every byte set to 0xFF makes each counter all-ones, presumably the
 * "statistic not available" marker for counters the kernel omits. */
5366 memset(stats, 0xFF, sizeof(struct netdev_stats));
/* Build an RTM_GETLINK request identifying the device by name. */
5368 ofpbuf_init(&request, 0);
5369 nl_msg_put_nlmsghdr(&request,
5370 sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
5371 RTM_GETLINK, NLM_F_REQUEST);
5372 ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
5373 nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
5374 error = nl_transact(NETLINK_ROUTE, &request, &reply);
5375 ofpbuf_uninit(&request);
/* Skip the netlink and ifinfomsg headers, then prefer the 64-bit
 * IFLA_STATS64 attribute, falling back to the legacy 32-bit IFLA_STATS. */
5380 if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
5381 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS64);
5382 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats64)) {
5383 netdev_stats_from_rtnl_link_stats64(stats, nl_attr_get(a));
5386 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS);
5387 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
5388 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
5391 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
5396 VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");
5401 ofpbuf_delete(reply);
/* Retrieves the SIOCGIFFLAGS interface flags of 'dev' into '*flags',
 * returning af_inet_ifreq_ioctl()'s result. */
5406 get_flags(const struct netdev *dev, unsigned int *flags)
5412 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
/* NOTE(review): the success check guarding this store is elided from the
 * excerpt; '*flags' is presumably only meaningful when 'error' is 0. */
5414 *flags = ifr.ifr_flags;
/* Sets the interface flags of the device named 'name' to 'flags' via
 * SIOCSIFFLAGS, returning af_inet_ifreq_ioctl()'s result directly. */
5420 set_flags(const char *name, unsigned int flags)
5424 ifr.ifr_flags = flags;
5425 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
/* Looks up the interface index of 'netdev_name' with SIOCGIFINDEX.
 * Returns the interface index on success; on failure, logs a rate-limited
 * warning and returns a negative errno value (get_ifindex() below negates
 * a negative result back into an errno; the error-return line itself is
 * elided from this excerpt). */
5429 do_get_ifindex(const char *netdev_name)
5434 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5435 COVERAGE_INC(netdev_get_ifindex);
5437 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
5439 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
5440 netdev_name, ovs_strerror(error));
5443 return ifr.ifr_ifindex;
/* Stores 'netdev_''s interface index in '*ifindexp' and returns the errno
 * value (0 on success) recorded when the index was looked up.  Both the
 * index and the error are cached under the VALID_IFINDEX bit so the
 * SIOCGIFINDEX ioctl runs at most once per device. */
5447 get_ifindex(const struct netdev *netdev_, int *ifindexp)
5449 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5451 if (!(netdev->cache_valid & VALID_IFINDEX)) {
5452 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
/* A negative do_get_ifindex() result is a negated errno value. */
5455 netdev->get_ifindex_error = -ifindex;
5456 netdev->ifindex = 0;
5458 netdev->get_ifindex_error = 0;
5459 netdev->ifindex = ifindex;
5461 netdev->cache_valid |= VALID_IFINDEX;
5464 *ifindexp = netdev->ifindex;
5465 return netdev->get_ifindex_error;
/* Retrieves the Ethernet (MAC) address of device 'netdev_name' into 'ea'
 * using SIOCGIFHWADDR.  Error-path return statements are elided from this
 * excerpt; presumably returns 0 on success, otherwise a positive errno
 * value. */
5469 get_etheraddr(const char *netdev_name, struct eth_addr *ea)
5475 memset(&ifr, 0, sizeof ifr);
5476 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5477 COVERAGE_INC(netdev_get_hwaddr);
5478 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
5480 /* ENODEV probably means that a vif disappeared asynchronously and
5481 * hasn't been removed from the database yet, so reduce the log level
5482 * to INFO for that case. */
5483 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
5484 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
5485 netdev_name, ovs_strerror(error));
/* Only Ethernet (ARPHRD_ETHER) or unspecified hardware address families
 * carry a MAC address we can use. */
5488 hwaddr_family = ifr.ifr_hwaddr.sa_family;
5489 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
5490 VLOG_INFO("%s device has unknown hardware address family %d",
5491 netdev_name, hwaddr_family);
5494 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the Ethernet address of device 'netdev_name' to 'mac' using
 * SIOCSIFHWADDR, logging a failure at ERR level.  The ioctl's errno result
 * is kept in 'error' (the return line is elided from this excerpt). */
5499 set_etheraddr(const char *netdev_name, const struct eth_addr mac)
5504 memset(&ifr, 0, sizeof ifr);
5505 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
/* The kernel requires the address family to be filled in for SIOCSIFHWADDR. */
5506 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
5507 memcpy(ifr.ifr_hwaddr.sa_data, &mac, ETH_ADDR_LEN);
5508 COVERAGE_INC(netdev_set_hwaddr);
5509 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
5511 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
5512 netdev_name, ovs_strerror(error));
/* Issues ethtool sub-command 'cmd' (named 'cmd_name' for logging) on the
 * device 'name', passing 'ecmd' as the command block for both input and
 * output.  EOPNOTSUPP is deliberately not logged since many devices lack
 * ethtool support; other failures get a rate-limited warning. */
5518 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
5519 int cmd, const char *cmd_name)
5524 memset(&ifr, 0, sizeof ifr);
5525 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
/* SIOCETHTOOL passes the ethtool command block through ifr_data. */
5526 ifr.ifr_data = (caddr_t) ecmd;
5529 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
5531 if (error != EOPNOTSUPP) {
5532 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
5533 "failed: %s", cmd_name, name, ovs_strerror(error));
5535 /* The device doesn't support this operation. That's pretty
5536 * common, so there's no point in logging anything. */
5542 /* Returns an AF_PACKET raw socket or a negative errno value. */
5544 af_packet_sock(void)
5546 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5549 if (ovsthread_once_start(&once)) {
5550 sock = socket(AF_PACKET, SOCK_RAW, 0);
5552 int error = set_nonblocking(sock);
5559 VLOG_ERR("failed to create packet socket: %s",
5560 ovs_strerror(errno));
5562 ovsthread_once_done(&once);