/*
 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2015 Nicira, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
19 #include "netdev-linux.h"
23 #include <arpa/inet.h>
25 #include <linux/filter.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <sys/types.h>
37 #include <sys/ioctl.h>
38 #include <sys/socket.h>
39 #include <sys/utsname.h>
40 #include <netpacket/packet.h>
42 #include <net/if_arp.h>
43 #include <net/if_packet.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
52 #include "dp-packet.h"
53 #include "dpif-netlink.h"
54 #include "dpif-netdev.h"
55 #include "dynamic-string.h"
56 #include "fatal-signal.h"
59 #include "netdev-provider.h"
60 #include "netdev-vport.h"
61 #include "netlink-notifier.h"
62 #include "netlink-socket.h"
65 #include "openflow/openflow.h"
66 #include "ovs-atomic.h"
68 #include "poll-loop.h"
69 #include "rtnetlink.h"
71 #include "socket-util.h"
74 #include "unaligned.h"
75 #include "openvswitch/vlog.h"
VLOG_DEFINE_THIS_MODULE(netdev_linux);

/* Coverage counters; incremented where the corresponding ioctl/ethtool
 * operations are performed elsewhere in this file. */
COVERAGE_DEFINE(netdev_set_policing);
COVERAGE_DEFINE(netdev_arp_lookup);
COVERAGE_DEFINE(netdev_get_ifindex);
COVERAGE_DEFINE(netdev_get_hwaddr);
COVERAGE_DEFINE(netdev_set_hwaddr);
COVERAGE_DEFINE(netdev_get_ethtool);
COVERAGE_DEFINE(netdev_set_ethtool);
/* These were introduced in Linux 2.6.14, so they might be missing if we have
 * old headers. */
#ifndef ADVERTISED_Pause
#define ADVERTISED_Pause                (1 << 13)
#endif
#ifndef ADVERTISED_Asym_Pause
#define ADVERTISED_Asym_Pause           (1 << 14)
#endif

/* These were introduced in Linux 2.6.24, so they might be missing if we
 * have old headers. */
#ifndef ETHTOOL_GFLAGS
#define ETHTOOL_GFLAGS       0x00000025 /* Get flags bitmap(ethtool_value) */
#endif
#ifndef ETHTOOL_SFLAGS
#define ETHTOOL_SFLAGS       0x00000026 /* Set flags bitmap(ethtool_value) */
#endif

/* This was introduced in Linux 2.6.25, so it might be missing if we have old
 * headers. */
#ifndef TC_RTAB_SIZE
#define TC_RTAB_SIZE 1024
#endif
/* Linux 2.6.21 introduced struct tpacket_auxdata.
 * Linux 2.6.27 added the tp_vlan_tci member.
 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
 * TP_STATUS_VLAN_TPID_VALID.
 *
 * With all this churn it's easiest to unconditionally define a replacement
 * structure that has everything we want. */
#ifndef PACKET_AUXDATA
#define PACKET_AUXDATA 8
#endif
#ifndef TP_STATUS_VLAN_VALID
#define TP_STATUS_VLAN_VALID (1 << 4)
#endif
#ifndef TP_STATUS_VLAN_TPID_VALID
#define TP_STATUS_VLAN_TPID_VALID (1 << 6)
#endif
#undef tpacket_auxdata
#define tpacket_auxdata rpl_tpacket_auxdata
struct tpacket_auxdata {
    uint32_t tp_status;
    uint32_t tp_len;
    uint32_t tp_snaplen;
    uint16_t tp_mac;
    uint16_t tp_net;
    uint16_t tp_vlan_tci;
    uint16_t tp_vlan_tpid;
};
/* Linux 2.6.27 introduced ethtool_cmd_speed
 *
 * To avoid revisiting problems reported with using configure to detect
 * compatibility (see report at
 * http://openvswitch.org/pipermail/dev/2014-October/047978.html)
 * unconditionally replace ethtool_cmd_speed. */
#define ethtool_cmd_speed rpl_ethtool_cmd_speed
static inline uint32_t rpl_ethtool_cmd_speed(const struct ethtool_cmd *ep)
{
    /* The link speed is split across the 16-bit 'speed' (low bits) and
     * 'speed_hi' (high bits) fields of struct ethtool_cmd. */
    return ep->speed | (ep->speed_hi << 16);
}
/* Linux 2.6.30 introduced supported and advertised flags for
 * 1G base KX, and 10G base KX4, KR and R. */
#ifndef SUPPORTED_1000baseKX_Full
#define SUPPORTED_1000baseKX_Full      (1 << 17)
#define SUPPORTED_10000baseKX4_Full    (1 << 18)
#define SUPPORTED_10000baseKR_Full     (1 << 19)
#define SUPPORTED_10000baseR_FEC       (1 << 20)
#define ADVERTISED_1000baseKX_Full     (1 << 17)
#define ADVERTISED_10000baseKX4_Full   (1 << 18)
#define ADVERTISED_10000baseKR_Full    (1 << 19)
#define ADVERTISED_10000baseR_FEC      (1 << 20)
#endif

/* Linux 3.5 introduced supported and advertised flags for
 * 40G base KR4, CR4, SR4 and LR4. */
#ifndef SUPPORTED_40000baseKR4_Full
#define SUPPORTED_40000baseKR4_Full    (1 << 23)
#define SUPPORTED_40000baseCR4_Full    (1 << 24)
#define SUPPORTED_40000baseSR4_Full    (1 << 25)
#define SUPPORTED_40000baseLR4_Full    (1 << 26)
#define ADVERTISED_40000baseKR4_Full   (1 << 23)
#define ADVERTISED_40000baseCR4_Full   (1 << 24)
#define ADVERTISED_40000baseSR4_Full   (1 << 25)
#define ADVERTISED_40000baseLR4_Full   (1 << 26)
#endif
/* Linux 2.6.35 introduced IFLA_STATS64 and rtnl_link_stats64.
 *
 * Tests for rtnl_link_stats64 don't seem to consistently work, e.g. on
 * 2.6.32-431.29.2.el6.x86_64 (see report at
 * http://openvswitch.org/pipermail/dev/2014-October/047978.html).  Maybe
 * if_link.h is not self-contained on those kernels.  It is easiest to
 * unconditionally define a replacement. */
#ifndef IFLA_STATS64
#define IFLA_STATS64 23
#endif
#define rtnl_link_stats64 rpl_rtnl_link_stats64
struct rtnl_link_stats64 {
    uint64_t rx_packets;
    uint64_t tx_packets;
    uint64_t rx_bytes;
    uint64_t tx_bytes;
    uint64_t rx_errors;
    uint64_t tx_errors;
    uint64_t rx_dropped;
    uint64_t tx_dropped;
    uint64_t multicast;
    uint64_t collisions;

    /* Detailed rx_errors. */
    uint64_t rx_length_errors;
    uint64_t rx_over_errors;
    uint64_t rx_crc_errors;
    uint64_t rx_frame_errors;
    uint64_t rx_fifo_errors;
    uint64_t rx_missed_errors;

    /* Detailed tx_errors. */
    uint64_t tx_aborted_errors;
    uint64_t tx_carrier_errors;
    uint64_t tx_fifo_errors;
    uint64_t tx_heartbeat_errors;
    uint64_t tx_window_errors;

    /* For cslip etc. */
    uint64_t rx_compressed;
    uint64_t tx_compressed;
};
/* Bits in a 'struct netdev_linux' 'cache_valid' member saying which of the
 * on-demand cached fields currently hold valid values.
 * (VALID_IN4, VALID_IN6 and VALID_MTU reconstructed; middle lines were lost
 * in extraction — values follow the visible 1 << 0 ... 1 << 8 sequence.) */
enum {
    VALID_IFINDEX           = 1 << 0,
    VALID_ETHERADDR         = 1 << 1,
    VALID_IN4               = 1 << 2,
    VALID_IN6               = 1 << 3,
    VALID_MTU               = 1 << 4,
    VALID_POLICING          = 1 << 5,
    VALID_VPORT_STAT_ERROR  = 1 << 6,
    VALID_DRVINFO           = 1 << 7,
    VALID_FEATURES          = 1 << 8,
};
232 /* Traffic control. */
234 /* An instance of a traffic control class. Always associated with a particular
237 * Each TC implementation subclasses this with whatever additional data it
240 const struct tc_ops *ops;
241 struct hmap queues; /* Contains "struct tc_queue"s.
242 * Read by generic TC layer.
243 * Written only by TC implementation. */
246 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
248 /* One traffic control queue.
250 * Each TC implementation subclasses this with whatever additional data it
253 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
254 unsigned int queue_id; /* OpenFlow queue ID. */
255 long long int created; /* Time queue was created, in msecs. */
258 /* A particular kind of traffic control. Each implementation generally maps to
259 * one particular Linux qdisc class.
261 * The functions below return 0 if successful or a positive errno value on
262 * failure, except where otherwise noted. All of them must be provided, except
263 * where otherwise noted. */
265 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
266 * This is null for tc_ops_default and tc_ops_other, for which there are no
267 * appropriate values. */
268 const char *linux_name;
270 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
271 const char *ovs_name;
273 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
274 * queues. The queues are numbered 0 through n_queues - 1. */
275 unsigned int n_queues;
277 /* Called to install this TC class on 'netdev'. The implementation should
278 * make the Netlink calls required to set up 'netdev' with the right qdisc
279 * and configure it according to 'details'. The implementation may assume
280 * that the current qdisc is the default; that is, there is no need for it
281 * to delete the current qdisc before installing itself.
283 * The contents of 'details' should be documented as valid for 'ovs_name'
284 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
285 * (which is built as ovs-vswitchd.conf.db(8)).
287 * This function must return 0 if and only if it sets 'netdev->tc' to an
288 * initialized 'struct tc'.
290 * (This function is null for tc_ops_other, which cannot be installed. For
291 * other TC classes it should always be nonnull.) */
292 int (*tc_install)(struct netdev *netdev, const struct smap *details);
294 /* Called when the netdev code determines (through a Netlink query) that
295 * this TC class's qdisc is installed on 'netdev', but we didn't install
296 * it ourselves and so don't know any of the details.
298 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
299 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
300 * implementation should parse the other attributes of 'nlmsg' as
301 * necessary to determine its configuration. If necessary it should also
302 * use Netlink queries to determine the configuration of queues on
305 * This function must return 0 if and only if it sets 'netdev->tc' to an
306 * initialized 'struct tc'. */
307 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
309 /* Destroys the data structures allocated by the implementation as part of
310 * 'tc'. (This includes destroying 'tc->queues' by calling
313 * The implementation should not need to perform any Netlink calls. If
314 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
315 * (But it may not be desirable.)
317 * This function may be null if 'tc' is trivial. */
318 void (*tc_destroy)(struct tc *tc);
320 /* Retrieves details of 'netdev->tc' configuration into 'details'.
322 * The implementation should not need to perform any Netlink calls, because
323 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
324 * cached the configuration.
326 * The contents of 'details' should be documented as valid for 'ovs_name'
327 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
328 * (which is built as ovs-vswitchd.conf.db(8)).
330 * This function may be null if 'tc' is not configurable.
332 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
334 /* Reconfigures 'netdev->tc' according to 'details', performing any
335 * required Netlink calls to complete the reconfiguration.
337 * The contents of 'details' should be documented as valid for 'ovs_name'
338 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
339 * (which is built as ovs-vswitchd.conf.db(8)).
341 * This function may be null if 'tc' is not configurable.
343 int (*qdisc_set)(struct netdev *, const struct smap *details);
345 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
346 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
348 * The contents of 'details' should be documented as valid for 'ovs_name'
349 * in the "other_config" column in the "Queue" table in
350 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
352 * The implementation should not need to perform any Netlink calls, because
353 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
354 * cached the queue configuration.
356 * This function may be null if 'tc' does not have queues ('n_queues' is
358 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
359 struct smap *details);
361 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
362 * 'details', perfoming any required Netlink calls to complete the
363 * reconfiguration. The caller ensures that 'queue_id' is less than
366 * The contents of 'details' should be documented as valid for 'ovs_name'
367 * in the "other_config" column in the "Queue" table in
368 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
370 * This function may be null if 'tc' does not have queues or its queues are
371 * not configurable. */
372 int (*class_set)(struct netdev *, unsigned int queue_id,
373 const struct smap *details);
375 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
376 * tc_queue's within 'netdev->tc->queues'.
378 * This function may be null if 'tc' does not have queues or its queues
379 * cannot be deleted. */
380 int (*class_delete)(struct netdev *, struct tc_queue *queue);
382 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
383 * 'struct tc_queue's within 'netdev->tc->queues'.
385 * On success, initializes '*stats'.
387 * This function may be null if 'tc' does not have queues or if it cannot
388 * report queue statistics. */
389 int (*class_get_stats)(const struct netdev *netdev,
390 const struct tc_queue *queue,
391 struct netdev_queue_stats *stats);
393 /* Extracts queue stats from 'nlmsg', which is a response to a
394 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
396 * This function may be null if 'tc' does not have queues or if it cannot
397 * report queue statistics. */
398 int (*class_dump_stats)(const struct netdev *netdev,
399 const struct ofpbuf *nlmsg,
400 netdev_dump_queue_stats_cb *cb, void *aux);
404 tc_init(struct tc *tc, const struct tc_ops *ops)
407 hmap_init(&tc->queues);
411 tc_destroy(struct tc *tc)
413 hmap_destroy(&tc->queues);
416 static const struct tc_ops tc_ops_htb;
417 static const struct tc_ops tc_ops_hfsc;
418 static const struct tc_ops tc_ops_codel;
419 static const struct tc_ops tc_ops_fqcodel;
420 static const struct tc_ops tc_ops_sfq;
421 static const struct tc_ops tc_ops_default;
422 static const struct tc_ops tc_ops_other;
424 static const struct tc_ops *const tcs[] = {
425 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
426 &tc_ops_hfsc, /* Hierarchical fair service curve. */
427 &tc_ops_codel, /* Controlled delay */
428 &tc_ops_fqcodel, /* Fair queue controlled delay */
429 &tc_ops_sfq, /* Stochastic fair queueing */
430 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
431 &tc_ops_other, /* Some other qdisc. */
/* Helpers for composing and decomposing TC handles (major/minor pairs). */
static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
static unsigned int tc_get_major(unsigned int handle);
static unsigned int tc_get_minor(unsigned int handle);

/* Conversions between byte counts and kernel ticks at a given rate. */
static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
static unsigned int tc_buffer_per_jiffy(unsigned int rate);

/* Construction and execution of rtnetlink TC requests. */
static struct tcmsg *tc_make_request(const struct netdev *, int type,
                                     unsigned int flags, struct ofpbuf *);
static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
static int tc_add_policer(struct netdev *,
                          uint32_t kbits_rate, uint32_t kbits_burst);

/* Parsing and querying of qdisc and class state. */
static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
                          struct nlattr **options);
static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
                          struct nlattr **options,
                          struct netdev_queue_stats *);
static int tc_query_class(const struct netdev *,
                          unsigned int handle, unsigned int parent,
                          struct ofpbuf **replyp);
static int tc_delete_class(const struct netdev *, unsigned int handle);

static int tc_del_qdisc(struct netdev *netdev);
static int tc_query_qdisc(const struct netdev *netdev);

/* Rate-table computations used when programming rate-limited qdiscs. */
static int tc_calc_cell_log(unsigned int mtu);
static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
static void tc_put_rtab(struct ofpbuf *, uint16_t type,
                        const struct tc_ratespec *rate);
static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
469 struct netdev_linux {
472 /* Protects all members below. */
473 struct ovs_mutex mutex;
475 unsigned int cache_valid;
477 bool miimon; /* Link status of last poll. */
478 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
479 struct timer miimon_timer;
481 /* The following are figured out "on demand" only. They are only valid
482 * when the corresponding VALID_* bit in 'cache_valid' is set. */
484 struct eth_addr etheraddr;
485 struct in_addr address, netmask;
488 unsigned int ifi_flags;
489 long long int carrier_resets;
490 uint32_t kbits_rate; /* Policing data. */
491 uint32_t kbits_burst;
492 int vport_stats_error; /* Cached error code from vport_get_stats().
493 0 or an errno value. */
494 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
495 int ether_addr_error; /* Cached error code from set/get etheraddr. */
496 int netdev_policing_error; /* Cached error code from set policing. */
497 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
498 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
499 int in4_error; /* Cached error code from reading in4 addr. */
500 int in6_error; /* Cached error code from reading in6 addr. */
502 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
503 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
504 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
506 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
509 /* For devices of class netdev_tap_class only. */
513 struct netdev_rxq_linux {
514 struct netdev_rxq up;
/* This is set pretty low because we probably won't learn anything from the
 * additional log messages. */
static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);

/* Polling miimon status for all ports causes performance degradation when
 * handling a large number of ports. If there are no devices using miimon, then
 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
 *
 * Readers do not depend on this variable synchronizing with the related
 * changes in the device miimon status, so we can use atomic_count. */
static atomic_count miimon_cnt = ATOMIC_COUNT_INIT(0);

static void netdev_linux_run(void);

/* ioctl/ethtool helpers defined later in this file. */
static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
                                   int cmd, const char *cmd_name);
static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
                                 int cmd, const char *cmd_name);
static int get_flags(const struct netdev *, unsigned int *flags);
static int set_flags(const char *, unsigned int flags);
static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
                        enum netdev_flags on, enum netdev_flags *old_flagsp)
    OVS_REQUIRES(netdev->mutex);
static int do_get_ifindex(const char *netdev_name);
static int get_ifindex(const struct netdev *, int *ifindexp);
static int do_set_addr(struct netdev *netdev,
                       int ioctl_nr, const char *ioctl_name,
                       struct in_addr addr);
static int get_etheraddr(const char *netdev_name, struct eth_addr *ea);
static int set_etheraddr(const char *netdev_name, const struct eth_addr);
static int get_stats_via_netlink(const struct netdev *, struct netdev_stats *);
static int af_packet_sock(void);

/* Miimon link-status polling helpers. */
static bool netdev_linux_miimon_enabled(void);
static void netdev_linux_miimon_run(void);
static void netdev_linux_miimon_wait(void);
static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup);
557 is_netdev_linux_class(const struct netdev_class *netdev_class)
559 return netdev_class->run == netdev_linux_run;
563 is_tap_netdev(const struct netdev *netdev)
565 return netdev_get_class(netdev) == &netdev_tap_class;
568 static struct netdev_linux *
569 netdev_linux_cast(const struct netdev *netdev)
571 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
573 return CONTAINER_OF(netdev, struct netdev_linux, up);
576 static struct netdev_rxq_linux *
577 netdev_rxq_linux_cast(const struct netdev_rxq *rx)
579 ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
580 return CONTAINER_OF(rx, struct netdev_rxq_linux, up);
/* Applies a change reported by rtnetlink to 'netdev''s cached state. */
static void netdev_linux_update(struct netdev_linux *netdev,
                                const struct rtnetlink_change *)
    OVS_REQUIRES(netdev->mutex);
/* Records new interface flags and masks 'cache_valid' down to 'mask'. */
static void netdev_linux_changed(struct netdev_linux *netdev,
                                 unsigned int ifi_flags, unsigned int mask)
    OVS_REQUIRES(netdev->mutex);
590 /* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK,
591 * RTNLGRP_IPV4_IFADDR and RTNLGRP_IPV6_IFADDR changes, or NULL
592 * if no such socket could be created. */
593 static struct nl_sock *
594 netdev_linux_notify_sock(void)
596 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
597 static struct nl_sock *sock;
598 unsigned int mcgroups[] = {RTNLGRP_LINK, RTNLGRP_IPV4_IFADDR,
599 RTNLGRP_IPV6_IFADDR, RTNLGRP_IPV6_IFINFO};
601 if (ovsthread_once_start(&once)) {
604 error = nl_sock_create(NETLINK_ROUTE, &sock);
608 for (i = 0; i < ARRAY_SIZE(mcgroups); i++) {
609 error = nl_sock_join_mcgroup(sock, mcgroups[i]);
611 nl_sock_destroy(sock);
617 ovsthread_once_done(&once);
624 netdev_linux_miimon_enabled(void)
626 return atomic_count_get(&miimon_cnt) > 0;
630 netdev_linux_run(void)
632 struct nl_sock *sock;
635 if (netdev_linux_miimon_enabled()) {
636 netdev_linux_miimon_run();
639 sock = netdev_linux_notify_sock();
645 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
646 uint64_t buf_stub[4096 / 8];
649 ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
650 error = nl_sock_recv(sock, &buf, false);
652 struct rtnetlink_change change;
654 if (rtnetlink_parse(&buf, &change)) {
655 struct netdev *netdev_ = NULL;
656 char dev_name[IFNAMSIZ];
658 if (!change.ifname) {
659 change.ifname = if_indextoname(change.if_index, dev_name);
663 netdev_ = netdev_from_name(change.ifname);
665 if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
666 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
668 ovs_mutex_lock(&netdev->mutex);
669 netdev_linux_update(netdev, &change);
670 ovs_mutex_unlock(&netdev->mutex);
672 netdev_close(netdev_);
674 } else if (error == ENOBUFS) {
675 struct shash device_shash;
676 struct shash_node *node;
680 shash_init(&device_shash);
681 netdev_get_devices(&netdev_linux_class, &device_shash);
682 SHASH_FOR_EACH (node, &device_shash) {
683 struct netdev *netdev_ = node->data;
684 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
687 ovs_mutex_lock(&netdev->mutex);
688 get_flags(netdev_, &flags);
689 netdev_linux_changed(netdev, flags, 0);
690 ovs_mutex_unlock(&netdev->mutex);
692 netdev_close(netdev_);
694 shash_destroy(&device_shash);
695 } else if (error != EAGAIN) {
696 VLOG_WARN_RL(&rl, "error reading or parsing netlink (%s)",
697 ovs_strerror(error));
704 netdev_linux_wait(void)
706 struct nl_sock *sock;
708 if (netdev_linux_miimon_enabled()) {
709 netdev_linux_miimon_wait();
711 sock = netdev_linux_notify_sock();
713 nl_sock_wait(sock, POLLIN);
718 netdev_linux_changed(struct netdev_linux *dev,
719 unsigned int ifi_flags, unsigned int mask)
720 OVS_REQUIRES(dev->mutex)
722 netdev_change_seq_changed(&dev->up);
724 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
725 dev->carrier_resets++;
727 dev->ifi_flags = ifi_flags;
729 dev->cache_valid &= mask;
733 netdev_linux_update(struct netdev_linux *dev,
734 const struct rtnetlink_change *change)
735 OVS_REQUIRES(dev->mutex)
737 if (rtnetlink_type_is_rtnlgrp_link(change->nlmsg_type)){
738 if (change->nlmsg_type == RTM_NEWLINK) {
739 /* Keep drv-info, in4, in6. */
740 netdev_linux_changed(dev, change->ifi_flags,
741 VALID_DRVINFO | VALID_IN4 | VALID_IN6);
743 /* Update netdev from rtnl-change msg. */
745 dev->mtu = change->mtu;
746 dev->cache_valid |= VALID_MTU;
747 dev->netdev_mtu_error = 0;
750 if (!eth_addr_is_zero(change->mac)) {
751 dev->etheraddr = change->mac;
752 dev->cache_valid |= VALID_ETHERADDR;
753 dev->ether_addr_error = 0;
756 dev->ifindex = change->if_index;
757 dev->cache_valid |= VALID_IFINDEX;
758 dev->get_ifindex_error = 0;
760 netdev_linux_changed(dev, change->ifi_flags, 0);
762 } else if (rtnetlink_type_is_rtnlgrp_addr(change->nlmsg_type)) {
763 /* Invalidates in4, in6. */
764 netdev_linux_changed(dev, dev->ifi_flags,
765 ~(VALID_IN4 | VALID_IN6));
771 static struct netdev *
772 netdev_linux_alloc(void)
774 struct netdev_linux *netdev = xzalloc(sizeof *netdev);
779 netdev_linux_common_construct(struct netdev_linux *netdev)
781 ovs_mutex_init(&netdev->mutex);
784 /* Creates system and internal devices. */
786 netdev_linux_construct(struct netdev *netdev_)
788 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
791 netdev_linux_common_construct(netdev);
793 error = get_flags(&netdev->up, &netdev->ifi_flags);
794 if (error == ENODEV) {
795 if (netdev->up.netdev_class != &netdev_internal_class) {
796 /* The device does not exist, so don't allow it to be opened. */
799 /* "Internal" netdevs have to be created as netdev objects before
800 * they exist in the kernel, because creating them in the kernel
801 * happens by passing a netdev object to dpif_port_add().
802 * Therefore, ignore the error. */
809 /* For most types of netdevs we open the device for each call of
810 * netdev_open(). However, this is not the case with tap devices,
811 * since it is only possible to open the device once. In this
812 * situation we share a single file descriptor, and consequently
813 * buffers, across all readers. Therefore once data is read it will
814 * be unavailable to other reads for tap devices. */
816 netdev_linux_construct_tap(struct netdev *netdev_)
818 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
819 static const char tap_dev[] = "/dev/net/tun";
820 const char *name = netdev_->name;
824 netdev_linux_common_construct(netdev);
826 /* Open tap device. */
827 netdev->tap_fd = open(tap_dev, O_RDWR);
828 if (netdev->tap_fd < 0) {
830 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
834 /* Create tap device. */
835 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
836 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
837 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
838 VLOG_WARN("%s: creating tap device failed: %s", name,
839 ovs_strerror(errno));
844 /* Make non-blocking. */
845 error = set_nonblocking(netdev->tap_fd);
853 close(netdev->tap_fd);
858 netdev_linux_destruct(struct netdev *netdev_)
860 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
862 if (netdev->tc && netdev->tc->ops->tc_destroy) {
863 netdev->tc->ops->tc_destroy(netdev->tc);
866 if (netdev_get_class(netdev_) == &netdev_tap_class
867 && netdev->tap_fd >= 0)
869 close(netdev->tap_fd);
872 if (netdev->miimon_interval > 0) {
873 atomic_count_dec(&miimon_cnt);
876 ovs_mutex_destroy(&netdev->mutex);
/* Frees the memory allocated by netdev_linux_alloc(). */
static void
netdev_linux_dealloc(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    free(netdev);
}
886 static struct netdev_rxq *
887 netdev_linux_rxq_alloc(void)
889 struct netdev_rxq_linux *rx = xzalloc(sizeof *rx);
894 netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
896 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
897 struct netdev *netdev_ = rx->up.netdev;
898 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
901 ovs_mutex_lock(&netdev->mutex);
902 rx->is_tap = is_tap_netdev(netdev_);
904 rx->fd = netdev->tap_fd;
906 struct sockaddr_ll sll;
908 /* Result of tcpdump -dd inbound */
909 static const struct sock_filter filt[] = {
910 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
911 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
912 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
913 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
915 static const struct sock_fprog fprog = {
916 ARRAY_SIZE(filt), (struct sock_filter *) filt
919 /* Create file descriptor. */
920 rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
923 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
928 if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) {
930 VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
931 netdev_get_name(netdev_), ovs_strerror(error));
935 /* Set non-blocking mode. */
936 error = set_nonblocking(rx->fd);
941 /* Get ethernet device index. */
942 error = get_ifindex(&netdev->up, &ifindex);
947 /* Bind to specific ethernet device. */
948 memset(&sll, 0, sizeof sll);
949 sll.sll_family = AF_PACKET;
950 sll.sll_ifindex = ifindex;
951 sll.sll_protocol = htons(ETH_P_ALL);
952 if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
954 VLOG_ERR("%s: failed to bind raw socket (%s)",
955 netdev_get_name(netdev_), ovs_strerror(error));
959 /* Filter for only inbound packets. */
960 error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
964 VLOG_ERR("%s: failed to attach filter (%s)",
965 netdev_get_name(netdev_), ovs_strerror(error));
969 ovs_mutex_unlock(&netdev->mutex);
977 ovs_mutex_unlock(&netdev->mutex);
982 netdev_linux_rxq_destruct(struct netdev_rxq *rxq_)
984 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
/* Frees the memory allocated by netdev_linux_rxq_alloc(). */
static void
netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_)
{
    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);

    free(rx);
}
1000 auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux)
1002 if (aux->tp_status & TP_STATUS_VLAN_TPID_VALID) {
1003 return htons(aux->tp_vlan_tpid);
1005 return htons(ETH_TYPE_VLAN);
/* Returns true if 'aux' carries a VLAN TCI: either a nonzero TCI, or a zero
 * TCI explicitly flagged valid via TP_STATUS_VLAN_VALID (Linux 3.0+). */
static bool
auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
{
    return aux->tp_vlan_tci || aux->tp_status & TP_STATUS_VLAN_VALID;
}
1016 netdev_linux_rxq_recv_sock(int fd, struct dp_packet *buffer)
1021 struct cmsghdr *cmsg;
1023 struct cmsghdr cmsg;
1024 char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
1028 /* Reserve headroom for a single VLAN tag */
1029 dp_packet_reserve(buffer, VLAN_HEADER_LEN);
1030 size = dp_packet_tailroom(buffer);
1032 iov.iov_base = dp_packet_data(buffer);
1034 msgh.msg_name = NULL;
1035 msgh.msg_namelen = 0;
1036 msgh.msg_iov = &iov;
1037 msgh.msg_iovlen = 1;
1038 msgh.msg_control = &cmsg_buffer;
1039 msgh.msg_controllen = sizeof cmsg_buffer;
1043 retval = recvmsg(fd, &msgh, MSG_TRUNC);
1044 } while (retval < 0 && errno == EINTR);
1048 } else if (retval > size) {
1052 dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);
1054 for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg; cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
1055 const struct tpacket_auxdata *aux;
1057 if (cmsg->cmsg_level != SOL_PACKET
1058 || cmsg->cmsg_type != PACKET_AUXDATA
1059 || cmsg->cmsg_len < CMSG_LEN(sizeof(struct tpacket_auxdata))) {
1063 aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg));
1064 if (auxdata_has_vlan_tci(aux)) {
1065 if (retval < ETH_HEADER_LEN) {
1069 eth_push_vlan(buffer, auxdata_to_vlan_tpid(aux),
1070 htons(aux->tp_vlan_tci));
1079 netdev_linux_rxq_recv_tap(int fd, struct dp_packet *buffer)
1082 size_t size = dp_packet_tailroom(buffer);
1085 retval = read(fd, dp_packet_data(buffer), size);
1086 } while (retval < 0 && errno == EINTR);
1092 dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);
1097 netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet **packets,
1100 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1101 struct netdev *netdev = rx->up.netdev;
1102 struct dp_packet *buffer;
1106 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
1107 mtu = ETH_PAYLOAD_MAX;
1110 buffer = dp_packet_new_with_headroom(VLAN_ETH_HEADER_LEN + mtu,
1111 DP_NETDEV_HEADROOM);
1112 retval = (rx->is_tap
1113 ? netdev_linux_rxq_recv_tap(rx->fd, buffer)
1114 : netdev_linux_rxq_recv_sock(rx->fd, buffer));
1117 if (retval != EAGAIN && retval != EMSGSIZE) {
1118 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
1119 netdev_rxq_get_name(rxq_), ovs_strerror(errno));
1121 dp_packet_delete(buffer);
1123 dp_packet_pad(buffer);
1124 dp_packet_rss_invalidate(buffer);
1125 packets[0] = buffer;
1133 netdev_linux_rxq_wait(struct netdev_rxq *rxq_)
1135 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1136 poll_fd_wait(rx->fd, POLLIN);
1140 netdev_linux_rxq_drain(struct netdev_rxq *rxq_)
1142 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1145 int error = af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_), &ifr,
1146 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
1150 drain_fd(rx->fd, ifr.ifr_qlen);
1153 return drain_rcvbuf(rx->fd);
1157 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
1158 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
1159 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
1160 * the packet is too big or too small to transmit on the device.
1162 * The caller retains ownership of 'buffer' in all cases.
1164 * The kernel maintains a packet transmission queue, so the caller is not
1165 * expected to do additional queuing of packets. */
1167 netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED,
1168 struct dp_packet **pkts, int cnt, bool may_steal)
1173 /* 'i' is incremented only if there's no error */
1174 for (i = 0; i < cnt;) {
1175 const void *data = dp_packet_data(pkts[i]);
1176 size_t size = dp_packet_size(pkts[i]);
1179 if (!is_tap_netdev(netdev_)) {
1180 /* Use our AF_PACKET socket to send to this device. */
1181 struct sockaddr_ll sll;
1187 sock = af_packet_sock();
1192 ifindex = netdev_get_ifindex(netdev_);
1197 /* We don't bother setting most fields in sockaddr_ll because the
1198 * kernel ignores them for SOCK_RAW. */
1199 memset(&sll, 0, sizeof sll);
1200 sll.sll_family = AF_PACKET;
1201 sll.sll_ifindex = ifindex;
1203 iov.iov_base = CONST_CAST(void *, data);
1206 msg.msg_name = &sll;
1207 msg.msg_namelen = sizeof sll;
1210 msg.msg_control = NULL;
1211 msg.msg_controllen = 0;
1214 retval = sendmsg(sock, &msg, 0);
1216 /* Use the tap fd to send to this device. This is essential for
1217 * tap devices, because packets sent to a tap device with an
1218 * AF_PACKET socket will loop back to be *received* again on the
1219 * tap device. This doesn't occur on other interface types
1220 * because we attach a socket filter to the rx socket. */
1221 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1223 retval = write(netdev->tap_fd, data, size);
1227 /* The Linux AF_PACKET implementation never blocks waiting for room
1228 * for packets, instead returning ENOBUFS. Translate this into
1229 * EAGAIN for the caller. */
1230 error = errno == ENOBUFS ? EAGAIN : errno;
1231 if (error == EINTR) {
1232 /* continue without incrementing 'i', i.e. retry this packet */
1236 } else if (retval != size) {
1237 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE" bytes"
1238 " of %"PRIuSIZE") on %s", retval, size,
1239 netdev_get_name(netdev_));
1244 /* Process the next packet in the batch */
1249 for (i = 0; i < cnt; i++) {
1250 dp_packet_delete(pkts[i]);
1254 if (error && error != EAGAIN) {
1255 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
1256 netdev_get_name(netdev_), ovs_strerror(error));
1263 /* Registers with the poll loop to wake up from the next call to poll_block()
1264 * when the packet transmission queue has sufficient room to transmit a packet
1265 * with netdev_send().
1267 * The kernel maintains a packet transmission queue, so the client is not
1268 * expected to do additional queuing of packets. Thus, this function is
1269 * unlikely to ever be used. It is included for completeness. */
1271 netdev_linux_send_wait(struct netdev *netdev, int qid OVS_UNUSED)
1273 if (is_tap_netdev(netdev)) {
1274 /* TAP device always accepts packets.*/
1275 poll_immediate_wake();
1279 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1280 * otherwise a positive errno value. */
1282 netdev_linux_set_etheraddr(struct netdev *netdev_, const struct eth_addr mac)
1284 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1285 enum netdev_flags old_flags = 0;
1288 ovs_mutex_lock(&netdev->mutex);
1290 if (netdev->cache_valid & VALID_ETHERADDR) {
1291 error = netdev->ether_addr_error;
1292 if (error || eth_addr_equals(netdev->etheraddr, mac)) {
1295 netdev->cache_valid &= ~VALID_ETHERADDR;
1298 /* Tap devices must be brought down before setting the address. */
1299 if (is_tap_netdev(netdev_)) {
1300 update_flags(netdev, NETDEV_UP, 0, &old_flags);
1302 error = set_etheraddr(netdev_get_name(netdev_), mac);
1303 if (!error || error == ENODEV) {
1304 netdev->ether_addr_error = error;
1305 netdev->cache_valid |= VALID_ETHERADDR;
1307 netdev->etheraddr = mac;
1311 if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
1312 update_flags(netdev, 0, NETDEV_UP, &old_flags);
1316 ovs_mutex_unlock(&netdev->mutex);
1320 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1322 netdev_linux_get_etheraddr(const struct netdev *netdev_, struct eth_addr *mac)
1324 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1327 ovs_mutex_lock(&netdev->mutex);
1328 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1329 netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
1330 &netdev->etheraddr);
1331 netdev->cache_valid |= VALID_ETHERADDR;
1334 error = netdev->ether_addr_error;
1336 *mac = netdev->etheraddr;
1338 ovs_mutex_unlock(&netdev->mutex);
1344 netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
1348 if (!(netdev->cache_valid & VALID_MTU)) {
1351 netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
1352 netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
1353 netdev->mtu = ifr.ifr_mtu;
1354 netdev->cache_valid |= VALID_MTU;
1357 error = netdev->netdev_mtu_error;
1359 *mtup = netdev->mtu;
1365 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1366 * in bytes, not including the hardware header; thus, this is typically 1500
1367 * bytes for Ethernet devices. */
1369 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1371 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1374 ovs_mutex_lock(&netdev->mutex);
1375 error = netdev_linux_get_mtu__(netdev, mtup);
1376 ovs_mutex_unlock(&netdev->mutex);
1381 /* Sets the maximum size of transmitted (MTU) for given device using linux
1382 * networking ioctl interface.
1385 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1387 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1391 ovs_mutex_lock(&netdev->mutex);
1392 if (netdev->cache_valid & VALID_MTU) {
1393 error = netdev->netdev_mtu_error;
1394 if (error || netdev->mtu == mtu) {
1397 netdev->cache_valid &= ~VALID_MTU;
1400 error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
1401 SIOCSIFMTU, "SIOCSIFMTU");
1402 if (!error || error == ENODEV) {
1403 netdev->netdev_mtu_error = error;
1404 netdev->mtu = ifr.ifr_mtu;
1405 netdev->cache_valid |= VALID_MTU;
1408 ovs_mutex_unlock(&netdev->mutex);
1412 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1413 * On failure, returns a negative errno value. */
1415 netdev_linux_get_ifindex(const struct netdev *netdev_)
1417 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1420 ovs_mutex_lock(&netdev->mutex);
1421 error = get_ifindex(netdev_, &ifindex);
1422 ovs_mutex_unlock(&netdev->mutex);
1424 return error ? -error : ifindex;
1428 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1430 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1432 ovs_mutex_lock(&netdev->mutex);
1433 if (netdev->miimon_interval > 0) {
1434 *carrier = netdev->miimon;
1436 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1438 ovs_mutex_unlock(&netdev->mutex);
1443 static long long int
1444 netdev_linux_get_carrier_resets(const struct netdev *netdev_)
1446 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1447 long long int carrier_resets;
1449 ovs_mutex_lock(&netdev->mutex);
1450 carrier_resets = netdev->carrier_resets;
1451 ovs_mutex_unlock(&netdev->mutex);
1453 return carrier_resets;
1457 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1458 struct mii_ioctl_data *data)
1463 memset(&ifr, 0, sizeof ifr);
1464 memcpy(&ifr.ifr_data, data, sizeof *data);
1465 error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
1466 memcpy(data, &ifr.ifr_data, sizeof *data);
1472 netdev_linux_get_miimon(const char *name, bool *miimon)
1474 struct mii_ioctl_data data;
1479 memset(&data, 0, sizeof data);
1480 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1482 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1483 data.reg_num = MII_BMSR;
1484 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1488 *miimon = !!(data.val_out & BMSR_LSTATUS);
1490 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1493 struct ethtool_cmd ecmd;
1495 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1498 COVERAGE_INC(netdev_get_ethtool);
1499 memset(&ecmd, 0, sizeof ecmd);
1500 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1503 struct ethtool_value eval;
1505 memcpy(&eval, &ecmd, sizeof eval);
1506 *miimon = !!eval.data;
1508 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1516 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1517 long long int interval)
1519 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1521 ovs_mutex_lock(&netdev->mutex);
1522 interval = interval > 0 ? MAX(interval, 100) : 0;
1523 if (netdev->miimon_interval != interval) {
1524 if (interval && !netdev->miimon_interval) {
1525 atomic_count_inc(&miimon_cnt);
1526 } else if (!interval && netdev->miimon_interval) {
1527 atomic_count_dec(&miimon_cnt);
1530 netdev->miimon_interval = interval;
1531 timer_set_expired(&netdev->miimon_timer);
1533 ovs_mutex_unlock(&netdev->mutex);
1539 netdev_linux_miimon_run(void)
1541 struct shash device_shash;
1542 struct shash_node *node;
1544 shash_init(&device_shash);
1545 netdev_get_devices(&netdev_linux_class, &device_shash);
1546 SHASH_FOR_EACH (node, &device_shash) {
1547 struct netdev *netdev = node->data;
1548 struct netdev_linux *dev = netdev_linux_cast(netdev);
1551 ovs_mutex_lock(&dev->mutex);
1552 if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
1553 netdev_linux_get_miimon(dev->up.name, &miimon);
1554 if (miimon != dev->miimon) {
1555 dev->miimon = miimon;
1556 netdev_linux_changed(dev, dev->ifi_flags, 0);
1559 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1561 ovs_mutex_unlock(&dev->mutex);
1562 netdev_close(netdev);
1565 shash_destroy(&device_shash);
1569 netdev_linux_miimon_wait(void)
1571 struct shash device_shash;
1572 struct shash_node *node;
1574 shash_init(&device_shash);
1575 netdev_get_devices(&netdev_linux_class, &device_shash);
1576 SHASH_FOR_EACH (node, &device_shash) {
1577 struct netdev *netdev = node->data;
1578 struct netdev_linux *dev = netdev_linux_cast(netdev);
1580 ovs_mutex_lock(&dev->mutex);
1581 if (dev->miimon_interval > 0) {
1582 timer_wait(&dev->miimon_timer);
1584 ovs_mutex_unlock(&dev->mutex);
1585 netdev_close(netdev);
1587 shash_destroy(&device_shash);
/* Exchanges the values of '*a' and '*b'. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    uint64_t temp = *a;
    *a = *b;
    *b = temp;
}
1598 /* Copies 'src' into 'dst', performing format conversion in the process.
1600 * 'src' is allowed to be misaligned. */
1602 netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1603 const struct ovs_vport_stats *src)
1605 dst->rx_packets = get_32aligned_u64(&src->rx_packets);
1606 dst->tx_packets = get_32aligned_u64(&src->tx_packets);
1607 dst->rx_bytes = get_32aligned_u64(&src->rx_bytes);
1608 dst->tx_bytes = get_32aligned_u64(&src->tx_bytes);
1609 dst->rx_errors = get_32aligned_u64(&src->rx_errors);
1610 dst->tx_errors = get_32aligned_u64(&src->tx_errors);
1611 dst->rx_dropped = get_32aligned_u64(&src->rx_dropped);
1612 dst->tx_dropped = get_32aligned_u64(&src->tx_dropped);
1614 dst->collisions = 0;
1615 dst->rx_length_errors = 0;
1616 dst->rx_over_errors = 0;
1617 dst->rx_crc_errors = 0;
1618 dst->rx_frame_errors = 0;
1619 dst->rx_fifo_errors = 0;
1620 dst->rx_missed_errors = 0;
1621 dst->tx_aborted_errors = 0;
1622 dst->tx_carrier_errors = 0;
1623 dst->tx_fifo_errors = 0;
1624 dst->tx_heartbeat_errors = 0;
1625 dst->tx_window_errors = 0;
1629 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1631 struct dpif_netlink_vport reply;
1635 error = dpif_netlink_vport_get(netdev_get_name(netdev), &reply, &buf);
1638 } else if (!reply.stats) {
1643 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
1651 get_stats_via_vport(const struct netdev *netdev_,
1652 struct netdev_stats *stats)
1654 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1656 if (!netdev->vport_stats_error ||
1657 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1660 error = get_stats_via_vport__(netdev_, stats);
1661 if (error && error != ENOENT && error != ENODEV) {
1662 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1664 netdev_get_name(netdev_), ovs_strerror(error));
1666 netdev->vport_stats_error = error;
1667 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
1671 /* Retrieves current device stats for 'netdev-linux'. */
1673 netdev_linux_get_stats(const struct netdev *netdev_,
1674 struct netdev_stats *stats)
1676 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1677 struct netdev_stats dev_stats;
1680 ovs_mutex_lock(&netdev->mutex);
1681 get_stats_via_vport(netdev_, stats);
1682 error = get_stats_via_netlink(netdev_, &dev_stats);
1684 if (!netdev->vport_stats_error) {
1687 } else if (netdev->vport_stats_error) {
1688 /* stats not available from OVS then use netdev stats. */
1691 /* Use kernel netdev's packet and byte counts since vport's counters
1692 * do not reflect packet counts on the wire when GSO, TSO or GRO are
1694 stats->rx_packets = dev_stats.rx_packets;
1695 stats->rx_bytes = dev_stats.rx_bytes;
1696 stats->tx_packets = dev_stats.tx_packets;
1697 stats->tx_bytes = dev_stats.tx_bytes;
1699 stats->rx_errors += dev_stats.rx_errors;
1700 stats->tx_errors += dev_stats.tx_errors;
1701 stats->rx_dropped += dev_stats.rx_dropped;
1702 stats->tx_dropped += dev_stats.tx_dropped;
1703 stats->multicast += dev_stats.multicast;
1704 stats->collisions += dev_stats.collisions;
1705 stats->rx_length_errors += dev_stats.rx_length_errors;
1706 stats->rx_over_errors += dev_stats.rx_over_errors;
1707 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1708 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1709 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1710 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1711 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1712 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1713 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1714 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1715 stats->tx_window_errors += dev_stats.tx_window_errors;
1717 ovs_mutex_unlock(&netdev->mutex);
1722 /* Retrieves current device stats for 'netdev-tap' netdev or
1723 * netdev-internal. */
1725 netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
1727 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1728 struct netdev_stats dev_stats;
1731 ovs_mutex_lock(&netdev->mutex);
1732 get_stats_via_vport(netdev_, stats);
1733 error = get_stats_via_netlink(netdev_, &dev_stats);
1735 if (!netdev->vport_stats_error) {
1738 } else if (netdev->vport_stats_error) {
1739 /* Transmit and receive stats will appear to be swapped relative to the
1740 * other ports since we are the one sending the data, not a remote
1741 * computer. For consistency, we swap them back here. This does not
1742 * apply if we are getting stats from the vport layer because it always
1743 * tracks stats from the perspective of the switch. */
1746 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1747 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1748 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1749 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1750 stats->rx_length_errors = 0;
1751 stats->rx_over_errors = 0;
1752 stats->rx_crc_errors = 0;
1753 stats->rx_frame_errors = 0;
1754 stats->rx_fifo_errors = 0;
1755 stats->rx_missed_errors = 0;
1756 stats->tx_aborted_errors = 0;
1757 stats->tx_carrier_errors = 0;
1758 stats->tx_fifo_errors = 0;
1759 stats->tx_heartbeat_errors = 0;
1760 stats->tx_window_errors = 0;
1762 /* Use kernel netdev's packet and byte counts since vport counters
1763 * do not reflect packet counts on the wire when GSO, TSO or GRO
1765 stats->rx_packets = dev_stats.tx_packets;
1766 stats->rx_bytes = dev_stats.tx_bytes;
1767 stats->tx_packets = dev_stats.rx_packets;
1768 stats->tx_bytes = dev_stats.rx_bytes;
1770 stats->rx_dropped += dev_stats.tx_dropped;
1771 stats->tx_dropped += dev_stats.rx_dropped;
1773 stats->rx_errors += dev_stats.tx_errors;
1774 stats->tx_errors += dev_stats.rx_errors;
1776 stats->multicast += dev_stats.multicast;
1777 stats->collisions += dev_stats.collisions;
1779 ovs_mutex_unlock(&netdev->mutex);
1785 netdev_internal_get_stats(const struct netdev *netdev_,
1786 struct netdev_stats *stats)
1788 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1791 ovs_mutex_lock(&netdev->mutex);
1792 get_stats_via_vport(netdev_, stats);
1793 error = netdev->vport_stats_error;
1794 ovs_mutex_unlock(&netdev->mutex);
1800 netdev_linux_read_features(struct netdev_linux *netdev)
1802 struct ethtool_cmd ecmd;
1806 if (netdev->cache_valid & VALID_FEATURES) {
1810 COVERAGE_INC(netdev_get_ethtool);
1811 memset(&ecmd, 0, sizeof ecmd);
1812 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
1813 ETHTOOL_GSET, "ETHTOOL_GSET");
1818 /* Supported features. */
1819 netdev->supported = 0;
1820 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1821 netdev->supported |= NETDEV_F_10MB_HD;
1823 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1824 netdev->supported |= NETDEV_F_10MB_FD;
1826 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1827 netdev->supported |= NETDEV_F_100MB_HD;
1829 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1830 netdev->supported |= NETDEV_F_100MB_FD;
1832 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1833 netdev->supported |= NETDEV_F_1GB_HD;
1835 if ((ecmd.supported & SUPPORTED_1000baseT_Full) ||
1836 (ecmd.supported & SUPPORTED_1000baseKX_Full)) {
1837 netdev->supported |= NETDEV_F_1GB_FD;
1839 if ((ecmd.supported & SUPPORTED_10000baseT_Full) ||
1840 (ecmd.supported & SUPPORTED_10000baseKX4_Full) ||
1841 (ecmd.supported & SUPPORTED_10000baseKR_Full) ||
1842 (ecmd.supported & SUPPORTED_10000baseR_FEC)) {
1843 netdev->supported |= NETDEV_F_10GB_FD;
1845 if ((ecmd.supported & SUPPORTED_40000baseKR4_Full) ||
1846 (ecmd.supported & SUPPORTED_40000baseCR4_Full) ||
1847 (ecmd.supported & SUPPORTED_40000baseSR4_Full) ||
1848 (ecmd.supported & SUPPORTED_40000baseLR4_Full)) {
1849 netdev->supported |= NETDEV_F_40GB_FD;
1851 if (ecmd.supported & SUPPORTED_TP) {
1852 netdev->supported |= NETDEV_F_COPPER;
1854 if (ecmd.supported & SUPPORTED_FIBRE) {
1855 netdev->supported |= NETDEV_F_FIBER;
1857 if (ecmd.supported & SUPPORTED_Autoneg) {
1858 netdev->supported |= NETDEV_F_AUTONEG;
1860 if (ecmd.supported & SUPPORTED_Pause) {
1861 netdev->supported |= NETDEV_F_PAUSE;
1863 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1864 netdev->supported |= NETDEV_F_PAUSE_ASYM;
1867 /* Advertised features. */
1868 netdev->advertised = 0;
1869 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1870 netdev->advertised |= NETDEV_F_10MB_HD;
1872 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1873 netdev->advertised |= NETDEV_F_10MB_FD;
1875 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1876 netdev->advertised |= NETDEV_F_100MB_HD;
1878 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1879 netdev->advertised |= NETDEV_F_100MB_FD;
1881 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1882 netdev->advertised |= NETDEV_F_1GB_HD;
1884 if ((ecmd.advertising & ADVERTISED_1000baseT_Full) ||
1885 (ecmd.advertising & ADVERTISED_1000baseKX_Full)) {
1886 netdev->advertised |= NETDEV_F_1GB_FD;
1888 if ((ecmd.advertising & ADVERTISED_10000baseT_Full) ||
1889 (ecmd.advertising & ADVERTISED_10000baseKX4_Full) ||
1890 (ecmd.advertising & ADVERTISED_10000baseKR_Full) ||
1891 (ecmd.advertising & ADVERTISED_10000baseR_FEC)) {
1892 netdev->advertised |= NETDEV_F_10GB_FD;
1894 if ((ecmd.advertising & ADVERTISED_40000baseKR4_Full) ||
1895 (ecmd.advertising & ADVERTISED_40000baseCR4_Full) ||
1896 (ecmd.advertising & ADVERTISED_40000baseSR4_Full) ||
1897 (ecmd.advertising & ADVERTISED_40000baseLR4_Full)) {
1898 netdev->advertised |= NETDEV_F_40GB_FD;
1900 if (ecmd.advertising & ADVERTISED_TP) {
1901 netdev->advertised |= NETDEV_F_COPPER;
1903 if (ecmd.advertising & ADVERTISED_FIBRE) {
1904 netdev->advertised |= NETDEV_F_FIBER;
1906 if (ecmd.advertising & ADVERTISED_Autoneg) {
1907 netdev->advertised |= NETDEV_F_AUTONEG;
1909 if (ecmd.advertising & ADVERTISED_Pause) {
1910 netdev->advertised |= NETDEV_F_PAUSE;
1912 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1913 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
1916 /* Current settings. */
1917 speed = ethtool_cmd_speed(&ecmd);
1918 if (speed == SPEED_10) {
1919 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1920 } else if (speed == SPEED_100) {
1921 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1922 } else if (speed == SPEED_1000) {
1923 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1924 } else if (speed == SPEED_10000) {
1925 netdev->current = NETDEV_F_10GB_FD;
1926 } else if (speed == 40000) {
1927 netdev->current = NETDEV_F_40GB_FD;
1928 } else if (speed == 100000) {
1929 netdev->current = NETDEV_F_100GB_FD;
1930 } else if (speed == 1000000) {
1931 netdev->current = NETDEV_F_1TB_FD;
1933 netdev->current = 0;
1936 if (ecmd.port == PORT_TP) {
1937 netdev->current |= NETDEV_F_COPPER;
1938 } else if (ecmd.port == PORT_FIBRE) {
1939 netdev->current |= NETDEV_F_FIBER;
1943 netdev->current |= NETDEV_F_AUTONEG;
1947 netdev->cache_valid |= VALID_FEATURES;
1948 netdev->get_features_error = error;
1951 /* Stores the features supported by 'netdev' into of '*current', '*advertised',
1952 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
1953 * Returns 0 if successful, otherwise a positive errno value. */
1955 netdev_linux_get_features(const struct netdev *netdev_,
1956 enum netdev_features *current,
1957 enum netdev_features *advertised,
1958 enum netdev_features *supported,
1959 enum netdev_features *peer)
1961 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1964 ovs_mutex_lock(&netdev->mutex);
1965 netdev_linux_read_features(netdev);
1966 if (!netdev->get_features_error) {
1967 *current = netdev->current;
1968 *advertised = netdev->advertised;
1969 *supported = netdev->supported;
1970 *peer = 0; /* XXX */
1972 error = netdev->get_features_error;
1973 ovs_mutex_unlock(&netdev->mutex);
1978 /* Set the features advertised by 'netdev' to 'advertise'. */
1980 netdev_linux_set_advertisements(struct netdev *netdev_,
1981 enum netdev_features advertise)
1983 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1984 struct ethtool_cmd ecmd;
1987 ovs_mutex_lock(&netdev->mutex);
1989 COVERAGE_INC(netdev_get_ethtool);
1990 memset(&ecmd, 0, sizeof ecmd);
1991 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1992 ETHTOOL_GSET, "ETHTOOL_GSET");
1997 ecmd.advertising = 0;
1998 if (advertise & NETDEV_F_10MB_HD) {
1999 ecmd.advertising |= ADVERTISED_10baseT_Half;
2001 if (advertise & NETDEV_F_10MB_FD) {
2002 ecmd.advertising |= ADVERTISED_10baseT_Full;
2004 if (advertise & NETDEV_F_100MB_HD) {
2005 ecmd.advertising |= ADVERTISED_100baseT_Half;
2007 if (advertise & NETDEV_F_100MB_FD) {
2008 ecmd.advertising |= ADVERTISED_100baseT_Full;
2010 if (advertise & NETDEV_F_1GB_HD) {
2011 ecmd.advertising |= ADVERTISED_1000baseT_Half;
2013 if (advertise & NETDEV_F_1GB_FD) {
2014 ecmd.advertising |= ADVERTISED_1000baseT_Full;
2016 if (advertise & NETDEV_F_10GB_FD) {
2017 ecmd.advertising |= ADVERTISED_10000baseT_Full;
2019 if (advertise & NETDEV_F_COPPER) {
2020 ecmd.advertising |= ADVERTISED_TP;
2022 if (advertise & NETDEV_F_FIBER) {
2023 ecmd.advertising |= ADVERTISED_FIBRE;
2025 if (advertise & NETDEV_F_AUTONEG) {
2026 ecmd.advertising |= ADVERTISED_Autoneg;
2028 if (advertise & NETDEV_F_PAUSE) {
2029 ecmd.advertising |= ADVERTISED_Pause;
2031 if (advertise & NETDEV_F_PAUSE_ASYM) {
2032 ecmd.advertising |= ADVERTISED_Asym_Pause;
2034 COVERAGE_INC(netdev_set_ethtool);
2035 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
2036 ETHTOOL_SSET, "ETHTOOL_SSET");
2039 ovs_mutex_unlock(&netdev->mutex);
2043 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
2044 * successful, otherwise a positive errno value. */
2046 netdev_linux_set_policing(struct netdev *netdev_,
2047 uint32_t kbits_rate, uint32_t kbits_burst)
2049 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2050 const char *netdev_name = netdev_get_name(netdev_);
2053 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
2054 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
2055 : kbits_burst); /* Stick with user-specified value. */
2057 ovs_mutex_lock(&netdev->mutex);
2058 if (netdev->cache_valid & VALID_POLICING) {
2059 error = netdev->netdev_policing_error;
2060 if (error || (netdev->kbits_rate == kbits_rate &&
2061 netdev->kbits_burst == kbits_burst)) {
2062 /* Assume that settings haven't changed since we last set them. */
2065 netdev->cache_valid &= ~VALID_POLICING;
2068 COVERAGE_INC(netdev_set_policing);
2069 /* Remove any existing ingress qdisc. */
2070 error = tc_add_del_ingress_qdisc(netdev_, false);
2072 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
2073 netdev_name, ovs_strerror(error));
2078 error = tc_add_del_ingress_qdisc(netdev_, true);
2080 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
2081 netdev_name, ovs_strerror(error));
2085 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
2087 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
2088 netdev_name, ovs_strerror(error));
2093 netdev->kbits_rate = kbits_rate;
2094 netdev->kbits_burst = kbits_burst;
2097 if (!error || error == ENODEV) {
2098 netdev->netdev_policing_error = error;
2099 netdev->cache_valid |= VALID_POLICING;
2101 ovs_mutex_unlock(&netdev->mutex);
2106 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
2109 const struct tc_ops *const *opsp;
2111 for (opsp = tcs; *opsp != NULL; opsp++) {
2112 const struct tc_ops *ops = *opsp;
2113 if (ops->tc_install && ops->ovs_name[0] != '\0') {
2114 sset_add(types, ops->ovs_name);
2120 static const struct tc_ops *
2121 tc_lookup_ovs_name(const char *name)
2123 const struct tc_ops *const *opsp;
2125 for (opsp = tcs; *opsp != NULL; opsp++) {
2126 const struct tc_ops *ops = *opsp;
2127 if (!strcmp(name, ops->ovs_name)) {
2134 static const struct tc_ops *
2135 tc_lookup_linux_name(const char *name)
2137 const struct tc_ops *const *opsp;
2139 for (opsp = tcs; *opsp != NULL; opsp++) {
2140 const struct tc_ops *ops = *opsp;
2141 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
2148 static struct tc_queue *
2149 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
2152 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2153 struct tc_queue *queue;
2155 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
2156 if (queue->queue_id == queue_id) {
/* Looks up queue 'queue_id' on 'netdev'.  Returns the queue or NULL. */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
}
2170 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
2172 struct netdev_qos_capabilities *caps)
2174 const struct tc_ops *ops = tc_lookup_ovs_name(type);
2178 caps->n_queues = ops->n_queues;
2183 netdev_linux_get_qos(const struct netdev *netdev_,
2184 const char **typep, struct smap *details)
2186 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2189 ovs_mutex_lock(&netdev->mutex);
2190 error = tc_query_qdisc(netdev_);
2192 *typep = netdev->tc->ops->ovs_name;
2193 error = (netdev->tc->ops->qdisc_get
2194 ? netdev->tc->ops->qdisc_get(netdev_, details)
2197 ovs_mutex_unlock(&netdev->mutex);
2203 netdev_linux_set_qos(struct netdev *netdev_,
2204 const char *type, const struct smap *details)
2206 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2207 const struct tc_ops *new_ops;
2210 new_ops = tc_lookup_ovs_name(type);
2211 if (!new_ops || !new_ops->tc_install) {
2215 ovs_mutex_lock(&netdev->mutex);
2216 error = tc_query_qdisc(netdev_);
2221 if (new_ops == netdev->tc->ops) {
2222 error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
2224 /* Delete existing qdisc. */
2225 error = tc_del_qdisc(netdev_);
2229 ovs_assert(netdev->tc == NULL);
2231 /* Install new qdisc. */
2232 error = new_ops->tc_install(netdev_, details);
2233 ovs_assert((error == 0) == (netdev->tc != NULL));
2237 ovs_mutex_unlock(&netdev->mutex);
2242 netdev_linux_get_queue(const struct netdev *netdev_,
2243 unsigned int queue_id, struct smap *details)
2245 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2248 ovs_mutex_lock(&netdev->mutex);
2249 error = tc_query_qdisc(netdev_);
2251 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2253 ? netdev->tc->ops->class_get(netdev_, queue, details)
2256 ovs_mutex_unlock(&netdev->mutex);
2262 netdev_linux_set_queue(struct netdev *netdev_,
2263 unsigned int queue_id, const struct smap *details)
2265 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2268 ovs_mutex_lock(&netdev->mutex);
2269 error = tc_query_qdisc(netdev_);
2271 error = (queue_id < netdev->tc->ops->n_queues
2272 && netdev->tc->ops->class_set
2273 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
2276 ovs_mutex_unlock(&netdev->mutex);
2282 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
2284 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2287 ovs_mutex_lock(&netdev->mutex);
2288 error = tc_query_qdisc(netdev_);
2290 if (netdev->tc->ops->class_delete) {
2291 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2293 ? netdev->tc->ops->class_delete(netdev_, queue)
2299 ovs_mutex_unlock(&netdev->mutex);
2305 netdev_linux_get_queue_stats(const struct netdev *netdev_,
2306 unsigned int queue_id,
2307 struct netdev_queue_stats *stats)
2309 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2312 ovs_mutex_lock(&netdev->mutex);
2313 error = tc_query_qdisc(netdev_);
2315 if (netdev->tc->ops->class_get_stats) {
2316 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2318 stats->created = queue->created;
2319 error = netdev->tc->ops->class_get_stats(netdev_, queue,
2328 ovs_mutex_unlock(&netdev->mutex);
2333 struct queue_dump_state {
2334 struct nl_dump dump;
2339 start_queue_dump(const struct netdev *netdev, struct queue_dump_state *state)
2341 struct ofpbuf request;
2342 struct tcmsg *tcmsg;
2344 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
2348 tcmsg->tcm_parent = 0;
2349 nl_dump_start(&state->dump, NETLINK_ROUTE, &request);
2350 ofpbuf_uninit(&request);
2352 ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
2357 finish_queue_dump(struct queue_dump_state *state)
2359 ofpbuf_uninit(&state->buf);
2360 return nl_dump_done(&state->dump);
/* State for iterating over the queues of a netdev-linux device. */
struct netdev_linux_queue_state {
    unsigned int *queues;   /* Snapshot of queue IDs taken at dump start. */
    size_t cur_queue;       /* Index of next queue to visit. */
    size_t n_queues;        /* Number of elements in 'queues'. */
};
2370 netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
2372 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2375 ovs_mutex_lock(&netdev->mutex);
2376 error = tc_query_qdisc(netdev_);
2378 if (netdev->tc->ops->class_get) {
2379 struct netdev_linux_queue_state *state;
2380 struct tc_queue *queue;
2383 *statep = state = xmalloc(sizeof *state);
2384 state->n_queues = hmap_count(&netdev->tc->queues);
2385 state->cur_queue = 0;
2386 state->queues = xmalloc(state->n_queues * sizeof *state->queues);
2389 HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
2390 state->queues[i++] = queue->queue_id;
2396 ovs_mutex_unlock(&netdev->mutex);
2402 netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
2403 unsigned int *queue_idp, struct smap *details)
2405 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2406 struct netdev_linux_queue_state *state = state_;
2409 ovs_mutex_lock(&netdev->mutex);
2410 while (state->cur_queue < state->n_queues) {
2411 unsigned int queue_id = state->queues[state->cur_queue++];
2412 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2415 *queue_idp = queue_id;
2416 error = netdev->tc->ops->class_get(netdev_, queue, details);
2420 ovs_mutex_unlock(&netdev->mutex);
2426 netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
2429 struct netdev_linux_queue_state *state = state_;
2431 free(state->queues);
2437 netdev_linux_dump_queue_stats(const struct netdev *netdev_,
2438 netdev_dump_queue_stats_cb *cb, void *aux)
2440 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2443 ovs_mutex_lock(&netdev->mutex);
2444 error = tc_query_qdisc(netdev_);
2446 struct queue_dump_state state;
2448 if (!netdev->tc->ops->class_dump_stats) {
2450 } else if (!start_queue_dump(netdev_, &state)) {
2456 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
2457 retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
2464 retval = finish_queue_dump(&state);
2470 ovs_mutex_unlock(&netdev->mutex);
/* netdev 'get_in4' callback: returns the device's IPv4 address and netmask.
 * Results (or the error) are cached under VALID_IN4 until invalidated.
 * Returns EADDRNOTAVAIL if no address is assigned. */
2476 netdev_linux_get_in4(const struct netdev *netdev_,
2477 struct in_addr *address, struct in_addr *netmask)
2479 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2482 ovs_mutex_lock(&netdev->mutex);
2483 if (!(netdev->cache_valid & VALID_IN4)) {
/* Populate the cache with both address and netmask via SIOCGIFADDR /
 * SIOCGIFNETMASK ioctls. */
2484 error = netdev_linux_get_ipv4(netdev_, &netdev->address,
2485 SIOCGIFADDR, "SIOCGIFADDR");
2487 error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
2488 SIOCGIFNETMASK, "SIOCGIFNETMASK");
/* Cache the error too, so repeated failures don't re-issue ioctls. */
2490 netdev->in4_error = error;
2491 netdev->cache_valid |= VALID_IN4;
2493 error = netdev->in4_error;
2497 if (netdev->address.s_addr != INADDR_ANY) {
2498 *address = netdev->address;
2499 *netmask = netdev->netmask;
2501 error = EADDRNOTAVAIL;
2504 ovs_mutex_unlock(&netdev->mutex);
/* netdev 'set_in4' callback: assigns 'address'/'netmask' to the device.
 * The netmask ioctl is skipped for INADDR_ANY (clearing the address). */
2510 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2511 struct in_addr netmask)
2513 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2516 ovs_mutex_lock(&netdev->mutex);
2517 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2519 netdev->address = address;
2520 netdev->netmask = netmask;
2521 if (address.s_addr != INADDR_ANY) {
2522 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2523 "SIOCSIFNETMASK", netmask);
/* On success the new values are authoritative; on failure the cache is
 * invalidated so the next get_in4 re-reads from the kernel. */
2528 netdev->cache_valid |= VALID_IN4;
2529 netdev->in4_error = 0;
2531 netdev->cache_valid &= ~VALID_IN4;
2533 ovs_mutex_unlock(&netdev->mutex);
/* Parses one line of /proc/net/if_inet6 into a binary IPv6 address and the
 * interface name.  Returns the result of ovs_scan() (nonzero on success). */
2539 parse_if_inet6_line(const char *line,
2540 struct in6_addr *in6, char ifname[16 + 1])
2542 uint8_t *s6 = in6->s6_addr;
/* Each address byte appears as exactly two hex digits, no separators. */
2543 #define X8 "%2"SCNx8
2544 return ovs_scan(line,
2545 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
2546 "%*x %*x %*x %*x %16s\n",
2547 &s6[0], &s6[1], &s6[2], &s6[3],
2548 &s6[4], &s6[5], &s6[6], &s6[7],
2549 &s6[8], &s6[9], &s6[10], &s6[11],
2550 &s6[12], &s6[13], &s6[14], &s6[15],
2554 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address.
2555 * Otherwise, sets '*in6' to 'in6addr_any' and returns the corresponding
/* Result is cached under VALID_IN6; EOPNOTSUPP is cached if
 * /proc/net/if_inet6 cannot be opened (no IPv6 support). */
2558 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2560 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2563 ovs_mutex_lock(&netdev->mutex);
2564 if (!(netdev->cache_valid & VALID_IN6)) {
/* Default to "no address" until a matching line is found. */
2568 netdev->in6 = in6addr_any;
2569 netdev->in6_error = EADDRNOTAVAIL;
2571 file = fopen("/proc/net/if_inet6", "r");
2573 const char *name = netdev_get_name(netdev_);
2574 while (fgets(line, sizeof line, file)) {
2575 struct in6_addr in6_tmp;
2576 char ifname[16 + 1];
2577 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2578 && !strcmp(name, ifname))
2580 netdev->in6 = in6_tmp;
2581 netdev->in6_error = 0;
2587 netdev->in6_error = EOPNOTSUPP;
2589 netdev->cache_valid |= VALID_IN6;
2592 error = netdev->in6_error;
2593 ovs_mutex_unlock(&netdev->mutex);
/* Fills generic 'sa' with an AF_INET sockaddr_in carrying 'addr', zeroing
 * any trailing bytes of the larger sockaddr. */
2599 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
2601 struct sockaddr_in sin;
2602 memset(&sin, 0, sizeof sin);
2603 sin.sin_family = AF_INET;
2604 sin.sin_addr = addr;
/* Zero first in case sizeof *sa > sizeof sin. */
2607 memset(sa, 0, sizeof *sa);
2608 memcpy(sa, &sin, sizeof sin);
/* Issues an address-setting ioctl ('ioctl_nr', e.g. SIOCSIFADDR) for
 * 'netdev' with 'addr'; 'ioctl_name' is used only for logging. */
2612 do_set_addr(struct netdev *netdev,
2613 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2617 make_in4_sockaddr(&ifr.ifr_addr, addr);
2618 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2622 /* Adds 'router' as a default IP gateway. */
2624 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2626 struct in_addr any = { INADDR_ANY };
2630 memset(&rt, 0, sizeof rt);
/* dst 0.0.0.0/0 via 'router': the default route. */
2631 make_in4_sockaddr(&rt.rt_dst, any);
2632 make_in4_sockaddr(&rt.rt_gateway, router);
2633 make_in4_sockaddr(&rt.rt_genmask, any);
2634 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2635 error = af_inet_ioctl(SIOCADDRT, &rt);
2637 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
/* Looks up the IPv4 next hop for 'host' by scanning /proc/net/route.
 * On a match, stores the gateway (or 0 if directly reachable) in
 * '*next_hop' and the outgoing interface name (caller-freed) in
 * '*netdev_name'. */
2643 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2646 static const char fn[] = "/proc/net/route";
2651 *netdev_name = NULL;
2652 stream = fopen(fn, "r");
2653 if (stream == NULL) {
2654 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
2659 while (fgets(line, sizeof line, stream)) {
2662 ovs_be32 dest, gateway, mask;
2663 int refcnt, metric, mtu;
2664 unsigned int flags, use, window, irtt;
/* One route table entry per line; malformed lines are logged and
 * skipped rather than aborting the scan. */
2667 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2669 iface, &dest, &gateway, &flags, &refcnt,
2670 &use, &metric, &mask, &mtu, &window, &irtt)) {
2671 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2675 if (!(flags & RTF_UP)) {
2676 /* Skip routes that aren't up. */
2680 /* The output of 'dest', 'mask', and 'gateway' were given in
2681 * network byte order, so we don't need need any endian
2682 * conversions here. */
2683 if ((dest & mask) == (host->s_addr & mask)) {
2685 /* The host is directly reachable. */
2686 next_hop->s_addr = 0;
2688 /* To reach the host, we must go through a gateway. */
2689 next_hop->s_addr = gateway;
2691 *netdev_name = xstrdup(iface);
/* netdev 'get_status' callback: reports ethtool driver information
 * (driver name/version, firmware version) in 'smap'.  The ETHTOOL_GDRVINFO
 * result is cached under VALID_DRVINFO. */
2703 netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
2705 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2708 ovs_mutex_lock(&netdev->mutex);
2709 if (!(netdev->cache_valid & VALID_DRVINFO)) {
/* drvinfo doubles as the ethtool_cmd buffer for the ioctl helper. */
2710 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
2712 COVERAGE_INC(netdev_get_ethtool);
2713 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2714 error = netdev_linux_do_ethtool(netdev->up.name,
2717 "ETHTOOL_GDRVINFO");
2719 netdev->cache_valid |= VALID_DRVINFO;
2724 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2725 smap_add(smap, "driver_version", netdev->drvinfo.version);
2726 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
2728 ovs_mutex_unlock(&netdev->mutex);
/* 'get_status' callback for internal devices: no real driver to query, so
 * report a fixed driver name. */
2734 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2737 smap_add(smap, "driver_name", "openvswitch");
2741 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2742 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2743 * returns 0. Otherwise, it returns a positive errno value; in particular,
2744 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2746 netdev_linux_arp_lookup(const struct netdev *netdev,
2747 ovs_be32 ip, struct eth_addr *mac)
2750 struct sockaddr_in sin;
2753 memset(&r, 0, sizeof r);
2754 memset(&sin, 0, sizeof sin);
2755 sin.sin_family = AF_INET;
2756 sin.sin_addr.s_addr = ip;
2758 memcpy(&r.arp_pa, &sin, sizeof sin);
2759 r.arp_ha.sa_family = ARPHRD_ETHER;
/* Restrict the lookup to this device's ARP cache. */
2761 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2762 COVERAGE_INC(netdev_arp_lookup);
2763 retval = af_inet_ioctl(SIOCGARP, &r);
2765 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
/* ENXIO simply means "no entry", so only warn on other failures. */
2766 } else if (retval != ENXIO) {
2767 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2768 netdev_get_name(netdev), IP_ARGS(ip),
2769 ovs_strerror(retval));
/* Converts netdev_flags bits to the equivalent Linux IFF_* bits. */
2775 nd_to_iff_flags(enum netdev_flags nd)
2778 if (nd & NETDEV_UP) {
2781 if (nd & NETDEV_PROMISC) {
2784 if (nd & NETDEV_LOOPBACK) {
2785 iff |= IFF_LOOPBACK;
/* Converts Linux IFF_* bits to the equivalent netdev_flags bits. */
2791 iff_to_nd_flags(int iff)
2793 enum netdev_flags nd = 0;
2797 if (iff & IFF_PROMISC) {
2798 nd |= NETDEV_PROMISC;
2800 if (iff & IFF_LOOPBACK) {
2801 nd |= NETDEV_LOOPBACK;
/* Clears 'off' and sets 'on' in the device's interface flags, storing the
 * previous flags (as netdev_flags) in '*old_flagsp'.  Re-reads the kernel
 * flags afterward so the cached ifi_flags stay authoritative.  Caller must
 * hold netdev->mutex. */
2807 update_flags(struct netdev_linux *netdev, enum netdev_flags off,
2808 enum netdev_flags on, enum netdev_flags *old_flagsp)
2809 OVS_REQUIRES(netdev->mutex)
2811 int old_flags, new_flags;
2814 old_flags = netdev->ifi_flags;
2815 *old_flagsp = iff_to_nd_flags(old_flags);
2816 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
/* Only touch the kernel when something actually changes. */
2817 if (new_flags != old_flags) {
2818 error = set_flags(netdev_get_name(&netdev->up), new_flags);
2819 get_flags(&netdev->up, &netdev->ifi_flags);
/* netdev 'update_flags' callback: locked wrapper around update_flags(). */
2826 netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2827 enum netdev_flags on, enum netdev_flags *old_flagsp)
2829 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2832 ovs_mutex_lock(&netdev->mutex);
2833 error = update_flags(netdev, off, on, old_flagsp);
2834 ovs_mutex_unlock(&netdev->mutex);
/* Builds a 'struct netdev_class' initializer for a Linux-backed device type.
 * The five parameters supply the per-type name, constructor, stats getter,
 * features getter, and status getter; all other callbacks are shared by
 * every Linux device type. */
2839 #define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS, \
2840 GET_FEATURES, GET_STATUS) \
2846 netdev_linux_wait, \
2848 netdev_linux_alloc, \
2850 netdev_linux_destruct, \
2851 netdev_linux_dealloc, \
2852 NULL, /* get_config */ \
2853 NULL, /* set_config */ \
2854 NULL, /* get_tunnel_config */ \
2855 NULL, /* build header */ \
2856 NULL, /* push header */ \
2857 NULL, /* pop header */ \
2858 NULL, /* get_numa_id */ \
2859 NULL, /* set_multiq */ \
2861 netdev_linux_send, \
2862 netdev_linux_send_wait, \
2864 netdev_linux_set_etheraddr, \
2865 netdev_linux_get_etheraddr, \
2866 netdev_linux_get_mtu, \
2867 netdev_linux_set_mtu, \
2868 netdev_linux_get_ifindex, \
2869 netdev_linux_get_carrier, \
2870 netdev_linux_get_carrier_resets, \
2871 netdev_linux_set_miimon_interval, \
2875 netdev_linux_set_advertisements, \
2877 netdev_linux_set_policing, \
2878 netdev_linux_get_qos_types, \
2879 netdev_linux_get_qos_capabilities, \
2880 netdev_linux_get_qos, \
2881 netdev_linux_set_qos, \
2882 netdev_linux_get_queue, \
2883 netdev_linux_set_queue, \
2884 netdev_linux_delete_queue, \
2885 netdev_linux_get_queue_stats, \
2886 netdev_linux_queue_dump_start, \
2887 netdev_linux_queue_dump_next, \
2888 netdev_linux_queue_dump_done, \
2889 netdev_linux_dump_queue_stats, \
2891 netdev_linux_get_in4, \
2892 netdev_linux_set_in4, \
2893 netdev_linux_get_in6, \
2894 netdev_linux_add_router, \
2895 netdev_linux_get_next_hop, \
2897 netdev_linux_arp_lookup, \
2899 netdev_linux_update_flags, \
2901 netdev_linux_rxq_alloc, \
2902 netdev_linux_rxq_construct, \
2903 netdev_linux_rxq_destruct, \
2904 netdev_linux_rxq_dealloc, \
2905 netdev_linux_rxq_recv, \
2906 netdev_linux_rxq_wait, \
2907 netdev_linux_rxq_drain, \
/* "system" devices: ordinary kernel network interfaces. */
2910 const struct netdev_class netdev_linux_class =
2913 netdev_linux_construct,
2914 netdev_linux_get_stats,
2915 netdev_linux_get_features,
2916 netdev_linux_get_status);
/* "tap" devices: userspace tun/tap interfaces with their own stats getter. */
2918 const struct netdev_class netdev_tap_class =
2921 netdev_linux_construct_tap,
2922 netdev_tap_get_stats,
2923 netdev_linux_get_features,
2924 netdev_linux_get_status);
/* "internal" devices: OVS bridge-internal ports; no meaningful link
 * features, so get_features is left NULL. */
2926 const struct netdev_class netdev_internal_class =
2929 netdev_linux_construct,
2930 netdev_internal_get_stats,
2931 NULL, /* get_features */
2932 netdev_internal_get_status);
/* CoDel traffic control class: a classless qdisc, so it exposes no queues. */
2935 #define CODEL_N_QUEUES 0x0000
2937 /* In sufficiently new kernel headers these are defined as enums in
2938 * <linux/pkt_sched.h>. Define them here as macros to help out with older
2939 * kernels. (This overrides any enum definition in the header file but that's
2941 #define TCA_CODEL_TARGET 1
2942 #define TCA_CODEL_LIMIT 2
2943 #define TCA_CODEL_INTERVAL 3
/* Returns the codel state embedded in 'netdev_''s tc pointer.  Valid only
 * while the netdev's qdisc is linux-codel. */
2952 static struct codel *
2953 codel_get__(const struct netdev *netdev_)
2955 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2956 return CONTAINER_OF(netdev->tc, struct codel, tc);
/* Records a freshly allocated codel state as 'netdev_''s current tc. */
2960 codel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
2963 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2964 struct codel *codel;
2966 codel = xmalloc(sizeof *codel);
2967 tc_init(&codel->tc, &tc_ops_codel);
2968 codel->target = target;
2969 codel->limit = limit;
2970 codel->interval = interval;
2972 netdev->tc = &codel->tc;
/* Replaces 'netdev''s root qdisc with a codel qdisc carrying the given
 * parameters (0 selects a default).  Returns 0 on success, else errno. */
2976 codel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
2980 struct ofpbuf request;
2981 struct tcmsg *tcmsg;
2982 uint32_t otarget, olimit, ointerval;
2985 tc_del_qdisc(netdev);
2987 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2988 NLM_F_EXCL | NLM_F_CREATE, &request);
2992 tcmsg->tcm_handle = tc_make_handle(1, 0);
2993 tcmsg->tcm_parent = TC_H_ROOT;
/* Kernel defaults: target 5ms, limit 10240 packets, interval 100ms. */
2995 otarget = target ? target : 5000;
2996 olimit = limit ? limit : 10240;
2997 ointerval = interval ? interval : 100000;
2999 nl_msg_put_string(&request, TCA_KIND, "codel");
3000 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3001 nl_msg_put_u32(&request, TCA_CODEL_TARGET, otarget);
3002 nl_msg_put_u32(&request, TCA_CODEL_LIMIT, olimit);
3003 nl_msg_put_u32(&request, TCA_CODEL_INTERVAL, ointerval);
3004 nl_msg_end_nested(&request, opt_offset);
3006 error = tc_transact(&request, NULL);
3008 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3009 "target %u, limit %u, interval %u error %d(%s)",
3010 netdev_get_name(netdev),
3011 otarget, olimit, ointerval,
3012 error, ovs_strerror(error));
/* Extracts codel parameters from 'details' (ovsdb QoS other_config),
 * substituting defaults for missing/zero values. */
3018 codel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
3019 const struct smap *details, struct codel *codel)
3021 const char *target_s;
3022 const char *limit_s;
3023 const char *interval_s;
3025 target_s = smap_get(details, "target");
3026 limit_s = smap_get(details, "limit");
3027 interval_s = smap_get(details, "interval");
3029 codel->target = target_s ? strtoull(target_s, NULL, 10) : 0;
3030 codel->limit = limit_s ? strtoull(limit_s, NULL, 10) : 0;
3031 codel->interval = interval_s ? strtoull(interval_s, NULL, 10) : 0;
3033 if (!codel->target) {
3034 codel->target = 5000;
3036 if (!codel->limit) {
3037 codel->limit = 10240;
3039 if (!codel->interval) {
3040 codel->interval = 100000;
/* tc_ops 'tc_install' callback: configures the kernel qdisc, then mirrors
 * the settings in userspace state on success. */
3045 codel_tc_install(struct netdev *netdev, const struct smap *details)
3050 codel_parse_qdisc_details__(netdev, details, &codel);
3051 error = codel_setup_qdisc__(netdev, codel.target, codel.limit,
3054 codel_install__(netdev, codel.target, codel.limit, codel.interval);
/* Parses the TCA_OPTIONS attributes of a kernel codel qdisc into 'codel'. */
3060 codel_parse_tca_options__(struct nlattr *nl_options, struct codel *codel)
3062 static const struct nl_policy tca_codel_policy[] = {
3063 [TCA_CODEL_TARGET] = { .type = NL_A_U32 },
3064 [TCA_CODEL_LIMIT] = { .type = NL_A_U32 },
3065 [TCA_CODEL_INTERVAL] = { .type = NL_A_U32 }
3068 struct nlattr *attrs[ARRAY_SIZE(tca_codel_policy)];
3070 if (!nl_parse_nested(nl_options, tca_codel_policy,
3071 attrs, ARRAY_SIZE(tca_codel_policy))) {
3072 VLOG_WARN_RL(&rl, "failed to parse CoDel class options");
3076 codel->target = nl_attr_get_u32(attrs[TCA_CODEL_TARGET]);
3077 codel->limit = nl_attr_get_u32(attrs[TCA_CODEL_LIMIT]);
3078 codel->interval = nl_attr_get_u32(attrs[TCA_CODEL_INTERVAL]);
/* tc_ops 'tc_load' callback: rebuilds userspace codel state from a kernel
 * qdisc description in 'nlmsg'. */
3083 codel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3085 struct nlattr *nlattr;
3090 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3095 error = codel_parse_tca_options__(nlattr, &codel);
3100 codel_install__(netdev, codel.target, codel.limit, codel.interval);
/* tc_ops 'tc_destroy' callback: frees the codel state. */
3106 codel_tc_destroy(struct tc *tc)
3108 struct codel *codel = CONTAINER_OF(tc, struct codel, tc);
/* tc_ops 'qdisc_get' callback: reports current parameters in 'details'. */
3114 codel_qdisc_get(const struct netdev *netdev, struct smap *details)
3116 const struct codel *codel = codel_get__(netdev);
3117 smap_add_format(details, "target", "%u", codel->target);
3118 smap_add_format(details, "limit", "%u", codel->limit);
3119 smap_add_format(details, "interval", "%u", codel->interval);
/* tc_ops 'qdisc_set' callback: reinstalls with parameters from 'details'
 * and updates the cached copy. */
3124 codel_qdisc_set(struct netdev *netdev, const struct smap *details)
3128 codel_parse_qdisc_details__(netdev, details, &codel);
3129 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3130 codel_get__(netdev)->target = codel.target;
3131 codel_get__(netdev)->limit = codel.limit;
3132 codel_get__(netdev)->interval = codel.interval;
/* tc_ops vtable binding the codel callbacks above to the "linux-codel"
 * OVSDB QoS type. */
3136 static const struct tc_ops tc_ops_codel = {
3137 "codel", /* linux_name */
3138 "linux-codel", /* ovs_name */
3139 CODEL_N_QUEUES, /* n_queues */
3152 /* FQ-CoDel traffic control class. */
3154 #define FQCODEL_N_QUEUES 0x0000
3156 /* In sufficiently new kernel headers these are defined as enums in
3157 * <linux/pkt_sched.h>. Define them here as macros to help out with older
3158 * kernels. (This overrides any enum definition in the header file but that's
3160 #define TCA_FQ_CODEL_TARGET 1
3161 #define TCA_FQ_CODEL_LIMIT 2
3162 #define TCA_FQ_CODEL_INTERVAL 3
3163 #define TCA_FQ_CODEL_ECN 4
3164 #define TCA_FQ_CODEL_FLOWS 5
3165 #define TCA_FQ_CODEL_QUANTUM 6
/* Returns the fqcodel state embedded in 'netdev_''s tc pointer.  Valid only
 * while the netdev's qdisc is linux-fq_codel. */
3176 static struct fqcodel *
3177 fqcodel_get__(const struct netdev *netdev_)
3179 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3180 return CONTAINER_OF(netdev->tc, struct fqcodel, tc);
/* Records a freshly allocated fqcodel state as 'netdev_''s current tc. */
3184 fqcodel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
3185 uint32_t interval, uint32_t flows, uint32_t quantum)
3187 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3188 struct fqcodel *fqcodel;
3190 fqcodel = xmalloc(sizeof *fqcodel);
3191 tc_init(&fqcodel->tc, &tc_ops_fqcodel);
3192 fqcodel->target = target;
3193 fqcodel->limit = limit;
3194 fqcodel->interval = interval;
3195 fqcodel->flows = flows;
3196 fqcodel->quantum = quantum;
3198 netdev->tc = &fqcodel->tc;
/* Replaces 'netdev''s root qdisc with an fq_codel qdisc carrying the given
 * parameters (0 selects a default).  Returns 0 on success, else errno. */
3202 fqcodel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
3203 uint32_t interval, uint32_t flows, uint32_t quantum)
3206 struct ofpbuf request;
3207 struct tcmsg *tcmsg;
3208 uint32_t otarget, olimit, ointerval, oflows, oquantum;
3211 tc_del_qdisc(netdev);
3213 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3214 NLM_F_EXCL | NLM_F_CREATE, &request);
3218 tcmsg->tcm_handle = tc_make_handle(1, 0);
3219 tcmsg->tcm_parent = TC_H_ROOT;
/* NOTE(review): the default interval here is 100000 (100ms) but
 * fqcodel_parse_qdisc_details__() below substitutes 1000000 for a missing
 * "interval" key — confirm whether the mismatch is intentional. */
3221 otarget = target ? target : 5000;
3222 olimit = limit ? limit : 10240;
3223 ointerval = interval ? interval : 100000;
3224 oflows = flows ? flows : 1024;
3225 oquantum = quantum ? quantum : 1514; /* fq_codel default quantum is 1514
3228 nl_msg_put_string(&request, TCA_KIND, "fq_codel");
3229 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3230 nl_msg_put_u32(&request, TCA_FQ_CODEL_TARGET, otarget);
3231 nl_msg_put_u32(&request, TCA_FQ_CODEL_LIMIT, olimit);
3232 nl_msg_put_u32(&request, TCA_FQ_CODEL_INTERVAL, ointerval);
3233 nl_msg_put_u32(&request, TCA_FQ_CODEL_FLOWS, oflows);
3234 nl_msg_put_u32(&request, TCA_FQ_CODEL_QUANTUM, oquantum);
3235 nl_msg_end_nested(&request, opt_offset);
3237 error = tc_transact(&request, NULL);
3239 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3240 "target %u, limit %u, interval %u, flows %u, quantum %u error %d(%s)",
3241 netdev_get_name(netdev),
3242 otarget, olimit, ointerval, oflows, oquantum,
3243 error, ovs_strerror(error));
/* Extracts fq_codel parameters from 'details' (ovsdb QoS other_config),
 * substituting defaults for missing/zero values. */
3249 fqcodel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
3250 const struct smap *details, struct fqcodel *fqcodel)
3252 const char *target_s;
3253 const char *limit_s;
3254 const char *interval_s;
3255 const char *flows_s;
3256 const char *quantum_s;
3258 target_s = smap_get(details, "target");
3259 limit_s = smap_get(details, "limit");
3260 interval_s = smap_get(details, "interval");
3261 flows_s = smap_get(details, "flows");
3262 quantum_s = smap_get(details, "quantum");
3263 fqcodel->target = target_s ? strtoull(target_s, NULL, 10) : 0;
3264 fqcodel->limit = limit_s ? strtoull(limit_s, NULL, 10) : 0;
3265 fqcodel->interval = interval_s ? strtoull(interval_s, NULL, 10) : 0;
3266 fqcodel->flows = flows_s ? strtoull(flows_s, NULL, 10) : 0;
3267 fqcodel->quantum = quantum_s ? strtoull(quantum_s, NULL, 10) : 0;
3268 if (!fqcodel->target) {
3269 fqcodel->target = 5000;
3271 if (!fqcodel->limit) {
3272 fqcodel->limit = 10240;
3274 if (!fqcodel->interval) {
3275 fqcodel->interval = 1000000;
3277 if (!fqcodel->flows) {
3278 fqcodel->flows = 1024;
3280 if (!fqcodel->quantum) {
3281 fqcodel->quantum = 1514;
/* tc_ops 'tc_install' callback: configures the kernel qdisc, then mirrors
 * the settings in userspace state on success. */
3286 fqcodel_tc_install(struct netdev *netdev, const struct smap *details)
3289 struct fqcodel fqcodel;
3291 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3292 error = fqcodel_setup_qdisc__(netdev, fqcodel.target, fqcodel.limit,
3293 fqcodel.interval, fqcodel.flows,
3296 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit,
3297 fqcodel.interval, fqcodel.flows, fqcodel.quantum);
/* Parses the TCA_OPTIONS attributes of a kernel fq_codel qdisc. */
3303 fqcodel_parse_tca_options__(struct nlattr *nl_options, struct fqcodel *fqcodel)
3305 static const struct nl_policy tca_fqcodel_policy[] = {
3306 [TCA_FQ_CODEL_TARGET] = { .type = NL_A_U32 },
3307 [TCA_FQ_CODEL_LIMIT] = { .type = NL_A_U32 },
3308 [TCA_FQ_CODEL_INTERVAL] = { .type = NL_A_U32 },
3309 [TCA_FQ_CODEL_FLOWS] = { .type = NL_A_U32 },
3310 [TCA_FQ_CODEL_QUANTUM] = { .type = NL_A_U32 }
3313 struct nlattr *attrs[ARRAY_SIZE(tca_fqcodel_policy)];
3315 if (!nl_parse_nested(nl_options, tca_fqcodel_policy,
3316 attrs, ARRAY_SIZE(tca_fqcodel_policy))) {
3317 VLOG_WARN_RL(&rl, "failed to parse FQ_CoDel class options");
3321 fqcodel->target = nl_attr_get_u32(attrs[TCA_FQ_CODEL_TARGET]);
3322 fqcodel->limit = nl_attr_get_u32(attrs[TCA_FQ_CODEL_LIMIT]);
3323 fqcodel->interval =nl_attr_get_u32(attrs[TCA_FQ_CODEL_INTERVAL]);
3324 fqcodel->flows = nl_attr_get_u32(attrs[TCA_FQ_CODEL_FLOWS]);
3325 fqcodel->quantum = nl_attr_get_u32(attrs[TCA_FQ_CODEL_QUANTUM]);
/* tc_ops 'tc_load' callback: rebuilds userspace fqcodel state from a kernel
 * qdisc description in 'nlmsg'. */
3330 fqcodel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3332 struct nlattr *nlattr;
3335 struct fqcodel fqcodel;
3337 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3342 error = fqcodel_parse_tca_options__(nlattr, &fqcodel);
3347 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3348 fqcodel.flows, fqcodel.quantum);
/* tc_ops 'tc_destroy' callback: frees the fqcodel state. */
3353 fqcodel_tc_destroy(struct tc *tc)
3355 struct fqcodel *fqcodel = CONTAINER_OF(tc, struct fqcodel, tc);
/* tc_ops 'qdisc_get' callback: reports current parameters in 'details'. */
3361 fqcodel_qdisc_get(const struct netdev *netdev, struct smap *details)
3363 const struct fqcodel *fqcodel = fqcodel_get__(netdev);
3364 smap_add_format(details, "target", "%u", fqcodel->target);
3365 smap_add_format(details, "limit", "%u", fqcodel->limit);
3366 smap_add_format(details, "interval", "%u", fqcodel->interval);
3367 smap_add_format(details, "flows", "%u", fqcodel->flows);
3368 smap_add_format(details, "quantum", "%u", fqcodel->quantum);
/* tc_ops 'qdisc_set' callback: reinstalls with parameters from 'details'
 * and updates the cached copy. */
3373 fqcodel_qdisc_set(struct netdev *netdev, const struct smap *details)
3375 struct fqcodel fqcodel;
3377 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3378 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3379 fqcodel.flows, fqcodel.quantum);
3380 fqcodel_get__(netdev)->target = fqcodel.target;
3381 fqcodel_get__(netdev)->limit = fqcodel.limit;
3382 fqcodel_get__(netdev)->interval = fqcodel.interval;
3383 fqcodel_get__(netdev)->flows = fqcodel.flows;
3384 fqcodel_get__(netdev)->quantum = fqcodel.quantum;
/* tc_ops vtable binding the fq_codel callbacks above to the
 * "linux-fq_codel" OVSDB QoS type. */
3388 static const struct tc_ops tc_ops_fqcodel = {
3389 "fq_codel", /* linux_name */
3390 "linux-fq_codel", /* ovs_name */
3391 FQCODEL_N_QUEUES, /* n_queues */
3404 /* SFQ traffic control class. */
3406 #define SFQ_N_QUEUES 0x0000
/* Returns the sfq state embedded in 'netdev_''s tc pointer.  Valid only
 * while the netdev's qdisc is linux-sfq. */
3415 sfq_get__(const struct netdev *netdev_)
3417 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3418 return CONTAINER_OF(netdev->tc, struct sfq, tc);
/* Records a freshly allocated sfq state (quantum, perturb) as 'netdev_''s
 * current tc. */
3422 sfq_install__(struct netdev *netdev_, uint32_t quantum, uint32_t perturb)
3424 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3427 sfq = xmalloc(sizeof *sfq);
3428 tc_init(&sfq->tc, &tc_ops_sfq);
3429 sfq->perturb = perturb;
3430 sfq->quantum = quantum;
3432 netdev->tc = &sfq->tc;
/* Replaces 'netdev''s root qdisc with an sfq qdisc.  A zero 'quantum' falls
 * back to the device MTU; a zero 'perturb' falls back to 10 seconds. */
3436 sfq_setup_qdisc__(struct netdev *netdev, uint32_t quantum, uint32_t perturb)
3438 struct tc_sfq_qopt opt;
3439 struct ofpbuf request;
3440 struct tcmsg *tcmsg;
3442 int mtu_error, error;
3443 mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3445 tc_del_qdisc(netdev);
3447 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3448 NLM_F_EXCL | NLM_F_CREATE, &request);
3452 tcmsg->tcm_handle = tc_make_handle(1, 0);
3453 tcmsg->tcm_parent = TC_H_ROOT;
3455 memset(&opt, 0, sizeof opt);
3458 opt.quantum = mtu; /* if we cannot find mtu, use default */
3461 opt.quantum = quantum;
3465 opt.perturb_period = 10;
3467 opt.perturb_period = perturb;
/* SFQ options travel as a raw struct, not nested attributes. */
3470 nl_msg_put_string(&request, TCA_KIND, "sfq");
3471 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3473 error = tc_transact(&request, NULL);
3475 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3476 "quantum %u, perturb %u error %d(%s)",
3477 netdev_get_name(netdev),
3478 opt.quantum, opt.perturb_period,
3479 error, ovs_strerror(error));
/* Extracts sfq parameters from 'details', defaulting quantum to the device
 * MTU (warning if the MTU is unavailable). */
3485 sfq_parse_qdisc_details__(struct netdev *netdev,
3486 const struct smap *details, struct sfq *sfq)
3488 const char *perturb_s;
3489 const char *quantum_s;
3493 perturb_s = smap_get(details, "perturb");
3494 quantum_s = smap_get(details, "quantum");
3495 sfq->perturb = perturb_s ? strtoull(perturb_s, NULL, 10) : 0;
3496 sfq->quantum = quantum_s ? strtoull(quantum_s, NULL, 10) : 0;
3497 if (!sfq->perturb) {
3501 if (!sfq->quantum) {
3502 mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3506 VLOG_WARN_RL(&rl, "when using SFQ, you must specify quantum on a "
3507 "device without mtu");
/* tc_ops 'tc_install' callback: configures the kernel qdisc, then mirrors
 * the settings in userspace state on success. */
3514 sfq_tc_install(struct netdev *netdev, const struct smap *details)
3519 sfq_parse_qdisc_details__(netdev, details, &sfq);
3520 error = sfq_setup_qdisc__(netdev, sfq.quantum, sfq.perturb);
3522 sfq_install__(netdev, sfq.quantum, sfq.perturb);
3528 sfq_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3530 const struct tc_sfq_qopt *sfq;
3531 struct nlattr *nlattr;
3535 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3537 sfq = nl_attr_get(nlattr);
3538 sfq_install__(netdev, sfq->perturb_period, sfq->quantum);
/* tc_ops 'tc_destroy' callback: frees the sfq state. */
3546 sfq_tc_destroy(struct tc *tc)
3548 struct sfq *sfq = CONTAINER_OF(tc, struct sfq, tc);
/* tc_ops 'qdisc_get' callback: reports current quantum/perturb. */
3554 sfq_qdisc_get(const struct netdev *netdev, struct smap *details)
3556 const struct sfq *sfq = sfq_get__(netdev);
3557 smap_add_format(details, "quantum", "%u", sfq->quantum);
3558 smap_add_format(details, "perturb", "%u", sfq->perturb);
/* tc_ops 'qdisc_set' callback: reinstalls with parameters from 'details'
 * and updates the cached copy. */
3563 sfq_qdisc_set(struct netdev *netdev, const struct smap *details)
3567 sfq_parse_qdisc_details__(netdev, details, &sfq);
3568 sfq_install__(netdev, sfq.quantum, sfq.perturb);
3569 sfq_get__(netdev)->quantum = sfq.quantum;
3570 sfq_get__(netdev)->perturb = sfq.perturb;
/* tc_ops vtable binding the sfq callbacks above to the "linux-sfq"
 * OVSDB QoS type. */
3574 static const struct tc_ops tc_ops_sfq = {
3575 "sfq", /* linux_name */
3576 "linux-sfq", /* ovs_name */
3577 SFQ_N_QUEUES, /* n_queues */
3590 /* HTB traffic control class. */
3592 #define HTB_N_QUEUES 0xf000
3593 #define HTB_RATE2QUANTUM 10
3597 unsigned int max_rate; /* In bytes/s. */
3601 struct tc_queue tc_queue;
3602 unsigned int min_rate; /* In bytes/s. */
3603 unsigned int max_rate; /* In bytes/s. */
3604 unsigned int burst; /* In bytes. */
3605 unsigned int priority; /* Lower values are higher priorities. */
3609 htb_get__(const struct netdev *netdev_)
3611 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3612 return CONTAINER_OF(netdev->tc, struct htb, tc);
3616 htb_install__(struct netdev *netdev_, uint64_t max_rate)
3618 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3621 htb = xmalloc(sizeof *htb);
3622 tc_init(&htb->tc, &tc_ops_htb);
3623 htb->max_rate = max_rate;
3625 netdev->tc = &htb->tc;
3628 /* Create an HTB qdisc.
3630 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
3632 htb_setup_qdisc__(struct netdev *netdev)
3635 struct tc_htb_glob opt;
3636 struct ofpbuf request;
3637 struct tcmsg *tcmsg;
3639 tc_del_qdisc(netdev);
3641 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3642 NLM_F_EXCL | NLM_F_CREATE, &request);
3646 tcmsg->tcm_handle = tc_make_handle(1, 0);
3647 tcmsg->tcm_parent = TC_H_ROOT;
3649 nl_msg_put_string(&request, TCA_KIND, "htb");
3651 memset(&opt, 0, sizeof opt);
3652 opt.rate2quantum = HTB_RATE2QUANTUM;
3656 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3657 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
3658 nl_msg_end_nested(&request, opt_offset);
3660 return tc_transact(&request, NULL);
3663 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
3664 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
3666 htb_setup_class__(struct netdev *netdev, unsigned int handle,
3667 unsigned int parent, struct htb_class *class)
3670 struct tc_htb_opt opt;
3671 struct ofpbuf request;
3672 struct tcmsg *tcmsg;
3676 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3678 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
3679 netdev_get_name(netdev));
3683 memset(&opt, 0, sizeof opt);
3684 tc_fill_rate(&opt.rate, class->min_rate, mtu);
3685 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
3686 /* Makes sure the quantum is at least MTU. Setting quantum will
3687 * make htb ignore the r2q for this class. */
3688 if ((class->min_rate / HTB_RATE2QUANTUM) < mtu) {
3691 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
3692 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
3693 opt.prio = class->priority;
3695 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3699 tcmsg->tcm_handle = handle;
3700 tcmsg->tcm_parent = parent;
3702 nl_msg_put_string(&request, TCA_KIND, "htb");
3703 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3704 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
3705 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
3706 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
3707 nl_msg_end_nested(&request, opt_offset);
3709 error = tc_transact(&request, NULL);
3711 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3712 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
3713 netdev_get_name(netdev),
3714 tc_get_major(handle), tc_get_minor(handle),
3715 tc_get_major(parent), tc_get_minor(parent),
3716 class->min_rate, class->max_rate,
3717 class->burst, class->priority, ovs_strerror(error));
3722 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
3723 * description of them into 'details'. The description complies with the
3724 * specification given in the vswitch database documentation for linux-htb
3727 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
3729 static const struct nl_policy tca_htb_policy[] = {
3730 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
3731 .min_len = sizeof(struct tc_htb_opt) },
3734 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
3735 const struct tc_htb_opt *htb;
3737 if (!nl_parse_nested(nl_options, tca_htb_policy,
3738 attrs, ARRAY_SIZE(tca_htb_policy))) {
3739 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
3743 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
3744 class->min_rate = htb->rate.rate;
3745 class->max_rate = htb->ceil.rate;
3746 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
3747 class->priority = htb->prio;
3752 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3753 struct htb_class *options,
3754 struct netdev_queue_stats *stats)
3756 struct nlattr *nl_options;
3757 unsigned int handle;
3760 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3761 if (!error && queue_id) {
3762 unsigned int major = tc_get_major(handle);
3763 unsigned int minor = tc_get_minor(handle);
3764 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3765 *queue_id = minor - 1;
3770 if (!error && options) {
3771 error = htb_parse_tca_options__(nl_options, options);
3777 htb_parse_qdisc_details__(struct netdev *netdev_,
3778 const struct smap *details, struct htb_class *hc)
3780 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3781 const char *max_rate_s;
3783 max_rate_s = smap_get(details, "max-rate");
3784 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3785 if (!hc->max_rate) {
3786 enum netdev_features current;
3788 netdev_linux_read_features(netdev);
3789 current = !netdev->get_features_error ? netdev->current : 0;
3790 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3792 hc->min_rate = hc->max_rate;
/* Fills 'hc' from per-queue 'details' ("min-rate", "max-rate", "burst",
 * "priority").  Rates arrive in bits/s and are stored in bytes/s; min-rate
 * is clamped to [MTU, qdisc max-rate] and max-rate to [min-rate, qdisc
 * max-rate].  Requires the device MTU, so fails (warns) for devices that
 * lack one.
 * NOTE(review): listing gaps — 'int error, mtu;' declarations, braces and
 * the max-rate fallback expression at original line 3824 are missing. */
3798 htb_parse_class_details__(struct netdev *netdev,
3799 const struct smap *details, struct htb_class *hc)
3801 const struct htb *htb = htb_get__(netdev);
3802 const char *min_rate_s = smap_get(details, "min-rate");
3803 const char *max_rate_s = smap_get(details, "max-rate");
3804 const char *burst_s = smap_get(details, "burst");
3805 const char *priority_s = smap_get(details, "priority");
3808 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3810 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
3811 netdev_get_name(netdev));
3815 /* HTB requires at least an mtu sized min-rate to send any traffic even
3816 * on uncongested links. */
3817 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3818 hc->min_rate = MAX(hc->min_rate, mtu);
3819 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
3822 hc->max_rate = (max_rate_s
3823 ? strtoull(max_rate_s, NULL, 10) / 8
3825 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
3826 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
3830 * According to hints in the documentation that I've read, it is important
3831 * that 'burst' be at least as big as the largest frame that might be
3832 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
3833 * but having it a bit too small is a problem. Since netdev_get_mtu()
3834 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
3835 * the MTU. We actually add 64, instead of 14, as a guard against
3836 * additional headers get tacked on somewhere that we're not aware of. */
3837 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
3838 hc->burst = MAX(hc->burst, mtu + 64);
3841 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Queries the kernel for the HTB class 'handle' (child of 'parent') on
 * 'netdev' and parses the reply into 'options' and 'stats' (either may be
 * NULL).  Returns 0 on success, positive errno on failure.
 * NOTE(review): 'int error;' and 'return error;' lines are missing from the
 * listing. */
3847 htb_query_class__(const struct netdev *netdev, unsigned int handle,
3848 unsigned int parent, struct htb_class *options,
3849 struct netdev_queue_stats *stats)
3851 struct ofpbuf *reply;
3854 error = tc_query_class(netdev, handle, parent, &reply);
3856 error = htb_parse_tcmsg__(reply, NULL, options, stats);
3857 ofpbuf_delete(reply);
/* tc_install callback for "linux-htb": creates the root HTB qdisc, then a
 * default class 1:fffe sized from 'details', and records the configuration
 * via htb_install__().  Returns 0 on success, positive errno on failure. */
3863 htb_tc_install(struct netdev *netdev, const struct smap *details)
3867 error = htb_setup_qdisc__(netdev);
3869 struct htb_class hc;
3871 htb_parse_qdisc_details__(netdev, details, &hc);
     /* 1:fffe is the default class under root handle 1:0. */
3872 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3873 tc_make_handle(1, 0), &hc);
3875 htb_install__(netdev, hc.max_rate);
/* Returns the htb_class that embeds 'queue' as its tc_queue member. */
3881 static struct htb_class *
3882 htb_class_cast__(const struct tc_queue *queue)
3884 return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Inserts or updates the cached htb_class for 'queue_id' in 'netdev''s HTB
 * queue hmap, copying the rate/burst/priority fields from 'hc'.  Allocates
 * a new entry (owned by the hmap) if the queue is not yet known. */
3888 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
3889 const struct htb_class *hc)
3891 struct htb *htb = htb_get__(netdev);
3892 size_t hash = hash_int(queue_id, 0);
3893 struct tc_queue *queue;
3894 struct htb_class *hcp;
3896 queue = tc_find_queue__(netdev, queue_id, hash);
3898 hcp = htb_class_cast__(queue);
     /* Not found: create and insert a fresh entry. */
3900 hcp = xmalloc(sizeof *hcp);
3901 queue = &hcp->tc_queue;
3902 queue->queue_id = queue_id;
3903 queue->created = time_msec();
3904 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
3907 hcp->min_rate = hc->min_rate;
3908 hcp->max_rate = hc->max_rate;
3909 hcp->burst = hc->burst;
3910 hcp->priority = hc->priority;
/* tc_load callback for "linux-htb": reconstructs OVS's view of an existing
 * kernel HTB configuration.  Reads the default class 1:fffe for the qdisc
 * max-rate, then dumps all classes and caches each parseable one as a
 * queue. */
3914 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3917 struct queue_dump_state state;
3918 struct htb_class hc;
3920 /* Get qdisc options. */
3922 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3923 htb_install__(netdev, hc.max_rate);
3926 if (!start_queue_dump(netdev, &state)) {
3929 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
3930 unsigned int queue_id;
     /* Non-queue classes (e.g. 1:fffe itself) fail the parse and are
      * skipped. */
3932 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3933 htb_update_queue__(netdev, queue_id, &hc);
3936 finish_queue_dump(&state);
/* tc_destroy callback for "linux-htb": frees every cached queue entry.
 * NOTE(review): the lines freeing 'hc' and the enclosing 'htb' are missing
 * from the listing; presumably they follow the hmap_remove(). */
3942 htb_tc_destroy(struct tc *tc)
3944 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
3945 struct htb_class *hc, *next;
3947 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
3948 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* qdisc_get callback: reports the configured max-rate, converting the
 * internally stored bytes/s back to bits/s (* 8). */
3956 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
3958 const struct htb *htb = htb_get__(netdev);
3959 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
/* qdisc_set callback: re-parses 'details' and reconfigures the default
 * class 1:fffe; on success updates the cached qdisc max_rate. */
3964 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
3966 struct htb_class hc;
3969 htb_parse_qdisc_details__(netdev, details, &hc);
3970 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3971 tc_make_handle(1, 0), &hc);
3973 htb_get__(netdev)->max_rate = hc.max_rate;
/* class_get callback: exports a queue's configuration as bits/s values.
 * "max-rate" is omitted when it equals "min-rate" (the default produced
 * when no explicit max-rate was configured). */
3979 htb_class_get(const struct netdev *netdev OVS_UNUSED,
3980 const struct tc_queue *queue, struct smap *details)
3982 const struct htb_class *hc = htb_class_cast__(queue);
3984 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3985 if (hc->min_rate != hc->max_rate) {
3986 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3988 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
3990 smap_add_format(details, "priority", "%u", hc->priority);
/* class_set callback: parses 'details', programs kernel class 1:(queue_id+1)
 * under the default class 1:fffe, and updates the local cache on success. */
3996 htb_class_set(struct netdev *netdev, unsigned int queue_id,
3997 const struct smap *details)
3999 struct htb_class hc;
4002 error = htb_parse_class_details__(netdev, details, &hc);
     /* Queue N lives at tc handle 1:(N+1); minor 0 is the qdisc itself. */
4007 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
4008 tc_make_handle(1, 0xfffe), &hc);
4013 htb_update_queue__(netdev, queue_id, &hc);
/* class_delete callback: removes the kernel class and, on success, the
 * cached entry.  NOTE(review): the free(hc) and 'return error;' lines are
 * missing from the listing. */
4018 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
4020 struct htb_class *hc = htb_class_cast__(queue);
4021 struct htb *htb = htb_get__(netdev);
4024 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
4026 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* class_get_stats callback: fetches only the kernel statistics for the
 * queue's class (options output is NULL). */
4033 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
4034 struct netdev_queue_stats *stats)
4036 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
4037 tc_make_handle(1, 0xfffe), NULL, stats);
/* class_dump_stats callback: parses one class message from a queue dump and
 * invokes 'cb' with the stats, but only for handles that map to OVS queues
 * (major 1, minor in [1, HTB_N_QUEUES]). */
4041 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
4042 const struct ofpbuf *nlmsg,
4043 netdev_dump_queue_stats_cb *cb, void *aux)
4045 struct netdev_queue_stats stats;
4046 unsigned int handle, major, minor;
4049 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
4054 major = tc_get_major(handle);
4055 minor = tc_get_minor(handle);
4056 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
4057 (*cb)(minor - 1, &stats, aux);
/* tc_ops vtable for the "linux-htb" QoS type.
 * NOTE(review): several member lines (tc_install .. class_delete) are
 * missing from this listing. */
4062 static const struct tc_ops tc_ops_htb = {
4063 "htb", /* linux_name */
4064 "linux-htb", /* ovs_name */
4065 HTB_N_QUEUES, /* n_queues */
4074 htb_class_get_stats,
4075 htb_class_dump_stats
4078 /* "linux-hfsc" traffic control class. */
4080 #define HFSC_N_QUEUES 0xf000
/* NOTE(review): the struct hfsc / struct hfsc_class declarations are mostly
 * missing from this listing; the tc_queue member below appears to belong to
 * struct hfsc_class — confirm against the full source. */
4088 struct tc_queue tc_queue;
/* Returns 'netdev_''s tc state downcast to the HFSC-specific struct.  Only
 * valid while the netdev's qdisc is "linux-hfsc". */
4093 static struct hfsc *
4094 hfsc_get__(const struct netdev *netdev_)
4096 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4097 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
/* Returns the hfsc_class that embeds 'queue' as its tc_queue member. */
4100 static struct hfsc_class *
4101 hfsc_class_cast__(const struct tc_queue *queue)
4103 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Allocates and registers a fresh HFSC tc state on 'netdev_' with the given
 * 'max_rate' (bytes/s).  Ownership passes to the netdev's 'tc' pointer. */
4107 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
4109 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4112 hfsc = xmalloc(sizeof *hfsc);
4113 tc_init(&hfsc->tc, &tc_ops_hfsc);
4114 hfsc->max_rate = max_rate;
4115 netdev->tc = &hfsc->tc;
/* Inserts or updates the cached hfsc_class for 'queue_id', copying min/max
 * rates from 'hc'.  Mirrors htb_update_queue__() for the HFSC qdisc. */
4119 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
4120 const struct hfsc_class *hc)
4124 struct hfsc_class *hcp;
4125 struct tc_queue *queue;
4127 hfsc = hfsc_get__(netdev);
4128 hash = hash_int(queue_id, 0);
4130 queue = tc_find_queue__(netdev, queue_id, hash);
4132 hcp = hfsc_class_cast__(queue);
     /* Not found: create and insert a fresh entry. */
4134 hcp = xmalloc(sizeof *hcp);
4135 queue = &hcp->tc_queue;
4136 queue->queue_id = queue_id;
4137 queue->created = time_msec();
4138 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
4141 hcp->min_rate = hc->min_rate;
4142 hcp->max_rate = hc->max_rate;
/* Parses the nested TCA_OPTIONS of an HFSC class into 'class'.  OVS only
 * supports linear service curves (m1 == 0, d == 0) with identical RSC/FSC
 * and FSC.m2 <= USC.m2; anything else is rejected with a warning.  min_rate
 * comes from the fair-share curve (FSC.m2), max_rate from the upper-limit
 * curve (USC.m2).
 * NOTE(review): the policy's attribute index lines ([TCA_HFSC_RSC] etc.)
 * and the error returns are missing from this listing. */
4146 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
4148 const struct tc_service_curve *rsc, *fsc, *usc;
4149 static const struct nl_policy tca_hfsc_policy[] = {
4151 .type = NL_A_UNSPEC,
4153 .min_len = sizeof(struct tc_service_curve),
4156 .type = NL_A_UNSPEC,
4158 .min_len = sizeof(struct tc_service_curve),
4161 .type = NL_A_UNSPEC,
4163 .min_len = sizeof(struct tc_service_curve),
4166 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
4168 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
4169 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
4170 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
4174 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
4175 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
4176 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
/* Nonzero m1/d means a two-segment (non-linear) curve, which OVS never
 * configures and cannot represent. */
4178 if (rsc->m1 != 0 || rsc->d != 0 ||
4179 fsc->m1 != 0 || fsc->d != 0 ||
4180 usc->m1 != 0 || usc->d != 0) {
4181 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4182 "Non-linear service curves are not supported.");
4186 if (rsc->m2 != fsc->m2) {
4187 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4188 "Real-time service curves are not supported ");
4192 if (rsc->m2 > usc->m2) {
4193 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4194 "Min-rate service curve is greater than "
4195 "the max-rate service curve.");
4199 class->min_rate = fsc->m2;
4200 class->max_rate = usc->m2;
/* Parses an HFSC class netlink reply in 'tcmsg'.  On success stores the OVS
 * queue ID into '*queue_id', the class config into '*options', and the
 * statistics into '*stats'; each output may be NULL.  Mirrors
 * htb_parse_tcmsg__(). */
4205 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
4206 struct hfsc_class *options,
4207 struct netdev_queue_stats *stats)
4210 unsigned int handle;
4211 struct nlattr *nl_options;
4213 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
4219 unsigned int major, minor;
4221 major = tc_get_major(handle);
4222 minor = tc_get_minor(handle);
4223 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
      /* tc minor numbers are 1-based; OVS queue IDs are 0-based. */
4224 *queue_id = minor - 1;
4231 error = hfsc_parse_tca_options__(nl_options, options);
/* Queries the kernel for HFSC class 'handle' under 'parent' and parses the
 * reply into 'options' and 'stats' (either may be NULL).  Returns 0 on
 * success, positive errno on failure. */
4238 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
4239 unsigned int parent, struct hfsc_class *options,
4240 struct netdev_queue_stats *stats)
4243 struct ofpbuf *reply;
4245 error = tc_query_class(netdev, handle, parent, &reply);
4250 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
4251 ofpbuf_delete(reply);
/* Fills 'class' from qdisc-level 'details'.  "max-rate" is bits/s converted
 * to bytes/s; falls back to the link speed (or 100 Mbps) when absent or
 * zero, same policy as htb_parse_qdisc_details__().  min_rate is set equal
 * to max_rate for the root class. */
4256 hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
4257 struct hfsc_class *class)
4259 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4261 const char *max_rate_s;
4263 max_rate_s = smap_get(details, "max-rate");
4264 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
4267 enum netdev_features current;
4269 netdev_linux_read_features(netdev);
4270 current = !netdev->get_features_error ? netdev->current : 0;
4271 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
4274 class->min_rate = max_rate;
4275 class->max_rate = max_rate;
/* Fills 'class' from per-queue 'details' ("min-rate", "max-rate").  Rates
 * arrive in bits/s and are stored in bytes/s; min-rate is clamped to
 * [1, qdisc max-rate] and max-rate to [min-rate, qdisc max-rate].
 * NOTE(review): the max-rate fallback expression at original line 4297 is
 * missing from the listing. */
4279 hfsc_parse_class_details__(struct netdev *netdev,
4280 const struct smap *details,
4281 struct hfsc_class * class)
4283 const struct hfsc *hfsc;
4284 uint32_t min_rate, max_rate;
4285 const char *min_rate_s, *max_rate_s;
4287 hfsc = hfsc_get__(netdev);
4288 min_rate_s = smap_get(details, "min-rate");
4289 max_rate_s = smap_get(details, "max-rate");
4291 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
4292 min_rate = MAX(min_rate, 1);
4293 min_rate = MIN(min_rate, hfsc->max_rate);
4295 max_rate = (max_rate_s
4296 ? strtoull(max_rate_s, NULL, 10) / 8
4298 max_rate = MAX(max_rate, min_rate);
4299 max_rate = MIN(max_rate, hfsc->max_rate);
4301 class->min_rate = min_rate;
4302 class->max_rate = max_rate;
4307 /* Create an HFSC qdisc.
4309 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
4311 hfsc_setup_qdisc__(struct netdev * netdev)
4313 struct tcmsg *tcmsg;
4314 struct ofpbuf request;
4315 struct tc_hfsc_qopt opt;
/* Remove any existing root qdisc first; NLM_F_EXCL below would otherwise
 * fail. */
4317 tc_del_qdisc(netdev);
4319 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
4320 NLM_F_EXCL | NLM_F_CREATE, &request);
4326 tcmsg->tcm_handle = tc_make_handle(1, 0);
4327 tcmsg->tcm_parent = TC_H_ROOT;
4329 memset(&opt, 0, sizeof opt);
/* NOTE(review): the line setting opt.defcls (original 4330-4331) is missing
 * from the listing. */
4332 nl_msg_put_string(&request, TCA_KIND, "hfsc");
4333 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
4335 return tc_transact(&request, NULL);
4338 /* Create an HFSC class.
4340 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
4341 * sc rate <min_rate> ul rate <max_rate>" */
4343 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
4344 unsigned int parent, struct hfsc_class *class)
4348 struct tcmsg *tcmsg;
4349 struct ofpbuf request;
4350 struct tc_service_curve min, max;
4352 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
4358 tcmsg->tcm_handle = handle;
4359 tcmsg->tcm_parent = parent;
/* Linear service curves: m1 and d are zeroed (lines missing from listing);
 * only the long-term slope m2 carries the configured rate. */
4363 min.m2 = class->min_rate;
4367 max.m2 = class->max_rate;
4369 nl_msg_put_string(&request, TCA_KIND, "hfsc");
4370 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
/* RSC and FSC are both set to 'min' (see hfsc_parse_tca_options__, which
 * requires rsc->m2 == fsc->m2); USC is the upper limit. */
4371 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
4372 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
4373 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
4374 nl_msg_end_nested(&request, opt_offset);
4376 error = tc_transact(&request, NULL);
4378 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
4379 "min-rate %ubps, max-rate %ubps (%s)",
4380 netdev_get_name(netdev),
4381 tc_get_major(handle), tc_get_minor(handle),
4382 tc_get_major(parent), tc_get_minor(parent),
4383 class->min_rate, class->max_rate, ovs_strerror(error));
/* tc_install callback for "linux-hfsc": creates the root HFSC qdisc, the
 * default class 1:fffe sized from 'details', and registers the state via
 * hfsc_install__(). */
4390 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
4393 struct hfsc_class class;
4395 error = hfsc_setup_qdisc__(netdev);
4401 hfsc_parse_qdisc_details__(netdev, details, &class);
4402 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4403 tc_make_handle(1, 0), &class);
4409 hfsc_install__(netdev, class.max_rate);
/* tc_load callback for "linux-hfsc": rebuilds OVS's view from an existing
 * kernel HFSC configuration — default class 1:fffe for the max-rate, then a
 * class dump to cache each queue.  Mirrors htb_tc_load(). */
4414 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4417 struct queue_dump_state state;
4418 struct hfsc_class hc;
4421 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
4422 hfsc_install__(netdev, hc.max_rate);
4424 if (!start_queue_dump(netdev, &state)) {
4428 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
4429 unsigned int queue_id;
4431 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
4432 hfsc_update_queue__(netdev, queue_id, &hc);
4436 finish_queue_dump(&state);
/* tc_destroy callback for "linux-hfsc": frees every cached queue entry.
 * NOTE(review): the free(hc) / free(hfsc) lines are missing from the
 * listing. */
4441 hfsc_tc_destroy(struct tc *tc)
4444 struct hfsc_class *hc, *next;
4446 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
4448 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
4449 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* qdisc_get callback: reports the configured max-rate in bits/s. */
4458 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
4460 const struct hfsc *hfsc;
4461 hfsc = hfsc_get__(netdev);
4462 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
/* qdisc_set callback: re-parses 'details' and reconfigures the default
 * class 1:fffe; on success updates the cached qdisc max_rate. */
4467 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
4470 struct hfsc_class class;
4472 hfsc_parse_qdisc_details__(netdev, details, &class);
4473 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4474 tc_make_handle(1, 0), &class);
4477 hfsc_get__(netdev)->max_rate = class.max_rate;
/* class_get callback: exports a queue's min/max rate in bits/s.  "max-rate"
 * is omitted when equal to "min-rate" (no explicit maximum configured). */
4484 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
4485 const struct tc_queue *queue, struct smap *details)
4487 const struct hfsc_class *hc;
4489 hc = hfsc_class_cast__(queue);
4490 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
4491 if (hc->min_rate != hc->max_rate) {
4492 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
/* class_set callback: parses 'details', programs kernel class
 * 1:(queue_id+1) under the default class 1:fffe, and updates the local
 * cache on success. */
4498 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
4499 const struct smap *details)
4502 struct hfsc_class class;
4504 error = hfsc_parse_class_details__(netdev, details, &class);
4509 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
4510 tc_make_handle(1, 0xfffe), &class);
4515 hfsc_update_queue__(netdev, queue_id, &class);
/* class_delete callback: removes the kernel class and, on success, the
 * cached entry.  NOTE(review): the free(hc) and 'return error;' lines are
 * missing from the listing. */
4520 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
4524 struct hfsc_class *hc;
4526 hc = hfsc_class_cast__(queue);
4527 hfsc = hfsc_get__(netdev);
4529 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
4531 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* class_get_stats callback: fetches only the kernel statistics for the
 * queue's class (options output is NULL). */
4538 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
4539 struct netdev_queue_stats *stats)
4541 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
4542 tc_make_handle(1, 0xfffe), NULL, stats);
/* class_dump_stats callback: parses one class message from a queue dump and
 * invokes 'cb' for handles that map to OVS queues (major 1, minor in
 * [1, HFSC_N_QUEUES]). */
4546 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
4547 const struct ofpbuf *nlmsg,
4548 netdev_dump_queue_stats_cb *cb, void *aux)
4550 struct netdev_queue_stats stats;
4551 unsigned int handle, major, minor;
4554 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
4559 major = tc_get_major(handle);
4560 minor = tc_get_minor(handle);
4561 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
4562 (*cb)(minor - 1, &stats, aux);
/* tc_ops vtable for the "linux-hfsc" QoS type. */
4567 static const struct tc_ops tc_ops_hfsc = {
4568 "hfsc", /* linux_name */
4569 "linux-hfsc", /* ovs_name */
4570 HFSC_N_QUEUES, /* n_queues */
4571 hfsc_tc_install, /* tc_install */
4572 hfsc_tc_load, /* tc_load */
4573 hfsc_tc_destroy, /* tc_destroy */
4574 hfsc_qdisc_get, /* qdisc_get */
4575 hfsc_qdisc_set, /* qdisc_set */
4576 hfsc_class_get, /* class_get */
4577 hfsc_class_set, /* class_set */
4578 hfsc_class_delete, /* class_delete */
4579 hfsc_class_get_stats, /* class_get_stats */
4580 hfsc_class_dump_stats /* class_dump_stats */
4583 /* "linux-default" traffic control class.
4585 * This class represents the default, unnamed Linux qdisc. It corresponds to
4586 * the "" (empty string) QoS type in the OVS database. */
/* Points 'netdev_''s tc state at a shared, immutable singleton — no
 * allocation, so nothing to free on teardown. */
4589 default_install__(struct netdev *netdev_)
4591 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4592 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
4594 /* Nothing but a tc class implementation is allowed to write to a tc. This
4595 * class never does that, so we can legitimately use a const tc object. */
4596 netdev->tc = CONST_CAST(struct tc *, &tc);
/* tc_install callback for "linux-default": ignores 'details' and installs
 * the shared default-tc singleton. */
4600 default_tc_install(struct netdev *netdev,
4601 const struct smap *details OVS_UNUSED)
4603 default_install__(netdev);
/* tc_load callback for "linux-default": same as install; there is no kernel
 * state to read back. */
4608 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4610 default_install__(netdev);
/* tc_ops vtable for the default (unnamed) qdisc; all per-queue operations
 * are unsupported (NULL).  NOTE(review): the ovs_name/n_queues/tc_install/
 * tc_load member lines are missing from this listing. */
4614 static const struct tc_ops tc_ops_default = {
4615 NULL, /* linux_name */
4620 NULL, /* tc_destroy */
4621 NULL, /* qdisc_get */
4622 NULL, /* qdisc_set */
4623 NULL, /* class_get */
4624 NULL, /* class_set */
4625 NULL, /* class_delete */
4626 NULL, /* class_get_stats */
4627 NULL /* class_dump_stats */
4630 /* "linux-other" traffic control class.
/* tc_load callback for "linux-other" (any qdisc OVS does not recognize):
 * installs a shared, immutable singleton like the default class. */
4635 other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
4637 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4638 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
4640 /* Nothing but a tc class implementation is allowed to write to a tc. This
4641 * class never does that, so we can legitimately use a const tc object. */
4642 netdev->tc = CONST_CAST(struct tc *, &tc);
/* tc_ops vtable for unrecognized qdiscs: read-only, no configuration or
 * per-queue support. */
4646 static const struct tc_ops tc_ops_other = {
4647 NULL, /* linux_name */
4648 "linux-other", /* ovs_name */
4650 NULL, /* tc_install */
4652 NULL, /* tc_destroy */
4653 NULL, /* qdisc_get */
4654 NULL, /* qdisc_set */
4655 NULL, /* class_get */
4656 NULL, /* class_set */
4657 NULL, /* class_delete */
4658 NULL, /* class_get_stats */
4659 NULL /* class_dump_stats */
4662 /* Traffic control. */
4664 /* Number of kernel "tc" ticks per second. */
/* Both globals below are initialized lazily by read_psched() from
 * /proc/net/psched (see further down) and read by the tc_*_to_* helpers. */
4665 static double ticks_per_s;
4667 /* Number of kernel "jiffies" per second. This is used for the purpose of
4668 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
4669 * one jiffy's worth of data.
4671 * There are two possibilities here:
4673 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
4674 * approximate range of 100 to 1024. That means that we really need to
4675 * make sure that the qdisc can buffer that much data.
4677 * - 'buffer_hz' is an absurdly large number. That means that the kernel
4678 * has finely granular timers and there's no need to fudge additional room
4679 * for buffers. (There's no extra effort needed to implement that: the
4680 * large 'buffer_hz' is used as a divisor, so practically any number will
4681 * come out as 0 in the division. Small integer results in the case of
4682 * really high dividends won't have any real effect anyhow.)
4684 static unsigned int buffer_hz;
4686 /* Returns tc handle 'major':'minor'. */
4688 tc_make_handle(unsigned int major, unsigned int minor)
4690 return TC_H_MAKE(major << 16, minor);
4693 /* Returns the major number from 'handle'. */
4695 tc_get_major(unsigned int handle)
4697 return TC_H_MAJ(handle) >> 16;
4700 /* Returns the minor number from 'handle'. */
4702 tc_get_minor(unsigned int handle)
4704 return TC_H_MIN(handle);
/* Initializes 'request' as an RTM tc netlink request of the given 'type'
 * and 'flags' for 'netdev', returning a pointer to the embedded tcmsg whose
 * tcm_handle/tcm_parent the caller must fill in.  NOTE(review): the
 * NULL-return path on get_ifindex() failure is missing from the listing —
 * callers appear to treat a NULL return as failure. */
4707 static struct tcmsg *
4708 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
4709 struct ofpbuf *request)
4711 struct tcmsg *tcmsg;
4715 error = get_ifindex(netdev, &ifindex);
4720 ofpbuf_init(request, 512);
4721 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
4722 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
4723 tcmsg->tcm_family = AF_UNSPEC;
4724 tcmsg->tcm_ifindex = ifindex;
4725 /* Caller should fill in tcmsg->tcm_handle. */
4726 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' on the NETLINK_ROUTE socket and, if 'replyp' is nonnull,
 * stores the reply there.  Always uninitializes 'request' (consumes it).
 * Returns 0 on success, positive errno on failure. */
4732 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
4734 int error = nl_transact(NETLINK_ROUTE, request, replyp);
4735 ofpbuf_uninit(request);
4739 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
4740 * policing configuration.
4742 * This function is equivalent to running the following when 'add' is true:
4743 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
4745 * This function is equivalent to running the following when 'add' is false:
4746 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
4748 * The configuration and stats may be seen with the following command:
4749 * /sbin/tc -s qdisc show dev <devname>
4751 * Returns 0 if successful, otherwise a positive errno value.
4754 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
4756 struct ofpbuf request;
4757 struct tcmsg *tcmsg;
4759 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
4760 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
4762 tcmsg = tc_make_request(netdev, type, flags, &request);
4766 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
4767 tcmsg->tcm_parent = TC_H_INGRESS;
4768 nl_msg_put_string(&request, TCA_KIND, "ingress");
4769 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
4771 error = tc_transact(&request, NULL);
4773 /* If we're deleting the qdisc, don't worry about some of the
4774 * error conditions. */
/* ENOENT/EINVAL on delete just means there was no ingress qdisc to
 * remove, which satisfies the caller's intent. */
4775 if (!add && (error == ENOENT || error == EINVAL)) {
4784 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
4787 * This function is equivalent to running:
4788 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
4789 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
4792 * The configuration and stats may be seen with the following command:
4793 * /sbin/tc -s filter show dev <devname> parent ffff:
4795 * Returns 0 if successful, otherwise a positive errno value.
4798 tc_add_policer(struct netdev *netdev,
4799 uint32_t kbits_rate, uint32_t kbits_burst)
4801 struct tc_police tc_police;
4802 struct ofpbuf request;
4803 struct tcmsg *tcmsg;
4804 size_t basic_offset;
4805 size_t police_offset;
/* NOTE(review): the declarations of 'error' and 'mtu' (and mtu's value,
 * used below) are missing from the listing. */
4809 memset(&tc_police, 0, sizeof tc_police);
4810 tc_police.action = TC_POLICE_SHOT;
4811 tc_police.mtu = mtu;
/* kbits -> bytes/s: * 1000 for kilo, / 8 for bits-to-bytes. */
4812 tc_fill_rate(&tc_police.rate, ((uint64_t) kbits_rate * 1000)/8, mtu);
4814 /* The following appears wrong in two ways:
4816 * - tc_bytes_to_ticks() should take "bytes" as quantity for both of its
4817 * arguments (or at least consistently "bytes" as both or "bits" as
4818 * both), but this supplies bytes for the first argument and bits for the
4821 * - In networking a kilobit is usually 1000 bits but this uses 1024 bits.
4823 * However if you "fix" those problems then "tc filter show ..." shows
4824 * "125000b", meaning 125,000 bits, when OVS configures it for 1000 kbit ==
4825 * 1,000,000 bits, whereas this actually ends up doing the right thing from
4826 * tc's point of view. Whatever. */
4827 tc_police.burst = tc_bytes_to_ticks(
4828 tc_police.rate.rate, MIN(UINT32_MAX / 1024, kbits_burst) * 1024);
4830 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
4831 NLM_F_EXCL | NLM_F_CREATE, &request);
/* tcm_info packs the filter priority (49) and protocol (ETH_P_ALL). */
4835 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
4836 tcmsg->tcm_info = tc_make_handle(49,
4837 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
4839 nl_msg_put_string(&request, TCA_KIND, "basic");
4840 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4841 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
4842 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
4843 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
4844 nl_msg_end_nested(&request, police_offset);
4845 nl_msg_end_nested(&request, basic_offset);
4847 error = tc_transact(&request, NULL);
/* NOTE(review): the function header (presumably 'static void
 * read_psched(void)') is missing from this listing; the body below parses
 * /proc/net/psched once per process to derive 'ticks_per_s' and
 * 'buffer_hz' for the tc_*_to_* conversion helpers. */
4858 /* The values in psched are not individually very meaningful, but they are
4859 * important. The tables below show some values seen in the wild.
4863 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
4864 * (Before that, there are hints that it was 1000000000.)
4866 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
4870 * -----------------------------------
4871 * [1] 000c8000 000f4240 000f4240 00000064
4872 * [2] 000003e8 00000400 000f4240 3b9aca00
4873 * [3] 000003e8 00000400 000f4240 3b9aca00
4874 * [4] 000003e8 00000400 000f4240 00000064
4875 * [5] 000003e8 00000040 000f4240 3b9aca00
4876 * [6] 000003e8 00000040 000f4240 000000f9
4878 * a b c d ticks_per_s buffer_hz
4879 * ------- --------- ---------- ------------- ----------- -------------
4880 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
4881 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4882 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4883 * [4] 1,000 1,024 1,000,000 100 976,562 100
4884 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
4885 * [6] 1,000 64 1,000,000 249 15,625,000 249
4887 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
4888 * [2] 2.6.26-1-686-bigmem from Debian lenny
4889 * [3] 2.6.26-2-sparc64 from Debian lenny
4890 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
4891 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
4892 * [6] 2.6.34 from kernel.org on KVM
4894 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4895 static const char fn[] = "/proc/net/psched";
4896 unsigned int a, b, c, d;
/* Only the first caller does the work; later callers return early. */
4899 if (!ovsthread_once_start(&once)) {
4906 stream = fopen(fn, "r");
4908 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
4912 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
4913 VLOG_WARN("%s: read failed", fn);
4917 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
4921 VLOG_WARN("%s: invalid scheduler parameters", fn);
4925 ticks_per_s = (double) a * c / b;
4929 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
4932 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
4935 ovsthread_once_done(&once);
4938 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
4939 * rate of 'rate' bytes per second. */
4941 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
/* NOTE(review): a read_psched() call to initialize ticks_per_s appears to
 * precede this return in the full source — line missing from listing. */
4944 return (rate * ticks) / ticks_per_s;
4947 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
4948 * rate of 'rate' bytes per second. */
4950 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
/* rate == 0 guards the division; 0 ticks for a zero rate. */
4953 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
4956 /* Returns the number of bytes that need to be reserved for qdisc buffering at
4957 * a transmission rate of 'rate' bytes per second. */
4959 tc_buffer_per_jiffy(unsigned int rate)
/* With a huge buffer_hz (fine-grained kernel timers) this is ~0, meaning no
 * extra buffering is needed; see the comment on 'buffer_hz'. */
4962 return rate / buffer_hz;
4965 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
4966 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
4967 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
4968 * stores NULL into it if it is absent.
4970 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
4973 * Returns 0 if successful, otherwise a positive errno value. */
4975 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
4976 struct nlattr **options)
4978 static const struct nl_policy tca_policy[] = {
4979 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
4980 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
4982 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
/* Attributes begin after the netlink header plus the fixed tcmsg. */
4984 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4985 tca_policy, ta, ARRAY_SIZE(ta))) {
4986 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
4991 *kind = nl_attr_get_string(ta[TCA_KIND]);
4995 *options = ta[TCA_OPTIONS];
5010 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
5011 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
5012 * into '*options', and its queue statistics into '*stats'. Any of the output
5013 * arguments may be null.
5015 * Returns 0 if successful, otherwise a positive errno value. */
5017 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
5018 struct nlattr **options, struct netdev_queue_stats *stats)
5020 static const struct nl_policy tca_policy[] = {
5021 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
5022 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
5024 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
5026 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
5027 tca_policy, ta, ARRAY_SIZE(ta))) {
5028 VLOG_WARN_RL(&rl, "failed to parse class message");
5033 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
5034 *handlep = tc->tcm_handle;
5038 *options = ta[TCA_OPTIONS];
5042 const struct gnet_stats_queue *gsq;
5043 struct gnet_stats_basic gsb;
5045 static const struct nl_policy stats_policy[] = {
5046 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
5047 .min_len = sizeof gsb },
5048 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
5049 .min_len = sizeof *gsq },
5051 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
5053 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
5054 sa, ARRAY_SIZE(sa))) {
5055 VLOG_WARN_RL(&rl, "failed to parse class stats");
5059 /* Alignment issues screw up the length of struct gnet_stats_basic on
5060 * some arch/bitsize combinations. Newer versions of Linux have a
5061 * struct gnet_stats_basic_packed, but we can't depend on that. The
5062 * easiest thing to do is just to make a copy. */
5063 memset(&gsb, 0, sizeof gsb)
5064 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
5065 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
5066 stats->tx_bytes = gsb.bytes;
5067 stats->tx_packets = gsb.packets;
5069 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
5070 stats->tx_errors = gsq->drops;
/* Error path: zero the caller's stats so they are never left
 * uninitialized. */
5080 memset(stats, 0, sizeof *stats);
5085 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
5088 tc_query_class(const struct netdev *netdev,
5089 unsigned int handle, unsigned int parent,
5090 struct ofpbuf **replyp)
5092 struct ofpbuf request;
5093 struct tcmsg *tcmsg;
/* NLM_F_ECHO asks the kernel to echo the class back as the reply. */
5096 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
5100 tcmsg->tcm_handle = handle;
5101 tcmsg->tcm_parent = parent;
5103 error = tc_transact(&request, replyp);
5105 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
5106 netdev_get_name(netdev),
5107 tc_get_major(handle), tc_get_minor(handle),
5108 tc_get_major(parent), tc_get_minor(parent),
5109 ovs_strerror(error));
5114 /* Equivalent to "tc class del dev <name> handle <handle>". */
5116 tc_delete_class(const struct netdev *netdev, unsigned int handle)
5118 struct ofpbuf request;
5119 struct tcmsg *tcmsg;
5122 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
5126 tcmsg->tcm_handle = handle;
5127 tcmsg->tcm_parent = 0;
5129 error = tc_transact(&request, NULL);
5131 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
5132 netdev_get_name(netdev),
5133 tc_get_major(handle), tc_get_minor(handle),
5134 ovs_strerror(error));
5139 /* Equivalent to "tc qdisc del dev <name> root". */
5141 tc_del_qdisc(struct netdev *netdev_)
5143 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5144 struct ofpbuf request;
5145 struct tcmsg *tcmsg;
5148 tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
5152 tcmsg->tcm_handle = tc_make_handle(1, 0);
5153 tcmsg->tcm_parent = TC_H_ROOT;
5155 error = tc_transact(&request, NULL);
5156 if (error == EINVAL) {
5157 /* EINVAL probably means that the default qdisc was in use, in which
5158 * case we've accomplished our purpose. */
/* On success also tear down the cached OVS-side tc state, since the
 * kernel qdisc it described is gone. */
5161 if (!error && netdev->tc) {
5162 if (netdev->tc->ops->tc_destroy) {
5163 netdev->tc->ops->tc_destroy(netdev->tc);
5171 getqdisc_is_safe(void)
5173 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5174 static bool safe = false;
5176 if (ovsthread_once_start(&once)) {
5177 struct utsname utsname;
5180 if (uname(&utsname) == -1) {
5181 VLOG_WARN("uname failed (%s)", ovs_strerror(errno));
5182 } else if (!ovs_scan(utsname.release, "%d.%d", &major, &minor)) {
5183 VLOG_WARN("uname reported bad OS release (%s)", utsname.release);
5184 } else if (major < 2 || (major == 2 && minor < 35)) {
5185 VLOG_INFO("disabling unsafe RTM_GETQDISC in Linux kernel %s",
5190 ovsthread_once_done(&once);
5195 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
5196 * kernel to determine what they are. Returns 0 if successful, otherwise a
5197 * positive errno value. */
5199 tc_query_qdisc(const struct netdev *netdev_)
5201 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5202 struct ofpbuf request, *qdisc;
5203 const struct tc_ops *ops;
5204 struct tcmsg *tcmsg;
5212 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
5213 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
5214 * 2.6.35 without that fix backported to it.
5216 * To avoid the OOPS, we must not make a request that would attempt to dump
5217 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
5218 * few others. There are a few ways that I can see to do this, but most of
5219 * them seem to be racy (and if you lose the race the kernel OOPSes). The
5220 * technique chosen here is to assume that any non-default qdisc that we
5221 * create will have a class with handle 1:0. The built-in qdiscs only have
5222 * a class with handle 0:0.
5224 * On Linux 2.6.35+ we use the straightforward method because it allows us
5225 * to handle non-builtin qdiscs without handle 1:0 (e.g. codel). However,
5226 * in such a case we get no response at all from the kernel (!) if a
5227 * builtin qdisc is in use (which is later caught by "!error &&
5228 * !qdisc->size"). */
5229 tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
5233 tcmsg->tcm_handle = tc_make_handle(getqdisc_is_safe() ? 0 : 1, 0);
5234 tcmsg->tcm_parent = getqdisc_is_safe() ? TC_H_ROOT : 0;
5236 /* Figure out what tc class to instantiate. */
5237 error = tc_transact(&request, &qdisc);
5238 if (!error && qdisc->size) {
5241 error = tc_parse_qdisc(qdisc, &kind, NULL);
5243 ops = &tc_ops_other;
5245 ops = tc_lookup_linux_name(kind);
5247 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
5248 VLOG_DBG_RL(&rl2, "unknown qdisc \"%s\"", kind);
5250 ops = &tc_ops_other;
5253 } else if ((!error && !qdisc->size) || error == ENOENT) {
5254 /* Either it's a built-in qdisc, or (on Linux pre-2.6.35) it's a qdisc
5255 * set up by some other entity that doesn't have a handle 1:0. We will
5256 * assume that it's the system default qdisc. */
5257 ops = &tc_ops_default;
5260 /* Who knows? Maybe the device got deleted. */
5261 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
5262 netdev_get_name(netdev_), ovs_strerror(error));
5263 ops = &tc_ops_other;
5266 /* Instantiate it. */
5267 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
5268 ovs_assert((load_error == 0) == (netdev->tc != NULL));
5269 ofpbuf_delete(qdisc);
5271 return error ? error : load_error;
5274 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
5275 approximate the time to transmit packets of various lengths. For an MTU of
5276 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
5277 represents two possible packet lengths; for a MTU of 513 through 1024, four
5278 possible lengths; and so on.
5280 Returns, for the specified 'mtu', the number of bits that packet lengths
5281 need to be shifted right to fit within such a 256-entry table. */
5283 tc_calc_cell_log(unsigned int mtu)
5288 mtu = ETH_PAYLOAD_MAX;
5290 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
5292 for (cell_log = 0; mtu >= 256; cell_log++) {
5299 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
5302 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
5304 memset(rate, 0, sizeof *rate);
5305 rate->cell_log = tc_calc_cell_log(mtu);
5306 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
5307 /* rate->cell_align = 0; */ /* distro headers. */
5308 rate->mpu = ETH_TOTAL_MIN;
/* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
 * attribute of the specified "type".
 *
 * See tc_calc_cell_log() above for a description of "rtab"s. */
static void
tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
{
    uint32_t *rtab;
    unsigned int i;

    rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
    for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
        /* Each entry 'i' covers packet lengths up to (i + 1) << cell_log,
         * clamped below by the minimum packet unit. */
        unsigned packet_size = (i + 1) << rate->cell_log;
        if (packet_size < rate->mpu) {
            packet_size = rate->mpu;
        }
        rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
    }
}
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * 0 is fine.) */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    /* The burst must be at least large enough to cover one jiffy's worth of
     * traffic plus one full packet, or HTB cannot sustain 'Bps'. */
    unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
    return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
}
/* Linux-only functions declared in netdev-linux.h */
5345 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
5346 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
5348 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
5349 const char *flag_name, bool enable)
5351 const char *netdev_name = netdev_get_name(netdev);
5352 struct ethtool_value evalue;
5356 COVERAGE_INC(netdev_get_ethtool);
5357 memset(&evalue, 0, sizeof evalue);
5358 error = netdev_linux_do_ethtool(netdev_name,
5359 (struct ethtool_cmd *)&evalue,
5360 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5365 COVERAGE_INC(netdev_set_ethtool);
5366 new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
5367 if (new_flags == evalue.data) {
5370 evalue.data = new_flags;
5371 error = netdev_linux_do_ethtool(netdev_name,
5372 (struct ethtool_cmd *)&evalue,
5373 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
5378 COVERAGE_INC(netdev_get_ethtool);
5379 memset(&evalue, 0, sizeof evalue);
5380 error = netdev_linux_do_ethtool(netdev_name,
5381 (struct ethtool_cmd *)&evalue,
5382 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5387 if (new_flags != evalue.data) {
5388 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
5389 "device %s failed", enable ? "enable" : "disable",
5390 flag_name, netdev_name);
/* Utility functions. */
5399 /* Copies 'src' into 'dst', performing format conversion in the process. */
5401 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
5402 const struct rtnl_link_stats *src)
5404 dst->rx_packets = src->rx_packets;
5405 dst->tx_packets = src->tx_packets;
5406 dst->rx_bytes = src->rx_bytes;
5407 dst->tx_bytes = src->tx_bytes;
5408 dst->rx_errors = src->rx_errors;
5409 dst->tx_errors = src->tx_errors;
5410 dst->rx_dropped = src->rx_dropped;
5411 dst->tx_dropped = src->tx_dropped;
5412 dst->multicast = src->multicast;
5413 dst->collisions = src->collisions;
5414 dst->rx_length_errors = src->rx_length_errors;
5415 dst->rx_over_errors = src->rx_over_errors;
5416 dst->rx_crc_errors = src->rx_crc_errors;
5417 dst->rx_frame_errors = src->rx_frame_errors;
5418 dst->rx_fifo_errors = src->rx_fifo_errors;
5419 dst->rx_missed_errors = src->rx_missed_errors;
5420 dst->tx_aborted_errors = src->tx_aborted_errors;
5421 dst->tx_carrier_errors = src->tx_carrier_errors;
5422 dst->tx_fifo_errors = src->tx_fifo_errors;
5423 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
5424 dst->tx_window_errors = src->tx_window_errors;
5427 /* Copies 'src' into 'dst', performing format conversion in the process. */
5429 netdev_stats_from_rtnl_link_stats64(struct netdev_stats *dst,
5430 const struct rtnl_link_stats64 *src)
5432 dst->rx_packets = src->rx_packets;
5433 dst->tx_packets = src->tx_packets;
5434 dst->rx_bytes = src->rx_bytes;
5435 dst->tx_bytes = src->tx_bytes;
5436 dst->rx_errors = src->rx_errors;
5437 dst->tx_errors = src->tx_errors;
5438 dst->rx_dropped = src->rx_dropped;
5439 dst->tx_dropped = src->tx_dropped;
5440 dst->multicast = src->multicast;
5441 dst->collisions = src->collisions;
5442 dst->rx_length_errors = src->rx_length_errors;
5443 dst->rx_over_errors = src->rx_over_errors;
5444 dst->rx_crc_errors = src->rx_crc_errors;
5445 dst->rx_frame_errors = src->rx_frame_errors;
5446 dst->rx_fifo_errors = src->rx_fifo_errors;
5447 dst->rx_missed_errors = src->rx_missed_errors;
5448 dst->tx_aborted_errors = src->tx_aborted_errors;
5449 dst->tx_carrier_errors = src->tx_carrier_errors;
5450 dst->tx_fifo_errors = src->tx_fifo_errors;
5451 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
5452 dst->tx_window_errors = src->tx_window_errors;
5456 get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
5458 struct ofpbuf request;
5459 struct ofpbuf *reply;
5462 ofpbuf_init(&request, 0);
5463 nl_msg_put_nlmsghdr(&request,
5464 sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
5465 RTM_GETLINK, NLM_F_REQUEST);
5466 ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
5467 nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
5468 error = nl_transact(NETLINK_ROUTE, &request, &reply);
5469 ofpbuf_uninit(&request);
5474 if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
5475 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS64);
5476 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats64)) {
5477 netdev_stats_from_rtnl_link_stats64(stats, nl_attr_get(a));
5480 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS);
5481 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
5482 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
5485 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
5490 VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");
5495 ofpbuf_delete(reply);
5500 get_flags(const struct netdev *dev, unsigned int *flags)
5506 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
5508 *flags = ifr.ifr_flags;
5514 set_flags(const char *name, unsigned int flags)
5518 ifr.ifr_flags = flags;
5519 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
5523 do_get_ifindex(const char *netdev_name)
5528 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5529 COVERAGE_INC(netdev_get_ifindex);
5531 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
5533 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
5534 netdev_name, ovs_strerror(error));
5537 return ifr.ifr_ifindex;
5541 get_ifindex(const struct netdev *netdev_, int *ifindexp)
5543 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5545 if (!(netdev->cache_valid & VALID_IFINDEX)) {
5546 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
5549 netdev->get_ifindex_error = -ifindex;
5550 netdev->ifindex = 0;
5552 netdev->get_ifindex_error = 0;
5553 netdev->ifindex = ifindex;
5555 netdev->cache_valid |= VALID_IFINDEX;
5558 *ifindexp = netdev->ifindex;
5559 return netdev->get_ifindex_error;
5563 get_etheraddr(const char *netdev_name, struct eth_addr *ea)
5569 memset(&ifr, 0, sizeof ifr);
5570 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5571 COVERAGE_INC(netdev_get_hwaddr);
5572 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
5574 /* ENODEV probably means that a vif disappeared asynchronously and
5575 * hasn't been removed from the database yet, so reduce the log level
5576 * to INFO for that case. */
5577 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
5578 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
5579 netdev_name, ovs_strerror(error));
5582 hwaddr_family = ifr.ifr_hwaddr.sa_family;
5583 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
5584 VLOG_INFO("%s device has unknown hardware address family %d",
5585 netdev_name, hwaddr_family);
5588 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
5593 set_etheraddr(const char *netdev_name, const struct eth_addr mac)
5598 memset(&ifr, 0, sizeof ifr);
5599 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5600 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
5601 memcpy(ifr.ifr_hwaddr.sa_data, &mac, ETH_ADDR_LEN);
5602 COVERAGE_INC(netdev_set_hwaddr);
5603 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
5605 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
5606 netdev_name, ovs_strerror(error));
5612 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
5613 int cmd, const char *cmd_name)
5618 memset(&ifr, 0, sizeof ifr);
5619 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
5620 ifr.ifr_data = (caddr_t) ecmd;
5623 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
5625 if (error != EOPNOTSUPP) {
5626 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
5627 "failed: %s", cmd_name, name, ovs_strerror(error));
5629 /* The device doesn't support this operation. That's pretty
5630 * common, so there's no point in logging anything. */
5637 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
5638 int cmd, const char *cmd_name)
5643 ifr.ifr_addr.sa_family = AF_INET;
5644 error = af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
5646 const struct sockaddr_in *sin = ALIGNED_CAST(struct sockaddr_in *,
5648 *ip = sin->sin_addr;
/* Returns an AF_PACKET raw socket or a negative errno value. */
5655 af_packet_sock(void)
5657 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5660 if (ovsthread_once_start(&once)) {
5661 sock = socket(AF_PACKET, SOCK_RAW, 0);
5663 int error = set_nonblocking(sock);
5670 VLOG_ERR("failed to create packet socket: %s",
5671 ovs_strerror(errno));
5673 ovsthread_once_done(&once);