2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2015 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
23 #include <arpa/inet.h>
25 #include <linux/filter.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <sys/types.h>
37 #include <sys/ioctl.h>
38 #include <sys/socket.h>
39 #include <sys/utsname.h>
40 #include <netpacket/packet.h>
42 #include <net/if_arp.h>
43 #include <net/if_packet.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
52 #include "dp-packet.h"
53 #include "dpif-netlink.h"
54 #include "dpif-netdev.h"
55 #include "dynamic-string.h"
56 #include "fatal-signal.h"
59 #include "netdev-provider.h"
60 #include "netdev-vport.h"
61 #include "netlink-notifier.h"
62 #include "netlink-socket.h"
65 #include "openflow/openflow.h"
66 #include "ovs-atomic.h"
68 #include "poll-loop.h"
69 #include "rtnetlink.h"
71 #include "socket-util.h"
74 #include "unaligned.h"
75 #include "openvswitch/vlog.h"
77 VLOG_DEFINE_THIS_MODULE(netdev_linux);
79 COVERAGE_DEFINE(netdev_set_policing);
80 COVERAGE_DEFINE(netdev_arp_lookup);
81 COVERAGE_DEFINE(netdev_get_ifindex);
82 COVERAGE_DEFINE(netdev_get_hwaddr);
83 COVERAGE_DEFINE(netdev_set_hwaddr);
84 COVERAGE_DEFINE(netdev_get_ethtool);
85 COVERAGE_DEFINE(netdev_set_ethtool);
88 /* These were introduced in Linux 2.6.14, so they might be missing if we have
90 #ifndef ADVERTISED_Pause
91 #define ADVERTISED_Pause (1 << 13)
93 #ifndef ADVERTISED_Asym_Pause
94 #define ADVERTISED_Asym_Pause (1 << 14)
97 /* These were introduced in Linux 2.6.24, so they might be missing if we
98 * have old headers. */
99 #ifndef ETHTOOL_GFLAGS
100 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
102 #ifndef ETHTOOL_SFLAGS
103 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
106 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
109 #define TC_RTAB_SIZE 1024
112 /* Linux 2.6.21 introduced struct tpacket_auxdata.
113 * Linux 2.6.27 added the tp_vlan_tci member.
114 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
115 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
116 * TP_STATUS_VLAN_TPID_VALID.
118 * With all this churn it's easiest to unconditionally define a replacement
119 * structure that has everything we want.
121 #ifndef PACKET_AUXDATA
122 #define PACKET_AUXDATA 8
124 #ifndef TP_STATUS_VLAN_VALID
125 #define TP_STATUS_VLAN_VALID (1 << 4)
127 #ifndef TP_STATUS_VLAN_TPID_VALID
128 #define TP_STATUS_VLAN_TPID_VALID (1 << 6)
130 #undef tpacket_auxdata
131 #define tpacket_auxdata rpl_tpacket_auxdata
132 struct tpacket_auxdata {
138 uint16_t tp_vlan_tci;
139 uint16_t tp_vlan_tpid;
142 /* Linux 2.6.27 introduced ethtool_cmd_speed
144 * To avoid revisiting problems reported with using configure to detect
145 * compatibility (see report at
146 * http://openvswitch.org/pipermail/dev/2014-October/047978.html)
147 * unconditionally replace ethtool_cmd_speed. */
#define ethtool_cmd_speed rpl_ethtool_cmd_speed
/* Combines the low and high 16-bit halves of the link speed stored in
 * 'ep' into a single 32-bit value, in Mbps. */
static inline uint32_t
rpl_ethtool_cmd_speed(const struct ethtool_cmd *ep)
{
    uint32_t hi = ep->speed_hi;

    return (hi << 16) | ep->speed;
}
154 /* Linux 2.6.30 introduced supported and advertised flags for
155 * 1G base KX, and 10G base KX4, KR and R. */
156 #ifndef SUPPORTED_1000baseKX_Full
157 #define SUPPORTED_1000baseKX_Full (1 << 17)
158 #define SUPPORTED_10000baseKX4_Full (1 << 18)
159 #define SUPPORTED_10000baseKR_Full (1 << 19)
160 #define SUPPORTED_10000baseR_FEC (1 << 20)
161 #define ADVERTISED_1000baseKX_Full (1 << 17)
162 #define ADVERTISED_10000baseKX4_Full (1 << 18)
163 #define ADVERTISED_10000baseKR_Full (1 << 19)
164 #define ADVERTISED_10000baseR_FEC (1 << 20)
167 /* Linux 3.5 introduced supported and advertised flags for
168 * 40G base KR4, CR4, SR4 and LR4. */
169 #ifndef SUPPORTED_40000baseKR4_Full
170 #define SUPPORTED_40000baseKR4_Full (1 << 23)
171 #define SUPPORTED_40000baseCR4_Full (1 << 24)
172 #define SUPPORTED_40000baseSR4_Full (1 << 25)
173 #define SUPPORTED_40000baseLR4_Full (1 << 26)
174 #define ADVERTISED_40000baseKR4_Full (1 << 23)
175 #define ADVERTISED_40000baseCR4_Full (1 << 24)
176 #define ADVERTISED_40000baseSR4_Full (1 << 25)
177 #define ADVERTISED_40000baseLR4_Full (1 << 26)
180 /* Linux 2.6.35 introduced IFLA_STATS64 and rtnl_link_stats64.
182 * Tests for rtnl_link_stats64 don't seem to consistently work, e.g. on
183 * 2.6.32-431.29.2.el6.x86_64 (see report at
184 * http://openvswitch.org/pipermail/dev/2014-October/047978.html). Maybe
185 * if_link.h is not self-contained on those kernels. It is easiest to
186 * unconditionally define a replacement. */
188 #define IFLA_STATS64 23
190 #define rtnl_link_stats64 rpl_rtnl_link_stats64
191 struct rtnl_link_stats64 {
203 uint64_t rx_length_errors;
204 uint64_t rx_over_errors;
205 uint64_t rx_crc_errors;
206 uint64_t rx_frame_errors;
207 uint64_t rx_fifo_errors;
208 uint64_t rx_missed_errors;
210 uint64_t tx_aborted_errors;
211 uint64_t tx_carrier_errors;
212 uint64_t tx_fifo_errors;
213 uint64_t tx_heartbeat_errors;
214 uint64_t tx_window_errors;
216 uint64_t rx_compressed;
217 uint64_t tx_compressed;
221 VALID_IFINDEX = 1 << 0,
222 VALID_ETHERADDR = 1 << 1,
226 VALID_POLICING = 1 << 5,
227 VALID_VPORT_STAT_ERROR = 1 << 6,
228 VALID_DRVINFO = 1 << 7,
229 VALID_FEATURES = 1 << 8,
232 /* Traffic control. */
234 /* An instance of a traffic control class. Always associated with a particular
237 * Each TC implementation subclasses this with whatever additional data it
240 const struct tc_ops *ops;
241 struct hmap queues; /* Contains "struct tc_queue"s.
242 * Read by generic TC layer.
243 * Written only by TC implementation. */
246 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
248 /* One traffic control queue.
250 * Each TC implementation subclasses this with whatever additional data it
253 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
254 unsigned int queue_id; /* OpenFlow queue ID. */
255 long long int created; /* Time queue was created, in msecs. */
258 /* A particular kind of traffic control. Each implementation generally maps to
259 * one particular Linux qdisc class.
261 * The functions below return 0 if successful or a positive errno value on
262 * failure, except where otherwise noted. All of them must be provided, except
263 * where otherwise noted. */
265 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
266 * This is null for tc_ops_default and tc_ops_other, for which there are no
267 * appropriate values. */
268 const char *linux_name;
270 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
271 const char *ovs_name;
273 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
274 * queues. The queues are numbered 0 through n_queues - 1. */
275 unsigned int n_queues;
277 /* Called to install this TC class on 'netdev'. The implementation should
278 * make the Netlink calls required to set up 'netdev' with the right qdisc
279 * and configure it according to 'details'. The implementation may assume
280 * that the current qdisc is the default; that is, there is no need for it
281 * to delete the current qdisc before installing itself.
283 * The contents of 'details' should be documented as valid for 'ovs_name'
284 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
285 * (which is built as ovs-vswitchd.conf.db(8)).
287 * This function must return 0 if and only if it sets 'netdev->tc' to an
288 * initialized 'struct tc'.
290 * (This function is null for tc_ops_other, which cannot be installed. For
291 * other TC classes it should always be nonnull.) */
292 int (*tc_install)(struct netdev *netdev, const struct smap *details);
294 /* Called when the netdev code determines (through a Netlink query) that
295 * this TC class's qdisc is installed on 'netdev', but we didn't install
296 * it ourselves and so don't know any of the details.
298 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
299 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
300 * implementation should parse the other attributes of 'nlmsg' as
301 * necessary to determine its configuration. If necessary it should also
302 * use Netlink queries to determine the configuration of queues on
305 * This function must return 0 if and only if it sets 'netdev->tc' to an
306 * initialized 'struct tc'. */
307 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
309 /* Destroys the data structures allocated by the implementation as part of
310 * 'tc'. (This includes destroying 'tc->queues' by calling
313 * The implementation should not need to perform any Netlink calls. If
314 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
315 * (But it may not be desirable.)
317 * This function may be null if 'tc' is trivial. */
318 void (*tc_destroy)(struct tc *tc);
320 /* Retrieves details of 'netdev->tc' configuration into 'details'.
322 * The implementation should not need to perform any Netlink calls, because
323 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
324 * cached the configuration.
326 * The contents of 'details' should be documented as valid for 'ovs_name'
327 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
328 * (which is built as ovs-vswitchd.conf.db(8)).
330 * This function may be null if 'tc' is not configurable.
332 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
334 /* Reconfigures 'netdev->tc' according to 'details', performing any
335 * required Netlink calls to complete the reconfiguration.
337 * The contents of 'details' should be documented as valid for 'ovs_name'
338 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
339 * (which is built as ovs-vswitchd.conf.db(8)).
341 * This function may be null if 'tc' is not configurable.
343 int (*qdisc_set)(struct netdev *, const struct smap *details);
345 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
346 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
348 * The contents of 'details' should be documented as valid for 'ovs_name'
349 * in the "other_config" column in the "Queue" table in
350 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
352 * The implementation should not need to perform any Netlink calls, because
353 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
354 * cached the queue configuration.
356 * This function may be null if 'tc' does not have queues ('n_queues' is
358 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
359 struct smap *details);
361 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
362 * 'details', perfoming any required Netlink calls to complete the
363 * reconfiguration. The caller ensures that 'queue_id' is less than
366 * The contents of 'details' should be documented as valid for 'ovs_name'
367 * in the "other_config" column in the "Queue" table in
368 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
370 * This function may be null if 'tc' does not have queues or its queues are
371 * not configurable. */
372 int (*class_set)(struct netdev *, unsigned int queue_id,
373 const struct smap *details);
375 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
376 * tc_queue's within 'netdev->tc->queues'.
378 * This function may be null if 'tc' does not have queues or its queues
379 * cannot be deleted. */
380 int (*class_delete)(struct netdev *, struct tc_queue *queue);
382 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
383 * 'struct tc_queue's within 'netdev->tc->queues'.
385 * On success, initializes '*stats'.
387 * This function may be null if 'tc' does not have queues or if it cannot
388 * report queue statistics. */
389 int (*class_get_stats)(const struct netdev *netdev,
390 const struct tc_queue *queue,
391 struct netdev_queue_stats *stats);
393 /* Extracts queue stats from 'nlmsg', which is a response to a
394 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
396 * This function may be null if 'tc' does not have queues or if it cannot
397 * report queue statistics. */
398 int (*class_dump_stats)(const struct netdev *netdev,
399 const struct ofpbuf *nlmsg,
400 netdev_dump_queue_stats_cb *cb, void *aux);
404 tc_init(struct tc *tc, const struct tc_ops *ops)
407 hmap_init(&tc->queues);
411 tc_destroy(struct tc *tc)
413 hmap_destroy(&tc->queues);
416 static const struct tc_ops tc_ops_htb;
417 static const struct tc_ops tc_ops_hfsc;
418 static const struct tc_ops tc_ops_codel;
419 static const struct tc_ops tc_ops_fqcodel;
420 static const struct tc_ops tc_ops_sfq;
421 static const struct tc_ops tc_ops_default;
422 static const struct tc_ops tc_ops_other;
424 static const struct tc_ops *const tcs[] = {
425 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
426 &tc_ops_hfsc, /* Hierarchical fair service curve. */
427 &tc_ops_codel, /* Controlled delay */
428 &tc_ops_fqcodel, /* Fair queue controlled delay */
429 &tc_ops_sfq, /* Stochastic fair queueing */
430 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
431 &tc_ops_other, /* Some other qdisc. */
435 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
436 static unsigned int tc_get_major(unsigned int handle);
437 static unsigned int tc_get_minor(unsigned int handle);
439 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
440 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
441 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
443 static struct tcmsg *tc_make_request(const struct netdev *, int type,
444 unsigned int flags, struct ofpbuf *);
445 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
446 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
447 static int tc_add_policer(struct netdev *,
448 uint32_t kbits_rate, uint32_t kbits_burst);
450 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
451 struct nlattr **options);
452 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
453 struct nlattr **options,
454 struct netdev_queue_stats *);
455 static int tc_query_class(const struct netdev *,
456 unsigned int handle, unsigned int parent,
457 struct ofpbuf **replyp);
458 static int tc_delete_class(const struct netdev *, unsigned int handle);
460 static int tc_del_qdisc(struct netdev *netdev);
461 static int tc_query_qdisc(const struct netdev *netdev);
463 static int tc_calc_cell_log(unsigned int mtu);
464 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
465 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
466 const struct tc_ratespec *rate);
467 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
469 struct netdev_linux {
472 /* Protects all members below. */
473 struct ovs_mutex mutex;
475 unsigned int cache_valid;
477 bool miimon; /* Link status of last poll. */
478 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
479 struct timer miimon_timer;
481 /* The following are figured out "on demand" only. They are only valid
482 * when the corresponding VALID_* bit in 'cache_valid' is set. */
484 struct eth_addr etheraddr;
485 struct in_addr address, netmask;
488 unsigned int ifi_flags;
489 long long int carrier_resets;
490 uint32_t kbits_rate; /* Policing data. */
491 uint32_t kbits_burst;
492 int vport_stats_error; /* Cached error code from vport_get_stats().
493 0 or an errno value. */
494 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
495 int ether_addr_error; /* Cached error code from set/get etheraddr. */
496 int netdev_policing_error; /* Cached error code from set policing. */
497 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
498 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
499 int in4_error; /* Cached error code from reading in4 addr. */
500 int in6_error; /* Cached error code from reading in6 addr. */
502 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
503 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
504 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
506 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
509 /* For devices of class netdev_tap_class only. */
513 struct netdev_rxq_linux {
514 struct netdev_rxq up;
519 /* This is set pretty low because we probably won't learn anything from the
520 * additional log messages. */
521 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
523 /* Polling miimon status for all ports causes performance degradation when
524 * handling a large number of ports. If there are no devices using miimon, then
525 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
527 * Readers do not depend on this variable synchronizing with the related
528 * changes in the device miimon status, so we can use atomic_count. */
529 static atomic_count miimon_cnt = ATOMIC_COUNT_INIT(0);
531 static void netdev_linux_run(void);
533 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
534 int cmd, const char *cmd_name);
535 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
536 int cmd, const char *cmd_name);
537 static int get_flags(const struct netdev *, unsigned int *flags);
538 static int set_flags(const char *, unsigned int flags);
539 static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
540 enum netdev_flags on, enum netdev_flags *old_flagsp)
541 OVS_REQUIRES(netdev->mutex);
542 static int do_get_ifindex(const char *netdev_name);
543 static int get_ifindex(const struct netdev *, int *ifindexp);
544 static int do_set_addr(struct netdev *netdev,
545 int ioctl_nr, const char *ioctl_name,
546 struct in_addr addr);
547 static int get_etheraddr(const char *netdev_name, struct eth_addr *ea);
548 static int set_etheraddr(const char *netdev_name, const struct eth_addr);
549 static int get_stats_via_netlink(const struct netdev *, struct netdev_stats *);
550 static int af_packet_sock(void);
551 static bool netdev_linux_miimon_enabled(void);
552 static void netdev_linux_miimon_run(void);
553 static void netdev_linux_miimon_wait(void);
554 static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup);
557 is_netdev_linux_class(const struct netdev_class *netdev_class)
559 return netdev_class->run == netdev_linux_run;
563 is_tap_netdev(const struct netdev *netdev)
565 return netdev_get_class(netdev) == &netdev_tap_class;
568 static struct netdev_linux *
569 netdev_linux_cast(const struct netdev *netdev)
571 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
573 return CONTAINER_OF(netdev, struct netdev_linux, up);
576 static struct netdev_rxq_linux *
577 netdev_rxq_linux_cast(const struct netdev_rxq *rx)
579 ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
580 return CONTAINER_OF(rx, struct netdev_rxq_linux, up);
583 static void netdev_linux_update(struct netdev_linux *netdev,
584 const struct rtnetlink_change *)
585 OVS_REQUIRES(netdev->mutex);
586 static void netdev_linux_changed(struct netdev_linux *netdev,
587 unsigned int ifi_flags, unsigned int mask)
588 OVS_REQUIRES(netdev->mutex);
590 /* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK,
591 * RTNLGRP_IPV4_IFADDR and RTNLGRP_IPV6_IFADDR changes, or NULL
592 * if no such socket could be created. */
593 static struct nl_sock *
594 netdev_linux_notify_sock(void)
596 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
597 static struct nl_sock *sock;
598 unsigned int mcgroups[3] = {RTNLGRP_LINK, RTNLGRP_IPV4_IFADDR,
599 RTNLGRP_IPV6_IFADDR};
601 if (ovsthread_once_start(&once)) {
604 error = nl_sock_create(NETLINK_ROUTE, &sock);
608 for (i = 0; i < ARRAY_SIZE(mcgroups); i++) {
609 error = nl_sock_join_mcgroup(sock, mcgroups[i]);
611 nl_sock_destroy(sock);
617 ovsthread_once_done(&once);
624 netdev_linux_miimon_enabled(void)
626 return atomic_count_get(&miimon_cnt) > 0;
630 netdev_linux_run(void)
632 struct nl_sock *sock;
635 if (netdev_linux_miimon_enabled()) {
636 netdev_linux_miimon_run();
639 sock = netdev_linux_notify_sock();
645 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
646 uint64_t buf_stub[4096 / 8];
649 ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
650 error = nl_sock_recv(sock, &buf, false);
652 struct rtnetlink_change change;
654 if (rtnetlink_parse(&buf, &change)) {
655 struct netdev *netdev_ = netdev_from_name(change.ifname);
656 if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
657 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
659 ovs_mutex_lock(&netdev->mutex);
660 netdev_linux_update(netdev, &change);
661 ovs_mutex_unlock(&netdev->mutex);
663 netdev_close(netdev_);
665 } else if (error == ENOBUFS) {
666 struct shash device_shash;
667 struct shash_node *node;
671 shash_init(&device_shash);
672 netdev_get_devices(&netdev_linux_class, &device_shash);
673 SHASH_FOR_EACH (node, &device_shash) {
674 struct netdev *netdev_ = node->data;
675 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
678 ovs_mutex_lock(&netdev->mutex);
679 get_flags(netdev_, &flags);
680 netdev_linux_changed(netdev, flags, 0);
681 ovs_mutex_unlock(&netdev->mutex);
683 netdev_close(netdev_);
685 shash_destroy(&device_shash);
686 } else if (error != EAGAIN) {
687 VLOG_WARN_RL(&rl, "error reading or parsing netlink (%s)",
688 ovs_strerror(error));
/* 'wait' callback paired with netdev_linux_run(): arranges for the poll
 * loop to wake when miimon needs attention or when the rtnetlink
 * notification socket becomes readable. */
static void
netdev_linux_wait(void)
{
    struct nl_sock *sock;

    if (netdev_linux_miimon_enabled()) {
        netdev_linux_miimon_wait();
    }
    sock = netdev_linux_notify_sock();
    if (sock) {
        nl_sock_wait(sock, POLLIN);
    }
}
709 netdev_linux_changed(struct netdev_linux *dev,
710 unsigned int ifi_flags, unsigned int mask)
711 OVS_REQUIRES(dev->mutex)
713 netdev_change_seq_changed(&dev->up);
715 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
716 dev->carrier_resets++;
718 dev->ifi_flags = ifi_flags;
720 dev->cache_valid &= mask;
724 netdev_linux_update(struct netdev_linux *dev,
725 const struct rtnetlink_change *change)
726 OVS_REQUIRES(dev->mutex)
728 if (rtnetlink_type_is_rtnlgrp_link(change->nlmsg_type)){
729 if (change->nlmsg_type == RTM_NEWLINK) {
730 /* Keep drv-info, in4, in6. */
731 netdev_linux_changed(dev, change->ifi_flags,
732 VALID_DRVINFO | VALID_IN4 | VALID_IN6);
734 /* Update netdev from rtnl-change msg. */
736 dev->mtu = change->mtu;
737 dev->cache_valid |= VALID_MTU;
738 dev->netdev_mtu_error = 0;
741 if (!eth_addr_is_zero(change->mac)) {
742 dev->etheraddr = change->mac;
743 dev->cache_valid |= VALID_ETHERADDR;
744 dev->ether_addr_error = 0;
747 dev->ifindex = change->if_index;
748 dev->cache_valid |= VALID_IFINDEX;
749 dev->get_ifindex_error = 0;
751 netdev_linux_changed(dev, change->ifi_flags, 0);
753 } else if (rtnetlink_type_is_rtnlgrp_addr(change->nlmsg_type)) {
754 /* Invalidates in4, in6. */
755 netdev_linux_changed(dev, dev->ifi_flags,
756 ~(VALID_IN4 | VALID_IN6));
762 static struct netdev *
763 netdev_linux_alloc(void)
765 struct netdev_linux *netdev = xzalloc(sizeof *netdev);
770 netdev_linux_common_construct(struct netdev_linux *netdev)
772 ovs_mutex_init(&netdev->mutex);
775 /* Creates system and internal devices. */
777 netdev_linux_construct(struct netdev *netdev_)
779 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
782 netdev_linux_common_construct(netdev);
784 error = get_flags(&netdev->up, &netdev->ifi_flags);
785 if (error == ENODEV) {
786 if (netdev->up.netdev_class != &netdev_internal_class) {
787 /* The device does not exist, so don't allow it to be opened. */
790 /* "Internal" netdevs have to be created as netdev objects before
791 * they exist in the kernel, because creating them in the kernel
792 * happens by passing a netdev object to dpif_port_add().
793 * Therefore, ignore the error. */
800 /* For most types of netdevs we open the device for each call of
801 * netdev_open(). However, this is not the case with tap devices,
802 * since it is only possible to open the device once. In this
803 * situation we share a single file descriptor, and consequently
804 * buffers, across all readers. Therefore once data is read it will
805 * be unavailable to other reads for tap devices. */
807 netdev_linux_construct_tap(struct netdev *netdev_)
809 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
810 static const char tap_dev[] = "/dev/net/tun";
811 const char *name = netdev_->name;
815 netdev_linux_common_construct(netdev);
817 /* Open tap device. */
818 netdev->tap_fd = open(tap_dev, O_RDWR);
819 if (netdev->tap_fd < 0) {
821 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
825 /* Create tap device. */
826 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
827 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
828 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
829 VLOG_WARN("%s: creating tap device failed: %s", name,
830 ovs_strerror(errno));
835 /* Make non-blocking. */
836 error = set_nonblocking(netdev->tap_fd);
844 close(netdev->tap_fd);
849 netdev_linux_destruct(struct netdev *netdev_)
851 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
853 if (netdev->tc && netdev->tc->ops->tc_destroy) {
854 netdev->tc->ops->tc_destroy(netdev->tc);
857 if (netdev_get_class(netdev_) == &netdev_tap_class
858 && netdev->tap_fd >= 0)
860 close(netdev->tap_fd);
863 if (netdev->miimon_interval > 0) {
864 atomic_count_dec(&miimon_cnt);
867 ovs_mutex_destroy(&netdev->mutex);
/* 'dealloc' callback: releases the memory obtained by netdev_linux_alloc(). */
static void
netdev_linux_dealloc(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    free(netdev);
}
877 static struct netdev_rxq *
878 netdev_linux_rxq_alloc(void)
880 struct netdev_rxq_linux *rx = xzalloc(sizeof *rx);
885 netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
887 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
888 struct netdev *netdev_ = rx->up.netdev;
889 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
892 ovs_mutex_lock(&netdev->mutex);
893 rx->is_tap = is_tap_netdev(netdev_);
895 rx->fd = netdev->tap_fd;
897 struct sockaddr_ll sll;
899 /* Result of tcpdump -dd inbound */
900 static const struct sock_filter filt[] = {
901 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
902 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
903 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
904 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
906 static const struct sock_fprog fprog = {
907 ARRAY_SIZE(filt), (struct sock_filter *) filt
910 /* Create file descriptor. */
911 rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
914 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
919 if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) {
921 VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
922 netdev_get_name(netdev_), ovs_strerror(error));
926 /* Set non-blocking mode. */
927 error = set_nonblocking(rx->fd);
932 /* Get ethernet device index. */
933 error = get_ifindex(&netdev->up, &ifindex);
938 /* Bind to specific ethernet device. */
939 memset(&sll, 0, sizeof sll);
940 sll.sll_family = AF_PACKET;
941 sll.sll_ifindex = ifindex;
942 sll.sll_protocol = htons(ETH_P_ALL);
943 if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
945 VLOG_ERR("%s: failed to bind raw socket (%s)",
946 netdev_get_name(netdev_), ovs_strerror(error));
950 /* Filter for only inbound packets. */
951 error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
955 VLOG_ERR("%s: failed to attach filter (%s)",
956 netdev_get_name(netdev_), ovs_strerror(error));
960 ovs_mutex_unlock(&netdev->mutex);
968 ovs_mutex_unlock(&netdev->mutex);
973 netdev_linux_rxq_destruct(struct netdev_rxq *rxq_)
975 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
/* rxq 'dealloc' callback: releases the memory obtained by
 * netdev_linux_rxq_alloc(). */
static void
netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_)
{
    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);

    free(rx);
}
991 auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux)
993 if (aux->tp_status & TP_STATUS_VLAN_TPID_VALID) {
994 return htons(aux->tp_vlan_tpid);
996 return htons(ETH_TYPE_VLAN);
/* Returns true if 'aux' carries a VLAN TCI the kernel stripped from the
 * packet: either a nonzero TCI (pre-3.0 kernels report no validity bit)
 * or an explicit TP_STATUS_VLAN_VALID flag (which also covers TCI 0). */
static bool
auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
{
    return aux->tp_vlan_tci || aux->tp_status & TP_STATUS_VLAN_VALID;
}
1007 netdev_linux_rxq_recv_sock(int fd, struct dp_packet *buffer)
1012 struct cmsghdr *cmsg;
1014 struct cmsghdr cmsg;
1015 char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
1019 /* Reserve headroom for a single VLAN tag */
1020 dp_packet_reserve(buffer, VLAN_HEADER_LEN);
1021 size = dp_packet_tailroom(buffer);
1023 iov.iov_base = dp_packet_data(buffer);
1025 msgh.msg_name = NULL;
1026 msgh.msg_namelen = 0;
1027 msgh.msg_iov = &iov;
1028 msgh.msg_iovlen = 1;
1029 msgh.msg_control = &cmsg_buffer;
1030 msgh.msg_controllen = sizeof cmsg_buffer;
1034 retval = recvmsg(fd, &msgh, MSG_TRUNC);
1035 } while (retval < 0 && errno == EINTR);
1039 } else if (retval > size) {
1043 dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);
1045 for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg; cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
1046 const struct tpacket_auxdata *aux;
1048 if (cmsg->cmsg_level != SOL_PACKET
1049 || cmsg->cmsg_type != PACKET_AUXDATA
1050 || cmsg->cmsg_len < CMSG_LEN(sizeof(struct tpacket_auxdata))) {
1054 aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg));
1055 if (auxdata_has_vlan_tci(aux)) {
1056 if (retval < ETH_HEADER_LEN) {
1060 eth_push_vlan(buffer, auxdata_to_vlan_tpid(aux),
1061 htons(aux->tp_vlan_tci));
/* Receives one packet from tap file descriptor 'fd' into 'buffer'.
 * Returns 0 on success or a positive errno value on failure. */
static int
netdev_linux_rxq_recv_tap(int fd, struct dp_packet *buffer)
{
    ssize_t retval;
    size_t size = dp_packet_tailroom(buffer);

    do {
        retval = read(fd, dp_packet_data(buffer), size);
    } while (retval < 0 && errno == EINTR);

    if (retval < 0) {
        return errno;
    }

    dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);
    return 0;
}
1088 netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet **packets,
1091 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1092 struct netdev *netdev = rx->up.netdev;
1093 struct dp_packet *buffer;
1097 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
1098 mtu = ETH_PAYLOAD_MAX;
1101 buffer = dp_packet_new_with_headroom(VLAN_ETH_HEADER_LEN + mtu,
1102 DP_NETDEV_HEADROOM);
1103 retval = (rx->is_tap
1104 ? netdev_linux_rxq_recv_tap(rx->fd, buffer)
1105 : netdev_linux_rxq_recv_sock(rx->fd, buffer));
1108 if (retval != EAGAIN && retval != EMSGSIZE) {
1109 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
1110 netdev_rxq_get_name(rxq_), ovs_strerror(errno));
1112 dp_packet_delete(buffer);
1114 dp_packet_pad(buffer);
1115 dp_packet_rss_invalidate(buffer);
1116 packets[0] = buffer;
1124 netdev_linux_rxq_wait(struct netdev_rxq *rxq_)
1126 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1127 poll_fd_wait(rx->fd, POLLIN);
1131 netdev_linux_rxq_drain(struct netdev_rxq *rxq_)
1133 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1136 int error = af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_), &ifr,
1137 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
1141 drain_fd(rx->fd, ifr.ifr_qlen);
1144 return drain_rcvbuf(rx->fd);
1148 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
1149 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
1150 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
1151 * the packet is too big or too small to transmit on the device.
1153 * The caller retains ownership of 'buffer' in all cases.
1155 * The kernel maintains a packet transmission queue, so the caller is not
1156 * expected to do additional queuing of packets. */
1158 netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED,
1159 struct dp_packet **pkts, int cnt, bool may_steal)
1164 /* 'i' is incremented only if there's no error */
1165 for (i = 0; i < cnt;) {
1166 const void *data = dp_packet_data(pkts[i]);
1167 size_t size = dp_packet_size(pkts[i]);
/* Non-tap devices are transmitted through the shared AF_PACKET raw
 * socket; tap devices are written through their own fd (see below). */
1170 if (!is_tap_netdev(netdev_)) {
1171 /* Use our AF_PACKET socket to send to this device. */
1172 struct sockaddr_ll sll;
1178 sock = af_packet_sock();
1183 ifindex = netdev_get_ifindex(netdev_);
1188 /* We don't bother setting most fields in sockaddr_ll because the
1189 * kernel ignores them for SOCK_RAW. */
1190 memset(&sll, 0, sizeof sll);
1191 sll.sll_family = AF_PACKET;
1192 sll.sll_ifindex = ifindex;
/* Single-element iovec covering the whole frame. */
1194 iov.iov_base = CONST_CAST(void *, data);
1197 msg.msg_name = &sll;
1198 msg.msg_namelen = sizeof sll;
1201 msg.msg_control = NULL;
1202 msg.msg_controllen = 0;
1205 retval = sendmsg(sock, &msg, 0);
1207 /* Use the tap fd to send to this device. This is essential for
1208 * tap devices, because packets sent to a tap device with an
1209 * AF_PACKET socket will loop back to be *received* again on the
1210 * tap device. This doesn't occur on other interface types
1211 * because we attach a socket filter to the rx socket. */
1212 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1214 retval = write(netdev->tap_fd, data, size);
1218 /* The Linux AF_PACKET implementation never blocks waiting for room
1219 * for packets, instead returning ENOBUFS. Translate this into
1220 * EAGAIN for the caller. */
1221 error = errno == ENOBUFS ? EAGAIN : errno;
1222 if (error == EINTR) {
1223 /* continue without incrementing 'i', i.e. retry this packet */
/* A short write means the device truncated the frame. */
1227 } else if (retval != size) {
1228 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE" bytes"
1229 " of %"PRIuSIZE") on %s", retval, size,
1230 netdev_get_name(netdev_));
1235 /* Process the next packet in the batch */
/* Free the batch.  NOTE(review): upstream this loop is guarded by
 * 'may_steal'; the guard line is not visible here -- confirm. */
1240 for (i = 0; i < cnt; i++) {
1241 dp_packet_delete(pkts[i]);
/* EAGAIN is an expected backpressure signal and is not logged. */
1245 if (error && error != EAGAIN) {
1246 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
1247 netdev_get_name(netdev_), ovs_strerror(error));
1254 /* Registers with the poll loop to wake up from the next call to poll_block()
1255 * when the packet transmission queue has sufficient room to transmit a packet
1256 * with netdev_send().
1258 * The kernel maintains a packet transmission queue, so the client is not
1259 * expected to do additional queuing of packets. Thus, this function is
1260 * unlikely to ever be used. It is included for completeness. */
1262 netdev_linux_send_wait(struct netdev *netdev, int qid OVS_UNUSED)
1264 if (is_tap_netdev(netdev)) {
1265 /* TAP device always accepts packets.*/
1266 poll_immediate_wake();
1270 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1271 * otherwise a positive errno value. */
1273 netdev_linux_set_etheraddr(struct netdev *netdev_, const struct eth_addr mac)
1275 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1276 enum netdev_flags old_flags = 0;
1279 ovs_mutex_lock(&netdev->mutex);
/* If the cached address already matches (or a previous attempt failed),
 * skip the ioctl entirely; otherwise invalidate the cache entry. */
1281 if (netdev->cache_valid & VALID_ETHERADDR) {
1282 error = netdev->ether_addr_error;
1283 if (error || eth_addr_equals(netdev->etheraddr, mac)) {
1286 netdev->cache_valid &= ~VALID_ETHERADDR;
1289 /* Tap devices must be brought down before setting the address. */
1290 if (is_tap_netdev(netdev_)) {
1291 update_flags(netdev, NETDEV_UP, 0, &old_flags);
1293 error = set_etheraddr(netdev_get_name(netdev_), mac);
/* Cache the outcome; ENODEV is cached too so a vanished device is not
 * retried on every call. */
1294 if (!error || error == ENODEV) {
1295 netdev->ether_addr_error = error;
1296 netdev->cache_valid |= VALID_ETHERADDR;
1298 netdev->etheraddr = mac;
/* Restore the tap device's UP flag if we cleared it above. */
1302 if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
1303 update_flags(netdev, 0, NETDEV_UP, &old_flags);
1307 ovs_mutex_unlock(&netdev->mutex);
1311 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1313 netdev_linux_get_etheraddr(const struct netdev *netdev_, struct eth_addr *mac)
1315 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1318 ovs_mutex_lock(&netdev->mutex);
/* Populate the cache on first use; both the address and the error code
 * from the lookup are cached. */
1319 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1320 netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
1321 &netdev->etheraddr);
1322 netdev->cache_valid |= VALID_ETHERADDR;
1325 error = netdev->ether_addr_error;
1327 *mac = netdev->etheraddr;
1329 ovs_mutex_unlock(&netdev->mutex);
/* Retrieves 'netdev''s MTU into '*mtup', consulting the cache first and
 * issuing SIOCGIFMTU only on a cache miss.  Caller holds netdev->mutex. */
1335 netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
1339 if (!(netdev->cache_valid & VALID_MTU)) {
1342 netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
1343 netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
1344 netdev->mtu = ifr.ifr_mtu;
1345 netdev->cache_valid |= VALID_MTU;
1348 error = netdev->netdev_mtu_error;
1350 *mtup = netdev->mtu;
1356 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1357 * in bytes, not including the hardware header; thus, this is typically 1500
1358 * bytes for Ethernet devices. */
1360 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1362 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* Thin locking wrapper around netdev_linux_get_mtu__(). */
1365 ovs_mutex_lock(&netdev->mutex);
1366 error = netdev_linux_get_mtu__(netdev, mtup);
1367 ovs_mutex_unlock(&netdev->mutex);
1372 /* Sets the maximum size of transmitted (MTU) for given device using linux
1373 * networking ioctl interface.
1376 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1378 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1382 ovs_mutex_lock(&netdev->mutex);
/* Short-circuit if the cached MTU already matches or a previous attempt
 * failed; otherwise invalidate the cache entry before the ioctl. */
1383 if (netdev->cache_valid & VALID_MTU) {
1384 error = netdev->netdev_mtu_error;
1385 if (error || netdev->mtu == mtu) {
1388 netdev->cache_valid &= ~VALID_MTU;
1391 error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
1392 SIOCSIFMTU, "SIOCSIFMTU");
/* Cache the result; ENODEV (device vanished) is also cached. */
1393 if (!error || error == ENODEV) {
1394 netdev->netdev_mtu_error = error;
1395 netdev->mtu = ifr.ifr_mtu;
1396 netdev->cache_valid |= VALID_MTU;
1399 ovs_mutex_unlock(&netdev->mutex);
1403 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1404 * On failure, returns a negative errno value. */
1406 netdev_linux_get_ifindex(const struct netdev *netdev_)
1408 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1411 ovs_mutex_lock(&netdev->mutex);
1412 error = get_ifindex(netdev_, &ifindex);
1413 ovs_mutex_unlock(&netdev->mutex);
/* Fold the (errno, ifindex) pair into a single signed return value. */
1415 return error ? -error : ifindex;
/* Reports link state in '*carrier': the MII-monitor result when miimon
 * polling is enabled for this device, otherwise the kernel's IFF_RUNNING
 * flag. */
1419 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1421 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1423 ovs_mutex_lock(&netdev->mutex);
1424 if (netdev->miimon_interval > 0) {
1425 *carrier = netdev->miimon;
1427 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1429 ovs_mutex_unlock(&netdev->mutex);
/* Returns the number of carrier (link-state) transitions recorded for
 * 'netdev_'. */
1434 static long long int
1435 netdev_linux_get_carrier_resets(const struct netdev *netdev_)
1437 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1438 long long int carrier_resets;
1440 ovs_mutex_lock(&netdev->mutex);
1441 carrier_resets = netdev->carrier_resets;
1442 ovs_mutex_unlock(&netdev->mutex);
1444 return carrier_resets;
/* Issues MII ioctl 'cmd' ('cmd_name' for logging) on device 'name',
 * copying '*data' in and out of the request.  The mii_ioctl_data struct
 * is memcpy'd over ifr.ifr_data because the kernel overlays the MII
 * request on that union member. */
1448 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1449 struct mii_ioctl_data *data)
1454 memset(&ifr, 0, sizeof ifr);
1455 memcpy(&ifr.ifr_data, data, sizeof *data);
1456 error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
/* Copy the kernel's reply back out to the caller. */
1457 memcpy(data, &ifr.ifr_data, sizeof *data);
/* Queries link status for device 'name' into '*miimon'.  First tries the
 * MII registers (SIOCGMIIPHY to find the PHY, then SIOCGMIIREG to read
 * BMSR); if MII fails, falls back to ethtool's ETHTOOL_GLINK. */
1463 netdev_linux_get_miimon(const char *name, bool *miimon)
1465 struct mii_ioctl_data data;
1470 memset(&data, 0, sizeof data);
1471 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1473 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1474 data.reg_num = MII_BMSR;
1475 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
/* BMSR_LSTATUS is the link-up bit of the basic mode status register. */
1479 *miimon = !!(data.val_out & BMSR_LSTATUS);
1481 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1484 struct ethtool_cmd ecmd;
1486 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1489 COVERAGE_INC(netdev_get_ethtool);
1490 memset(&ecmd, 0, sizeof ecmd);
1491 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
/* ETHTOOL_GLINK replies with a struct ethtool_value overlaid on the
 * command buffer; extract it via memcpy to avoid aliasing issues. */
1494 struct ethtool_value eval;
1496 memcpy(&eval, &ecmd, sizeof eval);
1497 *miimon = !!eval.data;
1499 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
/* Sets the MII-monitor polling interval for 'netdev_' to 'interval' ms.
 * A positive interval is clamped to at least 100 ms; zero or negative
 * disables polling. */
1507 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1508 long long int interval)
1510 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1512 ovs_mutex_lock(&netdev->mutex);
1513 interval = interval > 0 ? MAX(interval, 100) : 0;
1514 if (netdev->miimon_interval != interval) {
/* Keep the global count of miimon-enabled devices in sync as polling
 * is switched on or off for this device. */
1515 if (interval && !netdev->miimon_interval) {
1516 atomic_count_inc(&miimon_cnt);
1517 } else if (!interval && netdev->miimon_interval) {
1518 atomic_count_dec(&miimon_cnt);
1521 netdev->miimon_interval = interval;
/* Force an immediate poll on the next miimon_run() pass. */
1522 timer_set_expired(&netdev->miimon_timer);
1524 ovs_mutex_unlock(&netdev->mutex);
/* Polls MII link status for every netdev-linux device whose miimon timer
 * has expired, recording state changes and rearming each timer. */
1530 netdev_linux_miimon_run(void)
1532 struct shash device_shash;
1533 struct shash_node *node;
1535 shash_init(&device_shash);
1536 netdev_get_devices(&netdev_linux_class, &device_shash);
1537 SHASH_FOR_EACH (node, &device_shash) {
1538 struct netdev *netdev = node->data;
1539 struct netdev_linux *dev = netdev_linux_cast(netdev);
1542 ovs_mutex_lock(&dev->mutex);
1543 if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
1544 netdev_linux_get_miimon(dev->up.name, &miimon);
/* Only a change in link state counts as a device change event. */
1545 if (miimon != dev->miimon) {
1546 dev->miimon = miimon;
1547 netdev_linux_changed(dev, dev->ifi_flags, 0);
1550 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1552 ovs_mutex_unlock(&dev->mutex);
/* netdev_get_devices() took a reference on each device; release it. */
1553 netdev_close(netdev);
1556 shash_destroy(&device_shash);
/* Registers a poll-loop wakeup for the soonest miimon timer among all
 * netdev-linux devices with polling enabled. */
1560 netdev_linux_miimon_wait(void)
1562 struct shash device_shash;
1563 struct shash_node *node;
1565 shash_init(&device_shash);
1566 netdev_get_devices(&netdev_linux_class, &device_shash);
1567 SHASH_FOR_EACH (node, &device_shash) {
1568 struct netdev *netdev = node->data;
1569 struct netdev_linux *dev = netdev_linux_cast(netdev);
1571 ovs_mutex_lock(&dev->mutex);
1572 if (dev->miimon_interval > 0) {
1573 timer_wait(&dev->miimon_timer);
1575 ovs_mutex_unlock(&dev->mutex);
/* Release the reference taken by netdev_get_devices(). */
1576 netdev_close(netdev);
1578 shash_destroy(&device_shash);
/* Exchanges the values of '*a' and '*b'. */
1582 swap_uint64(uint64_t *a, uint64_t *b)
1589 /* Copies 'src' into 'dst', performing format conversion in the process.
1591 * 'src' is allowed to be misaligned. */
1593 netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1594 const struct ovs_vport_stats *src)
/* get_32aligned_u64() reads each 64-bit counter safely even when 'src'
 * is only 32-bit aligned (as netlink attribute payloads may be). */
1596 dst->rx_packets = get_32aligned_u64(&src->rx_packets);
1597 dst->tx_packets = get_32aligned_u64(&src->tx_packets);
1598 dst->rx_bytes = get_32aligned_u64(&src->rx_bytes);
1599 dst->tx_bytes = get_32aligned_u64(&src->tx_bytes);
1600 dst->rx_errors = get_32aligned_u64(&src->rx_errors);
1601 dst->tx_errors = get_32aligned_u64(&src->tx_errors);
1602 dst->rx_dropped = get_32aligned_u64(&src->rx_dropped);
1603 dst->tx_dropped = get_32aligned_u64(&src->tx_dropped);
/* Counters that ovs_vport_stats does not track are zeroed explicitly. */
1605 dst->collisions = 0;
1606 dst->rx_length_errors = 0;
1607 dst->rx_over_errors = 0;
1608 dst->rx_crc_errors = 0;
1609 dst->rx_frame_errors = 0;
1610 dst->rx_fifo_errors = 0;
1611 dst->rx_missed_errors = 0;
1612 dst->tx_aborted_errors = 0;
1613 dst->tx_carrier_errors = 0;
1614 dst->tx_fifo_errors = 0;
1615 dst->tx_heartbeat_errors = 0;
1616 dst->tx_window_errors = 0;
/* Fetches vport-layer statistics for 'netdev' from the datapath via a
 * netlink vport-get request and converts them into '*stats'. */
1620 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1622 struct dpif_netlink_vport reply;
1626 error = dpif_netlink_vport_get(netdev_get_name(netdev), &reply, &buf);
/* A reply without a stats attribute is treated as "no stats". */
1629 } else if (!reply.stats) {
1634 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
/* Wrapper around get_stats_via_vport__() that caches the error outcome
 * in netdev->vport_stats_error so a failing device is not queried (or
 * logged) repeatedly. */
1642 get_stats_via_vport(const struct netdev *netdev_,
1643 struct netdev_stats *stats)
1645 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* Retry while the last attempt succeeded or no attempt has been made. */
1647 if (!netdev->vport_stats_error ||
1648 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1651 error = get_stats_via_vport__(netdev_, stats);
/* ENOENT/ENODEV simply mean the port is not a datapath vport; anything
 * else is worth a (rate-limited) warning. */
1652 if (error && error != ENOENT && error != ENODEV) {
1653 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1655 netdev_get_name(netdev_), ovs_strerror(error));
1657 netdev->vport_stats_error = error;
1658 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
1662 /* Retrieves current device stats for 'netdev-linux'. */
1664 netdev_linux_get_stats(const struct netdev *netdev_,
1665 struct netdev_stats *stats)
1667 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1668 struct netdev_stats dev_stats;
1671 ovs_mutex_lock(&netdev->mutex);
/* Gather both vport-layer stats (into '*stats') and kernel netlink
 * stats (into 'dev_stats'); they are merged below. */
1672 get_stats_via_vport(netdev_, stats);
1673 error = get_stats_via_netlink(netdev_, &dev_stats);
/* NOTE(review): the nesting of the conditions below depends on lines
 * not visible in this view (upstream nests the first test inside an
 * 'if (error)') -- confirm structure before modifying. */
1675 if (!netdev->vport_stats_error) {
1678 } else if (netdev->vport_stats_error) {
1679 /* stats not available from OVS then use netdev stats. */
1682 /* Use kernel netdev's packet and byte counts since vport's counters
1683 * do not reflect packet counts on the wire when GSO, TSO or GRO are
1685 stats->rx_packets = dev_stats.rx_packets;
1686 stats->rx_bytes = dev_stats.rx_bytes;
1687 stats->tx_packets = dev_stats.tx_packets;
1688 stats->tx_bytes = dev_stats.tx_bytes;
/* Error and drop counters from both layers are accumulated. */
1690 stats->rx_errors += dev_stats.rx_errors;
1691 stats->tx_errors += dev_stats.tx_errors;
1692 stats->rx_dropped += dev_stats.rx_dropped;
1693 stats->tx_dropped += dev_stats.tx_dropped;
1694 stats->multicast += dev_stats.multicast;
1695 stats->collisions += dev_stats.collisions;
1696 stats->rx_length_errors += dev_stats.rx_length_errors;
1697 stats->rx_over_errors += dev_stats.rx_over_errors;
1698 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1699 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1700 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1701 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1702 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1703 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1704 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1705 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1706 stats->tx_window_errors += dev_stats.tx_window_errors;
1708 ovs_mutex_unlock(&netdev->mutex);
1713 /* Retrieves current device stats for 'netdev-tap' netdev or
1714 * netdev-internal. */
1716 netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
1718 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1719 struct netdev_stats dev_stats;
1722 ovs_mutex_lock(&netdev->mutex);
/* Gather vport-layer stats (into '*stats') and kernel netlink stats
 * (into 'dev_stats'); the merge below differs from netdev_linux_get_stats
 * because tap stats are seen from the host's perspective. */
1723 get_stats_via_vport(netdev_, stats);
1724 error = get_stats_via_netlink(netdev_, &dev_stats);
/* NOTE(review): branch nesting depends on lines not visible here --
 * confirm against the full source before modifying. */
1726 if (!netdev->vport_stats_error) {
1729 } else if (netdev->vport_stats_error) {
1730 /* Transmit and receive stats will appear to be swapped relative to the
1731 * other ports since we are the one sending the data, not a remote
1732 * computer. For consistency, we swap them back here. This does not
1733 * apply if we are getting stats from the vport layer because it always
1734 * tracks stats from the perspective of the switch. */
1737 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1738 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1739 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1740 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
/* The remaining detailed error counters are meaningless for a tap
 * device, so zero them. */
1741 stats->rx_length_errors = 0;
1742 stats->rx_over_errors = 0;
1743 stats->rx_crc_errors = 0;
1744 stats->rx_frame_errors = 0;
1745 stats->rx_fifo_errors = 0;
1746 stats->rx_missed_errors = 0;
1747 stats->tx_aborted_errors = 0;
1748 stats->tx_carrier_errors = 0;
1749 stats->tx_fifo_errors = 0;
1750 stats->tx_heartbeat_errors = 0;
1751 stats->tx_window_errors = 0;
1753 /* Use kernel netdev's packet and byte counts since vport counters
1754 * do not reflect packet counts on the wire when GSO, TSO or GRO
1756 stats->rx_packets = dev_stats.tx_packets;
1757 stats->rx_bytes = dev_stats.tx_bytes;
1758 stats->tx_packets = dev_stats.rx_packets;
1759 stats->tx_bytes = dev_stats.rx_bytes;
/* Note the rx/tx cross-accumulation: kernel tx counts feed switch rx
 * and vice versa, per the perspective swap described above. */
1761 stats->rx_dropped += dev_stats.tx_dropped;
1762 stats->tx_dropped += dev_stats.rx_dropped;
1764 stats->rx_errors += dev_stats.tx_errors;
1765 stats->tx_errors += dev_stats.rx_errors;
1767 stats->multicast += dev_stats.multicast;
1768 stats->collisions += dev_stats.collisions;
1770 ovs_mutex_unlock(&netdev->mutex);
/* Retrieves stats for an internal device purely from the vport layer;
 * returns the cached vport stats error as the result. */
1776 netdev_internal_get_stats(const struct netdev *netdev_,
1777 struct netdev_stats *stats)
1779 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1782 ovs_mutex_lock(&netdev->mutex);
1783 get_stats_via_vport(netdev_, stats);
1784 error = netdev->vport_stats_error;
1785 ovs_mutex_unlock(&netdev->mutex);
/* Queries 'netdev''s link features via ETHTOOL_GSET and caches the
 * results in netdev->supported / ->advertised / ->current as NETDEV_F_*
 * bitmaps.  Caller holds netdev->mutex; results are cached under
 * VALID_FEATURES so the ethtool ioctl runs only once. */
1791 netdev_linux_read_features(struct netdev_linux *netdev)
1793 struct ethtool_cmd ecmd;
/* Return immediately if features were already read for this device. */
1797 if (netdev->cache_valid & VALID_FEATURES) {
1801 COVERAGE_INC(netdev_get_ethtool);
1802 memset(&ecmd, 0, sizeof ecmd);
1803 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
1804 ETHTOOL_GSET, "ETHTOOL_GSET");
1809 /* Supported features. */
1810 netdev->supported = 0;
1811 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1812 netdev->supported |= NETDEV_F_10MB_HD;
1814 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1815 netdev->supported |= NETDEV_F_10MB_FD;
1817 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1818 netdev->supported |= NETDEV_F_100MB_HD;
1820 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1821 netdev->supported |= NETDEV_F_100MB_FD;
1823 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1824 netdev->supported |= NETDEV_F_1GB_HD;
/* Several link modes map onto a single OpenFlow speed/duplex bit. */
1826 if ((ecmd.supported & SUPPORTED_1000baseT_Full) ||
1827 (ecmd.supported & SUPPORTED_1000baseKX_Full)) {
1828 netdev->supported |= NETDEV_F_1GB_FD;
1830 if ((ecmd.supported & SUPPORTED_10000baseT_Full) ||
1831 (ecmd.supported & SUPPORTED_10000baseKX4_Full) ||
1832 (ecmd.supported & SUPPORTED_10000baseKR_Full) ||
1833 (ecmd.supported & SUPPORTED_10000baseR_FEC)) {
1834 netdev->supported |= NETDEV_F_10GB_FD;
1836 if ((ecmd.supported & SUPPORTED_40000baseKR4_Full) ||
1837 (ecmd.supported & SUPPORTED_40000baseCR4_Full) ||
1838 (ecmd.supported & SUPPORTED_40000baseSR4_Full) ||
1839 (ecmd.supported & SUPPORTED_40000baseLR4_Full)) {
1840 netdev->supported |= NETDEV_F_40GB_FD;
1842 if (ecmd.supported & SUPPORTED_TP) {
1843 netdev->supported |= NETDEV_F_COPPER;
1845 if (ecmd.supported & SUPPORTED_FIBRE) {
1846 netdev->supported |= NETDEV_F_FIBER;
1848 if (ecmd.supported & SUPPORTED_Autoneg) {
1849 netdev->supported |= NETDEV_F_AUTONEG;
1851 if (ecmd.supported & SUPPORTED_Pause) {
1852 netdev->supported |= NETDEV_F_PAUSE;
1854 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1855 netdev->supported |= NETDEV_F_PAUSE_ASYM;
1858 /* Advertised features. */
1859 netdev->advertised = 0;
1860 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1861 netdev->advertised |= NETDEV_F_10MB_HD;
1863 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1864 netdev->advertised |= NETDEV_F_10MB_FD;
1866 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1867 netdev->advertised |= NETDEV_F_100MB_HD;
1869 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1870 netdev->advertised |= NETDEV_F_100MB_FD;
1872 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1873 netdev->advertised |= NETDEV_F_1GB_HD;
1875 if ((ecmd.advertising & ADVERTISED_1000baseT_Full) ||
1876 (ecmd.advertising & ADVERTISED_1000baseKX_Full)) {
1877 netdev->advertised |= NETDEV_F_1GB_FD;
1879 if ((ecmd.advertising & ADVERTISED_10000baseT_Full) ||
1880 (ecmd.advertising & ADVERTISED_10000baseKX4_Full) ||
1881 (ecmd.advertising & ADVERTISED_10000baseKR_Full) ||
1882 (ecmd.advertising & ADVERTISED_10000baseR_FEC)) {
1883 netdev->advertised |= NETDEV_F_10GB_FD;
1885 if ((ecmd.advertising & ADVERTISED_40000baseKR4_Full) ||
1886 (ecmd.advertising & ADVERTISED_40000baseCR4_Full) ||
1887 (ecmd.advertising & ADVERTISED_40000baseSR4_Full) ||
1888 (ecmd.advertising & ADVERTISED_40000baseLR4_Full)) {
1889 netdev->advertised |= NETDEV_F_40GB_FD;
1891 if (ecmd.advertising & ADVERTISED_TP) {
1892 netdev->advertised |= NETDEV_F_COPPER;
1894 if (ecmd.advertising & ADVERTISED_FIBRE) {
1895 netdev->advertised |= NETDEV_F_FIBER;
1897 if (ecmd.advertising & ADVERTISED_Autoneg) {
1898 netdev->advertised |= NETDEV_F_AUTONEG;
1900 if (ecmd.advertising & ADVERTISED_Pause) {
1901 netdev->advertised |= NETDEV_F_PAUSE;
1903 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1904 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
1907 /* Current settings. */
1908 speed = ethtool_cmd_speed(&ecmd);
1909 if (speed == SPEED_10) {
1910 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1911 } else if (speed == SPEED_100) {
1912 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1913 } else if (speed == SPEED_1000) {
1914 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1915 } else if (speed == SPEED_10000) {
1916 netdev->current = NETDEV_F_10GB_FD;
/* Raw numeric speeds below -- presumably because SPEED_40000 and
 * faster macros are absent from older kernel headers; confirm before
 * replacing them with SPEED_* constants. */
1917 } else if (speed == 40000) {
1918 netdev->current = NETDEV_F_40GB_FD;
1919 } else if (speed == 100000) {
1920 netdev->current = NETDEV_F_100GB_FD;
1921 } else if (speed == 1000000) {
1922 netdev->current = NETDEV_F_1TB_FD;
1924 netdev->current = 0;
1927 if (ecmd.port == PORT_TP) {
1928 netdev->current |= NETDEV_F_COPPER;
1929 } else if (ecmd.port == PORT_FIBRE) {
1930 netdev->current |= NETDEV_F_FIBER;
1934 netdev->current |= NETDEV_F_AUTONEG;
/* Cache both the features and the error so this runs only once. */
1938 netdev->cache_valid |= VALID_FEATURES;
1939 netdev->get_features_error = error;
1942 /* Stores the features supported by 'netdev' into of '*current', '*advertised',
1943 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
1944 * Returns 0 if successful, otherwise a positive errno value. */
1946 netdev_linux_get_features(const struct netdev *netdev_,
1947 enum netdev_features *current,
1948 enum netdev_features *advertised,
1949 enum netdev_features *supported,
1950 enum netdev_features *peer)
1952 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1955 ovs_mutex_lock(&netdev->mutex);
/* Populates (or reuses) the cached feature bitmaps. */
1956 netdev_linux_read_features(netdev);
1957 if (!netdev->get_features_error) {
1958 *current = netdev->current;
1959 *advertised = netdev->advertised;
1960 *supported = netdev->supported;
1961 *peer = 0; /* XXX */
1963 error = netdev->get_features_error;
1964 ovs_mutex_unlock(&netdev->mutex);
1969 /* Set the features advertised by 'netdev' to 'advertise'. */
1971 netdev_linux_set_advertisements(struct netdev *netdev_,
1972 enum netdev_features advertise)
1974 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1975 struct ethtool_cmd ecmd;
1978 ovs_mutex_lock(&netdev->mutex);
/* Read-modify-write: fetch current ethtool settings, rewrite only the
 * 'advertising' mask, then push them back with ETHTOOL_SSET. */
1980 COVERAGE_INC(netdev_get_ethtool);
1981 memset(&ecmd, 0, sizeof ecmd);
1982 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1983 ETHTOOL_GSET, "ETHTOOL_GSET");
1988 ecmd.advertising = 0;
1989 if (advertise & NETDEV_F_10MB_HD) {
1990 ecmd.advertising |= ADVERTISED_10baseT_Half;
1992 if (advertise & NETDEV_F_10MB_FD) {
1993 ecmd.advertising |= ADVERTISED_10baseT_Full;
1995 if (advertise & NETDEV_F_100MB_HD) {
1996 ecmd.advertising |= ADVERTISED_100baseT_Half;
1998 if (advertise & NETDEV_F_100MB_FD) {
1999 ecmd.advertising |= ADVERTISED_100baseT_Full;
2001 if (advertise & NETDEV_F_1GB_HD) {
2002 ecmd.advertising |= ADVERTISED_1000baseT_Half;
2004 if (advertise & NETDEV_F_1GB_FD) {
2005 ecmd.advertising |= ADVERTISED_1000baseT_Full;
2007 if (advertise & NETDEV_F_10GB_FD) {
2008 ecmd.advertising |= ADVERTISED_10000baseT_Full;
2010 if (advertise & NETDEV_F_COPPER) {
2011 ecmd.advertising |= ADVERTISED_TP;
2013 if (advertise & NETDEV_F_FIBER) {
2014 ecmd.advertising |= ADVERTISED_FIBRE;
2016 if (advertise & NETDEV_F_AUTONEG) {
2017 ecmd.advertising |= ADVERTISED_Autoneg;
2019 if (advertise & NETDEV_F_PAUSE) {
2020 ecmd.advertising |= ADVERTISED_Pause;
2022 if (advertise & NETDEV_F_PAUSE_ASYM) {
2023 ecmd.advertising |= ADVERTISED_Asym_Pause;
2025 COVERAGE_INC(netdev_set_ethtool);
2026 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
2027 ETHTOOL_SSET, "ETHTOOL_SSET");
2030 ovs_mutex_unlock(&netdev->mutex);
2034 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
2035 * successful, otherwise a positive errno value. */
2037 netdev_linux_set_policing(struct netdev *netdev_,
2038 uint32_t kbits_rate, uint32_t kbits_burst)
2040 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2041 const char *netdev_name = netdev_get_name(netdev_);
2044 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
2045 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
2046 : kbits_burst); /* Stick with user-specified value. */
2048 ovs_mutex_lock(&netdev->mutex);
/* Skip reconfiguration when the cached settings already match. */
2049 if (netdev->cache_valid & VALID_POLICING) {
2050 error = netdev->netdev_policing_error;
2051 if (error || (netdev->kbits_rate == kbits_rate &&
2052 netdev->kbits_burst == kbits_burst)) {
2053 /* Assume that settings haven't changed since we last set them. */
2056 netdev->cache_valid &= ~VALID_POLICING;
2059 COVERAGE_INC(netdev_set_policing);
2060 /* Remove any existing ingress qdisc. */
2061 error = tc_add_del_ingress_qdisc(netdev_, false);
2063 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
2064 netdev_name, ovs_strerror(error));
/* Then (re)install the ingress qdisc and attach the policer to it. */
2069 error = tc_add_del_ingress_qdisc(netdev_, true);
2071 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
2072 netdev_name, ovs_strerror(error));
2076 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
2078 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
2079 netdev_name, ovs_strerror(error));
2084 netdev->kbits_rate = kbits_rate;
2085 netdev->kbits_burst = kbits_burst;
/* Cache the outcome; ENODEV (vanished device) is cached as well. */
2088 if (!error || error == ENODEV) {
2089 netdev->netdev_policing_error = error;
2090 netdev->cache_valid |= VALID_POLICING;
2092 ovs_mutex_unlock(&netdev->mutex);
/* Adds to 'types' the OVS name of every installable tc-based QoS type
 * (those with a tc_install hook and a non-empty ovs_name). */
2097 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
2100 const struct tc_ops *const *opsp;
2102 for (opsp = tcs; *opsp != NULL; opsp++) {
2103 const struct tc_ops *ops = *opsp;
2104 if (ops->tc_install && ops->ovs_name[0] != '\0') {
2105 sset_add(types, ops->ovs_name);
/* Returns the tc_ops whose OVS-facing name is 'name', or (from the lines
 * following the loop, not visible here) a not-found result. */
2111 static const struct tc_ops *
2112 tc_lookup_ovs_name(const char *name)
2114 const struct tc_ops *const *opsp;
2116 for (opsp = tcs; *opsp != NULL; opsp++) {
2117 const struct tc_ops *ops = *opsp;
2118 if (!strcmp(name, ops->ovs_name)) {
/* Returns the tc_ops whose Linux qdisc name is 'name'; entries with a
 * NULL linux_name are skipped. */
2125 static const struct tc_ops *
2126 tc_lookup_linux_name(const char *name)
2128 const struct tc_ops *const *opsp;
2130 for (opsp = tcs; *opsp != NULL; opsp++) {
2131 const struct tc_ops *ops = *opsp;
2132 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Looks up queue 'queue_id' in 'netdev_''s tc queue hash, using the
 * caller-supplied 'hash' to select the bucket. */
2139 static struct tc_queue *
2140 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
2143 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2144 struct tc_queue *queue;
2146 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
2147 if (queue->queue_id == queue_id) {
/* Convenience wrapper that hashes 'queue_id' for tc_find_queue__(). */
2154 static struct tc_queue *
2155 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
2157 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Reports the capabilities (queue count) of QoS 'type' in '*caps'. */
2161 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
2163 struct netdev_qos_capabilities *caps)
2165 const struct tc_ops *ops = tc_lookup_ovs_name(type);
2169 caps->n_queues = ops->n_queues;
/* Reports the currently installed QoS type in '*typep' and its
 * configuration in 'details' (via the type's qdisc_get hook, if any). */
2174 netdev_linux_get_qos(const struct netdev *netdev_,
2175 const char **typep, struct smap *details)
2177 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2180 ovs_mutex_lock(&netdev->mutex);
/* Ensure netdev->tc reflects the kernel's actual qdisc state. */
2181 error = tc_query_qdisc(netdev_);
2183 *typep = netdev->tc->ops->ovs_name;
2184 error = (netdev->tc->ops->qdisc_get
2185 ? netdev->tc->ops->qdisc_get(netdev_, details)
2188 ovs_mutex_unlock(&netdev->mutex);
/* Installs QoS discipline 'type' with configuration 'details' on
 * 'netdev_'.  If the same type is already installed it is reconfigured
 * in place; otherwise the existing qdisc is deleted first. */
2194 netdev_linux_set_qos(struct netdev *netdev_,
2195 const char *type, const struct smap *details)
2197 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2198 const struct tc_ops *new_ops;
2201 new_ops = tc_lookup_ovs_name(type);
/* Reject unknown types and types that cannot be installed by OVS. */
2202 if (!new_ops || !new_ops->tc_install) {
2206 ovs_mutex_lock(&netdev->mutex);
2207 error = tc_query_qdisc(netdev_);
2212 if (new_ops == netdev->tc->ops) {
/* Same discipline: just reconfigure, if the type supports it. */
2213 error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
2215 /* Delete existing qdisc. */
2216 error = tc_del_qdisc(netdev_);
2220 ovs_assert(netdev->tc == NULL);
2222 /* Install new qdisc. */
2223 error = new_ops->tc_install(netdev_, details);
2224 ovs_assert((error == 0) == (netdev->tc != NULL));
2228 ovs_mutex_unlock(&netdev->mutex);
/* Retrieves configuration of queue 'queue_id' into 'details' via the
 * installed discipline's class_get hook. */
2233 netdev_linux_get_queue(const struct netdev *netdev_,
2234 unsigned int queue_id, struct smap *details)
2236 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2239 ovs_mutex_lock(&netdev->mutex);
2240 error = tc_query_qdisc(netdev_);
2242 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2244 ? netdev->tc->ops->class_get(netdev_, queue, details)
2247 ovs_mutex_unlock(&netdev->mutex);
/* Configures queue 'queue_id' with 'details' via the installed
 * discipline's class_set hook, after validating the id against the
 * discipline's queue limit. */
2253 netdev_linux_set_queue(struct netdev *netdev_,
2254 unsigned int queue_id, const struct smap *details)
2256 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2259 ovs_mutex_lock(&netdev->mutex);
2260 error = tc_query_qdisc(netdev_);
2262 error = (queue_id < netdev->tc->ops->n_queues
2263 && netdev->tc->ops->class_set
2264 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
2267 ovs_mutex_unlock(&netdev->mutex);
/* Deletes queue 'queue_id' via the installed discipline's class_delete
 * hook, if the discipline supports deletion and the queue exists. */
2273 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
2275 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2278 ovs_mutex_lock(&netdev->mutex);
2279 error = tc_query_qdisc(netdev_);
2281 if (netdev->tc->ops->class_delete) {
2282 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2284 ? netdev->tc->ops->class_delete(netdev_, queue)
2290 ovs_mutex_unlock(&netdev->mutex);
/* Retrieves statistics for queue 'queue_id' into '*stats' via the
 * installed discipline's class_get_stats hook. */
2296 netdev_linux_get_queue_stats(const struct netdev *netdev_,
2297 unsigned int queue_id,
2298 struct netdev_queue_stats *stats)
2300 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2303 ovs_mutex_lock(&netdev->mutex);
2304 error = tc_query_qdisc(netdev_);
2306 if (netdev->tc->ops->class_get_stats) {
2307 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
/* The creation timestamp comes from our bookkeeping, not the kernel. */
2309 stats->created = queue->created;
2310 error = netdev->tc->ops->class_get_stats(netdev_, queue,
2319 ovs_mutex_unlock(&netdev->mutex);
/* State for iterating over a qdisc's classes with an rtnetlink dump. */
2324 struct queue_dump_state {
2325 struct nl_dump dump;
/* Begins an RTM_GETTCLASS dump of 'netdev''s traffic classes into
 * '*state'.  The request buffer is released once the dump is started. */
2330 start_queue_dump(const struct netdev *netdev, struct queue_dump_state *state)
2332 struct ofpbuf request;
2333 struct tcmsg *tcmsg;
2335 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
/* tcm_parent == 0 dumps classes of every parent. */
2339 tcmsg->tcm_parent = 0;
2340 nl_dump_start(&state->dump, NETLINK_ROUTE, &request);
2341 ofpbuf_uninit(&request);
/* Reply buffer reused across nl_dump_next() calls. */
2343 ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
/* Releases dump resources and returns the dump's completion status. */
2348 finish_queue_dump(struct queue_dump_state *state)
2350 ofpbuf_uninit(&state->buf);
2351 return nl_dump_done(&state->dump);
/* Per-dump iteration state: a snapshot of queue ids taken at dump
 * start. */
2354 struct netdev_linux_queue_state {
2355 unsigned int *queues;
/* Starts a queue dump by snapshotting all queue ids from the tc queue
 * hash into a freshly allocated state object stored in '*statep'. */
2361 netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
2363 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2366 ovs_mutex_lock(&netdev->mutex);
2367 error = tc_query_qdisc(netdev_);
2369 if (netdev->tc->ops->class_get) {
2370 struct netdev_linux_queue_state *state;
2371 struct tc_queue *queue;
2374 *statep = state = xmalloc(sizeof *state);
2375 state->n_queues = hmap_count(&netdev->tc->queues);
2376 state->cur_queue = 0;
2377 state->queues = xmalloc(state->n_queues * sizeof *state->queues);
/* Snapshot ids so the dump survives concurrent hash mutation. */
2380 HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
2381 state->queues[i++] = queue->queue_id;
2387 ovs_mutex_unlock(&netdev->mutex);
/* Advances the dump: finds the next snapshotted queue id that still
 * exists, fills '*queue_idp' and 'details' for it.  Ids whose queue has
 * disappeared since dump start are silently skipped. */
2393 netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
2394 unsigned int *queue_idp, struct smap *details)
2396 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2397 struct netdev_linux_queue_state *state = state_;
2400 ovs_mutex_lock(&netdev->mutex);
2401 while (state->cur_queue < state->n_queues) {
2402 unsigned int queue_id = state->queues[state->cur_queue++];
2403 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2406 *queue_idp = queue_id;
2407 error = netdev->tc->ops->class_get(netdev_, queue, details);
2411 ovs_mutex_unlock(&netdev->mutex);
/* Frees the queue-id snapshot allocated by queue_dump_start(). */
2417 netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
2420 struct netdev_linux_queue_state *state = state_;
2422 free(state->queues);
/* netdev dump_queue_stats() implementation: runs a netlink class dump and
 * invokes 'cb' (via the tc implementation's class_dump_stats hook) for each
 * class message received.
 * NOTE(review): lines are elided in this excerpt (e.g. the 'msg'/'retval'
 * declarations and several closing braces); code kept byte-identical. */
2428 netdev_linux_dump_queue_stats(const struct netdev *netdev_,
2429 netdev_dump_queue_stats_cb *cb, void *aux)
2431 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2434 ovs_mutex_lock(&netdev->mutex);
2435 error = tc_query_qdisc(netdev_);
2437 struct queue_dump_state state;
/* No class_dump_stats hook means the installed qdisc does not support
 * per-queue statistics. */
2439 if (!netdev->tc->ops->class_dump_stats) {
2441 } else if (!start_queue_dump(netdev_, &state)) {
2447 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
2448 retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
2455 retval = finish_queue_dump(&state);
2461 ovs_mutex_unlock(&netdev->mutex);
/* netdev get_in4() implementation: returns the device's IPv4 address and
 * netmask, querying the kernel with SIOCGIFADDR/SIOCGIFNETMASK on a cache
 * miss and caching both the result and the error under VALID_IN4. */
2467 netdev_linux_get_in4(const struct netdev *netdev_,
2468 struct in_addr *address, struct in_addr *netmask)
2470 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2473 ovs_mutex_lock(&netdev->mutex);
2474 if (!(netdev->cache_valid & VALID_IN4)) {
2475 error = netdev_linux_get_ipv4(netdev_, &netdev->address,
2476 SIOCGIFADDR, "SIOCGIFADDR");
2478 error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
2479 SIOCGIFNETMASK, "SIOCGIFNETMASK");
/* Cache the error too, so repeated calls do not re-issue failing ioctls. */
2481 netdev->in4_error = error;
2482 netdev->cache_valid |= VALID_IN4;
2484 error = netdev->in4_error;
/* INADDR_ANY means no address is assigned; report EADDRNOTAVAIL. */
2488 if (netdev->address.s_addr != INADDR_ANY) {
2489 *address = netdev->address;
2490 *netmask = netdev->netmask;
2492 error = EADDRNOTAVAIL;
2495 ovs_mutex_unlock(&netdev->mutex);
/* netdev set_in4() implementation: sets address and (when the address is not
 * INADDR_ANY) netmask via ioctl; on success the cache is refreshed, on
 * failure VALID_IN4 is invalidated so the next get re-queries the kernel.
 * NOTE(review): the success/failure branch structure is partly elided in
 * this excerpt. */
2501 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2502 struct in_addr netmask)
2504 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2507 ovs_mutex_lock(&netdev->mutex);
2508 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2510 netdev->address = address;
2511 netdev->netmask = netmask;
2512 if (address.s_addr != INADDR_ANY) {
2513 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2514 "SIOCSIFNETMASK", netmask);
2519 netdev->cache_valid |= VALID_IN4;
2520 netdev->in4_error = 0;
2522 netdev->cache_valid &= ~VALID_IN4;
2524 ovs_mutex_unlock(&netdev->mutex);
/* Parses one line of /proc/net/if_inet6 into '*in6' and 'ifname'.
 * Each line holds 32 hex digits of address (scanned here as 16 pairs via
 * the X8 macro), four ignored hex fields, then the interface name.
 * NOTE(review): the final ovs_scan argument ('ifname') and closing lines
 * are elided in this excerpt. */
2530 parse_if_inet6_line(const char *line,
2531 struct in6_addr *in6, char ifname[16 + 1])
2533 uint8_t *s6 = in6->s6_addr;
2534 #define X8 "%2"SCNx8
2535 return ovs_scan(line,
2536 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
2537 "%*x %*x %*x %*x %16s\n",
2538 &s6[0], &s6[1], &s6[2], &s6[3],
2539 &s6[4], &s6[5], &s6[6], &s6[7],
2540 &s6[8], &s6[9], &s6[10], &s6[11],
2541 &s6[12], &s6[13], &s6[14], &s6[15],
2545 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address.
2546 * Otherwise, sets '*in6' to 'in6addr_any' and returns the corresponding
2549 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2551 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2554 ovs_mutex_lock(&netdev->mutex);
2555 if (!(netdev->cache_valid & VALID_IN6)) {
/* Default to "no address" until a matching entry is found. */
2559 netdev->in6 = in6addr_any;
2560 netdev->in6_error = EADDRNOTAVAIL;
2562 file = fopen("/proc/net/if_inet6", "r");
2564 const char *name = netdev_get_name(netdev_);
2565 while (fgets(line, sizeof line, file)) {
2566 struct in6_addr in6_tmp;
2567 char ifname[16 + 1];
/* Take the first non-link-local address whose interface name matches. */
2568 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2569 && !strcmp(name, ifname)
2570 && !IN6_IS_ADDR_LINKLOCAL(&in6_tmp))
2572 netdev->in6 = in6_tmp;
2573 netdev->in6_error = 0;
/* Presumably set when /proc/net/if_inet6 cannot be opened (no kernel IPv6
 * support) — the fopen-failure branch is elided in this excerpt. */
2579 netdev->in6_error = EOPNOTSUPP;
2581 netdev->cache_valid |= VALID_IN6;
2584 error = netdev->in6_error;
2585 ovs_mutex_unlock(&netdev->mutex);
/* Fills '*sa' as an AF_INET sockaddr holding 'addr', zeroing the remainder
 * so no stale bytes leak into the kernel-facing structure. */
2591 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
2593 struct sockaddr_in sin;
2594 memset(&sin, 0, sizeof sin);
2595 sin.sin_family = AF_INET;
2596 sin.sin_addr = addr;
2599 memset(sa, 0, sizeof *sa);
2600 memcpy(sa, &sin, sizeof sin);
/* Issues address-setting ioctl 'ioctl_nr' ('ioctl_name' is for logging) on
 * 'netdev' with 'addr' as the argument, via the shared AF_INET socket. */
2604 do_set_addr(struct netdev *netdev,
2605 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2609 make_in4_sockaddr(&ifr.ifr_addr, addr);
2610 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2614 /* Adds 'router' as a default IP gateway. */
2616 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2618 struct in_addr any = { INADDR_ANY };
2622 memset(&rt, 0, sizeof rt);
/* Destination and mask of 0.0.0.0 make this the default route. */
2623 make_in4_sockaddr(&rt.rt_dst, any);
2624 make_in4_sockaddr(&rt.rt_gateway, router);
2625 make_in4_sockaddr(&rt.rt_genmask, any);
2626 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2627 error = af_inet_ioctl(SIOCADDRT, &rt);
2629 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
/* Looks up the route to 'host' by scanning /proc/net/route, setting
 * '*next_hop' to the gateway (or 0.0.0.0 if directly reachable) and
 * '*netdev_name' to a malloc'd copy of the egress interface name.
 * NOTE(review): the signature's third parameter, header-line skipping, and
 * return paths are elided in this excerpt. */
2635 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2638 static const char fn[] = "/proc/net/route";
2643 *netdev_name = NULL;
2644 stream = fopen(fn, "r");
2645 if (stream == NULL) {
2646 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
2651 while (fgets(line, sizeof line, stream)) {
2654 ovs_be32 dest, gateway, mask;
2655 int refcnt, metric, mtu;
2656 unsigned int flags, use, window, irtt;
2659 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2661 iface, &dest, &gateway, &flags, &refcnt,
2662 &use, &metric, &mask, &mtu, &window, &irtt)) {
2663 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2667 if (!(flags & RTF_UP)) {
2668 /* Skip routes that aren't up. */
2672 /* The output of 'dest', 'mask', and 'gateway' were given in
2673 * network byte order, so we don't need any endian
2674 * conversions here. */
2675 if ((dest & mask) == (host->s_addr & mask)) {
2677 /* The host is directly reachable. */
2678 next_hop->s_addr = 0;
2680 /* To reach the host, we must go through a gateway. */
2681 next_hop->s_addr = gateway;
2683 *netdev_name = xstrdup(iface);
/* netdev get_status() implementation: populates 'smap' with driver name,
 * driver version, and firmware version obtained from ETHTOOL_GDRVINFO,
 * caching the result under VALID_DRVINFO. */
2695 netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
2697 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2700 ovs_mutex_lock(&netdev->mutex);
2701 if (!(netdev->cache_valid & VALID_DRVINFO)) {
/* ethtool's API takes a struct ethtool_cmd even for GDRVINFO; the drvinfo
 * storage is reinterpreted accordingly. */
2702 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
2704 COVERAGE_INC(netdev_get_ethtool);
2705 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2706 error = netdev_linux_do_ethtool(netdev->up.name,
2709 "ETHTOOL_GDRVINFO");
2711 netdev->cache_valid |= VALID_DRVINFO;
2716 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2717 smap_add(smap, "driver_version", netdev->drvinfo.version);
2718 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
2720 ovs_mutex_unlock(&netdev->mutex);
/* get_status() for internal devices: these have no ethtool driver, so just
 * report a fixed driver name. */
2726 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2729 smap_add(smap, "driver_name", "openvswitch");
2733 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2734 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2735 * returns 0. Otherwise, it returns a positive errno value; in particular,
2736 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
2738 netdev_linux_arp_lookup(const struct netdev *netdev,
2739 ovs_be32 ip, struct eth_addr *mac)
2742 struct sockaddr_in sin;
2745 memset(&r, 0, sizeof r);
2746 memset(&sin, 0, sizeof sin);
2747 sin.sin_family = AF_INET;
2748 sin.sin_addr.s_addr = ip;
/* arp_pa is a generic struct sockaddr; copy the AF_INET form into it. */
2750 memcpy(&r.arp_pa, &sin, sizeof sin);
2751 r.arp_ha.sa_family = ARPHRD_ETHER;
2753 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2754 COVERAGE_INC(netdev_arp_lookup);
2755 retval = af_inet_ioctl(SIOCGARP, &r);
2757 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
/* ENXIO just means "no entry", which is expected; don't log it. */
2758 } else if (retval != ENXIO) {
2759 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2760 netdev_get_name(netdev), IP_ARGS(ip),
2761 ovs_strerror(retval));
/* Converts netdev flag bits (NETDEV_*) to Linux interface flag bits (IFF_*).
 * NOTE(review): the IFF_UP / IFF_PROMISC assignments and the return are
 * elided in this excerpt. */
2767 nd_to_iff_flags(enum netdev_flags nd)
2770 if (nd & NETDEV_UP) {
2773 if (nd & NETDEV_PROMISC) {
2776 if (nd & NETDEV_LOOPBACK) {
2777 iff |= IFF_LOOPBACK;
/* Inverse of nd_to_iff_flags(): converts IFF_* bits to NETDEV_* bits. */
2783 iff_to_nd_flags(int iff)
2785 enum netdev_flags nd = 0;
2789 if (iff & IFF_PROMISC) {
2790 nd |= NETDEV_PROMISC;
2792 if (iff & IFF_LOOPBACK) {
2793 nd |= NETDEV_LOOPBACK;
/* Clears 'off' and sets 'on' in the device's kernel interface flags, storing
 * the previous flags (translated to NETDEV_*) in '*old_flagsp'.  Re-reads the
 * flags afterward so netdev->ifi_flags reflects what the kernel accepted.
 * Caller must hold netdev->mutex (OVS_REQUIRES below). */
2799 update_flags(struct netdev_linux *netdev, enum netdev_flags off,
2800 enum netdev_flags on, enum netdev_flags *old_flagsp)
2801 OVS_REQUIRES(netdev->mutex)
2803 int old_flags, new_flags;
2806 old_flags = netdev->ifi_flags;
2807 *old_flagsp = iff_to_nd_flags(old_flags);
2808 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
/* Only touch the kernel when something actually changes. */
2809 if (new_flags != old_flags) {
2810 error = set_flags(netdev_get_name(&netdev->up), new_flags);
2811 get_flags(&netdev->up, &netdev->ifi_flags);
/* netdev update_flags() implementation: thin mutex-taking wrapper around
 * update_flags(). */
2818 netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2819 enum netdev_flags on, enum netdev_flags *old_flagsp)
2821 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2824 ovs_mutex_lock(&netdev->mutex);
2825 error = update_flags(netdev, off, on, old_flagsp);
2826 ovs_mutex_unlock(&netdev->mutex);
/* Template for the netdev_class structures shared by the "system", "tap",
 * and "internal" device classes: the four parameterized hooks differ, the
 * rest of the vtable is common.
 * NOTE(review): many continuation lines of this macro are elided in this
 * excerpt; no comments are inserted inside the macro body because each line
 * must end in a backslash. */
2831 #define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS,          \
2832 GET_FEATURES, GET_STATUS)            \
2838 netdev_linux_wait,                                \
2840 netdev_linux_alloc,                               \
2842 netdev_linux_destruct,                            \
2843 netdev_linux_dealloc,                             \
2844 NULL,                       /* get_config */        \
2845 NULL,                       /* set_config */        \
2846 NULL,                       /* get_tunnel_config */ \
2847 NULL,                       /* build header */      \
2848 NULL,                       /* push header */       \
2849 NULL,                       /* pop header */        \
2850 NULL,                       /* get_numa_id */       \
2851 NULL,                       /* set_multiq */        \
2853 netdev_linux_send,                                \
2854 netdev_linux_send_wait,                           \
2856 netdev_linux_set_etheraddr,                       \
2857 netdev_linux_get_etheraddr,                       \
2858 netdev_linux_get_mtu,                             \
2859 netdev_linux_set_mtu,                             \
2860 netdev_linux_get_ifindex,                         \
2861 netdev_linux_get_carrier,                         \
2862 netdev_linux_get_carrier_resets,                  \
2863 netdev_linux_set_miimon_interval,                 \
2867 netdev_linux_set_advertisements,                  \
2869 netdev_linux_set_policing,                        \
2870 netdev_linux_get_qos_types,                       \
2871 netdev_linux_get_qos_capabilities,                \
2872 netdev_linux_get_qos,                             \
2873 netdev_linux_set_qos,                             \
2874 netdev_linux_get_queue,                           \
2875 netdev_linux_set_queue,                           \
2876 netdev_linux_delete_queue,                        \
2877 netdev_linux_get_queue_stats,                     \
2878 netdev_linux_queue_dump_start,                    \
2879 netdev_linux_queue_dump_next,                     \
2880 netdev_linux_queue_dump_done,                     \
2881 netdev_linux_dump_queue_stats,                    \
2883 netdev_linux_get_in4,                             \
2884 netdev_linux_set_in4,                             \
2885 netdev_linux_get_in6,                             \
2886 netdev_linux_add_router,                          \
2887 netdev_linux_get_next_hop,                        \
2889 netdev_linux_arp_lookup,                          \
2891 netdev_linux_update_flags,                        \
2893 netdev_linux_rxq_alloc,                           \
2894 netdev_linux_rxq_construct,                       \
2895 netdev_linux_rxq_destruct,                        \
2896 netdev_linux_rxq_dealloc,                         \
2897 netdev_linux_rxq_recv,                            \
2898 netdev_linux_rxq_wait,                            \
2899 netdev_linux_rxq_drain,                           \
/* "system" devices: ordinary kernel network devices. */
2902 const struct netdev_class netdev_linux_class =
2905 netdev_linux_construct,
2906 netdev_linux_get_stats,
2907 netdev_linux_get_features,
2908 netdev_linux_get_status);
/* "tap" devices: userspace TAP devices with a tap-specific constructor and
 * stats hook. */
2910 const struct netdev_class netdev_tap_class =
2913 netdev_linux_construct_tap,
2914 netdev_tap_get_stats,
2915 netdev_linux_get_features,
2916 netdev_linux_get_status);
/* "internal" devices: datapath-internal ports; no get_features. */
2918 const struct netdev_class netdev_internal_class =
2921 netdev_linux_construct,
2922 netdev_internal_get_stats,
2923 NULL,                       /* get_features */
2924 netdev_internal_get_status);
/* --- CoDel traffic control class ("linux-codel") ---
 * NOTE(review): lines are elided throughout this excerpt (struct codel
 * definition, return types, braces); code kept byte-identical. */
2927 #define CODEL_N_QUEUES 0x0000
2929 /* In sufficiently new kernel headers these are defined as enums in
2930 * <linux/pkt_sched.h>. Define them here as macros to help out with older
2931 * kernels. (This overrides any enum definition in the header file but that's
2933 #define TCA_CODEL_TARGET 1
2934 #define TCA_CODEL_LIMIT 2
2935 #define TCA_CODEL_INTERVAL 3
/* Downcasts the netdev's installed tc to the codel representation. */
2944 static struct codel *
2945 codel_get__(const struct netdev *netdev_)
2947 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2948 return CONTAINER_OF(netdev->tc, struct codel, tc);
/* Records a codel configuration (target/limit/interval) as the netdev's
 * current tc state; does not touch the kernel. */
2952 codel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
2955 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2956 struct codel *codel;
2958 codel = xmalloc(sizeof *codel);
2959 tc_init(&codel->tc, &tc_ops_codel);
2960 codel->target = target;
2961 codel->limit = limit;
2962 codel->interval = interval;
2964 netdev->tc = &codel->tc;
/* Replaces the device's root qdisc with a "codel" qdisc configured from the
 * given parameters (0 selects the defaults applied below). */
2968 codel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
2972 struct ofpbuf request;
2973 struct tcmsg *tcmsg;
2974 uint32_t otarget, olimit, ointerval;
2977 tc_del_qdisc(netdev);
2979 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2980 NLM_F_EXCL | NLM_F_CREATE, &request);
2984 tcmsg->tcm_handle = tc_make_handle(1, 0);
2985 tcmsg->tcm_parent = TC_H_ROOT;
/* Fall back to defaults for any zero parameter. */
2987 otarget = target ? target : 5000;
2988 olimit = limit ? limit : 10240;
2989 ointerval = interval ? interval : 100000;
2991 nl_msg_put_string(&request, TCA_KIND, "codel");
2992 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2993 nl_msg_put_u32(&request, TCA_CODEL_TARGET, otarget);
2994 nl_msg_put_u32(&request, TCA_CODEL_LIMIT, olimit);
2995 nl_msg_put_u32(&request, TCA_CODEL_INTERVAL, ointerval);
2996 nl_msg_end_nested(&request, opt_offset);
2998 error = tc_transact(&request, NULL);
3000 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3001 "target %u, limit %u, interval %u error %d(%s)",
3002 netdev_get_name(netdev),
3003 otarget, olimit, ointerval,
3004 error, ovs_strerror(error));
/* Parses "target"/"limit"/"interval" from 'details' into 'codel', applying
 * defaults (5000/10240/100000) for missing or zero values. */
3010 codel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
3011 const struct smap *details, struct codel *codel)
3013 const char *target_s;
3014 const char *limit_s;
3015 const char *interval_s;
3017 target_s = smap_get(details, "target");
3018 limit_s = smap_get(details, "limit");
3019 interval_s = smap_get(details, "interval");
3021 codel->target = target_s ? strtoull(target_s, NULL, 10) : 0;
3022 codel->limit = limit_s ? strtoull(limit_s, NULL, 10) : 0;
3023 codel->interval = interval_s ? strtoull(interval_s, NULL, 10) : 0;
3025 if (!codel->target) {
3026 codel->target = 5000;
3028 if (!codel->limit) {
3029 codel->limit = 10240;
3031 if (!codel->interval) {
3032 codel->interval = 100000;
/* tc_ops tc_install hook: configures the kernel qdisc and, on success,
 * records the state locally via codel_install__(). */
3037 codel_tc_install(struct netdev *netdev, const struct smap *details)
3042 codel_parse_qdisc_details__(netdev, details, &codel);
3043 error = codel_setup_qdisc__(netdev, codel.target, codel.limit,
3046 codel_install__(netdev, codel.target, codel.limit, codel.interval);
/* Parses the nested TCA_OPTIONS attributes of a codel qdisc message. */
3052 codel_parse_tca_options__(struct nlattr *nl_options, struct codel *codel)
3054 static const struct nl_policy tca_codel_policy[] = {
3055 [TCA_CODEL_TARGET] = { .type = NL_A_U32 },
3056 [TCA_CODEL_LIMIT] = { .type = NL_A_U32 },
3057 [TCA_CODEL_INTERVAL] = { .type = NL_A_U32 }
3060 struct nlattr *attrs[ARRAY_SIZE(tca_codel_policy)];
3062 if (!nl_parse_nested(nl_options, tca_codel_policy,
3063 attrs, ARRAY_SIZE(tca_codel_policy))) {
3064 VLOG_WARN_RL(&rl, "failed to parse CoDel class options");
3068 codel->target = nl_attr_get_u32(attrs[TCA_CODEL_TARGET]);
3069 codel->limit = nl_attr_get_u32(attrs[TCA_CODEL_LIMIT]);
3070 codel->interval = nl_attr_get_u32(attrs[TCA_CODEL_INTERVAL]);
/* tc_ops tc_load hook: reconstructs local codel state from a kernel qdisc
 * reply message. */
3075 codel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3077 struct nlattr *nlattr;
3082 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3087 error = codel_parse_tca_options__(nlattr, &codel);
3092 codel_install__(netdev, codel.target, codel.limit, codel.interval);
/* tc_ops tc_destroy hook: frees the codel state. */
3098 codel_tc_destroy(struct tc *tc)
3100 struct codel *codel = CONTAINER_OF(tc, struct codel, tc);
/* tc_ops qdisc_get hook: exports the cached configuration into 'details'. */
3106 codel_qdisc_get(const struct netdev *netdev, struct smap *details)
3108 const struct codel *codel = codel_get__(netdev);
3109 smap_add_format(details, "target", "%u", codel->target);
3110 smap_add_format(details, "limit", "%u", codel->limit);
3111 smap_add_format(details, "interval", "%u", codel->interval);
/* tc_ops qdisc_set hook: re-derives configuration from 'details' and updates
 * the cached state.
 * NOTE(review): unlike codel_tc_install(), no codel_setup_qdisc__() call is
 * visible here — presumably elided from this excerpt; verify upstream. */
3116 codel_qdisc_set(struct netdev *netdev, const struct smap *details)
3120 codel_parse_qdisc_details__(netdev, details, &codel);
3121 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3122 codel_get__(netdev)->target = codel.target;
3123 codel_get__(netdev)->limit = codel.limit;
3124 codel_get__(netdev)->interval = codel.interval;
/* Vtable binding codel to the tc framework; remaining members elided here. */
3128 static const struct tc_ops tc_ops_codel = {
3129 "codel", /* linux_name */
3130 "linux-codel", /* ovs_name */
3131 CODEL_N_QUEUES, /* n_queues */
3144 /* FQ-CoDel traffic control class. */
3146 #define FQCODEL_N_QUEUES 0x0000
3148 /* In sufficiently new kernel headers these are defined as enums in
3149 * <linux/pkt_sched.h>. Define them here as macros to help out with older
3150 * kernels. (This overrides any enum definition in the header file but that's
3152 #define TCA_FQ_CODEL_TARGET 1
3153 #define TCA_FQ_CODEL_LIMIT 2
3154 #define TCA_FQ_CODEL_INTERVAL 3
3155 #define TCA_FQ_CODEL_ECN 4
3156 #define TCA_FQ_CODEL_FLOWS 5
3157 #define TCA_FQ_CODEL_QUANTUM 6
/* Downcasts the netdev's installed tc to the fqcodel representation.
 * NOTE(review): lines are elided throughout this excerpt (struct fqcodel
 * definition, return types, braces); code kept byte-identical. */
3168 static struct fqcodel *
3169 fqcodel_get__(const struct netdev *netdev_)
3171 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3172 return CONTAINER_OF(netdev->tc, struct fqcodel, tc);
/* Records an fq_codel configuration as the netdev's current tc state;
 * does not touch the kernel. */
3176 fqcodel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
3177 uint32_t interval, uint32_t flows, uint32_t quantum)
3179 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3180 struct fqcodel *fqcodel;
3182 fqcodel = xmalloc(sizeof *fqcodel);
3183 tc_init(&fqcodel->tc, &tc_ops_fqcodel);
3184 fqcodel->target = target;
3185 fqcodel->limit = limit;
3186 fqcodel->interval = interval;
3187 fqcodel->flows = flows;
3188 fqcodel->quantum = quantum;
3190 netdev->tc = &fqcodel->tc;
/* Replaces the device's root qdisc with an "fq_codel" qdisc configured from
 * the given parameters (0 selects the defaults applied below). */
3194 fqcodel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
3195 uint32_t interval, uint32_t flows, uint32_t quantum)
3198 struct ofpbuf request;
3199 struct tcmsg *tcmsg;
3200 uint32_t otarget, olimit, ointerval, oflows, oquantum;
3203 tc_del_qdisc(netdev);
3205 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3206 NLM_F_EXCL | NLM_F_CREATE, &request);
3210 tcmsg->tcm_handle = tc_make_handle(1, 0);
3211 tcmsg->tcm_parent = TC_H_ROOT;
3213 otarget = target ? target : 5000;
3214 olimit = limit ? limit : 10240;
3215 ointerval = interval ? interval : 100000;
3216 oflows = flows ? flows : 1024;
3217 oquantum = quantum ? quantum : 1514; /* fq_codel default quantum is 1514
3220 nl_msg_put_string(&request, TCA_KIND, "fq_codel");
3221 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3222 nl_msg_put_u32(&request, TCA_FQ_CODEL_TARGET, otarget);
3223 nl_msg_put_u32(&request, TCA_FQ_CODEL_LIMIT, olimit);
3224 nl_msg_put_u32(&request, TCA_FQ_CODEL_INTERVAL, ointerval);
3225 nl_msg_put_u32(&request, TCA_FQ_CODEL_FLOWS, oflows);
3226 nl_msg_put_u32(&request, TCA_FQ_CODEL_QUANTUM, oquantum);
3227 nl_msg_end_nested(&request, opt_offset);
3229 error = tc_transact(&request, NULL);
3231 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3232 "target %u, limit %u, interval %u, flows %u, quantum %u error %d(%s)",
3233 netdev_get_name(netdev),
3234 otarget, olimit, ointerval, oflows, oquantum,
3235 error, ovs_strerror(error));
/* Parses "target"/"limit"/"interval"/"flows"/"quantum" from 'details' into
 * 'fqcodel', applying defaults for missing or zero values.
 * NOTE(review): the interval default here (1000000) differs from the
 * setup-time default above (100000) — possibly intentional, worth checking
 * against upstream history. */
3241 fqcodel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
3242 const struct smap *details, struct fqcodel *fqcodel)
3244 const char *target_s;
3245 const char *limit_s;
3246 const char *interval_s;
3247 const char *flows_s;
3248 const char *quantum_s;
3250 target_s = smap_get(details, "target");
3251 limit_s = smap_get(details, "limit");
3252 interval_s = smap_get(details, "interval");
3253 flows_s = smap_get(details, "flows");
3254 quantum_s = smap_get(details, "quantum");
3255 fqcodel->target = target_s ? strtoull(target_s, NULL, 10) : 0;
3256 fqcodel->limit = limit_s ? strtoull(limit_s, NULL, 10) : 0;
3257 fqcodel->interval = interval_s ? strtoull(interval_s, NULL, 10) : 0;
3258 fqcodel->flows = flows_s ? strtoull(flows_s, NULL, 10) : 0;
3259 fqcodel->quantum = quantum_s ? strtoull(quantum_s, NULL, 10) : 0;
3260 if (!fqcodel->target) {
3261 fqcodel->target = 5000;
3263 if (!fqcodel->limit) {
3264 fqcodel->limit = 10240;
3266 if (!fqcodel->interval) {
3267 fqcodel->interval = 1000000;
3269 if (!fqcodel->flows) {
3270 fqcodel->flows = 1024;
3272 if (!fqcodel->quantum) {
3273 fqcodel->quantum = 1514;
/* tc_ops tc_install hook: configures the kernel qdisc and, on success,
 * records the state locally. */
3278 fqcodel_tc_install(struct netdev *netdev, const struct smap *details)
3281 struct fqcodel fqcodel;
3283 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3284 error = fqcodel_setup_qdisc__(netdev, fqcodel.target, fqcodel.limit,
3285 fqcodel.interval, fqcodel.flows,
3288 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit,
3289 fqcodel.interval, fqcodel.flows, fqcodel.quantum);
/* Parses the nested TCA_OPTIONS attributes of an fq_codel qdisc message. */
3295 fqcodel_parse_tca_options__(struct nlattr *nl_options, struct fqcodel *fqcodel)
3297 static const struct nl_policy tca_fqcodel_policy[] = {
3298 [TCA_FQ_CODEL_TARGET] = { .type = NL_A_U32 },
3299 [TCA_FQ_CODEL_LIMIT] = { .type = NL_A_U32 },
3300 [TCA_FQ_CODEL_INTERVAL] = { .type = NL_A_U32 },
3301 [TCA_FQ_CODEL_FLOWS] = { .type = NL_A_U32 },
3302 [TCA_FQ_CODEL_QUANTUM] = { .type = NL_A_U32 }
3305 struct nlattr *attrs[ARRAY_SIZE(tca_fqcodel_policy)];
3307 if (!nl_parse_nested(nl_options, tca_fqcodel_policy,
3308 attrs, ARRAY_SIZE(tca_fqcodel_policy))) {
3309 VLOG_WARN_RL(&rl, "failed to parse FQ_CoDel class options");
3313 fqcodel->target = nl_attr_get_u32(attrs[TCA_FQ_CODEL_TARGET]);
3314 fqcodel->limit = nl_attr_get_u32(attrs[TCA_FQ_CODEL_LIMIT]);
3315 fqcodel->interval =nl_attr_get_u32(attrs[TCA_FQ_CODEL_INTERVAL]);
3316 fqcodel->flows = nl_attr_get_u32(attrs[TCA_FQ_CODEL_FLOWS]);
3317 fqcodel->quantum = nl_attr_get_u32(attrs[TCA_FQ_CODEL_QUANTUM]);
/* tc_ops tc_load hook: reconstructs local fq_codel state from a kernel
 * qdisc reply message. */
3322 fqcodel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3324 struct nlattr *nlattr;
3327 struct fqcodel fqcodel;
3329 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3334 error = fqcodel_parse_tca_options__(nlattr, &fqcodel);
3339 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3340 fqcodel.flows, fqcodel.quantum);
/* tc_ops tc_destroy hook: frees the fqcodel state. */
3345 fqcodel_tc_destroy(struct tc *tc)
3347 struct fqcodel *fqcodel = CONTAINER_OF(tc, struct fqcodel, tc);
/* tc_ops qdisc_get hook: exports the cached configuration into 'details'. */
3353 fqcodel_qdisc_get(const struct netdev *netdev, struct smap *details)
3355 const struct fqcodel *fqcodel = fqcodel_get__(netdev);
3356 smap_add_format(details, "target", "%u", fqcodel->target);
3357 smap_add_format(details, "limit", "%u", fqcodel->limit);
3358 smap_add_format(details, "interval", "%u", fqcodel->interval);
3359 smap_add_format(details, "flows", "%u", fqcodel->flows);
3360 smap_add_format(details, "quantum", "%u", fqcodel->quantum);
/* tc_ops qdisc_set hook: re-derives configuration from 'details' and updates
 * the cached state.
 * NOTE(review): as with codel_qdisc_set(), no *_setup_qdisc__() call is
 * visible — possibly elided from this excerpt; verify upstream. */
3365 fqcodel_qdisc_set(struct netdev *netdev, const struct smap *details)
3367 struct fqcodel fqcodel;
3369 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3370 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3371 fqcodel.flows, fqcodel.quantum);
3372 fqcodel_get__(netdev)->target = fqcodel.target;
3373 fqcodel_get__(netdev)->limit = fqcodel.limit;
3374 fqcodel_get__(netdev)->interval = fqcodel.interval;
3375 fqcodel_get__(netdev)->flows = fqcodel.flows;
3376 fqcodel_get__(netdev)->quantum = fqcodel.quantum;
/* Vtable binding fq_codel to the tc framework; remaining members elided. */
3380 static const struct tc_ops tc_ops_fqcodel = {
3381 "fq_codel", /* linux_name */
3382 "linux-fq_codel", /* ovs_name */
3383 FQCODEL_N_QUEUES, /* n_queues */
3396 /* SFQ traffic control class. */
3398 #define SFQ_N_QUEUES 0x0000
/* Downcasts the netdev's installed tc to the sfq representation.
 * NOTE(review): lines are elided throughout this excerpt (struct sfq
 * definition, return types, braces); code kept byte-identical. */
3407 sfq_get__(const struct netdev *netdev_)
3409 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3410 return CONTAINER_OF(netdev->tc, struct sfq, tc);
/* Records an sfq configuration (quantum/perturb) as the netdev's current tc
 * state; does not touch the kernel. */
3414 sfq_install__(struct netdev *netdev_, uint32_t quantum, uint32_t perturb)
3416 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3419 sfq = xmalloc(sizeof *sfq);
3420 tc_init(&sfq->tc, &tc_ops_sfq);
3421 sfq->perturb = perturb;
3422 sfq->quantum = quantum;
3424 netdev->tc = &sfq->tc;
/* Replaces the device's root qdisc with an "sfq" qdisc.  Unlike codel,
 * sfq options go in a single binary struct tc_sfq_qopt rather than nested
 * netlink attributes; a zero quantum falls back to the device MTU. */
3428 sfq_setup_qdisc__(struct netdev *netdev, uint32_t quantum, uint32_t perturb)
3430 struct tc_sfq_qopt opt;
3431 struct ofpbuf request;
3432 struct tcmsg *tcmsg;
3434 int mtu_error, error;
3435 mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3437 tc_del_qdisc(netdev);
3439 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3440 NLM_F_EXCL | NLM_F_CREATE, &request);
3444 tcmsg->tcm_handle = tc_make_handle(1, 0);
3445 tcmsg->tcm_parent = TC_H_ROOT;
3447 memset(&opt, 0, sizeof opt);
3450 opt.quantum = mtu; /* if we cannot find mtu, use default */
3453 opt.quantum = quantum;
/* Default perturbation period of 10 s when none was requested. */
3457 opt.perturb_period = 10;
3459 opt.perturb_period = perturb;
3462 nl_msg_put_string(&request, TCA_KIND, "sfq");
3463 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3465 error = tc_transact(&request, NULL);
3467 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3468 "quantum %u, perturb %u error %d(%s)",
3469 netdev_get_name(netdev),
3470 opt.quantum, opt.perturb_period,
3471 error, ovs_strerror(error));
/* Parses "perturb"/"quantum" from 'details' into 'sfq'; a missing quantum
 * falls back to the device MTU, warning if the MTU is unavailable. */
3477 sfq_parse_qdisc_details__(struct netdev *netdev,
3478 const struct smap *details, struct sfq *sfq)
3480 const char *perturb_s;
3481 const char *quantum_s;
3485 perturb_s = smap_get(details, "perturb");
3486 quantum_s = smap_get(details, "quantum");
3487 sfq->perturb = perturb_s ? strtoull(perturb_s, NULL, 10) : 0;
3488 sfq->quantum = quantum_s ? strtoull(quantum_s, NULL, 10) : 0;
3489 if (!sfq->perturb) {
3493 if (!sfq->quantum) {
3494 mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3498 VLOG_WARN_RL(&rl, "when using SFQ, you must specify quantum on a "
3499 "device without mtu");
/* tc_ops tc_install hook: configures the kernel qdisc and, on success,
 * records the state locally. */
3506 sfq_tc_install(struct netdev *netdev, const struct smap *details)
3511 sfq_parse_qdisc_details__(netdev, details, &sfq);
3512 error = sfq_setup_qdisc__(netdev, sfq.quantum, sfq.perturb);
3514 sfq_install__(netdev, sfq.quantum, sfq.perturb);
/* tc_ops tc_load hook: reconstructs local sfq state from a kernel qdisc
 * reply message.
 * NOTE(review): the sfq_install__() arguments below appear swapped relative
 * to its (quantum, perturb) signature — perturb_period is passed first.
 * Worth checking against upstream before relying on loaded values. */
3520 sfq_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3522 const struct tc_sfq_qopt *sfq;
3523 struct nlattr *nlattr;
3527 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3529 sfq = nl_attr_get(nlattr);
3530 sfq_install__(netdev, sfq->perturb_period, sfq->quantum);
/* tc_ops tc_destroy hook: frees the sfq state. */
3538 sfq_tc_destroy(struct tc *tc)
3540 struct sfq *sfq = CONTAINER_OF(tc, struct sfq, tc);
/* tc_ops qdisc_get hook: exports the cached configuration into 'details'. */
3546 sfq_qdisc_get(const struct netdev *netdev, struct smap *details)
3548 const struct sfq *sfq = sfq_get__(netdev);
3549 smap_add_format(details, "quantum", "%u", sfq->quantum);
3550 smap_add_format(details, "perturb", "%u", sfq->perturb);
/* tc_ops qdisc_set hook: re-derives configuration from 'details' and updates
 * the cached state. */
3555 sfq_qdisc_set(struct netdev *netdev, const struct smap *details)
3559 sfq_parse_qdisc_details__(netdev, details, &sfq);
3560 sfq_install__(netdev, sfq.quantum, sfq.perturb);
3561 sfq_get__(netdev)->quantum = sfq.quantum;
3562 sfq_get__(netdev)->perturb = sfq.perturb;
/* Vtable binding sfq to the tc framework; remaining members elided. */
3566 static const struct tc_ops tc_ops_sfq = {
3567 "sfq", /* linux_name */
3568 "linux-sfq", /* ovs_name */
3569 SFQ_N_QUEUES, /* n_queues */
3582 /* HTB traffic control class. */
3584 #define HTB_N_QUEUES 0xf000
3585 #define HTB_RATE2QUANTUM 10
3589 unsigned int max_rate; /* In bytes/s. */
3593 struct tc_queue tc_queue;
3594 unsigned int min_rate; /* In bytes/s. */
3595 unsigned int max_rate; /* In bytes/s. */
3596 unsigned int burst; /* In bytes. */
3597 unsigned int priority; /* Lower values are higher priorities. */
3601 htb_get__(const struct netdev *netdev_)
3603 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3604 return CONTAINER_OF(netdev->tc, struct htb, tc);
3608 htb_install__(struct netdev *netdev_, uint64_t max_rate)
3610 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3613 htb = xmalloc(sizeof *htb);
3614 tc_init(&htb->tc, &tc_ops_htb);
3615 htb->max_rate = max_rate;
3617 netdev->tc = &htb->tc;
3620 /* Create an HTB qdisc.
3622 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
3624 htb_setup_qdisc__(struct netdev *netdev)
3627 struct tc_htb_glob opt;
3628 struct ofpbuf request;
3629 struct tcmsg *tcmsg;
3631 tc_del_qdisc(netdev);
3633 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3634 NLM_F_EXCL | NLM_F_CREATE, &request);
3638 tcmsg->tcm_handle = tc_make_handle(1, 0);
3639 tcmsg->tcm_parent = TC_H_ROOT;
3641 nl_msg_put_string(&request, TCA_KIND, "htb");
3643 memset(&opt, 0, sizeof opt);
3644 opt.rate2quantum = HTB_RATE2QUANTUM;
3648 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3649 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
3650 nl_msg_end_nested(&request, opt_offset);
3652 return tc_transact(&request, NULL);
3655 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
3656 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
3658 htb_setup_class__(struct netdev *netdev, unsigned int handle,
3659 unsigned int parent, struct htb_class *class)
3662 struct tc_htb_opt opt;
3663 struct ofpbuf request;
3664 struct tcmsg *tcmsg;
3668 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3670 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
3671 netdev_get_name(netdev));
3675 memset(&opt, 0, sizeof opt);
3676 tc_fill_rate(&opt.rate, class->min_rate, mtu);
3677 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
3678 /* Makes sure the quantum is at least MTU. Setting quantum will
3679 * make htb ignore the r2q for this class. */
3680 if ((class->min_rate / HTB_RATE2QUANTUM) < mtu) {
3683 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
3684 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
3685 opt.prio = class->priority;
3687 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3691 tcmsg->tcm_handle = handle;
3692 tcmsg->tcm_parent = parent;
3694 nl_msg_put_string(&request, TCA_KIND, "htb");
3695 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3696 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
3697 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
3698 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
3699 nl_msg_end_nested(&request, opt_offset);
3701 error = tc_transact(&request, NULL);
3703 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3704 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
3705 netdev_get_name(netdev),
3706 tc_get_major(handle), tc_get_minor(handle),
3707 tc_get_major(parent), tc_get_minor(parent),
3708 class->min_rate, class->max_rate,
3709 class->burst, class->priority, ovs_strerror(error));
3714 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
3715 * description of them into 'details'. The description complies with the
3716 * specification given in the vswitch database documentation for linux-htb
3719 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
3721 static const struct nl_policy tca_htb_policy[] = {
3722 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
3723 .min_len = sizeof(struct tc_htb_opt) },
3726 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
3727 const struct tc_htb_opt *htb;
3729 if (!nl_parse_nested(nl_options, tca_htb_policy,
3730 attrs, ARRAY_SIZE(tca_htb_policy))) {
3731 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
3735 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
3736 class->min_rate = htb->rate.rate;
3737 class->max_rate = htb->ceil.rate;
3738 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
3739 class->priority = htb->prio;
3744 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3745 struct htb_class *options,
3746 struct netdev_queue_stats *stats)
3748 struct nlattr *nl_options;
3749 unsigned int handle;
3752 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3753 if (!error && queue_id) {
3754 unsigned int major = tc_get_major(handle);
3755 unsigned int minor = tc_get_minor(handle);
3756 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3757 *queue_id = minor - 1;
3762 if (!error && options) {
3763 error = htb_parse_tca_options__(nl_options, options);
3769 htb_parse_qdisc_details__(struct netdev *netdev_,
3770 const struct smap *details, struct htb_class *hc)
3772 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3773 const char *max_rate_s;
3775 max_rate_s = smap_get(details, "max-rate");
3776 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3777 if (!hc->max_rate) {
3778 enum netdev_features current;
3780 netdev_linux_read_features(netdev);
3781 current = !netdev->get_features_error ? netdev->current : 0;
3782 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3784 hc->min_rate = hc->max_rate;
/* Fills 'hc' with per-queue HTB class configuration from 'details'.
 * Rates are given in bits/s in the database and stored in bytes/s; all
 * values are clamped so min-rate <= max-rate <= the qdisc's max_rate.
 * Requires the device to report an MTU.  Returns 0 on success, otherwise a
 * positive errno value (the 'error'/'mtu' declarations are outside this
 * excerpt). */
3790 htb_parse_class_details__(struct netdev *netdev,
3791 const struct smap *details, struct htb_class *hc)
3793 const struct htb *htb = htb_get__(netdev);
3794 const char *min_rate_s = smap_get(details, "min-rate");
3795 const char *max_rate_s = smap_get(details, "max-rate");
3796 const char *burst_s = smap_get(details, "burst");
3797 const char *priority_s = smap_get(details, "priority");
3800 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3802 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
3803 netdev_get_name(netdev));
3807 /* HTB requires at least an mtu sized min-rate to send any traffic even
3808 * on uncongested links. */
3809 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3810 hc->min_rate = MAX(hc->min_rate, mtu);
3811 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
/* max-rate defaults (elided line) and is then clamped into range. */
3814 hc->max_rate = (max_rate_s
3815 ? strtoull(max_rate_s, NULL, 10) / 8
3817 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
3818 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
3822 * According to hints in the documentation that I've read, it is important
3823 * that 'burst' be at least as big as the largest frame that might be
3824 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
3825 * but having it a bit too small is a problem. Since netdev_get_mtu()
3826 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
3827 * the MTU. We actually add 64, instead of 14, as a guard against
3828 * additional headers being tacked on somewhere that we're not aware of. */
3829 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
3830 hc->burst = MAX(hc->burst, mtu + 64);
3833 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Queries the kernel for the single HTB class 'handle' (under 'parent') on
 * 'netdev' and parses the reply into '*options' and '*stats' (either may be
 * NULL).  Returns 0 on success, otherwise a positive errno value. */
3839 htb_query_class__(const struct netdev *netdev, unsigned int handle,
3840 unsigned int parent, struct htb_class *options,
3841 struct netdev_queue_stats *stats)
3843 struct ofpbuf *reply;
3846 error = tc_query_class(netdev, handle, parent, &reply);
3848 error = htb_parse_tcmsg__(reply, NULL, options, stats);
/* 'reply' is owned by us once tc_query_class() succeeds; free it here. */
3849 ofpbuf_delete(reply);
/* tc_ops "tc_install" callback for linux-htb: creates the root HTB qdisc,
 * then the default class 1:fffe from 'details', and records the new tc state
 * on 'netdev' via htb_install__(). */
3855 htb_tc_install(struct netdev *netdev, const struct smap *details)
3859 error = htb_setup_qdisc__(netdev);
3861 struct htb_class hc;
3863 htb_parse_qdisc_details__(netdev, details, &hc);
/* 1:fffe is the default class that unclassified traffic falls into. */
3864 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3865 tc_make_handle(1, 0), &hc);
3867 htb_install__(netdev, hc.max_rate);
/* Downcasts generic 'queue' to its containing struct htb_class. */
3873 static struct htb_class *
3874 htb_class_cast__(const struct tc_queue *queue)
3876 return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Creates or updates the in-memory record for 'queue_id' on 'netdev',
 * copying the rate/burst/priority parameters from 'hc'.  A new htb_class is
 * allocated and inserted into the tc queue hmap if the queue is not yet
 * known. */
3880 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
3881 const struct htb_class *hc)
3883 struct htb *htb = htb_get__(netdev);
3884 size_t hash = hash_int(queue_id, 0);
3885 struct tc_queue *queue;
3886 struct htb_class *hcp;
3888 queue = tc_find_queue__(netdev, queue_id, hash);
3890 hcp = htb_class_cast__(queue);
/* Queue not found (elided branch): allocate and insert a fresh record. */
3892 hcp = xmalloc(sizeof *hcp);
3893 queue = &hcp->tc_queue;
3894 queue->queue_id = queue_id;
3895 queue->created = time_msec();
3896 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
3899 hcp->min_rate = hc->min_rate;
3900 hcp->max_rate = hc->max_rate;
3901 hcp->burst = hc->burst;
3902 hcp->priority = hc->priority;
/* tc_ops "tc_load" callback for linux-htb: reconstructs OVS's view of an
 * existing kernel HTB configuration by querying the default class 1:fffe for
 * the qdisc max-rate and then dumping all classes into the queue map. */
3906 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3909 struct queue_dump_state state;
3910 struct htb_class hc;
3912 /* Get qdisc options. */
3914 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3915 htb_install__(netdev, hc.max_rate);
/* Walk every class the kernel reports and mirror it locally. */
3918 if (!start_queue_dump(netdev, &state)) {
3921 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
3922 unsigned int queue_id;
3924 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3925 htb_update_queue__(netdev, queue_id, &hc);
3928 finish_queue_dump(&state);
/* tc_ops "tc_destroy" callback for linux-htb: frees every queue record and
 * then the struct htb itself (final free is outside this excerpt). */
3934 htb_tc_destroy(struct tc *tc)
3936 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
3937 struct htb_class *hc, *next;
/* _SAFE variant: each node is removed (and freed below) while iterating. */
3939 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
3940 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* tc_ops "qdisc_get" callback: reports the qdisc max-rate, converting the
 * stored bytes/s back to the database's bits/s. */
3948 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
3950 const struct htb *htb = htb_get__(netdev);
3951 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
/* tc_ops "qdisc_set" callback: re-applies qdisc-level settings by rewriting
 * the default class 1:fffe, then caches the new max-rate on success. */
3956 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
3958 struct htb_class hc;
3961 htb_parse_qdisc_details__(netdev, details, &hc);
3962 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3963 tc_make_handle(1, 0), &hc);
3965 htb_get__(netdev)->max_rate = hc.max_rate;
/* tc_ops "class_get" callback: exposes one queue's configuration, converting
 * bytes/s back to bits/s.  "max-rate" is omitted when it equals "min-rate",
 * mirroring how unset max-rate is parsed. */
3971 htb_class_get(const struct netdev *netdev OVS_UNUSED,
3972 const struct tc_queue *queue, struct smap *details)
3974 const struct htb_class *hc = htb_class_cast__(queue);
3976 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3977 if (hc->min_rate != hc->max_rate) {
3978 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3980 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
3982 smap_add_format(details, "priority", "%u", hc->priority);
/* tc_ops "class_set" callback: parses 'details', installs kernel class
 * 1:(queue_id+1) under parent 1:fffe, and updates the local queue record. */
3988 htb_class_set(struct netdev *netdev, unsigned int queue_id,
3989 const struct smap *details)
3991 struct htb_class hc;
3994 error = htb_parse_class_details__(netdev, details, &hc);
/* Kernel minor numbers are 1-based; OVS queue IDs are 0-based. */
3999 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
4000 tc_make_handle(1, 0xfffe), &hc);
4005 htb_update_queue__(netdev, queue_id, &hc);
/* tc_ops "class_delete" callback: deletes kernel class 1:(queue_id+1) and,
 * on success, drops (and frees, outside this excerpt) the local record. */
4010 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
4012 struct htb_class *hc = htb_class_cast__(queue);
4013 struct htb *htb = htb_get__(netdev);
4016 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
4018 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* tc_ops "class_get_stats" callback: fetches kernel statistics for one queue
 * by querying its class 1:(queue_id+1). */
4025 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
4026 struct netdev_queue_stats *stats)
4028 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
4029 tc_make_handle(1, 0xfffe), NULL, stats);
/* tc_ops "class_dump_stats" callback: parses one dumped class message and,
 * if its handle maps to an OVS queue (major 1, minor in range), invokes
 * '*cb' with the queue's statistics. */
4033 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
4034 const struct ofpbuf *nlmsg,
4035 netdev_dump_queue_stats_cb *cb, void *aux)
4037 struct netdev_queue_stats stats;
4038 unsigned int handle, major, minor;
4041 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
4046 major = tc_get_major(handle);
4047 minor = tc_get_minor(handle);
4048 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
4049 (*cb)(minor - 1, &stats, aux);
/* Callback table binding the "linux-htb" OVS QoS type to the kernel "htb"
 * qdisc (several middle entries are elided in this excerpt). */
4054 static const struct tc_ops tc_ops_htb = {
4055 "htb", /* linux_name */
4056 "linux-htb", /* ovs_name */
4057 HTB_N_QUEUES, /* n_queues */
4066 htb_class_get_stats,
4067 htb_class_dump_stats
4070 /* "linux-hfsc" traffic control class. */
4072 #define HFSC_N_QUEUES 0xf000
4080 struct tc_queue tc_queue;
/* Returns 'netdev_''s tc state downcast to its containing struct hfsc.
 * Only valid while the linux-hfsc implementation is installed. */
4085 static struct hfsc *
4086 hfsc_get__(const struct netdev *netdev_)
4088 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4089 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
/* Downcasts generic 'queue' to its containing struct hfsc_class. */
4092 static struct hfsc_class *
4093 hfsc_class_cast__(const struct tc_queue *queue)
4095 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Allocates and initializes the in-memory HFSC state for 'netdev_' with
 * qdisc-wide 'max_rate' (bytes/s) and installs it as the device's tc. */
4099 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
4101 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4104 hfsc = xmalloc(sizeof *hfsc);
4105 tc_init(&hfsc->tc, &tc_ops_hfsc);
4106 hfsc->max_rate = max_rate;
4107 netdev->tc = &hfsc->tc;
/* Creates or updates the in-memory record for 'queue_id' on 'netdev',
 * copying min_rate/max_rate from 'hc'.  A new hfsc_class is allocated and
 * inserted into the tc queue hmap if the queue is not yet known. */
4111 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
4112 const struct hfsc_class *hc)
4116 struct hfsc_class *hcp;
4117 struct tc_queue *queue;
4119 hfsc = hfsc_get__(netdev);
4120 hash = hash_int(queue_id, 0);
4122 queue = tc_find_queue__(netdev, queue_id, hash);
4124 hcp = hfsc_class_cast__(queue);
/* Queue not found (elided branch): allocate and insert a fresh record. */
4126 hcp = xmalloc(sizeof *hcp);
4127 queue = &hcp->tc_queue;
4128 queue->queue_id = queue_id;
4129 queue->created = time_msec();
4130 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
4133 hcp->min_rate = hc->min_rate;
4134 hcp->max_rate = hc->max_rate;
/* Parses the TCA_OPTIONS of an HFSC class into 'class'.  Only the subset of
 * HFSC that OVS itself configures is accepted: linear service curves
 * (m1 == 0, d == 0) with identical real-time and link-share curves and an
 * upper-limit curve no smaller than them.  Warns and rejects (elided error
 * returns) anything else.  Returns 0 on success, otherwise a positive errno
 * value. */
4138 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
4140 const struct tc_service_curve *rsc, *fsc, *usc;
4141 static const struct nl_policy tca_hfsc_policy[] = {
4143 .type = NL_A_UNSPEC,
4145 .min_len = sizeof(struct tc_service_curve),
4148 .type = NL_A_UNSPEC,
4150 .min_len = sizeof(struct tc_service_curve),
4153 .type = NL_A_UNSPEC,
4155 .min_len = sizeof(struct tc_service_curve),
4158 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
4160 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
4161 attrs, ARRAY_SIZE(attrs))) {
4162 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
4166 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
4167 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
4168 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
/* m1/d nonzero would mean a two-slope (non-linear) curve. */
4170 if (rsc->m1 != 0 || rsc->d != 0 ||
4171 fsc->m1 != 0 || fsc->d != 0 ||
4172 usc->m1 != 0 || usc->d != 0) {
4173 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4174 "Non-linear service curves are not supported.");
4178 if (rsc->m2 != fsc->m2) {
4179 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4180 "Real-time service curves are not supported ");
4184 if (rsc->m2 > usc->m2) {
4185 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4186 "Min-rate service curve is greater than "
4187 "the max-rate service curve.");
/* Link-share slope is the min-rate; upper-limit slope is the max-rate. */
4191 class->min_rate = fsc->m2;
4192 class->max_rate = usc->m2;
/* Parses Netlink message 'tcmsg' describing an HFSC class.  On success
 * stores the OVS queue ID in '*queue_id', the class parameters in
 * '*options', and statistics in '*stats'; each output pointer may be NULL.
 * Returns 0 on success, otherwise a positive errno value. */
4197 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
4198 struct hfsc_class *options,
4199 struct netdev_queue_stats *stats)
4202 unsigned int handle;
4203 struct nlattr *nl_options;
4205 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
4211 unsigned int major, minor;
4213 major = tc_get_major(handle);
4214 minor = tc_get_minor(handle);
/* OVS places HFSC classes under major 1; minor N is OVS queue N - 1. */
4215 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
4216 *queue_id = minor - 1;
4223 error = hfsc_parse_tca_options__(nl_options, options);
/* Queries the kernel for the single HFSC class 'handle' (under 'parent') on
 * 'netdev' and parses the reply into '*options' and '*stats' (either may be
 * NULL).  Returns 0 on success, otherwise a positive errno value. */
4230 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
4231 unsigned int parent, struct hfsc_class *options,
4232 struct netdev_queue_stats *stats)
4235 struct ofpbuf *reply;
4237 error = tc_query_class(netdev, handle, parent, &reply);
4242 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
4243 ofpbuf_delete(reply);
/* Fills 'class' with qdisc-level HFSC configuration from 'details'.
 * "max-rate" is in bits/s and stored in bytes/s; if missing or zero, falls
 * back to the link's advertised speed (100 Mbps default).  min_rate is set
 * equal to max_rate for the qdisc-wide default class. */
4248 hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
4249 struct hfsc_class *class)
4251 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4253 const char *max_rate_s;
4255 max_rate_s = smap_get(details, "max-rate");
4256 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
4259 enum netdev_features current;
4261 netdev_linux_read_features(netdev);
4262 current = !netdev->get_features_error ? netdev->current : 0;
4263 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
4266 class->min_rate = max_rate;
4267 class->max_rate = max_rate;
/* Fills 'class' with per-queue HFSC configuration from 'details'.  Rates are
 * in bits/s in the database, stored in bytes/s, and clamped so that
 * 1 <= min_rate <= max_rate <= the qdisc's max_rate. */
4271 hfsc_parse_class_details__(struct netdev *netdev,
4272 const struct smap *details,
4273 struct hfsc_class * class)
4275 const struct hfsc *hfsc;
4276 uint32_t min_rate, max_rate;
4277 const char *min_rate_s, *max_rate_s;
4279 hfsc = hfsc_get__(netdev);
4280 min_rate_s = smap_get(details, "min-rate");
4281 max_rate_s = smap_get(details, "max-rate");
4283 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
/* HFSC needs a nonzero min-rate, hence the floor of 1 byte/s. */
4284 min_rate = MAX(min_rate, 1);
4285 min_rate = MIN(min_rate, hfsc->max_rate);
4287 max_rate = (max_rate_s
4288 ? strtoull(max_rate_s, NULL, 10) / 8
4290 max_rate = MAX(max_rate, min_rate);
4291 max_rate = MIN(max_rate, hfsc->max_rate);
4293 class->min_rate = min_rate;
4294 class->max_rate = max_rate;
/* Create an HFSC qdisc.
 *
 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
4303 hfsc_setup_qdisc__(struct netdev * netdev)
4305 struct tcmsg *tcmsg;
4306 struct ofpbuf request;
4307 struct tc_hfsc_qopt opt;
/* Remove any existing root qdisc first; HFSC replaces it wholesale. */
4309 tc_del_qdisc(netdev);
4311 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
4312 NLM_F_EXCL | NLM_F_CREATE, &request);
4318 tcmsg->tcm_handle = tc_make_handle(1, 0);
4319 tcmsg->tcm_parent = TC_H_ROOT;
4321 memset(&opt, 0, sizeof opt);
4324 nl_msg_put_string(&request, TCA_KIND, "hfsc");
4325 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
4327 return tc_transact(&request, NULL);
/* Create an HFSC class.
 *
 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
 * sc rate <min_rate> ul rate <max_rate>" */
4335 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
4336 unsigned int parent, struct hfsc_class *class)
4340 struct tcmsg *tcmsg;
4341 struct ofpbuf request;
4342 struct tc_service_curve min, max;
4344 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
4350 tcmsg->tcm_handle = handle;
4351 tcmsg->tcm_parent = parent;
/* Linear service curves: only the slope (m2, bytes/s) is nonzero. */
4355 min.m2 = class->min_rate;
4359 max.m2 = class->max_rate;
4361 nl_msg_put_string(&request, TCA_KIND, "hfsc");
4362 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
/* 'min' serves as both the real-time and link-share curve; 'max' is the
 * upper limit.  This matches what hfsc_parse_tca_options__() accepts. */
4363 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
4364 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
4365 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
4366 nl_msg_end_nested(&request, opt_offset);
4368 error = tc_transact(&request, NULL);
4370 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
4371 "min-rate %ubps, max-rate %ubps (%s)",
4372 netdev_get_name(netdev),
4373 tc_get_major(handle), tc_get_minor(handle),
4374 tc_get_major(parent), tc_get_minor(parent),
4375 class->min_rate, class->max_rate, ovs_strerror(error));
/* tc_ops "tc_install" callback for linux-hfsc: creates the root HFSC qdisc,
 * then the default class 1:fffe from 'details', and records the new tc state
 * on 'netdev' via hfsc_install__(). */
4382 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
4385 struct hfsc_class class;
4387 error = hfsc_setup_qdisc__(netdev);
4393 hfsc_parse_qdisc_details__(netdev, details, &class);
4394 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4395 tc_make_handle(1, 0), &class);
4401 hfsc_install__(netdev, class.max_rate);
/* tc_ops "tc_load" callback for linux-hfsc: reconstructs OVS's view of an
 * existing kernel HFSC configuration by querying the default class 1:fffe
 * for the qdisc max-rate and then dumping all classes into the queue map. */
4406 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4409 struct queue_dump_state state;
4410 struct hfsc_class hc;
4413 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
4414 hfsc_install__(netdev, hc.max_rate);
4416 if (!start_queue_dump(netdev, &state)) {
4420 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
4421 unsigned int queue_id;
4423 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
4424 hfsc_update_queue__(netdev, queue_id, &hc);
4428 finish_queue_dump(&state);
/* tc_ops "tc_destroy" callback for linux-hfsc: frees every queue record and
 * then the struct hfsc itself (final free is outside this excerpt). */
4433 hfsc_tc_destroy(struct tc *tc)
4436 struct hfsc_class *hc, *next;
4438 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
/* _SAFE variant: each node is removed (and freed below) while iterating. */
4440 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
4441 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* tc_ops "qdisc_get" callback: reports the qdisc max-rate, converting the
 * stored bytes/s back to the database's bits/s. */
4450 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
4452 const struct hfsc *hfsc;
4453 hfsc = hfsc_get__(netdev);
4454 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
/* tc_ops "qdisc_set" callback: re-applies qdisc-level settings by rewriting
 * the default class 1:fffe, then caches the new max-rate on success. */
4459 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
4462 struct hfsc_class class;
4464 hfsc_parse_qdisc_details__(netdev, details, &class);
4465 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4466 tc_make_handle(1, 0), &class);
4469 hfsc_get__(netdev)->max_rate = class.max_rate;
/* tc_ops "class_get" callback: exposes one queue's configuration, converting
 * bytes/s back to bits/s.  "max-rate" is omitted when equal to "min-rate". */
4476 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
4477 const struct tc_queue *queue, struct smap *details)
4479 const struct hfsc_class *hc;
4481 hc = hfsc_class_cast__(queue);
4482 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
4483 if (hc->min_rate != hc->max_rate) {
4484 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
/* tc_ops "class_set" callback: parses 'details', installs kernel class
 * 1:(queue_id+1) under parent 1:fffe, and updates the local queue record. */
4490 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
4491 const struct smap *details)
4494 struct hfsc_class class;
4496 error = hfsc_parse_class_details__(netdev, details, &class);
/* Kernel minor numbers are 1-based; OVS queue IDs are 0-based. */
4501 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
4502 tc_make_handle(1, 0xfffe), &class);
4507 hfsc_update_queue__(netdev, queue_id, &class);
/* tc_ops "class_delete" callback: deletes kernel class 1:(queue_id+1) and,
 * on success, drops (and frees, outside this excerpt) the local record. */
4512 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
4516 struct hfsc_class *hc;
4518 hc = hfsc_class_cast__(queue);
4519 hfsc = hfsc_get__(netdev);
4521 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
4523 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* tc_ops "class_get_stats" callback: fetches kernel statistics for one queue
 * by querying its class 1:(queue_id+1). */
4530 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
4531 struct netdev_queue_stats *stats)
4533 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
4534 tc_make_handle(1, 0xfffe), NULL, stats);
/* tc_ops "class_dump_stats" callback: parses one dumped class message and,
 * if its handle maps to an OVS queue (major 1, minor in range), invokes
 * '*cb' with the queue's statistics. */
4538 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
4539 const struct ofpbuf *nlmsg,
4540 netdev_dump_queue_stats_cb *cb, void *aux)
4542 struct netdev_queue_stats stats;
4543 unsigned int handle, major, minor;
4546 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
4551 major = tc_get_major(handle);
4552 minor = tc_get_minor(handle);
4553 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
4554 (*cb)(minor - 1, &stats, aux);
/* Callback table binding the "linux-hfsc" OVS QoS type to the kernel "hfsc"
 * qdisc. */
4559 static const struct tc_ops tc_ops_hfsc = {
4560 "hfsc", /* linux_name */
4561 "linux-hfsc", /* ovs_name */
4562 HFSC_N_QUEUES, /* n_queues */
4563 hfsc_tc_install, /* tc_install */
4564 hfsc_tc_load, /* tc_load */
4565 hfsc_tc_destroy, /* tc_destroy */
4566 hfsc_qdisc_get, /* qdisc_get */
4567 hfsc_qdisc_set, /* qdisc_set */
4568 hfsc_class_get, /* class_get */
4569 hfsc_class_set, /* class_set */
4570 hfsc_class_delete, /* class_delete */
4571 hfsc_class_get_stats, /* class_get_stats */
4572 hfsc_class_dump_stats /* class_dump_stats */
4575 /* "linux-default" traffic control class.
4577 * This class represents the default, unnamed Linux qdisc. It corresponds to
4578 * the "" (empty string) QoS type in the OVS database. */
/* Installs the shared, immutable "default" tc state on 'netdev_'.  Because
 * this implementation never mutates the tc, a single static const instance
 * is shared by every device using the default qdisc. */
4581 default_install__(struct netdev *netdev_)
4583 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4584 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
4586 /* Nothing but a tc class implementation is allowed to write to a tc. This
4587 * class never does that, so we can legitimately use a const tc object. */
4588 netdev->tc = CONST_CAST(struct tc *, &tc);
/* tc_ops "tc_install" callback for the default qdisc: nothing to configure
 * in the kernel, just record the shared default tc state. */
4592 default_tc_install(struct netdev *netdev,
4593 const struct smap *details OVS_UNUSED)
4595 default_install__(netdev);
/* tc_ops "tc_load" callback for the default qdisc: record the shared default
 * tc state; the kernel message is not needed. */
4600 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4602 default_install__(netdev);
/* Callback table for the default, unnamed Linux qdisc ("" QoS type).  Only
 * install/load are meaningful; all per-class operations are absent. */
4606 static const struct tc_ops tc_ops_default = {
4607 NULL, /* linux_name */
4612 NULL, /* tc_destroy */
4613 NULL, /* qdisc_get */
4614 NULL, /* qdisc_set */
4615 NULL, /* class_get */
4616 NULL, /* class_set */
4617 NULL, /* class_delete */
4618 NULL, /* class_get_stats */
4619 NULL /* class_dump_stats */
4622 /* "linux-other" traffic control class.
/* tc_ops "tc_load" callback for "linux-other": a qdisc OVS recognizes exists
 * but does not manage.  Records a shared, immutable tc marker. */
4627 other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
4629 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4630 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
4632 /* Nothing but a tc class implementation is allowed to write to a tc. This
4633 * class never does that, so we can legitimately use a const tc object. */
4634 netdev->tc = CONST_CAST(struct tc *, &tc);
/* Callback table for "linux-other": an unmanaged, unrecognized qdisc.  Only
 * tc_load is meaningful; OVS never installs or modifies such a qdisc. */
4638 static const struct tc_ops tc_ops_other = {
4639 NULL, /* linux_name */
4640 "linux-other", /* ovs_name */
4642 NULL, /* tc_install */
4644 NULL, /* tc_destroy */
4645 NULL, /* qdisc_get */
4646 NULL, /* qdisc_set */
4647 NULL, /* class_get */
4648 NULL, /* class_set */
4649 NULL, /* class_delete */
4650 NULL, /* class_get_stats */
4651 NULL /* class_dump_stats */
4654 /* Traffic control. */
4656 /* Number of kernel "tc" ticks per second. */
4657 static double ticks_per_s;
4659 /* Number of kernel "jiffies" per second. This is used for the purpose of
4660 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
4661 * one jiffy's worth of data.
4663 * There are two possibilities here:
4665 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
4666 * approximate range of 100 to 1024. That means that we really need to
4667 * make sure that the qdisc can buffer that much data.
4669 * - 'buffer_hz' is an absurdly large number. That means that the kernel
4670 * has finely granular timers and there's no need to fudge additional room
4671 * for buffers. (There's no extra effort needed to implement that: the
4672 * large 'buffer_hz' is used as a divisor, so practically any number will
4673 * come out as 0 in the division. Small integer results in the case of
4674 * really high dividends won't have any real effect anyhow.)
4676 static unsigned int buffer_hz;
/* Returns tc handle 'major':'minor'.
 *
 * 'major' occupies the upper 16 bits of a tc handle and 'minor' the lower
 * 16 bits; TC_H_MAKE() masks each half into place. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    return TC_H_MAKE(major << 16, minor);
}
/* Returns the major number from 'handle' (the upper 16 bits of a tc
 * handle). */
static unsigned int
tc_get_major(unsigned int handle)
{
    return TC_H_MAJ(handle) >> 16;
}
/* Returns the minor number from 'handle' (the lower 16 bits of a tc
 * handle). */
static unsigned int
tc_get_minor(unsigned int handle)
{
    return TC_H_MIN(handle);
}
/* Initializes 'request' as an RTM Netlink request of the given 'type' (e.g.
 * RTM_NEWQDISC) and 'flags' for the interface underlying 'netdev', and
 * returns a pointer to the embedded tcmsg for the caller to complete.
 * Returns NULL (elided branch) if the ifindex cannot be obtained. */
4699 static struct tcmsg *
4700 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
4701 struct ofpbuf *request)
4703 struct tcmsg *tcmsg;
4707 error = get_ifindex(netdev, &ifindex);
4712 ofpbuf_init(request, 512);
4713 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
4714 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
4715 tcmsg->tcm_family = AF_UNSPEC;
4716 tcmsg->tcm_ifindex = ifindex;
4717 /* Caller should fill in tcmsg->tcm_handle. */
4718 /* Caller should fill in tcmsg->tcm_parent. */
/* Executes 'request' as a NETLINK_ROUTE transaction and frees the request
 * buffer.  On success and if 'replyp' is nonnull, the caller owns '*replyp'.
 * Returns 0 on success, otherwise a positive errno value. */
4724 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
4726 int error = nl_transact(NETLINK_ROUTE, request, replyp);
4727 ofpbuf_uninit(request);
/* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
 * policing configuration.
 *
 * This function is equivalent to running the following when 'add' is true:
 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
 *
 * This function is equivalent to running the following when 'add' is false:
 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
 *
 * The configuration and stats may be seen with the following command:
 * /sbin/tc -s qdisc show dev <devname>
 *
 * Returns 0 if successful, otherwise a positive errno value.
 */
4746 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
4748 struct ofpbuf request;
4749 struct tcmsg *tcmsg;
4751 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
4752 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
4754 tcmsg = tc_make_request(netdev, type, flags, &request);
/* ffff: is the conventional handle for the ingress qdisc. */
4758 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
4759 tcmsg->tcm_parent = TC_H_INGRESS;
4760 nl_msg_put_string(&request, TCA_KIND, "ingress");
4761 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
4763 error = tc_transact(&request, NULL);
4765 /* If we're deleting the qdisc, don't worry about some of the
4766 * error conditions. */
4767 if (!add && (error == ENOENT || error == EINVAL)) {
/* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
 * of 'kbits_burst'.
 *
 * This function is equivalent to running:
 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
 * mtu 65535 drop
 *
 * The configuration and stats may be seen with the following command:
 * /sbin/tc -s filter show dev <devname> parent ffff:
 *
 * Returns 0 if successful, otherwise a positive errno value.
 */
4790 tc_add_policer(struct netdev *netdev,
4791 uint32_t kbits_rate, uint32_t kbits_burst)
4793 struct tc_police tc_police;
4794 struct ofpbuf request;
4795 struct tcmsg *tcmsg;
4796 size_t basic_offset;
4797 size_t police_offset;
4801 memset(&tc_police, 0, sizeof tc_police);
/* TC_POLICE_SHOT: drop packets that exceed the rate. */
4802 tc_police.action = TC_POLICE_SHOT;
4803 tc_police.mtu = mtu;
/* Kernel rate is in bytes/s; 'kbits_rate' is kilobits/s. */
4804 tc_fill_rate(&tc_police.rate, ((uint64_t) kbits_rate * 1000)/8, mtu);
4806 /* The following appears wrong in two ways:
4808 * - tc_bytes_to_ticks() should take "bytes" as quantity for both of its
4809 * arguments (or at least consistently "bytes" as both or "bits" as
4810 * both), but this supplies bytes for the first argument and bits for the
4813 * - In networking a kilobit is usually 1000 bits but this uses 1024 bits.
4815 * However if you "fix" those problems then "tc filter show ..." shows
4816 * "125000b", meaning 125,000 bits, when OVS configures it for 1000 kbit ==
4817 * 1,000,000 bits, whereas this actually ends up doing the right thing from
4818 * tc's point of view. Whatever. */
4819 tc_police.burst = tc_bytes_to_ticks(
4820 tc_police.rate.rate, MIN(UINT32_MAX / 1024, kbits_burst) * 1024);
4822 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
4823 NLM_F_EXCL | NLM_F_CREATE, &request);
/* Attach the filter under the ingress qdisc (ffff:), prio 49, all protos. */
4827 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
4828 tcmsg->tcm_info = tc_make_handle(49,
4829 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
4831 nl_msg_put_string(&request, TCA_KIND, "basic");
4832 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4833 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
4834 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
4835 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
4836 nl_msg_end_nested(&request, police_offset);
4837 nl_msg_end_nested(&request, basic_offset);
4839 error = tc_transact(&request, NULL);
/* NOTE(review): the opening lines of this function are not visible in this
 * excerpt; from the body it reads /proc/net/psched once (guarded by
 * ovsthread_once) and derives the globals 'ticks_per_s' and 'buffer_hz' —
 * presumably this is read_psched(); confirm against the full source. */
4850 /* The values in psched are not individually very meaningful, but they are
4851 * important. The tables below show some values seen in the wild.
4855 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
4856 * (Before that, there are hints that it was 1000000000.)
4858 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
4862 * -----------------------------------
4863 * [1] 000c8000 000f4240 000f4240 00000064
4864 * [2] 000003e8 00000400 000f4240 3b9aca00
4865 * [3] 000003e8 00000400 000f4240 3b9aca00
4866 * [4] 000003e8 00000400 000f4240 00000064
4867 * [5] 000003e8 00000040 000f4240 3b9aca00
4868 * [6] 000003e8 00000040 000f4240 000000f9
4870 * a b c d ticks_per_s buffer_hz
4871 * ------- --------- ---------- ------------- ----------- -------------
4872 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
4873 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4874 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4875 * [4] 1,000 1,024 1,000,000 100 976,562 100
4876 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
4877 * [6] 1,000 64 1,000,000 249 15,625,000 249
4879 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
4880 * [2] 2.6.26-1-686-bigmem from Debian lenny
4881 * [3] 2.6.26-2-sparc64 from Debian lenny
4882 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
4883 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
4884 * [6] 2.6.34 from kernel.org on KVM
4886 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4887 static const char fn[] = "/proc/net/psched";
4888 unsigned int a, b, c, d;
/* Parse the four hex psched parameters exactly once per process. */
4891 if (!ovsthread_once_start(&once)) {
4898 stream = fopen(fn, "r");
4900 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
4904 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
4905 VLOG_WARN("%s: read failed", fn);
4909 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
4913 VLOG_WARN("%s: invalid scheduler parameters", fn);
/* ticks_per_s = a * c / b; see the tables above for observed values. */
4917 ticks_per_s = (double) a * c / b;
4921 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
4924 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
4927 ovsthread_once_done(&once);
/* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
 * rate of 'rate' bytes per second.  Relies on 'ticks_per_s' having been
 * initialized (an elided line apparently triggers that initialization). */
4933 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
4936 return (rate * ticks) / ticks_per_s;
/* Returns the number of ticks that it would take to transmit 'size' bytes at a
 * rate of 'rate' bytes per second.  Returns 0 for a zero 'rate' to avoid
 * dividing by zero. */
4942 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
4945 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
/* Returns the number of bytes that need to be reserved for qdisc buffering at
 * a transmission rate of 'rate' bytes per second, i.e. one jiffy's worth of
 * data; see the comment on 'buffer_hz' for why huge buffer_hz yields 0. */
4951 tc_buffer_per_jiffy(unsigned int rate)
4954 return rate / buffer_hz;
/* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
 * stores NULL into it if it is absent.
 *
 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
 * 'msg'.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
4967 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
4968 struct nlattr **options)
4970 static const struct nl_policy tca_policy[] = {
4971 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
4972 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
4974 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
/* Attributes start after the nlmsghdr and the embedded struct tcmsg. */
4976 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4977 tca_policy, ta, ARRAY_SIZE(ta))) {
4978 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
4983 *kind = nl_attr_get_string(ta[TCA_KIND]);
4987 *options = ta[TCA_OPTIONS];
/* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
 * into '*options', and its queue statistics into '*stats'. Any of the output
 * arguments may be null.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
5009 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
5010 struct nlattr **options, struct netdev_queue_stats *stats)
5012 static const struct nl_policy tca_policy[] = {
5013 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
5014 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
5016 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
5018 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
5019 tca_policy, ta, ARRAY_SIZE(ta))) {
5020 VLOG_WARN_RL(&rl, "failed to parse class message");
/* The class handle lives in the tcmsg header, not in an attribute. */
5025 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
5026 *handlep = tc->tcm_handle;
5030 *options = ta[TCA_OPTIONS];
5034 const struct gnet_stats_queue *gsq;
5035 struct gnet_stats_basic gsb;
5037 static const struct nl_policy stats_policy[] = {
5038 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
5039 .min_len = sizeof gsb },
5040 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
5041 .min_len = sizeof *gsq },
5043 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
5045 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
5046 sa, ARRAY_SIZE(sa))) {
5047 VLOG_WARN_RL(&rl, "failed to parse class stats");
5051 /* Alignment issues screw up the length of struct gnet_stats_basic on
5052 * some arch/bitsize combinations. Newer versions of Linux have a
5053 * struct gnet_stats_basic_packed, but we can't depend on that. The
5054 * easiest thing to do is just to make a copy. */
5055 memset(&gsb, 0, sizeof gsb);
5056 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
5057 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
5058 stats->tx_bytes = gsb.bytes;
5059 stats->tx_packets = gsb.packets;
5061 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
5062 stats->tx_errors = gsq->drops;
/* Elided error path: zero the caller's stats so they are never stale. */
5072 memset(stats, 0, sizeof *stats);
/* Queries the kernel for class with identifier 'handle' and parent 'parent'
 * on 'netdev'.  On success the caller owns '*replyp'.  Returns 0 if
 * successful, otherwise a positive errno value (logged, rate-limited). */
5080 tc_query_class(const struct netdev *netdev,
5081 unsigned int handle, unsigned int parent,
5082 struct ofpbuf **replyp)
5084 struct ofpbuf request;
5085 struct tcmsg *tcmsg;
/* NLM_F_ECHO makes the kernel send the class back in the reply. */
5088 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
5092 tcmsg->tcm_handle = handle;
5093 tcmsg->tcm_parent = parent;
5095 error = tc_transact(&request, replyp);
5097 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
5098 netdev_get_name(netdev),
5099 tc_get_major(handle), tc_get_minor(handle),
5100 tc_get_major(parent), tc_get_minor(parent),
5101 ovs_strerror(error));
/* Equivalent to "tc class del dev <name> handle <handle>".  Returns 0 on
 * success, otherwise a positive errno value (logged, rate-limited). */
5108 tc_delete_class(const struct netdev *netdev, unsigned int handle)
5110 struct ofpbuf request;
5111 struct tcmsg *tcmsg;
5114 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
5118 tcmsg->tcm_handle = handle;
5119 tcmsg->tcm_parent = 0;
5121 error = tc_transact(&request, NULL);
5123 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
5124 netdev_get_name(netdev),
5125 tc_get_major(handle), tc_get_minor(handle),
5126 ovs_strerror(error));
/* Equivalent to "tc qdisc del dev <name> root".  Also tears down OVS's
 * in-memory tc state for the device.  Returns 0 on success, otherwise a
 * positive errno value; EINVAL is treated as success (default qdisc). */
5133 tc_del_qdisc(struct netdev *netdev_)
5135 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5136 struct ofpbuf request;
5137 struct tcmsg *tcmsg;
5140 tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
5144 tcmsg->tcm_handle = tc_make_handle(1, 0);
5145 tcmsg->tcm_parent = TC_H_ROOT;
5147 error = tc_transact(&request, NULL);
5148 if (error == EINVAL) {
5149 /* EINVAL probably means that the default qdisc was in use, in which
5150 * case we've accomplished our purpose. */
/* Drop the cached tc state so the next query re-reads the kernel. */
5153 if (!error && netdev->tc) {
5154 if (netdev->tc->ops->tc_destroy) {
5155 netdev->tc->ops->tc_destroy(netdev->tc);
/* Reports whether issuing RTM_GETQDISC on this kernel is safe, i.e. the
 * running kernel is 2.6.35 or later (see the OOPS discussion in
 * tc_query_qdisc()).  The answer is computed once from uname(2) and cached;
 * on any failure to determine the version, 'safe' stays false. */
5163 getqdisc_is_safe(void)
5165 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5166 static bool safe = false;
5168 if (ovsthread_once_start(&once)) {
5169 struct utsname utsname;
5172 if (uname(&utsname) == -1) {
5173 VLOG_WARN("uname failed (%s)", ovs_strerror(errno));
5174 } else if (!ovs_scan(utsname.release, "%d.%d", &major, &minor)) {
5175 VLOG_WARN("uname reported bad OS release (%s)", utsname.release);
5176 } else if (major < 2 || (major == 2 && minor < 35)) {
5177 VLOG_INFO("disabling unsafe RTM_GETQDISC in Linux kernel %s",
5182 ovsthread_once_done(&once);
5187 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
5188 * kernel to determine what they are. Returns 0 if successful, otherwise a
5189 * positive errno value. */
5191 tc_query_qdisc(const struct netdev *netdev_)
5193 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5194 struct ofpbuf request, *qdisc;
5195 const struct tc_ops *ops;
5196 struct tcmsg *tcmsg;
5204 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
5205 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
5206 * 2.6.35 without that fix backported to it.
5208 * To avoid the OOPS, we must not make a request that would attempt to dump
5209 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
5210 * few others. There are a few ways that I can see to do this, but most of
5211 * them seem to be racy (and if you lose the race the kernel OOPSes). The
5212 * technique chosen here is to assume that any non-default qdisc that we
5213 * create will have a class with handle 1:0. The built-in qdiscs only have
5214 * a class with handle 0:0.
5216 * On Linux 2.6.35+ we use the straightforward method because it allows us
5217 * to handle non-builtin qdiscs without handle 1:0 (e.g. codel). However,
5218 * in such a case we get no response at all from the kernel (!) if a
5219 * builtin qdisc is in use (which is later caught by "!error &&
5220 * !qdisc->size"). */
5221 tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
/* Safe kernels: straightforward root query (0:0 / TC_H_ROOT).
 * Unsafe kernels: probe for our conventional 1:0 handle instead. */
5225 tcmsg->tcm_handle = tc_make_handle(getqdisc_is_safe() ? 0 : 1, 0);
5226 tcmsg->tcm_parent = getqdisc_is_safe() ? TC_H_ROOT : 0;
5228 /* Figure out what tc class to instantiate. */
5229 error = tc_transact(&request, &qdisc);
5230 if (!error && qdisc->size) {
5233 error = tc_parse_qdisc(qdisc, &kind, NULL);
5235 ops = &tc_ops_other;
5237 ops = tc_lookup_linux_name(kind);
/* A qdisc kind this module has no specific handler for is tracked
 * generically via tc_ops_other. */
5239 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
5240 VLOG_DBG_RL(&rl2, "unknown qdisc \"%s\"", kind);
5242 ops = &tc_ops_other;
5245 } else if ((!error && !qdisc->size) || error == ENOENT) {
5246 /* Either it's a built-in qdisc, or (on Linux pre-2.6.35) it's a qdisc
5247 * set up by some other entity that doesn't have a handle 1:0. We will
5248 * assume that it's the system default qdisc. */
5249 ops = &tc_ops_default;
5252 /* Who knows? Maybe the device got deleted. */
5253 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
5254 netdev_get_name(netdev_), ovs_strerror(error));
5255 ops = &tc_ops_other;
5258 /* Instantiate it. */
5259 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
/* tc_load must set netdev->tc exactly when it succeeds. */
5260 ovs_assert((load_error == 0) == (netdev->tc != NULL));
5261 ofpbuf_delete(qdisc);
5263 return error ? error : load_error;
5266 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
5267 approximate the time to transmit packets of various lengths. For an MTU of
5268 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
5269 represents two possible packet lengths; for a MTU of 513 through 1024, four
5270 possible lengths; and so on.
5272 Returns, for the specified 'mtu', the number of bits that packet lengths
5273 need to be shifted right to fit within such a 256-entry table. */
5275 tc_calc_cell_log(unsigned int mtu)
/* Presumably reached when no MTU was supplied (guard elided above);
 * fall back to the Ethernet payload maximum. */
5280 mtu = ETH_PAYLOAD_MAX;
/* Allow room for Ethernet and 802.1Q VLAN headers on top of the MTU. */
5282 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
/* Find the smallest shift that brings 'mtu' under the 256-entry limit. */
5284 for (cell_log = 0; mtu >= 256; cell_log++) {
5291 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
5294 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
5296 memset(rate, 0, sizeof *rate);
5297 rate->cell_log = tc_calc_cell_log(mtu);
5298 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
5299 /* rate->cell_align = 0; */ /* distro headers. */
/* Minimum packet unit: no packet is accounted below Ethernet's minimum
 * total frame size. */
5300 rate->mpu = ETH_TOTAL_MIN;
5304 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
5305 * attribute of the specified "type".
5307 * See tc_calc_cell_log() above for a description of "rtab"s. */
5309 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
5314 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
5315 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
/* Entry i covers packets up to (i + 1) << cell_log bytes, clamped below
 * by the minimum packet unit. */
5316 unsigned packet_size = (i + 1) << rate->cell_log;
5317 if (packet_size < rate->mpu) {
5318 packet_size = rate->mpu;
/* Each entry is the time, in kernel ticks, to transmit that size at
 * 'rate->rate' bytes per second. */
5320 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
5324 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
5325 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
5326 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
5329 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
/* The burst must cover at least one jiffy's worth of traffic plus one MTU;
 * the result is expressed in kernel ticks. */
5331 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
5332 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
5335 /* Linux-only functions declared in netdev-linux.h */
5337 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
5338 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
5340 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
5341 const char *flag_name, bool enable)
5343 const char *netdev_name = netdev_get_name(netdev);
5344 struct ethtool_value evalue;
/* Step 1: read the current flags (ETHTOOL_GFLAGS). */
5348 COVERAGE_INC(netdev_get_ethtool);
5349 memset(&evalue, 0, sizeof evalue);
5350 error = netdev_linux_do_ethtool(netdev_name,
5351 (struct ethtool_cmd *)&evalue,
5352 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
/* Step 2: write back the flags with 'flag' set or cleared, skipping the
 * write when the bit already has the requested value. */
5357 COVERAGE_INC(netdev_set_ethtool);
5358 new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
5359 if (new_flags == evalue.data) {
5362 evalue.data = new_flags;
5363 error = netdev_linux_do_ethtool(netdev_name,
5364 (struct ethtool_cmd *)&evalue,
5365 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
/* Step 3: re-read the flags to confirm the driver actually applied the
 * change, since the set call alone is not trusted here. */
5370 COVERAGE_INC(netdev_get_ethtool);
5371 memset(&evalue, 0, sizeof evalue);
5372 error = netdev_linux_do_ethtool(netdev_name,
5373 (struct ethtool_cmd *)&evalue,
5374 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5379 if (new_flags != evalue.data) {
5380 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
5381 "device %s failed", enable ? "enable" : "disable",
5382 flag_name, netdev_name);
5389 /* Utility functions. */
5391 /* Copies 'src' into 'dst', performing format conversion in the process. */
/* Field-for-field copy from the kernel's 32-bit rtnl_link_stats layout into
 * OVS's netdev_stats; each assignment widens to netdev_stats' field type. */
5393 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
5394 const struct rtnl_link_stats *src)
5396 dst->rx_packets = src->rx_packets;
5397 dst->tx_packets = src->tx_packets;
5398 dst->rx_bytes = src->rx_bytes;
5399 dst->tx_bytes = src->tx_bytes;
5400 dst->rx_errors = src->rx_errors;
5401 dst->tx_errors = src->tx_errors;
5402 dst->rx_dropped = src->rx_dropped;
5403 dst->tx_dropped = src->tx_dropped;
5404 dst->multicast = src->multicast;
5405 dst->collisions = src->collisions;
5406 dst->rx_length_errors = src->rx_length_errors;
5407 dst->rx_over_errors = src->rx_over_errors;
5408 dst->rx_crc_errors = src->rx_crc_errors;
5409 dst->rx_frame_errors = src->rx_frame_errors;
5410 dst->rx_fifo_errors = src->rx_fifo_errors;
5411 dst->rx_missed_errors = src->rx_missed_errors;
5412 dst->tx_aborted_errors = src->tx_aborted_errors;
5413 dst->tx_carrier_errors = src->tx_carrier_errors;
5414 dst->tx_fifo_errors = src->tx_fifo_errors;
5415 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
5416 dst->tx_window_errors = src->tx_window_errors;
5419 /* Copies 'src' into 'dst', performing format conversion in the process. */
/* 64-bit counterpart of netdev_stats_from_rtnl_link_stats(): copies the
 * same set of counters from the kernel's rtnl_link_stats64 layout. */
5421 netdev_stats_from_rtnl_link_stats64(struct netdev_stats *dst,
5422 const struct rtnl_link_stats64 *src)
5424 dst->rx_packets = src->rx_packets;
5425 dst->tx_packets = src->tx_packets;
5426 dst->rx_bytes = src->rx_bytes;
5427 dst->tx_bytes = src->tx_bytes;
5428 dst->rx_errors = src->rx_errors;
5429 dst->tx_errors = src->tx_errors;
5430 dst->rx_dropped = src->rx_dropped;
5431 dst->tx_dropped = src->tx_dropped;
5432 dst->multicast = src->multicast;
5433 dst->collisions = src->collisions;
5434 dst->rx_length_errors = src->rx_length_errors;
5435 dst->rx_over_errors = src->rx_over_errors;
5436 dst->rx_crc_errors = src->rx_crc_errors;
5437 dst->rx_frame_errors = src->rx_frame_errors;
5438 dst->rx_fifo_errors = src->rx_fifo_errors;
5439 dst->rx_missed_errors = src->rx_missed_errors;
5440 dst->tx_aborted_errors = src->tx_aborted_errors;
5441 dst->tx_carrier_errors = src->tx_carrier_errors;
5442 dst->tx_fifo_errors = src->tx_fifo_errors;
5443 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
5444 dst->tx_window_errors = src->tx_window_errors;
/* Fetches interface statistics for 'netdev_' via an rtnetlink RTM_GETLINK
 * request, looked up by interface name.  Prefers the 64-bit IFLA_STATS64
 * attribute and falls back to the 32-bit IFLA_STATS. */
5448 get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
5450 struct ofpbuf request;
5451 struct ofpbuf *reply;
5454 ofpbuf_init(&request, 0);
5455 nl_msg_put_nlmsghdr(&request,
5456 sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
5457 RTM_GETLINK, NLM_F_REQUEST);
5458 ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
5459 nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
5460 error = nl_transact(NETLINK_ROUTE, &request, &reply);
5461 ofpbuf_uninit(&request);
/* Skip the netlink header plus ifinfomsg to reach the attributes. */
5466 if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
5467 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS64);
5468 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats64)) {
5469 netdev_stats_from_rtnl_link_stats64(stats, nl_attr_get(a));
/* No 64-bit stats: fall back to the 32-bit attribute. */
5472 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS);
5473 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
5474 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
5477 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
5482 VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");
5487 ofpbuf_delete(reply);
/* Retrieves 'dev''s interface flags (IFF_*) via SIOCGIFFLAGS into '*flags'.
 * Returns 0 on success, otherwise a positive errno value. */
5492 get_flags(const struct netdev *dev, unsigned int *flags)
5498 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
5500 *flags = ifr.ifr_flags;
/* Sets the interface flags (IFF_*) of the device named 'name' to 'flags'
 * via SIOCSIFFLAGS.  Returns 0 on success, otherwise a positive errno. */
5506 set_flags(const char *name, unsigned int flags)
5510 ifr.ifr_flags = flags;
5511 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
/* Looks up the kernel ifindex for the device named 'netdev_name' via
 * SIOCGIFINDEX.  Returns the ifindex on success; on failure it logs a
 * rate-limited warning (presumably returning a negated errno on the elided
 * error path -- see how get_ifindex() interprets negative results). */
5515 do_get_ifindex(const char *netdev_name)
5520 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5521 COVERAGE_INC(netdev_get_ifindex);
5523 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
5525 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
5526 netdev_name, ovs_strerror(error));
5529 return ifr.ifr_ifindex;
/* Returns, via '*ifindexp', the cached ifindex of 'netdev_', querying the
 * kernel on the first call.  Both the ifindex and any lookup error are
 * cached under VALID_IFINDEX; the return value is 0 or that saved errno. */
5533 get_ifindex(const struct netdev *netdev_, int *ifindexp)
5535 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5537 if (!(netdev->cache_valid & VALID_IFINDEX)) {
5538 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
/* A negative result from do_get_ifindex() encodes -errno. */
5541 netdev->get_ifindex_error = -ifindex;
5542 netdev->ifindex = 0;
5544 netdev->get_ifindex_error = 0;
5545 netdev->ifindex = ifindex;
5547 netdev->cache_valid |= VALID_IFINDEX;
5550 *ifindexp = netdev->ifindex;
5551 return netdev->get_ifindex_error;
/* Reads the Ethernet hardware address of 'netdev_name' into '*ea' via
 * SIOCGIFHWADDR.  Only AF_UNSPEC and ARPHRD_ETHER address families are
 * accepted; anything else is logged and rejected. */
5555 get_etheraddr(const char *netdev_name, struct eth_addr *ea)
5561 memset(&ifr, 0, sizeof ifr);
5562 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5563 COVERAGE_INC(netdev_get_hwaddr);
5564 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
5566 /* ENODEV probably means that a vif disappeared asynchronously and
5567 * hasn't been removed from the database yet, so reduce the log level
5568 * to INFO for that case. */
5569 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
5570 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
5571 netdev_name, ovs_strerror(error));
5574 hwaddr_family = ifr.ifr_hwaddr.sa_family;
5575 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
5576 VLOG_INFO("%s device has unknown hardware address family %d",
5577 netdev_name, hwaddr_family);
5580 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the Ethernet hardware address of 'netdev_name' to 'mac' via
 * SIOCSIFHWADDR.  Returns 0 on success, otherwise a positive errno. */
5585 set_etheraddr(const char *netdev_name, const struct eth_addr mac)
5590 memset(&ifr, 0, sizeof ifr);
5591 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5592 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
5593 memcpy(ifr.ifr_hwaddr.sa_data, &mac, ETH_ADDR_LEN);
5594 COVERAGE_INC(netdev_set_hwaddr);
5595 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
5597 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
5598 netdev_name, ovs_strerror(error));
/* Issues the ethtool command 'cmd' (named 'cmd_name' for logging) on the
 * device 'name', using 'ecmd' as the ethtool request/response buffer.
 * EOPNOTSUPP is common (driver doesn't implement the command) and is
 * deliberately not logged. */
5604 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
5605 int cmd, const char *cmd_name)
5610 memset(&ifr, 0, sizeof ifr);
5611 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
5612 ifr.ifr_data = (caddr_t) ecmd;
5615 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
5617 if (error != EOPNOTSUPP) {
5618 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
5619 "failed: %s", cmd_name, name, ovs_strerror(error));
5621 /* The device doesn't support this operation. That's pretty
5622 * common, so there's no point in logging anything. */
/* Runs the AF_INET address-query ioctl 'cmd' (named 'cmd_name' for logging)
 * on 'netdev' and, on success, stores the resulting IPv4 address in '*ip'.
 * ALIGNED_CAST is used because ifr_addr is not guaranteed to be aligned as
 * a sockaddr_in. */
5629 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
5630 int cmd, const char *cmd_name)
5635 ifr.ifr_addr.sa_family = AF_INET;
5636 error = af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
5638 const struct sockaddr_in *sin = ALIGNED_CAST(struct sockaddr_in *,
5640 *ip = sin->sin_addr;
5645 /* Returns an AF_PACKET raw socket or a negative errno value. */
5647 af_packet_sock(void)
5649 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5652 if (ovsthread_once_start(&once)) {
5653 sock = socket(AF_PACKET, SOCK_RAW, 0);
5655 int error = set_nonblocking(sock);
5662 VLOG_ERR("failed to create packet socket: %s",
5663 ovs_strerror(errno));
5665 ovsthread_once_done(&once);