2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2015 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
23 #include <arpa/inet.h>
25 #include <linux/filter.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <sys/types.h>
37 #include <sys/ioctl.h>
38 #include <sys/socket.h>
39 #include <sys/utsname.h>
40 #include <netpacket/packet.h>
42 #include <net/if_arp.h>
43 #include <net/if_packet.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
52 #include "dp-packet.h"
53 #include "dpif-netlink.h"
54 #include "dpif-netdev.h"
55 #include "openvswitch/dynamic-string.h"
56 #include "fatal-signal.h"
59 #include "netdev-provider.h"
60 #include "netdev-vport.h"
61 #include "netlink-notifier.h"
62 #include "netlink-socket.h"
65 #include "openflow/openflow.h"
66 #include "ovs-atomic.h"
68 #include "poll-loop.h"
69 #include "rtnetlink.h"
71 #include "socket-util.h"
74 #include "unaligned.h"
75 #include "openvswitch/vlog.h"
77 VLOG_DEFINE_THIS_MODULE(netdev_linux);
79 COVERAGE_DEFINE(netdev_set_policing);
80 COVERAGE_DEFINE(netdev_arp_lookup);
81 COVERAGE_DEFINE(netdev_get_ifindex);
82 COVERAGE_DEFINE(netdev_get_hwaddr);
83 COVERAGE_DEFINE(netdev_set_hwaddr);
84 COVERAGE_DEFINE(netdev_get_ethtool);
85 COVERAGE_DEFINE(netdev_set_ethtool);
88 /* These were introduced in Linux 2.6.14, so they might be missing if we have
90 #ifndef ADVERTISED_Pause
91 #define ADVERTISED_Pause (1 << 13)
93 #ifndef ADVERTISED_Asym_Pause
94 #define ADVERTISED_Asym_Pause (1 << 14)
97 /* These were introduced in Linux 2.6.24, so they might be missing if we
98 * have old headers. */
99 #ifndef ETHTOOL_GFLAGS
100 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
102 #ifndef ETHTOOL_SFLAGS
103 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
106 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
109 #define TC_RTAB_SIZE 1024
112 /* Linux 2.6.21 introduced struct tpacket_auxdata.
113 * Linux 2.6.27 added the tp_vlan_tci member.
114 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
115 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
116 * TP_STATUS_VLAN_TPID_VALID.
118 * With all this churn it's easiest to unconditionally define a replacement
119 * structure that has everything we want.
121 #ifndef PACKET_AUXDATA
122 #define PACKET_AUXDATA 8
124 #ifndef TP_STATUS_VLAN_VALID
125 #define TP_STATUS_VLAN_VALID (1 << 4)
127 #ifndef TP_STATUS_VLAN_TPID_VALID
128 #define TP_STATUS_VLAN_TPID_VALID (1 << 6)
130 #undef tpacket_auxdata
131 #define tpacket_auxdata rpl_tpacket_auxdata
132 struct tpacket_auxdata {
138 uint16_t tp_vlan_tci;
139 uint16_t tp_vlan_tpid;
142 /* Linux 2.6.27 introduced ethtool_cmd_speed
144 * To avoid revisiting problems reported with using configure to detect
145 * compatibility (see report at
146 * http://openvswitch.org/pipermail/dev/2014-October/047978.html)
147 * unconditionally replace ethtool_cmd_speed. */
/* Replacement for the kernel's ethtool_cmd_speed(): reconstructs the 32-bit
 * link speed from the low 16 bits ('speed') and high 16 bits ('speed_hi') of
 * 'struct ethtool_cmd'.  Unconditionally shadows the kernel helper, per the
 * rationale in the comment above.
 * NOTE(review): this view of the file is missing interleaved lines (the
 * function's braces, for one), so only the visible statements are shown. */
148 #define ethtool_cmd_speed rpl_ethtool_cmd_speed
149 static inline uint32_t rpl_ethtool_cmd_speed(const struct ethtool_cmd *ep)
151 return ep->speed | (ep->speed_hi << 16);
154 /* Linux 2.6.30 introduced supported and advertised flags for
155 * 1G base KX, and 10G base KX4, KR and R. */
156 #ifndef SUPPORTED_1000baseKX_Full
157 #define SUPPORTED_1000baseKX_Full (1 << 17)
158 #define SUPPORTED_10000baseKX4_Full (1 << 18)
159 #define SUPPORTED_10000baseKR_Full (1 << 19)
160 #define SUPPORTED_10000baseR_FEC (1 << 20)
161 #define ADVERTISED_1000baseKX_Full (1 << 17)
162 #define ADVERTISED_10000baseKX4_Full (1 << 18)
163 #define ADVERTISED_10000baseKR_Full (1 << 19)
164 #define ADVERTISED_10000baseR_FEC (1 << 20)
167 /* Linux 3.5 introduced supported and advertised flags for
168 * 40G base KR4, CR4, SR4 and LR4. */
169 #ifndef SUPPORTED_40000baseKR4_Full
170 #define SUPPORTED_40000baseKR4_Full (1 << 23)
171 #define SUPPORTED_40000baseCR4_Full (1 << 24)
172 #define SUPPORTED_40000baseSR4_Full (1 << 25)
173 #define SUPPORTED_40000baseLR4_Full (1 << 26)
174 #define ADVERTISED_40000baseKR4_Full (1 << 23)
175 #define ADVERTISED_40000baseCR4_Full (1 << 24)
176 #define ADVERTISED_40000baseSR4_Full (1 << 25)
177 #define ADVERTISED_40000baseLR4_Full (1 << 26)
180 /* Linux 2.6.35 introduced IFLA_STATS64 and rtnl_link_stats64.
182 * Tests for rtnl_link_stats64 don't seem to consistently work, e.g. on
183 * 2.6.32-431.29.2.el6.x86_64 (see report at
184 * http://openvswitch.org/pipermail/dev/2014-October/047978.html). Maybe
185 * if_link.h is not self-contained on those kernels. It is easiest to
186 * unconditionally define a replacement. */
188 #define IFLA_STATS64 23
190 #define rtnl_link_stats64 rpl_rtnl_link_stats64
191 struct rtnl_link_stats64 {
203 uint64_t rx_length_errors;
204 uint64_t rx_over_errors;
205 uint64_t rx_crc_errors;
206 uint64_t rx_frame_errors;
207 uint64_t rx_fifo_errors;
208 uint64_t rx_missed_errors;
210 uint64_t tx_aborted_errors;
211 uint64_t tx_carrier_errors;
212 uint64_t tx_fifo_errors;
213 uint64_t tx_heartbeat_errors;
214 uint64_t tx_window_errors;
216 uint64_t rx_compressed;
217 uint64_t tx_compressed;
221 VALID_IFINDEX = 1 << 0,
222 VALID_ETHERADDR = 1 << 1,
226 VALID_POLICING = 1 << 5,
227 VALID_VPORT_STAT_ERROR = 1 << 6,
228 VALID_DRVINFO = 1 << 7,
229 VALID_FEATURES = 1 << 8,
232 /* Traffic control. */
234 /* An instance of a traffic control class. Always associated with a particular
237 * Each TC implementation subclasses this with whatever additional data it
240 const struct tc_ops *ops;
241 struct hmap queues; /* Contains "struct tc_queue"s.
242 * Read by generic TC layer.
243 * Written only by TC implementation. */
246 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
248 /* One traffic control queue.
250 * Each TC implementation subclasses this with whatever additional data it
253 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
254 unsigned int queue_id; /* OpenFlow queue ID. */
255 long long int created; /* Time queue was created, in msecs. */
258 /* A particular kind of traffic control. Each implementation generally maps to
259 * one particular Linux qdisc class.
261 * The functions below return 0 if successful or a positive errno value on
262 * failure, except where otherwise noted. All of them must be provided, except
263 * where otherwise noted. */
265 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
266 * This is null for tc_ops_default and tc_ops_other, for which there are no
267 * appropriate values. */
268 const char *linux_name;
270 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
271 const char *ovs_name;
273 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
274 * queues. The queues are numbered 0 through n_queues - 1. */
275 unsigned int n_queues;
277 /* Called to install this TC class on 'netdev'. The implementation should
278 * make the Netlink calls required to set up 'netdev' with the right qdisc
279 * and configure it according to 'details'. The implementation may assume
280 * that the current qdisc is the default; that is, there is no need for it
281 * to delete the current qdisc before installing itself.
283 * The contents of 'details' should be documented as valid for 'ovs_name'
284 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
285 * (which is built as ovs-vswitchd.conf.db(8)).
287 * This function must return 0 if and only if it sets 'netdev->tc' to an
288 * initialized 'struct tc'.
290 * (This function is null for tc_ops_other, which cannot be installed. For
291 * other TC classes it should always be nonnull.) */
292 int (*tc_install)(struct netdev *netdev, const struct smap *details);
294 /* Called when the netdev code determines (through a Netlink query) that
295 * this TC class's qdisc is installed on 'netdev', but we didn't install
296 * it ourselves and so don't know any of the details.
298 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
299 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
300 * implementation should parse the other attributes of 'nlmsg' as
301 * necessary to determine its configuration. If necessary it should also
302 * use Netlink queries to determine the configuration of queues on
305 * This function must return 0 if and only if it sets 'netdev->tc' to an
306 * initialized 'struct tc'. */
307 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
309 /* Destroys the data structures allocated by the implementation as part of
310 * 'tc'. (This includes destroying 'tc->queues' by calling
313 * The implementation should not need to perform any Netlink calls. If
314 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
315 * (But it may not be desirable.)
317 * This function may be null if 'tc' is trivial. */
318 void (*tc_destroy)(struct tc *tc);
320 /* Retrieves details of 'netdev->tc' configuration into 'details'.
322 * The implementation should not need to perform any Netlink calls, because
323 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
324 * cached the configuration.
326 * The contents of 'details' should be documented as valid for 'ovs_name'
327 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
328 * (which is built as ovs-vswitchd.conf.db(8)).
330 * This function may be null if 'tc' is not configurable.
332 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
334 /* Reconfigures 'netdev->tc' according to 'details', performing any
335 * required Netlink calls to complete the reconfiguration.
337 * The contents of 'details' should be documented as valid for 'ovs_name'
338 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
339 * (which is built as ovs-vswitchd.conf.db(8)).
341 * This function may be null if 'tc' is not configurable.
343 int (*qdisc_set)(struct netdev *, const struct smap *details);
345 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
346 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
348 * The contents of 'details' should be documented as valid for 'ovs_name'
349 * in the "other_config" column in the "Queue" table in
350 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
352 * The implementation should not need to perform any Netlink calls, because
353 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
354 * cached the queue configuration.
356 * This function may be null if 'tc' does not have queues ('n_queues' is
358 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
359 struct smap *details);
361 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
362 * 'details', performing any required Netlink calls to complete the
363 * reconfiguration. The caller ensures that 'queue_id' is less than
366 * The contents of 'details' should be documented as valid for 'ovs_name'
367 * in the "other_config" column in the "Queue" table in
368 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
370 * This function may be null if 'tc' does not have queues or its queues are
371 * not configurable. */
372 int (*class_set)(struct netdev *, unsigned int queue_id,
373 const struct smap *details);
375 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
376 * tc_queue's within 'netdev->tc->queues'.
378 * This function may be null if 'tc' does not have queues or its queues
379 * cannot be deleted. */
380 int (*class_delete)(struct netdev *, struct tc_queue *queue);
382 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
383 * 'struct tc_queue's within 'netdev->tc->queues'.
385 * On success, initializes '*stats'.
387 * This function may be null if 'tc' does not have queues or if it cannot
388 * report queue statistics. */
389 int (*class_get_stats)(const struct netdev *netdev,
390 const struct tc_queue *queue,
391 struct netdev_queue_stats *stats);
393 /* Extracts queue stats from 'nlmsg', which is a response to a
394 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
396 * This function may be null if 'tc' does not have queues or if it cannot
397 * report queue statistics. */
398 int (*class_dump_stats)(const struct netdev *netdev,
399 const struct ofpbuf *nlmsg,
400 netdev_dump_queue_stats_cb *cb, void *aux);
/* tc_init: initializes the generic part of a traffic-control instance —
 * visible here is the setup of the (initially empty) 'queues' hmap; the
 * assignment of 'ops' is presumably on a missing line — TODO confirm. */
404 tc_init(struct tc *tc, const struct tc_ops *ops)
407 hmap_init(&tc->queues);
/* tc_destroy: releases the generic part of 'tc' by destroying the 'queues'
 * hmap.  Freeing the queue elements themselves is the TC implementation's
 * job (see the tc_ops 'tc_destroy' contract above). */
411 tc_destroy(struct tc *tc)
413 hmap_destroy(&tc->queues);
416 static const struct tc_ops tc_ops_htb;
417 static const struct tc_ops tc_ops_hfsc;
418 static const struct tc_ops tc_ops_codel;
419 static const struct tc_ops tc_ops_fqcodel;
420 static const struct tc_ops tc_ops_sfq;
421 static const struct tc_ops tc_ops_default;
422 static const struct tc_ops tc_ops_other;
424 static const struct tc_ops *const tcs[] = {
425 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
426 &tc_ops_hfsc, /* Hierarchical fair service curve. */
427 &tc_ops_codel, /* Controlled delay */
428 &tc_ops_fqcodel, /* Fair queue controlled delay */
429 &tc_ops_sfq, /* Stochastic fair queueing */
430 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
431 &tc_ops_other, /* Some other qdisc. */
435 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
436 static unsigned int tc_get_major(unsigned int handle);
437 static unsigned int tc_get_minor(unsigned int handle);
439 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
440 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
441 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
443 static struct tcmsg *tc_make_request(const struct netdev *, int type,
444 unsigned int flags, struct ofpbuf *);
445 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
446 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
447 static int tc_add_policer(struct netdev *,
448 uint32_t kbits_rate, uint32_t kbits_burst);
450 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
451 struct nlattr **options);
452 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
453 struct nlattr **options,
454 struct netdev_queue_stats *);
455 static int tc_query_class(const struct netdev *,
456 unsigned int handle, unsigned int parent,
457 struct ofpbuf **replyp);
458 static int tc_delete_class(const struct netdev *, unsigned int handle);
460 static int tc_del_qdisc(struct netdev *netdev);
461 static int tc_query_qdisc(const struct netdev *netdev);
463 static int tc_calc_cell_log(unsigned int mtu);
464 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
465 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
466 const struct tc_ratespec *rate);
467 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
469 struct netdev_linux {
472 /* Protects all members below. */
473 struct ovs_mutex mutex;
475 unsigned int cache_valid;
477 bool miimon; /* Link status of last poll. */
478 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
479 struct timer miimon_timer;
481 /* The following are figured out "on demand" only. They are only valid
482 * when the corresponding VALID_* bit in 'cache_valid' is set. */
484 struct eth_addr etheraddr;
485 struct in_addr address, netmask;
487 unsigned int ifi_flags;
488 long long int carrier_resets;
489 uint32_t kbits_rate; /* Policing data. */
490 uint32_t kbits_burst;
491 int vport_stats_error; /* Cached error code from vport_get_stats().
492 0 or an errno value. */
493 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
494 int ether_addr_error; /* Cached error code from set/get etheraddr. */
495 int netdev_policing_error; /* Cached error code from set policing. */
496 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
497 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
498 int in4_error; /* Cached error code from reading in4 addr. */
499 int in6_error; /* Cached error code from reading in6 addr. */
501 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
502 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
503 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
505 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
508 /* For devices of class netdev_tap_class only. */
512 struct netdev_rxq_linux {
513 struct netdev_rxq up;
518 /* This is set pretty low because we probably won't learn anything from the
519 * additional log messages. */
520 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
522 /* Polling miimon status for all ports causes performance degradation when
523 * handling a large number of ports. If there are no devices using miimon, then
524 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
526 * Readers do not depend on this variable synchronizing with the related
527 * changes in the device miimon status, so we can use atomic_count. */
528 static atomic_count miimon_cnt = ATOMIC_COUNT_INIT(0);
530 static void netdev_linux_run(void);
532 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
533 int cmd, const char *cmd_name);
534 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
535 int cmd, const char *cmd_name);
536 static int get_flags(const struct netdev *, unsigned int *flags);
537 static int set_flags(const char *, unsigned int flags);
538 static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
539 enum netdev_flags on, enum netdev_flags *old_flagsp)
540 OVS_REQUIRES(netdev->mutex);
541 static int do_get_ifindex(const char *netdev_name);
542 static int get_ifindex(const struct netdev *, int *ifindexp);
543 static int do_set_addr(struct netdev *netdev,
544 int ioctl_nr, const char *ioctl_name,
545 struct in_addr addr);
546 static int get_etheraddr(const char *netdev_name, struct eth_addr *ea);
547 static int set_etheraddr(const char *netdev_name, const struct eth_addr);
548 static int get_stats_via_netlink(const struct netdev *, struct netdev_stats *);
549 static int af_packet_sock(void);
550 static bool netdev_linux_miimon_enabled(void);
551 static void netdev_linux_miimon_run(void);
552 static void netdev_linux_miimon_wait(void);
553 static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup);
/* Returns true if 'netdev_class' is one of the Linux netdev classes,
 * identified by its 'run' callback being netdev_linux_run. */
556 is_netdev_linux_class(const struct netdev_class *netdev_class)
558 return netdev_class->run == netdev_linux_run;
/* Returns true if 'netdev' is a tap device (netdev_tap_class). */
562 is_tap_netdev(const struct netdev *netdev)
564 return netdev_get_class(netdev) == &netdev_tap_class;
/* Downcasts 'netdev' to its containing netdev_linux via the embedded 'up'
 * member; asserts that the class really is a Linux one. */
567 static struct netdev_linux *
568 netdev_linux_cast(const struct netdev *netdev)
570 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
572 return CONTAINER_OF(netdev, struct netdev_linux, up);
/* Downcasts 'rx' to its containing netdev_rxq_linux; asserts the class of
 * the rx queue's netdev. */
575 static struct netdev_rxq_linux *
576 netdev_rxq_linux_cast(const struct netdev_rxq *rx)
578 ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
579 return CONTAINER_OF(rx, struct netdev_rxq_linux, up);
582 static void netdev_linux_update(struct netdev_linux *netdev,
583 const struct rtnetlink_change *)
584 OVS_REQUIRES(netdev->mutex);
585 static void netdev_linux_changed(struct netdev_linux *netdev,
586 unsigned int ifi_flags, unsigned int mask)
587 OVS_REQUIRES(netdev->mutex);
589 /* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK,
590 * RTNLGRP_IPV4_IFADDR and RTNLGRP_IPV6_IFADDR changes, or NULL
591 * if no such socket could be created. */
592 static struct nl_sock *
593 netdev_linux_notify_sock(void)
/* One-time (ovsthread_once) initialization: create a NETLINK_ROUTE socket
 * and join the link/address multicast groups listed in 'mcgroups'.  The
 * socket is cached in a function-static and shared by all callers. */
595 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
596 static struct nl_sock *sock;
597 unsigned int mcgroups[] = {RTNLGRP_LINK, RTNLGRP_IPV4_IFADDR,
598 RTNLGRP_IPV6_IFADDR, RTNLGRP_IPV6_IFINFO};
600 if (ovsthread_once_start(&once)) {
603 error = nl_sock_create(NETLINK_ROUTE, &sock);
607 for (i = 0; i < ARRAY_SIZE(mcgroups); i++) {
608 error = nl_sock_join_mcgroup(sock, mcgroups[i]);
/* Failing to join any group abandons the socket entirely; callers then
 * get NULL (per the comment above this function). */
610 nl_sock_destroy(sock);
616 ovsthread_once_done(&once);
/* Returns true if at least one device currently uses miimon polling, so
 * run()/wait() can skip miimon work entirely when the count is zero (see
 * the atomic_count rationale on 'miimon_cnt' above). */
623 netdev_linux_miimon_enabled(void)
625 return atomic_count_get(&miimon_cnt) > 0;
/* Periodic work for all Linux netdevs: runs miimon polling when enabled,
 * then drains the shared rtnetlink notification socket, applying each
 * parsed change to the matching netdev.  On socket overrun (ENOBUFS) it
 * falls back to refreshing flags on every known device.
 * NOTE(review): loop structure and some declarations sit on lines missing
 * from this view; comments below describe only the visible statements. */
629 netdev_linux_run(void)
631 struct nl_sock *sock;
634 if (netdev_linux_miimon_enabled()) {
635 netdev_linux_miimon_run();
638 sock = netdev_linux_notify_sock();
644 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
645 uint64_t buf_stub[4096 / 8];
648 ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
649 error = nl_sock_recv(sock, &buf, false);
651 struct rtnetlink_change change;
653 if (rtnetlink_parse(&buf, &change)) {
654 struct netdev *netdev_ = NULL;
655 char dev_name[IFNAMSIZ];
/* Some notifications lack a name; resolve it from the ifindex. */
657 if (!change.ifname) {
658 change.ifname = if_indextoname(change.if_index, dev_name);
662 netdev_ = netdev_from_name(change.ifname);
664 if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
665 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
667 ovs_mutex_lock(&netdev->mutex);
668 netdev_linux_update(netdev, &change);
669 ovs_mutex_unlock(&netdev->mutex);
671 netdev_close(netdev_);
/* ENOBUFS: the kernel dropped notifications, so individual changes were
 * lost; re-read flags for every device instead. */
673 } else if (error == ENOBUFS) {
674 struct shash device_shash;
675 struct shash_node *node;
679 shash_init(&device_shash);
680 netdev_get_devices(&netdev_linux_class, &device_shash);
681 SHASH_FOR_EACH (node, &device_shash) {
682 struct netdev *netdev_ = node->data;
683 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
686 ovs_mutex_lock(&netdev->mutex);
687 get_flags(netdev_, &flags);
688 netdev_linux_changed(netdev, flags, 0);
689 ovs_mutex_unlock(&netdev->mutex);
691 netdev_close(netdev_);
693 shash_destroy(&device_shash);
694 } else if (error != EAGAIN) {
695 VLOG_WARN_RL(&rl, "error reading or parsing netlink (%s)",
696 ovs_strerror(error));
/* Registers poll-loop wakeups for netdev_linux_run(): the miimon timer when
 * miimon is in use, and readability (POLLIN) of the shared rtnetlink
 * notification socket. */
703 netdev_linux_wait(void)
705 struct nl_sock *sock;
707 if (netdev_linux_miimon_enabled()) {
708 netdev_linux_miimon_wait();
710 sock = netdev_linux_notify_sock();
712 nl_sock_wait(sock, POLLIN);
/* Records that 'dev' changed: bumps the netdev change sequence, counts a
 * carrier reset whenever IFF_RUNNING toggled, stores the new interface
 * flags, and keeps only the cache-validity bits listed in 'mask'.  When the
 * in4/in6 bits are being invalidated, the global address list cache is
 * flushed as well.  Caller must hold 'dev->mutex'. */
717 netdev_linux_changed(struct netdev_linux *dev,
718 unsigned int ifi_flags, unsigned int mask)
719 OVS_REQUIRES(dev->mutex)
721 netdev_change_seq_changed(&dev->up);
723 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
724 dev->carrier_resets++;
726 dev->ifi_flags = ifi_flags;
728 dev->cache_valid &= mask;
729 if (!(mask & (VALID_IN4 | VALID_IN6))) {
730 netdev_get_addrs_list_flush();
/* Applies one parsed rtnetlink 'change' to 'dev'.  For link-group messages,
 * RTM_NEWLINK refreshes the cached MTU, MAC address, and ifindex from the
 * message (preserving drv-info and IPv4/IPv6 address caches); other link
 * messages invalidate everything.  Address-group messages invalidate only
 * the in4/in6 caches.  Caller must hold 'dev->mutex'. */
735 netdev_linux_update(struct netdev_linux *dev,
736 const struct rtnetlink_change *change)
737 OVS_REQUIRES(dev->mutex)
739 if (rtnetlink_type_is_rtnlgrp_link(change->nlmsg_type)){
740 if (change->nlmsg_type == RTM_NEWLINK) {
741 /* Keep drv-info, in4, in6. */
742 netdev_linux_changed(dev, change->ifi_flags,
743 VALID_DRVINFO | VALID_IN4 | VALID_IN6);
745 /* Update netdev from rtnl-change msg. */
747 dev->mtu = change->mtu;
748 dev->cache_valid |= VALID_MTU;
749 dev->netdev_mtu_error = 0;
/* An all-zero MAC in the message means "not reported"; keep the cache. */
752 if (!eth_addr_is_zero(change->mac)) {
753 dev->etheraddr = change->mac;
754 dev->cache_valid |= VALID_ETHERADDR;
755 dev->ether_addr_error = 0;
758 dev->ifindex = change->if_index;
759 dev->cache_valid |= VALID_IFINDEX;
760 dev->get_ifindex_error = 0;
/* Non-NEWLINK link message: invalidate all cached state (mask 0). */
762 netdev_linux_changed(dev, change->ifi_flags, 0);
764 } else if (rtnetlink_type_is_rtnlgrp_addr(change->nlmsg_type)) {
765 /* Invalidates in4, in6. */
766 netdev_linux_changed(dev, dev->ifi_flags,
767 ~(VALID_IN4 | VALID_IN6));
/* netdev provider "alloc" callback: zero-initialized netdev_linux; the
 * return of the embedded 'up' member is on a line missing from this view. */
773 static struct netdev *
774 netdev_linux_alloc(void)
776 struct netdev_linux *netdev = xzalloc(sizeof *netdev);
/* Construction shared by system, internal, and tap devices.  Only the mutex
 * initialization is visible here; other setup, if any, is on missing
 * lines — TODO confirm against the full file. */
781 netdev_linux_common_construct(struct netdev_linux *netdev)
783 ovs_mutex_init(&netdev->mutex);
786 /* Creates system and internal devices. */
788 netdev_linux_construct(struct netdev *netdev_)
790 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
793 netdev_linux_common_construct(netdev);
/* Probe the kernel device by reading its flags; ENODEV is tolerated only
 * for "internal" netdevs, which legitimately may not exist yet. */
795 error = get_flags(&netdev->up, &netdev->ifi_flags);
796 if (error == ENODEV) {
797 if (netdev->up.netdev_class != &netdev_internal_class) {
798 /* The device does not exist, so don't allow it to be opened. */
801 /* "Internal" netdevs have to be created as netdev objects before
802 * they exist in the kernel, because creating them in the kernel
803 * happens by passing a netdev object to dpif_port_add().
804 * Therefore, ignore the error. */
811 /* For most types of netdevs we open the device for each call of
812 * netdev_open(). However, this is not the case with tap devices,
813 * since it is only possible to open the device once. In this
814 * situation we share a single file descriptor, and consequently
815 * buffers, across all readers. Therefore once data is read it will
816 * be unavailable to other reads for tap devices. */
/* Constructs a tap device: opens /dev/net/tun, creates/attaches the named
 * tap interface via TUNSETIFF (IFF_TAP | IFF_NO_PI, i.e. raw Ethernet with
 * no packet-information header), and makes the fd non-blocking.  The
 * close() at the end is the error-path cleanup for the shared fd. */
818 netdev_linux_construct_tap(struct netdev *netdev_)
820 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
821 static const char tap_dev[] = "/dev/net/tun";
822 const char *name = netdev_->name;
826 netdev_linux_common_construct(netdev);
828 /* Open tap device. */
829 netdev->tap_fd = open(tap_dev, O_RDWR);
830 if (netdev->tap_fd < 0) {
832 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
836 /* Create tap device. */
837 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
838 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
839 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
840 VLOG_WARN("%s: creating tap device failed: %s", name,
841 ovs_strerror(errno));
846 /* Make non-blocking. */
847 error = set_nonblocking(netdev->tap_fd);
855 close(netdev->tap_fd);
/* Destructor: tears down any installed traffic-control state via the TC
 * ops' tc_destroy hook, closes the tap fd for tap-class devices, drops the
 * global miimon user count if this device polled miimon, and destroys the
 * per-device mutex. */
860 netdev_linux_destruct(struct netdev *netdev_)
862 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
864 if (netdev->tc && netdev->tc->ops->tc_destroy) {
865 netdev->tc->ops->tc_destroy(netdev->tc);
868 if (netdev_get_class(netdev_) == &netdev_tap_class
869 && netdev->tap_fd >= 0)
871 close(netdev->tap_fd);
874 if (netdev->miimon_interval > 0) {
875 atomic_count_dec(&miimon_cnt);
878 ovs_mutex_destroy(&netdev->mutex);
/* Frees the container allocated by netdev_linux_alloc(). */
882 netdev_linux_dealloc(struct netdev *netdev_)
884 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* rxq "alloc" callback: zero-initialized netdev_rxq_linux. */
888 static struct netdev_rxq *
889 netdev_linux_rxq_alloc(void)
891 struct netdev_rxq_linux *rx = xzalloc(sizeof *rx);
/* Constructs an rx queue.  Tap devices reuse the device's shared tap fd;
 * other devices get a dedicated AF_PACKET raw socket that is marked for
 * PACKET_AUXDATA (so VLAN tags are recoverable), made non-blocking, bound
 * to the device's ifindex, and fitted with a BPF filter that accepts only
 * inbound packets (so transmitted frames are not looped back to us). */
896 netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
898 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
899 struct netdev *netdev_ = rx->up.netdev;
900 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
903 ovs_mutex_lock(&netdev->mutex);
904 rx->is_tap = is_tap_netdev(netdev_);
906 rx->fd = netdev->tap_fd;
908 struct sockaddr_ll sll;
910 /* Result of tcpdump -dd inbound */
911 static const struct sock_filter filt[] = {
912 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
913 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
914 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
915 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
917 static const struct sock_fprog fprog = {
918 ARRAY_SIZE(filt), (struct sock_filter *) filt
921 /* Create file descriptor. */
922 rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
925 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
/* Request tpacket_auxdata control messages on received packets, which
 * carry the VLAN TCI/TPID consumed by netdev_linux_rxq_recv_sock(). */
930 if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) {
932 VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
933 netdev_get_name(netdev_), ovs_strerror(error));
937 /* Set non-blocking mode. */
938 error = set_nonblocking(rx->fd);
943 /* Get ethernet device index. */
944 error = get_ifindex(&netdev->up, &ifindex);
949 /* Bind to specific ethernet device. */
950 memset(&sll, 0, sizeof sll);
951 sll.sll_family = AF_PACKET;
952 sll.sll_ifindex = ifindex;
953 sll.sll_protocol = htons(ETH_P_ALL);
954 if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
956 VLOG_ERR("%s: failed to bind raw socket (%s)",
957 netdev_get_name(netdev_), ovs_strerror(error));
961 /* Filter for only inbound packets. */
962 error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
966 VLOG_ERR("%s: failed to attach filter (%s)",
967 netdev_get_name(netdev_), ovs_strerror(error));
971 ovs_mutex_unlock(&netdev->mutex);
979 ovs_mutex_unlock(&netdev->mutex);
/* rxq destructor; the fd close presumably happens on lines missing from
 * this view — TODO confirm against the full file. */
984 netdev_linux_rxq_destruct(struct netdev_rxq *rxq_)
986 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
/* Frees the container allocated by netdev_linux_rxq_alloc(). */
994 netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_)
996 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
/* Returns the VLAN TPID from 'aux' in network byte order, falling back to
 * ETH_TYPE_VLAN (0x8100) when the kernel did not report a TPID (see the
 * tpacket_auxdata compatibility note near the top of the file). */
1002 auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux)
1004 if (aux->tp_status & TP_STATUS_VLAN_TPID_VALID) {
1005 return htons(aux->tp_vlan_tpid);
1007 return htons(ETH_TYPE_VLAN);
/* Returns true if 'aux' carries a VLAN TCI: either the TCI is nonzero, or
 * the kernel explicitly flagged it valid via TP_STATUS_VLAN_VALID (which
 * lets a genuine zero TCI be detected on kernels that set the flag). */
1012 auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
1014 return aux->tp_vlan_tci || aux->tp_status & TP_STATUS_VLAN_VALID;
/* Receives one packet from AF_PACKET socket 'fd' into 'buffer', then walks
 * the control messages for tpacket_auxdata and, if the kernel stripped a
 * VLAN tag, re-inserts it into the packet (the headroom reserved below
 * guarantees room for exactly one tag).  MSG_TRUNC makes recvmsg() return
 * the full original length so oversize packets can be detected. */
1018 netdev_linux_rxq_recv_sock(int fd, struct dp_packet *buffer)
1023 struct cmsghdr *cmsg;
1025 struct cmsghdr cmsg;
1026 char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
1030 /* Reserve headroom for a single VLAN tag */
1031 dp_packet_reserve(buffer, VLAN_HEADER_LEN);
1032 size = dp_packet_tailroom(buffer);
1034 iov.iov_base = dp_packet_data(buffer);
1036 msgh.msg_name = NULL;
1037 msgh.msg_namelen = 0;
1038 msgh.msg_iov = &iov;
1039 msgh.msg_iovlen = 1;
1040 msgh.msg_control = &cmsg_buffer;
1041 msgh.msg_controllen = sizeof cmsg_buffer;
/* Retry on EINTR so signals don't surface as receive errors. */
1045 retval = recvmsg(fd, &msgh, MSG_TRUNC);
1046 } while (retval < 0 && errno == EINTR);
1050 } else if (retval > size) {
1054 dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);
1056 for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg; cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
1057 const struct tpacket_auxdata *aux;
/* Skip control messages that are not well-formed PACKET_AUXDATA. */
1059 if (cmsg->cmsg_level != SOL_PACKET
1060 || cmsg->cmsg_type != PACKET_AUXDATA
1061 || cmsg->cmsg_len < CMSG_LEN(sizeof(struct tpacket_auxdata))) {
1065 aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg));
1066 if (auxdata_has_vlan_tci(aux)) {
1067 if (retval < ETH_HEADER_LEN) {
1071 eth_push_vlan(buffer, auxdata_to_vlan_tpid(aux),
1072 htons(aux->tp_vlan_tci));
/* Receives one packet from tap fd 'fd' directly into 'buffer' with read(),
 * retrying on EINTR, and grows the packet by the number of bytes read.  No
 * VLAN handling is needed on the tap path. */
1081 netdev_linux_rxq_recv_tap(int fd, struct dp_packet *buffer)
1084 size_t size = dp_packet_tailroom(buffer);
1087 retval = read(fd, dp_packet_data(buffer), size);
1088 } while (retval < 0 && errno == EINTR);
1094 dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);
/* rxq "recv" callback: allocates a buffer sized for the device MTU (falling
 * back to ETH_PAYLOAD_MAX when the MTU is unknown) plus VLAN+Ethernet
 * header room, receives via the tap or socket helper, and on success pads
 * the frame, invalidates its cached RSS hash, and returns it as the single
 * packet in 'packets'.  On failure the buffer is freed and non-transient
 * errors are logged (rate-limited). */
1099 netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet **packets,
1102 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1103 struct netdev *netdev = rx->up.netdev;
1104 struct dp_packet *buffer;
1108 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
1109 mtu = ETH_PAYLOAD_MAX;
1112 buffer = dp_packet_new_with_headroom(VLAN_ETH_HEADER_LEN + mtu,
1113 DP_NETDEV_HEADROOM);
1114 retval = (rx->is_tap
1115 ? netdev_linux_rxq_recv_tap(rx->fd, buffer)
1116 : netdev_linux_rxq_recv_sock(rx->fd, buffer));
/* EAGAIN (nothing to read) and EMSGSIZE (truncated) are expected;
 * anything else is worth a warning. */
1119 if (retval != EAGAIN && retval != EMSGSIZE) {
1120 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
1121 netdev_rxq_get_name(rxq_), ovs_strerror(errno));
1123 dp_packet_delete(buffer);
1125 dp_packet_pad(buffer);
1126 dp_packet_rss_invalidate(buffer);
1127 packets[0] = buffer;
/* rxq "wait" callback: wakes the poll loop when the rx fd is readable. */
1135 netdev_linux_rxq_wait(struct netdev_rxq *rxq_)
1137 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1138 poll_fd_wait(rx->fd, POLLIN);
/* rxq "drain" callback: discards queued packets.  One path reads the
 * device's tx queue length (SIOCGIFTXQLEN) and drains that many packets
 * from the fd; the other drains the socket receive buffer.  NOTE(review):
 * the branch condition (presumably rx->is_tap) is on a line missing from
 * this view — confirm against the full file. */
1142 netdev_linux_rxq_drain(struct netdev_rxq *rxq_)
1144 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1147 int error = af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_), &ifr,
1148 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
1152 drain_fd(rx->fd, ifr.ifr_qlen);
1155 return drain_rcvbuf(rx->fd);
1159 /* Sends the 'cnt' packets in 'pkts' on 'netdev'.  Returns 0 if successful,
1160 * otherwise a positive errno value.  Returns EAGAIN without blocking if a packet
1161 * cannot be queued immediately.  Returns EMSGSIZE if a partial packet was
1162 * transmitted or if a packet is too big or too small to transmit on the device.
1164 * If 'may_steal' the packets are freed here; otherwise the caller retains them.
1166 * The kernel maintains a packet transmission queue, so the caller is not
1167 * expected to do additional queuing of packets. */
1169 netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED,
1170 struct dp_packet **pkts, int cnt, bool may_steal)
1175 /* 'i' is incremented only if there's no error */
1176 for (i = 0; i < cnt;) {
1177 const void *data = dp_packet_data(pkts[i]);
1178 size_t size = dp_packet_size(pkts[i]);
1181 if (!is_tap_netdev(netdev_)) {
1182 /* Use our AF_PACKET socket to send to this device. */
1183 struct sockaddr_ll sll;
1189 sock = af_packet_sock();
1194 ifindex = netdev_get_ifindex(netdev_);
1199 /* We don't bother setting most fields in sockaddr_ll because the
1200 * kernel ignores them for SOCK_RAW. */
1201 memset(&sll, 0, sizeof sll);
1202 sll.sll_family = AF_PACKET;
1203 sll.sll_ifindex = ifindex;
1205 iov.iov_base = CONST_CAST(void *, data);
1208 msg.msg_name = &sll;
1209 msg.msg_namelen = sizeof sll;
1212 msg.msg_control = NULL;
1213 msg.msg_controllen = 0;
1216 retval = sendmsg(sock, &msg, 0);
1218 /* Use the tap fd to send to this device. This is essential for
1219 * tap devices, because packets sent to a tap device with an
1220 * AF_PACKET socket will loop back to be *received* again on the
1221 * tap device. This doesn't occur on other interface types
1222 * because we attach a socket filter to the rx socket. */
1223 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1225 retval = write(netdev->tap_fd, data, size);
1229 /* The Linux AF_PACKET implementation never blocks waiting for room
1230 * for packets, instead returning ENOBUFS. Translate this into
1231 * EAGAIN for the caller. */
1232 error = errno == ENOBUFS ? EAGAIN : errno;
1233 if (error == EINTR) {
1234 /* continue without incrementing 'i', i.e. retry this packet */
     /* NOTE(review): 'retval' is (presumably) ssize_t and 'size' is size_t,
      * so this comparison mixes signedness and "%"PRIuSIZE formats a signed
      * value; a cast to size_t after the error check would be cleaner. */
1238 } else if (retval != size) {
1239 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE" bytes"
1240 " of %"PRIuSIZE") on %s", retval, size,
1241 netdev_get_name(netdev_));
1246 /* Process the next packet in the batch */
     /* Free the batch on behalf of the caller (the 'may_steal' case). */
1251 for (i = 0; i < cnt; i++) {
1252 dp_packet_delete(pkts[i]);
1256 if (error && error != EAGAIN) {
1257 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
1258 netdev_get_name(netdev_), ovs_strerror(error));
1265 /* Registers with the poll loop to wake up from the next call to poll_block()
1266 * when the packet transmission queue has sufficient room to transmit a packet
1267 * with netdev_send().
1269 * The kernel maintains a packet transmission queue, so the client is not
1270 * expected to do additional queuing of packets. Thus, this function is
1271 * unlikely to ever be used. It is included for completeness. */
1273 netdev_linux_send_wait(struct netdev *netdev, int qid OVS_UNUSED)
1275 if (is_tap_netdev(netdev)) {
1276 /* TAP device always accepts packets.*/
1277 poll_immediate_wake();
1281 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1282 * otherwise a positive errno value.  Uses the cached address to skip the
1283 * ioctl when the address is already 'mac'. */
1284 netdev_linux_set_etheraddr(struct netdev *netdev_, const struct eth_addr mac)
1286 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1287 enum netdev_flags old_flags = 0;
1290 ovs_mutex_lock(&netdev->mutex);
1292 if (netdev->cache_valid & VALID_ETHERADDR) {
1293 error = netdev->ether_addr_error;
1294 if (error || eth_addr_equals(netdev->etheraddr, mac)) {
     /* Invalidate the cache before attempting the change. */
1297 netdev->cache_valid &= ~VALID_ETHERADDR;
1300 /* Tap devices must be brought down before setting the address. */
1301 if (is_tap_netdev(netdev_)) {
1302 update_flags(netdev, NETDEV_UP, 0, &old_flags);
1304 error = set_etheraddr(netdev_get_name(netdev_), mac);
1305 if (!error || error == ENODEV) {
     /* Cache the result even on ENODEV so repeated calls on a missing
      * device don't keep issuing ioctls. */
1306 netdev->ether_addr_error = error;
1307 netdev->cache_valid |= VALID_ETHERADDR;
1309 netdev->etheraddr = mac;
     /* Restore a tap device that we brought down above. */
1313 if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
1314 update_flags(netdev, 0, NETDEV_UP, &old_flags);
1318 ovs_mutex_unlock(&netdev->mutex);
1322 /* Copies 'netdev''s MAC address to 'mac' which is passed as param.  The
1323 * address is fetched once and cached; subsequent calls return the cache. */
1324 netdev_linux_get_etheraddr(const struct netdev *netdev_, struct eth_addr *mac)
1326 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1329 ovs_mutex_lock(&netdev->mutex);
1330 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1331 netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
1332 &netdev->etheraddr);
1333 netdev->cache_valid |= VALID_ETHERADDR;
1336 error = netdev->ether_addr_error;
1338 *mac = netdev->etheraddr;
1340 ovs_mutex_unlock(&netdev->mutex);
/* Fetches the device MTU into '*mtup' via SIOCGIFMTU, caching the result
 * (and any error) in the netdev.  Caller holds netdev->mutex. */
1346 netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
1350 if (!(netdev->cache_valid & VALID_MTU)) {
1353 netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
1354 netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
1355 netdev->mtu = ifr.ifr_mtu;
1356 netdev->cache_valid |= VALID_MTU;
1359 error = netdev->netdev_mtu_error;
1361 *mtup = netdev->mtu;
1367 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1368 * in bytes, not including the hardware header; thus, this is typically 1500
1369 * bytes for Ethernet devices. */
1371 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1373 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1376 ovs_mutex_lock(&netdev->mutex);
1377 error = netdev_linux_get_mtu__(netdev, mtup);
1378 ovs_mutex_unlock(&netdev->mutex);
1383 /* Sets the maximum size of transmitted (MTU) for given device using linux
1384 * networking ioctl interface.
 * Skips the ioctl when the cached MTU already matches 'mtu'. */
1387 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1389 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1393 ovs_mutex_lock(&netdev->mutex);
1394 if (netdev->cache_valid & VALID_MTU) {
1395 error = netdev->netdev_mtu_error;
1396 if (error || netdev->mtu == mtu) {
     /* Invalidate the cache before attempting the change. */
1399 netdev->cache_valid &= ~VALID_MTU;
1402 error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
1403 SIOCSIFMTU, "SIOCSIFMTU");
1404 if (!error || error == ENODEV) {
1405 netdev->netdev_mtu_error = error;
1406 netdev->mtu = ifr.ifr_mtu;
1407 netdev->cache_valid |= VALID_MTU;
1410 ovs_mutex_unlock(&netdev->mutex);
1414 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1415 * On failure, returns a negative errno value. */
1417 netdev_linux_get_ifindex(const struct netdev *netdev_)
1419 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1422 ovs_mutex_lock(&netdev->mutex);
1423 error = get_ifindex(netdev_, &ifindex);
1424 ovs_mutex_unlock(&netdev->mutex);
1426 return error ? -error : ifindex;
/* Stores in '*carrier' whether 'netdev_' has link.  When miimon polling is
 * enabled the cached miimon state is used; otherwise link state comes from
 * the IFF_RUNNING interface flag. */
1430 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1432 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1434 ovs_mutex_lock(&netdev->mutex);
1435 if (netdev->miimon_interval > 0) {
1436 *carrier = netdev->miimon;
1438 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1440 ovs_mutex_unlock(&netdev->mutex);
/* Returns the number of times 'netdev_''s carrier has transitioned. */
1445 static long long int
1446 netdev_linux_get_carrier_resets(const struct netdev *netdev_)
1448 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1449 long long int carrier_resets;
1451 ovs_mutex_lock(&netdev->mutex);
1452 carrier_resets = netdev->carrier_resets;
1453 ovs_mutex_unlock(&netdev->mutex);
1455 return carrier_resets;
/* Issues MII ioctl 'cmd' (named 'cmd_name' for logging) on device 'name',
 * marshalling 'data' in and out through ifr_data.  Returns 0 or a positive
 * errno value. */
1459 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1460 struct mii_ioctl_data *data)
1465 memset(&ifr, 0, sizeof ifr);
1466 memcpy(&ifr.ifr_data, data, sizeof *data);
1467 error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
1468 memcpy(data, &ifr.ifr_data, sizeof *data);
/* Queries link status for device 'name' via MII registers, storing the result
 * in '*miimon'.  Falls back to the ETHTOOL_GLINK ethtool command when the
 * device has no MII support. */
1474 netdev_linux_get_miimon(const char *name, bool *miimon)
1476 struct mii_ioctl_data data;
1481 memset(&data, 0, sizeof data);
1482 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1484 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1485 data.reg_num = MII_BMSR;
1486 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
     /* BMSR_LSTATUS bit of the Basic Mode Status Register = link up. */
1490 *miimon = !!(data.val_out & BMSR_LSTATUS);
1492 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1495 struct ethtool_cmd ecmd;
1497 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1500 COVERAGE_INC(netdev_get_ethtool);
1501 memset(&ecmd, 0, sizeof ecmd);
1502 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1505 struct ethtool_value eval;
     /* ETHTOOL_GLINK replies with a struct ethtool_value laid over the
      * start of the ethtool_cmd buffer; copy it out to read 'data'. */
1507 memcpy(&eval, &ecmd, sizeof eval);
1508 *miimon = !!eval.data;
1510 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
/* Sets the miimon polling interval for 'netdev_' to 'interval' milliseconds
 * (0 disables polling; positive values are clamped to at least 100 ms).
 * Maintains the global count of devices with miimon enabled and forces an
 * immediate poll by expiring the timer. */
1518 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1519 long long int interval)
1521 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1523 ovs_mutex_lock(&netdev->mutex);
1524 interval = interval > 0 ? MAX(interval, 100) : 0;
1525 if (netdev->miimon_interval != interval) {
1526 if (interval && !netdev->miimon_interval) {
1527 atomic_count_inc(&miimon_cnt);
1528 } else if (!interval && netdev->miimon_interval) {
1529 atomic_count_dec(&miimon_cnt);
1532 netdev->miimon_interval = interval;
     /* Expire immediately so the new interval takes effect right away. */
1533 timer_set_expired(&netdev->miimon_timer);
1535 ovs_mutex_unlock(&netdev->mutex);
/* Periodic work: polls MII link status for every netdev-linux device whose
 * miimon timer has expired, recording changes and rearming the timer. */
1541 netdev_linux_miimon_run(void)
1543 struct shash device_shash;
1544 struct shash_node *node;
1546 shash_init(&device_shash);
1547 netdev_get_devices(&netdev_linux_class, &device_shash);
1548 SHASH_FOR_EACH (node, &device_shash) {
1549 struct netdev *netdev = node->data;
1550 struct netdev_linux *dev = netdev_linux_cast(netdev);
1553 ovs_mutex_lock(&dev->mutex);
1554 if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
1555 netdev_linux_get_miimon(dev->up.name, &miimon);
1556 if (miimon != dev->miimon) {
1557 dev->miimon = miimon;
     /* Notify listeners that the carrier state changed. */
1558 netdev_linux_changed(dev, dev->ifi_flags, 0);
1561 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1563 ovs_mutex_unlock(&dev->mutex);
     /* netdev_get_devices() took a reference on each device. */
1564 netdev_close(netdev);
1567 shash_destroy(&device_shash);
/* Registers with the poll loop to wake up when the next miimon timer on any
 * netdev-linux device expires. */
1571 netdev_linux_miimon_wait(void)
1573 struct shash device_shash;
1574 struct shash_node *node;
1576 shash_init(&device_shash);
1577 netdev_get_devices(&netdev_linux_class, &device_shash);
1578 SHASH_FOR_EACH (node, &device_shash) {
1579 struct netdev *netdev = node->data;
1580 struct netdev_linux *dev = netdev_linux_cast(netdev);
1582 ovs_mutex_lock(&dev->mutex);
1583 if (dev->miimon_interval > 0) {
1584 timer_wait(&dev->miimon_timer);
1586 ovs_mutex_unlock(&dev->mutex);
1587 netdev_close(netdev);
1589 shash_destroy(&device_shash);
/* Exchanges the values of '*a' and '*b'. */
1593 swap_uint64(uint64_t *a, uint64_t *b)
1600 /* Copies 'src' into 'dst', performing format conversion in the process.
1602 * 'src' is allowed to be misaligned. */
1604 netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1605 const struct ovs_vport_stats *src)
1607 dst->rx_packets = get_32aligned_u64(&src->rx_packets);
1608 dst->tx_packets = get_32aligned_u64(&src->tx_packets);
1609 dst->rx_bytes = get_32aligned_u64(&src->rx_bytes);
1610 dst->tx_bytes = get_32aligned_u64(&src->tx_bytes);
1611 dst->rx_errors = get_32aligned_u64(&src->rx_errors);
1612 dst->tx_errors = get_32aligned_u64(&src->tx_errors);
1613 dst->rx_dropped = get_32aligned_u64(&src->rx_dropped);
1614 dst->tx_dropped = get_32aligned_u64(&src->tx_dropped);
     /* The vport layer does not track the detailed error counters, so zero
      * them rather than leaving them uninitialized. */
1616 dst->collisions = 0;
1617 dst->rx_length_errors = 0;
1618 dst->rx_over_errors = 0;
1619 dst->rx_crc_errors = 0;
1620 dst->rx_frame_errors = 0;
1621 dst->rx_fifo_errors = 0;
1622 dst->rx_missed_errors = 0;
1623 dst->tx_aborted_errors = 0;
1624 dst->tx_carrier_errors = 0;
1625 dst->tx_fifo_errors = 0;
1626 dst->tx_heartbeat_errors = 0;
1627 dst->tx_window_errors = 0;
/* Fetches vport-layer statistics for 'netdev' into 'stats' via a datapath
 * vport lookup.  Returns 0 or a positive errno value. */
1631 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1633 struct dpif_netlink_vport reply;
1637 error = dpif_netlink_vport_get(netdev_get_name(netdev), &reply, &buf);
1640 } else if (!reply.stats) {
1645 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
/* Wrapper around get_stats_via_vport__() that caches whether the vport stats
 * lookup failed, so a persistently failing device is not re-queried. */
1653 get_stats_via_vport(const struct netdev *netdev_,
1654 struct netdev_stats *stats)
1656 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1658 if (!netdev->vport_stats_error ||
1659 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1662 error = get_stats_via_vport__(netdev_, stats);
1663 if (error && error != ENOENT && error != ENODEV) {
     /* ENOENT/ENODEV just mean "not a datapath vport"; anything else
      * is worth logging. */
1664 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1666 netdev_get_name(netdev_), ovs_strerror(error));
1668 netdev->vport_stats_error = error;
1669 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
1673 /* Retrieves current device stats for 'netdev-linux'. */
1675 netdev_linux_get_stats(const struct netdev *netdev_,
1676 struct netdev_stats *stats)
1678 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1679 struct netdev_stats dev_stats;
1682 ovs_mutex_lock(&netdev->mutex);
1683 get_stats_via_vport(netdev_, stats);
1684 error = get_stats_via_netlink(netdev_, &dev_stats);
1686 if (!netdev->vport_stats_error) {
     /* NOTE(review): this 'else if' can only be reached when
      * vport_stats_error is nonzero, so a plain 'else' would be clearer. */
1689 } else if (netdev->vport_stats_error) {
1690 /* Vport stats unavailable; fall back to the kernel netdev stats. */
1693 /* Use kernel netdev's packet and byte counts since vport's counters
1694 * do not reflect packet counts on the wire when GSO, TSO or GRO are
 * enabled. */
1696 stats->rx_packets = dev_stats.rx_packets;
1697 stats->rx_bytes = dev_stats.rx_bytes;
1698 stats->tx_packets = dev_stats.tx_packets;
1699 stats->tx_bytes = dev_stats.tx_bytes;
     /* Error and detail counters accumulate on top of the vport values. */
1701 stats->rx_errors += dev_stats.rx_errors;
1702 stats->tx_errors += dev_stats.tx_errors;
1703 stats->rx_dropped += dev_stats.rx_dropped;
1704 stats->tx_dropped += dev_stats.tx_dropped;
1705 stats->multicast += dev_stats.multicast;
1706 stats->collisions += dev_stats.collisions;
1707 stats->rx_length_errors += dev_stats.rx_length_errors;
1708 stats->rx_over_errors += dev_stats.rx_over_errors;
1709 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1710 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1711 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1712 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1713 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1714 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1715 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1716 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1717 stats->tx_window_errors += dev_stats.tx_window_errors;
1719 ovs_mutex_unlock(&netdev->mutex);
1724 /* Retrieves current device stats for 'netdev-tap' netdev or
1725 * netdev-internal. */
1727 netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
1729 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1730 struct netdev_stats dev_stats;
1733 ovs_mutex_lock(&netdev->mutex);
1734 get_stats_via_vport(netdev_, stats);
1735 error = get_stats_via_netlink(netdev_, &dev_stats);
1737 if (!netdev->vport_stats_error) {
     /* NOTE(review): reached only when vport_stats_error is nonzero; a
      * plain 'else' would be clearer. */
1740 } else if (netdev->vport_stats_error) {
1741 /* Transmit and receive stats will appear to be swapped relative to the
1742 * other ports since we are the one sending the data, not a remote
1743 * computer. For consistency, we swap them back here. This does not
1744 * apply if we are getting stats from the vport layer because it always
1745 * tracks stats from the perspective of the switch. */
1748 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1749 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1750 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1751 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
     /* Detailed error counters are meaningless for virtual devices. */
1752 stats->rx_length_errors = 0;
1753 stats->rx_over_errors = 0;
1754 stats->rx_crc_errors = 0;
1755 stats->rx_frame_errors = 0;
1756 stats->rx_fifo_errors = 0;
1757 stats->rx_missed_errors = 0;
1758 stats->tx_aborted_errors = 0;
1759 stats->tx_carrier_errors = 0;
1760 stats->tx_fifo_errors = 0;
1761 stats->tx_heartbeat_errors = 0;
1762 stats->tx_window_errors = 0;
1764 /* Use kernel netdev's packet and byte counts since vport counters
1765 * do not reflect packet counts on the wire when GSO, TSO or GRO
 * are enabled.  rx/tx are swapped here for the same reason as above. */
1767 stats->rx_packets = dev_stats.tx_packets;
1768 stats->rx_bytes = dev_stats.tx_bytes;
1769 stats->tx_packets = dev_stats.rx_packets;
1770 stats->tx_bytes = dev_stats.rx_bytes;
1772 stats->rx_dropped += dev_stats.tx_dropped;
1773 stats->tx_dropped += dev_stats.rx_dropped;
1775 stats->rx_errors += dev_stats.tx_errors;
1776 stats->tx_errors += dev_stats.rx_errors;
1778 stats->multicast += dev_stats.multicast;
1779 stats->collisions += dev_stats.collisions;
1781 ovs_mutex_unlock(&netdev->mutex);
/* Retrieves stats for an internal device: only the vport layer is consulted,
 * and its cached error status is returned. */
1787 netdev_internal_get_stats(const struct netdev *netdev_,
1788 struct netdev_stats *stats)
1790 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1793 ovs_mutex_lock(&netdev->mutex);
1794 get_stats_via_vport(netdev_, stats);
1795 error = netdev->vport_stats_error;
1796 ovs_mutex_unlock(&netdev->mutex);
/* Reads link features for 'netdev' via ETHTOOL_GSET and translates them into
 * NETDEV_F_* bitmaps cached in netdev->supported, ->advertised and ->current.
 * Results (including the error) are cached under VALID_FEATURES.  Caller
 * holds netdev->mutex. */
1802 netdev_linux_read_features(struct netdev_linux *netdev)
1804 struct ethtool_cmd ecmd;
1808 if (netdev->cache_valid & VALID_FEATURES) {
1812 COVERAGE_INC(netdev_get_ethtool);
1813 memset(&ecmd, 0, sizeof ecmd);
1814 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
1815 ETHTOOL_GSET, "ETHTOOL_GSET");
1820 /* Supported features. */
1821 netdev->supported = 0;
1822 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1823 netdev->supported |= NETDEV_F_10MB_HD;
1825 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1826 netdev->supported |= NETDEV_F_10MB_FD;
1828 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1829 netdev->supported |= NETDEV_F_100MB_HD;
1831 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1832 netdev->supported |= NETDEV_F_100MB_FD;
1834 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1835 netdev->supported |= NETDEV_F_1GB_HD;
1837 if ((ecmd.supported & SUPPORTED_1000baseT_Full) ||
1838 (ecmd.supported & SUPPORTED_1000baseKX_Full)) {
1839 netdev->supported |= NETDEV_F_1GB_FD;
1841 if ((ecmd.supported & SUPPORTED_10000baseT_Full) ||
1842 (ecmd.supported & SUPPORTED_10000baseKX4_Full) ||
1843 (ecmd.supported & SUPPORTED_10000baseKR_Full) ||
1844 (ecmd.supported & SUPPORTED_10000baseR_FEC)) {
1845 netdev->supported |= NETDEV_F_10GB_FD;
1847 if ((ecmd.supported & SUPPORTED_40000baseKR4_Full) ||
1848 (ecmd.supported & SUPPORTED_40000baseCR4_Full) ||
1849 (ecmd.supported & SUPPORTED_40000baseSR4_Full) ||
1850 (ecmd.supported & SUPPORTED_40000baseLR4_Full)) {
1851 netdev->supported |= NETDEV_F_40GB_FD;
1853 if (ecmd.supported & SUPPORTED_TP) {
1854 netdev->supported |= NETDEV_F_COPPER;
1856 if (ecmd.supported & SUPPORTED_FIBRE) {
1857 netdev->supported |= NETDEV_F_FIBER;
1859 if (ecmd.supported & SUPPORTED_Autoneg) {
1860 netdev->supported |= NETDEV_F_AUTONEG;
1862 if (ecmd.supported & SUPPORTED_Pause) {
1863 netdev->supported |= NETDEV_F_PAUSE;
1865 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1866 netdev->supported |= NETDEV_F_PAUSE_ASYM;
1869 /* Advertised features. */
1870 netdev->advertised = 0;
1871 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1872 netdev->advertised |= NETDEV_F_10MB_HD;
1874 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1875 netdev->advertised |= NETDEV_F_10MB_FD;
1877 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1878 netdev->advertised |= NETDEV_F_100MB_HD;
1880 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1881 netdev->advertised |= NETDEV_F_100MB_FD;
1883 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1884 netdev->advertised |= NETDEV_F_1GB_HD;
1886 if ((ecmd.advertising & ADVERTISED_1000baseT_Full) ||
1887 (ecmd.advertising & ADVERTISED_1000baseKX_Full)) {
1888 netdev->advertised |= NETDEV_F_1GB_FD;
1890 if ((ecmd.advertising & ADVERTISED_10000baseT_Full) ||
1891 (ecmd.advertising & ADVERTISED_10000baseKX4_Full) ||
1892 (ecmd.advertising & ADVERTISED_10000baseKR_Full) ||
1893 (ecmd.advertising & ADVERTISED_10000baseR_FEC)) {
1894 netdev->advertised |= NETDEV_F_10GB_FD;
1896 if ((ecmd.advertising & ADVERTISED_40000baseKR4_Full) ||
1897 (ecmd.advertising & ADVERTISED_40000baseCR4_Full) ||
1898 (ecmd.advertising & ADVERTISED_40000baseSR4_Full) ||
1899 (ecmd.advertising & ADVERTISED_40000baseLR4_Full)) {
1900 netdev->advertised |= NETDEV_F_40GB_FD;
1902 if (ecmd.advertising & ADVERTISED_TP) {
1903 netdev->advertised |= NETDEV_F_COPPER;
1905 if (ecmd.advertising & ADVERTISED_FIBRE) {
1906 netdev->advertised |= NETDEV_F_FIBER;
1908 if (ecmd.advertising & ADVERTISED_Autoneg) {
1909 netdev->advertised |= NETDEV_F_AUTONEG;
1911 if (ecmd.advertising & ADVERTISED_Pause) {
1912 netdev->advertised |= NETDEV_F_PAUSE;
1914 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1915 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
1918 /* Current settings. */
1919 speed = ethtool_cmd_speed(&ecmd);
1920 if (speed == SPEED_10) {
1921 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1922 } else if (speed == SPEED_100) {
1923 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1924 } else if (speed == SPEED_1000) {
1925 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1926 } else if (speed == SPEED_10000) {
1927 netdev->current = NETDEV_F_10GB_FD;
     /* NOTE(review): bare literals here where the lower speeds use SPEED_*
      * macros — presumably SPEED_40000/SPEED_100000/SPEED_1000000 were not
      * available in all supported kernel headers; confirm before changing. */
1928 } else if (speed == 40000) {
1929 netdev->current = NETDEV_F_40GB_FD;
1930 } else if (speed == 100000) {
1931 netdev->current = NETDEV_F_100GB_FD;
1932 } else if (speed == 1000000) {
1933 netdev->current = NETDEV_F_1TB_FD;
1935 netdev->current = 0;
1938 if (ecmd.port == PORT_TP) {
1939 netdev->current |= NETDEV_F_COPPER;
1940 } else if (ecmd.port == PORT_FIBRE) {
1941 netdev->current |= NETDEV_F_FIBER;
1945 netdev->current |= NETDEV_F_AUTONEG;
     /* Cache the outcome, successful or not. */
1949 netdev->cache_valid |= VALID_FEATURES;
1950 netdev->get_features_error = error;
1953 /* Stores the features supported by 'netdev' into of '*current', '*advertised',
1954 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
1955 * Returns 0 if successful, otherwise a positive errno value. */
1957 netdev_linux_get_features(const struct netdev *netdev_,
1958 enum netdev_features *current,
1959 enum netdev_features *advertised,
1960 enum netdev_features *supported,
1961 enum netdev_features *peer)
1963 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1966 ovs_mutex_lock(&netdev->mutex);
1967 netdev_linux_read_features(netdev);
1968 if (!netdev->get_features_error) {
1969 *current = netdev->current;
1970 *advertised = netdev->advertised;
1971 *supported = netdev->supported;
1972 *peer = 0; /* XXX peer features are not implemented. */
1974 error = netdev->get_features_error;
1975 ovs_mutex_unlock(&netdev->mutex);
1980 /* Set the features advertised by 'netdev' to 'advertise'.  Reads the current
1981 * ethtool settings first so that only the advertising mask is modified. */
1982 netdev_linux_set_advertisements(struct netdev *netdev_,
1983 enum netdev_features advertise)
1985 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1986 struct ethtool_cmd ecmd;
1989 ovs_mutex_lock(&netdev->mutex);
1991 COVERAGE_INC(netdev_get_ethtool);
1992 memset(&ecmd, 0, sizeof ecmd);
1993 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1994 ETHTOOL_GSET, "ETHTOOL_GSET");
     /* Translate NETDEV_F_* bits into the ethtool ADVERTISED_* mask. */
1999 ecmd.advertising = 0;
2000 if (advertise & NETDEV_F_10MB_HD) {
2001 ecmd.advertising |= ADVERTISED_10baseT_Half;
2003 if (advertise & NETDEV_F_10MB_FD) {
2004 ecmd.advertising |= ADVERTISED_10baseT_Full;
2006 if (advertise & NETDEV_F_100MB_HD) {
2007 ecmd.advertising |= ADVERTISED_100baseT_Half;
2009 if (advertise & NETDEV_F_100MB_FD) {
2010 ecmd.advertising |= ADVERTISED_100baseT_Full;
2012 if (advertise & NETDEV_F_1GB_HD) {
2013 ecmd.advertising |= ADVERTISED_1000baseT_Half;
2015 if (advertise & NETDEV_F_1GB_FD) {
2016 ecmd.advertising |= ADVERTISED_1000baseT_Full;
2018 if (advertise & NETDEV_F_10GB_FD) {
2019 ecmd.advertising |= ADVERTISED_10000baseT_Full;
2021 if (advertise & NETDEV_F_COPPER) {
2022 ecmd.advertising |= ADVERTISED_TP;
2024 if (advertise & NETDEV_F_FIBER) {
2025 ecmd.advertising |= ADVERTISED_FIBRE;
2027 if (advertise & NETDEV_F_AUTONEG) {
2028 ecmd.advertising |= ADVERTISED_Autoneg;
2030 if (advertise & NETDEV_F_PAUSE) {
2031 ecmd.advertising |= ADVERTISED_Pause;
2033 if (advertise & NETDEV_F_PAUSE_ASYM) {
2034 ecmd.advertising |= ADVERTISED_Asym_Pause;
2036 COVERAGE_INC(netdev_set_ethtool);
2037 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
2038 ETHTOOL_SSET, "ETHTOOL_SSET");
2041 ovs_mutex_unlock(&netdev->mutex);
2045 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
2046 * successful, otherwise a positive errno value.  Policing is implemented by
2047 * (re)installing an ingress qdisc with a tc policer action. */
2048 netdev_linux_set_policing(struct netdev *netdev_,
2049 uint32_t kbits_rate, uint32_t kbits_burst)
2051 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2052 const char *netdev_name = netdev_get_name(netdev_);
2055 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
2056 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
2057 : kbits_burst); /* Stick with user-specified value. */
2059 ovs_mutex_lock(&netdev->mutex);
2060 if (netdev->cache_valid & VALID_POLICING) {
2061 error = netdev->netdev_policing_error;
2062 if (error || (netdev->kbits_rate == kbits_rate &&
2063 netdev->kbits_burst == kbits_burst)) {
2064 /* Assume that settings haven't changed since we last set them. */
2067 netdev->cache_valid &= ~VALID_POLICING;
2070 COVERAGE_INC(netdev_set_policing);
2071 /* Remove any existing ingress qdisc. */
2072 error = tc_add_del_ingress_qdisc(netdev_, false);
2074 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
2075 netdev_name, ovs_strerror(error));
     /* Install a fresh ingress qdisc and attach the policer to it. */
2080 error = tc_add_del_ingress_qdisc(netdev_, true);
2082 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
2083 netdev_name, ovs_strerror(error));
2087 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
2089 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
2090 netdev_name, ovs_strerror(error));
2095 netdev->kbits_rate = kbits_rate;
2096 netdev->kbits_burst = kbits_burst;
     /* Cache the result even on ENODEV so repeated calls on a missing
      * device are cheap. */
2099 if (!error || error == ENODEV) {
2100 netdev->netdev_policing_error = error;
2101 netdev->cache_valid |= VALID_POLICING;
2103 ovs_mutex_unlock(&netdev->mutex);
/* Adds to 'types' the OVS name of every installable QoS discipline. */
2108 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
2111 const struct tc_ops *const *opsp;
2113 for (opsp = tcs; *opsp != NULL; opsp++) {
2114 const struct tc_ops *ops = *opsp;
2115 if (ops->tc_install && ops->ovs_name[0] != '\0') {
2116 sset_add(types, ops->ovs_name);
/* Returns the tc_ops whose OVS-facing name is 'name', or NULL. */
2122 static const struct tc_ops *
2123 tc_lookup_ovs_name(const char *name)
2125 const struct tc_ops *const *opsp;
2127 for (opsp = tcs; *opsp != NULL; opsp++) {
2128 const struct tc_ops *ops = *opsp;
2129 if (!strcmp(name, ops->ovs_name)) {
/* Returns the tc_ops whose Linux qdisc name is 'name', or NULL.  Some ops
 * have no linux_name, hence the NULL check. */
2136 static const struct tc_ops *
2137 tc_lookup_linux_name(const char *name)
2139 const struct tc_ops *const *opsp;
2141 for (opsp = tcs; *opsp != NULL; opsp++) {
2142 const struct tc_ops *ops = *opsp;
2143 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Looks up queue 'queue_id' in 'netdev_''s queue hash using precomputed
 * 'hash'; returns the queue or NULL. */
2150 static struct tc_queue *
2151 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
2154 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2155 struct tc_queue *queue;
2157 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
2158 if (queue->queue_id == queue_id) {
/* Convenience wrapper around tc_find_queue__() that hashes 'queue_id'. */
2165 static struct tc_queue *
2166 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
2168 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Fills 'caps' with the capabilities of QoS type 'type' (maximum queue
 * count). */
2172 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
2174 struct netdev_qos_capabilities *caps)
2176 const struct tc_ops *ops = tc_lookup_ovs_name(type);
2180 caps->n_queues = ops->n_queues;
/* Queries the current qdisc on 'netdev_', storing its OVS type name in
 * '*typep' and its configuration in 'details'. */
2185 netdev_linux_get_qos(const struct netdev *netdev_,
2186 const char **typep, struct smap *details)
2188 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2191 ovs_mutex_lock(&netdev->mutex);
2192 error = tc_query_qdisc(netdev_);
2194 *typep = netdev->tc->ops->ovs_name;
2195 error = (netdev->tc->ops->qdisc_get
2196 ? netdev->tc->ops->qdisc_get(netdev_, details)
2199 ovs_mutex_unlock(&netdev->mutex);
/* Configures QoS type 'type' with 'details' on 'netdev_'.  If the requested
 * type matches the installed qdisc it is reconfigured in place; otherwise the
 * old qdisc is deleted and the new one installed. */
2205 netdev_linux_set_qos(struct netdev *netdev_,
2206 const char *type, const struct smap *details)
2208 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2209 const struct tc_ops *new_ops;
2212 new_ops = tc_lookup_ovs_name(type);
2213 if (!new_ops || !new_ops->tc_install) {
2217 ovs_mutex_lock(&netdev->mutex);
2218 error = tc_query_qdisc(netdev_);
2223 if (new_ops == netdev->tc->ops) {
2224 error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
2226 /* Delete existing qdisc. */
2227 error = tc_del_qdisc(netdev_);
2231 ovs_assert(netdev->tc == NULL);
2233 /* Install new qdisc. */
2234 error = new_ops->tc_install(netdev_, details);
2235 ovs_assert((error == 0) == (netdev->tc != NULL));
2239 ovs_mutex_unlock(&netdev->mutex);
/* Retrieves the configuration of queue 'queue_id' on 'netdev_' into
 * 'details'. */
2244 netdev_linux_get_queue(const struct netdev *netdev_,
2245 unsigned int queue_id, struct smap *details)
2247 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2250 ovs_mutex_lock(&netdev->mutex);
2251 error = tc_query_qdisc(netdev_);
2253 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2255 ? netdev->tc->ops->class_get(netdev_, queue, details)
2258 ovs_mutex_unlock(&netdev->mutex);
/* Configures queue 'queue_id' on 'netdev_' with 'details', provided the
 * queue id is in range and the qdisc supports per-class configuration. */
2264 netdev_linux_set_queue(struct netdev *netdev_,
2265 unsigned int queue_id, const struct smap *details)
2267 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2270 ovs_mutex_lock(&netdev->mutex);
2271 error = tc_query_qdisc(netdev_);
2273 error = (queue_id < netdev->tc->ops->n_queues
2274 && netdev->tc->ops->class_set
2275 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
2278 ovs_mutex_unlock(&netdev->mutex);
/* Deletes queue 'queue_id' from 'netdev_' if the qdisc supports class
 * deletion and the queue exists. */
2284 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
2286 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2289 ovs_mutex_lock(&netdev->mutex);
2290 error = tc_query_qdisc(netdev_);
2292 if (netdev->tc->ops->class_delete) {
2293 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2295 ? netdev->tc->ops->class_delete(netdev_, queue)
2301 ovs_mutex_unlock(&netdev->mutex);
/* Retrieves statistics for queue 'queue_id' on 'netdev_' into 'stats'. */
2307 netdev_linux_get_queue_stats(const struct netdev *netdev_,
2308 unsigned int queue_id,
2309 struct netdev_queue_stats *stats)
2311 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2314 ovs_mutex_lock(&netdev->mutex);
2315 error = tc_query_qdisc(netdev_);
2317 if (netdev->tc->ops->class_get_stats) {
2318 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2320 stats->created = queue->created;
2321 error = netdev->tc->ops->class_get_stats(netdev_, queue,
2330 ovs_mutex_unlock(&netdev->mutex);
/* State for iterating an RTM_GETTCLASS netlink dump of traffic classes. */
2335 struct queue_dump_state {
2336 struct nl_dump dump;
/* Begins a netlink dump of the traffic classes on 'netdev' into 'state'.
 * On success the caller must eventually call finish_queue_dump(). */
2341 start_queue_dump(const struct netdev *netdev, struct queue_dump_state *state)
2343 struct ofpbuf request;
2344 struct tcmsg *tcmsg;
2346 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
2350 tcmsg->tcm_parent = 0;
2351 nl_dump_start(&state->dump, NETLINK_ROUTE, &request);
2352 ofpbuf_uninit(&request);
2354 ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
/* Releases 'state' and returns the final status of the netlink dump. */
2359 finish_queue_dump(struct queue_dump_state *state)
2361 ofpbuf_uninit(&state->buf);
2362 return nl_dump_done(&state->dump);
/* Iteration state for the queue-dump API: a snapshot of queue ids. */
2365 struct netdev_linux_queue_state {
2366 unsigned int *queues;
/* Begins a queue dump on 'netdev_': snapshots all queue ids into a freshly
 * allocated state stored in '*statep'. */
2372 netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
2374 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2377 ovs_mutex_lock(&netdev->mutex);
2378 error = tc_query_qdisc(netdev_);
2380 if (netdev->tc->ops->class_get) {
2381 struct netdev_linux_queue_state *state;
2382 struct tc_queue *queue;
2385 *statep = state = xmalloc(sizeof *state);
2386 state->n_queues = hmap_count(&netdev->tc->queues);
2387 state->cur_queue = 0;
2388 state->queues = xmalloc(state->n_queues * sizeof *state->queues);
2391 HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
2392 state->queues[i++] = queue->queue_id;
2398 ovs_mutex_unlock(&netdev->mutex);
/* Advances the dump: yields the next still-existing queue id and its details.
 * Queues deleted since dump_start are skipped. */
2404 netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
2405 unsigned int *queue_idp, struct smap *details)
2407 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2408 struct netdev_linux_queue_state *state = state_;
2411 ovs_mutex_lock(&netdev->mutex);
2412 while (state->cur_queue < state->n_queues) {
2413 unsigned int queue_id = state->queues[state->cur_queue++];
2414 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2417 *queue_idp = queue_id;
2418 error = netdev->tc->ops->class_get(netdev_, queue, details);
2422 ovs_mutex_unlock(&netdev->mutex);
/* Frees the dump state allocated by netdev_linux_queue_dump_start(). */
2428 netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
2431 struct netdev_linux_queue_state *state = state_;
2433 free(state->queues);
2439 netdev_linux_dump_queue_stats(const struct netdev *netdev_,
2440 netdev_dump_queue_stats_cb *cb, void *aux)
2442 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2445 ovs_mutex_lock(&netdev->mutex);
2446 error = tc_query_qdisc(netdev_);
2448 struct queue_dump_state state;
2450 if (!netdev->tc->ops->class_dump_stats) {
2452 } else if (!start_queue_dump(netdev_, &state)) {
2458 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
2459 retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
2466 retval = finish_queue_dump(&state);
2472 ovs_mutex_unlock(&netdev->mutex);
/* Reports the IPv4 address and netmask of 'netdev_', caching the SIOCGIFADDR
 * / SIOCGIFNETMASK results under the VALID_IN4 bit.  Returns EADDRNOTAVAIL
 * when no address is assigned. */
2478 netdev_linux_get_in4(const struct netdev *netdev_,
2479 struct in_addr *address, struct in_addr *netmask)
2481 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2484 ovs_mutex_lock(&netdev->mutex);
2485 if (!(netdev->cache_valid & VALID_IN4)) {
2486 error = netdev_linux_get_ipv4(netdev_, &netdev->address,
2487 SIOCGIFADDR, "SIOCGIFADDR");
2489 error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
2490 SIOCGIFNETMASK, "SIOCGIFNETMASK");
/* Cache the outcome (success or error) for subsequent calls. */
2492 netdev->in4_error = error;
2493 netdev->cache_valid |= VALID_IN4;
2495 error = netdev->in4_error;
2499 if (netdev->address.s_addr != INADDR_ANY) {
2500 *address = netdev->address;
2501 *netmask = netdev->netmask;
2503 error = EADDRNOTAVAIL;
2506 ovs_mutex_unlock(&netdev->mutex);
/* Assigns 'address' and 'netmask' to 'netdev_' via SIOCSIFADDR and
 * SIOCSIFNETMASK, updating the VALID_IN4 cache on success and invalidating
 * it on failure. */
2512 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2513 struct in_addr netmask)
2515 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2518 ovs_mutex_lock(&netdev->mutex);
2519 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2521 netdev->address = address;
2522 netdev->netmask = netmask;
/* Only set a netmask when a real (non-zero) address was assigned. */
2523 if (address.s_addr != INADDR_ANY) {
2524 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2525 "SIOCSIFNETMASK", netmask);
2530 netdev->cache_valid |= VALID_IN4;
2531 netdev->in4_error = 0;
2533 netdev->cache_valid &= ~VALID_IN4;
2535 ovs_mutex_unlock(&netdev->mutex);
/* Retrieves all IPv4/IPv6 addresses and their netmasks assigned to
 * 'netdev_' into the arrays '*addr' and '*mask', with '*n_cnt' entries.
 * (The previous comment here described a single-IPv6 lookup and did not
 * match this function's actual contract.) */
2544 netdev_linux_get_addr_list(const struct netdev *netdev_,
2545 struct in6_addr **addr, struct in6_addr **mask, int *n_cnt)
2547 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2550 ovs_mutex_lock(&netdev->mutex);
2551 error = netdev_get_addrs(netdev_get_name(netdev_), addr, mask, n_cnt);
2552 ovs_mutex_unlock(&netdev->mutex);
/* Fills 'sa' with an AF_INET sockaddr holding 'addr'. */
2558 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
2560 struct sockaddr_in sin;
2561 memset(&sin, 0, sizeof sin);
2562 sin.sin_family = AF_INET;
2563 sin.sin_addr = addr;
/* Zero 'sa' first so bytes beyond sizeof(sin) are well-defined. */
2566 memset(sa, 0, sizeof *sa);
2567 memcpy(sa, &sin, sizeof sin);
/* Issues 'ioctl_nr' ('ioctl_name' is for logging) to set an IPv4
 * address-type field on 'netdev' through the shared AF_INET socket. */
2571 do_set_addr(struct netdev *netdev,
2572 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2576 make_in4_sockaddr(&ifr.ifr_addr, addr);
2577 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2581 /* Adds 'router' as a default IP gateway. */
2583 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2585 struct in_addr any = { INADDR_ANY };
2589 memset(&rt, 0, sizeof rt);
/* Destination 0.0.0.0 with genmask 0.0.0.0 and RTF_GATEWAY: the classic
 * "default route via 'router'" rtentry. */
2590 make_in4_sockaddr(&rt.rt_dst, any);
2591 make_in4_sockaddr(&rt.rt_gateway, router);
2592 make_in4_sockaddr(&rt.rt_genmask, any);
2593 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2594 error = af_inet_ioctl(SIOCADDRT, &rt);
2596 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
/* Determines the next hop toward 'host' by scanning /proc/net/route: on a
 * matching route, stores the gateway (or 0 when 'host' is directly
 * reachable) in '*next_hop' and the malloc'd output device name in
 * '*netdev_name'. */
2602 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2605 static const char fn[] = "/proc/net/route";
2610 *netdev_name = NULL;
2611 stream = fopen(fn, "r");
2612 if (stream == NULL) {
2613 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
2618 while (fgets(line, sizeof line, stream)) {
2621 ovs_be32 dest, gateway, mask;
2622 int refcnt, metric, mtu;
2623 unsigned int flags, use, window, irtt;
2626 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2628 iface, &dest, &gateway, &flags, &refcnt,
2629 &use, &metric, &mask, &mtu, &window, &irtt)) {
2630 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2634 if (!(flags & RTF_UP)) {
2635 /* Skip routes that aren't up. */
2639 /* The output of 'dest', 'mask', and 'gateway' were given in
2640 * network byte order, so we don't need any endian
2641 * conversions here. */
2642 if ((dest & mask) == (host->s_addr & mask)) {
2644 /* The host is directly reachable. */
2645 next_hop->s_addr = 0;
2647 /* To reach the host, we must go through a gateway. */
2648 next_hop->s_addr = gateway;
2650 *netdev_name = xstrdup(iface);
/* Fills 'smap' with driver status for 'netdev_' (driver name/version and
 * firmware version), querying ETHTOOL_GDRVINFO once and caching the result
 * under VALID_DRVINFO. */
2662 netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
2664 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2667 ovs_mutex_lock(&netdev->mutex);
2668 if (!(netdev->cache_valid & VALID_DRVINFO)) {
/* netdev_linux_do_ethtool() works on an ethtool_cmd; drvinfo is
 * passed through this aliasing cast. */
2669 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
2671 COVERAGE_INC(netdev_get_ethtool);
2672 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2673 error = netdev_linux_do_ethtool(netdev->up.name,
2676 "ETHTOOL_GDRVINFO");
2678 netdev->cache_valid |= VALID_DRVINFO;
2683 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2684 smap_add(smap, "driver_version", netdev->drvinfo.version);
2685 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
2687 ovs_mutex_unlock(&netdev->mutex);
/* Status hook for OVS-internal devices: no hardware driver to query, so
 * report a fixed driver name. */
2693 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2696 smap_add(smap, "driver_name", "openvswitch");
2700 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2701 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2702 * returns 0. Otherwise, it returns a positive errno value; in particular,
2703 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
2705 netdev_linux_arp_lookup(const struct netdev *netdev,
2706 ovs_be32 ip, struct eth_addr *mac)
2709 struct sockaddr_in sin;
2712 memset(&r, 0, sizeof r);
2713 memset(&sin, 0, sizeof sin);
2714 sin.sin_family = AF_INET;
2715 sin.sin_addr.s_addr = ip;
2717 memcpy(&r.arp_pa, &sin, sizeof sin);
2718 r.arp_ha.sa_family = ARPHRD_ETHER;
/* SIOCGARP needs the device name to scope the lookup. */
2720 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2721 COVERAGE_INC(netdev_arp_lookup);
2722 retval = af_inet_ioctl(SIOCGARP, &r);
2724 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
/* ENXIO just means "no entry"; only log unexpected failures. */
2725 } else if (retval != ENXIO) {
2726 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2727 netdev_get_name(netdev), IP_ARGS(ip),
2728 ovs_strerror(retval));
/* Converts NETDEV_* flag bits to the corresponding kernel IFF_* bits. */
2734 nd_to_iff_flags(enum netdev_flags nd)
2737 if (nd & NETDEV_UP) {
2740 if (nd & NETDEV_PROMISC) {
2743 if (nd & NETDEV_LOOPBACK) {
2744 iff |= IFF_LOOPBACK;
/* Converts kernel IFF_* flag bits back to NETDEV_* bits. */
2750 iff_to_nd_flags(int iff)
2752 enum netdev_flags nd = 0;
2756 if (iff & IFF_PROMISC) {
2757 nd |= NETDEV_PROMISC;
2759 if (iff & IFF_LOOPBACK) {
2760 nd |= NETDEV_LOOPBACK;
/* Turns 'off' flags off and 'on' flags on for 'netdev', reporting the
 * previous flag set in '*old_flagsp'.  Caller holds netdev->mutex. */
2766 update_flags(struct netdev_linux *netdev, enum netdev_flags off,
2767 enum netdev_flags on, enum netdev_flags *old_flagsp)
2768 OVS_REQUIRES(netdev->mutex)
2770 int old_flags, new_flags;
2773 old_flags = netdev->ifi_flags;
2774 *old_flagsp = iff_to_nd_flags(old_flags);
2775 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2776 if (new_flags != old_flags) {
2777 error = set_flags(netdev_get_name(&netdev->up), new_flags);
/* Re-read the kernel's view so the cached copy stays accurate even
 * if set_flags() partially failed. */
2778 get_flags(&netdev->up, &netdev->ifi_flags);
/* Provider hook: locked wrapper around update_flags(). */
2785 netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2786 enum netdev_flags on, enum netdev_flags *old_flagsp)
2788 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2791 ovs_mutex_lock(&netdev->mutex);
2792 error = update_flags(netdev, off, on, old_flagsp);
2793 ovs_mutex_unlock(&netdev->mutex);
/* Expands to a struct netdev_class initializer sharing the common Linux
 * implementation, parameterized on the name and the construct/stats/
 * features/status hooks that differ between the system, tap, and internal
 * device classes below.  (Comments cannot be inserted between the
 * backslash-continued lines without breaking the macro.) */
2798 #define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS, \
2799 GET_FEATURES, GET_STATUS) \
2802 false, /* is_pmd */ \
2806 netdev_linux_wait, \
2808 netdev_linux_alloc, \
2810 netdev_linux_destruct, \
2811 netdev_linux_dealloc, \
2812 NULL, /* get_config */ \
2813 NULL, /* set_config */ \
2814 NULL, /* get_tunnel_config */ \
2815 NULL, /* build header */ \
2816 NULL, /* push header */ \
2817 NULL, /* pop header */ \
2818 NULL, /* get_numa_id */ \
2819 NULL, /* set_multiq */ \
2821 netdev_linux_send, \
2822 netdev_linux_send_wait, \
2824 netdev_linux_set_etheraddr, \
2825 netdev_linux_get_etheraddr, \
2826 netdev_linux_get_mtu, \
2827 netdev_linux_set_mtu, \
2828 netdev_linux_get_ifindex, \
2829 netdev_linux_get_carrier, \
2830 netdev_linux_get_carrier_resets, \
2831 netdev_linux_set_miimon_interval, \
2835 netdev_linux_set_advertisements, \
2837 netdev_linux_set_policing, \
2838 netdev_linux_get_qos_types, \
2839 netdev_linux_get_qos_capabilities, \
2840 netdev_linux_get_qos, \
2841 netdev_linux_set_qos, \
2842 netdev_linux_get_queue, \
2843 netdev_linux_set_queue, \
2844 netdev_linux_delete_queue, \
2845 netdev_linux_get_queue_stats, \
2846 netdev_linux_queue_dump_start, \
2847 netdev_linux_queue_dump_next, \
2848 netdev_linux_queue_dump_done, \
2849 netdev_linux_dump_queue_stats, \
2851 netdev_linux_get_in4, \
2852 netdev_linux_set_in4, \
2853 netdev_linux_get_addr_list, \
2854 netdev_linux_add_router, \
2855 netdev_linux_get_next_hop, \
2857 netdev_linux_arp_lookup, \
2859 netdev_linux_update_flags, \
2861 netdev_linux_rxq_alloc, \
2862 netdev_linux_rxq_construct, \
2863 netdev_linux_rxq_destruct, \
2864 netdev_linux_rxq_dealloc, \
2865 netdev_linux_rxq_recv, \
2866 netdev_linux_rxq_wait, \
2867 netdev_linux_rxq_drain, \
2870 const struct netdev_class netdev_linux_class =
2873 netdev_linux_construct,
2874 netdev_linux_get_stats,
2875 netdev_linux_get_features,
2876 netdev_linux_get_status);
/* Tap devices use a dedicated construct hook and their own stats hook
 * (tx/rx are swapped from the kernel's point of view for taps). */
2878 const struct netdev_class netdev_tap_class =
2881 netdev_linux_construct_tap,
2882 netdev_tap_get_stats,
2883 netdev_linux_get_features,
2884 netdev_linux_get_status);
/* OVS internal devices: no ethtool features, synthetic status. */
2886 const struct netdev_class netdev_internal_class =
2889 netdev_linux_construct,
2890 netdev_internal_get_stats,
2891 NULL, /* get_features */
2892 netdev_internal_get_status);
/* CoDel traffic control class: a classless qdisc, hence no queues. */
2895 #define CODEL_N_QUEUES 0x0000
2897 /* In sufficiently new kernel headers these are defined as enums in
2898 * <linux/pkt_sched.h>. Define them here as macros to help out with older
2899 * kernels. (This overrides any enum definition in the header file but that's
2901 #define TCA_CODEL_TARGET 1
2902 #define TCA_CODEL_LIMIT 2
2903 #define TCA_CODEL_INTERVAL 3
/* Returns the struct codel embedded in 'netdev_''s tc state.  Only valid
 * while the active qdisc is codel. */
2912 static struct codel *
2913 codel_get__(const struct netdev *netdev_)
2915 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2916 return CONTAINER_OF(netdev->tc, struct codel, tc);
/* Records codel parameters in a fresh tc instance attached to 'netdev_'.
 * Local bookkeeping only; codel_setup_qdisc__() talks to the kernel. */
2920 codel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
2923 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2924 struct codel *codel;
2926 codel = xmalloc(sizeof *codel);
2927 tc_init(&codel->tc, &tc_ops_codel);
2928 codel->target = target;
2929 codel->limit = limit;
2930 codel->interval = interval;
2932 netdev->tc = &codel->tc;
/* Replaces 'netdev''s root qdisc with a codel qdisc.  A zero parameter
 * selects the default (target 5000, limit 10240, interval 100000). */
2936 codel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
2940 struct ofpbuf request;
2941 struct tcmsg *tcmsg;
2942 uint32_t otarget, olimit, ointerval;
2945 tc_del_qdisc(netdev);
2947 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2948 NLM_F_EXCL | NLM_F_CREATE, &request);
2952 tcmsg->tcm_handle = tc_make_handle(1, 0);
2953 tcmsg->tcm_parent = TC_H_ROOT;
2955 otarget = target ? target : 5000;
2956 olimit = limit ? limit : 10240;
2957 ointerval = interval ? interval : 100000;
2959 nl_msg_put_string(&request, TCA_KIND, "codel");
2960 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2961 nl_msg_put_u32(&request, TCA_CODEL_TARGET, otarget);
2962 nl_msg_put_u32(&request, TCA_CODEL_LIMIT, olimit);
2963 nl_msg_put_u32(&request, TCA_CODEL_INTERVAL, ointerval);
2964 nl_msg_end_nested(&request, opt_offset);
2966 error = tc_transact(&request, NULL);
2968 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
2969 "target %u, limit %u, interval %u error %d(%s)",
2970 netdev_get_name(netdev),
2971 otarget, olimit, ointerval,
2972 error, ovs_strerror(error));
/* Parses "target", "limit" and "interval" from 'details' into 'codel',
 * substituting the defaults (5000 / 10240 / 100000) for missing or zero
 * values. */
2978 codel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
2979 const struct smap *details, struct codel *codel)
2981 const char *target_s;
2982 const char *limit_s;
2983 const char *interval_s;
2985 target_s = smap_get(details, "target");
2986 limit_s = smap_get(details, "limit");
2987 interval_s = smap_get(details, "interval");
2989 codel->target = target_s ? strtoull(target_s, NULL, 10) : 0;
2990 codel->limit = limit_s ? strtoull(limit_s, NULL, 10) : 0;
2991 codel->interval = interval_s ? strtoull(interval_s, NULL, 10) : 0;
2993 if (!codel->target) {
2994 codel->target = 5000;
2996 if (!codel->limit) {
2997 codel->limit = 10240;
2999 if (!codel->interval) {
3000 codel->interval = 100000;
/* tc_install hook for "linux-codel": configures the kernel qdisc and, on
 * success, mirrors the parameters in local state. */
3005 codel_tc_install(struct netdev *netdev, const struct smap *details)
3010 codel_parse_qdisc_details__(netdev, details, &codel);
3011 error = codel_setup_qdisc__(netdev, codel.target, codel.limit,
3014 codel_install__(netdev, codel.target, codel.limit, codel.interval);
/* Extracts codel parameters from the nested TCA_OPTIONS attribute
 * 'nl_options' into 'codel'.  (The error return for a failed parse is not
 * shown in this fragment.) */
3020 codel_parse_tca_options__(struct nlattr *nl_options, struct codel *codel)
3022 static const struct nl_policy tca_codel_policy[] = {
3023 [TCA_CODEL_TARGET] = { .type = NL_A_U32 },
3024 [TCA_CODEL_LIMIT] = { .type = NL_A_U32 },
3025 [TCA_CODEL_INTERVAL] = { .type = NL_A_U32 }
3028 struct nlattr *attrs[ARRAY_SIZE(tca_codel_policy)];
3030 if (!nl_parse_nested(nl_options, tca_codel_policy,
3031 attrs, ARRAY_SIZE(tca_codel_policy))) {
3032 VLOG_WARN_RL(&rl, "failed to parse CoDel class options");
3036 codel->target = nl_attr_get_u32(attrs[TCA_CODEL_TARGET]);
3037 codel->limit = nl_attr_get_u32(attrs[TCA_CODEL_LIMIT]);
3038 codel->interval = nl_attr_get_u32(attrs[TCA_CODEL_INTERVAL]);
/* tc_load hook: mirrors an existing kernel codel qdisc (described by
 * 'nlmsg') into local state. */
3043 codel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3045 struct nlattr *nlattr;
3050 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3055 error = codel_parse_tca_options__(nlattr, &codel);
3060 codel_install__(netdev, codel.target, codel.limit, codel.interval);
/* tc_destroy hook: frees the codel state allocated by codel_install__(). */
3066 codel_tc_destroy(struct tc *tc)
3068 struct codel *codel = CONTAINER_OF(tc, struct codel, tc);
/* qdisc_get hook: reports the cached configuration into 'details'. */
3074 codel_qdisc_get(const struct netdev *netdev, struct smap *details)
3076 const struct codel *codel = codel_get__(netdev);
3077 smap_add_format(details, "target", "%u", codel->target);
3078 smap_add_format(details, "limit", "%u", codel->limit);
3079 smap_add_format(details, "interval", "%u", codel->interval);
/* qdisc_set hook: re-parses 'details' and updates the cached state in
 * place.  (The kernel-side update, if any, is outside this fragment.) */
3084 codel_qdisc_set(struct netdev *netdev, const struct smap *details)
3088 codel_parse_qdisc_details__(netdev, details, &codel);
3089 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3090 codel_get__(netdev)->target = codel.target;
3091 codel_get__(netdev)->limit = codel.limit;
3092 codel_get__(netdev)->interval = codel.interval;
/* tc_ops vtable for the "linux-codel" QoS type. */
3096 static const struct tc_ops tc_ops_codel = {
3097 "codel", /* linux_name */
3098 "linux-codel", /* ovs_name */
3099 CODEL_N_QUEUES, /* n_queues */
3112 /* FQ-CoDel traffic control class. */
3114 #define FQCODEL_N_QUEUES 0x0000
3116 /* In sufficiently new kernel headers these are defined as enums in
3117 * <linux/pkt_sched.h>. Define them here as macros to help out with older
3118 * kernels. (This overrides any enum definition in the header file but that's
3120 #define TCA_FQ_CODEL_TARGET 1
3121 #define TCA_FQ_CODEL_LIMIT 2
3122 #define TCA_FQ_CODEL_INTERVAL 3
3123 #define TCA_FQ_CODEL_ECN 4
3124 #define TCA_FQ_CODEL_FLOWS 5
3125 #define TCA_FQ_CODEL_QUANTUM 6
/* Returns the struct fqcodel embedded in 'netdev_''s tc state.  Only valid
 * while the active qdisc is fq_codel. */
3136 static struct fqcodel *
3137 fqcodel_get__(const struct netdev *netdev_)
3139 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3140 return CONTAINER_OF(netdev->tc, struct fqcodel, tc);
/* Records fq_codel parameters in a fresh tc instance attached to
 * 'netdev_'.  Local bookkeeping only; no kernel interaction. */
3144 fqcodel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
3145 uint32_t interval, uint32_t flows, uint32_t quantum)
3147 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3148 struct fqcodel *fqcodel;
3150 fqcodel = xmalloc(sizeof *fqcodel);
3151 tc_init(&fqcodel->tc, &tc_ops_fqcodel);
3152 fqcodel->target = target;
3153 fqcodel->limit = limit;
3154 fqcodel->interval = interval;
3155 fqcodel->flows = flows;
3156 fqcodel->quantum = quantum;
3158 netdev->tc = &fqcodel->tc;
/* Replaces 'netdev''s root qdisc with an fq_codel qdisc.  A zero parameter
 * selects the fallback listed below. */
3162 fqcodel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
3163 uint32_t interval, uint32_t flows, uint32_t quantum)
3166 struct ofpbuf request;
3167 struct tcmsg *tcmsg;
3168 uint32_t otarget, olimit, ointerval, oflows, oquantum;
3171 tc_del_qdisc(netdev);
3173 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3174 NLM_F_EXCL | NLM_F_CREATE, &request);
3178 tcmsg->tcm_handle = tc_make_handle(1, 0);
3179 tcmsg->tcm_parent = TC_H_ROOT;
3181 otarget = target ? target : 5000;
3182 olimit = limit ? limit : 10240;
3183 ointerval = interval ? interval : 100000;
3184 oflows = flows ? flows : 1024;
3185 oquantum = quantum ? quantum : 1514; /* fq_codel default quantum is 1514
3188 nl_msg_put_string(&request, TCA_KIND, "fq_codel");
3189 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3190 nl_msg_put_u32(&request, TCA_FQ_CODEL_TARGET, otarget);
3191 nl_msg_put_u32(&request, TCA_FQ_CODEL_LIMIT, olimit);
3192 nl_msg_put_u32(&request, TCA_FQ_CODEL_INTERVAL, ointerval);
3193 nl_msg_put_u32(&request, TCA_FQ_CODEL_FLOWS, oflows);
3194 nl_msg_put_u32(&request, TCA_FQ_CODEL_QUANTUM, oquantum);
3195 nl_msg_end_nested(&request, opt_offset);
3197 error = tc_transact(&request, NULL);
3199 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3200 "target %u, limit %u, interval %u, flows %u, quantum %u error %d(%s)",
3201 netdev_get_name(netdev),
3202 otarget, olimit, ointerval, oflows, oquantum,
3203 error, ovs_strerror(error));
/* Parses fq_codel parameters from 'details', substituting defaults for
 * missing or zero values. */
3209 fqcodel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
3210 const struct smap *details, struct fqcodel *fqcodel)
3212 const char *target_s;
3213 const char *limit_s;
3214 const char *interval_s;
3215 const char *flows_s;
3216 const char *quantum_s;
3218 target_s = smap_get(details, "target");
3219 limit_s = smap_get(details, "limit");
3220 interval_s = smap_get(details, "interval");
3221 flows_s = smap_get(details, "flows");
3222 quantum_s = smap_get(details, "quantum");
3223 fqcodel->target = target_s ? strtoull(target_s, NULL, 10) : 0;
3224 fqcodel->limit = limit_s ? strtoull(limit_s, NULL, 10) : 0;
3225 fqcodel->interval = interval_s ? strtoull(interval_s, NULL, 10) : 0;
3226 fqcodel->flows = flows_s ? strtoull(flows_s, NULL, 10) : 0;
3227 fqcodel->quantum = quantum_s ? strtoull(quantum_s, NULL, 10) : 0;
3228 if (!fqcodel->target) {
3229 fqcodel->target = 5000;
3231 if (!fqcodel->limit) {
3232 fqcodel->limit = 10240;
3234 if (!fqcodel->interval) {
/* NOTE(review): 1000000 here disagrees with the 100000 fallback in
 * fqcodel_setup_qdisc__() above — confirm which default is intended. */
3235 fqcodel->interval = 1000000;
3237 if (!fqcodel->flows) {
3238 fqcodel->flows = 1024;
3240 if (!fqcodel->quantum) {
3241 fqcodel->quantum = 1514;
/* tc_install hook for "linux-fq_codel": configures the kernel qdisc and,
 * on success, mirrors the parameters in local state. */
3246 fqcodel_tc_install(struct netdev *netdev, const struct smap *details)
3249 struct fqcodel fqcodel;
3251 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3252 error = fqcodel_setup_qdisc__(netdev, fqcodel.target, fqcodel.limit,
3253 fqcodel.interval, fqcodel.flows,
3256 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit,
3257 fqcodel.interval, fqcodel.flows, fqcodel.quantum);
/* Extracts fq_codel parameters from the nested TCA_OPTIONS attribute
 * 'nl_options' into 'fqcodel'. */
3263 fqcodel_parse_tca_options__(struct nlattr *nl_options, struct fqcodel *fqcodel)
3265 static const struct nl_policy tca_fqcodel_policy[] = {
3266 [TCA_FQ_CODEL_TARGET] = { .type = NL_A_U32 },
3267 [TCA_FQ_CODEL_LIMIT] = { .type = NL_A_U32 },
3268 [TCA_FQ_CODEL_INTERVAL] = { .type = NL_A_U32 },
3269 [TCA_FQ_CODEL_FLOWS] = { .type = NL_A_U32 },
3270 [TCA_FQ_CODEL_QUANTUM] = { .type = NL_A_U32 }
3273 struct nlattr *attrs[ARRAY_SIZE(tca_fqcodel_policy)];
3275 if (!nl_parse_nested(nl_options, tca_fqcodel_policy,
3276 attrs, ARRAY_SIZE(tca_fqcodel_policy))) {
3277 VLOG_WARN_RL(&rl, "failed to parse FQ_CoDel class options");
3281 fqcodel->target = nl_attr_get_u32(attrs[TCA_FQ_CODEL_TARGET]);
3282 fqcodel->limit = nl_attr_get_u32(attrs[TCA_FQ_CODEL_LIMIT]);
3283 fqcodel->interval =nl_attr_get_u32(attrs[TCA_FQ_CODEL_INTERVAL]);
3284 fqcodel->flows = nl_attr_get_u32(attrs[TCA_FQ_CODEL_FLOWS]);
3285 fqcodel->quantum = nl_attr_get_u32(attrs[TCA_FQ_CODEL_QUANTUM]);
/* tc_load hook: mirrors an existing kernel fq_codel qdisc (described by
 * 'nlmsg') into local state. */
3290 fqcodel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3292 struct nlattr *nlattr;
3295 struct fqcodel fqcodel;
3297 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3302 error = fqcodel_parse_tca_options__(nlattr, &fqcodel);
3307 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3308 fqcodel.flows, fqcodel.quantum);
/* tc_destroy hook: frees the fqcodel state from fqcodel_install__(). */
3313 fqcodel_tc_destroy(struct tc *tc)
3315 struct fqcodel *fqcodel = CONTAINER_OF(tc, struct fqcodel, tc);
/* qdisc_get hook: reports the cached configuration into 'details'. */
3321 fqcodel_qdisc_get(const struct netdev *netdev, struct smap *details)
3323 const struct fqcodel *fqcodel = fqcodel_get__(netdev);
3324 smap_add_format(details, "target", "%u", fqcodel->target);
3325 smap_add_format(details, "limit", "%u", fqcodel->limit);
3326 smap_add_format(details, "interval", "%u", fqcodel->interval);
3327 smap_add_format(details, "flows", "%u", fqcodel->flows);
3328 smap_add_format(details, "quantum", "%u", fqcodel->quantum);
/* qdisc_set hook: re-parses 'details' and updates the cached state. */
3333 fqcodel_qdisc_set(struct netdev *netdev, const struct smap *details)
3335 struct fqcodel fqcodel;
3337 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3338 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3339 fqcodel.flows, fqcodel.quantum);
3340 fqcodel_get__(netdev)->target = fqcodel.target;
3341 fqcodel_get__(netdev)->limit = fqcodel.limit;
3342 fqcodel_get__(netdev)->interval = fqcodel.interval;
3343 fqcodel_get__(netdev)->flows = fqcodel.flows;
3344 fqcodel_get__(netdev)->quantum = fqcodel.quantum;
/* tc_ops vtable for the "linux-fq_codel" QoS type. */
3348 static const struct tc_ops tc_ops_fqcodel = {
3349 "fq_codel", /* linux_name */
3350 "linux-fq_codel", /* ovs_name */
3351 FQCODEL_N_QUEUES, /* n_queues */
3364 /* SFQ traffic control class. */
3366 #define SFQ_N_QUEUES 0x0000
/* Returns the struct sfq embedded in 'netdev_''s tc state.  Only valid
 * while the active qdisc is sfq. */
3375 sfq_get__(const struct netdev *netdev_)
3377 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3378 return CONTAINER_OF(netdev->tc, struct sfq, tc);
/* Records sfq parameters (note the order: quantum, then perturb) in a
 * fresh tc instance attached to 'netdev_'.  Local bookkeeping only. */
3382 sfq_install__(struct netdev *netdev_, uint32_t quantum, uint32_t perturb)
3384 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3387 sfq = xmalloc(sizeof *sfq);
3388 tc_init(&sfq->tc, &tc_ops_sfq);
3389 sfq->perturb = perturb;
3390 sfq->quantum = quantum;
3392 netdev->tc = &sfq->tc;
/* Replaces 'netdev''s root qdisc with an sfq qdisc.  Zero 'quantum' falls
 * back to the device MTU; zero 'perturb' falls back to 10 seconds. */
3396 sfq_setup_qdisc__(struct netdev *netdev, uint32_t quantum, uint32_t perturb)
3398 struct tc_sfq_qopt opt;
3399 struct ofpbuf request;
3400 struct tcmsg *tcmsg;
3402 int mtu_error, error;
3403 mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3405 tc_del_qdisc(netdev);
3407 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3408 NLM_F_EXCL | NLM_F_CREATE, &request);
3412 tcmsg->tcm_handle = tc_make_handle(1, 0);
3413 tcmsg->tcm_parent = TC_H_ROOT;
3415 memset(&opt, 0, sizeof opt);
3418 opt.quantum = mtu; /* if we cannot find mtu, use default */
3421 opt.quantum = quantum;
3425 opt.perturb_period = 10;
3427 opt.perturb_period = perturb;
/* sfq takes its options as a raw struct tc_sfq_qopt, not nested attrs. */
3430 nl_msg_put_string(&request, TCA_KIND, "sfq");
3431 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3433 error = tc_transact(&request, NULL);
3435 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3436 "quantum %u, perturb %u error %d(%s)",
3437 netdev_get_name(netdev),
3438 opt.quantum, opt.perturb_period,
3439 error, ovs_strerror(error));
/* Parses "quantum" and "perturb" from 'details'.  A missing quantum falls
 * back to the device MTU, which therefore must be available. */
3445 sfq_parse_qdisc_details__(struct netdev *netdev,
3446 const struct smap *details, struct sfq *sfq)
3448 const char *perturb_s;
3449 const char *quantum_s;
3453 perturb_s = smap_get(details, "perturb");
3454 quantum_s = smap_get(details, "quantum");
3455 sfq->perturb = perturb_s ? strtoull(perturb_s, NULL, 10) : 0;
3456 sfq->quantum = quantum_s ? strtoull(quantum_s, NULL, 10) : 0;
3457 if (!sfq->perturb) {
3461 if (!sfq->quantum) {
3462 mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3466 VLOG_WARN_RL(&rl, "when using SFQ, you must specify quantum on a "
3467 "device without mtu");
/* tc_install hook for "linux-sfq": configures the kernel qdisc and, on
 * success, mirrors the parameters in local state. */
3474 sfq_tc_install(struct netdev *netdev, const struct smap *details)
3479 sfq_parse_qdisc_details__(netdev, details, &sfq);
3480 error = sfq_setup_qdisc__(netdev, sfq.quantum, sfq.perturb);
3482 sfq_install__(netdev, sfq.quantum, sfq.perturb);
/* tc_load hook: mirrors an existing kernel sfq qdisc (described by 'nlmsg')
 * into local state.
 *
 * Bug fix: sfq_install__() is declared (netdev, quantum, perturb), but this
 * caller previously passed (perturb_period, quantum), swapping the two
 * values so the loaded configuration was recorded backwards. */
3488 sfq_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3490 const struct tc_sfq_qopt *sfq;
3491 struct nlattr *nlattr;
3495 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
/* TCA_OPTIONS for sfq is a raw struct tc_sfq_qopt. */
3497 sfq = nl_attr_get(nlattr);
3498 sfq_install__(netdev, sfq->quantum, sfq->perturb_period);
/* tc_destroy hook: frees the sfq state from sfq_install__(). */
3506 sfq_tc_destroy(struct tc *tc)
3508 struct sfq *sfq = CONTAINER_OF(tc, struct sfq, tc);
/* qdisc_get hook: reports the cached configuration into 'details'. */
3514 sfq_qdisc_get(const struct netdev *netdev, struct smap *details)
3516 const struct sfq *sfq = sfq_get__(netdev);
3517 smap_add_format(details, "quantum", "%u", sfq->quantum);
3518 smap_add_format(details, "perturb", "%u", sfq->perturb);
/* qdisc_set hook: re-parses 'details' and updates the cached state. */
3523 sfq_qdisc_set(struct netdev *netdev, const struct smap *details)
3527 sfq_parse_qdisc_details__(netdev, details, &sfq);
3528 sfq_install__(netdev, sfq.quantum, sfq.perturb);
3529 sfq_get__(netdev)->quantum = sfq.quantum;
3530 sfq_get__(netdev)->perturb = sfq.perturb;
/* tc_ops vtable for the "linux-sfq" QoS type. */
3534 static const struct tc_ops tc_ops_sfq = {
3535 "sfq", /* linux_name */
3536 "linux-sfq", /* ovs_name */
3537 SFQ_N_QUEUES, /* n_queues */
3550 /* HTB traffic control class. */
3552 #define HTB_N_QUEUES 0xf000
3553 #define HTB_RATE2QUANTUM 10
/* Qdisc-wide HTB state (fragment: struct header not shown). */
3557 unsigned int max_rate; /* In bytes/s. */
/* Per-queue HTB class parameters; embeds the generic tc_queue. */
3561 struct tc_queue tc_queue;
3562 unsigned int min_rate; /* In bytes/s. */
3563 unsigned int max_rate; /* In bytes/s. */
3564 unsigned int burst; /* In bytes. */
3565 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the struct htb embedded in 'netdev_''s tc state.  Only valid
 * while the active qdisc is htb. */
3569 htb_get__(const struct netdev *netdev_)
3571 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3572 return CONTAINER_OF(netdev->tc, struct htb, tc);
/* Records 'max_rate' in a fresh htb tc instance attached to 'netdev_'.
 * Local bookkeeping only; no kernel interaction. */
3576 htb_install__(struct netdev *netdev_, uint64_t max_rate)
3578 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3581 htb = xmalloc(sizeof *htb);
3582 tc_init(&htb->tc, &tc_ops_htb);
3583 htb->max_rate = max_rate;
3585 netdev->tc = &htb->tc;
3588 /* Create an HTB qdisc.
3590 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
3592 htb_setup_qdisc__(struct netdev *netdev)
3595 struct tc_htb_glob opt;
3596 struct ofpbuf request;
3597 struct tcmsg *tcmsg;
3599 tc_del_qdisc(netdev);
3601 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3602 NLM_F_EXCL | NLM_F_CREATE, &request);
3606 tcmsg->tcm_handle = tc_make_handle(1, 0);
3607 tcmsg->tcm_parent = TC_H_ROOT;
3609 nl_msg_put_string(&request, TCA_KIND, "htb");
3611 memset(&opt, 0, sizeof opt);
/* r2q: rate-to-quantum divisor used when a class has no explicit
 * quantum; see htb_setup_class__(). */
3612 opt.rate2quantum = HTB_RATE2QUANTUM;
3616 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3617 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
3618 nl_msg_end_nested(&request, opt_offset);
3620 return tc_transact(&request, NULL);
3623 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
3624 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
3626 htb_setup_class__(struct netdev *netdev, unsigned int handle,
3627 unsigned int parent, struct htb_class *class)
3630 struct tc_htb_opt opt;
3631 struct ofpbuf request;
3632 struct tcmsg *tcmsg;
/* HTB rate tables are computed relative to the MTU, so it is required. */
3636 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3638 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
3639 netdev_get_name(netdev));
3643 memset(&opt, 0, sizeof opt);
3644 tc_fill_rate(&opt.rate, class->min_rate, mtu);
3645 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
3646 /* Makes sure the quantum is at least MTU. Setting quantum will
3647 * make htb ignore the r2q for this class. */
3648 if ((class->min_rate / HTB_RATE2QUANTUM) < mtu) {
3651 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
3652 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
3653 opt.prio = class->priority;
3655 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3659 tcmsg->tcm_handle = handle;
3660 tcmsg->tcm_parent = parent;
3662 nl_msg_put_string(&request, TCA_KIND, "htb");
3663 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3664 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
/* The kernel wants explicit rate tables for both rate and ceil. */
3665 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
3666 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
3667 nl_msg_end_nested(&request, opt_offset);
3669 error = tc_transact(&request, NULL);
3671 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3672 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
3673 netdev_get_name(netdev),
3674 tc_get_major(handle), tc_get_minor(handle),
3675 tc_get_major(parent), tc_get_minor(parent),
3676 class->min_rate, class->max_rate,
3677 class->burst, class->priority, ovs_strerror(error));
3682 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
3683 * description of them into 'details'. The description complies with the
3684 * specification given in the vswitch database documentation for linux-htb
3687 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
3689 static const struct nl_policy tca_htb_policy[] = {
3690 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
3691 .min_len = sizeof(struct tc_htb_opt) },
3694 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
3695 const struct tc_htb_opt *htb;
3697 if (!nl_parse_nested(nl_options, tca_htb_policy,
3698 attrs, ARRAY_SIZE(tca_htb_policy))) {
3699 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
3703 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
3704 class->min_rate = htb->rate.rate;
3705 class->max_rate = htb->ceil.rate;
/* 'buffer' is in ticks; convert back to bytes at the class rate. */
3706 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
3707 class->priority = htb->prio;
/* Parses an RTM_NEWTCLASS-style message for an HTB class: extracts the
 * queue id from the class handle (minor - 1, under major 1), the class
 * options, and the queue statistics, each only when requested. */
3712 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3713 struct htb_class *options,
3714 struct netdev_queue_stats *stats)
3716 struct nlattr *nl_options;
3717 unsigned int handle;
3720 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3721 if (!error && queue_id) {
3722 unsigned int major = tc_get_major(handle);
3723 unsigned int minor = tc_get_minor(handle);
3724 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3725 *queue_id = minor - 1;
3730 if (!error && options) {
3731 error = htb_parse_tca_options__(nl_options, options);
/* Parses the qdisc-level "max-rate" (bits/s in the database, stored here as
 * bytes/s) from 'details'.  When absent, falls back to the link speed (or
 * 100 Mbps if features are unavailable). */
3737 htb_parse_qdisc_details__(struct netdev *netdev_,
3738 const struct smap *details, struct htb_class *hc)
3740 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3741 const char *max_rate_s;
3743 max_rate_s = smap_get(details, "max-rate");
3744 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3745 if (!hc->max_rate) {
3746 enum netdev_features current;
3748 netdev_linux_read_features(netdev);
3749 current = !netdev->get_features_error ? netdev->current : 0;
3750 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
/* The default class gets the whole link. */
3752 hc->min_rate = hc->max_rate;
/* Parses per-class "min-rate", "max-rate", "burst" and "priority" from
 * 'details' into 'hc', clamping rates into [mtu, qdisc max_rate]. */
3758 htb_parse_class_details__(struct netdev *netdev,
3759 const struct smap *details, struct htb_class *hc)
3761 const struct htb *htb = htb_get__(netdev);
3762 const char *min_rate_s = smap_get(details, "min-rate");
3763 const char *max_rate_s = smap_get(details, "max-rate");
3764 const char *burst_s = smap_get(details, "burst");
3765 const char *priority_s = smap_get(details, "priority");
3768 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3770 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
3771 netdev_get_name(netdev));
3775 /* HTB requires at least an mtu sized min-rate to send any traffic even
3776 * on uncongested links. */
3777 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3778 hc->min_rate = MAX(hc->min_rate, mtu);
3779 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
3782 hc->max_rate = (max_rate_s
3783 ? strtoull(max_rate_s, NULL, 10) / 8
3785 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
3786 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
3790 * According to hints in the documentation that I've read, it is important
3791 * that 'burst' be at least as big as the largest frame that might be
3792 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
3793 * but having it a bit too small is a problem. Since netdev_get_mtu()
3794 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
3795 * the MTU. We actually add 64, instead of 14, as a guard against
3796 * additional headers get tacked on somewhere that we're not aware of. */
3797 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
3798 hc->burst = MAX(hc->burst, mtu + 64);
3801 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Queries the kernel for the HTB class 'handle' with 'parent' on 'netdev'
 * and parses the reply into 'options' and/or 'stats' (either may be NULL).
 * Returns 0 if successful, otherwise a positive errno value. */
3807 htb_query_class__(const struct netdev *netdev, unsigned int handle,
3808 unsigned int parent, struct htb_class *options,
3809 struct netdev_queue_stats *stats)
3811 struct ofpbuf *reply;
3814 error = tc_query_class(netdev, handle, parent, &reply);
3816 error = htb_parse_tcmsg__(reply, NULL, options, stats);
3817 ofpbuf_delete(reply);
/* tc_install callback for linux-htb: creates the HTB qdisc, then its
 * default class 1:fffe sized from 'details', and installs the in-memory
 * state on success. */
3823 htb_tc_install(struct netdev *netdev, const struct smap *details)
3827 error = htb_setup_qdisc__(netdev);
3829 struct htb_class hc;
3831 htb_parse_qdisc_details__(netdev, details, &hc);
3832 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3833 tc_make_handle(1, 0), &hc);
3835 htb_install__(netdev, hc.max_rate);
/* Converts a generic 'tc_queue' pointer back to its containing
 * htb_class. */
3841 static struct htb_class *
3842 htb_class_cast__(const struct tc_queue *queue)
3844 return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Creates or refreshes the cached htb_class for 'queue_id', copying the
 * parameters from 'hc' into the entry in the qdisc-wide queue hmap. */
3848 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
3849 const struct htb_class *hc)
3851 struct htb *htb = htb_get__(netdev);
3852 size_t hash = hash_int(queue_id, 0);
3853 struct tc_queue *queue;
3854 struct htb_class *hcp;
3856 queue = tc_find_queue__(netdev, queue_id, hash);
3858 hcp = htb_class_cast__(queue);
/* First sighting of this queue: allocate and insert a cache entry. */
3860 hcp = xmalloc(sizeof *hcp);
3861 queue = &hcp->tc_queue;
3862 queue->queue_id = queue_id;
3863 queue->created = time_msec();
3864 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
3867 hcp->min_rate = hc->min_rate;
3868 hcp->max_rate = hc->max_rate;
3869 hcp->burst = hc->burst;
3870 hcp->priority = hc->priority;
/* tc_load callback: rebuilds in-memory HTB state from the kernel by
 * querying the default class 1:fffe and then dumping all classes. */
3874 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3877 struct queue_dump_state state;
3878 struct htb_class hc;
3880 /* Get qdisc options. */
3882 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3883 htb_install__(netdev, hc.max_rate);
/* Cache one entry per class reported by the kernel dump. */
3886 if (!start_queue_dump(netdev, &state)) {
3889 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
3890 unsigned int queue_id;
3892 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3893 htb_update_queue__(netdev, queue_id, &hc);
3896 finish_queue_dump(&state);
/* tc_destroy callback: removes and frees every cached htb_class. */
3902 htb_tc_destroy(struct tc *tc)
3904 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
3905 struct htb_class *hc, *next;
3907 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
3908 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* qdisc_get callback: reports "max-rate" in bit/s (stored bytes/s,
 * hence the "* 8"). */
3916 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
3918 const struct htb *htb = htb_get__(netdev);
3919 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
/* qdisc_set callback: reprograms the default class 1:fffe from
 * 'details' and records the new qdisc-wide max_rate on success. */
3924 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
3926 struct htb_class hc;
3929 htb_parse_qdisc_details__(netdev, details, &hc);
3930 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3931 tc_make_handle(1, 0), &hc);
3933 htb_get__(netdev)->max_rate = hc.max_rate;
/* class_get callback: exports the cached class parameters in bit/s.
 * "max-rate" is only emitted when it differs from "min-rate". */
3939 htb_class_get(const struct netdev *netdev OVS_UNUSED,
3940 const struct tc_queue *queue, struct smap *details)
3942 const struct htb_class *hc = htb_class_cast__(queue);
3944 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3945 if (hc->min_rate != hc->max_rate) {
3946 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3948 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
3950 smap_add_format(details, "priority", "%u", hc->priority);
/* class_set callback: validates 'details', programs kernel class
 * 1:(queue_id+1) under parent 1:fffe, then updates the local cache. */
3956 htb_class_set(struct netdev *netdev, unsigned int queue_id,
3957 const struct smap *details)
3959 struct htb_class hc;
3962 error = htb_parse_class_details__(netdev, details, &hc);
3967 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3968 tc_make_handle(1, 0xfffe), &hc);
3973 htb_update_queue__(netdev, queue_id, &hc);
/* class_delete callback: deletes kernel class 1:(queue_id+1) and, on
 * success, removes the cached entry. */
3978 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
3980 struct htb_class *hc = htb_class_cast__(queue);
3981 struct htb *htb = htb_get__(netdev);
3984 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3986 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* class_get_stats callback: fetches kernel statistics for the class
 * backing 'queue'. */
3993 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3994 struct netdev_queue_stats *stats)
3996 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3997 tc_make_handle(1, 0xfffe), NULL, stats);
/* class_dump_stats callback: parses one netlink class message and, when
 * its handle names a valid HTB queue (major 1, minor 1..HTB_N_QUEUES),
 * invokes 'cb' with the zero-based queue id. */
4001 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
4002 const struct ofpbuf *nlmsg,
4003 netdev_dump_queue_stats_cb *cb, void *aux)
4005 struct netdev_queue_stats stats;
4006 unsigned int handle, major, minor;
4009 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
4014 major = tc_get_major(handle);
4015 minor = tc_get_minor(handle);
4016 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
4017 (*cb)(minor - 1, &stats, aux);
/* tc_ops vtable for the "linux-htb" QoS type.
 * NOTE(review): several member lines are missing from this extract. */
4022 static const struct tc_ops tc_ops_htb = {
4023 "htb", /* linux_name */
4024 "linux-htb", /* ovs_name */
4025 HTB_N_QUEUES, /* n_queues */
4034 htb_class_get_stats,
4035 htb_class_dump_stats
4038 /* "linux-hfsc" traffic control class. */
4040 #define HFSC_N_QUEUES 0xf000
4048 struct tc_queue tc_queue;
/* Returns the struct hfsc embedding 'netdev_'s current tc state. */
4053 static struct hfsc *
4054 hfsc_get__(const struct netdev *netdev_)
4056 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4057 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
/* Converts a generic 'tc_queue' pointer back to its containing
 * hfsc_class. */
4060 static struct hfsc_class *
4061 hfsc_class_cast__(const struct tc_queue *queue)
4063 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Allocates and installs a struct hfsc as 'netdev_'s tc state,
 * recording the qdisc-wide 'max_rate' (bytes/s). */
4067 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
4069 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4072 hfsc = xmalloc(sizeof *hfsc);
4073 tc_init(&hfsc->tc, &tc_ops_hfsc);
4074 hfsc->max_rate = max_rate;
4075 netdev->tc = &hfsc->tc;
/* Creates or refreshes the cached hfsc_class for 'queue_id' with the
 * rates from 'hc'. */
4079 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
4080 const struct hfsc_class *hc)
4084 struct hfsc_class *hcp;
4085 struct tc_queue *queue;
4087 hfsc = hfsc_get__(netdev);
4088 hash = hash_int(queue_id, 0);
4090 queue = tc_find_queue__(netdev, queue_id, hash);
4092 hcp = hfsc_class_cast__(queue);
/* First sighting of this queue: allocate and insert a cache entry. */
4094 hcp = xmalloc(sizeof *hcp);
4095 queue = &hcp->tc_queue;
4096 queue->queue_id = queue_id;
4097 queue->created = time_msec();
4098 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
4101 hcp->min_rate = hc->min_rate;
4102 hcp->max_rate = hc->max_rate;
/* Parses the TCA_OPTIONS of an HFSC class into 'class'.  Only linear
 * service curves (m1 == 0, d == 0) are accepted; min_rate is taken from
 * the link-share curve (fsc) and max_rate from the upper-limit curve
 * (usc).  NOTE(review): the policy array's index lines are missing from
 * this extract — presumably [TCA_HFSC_RSC/FSC/USC]; verify upstream. */
4106 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
4108 const struct tc_service_curve *rsc, *fsc, *usc;
4109 static const struct nl_policy tca_hfsc_policy[] = {
4111 .type = NL_A_UNSPEC,
4113 .min_len = sizeof(struct tc_service_curve),
4116 .type = NL_A_UNSPEC,
4118 .min_len = sizeof(struct tc_service_curve),
4121 .type = NL_A_UNSPEC,
4123 .min_len = sizeof(struct tc_service_curve),
4126 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
4128 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
4129 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
4130 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
4134 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
4135 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
4136 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
/* Reject configurations OVS itself would not have written. */
4138 if (rsc->m1 != 0 || rsc->d != 0 ||
4139 fsc->m1 != 0 || fsc->d != 0 ||
4140 usc->m1 != 0 || usc->d != 0) {
4141 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4142 "Non-linear service curves are not supported.");
4146 if (rsc->m2 != fsc->m2) {
4147 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4148 "Real-time service curves are not supported ");
4152 if (rsc->m2 > usc->m2) {
4153 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4154 "Min-rate service curve is greater than "
4155 "the max-rate service curve.");
4159 class->min_rate = fsc->m2;
4160 class->max_rate = usc->m2;
/* Parses a netlink class message: extracts the zero-based queue id from
 * the class handle (major must be 1, minor 1..HFSC_N_QUEUES), the HFSC
 * options, and statistics.  Any of the output pointers may be NULL. */
4165 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
4166 struct hfsc_class *options,
4167 struct netdev_queue_stats *stats)
4170 unsigned int handle;
4171 struct nlattr *nl_options;
4173 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
4179 unsigned int major, minor;
4181 major = tc_get_major(handle);
4182 minor = tc_get_minor(handle);
4183 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
4184 *queue_id = minor - 1;
4191 error = hfsc_parse_tca_options__(nl_options, options);
/* Queries kernel HFSC class 'handle' with 'parent' and parses the reply
 * into 'options' and/or 'stats' (either may be NULL).  Returns 0 or a
 * positive errno value. */
4198 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
4199 unsigned int parent, struct hfsc_class *options,
4200 struct netdev_queue_stats *stats)
4203 struct ofpbuf *reply;
4205 error = tc_query_class(netdev, handle, parent, &reply);
4210 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
4211 ofpbuf_delete(reply);
/* Parses qdisc-level settings for linux-hfsc: "max-rate" in bit/s is
 * stored as bytes/s, defaulting to the detected link speed (or 100 Mbps
 * when unknown).  Both min_rate and max_rate of 'class' get the value. */
4216 hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
4217 struct hfsc_class *class)
4219 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4221 const char *max_rate_s;
4223 max_rate_s = smap_get(details, "max-rate");
4224 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
4227 enum netdev_features current;
4229 netdev_linux_read_features(netdev);
4230 current = !netdev->get_features_error ? netdev->current : 0;
4231 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
4234 class->min_rate = max_rate;
4235 class->max_rate = max_rate;
/* Parses per-queue HFSC settings from 'details': min-rate is clamped to
 * [1, qdisc max_rate] and max-rate to [min-rate, qdisc max_rate].  All
 * rates in 'details' are bit/s; internal storage is bytes/s. */
4239 hfsc_parse_class_details__(struct netdev *netdev,
4240 const struct smap *details,
4241 struct hfsc_class * class)
4243 const struct hfsc *hfsc;
4244 uint32_t min_rate, max_rate;
4245 const char *min_rate_s, *max_rate_s;
4247 hfsc = hfsc_get__(netdev);
4248 min_rate_s = smap_get(details, "min-rate");
4249 max_rate_s = smap_get(details, "max-rate");
4251 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
4252 min_rate = MAX(min_rate, 1);
4253 min_rate = MIN(min_rate, hfsc->max_rate);
4255 max_rate = (max_rate_s
4256 ? strtoull(max_rate_s, NULL, 10) / 8
4258 max_rate = MAX(max_rate, min_rate);
4259 max_rate = MIN(max_rate, hfsc->max_rate);
4261 class->min_rate = min_rate;
4262 class->max_rate = max_rate;
4267 /* Create an HFSC qdisc.
4269 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
4271 hfsc_setup_qdisc__(struct netdev * netdev)
4273 struct tcmsg *tcmsg;
4274 struct ofpbuf request;
4275 struct tc_hfsc_qopt opt;
/* Remove any existing root qdisc first; its return value is ignored. */
4277 tc_del_qdisc(netdev);
4279 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
4280 NLM_F_EXCL | NLM_F_CREATE, &request);
4286 tcmsg->tcm_handle = tc_make_handle(1, 0);
4287 tcmsg->tcm_parent = TC_H_ROOT;
4289 memset(&opt, 0, sizeof opt);
4292 nl_msg_put_string(&request, TCA_KIND, "hfsc");
4293 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
4295 return tc_transact(&request, NULL);
4298 /* Create an HFSC class.
4300 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
4301 * sc rate <min_rate> ul rate <max_rate>" */
4303 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
4304 unsigned int parent, struct hfsc_class *class)
4308 struct tcmsg *tcmsg;
4309 struct ofpbuf request;
4310 struct tc_service_curve min, max;
4312 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
4318 tcmsg->tcm_handle = handle;
4319 tcmsg->tcm_parent = parent;
/* Only the m2 slope is visibly set here; presumably m1/d are zeroed on
 * lines missing from this extract — TODO confirm against upstream. */
4323 min.m2 = class->min_rate;
4327 max.m2 = class->max_rate;
/* 'min' serves as both the real-time (RSC) and link-share (FSC) curves;
 * 'max' is the upper-limit (USC) curve. */
4329 nl_msg_put_string(&request, TCA_KIND, "hfsc");
4330 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4331 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
4332 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
4333 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
4334 nl_msg_end_nested(&request, opt_offset);
4336 error = tc_transact(&request, NULL);
4338 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
4339 "min-rate %ubps, max-rate %ubps (%s)",
4340 netdev_get_name(netdev),
4341 tc_get_major(handle), tc_get_minor(handle),
4342 tc_get_major(parent), tc_get_minor(parent),
4343 class->min_rate, class->max_rate, ovs_strerror(error));
/* tc_install callback for linux-hfsc: creates the qdisc, then the
 * default class 1:fffe from 'details', and installs in-memory state. */
4350 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
4353 struct hfsc_class class;
4355 error = hfsc_setup_qdisc__(netdev);
4361 hfsc_parse_qdisc_details__(netdev, details, &class);
4362 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4363 tc_make_handle(1, 0), &class);
4369 hfsc_install__(netdev, class.max_rate);
/* tc_load callback: rebuilds in-memory HFSC state from the kernel by
 * querying class 1:fffe and dumping all existing classes. */
4374 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4377 struct queue_dump_state state;
4378 struct hfsc_class hc;
4381 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
4382 hfsc_install__(netdev, hc.max_rate);
4384 if (!start_queue_dump(netdev, &state)) {
4388 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
4389 unsigned int queue_id;
4391 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
4392 hfsc_update_queue__(netdev, queue_id, &hc);
4396 finish_queue_dump(&state);
/* tc_destroy callback: removes and frees every cached hfsc_class. */
4401 hfsc_tc_destroy(struct tc *tc)
4404 struct hfsc_class *hc, *next;
4406 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
4408 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
4409 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* qdisc_get callback: reports "max-rate" in bit/s (stored bytes/s). */
4418 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
4420 const struct hfsc *hfsc;
4421 hfsc = hfsc_get__(netdev);
4422 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
/* qdisc_set callback: reprograms default class 1:fffe from 'details'
 * and records the new qdisc-wide max_rate on success. */
4427 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
4430 struct hfsc_class class;
4432 hfsc_parse_qdisc_details__(netdev, details, &class);
4433 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4434 tc_make_handle(1, 0), &class);
4437 hfsc_get__(netdev)->max_rate = class.max_rate;
/* class_get callback: exports cached rates in bit/s; "max-rate" only
 * when it differs from "min-rate". */
4444 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
4445 const struct tc_queue *queue, struct smap *details)
4447 const struct hfsc_class *hc;
4449 hc = hfsc_class_cast__(queue);
4450 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
4451 if (hc->min_rate != hc->max_rate) {
4452 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
/* class_set callback: validates 'details', programs kernel class
 * 1:(queue_id+1) under parent 1:fffe, then updates the local cache. */
4458 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
4459 const struct smap *details)
4462 struct hfsc_class class;
4464 error = hfsc_parse_class_details__(netdev, details, &class);
4469 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
4470 tc_make_handle(1, 0xfffe), &class);
4475 hfsc_update_queue__(netdev, queue_id, &class);
/* class_delete callback: deletes kernel class 1:(queue_id+1) and, on
 * success, removes the cached entry. */
4480 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
4484 struct hfsc_class *hc;
4486 hc = hfsc_class_cast__(queue);
4487 hfsc = hfsc_get__(netdev);
4489 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
4491 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* class_get_stats callback: fetches kernel statistics for the class
 * backing 'queue'. */
4498 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
4499 struct netdev_queue_stats *stats)
4501 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
4502 tc_make_handle(1, 0xfffe), NULL, stats);
/* class_dump_stats callback: reports stats for handles of the form 1:n
 * with 1 <= n <= HFSC_N_QUEUES, passing the zero-based queue id to 'cb'. */
4506 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
4507 const struct ofpbuf *nlmsg,
4508 netdev_dump_queue_stats_cb *cb, void *aux)
4510 struct netdev_queue_stats stats;
4511 unsigned int handle, major, minor;
4514 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
4519 major = tc_get_major(handle);
4520 minor = tc_get_minor(handle);
4521 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
4522 (*cb)(minor - 1, &stats, aux);
/* tc_ops vtable for the "linux-hfsc" QoS type. */
4527 static const struct tc_ops tc_ops_hfsc = {
4528 "hfsc", /* linux_name */
4529 "linux-hfsc", /* ovs_name */
4530 HFSC_N_QUEUES, /* n_queues */
4531 hfsc_tc_install, /* tc_install */
4532 hfsc_tc_load, /* tc_load */
4533 hfsc_tc_destroy, /* tc_destroy */
4534 hfsc_qdisc_get, /* qdisc_get */
4535 hfsc_qdisc_set, /* qdisc_set */
4536 hfsc_class_get, /* class_get */
4537 hfsc_class_set, /* class_set */
4538 hfsc_class_delete, /* class_delete */
4539 hfsc_class_get_stats, /* class_get_stats */
4540 hfsc_class_dump_stats /* class_dump_stats */
4543 /* "linux-default" traffic control class.
4545 * This class represents the default, unnamed Linux qdisc. It corresponds to
4546 * the "" (empty string) QoS type in the OVS database. */
/* Installs the shared, immutable "default qdisc" tc state on 'netdev_'.
 * A single static const object serves every netdev using this class. */
4549 default_install__(struct netdev *netdev_)
4551 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4552 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
4554 /* Nothing but a tc class implementation is allowed to write to a tc. This
4555 * class never does that, so we can legitimately use a const tc object. */
4556 netdev->tc = CONST_CAST(struct tc *, &tc);
/* tc_install callback for the default qdisc: no kernel change needed,
 * just install the shared in-memory state. */
4560 default_tc_install(struct netdev *netdev,
4561 const struct smap *details OVS_UNUSED)
4563 default_install__(netdev);
/* tc_load callback for the default qdisc. */
4568 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4570 default_install__(netdev);
/* tc_ops vtable for the "" (default) QoS type: queue operations are all
 * NULL because the default qdisc exposes no configurable queues.
 * NOTE(review): some member lines are missing from this extract. */
4574 static const struct tc_ops tc_ops_default = {
4575 NULL, /* linux_name */
4580 NULL, /* tc_destroy */
4581 NULL, /* qdisc_get */
4582 NULL, /* qdisc_set */
4583 NULL, /* class_get */
4584 NULL, /* class_set */
4585 NULL, /* class_delete */
4586 NULL, /* class_get_stats */
4587 NULL /* class_dump_stats */
4590 /* "linux-other" traffic control class.
4595 other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
4597 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4598 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
4600 /* Nothing but a tc class implementation is allowed to write to a tc. This
4601 * class never does that, so we can legitimately use a const tc object. */
4602 netdev->tc = CONST_CAST(struct tc *, &tc);
/* tc_ops vtable for "linux-other": load-only; nothing is configurable. */
4606 static const struct tc_ops tc_ops_other = {
4607 NULL, /* linux_name */
4608 "linux-other", /* ovs_name */
4610 NULL, /* tc_install */
4612 NULL, /* tc_destroy */
4613 NULL, /* qdisc_get */
4614 NULL, /* qdisc_set */
4615 NULL, /* class_get */
4616 NULL, /* class_set */
4617 NULL, /* class_delete */
4618 NULL, /* class_get_stats */
4619 NULL /* class_dump_stats */
4622 /* Traffic control. */
4624 /* Number of kernel "tc" ticks per second. */
4625 static double ticks_per_s;
4627 /* Number of kernel "jiffies" per second. This is used for the purpose of
4628 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
4629 * one jiffy's worth of data.
4631 * There are two possibilities here:
4633 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
4634 * approximate range of 100 to 1024. That means that we really need to
4635 * make sure that the qdisc can buffer that much data.
4637 * - 'buffer_hz' is an absurdly large number. That means that the kernel
4638 * has finely granular timers and there's no need to fudge additional room
4639 * for buffers. (There's no extra effort needed to implement that: the
4640 * large 'buffer_hz' is used as a divisor, so practically any number will
4641 * come out as 0 in the division. Small integer results in the case of
4642 * really high dividends won't have any real effect anyhow.)
4644 static unsigned int buffer_hz;
/* Returns tc handle 'major':'minor', i.e. 'major' in the upper 16 bits
 * and 'minor' in the lower 16 bits, as used in tcm_handle/tcm_parent. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    return TC_H_MAKE(major << 16, minor);
}
/* Returns the major number (upper 16 bits) from tc handle 'handle'. */
static unsigned int
tc_get_major(unsigned int handle)
{
    return TC_H_MAJ(handle) >> 16;
}
/* Returns the minor number (lower 16 bits) from tc handle 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    return TC_H_MIN(handle);
}
/* Allocates and initializes in 'request' an rtnetlink tc message of
 * 'type' (e.g. RTM_NEWQDISC) for 'netdev', returning the embedded tcmsg
 * whose tcm_handle/tcm_parent the caller must fill in.  Presumably
 * returns NULL when the ifindex lookup fails — the error path lines are
 * missing from this extract. */
4667 static struct tcmsg *
4668 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
4669 struct ofpbuf *request)
4671 struct tcmsg *tcmsg;
4675 error = get_ifindex(netdev, &ifindex);
4680 ofpbuf_init(request, 512);
4681 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
4682 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
4683 tcmsg->tcm_family = AF_UNSPEC;
4684 tcmsg->tcm_ifindex = ifindex;
4685 /* Caller should fill in tcmsg->tcm_handle. */
4686 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' on the NETLINK_ROUTE channel and frees its buffer;
 * the reply, if 'replyp' is nonnull, is stored in '*replyp' for the
 * caller to free.  Returns 0 or a positive errno value. */
4692 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
4694 int error = nl_transact(NETLINK_ROUTE, request, replyp);
4695 ofpbuf_uninit(request);
4699 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
4700 * policing configuration.
4702 * This function is equivalent to running the following when 'add' is true:
4703 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
4705 * This function is equivalent to running the following when 'add' is false:
4706 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
4708 * The configuration and stats may be seen with the following command:
4709 * /sbin/tc -s qdisc show dev <devname>
4711 * Returns 0 if successful, otherwise a positive errno value.
4714 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
4716 struct ofpbuf request;
4717 struct tcmsg *tcmsg;
4719 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
4720 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
4722 tcmsg = tc_make_request(netdev, type, flags, &request);
/* ffff: is the conventional handle for the ingress qdisc. */
4726 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
4727 tcmsg->tcm_parent = TC_H_INGRESS;
4728 nl_msg_put_string(&request, TCA_KIND, "ingress");
4729 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
4731 error = tc_transact(&request, NULL);
4733 /* If we're deleting the qdisc, don't worry about some of the
4734 * error conditions. */
4735 if (!add && (error == ENOENT || error == EINVAL)) {
4744 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
4747 * This function is equivalent to running:
4748 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
4749 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
4752 * The configuration and stats may be seen with the following command:
4753 * /sbin/tc -s filter show dev <devname> parent ffff:
4755 * Returns 0 if successful, otherwise a positive errno value.
4758 tc_add_policer(struct netdev *netdev,
4759 uint32_t kbits_rate, uint32_t kbits_burst)
4761 struct tc_police tc_police;
4762 struct ofpbuf request;
4763 struct tcmsg *tcmsg;
4764 size_t basic_offset;
4765 size_t police_offset;
/* NOTE(review): the declaration/initialization of 'mtu' is on a line
 * missing from this extract — verify against upstream. */
4769 memset(&tc_police, 0, sizeof tc_police);
4770 tc_police.action = TC_POLICE_SHOT;
4771 tc_police.mtu = mtu;
4772 tc_fill_rate(&tc_police.rate, ((uint64_t) kbits_rate * 1000)/8, mtu);
4774 /* The following appears wrong in two ways:
4776 * - tc_bytes_to_ticks() should take "bytes" as quantity for both of its
4777 * arguments (or at least consistently "bytes" as both or "bits" as
4778 * both), but this supplies bytes for the first argument and bits for the
4781 * - In networking a kilobit is usually 1000 bits but this uses 1024 bits.
4783 * However if you "fix" those problems then "tc filter show ..." shows
4784 * "125000b", meaning 125,000 bits, when OVS configures it for 1000 kbit ==
4785 * 1,000,000 bits, whereas this actually ends up doing the right thing from
4786 * tc's point of view. Whatever. */
4787 tc_police.burst = tc_bytes_to_ticks(
4788 tc_police.rate.rate, MIN(UINT32_MAX / 1024, kbits_burst) * 1024);
4790 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
4791 NLM_F_EXCL | NLM_F_CREATE, &request);
/* Attach the filter to the ingress qdisc (handle ffff:) at prio 49,
 * matching all protocols. */
4795 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
4796 tcmsg->tcm_info = tc_make_handle(49,
4797 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
4799 nl_msg_put_string(&request, TCA_KIND, "basic");
4800 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4801 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
4802 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
4803 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
4804 nl_msg_end_nested(&request, police_offset);
4805 nl_msg_end_nested(&request, basic_offset);
4807 error = tc_transact(&request, NULL);
/* NOTE(review): the function signature is on lines missing from this
 * extract; based on the body this looks like the one-time reader of
 * /proc/net/psched that initializes 'ticks_per_s' and 'buffer_hz' —
 * confirm against upstream. */
4818 /* The values in psched are not individually very meaningful, but they are
4819 * important. The tables below show some values seen in the wild.
4823 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
4824 * (Before that, there are hints that it was 1000000000.)
4826 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
4830 * -----------------------------------
4831 * [1] 000c8000 000f4240 000f4240 00000064
4832 * [2] 000003e8 00000400 000f4240 3b9aca00
4833 * [3] 000003e8 00000400 000f4240 3b9aca00
4834 * [4] 000003e8 00000400 000f4240 00000064
4835 * [5] 000003e8 00000040 000f4240 3b9aca00
4836 * [6] 000003e8 00000040 000f4240 000000f9
4838 * a b c d ticks_per_s buffer_hz
4839 * ------- --------- ---------- ------------- ----------- -------------
4840 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
4841 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4842 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4843 * [4] 1,000 1,024 1,000,000 100 976,562 100
4844 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
4845 * [6] 1,000 64 1,000,000 249 15,625,000 249
4847 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
4848 * [2] 2.6.26-1-686-bigmem from Debian lenny
4849 * [3] 2.6.26-2-sparc64 from Debian lenny
4850 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
4851 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
4852 * [6] 2.6.34 from kernel.org on KVM
4854 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4855 static const char fn[] = "/proc/net/psched";
4856 unsigned int a, b, c, d;
/* Only the first caller does the work; later callers return early. */
4859 if (!ovsthread_once_start(&once)) {
4866 stream = fopen(fn, "r");
4868 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
4872 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
4873 VLOG_WARN("%s: read failed", fn);
4877 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
4881 VLOG_WARN("%s: invalid scheduler parameters", fn);
/* ticks_per_s is derived from the first three psched fields. */
4885 ticks_per_s = (double) a * c / b;
4889 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
4892 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
4895 ovsthread_once_done(&once);
4898 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
4899 * rate of 'rate' bytes per second. */
/* Depends on 'ticks_per_s' being initialized from /proc/net/psched. */
4901 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
4904 return (rate * ticks) / ticks_per_s;
4907 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
4908 * rate of 'rate' bytes per second. */
/* Returns 0 for a zero 'rate' to avoid dividing by zero. */
4910 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
4913 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
4916 /* Returns the number of bytes that need to be reserved for qdisc buffering at
4917 * a transmission rate of 'rate' bytes per second. */
/* See the comment on 'buffer_hz' above for why a huge divisor is fine. */
4919 tc_buffer_per_jiffy(unsigned int rate)
4922 return rate / buffer_hz;
4925 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
4926 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
4927 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
4928 * stores NULL into it if it is absent.
4930 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
4933 * Returns 0 if successful, otherwise a positive errno value. */
4935 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
4936 struct nlattr **options)
/* TCA_KIND is mandatory; TCA_OPTIONS may legitimately be absent. */
4938 static const struct nl_policy tca_policy[] = {
4939 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
4940 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
4942 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4944 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4945 tca_policy, ta, ARRAY_SIZE(ta))) {
4946 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
4951 *kind = nl_attr_get_string(ta[TCA_KIND]);
4955 *options = ta[TCA_OPTIONS];
4970 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
4971 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
4972 * into '*options', and its queue statistics into '*stats'. Any of the output
4973 * arguments may be null.
4975 * Returns 0 if successful, otherwise a positive errno value. */
4977 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
4978 struct nlattr **options, struct netdev_queue_stats *stats)
4980 static const struct nl_policy tca_policy[] = {
4981 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
4982 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
4984 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4986 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4987 tca_policy, ta, ARRAY_SIZE(ta))) {
4988 VLOG_WARN_RL(&rl, "failed to parse class message");
/* The class handle lives in the fixed-size tcmsg header. */
4993 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
4994 *handlep = tc->tcm_handle;
4998 *options = ta[TCA_OPTIONS];
/* Extract byte/packet/drop counters from the nested TCA_STATS2. */
5002 const struct gnet_stats_queue *gsq;
5003 struct gnet_stats_basic gsb;
5005 static const struct nl_policy stats_policy[] = {
5006 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
5007 .min_len = sizeof gsb },
5008 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
5009 .min_len = sizeof *gsq },
5011 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
5013 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
5014 sa, ARRAY_SIZE(sa))) {
5015 VLOG_WARN_RL(&rl, "failed to parse class stats");
5019 /* Alignment issues screw up the length of struct gnet_stats_basic on
5020 * some arch/bitsize combinations. Newer versions of Linux have a
5021 * struct gnet_stats_basic_packed, but we can't depend on that. The
5022 * easiest thing to do is just to make a copy. */
5023 memset(&gsb, 0, sizeof gsb);
5024 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
5025 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
5026 stats->tx_bytes = gsb.bytes;
5027 stats->tx_packets = gsb.packets;
5029 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
5030 stats->tx_errors = gsq->drops;
5040 memset(stats, 0, sizeof *stats);
5045 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
/* On success the netlink reply is stored in '*replyp' for the caller to
 * free; on failure a rate-limited warning is logged.  Returns 0 or a
 * positive errno value. */
5048 tc_query_class(const struct netdev *netdev,
5049 unsigned int handle, unsigned int parent,
5050 struct ofpbuf **replyp)
5052 struct ofpbuf request;
5053 struct tcmsg *tcmsg;
/* NLM_F_ECHO asks the kernel to echo the class back in the reply. */
5056 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
5060 tcmsg->tcm_handle = handle;
5061 tcmsg->tcm_parent = parent;
5063 error = tc_transact(&request, replyp);
5065 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
5066 netdev_get_name(netdev),
5067 tc_get_major(handle), tc_get_minor(handle),
5068 tc_get_major(parent), tc_get_minor(parent),
5069 ovs_strerror(error));
5074 /* Equivalent to "tc class del dev <name> handle <handle>". */
/* Logs a rate-limited warning on failure.  Returns 0 or a positive
 * errno value. */
5076 tc_delete_class(const struct netdev *netdev, unsigned int handle)
5078 struct ofpbuf request;
5079 struct tcmsg *tcmsg;
5082 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
5086 tcmsg->tcm_handle = handle;
5087 tcmsg->tcm_parent = 0;
5089 error = tc_transact(&request, NULL);
5091 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
5092 netdev_get_name(netdev),
5093 tc_get_major(handle), tc_get_minor(handle),
5094 ovs_strerror(error));
5099 /* Equivalent to "tc qdisc del dev <name> root". */
/* Also tears down OVS's cached tc state for the device. */
5101 tc_del_qdisc(struct netdev *netdev_)
5103 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5104 struct ofpbuf request;
5105 struct tcmsg *tcmsg;
5108 tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
5112 tcmsg->tcm_handle = tc_make_handle(1, 0);
5113 tcmsg->tcm_parent = TC_H_ROOT;
5115 error = tc_transact(&request, NULL);
5116 if (error == EINVAL) {
5117 /* EINVAL probably means that the default qdisc was in use, in which
5118 * case we've accomplished our purpose. */
/* Drop any in-memory tc state so it will be reloaded lazily. */
5121 if (!error && netdev->tc) {
5122 if (netdev->tc->ops->tc_destroy) {
5123 netdev->tc->ops->tc_destroy(netdev->tc);
/* Returns whether RTM_GETQDISC is safe to use on this kernel.  Kernels
 * before 2.6.35 lack commit 53b0f08 ("net_sched: Fix qdisc_notify()")
 * and can OOPS on such requests; see the comment in tc_query_qdisc().
 * The check runs once and the result is cached. */
5131 getqdisc_is_safe(void)
5133 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5134 static bool safe = false;
5136 if (ovsthread_once_start(&once)) {
5137 struct utsname utsname;
5140 if (uname(&utsname) == -1) {
5141 VLOG_WARN("uname failed (%s)", ovs_strerror(errno));
5142 } else if (!ovs_scan(utsname.release, "%d.%d", &major, &minor)) {
5143 VLOG_WARN("uname reported bad OS release (%s)", utsname.release);
5144 } else if (major < 2 || (major == 2 && minor < 35)) {
5145 VLOG_INFO("disabling unsafe RTM_GETQDISC in Linux kernel %s",
5150 ovsthread_once_done(&once);
5155 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
5156 * kernel to determine what they are. Returns 0 if successful, otherwise a
5157 * positive errno value. */
5159 tc_query_qdisc(const struct netdev *netdev_)
5161 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5162 struct ofpbuf request, *qdisc;
5163 const struct tc_ops *ops;
5164 struct tcmsg *tcmsg;
5172 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
5173 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
5174 * 2.6.35 without that fix backported to it.
5176 * To avoid the OOPS, we must not make a request that would attempt to dump
5177 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
5178 * few others. There are a few ways that I can see to do this, but most of
5179 * them seem to be racy (and if you lose the race the kernel OOPSes). The
5180 * technique chosen here is to assume that any non-default qdisc that we
5181 * create will have a class with handle 1:0. The built-in qdiscs only have
5182 * a class with handle 0:0.
5184 * On Linux 2.6.35+ we use the straightforward method because it allows us
5185 * to handle non-builtin qdiscs without handle 1:0 (e.g. codel). However,
5186 * in such a case we get no response at all from the kernel (!) if a
5187 * builtin qdisc is in use (which is later caught by "!error &&
5188 * !qdisc->size"). */
5189 tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
5193 tcmsg->tcm_handle = tc_make_handle(getqdisc_is_safe() ? 0 : 1, 0);
5194 tcmsg->tcm_parent = getqdisc_is_safe() ? TC_H_ROOT : 0;
5196 /* Figure out what tc class to instantiate. */
5197 error = tc_transact(&request, &qdisc);
5198 if (!error && qdisc->size) {
5201 error = tc_parse_qdisc(qdisc, &kind, NULL);
5203 ops = &tc_ops_other;
5205 ops = tc_lookup_linux_name(kind);
5207 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
5208 VLOG_DBG_RL(&rl2, "unknown qdisc \"%s\"", kind);
5210 ops = &tc_ops_other;
5213 } else if ((!error && !qdisc->size) || error == ENOENT) {
5214 /* Either it's a built-in qdisc, or (on Linux pre-2.6.35) it's a qdisc
5215 * set up by some other entity that doesn't have a handle 1:0. We will
5216 * assume that it's the system default qdisc. */
5217 ops = &tc_ops_default;
5220 /* Who knows? Maybe the device got deleted. */
5221 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
5222 netdev_get_name(netdev_), ovs_strerror(error));
5223 ops = &tc_ops_other;
5226 /* Instantiate it. */
5227 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
5228 ovs_assert((load_error == 0) == (netdev->tc != NULL));
5229 ofpbuf_delete(qdisc);
5231 return error ? error : load_error;
5234 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
5235 approximate the time to transmit packets of various lengths. For an MTU of
5236 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
5237 represents two possible packet lengths; for a MTU of 513 through 1024, four
5238 possible lengths; and so on.
5240 Returns, for the specified 'mtu', the number of bits that packet lengths
5241 need to be shifted right to fit within such a 256-entry table. */
5243 tc_calc_cell_log(unsigned int mtu)
5248 mtu = ETH_PAYLOAD_MAX;
5250 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
5252 for (cell_log = 0; mtu >= 256; cell_log++) {
5259 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
5262 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
5264 memset(rate, 0, sizeof *rate);
5265 rate->cell_log = tc_calc_cell_log(mtu);
5266 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
5267 /* rate->cell_align = 0; */ /* distro headers. */
5268 rate->mpu = ETH_TOTAL_MIN;
/* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
 * attribute of the specified "type".
 *
 * See tc_calc_cell_log() above for a description of "rtab"s. */
static void
tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
{
    uint32_t *rtab;
    unsigned int i;

    rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
    for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
        /* Entry 'i' covers packets up to (i + 1) << cell_log bytes, but never
         * below the minimum policed unit. */
        unsigned packet_size = (i + 1) << rate->cell_log;
        if (packet_size < rate->mpu) {
            packet_size = rate->mpu;
        }
        rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
    }
}
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * 0 is fine.) */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    /* The burst must at least cover one jiffy's worth of traffic plus one
     * full-size packet. */
    unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
    return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
}
5303 /* Linux-only functions declared in netdev-linux.h */
5305 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
5306 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
5308 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
5309 const char *flag_name, bool enable)
5311 const char *netdev_name = netdev_get_name(netdev);
5312 struct ethtool_value evalue;
5316 COVERAGE_INC(netdev_get_ethtool);
5317 memset(&evalue, 0, sizeof evalue);
5318 error = netdev_linux_do_ethtool(netdev_name,
5319 (struct ethtool_cmd *)&evalue,
5320 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5325 COVERAGE_INC(netdev_set_ethtool);
5326 new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
5327 if (new_flags == evalue.data) {
5330 evalue.data = new_flags;
5331 error = netdev_linux_do_ethtool(netdev_name,
5332 (struct ethtool_cmd *)&evalue,
5333 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
5338 COVERAGE_INC(netdev_get_ethtool);
5339 memset(&evalue, 0, sizeof evalue);
5340 error = netdev_linux_do_ethtool(netdev_name,
5341 (struct ethtool_cmd *)&evalue,
5342 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5347 if (new_flags != evalue.data) {
5348 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
5349 "device %s failed", enable ? "enable" : "disable",
5350 flag_name, netdev_name);
5357 /* Utility functions. */
5359 /* Copies 'src' into 'dst', performing format conversion in the process. */
5361 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
5362 const struct rtnl_link_stats *src)
5364 dst->rx_packets = src->rx_packets;
5365 dst->tx_packets = src->tx_packets;
5366 dst->rx_bytes = src->rx_bytes;
5367 dst->tx_bytes = src->tx_bytes;
5368 dst->rx_errors = src->rx_errors;
5369 dst->tx_errors = src->tx_errors;
5370 dst->rx_dropped = src->rx_dropped;
5371 dst->tx_dropped = src->tx_dropped;
5372 dst->multicast = src->multicast;
5373 dst->collisions = src->collisions;
5374 dst->rx_length_errors = src->rx_length_errors;
5375 dst->rx_over_errors = src->rx_over_errors;
5376 dst->rx_crc_errors = src->rx_crc_errors;
5377 dst->rx_frame_errors = src->rx_frame_errors;
5378 dst->rx_fifo_errors = src->rx_fifo_errors;
5379 dst->rx_missed_errors = src->rx_missed_errors;
5380 dst->tx_aborted_errors = src->tx_aborted_errors;
5381 dst->tx_carrier_errors = src->tx_carrier_errors;
5382 dst->tx_fifo_errors = src->tx_fifo_errors;
5383 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
5384 dst->tx_window_errors = src->tx_window_errors;
5387 /* Copies 'src' into 'dst', performing format conversion in the process. */
5389 netdev_stats_from_rtnl_link_stats64(struct netdev_stats *dst,
5390 const struct rtnl_link_stats64 *src)
5392 dst->rx_packets = src->rx_packets;
5393 dst->tx_packets = src->tx_packets;
5394 dst->rx_bytes = src->rx_bytes;
5395 dst->tx_bytes = src->tx_bytes;
5396 dst->rx_errors = src->rx_errors;
5397 dst->tx_errors = src->tx_errors;
5398 dst->rx_dropped = src->rx_dropped;
5399 dst->tx_dropped = src->tx_dropped;
5400 dst->multicast = src->multicast;
5401 dst->collisions = src->collisions;
5402 dst->rx_length_errors = src->rx_length_errors;
5403 dst->rx_over_errors = src->rx_over_errors;
5404 dst->rx_crc_errors = src->rx_crc_errors;
5405 dst->rx_frame_errors = src->rx_frame_errors;
5406 dst->rx_fifo_errors = src->rx_fifo_errors;
5407 dst->rx_missed_errors = src->rx_missed_errors;
5408 dst->tx_aborted_errors = src->tx_aborted_errors;
5409 dst->tx_carrier_errors = src->tx_carrier_errors;
5410 dst->tx_fifo_errors = src->tx_fifo_errors;
5411 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
5412 dst->tx_window_errors = src->tx_window_errors;
5416 get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
5418 struct ofpbuf request;
5419 struct ofpbuf *reply;
5422 ofpbuf_init(&request, 0);
5423 nl_msg_put_nlmsghdr(&request,
5424 sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
5425 RTM_GETLINK, NLM_F_REQUEST);
5426 ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
5427 nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
5428 error = nl_transact(NETLINK_ROUTE, &request, &reply);
5429 ofpbuf_uninit(&request);
5434 if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
5435 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS64);
5436 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats64)) {
5437 netdev_stats_from_rtnl_link_stats64(stats, nl_attr_get(a));
5440 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS);
5441 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
5442 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
5445 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
5450 VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");
5455 ofpbuf_delete(reply);
5460 get_flags(const struct netdev *dev, unsigned int *flags)
5466 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
5468 *flags = ifr.ifr_flags;
5474 set_flags(const char *name, unsigned int flags)
5478 ifr.ifr_flags = flags;
5479 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
5483 do_get_ifindex(const char *netdev_name)
5488 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5489 COVERAGE_INC(netdev_get_ifindex);
5491 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
5493 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
5494 netdev_name, ovs_strerror(error));
5497 return ifr.ifr_ifindex;
5501 get_ifindex(const struct netdev *netdev_, int *ifindexp)
5503 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5505 if (!(netdev->cache_valid & VALID_IFINDEX)) {
5506 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
5509 netdev->get_ifindex_error = -ifindex;
5510 netdev->ifindex = 0;
5512 netdev->get_ifindex_error = 0;
5513 netdev->ifindex = ifindex;
5515 netdev->cache_valid |= VALID_IFINDEX;
5518 *ifindexp = netdev->ifindex;
5519 return netdev->get_ifindex_error;
5523 get_etheraddr(const char *netdev_name, struct eth_addr *ea)
5529 memset(&ifr, 0, sizeof ifr);
5530 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5531 COVERAGE_INC(netdev_get_hwaddr);
5532 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
5534 /* ENODEV probably means that a vif disappeared asynchronously and
5535 * hasn't been removed from the database yet, so reduce the log level
5536 * to INFO for that case. */
5537 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
5538 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
5539 netdev_name, ovs_strerror(error));
5542 hwaddr_family = ifr.ifr_hwaddr.sa_family;
5543 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
5544 VLOG_INFO("%s device has unknown hardware address family %d",
5545 netdev_name, hwaddr_family);
5548 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
5553 set_etheraddr(const char *netdev_name, const struct eth_addr mac)
5558 memset(&ifr, 0, sizeof ifr);
5559 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5560 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
5561 memcpy(ifr.ifr_hwaddr.sa_data, &mac, ETH_ADDR_LEN);
5562 COVERAGE_INC(netdev_set_hwaddr);
5563 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
5565 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
5566 netdev_name, ovs_strerror(error));
5572 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
5573 int cmd, const char *cmd_name)
5578 memset(&ifr, 0, sizeof ifr);
5579 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
5580 ifr.ifr_data = (caddr_t) ecmd;
5583 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
5585 if (error != EOPNOTSUPP) {
5586 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
5587 "failed: %s", cmd_name, name, ovs_strerror(error));
5589 /* The device doesn't support this operation. That's pretty
5590 * common, so there's no point in logging anything. */
5597 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
5598 int cmd, const char *cmd_name)
5603 ifr.ifr_addr.sa_family = AF_INET;
5604 error = af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
5606 const struct sockaddr_in *sin = ALIGNED_CAST(struct sockaddr_in *,
5608 *ip = sin->sin_addr;
5613 /* Returns an AF_PACKET raw socket or a negative errno value. */
5615 af_packet_sock(void)
5617 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5620 if (ovsthread_once_start(&once)) {
5621 sock = socket(AF_PACKET, SOCK_RAW, 0);
5623 int error = set_nonblocking(sock);
5630 VLOG_ERR("failed to create packet socket: %s",
5631 ovs_strerror(errno));
5633 ovsthread_once_done(&once);