2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2015 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
23 #include <arpa/inet.h>
25 #include <linux/filter.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <sys/types.h>
37 #include <sys/ioctl.h>
38 #include <sys/socket.h>
39 #include <sys/utsname.h>
40 #include <netpacket/packet.h>
42 #include <net/if_arp.h>
43 #include <net/if_packet.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
52 #include "dp-packet.h"
53 #include "dpif-netlink.h"
54 #include "dpif-netdev.h"
55 #include "openvswitch/dynamic-string.h"
56 #include "fatal-signal.h"
59 #include "netdev-provider.h"
60 #include "netdev-vport.h"
61 #include "netlink-notifier.h"
62 #include "netlink-socket.h"
64 #include "openvswitch/ofpbuf.h"
65 #include "openflow/openflow.h"
66 #include "ovs-atomic.h"
68 #include "poll-loop.h"
69 #include "rtnetlink.h"
71 #include "socket-util.h"
74 #include "unaligned.h"
75 #include "openvswitch/vlog.h"
77 VLOG_DEFINE_THIS_MODULE(netdev_linux);
79 COVERAGE_DEFINE(netdev_set_policing);
80 COVERAGE_DEFINE(netdev_arp_lookup);
81 COVERAGE_DEFINE(netdev_get_ifindex);
82 COVERAGE_DEFINE(netdev_get_hwaddr);
83 COVERAGE_DEFINE(netdev_set_hwaddr);
84 COVERAGE_DEFINE(netdev_get_ethtool);
85 COVERAGE_DEFINE(netdev_set_ethtool);
88 /* These were introduced in Linux 2.6.14, so they might be missing if we have
90 #ifndef ADVERTISED_Pause
91 #define ADVERTISED_Pause (1 << 13)
93 #ifndef ADVERTISED_Asym_Pause
94 #define ADVERTISED_Asym_Pause (1 << 14)
97 /* These were introduced in Linux 2.6.24, so they might be missing if we
98 * have old headers. */
99 #ifndef ETHTOOL_GFLAGS
100 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
102 #ifndef ETHTOOL_SFLAGS
103 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
106 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
109 #define TC_RTAB_SIZE 1024
112 /* Linux 2.6.21 introduced struct tpacket_auxdata.
113 * Linux 2.6.27 added the tp_vlan_tci member.
114 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
115 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
116 * TP_STATUS_VLAN_TPID_VALID.
118 * With all this churn it's easiest to unconditionally define a replacement
119 * structure that has everything we want.
121 #ifndef PACKET_AUXDATA
122 #define PACKET_AUXDATA 8
124 #ifndef TP_STATUS_VLAN_VALID
125 #define TP_STATUS_VLAN_VALID (1 << 4)
127 #ifndef TP_STATUS_VLAN_TPID_VALID
128 #define TP_STATUS_VLAN_TPID_VALID (1 << 6)
130 #undef tpacket_auxdata
131 #define tpacket_auxdata rpl_tpacket_auxdata
132 struct tpacket_auxdata {
138 uint16_t tp_vlan_tci;
139 uint16_t tp_vlan_tpid;
142 /* Linux 2.6.27 introduced ethtool_cmd_speed
144 * To avoid revisiting problems reported with using configure to detect
145 * compatibility (see report at
146 * http://openvswitch.org/pipermail/dev/2014-October/047978.html)
147 * unconditionally replace ethtool_cmd_speed. */
/* Replacement for ethtool_cmd_speed(): returns the link speed in Mbps by
 * combining the low 16 bits ('speed') with the high 16 bits ('speed_hi')
 * of the ethtool command structure.  Defined unconditionally to avoid
 * kernel-header compatibility probing (see comment above). */
148 #define ethtool_cmd_speed rpl_ethtool_cmd_speed
149 static inline uint32_t rpl_ethtool_cmd_speed(const struct ethtool_cmd *ep)
151 return ep->speed | (ep->speed_hi << 16);
154 /* Linux 2.6.30 introduced supported and advertised flags for
155 * 1G base KX, and 10G base KX4, KR and R. */
156 #ifndef SUPPORTED_1000baseKX_Full
157 #define SUPPORTED_1000baseKX_Full (1 << 17)
158 #define SUPPORTED_10000baseKX4_Full (1 << 18)
159 #define SUPPORTED_10000baseKR_Full (1 << 19)
160 #define SUPPORTED_10000baseR_FEC (1 << 20)
161 #define ADVERTISED_1000baseKX_Full (1 << 17)
162 #define ADVERTISED_10000baseKX4_Full (1 << 18)
163 #define ADVERTISED_10000baseKR_Full (1 << 19)
164 #define ADVERTISED_10000baseR_FEC (1 << 20)
167 /* Linux 3.5 introduced supported and advertised flags for
168 * 40G base KR4, CR4, SR4 and LR4. */
169 #ifndef SUPPORTED_40000baseKR4_Full
170 #define SUPPORTED_40000baseKR4_Full (1 << 23)
171 #define SUPPORTED_40000baseCR4_Full (1 << 24)
172 #define SUPPORTED_40000baseSR4_Full (1 << 25)
173 #define SUPPORTED_40000baseLR4_Full (1 << 26)
174 #define ADVERTISED_40000baseKR4_Full (1 << 23)
175 #define ADVERTISED_40000baseCR4_Full (1 << 24)
176 #define ADVERTISED_40000baseSR4_Full (1 << 25)
177 #define ADVERTISED_40000baseLR4_Full (1 << 26)
180 /* Linux 2.6.35 introduced IFLA_STATS64 and rtnl_link_stats64.
182 * Tests for rtnl_link_stats64 don't seem to consistently work, e.g. on
183 * 2.6.32-431.29.2.el6.x86_64 (see report at
184 * http://openvswitch.org/pipermail/dev/2014-October/047978.html). Maybe
185 * if_link.h is not self-contained on those kernels. It is easiest to
186 * unconditionally define a replacement. */
188 #define IFLA_STATS64 23
190 #define rtnl_link_stats64 rpl_rtnl_link_stats64
191 struct rtnl_link_stats64 {
203 uint64_t rx_length_errors;
204 uint64_t rx_over_errors;
205 uint64_t rx_crc_errors;
206 uint64_t rx_frame_errors;
207 uint64_t rx_fifo_errors;
208 uint64_t rx_missed_errors;
210 uint64_t tx_aborted_errors;
211 uint64_t tx_carrier_errors;
212 uint64_t tx_fifo_errors;
213 uint64_t tx_heartbeat_errors;
214 uint64_t tx_window_errors;
216 uint64_t rx_compressed;
217 uint64_t tx_compressed;
/* Bits for netdev_linux's 'cache_valid' bitmask: each flag marks one of the
 * "on demand" cached fields as currently usable.
 * NOTE(review): the enum opener and the members for bits 2-3 (and the
 * VALID_MTU / VALID_IN flags referenced by netdev_linux_update/changed
 * below) are not visible in this extract -- confirm against full source. */
221 VALID_IFINDEX = 1 << 0,
222 VALID_ETHERADDR = 1 << 1,
225 VALID_POLICING = 1 << 4,
226 VALID_VPORT_STAT_ERROR = 1 << 5,
227 VALID_DRVINFO = 1 << 6,
228 VALID_FEATURES = 1 << 7,
231 /* Traffic control. */
233 /* An instance of a traffic control class. Always associated with a particular
236 * Each TC implementation subclasses this with whatever additional data it
239 const struct tc_ops *ops;
240 struct hmap queues; /* Contains "struct tc_queue"s.
241 * Read by generic TC layer.
242 * Written only by TC implementation. */
245 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
247 /* One traffic control queue.
249 * Each TC implementation subclasses this with whatever additional data it
252 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
253 unsigned int queue_id; /* OpenFlow queue ID. */
254 long long int created; /* Time queue was created, in msecs. */
257 /* A particular kind of traffic control. Each implementation generally maps to
258 * one particular Linux qdisc class.
260 * The functions below return 0 if successful or a positive errno value on
261 * failure, except where otherwise noted. All of them must be provided, except
262 * where otherwise noted. */
264 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
265 * This is null for tc_ops_default and tc_ops_other, for which there are no
266 * appropriate values. */
267 const char *linux_name;
269 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
270 const char *ovs_name;
272 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
273 * queues. The queues are numbered 0 through n_queues - 1. */
274 unsigned int n_queues;
276 /* Called to install this TC class on 'netdev'. The implementation should
277 * make the Netlink calls required to set up 'netdev' with the right qdisc
278 * and configure it according to 'details'. The implementation may assume
279 * that the current qdisc is the default; that is, there is no need for it
280 * to delete the current qdisc before installing itself.
282 * The contents of 'details' should be documented as valid for 'ovs_name'
283 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
284 * (which is built as ovs-vswitchd.conf.db(8)).
286 * This function must return 0 if and only if it sets 'netdev->tc' to an
287 * initialized 'struct tc'.
289 * (This function is null for tc_ops_other, which cannot be installed. For
290 * other TC classes it should always be nonnull.) */
291 int (*tc_install)(struct netdev *netdev, const struct smap *details);
293 /* Called when the netdev code determines (through a Netlink query) that
294 * this TC class's qdisc is installed on 'netdev', but we didn't install
295 * it ourselves and so don't know any of the details.
297 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
298 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
299 * implementation should parse the other attributes of 'nlmsg' as
300 * necessary to determine its configuration. If necessary it should also
301 * use Netlink queries to determine the configuration of queues on
304 * This function must return 0 if and only if it sets 'netdev->tc' to an
305 * initialized 'struct tc'. */
306 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
308 /* Destroys the data structures allocated by the implementation as part of
309 * 'tc'. (This includes destroying 'tc->queues' by calling
312 * The implementation should not need to perform any Netlink calls. If
313 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
314 * (But it may not be desirable.)
316 * This function may be null if 'tc' is trivial. */
317 void (*tc_destroy)(struct tc *tc);
319 /* Retrieves details of 'netdev->tc' configuration into 'details'.
321 * The implementation should not need to perform any Netlink calls, because
322 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
323 * cached the configuration.
325 * The contents of 'details' should be documented as valid for 'ovs_name'
326 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
327 * (which is built as ovs-vswitchd.conf.db(8)).
329 * This function may be null if 'tc' is not configurable.
331 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
333 /* Reconfigures 'netdev->tc' according to 'details', performing any
334 * required Netlink calls to complete the reconfiguration.
336 * The contents of 'details' should be documented as valid for 'ovs_name'
337 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
338 * (which is built as ovs-vswitchd.conf.db(8)).
340 * This function may be null if 'tc' is not configurable.
342 int (*qdisc_set)(struct netdev *, const struct smap *details);
344 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
345 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
347 * The contents of 'details' should be documented as valid for 'ovs_name'
348 * in the "other_config" column in the "Queue" table in
349 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
351 * The implementation should not need to perform any Netlink calls, because
352 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
353 * cached the queue configuration.
355 * This function may be null if 'tc' does not have queues ('n_queues' is
357 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
358 struct smap *details);
360 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
361 * 'details', perfoming any required Netlink calls to complete the
362 * reconfiguration. The caller ensures that 'queue_id' is less than
365 * The contents of 'details' should be documented as valid for 'ovs_name'
366 * in the "other_config" column in the "Queue" table in
367 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
369 * This function may be null if 'tc' does not have queues or its queues are
370 * not configurable. */
371 int (*class_set)(struct netdev *, unsigned int queue_id,
372 const struct smap *details);
374 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
375 * tc_queue's within 'netdev->tc->queues'.
377 * This function may be null if 'tc' does not have queues or its queues
378 * cannot be deleted. */
379 int (*class_delete)(struct netdev *, struct tc_queue *queue);
381 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
382 * 'struct tc_queue's within 'netdev->tc->queues'.
384 * On success, initializes '*stats'.
386 * This function may be null if 'tc' does not have queues or if it cannot
387 * report queue statistics. */
388 int (*class_get_stats)(const struct netdev *netdev,
389 const struct tc_queue *queue,
390 struct netdev_queue_stats *stats);
392 /* Extracts queue stats from 'nlmsg', which is a response to a
393 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
395 * This function may be null if 'tc' does not have queues or if it cannot
396 * report queue statistics. */
397 int (*class_dump_stats)(const struct netdev *netdev,
398 const struct ofpbuf *nlmsg,
399 netdev_dump_queue_stats_cb *cb, void *aux);
/* Initializes the generic part of a traffic-control instance 'tc' for use
 * with TC implementation 'ops'; the queue map starts out empty.
 * NOTE(review): the assignment of 'tc->ops = ops' is not visible in this
 * extract -- confirm against full source. */
403 tc_init(struct tc *tc, const struct tc_ops *ops)
406 hmap_init(&tc->queues);
/* Releases the generic part of 'tc' by destroying its queue hmap.  TC
 * implementations call this from their own tc_destroy callbacks; it does
 * not free 'tc' itself or touch the kernel qdisc. */
410 tc_destroy(struct tc *tc)
412 hmap_destroy(&tc->queues);
415 static const struct tc_ops tc_ops_htb;
416 static const struct tc_ops tc_ops_hfsc;
417 static const struct tc_ops tc_ops_codel;
418 static const struct tc_ops tc_ops_fqcodel;
419 static const struct tc_ops tc_ops_sfq;
420 static const struct tc_ops tc_ops_default;
421 static const struct tc_ops tc_ops_noop;
422 static const struct tc_ops tc_ops_other;
424 static const struct tc_ops *const tcs[] = {
425 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
426 &tc_ops_hfsc, /* Hierarchical fair service curve. */
427 &tc_ops_codel, /* Controlled delay */
428 &tc_ops_fqcodel, /* Fair queue controlled delay */
429 &tc_ops_sfq, /* Stochastic fair queueing */
430 &tc_ops_noop, /* Non operating qos type. */
431 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
432 &tc_ops_other, /* Some other qdisc. */
436 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
437 static unsigned int tc_get_major(unsigned int handle);
438 static unsigned int tc_get_minor(unsigned int handle);
440 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
441 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
442 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
444 static struct tcmsg *tc_make_request(const struct netdev *, int type,
445 unsigned int flags, struct ofpbuf *);
446 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
447 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
448 static int tc_add_policer(struct netdev *,
449 uint32_t kbits_rate, uint32_t kbits_burst);
451 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
452 struct nlattr **options);
453 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
454 struct nlattr **options,
455 struct netdev_queue_stats *);
456 static int tc_query_class(const struct netdev *,
457 unsigned int handle, unsigned int parent,
458 struct ofpbuf **replyp);
459 static int tc_delete_class(const struct netdev *, unsigned int handle);
461 static int tc_del_qdisc(struct netdev *netdev);
462 static int tc_query_qdisc(const struct netdev *netdev);
464 static int tc_calc_cell_log(unsigned int mtu);
465 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
466 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
467 const struct tc_ratespec *rate);
468 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
/* Per-device state for Linux-backed netdevs (system, internal, tap).
 * NOTE(review): several members referenced elsewhere in this file ('up',
 * 'ifindex', 'mtu', 'tc', 'tap_fd') are not visible in this extract --
 * confirm their declarations against the full source. */
470 struct netdev_linux {
473 /* Protects all members below. */
474 struct ovs_mutex mutex;
/* Bitmask of VALID_* flags: which of the cached fields below are usable. */
476 unsigned int cache_valid;
478 bool miimon; /* Link status of last poll. */
479 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
480 struct timer miimon_timer;
482 /* The following are figured out "on demand" only. They are only valid
483 * when the corresponding VALID_* bit in 'cache_valid' is set. */
485 struct eth_addr etheraddr;
487 unsigned int ifi_flags;
488 long long int carrier_resets;
489 uint32_t kbits_rate; /* Policing data. */
490 uint32_t kbits_burst;
491 int vport_stats_error; /* Cached error code from vport_get_stats().
492 0 or an errno value. */
493 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
494 int ether_addr_error; /* Cached error code from set/get etheraddr. */
495 int netdev_policing_error; /* Cached error code from set policing. */
496 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
497 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
499 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
500 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
501 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
503 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
506 /* For devices of class netdev_tap_class only. */
510 struct netdev_rxq_linux {
511 struct netdev_rxq up;
516 /* This is set pretty low because we probably won't learn anything from the
517 * additional log messages. */
518 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
520 /* Polling miimon status for all ports causes performance degradation when
521 * handling a large number of ports. If there are no devices using miimon, then
522 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
524 * Readers do not depend on this variable synchronizing with the related
525 * changes in the device miimon status, so we can use atomic_count. */
526 static atomic_count miimon_cnt = ATOMIC_COUNT_INIT(0);
528 static void netdev_linux_run(void);
530 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
531 int cmd, const char *cmd_name);
532 static int get_flags(const struct netdev *, unsigned int *flags);
533 static int set_flags(const char *, unsigned int flags);
534 static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
535 enum netdev_flags on, enum netdev_flags *old_flagsp)
536 OVS_REQUIRES(netdev->mutex);
537 static int do_get_ifindex(const char *netdev_name);
538 static int get_ifindex(const struct netdev *, int *ifindexp);
539 static int do_set_addr(struct netdev *netdev,
540 int ioctl_nr, const char *ioctl_name,
541 struct in_addr addr);
542 static int get_etheraddr(const char *netdev_name, struct eth_addr *ea);
543 static int set_etheraddr(const char *netdev_name, const struct eth_addr);
544 static int get_stats_via_netlink(const struct netdev *, struct netdev_stats *);
545 static int af_packet_sock(void);
546 static bool netdev_linux_miimon_enabled(void);
547 static void netdev_linux_miimon_run(void);
548 static void netdev_linux_miimon_wait(void);
549 static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup);
/* Returns true if 'netdev_class' is one of the Linux-backed classes.  All of
 * them share netdev_linux_run as their 'run' callback, which serves here as
 * the identity test. */
552 is_netdev_linux_class(const struct netdev_class *netdev_class)
554 return netdev_class->run == netdev_linux_run;
/* Returns true if 'netdev' is a tap device (netdev_tap_class), which uses a
 * shared tap fd for I/O instead of a per-rxq AF_PACKET socket. */
558 is_tap_netdev(const struct netdev *netdev)
560 return netdev_get_class(netdev) == &netdev_tap_class;
/* Downcasts generic 'netdev' to its Linux-specific container.  Asserts that
 * the class really is Linux-backed before applying CONTAINER_OF. */
563 static struct netdev_linux *
564 netdev_linux_cast(const struct netdev *netdev)
566 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
568 return CONTAINER_OF(netdev, struct netdev_linux, up);
/* Downcasts generic receive queue 'rx' to its Linux-specific container,
 * asserting that the owning netdev's class is Linux-backed first. */
571 static struct netdev_rxq_linux *
572 netdev_rxq_linux_cast(const struct netdev_rxq *rx)
574 ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
575 return CONTAINER_OF(rx, struct netdev_rxq_linux, up);
578 static void netdev_linux_update(struct netdev_linux *netdev,
579 const struct rtnetlink_change *)
580 OVS_REQUIRES(netdev->mutex);
581 static void netdev_linux_changed(struct netdev_linux *netdev,
582 unsigned int ifi_flags, unsigned int mask)
583 OVS_REQUIRES(netdev->mutex);
585 /* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK,
586 * RTNLGRP_IPV4_IFADDR and RTNLGRP_IPV6_IFADDR changes, or NULL
587 * if no such socket could be created. */
588 static struct nl_sock *
589 netdev_linux_notify_sock(void)
/* One process-wide socket, created lazily and exactly once. */
591 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
592 static struct nl_sock *sock;
593 unsigned int mcgroups[] = {RTNLGRP_LINK, RTNLGRP_IPV4_IFADDR,
594 RTNLGRP_IPV6_IFADDR, RTNLGRP_IPV6_IFINFO};
596 if (ovsthread_once_start(&once)) {
599 error = nl_sock_create(NETLINK_ROUTE, &sock);
/* Join every multicast group of interest; on any join failure the socket
 * is destroyed so all callers subsequently see NULL.
 * NOTE(review): the error-check branches between these calls are elided
 * in this extract -- confirm against full source. */
603 for (i = 0; i < ARRAY_SIZE(mcgroups); i++) {
604 error = nl_sock_join_mcgroup(sock, mcgroups[i]);
606 nl_sock_destroy(sock);
612 ovsthread_once_done(&once);
/* Returns true if at least one device currently uses miimon polling.  Reads
 * the atomic counter without further synchronization (see comment at
 * 'miimon_cnt'), letting run()/wait() skip miimon work entirely when no
 * device needs it. */
619 netdev_linux_miimon_enabled(void)
621 return atomic_count_get(&miimon_cnt) > 0;
/* Periodic 'run' callback shared by all Linux netdev classes: polls miimon
 * (if any device uses it) and drains the rtnetlink notification socket,
 * pushing each change into the matching netdev's cached state. */
625 netdev_linux_run(void)
627 struct nl_sock *sock;
630 if (netdev_linux_miimon_enabled()) {
631 netdev_linux_miimon_run();
/* May be NULL if the notification socket could not be created; the recv
 * loop below then has nothing to read. */
634 sock = netdev_linux_notify_sock();
640 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
641 uint64_t buf_stub[4096 / 8];
644 ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
645 error = nl_sock_recv(sock, &buf, false);
647 struct rtnetlink_change change;
649 if (rtnetlink_parse(&buf, &change)) {
650 struct netdev *netdev_ = NULL;
651 char dev_name[IFNAMSIZ];
/* Address-change messages may lack a name; resolve it from the
 * interface index. */
653 if (!change.ifname) {
654 change.ifname = if_indextoname(change.if_index, dev_name);
658 netdev_ = netdev_from_name(change.ifname);
660 if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
661 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
663 ovs_mutex_lock(&netdev->mutex);
664 netdev_linux_update(netdev, &change);
665 ovs_mutex_unlock(&netdev->mutex);
667 netdev_close(netdev_);
/* The kernel dropped notifications: we cannot know which devices
 * changed, so re-query flags on every known device. */
669 } else if (error == ENOBUFS) {
670 struct shash device_shash;
671 struct shash_node *node;
675 shash_init(&device_shash);
676 netdev_get_devices(&netdev_linux_class, &device_shash);
677 SHASH_FOR_EACH (node, &device_shash) {
678 struct netdev *netdev_ = node->data;
679 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
682 ovs_mutex_lock(&netdev->mutex);
683 get_flags(netdev_, &flags);
684 netdev_linux_changed(netdev, flags, 0);
685 ovs_mutex_unlock(&netdev->mutex);
687 netdev_close(netdev_);
689 shash_destroy(&device_shash);
/* EAGAIN just means the socket is drained; anything else is logged
 * (rate-limited). */
690 } else if (error != EAGAIN) {
691 VLOG_WARN_RL(&rl, "error reading or parsing netlink (%s)",
692 ovs_strerror(error));
/* 'wait' callback paired with netdev_linux_run(): registers with the poll
 * loop for miimon timer expiry (when enabled) and for readability of the
 * shared rtnetlink notification socket. */
699 netdev_linux_wait(void)
701 struct nl_sock *sock;
703 if (netdev_linux_miimon_enabled()) {
704 netdev_linux_miimon_wait();
706 sock = netdev_linux_notify_sock();
708 nl_sock_wait(sock, POLLIN);
/* Records that 'dev' changed: bumps the netdev change sequence, counts a
 * carrier reset if IFF_RUNNING toggled, stores the new interface flags, and
 * keeps only the cache_valid bits named in 'mask' (pass 0 to invalidate the
 * whole cache).  Caller must hold dev->mutex. */
713 netdev_linux_changed(struct netdev_linux *dev,
714 unsigned int ifi_flags, unsigned int mask)
715 OVS_REQUIRES(dev->mutex)
717 netdev_change_seq_changed(&dev->up);
/* IFF_RUNNING transition in either direction counts as a carrier reset. */
719 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
720 dev->carrier_resets++;
722 dev->ifi_flags = ifi_flags;
724 dev->cache_valid &= mask;
/* If the IP-address cache was invalidated, flush the global address list
 * too so the next query re-reads the kernel. */
725 if (!(mask & VALID_IN)) {
726 netdev_get_addrs_list_flush();
/* Applies one parsed rtnetlink notification 'change' to 'dev'.  For link
 * (RTM_NEWLINK) messages the cache is refreshed directly from the message;
 * for address-group messages only the IP-address cache is invalidated.
 * Caller must hold dev->mutex. */
731 netdev_linux_update(struct netdev_linux *dev,
732 const struct rtnetlink_change *change)
733 OVS_REQUIRES(dev->mutex)
735 if (rtnetlink_type_is_rtnlgrp_link(change->nlmsg_type)){
736 if (change->nlmsg_type == RTM_NEWLINK) {
737 /* Keep drv-info, and ip addresses. */
738 netdev_linux_changed(dev, change->ifi_flags,
739 VALID_DRVINFO | VALID_IN);
741 /* Update netdev from rtnl-change msg. */
743 dev->mtu = change->mtu;
744 dev->cache_valid |= VALID_MTU;
745 dev->netdev_mtu_error = 0;
/* An all-zero MAC in the message means "not reported"; keep the
 * cached address in that case. */
748 if (!eth_addr_is_zero(change->mac)) {
749 dev->etheraddr = change->mac;
750 dev->cache_valid |= VALID_ETHERADDR;
751 dev->ether_addr_error = 0;
754 dev->ifindex = change->if_index;
755 dev->cache_valid |= VALID_IFINDEX;
756 dev->get_ifindex_error = 0;
/* Non-RTM_NEWLINK link message (e.g. deletion): invalidate everything. */
758 netdev_linux_changed(dev, change->ifi_flags, 0);
760 } else if (rtnetlink_type_is_rtnlgrp_addr(change->nlmsg_type)) {
761 /* Invalidates in4, in6. */
762 netdev_linux_changed(dev, dev->ifi_flags, ~VALID_IN);
/* 'alloc' callback: allocates a zeroed netdev_linux and returns the embedded
 * generic netdev.  NOTE(review): the 'return &netdev->up;' line is not
 * visible in this extract -- confirm against full source. */
768 static struct netdev *
769 netdev_linux_alloc(void)
771 struct netdev_linux *netdev = xzalloc(sizeof *netdev);
/* Construction steps shared by all three Linux netdev classes; currently
 * just initializes the per-device mutex. */
776 netdev_linux_common_construct(struct netdev_linux *netdev)
778 ovs_mutex_init(&netdev->mutex);
781 /* Creates system and internal devices. */
783 netdev_linux_construct(struct netdev *netdev_)
785 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
788 netdev_linux_common_construct(netdev);
/* Probe the kernel for the device; also primes the cached ifi_flags. */
790 error = get_flags(&netdev->up, &netdev->ifi_flags);
791 if (error == ENODEV) {
792 if (netdev->up.netdev_class != &netdev_internal_class) {
793 /* The device does not exist, so don't allow it to be opened. */
796 /* "Internal" netdevs have to be created as netdev objects before
797 * they exist in the kernel, because creating them in the kernel
798 * happens by passing a netdev object to dpif_port_add().
799 * Therefore, ignore the error. */
806 /* For most types of netdevs we open the device for each call of
807 * netdev_open(). However, this is not the case with tap devices,
808 * since it is only possible to open the device once. In this
809 * situation we share a single file descriptor, and consequently
810 * buffers, across all readers. Therefore once data is read it will
811 * be unavailable to other reads for tap devices. */
/* Constructor for tap devices: opens /dev/net/tun, creates a tap interface
 * named after the netdev, and puts the shared fd into non-blocking mode.
 * NOTE(review): the error-cleanup paths between these steps are elided in
 * this extract -- confirm against full source. */
813 netdev_linux_construct_tap(struct netdev *netdev_)
815 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
816 static const char tap_dev[] = "/dev/net/tun";
817 const char *name = netdev_->name;
821 netdev_linux_common_construct(netdev);
823 /* Open tap device. */
824 netdev->tap_fd = open(tap_dev, O_RDWR);
825 if (netdev->tap_fd < 0) {
827 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
831 /* Create tap device. */
/* IFF_NO_PI: no protocol-info header prepended to packets on the fd. */
832 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
833 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
834 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
835 VLOG_WARN("%s: creating tap device failed: %s", name,
836 ovs_strerror(errno));
841 /* Make non-blocking. */
842 error = set_nonblocking(netdev->tap_fd);
/* Failure path: release the tap fd before reporting the error. */
850 close(netdev->tap_fd);
/* 'destruct' callback: tears down per-device state -- the TC implementation
 * (if any), the tap fd (tap class only), the miimon refcount, and the
 * mutex.  Does not free the structure itself (that is dealloc's job). */
855 netdev_linux_destruct(struct netdev *netdev_)
857 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
859 if (netdev->tc && netdev->tc->ops->tc_destroy) {
860 netdev->tc->ops->tc_destroy(netdev->tc);
863 if (netdev_get_class(netdev_) == &netdev_tap_class
864 && netdev->tap_fd >= 0)
866 close(netdev->tap_fd);
/* Drop this device's contribution to the global miimon user count. */
869 if (netdev->miimon_interval > 0) {
870 atomic_count_dec(&miimon_cnt);
873 ovs_mutex_destroy(&netdev->mutex);
/* 'dealloc' callback: frees the netdev_linux allocated by
 * netdev_linux_alloc().  NOTE(review): the 'free(netdev);' line is not
 * visible in this extract -- confirm against full source. */
877 netdev_linux_dealloc(struct netdev *netdev_)
879 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* rxq 'alloc' callback: allocates a zeroed receive-queue wrapper and returns
 * the embedded generic rxq.  NOTE(review): the 'return &rx->up;' line is
 * not visible in this extract -- confirm against full source. */
883 static struct netdev_rxq *
884 netdev_linux_rxq_alloc(void)
886 struct netdev_rxq_linux *rx = xzalloc(sizeof *rx);
/* rxq 'construct' callback.  For tap devices the queue shares the device's
 * tap fd.  Otherwise it opens an AF_PACKET SOCK_RAW socket, enables
 * PACKET_AUXDATA (for VLAN tag recovery), makes it non-blocking, binds it
 * to the device's ifindex, and attaches a BPF filter that accepts only
 * inbound packets (so locally transmitted packets are not looped back).
 * NOTE(review): several error-handling branches are elided in this
 * extract -- confirm against full source. */
891 netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
893 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
894 struct netdev *netdev_ = rx->up.netdev;
895 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
898 ovs_mutex_lock(&netdev->mutex);
899 rx->is_tap = is_tap_netdev(netdev_);
/* Tap: reuse the shared device fd; no socket setup needed. */
901 rx->fd = netdev->tap_fd;
903 struct sockaddr_ll sll;
905 /* Result of tcpdump -dd inbound */
906 static const struct sock_filter filt[] = {
907 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
908 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
909 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
910 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
912 static const struct sock_fprog fprog = {
913 ARRAY_SIZE(filt), (struct sock_filter *) filt
916 /* Create file descriptor. */
917 rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
920 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
/* Ask the kernel to deliver tpacket_auxdata (VLAN TCI/TPID) as a
 * control message with each packet. */
925 if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) {
927 VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
928 netdev_get_name(netdev_), ovs_strerror(error));
932 /* Set non-blocking mode. */
933 error = set_nonblocking(rx->fd);
938 /* Get ethernet device index. */
939 error = get_ifindex(&netdev->up, &ifindex);
944 /* Bind to specific ethernet device. */
945 memset(&sll, 0, sizeof sll);
946 sll.sll_family = AF_PACKET;
947 sll.sll_ifindex = ifindex;
948 sll.sll_protocol = htons(ETH_P_ALL);
949 if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
951 VLOG_ERR("%s: failed to bind raw socket (%s)",
952 netdev_get_name(netdev_), ovs_strerror(error));
956 /* Filter for only inbound packets. */
957 error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
961 VLOG_ERR("%s: failed to attach filter (%s)",
962 netdev_get_name(netdev_), ovs_strerror(error));
/* Success path unlock; the second unlock below is the error path. */
966 ovs_mutex_unlock(&netdev->mutex);
974 ovs_mutex_unlock(&netdev->mutex);
/* rxq 'destruct' callback.  NOTE(review): the body that closes rx->fd for
 * non-tap queues is not visible in this extract -- confirm against full
 * source. */
979 netdev_linux_rxq_destruct(struct netdev_rxq *rxq_)
981 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
/* rxq 'dealloc' callback: frees the wrapper allocated by rxq_alloc().
 * NOTE(review): the 'free(rx);' line is not visible in this extract --
 * confirm against full source. */
989 netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_)
991 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
/* Returns the VLAN TPID (network byte order) to use when re-inserting the
 * tag from packet auxdata 'aux': the kernel-reported TPID when
 * TP_STATUS_VLAN_TPID_VALID is set, otherwise the 802.1Q default. */
997 auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux)
999 if (aux->tp_status & TP_STATUS_VLAN_TPID_VALID) {
1000 return htons(aux->tp_vlan_tpid);
1002 return htons(ETH_TYPE_VLAN);
/* Returns true if 'aux' carries a VLAN TCI to restore: either a nonzero TCI
 * (covers pre-3.0 kernels without TP_STATUS_VLAN_VALID) or the explicit
 * validity bit. */
1007 auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
1009 return aux->tp_vlan_tci || aux->tp_status & TP_STATUS_VLAN_VALID;
/* Receives one packet from AF_PACKET socket 'fd' into 'buffer'.  Because
 * the kernel strips the VLAN tag before delivery, headroom for one tag is
 * reserved and, if the PACKET_AUXDATA control message reports a TCI, the
 * tag is pushed back into the packet.  NOTE(review): the return statements
 * and some declarations are elided in this extract -- confirm error/EMSGSIZE
 * handling against full source. */
1013 netdev_linux_rxq_recv_sock(int fd, struct dp_packet *buffer)
1018 struct cmsghdr *cmsg;
/* Union ensures the control buffer is aligned for cmsghdr access. */
1020 struct cmsghdr cmsg;
1021 char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
1025 /* Reserve headroom for a single VLAN tag */
1026 dp_packet_reserve(buffer, VLAN_HEADER_LEN);
1027 size = dp_packet_tailroom(buffer);
1029 iov.iov_base = dp_packet_data(buffer);
1031 msgh.msg_name = NULL;
1032 msgh.msg_namelen = 0;
1033 msgh.msg_iov = &iov;
1034 msgh.msg_iovlen = 1;
1035 msgh.msg_control = &cmsg_buffer;
1036 msgh.msg_controllen = sizeof cmsg_buffer;
/* MSG_TRUNC makes recvmsg report the full packet length even if it did
 * not fit, so oversize packets can be detected below. */
1040 retval = recvmsg(fd, &msgh, MSG_TRUNC);
1041 } while (retval < 0 && errno == EINTR);
1045 } else if (retval > size) {
1049 dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);
/* Scan control messages for the packet auxdata carrying VLAN info. */
1051 for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg; cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
1052 const struct tpacket_auxdata *aux;
1054 if (cmsg->cmsg_level != SOL_PACKET
1055 || cmsg->cmsg_type != PACKET_AUXDATA
1056 || cmsg->cmsg_len < CMSG_LEN(sizeof(struct tpacket_auxdata))) {
1060 aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg));
1061 if (auxdata_has_vlan_tci(aux)) {
/* Too short to even hold an Ethernet header: nothing to re-tag. */
1062 if (retval < ETH_HEADER_LEN) {
1066 eth_push_vlan(buffer, auxdata_to_vlan_tpid(aux),
1067 htons(aux->tp_vlan_tci));
/* Receives one packet from tap fd 'fd' into 'buffer', retrying on EINTR and
 * growing the packet by the number of bytes read.  NOTE(review): the
 * error-return path after the read loop is elided in this extract --
 * confirm against full source. */
1076 netdev_linux_rxq_recv_tap(int fd, struct dp_packet *buffer)
1079 size_t size = dp_packet_tailroom(buffer);
1082 retval = read(fd, dp_packet_data(buffer), size);
1083 } while (retval < 0 && errno == EINTR);
1089 dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);
/* rxq 'recv' callback: allocates a buffer sized for the device MTU (plus a
 * VLAN-tagged Ethernet header) and reads one packet via the tap fd or the
 * AF_PACKET socket, returning it in packets[0]. */
1094 netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet **packets,
1097 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1098 struct netdev *netdev = rx->up.netdev;
1099 struct dp_packet *buffer;
/* Fall back to the standard Ethernet payload size if the MTU is unknown. */
1103 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
1104 mtu = ETH_PAYLOAD_MAX;
1107 buffer = dp_packet_new_with_headroom(VLAN_ETH_HEADER_LEN + mtu,
1108 DP_NETDEV_HEADROOM);
1109 retval = (rx->is_tap
1110 ? netdev_linux_rxq_recv_tap(rx->fd, buffer)
1111 : netdev_linux_rxq_recv_sock(rx->fd, buffer));
/* EAGAIN (no packet) and EMSGSIZE (oversize, already counted) are normal;
 * anything else is worth a rate-limited warning. */
1114 if (retval != EAGAIN && retval != EMSGSIZE) {
1115 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
1116 netdev_rxq_get_name(rxq_), ovs_strerror(errno));
1118 dp_packet_delete(buffer);
/* Pad runts up to the Ethernet minimum before handing the packet up. */
1120 dp_packet_pad(buffer);
1121 packets[0] = buffer;
/* rxq 'wait' callback: wakes the poll loop when the queue's fd (socket or
 * tap) becomes readable. */
1129 netdev_linux_rxq_wait(struct netdev_rxq *rxq_)
1131 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1132 poll_fd_wait(rx->fd, POLLIN);
/* rxq 'drain' callback: discards buffered packets.  For tap devices this
 * reads and drops up to the device TX queue length worth of packets from
 * the fd; for sockets it empties the kernel receive buffer. */
1136 netdev_linux_rxq_drain(struct netdev_rxq *rxq_)
1138 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
/* Tap path: use the interface's TX queue length as the drain bound. */
1141 int error = af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_), &ifr,
1142 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
1146 drain_fd(rx->fd, ifr.ifr_qlen);
1149 return drain_rcvbuf(rx->fd);
1153 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
1154 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
1155 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
1156 * the packet is too big or too small to transmit on the device.
1158 * The caller retains ownership of 'buffer' in all cases.
1160 * The kernel maintains a packet transmission queue, so the caller is not
1161 * expected to do additional queuing of packets. */
1163 netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED,
1164 struct dp_packet **pkts, int cnt, bool may_steal)
1169 /* 'i' is incremented only if there's no error */
1170 for (i = 0; i < cnt;) {
1171 const void *data = dp_packet_data(pkts[i]);
1172 size_t size = dp_packet_size(pkts[i]);
1175 /* Truncate the packet if it is configured. */
1176 size -= dp_packet_get_cutlen(pkts[i]);
1178 if (!is_tap_netdev(netdev_)) {
1179 /* Use our AF_PACKET socket to send to this device. */
1180 struct sockaddr_ll sll;
1186 sock = af_packet_sock();
1191 ifindex = netdev_get_ifindex(netdev_);
1196 /* We don't bother setting most fields in sockaddr_ll because the
1197 * kernel ignores them for SOCK_RAW. */
1198 memset(&sll, 0, sizeof sll);
1199 sll.sll_family = AF_PACKET;
1200 sll.sll_ifindex = ifindex;
1202 iov.iov_base = CONST_CAST(void *, data);
1205 msg.msg_name = &sll;
1206 msg.msg_namelen = sizeof sll;
1209 msg.msg_control = NULL;
1210 msg.msg_controllen = 0;
1213 retval = sendmsg(sock, &msg, 0);
1215 /* Use the tap fd to send to this device. This is essential for
1216 * tap devices, because packets sent to a tap device with an
1217 * AF_PACKET socket will loop back to be *received* again on the
1218 * tap device. This doesn't occur on other interface types
1219 * because we attach a socket filter to the rx socket. */
1220 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1222 retval = write(netdev->tap_fd, data, size);
1226 if (errno == EINTR) {
1227 /* The send was interrupted by a signal. Retry the packet by
1228 * continuing without incrementing 'i'.*/
1230 } else if (errno == EIO && is_tap_netdev(netdev_)) {
1231 /* The Linux tap driver returns EIO if the device is not up.
1232 * From the OVS side this is not an error, so ignore it. */
1234 /* The Linux AF_PACKET implementation never blocks waiting for
1235 * room for packets, instead returning ENOBUFS. Translate this
1236 * into EAGAIN for the caller. */
1237 error = errno == ENOBUFS ? EAGAIN : errno;
1240 } else if (retval != size) {
1241 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE" bytes"
1242 " of %"PRIuSIZE") on %s", retval, size,
1243 netdev_get_name(netdev_));
1248 /* Process the next packet in the batch */
1253 for (i = 0; i < cnt; i++) {
1254 dp_packet_delete(pkts[i]);
1258 if (error && error != EAGAIN) {
1259 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
1260 netdev_get_name(netdev_), ovs_strerror(error));
1267 /* Registers with the poll loop to wake up from the next call to poll_block()
1268 * when the packet transmission queue has sufficient room to transmit a packet
1269 * with netdev_send().
1271 * The kernel maintains a packet transmission queue, so the client is not
1272 * expected to do additional queuing of packets. Thus, this function is
1273 * unlikely to ever be used. It is included for completeness. */
1275 netdev_linux_send_wait(struct netdev *netdev, int qid OVS_UNUSED)
1277 if (is_tap_netdev(netdev)) {
1278 /* TAP device always accepts packets.*/
1279 poll_immediate_wake();
1283 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1284 * otherwise a positive errno value. */
1286 netdev_linux_set_etheraddr(struct netdev *netdev_, const struct eth_addr mac)
1288 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1289 enum netdev_flags old_flags = 0;
1292 ovs_mutex_lock(&netdev->mutex);
1294 if (netdev->cache_valid & VALID_ETHERADDR) {
1295 error = netdev->ether_addr_error;
1296 if (error || eth_addr_equals(netdev->etheraddr, mac)) {
1299 netdev->cache_valid &= ~VALID_ETHERADDR;
1302 /* Tap devices must be brought down before setting the address. */
1303 if (is_tap_netdev(netdev_)) {
1304 update_flags(netdev, NETDEV_UP, 0, &old_flags);
1306 error = set_etheraddr(netdev_get_name(netdev_), mac);
1307 if (!error || error == ENODEV) {
1308 netdev->ether_addr_error = error;
1309 netdev->cache_valid |= VALID_ETHERADDR;
1311 netdev->etheraddr = mac;
1315 if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
1316 update_flags(netdev, 0, NETDEV_UP, &old_flags);
1320 ovs_mutex_unlock(&netdev->mutex);
1324 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1326 netdev_linux_get_etheraddr(const struct netdev *netdev_, struct eth_addr *mac)
1328 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1331 ovs_mutex_lock(&netdev->mutex);
1332 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1333 netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
1334 &netdev->etheraddr);
1335 netdev->cache_valid |= VALID_ETHERADDR;
1338 error = netdev->ether_addr_error;
1340 *mac = netdev->etheraddr;
1342 ovs_mutex_unlock(&netdev->mutex);
1348 netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
1352 if (!(netdev->cache_valid & VALID_MTU)) {
1355 netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
1356 netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
1357 netdev->mtu = ifr.ifr_mtu;
1358 netdev->cache_valid |= VALID_MTU;
1361 error = netdev->netdev_mtu_error;
1363 *mtup = netdev->mtu;
1369 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1370 * in bytes, not including the hardware header; thus, this is typically 1500
1371 * bytes for Ethernet devices. */
1373 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1375 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1378 ovs_mutex_lock(&netdev->mutex);
1379 error = netdev_linux_get_mtu__(netdev, mtup);
1380 ovs_mutex_unlock(&netdev->mutex);
1385 /* Sets the maximum size of transmitted (MTU) for given device using linux
1386 * networking ioctl interface.
1389 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1391 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1395 ovs_mutex_lock(&netdev->mutex);
1396 if (netdev->cache_valid & VALID_MTU) {
1397 error = netdev->netdev_mtu_error;
1398 if (error || netdev->mtu == mtu) {
1401 netdev->cache_valid &= ~VALID_MTU;
1404 error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
1405 SIOCSIFMTU, "SIOCSIFMTU");
1406 if (!error || error == ENODEV) {
1407 netdev->netdev_mtu_error = error;
1408 netdev->mtu = ifr.ifr_mtu;
1409 netdev->cache_valid |= VALID_MTU;
1412 ovs_mutex_unlock(&netdev->mutex);
1416 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1417 * On failure, returns a negative errno value. */
1419 netdev_linux_get_ifindex(const struct netdev *netdev_)
1421 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1424 ovs_mutex_lock(&netdev->mutex);
1425 error = get_ifindex(netdev_, &ifindex);
1426 ovs_mutex_unlock(&netdev->mutex);
1428 return error ? -error : ifindex;
1432 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1434 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1436 ovs_mutex_lock(&netdev->mutex);
1437 if (netdev->miimon_interval > 0) {
1438 *carrier = netdev->miimon;
1440 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1442 ovs_mutex_unlock(&netdev->mutex);
1447 static long long int
1448 netdev_linux_get_carrier_resets(const struct netdev *netdev_)
1450 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1451 long long int carrier_resets;
1453 ovs_mutex_lock(&netdev->mutex);
1454 carrier_resets = netdev->carrier_resets;
1455 ovs_mutex_unlock(&netdev->mutex);
1457 return carrier_resets;
1461 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1462 struct mii_ioctl_data *data)
1467 memset(&ifr, 0, sizeof ifr);
1468 memcpy(&ifr.ifr_data, data, sizeof *data);
1469 error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
1470 memcpy(data, &ifr.ifr_data, sizeof *data);
1476 netdev_linux_get_miimon(const char *name, bool *miimon)
1478 struct mii_ioctl_data data;
1483 memset(&data, 0, sizeof data);
1484 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1486 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1487 data.reg_num = MII_BMSR;
1488 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1492 *miimon = !!(data.val_out & BMSR_LSTATUS);
1494 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1497 struct ethtool_cmd ecmd;
1499 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1502 COVERAGE_INC(netdev_get_ethtool);
1503 memset(&ecmd, 0, sizeof ecmd);
1504 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1507 struct ethtool_value eval;
1509 memcpy(&eval, &ecmd, sizeof eval);
1510 *miimon = !!eval.data;
1512 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1520 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1521 long long int interval)
1523 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1525 ovs_mutex_lock(&netdev->mutex);
1526 interval = interval > 0 ? MAX(interval, 100) : 0;
1527 if (netdev->miimon_interval != interval) {
1528 if (interval && !netdev->miimon_interval) {
1529 atomic_count_inc(&miimon_cnt);
1530 } else if (!interval && netdev->miimon_interval) {
1531 atomic_count_dec(&miimon_cnt);
1534 netdev->miimon_interval = interval;
1535 timer_set_expired(&netdev->miimon_timer);
1537 ovs_mutex_unlock(&netdev->mutex);
1543 netdev_linux_miimon_run(void)
1545 struct shash device_shash;
1546 struct shash_node *node;
1548 shash_init(&device_shash);
1549 netdev_get_devices(&netdev_linux_class, &device_shash);
1550 SHASH_FOR_EACH (node, &device_shash) {
1551 struct netdev *netdev = node->data;
1552 struct netdev_linux *dev = netdev_linux_cast(netdev);
1555 ovs_mutex_lock(&dev->mutex);
1556 if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
1557 netdev_linux_get_miimon(dev->up.name, &miimon);
1558 if (miimon != dev->miimon) {
1559 dev->miimon = miimon;
1560 netdev_linux_changed(dev, dev->ifi_flags, 0);
1563 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1565 ovs_mutex_unlock(&dev->mutex);
1566 netdev_close(netdev);
1569 shash_destroy(&device_shash);
1573 netdev_linux_miimon_wait(void)
1575 struct shash device_shash;
1576 struct shash_node *node;
1578 shash_init(&device_shash);
1579 netdev_get_devices(&netdev_linux_class, &device_shash);
1580 SHASH_FOR_EACH (node, &device_shash) {
1581 struct netdev *netdev = node->data;
1582 struct netdev_linux *dev = netdev_linux_cast(netdev);
1584 ovs_mutex_lock(&dev->mutex);
1585 if (dev->miimon_interval > 0) {
1586 timer_wait(&dev->miimon_timer);
1588 ovs_mutex_unlock(&dev->mutex);
1589 netdev_close(netdev);
1591 shash_destroy(&device_shash);
/* Exchanges the values of '*a' and '*b'. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    uint64_t tmp = *a;
    *a = *b;
    *b = tmp;
}
1602 /* Copies 'src' into 'dst', performing format conversion in the process.
1604 * 'src' is allowed to be misaligned. */
1606 netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1607 const struct ovs_vport_stats *src)
1609 dst->rx_packets = get_32aligned_u64(&src->rx_packets);
1610 dst->tx_packets = get_32aligned_u64(&src->tx_packets);
1611 dst->rx_bytes = get_32aligned_u64(&src->rx_bytes);
1612 dst->tx_bytes = get_32aligned_u64(&src->tx_bytes);
1613 dst->rx_errors = get_32aligned_u64(&src->rx_errors);
1614 dst->tx_errors = get_32aligned_u64(&src->tx_errors);
1615 dst->rx_dropped = get_32aligned_u64(&src->rx_dropped);
1616 dst->tx_dropped = get_32aligned_u64(&src->tx_dropped);
1618 dst->collisions = 0;
1619 dst->rx_length_errors = 0;
1620 dst->rx_over_errors = 0;
1621 dst->rx_crc_errors = 0;
1622 dst->rx_frame_errors = 0;
1623 dst->rx_fifo_errors = 0;
1624 dst->rx_missed_errors = 0;
1625 dst->tx_aborted_errors = 0;
1626 dst->tx_carrier_errors = 0;
1627 dst->tx_fifo_errors = 0;
1628 dst->tx_heartbeat_errors = 0;
1629 dst->tx_window_errors = 0;
1633 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1635 struct dpif_netlink_vport reply;
1639 error = dpif_netlink_vport_get(netdev_get_name(netdev), &reply, &buf);
1642 } else if (!reply.stats) {
1647 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
1655 get_stats_via_vport(const struct netdev *netdev_,
1656 struct netdev_stats *stats)
1658 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1660 if (!netdev->vport_stats_error ||
1661 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1664 error = get_stats_via_vport__(netdev_, stats);
1665 if (error && error != ENOENT && error != ENODEV) {
1666 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1668 netdev_get_name(netdev_), ovs_strerror(error));
1670 netdev->vport_stats_error = error;
1671 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
1675 /* Retrieves current device stats for 'netdev-linux'. */
1677 netdev_linux_get_stats(const struct netdev *netdev_,
1678 struct netdev_stats *stats)
1680 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1681 struct netdev_stats dev_stats;
1684 ovs_mutex_lock(&netdev->mutex);
1685 get_stats_via_vport(netdev_, stats);
1686 error = get_stats_via_netlink(netdev_, &dev_stats);
1688 if (!netdev->vport_stats_error) {
1691 } else if (netdev->vport_stats_error) {
1692 /* stats not available from OVS then use netdev stats. */
1695 /* Use kernel netdev's packet and byte counts since vport's counters
1696 * do not reflect packet counts on the wire when GSO, TSO or GRO are
1698 stats->rx_packets = dev_stats.rx_packets;
1699 stats->rx_bytes = dev_stats.rx_bytes;
1700 stats->tx_packets = dev_stats.tx_packets;
1701 stats->tx_bytes = dev_stats.tx_bytes;
1703 stats->rx_errors += dev_stats.rx_errors;
1704 stats->tx_errors += dev_stats.tx_errors;
1705 stats->rx_dropped += dev_stats.rx_dropped;
1706 stats->tx_dropped += dev_stats.tx_dropped;
1707 stats->multicast += dev_stats.multicast;
1708 stats->collisions += dev_stats.collisions;
1709 stats->rx_length_errors += dev_stats.rx_length_errors;
1710 stats->rx_over_errors += dev_stats.rx_over_errors;
1711 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1712 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1713 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1714 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1715 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1716 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1717 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1718 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1719 stats->tx_window_errors += dev_stats.tx_window_errors;
1721 ovs_mutex_unlock(&netdev->mutex);
1726 /* Retrieves current device stats for 'netdev-tap' netdev or
1727 * netdev-internal. */
1729 netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
1731 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1732 struct netdev_stats dev_stats;
1735 ovs_mutex_lock(&netdev->mutex);
1736 get_stats_via_vport(netdev_, stats);
1737 error = get_stats_via_netlink(netdev_, &dev_stats);
1739 if (!netdev->vport_stats_error) {
1742 } else if (netdev->vport_stats_error) {
1743 /* Transmit and receive stats will appear to be swapped relative to the
1744 * other ports since we are the one sending the data, not a remote
1745 * computer. For consistency, we swap them back here. This does not
1746 * apply if we are getting stats from the vport layer because it always
1747 * tracks stats from the perspective of the switch. */
1750 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1751 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1752 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1753 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1754 stats->rx_length_errors = 0;
1755 stats->rx_over_errors = 0;
1756 stats->rx_crc_errors = 0;
1757 stats->rx_frame_errors = 0;
1758 stats->rx_fifo_errors = 0;
1759 stats->rx_missed_errors = 0;
1760 stats->tx_aborted_errors = 0;
1761 stats->tx_carrier_errors = 0;
1762 stats->tx_fifo_errors = 0;
1763 stats->tx_heartbeat_errors = 0;
1764 stats->tx_window_errors = 0;
1766 /* Use kernel netdev's packet and byte counts since vport counters
1767 * do not reflect packet counts on the wire when GSO, TSO or GRO
1769 stats->rx_packets = dev_stats.tx_packets;
1770 stats->rx_bytes = dev_stats.tx_bytes;
1771 stats->tx_packets = dev_stats.rx_packets;
1772 stats->tx_bytes = dev_stats.rx_bytes;
1774 stats->rx_dropped += dev_stats.tx_dropped;
1775 stats->tx_dropped += dev_stats.rx_dropped;
1777 stats->rx_errors += dev_stats.tx_errors;
1778 stats->tx_errors += dev_stats.rx_errors;
1780 stats->multicast += dev_stats.multicast;
1781 stats->collisions += dev_stats.collisions;
1783 ovs_mutex_unlock(&netdev->mutex);
1789 netdev_internal_get_stats(const struct netdev *netdev_,
1790 struct netdev_stats *stats)
1792 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1795 ovs_mutex_lock(&netdev->mutex);
1796 get_stats_via_vport(netdev_, stats);
1797 error = netdev->vport_stats_error;
1798 ovs_mutex_unlock(&netdev->mutex);
1804 netdev_linux_read_features(struct netdev_linux *netdev)
1806 struct ethtool_cmd ecmd;
1810 if (netdev->cache_valid & VALID_FEATURES) {
1814 COVERAGE_INC(netdev_get_ethtool);
1815 memset(&ecmd, 0, sizeof ecmd);
1816 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
1817 ETHTOOL_GSET, "ETHTOOL_GSET");
1822 /* Supported features. */
1823 netdev->supported = 0;
1824 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1825 netdev->supported |= NETDEV_F_10MB_HD;
1827 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1828 netdev->supported |= NETDEV_F_10MB_FD;
1830 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1831 netdev->supported |= NETDEV_F_100MB_HD;
1833 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1834 netdev->supported |= NETDEV_F_100MB_FD;
1836 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1837 netdev->supported |= NETDEV_F_1GB_HD;
1839 if ((ecmd.supported & SUPPORTED_1000baseT_Full) ||
1840 (ecmd.supported & SUPPORTED_1000baseKX_Full)) {
1841 netdev->supported |= NETDEV_F_1GB_FD;
1843 if ((ecmd.supported & SUPPORTED_10000baseT_Full) ||
1844 (ecmd.supported & SUPPORTED_10000baseKX4_Full) ||
1845 (ecmd.supported & SUPPORTED_10000baseKR_Full) ||
1846 (ecmd.supported & SUPPORTED_10000baseR_FEC)) {
1847 netdev->supported |= NETDEV_F_10GB_FD;
1849 if ((ecmd.supported & SUPPORTED_40000baseKR4_Full) ||
1850 (ecmd.supported & SUPPORTED_40000baseCR4_Full) ||
1851 (ecmd.supported & SUPPORTED_40000baseSR4_Full) ||
1852 (ecmd.supported & SUPPORTED_40000baseLR4_Full)) {
1853 netdev->supported |= NETDEV_F_40GB_FD;
1855 if (ecmd.supported & SUPPORTED_TP) {
1856 netdev->supported |= NETDEV_F_COPPER;
1858 if (ecmd.supported & SUPPORTED_FIBRE) {
1859 netdev->supported |= NETDEV_F_FIBER;
1861 if (ecmd.supported & SUPPORTED_Autoneg) {
1862 netdev->supported |= NETDEV_F_AUTONEG;
1864 if (ecmd.supported & SUPPORTED_Pause) {
1865 netdev->supported |= NETDEV_F_PAUSE;
1867 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1868 netdev->supported |= NETDEV_F_PAUSE_ASYM;
1871 /* Advertised features. */
1872 netdev->advertised = 0;
1873 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1874 netdev->advertised |= NETDEV_F_10MB_HD;
1876 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1877 netdev->advertised |= NETDEV_F_10MB_FD;
1879 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1880 netdev->advertised |= NETDEV_F_100MB_HD;
1882 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1883 netdev->advertised |= NETDEV_F_100MB_FD;
1885 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1886 netdev->advertised |= NETDEV_F_1GB_HD;
1888 if ((ecmd.advertising & ADVERTISED_1000baseT_Full) ||
1889 (ecmd.advertising & ADVERTISED_1000baseKX_Full)) {
1890 netdev->advertised |= NETDEV_F_1GB_FD;
1892 if ((ecmd.advertising & ADVERTISED_10000baseT_Full) ||
1893 (ecmd.advertising & ADVERTISED_10000baseKX4_Full) ||
1894 (ecmd.advertising & ADVERTISED_10000baseKR_Full) ||
1895 (ecmd.advertising & ADVERTISED_10000baseR_FEC)) {
1896 netdev->advertised |= NETDEV_F_10GB_FD;
1898 if ((ecmd.advertising & ADVERTISED_40000baseKR4_Full) ||
1899 (ecmd.advertising & ADVERTISED_40000baseCR4_Full) ||
1900 (ecmd.advertising & ADVERTISED_40000baseSR4_Full) ||
1901 (ecmd.advertising & ADVERTISED_40000baseLR4_Full)) {
1902 netdev->advertised |= NETDEV_F_40GB_FD;
1904 if (ecmd.advertising & ADVERTISED_TP) {
1905 netdev->advertised |= NETDEV_F_COPPER;
1907 if (ecmd.advertising & ADVERTISED_FIBRE) {
1908 netdev->advertised |= NETDEV_F_FIBER;
1910 if (ecmd.advertising & ADVERTISED_Autoneg) {
1911 netdev->advertised |= NETDEV_F_AUTONEG;
1913 if (ecmd.advertising & ADVERTISED_Pause) {
1914 netdev->advertised |= NETDEV_F_PAUSE;
1916 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1917 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
1920 /* Current settings. */
1921 speed = ethtool_cmd_speed(&ecmd);
1922 if (speed == SPEED_10) {
1923 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1924 } else if (speed == SPEED_100) {
1925 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1926 } else if (speed == SPEED_1000) {
1927 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1928 } else if (speed == SPEED_10000) {
1929 netdev->current = NETDEV_F_10GB_FD;
1930 } else if (speed == 40000) {
1931 netdev->current = NETDEV_F_40GB_FD;
1932 } else if (speed == 100000) {
1933 netdev->current = NETDEV_F_100GB_FD;
1934 } else if (speed == 1000000) {
1935 netdev->current = NETDEV_F_1TB_FD;
1937 netdev->current = 0;
1940 if (ecmd.port == PORT_TP) {
1941 netdev->current |= NETDEV_F_COPPER;
1942 } else if (ecmd.port == PORT_FIBRE) {
1943 netdev->current |= NETDEV_F_FIBER;
1947 netdev->current |= NETDEV_F_AUTONEG;
1951 netdev->cache_valid |= VALID_FEATURES;
1952 netdev->get_features_error = error;
1955 /* Stores the features supported by 'netdev' into of '*current', '*advertised',
1956 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
1957 * Returns 0 if successful, otherwise a positive errno value. */
1959 netdev_linux_get_features(const struct netdev *netdev_,
1960 enum netdev_features *current,
1961 enum netdev_features *advertised,
1962 enum netdev_features *supported,
1963 enum netdev_features *peer)
1965 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1968 ovs_mutex_lock(&netdev->mutex);
1969 netdev_linux_read_features(netdev);
1970 if (!netdev->get_features_error) {
1971 *current = netdev->current;
1972 *advertised = netdev->advertised;
1973 *supported = netdev->supported;
1974 *peer = 0; /* XXX */
1976 error = netdev->get_features_error;
1977 ovs_mutex_unlock(&netdev->mutex);
1982 /* Set the features advertised by 'netdev' to 'advertise'. */
1984 netdev_linux_set_advertisements(struct netdev *netdev_,
1985 enum netdev_features advertise)
1987 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1988 struct ethtool_cmd ecmd;
1991 ovs_mutex_lock(&netdev->mutex);
1993 COVERAGE_INC(netdev_get_ethtool);
1994 memset(&ecmd, 0, sizeof ecmd);
1995 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1996 ETHTOOL_GSET, "ETHTOOL_GSET");
2001 ecmd.advertising = 0;
2002 if (advertise & NETDEV_F_10MB_HD) {
2003 ecmd.advertising |= ADVERTISED_10baseT_Half;
2005 if (advertise & NETDEV_F_10MB_FD) {
2006 ecmd.advertising |= ADVERTISED_10baseT_Full;
2008 if (advertise & NETDEV_F_100MB_HD) {
2009 ecmd.advertising |= ADVERTISED_100baseT_Half;
2011 if (advertise & NETDEV_F_100MB_FD) {
2012 ecmd.advertising |= ADVERTISED_100baseT_Full;
2014 if (advertise & NETDEV_F_1GB_HD) {
2015 ecmd.advertising |= ADVERTISED_1000baseT_Half;
2017 if (advertise & NETDEV_F_1GB_FD) {
2018 ecmd.advertising |= ADVERTISED_1000baseT_Full;
2020 if (advertise & NETDEV_F_10GB_FD) {
2021 ecmd.advertising |= ADVERTISED_10000baseT_Full;
2023 if (advertise & NETDEV_F_COPPER) {
2024 ecmd.advertising |= ADVERTISED_TP;
2026 if (advertise & NETDEV_F_FIBER) {
2027 ecmd.advertising |= ADVERTISED_FIBRE;
2029 if (advertise & NETDEV_F_AUTONEG) {
2030 ecmd.advertising |= ADVERTISED_Autoneg;
2032 if (advertise & NETDEV_F_PAUSE) {
2033 ecmd.advertising |= ADVERTISED_Pause;
2035 if (advertise & NETDEV_F_PAUSE_ASYM) {
2036 ecmd.advertising |= ADVERTISED_Asym_Pause;
2038 COVERAGE_INC(netdev_set_ethtool);
2039 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
2040 ETHTOOL_SSET, "ETHTOOL_SSET");
2043 ovs_mutex_unlock(&netdev->mutex);
2047 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
2048 * successful, otherwise a positive errno value. */
2050 netdev_linux_set_policing(struct netdev *netdev_,
2051 uint32_t kbits_rate, uint32_t kbits_burst)
2053 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2054 const char *netdev_name = netdev_get_name(netdev_);
2057 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
2058 : !kbits_burst ? 8000 /* Default to 8000 kbits if 0. */
2059 : kbits_burst); /* Stick with user-specified value. */
2061 ovs_mutex_lock(&netdev->mutex);
2062 if (netdev->cache_valid & VALID_POLICING) {
2063 error = netdev->netdev_policing_error;
2064 if (error || (netdev->kbits_rate == kbits_rate &&
2065 netdev->kbits_burst == kbits_burst)) {
2066 /* Assume that settings haven't changed since we last set them. */
2069 netdev->cache_valid &= ~VALID_POLICING;
2072 COVERAGE_INC(netdev_set_policing);
2073 /* Remove any existing ingress qdisc. */
2074 error = tc_add_del_ingress_qdisc(netdev_, false);
2076 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
2077 netdev_name, ovs_strerror(error));
2082 error = tc_add_del_ingress_qdisc(netdev_, true);
2084 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
2085 netdev_name, ovs_strerror(error));
2089 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
2091 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
2092 netdev_name, ovs_strerror(error));
2097 netdev->kbits_rate = kbits_rate;
2098 netdev->kbits_burst = kbits_burst;
2101 if (!error || error == ENODEV) {
2102 netdev->netdev_policing_error = error;
2103 netdev->cache_valid |= VALID_POLICING;
2105 ovs_mutex_unlock(&netdev->mutex);
2110 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
2113 const struct tc_ops *const *opsp;
2114 for (opsp = tcs; *opsp != NULL; opsp++) {
2115 const struct tc_ops *ops = *opsp;
2116 if (ops->tc_install && ops->ovs_name[0] != '\0') {
2117 sset_add(types, ops->ovs_name);
2123 static const struct tc_ops *
2124 tc_lookup_ovs_name(const char *name)
2126 const struct tc_ops *const *opsp;
2128 for (opsp = tcs; *opsp != NULL; opsp++) {
2129 const struct tc_ops *ops = *opsp;
2130 if (!strcmp(name, ops->ovs_name)) {
2137 static const struct tc_ops *
2138 tc_lookup_linux_name(const char *name)
2140 const struct tc_ops *const *opsp;
2142 for (opsp = tcs; *opsp != NULL; opsp++) {
2143 const struct tc_ops *ops = *opsp;
2144 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
2151 static struct tc_queue *
2152 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
2155 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2156 struct tc_queue *queue;
2158 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
2159 if (queue->queue_id == queue_id) {
/* Convenience wrapper around tc_find_queue__() that hashes 'queue_id'. */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
}
2173 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
2175 struct netdev_qos_capabilities *caps)
2177 const struct tc_ops *ops = tc_lookup_ovs_name(type);
2181 caps->n_queues = ops->n_queues;
2186 netdev_linux_get_qos(const struct netdev *netdev_,
2187 const char **typep, struct smap *details)
2189 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2192 ovs_mutex_lock(&netdev->mutex);
2193 error = tc_query_qdisc(netdev_);
2195 *typep = netdev->tc->ops->ovs_name;
2196 error = (netdev->tc->ops->qdisc_get
2197 ? netdev->tc->ops->qdisc_get(netdev_, details)
2200 ovs_mutex_unlock(&netdev->mutex);
2206 netdev_linux_set_qos(struct netdev *netdev_,
2207 const char *type, const struct smap *details)
2209 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2210 const struct tc_ops *new_ops;
2213 new_ops = tc_lookup_ovs_name(type);
2214 if (!new_ops || !new_ops->tc_install) {
2218 if (new_ops == &tc_ops_noop) {
2219 return new_ops->tc_install(netdev_, details);
2222 ovs_mutex_lock(&netdev->mutex);
2223 error = tc_query_qdisc(netdev_);
2228 if (new_ops == netdev->tc->ops) {
2229 error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
2231 /* Delete existing qdisc. */
2232 error = tc_del_qdisc(netdev_);
2236 ovs_assert(netdev->tc == NULL);
2238 /* Install new qdisc. */
2239 error = new_ops->tc_install(netdev_, details);
2240 ovs_assert((error == 0) == (netdev->tc != NULL));
2244 ovs_mutex_unlock(&netdev->mutex);
2249 netdev_linux_get_queue(const struct netdev *netdev_,
2250 unsigned int queue_id, struct smap *details)
2252 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2255 ovs_mutex_lock(&netdev->mutex);
2256 error = tc_query_qdisc(netdev_);
2258 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2260 ? netdev->tc->ops->class_get(netdev_, queue, details)
2263 ovs_mutex_unlock(&netdev->mutex);
2269 netdev_linux_set_queue(struct netdev *netdev_,
2270 unsigned int queue_id, const struct smap *details)
2272 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2275 ovs_mutex_lock(&netdev->mutex);
2276 error = tc_query_qdisc(netdev_);
2278 error = (queue_id < netdev->tc->ops->n_queues
2279 && netdev->tc->ops->class_set
2280 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
2283 ovs_mutex_unlock(&netdev->mutex);
2289 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
2291 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2294 ovs_mutex_lock(&netdev->mutex);
2295 error = tc_query_qdisc(netdev_);
2297 if (netdev->tc->ops->class_delete) {
2298 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2300 ? netdev->tc->ops->class_delete(netdev_, queue)
2306 ovs_mutex_unlock(&netdev->mutex);
2312 netdev_linux_get_queue_stats(const struct netdev *netdev_,
2313 unsigned int queue_id,
2314 struct netdev_queue_stats *stats)
2316 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2319 ovs_mutex_lock(&netdev->mutex);
2320 error = tc_query_qdisc(netdev_);
2322 if (netdev->tc->ops->class_get_stats) {
2323 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2325 stats->created = queue->created;
2326 error = netdev->tc->ops->class_get_stats(netdev_, queue,
2335 ovs_mutex_unlock(&netdev->mutex);
2340 struct queue_dump_state {
2341 struct nl_dump dump;
2346 start_queue_dump(const struct netdev *netdev, struct queue_dump_state *state)
2348 struct ofpbuf request;
2349 struct tcmsg *tcmsg;
2351 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
2355 tcmsg->tcm_parent = 0;
2356 nl_dump_start(&state->dump, NETLINK_ROUTE, &request);
2357 ofpbuf_uninit(&request);
2359 ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
2364 finish_queue_dump(struct queue_dump_state *state)
2366 ofpbuf_uninit(&state->buf);
2367 return nl_dump_done(&state->dump);
2370 struct netdev_linux_queue_state {
2371 unsigned int *queues;
2377 netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
2379 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2382 ovs_mutex_lock(&netdev->mutex);
2383 error = tc_query_qdisc(netdev_);
2385 if (netdev->tc->ops->class_get) {
2386 struct netdev_linux_queue_state *state;
2387 struct tc_queue *queue;
2390 *statep = state = xmalloc(sizeof *state);
2391 state->n_queues = hmap_count(&netdev->tc->queues);
2392 state->cur_queue = 0;
2393 state->queues = xmalloc(state->n_queues * sizeof *state->queues);
2396 HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
2397 state->queues[i++] = queue->queue_id;
2403 ovs_mutex_unlock(&netdev->mutex);
/* Advances the dump started by netdev_linux_queue_dump_start(): finds the
 * next snapshotted queue that still exists, stores its ID in '*queue_idp'
 * and its configuration in 'details' via the tc class_get hook.  Queues
 * deleted since the snapshot are skipped. */
2409 netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
2410 unsigned int *queue_idp, struct smap *details)
2412 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2413 struct netdev_linux_queue_state *state = state_;
2416 ovs_mutex_lock(&netdev->mutex);
2417 while (state->cur_queue < state->n_queues) {
2418 unsigned int queue_id = state->queues[state->cur_queue++];
2419 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2422 *queue_idp = queue_id;
2423 error = netdev->tc->ops->class_get(netdev_, queue, details);
2427 ovs_mutex_unlock(&netdev->mutex);
/* Releases the iterator state allocated by netdev_linux_queue_dump_start(). */
2433 netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
2436 struct netdev_linux_queue_state *state = state_;
2438 free(state->queues);
/* Dumps per-queue statistics for 'netdev_' by walking a kernel RTM_GETTCLASS
 * dump and invoking 'cb' (with 'aux') for each class, via the tc ops'
 * class_dump_stats hook. */
2444 netdev_linux_dump_queue_stats(const struct netdev *netdev_,
2445 netdev_dump_queue_stats_cb *cb, void *aux)
2447 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2450 ovs_mutex_lock(&netdev->mutex);
2451 error = tc_query_qdisc(netdev_);
2453 struct queue_dump_state state;
2455 if (!netdev->tc->ops->class_dump_stats) {
2457 } else if (!start_queue_dump(netdev_, &state)) {
2463 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
2464 retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
2471 retval = finish_queue_dump(&state);
2477 ovs_mutex_unlock(&netdev->mutex);
/* Assigns IPv4 'address'/'netmask' to 'netdev_' using SIOCSIFADDR and
 * SIOCSIFNETMASK ioctls.  The netmask is only set when the address is
 * nonzero (setting INADDR_ANY effectively clears the address). */
2483 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2484 struct in_addr netmask)
2486 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2489 ovs_mutex_lock(&netdev->mutex);
2490 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2492 if (address.s_addr != INADDR_ANY) {
2493 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2494 "SIOCSIFNETMASK", netmask);
2498 ovs_mutex_unlock(&netdev->mutex);
2503 /* Retrieves the IP addresses assigned to 'netdev_' into freshly allocated
2504  * arrays '*addr' and '*mask', with the element count stored in '*n_cnt',
/* Thin wrapper over netdev_get_addrs(), serialized on the device mutex;
 * ownership of the '*addr'/'*mask' arrays transfers to the caller. */
2507 netdev_linux_get_addr_list(const struct netdev *netdev_,
2508 struct in6_addr **addr, struct in6_addr **mask, int *n_cnt)
2510 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2513 ovs_mutex_lock(&netdev->mutex);
2514 error = netdev_get_addrs(netdev_get_name(netdev_), addr, mask, n_cnt);
2515 ovs_mutex_unlock(&netdev->mutex);
/* Fills generic 'sa' with an AF_INET sockaddr containing 'addr', zeroing any
 * trailing bytes of the larger struct sockaddr. */
2521 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
2523 struct sockaddr_in sin;
2524 memset(&sin, 0, sizeof sin);
2525 sin.sin_family = AF_INET;
2526 sin.sin_addr = addr;
2529 memset(sa, 0, sizeof *sa);
2530 memcpy(sa, &sin, sizeof sin);
/* Issues address-setting ioctl 'ioctl_nr' ('ioctl_name' is used only for
 * logging) on 'netdev' with 'addr' as the argument. */
2534 do_set_addr(struct netdev *netdev,
2535 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2539 make_in4_sockaddr(&ifr.ifr_addr, addr);
2540 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2544 /* Adds 'router' as a default IP gateway. */
2546 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2548 struct in_addr any = { INADDR_ANY };
/* Build a default route (dst/genmask 0.0.0.0) through 'router' and install
 * it with the legacy SIOCADDRT routing ioctl. */
2552 memset(&rt, 0, sizeof rt);
2553 make_in4_sockaddr(&rt.rt_dst, any);
2554 make_in4_sockaddr(&rt.rt_gateway, router);
2555 make_in4_sockaddr(&rt.rt_genmask, any);
2556 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2557 error = af_inet_ioctl(SIOCADDRT, &rt);
2559 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
/* Determines the next hop toward 'host' by parsing /proc/net/route: picks a
 * matching, up route; stores the gateway (or 0 for a directly reachable
 * host) in '*next_hop' and a malloc'd output interface name in
 * '*netdev_name' (caller frees). */
2565 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2568 static const char fn[] = "/proc/net/route";
2573 *netdev_name = NULL;
2574 stream = fopen(fn, "r");
2575 if (stream == NULL) {
2576 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
2581 while (fgets(line, sizeof line, stream)) {
2584 ovs_be32 dest, gateway, mask;
2585 int refcnt, metric, mtu;
2586 unsigned int flags, use, window, irtt;
2589 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2591 iface, &dest, &gateway, &flags, &refcnt,
2592 &use, &metric, &mask, &mtu, &window, &irtt)) {
2593 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2597 if (!(flags & RTF_UP)) {
2598 /* Skip routes that aren't up. */
2602 /* The output of 'dest', 'mask', and 'gateway' were given in
2603 * network byte order, so we don't need any endian
2604 * conversions here. */
2605 if ((dest & mask) == (host->s_addr & mask)) {
2607 /* The host is directly reachable. */
2608 next_hop->s_addr = 0;
2610 /* To reach the host, we must go through a gateway. */
2611 next_hop->s_addr = gateway;
2613 *netdev_name = xstrdup(iface);
/* Populates 'smap' with driver name/version/firmware info from ethtool
 * ETHTOOL_GDRVINFO, caching the result under VALID_DRVINFO so the ioctl is
 * issued at most once per cache invalidation. */
2625 netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
2627 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2630 ovs_mutex_lock(&netdev->mutex);
2631 if (!(netdev->cache_valid & VALID_DRVINFO)) {
2632 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
2634 COVERAGE_INC(netdev_get_ethtool);
2635 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2636 error = netdev_linux_do_ethtool(netdev->up.name,
2639 "ETHTOOL_GDRVINFO");
2641 netdev->cache_valid |= VALID_DRVINFO;
2646 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2647 smap_add(smap, "driver_version", netdev->drvinfo.version);
2648 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
2650 ovs_mutex_unlock(&netdev->mutex);
/* Status hook for internal devices: reports a fixed driver name. */
2656 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2659 smap_add(smap, "driver_name", "openvswitch");
2663 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2664 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2665 * returns 0. Otherwise, it returns a positive errno value; in particular,
2666 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
2668 netdev_linux_arp_lookup(const struct netdev *netdev,
2669 ovs_be32 ip, struct eth_addr *mac)
2672 struct sockaddr_in sin;
/* Build the SIOCGARP request: protocol address is 'ip', hardware family is
 * Ethernet, and the query is scoped to this device's name. */
2675 memset(&r, 0, sizeof r);
2676 memset(&sin, 0, sizeof sin);
2677 sin.sin_family = AF_INET;
2678 sin.sin_addr.s_addr = ip;
2680 memcpy(&r.arp_pa, &sin, sizeof sin);
2681 r.arp_ha.sa_family = ARPHRD_ETHER;
2683 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2684 COVERAGE_INC(netdev_arp_lookup);
2685 retval = af_inet_ioctl(SIOCGARP, &r);
2687 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
/* ENXIO ("no such entry") is an expected outcome, so only warn on other
 * errors. */
2688 } else if (retval != ENXIO) {
2689 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2690 netdev_get_name(netdev), IP_ARGS(ip),
2691 ovs_strerror(retval));
/* Converts netdev flag bits (NETDEV_*) into kernel interface flag bits
 * (IFF_*). */
2697 nd_to_iff_flags(enum netdev_flags nd)
2700 if (nd & NETDEV_UP) {
2703 if (nd & NETDEV_PROMISC) {
2706 if (nd & NETDEV_LOOPBACK) {
2707 iff |= IFF_LOOPBACK;
/* Inverse of nd_to_iff_flags(): maps IFF_* bits to NETDEV_* bits. */
2713 iff_to_nd_flags(int iff)
2715 enum netdev_flags nd = 0;
2719 if (iff & IFF_PROMISC) {
2720 nd |= NETDEV_PROMISC;
2722 if (iff & IFF_LOOPBACK) {
2723 nd |= NETDEV_LOOPBACK;
/* Turns off the flags in 'off' and turns on those in 'on' for 'netdev',
 * reporting the previous flag set in '*old_flagsp'.  Caller must hold the
 * device mutex (enforced by OVS_REQUIRES). */
2729 update_flags(struct netdev_linux *netdev, enum netdev_flags off,
2730 enum netdev_flags on, enum netdev_flags *old_flagsp)
2731 OVS_REQUIRES(netdev->mutex)
2733 int old_flags, new_flags;
2736 old_flags = netdev->ifi_flags;
2737 *old_flagsp = iff_to_nd_flags(old_flags);
2738 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
/* Only touch the kernel when something actually changed; re-read the
 * flags afterward so the cached ifi_flags reflect reality. */
2739 if (new_flags != old_flags) {
2740 error = set_flags(netdev_get_name(&netdev->up), new_flags);
2741 get_flags(&netdev->up, &netdev->ifi_flags);
/* Public netdev-provider entry point: locking wrapper for update_flags(). */
2748 netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2749 enum netdev_flags on, enum netdev_flags *old_flagsp)
2751 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2754 ovs_mutex_lock(&netdev->mutex);
2755 error = update_flags(netdev, off, on, old_flagsp);
2756 ovs_mutex_unlock(&netdev->mutex);
/* Expands to a struct netdev_class initializer shared by the linux, tap, and
 * internal device classes.  Only the construct, get-stats, get-features, and
 * get-status hooks vary between them; everything else is common.  Entries
 * must stay in the order declared by struct netdev_class. */
2761 #define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS, \
2762 GET_FEATURES, GET_STATUS) \
2765 false, /* is_pmd */ \
2769 netdev_linux_wait, \
2771 netdev_linux_alloc, \
2773 netdev_linux_destruct, \
2774 netdev_linux_dealloc, \
2775 NULL, /* get_config */ \
2776 NULL, /* set_config */ \
2777 NULL, /* get_tunnel_config */ \
2778 NULL, /* build header */ \
2779 NULL, /* push header */ \
2780 NULL, /* pop header */ \
2781 NULL, /* get_numa_id */ \
2782 NULL, /* set_tx_multiq */ \
2784 netdev_linux_send, \
2785 netdev_linux_send_wait, \
2787 netdev_linux_set_etheraddr, \
2788 netdev_linux_get_etheraddr, \
2789 netdev_linux_get_mtu, \
2790 netdev_linux_set_mtu, \
2791 netdev_linux_get_ifindex, \
2792 netdev_linux_get_carrier, \
2793 netdev_linux_get_carrier_resets, \
2794 netdev_linux_set_miimon_interval, \
2798 netdev_linux_set_advertisements, \
2800 netdev_linux_set_policing, \
2801 netdev_linux_get_qos_types, \
2802 netdev_linux_get_qos_capabilities, \
2803 netdev_linux_get_qos, \
2804 netdev_linux_set_qos, \
2805 netdev_linux_get_queue, \
2806 netdev_linux_set_queue, \
2807 netdev_linux_delete_queue, \
2808 netdev_linux_get_queue_stats, \
2809 netdev_linux_queue_dump_start, \
2810 netdev_linux_queue_dump_next, \
2811 netdev_linux_queue_dump_done, \
2812 netdev_linux_dump_queue_stats, \
2814 netdev_linux_set_in4, \
2815 netdev_linux_get_addr_list, \
2816 netdev_linux_add_router, \
2817 netdev_linux_get_next_hop, \
2819 netdev_linux_arp_lookup, \
2821 netdev_linux_update_flags, \
2822 NULL, /* reconfigure */ \
2824 netdev_linux_rxq_alloc, \
2825 netdev_linux_rxq_construct, \
2826 netdev_linux_rxq_destruct, \
2827 netdev_linux_rxq_dealloc, \
2828 netdev_linux_rxq_recv, \
2829 netdev_linux_rxq_wait, \
2830 netdev_linux_rxq_drain, \
/* The three concrete netdev classes built from NETDEV_LINUX_CLASS: ordinary
 * "system" devices, tap devices, and OVS-internal devices.  They differ only
 * in their construct/stats/features/status hooks. */
2833 const struct netdev_class netdev_linux_class =
2836 netdev_linux_construct,
2837 netdev_linux_get_stats,
2838 netdev_linux_get_features,
2839 netdev_linux_get_status);
2841 const struct netdev_class netdev_tap_class =
2844 netdev_linux_construct_tap,
2845 netdev_tap_get_stats,
2846 netdev_linux_get_features,
2847 netdev_linux_get_status);
2849 const struct netdev_class netdev_internal_class =
2852 netdev_linux_construct,
2853 netdev_internal_get_stats,
2854 NULL, /* get_features */
2855 netdev_internal_get_status);
/* CoDel traffic control class: a classless qdisc, hence zero queues. */
2858 #define CODEL_N_QUEUES 0x0000
2860 /* In sufficiently new kernel headers these are defined as enums in
2861 * <linux/pkt_sched.h>. Define them here as macros to help out with older
2862 * kernels. (This overrides any enum definition in the header file but that's
2864 #define TCA_CODEL_TARGET 1
2865 #define TCA_CODEL_LIMIT 2
2866 #define TCA_CODEL_INTERVAL 3
/* Returns the struct codel embedded in 'netdev_'s current tc state.  Caller
 * must know the device's qdisc is CoDel. */
2875 static struct codel *
2876 codel_get__(const struct netdev *netdev_)
2878 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2879 return CONTAINER_OF(netdev->tc, struct codel, tc);
/* Records in OVS's in-memory state that 'netdev_' uses a CoDel qdisc with the
 * given parameters.  Does not talk to the kernel. */
2883 codel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
2886 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2887 struct codel *codel;
2889 codel = xmalloc(sizeof *codel);
2890 tc_init(&codel->tc, &tc_ops_codel);
2891 codel->target = target;
2892 codel->limit = limit;
2893 codel->interval = interval;
2895 netdev->tc = &codel->tc;
/* Replaces 'netdev's root qdisc with a "codel" qdisc configured via
 * RTM_NEWQDISC.  Zero-valued parameters fall back to defaults (target 5000,
 * limit 10240, interval 100000). */
2899 codel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
2903 struct ofpbuf request;
2904 struct tcmsg *tcmsg;
2905 uint32_t otarget, olimit, ointerval;
2908 tc_del_qdisc(netdev);
2910 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2911 NLM_F_EXCL | NLM_F_CREATE, &request);
2915 tcmsg->tcm_handle = tc_make_handle(1, 0);
2916 tcmsg->tcm_parent = TC_H_ROOT;
2918 otarget = target ? target : 5000;
2919 olimit = limit ? limit : 10240;
2920 ointerval = interval ? interval : 100000;
2922 nl_msg_put_string(&request, TCA_KIND, "codel");
2923 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2924 nl_msg_put_u32(&request, TCA_CODEL_TARGET, otarget);
2925 nl_msg_put_u32(&request, TCA_CODEL_LIMIT, olimit);
2926 nl_msg_put_u32(&request, TCA_CODEL_INTERVAL, ointerval);
2927 nl_msg_end_nested(&request, opt_offset);
2929 error = tc_transact(&request, NULL);
2931 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
2932 "target %u, limit %u, interval %u error %d(%s)",
2933 netdev_get_name(netdev),
2934 otarget, olimit, ointerval,
2935 error, ovs_strerror(error));
/* Parses "target"/"limit"/"interval" from 'details' into 'codel', applying
 * the same defaults as codel_setup_qdisc__() when a key is missing or 0. */
2941 codel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
2942 const struct smap *details, struct codel *codel)
2944 const char *target_s;
2945 const char *limit_s;
2946 const char *interval_s;
2948 target_s = smap_get(details, "target");
2949 limit_s = smap_get(details, "limit");
2950 interval_s = smap_get(details, "interval");
2952 codel->target = target_s ? strtoull(target_s, NULL, 10) : 0;
2953 codel->limit = limit_s ? strtoull(limit_s, NULL, 10) : 0;
2954 codel->interval = interval_s ? strtoull(interval_s, NULL, 10) : 0;
2956 if (!codel->target) {
2957 codel->target = 5000;
2959 if (!codel->limit) {
2960 codel->limit = 10240;
2962 if (!codel->interval) {
2963 codel->interval = 100000;
/* tc_install hook: configures a CoDel qdisc in the kernel from 'details',
 * then mirrors it into OVS state on success. */
2968 codel_tc_install(struct netdev *netdev, const struct smap *details)
2973 codel_parse_qdisc_details__(netdev, details, &codel);
2974 error = codel_setup_qdisc__(netdev, codel.target, codel.limit,
2977 codel_install__(netdev, codel.target, codel.limit, codel.interval);
/* Parses the nested TCA_OPTIONS attributes of a kernel "codel" qdisc into
 * 'codel'.  All three attributes are required by the policy. */
2983 codel_parse_tca_options__(struct nlattr *nl_options, struct codel *codel)
2985 static const struct nl_policy tca_codel_policy[] = {
2986 [TCA_CODEL_TARGET] = { .type = NL_A_U32 },
2987 [TCA_CODEL_LIMIT] = { .type = NL_A_U32 },
2988 [TCA_CODEL_INTERVAL] = { .type = NL_A_U32 }
2991 struct nlattr *attrs[ARRAY_SIZE(tca_codel_policy)];
2993 if (!nl_parse_nested(nl_options, tca_codel_policy,
2994 attrs, ARRAY_SIZE(tca_codel_policy))) {
2995 VLOG_WARN_RL(&rl, "failed to parse CoDel class options");
2999 codel->target = nl_attr_get_u32(attrs[TCA_CODEL_TARGET]);
3000 codel->limit = nl_attr_get_u32(attrs[TCA_CODEL_LIMIT]);
3001 codel->interval = nl_attr_get_u32(attrs[TCA_CODEL_INTERVAL]);
/* tc_load hook: reconstructs OVS state for a CoDel qdisc that already exists
 * on the device, from the netlink message in 'nlmsg'. */
3006 codel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3008 struct nlattr *nlattr;
3013 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3018 error = codel_parse_tca_options__(nlattr, &codel);
3023 codel_install__(netdev, codel.target, codel.limit, codel.interval);
/* tc_destroy hook: frees the struct codel allocated by codel_install__(). */
3029 codel_tc_destroy(struct tc *tc)
3031 struct codel *codel = CONTAINER_OF(tc, struct codel, tc);
/* qdisc_get hook: reports the cached CoDel parameters into 'details'. */
3037 codel_qdisc_get(const struct netdev *netdev, struct smap *details)
3039 const struct codel *codel = codel_get__(netdev);
3040 smap_add_format(details, "target", "%u", codel->target);
3041 smap_add_format(details, "limit", "%u", codel->limit);
3042 smap_add_format(details, "interval", "%u", codel->interval);
/* qdisc_set hook: re-parses 'details' and updates both the in-memory record
 * and the cached parameter copies. */
3047 codel_qdisc_set(struct netdev *netdev, const struct smap *details)
3051 codel_parse_qdisc_details__(netdev, details, &codel);
3052 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3053 codel_get__(netdev)->target = codel.target;
3054 codel_get__(netdev)->limit = codel.limit;
3055 codel_get__(netdev)->interval = codel.interval;
/* Operations vtable binding the codel_* hooks to the "linux-codel" QoS type. */
3059 static const struct tc_ops tc_ops_codel = {
3060 "codel", /* linux_name */
3061 "linux-codel", /* ovs_name */
3062 CODEL_N_QUEUES, /* n_queues */
3075 /* FQ-CoDel traffic control class. */
3077 #define FQCODEL_N_QUEUES 0x0000
3079 /* In sufficiently new kernel headers these are defined as enums in
3080 * <linux/pkt_sched.h>. Define them here as macros to help out with older
3081 * kernels. (This overrides any enum definition in the header file but that's
3083 #define TCA_FQ_CODEL_TARGET 1
3084 #define TCA_FQ_CODEL_LIMIT 2
3085 #define TCA_FQ_CODEL_INTERVAL 3
3086 #define TCA_FQ_CODEL_ECN 4
3087 #define TCA_FQ_CODEL_FLOWS 5
3088 #define TCA_FQ_CODEL_QUANTUM 6
/* Returns the struct fqcodel embedded in 'netdev_'s tc state; only valid when
 * the device's qdisc is FQ-CoDel. */
3099 static struct fqcodel *
3100 fqcodel_get__(const struct netdev *netdev_)
3102 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3103 return CONTAINER_OF(netdev->tc, struct fqcodel, tc);
/* Records in OVS's in-memory state that 'netdev_' uses an FQ-CoDel qdisc with
 * the given parameters.  Does not touch the kernel. */
3107 fqcodel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
3108 uint32_t interval, uint32_t flows, uint32_t quantum)
3110 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3111 struct fqcodel *fqcodel;
3113 fqcodel = xmalloc(sizeof *fqcodel);
3114 tc_init(&fqcodel->tc, &tc_ops_fqcodel);
3115 fqcodel->target = target;
3116 fqcodel->limit = limit;
3117 fqcodel->interval = interval;
3118 fqcodel->flows = flows;
3119 fqcodel->quantum = quantum;
3121 netdev->tc = &fqcodel->tc;
/* Replaces 'netdev's root qdisc with an "fq_codel" qdisc via RTM_NEWQDISC.
 * Zero-valued parameters fall back to defaults. */
3125 fqcodel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
3126 uint32_t interval, uint32_t flows, uint32_t quantum)
3129 struct ofpbuf request;
3130 struct tcmsg *tcmsg;
3131 uint32_t otarget, olimit, ointerval, oflows, oquantum;
3134 tc_del_qdisc(netdev);
3136 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3137 NLM_F_EXCL | NLM_F_CREATE, &request);
3141 tcmsg->tcm_handle = tc_make_handle(1, 0);
3142 tcmsg->tcm_parent = TC_H_ROOT;
3144 otarget = target ? target : 5000;
3145 olimit = limit ? limit : 10240;
3146 ointerval = interval ? interval : 100000;
3147 oflows = flows ? flows : 1024;
3148 oquantum = quantum ? quantum : 1514; /* fq_codel default quantum is 1514
3151 nl_msg_put_string(&request, TCA_KIND, "fq_codel");
3152 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3153 nl_msg_put_u32(&request, TCA_FQ_CODEL_TARGET, otarget);
3154 nl_msg_put_u32(&request, TCA_FQ_CODEL_LIMIT, olimit);
3155 nl_msg_put_u32(&request, TCA_FQ_CODEL_INTERVAL, ointerval);
3156 nl_msg_put_u32(&request, TCA_FQ_CODEL_FLOWS, oflows);
3157 nl_msg_put_u32(&request, TCA_FQ_CODEL_QUANTUM, oquantum);
3158 nl_msg_end_nested(&request, opt_offset);
3160 error = tc_transact(&request, NULL);
3162 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3163 "target %u, limit %u, interval %u, flows %u, quantum %u error %d(%s)",
3164 netdev_get_name(netdev),
3165 otarget, olimit, ointerval, oflows, oquantum,
3166 error, ovs_strerror(error));
/* Parses FQ-CoDel keys from 'details' into 'fqcodel', substituting defaults
 * for missing or zero values.  NOTE(review): the interval default here
 * (1000000) differs from the 100000 used in fqcodel_setup_qdisc__() —
 * confirm which is intended. */
3172 fqcodel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
3173 const struct smap *details, struct fqcodel *fqcodel)
3175 const char *target_s;
3176 const char *limit_s;
3177 const char *interval_s;
3178 const char *flows_s;
3179 const char *quantum_s;
3181 target_s = smap_get(details, "target");
3182 limit_s = smap_get(details, "limit");
3183 interval_s = smap_get(details, "interval");
3184 flows_s = smap_get(details, "flows");
3185 quantum_s = smap_get(details, "quantum");
3186 fqcodel->target = target_s ? strtoull(target_s, NULL, 10) : 0;
3187 fqcodel->limit = limit_s ? strtoull(limit_s, NULL, 10) : 0;
3188 fqcodel->interval = interval_s ? strtoull(interval_s, NULL, 10) : 0;
3189 fqcodel->flows = flows_s ? strtoull(flows_s, NULL, 10) : 0;
3190 fqcodel->quantum = quantum_s ? strtoull(quantum_s, NULL, 10) : 0;
3191 if (!fqcodel->target) {
3192 fqcodel->target = 5000;
3194 if (!fqcodel->limit) {
3195 fqcodel->limit = 10240;
3197 if (!fqcodel->interval) {
3198 fqcodel->interval = 1000000;
3200 if (!fqcodel->flows) {
3201 fqcodel->flows = 1024;
3203 if (!fqcodel->quantum) {
3204 fqcodel->quantum = 1514;
/* tc_install hook: configures an FQ-CoDel qdisc in the kernel from 'details'
 * and mirrors it into OVS state on success. */
3209 fqcodel_tc_install(struct netdev *netdev, const struct smap *details)
3212 struct fqcodel fqcodel;
3214 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3215 error = fqcodel_setup_qdisc__(netdev, fqcodel.target, fqcodel.limit,
3216 fqcodel.interval, fqcodel.flows,
3219 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit,
3220 fqcodel.interval, fqcodel.flows, fqcodel.quantum);
/* Parses the nested TCA_OPTIONS of a kernel "fq_codel" qdisc into 'fqcodel';
 * all five attributes are required by the policy. */
3226 fqcodel_parse_tca_options__(struct nlattr *nl_options, struct fqcodel *fqcodel)
3228 static const struct nl_policy tca_fqcodel_policy[] = {
3229 [TCA_FQ_CODEL_TARGET] = { .type = NL_A_U32 },
3230 [TCA_FQ_CODEL_LIMIT] = { .type = NL_A_U32 },
3231 [TCA_FQ_CODEL_INTERVAL] = { .type = NL_A_U32 },
3232 [TCA_FQ_CODEL_FLOWS] = { .type = NL_A_U32 },
3233 [TCA_FQ_CODEL_QUANTUM] = { .type = NL_A_U32 }
3236 struct nlattr *attrs[ARRAY_SIZE(tca_fqcodel_policy)];
3238 if (!nl_parse_nested(nl_options, tca_fqcodel_policy,
3239 attrs, ARRAY_SIZE(tca_fqcodel_policy))) {
3240 VLOG_WARN_RL(&rl, "failed to parse FQ_CoDel class options");
3244 fqcodel->target = nl_attr_get_u32(attrs[TCA_FQ_CODEL_TARGET]);
3245 fqcodel->limit = nl_attr_get_u32(attrs[TCA_FQ_CODEL_LIMIT]);
3246 fqcodel->interval =nl_attr_get_u32(attrs[TCA_FQ_CODEL_INTERVAL]);
3247 fqcodel->flows = nl_attr_get_u32(attrs[TCA_FQ_CODEL_FLOWS]);
3248 fqcodel->quantum = nl_attr_get_u32(attrs[TCA_FQ_CODEL_QUANTUM]);
/* tc_load hook: reconstructs OVS state for an existing FQ-CoDel qdisc from
 * the netlink message in 'nlmsg'. */
3253 fqcodel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3255 struct nlattr *nlattr;
3258 struct fqcodel fqcodel;
3260 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3265 error = fqcodel_parse_tca_options__(nlattr, &fqcodel);
3270 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3271 fqcodel.flows, fqcodel.quantum);
/* tc_destroy hook: frees the struct fqcodel allocated by fqcodel_install__(). */
3276 fqcodel_tc_destroy(struct tc *tc)
3278 struct fqcodel *fqcodel = CONTAINER_OF(tc, struct fqcodel, tc);
/* qdisc_get hook: reports cached FQ-CoDel parameters into 'details'. */
3284 fqcodel_qdisc_get(const struct netdev *netdev, struct smap *details)
3286 const struct fqcodel *fqcodel = fqcodel_get__(netdev);
3287 smap_add_format(details, "target", "%u", fqcodel->target);
3288 smap_add_format(details, "limit", "%u", fqcodel->limit);
3289 smap_add_format(details, "interval", "%u", fqcodel->interval);
3290 smap_add_format(details, "flows", "%u", fqcodel->flows);
3291 smap_add_format(details, "quantum", "%u", fqcodel->quantum);
/* qdisc_set hook: re-parses 'details' and updates the in-memory record and
 * cached parameter copies. */
3296 fqcodel_qdisc_set(struct netdev *netdev, const struct smap *details)
3298 struct fqcodel fqcodel;
3300 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3301 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3302 fqcodel.flows, fqcodel.quantum);
3303 fqcodel_get__(netdev)->target = fqcodel.target;
3304 fqcodel_get__(netdev)->limit = fqcodel.limit;
3305 fqcodel_get__(netdev)->interval = fqcodel.interval;
3306 fqcodel_get__(netdev)->flows = fqcodel.flows;
3307 fqcodel_get__(netdev)->quantum = fqcodel.quantum;
/* Operations vtable binding the fqcodel_* hooks to "linux-fq_codel". */
3311 static const struct tc_ops tc_ops_fqcodel = {
3312 "fq_codel", /* linux_name */
3313 "linux-fq_codel", /* ovs_name */
3314 FQCODEL_N_QUEUES, /* n_queues */
3327 /* SFQ traffic control class. */
3329 #define SFQ_N_QUEUES 0x0000
/* Returns the struct sfq embedded in 'netdev_'s tc state; only valid when the
 * device's qdisc is SFQ. */
3338 sfq_get__(const struct netdev *netdev_)
3340 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3341 return CONTAINER_OF(netdev->tc, struct sfq, tc);
/* Records in OVS's in-memory state that 'netdev_' uses an SFQ qdisc with the
 * given 'quantum' and 'perturb' values.  Does not touch the kernel. */
3345 sfq_install__(struct netdev *netdev_, uint32_t quantum, uint32_t perturb)
3347 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3350 sfq = xmalloc(sizeof *sfq);
3351 tc_init(&sfq->tc, &tc_ops_sfq);
3352 sfq->perturb = perturb;
3353 sfq->quantum = quantum;
3355 netdev->tc = &sfq->tc;
/* Replaces 'netdev's root qdisc with an "sfq" qdisc via RTM_NEWQDISC.  A zero
 * 'quantum' falls back to the device MTU (when available) and a zero
 * 'perturb' falls back to 10. */
3359 sfq_setup_qdisc__(struct netdev *netdev, uint32_t quantum, uint32_t perturb)
3361 struct tc_sfq_qopt opt;
3362 struct ofpbuf request;
3363 struct tcmsg *tcmsg;
3365 int mtu_error, error;
3366 mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3368 tc_del_qdisc(netdev);
3370 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3371 NLM_F_EXCL | NLM_F_CREATE, &request);
3375 tcmsg->tcm_handle = tc_make_handle(1, 0);
3376 tcmsg->tcm_parent = TC_H_ROOT;
3378 memset(&opt, 0, sizeof opt);
3381 opt.quantum = mtu; /* if we cannot find mtu, use default */
3384 opt.quantum = quantum;
3388 opt.perturb_period = 10;
3390 opt.perturb_period = perturb;
/* Unlike CoDel/FQ-CoDel, SFQ options are a flat struct, not nested attrs. */
3393 nl_msg_put_string(&request, TCA_KIND, "sfq");
3394 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3396 error = tc_transact(&request, NULL);
3398 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3399 "quantum %u, perturb %u error %d(%s)",
3400 netdev_get_name(netdev),
3401 opt.quantum, opt.perturb_period,
3402 error, ovs_strerror(error));
/* Parses "perturb"/"quantum" from 'details' into 'sfq'.  A missing quantum
 * falls back to the device MTU; a device without an MTU cannot use SFQ
 * without an explicit quantum. */
3408 sfq_parse_qdisc_details__(struct netdev *netdev,
3409 const struct smap *details, struct sfq *sfq)
3411 const char *perturb_s;
3412 const char *quantum_s;
3416 perturb_s = smap_get(details, "perturb");
3417 quantum_s = smap_get(details, "quantum");
3418 sfq->perturb = perturb_s ? strtoull(perturb_s, NULL, 10) : 0;
3419 sfq->quantum = quantum_s ? strtoull(quantum_s, NULL, 10) : 0;
3420 if (!sfq->perturb) {
3424 if (!sfq->quantum) {
3425 mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3429 VLOG_WARN_RL(&rl, "when using SFQ, you must specify quantum on a "
3430 "device without mtu");
/* tc_install hook: configures an SFQ qdisc in the kernel from 'details' and
 * mirrors it into OVS state on success. */
3437 sfq_tc_install(struct netdev *netdev, const struct smap *details)
3442 sfq_parse_qdisc_details__(netdev, details, &sfq);
3443 error = sfq_setup_qdisc__(netdev, sfq.quantum, sfq.perturb);
3445 sfq_install__(netdev, sfq.quantum, sfq.perturb);
3451 sfq_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3453 const struct tc_sfq_qopt *sfq;
3454 struct nlattr *nlattr;
3458 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3460 sfq = nl_attr_get(nlattr);
3461 sfq_install__(netdev, sfq->perturb_period, sfq->quantum);
/* tc_destroy hook: frees the struct sfq allocated by sfq_install__(). */
3469 sfq_tc_destroy(struct tc *tc)
3471 struct sfq *sfq = CONTAINER_OF(tc, struct sfq, tc);
/* qdisc_get hook: reports cached SFQ parameters into 'details'. */
3477 sfq_qdisc_get(const struct netdev *netdev, struct smap *details)
3479 const struct sfq *sfq = sfq_get__(netdev);
3480 smap_add_format(details, "quantum", "%u", sfq->quantum);
3481 smap_add_format(details, "perturb", "%u", sfq->perturb);
/* qdisc_set hook: re-parses 'details' and updates the in-memory record and
 * cached parameter copies. */
3486 sfq_qdisc_set(struct netdev *netdev, const struct smap *details)
3490 sfq_parse_qdisc_details__(netdev, details, &sfq);
3491 sfq_install__(netdev, sfq.quantum, sfq.perturb);
3492 sfq_get__(netdev)->quantum = sfq.quantum;
3493 sfq_get__(netdev)->perturb = sfq.perturb;
/* Operations vtable binding the sfq_* hooks to "linux-sfq". */
3497 static const struct tc_ops tc_ops_sfq = {
3498 "sfq", /* linux_name */
3499 "linux-sfq", /* ovs_name */
3500 SFQ_N_QUEUES, /* n_queues */
3513 /* HTB traffic control class. */
3515 #define HTB_N_QUEUES 0xf000
3516 #define HTB_RATE2QUANTUM 10
3520 unsigned int max_rate; /* In bytes/s. */
/* Per-queue HTB class configuration, embedded around a generic tc_queue. */
3524 struct tc_queue tc_queue;
3525 unsigned int min_rate; /* In bytes/s. */
3526 unsigned int max_rate; /* In bytes/s. */
3527 unsigned int burst; /* In bytes. */
3528 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the struct htb embedded in 'netdev_'s tc state; only valid when the
 * device's qdisc is HTB. */
3532 htb_get__(const struct netdev *netdev_)
3534 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3535 return CONTAINER_OF(netdev->tc, struct htb, tc);
/* Records in OVS's in-memory state that 'netdev_' uses an HTB qdisc with the
 * given 'max_rate'.  Does not touch the kernel. */
3539 htb_install__(struct netdev *netdev_, uint64_t max_rate)
3541 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3544 htb = xmalloc(sizeof *htb);
3545 tc_init(&htb->tc, &tc_ops_htb);
3546 htb->max_rate = max_rate;
3548 netdev->tc = &htb->tc;
3551 /* Create an HTB qdisc.
3553 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
3555 htb_setup_qdisc__(struct netdev *netdev)
3558 struct tc_htb_glob opt;
3559 struct ofpbuf request;
3560 struct tcmsg *tcmsg;
3562 tc_del_qdisc(netdev);
3564 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3565 NLM_F_EXCL | NLM_F_CREATE, &request);
3569 tcmsg->tcm_handle = tc_make_handle(1, 0);
3570 tcmsg->tcm_parent = TC_H_ROOT;
3572 nl_msg_put_string(&request, TCA_KIND, "htb");
/* Global HTB options: rate2quantum controls the default class quantum
 * (quantum = rate / rate2quantum). */
3574 memset(&opt, 0, sizeof opt);
3575 opt.rate2quantum = HTB_RATE2QUANTUM;
3579 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3580 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
3581 nl_msg_end_nested(&request, opt_offset);
3583 return tc_transact(&request, NULL);
3586 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
3587 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
3589 htb_setup_class__(struct netdev *netdev, unsigned int handle,
3590 unsigned int parent, struct htb_class *class)
3593 struct tc_htb_opt opt;
3594 struct ofpbuf request;
3595 struct tcmsg *tcmsg;
/* HTB rate tables are computed relative to the MTU, so a device without one
 * cannot be configured. */
3599 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3601 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
3602 netdev_get_name(netdev));
3606 memset(&opt, 0, sizeof opt);
3607 tc_fill_rate(&opt.rate, class->min_rate, mtu);
3608 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
3609 /* Makes sure the quantum is at least MTU. Setting quantum will
3610 * make htb ignore the r2q for this class. */
3611 if ((class->min_rate / HTB_RATE2QUANTUM) < mtu) {
3614 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
3615 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
3616 opt.prio = class->priority;
3618 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3622 tcmsg->tcm_handle = handle;
3623 tcmsg->tcm_parent = parent;
3625 nl_msg_put_string(&request, TCA_KIND, "htb");
3626 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3627 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
3628 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
3629 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
3630 nl_msg_end_nested(&request, opt_offset);
3632 error = tc_transact(&request, NULL);
3634 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3635 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
3636 netdev_get_name(netdev),
3637 tc_get_major(handle), tc_get_minor(handle),
3638 tc_get_major(parent), tc_get_minor(parent),
3639 class->min_rate, class->max_rate,
3640 class->burst, class->priority, ovs_strerror(error));
3645 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
3646 * description of them into 'details'. The description complies with the
3647 * specification given in the vswitch database documentation for linux-htb
3650 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
3652 static const struct nl_policy tca_htb_policy[] = {
3653 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
3654 .min_len = sizeof(struct tc_htb_opt) },
3657 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
3658 const struct tc_htb_opt *htb;
3660 if (!nl_parse_nested(nl_options, tca_htb_policy,
3661 attrs, ARRAY_SIZE(tca_htb_policy))) {
3662 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
3666 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
3667 class->min_rate = htb->rate.rate;
3668 class->max_rate = htb->ceil.rate;
/* 'buffer' is in kernel ticks; convert back to bytes at the class rate. */
3669 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
3670 class->priority = htb->prio;
/* Parses a class tcmsg: extracts the OVS queue ID from the tc handle
 * (queues map to minor numbers 1..HTB_N_QUEUES under major 1) and,
 * optionally, the class options and stats. */
3675 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3676 struct htb_class *options,
3677 struct netdev_queue_stats *stats)
3679 struct nlattr *nl_options;
3680 unsigned int handle;
3683 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3684 if (!error && queue_id) {
3685 unsigned int major = tc_get_major(handle);
3686 unsigned int minor = tc_get_minor(handle);
3687 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3688 *queue_id = minor - 1;
3693 if (!error && options) {
3694 error = htb_parse_tca_options__(nl_options, options);
/* Parses qdisc-level "max-rate" (bits/s in the database, stored here as
 * bytes/s) from 'details'.  When absent, falls back to the link speed from
 * ethtool features, defaulting to 100 Mbps. */
3700 htb_parse_qdisc_details__(struct netdev *netdev_,
3701 const struct smap *details, struct htb_class *hc)
3703 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3704 const char *max_rate_s;
3706 max_rate_s = smap_get(details, "max-rate");
3707 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3708 if (!hc->max_rate) {
3709 enum netdev_features current;
3711 netdev_linux_read_features(netdev);
3712 current = !netdev->get_features_error ? netdev->current : 0;
3713 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3715 hc->min_rate = hc->max_rate;
/* Parses per-class "min-rate"/"max-rate"/"burst"/"priority" from 'details',
 * clamping rates between the MTU and the qdisc's max_rate. */
3721 htb_parse_class_details__(struct netdev *netdev,
3722 const struct smap *details, struct htb_class *hc)
3724 const struct htb *htb = htb_get__(netdev);
3725 const char *min_rate_s = smap_get(details, "min-rate");
3726 const char *max_rate_s = smap_get(details, "max-rate");
3727 const char *burst_s = smap_get(details, "burst");
3728 const char *priority_s = smap_get(details, "priority");
3731 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3733 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
3734 netdev_get_name(netdev));
3738 /* HTB requires at least an mtu sized min-rate to send any traffic even
3739 * on uncongested links. */
3740 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3741 hc->min_rate = MAX(hc->min_rate, mtu);
3742 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
3745 hc->max_rate = (max_rate_s
3746 ? strtoull(max_rate_s, NULL, 10) / 8
3748 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
3749 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
3753 * According to hints in the documentation that I've read, it is important
3754 * that 'burst' be at least as big as the largest frame that might be
3755 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
3756 * but having it a bit too small is a problem. Since netdev_get_mtu()
3757 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
3758 * the MTU. We actually add 64, instead of 14, as a guard against
3759 * additional headers get tacked on somewhere that we're not aware of. */
3760 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
3761 hc->burst = MAX(hc->burst, mtu + 64);
3764 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Queries the kernel for the HTB class with the given 'handle' and 'parent'
 * on 'netdev' and parses the reply into 'options' and/or 'stats' (either may
 * be null).  Returns 0 on success, otherwise a positive errno value. */
3770 htb_query_class__(const struct netdev *netdev, unsigned int handle,
3771                   unsigned int parent, struct htb_class *options,
3772                   struct netdev_queue_stats *stats)
3774     struct ofpbuf *reply;
3777     error = tc_query_class(netdev, handle, parent, &reply);
3779         error = htb_parse_tcmsg__(reply, NULL, options, stats);
3780         ofpbuf_delete(reply);
/* tc_ops "tc_install" callback for linux-htb: creates the root HTB qdisc,
 * then the default class 1:fffe from 'details', and records the new tc state
 * on 'netdev' via htb_install__(). */
3786 htb_tc_install(struct netdev *netdev, const struct smap *details)
3790     error = htb_setup_qdisc__(netdev);
3792         struct htb_class hc;
3794         htb_parse_qdisc_details__(netdev, details, &hc);
3795         error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3796                                   tc_make_handle(1, 0), &hc);
3798             htb_install__(netdev, hc.max_rate);
/* Converts a generic 'tc_queue' embedded in a struct htb_class back into the
 * containing htb_class. */
3804 static struct htb_class *
3805 htb_class_cast__(const struct tc_queue *queue)
3807     return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Inserts or updates the cached htb_class for 'queue_id' in 'netdev''s tc
 * queue map, copying the rate/burst/priority settings from 'hc'. */
3811 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
3812                    const struct htb_class *hc)
3814     struct htb *htb = htb_get__(netdev);
3815     size_t hash = hash_int(queue_id, 0);
3816     struct tc_queue *queue;
3817     struct htb_class *hcp;
3819     queue = tc_find_queue__(netdev, queue_id, hash);
/* Reuse the existing entry when present; otherwise allocate a new one and
 * insert it into the hmap keyed on the queue id. */
3821         hcp = htb_class_cast__(queue);
3823         hcp = xmalloc(sizeof *hcp);
3824         queue = &hcp->tc_queue;
3825         queue->queue_id = queue_id;
3826         queue->created = time_msec();
3827         hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
3830     hcp->min_rate = hc->min_rate;
3831     hcp->max_rate = hc->max_rate;
3832     hcp->burst = hc->burst;
3833     hcp->priority = hc->priority;
/* tc_ops "tc_load" callback for linux-htb: reconstructs OVS's view of an
 * existing kernel HTB qdisc by querying the default class for the qdisc
 * max-rate and then dumping all classes into the queue cache. */
3837 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3840     struct queue_dump_state state;
3841     struct htb_class hc;
3843     /* Get qdisc options. */
3845     htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3846     htb_install__(netdev, hc.max_rate);
3849     if (!start_queue_dump(netdev, &state)) {
3852     while (nl_dump_next(&state.dump, &msg, &state.buf)) {
3853         unsigned int queue_id;
3855         if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3856             htb_update_queue__(netdev, queue_id, &hc);
3859     finish_queue_dump(&state);
/* tc_ops "tc_destroy" callback for linux-htb: frees every cached htb_class
 * (HMAP_FOR_EACH_POP removes each node as it iterates). */
3865 htb_tc_destroy(struct tc *tc)
3867     struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
3868     struct htb_class *hc;
3870     HMAP_FOR_EACH_POP (hc, tc_queue.hmap_node, &htb->tc.queues) {
/* tc_ops "qdisc_get" callback: reports the HTB qdisc max-rate in bits/s
 * (cached value is in bytes/s, hence the factor of 8). */
3878 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
3880     const struct htb *htb = htb_get__(netdev);
3881     smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
/* tc_ops "qdisc_set" callback: reconfigures the default class 1:fffe from
 * 'details' and, on success, updates the cached qdisc max-rate. */
3886 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
3888     struct htb_class hc;
3891     htb_parse_qdisc_details__(netdev, details, &hc);
3892     error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3893                               tc_make_handle(1, 0), &hc);
3895         htb_get__(netdev)->max_rate = hc.max_rate;
/* tc_ops "class_get" callback: reports a queue's configuration in bits
 * (internally bytes, hence the factor of 8).  "max-rate" is omitted when it
 * equals "min-rate", matching how defaults are applied on the set path. */
3901 htb_class_get(const struct netdev *netdev OVS_UNUSED,
3902               const struct tc_queue *queue, struct smap *details)
3904     const struct htb_class *hc = htb_class_cast__(queue);
3906     smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3907     if (hc->min_rate != hc->max_rate) {
3908         smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3910     smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
3912     smap_add_format(details, "priority", "%u", hc->priority);
/* tc_ops "class_set" callback: parses 'details', installs/replaces kernel
 * class 1:(queue_id+1) under parent 1:fffe, then refreshes the local cache.
 * OVS queue N maps to tc minor number N+1 because minor 0 is reserved. */
3918 htb_class_set(struct netdev *netdev, unsigned int queue_id,
3919               const struct smap *details)
3921     struct htb_class hc;
3924     error = htb_parse_class_details__(netdev, details, &hc);
3929     error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3930                               tc_make_handle(1, 0xfffe), &hc);
3935     htb_update_queue__(netdev, queue_id, &hc);
/* tc_ops "class_delete" callback: removes kernel class 1:(queue_id+1) and,
 * on success, drops the cached entry. */
3940 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
3942     struct htb_class *hc = htb_class_cast__(queue);
3943     struct htb *htb = htb_get__(netdev);
3946     error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3948         hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* tc_ops "class_get_stats" callback: fetches kernel statistics for one
 * queue by querying its tc class (minor = queue_id + 1). */
3955 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3956                     struct netdev_queue_stats *stats)
3958     return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3959                              tc_make_handle(1, 0xfffe), NULL, stats);
/* tc_ops "class_dump_stats" callback: parses one class message from a dump
 * and invokes 'cb' for it, translating tc minor N back to OVS queue N-1.
 * Messages for other majors/minors (e.g. the default class) are ignored. */
3963 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3964                      const struct ofpbuf *nlmsg,
3965                      netdev_dump_queue_stats_cb *cb, void *aux)
3967     struct netdev_queue_stats stats;
3968     unsigned int handle, major, minor;
3971     error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3976     major = tc_get_major(handle);
3977     minor = tc_get_minor(handle);
3978     if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3979         (*cb)(minor - 1, &stats, aux);
/* tc_ops vtable for the "linux-htb" QoS type (kernel qdisc "htb").
 * NOTE(review): several initializer lines are elided in this extract. */
3984 static const struct tc_ops tc_ops_htb = {
3985     "htb",                      /* linux_name */
3986     "linux-htb",                /* ovs_name */
3987     HTB_N_QUEUES,               /* n_queues */
3996     htb_class_get_stats,
3997     htb_class_dump_stats
4000 /* "linux-hfsc" traffic control class. */
4002 #define HFSC_N_QUEUES 0xf000
4010 struct tc_queue tc_queue;
/* Returns 'netdev_''s tc state downcast to the HFSC implementation.  Only
 * valid when the installed tc is tc_ops_hfsc. */
4015 static struct hfsc *
4016 hfsc_get__(const struct netdev *netdev_)
4018     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4019     return CONTAINER_OF(netdev->tc, struct hfsc, tc);
/* Converts a generic 'tc_queue' embedded in a struct hfsc_class back into
 * the containing hfsc_class. */
4022 static struct hfsc_class *
4023 hfsc_class_cast__(const struct tc_queue *queue)
4025     return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Allocates and installs fresh HFSC tc state on 'netdev_' with the given
 * qdisc-level 'max_rate' (bytes/s).  Ownership of the allocation passes to
 * the netdev; it is freed via hfsc_tc_destroy(). */
4029 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
4031     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4034     hfsc = xmalloc(sizeof *hfsc);
4035     tc_init(&hfsc->tc, &tc_ops_hfsc);
4036     hfsc->max_rate = max_rate;
4037     netdev->tc = &hfsc->tc;
/* Inserts or updates the cached hfsc_class for 'queue_id' in 'netdev''s tc
 * queue map, copying the min/max rate settings from 'hc'. */
4041 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
4042                     const struct hfsc_class *hc)
4046     struct hfsc_class *hcp;
4047     struct tc_queue *queue;
4049     hfsc = hfsc_get__(netdev);
4050     hash = hash_int(queue_id, 0);
4052     queue = tc_find_queue__(netdev, queue_id, hash);
/* Reuse an existing cache entry; otherwise allocate and insert one. */
4054         hcp = hfsc_class_cast__(queue);
4056         hcp = xmalloc(sizeof *hcp);
4057         queue = &hcp->tc_queue;
4058         queue->queue_id = queue_id;
4059         queue->created = time_msec();
4060         hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
4063     hcp->min_rate = hc->min_rate;
4064     hcp->max_rate = hc->max_rate;
/* Parses the TCA_OPTIONS attribute of an HFSC class message into 'class'.
 * OVS only supports linear service curves (m1 == 0, d == 0) with identical
 * real-time and link-share curves; anything else is rejected with a warning.
 * min_rate comes from the fair-share curve (FSC), max_rate from the
 * upper-limit curve (USC). */
4068 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
4070     const struct tc_service_curve *rsc, *fsc, *usc;
4071     static const struct nl_policy tca_hfsc_policy[] = {
4073             .type = NL_A_UNSPEC,
4075             .min_len = sizeof(struct tc_service_curve),
4078             .type = NL_A_UNSPEC,
4080             .min_len = sizeof(struct tc_service_curve),
4083             .type = NL_A_UNSPEC,
4085             .min_len = sizeof(struct tc_service_curve),
4088     struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
4090     if (!nl_parse_nested(nl_options, tca_hfsc_policy,
4091                          attrs, ARRAY_SIZE(tca_hfsc_policy))) {
4092         VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
4096     rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
4097     fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
4098     usc = nl_attr_get(attrs[TCA_HFSC_USC]);
/* Reject non-linear curves: OVS writes only the m2 slope. */
4100     if (rsc->m1 != 0 || rsc->d != 0 ||
4101         fsc->m1 != 0 || fsc->d != 0 ||
4102         usc->m1 != 0 || usc->d != 0) {
4103         VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4104                      "Non-linear service curves are not supported.");
4108     if (rsc->m2 != fsc->m2) {
4109         VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4110                      "Real-time service curves are not supported ");
4114     if (rsc->m2 > usc->m2) {
4115         VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4116                      "Min-rate service curve is greater than "
4117                      "the max-rate service curve.");
4121     class->min_rate = fsc->m2;
4122     class->max_rate = usc->m2;
/* Parses an HFSC class netlink message, extracting the OVS queue id (tc
 * minor number minus 1) into '*queue_id', service-curve options into
 * '*options', and statistics into '*stats'.  Output pointers may be null. */
4127 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
4128                    struct hfsc_class *options,
4129                    struct netdev_queue_stats *stats)
4132     unsigned int handle;
4133     struct nlattr *nl_options;
4135     error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
4141         unsigned int major, minor;
4143         major = tc_get_major(handle);
4144         minor = tc_get_minor(handle);
4145         if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
4146             *queue_id = minor - 1;
4153         error = hfsc_parse_tca_options__(nl_options, options);
/* Queries the kernel for the HFSC class with the given 'handle' and
 * 'parent' on 'netdev' and parses the reply into 'options' and/or 'stats'
 * (either may be null).  Returns 0 on success, else a positive errno. */
4160 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
4161                    unsigned int parent, struct hfsc_class *options,
4162                    struct netdev_queue_stats *stats)
4165     struct ofpbuf *reply;
4167     error = tc_query_class(netdev, handle, parent, &reply);
4172     error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
4173     ofpbuf_delete(reply);
/* Parses qdisc-level HFSC configuration: reads "max-rate" (bits/s) from
 * 'details' and stores it as bytes/s in both fields of 'class'.  When
 * max-rate is absent or zero, falls back to the link speed reported by the
 * driver (100 Mbps assumed if unknown). */
4178 hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
4179                            struct hfsc_class *class)
4181     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4183     const char *max_rate_s;
4185     max_rate_s = smap_get(details, "max-rate");
4186     max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
4189         enum netdev_features current;
4191         netdev_linux_read_features(netdev);
4192         current = !netdev->get_features_error ? netdev->current : 0;
4193         max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
4196     class->min_rate = max_rate;
4197     class->max_rate = max_rate;
/* Parses per-class HFSC configuration ("min-rate", "max-rate", bits/s in
 * the database) into 'class' (bytes/s), clamping min-rate to at least 1 and
 * both rates to the qdisc-level max-rate, with max-rate >= min-rate. */
4201 hfsc_parse_class_details__(struct netdev *netdev,
4202                            const struct smap *details,
4203                            struct hfsc_class * class)
4205     const struct hfsc *hfsc;
4206     uint32_t min_rate, max_rate;
4207     const char *min_rate_s, *max_rate_s;
4209     hfsc = hfsc_get__(netdev);
4210     min_rate_s = smap_get(details, "min-rate");
4211     max_rate_s = smap_get(details, "max-rate");
4213     min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
4214     min_rate = MAX(min_rate, 1);
4215     min_rate = MIN(min_rate, hfsc->max_rate);
4217     max_rate = (max_rate_s
4218                 ? strtoull(max_rate_s, NULL, 10) / 8
4220     max_rate = MAX(max_rate, min_rate);
4221     max_rate = MIN(max_rate, hfsc->max_rate);
4223     class->min_rate = min_rate;
4224     class->max_rate = max_rate;
4229 /* Create an HFSC qdisc.
4231  * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1".
 *
 * Deletes any existing root qdisc first, then sends an RTM_NEWQDISC request
 * with handle 1:0 at TC_H_ROOT.  Returns 0 on success, else errno. */
4233 hfsc_setup_qdisc__(struct netdev * netdev)
4235     struct tcmsg *tcmsg;
4236     struct ofpbuf request;
4237     struct tc_hfsc_qopt opt;
4239     tc_del_qdisc(netdev);
4241     tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
4242                             NLM_F_EXCL | NLM_F_CREATE, &request);
4248     tcmsg->tcm_handle = tc_make_handle(1, 0);
4249     tcmsg->tcm_parent = TC_H_ROOT;
4251     memset(&opt, 0, sizeof opt);
4254     nl_msg_put_string(&request, TCA_KIND, "hfsc");
4255     nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
4257     return tc_transact(&request, NULL);
4260 /* Create an HFSC class.
4262  * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
4263  * sc rate <min_rate> ul rate <max_rate>"
 *
 * The min-rate curve is sent as both the real-time (RSC) and link-share
 * (FSC) service curves; the max-rate curve becomes the upper limit (USC).
 * All curves are linear (only the m2 slope is set).  Logs a rate-limited
 * warning on failure. */
4265 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
4266                    unsigned int parent, struct hfsc_class *class)
4270     struct tcmsg *tcmsg;
4271     struct ofpbuf request;
4272     struct tc_service_curve min, max;
4274     tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
4280     tcmsg->tcm_handle = handle;
4281     tcmsg->tcm_parent = parent;
4285     min.m2 = class->min_rate;
4289     max.m2 = class->max_rate;
4291     nl_msg_put_string(&request, TCA_KIND, "hfsc");
4292     opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4293     nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
4294     nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
4295     nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
4296     nl_msg_end_nested(&request, opt_offset);
4298     error = tc_transact(&request, NULL);
4300         VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
4301                      "min-rate %ubps, max-rate %ubps (%s)",
4302                      netdev_get_name(netdev),
4303                      tc_get_major(handle), tc_get_minor(handle),
4304                      tc_get_major(parent), tc_get_minor(parent),
4305                      class->min_rate, class->max_rate, ovs_strerror(error));
/* tc_ops "tc_install" callback for linux-hfsc: creates the root HFSC qdisc,
 * then the default class 1:fffe from 'details', and records the new tc
 * state on 'netdev' via hfsc_install__(). */
4312 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
4315     struct hfsc_class class;
4317     error = hfsc_setup_qdisc__(netdev);
4323     hfsc_parse_qdisc_details__(netdev, details, &class);
4324     error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4325                                tc_make_handle(1, 0), &class);
4331     hfsc_install__(netdev, class.max_rate);
/* tc_ops "tc_load" callback for linux-hfsc: reconstructs OVS's view of an
 * existing kernel HFSC qdisc by querying the default class for the qdisc
 * max-rate and then dumping all classes into the queue cache. */
4336 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4339     struct queue_dump_state state;
4340     struct hfsc_class hc;
4343     hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
4344     hfsc_install__(netdev, hc.max_rate);
4346     if (!start_queue_dump(netdev, &state)) {
4350     while (nl_dump_next(&state.dump, &msg, &state.buf)) {
4351         unsigned int queue_id;
4353         if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
4354             hfsc_update_queue__(netdev, queue_id, &hc);
4358     finish_queue_dump(&state);
/* tc_ops "tc_destroy" callback for linux-hfsc: removes and frees every
 * cached hfsc_class (SAFE variant since nodes are removed mid-iteration). */
4363 hfsc_tc_destroy(struct tc *tc)
4366     struct hfsc_class *hc, *next;
4368     hfsc = CONTAINER_OF(tc, struct hfsc, tc);
4370     HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
4371         hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* tc_ops "qdisc_get" callback: reports the HFSC qdisc max-rate in bits/s
 * (cached value is in bytes/s, hence the factor of 8). */
4380 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
4382     const struct hfsc *hfsc;
4383     hfsc = hfsc_get__(netdev);
4384     smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
/* tc_ops "qdisc_set" callback: reconfigures the default class 1:fffe from
 * 'details' and, on success, updates the cached qdisc max-rate. */
4389 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
4392     struct hfsc_class class;
4394     hfsc_parse_qdisc_details__(netdev, details, &class);
4395     error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4396                                tc_make_handle(1, 0), &class);
4399     hfsc_get__(netdev)->max_rate = class.max_rate;
/* tc_ops "class_get" callback: reports a queue's min/max rates in bits/s.
 * "max-rate" is omitted when it equals "min-rate". */
4406 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
4407                const struct tc_queue *queue, struct smap *details)
4409     const struct hfsc_class *hc;
4411     hc = hfsc_class_cast__(queue);
4412     smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
4413     if (hc->min_rate != hc->max_rate) {
4414         smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
/* tc_ops "class_set" callback: parses 'details', installs/replaces kernel
 * class 1:(queue_id+1) under parent 1:fffe, then refreshes the cache. */
4420 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
4421                const struct smap *details)
4424     struct hfsc_class class;
4426     error = hfsc_parse_class_details__(netdev, details, &class);
4431     error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
4432                                tc_make_handle(1, 0xfffe), &class);
4437     hfsc_update_queue__(netdev, queue_id, &class);
/* tc_ops "class_delete" callback: removes kernel class 1:(queue_id+1) and,
 * on success, drops the cached entry. */
4442 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
4446     struct hfsc_class *hc;
4448     hc = hfsc_class_cast__(queue);
4449     hfsc = hfsc_get__(netdev);
4451     error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
4453         hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* tc_ops "class_get_stats" callback: fetches kernel statistics for one
 * queue by querying its tc class (minor = queue_id + 1). */
4460 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
4461                      struct netdev_queue_stats *stats)
4463     return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
4464                               tc_make_handle(1, 0xfffe), NULL, stats);
/* tc_ops "class_dump_stats" callback: parses one class message from a dump
 * and invokes 'cb' for it, translating tc minor N back to OVS queue N-1.
 * Messages for other majors/minors (e.g. the default class) are ignored. */
4468 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
4469                       const struct ofpbuf *nlmsg,
4470                       netdev_dump_queue_stats_cb *cb, void *aux)
4472     struct netdev_queue_stats stats;
4473     unsigned int handle, major, minor;
4476     error = tc_parse_class(nlmsg, &handle, NULL, &stats);
4481     major = tc_get_major(handle);
4482     minor = tc_get_minor(handle);
4483     if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
4484         (*cb)(minor - 1, &stats, aux);
/* tc_ops vtable for the "linux-hfsc" QoS type (kernel qdisc "hfsc"). */
4489 static const struct tc_ops tc_ops_hfsc = {
4490     "hfsc",                     /* linux_name */
4491     "linux-hfsc",               /* ovs_name */
4492     HFSC_N_QUEUES,              /* n_queues */
4493     hfsc_tc_install,            /* tc_install */
4494     hfsc_tc_load,               /* tc_load */
4495     hfsc_tc_destroy,            /* tc_destroy */
4496     hfsc_qdisc_get,             /* qdisc_get */
4497     hfsc_qdisc_set,             /* qdisc_set */
4498     hfsc_class_get,             /* class_get */
4499     hfsc_class_set,             /* class_set */
4500     hfsc_class_delete,          /* class_delete */
4501     hfsc_class_get_stats,       /* class_get_stats */
4502     hfsc_class_dump_stats       /* class_dump_stats */
4505 /* "linux-noop" traffic control class. */
/* Installs the shared, immutable default tc state on 'netdev_'.  The static
 * const object is never written, so sharing one instance is safe. */
4508 noop_install__(struct netdev *netdev_)
4510     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4511     static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
4513     netdev->tc = CONST_CAST(struct tc *, &tc);
/* tc_ops "tc_install" callback for linux-noop: no kernel changes; just
 * records the no-op tc state.  'details' is ignored. */
4517 noop_tc_install(struct netdev *netdev,
4518                    const struct smap *details OVS_UNUSED)
4520     noop_install__(netdev);
/* tc_ops "tc_load" callback for linux-noop: records the no-op tc state. */
4525 noop_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4527     noop_install__(netdev);
/* tc_ops vtable for "linux-noop": OVS leaves the device's qdisc entirely
 * alone.  NOTE(review): some initializer lines are elided in this extract. */
4531 static const struct tc_ops tc_ops_noop = {
4532     NULL,                       /* linux_name */
4533     "linux-noop",               /* ovs_name */
4537     NULL,                       /* tc_destroy */
4538     NULL,                       /* qdisc_get */
4539     NULL,                       /* qdisc_set */
4540     NULL,                       /* class_get */
4541     NULL,                       /* class_set */
4542     NULL,                       /* class_delete */
4543     NULL,                       /* class_get_stats */
4544     NULL                        /* class_dump_stats */
4547 /* "linux-default" traffic control class.
4549 * This class represents the default, unnamed Linux qdisc. It corresponds to
4550 * the "" (empty string) QoS type in the OVS database. */
/* Installs the shared, immutable tc state representing the default Linux
 * qdisc on 'netdev_'. */
4553 default_install__(struct netdev *netdev_)
4555     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4556     static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
4558     /* Nothing but a tc class implementation is allowed to write to a tc.  This
4559      * class never does that, so we can legitimately use a const tc object. */
4560     netdev->tc = CONST_CAST(struct tc *, &tc);
/* tc_ops "tc_install" callback for the default qdisc: records the shared
 * default tc state.  'details' is ignored. */
4564 default_tc_install(struct netdev *netdev,
4565                    const struct smap *details OVS_UNUSED)
4567     default_install__(netdev);
/* tc_ops "tc_load" callback for the default qdisc: records the shared
 * default tc state. */
4572 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4574     default_install__(netdev);
/* tc_ops vtable for the default (unnamed) Linux qdisc, the "" QoS type.
 * NOTE(review): some initializer lines are elided in this extract. */
4578 static const struct tc_ops tc_ops_default = {
4579     NULL,                       /* linux_name */
4584     NULL,                       /* tc_destroy */
4585     NULL,                       /* qdisc_get */
4586     NULL,                       /* qdisc_set */
4587     NULL,                       /* class_get */
4588     NULL,                       /* class_set */
4589     NULL,                       /* class_delete */
4590     NULL,                       /* class_get_stats */
4591     NULL                        /* class_dump_stats */
4594 /* "linux-other" traffic control class.
/* tc_ops "tc_load" callback for "linux-other": used when the kernel reports
 * a qdisc type OVS does not understand; records shared read-only state. */
4599 other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
4601     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4602     static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
4604     /* Nothing but a tc class implementation is allowed to write to a tc.  This
4605      * class never does that, so we can legitimately use a const tc object. */
4606     netdev->tc = CONST_CAST(struct tc *, &tc);
/* tc_ops vtable for "linux-other": an unrecognized kernel qdisc that OVS
 * can observe but not configure.
 * NOTE(review): some initializer lines are elided in this extract. */
4610 static const struct tc_ops tc_ops_other = {
4611     NULL,                       /* linux_name */
4612     "linux-other",              /* ovs_name */
4614     NULL,                       /* tc_install */
4616     NULL,                       /* tc_destroy */
4617     NULL,                       /* qdisc_get */
4618     NULL,                       /* qdisc_set */
4619     NULL,                       /* class_get */
4620     NULL,                       /* class_set */
4621     NULL,                       /* class_delete */
4622     NULL,                       /* class_get_stats */
4623     NULL                        /* class_dump_stats */
4626 /* Traffic control. */
4628 /* Number of kernel "tc" ticks per second. */
4629 static double ticks_per_s;
4631 /* Number of kernel "jiffies" per second. This is used for the purpose of
4632 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
4633 * one jiffy's worth of data.
4635 * There are two possibilities here:
4637 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
4638 * approximate range of 100 to 1024. That means that we really need to
4639 * make sure that the qdisc can buffer that much data.
4641 * - 'buffer_hz' is an absurdly large number. That means that the kernel
4642 * has finely granular timers and there's no need to fudge additional room
4643 * for buffers. (There's no extra effort needed to implement that: the
4644 * large 'buffer_hz' is used as a divisor, so practically any number will
4645 * come out as 0 in the division. Small integer results in the case of
4646 * really high dividends won't have any real effect anyhow.)
4648 static unsigned int buffer_hz;
4650 /* Returns tc handle 'major':'minor'.
 * (The major number occupies the upper 16 bits of the 32-bit handle.) */
4652 tc_make_handle(unsigned int major, unsigned int minor)
4654     return TC_H_MAKE(major << 16, minor);
4657 /* Returns the major number from 'handle' (upper 16 bits). */
4659 tc_get_major(unsigned int handle)
4661     return TC_H_MAJ(handle) >> 16;
4664 /* Returns the minor number from 'handle' (lower 16 bits). */
4666 tc_get_minor(unsigned int handle)
4668     return TC_H_MIN(handle);
/* Builds an rtnetlink tc request of the given 'type' (e.g. RTM_NEWQDISC)
 * and 'flags' for 'netdev' into 'request', returning a pointer to the
 * embedded tcmsg whose tcm_handle/tcm_parent the caller must fill in.
 * NOTE(review): the error-return path after get_ifindex() is elided in this
 * extract; presumably it returns NULL on failure — confirm in full source. */
4671 static struct tcmsg *
4672 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
4673                 struct ofpbuf *request)
4675     struct tcmsg *tcmsg;
4679     error = get_ifindex(netdev, &ifindex);
4684     ofpbuf_init(request, 512);
4685     nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
4686     tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
4687     tcmsg->tcm_family = AF_UNSPEC;
4688     tcmsg->tcm_ifindex = ifindex;
4689     /* Caller should fill in tcmsg->tcm_handle. */
4690     /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' on the NETLINK_ROUTE socket and, if 'replyp' is nonnull,
 * stores the reply there.  Always uninitializes 'request'.  Returns 0 on
 * success, otherwise a positive errno value. */
4696 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
4698     int error = nl_transact(NETLINK_ROUTE, request, replyp);
4699     ofpbuf_uninit(request);
4703 /* Adds or deletes a root ingress qdisc on 'netdev'.  We use this for
4704  * policing configuration.
4706  * This function is equivalent to running the following when 'add' is true:
4707  *     /sbin/tc qdisc add dev <devname> handle ffff: ingress
4709  * This function is equivalent to running the following when 'add' is false:
4710  *     /sbin/tc qdisc del dev <devname> handle ffff: ingress
4712  * The configuration and stats may be seen with the following command:
4713  *     /sbin/tc -s qdisc show dev <devname>
4715  * Returns 0 if successful, otherwise a positive errno value.
4718 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
4720     struct ofpbuf request;
4721     struct tcmsg *tcmsg;
4723     int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
4724     int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
4726     tcmsg = tc_make_request(netdev, type, flags, &request);
/* Handle ffff:0 with TC_H_INGRESS parent selects the ingress qdisc. */
4730     tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
4731     tcmsg->tcm_parent = TC_H_INGRESS;
4732     nl_msg_put_string(&request, TCA_KIND, "ingress");
4733     nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
4735     error = tc_transact(&request, NULL);
4737         /* If we're deleting the qdisc, don't worry about some of the
4738          * error conditions. */
4739         if (!add && (error == ENOENT || error == EINVAL)) {
4748 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
4751  * This function is equivalent to running:
4752  *     /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
4753  *              basic police rate <kbits_rate>kbit burst <kbits_burst>k
4756  * The configuration and stats may be seen with the following command:
4757  *     /sbin/tc -s filter show dev <devname> parent ffff:
4759  * Returns 0 if successful, otherwise a positive errno value.
4762 tc_add_policer(struct netdev *netdev,
4763                uint32_t kbits_rate, uint32_t kbits_burst)
4765     struct tc_police tc_police;
4766     struct ofpbuf request;
4767     struct tcmsg *tcmsg;
4768     size_t basic_offset;
4769     size_t police_offset;
/* Drop packets that exceed the policer rate; rate is converted from
 * kbits/s to bytes/s for the kernel's tc_ratespec. */
4773     memset(&tc_police, 0, sizeof tc_police);
4774     tc_police.action = TC_POLICE_SHOT;
4775     tc_police.mtu = mtu;
4776     tc_fill_rate(&tc_police.rate, ((uint64_t) kbits_rate * 1000)/8, mtu);
4778     /* The following appears wrong in one way: In networking a kilobit is
4779      * usually 1000 bits but this uses 1024 bits.
4781      * However if you "fix" those problems then "tc filter show ..." shows
4782      * "125000b", meaning 125,000 bits, when OVS configures it for 1000 kbit ==
4783      * 1,000,000 bits, whereas this actually ends up doing the right thing from
4784      * tc's point of view.  Whatever. */
4785     tc_police.burst = tc_bytes_to_ticks(
4786         tc_police.rate.rate, MIN(UINT32_MAX / 1024, kbits_burst) * 1024 / 8);
4788     tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
4789                             NLM_F_EXCL | NLM_F_CREATE, &request);
/* Attach the filter under the ingress qdisc (ffff:), priority 49, matching
 * all protocols. */
4793     tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
4794     tcmsg->tcm_info = tc_make_handle(49,
4795                                      (OVS_FORCE uint16_t) htons(ETH_P_ALL));
4797     nl_msg_put_string(&request, TCA_KIND, "basic");
4798     basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4799     police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
4800     nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
4801     tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
4802     nl_msg_end_nested(&request, police_offset);
4803     nl_msg_end_nested(&request, basic_offset);
4805     error = tc_transact(&request, NULL);
4816 /* The values in psched are not individually very meaningful, but they are
4817 * important. The tables below show some values seen in the wild.
4821 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
4822 * (Before that, there are hints that it was 1000000000.)
4824 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
4828 * -----------------------------------
4829 * [1] 000c8000 000f4240 000f4240 00000064
4830 * [2] 000003e8 00000400 000f4240 3b9aca00
4831 * [3] 000003e8 00000400 000f4240 3b9aca00
4832 * [4] 000003e8 00000400 000f4240 00000064
4833 * [5] 000003e8 00000040 000f4240 3b9aca00
4834 * [6] 000003e8 00000040 000f4240 000000f9
4836 * a b c d ticks_per_s buffer_hz
4837 * ------- --------- ---------- ------------- ----------- -------------
4838 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
4839 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4840 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4841 * [4] 1,000 1,024 1,000,000 100 976,562 100
4842 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
4843 * [6] 1,000 64 1,000,000 249 15,625,000 249
4845 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
4846 * [2] 2.6.26-1-686-bigmem from Debian lenny
4847 * [3] 2.6.26-2-sparc64 from Debian lenny
4848 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
4849 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
4850 * [6] 2.6.34 from kernel.org on KVM
/* NOTE(review): the function signature for this body (presumably the
 * one-time /proc/net/psched reader that initializes 'ticks_per_s' and
 * 'buffer_hz' — confirm in full source) is elided in this extract.
 * Reads four hex fields a b c d and derives ticks_per_s = a*c/b; runs at
 * most once per process via ovsthread_once. */
4852     static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4853     static const char fn[] = "/proc/net/psched";
4854     unsigned int a, b, c, d;
4857     if (!ovsthread_once_start(&once)) {
4864     stream = fopen(fn, "r");
4866         VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
4870     if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
4871         VLOG_WARN("%s: read failed", fn);
4875     VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
4879         VLOG_WARN("%s: invalid scheduler parameters", fn);
4883     ticks_per_s = (double) a * c / b;
4887         VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
4890     VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
4893     ovsthread_once_done(&once);
4896 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
4897  * rate of 'rate' bytes per second.
 * (Relies on ticks_per_s having been initialized from /proc/net/psched.) */
4899 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
4902     return (rate * ticks) / ticks_per_s;
4905 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
4906  * rate of 'rate' bytes per second.
 * (A zero 'rate' yields 0 rather than dividing by zero.) */
4908 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
4911     return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
4914 /* Returns the number of bytes that need to be reserved for qdisc buffering at
4915  * a transmission rate of 'rate' bytes per second.
 * (With an absurdly large buffer_hz this is effectively 0; see the comment
 * on 'buffer_hz' above.) */
4917 tc_buffer_per_jiffy(unsigned int rate)
4920     return rate / buffer_hz;
4923 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
4924  * e.g. "htb", into '*kind' (if it is nonnull).  If 'options' is nonnull,
4925  * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
4926  * stores NULL into it if it is absent.
4928  * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
4931  * Returns 0 if successful, otherwise a positive errno value. */
4933 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
4934                struct nlattr **options)
4936     static const struct nl_policy tca_policy[] = {
4937         [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
4938         [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
4940     struct nlattr *ta[ARRAY_SIZE(tca_policy)];
/* Attributes begin after the netlink header plus the fixed tcmsg. */
4942     if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4943                          tca_policy, ta, ARRAY_SIZE(ta))) {
4944         VLOG_WARN_RL(&rl, "failed to parse qdisc message");
4949         *kind = nl_attr_get_string(ta[TCA_KIND]);
4953         *options = ta[TCA_OPTIONS];
4968 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
4969  * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
4970  * into '*options', and its queue statistics into '*stats'.  Any of the output
4971  * arguments may be null.
4973  * Returns 0 if successful, otherwise a positive errno value. */
4975 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
4976                struct nlattr **options, struct netdev_queue_stats *stats)
4978     static const struct nl_policy tca_policy[] = {
4979         [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
4980         [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
4982     struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4984     if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4985                          tca_policy, ta, ARRAY_SIZE(ta))) {
4986         VLOG_WARN_RL(&rl, "failed to parse class message");
4991         struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
4992         *handlep = tc->tcm_handle;
4996         *options = ta[TCA_OPTIONS];
/* Extract byte/packet/drop counters from the nested TCA_STATS2 block. */
5000         const struct gnet_stats_queue *gsq;
5001         struct gnet_stats_basic gsb;
5003         static const struct nl_policy stats_policy[] = {
5004             [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
5005                                   .min_len = sizeof gsb },
5006             [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
5007                                   .min_len = sizeof *gsq },
5009         struct nlattr *sa[ARRAY_SIZE(stats_policy)];
5011         if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
5012                              sa, ARRAY_SIZE(sa))) {
5013             VLOG_WARN_RL(&rl, "failed to parse class stats");
5017         /* Alignment issues screw up the length of struct gnet_stats_basic on
5018          * some arch/bitsize combinations.  Newer versions of Linux have a
5019          * struct gnet_stats_basic_packed, but we can't depend on that.  The
5020          * easiest thing to do is just to make a copy. */
5021         memset(&gsb, 0, sizeof gsb);
5022         memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
5023                MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
5024         stats->tx_bytes = gsb.bytes;
5025         stats->tx_packets = gsb.packets;
5027         gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
5028         stats->tx_errors = gsq->drops;
/* On the elided error path, 'stats' is zeroed rather than left stale. */
5038         memset(stats, 0, sizeof *stats);
5043 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
 * on 'netdev', storing the kernel's reply in '*replyp' on success (caller
 * owns and must free it).  Returns 0 on success, else a positive errno;
 * failures are logged with rate limiting. */
5046 tc_query_class(const struct netdev *netdev,
5047                unsigned int handle, unsigned int parent,
5048                struct ofpbuf **replyp)
5050     struct ofpbuf request;
5051     struct tcmsg *tcmsg;
/* NLM_F_ECHO asks the kernel to echo the class back in its reply. */
5054     tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
5058     tcmsg->tcm_handle = handle;
5059     tcmsg->tcm_parent = parent;
5061     error = tc_transact(&request, replyp);
5063         VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
5064                      netdev_get_name(netdev),
5065                      tc_get_major(handle), tc_get_minor(handle),
5066                      tc_get_major(parent), tc_get_minor(parent),
5067                      ovs_strerror(error));
5072 /* Equivalent to "tc class del dev <name> handle <handle>".
 * Returns 0 on success, else a positive errno; failures are logged. */
5074 tc_delete_class(const struct netdev *netdev, unsigned int handle)
5076     struct ofpbuf request;
5077     struct tcmsg *tcmsg;
5080     tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
5084     tcmsg->tcm_handle = handle;
5085     tcmsg->tcm_parent = 0;
5087     error = tc_transact(&request, NULL);
5089         VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
5090                      netdev_get_name(netdev),
5091                      tc_get_major(handle), tc_get_minor(handle),
5092                      ovs_strerror(error));
5097 /* Equivalent to "tc qdisc del dev <name> root".
 * Also tears down any cached OVS tc state for the device via the installed
 * implementation's tc_destroy callback. */
5099 tc_del_qdisc(struct netdev *netdev_)
5101     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5102     struct ofpbuf request;
5103     struct tcmsg *tcmsg;
5106     tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
5110     tcmsg->tcm_handle = tc_make_handle(1, 0);
5111     tcmsg->tcm_parent = TC_H_ROOT;
5113     error = tc_transact(&request, NULL);
5114     if (error == EINVAL) {
5115         /* EINVAL probably means that the default qdisc was in use, in which
5116          * case we've accomplished our purpose. */
5119     if (!error && netdev->tc) {
5120         if (netdev->tc->ops->tc_destroy) {
5121             netdev->tc->ops->tc_destroy(netdev->tc);
/* Returns whether it is safe to issue RTM_GETQDISC on this kernel.
 * Kernels before 2.6.35 can OOPS on RTM_GETQDISC for built-in qdiscs (see
 * the long comment in tc_query_qdisc below); the kernel version is checked
 * once via uname() and cached. */
5129 getqdisc_is_safe(void)
5131     static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5132     static bool safe = false;
5134     if (ovsthread_once_start(&once)) {
5135         struct utsname utsname;
5138         if (uname(&utsname) == -1) {
5139             VLOG_WARN("uname failed (%s)", ovs_strerror(errno));
5140         } else if (!ovs_scan(utsname.release, "%d.%d", &major, &minor)) {
5141             VLOG_WARN("uname reported bad OS release (%s)", utsname.release);
5142         } else if (major < 2 || (major == 2 && minor < 35)) {
5143             VLOG_INFO("disabling unsafe RTM_GETQDISC in Linux kernel %s",
5148         ovsthread_once_done(&once);
5153 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
5154 * kernel to determine what they are. Returns 0 if successful, otherwise a
5155 * positive errno value. */
/* NOTE(review): truncated chunk -- the early-return for an already-cached
 * netdev->tc, the 'error'/'load_error'/'kind' declarations, and several
 * braces are missing between the visible lines; do not restyle without
 * the complete file.  Visible flow: craft a (kernel-version-dependent)
 * RTM_GETQDISC request, map the reply to a tc_ops via
 * tc_lookup_linux_name(), fall back to tc_ops_default/tc_ops_other, then
 * instantiate the chosen ops with tc_load(). */
5157 tc_query_qdisc(const struct netdev *netdev_)
5159 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5160 struct ofpbuf request, *qdisc;
5161 const struct tc_ops *ops;
5162 struct tcmsg *tcmsg;
5170 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
5171 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
5172 * 2.6.35 without that fix backported to it.
5174 * To avoid the OOPS, we must not make a request that would attempt to dump
5175 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
5176 * few others. There are a few ways that I can see to do this, but most of
5177 * them seem to be racy (and if you lose the race the kernel OOPSes). The
5178 * technique chosen here is to assume that any non-default qdisc that we
5179 * create will have a class with handle 1:0. The built-in qdiscs only have
5180 * a class with handle 0:0.
5182 * On Linux 2.6.35+ we use the straightforward method because it allows us
5183 * to handle non-builtin qdiscs without handle 1:0 (e.g. codel). However,
5184 * in such a case we get no response at all from the kernel (!) if a
5185 * builtin qdisc is in use (which is later caught by "!error &&
5186 * !qdisc->size"). */
5187 tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
5191 tcmsg->tcm_handle = tc_make_handle(getqdisc_is_safe() ? 0 : 1, 0);
5192 tcmsg->tcm_parent = getqdisc_is_safe() ? TC_H_ROOT : 0;
5194 /* Figure out what tc class to instantiate. */
5195 error = tc_transact(&request, &qdisc);
5196 if (!error && qdisc->size) {
5199 error = tc_parse_qdisc(qdisc, &kind, NULL);
5201 ops = &tc_ops_other;
5203 ops = tc_lookup_linux_name(kind);
5205 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
5206 VLOG_DBG_RL(&rl2, "unknown qdisc \"%s\"", kind);
5208 ops = &tc_ops_other;
5211 } else if ((!error && !qdisc->size) || error == ENOENT) {
5212 /* Either it's a built-in qdisc, or (on Linux pre-2.6.35) it's a qdisc
5213 * set up by some other entity that doesn't have a handle 1:0. We will
5214 * assume that it's the system default qdisc. */
5215 ops = &tc_ops_default;
5218 /* Who knows? Maybe the device got deleted. */
5219 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
5220 netdev_get_name(netdev_), ovs_strerror(error));
5221 ops = &tc_ops_other;
5224 /* Instantiate it. */
5225 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
/* The ops' tc_load must set netdev->tc exactly when it succeeds. */
5226 ovs_assert((load_error == 0) == (netdev->tc != NULL));
5227 ofpbuf_delete(qdisc);
5229 return error ? error : load_error;
5232 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
5233 approximate the time to transmit packets of various lengths. For an MTU of
5234 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
5235 represents two possible packet lengths; for a MTU of 513 through 1024, four
5236 possible lengths; and so on.
5238 Returns, for the specified 'mtu', the number of bits that packet lengths
5239 need to be shifted right to fit within such a 256-entry table. */
/* NOTE(review): truncated -- the return-type line, the zero-MTU check
 * guarding the ETH_PAYLOAD_MAX default, and the final 'return cell_log;'
 * are not visible; confirm against the full file. */
5241 tc_calc_cell_log(unsigned int mtu)
/* A zero 'mtu' falls back to the standard Ethernet payload maximum. */
5246 mtu = ETH_PAYLOAD_MAX;
/* Account for L2 framing (Ethernet header plus one VLAN tag). */
5248 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
/* Find the smallest shift that brings 'mtu' under the 256-entry table. */
5250 for (cell_log = 0; mtu >= 256; cell_log++) {
5257 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
/* (Comment continues in the missing lines -- likely "...of 'mtu'".)
 * NOTE(review): truncated -- the return-type line, braces, and the
 * 'rate->rate = Bps;' assignment are not visible; confirm against the
 * complete file. */
5260 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
5262 memset(rate, 0, sizeof *rate);
5263 rate->cell_log = tc_calc_cell_log(mtu);
5264 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
5265 /* rate->cell_align = 0; */ /* distro headers. */
5266 rate->mpu = ETH_TOTAL_MIN;
5270 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
5271 * attribute of the specified "type".
5273 * See tc_calc_cell_log() above for a description of "rtab"s. */
/* NOTE(review): truncated -- the return-type line, braces, and the
 * 'rtab'/'i' declarations are not visible; confirm against the full
 * file.  Each rtab slot i holds the transmit time (in ticks) for a
 * packet of size (i+1) << cell_log, clamped up to the minimum packet
 * unit 'mpu'. */
5275 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
5280 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
5281 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
5282 unsigned packet_size = (i + 1) << rate->cell_log;
5283 if (packet_size < rate->mpu) {
5284 packet_size = rate->mpu;
5286 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
5290 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
5291 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
5292 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
/* (Comment continues in the missing lines.)
 * NOTE(review): the return-type line and braces are missing from this
 * chunk.  The burst is floored at one jiffy's worth of bytes plus one
 * MTU so the bucket never underruns within a scheduler tick. */
5295 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
5297 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
5298 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
5301 /* Linux-only functions declared in netdev-linux.h */
5303 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
5304 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
/* NOTE(review): truncated -- the return-type line, braces, the 'error'/
 * 'new_flags' declarations, and the early-return error checks after each
 * ioctl are missing; confirm against the full file.  Visible flow:
 * ETHTOOL_GFLAGS to read, compute the new mask, ETHTOOL_SFLAGS to write,
 * then ETHTOOL_GFLAGS again to verify the kernel actually applied it. */
5306 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
5307 const char *flag_name, bool enable)
5309 const char *netdev_name = netdev_get_name(netdev);
5310 struct ethtool_value evalue;
5314 COVERAGE_INC(netdev_get_ethtool);
5315 memset(&evalue, 0, sizeof evalue);
5316 error = netdev_linux_do_ethtool(netdev_name,
5317 (struct ethtool_cmd *)&evalue,
5318 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5323 COVERAGE_INC(netdev_set_ethtool);
5324 new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
/* Nothing to do if the bit already has the requested value. */
5325 if (new_flags == evalue.data) {
5328 evalue.data = new_flags;
5329 error = netdev_linux_do_ethtool(netdev_name,
5330 (struct ethtool_cmd *)&evalue,
5331 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
/* Re-read the flags to confirm the set took effect. */
5336 COVERAGE_INC(netdev_get_ethtool);
5337 memset(&evalue, 0, sizeof evalue);
5338 error = netdev_linux_do_ethtool(netdev_name,
5339 (struct ethtool_cmd *)&evalue,
5340 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5345 if (new_flags != evalue.data) {
5346 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
5347 "device %s failed", enable ? "enable" : "disable",
5348 flag_name, netdev_name);
5355 /* Utility functions. */
5357 /* Copies 'src' into 'dst', performing format conversion in the process. */
/* Field-by-field widening copy from the kernel's 32-bit
 * struct rtnl_link_stats into OVS's struct netdev_stats.
 * NOTE(review): the return-type line and braces are missing from this
 * chunk; the assignment list itself appears complete. */
5359 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
5360 const struct rtnl_link_stats *src)
5362 dst->rx_packets = src->rx_packets;
5363 dst->tx_packets = src->tx_packets;
5364 dst->rx_bytes = src->rx_bytes;
5365 dst->tx_bytes = src->tx_bytes;
5366 dst->rx_errors = src->rx_errors;
5367 dst->tx_errors = src->tx_errors;
5368 dst->rx_dropped = src->rx_dropped;
5369 dst->tx_dropped = src->tx_dropped;
5370 dst->multicast = src->multicast;
5371 dst->collisions = src->collisions;
5372 dst->rx_length_errors = src->rx_length_errors;
5373 dst->rx_over_errors = src->rx_over_errors;
5374 dst->rx_crc_errors = src->rx_crc_errors;
5375 dst->rx_frame_errors = src->rx_frame_errors;
5376 dst->rx_fifo_errors = src->rx_fifo_errors;
5377 dst->rx_missed_errors = src->rx_missed_errors;
5378 dst->tx_aborted_errors = src->tx_aborted_errors;
5379 dst->tx_carrier_errors = src->tx_carrier_errors;
5380 dst->tx_fifo_errors = src->tx_fifo_errors;
5381 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
5382 dst->tx_window_errors = src->tx_window_errors;
5385 /* Copies 'src' into 'dst', performing format conversion in the process. */
/* Same field-by-field copy as netdev_stats_from_rtnl_link_stats(), but
 * from the kernel's 64-bit struct rtnl_link_stats64 (no truncation).
 * NOTE(review): the return-type line and braces are missing from this
 * chunk; the assignment list itself appears complete. */
5387 netdev_stats_from_rtnl_link_stats64(struct netdev_stats *dst,
5388 const struct rtnl_link_stats64 *src)
5390 dst->rx_packets = src->rx_packets;
5391 dst->tx_packets = src->tx_packets;
5392 dst->rx_bytes = src->rx_bytes;
5393 dst->tx_bytes = src->tx_bytes;
5394 dst->rx_errors = src->rx_errors;
5395 dst->tx_errors = src->tx_errors;
5396 dst->rx_dropped = src->rx_dropped;
5397 dst->tx_dropped = src->tx_dropped;
5398 dst->multicast = src->multicast;
5399 dst->collisions = src->collisions;
5400 dst->rx_length_errors = src->rx_length_errors;
5401 dst->rx_over_errors = src->rx_over_errors;
5402 dst->rx_crc_errors = src->rx_crc_errors;
5403 dst->rx_frame_errors = src->rx_frame_errors;
5404 dst->rx_fifo_errors = src->rx_fifo_errors;
5405 dst->rx_missed_errors = src->rx_missed_errors;
5406 dst->tx_aborted_errors = src->tx_aborted_errors;
5407 dst->tx_carrier_errors = src->tx_carrier_errors;
5408 dst->tx_fifo_errors = src->tx_fifo_errors;
5409 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
5410 dst->tx_window_errors = src->tx_window_errors;
/* Fetches interface statistics for 'netdev_' via an RTM_GETLINK netlink
 * request, preferring the 64-bit IFLA_STATS64 attribute and falling back
 * to the 32-bit IFLA_STATS.  'stats' is pre-filled with 0xFF so fields
 * the kernel does not report read as all-ones ("unsupported").
 * NOTE(review): truncated -- the return-type line, braces, the 'error'
 * early-return after nl_transact(), and the final return are missing;
 * confirm against the complete file. */
5414 get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
5416 struct ofpbuf request;
5417 struct ofpbuf *reply;
5420 /* Filtering all counters by default */
5421 memset(stats, 0xFF, sizeof(struct netdev_stats));
5423 ofpbuf_init(&request, 0);
5424 nl_msg_put_nlmsghdr(&request,
5425 sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
5426 RTM_GETLINK, NLM_F_REQUEST);
5427 ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
5428 nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
5429 error = nl_transact(NETLINK_ROUTE, &request, &reply);
5430 ofpbuf_uninit(&request);
/* Skip past the netlink and ifinfomsg headers to reach the attributes. */
5435 if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
5436 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS64);
5437 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats64)) {
5438 netdev_stats_from_rtnl_link_stats64(stats, nl_attr_get(a));
5441 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS);
5442 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
5443 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
5446 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
5451 VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");
5456 ofpbuf_delete(reply);
/* Reads 'dev''s interface flags via SIOCGIFFLAGS into '*flags'.
 * NOTE(review): truncated -- return type, braces, 'ifr'/'error'
 * declarations, and the return are not visible in this chunk. */
5461 get_flags(const struct netdev *dev, unsigned int *flags)
5467 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
5469 *flags = ifr.ifr_flags;
/* Sets the interface flags of device 'name' via SIOCSIFFLAGS.
 * NOTE(review): truncated -- return type, braces, and the 'ifr'
 * declaration are not visible in this chunk. */
5475 set_flags(const char *name, unsigned int flags)
5479 ifr.ifr_flags = flags;
5480 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
/* Looks up the ifindex of 'netdev_name' via SIOCGIFINDEX.  Returns the
 * ifindex on success; the visible warning path suggests a negative errno
 * is returned on failure -- the actual 'return -error;' line is missing
 * from this truncated chunk, so confirm against the complete file. */
5484 do_get_ifindex(const char *netdev_name)
5489 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5490 COVERAGE_INC(netdev_get_ifindex);
5492 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
5494 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
5495 netdev_name, ovs_strerror(error));
5498 return ifr.ifr_ifindex;
/* Returns 'netdev_''s ifindex through '*ifindexp', caching the result
 * (and any lookup error) under the VALID_IFINDEX bit so the ioctl is
 * performed only once.  A negative value from do_get_ifindex() is
 * treated as a negated errno.
 * NOTE(review): truncated -- return type, braces, and the negative-check
 * 'if' line are not visible in this chunk. */
5502 get_ifindex(const struct netdev *netdev_, int *ifindexp)
5504 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5506 if (!(netdev->cache_valid & VALID_IFINDEX)) {
5507 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
5510 netdev->get_ifindex_error = -ifindex;
5511 netdev->ifindex = 0;
5513 netdev->get_ifindex_error = 0;
5514 netdev->ifindex = ifindex;
5516 netdev->cache_valid |= VALID_IFINDEX;
5519 *ifindexp = netdev->ifindex;
5520 return netdev->get_ifindex_error;
/* Reads the hardware (Ethernet) address of 'netdev_name' into '*ea' via
 * SIOCGIFHWADDR.  Only AF_UNSPEC/ARPHRD_ETHER address families are
 * accepted; others are logged and (presumably) rejected.
 * NOTE(review): truncated -- return type, braces, the 'hwaddr_family'
 * declaration, error returns, and the final 'return 0;' are not visible
 * in this chunk. */
5524 get_etheraddr(const char *netdev_name, struct eth_addr *ea)
5530 memset(&ifr, 0, sizeof ifr);
5531 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5532 COVERAGE_INC(netdev_get_hwaddr);
5533 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
5535 /* ENODEV probably means that a vif disappeared asynchronously and
5536 * hasn't been removed from the database yet, so reduce the log level
5537 * to INFO for that case. */
5538 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
5539 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
5540 netdev_name, ovs_strerror(error));
5543 hwaddr_family = ifr.ifr_hwaddr.sa_family;
5544 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
5545 VLOG_INFO("%s device has unknown hardware address family %d",
5546 netdev_name, hwaddr_family);
5549 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the hardware (Ethernet) address of 'netdev_name' to 'mac' via
 * SIOCSIFHWADDR, logging an error on failure.
 * NOTE(review): truncated -- return type, braces, the 'ifr'/'error'
 * declarations, and the return statement are not visible in this
 * chunk. */
5554 set_etheraddr(const char *netdev_name, const struct eth_addr mac)
5559 memset(&ifr, 0, sizeof ifr);
5560 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5561 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
5562 memcpy(ifr.ifr_hwaddr.sa_data, &mac, ETH_ADDR_LEN);
5563 COVERAGE_INC(netdev_set_hwaddr);
5564 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
5566 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
5567 netdev_name, ovs_strerror(error));
/* Issues the ethtool command 'cmd' (e.g. ETHTOOL_GFLAGS) for device
 * 'name', using 'ecmd' as the in/out command buffer.  'cmd_name' is only
 * used for log messages.  EOPNOTSUPP is deliberately not logged because
 * many devices lack ethtool support.
 * NOTE(review): truncated -- return type, braces, the 'ecmd->cmd = cmd;'
 * assignment, and the return statements are not visible in this chunk. */
5573 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
5574 int cmd, const char *cmd_name)
5579 memset(&ifr, 0, sizeof ifr);
5580 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
5581 ifr.ifr_data = (caddr_t) ecmd;
5584 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
5586 if (error != EOPNOTSUPP) {
5587 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
5588 "failed: %s", cmd_name, name, ovs_strerror(error));
5590 /* The device doesn't support this operation. That's pretty
5591 * common, so there's no point in logging anything. */
5597 /* Returns an AF_PACKET raw socket or a negative errno value. */
5599 af_packet_sock(void)
5601 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5604 if (ovsthread_once_start(&once)) {
5605 sock = socket(AF_PACKET, SOCK_RAW, 0);
5607 int error = set_nonblocking(sock);
5614 VLOG_ERR("failed to create packet socket: %s",
5615 ovs_strerror(errno));
5617 ovsthread_once_done(&once);