2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2015 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
23 #include <arpa/inet.h>
25 #include <linux/filter.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <sys/types.h>
37 #include <sys/ioctl.h>
38 #include <sys/socket.h>
39 #include <sys/utsname.h>
40 #include <netpacket/packet.h>
42 #include <net/if_arp.h>
43 #include <net/if_packet.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
52 #include "dp-packet.h"
53 #include "dpif-netlink.h"
54 #include "dpif-netdev.h"
55 #include "openvswitch/dynamic-string.h"
56 #include "fatal-signal.h"
59 #include "netdev-provider.h"
60 #include "netdev-vport.h"
61 #include "netlink-notifier.h"
62 #include "netlink-socket.h"
64 #include "openvswitch/ofpbuf.h"
65 #include "openflow/openflow.h"
66 #include "ovs-atomic.h"
68 #include "poll-loop.h"
69 #include "rtnetlink.h"
71 #include "socket-util.h"
74 #include "unaligned.h"
75 #include "openvswitch/vlog.h"
77 VLOG_DEFINE_THIS_MODULE(netdev_linux);
79 COVERAGE_DEFINE(netdev_set_policing);
80 COVERAGE_DEFINE(netdev_arp_lookup);
81 COVERAGE_DEFINE(netdev_get_ifindex);
82 COVERAGE_DEFINE(netdev_get_hwaddr);
83 COVERAGE_DEFINE(netdev_set_hwaddr);
84 COVERAGE_DEFINE(netdev_get_ethtool);
85 COVERAGE_DEFINE(netdev_set_ethtool);
88 /* These were introduced in Linux 2.6.14, so they might be missing if we have
90 #ifndef ADVERTISED_Pause
91 #define ADVERTISED_Pause (1 << 13)
93 #ifndef ADVERTISED_Asym_Pause
94 #define ADVERTISED_Asym_Pause (1 << 14)
97 /* These were introduced in Linux 2.6.24, so they might be missing if we
98 * have old headers. */
99 #ifndef ETHTOOL_GFLAGS
100 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
102 #ifndef ETHTOOL_SFLAGS
103 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
106 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
109 #define TC_RTAB_SIZE 1024
112 /* Linux 2.6.21 introduced struct tpacket_auxdata.
113 * Linux 2.6.27 added the tp_vlan_tci member.
114 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
115 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
116 * TP_STATUS_VLAN_TPID_VALID.
118 * With all this churn it's easiest to unconditionally define a replacement
119 * structure that has everything we want.
121 #ifndef PACKET_AUXDATA
122 #define PACKET_AUXDATA 8
124 #ifndef TP_STATUS_VLAN_VALID
125 #define TP_STATUS_VLAN_VALID (1 << 4)
127 #ifndef TP_STATUS_VLAN_TPID_VALID
128 #define TP_STATUS_VLAN_TPID_VALID (1 << 6)
130 #undef tpacket_auxdata
131 #define tpacket_auxdata rpl_tpacket_auxdata
132 struct tpacket_auxdata {
138 uint16_t tp_vlan_tci;
139 uint16_t tp_vlan_tpid;
142 /* Linux 2.6.27 introduced ethtool_cmd_speed
144 * To avoid revisiting problems reported with using configure to detect
145 * compatibility (see report at
146 * http://openvswitch.org/pipermail/dev/2014-October/047978.html)
147 * unconditionally replace ethtool_cmd_speed. */
#define ethtool_cmd_speed rpl_ethtool_cmd_speed

/* Returns the link speed, in Mbps, encoded in 'ep'.  The kernel splits the
 * speed across two 16-bit fields: 'speed' holds the low 16 bits and
 * 'speed_hi' the high 16 bits. */
static inline uint32_t rpl_ethtool_cmd_speed(const struct ethtool_cmd *ep)
{
    return ep->speed | (ep->speed_hi << 16);
}
154 /* Linux 2.6.30 introduced supported and advertised flags for
155 * 1G base KX, and 10G base KX4, KR and R. */
156 #ifndef SUPPORTED_1000baseKX_Full
157 #define SUPPORTED_1000baseKX_Full (1 << 17)
158 #define SUPPORTED_10000baseKX4_Full (1 << 18)
159 #define SUPPORTED_10000baseKR_Full (1 << 19)
160 #define SUPPORTED_10000baseR_FEC (1 << 20)
161 #define ADVERTISED_1000baseKX_Full (1 << 17)
162 #define ADVERTISED_10000baseKX4_Full (1 << 18)
163 #define ADVERTISED_10000baseKR_Full (1 << 19)
164 #define ADVERTISED_10000baseR_FEC (1 << 20)
167 /* Linux 3.5 introduced supported and advertised flags for
168 * 40G base KR4, CR4, SR4 and LR4. */
169 #ifndef SUPPORTED_40000baseKR4_Full
170 #define SUPPORTED_40000baseKR4_Full (1 << 23)
171 #define SUPPORTED_40000baseCR4_Full (1 << 24)
172 #define SUPPORTED_40000baseSR4_Full (1 << 25)
173 #define SUPPORTED_40000baseLR4_Full (1 << 26)
174 #define ADVERTISED_40000baseKR4_Full (1 << 23)
175 #define ADVERTISED_40000baseCR4_Full (1 << 24)
176 #define ADVERTISED_40000baseSR4_Full (1 << 25)
177 #define ADVERTISED_40000baseLR4_Full (1 << 26)
180 /* Linux 2.6.35 introduced IFLA_STATS64 and rtnl_link_stats64.
182 * Tests for rtnl_link_stats64 don't seem to consistently work, e.g. on
183 * 2.6.32-431.29.2.el6.x86_64 (see report at
184 * http://openvswitch.org/pipermail/dev/2014-October/047978.html). Maybe
185 * if_link.h is not self-contained on those kernels. It is easiest to
186 * unconditionally define a replacement. */
188 #define IFLA_STATS64 23
190 #define rtnl_link_stats64 rpl_rtnl_link_stats64
191 struct rtnl_link_stats64 {
203 uint64_t rx_length_errors;
204 uint64_t rx_over_errors;
205 uint64_t rx_crc_errors;
206 uint64_t rx_frame_errors;
207 uint64_t rx_fifo_errors;
208 uint64_t rx_missed_errors;
210 uint64_t tx_aborted_errors;
211 uint64_t tx_carrier_errors;
212 uint64_t tx_fifo_errors;
213 uint64_t tx_heartbeat_errors;
214 uint64_t tx_window_errors;
216 uint64_t rx_compressed;
217 uint64_t tx_compressed;
221 VALID_IFINDEX = 1 << 0,
222 VALID_ETHERADDR = 1 << 1,
225 VALID_POLICING = 1 << 4,
226 VALID_VPORT_STAT_ERROR = 1 << 5,
227 VALID_DRVINFO = 1 << 6,
228 VALID_FEATURES = 1 << 7,
231 /* Traffic control. */
233 /* An instance of a traffic control class. Always associated with a particular
236 * Each TC implementation subclasses this with whatever additional data it
239 const struct tc_ops *ops;
240 struct hmap queues; /* Contains "struct tc_queue"s.
241 * Read by generic TC layer.
242 * Written only by TC implementation. */
245 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
247 /* One traffic control queue.
249 * Each TC implementation subclasses this with whatever additional data it
252 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
253 unsigned int queue_id; /* OpenFlow queue ID. */
254 long long int created; /* Time queue was created, in msecs. */
257 /* A particular kind of traffic control. Each implementation generally maps to
258 * one particular Linux qdisc class.
260 * The functions below return 0 if successful or a positive errno value on
261 * failure, except where otherwise noted. All of them must be provided, except
262 * where otherwise noted. */
264 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
265 * This is null for tc_ops_default and tc_ops_other, for which there are no
266 * appropriate values. */
267 const char *linux_name;
269 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
270 const char *ovs_name;
272 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
273 * queues. The queues are numbered 0 through n_queues - 1. */
274 unsigned int n_queues;
276 /* Called to install this TC class on 'netdev'. The implementation should
277 * make the Netlink calls required to set up 'netdev' with the right qdisc
278 * and configure it according to 'details'. The implementation may assume
279 * that the current qdisc is the default; that is, there is no need for it
280 * to delete the current qdisc before installing itself.
282 * The contents of 'details' should be documented as valid for 'ovs_name'
283 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
284 * (which is built as ovs-vswitchd.conf.db(8)).
286 * This function must return 0 if and only if it sets 'netdev->tc' to an
287 * initialized 'struct tc'.
289 * (This function is null for tc_ops_other, which cannot be installed. For
290 * other TC classes it should always be nonnull.) */
291 int (*tc_install)(struct netdev *netdev, const struct smap *details);
293 /* Called when the netdev code determines (through a Netlink query) that
294 * this TC class's qdisc is installed on 'netdev', but we didn't install
295 * it ourselves and so don't know any of the details.
297 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
298 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
299 * implementation should parse the other attributes of 'nlmsg' as
300 * necessary to determine its configuration. If necessary it should also
301 * use Netlink queries to determine the configuration of queues on
304 * This function must return 0 if and only if it sets 'netdev->tc' to an
305 * initialized 'struct tc'. */
306 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
308 /* Destroys the data structures allocated by the implementation as part of
309 * 'tc'. (This includes destroying 'tc->queues' by calling
312 * The implementation should not need to perform any Netlink calls. If
313 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
314 * (But it may not be desirable.)
316 * This function may be null if 'tc' is trivial. */
317 void (*tc_destroy)(struct tc *tc);
319 /* Retrieves details of 'netdev->tc' configuration into 'details'.
321 * The implementation should not need to perform any Netlink calls, because
322 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
323 * cached the configuration.
325 * The contents of 'details' should be documented as valid for 'ovs_name'
326 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
327 * (which is built as ovs-vswitchd.conf.db(8)).
329 * This function may be null if 'tc' is not configurable.
331 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
333 /* Reconfigures 'netdev->tc' according to 'details', performing any
334 * required Netlink calls to complete the reconfiguration.
336 * The contents of 'details' should be documented as valid for 'ovs_name'
337 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
338 * (which is built as ovs-vswitchd.conf.db(8)).
340 * This function may be null if 'tc' is not configurable.
342 int (*qdisc_set)(struct netdev *, const struct smap *details);
344 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
345 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
347 * The contents of 'details' should be documented as valid for 'ovs_name'
348 * in the "other_config" column in the "Queue" table in
349 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
351 * The implementation should not need to perform any Netlink calls, because
352 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
353 * cached the queue configuration.
355 * This function may be null if 'tc' does not have queues ('n_queues' is
357 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
358 struct smap *details);
360 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
361 * 'details', perfoming any required Netlink calls to complete the
362 * reconfiguration. The caller ensures that 'queue_id' is less than
365 * The contents of 'details' should be documented as valid for 'ovs_name'
366 * in the "other_config" column in the "Queue" table in
367 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
369 * This function may be null if 'tc' does not have queues or its queues are
370 * not configurable. */
371 int (*class_set)(struct netdev *, unsigned int queue_id,
372 const struct smap *details);
374 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
375 * tc_queue's within 'netdev->tc->queues'.
377 * This function may be null if 'tc' does not have queues or its queues
378 * cannot be deleted. */
379 int (*class_delete)(struct netdev *, struct tc_queue *queue);
381 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
382 * 'struct tc_queue's within 'netdev->tc->queues'.
384 * On success, initializes '*stats'.
386 * This function may be null if 'tc' does not have queues or if it cannot
387 * report queue statistics. */
388 int (*class_get_stats)(const struct netdev *netdev,
389 const struct tc_queue *queue,
390 struct netdev_queue_stats *stats);
392 /* Extracts queue stats from 'nlmsg', which is a response to a
393 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
395 * This function may be null if 'tc' does not have queues or if it cannot
396 * report queue statistics. */
397 int (*class_dump_stats)(const struct netdev *netdev,
398 const struct ofpbuf *nlmsg,
399 netdev_dump_queue_stats_cb *cb, void *aux);
403 tc_init(struct tc *tc, const struct tc_ops *ops)
406 hmap_init(&tc->queues);
410 tc_destroy(struct tc *tc)
412 hmap_destroy(&tc->queues);
415 static const struct tc_ops tc_ops_htb;
416 static const struct tc_ops tc_ops_hfsc;
417 static const struct tc_ops tc_ops_codel;
418 static const struct tc_ops tc_ops_fqcodel;
419 static const struct tc_ops tc_ops_sfq;
420 static const struct tc_ops tc_ops_default;
421 static const struct tc_ops tc_ops_noop;
422 static const struct tc_ops tc_ops_other;
424 static const struct tc_ops *const tcs[] = {
425 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
426 &tc_ops_hfsc, /* Hierarchical fair service curve. */
427 &tc_ops_codel, /* Controlled delay */
428 &tc_ops_fqcodel, /* Fair queue controlled delay */
429 &tc_ops_sfq, /* Stochastic fair queueing */
430 &tc_ops_noop, /* Non operating qos type. */
431 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
432 &tc_ops_other, /* Some other qdisc. */
436 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
437 static unsigned int tc_get_major(unsigned int handle);
438 static unsigned int tc_get_minor(unsigned int handle);
440 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
441 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
442 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
444 static struct tcmsg *tc_make_request(const struct netdev *, int type,
445 unsigned int flags, struct ofpbuf *);
446 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
447 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
448 static int tc_add_policer(struct netdev *,
449 uint32_t kbits_rate, uint32_t kbits_burst);
451 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
452 struct nlattr **options);
453 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
454 struct nlattr **options,
455 struct netdev_queue_stats *);
456 static int tc_query_class(const struct netdev *,
457 unsigned int handle, unsigned int parent,
458 struct ofpbuf **replyp);
459 static int tc_delete_class(const struct netdev *, unsigned int handle);
461 static int tc_del_qdisc(struct netdev *netdev);
462 static int tc_query_qdisc(const struct netdev *netdev);
464 static int tc_calc_cell_log(unsigned int mtu);
465 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
466 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
467 const struct tc_ratespec *rate);
468 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
470 struct netdev_linux {
473 /* Protects all members below. */
474 struct ovs_mutex mutex;
476 unsigned int cache_valid;
478 bool miimon; /* Link status of last poll. */
479 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
480 struct timer miimon_timer;
482 /* The following are figured out "on demand" only. They are only valid
483 * when the corresponding VALID_* bit in 'cache_valid' is set. */
485 struct eth_addr etheraddr;
487 unsigned int ifi_flags;
488 long long int carrier_resets;
489 uint32_t kbits_rate; /* Policing data. */
490 uint32_t kbits_burst;
491 int vport_stats_error; /* Cached error code from vport_get_stats().
492 0 or an errno value. */
493 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
494 int ether_addr_error; /* Cached error code from set/get etheraddr. */
495 int netdev_policing_error; /* Cached error code from set policing. */
496 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
497 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
499 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
500 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
501 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
503 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
506 /* For devices of class netdev_tap_class only. */
510 struct netdev_rxq_linux {
511 struct netdev_rxq up;
516 /* This is set pretty low because we probably won't learn anything from the
517 * additional log messages. */
518 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
520 /* Polling miimon status for all ports causes performance degradation when
521 * handling a large number of ports. If there are no devices using miimon, then
522 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
524 * Readers do not depend on this variable synchronizing with the related
525 * changes in the device miimon status, so we can use atomic_count. */
526 static atomic_count miimon_cnt = ATOMIC_COUNT_INIT(0);
528 static void netdev_linux_run(void);
530 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
531 int cmd, const char *cmd_name);
532 static int get_flags(const struct netdev *, unsigned int *flags);
533 static int set_flags(const char *, unsigned int flags);
534 static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
535 enum netdev_flags on, enum netdev_flags *old_flagsp)
536 OVS_REQUIRES(netdev->mutex);
537 static int do_get_ifindex(const char *netdev_name);
538 static int get_ifindex(const struct netdev *, int *ifindexp);
539 static int do_set_addr(struct netdev *netdev,
540 int ioctl_nr, const char *ioctl_name,
541 struct in_addr addr);
542 static int get_etheraddr(const char *netdev_name, struct eth_addr *ea);
543 static int set_etheraddr(const char *netdev_name, const struct eth_addr);
544 static int get_stats_via_netlink(const struct netdev *, struct netdev_stats *);
545 static int af_packet_sock(void);
546 static bool netdev_linux_miimon_enabled(void);
547 static void netdev_linux_miimon_run(void);
548 static void netdev_linux_miimon_wait(void);
549 static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup);
552 is_netdev_linux_class(const struct netdev_class *netdev_class)
554 return netdev_class->run == netdev_linux_run;
558 is_tap_netdev(const struct netdev *netdev)
560 return netdev_get_class(netdev) == &netdev_tap_class;
563 static struct netdev_linux *
564 netdev_linux_cast(const struct netdev *netdev)
566 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
568 return CONTAINER_OF(netdev, struct netdev_linux, up);
571 static struct netdev_rxq_linux *
572 netdev_rxq_linux_cast(const struct netdev_rxq *rx)
574 ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
575 return CONTAINER_OF(rx, struct netdev_rxq_linux, up);
578 static void netdev_linux_update(struct netdev_linux *netdev,
579 const struct rtnetlink_change *)
580 OVS_REQUIRES(netdev->mutex);
581 static void netdev_linux_changed(struct netdev_linux *netdev,
582 unsigned int ifi_flags, unsigned int mask)
583 OVS_REQUIRES(netdev->mutex);
585 /* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK,
586 * RTNLGRP_IPV4_IFADDR and RTNLGRP_IPV6_IFADDR changes, or NULL
587 * if no such socket could be created. */
588 static struct nl_sock *
589 netdev_linux_notify_sock(void)
591 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
592 static struct nl_sock *sock;
593 unsigned int mcgroups[] = {RTNLGRP_LINK, RTNLGRP_IPV4_IFADDR,
594 RTNLGRP_IPV6_IFADDR, RTNLGRP_IPV6_IFINFO};
596 if (ovsthread_once_start(&once)) {
599 error = nl_sock_create(NETLINK_ROUTE, &sock);
603 for (i = 0; i < ARRAY_SIZE(mcgroups); i++) {
604 error = nl_sock_join_mcgroup(sock, mcgroups[i]);
606 nl_sock_destroy(sock);
612 ovsthread_once_done(&once);
619 netdev_linux_miimon_enabled(void)
621 return atomic_count_get(&miimon_cnt) > 0;
625 netdev_linux_run(void)
627 struct nl_sock *sock;
630 if (netdev_linux_miimon_enabled()) {
631 netdev_linux_miimon_run();
634 sock = netdev_linux_notify_sock();
640 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
641 uint64_t buf_stub[4096 / 8];
644 ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
645 error = nl_sock_recv(sock, &buf, false);
647 struct rtnetlink_change change;
649 if (rtnetlink_parse(&buf, &change)) {
650 struct netdev *netdev_ = NULL;
651 char dev_name[IFNAMSIZ];
653 if (!change.ifname) {
654 change.ifname = if_indextoname(change.if_index, dev_name);
658 netdev_ = netdev_from_name(change.ifname);
660 if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
661 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
663 ovs_mutex_lock(&netdev->mutex);
664 netdev_linux_update(netdev, &change);
665 ovs_mutex_unlock(&netdev->mutex);
667 netdev_close(netdev_);
669 } else if (error == ENOBUFS) {
670 struct shash device_shash;
671 struct shash_node *node;
675 shash_init(&device_shash);
676 netdev_get_devices(&netdev_linux_class, &device_shash);
677 SHASH_FOR_EACH (node, &device_shash) {
678 struct netdev *netdev_ = node->data;
679 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
682 ovs_mutex_lock(&netdev->mutex);
683 get_flags(netdev_, &flags);
684 netdev_linux_changed(netdev, flags, 0);
685 ovs_mutex_unlock(&netdev->mutex);
687 netdev_close(netdev_);
689 shash_destroy(&device_shash);
690 } else if (error != EAGAIN) {
691 VLOG_WARN_RL(&rl, "error reading or parsing netlink (%s)",
692 ovs_strerror(error));
699 netdev_linux_wait(void)
701 struct nl_sock *sock;
703 if (netdev_linux_miimon_enabled()) {
704 netdev_linux_miimon_wait();
706 sock = netdev_linux_notify_sock();
708 nl_sock_wait(sock, POLLIN);
713 netdev_linux_changed(struct netdev_linux *dev,
714 unsigned int ifi_flags, unsigned int mask)
715 OVS_REQUIRES(dev->mutex)
717 netdev_change_seq_changed(&dev->up);
719 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
720 dev->carrier_resets++;
722 dev->ifi_flags = ifi_flags;
724 dev->cache_valid &= mask;
725 if (!(mask & VALID_IN)) {
726 netdev_get_addrs_list_flush();
731 netdev_linux_update(struct netdev_linux *dev,
732 const struct rtnetlink_change *change)
733 OVS_REQUIRES(dev->mutex)
735 if (rtnetlink_type_is_rtnlgrp_link(change->nlmsg_type)){
736 if (change->nlmsg_type == RTM_NEWLINK) {
737 /* Keep drv-info, and ip addresses. */
738 netdev_linux_changed(dev, change->ifi_flags,
739 VALID_DRVINFO | VALID_IN);
741 /* Update netdev from rtnl-change msg. */
743 dev->mtu = change->mtu;
744 dev->cache_valid |= VALID_MTU;
745 dev->netdev_mtu_error = 0;
748 if (!eth_addr_is_zero(change->mac)) {
749 dev->etheraddr = change->mac;
750 dev->cache_valid |= VALID_ETHERADDR;
751 dev->ether_addr_error = 0;
754 dev->ifindex = change->if_index;
755 dev->cache_valid |= VALID_IFINDEX;
756 dev->get_ifindex_error = 0;
758 netdev_linux_changed(dev, change->ifi_flags, 0);
760 } else if (rtnetlink_type_is_rtnlgrp_addr(change->nlmsg_type)) {
761 /* Invalidates in4, in6. */
762 netdev_linux_changed(dev, dev->ifi_flags, ~VALID_IN);
768 static struct netdev *
769 netdev_linux_alloc(void)
771 struct netdev_linux *netdev = xzalloc(sizeof *netdev);
776 netdev_linux_common_construct(struct netdev_linux *netdev)
778 ovs_mutex_init(&netdev->mutex);
781 /* Creates system and internal devices. */
783 netdev_linux_construct(struct netdev *netdev_)
785 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
788 netdev_linux_common_construct(netdev);
790 error = get_flags(&netdev->up, &netdev->ifi_flags);
791 if (error == ENODEV) {
792 if (netdev->up.netdev_class != &netdev_internal_class) {
793 /* The device does not exist, so don't allow it to be opened. */
796 /* "Internal" netdevs have to be created as netdev objects before
797 * they exist in the kernel, because creating them in the kernel
798 * happens by passing a netdev object to dpif_port_add().
799 * Therefore, ignore the error. */
806 /* For most types of netdevs we open the device for each call of
807 * netdev_open(). However, this is not the case with tap devices,
808 * since it is only possible to open the device once. In this
809 * situation we share a single file descriptor, and consequently
810 * buffers, across all readers. Therefore once data is read it will
811 * be unavailable to other reads for tap devices. */
813 netdev_linux_construct_tap(struct netdev *netdev_)
815 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
816 static const char tap_dev[] = "/dev/net/tun";
817 const char *name = netdev_->name;
821 netdev_linux_common_construct(netdev);
823 /* Open tap device. */
824 netdev->tap_fd = open(tap_dev, O_RDWR);
825 if (netdev->tap_fd < 0) {
827 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
831 /* Create tap device. */
832 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
833 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
834 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
835 VLOG_WARN("%s: creating tap device failed: %s", name,
836 ovs_strerror(errno));
841 /* Make non-blocking. */
842 error = set_nonblocking(netdev->tap_fd);
850 close(netdev->tap_fd);
855 netdev_linux_destruct(struct netdev *netdev_)
857 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
859 if (netdev->tc && netdev->tc->ops->tc_destroy) {
860 netdev->tc->ops->tc_destroy(netdev->tc);
863 if (netdev_get_class(netdev_) == &netdev_tap_class
864 && netdev->tap_fd >= 0)
866 close(netdev->tap_fd);
869 if (netdev->miimon_interval > 0) {
870 atomic_count_dec(&miimon_cnt);
873 ovs_mutex_destroy(&netdev->mutex);
/* netdev provider 'dealloc' callback: frees the memory obtained by
 * netdev_linux_alloc(). */
static void
netdev_linux_dealloc(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    free(netdev);
}
883 static struct netdev_rxq *
884 netdev_linux_rxq_alloc(void)
886 struct netdev_rxq_linux *rx = xzalloc(sizeof *rx);
891 netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
893 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
894 struct netdev *netdev_ = rx->up.netdev;
895 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
898 ovs_mutex_lock(&netdev->mutex);
899 rx->is_tap = is_tap_netdev(netdev_);
901 rx->fd = netdev->tap_fd;
903 struct sockaddr_ll sll;
905 /* Result of tcpdump -dd inbound */
906 static const struct sock_filter filt[] = {
907 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
908 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
909 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
910 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
912 static const struct sock_fprog fprog = {
913 ARRAY_SIZE(filt), (struct sock_filter *) filt
916 /* Create file descriptor. */
917 rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
920 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
925 if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) {
927 VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
928 netdev_get_name(netdev_), ovs_strerror(error));
932 /* Set non-blocking mode. */
933 error = set_nonblocking(rx->fd);
938 /* Get ethernet device index. */
939 error = get_ifindex(&netdev->up, &ifindex);
944 /* Bind to specific ethernet device. */
945 memset(&sll, 0, sizeof sll);
946 sll.sll_family = AF_PACKET;
947 sll.sll_ifindex = ifindex;
948 sll.sll_protocol = htons(ETH_P_ALL);
949 if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
951 VLOG_ERR("%s: failed to bind raw socket (%s)",
952 netdev_get_name(netdev_), ovs_strerror(error));
956 /* Filter for only inbound packets. */
957 error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
961 VLOG_ERR("%s: failed to attach filter (%s)",
962 netdev_get_name(netdev_), ovs_strerror(error));
966 ovs_mutex_unlock(&netdev->mutex);
974 ovs_mutex_unlock(&netdev->mutex);
979 netdev_linux_rxq_destruct(struct netdev_rxq *rxq_)
981 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
/* rxq provider 'dealloc' callback: frees the memory obtained by
 * netdev_linux_rxq_alloc(). */
static void
netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_)
{
    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);

    free(rx);
}
997 auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux)
999 if (aux->tp_status & TP_STATUS_VLAN_TPID_VALID) {
1000 return htons(aux->tp_vlan_tpid);
1002 return htons(ETH_TYPE_VLAN);
1007 auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
1009 return aux->tp_vlan_tci || aux->tp_status & TP_STATUS_VLAN_VALID;
1013 netdev_linux_rxq_recv_sock(int fd, struct dp_packet *buffer)
1018 struct cmsghdr *cmsg;
1020 struct cmsghdr cmsg;
1021 char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
1025 /* Reserve headroom for a single VLAN tag */
1026 dp_packet_reserve(buffer, VLAN_HEADER_LEN);
1027 size = dp_packet_tailroom(buffer);
1029 iov.iov_base = dp_packet_data(buffer);
1031 msgh.msg_name = NULL;
1032 msgh.msg_namelen = 0;
1033 msgh.msg_iov = &iov;
1034 msgh.msg_iovlen = 1;
1035 msgh.msg_control = &cmsg_buffer;
1036 msgh.msg_controllen = sizeof cmsg_buffer;
1040 retval = recvmsg(fd, &msgh, MSG_TRUNC);
1041 } while (retval < 0 && errno == EINTR);
1045 } else if (retval > size) {
1049 dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);
1051 for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg; cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
1052 const struct tpacket_auxdata *aux;
1054 if (cmsg->cmsg_level != SOL_PACKET
1055 || cmsg->cmsg_type != PACKET_AUXDATA
1056 || cmsg->cmsg_len < CMSG_LEN(sizeof(struct tpacket_auxdata))) {
1060 aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg));
1061 if (auxdata_has_vlan_tci(aux)) {
1062 if (retval < ETH_HEADER_LEN) {
1066 eth_push_vlan(buffer, auxdata_to_vlan_tpid(aux),
1067 htons(aux->tp_vlan_tci));
1076 netdev_linux_rxq_recv_tap(int fd, struct dp_packet *buffer)
1079 size_t size = dp_packet_tailroom(buffer);
1082 retval = read(fd, dp_packet_data(buffer), size);
1083 } while (retval < 0 && errno == EINTR);
1089 dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);
1094 netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch *batch)
1096 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1097 struct netdev *netdev = rx->up.netdev;
1098 struct dp_packet *buffer;
1102 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
1103 mtu = ETH_PAYLOAD_MAX;
1106 buffer = dp_packet_new_with_headroom(VLAN_ETH_HEADER_LEN + mtu,
1107 DP_NETDEV_HEADROOM);
1108 retval = (rx->is_tap
1109 ? netdev_linux_rxq_recv_tap(rx->fd, buffer)
1110 : netdev_linux_rxq_recv_sock(rx->fd, buffer));
1113 if (retval != EAGAIN && retval != EMSGSIZE) {
1114 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
1115 netdev_rxq_get_name(rxq_), ovs_strerror(errno));
1117 dp_packet_delete(buffer);
1119 dp_packet_pad(buffer);
1120 batch->packets[0] = buffer;
1128 netdev_linux_rxq_wait(struct netdev_rxq *rxq_)
1130 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1131 poll_fd_wait(rx->fd, POLLIN);
1135 netdev_linux_rxq_drain(struct netdev_rxq *rxq_)
1137 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1140 int error = af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_), &ifr,
1141 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
1145 drain_fd(rx->fd, ifr.ifr_qlen);
1148 return drain_rcvbuf(rx->fd);
1152 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
1153 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
1154 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
1155 * the packet is too big or too small to transmit on the device.
1157 * The caller retains ownership of 'buffer' in all cases.
1159 * The kernel maintains a packet transmission queue, so the caller is not
1160 * expected to do additional queuing of packets. */
/* netdev 'send' callback: transmits every packet in 'batch', either
 * through the shared AF_PACKET socket (ordinary devices) or through the
 * tap fd (tap devices).  See the contract comment above.
 * NOTE(review): fragmentary view — several declarations and the iov/msg
 * setup lines are elided. */
netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED,
                  struct dp_packet_batch *batch, bool may_steal)
    /* 'i' is incremented only if there's no error */
    for (i = 0; i < batch->count;) {
        const void *data = dp_packet_data(batch->packets[i]);
        size_t size = dp_packet_size(batch->packets[i]);
        /* Truncate the packet if it is configured. */
        size -= dp_packet_get_cutlen(batch->packets[i]);
        if (!is_tap_netdev(netdev_)) {
            /* Use our AF_PACKET socket to send to this device. */
            struct sockaddr_ll sll;
            sock = af_packet_sock();
            ifindex = netdev_get_ifindex(netdev_);
            /* We don't bother setting most fields in sockaddr_ll because the
             * kernel ignores them for SOCK_RAW. */
            memset(&sll, 0, sizeof sll);
            sll.sll_family = AF_PACKET;
            sll.sll_ifindex = ifindex;
            iov.iov_base = CONST_CAST(void *, data);
            msg.msg_name = &sll;
            msg.msg_namelen = sizeof sll;
            msg.msg_control = NULL;
            msg.msg_controllen = 0;
            retval = sendmsg(sock, &msg, 0);
            /* Use the tap fd to send to this device. This is essential for
             * tap devices, because packets sent to a tap device with an
             * AF_PACKET socket will loop back to be *received* again on the
             * tap device. This doesn't occur on other interface types
             * because we attach a socket filter to the rx socket. */
            struct netdev_linux *netdev = netdev_linux_cast(netdev_);
            retval = write(netdev->tap_fd, data, size);
        if (errno == EINTR) {
            /* The send was interrupted by a signal. Retry the packet by
             * continuing without incrementing 'i'.*/
        } else if (errno == EIO && is_tap_netdev(netdev_)) {
            /* The Linux tap driver returns EIO if the device is not up.
             * From the OVS side this is not an error, so ignore it. */
            /* The Linux AF_PACKET implementation never blocks waiting for
             * room for packets, instead returning ENOBUFS. Translate this
             * into EAGAIN for the caller. */
            error = errno == ENOBUFS ? EAGAIN : errno;
        } else if (retval != size) {
            /* NOTE(review): 'retval' is a signed byte count (ssize_t) but is
             * printed with %PRIuSIZE, which expects size_t — format/argument
             * mismatch; should be cast, e.g. (size_t) retval.  Confirm and
             * fix alongside the elided declaration. */
            VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE" bytes"
                         " of %"PRIuSIZE") on %s", retval, size,
                         netdev_get_name(netdev_));
        /* Process the next packet in the batch */
    /* Frees (or recycles) the packets if 'may_steal' permits. */
    dp_packet_delete_batch(batch, may_steal);
    if (error && error != EAGAIN) {
        VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
                     netdev_get_name(netdev_), ovs_strerror(error));
1262 /* Registers with the poll loop to wake up from the next call to poll_block()
1263 * when the packet transmission queue has sufficient room to transmit a packet
1264 * with netdev_send().
1266 * The kernel maintains a packet transmission queue, so the client is not
1267 * expected to do additional queuing of packets. Thus, this function is
1268 * unlikely to ever be used. It is included for completeness. */
netdev_linux_send_wait(struct netdev *netdev, int qid OVS_UNUSED)
    if (is_tap_netdev(netdev)) {
        /* TAP device always accepts packets.*/
        poll_immediate_wake();
1278 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1279 * otherwise a positive errno value. */
/* See the contract comment above.  Uses the VALID_ETHERADDR cache bit to
 * skip the ioctl when the address is already known to match.
 * NOTE(review): fragmentary view — some early-exit/closing braces are
 * elided. */
netdev_linux_set_etheraddr(struct netdev *netdev_, const struct eth_addr mac)
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    enum netdev_flags old_flags = 0;
    ovs_mutex_lock(&netdev->mutex);
    if (netdev->cache_valid & VALID_ETHERADDR) {
        error = netdev->ether_addr_error;
        /* Nothing to do if the cached set already failed or the address is
         * unchanged. */
        if (error || eth_addr_equals(netdev->etheraddr, mac)) {
        netdev->cache_valid &= ~VALID_ETHERADDR;
    /* Tap devices must be brought down before setting the address. */
    if (is_tap_netdev(netdev_)) {
        update_flags(netdev, NETDEV_UP, 0, &old_flags);
    error = set_etheraddr(netdev_get_name(netdev_), mac);
    /* Cache the result even on ENODEV so repeated attempts on a vanished
     * device stay cheap. */
    if (!error || error == ENODEV) {
        netdev->ether_addr_error = error;
        netdev->cache_valid |= VALID_ETHERADDR;
        netdev->etheraddr = mac;
    /* Restore the tap device's previous up state. */
    if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
        update_flags(netdev, 0, NETDEV_UP, &old_flags);
    ovs_mutex_unlock(&netdev->mutex);
1319 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
netdev_linux_get_etheraddr(const struct netdev *netdev_, struct eth_addr *mac)
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    ovs_mutex_lock(&netdev->mutex);
    /* Populate the cache on first use; subsequent calls reuse it. */
    if (!(netdev->cache_valid & VALID_ETHERADDR)) {
        netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
                                                 &netdev->etheraddr);
        netdev->cache_valid |= VALID_ETHERADDR;
    error = netdev->ether_addr_error;
    *mac = netdev->etheraddr;
    ovs_mutex_unlock(&netdev->mutex);
/* Internal MTU query; caller must hold 'netdev->mutex'.  Caches the ioctl
 * result (and its error) under VALID_MTU. */
netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
    if (!(netdev->cache_valid & VALID_MTU)) {
        netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
            netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
        netdev->mtu = ifr.ifr_mtu;
        netdev->cache_valid |= VALID_MTU;
    error = netdev->netdev_mtu_error;
    *mtup = netdev->mtu;
1364 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1365 * in bytes, not including the hardware header; thus, this is typically 1500
1366 * bytes for Ethernet devices. */
netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    /* Lock and delegate to the cached helper. */
    ovs_mutex_lock(&netdev->mutex);
    error = netdev_linux_get_mtu__(netdev, mtup);
    ovs_mutex_unlock(&netdev->mutex);
1380 /* Sets the maximum size of transmitted (MTU) for given device using linux
1381 * networking ioctl interface.
/* NOTE(review): fragmentary view — the line that loads 'mtu' into
 * ifr.ifr_mtu before the ioctl is elided. */
netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    ovs_mutex_lock(&netdev->mutex);
    if (netdev->cache_valid & VALID_MTU) {
        error = netdev->netdev_mtu_error;
        /* Skip the ioctl if a previous set failed or the MTU is already
         * the requested value. */
        if (error || netdev->mtu == mtu) {
        netdev->cache_valid &= ~VALID_MTU;
    error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
                                SIOCSIFMTU, "SIOCSIFMTU");
    /* Cache the outcome even on ENODEV, as for the other cached ioctls. */
    if (!error || error == ENODEV) {
        netdev->netdev_mtu_error = error;
        netdev->mtu = ifr.ifr_mtu;
        netdev->cache_valid |= VALID_MTU;
    ovs_mutex_unlock(&netdev->mutex);
1411 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1412 * On failure, returns a negative errno value. */
netdev_linux_get_ifindex(const struct netdev *netdev_)
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    ovs_mutex_lock(&netdev->mutex);
    error = get_ifindex(netdev_, &ifindex);
    ovs_mutex_unlock(&netdev->mutex);
    /* Per the contract above: positive ifindex on success, negative errno
     * on failure. */
    return error ? -error : ifindex;
/* Reports link state: the MII-monitor result when miimon polling is
 * enabled, otherwise the kernel's IFF_RUNNING flag. */
netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    ovs_mutex_lock(&netdev->mutex);
    if (netdev->miimon_interval > 0) {
        *carrier = netdev->miimon;
        *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
    ovs_mutex_unlock(&netdev->mutex);
1442 static long long int
1443 netdev_linux_get_carrier_resets(const struct netdev *netdev_)
1445 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1446 long long int carrier_resets;
1448 ovs_mutex_lock(&netdev->mutex);
1449 carrier_resets = netdev->carrier_resets;
1450 ovs_mutex_unlock(&netdev->mutex);
1452 return carrier_resets;
/* Issues MII ioctl 'cmd' ('cmd_name' for logging) on interface 'name',
 * copying 'data' in and back out through the ifreq.
 * NOTE(review): 'data' is memcpy'd into ifr.ifr_data rather than passed by
 * pointer — presumably relying on the mii_ioctl_data fitting in the ifreq
 * union; confirm against the kernel ABI before changing. */
netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
                       struct mii_ioctl_data *data)
    memset(&ifr, 0, sizeof ifr);
    memcpy(&ifr.ifr_data, data, sizeof *data);
    error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
    memcpy(data, &ifr.ifr_data, sizeof *data);
/* Queries link status for interface 'name' into '*miimon', first via MII
 * registers and, if that fails, via the ETHTOOL_GLINK fallback.
 * NOTE(review): fragmentary view — error checks between the visible steps
 * are elided. */
netdev_linux_get_miimon(const char *name, bool *miimon)
    struct mii_ioctl_data data;
    memset(&data, 0, sizeof data);
    /* First learn which PHY to talk to. */
    error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
    /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
    data.reg_num = MII_BMSR;
    error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
    /* Basic Mode Status Register: link-status bit. */
    *miimon = !!(data.val_out & BMSR_LSTATUS);
    VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
    struct ethtool_cmd ecmd;
    VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
    COVERAGE_INC(netdev_get_ethtool);
    memset(&ecmd, 0, sizeof ecmd);
    error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
    struct ethtool_value eval;
    /* ETHTOOL_GLINK answers in an ethtool_value overlaid on ecmd. */
    memcpy(&eval, &ecmd, sizeof eval);
    *miimon = !!eval.data;
    VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
/* Enables (interval > 0, clamped to >= 100 ms) or disables (interval <= 0)
 * MII link polling for 'netdev_', maintaining the global count of devices
 * that use miimon. */
netdev_linux_set_miimon_interval(struct netdev *netdev_,
                                 long long int interval)
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    ovs_mutex_lock(&netdev->mutex);
    /* Clamp to a 100 ms minimum; non-positive disables polling. */
    interval = interval > 0 ? MAX(interval, 100) : 0;
    if (netdev->miimon_interval != interval) {
        /* Track how many devices have miimon enabled (0 <-> nonzero
         * transitions only). */
        if (interval && !netdev->miimon_interval) {
            atomic_count_inc(&miimon_cnt);
        } else if (!interval && netdev->miimon_interval) {
            atomic_count_dec(&miimon_cnt);
        netdev->miimon_interval = interval;
        /* Force an immediate poll with the new interval. */
        timer_set_expired(&netdev->miimon_timer);
    ovs_mutex_unlock(&netdev->mutex);
/* Periodic miimon worker: polls link state for every netdev-linux device
 * whose miimon timer has expired and signals a change when the state
 * flips. */
netdev_linux_miimon_run(void)
    struct shash device_shash;
    struct shash_node *node;
    shash_init(&device_shash);
    /* Snapshot all devices of this class (each is referenced; closed
     * below). */
    netdev_get_devices(&netdev_linux_class, &device_shash);
    SHASH_FOR_EACH (node, &device_shash) {
        struct netdev *netdev = node->data;
        struct netdev_linux *dev = netdev_linux_cast(netdev);
        ovs_mutex_lock(&dev->mutex);
        if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
            netdev_linux_get_miimon(dev->up.name, &miimon);
            if (miimon != dev->miimon) {
                dev->miimon = miimon;
                /* Notify watchers that the carrier state changed. */
                netdev_linux_changed(dev, dev->ifi_flags, 0);
            /* Re-arm the timer for the next poll. */
            timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
        ovs_mutex_unlock(&dev->mutex);
        netdev_close(netdev);
    shash_destroy(&device_shash);
1568 netdev_linux_miimon_wait(void)
1570 struct shash device_shash;
1571 struct shash_node *node;
1573 shash_init(&device_shash);
1574 netdev_get_devices(&netdev_linux_class, &device_shash);
1575 SHASH_FOR_EACH (node, &device_shash) {
1576 struct netdev *netdev = node->data;
1577 struct netdev_linux *dev = netdev_linux_cast(netdev);
1579 ovs_mutex_lock(&dev->mutex);
1580 if (dev->miimon_interval > 0) {
1581 timer_wait(&dev->miimon_timer);
1583 ovs_mutex_unlock(&dev->mutex);
1584 netdev_close(netdev);
1586 shash_destroy(&device_shash);
1590 swap_uint64(uint64_t *a, uint64_t *b)
1597 /* Copies 'src' into 'dst', performing format conversion in the process.
1599 * 'src' is allowed to be misaligned. */
netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
                                  const struct ovs_vport_stats *src)
    /* Unaligned-safe reads: vport stats arrive only 32-bit aligned. */
    dst->rx_packets = get_32aligned_u64(&src->rx_packets);
    dst->tx_packets = get_32aligned_u64(&src->tx_packets);
    dst->rx_bytes = get_32aligned_u64(&src->rx_bytes);
    dst->tx_bytes = get_32aligned_u64(&src->tx_bytes);
    dst->rx_errors = get_32aligned_u64(&src->rx_errors);
    dst->tx_errors = get_32aligned_u64(&src->tx_errors);
    dst->rx_dropped = get_32aligned_u64(&src->rx_dropped);
    dst->tx_dropped = get_32aligned_u64(&src->tx_dropped);
    /* The vport layer does not track these finer-grained counters, so
     * report them as zero rather than garbage. */
    dst->collisions = 0;
    dst->rx_length_errors = 0;
    dst->rx_over_errors = 0;
    dst->rx_crc_errors = 0;
    dst->rx_frame_errors = 0;
    dst->rx_fifo_errors = 0;
    dst->rx_missed_errors = 0;
    dst->tx_aborted_errors = 0;
    dst->tx_carrier_errors = 0;
    dst->tx_fifo_errors = 0;
    dst->tx_heartbeat_errors = 0;
    dst->tx_window_errors = 0;
/* Fetches stats for 'netdev' from the datapath vport layer into 'stats'.
 * NOTE(review): fragmentary view — the error returns and ofpbuf cleanup
 * are elided. */
get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
    struct dpif_netlink_vport reply;
    error = dpif_netlink_vport_get(netdev_get_name(netdev), &reply, &buf);
    } else if (!reply.stats) {
    /* Convert from the (possibly unaligned) wire format. */
    netdev_stats_from_ovs_vport_stats(stats, reply.stats);
1650 get_stats_via_vport(const struct netdev *netdev_,
1651 struct netdev_stats *stats)
1653 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1655 if (!netdev->vport_stats_error ||
1656 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1659 error = get_stats_via_vport__(netdev_, stats);
1660 if (error && error != ENOENT && error != ENODEV) {
1661 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1663 netdev_get_name(netdev_), ovs_strerror(error));
1665 netdev->vport_stats_error = error;
1666 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
1670 /* Retrieves current device stats for 'netdev-linux'. */
1672 netdev_linux_get_stats(const struct netdev *netdev_,
1673 struct netdev_stats *stats)
1675 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1676 struct netdev_stats dev_stats;
1679 ovs_mutex_lock(&netdev->mutex);
1680 get_stats_via_vport(netdev_, stats);
1681 error = get_stats_via_netlink(netdev_, &dev_stats);
1683 if (!netdev->vport_stats_error) {
1686 } else if (netdev->vport_stats_error) {
1687 /* stats not available from OVS then use netdev stats. */
1690 /* Use kernel netdev's packet and byte counts since vport's counters
1691 * do not reflect packet counts on the wire when GSO, TSO or GRO are
1693 stats->rx_packets = dev_stats.rx_packets;
1694 stats->rx_bytes = dev_stats.rx_bytes;
1695 stats->tx_packets = dev_stats.tx_packets;
1696 stats->tx_bytes = dev_stats.tx_bytes;
1698 stats->rx_errors += dev_stats.rx_errors;
1699 stats->tx_errors += dev_stats.tx_errors;
1700 stats->rx_dropped += dev_stats.rx_dropped;
1701 stats->tx_dropped += dev_stats.tx_dropped;
1702 stats->multicast += dev_stats.multicast;
1703 stats->collisions += dev_stats.collisions;
1704 stats->rx_length_errors += dev_stats.rx_length_errors;
1705 stats->rx_over_errors += dev_stats.rx_over_errors;
1706 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1707 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1708 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1709 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1710 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1711 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1712 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1713 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1714 stats->tx_window_errors += dev_stats.tx_window_errors;
1716 ovs_mutex_unlock(&netdev->mutex);
1721 /* Retrieves current device stats for 'netdev-tap' netdev or
1722 * netdev-internal. */
/* Retrieves stats for tap/internal devices.  Like netdev_linux_get_stats()
 * but with rx/tx swapped, since from the switch's perspective the host
 * side of a tap is reversed.
 * NOTE(review): fragmentary view — the first two branch bodies are
 * elided. */
netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct netdev_stats dev_stats;
    ovs_mutex_lock(&netdev->mutex);
    get_stats_via_vport(netdev_, stats);
    error = get_stats_via_netlink(netdev_, &dev_stats);
    if (!netdev->vport_stats_error) {
    } else if (netdev->vport_stats_error) {
        /* Transmit and receive stats will appear to be swapped relative to the
         * other ports since we are the one sending the data, not a remote
         * computer. For consistency, we swap them back here. This does not
         * apply if we are getting stats from the vport layer because it always
         * tracks stats from the perspective of the switch. */
        swap_uint64(&stats->rx_packets, &stats->tx_packets);
        swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
        swap_uint64(&stats->rx_errors, &stats->tx_errors);
        swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
        /* The kernel does not keep these counters for this device class;
         * zero them rather than leave stale values. */
        stats->rx_length_errors = 0;
        stats->rx_over_errors = 0;
        stats->rx_crc_errors = 0;
        stats->rx_frame_errors = 0;
        stats->rx_fifo_errors = 0;
        stats->rx_missed_errors = 0;
        stats->tx_aborted_errors = 0;
        stats->tx_carrier_errors = 0;
        stats->tx_fifo_errors = 0;
        stats->tx_heartbeat_errors = 0;
        stats->tx_window_errors = 0;
        /* Use kernel netdev's packet and byte counts since vport counters
         * do not reflect packet counts on the wire when GSO, TSO or GRO
         * are enabled. */
        stats->rx_packets = dev_stats.tx_packets;
        stats->rx_bytes = dev_stats.tx_bytes;
        stats->tx_packets = dev_stats.rx_packets;
        stats->tx_bytes = dev_stats.rx_bytes;
        /* rx/tx deliberately crossed — see the swap comment above. */
        stats->rx_dropped += dev_stats.tx_dropped;
        stats->tx_dropped += dev_stats.rx_dropped;
        stats->rx_errors += dev_stats.tx_errors;
        stats->tx_errors += dev_stats.rx_errors;
        stats->multicast += dev_stats.multicast;
        stats->collisions += dev_stats.collisions;
    ovs_mutex_unlock(&netdev->mutex);
/* Stats callback for 'internal' devices: vport-layer counters only. */
netdev_internal_get_stats(const struct netdev *netdev_,
                          struct netdev_stats *stats)
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    ovs_mutex_lock(&netdev->mutex);
    get_stats_via_vport(netdev_, stats);
    error = netdev->vport_stats_error;
    ovs_mutex_unlock(&netdev->mutex);
/* Reads link features via ETHTOOL_GSET and translates them into NETDEV_F_*
 * bitmaps cached on 'netdev' (supported, advertised, current), guarded by
 * VALID_FEATURES.  Caller must hold 'netdev->mutex'.
 * NOTE(review): fragmentary view — early returns and closing braces are
 * elided. */
netdev_linux_read_features(struct netdev_linux *netdev)
    struct ethtool_cmd ecmd;
    /* Already cached — nothing to do. */
    if (netdev->cache_valid & VALID_FEATURES) {
    COVERAGE_INC(netdev_get_ethtool);
    memset(&ecmd, 0, sizeof ecmd);
    error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
                                    ETHTOOL_GSET, "ETHTOOL_GSET");
    /* Supported features. */
    netdev->supported = 0;
    if (ecmd.supported & SUPPORTED_10baseT_Half) {
        netdev->supported |= NETDEV_F_10MB_HD;
    if (ecmd.supported & SUPPORTED_10baseT_Full) {
        netdev->supported |= NETDEV_F_10MB_FD;
    if (ecmd.supported & SUPPORTED_100baseT_Half) {
        netdev->supported |= NETDEV_F_100MB_HD;
    if (ecmd.supported & SUPPORTED_100baseT_Full) {
        netdev->supported |= NETDEV_F_100MB_FD;
    if (ecmd.supported & SUPPORTED_1000baseT_Half) {
        netdev->supported |= NETDEV_F_1GB_HD;
    if ((ecmd.supported & SUPPORTED_1000baseT_Full) ||
        (ecmd.supported & SUPPORTED_1000baseKX_Full)) {
        netdev->supported |= NETDEV_F_1GB_FD;
    if ((ecmd.supported & SUPPORTED_10000baseT_Full) ||
        (ecmd.supported & SUPPORTED_10000baseKX4_Full) ||
        (ecmd.supported & SUPPORTED_10000baseKR_Full) ||
        (ecmd.supported & SUPPORTED_10000baseR_FEC)) {
        netdev->supported |= NETDEV_F_10GB_FD;
    if ((ecmd.supported & SUPPORTED_40000baseKR4_Full) ||
        (ecmd.supported & SUPPORTED_40000baseCR4_Full) ||
        (ecmd.supported & SUPPORTED_40000baseSR4_Full) ||
        (ecmd.supported & SUPPORTED_40000baseLR4_Full)) {
        netdev->supported |= NETDEV_F_40GB_FD;
    if (ecmd.supported & SUPPORTED_TP) {
        netdev->supported |= NETDEV_F_COPPER;
    if (ecmd.supported & SUPPORTED_FIBRE) {
        netdev->supported |= NETDEV_F_FIBER;
    if (ecmd.supported & SUPPORTED_Autoneg) {
        netdev->supported |= NETDEV_F_AUTONEG;
    if (ecmd.supported & SUPPORTED_Pause) {
        netdev->supported |= NETDEV_F_PAUSE;
    if (ecmd.supported & SUPPORTED_Asym_Pause) {
        netdev->supported |= NETDEV_F_PAUSE_ASYM;
    /* Advertised features. */
    netdev->advertised = 0;
    if (ecmd.advertising & ADVERTISED_10baseT_Half) {
        netdev->advertised |= NETDEV_F_10MB_HD;
    if (ecmd.advertising & ADVERTISED_10baseT_Full) {
        netdev->advertised |= NETDEV_F_10MB_FD;
    if (ecmd.advertising & ADVERTISED_100baseT_Half) {
        netdev->advertised |= NETDEV_F_100MB_HD;
    if (ecmd.advertising & ADVERTISED_100baseT_Full) {
        netdev->advertised |= NETDEV_F_100MB_FD;
    if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
        netdev->advertised |= NETDEV_F_1GB_HD;
    if ((ecmd.advertising & ADVERTISED_1000baseT_Full) ||
        (ecmd.advertising & ADVERTISED_1000baseKX_Full)) {
        netdev->advertised |= NETDEV_F_1GB_FD;
    if ((ecmd.advertising & ADVERTISED_10000baseT_Full) ||
        (ecmd.advertising & ADVERTISED_10000baseKX4_Full) ||
        (ecmd.advertising & ADVERTISED_10000baseKR_Full) ||
        (ecmd.advertising & ADVERTISED_10000baseR_FEC)) {
        netdev->advertised |= NETDEV_F_10GB_FD;
    if ((ecmd.advertising & ADVERTISED_40000baseKR4_Full) ||
        (ecmd.advertising & ADVERTISED_40000baseCR4_Full) ||
        (ecmd.advertising & ADVERTISED_40000baseSR4_Full) ||
        (ecmd.advertising & ADVERTISED_40000baseLR4_Full)) {
        netdev->advertised |= NETDEV_F_40GB_FD;
    if (ecmd.advertising & ADVERTISED_TP) {
        netdev->advertised |= NETDEV_F_COPPER;
    if (ecmd.advertising & ADVERTISED_FIBRE) {
        netdev->advertised |= NETDEV_F_FIBER;
    if (ecmd.advertising & ADVERTISED_Autoneg) {
        netdev->advertised |= NETDEV_F_AUTONEG;
    if (ecmd.advertising & ADVERTISED_Pause) {
        netdev->advertised |= NETDEV_F_PAUSE;
    if (ecmd.advertising & ADVERTISED_Asym_Pause) {
        netdev->advertised |= NETDEV_F_PAUSE_ASYM;
    /* Current settings. */
    speed = ethtool_cmd_speed(&ecmd);
    if (speed == SPEED_10) {
        netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
    } else if (speed == SPEED_100) {
        netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
    } else if (speed == SPEED_1000) {
        netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
    } else if (speed == SPEED_10000) {
        netdev->current = NETDEV_F_10GB_FD;
    } else if (speed == 40000) {
        /* Numeric literals here — presumably because SPEED_40000 and
         * higher SPEED_* macros are missing from older kernel headers;
         * confirm before replacing with the macros. */
        netdev->current = NETDEV_F_40GB_FD;
    } else if (speed == 100000) {
        netdev->current = NETDEV_F_100GB_FD;
    } else if (speed == 1000000) {
        netdev->current = NETDEV_F_1TB_FD;
        netdev->current = 0;
    if (ecmd.port == PORT_TP) {
        netdev->current |= NETDEV_F_COPPER;
    } else if (ecmd.port == PORT_FIBRE) {
        netdev->current |= NETDEV_F_FIBER;
        netdev->current |= NETDEV_F_AUTONEG;
    /* Cache the result (including any error) for subsequent callers. */
    netdev->cache_valid |= VALID_FEATURES;
    netdev->get_features_error = error;
1950 /* Stores the features supported by 'netdev' into of '*current', '*advertised',
1951 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
1952 * Returns 0 if successful, otherwise a positive errno value. */
netdev_linux_get_features(const struct netdev *netdev_,
                          enum netdev_features *current,
                          enum netdev_features *advertised,
                          enum netdev_features *supported,
                          enum netdev_features *peer)
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    ovs_mutex_lock(&netdev->mutex);
    /* Populates the VALID_FEATURES cache if needed. */
    netdev_linux_read_features(netdev);
    if (!netdev->get_features_error) {
        *current = netdev->current;
        *advertised = netdev->advertised;
        *supported = netdev->supported;
        *peer = 0; /* XXX */
    error = netdev->get_features_error;
    ovs_mutex_unlock(&netdev->mutex);
1977 /* Set the features advertised by 'netdev' to 'advertise'. */
/* Translates NETDEV_F_* bits in 'advertise' to ethtool ADVERTISED_* bits
 * and writes them back with ETHTOOL_SSET (after an ETHTOOL_GSET read so
 * the other ethtool_cmd fields are preserved). */
netdev_linux_set_advertisements(struct netdev *netdev_,
                                enum netdev_features advertise)
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct ethtool_cmd ecmd;
    ovs_mutex_lock(&netdev->mutex);
    COVERAGE_INC(netdev_get_ethtool);
    memset(&ecmd, 0, sizeof ecmd);
    /* Read-modify-write: fetch current settings first. */
    error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
                                    ETHTOOL_GSET, "ETHTOOL_GSET");
    ecmd.advertising = 0;
    if (advertise & NETDEV_F_10MB_HD) {
        ecmd.advertising |= ADVERTISED_10baseT_Half;
    if (advertise & NETDEV_F_10MB_FD) {
        ecmd.advertising |= ADVERTISED_10baseT_Full;
    if (advertise & NETDEV_F_100MB_HD) {
        ecmd.advertising |= ADVERTISED_100baseT_Half;
    if (advertise & NETDEV_F_100MB_FD) {
        ecmd.advertising |= ADVERTISED_100baseT_Full;
    if (advertise & NETDEV_F_1GB_HD) {
        ecmd.advertising |= ADVERTISED_1000baseT_Half;
    if (advertise & NETDEV_F_1GB_FD) {
        ecmd.advertising |= ADVERTISED_1000baseT_Full;
    if (advertise & NETDEV_F_10GB_FD) {
        ecmd.advertising |= ADVERTISED_10000baseT_Full;
    if (advertise & NETDEV_F_COPPER) {
        ecmd.advertising |= ADVERTISED_TP;
    if (advertise & NETDEV_F_FIBER) {
        ecmd.advertising |= ADVERTISED_FIBRE;
    if (advertise & NETDEV_F_AUTONEG) {
        ecmd.advertising |= ADVERTISED_Autoneg;
    if (advertise & NETDEV_F_PAUSE) {
        ecmd.advertising |= ADVERTISED_Pause;
    if (advertise & NETDEV_F_PAUSE_ASYM) {
        ecmd.advertising |= ADVERTISED_Asym_Pause;
    COVERAGE_INC(netdev_set_ethtool);
    error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
                                    ETHTOOL_SSET, "ETHTOOL_SSET");
    ovs_mutex_unlock(&netdev->mutex);
2042 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
2043 * successful, otherwise a positive errno value. */
/* Applies ingress policing via tc: removes any existing ingress qdisc,
 * then (for a nonzero rate) re-adds it with a policer.  Results are cached
 * under VALID_POLICING so unchanged settings skip the tc round trips.
 * NOTE(review): fragmentary view — goto/exit labels between the visible
 * steps are elided. */
netdev_linux_set_policing(struct netdev *netdev_,
                          uint32_t kbits_rate, uint32_t kbits_burst)
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    const char *netdev_name = netdev_get_name(netdev_);
    kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
                   : !kbits_burst ? 8000 /* Default to 8000 kbits if 0. */
                   : kbits_burst); /* Stick with user-specified value. */
    ovs_mutex_lock(&netdev->mutex);
    if (netdev->cache_valid & VALID_POLICING) {
        error = netdev->netdev_policing_error;
        if (error || (netdev->kbits_rate == kbits_rate &&
                      netdev->kbits_burst == kbits_burst)) {
            /* Assume that settings haven't changed since we last set them. */
        netdev->cache_valid &= ~VALID_POLICING;
    COVERAGE_INC(netdev_set_policing);
    /* Remove any existing ingress qdisc. */
    error = tc_add_del_ingress_qdisc(netdev_, false);
        VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
                     netdev_name, ovs_strerror(error));
    error = tc_add_del_ingress_qdisc(netdev_, true);
        VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
                     netdev_name, ovs_strerror(error));
    error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
        VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
                     netdev_name, ovs_strerror(error));
    netdev->kbits_rate = kbits_rate;
    netdev->kbits_burst = kbits_burst;
    /* Cache the outcome (including ENODEV for vanished devices). */
    if (!error || error == ENODEV) {
        netdev->netdev_policing_error = error;
        netdev->cache_valid |= VALID_POLICING;
    ovs_mutex_unlock(&netdev->mutex);
/* Adds to 'types' the OVS name of every installable QoS discipline in the
 * 'tcs' table (skipping internal entries with an empty name). */
netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
    const struct tc_ops *const *opsp;
    for (opsp = tcs; *opsp != NULL; opsp++) {
        const struct tc_ops *ops = *opsp;
        if (ops->tc_install && ops->ovs_name[0] != '\0') {
            sset_add(types, ops->ovs_name);
2118 static const struct tc_ops *
2119 tc_lookup_ovs_name(const char *name)
2121 const struct tc_ops *const *opsp;
2123 for (opsp = tcs; *opsp != NULL; opsp++) {
2124 const struct tc_ops *ops = *opsp;
2125 if (!strcmp(name, ops->ovs_name)) {
/* Returns the tc_ops whose Linux (kernel) qdisc name equals 'name', or
 * NULL if none matches.  Entries without a linux_name are skipped. */
static const struct tc_ops *
tc_lookup_linux_name(const char *name)
    const struct tc_ops *const *opsp;
    for (opsp = tcs; *opsp != NULL; opsp++) {
        const struct tc_ops *ops = *opsp;
        if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Looks up queue 'queue_id' in 'netdev_''s tc queue hash map; 'hash' must
 * be hash_int(queue_id, 0).  Returns the queue or (via the elided tail)
 * NULL. */
static struct tc_queue *
tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct tc_queue *queue;
    HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
        if (queue->queue_id == queue_id) {
/* Convenience wrapper: looks up 'queue_id' with its canonical hash. */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
    return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Reports how many queues QoS type 'type' supports.  The elided branch
 * presumably rejects unknown types before 'ops' is dereferenced. */
netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
                                  struct netdev_qos_capabilities *caps)
    const struct tc_ops *ops = tc_lookup_ovs_name(type);
    caps->n_queues = ops->n_queues;
/* Reports the currently installed QoS type and its configuration. */
netdev_linux_get_qos(const struct netdev *netdev_,
                     const char **typep, struct smap *details)
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    ovs_mutex_lock(&netdev->mutex);
    /* Ensure netdev->tc reflects the kernel's current qdisc. */
    error = tc_query_qdisc(netdev_);
    *typep = netdev->tc->ops->ovs_name;
    /* qdisc_get is optional; types without parameters report empty
     * details. */
    error = (netdev->tc->ops->qdisc_get
             ? netdev->tc->ops->qdisc_get(netdev_, details)
    ovs_mutex_unlock(&netdev->mutex);
/* Installs QoS discipline 'type' with 'details' on 'netdev_', replacing
 * any existing qdisc of a different type, or reconfiguring in place when
 * the type is unchanged. */
netdev_linux_set_qos(struct netdev *netdev_,
                     const char *type, const struct smap *details)
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    const struct tc_ops *new_ops;
    new_ops = tc_lookup_ovs_name(type);
    /* Reject unknown or non-installable QoS types. */
    if (!new_ops || !new_ops->tc_install) {
    /* The "noop" type needs no kernel state, so skip the qdisc query. */
    if (new_ops == &tc_ops_noop) {
        return new_ops->tc_install(netdev_, details);
    ovs_mutex_lock(&netdev->mutex);
    error = tc_query_qdisc(netdev_);
    if (new_ops == netdev->tc->ops) {
        /* Same type: reconfigure in place if the type supports it. */
        error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
        /* Delete existing qdisc. */
        error = tc_del_qdisc(netdev_);
        ovs_assert(netdev->tc == NULL);
        /* Install new qdisc. */
        error = new_ops->tc_install(netdev_, details);
        ovs_assert((error == 0) == (netdev->tc != NULL));
    ovs_mutex_unlock(&netdev->mutex);
/* Fetches configuration of queue 'queue_id' into 'details' via the
 * installed tc type's class_get callback. */
netdev_linux_get_queue(const struct netdev *netdev_,
                       unsigned int queue_id, struct smap *details)
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    ovs_mutex_lock(&netdev->mutex);
    error = tc_query_qdisc(netdev_);
    struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
        ? netdev->tc->ops->class_get(netdev_, queue, details)
    ovs_mutex_unlock(&netdev->mutex);
/* Configures queue 'queue_id' from 'details', provided the id is within
 * the installed tc type's range and the type supports class_set. */
netdev_linux_set_queue(struct netdev *netdev_,
                       unsigned int queue_id, const struct smap *details)
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    ovs_mutex_lock(&netdev->mutex);
    error = tc_query_qdisc(netdev_);
    error = (queue_id < netdev->tc->ops->n_queues
             && netdev->tc->ops->class_set
             ? netdev->tc->ops->class_set(netdev_, queue_id, details)
    ovs_mutex_unlock(&netdev->mutex);
/* Deletes queue 'queue_id' via the installed tc type's class_delete
 * callback, if the type supports deletion and the queue exists. */
netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    ovs_mutex_lock(&netdev->mutex);
    error = tc_query_qdisc(netdev_);
    if (netdev->tc->ops->class_delete) {
        struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
            ? netdev->tc->ops->class_delete(netdev_, queue)
    ovs_mutex_unlock(&netdev->mutex);
/* Fetches statistics for queue 'queue_id' into 'stats' via the installed
 * tc type's class_get_stats callback. */
netdev_linux_get_queue_stats(const struct netdev *netdev_,
                             unsigned int queue_id,
                             struct netdev_queue_stats *stats)
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    ovs_mutex_lock(&netdev->mutex);
    error = tc_query_qdisc(netdev_);
    if (netdev->tc->ops->class_get_stats) {
        const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
        /* Creation time is tracked generically; the type callback fills
         * in the rest. */
        stats->created = queue->created;
        error = netdev->tc->ops->class_get_stats(netdev_, queue,
    ovs_mutex_unlock(&netdev->mutex);
/* State for iterating a netlink dump of tc classes (queues); see
 * start_queue_dump()/finish_queue_dump(). */
struct queue_dump_state {
    struct nl_dump dump;        /* In-progress NETLINK_ROUTE dump. */
/* Begins a RTM_GETTCLASS dump of 'netdev''s tc classes into 'state'.
 * The elided tail presumably reports whether the request could be built. */
start_queue_dump(const struct netdev *netdev, struct queue_dump_state *state)
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
    /* Dump all classes, not just children of a particular parent. */
    tcmsg->tcm_parent = 0;
    nl_dump_start(&state->dump, NETLINK_ROUTE, &request);
    ofpbuf_uninit(&request);
    ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
/* Releases 'state''s buffer and completes the netlink dump, returning its
 * final status. */
finish_queue_dump(struct queue_dump_state *state)
    ofpbuf_uninit(&state->buf);
    return nl_dump_done(&state->dump);
/* Iteration state for the queue dump callbacks below: a snapshot of queue
 * ids taken under the device mutex. */
struct netdev_linux_queue_state {
    unsigned int *queues;       /* Array of queue ids to visit. */
/* Queue-dump 'start' callback: snapshots the ids of all current queues
 * into a freshly allocated state object stored in '*statep'. */
netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
    const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    ovs_mutex_lock(&netdev->mutex);
    error = tc_query_qdisc(netdev_);
    if (netdev->tc->ops->class_get) {
        struct netdev_linux_queue_state *state;
        struct tc_queue *queue;
        *statep = state = xmalloc(sizeof *state);
        state->n_queues = hmap_count(&netdev->tc->queues);
        state->cur_queue = 0;
        state->queues = xmalloc(state->n_queues * sizeof *state->queues);
        /* Copy the ids so iteration does not hold the mutex. */
        HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
            state->queues[i++] = queue->queue_id;
    ovs_mutex_unlock(&netdev->mutex);
/* Queue-dump 'next' callback: advances through the snapshotted ids,
 * skipping queues deleted since the snapshot, and reports the next one's
 * configuration. */
netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
                             unsigned int *queue_idp, struct smap *details)
    const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct netdev_linux_queue_state *state = state_;
    ovs_mutex_lock(&netdev->mutex);
    while (state->cur_queue < state->n_queues) {
        unsigned int queue_id = state->queues[state->cur_queue++];
        /* The queue may have been deleted since dump_start(); skip it. */
        struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
        *queue_idp = queue_id;
        error = netdev->tc->ops->class_get(netdev_, queue, details);
    ovs_mutex_unlock(&netdev->mutex);
2428 netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
2431 struct netdev_linux_queue_state *state = state_;
2433 free(state->queues);
/* Dumps per-queue statistics by running a kernel tc class dump and invoking
 * 'cb' (via the tc class_dump_stats hook) for each message.  Serialized on
 * netdev->mutex. */
2439 netdev_linux_dump_queue_stats(const struct netdev *netdev_,
2440 netdev_dump_queue_stats_cb *cb, void *aux)
2442 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2445 ovs_mutex_lock(&netdev->mutex);
2446 error = tc_query_qdisc(netdev_);
2448 struct queue_dump_state state;
2450 if (!netdev->tc->ops->class_dump_stats) {
2452 } else if (!start_queue_dump(netdev_, &state)) {
2458 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
2459 retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
2466 retval = finish_queue_dump(&state);
2472 ovs_mutex_unlock(&netdev->mutex);
/* Assigns IPv4 'address'/'netmask' to 'netdev_' with SIOCSIFADDR, then
 * SIOCSIFNETMASK.  The netmask is only set for a non-INADDR_ANY address,
 * since setting a mask on 0.0.0.0 is meaningless. */
2478 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2479 struct in_addr netmask)
2481 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2484 ovs_mutex_lock(&netdev->mutex);
2485 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2487 if (address.s_addr != INADDR_ANY) {
2488 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2489 "SIOCSIFNETMASK", netmask);
2493 ovs_mutex_unlock(&netdev->mutex);
2498 /* Returns the list of addresses assigned to 'netdev_': on success stores
2499  * malloc'd arrays of addresses and masks in '*addr' and '*mask' and the
 * count in '*n_cnt', and returns 0; on failure returns a positive errno. */
2502 netdev_linux_get_addr_list(const struct netdev *netdev_,
2503 struct in6_addr **addr, struct in6_addr **mask, int *n_cnt)
2505 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2508 ovs_mutex_lock(&netdev->mutex);
2509 error = netdev_get_addrs(netdev_get_name(netdev_), addr, mask, n_cnt);
2510 ovs_mutex_unlock(&netdev->mutex);
/* Fills generic sockaddr 'sa' with an AF_INET address 'addr' (port 0). */
2516 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
2518 struct sockaddr_in sin;
2519 memset(&sin, 0, sizeof sin);
2520 sin.sin_family = AF_INET;
2521 sin.sin_addr = addr;
/* Zero first so padding beyond sizeof sin is deterministic. */
2524 memset(sa, 0, sizeof *sa);
2525 memcpy(sa, &sin, sizeof sin);
/* Issues address-setting ioctl 'ioctl_nr' ('ioctl_name' is for logging) on
 * 'netdev' with 'addr' as the argument. */
2529 do_set_addr(struct netdev *netdev,
2530 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2534 make_in4_sockaddr(&ifr.ifr_addr, addr);
2535 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2539 /* Adds 'router' as a default IP gateway. */
2541 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2543 struct in_addr any = { INADDR_ANY };
2547 memset(&rt, 0, sizeof rt);
/* Destination 0.0.0.0/0 via 'router': the default route. */
2548 make_in4_sockaddr(&rt.rt_dst, any);
2549 make_in4_sockaddr(&rt.rt_gateway, router);
2550 make_in4_sockaddr(&rt.rt_genmask, any);
2551 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2552 error = af_inet_ioctl(SIOCADDRT, &rt);
2554 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
/* Looks up the route to 'host' by scanning /proc/net/route.  On a match,
 * stores the gateway (or 0 if directly reachable) in '*next_hop' and the
 * malloc'd output device name in '*netdev_name'. */
2560 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2563 static const char fn[] = "/proc/net/route";
2568 *netdev_name = NULL;
2569 stream = fopen(fn, "r");
2570 if (stream == NULL) {
2571 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
2576 while (fgets(line, sizeof line, stream)) {
2579 ovs_be32 dest, gateway, mask;
2580 int refcnt, metric, mtu;
2581 unsigned int flags, use, window, irtt;
2584 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2586 iface, &dest, &gateway, &flags, &refcnt,
2587 &use, &metric, &mask, &mtu, &window, &irtt)) {
2588 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2592 if (!(flags & RTF_UP)) {
2593 /* Skip routes that aren't up. */
2597 /* The output of 'dest', 'mask', and 'gateway' were given in
2598 * network byte order, so we don't need any endian
2599 * conversions here. */
2600 if ((dest & mask) == (host->s_addr & mask)) {
2602 /* The host is directly reachable. */
2603 next_hop->s_addr = 0;
2605 /* To reach the host, we must go through a gateway. */
2606 next_hop->s_addr = gateway;
2608 *netdev_name = xstrdup(iface);
/* Populates 'smap' with driver name/version/firmware from the cached ethtool
 * ETHTOOL_GDRVINFO result, querying the kernel on first use. */
2620 netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
2622 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2625 ovs_mutex_lock(&netdev->mutex);
2626 if (!(netdev->cache_valid & VALID_DRVINFO)) {
2627 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
2629 COVERAGE_INC(netdev_get_ethtool);
2630 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2631 error = netdev_linux_do_ethtool(netdev->up.name,
2634 "ETHTOOL_GDRVINFO");
2636 netdev->cache_valid |= VALID_DRVINFO;
2641 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2642 smap_add(smap, "driver_version", netdev->drvinfo.version);
2643 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
2645 ovs_mutex_unlock(&netdev->mutex);
/* get_status for internal devices: always reports the OVS pseudo-driver. */
2651 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2654 smap_add(smap, "driver_name", "openvswitch");
2658 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2659  * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2660  * returns 0. Otherwise, it returns a positive errno value; in particular,
2661  * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
2663 netdev_linux_arp_lookup(const struct netdev *netdev,
2664 ovs_be32 ip, struct eth_addr *mac)
2667 struct sockaddr_in sin;
2670 memset(&r, 0, sizeof r);
2671 memset(&sin, 0, sizeof sin);
2672 sin.sin_family = AF_INET;
2673 sin.sin_addr.s_addr = ip;
2675 memcpy(&r.arp_pa, &sin, sizeof sin);
2676 r.arp_ha.sa_family = ARPHRD_ETHER;
2678 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2679 COVERAGE_INC(netdev_arp_lookup);
2680 retval = af_inet_ioctl(SIOCGARP, &r);
2682 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
/* ENXIO just means "no entry", so only warn on other failures. */
2683 } else if (retval != ENXIO) {
2684 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2685 netdev_get_name(netdev), IP_ARGS(ip),
2686 ovs_strerror(retval));
/* Translates netdev_flags 'nd' to the kernel's IFF_* bit representation. */
2692 nd_to_iff_flags(enum netdev_flags nd)
2695 if (nd & NETDEV_UP) {
2698 if (nd & NETDEV_PROMISC) {
2701 if (nd & NETDEV_LOOPBACK) {
2702 iff |= IFF_LOOPBACK;
/* Inverse of nd_to_iff_flags(): kernel IFF_* bits to netdev_flags. */
2708 iff_to_nd_flags(int iff)
2710 enum netdev_flags nd = 0;
2714 if (iff & IFF_PROMISC) {
2715 nd |= NETDEV_PROMISC;
2717 if (iff & IFF_LOOPBACK) {
2718 nd |= NETDEV_LOOPBACK;
/* Clears 'off' and sets 'on' in the device flags, reporting the previous
 * flags in '*old_flagsp'.  Re-reads the kernel flags afterwards so the
 * cached ifi_flags reflect what the kernel actually accepted. */
2724 update_flags(struct netdev_linux *netdev, enum netdev_flags off,
2725 enum netdev_flags on, enum netdev_flags *old_flagsp)
2726 OVS_REQUIRES(netdev->mutex)
2728 int old_flags, new_flags;
2731 old_flags = netdev->ifi_flags;
2732 *old_flagsp = iff_to_nd_flags(old_flags);
2733 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
/* Only touch the kernel when something would actually change. */
2734 if (new_flags != old_flags) {
2735 error = set_flags(netdev_get_name(&netdev->up), new_flags);
2736 get_flags(&netdev->up, &netdev->ifi_flags);
/* Mutex-taking wrapper around update_flags() for the netdev_class table. */
2743 netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2744 enum netdev_flags on, enum netdev_flags *old_flagsp)
2746 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2749 ovs_mutex_lock(&netdev->mutex);
2750 error = update_flags(netdev, off, on, old_flagsp);
2751 ovs_mutex_unlock(&netdev->mutex);
/* Template for the netdev_class vtables below.  The variable slots are the
 * type NAME and the construct/get_stats/get_features/get_status hooks; all
 * other operations are shared by the linux, tap, and internal classes.
 * (No comments are added inside the macro: every line is a '\' continuation.) */
2756 #define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS, \
2757 GET_FEATURES, GET_STATUS) \
2760 false, /* is_pmd */ \
2764 netdev_linux_wait, \
2766 netdev_linux_alloc, \
2768 netdev_linux_destruct, \
2769 netdev_linux_dealloc, \
2770 NULL, /* get_config */ \
2771 NULL, /* set_config */ \
2772 NULL, /* get_tunnel_config */ \
2773 NULL, /* build header */ \
2774 NULL, /* push header */ \
2775 NULL, /* pop header */ \
2776 NULL, /* get_numa_id */ \
2777 NULL, /* set_tx_multiq */ \
2779 netdev_linux_send, \
2780 netdev_linux_send_wait, \
2782 netdev_linux_set_etheraddr, \
2783 netdev_linux_get_etheraddr, \
2784 netdev_linux_get_mtu, \
2785 netdev_linux_set_mtu, \
2786 netdev_linux_get_ifindex, \
2787 netdev_linux_get_carrier, \
2788 netdev_linux_get_carrier_resets, \
2789 netdev_linux_set_miimon_interval, \
2793 netdev_linux_set_advertisements, \
2795 netdev_linux_set_policing, \
2796 netdev_linux_get_qos_types, \
2797 netdev_linux_get_qos_capabilities, \
2798 netdev_linux_get_qos, \
2799 netdev_linux_set_qos, \
2800 netdev_linux_get_queue, \
2801 netdev_linux_set_queue, \
2802 netdev_linux_delete_queue, \
2803 netdev_linux_get_queue_stats, \
2804 netdev_linux_queue_dump_start, \
2805 netdev_linux_queue_dump_next, \
2806 netdev_linux_queue_dump_done, \
2807 netdev_linux_dump_queue_stats, \
2809 netdev_linux_set_in4, \
2810 netdev_linux_get_addr_list, \
2811 netdev_linux_add_router, \
2812 netdev_linux_get_next_hop, \
2814 netdev_linux_arp_lookup, \
2816 netdev_linux_update_flags, \
2817 NULL, /* reconfigure */ \
2819 netdev_linux_rxq_alloc, \
2820 netdev_linux_rxq_construct, \
2821 netdev_linux_rxq_destruct, \
2822 netdev_linux_rxq_dealloc, \
2823 netdev_linux_rxq_recv, \
2824 netdev_linux_rxq_wait, \
2825 netdev_linux_rxq_drain, \
2828 const struct netdev_class netdev_linux_class =
2831 netdev_linux_construct,
2832 netdev_linux_get_stats,
2833 netdev_linux_get_features,
2834 netdev_linux_get_status);
2836 const struct netdev_class netdev_tap_class =
2839 netdev_linux_construct_tap,
2840 netdev_tap_get_stats,
2841 netdev_linux_get_features,
2842 netdev_linux_get_status);
2844 const struct netdev_class netdev_internal_class =
2847 netdev_linux_construct,
2848 netdev_internal_get_stats,
2849 NULL, /* get_features */
2850 netdev_internal_get_status);
/* CoDel qdisc has no classes, hence zero queues. */
2853 #define CODEL_N_QUEUES 0x0000
2855 /* In sufficiently new kernel headers these are defined as enums in
2856 * <linux/pkt_sched.h>. Define them here as macros to help out with older
2857 * kernels. (This overrides any enum definition in the header file but that's
2859 #define TCA_CODEL_TARGET 1
2860 #define TCA_CODEL_LIMIT 2
2861 #define TCA_CODEL_INTERVAL 3
/* Downcasts the device's generic tc state to the CoDel-specific struct;
 * valid only while tc_ops_codel is installed on the device. */
2870 static struct codel *
2871 codel_get__(const struct netdev *netdev_)
2873 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2874 return CONTAINER_OF(netdev->tc, struct codel, tc);
/* Records CoDel parameters in a freshly allocated struct codel and installs
 * it as the device's tc state (does not touch the kernel). */
2878 codel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
2881 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2882 struct codel *codel;
2884 codel = xmalloc(sizeof *codel);
2885 tc_init(&codel->tc, &tc_ops_codel);
2886 codel->target = target;
2887 codel->limit = limit;
2888 codel->interval = interval;
2890 netdev->tc = &codel->tc;
/* Replaces the device's root qdisc with "codel".  Zero parameters fall back
 * to defaults: target 5000 us, limit 10240 packets, interval 100000 us. */
2894 codel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
2898 struct ofpbuf request;
2899 struct tcmsg *tcmsg;
2900 uint32_t otarget, olimit, ointerval;
2903 tc_del_qdisc(netdev);
2905 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2906 NLM_F_EXCL | NLM_F_CREATE, &request);
2910 tcmsg->tcm_handle = tc_make_handle(1, 0);
2911 tcmsg->tcm_parent = TC_H_ROOT;
2913 otarget = target ? target : 5000;
2914 olimit = limit ? limit : 10240;
2915 ointerval = interval ? interval : 100000;
2917 nl_msg_put_string(&request, TCA_KIND, "codel");
2918 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2919 nl_msg_put_u32(&request, TCA_CODEL_TARGET, otarget);
2920 nl_msg_put_u32(&request, TCA_CODEL_LIMIT, olimit);
2921 nl_msg_put_u32(&request, TCA_CODEL_INTERVAL, ointerval);
2922 nl_msg_end_nested(&request, opt_offset);
2924 error = tc_transact(&request, NULL);
2926 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
2927 "target %u, limit %u, interval %u error %d(%s)",
2928 netdev_get_name(netdev),
2929 otarget, olimit, ointerval,
2930 error, ovs_strerror(error));
/* Parses "target"/"limit"/"interval" from QoS 'details' into 'codel',
 * substituting the CoDel defaults (5000/10240/100000) for absent or zero
 * values. */
2936 codel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
2937 const struct smap *details, struct codel *codel)
2939 const char *target_s;
2940 const char *limit_s;
2941 const char *interval_s;
2943 target_s = smap_get(details, "target");
2944 limit_s = smap_get(details, "limit");
2945 interval_s = smap_get(details, "interval");
2947 codel->target = target_s ? strtoull(target_s, NULL, 10) : 0;
2948 codel->limit = limit_s ? strtoull(limit_s, NULL, 10) : 0;
2949 codel->interval = interval_s ? strtoull(interval_s, NULL, 10) : 0;
2951 if (!codel->target) {
2952 codel->target = 5000;
2954 if (!codel->limit) {
2955 codel->limit = 10240;
2957 if (!codel->interval) {
2958 codel->interval = 100000;
/* tc_ops install hook: configures the kernel qdisc, then mirrors the
 * parameters into local tc state on success. */
2963 codel_tc_install(struct netdev *netdev, const struct smap *details)
2968 codel_parse_qdisc_details__(netdev, details, &codel);
2969 error = codel_setup_qdisc__(netdev, codel.target, codel.limit,
2972 codel_install__(netdev, codel.target, codel.limit, codel.interval);
/* Extracts target/limit/interval from the TCA_OPTIONS attribute of a kernel
 * CoDel qdisc message. */
2978 codel_parse_tca_options__(struct nlattr *nl_options, struct codel *codel)
2980 static const struct nl_policy tca_codel_policy[] = {
2981 [TCA_CODEL_TARGET] = { .type = NL_A_U32 },
2982 [TCA_CODEL_LIMIT] = { .type = NL_A_U32 },
2983 [TCA_CODEL_INTERVAL] = { .type = NL_A_U32 }
2986 struct nlattr *attrs[ARRAY_SIZE(tca_codel_policy)];
2988 if (!nl_parse_nested(nl_options, tca_codel_policy,
2989 attrs, ARRAY_SIZE(tca_codel_policy))) {
2990 VLOG_WARN_RL(&rl, "failed to parse CoDel class options");
2994 codel->target = nl_attr_get_u32(attrs[TCA_CODEL_TARGET]);
2995 codel->limit = nl_attr_get_u32(attrs[TCA_CODEL_LIMIT]);
2996 codel->interval = nl_attr_get_u32(attrs[TCA_CODEL_INTERVAL]);
/* tc_ops load hook: adopts an already-installed kernel CoDel qdisc by
 * parsing its parameters out of 'nlmsg'. */
3001 codel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3003 struct nlattr *nlattr;
3008 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3013 error = codel_parse_tca_options__(nlattr, &codel);
3018 codel_install__(netdev, codel.target, codel.limit, codel.interval);
/* Frees the CoDel state allocated by codel_install__(). */
3024 codel_tc_destroy(struct tc *tc)
3026 struct codel *codel = CONTAINER_OF(tc, struct codel, tc);
/* Reports the cached CoDel parameters into 'details'. */
3032 codel_qdisc_get(const struct netdev *netdev, struct smap *details)
3034 const struct codel *codel = codel_get__(netdev);
3035 smap_add_format(details, "target", "%u", codel->target);
3036 smap_add_format(details, "limit", "%u", codel->limit);
3037 smap_add_format(details, "interval", "%u", codel->interval);
/* Updates the cached CoDel parameters from 'details' (local state only). */
3042 codel_qdisc_set(struct netdev *netdev, const struct smap *details)
3046 codel_parse_qdisc_details__(netdev, details, &codel);
3047 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3048 codel_get__(netdev)->target = codel.target;
3049 codel_get__(netdev)->limit = codel.limit;
3050 codel_get__(netdev)->interval = codel.interval;
/* tc_ops vtable binding OVS QoS type "linux-codel" to kernel qdisc "codel". */
3054 static const struct tc_ops tc_ops_codel = {
3055 "codel", /* linux_name */
3056 "linux-codel", /* ovs_name */
3057 CODEL_N_QUEUES, /* n_queues */
3070 /* FQ-CoDel traffic control class. */
/* FQ-CoDel is classless from OVS's perspective, hence zero queues. */
3072 #define FQCODEL_N_QUEUES 0x0000
3074 /* In sufficiently new kernel headers these are defined as enums in
3075 * <linux/pkt_sched.h>. Define them here as macros to help out with older
3076 * kernels. (This overrides any enum definition in the header file but that's
3078 #define TCA_FQ_CODEL_TARGET 1
3079 #define TCA_FQ_CODEL_LIMIT 2
3080 #define TCA_FQ_CODEL_INTERVAL 3
3081 #define TCA_FQ_CODEL_ECN 4
3082 #define TCA_FQ_CODEL_FLOWS 5
3083 #define TCA_FQ_CODEL_QUANTUM 6
/* Downcasts the device's generic tc state to the FQ-CoDel struct; valid only
 * while tc_ops_fqcodel is installed. */
3094 static struct fqcodel *
3095 fqcodel_get__(const struct netdev *netdev_)
3097 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3098 return CONTAINER_OF(netdev->tc, struct fqcodel, tc);
/* Records FQ-CoDel parameters in freshly allocated local tc state (does not
 * touch the kernel). */
3102 fqcodel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
3103 uint32_t interval, uint32_t flows, uint32_t quantum)
3105 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3106 struct fqcodel *fqcodel;
3108 fqcodel = xmalloc(sizeof *fqcodel);
3109 tc_init(&fqcodel->tc, &tc_ops_fqcodel);
3110 fqcodel->target = target;
3111 fqcodel->limit = limit;
3112 fqcodel->interval = interval;
3113 fqcodel->flows = flows;
3114 fqcodel->quantum = quantum;
3116 netdev->tc = &fqcodel->tc;
/* Replaces the device's root qdisc with "fq_codel".  Zero parameters fall
 * back to defaults (target 5000, limit 10240, interval 100000, flows 1024,
 * quantum 1514). */
3120 fqcodel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
3121 uint32_t interval, uint32_t flows, uint32_t quantum)
3124 struct ofpbuf request;
3125 struct tcmsg *tcmsg;
3126 uint32_t otarget, olimit, ointerval, oflows, oquantum;
3129 tc_del_qdisc(netdev);
3131 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3132 NLM_F_EXCL | NLM_F_CREATE, &request);
3136 tcmsg->tcm_handle = tc_make_handle(1, 0);
3137 tcmsg->tcm_parent = TC_H_ROOT;
3139 otarget = target ? target : 5000;
3140 olimit = limit ? limit : 10240;
3141 ointerval = interval ? interval : 100000;
3142 oflows = flows ? flows : 1024;
3143 oquantum = quantum ? quantum : 1514; /* fq_codel default quantum is 1514
3146 nl_msg_put_string(&request, TCA_KIND, "fq_codel");
3147 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3148 nl_msg_put_u32(&request, TCA_FQ_CODEL_TARGET, otarget);
3149 nl_msg_put_u32(&request, TCA_FQ_CODEL_LIMIT, olimit);
3150 nl_msg_put_u32(&request, TCA_FQ_CODEL_INTERVAL, ointerval);
3151 nl_msg_put_u32(&request, TCA_FQ_CODEL_FLOWS, oflows);
3152 nl_msg_put_u32(&request, TCA_FQ_CODEL_QUANTUM, oquantum);
3153 nl_msg_end_nested(&request, opt_offset);
3155 error = tc_transact(&request, NULL);
3157 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3158 "target %u, limit %u, interval %u, flows %u, quantum %u error %d(%s)",
3159 netdev_get_name(netdev),
3160 otarget, olimit, ointerval, oflows, oquantum,
3161 error, ovs_strerror(error));
/* Parses FQ-CoDel parameters from QoS 'details', substituting defaults for
 * absent or zero values. */
3167 fqcodel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
3168 const struct smap *details, struct fqcodel *fqcodel)
3170 const char *target_s;
3171 const char *limit_s;
3172 const char *interval_s;
3173 const char *flows_s;
3174 const char *quantum_s;
3176 target_s = smap_get(details, "target");
3177 limit_s = smap_get(details, "limit");
3178 interval_s = smap_get(details, "interval");
3179 flows_s = smap_get(details, "flows");
3180 quantum_s = smap_get(details, "quantum");
3181 fqcodel->target = target_s ? strtoull(target_s, NULL, 10) : 0;
3182 fqcodel->limit = limit_s ? strtoull(limit_s, NULL, 10) : 0;
3183 fqcodel->interval = interval_s ? strtoull(interval_s, NULL, 10) : 0;
3184 fqcodel->flows = flows_s ? strtoull(flows_s, NULL, 10) : 0;
3185 fqcodel->quantum = quantum_s ? strtoull(quantum_s, NULL, 10) : 0;
3186 if (!fqcodel->target) {
3187 fqcodel->target = 5000;
3189 if (!fqcodel->limit) {
3190 fqcodel->limit = 10240;
3192 if (!fqcodel->interval) {
/* NOTE(review): default interval here is 1000000 while
 * fqcodel_setup_qdisc__() (and CoDel) use 100000 — looks like a 10x
 * inconsistency; confirm intent before changing. */
3193 fqcodel->interval = 1000000;
3195 if (!fqcodel->flows) {
3196 fqcodel->flows = 1024;
3198 if (!fqcodel->quantum) {
3199 fqcodel->quantum = 1514;
/* tc_ops install hook: configures the kernel fq_codel qdisc, then mirrors
 * the parameters into local tc state on success. */
3204 fqcodel_tc_install(struct netdev *netdev, const struct smap *details)
3207 struct fqcodel fqcodel;
3209 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3210 error = fqcodel_setup_qdisc__(netdev, fqcodel.target, fqcodel.limit,
3211 fqcodel.interval, fqcodel.flows,
3214 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit,
3215 fqcodel.interval, fqcodel.flows, fqcodel.quantum);
/* Extracts FQ-CoDel parameters from a kernel TCA_OPTIONS attribute. */
3221 fqcodel_parse_tca_options__(struct nlattr *nl_options, struct fqcodel *fqcodel)
3223 static const struct nl_policy tca_fqcodel_policy[] = {
3224 [TCA_FQ_CODEL_TARGET] = { .type = NL_A_U32 },
3225 [TCA_FQ_CODEL_LIMIT] = { .type = NL_A_U32 },
3226 [TCA_FQ_CODEL_INTERVAL] = { .type = NL_A_U32 },
3227 [TCA_FQ_CODEL_FLOWS] = { .type = NL_A_U32 },
3228 [TCA_FQ_CODEL_QUANTUM] = { .type = NL_A_U32 }
3231 struct nlattr *attrs[ARRAY_SIZE(tca_fqcodel_policy)];
3233 if (!nl_parse_nested(nl_options, tca_fqcodel_policy,
3234 attrs, ARRAY_SIZE(tca_fqcodel_policy))) {
3235 VLOG_WARN_RL(&rl, "failed to parse FQ_CoDel class options");
3239 fqcodel->target = nl_attr_get_u32(attrs[TCA_FQ_CODEL_TARGET]);
3240 fqcodel->limit = nl_attr_get_u32(attrs[TCA_FQ_CODEL_LIMIT]);
3241 fqcodel->interval =nl_attr_get_u32(attrs[TCA_FQ_CODEL_INTERVAL]);
3242 fqcodel->flows = nl_attr_get_u32(attrs[TCA_FQ_CODEL_FLOWS]);
3243 fqcodel->quantum = nl_attr_get_u32(attrs[TCA_FQ_CODEL_QUANTUM]);
/* tc_ops load hook: adopts an already-installed kernel fq_codel qdisc. */
3248 fqcodel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3250 struct nlattr *nlattr;
3253 struct fqcodel fqcodel;
3255 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3260 error = fqcodel_parse_tca_options__(nlattr, &fqcodel);
3265 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3266 fqcodel.flows, fqcodel.quantum);
/* Frees the FQ-CoDel state allocated by fqcodel_install__(). */
3271 fqcodel_tc_destroy(struct tc *tc)
3273 struct fqcodel *fqcodel = CONTAINER_OF(tc, struct fqcodel, tc);
/* Reports the cached FQ-CoDel parameters into 'details'. */
3279 fqcodel_qdisc_get(const struct netdev *netdev, struct smap *details)
3281 const struct fqcodel *fqcodel = fqcodel_get__(netdev);
3282 smap_add_format(details, "target", "%u", fqcodel->target);
3283 smap_add_format(details, "limit", "%u", fqcodel->limit);
3284 smap_add_format(details, "interval", "%u", fqcodel->interval);
3285 smap_add_format(details, "flows", "%u", fqcodel->flows);
3286 smap_add_format(details, "quantum", "%u", fqcodel->quantum);
/* Updates the cached FQ-CoDel parameters from 'details' (local state only). */
3291 fqcodel_qdisc_set(struct netdev *netdev, const struct smap *details)
3293 struct fqcodel fqcodel;
3295 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3296 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3297 fqcodel.flows, fqcodel.quantum);
3298 fqcodel_get__(netdev)->target = fqcodel.target;
3299 fqcodel_get__(netdev)->limit = fqcodel.limit;
3300 fqcodel_get__(netdev)->interval = fqcodel.interval;
3301 fqcodel_get__(netdev)->flows = fqcodel.flows;
3302 fqcodel_get__(netdev)->quantum = fqcodel.quantum;
/* tc_ops vtable binding OVS "linux-fq_codel" to kernel qdisc "fq_codel". */
3306 static const struct tc_ops tc_ops_fqcodel = {
3307 "fq_codel", /* linux_name */
3308 "linux-fq_codel", /* ovs_name */
3309 FQCODEL_N_QUEUES, /* n_queues */
3322 /* SFQ traffic control class. */
/* SFQ is classless from OVS's perspective, hence zero queues. */
3324 #define SFQ_N_QUEUES 0x0000
/* Downcasts the device's generic tc state to the SFQ struct; valid only
 * while tc_ops_sfq is installed. */
3333 sfq_get__(const struct netdev *netdev_)
3335 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3336 return CONTAINER_OF(netdev->tc, struct sfq, tc);
/* Records SFQ 'quantum' and 'perturb' in freshly allocated local tc state
 * (does not touch the kernel).  Note the parameter order: quantum first. */
3340 sfq_install__(struct netdev *netdev_, uint32_t quantum, uint32_t perturb)
3342 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3345 sfq = xmalloc(sizeof *sfq);
3346 tc_init(&sfq->tc, &tc_ops_sfq);
3347 sfq->perturb = perturb;
3348 sfq->quantum = quantum;
3350 netdev->tc = &sfq->tc;
/* Replaces the device's root qdisc with "sfq".  A zero 'quantum' falls back
 * to the device MTU; a zero 'perturb' falls back to 10 seconds. */
3354 sfq_setup_qdisc__(struct netdev *netdev, uint32_t quantum, uint32_t perturb)
3356 struct tc_sfq_qopt opt;
3357 struct ofpbuf request;
3358 struct tcmsg *tcmsg;
3360 int mtu_error, error;
3361 mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3363 tc_del_qdisc(netdev);
3365 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3366 NLM_F_EXCL | NLM_F_CREATE, &request);
3370 tcmsg->tcm_handle = tc_make_handle(1, 0);
3371 tcmsg->tcm_parent = TC_H_ROOT;
3373 memset(&opt, 0, sizeof opt);
3376 opt.quantum = mtu; /* if we cannot find mtu, use default */
3379 opt.quantum = quantum;
3383 opt.perturb_period = 10;
3385 opt.perturb_period = perturb;
/* SFQ options travel as a raw struct tc_sfq_qopt, not nested attributes. */
3388 nl_msg_put_string(&request, TCA_KIND, "sfq");
3389 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3391 error = tc_transact(&request, NULL);
3393 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3394 "quantum %u, perturb %u error %d(%s)",
3395 netdev_get_name(netdev),
3396 opt.quantum, opt.perturb_period,
3397 error, ovs_strerror(error));
/* Parses "quantum"/"perturb" from QoS 'details' into 'sfq'; a missing
 * quantum defaults to the device MTU (warns if the MTU is unavailable). */
3403 sfq_parse_qdisc_details__(struct netdev *netdev,
3404 const struct smap *details, struct sfq *sfq)
3406 const char *perturb_s;
3407 const char *quantum_s;
3411 perturb_s = smap_get(details, "perturb");
3412 quantum_s = smap_get(details, "quantum");
3413 sfq->perturb = perturb_s ? strtoull(perturb_s, NULL, 10) : 0;
3414 sfq->quantum = quantum_s ? strtoull(quantum_s, NULL, 10) : 0;
3415 if (!sfq->perturb) {
3419 if (!sfq->quantum) {
3420 mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3424 VLOG_WARN_RL(&rl, "when using SFQ, you must specify quantum on a "
3425 "device without mtu");
/* tc_ops install hook: configures the kernel sfq qdisc, then mirrors the
 * parameters into local tc state on success. */
3432 sfq_tc_install(struct netdev *netdev, const struct smap *details)
3437 sfq_parse_qdisc_details__(netdev, details, &sfq);
3438 error = sfq_setup_qdisc__(netdev, sfq.quantum, sfq.perturb);
3440 sfq_install__(netdev, sfq.quantum, sfq.perturb);
/* tc_ops load hook: adopts an already-installed kernel sfq qdisc by reading
 * its struct tc_sfq_qopt out of 'nlmsg'.
 *
 * Bug fix: the original passed 'sfq->perturb_period' as the quantum and
 * 'sfq->quantum' as the perturb period, so after loading an existing qdisc
 * the two values were swapped in the cached state (and thus reported swapped
 * by sfq_qdisc_get()).  sfq_install__()'s signature is
 * (netdev, quantum, perturb). */
3446 sfq_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3448 const struct tc_sfq_qopt *sfq;
3449 struct nlattr *nlattr;
3453 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3455 sfq = nl_attr_get(nlattr);
3456 sfq_install__(netdev, sfq->quantum, sfq->perturb_period);
/* Frees the SFQ state allocated by sfq_install__(). */
3464 sfq_tc_destroy(struct tc *tc)
3466 struct sfq *sfq = CONTAINER_OF(tc, struct sfq, tc);
/* Reports the cached SFQ parameters into 'details'. */
3472 sfq_qdisc_get(const struct netdev *netdev, struct smap *details)
3474 const struct sfq *sfq = sfq_get__(netdev);
3475 smap_add_format(details, "quantum", "%u", sfq->quantum);
3476 smap_add_format(details, "perturb", "%u", sfq->perturb);
/* Updates the cached SFQ parameters from 'details' (local state only). */
3481 sfq_qdisc_set(struct netdev *netdev, const struct smap *details)
3485 sfq_parse_qdisc_details__(netdev, details, &sfq);
3486 sfq_install__(netdev, sfq.quantum, sfq.perturb);
3487 sfq_get__(netdev)->quantum = sfq.quantum;
3488 sfq_get__(netdev)->perturb = sfq.perturb;
/* tc_ops vtable binding OVS "linux-sfq" to kernel qdisc "sfq". */
3492 static const struct tc_ops tc_ops_sfq = {
3493 "sfq", /* linux_name */
3494 "linux-sfq", /* ovs_name */
3495 SFQ_N_QUEUES, /* n_queues */
3508 /* HTB traffic control class. */
/* HTB supports up to 0xf000 OVS queues (class minor IDs 1..0xf000). */
3510 #define HTB_N_QUEUES 0xf000
/* Kernel r2q divisor: quantum = rate / HTB_RATE2QUANTUM unless overridden. */
3511 #define HTB_RATE2QUANTUM 10
3515 unsigned int max_rate; /* In bytes/s. */
/* Per-queue HTB class configuration, embedded in the generic tc_queue. */
3519 struct tc_queue tc_queue;
3520 unsigned int min_rate; /* In bytes/s. */
3521 unsigned int max_rate; /* In bytes/s. */
3522 unsigned int burst; /* In bytes. */
3523 unsigned int priority; /* Lower values are higher priorities. */
/* Downcasts the device's generic tc state to the HTB struct; valid only
 * while tc_ops_htb is installed. */
3527 htb_get__(const struct netdev *netdev_)
3529 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3530 return CONTAINER_OF(netdev->tc, struct htb, tc);
/* Records the HTB root 'max_rate' in freshly allocated local tc state
 * (does not touch the kernel). */
3534 htb_install__(struct netdev *netdev_, uint64_t max_rate)
3536 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3539 htb = xmalloc(sizeof *htb);
3540 tc_init(&htb->tc, &tc_ops_htb);
3541 htb->max_rate = max_rate;
3543 netdev->tc = &htb->tc;
3546 /* Create an HTB qdisc.
3548 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
3550 htb_setup_qdisc__(struct netdev *netdev)
3553 struct tc_htb_glob opt;
3554 struct ofpbuf request;
3555 struct tcmsg *tcmsg;
3557 tc_del_qdisc(netdev);
3559 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3560 NLM_F_EXCL | NLM_F_CREATE, &request);
3564 tcmsg->tcm_handle = tc_make_handle(1, 0);
3565 tcmsg->tcm_parent = TC_H_ROOT;
3567 nl_msg_put_string(&request, TCA_KIND, "htb");
3569 memset(&opt, 0, sizeof opt);
3570 opt.rate2quantum = HTB_RATE2QUANTUM;
3574 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3575 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
3576 nl_msg_end_nested(&request, opt_offset);
3578 return tc_transact(&request, NULL);
3581 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
3582 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
3584 htb_setup_class__(struct netdev *netdev, unsigned int handle,
3585 unsigned int parent, struct htb_class *class)
3588 struct tc_htb_opt opt;
3589 struct ofpbuf request;
3590 struct tcmsg *tcmsg;
/* The MTU is needed to size the rate tables; fail early without it. */
3594 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3596 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
3597 netdev_get_name(netdev));
3601 memset(&opt, 0, sizeof opt);
3602 tc_fill_rate(&opt.rate, class->min_rate, mtu);
3603 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
3604 /* Makes sure the quantum is at least MTU. Setting quantum will
3605 * make htb ignore the r2q for this class. */
3606 if ((class->min_rate / HTB_RATE2QUANTUM) < mtu) {
3609 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
3610 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
3611 opt.prio = class->priority;
3613 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3617 tcmsg->tcm_handle = handle;
3618 tcmsg->tcm_parent = parent;
3620 nl_msg_put_string(&request, TCA_KIND, "htb");
3621 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3622 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
/* Kernel HTB also expects explicit rate/ceil lookup tables. */
3623 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
3624 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
3625 nl_msg_end_nested(&request, opt_offset);
3627 error = tc_transact(&request, NULL);
3629 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3630 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
3631 netdev_get_name(netdev),
3632 tc_get_major(handle), tc_get_minor(handle),
3633 tc_get_major(parent), tc_get_minor(parent),
3634 class->min_rate, class->max_rate,
3635 class->burst, class->priority, ovs_strerror(error));
3640 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
3641 * description of them into 'details'. The description complies with the
3642 * specification given in the vswitch database documentation for linux-htb
3645 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
3647 static const struct nl_policy tca_htb_policy[] = {
3648 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
3649 .min_len = sizeof(struct tc_htb_opt) },
3652 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
3653 const struct tc_htb_opt *htb;
3655 if (!nl_parse_nested(nl_options, tca_htb_policy,
3656 attrs, ARRAY_SIZE(tca_htb_policy))) {
3657 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
3661 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
3662 class->min_rate = htb->rate.rate;
3663 class->max_rate = htb->ceil.rate;
/* Kernel stores the buffer in ticks; convert back to bytes. */
3664 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
3665 class->priority = htb->prio;
/* Decodes an RTM_NEWTCLASS message: maps the class handle (1:minor) to an
 * OVS queue_id (minor - 1) and optionally extracts options and stats. */
3670 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3671 struct htb_class *options,
3672 struct netdev_queue_stats *stats)
3674 struct nlattr *nl_options;
3675 unsigned int handle;
3678 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3679 if (!error && queue_id) {
3680 unsigned int major = tc_get_major(handle);
3681 unsigned int minor = tc_get_minor(handle);
3682 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3683 *queue_id = minor - 1;
3688 if (!error && options) {
3689 error = htb_parse_tca_options__(nl_options, options);
/* Parses the root-class "max-rate" (bits/s in the database, bytes/s here)
 * from 'details'; if absent, estimates it from the link's current features,
 * defaulting to 100 Mbps.  min_rate mirrors max_rate for the root class. */
3695 htb_parse_qdisc_details__(struct netdev *netdev_,
3696 const struct smap *details, struct htb_class *hc)
3698 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3699 const char *max_rate_s;
3701 max_rate_s = smap_get(details, "max-rate");
3702 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3703 if (!hc->max_rate) {
3704 enum netdev_features current;
3706 netdev_linux_read_features(netdev);
3707 current = !netdev->get_features_error ? netdev->current : 0;
3708 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3710 hc->min_rate = hc->max_rate;
/* Parses per-queue HTB parameters from 'details' (rates in bits/s in the
 * database, stored as bytes/s), clamping them to sane values relative to the
 * device MTU and the root class's max_rate. */
3716 htb_parse_class_details__(struct netdev *netdev,
3717 const struct smap *details, struct htb_class *hc)
3719 const struct htb *htb = htb_get__(netdev);
3720 const char *min_rate_s = smap_get(details, "min-rate");
3721 const char *max_rate_s = smap_get(details, "max-rate");
3722 const char *burst_s = smap_get(details, "burst");
3723 const char *priority_s = smap_get(details, "priority");
3726 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3728 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
3729 netdev_get_name(netdev));
3733 /* HTB requires at least an mtu sized min-rate to send any traffic even
3734 * on uncongested links. */
3735 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3736 hc->min_rate = MAX(hc->min_rate, mtu);
3737 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
3740 hc->max_rate = (max_rate_s
3741 ? strtoull(max_rate_s, NULL, 10) / 8
3743 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
3744 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
3748 * According to hints in the documentation that I've read, it is important
3749 * that 'burst' be at least as big as the largest frame that might be
3750 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
3751 * but having it a bit too small is a problem. Since netdev_get_mtu()
3752 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
3753 * the MTU. We actually add 64, instead of 14, as a guard against
3754 * additional headers get tacked on somewhere that we're not aware of. */
3755 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
3756 hc->burst = MAX(hc->burst, mtu + 64);
3759 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Queries the kernel for the HTB class 'handle' (child of 'parent') on
 * 'netdev' and parses the reply into 'options' and/or 'stats' (either may
 * be null).  Returns 0 on success, otherwise a positive errno value. */
3765 htb_query_class__(const struct netdev *netdev, unsigned int handle,
3766                   unsigned int parent, struct htb_class *options,
3767                   struct netdev_queue_stats *stats)
3769     struct ofpbuf *reply;
3772     error = tc_query_class(netdev, handle, parent, &reply);
3774         error = htb_parse_tcmsg__(reply, NULL, options, stats);
     /* The reply buffer is owned here; free it after parsing. */
3775         ofpbuf_delete(reply);
/* tc_install callback for "linux-htb": creates the HTB qdisc on 'netdev',
 * configures the default class 1:fffe from 'details', and records the new
 * tc state via htb_install__(). */
3781 htb_tc_install(struct netdev *netdev, const struct smap *details)
3785     error = htb_setup_qdisc__(netdev);
3787         struct htb_class hc;
3789         htb_parse_qdisc_details__(netdev, details, &hc);
3790         error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3791                                   tc_make_handle(1, 0), &hc);
3793             htb_install__(netdev, hc.max_rate);
/* Converts a generic tc_queue pointer back to its containing htb_class. */
3799 static struct htb_class *
3800 htb_class_cast__(const struct tc_queue *queue)
3802     return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Creates or updates the in-memory htb_class record for 'queue_id' on
 * 'netdev', copying the rate/burst/priority settings from 'hc'. */
3806 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
3807                    const struct htb_class *hc)
3809     struct htb *htb = htb_get__(netdev);
3810     size_t hash = hash_int(queue_id, 0);
3811     struct tc_queue *queue;
3812     struct htb_class *hcp;
3814     queue = tc_find_queue__(netdev, queue_id, hash);
     /* Reuse the existing record if the queue is already known... */
3816         hcp = htb_class_cast__(queue);
     /* ...otherwise allocate and insert a fresh one. */
3818         hcp = xmalloc(sizeof *hcp);
3819         queue = &hcp->tc_queue;
3820         queue->queue_id = queue_id;
3821         queue->created = time_msec();
3822         hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
3825     hcp->min_rate = hc->min_rate;
3826     hcp->max_rate = hc->max_rate;
3827     hcp->burst = hc->burst;
3828     hcp->priority = hc->priority;
/* tc_load callback for "linux-htb": reconstructs OVS's view of an HTB qdisc
 * already present in the kernel by querying the default class and dumping
 * every queue class. */
3832 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3835     struct queue_dump_state state;
3836     struct htb_class hc;
3838     /* Get qdisc options. */
3840     htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3841     htb_install__(netdev, hc.max_rate);
     /* Get queues: parse each dumped class message into a queue record. */
3844     if (!start_queue_dump(netdev, &state)) {
3847     while (nl_dump_next(&state.dump, &msg, &state.buf)) {
3848         unsigned int queue_id;
3850         if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3851             htb_update_queue__(netdev, queue_id, &hc);
3854     finish_queue_dump(&state);
/* tc_destroy callback for "linux-htb": frees every queue record and the
 * htb structure itself. */
3860 htb_tc_destroy(struct tc *tc)
3862     struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
3863     struct htb_class *hc;
     /* HMAP_FOR_EACH_POP removes each node as it iterates, so the hmap is
      * empty when the loop ends. */
3865     HMAP_FOR_EACH_POP (hc, tc_queue.hmap_node, &htb->tc.queues) {
/* qdisc_get callback for "linux-htb": reports max-rate in bits/s (stored
 * internally as bytes/s, hence the "* 8"). */
3873 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
3875     const struct htb *htb = htb_get__(netdev);
3876     smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
/* qdisc_set callback for "linux-htb": reconfigures the default class
 * 1:fffe from 'details' and records the new qdisc-wide max-rate. */
3881 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
3883     struct htb_class hc;
3886     htb_parse_qdisc_details__(netdev, details, &hc);
3887     error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3888                               tc_make_handle(1, 0), &hc);
3890         htb_get__(netdev)->max_rate = hc.max_rate;
/* class_get callback for "linux-htb": exposes a queue's settings as smap
 * 'details', converting bytes/s back to bits/s.  max-rate is omitted when
 * it equals min-rate (i.e. it was never configured separately). */
3896 htb_class_get(const struct netdev *netdev OVS_UNUSED,
3897               const struct tc_queue *queue, struct smap *details)
3899     const struct htb_class *hc = htb_class_cast__(queue);
3901     smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3902     if (hc->min_rate != hc->max_rate) {
3903         smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3905     smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
3907     smap_add_format(details, "priority", "%u", hc->priority);
/* class_set callback for "linux-htb": configures kernel class 1:(queue_id+1)
 * under parent 1:fffe from 'details', then mirrors the result in the
 * in-memory queue map. */
3913 htb_class_set(struct netdev *netdev, unsigned int queue_id,
3914               const struct smap *details)
3916     struct htb_class hc;
3919     error = htb_parse_class_details__(netdev, details, &hc);
     /* Queue N maps to kernel minor number N+1 (minor 0 is the qdisc). */
3924     error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3925                               tc_make_handle(1, 0xfffe), &hc);
3930     htb_update_queue__(netdev, queue_id, &hc);
/* class_delete callback for "linux-htb": removes the kernel class and, on
 * success, drops the corresponding in-memory queue record. */
3935 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
3937     struct htb_class *hc = htb_class_cast__(queue);
3938     struct htb *htb = htb_get__(netdev);
3941     error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3943         hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* class_get_stats callback for "linux-htb": fetches kernel statistics for
 * one queue; options output is not needed, so NULL is passed. */
3950 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3951                     struct netdev_queue_stats *stats)
3953     return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3954                              tc_make_handle(1, 0xfffe), NULL, stats);
/* class_dump_stats callback for "linux-htb": parses one dumped class
 * message and invokes 'cb' for it if its handle identifies an OVS queue
 * (major 1, minor in [1, HTB_N_QUEUES]). */
3958 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3959                      const struct ofpbuf *nlmsg,
3960                      netdev_dump_queue_stats_cb *cb, void *aux)
3962     struct netdev_queue_stats stats;
3963     unsigned int handle, major, minor;
3966     error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3971     major = tc_get_major(handle);
3972     minor = tc_get_minor(handle);
3973     if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3974         (*cb)(minor - 1, &stats, aux);
/* tc_ops vtable for the OVS "linux-htb" QoS type, backed by the kernel
 * "htb" qdisc. */
3979 static const struct tc_ops tc_ops_htb = {
3980     "htb",                      /* linux_name */
3981     "linux-htb",                /* ovs_name */
3982     HTB_N_QUEUES,               /* n_queues */
3991     htb_class_get_stats,
3992     htb_class_dump_stats
3995 /* "linux-hfsc" traffic control class. */
3997 #define HFSC_N_QUEUES 0xf000
4005 struct tc_queue tc_queue;
/* Returns 'netdev_''s tc state downcast to the HFSC-specific struct.  Only
 * valid while tc_ops_hfsc is installed on the netdev. */
4010 static struct hfsc *
4011 hfsc_get__(const struct netdev *netdev_)
4013     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4014     return CONTAINER_OF(netdev->tc, struct hfsc, tc);
/* Converts a generic tc_queue pointer back to its containing hfsc_class. */
4017 static struct hfsc_class *
4018 hfsc_class_cast__(const struct tc_queue *queue)
4020     return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Allocates and installs fresh HFSC tc state on 'netdev_' with the given
 * qdisc-wide 'max_rate' (bytes/s). */
4024 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
4026     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4029     hfsc = xmalloc(sizeof *hfsc);
4030     tc_init(&hfsc->tc, &tc_ops_hfsc);
4031     hfsc->max_rate = max_rate;
4032     netdev->tc = &hfsc->tc;
/* Creates or updates the in-memory hfsc_class record for 'queue_id' on
 * 'netdev', copying the min/max rates from 'hc'. */
4036 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
4037                     const struct hfsc_class *hc)
4041     struct hfsc_class *hcp;
4042     struct tc_queue *queue;
4044     hfsc = hfsc_get__(netdev);
4045     hash = hash_int(queue_id, 0);
4047     queue = tc_find_queue__(netdev, queue_id, hash);
     /* Reuse the existing record if present, otherwise insert a new one. */
4049         hcp = hfsc_class_cast__(queue);
4051         hcp = xmalloc(sizeof *hcp);
4052         queue = &hcp->tc_queue;
4053         queue->queue_id = queue_id;
4054         queue->created = time_msec();
4055         hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
4058     hcp->min_rate = hc->min_rate;
4059     hcp->max_rate = hc->max_rate;
/* Parses the nested TCA_OPTIONS attribute of a kernel HFSC class message
 * into 'class'.  Only linear service curves of the restricted shape that
 * OVS itself creates (rsc == fsc, usc >= rsc) are accepted; anything else
 * is rejected with a warning. */
4063 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
4065     const struct tc_service_curve *rsc, *fsc, *usc;
4066     static const struct nl_policy tca_hfsc_policy[] = {
4068             .type = NL_A_UNSPEC,
4070             .min_len = sizeof(struct tc_service_curve),
4073             .type = NL_A_UNSPEC,
4075             .min_len = sizeof(struct tc_service_curve),
4078             .type = NL_A_UNSPEC,
4080             .min_len = sizeof(struct tc_service_curve),
4083     struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
4085     if (!nl_parse_nested(nl_options, tca_hfsc_policy,
4086                          attrs, ARRAY_SIZE(tca_hfsc_policy))) {
4087         VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
4091     rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
4092     fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
4093     usc = nl_attr_get(attrs[TCA_HFSC_USC]);
     /* m1/d nonzero would mean a two-segment (non-linear) curve. */
4095     if (rsc->m1 != 0 || rsc->d != 0 ||
4096         fsc->m1 != 0 || fsc->d != 0 ||
4097         usc->m1 != 0 || usc->d != 0) {
4098         VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4099                      "Non-linear service curves are not supported.");
4103     if (rsc->m2 != fsc->m2) {
4104         VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4105                      "Real-time service curves are not supported ");
4109     if (rsc->m2 > usc->m2) {
4110         VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4111                      "Min-rate service curve is greater than "
4112                      "the max-rate service curve.");
     /* Slope m2 of the fair-share curve is the min rate; of the upper-limit
      * curve, the max rate. */
4116     class->min_rate = fsc->m2;
4117     class->max_rate = usc->m2;
/* Parses an HFSC class netlink message into '*queue_id', 'options', and
 * 'stats' (each may be null).  Returns 0 on success, otherwise a positive
 * errno value. */
4122 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
4123                    struct hfsc_class *options,
4124                    struct netdev_queue_stats *stats)
4127     unsigned int handle;
4128     struct nlattr *nl_options;
4130     error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
4136         unsigned int major, minor;
4138         major = tc_get_major(handle);
4139         minor = tc_get_minor(handle);
     /* Kernel minor N+1 corresponds to OVS queue N. */
4140         if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
4141             *queue_id = minor - 1;
4148         error = hfsc_parse_tca_options__(nl_options, options);
/* Queries the kernel for HFSC class 'handle' (child of 'parent') on
 * 'netdev' and parses the reply into 'options' and/or 'stats'.  Returns 0
 * on success, otherwise a positive errno value. */
4155 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
4156                    unsigned int parent, struct hfsc_class *options,
4157                    struct netdev_queue_stats *stats)
4160     struct ofpbuf *reply;
4162     error = tc_query_class(netdev, handle, parent, &reply);
4167     error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
4168     ofpbuf_delete(reply);
/* Parses qdisc-level 'details' for HFSC: reads "max-rate" (bits/s) and
 * converts to bytes/s; when absent, falls back to the link speed reported
 * by the driver (default 100 Mbps if features are unavailable).  The
 * default class uses max_rate for both curves. */
4173 hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
4174                            struct hfsc_class *class)
4176     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4178     const char *max_rate_s;
4180     max_rate_s = smap_get(details, "max-rate");
4181     max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
4184         enum netdev_features current;
4186         netdev_linux_read_features(netdev);
4187         current = !netdev->get_features_error ? netdev->current : 0;
4188         max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
4191     class->min_rate = max_rate;
4192     class->max_rate = max_rate;
/* Parses per-queue 'details' ("min-rate", "max-rate", in bits/s) into
 * 'class' in bytes/s.  min-rate is floored at 1 and both rates are clamped
 * to the qdisc-wide max-rate; max-rate can never be below min-rate. */
4196 hfsc_parse_class_details__(struct netdev *netdev,
4197                            const struct smap *details,
4198                            struct hfsc_class * class)
4200     const struct hfsc *hfsc;
4201     uint32_t min_rate, max_rate;
4202     const char *min_rate_s, *max_rate_s;
4204     hfsc = hfsc_get__(netdev);
4205     min_rate_s = smap_get(details, "min-rate");
4206     max_rate_s = smap_get(details, "max-rate");
4208     min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
4209     min_rate = MAX(min_rate, 1);
4210     min_rate = MIN(min_rate, hfsc->max_rate);
4212     max_rate = (max_rate_s
4213                 ? strtoull(max_rate_s, NULL, 10) / 8
4215     max_rate = MAX(max_rate, min_rate);
4216     max_rate = MIN(max_rate, hfsc->max_rate);
4218     class->min_rate = min_rate;
4219     class->max_rate = max_rate;
4224 /* Create an HFSC qdisc.
4226  * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
4228 hfsc_setup_qdisc__(struct netdev * netdev)
4230     struct tcmsg *tcmsg;
4231     struct ofpbuf request;
4232     struct tc_hfsc_qopt opt;
     /* Remove any existing root qdisc first so the add cannot conflict. */
4234     tc_del_qdisc(netdev);
4236     tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
4237                             NLM_F_EXCL | NLM_F_CREATE, &request);
4243     tcmsg->tcm_handle = tc_make_handle(1, 0);
4244     tcmsg->tcm_parent = TC_H_ROOT;
4246     memset(&opt, 0, sizeof opt);
4249     nl_msg_put_string(&request, TCA_KIND, "hfsc");
4250     nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
4252     return tc_transact(&request, NULL);
4255 /* Create an HFSC class.
4257  * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
4258  * sc rate <min_rate> ul rate <max_rate>" */
4260 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
4261                    unsigned int parent, struct hfsc_class *class)
4265     struct tcmsg *tcmsg;
4266     struct ofpbuf request;
4267     struct tc_service_curve min, max;
4269     tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
4275     tcmsg->tcm_handle = handle;
4276     tcmsg->tcm_parent = parent;
     /* Linear service curves: only slope m2 is set (m1/d elided here). */
4280     min.m2 = class->min_rate;
4284     max.m2 = class->max_rate;
4286     nl_msg_put_string(&request, TCA_KIND, "hfsc");
4287     opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
     /* 'min' serves as both the real-time and the fair-share curve. */
4288     nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
4289     nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
4290     nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
4291     nl_msg_end_nested(&request, opt_offset);
4293     error = tc_transact(&request, NULL);
4295         VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
4296                      "min-rate %ubps, max-rate %ubps (%s)",
4297                      netdev_get_name(netdev),
4298                      tc_get_major(handle), tc_get_minor(handle),
4299                      tc_get_major(parent), tc_get_minor(parent),
4300                      class->min_rate, class->max_rate, ovs_strerror(error));
/* tc_install callback for "linux-hfsc": creates the HFSC qdisc, configures
 * the default class 1:fffe from 'details', and installs the in-memory tc
 * state. */
4307 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
4310     struct hfsc_class class;
4312     error = hfsc_setup_qdisc__(netdev);
4318     hfsc_parse_qdisc_details__(netdev, details, &class);
4319     error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4320                                tc_make_handle(1, 0), &class);
4326     hfsc_install__(netdev, class.max_rate);
/* tc_load callback for "linux-hfsc": rebuilds OVS's view of an HFSC qdisc
 * already in the kernel from the default class plus a class dump. */
4331 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4334     struct queue_dump_state state;
4335     struct hfsc_class hc;
4338     hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
4339     hfsc_install__(netdev, hc.max_rate);
4341     if (!start_queue_dump(netdev, &state)) {
4345     while (nl_dump_next(&state.dump, &msg, &state.buf)) {
4346         unsigned int queue_id;
4348         if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
4349             hfsc_update_queue__(netdev, queue_id, &hc);
4353     finish_queue_dump(&state);
/* tc_destroy callback for "linux-hfsc": frees every queue record and the
 * hfsc structure itself. */
4358 hfsc_tc_destroy(struct tc *tc)
4361     struct hfsc_class *hc, *next;
4363     hfsc = CONTAINER_OF(tc, struct hfsc, tc);
     /* _SAFE variant: each node is removed while iterating. */
4365     HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
4366         hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* qdisc_get callback for "linux-hfsc": reports max-rate in bits/s. */
4375 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
4377     const struct hfsc *hfsc;
4378     hfsc = hfsc_get__(netdev);
4379     smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
/* qdisc_set callback for "linux-hfsc": reconfigures the default class and
 * records the new qdisc-wide max-rate on success. */
4384 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
4387     struct hfsc_class class;
4389     hfsc_parse_qdisc_details__(netdev, details, &class);
4390     error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4391                                tc_make_handle(1, 0), &class);
4394         hfsc_get__(netdev)->max_rate = class.max_rate;
/* class_get callback for "linux-hfsc": reports a queue's rates in bits/s;
 * max-rate is omitted when equal to min-rate. */
4401 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
4402                const struct tc_queue *queue, struct smap *details)
4404     const struct hfsc_class *hc;
4406     hc = hfsc_class_cast__(queue);
4407     smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
4408     if (hc->min_rate != hc->max_rate) {
4409         smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
/* class_set callback for "linux-hfsc": configures kernel class
 * 1:(queue_id+1) under parent 1:fffe, then mirrors the result in the
 * in-memory queue map. */
4415 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
4416                const struct smap *details)
4419     struct hfsc_class class;
4421     error = hfsc_parse_class_details__(netdev, details, &class);
4426     error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
4427                                tc_make_handle(1, 0xfffe), &class);
4432     hfsc_update_queue__(netdev, queue_id, &class);
/* class_delete callback for "linux-hfsc": removes the kernel class and, on
 * success, drops the in-memory queue record. */
4437 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
4441     struct hfsc_class *hc;
4443     hc = hfsc_class_cast__(queue);
4444     hfsc = hfsc_get__(netdev);
4446     error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
4448         hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* class_get_stats callback for "linux-hfsc": fetches kernel statistics for
 * one queue; options output is not needed, so NULL is passed. */
4455 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
4456                      struct netdev_queue_stats *stats)
4458     return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
4459                               tc_make_handle(1, 0xfffe), NULL, stats);
/* class_dump_stats callback for "linux-hfsc": parses one dumped class
 * message and invokes 'cb' for it if the handle identifies an OVS queue. */
4463 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
4464                       const struct ofpbuf *nlmsg,
4465                       netdev_dump_queue_stats_cb *cb, void *aux)
4467     struct netdev_queue_stats stats;
4468     unsigned int handle, major, minor;
4471     error = tc_parse_class(nlmsg, &handle, NULL, &stats);
4476     major = tc_get_major(handle);
4477     minor = tc_get_minor(handle);
4478     if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
4479         (*cb)(minor - 1, &stats, aux);
/* tc_ops vtable for the OVS "linux-hfsc" QoS type, backed by the kernel
 * "hfsc" qdisc. */
4484 static const struct tc_ops tc_ops_hfsc = {
4485     "hfsc",                     /* linux_name */
4486     "linux-hfsc",               /* ovs_name */
4487     HFSC_N_QUEUES,              /* n_queues */
4488     hfsc_tc_install,            /* tc_install */
4489     hfsc_tc_load,               /* tc_load */
4490     hfsc_tc_destroy,            /* tc_destroy */
4491     hfsc_qdisc_get,             /* qdisc_get */
4492     hfsc_qdisc_set,             /* qdisc_set */
4493     hfsc_class_get,             /* class_get */
4494     hfsc_class_set,             /* class_set */
4495     hfsc_class_delete,          /* class_delete */
4496     hfsc_class_get_stats,       /* class_get_stats */
4497     hfsc_class_dump_stats       /* class_dump_stats */
4500 /* "linux-noop" traffic control class. */
/* Points 'netdev_' at a shared, immutable default tc object; no kernel
 * state is touched.  The const cast is safe because nothing writes to it. */
4503 noop_install__(struct netdev *netdev_)
4505     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4506     static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
4508     netdev->tc = CONST_CAST(struct tc *, &tc);
/* tc_install callback for "linux-noop": installs the shared no-op tc
 * state; 'details' are ignored. */
4512 noop_tc_install(struct netdev *netdev,
4513                    const struct smap *details OVS_UNUSED)
4515     noop_install__(netdev);
/* tc_load callback for "linux-noop". */
4520 noop_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4522     noop_install__(netdev);
/* tc_ops vtable for "linux-noop": OVS leaves the device's qdisc entirely
 * alone (no linux_name, so it never matches a kernel qdisc by name). */
4526 static const struct tc_ops tc_ops_noop = {
4527     NULL,                       /* linux_name */
4528     "linux-noop",               /* ovs_name */
4532     NULL,                       /* tc_destroy */
4533     NULL,                       /* qdisc_get */
4534     NULL,                       /* qdisc_set */
4535     NULL,                       /* class_get */
4536     NULL,                       /* class_set */
4537     NULL,                       /* class_delete */
4538     NULL,                       /* class_get_stats */
4539     NULL                        /* class_dump_stats */
4542 /* "linux-default" traffic control class.
4544 * This class represents the default, unnamed Linux qdisc. It corresponds to
4545 * the "" (empty string) QoS type in the OVS database. */
/* Points 'netdev_' at the shared tc object representing the default,
 * unnamed Linux qdisc. */
4548 default_install__(struct netdev *netdev_)
4550     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4551     static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
4553     /* Nothing but a tc class implementation is allowed to write to a tc.  This
4554      * class never does that, so we can legitimately use a const tc object. */
4555     netdev->tc = CONST_CAST(struct tc *, &tc);
/* tc_install callback for the "" (default) QoS type; 'details' ignored. */
4559 default_tc_install(struct netdev *netdev,
4560                    const struct smap *details OVS_UNUSED)
4562     default_install__(netdev);
/* tc_load callback for the "" (default) QoS type. */
4567 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4569     default_install__(netdev);
/* tc_ops vtable for the default, unnamed Linux qdisc ("" QoS type). */
4573 static const struct tc_ops tc_ops_default = {
4574     NULL,                       /* linux_name */
4579     NULL,                       /* tc_destroy */
4580     NULL,                       /* qdisc_get */
4581     NULL,                       /* qdisc_set */
4582     NULL,                       /* class_get */
4583     NULL,                       /* class_set */
4584     NULL,                       /* class_delete */
4585     NULL,                       /* class_get_stats */
4586     NULL                        /* class_dump_stats */
4589 /* "linux-other" traffic control class.
/* tc_load callback for "linux-other": records that some qdisc OVS does not
 * understand is installed, using a shared immutable tc object. */
4594 other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
4596     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4597     static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
4599     /* Nothing but a tc class implementation is allowed to write to a tc.  This
4600      * class never does that, so we can legitimately use a const tc object. */
4601     netdev->tc = CONST_CAST(struct tc *, &tc);
/* tc_ops vtable for "linux-other": read-only placeholder for unrecognized
 * kernel qdiscs; only tc_load is implemented. */
4605 static const struct tc_ops tc_ops_other = {
4606     NULL,                       /* linux_name */
4607     "linux-other",              /* ovs_name */
4609     NULL,                       /* tc_install */
4611     NULL,                       /* tc_destroy */
4612     NULL,                       /* qdisc_get */
4613     NULL,                       /* qdisc_set */
4614     NULL,                       /* class_get */
4615     NULL,                       /* class_set */
4616     NULL,                       /* class_delete */
4617     NULL,                       /* class_get_stats */
4618     NULL                        /* class_dump_stats */
4621 /* Traffic control. */
4623 /* Number of kernel "tc" ticks per second. */
4624 static double ticks_per_s;
4626 /* Number of kernel "jiffies" per second. This is used for the purpose of
4627 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
4628 * one jiffy's worth of data.
4630 * There are two possibilities here:
4632 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
4633 * approximate range of 100 to 1024. That means that we really need to
4634 * make sure that the qdisc can buffer that much data.
4636 * - 'buffer_hz' is an absurdly large number. That means that the kernel
4637 * has finely granular timers and there's no need to fudge additional room
4638 * for buffers. (There's no extra effort needed to implement that: the
4639 * large 'buffer_hz' is used as a divisor, so practically any number will
4640 * come out as 0 in the division. Small integer results in the case of
4641 * really high dividends won't have any real effect anyhow.)
4643 static unsigned int buffer_hz;
/* Composes and returns the tc handle 'major':'minor' (major in the upper
 * 16 bits, minor in the lower 16 bits). */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    unsigned int shifted_major = major << 16;

    return TC_H_MAKE(shifted_major, minor);
}
/* Extracts the major number (upper 16 bits) from tc handle 'handle'. */
static unsigned int
tc_get_major(unsigned int handle)
{
    unsigned int major_bits = TC_H_MAJ(handle);

    return major_bits >> 16;
}
/* Extracts the minor number (lower 16 bits) from tc handle 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    unsigned int minor_bits = TC_H_MIN(handle);

    return minor_bits;
}
/* Initializes 'request' as an rtnetlink tc request of the given 'type'
 * (e.g. RTM_NEWQDISC) and 'flags' for 'netdev', and returns a pointer to
 * the embedded tcmsg so the caller can set tcm_handle/tcm_parent.
 * NOTE(review): the error path when get_ifindex() fails is elided in this
 * view; presumably NULL is returned then — confirm against full source. */
4666 static struct tcmsg *
4667 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
4668                 struct ofpbuf *request)
4670     struct tcmsg *tcmsg;
4674     error = get_ifindex(netdev, &ifindex);
4679     ofpbuf_init(request, 512);
4680     nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
4681     tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
4682     tcmsg->tcm_family = AF_UNSPEC;
4683     tcmsg->tcm_ifindex = ifindex;
4684     /* Caller should fill in tcmsg->tcm_handle. */
4685     /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' on the NETLINK_ROUTE socket, optionally capturing the
 * reply in '*replyp', and releases 'request''s memory in all cases.
 * Returns 0 on success, otherwise a positive errno value. */
4691 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
4693     int error = nl_transact(NETLINK_ROUTE, request, replyp);
4694     ofpbuf_uninit(request);
4698 /* Adds or deletes a root ingress qdisc on 'netdev'.  We use this for
4699  * policing configuration.
4701  * This function is equivalent to running the following when 'add' is true:
4702  *     /sbin/tc qdisc add dev <devname> handle ffff: ingress
4704  * This function is equivalent to running the following when 'add' is false:
4705  *     /sbin/tc qdisc del dev <devname> handle ffff: ingress
4707  * The configuration and stats may be seen with the following command:
4708  *     /sbin/tc -s qdisc show dev <devname>
4710  * Returns 0 if successful, otherwise a positive errno value.
4713 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
4715     struct ofpbuf request;
4716     struct tcmsg *tcmsg;
4718     int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
4719     int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
4721     tcmsg = tc_make_request(netdev, type, flags, &request);
4725     tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
4726     tcmsg->tcm_parent = TC_H_INGRESS;
4727     nl_msg_put_string(&request, TCA_KIND, "ingress");
4728     nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
4730     error = tc_transact(&request, NULL);
4732         /* If we're deleting the qdisc, don't worry about some of the
4733          * error conditions. */
     /* ENOENT/EINVAL on delete just mean there was nothing to remove. */
4734         if (!add && (error == ENOENT || error == EINVAL)) {
4743 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
4746  * This function is equivalent to running:
4747  *     /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
4748  *              basic police rate <kbits_rate>kbit burst <kbits_burst>k
4751  * The configuration and stats may be seen with the following command:
4752  *     /sbin/tc -s filter show dev <devname> parent ffff:
4754  * Returns 0 if successful, otherwise a positive errno value.
4757 tc_add_policer(struct netdev *netdev,
4758                uint32_t kbits_rate, uint32_t kbits_burst)
4760     struct tc_police tc_police;
4761     struct ofpbuf request;
4762     struct tcmsg *tcmsg;
4763     size_t basic_offset;
4764     size_t police_offset;
     /* Drop packets that exceed the policed rate. */
4768     memset(&tc_police, 0, sizeof tc_police);
4769     tc_police.action = TC_POLICE_SHOT;
4770     tc_police.mtu = mtu;
4771     tc_fill_rate(&tc_police.rate, ((uint64_t) kbits_rate * 1000)/8, mtu);
4773     /* The following appears wrong in one way: In networking a kilobit is
4774      * usually 1000 bits but this uses 1024 bits.
4776      * However if you "fix" those problems then "tc filter show ..." shows
4777      * "125000b", meaning 125,000 bits, when OVS configures it for 1000 kbit ==
4778      * 1,000,000 bits, whereas this actually ends up doing the right thing from
4779      * tc's point of view.  Whatever. */
     /* MIN() guards the kbits_burst * 1024 multiplication from overflow. */
4780     tc_police.burst = tc_bytes_to_ticks(
4781         tc_police.rate.rate, MIN(UINT32_MAX / 1024, kbits_burst) * 1024 / 8);
4783     tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
4784                             NLM_F_EXCL | NLM_F_CREATE, &request);
4788     tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
4789     tcmsg->tcm_info = tc_make_handle(49,
4790                                      (OVS_FORCE uint16_t) htons(ETH_P_ALL));
4792     nl_msg_put_string(&request, TCA_KIND, "basic");
4793     basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4794     police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
4795     nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
4796     tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
4797     nl_msg_end_nested(&request, police_offset);
4798     nl_msg_end_nested(&request, basic_offset);
4800     error = tc_transact(&request, NULL);
4811 /* The values in psched are not individually very meaningful, but they are
4812 * important. The tables below show some values seen in the wild.
4816 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
4817 * (Before that, there are hints that it was 1000000000.)
4819 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
4823 * -----------------------------------
4824 * [1] 000c8000 000f4240 000f4240 00000064
4825 * [2] 000003e8 00000400 000f4240 3b9aca00
4826 * [3] 000003e8 00000400 000f4240 3b9aca00
4827 * [4] 000003e8 00000400 000f4240 00000064
4828 * [5] 000003e8 00000040 000f4240 3b9aca00
4829 * [6] 000003e8 00000040 000f4240 000000f9
4831 * a b c d ticks_per_s buffer_hz
4832 * ------- --------- ---------- ------------- ----------- -------------
4833 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
4834 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4835 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4836 * [4] 1,000 1,024 1,000,000 100 976,562 100
4837 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
4838 * [6] 1,000 64 1,000,000 249 15,625,000 249
4840 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
4841 * [2] 2.6.26-1-686-bigmem from Debian lenny
4842 * [3] 2.6.26-2-sparc64 from Debian lenny
4843 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
4844 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
4845 * [6] 2.6.34 from kernel.org on KVM
4847 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4848 static const char fn[] = "/proc/net/psched";
4849 unsigned int a, b, c, d;
4852 if (!ovsthread_once_start(&once)) {
4859 stream = fopen(fn, "r");
4861 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
4865 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
4866 VLOG_WARN("%s: read failed", fn);
4870 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
4874 VLOG_WARN("%s: invalid scheduler parameters", fn);
4878 ticks_per_s = (double) a * c / b;
4882 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
4885 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
4888 ovsthread_once_done(&once);
4891 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
4892  * rate of 'rate' bytes per second. */
4894 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
     /* 'ticks_per_s' is initialized lazily from /proc/net/psched (code
      * elided in this view). */
4897     return (rate * ticks) / ticks_per_s;
4900 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
4901  * rate of 'rate' bytes per second. */
4903 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
     /* Guard against division by zero when 'rate' is 0. */
4906     return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
4909 /* Returns the number of bytes that need to be reserved for qdisc buffering at
4910  * a transmission rate of 'rate' bytes per second. */
4912 tc_buffer_per_jiffy(unsigned int rate)
     /* With an absurdly large 'buffer_hz' this divides down to ~0, i.e. no
      * extra buffering needed (see the comment on 'buffer_hz' above). */
4915     return rate / buffer_hz;
4918 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
4919  * e.g. "htb", into '*kind' (if it is nonnull).  If 'options' is nonnull,
4920  * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
4921  * stores NULL into it if it is absent.
4923  * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
4926  * Returns 0 if successful, otherwise a positive errno value. */
4928 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
4929                struct nlattr **options)
4931     static const struct nl_policy tca_policy[] = {
4932         [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
4933         [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
4935     struct nlattr *ta[ARRAY_SIZE(tca_policy)];
     /* Attributes follow the nlmsghdr plus the fixed tcmsg header. */
4937     if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4938                          tca_policy, ta, ARRAY_SIZE(ta))) {
4939         VLOG_WARN_RL(&rl, "failed to parse qdisc message");
4944         *kind = nl_attr_get_string(ta[TCA_KIND]);
4948         *options = ta[TCA_OPTIONS];
4963 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
4964  * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
4965  * into '*options', and its queue statistics into '*stats'.  Any of the output
4966  * arguments may be null.
4968  * Returns 0 if successful, otherwise a positive errno value. */
4970 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
4971                struct nlattr **options, struct netdev_queue_stats *stats)
4973     static const struct nl_policy tca_policy[] = {
4974         [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
4975         [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
4977     struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4979     if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4980                          tca_policy, ta, ARRAY_SIZE(ta))) {
4981         VLOG_WARN_RL(&rl, "failed to parse class message");
     /* The class handle lives in the fixed tcmsg header, not an attribute. */
4986         struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
4987         *handlep = tc->tcm_handle;
4991         *options = ta[TCA_OPTIONS];
4995         const struct gnet_stats_queue *gsq;
4996         struct gnet_stats_basic gsb;
4998         static const struct nl_policy stats_policy[] = {
4999             [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
5000                                   .min_len = sizeof gsb },
5001             [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
5002                                   .min_len = sizeof *gsq },
5004         struct nlattr *sa[ARRAY_SIZE(stats_policy)];
5006         if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
5007                              sa, ARRAY_SIZE(sa))) {
5008             VLOG_WARN_RL(&rl, "failed to parse class stats");
5012         /* Alignment issues screw up the length of struct gnet_stats_basic on
5013          * some arch/bitsize combinations.  Newer versions of Linux have a
5014          * struct gnet_stats_basic_packed, but we can't depend on that.  The
5015          * easiest thing to do is just to make a copy. */
5016         memset(&gsb, 0, sizeof gsb);
5017         memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
5018                MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
5019         stats->tx_bytes = gsb.bytes;
5020         stats->tx_packets = gsb.packets;
5022         gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
5023         stats->tx_errors = gsq->drops;
     /* On parse failure 'stats' is zeroed rather than left undefined. */
5033         memset(stats, 0, sizeof *stats);
5038 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
 * on 'netdev', storing the Netlink reply in '*replyp' on success (caller
 * owns and must free it).  Returns 0 on success, otherwise a positive
 * errno value, logging a rate-limited warning on failure. */
5041 tc_query_class(const struct netdev *netdev,
5042                unsigned int handle, unsigned int parent,
5043                struct ofpbuf **replyp)
5045     struct ofpbuf request;
5046     struct tcmsg *tcmsg;
5049     tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
5053     tcmsg->tcm_handle = handle;
5054     tcmsg->tcm_parent = parent;
5056     error = tc_transact(&request, replyp);
5058         VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
5059                      netdev_get_name(netdev),
5060                      tc_get_major(handle), tc_get_minor(handle),
5061                      tc_get_major(parent), tc_get_minor(parent),
5062                      ovs_strerror(error));
5067 /* Equivalent to "tc class del dev <name> handle <handle>".
 * Returns 0 on success, otherwise a positive errno value, logging a
 * rate-limited warning on failure. */
5069 tc_delete_class(const struct netdev *netdev, unsigned int handle)
5071     struct ofpbuf request;
5072     struct tcmsg *tcmsg;
5075     tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
5079     tcmsg->tcm_handle = handle;
5080     tcmsg->tcm_parent = 0;
5082     error = tc_transact(&request, NULL);
5084         VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
5085                      netdev_get_name(netdev),
5086                      tc_get_major(handle), tc_get_minor(handle),
5087                      ovs_strerror(error));
5092 /* Equivalent to "tc qdisc del dev <name> root".  Also destroys the
 * corresponding in-memory tc state, if any. */
5094 tc_del_qdisc(struct netdev *netdev_)
5096     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5097     struct ofpbuf request;
5098     struct tcmsg *tcmsg;
5101     tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
5105     tcmsg->tcm_handle = tc_make_handle(1, 0);
5106     tcmsg->tcm_parent = TC_H_ROOT;
5108     error = tc_transact(&request, NULL);
5109     if (error == EINVAL) {
5110         /* EINVAL probably means that the default qdisc was in use, in which
5111          * case we've accomplished our purpose. */
     /* Tear down the cached tc state via the class's destroy hook. */
5114     if (!error && netdev->tc) {
5115         if (netdev->tc->ops->tc_destroy) {
5116             netdev->tc->ops->tc_destroy(netdev->tc);
/* Returns true if it is safe to issue RTM_GETQDISC on this kernel, i.e.
 * the kernel is 2.6.35 or newer (older kernels can OOPS on such a request
 * against a built-in qdisc; see tc_query_qdisc()).  The result is computed
 * once and cached. */
5124 getqdisc_is_safe(void)
5126     static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5127     static bool safe = false;
5129     if (ovsthread_once_start(&once)) {
5130         struct utsname utsname;
5133         if (uname(&utsname) == -1) {
5134             VLOG_WARN("uname failed (%s)", ovs_strerror(errno));
5135         } else if (!ovs_scan(utsname.release, "%d.%d", &major, &minor)) {
5136             VLOG_WARN("uname reported bad OS release (%s)", utsname.release);
5137         } else if (major < 2 || (major == 2 && minor < 35)) {
5138             VLOG_INFO("disabling unsafe RTM_GETQDISC in Linux kernel %s",
5143         ovsthread_once_done(&once);
/* If 'netdev''s qdisc type and parameters are not yet known, queries the
 * kernel to determine what they are. Returns 0 if successful, otherwise a
 * positive errno value. */
tc_query_qdisc(const struct netdev *netdev_)
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct ofpbuf request, *qdisc;
    const struct tc_ops *ops;
    struct tcmsg *tcmsg;

    /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
     * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
     * 2.6.35 without that fix backported to it.
     * To avoid the OOPS, we must not make a request that would attempt to dump
     * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
     * few others. There are a few ways that I can see to do this, but most of
     * them seem to be racy (and if you lose the race the kernel OOPSes). The
     * technique chosen here is to assume that any non-default qdisc that we
     * create will have a class with handle 1:0. The built-in qdiscs only have
     * a class with handle 0:0.
     * On Linux 2.6.35+ we use the straightforward method because it allows us
     * to handle non-builtin qdiscs without handle 1:0 (e.g. codel). However,
     * in such a case we get no response at all from the kernel (!) if a
     * builtin qdisc is in use (which is later caught by "!error &&
     * !qdisc->size"). */
    tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
    /* Safe kernels (2.6.35+): dump the root qdisc directly (handle 0:0,
     * parent TC_H_ROOT).  Unsafe kernels: probe for the 1:0 class instead,
     * per the comment above. */
    tcmsg->tcm_handle = tc_make_handle(getqdisc_is_safe() ? 0 : 1, 0);
    tcmsg->tcm_parent = getqdisc_is_safe() ? TC_H_ROOT : 0;

    /* Figure out what tc class to instantiate. */
    error = tc_transact(&request, &qdisc);
    if (!error && qdisc->size) {
        /* Got a reply with content: parse the qdisc kind and look up the
         * matching tc_ops; unparseable or unrecognized kinds fall back to
         * tc_ops_other. */
        error = tc_parse_qdisc(qdisc, &kind, NULL);
        ops = &tc_ops_other;
        ops = tc_lookup_linux_name(kind);
        static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
        VLOG_DBG_RL(&rl2, "unknown qdisc \"%s\"", kind);
        ops = &tc_ops_other;
    } else if ((!error && !qdisc->size) || error == ENOENT) {
        /* Either it's a built-in qdisc, or (on Linux pre-2.6.35) it's a qdisc
         * set up by some other entity that doesn't have a handle 1:0. We will
         * assume that it's the system default qdisc. */
        ops = &tc_ops_default;
    /* Who knows? Maybe the device got deleted. */
    VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
                 netdev_get_name(netdev_), ovs_strerror(error));
    ops = &tc_ops_other;

    /* Instantiate it. */
    load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
    /* tc_load must either succeed and set netdev->tc, or fail and leave it
     * NULL — the assertion enforces that invariant. */
    ovs_assert((load_error == 0) == (netdev->tc != NULL));
    ofpbuf_delete(qdisc);

    return error ? error : load_error;
5227 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
5228 approximate the time to transmit packets of various lengths. For an MTU of
5229 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
5230 represents two possible packet lengths; for a MTU of 513 through 1024, four
5231 possible lengths; and so on.
5233 Returns, for the specified 'mtu', the number of bits that packet lengths
5234 need to be shifted right to fit within such a 256-entry table. */
5236 tc_calc_cell_log(unsigned int mtu)
5241 mtu = ETH_PAYLOAD_MAX;
5243 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
5245 for (cell_log = 0; mtu >= 256; cell_log++) {
5252 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
5255 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
5257 memset(rate, 0, sizeof *rate);
5258 rate->cell_log = tc_calc_cell_log(mtu);
5259 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
5260 /* rate->cell_align = 0; */ /* distro headers. */
5261 rate->mpu = ETH_TOTAL_MIN;
/* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
 * attribute of the specified "type".
 *
 * See tc_calc_cell_log() above for a description of "rtab"s. */
static void
tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
{
    uint32_t *ticks;
    size_t i, n;

    ticks = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
    n = TC_RTAB_SIZE / sizeof *ticks;
    for (i = 0; i < n; i++) {
        /* Entry i covers packets up to (i + 1) << cell_log bytes, but never
         * less than the minimum packet unit. */
        unsigned int size = (i + 1) << rate->cell_log;
        if (size < rate->mpu) {
            size = rate->mpu;
        }
        ticks[i] = tc_bytes_to_ticks(rate->rate, size);
    }
}
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (A 'burst_bytes' of 0 means that no burst was
 * requested, so only the computed minimum applies.) */
static unsigned int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    /* The burst must cover at least one jiffy's worth of traffic plus one
     * full packet. */
    unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;

    return tc_bytes_to_ticks(Bps, burst_bytes > min_burst ? burst_bytes
                                                          : min_burst);
}
5296 /* Linux-only functions declared in netdev-linux.h */
5298 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
5299 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
5301 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
5302 const char *flag_name, bool enable)
5304 const char *netdev_name = netdev_get_name(netdev);
5305 struct ethtool_value evalue;
5309 COVERAGE_INC(netdev_get_ethtool);
5310 memset(&evalue, 0, sizeof evalue);
5311 error = netdev_linux_do_ethtool(netdev_name,
5312 (struct ethtool_cmd *)&evalue,
5313 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5318 COVERAGE_INC(netdev_set_ethtool);
5319 new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
5320 if (new_flags == evalue.data) {
5323 evalue.data = new_flags;
5324 error = netdev_linux_do_ethtool(netdev_name,
5325 (struct ethtool_cmd *)&evalue,
5326 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
5331 COVERAGE_INC(netdev_get_ethtool);
5332 memset(&evalue, 0, sizeof evalue);
5333 error = netdev_linux_do_ethtool(netdev_name,
5334 (struct ethtool_cmd *)&evalue,
5335 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5340 if (new_flags != evalue.data) {
5341 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
5342 "device %s failed", enable ? "enable" : "disable",
5343 flag_name, netdev_name);
5350 /* Utility functions. */
/* Copies 'src' into 'dst', performing format conversion in the process. */
netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
                                  const struct rtnl_link_stats *src)
    /* Straight field-by-field copy from the kernel's rtnl_link_stats layout
     * into OVS's netdev_stats layout.  Any netdev_stats counters that are not
     * assigned below are left untouched by this function. */
    dst->rx_packets = src->rx_packets;
    dst->tx_packets = src->tx_packets;
    dst->rx_bytes = src->rx_bytes;
    dst->tx_bytes = src->tx_bytes;
    dst->rx_errors = src->rx_errors;
    dst->tx_errors = src->tx_errors;
    dst->rx_dropped = src->rx_dropped;
    dst->tx_dropped = src->tx_dropped;
    dst->multicast = src->multicast;
    dst->collisions = src->collisions;
    dst->rx_length_errors = src->rx_length_errors;
    dst->rx_over_errors = src->rx_over_errors;
    dst->rx_crc_errors = src->rx_crc_errors;
    dst->rx_frame_errors = src->rx_frame_errors;
    dst->rx_fifo_errors = src->rx_fifo_errors;
    dst->rx_missed_errors = src->rx_missed_errors;
    dst->tx_aborted_errors = src->tx_aborted_errors;
    dst->tx_carrier_errors = src->tx_carrier_errors;
    dst->tx_fifo_errors = src->tx_fifo_errors;
    dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
    dst->tx_window_errors = src->tx_window_errors;
/* Copies 'src' into 'dst', performing format conversion in the process. */
netdev_stats_from_rtnl_link_stats64(struct netdev_stats *dst,
                                    const struct rtnl_link_stats64 *src)
    /* Same field-by-field copy as netdev_stats_from_rtnl_link_stats(), but
     * from the kernel's 64-bit rtnl_link_stats64 counters (IFLA_STATS64). */
    dst->rx_packets = src->rx_packets;
    dst->tx_packets = src->tx_packets;
    dst->rx_bytes = src->rx_bytes;
    dst->tx_bytes = src->tx_bytes;
    dst->rx_errors = src->rx_errors;
    dst->tx_errors = src->tx_errors;
    dst->rx_dropped = src->rx_dropped;
    dst->tx_dropped = src->tx_dropped;
    dst->multicast = src->multicast;
    dst->collisions = src->collisions;
    dst->rx_length_errors = src->rx_length_errors;
    dst->rx_over_errors = src->rx_over_errors;
    dst->rx_crc_errors = src->rx_crc_errors;
    dst->rx_frame_errors = src->rx_frame_errors;
    dst->rx_fifo_errors = src->rx_fifo_errors;
    dst->rx_missed_errors = src->rx_missed_errors;
    dst->tx_aborted_errors = src->tx_aborted_errors;
    dst->tx_carrier_errors = src->tx_carrier_errors;
    dst->tx_fifo_errors = src->tx_fifo_errors;
    dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
    dst->tx_window_errors = src->tx_window_errors;
5409 get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
5411 struct ofpbuf request;
5412 struct ofpbuf *reply;
5415 /* Filtering all counters by default */
5416 memset(stats, 0xFF, sizeof(struct netdev_stats));
5418 ofpbuf_init(&request, 0);
5419 nl_msg_put_nlmsghdr(&request,
5420 sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
5421 RTM_GETLINK, NLM_F_REQUEST);
5422 ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
5423 nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
5424 error = nl_transact(NETLINK_ROUTE, &request, &reply);
5425 ofpbuf_uninit(&request);
5430 if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
5431 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS64);
5432 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats64)) {
5433 netdev_stats_from_rtnl_link_stats64(stats, nl_attr_get(a));
5436 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS);
5437 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
5438 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
5441 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
5446 VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");
5451 ofpbuf_delete(reply);
5456 get_flags(const struct netdev *dev, unsigned int *flags)
5462 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
5464 *flags = ifr.ifr_flags;
5470 set_flags(const char *name, unsigned int flags)
5474 ifr.ifr_flags = flags;
5475 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
5479 do_get_ifindex(const char *netdev_name)
5484 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5485 COVERAGE_INC(netdev_get_ifindex);
5487 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
5489 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
5490 netdev_name, ovs_strerror(error));
5493 return ifr.ifr_ifindex;
5497 get_ifindex(const struct netdev *netdev_, int *ifindexp)
5499 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5501 if (!(netdev->cache_valid & VALID_IFINDEX)) {
5502 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
5505 netdev->get_ifindex_error = -ifindex;
5506 netdev->ifindex = 0;
5508 netdev->get_ifindex_error = 0;
5509 netdev->ifindex = ifindex;
5511 netdev->cache_valid |= VALID_IFINDEX;
5514 *ifindexp = netdev->ifindex;
5515 return netdev->get_ifindex_error;
/* Reads the hardware address of the device named 'netdev_name' into '*ea'
 * via the SIOCGIFHWADDR ioctl. */
get_etheraddr(const char *netdev_name, struct eth_addr *ea)
    /* Zero-fill so the kernel sees a fully-defined request structure. */
    memset(&ifr, 0, sizeof ifr);
    ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
    COVERAGE_INC(netdev_get_hwaddr);
    error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
    /* ENODEV probably means that a vif disappeared asynchronously and
     * hasn't been removed from the database yet, so reduce the log level
     * to INFO for that case. */
    VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
         "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
         netdev_name, ovs_strerror(error));
    hwaddr_family = ifr.ifr_hwaddr.sa_family;
    if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
        VLOG_INFO("%s device has unknown hardware address family %d",
                  netdev_name, hwaddr_family);
        /* NOTE(review): whether an unknown address family is rejected here
         * (early return) or the address is copied anyway as a best effort is
         * not visible in this excerpt — confirm against the full body. */
    }
    /* ETH_ADDR_LEN (6) bytes of sa_data hold the Ethernet address. */
    memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
5549 set_etheraddr(const char *netdev_name, const struct eth_addr mac)
5554 memset(&ifr, 0, sizeof ifr);
5555 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5556 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
5557 memcpy(ifr.ifr_hwaddr.sa_data, &mac, ETH_ADDR_LEN);
5558 COVERAGE_INC(netdev_set_hwaddr);
5559 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
5561 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
5562 netdev_name, ovs_strerror(error));
5568 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
5569 int cmd, const char *cmd_name)
5574 memset(&ifr, 0, sizeof ifr);
5575 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
5576 ifr.ifr_data = (caddr_t) ecmd;
5579 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
5581 if (error != EOPNOTSUPP) {
5582 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
5583 "failed: %s", cmd_name, name, ovs_strerror(error));
5585 /* The device doesn't support this operation. That's pretty
5586 * common, so there's no point in logging anything. */
5592 /* Returns an AF_PACKET raw socket or a negative errno value. */
5594 af_packet_sock(void)
5596 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5599 if (ovsthread_once_start(&once)) {
5600 sock = socket(AF_PACKET, SOCK_RAW, 0);
5602 int error = set_nonblocking(sock);
5609 VLOG_ERR("failed to create packet socket: %s",
5610 ovs_strerror(errno));
5612 ovsthread_once_done(&once);