2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2015 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
23 #include <arpa/inet.h>
25 #include <linux/filter.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <sys/types.h>
37 #include <sys/ioctl.h>
38 #include <sys/socket.h>
39 #include <sys/utsname.h>
40 #include <netpacket/packet.h>
42 #include <net/if_arp.h>
43 #include <net/if_packet.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
52 #include "dp-packet.h"
53 #include "dpif-netlink.h"
54 #include "dpif-netdev.h"
55 #include "dynamic-string.h"
56 #include "fatal-signal.h"
59 #include "netdev-provider.h"
60 #include "netdev-vport.h"
61 #include "netlink-notifier.h"
62 #include "netlink-socket.h"
65 #include "openflow/openflow.h"
66 #include "ovs-atomic.h"
68 #include "poll-loop.h"
69 #include "rtnetlink.h"
71 #include "socket-util.h"
74 #include "unaligned.h"
75 #include "openvswitch/vlog.h"
77 VLOG_DEFINE_THIS_MODULE(netdev_linux);
79 COVERAGE_DEFINE(netdev_set_policing);
80 COVERAGE_DEFINE(netdev_arp_lookup);
81 COVERAGE_DEFINE(netdev_get_ifindex);
82 COVERAGE_DEFINE(netdev_get_hwaddr);
83 COVERAGE_DEFINE(netdev_set_hwaddr);
84 COVERAGE_DEFINE(netdev_get_ethtool);
85 COVERAGE_DEFINE(netdev_set_ethtool);
88 /* These were introduced in Linux 2.6.14, so they might be missing if we have
90 #ifndef ADVERTISED_Pause
91 #define ADVERTISED_Pause (1 << 13)
93 #ifndef ADVERTISED_Asym_Pause
94 #define ADVERTISED_Asym_Pause (1 << 14)
97 /* These were introduced in Linux 2.6.24, so they might be missing if we
98 * have old headers. */
99 #ifndef ETHTOOL_GFLAGS
100 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
102 #ifndef ETHTOOL_SFLAGS
103 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
106 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
109 #define TC_RTAB_SIZE 1024
112 /* Linux 2.6.21 introduced struct tpacket_auxdata.
113 * Linux 2.6.27 added the tp_vlan_tci member.
114 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
115 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
116 * TP_STATUS_VLAN_TPID_VALID.
118 * With all this churn it's easiest to unconditionally define a replacement
119 * structure that has everything we want.
121 #ifndef PACKET_AUXDATA
122 #define PACKET_AUXDATA 8
124 #ifndef TP_STATUS_VLAN_VALID
125 #define TP_STATUS_VLAN_VALID (1 << 4)
127 #ifndef TP_STATUS_VLAN_TPID_VALID
128 #define TP_STATUS_VLAN_TPID_VALID (1 << 6)
130 #undef tpacket_auxdata
131 #define tpacket_auxdata rpl_tpacket_auxdata
132 struct tpacket_auxdata {
138 uint16_t tp_vlan_tci;
139 uint16_t tp_vlan_tpid;
142 /* Linux 2.6.35 introduced IFLA_STATS64 and rtnl_link_stats64.
144 * Tests for rtnl_link_stats64 don't seem to consistently work, e.g. on
145 * 2.6.32-431.29.2.el6.x86_64 (see report at
146 * http://openvswitch.org/pipermail/dev/2014-October/047978.html). Maybe
147 * if_link.h is not self-contained on those kernels. It is easiest to
148 * unconditionally define a replacement. */
150 #define IFLA_STATS64 23
152 #define rtnl_link_stats64 rpl_rtnl_link_stats64
153 struct rtnl_link_stats64 {
165 uint64_t rx_length_errors;
166 uint64_t rx_over_errors;
167 uint64_t rx_crc_errors;
168 uint64_t rx_frame_errors;
169 uint64_t rx_fifo_errors;
170 uint64_t rx_missed_errors;
172 uint64_t tx_aborted_errors;
173 uint64_t tx_carrier_errors;
174 uint64_t tx_fifo_errors;
175 uint64_t tx_heartbeat_errors;
176 uint64_t tx_window_errors;
178 uint64_t rx_compressed;
179 uint64_t tx_compressed;
183 VALID_IFINDEX = 1 << 0,
184 VALID_ETHERADDR = 1 << 1,
188 VALID_POLICING = 1 << 5,
189 VALID_VPORT_STAT_ERROR = 1 << 6,
190 VALID_DRVINFO = 1 << 7,
191 VALID_FEATURES = 1 << 8,
194 /* Traffic control. */
196 /* An instance of a traffic control class. Always associated with a particular
199 * Each TC implementation subclasses this with whatever additional data it
202 const struct tc_ops *ops;
203 struct hmap queues; /* Contains "struct tc_queue"s.
204 * Read by generic TC layer.
205 * Written only by TC implementation. */
208 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
210 /* One traffic control queue.
212 * Each TC implementation subclasses this with whatever additional data it
215 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
216 unsigned int queue_id; /* OpenFlow queue ID. */
217 long long int created; /* Time queue was created, in msecs. */
220 /* A particular kind of traffic control. Each implementation generally maps to
221 * one particular Linux qdisc class.
223 * The functions below return 0 if successful or a positive errno value on
224 * failure, except where otherwise noted. All of them must be provided, except
225 * where otherwise noted. */
227 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
228 * This is null for tc_ops_default and tc_ops_other, for which there are no
229 * appropriate values. */
230 const char *linux_name;
232 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
233 const char *ovs_name;
235 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
236 * queues. The queues are numbered 0 through n_queues - 1. */
237 unsigned int n_queues;
239 /* Called to install this TC class on 'netdev'. The implementation should
240 * make the Netlink calls required to set up 'netdev' with the right qdisc
241 * and configure it according to 'details'. The implementation may assume
242 * that the current qdisc is the default; that is, there is no need for it
243 * to delete the current qdisc before installing itself.
245 * The contents of 'details' should be documented as valid for 'ovs_name'
246 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
247 * (which is built as ovs-vswitchd.conf.db(8)).
249 * This function must return 0 if and only if it sets 'netdev->tc' to an
250 * initialized 'struct tc'.
252 * (This function is null for tc_ops_other, which cannot be installed. For
253 * other TC classes it should always be nonnull.) */
254 int (*tc_install)(struct netdev *netdev, const struct smap *details);
256 /* Called when the netdev code determines (through a Netlink query) that
257 * this TC class's qdisc is installed on 'netdev', but we didn't install
258 * it ourselves and so don't know any of the details.
260 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
261 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
262 * implementation should parse the other attributes of 'nlmsg' as
263 * necessary to determine its configuration. If necessary it should also
264 * use Netlink queries to determine the configuration of queues on
267 * This function must return 0 if and only if it sets 'netdev->tc' to an
268 * initialized 'struct tc'. */
269 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
271 /* Destroys the data structures allocated by the implementation as part of
272 * 'tc'. (This includes destroying 'tc->queues' by calling
275 * The implementation should not need to perform any Netlink calls. If
276 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
277 * (But it may not be desirable.)
279 * This function may be null if 'tc' is trivial. */
280 void (*tc_destroy)(struct tc *tc);
282 /* Retrieves details of 'netdev->tc' configuration into 'details'.
284 * The implementation should not need to perform any Netlink calls, because
285 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
286 * cached the configuration.
288 * The contents of 'details' should be documented as valid for 'ovs_name'
289 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
290 * (which is built as ovs-vswitchd.conf.db(8)).
292 * This function may be null if 'tc' is not configurable.
294 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
296 /* Reconfigures 'netdev->tc' according to 'details', performing any
297 * required Netlink calls to complete the reconfiguration.
299 * The contents of 'details' should be documented as valid for 'ovs_name'
300 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
301 * (which is built as ovs-vswitchd.conf.db(8)).
303 * This function may be null if 'tc' is not configurable.
305 int (*qdisc_set)(struct netdev *, const struct smap *details);
307 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
308 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
310 * The contents of 'details' should be documented as valid for 'ovs_name'
311 * in the "other_config" column in the "Queue" table in
312 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
314 * The implementation should not need to perform any Netlink calls, because
315 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
316 * cached the queue configuration.
318 * This function may be null if 'tc' does not have queues ('n_queues' is
320 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
321 struct smap *details);
323 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
324 * 'details', perfoming any required Netlink calls to complete the
325 * reconfiguration. The caller ensures that 'queue_id' is less than
328 * The contents of 'details' should be documented as valid for 'ovs_name'
329 * in the "other_config" column in the "Queue" table in
330 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
332 * This function may be null if 'tc' does not have queues or its queues are
333 * not configurable. */
334 int (*class_set)(struct netdev *, unsigned int queue_id,
335 const struct smap *details);
337 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
338 * tc_queue's within 'netdev->tc->queues'.
340 * This function may be null if 'tc' does not have queues or its queues
341 * cannot be deleted. */
342 int (*class_delete)(struct netdev *, struct tc_queue *queue);
344 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
345 * 'struct tc_queue's within 'netdev->tc->queues'.
347 * On success, initializes '*stats'.
349 * This function may be null if 'tc' does not have queues or if it cannot
350 * report queue statistics. */
351 int (*class_get_stats)(const struct netdev *netdev,
352 const struct tc_queue *queue,
353 struct netdev_queue_stats *stats);
355 /* Extracts queue stats from 'nlmsg', which is a response to a
356 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
358 * This function may be null if 'tc' does not have queues or if it cannot
359 * report queue statistics. */
360 int (*class_dump_stats)(const struct netdev *netdev,
361 const struct ofpbuf *nlmsg,
362 netdev_dump_queue_stats_cb *cb, void *aux);
/* Initializes the generic part of 'tc': binds it to 'ops' and creates the
 * (initially empty) map of "struct tc_queue"s.
 * NOTE(review): this excerpt is a sampled extraction; braces and some
 * statements of this helper are not visible here. */
366 tc_init(struct tc *tc, const struct tc_ops *ops)
369 hmap_init(&tc->queues);
/* Releases the generic part of 'tc' (the queues hmap).  Any per-implementation
 * state is torn down by the subclass's own tc_destroy callback, not here. */
373 tc_destroy(struct tc *tc)
375 hmap_destroy(&tc->queues);
378 static const struct tc_ops tc_ops_htb;
379 static const struct tc_ops tc_ops_hfsc;
380 static const struct tc_ops tc_ops_codel;
381 static const struct tc_ops tc_ops_fqcodel;
382 static const struct tc_ops tc_ops_sfq;
383 static const struct tc_ops tc_ops_default;
384 static const struct tc_ops tc_ops_other;
386 static const struct tc_ops *const tcs[] = {
387 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
388 &tc_ops_hfsc, /* Hierarchical fair service curve. */
389 &tc_ops_codel, /* Controlled delay */
390 &tc_ops_fqcodel, /* Fair queue controlled delay */
391 &tc_ops_sfq, /* Stochastic fair queueing */
392 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
393 &tc_ops_other, /* Some other qdisc. */
397 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
398 static unsigned int tc_get_major(unsigned int handle);
399 static unsigned int tc_get_minor(unsigned int handle);
401 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
402 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
403 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
405 static struct tcmsg *tc_make_request(const struct netdev *, int type,
406 unsigned int flags, struct ofpbuf *);
407 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
408 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
409 static int tc_add_policer(struct netdev *,
410 uint32_t kbits_rate, uint32_t kbits_burst);
412 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
413 struct nlattr **options);
414 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
415 struct nlattr **options,
416 struct netdev_queue_stats *);
417 static int tc_query_class(const struct netdev *,
418 unsigned int handle, unsigned int parent,
419 struct ofpbuf **replyp);
420 static int tc_delete_class(const struct netdev *, unsigned int handle);
422 static int tc_del_qdisc(struct netdev *netdev);
423 static int tc_query_qdisc(const struct netdev *netdev);
425 static int tc_calc_cell_log(unsigned int mtu);
426 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
427 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
428 const struct tc_ratespec *rate);
429 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
/* Per-device state shared by the Linux "system", "internal" and "tap" netdev
 * classes.  NOTE(review): several field lines are missing from this
 * extraction (e.g. the tap fd, 'tc' pointer, mtu/ifindex fields that other
 * functions in this file reference). */
431 struct netdev_linux {
434 /* Protects all members below. */
435 struct ovs_mutex mutex;
/* Bitmap of VALID_* flags; guards the "on demand" cached fields below. */
437 unsigned int cache_valid;
439 bool miimon; /* Link status of last poll. */
440 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
441 struct timer miimon_timer;
443 /* The following are figured out "on demand" only. They are only valid
444 * when the corresponding VALID_* bit in 'cache_valid' is set. */
446 uint8_t etheraddr[ETH_ADDR_LEN];
447 struct in_addr address, netmask;
450 unsigned int ifi_flags;
451 long long int carrier_resets;
452 uint32_t kbits_rate; /* Policing data. */
453 uint32_t kbits_burst;
454 int vport_stats_error; /* Cached error code from vport_get_stats().
455 0 or an errno value. */
456 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
457 int ether_addr_error; /* Cached error code from set/get etheraddr. */
458 int netdev_policing_error; /* Cached error code from set policing. */
459 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
460 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
462 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
463 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
464 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
466 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
469 /* For devices of class netdev_tap_class only. */
/* Per-rxq state: a tap fd or a bound AF_PACKET socket (see rxq_construct). */
473 struct netdev_rxq_linux {
474 struct netdev_rxq up;
479 /* This is set pretty low because we probably won't learn anything from the
480 * additional log messages. */
481 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
483 /* Polling miimon status for all ports causes performance degradation when
484 * handling a large number of ports. If there are no devices using miimon, then
485 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
487 * Readers do not depend on this variable synchronizing with the related
488 * changes in the device miimon status, so we can use atomic_count. */
489 static atomic_count miimon_cnt = ATOMIC_COUNT_INIT(0);
491 static void netdev_linux_run(void);
493 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
494 int cmd, const char *cmd_name);
495 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
496 int cmd, const char *cmd_name);
497 static int get_flags(const struct netdev *, unsigned int *flags);
498 static int set_flags(const char *, unsigned int flags);
499 static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
500 enum netdev_flags on, enum netdev_flags *old_flagsp)
501 OVS_REQUIRES(netdev->mutex);
502 static int do_get_ifindex(const char *netdev_name);
503 static int get_ifindex(const struct netdev *, int *ifindexp);
504 static int do_set_addr(struct netdev *netdev,
505 int ioctl_nr, const char *ioctl_name,
506 struct in_addr addr);
507 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
508 static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
509 static int get_stats_via_netlink(const struct netdev *, struct netdev_stats *);
510 static int af_packet_sock(void);
511 static bool netdev_linux_miimon_enabled(void);
512 static void netdev_linux_miimon_run(void);
513 static void netdev_linux_miimon_wait(void);
514 static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup);
/* Returns true if 'netdev_class' is one of the classes implemented in this
 * file, identified by its 'run' callback being netdev_linux_run. */
517 is_netdev_linux_class(const struct netdev_class *netdev_class)
519 return netdev_class->run == netdev_linux_run;
/* Returns true if 'netdev' is specifically a tap device (shares a single fd
 * across all readers; see netdev_linux_construct_tap). */
523 is_tap_netdev(const struct netdev *netdev)
525 return netdev_get_class(netdev) == &netdev_tap_class;
/* Downcasts 'netdev' to its netdev_linux container.  Asserts that the class
 * really is one of ours first. */
528 static struct netdev_linux *
529 netdev_linux_cast(const struct netdev *netdev)
531 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
533 return CONTAINER_OF(netdev, struct netdev_linux, up);
/* Downcasts 'rx' to its netdev_rxq_linux container, with the same class
 * sanity check. */
536 static struct netdev_rxq_linux *
537 netdev_rxq_linux_cast(const struct netdev_rxq *rx)
539 ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
540 return CONTAINER_OF(rx, struct netdev_rxq_linux, up);
543 static void netdev_linux_update(struct netdev_linux *netdev,
544 const struct rtnetlink_change *)
545 OVS_REQUIRES(netdev->mutex);
546 static void netdev_linux_changed(struct netdev_linux *netdev,
547 unsigned int ifi_flags, unsigned int mask)
548 OVS_REQUIRES(netdev->mutex);
550 /* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK,
551 * RTNLGRP_IPV4_IFADDR and RTNLGRP_IPV6_IFADDR changes, or NULL
552 * if no such socket could be created. */
553 static struct nl_sock *
554 netdev_linux_notify_sock(void)
/* 'sock' is created once, the first time any thread calls this; thereafter
 * the same (possibly NULL) pointer is returned to every caller. */
556 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
557 static struct nl_sock *sock;
558 unsigned int mcgroups[3] = {RTNLGRP_LINK, RTNLGRP_IPV4_IFADDR,
559 RTNLGRP_IPV6_IFADDR};
561 if (ovsthread_once_start(&once)) {
564 error = nl_sock_create(NETLINK_ROUTE, &sock);
/* Join each multicast group; on failure the socket is destroyed
 * (and presumably 'sock' reset to NULL in an elided line — TODO confirm). */
568 for (i = 0; i < ARRAY_SIZE(mcgroups); i++) {
569 error = nl_sock_join_mcgroup(sock, mcgroups[i]);
571 nl_sock_destroy(sock);
577 ovsthread_once_done(&once);
/* Returns true if at least one device currently uses miimon polling, so the
 * run/wait hooks know whether miimon work is needed at all. */
584 netdev_linux_miimon_enabled(void)
586 return atomic_count_get(&miimon_cnt) > 0;
/* Class 'run' callback: polls miimon (if enabled) and drains the shared
 * rtnetlink notification socket, pushing each parsed change into the matching
 * netdev_linux.  On netlink overrun (ENOBUFS) it conservatively refreshes
 * every device of this class, since notifications were lost. */
590 netdev_linux_run(void)
592 struct nl_sock *sock;
595 if (netdev_linux_miimon_enabled()) {
596 netdev_linux_miimon_run();
599 sock = netdev_linux_notify_sock();
605 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
606 uint64_t buf_stub[4096 / 8];
609 ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
610 error = nl_sock_recv(sock, &buf, false);
612 struct rtnetlink_change change;
614 if (rtnetlink_parse(&buf, &change)) {
/* Only touch devices owned by this file's classes. */
615 struct netdev *netdev_ = netdev_from_name(change.ifname);
616 if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
617 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
619 ovs_mutex_lock(&netdev->mutex);
620 netdev_linux_update(netdev, &change);
621 ovs_mutex_unlock(&netdev->mutex);
623 netdev_close(netdev_);
625 } else if (error == ENOBUFS) {
/* Kernel dropped notifications: re-read flags for every device and
 * invalidate all cached state (mask 0 in netdev_linux_changed). */
626 struct shash device_shash;
627 struct shash_node *node;
631 shash_init(&device_shash);
632 netdev_get_devices(&netdev_linux_class, &device_shash);
633 SHASH_FOR_EACH (node, &device_shash) {
634 struct netdev *netdev_ = node->data;
635 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
638 ovs_mutex_lock(&netdev->mutex);
639 get_flags(netdev_, &flags);
640 netdev_linux_changed(netdev, flags, 0);
641 ovs_mutex_unlock(&netdev->mutex);
643 netdev_close(netdev_);
645 shash_destroy(&device_shash);
646 } else if (error != EAGAIN) {
647 VLOG_WARN_RL(&rl, "error reading or parsing netlink (%s)",
648 ovs_strerror(error));
/* Class 'wait' callback: arranges for poll_block() to wake when miimon needs
 * another poll or when the rtnetlink notification socket becomes readable. */
655 netdev_linux_wait(void)
657 struct nl_sock *sock;
659 if (netdev_linux_miimon_enabled()) {
660 netdev_linux_miimon_wait();
662 sock = netdev_linux_notify_sock();
664 nl_sock_wait(sock, POLLIN);
/* Records a change on 'dev': bumps the netdev change sequence number, counts a
 * carrier reset if IFF_RUNNING toggled, stores the new interface flags, and
 * keeps only the 'cache_valid' bits named in 'mask' (so mask == 0 invalidates
 * everything).  Caller must hold dev->mutex. */
669 netdev_linux_changed(struct netdev_linux *dev,
670 unsigned int ifi_flags, unsigned int mask)
671 OVS_REQUIRES(dev->mutex)
673 netdev_change_seq_changed(&dev->up);
675 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
676 dev->carrier_resets++;
678 dev->ifi_flags = ifi_flags;
680 dev->cache_valid &= mask;
/* Applies one parsed rtnetlink 'change' to 'dev'.  RTM_NEWLINK refreshes the
 * cached MTU, Ethernet address and ifindex directly from the message (keeping
 * drvinfo/in4/in6 valid); any other link message just invalidates everything;
 * address-group messages invalidate only the cached in4/in6 addresses.
 * Caller must hold dev->mutex. */
684 netdev_linux_update(struct netdev_linux *dev,
685 const struct rtnetlink_change *change)
686 OVS_REQUIRES(dev->mutex)
688 if (rtnetlink_type_is_rtnlgrp_link(change->nlmsg_type)){
689 if (change->nlmsg_type == RTM_NEWLINK) {
690 /* Keep drv-info, in4, in6. */
691 netdev_linux_changed(dev, change->ifi_flags,
692 VALID_DRVINFO | VALID_IN4 | VALID_IN6);
694 /* Update netdev from rtnl-change msg. */
696 dev->mtu = change->mtu;
697 dev->cache_valid |= VALID_MTU;
698 dev->netdev_mtu_error = 0;
/* An all-zero address in the message means "not reported", so the
 * cached address is only replaced when one was actually carried. */
701 if (!eth_addr_is_zero(change->addr)) {
702 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
703 dev->cache_valid |= VALID_ETHERADDR;
704 dev->ether_addr_error = 0;
707 dev->ifindex = change->if_index;
708 dev->cache_valid |= VALID_IFINDEX;
709 dev->get_ifindex_error = 0;
711 netdev_linux_changed(dev, change->ifi_flags, 0);
713 } else if (rtnetlink_type_is_rtnlgrp_addr(change->nlmsg_type)) {
714 /* Invalidates in4, in6. */
715 netdev_linux_changed(dev, dev->ifi_flags,
716 ~(VALID_IN4 | VALID_IN6));
/* Class 'alloc' callback: zero-allocates a netdev_linux and returns its
 * embedded generic netdev (the '&netdev->up' return is elided here). */
722 static struct netdev *
723 netdev_linux_alloc(void)
725 struct netdev_linux *netdev = xzalloc(sizeof *netdev);
/* Construction shared by all three classes: currently just the mutex. */
730 netdev_linux_common_construct(struct netdev_linux *netdev)
732 ovs_mutex_init(&netdev->mutex);
735 /* Creates system and internal devices. */
737 netdev_linux_construct(struct netdev *netdev_)
739 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
742 netdev_linux_common_construct(netdev);
/* Probing the interface flags doubles as an existence check for the
 * kernel device. */
744 error = get_flags(&netdev->up, &netdev->ifi_flags);
745 if (error == ENODEV) {
746 if (netdev->up.netdev_class != &netdev_internal_class) {
747 /* The device does not exist, so don't allow it to be opened. */
750 /* "Internal" netdevs have to be created as netdev objects before
751 * they exist in the kernel, because creating them in the kernel
752 * happens by passing a netdev object to dpif_port_add().
753 * Therefore, ignore the error. */
760 /* For most types of netdevs we open the device for each call of
761 * netdev_open(). However, this is not the case with tap devices,
762 * since it is only possible to open the device once. In this
763 * situation we share a single file descriptor, and consequently
764 * buffers, across all readers. Therefore once data is read it will
765 * be unavailable to other reads for tap devices. */
767 netdev_linux_construct_tap(struct netdev *netdev_)
769 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
770 static const char tap_dev[] = "/dev/net/tun";
771 const char *name = netdev_->name;
775 netdev_linux_common_construct(netdev);
777 /* Open tap device. */
778 netdev->tap_fd = open(tap_dev, O_RDWR);
779 if (netdev->tap_fd < 0) {
781 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
785 /* Create tap device. */
/* IFF_NO_PI: no 4-byte packet-information header, so reads/writes carry
 * raw Ethernet frames. */
786 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
787 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
788 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
789 VLOG_WARN("%s: creating tap device failed: %s", name,
790 ovs_strerror(errno));
795 /* Make non-blocking. */
796 error = set_nonblocking(netdev->tap_fd);
/* Error path: drop the tap fd so a failed construct leaks nothing. */
804 close(netdev->tap_fd);
/* Class 'destruct' callback: tears down any installed traffic-control state,
 * closes the tap fd for tap devices, drops this device's contribution to the
 * global miimon counter, and destroys the mutex. */
809 netdev_linux_destruct(struct netdev *netdev_)
811 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
813 if (netdev->tc && netdev->tc->ops->tc_destroy) {
814 netdev->tc->ops->tc_destroy(netdev->tc);
817 if (netdev_get_class(netdev_) == &netdev_tap_class
818 && netdev->tap_fd >= 0)
820 close(netdev->tap_fd);
823 if (netdev->miimon_interval > 0) {
824 atomic_count_dec(&miimon_cnt);
827 ovs_mutex_destroy(&netdev->mutex);
/* Class 'dealloc' callback: frees the container allocated in _alloc(). */
831 netdev_linux_dealloc(struct netdev *netdev_)
833 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* rxq 'alloc' callback: zero-allocates the per-queue state. */
837 static struct netdev_rxq *
838 netdev_linux_rxq_alloc(void)
840 struct netdev_rxq_linux *rx = xzalloc(sizeof *rx);
/* rxq 'construct' callback.  For tap devices the queue simply borrows the
 * shared tap fd; for everything else it opens a raw AF_PACKET socket with
 * PACKET_AUXDATA enabled (for VLAN reconstruction in recv_sock), binds it to
 * this interface, and attaches a BPF filter that accepts only inbound
 * packets so locally-sent frames are not looped back. */
845 netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
847 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
848 struct netdev *netdev_ = rx->up.netdev;
849 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
852 ovs_mutex_lock(&netdev->mutex);
853 rx->is_tap = is_tap_netdev(netdev_);
855 rx->fd = netdev->tap_fd;
857 struct sockaddr_ll sll;
859 /* Result of tcpdump -dd inbound */
860 static const struct sock_filter filt[] = {
861 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
862 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
863 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
864 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
866 static const struct sock_fprog fprog = {
867 ARRAY_SIZE(filt), (struct sock_filter *) filt
870 /* Create file descriptor. */
871 rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
874 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
/* Request per-packet tpacket_auxdata so stripped VLAN tags can be
 * re-inserted by netdev_linux_rxq_recv_sock(). */
879 if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) {
881 VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
882 netdev_get_name(netdev_), ovs_strerror(error));
886 /* Set non-blocking mode. */
887 error = set_nonblocking(rx->fd);
892 /* Get ethernet device index. */
893 error = get_ifindex(&netdev->up, &ifindex);
898 /* Bind to specific ethernet device. */
899 memset(&sll, 0, sizeof sll);
900 sll.sll_family = AF_PACKET;
901 sll.sll_ifindex = ifindex;
902 sll.sll_protocol = htons(ETH_P_ALL);
903 if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
905 VLOG_ERR("%s: failed to bind raw socket (%s)",
906 netdev_get_name(netdev_), ovs_strerror(error));
910 /* Filter for only inbound packets. */
911 error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
915 VLOG_ERR("%s: failed to attach filter (%s)",
916 netdev_get_name(netdev_), ovs_strerror(error));
/* Success path unlock; the elided error path below closes rx->fd
 * before unlocking — TODO confirm against full source. */
920 ovs_mutex_unlock(&netdev->mutex);
928 ovs_mutex_unlock(&netdev->mutex);
/* rxq 'destruct' callback (fd close for the non-tap case is elided here). */
933 netdev_linux_rxq_destruct(struct netdev_rxq *rxq_)
935 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
/* rxq 'dealloc' callback: frees the container from rxq_alloc(). */
943 netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_)
945 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
/* Returns the TPID (in network byte order) to use when re-inserting the VLAN
 * tag described by 'aux'.  Kernels that report a TPID (3.13+) are trusted;
 * otherwise fall back to the standard 802.1Q ethertype. */
951 auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux)
953 if (aux->tp_status & TP_STATUS_VLAN_VALID) {
954 return htons(aux->tp_vlan_tpid);
956 return htons(ETH_TYPE_VLAN);
/* Returns true if 'aux' carries a VLAN TCI.  A nonzero TCI alone is accepted
 * because pre-3.0 kernels lack TP_STATUS_VLAN_VALID (a genuinely zero TCI on
 * such kernels is indistinguishable from "no VLAN"). */
961 auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
963 return aux->tp_vlan_tci || aux->tp_status & TP_STATUS_VLAN_VALID;
/* Receives one frame from AF_PACKET socket 'fd' into 'buffer', then walks the
 * ancillary data for a tpacket_auxdata record and, if the kernel stripped a
 * VLAN tag, pushes it back into the frame.  The elided lines return 0 on
 * success and a positive errno value (EAGAIN / EMSGSIZE / ...) on failure —
 * TODO confirm against full source. */
967 netdev_linux_rxq_recv_sock(int fd, struct dp_packet *buffer)
972 struct cmsghdr *cmsg;
975 char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
979 /* Reserve headroom for a single VLAN tag */
980 dp_packet_reserve(buffer, VLAN_HEADER_LEN);
981 size = dp_packet_tailroom(buffer);
983 iov.iov_base = dp_packet_data(buffer);
985 msgh.msg_name = NULL;
986 msgh.msg_namelen = 0;
989 msgh.msg_control = &cmsg_buffer;
990 msgh.msg_controllen = sizeof cmsg_buffer;
/* MSG_TRUNC makes recvmsg() report the full frame length even when it
 * exceeds 'size', so oversize frames can be detected below. */
994 retval = recvmsg(fd, &msgh, MSG_TRUNC);
995 } while (retval < 0 && errno == EINTR);
999 } else if (retval > size) {
1003 dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);
1005 for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg; cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
1006 const struct tpacket_auxdata *aux;
1008 if (cmsg->cmsg_level != SOL_PACKET
1009 || cmsg->cmsg_type != PACKET_AUXDATA
1010 || cmsg->cmsg_len < CMSG_LEN(sizeof(struct tpacket_auxdata))) {
1014 aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg));
1015 if (auxdata_has_vlan_tci(aux)) {
/* A frame shorter than an Ethernet header cannot take a tag. */
1016 if (retval < ETH_HEADER_LEN) {
1020 eth_push_vlan(buffer, auxdata_to_vlan_tpid(aux),
1021 htons(aux->tp_vlan_tci));
/* Receives one frame from tap fd 'fd' into 'buffer' with a plain read(),
 * retrying on EINTR; the elided branches map failures to a positive errno
 * value and oversize frames to EMSGSIZE, mirroring recv_sock above. */
1030 netdev_linux_rxq_recv_tap(int fd, struct dp_packet *buffer)
1033 size_t size = dp_packet_tailroom(buffer);
1036 retval = read(fd, dp_packet_data(buffer), size);
1037 } while (retval < 0 && errno == EINTR);
1041 } else if (retval > size) {
1045 dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);
/* rxq 'recv' callback: allocates a buffer sized for the device MTU plus a
 * VLAN Ethernet header, fills it via the tap or AF_PACKET helper, and on
 * success pads the frame and hands it to the caller in packets[0]. */
1050 netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet **packets,
1053 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1054 struct netdev *netdev = rx->up.netdev;
1055 struct dp_packet *buffer;
1059 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
1060 mtu = ETH_PAYLOAD_MAX;
1063 buffer = dp_packet_new_with_headroom(VLAN_ETH_HEADER_LEN + mtu,
1064 DP_NETDEV_HEADROOM);
1065 retval = (rx->is_tap
1066 ? netdev_linux_rxq_recv_tap(rx->fd, buffer)
1067 : netdev_linux_rxq_recv_sock(rx->fd, buffer));
/* EAGAIN (nothing to read) and EMSGSIZE (oversize frame) are expected
 * and not worth logging. */
1070 if (retval != EAGAIN && retval != EMSGSIZE) {
/* Fix: arguments were reversed relative to the "on %s: %s" format,
 * logging e.g. "on No such device: eth0".  Device name first. */
1071 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
1072 netdev_rxq_get_name(rxq_), ovs_strerror(errno));
1074 dp_packet_delete(buffer);
1076 dp_packet_pad(buffer);
1077 dp_packet_set_rss_hash(buffer, 0);
1078 packets[0] = buffer;
/* rxq 'wait' callback: wake poll_block() when the queue's fd is readable. */
1086 netdev_linux_rxq_wait(struct netdev_rxq *rxq_)
1088 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1089 poll_fd_wait(rx->fd, POLLIN);
/* rxq 'drain' callback.  For tap queues (the branch structure is partly
 * elided here) the tx-queue length bounds how many buffered packets to read
 * and discard; for sockets the kernel receive buffer is drained directly. */
1093 netdev_linux_rxq_drain(struct netdev_rxq *rxq_)
1095 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1098 int error = af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_), &ifr,
1099 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
1103 drain_fd(rx->fd, ifr.ifr_qlen);
1106 return drain_rcvbuf(rx->fd);
1110 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
1111 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
1112 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
1113 * the packet is too big or too small to transmit on the device.
1115 * The caller retains ownership of 'buffer' in all cases.
1117 * The kernel maintains a packet transmission queue, so the caller is not
1118 * expected to do additional queuing of packets. */
1120 netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED,
1121 struct dp_packet **pkts, int cnt, bool may_steal)
1126 /* 'i' is incremented only if there's no error */
1127 for (i = 0; i < cnt;) {
1128 const void *data = dp_packet_data(pkts[i]);
1129 size_t size = dp_packet_size(pkts[i]);
1132 if (!is_tap_netdev(netdev_)) {
1133 /* Use our AF_PACKET socket to send to this device. */
1134 struct sockaddr_ll sll;
1140 sock = af_packet_sock();
1145 ifindex = netdev_get_ifindex(netdev_);
1150 /* We don't bother setting most fields in sockaddr_ll because the
1151 * kernel ignores them for SOCK_RAW. */
1152 memset(&sll, 0, sizeof sll);
1153 sll.sll_family = AF_PACKET;
1154 sll.sll_ifindex = ifindex;
1156 iov.iov_base = CONST_CAST(void *, data);
1159 msg.msg_name = &sll;
1160 msg.msg_namelen = sizeof sll;
1163 msg.msg_control = NULL;
1164 msg.msg_controllen = 0;
1167 retval = sendmsg(sock, &msg, 0);
1169 /* Use the tap fd to send to this device. This is essential for
1170 * tap devices, because packets sent to a tap device with an
1171 * AF_PACKET socket will loop back to be *received* again on the
1172 * tap device. This doesn't occur on other interface types
1173 * because we attach a socket filter to the rx socket. */
1174 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1176 retval = write(netdev->tap_fd, data, size);
1180 /* The Linux AF_PACKET implementation never blocks waiting for room
1181 * for packets, instead returning ENOBUFS. Translate this into
1182 * EAGAIN for the caller. */
1183 error = errno == ENOBUFS ? EAGAIN : errno;
1184 if (error == EINTR) {
1185 /* continue without incrementing 'i', i.e. retry this packet */
1189 } else if (retval != size) {
/* NOTE(review): 'retval' comes from sendmsg()/write() (ssize_t) but is
 * printed with %PRIuSIZE (size_t) below — a printf format/type mismatch
 * that is UB per C11; the declaration line is elided here, so verify
 * 'retval''s type and cast it (e.g. "(size_t) retval") in full source. */
1190 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE" bytes"
1191 " of %"PRIuSIZE") on %s", retval, size,
1192 netdev_get_name(netdev_));
1197 /* Process the next packet in the batch */
/* With may_steal, ownership of the batch transfers here and every packet
 * is freed after the send loop completes. */
1202 for (i = 0; i < cnt; i++) {
1203 dp_packet_delete(pkts[i]);
1207 if (error && error != EAGAIN) {
1208 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
1209 netdev_get_name(netdev_), ovs_strerror(error));
1216 /* Registers with the poll loop to wake up from the next call to poll_block()
1217 * when the packet transmission queue has sufficient room to transmit a packet
1218 * with netdev_send().
1220 * The kernel maintains a packet transmission queue, so the client is not
1221 * expected to do additional queuing of packets. Thus, this function is
1222 * unlikely to ever be used. It is included for completeness. */
1224 netdev_linux_send_wait(struct netdev *netdev, int qid OVS_UNUSED)
/* Tap writes go to the tap fd, which never applies backpressure here, so
 * the poll loop can wake immediately. */
1226 if (is_tap_netdev(netdev)) {
1227 /* TAP device always accepts packets.*/
1228 poll_immediate_wake();
1232 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1233 * otherwise a positive errno value. */
1235 netdev_linux_set_etheraddr(struct netdev *netdev_,
1236 const uint8_t mac[ETH_ADDR_LEN])
1238 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1239 enum netdev_flags old_flags = 0;
1242 ovs_mutex_lock(&netdev->mutex);
/* If the cached address already equals 'mac' (or a prior attempt hit a
 * persistent error), skip the ioctl; otherwise invalidate the cache. */
1244 if (netdev->cache_valid & VALID_ETHERADDR) {
1245 error = netdev->ether_addr_error;
1246 if (error || eth_addr_equals(netdev->etheraddr, mac)) {
1249 netdev->cache_valid &= ~VALID_ETHERADDR;
1252 /* Tap devices must be brought down before setting the address. */
1253 if (is_tap_netdev(netdev_)) {
1254 update_flags(netdev, NETDEV_UP, 0, &old_flags);
1256 error = set_etheraddr(netdev_get_name(netdev_), mac);
/* Cache success and ENODEV (device gone won't recover by retrying);
 * other errors leave the cache invalid so the next call retries. */
1257 if (!error || error == ENODEV) {
1258 netdev->ether_addr_error = error;
1259 netdev->cache_valid |= VALID_ETHERADDR;
1261 memcpy(netdev->etheraddr, mac, ETH_ADDR_LEN);
/* Restore the tap device's UP flag if we cleared it above. */
1265 if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
1266 update_flags(netdev, 0, NETDEV_UP, &old_flags);
1270 ovs_mutex_unlock(&netdev->mutex);
1274 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1276 netdev_linux_get_etheraddr(const struct netdev *netdev_,
1277 uint8_t mac[ETH_ADDR_LEN])
1279 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1282 ovs_mutex_lock(&netdev->mutex);
/* Populate the cache on first use; subsequent calls reuse the stored
 * address and error code without re-querying the kernel. */
1283 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1284 netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
1286 netdev->cache_valid |= VALID_ETHERADDR;
1289 error = netdev->ether_addr_error;
1291 memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN);
1293 ovs_mutex_unlock(&netdev->mutex);
/* Reads 'netdev''s MTU via the SIOCGIFMTU ioctl into '*mtup', caching both
 * the value and any error. Caller must hold 'netdev->mutex'. */
1299 netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
1303 if (!(netdev->cache_valid & VALID_MTU)) {
1306 netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
1307 netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
1308 netdev->mtu = ifr.ifr_mtu;
1309 netdev->cache_valid |= VALID_MTU;
1312 error = netdev->netdev_mtu_error;
1314 *mtup = netdev->mtu;
1320 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1321 * in bytes, not including the hardware header; thus, this is typically 1500
1322 * bytes for Ethernet devices. */
1324 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1326 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* Thin locked wrapper around netdev_linux_get_mtu__(). */
1329 ovs_mutex_lock(&netdev->mutex);
1330 error = netdev_linux_get_mtu__(netdev, mtup);
1331 ovs_mutex_unlock(&netdev->mutex);
1336 /* Sets the maximum size of transmitted (MTU) for given device using linux
1337 * networking ioctl interface.
1340 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1342 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1346 ovs_mutex_lock(&netdev->mutex);
/* Skip the ioctl when the cached MTU already matches 'mtu' or a previous
 * attempt recorded a persistent error. */
1347 if (netdev->cache_valid & VALID_MTU) {
1348 error = netdev->netdev_mtu_error;
1349 if (error || netdev->mtu == mtu) {
1352 netdev->cache_valid &= ~VALID_MTU;
1355 error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
1356 SIOCSIFMTU, "SIOCSIFMTU");
/* Cache success or ENODEV; other errors may be transient, so leave the
 * cache invalid and retry on the next call. */
1357 if (!error || error == ENODEV) {
1358 netdev->netdev_mtu_error = error;
1359 netdev->mtu = ifr.ifr_mtu;
1360 netdev->cache_valid |= VALID_MTU;
1363 ovs_mutex_unlock(&netdev->mutex);
1367 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1368 * On failure, returns a negative errno value. */
1370 netdev_linux_get_ifindex(const struct netdev *netdev_)
1372 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1375 ovs_mutex_lock(&netdev->mutex);
1376 error = get_ifindex(netdev_, &ifindex);
1377 ovs_mutex_unlock(&netdev->mutex);
/* Errors are encoded as negative so the positive range carries the
 * ifindex itself. */
1379 return error ? -error : ifindex;
/* Reports link state in '*carrier': the miimon poll result when miimon is
 * enabled, otherwise the kernel's IFF_RUNNING flag. */
1383 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1385 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1387 ovs_mutex_lock(&netdev->mutex);
1388 if (netdev->miimon_interval > 0) {
1389 *carrier = netdev->miimon;
1391 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1393 ovs_mutex_unlock(&netdev->mutex);
/* Returns the number of carrier (link-state) transitions recorded for
 * 'netdev_'. */
1398 static long long int
1399 netdev_linux_get_carrier_resets(const struct netdev *netdev_)
1401 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1402 long long int carrier_resets;
1404 ovs_mutex_lock(&netdev->mutex);
1405 carrier_resets = netdev->carrier_resets;
1406 ovs_mutex_unlock(&netdev->mutex);
1408 return carrier_resets;
/* Issues MII ioctl 'cmd' for device 'name', copying '*data' into and back
 * out of the ifreq. Returns 0 on success or a positive errno value. */
1412 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1413 struct mii_ioctl_data *data)
1418 memset(&ifr, 0, sizeof ifr);
/* The mii_ioctl_data struct is copied over ifr.ifr_data itself (not
 * pointed to) -- the MII ioctls expect the data to overlay the ifreq
 * union rather than be referenced through a pointer. */
1419 memcpy(&ifr.ifr_data, data, sizeof *data);
1420 error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
1421 memcpy(data, &ifr.ifr_data, sizeof *data);
/* Determines link status for device 'name', preferring MII registers
 * (SIOCGMIIPHY + SIOCGMIIREG) and falling back to ethtool ETHTOOL_GLINK.
 * On success stores the result in '*miimon'. */
1427 netdev_linux_get_miimon(const char *name, bool *miimon)
1429 struct mii_ioctl_data data;
1434 memset(&data, 0, sizeof data);
1435 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1437 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1438 data.reg_num = MII_BMSR;
1439 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
/* BMSR_LSTATUS in the basic-mode status register is the link-up bit. */
1443 *miimon = !!(data.val_out & BMSR_LSTATUS);
1445 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1448 struct ethtool_cmd ecmd;
1450 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1453 COVERAGE_INC(netdev_get_ethtool);
1454 memset(&ecmd, 0, sizeof ecmd);
1455 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1458 struct ethtool_value eval;
/* ETHTOOL_GLINK replies with a struct ethtool_value laid over the
 * start of the ethtool_cmd buffer; extract it by copy. */
1460 memcpy(&eval, &ecmd, sizeof eval);
1461 *miimon = !!eval.data;
1463 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
/* Enables (interval > 0) or disables (interval == 0) periodic miimon link
 * polling for 'netdev_'. Positive intervals are clamped to at least
 * 100 ms. */
1471 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1472 long long int interval)
1474 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1476 ovs_mutex_lock(&netdev->mutex);
1477 interval = interval > 0 ? MAX(interval, 100) : 0;
1478 if (netdev->miimon_interval != interval) {
/* 'miimon_cnt' counts devices with miimon enabled; only adjust it on
 * enable/disable transitions, not on interval changes. */
1479 if (interval && !netdev->miimon_interval) {
1480 atomic_count_inc(&miimon_cnt);
1481 } else if (!interval && netdev->miimon_interval) {
1482 atomic_count_dec(&miimon_cnt);
1485 netdev->miimon_interval = interval;
/* Force the timer expired so the new interval takes effect on the
 * next netdev_linux_miimon_run() pass. */
1486 timer_set_expired(&netdev->miimon_timer);
1488 ovs_mutex_unlock(&netdev->mutex);
/* Polls link state for every netdev-linux device whose miimon timer has
 * expired, notifying change listeners on link transitions. */
1494 netdev_linux_miimon_run(void)
1496 struct shash device_shash;
1497 struct shash_node *node;
1499 shash_init(&device_shash);
1500 netdev_get_devices(&netdev_linux_class, &device_shash);
1501 SHASH_FOR_EACH (node, &device_shash) {
1502 struct netdev *netdev = node->data;
1503 struct netdev_linux *dev = netdev_linux_cast(netdev);
1506 ovs_mutex_lock(&dev->mutex);
1507 if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
1508 netdev_linux_get_miimon(dev->up.name, &miimon);
/* Only signal a change when the link state actually flipped. */
1509 if (miimon != dev->miimon) {
1510 dev->miimon = miimon;
1511 netdev_linux_changed(dev, dev->ifi_flags, 0);
/* Re-arm the timer for the next poll. */
1514 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1516 ovs_mutex_unlock(&dev->mutex);
/* Drop the reference obtained through netdev_get_devices(). */
1517 netdev_close(netdev);
1520 shash_destroy(&device_shash);
/* Registers with the poll loop to wake up when any miimon-enabled device's
 * poll timer will next expire. */
1524 netdev_linux_miimon_wait(void)
1526 struct shash device_shash;
1527 struct shash_node *node;
1529 shash_init(&device_shash);
1530 netdev_get_devices(&netdev_linux_class, &device_shash);
1531 SHASH_FOR_EACH (node, &device_shash) {
1532 struct netdev *netdev = node->data;
1533 struct netdev_linux *dev = netdev_linux_cast(netdev);
1535 ovs_mutex_lock(&dev->mutex);
1536 if (dev->miimon_interval > 0) {
1537 timer_wait(&dev->miimon_timer);
1539 ovs_mutex_unlock(&dev->mutex);
/* Drop the reference obtained through netdev_get_devices(). */
1540 netdev_close(netdev);
1542 shash_destroy(&device_shash);
/* Exchanges the values of '*a' and '*b'. */
1546 swap_uint64(uint64_t *a, uint64_t *b)
1553 /* Copies 'src' into 'dst', performing format conversion in the process.
1555 * 'src' is allowed to be misaligned. */
1557 netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1558 const struct ovs_vport_stats *src)
/* get_32aligned_u64() safely reads 64-bit fields that may be only 32-bit
 * aligned, per the misalignment note above. */
1560 dst->rx_packets = get_32aligned_u64(&src->rx_packets);
1561 dst->tx_packets = get_32aligned_u64(&src->tx_packets);
1562 dst->rx_bytes = get_32aligned_u64(&src->rx_bytes);
1563 dst->tx_bytes = get_32aligned_u64(&src->tx_bytes);
1564 dst->rx_errors = get_32aligned_u64(&src->rx_errors);
1565 dst->tx_errors = get_32aligned_u64(&src->tx_errors);
1566 dst->rx_dropped = get_32aligned_u64(&src->rx_dropped);
1567 dst->tx_dropped = get_32aligned_u64(&src->tx_dropped);
/* ovs_vport_stats carries no equivalents for the remaining counters,
 * so zero them rather than leave them uninitialized. */
1569 dst->collisions = 0;
1570 dst->rx_length_errors = 0;
1571 dst->rx_over_errors = 0;
1572 dst->rx_crc_errors = 0;
1573 dst->rx_frame_errors = 0;
1574 dst->rx_fifo_errors = 0;
1575 dst->rx_missed_errors = 0;
1576 dst->tx_aborted_errors = 0;
1577 dst->tx_carrier_errors = 0;
1578 dst->tx_fifo_errors = 0;
1579 dst->tx_heartbeat_errors = 0;
1580 dst->tx_window_errors = 0;
/* Fetches vport-layer stats for 'netdev' from the datapath via netlink and
 * converts them into '*stats'. Returns 0 on success or a positive errno
 * value (including when the vport reports no stats). */
1584 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1586 struct dpif_netlink_vport reply;
1590 error = dpif_netlink_vport_get(netdev_get_name(netdev), &reply, &buf);
1593 } else if (!reply.stats) {
1598 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
/* Fills '*stats' from the vport layer unless a previous attempt recorded a
 * cached error. Records the outcome in 'netdev->vport_stats_error'.
 * Caller must hold 'netdev->mutex'. */
1606 get_stats_via_vport(const struct netdev *netdev_,
1607 struct netdev_stats *stats)
1609 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1611 if (!netdev->vport_stats_error ||
1612 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1615 error = get_stats_via_vport__(netdev_, stats);
/* ENOENT/ENODEV just mean the device is not an OVS vport, which is
 * routine and not worth warning about. */
1616 if (error && error != ENOENT && error != ENODEV) {
1617 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1619 netdev_get_name(netdev_), ovs_strerror(error));
1621 netdev->vport_stats_error = error;
1622 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
1626 /* Retrieves current device stats for 'netdev-linux'. */
1628 netdev_linux_get_stats(const struct netdev *netdev_,
1629 struct netdev_stats *stats)
1631 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1632 struct netdev_stats dev_stats;
1635 ovs_mutex_lock(&netdev->mutex);
/* Collect both vport-layer stats (into 'stats') and kernel netdev stats
 * via netlink (into 'dev_stats'); which source wins for each counter is
 * decided below. */
1636 get_stats_via_vport(netdev_, stats);
1637 error = get_stats_via_netlink(netdev_, &dev_stats);
1639 if (!netdev->vport_stats_error) {
1642 } else if (netdev->vport_stats_error) {
1643 /* stats not available from OVS then use netdev stats. */
1646 /* Use kernel netdev's packet and byte counts since vport's counters
1647 * do not reflect packet counts on the wire when GSO, TSO or GRO are
1649 stats->rx_packets = dev_stats.rx_packets;
1650 stats->rx_bytes = dev_stats.rx_bytes;
1651 stats->tx_packets = dev_stats.tx_packets;
1652 stats->tx_bytes = dev_stats.tx_bytes;
/* Error/drop counters are accumulated on top of the vport values. */
1654 stats->rx_errors += dev_stats.rx_errors;
1655 stats->tx_errors += dev_stats.tx_errors;
1656 stats->rx_dropped += dev_stats.rx_dropped;
1657 stats->tx_dropped += dev_stats.tx_dropped;
1658 stats->multicast += dev_stats.multicast;
1659 stats->collisions += dev_stats.collisions;
1660 stats->rx_length_errors += dev_stats.rx_length_errors;
1661 stats->rx_over_errors += dev_stats.rx_over_errors;
1662 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1663 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1664 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1665 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1666 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1667 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1668 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1669 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1670 stats->tx_window_errors += dev_stats.tx_window_errors;
1672 ovs_mutex_unlock(&netdev->mutex);
1677 /* Retrieves current device stats for 'netdev-tap' netdev or
1678 * netdev-internal. */
1680 netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
1682 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1683 struct netdev_stats dev_stats;
1686 ovs_mutex_lock(&netdev->mutex);
/* Collect vport stats into 'stats' and kernel netdev stats into
 * 'dev_stats'; the merge below depends on whether vport stats were
 * available. */
1687 get_stats_via_vport(netdev_, stats);
1688 error = get_stats_via_netlink(netdev_, &dev_stats);
1690 if (!netdev->vport_stats_error) {
1693 } else if (netdev->vport_stats_error) {
1694 /* Transmit and receive stats will appear to be swapped relative to the
1695 * other ports since we are the one sending the data, not a remote
1696 * computer. For consistency, we swap them back here. This does not
1697 * apply if we are getting stats from the vport layer because it always
1698 * tracks stats from the perspective of the switch. */
1701 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1702 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1703 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1704 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
/* The remaining error classes are meaningless for a tap/internal
 * port; zero them. */
1705 stats->rx_length_errors = 0;
1706 stats->rx_over_errors = 0;
1707 stats->rx_crc_errors = 0;
1708 stats->rx_frame_errors = 0;
1709 stats->rx_fifo_errors = 0;
1710 stats->rx_missed_errors = 0;
1711 stats->tx_aborted_errors = 0;
1712 stats->tx_carrier_errors = 0;
1713 stats->tx_fifo_errors = 0;
1714 stats->tx_heartbeat_errors = 0;
1715 stats->tx_window_errors = 0;
1717 /* Use kernel netdev's packet and byte counts since vport counters
1718 * do not reflect packet counts on the wire when GSO, TSO or GRO
/* Note the deliberate rx<->tx crossover: kernel counters are from
 * the host's perspective, ours from the switch's. */
1720 stats->rx_packets = dev_stats.tx_packets;
1721 stats->rx_bytes = dev_stats.tx_bytes;
1722 stats->tx_packets = dev_stats.rx_packets;
1723 stats->tx_bytes = dev_stats.rx_bytes;
1725 stats->rx_dropped += dev_stats.tx_dropped;
1726 stats->tx_dropped += dev_stats.rx_dropped;
1728 stats->rx_errors += dev_stats.tx_errors;
1729 stats->tx_errors += dev_stats.rx_errors;
1731 stats->multicast += dev_stats.multicast;
1732 stats->collisions += dev_stats.collisions;
1734 ovs_mutex_unlock(&netdev->mutex);
/* Retrieves stats for an internal device; these come solely from the vport
 * layer, so the cached vport error is the result. */
1740 netdev_internal_get_stats(const struct netdev *netdev_,
1741 struct netdev_stats *stats)
1743 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1746 ovs_mutex_lock(&netdev->mutex);
1747 get_stats_via_vport(netdev_, stats);
1748 error = netdev->vport_stats_error;
1749 ovs_mutex_unlock(&netdev->mutex);
/* Queries 'netdev''s link features via the ETHTOOL_GSET ioctl and caches
 * the supported/advertised/current NETDEV_F_* bitmaps, along with any
 * error. Caller must hold 'netdev->mutex'; results are cached under
 * VALID_FEATURES. */
1755 netdev_linux_read_features(struct netdev_linux *netdev)
1757 struct ethtool_cmd ecmd;
/* Use the cached result if a previous call already populated it. */
1761 if (netdev->cache_valid & VALID_FEATURES) {
1765 COVERAGE_INC(netdev_get_ethtool);
1766 memset(&ecmd, 0, sizeof ecmd);
1767 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
1768 ETHTOOL_GSET, "ETHTOOL_GSET");
1773 /* Supported features. */
1774 netdev->supported = 0;
1775 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1776 netdev->supported |= NETDEV_F_10MB_HD;
1778 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1779 netdev->supported |= NETDEV_F_10MB_FD;
1781 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1782 netdev->supported |= NETDEV_F_100MB_HD;
1784 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1785 netdev->supported |= NETDEV_F_100MB_FD;
1787 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1788 netdev->supported |= NETDEV_F_1GB_HD;
1790 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1791 netdev->supported |= NETDEV_F_1GB_FD;
1793 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1794 netdev->supported |= NETDEV_F_10GB_FD;
1796 if (ecmd.supported & SUPPORTED_TP) {
1797 netdev->supported |= NETDEV_F_COPPER;
1799 if (ecmd.supported & SUPPORTED_FIBRE) {
1800 netdev->supported |= NETDEV_F_FIBER;
1802 if (ecmd.supported & SUPPORTED_Autoneg) {
1803 netdev->supported |= NETDEV_F_AUTONEG;
1805 if (ecmd.supported & SUPPORTED_Pause) {
1806 netdev->supported |= NETDEV_F_PAUSE;
1808 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1809 netdev->supported |= NETDEV_F_PAUSE_ASYM;
1812 /* Advertised features. */
1813 netdev->advertised = 0;
1814 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1815 netdev->advertised |= NETDEV_F_10MB_HD;
1817 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1818 netdev->advertised |= NETDEV_F_10MB_FD;
1820 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1821 netdev->advertised |= NETDEV_F_100MB_HD;
1823 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1824 netdev->advertised |= NETDEV_F_100MB_FD;
1826 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1827 netdev->advertised |= NETDEV_F_1GB_HD;
1829 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1830 netdev->advertised |= NETDEV_F_1GB_FD;
1832 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1833 netdev->advertised |= NETDEV_F_10GB_FD;
1835 if (ecmd.advertising & ADVERTISED_TP) {
1836 netdev->advertised |= NETDEV_F_COPPER;
1838 if (ecmd.advertising & ADVERTISED_FIBRE) {
1839 netdev->advertised |= NETDEV_F_FIBER;
1841 if (ecmd.advertising & ADVERTISED_Autoneg) {
1842 netdev->advertised |= NETDEV_F_AUTONEG;
1844 if (ecmd.advertising & ADVERTISED_Pause) {
1845 netdev->advertised |= NETDEV_F_PAUSE;
1847 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1848 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
1851 /* Current settings. */
1852 if (speed == SPEED_10) {
1853 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1855 } else if (speed == SPEED_100) {
1856 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1857 } else if (speed == SPEED_1000) {
1858 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1859 } else if (speed == SPEED_10000) {
1860 netdev->current = NETDEV_F_10GB_FD;
/* NOTE(review): raw numeric speeds are compared here instead of
 * SPEED_40000/SPEED_100000 macros -- presumably because older kernel
 * headers lack them; confirm before "modernizing". */
1861 } else if (speed == 40000) {
1862 netdev->current = NETDEV_F_40GB_FD;
1863 } else if (speed == 100000) {
1864 netdev->current = NETDEV_F_100GB_FD;
1865 } else if (speed == 1000000) {
1866 netdev->current = NETDEV_F_1TB_FD;
1868 netdev->current = 0;
1871 if (ecmd.port == PORT_TP) {
1872 netdev->current |= NETDEV_F_COPPER;
1873 } else if (ecmd.port == PORT_FIBRE) {
1874 netdev->current |= NETDEV_F_FIBER;
1878 netdev->current |= NETDEV_F_AUTONEG;
/* Cache both the bitmaps and the error so callers can replay either. */
1882 netdev->cache_valid |= VALID_FEATURES;
1883 netdev->get_features_error = error;
1886 /* Stores the features supported by 'netdev' into of '*current', '*advertised',
1887 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
1888 * Returns 0 if successful, otherwise a positive errno value. */
1890 netdev_linux_get_features(const struct netdev *netdev_,
1891 enum netdev_features *current,
1892 enum netdev_features *advertised,
1893 enum netdev_features *supported,
1894 enum netdev_features *peer)
1896 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1899 ovs_mutex_lock(&netdev->mutex);
/* Populate (or reuse) the cached ethtool feature bitmaps. */
1900 netdev_linux_read_features(netdev);
1901 if (!netdev->get_features_error) {
1902 *current = netdev->current;
1903 *advertised = netdev->advertised;
1904 *supported = netdev->supported;
/* Peer advertisements are not gathered by this implementation. */
1905 *peer = 0; /* XXX */
1907 error = netdev->get_features_error;
1908 ovs_mutex_unlock(&netdev->mutex);
1913 /* Set the features advertised by 'netdev' to 'advertise'. */
1915 netdev_linux_set_advertisements(struct netdev *netdev_,
1916 enum netdev_features advertise)
1918 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1919 struct ethtool_cmd ecmd;
1922 ovs_mutex_lock(&netdev->mutex);
/* Read current settings first so that fields other than 'advertising'
 * are preserved by the later ETHTOOL_SSET. */
1924 COVERAGE_INC(netdev_get_ethtool);
1925 memset(&ecmd, 0, sizeof ecmd);
1926 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1927 ETHTOOL_GSET, "ETHTOOL_GSET");
/* Translate NETDEV_F_* bits into the kernel's ADVERTISED_* bits. */
1932 ecmd.advertising = 0;
1933 if (advertise & NETDEV_F_10MB_HD) {
1934 ecmd.advertising |= ADVERTISED_10baseT_Half;
1936 if (advertise & NETDEV_F_10MB_FD) {
1937 ecmd.advertising |= ADVERTISED_10baseT_Full;
1939 if (advertise & NETDEV_F_100MB_HD) {
1940 ecmd.advertising |= ADVERTISED_100baseT_Half;
1942 if (advertise & NETDEV_F_100MB_FD) {
1943 ecmd.advertising |= ADVERTISED_100baseT_Full;
1945 if (advertise & NETDEV_F_1GB_HD) {
1946 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1948 if (advertise & NETDEV_F_1GB_FD) {
1949 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1951 if (advertise & NETDEV_F_10GB_FD) {
1952 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1954 if (advertise & NETDEV_F_COPPER) {
1955 ecmd.advertising |= ADVERTISED_TP;
1957 if (advertise & NETDEV_F_FIBER) {
1958 ecmd.advertising |= ADVERTISED_FIBRE;
1960 if (advertise & NETDEV_F_AUTONEG) {
1961 ecmd.advertising |= ADVERTISED_Autoneg;
1963 if (advertise & NETDEV_F_PAUSE) {
1964 ecmd.advertising |= ADVERTISED_Pause;
1966 if (advertise & NETDEV_F_PAUSE_ASYM) {
1967 ecmd.advertising |= ADVERTISED_Asym_Pause;
1969 COVERAGE_INC(netdev_set_ethtool);
1970 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1971 ETHTOOL_SSET, "ETHTOOL_SSET");
1974 ovs_mutex_unlock(&netdev->mutex);
1978 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1979 * successful, otherwise a positive errno value. */
1981 netdev_linux_set_policing(struct netdev *netdev_,
1982 uint32_t kbits_rate, uint32_t kbits_burst)
1984 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1985 const char *netdev_name = netdev_get_name(netdev_);
1988 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1989 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1990 : kbits_burst); /* Stick with user-specified value. */
1992 ovs_mutex_lock(&netdev->mutex);
1993 if (netdev->cache_valid & VALID_POLICING) {
1994 error = netdev->netdev_policing_error;
1995 if (error || (netdev->kbits_rate == kbits_rate &&
1996 netdev->kbits_burst == kbits_burst)) {
1997 /* Assume that settings haven't changed since we last set them. */
2000 netdev->cache_valid &= ~VALID_POLICING;
2003 COVERAGE_INC(netdev_set_policing);
2004 /* Remove any existing ingress qdisc. */
2005 error = tc_add_del_ingress_qdisc(netdev_, false);
2007 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
2008 netdev_name, ovs_strerror(error));
/* Install a fresh ingress qdisc, then attach the policer to it. */
2013 error = tc_add_del_ingress_qdisc(netdev_, true);
2015 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
2016 netdev_name, ovs_strerror(error));
2020 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
2022 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
2023 netdev_name, ovs_strerror(error));
2028 netdev->kbits_rate = kbits_rate;
2029 netdev->kbits_burst = kbits_burst;
/* Cache success or ENODEV so repeat calls with unchanged settings can
 * skip the tc round-trips entirely. */
2032 if (!error || error == ENODEV) {
2033 netdev->netdev_policing_error = error;
2034 netdev->cache_valid |= VALID_POLICING;
2036 ovs_mutex_unlock(&netdev->mutex);
/* Adds to 'types' the OVS name of each tc implementation that can be
 * installed (has tc_install) and is externally visible (non-empty name). */
2041 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
2044 const struct tc_ops *const *opsp;
2046 for (opsp = tcs; *opsp != NULL; opsp++) {
2047 const struct tc_ops *ops = *opsp;
2048 if (ops->tc_install && ops->ovs_name[0] != '\0') {
2049 sset_add(types, ops->ovs_name);
/* Returns the tc_ops in the global 'tcs' table whose OVS-facing name is
 * 'name', if any. */
2055 static const struct tc_ops *
2056 tc_lookup_ovs_name(const char *name)
2058 const struct tc_ops *const *opsp;
2060 for (opsp = tcs; *opsp != NULL; opsp++) {
2061 const struct tc_ops *ops = *opsp;
2062 if (!strcmp(name, ops->ovs_name)) {
/* Returns the tc_ops in the global 'tcs' table whose kernel (Linux) qdisc
 * name is 'name'; entries with a NULL linux_name are skipped. */
2069 static const struct tc_ops *
2070 tc_lookup_linux_name(const char *name)
2072 const struct tc_ops *const *opsp;
2074 for (opsp = tcs; *opsp != NULL; opsp++) {
2075 const struct tc_ops *ops = *opsp;
2076 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Looks up the queue with 'queue_id' in 'netdev_''s tc queue hmap, using
 * the caller-precomputed 'hash' to select the bucket. */
2083 static struct tc_queue *
2084 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
2087 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2088 struct tc_queue *queue;
2090 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
2091 if (queue->queue_id == queue_id) {
/* Convenience wrapper: hashes 'queue_id' and delegates to
 * tc_find_queue__(). */
2098 static struct tc_queue *
2099 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
2101 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Reports in 'caps' how many queues QoS type 'type' supports; fails when
 * 'type' names no known tc implementation. */
2105 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
2107 struct netdev_qos_capabilities *caps)
2109 const struct tc_ops *ops = tc_lookup_ovs_name(type);
2113 caps->n_queues = ops->n_queues;
/* Queries the installed qdisc, reporting its OVS type name in '*typep' and
 * its configuration in 'details' (when the qdisc supports qdisc_get). */
2118 netdev_linux_get_qos(const struct netdev *netdev_,
2119 const char **typep, struct smap *details)
2121 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2124 ovs_mutex_lock(&netdev->mutex);
2125 error = tc_query_qdisc(netdev_);
2127 *typep = netdev->tc->ops->ovs_name;
2128 error = (netdev->tc->ops->qdisc_get
2129 ? netdev->tc->ops->qdisc_get(netdev_, details)
2132 ovs_mutex_unlock(&netdev->mutex);
/* Replaces or reconfigures 'netdev_''s qdisc with QoS type 'type' using
 * 'details'. Unknown or uninstallable types are rejected up front. */
2138 netdev_linux_set_qos(struct netdev *netdev_,
2139 const char *type, const struct smap *details)
2141 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2142 const struct tc_ops *new_ops;
2145 new_ops = tc_lookup_ovs_name(type);
2146 if (!new_ops || !new_ops->tc_install) {
2150 ovs_mutex_lock(&netdev->mutex);
2151 error = tc_query_qdisc(netdev_);
/* Same qdisc type already installed: reconfigure in place if supported. */
2156 if (new_ops == netdev->tc->ops) {
2157 error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
2159 /* Delete existing qdisc. */
2160 error = tc_del_qdisc(netdev_);
2164 ovs_assert(netdev->tc == NULL);
2166 /* Install new qdisc. */
2167 error = new_ops->tc_install(netdev_, details);
/* Invariant: tc state exists exactly when installation succeeded. */
2168 ovs_assert((error == 0) == (netdev->tc != NULL));
2172 ovs_mutex_unlock(&netdev->mutex);
/* Fetches configuration details of queue 'queue_id' on 'netdev_' into
 * 'details' via the active qdisc's class_get hook. */
2177 netdev_linux_get_queue(const struct netdev *netdev_,
2178 unsigned int queue_id, struct smap *details)
2180 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2183 ovs_mutex_lock(&netdev->mutex);
2184 error = tc_query_qdisc(netdev_);
2186 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2188 ? netdev->tc->ops->class_get(netdev_, queue, details)
2191 ovs_mutex_unlock(&netdev->mutex);
/* Configures queue 'queue_id' on 'netdev_' from 'details'; fails when the
 * id is out of range for the qdisc or the qdisc has no class_set hook. */
2197 netdev_linux_set_queue(struct netdev *netdev_,
2198 unsigned int queue_id, const struct smap *details)
2200 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2203 ovs_mutex_lock(&netdev->mutex);
2204 error = tc_query_qdisc(netdev_);
2206 error = (queue_id < netdev->tc->ops->n_queues
2207 && netdev->tc->ops->class_set
2208 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
2211 ovs_mutex_unlock(&netdev->mutex);
/* Deletes queue 'queue_id' from 'netdev_' when the active qdisc supports
 * class deletion. */
2217 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
2219 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2222 ovs_mutex_lock(&netdev->mutex);
2223 error = tc_query_qdisc(netdev_);
2225 if (netdev->tc->ops->class_delete) {
2226 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2228 ? netdev->tc->ops->class_delete(netdev_, queue)
2234 ovs_mutex_unlock(&netdev->mutex);
/* Retrieves stats for queue 'queue_id' into '*stats'; 'stats->created'
 * comes from the locally tracked creation time rather than the kernel. */
2240 netdev_linux_get_queue_stats(const struct netdev *netdev_,
2241 unsigned int queue_id,
2242 struct netdev_queue_stats *stats)
2244 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2247 ovs_mutex_lock(&netdev->mutex);
2248 error = tc_query_qdisc(netdev_);
2250 if (netdev->tc->ops->class_get_stats) {
2251 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2253 stats->created = queue->created;
2254 error = netdev->tc->ops->class_get_stats(netdev_, queue,
2263 ovs_mutex_unlock(&netdev->mutex);
/* State for iterating a kernel tc class dump (RTM_GETTCLASS): the netlink
 * dump handle plus a reply buffer. */
2268 struct queue_dump_state {
2269 struct nl_dump dump;
/* Begins a netlink dump of tc classes on 'netdev' into 'state'.
 * NOTE(review): callers treat a zero/false return as failure -- confirm
 * the return convention against the surrounding (not shown) lines.
 * A successfully started dump must be finished with finish_queue_dump(). */
2274 start_queue_dump(const struct netdev *netdev, struct queue_dump_state *state)
2276 struct ofpbuf request;
2277 struct tcmsg *tcmsg;
2279 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
/* tcm_parent == 0 requests classes regardless of parent. */
2283 tcmsg->tcm_parent = 0;
2284 nl_dump_start(&state->dump, NETLINK_ROUTE, &request);
2285 ofpbuf_uninit(&request);
2287 ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
/* Releases the dump's reply buffer and returns the netlink dump's final
 * status from nl_dump_done(). */
2292 finish_queue_dump(struct queue_dump_state *state)
2294 ofpbuf_uninit(&state->buf);
2295 return nl_dump_done(&state->dump);
/* Iterator state for netdev queue dumps: a snapshot of queue ids taken at
 * dump start, walked by netdev_linux_queue_dump_next(). */
2298 struct netdev_linux_queue_state {
2299 unsigned int *queues;
/* Starts a queue dump by snapshotting the ids of all current queues, so
 * iteration does not need to hold the mutex across calls. */
2305 netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
2307 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2310 ovs_mutex_lock(&netdev->mutex);
2311 error = tc_query_qdisc(netdev_);
2313 if (netdev->tc->ops->class_get) {
2314 struct netdev_linux_queue_state *state;
2315 struct tc_queue *queue;
2318 *statep = state = xmalloc(sizeof *state);
2319 state->n_queues = hmap_count(&netdev->tc->queues);
2320 state->cur_queue = 0;
2321 state->queues = xmalloc(state->n_queues * sizeof *state->queues);
/* Record only the ids; the queues themselves are re-looked-up (and
 * possibly found deleted) during iteration. */
2324 HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
2325 state->queues[i++] = queue->queue_id;
2331 ovs_mutex_unlock(&netdev->mutex);
/* Produces the next snapshotted queue that still exists, storing its id in
 * '*queue_idp' and its configuration in 'details'; queues deleted since
 * the snapshot are silently skipped. */
2337 netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
2338 unsigned int *queue_idp, struct smap *details)
2340 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2341 struct netdev_linux_queue_state *state = state_;
2344 ovs_mutex_lock(&netdev->mutex);
2345 while (state->cur_queue < state->n_queues) {
2346 unsigned int queue_id = state->queues[state->cur_queue++];
2347 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2350 *queue_idp = queue_id;
2351 error = netdev->tc->ops->class_get(netdev_, queue, details);
2355 ovs_mutex_unlock(&netdev->mutex);
/* Frees the id snapshot allocated by netdev_linux_queue_dump_start(). */
2361 netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
2364 struct netdev_linux_queue_state *state = state_;
2366 free(state->queues);
/* Invokes 'cb' (with 'aux') once per queue on 'netdev_', using stats
 * obtained from a kernel tc class dump. */
2372 netdev_linux_dump_queue_stats(const struct netdev *netdev_,
2373 netdev_dump_queue_stats_cb *cb, void *aux)
2375 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2378 ovs_mutex_lock(&netdev->mutex);
2379 error = tc_query_qdisc(netdev_);
2381 struct queue_dump_state state;
2383 if (!netdev->tc->ops->class_dump_stats) {
2385 } else if (!start_queue_dump(netdev_, &state)) {
/* Parse each dumped class message through the qdisc's stats hook. */
2391 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
2392 retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
2399 retval = finish_queue_dump(&state);
2405 ovs_mutex_unlock(&netdev->mutex);
/* Retrieves 'netdev_''s assigned IPv4 address and netmask, caching them
 * via SIOCGIFADDR/SIOCGIFNETMASK on first use. Returns EADDRNOTAVAIL when
 * no address is assigned. */
2411 netdev_linux_get_in4(const struct netdev *netdev_,
2412 struct in_addr *address, struct in_addr *netmask)
2414 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2417 ovs_mutex_lock(&netdev->mutex);
2418 if (!(netdev->cache_valid & VALID_IN4)) {
2419 error = netdev_linux_get_ipv4(netdev_, &netdev->address,
2420 SIOCGIFADDR, "SIOCGIFADDR");
2422 error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
2423 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2425 netdev->cache_valid |= VALID_IN4;
/* INADDR_ANY in the cache means the device has no IPv4 address. */
2433 if (netdev->address.s_addr != INADDR_ANY) {
2434 *address = netdev->address;
2435 *netmask = netdev->netmask;
2437 error = EADDRNOTAVAIL;
2440 ovs_mutex_unlock(&netdev->mutex);
/* Assigns 'address' and 'netmask' to 'netdev_' via SIOCSIFADDR /
 * SIOCSIFNETMASK, updating the IPv4 cache on success. The netmask is only
 * pushed to the kernel when the address is not INADDR_ANY. */
2446 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2447 struct in_addr netmask)
2449 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2452 ovs_mutex_lock(&netdev->mutex);
2453 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2455 netdev->cache_valid |= VALID_IN4;
2456 netdev->address = address;
2457 netdev->netmask = netmask;
2458 if (address.s_addr != INADDR_ANY) {
2459 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2460 "SIOCSIFNETMASK", netmask);
2463 ovs_mutex_unlock(&netdev->mutex);
/* Parses one line of /proc/net/if_inet6 into 'in6' and 'ifname'.
 * Returns the result of ovs_scan(): true on a fully matched line. */
2469 parse_if_inet6_line(const char *line,
2470 struct in6_addr *in6, char ifname[16 + 1])
2472 uint8_t *s6 = in6->s6_addr;
2473 #define X8 "%2"SCNx8
/* Line layout: 32 hex digits (the address, read two at a time), four
 * hex fields that are skipped, then the interface name. */
2474 return ovs_scan(line,
2475 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
2476 "%*x %*x %*x %*x %16s\n",
2477 &s6[0], &s6[1], &s6[2], &s6[3],
2478 &s6[4], &s6[5], &s6[6], &s6[7],
2479 &s6[8], &s6[9], &s6[10], &s6[11],
2480 &s6[12], &s6[13], &s6[14], &s6[15],
2484 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2485 * 'in6' is non-null) and returns true. Otherwise, returns false. */
2487 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2489 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2491 ovs_mutex_lock(&netdev->mutex);
/* Lazily scan /proc/net/if_inet6 for this interface's address and cache
 * the result; in6addr_any in the cache marks "no address". */
2492 if (!(netdev->cache_valid & VALID_IN6)) {
2496 netdev->in6 = in6addr_any;
2498 file = fopen("/proc/net/if_inet6", "r");
2500 const char *name = netdev_get_name(netdev_);
2501 while (fgets(line, sizeof line, file)) {
2502 struct in6_addr in6_tmp;
2503 char ifname[16 + 1];
2504 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2505 && !strcmp(name, ifname))
2507 netdev->in6 = in6_tmp;
2513 netdev->cache_valid |= VALID_IN6;
2516 ovs_mutex_unlock(&netdev->mutex);
/* Fills '*sa' with an AF_INET sockaddr carrying 'addr', zeroing any bytes of
 * 'sa' beyond the sockaddr_in portion. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in sin;
    memset(&sin, 0, sizeof sin);
    sin.sin_family = AF_INET;
    sin.sin_addr = addr;
    sin.sin_port = 0;

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &sin, sizeof sin);
}
2535 do_set_addr(struct netdev *netdev,
2536 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2540 make_in4_sockaddr(&ifr.ifr_addr, addr);
2541 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2545 /* Adds 'router' as a default IP gateway. */
2547 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2549 struct in_addr any = { INADDR_ANY };
2553 memset(&rt, 0, sizeof rt);
2554 make_in4_sockaddr(&rt.rt_dst, any);
2555 make_in4_sockaddr(&rt.rt_gateway, router);
2556 make_in4_sockaddr(&rt.rt_genmask, any);
2557 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2558 error = af_inet_ioctl(SIOCADDRT, &rt);
2560 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
2566 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2569 static const char fn[] = "/proc/net/route";
2574 *netdev_name = NULL;
2575 stream = fopen(fn, "r");
2576 if (stream == NULL) {
2577 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
2582 while (fgets(line, sizeof line, stream)) {
2585 ovs_be32 dest, gateway, mask;
2586 int refcnt, metric, mtu;
2587 unsigned int flags, use, window, irtt;
2590 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2592 iface, &dest, &gateway, &flags, &refcnt,
2593 &use, &metric, &mask, &mtu, &window, &irtt)) {
2594 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2598 if (!(flags & RTF_UP)) {
2599 /* Skip routes that aren't up. */
2603 /* The output of 'dest', 'mask', and 'gateway' were given in
2604 * network byte order, so we don't need need any endian
2605 * conversions here. */
2606 if ((dest & mask) == (host->s_addr & mask)) {
2608 /* The host is directly reachable. */
2609 next_hop->s_addr = 0;
2611 /* To reach the host, we must go through a gateway. */
2612 next_hop->s_addr = gateway;
2614 *netdev_name = xstrdup(iface);
2626 netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
2628 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2631 ovs_mutex_lock(&netdev->mutex);
2632 if (!(netdev->cache_valid & VALID_DRVINFO)) {
2633 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
2635 COVERAGE_INC(netdev_get_ethtool);
2636 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2637 error = netdev_linux_do_ethtool(netdev->up.name,
2640 "ETHTOOL_GDRVINFO");
2642 netdev->cache_valid |= VALID_DRVINFO;
2647 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2648 smap_add(smap, "driver_version", netdev->drvinfo.version);
2649 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
2651 ovs_mutex_unlock(&netdev->mutex);
2657 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2660 smap_add(smap, "driver_name", "openvswitch");
2664 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2665 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2666 * returns 0. Otherwise, it returns a positive errno value; in particular,
2667 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2669 netdev_linux_arp_lookup(const struct netdev *netdev,
2670 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2673 struct sockaddr_in sin;
2676 memset(&r, 0, sizeof r);
2677 memset(&sin, 0, sizeof sin);
2678 sin.sin_family = AF_INET;
2679 sin.sin_addr.s_addr = ip;
2681 memcpy(&r.arp_pa, &sin, sizeof sin);
2682 r.arp_ha.sa_family = ARPHRD_ETHER;
2684 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2685 COVERAGE_INC(netdev_arp_lookup);
2686 retval = af_inet_ioctl(SIOCGARP, &r);
2688 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2689 } else if (retval != ENXIO) {
2690 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2691 netdev_get_name(netdev), IP_ARGS(ip),
2692 ovs_strerror(retval));
2698 nd_to_iff_flags(enum netdev_flags nd)
2701 if (nd & NETDEV_UP) {
2704 if (nd & NETDEV_PROMISC) {
2707 if (nd & NETDEV_LOOPBACK) {
2708 iff |= IFF_LOOPBACK;
2714 iff_to_nd_flags(int iff)
2716 enum netdev_flags nd = 0;
2720 if (iff & IFF_PROMISC) {
2721 nd |= NETDEV_PROMISC;
2723 if (iff & IFF_LOOPBACK) {
2724 nd |= NETDEV_LOOPBACK;
2730 update_flags(struct netdev_linux *netdev, enum netdev_flags off,
2731 enum netdev_flags on, enum netdev_flags *old_flagsp)
2732 OVS_REQUIRES(netdev->mutex)
2734 int old_flags, new_flags;
2737 old_flags = netdev->ifi_flags;
2738 *old_flagsp = iff_to_nd_flags(old_flags);
2739 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2740 if (new_flags != old_flags) {
2741 error = set_flags(netdev_get_name(&netdev->up), new_flags);
2742 get_flags(&netdev->up, &netdev->ifi_flags);
2749 netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2750 enum netdev_flags on, enum netdev_flags *old_flagsp)
2752 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2755 ovs_mutex_lock(&netdev->mutex);
2756 error = update_flags(netdev, off, on, old_flagsp);
2757 ovs_mutex_unlock(&netdev->mutex);
/* Template for the netdev_class vtable shared by the "system", "tap", and
 * "internal" device types.  The macro parameters supply the four callbacks
 * that differ between the types (construct, get_stats, get_features,
 * get_status); every other slot is either a shared netdev_linux_* function
 * or NULL for unsupported operations.  No comments may be placed inside the
 * macro body because each line ends in a '\' continuation. */
#define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS,          \
                           GET_FEATURES, GET_STATUS)            \
    netdev_linux_wait,                                          \
                                                                \
    netdev_linux_alloc,                                         \
                                                                \
    netdev_linux_destruct,                                      \
    netdev_linux_dealloc,                                       \
    NULL,                       /* get_config */                \
    NULL,                       /* set_config */                \
    NULL,                       /* get_tunnel_config */         \
    NULL,                       /* build header */              \
    NULL,                       /* push header */               \
    NULL,                       /* pop header */                \
    NULL,                       /* get_numa_id */               \
    NULL,                       /* set_multiq */                \
                                                                \
    netdev_linux_send,                                          \
    netdev_linux_send_wait,                                     \
                                                                \
    netdev_linux_set_etheraddr,                                 \
    netdev_linux_get_etheraddr,                                 \
    netdev_linux_get_mtu,                                       \
    netdev_linux_set_mtu,                                       \
    netdev_linux_get_ifindex,                                   \
    netdev_linux_get_carrier,                                   \
    netdev_linux_get_carrier_resets,                            \
    netdev_linux_set_miimon_interval,                           \
                                                                \
    netdev_linux_set_advertisements,                            \
                                                                \
    netdev_linux_set_policing,                                  \
    netdev_linux_get_qos_types,                                 \
    netdev_linux_get_qos_capabilities,                          \
    netdev_linux_get_qos,                                       \
    netdev_linux_set_qos,                                       \
    netdev_linux_get_queue,                                     \
    netdev_linux_set_queue,                                     \
    netdev_linux_delete_queue,                                  \
    netdev_linux_get_queue_stats,                               \
    netdev_linux_queue_dump_start,                              \
    netdev_linux_queue_dump_next,                               \
    netdev_linux_queue_dump_done,                               \
    netdev_linux_dump_queue_stats,                              \
                                                                \
    netdev_linux_get_in4,                                       \
    netdev_linux_set_in4,                                       \
    netdev_linux_get_in6,                                       \
    netdev_linux_add_router,                                    \
    netdev_linux_get_next_hop,                                  \
                                                                \
    netdev_linux_arp_lookup,                                    \
                                                                \
    netdev_linux_update_flags,                                  \
                                                                \
    netdev_linux_rxq_alloc,                                     \
    netdev_linux_rxq_construct,                                 \
    netdev_linux_rxq_destruct,                                  \
    netdev_linux_rxq_dealloc,                                   \
    netdev_linux_rxq_recv,                                      \
    netdev_linux_rxq_wait,                                      \
    netdev_linux_rxq_drain,                                     \

/* Ordinary Linux network device ("system" type). */
const struct netdev_class netdev_linux_class =
    NETDEV_LINUX_CLASS(
        netdev_linux_construct,
        netdev_linux_get_stats,
        netdev_linux_get_features,
        netdev_linux_get_status);

/* TAP device: like "system" but constructed via /dev/net/tun and with
 * transmit/receive stats swapped to reflect the host's point of view. */
const struct netdev_class netdev_tap_class =
    NETDEV_LINUX_CLASS(
        netdev_linux_construct_tap,
        netdev_tap_get_stats,
        netdev_linux_get_features,
        netdev_linux_get_status);

/* OVS "internal" port: no meaningful link features to report. */
const struct netdev_class netdev_internal_class =
    NETDEV_LINUX_CLASS(
        netdev_linux_construct,
        netdev_internal_get_stats,
        NULL,                  /* get_features */
        netdev_internal_get_status);
/* CoDel traffic control class.  CoDel is classless, so it exposes no OVS
 * queues (n_queues == 0). */
#define CODEL_N_QUEUES 0x0000

/* In sufficiently new kernel headers these are defined as enums in
 * <linux/pkt_sched.h>.  Define them here as macros to help out with older
 * kernels.  (This overrides any enum definition in the header file but that's
 * harmless.) */
#define TCA_CODEL_TARGET 1
#define TCA_CODEL_LIMIT 2
#define TCA_CODEL_INTERVAL 3
2875 static struct codel *
2876 codel_get__(const struct netdev *netdev_)
2878 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2879 return CONTAINER_OF(netdev->tc, struct codel, tc);
2883 codel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
2886 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2887 struct codel *codel;
2889 codel = xmalloc(sizeof *codel);
2890 tc_init(&codel->tc, &tc_ops_codel);
2891 codel->target = target;
2892 codel->limit = limit;
2893 codel->interval = interval;
2895 netdev->tc = &codel->tc;
2899 codel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
2903 struct ofpbuf request;
2904 struct tcmsg *tcmsg;
2905 uint32_t otarget, olimit, ointerval;
2908 tc_del_qdisc(netdev);
2910 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2911 NLM_F_EXCL | NLM_F_CREATE, &request);
2915 tcmsg->tcm_handle = tc_make_handle(1, 0);
2916 tcmsg->tcm_parent = TC_H_ROOT;
2918 otarget = target ? target : 5000;
2919 olimit = limit ? limit : 10240;
2920 ointerval = interval ? interval : 100000;
2922 nl_msg_put_string(&request, TCA_KIND, "codel");
2923 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2924 nl_msg_put_u32(&request, TCA_CODEL_TARGET, otarget);
2925 nl_msg_put_u32(&request, TCA_CODEL_LIMIT, olimit);
2926 nl_msg_put_u32(&request, TCA_CODEL_INTERVAL, ointerval);
2927 nl_msg_end_nested(&request, opt_offset);
2929 error = tc_transact(&request, NULL);
2931 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
2932 "target %u, limit %u, interval %u error %d(%s)",
2933 netdev_get_name(netdev),
2934 otarget, olimit, ointerval,
2935 error, ovs_strerror(error));
2941 codel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
2942 const struct smap *details, struct codel *codel)
2944 const char *target_s;
2945 const char *limit_s;
2946 const char *interval_s;
2948 target_s = smap_get(details, "target");
2949 limit_s = smap_get(details, "limit");
2950 interval_s = smap_get(details, "interval");
2952 codel->target = target_s ? strtoull(target_s, NULL, 10) : 0;
2953 codel->limit = limit_s ? strtoull(limit_s, NULL, 10) : 0;
2954 codel->interval = interval_s ? strtoull(interval_s, NULL, 10) : 0;
2956 if (!codel->target) {
2957 codel->target = 5000;
2959 if (!codel->limit) {
2960 codel->limit = 10240;
2962 if (!codel->interval) {
2963 codel->interval = 100000;
2968 codel_tc_install(struct netdev *netdev, const struct smap *details)
2973 codel_parse_qdisc_details__(netdev, details, &codel);
2974 error = codel_setup_qdisc__(netdev, codel.target, codel.limit,
2977 codel_install__(netdev, codel.target, codel.limit, codel.interval);
2983 codel_parse_tca_options__(struct nlattr *nl_options, struct codel *codel)
2985 static const struct nl_policy tca_codel_policy[] = {
2986 [TCA_CODEL_TARGET] = { .type = NL_A_U32 },
2987 [TCA_CODEL_LIMIT] = { .type = NL_A_U32 },
2988 [TCA_CODEL_INTERVAL] = { .type = NL_A_U32 }
2991 struct nlattr *attrs[ARRAY_SIZE(tca_codel_policy)];
2993 if (!nl_parse_nested(nl_options, tca_codel_policy,
2994 attrs, ARRAY_SIZE(tca_codel_policy))) {
2995 VLOG_WARN_RL(&rl, "failed to parse CoDel class options");
2999 codel->target = nl_attr_get_u32(attrs[TCA_CODEL_TARGET]);
3000 codel->limit = nl_attr_get_u32(attrs[TCA_CODEL_LIMIT]);
3001 codel->interval = nl_attr_get_u32(attrs[TCA_CODEL_INTERVAL]);
3006 codel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3008 struct nlattr *nlattr;
3013 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3018 error = codel_parse_tca_options__(nlattr, &codel);
3023 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3029 codel_tc_destroy(struct tc *tc)
3031 struct codel *codel = CONTAINER_OF(tc, struct codel, tc);
3037 codel_qdisc_get(const struct netdev *netdev, struct smap *details)
3039 const struct codel *codel = codel_get__(netdev);
3040 smap_add_format(details, "target", "%u", codel->target);
3041 smap_add_format(details, "limit", "%u", codel->limit);
3042 smap_add_format(details, "interval", "%u", codel->interval);
3047 codel_qdisc_set(struct netdev *netdev, const struct smap *details)
3051 codel_parse_qdisc_details__(netdev, details, &codel);
3052 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3053 codel_get__(netdev)->target = codel.target;
3054 codel_get__(netdev)->limit = codel.limit;
3055 codel_get__(netdev)->interval = codel.interval;
/* tc_ops vtable for the "linux-codel" QoS type.  CoDel has no classes, so
 * the class-related slots (not all visible here) are left NULL. */
static const struct tc_ops tc_ops_codel = {
    "codel",                      /* linux_name */
    "linux-codel",                /* ovs_name */
    CODEL_N_QUEUES,               /* n_queues */
/* FQ-CoDel traffic control class.  fq_codel is classless from OVS's point of
 * view, so it exposes no OVS queues (n_queues == 0). */

#define FQCODEL_N_QUEUES 0x0000

/* In sufficiently new kernel headers these are defined as enums in
 * <linux/pkt_sched.h>.  Define them here as macros to help out with older
 * kernels.  (This overrides any enum definition in the header file but that's
 * harmless.) */
#define TCA_FQ_CODEL_TARGET 1
#define TCA_FQ_CODEL_LIMIT 2
#define TCA_FQ_CODEL_INTERVAL 3
#define TCA_FQ_CODEL_ECN 4
#define TCA_FQ_CODEL_FLOWS 5
#define TCA_FQ_CODEL_QUANTUM 6
3099 static struct fqcodel *
3100 fqcodel_get__(const struct netdev *netdev_)
3102 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3103 return CONTAINER_OF(netdev->tc, struct fqcodel, tc);
3107 fqcodel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
3108 uint32_t interval, uint32_t flows, uint32_t quantum)
3110 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3111 struct fqcodel *fqcodel;
3113 fqcodel = xmalloc(sizeof *fqcodel);
3114 tc_init(&fqcodel->tc, &tc_ops_fqcodel);
3115 fqcodel->target = target;
3116 fqcodel->limit = limit;
3117 fqcodel->interval = interval;
3118 fqcodel->flows = flows;
3119 fqcodel->quantum = quantum;
3121 netdev->tc = &fqcodel->tc;
3125 fqcodel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
3126 uint32_t interval, uint32_t flows, uint32_t quantum)
3129 struct ofpbuf request;
3130 struct tcmsg *tcmsg;
3131 uint32_t otarget, olimit, ointerval, oflows, oquantum;
3134 tc_del_qdisc(netdev);
3136 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3137 NLM_F_EXCL | NLM_F_CREATE, &request);
3141 tcmsg->tcm_handle = tc_make_handle(1, 0);
3142 tcmsg->tcm_parent = TC_H_ROOT;
3144 otarget = target ? target : 5000;
3145 olimit = limit ? limit : 10240;
3146 ointerval = interval ? interval : 100000;
3147 oflows = flows ? flows : 1024;
3148 oquantum = quantum ? quantum : 1514; /* fq_codel default quantum is 1514
3151 nl_msg_put_string(&request, TCA_KIND, "fq_codel");
3152 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3153 nl_msg_put_u32(&request, TCA_FQ_CODEL_TARGET, otarget);
3154 nl_msg_put_u32(&request, TCA_FQ_CODEL_LIMIT, olimit);
3155 nl_msg_put_u32(&request, TCA_FQ_CODEL_INTERVAL, ointerval);
3156 nl_msg_put_u32(&request, TCA_FQ_CODEL_FLOWS, oflows);
3157 nl_msg_put_u32(&request, TCA_FQ_CODEL_QUANTUM, oquantum);
3158 nl_msg_end_nested(&request, opt_offset);
3160 error = tc_transact(&request, NULL);
3162 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3163 "target %u, limit %u, interval %u, flows %u, quantum %u error %d(%s)",
3164 netdev_get_name(netdev),
3165 otarget, olimit, ointerval, oflows, oquantum,
3166 error, ovs_strerror(error));
3172 fqcodel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
3173 const struct smap *details, struct fqcodel *fqcodel)
3175 const char *target_s;
3176 const char *limit_s;
3177 const char *interval_s;
3178 const char *flows_s;
3179 const char *quantum_s;
3181 target_s = smap_get(details, "target");
3182 limit_s = smap_get(details, "limit");
3183 interval_s = smap_get(details, "interval");
3184 flows_s = smap_get(details, "flows");
3185 quantum_s = smap_get(details, "quantum");
3186 fqcodel->target = target_s ? strtoull(target_s, NULL, 10) : 0;
3187 fqcodel->limit = limit_s ? strtoull(limit_s, NULL, 10) : 0;
3188 fqcodel->interval = interval_s ? strtoull(interval_s, NULL, 10) : 0;
3189 fqcodel->flows = flows_s ? strtoull(flows_s, NULL, 10) : 0;
3190 fqcodel->quantum = quantum_s ? strtoull(quantum_s, NULL, 10) : 0;
3191 if (!fqcodel->target) {
3192 fqcodel->target = 5000;
3194 if (!fqcodel->limit) {
3195 fqcodel->limit = 10240;
3197 if (!fqcodel->interval) {
3198 fqcodel->interval = 1000000;
3200 if (!fqcodel->flows) {
3201 fqcodel->flows = 1024;
3203 if (!fqcodel->quantum) {
3204 fqcodel->quantum = 1514;
3209 fqcodel_tc_install(struct netdev *netdev, const struct smap *details)
3212 struct fqcodel fqcodel;
3214 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3215 error = fqcodel_setup_qdisc__(netdev, fqcodel.target, fqcodel.limit,
3216 fqcodel.interval, fqcodel.flows,
3219 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit,
3220 fqcodel.interval, fqcodel.flows, fqcodel.quantum);
3226 fqcodel_parse_tca_options__(struct nlattr *nl_options, struct fqcodel *fqcodel)
3228 static const struct nl_policy tca_fqcodel_policy[] = {
3229 [TCA_FQ_CODEL_TARGET] = { .type = NL_A_U32 },
3230 [TCA_FQ_CODEL_LIMIT] = { .type = NL_A_U32 },
3231 [TCA_FQ_CODEL_INTERVAL] = { .type = NL_A_U32 },
3232 [TCA_FQ_CODEL_FLOWS] = { .type = NL_A_U32 },
3233 [TCA_FQ_CODEL_QUANTUM] = { .type = NL_A_U32 }
3236 struct nlattr *attrs[ARRAY_SIZE(tca_fqcodel_policy)];
3238 if (!nl_parse_nested(nl_options, tca_fqcodel_policy,
3239 attrs, ARRAY_SIZE(tca_fqcodel_policy))) {
3240 VLOG_WARN_RL(&rl, "failed to parse FQ_CoDel class options");
3244 fqcodel->target = nl_attr_get_u32(attrs[TCA_FQ_CODEL_TARGET]);
3245 fqcodel->limit = nl_attr_get_u32(attrs[TCA_FQ_CODEL_LIMIT]);
3246 fqcodel->interval =nl_attr_get_u32(attrs[TCA_FQ_CODEL_INTERVAL]);
3247 fqcodel->flows = nl_attr_get_u32(attrs[TCA_FQ_CODEL_FLOWS]);
3248 fqcodel->quantum = nl_attr_get_u32(attrs[TCA_FQ_CODEL_QUANTUM]);
3253 fqcodel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3255 struct nlattr *nlattr;
3258 struct fqcodel fqcodel;
3260 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3265 error = fqcodel_parse_tca_options__(nlattr, &fqcodel);
3270 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3271 fqcodel.flows, fqcodel.quantum);
3276 fqcodel_tc_destroy(struct tc *tc)
3278 struct fqcodel *fqcodel = CONTAINER_OF(tc, struct fqcodel, tc);
3284 fqcodel_qdisc_get(const struct netdev *netdev, struct smap *details)
3286 const struct fqcodel *fqcodel = fqcodel_get__(netdev);
3287 smap_add_format(details, "target", "%u", fqcodel->target);
3288 smap_add_format(details, "limit", "%u", fqcodel->limit);
3289 smap_add_format(details, "interval", "%u", fqcodel->interval);
3290 smap_add_format(details, "flows", "%u", fqcodel->flows);
3291 smap_add_format(details, "quantum", "%u", fqcodel->quantum);
3296 fqcodel_qdisc_set(struct netdev *netdev, const struct smap *details)
3298 struct fqcodel fqcodel;
3300 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3301 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3302 fqcodel.flows, fqcodel.quantum);
3303 fqcodel_get__(netdev)->target = fqcodel.target;
3304 fqcodel_get__(netdev)->limit = fqcodel.limit;
3305 fqcodel_get__(netdev)->interval = fqcodel.interval;
3306 fqcodel_get__(netdev)->flows = fqcodel.flows;
3307 fqcodel_get__(netdev)->quantum = fqcodel.quantum;
/* tc_ops vtable for the "linux-fq_codel" QoS type.  fq_codel exposes no OVS
 * queues, so the class-related slots (not all visible here) are left NULL. */
static const struct tc_ops tc_ops_fqcodel = {
    "fq_codel",                   /* linux_name */
    "linux-fq_codel",             /* ovs_name */
    FQCODEL_N_QUEUES,             /* n_queues */
/* SFQ traffic control class.  SFQ exposes no OVS queues (n_queues == 0). */

#define SFQ_N_QUEUES 0x0000
3338 sfq_get__(const struct netdev *netdev_)
3340 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3341 return CONTAINER_OF(netdev->tc, struct sfq, tc);
3345 sfq_install__(struct netdev *netdev_, uint32_t quantum, uint32_t perturb)
3347 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3350 sfq = xmalloc(sizeof *sfq);
3351 tc_init(&sfq->tc, &tc_ops_sfq);
3352 sfq->perturb = perturb;
3353 sfq->quantum = quantum;
3355 netdev->tc = &sfq->tc;
3359 sfq_setup_qdisc__(struct netdev *netdev, uint32_t quantum, uint32_t perturb)
3361 struct tc_sfq_qopt opt;
3362 struct ofpbuf request;
3363 struct tcmsg *tcmsg;
3365 int mtu_error, error;
3366 mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3368 tc_del_qdisc(netdev);
3370 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3371 NLM_F_EXCL | NLM_F_CREATE, &request);
3375 tcmsg->tcm_handle = tc_make_handle(1, 0);
3376 tcmsg->tcm_parent = TC_H_ROOT;
3378 memset(&opt, 0, sizeof opt);
3381 opt.quantum = mtu; /* if we cannot find mtu, use default */
3384 opt.quantum = quantum;
3388 opt.perturb_period = 10;
3390 opt.perturb_period = perturb;
3393 nl_msg_put_string(&request, TCA_KIND, "sfq");
3394 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3396 error = tc_transact(&request, NULL);
3398 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3399 "quantum %u, perturb %u error %d(%s)",
3400 netdev_get_name(netdev),
3401 opt.quantum, opt.perturb_period,
3402 error, ovs_strerror(error));
3408 sfq_parse_qdisc_details__(struct netdev *netdev,
3409 const struct smap *details, struct sfq *sfq)
3411 const char *perturb_s;
3412 const char *quantum_s;
3416 perturb_s = smap_get(details, "perturb");
3417 quantum_s = smap_get(details, "quantum");
3418 sfq->perturb = perturb_s ? strtoull(perturb_s, NULL, 10) : 0;
3419 sfq->quantum = quantum_s ? strtoull(quantum_s, NULL, 10) : 0;
3420 if (!sfq->perturb) {
3424 if (!sfq->quantum) {
3425 mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3429 VLOG_WARN_RL(&rl, "when using SFQ, you must specify quantum on a "
3430 "device without mtu");
3437 sfq_tc_install(struct netdev *netdev, const struct smap *details)
3442 sfq_parse_qdisc_details__(netdev, details, &sfq);
3443 error = sfq_setup_qdisc__(netdev, sfq.quantum, sfq.perturb);
3445 sfq_install__(netdev, sfq.quantum, sfq.perturb);
/* tc_ops tc_load callback: reconstructs sfq state from the kernel's qdisc
 * description in 'nlmsg'.
 *
 * Bug fix: the original called sfq_install__(netdev, sfq->perturb_period,
 * sfq->quantum), but sfq_install__()'s signature is (netdev_, quantum,
 * perturb), so the kernel's quantum and perturb values were swapped in the
 * cached state (and subsequently misreported by sfq_qdisc_get()). */
static int
sfq_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
{
    const struct tc_sfq_qopt *sfq;
    struct nlattr *nlattr;
    const char *kind;
    int error;

    error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
    if (error == 0) {
        sfq = nl_attr_get(nlattr);
        /* Pass quantum first, perturb second, matching sfq_install__(). */
        sfq_install__(netdev, sfq->quantum, sfq->perturb_period);
        return 0;
    }

    return error;
}
3469 sfq_tc_destroy(struct tc *tc)
3471 struct sfq *sfq = CONTAINER_OF(tc, struct sfq, tc);
3477 sfq_qdisc_get(const struct netdev *netdev, struct smap *details)
3479 const struct sfq *sfq = sfq_get__(netdev);
3480 smap_add_format(details, "quantum", "%u", sfq->quantum);
3481 smap_add_format(details, "perturb", "%u", sfq->perturb);
3486 sfq_qdisc_set(struct netdev *netdev, const struct smap *details)
3490 sfq_parse_qdisc_details__(netdev, details, &sfq);
3491 sfq_install__(netdev, sfq.quantum, sfq.perturb);
3492 sfq_get__(netdev)->quantum = sfq.quantum;
3493 sfq_get__(netdev)->perturb = sfq.perturb;
/* tc_ops vtable for the "linux-sfq" QoS type.  SFQ exposes no OVS queues,
 * so the class-related slots (not all visible here) are left NULL. */
static const struct tc_ops tc_ops_sfq = {
    "sfq",                      /* linux_name */
    "linux-sfq",                /* ovs_name */
    SFQ_N_QUEUES,               /* n_queues */
/* HTB traffic control class.  Unlike the classless qdiscs above, HTB maps
 * each OVS queue to an HTB class. */

#define HTB_N_QUEUES 0xf000
#define HTB_RATE2QUANTUM 10     /* r2q divisor passed to the kernel. */

    /* Qdisc-wide configuration (struct htb). */
    unsigned int max_rate;      /* In bytes/s. */

    /* Per-queue configuration (struct htb_class). */
    struct tc_queue tc_queue;
    unsigned int min_rate;      /* In bytes/s. */
    unsigned int max_rate;      /* In bytes/s. */
    unsigned int burst;         /* In bytes. */
    unsigned int priority;      /* Lower values are higher priorities. */
3532 htb_get__(const struct netdev *netdev_)
3534 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3535 return CONTAINER_OF(netdev->tc, struct htb, tc);
3539 htb_install__(struct netdev *netdev_, uint64_t max_rate)
3541 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3544 htb = xmalloc(sizeof *htb);
3545 tc_init(&htb->tc, &tc_ops_htb);
3546 htb->max_rate = max_rate;
3548 netdev->tc = &htb->tc;
3551 /* Create an HTB qdisc.
3553 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
3555 htb_setup_qdisc__(struct netdev *netdev)
3558 struct tc_htb_glob opt;
3559 struct ofpbuf request;
3560 struct tcmsg *tcmsg;
3562 tc_del_qdisc(netdev);
3564 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3565 NLM_F_EXCL | NLM_F_CREATE, &request);
3569 tcmsg->tcm_handle = tc_make_handle(1, 0);
3570 tcmsg->tcm_parent = TC_H_ROOT;
3572 nl_msg_put_string(&request, TCA_KIND, "htb");
3574 memset(&opt, 0, sizeof opt);
3575 opt.rate2quantum = HTB_RATE2QUANTUM;
3579 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3580 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
3581 nl_msg_end_nested(&request, opt_offset);
3583 return tc_transact(&request, NULL);
3586 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
3587 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
3589 htb_setup_class__(struct netdev *netdev, unsigned int handle,
3590 unsigned int parent, struct htb_class *class)
3593 struct tc_htb_opt opt;
3594 struct ofpbuf request;
3595 struct tcmsg *tcmsg;
3599 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3601 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
3602 netdev_get_name(netdev));
3606 memset(&opt, 0, sizeof opt);
3607 tc_fill_rate(&opt.rate, class->min_rate, mtu);
3608 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
3609 /* Makes sure the quantum is at least MTU. Setting quantum will
3610 * make htb ignore the r2q for this class. */
3611 if ((class->min_rate / HTB_RATE2QUANTUM) < mtu) {
3614 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
3615 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
3616 opt.prio = class->priority;
3618 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3622 tcmsg->tcm_handle = handle;
3623 tcmsg->tcm_parent = parent;
3625 nl_msg_put_string(&request, TCA_KIND, "htb");
3626 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3627 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
3628 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
3629 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
3630 nl_msg_end_nested(&request, opt_offset);
3632 error = tc_transact(&request, NULL);
3634 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3635 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
3636 netdev_get_name(netdev),
3637 tc_get_major(handle), tc_get_minor(handle),
3638 tc_get_major(parent), tc_get_minor(parent),
3639 class->min_rate, class->max_rate,
3640 class->burst, class->priority, ovs_strerror(error));
3645 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
3646 * description of them into 'details'. The description complies with the
3647 * specification given in the vswitch database documentation for linux-htb
3650 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
3652 static const struct nl_policy tca_htb_policy[] = {
3653 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
3654 .min_len = sizeof(struct tc_htb_opt) },
3657 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
3658 const struct tc_htb_opt *htb;
3660 if (!nl_parse_nested(nl_options, tca_htb_policy,
3661 attrs, ARRAY_SIZE(tca_htb_policy))) {
3662 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
3666 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
3667 class->min_rate = htb->rate.rate;
3668 class->max_rate = htb->ceil.rate;
3669 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
3670 class->priority = htb->prio;
3675 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3676 struct htb_class *options,
3677 struct netdev_queue_stats *stats)
3679 struct nlattr *nl_options;
3680 unsigned int handle;
3683 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3684 if (!error && queue_id) {
3685 unsigned int major = tc_get_major(handle);
3686 unsigned int minor = tc_get_minor(handle);
3687 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3688 *queue_id = minor - 1;
3693 if (!error && options) {
3694 error = htb_parse_tca_options__(nl_options, options);
3700 htb_parse_qdisc_details__(struct netdev *netdev_,
3701 const struct smap *details, struct htb_class *hc)
3703 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3704 const char *max_rate_s;
3706 max_rate_s = smap_get(details, "max-rate");
3707 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3708 if (!hc->max_rate) {
3709 enum netdev_features current;
3711 netdev_linux_read_features(netdev);
3712 current = !netdev->get_features_error ? netdev->current : 0;
3713 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3715 hc->min_rate = hc->max_rate;
3721 htb_parse_class_details__(struct netdev *netdev,
3722 const struct smap *details, struct htb_class *hc)
3724 const struct htb *htb = htb_get__(netdev);
3725 const char *min_rate_s = smap_get(details, "min-rate");
3726 const char *max_rate_s = smap_get(details, "max-rate");
3727 const char *burst_s = smap_get(details, "burst");
3728 const char *priority_s = smap_get(details, "priority");
3731 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3733 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
3734 netdev_get_name(netdev));
3738 /* HTB requires at least an mtu sized min-rate to send any traffic even
3739 * on uncongested links. */
3740 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3741 hc->min_rate = MAX(hc->min_rate, mtu);
3742 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
3745 hc->max_rate = (max_rate_s
3746 ? strtoull(max_rate_s, NULL, 10) / 8
3748 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
3749 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
3753 * According to hints in the documentation that I've read, it is important
3754 * that 'burst' be at least as big as the largest frame that might be
3755 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
3756 * but having it a bit too small is a problem. Since netdev_get_mtu()
3757 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
3758 * the MTU. We actually add 64, instead of 14, as a guard against
3759 * additional headers get tacked on somewhere that we're not aware of. */
3760 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
3761 hc->burst = MAX(hc->burst, mtu + 64);
3764 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
3770 htb_query_class__(const struct netdev *netdev, unsigned int handle,
3771 unsigned int parent, struct htb_class *options,
3772 struct netdev_queue_stats *stats)
3774 struct ofpbuf *reply;
3777 error = tc_query_class(netdev, handle, parent, &reply);
3779 error = htb_parse_tcmsg__(reply, NULL, options, stats);
3780 ofpbuf_delete(reply);
3786 htb_tc_install(struct netdev *netdev, const struct smap *details)
3790 error = htb_setup_qdisc__(netdev);
3792 struct htb_class hc;
3794 htb_parse_qdisc_details__(netdev, details, &hc);
3795 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3796 tc_make_handle(1, 0), &hc);
3798 htb_install__(netdev, hc.max_rate);
3804 static struct htb_class *
3805 htb_class_cast__(const struct tc_queue *queue)
3807 return CONTAINER_OF(queue, struct htb_class, tc_queue);
3811 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
3812 const struct htb_class *hc)
3814 struct htb *htb = htb_get__(netdev);
3815 size_t hash = hash_int(queue_id, 0);
3816 struct tc_queue *queue;
3817 struct htb_class *hcp;
3819 queue = tc_find_queue__(netdev, queue_id, hash);
3821 hcp = htb_class_cast__(queue);
3823 hcp = xmalloc(sizeof *hcp);
3824 queue = &hcp->tc_queue;
3825 queue->queue_id = queue_id;
3826 queue->created = time_msec();
3827 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
3830 hcp->min_rate = hc->min_rate;
3831 hcp->max_rate = hc->max_rate;
3832 hcp->burst = hc->burst;
3833 hcp->priority = hc->priority;
/* tc_load callback for "linux-htb": reconstructs OVS's view of an existing
 * HTB qdisc by querying the default class, then dumping every kernel class
 * and mirroring each one into the queue hmap. */
3837 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3840     struct queue_dump_state state;
3841     struct htb_class hc;
3843     /* Get qdisc options. */
3845     htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3846     htb_install__(netdev, hc.max_rate);
3849     if (!start_queue_dump(netdev, &state)) {
3852     while (nl_dump_next(&state.dump, &msg, &state.buf)) {
3853         unsigned int queue_id;
3855         if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3856             htb_update_queue__(netdev, queue_id, &hc);
3859     finish_queue_dump(&state);
/* tc_destroy callback for "linux-htb": frees every queued htb_class and
 * the htb wrapper itself. */
3865 htb_tc_destroy(struct tc *tc)
3867     struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
3868     struct htb_class *hc, *next;
3870     HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
3871         hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* qdisc_get callback: reports the configured max-rate, converting the
 * cached bytes/s figure back to bits/s for the database. */
3879 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
3881     const struct htb *htb = htb_get__(netdev);
3882     smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
/* qdisc_set callback: re-applies the default class (1:0xfffe) with new
 * parameters from 'details' and updates the cached max_rate on success. */
3887 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
3889     struct htb_class hc;
3892     htb_parse_qdisc_details__(netdev, details, &hc);
3893     error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3894                               tc_make_handle(1, 0), &hc);
3896         htb_get__(netdev)->max_rate = hc.max_rate;
/* class_get callback: exports one queue's parameters into 'details'.
 * Rates and burst are stored internally in bytes/s (bytes for burst) and
 * reported in bits, hence the 8ULL multipliers.  max-rate is omitted when
 * it equals min-rate (i.e. no separate ceiling was configured). */
3902 htb_class_get(const struct netdev *netdev OVS_UNUSED,
3903               const struct tc_queue *queue, struct smap *details)
3905     const struct htb_class *hc = htb_class_cast__(queue);
3907     smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3908     if (hc->min_rate != hc->max_rate) {
3909         smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3911     smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
3913     smap_add_format(details, "priority", "%u", hc->priority);
/* class_set callback: validates 'details', pushes the class to the kernel
 * (handle 1:queue_id+1 under the default class 1:0xfffe), and mirrors the
 * result into the in-memory queue map. */
3919 htb_class_set(struct netdev *netdev, unsigned int queue_id,
3920               const struct smap *details)
3922     struct htb_class hc;
3925     error = htb_parse_class_details__(netdev, details, &hc);
     /* Kernel class minors are 1-based; OVS queue IDs are 0-based. */
3930     error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3931                               tc_make_handle(1, 0xfffe), &hc);
3936     htb_update_queue__(netdev, queue_id, &hc);
/* class_delete callback: removes the kernel class for 'queue' and, on
 * success, drops the corresponding in-memory record. */
3941 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
3943     struct htb_class *hc = htb_class_cast__(queue);
3944     struct htb *htb = htb_get__(netdev);
3947     error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3949         hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* class_get_stats callback: thin wrapper that queries the kernel for the
 * queue's class and extracts only the statistics. */
3956 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3957                     struct netdev_queue_stats *stats)
3959     return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3960                              tc_make_handle(1, 0xfffe), NULL, stats);
/* class_dump_stats callback: parses one class message from a dump and, if
 * its handle maps to a valid OVS queue (major 1, minor within range),
 * invokes 'cb' with the 0-based queue ID and its statistics. */
3964 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3965                      const struct ofpbuf *nlmsg,
3966                      netdev_dump_queue_stats_cb *cb, void *aux)
3968     struct netdev_queue_stats stats;
3969     unsigned int handle, major, minor;
3972     error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3977     major = tc_get_major(handle);
3978     minor = tc_get_minor(handle);
3979     if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3980         (*cb)(minor - 1, &stats, aux);
/* Callback table binding the kernel "htb" qdisc to the OVS "linux-htb"
 * QoS type. */
3985 static const struct tc_ops tc_ops_htb = {
3986     "htb",                      /* linux_name */
3987     "linux-htb",                /* ovs_name */
3988     HTB_N_QUEUES,               /* n_queues */
3997     htb_class_get_stats,
3998     htb_class_dump_stats
4001 /* "linux-hfsc" traffic control class. */
4003 #define HFSC_N_QUEUES 0xf000
4011 struct tc_queue tc_queue;
/* Returns the hfsc state embedded in 'netdev_''s current tc object.
 * Valid only while an HFSC qdisc is installed on the device. */
4016 static struct hfsc *
4017 hfsc_get__(const struct netdev *netdev_)
4019     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4020     return CONTAINER_OF(netdev->tc, struct hfsc, tc);
/* Converts a generic 'tc_queue' back to its containing hfsc_class. */
4023 static struct hfsc_class *
4024 hfsc_class_cast__(const struct tc_queue *queue)
4026     return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Allocates and attaches a fresh hfsc state with the given 'max_rate'
 * (bytes/s) to 'netdev_', replacing whatever tc pointer was there. */
4030 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
4032     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4035     hfsc = xmalloc(sizeof *hfsc);
4036     tc_init(&hfsc->tc, &tc_ops_hfsc);
4037     hfsc->max_rate = max_rate;
4038     netdev->tc = &hfsc->tc;
/* Creates or updates the in-memory hfsc_class for 'queue_id' on 'netdev',
 * copying min/max rates from 'hc'.  Mirrors htb_update_queue__. */
4042 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
4043                     const struct hfsc_class *hc)
4047     struct hfsc_class *hcp;
4048     struct tc_queue *queue;
4050     hfsc = hfsc_get__(netdev);
4051     hash = hash_int(queue_id, 0);
4053     queue = tc_find_queue__(netdev, queue_id, hash);
4055         hcp = hfsc_class_cast__(queue);
     /* New queue: allocate and insert a record. */
4057         hcp = xmalloc(sizeof *hcp);
4058         queue = &hcp->tc_queue;
4059         queue->queue_id = queue_id;
4060         queue->created = time_msec();
4061         hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
4064     hcp->min_rate = hc->min_rate;
4065     hcp->max_rate = hc->max_rate;
/* Parses the nested TCA_OPTIONS of an HFSC class into 'class'.  Only
 * linear service curves (m1 == 0, d == 0) produced by OVS itself are
 * accepted; anything else is rejected with a rate-limited warning.
 * OVS encodes min-rate as the FSC's m2 and max-rate as the USC's m2. */
4069 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
4071     const struct tc_service_curve *rsc, *fsc, *usc;
4072     static const struct nl_policy tca_hfsc_policy[] = {
4074             .type = NL_A_UNSPEC,
4076             .min_len = sizeof(struct tc_service_curve),
4079             .type = NL_A_UNSPEC,
4081             .min_len = sizeof(struct tc_service_curve),
4084             .type = NL_A_UNSPEC,
4086             .min_len = sizeof(struct tc_service_curve),
4089     struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
4091     if (!nl_parse_nested(nl_options, tca_hfsc_policy,
4092                          attrs, ARRAY_SIZE(tca_hfsc_policy))) {
4093         VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
4097     rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
4098     fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
4099     usc = nl_attr_get(attrs[TCA_HFSC_USC]);
     /* Reject non-linear curves: OVS only ever writes m1 = d = 0. */
4101     if (rsc->m1 != 0 || rsc->d != 0 ||
4102         fsc->m1 != 0 || fsc->d != 0 ||
4103         usc->m1 != 0 || usc->d != 0) {
4104         VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4105                      "Non-linear service curves are not supported.");
     /* OVS always writes identical RSC and FSC; a mismatch means someone
      * else configured this class. */
4109     if (rsc->m2 != fsc->m2) {
4110         VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4111                      "Real-time service curves are not supported ");
4115     if (rsc->m2 > usc->m2) {
4116         VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4117                      "Min-rate service curve is greater than "
4118                      "the max-rate service curve.");
4122     class->min_rate = fsc->m2;
4123     class->max_rate = usc->m2;
/* Parses an HFSC class netlink message into an optional '*queue_id'
 * (derived from the class handle's minor number), '*options' and
 * '*stats'.  Returns 0 on success, otherwise a positive errno value. */
4128 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
4129                    struct hfsc_class *options,
4130                    struct netdev_queue_stats *stats)
4133     unsigned int handle;
4134     struct nlattr *nl_options;
4136     error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
4142         unsigned int major, minor;
4144         major = tc_get_major(handle);
4145         minor = tc_get_minor(handle);
     /* Kernel minors are 1-based; OVS queue IDs are 0-based. */
4146         if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
4147             *queue_id = minor - 1;
4154         error = hfsc_parse_tca_options__(nl_options, options);
/* Queries the kernel for HFSC class 'handle' under 'parent' on 'netdev'
 * and parses the reply into '*options' and '*stats' (either may be null).
 * Returns 0 on success, otherwise a positive errno value. */
4161 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
4162                    unsigned int parent, struct hfsc_class *options,
4163                    struct netdev_queue_stats *stats)
4166     struct ofpbuf *reply;
4168     error = tc_query_class(netdev, handle, parent, &reply);
4173     error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
4174     ofpbuf_delete(reply);
/* Fills 'class' from the qdisc-level "details" smap: max-rate is read in
 * bits/s and stored as bytes/s.  When max-rate is absent, the link's
 * negotiated speed (default 100 Mbps) is used instead.  The default class
 * gets min_rate == max_rate. */
4179 hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
4180                            struct hfsc_class *class)
4182     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4184     const char *max_rate_s;
4186     max_rate_s = smap_get(details, "max-rate");
4187     max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
4190         enum netdev_features current;
     /* No explicit rate: fall back to the device's current link speed. */
4192         netdev_linux_read_features(netdev);
4193         current = !netdev->get_features_error ? netdev->current : 0;
4194         max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
4197     class->min_rate = max_rate;
4198     class->max_rate = max_rate;
/* Fills 'class' from a per-queue "details" smap.  Rates arrive in bits/s
 * and are stored as bytes/s, clamped so that
 * 1 <= min_rate <= max_rate <= qdisc max_rate. */
4202 hfsc_parse_class_details__(struct netdev *netdev,
4203                            const struct smap *details,
4204                            struct hfsc_class * class)
4206     const struct hfsc *hfsc;
4207     uint32_t min_rate, max_rate;
4208     const char *min_rate_s, *max_rate_s;
4210     hfsc = hfsc_get__(netdev);
4211     min_rate_s = smap_get(details, "min-rate");
4212     max_rate_s = smap_get(details, "max-rate");
4214     min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
4215     min_rate = MAX(min_rate, 1);
4216     min_rate = MIN(min_rate, hfsc->max_rate);
4218     max_rate = (max_rate_s
4219                 ? strtoull(max_rate_s, NULL, 10) / 8
4221     max_rate = MAX(max_rate, min_rate);
4222     max_rate = MIN(max_rate, hfsc->max_rate);
4224     class->min_rate = min_rate;
4225     class->max_rate = max_rate;
4230 /* Create an HFSC qdisc.
4232  * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
4234 hfsc_setup_qdisc__(struct netdev * netdev)
4236     struct tcmsg *tcmsg;
4237     struct ofpbuf request;
4238     struct tc_hfsc_qopt opt;
     /* Clear any existing root qdisc first. */
4240     tc_del_qdisc(netdev);
4242     tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
4243                             NLM_F_EXCL | NLM_F_CREATE, &request);
4249     tcmsg->tcm_handle = tc_make_handle(1, 0);
4250     tcmsg->tcm_parent = TC_H_ROOT;
4252     memset(&opt, 0, sizeof opt);
4255     nl_msg_put_string(&request, TCA_KIND, "hfsc");
4256     nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
4258     return tc_transact(&request, NULL);
4261 /* Create an HFSC class.
4263  * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
4264  * sc rate <min_rate> ul rate <max_rate>" */
4266 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
4267                    unsigned int parent, struct hfsc_class *class)
4271     struct tcmsg *tcmsg;
4272     struct ofpbuf request;
4273     struct tc_service_curve min, max;
4275     tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
4281     tcmsg->tcm_handle = handle;
4282     tcmsg->tcm_parent = parent;
     /* Linear service curves: only m2 (steady-state rate) is non-zero. */
4286     min.m2 = class->min_rate;
4290     max.m2 = class->max_rate;
4292     nl_msg_put_string(&request, TCA_KIND, "hfsc");
4293     opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
     /* RSC and FSC both carry min_rate; USC carries the max_rate cap. */
4294     nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
4295     nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
4296     nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
4297     nl_msg_end_nested(&request, opt_offset);
4299     error = tc_transact(&request, NULL);
4301         VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
4302                      "min-rate %ubps, max-rate %ubps (%s)",
4303                      netdev_get_name(netdev),
4304                      tc_get_major(handle), tc_get_minor(handle),
4305                      tc_get_major(parent), tc_get_minor(parent),
4306                      class->min_rate, class->max_rate, ovs_strerror(error));
/* tc_install callback for "linux-hfsc": creates the root HFSC qdisc and
 * its default class from 'details', then records the hfsc state. */
4313 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
4316     struct hfsc_class class;
4318     error = hfsc_setup_qdisc__(netdev);
4324     hfsc_parse_qdisc_details__(netdev, details, &class);
4325     error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4326                                tc_make_handle(1, 0), &class);
4332     hfsc_install__(netdev, class.max_rate);
/* tc_load callback for "linux-hfsc": rebuilds OVS's view of an existing
 * HFSC qdisc from the kernel's default class and a full class dump. */
4337 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4340     struct queue_dump_state state;
4341     struct hfsc_class hc;
4344     hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
4345     hfsc_install__(netdev, hc.max_rate);
4347     if (!start_queue_dump(netdev, &state)) {
4351     while (nl_dump_next(&state.dump, &msg, &state.buf)) {
4352         unsigned int queue_id;
4354         if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
4355             hfsc_update_queue__(netdev, queue_id, &hc);
4359     finish_queue_dump(&state);
/* tc_destroy callback for "linux-hfsc": frees every queued hfsc_class and
 * the hfsc wrapper itself. */
4364 hfsc_tc_destroy(struct tc *tc)
4367     struct hfsc_class *hc, *next;
4369     hfsc = CONTAINER_OF(tc, struct hfsc, tc);
4371     HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
4372         hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* qdisc_get callback: reports max-rate in bits/s from the cached
 * bytes/s figure. */
4381 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
4383     const struct hfsc *hfsc;
4384     hfsc = hfsc_get__(netdev);
4385     smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
/* qdisc_set callback: re-applies the default class (1:0xfffe) with new
 * parameters and updates the cached max_rate on success. */
4390 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
4393     struct hfsc_class class;
4395     hfsc_parse_qdisc_details__(netdev, details, &class);
4396     error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4397                                tc_make_handle(1, 0), &class);
4400         hfsc_get__(netdev)->max_rate = class.max_rate;
/* class_get callback: exports one queue's min/max rates (bits/s) into
 * 'details'; max-rate is omitted when equal to min-rate. */
4407 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
4408                const struct tc_queue *queue, struct smap *details)
4410     const struct hfsc_class *hc;
4412     hc = hfsc_class_cast__(queue);
4413     smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
4414     if (hc->min_rate != hc->max_rate) {
4415         smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
/* class_set callback: validates 'details', pushes the class to the kernel
 * (handle 1:queue_id+1 under the default class), and mirrors it into the
 * in-memory queue map. */
4421 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
4422                const struct smap *details)
4425     struct hfsc_class class;
4427     error = hfsc_parse_class_details__(netdev, details, &class);
4432     error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
4433                                tc_make_handle(1, 0xfffe), &class);
4438     hfsc_update_queue__(netdev, queue_id, &class);
/* class_delete callback: removes the kernel class for 'queue' and, on
 * success, drops the corresponding in-memory record. */
4443 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
4447     struct hfsc_class *hc;
4449     hc = hfsc_class_cast__(queue);
4450     hfsc = hfsc_get__(netdev);
4452     error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
4454         hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* class_get_stats callback: queries the kernel class for 'queue' and
 * extracts only its statistics. */
4461 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
4462                      struct netdev_queue_stats *stats)
4464     return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
4465                               tc_make_handle(1, 0xfffe), NULL, stats);
/* class_dump_stats callback: parses one class message from a dump and,
 * if its handle maps to a valid OVS queue, invokes 'cb' with the 0-based
 * queue ID and its statistics.  Mirrors htb_class_dump_stats. */
4469 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
4470                       const struct ofpbuf *nlmsg,
4471                       netdev_dump_queue_stats_cb *cb, void *aux)
4473     struct netdev_queue_stats stats;
4474     unsigned int handle, major, minor;
4477     error = tc_parse_class(nlmsg, &handle, NULL, &stats);
4482     major = tc_get_major(handle);
4483     minor = tc_get_minor(handle);
4484     if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
4485         (*cb)(minor - 1, &stats, aux);
/* Callback table binding the kernel "hfsc" qdisc to the OVS "linux-hfsc"
 * QoS type. */
4490 static const struct tc_ops tc_ops_hfsc = {
4491     "hfsc",                     /* linux_name */
4492     "linux-hfsc",               /* ovs_name */
4493     HFSC_N_QUEUES,              /* n_queues */
4494     hfsc_tc_install,            /* tc_install */
4495     hfsc_tc_load,               /* tc_load */
4496     hfsc_tc_destroy,            /* tc_destroy */
4497     hfsc_qdisc_get,             /* qdisc_get */
4498     hfsc_qdisc_set,             /* qdisc_set */
4499     hfsc_class_get,             /* class_get */
4500     hfsc_class_set,             /* class_set */
4501     hfsc_class_delete,          /* class_delete */
4502     hfsc_class_get_stats,       /* class_get_stats */
4503     hfsc_class_dump_stats       /* class_dump_stats */
4506 /* "linux-default" traffic control class.
4508 * This class represents the default, unnamed Linux qdisc. It corresponds to
4509 * the "" (empty string) QoS type in the OVS database. */
/* Marks 'netdev_' as using the default (unnamed) Linux qdisc by pointing
 * its tc state at a shared, immutable singleton. */
4512 default_install__(struct netdev *netdev_)
4514     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4515     static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
4517     /* Nothing but a tc class implementation is allowed to write to a tc. This
4518      * class never does that, so we can legitimately use a const tc object. */
4519     netdev->tc = CONST_CAST(struct tc *, &tc);
/* tc_install callback for the default qdisc: nothing to configure in the
 * kernel, just record the state. */
4523 default_tc_install(struct netdev *netdev,
4524                    const struct smap *details OVS_UNUSED)
4526     default_install__(netdev);
/* tc_load callback for the default qdisc: just record the state. */
4531 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4533     default_install__(netdev);
/* Callback table for the default (unnamed) qdisc; all class operations
 * are unsupported, hence NULL. */
4537 static const struct tc_ops tc_ops_default = {
4538     NULL,                       /* linux_name */
4543     NULL,                       /* tc_destroy */
4544     NULL,                       /* qdisc_get */
4545     NULL,                       /* qdisc_set */
4546     NULL,                       /* class_get */
4547     NULL,                       /* class_set */
4548     NULL,                       /* class_delete */
4549     NULL,                       /* class_get_stats */
4550     NULL                        /* class_dump_stats */
4553 /* "linux-other" traffic control class.
/* tc_load callback for qdiscs OVS does not recognize ("linux-other"):
 * records a shared, immutable placeholder tc state on the netdev. */
4558 other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
4560     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4561     static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
4563     /* Nothing but a tc class implementation is allowed to write to a tc. This
4564      * class never does that, so we can legitimately use a const tc object. */
4565     netdev->tc = CONST_CAST(struct tc *, &tc);
/* Callback table for unrecognized qdiscs; read-only placeholder with no
 * configurable operations. */
4569 static const struct tc_ops tc_ops_other = {
4570     NULL,                       /* linux_name */
4571     "linux-other",              /* ovs_name */
4573     NULL,                       /* tc_install */
4575     NULL,                       /* tc_destroy */
4576     NULL,                       /* qdisc_get */
4577     NULL,                       /* qdisc_set */
4578     NULL,                       /* class_get */
4579     NULL,                       /* class_set */
4580     NULL,                       /* class_delete */
4581     NULL,                       /* class_get_stats */
4582     NULL                        /* class_dump_stats */
4585 /* Traffic control. */
4587 /* Number of kernel "tc" ticks per second. */
4588 static double ticks_per_s;
4590 /* Number of kernel "jiffies" per second. This is used for the purpose of
4591 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
4592 * one jiffy's worth of data.
4594 * There are two possibilities here:
4596 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
4597 * approximate range of 100 to 1024. That means that we really need to
4598 * make sure that the qdisc can buffer that much data.
4600 * - 'buffer_hz' is an absurdly large number. That means that the kernel
4601 * has finely granular timers and there's no need to fudge additional room
4602 * for buffers. (There's no extra effort needed to implement that: the
4603 * large 'buffer_hz' is used as a divisor, so practically any number will
4604 * come out as 0 in the division. Small integer results in the case of
4605 * really high dividends won't have any real effect anyhow.)
4607 static unsigned int buffer_hz;
/* Composes the tc handle 'major':'minor' (major in the upper 16 bits,
 * minor in the lower 16 bits). */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    /* Equivalent to TC_H_MAKE(major << 16, minor): mask each half into
     * its own 16-bit field before combining. */
    return ((major << 16) & TC_H_MAJ_MASK) | (minor & TC_H_MIN_MASK);
}
/* Extracts the major number (upper 16 bits) from tc handle 'handle'. */
static unsigned int
tc_get_major(unsigned int handle)
{
    /* Equivalent to TC_H_MAJ(handle) >> 16. */
    return (handle & TC_H_MAJ_MASK) >> 16;
}
/* Extracts the minor number (lower 16 bits) from tc handle 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    /* Equivalent to TC_H_MIN(handle). */
    return handle & TC_H_MIN_MASK;
}
/* Allocates and initializes a rtnetlink tc request of the given 'type'
 * (RTM_NEWQDISC etc.) and 'flags' for 'netdev' in '*request'.  Returns a
 * pointer to the embedded tcmsg; the caller must set tcm_handle and
 * tcm_parent and eventually pass 'request' to tc_transact(). */
4630 static struct tcmsg *
4631 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
4632                 struct ofpbuf *request)
4634     struct tcmsg *tcmsg;
4638     error = get_ifindex(netdev, &ifindex);
4643     ofpbuf_init(request, 512);
4644     nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
4645     tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
4646     tcmsg->tcm_family = AF_UNSPEC;
4647     tcmsg->tcm_ifindex = ifindex;
4648     /* Caller should fill in tcmsg->tcm_handle. */
4649     /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' on the NETLINK_ROUTE socket, storing any reply in
 * '*replyp', and releases 'request''s buffer.  Returns 0 on success,
 * otherwise a positive errno value. */
4655 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
4657     int error = nl_transact(NETLINK_ROUTE, request, replyp);
4658     ofpbuf_uninit(request);
4662 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
4663  * policing configuration.
4665  * This function is equivalent to running the following when 'add' is true:
4666  *     /sbin/tc qdisc add dev <devname> handle ffff: ingress
4668  * This function is equivalent to running the following when 'add' is false:
4669  *     /sbin/tc qdisc del dev <devname> handle ffff: ingress
4671  * The configuration and stats may be seen with the following command:
4672  *     /sbin/tc -s qdisc show dev <devname>
4674  * Returns 0 if successful, otherwise a positive errno value.
4677 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
4679     struct ofpbuf request;
4680     struct tcmsg *tcmsg;
4682     int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
4683     int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
4685     tcmsg = tc_make_request(netdev, type, flags, &request);
4689     tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
4690     tcmsg->tcm_parent = TC_H_INGRESS;
4691     nl_msg_put_string(&request, TCA_KIND, "ingress");
4692     nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
4694     error = tc_transact(&request, NULL);
4696         /* If we're deleting the qdisc, don't worry about some of the
4697          * error conditions. */
4698         if (!add && (error == ENOENT || error == EINVAL)) {
4707 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
4710  * This function is equivalent to running:
4711  *     /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
4712  *              basic police rate <kbits_rate>kbit burst <kbits_burst>k
4715  * The configuration and stats may be seen with the following command:
4716  *     /sbin/tc -s filter show dev <devname> parent ffff:
4718  * Returns 0 if successful, otherwise a positive errno value.
4721 tc_add_policer(struct netdev *netdev,
4722                uint32_t kbits_rate, uint32_t kbits_burst)
4724     struct tc_police tc_police;
4725     struct ofpbuf request;
4726     struct tcmsg *tcmsg;
4727     size_t basic_offset;
4728     size_t police_offset;
     /* Drop packets that exceed the rate/burst. */
4732     memset(&tc_police, 0, sizeof tc_police);
4733     tc_police.action = TC_POLICE_SHOT;
4734     tc_police.mtu = mtu;
4735     tc_fill_rate(&tc_police.rate, ((uint64_t) kbits_rate * 1000)/8, mtu);
4737     /* The following appears wrong in two ways:
4739      * - tc_bytes_to_ticks() should take "bytes" as quantity for both of its
4740      *   arguments (or at least consistently "bytes" as both or "bits" as
4741      *   both), but this supplies bytes for the first argument and bits for the
4744      * - In networking a kilobit is usually 1000 bits but this uses 1024 bits.
4746      * However if you "fix" those problems then "tc filter show ..." shows
4747      * "125000b", meaning 125,000 bits, when OVS configures it for 1000 kbit ==
4748      * 1,000,000 bits, whereas this actually ends up doing the right thing from
4749      * tc's point of view.  Whatever. */
4750     tc_police.burst = tc_bytes_to_ticks(
4751         tc_police.rate.rate, MIN(UINT32_MAX / 1024, kbits_burst) * 1024);
4753     tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
4754                             NLM_F_EXCL | NLM_F_CREATE, &request);
     /* Attach under the ffff: ingress qdisc, priority 49, all protocols. */
4758     tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
4759     tcmsg->tcm_info = tc_make_handle(49,
4760                                      (OVS_FORCE uint16_t) htons(ETH_P_ALL));
4762     nl_msg_put_string(&request, TCA_KIND, "basic");
4763     basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4764     police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
4765     nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
4766     tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
4767     nl_msg_end_nested(&request, police_offset);
4768     nl_msg_end_nested(&request, basic_offset);
4770     error = tc_transact(&request, NULL);
/* NOTE(review): the function header for this block is not visible in this
 * extract — presumably this is the one-shot reader of /proc/net/psched
 * that initializes 'ticks_per_s' and 'buffer_hz'; confirm against the
 * full source. */
4781     /* The values in psched are not individually very meaningful, but they are
4782      * important.  The tables below show some values seen in the wild.
4786      *   - "c" has always been a constant 1000000 since at least Linux 2.4.14.
4787      *     (Before that, there are hints that it was 1000000000.)
4789      *   - "d" can be unrealistically large, see the comment on 'buffer_hz'
4793      * -----------------------------------
4794      * [1] 000c8000 000f4240 000f4240 00000064
4795      * [2] 000003e8 00000400 000f4240 3b9aca00
4796      * [3] 000003e8 00000400 000f4240 3b9aca00
4797      * [4] 000003e8 00000400 000f4240 00000064
4798      * [5] 000003e8 00000040 000f4240 3b9aca00
4799      * [6] 000003e8 00000040 000f4240 000000f9
4801      *            a         b          c             d ticks_per_s     buffer_hz
4802      *     ------- --------- ---------- ------------- ----------- -------------
4803      * [1] 819,200 1,000,000  1,000,000           100     819,200           100
4804      * [2]   1,000     1,024  1,000,000 1,000,000,000     976,562 1,000,000,000
4805      * [3]   1,000     1,024  1,000,000 1,000,000,000     976,562 1,000,000,000
4806      * [4]   1,000     1,024  1,000,000           100     976,562           100
4807      * [5]   1,000        64  1,000,000 1,000,000,000  15,625,000 1,000,000,000
4808      * [6]   1,000        64  1,000,000           249  15,625,000           249
4810      * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
4811      * [2] 2.6.26-1-686-bigmem from Debian lenny
4812      * [3] 2.6.26-2-sparc64 from Debian lenny
4813      * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
4814      * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
4815      * [6] 2.6.34 from kernel.org on KVM
4817     static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4818     static const char fn[] = "/proc/net/psched";
4819     unsigned int a, b, c, d;
     /* Only parse /proc/net/psched once per process. */
4822     if (!ovsthread_once_start(&once)) {
4829     stream = fopen(fn, "r");
4831         VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
4835     if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
4836         VLOG_WARN("%s: read failed", fn);
4840     VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
4844         VLOG_WARN("%s: invalid scheduler parameters", fn);
     /* ticks_per_s = a * c / b per the tables above. */
4848     ticks_per_s = (double) a * c / b;
4852         VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
4855     VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
4858     ovsthread_once_done(&once);
4861 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
4862  * rate of 'rate' bytes per second. */
4864 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
4867     return (rate * ticks) / ticks_per_s;
4870 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
4871  * rate of 'rate' bytes per second. */
4873 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
     /* Guard against division by zero when 'rate' is 0. */
4876     return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
4879 /* Returns the number of bytes that need to be reserved for qdisc buffering at
4880  * a transmission rate of 'rate' bytes per second. */
4882 tc_buffer_per_jiffy(unsigned int rate)
4885     return rate / buffer_hz;
4888 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
4889  * e.g. "htb", into '*kind' (if it is nonnull).  If 'options' is nonnull,
4890  * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
4891  * stores NULL into it if it is absent.
4893  * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
4896  * Returns 0 if successful, otherwise a positive errno value. */
4898 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
4899                struct nlattr **options)
4901     static const struct nl_policy tca_policy[] = {
4902         [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
4903         [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
4905     struct nlattr *ta[ARRAY_SIZE(tca_policy)];
     /* Attributes follow the netlink header plus the fixed tcmsg. */
4907     if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4908                          tca_policy, ta, ARRAY_SIZE(ta))) {
4909         VLOG_WARN_RL(&rl, "failed to parse qdisc message");
4914         *kind = nl_attr_get_string(ta[TCA_KIND]);
4918         *options = ta[TCA_OPTIONS];
4933 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
4934  * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
4935  * into '*options', and its queue statistics into '*stats'.  Any of the output
4936  * arguments may be null.
4938  * Returns 0 if successful, otherwise a positive errno value. */
4940 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
4941                struct nlattr **options, struct netdev_queue_stats *stats)
4943     static const struct nl_policy tca_policy[] = {
4944         [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
4945         [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
4947     struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4949     if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4950                          tca_policy, ta, ARRAY_SIZE(ta))) {
4951         VLOG_WARN_RL(&rl, "failed to parse class message");
     /* The class handle lives in the fixed tcmsg header, not an attribute. */
4956         struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
4957         *handlep = tc->tcm_handle;
4961         *options = ta[TCA_OPTIONS];
4965         const struct gnet_stats_queue *gsq;
4966         struct gnet_stats_basic gsb;
4968         static const struct nl_policy stats_policy[] = {
4969             [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
4970                                   .min_len = sizeof gsb },
4971             [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
4972                                   .min_len = sizeof *gsq },
4974         struct nlattr *sa[ARRAY_SIZE(stats_policy)];
4976         if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
4977                              sa, ARRAY_SIZE(sa))) {
4978             VLOG_WARN_RL(&rl, "failed to parse class stats");
4982         /* Alignment issues screw up the length of struct gnet_stats_basic on
4983          * some arch/bitsize combinations.  Newer versions of Linux have a
4984          * struct gnet_stats_basic_packed, but we can't depend on that.  The
4985          * easiest thing to do is just to make a copy. */
4986         memset(&gsb, 0, sizeof gsb);
4987         memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
4988                MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
4989         stats->tx_bytes = gsb.bytes;
4990         stats->tx_packets = gsb.packets;
4992         gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
4993         stats->tx_errors = gsq->drops;
     /* On parse failure the caller still gets zeroed stats. */
5003         memset(stats, 0, sizeof *stats);
5008 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
5011 tc_query_class(const struct netdev *netdev,
5012                unsigned int handle, unsigned int parent,
5013                struct ofpbuf **replyp)
5015     struct ofpbuf request;
5016     struct tcmsg *tcmsg;
     /* NLM_F_ECHO asks the kernel to reflect the class back to us. */
5019     tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
5023     tcmsg->tcm_handle = handle;
5024     tcmsg->tcm_parent = parent;
5026     error = tc_transact(&request, replyp);
5028         VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
5029                      netdev_get_name(netdev),
5030                      tc_get_major(handle), tc_get_minor(handle),
5031                      tc_get_major(parent), tc_get_minor(parent),
5032                      ovs_strerror(error));
5037 /* Equivalent to "tc class del dev <name> handle <handle>". */
5039 tc_delete_class(const struct netdev *netdev, unsigned int handle)
5041     struct ofpbuf request;
5042     struct tcmsg *tcmsg;
5045     tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
5049     tcmsg->tcm_handle = handle;
5050     tcmsg->tcm_parent = 0;
5052     error = tc_transact(&request, NULL);
5054         VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
5055                      netdev_get_name(netdev),
5056                      tc_get_major(handle), tc_get_minor(handle),
5057                      ovs_strerror(error));
5062 /* Equivalent to "tc qdisc del dev <name> root". */
5064 tc_del_qdisc(struct netdev *netdev_)
5066     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5067     struct ofpbuf request;
5068     struct tcmsg *tcmsg;
5071     tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
5075     tcmsg->tcm_handle = tc_make_handle(1, 0);
5076     tcmsg->tcm_parent = TC_H_ROOT;
5078     error = tc_transact(&request, NULL);
5079     if (error == EINVAL) {
5080         /* EINVAL probably means that the default qdisc was in use, in which
5081          * case we've accomplished our purpose. */
     /* Also tear down OVS's cached tc state for the device. */
5084     if (!error && netdev->tc) {
5085         if (netdev->tc->ops->tc_destroy) {
5086             netdev->tc->ops->tc_destroy(netdev->tc);
/* Returns true if RTM_GETQDISC is safe to use on this kernel.  Kernels
 * before 2.6.35 can OOPS on certain RTM_GETQDISC requests (see the big
 * comment in tc_query_qdisc()), so the kernel version from uname() is
 * checked once and the verdict cached. */
5094 getqdisc_is_safe(void)
5096     static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5097     static bool safe = false;
5099     if (ovsthread_once_start(&once)) {
5100         struct utsname utsname;
5103         if (uname(&utsname) == -1) {
5104             VLOG_WARN("uname failed (%s)", ovs_strerror(errno));
5105         } else if (!ovs_scan(utsname.release, "%d.%d", &major, &minor)) {
5106             VLOG_WARN("uname reported bad OS release (%s)", utsname.release);
5107         } else if (major < 2 || (major == 2 && minor < 35)) {
5108             VLOG_INFO("disabling unsafe RTM_GETQDISC in Linux kernel %s",
5113         ovsthread_once_done(&once);
5118 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
5119  * kernel to determine what they are.  Returns 0 if successful, otherwise a
5120  * positive errno value. */
5122 tc_query_qdisc(const struct netdev *netdev_)
5124     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5125     struct ofpbuf request, *qdisc;
5126     const struct tc_ops *ops;
5127     struct tcmsg *tcmsg;
5135     /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
5136      * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
5137      * 2.6.35 without that fix backported to it.
5139      * To avoid the OOPS, we must not make a request that would attempt to dump
5140      * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
5141      * few others.  There are a few ways that I can see to do this, but most of
5142      * them seem to be racy (and if you lose the race the kernel OOPSes).  The
5143      * technique chosen here is to assume that any non-default qdisc that we
5144      * create will have a class with handle 1:0.  The built-in qdiscs only have
5145      * a class with handle 0:0.
5147      * On Linux 2.6.35+ we use the straightforward method because it allows us
5148      * to handle non-builtin qdiscs without handle 1:0 (e.g. codel).  However,
5149      * in such a case we get no response at all from the kernel (!) if a
5150      * builtin qdisc is in use (which is later caught by "!error &&
5151      * !qdisc->size"). */
5152     tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
5156     tcmsg->tcm_handle = tc_make_handle(getqdisc_is_safe() ? 0 : 1, 0);
5157     tcmsg->tcm_parent = getqdisc_is_safe() ? TC_H_ROOT : 0;
5159     /* Figure out what tc class to instantiate. */
5160     error = tc_transact(&request, &qdisc);
5161     if (!error && qdisc->size) {
5164         error = tc_parse_qdisc(qdisc, &kind, NULL);
5166             ops = &tc_ops_other;
5168             ops = tc_lookup_linux_name(kind);
     /* Known to the kernel but not to OVS: treat as "linux-other". */
5170                 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
5171                 VLOG_DBG_RL(&rl2, "unknown qdisc \"%s\"", kind);
5173                 ops = &tc_ops_other;
5176     } else if ((!error && !qdisc->size) || error == ENOENT) {
5177         /* Either it's a built-in qdisc, or (on Linux pre-2.6.35) it's a qdisc
5178          * set up by some other entity that doesn't have a handle 1:0.  We will
5179          * assume that it's the system default qdisc. */
5180         ops = &tc_ops_default;
5183         /* Who knows?  Maybe the device got deleted. */
5184         VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
5185                      netdev_get_name(netdev_), ovs_strerror(error));
5186         ops = &tc_ops_other;
5189     /* Instantiate it. */
5190     load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
5191     ovs_assert((load_error == 0) == (netdev->tc != NULL));
5192     ofpbuf_delete(qdisc);
5194     return error ? error : load_error;
5197 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
5198 approximate the time to transmit packets of various lengths. For an MTU of
5199 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
5200 represents two possible packet lengths; for a MTU of 513 through 1024, four
5201 possible lengths; and so on.
5203 Returns, for the specified 'mtu', the number of bits that packet lengths
5204 need to be shifted right to fit within such a 256-entry table. */
5206 tc_calc_cell_log(unsigned int mtu)
5211 mtu = ETH_PAYLOAD_MAX;
5213 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
5215 for (cell_log = 0; mtu >= 256; cell_log++) {
5222 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
5225 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
5227 memset(rate, 0, sizeof *rate);
5228 rate->cell_log = tc_calc_cell_log(mtu);
5229 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
5230 /* rate->cell_align = 0; */ /* distro headers. */
5231 rate->mpu = ETH_TOTAL_MIN;
/* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
 * attribute of the specified "type".
 *
 * See tc_calc_cell_log() above for a description of "rtab"s. */
static void
tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
{
    uint32_t *rtab;
    unsigned int i;

    rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
    for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
        /* Each bucket 'i' covers packet lengths up to (i+1) << cell_log,
         * clamped below by the minimum packet unit. */
        unsigned packet_size = (i + 1) << rate->cell_log;
        if (packet_size < rate->mpu) {
            packet_size = rate->mpu;
        }
        rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
    }
}
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * 0 is fine.) */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    /* Never let the burst drop below one jiffy's worth of data plus one
     * maximum-size packet, or the rate cannot actually be sustained. */
    unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
    return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
}
/* Linux-only functions declared in netdev-linux.h  */
5268 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
5269 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
5271 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
5272 const char *flag_name, bool enable)
5274 const char *netdev_name = netdev_get_name(netdev);
5275 struct ethtool_value evalue;
5279 COVERAGE_INC(netdev_get_ethtool);
5280 memset(&evalue, 0, sizeof evalue);
5281 error = netdev_linux_do_ethtool(netdev_name,
5282 (struct ethtool_cmd *)&evalue,
5283 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5288 COVERAGE_INC(netdev_set_ethtool);
5289 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
5290 error = netdev_linux_do_ethtool(netdev_name,
5291 (struct ethtool_cmd *)&evalue,
5292 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
5297 COVERAGE_INC(netdev_get_ethtool);
5298 memset(&evalue, 0, sizeof evalue);
5299 error = netdev_linux_do_ethtool(netdev_name,
5300 (struct ethtool_cmd *)&evalue,
5301 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5306 if (new_flags != evalue.data) {
5307 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
5308 "device %s failed", enable ? "enable" : "disable",
5309 flag_name, netdev_name);
/* Utility functions. */
5318 /* Copies 'src' into 'dst', performing format conversion in the process. */
5320 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
5321 const struct rtnl_link_stats *src)
5323 dst->rx_packets = src->rx_packets;
5324 dst->tx_packets = src->tx_packets;
5325 dst->rx_bytes = src->rx_bytes;
5326 dst->tx_bytes = src->tx_bytes;
5327 dst->rx_errors = src->rx_errors;
5328 dst->tx_errors = src->tx_errors;
5329 dst->rx_dropped = src->rx_dropped;
5330 dst->tx_dropped = src->tx_dropped;
5331 dst->multicast = src->multicast;
5332 dst->collisions = src->collisions;
5333 dst->rx_length_errors = src->rx_length_errors;
5334 dst->rx_over_errors = src->rx_over_errors;
5335 dst->rx_crc_errors = src->rx_crc_errors;
5336 dst->rx_frame_errors = src->rx_frame_errors;
5337 dst->rx_fifo_errors = src->rx_fifo_errors;
5338 dst->rx_missed_errors = src->rx_missed_errors;
5339 dst->tx_aborted_errors = src->tx_aborted_errors;
5340 dst->tx_carrier_errors = src->tx_carrier_errors;
5341 dst->tx_fifo_errors = src->tx_fifo_errors;
5342 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
5343 dst->tx_window_errors = src->tx_window_errors;
5346 /* Copies 'src' into 'dst', performing format conversion in the process. */
5348 netdev_stats_from_rtnl_link_stats64(struct netdev_stats *dst,
5349 const struct rtnl_link_stats64 *src)
5351 dst->rx_packets = src->rx_packets;
5352 dst->tx_packets = src->tx_packets;
5353 dst->rx_bytes = src->rx_bytes;
5354 dst->tx_bytes = src->tx_bytes;
5355 dst->rx_errors = src->rx_errors;
5356 dst->tx_errors = src->tx_errors;
5357 dst->rx_dropped = src->rx_dropped;
5358 dst->tx_dropped = src->tx_dropped;
5359 dst->multicast = src->multicast;
5360 dst->collisions = src->collisions;
5361 dst->rx_length_errors = src->rx_length_errors;
5362 dst->rx_over_errors = src->rx_over_errors;
5363 dst->rx_crc_errors = src->rx_crc_errors;
5364 dst->rx_frame_errors = src->rx_frame_errors;
5365 dst->rx_fifo_errors = src->rx_fifo_errors;
5366 dst->rx_missed_errors = src->rx_missed_errors;
5367 dst->tx_aborted_errors = src->tx_aborted_errors;
5368 dst->tx_carrier_errors = src->tx_carrier_errors;
5369 dst->tx_fifo_errors = src->tx_fifo_errors;
5370 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
5371 dst->tx_window_errors = src->tx_window_errors;
5375 get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
5377 struct ofpbuf request;
5378 struct ofpbuf *reply;
5381 ofpbuf_init(&request, 0);
5382 nl_msg_put_nlmsghdr(&request,
5383 sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
5384 RTM_GETLINK, NLM_F_REQUEST);
5385 ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
5386 nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
5387 error = nl_transact(NETLINK_ROUTE, &request, &reply);
5388 ofpbuf_uninit(&request);
5393 if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
5394 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS64);
5395 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats64)) {
5396 netdev_stats_from_rtnl_link_stats64(stats, nl_attr_get(a));
5399 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS);
5400 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
5401 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
5404 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
5409 VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");
5414 ofpbuf_delete(reply);
5419 get_flags(const struct netdev *dev, unsigned int *flags)
5425 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
5427 *flags = ifr.ifr_flags;
5433 set_flags(const char *name, unsigned int flags)
5437 ifr.ifr_flags = flags;
5438 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
5442 do_get_ifindex(const char *netdev_name)
5447 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5448 COVERAGE_INC(netdev_get_ifindex);
5450 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
5452 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
5453 netdev_name, ovs_strerror(error));
5456 return ifr.ifr_ifindex;
5460 get_ifindex(const struct netdev *netdev_, int *ifindexp)
5462 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5464 if (!(netdev->cache_valid & VALID_IFINDEX)) {
5465 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
5468 netdev->get_ifindex_error = -ifindex;
5469 netdev->ifindex = 0;
5471 netdev->get_ifindex_error = 0;
5472 netdev->ifindex = ifindex;
5474 netdev->cache_valid |= VALID_IFINDEX;
5477 *ifindexp = netdev->ifindex;
5478 return netdev->get_ifindex_error;
5482 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
5488 memset(&ifr, 0, sizeof ifr);
5489 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5490 COVERAGE_INC(netdev_get_hwaddr);
5491 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
5493 /* ENODEV probably means that a vif disappeared asynchronously and
5494 * hasn't been removed from the database yet, so reduce the log level
5495 * to INFO for that case. */
5496 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
5497 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
5498 netdev_name, ovs_strerror(error));
5501 hwaddr_family = ifr.ifr_hwaddr.sa_family;
5502 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
5503 VLOG_WARN("%s device has unknown hardware address family %d",
5504 netdev_name, hwaddr_family);
5506 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
5511 set_etheraddr(const char *netdev_name,
5512 const uint8_t mac[ETH_ADDR_LEN])
5517 memset(&ifr, 0, sizeof ifr);
5518 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5519 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
5520 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
5521 COVERAGE_INC(netdev_set_hwaddr);
5522 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
5524 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
5525 netdev_name, ovs_strerror(error));
5531 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
5532 int cmd, const char *cmd_name)
5537 memset(&ifr, 0, sizeof ifr);
5538 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
5539 ifr.ifr_data = (caddr_t) ecmd;
5542 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
5544 if (error != EOPNOTSUPP) {
5545 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
5546 "failed: %s", cmd_name, name, ovs_strerror(error));
5548 /* The device doesn't support this operation. That's pretty
5549 * common, so there's no point in logging anything. */
5556 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
5557 int cmd, const char *cmd_name)
5562 ifr.ifr_addr.sa_family = AF_INET;
5563 error = af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
5565 const struct sockaddr_in *sin = ALIGNED_CAST(struct sockaddr_in *,
5567 *ip = sin->sin_addr;
5572 /* Returns an AF_PACKET raw socket or a negative errno value. */
5574 af_packet_sock(void)
5576 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5579 if (ovsthread_once_start(&once)) {
5580 sock = socket(AF_PACKET, SOCK_RAW, 0);
5582 int error = set_nonblocking(sock);
5589 VLOG_ERR("failed to create packet socket: %s",
5590 ovs_strerror(errno));
5592 ovsthread_once_done(&once);