2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2015 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
23 #include <arpa/inet.h>
25 #include <linux/filter.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <sys/types.h>
37 #include <sys/ioctl.h>
38 #include <sys/socket.h>
39 #include <sys/utsname.h>
40 #include <netpacket/packet.h>
42 #include <net/if_arp.h>
43 #include <net/if_packet.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
52 #include "dp-packet.h"
53 #include "dpif-netlink.h"
54 #include "dpif-netdev.h"
55 #include "dynamic-string.h"
56 #include "fatal-signal.h"
59 #include "netdev-provider.h"
60 #include "netdev-vport.h"
61 #include "netlink-notifier.h"
62 #include "netlink-socket.h"
65 #include "openflow/openflow.h"
66 #include "ovs-atomic.h"
68 #include "poll-loop.h"
69 #include "rtnetlink.h"
71 #include "socket-util.h"
74 #include "unaligned.h"
75 #include "openvswitch/vlog.h"
VLOG_DEFINE_THIS_MODULE(netdev_linux);
/* Coverage counters: incremented once per corresponding operation so that
 * "ovs-appctl coverage/show" can report how often each happens. */
COVERAGE_DEFINE(netdev_set_policing);
COVERAGE_DEFINE(netdev_arp_lookup);
COVERAGE_DEFINE(netdev_get_ifindex);
COVERAGE_DEFINE(netdev_get_hwaddr);
COVERAGE_DEFINE(netdev_set_hwaddr);
COVERAGE_DEFINE(netdev_get_ethtool);
COVERAGE_DEFINE(netdev_set_ethtool);
/* These were introduced in Linux 2.6.14, so they might be missing if we have
 * old headers. */
#ifndef ADVERTISED_Pause
#define ADVERTISED_Pause (1 << 13)
#ifndef ADVERTISED_Asym_Pause
#define ADVERTISED_Asym_Pause (1 << 14)
/* These were introduced in Linux 2.6.24, so they might be missing if we
 * have old headers. */
#ifndef ETHTOOL_GFLAGS
#define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
#ifndef ETHTOOL_SFLAGS
#define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
/* This was introduced in Linux 2.6.25, so it might be missing if we have old
 * headers. */
#define TC_RTAB_SIZE 1024
/* Linux 2.6.21 introduced struct tpacket_auxdata.
 * Linux 2.6.27 added the tp_vlan_tci member.
 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
 * TP_STATUS_VLAN_TPID_VALID.
 *
 * With all this churn it's easiest to unconditionally define a replacement
 * structure that has everything we want. */
#ifndef PACKET_AUXDATA
#define PACKET_AUXDATA 8
#ifndef TP_STATUS_VLAN_VALID
#define TP_STATUS_VLAN_VALID (1 << 4)
#ifndef TP_STATUS_VLAN_TPID_VALID
#define TP_STATUS_VLAN_TPID_VALID (1 << 6)
#undef tpacket_auxdata
#define tpacket_auxdata rpl_tpacket_auxdata
/* Replacement for the kernel's struct tpacket_auxdata; see the comment just
 * above for why it is defined unconditionally. */
struct tpacket_auxdata {
    uint16_t tp_vlan_tci;   /* VLAN TCI, host byte order (htons()'d on use). */
    uint16_t tp_vlan_tpid;  /* VLAN TPID; only meaningful when tp_status has
                             * TP_STATUS_VLAN_TPID_VALID set. */
/* Linux 2.6.35 introduced IFLA_STATS64 and rtnl_link_stats64.
 *
 * Tests for rtnl_link_stats64 don't seem to consistently work, e.g. on
 * 2.6.32-431.29.2.el6.x86_64 (see report at
 * http://openvswitch.org/pipermail/dev/2014-October/047978.html). Maybe
 * if_link.h is not self-contained on those kernels. It is easiest to
 * unconditionally define a replacement. */
#define IFLA_STATS64 23
#define rtnl_link_stats64 rpl_rtnl_link_stats64
/* Replacement for the kernel's 64-bit link statistics structure (see comment
 * above).  Field names mirror <linux/if_link.h>. */
struct rtnl_link_stats64 {
    /* Detailed rx_errors. */
    uint64_t rx_length_errors;
    uint64_t rx_over_errors;
    uint64_t rx_crc_errors;
    uint64_t rx_frame_errors;
    uint64_t rx_fifo_errors;
    uint64_t rx_missed_errors;
    /* Detailed tx_errors. */
    uint64_t tx_aborted_errors;
    uint64_t tx_carrier_errors;
    uint64_t tx_fifo_errors;
    uint64_t tx_heartbeat_errors;
    uint64_t tx_window_errors;
    /* For cslip, etc. */
    uint64_t rx_compressed;
    uint64_t tx_compressed;
    VALID_IFINDEX = 1 << 0,          /* 'ifindex' is usable. */
    VALID_ETHERADDR = 1 << 1,        /* 'etheraddr' is usable. */
    VALID_POLICING = 1 << 5,         /* 'kbits_rate'/'kbits_burst' usable. */
    VALID_VPORT_STAT_ERROR = 1 << 6, /* 'vport_stats_error' is usable. */
    VALID_DRVINFO = 1 << 7,          /* 'drvinfo' is usable. */
    VALID_FEATURES = 1 << 8,         /* 'current'/'advertised'/'supported'. */
/* Traffic control. */
/* An instance of a traffic control class. Always associated with a particular
 * network device.
 *
 * Each TC implementation subclasses this with whatever additional data it
 * needs. */
    const struct tc_ops *ops;   /* The implementation's vtable (see below). */
    struct hmap queues;         /* Contains "struct tc_queue"s.
                                 * Read by generic TC layer.
                                 * Written only by TC implementation. */
/* Static initializer for the generic part of a 'struct tc' named 'TC' with
 * operations 'OPS'. */
#define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
/* One traffic control queue.
 *
 * Each TC implementation subclasses this with whatever additional data it
 * needs. */
    struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
    unsigned int queue_id;      /* OpenFlow queue ID. */
    long long int created;      /* Time queue was created, in msecs. */
/* A particular kind of traffic control. Each implementation generally maps to
 * one particular Linux qdisc class.
 *
 * The functions below return 0 if successful or a positive errno value on
 * failure, except where otherwise noted. All of them must be provided, except
 * where otherwise noted. */
    /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
     * This is null for tc_ops_default and tc_ops_other, for which there are no
     * appropriate values. */
    const char *linux_name;
    /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
    const char *ovs_name;
    /* Number of supported OpenFlow queues, 0 for qdiscs that have no
     * queues. The queues are numbered 0 through n_queues - 1. */
    unsigned int n_queues;
    /* Called to install this TC class on 'netdev'. The implementation should
     * make the Netlink calls required to set up 'netdev' with the right qdisc
     * and configure it according to 'details'. The implementation may assume
     * that the current qdisc is the default; that is, there is no need for it
     * to delete the current qdisc before installing itself.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
     * (which is built as ovs-vswitchd.conf.db(8)).
     *
     * This function must return 0 if and only if it sets 'netdev->tc' to an
     * initialized 'struct tc'.
     *
     * (This function is null for tc_ops_other, which cannot be installed. For
     * other TC classes it should always be nonnull.) */
    int (*tc_install)(struct netdev *netdev, const struct smap *details);
    /* Called when the netdev code determines (through a Netlink query) that
     * this TC class's qdisc is installed on 'netdev', but we didn't install
     * it ourselves and so don't know any of the details.
     *
     * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
     * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
     * implementation should parse the other attributes of 'nlmsg' as
     * necessary to determine its configuration. If necessary it should also
     * use Netlink queries to determine the configuration of queues on
     * 'netdev'.
     *
     * This function must return 0 if and only if it sets 'netdev->tc' to an
     * initialized 'struct tc'. */
    int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
    /* Destroys the data structures allocated by the implementation as part of
     * 'tc'. (This includes destroying 'tc->queues' by calling
     * tc_destroy(tc).)
     *
     * The implementation should not need to perform any Netlink calls. If
     * desirable, the caller is responsible for deconfiguring the kernel qdisc.
     * (But it may not be desirable.)
     *
     * This function may be null if 'tc' is trivial. */
    void (*tc_destroy)(struct tc *tc);
    /* Retrieves details of 'netdev->tc' configuration into 'details'.
     *
     * The implementation should not need to perform any Netlink calls, because
     * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
     * cached the configuration.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
     * (which is built as ovs-vswitchd.conf.db(8)).
     *
     * This function may be null if 'tc' is not configurable. */
    int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
    /* Reconfigures 'netdev->tc' according to 'details', performing any
     * required Netlink calls to complete the reconfiguration.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
     * (which is built as ovs-vswitchd.conf.db(8)).
     *
     * This function may be null if 'tc' is not configurable. */
    int (*qdisc_set)(struct netdev *, const struct smap *details);
    /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
     * one of the 'struct tc_queue's within 'netdev->tc->queues'.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "Queue" table in
     * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
     *
     * The implementation should not need to perform any Netlink calls, because
     * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
     * cached the queue configuration.
     *
     * This function may be null if 'tc' does not have queues ('n_queues' is
     * zero). */
    int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
                     struct smap *details);
    /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
     * 'details', performing any required Netlink calls to complete the
     * reconfiguration. The caller ensures that 'queue_id' is less than
     * 'n_queues'.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "Queue" table in
     * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
     *
     * This function may be null if 'tc' does not have queues or its queues are
     * not configurable. */
    int (*class_set)(struct netdev *, unsigned int queue_id,
                     const struct smap *details);
    /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
     * tc_queue's within 'netdev->tc->queues'.
     *
     * This function may be null if 'tc' does not have queues or its queues
     * cannot be deleted. */
    int (*class_delete)(struct netdev *, struct tc_queue *queue);
    /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
     * 'struct tc_queue's within 'netdev->tc->queues'.
     *
     * On success, initializes '*stats'.
     *
     * This function may be null if 'tc' does not have queues or if it cannot
     * report queue statistics. */
    int (*class_get_stats)(const struct netdev *netdev,
                           const struct tc_queue *queue,
                           struct netdev_queue_stats *stats);
    /* Extracts queue stats from 'nlmsg', which is a response to a
     * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
     *
     * This function may be null if 'tc' does not have queues or if it cannot
     * report queue statistics. */
    int (*class_dump_stats)(const struct netdev *netdev,
                            const struct ofpbuf *nlmsg,
                            netdev_dump_queue_stats_cb *cb, void *aux);
/* Initializes the generic part of 'tc' as a new instance of TC class 'ops'. */
tc_init(struct tc *tc, const struct tc_ops *ops)
    hmap_init(&tc->queues);
/* Destroys the generic part of 'tc' (the 'queues' hmap); any subclass data is
 * the TC implementation's responsibility (see the tc_destroy callback). */
tc_destroy(struct tc *tc)
    hmap_destroy(&tc->queues);
/* Concrete TC implementations, defined later in this file. */
static const struct tc_ops tc_ops_htb;
static const struct tc_ops tc_ops_hfsc;
static const struct tc_ops tc_ops_codel;
static const struct tc_ops tc_ops_fqcodel;
static const struct tc_ops tc_ops_sfq;
static const struct tc_ops tc_ops_default;
static const struct tc_ops tc_ops_other;
/* All TC implementations known to this module. */
static const struct tc_ops *const tcs[] = {
    &tc_ops_htb,                /* Hierarchy token bucket (see tc-htb(8)). */
    &tc_ops_hfsc,               /* Hierarchical fair service curve. */
    &tc_ops_codel,              /* Controlled delay */
    &tc_ops_fqcodel,            /* Fair queue controlled delay */
    &tc_ops_sfq,                /* Stochastic fair queueing */
    &tc_ops_default,            /* Default qdisc (see tc-pfifo_fast(8)). */
    &tc_ops_other,              /* Some other qdisc. */
/* Forward declarations of traffic-control helpers defined later in this
 * file. */
static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
static unsigned int tc_get_major(unsigned int handle);
static unsigned int tc_get_minor(unsigned int handle);
static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
static unsigned int tc_buffer_per_jiffy(unsigned int rate);
static struct tcmsg *tc_make_request(const struct netdev *, int type,
                                     unsigned int flags, struct ofpbuf *);
static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
static int tc_add_policer(struct netdev *,
                          uint32_t kbits_rate, uint32_t kbits_burst);
static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
                          struct nlattr **options);
static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
                          struct nlattr **options,
                          struct netdev_queue_stats *);
static int tc_query_class(const struct netdev *,
                          unsigned int handle, unsigned int parent,
                          struct ofpbuf **replyp);
static int tc_delete_class(const struct netdev *, unsigned int handle);
static int tc_del_qdisc(struct netdev *netdev);
static int tc_query_qdisc(const struct netdev *netdev);
static int tc_calc_cell_log(unsigned int mtu);
static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
static void tc_put_rtab(struct ofpbuf *, uint16_t type,
                        const struct tc_ratespec *rate);
static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
struct netdev_linux {
    /* Protects all members below. */
    struct ovs_mutex mutex;
    unsigned int cache_valid;          /* Bitmap of VALID_* flags above. */
    bool miimon;                       /* Link status of last poll. */
    long long int miimon_interval;     /* Miimon Poll rate. Disabled if <= 0. */
    struct timer miimon_timer;
    /* The following are figured out "on demand" only. They are only valid
     * when the corresponding VALID_* bit in 'cache_valid' is set. */
    uint8_t etheraddr[ETH_ADDR_LEN];
    struct in_addr address, netmask;   /* Cached IPv4 address and netmask. */
    unsigned int ifi_flags;            /* Kernel interface flags (IFF_*). */
    long long int carrier_resets;      /* Count of IFF_RUNNING transitions. */
    uint32_t kbits_rate;               /* Policing data. */
    uint32_t kbits_burst;
    int vport_stats_error;    /* Cached error code from vport_get_stats().
                                 0 or an errno value. */
    int netdev_mtu_error;     /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
    int ether_addr_error;     /* Cached error code from set/get etheraddr. */
    int netdev_policing_error; /* Cached error code from set policing. */
    int get_features_error;   /* Cached error code from ETHTOOL_GSET. */
    int get_ifindex_error;    /* Cached error code from SIOCGIFINDEX. */
    int in4_error;            /* Cached error code from reading in4 addr. */
    int in6_error;            /* Cached error code from reading in6 addr. */
    enum netdev_features current;    /* Cached from ETHTOOL_GSET. */
    enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
    enum netdev_features supported;  /* Cached from ETHTOOL_GSET. */
    struct ethtool_drvinfo drvinfo;  /* Cached from ETHTOOL_GDRVINFO. */
    /* For devices of class netdev_tap_class only. */
/* One receive queue of a Linux netdev. */
struct netdev_rxq_linux {
    struct netdev_rxq up;            /* Generic rxq; must be first member. */
/* This is set pretty low because we probably won't learn anything from the
 * additional log messages. */
static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
/* Polling miimon status for all ports causes performance degradation when
 * handling a large number of ports. If there are no devices using miimon, then
 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
 *
 * Readers do not depend on this variable synchronizing with the related
 * changes in the device miimon status, so we can use atomic_count. */
static atomic_count miimon_cnt = ATOMIC_COUNT_INIT(0);
/* Forward declarations of Linux-netdev helpers defined later in this file. */
static void netdev_linux_run(void);
static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
                                   int cmd, const char *cmd_name);
static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
                                 int cmd, const char *cmd_name);
static int get_flags(const struct netdev *, unsigned int *flags);
static int set_flags(const char *, unsigned int flags);
static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
                        enum netdev_flags on, enum netdev_flags *old_flagsp)
    OVS_REQUIRES(netdev->mutex);
static int do_get_ifindex(const char *netdev_name);
static int get_ifindex(const struct netdev *, int *ifindexp);
static int do_set_addr(struct netdev *netdev,
                       int ioctl_nr, const char *ioctl_name,
                       struct in_addr addr);
static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
static int get_stats_via_netlink(const struct netdev *, struct netdev_stats *);
static int af_packet_sock(void);
static bool netdev_linux_miimon_enabled(void);
static void netdev_linux_miimon_run(void);
static void netdev_linux_miimon_wait(void);
static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup);
/* Returns true if 'netdev_class' is one of the netdev classes implemented in
 * this file; they are recognized by sharing netdev_linux_run as their 'run'
 * callback. */
is_netdev_linux_class(const struct netdev_class *netdev_class)
    return netdev_class->run == netdev_linux_run;
/* Returns true if 'netdev' is a tap device. */
is_tap_netdev(const struct netdev *netdev)
    return netdev_get_class(netdev) == &netdev_tap_class;
/* Downcasts 'netdev' to 'struct netdev_linux', asserting first that it really
 * is one of this file's netdev classes. */
static struct netdev_linux *
netdev_linux_cast(const struct netdev *netdev)
    ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
    return CONTAINER_OF(netdev, struct netdev_linux, up);
/* Downcasts 'rx' to 'struct netdev_rxq_linux', asserting first that its
 * netdev is one of this file's netdev classes. */
static struct netdev_rxq_linux *
netdev_rxq_linux_cast(const struct netdev_rxq *rx)
    ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
    return CONTAINER_OF(rx, struct netdev_rxq_linux, up);
/* Forward declarations; both require 'netdev->mutex' to be held. */
static void netdev_linux_update(struct netdev_linux *netdev,
                                const struct rtnetlink_change *)
    OVS_REQUIRES(netdev->mutex);
static void netdev_linux_changed(struct netdev_linux *netdev,
                                 unsigned int ifi_flags, unsigned int mask)
    OVS_REQUIRES(netdev->mutex);
/* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK,
 * RTNLGRP_IPV4_IFADDR and RTNLGRP_IPV6_IFADDR changes, or NULL
 * if no such socket could be created. */
static struct nl_sock *
netdev_linux_notify_sock(void)
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
    static struct nl_sock *sock;     /* Created once, shared by all callers. */
    unsigned int mcgroups[3] = {RTNLGRP_LINK, RTNLGRP_IPV4_IFADDR,
                                RTNLGRP_IPV6_IFADDR};
    if (ovsthread_once_start(&once)) {
        /* First caller creates the socket and joins each multicast group. */
        error = nl_sock_create(NETLINK_ROUTE, &sock);
        for (i = 0; i < ARRAY_SIZE(mcgroups); i++) {
            error = nl_sock_join_mcgroup(sock, mcgroups[i]);
            /* NOTE(review): intervening error check not shown in this
             * excerpt; the socket is destroyed when joining fails. */
            nl_sock_destroy(sock);
        ovsthread_once_done(&once);
/* Returns true if at least one netdev is currently using miimon polling, so
 * callers can cheaply skip all miimon work otherwise. */
netdev_linux_miimon_enabled(void)
    return atomic_count_get(&miimon_cnt) > 0;
/* The 'run' callback shared by this file's netdev classes: services miimon if
 * enabled, then drains the rtnetlink notification socket, pushing each link or
 * address change into the matching cached netdev.  On ENOBUFS (notifications
 * were dropped by the kernel) it refreshes the flags of every device
 * instead. */
netdev_linux_run(void)
    struct nl_sock *sock;
    if (netdev_linux_miimon_enabled()) {
        netdev_linux_miimon_run();
    sock = netdev_linux_notify_sock();
        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
        uint64_t buf_stub[4096 / 8];  /* Stack stub to avoid heap churn. */
        ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
        error = nl_sock_recv(sock, &buf, false);
            struct rtnetlink_change change;
            if (rtnetlink_parse(&buf, &change)) {
                struct netdev *netdev_ = netdev_from_name(change.ifname);
                if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
                    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
                    ovs_mutex_lock(&netdev->mutex);
                    netdev_linux_update(netdev, &change);
                    ovs_mutex_unlock(&netdev->mutex);
                netdev_close(netdev_);
        } else if (error == ENOBUFS) {
            /* The socket buffer overflowed, so some notifications were lost;
             * re-read flags for every device to resynchronize. */
            struct shash device_shash;
            struct shash_node *node;
            shash_init(&device_shash);
            netdev_get_devices(&netdev_linux_class, &device_shash);
            SHASH_FOR_EACH (node, &device_shash) {
                struct netdev *netdev_ = node->data;
                struct netdev_linux *netdev = netdev_linux_cast(netdev_);
                ovs_mutex_lock(&netdev->mutex);
                get_flags(netdev_, &flags);
                netdev_linux_changed(netdev, flags, 0);
                ovs_mutex_unlock(&netdev->mutex);
                netdev_close(netdev_);
            shash_destroy(&device_shash);
        } else if (error != EAGAIN) {
            VLOG_WARN_RL(&rl, "error reading or parsing netlink (%s)",
                         ovs_strerror(error));
/* The 'wait' callback: arranges for the poll loop to wake when miimon needs
 * service or when the rtnetlink notification socket becomes readable. */
netdev_linux_wait(void)
    struct nl_sock *sock;
    if (netdev_linux_miimon_enabled()) {
        netdev_linux_miimon_wait();
    sock = netdev_linux_notify_sock();
        nl_sock_wait(sock, POLLIN);
/* Records that 'dev' changed: bumps its change sequence number, counts a
 * carrier reset whenever IFF_RUNNING toggles, stores the new 'ifi_flags', and
 * keeps only the cache_valid bits present in 'mask' (every other cached field
 * is invalidated). */
netdev_linux_changed(struct netdev_linux *dev,
                     unsigned int ifi_flags, unsigned int mask)
    OVS_REQUIRES(dev->mutex)
    netdev_change_seq_changed(&dev->up);
    if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
        dev->carrier_resets++;
    dev->ifi_flags = ifi_flags;
    dev->cache_valid &= mask;
/* Applies a parsed rtnetlink notification 'change' to 'dev'.  Link (RTM_*LINK)
 * messages refresh the cached MTU, Ethernet address and ifindex directly;
 * address-group messages just invalidate the cached IPv4/IPv6 addresses. */
netdev_linux_update(struct netdev_linux *dev,
                    const struct rtnetlink_change *change)
    OVS_REQUIRES(dev->mutex)
    if (rtnetlink_type_is_rtnlgrp_link(change->nlmsg_type)){
        if (change->nlmsg_type == RTM_NEWLINK) {
            /* Keep drv-info, in4, in6. */
            netdev_linux_changed(dev, change->ifi_flags,
                                 VALID_DRVINFO | VALID_IN4 | VALID_IN6);
            /* Update netdev from rtnl-change msg. */
            dev->mtu = change->mtu;
            dev->cache_valid |= VALID_MTU;
            dev->netdev_mtu_error = 0;
            /* A zero hardware address in the message means "not reported". */
            if (!eth_addr_is_zero(change->addr)) {
                memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
                dev->cache_valid |= VALID_ETHERADDR;
                dev->ether_addr_error = 0;
            dev->ifindex = change->if_index;
            dev->cache_valid |= VALID_IFINDEX;
            dev->get_ifindex_error = 0;
            netdev_linux_changed(dev, change->ifi_flags, 0);
    } else if (rtnetlink_type_is_rtnlgrp_addr(change->nlmsg_type)) {
        /* Invalidates in4, in6. */
        netdev_linux_changed(dev, dev->ifi_flags,
                             ~(VALID_IN4 | VALID_IN6));
/* netdev 'alloc' callback: allocates a zeroed netdev_linux. */
static struct netdev *
netdev_linux_alloc(void)
    struct netdev_linux *netdev = xzalloc(sizeof *netdev);
/* Construction steps shared by all of this file's netdev classes. */
netdev_linux_common_construct(struct netdev_linux *netdev)
    ovs_mutex_init(&netdev->mutex);
/* Creates system and internal devices. */
netdev_linux_construct(struct netdev *netdev_)
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    netdev_linux_common_construct(netdev);
    /* Probe the device by reading its flags; ENODEV means it does not exist
     * in the kernel (yet). */
    error = get_flags(&netdev->up, &netdev->ifi_flags);
    if (error == ENODEV) {
        if (netdev->up.netdev_class != &netdev_internal_class) {
            /* The device does not exist, so don't allow it to be opened. */
            /* "Internal" netdevs have to be created as netdev objects before
             * they exist in the kernel, because creating them in the kernel
             * happens by passing a netdev object to dpif_port_add().
             * Therefore, ignore the error. */
/* For most types of netdevs we open the device for each call of
 * netdev_open(). However, this is not the case with tap devices,
 * since it is only possible to open the device once. In this
 * situation we share a single file descriptor, and consequently
 * buffers, across all readers. Therefore once data is read it will
 * be unavailable to other reads for tap devices. */
netdev_linux_construct_tap(struct netdev *netdev_)
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    static const char tap_dev[] = "/dev/net/tun";
    const char *name = netdev_->name;
    netdev_linux_common_construct(netdev);
    /* Open tap device. */
    netdev->tap_fd = open(tap_dev, O_RDWR);
    if (netdev->tap_fd < 0) {
        VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
    /* Create tap device. */
    ifr.ifr_flags = IFF_TAP | IFF_NO_PI; /* L2 tap, no extra packet-info header. */
    ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
    if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
        VLOG_WARN("%s: creating tap device failed: %s", name,
                  ovs_strerror(errno));
    /* Make non-blocking. */
    error = set_nonblocking(netdev->tap_fd);
    /* Error path: release the tap fd. */
    close(netdev->tap_fd);
/* netdev 'destruct' callback: tears down TC state via the implementation's
 * tc_destroy callback, closes the tap fd for tap devices, drops the global
 * miimon reference count if this device polled miimon, and destroys the
 * mutex. */
netdev_linux_destruct(struct netdev *netdev_)
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    if (netdev->tc && netdev->tc->ops->tc_destroy) {
        netdev->tc->ops->tc_destroy(netdev->tc);
    if (netdev_get_class(netdev_) == &netdev_tap_class
        && netdev->tap_fd >= 0)
        close(netdev->tap_fd);
    if (netdev->miimon_interval > 0) {
        atomic_count_dec(&miimon_cnt);
    ovs_mutex_destroy(&netdev->mutex);
/* netdev 'dealloc' callback: frees the object from netdev_linux_alloc(). */
netdev_linux_dealloc(struct netdev *netdev_)
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* rxq 'alloc' callback: allocates a zeroed receive-queue object. */
static struct netdev_rxq *
netdev_linux_rxq_alloc(void)
    struct netdev_rxq_linux *rx = xzalloc(sizeof *rx);
/* rxq 'construct' callback.  Tap devices share the device's single tap fd;
 * for every other device an AF_PACKET SOCK_RAW socket is created, marked for
 * PACKET_AUXDATA (so stripped VLAN tags can be recovered), made non-blocking,
 * bound to the device's ifindex, and fitted with a BPF filter that accepts
 * only inbound packets. */
netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
    struct netdev *netdev_ = rx->up.netdev;
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    ovs_mutex_lock(&netdev->mutex);
    rx->is_tap = is_tap_netdev(netdev_);
        rx->fd = netdev->tap_fd;
        struct sockaddr_ll sll;
        /* Result of tcpdump -dd inbound */
        static const struct sock_filter filt[] = {
            { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
            { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
            { 0x6, 0, 0, 0x00000000 },  /* ret #0 */
            { 0x6, 0, 0, 0x0000ffff }   /* ret #65535 */
        static const struct sock_fprog fprog = {
            ARRAY_SIZE(filt), (struct sock_filter *) filt
        /* Create file descriptor. */
        rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
            VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
        /* Ask the kernel for VLAN auxdata on received packets. */
        if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) {
            VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
        /* Set non-blocking mode. */
        error = set_nonblocking(rx->fd);
        /* Get ethernet device index. */
        error = get_ifindex(&netdev->up, &ifindex);
        /* Bind to specific ethernet device. */
        memset(&sll, 0, sizeof sll);
        sll.sll_family = AF_PACKET;
        sll.sll_ifindex = ifindex;
        sll.sll_protocol = htons(ETH_P_ALL);
        if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
            VLOG_ERR("%s: failed to bind raw socket (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
        /* Filter for only inbound packets. */
        error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
            VLOG_ERR("%s: failed to attach filter (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
    ovs_mutex_unlock(&netdev->mutex);
    /* Error path also releases the mutex. */
    ovs_mutex_unlock(&netdev->mutex);
/* rxq 'destruct' callback. */
netdev_linux_rxq_destruct(struct netdev_rxq *rxq_)
    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
/* rxq 'dealloc' callback: frees the object from netdev_linux_rxq_alloc(). */
netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_)
    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
/* Returns the VLAN TPID from 'aux' in network byte order, falling back to the
 * standard 802.1Q Ethertype when the kernel did not report a TPID (older
 * kernels without TP_STATUS_VLAN_TPID_VALID). */
auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux)
    if (aux->tp_status & TP_STATUS_VLAN_TPID_VALID) {
        return htons(aux->tp_vlan_tpid);
    return htons(ETH_TYPE_VLAN);
/* Returns true if 'aux' carries a VLAN TCI: either TP_STATUS_VLAN_VALID is
 * set, or (for pre-3.0 kernels that lack the flag) the TCI is nonzero. */
auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
    return aux->tp_vlan_tci || aux->tp_status & TP_STATUS_VLAN_VALID;
/* Receives one packet from AF_PACKET socket 'fd' into 'buffer', using the
 * PACKET_AUXDATA control message to re-insert any VLAN tag that the kernel
 * stripped.  Returns 0 on success or a positive errno value on failure (see
 * netdev_linux_rxq_recv()). */
netdev_linux_rxq_recv_sock(int fd, struct dp_packet *buffer)
    struct cmsghdr *cmsg;
        char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
    /* Reserve headroom for a single VLAN tag */
    dp_packet_reserve(buffer, VLAN_HEADER_LEN);
    size = dp_packet_tailroom(buffer);
    iov.iov_base = dp_packet_data(buffer);
    msgh.msg_name = NULL;
    msgh.msg_namelen = 0;
    msgh.msg_control = &cmsg_buffer;
    msgh.msg_controllen = sizeof cmsg_buffer;
        /* MSG_TRUNC makes the kernel report the full packet length even when
         * it does not fit, so truncation can be detected below. */
        retval = recvmsg(fd, &msgh, MSG_TRUNC);
    } while (retval < 0 && errno == EINTR);
    } else if (retval > size) {
    dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);
    /* Walk the control messages looking for the packet auxdata. */
    for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg; cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
        const struct tpacket_auxdata *aux;
        if (cmsg->cmsg_level != SOL_PACKET
            || cmsg->cmsg_type != PACKET_AUXDATA
            || cmsg->cmsg_len < CMSG_LEN(sizeof(struct tpacket_auxdata))) {
        aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg));
        if (auxdata_has_vlan_tci(aux)) {
            /* Too short to even hold an Ethernet header. */
            if (retval < ETH_HEADER_LEN) {
            /* Re-insert the VLAN tag the kernel stripped on receive. */
            eth_push_vlan(buffer, auxdata_to_vlan_tpid(aux),
                          htons(aux->tp_vlan_tci));
/* Receives one packet from tap file descriptor 'fd' into 'buffer'.  Returns 0
 * on success or a positive errno value on failure (see
 * netdev_linux_rxq_recv()); a read longer than the buffer's tailroom is
 * treated as an error. */
netdev_linux_rxq_recv_tap(int fd, struct dp_packet *buffer)
    size_t size = dp_packet_tailroom(buffer);
    /* Retry reads interrupted by signals. */
    retval = read(fd, dp_packet_data(buffer), size);
    } while (retval < 0 && errno == EINTR);
    } else if (retval > size) {
    dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);
1052 netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet **packets,
1055 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1056 struct netdev *netdev = rx->up.netdev;
1057 struct dp_packet *buffer;
1061 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
1062 mtu = ETH_PAYLOAD_MAX;
1065 buffer = dp_packet_new_with_headroom(VLAN_ETH_HEADER_LEN + mtu,
1066 DP_NETDEV_HEADROOM);
1067 retval = (rx->is_tap
1068 ? netdev_linux_rxq_recv_tap(rx->fd, buffer)
1069 : netdev_linux_rxq_recv_sock(rx->fd, buffer));
1072 if (retval != EAGAIN && retval != EMSGSIZE) {
1073 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
1074 ovs_strerror(errno), netdev_rxq_get_name(rxq_));
1076 dp_packet_delete(buffer);
1078 dp_packet_pad(buffer);
1079 dp_packet_set_rss_hash(buffer, 0);
1080 packets[0] = buffer;
/* rxq 'wait' callback: wakes the poll loop when 'rxq_''s fd is readable. */
netdev_linux_rxq_wait(struct netdev_rxq *rxq_)
    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
    poll_fd_wait(rx->fd, POLLIN);
/* rxq 'drain' callback: discards pending packets without processing them.
 * Tap devices drain up to the interface tx queue length of packets from the
 * fd; other devices drain the raw socket's receive buffer. */
netdev_linux_rxq_drain(struct netdev_rxq *rxq_)
    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
        int error = af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_), &ifr,
                                        SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
        drain_fd(rx->fd, ifr.ifr_qlen);
    return drain_rcvbuf(rx->fd);
/* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
 * the packet is too big or too small to transmit on the device.
 *
 * The caller retains ownership of 'buffer' in all cases.
 *
 * The kernel maintains a packet transmission queue, so the caller is not
 * expected to do additional queuing of packets. */
netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED,
                  struct dp_packet **pkts, int cnt, bool may_steal)
    /* 'i' is incremented only if there's no error */
    for (i = 0; i < cnt;) {
        const void *data = dp_packet_data(pkts[i]);
        size_t size = dp_packet_size(pkts[i]);
        if (!is_tap_netdev(netdev_)) {
            /* Use our AF_PACKET socket to send to this device. */
            struct sockaddr_ll sll;
            sock = af_packet_sock();
            ifindex = netdev_get_ifindex(netdev_);
            /* We don't bother setting most fields in sockaddr_ll because the
             * kernel ignores them for SOCK_RAW. */
            memset(&sll, 0, sizeof sll);
            sll.sll_family = AF_PACKET;
            sll.sll_ifindex = ifindex;
            iov.iov_base = CONST_CAST(void *, data);
            msg.msg_name = &sll;
            msg.msg_namelen = sizeof sll;
            msg.msg_control = NULL;
            msg.msg_controllen = 0;
            retval = sendmsg(sock, &msg, 0);
            /* Use the tap fd to send to this device. This is essential for
             * tap devices, because packets sent to a tap device with an
             * AF_PACKET socket will loop back to be *received* again on the
             * tap device. This doesn't occur on other interface types
             * because we attach a socket filter to the rx socket. */
            struct netdev_linux *netdev = netdev_linux_cast(netdev_);
            retval = write(netdev->tap_fd, data, size);
            /* The Linux AF_PACKET implementation never blocks waiting for room
             * for packets, instead returning ENOBUFS. Translate this into
             * EAGAIN for the caller. */
            error = errno == ENOBUFS ? EAGAIN : errno;
            if (error == EINTR) {
                /* continue without incrementing 'i', i.e. retry this packet */
        } else if (retval != size) {
            /* NOTE(review): 'retval' (presumably ssize_t; its declaration is
             * not shown here) is printed with %PRIuSIZE and compared with the
             * unsigned 'size'; harmless when retval >= 0, but %zd with a cast
             * would be more precise -- confirm retval's declared type. */
            VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE" bytes"
                         " of %"PRIuSIZE") on %s", retval, size,
                         netdev_get_name(netdev_));
        /* Process the next packet in the batch */
    /* 'may_steal' cleanup: consume the batch. */
    for (i = 0; i < cnt; i++) {
        dp_packet_delete(pkts[i]);
    if (error && error != EAGAIN) {
        VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
                     netdev_get_name(netdev_), ovs_strerror(error));
1218 /* Registers with the poll loop to wake up from the next call to poll_block()
1219 * when the packet transmission queue has sufficient room to transmit a packet
1220 * with netdev_send().
1222 * The kernel maintains a packet transmission queue, so the client is not
1223 * expected to do additional queuing of packets. Thus, this function is
1224 * unlikely to ever be used. It is included for completeness. */
1226 netdev_linux_send_wait(struct netdev *netdev, int qid OVS_UNUSED)
1228 if (is_tap_netdev(netdev)) {
1229 /* TAP device always accepts packets.*/
1230 poll_immediate_wake();
1234 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1235 * otherwise a positive errno value. */
1237 netdev_linux_set_etheraddr(struct netdev *netdev_,
1238 const uint8_t mac[ETH_ADDR_LEN])
1240 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1241 enum netdev_flags old_flags = 0;
1244 ovs_mutex_lock(&netdev->mutex);
/* Fast path: if the cached address already matches (or the cached
 * lookup failed), skip the ioctl entirely. */
1246 if (netdev->cache_valid & VALID_ETHERADDR) {
1247 error = netdev->ether_addr_error;
1248 if (error || eth_addr_equals(netdev->etheraddr, mac)) {
1251 netdev->cache_valid &= ~VALID_ETHERADDR;
1254 /* Tap devices must be brought down before setting the address. */
1255 if (is_tap_netdev(netdev_)) {
1256 update_flags(netdev, NETDEV_UP, 0, &old_flags);
1258 error = set_etheraddr(netdev_get_name(netdev_), mac);
/* Cache the result even on ENODEV so repeated attempts against a
 * vanished device do not keep issuing ioctls. */
1259 if (!error || error == ENODEV) {
1260 netdev->ether_addr_error = error;
1261 netdev->cache_valid |= VALID_ETHERADDR;
1263 memcpy(netdev->etheraddr, mac, ETH_ADDR_LEN);
/* Restore the UP flag if we took a tap device down above. */
1267 if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
1268 update_flags(netdev, 0, NETDEV_UP, &old_flags);
1272 ovs_mutex_unlock(&netdev->mutex);
1276 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1278 netdev_linux_get_etheraddr(const struct netdev *netdev_,
1279 uint8_t mac[ETH_ADDR_LEN])
1281 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1284 ovs_mutex_lock(&netdev->mutex);
/* Populate the cache on first use; both the address and any lookup
 * error are cached under VALID_ETHERADDR. */
1285 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1286 netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
1288 netdev->cache_valid |= VALID_ETHERADDR;
1291 error = netdev->ether_addr_error;
1293 memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN);
1295 ovs_mutex_unlock(&netdev->mutex);
/* Internal helper: returns the cached MTU, querying the kernel with
 * SIOCGIFMTU on a cache miss. Caller must hold netdev->mutex. */
1301 netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
1305 if (!(netdev->cache_valid & VALID_MTU)) {
1308 netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
1309 netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
1310 netdev->mtu = ifr.ifr_mtu;
1311 netdev->cache_valid |= VALID_MTU;
1314 error = netdev->netdev_mtu_error;
1316 *mtup = netdev->mtu;
1322 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1323 * in bytes, not including the hardware header; thus, this is typically 1500
1324 * bytes for Ethernet devices. */
1326 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1328 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1331 ovs_mutex_lock(&netdev->mutex);
1332 error = netdev_linux_get_mtu__(netdev, mtup);
1333 ovs_mutex_unlock(&netdev->mutex);
1338 /* Sets the maximum size of transmitted (MTU) for given device using linux
1339 * networking ioctl interface.
1342 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1344 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1348 ovs_mutex_lock(&netdev->mutex);
/* Skip the ioctl when the cache already shows the requested MTU or a
 * previous attempt failed. */
1349 if (netdev->cache_valid & VALID_MTU) {
1350 error = netdev->netdev_mtu_error;
1351 if (error || netdev->mtu == mtu) {
1354 netdev->cache_valid &= ~VALID_MTU;
/* NOTE(review): an elided line presumably assigns 'mtu' into
 * ifr.ifr_mtu before this call -- confirm against the full source. */
1357 error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
1358 SIOCSIFMTU, "SIOCSIFMTU");
1359 if (!error || error == ENODEV) {
1360 netdev->netdev_mtu_error = error;
1361 netdev->mtu = ifr.ifr_mtu;
1362 netdev->cache_valid |= VALID_MTU;
1365 ovs_mutex_unlock(&netdev->mutex);
1369 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1370 * On failure, returns a negative errno value. */
1372 netdev_linux_get_ifindex(const struct netdev *netdev_)
1374 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1377 ovs_mutex_lock(&netdev->mutex);
1378 error = get_ifindex(netdev_, &ifindex);
1379 ovs_mutex_unlock(&netdev->mutex);
/* Encode success/failure in the sign: positive ifindex or -errno. */
1381 return error ? -error : ifindex;
/* Reports link state: the MII-polled value when miimon is enabled,
 * otherwise the kernel's IFF_RUNNING flag. */
1385 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1387 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1389 ovs_mutex_lock(&netdev->mutex);
1390 if (netdev->miimon_interval > 0) {
1391 *carrier = netdev->miimon;
1393 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1395 ovs_mutex_unlock(&netdev->mutex);
1400 static long long int
/* Returns how many times the carrier has been reset, as tracked in
 * netdev->carrier_resets under the device mutex. */
1401 netdev_linux_get_carrier_resets(const struct netdev *netdev_)
1403 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1404 long long int carrier_resets;
1406 ovs_mutex_lock(&netdev->mutex);
1407 carrier_resets = netdev->carrier_resets;
1408 ovs_mutex_unlock(&netdev->mutex);
1410 return carrier_resets;
/* Issues an MII ioctl ('cmd', e.g. SIOCGMIIPHY/SIOCGMIIREG) on interface
 * 'name', marshalling 'data' through ifr.ifr_data in both directions. */
1414 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1415 struct mii_ioctl_data *data)
1420 memset(&ifr, 0, sizeof ifr);
1421 memcpy(&ifr.ifr_data, data, sizeof *data);
1422 error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
1423 memcpy(data, &ifr.ifr_data, sizeof *data);
/* Queries link status for 'name' into '*miimon': first via MII
 * (SIOCGMIIPHY + BMSR read), falling back to ETHTOOL_GLINK if the
 * device has no usable MII. */
1429 netdev_linux_get_miimon(const char *name, bool *miimon)
1431 struct mii_ioctl_data data;
1436 memset(&data, 0, sizeof data);
1437 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1439 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1440 data.reg_num = MII_BMSR;
1441 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1445 *miimon = !!(data.val_out & BMSR_LSTATUS);
1447 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1450 struct ethtool_cmd ecmd;
1452 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1455 COVERAGE_INC(netdev_get_ethtool);
1456 memset(&ecmd, 0, sizeof ecmd);
1457 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
/* ETHTOOL_GLINK answers in a struct ethtool_value laid over the start
 * of the ethtool_cmd buffer, hence the memcpy reinterpretation. */
1460 struct ethtool_value eval;
1462 memcpy(&eval, &ecmd, sizeof eval);
1463 *miimon = !!eval.data;
1465 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
/* Sets the MII monitoring interval for 'netdev_' in milliseconds.
 * interval <= 0 disables miimon; positive values are clamped to a
 * 100 ms minimum. Maintains the global miimon_cnt enable counter and
 * forces an immediate poll by expiring the timer. */
1473 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1474 long long int interval)
1476 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1478 ovs_mutex_lock(&netdev->mutex);
1479 interval = interval > 0 ? MAX(interval, 100) : 0;
1480 if (netdev->miimon_interval != interval) {
/* Track transitions between enabled/disabled in the global count so
 * the run/wait loops know whether any device uses miimon. */
1481 if (interval && !netdev->miimon_interval) {
1482 atomic_count_inc(&miimon_cnt);
1483 } else if (!interval && netdev->miimon_interval) {
1484 atomic_count_dec(&miimon_cnt);
1487 netdev->miimon_interval = interval;
1488 timer_set_expired(&netdev->miimon_timer);
1490 ovs_mutex_unlock(&netdev->mutex);
/* Polls MII link status for every netdev-linux device whose miimon timer
 * has expired, firing netdev_linux_changed() on link transitions. */
1496 netdev_linux_miimon_run(void)
1498 struct shash device_shash;
1499 struct shash_node *node;
1501 shash_init(&device_shash);
1502 netdev_get_devices(&netdev_linux_class, &device_shash);
1503 SHASH_FOR_EACH (node, &device_shash) {
1504 struct netdev *netdev = node->data;
1505 struct netdev_linux *dev = netdev_linux_cast(netdev);
1508 ovs_mutex_lock(&dev->mutex);
1509 if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
1510 netdev_linux_get_miimon(dev->up.name, &miimon);
1511 if (miimon != dev->miimon) {
1512 dev->miimon = miimon;
1513 netdev_linux_changed(dev, dev->ifi_flags, 0);
1516 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1518 ovs_mutex_unlock(&dev->mutex);
/* netdev_get_devices() took a reference on each device; drop it. */
1519 netdev_close(netdev);
1522 shash_destroy(&device_shash);
/* Registers with the poll loop to wake up when any device's miimon
 * timer next expires. */
1526 netdev_linux_miimon_wait(void)
1528 struct shash device_shash;
1529 struct shash_node *node;
1531 shash_init(&device_shash);
1532 netdev_get_devices(&netdev_linux_class, &device_shash);
1533 SHASH_FOR_EACH (node, &device_shash) {
1534 struct netdev *netdev = node->data;
1535 struct netdev_linux *dev = netdev_linux_cast(netdev);
1537 ovs_mutex_lock(&dev->mutex);
1538 if (dev->miimon_interval > 0) {
1539 timer_wait(&dev->miimon_timer);
1541 ovs_mutex_unlock(&dev->mutex);
1542 netdev_close(netdev);
1544 shash_destroy(&device_shash);
/* Presumably exchanges the values of *a and *b -- body elided in this
 * view; confirm against the full source. */
1548 swap_uint64(uint64_t *a, uint64_t *b)
1555 /* Copies 'src' into 'dst', performing format conversion in the process.
1557 * 'src' is allowed to be misaligned. */
1559 netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1560 const struct ovs_vport_stats *src)
/* get_32aligned_u64() reads 64-bit fields that are only guaranteed
 * 32-bit alignment in the Netlink-supplied vport stats. */
1562 dst->rx_packets = get_32aligned_u64(&src->rx_packets);
1563 dst->tx_packets = get_32aligned_u64(&src->tx_packets);
1564 dst->rx_bytes = get_32aligned_u64(&src->rx_bytes);
1565 dst->tx_bytes = get_32aligned_u64(&src->tx_bytes);
1566 dst->rx_errors = get_32aligned_u64(&src->rx_errors);
1567 dst->tx_errors = get_32aligned_u64(&src->tx_errors);
1568 dst->rx_dropped = get_32aligned_u64(&src->rx_dropped);
1569 dst->tx_dropped = get_32aligned_u64(&src->tx_dropped);
/* The vport layer does not track the detailed error counters below, so
 * they are reported as zero rather than left uninitialized. */
1571 dst->collisions = 0;
1572 dst->rx_length_errors = 0;
1573 dst->rx_over_errors = 0;
1574 dst->rx_crc_errors = 0;
1575 dst->rx_frame_errors = 0;
1576 dst->rx_fifo_errors = 0;
1577 dst->rx_missed_errors = 0;
1578 dst->tx_aborted_errors = 0;
1579 dst->tx_carrier_errors = 0;
1580 dst->tx_fifo_errors = 0;
1581 dst->tx_heartbeat_errors = 0;
1582 dst->tx_window_errors = 0;
/* Fetches 'netdev''s stats from the OVS datapath vport layer via a
 * Netlink vport-get request. Returns 0 on success, a positive errno
 * otherwise (including when the reply carries no stats). */
1586 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1588 struct dpif_netlink_vport reply;
1592 error = dpif_netlink_vport_get(netdev_get_name(netdev), &reply, &buf);
1595 } else if (!reply.stats) {
1600 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
/* Wrapper that caches the vport-stats error status on the device so a
 * device that is not an OVS vport is not re-queried every time.
 * Caller must hold netdev->mutex (accesses cache_valid). */
1608 get_stats_via_vport(const struct netdev *netdev_,
1609 struct netdev_stats *stats)
1611 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1613 if (!netdev->vport_stats_error ||
1614 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1617 error = get_stats_via_vport__(netdev_, stats);
/* ENOENT/ENODEV just mean "not a vport"; only log other failures. */
1618 if (error && error != ENOENT && error != ENODEV) {
1619 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1621 netdev_get_name(netdev_), ovs_strerror(error));
1623 netdev->vport_stats_error = error;
1624 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
1628 /* Retrieves current device stats for 'netdev-linux'. */
1630 netdev_linux_get_stats(const struct netdev *netdev_,
1631 struct netdev_stats *stats)
1633 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1634 struct netdev_stats dev_stats;
1637 ovs_mutex_lock(&netdev->mutex);
/* Try both sources: vport stats (datapath view) and kernel netlink
 * stats (device view); combine below depending on what succeeded. */
1638 get_stats_via_vport(netdev_, stats);
1639 error = get_stats_via_netlink(netdev_, &dev_stats);
1641 if (!netdev->vport_stats_error) {
1644 } else if (netdev->vport_stats_error) {
1645 /* stats not available from OVS then use netdev stats. */
1648 /* Use kernel netdev's packet and byte counts since vport's counters
1649 * do not reflect packet counts on the wire when GSO, TSO or GRO are
1651 stats->rx_packets = dev_stats.rx_packets;
1652 stats->rx_bytes = dev_stats.rx_bytes;
1653 stats->tx_packets = dev_stats.tx_packets;
1654 stats->tx_bytes = dev_stats.tx_bytes;
/* Error/drop counters are additive: vport and device each see some. */
1656 stats->rx_errors += dev_stats.rx_errors;
1657 stats->tx_errors += dev_stats.tx_errors;
1658 stats->rx_dropped += dev_stats.rx_dropped;
1659 stats->tx_dropped += dev_stats.tx_dropped;
1660 stats->multicast += dev_stats.multicast;
1661 stats->collisions += dev_stats.collisions;
1662 stats->rx_length_errors += dev_stats.rx_length_errors;
1663 stats->rx_over_errors += dev_stats.rx_over_errors;
1664 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1665 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1666 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1667 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1668 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1669 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1670 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1671 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1672 stats->tx_window_errors += dev_stats.tx_window_errors;
1674 ovs_mutex_unlock(&netdev->mutex);
1679 /* Retrieves current device stats for 'netdev-tap' netdev or
1680 * netdev-internal. */
1682 netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
1684 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1685 struct netdev_stats dev_stats;
1688 ovs_mutex_lock(&netdev->mutex);
1689 get_stats_via_vport(netdev_, stats);
1690 error = get_stats_via_netlink(netdev_, &dev_stats);
1692 if (!netdev->vport_stats_error) {
1695 } else if (netdev->vport_stats_error) {
1696 /* Transmit and receive stats will appear to be swapped relative to the
1697 * other ports since we are the one sending the data, not a remote
1698 * computer. For consistency, we swap them back here. This does not
1699 * apply if we are getting stats from the vport layer because it always
1700 * tracks stats from the perspective of the switch. */
1703 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1704 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1705 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1706 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
/* Detailed per-direction error counters are meaningless once the
 * directions are swapped, so zero them out. */
1707 stats->rx_length_errors = 0;
1708 stats->rx_over_errors = 0;
1709 stats->rx_crc_errors = 0;
1710 stats->rx_frame_errors = 0;
1711 stats->rx_fifo_errors = 0;
1712 stats->rx_missed_errors = 0;
1713 stats->tx_aborted_errors = 0;
1714 stats->tx_carrier_errors = 0;
1715 stats->tx_fifo_errors = 0;
1716 stats->tx_heartbeat_errors = 0;
1717 stats->tx_window_errors = 0;
1719 /* Use kernel netdev's packet and byte counts since vport counters
1720 * do not reflect packet counts on the wire when GSO, TSO or GRO
1722 stats->rx_packets = dev_stats.tx_packets;
1723 stats->rx_bytes = dev_stats.tx_bytes;
1724 stats->tx_packets = dev_stats.rx_packets;
1725 stats->tx_bytes = dev_stats.rx_bytes;
/* Note the cross-direction accumulation: device tx feeds switch rx
 * and vice versa, for the same perspective reason as above. */
1727 stats->rx_dropped += dev_stats.tx_dropped;
1728 stats->tx_dropped += dev_stats.rx_dropped;
1730 stats->rx_errors += dev_stats.tx_errors;
1731 stats->tx_errors += dev_stats.rx_errors;
1733 stats->multicast += dev_stats.multicast;
1734 stats->collisions += dev_stats.collisions;
1736 ovs_mutex_unlock(&netdev->mutex);
/* Retrieves stats for an internal device purely from the vport layer;
 * returns the cached vport_stats_error as the result code. */
1742 netdev_internal_get_stats(const struct netdev *netdev_,
1743 struct netdev_stats *stats)
1745 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1748 ovs_mutex_lock(&netdev->mutex);
1749 get_stats_via_vport(netdev_, stats);
1750 error = netdev->vport_stats_error;
1751 ovs_mutex_unlock(&netdev->mutex);
/* Populates netdev->{supported,advertised,current} NETDEV_F_* feature
 * bitmaps from an ETHTOOL_GSET query, caching the result (and any
 * error) under VALID_FEATURES. Caller must hold netdev->mutex. */
1757 netdev_linux_read_features(struct netdev_linux *netdev)
1759 struct ethtool_cmd ecmd;
1763 if (netdev->cache_valid & VALID_FEATURES) {
1767 COVERAGE_INC(netdev_get_ethtool);
1768 memset(&ecmd, 0, sizeof ecmd);
1769 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
1770 ETHTOOL_GSET, "ETHTOOL_GSET");
1775 /* Supported features. */
1776 netdev->supported = 0;
1777 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1778 netdev->supported |= NETDEV_F_10MB_HD;
1780 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1781 netdev->supported |= NETDEV_F_10MB_FD;
1783 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1784 netdev->supported |= NETDEV_F_100MB_HD;
1786 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1787 netdev->supported |= NETDEV_F_100MB_FD;
1789 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1790 netdev->supported |= NETDEV_F_1GB_HD;
1792 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1793 netdev->supported |= NETDEV_F_1GB_FD;
1795 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1796 netdev->supported |= NETDEV_F_10GB_FD;
1798 if (ecmd.supported & SUPPORTED_TP) {
1799 netdev->supported |= NETDEV_F_COPPER;
1801 if (ecmd.supported & SUPPORTED_FIBRE) {
1802 netdev->supported |= NETDEV_F_FIBER;
1804 if (ecmd.supported & SUPPORTED_Autoneg) {
1805 netdev->supported |= NETDEV_F_AUTONEG;
1807 if (ecmd.supported & SUPPORTED_Pause) {
1808 netdev->supported |= NETDEV_F_PAUSE;
1810 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1811 netdev->supported |= NETDEV_F_PAUSE_ASYM;
1814 /* Advertised features. */
1815 netdev->advertised = 0;
1816 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1817 netdev->advertised |= NETDEV_F_10MB_HD;
1819 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1820 netdev->advertised |= NETDEV_F_10MB_FD;
1822 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1823 netdev->advertised |= NETDEV_F_100MB_HD;
1825 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1826 netdev->advertised |= NETDEV_F_100MB_FD;
1828 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1829 netdev->advertised |= NETDEV_F_1GB_HD;
1831 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1832 netdev->advertised |= NETDEV_F_1GB_FD;
1834 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1835 netdev->advertised |= NETDEV_F_10GB_FD;
1837 if (ecmd.advertising & ADVERTISED_TP) {
1838 netdev->advertised |= NETDEV_F_COPPER;
1840 if (ecmd.advertising & ADVERTISED_FIBRE) {
1841 netdev->advertised |= NETDEV_F_FIBER;
1843 if (ecmd.advertising & ADVERTISED_Autoneg) {
1844 netdev->advertised |= NETDEV_F_AUTONEG;
1846 if (ecmd.advertising & ADVERTISED_Pause) {
1847 netdev->advertised |= NETDEV_F_PAUSE;
1849 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1850 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
1853 /* Current settings. */
1855 if (speed == SPEED_10) {
1856 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1857 } else if (speed == SPEED_100) {
1858 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1859 } else if (speed == SPEED_1000) {
1860 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1861 } else if (speed == SPEED_10000) {
1862 netdev->current = NETDEV_F_10GB_FD;
/* Literal Mb/s values: 40000 = 40 Gb/s, 100000 = 100 Gb/s,
 * 1000000 = 1 Tb/s (no SPEED_* constants used here). */
1863 } else if (speed == 40000) {
1864 netdev->current = NETDEV_F_40GB_FD;
1865 } else if (speed == 100000) {
1866 netdev->current = NETDEV_F_100GB_FD;
1867 } else if (speed == 1000000) {
1868 netdev->current = NETDEV_F_1TB_FD;
1870 netdev->current = 0;
1873 if (ecmd.port == PORT_TP) {
1874 netdev->current |= NETDEV_F_COPPER;
1875 } else if (ecmd.port == PORT_FIBRE) {
1876 netdev->current |= NETDEV_F_FIBER;
1880 netdev->current |= NETDEV_F_AUTONEG;
/* The cache is marked valid even on ethtool failure; the error is
 * stored so callers see a consistent result until invalidated. */
1884 netdev->cache_valid |= VALID_FEATURES;
1885 netdev->get_features_error = error;
1888 /* Stores the features supported by 'netdev' into of '*current', '*advertised',
1889 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
1890 * Returns 0 if successful, otherwise a positive errno value. */
1892 netdev_linux_get_features(const struct netdev *netdev_,
1893 enum netdev_features *current,
1894 enum netdev_features *advertised,
1895 enum netdev_features *supported,
1896 enum netdev_features *peer)
1898 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1901 ovs_mutex_lock(&netdev->mutex);
/* Populates/refreshes the VALID_FEATURES cache before reading it. */
1902 netdev_linux_read_features(netdev);
1903 if (!netdev->get_features_error) {
1904 *current = netdev->current;
1905 *advertised = netdev->advertised;
1906 *supported = netdev->supported;
/* Peer advertisement is not implemented; always reported as 0. */
1907 *peer = 0; /* XXX */
1909 error = netdev->get_features_error;
1910 ovs_mutex_unlock(&netdev->mutex);
1915 /* Set the features advertised by 'netdev' to 'advertise'. */
1917 netdev_linux_set_advertisements(struct netdev *netdev_,
1918 enum netdev_features advertise)
1920 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1921 struct ethtool_cmd ecmd;
1924 ovs_mutex_lock(&netdev->mutex);
/* Read-modify-write: fetch current ethtool settings, replace only the
 * advertising mask, then write the whole struct back with SSET. */
1926 COVERAGE_INC(netdev_get_ethtool);
1927 memset(&ecmd, 0, sizeof ecmd);
1928 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1929 ETHTOOL_GSET, "ETHTOOL_GSET");
1934 ecmd.advertising = 0;
1935 if (advertise & NETDEV_F_10MB_HD) {
1936 ecmd.advertising |= ADVERTISED_10baseT_Half;
1938 if (advertise & NETDEV_F_10MB_FD) {
1939 ecmd.advertising |= ADVERTISED_10baseT_Full;
1941 if (advertise & NETDEV_F_100MB_HD) {
1942 ecmd.advertising |= ADVERTISED_100baseT_Half;
1944 if (advertise & NETDEV_F_100MB_FD) {
1945 ecmd.advertising |= ADVERTISED_100baseT_Full;
1947 if (advertise & NETDEV_F_1GB_HD) {
1948 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1950 if (advertise & NETDEV_F_1GB_FD) {
1951 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1953 if (advertise & NETDEV_F_10GB_FD) {
1954 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1956 if (advertise & NETDEV_F_COPPER) {
1957 ecmd.advertising |= ADVERTISED_TP;
1959 if (advertise & NETDEV_F_FIBER) {
1960 ecmd.advertising |= ADVERTISED_FIBRE;
1962 if (advertise & NETDEV_F_AUTONEG) {
1963 ecmd.advertising |= ADVERTISED_Autoneg;
1965 if (advertise & NETDEV_F_PAUSE) {
1966 ecmd.advertising |= ADVERTISED_Pause;
1968 if (advertise & NETDEV_F_PAUSE_ASYM) {
1969 ecmd.advertising |= ADVERTISED_Asym_Pause;
1971 COVERAGE_INC(netdev_set_ethtool);
1972 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1973 ETHTOOL_SSET, "ETHTOOL_SSET");
1976 ovs_mutex_unlock(&netdev->mutex);
1980 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1981 * successful, otherwise a positive errno value. */
1983 netdev_linux_set_policing(struct netdev *netdev_,
1984 uint32_t kbits_rate, uint32_t kbits_burst)
1986 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1987 const char *netdev_name = netdev_get_name(netdev_);
1990 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1991 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1992 : kbits_burst); /* Stick with user-specified value. */
1994 ovs_mutex_lock(&netdev->mutex);
1995 if (netdev->cache_valid & VALID_POLICING) {
1996 error = netdev->netdev_policing_error;
1997 if (error || (netdev->kbits_rate == kbits_rate &&
1998 netdev->kbits_burst == kbits_burst)) {
1999 /* Assume that settings haven't changed since we last set them. */
2002 netdev->cache_valid &= ~VALID_POLICING;
2005 COVERAGE_INC(netdev_set_policing);
2006 /* Remove any existing ingress qdisc. */
2007 error = tc_add_del_ingress_qdisc(netdev_, false);
2009 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
2010 netdev_name, ovs_strerror(error));
/* A non-zero rate re-installs the ingress qdisc and attaches a tc
 * policer to it; rate 0 means "policing off" and stops after the
 * removal above. */
2015 error = tc_add_del_ingress_qdisc(netdev_, true);
2017 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
2018 netdev_name, ovs_strerror(error));
2022 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
2024 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
2025 netdev_name, ovs_strerror(error));
2030 netdev->kbits_rate = kbits_rate;
2031 netdev->kbits_burst = kbits_burst;
/* Cache success or ENODEV so a gone device is not retried forever. */
2034 if (!error || error == ENODEV) {
2035 netdev->netdev_policing_error = error;
2036 netdev->cache_valid |= VALID_POLICING;
2038 ovs_mutex_unlock(&netdev->mutex);
/* Adds the OVS name of every installable QoS type in the 'tcs' table to
 * 'types'. Entries with an empty ovs_name (the default/other qdiscs)
 * are not user-selectable and are skipped. */
2043 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
2046 const struct tc_ops *const *opsp;
2048 for (opsp = tcs; *opsp != NULL; opsp++) {
2049 const struct tc_ops *ops = *opsp;
2050 if (ops->tc_install && ops->ovs_name[0] != '\0') {
2051 sset_add(types, ops->ovs_name);
2057 static const struct tc_ops *
/* Linear search of 'tcs' by OVS-facing QoS type name; elided lines
 * presumably return the match / NULL. */
2058 tc_lookup_ovs_name(const char *name)
2060 const struct tc_ops *const *opsp;
2062 for (opsp = tcs; *opsp != NULL; opsp++) {
2063 const struct tc_ops *ops = *opsp;
2064 if (!strcmp(name, ops->ovs_name)) {
2071 static const struct tc_ops *
/* Linear search of 'tcs' by Linux qdisc name; linux_name may be NULL
 * for table entries with no kernel counterpart, hence the guard. */
2072 tc_lookup_linux_name(const char *name)
2074 const struct tc_ops *const *opsp;
2076 for (opsp = tcs; *opsp != NULL; opsp++) {
2077 const struct tc_ops *ops = *opsp;
2078 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
2085 static struct tc_queue *
/* Looks up queue 'queue_id' in the device's tc queue hmap using a
 * precomputed 'hash'. Caller must hold netdev->mutex. */
2086 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
2089 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2090 struct tc_queue *queue;
2092 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
2093 if (queue->queue_id == queue_id) {
2100 static struct tc_queue *
/* Convenience wrapper: hashes 'queue_id' and delegates to the __
 * variant above. */
2101 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
2103 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Reports the capabilities (queue count) of QoS type 'type'; elided
 * lines presumably handle an unknown type. */
2107 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
2109 struct netdev_qos_capabilities *caps)
2111 const struct tc_ops *ops = tc_lookup_ovs_name(type);
2115 caps->n_queues = ops->n_queues;
/* Returns the active QoS type name and its configuration 'details',
 * querying the kernel qdisc first via tc_query_qdisc(). */
2120 netdev_linux_get_qos(const struct netdev *netdev_,
2121 const char **typep, struct smap *details)
2123 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2126 ovs_mutex_lock(&netdev->mutex);
2127 error = tc_query_qdisc(netdev_);
2129 *typep = netdev->tc->ops->ovs_name;
/* qdisc_get is optional; a type with no retrievable config reports
 * success with empty details. */
2130 error = (netdev->tc->ops->qdisc_get
2131 ? netdev->tc->ops->qdisc_get(netdev_, details)
2134 ovs_mutex_unlock(&netdev->mutex);
/* Installs QoS discipline 'type' with 'details' on 'netdev_'.
 * If the requested type is already installed, only its configuration is
 * updated; otherwise the existing qdisc is deleted and the new one is
 * installed from scratch. */
2140 netdev_linux_set_qos(struct netdev *netdev_,
2141 const char *type, const struct smap *details)
2143 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2144 const struct tc_ops *new_ops;
2147 new_ops = tc_lookup_ovs_name(type);
2148 if (!new_ops || !new_ops->tc_install) {
2152 ovs_mutex_lock(&netdev->mutex);
2153 error = tc_query_qdisc(netdev_);
2158 if (new_ops == netdev->tc->ops) {
2159 error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
2161 /* Delete existing qdisc. */
2162 error = tc_del_qdisc(netdev_);
2166 ovs_assert(netdev->tc == NULL);
2168 /* Install new qdisc. */
2169 error = new_ops->tc_install(netdev_, details);
/* Invariant: tc state exists exactly when installation succeeded. */
2170 ovs_assert((error == 0) == (netdev->tc != NULL));
2174 ovs_mutex_unlock(&netdev->mutex);
/* Fetches configuration 'details' of queue 'queue_id' via the active
 * QoS type's class_get hook. */
2179 netdev_linux_get_queue(const struct netdev *netdev_,
2180 unsigned int queue_id, struct smap *details)
2182 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2185 ovs_mutex_lock(&netdev->mutex);
2186 error = tc_query_qdisc(netdev_);
2188 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2190 ? netdev->tc->ops->class_get(netdev_, queue, details)
2193 ovs_mutex_unlock(&netdev->mutex);
/* Creates or reconfigures queue 'queue_id' with 'details' via the
 * active QoS type's class_set hook; 'queue_id' must be within the
 * type's supported queue range. */
2199 netdev_linux_set_queue(struct netdev *netdev_,
2200 unsigned int queue_id, const struct smap *details)
2202 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2205 ovs_mutex_lock(&netdev->mutex);
2206 error = tc_query_qdisc(netdev_);
2208 error = (queue_id < netdev->tc->ops->n_queues
2209 && netdev->tc->ops->class_set
2210 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
2213 ovs_mutex_unlock(&netdev->mutex);
/* Deletes queue 'queue_id' via the active QoS type's class_delete
 * hook; elided branches presumably report EOPNOTSUPP/ENOENT. */
2219 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
2221 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2224 ovs_mutex_lock(&netdev->mutex);
2225 error = tc_query_qdisc(netdev_);
2227 if (netdev->tc->ops->class_delete) {
2228 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2230 ? netdev->tc->ops->class_delete(netdev_, queue)
2236 ovs_mutex_unlock(&netdev->mutex);
/* Retrieves statistics for queue 'queue_id' into '*stats' via the
 * active QoS type's class_get_stats hook; also reports the queue's
 * creation time. */
2242 netdev_linux_get_queue_stats(const struct netdev *netdev_,
2243 unsigned int queue_id,
2244 struct netdev_queue_stats *stats)
2246 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2249 ovs_mutex_lock(&netdev->mutex);
2250 error = tc_query_qdisc(netdev_);
2252 if (netdev->tc->ops->class_get_stats) {
2253 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2255 stats->created = queue->created;
2256 error = netdev->tc->ops->class_get_stats(netdev_, queue,
2265 ovs_mutex_unlock(&netdev->mutex);
/* State for an RTM_GETTCLASS Netlink dump over a device's tc classes. */
2270 struct queue_dump_state {
2271 struct nl_dump dump;
/* Begins a tc-class dump for 'netdev' into 'state'; elided lines
 * presumably return false when the request cannot be built. */
2276 start_queue_dump(const struct netdev *netdev, struct queue_dump_state *state)
2278 struct ofpbuf request;
2279 struct tcmsg *tcmsg;
2281 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
2285 tcmsg->tcm_parent = 0;
2286 nl_dump_start(&state->dump, NETLINK_ROUTE, &request);
2287 ofpbuf_uninit(&request);
2289 ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
/* Releases the dump buffer and returns the overall dump status from
 * nl_dump_done(). */
2294 finish_queue_dump(struct queue_dump_state *state)
2296 ofpbuf_uninit(&state->buf);
2297 return nl_dump_done(&state->dump);
/* Iterator state for dumping a device's queues: a snapshot array of
 * queue ids plus a cursor. */
2300 struct netdev_linux_queue_state {
2301 unsigned int *queues;
/* Starts a queue dump: snapshots all queue ids under the mutex so the
 * iteration does not need to hold the lock across callbacks. */
2307 netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
2309 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2312 ovs_mutex_lock(&netdev->mutex);
2313 error = tc_query_qdisc(netdev_);
2315 if (netdev->tc->ops->class_get) {
2316 struct netdev_linux_queue_state *state;
2317 struct tc_queue *queue;
2320 *statep = state = xmalloc(sizeof *state);
2321 state->n_queues = hmap_count(&netdev->tc->queues);
2322 state->cur_queue = 0;
2323 state->queues = xmalloc(state->n_queues * sizeof *state->queues);
2326 HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
2327 state->queues[i++] = queue->queue_id;
2333 ovs_mutex_unlock(&netdev->mutex);
/* Advances the dump: returns the next still-existing queue's id and
 * details. Queues deleted since the snapshot are silently skipped
 * (tc_find_queue returns NULL for them). */
2339 netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
2340 unsigned int *queue_idp, struct smap *details)
2342 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2343 struct netdev_linux_queue_state *state = state_;
2346 ovs_mutex_lock(&netdev->mutex);
2347 while (state->cur_queue < state->n_queues) {
2348 unsigned int queue_id = state->queues[state->cur_queue++];
2349 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2352 *queue_idp = queue_id;
2353 error = netdev->tc->ops->class_get(netdev_, queue, details);
2357 ovs_mutex_unlock(&netdev->mutex);
/* Frees the dump state allocated by queue_dump_start (the id array and,
 * in elided lines, presumably the state struct itself). */
2363 netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
2366 struct netdev_linux_queue_state *state = state_;
2368 free(state->queues);
/* Dumps per-queue statistics by walking an RTM_GETTCLASS Netlink dump
 * and handing each message to the QoS type's class_dump_stats hook,
 * which invokes 'cb' with 'aux' per queue. */
2374 netdev_linux_dump_queue_stats(const struct netdev *netdev_,
2375 netdev_dump_queue_stats_cb *cb, void *aux)
2377 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2380 ovs_mutex_lock(&netdev->mutex);
2381 error = tc_query_qdisc(netdev_);
2383 struct queue_dump_state state;
2385 if (!netdev->tc->ops->class_dump_stats) {
2387 } else if (!start_queue_dump(netdev_, &state)) {
2393 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
2394 retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
2401 retval = finish_queue_dump(&state);
2407 ovs_mutex_unlock(&netdev->mutex);
/* Reads the device's IPv4 address and netmask (SIOCGIFADDR /
 * SIOCGIFNETMASK), caching both under VALID_IN4. Reports
 * EADDRNOTAVAIL when no address is assigned. */
2413 netdev_linux_get_in4(const struct netdev *netdev_,
2414 struct in_addr *address, struct in_addr *netmask)
2416 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2419 ovs_mutex_lock(&netdev->mutex);
2420 if (!(netdev->cache_valid & VALID_IN4)) {
2421 error = netdev_linux_get_ipv4(netdev_, &netdev->address,
2422 SIOCGIFADDR, "SIOCGIFADDR");
2424 error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
2425 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2427 netdev->in4_error = error;
2428 netdev->cache_valid |= VALID_IN4;
2430 error = netdev->in4_error;
2434 if (netdev->address.s_addr != INADDR_ANY) {
2435 *address = netdev->address;
2436 *netmask = netdev->netmask;
2438 error = EADDRNOTAVAIL;
2441 ovs_mutex_unlock(&netdev->mutex);
/* Sets the device's IPv4 address and netmask. The netmask is only
 * written when a real address is assigned (INADDR_ANY means "clear").
 * On success the cache is refreshed; on failure it is invalidated. */
2447 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2448 struct in_addr netmask)
2450 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2453 ovs_mutex_lock(&netdev->mutex);
2454 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2456 netdev->address = address;
2457 netdev->netmask = netmask;
2458 if (address.s_addr != INADDR_ANY) {
2459 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2460 "SIOCSIFNETMASK", netmask);
2465 netdev->cache_valid |= VALID_IN4;
2466 netdev->in4_error = 0;
2468 netdev->cache_valid &= ~VALID_IN4;
2470 ovs_mutex_unlock(&netdev->mutex);
/* Parses one line of /proc/net/if_inet6: 32 hex digits of IPv6 address
 * (as 16 bytes), four ignored hex fields, then the interface name.
 * Returns nonzero on a full match (via ovs_scan). */
2476 parse_if_inet6_line(const char *line,
2477 struct in6_addr *in6, char ifname[16 + 1])
2479 uint8_t *s6 = in6->s6_addr;
2480 #define X8 "%2"SCNx8
2481 return ovs_scan(line,
2482 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
2483 "%*x %*x %*x %*x %16s\n",
2484 &s6[0], &s6[1], &s6[2], &s6[3],
2485 &s6[4], &s6[5], &s6[6], &s6[7],
2486 &s6[8], &s6[9], &s6[10], &s6[11],
2487 &s6[12], &s6[13], &s6[14], &s6[15],
2491 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address.
2492 * Otherwise, sets '*in6' to 'in6addr_any' and returns the corresponding
2495 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2497 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2500 ovs_mutex_lock(&netdev->mutex);
2501 if (!(netdev->cache_valid & VALID_IN6)) {
/* Default to "no address" before scanning the proc file. */
2505 netdev->in6 = in6addr_any;
2506 netdev->in6_error = EADDRNOTAVAIL;
2508 file = fopen("/proc/net/if_inet6", "r");
2510 const char *name = netdev_get_name(netdev_);
2511 while (fgets(line, sizeof line, file)) {
2512 struct in6_addr in6_tmp;
2513 char ifname[16 + 1];
2514 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2515 && !strcmp(name, ifname))
2517 netdev->in6 = in6_tmp;
2518 netdev->in6_error = 0;
/* fopen() failure path: IPv6 info is unavailable on this system. */
2524 netdev->in6_error = EOPNOTSUPP;
2526 netdev->cache_valid |= VALID_IN6;
2529 error = netdev->in6_error;
2530 ovs_mutex_unlock(&netdev->mutex);
/* Fills generic 'sa' with an AF_INET sockaddr_in carrying 'addr',
 * zeroing any trailing bytes of the larger struct sockaddr first. */
2536 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
2538 struct sockaddr_in sin;
2539 memset(&sin, 0, sizeof sin);
2540 sin.sin_family = AF_INET;
2541 sin.sin_addr = addr;
2544 memset(sa, 0, sizeof *sa);
2545 memcpy(sa, &sin, sizeof sin);
/* Applies address-setting ioctl 'ioctl_nr' ('ioctl_name' is for error
 * reporting) with 'addr' packed into an ifreq, via the shared AF_INET
 * ioctl socket. */
2549 do_set_addr(struct netdev *netdev,
2550 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2554 make_in4_sockaddr(&ifr.ifr_addr, addr);
2555 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2559 /* Adds 'router' as a default IP gateway. */
/* Builds a 0.0.0.0/0 route through 'router' and installs it with
 * SIOCADDRT; failures are logged but still returned to the caller. */
2561 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2563 struct in_addr any = { INADDR_ANY };
2567 memset(&rt, 0, sizeof rt);
2568 make_in4_sockaddr(&rt.rt_dst, any);
2569 make_in4_sockaddr(&rt.rt_gateway, router);
2570 make_in4_sockaddr(&rt.rt_genmask, any);
2571 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2572 error = af_inet_ioctl(SIOCADDRT, &rt);
2574 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
/* Looks up the next hop toward 'host' by scanning /proc/net/route.
 * On a match, stores the gateway (or 0 if directly reachable) in
 * '*next_hop' and a heap-allocated copy of the egress interface name in
 * '*netdev_name' (caller frees).  NOTE(review): this listing elides
 * several lines (header-line skip, return paths); comments describe
 * only the visible code. */
2580 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2583 static const char fn[] = "/proc/net/route";
2588 *netdev_name = NULL;
2589 stream = fopen(fn, "r");
2590 if (stream == NULL) {
2591 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
2596 while (fgets(line, sizeof line, stream)) {
2599 ovs_be32 dest, gateway, mask;
2600 int refcnt, metric, mtu;
2601 unsigned int flags, use, window, irtt;
/* Fields match the /proc/net/route column layout: Iface, Destination,
 * Gateway, Flags, RefCnt, Use, Metric, Mask, MTU, Window, IRTT. */
2604 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2606 iface, &dest, &gateway, &flags, &refcnt,
2607 &use, &metric, &mask, &mtu, &window, &irtt)) {
2608 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2612 if (!(flags & RTF_UP)) {
2613 /* Skip routes that aren't up. */
2617 /* The output of 'dest', 'mask', and 'gateway' were given in
2618 * network byte order, so we don't need need any endian
2619 * conversions here. */
2620 if ((dest & mask) == (host->s_addr & mask)) {
2622 /* The host is directly reachable. */
2623 next_hop->s_addr = 0;
2625 /* To reach the host, we must go through a gateway. */
2626 next_hop->s_addr = gateway;
2628 *netdev_name = xstrdup(iface);
/* Populates 'smap' with driver name/version and firmware version from a
 * cached ETHTOOL_GDRVINFO query (VALID_DRVINFO bit).  The drvinfo buffer
 * is aliased as a struct ethtool_cmd because netdev_linux_do_ethtool()
 * takes that type. */
2640 netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
2642 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2645 ovs_mutex_lock(&netdev->mutex);
2646 if (!(netdev->cache_valid & VALID_DRVINFO)) {
2647 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
2649 COVERAGE_INC(netdev_get_ethtool);
2650 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2651 error = netdev_linux_do_ethtool(netdev->up.name,
2654 "ETHTOOL_GDRVINFO");
2656 netdev->cache_valid |= VALID_DRVINFO;
2661 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2662 smap_add(smap, "driver_version", netdev->drvinfo.version);
2663 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
2665 ovs_mutex_unlock(&netdev->mutex);
/* Status for OVS "internal" devices: a fixed driver name, no ethtool
 * query. */
2671 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2674 smap_add(smap, "driver_name", "openvswitch")
2678 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2679 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2680 * returns 0. Otherwise, it returns a positive errno value; in particular,
2681 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2683 netdev_linux_arp_lookup(const struct netdev *netdev,
2684 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2687 struct sockaddr_in sin;
2690 memset(&r, 0, sizeof r);
2691 memset(&sin, 0, sizeof sin);
2692 sin.sin_family = AF_INET;
2693 sin.sin_addr.s_addr = ip;
/* arp_pa is a generic struct sockaddr; copy the sockaddr_in into it. */
2695 memcpy(&r.arp_pa, &sin, sizeof sin);
2696 r.arp_ha.sa_family = ARPHRD_ETHER;
2698 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2699 COVERAGE_INC(netdev_arp_lookup);
2700 retval = af_inet_ioctl(SIOCGARP, &r);
2702 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
/* ENXIO just means "no entry" (per the contract above), so only other
 * errors are logged. */
2703 } else if (retval != ENXIO) {
2704 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2705 netdev_get_name(netdev), IP_ARGS(ip),
2706 ovs_strerror(retval));
/* Translates generic NETDEV_* flag bits into Linux IFF_* bits. */
2712 nd_to_iff_flags(enum netdev_flags nd)
2715 if (nd & NETDEV_UP) {
2718 if (nd & NETDEV_PROMISC) {
2721 if (nd & NETDEV_LOOPBACK) {
2722 iff |= IFF_LOOPBACK;
/* Inverse translation: Linux IFF_* bits to NETDEV_* bits. */
2728 iff_to_nd_flags(int iff)
2730 enum netdev_flags nd = 0;
2734 if (iff & IFF_PROMISC) {
2735 nd |= NETDEV_PROMISC;
2737 if (iff & IFF_LOOPBACK) {
2738 nd |= NETDEV_LOOPBACK;
/* Clears 'off' and sets 'on' in the device's interface flags, reporting
 * the previous flags in '*old_flagsp'.  Only issues a set when the
 * computed flags actually differ, then re-reads the kernel's view into
 * the cached ifi_flags.  Caller must hold netdev->mutex. */
2744 update_flags(struct netdev_linux *netdev, enum netdev_flags off,
2745 enum netdev_flags on, enum netdev_flags *old_flagsp)
2746 OVS_REQUIRES(netdev->mutex)
2748 int old_flags, new_flags;
2751 old_flags = netdev->ifi_flags;
2752 *old_flagsp = iff_to_nd_flags(old_flags);
2753 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2754 if (new_flags != old_flags) {
2755 error = set_flags(netdev_get_name(&netdev->up), new_flags);
2756 get_flags(&netdev->up, &netdev->ifi_flags);
/* Public netdev-provider entry point: locks the mutex around
 * update_flags(). */
2763 netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2764 enum netdev_flags on, enum netdev_flags *old_flagsp)
2766 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2769 ovs_mutex_lock(&netdev->mutex);
2770 error = update_flags(netdev, off, on, old_flagsp);
2771 ovs_mutex_unlock(&netdev->mutex);
/* Template for a struct netdev_class initializer.  The five macro
 * arguments supply the pieces that differ between the "system", "tap",
 * and "internal" device classes; every other member is shared.
 * (No comments are inserted inside the macro body below: each line ends
 * in a backslash continuation and an uncontinued comment line would
 * terminate the macro.) */
2776 #define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS, \
2777 GET_FEATURES, GET_STATUS) \
2783 netdev_linux_wait, \
2785 netdev_linux_alloc, \
2787 netdev_linux_destruct, \
2788 netdev_linux_dealloc, \
2789 NULL, /* get_config */ \
2790 NULL, /* set_config */ \
2791 NULL, /* get_tunnel_config */ \
2792 NULL, /* build header */ \
2793 NULL, /* push header */ \
2794 NULL, /* pop header */ \
2795 NULL, /* get_numa_id */ \
2796 NULL, /* set_multiq */ \
2798 netdev_linux_send, \
2799 netdev_linux_send_wait, \
2801 netdev_linux_set_etheraddr, \
2802 netdev_linux_get_etheraddr, \
2803 netdev_linux_get_mtu, \
2804 netdev_linux_set_mtu, \
2805 netdev_linux_get_ifindex, \
2806 netdev_linux_get_carrier, \
2807 netdev_linux_get_carrier_resets, \
2808 netdev_linux_set_miimon_interval, \
2812 netdev_linux_set_advertisements, \
2814 netdev_linux_set_policing, \
2815 netdev_linux_get_qos_types, \
2816 netdev_linux_get_qos_capabilities, \
2817 netdev_linux_get_qos, \
2818 netdev_linux_set_qos, \
2819 netdev_linux_get_queue, \
2820 netdev_linux_set_queue, \
2821 netdev_linux_delete_queue, \
2822 netdev_linux_get_queue_stats, \
2823 netdev_linux_queue_dump_start, \
2824 netdev_linux_queue_dump_next, \
2825 netdev_linux_queue_dump_done, \
2826 netdev_linux_dump_queue_stats, \
2828 netdev_linux_get_in4, \
2829 netdev_linux_set_in4, \
2830 netdev_linux_get_in6, \
2831 netdev_linux_add_router, \
2832 netdev_linux_get_next_hop, \
2834 netdev_linux_arp_lookup, \
2836 netdev_linux_update_flags, \
2838 netdev_linux_rxq_alloc, \
2839 netdev_linux_rxq_construct, \
2840 netdev_linux_rxq_destruct, \
2841 netdev_linux_rxq_dealloc, \
2842 netdev_linux_rxq_recv, \
2843 netdev_linux_rxq_wait, \
2844 netdev_linux_rxq_drain, \
/* "system" devices: ordinary kernel network devices. */
2847 const struct netdev_class netdev_linux_class =
2850 netdev_linux_construct,
2851 netdev_linux_get_stats,
2852 netdev_linux_get_features,
2853 netdev_linux_get_status);
/* "tap" devices: same as system devices except for construction and
 * stats retrieval. */
2855 const struct netdev_class netdev_tap_class =
2858 netdev_linux_construct_tap,
2859 netdev_tap_get_stats,
2860 netdev_linux_get_features,
2861 netdev_linux_get_status);
/* "internal" devices: no feature negotiation, fixed status. */
2863 const struct netdev_class netdev_internal_class =
2866 netdev_linux_construct,
2867 netdev_internal_get_stats,
2868 NULL, /* get_features */
2869 netdev_internal_get_status);
/* CoDel exposes no per-queue configuration, hence zero queues. */
2872 #define CODEL_N_QUEUES 0x0000
2874 /* In sufficiently new kernel headers these are defined as enums in
2875 * <linux/pkt_sched.h>. Define them here as macros to help out with older
2876 * kernels. (This overrides any enum definition in the header file but that's
2878 #define TCA_CODEL_TARGET 1
2879 #define TCA_CODEL_LIMIT 2
2880 #define TCA_CODEL_INTERVAL 3
/* Downcasts the netdev's generic tc state to the CoDel-specific
 * container. */
2889 static struct codel *
2890 codel_get__(const struct netdev *netdev_)
2892 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2893 return CONTAINER_OF(netdev->tc, struct codel, tc);
/* Records a CoDel configuration in the netdev's tc state (does not
 * touch the kernel). */
2897 codel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
2900 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2901 struct codel *codel;
2903 codel = xmalloc(sizeof *codel);
2904 tc_init(&codel->tc, &tc_ops_codel);
2905 codel->target = target;
2906 codel->limit = limit;
2907 codel->interval = interval;
2909 netdev->tc = &codel->tc;
/* Replaces the device's root qdisc with a "codel" qdisc configured via
 * a netlink RTM_NEWQDISC request.  Zero-valued parameters fall back to
 * the defaults target=5000, limit=10240, interval=100000. */
2913 codel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
2917 struct ofpbuf request;
2918 struct tcmsg *tcmsg;
2919 uint32_t otarget, olimit, ointerval;
2922 tc_del_qdisc(netdev);
2924 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2925 NLM_F_EXCL | NLM_F_CREATE, &request);
2929 tcmsg->tcm_handle = tc_make_handle(1, 0);
2930 tcmsg->tcm_parent = TC_H_ROOT;
2932 otarget = target ? target : 5000;
2933 olimit = limit ? limit : 10240;
2934 ointerval = interval ? interval : 100000;
2936 nl_msg_put_string(&request, TCA_KIND, "codel");
2937 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2938 nl_msg_put_u32(&request, TCA_CODEL_TARGET, otarget);
2939 nl_msg_put_u32(&request, TCA_CODEL_LIMIT, olimit);
2940 nl_msg_put_u32(&request, TCA_CODEL_INTERVAL, ointerval);
2941 nl_msg_end_nested(&request, opt_offset);
2943 error = tc_transact(&request, NULL);
2945 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
2946 "target %u, limit %u, interval %u error %d(%s)",
2947 netdev_get_name(netdev),
2948 otarget, olimit, ointerval,
2949 error, ovs_strerror(error));
/* Extracts "target", "limit", and "interval" from the OVSDB 'details'
 * smap into '*codel', substituting the same defaults used by
 * codel_setup_qdisc__() when a key is missing or zero. */
2955 codel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
2956 const struct smap *details, struct codel *codel)
2958 const char *target_s;
2959 const char *limit_s;
2960 const char *interval_s;
2962 target_s = smap_get(details, "target");
2963 limit_s = smap_get(details, "limit");
2964 interval_s = smap_get(details, "interval");
2966 codel->target = target_s ? strtoull(target_s, NULL, 10) : 0;
2967 codel->limit = limit_s ? strtoull(limit_s, NULL, 10) : 0;
2968 codel->interval = interval_s ? strtoull(interval_s, NULL, 10) : 0;
2970 if (!codel->target) {
2971 codel->target = 5000;
2973 if (!codel->limit) {
2974 codel->limit = 10240;
2976 if (!codel->interval) {
2977 codel->interval = 100000;
/* tc_ops "tc_install" hook: parse details, program the kernel qdisc,
 * then record the configuration on success. */
2982 codel_tc_install(struct netdev *netdev, const struct smap *details)
2987 codel_parse_qdisc_details__(netdev, details, &codel);
2988 error = codel_setup_qdisc__(netdev, codel.target, codel.limit,
2991 codel_install__(netdev, codel.target, codel.limit, codel.interval);
/* Decodes the nested TCA_OPTIONS attributes of a kernel "codel" qdisc
 * into '*codel'; all three attributes are required by the policy. */
2997 codel_parse_tca_options__(struct nlattr *nl_options, struct codel *codel)
2999 static const struct nl_policy tca_codel_policy[] = {
3000 [TCA_CODEL_TARGET] = { .type = NL_A_U32 },
3001 [TCA_CODEL_LIMIT] = { .type = NL_A_U32 },
3002 [TCA_CODEL_INTERVAL] = { .type = NL_A_U32 }
3005 struct nlattr *attrs[ARRAY_SIZE(tca_codel_policy)];
3007 if (!nl_parse_nested(nl_options, tca_codel_policy,
3008 attrs, ARRAY_SIZE(tca_codel_policy))) {
3009 VLOG_WARN_RL(&rl, "failed to parse CoDel class options");
3013 codel->target = nl_attr_get_u32(attrs[TCA_CODEL_TARGET]);
3014 codel->limit = nl_attr_get_u32(attrs[TCA_CODEL_LIMIT]);
3015 codel->interval = nl_attr_get_u32(attrs[TCA_CODEL_INTERVAL]);
/* tc_ops "tc_load" hook: recover an existing kernel codel config from
 * the RTM_NEWQDISC message and record it locally. */
3020 codel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3022 struct nlattr *nlattr;
3027 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3032 error = codel_parse_tca_options__(nlattr, &codel);
3037 codel_install__(netdev, codel.target, codel.limit, codel.interval);
/* tc_ops "tc_destroy" hook: frees the codel state allocated by
 * codel_install__(). */
3043 codel_tc_destroy(struct tc *tc)
3045 struct codel *codel = CONTAINER_OF(tc, struct codel, tc);
/* tc_ops "qdisc_get" hook: report current settings into 'details'. */
3051 codel_qdisc_get(const struct netdev *netdev, struct smap *details)
3053 const struct codel *codel = codel_get__(netdev);
3054 smap_add_format(details, "target", "%u", codel->target);
3055 smap_add_format(details, "limit", "%u", codel->limit);
3056 smap_add_format(details, "interval", "%u", codel->interval);
/* tc_ops "qdisc_set" hook: re-program and update the cached values.
 * NOTE(review): calls codel_install__ (which allocates a new struct)
 * and then also mutates via codel_get__; intervening lines are elided
 * from this listing, so the exact sequencing cannot be confirmed here. */
3061 codel_qdisc_set(struct netdev *netdev, const struct smap *details)
3065 codel_parse_qdisc_details__(netdev, details, &codel);
3066 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3067 codel_get__(netdev)->target = codel.target;
3068 codel_get__(netdev)->limit = codel.limit;
3069 codel_get__(netdev)->interval = codel.interval;
/* tc_ops vtable binding the hooks above to the "linux-codel" QoS type.
 * (Remaining initializer lines are elided from this listing.) */
3073 static const struct tc_ops tc_ops_codel = {
3074 "codel", /* linux_name */
3075 "linux-codel", /* ovs_name */
3076 CODEL_N_QUEUES, /* n_queues */
3089 /* FQ-CoDel traffic control class. */
3091 #define FQCODEL_N_QUEUES 0x0000
3093 /* In sufficiently new kernel headers these are defined as enums in
3094 * <linux/pkt_sched.h>. Define them here as macros to help out with older
3095 * kernels. (This overrides any enum definition in the header file but that's
3097 #define TCA_FQ_CODEL_TARGET 1
3098 #define TCA_FQ_CODEL_LIMIT 2
3099 #define TCA_FQ_CODEL_INTERVAL 3
3100 #define TCA_FQ_CODEL_ECN 4
3101 #define TCA_FQ_CODEL_FLOWS 5
3102 #define TCA_FQ_CODEL_QUANTUM 6
/* Downcasts the netdev's generic tc state to the FQ-CoDel container. */
3113 static struct fqcodel *
3114 fqcodel_get__(const struct netdev *netdev_)
3116 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3117 return CONTAINER_OF(netdev->tc, struct fqcodel, tc);
/* Records an FQ-CoDel configuration in the netdev's tc state (does not
 * touch the kernel). */
3121 fqcodel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
3122 uint32_t interval, uint32_t flows, uint32_t quantum)
3124 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3125 struct fqcodel *fqcodel;
3127 fqcodel = xmalloc(sizeof *fqcodel);
3128 tc_init(&fqcodel->tc, &tc_ops_fqcodel);
3129 fqcodel->target = target;
3130 fqcodel->limit = limit;
3131 fqcodel->interval = interval;
3132 fqcodel->flows = flows;
3133 fqcodel->quantum = quantum;
3135 netdev->tc = &fqcodel->tc;
/* Replaces the device's root qdisc with an "fq_codel" qdisc via
 * RTM_NEWQDISC.  Zero-valued parameters fall back to target=5000,
 * limit=10240, interval=100000, flows=1024, quantum=1514. */
3139 fqcodel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
3140 uint32_t interval, uint32_t flows, uint32_t quantum)
3143 struct ofpbuf request;
3144 struct tcmsg *tcmsg;
3145 uint32_t otarget, olimit, ointerval, oflows, oquantum;
3148 tc_del_qdisc(netdev);
3150 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3151 NLM_F_EXCL | NLM_F_CREATE, &request);
3155 tcmsg->tcm_handle = tc_make_handle(1, 0);
3156 tcmsg->tcm_parent = TC_H_ROOT;
3158 otarget = target ? target : 5000;
3159 olimit = limit ? limit : 10240;
3160 ointerval = interval ? interval : 100000;
3161 oflows = flows ? flows : 1024;
3162 oquantum = quantum ? quantum : 1514; /* fq_codel default quantum is 1514
3165 nl_msg_put_string(&request, TCA_KIND, "fq_codel");
3166 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3167 nl_msg_put_u32(&request, TCA_FQ_CODEL_TARGET, otarget);
3168 nl_msg_put_u32(&request, TCA_FQ_CODEL_LIMIT, olimit);
3169 nl_msg_put_u32(&request, TCA_FQ_CODEL_INTERVAL, ointerval);
3170 nl_msg_put_u32(&request, TCA_FQ_CODEL_FLOWS, oflows);
3171 nl_msg_put_u32(&request, TCA_FQ_CODEL_QUANTUM, oquantum);
3172 nl_msg_end_nested(&request, opt_offset);
3174 error = tc_transact(&request, NULL);
3176 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3177 "target %u, limit %u, interval %u, flows %u, quantum %u error %d(%s)",
3178 netdev_get_name(netdev),
3179 otarget, olimit, ointerval, oflows, oquantum,
3180 error, ovs_strerror(error));
/* Extracts FQ-CoDel keys from the OVSDB 'details' smap into '*fqcodel',
 * applying defaults for missing/zero values.
 * NOTE(review): the interval default here is 1000000, while
 * fqcodel_setup_qdisc__() falls back to 100000 — these look
 * inconsistent; confirm intended default against upstream. */
3186 fqcodel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
3187 const struct smap *details, struct fqcodel *fqcodel)
3189 const char *target_s;
3190 const char *limit_s;
3191 const char *interval_s;
3192 const char *flows_s;
3193 const char *quantum_s;
3195 target_s = smap_get(details, "target");
3196 limit_s = smap_get(details, "limit");
3197 interval_s = smap_get(details, "interval");
3198 flows_s = smap_get(details, "flows");
3199 quantum_s = smap_get(details, "quantum");
3200 fqcodel->target = target_s ? strtoull(target_s, NULL, 10) : 0;
3201 fqcodel->limit = limit_s ? strtoull(limit_s, NULL, 10) : 0;
3202 fqcodel->interval = interval_s ? strtoull(interval_s, NULL, 10) : 0;
3203 fqcodel->flows = flows_s ? strtoull(flows_s, NULL, 10) : 0;
3204 fqcodel->quantum = quantum_s ? strtoull(quantum_s, NULL, 10) : 0;
3205 if (!fqcodel->target) {
3206 fqcodel->target = 5000;
3208 if (!fqcodel->limit) {
3209 fqcodel->limit = 10240;
3211 if (!fqcodel->interval) {
3212 fqcodel->interval = 1000000;
3214 if (!fqcodel->flows) {
3215 fqcodel->flows = 1024;
3217 if (!fqcodel->quantum) {
3218 fqcodel->quantum = 1514;
/* tc_ops "tc_install" hook: parse details, program the kernel qdisc,
 * record the configuration on success. */
3223 fqcodel_tc_install(struct netdev *netdev, const struct smap *details)
3226 struct fqcodel fqcodel;
3228 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3229 error = fqcodel_setup_qdisc__(netdev, fqcodel.target, fqcodel.limit,
3230 fqcodel.interval, fqcodel.flows,
3233 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit,
3234 fqcodel.interval, fqcodel.flows, fqcodel.quantum);
/* Decodes the nested TCA_OPTIONS attributes of a kernel "fq_codel"
 * qdisc into '*fqcodel'; all five attributes are required. */
3240 fqcodel_parse_tca_options__(struct nlattr *nl_options, struct fqcodel *fqcodel)
3242 static const struct nl_policy tca_fqcodel_policy[] = {
3243 [TCA_FQ_CODEL_TARGET] = { .type = NL_A_U32 },
3244 [TCA_FQ_CODEL_LIMIT] = { .type = NL_A_U32 },
3245 [TCA_FQ_CODEL_INTERVAL] = { .type = NL_A_U32 },
3246 [TCA_FQ_CODEL_FLOWS] = { .type = NL_A_U32 },
3247 [TCA_FQ_CODEL_QUANTUM] = { .type = NL_A_U32 }
3250 struct nlattr *attrs[ARRAY_SIZE(tca_fqcodel_policy)];
3252 if (!nl_parse_nested(nl_options, tca_fqcodel_policy,
3253 attrs, ARRAY_SIZE(tca_fqcodel_policy))) {
3254 VLOG_WARN_RL(&rl, "failed to parse FQ_CoDel class options");
3258 fqcodel->target = nl_attr_get_u32(attrs[TCA_FQ_CODEL_TARGET]);
3259 fqcodel->limit = nl_attr_get_u32(attrs[TCA_FQ_CODEL_LIMIT]);
3260 fqcodel->interval =nl_attr_get_u32(attrs[TCA_FQ_CODEL_INTERVAL]);
3261 fqcodel->flows = nl_attr_get_u32(attrs[TCA_FQ_CODEL_FLOWS]);
3262 fqcodel->quantum = nl_attr_get_u32(attrs[TCA_FQ_CODEL_QUANTUM]);
/* tc_ops "tc_load" hook: recover an existing kernel fq_codel config
 * from the RTM_NEWQDISC message and record it locally. */
3267 fqcodel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3269 struct nlattr *nlattr;
3272 struct fqcodel fqcodel;
3274 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3279 error = fqcodel_parse_tca_options__(nlattr, &fqcodel);
3284 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3285 fqcodel.flows, fqcodel.quantum);
/* tc_ops "tc_destroy" hook: frees the fqcodel state allocated by
 * fqcodel_install__(). */
3290 fqcodel_tc_destroy(struct tc *tc)
3292 struct fqcodel *fqcodel = CONTAINER_OF(tc, struct fqcodel, tc);
/* tc_ops "qdisc_get" hook: report current settings into 'details'. */
3298 fqcodel_qdisc_get(const struct netdev *netdev, struct smap *details)
3300 const struct fqcodel *fqcodel = fqcodel_get__(netdev);
3301 smap_add_format(details, "target", "%u", fqcodel->target);
3302 smap_add_format(details, "limit", "%u", fqcodel->limit);
3303 smap_add_format(details, "interval", "%u", fqcodel->interval);
3304 smap_add_format(details, "flows", "%u", fqcodel->flows);
3305 smap_add_format(details, "quantum", "%u", fqcodel->quantum);
/* tc_ops "qdisc_set" hook: re-program and update cached values.
 * NOTE(review): intervening lines are elided from this listing. */
3310 fqcodel_qdisc_set(struct netdev *netdev, const struct smap *details)
3312 struct fqcodel fqcodel;
3314 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3315 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3316 fqcodel.flows, fqcodel.quantum);
3317 fqcodel_get__(netdev)->target = fqcodel.target;
3318 fqcodel_get__(netdev)->limit = fqcodel.limit;
3319 fqcodel_get__(netdev)->interval = fqcodel.interval;
3320 fqcodel_get__(netdev)->flows = fqcodel.flows;
3321 fqcodel_get__(netdev)->quantum = fqcodel.quantum;
/* tc_ops vtable for the "linux-fq_codel" QoS type.  (Remaining
 * initializer lines are elided from this listing.) */
3325 static const struct tc_ops tc_ops_fqcodel = {
3326 "fq_codel", /* linux_name */
3327 "linux-fq_codel", /* ovs_name */
3328 FQCODEL_N_QUEUES, /* n_queues */
3341 /* SFQ traffic control class. */
3343 #define SFQ_N_QUEUES 0x0000
/* Downcasts the netdev's generic tc state to the SFQ container. */
3352 sfq_get__(const struct netdev *netdev_)
3354 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3355 return CONTAINER_OF(netdev->tc, struct sfq, tc);
/* Records an SFQ configuration in the netdev's tc state (does not touch
 * the kernel).  Parameter order is (quantum, perturb). */
3359 sfq_install__(struct netdev *netdev_, uint32_t quantum, uint32_t perturb)
3361 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3364 sfq = xmalloc(sizeof *sfq);
3365 tc_init(&sfq->tc, &tc_ops_sfq);
3366 sfq->perturb = perturb;
3367 sfq->quantum = quantum;
3369 netdev->tc = &sfq->tc;
/* Replaces the device's root qdisc with an "sfq" qdisc.  Falls back to
 * the device MTU for the quantum and 10 seconds for the perturbation
 * period when the corresponding argument is zero. */
3373 sfq_setup_qdisc__(struct netdev *netdev, uint32_t quantum, uint32_t perturb)
3375 struct tc_sfq_qopt opt;
3376 struct ofpbuf request;
3377 struct tcmsg *tcmsg;
3379 int mtu_error, error;
3380 mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3382 tc_del_qdisc(netdev);
3384 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3385 NLM_F_EXCL | NLM_F_CREATE, &request);
3389 tcmsg->tcm_handle = tc_make_handle(1, 0);
3390 tcmsg->tcm_parent = TC_H_ROOT;
3392 memset(&opt, 0, sizeof opt);
3395 opt.quantum = mtu; /* if we cannot find mtu, use default */
3398 opt.quantum = quantum;
3402 opt.perturb_period = 10;
3404 opt.perturb_period = perturb;
3407 nl_msg_put_string(&request, TCA_KIND, "sfq");
3408 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3410 error = tc_transact(&request, NULL);
3412 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3413 "quantum %u, perturb %u error %d(%s)",
3414 netdev_get_name(netdev),
3415 opt.quantum, opt.perturb_period,
3416 error, ovs_strerror(error));
/* Extracts "perturb" and "quantum" from the OVSDB 'details' smap into
 * '*sfq'.  A zero/absent quantum falls back to the device MTU; if the
 * MTU is also unavailable a warning is logged (quantum must then be
 * specified explicitly). */
3422 sfq_parse_qdisc_details__(struct netdev *netdev,
3423 const struct smap *details, struct sfq *sfq)
3425 const char *perturb_s;
3426 const char *quantum_s;
3430 perturb_s = smap_get(details, "perturb");
3431 quantum_s = smap_get(details, "quantum");
3432 sfq->perturb = perturb_s ? strtoull(perturb_s, NULL, 10) : 0;
3433 sfq->quantum = quantum_s ? strtoull(quantum_s, NULL, 10) : 0;
3434 if (!sfq->perturb) {
3438 if (!sfq->quantum) {
3439 mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3443 VLOG_WARN_RL(&rl, "when using SFQ, you must specify quantum on a "
3444 "device without mtu");
/* tc_ops "tc_install" hook: parse details, program the kernel qdisc,
 * record the configuration on success. */
3451 sfq_tc_install(struct netdev *netdev, const struct smap *details)
3456 sfq_parse_qdisc_details__(netdev, details, &sfq);
3457 error = sfq_setup_qdisc__(netdev, sfq.quantum, sfq.perturb);
3459 sfq_install__(netdev, sfq.quantum, sfq.perturb);
3465 sfq_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3467 const struct tc_sfq_qopt *sfq;
3468 struct nlattr *nlattr;
3472 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3474 sfq = nl_attr_get(nlattr);
3475 sfq_install__(netdev, sfq->perturb_period, sfq->quantum);
/* tc_ops "tc_destroy" hook: frees the sfq state allocated by
 * sfq_install__(). */
3483 sfq_tc_destroy(struct tc *tc)
3485 struct sfq *sfq = CONTAINER_OF(tc, struct sfq, tc);
/* tc_ops "qdisc_get" hook: report current quantum/perturb into
 * 'details'. */
3491 sfq_qdisc_get(const struct netdev *netdev, struct smap *details)
3493 const struct sfq *sfq = sfq_get__(netdev);
3494 smap_add_format(details, "quantum", "%u", sfq->quantum);
3495 smap_add_format(details, "perturb", "%u", sfq->perturb);
/* tc_ops "qdisc_set" hook: re-program and update cached values.
 * NOTE(review): intervening lines are elided from this listing. */
3500 sfq_qdisc_set(struct netdev *netdev, const struct smap *details)
3504 sfq_parse_qdisc_details__(netdev, details, &sfq);
3505 sfq_install__(netdev, sfq.quantum, sfq.perturb);
3506 sfq_get__(netdev)->quantum = sfq.quantum;
3507 sfq_get__(netdev)->perturb = sfq.perturb;
/* tc_ops vtable for the "linux-sfq" QoS type.  (Remaining initializer
 * lines are elided from this listing.) */
3511 static const struct tc_ops tc_ops_sfq = {
3512 "sfq", /* linux_name */
3513 "linux-sfq", /* ovs_name */
3514 SFQ_N_QUEUES, /* n_queues */
3527 /* HTB traffic control class. */
3529 #define HTB_N_QUEUES 0xf000
3530 #define HTB_RATE2QUANTUM 10
/* struct htb: root-qdisc state (declaration opening elided from this
 * listing). */
3534 unsigned int max_rate; /* In bytes/s. */
/* struct htb_class: per-queue HTB class parameters, embedded in the
 * generic tc_queue. */
3538 struct tc_queue tc_queue;
3539 unsigned int min_rate; /* In bytes/s. */
3540 unsigned int max_rate; /* In bytes/s. */
3541 unsigned int burst; /* In bytes. */
3542 unsigned int priority; /* Lower values are higher priorities. */
/* Downcasts the netdev's generic tc state to the HTB container. */
3546 htb_get__(const struct netdev *netdev_)
3548 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3549 return CONTAINER_OF(netdev->tc, struct htb, tc);
/* Records an HTB configuration in the netdev's tc state (does not touch
 * the kernel). */
3553 htb_install__(struct netdev *netdev_, uint64_t max_rate)
3555 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3558 htb = xmalloc(sizeof *htb);
3559 tc_init(&htb->tc, &tc_ops_htb);
3560 htb->max_rate = max_rate;
3562 netdev->tc = &htb->tc;
3565 /* Create an HTB qdisc.
3567 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
3569 htb_setup_qdisc__(struct netdev *netdev)
3572 struct tc_htb_glob opt;
3573 struct ofpbuf request;
3574 struct tcmsg *tcmsg;
3576 tc_del_qdisc(netdev);
3578 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3579 NLM_F_EXCL | NLM_F_CREATE, &request);
3583 tcmsg->tcm_handle = tc_make_handle(1, 0);
3584 tcmsg->tcm_parent = TC_H_ROOT;
3586 nl_msg_put_string(&request, TCA_KIND, "htb");
3588 memset(&opt, 0, sizeof opt);
3589 opt.rate2quantum = HTB_RATE2QUANTUM;
3593 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3594 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
3595 nl_msg_end_nested(&request, opt_offset);
3597 return tc_transact(&request, NULL);
3600 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
3601 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
3603 htb_setup_class__(struct netdev *netdev, unsigned int handle,
3604 unsigned int parent, struct htb_class *class)
3607 struct tc_htb_opt opt;
3608 struct ofpbuf request;
3609 struct tcmsg *tcmsg;
/* The MTU is required to compute rate tables and buffers below. */
3613 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3615 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
3616 netdev_get_name(netdev));
3620 memset(&opt, 0, sizeof opt);
3621 tc_fill_rate(&opt.rate, class->min_rate, mtu);
3622 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
3623 /* Makes sure the quantum is at least MTU. Setting quantum will
3624 * make htb ignore the r2q for this class. */
3625 if ((class->min_rate / HTB_RATE2QUANTUM) < mtu) {
3628 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
3629 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
3630 opt.prio = class->priority;
3632 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3636 tcmsg->tcm_handle = handle;
3637 tcmsg->tcm_parent = parent;
3639 nl_msg_put_string(&request, TCA_KIND, "htb");
3640 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3641 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
3642 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
3643 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
3644 nl_msg_end_nested(&request, opt_offset);
3646 error = tc_transact(&request, NULL);
3648 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3649 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
3650 netdev_get_name(netdev),
3651 tc_get_major(handle), tc_get_minor(handle),
3652 tc_get_major(parent), tc_get_minor(parent),
3653 class->min_rate, class->max_rate,
3654 class->burst, class->priority, ovs_strerror(error));
3659 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
3660 * description of them into 'details'. The description complies with the
3661 * specification given in the vswitch database documentation for linux-htb
/* Only TCA_HTB_PARMS is demanded by the policy; min/max rate, burst and
 * priority are recovered from the embedded tc_htb_opt. */
3664 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
3666 static const struct nl_policy tca_htb_policy[] = {
3667 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
3668 .min_len = sizeof(struct tc_htb_opt) },
3671 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
3672 const struct tc_htb_opt *htb;
3674 if (!nl_parse_nested(nl_options, tca_htb_policy,
3675 attrs, ARRAY_SIZE(tca_htb_policy))) {
3676 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
3680 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
3681 class->min_rate = htb->rate.rate;
3682 class->max_rate = htb->ceil.rate;
/* The kernel reports the buffer in ticks; convert back to bytes. */
3683 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
3684 class->priority = htb->prio;
/* Parses an RTM_NEWTCLASS message for an HTB class.  Any of
 * 'queue_id', 'options', and 'stats' may be NULL to skip that output.
 * The queue id is derived from the class handle: classes 1:1..1:N map
 * to queues 0..N-1. */
3689 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3690 struct htb_class *options,
3691 struct netdev_queue_stats *stats)
3693 struct nlattr *nl_options;
3694 unsigned int handle;
3697 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3698 if (!error && queue_id) {
3699 unsigned int major = tc_get_major(handle);
3700 unsigned int minor = tc_get_minor(handle);
3701 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3702 *queue_id = minor - 1;
3707 if (!error && options) {
3708 error = htb_parse_tca_options__(nl_options, options);
/* Fills '*hc' with the root-qdisc parameters from 'details': "max-rate"
 * in bits/s is converted to bytes/s; when absent, falls back to the
 * link's negotiated speed (or 100 Mbps if features are unknown).
 * min_rate is pinned to max_rate for the root class. */
3714 htb_parse_qdisc_details__(struct netdev *netdev_,
3715 const struct smap *details, struct htb_class *hc)
3717 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3718 const char *max_rate_s;
3720 max_rate_s = smap_get(details, "max-rate");
3721 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3722 if (!hc->max_rate) {
3723 enum netdev_features current;
3725 netdev_linux_read_features(netdev);
3726 current = !netdev->get_features_error ? netdev->current : 0;
3727 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3729 hc->min_rate = hc->max_rate;
/* Fills '*hc' with per-class parameters from 'details' ("min-rate",
 * "max-rate", "burst", "priority"), clamping rates into [MTU,
 * qdisc max_rate] and burst to at least MTU + 64.  Requires the device
 * MTU; fails with a warning when it is unavailable. */
3735 htb_parse_class_details__(struct netdev *netdev,
3736 const struct smap *details, struct htb_class *hc)
3738 const struct htb *htb = htb_get__(netdev);
3739 const char *min_rate_s = smap_get(details, "min-rate");
3740 const char *max_rate_s = smap_get(details, "max-rate");
3741 const char *burst_s = smap_get(details, "burst");
3742 const char *priority_s = smap_get(details, "priority");
3745 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3747 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
3748 netdev_get_name(netdev));
3752 /* HTB requires at least an mtu sized min-rate to send any traffic even
3753 * on uncongested links. */
3754 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3755 hc->min_rate = MAX(hc->min_rate, mtu);
3756 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
3759 hc->max_rate = (max_rate_s
3760 ? strtoull(max_rate_s, NULL, 10) / 8
3762 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
3763 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
3767 * According to hints in the documentation that I've read, it is important
3768 * that 'burst' be at least as big as the largest frame that might be
3769 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
3770 * but having it a bit too small is a problem. Since netdev_get_mtu()
3771 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
3772 * the MTU. We actually add 64, instead of 14, as a guard against
3773 * additional headers get tacked on somewhere that we're not aware of. */
3774 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
3775 hc->burst = MAX(hc->burst, mtu + 64);
3778 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Queries the kernel for a single HTB class by handle/parent and parses
 * its options and/or stats; the netlink reply buffer is freed here. */
3784 htb_query_class__(const struct netdev *netdev, unsigned int handle,
3785 unsigned int parent, struct htb_class *options,
3786 struct netdev_queue_stats *stats)
3788 struct ofpbuf *reply;
3791 error = tc_query_class(netdev, handle, parent, &reply);
3793 error = htb_parse_tcmsg__(reply, NULL, options, stats);
3794 ofpbuf_delete(reply);
/* tc_ops "tc_install" hook: create the root HTB qdisc, then the default
 * class 1:0xfffe under it, and record the configuration on success. */
3800 htb_tc_install(struct netdev *netdev, const struct smap *details)
3804 error = htb_setup_qdisc__(netdev);
3806 struct htb_class hc;
3808 htb_parse_qdisc_details__(netdev, details, &hc);
3809 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3810 tc_make_handle(1, 0), &hc);
3812 htb_install__(netdev, hc.max_rate);
/* Downcasts a generic tc_queue to its enclosing htb_class. */
3818 static struct htb_class *
3819 htb_class_cast__(const struct tc_queue *queue)
3821 return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Inserts or updates the cached htb_class for 'queue_id' in the
 * netdev's queue hmap, copying the parameters from 'hc'. */
3825 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
3826 const struct htb_class *hc)
3828 struct htb *htb = htb_get__(netdev);
3829 size_t hash = hash_int(queue_id, 0);
3830 struct tc_queue *queue;
3831 struct htb_class *hcp;
3833 queue = tc_find_queue__(netdev, queue_id, hash);
3835 hcp = htb_class_cast__(queue);
/* Not found: allocate and insert a fresh entry. */
3837 hcp = xmalloc(sizeof *hcp);
3838 queue = &hcp->tc_queue;
3839 queue->queue_id = queue_id;
3840 queue->created = time_msec();
3841 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
3844 hcp->min_rate = hc->min_rate;
3845 hcp->max_rate = hc->max_rate;
3846 hcp->burst = hc->burst;
3847 hcp->priority = hc->priority;
3851 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3854 struct queue_dump_state state;
3855 struct htb_class hc;
3857 /* Get qdisc options. */
3859 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3860 htb_install__(netdev, hc.max_rate);
3863 if (!start_queue_dump(netdev, &state)) {
3866 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
3867 unsigned int queue_id;
3869 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3870 htb_update_queue__(netdev, queue_id, &hc);
3873 finish_queue_dump(&state);
3879 htb_tc_destroy(struct tc *tc)
3881 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
3882 struct htb_class *hc, *next;
3884 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
3885 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
3893 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
3895 const struct htb *htb = htb_get__(netdev);
3896 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
3901 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
3903 struct htb_class hc;
3906 htb_parse_qdisc_details__(netdev, details, &hc);
3907 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3908 tc_make_handle(1, 0), &hc);
3910 htb_get__(netdev)->max_rate = hc.max_rate;
3916 htb_class_get(const struct netdev *netdev OVS_UNUSED,
3917 const struct tc_queue *queue, struct smap *details)
3919 const struct htb_class *hc = htb_class_cast__(queue);
3921 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3922 if (hc->min_rate != hc->max_rate) {
3923 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3925 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
3927 smap_add_format(details, "priority", "%u", hc->priority);
3933 htb_class_set(struct netdev *netdev, unsigned int queue_id,
3934 const struct smap *details)
3936 struct htb_class hc;
3939 error = htb_parse_class_details__(netdev, details, &hc);
3944 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3945 tc_make_handle(1, 0xfffe), &hc);
3950 htb_update_queue__(netdev, queue_id, &hc);
3955 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
3957 struct htb_class *hc = htb_class_cast__(queue);
3958 struct htb *htb = htb_get__(netdev);
3961 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3963 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
3970 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3971 struct netdev_queue_stats *stats)
3973 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3974 tc_make_handle(1, 0xfffe), NULL, stats);
3978 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3979 const struct ofpbuf *nlmsg,
3980 netdev_dump_queue_stats_cb *cb, void *aux)
3982 struct netdev_queue_stats stats;
3983 unsigned int handle, major, minor;
3986 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3991 major = tc_get_major(handle);
3992 minor = tc_get_minor(handle);
3993 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3994 (*cb)(minor - 1, &stats, aux);
3999 static const struct tc_ops tc_ops_htb = {
4000 "htb", /* linux_name */
4001 "linux-htb", /* ovs_name */
4002 HTB_N_QUEUES, /* n_queues */
4011 htb_class_get_stats,
4012 htb_class_dump_stats
4015 /* "linux-hfsc" traffic control class. */
4017 #define HFSC_N_QUEUES 0xf000
4025 struct tc_queue tc_queue;
4030 static struct hfsc *
4031 hfsc_get__(const struct netdev *netdev_)
4033 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4034 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
4037 static struct hfsc_class *
4038 hfsc_class_cast__(const struct tc_queue *queue)
4040 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
4044 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
4046 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4049 hfsc = xmalloc(sizeof *hfsc);
4050 tc_init(&hfsc->tc, &tc_ops_hfsc);
4051 hfsc->max_rate = max_rate;
4052 netdev->tc = &hfsc->tc;
4056 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
4057 const struct hfsc_class *hc)
4061 struct hfsc_class *hcp;
4062 struct tc_queue *queue;
4064 hfsc = hfsc_get__(netdev);
4065 hash = hash_int(queue_id, 0);
4067 queue = tc_find_queue__(netdev, queue_id, hash);
4069 hcp = hfsc_class_cast__(queue);
4071 hcp = xmalloc(sizeof *hcp);
4072 queue = &hcp->tc_queue;
4073 queue->queue_id = queue_id;
4074 queue->created = time_msec();
4075 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
4078 hcp->min_rate = hc->min_rate;
4079 hcp->max_rate = hc->max_rate;
4083 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
4085 const struct tc_service_curve *rsc, *fsc, *usc;
4086 static const struct nl_policy tca_hfsc_policy[] = {
4088 .type = NL_A_UNSPEC,
4090 .min_len = sizeof(struct tc_service_curve),
4093 .type = NL_A_UNSPEC,
4095 .min_len = sizeof(struct tc_service_curve),
4098 .type = NL_A_UNSPEC,
4100 .min_len = sizeof(struct tc_service_curve),
4103 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
4105 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
4106 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
4107 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
4111 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
4112 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
4113 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
4115 if (rsc->m1 != 0 || rsc->d != 0 ||
4116 fsc->m1 != 0 || fsc->d != 0 ||
4117 usc->m1 != 0 || usc->d != 0) {
4118 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4119 "Non-linear service curves are not supported.");
4123 if (rsc->m2 != fsc->m2) {
4124 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4125 "Real-time service curves are not supported ");
4129 if (rsc->m2 > usc->m2) {
4130 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4131 "Min-rate service curve is greater than "
4132 "the max-rate service curve.");
4136 class->min_rate = fsc->m2;
4137 class->max_rate = usc->m2;
4142 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
4143 struct hfsc_class *options,
4144 struct netdev_queue_stats *stats)
4147 unsigned int handle;
4148 struct nlattr *nl_options;
4150 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
4156 unsigned int major, minor;
4158 major = tc_get_major(handle);
4159 minor = tc_get_minor(handle);
4160 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
4161 *queue_id = minor - 1;
4168 error = hfsc_parse_tca_options__(nl_options, options);
4175 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
4176 unsigned int parent, struct hfsc_class *options,
4177 struct netdev_queue_stats *stats)
4180 struct ofpbuf *reply;
4182 error = tc_query_class(netdev, handle, parent, &reply);
4187 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
4188 ofpbuf_delete(reply);
4193 hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
4194 struct hfsc_class *class)
4196 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4198 const char *max_rate_s;
4200 max_rate_s = smap_get(details, "max-rate");
4201 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
4204 enum netdev_features current;
4206 netdev_linux_read_features(netdev);
4207 current = !netdev->get_features_error ? netdev->current : 0;
4208 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
4211 class->min_rate = max_rate;
4212 class->max_rate = max_rate;
4216 hfsc_parse_class_details__(struct netdev *netdev,
4217 const struct smap *details,
4218 struct hfsc_class * class)
4220 const struct hfsc *hfsc;
4221 uint32_t min_rate, max_rate;
4222 const char *min_rate_s, *max_rate_s;
4224 hfsc = hfsc_get__(netdev);
4225 min_rate_s = smap_get(details, "min-rate");
4226 max_rate_s = smap_get(details, "max-rate");
4228 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
4229 min_rate = MAX(min_rate, 1);
4230 min_rate = MIN(min_rate, hfsc->max_rate);
4232 max_rate = (max_rate_s
4233 ? strtoull(max_rate_s, NULL, 10) / 8
4235 max_rate = MAX(max_rate, min_rate);
4236 max_rate = MIN(max_rate, hfsc->max_rate);
4238 class->min_rate = min_rate;
4239 class->max_rate = max_rate;
4244 /* Create an HFSC qdisc.
4246 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
4248 hfsc_setup_qdisc__(struct netdev * netdev)
4250 struct tcmsg *tcmsg;
4251 struct ofpbuf request;
4252 struct tc_hfsc_qopt opt;
4254 tc_del_qdisc(netdev);
4256 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
4257 NLM_F_EXCL | NLM_F_CREATE, &request);
4263 tcmsg->tcm_handle = tc_make_handle(1, 0);
4264 tcmsg->tcm_parent = TC_H_ROOT;
4266 memset(&opt, 0, sizeof opt);
4269 nl_msg_put_string(&request, TCA_KIND, "hfsc");
4270 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
4272 return tc_transact(&request, NULL);
4275 /* Create an HFSC class.
4277 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
4278 * sc rate <min_rate> ul rate <max_rate>" */
4280 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
4281 unsigned int parent, struct hfsc_class *class)
4285 struct tcmsg *tcmsg;
4286 struct ofpbuf request;
4287 struct tc_service_curve min, max;
4289 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
4295 tcmsg->tcm_handle = handle;
4296 tcmsg->tcm_parent = parent;
4300 min.m2 = class->min_rate;
4304 max.m2 = class->max_rate;
4306 nl_msg_put_string(&request, TCA_KIND, "hfsc");
4307 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4308 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
4309 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
4310 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
4311 nl_msg_end_nested(&request, opt_offset);
4313 error = tc_transact(&request, NULL);
4315 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
4316 "min-rate %ubps, max-rate %ubps (%s)",
4317 netdev_get_name(netdev),
4318 tc_get_major(handle), tc_get_minor(handle),
4319 tc_get_major(parent), tc_get_minor(parent),
4320 class->min_rate, class->max_rate, ovs_strerror(error));
4327 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
4330 struct hfsc_class class;
4332 error = hfsc_setup_qdisc__(netdev);
4338 hfsc_parse_qdisc_details__(netdev, details, &class);
4339 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4340 tc_make_handle(1, 0), &class);
4346 hfsc_install__(netdev, class.max_rate);
4351 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4354 struct queue_dump_state state;
4355 struct hfsc_class hc;
4358 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
4359 hfsc_install__(netdev, hc.max_rate);
4361 if (!start_queue_dump(netdev, &state)) {
4365 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
4366 unsigned int queue_id;
4368 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
4369 hfsc_update_queue__(netdev, queue_id, &hc);
4373 finish_queue_dump(&state);
4378 hfsc_tc_destroy(struct tc *tc)
4381 struct hfsc_class *hc, *next;
4383 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
4385 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
4386 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
4395 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
4397 const struct hfsc *hfsc;
4398 hfsc = hfsc_get__(netdev);
4399 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
4404 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
4407 struct hfsc_class class;
4409 hfsc_parse_qdisc_details__(netdev, details, &class);
4410 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4411 tc_make_handle(1, 0), &class);
4414 hfsc_get__(netdev)->max_rate = class.max_rate;
4421 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
4422 const struct tc_queue *queue, struct smap *details)
4424 const struct hfsc_class *hc;
4426 hc = hfsc_class_cast__(queue);
4427 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
4428 if (hc->min_rate != hc->max_rate) {
4429 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
4435 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
4436 const struct smap *details)
4439 struct hfsc_class class;
4441 error = hfsc_parse_class_details__(netdev, details, &class);
4446 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
4447 tc_make_handle(1, 0xfffe), &class);
4452 hfsc_update_queue__(netdev, queue_id, &class);
4457 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
4461 struct hfsc_class *hc;
4463 hc = hfsc_class_cast__(queue);
4464 hfsc = hfsc_get__(netdev);
4466 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
4468 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
4475 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
4476 struct netdev_queue_stats *stats)
4478 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
4479 tc_make_handle(1, 0xfffe), NULL, stats);
4483 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
4484 const struct ofpbuf *nlmsg,
4485 netdev_dump_queue_stats_cb *cb, void *aux)
4487 struct netdev_queue_stats stats;
4488 unsigned int handle, major, minor;
4491 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
4496 major = tc_get_major(handle);
4497 minor = tc_get_minor(handle);
4498 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
4499 (*cb)(minor - 1, &stats, aux);
4504 static const struct tc_ops tc_ops_hfsc = {
4505 "hfsc", /* linux_name */
4506 "linux-hfsc", /* ovs_name */
4507 HFSC_N_QUEUES, /* n_queues */
4508 hfsc_tc_install, /* tc_install */
4509 hfsc_tc_load, /* tc_load */
4510 hfsc_tc_destroy, /* tc_destroy */
4511 hfsc_qdisc_get, /* qdisc_get */
4512 hfsc_qdisc_set, /* qdisc_set */
4513 hfsc_class_get, /* class_get */
4514 hfsc_class_set, /* class_set */
4515 hfsc_class_delete, /* class_delete */
4516 hfsc_class_get_stats, /* class_get_stats */
4517 hfsc_class_dump_stats /* class_dump_stats */
4520 /* "linux-default" traffic control class.
4522 * This class represents the default, unnamed Linux qdisc. It corresponds to
4523 * the "" (empty string) QoS type in the OVS database. */
4526 default_install__(struct netdev *netdev_)
4528 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4529 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
4531 /* Nothing but a tc class implementation is allowed to write to a tc. This
4532 * class never does that, so we can legitimately use a const tc object. */
4533 netdev->tc = CONST_CAST(struct tc *, &tc);
4537 default_tc_install(struct netdev *netdev,
4538 const struct smap *details OVS_UNUSED)
4540 default_install__(netdev);
4545 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4547 default_install__(netdev);
4551 static const struct tc_ops tc_ops_default = {
4552 NULL, /* linux_name */
4557 NULL, /* tc_destroy */
4558 NULL, /* qdisc_get */
4559 NULL, /* qdisc_set */
4560 NULL, /* class_get */
4561 NULL, /* class_set */
4562 NULL, /* class_delete */
4563 NULL, /* class_get_stats */
4564 NULL /* class_dump_stats */
4567 /* "linux-other" traffic control class.
4572 other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
4574 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4575 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
4577 /* Nothing but a tc class implementation is allowed to write to a tc. This
4578 * class never does that, so we can legitimately use a const tc object. */
4579 netdev->tc = CONST_CAST(struct tc *, &tc);
4583 static const struct tc_ops tc_ops_other = {
4584 NULL, /* linux_name */
4585 "linux-other", /* ovs_name */
4587 NULL, /* tc_install */
4589 NULL, /* tc_destroy */
4590 NULL, /* qdisc_get */
4591 NULL, /* qdisc_set */
4592 NULL, /* class_get */
4593 NULL, /* class_set */
4594 NULL, /* class_delete */
4595 NULL, /* class_get_stats */
4596 NULL /* class_dump_stats */
4599 /* Traffic control. */
4601 /* Number of kernel "tc" ticks per second. */
4602 static double ticks_per_s;
4604 /* Number of kernel "jiffies" per second. This is used for the purpose of
4605 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
4606 * one jiffy's worth of data.
4608 * There are two possibilities here:
4610 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
4611 * approximate range of 100 to 1024. That means that we really need to
4612 * make sure that the qdisc can buffer that much data.
4614 * - 'buffer_hz' is an absurdly large number. That means that the kernel
4615 * has finely granular timers and there's no need to fudge additional room
4616 * for buffers. (There's no extra effort needed to implement that: the
4617 * large 'buffer_hz' is used as a divisor, so practically any number will
4618 * come out as 0 in the division. Small integer results in the case of
4619 * really high dividends won't have any real effect anyhow.)
4621 static unsigned int buffer_hz;
/* Returns tc handle 'major':'minor'. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    return TC_H_MAKE(major << 16, minor);
}
/* Returns the major number from 'handle'. */
static unsigned int
tc_get_major(unsigned int handle)
{
    return TC_H_MAJ(handle) >> 16;
}
/* Returns the minor number from 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    return TC_H_MIN(handle);
}
4644 static struct tcmsg *
4645 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
4646 struct ofpbuf *request)
4648 struct tcmsg *tcmsg;
4652 error = get_ifindex(netdev, &ifindex);
4657 ofpbuf_init(request, 512);
4658 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
4659 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
4660 tcmsg->tcm_family = AF_UNSPEC;
4661 tcmsg->tcm_ifindex = ifindex;
4662 /* Caller should fill in tcmsg->tcm_handle. */
4663 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' on the NETLINK_ROUTE channel and uninitializes it,
 * optionally capturing the reply in '*replyp'.  Returns 0 or a positive
 * errno value from nl_transact(). */
static int
tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
{
    int error = nl_transact(NETLINK_ROUTE, request, replyp);
    ofpbuf_uninit(request);
    return error;
}
4676 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
4677 * policing configuration.
4679 * This function is equivalent to running the following when 'add' is true:
4680 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
4682 * This function is equivalent to running the following when 'add' is false:
4683 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
4685 * The configuration and stats may be seen with the following command:
4686 * /sbin/tc -s qdisc show dev <devname>
4688 * Returns 0 if successful, otherwise a positive errno value.
4691 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
4693 struct ofpbuf request;
4694 struct tcmsg *tcmsg;
4696 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
4697 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
4699 tcmsg = tc_make_request(netdev, type, flags, &request);
4703 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
4704 tcmsg->tcm_parent = TC_H_INGRESS;
4705 nl_msg_put_string(&request, TCA_KIND, "ingress");
4706 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
4708 error = tc_transact(&request, NULL);
4710 /* If we're deleting the qdisc, don't worry about some of the
4711 * error conditions. */
4712 if (!add && (error == ENOENT || error == EINVAL)) {
4721 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
4724 * This function is equivalent to running:
4725 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
4726 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
4729 * The configuration and stats may be seen with the following command:
4730 * /sbin/tc -s filter show dev <devname> parent ffff:
4732 * Returns 0 if successful, otherwise a positive errno value.
4735 tc_add_policer(struct netdev *netdev,
4736 uint32_t kbits_rate, uint32_t kbits_burst)
4738 struct tc_police tc_police;
4739 struct ofpbuf request;
4740 struct tcmsg *tcmsg;
4741 size_t basic_offset;
4742 size_t police_offset;
4746 memset(&tc_police, 0, sizeof tc_police);
4747 tc_police.action = TC_POLICE_SHOT;
4748 tc_police.mtu = mtu;
4749 tc_fill_rate(&tc_police.rate, ((uint64_t) kbits_rate * 1000)/8, mtu);
4751 /* The following appears wrong in two ways:
4753 * - tc_bytes_to_ticks() should take "bytes" as quantity for both of its
4754 * arguments (or at least consistently "bytes" as both or "bits" as
4755 * both), but this supplies bytes for the first argument and bits for the
4758 * - In networking a kilobit is usually 1000 bits but this uses 1024 bits.
4760 * However if you "fix" those problems then "tc filter show ..." shows
4761 * "125000b", meaning 125,000 bits, when OVS configures it for 1000 kbit ==
4762 * 1,000,000 bits, whereas this actually ends up doing the right thing from
4763 * tc's point of view. Whatever. */
4764 tc_police.burst = tc_bytes_to_ticks(
4765 tc_police.rate.rate, MIN(UINT32_MAX / 1024, kbits_burst) * 1024);
4767 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
4768 NLM_F_EXCL | NLM_F_CREATE, &request);
4772 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
4773 tcmsg->tcm_info = tc_make_handle(49,
4774 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
4776 nl_msg_put_string(&request, TCA_KIND, "basic");
4777 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4778 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
4779 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
4780 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
4781 nl_msg_end_nested(&request, police_offset);
4782 nl_msg_end_nested(&request, basic_offset);
4784 error = tc_transact(&request, NULL);
4795 /* The values in psched are not individually very meaningful, but they are
4796 * important. The tables below show some values seen in the wild.
4800 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
4801 * (Before that, there are hints that it was 1000000000.)
4803 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
4807 * -----------------------------------
4808 * [1] 000c8000 000f4240 000f4240 00000064
4809 * [2] 000003e8 00000400 000f4240 3b9aca00
4810 * [3] 000003e8 00000400 000f4240 3b9aca00
4811 * [4] 000003e8 00000400 000f4240 00000064
4812 * [5] 000003e8 00000040 000f4240 3b9aca00
4813 * [6] 000003e8 00000040 000f4240 000000f9
4815 * a b c d ticks_per_s buffer_hz
4816 * ------- --------- ---------- ------------- ----------- -------------
4817 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
4818 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4819 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4820 * [4] 1,000 1,024 1,000,000 100 976,562 100
4821 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
4822 * [6] 1,000 64 1,000,000 249 15,625,000 249
4824 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
4825 * [2] 2.6.26-1-686-bigmem from Debian lenny
4826 * [3] 2.6.26-2-sparc64 from Debian lenny
4827 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
4828 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
4829 * [6] 2.6.34 from kernel.org on KVM
4831 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4832 static const char fn[] = "/proc/net/psched";
4833 unsigned int a, b, c, d;
4836 if (!ovsthread_once_start(&once)) {
4843 stream = fopen(fn, "r");
4845 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
4849 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
4850 VLOG_WARN("%s: read failed", fn);
4854 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
4858 VLOG_WARN("%s: invalid scheduler parameters", fn);
4862 ticks_per_s = (double) a * c / b;
4866 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
4869 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
4872 ovsthread_once_done(&once);
4875 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
4876 * rate of 'rate' bytes per second. */
4878 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
4881 return (rate * ticks) / ticks_per_s;
4884 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
4885 * rate of 'rate' bytes per second. */
4887 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
4890 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
4893 /* Returns the number of bytes that need to be reserved for qdisc buffering at
4894 * a transmission rate of 'rate' bytes per second. */
4896 tc_buffer_per_jiffy(unsigned int rate)
4899 return rate / buffer_hz;
4902 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
4903 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
4904 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
4905 * stores NULL into it if it is absent.
4907 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
4910 * Returns 0 if successful, otherwise a positive errno value. */
4912 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
4913 struct nlattr **options)
4915 static const struct nl_policy tca_policy[] = {
4916 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
4917 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
4919 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4921 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4922 tca_policy, ta, ARRAY_SIZE(ta))) {
4923 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
4928 *kind = nl_attr_get_string(ta[TCA_KIND]);
4932 *options = ta[TCA_OPTIONS];
4947 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
4948 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
4949 * into '*options', and its queue statistics into '*stats'. Any of the output
4950 * arguments may be null.
4952 * Returns 0 if successful, otherwise a positive errno value. */
4954 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
4955 struct nlattr **options, struct netdev_queue_stats *stats)
4957 static const struct nl_policy tca_policy[] = {
4958 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
4959 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
4961 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4963 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4964 tca_policy, ta, ARRAY_SIZE(ta))) {
4965 VLOG_WARN_RL(&rl, "failed to parse class message");
4970 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
4971 *handlep = tc->tcm_handle;
4975 *options = ta[TCA_OPTIONS];
4979 const struct gnet_stats_queue *gsq;
4980 struct gnet_stats_basic gsb;
4982 static const struct nl_policy stats_policy[] = {
4983 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
4984 .min_len = sizeof gsb },
4985 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
4986 .min_len = sizeof *gsq },
4988 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
4990 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
4991 sa, ARRAY_SIZE(sa))) {
4992 VLOG_WARN_RL(&rl, "failed to parse class stats");
4996 /* Alignment issues screw up the length of struct gnet_stats_basic on
4997 * some arch/bitsize combinations. Newer versions of Linux have a
4998 * struct gnet_stats_basic_packed, but we can't depend on that. The
4999 * easiest thing to do is just to make a copy. */
5000 memset(&gsb, 0, sizeof gsb);
5001 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
5002 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
5003 stats->tx_bytes = gsb.bytes;
5004 stats->tx_packets = gsb.packets;
5006 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
5007 stats->tx_errors = gsq->drops;
5017 memset(stats, 0, sizeof *stats);
5022 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
5025 tc_query_class(const struct netdev *netdev,
5026 unsigned int handle, unsigned int parent,
5027 struct ofpbuf **replyp)
5029 struct ofpbuf request;
5030 struct tcmsg *tcmsg;
5033 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
5037 tcmsg->tcm_handle = handle;
5038 tcmsg->tcm_parent = parent;
5040 error = tc_transact(&request, replyp);
5042 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
5043 netdev_get_name(netdev),
5044 tc_get_major(handle), tc_get_minor(handle),
5045 tc_get_major(parent), tc_get_minor(parent),
5046 ovs_strerror(error));
5051 /* Equivalent to "tc class del dev <name> handle <handle>". */
5053 tc_delete_class(const struct netdev *netdev, unsigned int handle)
5055 struct ofpbuf request;
5056 struct tcmsg *tcmsg;
5059 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
5063 tcmsg->tcm_handle = handle;
5064 tcmsg->tcm_parent = 0;
5066 error = tc_transact(&request, NULL);
5068 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
5069 netdev_get_name(netdev),
5070 tc_get_major(handle), tc_get_minor(handle),
5071 ovs_strerror(error));
5076 /* Equivalent to "tc qdisc del dev <name> root". */
5078 tc_del_qdisc(struct netdev *netdev_)
5080 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5081 struct ofpbuf request;
5082 struct tcmsg *tcmsg;
5085 tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
5089 tcmsg->tcm_handle = tc_make_handle(1, 0);
5090 tcmsg->tcm_parent = TC_H_ROOT;
5092 error = tc_transact(&request, NULL);
5093 if (error == EINVAL) {
5094 /* EINVAL probably means that the default qdisc was in use, in which
5095 * case we've accomplished our purpose. */
5098 if (!error && netdev->tc) {
5099 if (netdev->tc->ops->tc_destroy) {
5100 netdev->tc->ops->tc_destroy(netdev->tc);
5108 getqdisc_is_safe(void)
5110 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5111 static bool safe = false;
5113 if (ovsthread_once_start(&once)) {
5114 struct utsname utsname;
5117 if (uname(&utsname) == -1) {
5118 VLOG_WARN("uname failed (%s)", ovs_strerror(errno));
5119 } else if (!ovs_scan(utsname.release, "%d.%d", &major, &minor)) {
5120 VLOG_WARN("uname reported bad OS release (%s)", utsname.release);
5121 } else if (major < 2 || (major == 2 && minor < 35)) {
5122 VLOG_INFO("disabling unsafe RTM_GETQDISC in Linux kernel %s",
5127 ovsthread_once_done(&once);
5132 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
5133 * kernel to determine what they are. Returns 0 if successful, otherwise a
5134 * positive errno value. */
/* NOTE(review): this excerpt elides several original lines (the return type,
 * braces, and some locals such as 'error', 'kind', 'load_error'); the code
 * below is kept byte-identical to what is visible. */
5136 tc_query_qdisc(const struct netdev *netdev_)
5138 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5139 struct ofpbuf request, *qdisc;
5140 const struct tc_ops *ops;
5141 struct tcmsg *tcmsg;
5149 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
5150 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
5151 * 2.6.35 without that fix backported to it.
5153 * To avoid the OOPS, we must not make a request that would attempt to dump
5154 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
5155 * few others. There are a few ways that I can see to do this, but most of
5156 * them seem to be racy (and if you lose the race the kernel OOPSes). The
5157 * technique chosen here is to assume that any non-default qdisc that we
5158 * create will have a class with handle 1:0. The built-in qdiscs only have
5159 * a class with handle 0:0.
5161 * On Linux 2.6.35+ we use the straightforward method because it allows us
5162 * to handle non-builtin qdiscs without handle 1:0 (e.g. codel). However,
5163 * in such a case we get no response at all from the kernel (!) if a
5164 * builtin qdisc is in use (which is later caught by "!error &&
5165 * !qdisc->size"). */
5166 tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
/* On safe (>= 2.6.35) kernels dump the root qdisc directly (handle 0:0,
 * parent TC_H_ROOT); on unsafe kernels probe handle 1:0 instead, per the
 * comment above. */
5170 tcmsg->tcm_handle = tc_make_handle(getqdisc_is_safe() ? 0 : 1, 0);
5171 tcmsg->tcm_parent = getqdisc_is_safe() ? TC_H_ROOT : 0;
5173 /* Figure out what tc class to instantiate. */
5174 error = tc_transact(&request, &qdisc);
5175 if (!error && qdisc->size) {
/* Got a reply: map the kernel's qdisc kind string to one of our tc_ops
 * implementations, falling back to tc_ops_other for kinds we do not
 * manage (we can read but not reconfigure those). */
5178 error = tc_parse_qdisc(qdisc, &kind, NULL);
5180 ops = &tc_ops_other;
5182 ops = tc_lookup_linux_name(kind);
5184 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
5185 VLOG_DBG_RL(&rl2, "unknown qdisc \"%s\"", kind);
5187 ops = &tc_ops_other;
5190 } else if ((!error && !qdisc->size) || error == ENOENT) {
5191 /* Either it's a built-in qdisc, or (on Linux pre-2.6.35) it's a qdisc
5192 * set up by some other entity that doesn't have a handle 1:0. We will
5193 * assume that it's the system default qdisc. */
5194 ops = &tc_ops_default;
5197 /* Who knows? Maybe the device got deleted. */
5198 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
5199 netdev_get_name(netdev_), ovs_strerror(error));
5200 ops = &tc_ops_other;
5203 /* Instantiate it. */
5204 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
/* tc_load must set netdev->tc exactly when it succeeds. */
5205 ovs_assert((load_error == 0) == (netdev->tc != NULL));
5206 ofpbuf_delete(qdisc);
/* The transact error takes precedence over the load error. */
5208 return error ? error : load_error;
5211 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
5212 approximate the time to transmit packets of various lengths. For an MTU of
5213 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
5214 represents two possible packet lengths; for a MTU of 513 through 1024, four
5215 possible lengths; and so on.
5217 Returns, for the specified 'mtu', the number of bits that packet lengths
5218 need to be shifted right to fit within such a 256-entry table. */
/* NOTE(review): the loop body and return statement are elided from this
 * excerpt; visible code kept byte-identical. */
5220 tc_calc_cell_log(unsigned int mtu)
/* An MTU of 0 means "unknown"; fall back to the standard Ethernet payload. */
5225 mtu = ETH_PAYLOAD_MAX;
/* Account for L2 framing overhead on top of the payload MTU. */
5227 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
/* Count how many right-shifts make the largest frame index fit in [0,255]. */
5229 for (cell_log = 0; mtu >= 256; cell_log++) {
5236 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
/* of 'mtu'. (NOTE(review): the tail of this function — e.g. the assignment of
 * rate->rate itself — is elided from this excerpt.) */
5239 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
/* Start from all-zero so fields absent from older kernel headers stay 0. */
5241 memset(rate, 0, sizeof *rate);
5242 rate->cell_log = tc_calc_cell_log(mtu);
5243 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
5244 /* rate->cell_align = 0; */ /* distro headers. */
/* Minimum packet unit: no frame is billed below the Ethernet minimum size. */
5245 rate->mpu = ETH_TOTAL_MIN;
5249 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
5250 * attribute of the specified "type".
5252 * See tc_calc_cell_log() above for a description of "rtab"s. */
5254 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
/* Reserve the attribute payload in place, then fill each of the 256 cells. */
5259 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
5260 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
/* Cell i covers packet lengths up to (i + 1) << cell_log bytes. */
5261 unsigned packet_size = (i + 1) << rate->cell_log;
/* Clamp to the minimum packet unit, matching tc_fill_rate()'s mpu. */
5262 if (packet_size < rate->mpu) {
5263 packet_size = rate->mpu;
/* Each cell holds the transmit time, in ticks, for that packet size. */
5265 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
5269 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
5270 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
5271 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
/* 0 is presumably passed — the MAX() below then yields min_burst.) */
5274 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
/* The burst must cover at least one jiffy's worth of traffic plus one MTU,
 * or HTB cannot sustain the configured rate. */
5276 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
5277 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
5280 /* Linux-only functions declared in netdev-linux.h */
5282 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
5283 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
/* NOTE(review): error-check lines between the three ioctl round-trips are
 * elided from this excerpt; visible code kept byte-identical. */
5285 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
5286 const char *flag_name, bool enable)
5288 const char *netdev_name = netdev_get_name(netdev);
5289 struct ethtool_value evalue;
/* Step 1: read the current flags. */
5293 COVERAGE_INC(netdev_get_ethtool);
5294 memset(&evalue, 0, sizeof evalue);
5295 error = netdev_linux_do_ethtool(netdev_name,
5296 (struct ethtool_cmd *)&evalue,
5297 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
/* Step 2: write back the flags with 'flag' set or cleared as requested. */
5302 COVERAGE_INC(netdev_set_ethtool);
5303 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
5304 error = netdev_linux_do_ethtool(netdev_name,
5305 (struct ethtool_cmd *)&evalue,
5306 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
/* Step 3: re-read to verify the driver actually applied the change — some
 * drivers silently ignore ETHTOOL_SFLAGS bits they do not support. */
5311 COVERAGE_INC(netdev_get_ethtool);
5312 memset(&evalue, 0, sizeof evalue);
5313 error = netdev_linux_do_ethtool(netdev_name,
5314 (struct ethtool_cmd *)&evalue,
5315 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5320 if (new_flags != evalue.data) {
5321 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
5322 "device %s failed", enable ? "enable" : "disable",
5323 flag_name, netdev_name);
5330 /* Utility functions. */
5332 /* Copies 'src' into 'dst', performing format conversion in the process. */
/* Converts the kernel's 32-bit rtnl_link_stats into OVS's netdev_stats.
 * Field-for-field copy; fields netdev_stats has but rtnl_link_stats lacks
 * are presumably initialized by the caller — confirm at call sites. */
5334 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
5335 const struct rtnl_link_stats *src)
5337 dst->rx_packets = src->rx_packets;
5338 dst->tx_packets = src->tx_packets;
5339 dst->rx_bytes = src->rx_bytes;
5340 dst->tx_bytes = src->tx_bytes;
5341 dst->rx_errors = src->rx_errors;
5342 dst->tx_errors = src->tx_errors;
5343 dst->rx_dropped = src->rx_dropped;
5344 dst->tx_dropped = src->tx_dropped;
5345 dst->multicast = src->multicast;
5346 dst->collisions = src->collisions;
5347 dst->rx_length_errors = src->rx_length_errors;
5348 dst->rx_over_errors = src->rx_over_errors;
5349 dst->rx_crc_errors = src->rx_crc_errors;
5350 dst->rx_frame_errors = src->rx_frame_errors;
5351 dst->rx_fifo_errors = src->rx_fifo_errors;
5352 dst->rx_missed_errors = src->rx_missed_errors;
5353 dst->tx_aborted_errors = src->tx_aborted_errors;
5354 dst->tx_carrier_errors = src->tx_carrier_errors;
5355 dst->tx_fifo_errors = src->tx_fifo_errors;
5356 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
5357 dst->tx_window_errors = src->tx_window_errors;
5360 /* Copies 'src' into 'dst', performing format conversion in the process. */
/* 64-bit variant of the conversion above, used when the kernel supplies
 * IFLA_STATS64 (preferred, since 32-bit counters wrap on busy links). */
5362 netdev_stats_from_rtnl_link_stats64(struct netdev_stats *dst,
5363 const struct rtnl_link_stats64 *src)
5365 dst->rx_packets = src->rx_packets;
5366 dst->tx_packets = src->tx_packets;
5367 dst->rx_bytes = src->rx_bytes;
5368 dst->tx_bytes = src->tx_bytes;
5369 dst->rx_errors = src->rx_errors;
5370 dst->tx_errors = src->tx_errors;
5371 dst->rx_dropped = src->rx_dropped;
5372 dst->tx_dropped = src->tx_dropped;
5373 dst->multicast = src->multicast;
5374 dst->collisions = src->collisions;
5375 dst->rx_length_errors = src->rx_length_errors;
5376 dst->rx_over_errors = src->rx_over_errors;
5377 dst->rx_crc_errors = src->rx_crc_errors;
5378 dst->rx_frame_errors = src->rx_frame_errors;
5379 dst->rx_fifo_errors = src->rx_fifo_errors;
5380 dst->rx_missed_errors = src->rx_missed_errors;
5381 dst->tx_aborted_errors = src->tx_aborted_errors;
5382 dst->tx_carrier_errors = src->tx_carrier_errors;
5383 dst->tx_fifo_errors = src->tx_fifo_errors;
5384 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
5385 dst->tx_window_errors = src->tx_window_errors;
/* Fetches interface statistics for 'netdev_' via an RTM_GETLINK request on
 * a NETLINK_ROUTE socket, filling in 'stats'. Prefers the 64-bit
 * IFLA_STATS64 attribute, falling back to 32-bit IFLA_STATS.
 * NOTE(review): some lines (return type, error returns, closing braces) are
 * elided from this excerpt; visible code kept byte-identical. */
5389 get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
5391 struct ofpbuf request;
5392 struct ofpbuf *reply;
/* Build RTM_GETLINK request identifying the interface by name. */
5395 ofpbuf_init(&request, 0);
5396 nl_msg_put_nlmsghdr(&request,
5397 sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
5398 RTM_GETLINK, NLM_F_REQUEST);
5399 ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
5400 nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
5401 error = nl_transact(NETLINK_ROUTE, &request, &reply);
5402 ofpbuf_uninit(&request);
/* Skip the netlink + ifinfomsg headers to reach the attributes. */
5407 if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
5408 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS64);
5409 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats64)) {
5410 netdev_stats_from_rtnl_link_stats64(stats, nl_attr_get(a));
/* No 64-bit stats: fall back to the 32-bit attribute. */
5413 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS);
5414 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
5415 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
5418 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
5423 VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");
5428 ofpbuf_delete(reply);
/* Reads 'dev''s interface flags (IFF_*) with SIOCGIFFLAGS into '*flags'.
 * Returns the ioctl wrapper's error code (elided lines presumably return
 * it — confirm against the full source). */
5433 get_flags(const struct netdev *dev, unsigned int *flags)
5439 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
5441 *flags = ifr.ifr_flags;
/* Sets the interface flags of the device named 'name' to 'flags' via
 * SIOCSIFFLAGS. Returns 0 on success, otherwise a positive errno value
 * from the ioctl wrapper. */
5447 set_flags(const char *name, unsigned int flags)
5451 ifr.ifr_flags = flags;
5452 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
/* Looks up the kernel ifindex for 'netdev_name' via SIOCGIFINDEX.
 * Returns the (non-negative) ifindex on success; on failure the elided
 * error path presumably returns -errno, which get_ifindex() below decodes
 * with the '-ifindex' negation. */
5456 do_get_ifindex(const char *netdev_name)
5461 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5462 COVERAGE_INC(netdev_get_ifindex);
5464 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
5466 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
5467 netdev_name, ovs_strerror(error));
5470 return ifr.ifr_ifindex;
/* Returns 'netdev_''s ifindex through '*ifindexp', caching the result (and
 * any lookup error) in the netdev_linux struct under VALID_IFINDEX so the
 * ioctl is issued at most once per cache invalidation.
 * Returns 0 on success, otherwise the cached positive errno value. */
5474 get_ifindex(const struct netdev *netdev_, int *ifindexp)
5476 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5478 if (!(netdev->cache_valid & VALID_IFINDEX)) {
5479 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
/* A negative result encodes -errno from do_get_ifindex(). */
5482 netdev->get_ifindex_error = -ifindex;
5483 netdev->ifindex = 0;
5485 netdev->get_ifindex_error = 0;
5486 netdev->ifindex = ifindex;
5488 netdev->cache_valid |= VALID_IFINDEX;
/* Serve from cache; the stored error is replayed for repeat callers. */
5491 *ifindexp = netdev->ifindex;
5492 return netdev->get_ifindex_error;
/* Retrieves the Ethernet address of 'netdev_name' into 'ea' via
 * SIOCGIFHWADDR. Warns (but still copies, per the visible fallthrough) if
 * the hardware address family is neither AF_UNSPEC nor ARPHRD_ETHER. */
5496 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
5502 memset(&ifr, 0, sizeof ifr);
5503 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5504 COVERAGE_INC(netdev_get_hwaddr);
5505 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
5507 /* ENODEV probably means that a vif disappeared asynchronously and
5508 * hasn't been removed from the database yet, so reduce the log level
5509 * to INFO for that case. */
5510 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
5511 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
5512 netdev_name, ovs_strerror(error));
5515 hwaddr_family = ifr.ifr_hwaddr.sa_family;
5516 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
5517 VLOG_WARN("%s device has unknown hardware address family %d",
5518 netdev_name, hwaddr_family);
5520 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the Ethernet address of 'netdev_name' to 'mac' via SIOCSIFHWADDR,
 * logging (at ERR) on failure. The elided tail presumably returns the
 * ioctl's error code. */
5525 set_etheraddr(const char *netdev_name,
5526 const uint8_t mac[ETH_ADDR_LEN])
5531 memset(&ifr, 0, sizeof ifr);
5532 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
/* The kernel requires the address family to be declared as Ethernet. */
5533 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
5534 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
5535 COVERAGE_INC(netdev_set_hwaddr);
5536 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
5538 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
5539 netdev_name, ovs_strerror(error));
/* Issues ethtool command 'cmd' (e.g. ETHTOOL_GFLAGS) for device 'name',
 * with 'ecmd' as the in/out command buffer, via SIOCETHTOOL. 'cmd_name' is
 * used only for the log message. Rate-limits warnings, and stays silent on
 * EOPNOTSUPP since many devices legitimately lack ethtool support. */
5545 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
5546 int cmd, const char *cmd_name)
5551 memset(&ifr, 0, sizeof ifr);
5552 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
/* SIOCETHTOOL passes the ethtool command struct via ifr_data. */
5553 ifr.ifr_data = (caddr_t) ecmd;
5556 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
5558 if (error != EOPNOTSUPP) {
5559 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
5560 "failed: %s", cmd_name, name, ovs_strerror(error));
5562 /* The device doesn't support this operation. That's pretty
5563 * common, so there's no point in logging anything. */
/* Queries an IPv4 address-family ioctl 'cmd' (e.g. SIOCGIFADDR) on 'netdev'
 * and stores the resulting address in '*ip'. 'cmd_name' is used only for
 * error reporting by the ioctl wrapper. */
5570 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
5571 int cmd, const char *cmd_name)
5576 ifr.ifr_addr.sa_family = AF_INET;
5577 error = af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
/* ifr_addr is a generic sockaddr; reinterpret it as sockaddr_in.
 * ALIGNED_CAST asserts the alignment is adequate for the cast. */
5579 const struct sockaddr_in *sin = ALIGNED_CAST(struct sockaddr_in *,
5581 *ip = sin->sin_addr;
5586 /* Returns an AF_PACKET raw socket or a negative errno value. */
5588 af_packet_sock(void)
5590 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5593 if (ovsthread_once_start(&once)) {
5594 sock = socket(AF_PACKET, SOCK_RAW, 0);
5596 int error = set_nonblocking(sock);
5603 VLOG_ERR("failed to create packet socket: %s",
5604 ovs_strerror(errno));
5606 ovsthread_once_done(&once);