/*
 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2015 Nicira, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include <config.h>

#include "netdev-linux.h"

#include <errno.h>
#include <fcntl.h>
#include <arpa/inet.h>
#include <inttypes.h>
#include <linux/filter.h>
#include <linux/gen_stats.h>
#include <linux/if_ether.h>
#include <linux/if_tun.h>
#include <linux/types.h>
#include <linux/ethtool.h>
#include <linux/mii.h>
#include <linux/pkt_cls.h>
#include <linux/pkt_sched.h>
#include <linux/rtnetlink.h>
#include <linux/sockios.h>
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/utsname.h>
#include <netpacket/packet.h>
#include <net/if.h>
#include <net/if_arp.h>
#include <net/if_packet.h>
#include <net/route.h>
#include <netinet/in.h>
#include <poll.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include "coverage.h"
#include "dp-packet.h"
#include "dpif-netlink.h"
#include "dpif-netdev.h"
#include "dynamic-string.h"
#include "fatal-signal.h"
#include "hash.h"
#include "hmap.h"
#include "netdev-provider.h"
#include "netdev-vport.h"
#include "netlink-notifier.h"
#include "netlink-socket.h"
#include "netlink.h"
#include "ofpbuf.h"
#include "openflow/openflow.h"
#include "ovs-atomic.h"
#include "packets.h"
#include "poll-loop.h"
#include "rtnetlink-link.h"
#include "shash.h"
#include "socket-util.h"
#include "sset.h"
#include "timer.h"
#include "unaligned.h"
#include "openvswitch/vlog.h"
#include "util.h"
VLOG_DEFINE_THIS_MODULE(netdev_linux);

/* Coverage counters: bumped on each corresponding operation so that
 * "ovs-appctl coverage/show" can report how often they occur. */
COVERAGE_DEFINE(netdev_set_policing);
COVERAGE_DEFINE(netdev_arp_lookup);
COVERAGE_DEFINE(netdev_get_ifindex);
COVERAGE_DEFINE(netdev_get_hwaddr);
COVERAGE_DEFINE(netdev_set_hwaddr);
COVERAGE_DEFINE(netdev_get_ethtool);
COVERAGE_DEFINE(netdev_set_ethtool);
/* These were introduced in Linux 2.6.14, so they might be missing if we have
 * old headers. */
#ifndef ADVERTISED_Pause
#define ADVERTISED_Pause                (1 << 13)
#endif
#ifndef ADVERTISED_Asym_Pause
#define ADVERTISED_Asym_Pause           (1 << 14)
#endif

/* These were introduced in Linux 2.6.24, so they might be missing if we
 * have old headers. */
#ifndef ETHTOOL_GFLAGS
#define ETHTOOL_GFLAGS       0x00000025 /* Get flags bitmap(ethtool_value) */
#endif
#ifndef ETHTOOL_SFLAGS
#define ETHTOOL_SFLAGS       0x00000026 /* Set flags bitmap(ethtool_value) */
#endif

/* This was introduced in Linux 2.6.25, so it might be missing if we have old
 * headers. */
#ifndef TC_RTAB_SIZE
#define TC_RTAB_SIZE 1024
#endif

/* Linux 2.6.21 introduced struct tpacket_auxdata.
 * Linux 2.6.27 added the tp_vlan_tci member.
 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
 * TP_STATUS_VLAN_TPID_VALID.
 *
 * With all this churn it's easiest to unconditionally define a replacement
 * structure that has everything we want. */
#ifndef PACKET_AUXDATA
#define PACKET_AUXDATA                  8
#endif
#ifndef TP_STATUS_VLAN_VALID
#define TP_STATUS_VLAN_VALID            (1 << 4)
#endif
#ifndef TP_STATUS_VLAN_TPID_VALID
#define TP_STATUS_VLAN_TPID_VALID       (1 << 6)
#endif
/* Replacement for the kernel's struct tpacket_auxdata (see the churn history
 * documented above): always has the tp_vlan_tci and tp_vlan_tpid members
 * regardless of kernel header version. */
#undef tpacket_auxdata
#define tpacket_auxdata rpl_tpacket_auxdata
struct tpacket_auxdata {
    uint32_t tp_status;
    uint32_t tp_len;
    uint32_t tp_snaplen;
    uint16_t tp_mac;
    uint16_t tp_net;
    uint16_t tp_vlan_tci;
    uint16_t tp_vlan_tpid;
};
/* Linux 2.6.35 introduced IFLA_STATS64 and rtnl_link_stats64.
 *
 * Tests for rtnl_link_stats64 don't seem to consistently work, e.g. on
 * 2.6.32-431.29.2.el6.x86_64 (see report at
 * http://openvswitch.org/pipermail/dev/2014-October/047978.html).  Maybe
 * if_link.h is not self-contained on those kernels.  It is easiest to
 * unconditionally define a replacement. */
#ifndef IFLA_STATS64
#define IFLA_STATS64 23
#endif
#define rtnl_link_stats64 rpl_rtnl_link_stats64
struct rtnl_link_stats64 {
    uint64_t rx_packets;
    uint64_t tx_packets;
    uint64_t rx_bytes;
    uint64_t tx_bytes;
    uint64_t rx_errors;
    uint64_t tx_errors;
    uint64_t rx_dropped;
    uint64_t tx_dropped;
    uint64_t multicast;
    uint64_t collisions;

    uint64_t rx_length_errors;
    uint64_t rx_over_errors;
    uint64_t rx_crc_errors;
    uint64_t rx_frame_errors;
    uint64_t rx_fifo_errors;
    uint64_t rx_missed_errors;

    uint64_t tx_aborted_errors;
    uint64_t tx_carrier_errors;
    uint64_t tx_fifo_errors;
    uint64_t tx_heartbeat_errors;
    uint64_t tx_window_errors;

    uint64_t rx_compressed;
    uint64_t tx_compressed;
};
/* Bits for 'cache_valid' in struct netdev_linux: which on-demand fields are
 * currently valid.  (VALID_IN4/IN6/MTU fall in the gap of visible bits and
 * match the upstream definition -- confirm against the full file.) */
enum {
    VALID_IFINDEX           = 1 << 0,
    VALID_ETHERADDR         = 1 << 1,
    VALID_IN4               = 1 << 2,
    VALID_IN6               = 1 << 3,
    VALID_MTU               = 1 << 4,
    VALID_POLICING          = 1 << 5,
    VALID_VPORT_STAT_ERROR  = 1 << 6,
    VALID_DRVINFO           = 1 << 7,
    VALID_FEATURES          = 1 << 8,
};
194 /* Traffic control. */
196 /* An instance of a traffic control class. Always associated with a particular
199 * Each TC implementation subclasses this with whatever additional data it
202 const struct tc_ops *ops;
203 struct hmap queues; /* Contains "struct tc_queue"s.
204 * Read by generic TC layer.
205 * Written only by TC implementation. */
208 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
210 /* One traffic control queue.
212 * Each TC implementation subclasses this with whatever additional data it
215 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
216 unsigned int queue_id; /* OpenFlow queue ID. */
217 long long int created; /* Time queue was created, in msecs. */
220 /* A particular kind of traffic control. Each implementation generally maps to
221 * one particular Linux qdisc class.
223 * The functions below return 0 if successful or a positive errno value on
224 * failure, except where otherwise noted. All of them must be provided, except
225 * where otherwise noted. */
227 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
228 * This is null for tc_ops_default and tc_ops_other, for which there are no
229 * appropriate values. */
230 const char *linux_name;
232 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
233 const char *ovs_name;
235 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
236 * queues. The queues are numbered 0 through n_queues - 1. */
237 unsigned int n_queues;
239 /* Called to install this TC class on 'netdev'. The implementation should
240 * make the Netlink calls required to set up 'netdev' with the right qdisc
241 * and configure it according to 'details'. The implementation may assume
242 * that the current qdisc is the default; that is, there is no need for it
243 * to delete the current qdisc before installing itself.
245 * The contents of 'details' should be documented as valid for 'ovs_name'
246 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
247 * (which is built as ovs-vswitchd.conf.db(8)).
249 * This function must return 0 if and only if it sets 'netdev->tc' to an
250 * initialized 'struct tc'.
252 * (This function is null for tc_ops_other, which cannot be installed. For
253 * other TC classes it should always be nonnull.) */
254 int (*tc_install)(struct netdev *netdev, const struct smap *details);
256 /* Called when the netdev code determines (through a Netlink query) that
257 * this TC class's qdisc is installed on 'netdev', but we didn't install
258 * it ourselves and so don't know any of the details.
260 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
261 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
262 * implementation should parse the other attributes of 'nlmsg' as
263 * necessary to determine its configuration. If necessary it should also
264 * use Netlink queries to determine the configuration of queues on
267 * This function must return 0 if and only if it sets 'netdev->tc' to an
268 * initialized 'struct tc'. */
269 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
271 /* Destroys the data structures allocated by the implementation as part of
272 * 'tc'. (This includes destroying 'tc->queues' by calling
275 * The implementation should not need to perform any Netlink calls. If
276 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
277 * (But it may not be desirable.)
279 * This function may be null if 'tc' is trivial. */
280 void (*tc_destroy)(struct tc *tc);
282 /* Retrieves details of 'netdev->tc' configuration into 'details'.
284 * The implementation should not need to perform any Netlink calls, because
285 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
286 * cached the configuration.
288 * The contents of 'details' should be documented as valid for 'ovs_name'
289 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
290 * (which is built as ovs-vswitchd.conf.db(8)).
292 * This function may be null if 'tc' is not configurable.
294 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
296 /* Reconfigures 'netdev->tc' according to 'details', performing any
297 * required Netlink calls to complete the reconfiguration.
299 * The contents of 'details' should be documented as valid for 'ovs_name'
300 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
301 * (which is built as ovs-vswitchd.conf.db(8)).
303 * This function may be null if 'tc' is not configurable.
305 int (*qdisc_set)(struct netdev *, const struct smap *details);
307 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
308 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
310 * The contents of 'details' should be documented as valid for 'ovs_name'
311 * in the "other_config" column in the "Queue" table in
312 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
314 * The implementation should not need to perform any Netlink calls, because
315 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
316 * cached the queue configuration.
318 * This function may be null if 'tc' does not have queues ('n_queues' is
320 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
321 struct smap *details);
323 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
324 * 'details', perfoming any required Netlink calls to complete the
325 * reconfiguration. The caller ensures that 'queue_id' is less than
328 * The contents of 'details' should be documented as valid for 'ovs_name'
329 * in the "other_config" column in the "Queue" table in
330 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
332 * This function may be null if 'tc' does not have queues or its queues are
333 * not configurable. */
334 int (*class_set)(struct netdev *, unsigned int queue_id,
335 const struct smap *details);
337 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
338 * tc_queue's within 'netdev->tc->queues'.
340 * This function may be null if 'tc' does not have queues or its queues
341 * cannot be deleted. */
342 int (*class_delete)(struct netdev *, struct tc_queue *queue);
344 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
345 * 'struct tc_queue's within 'netdev->tc->queues'.
347 * On success, initializes '*stats'.
349 * This function may be null if 'tc' does not have queues or if it cannot
350 * report queue statistics. */
351 int (*class_get_stats)(const struct netdev *netdev,
352 const struct tc_queue *queue,
353 struct netdev_queue_stats *stats);
355 /* Extracts queue stats from 'nlmsg', which is a response to a
356 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
358 * This function may be null if 'tc' does not have queues or if it cannot
359 * report queue statistics. */
360 int (*class_dump_stats)(const struct netdev *netdev,
361 const struct ofpbuf *nlmsg,
362 netdev_dump_queue_stats_cb *cb, void *aux);
366 tc_init(struct tc *tc, const struct tc_ops *ops)
369 hmap_init(&tc->queues);
373 tc_destroy(struct tc *tc)
375 hmap_destroy(&tc->queues);
378 static const struct tc_ops tc_ops_htb;
379 static const struct tc_ops tc_ops_hfsc;
380 static const struct tc_ops tc_ops_default;
381 static const struct tc_ops tc_ops_other;
383 static const struct tc_ops *const tcs[] = {
384 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
385 &tc_ops_hfsc, /* Hierarchical fair service curve. */
386 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
387 &tc_ops_other, /* Some other qdisc. */
391 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
392 static unsigned int tc_get_major(unsigned int handle);
393 static unsigned int tc_get_minor(unsigned int handle);
395 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
396 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
397 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
399 static struct tcmsg *tc_make_request(const struct netdev *, int type,
400 unsigned int flags, struct ofpbuf *);
401 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
402 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
403 static int tc_add_policer(struct netdev *,
404 uint32_t kbits_rate, uint32_t kbits_burst);
406 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
407 struct nlattr **options);
408 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
409 struct nlattr **options,
410 struct netdev_queue_stats *);
411 static int tc_query_class(const struct netdev *,
412 unsigned int handle, unsigned int parent,
413 struct ofpbuf **replyp);
414 static int tc_delete_class(const struct netdev *, unsigned int handle);
416 static int tc_del_qdisc(struct netdev *netdev);
417 static int tc_query_qdisc(const struct netdev *netdev);
419 static int tc_calc_cell_log(unsigned int mtu);
420 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
421 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
422 const struct tc_ratespec *rate);
423 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
425 struct netdev_linux {
428 /* Protects all members below. */
429 struct ovs_mutex mutex;
431 unsigned int cache_valid;
433 bool miimon; /* Link status of last poll. */
434 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
435 struct timer miimon_timer;
437 /* The following are figured out "on demand" only. They are only valid
438 * when the corresponding VALID_* bit in 'cache_valid' is set. */
440 uint8_t etheraddr[ETH_ADDR_LEN];
441 struct in_addr address, netmask;
444 unsigned int ifi_flags;
445 long long int carrier_resets;
446 uint32_t kbits_rate; /* Policing data. */
447 uint32_t kbits_burst;
448 int vport_stats_error; /* Cached error code from vport_get_stats().
449 0 or an errno value. */
450 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
451 int ether_addr_error; /* Cached error code from set/get etheraddr. */
452 int netdev_policing_error; /* Cached error code from set policing. */
453 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
454 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
456 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
457 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
458 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
460 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
463 /* For devices of class netdev_tap_class only. */
467 struct netdev_rxq_linux {
468 struct netdev_rxq up;
473 /* This is set pretty low because we probably won't learn anything from the
474 * additional log messages. */
475 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
477 /* Polling miimon status for all ports causes performance degradation when
478 * handling a large number of ports. If there are no devices using miimon, then
479 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
481 * Readers do not depend on this variable synchronizing with the related
482 * changes in the device miimon status, so we can use atomic_count. */
483 static atomic_count miimon_cnt = ATOMIC_COUNT_INIT(0);
485 static void netdev_linux_run(void);
487 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
488 int cmd, const char *cmd_name);
489 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
490 int cmd, const char *cmd_name);
491 static int get_flags(const struct netdev *, unsigned int *flags);
492 static int set_flags(const char *, unsigned int flags);
493 static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
494 enum netdev_flags on, enum netdev_flags *old_flagsp)
495 OVS_REQUIRES(netdev->mutex);
496 static int do_get_ifindex(const char *netdev_name);
497 static int get_ifindex(const struct netdev *, int *ifindexp);
498 static int do_set_addr(struct netdev *netdev,
499 int ioctl_nr, const char *ioctl_name,
500 struct in_addr addr);
501 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
502 static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
503 static int get_stats_via_netlink(const struct netdev *, struct netdev_stats *);
504 static int af_packet_sock(void);
505 static bool netdev_linux_miimon_enabled(void);
506 static void netdev_linux_miimon_run(void);
507 static void netdev_linux_miimon_wait(void);
508 static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup);
511 is_netdev_linux_class(const struct netdev_class *netdev_class)
513 return netdev_class->run == netdev_linux_run;
517 is_tap_netdev(const struct netdev *netdev)
519 return netdev_get_class(netdev) == &netdev_tap_class;
522 static struct netdev_linux *
523 netdev_linux_cast(const struct netdev *netdev)
525 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
527 return CONTAINER_OF(netdev, struct netdev_linux, up);
530 static struct netdev_rxq_linux *
531 netdev_rxq_linux_cast(const struct netdev_rxq *rx)
533 ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
534 return CONTAINER_OF(rx, struct netdev_rxq_linux, up);
537 static void netdev_linux_update(struct netdev_linux *netdev,
538 const struct rtnetlink_link_change *)
539 OVS_REQUIRES(netdev->mutex);
540 static void netdev_linux_changed(struct netdev_linux *netdev,
541 unsigned int ifi_flags, unsigned int mask)
542 OVS_REQUIRES(netdev->mutex);
544 /* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK changes, or NULL
545 * if no such socket could be created. */
546 static struct nl_sock *
547 netdev_linux_notify_sock(void)
549 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
550 static struct nl_sock *sock;
552 if (ovsthread_once_start(&once)) {
555 error = nl_sock_create(NETLINK_ROUTE, &sock);
557 error = nl_sock_join_mcgroup(sock, RTNLGRP_LINK);
559 nl_sock_destroy(sock);
563 ovsthread_once_done(&once);
570 netdev_linux_miimon_enabled(void)
572 return atomic_count_get(&miimon_cnt) > 0;
576 netdev_linux_run(void)
578 struct nl_sock *sock;
581 if (netdev_linux_miimon_enabled()) {
582 netdev_linux_miimon_run();
585 sock = netdev_linux_notify_sock();
591 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
592 uint64_t buf_stub[4096 / 8];
595 ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
596 error = nl_sock_recv(sock, &buf, false);
598 struct rtnetlink_link_change change;
600 if (rtnetlink_link_parse(&buf, &change)) {
601 struct netdev *netdev_ = netdev_from_name(change.ifname);
602 if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
603 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
605 ovs_mutex_lock(&netdev->mutex);
606 netdev_linux_update(netdev, &change);
607 ovs_mutex_unlock(&netdev->mutex);
609 netdev_close(netdev_);
611 } else if (error == ENOBUFS) {
612 struct shash device_shash;
613 struct shash_node *node;
617 shash_init(&device_shash);
618 netdev_get_devices(&netdev_linux_class, &device_shash);
619 SHASH_FOR_EACH (node, &device_shash) {
620 struct netdev *netdev_ = node->data;
621 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
624 ovs_mutex_lock(&netdev->mutex);
625 get_flags(netdev_, &flags);
626 netdev_linux_changed(netdev, flags, 0);
627 ovs_mutex_unlock(&netdev->mutex);
629 netdev_close(netdev_);
631 shash_destroy(&device_shash);
632 } else if (error != EAGAIN) {
633 VLOG_WARN_RL(&rl, "error reading or parsing netlink (%s)",
634 ovs_strerror(error));
641 netdev_linux_wait(void)
643 struct nl_sock *sock;
645 if (netdev_linux_miimon_enabled()) {
646 netdev_linux_miimon_wait();
648 sock = netdev_linux_notify_sock();
650 nl_sock_wait(sock, POLLIN);
655 netdev_linux_changed(struct netdev_linux *dev,
656 unsigned int ifi_flags, unsigned int mask)
657 OVS_REQUIRES(dev->mutex)
659 netdev_change_seq_changed(&dev->up);
661 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
662 dev->carrier_resets++;
664 dev->ifi_flags = ifi_flags;
666 dev->cache_valid &= mask;
670 netdev_linux_update(struct netdev_linux *dev,
671 const struct rtnetlink_link_change *change)
672 OVS_REQUIRES(dev->mutex)
674 if (change->nlmsg_type == RTM_NEWLINK) {
676 netdev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
678 /* Update netdev from rtnl-change msg. */
680 dev->mtu = change->mtu;
681 dev->cache_valid |= VALID_MTU;
682 dev->netdev_mtu_error = 0;
685 if (!eth_addr_is_zero(change->addr)) {
686 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
687 dev->cache_valid |= VALID_ETHERADDR;
688 dev->ether_addr_error = 0;
691 dev->ifindex = change->ifi_index;
692 dev->cache_valid |= VALID_IFINDEX;
693 dev->get_ifindex_error = 0;
696 netdev_linux_changed(dev, change->ifi_flags, 0);
700 static struct netdev *
701 netdev_linux_alloc(void)
703 struct netdev_linux *netdev = xzalloc(sizeof *netdev);
708 netdev_linux_common_construct(struct netdev_linux *netdev)
710 ovs_mutex_init(&netdev->mutex);
713 /* Creates system and internal devices. */
715 netdev_linux_construct(struct netdev *netdev_)
717 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
720 netdev_linux_common_construct(netdev);
722 error = get_flags(&netdev->up, &netdev->ifi_flags);
723 if (error == ENODEV) {
724 if (netdev->up.netdev_class != &netdev_internal_class) {
725 /* The device does not exist, so don't allow it to be opened. */
728 /* "Internal" netdevs have to be created as netdev objects before
729 * they exist in the kernel, because creating them in the kernel
730 * happens by passing a netdev object to dpif_port_add().
731 * Therefore, ignore the error. */
738 /* For most types of netdevs we open the device for each call of
739 * netdev_open(). However, this is not the case with tap devices,
740 * since it is only possible to open the device once. In this
741 * situation we share a single file descriptor, and consequently
742 * buffers, across all readers. Therefore once data is read it will
743 * be unavailable to other reads for tap devices. */
745 netdev_linux_construct_tap(struct netdev *netdev_)
747 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
748 static const char tap_dev[] = "/dev/net/tun";
749 const char *name = netdev_->name;
753 netdev_linux_common_construct(netdev);
755 /* Open tap device. */
756 netdev->tap_fd = open(tap_dev, O_RDWR);
757 if (netdev->tap_fd < 0) {
759 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
763 /* Create tap device. */
764 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
765 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
766 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
767 VLOG_WARN("%s: creating tap device failed: %s", name,
768 ovs_strerror(errno));
773 /* Make non-blocking. */
774 error = set_nonblocking(netdev->tap_fd);
782 close(netdev->tap_fd);
787 netdev_linux_destruct(struct netdev *netdev_)
789 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
791 if (netdev->tc && netdev->tc->ops->tc_destroy) {
792 netdev->tc->ops->tc_destroy(netdev->tc);
795 if (netdev_get_class(netdev_) == &netdev_tap_class
796 && netdev->tap_fd >= 0)
798 close(netdev->tap_fd);
801 if (netdev->miimon_interval > 0) {
802 atomic_count_dec(&miimon_cnt);
805 ovs_mutex_destroy(&netdev->mutex);
/* Frees the netdev_linux allocated by netdev_linux_alloc(). */
static void
netdev_linux_dealloc(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    free(netdev);
}
815 static struct netdev_rxq *
816 netdev_linux_rxq_alloc(void)
818 struct netdev_rxq_linux *rx = xzalloc(sizeof *rx);
823 netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
825 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
826 struct netdev *netdev_ = rx->up.netdev;
827 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
830 ovs_mutex_lock(&netdev->mutex);
831 rx->is_tap = is_tap_netdev(netdev_);
833 rx->fd = netdev->tap_fd;
835 struct sockaddr_ll sll;
837 /* Result of tcpdump -dd inbound */
838 static const struct sock_filter filt[] = {
839 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
840 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
841 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
842 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
844 static const struct sock_fprog fprog = {
845 ARRAY_SIZE(filt), (struct sock_filter *) filt
848 /* Create file descriptor. */
849 rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
852 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
857 if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) {
859 VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
860 netdev_get_name(netdev_), ovs_strerror(error));
864 /* Set non-blocking mode. */
865 error = set_nonblocking(rx->fd);
870 /* Get ethernet device index. */
871 error = get_ifindex(&netdev->up, &ifindex);
876 /* Bind to specific ethernet device. */
877 memset(&sll, 0, sizeof sll);
878 sll.sll_family = AF_PACKET;
879 sll.sll_ifindex = ifindex;
880 sll.sll_protocol = htons(ETH_P_ALL);
881 if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
883 VLOG_ERR("%s: failed to bind raw socket (%s)",
884 netdev_get_name(netdev_), ovs_strerror(error));
888 /* Filter for only inbound packets. */
889 error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
893 VLOG_ERR("%s: failed to attach filter (%s)",
894 netdev_get_name(netdev_), ovs_strerror(error));
898 ovs_mutex_unlock(&netdev->mutex);
906 ovs_mutex_unlock(&netdev->mutex);
911 netdev_linux_rxq_destruct(struct netdev_rxq *rxq_)
913 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
/* Frees the receive-queue object allocated by netdev_linux_rxq_alloc(). */
static void
netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_)
{
    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);

    free(rx);
}
929 auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux)
931 if (aux->tp_status & TP_STATUS_VLAN_TPID_VALID) {
932 return htons(aux->tp_vlan_tpid);
934 return htons(ETH_TYPE_VLAN);
939 auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
941 return aux->tp_vlan_tci || aux->tp_status & TP_STATUS_VLAN_VALID;
945 netdev_linux_rxq_recv_sock(int fd, struct dp_packet *buffer)
950 struct cmsghdr *cmsg;
953 char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
957 /* Reserve headroom for a single VLAN tag */
958 dp_packet_reserve(buffer, VLAN_HEADER_LEN);
959 size = dp_packet_tailroom(buffer);
961 iov.iov_base = dp_packet_data(buffer);
963 msgh.msg_name = NULL;
964 msgh.msg_namelen = 0;
967 msgh.msg_control = &cmsg_buffer;
968 msgh.msg_controllen = sizeof cmsg_buffer;
972 retval = recvmsg(fd, &msgh, MSG_TRUNC);
973 } while (retval < 0 && errno == EINTR);
977 } else if (retval > size) {
981 dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);
983 for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg; cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
984 const struct tpacket_auxdata *aux;
986 if (cmsg->cmsg_level != SOL_PACKET
987 || cmsg->cmsg_type != PACKET_AUXDATA
988 || cmsg->cmsg_len < CMSG_LEN(sizeof(struct tpacket_auxdata))) {
992 aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg));
993 if (auxdata_has_vlan_tci(aux)) {
994 if (retval < ETH_HEADER_LEN) {
998 eth_push_vlan(buffer, auxdata_to_vlan_tpid(aux),
999 htons(aux->tp_vlan_tci));
1008 netdev_linux_rxq_recv_tap(int fd, struct dp_packet *buffer)
1011 size_t size = dp_packet_tailroom(buffer);
1014 retval = read(fd, dp_packet_data(buffer), size);
1015 } while (retval < 0 && errno == EINTR);
1019 } else if (retval > size) {
1023 dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);
1028 netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet **packets,
1031 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1032 struct netdev *netdev = rx->up.netdev;
1033 struct dp_packet *buffer;
1037 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
1038 mtu = ETH_PAYLOAD_MAX;
1041 buffer = dp_packet_new_with_headroom(VLAN_ETH_HEADER_LEN + mtu,
1042 DP_NETDEV_HEADROOM);
1043 retval = (rx->is_tap
1044 ? netdev_linux_rxq_recv_tap(rx->fd, buffer)
1045 : netdev_linux_rxq_recv_sock(rx->fd, buffer));
1048 if (retval != EAGAIN && retval != EMSGSIZE) {
1049 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
1050 ovs_strerror(errno), netdev_rxq_get_name(rxq_));
1052 dp_packet_delete(buffer);
1054 dp_packet_pad(buffer);
1055 dp_packet_set_dp_hash(buffer, 0);
1056 packets[0] = buffer;
1064 netdev_linux_rxq_wait(struct netdev_rxq *rxq_)
1066 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1067 poll_fd_wait(rx->fd, POLLIN);
1071 netdev_linux_rxq_drain(struct netdev_rxq *rxq_)
1073 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1076 int error = af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_), &ifr,
1077 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
1081 drain_fd(rx->fd, ifr.ifr_qlen);
1084 return drain_rcvbuf(rx->fd);
1088 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
1089 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
1090 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
1091 * the packet is too big or too small to transmit on the device.
1093 * The caller retains ownership of 'buffer' in all cases.
1095 * The kernel maintains a packet transmission queue, so the caller is not
1096 * expected to do additional queuing of packets. */
1098 netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED,
1099 struct dp_packet **pkts, int cnt, bool may_steal)
1104 /* 'i' is incremented only if there's no error */
1105 for (i = 0; i < cnt;) {
1106 const void *data = dp_packet_data(pkts[i]);
1107 size_t size = dp_packet_size(pkts[i]);
1110 if (!is_tap_netdev(netdev_)) {
1111 /* Use our AF_PACKET socket to send to this device. */
1112 struct sockaddr_ll sll;
1118 sock = af_packet_sock();
1123 ifindex = netdev_get_ifindex(netdev_);
1128 /* We don't bother setting most fields in sockaddr_ll because the
1129 * kernel ignores them for SOCK_RAW. */
1130 memset(&sll, 0, sizeof sll);
1131 sll.sll_family = AF_PACKET;
1132 sll.sll_ifindex = ifindex;
1134 iov.iov_base = CONST_CAST(void *, data);
1137 msg.msg_name = &sll;
1138 msg.msg_namelen = sizeof sll;
1141 msg.msg_control = NULL;
1142 msg.msg_controllen = 0;
1145 retval = sendmsg(sock, &msg, 0);
1147 /* Use the tap fd to send to this device. This is essential for
1148 * tap devices, because packets sent to a tap device with an
1149 * AF_PACKET socket will loop back to be *received* again on the
1150 * tap device. This doesn't occur on other interface types
1151 * because we attach a socket filter to the rx socket. */
1152 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1154 retval = write(netdev->tap_fd, data, size);
1158 /* The Linux AF_PACKET implementation never blocks waiting for room
1159 * for packets, instead returning ENOBUFS. Translate this into
1160 * EAGAIN for the caller. */
1161 error = errno == ENOBUFS ? EAGAIN : errno;
1162 if (error == EINTR) {
1163 /* continue without incrementing 'i', i.e. retry this packet */
1167 } else if (retval != size) {
1168 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE" bytes"
1169 " of %"PRIuSIZE") on %s", retval, size,
1170 netdev_get_name(netdev_));
1175 /* Process the next packet in the batch */
1180 for (i = 0; i < cnt; i++) {
1181 dp_packet_delete(pkts[i]);
1185 if (error && error != EAGAIN) {
1186 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
1187 netdev_get_name(netdev_), ovs_strerror(error));
1194 /* Registers with the poll loop to wake up from the next call to poll_block()
1195 * when the packet transmission queue has sufficient room to transmit a packet
1196 * with netdev_send().
1198 * The kernel maintains a packet transmission queue, so the client is not
1199 * expected to do additional queuing of packets. Thus, this function is
1200 * unlikely to ever be used. It is included for completeness. */
1202 netdev_linux_send_wait(struct netdev *netdev, int qid OVS_UNUSED)
1204 if (is_tap_netdev(netdev)) {
1205 /* TAP device always accepts packets.*/
1206 poll_immediate_wake();
/* NOTE(review): the non-tap branch is elided in this fragment; presumably it
 * registers the shared AF_PACKET socket with the poll loop — confirm against
 * the full source. */
1210 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1211 * otherwise a positive errno value. */
1213 netdev_linux_set_etheraddr(struct netdev *netdev_,
1214 const uint8_t mac[ETH_ADDR_LEN])
1216 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1217 enum netdev_flags old_flags = 0;
1220 ovs_mutex_lock(&netdev->mutex);
/* If a previous result is cached, skip the ioctl when it would be a no-op
 * (address already matches) or would just repeat a cached failure. */
1222 if (netdev->cache_valid & VALID_ETHERADDR) {
1223 error = netdev->ether_addr_error;
1224 if (error || eth_addr_equals(netdev->etheraddr, mac)) {
1227 netdev->cache_valid &= ~VALID_ETHERADDR;
1230 /* Tap devices must be brought down before setting the address. */
1231 if (is_tap_netdev(netdev_)) {
1232 update_flags(netdev, NETDEV_UP, 0, &old_flags);
1234 error = set_etheraddr(netdev_get_name(netdev_), mac);
/* Cache the outcome; ENODEV (device disappeared) is cached too so that
 * repeated attempts do not keep issuing ioctls. */
1235 if (!error || error == ENODEV) {
1236 netdev->ether_addr_error = error;
1237 netdev->cache_valid |= VALID_ETHERADDR;
1239 memcpy(netdev->etheraddr, mac, ETH_ADDR_LEN);
/* Bring a tap device back up if we took it down above. */
1243 if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
1244 update_flags(netdev, 0, NETDEV_UP, &old_flags);
1248 ovs_mutex_unlock(&netdev->mutex);
1252 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1254 netdev_linux_get_etheraddr(const struct netdev *netdev_,
1255 uint8_t mac[ETH_ADDR_LEN])
1257 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1260 ovs_mutex_lock(&netdev->mutex);
/* Query the kernel only on the first call; afterwards serve the cached
 * address (and the cached error, if the query failed). */
1261 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1262 netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
1264 netdev->cache_valid |= VALID_ETHERADDR;
1267 error = netdev->ether_addr_error;
1269 memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN);
1271 ovs_mutex_unlock(&netdev->mutex);
/* Looks up 'netdev''s MTU via the SIOCGIFMTU ioctl, caching both the value
 * and any error, and stores it in '*mtup'. Caller must hold netdev->mutex. */
1277 netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
1281 if (!(netdev->cache_valid & VALID_MTU)) {
1284 netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
1285 netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
1286 netdev->mtu = ifr.ifr_mtu;
1287 netdev->cache_valid |= VALID_MTU;
1290 error = netdev->netdev_mtu_error;
1292 *mtup = netdev->mtu;
1298 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1299 * in bytes, not including the hardware header; thus, this is typically 1500
1300 * bytes for Ethernet devices. */
1302 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1304 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1307 ovs_mutex_lock(&netdev->mutex);
1308 error = netdev_linux_get_mtu__(netdev, mtup);
1309 ovs_mutex_unlock(&netdev->mutex);
1314 /* Sets the maximum size of transmitted (MTU) for given device using linux
1315 * networking ioctl interface.
1318 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1320 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1324 ovs_mutex_lock(&netdev->mutex);
/* Avoid a redundant SIOCSIFMTU when the cached MTU already matches, or when
 * a previous attempt left a cached error. */
1325 if (netdev->cache_valid & VALID_MTU) {
1326 error = netdev->netdev_mtu_error;
1327 if (error || netdev->mtu == mtu) {
1330 netdev->cache_valid &= ~VALID_MTU;
1333 error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
1334 SIOCSIFMTU, "SIOCSIFMTU");
/* Cache success and ENODEV; other errors leave the cache invalidated so the
 * next call retries. */
1335 if (!error || error == ENODEV) {
1336 netdev->netdev_mtu_error = error;
1337 netdev->mtu = ifr.ifr_mtu;
1338 netdev->cache_valid |= VALID_MTU;
1341 ovs_mutex_unlock(&netdev->mutex);
1345 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1346 * On failure, returns a negative errno value. */
1348 netdev_linux_get_ifindex(const struct netdev *netdev_)
1350 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1353 ovs_mutex_lock(&netdev->mutex);
1354 error = get_ifindex(netdev_, &ifindex);
1355 ovs_mutex_unlock(&netdev->mutex);
/* Negate the errno so callers can distinguish failure (negative) from a
 * valid, positive ifindex. */
1357 return error ? -error : ifindex;
/* Reports link state into '*carrier': the miimon result when miimon polling
 * is enabled for this device, otherwise the kernel's IFF_RUNNING flag. */
1361 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1363 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1365 ovs_mutex_lock(&netdev->mutex);
1366 if (netdev->miimon_interval > 0) {
1367 *carrier = netdev->miimon;
1369 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1371 ovs_mutex_unlock(&netdev->mutex);
/* Returns the number of carrier transitions recorded for 'netdev_'. */
1376 static long long int
1377 netdev_linux_get_carrier_resets(const struct netdev *netdev_)
1379 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1380 long long int carrier_resets;
1382 ovs_mutex_lock(&netdev->mutex);
1383 carrier_resets = netdev->carrier_resets;
1384 ovs_mutex_unlock(&netdev->mutex);
1386 return carrier_resets;
/* Issues MII ioctl 'cmd' (e.g. SIOCGMIIPHY, SIOCGMIIREG) on interface
 * 'name', copying 'data' in and out through ifr_data. Returns 0 or a
 * positive errno value. */
1390 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1391 struct mii_ioctl_data *data)
1396 memset(&ifr, 0, sizeof ifr);
1397 memcpy(&ifr.ifr_data, data, sizeof *data)
1398 error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
1399 memcpy(data, &ifr.ifr_data, sizeof *data);
/* Determines link status for 'name' via the MII registers, falling back to
 * ETHTOOL_GLINK when the MII queries fail. */
1405 netdev_linux_get_miimon(const char *name, bool *miimon)
1407 struct mii_ioctl_data data;
1412 memset(&data, 0, sizeof data);
/* First discover the PHY address, then read the Basic Mode Status Register
 * and test its link-status bit. */
1413 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1415 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1416 data.reg_num = MII_BMSR;
1417 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1421 *miimon = !!(data.val_out & BMSR_LSTATUS);
1423 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1426 struct ethtool_cmd ecmd;
1428 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1431 COVERAGE_INC(netdev_get_ethtool);
1432 memset(&ecmd, 0, sizeof ecmd);
1433 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1436 struct ethtool_value eval;
/* ETHTOOL_GLINK actually yields a struct ethtool_value; its layout overlaps
 * the start of struct ethtool_cmd, hence the memcpy to reinterpret it. */
1438 memcpy(&eval, &ecmd, sizeof eval);
1439 *miimon = !!eval.data;
1441 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
/* Enables (interval > 0, clamped up to at least 100 ms) or disables miimon
 * polling for 'netdev_'. Forces an immediate poll by expiring the timer. */
1449 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1450 long long int interval)
1452 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1454 ovs_mutex_lock(&netdev->mutex);
1455 interval = interval > 0 ? MAX(interval, 100) : 0;
1456 if (netdev->miimon_interval != interval) {
/* Keep the global count of miimon-enabled devices in step with the
 * enable/disable transition. */
1457 if (interval && !netdev->miimon_interval) {
1458 atomic_count_inc(&miimon_cnt);
1459 } else if (!interval && netdev->miimon_interval) {
1460 atomic_count_dec(&miimon_cnt);
1463 netdev->miimon_interval = interval;
1464 timer_set_expired(&netdev->miimon_timer);
1466 ovs_mutex_unlock(&netdev->mutex);
/* Polls miimon link state for every netdev of this class whose miimon timer
 * has expired, recording state changes and rearming each timer. */
1472 netdev_linux_miimon_run(void)
1474 struct shash device_shash;
1475 struct shash_node *node;
1477 shash_init(&device_shash);
1478 netdev_get_devices(&netdev_linux_class, &device_shash);
1479 SHASH_FOR_EACH (node, &device_shash) {
1480 struct netdev *netdev = node->data;
1481 struct netdev_linux *dev = netdev_linux_cast(netdev);
1484 ovs_mutex_lock(&dev->mutex);
1485 if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
1486 netdev_linux_get_miimon(dev->up.name, &miimon);
1487 if (miimon != dev->miimon) {
1488 dev->miimon = miimon;
/* Notify watchers that the device's status changed. */
1489 netdev_linux_changed(dev, dev->ifi_flags, 0);
1492 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1494 ovs_mutex_unlock(&dev->mutex);
/* Drop the reference taken by netdev_get_devices(). */
1495 netdev_close(netdev);
1498 shash_destroy(&device_shash);
/* Registers with the poll loop to wake up when any device's miimon timer
 * next expires. */
1502 netdev_linux_miimon_wait(void)
1504 struct shash device_shash;
1505 struct shash_node *node;
1507 shash_init(&device_shash);
1508 netdev_get_devices(&netdev_linux_class, &device_shash);
1509 SHASH_FOR_EACH (node, &device_shash) {
1510 struct netdev *netdev = node->data;
1511 struct netdev_linux *dev = netdev_linux_cast(netdev);
1513 ovs_mutex_lock(&dev->mutex);
1514 if (dev->miimon_interval > 0) {
1515 timer_wait(&dev->miimon_timer);
1517 ovs_mutex_unlock(&dev->mutex);
1518 netdev_close(netdev);
1520 shash_destroy(&device_shash);
/* Presumably exchanges '*a' and '*b' — body not visible in this fragment;
 * confirm against the full source. */
1524 swap_uint64(uint64_t *a, uint64_t *b)
1531 /* Copies 'src' into 'dst', performing format conversion in the process.
1533 * 'src' is allowed to be misaligned. */
1535 netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1536 const struct ovs_vport_stats *src)
1538 dst->rx_packets = get_32aligned_u64(&src->rx_packets);
1539 dst->tx_packets = get_32aligned_u64(&src->tx_packets);
1540 dst->rx_bytes = get_32aligned_u64(&src->rx_bytes);
1541 dst->tx_bytes = get_32aligned_u64(&src->tx_bytes);
1542 dst->rx_errors = get_32aligned_u64(&src->rx_errors);
1543 dst->tx_errors = get_32aligned_u64(&src->tx_errors);
1544 dst->rx_dropped = get_32aligned_u64(&src->rx_dropped);
1545 dst->tx_dropped = get_32aligned_u64(&src->tx_dropped);
/* Counters that struct ovs_vport_stats does not track are zeroed. */
1547 dst->collisions = 0;
1548 dst->rx_length_errors = 0;
1549 dst->rx_over_errors = 0;
1550 dst->rx_crc_errors = 0;
1551 dst->rx_frame_errors = 0;
1552 dst->rx_fifo_errors = 0;
1553 dst->rx_missed_errors = 0;
1554 dst->tx_aborted_errors = 0;
1555 dst->tx_carrier_errors = 0;
1556 dst->tx_fifo_errors = 0;
1557 dst->tx_heartbeat_errors = 0;
1558 dst->tx_window_errors = 0;
/* Fetches 'netdev''s vport stats from the datapath over netlink and converts
 * them into '*stats'. Returns 0 or a positive errno value. */
1562 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1564 struct dpif_netlink_vport reply;
1568 error = dpif_netlink_vport_get(netdev_get_name(netdev), &reply, &buf);
1571 } else if (!reply.stats) {
1576 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
/* Wrapper that records the vport-stats outcome in the device's cache and
 * rate-limits warnings for unexpected failures. ENOENT is not warned about;
 * it just means the device is not an OVS vport. */
1584 get_stats_via_vport(const struct netdev *netdev_,
1585 struct netdev_stats *stats)
1587 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1589 if (!netdev->vport_stats_error ||
1590 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1593 error = get_stats_via_vport__(netdev_, stats);
1594 if (error && error != ENOENT) {
1595 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1597 netdev_get_name(netdev_), ovs_strerror(error));
1599 netdev->vport_stats_error = error;
1600 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
1604 /* Retrieves current device stats for 'netdev-linux'. */
1606 netdev_linux_get_stats(const struct netdev *netdev_,
1607 struct netdev_stats *stats)
1609 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1610 struct netdev_stats dev_stats;
1613 ovs_mutex_lock(&netdev->mutex);
/* Gather stats from both sources: the OVS vport layer and the kernel
 * netdev (netlink). The branches below merge them; note some branch
 * bodies are elided in this fragment. */
1614 get_stats_via_vport(netdev_, stats);
1615 error = get_stats_via_netlink(netdev_, &dev_stats);
1617 if (!netdev->vport_stats_error) {
1620 } else if (netdev->vport_stats_error) {
1621 /* stats not available from OVS then use netdev stats. */
1624 /* Use kernel netdev's packet and byte counts since vport's counters
1625 * do not reflect packet counts on the wire when GSO, TSO or GRO are
1627 stats->rx_packets = dev_stats.rx_packets;
1628 stats->rx_bytes = dev_stats.rx_bytes;
1629 stats->tx_packets = dev_stats.tx_packets;
1630 stats->tx_bytes = dev_stats.tx_bytes;
/* Error and drop counters are accumulated from both sources. */
1632 stats->rx_errors += dev_stats.rx_errors;
1633 stats->tx_errors += dev_stats.tx_errors;
1634 stats->rx_dropped += dev_stats.rx_dropped;
1635 stats->tx_dropped += dev_stats.tx_dropped;
1636 stats->multicast += dev_stats.multicast;
1637 stats->collisions += dev_stats.collisions;
1638 stats->rx_length_errors += dev_stats.rx_length_errors;
1639 stats->rx_over_errors += dev_stats.rx_over_errors;
1640 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1641 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1642 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1643 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1644 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1645 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1646 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1647 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1648 stats->tx_window_errors += dev_stats.tx_window_errors;
1650 ovs_mutex_unlock(&netdev->mutex);
1655 /* Retrieves current device stats for 'netdev-tap' netdev or
1656 * netdev-internal. */
1658 netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
1660 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1661 struct netdev_stats dev_stats;
1664 ovs_mutex_lock(&netdev->mutex);
1665 get_stats_via_vport(netdev_, stats);
1666 error = get_stats_via_netlink(netdev_, &dev_stats);
1668 if (!netdev->vport_stats_error) {
1671 } else if (netdev->vport_stats_error) {
1672 /* Transmit and receive stats will appear to be swapped relative to the
1673 * other ports since we are the one sending the data, not a remote
1674 * computer. For consistency, we swap them back here. This does not
1675 * apply if we are getting stats from the vport layer because it always
1676 * tracks stats from the perspective of the switch. */
1679 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1680 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1681 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1682 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
/* Detailed error counters are meaningless for a tap device from the
 * switch's perspective, so they are cleared. */
1683 stats->rx_length_errors = 0;
1684 stats->rx_over_errors = 0;
1685 stats->rx_crc_errors = 0;
1686 stats->rx_frame_errors = 0;
1687 stats->rx_fifo_errors = 0;
1688 stats->rx_missed_errors = 0;
1689 stats->tx_aborted_errors = 0;
1690 stats->tx_carrier_errors = 0;
1691 stats->tx_fifo_errors = 0;
1692 stats->tx_heartbeat_errors = 0;
1693 stats->tx_window_errors = 0;
1695 /* Use kernel netdev's packet and byte counts since vport counters
1696 * do not reflect packet counts on the wire when GSO, TSO or GRO
/* Note the deliberate rx/tx cross-assignment below: kernel counters are
 * swapped into the switch's perspective (see comment above). */
1698 stats->rx_packets = dev_stats.tx_packets;
1699 stats->rx_bytes = dev_stats.tx_bytes;
1700 stats->tx_packets = dev_stats.rx_packets;
1701 stats->tx_bytes = dev_stats.rx_bytes;
1703 stats->rx_dropped += dev_stats.tx_dropped;
1704 stats->tx_dropped += dev_stats.rx_dropped;
1706 stats->rx_errors += dev_stats.tx_errors;
1707 stats->tx_errors += dev_stats.rx_errors;
1709 stats->multicast += dev_stats.multicast;
1710 stats->collisions += dev_stats.collisions;
1712 ovs_mutex_unlock(&netdev->mutex);
/* Retrieves stats for an "internal" device; these come solely from the OVS
 * vport layer. */
1718 netdev_internal_get_stats(const struct netdev *netdev_,
1719 struct netdev_stats *stats)
1721 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1724 ovs_mutex_lock(&netdev->mutex);
1725 get_stats_via_vport(netdev_, stats);
1726 error = netdev->vport_stats_error;
1727 ovs_mutex_unlock(&netdev->mutex);
/* Queries the device's link features with the ETHTOOL_GSET ioctl and
 * translates them into NETDEV_F_* bitmaps cached on 'netdev' (supported,
 * advertised, current). Any error is cached in 'get_features_error'.
 * Caller must hold netdev->mutex. */
1733 netdev_linux_read_features(struct netdev_linux *netdev)
1735 struct ethtool_cmd ecmd;
/* Results are cached; re-query only when the cache is invalid. */
1739 if (netdev->cache_valid & VALID_FEATURES) {
1743 COVERAGE_INC(netdev_get_ethtool);
1744 memset(&ecmd, 0, sizeof ecmd);
1745 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
1746 ETHTOOL_GSET, "ETHTOOL_GSET");
1751 /* Supported features. */
1752 netdev->supported = 0;
1753 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1754 netdev->supported |= NETDEV_F_10MB_HD;
1756 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1757 netdev->supported |= NETDEV_F_10MB_FD;
1759 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1760 netdev->supported |= NETDEV_F_100MB_HD;
1762 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1763 netdev->supported |= NETDEV_F_100MB_FD;
1765 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1766 netdev->supported |= NETDEV_F_1GB_HD;
1768 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1769 netdev->supported |= NETDEV_F_1GB_FD;
1771 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1772 netdev->supported |= NETDEV_F_10GB_FD;
1774 if (ecmd.supported & SUPPORTED_TP) {
1775 netdev->supported |= NETDEV_F_COPPER;
1777 if (ecmd.supported & SUPPORTED_FIBRE) {
1778 netdev->supported |= NETDEV_F_FIBER;
1780 if (ecmd.supported & SUPPORTED_Autoneg) {
1781 netdev->supported |= NETDEV_F_AUTONEG;
1783 if (ecmd.supported & SUPPORTED_Pause) {
1784 netdev->supported |= NETDEV_F_PAUSE;
1786 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1787 netdev->supported |= NETDEV_F_PAUSE_ASYM;
1790 /* Advertised features. */
1791 netdev->advertised = 0;
1792 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1793 netdev->advertised |= NETDEV_F_10MB_HD;
1795 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1796 netdev->advertised |= NETDEV_F_10MB_FD;
1798 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1799 netdev->advertised |= NETDEV_F_100MB_HD;
1801 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1802 netdev->advertised |= NETDEV_F_100MB_FD;
1804 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1805 netdev->advertised |= NETDEV_F_1GB_HD;
1807 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1808 netdev->advertised |= NETDEV_F_1GB_FD;
1810 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1811 netdev->advertised |= NETDEV_F_10GB_FD;
1813 if (ecmd.advertising & ADVERTISED_TP) {
1814 netdev->advertised |= NETDEV_F_COPPER;
1816 if (ecmd.advertising & ADVERTISED_FIBRE) {
1817 netdev->advertised |= NETDEV_F_FIBER;
1819 if (ecmd.advertising & ADVERTISED_Autoneg) {
1820 netdev->advertised |= NETDEV_F_AUTONEG;
1822 if (ecmd.advertising & ADVERTISED_Pause) {
1823 netdev->advertised |= NETDEV_F_PAUSE;
1825 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1826 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
1829 /* Current settings. */
1831 if (speed == SPEED_10) {
1832 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1833 } else if (speed == SPEED_100) {
1834 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1835 } else if (speed == SPEED_1000) {
1836 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1837 } else if (speed == SPEED_10000) {
1838 netdev->current = NETDEV_F_10GB_FD;
/* Literal speeds below: presumably because SPEED_40000 and up are missing
 * from older kernel headers — confirm before replacing with macros. */
1839 } else if (speed == 40000) {
1840 netdev->current = NETDEV_F_40GB_FD;
1841 } else if (speed == 100000) {
1842 netdev->current = NETDEV_F_100GB_FD;
1843 } else if (speed == 1000000) {
1844 netdev->current = NETDEV_F_1TB_FD;
1846 netdev->current = 0;
1849 if (ecmd.port == PORT_TP) {
1850 netdev->current |= NETDEV_F_COPPER;
1851 } else if (ecmd.port == PORT_FIBRE) {
1852 netdev->current |= NETDEV_F_FIBER;
/* (Guarding condition elided in this fragment; presumably tests
 * ecmd.autoneg.) */
1856 netdev->current |= NETDEV_F_AUTONEG;
1860 netdev->cache_valid |= VALID_FEATURES;
1861 netdev->get_features_error = error;
1864 /* Stores the features supported by 'netdev' into of '*current', '*advertised',
1865 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
1866 * Returns 0 if successful, otherwise a positive errno value. */
1868 netdev_linux_get_features(const struct netdev *netdev_,
1869 enum netdev_features *current,
1870 enum netdev_features *advertised,
1871 enum netdev_features *supported,
1872 enum netdev_features *peer)
1874 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1877 ovs_mutex_lock(&netdev->mutex);
/* Populate (or refresh) the cached feature bitmaps, then copy them out. */
1878 netdev_linux_read_features(netdev);
1879 if (!netdev->get_features_error) {
1880 *current = netdev->current;
1881 *advertised = netdev->advertised;
1882 *supported = netdev->supported;
1883 *peer = 0; /* XXX */
1885 error = netdev->get_features_error;
1886 ovs_mutex_unlock(&netdev->mutex);
1891 /* Set the features advertised by 'netdev' to 'advertise'. */
1893 netdev_linux_set_advertisements(struct netdev *netdev_,
1894 enum netdev_features advertise)
1896 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1897 struct ethtool_cmd ecmd;
1900 ovs_mutex_lock(&netdev->mutex);
/* Read the current ethtool settings, rewrite only the 'advertising' field
 * from the NETDEV_F_* bitmap, and write the settings back. */
1902 COVERAGE_INC(netdev_get_ethtool);
1903 memset(&ecmd, 0, sizeof ecmd);
1904 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1905 ETHTOOL_GSET, "ETHTOOL_GSET");
1910 ecmd.advertising = 0;
1911 if (advertise & NETDEV_F_10MB_HD) {
1912 ecmd.advertising |= ADVERTISED_10baseT_Half;
1914 if (advertise & NETDEV_F_10MB_FD) {
1915 ecmd.advertising |= ADVERTISED_10baseT_Full;
1917 if (advertise & NETDEV_F_100MB_HD) {
1918 ecmd.advertising |= ADVERTISED_100baseT_Half;
1920 if (advertise & NETDEV_F_100MB_FD) {
1921 ecmd.advertising |= ADVERTISED_100baseT_Full;
1923 if (advertise & NETDEV_F_1GB_HD) {
1924 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1926 if (advertise & NETDEV_F_1GB_FD) {
1927 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1929 if (advertise & NETDEV_F_10GB_FD) {
1930 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1932 if (advertise & NETDEV_F_COPPER) {
1933 ecmd.advertising |= ADVERTISED_TP;
1935 if (advertise & NETDEV_F_FIBER) {
1936 ecmd.advertising |= ADVERTISED_FIBRE;
1938 if (advertise & NETDEV_F_AUTONEG) {
1939 ecmd.advertising |= ADVERTISED_Autoneg;
1941 if (advertise & NETDEV_F_PAUSE) {
1942 ecmd.advertising |= ADVERTISED_Pause;
1944 if (advertise & NETDEV_F_PAUSE_ASYM) {
1945 ecmd.advertising |= ADVERTISED_Asym_Pause;
1947 COVERAGE_INC(netdev_set_ethtool);
1948 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1949 ETHTOOL_SSET, "ETHTOOL_SSET");
1952 ovs_mutex_unlock(&netdev->mutex);
1956 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1957 * successful, otherwise a positive errno value. */
1959 netdev_linux_set_policing(struct netdev *netdev_,
1960 uint32_t kbits_rate, uint32_t kbits_burst)
1962 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1963 const char *netdev_name = netdev_get_name(netdev_);
1966 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1967 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1968 : kbits_burst); /* Stick with user-specified value. */
1970 ovs_mutex_lock(&netdev->mutex);
/* Skip the tc reconfiguration when the cached settings already match, or
 * when a previous attempt left a cached error. */
1971 if (netdev->cache_valid & VALID_POLICING) {
1972 error = netdev->netdev_policing_error;
1973 if (error || (netdev->kbits_rate == kbits_rate &&
1974 netdev->kbits_burst == kbits_burst)) {
1975 /* Assume that settings haven't changed since we last set them. */
1978 netdev->cache_valid &= ~VALID_POLICING;
1981 COVERAGE_INC(netdev_set_policing);
1982 /* Remove any existing ingress qdisc. */
1983 error = tc_add_del_ingress_qdisc(netdev_, false);
1985 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1986 netdev_name, ovs_strerror(error));
/* With a nonzero rate, install a fresh ingress qdisc and attach the
 * policer action to it. */
1991 error = tc_add_del_ingress_qdisc(netdev_, true);
1993 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1994 netdev_name, ovs_strerror(error));
1998 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
2000 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
2001 netdev_name, ovs_strerror(error));
2006 netdev->kbits_rate = kbits_rate;
2007 netdev->kbits_burst = kbits_burst;
/* Cache success and ENODEV so repeated calls with the same settings are
 * cheap. */
2010 if (!error || error == ENODEV) {
2011 netdev->netdev_policing_error = error;
2012 netdev->cache_valid |= VALID_POLICING;
2014 ovs_mutex_unlock(&netdev->mutex);
/* Adds to 'types' the OVS names of all QoS disciplines that can be
 * installed. */
2019 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
2022 const struct tc_ops *const *opsp;
2024 for (opsp = tcs; *opsp != NULL; opsp++) {
2025 const struct tc_ops *ops = *opsp;
/* Skip tc types with no OVS-visible name and those that cannot be
 * installed. */
2026 if (ops->tc_install && ops->ovs_name[0] != '\0') {
2027 sset_add(types, ops->ovs_name);
/* Looks up a tc_ops by its OVS name. (The match-return and NULL fall-through
 * lines are elided in this fragment.) */
2033 static const struct tc_ops *
2034 tc_lookup_ovs_name(const char *name)
2036 const struct tc_ops *const *opsp;
2038 for (opsp = tcs; *opsp != NULL; opsp++) {
2039 const struct tc_ops *ops = *opsp;
2040 if (!strcmp(name, ops->ovs_name)) {
/* Looks up a tc_ops by its Linux qdisc name; entries with no linux_name are
 * skipped. */
2047 static const struct tc_ops *
2048 tc_lookup_linux_name(const char *name)
2050 const struct tc_ops *const *opsp;
2052 for (opsp = tcs; *opsp != NULL; opsp++) {
2053 const struct tc_ops *ops = *opsp;
2054 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Searches the hash bucket for 'hash' in 'netdev_''s tc queue map for the
 * queue with id 'queue_id'. */
2061 static struct tc_queue *
2062 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
2065 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2066 struct tc_queue *queue;
2068 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
2069 if (queue->queue_id == queue_id) {
/* Convenience wrapper: hashes 'queue_id' and delegates to tc_find_queue__. */
2076 static struct tc_queue *
2077 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
2079 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Reports the capabilities of QoS discipline 'type' in '*caps'; here the
 * maximum queue count is taken from the matching tc_ops entry. */
2083 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
2085 struct netdev_qos_capabilities *caps)
2087 const struct tc_ops *ops = tc_lookup_ovs_name(type);
2091 caps->n_queues = ops->n_queues;
/* Reports the currently installed QoS discipline: its OVS name in '*typep'
 * and, when the discipline supports it, its configuration in 'details'. */
2096 netdev_linux_get_qos(const struct netdev *netdev_,
2097 const char **typep, struct smap *details)
2099 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2102 ovs_mutex_lock(&netdev->mutex);
/* tc_query_qdisc() populates netdev->tc from the kernel if needed. */
2103 error = tc_query_qdisc(netdev_);
2105 *typep = netdev->tc->ops->ovs_name;
2106 error = (netdev->tc->ops->qdisc_get
2107 ? netdev->tc->ops->qdisc_get(netdev_, details)
2110 ovs_mutex_unlock(&netdev->mutex);
/* Installs or reconfigures QoS discipline 'type' on 'netdev_' from
 * 'details'. If the requested type is already installed it is merely
 * reconfigured; otherwise the old qdisc is deleted first. */
2116 netdev_linux_set_qos(struct netdev *netdev_,
2117 const char *type, const struct smap *details)
2119 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2120 const struct tc_ops *new_ops;
2123 new_ops = tc_lookup_ovs_name(type);
2124 if (!new_ops || !new_ops->tc_install) {
2128 ovs_mutex_lock(&netdev->mutex);
2129 error = tc_query_qdisc(netdev_);
2134 if (new_ops == netdev->tc->ops) {
2135 error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
2137 /* Delete existing qdisc. */
2138 error = tc_del_qdisc(netdev_);
2142 ovs_assert(netdev->tc == NULL);
2144 /* Install new qdisc. */
2145 error = new_ops->tc_install(netdev_, details);
/* Invariant: tc state exists exactly when the install succeeded. */
2146 ovs_assert((error == 0) == (netdev->tc != NULL));
2150 ovs_mutex_unlock(&netdev->mutex);
/* Retrieves the configuration of queue 'queue_id' into 'details', delegating
 * to the installed qdisc's class_get hook. */
2155 netdev_linux_get_queue(const struct netdev *netdev_,
2156 unsigned int queue_id, struct smap *details)
2158 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2161 ovs_mutex_lock(&netdev->mutex);
2162 error = tc_query_qdisc(netdev_);
2164 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2166 ? netdev->tc->ops->class_get(netdev_, queue, details)
2169 ovs_mutex_unlock(&netdev->mutex);
/* Configures queue 'queue_id' from 'details' via the qdisc's class_set
 * hook; rejects ids beyond the discipline's queue limit. */
2175 netdev_linux_set_queue(struct netdev *netdev_,
2176 unsigned int queue_id, const struct smap *details)
2178 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2181 ovs_mutex_lock(&netdev->mutex);
2182 error = tc_query_qdisc(netdev_);
2184 error = (queue_id < netdev->tc->ops->n_queues
2185 && netdev->tc->ops->class_set
2186 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
2189 ovs_mutex_unlock(&netdev->mutex);
/* Deletes queue 'queue_id' via the qdisc's class_delete hook, when the
 * discipline supports deletion and the queue exists. */
2195 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
2197 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2200 ovs_mutex_lock(&netdev->mutex);
2201 error = tc_query_qdisc(netdev_);
2203 if (netdev->tc->ops->class_delete) {
2204 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2206 ? netdev->tc->ops->class_delete(netdev_, queue)
2212 ovs_mutex_unlock(&netdev->mutex);
/* Retrieves statistics for queue 'queue_id' into '*stats' via the qdisc's
 * class_get_stats hook; 'created' comes from the cached queue record. */
2218 netdev_linux_get_queue_stats(const struct netdev *netdev_,
2219 unsigned int queue_id,
2220 struct netdev_queue_stats *stats)
2222 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2225 ovs_mutex_lock(&netdev->mutex);
2226 error = tc_query_qdisc(netdev_);
2228 if (netdev->tc->ops->class_get_stats) {
2229 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2231 stats->created = queue->created;
2232 error = netdev->tc->ops->class_get_stats(netdev_, queue,
2241 ovs_mutex_unlock(&netdev->mutex);
/* State for an rtnetlink dump of a netdev's traffic classes. (Remaining
 * members elided in this fragment.) */
2246 struct queue_dump_state {
2247 struct nl_dump dump;
/* Begins an RTM_GETTCLASS dump for 'netdev' into 'state'. */
2252 start_queue_dump(const struct netdev *netdev, struct queue_dump_state *state)
2254 struct ofpbuf request;
2255 struct tcmsg *tcmsg;
2257 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
2261 tcmsg->tcm_parent = 0;
2262 nl_dump_start(&state->dump, NETLINK_ROUTE, &request);
2263 ofpbuf_uninit(&request);
2265 ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
/* Releases 'state''s buffer and returns the dump's completion status. */
2270 finish_queue_dump(struct queue_dump_state *state)
2272 ofpbuf_uninit(&state->buf);
2273 return nl_dump_done(&state->dump);
/* Iteration state for dumping queue configurations: a snapshot of queue ids.
 * (Remaining members elided in this fragment.) */
2276 struct netdev_linux_queue_state {
2277 unsigned int *queues;
/* Starts a queue dump by snapshotting all queue ids of the installed qdisc
 * into a freshly allocated state stored in '*statep'. */
2283 netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
2285 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2288 ovs_mutex_lock(&netdev->mutex);
2289 error = tc_query_qdisc(netdev_);
2291 if (netdev->tc->ops->class_get) {
2292 struct netdev_linux_queue_state *state;
2293 struct tc_queue *queue;
2296 *statep = state = xmalloc(sizeof *state);
2297 state->n_queues = hmap_count(&netdev->tc->queues);
2298 state->cur_queue = 0;
2299 state->queues = xmalloc(state->n_queues * sizeof *state->queues);
2302 HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
2303 state->queues[i++] = queue->queue_id;
2309 ovs_mutex_unlock(&netdev->mutex);
/* Advances the queue dump: finds the next snapshotted queue id that still
 * exists, stores it in '*queue_idp', and fetches its details. Queues deleted
 * since the snapshot are skipped. */
2315 netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
2316 unsigned int *queue_idp, struct smap *details)
2318 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2319 struct netdev_linux_queue_state *state = state_;
2322 ovs_mutex_lock(&netdev->mutex);
2323 while (state->cur_queue < state->n_queues) {
2324 unsigned int queue_id = state->queues[state->cur_queue++];
2325 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2328 *queue_idp = queue_id;
2329 error = netdev->tc->ops->class_get(netdev_, queue, details);
2333 ovs_mutex_unlock(&netdev->mutex);
/* Frees the dump state allocated by netdev_linux_queue_dump_start(). */
2339 netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
2342 struct netdev_linux_queue_state *state = state_;
2344 free(state->queues);
/* Dumps per-queue statistics by iterating an RTM_GETTCLASS netlink dump and
 * invoking 'cb' for each class via the qdisc's class_dump_stats hook. */
2350 netdev_linux_dump_queue_stats(const struct netdev *netdev_,
2351 netdev_dump_queue_stats_cb *cb, void *aux)
2353 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2356 ovs_mutex_lock(&netdev->mutex);
2357 error = tc_query_qdisc(netdev_);
2359 struct queue_dump_state state;
2361 if (!netdev->tc->ops->class_dump_stats) {
2363 } else if (!start_queue_dump(netdev_, &state)) {
2369 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
2370 retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
2377 retval = finish_queue_dump(&state);
2383 ovs_mutex_unlock(&netdev->mutex);
/* Retrieves 'netdev_''s IPv4 address and netmask, caching the result of the
 * SIOCGIFADDR/SIOCGIFNETMASK ioctls. Returns EADDRNOTAVAIL when no address
 * is assigned. */
2389 netdev_linux_get_in4(const struct netdev *netdev_,
2390 struct in_addr *address, struct in_addr *netmask)
2392 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2395 ovs_mutex_lock(&netdev->mutex);
2396 if (!(netdev->cache_valid & VALID_IN4)) {
2397 error = netdev_linux_get_ipv4(netdev_, &netdev->address,
2398 SIOCGIFADDR, "SIOCGIFADDR");
2400 error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
2401 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2403 netdev->cache_valid |= VALID_IN4;
/* INADDR_ANY in the cache means "no address assigned". */
2411 if (netdev->address.s_addr != INADDR_ANY) {
2412 *address = netdev->address;
2413 *netmask = netdev->netmask;
2415 error = EADDRNOTAVAIL;
2418 ovs_mutex_unlock(&netdev->mutex);
/* Assigns 'address'/'netmask' to 'netdev_' via SIOCSIFADDR/SIOCSIFNETMASK
 * and updates the IPv4 cache. The netmask is only set for a real address. */
2424 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2425 struct in_addr netmask)
2427 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2430 ovs_mutex_lock(&netdev->mutex);
2431 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2433 netdev->cache_valid |= VALID_IN4;
2434 netdev->address = address;
2435 netdev->netmask = netmask;
2436 if (address.s_addr != INADDR_ANY) {
2437 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2438 "SIOCSIFNETMASK", netmask);
2441 ovs_mutex_unlock(&netdev->mutex);
/* Parses one line of /proc/net/if_inet6: 32 hex digits of IPv6 address
 * followed by four fields that are skipped, then the interface name.
 * Returns true on a successful parse. */
2447 parse_if_inet6_line(const char *line,
2448 struct in6_addr *in6, char ifname[16 + 1])
2450 uint8_t *s6 = in6->s6_addr;
2451 #define X8 "%2"SCNx8
2452 return ovs_scan(line,
2453 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
2454 "%*x %*x %*x %*x %16s\n",
2455 &s6[0], &s6[1], &s6[2], &s6[3],
2456 &s6[4], &s6[5], &s6[6], &s6[7],
2457 &s6[8], &s6[9], &s6[10], &s6[11],
2458 &s6[12], &s6[13], &s6[14], &s6[15],
2462 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2463 * 'in6' is non-null) and returns true. Otherwise, returns false. */
2465 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2467 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2469 ovs_mutex_lock(&netdev->mutex);
2470 if (!(netdev->cache_valid & VALID_IN6)) {
/* Scan /proc/net/if_inet6 for a line matching this device's name;
 * default to in6addr_any if none is found (or the file can't be read). */
2474 netdev->in6 = in6addr_any;
2476 file = fopen("/proc/net/if_inet6", "r");
2478 const char *name = netdev_get_name(netdev_);
2479 while (fgets(line, sizeof line, file)) {
2480 struct in6_addr in6_tmp;
2481 char ifname[16 + 1];
2482 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2483 && !strcmp(name, ifname))
2485 netdev->in6 = in6_tmp;
2491 netdev->cache_valid |= VALID_IN6;
2494 ovs_mutex_unlock(&netdev->mutex);
/* Fills '*sa' with an AF_INET sockaddr holding 'addr', zeroing any trailing
 * bytes of the (larger) generic sockaddr. */
2500 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
2502 struct sockaddr_in sin;
2503 memset(&sin, 0, sizeof sin);
2504 sin.sin_family = AF_INET;
2505 sin.sin_addr = addr;
2508 memset(sa, 0, sizeof *sa);
2509 memcpy(sa, &sin, sizeof sin);
/* Applies an address-setting ioctl ('ioctl_nr', e.g. SIOCSIFADDR) on
 * 'netdev' with 'addr' packed into ifr_addr. */
2513 do_set_addr(struct netdev *netdev,
2514 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2518 make_in4_sockaddr(&ifr.ifr_addr, addr);
2519 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2523 /* Adds 'router' as a default IP gateway. */
2525 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2527 struct in_addr any = { INADDR_ANY };
2531 memset(&rt, 0, sizeof rt);
2532 make_in4_sockaddr(&rt.rt_dst, any);
2533 make_in4_sockaddr(&rt.rt_gateway, router);
2534 make_in4_sockaddr(&rt.rt_genmask, any);
2535 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2536 error = af_inet_ioctl(SIOCADDRT, &rt);
2538 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
2544 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2547 static const char fn[] = "/proc/net/route";
2552 *netdev_name = NULL;
2553 stream = fopen(fn, "r");
2554 if (stream == NULL) {
2555 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
2560 while (fgets(line, sizeof line, stream)) {
2563 ovs_be32 dest, gateway, mask;
2564 int refcnt, metric, mtu;
2565 unsigned int flags, use, window, irtt;
2568 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2570 iface, &dest, &gateway, &flags, &refcnt,
2571 &use, &metric, &mask, &mtu, &window, &irtt)) {
2572 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2576 if (!(flags & RTF_UP)) {
2577 /* Skip routes that aren't up. */
2581 /* The output of 'dest', 'mask', and 'gateway' were given in
2582 * network byte order, so we don't need need any endian
2583 * conversions here. */
2584 if ((dest & mask) == (host->s_addr & mask)) {
2586 /* The host is directly reachable. */
2587 next_hop->s_addr = 0;
2589 /* To reach the host, we must go through a gateway. */
2590 next_hop->s_addr = gateway;
2592 *netdev_name = xstrdup(iface);
2604 netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
2606 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2609 ovs_mutex_lock(&netdev->mutex);
2610 if (!(netdev->cache_valid & VALID_DRVINFO)) {
2611 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
2613 COVERAGE_INC(netdev_get_ethtool);
2614 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2615 error = netdev_linux_do_ethtool(netdev->up.name,
2618 "ETHTOOL_GDRVINFO");
2620 netdev->cache_valid |= VALID_DRVINFO;
2625 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2626 smap_add(smap, "driver_version", netdev->drvinfo.version);
2627 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
2629 ovs_mutex_unlock(&netdev->mutex);
2635 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2638 smap_add(smap, "driver_name", "openvswitch");
2642 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2643 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2644 * returns 0. Otherwise, it returns a positive errno value; in particular,
2645 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2647 netdev_linux_arp_lookup(const struct netdev *netdev,
2648 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2651 struct sockaddr_in sin;
2654 memset(&r, 0, sizeof r);
2655 memset(&sin, 0, sizeof sin);
2656 sin.sin_family = AF_INET;
2657 sin.sin_addr.s_addr = ip;
2659 memcpy(&r.arp_pa, &sin, sizeof sin);
2660 r.arp_ha.sa_family = ARPHRD_ETHER;
2662 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2663 COVERAGE_INC(netdev_arp_lookup);
2664 retval = af_inet_ioctl(SIOCGARP, &r);
2666 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2667 } else if (retval != ENXIO) {
2668 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2669 netdev_get_name(netdev), IP_ARGS(ip),
2670 ovs_strerror(retval));
2676 nd_to_iff_flags(enum netdev_flags nd)
2679 if (nd & NETDEV_UP) {
2682 if (nd & NETDEV_PROMISC) {
2685 if (nd & NETDEV_LOOPBACK) {
2686 iff |= IFF_LOOPBACK;
2692 iff_to_nd_flags(int iff)
2694 enum netdev_flags nd = 0;
2698 if (iff & IFF_PROMISC) {
2699 nd |= NETDEV_PROMISC;
2701 if (iff & IFF_LOOPBACK) {
2702 nd |= NETDEV_LOOPBACK;
2708 update_flags(struct netdev_linux *netdev, enum netdev_flags off,
2709 enum netdev_flags on, enum netdev_flags *old_flagsp)
2710 OVS_REQUIRES(netdev->mutex)
2712 int old_flags, new_flags;
2715 old_flags = netdev->ifi_flags;
2716 *old_flagsp = iff_to_nd_flags(old_flags);
2717 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2718 if (new_flags != old_flags) {
2719 error = set_flags(netdev_get_name(&netdev->up), new_flags);
2720 get_flags(&netdev->up, &netdev->ifi_flags);
2727 netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2728 enum netdev_flags on, enum netdev_flags *old_flagsp)
2730 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2733 ovs_mutex_lock(&netdev->mutex);
2734 error = update_flags(netdev, off, on, old_flagsp);
2735 ovs_mutex_unlock(&netdev->mutex);
2740 #define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS, \
2741 GET_FEATURES, GET_STATUS) \
2747 netdev_linux_wait, \
2749 netdev_linux_alloc, \
2751 netdev_linux_destruct, \
2752 netdev_linux_dealloc, \
2753 NULL, /* get_config */ \
2754 NULL, /* set_config */ \
2755 NULL, /* get_tunnel_config */ \
2756 NULL, /* build header */ \
2757 NULL, /* push header */ \
2758 NULL, /* pop header */ \
2759 NULL, /* get_numa_id */ \
2760 NULL, /* set_multiq */ \
2762 netdev_linux_send, \
2763 netdev_linux_send_wait, \
2765 netdev_linux_set_etheraddr, \
2766 netdev_linux_get_etheraddr, \
2767 netdev_linux_get_mtu, \
2768 netdev_linux_set_mtu, \
2769 netdev_linux_get_ifindex, \
2770 netdev_linux_get_carrier, \
2771 netdev_linux_get_carrier_resets, \
2772 netdev_linux_set_miimon_interval, \
2776 netdev_linux_set_advertisements, \
2778 netdev_linux_set_policing, \
2779 netdev_linux_get_qos_types, \
2780 netdev_linux_get_qos_capabilities, \
2781 netdev_linux_get_qos, \
2782 netdev_linux_set_qos, \
2783 netdev_linux_get_queue, \
2784 netdev_linux_set_queue, \
2785 netdev_linux_delete_queue, \
2786 netdev_linux_get_queue_stats, \
2787 netdev_linux_queue_dump_start, \
2788 netdev_linux_queue_dump_next, \
2789 netdev_linux_queue_dump_done, \
2790 netdev_linux_dump_queue_stats, \
2792 netdev_linux_get_in4, \
2793 netdev_linux_set_in4, \
2794 netdev_linux_get_in6, \
2795 netdev_linux_add_router, \
2796 netdev_linux_get_next_hop, \
2798 netdev_linux_arp_lookup, \
2800 netdev_linux_update_flags, \
2802 netdev_linux_rxq_alloc, \
2803 netdev_linux_rxq_construct, \
2804 netdev_linux_rxq_destruct, \
2805 netdev_linux_rxq_dealloc, \
2806 netdev_linux_rxq_recv, \
2807 netdev_linux_rxq_wait, \
2808 netdev_linux_rxq_drain, \
2811 const struct netdev_class netdev_linux_class =
2814 netdev_linux_construct,
2815 netdev_linux_get_stats,
2816 netdev_linux_get_features,
2817 netdev_linux_get_status);
2819 const struct netdev_class netdev_tap_class =
2822 netdev_linux_construct_tap,
2823 netdev_tap_get_stats,
2824 netdev_linux_get_features,
2825 netdev_linux_get_status);
2827 const struct netdev_class netdev_internal_class =
2830 netdev_linux_construct,
2831 netdev_internal_get_stats,
2832 NULL, /* get_features */
2833 netdev_internal_get_status);
2835 /* HTB traffic control class. */
2837 #define HTB_N_QUEUES 0xf000
2841 unsigned int max_rate; /* In bytes/s. */
2845 struct tc_queue tc_queue;
2846 unsigned int min_rate; /* In bytes/s. */
2847 unsigned int max_rate; /* In bytes/s. */
2848 unsigned int burst; /* In bytes. */
2849 unsigned int priority; /* Lower values are higher priorities. */
2853 htb_get__(const struct netdev *netdev_)
2855 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2856 return CONTAINER_OF(netdev->tc, struct htb, tc);
2860 htb_install__(struct netdev *netdev_, uint64_t max_rate)
2862 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2865 htb = xmalloc(sizeof *htb);
2866 tc_init(&htb->tc, &tc_ops_htb);
2867 htb->max_rate = max_rate;
2869 netdev->tc = &htb->tc;
2872 /* Create an HTB qdisc.
2874 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2876 htb_setup_qdisc__(struct netdev *netdev)
2879 struct tc_htb_glob opt;
2880 struct ofpbuf request;
2881 struct tcmsg *tcmsg;
2883 tc_del_qdisc(netdev);
2885 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2886 NLM_F_EXCL | NLM_F_CREATE, &request);
2890 tcmsg->tcm_handle = tc_make_handle(1, 0);
2891 tcmsg->tcm_parent = TC_H_ROOT;
2893 nl_msg_put_string(&request, TCA_KIND, "htb");
2895 memset(&opt, 0, sizeof opt);
2896 opt.rate2quantum = 10;
2900 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2901 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2902 nl_msg_end_nested(&request, opt_offset);
2904 return tc_transact(&request, NULL);
2907 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2908 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2910 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2911 unsigned int parent, struct htb_class *class)
2914 struct tc_htb_opt opt;
2915 struct ofpbuf request;
2916 struct tcmsg *tcmsg;
2920 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
2922 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2923 netdev_get_name(netdev));
2927 memset(&opt, 0, sizeof opt);
2928 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2929 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2930 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2931 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2932 opt.prio = class->priority;
2934 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2938 tcmsg->tcm_handle = handle;
2939 tcmsg->tcm_parent = parent;
2941 nl_msg_put_string(&request, TCA_KIND, "htb");
2942 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2943 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2944 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2945 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2946 nl_msg_end_nested(&request, opt_offset);
2948 error = tc_transact(&request, NULL);
2950 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2951 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2952 netdev_get_name(netdev),
2953 tc_get_major(handle), tc_get_minor(handle),
2954 tc_get_major(parent), tc_get_minor(parent),
2955 class->min_rate, class->max_rate,
2956 class->burst, class->priority, ovs_strerror(error));
2961 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2962 * description of them into 'details'. The description complies with the
2963 * specification given in the vswitch database documentation for linux-htb
2966 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2968 static const struct nl_policy tca_htb_policy[] = {
2969 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2970 .min_len = sizeof(struct tc_htb_opt) },
2973 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2974 const struct tc_htb_opt *htb;
2976 if (!nl_parse_nested(nl_options, tca_htb_policy,
2977 attrs, ARRAY_SIZE(tca_htb_policy))) {
2978 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2982 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2983 class->min_rate = htb->rate.rate;
2984 class->max_rate = htb->ceil.rate;
2985 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2986 class->priority = htb->prio;
2991 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2992 struct htb_class *options,
2993 struct netdev_queue_stats *stats)
2995 struct nlattr *nl_options;
2996 unsigned int handle;
2999 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3000 if (!error && queue_id) {
3001 unsigned int major = tc_get_major(handle);
3002 unsigned int minor = tc_get_minor(handle);
3003 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3004 *queue_id = minor - 1;
3009 if (!error && options) {
3010 error = htb_parse_tca_options__(nl_options, options);
3016 htb_parse_qdisc_details__(struct netdev *netdev_,
3017 const struct smap *details, struct htb_class *hc)
3019 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3020 const char *max_rate_s;
3022 max_rate_s = smap_get(details, "max-rate");
3023 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3024 if (!hc->max_rate) {
3025 enum netdev_features current;
3027 netdev_linux_read_features(netdev);
3028 current = !netdev->get_features_error ? netdev->current : 0;
3029 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3031 hc->min_rate = hc->max_rate;
3037 htb_parse_class_details__(struct netdev *netdev,
3038 const struct smap *details, struct htb_class *hc)
3040 const struct htb *htb = htb_get__(netdev);
3041 const char *min_rate_s = smap_get(details, "min-rate");
3042 const char *max_rate_s = smap_get(details, "max-rate");
3043 const char *burst_s = smap_get(details, "burst");
3044 const char *priority_s = smap_get(details, "priority");
3047 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3049 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
3050 netdev_get_name(netdev));
3054 /* HTB requires at least an mtu sized min-rate to send any traffic even
3055 * on uncongested links. */
3056 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3057 hc->min_rate = MAX(hc->min_rate, mtu);
3058 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
3061 hc->max_rate = (max_rate_s
3062 ? strtoull(max_rate_s, NULL, 10) / 8
3064 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
3065 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
3069 * According to hints in the documentation that I've read, it is important
3070 * that 'burst' be at least as big as the largest frame that might be
3071 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
3072 * but having it a bit too small is a problem. Since netdev_get_mtu()
3073 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
3074 * the MTU. We actually add 64, instead of 14, as a guard against
3075 * additional headers get tacked on somewhere that we're not aware of. */
3076 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
3077 hc->burst = MAX(hc->burst, mtu + 64);
3080 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
3086 htb_query_class__(const struct netdev *netdev, unsigned int handle,
3087 unsigned int parent, struct htb_class *options,
3088 struct netdev_queue_stats *stats)
3090 struct ofpbuf *reply;
3093 error = tc_query_class(netdev, handle, parent, &reply);
3095 error = htb_parse_tcmsg__(reply, NULL, options, stats);
3096 ofpbuf_delete(reply);
3102 htb_tc_install(struct netdev *netdev, const struct smap *details)
3106 error = htb_setup_qdisc__(netdev);
3108 struct htb_class hc;
3110 htb_parse_qdisc_details__(netdev, details, &hc);
3111 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3112 tc_make_handle(1, 0), &hc);
3114 htb_install__(netdev, hc.max_rate);
3120 static struct htb_class *
3121 htb_class_cast__(const struct tc_queue *queue)
3123 return CONTAINER_OF(queue, struct htb_class, tc_queue);
3127 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
3128 const struct htb_class *hc)
3130 struct htb *htb = htb_get__(netdev);
3131 size_t hash = hash_int(queue_id, 0);
3132 struct tc_queue *queue;
3133 struct htb_class *hcp;
3135 queue = tc_find_queue__(netdev, queue_id, hash);
3137 hcp = htb_class_cast__(queue);
3139 hcp = xmalloc(sizeof *hcp);
3140 queue = &hcp->tc_queue;
3141 queue->queue_id = queue_id;
3142 queue->created = time_msec();
3143 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
3146 hcp->min_rate = hc->min_rate;
3147 hcp->max_rate = hc->max_rate;
3148 hcp->burst = hc->burst;
3149 hcp->priority = hc->priority;
3153 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3156 struct queue_dump_state state;
3157 struct htb_class hc;
3159 /* Get qdisc options. */
3161 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3162 htb_install__(netdev, hc.max_rate);
3165 if (!start_queue_dump(netdev, &state)) {
3168 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
3169 unsigned int queue_id;
3171 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3172 htb_update_queue__(netdev, queue_id, &hc);
3175 finish_queue_dump(&state);
3181 htb_tc_destroy(struct tc *tc)
3183 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
3184 struct htb_class *hc, *next;
3186 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
3187 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
3195 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
3197 const struct htb *htb = htb_get__(netdev);
3198 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
3203 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
3205 struct htb_class hc;
3208 htb_parse_qdisc_details__(netdev, details, &hc);
3209 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3210 tc_make_handle(1, 0), &hc);
3212 htb_get__(netdev)->max_rate = hc.max_rate;
3218 htb_class_get(const struct netdev *netdev OVS_UNUSED,
3219 const struct tc_queue *queue, struct smap *details)
3221 const struct htb_class *hc = htb_class_cast__(queue);
3223 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3224 if (hc->min_rate != hc->max_rate) {
3225 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3227 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
3229 smap_add_format(details, "priority", "%u", hc->priority);
3235 htb_class_set(struct netdev *netdev, unsigned int queue_id,
3236 const struct smap *details)
3238 struct htb_class hc;
3241 error = htb_parse_class_details__(netdev, details, &hc);
3246 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3247 tc_make_handle(1, 0xfffe), &hc);
3252 htb_update_queue__(netdev, queue_id, &hc);
3257 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
3259 struct htb_class *hc = htb_class_cast__(queue);
3260 struct htb *htb = htb_get__(netdev);
3263 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3265 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
3272 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3273 struct netdev_queue_stats *stats)
3275 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3276 tc_make_handle(1, 0xfffe), NULL, stats);
3280 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3281 const struct ofpbuf *nlmsg,
3282 netdev_dump_queue_stats_cb *cb, void *aux)
3284 struct netdev_queue_stats stats;
3285 unsigned int handle, major, minor;
3288 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3293 major = tc_get_major(handle);
3294 minor = tc_get_minor(handle);
3295 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3296 (*cb)(minor - 1, &stats, aux);
3301 static const struct tc_ops tc_ops_htb = {
3302 "htb", /* linux_name */
3303 "linux-htb", /* ovs_name */
3304 HTB_N_QUEUES, /* n_queues */
3313 htb_class_get_stats,
3314 htb_class_dump_stats
3317 /* "linux-hfsc" traffic control class. */
3319 #define HFSC_N_QUEUES 0xf000
3327 struct tc_queue tc_queue;
3332 static struct hfsc *
3333 hfsc_get__(const struct netdev *netdev_)
3335 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3336 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
3339 static struct hfsc_class *
3340 hfsc_class_cast__(const struct tc_queue *queue)
3342 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
3346 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
3348 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3351 hfsc = xmalloc(sizeof *hfsc);
3352 tc_init(&hfsc->tc, &tc_ops_hfsc);
3353 hfsc->max_rate = max_rate;
3354 netdev->tc = &hfsc->tc;
3358 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
3359 const struct hfsc_class *hc)
3363 struct hfsc_class *hcp;
3364 struct tc_queue *queue;
3366 hfsc = hfsc_get__(netdev);
3367 hash = hash_int(queue_id, 0);
3369 queue = tc_find_queue__(netdev, queue_id, hash);
3371 hcp = hfsc_class_cast__(queue);
3373 hcp = xmalloc(sizeof *hcp);
3374 queue = &hcp->tc_queue;
3375 queue->queue_id = queue_id;
3376 queue->created = time_msec();
3377 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
3380 hcp->min_rate = hc->min_rate;
3381 hcp->max_rate = hc->max_rate;
3385 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
3387 const struct tc_service_curve *rsc, *fsc, *usc;
3388 static const struct nl_policy tca_hfsc_policy[] = {
3390 .type = NL_A_UNSPEC,
3392 .min_len = sizeof(struct tc_service_curve),
3395 .type = NL_A_UNSPEC,
3397 .min_len = sizeof(struct tc_service_curve),
3400 .type = NL_A_UNSPEC,
3402 .min_len = sizeof(struct tc_service_curve),
3405 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3407 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3408 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3409 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3413 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3414 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3415 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
3417 if (rsc->m1 != 0 || rsc->d != 0 ||
3418 fsc->m1 != 0 || fsc->d != 0 ||
3419 usc->m1 != 0 || usc->d != 0) {
3420 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3421 "Non-linear service curves are not supported.");
3425 if (rsc->m2 != fsc->m2) {
3426 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3427 "Real-time service curves are not supported ");
3431 if (rsc->m2 > usc->m2) {
3432 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3433 "Min-rate service curve is greater than "
3434 "the max-rate service curve.");
3438 class->min_rate = fsc->m2;
3439 class->max_rate = usc->m2;
3444 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3445 struct hfsc_class *options,
3446 struct netdev_queue_stats *stats)
3449 unsigned int handle;
3450 struct nlattr *nl_options;
3452 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3458 unsigned int major, minor;
3460 major = tc_get_major(handle);
3461 minor = tc_get_minor(handle);
3462 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3463 *queue_id = minor - 1;
3470 error = hfsc_parse_tca_options__(nl_options, options);
3477 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
3478 unsigned int parent, struct hfsc_class *options,
3479 struct netdev_queue_stats *stats)
3482 struct ofpbuf *reply;
3484 error = tc_query_class(netdev, handle, parent, &reply);
3489 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
3490 ofpbuf_delete(reply);
3495 hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
3496 struct hfsc_class *class)
3498 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3500 const char *max_rate_s;
3502 max_rate_s = smap_get(details, "max-rate");
3503 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3506 enum netdev_features current;
3508 netdev_linux_read_features(netdev);
3509 current = !netdev->get_features_error ? netdev->current : 0;
3510 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3513 class->min_rate = max_rate;
3514 class->max_rate = max_rate;
3518 hfsc_parse_class_details__(struct netdev *netdev,
3519 const struct smap *details,
3520 struct hfsc_class * class)
3522 const struct hfsc *hfsc;
3523 uint32_t min_rate, max_rate;
3524 const char *min_rate_s, *max_rate_s;
3526 hfsc = hfsc_get__(netdev);
3527 min_rate_s = smap_get(details, "min-rate");
3528 max_rate_s = smap_get(details, "max-rate");
3530 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3531 min_rate = MAX(min_rate, 1);
3532 min_rate = MIN(min_rate, hfsc->max_rate);
3534 max_rate = (max_rate_s
3535 ? strtoull(max_rate_s, NULL, 10) / 8
3537 max_rate = MAX(max_rate, min_rate);
3538 max_rate = MIN(max_rate, hfsc->max_rate);
3540 class->min_rate = min_rate;
3541 class->max_rate = max_rate;
3546 /* Create an HFSC qdisc.
3548 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3550 hfsc_setup_qdisc__(struct netdev * netdev)
3552 struct tcmsg *tcmsg;
3553 struct ofpbuf request;
3554 struct tc_hfsc_qopt opt;
3556 tc_del_qdisc(netdev);
3558 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3559 NLM_F_EXCL | NLM_F_CREATE, &request);
3565 tcmsg->tcm_handle = tc_make_handle(1, 0);
3566 tcmsg->tcm_parent = TC_H_ROOT;
3568 memset(&opt, 0, sizeof opt);
3571 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3572 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3574 return tc_transact(&request, NULL);
3577 /* Create an HFSC class.
3579 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3580 * sc rate <min_rate> ul rate <max_rate>" */
3582 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3583 unsigned int parent, struct hfsc_class *class)
3587 struct tcmsg *tcmsg;
3588 struct ofpbuf request;
3589 struct tc_service_curve min, max;
3591 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3597 tcmsg->tcm_handle = handle;
3598 tcmsg->tcm_parent = parent;
3602 min.m2 = class->min_rate;
3606 max.m2 = class->max_rate;
3608 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3609 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3610 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3611 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3612 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3613 nl_msg_end_nested(&request, opt_offset);
3615 error = tc_transact(&request, NULL);
3617 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3618 "min-rate %ubps, max-rate %ubps (%s)",
3619 netdev_get_name(netdev),
3620 tc_get_major(handle), tc_get_minor(handle),
3621 tc_get_major(parent), tc_get_minor(parent),
3622 class->min_rate, class->max_rate, ovs_strerror(error));
3629 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
3632 struct hfsc_class class;
3634 error = hfsc_setup_qdisc__(netdev);
3640 hfsc_parse_qdisc_details__(netdev, details, &class);
3641 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3642 tc_make_handle(1, 0), &class);
3648 hfsc_install__(netdev, class.max_rate);
3653 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3656 struct queue_dump_state state;
3657 struct hfsc_class hc;
3660 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3661 hfsc_install__(netdev, hc.max_rate);
3663 if (!start_queue_dump(netdev, &state)) {
3667 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
3668 unsigned int queue_id;
3670 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3671 hfsc_update_queue__(netdev, queue_id, &hc);
3675 finish_queue_dump(&state);
3680 hfsc_tc_destroy(struct tc *tc)
3683 struct hfsc_class *hc, *next;
3685 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3687 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3688 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3697 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
3699 const struct hfsc *hfsc;
3700 hfsc = hfsc_get__(netdev);
3701 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
3706 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
3709 struct hfsc_class class;
3711 hfsc_parse_qdisc_details__(netdev, details, &class);
3712 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3713 tc_make_handle(1, 0), &class);
3716 hfsc_get__(netdev)->max_rate = class.max_rate;
3723 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3724 const struct tc_queue *queue, struct smap *details)
3726 const struct hfsc_class *hc;
3728 hc = hfsc_class_cast__(queue);
3729 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3730 if (hc->min_rate != hc->max_rate) {
3731 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3737 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3738 const struct smap *details)
3741 struct hfsc_class class;
3743 error = hfsc_parse_class_details__(netdev, details, &class);
3748 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3749 tc_make_handle(1, 0xfffe), &class);
3754 hfsc_update_queue__(netdev, queue_id, &class);
3759 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3763 struct hfsc_class *hc;
3765 hc = hfsc_class_cast__(queue);
3766 hfsc = hfsc_get__(netdev);
3768 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3770 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3777 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3778 struct netdev_queue_stats *stats)
3780 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3781 tc_make_handle(1, 0xfffe), NULL, stats);
3785 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3786 const struct ofpbuf *nlmsg,
3787 netdev_dump_queue_stats_cb *cb, void *aux)
3789 struct netdev_queue_stats stats;
3790 unsigned int handle, major, minor;
3793 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3798 major = tc_get_major(handle);
3799 minor = tc_get_minor(handle);
3800 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3801 (*cb)(minor - 1, &stats, aux);
3806 static const struct tc_ops tc_ops_hfsc = {
3807 "hfsc", /* linux_name */
3808 "linux-hfsc", /* ovs_name */
3809 HFSC_N_QUEUES, /* n_queues */
3810 hfsc_tc_install, /* tc_install */
3811 hfsc_tc_load, /* tc_load */
3812 hfsc_tc_destroy, /* tc_destroy */
3813 hfsc_qdisc_get, /* qdisc_get */
3814 hfsc_qdisc_set, /* qdisc_set */
3815 hfsc_class_get, /* class_get */
3816 hfsc_class_set, /* class_set */
3817 hfsc_class_delete, /* class_delete */
3818 hfsc_class_get_stats, /* class_get_stats */
3819 hfsc_class_dump_stats /* class_dump_stats */
3822 /* "linux-default" traffic control class.
3824 * This class represents the default, unnamed Linux qdisc. It corresponds to
3825 * the "" (empty string) QoS type in the OVS database. */
3828 default_install__(struct netdev *netdev_)
3830 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3831 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
3833 /* Nothing but a tc class implementation is allowed to write to a tc. This
3834 * class never does that, so we can legitimately use a const tc object. */
3835 netdev->tc = CONST_CAST(struct tc *, &tc);
3839 default_tc_install(struct netdev *netdev,
3840 const struct smap *details OVS_UNUSED)
3842 default_install__(netdev);
3847 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3849 default_install__(netdev);
3853 static const struct tc_ops tc_ops_default = {
3854 NULL, /* linux_name */
3859 NULL, /* tc_destroy */
3860 NULL, /* qdisc_get */
3861 NULL, /* qdisc_set */
3862 NULL, /* class_get */
3863 NULL, /* class_set */
3864 NULL, /* class_delete */
3865 NULL, /* class_get_stats */
3866 NULL /* class_dump_stats */
3869 /* "linux-other" traffic control class.
3874 other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
3876 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3877 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
3879 /* Nothing but a tc class implementation is allowed to write to a tc. This
3880 * class never does that, so we can legitimately use a const tc object. */
3881 netdev->tc = CONST_CAST(struct tc *, &tc);
3885 static const struct tc_ops tc_ops_other = {
3886 NULL, /* linux_name */
3887 "linux-other", /* ovs_name */
3889 NULL, /* tc_install */
3891 NULL, /* tc_destroy */
3892 NULL, /* qdisc_get */
3893 NULL, /* qdisc_set */
3894 NULL, /* class_get */
3895 NULL, /* class_set */
3896 NULL, /* class_delete */
3897 NULL, /* class_get_stats */
3898 NULL /* class_dump_stats */
3901 /* Traffic control. */
3903 /* Number of kernel "tc" ticks per second. */
/* Derived lazily from /proc/net/psched (see the psched-reading code below);
 * as a file-scope static it is zero until that initialization runs. */
3904 static double ticks_per_s;
3906 /* Number of kernel "jiffies" per second. This is used for the purpose of
3907 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3908 * one jiffy's worth of data.
3910 * There are two possibilities here:
3912 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3913 * approximate range of 100 to 1024. That means that we really need to
3914 * make sure that the qdisc can buffer that much data.
3916 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3917 * has finely granular timers and there's no need to fudge additional room
3918 * for buffers. (There's no extra effort needed to implement that: the
3919 * large 'buffer_hz' is used as a divisor, so practically any number will
3920 * come out as 0 in the division. Small integer results in the case of
3921 * really high dividends won't have any real effect anyhow.)
3923 static unsigned int buffer_hz;
3925 /* Returns tc handle 'major':'minor'. */
3927 tc_make_handle(unsigned int major, unsigned int minor)
/* TC_H_MAKE() expects its first argument pre-shifted into the high 16 bits,
 * hence 'major << 16'. */
3929 return TC_H_MAKE(major << 16, minor);
3932 /* Returns the major number from 'handle'. */
3934 tc_get_major(unsigned int handle)
/* TC_H_MAJ() leaves the major number in the high 16 bits; shift it down. */
3936 return TC_H_MAJ(handle) >> 16;
3939 /* Returns the minor number from 'handle'. */
3941 tc_get_minor(unsigned int handle)
/* TC_H_MIN() masks the minor into the low 16 bits; no shift needed. */
3943 return TC_H_MIN(handle);
/* Composes a Netlink "tc" request of the given RTM_* 'type' for 'netdev',
 * initializing 'request' to hold it, and returns a pointer to the tcmsg
 * header placed inside 'request' so the caller can finish filling it in.
 * NOTE(review): the get_ifindex() failure path is not visible here —
 * presumably it returns NULL without initializing 'request'; confirm at
 * call sites before relying on that. */
3946 static struct tcmsg *
3947 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3948 struct ofpbuf *request)
3950 struct tcmsg *tcmsg;
3954 error = get_ifindex(netdev, &ifindex);
3959 ofpbuf_init(request, 512);
3960 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3961 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3962 tcmsg->tcm_family = AF_UNSPEC;
3963 tcmsg->tcm_ifindex = ifindex;
3964 /* Caller should fill in tcmsg->tcm_handle. */
3965 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' over a NETLINK_ROUTE socket and, if 'replyp' is nonnull,
 * stores the kernel's reply in '*replyp'.  Uninitializes 'request' in all
 * cases, so the caller must not reuse it.  Returns nl_transact()'s result
 * (0 on success, otherwise a positive errno value). */
3971 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3973 int error = nl_transact(NETLINK_ROUTE, request, replyp);
3974 ofpbuf_uninit(request);
3978 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3979 * policing configuration.
3981 * This function is equivalent to running the following when 'add' is true:
3982 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3984 * This function is equivalent to running the following when 'add' is false:
3985 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3987 * The configuration and stats may be seen with the following command:
3988 * /sbin/tc -s qdisc show dev <devname>
3990 * Returns 0 if successful, otherwise a positive errno value.
3993 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3995 struct ofpbuf request;
3996 struct tcmsg *tcmsg;
/* Adding must not clobber an existing ingress qdisc (NLM_F_EXCL); deleting
 * takes no special flags. */
3998 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3999 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
4001 tcmsg = tc_make_request(netdev, type, flags, &request);
/* ffff: is the conventional handle for the ingress qdisc. */
4005 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
4006 tcmsg->tcm_parent = TC_H_INGRESS;
4007 nl_msg_put_string(&request, TCA_KIND, "ingress");
4008 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
4010 error = tc_transact(&request, NULL);
4012 /* If we're deleting the qdisc, don't worry about some of the
4013 * error conditions.  (ENOENT and EINVAL both indicate there was no
4014 * ingress qdisc to remove, which is the state we wanted anyway.) */
4014 if (!add && (error == ENOENT || error == EINVAL)) {
4023 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
4026 * This function is equivalent to running:
4027 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
4028 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
4031 * The configuration and stats may be seen with the following command:
4032 * /sbin/tc -s filter show dev <devname> parent ffff:
4034 * Returns 0 if successful, otherwise a positive errno value.
4037 tc_add_policer(struct netdev *netdev,
4038 uint32_t kbits_rate, uint32_t kbits_burst)
4040 struct tc_police tc_police;
4041 struct ofpbuf request;
4042 struct tcmsg *tcmsg;
4043 size_t basic_offset;
4044 size_t police_offset;
4048 memset(&tc_police, 0, sizeof tc_police);
/* TC_POLICE_SHOT: drop packets that exceed the configured rate. */
4049 tc_police.action = TC_POLICE_SHOT;
4050 tc_police.mtu = mtu;
/* Convert kbit/s to bytes/s for the kernel's rate table. */
4051 tc_fill_rate(&tc_police.rate, ((uint64_t) kbits_rate * 1000)/8, mtu);
4053 /* The following appears wrong in two ways:
4055 * - tc_bytes_to_ticks() should take "bytes" as quantity for both of its
4056 * arguments (or at least consistently "bytes" as both or "bits" as
4057 * both), but this supplies bytes for the first argument and bits for the
4060 * - In networking a kilobit is usually 1000 bits but this uses 1024 bits.
4062 * However if you "fix" those problems then "tc filter show ..." shows
4063 * "125000b", meaning 125,000 bits, when OVS configures it for 1000 kbit ==
4064 * 1,000,000 bits, whereas this actually ends up doing the right thing from
4065 * tc's point of view. Whatever. */
4066 tc_police.burst = tc_bytes_to_ticks(
4067 tc_police.rate.rate, MIN(UINT32_MAX / 1024, kbits_burst) * 1024);
4069 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
4070 NLM_F_EXCL | NLM_F_CREATE, &request);
/* Attach under the ffff: ingress qdisc, priority 49, matching all
 * protocols (tcm_info packs priority and protocol together). */
4074 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
4075 tcmsg->tcm_info = tc_make_handle(49,
4076 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
/* "basic" classifier with a nested police action. */
4078 nl_msg_put_string(&request, TCA_KIND, "basic");
4079 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4080 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
4081 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
4082 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
4083 nl_msg_end_nested(&request, police_offset);
4084 nl_msg_end_nested(&request, basic_offset);
4086 error = tc_transact(&request, NULL);
4097 /* The values in psched are not individually very meaningful, but they are
4098 * important. The tables below show some values seen in the wild.
4102 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
4103 * (Before that, there are hints that it was 1000000000.)
4105 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
4109 * -----------------------------------
4110 * [1] 000c8000 000f4240 000f4240 00000064
4111 * [2] 000003e8 00000400 000f4240 3b9aca00
4112 * [3] 000003e8 00000400 000f4240 3b9aca00
4113 * [4] 000003e8 00000400 000f4240 00000064
4114 * [5] 000003e8 00000040 000f4240 3b9aca00
4115 * [6] 000003e8 00000040 000f4240 000000f9
4117 * a b c d ticks_per_s buffer_hz
4118 * ------- --------- ---------- ------------- ----------- -------------
4119 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
4120 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4121 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4122 * [4] 1,000 1,024 1,000,000 100 976,562 100
4123 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
4124 * [6] 1,000 64 1,000,000 249 15,625,000 249
4126 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
4127 * [2] 2.6.26-1-686-bigmem from Debian lenny
4128 * [3] 2.6.26-2-sparc64 from Debian lenny
4129 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
4130 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
4131 * [6] 2.6.34 from kernel.org on KVM
/* Runs at most once per process (ovsthread_once); parses the four hex
 * psched parameters and derives ticks_per_s = a * c / b.  On any failure
 * it logs a warning and leaves the defaults in place. */
4133 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4134 static const char fn[] = "/proc/net/psched";
4135 unsigned int a, b, c, d;
4138 if (!ovsthread_once_start(&once)) {
4145 stream = fopen(fn, "r");
4147 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
4151 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
4152 VLOG_WARN("%s: read failed", fn);
4156 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
4160 VLOG_WARN("%s: invalid scheduler parameters", fn);
4164 ticks_per_s = (double) a * c / b;
4168 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
4171 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
4174 ovsthread_once_done(&once);
4177 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
4178 * rate of 'rate' bytes per second. */
4180 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
/* NOTE(review): assumes ticks_per_s has been initialized from
 * /proc/net/psched before this point — confirm the elided call. */
4183 return (rate * ticks) / ticks_per_s;
4186 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
4187 * rate of 'rate' bytes per second. */
4189 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
/* Guard against division by zero when 'rate' is 0; the 64-bit intermediate
 * avoids overflow in ticks_per_s * size. */
4192 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
4195 /* Returns the number of bytes that need to be reserved for qdisc buffering at
4196 * a transmission rate of 'rate' bytes per second. */
4198 tc_buffer_per_jiffy(unsigned int rate)
/* When buffer_hz is huge (fine-grained kernel timers) this division yields
 * ~0, i.e. no extra buffering needed; see the comment on 'buffer_hz'. */
4201 return rate / buffer_hz;
4204 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
4205 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
4206 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
4207 * stores NULL into it if it is absent.
4209 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
4212 * Returns 0 if successful, otherwise a positive errno value. */
4214 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
4215 struct nlattr **options)
/* TCA_KIND is mandatory; TCA_OPTIONS may legitimately be absent. */
4217 static const struct nl_policy tca_policy[] = {
4218 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
4219 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
4221 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
/* Attributes start after the Netlink header plus the fixed tcmsg. */
4223 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4224 tca_policy, ta, ARRAY_SIZE(ta))) {
4225 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
4230 *kind = nl_attr_get_string(ta[TCA_KIND]);
4234 *options = ta[TCA_OPTIONS];
4249 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
4250 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
4251 * into '*options', and its queue statistics into '*stats'. Any of the output
4252 * arguments may be null.
4254 * Returns 0 if successful, otherwise a positive errno value. */
4256 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
4257 struct nlattr **options, struct netdev_queue_stats *stats)
4259 static const struct nl_policy tca_policy[] = {
4260 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
4261 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
4263 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4265 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4266 tca_policy, ta, ARRAY_SIZE(ta))) {
4267 VLOG_WARN_RL(&rl, "failed to parse class message");
/* The full 32-bit class handle (major:minor) comes from the tcmsg header. */
4272 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
4273 *handlep = tc->tcm_handle;
4277 *options = ta[TCA_OPTIONS];
/* Extract tx byte/packet/drop counters from the nested TCA_STATS2. */
4281 const struct gnet_stats_queue *gsq;
4282 struct gnet_stats_basic gsb;
4284 static const struct nl_policy stats_policy[] = {
4285 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
4286 .min_len = sizeof gsb },
4287 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
4288 .min_len = sizeof *gsq },
4290 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
4292 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
4293 sa, ARRAY_SIZE(sa))) {
4294 VLOG_WARN_RL(&rl, "failed to parse class stats");
4298 /* Alignment issues screw up the length of struct gnet_stats_basic on
4299 * some arch/bitsize combinations. Newer versions of Linux have a
4300 * struct gnet_stats_basic_packed, but we can't depend on that. The
4301 * easiest thing to do is just to make a copy. */
4302 memset(&gsb, 0, sizeof gsb);
4303 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
4304 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
4305 stats->tx_bytes = gsb.bytes;
4306 stats->tx_packets = gsb.packets;
4308 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
4309 stats->tx_errors = gsq->drops;
/* On a stats-parse failure the caller still gets zeroed stats. */
4319 memset(stats, 0, sizeof *stats);
4324 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
 * on 'netdev', storing the kernel's reply in '*replyp'.  Logs (rate-limited)
 * and returns a positive errno value on failure. */
4327 tc_query_class(const struct netdev *netdev,
4328 unsigned int handle, unsigned int parent,
4329 struct ofpbuf **replyp)
4331 struct ofpbuf request;
4332 struct tcmsg *tcmsg;
/* NLM_F_ECHO asks the kernel to echo the class back in the reply. */
4335 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
4339 tcmsg->tcm_handle = handle;
4340 tcmsg->tcm_parent = parent;
4342 error = tc_transact(&request, replyp);
4344 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
4345 netdev_get_name(netdev),
4346 tc_get_major(handle), tc_get_minor(handle),
4347 tc_get_major(parent), tc_get_minor(parent),
4348 ovs_strerror(error));
4353 /* Equivalent to "tc class del dev <name> handle <handle>". */
4355 tc_delete_class(const struct netdev *netdev, unsigned int handle)
4357 struct ofpbuf request;
4358 struct tcmsg *tcmsg;
4361 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
/* tcm_parent of 0 lets the kernel locate the class by handle alone. */
4365 tcmsg->tcm_handle = handle;
4366 tcmsg->tcm_parent = 0;
4368 error = tc_transact(&request, NULL);
4370 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
4371 netdev_get_name(netdev),
4372 tc_get_major(handle), tc_get_minor(handle),
4373 ovs_strerror(error));
4378 /* Equivalent to "tc qdisc del dev <name> root".  Also tears down OVS's
 * cached tc state for the device via the class's tc_destroy callback. */
4380 tc_del_qdisc(struct netdev *netdev_)
4382 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4383 struct ofpbuf request;
4384 struct tcmsg *tcmsg;
4387 tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
/* OVS-created root qdiscs use handle 1:0. */
4391 tcmsg->tcm_handle = tc_make_handle(1, 0);
4392 tcmsg->tcm_parent = TC_H_ROOT;
4394 error = tc_transact(&request, NULL);
4395 if (error == EINVAL) {
4396 /* EINVAL probably means that the default qdisc was in use, in which
4397 * case we've accomplished our purpose. */
4400 if (!error && netdev->tc) {
4401 if (netdev->tc->ops->tc_destroy) {
4402 netdev->tc->ops->tc_destroy(netdev->tc);
/* Returns true if this kernel can safely handle RTM_GETQDISC on built-in
 * qdiscs.  Kernels before 2.6.35 can OOPS on such a request (see the big
 * comment in tc_query_qdisc()), so those report false.  The uname-based
 * check runs once and the result is cached. */
4410 getqdisc_is_safe(void)
4412 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4413 static bool safe = false;
4415 if (ovsthread_once_start(&once)) {
4416 struct utsname utsname;
/* Parse "major.minor" out of the kernel release string. */
4419 if (uname(&utsname) == -1) {
4420 VLOG_WARN("uname failed (%s)", ovs_strerror(errno));
4421 } else if (!ovs_scan(utsname.release, "%d.%d", &major, &minor)) {
4422 VLOG_WARN("uname reported bad OS release (%s)", utsname.release);
4423 } else if (major < 2 || (major == 2 && minor < 35)) {
4424 VLOG_INFO("disabling unsafe RTM_GETQDISC in Linux kernel %s",
4429 ovsthread_once_done(&once);
4434 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
4435 * kernel to determine what they are. Returns 0 if successful, otherwise a
4436 * positive errno value. */
4438 tc_query_qdisc(const struct netdev *netdev_)
4440 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4441 struct ofpbuf request, *qdisc;
4442 const struct tc_ops *ops;
4443 struct tcmsg *tcmsg;
4451 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4452 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4453 * 2.6.35 without that fix backported to it.
4455 * To avoid the OOPS, we must not make a request that would attempt to dump
4456 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4457 * few others. There are a few ways that I can see to do this, but most of
4458 * them seem to be racy (and if you lose the race the kernel OOPSes). The
4459 * technique chosen here is to assume that any non-default qdisc that we
4460 * create will have a class with handle 1:0. The built-in qdiscs only have
4461 * a class with handle 0:0.
4463 * On Linux 2.6.35+ we use the straightforward method because it allows us
4464 * to handle non-builtin qdiscs without handle 1:0 (e.g. codel). However,
4465 * in such a case we get no response at all from the kernel (!) if a
4466 * builtin qdisc is in use (which is later caught by "!error &&
4467 * !qdisc->size"). */
4468 tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
/* Safe kernels: dump the root qdisc directly.  Unsafe kernels: probe for
 * the 1:0 handle that OVS-created qdiscs always carry. */
4472 tcmsg->tcm_handle = tc_make_handle(getqdisc_is_safe() ? 0 : 1, 0);
4473 tcmsg->tcm_parent = getqdisc_is_safe() ? TC_H_ROOT : 0;
4475 /* Figure out what tc class to instantiate. */
4476 error = tc_transact(&request, &qdisc);
4477 if (!error && qdisc->size) {
4480 error = tc_parse_qdisc(qdisc, &kind, NULL);
4482 ops = &tc_ops_other;
4484 ops = tc_lookup_linux_name(kind);
/* Kernel reported a qdisc kind OVS has no tc_ops for. */
4486 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4487 VLOG_DBG_RL(&rl2, "unknown qdisc \"%s\"", kind);
4489 ops = &tc_ops_other;
4492 } else if ((!error && !qdisc->size) || error == ENOENT) {
4493 /* Either it's a built-in qdisc, or (on Linux pre-2.6.35) it's a qdisc
4494 * set up by some other entity that doesn't have a handle 1:0. We will
4495 * assume that it's the system default qdisc. */
4496 ops = &tc_ops_default;
4499 /* Who knows? Maybe the device got deleted. */
4500 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
4501 netdev_get_name(netdev_), ovs_strerror(error));
4502 ops = &tc_ops_other;
4505 /* Instantiate it. */
4506 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
/* tc_load must either succeed and set netdev->tc, or fail and leave it
 * NULL; anything else is a bug in the tc class implementation. */
4507 ovs_assert((load_error == 0) == (netdev->tc != NULL));
4508 ofpbuf_delete(qdisc);
4510 return error ? error : load_error;
4513 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4514 approximate the time to transmit packets of various lengths. For an MTU of
4515 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4516 represents two possible packet lengths; for a MTU of 513 through 1024, four
4517 possible lengths; and so on.
4519 Returns, for the specified 'mtu', the number of bits that packet lengths
4520 need to be shifted right to fit within such a 256-entry table. */
4522 tc_calc_cell_log(unsigned int mtu)
/* NOTE(review): the guard preceding this default is elided — presumably
 * applies when the caller passes mtu == 0; confirm. */
4527 mtu = ETH_PAYLOAD_MAX;
/* Allow for the L2 framing (Ethernet + one VLAN tag) on top of the MTU. */
4529 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
/* Halve the effective MTU per iteration until it fits in 256 entries. */
4531 for (cell_log = 0; mtu >= 256; cell_log++) {
4538 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
 * of 'mtu', zeroing all other fields. */
4541 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4543 memset(rate, 0, sizeof *rate);
4544 rate->cell_log = tc_calc_cell_log(mtu);
4545 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4546 /* rate->cell_align = 0; */ /* distro headers. */
/* mpu: minimum packet unit — never bill a packet below minimum frame size. */
4547 rate->mpu = ETH_TOTAL_MIN;
4551 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4552 * attribute of the specified "type".
4554 * See tc_calc_cell_log() above for a description of "rtab"s. */
4556 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
4561 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
/* Entry i holds the transmit time, in ticks, of a packet whose length
 * falls into cell i (cells are 2^cell_log bytes wide). */
4562 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
4563 unsigned packet_size = (i + 1) << rate->cell_log;
/* Never bill below the minimum packet unit. */
4564 if (packet_size < rate->mpu) {
4565 packet_size = rate->mpu;
4567 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
4571 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
4572 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
4573 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
 * 0 falls back to the computed minimum.)  Result is in ticks. */
4576 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
/* Must at least buffer one jiffy of traffic plus a full packet. */
4578 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
4579 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
4582 /* Linux-only functions declared in netdev-linux.h */
4584 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4585 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
4587 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4588 const char *flag_name, bool enable)
4590 const char *netdev_name = netdev_get_name(netdev);
4591 struct ethtool_value evalue;
/* Step 1: read the current flags. */
4595 COVERAGE_INC(netdev_get_ethtool);
4596 memset(&evalue, 0, sizeof evalue);
4597 error = netdev_linux_do_ethtool(netdev_name,
4598 (struct ethtool_cmd *)&evalue,
4599 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
/* Step 2: write the flags back with 'flag' set or cleared. */
4604 COVERAGE_INC(netdev_set_ethtool);
4605 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4606 error = netdev_linux_do_ethtool(netdev_name,
4607 (struct ethtool_cmd *)&evalue,
4608 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
/* Step 3: re-read and verify the driver actually accepted the change
 * (some drivers silently ignore unsupported flags). */
4613 COVERAGE_INC(netdev_get_ethtool);
4614 memset(&evalue, 0, sizeof evalue);
4615 error = netdev_linux_do_ethtool(netdev_name,
4616 (struct ethtool_cmd *)&evalue,
4617 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4622 if (new_flags != evalue.data) {
4623 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4624 "device %s failed", enable ? "enable" : "disable",
4625 flag_name, netdev_name);
4632 /* Utility functions. */
4634 /* Copies 'src' into 'dst', performing format conversion in the process. */
/* Straight field-by-field copy: struct rtnl_link_stats (32-bit kernel
 * counters) to struct netdev_stats.  Keep in sync with the 64-bit variant
 * below. */
4636 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4637 const struct rtnl_link_stats *src)
4639 dst->rx_packets = src->rx_packets;
4640 dst->tx_packets = src->tx_packets;
4641 dst->rx_bytes = src->rx_bytes;
4642 dst->tx_bytes = src->tx_bytes;
4643 dst->rx_errors = src->rx_errors;
4644 dst->tx_errors = src->tx_errors;
4645 dst->rx_dropped = src->rx_dropped;
4646 dst->tx_dropped = src->tx_dropped;
4647 dst->multicast = src->multicast;
4648 dst->collisions = src->collisions;
4649 dst->rx_length_errors = src->rx_length_errors;
4650 dst->rx_over_errors = src->rx_over_errors;
4651 dst->rx_crc_errors = src->rx_crc_errors;
4652 dst->rx_frame_errors = src->rx_frame_errors;
4653 dst->rx_fifo_errors = src->rx_fifo_errors;
4654 dst->rx_missed_errors = src->rx_missed_errors;
4655 dst->tx_aborted_errors = src->tx_aborted_errors;
4656 dst->tx_carrier_errors = src->tx_carrier_errors;
4657 dst->tx_fifo_errors = src->tx_fifo_errors;
4658 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4659 dst->tx_window_errors = src->tx_window_errors;
4662 /* Copies 'src' into 'dst', performing format conversion in the process. */
/* 64-bit counterpart of netdev_stats_from_rtnl_link_stats(); preferred when
 * the kernel supplies IFLA_STATS64.  Keep the field lists in sync. */
4664 netdev_stats_from_rtnl_link_stats64(struct netdev_stats *dst,
4665 const struct rtnl_link_stats64 *src)
4667 dst->rx_packets = src->rx_packets;
4668 dst->tx_packets = src->tx_packets;
4669 dst->rx_bytes = src->rx_bytes;
4670 dst->tx_bytes = src->tx_bytes;
4671 dst->rx_errors = src->rx_errors;
4672 dst->tx_errors = src->tx_errors;
4673 dst->rx_dropped = src->rx_dropped;
4674 dst->tx_dropped = src->tx_dropped;
4675 dst->multicast = src->multicast;
4676 dst->collisions = src->collisions;
4677 dst->rx_length_errors = src->rx_length_errors;
4678 dst->rx_over_errors = src->rx_over_errors;
4679 dst->rx_crc_errors = src->rx_crc_errors;
4680 dst->rx_frame_errors = src->rx_frame_errors;
4681 dst->rx_fifo_errors = src->rx_fifo_errors;
4682 dst->rx_missed_errors = src->rx_missed_errors;
4683 dst->tx_aborted_errors = src->tx_aborted_errors;
4684 dst->tx_carrier_errors = src->tx_carrier_errors;
4685 dst->tx_fifo_errors = src->tx_fifo_errors;
4686 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4687 dst->tx_window_errors = src->tx_window_errors;
/* Fetches interface statistics for 'netdev_' via an RTM_GETLINK request,
 * filling in 'stats'.  Prefers the kernel's 64-bit counters (IFLA_STATS64)
 * and falls back to the 32-bit IFLA_STATS attribute.  Returns 0 on success,
 * otherwise a positive errno value. */
4691 get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
4693 struct ofpbuf request;
4694 struct ofpbuf *reply;
4697 ofpbuf_init(&request, 0);
4698 nl_msg_put_nlmsghdr(&request,
4699 sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
4700 RTM_GETLINK, NLM_F_REQUEST);
4701 ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
4702 nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
4703 error = nl_transact(NETLINK_ROUTE, &request, &reply);
4704 ofpbuf_uninit(&request);
/* Skip the Netlink + ifinfomsg headers; what remains is the attributes. */
4709 if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
4710 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS64);
4711 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats64)) {
4712 netdev_stats_from_rtnl_link_stats64(stats, nl_attr_get(a));
/* No 64-bit counters; fall back to 32-bit IFLA_STATS. */
4715 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS);
4716 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
4717 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
4720 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4725 VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");
4730 ofpbuf_delete(reply);
/* Reads 'dev''s interface flags (IFF_*) via SIOCGIFFLAGS into '*flags'.
 * Returns the ioctl helper's error code (0 on success). */
4735 get_flags(const struct netdev *dev, unsigned int *flags)
4741 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
4743 *flags = ifr.ifr_flags;
/* Sets interface 'name''s flags (IFF_*) to 'flags' via SIOCSIFFLAGS.
 * Returns the ioctl helper's error code (0 on success). */
4749 set_flags(const char *name, unsigned int flags)
4753 ifr.ifr_flags = flags;
4754 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
/* Looks up the kernel ifindex for 'netdev_name' via SIOCGIFINDEX.
 * Returns the (positive) ifindex on success; on failure, logs a rate-limited
 * warning.  NOTE(review): the failure return value is elided here — callers
 * in get_ifindex() treat a negative result as a negated errno; confirm. */
4758 do_get_ifindex(const char *netdev_name)
4763 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4764 COVERAGE_INC(netdev_get_ifindex);
4766 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
4768 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4769 netdev_name, ovs_strerror(error));
4772 return ifr.ifr_ifindex;
/* Returns 'netdev_''s ifindex in '*ifindexp', consulting a per-netdev cache
 * (VALID_IFINDEX bit) so the SIOCGIFINDEX ioctl runs at most once until the
 * cache is invalidated.  Both success and failure are cached.  Returns 0 on
 * success, otherwise the positive errno from the original lookup. */
4776 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4778 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4780 if (!(netdev->cache_valid & VALID_IFINDEX)) {
4781 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
/* A negative result from do_get_ifindex() is a negated errno value. */
4784 netdev->get_ifindex_error = -ifindex;
4785 netdev->ifindex = 0;
4787 netdev->get_ifindex_error = 0;
4788 netdev->ifindex = ifindex;
4790 netdev->cache_valid |= VALID_IFINDEX;
4793 *ifindexp = netdev->ifindex;
4794 return netdev->get_ifindex_error;
/* Reads 'netdev_name''s Ethernet hardware address into 'ea' via
 * SIOCGIFHWADDR.  Warns if the device reports an address family other than
 * Ethernet (or AF_UNSPEC) but still copies the bytes. */
4798 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4804 memset(&ifr, 0, sizeof ifr);
4805 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4806 COVERAGE_INC(netdev_get_hwaddr);
4807 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
4809 /* ENODEV probably means that a vif disappeared asynchronously and
4810 * hasn't been removed from the database yet, so reduce the log level
4811 * to INFO for that case. */
4812 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
4813 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4814 netdev_name, ovs_strerror(error));
4817 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4818 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4819 VLOG_WARN("%s device has unknown hardware address family %d",
4820 netdev_name, hwaddr_family);
4822 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets 'netdev_name''s Ethernet hardware address to 'mac' via SIOCSIFHWADDR.
 * Logs an error on failure. */
4827 set_etheraddr(const char *netdev_name,
4828 const uint8_t mac[ETH_ADDR_LEN])
4833 memset(&ifr, 0, sizeof ifr);
4834 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
/* The kernel requires the address family to be declared as Ethernet. */
4835 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
4836 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4837 COVERAGE_INC(netdev_set_hwaddr);
4838 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
4840 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4841 netdev_name, ovs_strerror(error));
/* Issues ethtool command 'cmd' (named 'cmd_name' for logging) on device
 * 'name', using 'ecmd' as the in/out command buffer.  EOPNOTSUPP (the device
 * lacks the operation) is returned silently; other failures are logged
 * rate-limited. */
4847 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4848 int cmd, const char *cmd_name)
4853 memset(&ifr, 0, sizeof ifr);
4854 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
/* SIOCETHTOOL passes the ethtool command struct via ifr_data. */
4855 ifr.ifr_data = (caddr_t) ecmd;
4858 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
4860 if (error != EOPNOTSUPP) {
4861 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4862 "failed: %s", cmd_name, name, ovs_strerror(error));
4864 /* The device doesn't support this operation. That's pretty
4865 * common, so there's no point in logging anything. */
/* Retrieves an IPv4 address of 'netdev' via interface ioctl 'cmd' (named
 * 'cmd_name' for logging), e.g. SIOCGIFADDR, storing it in '*ip' on
 * success.  Returns the ioctl helper's error code. */
4872 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4873 int cmd, const char *cmd_name)
4878 ifr.ifr_addr.sa_family = AF_INET;
4879 error = af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
/* ifr_addr is a generic sockaddr; reinterpret it as sockaddr_in.  The
 * ALIGNED_CAST asserts the alignment is adequate. */
4881 const struct sockaddr_in *sin = ALIGNED_CAST(struct sockaddr_in *,
4883 *ip = sin->sin_addr;
4888 /* Returns an AF_PACKET raw socket or a negative errno value. */
4890 af_packet_sock(void)
4892 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4895 if (ovsthread_once_start(&once)) {
4896 sock = socket(AF_PACKET, SOCK_RAW, 0);
4898 int error = set_nonblocking(sock);
4905 VLOG_ERR("failed to create packet socket: %s",
4906 ovs_strerror(errno));
4908 ovsthread_once_done(&once);