2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
23 #include <arpa/inet.h>
25 #include <linux/filter.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <sys/types.h>
37 #include <sys/ioctl.h>
38 #include <sys/socket.h>
39 #include <netpacket/packet.h>
41 #include <net/if_arp.h>
42 #include <net/if_packet.h>
43 #include <net/route.h>
44 #include <netinet/in.h>
51 #include "dpif-netlink.h"
52 #include "dpif-netdev.h"
53 #include "dynamic-string.h"
54 #include "fatal-signal.h"
57 #include "netdev-provider.h"
58 #include "netdev-vport.h"
59 #include "netlink-notifier.h"
60 #include "netlink-socket.h"
63 #include "openflow/openflow.h"
64 #include "ovs-atomic.h"
65 #include "packet-dpif.h"
67 #include "poll-loop.h"
68 #include "rtnetlink-link.h"
70 #include "socket-util.h"
73 #include "unaligned.h"
76 VLOG_DEFINE_THIS_MODULE(netdev_linux);
78 COVERAGE_DEFINE(netdev_set_policing);
79 COVERAGE_DEFINE(netdev_arp_lookup);
80 COVERAGE_DEFINE(netdev_get_ifindex);
81 COVERAGE_DEFINE(netdev_get_hwaddr);
82 COVERAGE_DEFINE(netdev_set_hwaddr);
83 COVERAGE_DEFINE(netdev_get_ethtool);
84 COVERAGE_DEFINE(netdev_set_ethtool);
87 /* These were introduced in Linux 2.6.14, so they might be missing if we have
89 #ifndef ADVERTISED_Pause
90 #define ADVERTISED_Pause (1 << 13)
92 #ifndef ADVERTISED_Asym_Pause
93 #define ADVERTISED_Asym_Pause (1 << 14)
96 /* These were introduced in Linux 2.6.24, so they might be missing if we
97 * have old headers. */
98 #ifndef ETHTOOL_GFLAGS
99 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
101 #ifndef ETHTOOL_SFLAGS
102 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
105 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
108 #define TC_RTAB_SIZE 1024
111 /* Linux 2.6.21 introduced struct tpacket_auxdata.
112 * Linux 2.6.27 added the tp_vlan_tci member.
113 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
114 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
115 * TP_STATUS_VLAN_TPID_VALID.
117 * With all this churn it's easiest to unconditionally define a replacement
118 * structure that has everything we want.
120 #ifndef PACKET_AUXDATA
121 #define PACKET_AUXDATA 8
123 #ifndef TP_STATUS_VLAN_VALID
124 #define TP_STATUS_VLAN_VALID (1 << 4)
126 #ifndef TP_STATUS_VLAN_TPID_VALID
127 #define TP_STATUS_VLAN_TPID_VALID (1 << 6)
129 #undef tpacket_auxdata
130 #define tpacket_auxdata rpl_tpacket_auxdata
131 struct tpacket_auxdata {
137 uint16_t tp_vlan_tci;
138 uint16_t tp_vlan_tpid;
141 /* Linux 2.6.35 introduced IFLA_STATS64 and rtnl_link_stats64.
143 * Tests for rtnl_link_stats64 don't seem to consistently work, e.g. on
144 * 2.6.32-431.29.2.el6.x86_64 (see report at
145 * http://openvswitch.org/pipermail/dev/2014-October/047978.html). Maybe
146 * if_link.h is not self-contained on those kernels. It is easiest to
147 * unconditionally define a replacement. */
149 #define IFLA_STATS64 23
151 #define rtnl_link_stats64 rpl_rtnl_link_stats64
152 struct rtnl_link_stats64 {
164 uint64_t rx_length_errors;
165 uint64_t rx_over_errors;
166 uint64_t rx_crc_errors;
167 uint64_t rx_frame_errors;
168 uint64_t rx_fifo_errors;
169 uint64_t rx_missed_errors;
171 uint64_t tx_aborted_errors;
172 uint64_t tx_carrier_errors;
173 uint64_t tx_fifo_errors;
174 uint64_t tx_heartbeat_errors;
175 uint64_t tx_window_errors;
177 uint64_t rx_compressed;
178 uint64_t tx_compressed;
182 VALID_IFINDEX = 1 << 0,
183 VALID_ETHERADDR = 1 << 1,
187 VALID_POLICING = 1 << 5,
188 VALID_VPORT_STAT_ERROR = 1 << 6,
189 VALID_DRVINFO = 1 << 7,
190 VALID_FEATURES = 1 << 8,
193 /* Traffic control. */
195 /* An instance of a traffic control class. Always associated with a particular
198 * Each TC implementation subclasses this with whatever additional data it
201 const struct tc_ops *ops;
202 struct hmap queues; /* Contains "struct tc_queue"s.
203 * Read by generic TC layer.
204 * Written only by TC implementation. */
207 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
209 /* One traffic control queue.
211 * Each TC implementation subclasses this with whatever additional data it
214 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
215 unsigned int queue_id; /* OpenFlow queue ID. */
216 long long int created; /* Time queue was created, in msecs. */
219 /* A particular kind of traffic control. Each implementation generally maps to
220 * one particular Linux qdisc class.
222 * The functions below return 0 if successful or a positive errno value on
223 * failure, except where otherwise noted. All of them must be provided, except
224 * where otherwise noted. */
226 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
227 * This is null for tc_ops_default and tc_ops_other, for which there are no
228 * appropriate values. */
229 const char *linux_name;
231 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
232 const char *ovs_name;
234 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
235 * queues. The queues are numbered 0 through n_queues - 1. */
236 unsigned int n_queues;
238 /* Called to install this TC class on 'netdev'. The implementation should
239 * make the Netlink calls required to set up 'netdev' with the right qdisc
240 * and configure it according to 'details'. The implementation may assume
241 * that the current qdisc is the default; that is, there is no need for it
242 * to delete the current qdisc before installing itself.
244 * The contents of 'details' should be documented as valid for 'ovs_name'
245 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
246 * (which is built as ovs-vswitchd.conf.db(8)).
248 * This function must return 0 if and only if it sets 'netdev->tc' to an
249 * initialized 'struct tc'.
251 * (This function is null for tc_ops_other, which cannot be installed. For
252 * other TC classes it should always be nonnull.) */
253 int (*tc_install)(struct netdev *netdev, const struct smap *details);
255 /* Called when the netdev code determines (through a Netlink query) that
256 * this TC class's qdisc is installed on 'netdev', but we didn't install
257 * it ourselves and so don't know any of the details.
259 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
260 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
261 * implementation should parse the other attributes of 'nlmsg' as
262 * necessary to determine its configuration. If necessary it should also
263 * use Netlink queries to determine the configuration of queues on
266 * This function must return 0 if and only if it sets 'netdev->tc' to an
267 * initialized 'struct tc'. */
268 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
270 /* Destroys the data structures allocated by the implementation as part of
271 * 'tc'. (This includes destroying 'tc->queues' by calling
274 * The implementation should not need to perform any Netlink calls. If
275 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
276 * (But it may not be desirable.)
278 * This function may be null if 'tc' is trivial. */
279 void (*tc_destroy)(struct tc *tc);
281 /* Retrieves details of 'netdev->tc' configuration into 'details'.
283 * The implementation should not need to perform any Netlink calls, because
284 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
285 * cached the configuration.
287 * The contents of 'details' should be documented as valid for 'ovs_name'
288 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
289 * (which is built as ovs-vswitchd.conf.db(8)).
291 * This function may be null if 'tc' is not configurable.
293 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
295 /* Reconfigures 'netdev->tc' according to 'details', performing any
296 * required Netlink calls to complete the reconfiguration.
298 * The contents of 'details' should be documented as valid for 'ovs_name'
299 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
300 * (which is built as ovs-vswitchd.conf.db(8)).
302 * This function may be null if 'tc' is not configurable.
304 int (*qdisc_set)(struct netdev *, const struct smap *details);
306 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
307 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
309 * The contents of 'details' should be documented as valid for 'ovs_name'
310 * in the "other_config" column in the "Queue" table in
311 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
313 * The implementation should not need to perform any Netlink calls, because
314 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
315 * cached the queue configuration.
317 * This function may be null if 'tc' does not have queues ('n_queues' is
319 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
320 struct smap *details);
322 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
323 * 'details', perfoming any required Netlink calls to complete the
324 * reconfiguration. The caller ensures that 'queue_id' is less than
327 * The contents of 'details' should be documented as valid for 'ovs_name'
328 * in the "other_config" column in the "Queue" table in
329 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
331 * This function may be null if 'tc' does not have queues or its queues are
332 * not configurable. */
333 int (*class_set)(struct netdev *, unsigned int queue_id,
334 const struct smap *details);
336 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
337 * tc_queue's within 'netdev->tc->queues'.
339 * This function may be null if 'tc' does not have queues or its queues
340 * cannot be deleted. */
341 int (*class_delete)(struct netdev *, struct tc_queue *queue);
343 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
344 * 'struct tc_queue's within 'netdev->tc->queues'.
346 * On success, initializes '*stats'.
348 * This function may be null if 'tc' does not have queues or if it cannot
349 * report queue statistics. */
350 int (*class_get_stats)(const struct netdev *netdev,
351 const struct tc_queue *queue,
352 struct netdev_queue_stats *stats);
354 /* Extracts queue stats from 'nlmsg', which is a response to a
355 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
357 * This function may be null if 'tc' does not have queues or if it cannot
358 * report queue statistics. */
359 int (*class_dump_stats)(const struct netdev *netdev,
360 const struct ofpbuf *nlmsg,
361 netdev_dump_queue_stats_cb *cb, void *aux);
/* Initializes the generic part of 'tc' for the TC implementation 'ops';
 * the 'queues' hmap starts out empty.  NOTE(review): interior lines of this
 * listing are missing, so the store of 'ops' into 'tc' is not visible here. */
365 tc_init(struct tc *tc, const struct tc_ops *ops)
368 hmap_init(&tc->queues);

/* Tears down the generic part of 'tc': destroys the 'queues' hmap.  Any
 * per-queue data is the TC implementation's responsibility to free first. */
372 tc_destroy(struct tc *tc)
374 hmap_destroy(&tc->queues);
377 static const struct tc_ops tc_ops_htb;
378 static const struct tc_ops tc_ops_hfsc;
379 static const struct tc_ops tc_ops_default;
380 static const struct tc_ops tc_ops_other;
382 static const struct tc_ops *const tcs[] = {
383 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
384 &tc_ops_hfsc, /* Hierarchical fair service curve. */
385 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
386 &tc_ops_other, /* Some other qdisc. */
390 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
391 static unsigned int tc_get_major(unsigned int handle);
392 static unsigned int tc_get_minor(unsigned int handle);
394 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
395 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
396 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
398 static struct tcmsg *tc_make_request(const struct netdev *, int type,
399 unsigned int flags, struct ofpbuf *);
400 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
401 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
402 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
405 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
406 struct nlattr **options);
407 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
408 struct nlattr **options,
409 struct netdev_queue_stats *);
410 static int tc_query_class(const struct netdev *,
411 unsigned int handle, unsigned int parent,
412 struct ofpbuf **replyp);
413 static int tc_delete_class(const struct netdev *, unsigned int handle);
415 static int tc_del_qdisc(struct netdev *netdev);
416 static int tc_query_qdisc(const struct netdev *netdev);
418 static int tc_calc_cell_log(unsigned int mtu);
419 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
420 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
421 const struct tc_ratespec *rate);
422 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
424 struct netdev_linux {
427 /* Protects all members below. */
428 struct ovs_mutex mutex;
430 unsigned int cache_valid;
432 bool miimon; /* Link status of last poll. */
433 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
434 struct timer miimon_timer;
436 /* The following are figured out "on demand" only. They are only valid
437 * when the corresponding VALID_* bit in 'cache_valid' is set. */
439 uint8_t etheraddr[ETH_ADDR_LEN];
440 struct in_addr address, netmask;
443 unsigned int ifi_flags;
444 long long int carrier_resets;
445 uint32_t kbits_rate; /* Policing data. */
446 uint32_t kbits_burst;
447 int vport_stats_error; /* Cached error code from vport_get_stats().
448 0 or an errno value. */
449 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
450 int ether_addr_error; /* Cached error code from set/get etheraddr. */
451 int netdev_policing_error; /* Cached error code from set policing. */
452 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
453 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
455 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
456 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
457 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
459 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
462 /* For devices of class netdev_tap_class only. */
466 struct netdev_rxq_linux {
467 struct netdev_rxq up;
472 /* This is set pretty low because we probably won't learn anything from the
473 * additional log messages. */
474 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
476 /* Polling miimon status for all ports causes performance degradation when
477 * handling a large number of ports. If there are no devices using miimon, then
478 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
480 * Readers do not depend on this variable synchronizing with the related
481 * changes in the device miimon status, so we can use atomic_count. */
482 static atomic_count miimon_cnt = ATOMIC_COUNT_INIT(0);
484 static void netdev_linux_run(void);
486 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
487 int cmd, const char *cmd_name);
488 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
489 int cmd, const char *cmd_name);
490 static int get_flags(const struct netdev *, unsigned int *flags);
491 static int set_flags(const char *, unsigned int flags);
492 static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
493 enum netdev_flags on, enum netdev_flags *old_flagsp)
494 OVS_REQUIRES(netdev->mutex);
495 static int do_get_ifindex(const char *netdev_name);
496 static int get_ifindex(const struct netdev *, int *ifindexp);
497 static int do_set_addr(struct netdev *netdev,
498 int ioctl_nr, const char *ioctl_name,
499 struct in_addr addr);
500 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
501 static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
502 static int get_stats_via_netlink(const struct netdev *, struct netdev_stats *);
503 static int af_packet_sock(void);
504 static bool netdev_linux_miimon_enabled(void);
505 static void netdev_linux_miimon_run(void);
506 static void netdev_linux_miimon_wait(void);
507 static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup);
/* Returns true if 'netdev_class' is one of this file's classes, identified
 * by its 'run' callback being netdev_linux_run. */
510 is_netdev_linux_class(const struct netdev_class *netdev_class)
512 return netdev_class->run == netdev_linux_run;

/* Returns true if 'netdev' is specifically a tap device (as opposed to a
 * system or internal netdev handled by this file). */
516 is_tap_netdev(const struct netdev *netdev)
518 return netdev_get_class(netdev) == &netdev_tap_class;

/* Downcasts 'netdev' to its containing struct netdev_linux.  Asserts that
 * the class really belongs to this file before trusting the layout. */
521 static struct netdev_linux *
522 netdev_linux_cast(const struct netdev *netdev)
524 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
526 return CONTAINER_OF(netdev, struct netdev_linux, up);

/* Downcasts 'rx' to its containing struct netdev_rxq_linux, with the same
 * class sanity check as netdev_linux_cast(). */
529 static struct netdev_rxq_linux *
530 netdev_rxq_linux_cast(const struct netdev_rxq *rx)
532 ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
533 return CONTAINER_OF(rx, struct netdev_rxq_linux, up);
536 static void netdev_linux_update(struct netdev_linux *netdev,
537 const struct rtnetlink_link_change *)
538 OVS_REQUIRES(netdev->mutex);
539 static void netdev_linux_changed(struct netdev_linux *netdev,
540 unsigned int ifi_flags, unsigned int mask)
541 OVS_REQUIRES(netdev->mutex);
543 /* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK changes, or NULL
544 * if no such socket could be created. */
545 static struct nl_sock *
546 netdev_linux_notify_sock(void)
/* One-time, thread-safe lazy initialization: the first caller creates the
 * socket and joins the RTNLGRP_LINK multicast group; later callers reuse
 * the cached pointer (which stays NULL if setup failed). */
548 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
549 static struct nl_sock *sock;
551 if (ovsthread_once_start(&once)) {
554 error = nl_sock_create(NETLINK_ROUTE, &sock);
556 error = nl_sock_join_mcgroup(sock, RTNLGRP_LINK);
/* Joining the multicast group failed: destroy the socket so the cached
 * value reflects the failure. */
558 nl_sock_destroy(sock);
562 ovsthread_once_done(&once);
/* Returns true if any netdev currently has miimon polling enabled, so the
 * run/wait callbacks can skip the miimon work entirely when unused (see the
 * comment on 'miimon_cnt' above). */
569 netdev_linux_miimon_enabled(void)
571 return atomic_count_get(&miimon_cnt) > 0;
/* Per-iteration housekeeping for all netdev-linux devices: polls miimon if
 * any device uses it, then drains the shared rtnetlink notification socket,
 * applying each link-change message to the matching netdev. */
575 netdev_linux_run(void)
577 struct nl_sock *sock;
580 if (netdev_linux_miimon_enabled()) {
581 netdev_linux_miimon_run();
584 sock = netdev_linux_notify_sock();
590 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
591 uint64_t buf_stub[4096 / 8];
/* Stack-backed buffer avoids a heap allocation per received message. */
594 ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
595 error = nl_sock_recv(sock, &buf, false);
597 struct rtnetlink_link_change change;
599 if (rtnetlink_link_parse(&buf, &change)) {
/* Apply the change only to devices owned by this file's classes. */
600 struct netdev *netdev_ = netdev_from_name(change.ifname);
601 if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
602 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
604 ovs_mutex_lock(&netdev->mutex);
605 netdev_linux_update(netdev, &change);
606 ovs_mutex_unlock(&netdev->mutex);
608 netdev_close(netdev_);
/* ENOBUFS means the kernel dropped notifications; we may have missed
 * changes, so conservatively refresh the flags of every device and
 * invalidate their caches via netdev_linux_changed(). */
610 } else if (error == ENOBUFS) {
611 struct shash device_shash;
612 struct shash_node *node;
616 shash_init(&device_shash);
617 netdev_get_devices(&netdev_linux_class, &device_shash);
618 SHASH_FOR_EACH (node, &device_shash) {
619 struct netdev *netdev_ = node->data;
620 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
623 ovs_mutex_lock(&netdev->mutex);
624 get_flags(netdev_, &flags);
625 netdev_linux_changed(netdev, flags, 0);
626 ovs_mutex_unlock(&netdev->mutex);
628 netdev_close(netdev_);
630 shash_destroy(&device_shash);
/* EAGAIN simply means the socket is drained; anything else is logged. */
631 } else if (error != EAGAIN) {
632 VLOG_WARN_RL(&rl, "error reading or parsing netlink (%s)",
633 ovs_strerror(error));
/* Arranges for the next poll_block() to wake up when netdev_linux_run()
 * has work to do: miimon timer expiry (if enabled) or readable data on the
 * rtnetlink notification socket. */
640 netdev_linux_wait(void)
642 struct nl_sock *sock;
644 if (netdev_linux_miimon_enabled()) {
645 netdev_linux_miimon_wait();
647 sock = netdev_linux_notify_sock();
649 nl_sock_wait(sock, POLLIN);
/* Records that 'dev' changed: bumps the netdev change sequence, counts a
 * carrier reset if IFF_RUNNING toggled, stores the new interface flags, and
 * keeps only the cache_valid bits present in 'mask' (pass 0 to invalidate
 * the whole on-demand cache). */
654 netdev_linux_changed(struct netdev_linux *dev,
655 unsigned int ifi_flags, unsigned int mask)
656 OVS_REQUIRES(dev->mutex)
658 netdev_change_seq_changed(&dev->up);
/* XOR detects a transition of the IFF_RUNNING (carrier) bit in either
 * direction. */
660 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
661 dev->carrier_resets++;
663 dev->ifi_flags = ifi_flags;
665 dev->cache_valid &= mask;
/* Applies a parsed rtnetlink link-change message to 'dev'.  For RTM_NEWLINK
 * the message itself carries fresh MTU, Ethernet address, and ifindex, so
 * those cache entries are refreshed directly (only VALID_DRVINFO survives
 * the invalidation).  For other message types (e.g. RTM_DELLINK) the entire
 * cache is invalidated. */
669 netdev_linux_update(struct netdev_linux *dev,
670 const struct rtnetlink_link_change *change)
671 OVS_REQUIRES(dev->mutex)
673 if (change->nlmsg_type == RTM_NEWLINK) {
675 netdev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
677 /* Update netdev from rtnl-change msg. */
679 dev->mtu = change->mtu;
680 dev->cache_valid |= VALID_MTU;
681 dev->netdev_mtu_error = 0;
/* An all-zero address in the message means "not reported"; keep the
 * cached one in that case. */
684 if (!eth_addr_is_zero(change->addr)) {
685 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
686 dev->cache_valid |= VALID_ETHERADDR;
687 dev->ether_addr_error = 0;
690 dev->ifindex = change->ifi_index;
691 dev->cache_valid |= VALID_IFINDEX;
692 dev->get_ifindex_error = 0;
/* Non-NEWLINK change: invalidate every cached attribute (mask of 0). */
695 netdev_linux_changed(dev, change->ifi_flags, 0);
/* netdev_class 'alloc' callback: zero-allocates a struct netdev_linux and
 * hands back the embedded generic netdev. */
699 static struct netdev *
700 netdev_linux_alloc(void)
702 struct netdev_linux *netdev = xzalloc(sizeof *netdev);

/* Construction steps shared by the system/internal and tap constructors:
 * currently just initializes the per-device mutex. */
707 netdev_linux_common_construct(struct netdev_linux *netdev)
709 ovs_mutex_init(&netdev->mutex);
712 /* Creates system and internal devices. */
/* Constructor for "system" and "internal" netdevs.  Probes the device by
 * reading its interface flags; a missing kernel device is fatal for system
 * netdevs but deliberately tolerated for internal ones (see comment below). */
714 netdev_linux_construct(struct netdev *netdev_)
716 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
719 netdev_linux_common_construct(netdev);
/* First contact with the kernel device; also seeds ifi_flags. */
721 error = get_flags(&netdev->up, &netdev->ifi_flags);
722 if (error == ENODEV) {
723 if (netdev->up.netdev_class != &netdev_internal_class) {
724 /* The device does not exist, so don't allow it to be opened. */
727 /* "Internal" netdevs have to be created as netdev objects before
728 * they exist in the kernel, because creating them in the kernel
729 * happens by passing a netdev object to dpif_port_add().
730 * Therefore, ignore the error. */
737 /* For most types of netdevs we open the device for each call of
738 * netdev_open(). However, this is not the case with tap devices,
739 * since it is only possible to open the device once. In this
740 * situation we share a single file descriptor, and consequently
741 * buffers, across all readers. Therefore once data is read it will
742 * be unavailable to other reads for tap devices. */
/* Constructor for tap devices: opens /dev/net/tun once, creates a tap
 * interface named after the netdev via TUNSETIFF, and puts the fd in
 * non-blocking mode.  The single fd is shared by all readers (see the
 * comment above this function). */
744 netdev_linux_construct_tap(struct netdev *netdev_)
746 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
747 static const char tap_dev[] = "/dev/net/tun";
748 const char *name = netdev_->name;
752 netdev_linux_common_construct(netdev);
754 /* Open tap device. */
755 netdev->tap_fd = open(tap_dev, O_RDWR);
756 if (netdev->tap_fd < 0) {
758 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
762 /* Create tap device. */
/* IFF_NO_PI suppresses the kernel's packet-information header so the fd
 * carries raw Ethernet frames. */
763 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
764 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
765 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
766 VLOG_WARN("%s: creating tap device failed: %s", name,
767 ovs_strerror(errno));
772 /* Make non-blocking. */
773 error = set_nonblocking(netdev->tap_fd);
/* Error path: close the tap fd so the failed constructor leaks nothing. */
781 close(netdev->tap_fd);
/* Destructor shared by all netdev-linux classes: tears down any installed
 * traffic-control state, closes the tap fd for tap devices, drops this
 * device's contribution to the global miimon count, and destroys the mutex. */
786 netdev_linux_destruct(struct netdev *netdev_)
788 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* tc_destroy is optional in struct tc_ops, hence the double check. */
790 if (netdev->tc && netdev->tc->ops->tc_destroy) {
791 netdev->tc->ops->tc_destroy(netdev->tc);
794 if (netdev_get_class(netdev_) == &netdev_tap_class
795 && netdev->tap_fd >= 0)
797 close(netdev->tap_fd);
/* Keep miimon_cnt in sync so run/wait can skip miimon when unused. */
800 if (netdev->miimon_interval > 0) {
801 atomic_count_dec(&miimon_cnt);
804 ovs_mutex_destroy(&netdev->mutex);
/* netdev_class 'dealloc' callback: frees the memory that
 * netdev_linux_alloc() obtained. */
808 netdev_linux_dealloc(struct netdev *netdev_)
810 struct netdev_linux *netdev = netdev_linux_cast(netdev_);

/* rxq 'alloc' callback: zero-allocates the receive-queue wrapper and returns
 * the embedded generic netdev_rxq. */
814 static struct netdev_rxq *
815 netdev_linux_rxq_alloc(void)
817 struct netdev_rxq_linux *rx = xzalloc(sizeof *rx);
/* rxq 'construct' callback.  For tap devices the queue simply reuses the
 * shared tap fd.  For everything else it opens a raw AF_PACKET socket,
 * enables PACKET_AUXDATA (to recover stripped VLAN tags), makes it
 * non-blocking, binds it to this device's ifindex, and attaches a BPF
 * filter that accepts only inbound packets. */
822 netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
824 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
825 struct netdev *netdev_ = rx->up.netdev;
826 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
829 ovs_mutex_lock(&netdev->mutex);
830 rx->is_tap = is_tap_netdev(netdev_);
/* Tap path: share the device's single tap fd across all readers. */
832 rx->fd = netdev->tap_fd;
834 struct sockaddr_ll sll;
836 /* Result of tcpdump -dd inbound */
837 static const struct sock_filter filt[] = {
838 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
839 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
840 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
841 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
843 static const struct sock_fprog fprog = {
844 ARRAY_SIZE(filt), (struct sock_filter *) filt
847 /* Create file descriptor. */
848 rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
851 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
/* Ask the kernel to deliver tpacket_auxdata control messages, which
 * carry the VLAN TCI that AF_PACKET strips from the frame itself. */
856 if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) {
858 VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
859 netdev_get_name(netdev_), ovs_strerror(error));
863 /* Set non-blocking mode. */
864 error = set_nonblocking(rx->fd);
869 /* Get ethernet device index. */
870 error = get_ifindex(&netdev->up, &ifindex);
875 /* Bind to specific ethernet device. */
876 memset(&sll, 0, sizeof sll);
877 sll.sll_family = AF_PACKET;
878 sll.sll_ifindex = ifindex;
879 sll.sll_protocol = htons(ETH_P_ALL);
880 if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
882 VLOG_ERR("%s: failed to bind raw socket (%s)",
883 netdev_get_name(netdev_), ovs_strerror(error));
887 /* Filter for only inbound packets. */
888 error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
892 VLOG_ERR("%s: failed to attach filter (%s)",
893 netdev_get_name(netdev_), ovs_strerror(error));
/* Success path unlock; the error path below unlocks separately. */
897 ovs_mutex_unlock(&netdev->mutex);
905 ovs_mutex_unlock(&netdev->mutex);
/* rxq 'destruct' callback.  (The fd close is on a line this listing omits.) */
910 netdev_linux_rxq_destruct(struct netdev_rxq *rxq_)
912 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);

/* rxq 'dealloc' callback: frees the wrapper allocated by rxq_alloc. */
920 netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_)
922 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);

/* Returns the VLAN TPID to reinsert for 'aux', in network byte order: the
 * kernel-reported TPID when TP_STATUS_VLAN_TPID_VALID is set (Linux 3.13+),
 * otherwise the default 802.1Q Ethertype. */
928 auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux)
930 if (aux->tp_status & TP_STATUS_VLAN_TPID_VALID) {
931 return htons(aux->tp_vlan_tpid);
933 return htons(ETH_TYPE_VLAN);

/* Returns true if 'aux' carries a VLAN TCI.  A nonzero TCI alone also counts
 * because kernels older than 3.0 lack TP_STATUS_VLAN_VALID and cannot
 * distinguish "no VLAN" from "VLAN with TCI 0" (see the header comment on
 * the replacement struct tpacket_auxdata above). */
938 auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
940 return aux->tp_vlan_tci || aux->tp_status & TP_STATUS_VLAN_VALID;
/* Receives one frame from the AF_PACKET socket 'fd' into 'buffer', then
 * walks the control messages for PACKET_AUXDATA and, if the kernel stripped
 * a VLAN tag, pushes it back onto the frame (headroom for it was reserved
 * up front).  Returns 0 on success or a positive errno value. */
944 netdev_linux_rxq_recv_sock(int fd, struct ofpbuf *buffer)
949 struct cmsghdr *cmsg;
/* Part of the cmsg_buffer union (opening lines omitted from this listing):
 * sized to hold one tpacket_auxdata control message. */
952 char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
956 /* Reserve headroom for a single VLAN tag */
957 ofpbuf_reserve(buffer, VLAN_HEADER_LEN);
958 size = ofpbuf_tailroom(buffer);
960 iov.iov_base = ofpbuf_data(buffer);
962 msgh.msg_name = NULL;
963 msgh.msg_namelen = 0;
966 msgh.msg_control = &cmsg_buffer;
967 msgh.msg_controllen = sizeof cmsg_buffer;
/* MSG_TRUNC makes recvmsg report the full frame length even if it did not
 * fit, so oversize frames can be detected below; retry on EINTR. */
971 retval = recvmsg(fd, &msgh, MSG_TRUNC);
972 } while (retval < 0 && errno == EINTR);
976 } else if (retval > size) {
980 ofpbuf_set_size(buffer, ofpbuf_size(buffer) + retval);
/* Scan control messages for the auxdata record carrying the VLAN tag. */
982 for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg; cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
983 const struct tpacket_auxdata *aux;
985 if (cmsg->cmsg_level != SOL_PACKET
986 || cmsg->cmsg_type != PACKET_AUXDATA
987 || cmsg->cmsg_len < CMSG_LEN(sizeof(struct tpacket_auxdata))) {
991 aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg));
992 if (auxdata_has_vlan_tci(aux)) {
/* Can't reinsert a tag into something shorter than an Ethernet
 * header. */
993 if (retval < ETH_HEADER_LEN) {
997 eth_push_vlan(buffer, auxdata_to_vlan_tpid(aux),
998 htons(aux->tp_vlan_tci));
/* Receives one frame from the tap fd 'fd' into 'buffer' with a plain read(),
 * retrying on EINTR.  Returns 0 on success or a positive errno value;
 * frames larger than the tailroom are rejected. */
1007 netdev_linux_rxq_recv_tap(int fd, struct ofpbuf *buffer)
1010 size_t size = ofpbuf_tailroom(buffer);
1013 retval = read(fd, ofpbuf_data(buffer), size);
1014 } while (retval < 0 && errno == EINTR);
1018 } else if (retval > size) {
1022 ofpbuf_set_size(buffer, ofpbuf_size(buffer) + retval);
/* rxq 'recv' callback: receives one packet from 'rxq_' into a freshly
 * allocated dpif_packet and stores it in packets[0].  The buffer is sized
 * from the device MTU (falling back to ETH_PAYLOAD_MAX when the MTU is
 * unavailable) plus room for a VLAN Ethernet header, with DP_NETDEV_HEADROOM
 * reserved in front; the actual read is dispatched to the tap- or
 * socket-specific helper, which returns 0 or a positive errno value. */
1027 netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dpif_packet **packets,
1030 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1031 struct netdev *netdev = rx->up.netdev;
1032 struct dpif_packet *packet;
1033 struct ofpbuf *buffer;
1037 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
1038 mtu = ETH_PAYLOAD_MAX;
1041 packet = dpif_packet_new_with_headroom(VLAN_ETH_HEADER_LEN + mtu,
1042 DP_NETDEV_HEADROOM);
1043 buffer = &packet->ofpbuf;
1045 retval = (rx->is_tap
1046 ? netdev_linux_rxq_recv_tap(rx->fd, buffer)
1047 : netdev_linux_rxq_recv_sock(rx->fd, buffer));
/* EAGAIN (nothing to read) and EMSGSIZE (oversize frame) are expected and
 * not worth logging. */
1050 if (retval != EAGAIN && retval != EMSGSIZE) {
/* Fix: the format string is "on %s: %s" -- the device name pairs with the
 * first %s and the error text with the second, but the original call
 * passed them in the opposite order.  Also report 'retval', the errno
 * value the receive helper returned, rather than the global 'errno',
 * which is not guaranteed to still hold that error here. */
1051 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
1052 netdev_rxq_get_name(rxq_), ovs_strerror(retval));
/* Error path: discard the packet allocated above so nothing leaks. */
1054 dpif_packet_delete(packet);
/* Success path: pad runt frames to minimum Ethernet length and hand the
 * packet back with a cleared datapath hash. */
1056 dp_packet_pad(buffer);
1057 dpif_packet_set_dp_hash(packet, 0);
1058 packets[0] = packet;
/* rxq 'wait' callback: wakes the poll loop when the receive fd is readable. */
1066 netdev_linux_rxq_wait(struct netdev_rxq *rxq_)
1068 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1069 poll_fd_wait(rx->fd, POLLIN);

/* rxq 'drain' callback: discards queued packets.  The tap path reads the
 * device's tx queue length (SIOCGIFTXQLEN) to bound how many reads to make;
 * the socket path just flushes the socket receive buffer. */
1073 netdev_linux_rxq_drain(struct netdev_rxq *rxq_)
1075 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1078 int error = af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_), &ifr,
1079 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
1083 drain_fd(rx->fd, ifr.ifr_qlen);
1086 return drain_rcvbuf(rx->fd);
1090 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
1091 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
1092 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
1093 * the packet is too big or too small to transmit on the device.
1095 * The caller retains ownership of 'buffer' in all cases.
1097 * The kernel maintains a packet transmission queue, so the caller is not
1098 * expected to do additional queuing of packets. */
/* netdev 'send' callback: transmits the 'cnt' packets in 'pkts'.  Non-tap
 * devices go out through the shared AF_PACKET socket addressed by ifindex;
 * tap devices are written directly to the tap fd (see the comment below for
 * why).  EINTR retries the same packet; ENOBUFS is translated to EAGAIN. */
1100 netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED,
1101 struct dpif_packet **pkts, int cnt, bool may_steal)
1106 /* 'i' is incremented only if there's no error */
1107 for (i = 0; i < cnt;) {
1108 const void *data = ofpbuf_data(&pkts[i]->ofpbuf);
1109 size_t size = ofpbuf_size(&pkts[i]->ofpbuf);
1112 if (!is_tap_netdev(netdev_)) {
1113 /* Use our AF_PACKET socket to send to this device. */
1114 struct sockaddr_ll sll;
1120 sock = af_packet_sock();
1125 ifindex = netdev_get_ifindex(netdev_);
1130 /* We don't bother setting most fields in sockaddr_ll because the
1131 * kernel ignores them for SOCK_RAW. */
1132 memset(&sll, 0, sizeof sll);
1133 sll.sll_family = AF_PACKET;
1134 sll.sll_ifindex = ifindex;
1136 iov.iov_base = CONST_CAST(void *, data);
1139 msg.msg_name = &sll;
1140 msg.msg_namelen = sizeof sll;
1143 msg.msg_control = NULL;
1144 msg.msg_controllen = 0;
1147 retval = sendmsg(sock, &msg, 0);
1149 /* Use the tap fd to send to this device. This is essential for
1150 * tap devices, because packets sent to a tap device with an
1151 * AF_PACKET socket will loop back to be *received* again on the
1152 * tap device. This doesn't occur on other interface types
1153 * because we attach a socket filter to the rx socket. */
1154 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1156 retval = write(netdev->tap_fd, data, size);
1160 /* The Linux AF_PACKET implementation never blocks waiting for room
1161 * for packets, instead returning ENOBUFS. Translate this into
1162 * EAGAIN for the caller. */
1163 error = errno == ENOBUFS ? EAGAIN : errno;
1164 if (error == EINTR) {
1165 /* continue without incrementing 'i', i.e. retry this packet */
/* NOTE(review): 'retval' here is the (signed) result of sendmsg/write
 * but is printed with the unsigned PRIuSIZE conversion -- confirm a
 * cast or PRIdSIZE would not be more appropriate. */
1169 } else if (retval != size) {
1170 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE" bytes"
1171 " of %"PRIuSIZE") on %s", retval, size,
1172 netdev_get_name(netdev_));
1177 /* Process the next packet in the batch */
/* When 'may_steal' is set, ownership transferred to us: free all packets. */
1182 for (i = 0; i < cnt; i++) {
1183 dpif_packet_delete(pkts[i]);
/* Log real failures once per batch; EAGAIN just means "try again later". */
1187 if (error && error != EAGAIN) {
1188 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
1189 netdev_get_name(netdev_), ovs_strerror(error));
1196 /* Registers with the poll loop to wake up from the next call to poll_block()
1197 * when the packet transmission queue has sufficient room to transmit a packet
1198 * with netdev_send().
1200 * The kernel maintains a packet transmission queue, so the client is not
1201 * expected to do additional queuing of packets. Thus, this function is
1202 * unlikely to ever be used. It is included for completeness. */
1204 netdev_linux_send_wait(struct netdev *netdev, int qid OVS_UNUSED)
1206 if (is_tap_netdev(netdev)) {
/* A tap fd can always be written, so wake immediately rather than
 * registering any fd with the poll loop. */
1207 /* TAP device always accepts packets.*/
1208 poll_immediate_wake();
1212 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1213 * otherwise a positive errno value. */
1215 netdev_linux_set_etheraddr(struct netdev *netdev_,
1216 const uint8_t mac[ETH_ADDR_LEN])
1218 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1219 enum netdev_flags old_flags = 0;
1222 ovs_mutex_lock(&netdev->mutex);
/* Fast path from the cache: skip the ioctl if the address is already set,
 * or fail fast with the previously observed error. */
1224 if (netdev->cache_valid & VALID_ETHERADDR) {
1225 error = netdev->ether_addr_error;
1226 if (error || eth_addr_equals(netdev->etheraddr, mac)) {
1229 netdev->cache_valid &= ~VALID_ETHERADDR;
1232 /* Tap devices must be brought down before setting the address. */
1233 if (is_tap_netdev(netdev_)) {
1234 update_flags(netdev, NETDEV_UP, 0, &old_flags);
1236 error = set_etheraddr(netdev_get_name(netdev_), mac);
/* Cache the outcome only on success or on ENODEV (device disappeared);
 * transient errors are not cached so a later call can retry. */
1237 if (!error || error == ENODEV) {
1238 netdev->ether_addr_error = error;
1239 netdev->cache_valid |= VALID_ETHERADDR;
1241 memcpy(netdev->etheraddr, mac, ETH_ADDR_LEN);
/* If we downed a tap device above, bring it back up. */
1245 if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
1246 update_flags(netdev, 0, NETDEV_UP, &old_flags);
1250 ovs_mutex_unlock(&netdev->mutex);
1254 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1256 netdev_linux_get_etheraddr(const struct netdev *netdev_,
1257 uint8_t mac[ETH_ADDR_LEN])
1259 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1262 ovs_mutex_lock(&netdev->mutex);
/* On a cache miss, query the kernel once and remember both the address
 * and any error for subsequent calls. */
1263 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1264 netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
1266 netdev->cache_valid |= VALID_ETHERADDR;
1269 error = netdev->ether_addr_error;
1271 memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN);
1273 ovs_mutex_unlock(&netdev->mutex);
/* Reads 'netdev''s MTU with the SIOCGIFMTU ioctl, caching the value and any
 * error in the cache fields.  The caller is expected to hold
 * 'netdev->mutex' (see netdev_linux_get_mtu()). */
1279 netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
1283 if (!(netdev->cache_valid & VALID_MTU)) {
1286 netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
1287 netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
1288 netdev->mtu = ifr.ifr_mtu;
1289 netdev->cache_valid |= VALID_MTU;
1292 error = netdev->netdev_mtu_error;
1294 *mtup = netdev->mtu;
1300 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1301 * in bytes, not including the hardware header; thus, this is typically 1500
1302 * bytes for Ethernet devices. */
1304 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1306 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* Thread-safe wrapper around netdev_linux_get_mtu__(). */
1309 ovs_mutex_lock(&netdev->mutex);
1310 error = netdev_linux_get_mtu__(netdev, mtup);
1311 ovs_mutex_unlock(&netdev->mutex);
1316 /* Sets the maximum size of transmitted (MTU) for given device using linux
1317 * networking ioctl interface.
1320 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1322 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1326 ovs_mutex_lock(&netdev->mutex);
/* Skip the ioctl when the cached MTU already matches, or fail fast with the
 * previously cached error. */
1327 if (netdev->cache_valid & VALID_MTU) {
1328 error = netdev->netdev_mtu_error;
1329 if (error || netdev->mtu == mtu) {
1332 netdev->cache_valid &= ~VALID_MTU;
1335 error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
1336 SIOCSIFMTU, "SIOCSIFMTU");
/* Cache the new MTU (and the error) only on success or ENODEV, so that
 * transient failures can be retried on a later call. */
1337 if (!error || error == ENODEV) {
1338 netdev->netdev_mtu_error = error;
1339 netdev->mtu = ifr.ifr_mtu;
1340 netdev->cache_valid |= VALID_MTU;
1343 ovs_mutex_unlock(&netdev->mutex);
1347 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1348 * On failure, returns a negative errno value. */
1350 netdev_linux_get_ifindex(const struct netdev *netdev_)
1352 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1355 ovs_mutex_lock(&netdev->mutex);
1356 error = get_ifindex(netdev_, &ifindex);
1357 ovs_mutex_unlock(&netdev->mutex);
/* Unlike most functions in this file, failure is reported as a *negative*
 * errno so that success can be distinguished by sign. */
1359 return error ? -error : ifindex;
/* Stores 'netdev''s link state in '*carrier': the cached MII polling result
 * when miimon is enabled, otherwise the kernel's IFF_RUNNING flag. */
1363 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1365 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1367 ovs_mutex_lock(&netdev->mutex);
1368 if (netdev->miimon_interval > 0) {
1369 *carrier = netdev->miimon;
1371 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1373 ovs_mutex_unlock(&netdev->mutex);
/* Returns the number of carrier (link-state) changes recorded for 'netdev_'. */
1378 static long long int
1379 netdev_linux_get_carrier_resets(const struct netdev *netdev_)
1381 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1382 long long int carrier_resets;
1384 ovs_mutex_lock(&netdev->mutex);
1385 carrier_resets = netdev->carrier_resets;
1386 ovs_mutex_unlock(&netdev->mutex);
1388 return carrier_resets;
/* Issues MII ioctl 'cmd' ('cmd_name' is used only for logging) on device
 * 'name', copying '*data' into and back out of the ifreq. */
1392 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1393 struct mii_ioctl_data *data)
1398 memset(&ifr, 0, sizeof ifr);
/* NOTE(review): the mii_ioctl_data is copied into the storage occupied by
 * ifr.ifr_data itself (not through the pointer), matching the kernel's
 * convention of embedding MII data in the ifreq -- confirm against the
 * full source. */
1399 memcpy(&ifr.ifr_data, data, sizeof *data);
1400 error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
1401 memcpy(data, &ifr.ifr_data, sizeof *data);
/* Determines link status for device 'name', storing it in '*miimon'.  First
 * tries the MII BMSR register; if MII is unavailable, falls back to the
 * ETHTOOL_GLINK ioctl. */
1407 netdev_linux_get_miimon(const char *name, bool *miimon)
1409 struct mii_ioctl_data data;
1414 memset(&data, 0, sizeof data);
1415 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1417 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1418 data.reg_num = MII_BMSR;
1419 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
/* BMSR_LSTATUS bit set means the link is up. */
1423 *miimon = !!(data.val_out & BMSR_LSTATUS);
1425 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1428 struct ethtool_cmd ecmd;
1430 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1433 COVERAGE_INC(netdev_get_ethtool);
1434 memset(&ecmd, 0, sizeof ecmd);
1435 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1438 struct ethtool_value eval;
/* ETHTOOL_GLINK returns a struct ethtool_value laid over the
 * ethtool_cmd buffer; extract it by copy. */
1440 memcpy(&eval, &ecmd, sizeof eval);
1441 *miimon = !!eval.data;
1443 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
/* Sets the MII monitoring interval for 'netdev_' to 'interval' milliseconds
 * (0 disables polling).  Nonzero intervals are clamped to at least 100 ms. */
1451 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1452 long long int interval)
1454 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1456 ovs_mutex_lock(&netdev->mutex);
1457 interval = interval > 0 ? MAX(interval, 100) : 0;
1458 if (netdev->miimon_interval != interval) {
/* Maintain a global count of devices with miimon enabled, adjusting it
 * only on enable/disable transitions. */
1459 if (interval && !netdev->miimon_interval) {
1460 atomic_count_inc(&miimon_cnt);
1461 } else if (!interval && netdev->miimon_interval) {
1462 atomic_count_dec(&miimon_cnt);
1465 netdev->miimon_interval = interval;
/* Expire the timer so the new interval takes effect immediately. */
1466 timer_set_expired(&netdev->miimon_timer);
1468 ovs_mutex_unlock(&netdev->mutex);
/* Polls MII link status for every netdev-linux device whose miimon timer has
 * expired, notifying of any link-state change and rearming the timer. */
1474 netdev_linux_miimon_run(void)
1476 struct shash device_shash;
1477 struct shash_node *node;
1479 shash_init(&device_shash);
1480 netdev_get_devices(&netdev_linux_class, &device_shash);
1481 SHASH_FOR_EACH (node, &device_shash) {
1482 struct netdev *netdev = node->data;
1483 struct netdev_linux *dev = netdev_linux_cast(netdev);
1486 ovs_mutex_lock(&dev->mutex);
1487 if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
1488 netdev_linux_get_miimon(dev->up.name, &miimon);
1489 if (miimon != dev->miimon) {
1490 dev->miimon = miimon;
/* Signal the link-state transition to interested observers. */
1491 netdev_linux_changed(dev, dev->ifi_flags, 0);
1494 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1496 ovs_mutex_unlock(&dev->mutex);
/* netdev_get_devices() took a reference on each device; release it. */
1497 netdev_close(netdev);
1500 shash_destroy(&device_shash);
/* Arranges for the poll loop to wake up when the next miimon timer on any
 * netdev-linux device expires. */
1504 netdev_linux_miimon_wait(void)
1506 struct shash device_shash;
1507 struct shash_node *node;
1509 shash_init(&device_shash);
1510 netdev_get_devices(&netdev_linux_class, &device_shash);
1511 SHASH_FOR_EACH (node, &device_shash) {
1512 struct netdev *netdev = node->data;
1513 struct netdev_linux *dev = netdev_linux_cast(netdev);
1515 ovs_mutex_lock(&dev->mutex);
1516 if (dev->miimon_interval > 0) {
1517 timer_wait(&dev->miimon_timer);
1519 ovs_mutex_unlock(&dev->mutex);
/* Release the reference taken by netdev_get_devices(). */
1520 netdev_close(netdev);
1522 shash_destroy(&device_shash);
1526 swap_uint64(uint64_t *a, uint64_t *b)
1533 /* Copies 'src' into 'dst', performing format conversion in the process.
1535 * 'src' is allowed to be misaligned. */
1537 netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1538 const struct ovs_vport_stats *src)
1540 dst->rx_packets = get_32aligned_u64(&src->rx_packets);
1541 dst->tx_packets = get_32aligned_u64(&src->tx_packets);
1542 dst->rx_bytes = get_32aligned_u64(&src->rx_bytes);
1543 dst->tx_bytes = get_32aligned_u64(&src->tx_bytes);
1544 dst->rx_errors = get_32aligned_u64(&src->rx_errors);
1545 dst->tx_errors = get_32aligned_u64(&src->tx_errors);
1546 dst->rx_dropped = get_32aligned_u64(&src->rx_dropped);
1547 dst->tx_dropped = get_32aligned_u64(&src->tx_dropped);
/* The vport layer does not track the finer-grained error counters, so zero
 * them rather than leaving them uninitialized. */
1549 dst->collisions = 0;
1550 dst->rx_length_errors = 0;
1551 dst->rx_over_errors = 0;
1552 dst->rx_crc_errors = 0;
1553 dst->rx_frame_errors = 0;
1554 dst->rx_fifo_errors = 0;
1555 dst->rx_missed_errors = 0;
1556 dst->tx_aborted_errors = 0;
1557 dst->tx_carrier_errors = 0;
1558 dst->tx_fifo_errors = 0;
1559 dst->tx_heartbeat_errors = 0;
1560 dst->tx_window_errors = 0;
/* Queries the datapath for vport-level stats of 'netdev' and converts them
 * into '*stats'.  Returns the dpif_netlink_vport_get() error on failure;
 * (return paths for the no-stats case are elided in this excerpt). */
1564 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1566 struct dpif_netlink_vport reply;
1570 error = dpif_netlink_vport_get(netdev_get_name(netdev), &reply, &buf);
1573 } else if (!reply.stats) {
1578 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
/* Fills '*stats' from the vport layer, caching the resulting error status in
 * 'netdev->vport_stats_error' so repeated failures are not re-queried. */
1586 get_stats_via_vport(const struct netdev *netdev_,
1587 struct netdev_stats *stats)
1589 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* Retry while the last attempt succeeded or no attempt has been recorded
 * yet; a cached failure short-circuits the query. */
1591 if (!netdev->vport_stats_error ||
1592 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1595 error = get_stats_via_vport__(netdev_, stats);
/* ENOENT merely means the netdev is not an OVS vport; don't warn. */
1596 if (error && error != ENOENT) {
1597 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1599 netdev_get_name(netdev_), ovs_strerror(error));
1601 netdev->vport_stats_error = error;
1602 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
1606 /* Retrieves current device stats for 'netdev-linux'. */
1608 netdev_linux_get_stats(const struct netdev *netdev_,
1609 struct netdev_stats *stats)
1611 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1612 struct netdev_stats dev_stats;
1615 ovs_mutex_lock(&netdev->mutex);
/* Gather stats from both the OVS vport layer and the kernel netdev, then
 * merge them below. */
1616 get_stats_via_vport(netdev_, stats);
1617 error = get_stats_via_netlink(netdev_, &dev_stats);
1619 if (!netdev->vport_stats_error) {
1622 } else if (netdev->vport_stats_error) {
1623 /* stats not available from OVS then use netdev stats. */
1626 /* Use kernel netdev's packet and byte counts since vport's counters
1627 * do not reflect packet counts on the wire when GSO, TSO or GRO are
1629 stats->rx_packets = dev_stats.rx_packets;
1630 stats->rx_bytes = dev_stats.rx_bytes;
1631 stats->tx_packets = dev_stats.tx_packets;
1632 stats->tx_bytes = dev_stats.tx_bytes;
/* Accumulate the kernel's error/drop counters on top of the vport's. */
1634 stats->rx_errors += dev_stats.rx_errors;
1635 stats->tx_errors += dev_stats.tx_errors;
1636 stats->rx_dropped += dev_stats.rx_dropped;
1637 stats->tx_dropped += dev_stats.tx_dropped;
1638 stats->multicast += dev_stats.multicast;
1639 stats->collisions += dev_stats.collisions;
1640 stats->rx_length_errors += dev_stats.rx_length_errors;
1641 stats->rx_over_errors += dev_stats.rx_over_errors;
1642 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1643 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1644 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1645 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1646 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1647 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1648 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1649 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1650 stats->tx_window_errors += dev_stats.tx_window_errors;
1652 ovs_mutex_unlock(&netdev->mutex);
1657 /* Retrieves current device stats for 'netdev-tap' netdev or
1658 * netdev-internal. */
1660 netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
1662 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1663 struct netdev_stats dev_stats;
1666 ovs_mutex_lock(&netdev->mutex);
1667 get_stats_via_vport(netdev_, stats);
1668 error = get_stats_via_netlink(netdev_, &dev_stats);
1670 if (!netdev->vport_stats_error) {
1673 } else if (netdev->vport_stats_error) {
1674 /* Transmit and receive stats will appear to be swapped relative to the
1675 * other ports since we are the one sending the data, not a remote
1676 * computer. For consistency, we swap them back here. This does not
1677 * apply if we are getting stats from the vport layer because it always
1678 * tracks stats from the perspective of the switch. */
1681 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1682 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1683 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1684 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
/* The finer-grained counters are not meaningful after swapping; clear
 * them. */
1685 stats->rx_length_errors = 0;
1686 stats->rx_over_errors = 0;
1687 stats->rx_crc_errors = 0;
1688 stats->rx_frame_errors = 0;
1689 stats->rx_fifo_errors = 0;
1690 stats->rx_missed_errors = 0;
1691 stats->tx_aborted_errors = 0;
1692 stats->tx_carrier_errors = 0;
1693 stats->tx_fifo_errors = 0;
1694 stats->tx_heartbeat_errors = 0;
1695 stats->tx_window_errors = 0;
1697 /* Use kernel netdev's packet and byte counts since vport counters
1698 * do not reflect packet counts on the wire when GSO, TSO or GRO
/* Note the rx/tx cross-assignment: kernel counters are swapped into the
 * switch's perspective, as explained above. */
1700 stats->rx_packets = dev_stats.tx_packets;
1701 stats->rx_bytes = dev_stats.tx_bytes;
1702 stats->tx_packets = dev_stats.rx_packets;
1703 stats->tx_bytes = dev_stats.rx_bytes;
1705 stats->rx_dropped += dev_stats.tx_dropped;
1706 stats->tx_dropped += dev_stats.rx_dropped;
1708 stats->rx_errors += dev_stats.tx_errors;
1709 stats->tx_errors += dev_stats.rx_errors;
1711 stats->multicast += dev_stats.multicast;
1712 stats->collisions += dev_stats.collisions;
1714 ovs_mutex_unlock(&netdev->mutex);
/* Retrieves stats for an internal device purely from the vport layer,
 * returning the cached vport stats error as the result. */
1720 netdev_internal_get_stats(const struct netdev *netdev_,
1721 struct netdev_stats *stats)
1723 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1726 ovs_mutex_lock(&netdev->mutex);
1727 get_stats_via_vport(netdev_, stats);
1728 error = netdev->vport_stats_error;
1729 ovs_mutex_unlock(&netdev->mutex);
/* Queries 'netdev''s link features with the ETHTOOL_GSET ioctl and caches the
 * translated NETDEV_F_* bitmaps in netdev->supported, ->advertised and
 * ->current (plus any error in ->get_features_error).  Callers are expected
 * to hold 'netdev->mutex' (see netdev_linux_get_features()). */
1735 netdev_linux_read_features(struct netdev_linux *netdev)
1737 struct ethtool_cmd ecmd;
/* Already cached: nothing to do. */
1741 if (netdev->cache_valid & VALID_FEATURES) {
1745 COVERAGE_INC(netdev_get_ethtool);
1746 memset(&ecmd, 0, sizeof ecmd);
1747 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
1748 ETHTOOL_GSET, "ETHTOOL_GSET");
1753 /* Supported features. */
1754 netdev->supported = 0;
1755 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1756 netdev->supported |= NETDEV_F_10MB_HD;
1758 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1759 netdev->supported |= NETDEV_F_10MB_FD;
1761 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1762 netdev->supported |= NETDEV_F_100MB_HD;
1764 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1765 netdev->supported |= NETDEV_F_100MB_FD;
1767 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1768 netdev->supported |= NETDEV_F_1GB_HD;
1770 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1771 netdev->supported |= NETDEV_F_1GB_FD;
1773 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1774 netdev->supported |= NETDEV_F_10GB_FD;
1776 if (ecmd.supported & SUPPORTED_TP) {
1777 netdev->supported |= NETDEV_F_COPPER;
1779 if (ecmd.supported & SUPPORTED_FIBRE) {
1780 netdev->supported |= NETDEV_F_FIBER;
1782 if (ecmd.supported & SUPPORTED_Autoneg) {
1783 netdev->supported |= NETDEV_F_AUTONEG;
1785 if (ecmd.supported & SUPPORTED_Pause) {
1786 netdev->supported |= NETDEV_F_PAUSE;
1788 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1789 netdev->supported |= NETDEV_F_PAUSE_ASYM;
1792 /* Advertised features. */
1793 netdev->advertised = 0;
1794 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1795 netdev->advertised |= NETDEV_F_10MB_HD;
1797 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1798 netdev->advertised |= NETDEV_F_10MB_FD;
1800 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1801 netdev->advertised |= NETDEV_F_100MB_HD;
1803 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1804 netdev->advertised |= NETDEV_F_100MB_FD;
1806 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1807 netdev->advertised |= NETDEV_F_1GB_HD;
1809 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1810 netdev->advertised |= NETDEV_F_1GB_FD;
1812 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1813 netdev->advertised |= NETDEV_F_10GB_FD;
1815 if (ecmd.advertising & ADVERTISED_TP) {
1816 netdev->advertised |= NETDEV_F_COPPER;
1818 if (ecmd.advertising & ADVERTISED_FIBRE) {
1819 netdev->advertised |= NETDEV_F_FIBER;
1821 if (ecmd.advertising & ADVERTISED_Autoneg) {
1822 netdev->advertised |= NETDEV_F_AUTONEG;
1824 if (ecmd.advertising & ADVERTISED_Pause) {
1825 netdev->advertised |= NETDEV_F_PAUSE;
1827 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1828 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
1831 /* Current settings. */
1833 if (speed == SPEED_10) {
1834 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1835 } else if (speed == SPEED_100) {
1836 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1837 } else if (speed == SPEED_1000) {
1838 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1839 } else if (speed == SPEED_10000) {
1840 netdev->current = NETDEV_F_10GB_FD;
/* Raw numeric speeds are compared here, presumably because the
 * corresponding SPEED_* macros may be missing from older kernel
 * headers -- TODO confirm against full source. */
1841 } else if (speed == 40000) {
1842 netdev->current = NETDEV_F_40GB_FD;
1843 } else if (speed == 100000) {
1844 netdev->current = NETDEV_F_100GB_FD;
1845 } else if (speed == 1000000) {
1846 netdev->current = NETDEV_F_1TB_FD;
1848 netdev->current = 0;
1851 if (ecmd.port == PORT_TP) {
1852 netdev->current |= NETDEV_F_COPPER;
1853 } else if (ecmd.port == PORT_FIBRE) {
1854 netdev->current |= NETDEV_F_FIBER;
1858 netdev->current |= NETDEV_F_AUTONEG;
/* Cache the result (including any ethtool error) for later calls. */
1862 netdev->cache_valid |= VALID_FEATURES;
1863 netdev->get_features_error = error;
1866 /* Stores the features supported by 'netdev' into of '*current', '*advertised',
1867 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
1868 * Returns 0 if successful, otherwise a positive errno value. */
1870 netdev_linux_get_features(const struct netdev *netdev_,
1871 enum netdev_features *current,
1872 enum netdev_features *advertised,
1873 enum netdev_features *supported,
1874 enum netdev_features *peer)
1876 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1879 ovs_mutex_lock(&netdev->mutex);
/* Populate (or refresh) the cached feature bitmaps, then copy them out. */
1880 netdev_linux_read_features(netdev);
1881 if (!netdev->get_features_error) {
1882 *current = netdev->current;
1883 *advertised = netdev->advertised;
1884 *supported = netdev->supported;
1885 *peer = 0; /* XXX */
1887 error = netdev->get_features_error;
1888 ovs_mutex_unlock(&netdev->mutex);
1893 /* Set the features advertised by 'netdev' to 'advertise'. */
1895 netdev_linux_set_advertisements(struct netdev *netdev_,
1896 enum netdev_features advertise)
1898 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1899 struct ethtool_cmd ecmd;
1902 ovs_mutex_lock(&netdev->mutex);
/* Read-modify-write: fetch the current ethtool settings, replace only the
 * advertising mask, and write them back with ETHTOOL_SSET. */
1904 COVERAGE_INC(netdev_get_ethtool);
1905 memset(&ecmd, 0, sizeof ecmd);
1906 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1907 ETHTOOL_GSET, "ETHTOOL_GSET");
1912 ecmd.advertising = 0;
1913 if (advertise & NETDEV_F_10MB_HD) {
1914 ecmd.advertising |= ADVERTISED_10baseT_Half;
1916 if (advertise & NETDEV_F_10MB_FD) {
1917 ecmd.advertising |= ADVERTISED_10baseT_Full;
1919 if (advertise & NETDEV_F_100MB_HD) {
1920 ecmd.advertising |= ADVERTISED_100baseT_Half;
1922 if (advertise & NETDEV_F_100MB_FD) {
1923 ecmd.advertising |= ADVERTISED_100baseT_Full;
1925 if (advertise & NETDEV_F_1GB_HD) {
1926 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1928 if (advertise & NETDEV_F_1GB_FD) {
1929 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1931 if (advertise & NETDEV_F_10GB_FD) {
1932 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1934 if (advertise & NETDEV_F_COPPER) {
1935 ecmd.advertising |= ADVERTISED_TP;
1937 if (advertise & NETDEV_F_FIBER) {
1938 ecmd.advertising |= ADVERTISED_FIBRE;
1940 if (advertise & NETDEV_F_AUTONEG) {
1941 ecmd.advertising |= ADVERTISED_Autoneg;
1943 if (advertise & NETDEV_F_PAUSE) {
1944 ecmd.advertising |= ADVERTISED_Pause;
1946 if (advertise & NETDEV_F_PAUSE_ASYM) {
1947 ecmd.advertising |= ADVERTISED_Asym_Pause;
1949 COVERAGE_INC(netdev_set_ethtool);
1950 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1951 ETHTOOL_SSET, "ETHTOOL_SSET");
1954 ovs_mutex_unlock(&netdev->mutex);
1958 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1959 * successful, otherwise a positive errno value. */
1961 netdev_linux_set_policing(struct netdev *netdev_,
1962 uint32_t kbits_rate, uint32_t kbits_burst)
1964 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1965 const char *netdev_name = netdev_get_name(netdev_);
1968 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1969 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1970 : kbits_burst); /* Stick with user-specified value. */
1972 ovs_mutex_lock(&netdev->mutex);
1973 if (netdev->cache_valid & VALID_POLICING) {
1974 error = netdev->netdev_policing_error;
1975 if (error || (netdev->kbits_rate == kbits_rate &&
1976 netdev->kbits_burst == kbits_burst)) {
1977 /* Assume that settings haven't changed since we last set them. */
1980 netdev->cache_valid &= ~VALID_POLICING;
1983 COVERAGE_INC(netdev_set_policing);
1984 /* Remove any existing ingress qdisc. */
1985 error = tc_add_del_ingress_qdisc(netdev_, false);
1987 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1988 netdev_name, ovs_strerror(error));
/* Install a fresh ingress qdisc and attach the policer action to it. */
1993 error = tc_add_del_ingress_qdisc(netdev_, true);
1995 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1996 netdev_name, ovs_strerror(error));
2000 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
2002 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
2003 netdev_name, ovs_strerror(error));
2008 netdev->kbits_rate = kbits_rate;
2009 netdev->kbits_burst = kbits_burst;
/* Cache the outcome only on success or ENODEV so transient failures can
 * be retried on a later call. */
2012 if (!error || error == ENODEV) {
2013 netdev->netdev_policing_error = error;
2014 netdev->cache_valid |= VALID_POLICING;
2016 ovs_mutex_unlock(&netdev->mutex);
/* Adds to 'types' the OVS name of every installable QoS discipline that has a
 * non-empty external (ovs_name) identifier. */
2021 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
2024 const struct tc_ops *const *opsp;
2026 for (opsp = tcs; *opsp != NULL; opsp++) {
2027 const struct tc_ops *ops = *opsp;
2028 if (ops->tc_install && ops->ovs_name[0] != '\0') {
2029 sset_add(types, ops->ovs_name);
/* Returns the tc_ops whose OVS-facing name is 'name' (return of the match,
 * and the not-found path, are elided in this excerpt). */
2035 static const struct tc_ops *
2036 tc_lookup_ovs_name(const char *name)
2038 const struct tc_ops *const *opsp;
2040 for (opsp = tcs; *opsp != NULL; opsp++) {
2041 const struct tc_ops *ops = *opsp;
2042 if (!strcmp(name, ops->ovs_name)) {
/* Returns the tc_ops whose Linux qdisc name is 'name'; disciplines with a
 * NULL linux_name are skipped. */
2049 static const struct tc_ops *
2050 tc_lookup_linux_name(const char *name)
2052 const struct tc_ops *const *opsp;
2054 for (opsp = tcs; *opsp != NULL; opsp++) {
2055 const struct tc_ops *ops = *opsp;
2056 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Searches the hash bucket for 'hash' in 'netdev''s queue map for the queue
 * with id 'queue_id'.  'hash' must be hash_int(queue_id, 0). */
2063 static struct tc_queue *
2064 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
2067 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2068 struct tc_queue *queue;
2070 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
2071 if (queue->queue_id == queue_id) {
/* Convenience wrapper that hashes 'queue_id' for tc_find_queue__(). */
2078 static struct tc_queue *
2079 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
2081 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Fills '*caps' with the capabilities (currently just the queue count) of QoS
 * type 'type'; the error path for an unknown type is elided in this excerpt. */
2085 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
2087 struct netdev_qos_capabilities *caps)
2089 const struct tc_ops *ops = tc_lookup_ovs_name(type);
2093 caps->n_queues = ops->n_queues;
/* Stores the OVS name of 'netdev_''s current qdisc in '*typep' and its
 * configuration in 'details' (via the discipline's qdisc_get hook, if any). */
2098 netdev_linux_get_qos(const struct netdev *netdev_,
2099 const char **typep, struct smap *details)
2101 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2104 ovs_mutex_lock(&netdev->mutex);
/* Make sure netdev->tc reflects the kernel's actual qdisc first. */
2105 error = tc_query_qdisc(netdev_);
2107 *typep = netdev->tc->ops->ovs_name;
2108 error = (netdev->tc->ops->qdisc_get
2109 ? netdev->tc->ops->qdisc_get(netdev_, details)
2112 ovs_mutex_unlock(&netdev->mutex);
/* Configures 'netdev_' to use QoS discipline 'type' with parameters
 * 'details'.  Reconfigures in place when the type is unchanged; otherwise
 * deletes the existing qdisc and installs a new one. */
2118 netdev_linux_set_qos(struct netdev *netdev_,
2119 const char *type, const struct smap *details)
2121 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2122 const struct tc_ops *new_ops;
2125 new_ops = tc_lookup_ovs_name(type);
/* Reject unknown types and types that cannot be installed from OVS. */
2126 if (!new_ops || !new_ops->tc_install) {
2130 ovs_mutex_lock(&netdev->mutex);
2131 error = tc_query_qdisc(netdev_);
2136 if (new_ops == netdev->tc->ops) {
2137 error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
2139 /* Delete existing qdisc. */
2140 error = tc_del_qdisc(netdev_);
2144 ovs_assert(netdev->tc == NULL);
2146 /* Install new qdisc. */
2147 error = new_ops->tc_install(netdev_, details);
2148 ovs_assert((error == 0) == (netdev->tc != NULL));
2152 ovs_mutex_unlock(&netdev->mutex);
/* Retrieves the configuration of queue 'queue_id' on 'netdev_' into 'details'
 * via the current discipline's class_get hook. */
2157 netdev_linux_get_queue(const struct netdev *netdev_,
2158 unsigned int queue_id, struct smap *details)
2160 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2163 ovs_mutex_lock(&netdev->mutex);
2164 error = tc_query_qdisc(netdev_);
2166 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2168 ? netdev->tc->ops->class_get(netdev_, queue, details)
2171 ovs_mutex_unlock(&netdev->mutex);
/* Configures queue 'queue_id' on 'netdev_' from 'details', provided the id is
 * within the discipline's queue range and it supports class_set. */
2177 netdev_linux_set_queue(struct netdev *netdev_,
2178 unsigned int queue_id, const struct smap *details)
2180 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2183 ovs_mutex_lock(&netdev->mutex);
2184 error = tc_query_qdisc(netdev_);
2186 error = (queue_id < netdev->tc->ops->n_queues
2187 && netdev->tc->ops->class_set
2188 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
2191 ovs_mutex_unlock(&netdev->mutex);
/* Deletes queue 'queue_id' from 'netdev_' using the current discipline's
 * class_delete hook, if it has one. */
2197 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
2199 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2202 ovs_mutex_lock(&netdev->mutex);
2203 error = tc_query_qdisc(netdev_);
2205 if (netdev->tc->ops->class_delete) {
2206 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2208 ? netdev->tc->ops->class_delete(netdev_, queue)
2214 ovs_mutex_unlock(&netdev->mutex);
/* Retrieves statistics for queue 'queue_id' on 'netdev_' into '*stats' via
 * the discipline's class_get_stats hook; also fills stats->created from the
 * queue's recorded creation time. */
2220 netdev_linux_get_queue_stats(const struct netdev *netdev_,
2221 unsigned int queue_id,
2222 struct netdev_queue_stats *stats)
2224 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2227 ovs_mutex_lock(&netdev->mutex);
2228 error = tc_query_qdisc(netdev_);
2230 if (netdev->tc->ops->class_get_stats) {
2231 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2233 stats->created = queue->created;
2234 error = netdev->tc->ops->class_get_stats(netdev_, queue,
2243 ovs_mutex_unlock(&netdev->mutex);
/* State carried through an rtnetlink dump of a netdev's tc classes
 * (additional members are elided in this excerpt; a reply buffer 'buf' is
 * used by the dump loop below). */
2248 struct queue_dump_state {
2249 struct nl_dump dump;
/* Begins an RTM_GETTCLASS dump of the tc classes on 'netdev', initializing
 * '*state' (the failure return for a NULL tcmsg is elided in this excerpt). */
2254 start_queue_dump(const struct netdev *netdev, struct queue_dump_state *state)
2256 struct ofpbuf request;
2257 struct tcmsg *tcmsg;
2259 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
2263 tcmsg->tcm_parent = 0;
2264 nl_dump_start(&state->dump, NETLINK_ROUTE, &request);
/* nl_dump_start() copies the request, so the local buffer can go. */
2265 ofpbuf_uninit(&request);
2267 ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
/* Releases the dump buffer and completes the netlink dump, returning the
 * status from nl_dump_done(). */
2272 finish_queue_dump(struct queue_dump_state *state)
2274 ofpbuf_uninit(&state->buf);
2275 return nl_dump_done(&state->dump);
/* Iteration state for the queue-dump API: a snapshot of queue ids plus a
 * cursor (remaining members elided in this excerpt: 'n_queues' and
 * 'cur_queue', used below). */
2278 struct netdev_linux_queue_state {
2279 unsigned int *queues;
/* Starts a queue dump on 'netdev_' by snapshotting the ids of all currently
 * known queues into a freshly allocated state stored in '*statep'. */
2285 netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
2287 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2290 ovs_mutex_lock(&netdev->mutex);
2291 error = tc_query_qdisc(netdev_);
2293 if (netdev->tc->ops->class_get) {
2294 struct netdev_linux_queue_state *state;
2295 struct tc_queue *queue;
2298 *statep = state = xmalloc(sizeof *state);
2299 state->n_queues = hmap_count(&netdev->tc->queues);
2300 state->cur_queue = 0;
2301 state->queues = xmalloc(state->n_queues * sizeof *state->queues);
/* Copy the ids now so the dump survives later queue changes. */
2304 HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
2305 state->queues[i++] = queue->queue_id;
2311 ovs_mutex_unlock(&netdev->mutex);
/* Advances the dump to the next snapshotted queue that still exists, storing
 * its id in '*queue_idp' and its configuration in 'details'.  Queues deleted
 * since the snapshot are skipped silently. */
2317 netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
2318 unsigned int *queue_idp, struct smap *details)
2320 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2321 struct netdev_linux_queue_state *state = state_;
2324 ovs_mutex_lock(&netdev->mutex);
2325 while (state->cur_queue < state->n_queues) {
2326 unsigned int queue_id = state->queues[state->cur_queue++];
2327 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2330 *queue_idp = queue_id;
2331 error = netdev->tc->ops->class_get(netdev_, queue, details);
2335 ovs_mutex_unlock(&netdev->mutex);
/* Frees the queue-dump state allocated by netdev_linux_queue_dump_start(). */
2341 netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
2344 struct netdev_linux_queue_state *state = state_;
2346 free(state->queues);
/* Invokes 'cb' with statistics for each queue on 'netdev_', driving an
 * rtnetlink class dump through the discipline's class_dump_stats hook. */
2352 netdev_linux_dump_queue_stats(const struct netdev *netdev_,
2353 netdev_dump_queue_stats_cb *cb, void *aux)
2355 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2358 ovs_mutex_lock(&netdev->mutex);
2359 error = tc_query_qdisc(netdev_);
2361 struct queue_dump_state state;
2363 if (!netdev->tc->ops->class_dump_stats) {
2365 } else if (!start_queue_dump(netdev_, &state)) {
/* Per-message errors are collected, but the dump continues. */
2371 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
2372 retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
2379 retval = finish_queue_dump(&state);
2385 ovs_mutex_unlock(&netdev->mutex);
/* Retrieves 'netdev_''s IPv4 address and netmask into '*address' and
 * '*netmask', caching both; returns EADDRNOTAVAIL when no address is
 * assigned. */
2391 netdev_linux_get_in4(const struct netdev *netdev_,
2392 struct in_addr *address, struct in_addr *netmask)
2394 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2397 ovs_mutex_lock(&netdev->mutex);
2398 if (!(netdev->cache_valid & VALID_IN4)) {
2399 error = netdev_linux_get_ipv4(netdev_, &netdev->address,
2400 SIOCGIFADDR, "SIOCGIFADDR");
/* Only fetch the netmask once the address lookup succeeded. */
2402 error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
2403 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2405 netdev->cache_valid |= VALID_IN4;
2413 if (netdev->address.s_addr != INADDR_ANY) {
2414 *address = netdev->address;
2415 *netmask = netdev->netmask;
2417 error = EADDRNOTAVAIL;
2420 ovs_mutex_unlock(&netdev->mutex);
/* Assigns IPv4 'address'/'netmask' to 'netdev_' with SIOCSIFADDR (and, for a
 * non-zero address, SIOCSIFNETMASK), updating the cached values. */
2426 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2427 struct in_addr netmask)
2429 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2432 ovs_mutex_lock(&netdev->mutex);
2433 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2435 netdev->cache_valid |= VALID_IN4;
2436 netdev->address = address;
2437 netdev->netmask = netmask;
/* Setting a netmask on INADDR_ANY makes no sense; skip it. */
2438 if (address.s_addr != INADDR_ANY) {
2439 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2440 "SIOCSIFNETMASK", netmask);
2443 ovs_mutex_unlock(&netdev->mutex);
/* Parses one line of /proc/net/if_inet6 into the 16 address bytes of '*in6'
 * and the interface name 'ifname'; returns the result of ovs_scan() (nonzero
 * only if the whole line matched). */
2449 parse_if_inet6_line(const char *line,
2450 struct in6_addr *in6, char ifname[16 + 1])
2452 uint8_t *s6 = in6->s6_addr;
2453 #define X8 "%2"SCNx8
2454 return ovs_scan(line,
2455 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
2456 "%*x %*x %*x %*x %16s\n",
2457 &s6[0], &s6[1], &s6[2], &s6[3],
2458 &s6[4], &s6[5], &s6[6], &s6[7],
2459 &s6[8], &s6[9], &s6[10], &s6[11],
2460 &s6[12], &s6[13], &s6[14], &s6[15],
2464 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2465 * 'in6' is non-null) and returns true. Otherwise, returns false. */
2467 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2469 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2471 ovs_mutex_lock(&netdev->mutex);
2472 if (!(netdev->cache_valid & VALID_IN6)) {
/* Default to the unspecified address; overwritten below if a matching
 * /proc/net/if_inet6 entry is found. */
2476 netdev->in6 = in6addr_any;
2478 file = fopen("/proc/net/if_inet6", "r");
2480 const char *name = netdev_get_name(netdev_);
2481 while (fgets(line, sizeof line, file)) {
2482 struct in6_addr in6_tmp;
2483 char ifname[16 + 1];
2484 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2485 && !strcmp(name, ifname))
2487 netdev->in6 = in6_tmp;
2493 netdev->cache_valid |= VALID_IN6;
2496 ovs_mutex_unlock(&netdev->mutex);
/* Fills '*sa' with an AF_INET sockaddr holding 'addr', zeroing any trailing
 * bytes beyond sizeof(struct sockaddr_in). */
2502 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
2504 struct sockaddr_in sin;
2505 memset(&sin, 0, sizeof sin);
2506 sin.sin_family = AF_INET;
2507 sin.sin_addr = addr;
2510 memset(sa, 0, sizeof *sa);
2511 memcpy(sa, &sin, sizeof sin);
/* Issues address-setting ioctl 'ioctl_nr' ('ioctl_name' for logging) on
 * 'netdev' with 'addr' packed into the ifreq's sockaddr. */
2515 do_set_addr(struct netdev *netdev,
2516 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2520 make_in4_sockaddr(&ifr.ifr_addr, addr);
2521 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2525 /* Adds 'router' as a default IP gateway. */
2527 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2529 struct in_addr any = { INADDR_ANY };
/* Build a default route (dst 0.0.0.0/0) through 'router' and install it
 * with the SIOCADDRT routing ioctl. */
2533 memset(&rt, 0, sizeof rt);
2534 make_in4_sockaddr(&rt.rt_dst, any);
2535 make_in4_sockaddr(&rt.rt_gateway, router);
2536 make_in4_sockaddr(&rt.rt_genmask, any);
2537 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2538 error = af_inet_ioctl(SIOCADDRT, &rt);
2540 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
/* Looks up the route to 'host' by scanning /proc/net/route.  On a match,
 * stores the next-hop address in '*next_hop' (0 if directly reachable) and
 * a malloc'd copy of the output interface name in '*netdev_name'. */
2546 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2549 static const char fn[] = "/proc/net/route";
2554 *netdev_name = NULL;
2555 stream = fopen(fn, "r");
2556 if (stream == NULL) {
2557 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
2562 while (fgets(line, sizeof line, stream)) {
2565 ovs_be32 dest, gateway, mask;
2566 int refcnt, metric, mtu;
2567 unsigned int flags, use, window, irtt;
/* /proc/net/route fields: Iface Dest Gateway Flags RefCnt Use Metric
 * Mask MTU Window IRTT. */
2570 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2572 iface, &dest, &gateway, &flags, &refcnt,
2573 &use, &metric, &mask, &mtu, &window, &irtt)) {
2574 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2578 if (!(flags & RTF_UP)) {
2579 /* Skip routes that aren't up. */
2583 /* The output of 'dest', 'mask', and 'gateway' were given in
2584 * network byte order, so we don't need any endian
2585 * conversions here. */
2586 if ((dest & mask) == (host->s_addr & mask)) {
2588 /* The host is directly reachable. */
2589 next_hop->s_addr = 0;
2591 /* To reach the host, we must go through a gateway. */
2592 next_hop->s_addr = gateway;
/* Caller owns (and must free) the returned interface name. */
2594 *netdev_name = xstrdup(iface);
/* Fills 'smap' with driver name/version and firmware version for 'netdev_',
 * using a cached copy of the ETHTOOL_GDRVINFO results. */
2606 netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
2608 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2611 ovs_mutex_lock(&netdev->mutex);
2612 if (!(netdev->cache_valid & VALID_DRVINFO)) {
/* ethtool takes a struct ethtool_cmd; drvinfo is overlaid onto it. */
2613 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
2615 COVERAGE_INC(netdev_get_ethtool);
2616 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2617 error = netdev_linux_do_ethtool(netdev->up.name,
2620 "ETHTOOL_GDRVINFO");
2622 netdev->cache_valid |= VALID_DRVINFO;
2627 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2628 smap_add(smap, "driver_version", netdev->drvinfo.version);
2629 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
2631 ovs_mutex_unlock(&netdev->mutex);
/* Status for "internal" devices: the driver is Open vSwitch itself. */
2637 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2640 smap_add(smap, "driver_name", "openvswitch");
2644 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2645 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2646 * returns 0. Otherwise, it returns a positive errno value; in particular,
2647 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2649 netdev_linux_arp_lookup(const struct netdev *netdev,
2650 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2653 struct sockaddr_in sin;
/* Build the SIOCGARP request: protocol address 'ip', hardware type
 * Ethernet, restricted to this device. */
2656 memset(&r, 0, sizeof r);
2657 memset(&sin, 0, sizeof sin);
2658 sin.sin_family = AF_INET;
2659 sin.sin_addr.s_addr = ip;
2661 memcpy(&r.arp_pa, &sin, sizeof sin);
2662 r.arp_ha.sa_family = ARPHRD_ETHER;
2664 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2665 COVERAGE_INC(netdev_arp_lookup);
2666 retval = af_inet_ioctl(SIOCGARP, &r);
2668 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
/* ENXIO just means "no entry", so only warn about other errors. */
2669 } else if (retval != ENXIO) {
2670 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2671 netdev_get_name(netdev), IP_ARGS(ip),
2672 ovs_strerror(retval));
2678 nd_to_iff_flags(enum netdev_flags nd)
2681 if (nd & NETDEV_UP) {
2684 if (nd & NETDEV_PROMISC) {
2687 if (nd & NETDEV_LOOPBACK) {
2688 iff |= IFF_LOOPBACK;
2694 iff_to_nd_flags(int iff)
2696 enum netdev_flags nd = 0;
2700 if (iff & IFF_PROMISC) {
2701 nd |= NETDEV_PROMISC;
2703 if (iff & IFF_LOOPBACK) {
2704 nd |= NETDEV_LOOPBACK;
/* Clears the IFF_* equivalents of 'off' and sets those of 'on' on the kernel
 * interface, storing the previous flags (as NETDEV_* bits) in '*old_flagsp'.
 * Caller must hold netdev->mutex. */
2710 update_flags(struct netdev_linux *netdev, enum netdev_flags off,
2711 enum netdev_flags on, enum netdev_flags *old_flagsp)
2712 OVS_REQUIRES(netdev->mutex)
2714 int old_flags, new_flags;
2717 old_flags = netdev->ifi_flags;
2718 *old_flagsp = iff_to_nd_flags(old_flags);
2719 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
/* Only touch the kernel when something actually changes; re-read the
 * flags afterward so the cached copy reflects reality. */
2720 if (new_flags != old_flags) {
2721 error = set_flags(netdev_get_name(&netdev->up), new_flags);
2722 get_flags(&netdev->up, &netdev->ifi_flags);
/* Public entry point: same as update_flags() but takes the mutex itself. */
2729 netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2730 enum netdev_flags on, enum netdev_flags *old_flagsp)
2732 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2735 ovs_mutex_lock(&netdev->mutex);
2736 error = update_flags(netdev, off, on, old_flagsp);
2737 ovs_mutex_unlock(&netdev->mutex);
/* Expands to a struct netdev_class initializer shared by the "system",
 * "tap", and "internal" classes defined below.  NAME, CONSTRUCT, GET_STATS,
 * GET_FEATURES, and GET_STATUS supply the per-class entries; everything else
 * is the common netdev-linux implementation (NULL where Linux netdevs have
 * no implementation, e.g. tunnel-related callbacks).  No comments may be
 * inserted between the lines below: each line is continued with '\'. */
2742 #define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS, \
2743 GET_FEATURES, GET_STATUS) \
2749 netdev_linux_wait, \
2751 netdev_linux_alloc, \
2753 netdev_linux_destruct, \
2754 netdev_linux_dealloc, \
2755 NULL, /* get_config */ \
2756 NULL, /* set_config */ \
2757 NULL, /* get_tunnel_config */ \
2758 NULL, /* build header */ \
2759 NULL, /* push header */ \
2760 NULL, /* pop header */ \
2761 NULL, /* get_numa_id */ \
2762 NULL, /* set_multiq */ \
2764 netdev_linux_send, \
2765 netdev_linux_send_wait, \
2767 netdev_linux_set_etheraddr, \
2768 netdev_linux_get_etheraddr, \
2769 netdev_linux_get_mtu, \
2770 netdev_linux_set_mtu, \
2771 netdev_linux_get_ifindex, \
2772 netdev_linux_get_carrier, \
2773 netdev_linux_get_carrier_resets, \
2774 netdev_linux_set_miimon_interval, \
2778 netdev_linux_set_advertisements, \
2780 netdev_linux_set_policing, \
2781 netdev_linux_get_qos_types, \
2782 netdev_linux_get_qos_capabilities, \
2783 netdev_linux_get_qos, \
2784 netdev_linux_set_qos, \
2785 netdev_linux_get_queue, \
2786 netdev_linux_set_queue, \
2787 netdev_linux_delete_queue, \
2788 netdev_linux_get_queue_stats, \
2789 netdev_linux_queue_dump_start, \
2790 netdev_linux_queue_dump_next, \
2791 netdev_linux_queue_dump_done, \
2792 netdev_linux_dump_queue_stats, \
2794 netdev_linux_get_in4, \
2795 netdev_linux_set_in4, \
2796 netdev_linux_get_in6, \
2797 netdev_linux_add_router, \
2798 netdev_linux_get_next_hop, \
2800 netdev_linux_arp_lookup, \
2802 netdev_linux_update_flags, \
2804 netdev_linux_rxq_alloc, \
2805 netdev_linux_rxq_construct, \
2806 netdev_linux_rxq_destruct, \
2807 netdev_linux_rxq_dealloc, \
2808 netdev_linux_rxq_recv, \
2809 netdev_linux_rxq_wait, \
2810 netdev_linux_rxq_drain, \
/* Ordinary Linux network devices ("system" type in the database). */
2813 const struct netdev_class netdev_linux_class =
2816 netdev_linux_construct,
2817 netdev_linux_get_stats,
2818 netdev_linux_get_features,
2819 netdev_linux_get_status);
/* TAP devices: same implementation except construction and stats. */
2821 const struct netdev_class netdev_tap_class =
2824 netdev_linux_construct_tap,
2825 netdev_tap_get_stats,
2826 netdev_linux_get_features,
2827 netdev_linux_get_status);
/* OVS "internal" ports; features are not meaningful for them. */
2829 const struct netdev_class netdev_internal_class =
2832 netdev_linux_construct,
2833 netdev_internal_get_stats,
2834 NULL, /* get_features */
2835 netdev_internal_get_status);
2837 /* HTB traffic control class. */
2839 #define HTB_N_QUEUES 0xf000
/* Qdisc-level state: the link's configured ceiling. */
2843 unsigned int max_rate; /* In bytes/s. */
/* Per-queue (HTB class) state; embeds the generic tc_queue node. */
2847 struct tc_queue tc_queue;
2848 unsigned int min_rate; /* In bytes/s. */
2849 unsigned int max_rate; /* In bytes/s. */
2850 unsigned int burst; /* In bytes. */
2851 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the struct htb embedded in 'netdev_''s tc pointer. */
2855 htb_get__(const struct netdev *netdev_)
2857 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2858 return CONTAINER_OF(netdev->tc, struct htb, tc);
/* Allocates and installs a struct htb with ceiling 'max_rate' on 'netdev_'. */
2862 htb_install__(struct netdev *netdev_, uint64_t max_rate)
2864 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2867 htb = xmalloc(sizeof *htb);
2868 tc_init(&htb->tc, &tc_ops_htb);
2869 htb->max_rate = max_rate;
2871 netdev->tc = &htb->tc;
2874 /* Create an HTB qdisc.
2876 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2878 htb_setup_qdisc__(struct netdev *netdev)
2881 struct tc_htb_glob opt;
2882 struct ofpbuf request;
2883 struct tcmsg *tcmsg;
/* Remove any existing root qdisc first. */
2885 tc_del_qdisc(netdev);
2887 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2888 NLM_F_EXCL | NLM_F_CREATE, &request);
2892 tcmsg->tcm_handle = tc_make_handle(1, 0);
2893 tcmsg->tcm_parent = TC_H_ROOT;
2895 nl_msg_put_string(&request, TCA_KIND, "htb");
2897 memset(&opt, 0, sizeof opt);
2898 opt.rate2quantum = 10;
2902 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2903 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2904 nl_msg_end_nested(&request, opt_offset);
2906 return tc_transact(&request, NULL);
2909 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2910 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2912 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2913 unsigned int parent, struct htb_class *class)
2916 struct tc_htb_opt opt;
2917 struct ofpbuf request;
2918 struct tcmsg *tcmsg;
/* The MTU is needed to size the rate tables; fail early if unknown. */
2922 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
2924 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2925 netdev_get_name(netdev));
2929 memset(&opt, 0, sizeof opt);
2930 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2931 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2932 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2933 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2934 opt.prio = class->priority;
2936 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2940 tcmsg->tcm_handle = handle;
2941 tcmsg->tcm_parent = parent;
/* Nest the HTB parameters and rate tables under TCA_OPTIONS. */
2943 nl_msg_put_string(&request, TCA_KIND, "htb");
2944 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2945 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2946 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2947 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2948 nl_msg_end_nested(&request, opt_offset);
2950 error = tc_transact(&request, NULL);
2952 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2953 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2954 netdev_get_name(netdev),
2955 tc_get_major(handle), tc_get_minor(handle),
2956 tc_get_major(parent), tc_get_minor(parent),
2957 class->min_rate, class->max_rate,
2958 class->burst, class->priority, ovs_strerror(error));
2963 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2964 * description of them into 'details'. The description complies with the
2965 * specification given in the vswitch database documentation for linux-htb
2968 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2970 static const struct nl_policy tca_htb_policy[] = {
2971 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2972 .min_len = sizeof(struct tc_htb_opt) },
2975 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2976 const struct tc_htb_opt *htb;
2978 if (!nl_parse_nested(nl_options, tca_htb_policy,
2979 attrs, ARRAY_SIZE(tca_htb_policy))) {
2980 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
/* Convert kernel tc_htb_opt back to our min/max/burst/priority view. */
2984 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2985 class->min_rate = htb->rate.rate;
2986 class->max_rate = htb->ceil.rate;
2987 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2988 class->priority = htb->prio;
/* Parses a Netlink class message: extracts the queue id from the class's
 * minor number (if 'queue_id' is nonnull) and its HTB parameters (if
 * 'options' is nonnull), plus generic stats into '*stats'. */
2993 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2994 struct htb_class *options,
2995 struct netdev_queue_stats *stats)
2997 struct nlattr *nl_options;
2998 unsigned int handle;
3001 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3002 if (!error && queue_id) {
3003 unsigned int major = tc_get_major(handle);
3004 unsigned int minor = tc_get_minor(handle);
/* Queue ids are class minor numbers offset by 1 under major 1. */
3005 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3006 *queue_id = minor - 1;
3011 if (!error && options) {
3012 error = htb_parse_tca_options__(nl_options, options);
/* Fills 'hc' from the qdisc-level "max-rate" in 'details', defaulting to the
 * link speed (or 100 Mbps if unknown) when absent or zero. */
3018 htb_parse_qdisc_details__(struct netdev *netdev_,
3019 const struct smap *details, struct htb_class *hc)
3021 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3022 const char *max_rate_s;
/* "max-rate" is given in bits/s in the database; we store bytes/s. */
3024 max_rate_s = smap_get(details, "max-rate");
3025 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3026 if (!hc->max_rate) {
3027 enum netdev_features current;
3029 netdev_linux_read_features(netdev);
3030 current = !netdev->get_features_error ? netdev->current : 0;
3031 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3033 hc->min_rate = hc->max_rate;
/* Fills 'hc' from the per-class keys in 'details' ("min-rate", "max-rate",
 * "burst", "priority"), clamping them to sane values. */
3039 htb_parse_class_details__(struct netdev *netdev,
3040 const struct smap *details, struct htb_class *hc)
3042 const struct htb *htb = htb_get__(netdev);
3043 const char *min_rate_s = smap_get(details, "min-rate");
3044 const char *max_rate_s = smap_get(details, "max-rate");
3045 const char *burst_s = smap_get(details, "burst");
3046 const char *priority_s = smap_get(details, "priority");
3049 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3051 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
3052 netdev_get_name(netdev));
3056 /* HTB requires at least an mtu sized min-rate to send any traffic even
3057 * on uncongested links. */
3058 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3059 hc->min_rate = MAX(hc->min_rate, mtu);
3060 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
/* max-rate defaults to the qdisc ceiling, and is clamped to
 * [min_rate, qdisc max_rate]. */
3063 hc->max_rate = (max_rate_s
3064 ? strtoull(max_rate_s, NULL, 10) / 8
3066 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
3067 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
3071 * According to hints in the documentation that I've read, it is important
3072 * that 'burst' be at least as big as the largest frame that might be
3073 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
3074 * but having it a bit too small is a problem. Since netdev_get_mtu()
3075 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
3076 * the MTU. We actually add 64, instead of 14, as a guard against
3077 * additional headers getting tacked on somewhere that we're not aware of. */
3078 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
3079 hc->burst = MAX(hc->burst, mtu + 64);
3082 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Queries the kernel for HTB class 'handle' under 'parent' and parses the
 * reply into 'options' and/or 'stats' (either may be null). */
3088 htb_query_class__(const struct netdev *netdev, unsigned int handle,
3089 unsigned int parent, struct htb_class *options,
3090 struct netdev_queue_stats *stats)
3092 struct ofpbuf *reply;
3095 error = tc_query_class(netdev, handle, parent, &reply);
3097 error = htb_parse_tcmsg__(reply, NULL, options, stats);
3098 ofpbuf_delete(reply);
/* tc_ops "tc_install" callback: creates the root HTB qdisc and its default
 * class 1:fffe, then records the qdisc state on the netdev. */
3104 htb_tc_install(struct netdev *netdev, const struct smap *details)
3108 error = htb_setup_qdisc__(netdev);
3110 struct htb_class hc;
3112 htb_parse_qdisc_details__(netdev, details, &hc);
3113 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3114 tc_make_handle(1, 0), &hc);
3116 htb_install__(netdev, hc.max_rate);
/* Downcasts a generic tc_queue to its containing htb_class. */
3122 static struct htb_class *
3123 htb_class_cast__(const struct tc_queue *queue)
3125 return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Creates or updates the cached htb_class for 'queue_id' with 'hc'. */
3129 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
3130 const struct htb_class *hc)
3132 struct htb *htb = htb_get__(netdev);
3133 size_t hash = hash_int(queue_id, 0);
3134 struct tc_queue *queue;
3135 struct htb_class *hcp;
3137 queue = tc_find_queue__(netdev, queue_id, hash);
3139 hcp = htb_class_cast__(queue);
3141 hcp = xmalloc(sizeof *hcp);
3142 queue = &hcp->tc_queue;
3143 queue->queue_id = queue_id;
3144 queue->created = time_msec();
3145 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
3148 hcp->min_rate = hc->min_rate;
3149 hcp->max_rate = hc->max_rate;
3150 hcp->burst = hc->burst;
3151 hcp->priority = hc->priority;
/* tc_ops "tc_load" callback: reconstructs our view of an existing HTB qdisc
 * by querying the default class and dumping all queue classes. */
3155 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3158 struct queue_dump_state state;
3159 struct htb_class hc;
3161 /* Get qdisc options. */
3163 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3164 htb_install__(netdev, hc.max_rate);
3167 if (!start_queue_dump(netdev, &state)) {
3170 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
3171 unsigned int queue_id;
3173 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3174 htb_update_queue__(netdev, queue_id, &hc);
3177 finish_queue_dump(&state);
/* tc_ops "tc_destroy" callback: frees every cached queue and the htb. */
3183 htb_tc_destroy(struct tc *tc)
3185 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
3186 struct htb_class *hc, *next;
3188 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
3189 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Reports the qdisc ceiling as "max-rate" in bits/s. */
3197 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
3199 const struct htb *htb = htb_get__(netdev);
3200 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
/* Reconfigures the default class 1:fffe from 'details' and caches the new
 * ceiling on success. */
3205 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
3207 struct htb_class hc;
3210 htb_parse_qdisc_details__(netdev, details, &hc);
3211 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3212 tc_make_handle(1, 0), &hc);
3214 htb_get__(netdev)->max_rate = hc.max_rate;
/* Reports a queue's settings; rates are converted back to bits/s. */
3220 htb_class_get(const struct netdev *netdev OVS_UNUSED,
3221 const struct tc_queue *queue, struct smap *details)
3223 const struct htb_class *hc = htb_class_cast__(queue);
3225 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
/* Omit max-rate when it merely mirrors min-rate (the default). */
3226 if (hc->min_rate != hc->max_rate) {
3227 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3229 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
3231 smap_add_format(details, "priority", "%u", hc->priority);
/* Creates or reconfigures kernel class 1:(queue_id+1) from 'details'. */
3237 htb_class_set(struct netdev *netdev, unsigned int queue_id,
3238 const struct smap *details)
3240 struct htb_class hc;
3243 error = htb_parse_class_details__(netdev, details, &hc);
3248 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3249 tc_make_handle(1, 0xfffe), &hc);
3254 htb_update_queue__(netdev, queue_id, &hc);
/* Deletes the kernel class for 'queue' and, on success, the cached copy. */
3259 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
3261 struct htb_class *hc = htb_class_cast__(queue);
3262 struct htb *htb = htb_get__(netdev);
3265 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3267 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches kernel statistics for a single queue. */
3274 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3275 struct netdev_queue_stats *stats)
3277 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3278 tc_make_handle(1, 0xfffe), NULL, stats);
/* Per-message callback for dumping stats of every HTB queue: invokes 'cb'
 * for each class message whose handle maps to a queue id. */
3282 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3283 const struct ofpbuf *nlmsg,
3284 netdev_dump_queue_stats_cb *cb, void *aux)
3286 struct netdev_queue_stats stats;
3287 unsigned int handle, major, minor;
3290 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3295 major = tc_get_major(handle);
3296 minor = tc_get_minor(handle);
3297 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3298 (*cb)(minor - 1, &stats, aux);
/* Vtable binding the HTB implementation above to the generic tc layer. */
3303 static const struct tc_ops tc_ops_htb = {
3304 "htb", /* linux_name */
3305 "linux-htb", /* ovs_name */
3306 HTB_N_QUEUES, /* n_queues */
3315 htb_class_get_stats,
3316 htb_class_dump_stats
3319 /* "linux-hfsc" traffic control class. */
3321 #define HFSC_N_QUEUES 0xf000
/* Per-queue (HFSC class) state; embeds the generic tc_queue node. */
3329 struct tc_queue tc_queue;
/* Returns the struct hfsc embedded in 'netdev_''s tc pointer. */
3334 static struct hfsc *
3335 hfsc_get__(const struct netdev *netdev_)
3337 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3338 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
/* Downcasts a generic tc_queue to its containing hfsc_class. */
3341 static struct hfsc_class *
3342 hfsc_class_cast__(const struct tc_queue *queue)
3344 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Allocates and installs a struct hfsc with ceiling 'max_rate'. */
3348 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
3350 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3353 hfsc = xmalloc(sizeof *hfsc);
3354 tc_init(&hfsc->tc, &tc_ops_hfsc);
3355 hfsc->max_rate = max_rate;
3356 netdev->tc = &hfsc->tc;
/* Creates or updates the cached hfsc_class for 'queue_id' with 'hc'. */
3360 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
3361 const struct hfsc_class *hc)
3365 struct hfsc_class *hcp;
3366 struct tc_queue *queue;
3368 hfsc = hfsc_get__(netdev);
3369 hash = hash_int(queue_id, 0);
3371 queue = tc_find_queue__(netdev, queue_id, hash);
3373 hcp = hfsc_class_cast__(queue);
3375 hcp = xmalloc(sizeof *hcp);
3376 queue = &hcp->tc_queue;
3377 queue->queue_id = queue_id;
3378 queue->created = time_msec();
3379 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
3382 hcp->min_rate = hc->min_rate;
3383 hcp->max_rate = hc->max_rate;
/* Parses the kernel's HFSC service curves (real-time, fair-share, upper
 * limit) out of 'nl_options'.  Only the linear curves that we ourselves
 * install are supported; anything else is rejected with a warning. */
3387 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
3389 const struct tc_service_curve *rsc, *fsc, *usc;
3390 static const struct nl_policy tca_hfsc_policy[] = {
3392 .type = NL_A_UNSPEC,
3394 .min_len = sizeof(struct tc_service_curve),
3397 .type = NL_A_UNSPEC,
3399 .min_len = sizeof(struct tc_service_curve),
3402 .type = NL_A_UNSPEC,
3404 .min_len = sizeof(struct tc_service_curve),
3407 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3409 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3410 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3411 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3415 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3416 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3417 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
/* m1/d nonzero would mean a two-slope (non-linear) curve. */
3419 if (rsc->m1 != 0 || rsc->d != 0 ||
3420 fsc->m1 != 0 || fsc->d != 0 ||
3421 usc->m1 != 0 || usc->d != 0) {
3422 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3423 "Non-linear service curves are not supported.");
3427 if (rsc->m2 != fsc->m2) {
3428 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3429 "Real-time service curves are not supported ");
3433 if (rsc->m2 > usc->m2) {
3434 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3435 "Min-rate service curve is greater than "
3436 "the max-rate service curve.");
3440 class->min_rate = fsc->m2;
3441 class->max_rate = usc->m2;
/* Parses a Netlink class message: queue id from the class minor number
 * (if 'queue_id' nonnull), HFSC options (if 'options' nonnull), and
 * generic statistics into '*stats'. */
3446 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3447 struct hfsc_class *options,
3448 struct netdev_queue_stats *stats)
3451 unsigned int handle;
3452 struct nlattr *nl_options;
3454 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3460 unsigned int major, minor;
3462 major = tc_get_major(handle);
3463 minor = tc_get_minor(handle);
3464 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3465 *queue_id = minor - 1;
3472 error = hfsc_parse_tca_options__(nl_options, options);
/* Queries the kernel for HFSC class 'handle' under 'parent' and parses the
 * reply into 'options' and/or 'stats' (either may be null). */
3479 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
3480 unsigned int parent, struct hfsc_class *options,
3481 struct netdev_queue_stats *stats)
3484 struct ofpbuf *reply;
3486 error = tc_query_class(netdev, handle, parent, &reply);
3491 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
3492 ofpbuf_delete(reply);
/* Fills 'class' from the qdisc-level "max-rate" in 'details', defaulting to
 * the link speed (or 100 Mbps if unknown) when absent or zero. */
3497 hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
3498 struct hfsc_class *class)
3500 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3502 const char *max_rate_s;
/* "max-rate" is bits/s in the database; we store bytes/s. */
3504 max_rate_s = smap_get(details, "max-rate");
3505 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3508 enum netdev_features current;
3510 netdev_linux_read_features(netdev);
3511 current = !netdev->get_features_error ? netdev->current : 0;
3512 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3515 class->min_rate = max_rate;
3516 class->max_rate = max_rate;
/* Fills 'class' from the per-queue "min-rate"/"max-rate" keys in 'details',
 * clamped to [1, qdisc max_rate] and [min_rate, qdisc max_rate]. */
3520 hfsc_parse_class_details__(struct netdev *netdev,
3521 const struct smap *details,
3522 struct hfsc_class * class)
3524 const struct hfsc *hfsc;
3525 uint32_t min_rate, max_rate;
3526 const char *min_rate_s, *max_rate_s;
3528 hfsc = hfsc_get__(netdev);
3529 min_rate_s = smap_get(details, "min-rate");
3530 max_rate_s = smap_get(details, "max-rate");
3532 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3533 min_rate = MAX(min_rate, 1);
3534 min_rate = MIN(min_rate, hfsc->max_rate);
3536 max_rate = (max_rate_s
3537 ? strtoull(max_rate_s, NULL, 10) / 8
3539 max_rate = MAX(max_rate, min_rate);
3540 max_rate = MIN(max_rate, hfsc->max_rate);
3542 class->min_rate = min_rate;
3543 class->max_rate = max_rate;
3548 /* Create an HFSC qdisc.
3550 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3552 hfsc_setup_qdisc__(struct netdev * netdev)
3554 struct tcmsg *tcmsg;
3555 struct ofpbuf request;
3556 struct tc_hfsc_qopt opt;
/* Remove any existing root qdisc first. */
3558 tc_del_qdisc(netdev);
3560 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3561 NLM_F_EXCL | NLM_F_CREATE, &request);
3567 tcmsg->tcm_handle = tc_make_handle(1, 0);
3568 tcmsg->tcm_parent = TC_H_ROOT;
3570 memset(&opt, 0, sizeof opt);
3573 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3574 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3576 return tc_transact(&request, NULL);
3579 /* Create an HFSC class.
3581 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3582 * sc rate <min_rate> ul rate <max_rate>" */
3584 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3585 unsigned int parent, struct hfsc_class *class)
3589 struct tcmsg *tcmsg;
3590 struct ofpbuf request;
3591 struct tc_service_curve min, max;
3593 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3599 tcmsg->tcm_handle = handle;
3600 tcmsg->tcm_parent = parent;
/* Linear service curves: slope m2 only, no initial burst segment. */
3604 min.m2 = class->min_rate;
3608 max.m2 = class->max_rate;
/* 'min' serves as both real-time and fair-share curve; 'max' is the
 * upper-limit curve. */
3610 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3611 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3612 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3613 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3614 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3615 nl_msg_end_nested(&request, opt_offset);
3617 error = tc_transact(&request, NULL);
3619 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3620 "min-rate %ubps, max-rate %ubps (%s)",
3621 netdev_get_name(netdev),
3622 tc_get_major(handle), tc_get_minor(handle),
3623 tc_get_major(parent), tc_get_minor(parent),
3624 class->min_rate, class->max_rate, ovs_strerror(error));
/* tc_ops "tc_install" callback: creates the root HFSC qdisc and its default
 * class 1:fffe, then records the qdisc state on the netdev. */
3631 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
3634 struct hfsc_class class;
3636 error = hfsc_setup_qdisc__(netdev);
3642 hfsc_parse_qdisc_details__(netdev, details, &class);
3643 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3644 tc_make_handle(1, 0), &class);
3650 hfsc_install__(netdev, class.max_rate);
/* tc_ops "tc_load" callback: reconstructs our view of an existing HFSC
 * qdisc by querying the default class and dumping all queue classes. */
3655 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3658 struct queue_dump_state state;
3659 struct hfsc_class hc;
3662 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3663 hfsc_install__(netdev, hc.max_rate);
3665 if (!start_queue_dump(netdev, &state)) {
3669 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
3670 unsigned int queue_id;
3672 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3673 hfsc_update_queue__(netdev, queue_id, &hc);
3677 finish_queue_dump(&state);
/* tc_ops "tc_destroy" callback: frees every cached queue and the hfsc. */
3682 hfsc_tc_destroy(struct tc *tc)
3685 struct hfsc_class *hc, *next;
3687 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3689 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3690 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Reports the qdisc ceiling as "max-rate" in bits/s. */
3699 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
3701 const struct hfsc *hfsc;
3702 hfsc = hfsc_get__(netdev);
3703 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
/* Reconfigures the default class 1:fffe from 'details' and caches the new
 * ceiling on success. */
3708 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
3711 struct hfsc_class class;
3713 hfsc_parse_qdisc_details__(netdev, details, &class);
3714 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3715 tc_make_handle(1, 0), &class);
3718 hfsc_get__(netdev)->max_rate = class.max_rate;
/* Reports a queue's settings; rates are converted back to bits/s. */
3725 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3726 const struct tc_queue *queue, struct smap *details)
3728 const struct hfsc_class *hc;
3730 hc = hfsc_class_cast__(queue);
3731 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3732 if (hc->min_rate != hc->max_rate) {
3733 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
/* Creates or reconfigures kernel class 1:(queue_id+1) from 'details'. */
3739 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3740 const struct smap *details)
3743 struct hfsc_class class;
3745 error = hfsc_parse_class_details__(netdev, details, &class);
3750 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3751 tc_make_handle(1, 0xfffe), &class);
3756 hfsc_update_queue__(netdev, queue_id, &class);
/* Deletes the kernel class for 'queue' and, on success, the cached copy. */
3761 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3765 struct hfsc_class *hc;
3767 hc = hfsc_class_cast__(queue);
3768 hfsc = hfsc_get__(netdev);
3770 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3772 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches kernel statistics for a single queue. */
3779 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3780 struct netdev_queue_stats *stats)
3782 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3783 tc_make_handle(1, 0xfffe), NULL, stats);
/* Per-message callback for dumping stats of every HFSC queue: invokes 'cb'
 * for each class message whose handle maps to a queue id. */
3787 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3788 const struct ofpbuf *nlmsg,
3789 netdev_dump_queue_stats_cb *cb, void *aux)
3791 struct netdev_queue_stats stats;
3792 unsigned int handle, major, minor;
3795 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3800 major = tc_get_major(handle);
3801 minor = tc_get_minor(handle);
3802 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3803 (*cb)(minor - 1, &stats, aux);
/* Vtable binding the HFSC implementation above to the generic tc layer. */
3808 static const struct tc_ops tc_ops_hfsc = {
3809 "hfsc", /* linux_name */
3810 "linux-hfsc", /* ovs_name */
3811 HFSC_N_QUEUES, /* n_queues */
3812 hfsc_tc_install, /* tc_install */
3813 hfsc_tc_load, /* tc_load */
3814 hfsc_tc_destroy, /* tc_destroy */
3815 hfsc_qdisc_get, /* qdisc_get */
3816 hfsc_qdisc_set, /* qdisc_set */
3817 hfsc_class_get, /* class_get */
3818 hfsc_class_set, /* class_set */
3819 hfsc_class_delete, /* class_delete */
3820 hfsc_class_get_stats, /* class_get_stats */
3821 hfsc_class_dump_stats /* class_dump_stats */
3824 /* "linux-default" traffic control class.
3826 * This class represents the default, unnamed Linux qdisc. It corresponds to
3827 * the "" (empty string) QoS type in the OVS database. */
3830 default_install__(struct netdev_ *netdev_)
3832 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3833 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
3835 /* Nothing but a tc class implementation is allowed to write to a tc. This
3836 * class never does that, so we can legitimately use a const tc object. */
3837 netdev->tc = CONST_CAST(struct tc *, &tc);
/* tc_ops "tc_install" callback; nothing to configure for the default. */
3841 default_tc_install(struct netdev *netdev,
3842 const struct smap *details OVS_UNUSED)
3844 default_install__(netdev);
/* tc_ops "tc_load" callback; adopts whatever qdisc is already present. */
3849 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3851 default_install__(netdev);
/* Vtable for the default qdisc: no queues, no per-class operations. */
3855 static const struct tc_ops tc_ops_default = {
3856 NULL, /* linux_name */
3861 NULL, /* tc_destroy */
3862 NULL, /* qdisc_get */
3863 NULL, /* qdisc_set */
3864 NULL, /* class_get */
3865 NULL, /* class_set */
3866 NULL, /* class_delete */
3867 NULL, /* class_get_stats */
3868 NULL /* class_dump_stats */
3871 /* "linux-other" traffic control class.
/* tc_ops "tc_load" callback for qdiscs OVS does not recognize. */
3876 other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
3878 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3879 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
3881 /* Nothing but a tc class implementation is allowed to write to a tc. This
3882 * class never does that, so we can legitimately use a const tc object. */
3883 netdev->tc = CONST_CAST(struct tc *, &tc);
/* Vtable for unrecognized qdiscs: read-only, no configuration possible. */
3887 static const struct tc_ops tc_ops_other = {
3888 NULL, /* linux_name */
3889 "linux-other", /* ovs_name */
3891 NULL, /* tc_install */
3893 NULL, /* tc_destroy */
3894 NULL, /* qdisc_get */
3895 NULL, /* qdisc_set */
3896 NULL, /* class_get */
3897 NULL, /* class_set */
3898 NULL, /* class_delete */
3899 NULL, /* class_get_stats */
3900 NULL /* class_dump_stats */
3903 /* Traffic control. */
3905 /* Number of kernel "tc" ticks per second. */
/* NOTE(review): initialized elsewhere in this file, presumably from
 * /proc/net/psched -- confirm before relying on it here. */
3906 static double ticks_per_s;
3908 /* Number of kernel "jiffies" per second. This is used for the purpose of
3909 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3910 * one jiffy's worth of data.
3912 * There are two possibilities here:
3914 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3915 * approximate range of 100 to 1024. That means that we really need to
3916 * make sure that the qdisc can buffer that much data.
3918 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3919 * has finely granular timers and there's no need to fudge additional room
3920 * for buffers. (There's no extra effort needed to implement that: the
3921 * large 'buffer_hz' is used as a divisor, so practically any number will
3922 * come out as 0 in the division. Small integer results in the case of
3923 * really high dividends won't have any real effect anyhow.)
3925 static unsigned int buffer_hz;
/* Returns tc handle 'major':'minor', i.e. the major number in the upper 16
 * bits and the minor number in the lower 16 bits. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    return TC_H_MAKE(major << 16, minor);
}
/* Returns the major number (upper 16 bits) from tc 'handle'. */
static unsigned int
tc_get_major(unsigned int handle)
{
    return TC_H_MAJ(handle) >> 16;
}
/* Returns the minor number (lower 16 bits) from tc 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    return TC_H_MIN(handle);
}
3948 static struct tcmsg *
3949 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3950 struct ofpbuf *request)
3952 struct tcmsg *tcmsg;
3956 error = get_ifindex(netdev, &ifindex);
3961 ofpbuf_init(request, 512);
3962 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3963 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3964 tcmsg->tcm_family = AF_UNSPEC;
3965 tcmsg->tcm_ifindex = ifindex;
3966 /* Caller should fill in tcmsg->tcm_handle. */
3967 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' on the NETLINK_ROUTE socket and, if 'replyp' is nonnull,
 * stores the reply there.  Always uninitializes 'request' so the caller need
 * not clean it up.  Returns 0 on success, a positive errno otherwise. */
static int
tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
{
    int error = nl_transact(NETLINK_ROUTE, request, replyp);
    ofpbuf_uninit(request);
    return error;
}
3980 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3981 * policing configuration.
3983 * This function is equivalent to running the following when 'add' is true:
3984 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3986 * This function is equivalent to running the following when 'add' is false:
3987 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3989 * The configuration and stats may be seen with the following command:
3990 * /sbin/tc -s qdisc show dev <devname>
3992 * Returns 0 if successful, otherwise a positive errno value.
3995 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3997 struct ofpbuf request;
3998 struct tcmsg *tcmsg;
4000 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
4001 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
4003 tcmsg = tc_make_request(netdev, type, flags, &request);
4007 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
4008 tcmsg->tcm_parent = TC_H_INGRESS;
4009 nl_msg_put_string(&request, TCA_KIND, "ingress");
4010 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
4012 error = tc_transact(&request, NULL);
4014 /* If we're deleting the qdisc, don't worry about some of the
4015 * error conditions. */
4016 if (!add && (error == ENOENT || error == EINVAL)) {
4025 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
4028 * This function is equivalent to running:
4029 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
4030 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
4033 * The configuration and stats may be seen with the following command:
4034 * /sbin/tc -s filter show <devname> eth0 parent ffff:
4036 * Returns 0 if successful, otherwise a positive errno value.
4039 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
4041 struct tc_police tc_police;
4042 struct ofpbuf request;
4043 struct tcmsg *tcmsg;
4044 size_t basic_offset;
4045 size_t police_offset;
4049 memset(&tc_police, 0, sizeof tc_police);
4050 tc_police.action = TC_POLICE_SHOT;
4051 tc_police.mtu = mtu;
4052 tc_fill_rate(&tc_police.rate, ((uint64_t) kbits_rate * 1000)/8, mtu);
4053 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
4054 kbits_burst * 1024);
4056 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
4057 NLM_F_EXCL | NLM_F_CREATE, &request);
4061 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
4062 tcmsg->tcm_info = tc_make_handle(49,
4063 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
4065 nl_msg_put_string(&request, TCA_KIND, "basic");
4066 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4067 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
4068 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
4069 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
4070 nl_msg_end_nested(&request, police_offset);
4071 nl_msg_end_nested(&request, basic_offset);
4073 error = tc_transact(&request, NULL);
4084 /* The values in psched are not individually very meaningful, but they are
4085 * important. The tables below show some values seen in the wild.
4089 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
4090 * (Before that, there are hints that it was 1000000000.)
4092 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
4096 * -----------------------------------
4097 * [1] 000c8000 000f4240 000f4240 00000064
4098 * [2] 000003e8 00000400 000f4240 3b9aca00
4099 * [3] 000003e8 00000400 000f4240 3b9aca00
4100 * [4] 000003e8 00000400 000f4240 00000064
4101 * [5] 000003e8 00000040 000f4240 3b9aca00
4102 * [6] 000003e8 00000040 000f4240 000000f9
4104 * a b c d ticks_per_s buffer_hz
4105 * ------- --------- ---------- ------------- ----------- -------------
4106 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
4107 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4108 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4109 * [4] 1,000 1,024 1,000,000 100 976,562 100
4110 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
4111 * [6] 1,000 64 1,000,000 249 15,625,000 249
4113 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
4114 * [2] 2.6.26-1-686-bigmem from Debian lenny
4115 * [3] 2.6.26-2-sparc64 from Debian lenny
4116 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
4117 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
4118 * [6] 2.6.34 from kernel.org on KVM
4120 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4121 static const char fn[] = "/proc/net/psched";
4122 unsigned int a, b, c, d;
4125 if (!ovsthread_once_start(&once)) {
4132 stream = fopen(fn, "r");
4134 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
4138 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
4139 VLOG_WARN("%s: read failed", fn);
4143 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
4147 VLOG_WARN("%s: invalid scheduler parameters", fn);
4151 ticks_per_s = (double) a * c / b;
4155 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
4158 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
4161 ovsthread_once_done(&once);
4164 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
4165 * rate of 'rate' bytes per second. */
4167 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
4170 return (rate * ticks) / ticks_per_s;
4173 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
4174 * rate of 'rate' bytes per second. */
4176 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
4179 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
4182 /* Returns the number of bytes that need to be reserved for qdisc buffering at
4183 * a transmission rate of 'rate' bytes per second. */
4185 tc_buffer_per_jiffy(unsigned int rate)
4188 return rate / buffer_hz;
4191 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
4192 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
4193 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
4194 * stores NULL into it if it is absent.
4196 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
4199 * Returns 0 if successful, otherwise a positive errno value. */
4201 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
4202 struct nlattr **options)
4204 static const struct nl_policy tca_policy[] = {
4205 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
4206 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
4208 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4210 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4211 tca_policy, ta, ARRAY_SIZE(ta))) {
4212 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
4217 *kind = nl_attr_get_string(ta[TCA_KIND]);
4221 *options = ta[TCA_OPTIONS];
4236 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
4237 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
4238 * into '*options', and its queue statistics into '*stats'. Any of the output
4239 * arguments may be null.
4241 * Returns 0 if successful, otherwise a positive errno value. */
4243 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
4244 struct nlattr **options, struct netdev_queue_stats *stats)
4246 static const struct nl_policy tca_policy[] = {
4247 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
4248 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
4250 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4252 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4253 tca_policy, ta, ARRAY_SIZE(ta))) {
4254 VLOG_WARN_RL(&rl, "failed to parse class message");
4259 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
4260 *handlep = tc->tcm_handle;
4264 *options = ta[TCA_OPTIONS];
4268 const struct gnet_stats_queue *gsq;
4269 struct gnet_stats_basic gsb;
4271 static const struct nl_policy stats_policy[] = {
4272 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
4273 .min_len = sizeof gsb },
4274 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
4275 .min_len = sizeof *gsq },
4277 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
4279 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
4280 sa, ARRAY_SIZE(sa))) {
4281 VLOG_WARN_RL(&rl, "failed to parse class stats");
4285 /* Alignment issues screw up the length of struct gnet_stats_basic on
4286 * some arch/bitsize combinations. Newer versions of Linux have a
4287 * struct gnet_stats_basic_packed, but we can't depend on that. The
4288 * easiest thing to do is just to make a copy. */
4289 memset(&gsb, 0, sizeof gsb);
4290 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
4291 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
4292 stats->tx_bytes = gsb.bytes;
4293 stats->tx_packets = gsb.packets;
4295 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
4296 stats->tx_errors = gsq->drops;
4306 memset(stats, 0, sizeof *stats);
4311 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
4314 tc_query_class(const struct netdev *netdev,
4315 unsigned int handle, unsigned int parent,
4316 struct ofpbuf **replyp)
4318 struct ofpbuf request;
4319 struct tcmsg *tcmsg;
4322 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
4326 tcmsg->tcm_handle = handle;
4327 tcmsg->tcm_parent = parent;
4329 error = tc_transact(&request, replyp);
4331 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
4332 netdev_get_name(netdev),
4333 tc_get_major(handle), tc_get_minor(handle),
4334 tc_get_major(parent), tc_get_minor(parent),
4335 ovs_strerror(error));
4340 /* Equivalent to "tc class del dev <name> handle <handle>". */
4342 tc_delete_class(const struct netdev *netdev, unsigned int handle)
4344 struct ofpbuf request;
4345 struct tcmsg *tcmsg;
4348 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
4352 tcmsg->tcm_handle = handle;
4353 tcmsg->tcm_parent = 0;
4355 error = tc_transact(&request, NULL);
4357 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
4358 netdev_get_name(netdev),
4359 tc_get_major(handle), tc_get_minor(handle),
4360 ovs_strerror(error));
4365 /* Equivalent to "tc qdisc del dev <name> root". */
4367 tc_del_qdisc(struct netdev *netdev_)
4369 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4370 struct ofpbuf request;
4371 struct tcmsg *tcmsg;
4374 tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
4378 tcmsg->tcm_handle = tc_make_handle(1, 0);
4379 tcmsg->tcm_parent = TC_H_ROOT;
4381 error = tc_transact(&request, NULL);
4382 if (error == EINVAL) {
4383 /* EINVAL probably means that the default qdisc was in use, in which
4384 * case we've accomplished our purpose. */
4387 if (!error && netdev->tc) {
4388 if (netdev->tc->ops->tc_destroy) {
4389 netdev->tc->ops->tc_destroy(netdev->tc);
4396 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
4397 * kernel to determine what they are. Returns 0 if successful, otherwise a
4398 * positive errno value. */
4400 tc_query_qdisc(const struct netdev *netdev_)
4402 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4403 struct ofpbuf request, *qdisc;
4404 const struct tc_ops *ops;
4405 struct tcmsg *tcmsg;
4413 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4414 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4415 * 2.6.35 without that fix backported to it.
4417 * To avoid the OOPS, we must not make a request that would attempt to dump
4418 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4419 * few others. There are a few ways that I can see to do this, but most of
4420 * them seem to be racy (and if you lose the race the kernel OOPSes). The
4421 * technique chosen here is to assume that any non-default qdisc that we
4422 * create will have a class with handle 1:0. The built-in qdiscs only have
4423 * a class with handle 0:0.
4425 * We could check for Linux 2.6.35+ and use a more straightforward method
4427 tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
4431 tcmsg->tcm_handle = tc_make_handle(1, 0);
4432 tcmsg->tcm_parent = 0;
4434 /* Figure out what tc class to instantiate. */
4435 error = tc_transact(&request, &qdisc);
4439 error = tc_parse_qdisc(qdisc, &kind, NULL);
4441 ops = &tc_ops_other;
4443 ops = tc_lookup_linux_name(kind);
4445 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4446 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
4448 ops = &tc_ops_other;
4451 } else if (error == ENOENT) {
4452 /* Either it's a built-in qdisc, or it's a qdisc set up by some
4453 * other entity that doesn't have a handle 1:0. We will assume
4454 * that it's the system default qdisc. */
4455 ops = &tc_ops_default;
4458 /* Who knows? Maybe the device got deleted. */
4459 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
4460 netdev_get_name(netdev_), ovs_strerror(error));
4461 ops = &tc_ops_other;
4464 /* Instantiate it. */
4465 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
4466 ovs_assert((load_error == 0) == (netdev->tc != NULL));
4467 ofpbuf_delete(qdisc);
4469 return error ? error : load_error;
4472 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4473 approximate the time to transmit packets of various lengths. For an MTU of
4474 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4475 represents two possible packet lengths; for a MTU of 513 through 1024, four
4476 possible lengths; and so on.
4478 Returns, for the specified 'mtu', the number of bits that packet lengths
4479 need to be shifted right to fit within such a 256-entry table. */
4481 tc_calc_cell_log(unsigned int mtu)
4486 mtu = ETH_PAYLOAD_MAX;
4488 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
4490 for (cell_log = 0; mtu >= 256; cell_log++) {
4497 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
4500 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4502 memset(rate, 0, sizeof *rate);
4503 rate->cell_log = tc_calc_cell_log(mtu);
4504 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4505 /* rate->cell_align = 0; */ /* distro headers. */
4506 rate->mpu = ETH_TOTAL_MIN;
/* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
 * attribute of the specified "type".
 *
 * See tc_calc_cell_log() above for a description of "rtab"s. */
static void
tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
{
    uint32_t *rtab;
    unsigned int i;

    rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
    for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
        unsigned packet_size = (i + 1) << rate->cell_log;
        if (packet_size < rate->mpu) {
            packet_size = rate->mpu;
        }
        rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
    }
}
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * 0 is fine.) */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
    return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
}
4541 /* Linux-only functions declared in netdev-linux.h */
4543 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4544 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
4546 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4547 const char *flag_name, bool enable)
4549 const char *netdev_name = netdev_get_name(netdev);
4550 struct ethtool_value evalue;
4554 COVERAGE_INC(netdev_get_ethtool);
4555 memset(&evalue, 0, sizeof evalue);
4556 error = netdev_linux_do_ethtool(netdev_name,
4557 (struct ethtool_cmd *)&evalue,
4558 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4563 COVERAGE_INC(netdev_set_ethtool);
4564 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4565 error = netdev_linux_do_ethtool(netdev_name,
4566 (struct ethtool_cmd *)&evalue,
4567 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
4572 COVERAGE_INC(netdev_get_ethtool);
4573 memset(&evalue, 0, sizeof evalue);
4574 error = netdev_linux_do_ethtool(netdev_name,
4575 (struct ethtool_cmd *)&evalue,
4576 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4581 if (new_flags != evalue.data) {
4582 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4583 "device %s failed", enable ? "enable" : "disable",
4584 flag_name, netdev_name);
4591 /* Utility functions. */
4593 /* Copies 'src' into 'dst', performing format conversion in the process. */
4595 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4596 const struct rtnl_link_stats *src)
4598 dst->rx_packets = src->rx_packets;
4599 dst->tx_packets = src->tx_packets;
4600 dst->rx_bytes = src->rx_bytes;
4601 dst->tx_bytes = src->tx_bytes;
4602 dst->rx_errors = src->rx_errors;
4603 dst->tx_errors = src->tx_errors;
4604 dst->rx_dropped = src->rx_dropped;
4605 dst->tx_dropped = src->tx_dropped;
4606 dst->multicast = src->multicast;
4607 dst->collisions = src->collisions;
4608 dst->rx_length_errors = src->rx_length_errors;
4609 dst->rx_over_errors = src->rx_over_errors;
4610 dst->rx_crc_errors = src->rx_crc_errors;
4611 dst->rx_frame_errors = src->rx_frame_errors;
4612 dst->rx_fifo_errors = src->rx_fifo_errors;
4613 dst->rx_missed_errors = src->rx_missed_errors;
4614 dst->tx_aborted_errors = src->tx_aborted_errors;
4615 dst->tx_carrier_errors = src->tx_carrier_errors;
4616 dst->tx_fifo_errors = src->tx_fifo_errors;
4617 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4618 dst->tx_window_errors = src->tx_window_errors;
4621 /* Copies 'src' into 'dst', performing format conversion in the process. */
4623 netdev_stats_from_rtnl_link_stats64(struct netdev_stats *dst,
4624 const struct rtnl_link_stats64 *src)
4626 dst->rx_packets = src->rx_packets;
4627 dst->tx_packets = src->tx_packets;
4628 dst->rx_bytes = src->rx_bytes;
4629 dst->tx_bytes = src->tx_bytes;
4630 dst->rx_errors = src->rx_errors;
4631 dst->tx_errors = src->tx_errors;
4632 dst->rx_dropped = src->rx_dropped;
4633 dst->tx_dropped = src->tx_dropped;
4634 dst->multicast = src->multicast;
4635 dst->collisions = src->collisions;
4636 dst->rx_length_errors = src->rx_length_errors;
4637 dst->rx_over_errors = src->rx_over_errors;
4638 dst->rx_crc_errors = src->rx_crc_errors;
4639 dst->rx_frame_errors = src->rx_frame_errors;
4640 dst->rx_fifo_errors = src->rx_fifo_errors;
4641 dst->rx_missed_errors = src->rx_missed_errors;
4642 dst->tx_aborted_errors = src->tx_aborted_errors;
4643 dst->tx_carrier_errors = src->tx_carrier_errors;
4644 dst->tx_fifo_errors = src->tx_fifo_errors;
4645 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4646 dst->tx_window_errors = src->tx_window_errors;
4650 get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
4652 struct ofpbuf request;
4653 struct ofpbuf *reply;
4656 ofpbuf_init(&request, 0);
4657 nl_msg_put_nlmsghdr(&request,
4658 sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
4659 RTM_GETLINK, NLM_F_REQUEST);
4660 ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
4661 nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
4662 error = nl_transact(NETLINK_ROUTE, &request, &reply);
4663 ofpbuf_uninit(&request);
4668 if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
4669 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS64);
4670 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats64)) {
4671 netdev_stats_from_rtnl_link_stats64(stats, nl_attr_get(a));
4674 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS);
4675 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
4676 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
4679 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4684 VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");
4689 ofpbuf_delete(reply);
4694 get_flags(const struct netdev *dev, unsigned int *flags)
4700 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
4702 *flags = ifr.ifr_flags;
4708 set_flags(const char *name, unsigned int flags)
4712 ifr.ifr_flags = flags;
4713 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
4717 do_get_ifindex(const char *netdev_name)
4722 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4723 COVERAGE_INC(netdev_get_ifindex);
4725 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
4727 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4728 netdev_name, ovs_strerror(error));
4731 return ifr.ifr_ifindex;
4735 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4737 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4739 if (!(netdev->cache_valid & VALID_IFINDEX)) {
4740 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4743 netdev->get_ifindex_error = -ifindex;
4744 netdev->ifindex = 0;
4746 netdev->get_ifindex_error = 0;
4747 netdev->ifindex = ifindex;
4749 netdev->cache_valid |= VALID_IFINDEX;
4752 *ifindexp = netdev->ifindex;
4753 return netdev->get_ifindex_error;
4757 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4763 memset(&ifr, 0, sizeof ifr);
4764 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4765 COVERAGE_INC(netdev_get_hwaddr);
4766 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
4768 /* ENODEV probably means that a vif disappeared asynchronously and
4769 * hasn't been removed from the database yet, so reduce the log level
4770 * to INFO for that case. */
4771 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
4772 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4773 netdev_name, ovs_strerror(error));
4776 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4777 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4778 VLOG_WARN("%s device has unknown hardware address family %d",
4779 netdev_name, hwaddr_family);
4781 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
4786 set_etheraddr(const char *netdev_name,
4787 const uint8_t mac[ETH_ADDR_LEN])
4792 memset(&ifr, 0, sizeof ifr);
4793 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4794 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
4795 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4796 COVERAGE_INC(netdev_set_hwaddr);
4797 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
4799 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4800 netdev_name, ovs_strerror(error));
4806 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4807 int cmd, const char *cmd_name)
4812 memset(&ifr, 0, sizeof ifr);
4813 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4814 ifr.ifr_data = (caddr_t) ecmd;
4817 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
4819 if (error != EOPNOTSUPP) {
4820 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4821 "failed: %s", cmd_name, name, ovs_strerror(error));
4823 /* The device doesn't support this operation. That's pretty
4824 * common, so there's no point in logging anything. */
4831 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4832 int cmd, const char *cmd_name)
4837 ifr.ifr_addr.sa_family = AF_INET;
4838 error = af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
4840 const struct sockaddr_in *sin = ALIGNED_CAST(struct sockaddr_in *,
4842 *ip = sin->sin_addr;
4847 /* Returns an AF_PACKET raw socket or a negative errno value. */
4849 af_packet_sock(void)
4851 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4854 if (ovsthread_once_start(&once)) {
4855 sock = socket(AF_PACKET, SOCK_RAW, 0);
4857 int error = set_nonblocking(sock);
4864 VLOG_ERR("failed to create packet socket: %s",
4865 ovs_strerror(errno));
4867 ovsthread_once_done(&once);