2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
23 #include <arpa/inet.h>
25 #include <linux/filter.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <sys/types.h>
37 #include <sys/ioctl.h>
38 #include <sys/socket.h>
39 #include <netpacket/packet.h>
41 #include <net/if_arp.h>
42 #include <net/if_packet.h>
43 #include <net/route.h>
44 #include <netinet/in.h>
51 #include "dpif-linux.h"
52 #include "dpif-netdev.h"
53 #include "dynamic-string.h"
54 #include "fatal-signal.h"
57 #include "netdev-provider.h"
58 #include "netdev-vport.h"
59 #include "netlink-notifier.h"
60 #include "netlink-socket.h"
63 #include "openflow/openflow.h"
64 #include "ovs-atomic.h"
66 #include "poll-loop.h"
67 #include "rtnetlink-link.h"
69 #include "socket-util.h"
72 #include "unaligned.h"
75 VLOG_DEFINE_THIS_MODULE(netdev_linux);
77 COVERAGE_DEFINE(netdev_set_policing);
78 COVERAGE_DEFINE(netdev_arp_lookup);
79 COVERAGE_DEFINE(netdev_get_ifindex);
80 COVERAGE_DEFINE(netdev_get_hwaddr);
81 COVERAGE_DEFINE(netdev_set_hwaddr);
82 COVERAGE_DEFINE(netdev_get_ethtool);
83 COVERAGE_DEFINE(netdev_set_ethtool);
86 /* These were introduced in Linux 2.6.14, so they might be missing if we have
88 #ifndef ADVERTISED_Pause
89 #define ADVERTISED_Pause (1 << 13)
91 #ifndef ADVERTISED_Asym_Pause
92 #define ADVERTISED_Asym_Pause (1 << 14)
95 /* These were introduced in Linux 2.6.24, so they might be missing if we
96 * have old headers. */
97 #ifndef ETHTOOL_GFLAGS
98 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
100 #ifndef ETHTOOL_SFLAGS
101 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
104 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
107 #define TC_RTAB_SIZE 1024
110 /* Linux 2.6.21 introduced struct tpacket_auxdata.
111 * Linux 2.6.27 added the tp_vlan_tci member.
112 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
113 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
114 * TP_STATUS_VLAN_TPID_VALID.
116 * With all this churn it's easiest to unconditionally define a replacement
117 * structure that has everything we want.
119 #ifndef PACKET_AUXDATA
120 #define PACKET_AUXDATA 8
122 #ifndef TP_STATUS_VLAN_VALID
123 #define TP_STATUS_VLAN_VALID (1 << 4)
125 #ifndef TP_STATUS_VLAN_TPID_VALID
126 #define TP_STATUS_VLAN_TPID_VALID (1 << 6)
128 #undef tpacket_auxdata
129 #define tpacket_auxdata rpl_tpacket_auxdata
130 struct tpacket_auxdata {
136 uint16_t tp_vlan_tci;
137 uint16_t tp_vlan_tpid;
141 VALID_IFINDEX = 1 << 0,
142 VALID_ETHERADDR = 1 << 1,
146 VALID_POLICING = 1 << 5,
147 VALID_VPORT_STAT_ERROR = 1 << 6,
148 VALID_DRVINFO = 1 << 7,
149 VALID_FEATURES = 1 << 8,
152 /* Traffic control. */
154 /* An instance of a traffic control class. Always associated with a particular
157 * Each TC implementation subclasses this with whatever additional data it
160 const struct tc_ops *ops;
161 struct hmap queues; /* Contains "struct tc_queue"s.
162 * Read by generic TC layer.
163 * Written only by TC implementation. */
166 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
168 /* One traffic control queue.
170 * Each TC implementation subclasses this with whatever additional data it
173 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
174 unsigned int queue_id; /* OpenFlow queue ID. */
175 long long int created; /* Time queue was created, in msecs. */
178 /* A particular kind of traffic control. Each implementation generally maps to
179 * one particular Linux qdisc class.
181 * The functions below return 0 if successful or a positive errno value on
182 * failure, except where otherwise noted. All of them must be provided, except
183 * where otherwise noted. */
185 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
186 * This is null for tc_ops_default and tc_ops_other, for which there are no
187 * appropriate values. */
188 const char *linux_name;
190 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
191 const char *ovs_name;
193 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
194 * queues. The queues are numbered 0 through n_queues - 1. */
195 unsigned int n_queues;
197 /* Called to install this TC class on 'netdev'. The implementation should
198 * make the Netlink calls required to set up 'netdev' with the right qdisc
199 * and configure it according to 'details'. The implementation may assume
200 * that the current qdisc is the default; that is, there is no need for it
201 * to delete the current qdisc before installing itself.
203 * The contents of 'details' should be documented as valid for 'ovs_name'
204 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
205 * (which is built as ovs-vswitchd.conf.db(8)).
207 * This function must return 0 if and only if it sets 'netdev->tc' to an
208 * initialized 'struct tc'.
210 * (This function is null for tc_ops_other, which cannot be installed. For
211 * other TC classes it should always be nonnull.) */
212 int (*tc_install)(struct netdev *netdev, const struct smap *details);
214 /* Called when the netdev code determines (through a Netlink query) that
215 * this TC class's qdisc is installed on 'netdev', but we didn't install
216 * it ourselves and so don't know any of the details.
218 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
219 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
220 * implementation should parse the other attributes of 'nlmsg' as
221 * necessary to determine its configuration. If necessary it should also
222 * use Netlink queries to determine the configuration of queues on
225 * This function must return 0 if and only if it sets 'netdev->tc' to an
226 * initialized 'struct tc'. */
227 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
229 /* Destroys the data structures allocated by the implementation as part of
230 * 'tc'. (This includes destroying 'tc->queues' by calling
233 * The implementation should not need to perform any Netlink calls. If
234 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
235 * (But it may not be desirable.)
237 * This function may be null if 'tc' is trivial. */
238 void (*tc_destroy)(struct tc *tc);
240 /* Retrieves details of 'netdev->tc' configuration into 'details'.
242 * The implementation should not need to perform any Netlink calls, because
243 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
244 * cached the configuration.
246 * The contents of 'details' should be documented as valid for 'ovs_name'
247 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
248 * (which is built as ovs-vswitchd.conf.db(8)).
250 * This function may be null if 'tc' is not configurable.
252 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
254 /* Reconfigures 'netdev->tc' according to 'details', performing any
255 * required Netlink calls to complete the reconfiguration.
257 * The contents of 'details' should be documented as valid for 'ovs_name'
258 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
259 * (which is built as ovs-vswitchd.conf.db(8)).
261 * This function may be null if 'tc' is not configurable.
263 int (*qdisc_set)(struct netdev *, const struct smap *details);
265 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
266 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
268 * The contents of 'details' should be documented as valid for 'ovs_name'
269 * in the "other_config" column in the "Queue" table in
270 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
272 * The implementation should not need to perform any Netlink calls, because
273 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
274 * cached the queue configuration.
276 * This function may be null if 'tc' does not have queues ('n_queues' is
278 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
279 struct smap *details);
281 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
282 * 'details', performing any required Netlink calls to complete the
283 * reconfiguration. The caller ensures that 'queue_id' is less than
286 * The contents of 'details' should be documented as valid for 'ovs_name'
287 * in the "other_config" column in the "Queue" table in
288 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
290 * This function may be null if 'tc' does not have queues or its queues are
291 * not configurable. */
292 int (*class_set)(struct netdev *, unsigned int queue_id,
293 const struct smap *details);
295 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
296 * tc_queue's within 'netdev->tc->queues'.
298 * This function may be null if 'tc' does not have queues or its queues
299 * cannot be deleted. */
300 int (*class_delete)(struct netdev *, struct tc_queue *queue);
302 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
303 * 'struct tc_queue's within 'netdev->tc->queues'.
305 * On success, initializes '*stats'.
307 * This function may be null if 'tc' does not have queues or if it cannot
308 * report queue statistics. */
309 int (*class_get_stats)(const struct netdev *netdev,
310 const struct tc_queue *queue,
311 struct netdev_queue_stats *stats);
313 /* Extracts queue stats from 'nlmsg', which is a response to a
314 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
316 * This function may be null if 'tc' does not have queues or if it cannot
317 * report queue statistics. */
318 int (*class_dump_stats)(const struct netdev *netdev,
319 const struct ofpbuf *nlmsg,
320 netdev_dump_queue_stats_cb *cb, void *aux);
324 tc_init(struct tc *tc, const struct tc_ops *ops)
327 hmap_init(&tc->queues);
331 tc_destroy(struct tc *tc)
333 hmap_destroy(&tc->queues);
336 static const struct tc_ops tc_ops_htb;
337 static const struct tc_ops tc_ops_hfsc;
338 static const struct tc_ops tc_ops_default;
339 static const struct tc_ops tc_ops_other;
341 static const struct tc_ops *const tcs[] = {
342 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
343 &tc_ops_hfsc, /* Hierarchical fair service curve. */
344 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
345 &tc_ops_other, /* Some other qdisc. */
349 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
350 static unsigned int tc_get_major(unsigned int handle);
351 static unsigned int tc_get_minor(unsigned int handle);
353 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
354 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
355 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
357 static struct tcmsg *tc_make_request(const struct netdev *, int type,
358 unsigned int flags, struct ofpbuf *);
359 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
360 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
361 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
364 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
365 struct nlattr **options);
366 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
367 struct nlattr **options,
368 struct netdev_queue_stats *);
369 static int tc_query_class(const struct netdev *,
370 unsigned int handle, unsigned int parent,
371 struct ofpbuf **replyp);
372 static int tc_delete_class(const struct netdev *, unsigned int handle);
374 static int tc_del_qdisc(struct netdev *netdev);
375 static int tc_query_qdisc(const struct netdev *netdev);
377 static int tc_calc_cell_log(unsigned int mtu);
378 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
379 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
380 const struct tc_ratespec *rate);
381 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
383 struct netdev_linux {
386 /* Protects all members below. */
387 struct ovs_mutex mutex;
389 unsigned int cache_valid;
391 bool miimon; /* Link status of last poll. */
392 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
393 struct timer miimon_timer;
395 /* The following are figured out "on demand" only. They are only valid
396 * when the corresponding VALID_* bit in 'cache_valid' is set. */
398 uint8_t etheraddr[ETH_ADDR_LEN];
399 struct in_addr address, netmask;
402 unsigned int ifi_flags;
403 long long int carrier_resets;
404 uint32_t kbits_rate; /* Policing data. */
405 uint32_t kbits_burst;
406 int vport_stats_error; /* Cached error code from vport_get_stats().
407 0 or an errno value. */
408 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
409 int ether_addr_error; /* Cached error code from set/get etheraddr. */
410 int netdev_policing_error; /* Cached error code from set policing. */
411 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
412 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
414 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
415 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
416 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
418 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
421 /* For devices of class netdev_tap_class only. */
425 struct netdev_rxq_linux {
426 struct netdev_rxq up;
431 /* This is set pretty low because we probably won't learn anything from the
432 * additional log messages. */
433 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
435 /* Polling miimon status for all ports causes performance degradation when
436 * handling a large number of ports. If there are no devices using miimon, then
437 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait(). */
438 static atomic_int miimon_cnt = ATOMIC_VAR_INIT(0);
440 static void netdev_linux_run(void);
442 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
443 int cmd, const char *cmd_name);
444 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
445 int cmd, const char *cmd_name);
446 static int get_flags(const struct netdev *, unsigned int *flags);
447 static int set_flags(const char *, unsigned int flags);
448 static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
449 enum netdev_flags on, enum netdev_flags *old_flagsp)
450 OVS_REQUIRES(netdev->mutex);
451 static int do_get_ifindex(const char *netdev_name);
452 static int get_ifindex(const struct netdev *, int *ifindexp);
453 static int do_set_addr(struct netdev *netdev,
454 int ioctl_nr, const char *ioctl_name,
455 struct in_addr addr);
456 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
457 static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
458 static int get_stats_via_netlink(const struct netdev *, struct netdev_stats *);
459 static int af_packet_sock(void);
460 static bool netdev_linux_miimon_enabled(void);
461 static void netdev_linux_miimon_run(void);
462 static void netdev_linux_miimon_wait(void);
463 static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup);
466 is_netdev_linux_class(const struct netdev_class *netdev_class)
468 return netdev_class->run == netdev_linux_run;
472 is_tap_netdev(const struct netdev *netdev)
474 return netdev_get_class(netdev) == &netdev_tap_class;
477 static struct netdev_linux *
478 netdev_linux_cast(const struct netdev *netdev)
480 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
482 return CONTAINER_OF(netdev, struct netdev_linux, up);
485 static struct netdev_rxq_linux *
486 netdev_rxq_linux_cast(const struct netdev_rxq *rx)
488 ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
489 return CONTAINER_OF(rx, struct netdev_rxq_linux, up);
492 static void netdev_linux_update(struct netdev_linux *netdev,
493 const struct rtnetlink_link_change *)
494 OVS_REQUIRES(netdev->mutex);
495 static void netdev_linux_changed(struct netdev_linux *netdev,
496 unsigned int ifi_flags, unsigned int mask)
497 OVS_REQUIRES(netdev->mutex);
499 /* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK changes, or NULL
500 * if no such socket could be created. */
501 static struct nl_sock *
502 netdev_linux_notify_sock(void)
504 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
505 static struct nl_sock *sock;
507 if (ovsthread_once_start(&once)) {
510 error = nl_sock_create(NETLINK_ROUTE, &sock);
512 error = nl_sock_join_mcgroup(sock, RTNLGRP_LINK);
514 nl_sock_destroy(sock);
518 ovsthread_once_done(&once);
525 netdev_linux_miimon_enabled(void)
529 atomic_read(&miimon_cnt, &miimon);
534 netdev_linux_run(void)
536 struct nl_sock *sock;
539 if (netdev_linux_miimon_enabled()) {
540 netdev_linux_miimon_run();
543 sock = netdev_linux_notify_sock();
549 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
550 uint64_t buf_stub[4096 / 8];
553 ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
554 error = nl_sock_recv(sock, &buf, false);
556 struct rtnetlink_link_change change;
558 if (rtnetlink_link_parse(&buf, &change)) {
559 struct netdev *netdev_ = netdev_from_name(change.ifname);
560 if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
561 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
563 ovs_mutex_lock(&netdev->mutex);
564 netdev_linux_update(netdev, &change);
565 ovs_mutex_unlock(&netdev->mutex);
567 netdev_close(netdev_);
569 } else if (error == ENOBUFS) {
570 struct shash device_shash;
571 struct shash_node *node;
575 shash_init(&device_shash);
576 netdev_get_devices(&netdev_linux_class, &device_shash);
577 SHASH_FOR_EACH (node, &device_shash) {
578 struct netdev *netdev_ = node->data;
579 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
582 ovs_mutex_lock(&netdev->mutex);
583 get_flags(netdev_, &flags);
584 netdev_linux_changed(netdev, flags, 0);
585 ovs_mutex_unlock(&netdev->mutex);
587 netdev_close(netdev_);
589 shash_destroy(&device_shash);
590 } else if (error != EAGAIN) {
591 VLOG_WARN_RL(&rl, "error reading or parsing netlink (%s)",
592 ovs_strerror(error));
599 netdev_linux_wait(void)
601 struct nl_sock *sock;
603 if (netdev_linux_miimon_enabled()) {
604 netdev_linux_miimon_wait();
606 sock = netdev_linux_notify_sock();
608 nl_sock_wait(sock, POLLIN);
613 netdev_linux_changed(struct netdev_linux *dev,
614 unsigned int ifi_flags, unsigned int mask)
615 OVS_REQUIRES(dev->mutex)
617 netdev_change_seq_changed(&dev->up);
619 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
620 dev->carrier_resets++;
622 dev->ifi_flags = ifi_flags;
624 dev->cache_valid &= mask;
628 netdev_linux_update(struct netdev_linux *dev,
629 const struct rtnetlink_link_change *change)
630 OVS_REQUIRES(dev->mutex)
632 if (change->nlmsg_type == RTM_NEWLINK) {
634 netdev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
636 /* Update netdev from rtnl-change msg. */
638 dev->mtu = change->mtu;
639 dev->cache_valid |= VALID_MTU;
640 dev->netdev_mtu_error = 0;
643 if (!eth_addr_is_zero(change->addr)) {
644 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
645 dev->cache_valid |= VALID_ETHERADDR;
646 dev->ether_addr_error = 0;
649 dev->ifindex = change->ifi_index;
650 dev->cache_valid |= VALID_IFINDEX;
651 dev->get_ifindex_error = 0;
654 netdev_linux_changed(dev, change->ifi_flags, 0);
658 static struct netdev *
659 netdev_linux_alloc(void)
661 struct netdev_linux *netdev = xzalloc(sizeof *netdev);
666 netdev_linux_common_construct(struct netdev_linux *netdev)
668 ovs_mutex_init(&netdev->mutex);
671 /* Creates system and internal devices. */
673 netdev_linux_construct(struct netdev *netdev_)
675 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
678 netdev_linux_common_construct(netdev);
680 error = get_flags(&netdev->up, &netdev->ifi_flags);
681 if (error == ENODEV) {
682 if (netdev->up.netdev_class != &netdev_internal_class) {
683 /* The device does not exist, so don't allow it to be opened. */
686 /* "Internal" netdevs have to be created as netdev objects before
687 * they exist in the kernel, because creating them in the kernel
688 * happens by passing a netdev object to dpif_port_add().
689 * Therefore, ignore the error. */
696 /* For most types of netdevs we open the device for each call of
697 * netdev_open(). However, this is not the case with tap devices,
698 * since it is only possible to open the device once. In this
699 * situation we share a single file descriptor, and consequently
700 * buffers, across all readers. Therefore once data is read it will
701 * be unavailable to other reads for tap devices. */
703 netdev_linux_construct_tap(struct netdev *netdev_)
705 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
706 static const char tap_dev[] = "/dev/net/tun";
707 const char *name = netdev_->name;
711 netdev_linux_common_construct(netdev);
713 /* Open tap device. */
714 netdev->tap_fd = open(tap_dev, O_RDWR);
715 if (netdev->tap_fd < 0) {
717 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
721 /* Create tap device. */
722 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
723 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
724 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
725 VLOG_WARN("%s: creating tap device failed: %s", name,
726 ovs_strerror(errno));
731 /* Make non-blocking. */
732 error = set_nonblocking(netdev->tap_fd);
740 close(netdev->tap_fd);
745 netdev_linux_destruct(struct netdev *netdev_)
747 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
749 if (netdev->tc && netdev->tc->ops->tc_destroy) {
750 netdev->tc->ops->tc_destroy(netdev->tc);
753 if (netdev_get_class(netdev_) == &netdev_tap_class
754 && netdev->tap_fd >= 0)
756 close(netdev->tap_fd);
759 if (netdev->miimon_interval > 0) {
761 atomic_sub(&miimon_cnt, 1, &junk);
764 ovs_mutex_destroy(&netdev->mutex);
768 netdev_linux_dealloc(struct netdev *netdev_)
770 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
774 static struct netdev_rxq *
775 netdev_linux_rxq_alloc(void)
777 struct netdev_rxq_linux *rx = xzalloc(sizeof *rx);
782 netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
784 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
785 struct netdev *netdev_ = rx->up.netdev;
786 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
789 ovs_mutex_lock(&netdev->mutex);
790 rx->is_tap = is_tap_netdev(netdev_);
792 rx->fd = netdev->tap_fd;
794 struct sockaddr_ll sll;
796 /* Result of tcpdump -dd inbound */
797 static const struct sock_filter filt[] = {
798 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
799 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
800 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
801 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
803 static const struct sock_fprog fprog = {
804 ARRAY_SIZE(filt), (struct sock_filter *) filt
807 /* Create file descriptor. */
808 rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
811 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
816 if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) {
818 VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
819 netdev_get_name(netdev_), ovs_strerror(error));
823 /* Set non-blocking mode. */
824 error = set_nonblocking(rx->fd);
829 /* Get ethernet device index. */
830 error = get_ifindex(&netdev->up, &ifindex);
835 /* Bind to specific ethernet device. */
836 memset(&sll, 0, sizeof sll);
837 sll.sll_family = AF_PACKET;
838 sll.sll_ifindex = ifindex;
839 sll.sll_protocol = htons(ETH_P_ALL);
840 if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
842 VLOG_ERR("%s: failed to bind raw socket (%s)",
843 netdev_get_name(netdev_), ovs_strerror(error));
847 /* Filter for only inbound packets. */
848 error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
852 VLOG_ERR("%s: failed to attach filter (%s)",
853 netdev_get_name(netdev_), ovs_strerror(error));
857 ovs_mutex_unlock(&netdev->mutex);
865 ovs_mutex_unlock(&netdev->mutex);
870 netdev_linux_rxq_destruct(struct netdev_rxq *rxq_)
872 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
880 netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_)
882 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
888 auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux)
890 if (aux->tp_status & TP_STATUS_VLAN_TPID_VALID) {
891 return htons(aux->tp_vlan_tpid);
893 return htons(ETH_TYPE_VLAN);
898 auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
900 return aux->tp_vlan_tci || aux->tp_status & TP_STATUS_VLAN_VALID;
904 netdev_linux_rxq_recv_sock(int fd, struct ofpbuf *buffer)
909 struct cmsghdr *cmsg;
912 char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
916 /* Reserve headroom for a single VLAN tag */
917 ofpbuf_reserve(buffer, VLAN_HEADER_LEN);
918 size = ofpbuf_tailroom(buffer);
920 iov.iov_base = ofpbuf_data(buffer);
922 msgh.msg_name = NULL;
923 msgh.msg_namelen = 0;
926 msgh.msg_control = &cmsg_buffer;
927 msgh.msg_controllen = sizeof cmsg_buffer;
931 retval = recvmsg(fd, &msgh, MSG_TRUNC);
932 } while (retval < 0 && errno == EINTR);
936 } else if (retval > size) {
940 ofpbuf_set_size(buffer, ofpbuf_size(buffer) + retval);
942 for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg; cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
943 const struct tpacket_auxdata *aux;
945 if (cmsg->cmsg_level != SOL_PACKET
946 || cmsg->cmsg_type != PACKET_AUXDATA
947 || cmsg->cmsg_len < CMSG_LEN(sizeof(struct tpacket_auxdata))) {
951 aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg));
952 if (auxdata_has_vlan_tci(aux)) {
953 if (retval < ETH_HEADER_LEN) {
957 eth_push_vlan(buffer, auxdata_to_vlan_tpid(aux),
958 htons(aux->tp_vlan_tci));
967 netdev_linux_rxq_recv_tap(int fd, struct ofpbuf *buffer)
970 size_t size = ofpbuf_tailroom(buffer);
973 retval = read(fd, ofpbuf_data(buffer), size);
974 } while (retval < 0 && errno == EINTR);
978 } else if (retval > size) {
982 ofpbuf_set_size(buffer, ofpbuf_size(buffer) + retval);
987 netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct ofpbuf **packet, int *c)
989 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
990 struct netdev *netdev = rx->up.netdev;
991 struct ofpbuf *buffer;
995 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
996 mtu = ETH_PAYLOAD_MAX;
999 buffer = ofpbuf_new_with_headroom(VLAN_ETH_HEADER_LEN + mtu, DP_NETDEV_HEADROOM);
1001 retval = (rx->is_tap
1002 ? netdev_linux_rxq_recv_tap(rx->fd, buffer)
1003 : netdev_linux_rxq_recv_sock(rx->fd, buffer));
1006 if (retval != EAGAIN && retval != EMSGSIZE) {
1007 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
1008 ovs_strerror(errno), netdev_rxq_get_name(rxq_));
1010 ofpbuf_delete(buffer);
1012 dp_packet_pad(buffer);
1021 netdev_linux_rxq_wait(struct netdev_rxq *rxq_)
1023 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1024 poll_fd_wait(rx->fd, POLLIN);
1028 netdev_linux_rxq_drain(struct netdev_rxq *rxq_)
1030 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1033 int error = af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_), &ifr,
1034 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
1038 drain_fd(rx->fd, ifr.ifr_qlen);
1041 return drain_rcvbuf(rx->fd);
1045 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
1046 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
1047 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
1048 * the packet is too big or too small to transmit on the device.
1050 * The caller retains ownership of 'buffer' in all cases.
1052 * The kernel maintains a packet transmission queue, so the caller is not
1053 * expected to do additional queuing of packets. */
1055 netdev_linux_send(struct netdev *netdev_, struct ofpbuf *pkt, bool may_steal)
1057 const void *data = ofpbuf_data(pkt);
1058 size_t size = ofpbuf_size(pkt);
1063 if (!is_tap_netdev(netdev_)) {
1064 /* Use our AF_PACKET socket to send to this device. */
1065 struct sockaddr_ll sll;
1071 sock = af_packet_sock();
1076 ifindex = netdev_get_ifindex(netdev_);
1081 /* We don't bother setting most fields in sockaddr_ll because the
1082 * kernel ignores them for SOCK_RAW. */
1083 memset(&sll, 0, sizeof sll);
1084 sll.sll_family = AF_PACKET;
1085 sll.sll_ifindex = ifindex;
1087 iov.iov_base = CONST_CAST(void *, data);
1090 msg.msg_name = &sll;
1091 msg.msg_namelen = sizeof sll;
1094 msg.msg_control = NULL;
1095 msg.msg_controllen = 0;
1098 retval = sendmsg(sock, &msg, 0);
1100 /* Use the tap fd to send to this device. This is essential for
1101 * tap devices, because packets sent to a tap device with an
1102 * AF_PACKET socket will loop back to be *received* again on the
1103 * tap device. This doesn't occur on other interface types
1104 * because we attach a socket filter to the rx socket. */
1105 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1107 retval = write(netdev->tap_fd, data, size);
1115 /* The Linux AF_PACKET implementation never blocks waiting for room
1116 * for packets, instead returning ENOBUFS. Translate this into
1117 * EAGAIN for the caller. */
1118 if (errno == ENOBUFS) {
1120 } else if (errno == EINTR) {
1122 } else if (errno != EAGAIN) {
1123 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
1124 netdev_get_name(netdev_), ovs_strerror(errno));
1127 } else if (retval != size) {
1128 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE" bytes of "
1129 "%"PRIuSIZE") on %s", retval, size, netdev_get_name(netdev_));
1137 /* Registers with the poll loop to wake up from the next call to poll_block()
1138 * when the packet transmission queue has sufficient room to transmit a packet
1139 * with netdev_send().
1141 * The kernel maintains a packet transmission queue, so the client is not
1142 * expected to do additional queuing of packets. Thus, this function is
1143 * unlikely to ever be used. It is included for completeness. */
1145 netdev_linux_send_wait(struct netdev *netdev)
1147 if (is_tap_netdev(netdev)) {
1148 /* A tap device always accepts packets. */
1149 poll_immediate_wake();
1153 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1154 * otherwise a positive errno value. */
1156 netdev_linux_set_etheraddr(struct netdev *netdev_,
1157 const uint8_t mac[ETH_ADDR_LEN])
1159 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1160 enum netdev_flags old_flags = 0;
1163 ovs_mutex_lock(&netdev->mutex);
1165 if (netdev->cache_valid & VALID_ETHERADDR) {
1166 error = netdev->ether_addr_error;
1167 if (error || eth_addr_equals(netdev->etheraddr, mac)) {
1170 netdev->cache_valid &= ~VALID_ETHERADDR;
1173 /* Tap devices must be brought down before setting the address. */
1174 if (is_tap_netdev(netdev_)) {
1175 update_flags(netdev, NETDEV_UP, 0, &old_flags);
1177 error = set_etheraddr(netdev_get_name(netdev_), mac);
1178 if (!error || error == ENODEV) {
1179 netdev->ether_addr_error = error;
1180 netdev->cache_valid |= VALID_ETHERADDR;
1182 memcpy(netdev->etheraddr, mac, ETH_ADDR_LEN);
1186 if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
1187 update_flags(netdev, 0, NETDEV_UP, &old_flags);
1191 ovs_mutex_unlock(&netdev->mutex);
1195 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1197 netdev_linux_get_etheraddr(const struct netdev *netdev_,
1198 uint8_t mac[ETH_ADDR_LEN])
1200 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1203 ovs_mutex_lock(&netdev->mutex);
1204 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1205 netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
1207 netdev->cache_valid |= VALID_ETHERADDR;
1210 error = netdev->ether_addr_error;
1212 memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN);
1214 ovs_mutex_unlock(&netdev->mutex);
1220 netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
1224 if (!(netdev->cache_valid & VALID_MTU)) {
1227 netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
1228 netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
1229 netdev->mtu = ifr.ifr_mtu;
1230 netdev->cache_valid |= VALID_MTU;
1233 error = netdev->netdev_mtu_error;
1235 *mtup = netdev->mtu;
1241 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1242 * in bytes, not including the hardware header; thus, this is typically 1500
1243 * bytes for Ethernet devices. */
1245 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1247 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1250 ovs_mutex_lock(&netdev->mutex);
1251 error = netdev_linux_get_mtu__(netdev, mtup);
1252 ovs_mutex_unlock(&netdev->mutex);
1257 /* Sets the maximum size of transmitted (MTU) for given device using linux
1258 * networking ioctl interface.
1261 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1263 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1267 ovs_mutex_lock(&netdev->mutex);
1268 if (netdev->cache_valid & VALID_MTU) {
1269 error = netdev->netdev_mtu_error;
1270 if (error || netdev->mtu == mtu) {
1273 netdev->cache_valid &= ~VALID_MTU;
1276 error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
1277 SIOCSIFMTU, "SIOCSIFMTU");
1278 if (!error || error == ENODEV) {
1279 netdev->netdev_mtu_error = error;
1280 netdev->mtu = ifr.ifr_mtu;
1281 netdev->cache_valid |= VALID_MTU;
1284 ovs_mutex_unlock(&netdev->mutex);
1288 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1289 * On failure, returns a negative errno value. */
1291 netdev_linux_get_ifindex(const struct netdev *netdev_)
1293 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1296 ovs_mutex_lock(&netdev->mutex);
1297 error = get_ifindex(netdev_, &ifindex);
1298 ovs_mutex_unlock(&netdev->mutex);
1300 return error ? -error : ifindex;
1304 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1306 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1308 ovs_mutex_lock(&netdev->mutex);
1309 if (netdev->miimon_interval > 0) {
1310 *carrier = netdev->miimon;
1312 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1314 ovs_mutex_unlock(&netdev->mutex);
1319 static long long int
1320 netdev_linux_get_carrier_resets(const struct netdev *netdev_)
1322 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1323 long long int carrier_resets;
1325 ovs_mutex_lock(&netdev->mutex);
1326 carrier_resets = netdev->carrier_resets;
1327 ovs_mutex_unlock(&netdev->mutex);
1329 return carrier_resets;
1333 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1334 struct mii_ioctl_data *data)
1339 memset(&ifr, 0, sizeof ifr);
1340 memcpy(&ifr.ifr_data, data, sizeof *data);
1341 error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
1342 memcpy(data, &ifr.ifr_data, sizeof *data);
1348 netdev_linux_get_miimon(const char *name, bool *miimon)
1350 struct mii_ioctl_data data;
1355 memset(&data, 0, sizeof data);
1356 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1358 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1359 data.reg_num = MII_BMSR;
1360 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1364 *miimon = !!(data.val_out & BMSR_LSTATUS);
1366 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1369 struct ethtool_cmd ecmd;
1371 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1374 COVERAGE_INC(netdev_get_ethtool);
1375 memset(&ecmd, 0, sizeof ecmd);
1376 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1379 struct ethtool_value eval;
1381 memcpy(&eval, &ecmd, sizeof eval);
1382 *miimon = !!eval.data;
1384 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1392 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1393 long long int interval)
1395 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1397 ovs_mutex_lock(&netdev->mutex);
1398 interval = interval > 0 ? MAX(interval, 100) : 0;
1399 if (netdev->miimon_interval != interval) {
1402 if (interval && !netdev->miimon_interval) {
1403 atomic_add(&miimon_cnt, 1, &junk);
1404 } else if (!interval && netdev->miimon_interval) {
1405 atomic_sub(&miimon_cnt, 1, &junk);
1408 netdev->miimon_interval = interval;
1409 timer_set_expired(&netdev->miimon_timer);
1411 ovs_mutex_unlock(&netdev->mutex);
1417 netdev_linux_miimon_run(void)
1419 struct shash device_shash;
1420 struct shash_node *node;
1422 shash_init(&device_shash);
1423 netdev_get_devices(&netdev_linux_class, &device_shash);
1424 SHASH_FOR_EACH (node, &device_shash) {
1425 struct netdev *netdev = node->data;
1426 struct netdev_linux *dev = netdev_linux_cast(netdev);
1429 ovs_mutex_lock(&dev->mutex);
1430 if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
1431 netdev_linux_get_miimon(dev->up.name, &miimon);
1432 if (miimon != dev->miimon) {
1433 dev->miimon = miimon;
1434 netdev_linux_changed(dev, dev->ifi_flags, 0);
1437 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1439 ovs_mutex_unlock(&dev->mutex);
1440 netdev_close(netdev);
1443 shash_destroy(&device_shash);
1447 netdev_linux_miimon_wait(void)
1449 struct shash device_shash;
1450 struct shash_node *node;
1452 shash_init(&device_shash);
1453 netdev_get_devices(&netdev_linux_class, &device_shash);
1454 SHASH_FOR_EACH (node, &device_shash) {
1455 struct netdev *netdev = node->data;
1456 struct netdev_linux *dev = netdev_linux_cast(netdev);
1458 ovs_mutex_lock(&dev->mutex);
1459 if (dev->miimon_interval > 0) {
1460 timer_wait(&dev->miimon_timer);
1462 ovs_mutex_unlock(&dev->mutex);
1463 netdev_close(netdev);
1465 shash_destroy(&device_shash);
/* Exchanges the values stored in '*a' and '*b'.  Passing the same pointer
 * for both leaves the value unchanged. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    uint64_t tmp = *a;

    *a = *b;
    *b = tmp;
}
1476 /* Copies 'src' into 'dst', performing format conversion in the process.
1478 * 'src' is allowed to be misaligned. */
1480 netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1481 const struct ovs_vport_stats *src)
1483 dst->rx_packets = get_unaligned_u64(&src->rx_packets);
1484 dst->tx_packets = get_unaligned_u64(&src->tx_packets);
1485 dst->rx_bytes = get_unaligned_u64(&src->rx_bytes);
1486 dst->tx_bytes = get_unaligned_u64(&src->tx_bytes);
1487 dst->rx_errors = get_unaligned_u64(&src->rx_errors);
1488 dst->tx_errors = get_unaligned_u64(&src->tx_errors);
1489 dst->rx_dropped = get_unaligned_u64(&src->rx_dropped);
1490 dst->tx_dropped = get_unaligned_u64(&src->tx_dropped);
1492 dst->collisions = 0;
1493 dst->rx_length_errors = 0;
1494 dst->rx_over_errors = 0;
1495 dst->rx_crc_errors = 0;
1496 dst->rx_frame_errors = 0;
1497 dst->rx_fifo_errors = 0;
1498 dst->rx_missed_errors = 0;
1499 dst->tx_aborted_errors = 0;
1500 dst->tx_carrier_errors = 0;
1501 dst->tx_fifo_errors = 0;
1502 dst->tx_heartbeat_errors = 0;
1503 dst->tx_window_errors = 0;
1507 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1509 struct dpif_linux_vport reply;
1513 error = dpif_linux_vport_get(netdev_get_name(netdev), &reply, &buf);
1516 } else if (!reply.stats) {
1521 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
1529 get_stats_via_vport(const struct netdev *netdev_,
1530 struct netdev_stats *stats)
1532 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1534 if (!netdev->vport_stats_error ||
1535 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1538 error = get_stats_via_vport__(netdev_, stats);
1539 if (error && error != ENOENT) {
1540 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1542 netdev_get_name(netdev_), ovs_strerror(error));
1544 netdev->vport_stats_error = error;
1545 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
1549 /* Retrieves current device stats for 'netdev-linux'. */
1551 netdev_linux_get_stats(const struct netdev *netdev_,
1552 struct netdev_stats *stats)
1554 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1555 struct netdev_stats dev_stats;
1558 ovs_mutex_lock(&netdev->mutex);
1559 get_stats_via_vport(netdev_, stats);
1560 error = get_stats_via_netlink(netdev_, &dev_stats);
1562 if (!netdev->vport_stats_error) {
1565 } else if (netdev->vport_stats_error) {
1566 /* stats not available from OVS then use netdev stats. */
1569 /* Use kernel netdev's packet and byte counts since vport's counters
1570 * do not reflect packet counts on the wire when GSO, TSO or GRO are
1572 stats->rx_packets = dev_stats.rx_packets;
1573 stats->rx_bytes = dev_stats.rx_bytes;
1574 stats->tx_packets = dev_stats.tx_packets;
1575 stats->tx_bytes = dev_stats.tx_bytes;
1577 stats->rx_errors += dev_stats.rx_errors;
1578 stats->tx_errors += dev_stats.tx_errors;
1579 stats->rx_dropped += dev_stats.rx_dropped;
1580 stats->tx_dropped += dev_stats.tx_dropped;
1581 stats->multicast += dev_stats.multicast;
1582 stats->collisions += dev_stats.collisions;
1583 stats->rx_length_errors += dev_stats.rx_length_errors;
1584 stats->rx_over_errors += dev_stats.rx_over_errors;
1585 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1586 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1587 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1588 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1589 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1590 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1591 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1592 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1593 stats->tx_window_errors += dev_stats.tx_window_errors;
1595 ovs_mutex_unlock(&netdev->mutex);
1600 /* Retrieves current device stats for 'netdev-tap' netdev or
1601 * netdev-internal. */
1603 netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
1605 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1606 struct netdev_stats dev_stats;
1609 ovs_mutex_lock(&netdev->mutex);
1610 get_stats_via_vport(netdev_, stats);
1611 error = get_stats_via_netlink(netdev_, &dev_stats);
1613 if (!netdev->vport_stats_error) {
1616 } else if (netdev->vport_stats_error) {
1617 /* Transmit and receive stats will appear to be swapped relative to the
1618 * other ports since we are the one sending the data, not a remote
1619 * computer. For consistency, we swap them back here. This does not
1620 * apply if we are getting stats from the vport layer because it always
1621 * tracks stats from the perspective of the switch. */
1624 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1625 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1626 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1627 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1628 stats->rx_length_errors = 0;
1629 stats->rx_over_errors = 0;
1630 stats->rx_crc_errors = 0;
1631 stats->rx_frame_errors = 0;
1632 stats->rx_fifo_errors = 0;
1633 stats->rx_missed_errors = 0;
1634 stats->tx_aborted_errors = 0;
1635 stats->tx_carrier_errors = 0;
1636 stats->tx_fifo_errors = 0;
1637 stats->tx_heartbeat_errors = 0;
1638 stats->tx_window_errors = 0;
1640 /* Use kernel netdev's packet and byte counts since vport counters
1641 * do not reflect packet counts on the wire when GSO, TSO or GRO
1643 stats->rx_packets = dev_stats.tx_packets;
1644 stats->rx_bytes = dev_stats.tx_bytes;
1645 stats->tx_packets = dev_stats.rx_packets;
1646 stats->tx_bytes = dev_stats.rx_bytes;
1648 stats->rx_dropped += dev_stats.tx_dropped;
1649 stats->tx_dropped += dev_stats.rx_dropped;
1651 stats->rx_errors += dev_stats.tx_errors;
1652 stats->tx_errors += dev_stats.rx_errors;
1654 stats->multicast += dev_stats.multicast;
1655 stats->collisions += dev_stats.collisions;
1657 ovs_mutex_unlock(&netdev->mutex);
1663 netdev_internal_get_stats(const struct netdev *netdev_,
1664 struct netdev_stats *stats)
1666 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1669 ovs_mutex_lock(&netdev->mutex);
1670 get_stats_via_vport(netdev_, stats);
1671 error = netdev->vport_stats_error;
1672 ovs_mutex_unlock(&netdev->mutex);
1678 netdev_internal_set_stats(struct netdev *netdev,
1679 const struct netdev_stats *stats)
1681 struct ovs_vport_stats vport_stats;
1682 struct dpif_linux_vport vport;
1685 vport_stats.rx_packets = stats->rx_packets;
1686 vport_stats.tx_packets = stats->tx_packets;
1687 vport_stats.rx_bytes = stats->rx_bytes;
1688 vport_stats.tx_bytes = stats->tx_bytes;
1689 vport_stats.rx_errors = stats->rx_errors;
1690 vport_stats.tx_errors = stats->tx_errors;
1691 vport_stats.rx_dropped = stats->rx_dropped;
1692 vport_stats.tx_dropped = stats->tx_dropped;
1694 dpif_linux_vport_init(&vport);
1695 vport.cmd = OVS_VPORT_CMD_SET;
1696 vport.name = netdev_get_name(netdev);
1697 vport.stats = &vport_stats;
1699 err = dpif_linux_vport_transact(&vport, NULL, NULL);
1701 /* If the vport layer doesn't know about the device, that doesn't mean it
1702 * doesn't exist (after all were able to open it when netdev_open() was
1703 * called), it just means that it isn't attached and we'll be getting
1704 * stats a different way. */
1705 if (err == ENODEV) {
1713 netdev_linux_read_features(struct netdev_linux *netdev)
1715 struct ethtool_cmd ecmd;
1719 if (netdev->cache_valid & VALID_FEATURES) {
1723 COVERAGE_INC(netdev_get_ethtool);
1724 memset(&ecmd, 0, sizeof ecmd);
1725 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
1726 ETHTOOL_GSET, "ETHTOOL_GSET");
1731 /* Supported features. */
1732 netdev->supported = 0;
1733 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1734 netdev->supported |= NETDEV_F_10MB_HD;
1736 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1737 netdev->supported |= NETDEV_F_10MB_FD;
1739 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1740 netdev->supported |= NETDEV_F_100MB_HD;
1742 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1743 netdev->supported |= NETDEV_F_100MB_FD;
1745 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1746 netdev->supported |= NETDEV_F_1GB_HD;
1748 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1749 netdev->supported |= NETDEV_F_1GB_FD;
1751 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1752 netdev->supported |= NETDEV_F_10GB_FD;
1754 if (ecmd.supported & SUPPORTED_TP) {
1755 netdev->supported |= NETDEV_F_COPPER;
1757 if (ecmd.supported & SUPPORTED_FIBRE) {
1758 netdev->supported |= NETDEV_F_FIBER;
1760 if (ecmd.supported & SUPPORTED_Autoneg) {
1761 netdev->supported |= NETDEV_F_AUTONEG;
1763 if (ecmd.supported & SUPPORTED_Pause) {
1764 netdev->supported |= NETDEV_F_PAUSE;
1766 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1767 netdev->supported |= NETDEV_F_PAUSE_ASYM;
1770 /* Advertised features. */
1771 netdev->advertised = 0;
1772 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1773 netdev->advertised |= NETDEV_F_10MB_HD;
1775 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1776 netdev->advertised |= NETDEV_F_10MB_FD;
1778 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1779 netdev->advertised |= NETDEV_F_100MB_HD;
1781 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1782 netdev->advertised |= NETDEV_F_100MB_FD;
1784 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1785 netdev->advertised |= NETDEV_F_1GB_HD;
1787 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1788 netdev->advertised |= NETDEV_F_1GB_FD;
1790 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1791 netdev->advertised |= NETDEV_F_10GB_FD;
1793 if (ecmd.advertising & ADVERTISED_TP) {
1794 netdev->advertised |= NETDEV_F_COPPER;
1796 if (ecmd.advertising & ADVERTISED_FIBRE) {
1797 netdev->advertised |= NETDEV_F_FIBER;
1799 if (ecmd.advertising & ADVERTISED_Autoneg) {
1800 netdev->advertised |= NETDEV_F_AUTONEG;
1802 if (ecmd.advertising & ADVERTISED_Pause) {
1803 netdev->advertised |= NETDEV_F_PAUSE;
1805 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1806 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
1809 /* Current settings. */
1811 if (speed == SPEED_10) {
1812 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1813 } else if (speed == SPEED_100) {
1814 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1815 } else if (speed == SPEED_1000) {
1816 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1817 } else if (speed == SPEED_10000) {
1818 netdev->current = NETDEV_F_10GB_FD;
1819 } else if (speed == 40000) {
1820 netdev->current = NETDEV_F_40GB_FD;
1821 } else if (speed == 100000) {
1822 netdev->current = NETDEV_F_100GB_FD;
1823 } else if (speed == 1000000) {
1824 netdev->current = NETDEV_F_1TB_FD;
1826 netdev->current = 0;
1829 if (ecmd.port == PORT_TP) {
1830 netdev->current |= NETDEV_F_COPPER;
1831 } else if (ecmd.port == PORT_FIBRE) {
1832 netdev->current |= NETDEV_F_FIBER;
1836 netdev->current |= NETDEV_F_AUTONEG;
1840 netdev->cache_valid |= VALID_FEATURES;
1841 netdev->get_features_error = error;
1844 /* Stores the features supported by 'netdev' into of '*current', '*advertised',
1845 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
1846 * Returns 0 if successful, otherwise a positive errno value. */
1848 netdev_linux_get_features(const struct netdev *netdev_,
1849 enum netdev_features *current,
1850 enum netdev_features *advertised,
1851 enum netdev_features *supported,
1852 enum netdev_features *peer)
1854 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1857 ovs_mutex_lock(&netdev->mutex);
1858 netdev_linux_read_features(netdev);
1859 if (!netdev->get_features_error) {
1860 *current = netdev->current;
1861 *advertised = netdev->advertised;
1862 *supported = netdev->supported;
1863 *peer = 0; /* XXX */
1865 error = netdev->get_features_error;
1866 ovs_mutex_unlock(&netdev->mutex);
1871 /* Set the features advertised by 'netdev' to 'advertise'. */
1873 netdev_linux_set_advertisements(struct netdev *netdev_,
1874 enum netdev_features advertise)
1876 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1877 struct ethtool_cmd ecmd;
1880 ovs_mutex_lock(&netdev->mutex);
1882 COVERAGE_INC(netdev_get_ethtool);
1883 memset(&ecmd, 0, sizeof ecmd);
1884 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1885 ETHTOOL_GSET, "ETHTOOL_GSET");
1890 ecmd.advertising = 0;
1891 if (advertise & NETDEV_F_10MB_HD) {
1892 ecmd.advertising |= ADVERTISED_10baseT_Half;
1894 if (advertise & NETDEV_F_10MB_FD) {
1895 ecmd.advertising |= ADVERTISED_10baseT_Full;
1897 if (advertise & NETDEV_F_100MB_HD) {
1898 ecmd.advertising |= ADVERTISED_100baseT_Half;
1900 if (advertise & NETDEV_F_100MB_FD) {
1901 ecmd.advertising |= ADVERTISED_100baseT_Full;
1903 if (advertise & NETDEV_F_1GB_HD) {
1904 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1906 if (advertise & NETDEV_F_1GB_FD) {
1907 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1909 if (advertise & NETDEV_F_10GB_FD) {
1910 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1912 if (advertise & NETDEV_F_COPPER) {
1913 ecmd.advertising |= ADVERTISED_TP;
1915 if (advertise & NETDEV_F_FIBER) {
1916 ecmd.advertising |= ADVERTISED_FIBRE;
1918 if (advertise & NETDEV_F_AUTONEG) {
1919 ecmd.advertising |= ADVERTISED_Autoneg;
1921 if (advertise & NETDEV_F_PAUSE) {
1922 ecmd.advertising |= ADVERTISED_Pause;
1924 if (advertise & NETDEV_F_PAUSE_ASYM) {
1925 ecmd.advertising |= ADVERTISED_Asym_Pause;
1927 COVERAGE_INC(netdev_set_ethtool);
1928 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1929 ETHTOOL_SSET, "ETHTOOL_SSET");
1932 ovs_mutex_unlock(&netdev->mutex);
1936 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1937 * successful, otherwise a positive errno value. */
1939 netdev_linux_set_policing(struct netdev *netdev_,
1940 uint32_t kbits_rate, uint32_t kbits_burst)
1942 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1943 const char *netdev_name = netdev_get_name(netdev_);
1946 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1947 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1948 : kbits_burst); /* Stick with user-specified value. */
1950 ovs_mutex_lock(&netdev->mutex);
1951 if (netdev->cache_valid & VALID_POLICING) {
1952 error = netdev->netdev_policing_error;
1953 if (error || (netdev->kbits_rate == kbits_rate &&
1954 netdev->kbits_burst == kbits_burst)) {
1955 /* Assume that settings haven't changed since we last set them. */
1958 netdev->cache_valid &= ~VALID_POLICING;
1961 COVERAGE_INC(netdev_set_policing);
1962 /* Remove any existing ingress qdisc. */
1963 error = tc_add_del_ingress_qdisc(netdev_, false);
1965 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1966 netdev_name, ovs_strerror(error));
1971 error = tc_add_del_ingress_qdisc(netdev_, true);
1973 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1974 netdev_name, ovs_strerror(error));
1978 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
1980 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1981 netdev_name, ovs_strerror(error));
1986 netdev->kbits_rate = kbits_rate;
1987 netdev->kbits_burst = kbits_burst;
1990 if (!error || error == ENODEV) {
1991 netdev->netdev_policing_error = error;
1992 netdev->cache_valid |= VALID_POLICING;
1994 ovs_mutex_unlock(&netdev->mutex);
1999 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
2002 const struct tc_ops *const *opsp;
2004 for (opsp = tcs; *opsp != NULL; opsp++) {
2005 const struct tc_ops *ops = *opsp;
2006 if (ops->tc_install && ops->ovs_name[0] != '\0') {
2007 sset_add(types, ops->ovs_name);
2013 static const struct tc_ops *
2014 tc_lookup_ovs_name(const char *name)
2016 const struct tc_ops *const *opsp;
2018 for (opsp = tcs; *opsp != NULL; opsp++) {
2019 const struct tc_ops *ops = *opsp;
2020 if (!strcmp(name, ops->ovs_name)) {
2027 static const struct tc_ops *
2028 tc_lookup_linux_name(const char *name)
2030 const struct tc_ops *const *opsp;
2032 for (opsp = tcs; *opsp != NULL; opsp++) {
2033 const struct tc_ops *ops = *opsp;
2034 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
2041 static struct tc_queue *
2042 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
2045 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2046 struct tc_queue *queue;
2048 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
2049 if (queue->queue_id == queue_id) {
/* Returns 'netdev''s queue numbered 'queue_id', or NULL if there is none.
 * The bucket is chosen by hashing 'queue_id' with hash_int(). */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    size_t hash = hash_int(queue_id, 0);

    return tc_find_queue__(netdev, queue_id, hash);
}
2063 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
2065 struct netdev_qos_capabilities *caps)
2067 const struct tc_ops *ops = tc_lookup_ovs_name(type);
2071 caps->n_queues = ops->n_queues;
2076 netdev_linux_get_qos(const struct netdev *netdev_,
2077 const char **typep, struct smap *details)
2079 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2082 ovs_mutex_lock(&netdev->mutex);
2083 error = tc_query_qdisc(netdev_);
2085 *typep = netdev->tc->ops->ovs_name;
2086 error = (netdev->tc->ops->qdisc_get
2087 ? netdev->tc->ops->qdisc_get(netdev_, details)
2090 ovs_mutex_unlock(&netdev->mutex);
2096 netdev_linux_set_qos(struct netdev *netdev_,
2097 const char *type, const struct smap *details)
2099 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2100 const struct tc_ops *new_ops;
2103 new_ops = tc_lookup_ovs_name(type);
2104 if (!new_ops || !new_ops->tc_install) {
2108 ovs_mutex_lock(&netdev->mutex);
2109 error = tc_query_qdisc(netdev_);
2114 if (new_ops == netdev->tc->ops) {
2115 error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
2117 /* Delete existing qdisc. */
2118 error = tc_del_qdisc(netdev_);
2122 ovs_assert(netdev->tc == NULL);
2124 /* Install new qdisc. */
2125 error = new_ops->tc_install(netdev_, details);
2126 ovs_assert((error == 0) == (netdev->tc != NULL));
2130 ovs_mutex_unlock(&netdev->mutex);
2135 netdev_linux_get_queue(const struct netdev *netdev_,
2136 unsigned int queue_id, struct smap *details)
2138 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2141 ovs_mutex_lock(&netdev->mutex);
2142 error = tc_query_qdisc(netdev_);
2144 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2146 ? netdev->tc->ops->class_get(netdev_, queue, details)
2149 ovs_mutex_unlock(&netdev->mutex);
2155 netdev_linux_set_queue(struct netdev *netdev_,
2156 unsigned int queue_id, const struct smap *details)
2158 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2161 ovs_mutex_lock(&netdev->mutex);
2162 error = tc_query_qdisc(netdev_);
2164 error = (queue_id < netdev->tc->ops->n_queues
2165 && netdev->tc->ops->class_set
2166 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
2169 ovs_mutex_unlock(&netdev->mutex);
2175 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
2177 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2180 ovs_mutex_lock(&netdev->mutex);
2181 error = tc_query_qdisc(netdev_);
2183 if (netdev->tc->ops->class_delete) {
2184 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2186 ? netdev->tc->ops->class_delete(netdev_, queue)
2192 ovs_mutex_unlock(&netdev->mutex);
2198 netdev_linux_get_queue_stats(const struct netdev *netdev_,
2199 unsigned int queue_id,
2200 struct netdev_queue_stats *stats)
2202 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2205 ovs_mutex_lock(&netdev->mutex);
2206 error = tc_query_qdisc(netdev_);
2208 if (netdev->tc->ops->class_get_stats) {
2209 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2211 stats->created = queue->created;
2212 error = netdev->tc->ops->class_get_stats(netdev_, queue,
2221 ovs_mutex_unlock(&netdev->mutex);
2226 struct queue_dump_state {
2227 struct nl_dump dump;
2232 start_queue_dump(const struct netdev *netdev, struct queue_dump_state *state)
2234 struct ofpbuf request;
2235 struct tcmsg *tcmsg;
2237 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
2241 tcmsg->tcm_parent = 0;
2242 nl_dump_start(&state->dump, NETLINK_ROUTE, &request);
2243 ofpbuf_uninit(&request);
2245 ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
2250 finish_queue_dump(struct queue_dump_state *state)
2252 ofpbuf_uninit(&state->buf);
2253 return nl_dump_done(&state->dump);
/* Iteration state for netdev_linux_queue_dump_start(),
 * netdev_linux_queue_dump_next() and netdev_linux_queue_dump_done(). */
struct netdev_linux_queue_state {
    unsigned int *queues;       /* Snapshot of queue ids, 'n_queues' long. */
    size_t cur_queue;           /* Index of the next id in 'queues' to visit. */
    size_t n_queues;            /* Number of elements in 'queues'. */
};
2263 netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
2265 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2268 ovs_mutex_lock(&netdev->mutex);
2269 error = tc_query_qdisc(netdev_);
2271 if (netdev->tc->ops->class_get) {
2272 struct netdev_linux_queue_state *state;
2273 struct tc_queue *queue;
2276 *statep = state = xmalloc(sizeof *state);
2277 state->n_queues = hmap_count(&netdev->tc->queues);
2278 state->cur_queue = 0;
2279 state->queues = xmalloc(state->n_queues * sizeof *state->queues);
2282 HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
2283 state->queues[i++] = queue->queue_id;
2289 ovs_mutex_unlock(&netdev->mutex);
2295 netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
2296 unsigned int *queue_idp, struct smap *details)
2298 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2299 struct netdev_linux_queue_state *state = state_;
2302 ovs_mutex_lock(&netdev->mutex);
2303 while (state->cur_queue < state->n_queues) {
2304 unsigned int queue_id = state->queues[state->cur_queue++];
2305 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2308 *queue_idp = queue_id;
2309 error = netdev->tc->ops->class_get(netdev_, queue, details);
2313 ovs_mutex_unlock(&netdev->mutex);
2319 netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
2322 struct netdev_linux_queue_state *state = state_;
2324 free(state->queues);
2330 netdev_linux_dump_queue_stats(const struct netdev *netdev_,
2331 netdev_dump_queue_stats_cb *cb, void *aux)
2333 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2336 ovs_mutex_lock(&netdev->mutex);
2337 error = tc_query_qdisc(netdev_);
2339 struct queue_dump_state state;
2341 if (!netdev->tc->ops->class_dump_stats) {
2343 } else if (!start_queue_dump(netdev_, &state)) {
2349 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
2350 retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
2357 retval = finish_queue_dump(&state);
2363 ovs_mutex_unlock(&netdev->mutex);
2369 netdev_linux_get_in4(const struct netdev *netdev_,
2370 struct in_addr *address, struct in_addr *netmask)
2372 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2375 ovs_mutex_lock(&netdev->mutex);
2376 if (!(netdev->cache_valid & VALID_IN4)) {
2377 error = netdev_linux_get_ipv4(netdev_, &netdev->address,
2378 SIOCGIFADDR, "SIOCGIFADDR");
2380 error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
2381 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2383 netdev->cache_valid |= VALID_IN4;
2391 if (netdev->address.s_addr != INADDR_ANY) {
2392 *address = netdev->address;
2393 *netmask = netdev->netmask;
2395 error = EADDRNOTAVAIL;
2398 ovs_mutex_unlock(&netdev->mutex);
2404 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2405 struct in_addr netmask)
2407 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2410 ovs_mutex_lock(&netdev->mutex);
2411 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2413 netdev->cache_valid |= VALID_IN4;
2414 netdev->address = address;
2415 netdev->netmask = netmask;
2416 if (address.s_addr != INADDR_ANY) {
2417 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2418 "SIOCSIFNETMASK", netmask);
2421 ovs_mutex_unlock(&netdev->mutex);
/* Parses 'line', one line in the format scanned below (as read from
 * /proc/net/if_inet6 by netdev_linux_get_in6()): 16 two-digit hex bytes of
 * address, four hex fields that are ignored, and an interface name of at most
 * 16 characters.
 *
 * On success stores the address in '*in6' and the name in 'ifname' and
 * returns true; returns false if 'line' does not match. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    uint8_t *b = in6->s6_addr;
#define X8 "%2"SCNx8
    return ovs_scan(line,
                    " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                    "%*x %*x %*x %*x %16s\n",
                    &b[0], &b[1], &b[2], &b[3],
                    &b[4], &b[5], &b[6], &b[7],
                    &b[8], &b[9], &b[10], &b[11],
                    &b[12], &b[13], &b[14], &b[15],
                    ifname);
}
2442 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2443 * 'in6' is non-null) and returns true. Otherwise, returns false. */
2445 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2447 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2449 ovs_mutex_lock(&netdev->mutex);
2450 if (!(netdev->cache_valid & VALID_IN6)) {
2454 netdev->in6 = in6addr_any;
2456 file = fopen("/proc/net/if_inet6", "r");
2458 const char *name = netdev_get_name(netdev_);
2459 while (fgets(line, sizeof line, file)) {
2460 struct in6_addr in6_tmp;
2461 char ifname[16 + 1];
2462 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2463 && !strcmp(name, ifname))
2465 netdev->in6 = in6_tmp;
2471 netdev->cache_valid |= VALID_IN6;
2474 ovs_mutex_unlock(&netdev->mutex);
/* Initializes '*sa' as an AF_INET socket address for 'addr' with port 0.
 * Every byte of '*sa' that the address does not occupy is zeroed. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in in4;

    memset(&in4, 0, sizeof in4);
    in4.sin_family = AF_INET;
    in4.sin_port = 0;
    in4.sin_addr = addr;

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &in4, sizeof in4);
}
2493 do_set_addr(struct netdev *netdev,
2494 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2498 make_in4_sockaddr(&ifr.ifr_addr, addr);
2499 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2503 /* Adds 'router' as a default IP gateway.
 *
 * Builds a route entry whose destination and netmask are both INADDR_ANY,
 * whose gateway is 'router' and whose flags are RTF_UP | RTF_GATEWAY, and
 * submits it with ioctl(SIOCADDRT).  'netdev' is unused.  A warning that
 * includes the error string is logged for the ioctl (presumably only when it
 * fails -- confirm). */
2505 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2507 struct in_addr any = { INADDR_ANY };
2511 memset(&rt, 0, sizeof rt);
2512 make_in4_sockaddr(&rt.rt_dst, any);
2513 make_in4_sockaddr(&rt.rt_gateway, router);
2514 make_in4_sockaddr(&rt.rt_genmask, any);
2515 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2516 error = af_inet_ioctl(SIOCADDRT, &rt);
2518 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
/* Looks for a route to 'host' in /proc/net/route.
 *
 * '*netdev_name' is set to NULL before the file is opened.  Each line is
 * parsed into an interface name, destination, gateway, flags, mask and
 * several counters; routes without RTF_UP are skipped.  For a route whose
 * masked destination equals the masked 'host', '*next_hop' is set to 0 when
 * the host is directly reachable or to the route's gateway otherwise, and
 * '*netdev_name' receives an xstrdup()'d copy of the interface name
 * (presumably owned and freed by the caller -- confirm).
 *
 * A rate-limited warning is logged when the file cannot be opened or a line
 * cannot be parsed. */
2524 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2527 static const char fn[] = "/proc/net/route";
2532 *netdev_name = NULL;
2533 stream = fopen(fn, "r");
2534 if (stream == NULL) {
2535 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
2540 while (fgets(line, sizeof line, stream)) {
2543 ovs_be32 dest, gateway, mask;
2544 int refcnt, metric, mtu;
2545 unsigned int flags, use, window, irtt;
2548 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2550 iface, &dest, &gateway, &flags, &refcnt,
2551 &use, &metric, &mask, &mtu, &window, &irtt)) {
2552 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2556 if (!(flags & RTF_UP)) {
2557 /* Skip routes that aren't up. */
2561 /* The output of 'dest', 'mask', and 'gateway' were given in
2562 * network byte order, so we don't need any endian
2563 * conversions here. */
2564 if ((dest & mask) == (host->s_addr & mask)) {
2566 /* The host is directly reachable. */
2567 next_hop->s_addr = 0;
2569 /* To reach the host, we must go through a gateway. */
2570 next_hop->s_addr = gateway;
2572 *netdev_name = xstrdup(iface);
/* Adds "driver_name", "driver_version" and "firmware_version" entries to
 * 'smap', taken from the device's ethtool driver information.
 *
 * The information is cached in 'netdev->drvinfo'.  When the VALID_DRVINFO bit
 * of 'cache_valid' is clear, the buffer is zeroed and fetched again through
 * netdev_linux_do_ethtool() with ETHTOOL_GDRVINFO, and VALID_DRVINFO is set
 * (presumably only when the query succeeds -- confirm).  Everything happens
 * while holding 'netdev->mutex'. */
2584 netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
2586 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2589 ovs_mutex_lock(&netdev->mutex);
2590 if (!(netdev->cache_valid & VALID_DRVINFO)) {
2591 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
2593 COVERAGE_INC(netdev_get_ethtool);
2594 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2595 error = netdev_linux_do_ethtool(netdev->up.name,
2598 "ETHTOOL_GDRVINFO");
2600 netdev->cache_valid |= VALID_DRVINFO;
2605 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2606 smap_add(smap, "driver_version", netdev->drvinfo.version);
2607 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
2609 ovs_mutex_unlock(&netdev->mutex);
/* Status callback used by netdev_internal_class: reports a fixed
 * "driver_name" of "openvswitch" in 'smap'.  'netdev' is unused. */
2615 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2618 smap_add(smap, "driver_name", "openvswitch");
2622 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2623 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2624 * returns 0. Otherwise, it returns a positive errno value; in particular,
2625 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'.
 *
 * The lookup is an ioctl(SIOCGARP) whose request carries 'ip' as an AF_INET
 * protocol address, ARPHRD_ETHER as the hardware address family and the
 * device's name.  Failures other than ENXIO are reported with a rate-limited
 * warning. */
2627 netdev_linux_arp_lookup(const struct netdev *netdev,
2628 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2631 struct sockaddr_in sin;
2634 memset(&r, 0, sizeof r);
2635 memset(&sin, 0, sizeof sin);
2636 sin.sin_family = AF_INET;
2637 sin.sin_addr.s_addr = ip;
2639 memcpy(&r.arp_pa, &sin, sizeof sin);
2640 r.arp_ha.sa_family = ARPHRD_ETHER;
2642 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2643 COVERAGE_INC(netdev_arp_lookup);
2644 retval = af_inet_ioctl(SIOCGARP, &r);
2646 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2647 } else if (retval != ENXIO) {
2648 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2649 netdev_get_name(netdev), IP_ARGS(ip),
2650 ovs_strerror(retval));
/* Translates the netdev flag set 'nd' into Linux IFF_* interface flags.
 *
 * NETDEV_UP, NETDEV_PROMISC and NETDEV_LOOPBACK are the bits tested;
 * NETDEV_LOOPBACK maps to IFF_LOOPBACK (the other two presumably map to
 * IFF_UP and IFF_PROMISC -- confirm). */
2656 nd_to_iff_flags(enum netdev_flags nd)
2659 if (nd & NETDEV_UP) {
2662 if (nd & NETDEV_PROMISC) {
2665 if (nd & NETDEV_LOOPBACK) {
2666 iff |= IFF_LOOPBACK;
/* Translates Linux IFF_* interface flags in 'iff' into a netdev_flags set,
 * the reverse direction of nd_to_iff_flags().
 *
 * The result starts out as 0; IFF_PROMISC adds NETDEV_PROMISC and
 * IFF_LOOPBACK adds NETDEV_LOOPBACK. */
2672 iff_to_nd_flags(int iff)
2674 enum netdev_flags nd = 0;
2678 if (iff & IFF_PROMISC) {
2679 nd |= NETDEV_PROMISC;
2681 if (iff & IFF_LOOPBACK) {
2682 nd |= NETDEV_LOOPBACK;
/* Clears the 'off' flags and sets the 'on' flags of 'netdev', storing the
 * flags that were in effect beforehand in '*old_flagsp' (as netdev_flags).
 *
 * The starting point is the cached 'netdev->ifi_flags'.  The kernel is only
 * contacted when the resulting IFF_* word differs from the cached one:
 * set_flags() applies the new word and get_flags() then refreshes the cache.
 * The caller must hold 'netdev->mutex'. */
2688 update_flags(struct netdev_linux *netdev, enum netdev_flags off,
2689 enum netdev_flags on, enum netdev_flags *old_flagsp)
2690 OVS_REQUIRES(netdev->mutex)
2692 int old_flags, new_flags;
2695 old_flags = netdev->ifi_flags;
2696 *old_flagsp = iff_to_nd_flags(old_flags);
2697 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2698 if (new_flags != old_flags) {
2699 error = set_flags(netdev_get_name(&netdev->up), new_flags);
2700 get_flags(&netdev->up, &netdev->ifi_flags);
/* Locked wrapper around update_flags(): takes 'netdev->mutex', applies the
 * 'off' and 'on' flag changes (filling in '*old_flagsp'), and releases the
 * mutex again. */
2707 netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2708 enum netdev_flags on, enum netdev_flags *old_flagsp)
2710 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2713 ovs_mutex_lock(&netdev->mutex);
2714 error = update_flags(netdev, off, on, old_flagsp);
2715 ovs_mutex_unlock(&netdev->mutex);
/* Lists the netdev provider callbacks shared by the Linux-backed netdev
 * classes (it appears to expand to a struct netdev_class initializer, as used
 * by the class definitions that follow -- confirm).
 *
 * 'NAME', 'CONSTRUCT', 'GET_STATS', 'SET_STATS', 'GET_FEATURES' and
 * 'GET_STATUS' are the parts supplied per class; the netdev_linux_*
 * callbacks named in the body are common to every expansion.  The
 * get_config, set_config and get_tunnel_config slots are NULL. */
2720 #define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS, SET_STATS, \
2721 GET_FEATURES, GET_STATUS) \
2727 netdev_linux_wait, \
2729 netdev_linux_alloc, \
2731 netdev_linux_destruct, \
2732 netdev_linux_dealloc, \
2733 NULL, /* get_config */ \
2734 NULL, /* set_config */ \
2735 NULL, /* get_tunnel_config */ \
2737 netdev_linux_send, \
2738 netdev_linux_send_wait, \
2740 netdev_linux_set_etheraddr, \
2741 netdev_linux_get_etheraddr, \
2742 netdev_linux_get_mtu, \
2743 netdev_linux_set_mtu, \
2744 netdev_linux_get_ifindex, \
2745 netdev_linux_get_carrier, \
2746 netdev_linux_get_carrier_resets, \
2747 netdev_linux_set_miimon_interval, \
2752 netdev_linux_set_advertisements, \
2754 netdev_linux_set_policing, \
2755 netdev_linux_get_qos_types, \
2756 netdev_linux_get_qos_capabilities, \
2757 netdev_linux_get_qos, \
2758 netdev_linux_set_qos, \
2759 netdev_linux_get_queue, \
2760 netdev_linux_set_queue, \
2761 netdev_linux_delete_queue, \
2762 netdev_linux_get_queue_stats, \
2763 netdev_linux_queue_dump_start, \
2764 netdev_linux_queue_dump_next, \
2765 netdev_linux_queue_dump_done, \
2766 netdev_linux_dump_queue_stats, \
2768 netdev_linux_get_in4, \
2769 netdev_linux_set_in4, \
2770 netdev_linux_get_in6, \
2771 netdev_linux_add_router, \
2772 netdev_linux_get_next_hop, \
2774 netdev_linux_arp_lookup, \
2776 netdev_linux_update_flags, \
2778 netdev_linux_rxq_alloc, \
2779 netdev_linux_rxq_construct, \
2780 netdev_linux_rxq_destruct, \
2781 netdev_linux_rxq_dealloc, \
2782 netdev_linux_rxq_recv, \
2783 netdev_linux_rxq_wait, \
2784 netdev_linux_rxq_drain, \
/* Netdev class that combines netdev_linux_construct with
 * netdev_linux_get_stats, netdev_linux_get_features and
 * netdev_linux_get_status.  It provides no set_stats hook. */
2787 const struct netdev_class netdev_linux_class =
2790 netdev_linux_construct,
2791 netdev_linux_get_stats,
2792 NULL, /* set_stats */
2793 netdev_linux_get_features,
2794 netdev_linux_get_status);
/* Netdev class for tap devices: constructed by netdev_linux_construct_tap
 * and using netdev_tap_get_stats for statistics, while features and status
 * come from the same netdev_linux_* callbacks as netdev_linux_class.  It
 * provides no set_stats hook. */
2796 const struct netdev_class netdev_tap_class =
2799 netdev_linux_construct_tap,
2800 netdev_tap_get_stats,
2801 NULL, /* set_stats */
2802 netdev_linux_get_features,
2803 netdev_linux_get_status);
/* Netdev class for internal devices: uses netdev_linux_construct together
 * with the netdev_internal_* statistics getter/setter and status callback.
 * It provides no get_features hook. */
2805 const struct netdev_class netdev_internal_class =
2808 netdev_linux_construct,
2809 netdev_internal_get_stats,
2810 netdev_internal_set_stats,
2811 NULL, /* get_features */
2812 netdev_internal_get_status);
2814 /* HTB traffic control class.
 *
 * HTB_N_QUEUES bounds the class minor numbers that are treated as queues: a
 * class 1:<minor> with 0 < minor <= HTB_N_QUEUES is reported as queue
 * <minor - 1>.  Rates are kept in bytes/s and bursts in bytes, as noted on
 * the individual members. */
2816 #define HTB_N_QUEUES 0xf000
2820 unsigned int max_rate; /* In bytes/s. */
2824 struct tc_queue tc_queue;
2825 unsigned int min_rate; /* In bytes/s. */
2826 unsigned int max_rate; /* In bytes/s. */
2827 unsigned int burst; /* In bytes. */
2828 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the struct htb that embeds the current 'tc' of 'netdev_'.  This is
 * a plain CONTAINER_OF conversion, so it is only meaningful while the
 * device's tc is an HTB instance (presumably ensured by callers -- confirm). */
2832 htb_get__(const struct netdev *netdev_)
2834 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2835 return CONTAINER_OF(netdev->tc, struct htb, tc);
/* Allocates a struct htb, initializes its embedded tc with tc_ops_htb,
 * records 'max_rate' in it and makes it the current tc of 'netdev_'.
 *
 * NOTE(review): 'max_rate' is a uint64_t here while the max_rate members
 * declared above are unsigned int, so large values would be narrowed on
 * assignment -- confirm whether that is intended. */
2839 htb_install__(struct netdev *netdev_, uint64_t max_rate)
2841 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2844 htb = xmalloc(sizeof *htb);
2845 tc_init(&htb->tc, &tc_ops_htb);
2846 htb->max_rate = max_rate;
2848 netdev->tc = &htb->tc;
2851 /* Create an HTB qdisc.
2853 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1".
 *
 * Any existing qdisc is first removed with tc_del_qdisc().  The request is an
 * RTM_NEWQDISC with NLM_F_EXCL | NLM_F_CREATE, handle 1:0 under TC_H_ROOT,
 * kind "htb", and a nested TCA_HTB_INIT option block whose rate2quantum is
 * 10.  Returns the result of tc_transact(). */
2855 htb_setup_qdisc__(struct netdev *netdev)
2858 struct tc_htb_glob opt;
2859 struct ofpbuf request;
2860 struct tcmsg *tcmsg;
2862 tc_del_qdisc(netdev);
2864 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2865 NLM_F_EXCL | NLM_F_CREATE, &request);
2869 tcmsg->tcm_handle = tc_make_handle(1, 0);
2870 tcmsg->tcm_parent = TC_H_ROOT;
2872 nl_msg_put_string(&request, TCA_KIND, "htb");
2874 memset(&opt, 0, sizeof opt);
2875 opt.rate2quantum = 10;
2879 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2880 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2881 nl_msg_end_nested(&request, opt_offset);
2883 return tc_transact(&request, NULL);
2886 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2887 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>".
 *
 * 'class' supplies min_rate, max_rate, burst and priority.  The device MTU
 * is read first because the rate and buffer calculations take it as input; a
 * rate-limited warning is logged for a device that lacks an MTU.  The request
 * is an RTM_NEWTCLASS with NLM_F_CREATE carrying TCA_HTB_PARMS plus the rate
 * and ceil tables.  A rate-limited warning describing the class is also
 * logged for the transaction (presumably only when it fails -- confirm). */
2889 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2890 unsigned int parent, struct htb_class *class)
2893 struct tc_htb_opt opt;
2894 struct ofpbuf request;
2895 struct tcmsg *tcmsg;
2899 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
2901 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2902 netdev_get_name(netdev));
2906 memset(&opt, 0, sizeof opt);
2907 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2908 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2909 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2910 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2911 opt.prio = class->priority;
2913 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2917 tcmsg->tcm_handle = handle;
2918 tcmsg->tcm_parent = parent;
2920 nl_msg_put_string(&request, TCA_KIND, "htb");
2921 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2922 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2923 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2924 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2925 nl_msg_end_nested(&request, opt_offset);
2927 error = tc_transact(&request, NULL);
2929 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2930 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2931 netdev_get_name(netdev),
2932 tc_get_major(handle), tc_get_minor(handle),
2933 tc_get_major(parent), tc_get_minor(parent),
2934 class->min_rate, class->max_rate,
2935 class->burst, class->priority, ovs_strerror(error));
/* Extracts the mandatory TCA_HTB_PARMS attribute (at least
 * sizeof(struct tc_htb_opt) bytes) from the nested attributes in
 * 'nl_options' and copies it into 'class': min_rate from rate.rate, max_rate
 * from ceil.rate, burst from tc_ticks_to_bytes(rate.rate, buffer) and
 * priority from prio.  A rate-limited warning is logged when the nested
 * attributes fail to parse. */
2940 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2941 * description of them into 'details'. The description complies with the
2942 * specification given in the vswitch database documentation for linux-htb
2945 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2947 static const struct nl_policy tca_htb_policy[] = {
2948 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2949 .min_len = sizeof(struct tc_htb_opt) },
2952 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2953 const struct tc_htb_opt *htb;
2955 if (!nl_parse_nested(nl_options, tca_htb_policy,
2956 attrs, ARRAY_SIZE(tca_htb_policy))) {
2957 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2961 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2962 class->min_rate = htb->rate.rate;
2963 class->max_rate = htb->ceil.rate;
2964 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2965 class->priority = htb->prio;
/* Parses the class message 'tcmsg' with tc_parse_class(), passing 'stats'
 * through to it.
 *
 * If parsing succeeds and 'queue_id' is nonnull, a class handle of the form
 * 1:<minor> with 0 < minor <= HTB_N_QUEUES yields '*queue_id' = minor - 1.
 * If parsing succeeds and 'options' is nonnull, the class's nested options
 * are decoded into it by htb_parse_tca_options__(). */
2970 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2971 struct htb_class *options,
2972 struct netdev_queue_stats *stats)
2974 struct nlattr *nl_options;
2975 unsigned int handle;
2978 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2979 if (!error && queue_id) {
2980 unsigned int major = tc_get_major(handle);
2981 unsigned int minor = tc_get_minor(handle);
2982 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2983 *queue_id = minor - 1;
2988 if (!error && options) {
2989 error = htb_parse_tca_options__(nl_options, options);
/* Fills in 'hc' for the qdisc-level HTB class from 'details'.
 *
 * "max-rate" is divided by 8 before being stored (the struct members are in
 * bytes/s, so the setting is evidently in bits/s).  When it is absent or
 * works out to zero, the rate is derived instead from the device's current
 * link features, with 100 * 1000 * 1000 passed to netdev_features_to_bps()
 * as its second argument (presumably the fallback speed -- confirm).
 * 'hc->min_rate' is set equal to 'hc->max_rate'. */
2995 htb_parse_qdisc_details__(struct netdev *netdev_,
2996 const struct smap *details, struct htb_class *hc)
2998 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2999 const char *max_rate_s;
3001 max_rate_s = smap_get(details, "max-rate");
3002 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3003 if (!hc->max_rate) {
3004 enum netdev_features current;
3006 netdev_linux_read_features(netdev);
3007 current = !netdev->get_features_error ? netdev->current : 0;
3008 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3010 hc->min_rate = hc->max_rate;
/* Fills in 'hc' from the per-queue settings "min-rate", "max-rate", "burst"
 * and "priority" in 'details'.
 *
 * Rates and burst are divided by 8 as they are read.  min_rate is raised to
 * at least the device MTU and then capped at the qdisc's max_rate; max_rate
 * is raised to at least min_rate and capped at the qdisc's max_rate; burst
 * is at least MTU + 64; priority is 0 when unset.  A rate-limited warning is
 * logged for a device that lacks an MTU. */
3016 htb_parse_class_details__(struct netdev *netdev,
3017 const struct smap *details, struct htb_class *hc)
3019 const struct htb *htb = htb_get__(netdev);
3020 const char *min_rate_s = smap_get(details, "min-rate");
3021 const char *max_rate_s = smap_get(details, "max-rate");
3022 const char *burst_s = smap_get(details, "burst");
3023 const char *priority_s = smap_get(details, "priority");
3026 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3028 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
3029 netdev_get_name(netdev));
3033 /* HTB requires at least an mtu sized min-rate to send any traffic even
3034 * on uncongested links. */
3035 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3036 hc->min_rate = MAX(hc->min_rate, mtu);
3037 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
3040 hc->max_rate = (max_rate_s
3041 ? strtoull(max_rate_s, NULL, 10) / 8
3043 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
3044 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
3048 * According to hints in the documentation that I've read, it is important
3049 * that 'burst' be at least as big as the largest frame that might be
3050 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
3051 * but having it a bit too small is a problem. Since netdev_get_mtu()
3052 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
3053 * the MTU. We actually add 64, instead of 14, as a guard against
3054 * additional headers get tacked on somewhere that we're not aware of. */
3055 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
3056 hc->burst = MAX(hc->burst, mtu + 64);
3059 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Asks the kernel about class 'handle' under 'parent' on 'netdev' through
 * tc_query_class() and decodes the reply with htb_parse_tcmsg__() into
 * 'options' and 'stats'; no queue id is requested.  The reply buffer is
 * released with ofpbuf_delete(). */
3065 htb_query_class__(const struct netdev *netdev, unsigned int handle,
3066 unsigned int parent, struct htb_class *options,
3067 struct netdev_queue_stats *stats)
3069 struct ofpbuf *reply;
3072 error = tc_query_class(netdev, handle, parent, &reply);
3074 error = htb_parse_tcmsg__(reply, NULL, options, stats);
3075 ofpbuf_delete(reply);
/* Installs HTB on 'netdev': creates the qdisc with htb_setup_qdisc__(), sets
 * up class 1:fffe under 1:0 from the qdisc-level settings in 'details', and
 * records the resulting max_rate with htb_install__().  (Each step is
 * presumably conditional on the previous one succeeding -- confirm.) */
3081 htb_tc_install(struct netdev *netdev, const struct smap *details)
3085 error = htb_setup_qdisc__(netdev);
3087 struct htb_class hc;
3089 htb_parse_qdisc_details__(netdev, details, &hc);
3090 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3091 tc_make_handle(1, 0), &hc);
3093 htb_install__(netdev, hc.max_rate);
/* Returns the struct htb_class that embeds 'queue' as its 'tc_queue'
 * member. */
3099 static struct htb_class *
3100 htb_class_cast__(const struct tc_queue *queue)
3102 return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Records the parameters in 'hc' for queue 'queue_id' in the HTB state of
 * 'netdev'.
 *
 * The queue is looked up by id (hashed with hash_int()) in the tc's queue
 * map.  A new htb_class is allocated, stamped with the current time_msec()
 * and inserted into the map when one is needed (presumably when the lookup
 * finds nothing -- confirm).  Finally min_rate, max_rate, burst and priority
 * are copied from 'hc'. */
3106 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
3107 const struct htb_class *hc)
3109 struct htb *htb = htb_get__(netdev);
3110 size_t hash = hash_int(queue_id, 0);
3111 struct tc_queue *queue;
3112 struct htb_class *hcp;
3114 queue = tc_find_queue__(netdev, queue_id, hash);
3116 hcp = htb_class_cast__(queue);
3118 hcp = xmalloc(sizeof *hcp);
3119 queue = &hcp->tc_queue;
3120 queue->queue_id = queue_id;
3121 queue->created = time_msec();
3122 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
3125 hcp->min_rate = hc->min_rate;
3126 hcp->max_rate = hc->max_rate;
3127 hcp->burst = hc->burst;
3128 hcp->priority = hc->priority;
/* Loads HTB state for 'netdev' from the kernel: queries class 1:fffe to
 * learn the qdisc-level max_rate, installs a struct htb with that value, and
 * then walks a queue dump, recording every class that htb_parse_tcmsg__()
 * accepts through htb_update_queue__().  'nlmsg' is unused. */
3132 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3135 struct queue_dump_state state;
3136 struct htb_class hc;
3138 /* Get qdisc options. */
3140 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3141 htb_install__(netdev, hc.max_rate);
3144 if (!start_queue_dump(netdev, &state)) {
3147 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
3148 unsigned int queue_id;
3150 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3151 htb_update_queue__(netdev, queue_id, &hc);
3154 finish_queue_dump(&state);
/* Empties the queue map of the struct htb that embeds 'tc'.  The _SAFE
 * iterator is used because entries are removed while the map is being
 * walked. */
3160 htb_tc_destroy(struct tc *tc)
3162 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
3163 struct htb_class *hc, *next;
3165 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
3166 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Reports the qdisc's "max-rate" in 'details'.  The stored value is
 * multiplied by 8 (using 64-bit arithmetic) before it is formatted. */
3174 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
3176 const struct htb *htb = htb_get__(netdev);
3177 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
/* Reconfigures the qdisc-level class 1:fffe (parent 1:0) of 'netdev' from
 * 'details' and stores the new max_rate in the device's struct htb
 * (presumably only when the class update succeeds -- confirm). */
3182 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
3184 struct htb_class hc;
3187 htb_parse_qdisc_details__(netdev, details, &hc);
3188 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3189 tc_make_handle(1, 0), &hc);
3191 htb_get__(netdev)->max_rate = hc.max_rate;
/* Describes 'queue' in 'details'.  "min-rate", "burst" and "priority" are
 * added; "max-rate" is added only when it differs from min-rate.  Rates and
 * burst are multiplied by 8 for reporting.  'netdev' is unused. */
3197 htb_class_get(const struct netdev *netdev OVS_UNUSED,
3198 const struct tc_queue *queue, struct smap *details)
3200 const struct htb_class *hc = htb_class_cast__(queue);
3202 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3203 if (hc->min_rate != hc->max_rate) {
3204 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3206 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
3208 smap_add_format(details, "priority", "%u", hc->priority);
/* Configures queue 'queue_id' on 'netdev' from 'details': parses the
 * settings, installs them as class 1:<queue_id + 1> under parent 1:fffe, and
 * records them locally with htb_update_queue__(). */
3214 htb_class_set(struct netdev *netdev, unsigned int queue_id,
3215 const struct smap *details)
3217 struct htb_class hc;
3220 error = htb_parse_class_details__(netdev, details, &hc);
3225 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3226 tc_make_handle(1, 0xfffe), &hc);
3231 htb_update_queue__(netdev, queue_id, &hc);
/* Deletes the kernel class 1:<queue_id + 1> that backs 'queue' and removes
 * the queue from the HTB queue map (presumably only when the kernel deletion
 * succeeds -- confirm). */
3236 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
3238 struct htb_class *hc = htb_class_cast__(queue);
3239 struct htb *htb = htb_get__(netdev);
3242 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3244 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches statistics for 'queue' into 'stats' by querying class
 * 1:<queue_id + 1> under parent 1:fffe.  Class options are not requested. */
3251 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3252 struct netdev_queue_stats *stats)
3254 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3255 tc_make_handle(1, 0xfffe), NULL, stats);
/* Parses the class message 'nlmsg' and, when its handle is 1:<minor> with
 * 0 < minor <= HTB_N_QUEUES, invokes 'cb' with queue id minor - 1, the parsed
 * statistics and 'aux'.  'netdev' is unused. */
3259 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3260 const struct ofpbuf *nlmsg,
3261 netdev_dump_queue_stats_cb *cb, void *aux)
3263 struct netdev_queue_stats stats;
3264 unsigned int handle, major, minor;
3267 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3272 major = tc_get_major(handle);
3273 minor = tc_get_minor(handle);
3274 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3275 (*cb)(minor - 1, &stats, aux);
/* tc_ops table that ties the "htb" Linux qdisc to the "linux-htb" OVS QoS
 * type, with HTB_N_QUEUES as its queue count. */
3280 static const struct tc_ops tc_ops_htb = {
3281 "htb", /* linux_name */
3282 "linux-htb", /* ovs_name */
3283 HTB_N_QUEUES, /* n_queues */
3292 htb_class_get_stats,
3293 htb_class_dump_stats
3296 /* "linux-hfsc" traffic control class.
 *
 * HFSC_N_QUEUES bounds the class minor numbers that are treated as queues: a
 * class 1:<minor> with 0 < minor <= HFSC_N_QUEUES is reported as queue
 * <minor - 1>. */
3298 #define HFSC_N_QUEUES 0xf000
3306 struct tc_queue tc_queue;
/* Returns the struct hfsc that embeds the current 'tc' of 'netdev_'.  This
 * is a plain CONTAINER_OF conversion, so it is only meaningful while the
 * device's tc is an HFSC instance (presumably ensured by callers --
 * confirm). */
3311 static struct hfsc *
3312 hfsc_get__(const struct netdev *netdev_)
3314 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3315 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
/* Returns the struct hfsc_class that embeds 'queue' as its 'tc_queue'
 * member. */
3318 static struct hfsc_class *
3319 hfsc_class_cast__(const struct tc_queue *queue)
3321 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Allocates a struct hfsc, initializes its embedded tc with tc_ops_hfsc,
 * records 'max_rate' in it and makes it the current tc of 'netdev_'. */
3325 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
3327 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3330 hfsc = xmalloc(sizeof *hfsc);
3331 tc_init(&hfsc->tc, &tc_ops_hfsc);
3332 hfsc->max_rate = max_rate;
3333 netdev->tc = &hfsc->tc;
/* Records the rates in 'hc' for queue 'queue_id' in the HFSC state of
 * 'netdev'.
 *
 * The queue is looked up by id (hashed with hash_int()) in the tc's queue
 * map.  A new hfsc_class is allocated, stamped with the current time_msec()
 * and inserted into the map when one is needed (presumably when the lookup
 * finds nothing -- confirm).  Finally min_rate and max_rate are copied from
 * 'hc'. */
3337 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
3338 const struct hfsc_class *hc)
3342 struct hfsc_class *hcp;
3343 struct tc_queue *queue;
3345 hfsc = hfsc_get__(netdev);
3346 hash = hash_int(queue_id, 0);
3348 queue = tc_find_queue__(netdev, queue_id, hash);
3350 hcp = hfsc_class_cast__(queue);
3352 hcp = xmalloc(sizeof *hcp);
3353 queue = &hcp->tc_queue;
3354 queue->queue_id = queue_id;
3355 queue->created = time_msec();
3356 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
3359 hcp->min_rate = hc->min_rate;
3360 hcp->max_rate = hc->max_rate;
/* Decodes the nested HFSC class options in 'nl_options' into 'class'.
 *
 * The TCA_HFSC_RSC, TCA_HFSC_FSC and TCA_HFSC_USC service curves are read,
 * each at least sizeof(struct tc_service_curve) bytes.  A rate-limited
 * warning is logged when the attributes fail to parse, when any curve has a
 * nonzero m1 or d (non-linear), when the RSC and FSC m2 values differ, or
 * when the RSC m2 exceeds the USC m2.  min_rate is taken from the FSC m2 and
 * max_rate from the USC m2. */
3364 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
3366 const struct tc_service_curve *rsc, *fsc, *usc;
3367 static const struct nl_policy tca_hfsc_policy[] = {
3369 .type = NL_A_UNSPEC,
3371 .min_len = sizeof(struct tc_service_curve),
3374 .type = NL_A_UNSPEC,
3376 .min_len = sizeof(struct tc_service_curve),
3379 .type = NL_A_UNSPEC,
3381 .min_len = sizeof(struct tc_service_curve),
3384 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3386 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3387 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3388 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3392 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3393 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3394 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
3396 if (rsc->m1 != 0 || rsc->d != 0 ||
3397 fsc->m1 != 0 || fsc->d != 0 ||
3398 usc->m1 != 0 || usc->d != 0) {
3399 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3400 "Non-linear service curves are not supported.");
3404 if (rsc->m2 != fsc->m2) {
3405 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3406 "Real-time service curves are not supported ");
3410 if (rsc->m2 > usc->m2) {
3411 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3412 "Min-rate service curve is greater than "
3413 "the max-rate service curve.");
3417 class->min_rate = fsc->m2;
3418 class->max_rate = usc->m2;
/* Parses the class message 'tcmsg' with tc_parse_class(), passing 'stats'
 * through to it.  A class handle of the form 1:<minor> with
 * 0 < minor <= HFSC_N_QUEUES yields '*queue_id' = minor - 1, and the class's
 * nested options are decoded into 'options' by hfsc_parse_tca_options__()
 * (each presumably only when the corresponding pointer is nonnull --
 * confirm). */
3423 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3424 struct hfsc_class *options,
3425 struct netdev_queue_stats *stats)
3428 unsigned int handle;
3429 struct nlattr *nl_options;
3431 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3437 unsigned int major, minor;
3439 major = tc_get_major(handle);
3440 minor = tc_get_minor(handle);
3441 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3442 *queue_id = minor - 1;
3449 error = hfsc_parse_tca_options__(nl_options, options);
/* Asks the kernel about class 'handle' under 'parent' on 'netdev' through
 * tc_query_class() and decodes the reply with hfsc_parse_tcmsg__() into
 * 'options' and 'stats'; no queue id is requested.  The reply buffer is
 * released with ofpbuf_delete(). */
3456 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
3457 unsigned int parent, struct hfsc_class *options,
3458 struct netdev_queue_stats *stats)
3461 struct ofpbuf *reply;
3463 error = tc_query_class(netdev, handle, parent, &reply);
3468 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
3469 ofpbuf_delete(reply);
/* Fills in 'class' for the qdisc-level HFSC class from 'details'.
 *
 * "max-rate" is divided by 8 as it is read.  The rate can alternatively be
 * derived from the device's current link features, with 100 * 1000 * 1000
 * passed to netdev_features_to_bps() as its second argument (presumably this
 * path is the fallback for an absent or zero "max-rate" -- confirm).  Both
 * 'class->min_rate' and 'class->max_rate' are set to the resulting value. */
3474 hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
3475 struct hfsc_class *class)
3477 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3479 const char *max_rate_s;
3481 max_rate_s = smap_get(details, "max-rate");
3482 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3485 enum netdev_features current;
3487 netdev_linux_read_features(netdev);
3488 current = !netdev->get_features_error ? netdev->current : 0;
3489 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3492 class->min_rate = max_rate;
3493 class->max_rate = max_rate;
/* Fills in 'class' from the per-queue settings "min-rate" and "max-rate" in
 * 'details'.
 *
 * Both values are divided by 8 as they are read.  min_rate is raised to at
 * least 1 and then capped at the qdisc's max_rate; max_rate is raised to at
 * least min_rate and capped at the qdisc's max_rate. */
3497 hfsc_parse_class_details__(struct netdev *netdev,
3498 const struct smap *details,
3499 struct hfsc_class * class)
3501 const struct hfsc *hfsc;
3502 uint32_t min_rate, max_rate;
3503 const char *min_rate_s, *max_rate_s;
3505 hfsc = hfsc_get__(netdev);
3506 min_rate_s = smap_get(details, "min-rate");
3507 max_rate_s = smap_get(details, "max-rate");
3509 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3510 min_rate = MAX(min_rate, 1);
3511 min_rate = MIN(min_rate, hfsc->max_rate);
3513 max_rate = (max_rate_s
3514 ? strtoull(max_rate_s, NULL, 10) / 8
3516 max_rate = MAX(max_rate, min_rate);
3517 max_rate = MIN(max_rate, hfsc->max_rate);
3519 class->min_rate = min_rate;
3520 class->max_rate = max_rate;
3525 /* Create an HFSC qdisc.
3527 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1".
 *
 * Any existing qdisc is first removed with tc_del_qdisc().  The request is an
 * RTM_NEWQDISC with NLM_F_EXCL | NLM_F_CREATE, handle 1:0 under TC_H_ROOT,
 * kind "hfsc", and a struct tc_hfsc_qopt (zeroed before use) as TCA_OPTIONS.
 * Returns the result of tc_transact(). */
3529 hfsc_setup_qdisc__(struct netdev * netdev)
3531 struct tcmsg *tcmsg;
3532 struct ofpbuf request;
3533 struct tc_hfsc_qopt opt;
3535 tc_del_qdisc(netdev);
3537 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3538 NLM_F_EXCL | NLM_F_CREATE, &request);
3544 tcmsg->tcm_handle = tc_make_handle(1, 0);
3545 tcmsg->tcm_parent = TC_H_ROOT;
3547 memset(&opt, 0, sizeof opt);
3550 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3551 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3553 return tc_transact(&request, NULL);
3556 /* Create an HFSC class.
3558 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3559 * sc rate <min_rate> ul rate <max_rate>"
 *
 * The request is an RTM_NEWTCLASS with NLM_F_CREATE.  The curve whose m2 is
 * 'class->min_rate' is sent as both TCA_HFSC_RSC and TCA_HFSC_FSC, and the
 * curve whose m2 is 'class->max_rate' as TCA_HFSC_USC.  A rate-limited
 * warning describing the class is logged for the transaction (presumably
 * only when it fails -- confirm). */
3561 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3562 unsigned int parent, struct hfsc_class *class)
3566 struct tcmsg *tcmsg;
3567 struct ofpbuf request;
3568 struct tc_service_curve min, max;
3570 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3576 tcmsg->tcm_handle = handle;
3577 tcmsg->tcm_parent = parent;
3581 min.m2 = class->min_rate;
3585 max.m2 = class->max_rate;
3587 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3588 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3589 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3590 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3591 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3592 nl_msg_end_nested(&request, opt_offset);
3594 error = tc_transact(&request, NULL);
3596 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3597 "min-rate %ubps, max-rate %ubps (%s)",
3598 netdev_get_name(netdev),
3599 tc_get_major(handle), tc_get_minor(handle),
3600 tc_get_major(parent), tc_get_minor(parent),
3601 class->min_rate, class->max_rate, ovs_strerror(error));
/* Installs HFSC on 'netdev': creates the qdisc with hfsc_setup_qdisc__(),
 * sets up class 1:fffe under 1:0 from the qdisc-level settings in 'details',
 * and records the resulting max_rate with hfsc_install__().  (Each step is
 * presumably conditional on the previous one succeeding -- confirm.) */
3608 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
3611 struct hfsc_class class;
3613 error = hfsc_setup_qdisc__(netdev);
3619 hfsc_parse_qdisc_details__(netdev, details, &class);
3620 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3621 tc_make_handle(1, 0), &class);
3627 hfsc_install__(netdev, class.max_rate);
/* Loads HFSC state for 'netdev' from the kernel: queries class 1:fffe to
 * learn the qdisc-level max_rate, installs a struct hfsc with that value,
 * and then walks a queue dump, recording every class that
 * hfsc_parse_tcmsg__() accepts through hfsc_update_queue__().  'nlmsg' is
 * unused. */
3632 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3635 struct queue_dump_state state;
3636 struct hfsc_class hc;
3639 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3640 hfsc_install__(netdev, hc.max_rate);
3642 if (!start_queue_dump(netdev, &state)) {
3646 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
3647 unsigned int queue_id;
3649 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3650 hfsc_update_queue__(netdev, queue_id, &hc);
3654 finish_queue_dump(&state);
/* Empties the queue map of the struct hfsc that embeds 'tc'.  The _SAFE
 * iterator is used because entries are removed while the map is being
 * walked. */
3659 hfsc_tc_destroy(struct tc *tc)
3662 struct hfsc_class *hc, *next;
3664 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3666 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3667 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Reports the qdisc's "max-rate" in 'details'.  The stored value is
 * multiplied by 8 (using 64-bit arithmetic) before it is formatted. */
3676 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
3678 const struct hfsc *hfsc;
3679 hfsc = hfsc_get__(netdev);
3680 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
/* Reconfigures the qdisc-level class 1:fffe (parent 1:0) of 'netdev' from
 * 'details' and stores the new max_rate in the device's struct hfsc
 * (presumably only when the class update succeeds -- confirm). */
3685 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
3688 struct hfsc_class class;
3690 hfsc_parse_qdisc_details__(netdev, details, &class);
3691 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3692 tc_make_handle(1, 0), &class);
3695 hfsc_get__(netdev)->max_rate = class.max_rate;
/* Describes 'queue' in 'details'.  "min-rate" is added; "max-rate" is added
 * only when it differs from min-rate.  Both are multiplied by 8 for
 * reporting.  'netdev' is unused. */
3702 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3703 const struct tc_queue *queue, struct smap *details)
3705 const struct hfsc_class *hc;
3707 hc = hfsc_class_cast__(queue);
3708 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3709 if (hc->min_rate != hc->max_rate) {
3710 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
/* Configures queue 'queue_id' on 'netdev' from 'details': parses the
 * settings, installs them as class 1:<queue_id + 1> under parent 1:fffe, and
 * records them locally with hfsc_update_queue__(). */
3716 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3717 const struct smap *details)
3720 struct hfsc_class class;
3722 error = hfsc_parse_class_details__(netdev, details, &class);
3727 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3728 tc_make_handle(1, 0xfffe), &class);
3733 hfsc_update_queue__(netdev, queue_id, &class);
/* Deletes the kernel class 1:<queue_id + 1> that backs 'queue' and removes
 * the queue from the HFSC queue map (presumably only when the kernel
 * deletion succeeds -- confirm). */
3738 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3742 struct hfsc_class *hc;
3744 hc = hfsc_class_cast__(queue);
3745 hfsc = hfsc_get__(netdev);
3747 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3749 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches statistics for 'queue' into 'stats' by querying class
 * 1:<queue_id + 1> under parent 1:fffe.  Class options are not requested. */
3756 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3757 struct netdev_queue_stats *stats)
3759 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3760 tc_make_handle(1, 0xfffe), NULL, stats);
/* Parses the class message 'nlmsg' and, when its handle is 1:<minor> with
 * 0 < minor <= HFSC_N_QUEUES, invokes 'cb' with queue id minor - 1, the
 * parsed statistics and 'aux'.  'netdev' is unused. */
3764 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3765 const struct ofpbuf *nlmsg,
3766 netdev_dump_queue_stats_cb *cb, void *aux)
3768 struct netdev_queue_stats stats;
3769 unsigned int handle, major, minor;
3772 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3777 major = tc_get_major(handle);
3778 minor = tc_get_minor(handle);
3779 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3780 (*cb)(minor - 1, &stats, aux);
/* tc_ops table that ties the "hfsc" Linux qdisc to the "linux-hfsc" OVS QoS
 * type, with HFSC_N_QUEUES as its queue count and the hfsc_* functions as
 * its hooks. */
3785 static const struct tc_ops tc_ops_hfsc = {
3786 "hfsc", /* linux_name */
3787 "linux-hfsc", /* ovs_name */
3788 HFSC_N_QUEUES, /* n_queues */
3789 hfsc_tc_install, /* tc_install */
3790 hfsc_tc_load, /* tc_load */
3791 hfsc_tc_destroy, /* tc_destroy */
3792 hfsc_qdisc_get, /* qdisc_get */
3793 hfsc_qdisc_set, /* qdisc_set */
3794 hfsc_class_get, /* class_get */
3795 hfsc_class_set, /* class_set */
3796 hfsc_class_delete, /* class_delete */
3797 hfsc_class_get_stats, /* class_get_stats */
3798 hfsc_class_dump_stats /* class_dump_stats */
3801 /* "linux-default" traffic control class.
3803 * This class represents the default, unnamed Linux qdisc. It corresponds to
3804 * the "" (empty string) QoS type in the OVS database. */
/* Makes the current tc of 'netdev_' point at a single statically allocated
 * tc bound to tc_ops_default.  The object is declared const and shared, and
 * the const qualifier is cast away only to fit the 'tc' member's type. */
3807 default_install__(struct netdev *netdev_)
3809 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3810 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
3812 /* Nothing but a tc class implementation is allowed to write to a tc. This
3813 * class never does that, so we can legitimately use a const tc object. */
3814 netdev->tc = CONST_CAST(struct tc *, &tc);
/* Installs the default tc on 'netdev' by delegating to default_install__().
 * 'details' is unused. */
3818 default_tc_install(struct netdev *netdev,
3819 const struct smap *details OVS_UNUSED)
3821 default_install__(netdev);
/* Loads the default tc on 'netdev' by delegating to default_install__().
 * 'nlmsg' is unused. */
3826 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3828 default_install__(netdev);
/* tc_ops table for the default qdisc.  It has no Linux qdisc name, and its
 * tc_destroy, qdisc_* and class_* hooks are all NULL. */
3832 static const struct tc_ops tc_ops_default = {
3833 NULL, /* linux_name */
3838 NULL, /* tc_destroy */
3839 NULL, /* qdisc_get */
3840 NULL, /* qdisc_set */
3841 NULL, /* class_get */
3842 NULL, /* class_set */
3843 NULL, /* class_delete */
3844 NULL, /* class_get_stats */
3845 NULL /* class_dump_stats */
/* other_tc_load() makes the current tc of 'netdev_' point at a single
 * statically allocated tc bound to tc_ops_other.  The object is declared
 * const and shared; the const qualifier is cast away only to fit the 'tc'
 * member's type.  'nlmsg' is unused. */
3848 /* "linux-other" traffic control class.
3853 other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
3855 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3856 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
3858 /* Nothing but a tc class implementation is allowed to write to a tc. This
3859 * class never does that, so we can legitimately use a const tc object. */
3860 netdev->tc = CONST_CAST(struct tc *, &tc);
/* tc_ops table for the "linux-other" QoS type.  It has no Linux qdisc name
 * and no tc_install hook, and its tc_destroy, qdisc_* and class_* hooks are
 * all NULL. */
3864 static const struct tc_ops tc_ops_other = {
3865 NULL, /* linux_name */
3866 "linux-other", /* ovs_name */
3868 NULL, /* tc_install */
3870 NULL, /* tc_destroy */
3871 NULL, /* qdisc_get */
3872 NULL, /* qdisc_set */
3873 NULL, /* class_get */
3874 NULL, /* class_set */
3875 NULL, /* class_delete */
3876 NULL, /* class_get_stats */
3877 NULL /* class_dump_stats */
3880 /* Traffic control. */
3882 /* Number of kernel "tc" ticks per second.
 *
 * Computed from the values in /proc/net/psched and used by
 * tc_ticks_to_bytes() and tc_bytes_to_ticks() to convert between tick counts
 * and byte counts at a given rate. */
3883 static double ticks_per_s;
3885 /* Number of kernel "jiffies" per second. This is used for the purpose of
3886 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3887 * one jiffy's worth of data.
3889 * There are two possibilities here:
3891 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3892 * approximate range of 100 to 1024. That means that we really need to
3893 * make sure that the qdisc can buffer that much data.
3895 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3896 * has finely granular timers and there's no need to fudge additional room
3897 * for buffers. (There's no extra effort needed to implement that: the
3898 * large 'buffer_hz' is used as a divisor, so practically any number will
3899 * come out as 0 in the division. Small integer results in the case of
3900 * really high dividends won't have any real effect anyhow.)
 *
 * tc_buffer_per_jiffy() is the function that divides a byte rate by this
 * value.
3902 static unsigned int buffer_hz;
3904 /* Returns tc handle 'major':'minor'.
 *
 * 'major' is shifted into the upper 16 bits and combined with 'minor' by
 * TC_H_MAKE().  For example, tc_make_handle(0xffff, 0) is the "ffff:" handle
 * that this file uses for the ingress qdisc. */
3906 tc_make_handle(unsigned int major, unsigned int minor)
3908 return TC_H_MAKE(major << 16, minor);
3911 /* Returns the major number from 'handle'.
 *
 * This is the TC_H_MAJ() part of 'handle' shifted right by 16 bits, which
 * undoes the shift applied by tc_make_handle(). */
3913 tc_get_major(unsigned int handle)
3915 return TC_H_MAJ(handle) >> 16;
3918 /* Returns the minor number from 'handle', that is, its TC_H_MIN() part. */
3920 tc_get_minor(unsigned int handle)
3922 return TC_H_MIN(handle);
/* Begins a traffic-control Netlink request for 'netdev' in 'request'.
 *
 * Looks up 'netdev''s ifindex, initializes 'request', and appends a Netlink
 * header of the given 'type' (flags NLM_F_REQUEST | 'flags') followed by a
 * zeroed struct tcmsg whose family is AF_UNSPEC and whose tcm_ifindex is that
 * ifindex.  Callers use the result as a pointer to that tcmsg and fill in
 * tcm_handle and tcm_parent themselves before sending the request.
 *
 * NOTE(review): what happens when get_ifindex() fails is not visible in this
 * listing; presumably the function bails out before 'request' is initialized.
 * Confirm before touching 'request' after a failed call. */
3925 static struct tcmsg *
3926 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3927 struct ofpbuf *request)
3929 struct tcmsg *tcmsg;
3933 error = get_ifindex(netdev, &ifindex);
3938 ofpbuf_init(request, 512);
3939 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3940 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3941 tcmsg->tcm_family = AF_UNSPEC;
3942 tcmsg->tcm_ifindex = ifindex;
3943 /* Caller should fill in tcmsg->tcm_handle. */
3944 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' on NETLINK_ROUTE with nl_transact(), handing 'replyp'
 * through unchanged, and then uninitializes 'request' whatever the outcome,
 * so a caller must not reuse or release 'request' afterward.
 *
 * Callers in this file treat the result as an errno-style error code, with 0
 * meaning success. */
3950 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3952 int error = nl_transact(NETLINK_ROUTE, request, replyp);
3953 ofpbuf_uninit(request);
3957 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3958 * policing configuration.
3960 * This function is equivalent to running the following when 'add' is true:
3961 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3963 * This function is equivalent to running the following when 'add' is false:
3964 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3966 * The configuration and stats may be seen with the following command:
3967 * /sbin/tc -s qdisc show dev <devname>
3969 * Returns 0 if successful, otherwise a positive errno value.
 *
 * The request addresses handle ffff:0 under parent TC_H_INGRESS, with kind
 * "ingress" and an empty TCA_OPTIONS attribute.  Adding sends RTM_NEWQDISC
 * with NLM_F_EXCL | NLM_F_CREATE; deleting sends RTM_DELQDISC with no extra
 * flags and singles out ENOENT and EINVAL results for lenient handling.
3972 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3974 struct ofpbuf request;
3975 struct tcmsg *tcmsg;
3977 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3978 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3980 tcmsg = tc_make_request(netdev, type, flags, &request);
3984 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3985 tcmsg->tcm_parent = TC_H_INGRESS;
3986 nl_msg_put_string(&request, TCA_KIND, "ingress");
3987 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3989 error = tc_transact(&request, NULL);
3991 /* If we're deleting the qdisc, don't worry about some of the
3992 * error conditions. */
3993 if (!add && (error == ENOENT || error == EINVAL)) {
4002 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
4005 * This function is equivalent to running:
4006 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
4007 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
4010 * The configuration and stats may be seen with the following command:
4011 * /sbin/tc -s filter show <devname> eth0 parent ffff:
4013 * Returns 0 if successful, otherwise a positive errno value.
4016 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
4018 struct tc_police tc_police;
4019 struct ofpbuf request;
4020 struct tcmsg *tcmsg;
4021 size_t basic_offset;
4022 size_t police_offset;
4026 memset(&tc_police, 0, sizeof tc_police);
4027 tc_police.action = TC_POLICE_SHOT;
4028 tc_police.mtu = mtu;
4029 tc_fill_rate(&tc_police.rate, (kbits_rate * 1000)/8, mtu);
4030 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
4031 kbits_burst * 1024);
4033 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
4034 NLM_F_EXCL | NLM_F_CREATE, &request);
4038 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
4039 tcmsg->tcm_info = tc_make_handle(49,
4040 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
4042 nl_msg_put_string(&request, TCA_KIND, "basic");
4043 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4044 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
4045 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
4046 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
4047 nl_msg_end_nested(&request, police_offset);
4048 nl_msg_end_nested(&request, basic_offset);
4050 error = tc_transact(&request, NULL);
4061 /* The values in psched are not individually very meaningful, but they are
4062 * important. The tables below show some values seen in the wild.
4066 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
4067 * (Before that, there are hints that it was 1000000000.)
4069 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
 *   above.
 *
 * The first table shows the four hexadecimal fields as they appear in the
 * file; the second shows the same fields (a, b, c, d) in decimal together
 * with the values derived from them.
4073 * -----------------------------------
4074 * [1] 000c8000 000f4240 000f4240 00000064
4075 * [2] 000003e8 00000400 000f4240 3b9aca00
4076 * [3] 000003e8 00000400 000f4240 3b9aca00
4077 * [4] 000003e8 00000400 000f4240 00000064
4078 * [5] 000003e8 00000040 000f4240 3b9aca00
4079 * [6] 000003e8 00000040 000f4240 000000f9
4081 * a b c d ticks_per_s buffer_hz
4082 * ------- --------- ---------- ------------- ----------- -------------
4083 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
4084 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4085 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4086 * [4] 1,000 1,024 1,000,000 100 976,562 100
4087 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
4088 * [6] 1,000 64 1,000,000 249 15,625,000 249
4090 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
4091 * [2] 2.6.26-1-686-bigmem from Debian lenny
4092 * [3] 2.6.26-2-sparc64 from Debian lenny
4093 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
4094 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
4095 * [6] 2.6.34 from kernel.org on KVM
 *
 * The code that follows is guarded by an ovsthread_once.  It opens
 * /proc/net/psched, reads four hexadecimal words into a, b, c, and d, and
 * sets ticks_per_s to a * c / b, computed in double precision.  Open
 * failures, short reads, invalid parameters, and unexpected parameters are
 * each reported with a warning.
 *
 * NOTE(review): the validity check and the assignment to buffer_hz are not
 * visible in this listing; confirm that a zero 'b' is rejected before the
 * division.
4097 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4098 static const char fn[] = "/proc/net/psched";
4099 unsigned int a, b, c, d;
4102 if (!ovsthread_once_start(&once)) {
4109 stream = fopen(fn, "r");
4111 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
4115 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
4116 VLOG_WARN("%s: read failed", fn);
4120 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
4124 VLOG_WARN("%s: invalid scheduler parameters", fn);
4128 ticks_per_s = (double) a * c / b;
4132 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
4135 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
4138 ovsthread_once_done(&once);
4141 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
4142 * rate of 'rate' bytes per second. */
4144 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
4147 return (rate * ticks) / ticks_per_s;
4150 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
4151 * rate of 'rate' bytes per second.
 *
 * A 'rate' of 0 yields 0 instead of dividing by zero.  'ticks_per_s' is
 * truncated to an integer before it is multiplied by 'size', and the
 * multiplication and division are carried out in unsigned long long. */
4153 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
4156 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
4159 /* Returns the number of bytes that need to be reserved for qdisc buffering at
4160 * a transmission rate of 'rate' bytes per second.
 *
 * This is the integer quotient 'rate' / 'buffer_hz', i.e. the number of bytes
 * sent during one jiffy.
 *
 * NOTE(review): assumes 'buffer_hz' is nonzero by the time the division
 * runs; confirm against the code that initializes it. */
4162 tc_buffer_per_jiffy(unsigned int rate)
4165 return rate / buffer_hz;
4168 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
4169 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
4170 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
4171 * stores NULL into it if it is absent.
4173 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
 * 'msg'.
 *
 * Attributes are parsed starting just past the Netlink header and the
 * struct tcmsg.  A message that does not satisfy the policy below is reported
 * with a rate-limited warning.
4176 * Returns 0 if successful, otherwise a positive errno value. */
4178 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
4179 struct nlattr **options)
/* TCA_KIND is mandatory; TCA_OPTIONS may be missing. */
4181 static const struct nl_policy tca_policy[] = {
4182 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
4183 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
4185 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4187 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4188 tca_policy, ta, ARRAY_SIZE(ta))) {
4189 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
4194 *kind = nl_attr_get_string(ta[TCA_KIND]);
4198 *options = ta[TCA_OPTIONS];
4213 /* Given Netlink 'msg' that describes a class, extracts the class's handle
4214 * (the tcmsg's tcm_handle, stored unmodified) into '*handlep', its TCA_OPTIONS
4215 * attribute into '*options', and its queue statistics into '*stats'. Any of the
4216 * output arguments may be null.
 *
 * The parsing policy requires both TCA_OPTIONS and TCA_STATS2 to be present,
 * and requires TCA_STATS_BASIC and TCA_STATS_QUEUE inside TCA_STATS2.  From
 * the statistics, tx_bytes and tx_packets are taken from the basic counters
 * and tx_errors from the queue's drop count.  Parse failures are reported
 * with rate-limited warnings.
 *
4218 * Returns 0 if successful, otherwise a positive errno value. */
4220 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
4221 struct nlattr **options, struct netdev_queue_stats *stats)
4223 static const struct nl_policy tca_policy[] = {
4224 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
4225 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
4227 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4229 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4230 tca_policy, ta, ARRAY_SIZE(ta))) {
4231 VLOG_WARN_RL(&rl, "failed to parse class message");
/* The struct tcmsg sits immediately after the Netlink header. */
4236 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
4237 *handlep = tc->tcm_handle;
4241 *options = ta[TCA_OPTIONS];
4245 const struct gnet_stats_queue *gsq;
4246 struct gnet_stats_basic gsb;
4248 static const struct nl_policy stats_policy[] = {
4249 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
4250 .min_len = sizeof gsb },
4251 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
4252 .min_len = sizeof *gsq },
4254 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
4256 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
4257 sa, ARRAY_SIZE(sa))) {
4258 VLOG_WARN_RL(&rl, "failed to parse class stats");
4262 /* Alignment issues screw up the length of struct gnet_stats_basic on
4263 * some arch/bitsize combinations. Newer versions of Linux have a
4264 * struct gnet_stats_basic_packed, but we can't depend on that. The
4265 * easiest thing to do is just to make a copy. */
4266 memset(&gsb, 0, sizeof gsb);
4267 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
4268 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
4269 stats->tx_bytes = gsb.bytes;
4270 stats->tx_packets = gsb.packets;
4272 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
/* Drops are the only queue counter reported; they are exposed as errors. */
4273 stats->tx_errors = gsq->drops;
4283 memset(stats, 0, sizeof *stats);
4288 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
 * on 'netdev'.
 *
 * The query is an RTM_GETTCLASS request sent with NLM_F_ECHO, and the result
 * of the transaction is handed back through 'replyp'.  A failure is reported
 * with a rate-limited warning that shows both 'handle' and 'parent' in
 * major:minor form.
4291 tc_query_class(const struct netdev *netdev,
4292 unsigned int handle, unsigned int parent,
4293 struct ofpbuf **replyp)
4295 struct ofpbuf request;
4296 struct tcmsg *tcmsg;
4299 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
4303 tcmsg->tcm_handle = handle;
4304 tcmsg->tcm_parent = parent;
4306 error = tc_transact(&request, replyp);
4308 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
4309 netdev_get_name(netdev),
4310 tc_get_major(handle), tc_get_minor(handle),
4311 tc_get_major(parent), tc_get_minor(parent),
4312 ovs_strerror(error));
4317 /* Equivalent to "tc class del dev <name> handle <handle>".
 *
 * Sends RTM_DELTCLASS for 'handle' with a zero parent and no reply buffer.
 * A failure is reported with a rate-limited warning that shows 'handle' in
 * major:minor form. */
4319 tc_delete_class(const struct netdev *netdev, unsigned int handle)
4321 struct ofpbuf request;
4322 struct tcmsg *tcmsg;
4325 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
4329 tcmsg->tcm_handle = handle;
4330 tcmsg->tcm_parent = 0;
4332 error = tc_transact(&request, NULL);
4334 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
4335 netdev_get_name(netdev),
4336 tc_get_major(handle), tc_get_minor(handle),
4337 ovs_strerror(error));
4342 /* Equivalent to "tc qdisc del dev <name> root".
 *
 * Sends RTM_DELQDISC for handle 1:0 with parent TC_H_ROOT.  An EINVAL result
 * is special-cased (see the comment below).  When the deletion counts as
 * successful and the netdev has a cached tc, that tc's tc_destroy hook is
 * called if the class provides one. */
4344 tc_del_qdisc(struct netdev *netdev_)
4346 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4347 struct ofpbuf request;
4348 struct tcmsg *tcmsg;
4351 tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
4355 tcmsg->tcm_handle = tc_make_handle(1, 0);
4356 tcmsg->tcm_parent = TC_H_ROOT;
4358 error = tc_transact(&request, NULL);
4359 if (error == EINVAL) {
4360 /* EINVAL probably means that the default qdisc was in use, in which
4361 * case we've accomplished our purpose. */
4364 if (!error && netdev->tc) {
4365 if (netdev->tc->ops->tc_destroy) {
4366 netdev->tc->ops->tc_destroy(netdev->tc);
4373 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
4374 * kernel to determine what they are. Returns 0 if successful, otherwise a
4375 * positive errno value.
 *
 * The tc class is chosen from the outcome of an RTM_GETQDISC request for
 * handle 1:0:
 *
 *   - A reply that names a qdisc known to tc_lookup_linux_name() selects that
 *     class.  An unrecognized name is logged and selects tc_ops_other, and a
 *     reply that cannot be parsed also appears to select tc_ops_other.
 *
 *   - ENOENT selects tc_ops_default.
 *
 *   - Any other error is logged and selects tc_ops_other.
 *
 * The chosen class's tc_load hook is then run on the reply.  The function
 * returns 'error' if it is nonzero at that point, otherwise the tc_load
 * result. */
4377 tc_query_qdisc(const struct netdev *netdev_)
4379 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4380 struct ofpbuf request, *qdisc;
4381 const struct tc_ops *ops;
4382 struct tcmsg *tcmsg;
4390 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4391 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4392 * 2.6.35 without that fix backported to it.
4394 * To avoid the OOPS, we must not make a request that would attempt to dump
4395 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4396 * few others. There are a few ways that I can see to do this, but most of
4397 * them seem to be racy (and if you lose the race the kernel OOPSes). The
4398 * technique chosen here is to assume that any non-default qdisc that we
4399 * create will have a class with handle 1:0. The built-in qdiscs only have
4400 * a class with handle 0:0.
4402 * We could check for Linux 2.6.35+ and use a more straightforward method
4404 tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
4408 tcmsg->tcm_handle = tc_make_handle(1, 0);
4409 tcmsg->tcm_parent = 0;
4411 /* Figure out what tc class to instantiate. */
4412 error = tc_transact(&request, &qdisc);
4416 error = tc_parse_qdisc(qdisc, &kind, NULL);
4418 ops = &tc_ops_other;
4420 ops = tc_lookup_linux_name(kind);
4422 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4423 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
4425 ops = &tc_ops_other;
4428 } else if (error == ENOENT) {
4429 /* Either it's a built-in qdisc, or it's a qdisc set up by some
4430 * other entity that doesn't have a handle 1:0. We will assume
4431 * that it's the system default qdisc. */
4432 ops = &tc_ops_default;
4435 /* Who knows? Maybe the device got deleted. */
4436 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
4437 netdev_get_name(netdev_), ovs_strerror(error));
4438 ops = &tc_ops_other;
4441 /* Instantiate it. */
4442 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
/* A tc_load hook must leave netdev->tc set exactly when it succeeds. */
4443 ovs_assert((load_error == 0) == (netdev->tc != NULL));
4444 ofpbuf_delete(qdisc);
4446 return error ? error : load_error;
4449 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4450 approximate the time to transmit packets of various lengths. For an MTU of
4451 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4452 represents two possible packet lengths; for a MTU of 513 through 1024, four
4453 possible lengths; and so on.

   ETH_PAYLOAD_MAX can stand in for 'mtu' (the condition that selects it is
   not visible in this listing), and the Ethernet and VLAN header lengths are
   added to 'mtu' before the shift count is worked out.

4455 Returns, for the specified 'mtu', the number of bits that packet lengths
4456 need to be shifted right to fit within such a 256-entry table. */
4458 tc_calc_cell_log(unsigned int mtu)
4463 mtu = ETH_PAYLOAD_MAX;
4465 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
4467 for (cell_log = 0; mtu >= 256; cell_log++) {
4474 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
 * of 'mtu'.
 *
 * The ratespec is zeroed first.  Its cell_log comes from tc_calc_cell_log()
 * and its mpu is set to ETH_TOTAL_MIN; 'overhead' and 'cell_align' are left
 * at zero by the memset rather than assigned by name.
 *
 * NOTE(review): the statement that stores 'Bps' into the ratespec is not
 * visible in this listing.
4477 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4479 memset(rate, 0, sizeof *rate);
4480 rate->cell_log = tc_calc_cell_log(mtu);
4481 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4482 /* rate->cell_align = 0; */ /* distro headers. */
4483 rate->mpu = ETH_TOTAL_MIN;
4487 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4488 * attribute of the specified "type".
 *
 * The attribute occupies TC_RTAB_SIZE bytes.  Entry 'i' holds the number of
 * ticks needed to send a packet of (i + 1) << rate->cell_log bytes at
 * rate->rate, with packet sizes below rate->mpu rounded up to rate->mpu.
 *
4490 * See tc_calc_cell_log() above for a description of "rtab"s. */
4492 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
4497 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
4498 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
4499 unsigned packet_size = (i + 1) << rate->cell_log;
4500 if (packet_size < rate->mpu) {
4501 packet_size = rate->mpu;
4503 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
4507 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
4508 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
4509 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
 * 0 simply selects the minimum described next.)
 *
 * The minimum burst is one jiffy's worth of data at 'Bps', as given by
 * tc_buffer_per_jiffy(), plus 'mtu'.  The larger of that minimum and
 * 'burst_bytes' is converted to ticks at 'Bps' with tc_bytes_to_ticks().
 *
 * NOTE(review): 'burst_bytes' is 64 bits wide but tc_bytes_to_ticks() takes
 * an unsigned int size, so a very large request would be truncated.
4512 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
4514 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
4515 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
4518 /* Linux-only functions declared in netdev-linux.h */
4520 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4521 * 'enable' is true, the bit is set. Otherwise, it is cleared.
 *
 * The current flags are read with ETHTOOL_GFLAGS, the updated value is
 * written with ETHTOOL_SFLAGS, and the flags are then read a second time.  If
 * the value read back differs from the value written, a rate-limited warning
 * that includes 'flag_name' is logged. */
4523 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4524 const char *flag_name, bool enable)
4526 const char *netdev_name = netdev_get_name(netdev);
4527 struct ethtool_value evalue;
4531 COVERAGE_INC(netdev_get_ethtool);
4532 memset(&evalue, 0, sizeof evalue);
4533 error = netdev_linux_do_ethtool(netdev_name,
4534 (struct ethtool_cmd *)&evalue,
4535 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4540 COVERAGE_INC(netdev_set_ethtool);
/* Replace only the 'flag' bits and remember the value being requested. */
4541 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4542 error = netdev_linux_do_ethtool(netdev_name,
4543 (struct ethtool_cmd *)&evalue,
4544 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
/* Read the flags back so that the result can be compared with the request. */
4549 COVERAGE_INC(netdev_get_ethtool);
4550 memset(&evalue, 0, sizeof evalue);
4551 error = netdev_linux_do_ethtool(netdev_name,
4552 (struct ethtool_cmd *)&evalue,
4553 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4558 if (new_flags != evalue.data) {
4559 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4560 "device %s failed", enable ? "enable" : "disable",
4561 flag_name, netdev_name);
4568 /* Utility functions. */
4570 /* Copies 'src' into 'dst', performing format conversion in the process.
 *
 * Each counter shown here is assigned to the member of 'dst' that has the
 * same name as the member of 'src' it is read from. */
4572 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4573 const struct rtnl_link_stats *src)
4575 dst->rx_packets = src->rx_packets;
4576 dst->tx_packets = src->tx_packets;
4577 dst->rx_bytes = src->rx_bytes;
4578 dst->tx_bytes = src->tx_bytes;
4579 dst->rx_errors = src->rx_errors;
4580 dst->tx_errors = src->tx_errors;
4581 dst->rx_dropped = src->rx_dropped;
4582 dst->tx_dropped = src->tx_dropped;
4583 dst->multicast = src->multicast;
4584 dst->collisions = src->collisions;
4585 dst->rx_length_errors = src->rx_length_errors;
4586 dst->rx_over_errors = src->rx_over_errors;
4587 dst->rx_crc_errors = src->rx_crc_errors;
4588 dst->rx_frame_errors = src->rx_frame_errors;
4589 dst->rx_fifo_errors = src->rx_fifo_errors;
4590 dst->rx_missed_errors = src->rx_missed_errors;
4591 dst->tx_aborted_errors = src->tx_aborted_errors;
4592 dst->tx_carrier_errors = src->tx_carrier_errors;
4593 dst->tx_fifo_errors = src->tx_fifo_errors;
4594 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4595 dst->tx_window_errors = src->tx_window_errors;
/* Fills 'stats' for 'netdev_' from the kernel's link statistics.
 *
 * Sends an RTM_GETLINK request that identifies the device by name
 * (IFLA_IFNAME) and converts the IFLA_STATS attribute of the reply with
 * netdev_stats_from_rtnl_link_stats().  Rate-limited warnings are logged when
 * the reply is too short to hold the Netlink header and struct ifinfomsg, or
 * when it has no IFLA_STATS attribute of at least
 * sizeof(struct rtnl_link_stats) bytes.  The reply buffer is deleted before
 * returning. */
4599 get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
4601 struct ofpbuf request;
4602 struct ofpbuf *reply;
4605 ofpbuf_init(&request, 0);
4606 nl_msg_put_nlmsghdr(&request,
4607 sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
4608 RTM_GETLINK, NLM_F_REQUEST);
4609 ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
4610 nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
4611 error = nl_transact(NETLINK_ROUTE, &request, &reply);
4612 ofpbuf_uninit(&request);
/* Strip the headers so that only the attributes remain in 'reply'. */
4617 if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
4618 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS);
4619 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
4620 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
4623 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4627 VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");
4632 ofpbuf_delete(reply);
/* Reads the interface flags of 'dev' with the SIOCGIFFLAGS ioctl, addressed
 * by dev->name, and stores the resulting ifr_flags in '*flags'.
 *
 * NOTE(review): ifr_flags is declared as a short in <net/if.h>, so a value
 * with bit 15 set would be sign-extended by this assignment; confirm whether
 * any caller depends on the upper bits being clear. */
4637 get_flags(const struct netdev *dev, unsigned int *flags)
4643 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
4645 *flags = ifr.ifr_flags;
/* Sets the interface flags of the device named 'name' to 'flags' with the
 * SIOCSIFFLAGS ioctl and returns the result of af_inet_ifreq_ioctl(). */
4651 set_flags(const char *name, unsigned int flags)
4655 ifr.ifr_flags = flags;
4656 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
/* Looks up the ifindex of the device named 'netdev_name' with the
 * SIOCGIFINDEX ioctl and returns it.  An ioctl failure is reported with a
 * rate-limited warning.
 *
 * NOTE(review): the value returned on failure is not visible in this listing.
 * get_ifindex() negates the result to obtain an errno value, which suggests
 * a negative errno; confirm. */
4660 do_get_ifindex(const char *netdev_name)
4665 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4666 COVERAGE_INC(netdev_get_ifindex);
4668 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
4670 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4671 netdev_name, ovs_strerror(error));
4674 return ifr.ifr_ifindex;
/* Stores the ifindex of 'netdev_' in '*ifindexp' and returns the cached
 * lookup error (0 when the lookup succeeded).
 *
 * The lookup is performed only while VALID_IFINDEX is clear in the netdev's
 * 'cache_valid'.  Its outcome, whether an ifindex or an error paired with an
 * ifindex of 0, is then cached and VALID_IFINDEX is set, so later calls
 * return the cached values until that bit is cleared. */
4678 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4680 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4682 if (!(netdev->cache_valid & VALID_IFINDEX)) {
4683 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4686 netdev->get_ifindex_error = -ifindex;
4687 netdev->ifindex = 0;
4689 netdev->get_ifindex_error = 0;
4690 netdev->ifindex = ifindex;
4692 netdev->cache_valid |= VALID_IFINDEX;
4695 *ifindexp = netdev->ifindex;
4696 return netdev->get_ifindex_error;
/* Reads the hardware address of the device named 'netdev_name' with the
 * SIOCGIFHWADDR ioctl and copies its first ETH_ADDR_LEN bytes into 'ea'.
 *
 * An ioctl failure is logged at INFO level for ENODEV and at ERR level
 * otherwise.  A hardware address family other than AF_UNSPEC or ARPHRD_ETHER
 * is reported with a warning. */
4700 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4706 memset(&ifr, 0, sizeof ifr);
4707 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4708 COVERAGE_INC(netdev_get_hwaddr);
4709 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
4711 /* ENODEV probably means that a vif disappeared asynchronously and
4712 * hasn't been removed from the database yet, so reduce the log level
4713 * to INFO for that case. */
4714 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
4715 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4716 netdev_name, ovs_strerror(error));
4719 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4720 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4721 VLOG_WARN("%s device has unknown hardware address family %d",
4722 netdev_name, hwaddr_family);
4724 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the hardware address of the device named 'netdev_name' to 'mac' with
 * the SIOCSIFHWADDR ioctl, declaring the address family as ARPHRD_ETHER.  An
 * ioctl failure is logged at ERR level. */
4729 set_etheraddr(const char *netdev_name,
4730 const uint8_t mac[ETH_ADDR_LEN])
4735 memset(&ifr, 0, sizeof ifr);
4736 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4737 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
4738 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4739 COVERAGE_INC(netdev_set_hwaddr);
4740 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
4742 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4743 netdev_name, ovs_strerror(error));
/* Issues an ethtool request for the device named 'name' through the
 * SIOCETHTOOL ioctl, passing 'ecmd' as the ioctl's data pointer.  'cmd_name'
 * is used only in log messages.
 *
 * A failure is reported with a rate-limited warning unless the error is
 * EOPNOTSUPP, which stays silent because unsupported operations are common.
 *
 * NOTE(review): how 'cmd' reaches the request is not visible in this listing;
 * presumably it is stored in 'ecmd' before the ioctl is issued. */
4749 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4750 int cmd, const char *cmd_name)
4755 memset(&ifr, 0, sizeof ifr);
4756 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4757 ifr.ifr_data = (caddr_t) ecmd;
4760 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
4762 if (error != EOPNOTSUPP) {
4763 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4764 "failed: %s", cmd_name, name, ovs_strerror(error));
4766 /* The device doesn't support this operation. That's pretty
4767 * common, so there's no point in logging anything. */
/* Queries an IPv4 address of 'netdev' with the ifreq ioctl 'cmd' ('cmd_name'
 * is its printable name, handed to af_inet_ifreq_ioctl()).  The request's
 * address family is set to AF_INET beforehand, and the sin_addr of the
 * returned socket address is stored in '*ip'. */
4774 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4775 int cmd, const char *cmd_name)
4780 ifr.ifr_addr.sa_family = AF_INET;
4781 error = af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
4783 const struct sockaddr_in *sin = ALIGNED_CAST(struct sockaddr_in *,
4785 *ip = sin->sin_addr;
4790 /* Returns an AF_PACKET raw socket or a negative errno value.
 *
 * The socket is created inside an ovsthread_once section, so creation is
 * attempted a single time, and it is switched to nonblocking mode.  A
 * socket() failure is logged at ERR level. */
4792 af_packet_sock(void)
4794 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4797 if (ovsthread_once_start(&once)) {
4798 sock = socket(AF_PACKET, SOCK_RAW, 0);
4800 int error = set_nonblocking(sock);
4807 VLOG_ERR("failed to create packet socket: %s",
4808 ovs_strerror(errno));
4810 ovsthread_once_done(&once);