2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
23 #include <arpa/inet.h>
25 #include <linux/filter.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <sys/types.h>
37 #include <sys/ioctl.h>
38 #include <sys/socket.h>
39 #include <netpacket/packet.h>
41 #include <net/if_arp.h>
42 #include <net/if_packet.h>
43 #include <net/route.h>
44 #include <netinet/in.h>
51 #include "dpif-linux.h"
52 #include "dpif-netdev.h"
53 #include "dynamic-string.h"
54 #include "fatal-signal.h"
57 #include "netdev-provider.h"
58 #include "netdev-vport.h"
59 #include "netlink-notifier.h"
60 #include "netlink-socket.h"
63 #include "openflow/openflow.h"
64 #include "ovs-atomic.h"
65 #include "packet-dpif.h"
67 #include "poll-loop.h"
68 #include "rtnetlink-link.h"
70 #include "socket-util.h"
73 #include "unaligned.h"
76 VLOG_DEFINE_THIS_MODULE(netdev_linux);
78 COVERAGE_DEFINE(netdev_set_policing);
79 COVERAGE_DEFINE(netdev_arp_lookup);
80 COVERAGE_DEFINE(netdev_get_ifindex);
81 COVERAGE_DEFINE(netdev_get_hwaddr);
82 COVERAGE_DEFINE(netdev_set_hwaddr);
83 COVERAGE_DEFINE(netdev_get_ethtool);
84 COVERAGE_DEFINE(netdev_set_ethtool);
87 /* These were introduced in Linux 2.6.14, so they might be missing if we have
89 #ifndef ADVERTISED_Pause
90 #define ADVERTISED_Pause (1 << 13)
92 #ifndef ADVERTISED_Asym_Pause
93 #define ADVERTISED_Asym_Pause (1 << 14)
96 /* These were introduced in Linux 2.6.24, so they might be missing if we
97 * have old headers. */
98 #ifndef ETHTOOL_GFLAGS
99 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
101 #ifndef ETHTOOL_SFLAGS
102 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
105 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
108 #define TC_RTAB_SIZE 1024
111 /* Linux 2.6.21 introduced struct tpacket_auxdata.
112 * Linux 2.6.27 added the tp_vlan_tci member.
113 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
114 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
115 * TP_STATUS_VLAN_TPID_VALID.
117 * With all this churn it's easiest to unconditionally define a replacement
118 * structure that has everything we want.
120 #ifndef PACKET_AUXDATA
121 #define PACKET_AUXDATA 8
123 #ifndef TP_STATUS_VLAN_VALID
124 #define TP_STATUS_VLAN_VALID (1 << 4)
126 #ifndef TP_STATUS_VLAN_TPID_VALID
127 #define TP_STATUS_VLAN_TPID_VALID (1 << 6)
129 #undef tpacket_auxdata
130 #define tpacket_auxdata rpl_tpacket_auxdata
131 struct tpacket_auxdata {
137 uint16_t tp_vlan_tci;
138 uint16_t tp_vlan_tpid;
142 VALID_IFINDEX = 1 << 0,
143 VALID_ETHERADDR = 1 << 1,
147 VALID_POLICING = 1 << 5,
148 VALID_VPORT_STAT_ERROR = 1 << 6,
149 VALID_DRVINFO = 1 << 7,
150 VALID_FEATURES = 1 << 8,
153 /* Traffic control. */
155 /* An instance of a traffic control class. Always associated with a particular
158 * Each TC implementation subclasses this with whatever additional data it
161 const struct tc_ops *ops;
162 struct hmap queues; /* Contains "struct tc_queue"s.
163 * Read by generic TC layer.
164 * Written only by TC implementation. */
167 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
169 /* One traffic control queue.
171 * Each TC implementation subclasses this with whatever additional data it
174 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
175 unsigned int queue_id; /* OpenFlow queue ID. */
176 long long int created; /* Time queue was created, in msecs. */
179 /* A particular kind of traffic control. Each implementation generally maps to
180 * one particular Linux qdisc class.
182 * The functions below return 0 if successful or a positive errno value on
183 * failure, except where otherwise noted. All of them must be provided, except
184 * where otherwise noted. */
186 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
187 * This is null for tc_ops_default and tc_ops_other, for which there are no
188 * appropriate values. */
189 const char *linux_name;
191 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
192 const char *ovs_name;
194 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
195 * queues. The queues are numbered 0 through n_queues - 1. */
196 unsigned int n_queues;
198 /* Called to install this TC class on 'netdev'. The implementation should
199 * make the Netlink calls required to set up 'netdev' with the right qdisc
200 * and configure it according to 'details'. The implementation may assume
201 * that the current qdisc is the default; that is, there is no need for it
202 * to delete the current qdisc before installing itself.
204 * The contents of 'details' should be documented as valid for 'ovs_name'
205 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
206 * (which is built as ovs-vswitchd.conf.db(8)).
208 * This function must return 0 if and only if it sets 'netdev->tc' to an
209 * initialized 'struct tc'.
211 * (This function is null for tc_ops_other, which cannot be installed. For
212 * other TC classes it should always be nonnull.) */
213 int (*tc_install)(struct netdev *netdev, const struct smap *details);
215 /* Called when the netdev code determines (through a Netlink query) that
216 * this TC class's qdisc is installed on 'netdev', but we didn't install
217 * it ourselves and so don't know any of the details.
219 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
220 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
221 * implementation should parse the other attributes of 'nlmsg' as
222 * necessary to determine its configuration. If necessary it should also
223 * use Netlink queries to determine the configuration of queues on
226 * This function must return 0 if and only if it sets 'netdev->tc' to an
227 * initialized 'struct tc'. */
228 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
230 /* Destroys the data structures allocated by the implementation as part of
231 * 'tc'. (This includes destroying 'tc->queues' by calling
234 * The implementation should not need to perform any Netlink calls. If
235 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
236 * (But it may not be desirable.)
238 * This function may be null if 'tc' is trivial. */
239 void (*tc_destroy)(struct tc *tc);
241 /* Retrieves details of 'netdev->tc' configuration into 'details'.
243 * The implementation should not need to perform any Netlink calls, because
244 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
245 * cached the configuration.
247 * The contents of 'details' should be documented as valid for 'ovs_name'
248 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
249 * (which is built as ovs-vswitchd.conf.db(8)).
251 * This function may be null if 'tc' is not configurable.
253 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
255 /* Reconfigures 'netdev->tc' according to 'details', performing any
256 * required Netlink calls to complete the reconfiguration.
258 * The contents of 'details' should be documented as valid for 'ovs_name'
259 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
260 * (which is built as ovs-vswitchd.conf.db(8)).
262 * This function may be null if 'tc' is not configurable.
264 int (*qdisc_set)(struct netdev *, const struct smap *details);
266 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
267 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
269 * The contents of 'details' should be documented as valid for 'ovs_name'
270 * in the "other_config" column in the "Queue" table in
271 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
273 * The implementation should not need to perform any Netlink calls, because
274 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
275 * cached the queue configuration.
277 * This function may be null if 'tc' does not have queues ('n_queues' is
279 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
280 struct smap *details);
282 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
* 'details', performing any required Netlink calls to complete the
284 * reconfiguration. The caller ensures that 'queue_id' is less than
287 * The contents of 'details' should be documented as valid for 'ovs_name'
288 * in the "other_config" column in the "Queue" table in
289 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
291 * This function may be null if 'tc' does not have queues or its queues are
292 * not configurable. */
293 int (*class_set)(struct netdev *, unsigned int queue_id,
294 const struct smap *details);
296 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
297 * tc_queue's within 'netdev->tc->queues'.
299 * This function may be null if 'tc' does not have queues or its queues
300 * cannot be deleted. */
301 int (*class_delete)(struct netdev *, struct tc_queue *queue);
303 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
304 * 'struct tc_queue's within 'netdev->tc->queues'.
306 * On success, initializes '*stats'.
308 * This function may be null if 'tc' does not have queues or if it cannot
309 * report queue statistics. */
310 int (*class_get_stats)(const struct netdev *netdev,
311 const struct tc_queue *queue,
312 struct netdev_queue_stats *stats);
314 /* Extracts queue stats from 'nlmsg', which is a response to a
315 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
317 * This function may be null if 'tc' does not have queues or if it cannot
318 * report queue statistics. */
319 int (*class_dump_stats)(const struct netdev *netdev,
320 const struct ofpbuf *nlmsg,
321 netdev_dump_queue_stats_cb *cb, void *aux);
325 tc_init(struct tc *tc, const struct tc_ops *ops)
328 hmap_init(&tc->queues);
332 tc_destroy(struct tc *tc)
334 hmap_destroy(&tc->queues);
337 static const struct tc_ops tc_ops_htb;
338 static const struct tc_ops tc_ops_hfsc;
339 static const struct tc_ops tc_ops_default;
340 static const struct tc_ops tc_ops_other;
342 static const struct tc_ops *const tcs[] = {
343 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
344 &tc_ops_hfsc, /* Hierarchical fair service curve. */
345 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
346 &tc_ops_other, /* Some other qdisc. */
350 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
351 static unsigned int tc_get_major(unsigned int handle);
352 static unsigned int tc_get_minor(unsigned int handle);
354 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
355 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
356 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
358 static struct tcmsg *tc_make_request(const struct netdev *, int type,
359 unsigned int flags, struct ofpbuf *);
360 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
361 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
362 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
365 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
366 struct nlattr **options);
367 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
368 struct nlattr **options,
369 struct netdev_queue_stats *);
370 static int tc_query_class(const struct netdev *,
371 unsigned int handle, unsigned int parent,
372 struct ofpbuf **replyp);
373 static int tc_delete_class(const struct netdev *, unsigned int handle);
375 static int tc_del_qdisc(struct netdev *netdev);
376 static int tc_query_qdisc(const struct netdev *netdev);
378 static int tc_calc_cell_log(unsigned int mtu);
379 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
380 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
381 const struct tc_ratespec *rate);
382 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
384 struct netdev_linux {
387 /* Protects all members below. */
388 struct ovs_mutex mutex;
390 unsigned int cache_valid;
392 bool miimon; /* Link status of last poll. */
393 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
394 struct timer miimon_timer;
396 /* The following are figured out "on demand" only. They are only valid
397 * when the corresponding VALID_* bit in 'cache_valid' is set. */
399 uint8_t etheraddr[ETH_ADDR_LEN];
400 struct in_addr address, netmask;
403 unsigned int ifi_flags;
404 long long int carrier_resets;
405 uint32_t kbits_rate; /* Policing data. */
406 uint32_t kbits_burst;
407 int vport_stats_error; /* Cached error code from vport_get_stats().
408 0 or an errno value. */
409 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
410 int ether_addr_error; /* Cached error code from set/get etheraddr. */
411 int netdev_policing_error; /* Cached error code from set policing. */
412 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
413 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
415 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
416 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
417 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
419 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
422 /* For devices of class netdev_tap_class only. */
426 struct netdev_rxq_linux {
427 struct netdev_rxq up;
432 /* This is set pretty low because we probably won't learn anything from the
433 * additional log messages. */
434 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
436 /* Polling miimon status for all ports causes performance degradation when
437 * handling a large number of ports. If there are no devices using miimon, then
438 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
440 * Readers do not depend on this variable synchronizing with the related
441 * changes in the device miimon status, so we can use atomic_count. */
442 static atomic_count miimon_cnt = ATOMIC_COUNT_INIT(0);
444 static void netdev_linux_run(void);
446 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
447 int cmd, const char *cmd_name);
448 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
449 int cmd, const char *cmd_name);
450 static int get_flags(const struct netdev *, unsigned int *flags);
451 static int set_flags(const char *, unsigned int flags);
452 static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
453 enum netdev_flags on, enum netdev_flags *old_flagsp)
454 OVS_REQUIRES(netdev->mutex);
455 static int do_get_ifindex(const char *netdev_name);
456 static int get_ifindex(const struct netdev *, int *ifindexp);
457 static int do_set_addr(struct netdev *netdev,
458 int ioctl_nr, const char *ioctl_name,
459 struct in_addr addr);
460 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
461 static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
462 static int get_stats_via_netlink(const struct netdev *, struct netdev_stats *);
463 static int af_packet_sock(void);
464 static bool netdev_linux_miimon_enabled(void);
465 static void netdev_linux_miimon_run(void);
466 static void netdev_linux_miimon_wait(void);
467 static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup);
470 is_netdev_linux_class(const struct netdev_class *netdev_class)
472 return netdev_class->run == netdev_linux_run;
476 is_tap_netdev(const struct netdev *netdev)
478 return netdev_get_class(netdev) == &netdev_tap_class;
481 static struct netdev_linux *
482 netdev_linux_cast(const struct netdev *netdev)
484 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
486 return CONTAINER_OF(netdev, struct netdev_linux, up);
489 static struct netdev_rxq_linux *
490 netdev_rxq_linux_cast(const struct netdev_rxq *rx)
492 ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
493 return CONTAINER_OF(rx, struct netdev_rxq_linux, up);
496 static void netdev_linux_update(struct netdev_linux *netdev,
497 const struct rtnetlink_link_change *)
498 OVS_REQUIRES(netdev->mutex);
499 static void netdev_linux_changed(struct netdev_linux *netdev,
500 unsigned int ifi_flags, unsigned int mask)
501 OVS_REQUIRES(netdev->mutex);
503 /* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK changes, or NULL
504 * if no such socket could be created. */
static struct nl_sock *
netdev_linux_notify_sock(void)
    /* Lazily-created, process-wide singleton: created on the first call,
     * shared (read-only pointer) by all later callers and threads. */
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
    static struct nl_sock *sock;

    if (ovsthread_once_start(&once)) {
        error = nl_sock_create(NETLINK_ROUTE, &sock);
        /* Subscribe to kernel link-change notifications. */
        error = nl_sock_join_mcgroup(sock, RTNLGRP_LINK);
        /* NOTE(review): on mcgroup-join failure the socket is destroyed so
         * later callers get NULL -- confirm 'sock' is also reset to NULL in
         * the error path elided from this view. */
        nl_sock_destroy(sock);
        ovsthread_once_done(&once);
529 netdev_linux_miimon_enabled(void)
531 return atomic_count_get(&miimon_cnt) > 0;
/* netdev_class 'run' callback shared by all Linux-backed classes: runs miimon
 * polling when enabled and drains the shared rtnetlink notification socket,
 * applying any link changes to the corresponding netdev objects. */
netdev_linux_run(void)
    struct nl_sock *sock;

    if (netdev_linux_miimon_enabled()) {
        netdev_linux_miimon_run();

    sock = netdev_linux_notify_sock();

    static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
    uint64_t buf_stub[4096 / 8];

    /* Use a stack stub so the common receive path avoids heap allocation. */
    ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
    error = nl_sock_recv(sock, &buf, false);
        struct rtnetlink_link_change change;

        if (rtnetlink_link_parse(&buf, &change)) {
            /* The change names an interface; update it only if we actually
             * have an open Linux netdev for that name. */
            struct netdev *netdev_ = netdev_from_name(change.ifname);
            if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
                struct netdev_linux *netdev = netdev_linux_cast(netdev_);

                ovs_mutex_lock(&netdev->mutex);
                netdev_linux_update(netdev, &change);
                ovs_mutex_unlock(&netdev->mutex);
            /* netdev_from_name() took a reference; drop it. */
            netdev_close(netdev_);
    } else if (error == ENOBUFS) {
        /* The kernel dropped notifications (socket buffer overflow); we do
         * not know which devices changed, so refresh the flags of every
         * open system device and invalidate all cached state. */
        struct shash device_shash;
        struct shash_node *node;

        shash_init(&device_shash);
        netdev_get_devices(&netdev_linux_class, &device_shash);
        SHASH_FOR_EACH (node, &device_shash) {
            struct netdev *netdev_ = node->data;
            struct netdev_linux *netdev = netdev_linux_cast(netdev_);

            ovs_mutex_lock(&netdev->mutex);
            get_flags(netdev_, &flags);
            /* mask == 0 invalidates every VALID_* cache bit. */
            netdev_linux_changed(netdev, flags, 0);
            ovs_mutex_unlock(&netdev->mutex);

            netdev_close(netdev_);
        shash_destroy(&device_shash);
    } else if (error != EAGAIN) {
        /* EAGAIN just means the socket is drained; anything else is worth
         * a (rate-limited) warning. */
        VLOG_WARN_RL(&rl, "error reading or parsing netlink (%s)",
                     ovs_strerror(error));
/* netdev_class 'wait' callback: arranges for the next poll_block() to wake
 * when netdev_linux_run() would have work to do (miimon timer expiry or
 * readable rtnetlink notifications). */
static void
netdev_linux_wait(void)
{
    struct nl_sock *sock;

    if (netdev_linux_miimon_enabled()) {
        netdev_linux_miimon_wait();
    }
    sock = netdev_linux_notify_sock();
    if (sock) {
        nl_sock_wait(sock, POLLIN);
    }
}
614 netdev_linux_changed(struct netdev_linux *dev,
615 unsigned int ifi_flags, unsigned int mask)
616 OVS_REQUIRES(dev->mutex)
618 netdev_change_seq_changed(&dev->up);
620 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
621 dev->carrier_resets++;
623 dev->ifi_flags = ifi_flags;
625 dev->cache_valid &= mask;
/* Applies a parsed rtnetlink link-change message to 'dev', refreshing the
 * cached MTU, Ethernet address, and ifindex on RTM_NEWLINK.  Caller must hold
 * dev->mutex. */
netdev_linux_update(struct netdev_linux *dev,
                    const struct rtnetlink_link_change *change)
    OVS_REQUIRES(dev->mutex)
    if (change->nlmsg_type == RTM_NEWLINK) {
        /* Keep the driver-info cache; everything else is refreshed below. */
        netdev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);

        /* Update netdev from rtnl-change msg. */
        dev->mtu = change->mtu;
        dev->cache_valid |= VALID_MTU;
        dev->netdev_mtu_error = 0;

        /* An all-zeros address in the message means "not reported", so only
         * overwrite the cached address when one was actually supplied. */
        if (!eth_addr_is_zero(change->addr)) {
            memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
            dev->cache_valid |= VALID_ETHERADDR;
            dev->ether_addr_error = 0;

        dev->ifindex = change->ifi_index;
        dev->cache_valid |= VALID_IFINDEX;
        dev->get_ifindex_error = 0;
        /* Not RTM_NEWLINK (presumably RTM_DELLINK): invalidate every cache
         * bit. */
        netdev_linux_changed(dev, change->ifi_flags, 0);
659 static struct netdev *
660 netdev_linux_alloc(void)
662 struct netdev_linux *netdev = xzalloc(sizeof *netdev);
667 netdev_linux_common_construct(struct netdev_linux *netdev)
669 ovs_mutex_init(&netdev->mutex);
672 /* Creates system and internal devices. */
/* netdev_class 'construct' callback for system and internal devices: performs
 * common construction, then probes the kernel interface flags to verify the
 * device exists. */
netdev_linux_construct(struct netdev *netdev_)
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    netdev_linux_common_construct(netdev);

    /* Probing the flags doubles as an existence check for the kernel
     * device. */
    error = get_flags(&netdev->up, &netdev->ifi_flags);
    if (error == ENODEV) {
        if (netdev->up.netdev_class != &netdev_internal_class) {
            /* The device does not exist, so don't allow it to be opened. */
            /* "Internal" netdevs have to be created as netdev objects before
             * they exist in the kernel, because creating them in the kernel
             * happens by passing a netdev object to dpif_port_add().
             * Therefore, ignore the error. */
697 /* For most types of netdevs we open the device for each call of
698 * netdev_open(). However, this is not the case with tap devices,
699 * since it is only possible to open the device once. In this
700 * situation we share a single file descriptor, and consequently
701 * buffers, across all readers. Therefore once data is read it will
702 * be unavailable to other reads for tap devices. */
netdev_linux_construct_tap(struct netdev *netdev_)
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    static const char tap_dev[] = "/dev/net/tun";
    const char *name = netdev_->name;

    netdev_linux_common_construct(netdev);

    /* Open tap device. */
    netdev->tap_fd = open(tap_dev, O_RDWR);
    if (netdev->tap_fd < 0) {
        VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));

    /* Create tap device.  IFF_NO_PI suppresses the kernel's packet-info
     * header so reads/writes carry raw Ethernet frames. */
    ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
    ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
    if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
        VLOG_WARN("%s: creating tap device failed: %s", name,
                  ovs_strerror(errno));

    /* Make non-blocking. */
    error = set_nonblocking(netdev->tap_fd);

    /* Error path: release the tap fd acquired above. */
    close(netdev->tap_fd);
/* netdev_class 'destruct' callback: tears down traffic-control state, closes
 * the tap fd for tap devices, drops this device's miimon registration, and
 * destroys the mutex. */
netdev_linux_destruct(struct netdev *netdev_)
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    /* Let the active TC implementation free its private state. */
    if (netdev->tc && netdev->tc->ops->tc_destroy) {
        netdev->tc->ops->tc_destroy(netdev->tc);

    if (netdev_get_class(netdev_) == &netdev_tap_class
        && netdev->tap_fd >= 0)
        close(netdev->tap_fd);

    /* This device was counted in miimon_cnt iff its interval was > 0. */
    if (netdev->miimon_interval > 0) {
        atomic_count_dec(&miimon_cnt);

    ovs_mutex_destroy(&netdev->mutex);
/* netdev_class 'dealloc' callback: frees the memory obtained by
 * netdev_linux_alloc(). */
static void
netdev_linux_dealloc(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    free(netdev);
}
774 static struct netdev_rxq *
775 netdev_linux_rxq_alloc(void)
777 struct netdev_rxq_linux *rx = xzalloc(sizeof *rx);
/* netdev_class 'rxq_construct' callback.  For a tap device the rx shares the
 * device's single tap fd; otherwise a dedicated AF_PACKET raw socket is
 * created, marked for auxdata (VLAN info), made non-blocking, bound to the
 * device's ifindex, and fitted with a BPF filter that accepts only inbound
 * packets (so transmissions from our own AF_PACKET send socket are not looped
 * back into the rx path). */
netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
    struct netdev *netdev_ = rx->up.netdev;
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    ovs_mutex_lock(&netdev->mutex);
    rx->is_tap = is_tap_netdev(netdev_);
        /* Tap path: reuse the fd opened at device construction. */
        rx->fd = netdev->tap_fd;
        struct sockaddr_ll sll;
        /* Result of tcpdump -dd inbound */
        static const struct sock_filter filt[] = {
            { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
            { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
            { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
            { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
        static const struct sock_fprog fprog = {
            ARRAY_SIZE(filt), (struct sock_filter *) filt

        /* Create file descriptor. */
        rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
            VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));

        /* Request per-packet tpacket_auxdata control messages, which carry
         * the VLAN TCI stripped by the kernel. */
        if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) {
            VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));

        /* Set non-blocking mode. */
        error = set_nonblocking(rx->fd);

        /* Get ethernet device index. */
        error = get_ifindex(&netdev->up, &ifindex);

        /* Bind to specific ethernet device. */
        memset(&sll, 0, sizeof sll);
        sll.sll_family = AF_PACKET;
        sll.sll_ifindex = ifindex;
        sll.sll_protocol = htons(ETH_P_ALL);
        if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
            VLOG_ERR("%s: failed to bind raw socket (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));

        /* Filter for only inbound packets. */
        error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
            VLOG_ERR("%s: failed to attach filter (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));

    ovs_mutex_unlock(&netdev->mutex);

    /* Error path: unlock before returning the errno value. */
    ovs_mutex_unlock(&netdev->mutex);
870 netdev_linux_rxq_destruct(struct netdev_rxq *rxq_)
872 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
/* netdev_class 'rxq_dealloc' callback: frees the memory obtained by
 * netdev_linux_rxq_alloc(). */
static void
netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_)
{
    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);

    free(rx);
}
888 auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux)
890 if (aux->tp_status & TP_STATUS_VLAN_TPID_VALID) {
891 return htons(aux->tp_vlan_tpid);
893 return htons(ETH_TYPE_VLAN);
898 auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
900 return aux->tp_vlan_tci || aux->tp_status & TP_STATUS_VLAN_VALID;
/* Receives one packet from AF_PACKET socket 'fd' into 'buffer', re-inserting
 * any VLAN tag the kernel stripped (reported via PACKET_AUXDATA cmsg).
 * Returns 0 on success, EMSGSIZE if the packet was truncated, or another
 * positive errno value on failure. */
netdev_linux_rxq_recv_sock(int fd, struct ofpbuf *buffer)
    struct cmsghdr *cmsg;
    /* Control-message buffer sized for one tpacket_auxdata record. */
    char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))];

    /* Reserve headroom for a single VLAN tag */
    ofpbuf_reserve(buffer, VLAN_HEADER_LEN);
    size = ofpbuf_tailroom(buffer);

    iov.iov_base = ofpbuf_data(buffer);
    msgh.msg_name = NULL;
    msgh.msg_namelen = 0;
    msgh.msg_control = &cmsg_buffer;
    msgh.msg_controllen = sizeof cmsg_buffer;

    /* Retry on EINTR; MSG_TRUNC makes recvmsg return the full packet length
     * even when it did not fit, so truncation is detectable below. */
        retval = recvmsg(fd, &msgh, MSG_TRUNC);
    } while (retval < 0 && errno == EINTR);

    } else if (retval > size) {
        /* Packet was larger than the available tailroom. */

    ofpbuf_set_size(buffer, ofpbuf_size(buffer) + retval);

    /* Walk the control messages looking for the kernel's auxdata record. */
    for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg; cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
        const struct tpacket_auxdata *aux;

        if (cmsg->cmsg_level != SOL_PACKET
            || cmsg->cmsg_type != PACKET_AUXDATA
            || cmsg->cmsg_len < CMSG_LEN(sizeof(struct tpacket_auxdata))) {

        aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg));
        if (auxdata_has_vlan_tci(aux)) {
            /* A frame too short to contain an Ethernet header cannot take a
             * VLAN tag. */
            if (retval < ETH_HEADER_LEN) {

            /* Re-insert the stripped tag using the headroom reserved
             * above. */
            eth_push_vlan(buffer, auxdata_to_vlan_tpid(aux),
                          htons(aux->tp_vlan_tci));
/* Receives one packet from tap fd 'fd' into the tailroom of 'buffer'.
 * Returns 0 on success, EMSGSIZE if the packet exceeded the tailroom, or
 * another positive errno value on failure. */
netdev_linux_rxq_recv_tap(int fd, struct ofpbuf *buffer)
    size_t size = ofpbuf_tailroom(buffer);

    /* Retry on EINTR. */
    retval = read(fd, ofpbuf_data(buffer), size);
    } while (retval < 0 && errno == EINTR);

    } else if (retval > size) {
        /* retval is non-negative here, so the ssize_t/size_t comparison is
         * safe. */

    ofpbuf_set_size(buffer, ofpbuf_size(buffer) + retval);
987 netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dpif_packet **packets,
990 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
991 struct netdev *netdev = rx->up.netdev;
992 struct dpif_packet *packet;
993 struct ofpbuf *buffer;
997 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
998 mtu = ETH_PAYLOAD_MAX;
1001 packet = dpif_packet_new_with_headroom(VLAN_ETH_HEADER_LEN + mtu,
1002 DP_NETDEV_HEADROOM);
1003 buffer = &packet->ofpbuf;
1005 retval = (rx->is_tap
1006 ? netdev_linux_rxq_recv_tap(rx->fd, buffer)
1007 : netdev_linux_rxq_recv_sock(rx->fd, buffer));
1010 if (retval != EAGAIN && retval != EMSGSIZE) {
1011 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
1012 ovs_strerror(errno), netdev_rxq_get_name(rxq_));
1014 dpif_packet_delete(packet);
1016 dp_packet_pad(buffer);
1017 packets[0] = packet;
1025 netdev_linux_rxq_wait(struct netdev_rxq *rxq_)
1027 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1028 poll_fd_wait(rx->fd, POLLIN);
/* netdev_class 'rxq_drain' callback: discards pending packets on the rx fd.
 * For a tap device the queue length is read via SIOCGIFTXQLEN so that many
 * reads can be issued; for a packet socket the receive buffer is drained
 * directly. */
netdev_linux_rxq_drain(struct netdev_rxq *rxq_)
    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);

    int error = af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_), &ifr,
                                    SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
    drain_fd(rx->fd, ifr.ifr_qlen);

    return drain_rcvbuf(rx->fd);
1049 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
1050 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
1051 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
1052 * the packet is too big or too small to transmit on the device.
1054 * The caller retains ownership of 'buffer' in all cases.
1056 * The kernel maintains a packet transmission queue, so the caller is not
1057 * expected to do additional queuing of packets. */
/* netdev_class 'send' callback: transmits the 'cnt' packets in 'pkts'.
 * Non-tap devices are sent to through the shared AF_PACKET socket (addressed
 * by ifindex); tap devices are written through the device's tap fd so that
 * the packets are not looped back into our own rx path.  All packets are
 * freed before returning. */
netdev_linux_send(struct netdev *netdev_, struct dpif_packet **pkts, int cnt,
    /* 'i' is incremented only if there's no error */
    for (i = 0; i < cnt;) {
        const void *data = ofpbuf_data(&pkts[i]->ofpbuf);
        size_t size = ofpbuf_size(&pkts[i]->ofpbuf);

        if (!is_tap_netdev(netdev_)) {
            /* Use our AF_PACKET socket to send to this device. */
            struct sockaddr_ll sll;

            sock = af_packet_sock();

            ifindex = netdev_get_ifindex(netdev_);

            /* We don't bother setting most fields in sockaddr_ll because the
             * kernel ignores them for SOCK_RAW. */
            memset(&sll, 0, sizeof sll);
            sll.sll_family = AF_PACKET;
            sll.sll_ifindex = ifindex;

            iov.iov_base = CONST_CAST(void *, data);

            msg.msg_name = &sll;
            msg.msg_namelen = sizeof sll;
            msg.msg_control = NULL;
            msg.msg_controllen = 0;

            retval = sendmsg(sock, &msg, 0);
            /* Use the tap fd to send to this device. This is essential for
             * tap devices, because packets sent to a tap device with an
             * AF_PACKET socket will loop back to be *received* again on the
             * tap device. This doesn't occur on other interface types
             * because we attach a socket filter to the rx socket. */
            struct netdev_linux *netdev = netdev_linux_cast(netdev_);

            retval = write(netdev->tap_fd, data, size);

            /* The Linux AF_PACKET implementation never blocks waiting for room
             * for packets, instead returning ENOBUFS. Translate this into
             * EAGAIN for the caller. */
            error = errno == ENOBUFS ? EAGAIN : errno;
            if (error == EINTR) {
                /* continue without incrementing 'i', i.e. retry this packet */
        } else if (retval != size) {
            /* NOTE(review): 'retval' is signed and printed with %"PRIuSIZE"
             * (size_t) here -- it is non-negative on this path, but the
             * specifier/type mismatch is worth confirming/cleaning up. */
            VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE" bytes"
                         " of %"PRIuSIZE") on %s", retval, size,
                         netdev_get_name(netdev_));

        /* Process the next packet in the batch */

    /* The batch is always consumed: free every packet regardless of error. */
    for (i = 0; i < cnt; i++) {
        dpif_packet_delete(pkts[i]);

    if (error && error != EAGAIN) {
        VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
                     netdev_get_name(netdev_), ovs_strerror(error));
1155 /* Registers with the poll loop to wake up from the next call to poll_block()
1156 * when the packet transmission queue has sufficient room to transmit a packet
1157 * with netdev_send().
1159 * The kernel maintains a packet transmission queue, so the client is not
1160 * expected to do additional queuing of packets. Thus, this function is
1161 * unlikely to ever be used. It is included for completeness. */
/* netdev_class 'send_wait' callback.  A tap device can always accept another
 * packet, so wake the poll loop immediately; for other devices the kernel
 * queues transmissions, so no wakeup is registered. */
static void
netdev_linux_send_wait(struct netdev *netdev)
{
    if (is_tap_netdev(netdev)) {
        /* TAP device always accepts packets.*/
        poll_immediate_wake();
    }
}
1171 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1172 * otherwise a positive errno value. */
/* Attempts to set 'netdev_''s MAC address to 'mac'.  Returns 0 if successful
 * (or if the cached address already matches), otherwise a positive errno
 * value.  Updates the per-device address cache on success. */
netdev_linux_set_etheraddr(struct netdev *netdev_,
                           const uint8_t mac[ETH_ADDR_LEN])
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    enum netdev_flags old_flags = 0;

    ovs_mutex_lock(&netdev->mutex);

    if (netdev->cache_valid & VALID_ETHERADDR) {
        error = netdev->ether_addr_error;
        /* Short-circuit: a cached error or an already-matching address means
         * there is nothing to do. */
        if (error || eth_addr_equals(netdev->etheraddr, mac)) {
        netdev->cache_valid &= ~VALID_ETHERADDR;

    /* Tap devices must be brought down before setting the address. */
    if (is_tap_netdev(netdev_)) {
        update_flags(netdev, NETDEV_UP, 0, &old_flags);
    error = set_etheraddr(netdev_get_name(netdev_), mac);
    /* ENODEV is also cached so repeated attempts on a vanished device do not
     * keep issuing ioctls. */
    if (!error || error == ENODEV) {
        netdev->ether_addr_error = error;
        netdev->cache_valid |= VALID_ETHERADDR;
        memcpy(netdev->etheraddr, mac, ETH_ADDR_LEN);

    /* Restore the tap device's UP flag if it was up before. */
    if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
        update_flags(netdev, 0, NETDEV_UP, &old_flags);

    ovs_mutex_unlock(&netdev->mutex);
1213 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1215 netdev_linux_get_etheraddr(const struct netdev *netdev_,
1216 uint8_t mac[ETH_ADDR_LEN])
1218 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1221 ovs_mutex_lock(&netdev->mutex);
/* Populate the cache on first use; both the address and the errno from the
 * lookup are cached so repeated failures do not re-issue the ioctl. */
1222 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1223 netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
1225 netdev->cache_valid |= VALID_ETHERADDR;
1228 error = netdev->ether_addr_error;
1230 memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN);
1232 ovs_mutex_unlock(&netdev->mutex);
/* Helper that returns 'netdev''s MTU through '*mtup', filling the cache from
 * a SIOCGIFMTU ioctl on first use.  Caller must hold netdev->mutex. */
1238 netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
1242 if (!(netdev->cache_valid & VALID_MTU)) {
1245 netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
1246 netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
1247 netdev->mtu = ifr.ifr_mtu;
1248 netdev->cache_valid |= VALID_MTU;
1251 error = netdev->netdev_mtu_error;
1253 *mtup = netdev->mtu;
1259 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1260 * in bytes, not including the hardware header; thus, this is typically 1500
1261 * bytes for Ethernet devices. */
1263 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1265 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1268 ovs_mutex_lock(&netdev->mutex);
1269 error = netdev_linux_get_mtu__(netdev, mtup);
1270 ovs_mutex_unlock(&netdev->mutex);
1275 /* Sets the maximum size of transmitted (MTU) for given device using linux
1276 * networking ioctl interface.
1279 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1281 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1285 ovs_mutex_lock(&netdev->mutex);
/* If the current MTU (or an earlier failure) is cached and already equals the
 * requested value, skip the ioctl; otherwise drop the cache entry first. */
1286 if (netdev->cache_valid & VALID_MTU) {
1287 error = netdev->netdev_mtu_error;
1288 if (error || netdev->mtu == mtu) {
1291 netdev->cache_valid &= ~VALID_MTU;
1294 error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
1295 SIOCSIFMTU, "SIOCSIFMTU");
/* ENODEV is cached like success so a missing device is not retried. */
1296 if (!error || error == ENODEV) {
1297 netdev->netdev_mtu_error = error;
1298 netdev->mtu = ifr.ifr_mtu;
1299 netdev->cache_valid |= VALID_MTU;
1302 ovs_mutex_unlock(&netdev->mutex);
1306 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1307 * On failure, returns a negative errno value. */
1309 netdev_linux_get_ifindex(const struct netdev *netdev_)
1311 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1314 ovs_mutex_lock(&netdev->mutex);
1315 error = get_ifindex(netdev_, &ifindex);
1316 ovs_mutex_unlock(&netdev->mutex);
1318 return error ? -error : ifindex;
1322 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1324 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1326 ovs_mutex_lock(&netdev->mutex);
1327 if (netdev->miimon_interval > 0) {
1328 *carrier = netdev->miimon;
1330 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1332 ovs_mutex_unlock(&netdev->mutex);
1337 static long long int
1338 netdev_linux_get_carrier_resets(const struct netdev *netdev_)
1340 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1341 long long int carrier_resets;
1343 ovs_mutex_lock(&netdev->mutex);
1344 carrier_resets = netdev->carrier_resets;
1345 ovs_mutex_unlock(&netdev->mutex);
1347 return carrier_resets;
/* Issues MII ioctl 'cmd' (named 'cmd_name' for logging) against interface
 * 'name', exchanging '*data' with the kernel.  Returns 0 or a positive
 * errno value. */
1351 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1352 struct mii_ioctl_data *data)
1357 memset(&ifr, 0, sizeof ifr);
/* NOTE(review): the mii_ioctl_data struct is copied into the bytes of the
 * ifr_data member itself (it fits within the ifreq union), not placed behind
 * the pointer -- presumably this matches the kernel MII ioctl ABI; confirm. */
1358 memcpy(&ifr.ifr_data, data, sizeof *data);
1359 error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
1360 memcpy(data, &ifr.ifr_data, sizeof *data);
/* Determines link status for interface 'name' via MII registers, falling back
 * to ETHTOOL_GLINK when MII is unavailable.  Sets '*miimon' on success. */
1366 netdev_linux_get_miimon(const char *name, bool *miimon)
1368 struct mii_ioctl_data data;
1373 memset(&data, 0, sizeof data);
/* First locate the PHY (SIOCGMIIPHY), then read the basic-mode status
 * register and test its link bit. */
1374 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1376 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1377 data.reg_num = MII_BMSR;
1378 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1382 *miimon = !!(data.val_out & BMSR_LSTATUS);
1384 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1387 struct ethtool_cmd ecmd;
1389 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1392 COVERAGE_INC(netdev_get_ethtool);
1393 memset(&ecmd, 0, sizeof ecmd);
1394 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
/* ETHTOOL_GLINK replies with an ethtool_value; reinterpret the front of
 * 'ecmd' accordingly. */
1397 struct ethtool_value eval;
1399 memcpy(&eval, &ecmd, sizeof eval);
1400 *miimon = !!eval.data;
1402 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
/* Sets the MII monitoring interval for 'netdev_' to 'interval' ms (clamped to
 * a minimum of 100 ms; <= 0 disables polling), and keeps the global count of
 * miimon-enabled devices in sync. */
1410 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1411 long long int interval)
1413 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1415 ovs_mutex_lock(&netdev->mutex);
1416 interval = interval > 0 ? MAX(interval, 100) : 0;
1417 if (netdev->miimon_interval != interval) {
/* Track transitions between enabled/disabled in the global counter. */
1418 if (interval && !netdev->miimon_interval) {
1419 atomic_count_inc(&miimon_cnt);
1420 } else if (!interval && netdev->miimon_interval) {
1421 atomic_count_dec(&miimon_cnt);
/* Expire the timer so the new interval takes effect on the next run. */
1424 netdev->miimon_interval = interval;
1425 timer_set_expired(&netdev->miimon_timer);
1427 ovs_mutex_unlock(&netdev->mutex);
/* Polls MII link status for every netdev-linux device whose miimon timer has
 * expired, recording changes and re-arming each timer. */
1433 netdev_linux_miimon_run(void)
1435 struct shash device_shash;
1436 struct shash_node *node;
1438 shash_init(&device_shash);
1439 netdev_get_devices(&netdev_linux_class, &device_shash);
1440 SHASH_FOR_EACH (node, &device_shash) {
1441 struct netdev *netdev = node->data;
1442 struct netdev_linux *dev = netdev_linux_cast(netdev);
1445 ovs_mutex_lock(&dev->mutex);
1446 if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
1447 netdev_linux_get_miimon(dev->up.name, &miimon);
/* Only notify when the link state actually changed. */
1448 if (miimon != dev->miimon) {
1449 dev->miimon = miimon;
1450 netdev_linux_changed(dev, dev->ifi_flags, 0);
1453 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1455 ovs_mutex_unlock(&dev->mutex);
/* netdev_get_devices() took a reference on each device; release it. */
1456 netdev_close(netdev);
1459 shash_destroy(&device_shash);
/* Arranges for poll_block() to wake up when any miimon-enabled device's
 * polling timer expires. */
1463 netdev_linux_miimon_wait(void)
1465 struct shash device_shash;
1466 struct shash_node *node;
1468 shash_init(&device_shash);
1469 netdev_get_devices(&netdev_linux_class, &device_shash);
1470 SHASH_FOR_EACH (node, &device_shash) {
1471 struct netdev *netdev = node->data;
1472 struct netdev_linux *dev = netdev_linux_cast(netdev);
1474 ovs_mutex_lock(&dev->mutex);
1475 if (dev->miimon_interval > 0) {
1476 timer_wait(&dev->miimon_timer);
1478 ovs_mutex_unlock(&dev->mutex);
/* Release the reference taken by netdev_get_devices(). */
1479 netdev_close(netdev);
1481 shash_destroy(&device_shash);
/* Exchanges the values stored at '*a' and '*b'. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    uint64_t tmp = *a;
    *a = *b;
    *b = tmp;
}
1492 /* Copies 'src' into 'dst', performing format conversion in the process.
1494 * 'src' is allowed to be misaligned. */
1496 netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1497 const struct ovs_vport_stats *src)
/* ovs_vport_stats guarantees only 32-bit alignment, hence the
 * get_32aligned_u64() accessors. */
1499 dst->rx_packets = get_32aligned_u64(&src->rx_packets);
1500 dst->tx_packets = get_32aligned_u64(&src->tx_packets);
1501 dst->rx_bytes = get_32aligned_u64(&src->rx_bytes);
1502 dst->tx_bytes = get_32aligned_u64(&src->tx_bytes);
1503 dst->rx_errors = get_32aligned_u64(&src->rx_errors);
1504 dst->tx_errors = get_32aligned_u64(&src->tx_errors);
1505 dst->rx_dropped = get_32aligned_u64(&src->rx_dropped);
1506 dst->tx_dropped = get_32aligned_u64(&src->tx_dropped);
/* The vport layer does not track these finer-grained counters; zero them so
 * callers see a fully-initialized struct. */
1508 dst->collisions = 0;
1509 dst->rx_length_errors = 0;
1510 dst->rx_over_errors = 0;
1511 dst->rx_crc_errors = 0;
1512 dst->rx_frame_errors = 0;
1513 dst->rx_fifo_errors = 0;
1514 dst->rx_missed_errors = 0;
1515 dst->tx_aborted_errors = 0;
1516 dst->tx_carrier_errors = 0;
1517 dst->tx_fifo_errors = 0;
1518 dst->tx_heartbeat_errors = 0;
1519 dst->tx_window_errors = 0;
/* Fetches vport-layer stats for 'netdev' into '*stats' via a datapath vport
 * query.  Returns 0 or a positive errno value. */
1523 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1525 struct dpif_linux_vport reply;
1529 error = dpif_linux_vport_get(netdev_get_name(netdev), &reply, &buf);
1532 } else if (!reply.stats) {
1537 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
/* Wrapper that remembers whether the vport query failed so callers can fall
 * back to kernel netdev stats; only retries after an error if the cached
 * error flag has not been validated yet. */
1545 get_stats_via_vport(const struct netdev *netdev_,
1546 struct netdev_stats *stats)
1548 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1550 if (!netdev->vport_stats_error ||
1551 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1554 error = get_stats_via_vport__(netdev_, stats);
/* ENOENT just means the device is not attached as a vport; stay quiet. */
1555 if (error && error != ENOENT) {
1556 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1558 netdev_get_name(netdev_), ovs_strerror(error));
1560 netdev->vport_stats_error = error;
1561 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
1565 /* Retrieves current device stats for 'netdev-linux'. */
1567 netdev_linux_get_stats(const struct netdev *netdev_,
1568 struct netdev_stats *stats)
1570 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1571 struct netdev_stats dev_stats;
1574 ovs_mutex_lock(&netdev->mutex);
/* Gather both vport-layer and kernel (netlink) stats, then merge: vport
 * counters are preferred where available, but wire-level packet/byte counts
 * come from the kernel netdev. */
1575 get_stats_via_vport(netdev_, stats);
1576 error = get_stats_via_netlink(netdev_, &dev_stats);
1578 if (!netdev->vport_stats_error) {
1581 } else if (netdev->vport_stats_error) {
1582 /* stats not available from OVS then use netdev stats. */
1585 /* Use kernel netdev's packet and byte counts since vport's counters
1586 * do not reflect packet counts on the wire when GSO, TSO or GRO are
1588 stats->rx_packets = dev_stats.rx_packets;
1589 stats->rx_bytes = dev_stats.rx_bytes;
1590 stats->tx_packets = dev_stats.tx_packets;
1591 stats->tx_bytes = dev_stats.tx_bytes;
/* Error/drop counters are accumulated from both sources. */
1593 stats->rx_errors += dev_stats.rx_errors;
1594 stats->tx_errors += dev_stats.tx_errors;
1595 stats->rx_dropped += dev_stats.rx_dropped;
1596 stats->tx_dropped += dev_stats.tx_dropped;
1597 stats->multicast += dev_stats.multicast;
1598 stats->collisions += dev_stats.collisions;
1599 stats->rx_length_errors += dev_stats.rx_length_errors;
1600 stats->rx_over_errors += dev_stats.rx_over_errors;
1601 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1602 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1603 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1604 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1605 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1606 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1607 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1608 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1609 stats->tx_window_errors += dev_stats.tx_window_errors;
1611 ovs_mutex_unlock(&netdev->mutex);
1616 /* Retrieves current device stats for 'netdev-tap' netdev or
1617 * netdev-internal. */
1619 netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
1621 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1622 struct netdev_stats dev_stats;
1625 ovs_mutex_lock(&netdev->mutex);
1626 get_stats_via_vport(netdev_, stats);
1627 error = get_stats_via_netlink(netdev_, &dev_stats);
1629 if (!netdev->vport_stats_error) {
1632 } else if (netdev->vport_stats_error) {
1633 /* Transmit and receive stats will appear to be swapped relative to the
1634 * other ports since we are the one sending the data, not a remote
1635 * computer. For consistency, we swap them back here. This does not
1636 * apply if we are getting stats from the vport layer because it always
1637 * tracks stats from the perspective of the switch. */
1640 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1641 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1642 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1643 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
/* The tap device has no meaningful values for these fine-grained error
 * counters, so they are cleared rather than carried over. */
1644 stats->rx_length_errors = 0;
1645 stats->rx_over_errors = 0;
1646 stats->rx_crc_errors = 0;
1647 stats->rx_frame_errors = 0;
1648 stats->rx_fifo_errors = 0;
1649 stats->rx_missed_errors = 0;
1650 stats->tx_aborted_errors = 0;
1651 stats->tx_carrier_errors = 0;
1652 stats->tx_fifo_errors = 0;
1653 stats->tx_heartbeat_errors = 0;
1654 stats->tx_window_errors = 0;
1656 /* Use kernel netdev's packet and byte counts since vport counters
1657 * do not reflect packet counts on the wire when GSO, TSO or GRO
1659 stats->rx_packets = dev_stats.tx_packets;
1660 stats->rx_bytes = dev_stats.tx_bytes;
1661 stats->tx_packets = dev_stats.rx_packets;
1662 stats->tx_bytes = dev_stats.rx_bytes;
/* rx/tx are deliberately crossed here: see the perspective-swap comment
 * above. */
1664 stats->rx_dropped += dev_stats.tx_dropped;
1665 stats->tx_dropped += dev_stats.rx_dropped;
1667 stats->rx_errors += dev_stats.tx_errors;
1668 stats->tx_errors += dev_stats.rx_errors;
1670 stats->multicast += dev_stats.multicast;
1671 stats->collisions += dev_stats.collisions;
1673 ovs_mutex_unlock(&netdev->mutex);
1679 netdev_internal_get_stats(const struct netdev *netdev_,
1680 struct netdev_stats *stats)
1682 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1685 ovs_mutex_lock(&netdev->mutex);
1686 get_stats_via_vport(netdev_, stats);
1687 error = netdev->vport_stats_error;
1688 ovs_mutex_unlock(&netdev->mutex);
/* Pushes '*stats' down to the datapath vport for 'netdev' via an
 * OVS_VPORT_CMD_SET transaction.  Returns 0 or a positive errno value. */
1694 netdev_internal_set_stats(struct netdev *netdev,
1695 const struct netdev_stats *stats)
1697 struct ovs_vport_stats vport_stats;
1698 struct dpif_linux_vport vport;
/* ovs_vport_stats fields are only 32-bit aligned, hence put_32aligned_u64. */
1701 put_32aligned_u64(&vport_stats.rx_packets, stats->rx_packets);
1702 put_32aligned_u64(&vport_stats.tx_packets, stats->tx_packets);
1703 put_32aligned_u64(&vport_stats.rx_bytes, stats->rx_bytes);
1704 put_32aligned_u64(&vport_stats.tx_bytes, stats->tx_bytes);
1705 put_32aligned_u64(&vport_stats.rx_errors, stats->rx_errors);
1706 put_32aligned_u64(&vport_stats.tx_errors, stats->tx_errors);
1707 put_32aligned_u64(&vport_stats.rx_dropped, stats->rx_dropped);
1708 put_32aligned_u64(&vport_stats.tx_dropped, stats->tx_dropped);
1710 dpif_linux_vport_init(&vport);
1711 vport.cmd = OVS_VPORT_CMD_SET;
1712 vport.name = netdev_get_name(netdev);
1713 vport.stats = &vport_stats;
1715 err = dpif_linux_vport_transact(&vport, NULL, NULL);
1717 /* If the vport layer doesn't know about the device, that doesn't mean it
1718 * doesn't exist (after all were able to open it when netdev_open() was
1719 * called), it just means that it isn't attached and we'll be getting
1720 * stats a different way. */
1721 if (err == ENODEV) {
/* Queries link-mode features for 'netdev' via ETHTOOL_GSET and translates
 * them into NETDEV_F_* bitmaps cached in netdev->supported / ->advertised /
 * ->current.  Results (including the error) are cached under VALID_FEATURES.
 * Caller must hold netdev->mutex. */
1729 netdev_linux_read_features(struct netdev_linux *netdev)
1731 struct ethtool_cmd ecmd;
/* Serve from the cache when a previous query already ran. */
1735 if (netdev->cache_valid & VALID_FEATURES) {
1739 COVERAGE_INC(netdev_get_ethtool);
1740 memset(&ecmd, 0, sizeof ecmd);
1741 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
1742 ETHTOOL_GSET, "ETHTOOL_GSET");
1747 /* Supported features. */
1748 netdev->supported = 0;
1749 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1750 netdev->supported |= NETDEV_F_10MB_HD;
1752 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1753 netdev->supported |= NETDEV_F_10MB_FD;
1755 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1756 netdev->supported |= NETDEV_F_100MB_HD;
1758 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1759 netdev->supported |= NETDEV_F_100MB_FD;
1761 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1762 netdev->supported |= NETDEV_F_1GB_HD;
1764 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1765 netdev->supported |= NETDEV_F_1GB_FD;
1767 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1768 netdev->supported |= NETDEV_F_10GB_FD;
1770 if (ecmd.supported & SUPPORTED_TP) {
1771 netdev->supported |= NETDEV_F_COPPER;
1773 if (ecmd.supported & SUPPORTED_FIBRE) {
1774 netdev->supported |= NETDEV_F_FIBER;
1776 if (ecmd.supported & SUPPORTED_Autoneg) {
1777 netdev->supported |= NETDEV_F_AUTONEG;
1779 if (ecmd.supported & SUPPORTED_Pause) {
1780 netdev->supported |= NETDEV_F_PAUSE;
1782 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1783 netdev->supported |= NETDEV_F_PAUSE_ASYM;
1786 /* Advertised features. */
1787 netdev->advertised = 0;
1788 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1789 netdev->advertised |= NETDEV_F_10MB_HD;
1791 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1792 netdev->advertised |= NETDEV_F_10MB_FD;
1794 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1795 netdev->advertised |= NETDEV_F_100MB_HD;
1797 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1798 netdev->advertised |= NETDEV_F_100MB_FD;
1800 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1801 netdev->advertised |= NETDEV_F_1GB_HD;
1803 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1804 netdev->advertised |= NETDEV_F_1GB_FD;
1806 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1807 netdev->advertised |= NETDEV_F_10GB_FD;
1809 if (ecmd.advertising & ADVERTISED_TP) {
1810 netdev->advertised |= NETDEV_F_COPPER;
1812 if (ecmd.advertising & ADVERTISED_FIBRE) {
1813 netdev->advertised |= NETDEV_F_FIBER;
1815 if (ecmd.advertising & ADVERTISED_Autoneg) {
1816 netdev->advertised |= NETDEV_F_AUTONEG;
1818 if (ecmd.advertising & ADVERTISED_Pause) {
1819 netdev->advertised |= NETDEV_F_PAUSE;
1821 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1822 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
1825 /* Current settings. */
1827 if (speed == SPEED_10) {
1828 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1829 } else if (speed == SPEED_100) {
1830 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1831 } else if (speed == SPEED_1000) {
1832 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1833 } else if (speed == SPEED_10000) {
1834 netdev->current = NETDEV_F_10GB_FD;
/* NOTE(review): 40000/100000/1000000 are bare Mb/s values -- presumably the
 * SPEED_40000-style macros were unavailable on older kernels; confirm. */
1835 } else if (speed == 40000) {
1836 netdev->current = NETDEV_F_40GB_FD;
1837 } else if (speed == 100000) {
1838 netdev->current = NETDEV_F_100GB_FD;
1839 } else if (speed == 1000000) {
1840 netdev->current = NETDEV_F_1TB_FD;
1842 netdev->current = 0;
1845 if (ecmd.port == PORT_TP) {
1846 netdev->current |= NETDEV_F_COPPER;
1847 } else if (ecmd.port == PORT_FIBRE) {
1848 netdev->current |= NETDEV_F_FIBER;
1852 netdev->current |= NETDEV_F_AUTONEG;
1856 netdev->cache_valid |= VALID_FEATURES;
1857 netdev->get_features_error = error;
1860 /* Stores the features supported by 'netdev' into of '*current', '*advertised',
1861 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
1862 * Returns 0 if successful, otherwise a positive errno value. */
1864 netdev_linux_get_features(const struct netdev *netdev_,
1865 enum netdev_features *current,
1866 enum netdev_features *advertised,
1867 enum netdev_features *supported,
1868 enum netdev_features *peer)
1870 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1873 ovs_mutex_lock(&netdev->mutex);
/* Populate (or reuse) the cached feature bitmaps, then copy them out. */
1874 netdev_linux_read_features(netdev);
1875 if (!netdev->get_features_error) {
1876 *current = netdev->current;
1877 *advertised = netdev->advertised;
1878 *supported = netdev->supported;
1879 *peer = 0; /* XXX */
1881 error = netdev->get_features_error;
1882 ovs_mutex_unlock(&netdev->mutex);
1887 /* Set the features advertised by 'netdev' to 'advertise'. */
1889 netdev_linux_set_advertisements(struct netdev *netdev_,
1890 enum netdev_features advertise)
1892 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1893 struct ethtool_cmd ecmd;
1896 ovs_mutex_lock(&netdev->mutex);
/* Read-modify-write: fetch the current ethtool settings, replace only the
 * advertising mask, and write them back with ETHTOOL_SSET. */
1898 COVERAGE_INC(netdev_get_ethtool);
1899 memset(&ecmd, 0, sizeof ecmd);
1900 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1901 ETHTOOL_GSET, "ETHTOOL_GSET");
1906 ecmd.advertising = 0;
1907 if (advertise & NETDEV_F_10MB_HD) {
1908 ecmd.advertising |= ADVERTISED_10baseT_Half;
1910 if (advertise & NETDEV_F_10MB_FD) {
1911 ecmd.advertising |= ADVERTISED_10baseT_Full;
1913 if (advertise & NETDEV_F_100MB_HD) {
1914 ecmd.advertising |= ADVERTISED_100baseT_Half;
1916 if (advertise & NETDEV_F_100MB_FD) {
1917 ecmd.advertising |= ADVERTISED_100baseT_Full;
1919 if (advertise & NETDEV_F_1GB_HD) {
1920 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1922 if (advertise & NETDEV_F_1GB_FD) {
1923 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1925 if (advertise & NETDEV_F_10GB_FD) {
1926 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1928 if (advertise & NETDEV_F_COPPER) {
1929 ecmd.advertising |= ADVERTISED_TP;
1931 if (advertise & NETDEV_F_FIBER) {
1932 ecmd.advertising |= ADVERTISED_FIBRE;
1934 if (advertise & NETDEV_F_AUTONEG) {
1935 ecmd.advertising |= ADVERTISED_Autoneg;
1937 if (advertise & NETDEV_F_PAUSE) {
1938 ecmd.advertising |= ADVERTISED_Pause;
1940 if (advertise & NETDEV_F_PAUSE_ASYM) {
1941 ecmd.advertising |= ADVERTISED_Asym_Pause;
1943 COVERAGE_INC(netdev_set_ethtool);
1944 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1945 ETHTOOL_SSET, "ETHTOOL_SSET");
1948 ovs_mutex_unlock(&netdev->mutex);
1952 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1953 * successful, otherwise a positive errno value. */
1955 netdev_linux_set_policing(struct netdev *netdev_,
1956 uint32_t kbits_rate, uint32_t kbits_burst)
1958 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1959 const char *netdev_name = netdev_get_name(netdev_);
1962 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1963 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1964 : kbits_burst); /* Stick with user-specified value. */
1966 ovs_mutex_lock(&netdev->mutex);
/* Short-circuit when the cached policing parameters already match. */
1967 if (netdev->cache_valid & VALID_POLICING) {
1968 error = netdev->netdev_policing_error;
1969 if (error || (netdev->kbits_rate == kbits_rate &&
1970 netdev->kbits_burst == kbits_burst)) {
1971 /* Assume that settings haven't changed since we last set them. */
1974 netdev->cache_valid &= ~VALID_POLICING;
1977 COVERAGE_INC(netdev_set_policing);
1978 /* Remove any existing ingress qdisc. */
1979 error = tc_add_del_ingress_qdisc(netdev_, false);
1981 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1982 netdev_name, ovs_strerror(error));
/* With a non-zero rate, install a fresh ingress qdisc and a policer on it. */
1987 error = tc_add_del_ingress_qdisc(netdev_, true);
1989 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1990 netdev_name, ovs_strerror(error));
1994 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
1996 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1997 netdev_name, ovs_strerror(error));
2002 netdev->kbits_rate = kbits_rate;
2003 netdev->kbits_burst = kbits_burst;
/* ENODEV is cached like success so a missing device is not retried. */
2006 if (!error || error == ENODEV) {
2007 netdev->netdev_policing_error = error;
2008 netdev->cache_valid |= VALID_POLICING;
2010 ovs_mutex_unlock(&netdev->mutex);
/* Adds to 'types' the OVS name of every installable QoS discipline that has a
 * non-empty OVS name. */
2015 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
2018 const struct tc_ops *const *opsp;
2020 for (opsp = tcs; *opsp != NULL; opsp++) {
2021 const struct tc_ops *ops = *opsp;
/* Disciplines without tc_install (or with an empty name) are query-only. */
2022 if (ops->tc_install && ops->ovs_name[0] != '\0') {
2023 sset_add(types, ops->ovs_name);
2029 static const struct tc_ops *
2030 tc_lookup_ovs_name(const char *name)
2032 const struct tc_ops *const *opsp;
2034 for (opsp = tcs; *opsp != NULL; opsp++) {
2035 const struct tc_ops *ops = *opsp;
2036 if (!strcmp(name, ops->ovs_name)) {
2043 static const struct tc_ops *
2044 tc_lookup_linux_name(const char *name)
2046 const struct tc_ops *const *opsp;
2048 for (opsp = tcs; *opsp != NULL; opsp++) {
2049 const struct tc_ops *ops = *opsp;
2050 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
2057 static struct tc_queue *
2058 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
2061 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2062 struct tc_queue *queue;
2064 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
2065 if (queue->queue_id == queue_id) {
2072 static struct tc_queue *
2073 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
2075 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
2079 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
2081 struct netdev_qos_capabilities *caps)
2083 const struct tc_ops *ops = tc_lookup_ovs_name(type);
2087 caps->n_queues = ops->n_queues;
/* Reports the QoS type configured on 'netdev_' through '*typep' and its
 * parameters through 'details'.  Returns 0 or a positive errno value. */
2092 netdev_linux_get_qos(const struct netdev *netdev_,
2093 const char **typep, struct smap *details)
2095 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2098 ovs_mutex_lock(&netdev->mutex);
2099 error = tc_query_qdisc(netdev_);
2101 *typep = netdev->tc->ops->ovs_name;
/* qdisc_get is optional; disciplines without parameters report success. */
2102 error = (netdev->tc->ops->qdisc_get
2103 ? netdev->tc->ops->qdisc_get(netdev_, details)
2106 ovs_mutex_unlock(&netdev->mutex);
/* Configures QoS discipline 'type' with parameters 'details' on 'netdev_'.
 * Returns 0 or a positive errno value. */
2112 netdev_linux_set_qos(struct netdev *netdev_,
2113 const char *type, const struct smap *details)
2115 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2116 const struct tc_ops *new_ops;
2119 new_ops = tc_lookup_ovs_name(type);
2120 if (!new_ops || !new_ops->tc_install) {
2124 ovs_mutex_lock(&netdev->mutex);
2125 error = tc_query_qdisc(netdev_);
/* If the requested discipline is already installed just update its
 * parameters; otherwise tear down the old qdisc and install the new one. */
2130 if (new_ops == netdev->tc->ops) {
2131 error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
2133 /* Delete existing qdisc. */
2134 error = tc_del_qdisc(netdev_);
2138 ovs_assert(netdev->tc == NULL);
2140 /* Install new qdisc. */
2141 error = new_ops->tc_install(netdev_, details);
2142 ovs_assert((error == 0) == (netdev->tc != NULL));
2146 ovs_mutex_unlock(&netdev->mutex);
/* Retrieves configuration details of queue 'queue_id' into 'details'.
 * Returns 0 or a positive errno value. */
2151 netdev_linux_get_queue(const struct netdev *netdev_,
2152 unsigned int queue_id, struct smap *details)
2154 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2157 ovs_mutex_lock(&netdev->mutex);
2158 error = tc_query_qdisc(netdev_);
2160 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2162 ? netdev->tc->ops->class_get(netdev_, queue, details)
2165 ovs_mutex_unlock(&netdev->mutex);
/* Creates or modifies queue 'queue_id' with parameters 'details'.  The queue
 * id must be within the discipline's n_queues and class_set must exist. */
2171 netdev_linux_set_queue(struct netdev *netdev_,
2172 unsigned int queue_id, const struct smap *details)
2174 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2177 ovs_mutex_lock(&netdev->mutex);
2178 error = tc_query_qdisc(netdev_);
2180 error = (queue_id < netdev->tc->ops->n_queues
2181 && netdev->tc->ops->class_set
2182 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
2185 ovs_mutex_unlock(&netdev->mutex);
/* Deletes queue 'queue_id' if the current discipline supports deletion. */
2191 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
2193 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2196 ovs_mutex_lock(&netdev->mutex);
2197 error = tc_query_qdisc(netdev_);
2199 if (netdev->tc->ops->class_delete) {
2200 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2202 ? netdev->tc->ops->class_delete(netdev_, queue)
2208 ovs_mutex_unlock(&netdev->mutex);
/* Retrieves statistics for queue 'queue_id' into '*stats'. */
2214 netdev_linux_get_queue_stats(const struct netdev *netdev_,
2215 unsigned int queue_id,
2216 struct netdev_queue_stats *stats)
2218 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2221 ovs_mutex_lock(&netdev->mutex);
2222 error = tc_query_qdisc(netdev_);
2224 if (netdev->tc->ops->class_get_stats) {
2225 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
/* 'created' is tracked by the generic layer, not the discipline. */
2227 stats->created = queue->created;
2228 error = netdev->tc->ops->class_get_stats(netdev_, queue,
2237 ovs_mutex_unlock(&netdev->mutex);
/* State carried across a netlink dump of traffic classes. */
2242 struct queue_dump_state {
2243 struct nl_dump dump;
/* Begins an RTM_GETTCLASS dump for 'netdev' into '*state'.  The reply buffer
 * in 'state' is sized for netlink dumps. */
2248 start_queue_dump(const struct netdev *netdev, struct queue_dump_state *state)
2250 struct ofpbuf request;
2251 struct tcmsg *tcmsg;
2253 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
2257 tcmsg->tcm_parent = 0;
2258 nl_dump_start(&state->dump, NETLINK_ROUTE, &request);
/* nl_dump_start() copies the request, so it can be released here. */
2259 ofpbuf_uninit(&request);
2261 ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
2266 finish_queue_dump(struct queue_dump_state *state)
2268 ofpbuf_uninit(&state->buf);
2269 return nl_dump_done(&state->dump);
/* Iterator state for netdev_linux_queue_dump_{start,next,done}: a snapshot of
 * queue ids plus a cursor. */
2272 struct netdev_linux_queue_state {
2273 unsigned int *queues;
/* Starts a queue dump on 'netdev_': snapshots all current queue ids into a
 * heap-allocated state returned through '*statep'. */
2279 netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
2281 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2284 ovs_mutex_lock(&netdev->mutex);
2285 error = tc_query_qdisc(netdev_);
2287 if (netdev->tc->ops->class_get) {
2288 struct netdev_linux_queue_state *state;
2289 struct tc_queue *queue;
2292 *statep = state = xmalloc(sizeof *state);
2293 state->n_queues = hmap_count(&netdev->tc->queues);
2294 state->cur_queue = 0;
2295 state->queues = xmalloc(state->n_queues * sizeof *state->queues);
/* Snapshot ids now so later queue changes do not disturb the dump. */
2298 HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
2299 state->queues[i++] = queue->queue_id;
2305 ovs_mutex_unlock(&netdev->mutex);
/* Advances a queue dump: yields the next still-existing queue id and its
 * details.  Queues deleted since dump_start are silently skipped. */
2311 netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
2312 unsigned int *queue_idp, struct smap *details)
2314 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2315 struct netdev_linux_queue_state *state = state_;
2318 ovs_mutex_lock(&netdev->mutex);
2319 while (state->cur_queue < state->n_queues) {
2320 unsigned int queue_id = state->queues[state->cur_queue++];
/* Re-resolve the id; the queue may have been deleted meanwhile. */
2321 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2324 *queue_idp = queue_id;
2325 error = netdev->tc->ops->class_get(netdev_, queue, details);
2329 ovs_mutex_unlock(&netdev->mutex);
2335 netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
2338 struct netdev_linux_queue_state *state = state_;
2340 free(state->queues);
/* Invokes 'cb' with statistics for each of 'netdev_''s queues, driven by a
 * netlink RTM_GETTCLASS dump.  Returns 0 or a positive errno value. */
2346 netdev_linux_dump_queue_stats(const struct netdev *netdev_,
2347 netdev_dump_queue_stats_cb *cb, void *aux)
2349 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2352 ovs_mutex_lock(&netdev->mutex);
2353 error = tc_query_qdisc(netdev_);
2355 struct queue_dump_state state;
2357 if (!netdev->tc->ops->class_dump_stats) {
2359 } else if (!start_queue_dump(netdev_, &state)) {
/* Each netlink reply message is handed to the discipline's stats parser,
 * which invokes 'cb' per queue. */
2365 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
2366 retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
2373 retval = finish_queue_dump(&state);
2379 ovs_mutex_unlock(&netdev->mutex);
/* Retrieves 'netdev_''s IPv4 address and netmask into '*address' and
 * '*netmask'.  Returns EADDRNOTAVAIL when no address is assigned. */
2385 netdev_linux_get_in4(const struct netdev *netdev_,
2386 struct in_addr *address, struct in_addr *netmask)
2388 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2391 ovs_mutex_lock(&netdev->mutex);
/* Populate the cache with both address and netmask on first use. */
2392 if (!(netdev->cache_valid & VALID_IN4)) {
2393 error = netdev_linux_get_ipv4(netdev_, &netdev->address,
2394 SIOCGIFADDR, "SIOCGIFADDR");
2396 error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
2397 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2399 netdev->cache_valid |= VALID_IN4;
/* INADDR_ANY in the cache means "no address assigned". */
2407 if (netdev->address.s_addr != INADDR_ANY) {
2408 *address = netdev->address;
2409 *netmask = netdev->netmask;
2411 error = EADDRNOTAVAIL;
2414 ovs_mutex_unlock(&netdev->mutex);
/* Assigns IPv4 'address'/'netmask' to 'netdev_' via SIOCSIFADDR and (when the
 * address is non-zero) SIOCSIFNETMASK, updating the cache on success. */
2420 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2421 struct in_addr netmask)
2423 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2426 ovs_mutex_lock(&netdev->mutex);
2427 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2429 netdev->cache_valid |= VALID_IN4;
2430 netdev->address = address;
2431 netdev->netmask = netmask;
/* Setting the netmask only makes sense with a real address. */
2432 if (address.s_addr != INADDR_ANY) {
2433 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2434 "SIOCSIFNETMASK", netmask);
2437 ovs_mutex_unlock(&netdev->mutex);
/* Parses one line of /proc/net/if_inet6: 32 hex digits of address followed by
 * index/prefix/scope/flags fields and the interface name.  Fills '*in6' and
 * 'ifname' and returns true on a successful parse. */
2443 parse_if_inet6_line(const char *line,
2444 struct in6_addr *in6, char ifname[16 + 1])
2446 uint8_t *s6 = in6->s6_addr;
2447 #define X8 "%2"SCNx8
2448 return ovs_scan(line,
2449 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
2450 "%*x %*x %*x %*x %16s\n",
2451 &s6[0], &s6[1], &s6[2], &s6[3],
2452 &s6[4], &s6[5], &s6[6], &s6[7],
2453 &s6[8], &s6[9], &s6[10], &s6[11],
2454 &s6[12], &s6[13], &s6[14], &s6[15],
2458 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2459 * 'in6' is non-null) and returns true. Otherwise, returns false. */
2461 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2463 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2465 ovs_mutex_lock(&netdev->mutex);
/* The address is discovered by scanning /proc/net/if_inet6 for a line whose
 * interface name matches ours, and then cached under VALID_IN6. */
2466 if (!(netdev->cache_valid & VALID_IN6)) {
2470 netdev->in6 = in6addr_any;
2472 file = fopen("/proc/net/if_inet6", "r");
2474 const char *name = netdev_get_name(netdev_);
2475 while (fgets(line, sizeof line, file)) {
2476 struct in6_addr in6_tmp;
2477 char ifname[16 + 1];
2478 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2479 && !strcmp(name, ifname))
2481 netdev->in6 = in6_tmp;
2487 netdev->cache_valid |= VALID_IN6;
2490 ovs_mutex_unlock(&netdev->mutex);
/* Writes an AF_INET sockaddr carrying 'addr' (port 0) into '*sa', zeroing any
 * trailing bytes of the generic sockaddr. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in in4;

    memset(&in4, 0, sizeof in4);
    in4.sin_family = AF_INET;
    in4.sin_addr = addr;
    in4.sin_port = 0;

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &in4, sizeof in4);
}
2509 do_set_addr(struct netdev *netdev,
2510 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2514 make_in4_sockaddr(&ifr.ifr_addr, addr);
2515 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2519 /* Adds 'router' as a default IP gateway. */
/* Installs a default route (dst 0.0.0.0/0) through gateway 'router' using
 * the SIOCADDRT ioctl; 'netdev' itself is unused.  Failures are logged. */
2521 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2523 struct in_addr any = { INADDR_ANY };
2527 memset(&rt, 0, sizeof rt);
2528 make_in4_sockaddr(&rt.rt_dst, any);
2529 make_in4_sockaddr(&rt.rt_gateway, router);
2530 make_in4_sockaddr(&rt.rt_genmask, any);
2531 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2532 error = af_inet_ioctl(SIOCADDRT, &rt);
2534 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
/* Scans /proc/net/route for an "up" route matching 'host'.  On a match,
 * stores the gateway (or 0 when the host is on-link) in '*next_hop' and the
 * owning interface name (heap-allocated; caller frees) in '*netdev_name'. */
2540 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2543 static const char fn[] = "/proc/net/route";
2548 *netdev_name = NULL;
2549 stream = fopen(fn, "r");
2550 if (stream == NULL) {
2551 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
2556 while (fgets(line, sizeof line, stream)) {
2559 ovs_be32 dest, gateway, mask;
2560 int refcnt, metric, mtu;
2561 unsigned int flags, use, window, irtt;
/* Field order matches the kernel's /proc/net/route columns. */
2564 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2566 iface, &dest, &gateway, &flags, &refcnt,
2567 &use, &metric, &mask, &mtu, &window, &irtt)) {
2568 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2572 if (!(flags & RTF_UP)) {
2573 /* Skip routes that aren't up. */
2577 /* The output of 'dest', 'mask', and 'gateway' were given in
2578 * network byte order, so we don't need any endian
2579 * conversions here. */
2580 if ((dest & mask) == (host->s_addr & mask)) {
2582 /* The host is directly reachable. */
2583 next_hop->s_addr = 0;
2585 /* To reach the host, we must go through a gateway. */
2586 next_hop->s_addr = gateway;
2588 *netdev_name = xstrdup(iface);
/* Populates 'smap' with driver name/version/firmware from a (cached)
 * ETHTOOL_GDRVINFO query on 'netdev_'. */
2600 netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
2602 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2605 ovs_mutex_lock(&netdev->mutex);
2606 if (!(netdev->cache_valid & VALID_DRVINFO)) {
/* drvinfo is reinterpreted as an ethtool_cmd for the ethtool call. */
2607 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
2609 COVERAGE_INC(netdev_get_ethtool);
2610 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2611 error = netdev_linux_do_ethtool(netdev->up.name,
2614 "ETHTOOL_GDRVINFO");
2616 netdev->cache_valid |= VALID_DRVINFO;
2621 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2622 smap_add(smap, "driver_version", netdev->drvinfo.version);
2623 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
2625 ovs_mutex_unlock(&netdev->mutex);

/* Status for internal devices: a fixed driver name, no ethtool query. */
2631 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2634 smap_add(smap, "driver_name", "openvswitch");
2638 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2639 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2640 * returns 0. Otherwise, it returns a positive errno value; in particular,
2641 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
/* Queries the kernel ARP table (SIOCGARP) on 'netdev' for 'ip' and copies
 * the resolved hardware address into 'mac' on success.  ENXIO (no entry)
 * is expected and not logged; other failures are rate-limit logged. */
2643 netdev_linux_arp_lookup(const struct netdev *netdev,
2644 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2647 struct sockaddr_in sin;
2650 memset(&r, 0, sizeof r);
2651 memset(&sin, 0, sizeof sin);
2652 sin.sin_family = AF_INET;
2653 sin.sin_addr.s_addr = ip;
2655 memcpy(&r.arp_pa, &sin, sizeof sin);
2656 r.arp_ha.sa_family = ARPHRD_ETHER;
2658 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2659 COVERAGE_INC(netdev_arp_lookup);
2660 retval = af_inet_ioctl(SIOCGARP, &r);
2662 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2663 } else if (retval != ENXIO) {
2664 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2665 netdev_get_name(netdev), IP_ARGS(ip),
2666 ovs_strerror(retval));
/* Translates generic netdev flags to Linux IFF_* interface flags. */
2672 nd_to_iff_flags(enum netdev_flags nd)
2675 if (nd & NETDEV_UP) {
2678 if (nd & NETDEV_PROMISC) {
2681 if (nd & NETDEV_LOOPBACK) {
2682 iff |= IFF_LOOPBACK;

/* Inverse of nd_to_iff_flags(): Linux IFF_* flags to netdev flags. */
2688 iff_to_nd_flags(int iff)
2690 enum netdev_flags nd = 0;
2694 if (iff & IFF_PROMISC) {
2695 nd |= NETDEV_PROMISC;
2697 if (iff & IFF_LOOPBACK) {
2698 nd |= NETDEV_LOOPBACK;
/* Clears flags in 'off' and sets flags in 'on' for 'netdev', reporting the
 * previous flags through '*old_flagsp'.  The kernel is only touched when
 * the computed flag word actually changes.  Caller holds netdev->mutex. */
2704 update_flags(struct netdev_linux *netdev, enum netdev_flags off,
2705 enum netdev_flags on, enum netdev_flags *old_flagsp)
2706 OVS_REQUIRES(netdev->mutex)
2708 int old_flags, new_flags;
2711 old_flags = netdev->ifi_flags;
2712 *old_flagsp = iff_to_nd_flags(old_flags);
2713 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2714 if (new_flags != old_flags) {
2715 error = set_flags(netdev_get_name(&netdev->up), new_flags);
/* Re-read from the kernel rather than trusting our computed value. */
2716 get_flags(&netdev->up, &netdev->ifi_flags);

/* Locked public wrapper around update_flags(). */
2723 netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2724 enum netdev_flags on, enum netdev_flags *old_flagsp)
2726 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2729 ovs_mutex_lock(&netdev->mutex);
2730 error = update_flags(netdev, off, on, old_flagsp);
2731 ovs_mutex_unlock(&netdev->mutex);
/* Expands to a struct netdev_class initializer sharing the common Linux
 * netdev callbacks; the per-variant construct/stats/features/status
 * callbacks are supplied as macro arguments below. */
2736 #define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS, SET_STATS,  \
2737                            GET_FEATURES, GET_STATUS)               \
2743     netdev_linux_wait,                                              \
2745     netdev_linux_alloc,                                             \
2747     netdev_linux_destruct,                                          \
2748     netdev_linux_dealloc,                                           \
2749     NULL,                       /* get_config */                    \
2750     NULL,                       /* set_config */                    \
2751     NULL,                       /* get_tunnel_config */             \
2753     netdev_linux_send,                                              \
2754     netdev_linux_send_wait,                                         \
2756     netdev_linux_set_etheraddr,                                     \
2757     netdev_linux_get_etheraddr,                                     \
2758     netdev_linux_get_mtu,                                           \
2759     netdev_linux_set_mtu,                                           \
2760     netdev_linux_get_ifindex,                                       \
2761     netdev_linux_get_carrier,                                       \
2762     netdev_linux_get_carrier_resets,                                \
2763     netdev_linux_set_miimon_interval,                               \
2768     netdev_linux_set_advertisements,                                \
2770     netdev_linux_set_policing,                                      \
2771     netdev_linux_get_qos_types,                                     \
2772     netdev_linux_get_qos_capabilities,                              \
2773     netdev_linux_get_qos,                                           \
2774     netdev_linux_set_qos,                                           \
2775     netdev_linux_get_queue,                                         \
2776     netdev_linux_set_queue,                                         \
2777     netdev_linux_delete_queue,                                      \
2778     netdev_linux_get_queue_stats,                                   \
2779     netdev_linux_queue_dump_start,                                  \
2780     netdev_linux_queue_dump_next,                                   \
2781     netdev_linux_queue_dump_done,                                   \
2782     netdev_linux_dump_queue_stats,                                  \
2784     netdev_linux_get_in4,                                           \
2785     netdev_linux_set_in4,                                           \
2786     netdev_linux_get_in6,                                           \
2787     netdev_linux_add_router,                                        \
2788     netdev_linux_get_next_hop,                                      \
2790     netdev_linux_arp_lookup,                                        \
2792     netdev_linux_update_flags,                                      \
2794     netdev_linux_rxq_alloc,                                         \
2795     netdev_linux_rxq_construct,                                     \
2796     netdev_linux_rxq_destruct,                                      \
2797     netdev_linux_rxq_dealloc,                                       \
2798     netdev_linux_rxq_recv,                                          \
2799     netdev_linux_rxq_wait,                                          \
2800     netdev_linux_rxq_drain,                                         \

/* Ordinary Linux network devices ("system" type). */
2803 const struct netdev_class netdev_linux_class =
2806         netdev_linux_construct,
2807         netdev_linux_get_stats,
2808         NULL,                    /* set_stats */
2809         netdev_linux_get_features,
2810         netdev_linux_get_status);

/* Linux tap devices: same callbacks except construction and stats. */
2812 const struct netdev_class netdev_tap_class =
2815         netdev_linux_construct_tap,
2816         netdev_tap_get_stats,
2817         NULL,                   /* set_stats */
2818         netdev_linux_get_features,
2819         netdev_linux_get_status);

/* OVS internal devices: internal stats/status, no features callback. */
2821 const struct netdev_class netdev_internal_class =
2824         netdev_linux_construct,
2825         netdev_internal_get_stats,
2826         netdev_internal_set_stats,
2827         NULL,                  /* get_features */
2828         netdev_internal_get_status);
2830 /* HTB traffic control class. */
/* Maximum number of HTB queues; queue ids map to tc minor numbers 1..N. */
2832 #define HTB_N_QUEUES 0xf000
2836 unsigned int max_rate; /* In bytes/s. */
2840 struct tc_queue tc_queue;
2841 unsigned int min_rate; /* In bytes/s. */
2842 unsigned int max_rate; /* In bytes/s. */
2843 unsigned int burst; /* In bytes. */
2844 unsigned int priority; /* Lower values are higher priorities. */

/* Returns the struct htb embedding 'netdev_'s tc object. */
2848 htb_get__(const struct netdev *netdev_)
2850 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2851 return CONTAINER_OF(netdev->tc, struct htb, tc);

/* Allocates an htb with root 'max_rate' and attaches it to 'netdev_'. */
2855 htb_install__(struct netdev *netdev_, uint64_t max_rate)
2857 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2860 htb = xmalloc(sizeof *htb);
2861 tc_init(&htb->tc, &tc_ops_htb);
2862 htb->max_rate = max_rate;
2864 netdev->tc = &htb->tc;
2867 /* Create an HTB qdisc.
2869 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
/* Deletes any existing qdisc on 'netdev', then installs a root HTB qdisc
 * (handle 1:) via an RTM_NEWQDISC netlink request.  Returns 0 on success
 * or a positive errno from tc_transact(). */
2871 htb_setup_qdisc__(struct netdev *netdev)
2874 struct tc_htb_glob opt;
2875 struct ofpbuf request;
2876 struct tcmsg *tcmsg;
2878 tc_del_qdisc(netdev);
2880 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2881 NLM_F_EXCL | NLM_F_CREATE, &request);
2885 tcmsg->tcm_handle = tc_make_handle(1, 0);
2886 tcmsg->tcm_parent = TC_H_ROOT;
2888 nl_msg_put_string(&request, TCA_KIND, "htb");
2890 memset(&opt, 0, sizeof opt);
/* rate2quantum: DRR quantum = rate / this value (HTB's r2q knob). */
2891 opt.rate2quantum = 10;
2895 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2896 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2897 nl_msg_end_nested(&request, opt_offset);
2899 return tc_transact(&request, NULL);
2902 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2903 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
/* Creates or replaces HTB class 'handle' under 'parent' on 'netdev' with
 * the rates/burst/priority in 'class', via RTM_NEWTCLASS.  Requires the
 * device MTU to size the rate tables; logs and returns on failure. */
2905 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2906 unsigned int parent, struct htb_class *class)
2909 struct tc_htb_opt opt;
2910 struct ofpbuf request;
2911 struct tcmsg *tcmsg;
2915 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
2917 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2918 netdev_get_name(netdev));
2922 memset(&opt, 0, sizeof opt);
/* rate = guaranteed (min) rate, ceil = maximum borrowable rate. */
2923 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2924 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2925 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2926 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2927 opt.prio = class->priority;
2929 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2933 tcmsg->tcm_handle = handle;
2934 tcmsg->tcm_parent = parent;
2936 nl_msg_put_string(&request, TCA_KIND, "htb");
2937 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2938 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
/* Kernel HTB also needs explicit rate tables for rate and ceil. */
2939 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2940 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2941 nl_msg_end_nested(&request, opt_offset);
2943 error = tc_transact(&request, NULL);
2945 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2946 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2947 netdev_get_name(netdev),
2948 tc_get_major(handle), tc_get_minor(handle),
2949 tc_get_major(parent), tc_get_minor(parent),
2950 class->min_rate, class->max_rate,
2951 class->burst, class->priority, ovs_strerror(error));
2956 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2957 * description of them into 'details'. The description complies with the
2958 * specification given in the vswitch database documentation for linux-htb
/* Extracts TCA_HTB_PARMS from 'nl_options' and converts the kernel's
 * tc_htb_opt into the rates/burst/priority fields of 'class'. */
2961 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2963 static const struct nl_policy tca_htb_policy[] = {
2964 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2965 .min_len = sizeof(struct tc_htb_opt) },
2968 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2969 const struct tc_htb_opt *htb;
2971 if (!nl_parse_nested(nl_options, tca_htb_policy,
2972 attrs, ARRAY_SIZE(tca_htb_policy))) {
2973 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2977 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2978 class->min_rate = htb->rate.rate;
2979 class->max_rate = htb->ceil.rate;
/* 'buffer' is in tc ticks; convert back to bytes at the min rate. */
2980 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2981 class->priority = htb->prio;

/* Parses an RTM_NEWTCLASS-style message: derives the 0-based queue id from
 * the tc minor number (major must be 1), and optionally fills 'options'
 * and 'stats' from the message payload. */
2986 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2987 struct htb_class *options,
2988 struct netdev_queue_stats *stats)
2990 struct nlattr *nl_options;
2991 unsigned int handle;
2994 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2995 if (!error && queue_id) {
2996 unsigned int major = tc_get_major(handle);
2997 unsigned int minor = tc_get_minor(handle);
2998 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2999 *queue_id = minor - 1;
3004 if (!error && options) {
3005 error = htb_parse_tca_options__(nl_options, options);
/* Reads "max-rate" (bits/s) from 'details' into 'hc' as bytes/s.  When
 * absent or zero, falls back to the link speed from the netdev's features
 * (default 100 Mbps when unknown).  min_rate mirrors max_rate for the
 * root default class. */
3011 htb_parse_qdisc_details__(struct netdev *netdev_,
3012 const struct smap *details, struct htb_class *hc)
3014 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3015 const char *max_rate_s;
3017 max_rate_s = smap_get(details, "max-rate");
3018 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3019 if (!hc->max_rate) {
3020 enum netdev_features current;
3022 netdev_linux_read_features(netdev);
3023 current = !netdev->get_features_error ? netdev->current : 0;
3024 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3026 hc->min_rate = hc->max_rate;
3032 htb_parse_class_details__(struct netdev *netdev,
3033 const struct smap *details, struct htb_class *hc)
3035 const struct htb *htb = htb_get__(netdev);
3036 const char *min_rate_s = smap_get(details, "min-rate");
3037 const char *max_rate_s = smap_get(details, "max-rate");
3038 const char *burst_s = smap_get(details, "burst");
3039 const char *priority_s = smap_get(details, "priority");
3042 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3044 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
3045 netdev_get_name(netdev));
3049 /* HTB requires at least an mtu sized min-rate to send any traffic even
3050 * on uncongested links. */
3051 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3052 hc->min_rate = MAX(hc->min_rate, mtu);
3053 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
3056 hc->max_rate = (max_rate_s
3057 ? strtoull(max_rate_s, NULL, 10) / 8
3059 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
3060 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
3064 * According to hints in the documentation that I've read, it is important
3065 * that 'burst' be at least as big as the largest frame that might be
3066 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
3067 * but having it a bit too small is a problem. Since netdev_get_mtu()
3068 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
3069 * the MTU. We actually add 64, instead of 14, as a guard against
3070 * additional headers get tacked on somewhere that we're not aware of. */
3071 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
3072 hc->burst = MAX(hc->burst, mtu + 64);
3075 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Queries the kernel for class 'handle' under 'parent' and parses the
 * reply into 'options' and/or 'stats' (either may be null). */
3081 htb_query_class__(const struct netdev *netdev, unsigned int handle,
3082 unsigned int parent, struct htb_class *options,
3083 struct netdev_queue_stats *stats)
3085 struct ofpbuf *reply;
3088 error = tc_query_class(netdev, handle, parent, &reply);
3090 error = htb_parse_tcmsg__(reply, NULL, options, stats);
3091 ofpbuf_delete(reply);

/* tc_ops "tc_install" callback: creates the root HTB qdisc plus its
 * default class (1:fffe), then records the tc state on 'netdev'. */
3097 htb_tc_install(struct netdev *netdev, const struct smap *details)
3101 error = htb_setup_qdisc__(netdev);
3103 struct htb_class hc;
3105 htb_parse_qdisc_details__(netdev, details, &hc);
3106 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3107 tc_make_handle(1, 0), &hc);
3109 htb_install__(netdev, hc.max_rate);

/* Downcasts a generic tc_queue to its containing htb_class. */
3115 static struct htb_class *
3116 htb_class_cast__(const struct tc_queue *queue)
3118 return CONTAINER_OF(queue, struct htb_class, tc_queue);

/* Inserts or refreshes the in-memory record for 'queue_id' with the
 * parameters in 'hc'; allocates a new htb_class on first sight. */
3122 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
3123 const struct htb_class *hc)
3125 struct htb *htb = htb_get__(netdev);
3126 size_t hash = hash_int(queue_id, 0);
3127 struct tc_queue *queue;
3128 struct htb_class *hcp;
3130 queue = tc_find_queue__(netdev, queue_id, hash);
3132 hcp = htb_class_cast__(queue);
3134 hcp = xmalloc(sizeof *hcp);
3135 queue = &hcp->tc_queue;
3136 queue->queue_id = queue_id;
3137 queue->created = time_msec();
3138 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
3141 hcp->min_rate = hc->min_rate;
3142 hcp->max_rate = hc->max_rate;
3143 hcp->burst = hc->burst;
3144 hcp->priority = hc->priority;
/* tc_ops "tc_load" callback: reconstructs in-memory HTB state from the
 * kernel by querying the default class and dumping all queues. */
3148 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3151 struct queue_dump_state state;
3152 struct htb_class hc;
3154 /* Get qdisc options. */
3156 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3157 htb_install__(netdev, hc.max_rate);
3160 if (!start_queue_dump(netdev, &state)) {
3163 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
3164 unsigned int queue_id;
3166 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3167 htb_update_queue__(netdev, queue_id, &hc);
3170 finish_queue_dump(&state);

/* tc_ops "tc_destroy" callback: frees every queued htb_class. */
3176 htb_tc_destroy(struct tc *tc)
3178 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
3179 struct htb_class *hc, *next;
3181 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
3182 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Reports the qdisc's max-rate (converted back to bits/s) in 'details'. */
3190 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
3192 const struct htb *htb = htb_get__(netdev);
3193 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);

/* Reconfigures the root default class from 'details' and caches the new
 * max_rate on success. */
3198 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
3200 struct htb_class hc;
3203 htb_parse_qdisc_details__(netdev, details, &hc);
3204 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3205 tc_make_handle(1, 0), &hc);
3207 htb_get__(netdev)->max_rate = hc.max_rate;

/* Reports one queue's settings in 'details' (rates in bits/s, burst in
 * bits); max-rate is omitted when it equals min-rate. */
3213 htb_class_get(const struct netdev *netdev OVS_UNUSED,
3214 const struct tc_queue *queue, struct smap *details)
3216 const struct htb_class *hc = htb_class_cast__(queue);
3218 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3219 if (hc->min_rate != hc->max_rate) {
3220 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3222 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
3224 smap_add_format(details, "priority", "%u", hc->priority);

/* Creates/updates kernel class 1:(queue_id+1) under 1:fffe, then mirrors
 * the change into the in-memory queue map. */
3230 htb_class_set(struct netdev *netdev, unsigned int queue_id,
3231 const struct smap *details)
3233 struct htb_class hc;
3236 error = htb_parse_class_details__(netdev, details, &hc);
3241 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3242 tc_make_handle(1, 0xfffe), &hc);
3247 htb_update_queue__(netdev, queue_id, &hc);

/* Deletes the kernel class for 'queue' and frees its record. */
3252 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
3254 struct htb_class *hc = htb_class_cast__(queue);
3255 struct htb *htb = htb_get__(netdev);
3258 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3260 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);

/* Fetches kernel statistics for one queue. */
3267 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3268 struct netdev_queue_stats *stats)
3270 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3271 tc_make_handle(1, 0xfffe), NULL, stats);

/* Dump-stats callback: converts a class message's minor number to a queue
 * id and hands the parsed stats to 'cb'. */
3275 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3276 const struct ofpbuf *nlmsg,
3277 netdev_dump_queue_stats_cb *cb, void *aux)
3279 struct netdev_queue_stats stats;
3280 unsigned int handle, major, minor;
3283 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3288 major = tc_get_major(handle);
3289 minor = tc_get_minor(handle);
3290 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3291 (*cb)(minor - 1, &stats, aux);

/* vtable binding the HTB callbacks to the "linux-htb" QoS type. */
3296 static const struct tc_ops tc_ops_htb = {
3297 "htb", /* linux_name */
3298 "linux-htb", /* ovs_name */
3299 HTB_N_QUEUES, /* n_queues */
3308 htb_class_get_stats,
3309 htb_class_dump_stats
3312 /* "linux-hfsc" traffic control class. */
/* Maximum number of HFSC queues; same minor-number mapping as HTB. */
3314 #define HFSC_N_QUEUES 0xf000
3322 struct tc_queue tc_queue;

/* Returns the struct hfsc embedding 'netdev_'s tc object. */
3327 static struct hfsc *
3328 hfsc_get__(const struct netdev *netdev_)
3330 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3331 return CONTAINER_OF(netdev->tc, struct hfsc, tc);

/* Downcasts a generic tc_queue to its containing hfsc_class. */
3334 static struct hfsc_class *
3335 hfsc_class_cast__(const struct tc_queue *queue)
3337 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);

/* Allocates an hfsc with root 'max_rate' and attaches it to 'netdev_'. */
3341 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
3343 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3346 hfsc = xmalloc(sizeof *hfsc);
3347 tc_init(&hfsc->tc, &tc_ops_hfsc);
3348 hfsc->max_rate = max_rate;
3349 netdev->tc = &hfsc->tc;

/* Inserts or refreshes the in-memory record for 'queue_id' with the
 * rates in 'hc'; allocates a new hfsc_class on first sight. */
3353 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
3354 const struct hfsc_class *hc)
3358 struct hfsc_class *hcp;
3359 struct tc_queue *queue;
3361 hfsc = hfsc_get__(netdev);
3362 hash = hash_int(queue_id, 0);
3364 queue = tc_find_queue__(netdev, queue_id, hash);
3366 hcp = hfsc_class_cast__(queue);
3368 hcp = xmalloc(sizeof *hcp);
3369 queue = &hcp->tc_queue;
3370 queue->queue_id = queue_id;
3371 queue->created = time_msec();
3372 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
3375 hcp->min_rate = hc->min_rate;
3376 hcp->max_rate = hc->max_rate;
/* Extracts the RSC/FSC/USC service curves from 'nl_options' and converts
 * them to min/max rates in 'class'.  Only linear curves (m1 == 0, d == 0)
 * with matching real-time/link-share slopes are accepted. */
3380 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
3382 const struct tc_service_curve *rsc, *fsc, *usc;
3383 static const struct nl_policy tca_hfsc_policy[] = {
3385 .type = NL_A_UNSPEC,
3387 .min_len = sizeof(struct tc_service_curve),
3390 .type = NL_A_UNSPEC,
3392 .min_len = sizeof(struct tc_service_curve),
3395 .type = NL_A_UNSPEC,
3397 .min_len = sizeof(struct tc_service_curve),
3400 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3402 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3403 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3404 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3408 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3409 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3410 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
3412 if (rsc->m1 != 0 || rsc->d != 0 ||
3413 fsc->m1 != 0 || fsc->d != 0 ||
3414 usc->m1 != 0 || usc->d != 0) {
3415 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3416 "Non-linear service curves are not supported.");
3420 if (rsc->m2 != fsc->m2) {
3421 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3422 "Real-time service curves are not supported ");
3426 if (rsc->m2 > usc->m2) {
3427 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3428 "Min-rate service curve is greater than "
3429 "the max-rate service curve.");
/* m2 is the long-term slope: link-share = min rate, upper-limit = max. */
3433 class->min_rate = fsc->m2;
3434 class->max_rate = usc->m2;
/* Parses an HFSC class netlink message: derives the 0-based queue id from
 * the tc minor number (major must be 1), and optionally fills 'options'
 * and 'stats'. */
3439 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3440 struct hfsc_class *options,
3441 struct netdev_queue_stats *stats)
3444 unsigned int handle;
3445 struct nlattr *nl_options;
3447 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3453 unsigned int major, minor;
3455 major = tc_get_major(handle);
3456 minor = tc_get_minor(handle);
3457 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3458 *queue_id = minor - 1;
3465 error = hfsc_parse_tca_options__(nl_options, options);

/* Queries the kernel for class 'handle' under 'parent' and parses the
 * reply into 'options' and/or 'stats' (either may be null). */
3472 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
3473 unsigned int parent, struct hfsc_class *options,
3474 struct netdev_queue_stats *stats)
3477 struct ofpbuf *reply;
3479 error = tc_query_class(netdev, handle, parent, &reply);
3484 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
3485 ofpbuf_delete(reply);

/* Reads "max-rate" (bits/s) from 'details', defaulting to the link speed
 * (100 Mbps when unknown), and mirrors it into both min and max rate. */
3490 hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
3491 struct hfsc_class *class)
3493 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3495 const char *max_rate_s;
3497 max_rate_s = smap_get(details, "max-rate");
3498 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3501 enum netdev_features current;
3503 netdev_linux_read_features(netdev);
3504 current = !netdev->get_features_error ? netdev->current : 0;
3505 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3508 class->min_rate = max_rate;
3509 class->max_rate = max_rate;
/* Parses per-queue "min-rate"/"max-rate" (bits/s) from 'details' into
 * 'class', clamping min to at least 1 byte/s and both to the qdisc's
 * root max_rate; max is raised to at least min. */
3513 hfsc_parse_class_details__(struct netdev *netdev,
3514 const struct smap *details,
3515 struct hfsc_class * class)
3517 const struct hfsc *hfsc;
3518 uint32_t min_rate, max_rate;
3519 const char *min_rate_s, *max_rate_s;
3521 hfsc = hfsc_get__(netdev);
3522 min_rate_s = smap_get(details, "min-rate");
3523 max_rate_s = smap_get(details, "max-rate");
3525 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3526 min_rate = MAX(min_rate, 1);
3527 min_rate = MIN(min_rate, hfsc->max_rate);
3529 max_rate = (max_rate_s
3530 ? strtoull(max_rate_s, NULL, 10) / 8
3532 max_rate = MAX(max_rate, min_rate);
3533 max_rate = MIN(max_rate, hfsc->max_rate);
3535 class->min_rate = min_rate;
3536 class->max_rate = max_rate;
3541 /* Create an HFSC qdisc.
3543 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
/* Replaces any existing qdisc with a root HFSC qdisc (handle 1:) via
 * RTM_NEWQDISC.  Returns 0 or a positive errno from tc_transact(). */
3545 hfsc_setup_qdisc__(struct netdev * netdev)
3547 struct tcmsg *tcmsg;
3548 struct ofpbuf request;
3549 struct tc_hfsc_qopt opt;
3551 tc_del_qdisc(netdev);
3553 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3554 NLM_F_EXCL | NLM_F_CREATE, &request);
3560 tcmsg->tcm_handle = tc_make_handle(1, 0);
3561 tcmsg->tcm_parent = TC_H_ROOT;
3563 memset(&opt, 0, sizeof opt);
3566 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3567 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3569 return tc_transact(&request, NULL);

/* Creates/replaces HFSC class 'handle' under 'parent' with linear service
 * curves: min_rate for both RSC and FSC, max_rate for USC.  Failures are
 * rate-limit logged with the full class identification. */
3577 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3578 unsigned int parent, struct hfsc_class *class)
3582 struct tcmsg *tcmsg;
3583 struct ofpbuf request;
3584 struct tc_service_curve min, max;
3586 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3592 tcmsg->tcm_handle = handle;
3593 tcmsg->tcm_parent = parent;
3597 min.m2 = class->min_rate;
3601 max.m2 = class->max_rate;
3603 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3604 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3605 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3606 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3607 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3608 nl_msg_end_nested(&request, opt_offset);
3610 error = tc_transact(&request, NULL);
3612 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3613 "min-rate %ubps, max-rate %ubps (%s)",
3614 netdev_get_name(netdev),
3615 tc_get_major(handle), tc_get_minor(handle),
3616 tc_get_major(parent), tc_get_minor(parent),
3617 class->min_rate, class->max_rate, ovs_strerror(error));
/* tc_ops "tc_install" callback: creates the root HFSC qdisc plus its
 * default class (1:fffe), then records the tc state on 'netdev'. */
3624 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
3627 struct hfsc_class class;
3629 error = hfsc_setup_qdisc__(netdev);
3635 hfsc_parse_qdisc_details__(netdev, details, &class);
3636 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3637 tc_make_handle(1, 0), &class);
3643 hfsc_install__(netdev, class.max_rate);

/* tc_ops "tc_load" callback: rebuilds in-memory HFSC state from the
 * kernel by querying the default class and dumping all queues. */
3648 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3651 struct queue_dump_state state;
3652 struct hfsc_class hc;
3655 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3656 hfsc_install__(netdev, hc.max_rate);
3658 if (!start_queue_dump(netdev, &state)) {
3662 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
3663 unsigned int queue_id;
3665 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3666 hfsc_update_queue__(netdev, queue_id, &hc);
3670 finish_queue_dump(&state);

/* tc_ops "tc_destroy" callback: frees every queued hfsc_class. */
3675 hfsc_tc_destroy(struct tc *tc)
3678 struct hfsc_class *hc, *next;
3680 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3682 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3683 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Reports the qdisc's max-rate (converted back to bits/s) in 'details'. */
3692 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
3694 const struct hfsc *hfsc;
3695 hfsc = hfsc_get__(netdev);
3696 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);

/* Reconfigures the root default class from 'details' and caches the new
 * max_rate on success. */
3701 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
3704 struct hfsc_class class;
3706 hfsc_parse_qdisc_details__(netdev, details, &class);
3707 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3708 tc_make_handle(1, 0), &class);
3711 hfsc_get__(netdev)->max_rate = class.max_rate;

/* Reports one queue's rates (bits/s) in 'details'; max-rate is omitted
 * when it equals min-rate. */
3718 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3719 const struct tc_queue *queue, struct smap *details)
3721 const struct hfsc_class *hc;
3723 hc = hfsc_class_cast__(queue);
3724 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3725 if (hc->min_rate != hc->max_rate) {
3726 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);

/* Creates/updates kernel class 1:(queue_id+1) under 1:fffe, then mirrors
 * the change into the in-memory queue map. */
3732 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3733 const struct smap *details)
3736 struct hfsc_class class;
3738 error = hfsc_parse_class_details__(netdev, details, &class);
3743 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3744 tc_make_handle(1, 0xfffe), &class);
3749 hfsc_update_queue__(netdev, queue_id, &class);

/* Deletes the kernel class for 'queue' and frees its record. */
3754 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3758 struct hfsc_class *hc;
3760 hc = hfsc_class_cast__(queue);
3761 hfsc = hfsc_get__(netdev);
3763 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3765 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);

/* Fetches kernel statistics for one queue. */
3772 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3773 struct netdev_queue_stats *stats)
3775 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3776 tc_make_handle(1, 0xfffe), NULL, stats);

/* Dump-stats callback: converts a class message's minor number to a queue
 * id and hands the parsed stats to 'cb'. */
3780 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3781 const struct ofpbuf *nlmsg,
3782 netdev_dump_queue_stats_cb *cb, void *aux)
3784 struct netdev_queue_stats stats;
3785 unsigned int handle, major, minor;
3788 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3793 major = tc_get_major(handle);
3794 minor = tc_get_minor(handle);
3795 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3796 (*cb)(minor - 1, &stats, aux);

/* vtable binding the HFSC callbacks to the "linux-hfsc" QoS type. */
3801 static const struct tc_ops tc_ops_hfsc = {
3802 "hfsc", /* linux_name */
3803 "linux-hfsc", /* ovs_name */
3804 HFSC_N_QUEUES, /* n_queues */
3805 hfsc_tc_install, /* tc_install */
3806 hfsc_tc_load, /* tc_load */
3807 hfsc_tc_destroy, /* tc_destroy */
3808 hfsc_qdisc_get, /* qdisc_get */
3809 hfsc_qdisc_set, /* qdisc_set */
3810 hfsc_class_get, /* class_get */
3811 hfsc_class_set, /* class_set */
3812 hfsc_class_delete, /* class_delete */
3813 hfsc_class_get_stats, /* class_get_stats */
3814 hfsc_class_dump_stats /* class_dump_stats */
3817 /* "linux-default" traffic control class.
3819 * This class represents the default, unnamed Linux qdisc. It corresponds to
3820 * the "" (empty string) QoS type in the OVS database. */
/* Points 'netdev_' at a shared, immutable tc object for the default qdisc;
 * no kernel interaction is needed. */
3823 default_install__(struct netdev *netdev_)
3825 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3826 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
3828 /* Nothing but a tc class implementation is allowed to write to a tc. This
3829 * class never does that, so we can legitimately use a const tc object. */
3830 netdev->tc = CONST_CAST(struct tc *, &tc);

/* tc_install callback for the default class; 'details' are ignored. */
3834 default_tc_install(struct netdev *netdev,
3835 const struct smap *details OVS_UNUSED)
3837 default_install__(netdev);

/* tc_load callback for the default class; the dump message is ignored. */
3842 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3844 default_install__(netdev);

/* vtable for the "" (default) QoS type; no queue operations exist. */
3848 static const struct tc_ops tc_ops_default = {
3849 NULL, /* linux_name */
3854 NULL, /* tc_destroy */
3855 NULL, /* qdisc_get */
3856 NULL, /* qdisc_set */
3857 NULL, /* class_get */
3858 NULL, /* class_set */
3859 NULL, /* class_delete */
3860 NULL, /* class_get_stats */
3861 NULL /* class_dump_stats */
3864 /* "linux-other" traffic control class.
/* tc_load callback for unrecognized qdiscs ("linux-other"): attaches a
 * shared, immutable tc object; the dump message is ignored. */
3869 other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
3871 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3872 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
3874 /* Nothing but a tc class implementation is allowed to write to a tc. This
3875 * class never does that, so we can legitimately use a const tc object. */
3876 netdev->tc = CONST_CAST(struct tc *, &tc);

/* vtable for the "linux-other" QoS type: load-only, not installable. */
3880 static const struct tc_ops tc_ops_other = {
3881 NULL, /* linux_name */
3882 "linux-other", /* ovs_name */
3884 NULL, /* tc_install */
3886 NULL, /* tc_destroy */
3887 NULL, /* qdisc_get */
3888 NULL, /* qdisc_set */
3889 NULL, /* class_get */
3890 NULL, /* class_set */
3891 NULL, /* class_delete */
3892 NULL, /* class_get_stats */
3893 NULL /* class_dump_stats */
/* Traffic control. */

/* Number of kernel "tc" ticks per second. */
static double ticks_per_s;

/* Number of kernel "jiffies" per second.  This is used for the purpose of
 * computing buffer sizes.  Generally kernel qdiscs need to be able to buffer
 * one jiffy's worth of data.
 *
 * There are two possibilities here:
 *
 *    - 'buffer_hz' is the kernel's real timer tick rate, a small number in
 *      the approximate range of 100 to 1024.  That means that we really need
 *      to make sure that the qdisc can buffer that much data.
 *
 *    - 'buffer_hz' is an absurdly large number.  That means that the kernel
 *      has finely granular timers and there's no need to fudge additional
 *      room for buffers.  (There's no extra effort needed to implement that:
 *      the large 'buffer_hz' is used as a divisor, so practically any number
 *      will come out as 0 in the division.  Small integer results in the
 *      case of really high dividends won't have any real effect anyhow.)
 */
static unsigned int buffer_hz;
/* Returns tc handle 'major':'minor', i.e. 'major' in the upper 16 bits and
 * 'minor' in the lower 16 bits. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    return TC_H_MAKE(major << 16, minor);
}
/* Returns the major number from 'handle' (the upper 16 bits). */
static unsigned int
tc_get_major(unsigned int handle)
{
    return TC_H_MAJ(handle) >> 16;
}
/* Returns the minor number from 'handle' (the lower 16 bits). */
static unsigned int
tc_get_minor(unsigned int handle)
{
    return TC_H_MIN(handle);
}
3941 static struct tcmsg *
3942 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3943 struct ofpbuf *request)
3945 struct tcmsg *tcmsg;
3949 error = get_ifindex(netdev, &ifindex);
3954 ofpbuf_init(request, 512);
3955 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3956 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3957 tcmsg->tcm_family = AF_UNSPEC;
3958 tcmsg->tcm_ifindex = ifindex;
3959 /* Caller should fill in tcmsg->tcm_handle. */
3960 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' on the NETLINK_ROUTE socket and, if 'replyp' is nonnull,
 * stores the kernel's reply there.  Always uninitializes 'request' (so the
 * caller need not).  Returns 0 on success or a positive errno value. */
static int
tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
{
    int error = nl_transact(NETLINK_ROUTE, request, replyp);
    ofpbuf_uninit(request);
    return error;
}
3973 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3974 * policing configuration.
3976 * This function is equivalent to running the following when 'add' is true:
3977 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3979 * This function is equivalent to running the following when 'add' is false:
3980 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3982 * The configuration and stats may be seen with the following command:
3983 * /sbin/tc -s qdisc show dev <devname>
3985 * Returns 0 if successful, otherwise a positive errno value.
3988 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3990 struct ofpbuf request;
3991 struct tcmsg *tcmsg;
3993 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3994 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3996 tcmsg = tc_make_request(netdev, type, flags, &request);
4000 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
4001 tcmsg->tcm_parent = TC_H_INGRESS;
4002 nl_msg_put_string(&request, TCA_KIND, "ingress");
4003 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
4005 error = tc_transact(&request, NULL);
4007 /* If we're deleting the qdisc, don't worry about some of the
4008 * error conditions. */
4009 if (!add && (error == ENOENT || error == EINVAL)) {
4018 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
4021 * This function is equivalent to running:
4022 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
4023 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
4026 * The configuration and stats may be seen with the following command:
4027 * /sbin/tc -s filter show <devname> eth0 parent ffff:
4029 * Returns 0 if successful, otherwise a positive errno value.
4032 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
4034 struct tc_police tc_police;
4035 struct ofpbuf request;
4036 struct tcmsg *tcmsg;
4037 size_t basic_offset;
4038 size_t police_offset;
4042 memset(&tc_police, 0, sizeof tc_police);
4043 tc_police.action = TC_POLICE_SHOT;
4044 tc_police.mtu = mtu;
4045 tc_fill_rate(&tc_police.rate, ((uint64_t) kbits_rate * 1000)/8, mtu);
4046 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
4047 kbits_burst * 1024);
4049 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
4050 NLM_F_EXCL | NLM_F_CREATE, &request);
4054 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
4055 tcmsg->tcm_info = tc_make_handle(49,
4056 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
4058 nl_msg_put_string(&request, TCA_KIND, "basic");
4059 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4060 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
4061 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
4062 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
4063 nl_msg_end_nested(&request, police_offset);
4064 nl_msg_end_nested(&request, basic_offset);
4066 error = tc_transact(&request, NULL);
4077 /* The values in psched are not individually very meaningful, but they are
4078 * important. The tables below show some values seen in the wild.
4082 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
4083 * (Before that, there are hints that it was 1000000000.)
4085 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
4089 * -----------------------------------
4090 * [1] 000c8000 000f4240 000f4240 00000064
4091 * [2] 000003e8 00000400 000f4240 3b9aca00
4092 * [3] 000003e8 00000400 000f4240 3b9aca00
4093 * [4] 000003e8 00000400 000f4240 00000064
4094 * [5] 000003e8 00000040 000f4240 3b9aca00
4095 * [6] 000003e8 00000040 000f4240 000000f9
4097 * a b c d ticks_per_s buffer_hz
4098 * ------- --------- ---------- ------------- ----------- -------------
4099 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
4100 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4101 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4102 * [4] 1,000 1,024 1,000,000 100 976,562 100
4103 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
4104 * [6] 1,000 64 1,000,000 249 15,625,000 249
4106 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
4107 * [2] 2.6.26-1-686-bigmem from Debian lenny
4108 * [3] 2.6.26-2-sparc64 from Debian lenny
4109 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
4110 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
4111 * [6] 2.6.34 from kernel.org on KVM
4113 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4114 static const char fn[] = "/proc/net/psched";
4115 unsigned int a, b, c, d;
4118 if (!ovsthread_once_start(&once)) {
4125 stream = fopen(fn, "r");
4127 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
4131 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
4132 VLOG_WARN("%s: read failed", fn);
4136 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
4140 VLOG_WARN("%s: invalid scheduler parameters", fn);
4144 ticks_per_s = (double) a * c / b;
4148 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
4151 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
4154 ovsthread_once_done(&once);
4157 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
4158 * rate of 'rate' bytes per second. */
4160 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
4163 return (rate * ticks) / ticks_per_s;
4166 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
4167 * rate of 'rate' bytes per second. */
4169 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
4172 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
4175 /* Returns the number of bytes that need to be reserved for qdisc buffering at
4176 * a transmission rate of 'rate' bytes per second. */
4178 tc_buffer_per_jiffy(unsigned int rate)
4181 return rate / buffer_hz;
4184 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
4185 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
4186 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
4187 * stores NULL into it if it is absent.
4189 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
4192 * Returns 0 if successful, otherwise a positive errno value. */
4194 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
4195 struct nlattr **options)
4197 static const struct nl_policy tca_policy[] = {
4198 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
4199 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
4201 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4203 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4204 tca_policy, ta, ARRAY_SIZE(ta))) {
4205 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
4210 *kind = nl_attr_get_string(ta[TCA_KIND]);
4214 *options = ta[TCA_OPTIONS];
4229 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
4230 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
4231 * into '*options', and its queue statistics into '*stats'. Any of the output
4232 * arguments may be null.
4234 * Returns 0 if successful, otherwise a positive errno value. */
4236 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
4237 struct nlattr **options, struct netdev_queue_stats *stats)
4239 static const struct nl_policy tca_policy[] = {
4240 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
4241 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
4243 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4245 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4246 tca_policy, ta, ARRAY_SIZE(ta))) {
4247 VLOG_WARN_RL(&rl, "failed to parse class message");
4252 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
4253 *handlep = tc->tcm_handle;
4257 *options = ta[TCA_OPTIONS];
4261 const struct gnet_stats_queue *gsq;
4262 struct gnet_stats_basic gsb;
4264 static const struct nl_policy stats_policy[] = {
4265 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
4266 .min_len = sizeof gsb },
4267 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
4268 .min_len = sizeof *gsq },
4270 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
4272 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
4273 sa, ARRAY_SIZE(sa))) {
4274 VLOG_WARN_RL(&rl, "failed to parse class stats");
4278 /* Alignment issues screw up the length of struct gnet_stats_basic on
4279 * some arch/bitsize combinations. Newer versions of Linux have a
4280 * struct gnet_stats_basic_packed, but we can't depend on that. The
4281 * easiest thing to do is just to make a copy. */
4282 memset(&gsb, 0, sizeof gsb);
4283 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
4284 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
4285 stats->tx_bytes = gsb.bytes;
4286 stats->tx_packets = gsb.packets;
4288 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
4289 stats->tx_errors = gsq->drops;
4299 memset(stats, 0, sizeof *stats);
4304 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
4307 tc_query_class(const struct netdev *netdev,
4308 unsigned int handle, unsigned int parent,
4309 struct ofpbuf **replyp)
4311 struct ofpbuf request;
4312 struct tcmsg *tcmsg;
4315 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
4319 tcmsg->tcm_handle = handle;
4320 tcmsg->tcm_parent = parent;
4322 error = tc_transact(&request, replyp);
4324 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
4325 netdev_get_name(netdev),
4326 tc_get_major(handle), tc_get_minor(handle),
4327 tc_get_major(parent), tc_get_minor(parent),
4328 ovs_strerror(error));
4333 /* Equivalent to "tc class del dev <name> handle <handle>". */
4335 tc_delete_class(const struct netdev *netdev, unsigned int handle)
4337 struct ofpbuf request;
4338 struct tcmsg *tcmsg;
4341 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
4345 tcmsg->tcm_handle = handle;
4346 tcmsg->tcm_parent = 0;
4348 error = tc_transact(&request, NULL);
4350 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
4351 netdev_get_name(netdev),
4352 tc_get_major(handle), tc_get_minor(handle),
4353 ovs_strerror(error));
4358 /* Equivalent to "tc qdisc del dev <name> root". */
4360 tc_del_qdisc(struct netdev *netdev_)
4362 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4363 struct ofpbuf request;
4364 struct tcmsg *tcmsg;
4367 tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
4371 tcmsg->tcm_handle = tc_make_handle(1, 0);
4372 tcmsg->tcm_parent = TC_H_ROOT;
4374 error = tc_transact(&request, NULL);
4375 if (error == EINVAL) {
4376 /* EINVAL probably means that the default qdisc was in use, in which
4377 * case we've accomplished our purpose. */
4380 if (!error && netdev->tc) {
4381 if (netdev->tc->ops->tc_destroy) {
4382 netdev->tc->ops->tc_destroy(netdev->tc);
4389 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
4390 * kernel to determine what they are. Returns 0 if successful, otherwise a
4391 * positive errno value. */
4393 tc_query_qdisc(const struct netdev *netdev_)
4395 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4396 struct ofpbuf request, *qdisc;
4397 const struct tc_ops *ops;
4398 struct tcmsg *tcmsg;
4406 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4407 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4408 * 2.6.35 without that fix backported to it.
4410 * To avoid the OOPS, we must not make a request that would attempt to dump
4411 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4412 * few others. There are a few ways that I can see to do this, but most of
4413 * them seem to be racy (and if you lose the race the kernel OOPSes). The
4414 * technique chosen here is to assume that any non-default qdisc that we
4415 * create will have a class with handle 1:0. The built-in qdiscs only have
4416 * a class with handle 0:0.
4418 * We could check for Linux 2.6.35+ and use a more straightforward method
4420 tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
4424 tcmsg->tcm_handle = tc_make_handle(1, 0);
4425 tcmsg->tcm_parent = 0;
4427 /* Figure out what tc class to instantiate. */
4428 error = tc_transact(&request, &qdisc);
4432 error = tc_parse_qdisc(qdisc, &kind, NULL);
4434 ops = &tc_ops_other;
4436 ops = tc_lookup_linux_name(kind);
4438 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4439 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
4441 ops = &tc_ops_other;
4444 } else if (error == ENOENT) {
4445 /* Either it's a built-in qdisc, or it's a qdisc set up by some
4446 * other entity that doesn't have a handle 1:0. We will assume
4447 * that it's the system default qdisc. */
4448 ops = &tc_ops_default;
4451 /* Who knows? Maybe the device got deleted. */
4452 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
4453 netdev_get_name(netdev_), ovs_strerror(error));
4454 ops = &tc_ops_other;
4457 /* Instantiate it. */
4458 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
4459 ovs_assert((load_error == 0) == (netdev->tc != NULL));
4460 ofpbuf_delete(qdisc);
4462 return error ? error : load_error;
4465 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4466 approximate the time to transmit packets of various lengths. For an MTU of
4467 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4468 represents two possible packet lengths; for a MTU of 513 through 1024, four
4469 possible lengths; and so on.
4471 Returns, for the specified 'mtu', the number of bits that packet lengths
4472 need to be shifted right to fit within such a 256-entry table. */
4474 tc_calc_cell_log(unsigned int mtu)
4479 mtu = ETH_PAYLOAD_MAX;
4481 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
4483 for (cell_log = 0; mtu >= 256; cell_log++) {
4490 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
4493 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4495 memset(rate, 0, sizeof *rate);
4496 rate->cell_log = tc_calc_cell_log(mtu);
4497 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4498 /* rate->cell_align = 0; */ /* distro headers. */
4499 rate->mpu = ETH_TOTAL_MIN;
/* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
 * attribute of the specified "type".
 *
 * See tc_calc_cell_log() above for a description of "rtab"s. */
static void
tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
{
    uint32_t *rtab;
    unsigned int i;

    rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
    for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
        unsigned packet_size = (i + 1) << rate->cell_log;
        if (packet_size < rate->mpu) {
            packet_size = rate->mpu;
        }
        rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
    }
}
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes'
 * of 0 yields the minimum buffering needed for one jiffy plus one MTU.) */
static unsigned int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
    return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
}
4534 /* Linux-only functions declared in netdev-linux.h */
4536 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4537 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
4539 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4540 const char *flag_name, bool enable)
4542 const char *netdev_name = netdev_get_name(netdev);
4543 struct ethtool_value evalue;
4547 COVERAGE_INC(netdev_get_ethtool);
4548 memset(&evalue, 0, sizeof evalue);
4549 error = netdev_linux_do_ethtool(netdev_name,
4550 (struct ethtool_cmd *)&evalue,
4551 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4556 COVERAGE_INC(netdev_set_ethtool);
4557 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4558 error = netdev_linux_do_ethtool(netdev_name,
4559 (struct ethtool_cmd *)&evalue,
4560 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
4565 COVERAGE_INC(netdev_get_ethtool);
4566 memset(&evalue, 0, sizeof evalue);
4567 error = netdev_linux_do_ethtool(netdev_name,
4568 (struct ethtool_cmd *)&evalue,
4569 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4574 if (new_flags != evalue.data) {
4575 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4576 "device %s failed", enable ? "enable" : "disable",
4577 flag_name, netdev_name);
4584 /* Utility functions. */
4586 /* Copies 'src' into 'dst', performing format conversion in the process. */
4588 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4589 const struct rtnl_link_stats *src)
4591 dst->rx_packets = src->rx_packets;
4592 dst->tx_packets = src->tx_packets;
4593 dst->rx_bytes = src->rx_bytes;
4594 dst->tx_bytes = src->tx_bytes;
4595 dst->rx_errors = src->rx_errors;
4596 dst->tx_errors = src->tx_errors;
4597 dst->rx_dropped = src->rx_dropped;
4598 dst->tx_dropped = src->tx_dropped;
4599 dst->multicast = src->multicast;
4600 dst->collisions = src->collisions;
4601 dst->rx_length_errors = src->rx_length_errors;
4602 dst->rx_over_errors = src->rx_over_errors;
4603 dst->rx_crc_errors = src->rx_crc_errors;
4604 dst->rx_frame_errors = src->rx_frame_errors;
4605 dst->rx_fifo_errors = src->rx_fifo_errors;
4606 dst->rx_missed_errors = src->rx_missed_errors;
4607 dst->tx_aborted_errors = src->tx_aborted_errors;
4608 dst->tx_carrier_errors = src->tx_carrier_errors;
4609 dst->tx_fifo_errors = src->tx_fifo_errors;
4610 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4611 dst->tx_window_errors = src->tx_window_errors;
4615 get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
4617 struct ofpbuf request;
4618 struct ofpbuf *reply;
4621 ofpbuf_init(&request, 0);
4622 nl_msg_put_nlmsghdr(&request,
4623 sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
4624 RTM_GETLINK, NLM_F_REQUEST);
4625 ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
4626 nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
4627 error = nl_transact(NETLINK_ROUTE, &request, &reply);
4628 ofpbuf_uninit(&request);
4633 if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
4634 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS);
4635 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
4636 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
4639 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4643 VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");
4648 ofpbuf_delete(reply);
4653 get_flags(const struct netdev *dev, unsigned int *flags)
4659 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
4661 *flags = ifr.ifr_flags;
4667 set_flags(const char *name, unsigned int flags)
4671 ifr.ifr_flags = flags;
4672 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
4676 do_get_ifindex(const char *netdev_name)
4681 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4682 COVERAGE_INC(netdev_get_ifindex);
4684 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
4686 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4687 netdev_name, ovs_strerror(error));
4690 return ifr.ifr_ifindex;
4694 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4696 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4698 if (!(netdev->cache_valid & VALID_IFINDEX)) {
4699 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4702 netdev->get_ifindex_error = -ifindex;
4703 netdev->ifindex = 0;
4705 netdev->get_ifindex_error = 0;
4706 netdev->ifindex = ifindex;
4708 netdev->cache_valid |= VALID_IFINDEX;
4711 *ifindexp = netdev->ifindex;
4712 return netdev->get_ifindex_error;
4716 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4722 memset(&ifr, 0, sizeof ifr);
4723 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4724 COVERAGE_INC(netdev_get_hwaddr);
4725 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
4727 /* ENODEV probably means that a vif disappeared asynchronously and
4728 * hasn't been removed from the database yet, so reduce the log level
4729 * to INFO for that case. */
4730 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
4731 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4732 netdev_name, ovs_strerror(error));
4735 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4736 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4737 VLOG_WARN("%s device has unknown hardware address family %d",
4738 netdev_name, hwaddr_family);
4740 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
4745 set_etheraddr(const char *netdev_name,
4746 const uint8_t mac[ETH_ADDR_LEN])
4751 memset(&ifr, 0, sizeof ifr);
4752 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4753 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
4754 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4755 COVERAGE_INC(netdev_set_hwaddr);
4756 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
4758 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4759 netdev_name, ovs_strerror(error));
4765 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4766 int cmd, const char *cmd_name)
4771 memset(&ifr, 0, sizeof ifr);
4772 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4773 ifr.ifr_data = (caddr_t) ecmd;
4776 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
4778 if (error != EOPNOTSUPP) {
4779 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4780 "failed: %s", cmd_name, name, ovs_strerror(error));
4782 /* The device doesn't support this operation. That's pretty
4783 * common, so there's no point in logging anything. */
4790 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4791 int cmd, const char *cmd_name)
4796 ifr.ifr_addr.sa_family = AF_INET;
4797 error = af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
4799 const struct sockaddr_in *sin = ALIGNED_CAST(struct sockaddr_in *,
4801 *ip = sin->sin_addr;
4806 /* Returns an AF_PACKET raw socket or a negative errno value. */
4808 af_packet_sock(void)
4810 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4813 if (ovsthread_once_start(&once)) {
4814 sock = socket(AF_PACKET, SOCK_RAW, 0);
4816 int error = set_nonblocking(sock);
4823 VLOG_ERR("failed to create packet socket: %s",
4824 ovs_strerror(errno));
4826 ovsthread_once_done(&once);