2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
23 #include <arpa/inet.h>
25 #include <linux/filter.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <sys/types.h>
37 #include <sys/ioctl.h>
38 #include <sys/socket.h>
39 #include <netpacket/packet.h>
41 #include <net/if_arp.h>
42 #include <net/if_packet.h>
43 #include <net/route.h>
44 #include <netinet/in.h>
51 #include "dpif-netlink.h"
52 #include "dpif-netdev.h"
53 #include "dynamic-string.h"
54 #include "fatal-signal.h"
57 #include "netdev-provider.h"
58 #include "netdev-vport.h"
59 #include "netlink-notifier.h"
60 #include "netlink-socket.h"
63 #include "openflow/openflow.h"
64 #include "ovs-atomic.h"
65 #include "packet-dpif.h"
67 #include "poll-loop.h"
68 #include "rtnetlink-link.h"
70 #include "socket-util.h"
73 #include "unaligned.h"
/* Per-module logging tag plus coverage counters for the expensive/interesting
 * operations below (policing, ARP lookup, ifindex/hwaddr/ethtool queries). */
76 VLOG_DEFINE_THIS_MODULE(netdev_linux);
78 COVERAGE_DEFINE(netdev_set_policing);
79 COVERAGE_DEFINE(netdev_arp_lookup);
80 COVERAGE_DEFINE(netdev_get_ifindex);
81 COVERAGE_DEFINE(netdev_get_hwaddr);
82 COVERAGE_DEFINE(netdev_set_hwaddr);
83 COVERAGE_DEFINE(netdev_get_ethtool);
84 COVERAGE_DEFINE(netdev_set_ethtool);
/* Compatibility shims: define ethtool/tc/packet-socket constants that may be
 * absent from older kernel headers, so the rest of the file can use them
 * unconditionally. */
87 /* These were introduced in Linux 2.6.14, so they might be missing if we have
89 #ifndef ADVERTISED_Pause
90 #define ADVERTISED_Pause (1 << 13)
92 #ifndef ADVERTISED_Asym_Pause
93 #define ADVERTISED_Asym_Pause (1 << 14)
96 /* These were introduced in Linux 2.6.24, so they might be missing if we
97 * have old headers. */
98 #ifndef ETHTOOL_GFLAGS
99 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
101 #ifndef ETHTOOL_SFLAGS
102 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
105 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
108 #define TC_RTAB_SIZE 1024
111 /* Linux 2.6.21 introduced struct tpacket_auxdata.
112 * Linux 2.6.27 added the tp_vlan_tci member.
113 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
114 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
115 * TP_STATUS_VLAN_TPID_VALID.
117 * With all this churn it's easiest to unconditionally define a replacement
118 * structure that has everything we want.
120 #ifndef PACKET_AUXDATA
121 #define PACKET_AUXDATA 8
123 #ifndef TP_STATUS_VLAN_VALID
124 #define TP_STATUS_VLAN_VALID (1 << 4)
126 #ifndef TP_STATUS_VLAN_TPID_VALID
127 #define TP_STATUS_VLAN_TPID_VALID (1 << 6)
/* Replacement tpacket_auxdata (renamed via the #define below) so tp_vlan_tci
 * and tp_vlan_tpid are always available regardless of kernel header age. */
129 #undef tpacket_auxdata
130 #define tpacket_auxdata rpl_tpacket_auxdata
131 struct tpacket_auxdata {
137 uint16_t tp_vlan_tci;
138 uint16_t tp_vlan_tpid;
/* Bits for netdev_linux's 'cache_valid' member: which on-demand cached
 * fields below are currently up to date. */
142 VALID_IFINDEX = 1 << 0,
143 VALID_ETHERADDR = 1 << 1,
147 VALID_POLICING = 1 << 5,
148 VALID_VPORT_STAT_ERROR = 1 << 6,
149 VALID_DRVINFO = 1 << 7,
150 VALID_FEATURES = 1 << 8,
/* ---- Traffic control (tc) abstraction ----
 *
 * 'struct tc' is the base class for a qdisc instance on a netdev;
 * 'struct tc_queue' is one queue within it; 'struct tc_ops' is the vtable a
 * particular qdisc implementation (htb, hfsc, ...) fills in. */
153 /* Traffic control. */
155 /* An instance of a traffic control class. Always associated with a particular
158 * Each TC implementation subclasses this with whatever additional data it
161 const struct tc_ops *ops;
162 struct hmap queues; /* Contains "struct tc_queue"s.
163 * Read by generic TC layer.
164 * Written only by TC implementation. */
167 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
169 /* One traffic control queue.
171 * Each TC implementation subclasses this with whatever additional data it
174 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
175 unsigned int queue_id; /* OpenFlow queue ID. */
176 long long int created; /* Time queue was created, in msecs. */
179 /* A particular kind of traffic control. Each implementation generally maps to
180 * one particular Linux qdisc class.
182 * The functions below return 0 if successful or a positive errno value on
183 * failure, except where otherwise noted. All of them must be provided, except
184 * where otherwise noted. */
186 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
187 * This is null for tc_ops_default and tc_ops_other, for which there are no
188 * appropriate values. */
189 const char *linux_name;
191 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
192 const char *ovs_name;
194 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
195 * queues. The queues are numbered 0 through n_queues - 1. */
196 unsigned int n_queues;
198 /* Called to install this TC class on 'netdev'. The implementation should
199 * make the Netlink calls required to set up 'netdev' with the right qdisc
200 * and configure it according to 'details'. The implementation may assume
201 * that the current qdisc is the default; that is, there is no need for it
202 * to delete the current qdisc before installing itself.
204 * The contents of 'details' should be documented as valid for 'ovs_name'
205 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
206 * (which is built as ovs-vswitchd.conf.db(8)).
208 * This function must return 0 if and only if it sets 'netdev->tc' to an
209 * initialized 'struct tc'.
211 * (This function is null for tc_ops_other, which cannot be installed. For
212 * other TC classes it should always be nonnull.) */
213 int (*tc_install)(struct netdev *netdev, const struct smap *details);
215 /* Called when the netdev code determines (through a Netlink query) that
216 * this TC class's qdisc is installed on 'netdev', but we didn't install
217 * it ourselves and so don't know any of the details.
219 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
220 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
221 * implementation should parse the other attributes of 'nlmsg' as
222 * necessary to determine its configuration. If necessary it should also
223 * use Netlink queries to determine the configuration of queues on
226 * This function must return 0 if and only if it sets 'netdev->tc' to an
227 * initialized 'struct tc'. */
228 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
230 /* Destroys the data structures allocated by the implementation as part of
231 * 'tc'. (This includes destroying 'tc->queues' by calling
234 * The implementation should not need to perform any Netlink calls. If
235 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
236 * (But it may not be desirable.)
238 * This function may be null if 'tc' is trivial. */
239 void (*tc_destroy)(struct tc *tc);
241 /* Retrieves details of 'netdev->tc' configuration into 'details'.
243 * The implementation should not need to perform any Netlink calls, because
244 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
245 * cached the configuration.
247 * The contents of 'details' should be documented as valid for 'ovs_name'
248 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
249 * (which is built as ovs-vswitchd.conf.db(8)).
251 * This function may be null if 'tc' is not configurable.
253 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
255 /* Reconfigures 'netdev->tc' according to 'details', performing any
256 * required Netlink calls to complete the reconfiguration.
258 * The contents of 'details' should be documented as valid for 'ovs_name'
259 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
260 * (which is built as ovs-vswitchd.conf.db(8)).
262 * This function may be null if 'tc' is not configurable.
264 int (*qdisc_set)(struct netdev *, const struct smap *details);
266 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
267 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
269 * The contents of 'details' should be documented as valid for 'ovs_name'
270 * in the "other_config" column in the "Queue" table in
271 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
273 * The implementation should not need to perform any Netlink calls, because
274 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
275 * cached the queue configuration.
277 * This function may be null if 'tc' does not have queues ('n_queues' is
279 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
280 struct smap *details);
282 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
283 * 'details', perfoming any required Netlink calls to complete the
284 * reconfiguration. The caller ensures that 'queue_id' is less than
287 * The contents of 'details' should be documented as valid for 'ovs_name'
288 * in the "other_config" column in the "Queue" table in
289 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
291 * This function may be null if 'tc' does not have queues or its queues are
292 * not configurable. */
293 int (*class_set)(struct netdev *, unsigned int queue_id,
294 const struct smap *details);
296 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
297 * tc_queue's within 'netdev->tc->queues'.
299 * This function may be null if 'tc' does not have queues or its queues
300 * cannot be deleted. */
301 int (*class_delete)(struct netdev *, struct tc_queue *queue);
303 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
304 * 'struct tc_queue's within 'netdev->tc->queues'.
306 * On success, initializes '*stats'.
308 * This function may be null if 'tc' does not have queues or if it cannot
309 * report queue statistics. */
310 int (*class_get_stats)(const struct netdev *netdev,
311 const struct tc_queue *queue,
312 struct netdev_queue_stats *stats);
314 /* Extracts queue stats from 'nlmsg', which is a response to a
315 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
317 * This function may be null if 'tc' does not have queues or if it cannot
318 * report queue statistics. */
319 int (*class_dump_stats)(const struct netdev *netdev,
320 const struct ofpbuf *nlmsg,
321 netdev_dump_queue_stats_cb *cb, void *aux);
/* Initializes 'tc' as a generic TC base object for implementation 'ops'. */
325 tc_init(struct tc *tc, const struct tc_ops *ops)
328 hmap_init(&tc->queues);
/* Releases the generic part of 'tc' (the queues hmap); implementation data
 * is freed by the subclass's tc_destroy hook. */
332 tc_destroy(struct tc *tc)
334 hmap_destroy(&tc->queues);
/* All known TC implementations; searched in order when matching a kernel
 * qdisc, so tc_ops_other (the catch-all) must come last. */
337 static const struct tc_ops tc_ops_htb;
338 static const struct tc_ops tc_ops_hfsc;
339 static const struct tc_ops tc_ops_default;
340 static const struct tc_ops tc_ops_other;
342 static const struct tc_ops *const tcs[] = {
343 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
344 &tc_ops_hfsc, /* Hierarchical fair service curve. */
345 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
346 &tc_ops_other, /* Some other qdisc. */
/* Low-level TC helpers: handle packing, rate/tick conversion, and Netlink
 * request/reply plumbing shared by the qdisc implementations. */
350 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
351 static unsigned int tc_get_major(unsigned int handle);
352 static unsigned int tc_get_minor(unsigned int handle);
354 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
355 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
356 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
358 static struct tcmsg *tc_make_request(const struct netdev *, int type,
359 unsigned int flags, struct ofpbuf *);
360 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
361 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
362 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
365 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
366 struct nlattr **options);
367 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
368 struct nlattr **options,
369 struct netdev_queue_stats *);
370 static int tc_query_class(const struct netdev *,
371 unsigned int handle, unsigned int parent,
372 struct ofpbuf **replyp);
373 static int tc_delete_class(const struct netdev *, unsigned int handle);
375 static int tc_del_qdisc(struct netdev *netdev);
376 static int tc_query_qdisc(const struct netdev *netdev);
378 static int tc_calc_cell_log(unsigned int mtu);
379 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
380 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
381 const struct tc_ratespec *rate);
382 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
/* State for one Linux network device. Fields below 'mutex' are lazily
 * populated ("on demand") and their freshness is tracked by 'cache_valid'
 * using the VALID_* bits defined above. */
384 struct netdev_linux {
387 /* Protects all members below. */
388 struct ovs_mutex mutex;
390 unsigned int cache_valid;
392 bool miimon; /* Link status of last poll. */
393 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
394 struct timer miimon_timer;
396 /* The following are figured out "on demand" only. They are only valid
397 * when the corresponding VALID_* bit in 'cache_valid' is set. */
399 uint8_t etheraddr[ETH_ADDR_LEN];
400 struct in_addr address, netmask;
403 unsigned int ifi_flags;
404 long long int carrier_resets;
405 uint32_t kbits_rate; /* Policing data. */
406 uint32_t kbits_burst;
407 int vport_stats_error; /* Cached error code from vport_get_stats().
408 0 or an errno value. */
409 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
410 int ether_addr_error; /* Cached error code from set/get etheraddr. */
411 int netdev_policing_error; /* Cached error code from set policing. */
412 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
413 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
415 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
416 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
417 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
419 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
422 /* For devices of class netdev_tap_class only. */
/* Receive-queue state: 'up' is the generic base; the fd is either the tap
 * fd or a bound AF_PACKET socket (see netdev_linux_rxq_construct()). */
426 struct netdev_rxq_linux {
427 struct netdev_rxq up;
432 /* This is set pretty low because we probably won't learn anything from the
433 * additional log messages. */
434 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
436 /* Polling miimon status for all ports causes performance degradation when
437 * handling a large number of ports. If there are no devices using miimon, then
438 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
440 * Readers do not depend on this variable synchronizing with the related
441 * changes in the device miimon status, so we can use atomic_count. */
442 static atomic_count miimon_cnt = ATOMIC_COUNT_INIT(0);
/* Forward declarations for the ioctl/ethtool/netlink helpers used below. */
444 static void netdev_linux_run(void);
446 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
447 int cmd, const char *cmd_name);
448 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
449 int cmd, const char *cmd_name);
450 static int get_flags(const struct netdev *, unsigned int *flags);
451 static int set_flags(const char *, unsigned int flags);
452 static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
453 enum netdev_flags on, enum netdev_flags *old_flagsp)
454 OVS_REQUIRES(netdev->mutex);
455 static int do_get_ifindex(const char *netdev_name);
456 static int get_ifindex(const struct netdev *, int *ifindexp);
457 static int do_set_addr(struct netdev *netdev,
458 int ioctl_nr, const char *ioctl_name,
459 struct in_addr addr);
460 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
461 static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
462 static int get_stats_via_netlink(const struct netdev *, struct netdev_stats *);
463 static int af_packet_sock(void);
464 static bool netdev_linux_miimon_enabled(void);
465 static void netdev_linux_miimon_run(void);
466 static void netdev_linux_miimon_wait(void);
467 static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup);
/* True if 'netdev_class' is one of this file's classes; identified by its
 * 'run' callback rather than by name. */
470 is_netdev_linux_class(const struct netdev_class *netdev_class)
472 return netdev_class->run == netdev_linux_run;
/* True if 'netdev' is specifically a tap device (as opposed to "system" or
 * "internal"). */
476 is_tap_netdev(const struct netdev *netdev)
478 return netdev_get_class(netdev) == &netdev_tap_class;
/* Downcasts generic 'netdev' to its netdev_linux container, asserting that
 * it really belongs to a class from this file. */
481 static struct netdev_linux *
482 netdev_linux_cast(const struct netdev *netdev)
484 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
486 return CONTAINER_OF(netdev, struct netdev_linux, up);
/* Same downcast for a receive queue. */
489 static struct netdev_rxq_linux *
490 netdev_rxq_linux_cast(const struct netdev_rxq *rx)
492 ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
493 return CONTAINER_OF(rx, struct netdev_rxq_linux, up);
496 static void netdev_linux_update(struct netdev_linux *netdev,
497 const struct rtnetlink_link_change *)
498 OVS_REQUIRES(netdev->mutex);
499 static void netdev_linux_changed(struct netdev_linux *netdev,
500 unsigned int ifi_flags, unsigned int mask)
501 OVS_REQUIRES(netdev->mutex);
503 /* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK changes, or NULL
504 * if no such socket could be created. */
505 static struct nl_sock *
506 netdev_linux_notify_sock(void)
/* One-time initialization: create the socket and join the link multicast
 * group; on failure the socket is destroyed (and NULL is returned). */
508 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
509 static struct nl_sock *sock;
511 if (ovsthread_once_start(&once)) {
514 error = nl_sock_create(NETLINK_ROUTE, &sock);
516 error = nl_sock_join_mcgroup(sock, RTNLGRP_LINK);
518 nl_sock_destroy(sock);
522 ovsthread_once_done(&once);
/* Cheap atomic check whether any device currently uses miimon polling. */
529 netdev_linux_miimon_enabled(void)
531 return atomic_count_get(&miimon_cnt) > 0;
/* Periodic work for all netdev-linux devices: runs miimon polling (if any
 * device uses it) and drains the rtnetlink notification socket, pushing each
 * link-change message into the matching netdev's cached state. On ENOBUFS
 * (notifications were dropped by the kernel) it falls back to refreshing the
 * flags of every device of this class. */
535 netdev_linux_run(void)
537 struct nl_sock *sock;
540 if (netdev_linux_miimon_enabled()) {
541 netdev_linux_miimon_run();
544 sock = netdev_linux_notify_sock();
550 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
551 uint64_t buf_stub[4096 / 8];
554 ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
555 error = nl_sock_recv(sock, &buf, false);
557 struct rtnetlink_link_change change;
559 if (rtnetlink_link_parse(&buf, &change)) {
560 struct netdev *netdev_ = netdev_from_name(change.ifname);
561 if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
562 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
564 ovs_mutex_lock(&netdev->mutex);
565 netdev_linux_update(netdev, &change);
566 ovs_mutex_unlock(&netdev->mutex);
568 netdev_close(netdev_);
/* ENOBUFS: the kernel dropped notifications; resync every device's flags. */
570 } else if (error == ENOBUFS) {
571 struct shash device_shash;
572 struct shash_node *node;
576 shash_init(&device_shash);
577 netdev_get_devices(&netdev_linux_class, &device_shash);
578 SHASH_FOR_EACH (node, &device_shash) {
579 struct netdev *netdev_ = node->data;
580 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
583 ovs_mutex_lock(&netdev->mutex);
584 get_flags(netdev_, &flags);
585 netdev_linux_changed(netdev, flags, 0);
586 ovs_mutex_unlock(&netdev->mutex);
588 netdev_close(netdev_);
590 shash_destroy(&device_shash);
591 } else if (error != EAGAIN) {
592 VLOG_WARN_RL(&rl, "error reading or parsing netlink (%s)",
593 ovs_strerror(error));
/* Registers wakeups for the work netdev_linux_run() would do: miimon timer
 * (indirectly) and readability of the rtnetlink notification socket. */
600 netdev_linux_wait(void)
602 struct nl_sock *sock;
604 if (netdev_linux_miimon_enabled()) {
605 netdev_linux_miimon_wait();
607 sock = netdev_linux_notify_sock();
609 nl_sock_wait(sock, POLLIN);
/* Records that 'dev' changed: bumps the change sequence number, counts a
 * carrier reset if IFF_RUNNING toggled, stores the new flags, and clears
 * every cache_valid bit not present in 'mask' (mask == 0 invalidates all
 * cached fields). Caller holds dev->mutex. */
614 netdev_linux_changed(struct netdev_linux *dev,
615 unsigned int ifi_flags, unsigned int mask)
616 OVS_REQUIRES(dev->mutex)
618 netdev_change_seq_changed(&dev->up);
620 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
621 dev->carrier_resets++;
623 dev->ifi_flags = ifi_flags;
625 dev->cache_valid &= mask;
/* Applies an rtnetlink link-change message to 'dev'. For RTM_NEWLINK the
 * message itself carries MTU, MAC and ifindex, so those caches are refreshed
 * directly (keeping VALID_DRVINFO); otherwise (RTM_DELLINK) everything
 * cached is invalidated. Caller holds dev->mutex. */
629 netdev_linux_update(struct netdev_linux *dev,
630 const struct rtnetlink_link_change *change)
631 OVS_REQUIRES(dev->mutex)
633 if (change->nlmsg_type == RTM_NEWLINK) {
635 netdev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
637 /* Update netdev from rtnl-change msg. */
639 dev->mtu = change->mtu;
640 dev->cache_valid |= VALID_MTU;
641 dev->netdev_mtu_error = 0;
/* A zero address in the message means "no address included", not all-zeros. */
644 if (!eth_addr_is_zero(change->addr)) {
645 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
646 dev->cache_valid |= VALID_ETHERADDR;
647 dev->ether_addr_error = 0;
650 dev->ifindex = change->ifi_index;
651 dev->cache_valid |= VALID_IFINDEX;
652 dev->get_ifindex_error = 0;
655 netdev_linux_changed(dev, change->ifi_flags, 0);
/* netdev-provider 'alloc' hook: zeroed allocation of the container struct. */
659 static struct netdev *
660 netdev_linux_alloc(void)
662 struct netdev_linux *netdev = xzalloc(sizeof *netdev);
/* Construction shared by the system/internal and tap constructors. */
667 netdev_linux_common_construct(struct netdev_linux *netdev)
669 ovs_mutex_init(&netdev->mutex);
672 /* Creates system and internal devices. */
674 netdev_linux_construct(struct netdev *netdev_)
676 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
679 netdev_linux_common_construct(netdev);
681 error = get_flags(&netdev->up, &netdev->ifi_flags);
682 if (error == ENODEV) {
683 if (netdev->up.netdev_class != &netdev_internal_class) {
684 /* The device does not exist, so don't allow it to be opened. */
687 /* "Internal" netdevs have to be created as netdev objects before
688 * they exist in the kernel, because creating them in the kernel
689 * happens by passing a netdev object to dpif_port_add().
690 * Therefore, ignore the error. */
697 /* For most types of netdevs we open the device for each call of
698 * netdev_open(). However, this is not the case with tap devices,
699 * since it is only possible to open the device once. In this
700 * situation we share a single file descriptor, and consequently
701 * buffers, across all readers. Therefore once data is read it will
702 * be unavailable to other reads for tap devices. */
704 netdev_linux_construct_tap(struct netdev *netdev_)
706 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
707 static const char tap_dev[] = "/dev/net/tun";
708 const char *name = netdev_->name;
712 netdev_linux_common_construct(netdev);
714 /* Open tap device. */
715 netdev->tap_fd = open(tap_dev, O_RDWR);
716 if (netdev->tap_fd < 0) {
718 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
722 /* Create tap device. */
723 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
724 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
725 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
726 VLOG_WARN("%s: creating tap device failed: %s", name,
727 ovs_strerror(errno));
732 /* Make non-blocking. */
733 error = set_nonblocking(netdev->tap_fd);
/* Error path: close the tap fd so the construct failure doesn't leak it. */
741 close(netdev->tap_fd);
/* Tears down a device: destroys any TC state, closes the tap fd (tap class
 * only), drops the miimon refcount if this device was polling, and destroys
 * the mutex. */
746 netdev_linux_destruct(struct netdev *netdev_)
748 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
750 if (netdev->tc && netdev->tc->ops->tc_destroy) {
751 netdev->tc->ops->tc_destroy(netdev->tc);
754 if (netdev_get_class(netdev_) == &netdev_tap_class
755 && netdev->tap_fd >= 0)
757 close(netdev->tap_fd);
760 if (netdev->miimon_interval > 0) {
761 atomic_count_dec(&miimon_cnt);
764 ovs_mutex_destroy(&netdev->mutex);
/* 'dealloc' hook: frees the container allocated by netdev_linux_alloc(). */
768 netdev_linux_dealloc(struct netdev *netdev_)
770 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* rxq 'alloc' hook: zeroed allocation of the rx container. */
774 static struct netdev_rxq *
775 netdev_linux_rxq_alloc(void)
777 struct netdev_rxq_linux *rx = xzalloc(sizeof *rx);
/* Sets up a receive queue. For tap devices the shared tap fd is reused; for
 * everything else a raw AF_PACKET socket is created, given PACKET_AUXDATA
 * (for VLAN reconstruction), made non-blocking, bound to the device's
 * ifindex, and fitted with a BPF filter that accepts inbound packets only
 * (so our own transmissions are not looped back to us). */
782 netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
784 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
785 struct netdev *netdev_ = rx->up.netdev;
786 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
789 ovs_mutex_lock(&netdev->mutex);
790 rx->is_tap = is_tap_netdev(netdev_);
792 rx->fd = netdev->tap_fd;
794 struct sockaddr_ll sll;
796 /* Result of tcpdump -dd inbound */
797 static const struct sock_filter filt[] = {
798 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
799 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
800 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
801 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
803 static const struct sock_fprog fprog = {
804 ARRAY_SIZE(filt), (struct sock_filter *) filt
807 /* Create file descriptor. */
808 rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
811 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
/* Request tpacket_auxdata control messages so VLAN tags stripped by the
 * kernel can be re-inserted in netdev_linux_rxq_recv_sock(). */
816 if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) {
818 VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
819 netdev_get_name(netdev_), ovs_strerror(error));
823 /* Set non-blocking mode. */
824 error = set_nonblocking(rx->fd);
829 /* Get ethernet device index. */
830 error = get_ifindex(&netdev->up, &ifindex);
835 /* Bind to specific ethernet device. */
836 memset(&sll, 0, sizeof sll);
837 sll.sll_family = AF_PACKET;
838 sll.sll_ifindex = ifindex;
839 sll.sll_protocol = htons(ETH_P_ALL);
840 if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
842 VLOG_ERR("%s: failed to bind raw socket (%s)",
843 netdev_get_name(netdev_), ovs_strerror(error));
847 /* Filter for only inbound packets. */
848 error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
852 VLOG_ERR("%s: failed to attach filter (%s)",
853 netdev_get_name(netdev_), ovs_strerror(error));
857 ovs_mutex_unlock(&netdev->mutex);
/* Error path: unlock before returning the error. */
865 ovs_mutex_unlock(&netdev->mutex);
/* rxq 'destruct' hook (closes the rx fd for non-tap queues). */
870 netdev_linux_rxq_destruct(struct netdev_rxq *rxq_)
872 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
/* rxq 'dealloc' hook: frees the container from netdev_linux_rxq_alloc(). */
880 netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_)
882 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
/* Returns the TPID to re-insert for a kernel-stripped VLAN tag: the TPID
 * reported by the kernel when available (Linux >= 3.13), else 802.1Q. */
888 auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux)
890 if (aux->tp_status & TP_STATUS_VLAN_TPID_VALID) {
891 return htons(aux->tp_vlan_tpid);
893 return htons(ETH_TYPE_VLAN);
/* True if 'aux' carries a stripped VLAN TCI. The bare tp_vlan_tci check
 * covers pre-3.0 kernels that lack TP_STATUS_VLAN_VALID (at the cost of
 * missing a genuine all-zero TCI there). */
898 auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
900 return aux->tp_vlan_tci || aux->tp_status & TP_STATUS_VLAN_VALID;
/* Receives one packet from AF_PACKET socket 'fd' into 'buffer', using
 * recvmsg() with PACKET_AUXDATA control data so that a VLAN tag stripped by
 * the kernel can be pushed back into the frame. Returns 0 on success or a
 * positive errno value (EMSGSIZE when the packet was truncated). */
904 netdev_linux_rxq_recv_sock(int fd, struct ofpbuf *buffer)
909 struct cmsghdr *cmsg;
912 char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
916 /* Reserve headroom for a single VLAN tag */
917 ofpbuf_reserve(buffer, VLAN_HEADER_LEN);
918 size = ofpbuf_tailroom(buffer);
920 iov.iov_base = ofpbuf_data(buffer);
922 msgh.msg_name = NULL;
923 msgh.msg_namelen = 0;
926 msgh.msg_control = &cmsg_buffer;
927 msgh.msg_controllen = sizeof cmsg_buffer;
/* MSG_TRUNC makes recvmsg() return the full packet length even when it did
 * not fit, letting the caller detect truncation; retry on EINTR. */
931 retval = recvmsg(fd, &msgh, MSG_TRUNC);
932 } while (retval < 0 && errno == EINTR);
936 } else if (retval > size) {
940 ofpbuf_set_size(buffer, ofpbuf_size(buffer) + retval);
/* Walk the control messages looking for the auxdata; if it reports a
 * stripped VLAN tag, push it back into the frame. */
942 for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg; cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
943 const struct tpacket_auxdata *aux;
945 if (cmsg->cmsg_level != SOL_PACKET
946 || cmsg->cmsg_type != PACKET_AUXDATA
947 || cmsg->cmsg_len < CMSG_LEN(sizeof(struct tpacket_auxdata))) {
951 aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg));
952 if (auxdata_has_vlan_tci(aux)) {
953 if (retval < ETH_HEADER_LEN) {
957 eth_push_vlan(buffer, auxdata_to_vlan_tpid(aux),
958 htons(aux->tp_vlan_tci));
/* Receives one packet from tap fd 'fd' into 'buffer' via a plain read();
 * returns 0 on success or a positive errno value (EMSGSIZE on truncation). */
967 netdev_linux_rxq_recv_tap(int fd, struct ofpbuf *buffer)
970 size_t size = ofpbuf_tailroom(buffer);
973 retval = read(fd, ofpbuf_data(buffer), size);
974 } while (retval < 0 && errno == EINTR);
978 } else if (retval > size) {
982 ofpbuf_set_size(buffer, ofpbuf_size(buffer) + retval);
987 netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dpif_packet **packets,
990 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
991 struct netdev *netdev = rx->up.netdev;
992 struct dpif_packet *packet;
993 struct ofpbuf *buffer;
997 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
998 mtu = ETH_PAYLOAD_MAX;
1001 packet = dpif_packet_new_with_headroom(VLAN_ETH_HEADER_LEN + mtu,
1002 DP_NETDEV_HEADROOM);
1003 buffer = &packet->ofpbuf;
1005 retval = (rx->is_tap
1006 ? netdev_linux_rxq_recv_tap(rx->fd, buffer)
1007 : netdev_linux_rxq_recv_sock(rx->fd, buffer));
1010 if (retval != EAGAIN && retval != EMSGSIZE) {
1011 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
1012 ovs_strerror(errno), netdev_rxq_get_name(rxq_));
1014 dpif_packet_delete(packet);
1016 dp_packet_pad(buffer);
1017 dpif_packet_set_dp_hash(packet, 0);
1018 packets[0] = packet;
/* Wakes the poll loop when the rx fd becomes readable. */
1026 netdev_linux_rxq_wait(struct netdev_rxq *rxq_)
1028 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1029 poll_fd_wait(rx->fd, POLLIN);
/* Discards all packets waiting on the rx queue. For tap devices the number
 * of queued packets is read via SIOCGIFTXQLEN and that many reads are
 * drained; for sockets the receive buffer is flushed directly. */
1033 netdev_linux_rxq_drain(struct netdev_rxq *rxq_)
1035 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1038 int error = af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_), &ifr,
1039 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
1043 drain_fd(rx->fd, ifr.ifr_qlen);
1046 return drain_rcvbuf(rx->fd);
1050 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
1051 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
1052 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
1053 * the packet is too big or too small to transmit on the device.
1055 * The caller retains ownership of 'buffer' in all cases.
1057 * The kernel maintains a packet transmission queue, so the caller is not
1058 * expected to do additional queuing of packets. */
1060 netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED,
1061 struct dpif_packet **pkts, int cnt, bool may_steal)
1066 /* 'i' is incremented only if there's no error */
1067 for (i = 0; i < cnt;) {
1068 const void *data = ofpbuf_data(&pkts[i]->ofpbuf);
1069 size_t size = ofpbuf_size(&pkts[i]->ofpbuf);
1072 if (!is_tap_netdev(netdev_)) {
1073 /* Use our AF_PACKET socket to send to this device. */
1074 struct sockaddr_ll sll;
1080 sock = af_packet_sock();
1085 ifindex = netdev_get_ifindex(netdev_);
1090 /* We don't bother setting most fields in sockaddr_ll because the
1091 * kernel ignores them for SOCK_RAW. */
1092 memset(&sll, 0, sizeof sll);
1093 sll.sll_family = AF_PACKET;
1094 sll.sll_ifindex = ifindex;
1096 iov.iov_base = CONST_CAST(void *, data);
1099 msg.msg_name = &sll;
1100 msg.msg_namelen = sizeof sll;
1103 msg.msg_control = NULL;
1104 msg.msg_controllen = 0;
1107 retval = sendmsg(sock, &msg, 0);
1109 /* Use the tap fd to send to this device. This is essential for
1110 * tap devices, because packets sent to a tap device with an
1111 * AF_PACKET socket will loop back to be *received* again on the
1112 * tap device. This doesn't occur on other interface types
1113 * because we attach a socket filter to the rx socket. */
1114 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1116 retval = write(netdev->tap_fd, data, size);
1120 /* The Linux AF_PACKET implementation never blocks waiting for room
1121 * for packets, instead returning ENOBUFS. Translate this into
1122 * EAGAIN for the caller. */
1123 error = errno == ENOBUFS ? EAGAIN : errno;
1124 if (error == EINTR) {
1125 /* continue without incrementing 'i', i.e. retry this packet */
/* NOTE(review): 'retval' here is a signed byte count but is formatted with
 * the unsigned PRIuSIZE conversion below — harmless for the non-negative
 * values reachable on this branch, but worth tidying. */
1129 } else if (retval != size) {
1130 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE" bytes"
1131 " of %"PRIuSIZE") on %s", retval, size,
1132 netdev_get_name(netdev_));
1137 /* Process the next packet in the batch */
/* When 'may_steal' the batch is consumed: free every packet. */
1142 for (i = 0; i < cnt; i++) {
1143 dpif_packet_delete(pkts[i]);
1147 if (error && error != EAGAIN) {
1148 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
1149 netdev_get_name(netdev_), ovs_strerror(error));
1156 /* Registers with the poll loop to wake up from the next call to poll_block()
1157 * when the packet transmission queue has sufficient room to transmit a packet
1158 * with netdev_send().
1160 * The kernel maintains a packet transmission queue, so the client is not
1161 * expected to do additional queuing of packets. Thus, this function is
1162 * unlikely to ever be used. It is included for completeness. */
1164 netdev_linux_send_wait(struct netdev *netdev, int qid OVS_UNUSED)
1166 if (is_tap_netdev(netdev)) {
1167 /* TAP device always accepts packets.*/
1168 poll_immediate_wake();
1172 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1173 * otherwise a positive errno value. */
1175 netdev_linux_set_etheraddr(struct netdev *netdev_,
1176 const uint8_t mac[ETH_ADDR_LEN])
1178 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1179 enum netdev_flags old_flags = 0;
1182 ovs_mutex_lock(&netdev->mutex);
/* Short-circuit if the cached address already matches (or a previous
 * attempt failed); otherwise drop the stale cache entry before retrying. */
1184 if (netdev->cache_valid & VALID_ETHERADDR) {
1185 error = netdev->ether_addr_error;
1186 if (error || eth_addr_equals(netdev->etheraddr, mac)) {
1189 netdev->cache_valid &= ~VALID_ETHERADDR;
1192 /* Tap devices must be brought down before setting the address. */
1193 if (is_tap_netdev(netdev_)) {
/* NOTE(review): update_flags()'s return value is ignored here — if bringing
 * the tap down fails, the address set below is attempted anyway. */
1194 update_flags(netdev, NETDEV_UP, 0, &old_flags);
1196 error = set_etheraddr(netdev_get_name(netdev_), mac);
/* ENODEV is cached too: a missing device will keep failing identically. */
1197 if (!error || error == ENODEV) {
1198 netdev->ether_addr_error = error;
1199 netdev->cache_valid |= VALID_ETHERADDR;
1201 memcpy(netdev->etheraddr, mac, ETH_ADDR_LEN);
/* Restore the tap device's previous up state. */
1205 if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
1206 update_flags(netdev, 0, NETDEV_UP, &old_flags);
1210 ovs_mutex_unlock(&netdev->mutex);
1214 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1216 netdev_linux_get_etheraddr(const struct netdev *netdev_,
1217 uint8_t mac[ETH_ADDR_LEN])
1219 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1222 ovs_mutex_lock(&netdev->mutex);
1223 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1224 netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
1226 netdev->cache_valid |= VALID_ETHERADDR;
1229 error = netdev->ether_addr_error;
1231 memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN);
1233 ovs_mutex_unlock(&netdev->mutex);
/* Internal MTU lookup: refreshes the cached MTU via SIOCGIFMTU when stale,
 * then returns the cached value/error. Caller holds netdev->mutex (the
 * public wrapper below takes it). */
1239 netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
1243 if (!(netdev->cache_valid & VALID_MTU)) {
1246 netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
1247 netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
1248 netdev->mtu = ifr.ifr_mtu;
1249 netdev->cache_valid |= VALID_MTU;
1252 error = netdev->netdev_mtu_error;
1254 *mtup = netdev->mtu;
1260 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1261 * in bytes, not including the hardware header; thus, this is typically 1500
1262 * bytes for Ethernet devices. */
1264 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1266 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1269 ovs_mutex_lock(&netdev->mutex);
1270 error = netdev_linux_get_mtu__(netdev, mtup);
1271 ovs_mutex_unlock(&netdev->mutex);
1276 /* Sets the maximum size of transmitted (MTU) for given device using linux
1277 * networking ioctl interface.
1280 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1282 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1286 ovs_mutex_lock(&netdev->mutex);
1287 if (netdev->cache_valid & VALID_MTU) {
1288 error = netdev->netdev_mtu_error;
1289 if (error || netdev->mtu == mtu) {
1292 netdev->cache_valid &= ~VALID_MTU;
1295 error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
1296 SIOCSIFMTU, "SIOCSIFMTU");
1297 if (!error || error == ENODEV) {
1298 netdev->netdev_mtu_error = error;
1299 netdev->mtu = ifr.ifr_mtu;
1300 netdev->cache_valid |= VALID_MTU;
1303 ovs_mutex_unlock(&netdev->mutex);
1307 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1308 * On failure, returns a negative errno value. */
1310 netdev_linux_get_ifindex(const struct netdev *netdev_)
1312 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1315 ovs_mutex_lock(&netdev->mutex);
1316 error = get_ifindex(netdev_, &ifindex);
1317 ovs_mutex_unlock(&netdev->mutex);
1319 return error ? -error : ifindex;
1323 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1325 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1327 ovs_mutex_lock(&netdev->mutex);
1328 if (netdev->miimon_interval > 0) {
1329 *carrier = netdev->miimon;
1331 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1333 ovs_mutex_unlock(&netdev->mutex);
1338 static long long int
1339 netdev_linux_get_carrier_resets(const struct netdev *netdev_)
1341 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1342 long long int carrier_resets;
1344 ovs_mutex_lock(&netdev->mutex);
1345 carrier_resets = netdev->carrier_resets;
1346 ovs_mutex_unlock(&netdev->mutex);
1348 return carrier_resets;
1352 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1353 struct mii_ioctl_data *data)
1358 memset(&ifr, 0, sizeof ifr);
1359 memcpy(&ifr.ifr_data, data, sizeof *data);
1360 error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
1361 memcpy(data, &ifr.ifr_data, sizeof *data);
1367 netdev_linux_get_miimon(const char *name, bool *miimon)
1369 struct mii_ioctl_data data;
1374 memset(&data, 0, sizeof data);
1375 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1377 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1378 data.reg_num = MII_BMSR;
1379 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1383 *miimon = !!(data.val_out & BMSR_LSTATUS);
1385 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1388 struct ethtool_cmd ecmd;
1390 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1393 COVERAGE_INC(netdev_get_ethtool);
1394 memset(&ecmd, 0, sizeof ecmd);
1395 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1398 struct ethtool_value eval;
1400 memcpy(&eval, &ecmd, sizeof eval);
1401 *miimon = !!eval.data;
1403 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1411 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1412 long long int interval)
1414 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1416 ovs_mutex_lock(&netdev->mutex);
1417 interval = interval > 0 ? MAX(interval, 100) : 0;
1418 if (netdev->miimon_interval != interval) {
1419 if (interval && !netdev->miimon_interval) {
1420 atomic_count_inc(&miimon_cnt);
1421 } else if (!interval && netdev->miimon_interval) {
1422 atomic_count_dec(&miimon_cnt);
1425 netdev->miimon_interval = interval;
1426 timer_set_expired(&netdev->miimon_timer);
1428 ovs_mutex_unlock(&netdev->mutex);
1434 netdev_linux_miimon_run(void)
1436 struct shash device_shash;
1437 struct shash_node *node;
1439 shash_init(&device_shash);
1440 netdev_get_devices(&netdev_linux_class, &device_shash);
1441 SHASH_FOR_EACH (node, &device_shash) {
1442 struct netdev *netdev = node->data;
1443 struct netdev_linux *dev = netdev_linux_cast(netdev);
1446 ovs_mutex_lock(&dev->mutex);
1447 if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
1448 netdev_linux_get_miimon(dev->up.name, &miimon);
1449 if (miimon != dev->miimon) {
1450 dev->miimon = miimon;
1451 netdev_linux_changed(dev, dev->ifi_flags, 0);
1454 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1456 ovs_mutex_unlock(&dev->mutex);
1457 netdev_close(netdev);
1460 shash_destroy(&device_shash);
1464 netdev_linux_miimon_wait(void)
1466 struct shash device_shash;
1467 struct shash_node *node;
1469 shash_init(&device_shash);
1470 netdev_get_devices(&netdev_linux_class, &device_shash);
1471 SHASH_FOR_EACH (node, &device_shash) {
1472 struct netdev *netdev = node->data;
1473 struct netdev_linux *dev = netdev_linux_cast(netdev);
1475 ovs_mutex_lock(&dev->mutex);
1476 if (dev->miimon_interval > 0) {
1477 timer_wait(&dev->miimon_timer);
1479 ovs_mutex_unlock(&dev->mutex);
1480 netdev_close(netdev);
1482 shash_destroy(&device_shash);
/* Exchanges the values stored at '*a' and '*b'. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    uint64_t tmp = *a;
    *a = *b;
    *b = tmp;
}
1493 /* Copies 'src' into 'dst', performing format conversion in the process.
1495 * 'src' is allowed to be misaligned. */
1497 netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1498 const struct ovs_vport_stats *src)
1500 dst->rx_packets = get_32aligned_u64(&src->rx_packets);
1501 dst->tx_packets = get_32aligned_u64(&src->tx_packets);
1502 dst->rx_bytes = get_32aligned_u64(&src->rx_bytes);
1503 dst->tx_bytes = get_32aligned_u64(&src->tx_bytes);
1504 dst->rx_errors = get_32aligned_u64(&src->rx_errors);
1505 dst->tx_errors = get_32aligned_u64(&src->tx_errors);
1506 dst->rx_dropped = get_32aligned_u64(&src->rx_dropped);
1507 dst->tx_dropped = get_32aligned_u64(&src->tx_dropped);
1509 dst->collisions = 0;
1510 dst->rx_length_errors = 0;
1511 dst->rx_over_errors = 0;
1512 dst->rx_crc_errors = 0;
1513 dst->rx_frame_errors = 0;
1514 dst->rx_fifo_errors = 0;
1515 dst->rx_missed_errors = 0;
1516 dst->tx_aborted_errors = 0;
1517 dst->tx_carrier_errors = 0;
1518 dst->tx_fifo_errors = 0;
1519 dst->tx_heartbeat_errors = 0;
1520 dst->tx_window_errors = 0;
1524 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1526 struct dpif_netlink_vport reply;
1530 error = dpif_netlink_vport_get(netdev_get_name(netdev), &reply, &buf);
1533 } else if (!reply.stats) {
1538 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
1546 get_stats_via_vport(const struct netdev *netdev_,
1547 struct netdev_stats *stats)
1549 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1551 if (!netdev->vport_stats_error ||
1552 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1555 error = get_stats_via_vport__(netdev_, stats);
1556 if (error && error != ENOENT) {
1557 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1559 netdev_get_name(netdev_), ovs_strerror(error));
1561 netdev->vport_stats_error = error;
1562 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
1566 /* Retrieves current device stats for 'netdev-linux'. */
1568 netdev_linux_get_stats(const struct netdev *netdev_,
1569 struct netdev_stats *stats)
1571 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1572 struct netdev_stats dev_stats;
1575 ovs_mutex_lock(&netdev->mutex);
1576 get_stats_via_vport(netdev_, stats);
1577 error = get_stats_via_netlink(netdev_, &dev_stats);
1579 if (!netdev->vport_stats_error) {
1582 } else if (netdev->vport_stats_error) {
1583 /* stats not available from OVS then use netdev stats. */
1586 /* Use kernel netdev's packet and byte counts since vport's counters
1587 * do not reflect packet counts on the wire when GSO, TSO or GRO are
1589 stats->rx_packets = dev_stats.rx_packets;
1590 stats->rx_bytes = dev_stats.rx_bytes;
1591 stats->tx_packets = dev_stats.tx_packets;
1592 stats->tx_bytes = dev_stats.tx_bytes;
1594 stats->rx_errors += dev_stats.rx_errors;
1595 stats->tx_errors += dev_stats.tx_errors;
1596 stats->rx_dropped += dev_stats.rx_dropped;
1597 stats->tx_dropped += dev_stats.tx_dropped;
1598 stats->multicast += dev_stats.multicast;
1599 stats->collisions += dev_stats.collisions;
1600 stats->rx_length_errors += dev_stats.rx_length_errors;
1601 stats->rx_over_errors += dev_stats.rx_over_errors;
1602 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1603 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1604 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1605 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1606 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1607 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1608 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1609 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1610 stats->tx_window_errors += dev_stats.tx_window_errors;
1612 ovs_mutex_unlock(&netdev->mutex);
1617 /* Retrieves current device stats for 'netdev-tap' netdev or
1618 * netdev-internal. */
1620 netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
1622 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1623 struct netdev_stats dev_stats;
1626 ovs_mutex_lock(&netdev->mutex);
1627 get_stats_via_vport(netdev_, stats);
1628 error = get_stats_via_netlink(netdev_, &dev_stats);
1630 if (!netdev->vport_stats_error) {
1633 } else if (netdev->vport_stats_error) {
1634 /* Transmit and receive stats will appear to be swapped relative to the
1635 * other ports since we are the one sending the data, not a remote
1636 * computer. For consistency, we swap them back here. This does not
1637 * apply if we are getting stats from the vport layer because it always
1638 * tracks stats from the perspective of the switch. */
1641 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1642 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1643 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1644 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1645 stats->rx_length_errors = 0;
1646 stats->rx_over_errors = 0;
1647 stats->rx_crc_errors = 0;
1648 stats->rx_frame_errors = 0;
1649 stats->rx_fifo_errors = 0;
1650 stats->rx_missed_errors = 0;
1651 stats->tx_aborted_errors = 0;
1652 stats->tx_carrier_errors = 0;
1653 stats->tx_fifo_errors = 0;
1654 stats->tx_heartbeat_errors = 0;
1655 stats->tx_window_errors = 0;
1657 /* Use kernel netdev's packet and byte counts since vport counters
1658 * do not reflect packet counts on the wire when GSO, TSO or GRO
1660 stats->rx_packets = dev_stats.tx_packets;
1661 stats->rx_bytes = dev_stats.tx_bytes;
1662 stats->tx_packets = dev_stats.rx_packets;
1663 stats->tx_bytes = dev_stats.rx_bytes;
1665 stats->rx_dropped += dev_stats.tx_dropped;
1666 stats->tx_dropped += dev_stats.rx_dropped;
1668 stats->rx_errors += dev_stats.tx_errors;
1669 stats->tx_errors += dev_stats.rx_errors;
1671 stats->multicast += dev_stats.multicast;
1672 stats->collisions += dev_stats.collisions;
1674 ovs_mutex_unlock(&netdev->mutex);
1680 netdev_internal_get_stats(const struct netdev *netdev_,
1681 struct netdev_stats *stats)
1683 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1686 ovs_mutex_lock(&netdev->mutex);
1687 get_stats_via_vport(netdev_, stats);
1688 error = netdev->vport_stats_error;
1689 ovs_mutex_unlock(&netdev->mutex);
1695 netdev_linux_read_features(struct netdev_linux *netdev)
1697 struct ethtool_cmd ecmd;
1701 if (netdev->cache_valid & VALID_FEATURES) {
1705 COVERAGE_INC(netdev_get_ethtool);
1706 memset(&ecmd, 0, sizeof ecmd);
1707 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
1708 ETHTOOL_GSET, "ETHTOOL_GSET");
1713 /* Supported features. */
1714 netdev->supported = 0;
1715 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1716 netdev->supported |= NETDEV_F_10MB_HD;
1718 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1719 netdev->supported |= NETDEV_F_10MB_FD;
1721 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1722 netdev->supported |= NETDEV_F_100MB_HD;
1724 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1725 netdev->supported |= NETDEV_F_100MB_FD;
1727 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1728 netdev->supported |= NETDEV_F_1GB_HD;
1730 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1731 netdev->supported |= NETDEV_F_1GB_FD;
1733 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1734 netdev->supported |= NETDEV_F_10GB_FD;
1736 if (ecmd.supported & SUPPORTED_TP) {
1737 netdev->supported |= NETDEV_F_COPPER;
1739 if (ecmd.supported & SUPPORTED_FIBRE) {
1740 netdev->supported |= NETDEV_F_FIBER;
1742 if (ecmd.supported & SUPPORTED_Autoneg) {
1743 netdev->supported |= NETDEV_F_AUTONEG;
1745 if (ecmd.supported & SUPPORTED_Pause) {
1746 netdev->supported |= NETDEV_F_PAUSE;
1748 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1749 netdev->supported |= NETDEV_F_PAUSE_ASYM;
1752 /* Advertised features. */
1753 netdev->advertised = 0;
1754 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1755 netdev->advertised |= NETDEV_F_10MB_HD;
1757 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1758 netdev->advertised |= NETDEV_F_10MB_FD;
1760 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1761 netdev->advertised |= NETDEV_F_100MB_HD;
1763 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1764 netdev->advertised |= NETDEV_F_100MB_FD;
1766 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1767 netdev->advertised |= NETDEV_F_1GB_HD;
1769 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1770 netdev->advertised |= NETDEV_F_1GB_FD;
1772 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1773 netdev->advertised |= NETDEV_F_10GB_FD;
1775 if (ecmd.advertising & ADVERTISED_TP) {
1776 netdev->advertised |= NETDEV_F_COPPER;
1778 if (ecmd.advertising & ADVERTISED_FIBRE) {
1779 netdev->advertised |= NETDEV_F_FIBER;
1781 if (ecmd.advertising & ADVERTISED_Autoneg) {
1782 netdev->advertised |= NETDEV_F_AUTONEG;
1784 if (ecmd.advertising & ADVERTISED_Pause) {
1785 netdev->advertised |= NETDEV_F_PAUSE;
1787 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1788 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
1791 /* Current settings. */
1793 if (speed == SPEED_10) {
1794 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1795 } else if (speed == SPEED_100) {
1796 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1797 } else if (speed == SPEED_1000) {
1798 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1799 } else if (speed == SPEED_10000) {
1800 netdev->current = NETDEV_F_10GB_FD;
1801 } else if (speed == 40000) {
1802 netdev->current = NETDEV_F_40GB_FD;
1803 } else if (speed == 100000) {
1804 netdev->current = NETDEV_F_100GB_FD;
1805 } else if (speed == 1000000) {
1806 netdev->current = NETDEV_F_1TB_FD;
1808 netdev->current = 0;
1811 if (ecmd.port == PORT_TP) {
1812 netdev->current |= NETDEV_F_COPPER;
1813 } else if (ecmd.port == PORT_FIBRE) {
1814 netdev->current |= NETDEV_F_FIBER;
1818 netdev->current |= NETDEV_F_AUTONEG;
1822 netdev->cache_valid |= VALID_FEATURES;
1823 netdev->get_features_error = error;
1826 /* Stores the features supported by 'netdev' into of '*current', '*advertised',
1827 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
1828 * Returns 0 if successful, otherwise a positive errno value. */
1830 netdev_linux_get_features(const struct netdev *netdev_,
1831 enum netdev_features *current,
1832 enum netdev_features *advertised,
1833 enum netdev_features *supported,
1834 enum netdev_features *peer)
1836 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1839 ovs_mutex_lock(&netdev->mutex);
1840 netdev_linux_read_features(netdev);
1841 if (!netdev->get_features_error) {
1842 *current = netdev->current;
1843 *advertised = netdev->advertised;
1844 *supported = netdev->supported;
1845 *peer = 0; /* XXX */
1847 error = netdev->get_features_error;
1848 ovs_mutex_unlock(&netdev->mutex);
1853 /* Set the features advertised by 'netdev' to 'advertise'. */
1855 netdev_linux_set_advertisements(struct netdev *netdev_,
1856 enum netdev_features advertise)
1858 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1859 struct ethtool_cmd ecmd;
1862 ovs_mutex_lock(&netdev->mutex);
1864 COVERAGE_INC(netdev_get_ethtool);
1865 memset(&ecmd, 0, sizeof ecmd);
1866 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1867 ETHTOOL_GSET, "ETHTOOL_GSET");
1872 ecmd.advertising = 0;
1873 if (advertise & NETDEV_F_10MB_HD) {
1874 ecmd.advertising |= ADVERTISED_10baseT_Half;
1876 if (advertise & NETDEV_F_10MB_FD) {
1877 ecmd.advertising |= ADVERTISED_10baseT_Full;
1879 if (advertise & NETDEV_F_100MB_HD) {
1880 ecmd.advertising |= ADVERTISED_100baseT_Half;
1882 if (advertise & NETDEV_F_100MB_FD) {
1883 ecmd.advertising |= ADVERTISED_100baseT_Full;
1885 if (advertise & NETDEV_F_1GB_HD) {
1886 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1888 if (advertise & NETDEV_F_1GB_FD) {
1889 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1891 if (advertise & NETDEV_F_10GB_FD) {
1892 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1894 if (advertise & NETDEV_F_COPPER) {
1895 ecmd.advertising |= ADVERTISED_TP;
1897 if (advertise & NETDEV_F_FIBER) {
1898 ecmd.advertising |= ADVERTISED_FIBRE;
1900 if (advertise & NETDEV_F_AUTONEG) {
1901 ecmd.advertising |= ADVERTISED_Autoneg;
1903 if (advertise & NETDEV_F_PAUSE) {
1904 ecmd.advertising |= ADVERTISED_Pause;
1906 if (advertise & NETDEV_F_PAUSE_ASYM) {
1907 ecmd.advertising |= ADVERTISED_Asym_Pause;
1909 COVERAGE_INC(netdev_set_ethtool);
1910 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1911 ETHTOOL_SSET, "ETHTOOL_SSET");
1914 ovs_mutex_unlock(&netdev->mutex);
1918 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1919 * successful, otherwise a positive errno value. */
1921 netdev_linux_set_policing(struct netdev *netdev_,
1922 uint32_t kbits_rate, uint32_t kbits_burst)
1924 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1925 const char *netdev_name = netdev_get_name(netdev_);
1928 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1929 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1930 : kbits_burst); /* Stick with user-specified value. */
1932 ovs_mutex_lock(&netdev->mutex);
1933 if (netdev->cache_valid & VALID_POLICING) {
1934 error = netdev->netdev_policing_error;
1935 if (error || (netdev->kbits_rate == kbits_rate &&
1936 netdev->kbits_burst == kbits_burst)) {
1937 /* Assume that settings haven't changed since we last set them. */
1940 netdev->cache_valid &= ~VALID_POLICING;
1943 COVERAGE_INC(netdev_set_policing);
1944 /* Remove any existing ingress qdisc. */
1945 error = tc_add_del_ingress_qdisc(netdev_, false);
1947 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1948 netdev_name, ovs_strerror(error));
1953 error = tc_add_del_ingress_qdisc(netdev_, true);
1955 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1956 netdev_name, ovs_strerror(error));
1960 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
1962 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1963 netdev_name, ovs_strerror(error));
1968 netdev->kbits_rate = kbits_rate;
1969 netdev->kbits_burst = kbits_burst;
1972 if (!error || error == ENODEV) {
1973 netdev->netdev_policing_error = error;
1974 netdev->cache_valid |= VALID_POLICING;
1976 ovs_mutex_unlock(&netdev->mutex);
1981 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1984 const struct tc_ops *const *opsp;
1986 for (opsp = tcs; *opsp != NULL; opsp++) {
1987 const struct tc_ops *ops = *opsp;
1988 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1989 sset_add(types, ops->ovs_name);
1995 static const struct tc_ops *
1996 tc_lookup_ovs_name(const char *name)
1998 const struct tc_ops *const *opsp;
2000 for (opsp = tcs; *opsp != NULL; opsp++) {
2001 const struct tc_ops *ops = *opsp;
2002 if (!strcmp(name, ops->ovs_name)) {
2009 static const struct tc_ops *
2010 tc_lookup_linux_name(const char *name)
2012 const struct tc_ops *const *opsp;
2014 for (opsp = tcs; *opsp != NULL; opsp++) {
2015 const struct tc_ops *ops = *opsp;
2016 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
2023 static struct tc_queue *
2024 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
2027 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2028 struct tc_queue *queue;
2030 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
2031 if (queue->queue_id == queue_id) {
/* Convenience wrapper around tc_find_queue__() that hashes 'queue_id'. */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
}
2045 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
2047 struct netdev_qos_capabilities *caps)
2049 const struct tc_ops *ops = tc_lookup_ovs_name(type);
2053 caps->n_queues = ops->n_queues;
2058 netdev_linux_get_qos(const struct netdev *netdev_,
2059 const char **typep, struct smap *details)
2061 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2064 ovs_mutex_lock(&netdev->mutex);
2065 error = tc_query_qdisc(netdev_);
2067 *typep = netdev->tc->ops->ovs_name;
2068 error = (netdev->tc->ops->qdisc_get
2069 ? netdev->tc->ops->qdisc_get(netdev_, details)
2072 ovs_mutex_unlock(&netdev->mutex);
2078 netdev_linux_set_qos(struct netdev *netdev_,
2079 const char *type, const struct smap *details)
2081 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2082 const struct tc_ops *new_ops;
2085 new_ops = tc_lookup_ovs_name(type);
2086 if (!new_ops || !new_ops->tc_install) {
2090 ovs_mutex_lock(&netdev->mutex);
2091 error = tc_query_qdisc(netdev_);
2096 if (new_ops == netdev->tc->ops) {
2097 error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
2099 /* Delete existing qdisc. */
2100 error = tc_del_qdisc(netdev_);
2104 ovs_assert(netdev->tc == NULL);
2106 /* Install new qdisc. */
2107 error = new_ops->tc_install(netdev_, details);
2108 ovs_assert((error == 0) == (netdev->tc != NULL));
2112 ovs_mutex_unlock(&netdev->mutex);
2117 netdev_linux_get_queue(const struct netdev *netdev_,
2118 unsigned int queue_id, struct smap *details)
2120 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2123 ovs_mutex_lock(&netdev->mutex);
2124 error = tc_query_qdisc(netdev_);
2126 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2128 ? netdev->tc->ops->class_get(netdev_, queue, details)
2131 ovs_mutex_unlock(&netdev->mutex);
2137 netdev_linux_set_queue(struct netdev *netdev_,
2138 unsigned int queue_id, const struct smap *details)
2140 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2143 ovs_mutex_lock(&netdev->mutex);
2144 error = tc_query_qdisc(netdev_);
2146 error = (queue_id < netdev->tc->ops->n_queues
2147 && netdev->tc->ops->class_set
2148 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
2151 ovs_mutex_unlock(&netdev->mutex);
2157 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
2159 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2162 ovs_mutex_lock(&netdev->mutex);
2163 error = tc_query_qdisc(netdev_);
2165 if (netdev->tc->ops->class_delete) {
2166 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2168 ? netdev->tc->ops->class_delete(netdev_, queue)
2174 ovs_mutex_unlock(&netdev->mutex);
2180 netdev_linux_get_queue_stats(const struct netdev *netdev_,
2181 unsigned int queue_id,
2182 struct netdev_queue_stats *stats)
2184 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2187 ovs_mutex_lock(&netdev->mutex);
2188 error = tc_query_qdisc(netdev_);
2190 if (netdev->tc->ops->class_get_stats) {
2191 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2193 stats->created = queue->created;
2194 error = netdev->tc->ops->class_get_stats(netdev_, queue,
2203 ovs_mutex_unlock(&netdev->mutex);
2208 struct queue_dump_state {
2209 struct nl_dump dump;
2214 start_queue_dump(const struct netdev *netdev, struct queue_dump_state *state)
2216 struct ofpbuf request;
2217 struct tcmsg *tcmsg;
2219 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
2223 tcmsg->tcm_parent = 0;
2224 nl_dump_start(&state->dump, NETLINK_ROUTE, &request);
2225 ofpbuf_uninit(&request);
2227 ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
2232 finish_queue_dump(struct queue_dump_state *state)
2234 ofpbuf_uninit(&state->buf);
2235 return nl_dump_done(&state->dump);
/* Iteration state for a queue dump: a snapshot of queue ids taken under the
 * device mutex, walked one entry at a time. */
struct netdev_linux_queue_state {
    unsigned int *queues;       /* Snapshot of queue ids. */
    size_t cur_queue;           /* Next index into 'queues' to visit. */
    size_t n_queues;            /* Number of entries in 'queues'. */
};
2245 netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
2247 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2250 ovs_mutex_lock(&netdev->mutex);
2251 error = tc_query_qdisc(netdev_);
2253 if (netdev->tc->ops->class_get) {
2254 struct netdev_linux_queue_state *state;
2255 struct tc_queue *queue;
2258 *statep = state = xmalloc(sizeof *state);
2259 state->n_queues = hmap_count(&netdev->tc->queues);
2260 state->cur_queue = 0;
2261 state->queues = xmalloc(state->n_queues * sizeof *state->queues);
2264 HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
2265 state->queues[i++] = queue->queue_id;
2271 ovs_mutex_unlock(&netdev->mutex);
2277 netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
2278 unsigned int *queue_idp, struct smap *details)
2280 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2281 struct netdev_linux_queue_state *state = state_;
2284 ovs_mutex_lock(&netdev->mutex);
2285 while (state->cur_queue < state->n_queues) {
2286 unsigned int queue_id = state->queues[state->cur_queue++];
2287 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2290 *queue_idp = queue_id;
2291 error = netdev->tc->ops->class_get(netdev_, queue, details);
2295 ovs_mutex_unlock(&netdev->mutex);
2301 netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
2304 struct netdev_linux_queue_state *state = state_;
2306 free(state->queues);
2312 netdev_linux_dump_queue_stats(const struct netdev *netdev_,
2313 netdev_dump_queue_stats_cb *cb, void *aux)
2315 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2318 ovs_mutex_lock(&netdev->mutex);
2319 error = tc_query_qdisc(netdev_);
2321 struct queue_dump_state state;
2323 if (!netdev->tc->ops->class_dump_stats) {
2325 } else if (!start_queue_dump(netdev_, &state)) {
2331 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
2332 retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
2339 retval = finish_queue_dump(&state);
2345 ovs_mutex_unlock(&netdev->mutex);
2351 netdev_linux_get_in4(const struct netdev *netdev_,
2352 struct in_addr *address, struct in_addr *netmask)
2354 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2357 ovs_mutex_lock(&netdev->mutex);
2358 if (!(netdev->cache_valid & VALID_IN4)) {
2359 error = netdev_linux_get_ipv4(netdev_, &netdev->address,
2360 SIOCGIFADDR, "SIOCGIFADDR");
2362 error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
2363 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2365 netdev->cache_valid |= VALID_IN4;
2373 if (netdev->address.s_addr != INADDR_ANY) {
2374 *address = netdev->address;
2375 *netmask = netdev->netmask;
2377 error = EADDRNOTAVAIL;
2380 ovs_mutex_unlock(&netdev->mutex);
2386 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2387 struct in_addr netmask)
2389 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2392 ovs_mutex_lock(&netdev->mutex);
2393 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2395 netdev->cache_valid |= VALID_IN4;
2396 netdev->address = address;
2397 netdev->netmask = netmask;
2398 if (address.s_addr != INADDR_ANY) {
2399 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2400 "SIOCSIFNETMASK", netmask);
2403 ovs_mutex_unlock(&netdev->mutex);
/* Parses one line of /proc/net/if_inet6 into the 16 address bytes of '*in6'
 * and the interface name 'ifname'.  Returns true on a successful parse. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    uint8_t *s6 = in6->s6_addr;
#define X8 "%2"SCNx8
    return ovs_scan(line,
                    " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                    "%*x %*x %*x %*x %16s\n",
                    &s6[0], &s6[1], &s6[2], &s6[3],
                    &s6[4], &s6[5], &s6[6], &s6[7],
                    &s6[8], &s6[9], &s6[10], &s6[11],
                    &s6[12], &s6[13], &s6[14], &s6[15],
                    ifname);
}
2424 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2425 * 'in6' is non-null) and returns true. Otherwise, returns false. */
2427 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2429 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2431 ovs_mutex_lock(&netdev->mutex);
2432 if (!(netdev->cache_valid & VALID_IN6)) {
2436 netdev->in6 = in6addr_any;
2438 file = fopen("/proc/net/if_inet6", "r");
2440 const char *name = netdev_get_name(netdev_);
2441 while (fgets(line, sizeof line, file)) {
2442 struct in6_addr in6_tmp;
2443 char ifname[16 + 1];
2444 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2445 && !strcmp(name, ifname))
2447 netdev->in6 = in6_tmp;
2453 netdev->cache_valid |= VALID_IN6;
2456 ovs_mutex_unlock(&netdev->mutex);
/* Fills 'sa' with an AF_INET sockaddr for 'addr' (port 0), zeroing any
 * trailing bytes. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in sin;
    memset(&sin, 0, sizeof sin);
    sin.sin_family = AF_INET;
    sin.sin_addr = addr;
    sin.sin_port = 0;

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &sin, sizeof sin);
}
2475 do_set_addr(struct netdev *netdev,
2476 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2480 make_in4_sockaddr(&ifr.ifr_addr, addr);
2481 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2485 /* Adds 'router' as a default IP gateway. */
2487 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2489 struct in_addr any = { INADDR_ANY };
2493 memset(&rt, 0, sizeof rt);
2494 make_in4_sockaddr(&rt.rt_dst, any);
2495 make_in4_sockaddr(&rt.rt_gateway, router);
2496 make_in4_sockaddr(&rt.rt_genmask, any);
2497 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2498 error = af_inet_ioctl(SIOCADDRT, &rt);
2500 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
/* Consults the kernel routing table (/proc/net/route) to find the next hop
 * and egress device for 'host'.  On a match, '*netdev_name' is an xstrdup'd
 * string owned by the caller. */
2506 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2509 static const char fn[] = "/proc/net/route";
2514 *netdev_name = NULL;
2515 stream = fopen(fn, "r");
2516 if (stream == NULL) {
2517 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
2522 while (fgets(line, sizeof line, stream)) {
2525 ovs_be32 dest, gateway, mask;
2526 int refcnt, metric, mtu;
2527 unsigned int flags, use, window, irtt;
/* Field order matches the kernel's /proc/net/route text format. */
2530 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2532 iface, &dest, &gateway, &flags, &refcnt,
2533 &use, &metric, &mask, &mtu, &window, &irtt)) {
2534 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2538 if (!(flags & RTF_UP)) {
2539 /* Skip routes that aren't up. */
2543 /* The output of 'dest', 'mask', and 'gateway' were given in
2544 * network byte order, so we don't need any endian
2545 * conversions here. */
2546 if ((dest & mask) == (host->s_addr & mask)) {
2548 /* The host is directly reachable. */
2549 next_hop->s_addr = 0;
2551 /* To reach the host, we must go through a gateway. */
2552 next_hop->s_addr = gateway;
2554 *netdev_name = xstrdup(iface);
/* netdev "get_status" implementation: reports the ethtool driver name,
 * driver version, and firmware version, cached under VALID_DRVINFO. */
2566 netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
2568 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2571 ovs_mutex_lock(&netdev->mutex);
2572 if (!(netdev->cache_valid & VALID_DRVINFO)) {
/* netdev_linux_do_ethtool() takes an ethtool_cmd; the drvinfo buffer is
 * reused as the request -- NOTE(review): the aliasing cast looks
 * deliberate; confirm layout compatibility. */
2573 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
2575 COVERAGE_INC(netdev_get_ethtool);
2576 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2577 error = netdev_linux_do_ethtool(netdev->up.name,
2580 "ETHTOOL_GDRVINFO");
2582 netdev->cache_valid |= VALID_DRVINFO;
2587 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2588 smap_add(smap, "driver_version", netdev->drvinfo.version);
2589 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
2591 ovs_mutex_unlock(&netdev->mutex);
/* "get_status" for OVS internal devices: no ethtool info, just a fixed
 * driver name. */
2597 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2600 smap_add(smap, "driver_name", "openvswitch");
2604 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2605 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2606 * returns 0. Otherwise, it returns a positive errno value; in particular,
2607 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
2609 netdev_linux_arp_lookup(const struct netdev *netdev,
2610 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2613 struct sockaddr_in sin;
2616 memset(&r, 0, sizeof r);
2617 memset(&sin, 0, sizeof sin);
2618 sin.sin_family = AF_INET;
2619 sin.sin_addr.s_addr = ip;
/* arpreq's protocol address is a generic sockaddr; copy the sockaddr_in
 * into it. */
2621 memcpy(&r.arp_pa, &sin, sizeof sin);
2622 r.arp_ha.sa_family = ARPHRD_ETHER;
2624 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2625 COVERAGE_INC(netdev_arp_lookup);
2626 retval = af_inet_ioctl(SIOCGARP, &r);
2628 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
/* ENXIO just means "no entry", so it is not worth a log message. */
2629 } else if (retval != ENXIO) {
2630 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2631 netdev_get_name(netdev), IP_ARGS(ip),
2632 ovs_strerror(retval));
2638 nd_to_iff_flags(enum netdev_flags nd)
2641 if (nd & NETDEV_UP) {
2644 if (nd & NETDEV_PROMISC) {
2647 if (nd & NETDEV_LOOPBACK) {
2648 iff |= IFF_LOOPBACK;
2654 iff_to_nd_flags(int iff)
2656 enum netdev_flags nd = 0;
2660 if (iff & IFF_PROMISC) {
2661 nd |= NETDEV_PROMISC;
2663 if (iff & IFF_LOOPBACK) {
2664 nd |= NETDEV_LOOPBACK;
/* Clears the interface flags in 'off' and sets those in 'on', returning the
 * previous flags (translated to netdev flags) in '*old_flagsp'.  Caller must
 * hold netdev->mutex, as annotated. */
2670 update_flags(struct netdev_linux *netdev, enum netdev_flags off,
2671 enum netdev_flags on, enum netdev_flags *old_flagsp)
2672 OVS_REQUIRES(netdev->mutex)
2674 int old_flags, new_flags;
2677 old_flags = netdev->ifi_flags;
2678 *old_flagsp = iff_to_nd_flags(old_flags);
2679 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2680 if (new_flags != old_flags) {
2681 error = set_flags(netdev_get_name(&netdev->up), new_flags);
/* Re-read so the cached copy reflects what the kernel actually
 * accepted. */
2682 get_flags(&netdev->up, &netdev->ifi_flags);
/* netdev "update_flags" implementation: locked wrapper around
 * update_flags() above. */
2689 netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2690 enum netdev_flags on, enum netdev_flags *old_flagsp)
2692 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2695 ovs_mutex_lock(&netdev->mutex);
2696 error = update_flags(netdev, off, on, old_flagsp);
2697 ovs_mutex_unlock(&netdev->mutex);
/* Template for the Linux netdev classes.  The parameters supply the
 * construct, stats, features, and status callbacks; every other vtable slot
 * is shared by the linux, tap, and internal classes instantiated below. */
2702 #define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS, \
2703 GET_FEATURES, GET_STATUS) \
2709 netdev_linux_wait, \
2711 netdev_linux_alloc, \
2713 netdev_linux_destruct, \
2714 netdev_linux_dealloc, \
2715 NULL, /* get_config */ \
2716 NULL, /* set_config */ \
2717 NULL, /* get_tunnel_config */ \
2718 NULL, /* get_numa_id */ \
2719 NULL, /* set_multiq */ \
2721 netdev_linux_send, \
2722 netdev_linux_send_wait, \
2724 netdev_linux_set_etheraddr, \
2725 netdev_linux_get_etheraddr, \
2726 netdev_linux_get_mtu, \
2727 netdev_linux_set_mtu, \
2728 netdev_linux_get_ifindex, \
2729 netdev_linux_get_carrier, \
2730 netdev_linux_get_carrier_resets, \
2731 netdev_linux_set_miimon_interval, \
2735 netdev_linux_set_advertisements, \
2737 netdev_linux_set_policing, \
2738 netdev_linux_get_qos_types, \
2739 netdev_linux_get_qos_capabilities, \
2740 netdev_linux_get_qos, \
2741 netdev_linux_set_qos, \
2742 netdev_linux_get_queue, \
2743 netdev_linux_set_queue, \
2744 netdev_linux_delete_queue, \
2745 netdev_linux_get_queue_stats, \
2746 netdev_linux_queue_dump_start, \
2747 netdev_linux_queue_dump_next, \
2748 netdev_linux_queue_dump_done, \
2749 netdev_linux_dump_queue_stats, \
2751 netdev_linux_get_in4, \
2752 netdev_linux_set_in4, \
2753 netdev_linux_get_in6, \
2754 netdev_linux_add_router, \
2755 netdev_linux_get_next_hop, \
2757 netdev_linux_arp_lookup, \
2759 netdev_linux_update_flags, \
2761 netdev_linux_rxq_alloc, \
2762 netdev_linux_rxq_construct, \
2763 netdev_linux_rxq_destruct, \
2764 netdev_linux_rxq_dealloc, \
2765 netdev_linux_rxq_recv, \
2766 netdev_linux_rxq_wait, \
2767 netdev_linux_rxq_drain, \
/* Ordinary Linux network device ("system" type). */
2770 const struct netdev_class netdev_linux_class =
2773 netdev_linux_construct,
2774 netdev_linux_get_stats,
2775 netdev_linux_get_features,
2776 netdev_linux_get_status);
/* Linux TAP device: tap-specific construction and stats callbacks. */
2778 const struct netdev_class netdev_tap_class =
2781 netdev_linux_construct_tap,
2782 netdev_tap_get_stats,
2783 netdev_linux_get_features,
2784 netdev_linux_get_status);
/* OVS "internal" device: no link features; status reports the OVS driver. */
2786 const struct netdev_class netdev_internal_class =
2789 netdev_linux_construct,
2790 netdev_internal_get_stats,
2791 NULL, /* get_features */
2792 netdev_internal_get_status);
2794 /* HTB traffic control class. */
2796 #define HTB_N_QUEUES 0xf000
/* Qdisc-level state: just the root rate cap. */
2800 unsigned int max_rate; /* In bytes/s. */
/* Per-queue (HTB class) configuration, keyed by the embedded tc_queue. */
2804 struct tc_queue tc_queue;
2805 unsigned int min_rate; /* In bytes/s. */
2806 unsigned int max_rate; /* In bytes/s. */
2807 unsigned int burst; /* In bytes. */
2808 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the struct htb embedded in 'netdev_''s tc. */
2812 htb_get__(const struct netdev *netdev_)
2814 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2815 return CONTAINER_OF(netdev->tc, struct htb, tc);
/* Allocates and installs a struct htb for 'netdev_' with root rate cap
 * 'max_rate'. */
2819 htb_install__(struct netdev *netdev_, uint64_t max_rate)
2821 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2824 htb = xmalloc(sizeof *htb);
2825 tc_init(&htb->tc, &tc_ops_htb);
2826 htb->max_rate = max_rate;
2828 netdev->tc = &htb->tc;
2831 /* Create an HTB qdisc.
2833 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2835 htb_setup_qdisc__(struct netdev *netdev)
2838 struct tc_htb_glob opt;
2839 struct ofpbuf request;
2840 struct tcmsg *tcmsg;
/* Remove any existing root qdisc first. */
2842 tc_del_qdisc(netdev);
2844 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2845 NLM_F_EXCL | NLM_F_CREATE, &request);
2849 tcmsg->tcm_handle = tc_make_handle(1, 0);
2850 tcmsg->tcm_parent = TC_H_ROOT;
2852 nl_msg_put_string(&request, TCA_KIND, "htb");
2854 memset(&opt, 0, sizeof opt);
/* rate2quantum corresponds to tc's "r2q" knob -- NOTE(review): see
 * tc-htb(8) for its effect on DRR quanta. */
2855 opt.rate2quantum = 10;
2859 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2860 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2861 nl_msg_end_nested(&request, opt_offset);
2863 return tc_transact(&request, NULL);
2866 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2867 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2869 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2870 unsigned int parent, struct htb_class *class)
2873 struct tc_htb_opt opt;
2874 struct ofpbuf request;
2875 struct tcmsg *tcmsg;
/* The MTU is needed to size HTB's rate tables and buffers below. */
2879 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
2881 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2882 netdev_get_name(netdev));
2886 memset(&opt, 0, sizeof opt);
2887 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2888 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2889 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2890 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2891 opt.prio = class->priority;
2893 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2897 tcmsg->tcm_handle = handle;
2898 tcmsg->tcm_parent = parent;
2900 nl_msg_put_string(&request, TCA_KIND, "htb");
2901 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2902 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2903 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2904 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2905 nl_msg_end_nested(&request, opt_offset);
2907 error = tc_transact(&request, NULL);
2909 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2910 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2911 netdev_get_name(netdev),
2912 tc_get_major(handle), tc_get_minor(handle),
2913 tc_get_major(parent), tc_get_minor(parent),
2914 class->min_rate, class->max_rate,
2915 class->burst, class->priority, ovs_strerror(error));
2920 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2921 * description of them into 'details'. The description complies with the
2922 * specification given in the vswitch database documentation for linux-htb
2925 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2927 static const struct nl_policy tca_htb_policy[] = {
2928 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2929 .min_len = sizeof(struct tc_htb_opt) },
2932 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2933 const struct tc_htb_opt *htb;
2935 if (!nl_parse_nested(nl_options, tca_htb_policy,
2936 attrs, ARRAY_SIZE(tca_htb_policy))) {
2937 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2941 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2942 class->min_rate = htb->rate.rate;
2943 class->max_rate = htb->ceil.rate;
/* 'buffer' comes back in tc ticks; convert to bytes at the class rate. */
2944 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2945 class->priority = htb->prio;
/* Parses an RTM_*TCLASS message for an HTB class: the queue id (from the tc
 * handle's minor number), options, and statistics, each output optional. */
2950 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2951 struct htb_class *options,
2952 struct netdev_queue_stats *stats)
2954 struct nlattr *nl_options;
2955 unsigned int handle;
2958 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2959 if (!error && queue_id) {
2960 unsigned int major = tc_get_major(handle);
2961 unsigned int minor = tc_get_minor(handle);
/* Queue N is stored as class 1:N+1. */
2962 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2963 *queue_id = minor - 1;
2968 if (!error && options) {
2969 error = htb_parse_tca_options__(nl_options, options);
/* Derives qdisc-level HTB configuration from 'details', falling back to the
 * link's advertised speed (or 100 Mbps) when no "max-rate" is given. */
2975 htb_parse_qdisc_details__(struct netdev *netdev_,
2976 const struct smap *details, struct htb_class *hc)
2978 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2979 const char *max_rate_s;
/* "max-rate" is configured in bits/s; stored internally in bytes/s. */
2981 max_rate_s = smap_get(details, "max-rate");
2982 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2983 if (!hc->max_rate) {
2984 enum netdev_features current;
2986 netdev_linux_read_features(netdev);
2987 current = !netdev->get_features_error ? netdev->current : 0;
2988 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
2990 hc->min_rate = hc->max_rate;
/* Parses per-class HTB settings from 'details' into 'hc': min-rate,
 * max-rate, burst (database units are bits or bits/s; stored here as bytes),
 * and priority, all clamped against the qdisc's configured ceiling. */
2996 htb_parse_class_details__(struct netdev *netdev,
2997 const struct smap *details, struct htb_class *hc)
2999 const struct htb *htb = htb_get__(netdev);
3000 const char *min_rate_s = smap_get(details, "min-rate");
3001 const char *max_rate_s = smap_get(details, "max-rate");
3002 const char *burst_s = smap_get(details, "burst");
3003 const char *priority_s = smap_get(details, "priority");
3006 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3008 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
3009 netdev_get_name(netdev));
3013 /* HTB requires at least an mtu sized min-rate to send any traffic even
3014 * on uncongested links. */
3015 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3016 hc->min_rate = MAX(hc->min_rate, mtu);
3017 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
3020 hc->max_rate = (max_rate_s
3021 ? strtoull(max_rate_s, NULL, 10) / 8
3023 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
3024 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
3028 * According to hints in the documentation that I've read, it is important
3029 * that 'burst' be at least as big as the largest frame that might be
3030 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
3031 * but having it a bit too small is a problem. Since netdev_get_mtu()
3032 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
3033 * the MTU. We actually add 64, instead of 14, as a guard against
3034 * additional headers getting tacked on somewhere that we're not aware of. */
3035 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
3036 hc->burst = MAX(hc->burst, mtu + 64);
3039 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Queries the kernel for a single HTB class and parses the reply. */
3045 htb_query_class__(const struct netdev *netdev, unsigned int handle,
3046 unsigned int parent, struct htb_class *options,
3047 struct netdev_queue_stats *stats)
3049 struct ofpbuf *reply;
3052 error = tc_query_class(netdev, handle, parent, &reply);
3054 error = htb_parse_tcmsg__(reply, NULL, options, stats);
3055 ofpbuf_delete(reply);
/* tc_install callback: creates the root HTB qdisc, its default class
 * 1:fffe, and the in-memory struct htb. */
3061 htb_tc_install(struct netdev *netdev, const struct smap *details)
3065 error = htb_setup_qdisc__(netdev);
3067 struct htb_class hc;
3069 htb_parse_qdisc_details__(netdev, details, &hc);
3070 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3071 tc_make_handle(1, 0), &hc);
3073 htb_install__(netdev, hc.max_rate);
/* Converts a generic tc_queue back into its enclosing htb_class. */
3079 static struct htb_class *
3080 htb_class_cast__(const struct tc_queue *queue)
3082 return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Creates or refreshes the cached htb_class for 'queue_id' from 'hc'. */
3086 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
3087 const struct htb_class *hc)
3089 struct htb *htb = htb_get__(netdev);
3090 size_t hash = hash_int(queue_id, 0);
3091 struct tc_queue *queue;
3092 struct htb_class *hcp;
3094 queue = tc_find_queue__(netdev, queue_id, hash);
3096 hcp = htb_class_cast__(queue);
3098 hcp = xmalloc(sizeof *hcp);
3099 queue = &hcp->tc_queue;
3100 queue->queue_id = queue_id;
3101 queue->created = time_msec();
3102 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
3105 hcp->min_rate = hc->min_rate;
3106 hcp->max_rate = hc->max_rate;
3107 hcp->burst = hc->burst;
3108 hcp->priority = hc->priority;
/* tc_load callback: reconstructs in-memory HTB state from a kernel dump. */
3112 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3115 struct queue_dump_state state;
3116 struct htb_class hc;
3118 /* Get qdisc options. */
/* 1:fffe is the default class that htb_tc_install() creates. */
3120 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3121 htb_install__(netdev, hc.max_rate);
3124 if (!start_queue_dump(netdev, &state)) {
3127 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
3128 unsigned int queue_id;
3130 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3131 htb_update_queue__(netdev, queue_id, &hc);
3134 finish_queue_dump(&state);
/* tc_destroy callback: frees every cached htb_class. */
3140 htb_tc_destroy(struct tc *tc)
3142 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
3143 struct htb_class *hc, *next;
3145 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
3146 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* qdisc_get callback: reports "max-rate" in bits/s. */
3154 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
3156 const struct htb *htb = htb_get__(netdev);
3157 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
/* qdisc_set callback: reconfigures the default class 1:fffe. */
3162 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
3164 struct htb_class hc;
3167 htb_parse_qdisc_details__(netdev, details, &hc);
3168 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3169 tc_make_handle(1, 0), &hc);
3171 htb_get__(netdev)->max_rate = hc.max_rate;
/* class_get callback: "max-rate" is omitted when it equals "min-rate". */
3177 htb_class_get(const struct netdev *netdev OVS_UNUSED,
3178 const struct tc_queue *queue, struct smap *details)
3180 const struct htb_class *hc = htb_class_cast__(queue);
3182 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3183 if (hc->min_rate != hc->max_rate) {
3184 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3186 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
3188 smap_add_format(details, "priority", "%u", hc->priority);
/* class_set callback: installs kernel class 1:queue_id+1 under 1:fffe and
 * mirrors it in the cache. */
3194 htb_class_set(struct netdev *netdev, unsigned int queue_id,
3195 const struct smap *details)
3197 struct htb_class hc;
3200 error = htb_parse_class_details__(netdev, details, &hc);
3205 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3206 tc_make_handle(1, 0xfffe), &hc);
3211 htb_update_queue__(netdev, queue_id, &hc);
/* class_delete callback: removes the kernel class and the cache entry. */
3216 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
3218 struct htb_class *hc = htb_class_cast__(queue);
3219 struct htb *htb = htb_get__(netdev);
3222 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3224 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* class_get_stats callback: kernel query for class 1:queue_id+1. */
3231 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3232 struct netdev_queue_stats *stats)
3234 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3235 tc_make_handle(1, 0xfffe), NULL, stats);
/* class_dump_stats callback: translates one dumped class into a queue-stats
 * callback invocation. */
3239 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3240 const struct ofpbuf *nlmsg,
3241 netdev_dump_queue_stats_cb *cb, void *aux)
3243 struct netdev_queue_stats stats;
3244 unsigned int handle, major, minor;
3247 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3252 major = tc_get_major(handle);
3253 minor = tc_get_minor(handle);
3254 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3255 (*cb)(minor - 1, &stats, aux);
3260 static const struct tc_ops tc_ops_htb = {
3261 "htb", /* linux_name */
3262 "linux-htb", /* ovs_name */
3263 HTB_N_QUEUES, /* n_queues */
3272 htb_class_get_stats, /* class_get_stats */
3273 htb_class_dump_stats /* class_dump_stats */
3276 /* "linux-hfsc" traffic control class. */
3278 #define HFSC_N_QUEUES 0xf000
/* Per-queue (HFSC class) state, keyed by the embedded tc_queue. */
3286 struct tc_queue tc_queue;
/* Returns the struct hfsc embedded in 'netdev_''s tc. */
3291 static struct hfsc *
3292 hfsc_get__(const struct netdev *netdev_)
3294 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3295 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
/* Converts a generic tc_queue back into its enclosing hfsc_class. */
3298 static struct hfsc_class *
3299 hfsc_class_cast__(const struct tc_queue *queue)
3301 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Allocates and installs a struct hfsc for 'netdev_' capped at
 * 'max_rate'. */
3305 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
3307 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3310 hfsc = xmalloc(sizeof *hfsc);
3311 tc_init(&hfsc->tc, &tc_ops_hfsc);
3312 hfsc->max_rate = max_rate;
3313 netdev->tc = &hfsc->tc;
/* Creates or refreshes the cached hfsc_class for 'queue_id' from 'hc'. */
3317 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
3318 const struct hfsc_class *hc)
3322 struct hfsc_class *hcp;
3323 struct tc_queue *queue;
3325 hfsc = hfsc_get__(netdev);
3326 hash = hash_int(queue_id, 0);
3328 queue = tc_find_queue__(netdev, queue_id, hash);
3330 hcp = hfsc_class_cast__(queue);
3332 hcp = xmalloc(sizeof *hcp);
3333 queue = &hcp->tc_queue;
3334 queue->queue_id = queue_id;
3335 queue->created = time_msec();
3336 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
3339 hcp->min_rate = hc->min_rate;
3340 hcp->max_rate = hc->max_rate;
/* Extracts min/max rates from the RSC/FSC/USC service curves of a kernel
 * HFSC class.  Only linear curves (m1 == 0, d == 0) are supported. */
3344 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
3346 const struct tc_service_curve *rsc, *fsc, *usc;
3347 static const struct nl_policy tca_hfsc_policy[] = {
3349 .type = NL_A_UNSPEC,
3351 .min_len = sizeof(struct tc_service_curve),
3354 .type = NL_A_UNSPEC,
3356 .min_len = sizeof(struct tc_service_curve),
3359 .type = NL_A_UNSPEC,
3361 .min_len = sizeof(struct tc_service_curve),
3364 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3366 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3367 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3368 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3372 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3373 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3374 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
3376 if (rsc->m1 != 0 || rsc->d != 0 ||
3377 fsc->m1 != 0 || fsc->d != 0 ||
3378 usc->m1 != 0 || usc->d != 0) {
3379 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3380 "Non-linear service curves are not supported.");
3384 if (rsc->m2 != fsc->m2) {
3385 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3386 "Real-time service curves are not supported ");
3390 if (rsc->m2 > usc->m2) {
3391 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3392 "Min-rate service curve is greater than "
3393 "the max-rate service curve.");
3397 class->min_rate = fsc->m2;
3398 class->max_rate = usc->m2;
/* Parses an RTM_*TCLASS message for an HFSC class into queue id, options,
 * and statistics (each output optional), mirroring htb_parse_tcmsg__(). */
3403 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3404 struct hfsc_class *options,
3405 struct netdev_queue_stats *stats)
3408 unsigned int handle;
3409 struct nlattr *nl_options;
3411 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3417 unsigned int major, minor;
3419 major = tc_get_major(handle);
3420 minor = tc_get_minor(handle);
/* Queue N is stored as class 1:N+1. */
3421 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3422 *queue_id = minor - 1;
3429 error = hfsc_parse_tca_options__(nl_options, options);
/* Queries the kernel for a single HFSC class and parses the reply. */
3436 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
3437 unsigned int parent, struct hfsc_class *options,
3438 struct netdev_queue_stats *stats)
3441 struct ofpbuf *reply;
3443 error = tc_query_class(netdev, handle, parent, &reply);
3448 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
3449 ofpbuf_delete(reply);
/* Derives qdisc-level configuration: "max-rate" (bits/s) from 'details',
 * else the link's advertised speed, else 100 Mbps. */
3454 hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
3455 struct hfsc_class *class)
3457 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3459 const char *max_rate_s;
3461 max_rate_s = smap_get(details, "max-rate");
3462 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3465 enum netdev_features current;
3467 netdev_linux_read_features(netdev);
3468 current = !netdev->get_features_error ? netdev->current : 0;
3469 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3472 class->min_rate = max_rate;
3473 class->max_rate = max_rate;
/* Derives per-class min/max rates from 'details', clamped to the qdisc's
 * configured ceiling. */
3477 hfsc_parse_class_details__(struct netdev *netdev,
3478 const struct smap *details,
3479 struct hfsc_class * class)
3481 const struct hfsc *hfsc;
3482 uint32_t min_rate, max_rate;
3483 const char *min_rate_s, *max_rate_s;
3485 hfsc = hfsc_get__(netdev);
3486 min_rate_s = smap_get(details, "min-rate");
3487 max_rate_s = smap_get(details, "max-rate");
/* Rates are configured in bits/s; stored internally in bytes/s. */
3489 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3490 min_rate = MAX(min_rate, 1);
3491 min_rate = MIN(min_rate, hfsc->max_rate);
3493 max_rate = (max_rate_s
3494 ? strtoull(max_rate_s, NULL, 10) / 8
3496 max_rate = MAX(max_rate, min_rate);
3497 max_rate = MIN(max_rate, hfsc->max_rate);
3499 class->min_rate = min_rate;
3500 class->max_rate = max_rate;
3505 /* Create an HFSC qdisc.
3507 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3509 hfsc_setup_qdisc__(struct netdev * netdev)
3511 struct tcmsg *tcmsg;
3512 struct ofpbuf request;
3513 struct tc_hfsc_qopt opt;
/* Remove any existing root qdisc first. */
3515 tc_del_qdisc(netdev);
3517 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3518 NLM_F_EXCL | NLM_F_CREATE, &request);
3524 tcmsg->tcm_handle = tc_make_handle(1, 0);
3525 tcmsg->tcm_parent = TC_H_ROOT;
3527 memset(&opt, 0, sizeof opt);
3530 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3531 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3533 return tc_transact(&request, NULL);
3536 /* Create an HFSC class.
3538 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3539 * sc rate <min_rate> ul rate <max_rate>" */
3541 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3542 unsigned int parent, struct hfsc_class *class)
3546 struct tcmsg *tcmsg;
3547 struct ofpbuf request;
3548 struct tc_service_curve min, max;
3550 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3556 tcmsg->tcm_handle = handle;
3557 tcmsg->tcm_parent = parent;
/* Only the slope 'm2' is set here; m1/d initialization is not visible in
 * this excerpt -- NOTE(review): presumably zeroed for linear curves. */
3561 min.m2 = class->min_rate;
3565 max.m2 = class->max_rate;
3567 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3568 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
/* 'min' serves as both the real-time (RSC) and link-share (FSC) curves;
 * 'max' is the upper limit (USC). */
3569 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3570 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3571 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3572 nl_msg_end_nested(&request, opt_offset);
3574 error = tc_transact(&request, NULL);
3576 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3577 "min-rate %ubps, max-rate %ubps (%s)",
3578 netdev_get_name(netdev),
3579 tc_get_major(handle), tc_get_minor(handle),
3580 tc_get_major(parent), tc_get_minor(parent),
3581 class->min_rate, class->max_rate, ovs_strerror(error));
/* tc_install callback: builds the qdisc, its default class 1:fffe, and the
 * in-memory struct hfsc. */
3588 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
3591 struct hfsc_class class;
3593 error = hfsc_setup_qdisc__(netdev);
3599 hfsc_parse_qdisc_details__(netdev, details, &class);
3600 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3601 tc_make_handle(1, 0), &class);
3607 hfsc_install__(netdev, class.max_rate);
/* tc_load callback: reconstructs in-memory HFSC state from a kernel dump. */
3612 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3615 struct queue_dump_state state;
3616 struct hfsc_class hc;
/* 1:fffe is the default class that hfsc_tc_install() creates. */
3619 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3620 hfsc_install__(netdev, hc.max_rate);
3622 if (!start_queue_dump(netdev, &state)) {
3626 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
3627 unsigned int queue_id;
3629 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3630 hfsc_update_queue__(netdev, queue_id, &hc);
3634 finish_queue_dump(&state);
/* tc_destroy callback: frees every cached hfsc_class. */
3639 hfsc_tc_destroy(struct tc *tc)
3642 struct hfsc_class *hc, *next;
3644 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3646 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3647 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* qdisc_get callback: reports "max-rate" in bits/s. */
3656 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
3658 const struct hfsc *hfsc;
3659 hfsc = hfsc_get__(netdev);
3660 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
/* qdisc_set callback: reconfigures the default class 1:fffe. */
3665 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
3668 struct hfsc_class class;
3670 hfsc_parse_qdisc_details__(netdev, details, &class);
3671 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3672 tc_make_handle(1, 0), &class);
3675 hfsc_get__(netdev)->max_rate = class.max_rate;
/* class_get callback: "max-rate" is omitted when it equals "min-rate". */
3682 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3683 const struct tc_queue *queue, struct smap *details)
3685 const struct hfsc_class *hc;
3687 hc = hfsc_class_cast__(queue);
3688 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3689 if (hc->min_rate != hc->max_rate) {
3690 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
/* class_set callback: installs kernel class 1:queue_id+1 under 1:fffe and
 * mirrors it in the cache. */
3696 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3697 const struct smap *details)
3700 struct hfsc_class class;
3702 error = hfsc_parse_class_details__(netdev, details, &class);
3707 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3708 tc_make_handle(1, 0xfffe), &class);
3713 hfsc_update_queue__(netdev, queue_id, &class);
/* class_delete callback: removes the kernel class and the cache entry. */
3718 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3722 struct hfsc_class *hc;
3724 hc = hfsc_class_cast__(queue);
3725 hfsc = hfsc_get__(netdev);
3727 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3729 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* class_get_stats callback: kernel query for class 1:queue_id+1. */
3736 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3737 struct netdev_queue_stats *stats)
3739 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3740 tc_make_handle(1, 0xfffe), NULL, stats);
/* class_dump_stats callback: translates one dumped class into a queue-stats
 * callback invocation. */
3744 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3745 const struct ofpbuf *nlmsg,
3746 netdev_dump_queue_stats_cb *cb, void *aux)
3748 struct netdev_queue_stats stats;
3749 unsigned int handle, major, minor;
3752 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3757 major = tc_get_major(handle);
3758 minor = tc_get_minor(handle);
3759 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3760 (*cb)(minor - 1, &stats, aux);
3765 static const struct tc_ops tc_ops_hfsc = {
3766 "hfsc", /* linux_name */
3767 "linux-hfsc", /* ovs_name */
3768 HFSC_N_QUEUES, /* n_queues */
3769 hfsc_tc_install, /* tc_install */
3770 hfsc_tc_load, /* tc_load */
3771 hfsc_tc_destroy, /* tc_destroy */
3772 hfsc_qdisc_get, /* qdisc_get */
3773 hfsc_qdisc_set, /* qdisc_set */
3774 hfsc_class_get, /* class_get */
3775 hfsc_class_set, /* class_set */
3776 hfsc_class_delete, /* class_delete */
3777 hfsc_class_get_stats, /* class_get_stats */
3778 hfsc_class_dump_stats /* class_dump_stats */
3781 /* "linux-default" traffic control class.
3783 * This class represents the default, unnamed Linux qdisc. It corresponds to
3784 * the "" (empty string) QoS type in the OVS database. */
3787 default_install__(struct netdev *netdev_)
3789 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3790 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
3792 /* Nothing but a tc class implementation is allowed to write to a tc. This
3793 * class never does that, so we can legitimately use a const tc object. */
3794 netdev->tc = CONST_CAST(struct tc *, &tc);
/* tc_install callback: nothing to configure in the kernel. */
3798 default_tc_install(struct netdev *netdev,
3799 const struct smap *details OVS_UNUSED)
3801 default_install__(netdev);
/* tc_load callback: same no-op installation; the kernel qdisc is left
 * untouched. */
3806 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3808 default_install__(netdev);
3812 static const struct tc_ops tc_ops_default = {
3813 NULL, /* linux_name */
3818 NULL, /* tc_destroy */
3819 NULL, /* qdisc_get */
3820 NULL, /* qdisc_set */
3821 NULL, /* class_get */
3822 NULL, /* class_set */
3823 NULL, /* class_delete */
3824 NULL, /* class_get_stats */
3825 NULL /* class_dump_stats */
3828 /* "linux-other" traffic control class.
/* tc_load callback for qdiscs OVS does not recognize: record them as
 * "linux-other" and leave the kernel configuration alone. */
3833 other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
3835 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3836 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
3838 /* Nothing but a tc class implementation is allowed to write to a tc. This
3839 * class never does that, so we can legitimately use a const tc object. */
3840 netdev->tc = CONST_CAST(struct tc *, &tc);
3844 static const struct tc_ops tc_ops_other = {
3845 NULL, /* linux_name */
3846 "linux-other", /* ovs_name */
3848 NULL, /* tc_install */
3850 NULL, /* tc_destroy */
3851 NULL, /* qdisc_get */
3852 NULL, /* qdisc_set */
3853 NULL, /* class_get */
3854 NULL, /* class_set */
3855 NULL, /* class_delete */
3856 NULL, /* class_get_stats */
3857 NULL /* class_dump_stats */
3860 /* Traffic control. */
3862 /* Number of kernel "tc" ticks per second. */
3863 static double ticks_per_s;
3865 /* Number of kernel "jiffies" per second. This is used for the purpose of
3866 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3867 * one jiffy's worth of data.
3869 * There are two possibilities here:
3871 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3872 * approximate range of 100 to 1024. That means that we really need to
3873 * make sure that the qdisc can buffer that much data.
3875 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3876 * has finely granular timers and there's no need to fudge additional room
3877 * for buffers. (There's no extra effort needed to implement that: the
3878 * large 'buffer_hz' is used as a divisor, so practically any number will
3879 * come out as 0 in the division. Small integer results in the case of
3880 * really high dividends won't have any real effect anyhow.)
3882 static unsigned int buffer_hz;
/* Composes the tc handle with major part 'major' and minor part 'minor',
 * i.e. the handle written "major:minor" in tc(8) syntax. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    unsigned int handle = TC_H_MAKE(major << 16, minor);

    return handle;
}
/* Extracts the major number from tc handle 'handle'. */
static unsigned int
tc_get_major(unsigned int handle)
{
    unsigned int major_bits = TC_H_MAJ(handle);

    return major_bits >> 16;
}
/* Extracts the minor number from tc handle 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    unsigned int minor_bits = TC_H_MIN(handle);

    return minor_bits;
}
/* Composes an rtnetlink tc request of the given 'type' (e.g. RTM_NEWQDISC)
 * with the given NLM_F_* 'flags' for 'netdev', initializing 'request' to hold
 * it.  Returns a pointer to the tcmsg payload inside 'request'; the caller
 * must still fill in tcm_handle and tcm_parent and eventually uninit
 * 'request' (tc_transact() does the latter).
 * NOTE(review): the ifindex-lookup error path between get_ifindex() and
 * ofpbuf_init() is elided in this excerpt — presumably it returns NULL on
 * failure; confirm against the full source. */
3905 static struct tcmsg *
3906 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3907 struct ofpbuf *request)
3909 struct tcmsg *tcmsg;
3913 error = get_ifindex(netdev, &ifindex);
3918 ofpbuf_init(request, 512);
3919 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3920 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3921 tcmsg->tcm_family = AF_UNSPEC;
3922 tcmsg->tcm_ifindex = ifindex;
3923 /* Caller should fill in tcmsg->tcm_handle. */
3924 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' on the NETLINK_ROUTE socket and, if 'replyp' is nonnull,
 * stores the kernel's reply there.  Always uninits 'request', so this
 * consumes the buffer built by tc_make_request().  Returns 0 on success or a
 * positive errno value. */
3930 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3932 int error = nl_transact(NETLINK_ROUTE, request, replyp);
3933 ofpbuf_uninit(request);
3937 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3938 * policing configuration.
3940 * This function is equivalent to running the following when 'add' is true:
3941 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3943 * This function is equivalent to running the following when 'add' is false:
3944 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3946 * The configuration and stats may be seen with the following command:
3947 * /sbin/tc -s qdisc show dev <devname>
3949 * Returns 0 if successful, otherwise a positive errno value.
3952 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3954 struct ofpbuf request;
3955 struct tcmsg *tcmsg;
3957 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
/* NLM_F_EXCL | NLM_F_CREATE makes addition fail with EEXIST rather than
 * silently replacing an existing qdisc; deletion needs no extra flags. */
3958 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3960 tcmsg = tc_make_request(netdev, type, flags, &request);
/* ffff: is the conventional handle for the ingress qdisc. */
3964 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3965 tcmsg->tcm_parent = TC_H_INGRESS;
3966 nl_msg_put_string(&request, TCA_KIND, "ingress");
3967 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3969 error = tc_transact(&request, NULL);
3971 /* If we're deleting the qdisc, don't worry about some of the
3972 * error conditions.  ENOENT/EINVAL here mean there was no ingress
3973 * qdisc to delete, which is the desired end state anyway. */
3973 if (!add && (error == ENOENT || error == EINVAL)) {
3982 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
3985 * This function is equivalent to running:
3986 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3987 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3990 * The configuration and stats may be seen with the following command:
3991 * /sbin/tc -s filter show dev <devname> parent ffff:
3993 * Returns 0 if successful, otherwise a positive errno value.
3996 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3998 struct tc_police tc_police;
3999 struct ofpbuf request;
4000 struct tcmsg *tcmsg;
4001 size_t basic_offset;
4002 size_t police_offset;
4006 memset(&tc_police, 0, sizeof tc_police);
/* TC_POLICE_SHOT: drop packets that exceed the configured rate. */
4007 tc_police.action = TC_POLICE_SHOT;
4008 tc_police.mtu = mtu;
/* Kernel rate is in bytes per second: kbit/s * 1000 / 8. */
4009 tc_fill_rate(&tc_police.rate, ((uint64_t) kbits_rate * 1000)/8, mtu);
/* NOTE(review): 'kbits_burst * 1024' is passed where tc_bytes_to_ticks()
 * expects a byte count; verify the intended burst unit (kbit vs. kbyte)
 * against the tc command above. */
4010 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
4011 kbits_burst * 1024);
4013 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
4014 NLM_F_EXCL | NLM_F_CREATE, &request);
/* Attach the filter under the ingress qdisc (ffff:) at priority 49,
 * matching all protocols. */
4018 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
4019 tcmsg->tcm_info = tc_make_handle(49,
4020 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
4022 nl_msg_put_string(&request, TCA_KIND, "basic");
4023 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4024 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
4025 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
4026 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
4027 nl_msg_end_nested(&request, police_offset);
4028 nl_msg_end_nested(&request, basic_offset);
4030 error = tc_transact(&request, NULL);
4041 /* The values in psched are not individually very meaningful, but they are
4042 * important. The tables below show some values seen in the wild.
4046 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
4047 * (Before that, there are hints that it was 1000000000.)
4049 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
4053 * -----------------------------------
4054 * [1] 000c8000 000f4240 000f4240 00000064
4055 * [2] 000003e8 00000400 000f4240 3b9aca00
4056 * [3] 000003e8 00000400 000f4240 3b9aca00
4057 * [4] 000003e8 00000400 000f4240 00000064
4058 * [5] 000003e8 00000040 000f4240 3b9aca00
4059 * [6] 000003e8 00000040 000f4240 000000f9
4061 * a b c d ticks_per_s buffer_hz
4062 * ------- --------- ---------- ------------- ----------- -------------
4063 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
4064 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4065 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4066 * [4] 1,000 1,024 1,000,000 100 976,562 100
4067 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
4068 * [6] 1,000 64 1,000,000 249 15,625,000 249
4070 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
4071 * [2] 2.6.26-1-686-bigmem from Debian lenny
4072 * [3] 2.6.26-2-sparc64 from Debian lenny
4073 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
4074 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
4075 * [6] 2.6.34 from kernel.org on KVM
/* Body of the /proc/net/psched reader (function header is outside this
 * excerpt): parses the four hex psched parameters exactly once and derives
 * the module-level 'ticks_per_s' and 'buffer_hz' values documented above.
 * Thread-safe via ovsthread_once. */
4077 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4078 static const char fn[] = "/proc/net/psched";
4079 unsigned int a, b, c, d;
/* Already initialized by another thread (or an earlier call): nothing to
 * do. */
4082 if (!ovsthread_once_start(&once)) {
4089 stream = fopen(fn, "r");
4091 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
4095 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
4096 VLOG_WARN("%s: read failed", fn);
4100 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
4104 VLOG_WARN("%s: invalid scheduler parameters", fn);
/* ticks_per_s = a * (c / b): see the value table in the comment above. */
4108 ticks_per_s = (double) a * c / b;
4112 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
4115 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
4118 ovsthread_once_done(&once);
4121 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
4122 * rate of 'rate' bytes per second. */
4124 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
/* Relies on 'ticks_per_s' having been initialized from /proc/net/psched. */
4127 return (rate * ticks) / ticks_per_s;
4130 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
4131 * rate of 'rate' bytes per second. */
4133 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
/* Guard against division by zero when 'rate' is 0; the 64-bit intermediate
 * avoids overflow of ticks_per_s * size. */
4136 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
4139 /* Returns the number of bytes that need to be reserved for qdisc buffering at
4140 * a transmission rate of 'rate' bytes per second. */
4142 tc_buffer_per_jiffy(unsigned int rate)
/* When 'buffer_hz' is huge (finely granular kernel timers), this rounds to 0,
 * which is intended: no extra buffering is needed.  See the 'buffer_hz'
 * comment above. */
4145 return rate / buffer_hz;
4148 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
4149 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
4150 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
4151 * stores NULL into it if it is absent.
4153 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
4156 * Returns 0 if successful, otherwise a positive errno value. */
4158 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
4159 struct nlattr **options)
/* TCA_KIND is mandatory; TCA_OPTIONS is optional (some qdiscs omit it). */
4161 static const struct nl_policy tca_policy[] = {
4162 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
4163 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
4165 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
/* Attributes begin after the netlink header plus the fixed tcmsg. */
4167 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4168 tca_policy, ta, ARRAY_SIZE(ta))) {
4169 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
4174 *kind = nl_attr_get_string(ta[TCA_KIND]);
4178 *options = ta[TCA_OPTIONS];
4193 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
4194 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
4195 * into '*options', and its queue statistics into '*stats'. Any of the output
4196 * arguments may be null.
4198 * Returns 0 if successful, otherwise a positive errno value. */
4200 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
4201 struct nlattr **options, struct netdev_queue_stats *stats)
4203 static const struct nl_policy tca_policy[] = {
4204 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
4205 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
4207 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4209 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4210 tca_policy, ta, ARRAY_SIZE(ta))) {
4211 VLOG_WARN_RL(&rl, "failed to parse class message");
/* The fixed tcmsg header carries the class handle itself. */
4216 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
4217 *handlep = tc->tcm_handle;
4221 *options = ta[TCA_OPTIONS];
4225 const struct gnet_stats_queue *gsq;
4226 struct gnet_stats_basic gsb;
4228 static const struct nl_policy stats_policy[] = {
4229 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
4230 .min_len = sizeof gsb },
4231 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
4232 .min_len = sizeof *gsq },
4234 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
4236 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
4237 sa, ARRAY_SIZE(sa))) {
4238 VLOG_WARN_RL(&rl, "failed to parse class stats");
4242 /* Alignment issues screw up the length of struct gnet_stats_basic on
4243 * some arch/bitsize combinations. Newer versions of Linux have a
4244 * struct gnet_stats_basic_packed, but we can't depend on that. The
4245 * easiest thing to do is just to make a copy. */
4246 memset(&gsb, 0, sizeof gsb);
4247 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
4248 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
4249 stats->tx_bytes = gsb.bytes;
4250 stats->tx_packets = gsb.packets;
4252 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
/* Queue drops are the closest analogue to tx_errors for a tc class. */
4253 stats->tx_errors = gsq->drops;
/* Error path: report zeroed stats rather than stale garbage. */
4263 memset(stats, 0, sizeof *stats);
4268 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
4271 tc_query_class(const struct netdev *netdev,
4272 unsigned int handle, unsigned int parent,
4273 struct ofpbuf **replyp)
4275 struct ofpbuf request;
4276 struct tcmsg *tcmsg;
/* NLM_F_ECHO asks the kernel to send back the class description. */
4279 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
4283 tcmsg->tcm_handle = handle;
4284 tcmsg->tcm_parent = parent;
4286 error = tc_transact(&request, replyp);
/* Log failures (rate-limited) with the human-readable major:minor form. */
4288 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
4289 netdev_get_name(netdev),
4290 tc_get_major(handle), tc_get_minor(handle),
4291 tc_get_major(parent), tc_get_minor(parent),
4292 ovs_strerror(error));
4297 /* Equivalent to "tc class del dev <name> handle <handle>".  Returns 0 on
4297 * success, otherwise a positive errno value. */
4299 tc_delete_class(const struct netdev *netdev, unsigned int handle)
4301 struct ofpbuf request;
4302 struct tcmsg *tcmsg;
4305 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
4309 tcmsg->tcm_handle = handle;
/* tcm_parent is 0: the kernel locates the class by handle alone. */
4310 tcmsg->tcm_parent = 0;
4312 error = tc_transact(&request, NULL);
4314 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
4315 netdev_get_name(netdev),
4316 tc_get_major(handle), tc_get_minor(handle),
4317 ovs_strerror(error));
4322 /* Equivalent to "tc qdisc del dev <name> root".  Also destroys any cached
4322 * OVS-side tc state for the device so that it will be re-queried later. */
4324 tc_del_qdisc(struct netdev *netdev_)
4326 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4327 struct ofpbuf request;
4328 struct tcmsg *tcmsg;
4331 tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
/* 1:0 is the handle OVS assigns to qdiscs it creates (see
 * tc_query_qdisc()). */
4335 tcmsg->tcm_handle = tc_make_handle(1, 0);
4336 tcmsg->tcm_parent = TC_H_ROOT;
4338 error = tc_transact(&request, NULL);
4339 if (error == EINVAL) {
4340 /* EINVAL probably means that the default qdisc was in use, in which
4341 * case we've accomplished our purpose. */
/* On success, drop the cached tc object so state is re-read on demand. */
4344 if (!error && netdev->tc) {
4345 if (netdev->tc->ops->tc_destroy) {
4346 netdev->tc->ops->tc_destroy(netdev->tc);
4353 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
4354 * kernel to determine what they are. Returns 0 if successful, otherwise a
4355 * positive errno value. */
4357 tc_query_qdisc(const struct netdev *netdev_)
4359 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4360 struct ofpbuf request, *qdisc;
4361 const struct tc_ops *ops;
4362 struct tcmsg *tcmsg;
4370 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4371 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4372 * 2.6.35 without that fix backported to it.
4374 * To avoid the OOPS, we must not make a request that would attempt to dump
4375 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4376 * few others. There are a few ways that I can see to do this, but most of
4377 * them seem to be racy (and if you lose the race the kernel OOPSes). The
4378 * technique chosen here is to assume that any non-default qdisc that we
4379 * create will have a class with handle 1:0. The built-in qdiscs only have
4380 * a class with handle 0:0.
4382 * We could check for Linux 2.6.35+ and use a more straightforward method
4384 tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
4388 tcmsg->tcm_handle = tc_make_handle(1, 0);
4389 tcmsg->tcm_parent = 0;
4391 /* Figure out what tc class to instantiate. */
4392 error = tc_transact(&request, &qdisc);
4396 error = tc_parse_qdisc(qdisc, &kind, NULL);
/* Reply received but unparseable: fall back to the opaque "other" ops. */
4398 ops = &tc_ops_other;
4400 ops = tc_lookup_linux_name(kind);
4402 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4403 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
4405 ops = &tc_ops_other;
4408 } else if (error == ENOENT) {
4409 /* Either it's a built-in qdisc, or it's a qdisc set up by some
4410 * other entity that doesn't have a handle 1:0. We will assume
4411 * that it's the system default qdisc. */
4412 ops = &tc_ops_default;
4415 /* Who knows? Maybe the device got deleted. */
4416 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
4417 netdev_get_name(netdev_), ovs_strerror(error));
4418 ops = &tc_ops_other;
4421 /* Instantiate it. */
4422 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
/* Invariant: tc_load() succeeds exactly when it attaches a tc object. */
4423 ovs_assert((load_error == 0) == (netdev->tc != NULL));
4424 ofpbuf_delete(qdisc);
/* Prefer reporting the query error; otherwise any load error. */
4426 return error ? error : load_error;
4429 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4430 approximate the time to transmit packets of various lengths. For an MTU of
4431 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4432 represents two possible packet lengths; for a MTU of 513 through 1024, four
4433 possible lengths; and so on.
4435 Returns, for the specified 'mtu', the number of bits that packet lengths
4436 need to be shifted right to fit within such a 256-entry table. */
4438 tc_calc_cell_log(unsigned int mtu)
/* An 'mtu' of 0 means "use the Ethernet default payload size". */
4443 mtu = ETH_PAYLOAD_MAX;
/* Account for L2 framing (and a possible VLAN tag) on top of the payload. */
4445 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
/* Find the smallest cell_log such that mtu >> cell_log < 256. */
4447 for (cell_log = 0; mtu >= 256; cell_log++) {
4454 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
4454 * of 'mtu'. */
4457 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4459 memset(rate, 0, sizeof *rate);
4460 rate->cell_log = tc_calc_cell_log(mtu);
4461 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4462 /* rate->cell_align = 0; */ /* distro headers. */
/* mpu = minimum packet unit: no frame is billed below the Ethernet minimum. */
4463 rate->mpu = ETH_TOTAL_MIN;
4467 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4468 * attribute of the specified "type".
4470 * See tc_calc_cell_log() above for a description of "rtab"s. */
4472 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
4477 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
4478 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
/* Entry i covers packet sizes up to (i + 1) << cell_log bytes. */
4479 unsigned packet_size = (i + 1) << rate->cell_log;
/* Clamp to the minimum packet unit, matching the kernel's billing. */
4480 if (packet_size < rate->mpu) {
4481 packet_size = rate->mpu;
/* Each entry is the transmission time, in ticks, for that packet size. */
4483 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
4487 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
4488 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
4489 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
4492 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
/* The buffer must hold at least one jiffy's worth of traffic plus one MTU,
 * or the qdisc will drop packets it should have queued. */
4494 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
4495 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
4498 /* Linux-only functions declared in netdev-linux.h */
4500 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4501 * 'enable' is true, the bit is set. Otherwise, it is cleared.  Uses a
4501 * get/set/get sequence: re-reading the flags afterward detects drivers that
4501 * silently ignore the change. */
4503 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4504 const char *flag_name, bool enable)
4506 const char *netdev_name = netdev_get_name(netdev);
4507 struct ethtool_value evalue;
/* Step 1: read the current ethtool flags. */
4511 COVERAGE_INC(netdev_get_ethtool);
4512 memset(&evalue, 0, sizeof evalue);
4513 error = netdev_linux_do_ethtool(netdev_name,
4514 (struct ethtool_cmd *)&evalue,
4515 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
/* Step 2: write the flags back with 'flag' set or cleared as requested. */
4520 COVERAGE_INC(netdev_set_ethtool);
4521 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4522 error = netdev_linux_do_ethtool(netdev_name,
4523 (struct ethtool_cmd *)&evalue,
4524 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
/* Step 3: read again to verify the driver actually applied the change. */
4529 COVERAGE_INC(netdev_get_ethtool);
4530 memset(&evalue, 0, sizeof evalue);
4531 error = netdev_linux_do_ethtool(netdev_name,
4532 (struct ethtool_cmd *)&evalue,
4533 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4538 if (new_flags != evalue.data) {
4539 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4540 "device %s failed", enable ? "enable" : "disable",
4541 flag_name, netdev_name);
4548 /* Utility functions. */
4550 /* Copies 'src' into 'dst', performing format conversion in the process.
4550 * This is a straight field-by-field widening copy from the kernel's
4550 * 32-bit rtnl_link_stats counters into OVS's 64-bit netdev_stats. */
4552 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4553 const struct rtnl_link_stats *src)
4555 dst->rx_packets = src->rx_packets;
4556 dst->tx_packets = src->tx_packets;
4557 dst->rx_bytes = src->rx_bytes;
4558 dst->tx_bytes = src->tx_bytes;
4559 dst->rx_errors = src->rx_errors;
4560 dst->tx_errors = src->tx_errors;
4561 dst->rx_dropped = src->rx_dropped;
4562 dst->tx_dropped = src->tx_dropped;
4563 dst->multicast = src->multicast;
4564 dst->collisions = src->collisions;
4565 dst->rx_length_errors = src->rx_length_errors;
4566 dst->rx_over_errors = src->rx_over_errors;
4567 dst->rx_crc_errors = src->rx_crc_errors;
4568 dst->rx_frame_errors = src->rx_frame_errors;
4569 dst->rx_fifo_errors = src->rx_fifo_errors;
4570 dst->rx_missed_errors = src->rx_missed_errors;
4571 dst->tx_aborted_errors = src->tx_aborted_errors;
4572 dst->tx_carrier_errors = src->tx_carrier_errors;
4573 dst->tx_fifo_errors = src->tx_fifo_errors;
4574 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4575 dst->tx_window_errors = src->tx_window_errors;
/* Retrieves interface statistics for 'netdev_' into 'stats' by sending an
 * RTM_GETLINK request over rtnetlink and extracting the IFLA_STATS
 * attribute from the reply.  Returns 0 on success, otherwise a positive
 * errno value. */
4579 get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
4581 struct ofpbuf request;
4582 struct ofpbuf *reply;
4585 ofpbuf_init(&request, 0);
4586 nl_msg_put_nlmsghdr(&request,
4587 sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
4588 RTM_GETLINK, NLM_F_REQUEST);
/* The ifinfomsg is left zeroed; the device is selected by name instead. */
4589 ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
4590 nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
4591 error = nl_transact(NETLINK_ROUTE, &request, &reply);
4592 ofpbuf_uninit(&request);
/* Skip the netlink + ifinfomsg headers to reach the attribute list. */
4597 if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
4598 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS);
4599 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
4600 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
4603 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4607 VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");
4612 ofpbuf_delete(reply);
/* Reads the IFF_* interface flags of 'dev' into '*flags' via the
 * SIOCGIFFLAGS ioctl.  Returns 0 on success, otherwise a positive errno
 * value. */
4617 get_flags(const struct netdev *dev, unsigned int *flags)
4623 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
4625 *flags = ifr.ifr_flags;
/* Sets the IFF_* interface flags of device 'name' to 'flags' via the
 * SIOCSIFFLAGS ioctl.  Returns 0 on success, otherwise a positive errno
 * value. */
4631 set_flags(const char *name, unsigned int flags)
4635 ifr.ifr_flags = flags;
4636 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
/* Looks up the kernel ifindex for device 'netdev_name' via SIOCGIFINDEX.
 * Returns the (positive) ifindex on success; on failure, logs a rate-limited
 * warning and (per the callers in get_ifindex()) returns a negative errno
 * value. */
4640 do_get_ifindex(const char *netdev_name)
4645 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4646 COVERAGE_INC(netdev_get_ifindex);
4648 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
4650 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4651 netdev_name, ovs_strerror(error));
4654 return ifr.ifr_ifindex;
/* Caching wrapper around do_get_ifindex(): stores both the ifindex and any
 * lookup error in the netdev_linux struct so the ioctl runs only once per
 * device.  Stores the ifindex in '*ifindexp' and returns 0 on success,
 * otherwise a positive errno value. */
4658 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4660 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4662 if (!(netdev->cache_valid & VALID_IFINDEX)) {
4663 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
/* A negative result from do_get_ifindex() encodes -errno; cache it. */
4666 netdev->get_ifindex_error = -ifindex;
4667 netdev->ifindex = 0;
4669 netdev->get_ifindex_error = 0;
4670 netdev->ifindex = ifindex;
/* Both success and failure are cached to avoid repeating the ioctl. */
4672 netdev->cache_valid |= VALID_IFINDEX;
4675 *ifindexp = netdev->ifindex;
4676 return netdev->get_ifindex_error;
/* Reads the Ethernet hardware address of device 'netdev_name' into 'ea'
 * via the SIOCGIFHWADDR ioctl.  Returns 0 on success, otherwise a positive
 * errno value. */
4680 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4686 memset(&ifr, 0, sizeof ifr);
4687 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4688 COVERAGE_INC(netdev_get_hwaddr);
4689 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
4691 /* ENODEV probably means that a vif disappeared asynchronously and
4692 * hasn't been removed from the database yet, so reduce the log level
4693 * to INFO for that case. */
4694 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
4695 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4696 netdev_name, ovs_strerror(error));
/* Only Ethernet (or unspecified) address families are usable here. */
4699 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4700 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4701 VLOG_WARN("%s device has unknown hardware address family %d",
4702 netdev_name, hwaddr_family);
4704 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the Ethernet hardware address of device 'netdev_name' to 'mac' via
 * the SIOCSIFHWADDR ioctl.  Returns 0 on success, otherwise a positive
 * errno value. */
4709 set_etheraddr(const char *netdev_name,
4710 const uint8_t mac[ETH_ADDR_LEN])
4715 memset(&ifr, 0, sizeof ifr);
4716 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
/* The kernel requires the address family to be declared as Ethernet. */
4717 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
4718 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4719 COVERAGE_INC(netdev_set_hwaddr);
4720 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
4722 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4723 netdev_name, ovs_strerror(error));
/* Issues ethtool command 'cmd' (e.g. ETHTOOL_GFLAGS) for device 'name',
 * using 'ecmd' as the in/out command buffer.  'cmd_name' is only used for
 * log messages.  Returns 0 on success, otherwise a positive errno value;
 * EOPNOTSUPP (driver lacks the operation) is deliberately not logged. */
4729 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4730 int cmd, const char *cmd_name)
4735 memset(&ifr, 0, sizeof ifr);
4736 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
/* SIOCETHTOOL passes the ethtool command struct via ifr_data. */
4737 ifr.ifr_data = (caddr_t) ecmd;
4740 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
4742 if (error != EOPNOTSUPP) {
4743 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4744 "failed: %s", cmd_name, name, ovs_strerror(error));
4746 /* The device doesn't support this operation. That's pretty
4747 * common, so there's no point in logging anything. */
/* Queries an IPv4 address of 'netdev' using ioctl 'cmd' (e.g. SIOCGIFADDR
 * or SIOCGIFNETMASK; 'cmd_name' is for logging) and stores the result in
 * '*ip'.  Returns 0 on success, otherwise a positive errno value. */
4754 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4755 int cmd, const char *cmd_name)
4760 ifr.ifr_addr.sa_family = AF_INET;
4761 error = af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
/* ALIGNED_CAST avoids a strict-alignment warning: ifr_addr is a plain
 * sockaddr that actually holds a sockaddr_in here. */
4763 const struct sockaddr_in *sin = ALIGNED_CAST(struct sockaddr_in *,
4765 *ip = sin->sin_addr;
4770 /* Returns an AF_PACKET raw socket or a negative errno value. */
4772 af_packet_sock(void)
4774 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4777 if (ovsthread_once_start(&once)) {
4778 sock = socket(AF_PACKET, SOCK_RAW, 0);
4780 int error = set_nonblocking(sock);
4787 VLOG_ERR("failed to create packet socket: %s",
4788 ovs_strerror(errno));
4790 ovsthread_once_done(&once);