/*
 * Copyright (c) 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include <config.h>

#include "netdev-linux.h"

#include <errno.h>
#include <fcntl.h>
#include <arpa/inet.h>
#include <inttypes.h>
#include <linux/filter.h>
#include <linux/gen_stats.h>
#include <linux/if_ether.h>
#include <linux/if_tun.h>
#include <linux/types.h>
#include <linux/ethtool.h>
#include <linux/mii.h>
#include <linux/pkt_cls.h>
#include <linux/pkt_sched.h>
#include <linux/rtnetlink.h>
#include <linux/sockios.h>
#include <linux/version.h>
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <netpacket/packet.h>
#include <net/if.h>
#include <net/if_arp.h>
#include <net/if_packet.h>
#include <net/route.h>
#include <netinet/in.h>
#include <poll.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include "dpif-linux.h"
#include "dynamic-string.h"
#include "fatal-signal.h"
#include "netdev-provider.h"
#include "netdev-vport.h"
#include "netlink-notifier.h"
#include "netlink-socket.h"
#include "openflow/openflow.h"
#include "poll-loop.h"
#include "rtnetlink-link.h"
#include "socket-util.h"
#include "unaligned.h"
74 VLOG_DEFINE_THIS_MODULE(netdev_linux);
76 COVERAGE_DEFINE(netdev_set_policing);
77 COVERAGE_DEFINE(netdev_arp_lookup);
78 COVERAGE_DEFINE(netdev_get_ifindex);
79 COVERAGE_DEFINE(netdev_get_hwaddr);
80 COVERAGE_DEFINE(netdev_set_hwaddr);
81 COVERAGE_DEFINE(netdev_get_ethtool);
82 COVERAGE_DEFINE(netdev_set_ethtool);
/* These were introduced in Linux 2.6.14, so they might be missing if we have
 * old headers. */
#ifndef ADVERTISED_Pause
#define ADVERTISED_Pause                (1 << 13)
#endif
#ifndef ADVERTISED_Asym_Pause
#define ADVERTISED_Asym_Pause           (1 << 14)
#endif

/* These were introduced in Linux 2.6.24, so they might be missing if we
 * have old headers. */
#ifndef ETHTOOL_GFLAGS
#define ETHTOOL_GFLAGS       0x00000025 /* Get flags bitmap(ethtool_value) */
#endif
#ifndef ETHTOOL_SFLAGS
#define ETHTOOL_SFLAGS       0x00000026 /* Set flags bitmap(ethtool_value) */
#endif

/* This was introduced in Linux 2.6.25, so it might be missing if we have old
 * headers. */
#ifndef TC_RTAB_SIZE
#define TC_RTAB_SIZE 1024
#endif
/* Bits in 'netdev_linux.cache_valid': a set bit means the corresponding
 * cached field below is up to date. */
enum {
    VALID_IFINDEX           = 1 << 0,
    VALID_ETHERADDR         = 1 << 1,
    VALID_IN4               = 1 << 2,
    VALID_IN6               = 1 << 3,
    VALID_MTU               = 1 << 4,   /* Referenced by get/set MTU below. */
    VALID_POLICING          = 1 << 5,
    VALID_VPORT_STAT_ERROR  = 1 << 6,
    VALID_DRVINFO           = 1 << 7,
    VALID_FEATURES          = 1 << 8,
};
121 /* Traffic control. */
123 /* An instance of a traffic control class. Always associated with a particular
126 * Each TC implementation subclasses this with whatever additional data it
129 const struct tc_ops *ops;
130 struct hmap queues; /* Contains "struct tc_queue"s.
131 * Read by generic TC layer.
132 * Written only by TC implementation. */
135 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
137 /* One traffic control queue.
139 * Each TC implementation subclasses this with whatever additional data it
142 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
143 unsigned int queue_id; /* OpenFlow queue ID. */
144 long long int created; /* Time queue was created, in msecs. */
147 /* A particular kind of traffic control. Each implementation generally maps to
148 * one particular Linux qdisc class.
150 * The functions below return 0 if successful or a positive errno value on
151 * failure, except where otherwise noted. All of them must be provided, except
152 * where otherwise noted. */
154 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
155 * This is null for tc_ops_default and tc_ops_other, for which there are no
156 * appropriate values. */
157 const char *linux_name;
159 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
160 const char *ovs_name;
162 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
163 * queues. The queues are numbered 0 through n_queues - 1. */
164 unsigned int n_queues;
166 /* Called to install this TC class on 'netdev'. The implementation should
167 * make the Netlink calls required to set up 'netdev' with the right qdisc
168 * and configure it according to 'details'. The implementation may assume
169 * that the current qdisc is the default; that is, there is no need for it
170 * to delete the current qdisc before installing itself.
172 * The contents of 'details' should be documented as valid for 'ovs_name'
173 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
174 * (which is built as ovs-vswitchd.conf.db(8)).
176 * This function must return 0 if and only if it sets 'netdev->tc' to an
177 * initialized 'struct tc'.
179 * (This function is null for tc_ops_other, which cannot be installed. For
180 * other TC classes it should always be nonnull.) */
181 int (*tc_install)(struct netdev *netdev, const struct smap *details);
183 /* Called when the netdev code determines (through a Netlink query) that
184 * this TC class's qdisc is installed on 'netdev', but we didn't install
185 * it ourselves and so don't know any of the details.
187 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
188 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
189 * implementation should parse the other attributes of 'nlmsg' as
190 * necessary to determine its configuration. If necessary it should also
191 * use Netlink queries to determine the configuration of queues on
194 * This function must return 0 if and only if it sets 'netdev->tc' to an
195 * initialized 'struct tc'. */
196 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
198 /* Destroys the data structures allocated by the implementation as part of
199 * 'tc'. (This includes destroying 'tc->queues' by calling
202 * The implementation should not need to perform any Netlink calls. If
203 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
204 * (But it may not be desirable.)
206 * This function may be null if 'tc' is trivial. */
207 void (*tc_destroy)(struct tc *tc);
209 /* Retrieves details of 'netdev->tc' configuration into 'details'.
211 * The implementation should not need to perform any Netlink calls, because
212 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
213 * cached the configuration.
215 * The contents of 'details' should be documented as valid for 'ovs_name'
216 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
217 * (which is built as ovs-vswitchd.conf.db(8)).
219 * This function may be null if 'tc' is not configurable.
221 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
223 /* Reconfigures 'netdev->tc' according to 'details', performing any
224 * required Netlink calls to complete the reconfiguration.
226 * The contents of 'details' should be documented as valid for 'ovs_name'
227 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
228 * (which is built as ovs-vswitchd.conf.db(8)).
230 * This function may be null if 'tc' is not configurable.
232 int (*qdisc_set)(struct netdev *, const struct smap *details);
234 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
235 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
237 * The contents of 'details' should be documented as valid for 'ovs_name'
238 * in the "other_config" column in the "Queue" table in
239 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
241 * The implementation should not need to perform any Netlink calls, because
242 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
243 * cached the queue configuration.
245 * This function may be null if 'tc' does not have queues ('n_queues' is
247 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
248 struct smap *details);
250 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
251 * 'details', perfoming any required Netlink calls to complete the
252 * reconfiguration. The caller ensures that 'queue_id' is less than
255 * The contents of 'details' should be documented as valid for 'ovs_name'
256 * in the "other_config" column in the "Queue" table in
257 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
259 * This function may be null if 'tc' does not have queues or its queues are
260 * not configurable. */
261 int (*class_set)(struct netdev *, unsigned int queue_id,
262 const struct smap *details);
264 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
265 * tc_queue's within 'netdev->tc->queues'.
267 * This function may be null if 'tc' does not have queues or its queues
268 * cannot be deleted. */
269 int (*class_delete)(struct netdev *, struct tc_queue *queue);
271 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
272 * 'struct tc_queue's within 'netdev->tc->queues'.
274 * On success, initializes '*stats'.
276 * This function may be null if 'tc' does not have queues or if it cannot
277 * report queue statistics. */
278 int (*class_get_stats)(const struct netdev *netdev,
279 const struct tc_queue *queue,
280 struct netdev_queue_stats *stats);
282 /* Extracts queue stats from 'nlmsg', which is a response to a
283 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
285 * This function may be null if 'tc' does not have queues or if it cannot
286 * report queue statistics. */
287 int (*class_dump_stats)(const struct netdev *netdev,
288 const struct ofpbuf *nlmsg,
289 netdev_dump_queue_stats_cb *cb, void *aux);
293 tc_init(struct tc *tc, const struct tc_ops *ops)
296 hmap_init(&tc->queues);
300 tc_destroy(struct tc *tc)
302 hmap_destroy(&tc->queues);
305 static const struct tc_ops tc_ops_htb;
306 static const struct tc_ops tc_ops_hfsc;
307 static const struct tc_ops tc_ops_default;
308 static const struct tc_ops tc_ops_other;
310 static const struct tc_ops *const tcs[] = {
311 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
312 &tc_ops_hfsc, /* Hierarchical fair service curve. */
313 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
314 &tc_ops_other, /* Some other qdisc. */
318 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
319 static unsigned int tc_get_major(unsigned int handle);
320 static unsigned int tc_get_minor(unsigned int handle);
322 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
323 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
324 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
326 static struct tcmsg *tc_make_request(const struct netdev *, int type,
327 unsigned int flags, struct ofpbuf *);
328 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
329 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
330 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
333 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
334 struct nlattr **options);
335 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
336 struct nlattr **options,
337 struct netdev_queue_stats *);
338 static int tc_query_class(const struct netdev *,
339 unsigned int handle, unsigned int parent,
340 struct ofpbuf **replyp);
341 static int tc_delete_class(const struct netdev *, unsigned int handle);
343 static int tc_del_qdisc(struct netdev *netdev);
344 static int tc_query_qdisc(const struct netdev *netdev);
346 static int tc_calc_cell_log(unsigned int mtu);
347 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
348 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
349 const struct tc_ratespec *rate);
350 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
352 struct netdev_linux {
355 /* Protects all members below. */
356 struct ovs_mutex mutex;
358 unsigned int cache_valid;
359 unsigned int change_seq;
361 bool miimon; /* Link status of last poll. */
362 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
363 struct timer miimon_timer;
365 /* The following are figured out "on demand" only. They are only valid
366 * when the corresponding VALID_* bit in 'cache_valid' is set. */
368 uint8_t etheraddr[ETH_ADDR_LEN];
369 struct in_addr address, netmask;
372 unsigned int ifi_flags;
373 long long int carrier_resets;
374 uint32_t kbits_rate; /* Policing data. */
375 uint32_t kbits_burst;
376 int vport_stats_error; /* Cached error code from vport_get_stats().
377 0 or an errno value. */
378 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
379 int ether_addr_error; /* Cached error code from set/get etheraddr. */
380 int netdev_policing_error; /* Cached error code from set policing. */
381 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
382 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
384 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
385 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
386 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
388 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
391 /* For devices of class netdev_tap_class only. */
395 struct netdev_rx_linux {
401 /* This is set pretty low because we probably won't learn anything from the
402 * additional log messages. */
403 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
405 static void netdev_linux_run(void);
407 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
408 int cmd, const char *cmd_name);
409 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
410 int cmd, const char *cmd_name);
411 static int get_flags(const struct netdev *, unsigned int *flags);
412 static int set_flags(const char *, unsigned int flags);
413 static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
414 enum netdev_flags on, enum netdev_flags *old_flagsp)
415 OVS_REQUIRES(netdev->mutex);
416 static int do_get_ifindex(const char *netdev_name);
417 static int get_ifindex(const struct netdev *, int *ifindexp);
418 static int do_set_addr(struct netdev *netdev,
419 int ioctl_nr, const char *ioctl_name,
420 struct in_addr addr);
421 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
422 static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
423 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
424 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
425 static int af_packet_sock(void);
426 static void netdev_linux_miimon_run(void);
427 static void netdev_linux_miimon_wait(void);
430 is_netdev_linux_class(const struct netdev_class *netdev_class)
432 return netdev_class->run == netdev_linux_run;
436 is_tap_netdev(const struct netdev *netdev)
438 return netdev_get_class(netdev) == &netdev_tap_class;
441 static struct netdev_linux *
442 netdev_linux_cast(const struct netdev *netdev)
444 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
446 return CONTAINER_OF(netdev, struct netdev_linux, up);
449 static struct netdev_rx_linux *
450 netdev_rx_linux_cast(const struct netdev_rx *rx)
452 ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
453 return CONTAINER_OF(rx, struct netdev_rx_linux, up);
456 static void netdev_linux_update(struct netdev_linux *netdev,
457 const struct rtnetlink_link_change *)
458 OVS_REQUIRES(netdev->mutex);
459 static void netdev_linux_changed(struct netdev_linux *netdev,
460 unsigned int ifi_flags, unsigned int mask)
461 OVS_REQUIRES(netdev->mutex);
463 /* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK changes, or NULL
464 * if no such socket could be created. */
465 static struct nl_sock *
466 netdev_linux_notify_sock(void)
468 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
469 static struct nl_sock *sock;
471 if (ovsthread_once_start(&once)) {
474 error = nl_sock_create(NETLINK_ROUTE, &sock);
476 error = nl_sock_join_mcgroup(sock, RTNLGRP_LINK);
478 nl_sock_destroy(sock);
482 ovsthread_once_done(&once);
489 netdev_linux_run(void)
491 struct nl_sock *sock;
494 netdev_linux_miimon_run();
496 sock = netdev_linux_notify_sock();
502 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
503 uint64_t buf_stub[4096 / 8];
506 ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
507 error = nl_sock_recv(sock, &buf, false);
509 struct rtnetlink_link_change change;
511 if (rtnetlink_link_parse(&buf, &change)) {
512 struct netdev *netdev_ = netdev_from_name(change.ifname);
513 if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
514 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
516 ovs_mutex_lock(&netdev->mutex);
517 netdev_linux_update(netdev, &change);
518 ovs_mutex_unlock(&netdev->mutex);
520 netdev_close(netdev_);
522 } else if (error == ENOBUFS) {
523 struct shash device_shash;
524 struct shash_node *node;
528 shash_init(&device_shash);
529 netdev_get_devices(&netdev_linux_class, &device_shash);
530 SHASH_FOR_EACH (node, &device_shash) {
531 struct netdev *netdev_ = node->data;
532 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
535 ovs_mutex_lock(&netdev->mutex);
536 get_flags(netdev_, &flags);
537 netdev_linux_changed(netdev, flags, 0);
538 ovs_mutex_unlock(&netdev->mutex);
540 netdev_close(netdev_);
542 shash_destroy(&device_shash);
543 } else if (error != EAGAIN) {
544 VLOG_WARN_RL(&rl, "error reading or parsing netlink (%s)",
545 ovs_strerror(error));
552 netdev_linux_wait(void)
554 struct nl_sock *sock;
556 netdev_linux_miimon_wait();
557 sock = netdev_linux_notify_sock();
559 nl_sock_wait(sock, POLLIN);
564 netdev_linux_changed(struct netdev_linux *dev,
565 unsigned int ifi_flags, unsigned int mask)
566 OVS_REQUIRES(dev->mutex)
569 if (!dev->change_seq) {
573 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
574 dev->carrier_resets++;
576 dev->ifi_flags = ifi_flags;
578 dev->cache_valid &= mask;
582 netdev_linux_update(struct netdev_linux *dev,
583 const struct rtnetlink_link_change *change)
584 OVS_REQUIRES(dev->mutex)
586 if (change->nlmsg_type == RTM_NEWLINK) {
588 netdev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
590 /* Update netdev from rtnl-change msg. */
592 dev->mtu = change->mtu;
593 dev->cache_valid |= VALID_MTU;
594 dev->netdev_mtu_error = 0;
597 if (!eth_addr_is_zero(change->addr)) {
598 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
599 dev->cache_valid |= VALID_ETHERADDR;
600 dev->ether_addr_error = 0;
603 dev->ifindex = change->ifi_index;
604 dev->cache_valid |= VALID_IFINDEX;
605 dev->get_ifindex_error = 0;
608 netdev_linux_changed(dev, change->ifi_flags, 0);
612 static struct netdev *
613 netdev_linux_alloc(void)
615 struct netdev_linux *netdev = xzalloc(sizeof *netdev);
620 netdev_linux_common_construct(struct netdev_linux *netdev)
622 ovs_mutex_init(&netdev->mutex, PTHREAD_MUTEX_NORMAL);
623 netdev->change_seq = 1;
626 /* Creates system and internal devices. */
628 netdev_linux_construct(struct netdev *netdev_)
630 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
633 netdev_linux_common_construct(netdev);
635 error = get_flags(&netdev->up, &netdev->ifi_flags);
636 if (error == ENODEV) {
637 if (netdev->up.netdev_class != &netdev_internal_class) {
638 /* The device does not exist, so don't allow it to be opened. */
641 /* "Internal" netdevs have to be created as netdev objects before
642 * they exist in the kernel, because creating them in the kernel
643 * happens by passing a netdev object to dpif_port_add().
644 * Therefore, ignore the error. */
651 /* For most types of netdevs we open the device for each call of
652 * netdev_open(). However, this is not the case with tap devices,
653 * since it is only possible to open the device once. In this
654 * situation we share a single file descriptor, and consequently
655 * buffers, across all readers. Therefore once data is read it will
656 * be unavailable to other reads for tap devices. */
658 netdev_linux_construct_tap(struct netdev *netdev_)
660 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
661 static const char tap_dev[] = "/dev/net/tun";
662 const char *name = netdev_->name;
666 netdev_linux_common_construct(netdev);
668 /* Open tap device. */
669 netdev->tap_fd = open(tap_dev, O_RDWR);
670 if (netdev->tap_fd < 0) {
672 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
676 /* Create tap device. */
677 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
678 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
679 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
680 VLOG_WARN("%s: creating tap device failed: %s", name,
681 ovs_strerror(errno));
686 /* Make non-blocking. */
687 error = set_nonblocking(netdev->tap_fd);
695 close(netdev->tap_fd);
700 netdev_linux_destruct(struct netdev *netdev_)
702 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
704 if (netdev->tc && netdev->tc->ops->tc_destroy) {
705 netdev->tc->ops->tc_destroy(netdev->tc);
708 if (netdev_get_class(netdev_) == &netdev_tap_class
709 && netdev->tap_fd >= 0)
711 close(netdev->tap_fd);
714 ovs_mutex_destroy(&netdev->mutex);
/* Frees the memory allocated by netdev_linux_alloc(). */
static void
netdev_linux_dealloc(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    free(netdev);
}
724 static struct netdev_rx *
725 netdev_linux_rx_alloc(void)
727 struct netdev_rx_linux *rx = xzalloc(sizeof *rx);
732 netdev_linux_rx_construct(struct netdev_rx *rx_)
734 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
735 struct netdev *netdev_ = rx->up.netdev;
736 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
739 ovs_mutex_lock(&netdev->mutex);
740 rx->is_tap = is_tap_netdev(netdev_);
742 rx->fd = netdev->tap_fd;
744 struct sockaddr_ll sll;
746 /* Result of tcpdump -dd inbound */
747 static const struct sock_filter filt[] = {
748 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
749 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
750 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
751 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
753 static const struct sock_fprog fprog = {
754 ARRAY_SIZE(filt), (struct sock_filter *) filt
757 /* Create file descriptor. */
758 rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
761 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
765 /* Set non-blocking mode. */
766 error = set_nonblocking(rx->fd);
771 /* Get ethernet device index. */
772 error = get_ifindex(&netdev->up, &ifindex);
777 /* Bind to specific ethernet device. */
778 memset(&sll, 0, sizeof sll);
779 sll.sll_family = AF_PACKET;
780 sll.sll_ifindex = ifindex;
781 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
782 if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
784 VLOG_ERR("%s: failed to bind raw socket (%s)",
785 netdev_get_name(netdev_), ovs_strerror(error));
789 /* Filter for only inbound packets. */
790 error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
794 VLOG_ERR("%s: failed to attach filter (%s)",
795 netdev_get_name(netdev_), ovs_strerror(error));
799 ovs_mutex_unlock(&netdev->mutex);
807 ovs_mutex_unlock(&netdev->mutex);
812 netdev_linux_rx_destruct(struct netdev_rx *rx_)
814 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
/* Frees the memory allocated by netdev_linux_rx_alloc(). */
static void
netdev_linux_rx_dealloc(struct netdev_rx *rx_)
{
    struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
    free(rx);
}
830 netdev_linux_rx_recv(struct netdev_rx *rx_, void *data, size_t size)
832 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
837 ? read(rx->fd, data, size)
838 : recv(rx->fd, data, size, MSG_TRUNC));
839 } while (retval < 0 && errno == EINTR);
842 return retval > size ? -EMSGSIZE : retval;
844 if (errno != EAGAIN) {
845 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
846 ovs_strerror(errno), netdev_rx_get_name(rx_));
853 netdev_linux_rx_wait(struct netdev_rx *rx_)
855 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
856 poll_fd_wait(rx->fd, POLLIN);
860 netdev_linux_rx_drain(struct netdev_rx *rx_)
862 struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
865 int error = af_inet_ifreq_ioctl(netdev_rx_get_name(rx_), &ifr,
866 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
870 drain_fd(rx->fd, ifr.ifr_qlen);
873 return drain_rcvbuf(rx->fd);
877 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
878 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
879 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
880 * the packet is too big or too small to transmit on the device.
882 * The caller retains ownership of 'buffer' in all cases.
884 * The kernel maintains a packet transmission queue, so the caller is not
885 * expected to do additional queuing of packets. */
887 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
892 if (!is_tap_netdev(netdev_)) {
893 /* Use our AF_PACKET socket to send to this device. */
894 struct sockaddr_ll sll;
900 sock = af_packet_sock();
905 ifindex = netdev_get_ifindex(netdev_);
910 /* We don't bother setting most fields in sockaddr_ll because the
911 * kernel ignores them for SOCK_RAW. */
912 memset(&sll, 0, sizeof sll);
913 sll.sll_family = AF_PACKET;
914 sll.sll_ifindex = ifindex;
916 iov.iov_base = CONST_CAST(void *, data);
920 msg.msg_namelen = sizeof sll;
923 msg.msg_control = NULL;
924 msg.msg_controllen = 0;
927 retval = sendmsg(sock, &msg, 0);
929 /* Use the tap fd to send to this device. This is essential for
930 * tap devices, because packets sent to a tap device with an
931 * AF_PACKET socket will loop back to be *received* again on the
932 * tap device. This doesn't occur on other interface types
933 * because we attach a socket filter to the rx socket. */
934 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
936 retval = write(netdev->tap_fd, data, size);
940 /* The Linux AF_PACKET implementation never blocks waiting for room
941 * for packets, instead returning ENOBUFS. Translate this into
942 * EAGAIN for the caller. */
943 if (errno == ENOBUFS) {
945 } else if (errno == EINTR) {
947 } else if (errno != EAGAIN) {
948 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
949 netdev_get_name(netdev_), ovs_strerror(errno));
952 } else if (retval != size) {
953 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
954 "%zu) on %s", retval, size, netdev_get_name(netdev_));
/* Registers with the poll loop to wake up from the next call to poll_block()
 * when the packet transmission queue has sufficient room to transmit a packet
 * with netdev_send().
 *
 * The kernel maintains a packet transmission queue, so the client is not
 * expected to do additional queuing of packets.  Thus, this function is
 * unlikely to ever be used.  It is included for completeness. */
static void
netdev_linux_send_wait(struct netdev *netdev)
{
    if (is_tap_netdev(netdev)) {
        /* TAP device always accepts packets.*/
        poll_immediate_wake();
    }
}
978 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
979 * otherwise a positive errno value. */
981 netdev_linux_set_etheraddr(struct netdev *netdev_,
982 const uint8_t mac[ETH_ADDR_LEN])
984 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
985 enum netdev_flags old_flags = 0;
988 ovs_mutex_lock(&netdev->mutex);
990 if (netdev->cache_valid & VALID_ETHERADDR) {
991 error = netdev->ether_addr_error;
992 if (error || eth_addr_equals(netdev->etheraddr, mac)) {
995 netdev->cache_valid &= ~VALID_ETHERADDR;
998 /* Tap devices must be brought down before setting the address. */
999 if (is_tap_netdev(netdev_)) {
1000 update_flags(netdev, NETDEV_UP, 0, &old_flags);
1002 error = set_etheraddr(netdev_get_name(netdev_), mac);
1003 if (!error || error == ENODEV) {
1004 netdev->ether_addr_error = error;
1005 netdev->cache_valid |= VALID_ETHERADDR;
1007 memcpy(netdev->etheraddr, mac, ETH_ADDR_LEN);
1011 if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
1012 update_flags(netdev, 0, NETDEV_UP, &old_flags);
1016 ovs_mutex_unlock(&netdev->mutex);
1020 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1022 netdev_linux_get_etheraddr(const struct netdev *netdev_,
1023 uint8_t mac[ETH_ADDR_LEN])
1025 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1028 ovs_mutex_lock(&netdev->mutex);
1029 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1030 netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
1032 netdev->cache_valid |= VALID_ETHERADDR;
1035 error = netdev->ether_addr_error;
1037 memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN);
1039 ovs_mutex_unlock(&netdev->mutex);
1044 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1045 * in bytes, not including the hardware header; thus, this is typically 1500
1046 * bytes for Ethernet devices. */
1048 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1050 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1053 ovs_mutex_lock(&netdev->mutex);
1054 if (!(netdev->cache_valid & VALID_MTU)) {
1057 netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
1058 netdev_get_name(netdev_), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
1059 netdev->mtu = ifr.ifr_mtu;
1060 netdev->cache_valid |= VALID_MTU;
1063 error = netdev->netdev_mtu_error;
1065 *mtup = netdev->mtu;
1067 ovs_mutex_unlock(&netdev->mutex);
1072 /* Sets the maximum size of transmitted (MTU) for given device using linux
1073 * networking ioctl interface.
1076 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1078 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1082 ovs_mutex_lock(&netdev->mutex);
1083 if (netdev->cache_valid & VALID_MTU) {
1084 error = netdev->netdev_mtu_error;
1085 if (error || netdev->mtu == mtu) {
1088 netdev->cache_valid &= ~VALID_MTU;
1091 error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
1092 SIOCSIFMTU, "SIOCSIFMTU");
1093 if (!error || error == ENODEV) {
1094 netdev->netdev_mtu_error = error;
1095 netdev->mtu = ifr.ifr_mtu;
1096 netdev->cache_valid |= VALID_MTU;
1099 ovs_mutex_unlock(&netdev->mutex);
1103 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1104 * On failure, returns a negative errno value. */
1106 netdev_linux_get_ifindex(const struct netdev *netdev_)
1108 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1111 ovs_mutex_lock(&netdev->mutex);
1112 error = get_ifindex(netdev_, &ifindex);
1113 ovs_mutex_unlock(&netdev->mutex);
1115 return error ? -error : ifindex;
1119 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1121 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1123 ovs_mutex_lock(&netdev->mutex);
1124 if (netdev->miimon_interval > 0) {
1125 *carrier = netdev->miimon;
1127 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1129 ovs_mutex_unlock(&netdev->mutex);
1134 static long long int
1135 netdev_linux_get_carrier_resets(const struct netdev *netdev_)
1137 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1138 long long int carrier_resets;
1140 ovs_mutex_lock(&netdev->mutex);
1141 carrier_resets = netdev->carrier_resets;
1142 ovs_mutex_unlock(&netdev->mutex);
1144 return carrier_resets;
1148 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1149 struct mii_ioctl_data *data)
1154 memset(&ifr, 0, sizeof ifr);
1155 memcpy(&ifr.ifr_data, data, sizeof *data);
1156 error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
1157 memcpy(data, &ifr.ifr_data, sizeof *data);
1163 netdev_linux_get_miimon(const char *name, bool *miimon)
1165 struct mii_ioctl_data data;
1170 memset(&data, 0, sizeof data);
1171 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1173 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1174 data.reg_num = MII_BMSR;
1175 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1179 *miimon = !!(data.val_out & BMSR_LSTATUS);
1181 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1184 struct ethtool_cmd ecmd;
1186 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1189 COVERAGE_INC(netdev_get_ethtool);
1190 memset(&ecmd, 0, sizeof ecmd);
1191 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1194 struct ethtool_value eval;
1196 memcpy(&eval, &ecmd, sizeof eval);
1197 *miimon = !!eval.data;
1199 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1207 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1208 long long int interval)
1210 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1212 ovs_mutex_lock(&netdev->mutex);
1213 interval = interval > 0 ? MAX(interval, 100) : 0;
1214 if (netdev->miimon_interval != interval) {
1215 netdev->miimon_interval = interval;
1216 timer_set_expired(&netdev->miimon_timer);
1218 ovs_mutex_unlock(&netdev->mutex);
1224 netdev_linux_miimon_run(void)
1226 struct shash device_shash;
1227 struct shash_node *node;
1229 shash_init(&device_shash);
1230 netdev_get_devices(&netdev_linux_class, &device_shash);
1231 SHASH_FOR_EACH (node, &device_shash) {
1232 struct netdev *netdev = node->data;
1233 struct netdev_linux *dev = netdev_linux_cast(netdev);
1236 ovs_mutex_lock(&dev->mutex);
1237 if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
1238 netdev_linux_get_miimon(dev->up.name, &miimon);
1239 if (miimon != dev->miimon) {
1240 dev->miimon = miimon;
1241 netdev_linux_changed(dev, dev->ifi_flags, 0);
1244 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1246 ovs_mutex_unlock(&dev->mutex);
1247 netdev_close(netdev);
1250 shash_destroy(&device_shash);
1254 netdev_linux_miimon_wait(void)
1256 struct shash device_shash;
1257 struct shash_node *node;
1259 shash_init(&device_shash);
1260 netdev_get_devices(&netdev_linux_class, &device_shash);
1261 SHASH_FOR_EACH (node, &device_shash) {
1262 struct netdev *netdev = node->data;
1263 struct netdev_linux *dev = netdev_linux_cast(netdev);
1265 ovs_mutex_lock(&dev->mutex);
1266 if (dev->miimon_interval > 0) {
1267 timer_wait(&dev->miimon_timer);
1269 ovs_mutex_unlock(&dev->mutex);
1270 netdev_close(netdev);
1272 shash_destroy(&device_shash);
1275 /* Check whether we can we use RTM_GETLINK to get network device statistics.
1276 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1279 check_for_working_netlink_stats(void)
1281 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1282 * preferable, so if that works, we'll use it. */
1283 int ifindex = do_get_ifindex("lo");
1285 VLOG_WARN("failed to get ifindex for lo, "
1286 "obtaining netdev stats from proc");
1289 struct netdev_stats stats;
1290 int error = get_stats_via_netlink(ifindex, &stats);
1292 VLOG_DBG("obtaining netdev stats via rtnetlink");
1295 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1296 "via proc (you are probably running a pre-2.6.19 "
1297 "kernel)", ovs_strerror(error));
1304 swap_uint64(uint64_t *a, uint64_t *b)
1311 /* Copies 'src' into 'dst', performing format conversion in the process.
1313 * 'src' is allowed to be misaligned. */
1315 netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1316 const struct ovs_vport_stats *src)
1318 dst->rx_packets = get_unaligned_u64(&src->rx_packets);
1319 dst->tx_packets = get_unaligned_u64(&src->tx_packets);
1320 dst->rx_bytes = get_unaligned_u64(&src->rx_bytes);
1321 dst->tx_bytes = get_unaligned_u64(&src->tx_bytes);
1322 dst->rx_errors = get_unaligned_u64(&src->rx_errors);
1323 dst->tx_errors = get_unaligned_u64(&src->tx_errors);
1324 dst->rx_dropped = get_unaligned_u64(&src->rx_dropped);
1325 dst->tx_dropped = get_unaligned_u64(&src->tx_dropped);
1327 dst->collisions = 0;
1328 dst->rx_length_errors = 0;
1329 dst->rx_over_errors = 0;
1330 dst->rx_crc_errors = 0;
1331 dst->rx_frame_errors = 0;
1332 dst->rx_fifo_errors = 0;
1333 dst->rx_missed_errors = 0;
1334 dst->tx_aborted_errors = 0;
1335 dst->tx_carrier_errors = 0;
1336 dst->tx_fifo_errors = 0;
1337 dst->tx_heartbeat_errors = 0;
1338 dst->tx_window_errors = 0;
1342 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1344 struct dpif_linux_vport reply;
1348 error = dpif_linux_vport_get(netdev_get_name(netdev), &reply, &buf);
1351 } else if (!reply.stats) {
1356 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
1364 get_stats_via_vport(const struct netdev *netdev_,
1365 struct netdev_stats *stats)
1367 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1369 if (!netdev->vport_stats_error ||
1370 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1373 error = get_stats_via_vport__(netdev_, stats);
1374 if (error && error != ENOENT) {
1375 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1377 netdev_get_name(netdev_), ovs_strerror(error));
1379 netdev->vport_stats_error = error;
1380 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
1385 netdev_linux_sys_get_stats(const struct netdev *netdev_,
1386 struct netdev_stats *stats)
1388 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
1389 static int use_netlink_stats;
1392 if (ovsthread_once_start(&once)) {
1393 use_netlink_stats = check_for_working_netlink_stats();
1394 ovsthread_once_done(&once);
1397 if (use_netlink_stats) {
1400 error = get_ifindex(netdev_, &ifindex);
1402 error = get_stats_via_netlink(ifindex, stats);
1405 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1409 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1410 netdev_get_name(netdev_), error);
1416 /* Retrieves current device stats for 'netdev-linux'. */
1418 netdev_linux_get_stats(const struct netdev *netdev_,
1419 struct netdev_stats *stats)
1421 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1422 struct netdev_stats dev_stats;
1425 ovs_mutex_lock(&netdev->mutex);
1426 get_stats_via_vport(netdev_, stats);
1427 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1429 if (!netdev->vport_stats_error) {
1432 } else if (netdev->vport_stats_error) {
1433 /* stats not available from OVS then use ioctl stats. */
1436 stats->rx_errors += dev_stats.rx_errors;
1437 stats->tx_errors += dev_stats.tx_errors;
1438 stats->rx_dropped += dev_stats.rx_dropped;
1439 stats->tx_dropped += dev_stats.tx_dropped;
1440 stats->multicast += dev_stats.multicast;
1441 stats->collisions += dev_stats.collisions;
1442 stats->rx_length_errors += dev_stats.rx_length_errors;
1443 stats->rx_over_errors += dev_stats.rx_over_errors;
1444 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1445 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1446 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1447 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1448 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1449 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1450 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1451 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1452 stats->tx_window_errors += dev_stats.tx_window_errors;
1454 ovs_mutex_unlock(&netdev->mutex);
1459 /* Retrieves current device stats for 'netdev-tap' netdev or
1460 * netdev-internal. */
1462 netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
1464 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1465 struct netdev_stats dev_stats;
1468 ovs_mutex_lock(&netdev->mutex);
1469 get_stats_via_vport(netdev_, stats);
1470 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1472 if (!netdev->vport_stats_error) {
1475 } else if (netdev->vport_stats_error) {
1476 /* Transmit and receive stats will appear to be swapped relative to the
1477 * other ports since we are the one sending the data, not a remote
1478 * computer. For consistency, we swap them back here. This does not
1479 * apply if we are getting stats from the vport layer because it always
1480 * tracks stats from the perspective of the switch. */
1483 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1484 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1485 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1486 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1487 stats->rx_length_errors = 0;
1488 stats->rx_over_errors = 0;
1489 stats->rx_crc_errors = 0;
1490 stats->rx_frame_errors = 0;
1491 stats->rx_fifo_errors = 0;
1492 stats->rx_missed_errors = 0;
1493 stats->tx_aborted_errors = 0;
1494 stats->tx_carrier_errors = 0;
1495 stats->tx_fifo_errors = 0;
1496 stats->tx_heartbeat_errors = 0;
1497 stats->tx_window_errors = 0;
1499 stats->rx_dropped += dev_stats.tx_dropped;
1500 stats->tx_dropped += dev_stats.rx_dropped;
1502 stats->rx_errors += dev_stats.tx_errors;
1503 stats->tx_errors += dev_stats.rx_errors;
1505 stats->multicast += dev_stats.multicast;
1506 stats->collisions += dev_stats.collisions;
1508 ovs_mutex_unlock(&netdev->mutex);
1514 netdev_internal_get_stats(const struct netdev *netdev_,
1515 struct netdev_stats *stats)
1517 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1520 ovs_mutex_lock(&netdev->mutex);
1521 get_stats_via_vport(netdev_, stats);
1522 error = netdev->vport_stats_error;
1523 ovs_mutex_unlock(&netdev->mutex);
1529 netdev_internal_set_stats(struct netdev *netdev,
1530 const struct netdev_stats *stats)
1532 struct ovs_vport_stats vport_stats;
1533 struct dpif_linux_vport vport;
1536 vport_stats.rx_packets = stats->rx_packets;
1537 vport_stats.tx_packets = stats->tx_packets;
1538 vport_stats.rx_bytes = stats->rx_bytes;
1539 vport_stats.tx_bytes = stats->tx_bytes;
1540 vport_stats.rx_errors = stats->rx_errors;
1541 vport_stats.tx_errors = stats->tx_errors;
1542 vport_stats.rx_dropped = stats->rx_dropped;
1543 vport_stats.tx_dropped = stats->tx_dropped;
1545 dpif_linux_vport_init(&vport);
1546 vport.cmd = OVS_VPORT_CMD_SET;
1547 vport.name = netdev_get_name(netdev);
1548 vport.stats = &vport_stats;
1550 err = dpif_linux_vport_transact(&vport, NULL, NULL);
1552 /* If the vport layer doesn't know about the device, that doesn't mean it
1553 * doesn't exist (after all were able to open it when netdev_open() was
1554 * called), it just means that it isn't attached and we'll be getting
1555 * stats a different way. */
1556 if (err == ENODEV) {
1564 netdev_linux_read_features(struct netdev_linux *netdev)
1565 OVS_REQUIRES(netdev->mutex)
1567 struct ethtool_cmd ecmd;
1571 if (netdev->cache_valid & VALID_FEATURES) {
1575 COVERAGE_INC(netdev_get_ethtool);
1576 memset(&ecmd, 0, sizeof ecmd);
1577 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
1578 ETHTOOL_GSET, "ETHTOOL_GSET");
1583 /* Supported features. */
1584 netdev->supported = 0;
1585 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1586 netdev->supported |= NETDEV_F_10MB_HD;
1588 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1589 netdev->supported |= NETDEV_F_10MB_FD;
1591 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1592 netdev->supported |= NETDEV_F_100MB_HD;
1594 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1595 netdev->supported |= NETDEV_F_100MB_FD;
1597 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1598 netdev->supported |= NETDEV_F_1GB_HD;
1600 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1601 netdev->supported |= NETDEV_F_1GB_FD;
1603 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1604 netdev->supported |= NETDEV_F_10GB_FD;
1606 if (ecmd.supported & SUPPORTED_TP) {
1607 netdev->supported |= NETDEV_F_COPPER;
1609 if (ecmd.supported & SUPPORTED_FIBRE) {
1610 netdev->supported |= NETDEV_F_FIBER;
1612 if (ecmd.supported & SUPPORTED_Autoneg) {
1613 netdev->supported |= NETDEV_F_AUTONEG;
1615 if (ecmd.supported & SUPPORTED_Pause) {
1616 netdev->supported |= NETDEV_F_PAUSE;
1618 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1619 netdev->supported |= NETDEV_F_PAUSE_ASYM;
1622 /* Advertised features. */
1623 netdev->advertised = 0;
1624 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1625 netdev->advertised |= NETDEV_F_10MB_HD;
1627 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1628 netdev->advertised |= NETDEV_F_10MB_FD;
1630 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1631 netdev->advertised |= NETDEV_F_100MB_HD;
1633 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1634 netdev->advertised |= NETDEV_F_100MB_FD;
1636 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1637 netdev->advertised |= NETDEV_F_1GB_HD;
1639 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1640 netdev->advertised |= NETDEV_F_1GB_FD;
1642 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1643 netdev->advertised |= NETDEV_F_10GB_FD;
1645 if (ecmd.advertising & ADVERTISED_TP) {
1646 netdev->advertised |= NETDEV_F_COPPER;
1648 if (ecmd.advertising & ADVERTISED_FIBRE) {
1649 netdev->advertised |= NETDEV_F_FIBER;
1651 if (ecmd.advertising & ADVERTISED_Autoneg) {
1652 netdev->advertised |= NETDEV_F_AUTONEG;
1654 if (ecmd.advertising & ADVERTISED_Pause) {
1655 netdev->advertised |= NETDEV_F_PAUSE;
1657 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1658 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
1661 /* Current settings. */
1663 if (speed == SPEED_10) {
1664 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1665 } else if (speed == SPEED_100) {
1666 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1667 } else if (speed == SPEED_1000) {
1668 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1669 } else if (speed == SPEED_10000) {
1670 netdev->current = NETDEV_F_10GB_FD;
1671 } else if (speed == 40000) {
1672 netdev->current = NETDEV_F_40GB_FD;
1673 } else if (speed == 100000) {
1674 netdev->current = NETDEV_F_100GB_FD;
1675 } else if (speed == 1000000) {
1676 netdev->current = NETDEV_F_1TB_FD;
1678 netdev->current = 0;
1681 if (ecmd.port == PORT_TP) {
1682 netdev->current |= NETDEV_F_COPPER;
1683 } else if (ecmd.port == PORT_FIBRE) {
1684 netdev->current |= NETDEV_F_FIBER;
1688 netdev->current |= NETDEV_F_AUTONEG;
1692 netdev->cache_valid |= VALID_FEATURES;
1693 netdev->get_features_error = error;
1696 /* Stores the features supported by 'netdev' into of '*current', '*advertised',
1697 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
1698 * Returns 0 if successful, otherwise a positive errno value. */
1700 netdev_linux_get_features(const struct netdev *netdev_,
1701 enum netdev_features *current,
1702 enum netdev_features *advertised,
1703 enum netdev_features *supported,
1704 enum netdev_features *peer)
1706 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1709 ovs_mutex_lock(&netdev->mutex);
1710 netdev_linux_read_features(netdev);
1711 if (!netdev->get_features_error) {
1712 *current = netdev->current;
1713 *advertised = netdev->advertised;
1714 *supported = netdev->supported;
1715 *peer = 0; /* XXX */
1717 error = netdev->get_features_error;
1718 ovs_mutex_unlock(&netdev->mutex);
1723 /* Set the features advertised by 'netdev' to 'advertise'. */
1725 netdev_linux_set_advertisements(struct netdev *netdev_,
1726 enum netdev_features advertise)
1728 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1729 struct ethtool_cmd ecmd;
1732 ovs_mutex_lock(&netdev->mutex);
1734 COVERAGE_INC(netdev_get_ethtool);
1735 memset(&ecmd, 0, sizeof ecmd);
1736 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1737 ETHTOOL_GSET, "ETHTOOL_GSET");
1742 ecmd.advertising = 0;
1743 if (advertise & NETDEV_F_10MB_HD) {
1744 ecmd.advertising |= ADVERTISED_10baseT_Half;
1746 if (advertise & NETDEV_F_10MB_FD) {
1747 ecmd.advertising |= ADVERTISED_10baseT_Full;
1749 if (advertise & NETDEV_F_100MB_HD) {
1750 ecmd.advertising |= ADVERTISED_100baseT_Half;
1752 if (advertise & NETDEV_F_100MB_FD) {
1753 ecmd.advertising |= ADVERTISED_100baseT_Full;
1755 if (advertise & NETDEV_F_1GB_HD) {
1756 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1758 if (advertise & NETDEV_F_1GB_FD) {
1759 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1761 if (advertise & NETDEV_F_10GB_FD) {
1762 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1764 if (advertise & NETDEV_F_COPPER) {
1765 ecmd.advertising |= ADVERTISED_TP;
1767 if (advertise & NETDEV_F_FIBER) {
1768 ecmd.advertising |= ADVERTISED_FIBRE;
1770 if (advertise & NETDEV_F_AUTONEG) {
1771 ecmd.advertising |= ADVERTISED_Autoneg;
1773 if (advertise & NETDEV_F_PAUSE) {
1774 ecmd.advertising |= ADVERTISED_Pause;
1776 if (advertise & NETDEV_F_PAUSE_ASYM) {
1777 ecmd.advertising |= ADVERTISED_Asym_Pause;
1779 COVERAGE_INC(netdev_set_ethtool);
1780 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1781 ETHTOOL_SSET, "ETHTOOL_SSET");
1784 ovs_mutex_unlock(&netdev->mutex);
1788 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1789 * successful, otherwise a positive errno value. */
1791 netdev_linux_set_policing(struct netdev *netdev_,
1792 uint32_t kbits_rate, uint32_t kbits_burst)
1794 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1795 const char *netdev_name = netdev_get_name(netdev_);
1798 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1799 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1800 : kbits_burst); /* Stick with user-specified value. */
1802 ovs_mutex_lock(&netdev->mutex);
1803 if (netdev->cache_valid & VALID_POLICING) {
1804 error = netdev->netdev_policing_error;
1805 if (error || (netdev->kbits_rate == kbits_rate &&
1806 netdev->kbits_burst == kbits_burst)) {
1807 /* Assume that settings haven't changed since we last set them. */
1810 netdev->cache_valid &= ~VALID_POLICING;
1813 COVERAGE_INC(netdev_set_policing);
1814 /* Remove any existing ingress qdisc. */
1815 error = tc_add_del_ingress_qdisc(netdev_, false);
1817 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1818 netdev_name, ovs_strerror(error));
1823 error = tc_add_del_ingress_qdisc(netdev_, true);
1825 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1826 netdev_name, ovs_strerror(error));
1830 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
1832 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1833 netdev_name, ovs_strerror(error));
1838 netdev->kbits_rate = kbits_rate;
1839 netdev->kbits_burst = kbits_burst;
1842 if (!error || error == ENODEV) {
1843 netdev->netdev_policing_error = error;
1844 netdev->cache_valid |= VALID_POLICING;
1846 ovs_mutex_unlock(&netdev->mutex);
1851 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1854 const struct tc_ops *const *opsp;
1856 for (opsp = tcs; *opsp != NULL; opsp++) {
1857 const struct tc_ops *ops = *opsp;
1858 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1859 sset_add(types, ops->ovs_name);
1865 static const struct tc_ops *
1866 tc_lookup_ovs_name(const char *name)
1868 const struct tc_ops *const *opsp;
1870 for (opsp = tcs; *opsp != NULL; opsp++) {
1871 const struct tc_ops *ops = *opsp;
1872 if (!strcmp(name, ops->ovs_name)) {
1879 static const struct tc_ops *
1880 tc_lookup_linux_name(const char *name)
1882 const struct tc_ops *const *opsp;
1884 for (opsp = tcs; *opsp != NULL; opsp++) {
1885 const struct tc_ops *ops = *opsp;
1886 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
1893 static struct tc_queue *
1894 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
1897 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1898 struct tc_queue *queue;
1900 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
1901 if (queue->queue_id == queue_id) {
/* Convenience wrapper around tc_find_queue__() that derives the hash from
 * 'queue_id'. */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
}
1915 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1917 struct netdev_qos_capabilities *caps)
1919 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1923 caps->n_queues = ops->n_queues;
1928 netdev_linux_get_qos(const struct netdev *netdev_,
1929 const char **typep, struct smap *details)
1931 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1934 ovs_mutex_lock(&netdev->mutex);
1935 error = tc_query_qdisc(netdev_);
1937 *typep = netdev->tc->ops->ovs_name;
1938 error = (netdev->tc->ops->qdisc_get
1939 ? netdev->tc->ops->qdisc_get(netdev_, details)
1942 ovs_mutex_unlock(&netdev->mutex);
1948 netdev_linux_set_qos(struct netdev *netdev_,
1949 const char *type, const struct smap *details)
1951 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1952 const struct tc_ops *new_ops;
1955 new_ops = tc_lookup_ovs_name(type);
1956 if (!new_ops || !new_ops->tc_install) {
1960 ovs_mutex_lock(&netdev->mutex);
1961 error = tc_query_qdisc(netdev_);
1966 if (new_ops == netdev->tc->ops) {
1967 error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
1969 /* Delete existing qdisc. */
1970 error = tc_del_qdisc(netdev_);
1974 ovs_assert(netdev->tc == NULL);
1976 /* Install new qdisc. */
1977 error = new_ops->tc_install(netdev_, details);
1978 ovs_assert((error == 0) == (netdev->tc != NULL));
1982 ovs_mutex_unlock(&netdev->mutex);
1987 netdev_linux_get_queue(const struct netdev *netdev_,
1988 unsigned int queue_id, struct smap *details)
1990 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1993 ovs_mutex_lock(&netdev->mutex);
1994 error = tc_query_qdisc(netdev_);
1996 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
1998 ? netdev->tc->ops->class_get(netdev_, queue, details)
2001 ovs_mutex_unlock(&netdev->mutex);
2007 netdev_linux_set_queue(struct netdev *netdev_,
2008 unsigned int queue_id, const struct smap *details)
2010 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2013 ovs_mutex_lock(&netdev->mutex);
2014 error = tc_query_qdisc(netdev_);
2016 error = (queue_id < netdev->tc->ops->n_queues
2017 && netdev->tc->ops->class_set
2018 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
2021 ovs_mutex_unlock(&netdev->mutex);
2027 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
2029 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2032 ovs_mutex_lock(&netdev->mutex);
2033 error = tc_query_qdisc(netdev_);
2035 if (netdev->tc->ops->class_delete) {
2036 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2038 ? netdev->tc->ops->class_delete(netdev_, queue)
2044 ovs_mutex_unlock(&netdev->mutex);
2050 netdev_linux_get_queue_stats(const struct netdev *netdev_,
2051 unsigned int queue_id,
2052 struct netdev_queue_stats *stats)
2054 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2057 ovs_mutex_lock(&netdev->mutex);
2058 error = tc_query_qdisc(netdev_);
2060 if (netdev->tc->ops->class_get_stats) {
2061 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2063 stats->created = queue->created;
2064 error = netdev->tc->ops->class_get_stats(netdev_, queue,
2073 ovs_mutex_unlock(&netdev->mutex);
2079 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
2081 struct ofpbuf request;
2082 struct tcmsg *tcmsg;
2084 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
2088 tcmsg->tcm_parent = 0;
2089 nl_dump_start(dump, NETLINK_ROUTE, &request);
2090 ofpbuf_uninit(&request);
2095 netdev_linux_dump_queues(const struct netdev *netdev_,
2096 netdev_dump_queues_cb *cb, void *aux)
2098 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2101 ovs_mutex_lock(&netdev->mutex);
2102 error = tc_query_qdisc(netdev_);
2104 if (netdev->tc->ops->class_get) {
2105 struct tc_queue *queue, *next_queue;
2106 struct smap details;
2108 smap_init(&details);
2109 HMAP_FOR_EACH_SAFE (queue, next_queue, hmap_node,
2110 &netdev->tc->queues) {
2113 smap_clear(&details);
2115 retval = netdev->tc->ops->class_get(netdev_, queue, &details);
2117 (*cb)(queue->queue_id, &details, aux);
2122 smap_destroy(&details);
2127 ovs_mutex_unlock(&netdev->mutex);
2133 netdev_linux_dump_queue_stats(const struct netdev *netdev_,
2134 netdev_dump_queue_stats_cb *cb, void *aux)
2136 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2139 ovs_mutex_lock(&netdev->mutex);
2140 error = tc_query_qdisc(netdev_);
2142 struct nl_dump dump;
2144 if (!netdev->tc->ops->class_dump_stats) {
2146 } else if (!start_queue_dump(netdev_, &dump)) {
2152 while (nl_dump_next(&dump, &msg)) {
2153 retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
2160 retval = nl_dump_done(&dump);
2166 ovs_mutex_unlock(&netdev->mutex);
2172 netdev_linux_get_in4(const struct netdev *netdev_,
2173 struct in_addr *address, struct in_addr *netmask)
2175 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2178 ovs_mutex_lock(&netdev->mutex);
2179 if (!(netdev->cache_valid & VALID_IN4)) {
2180 error = netdev_linux_get_ipv4(netdev_, &netdev->address,
2181 SIOCGIFADDR, "SIOCGIFADDR");
2183 error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
2184 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2186 netdev->cache_valid |= VALID_IN4;
2194 if (netdev->address.s_addr != INADDR_ANY) {
2195 *address = netdev->address;
2196 *netmask = netdev->netmask;
2198 error = EADDRNOTAVAIL;
2201 ovs_mutex_unlock(&netdev->mutex);
2207 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2208 struct in_addr netmask)
2210 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2213 ovs_mutex_lock(&netdev->mutex);
2214 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2216 netdev->cache_valid |= VALID_IN4;
2217 netdev->address = address;
2218 netdev->netmask = netmask;
2219 if (address.s_addr != INADDR_ANY) {
2220 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2221 "SIOCSIFNETMASK", netmask);
2224 ovs_mutex_unlock(&netdev->mutex);
/* Parses one line of /proc/net/if_inet6 into the 128-bit address '*in6' and
 * the interface name 'ifname' (at most 16 bytes plus a terminator).  Returns
 * true when all 16 address octets and the name were parsed. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    uint8_t *s6 = in6->s6_addr;
#define X8 "%2"SCNx8
    return sscanf(line,
                  " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                  "%*x %*x %*x %*x %16s\n",
                  &s6[0], &s6[1], &s6[2], &s6[3],
                  &s6[4], &s6[5], &s6[6], &s6[7],
                  &s6[8], &s6[9], &s6[10], &s6[11],
                  &s6[12], &s6[13], &s6[14], &s6[15],
                  ifname) == 17;
#undef X8
}
2245 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2246 * 'in6' is non-null) and returns true. Otherwise, returns false. */
2248 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2250 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2252 ovs_mutex_lock(&netdev->mutex);
2253 if (!(netdev->cache_valid & VALID_IN6)) {
2257 netdev->in6 = in6addr_any;
2259 file = fopen("/proc/net/if_inet6", "r");
2261 const char *name = netdev_get_name(netdev_);
2262 while (fgets(line, sizeof line, file)) {
2263 struct in6_addr in6_tmp;
2264 char ifname[16 + 1];
2265 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2266 && !strcmp(name, ifname))
2268 netdev->in6 = in6_tmp;
2274 netdev->cache_valid |= VALID_IN6;
2277 ovs_mutex_unlock(&netdev->mutex);
/* Fills '*sa' with an AF_INET sockaddr carrying 'addr' and port 0, zeroing
 * any trailing bytes of the generic sockaddr. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in sin;

    memset(&sin, 0, sizeof sin);
    sin.sin_family = AF_INET;
    sin.sin_addr = addr;
    sin.sin_port = 0;

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &sin, sizeof sin);
}
2296 do_set_addr(struct netdev *netdev,
2297 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2301 make_in4_sockaddr(&ifr.ifr_addr, addr);
2302 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2306 /* Adds 'router' as a default IP gateway. */
2308 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2310 struct in_addr any = { INADDR_ANY };
2314 memset(&rt, 0, sizeof rt);
2315 make_in4_sockaddr(&rt.rt_dst, any);
2316 make_in4_sockaddr(&rt.rt_gateway, router);
2317 make_in4_sockaddr(&rt.rt_genmask, any);
2318 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2319 error = af_inet_ioctl(SIOCADDRT, &rt);
2321 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
2327 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2330 static const char fn[] = "/proc/net/route";
2335 *netdev_name = NULL;
2336 stream = fopen(fn, "r");
2337 if (stream == NULL) {
2338 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
2343 while (fgets(line, sizeof line, stream)) {
2346 ovs_be32 dest, gateway, mask;
2347 int refcnt, metric, mtu;
2348 unsigned int flags, use, window, irtt;
2351 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2353 iface, &dest, &gateway, &flags, &refcnt,
2354 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2356 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2360 if (!(flags & RTF_UP)) {
2361 /* Skip routes that aren't up. */
2365 /* The output of 'dest', 'mask', and 'gateway' were given in
2366 * network byte order, so we don't need need any endian
2367 * conversions here. */
2368 if ((dest & mask) == (host->s_addr & mask)) {
2370 /* The host is directly reachable. */
2371 next_hop->s_addr = 0;
2373 /* To reach the host, we must go through a gateway. */
2374 next_hop->s_addr = gateway;
2376 *netdev_name = xstrdup(iface);
2388 netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
2390 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2393 ovs_mutex_lock(&netdev->mutex);
2394 if (!(netdev->cache_valid & VALID_DRVINFO)) {
2395 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
2397 COVERAGE_INC(netdev_get_ethtool);
2398 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2399 error = netdev_linux_do_ethtool(netdev->up.name,
2402 "ETHTOOL_GDRVINFO");
2404 netdev->cache_valid |= VALID_DRVINFO;
2409 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2410 smap_add(smap, "driver_version", netdev->drvinfo.version);
2411 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
2413 ovs_mutex_unlock(&netdev->mutex);
2419 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2422 smap_add(smap, "driver_name", "openvswitch");
2426 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2427 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2428 * returns 0. Otherwise, it returns a positive errno value; in particular,
2429 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2431 netdev_linux_arp_lookup(const struct netdev *netdev,
2432 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2435 struct sockaddr_in sin;
2438 memset(&r, 0, sizeof r);
2439 memset(&sin, 0, sizeof sin);
2440 sin.sin_family = AF_INET;
2441 sin.sin_addr.s_addr = ip;
2443 memcpy(&r.arp_pa, &sin, sizeof sin);
2444 r.arp_ha.sa_family = ARPHRD_ETHER;
2446 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2447 COVERAGE_INC(netdev_arp_lookup);
2448 retval = af_inet_ioctl(SIOCGARP, &r);
2450 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2451 } else if (retval != ENXIO) {
2452 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2453 netdev_get_name(netdev), IP_ARGS(ip),
2454 ovs_strerror(retval));
2460 nd_to_iff_flags(enum netdev_flags nd)
2463 if (nd & NETDEV_UP) {
2466 if (nd & NETDEV_PROMISC) {
2473 iff_to_nd_flags(int iff)
2475 enum netdev_flags nd = 0;
2479 if (iff & IFF_PROMISC) {
2480 nd |= NETDEV_PROMISC;
2486 update_flags(struct netdev_linux *netdev, enum netdev_flags off,
2487 enum netdev_flags on, enum netdev_flags *old_flagsp)
2488 OVS_REQUIRES(netdev->mutex)
2490 int old_flags, new_flags;
2493 old_flags = netdev->ifi_flags;
2494 *old_flagsp = iff_to_nd_flags(old_flags);
2495 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2496 if (new_flags != old_flags) {
2497 error = set_flags(netdev_get_name(&netdev->up), new_flags);
2498 get_flags(&netdev->up, &netdev->ifi_flags);
2505 netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2506 enum netdev_flags on, enum netdev_flags *old_flagsp)
2508 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2511 ovs_mutex_lock(&netdev->mutex);
2512 error = update_flags(netdev, off, on, old_flagsp);
2513 ovs_mutex_unlock(&netdev->mutex);
2519 netdev_linux_change_seq(const struct netdev *netdev_)
2521 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2522 unsigned int change_seq;
2524 ovs_mutex_lock(&netdev->mutex);
2525 change_seq = netdev->change_seq;
2526 ovs_mutex_unlock(&netdev->mutex);
/* Expands to a struct netdev_class initializer shared by the linux, tap, and
 * internal device classes.  The per-class differences (constructor, stats,
 * features, status callbacks) are supplied as macro arguments; everything
 * else points at the common netdev_linux_* implementations.
 * (Fragment: some initializer lines are missing from this extraction.) */
2531 #define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS, SET_STATS, \
2532                            GET_FEATURES, GET_STATUS) \
2538     netdev_linux_wait, \
2540     netdev_linux_alloc, \
2542     netdev_linux_destruct, \
2543     netdev_linux_dealloc, \
2544     NULL, /* get_config */ \
2545     NULL, /* set_config */ \
2546     NULL, /* get_tunnel_config */ \
2548     netdev_linux_send, \
2549     netdev_linux_send_wait, \
2551     netdev_linux_set_etheraddr, \
2552     netdev_linux_get_etheraddr, \
2553     netdev_linux_get_mtu, \
2554     netdev_linux_set_mtu, \
2555     netdev_linux_get_ifindex, \
2556     netdev_linux_get_carrier, \
2557     netdev_linux_get_carrier_resets, \
2558     netdev_linux_set_miimon_interval, \
2563     netdev_linux_set_advertisements, \
2565     netdev_linux_set_policing, \
2566     netdev_linux_get_qos_types, \
2567     netdev_linux_get_qos_capabilities, \
2568     netdev_linux_get_qos, \
2569     netdev_linux_set_qos, \
2570     netdev_linux_get_queue, \
2571     netdev_linux_set_queue, \
2572     netdev_linux_delete_queue, \
2573     netdev_linux_get_queue_stats, \
2574     netdev_linux_dump_queues, \
2575     netdev_linux_dump_queue_stats, \
2577     netdev_linux_get_in4, \
2578     netdev_linux_set_in4, \
2579     netdev_linux_get_in6, \
2580     netdev_linux_add_router, \
2581     netdev_linux_get_next_hop, \
2583     netdev_linux_arp_lookup, \
2585     netdev_linux_update_flags, \
2587     netdev_linux_change_seq, \
2589     netdev_linux_rx_alloc, \
2590     netdev_linux_rx_construct, \
2591     netdev_linux_rx_destruct, \
2592     netdev_linux_rx_dealloc, \
2593     netdev_linux_rx_recv, \
2594     netdev_linux_rx_wait, \
2595     netdev_linux_rx_drain, \
/* The three concrete netdev classes built from NETDEV_LINUX_CLASS.  They
 * differ only in constructor and stats/features/status hooks:
 * "system" devices, tap devices, and OVS internal devices. */
2598 const struct netdev_class netdev_linux_class =
2601         netdev_linux_construct,
2602         netdev_linux_get_stats,
2603         NULL,                    /* set_stats */
2604         netdev_linux_get_features,
2605         netdev_linux_get_status);
2607 const struct netdev_class netdev_tap_class =
2610         netdev_linux_construct_tap,
2611         netdev_tap_get_stats,
2612         NULL,                   /* set_stats */
2613         netdev_linux_get_features,
2614         netdev_linux_get_status);
2616 const struct netdev_class netdev_internal_class =
2619         netdev_linux_construct,
2620         netdev_internal_get_stats,
2621         netdev_internal_set_stats,
2622         NULL,                  /* get_features */
2623         netdev_internal_get_status);
2625 /* HTB traffic control class. */
2627 #define HTB_N_QUEUES 0xf000
/* Per-qdisc state for HTB: only the root max-rate is tracked here.
 * (Fragment: the struct htb { struct tc tc; ... } header lines are missing
 * from this extraction.) */
2631     unsigned int max_rate;      /* In bytes/s. */
/* Per-queue (HTB class) state, embedded in the generic tc_queue. */
2635     struct tc_queue tc_queue;
2636     unsigned int min_rate;      /* In bytes/s. */
2637     unsigned int max_rate;      /* In bytes/s. */
2638     unsigned int burst;         /* In bytes. */
2639     unsigned int priority;      /* Lower values are higher priorities. */
/* Downcasts 'netdev_'s generic tc pointer to the HTB-specific struct. */
2643 htb_get__(const struct netdev *netdev_)
2645     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2646     return CONTAINER_OF(netdev->tc, struct htb, tc);
/* Allocates and installs HTB qdisc state on 'netdev_' with the given root
 * 'max_rate' (bytes/s).  Does not talk to the kernel. */
2650 htb_install__(struct netdev *netdev_, uint64_t max_rate)
2652     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2655     htb = xmalloc(sizeof *htb);
2656     tc_init(&htb->tc, &tc_ops_htb);
2657     htb->max_rate = max_rate;
2659     netdev->tc = &htb->tc;
2662 /* Create an HTB qdisc.
2664  * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2666 htb_setup_qdisc__(struct netdev *netdev)
2669     struct tc_htb_glob opt;
2670     struct ofpbuf request;
2671     struct tcmsg *tcmsg;
/* Remove any existing root qdisc first so the NLM_F_EXCL add succeeds. */
2673     tc_del_qdisc(netdev);
2675     tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2676                             NLM_F_EXCL | NLM_F_CREATE, &request);
2680     tcmsg->tcm_handle = tc_make_handle(1, 0);
2681     tcmsg->tcm_parent = TC_H_ROOT;
2683     nl_msg_put_string(&request, TCA_KIND, "htb");
2685     memset(&opt, 0, sizeof opt);
/* rate2quantum = 10 is the conventional HTB default ("r2q"). */
2686     opt.rate2quantum = 10;
2690     opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2691     nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2692     nl_msg_end_nested(&request, opt_offset);
2694     return tc_transact(&request, NULL);
2697 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2698  * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2700 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2701                   unsigned int parent, struct htb_class *class)
2704     struct tc_htb_opt opt;
2705     struct ofpbuf request;
2706     struct tcmsg *tcmsg;
/* The MTU is needed to size the HTB rate tables; bail out with a warning
 * if the device cannot report one. */
2710     error = netdev_get_mtu(netdev, &mtu);
2712         VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2713                      netdev_get_name(netdev));
2717     memset(&opt, 0, sizeof opt);
2718     tc_fill_rate(&opt.rate, class->min_rate, mtu);
2719     tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2720     opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2721     opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2722     opt.prio = class->priority;
2724     tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2728     tcmsg->tcm_handle = handle;
2729     tcmsg->tcm_parent = parent;
2731     nl_msg_put_string(&request, TCA_KIND, "htb");
2732     opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2733     nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
/* HTB wants explicit rate tables for both rate and ceil. */
2734     tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2735     tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2736     nl_msg_end_nested(&request, opt_offset);
2738     error = tc_transact(&request, NULL);
2740         VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2741                      "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2742                      netdev_get_name(netdev),
2743                      tc_get_major(handle), tc_get_minor(handle),
2744                      tc_get_major(parent), tc_get_minor(parent),
2745                      class->min_rate, class->max_rate,
2746                      class->burst, class->priority, ovs_strerror(error));
2751 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2752  * description of them into 'details'.  The description complies with the
2753  * specification given in the vswitch database documentation for linux-htb
2756 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2758     static const struct nl_policy tca_htb_policy[] = {
2759         [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2760                             .min_len = sizeof(struct tc_htb_opt) },
2763     struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2764     const struct tc_htb_opt *htb;
2766     if (!nl_parse_nested(nl_options, tca_htb_policy,
2767                          attrs, ARRAY_SIZE(tca_htb_policy))) {
2768         VLOG_WARN_RL(&rl, "failed to parse HTB class options");
/* Translate kernel tc_htb_opt fields back into our htb_class terms;
 * 'burst' is converted from tc ticks to bytes. */
2772     htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2773     class->min_rate = htb->rate.rate;
2774     class->max_rate = htb->ceil.rate;
2775     class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2776     class->priority = htb->prio;
/* Parses an RTM_NEWTCLASS message in 'tcmsg'.  On success stores the OVS
 * queue id in '*queue_id' (if nonnull), HTB parameters in '*options' (if
 * nonnull), and stats in '*stats' (if nonnull). */
2781 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2782                   struct htb_class *options,
2783                   struct netdev_queue_stats *stats)
2785     struct nlattr *nl_options;
2786     unsigned int handle;
2789     error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2790     if (!error && queue_id) {
2791         unsigned int major = tc_get_major(handle);
2792         unsigned int minor = tc_get_minor(handle);
/* OVS queue N maps to tc class handle 1:(N+1); reject anything else. */
2793         if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2794             *queue_id = minor - 1;
2799     if (!error && options) {
2800         error = htb_parse_tca_options__(nl_options, options);
/* Parses qdisc-level "max-rate" (bits/s) from 'details' into 'hc' (bytes/s).
 * If unset or zero, falls back to the link speed reported by
 * netdev_get_features() (assumed 100 Mbps when features are unavailable).
 * min_rate is pinned to max_rate for the root class.
 * NOTE(review): restored "&current", which had been mangled to the mojibake
 * "¤t" by an HTML-entity decoding pass (&curren -> U+00A4). */
2806 htb_parse_qdisc_details__(struct netdev *netdev,
2807                           const struct smap *details, struct htb_class *hc)
2809     const char *max_rate_s;
2811     max_rate_s = smap_get(details, "max-rate");
2812     hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2813     if (!hc->max_rate) {
2814         enum netdev_features current;
2816         netdev_get_features(netdev, &current, NULL, NULL, NULL);
2817         hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
2819     hc->min_rate = hc->max_rate;
/* Parses per-queue "min-rate", "max-rate", "burst", and "priority" from
 * 'details' (bits or bits/s) into 'hc' (bytes or bytes/s), clamping each
 * value to sane ranges relative to the device MTU and the root max-rate. */
2825 htb_parse_class_details__(struct netdev *netdev,
2826                           const struct smap *details, struct htb_class *hc)
2828     const struct htb *htb = htb_get__(netdev);
2829     const char *min_rate_s = smap_get(details, "min-rate");
2830     const char *max_rate_s = smap_get(details, "max-rate");
2831     const char *burst_s = smap_get(details, "burst");
2832     const char *priority_s = smap_get(details, "priority");
2835     error = netdev_get_mtu(netdev, &mtu);
2837         VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2838                      netdev_get_name(netdev));
2842     /* HTB requires at least an mtu sized min-rate to send any traffic even
2843      * on uncongested links. */
2844     hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2845     hc->min_rate = MAX(hc->min_rate, mtu);
2846     hc->min_rate = MIN(hc->min_rate, htb->max_rate);
/* max-rate defaults and clamps: never below min-rate, never above root. */
2849     hc->max_rate = (max_rate_s
2850                     ? strtoull(max_rate_s, NULL, 10) / 8
2852     hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2853     hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2857      * According to hints in the documentation that I've read, it is important
2858      * that 'burst' be at least as big as the largest frame that might be
2859      * transmitted.  Also, making 'burst' a bit bigger than necessary is OK,
2860      * but having it a bit too small is a problem.  Since netdev_get_mtu()
2861      * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2862      * the MTU.  We actually add 64, instead of 14, as a guard against
2863      * additional headers get tacked on somewhere that we're not aware of. */
2864     hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2865     hc->burst = MAX(hc->burst, mtu + 64);
2868     hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Queries the kernel for tc class 'handle' under 'parent' on 'netdev' and
 * parses the reply into '*options' and/or '*stats' (either may be null). */
2874 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2875                   unsigned int parent, struct htb_class *options,
2876                   struct netdev_queue_stats *stats)
2878     struct ofpbuf *reply;
2881     error = tc_query_class(netdev, handle, parent, &reply);
2883         error = htb_parse_tcmsg__(reply, NULL, options, stats);
2884         ofpbuf_delete(reply);
/* tc_ops "tc_install" callback: creates the root HTB qdisc, sets up the
 * default class 1:fffe from 'details', then records local state. */
2890 htb_tc_install(struct netdev *netdev, const struct smap *details)
2894     error = htb_setup_qdisc__(netdev);
2896         struct htb_class hc;
2898         htb_parse_qdisc_details__(netdev, details, &hc);
2899         error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2900                                   tc_make_handle(1, 0), &hc);
2902             htb_install__(netdev, hc.max_rate);
/* Downcasts a generic tc_queue to its enclosing htb_class. */
2908 static struct htb_class *
2909 htb_class_cast__(const struct tc_queue *queue)
2911     return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Inserts or updates the locally cached htb_class for 'queue_id' with the
 * values from 'hc'.  Allocates a new entry on first sight of the queue. */
2915 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2916                    const struct htb_class *hc)
2918     struct htb *htb = htb_get__(netdev);
2919     size_t hash = hash_int(queue_id, 0);
2920     struct tc_queue *queue;
2921     struct htb_class *hcp;
2923     queue = tc_find_queue__(netdev, queue_id, hash);
2925         hcp = htb_class_cast__(queue);
2927         hcp = xmalloc(sizeof *hcp);
2928         queue = &hcp->tc_queue;
2929         queue->queue_id = queue_id;
2930         queue->created = time_msec();
2931         hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2934     hcp->min_rate = hc->min_rate;
2935     hcp->max_rate = hc->max_rate;
2936     hcp->burst = hc->burst;
2937     hcp->priority = hc->priority;
/* tc_ops "tc_load" callback: reconstructs local HTB state from the kernel
 * by querying the default class and dumping all queues. */
2941 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2944     struct nl_dump dump;
2945     struct htb_class hc;
2947     /* Get qdisc options. */
2949     htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2950     htb_install__(netdev, hc.max_rate);
/* Walk every kernel tc class and mirror it into the local queue map. */
2953     if (!start_queue_dump(netdev, &dump)) {
2956     while (nl_dump_next(&dump, &msg)) {
2957         unsigned int queue_id;
2959         if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2960             htb_update_queue__(netdev, queue_id, &hc);
2963     nl_dump_done(&dump);
/* tc_ops "tc_destroy" callback: frees every cached htb_class and the htb
 * wrapper itself.  HMAP_FOR_EACH_SAFE permits removal during iteration. */
2969 htb_tc_destroy(struct tc *tc)
2971     struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2972     struct htb_class *hc, *next;
2974     HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2975         hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* tc_ops "qdisc_get": reports the root max-rate back in bits/s. */
2983 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
2985     const struct htb *htb = htb_get__(netdev);
2986     smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
/* tc_ops "qdisc_set": reconfigures the default class 1:fffe and caches the
 * new max-rate on success. */
2991 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
2993     struct htb_class hc;
2996     htb_parse_qdisc_details__(netdev, details, &hc);
2997     error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2998                               tc_make_handle(1, 0), &hc);
3000         htb_get__(netdev)->max_rate = hc.max_rate;
/* tc_ops "class_get": exports a queue's parameters in bits/s ("max-rate"
 * omitted when it equals "min-rate"). */
3006 htb_class_get(const struct netdev *netdev OVS_UNUSED,
3007               const struct tc_queue *queue, struct smap *details)
3009     const struct htb_class *hc = htb_class_cast__(queue);
3011     smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3012     if (hc->min_rate != hc->max_rate) {
3013         smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3015     smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
3017     smap_add_format(details, "priority", "%u", hc->priority);
/* tc_ops "class_set": parses 'details', programs kernel class 1:(queue_id+1)
 * under the default class, and updates the local cache on success. */
3023 htb_class_set(struct netdev *netdev, unsigned int queue_id,
3024               const struct smap *details)
3026     struct htb_class hc;
3029     error = htb_parse_class_details__(netdev, details, &hc);
3034     error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3035                               tc_make_handle(1, 0xfffe), &hc);
3040     htb_update_queue__(netdev, queue_id, &hc);
/* tc_ops "class_delete": removes the kernel class and the cached entry. */
3045 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
3047     struct htb_class *hc = htb_class_cast__(queue);
3048     struct htb *htb = htb_get__(netdev);
3051     error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3053         hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* tc_ops "class_get_stats": queries kernel stats for one queue. */
3060 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3061                     struct netdev_queue_stats *stats)
3063     return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3064                              tc_make_handle(1, 0xfffe), NULL, stats);
/* tc_ops "class_dump_stats": parses one dumped class message and invokes
 * 'cb' with the stats for any handle that maps to a valid OVS queue. */
3068 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3069                      const struct ofpbuf *nlmsg,
3070                      netdev_dump_queue_stats_cb *cb, void *aux)
3072     struct netdev_queue_stats stats;
3073     unsigned int handle, major, minor;
3076     error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3081     major = tc_get_major(handle);
3082     minor = tc_get_minor(handle);
3083     if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3084         (*cb)(minor - 1, &stats, aux);
/* Virtual-table entry registering the HTB ("linux-htb") QoS implementation.
 * (Fragment: several initializer lines are missing from this extraction.) */
3089 static const struct tc_ops tc_ops_htb = {
3090     "htb",                      /* linux_name */
3091     "linux-htb",                /* ovs_name */
3092     HTB_N_QUEUES,               /* n_queues */
3101     htb_class_get_stats,
3102     htb_class_dump_stats
3105 /* "linux-hfsc" traffic control class. */
3107 #define HFSC_N_QUEUES 0xf000
/* Per-queue HFSC state, embedded in the generic tc_queue.  (Fragment: the
 * struct hfsc / struct hfsc_class headers and the min/max rate fields are
 * partly missing from this extraction.) */
3115     struct tc_queue tc_queue;
/* Downcasts 'netdev_'s generic tc pointer to the HFSC-specific struct. */
3120 static struct hfsc *
3121 hfsc_get__(const struct netdev *netdev_)
3123     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3124     return CONTAINER_OF(netdev->tc, struct hfsc, tc);
/* Downcasts a generic tc_queue to its enclosing hfsc_class. */
3127 static struct hfsc_class *
3128 hfsc_class_cast__(const struct tc_queue *queue)
3130     return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Allocates and installs HFSC qdisc state with root 'max_rate' (bytes/s).
 * Does not talk to the kernel. */
3134 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
3136     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3139     hfsc = xmalloc(sizeof *hfsc);
3140     tc_init(&hfsc->tc, &tc_ops_hfsc);
3141     hfsc->max_rate = max_rate;
3142     netdev->tc = &hfsc->tc;
/* Inserts or updates the locally cached hfsc_class for 'queue_id' with the
 * values from 'hc'.  Allocates a new entry on first sight of the queue. */
3146 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
3147                     const struct hfsc_class *hc)
3151     struct hfsc_class *hcp;
3152     struct tc_queue *queue;
3154     hfsc = hfsc_get__(netdev);
3155     hash = hash_int(queue_id, 0);
3157     queue = tc_find_queue__(netdev, queue_id, hash);
3159         hcp = hfsc_class_cast__(queue);
3161         hcp = xmalloc(sizeof *hcp);
3162         queue = &hcp->tc_queue;
3163         queue->queue_id = queue_id;
3164         queue->created = time_msec();
3165         hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
3168     hcp->min_rate = hc->min_rate;
3169     hcp->max_rate = hc->max_rate;
/* Parses nested HFSC Netlink attributes (RSC/FSC/USC service curves) from
 * 'nl_options' into 'class'.  Only linear curves with matching real-time and
 * link-share slopes are supported; anything else is rejected with a warning. */
3173 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
3175     const struct tc_service_curve *rsc, *fsc, *usc;
3176     static const struct nl_policy tca_hfsc_policy[] = {
3178             .type = NL_A_UNSPEC,
3180             .min_len = sizeof(struct tc_service_curve),
3183             .type = NL_A_UNSPEC,
3185             .min_len = sizeof(struct tc_service_curve),
3188             .type = NL_A_UNSPEC,
3190             .min_len = sizeof(struct tc_service_curve),
3193     struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3195     if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3196                          attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3197         VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3201     rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3202     fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3203     usc = nl_attr_get(attrs[TCA_HFSC_USC]);
/* m1/d nonzero would mean a two-slope (non-linear) curve. */
3205     if (rsc->m1 != 0 || rsc->d != 0 ||
3206         fsc->m1 != 0 || fsc->d != 0 ||
3207         usc->m1 != 0 || usc->d != 0) {
3208         VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3209                      "Non-linear service curves are not supported.");
3213     if (rsc->m2 != fsc->m2) {
3214         VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3215                      "Real-time service curves are not supported ");
3219     if (rsc->m2 > usc->m2) {
3220         VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3221                      "Min-rate service curve is greater than "
3222                      "the max-rate service curve.");
/* min-rate comes from the link-share slope, max-rate from the upper limit. */
3226     class->min_rate = fsc->m2;
3227     class->max_rate = usc->m2;
/* Parses an RTM_NEWTCLASS message in 'tcmsg' for HFSC.  On success stores
 * the OVS queue id in '*queue_id', curve parameters in '*options', and stats
 * in '*stats' (each only if nonnull). */
3232 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3233                    struct hfsc_class *options,
3234                    struct netdev_queue_stats *stats)
3237     unsigned int handle;
3238     struct nlattr *nl_options;
3240     error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3246         unsigned int major, minor;
3248         major = tc_get_major(handle);
3249         minor = tc_get_minor(handle);
/* OVS queue N maps to tc class handle 1:(N+1); reject anything else. */
3250         if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3251             *queue_id = minor - 1;
3258         error = hfsc_parse_tca_options__(nl_options, options);
/* Queries kernel class 'handle' under 'parent' and parses the reply into
 * '*options' and/or '*stats' (either may be null). */
3265 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
3266                    unsigned int parent, struct hfsc_class *options,
3267                    struct netdev_queue_stats *stats)
3270     struct ofpbuf *reply;
3272     error = tc_query_class(netdev, handle, parent, &reply);
3277     error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
3278     ofpbuf_delete(reply);
/* Parses qdisc-level "max-rate" (bits/s) from 'details' into 'class'
 * (bytes/s).  If unset or zero, falls back to the link speed from
 * netdev_get_features() (assumed 100 Mbps when features are unavailable);
 * min_rate is pinned to max_rate for the root class.
 * NOTE(review): restored "&current", which had been mangled to the mojibake
 * "¤t" by an HTML-entity decoding pass (&curren -> U+00A4). */
3283 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct smap *details,
3284                            struct hfsc_class *class)
3287     const char *max_rate_s;
3289     max_rate_s = smap_get(details, "max-rate");
3290     max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3293         enum netdev_features current;
3295         netdev_get_features(netdev, &current, NULL, NULL, NULL);
3296         max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3299     class->min_rate = max_rate;
3300     class->max_rate = max_rate;
/* Parses per-queue "min-rate" and "max-rate" from 'details' (bits/s) into
 * 'class' (bytes/s), clamping to [1, root max-rate] and ensuring
 * max_rate >= min_rate. */
3304 hfsc_parse_class_details__(struct netdev *netdev,
3305                            const struct smap *details,
3306                            struct hfsc_class * class)
3308     const struct hfsc *hfsc;
3309     uint32_t min_rate, max_rate;
3310     const char *min_rate_s, *max_rate_s;
3312     hfsc = hfsc_get__(netdev);
3313     min_rate_s = smap_get(details, "min-rate");
3314     max_rate_s = smap_get(details, "max-rate");
3316     min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3317     min_rate = MAX(min_rate, 1);
3318     min_rate = MIN(min_rate, hfsc->max_rate);
3320     max_rate = (max_rate_s
3321                 ? strtoull(max_rate_s, NULL, 10) / 8
3323     max_rate = MAX(max_rate, min_rate);
3324     max_rate = MIN(max_rate, hfsc->max_rate);
3326     class->min_rate = min_rate;
3327     class->max_rate = max_rate;
3332 /* Create an HFSC qdisc.
3334  * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3336 hfsc_setup_qdisc__(struct netdev * netdev)
3338     struct tcmsg *tcmsg;
3339     struct ofpbuf request;
3340     struct tc_hfsc_qopt opt;
/* Remove any existing root qdisc first so the NLM_F_EXCL add succeeds. */
3342     tc_del_qdisc(netdev);
3344     tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3345                             NLM_F_EXCL | NLM_F_CREATE, &request);
3351     tcmsg->tcm_handle = tc_make_handle(1, 0);
3352     tcmsg->tcm_parent = TC_H_ROOT;
3354     memset(&opt, 0, sizeof opt);
3357     nl_msg_put_string(&request, TCA_KIND, "hfsc");
3358     nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3360     return tc_transact(&request, NULL);
3363 /* Create an HFSC class.
3365  * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3366  * sc rate <min_rate> ul rate <max_rate>" */
3368 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3369                    unsigned int parent, struct hfsc_class *class)
3373     struct tcmsg *tcmsg;
3374     struct ofpbuf request;
3375     struct tc_service_curve min, max;
3377     tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3383     tcmsg->tcm_handle = handle;
3384     tcmsg->tcm_parent = parent;
/* Linear curves only: m2 is the slope; min feeds both RSC and FSC, max is
 * the upper-limit (USC) curve. */
3388     min.m2 = class->min_rate;
3392     max.m2 = class->max_rate;
3394     nl_msg_put_string(&request, TCA_KIND, "hfsc");
3395     opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3396     nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3397     nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3398     nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3399     nl_msg_end_nested(&request, opt_offset);
3401     error = tc_transact(&request, NULL);
3403         VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3404                      "min-rate %ubps, max-rate %ubps (%s)",
3405                      netdev_get_name(netdev),
3406                      tc_get_major(handle), tc_get_minor(handle),
3407                      tc_get_major(parent), tc_get_minor(parent),
3408                      class->min_rate, class->max_rate, ovs_strerror(error));
/* tc_ops "tc_install": creates the root HFSC qdisc, sets up default class
 * 1:fffe from 'details', then records local state. */
3415 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
3418     struct hfsc_class class;
3420     error = hfsc_setup_qdisc__(netdev);
3426     hfsc_parse_qdisc_details__(netdev, details, &class);
3427     error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3428                                tc_make_handle(1, 0), &class);
3434     hfsc_install__(netdev, class.max_rate);
/* tc_ops "tc_load": reconstructs local HFSC state from the kernel by
 * querying the default class and dumping all queues. */
3439 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3442     struct nl_dump dump;
3443     struct hfsc_class hc;
3446     hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3447     hfsc_install__(netdev, hc.max_rate);
3449     if (!start_queue_dump(netdev, &dump)) {
3453     while (nl_dump_next(&dump, &msg)) {
3454         unsigned int queue_id;
3456         if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3457             hfsc_update_queue__(netdev, queue_id, &hc);
3461     nl_dump_done(&dump);
/* tc_ops "tc_destroy": frees every cached hfsc_class and the hfsc wrapper.
 * HMAP_FOR_EACH_SAFE permits removal during iteration. */
3466 hfsc_tc_destroy(struct tc *tc)
3469     struct hfsc_class *hc, *next;
3471     hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3473     HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3474         hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* tc_ops "qdisc_get": reports the root max-rate back in bits/s. */
3483 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
3485     const struct hfsc *hfsc;
3486     hfsc = hfsc_get__(netdev);
3487     smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
/* tc_ops "qdisc_set": reconfigures the default class 1:fffe and caches the
 * new max-rate on success. */
3492 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
3495     struct hfsc_class class;
3497     hfsc_parse_qdisc_details__(netdev, details, &class);
3498     error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3499                                tc_make_handle(1, 0), &class);
3502     hfsc_get__(netdev)->max_rate = class.max_rate;
/* tc_ops "class_get": exports a queue's parameters in bits/s ("max-rate"
 * omitted when it equals "min-rate"). */
3509 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3510                const struct tc_queue *queue, struct smap *details)
3512     const struct hfsc_class *hc;
3514     hc = hfsc_class_cast__(queue);
3515     smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3516     if (hc->min_rate != hc->max_rate) {
3517         smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
/* tc_ops "class_set": parses 'details', programs kernel class 1:(queue_id+1)
 * under the default class, and updates the local cache on success. */
3523 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3524                const struct smap *details)
3527     struct hfsc_class class;
3529     error = hfsc_parse_class_details__(netdev, details, &class);
3534     error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3535                                tc_make_handle(1, 0xfffe), &class);
3540     hfsc_update_queue__(netdev, queue_id, &class);
/* tc_ops "class_delete": removes the kernel class and the cached entry. */
3545 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3549     struct hfsc_class *hc;
3551     hc = hfsc_class_cast__(queue);
3552     hfsc = hfsc_get__(netdev);
3554     error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3556         hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* tc_ops "class_get_stats": queries kernel stats for one queue. */
3563 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3564                      struct netdev_queue_stats *stats)
3566     return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3567                               tc_make_handle(1, 0xfffe), NULL, stats);
/* tc_ops "class_dump_stats": parses one dumped class message and invokes
 * 'cb' with the stats for any handle that maps to a valid OVS queue. */
3571 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3572                      const struct ofpbuf *nlmsg,
3573                      netdev_dump_queue_stats_cb *cb, void *aux)
3575     struct netdev_queue_stats stats;
3576     unsigned int handle, major, minor;
3579     error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3584     major = tc_get_major(handle);
3585     minor = tc_get_minor(handle);
3586     if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3587         (*cb)(minor - 1, &stats, aux);
/* Virtual-table entry registering the HFSC ("linux-hfsc") QoS
 * implementation. */
3592 static const struct tc_ops tc_ops_hfsc = {
3593     "hfsc",                     /* linux_name */
3594     "linux-hfsc",               /* ovs_name */
3595     HFSC_N_QUEUES,              /* n_queues */
3596     hfsc_tc_install,            /* tc_install */
3597     hfsc_tc_load,               /* tc_load */
3598     hfsc_tc_destroy,            /* tc_destroy */
3599     hfsc_qdisc_get,             /* qdisc_get */
3600     hfsc_qdisc_set,             /* qdisc_set */
3601     hfsc_class_get,             /* class_get */
3602     hfsc_class_set,             /* class_set */
3603     hfsc_class_delete,          /* class_delete */
3604     hfsc_class_get_stats,       /* class_get_stats */
3605     hfsc_class_dump_stats       /* class_dump_stats */
3608 /* "linux-default" traffic control class.
3610  * This class represents the default, unnamed Linux qdisc.  It corresponds to
3611  * the "" (empty string) QoS type in the OVS database. */
/* Points 'netdev_' at a shared, immutable tc object for the default qdisc;
 * no per-device allocation is needed since there is no state. */
3614 default_install__(struct netdev *netdev_)
3616     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3617     static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
3619     /* Nothing but a tc class implementation is allowed to write to a tc.  This
3620      * class never does that, so we can legitimately use a const tc object. */
3621     netdev->tc = CONST_CAST(struct tc *, &tc);
/* tc_ops "tc_install" for the default qdisc: nothing to configure. */
3625 default_tc_install(struct netdev *netdev,
3626                    const struct smap *details OVS_UNUSED)
3628     default_install__(netdev);
/* tc_ops "tc_load" for the default qdisc: nothing to parse. */
3633 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3635     default_install__(netdev);
/* Virtual-table entry for the default qdisc; most operations are
 * unsupported (NULL).  (Fragment: some initializer lines are missing.) */
3639 static const struct tc_ops tc_ops_default = {
3640     NULL,                       /* linux_name */
3645     NULL,                       /* tc_destroy */
3646     NULL,                       /* qdisc_get */
3647     NULL,                       /* qdisc_set */
3648     NULL,                       /* class_get */
3649     NULL,                       /* class_set */
3650     NULL,                       /* class_delete */
3651     NULL,                       /* class_get_stats */
3652     NULL                        /* class_dump_stats */
3655 /* "linux-other" traffic control class.
/* tc_ops "tc_load" for unrecognized qdiscs: installs a shared, immutable
 * placeholder tc object. */
3660 other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
3662     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3663     static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
3665     /* Nothing but a tc class implementation is allowed to write to a tc.  This
3666      * class never does that, so we can legitimately use a const tc object. */
3667     netdev->tc = CONST_CAST(struct tc *, &tc);
/* Virtual-table entry for unrecognized qdiscs; read-only, mostly NULL. */
3671 static const struct tc_ops tc_ops_other = {
3672     NULL,                       /* linux_name */
3673     "linux-other",              /* ovs_name */
3675     NULL,                       /* tc_install */
3677     NULL,                       /* tc_destroy */
3678     NULL,                       /* qdisc_get */
3679     NULL,                       /* qdisc_set */
3680     NULL,                       /* class_get */
3681     NULL,                       /* class_set */
3682     NULL,                       /* class_delete */
3683     NULL,                       /* class_get_stats */
3684     NULL                        /* class_dump_stats */
3687 /* Traffic control. */
3689 /* Number of kernel "tc" ticks per second. */
3690 static double ticks_per_s;
3692 /* Number of kernel "jiffies" per second.  This is used for the purpose of
3693  * computing buffer sizes.  Generally kernel qdiscs need to be able to buffer
3694  * one jiffy's worth of data.
3696  * There are two possibilities here:
3698  *    - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3699  *      approximate range of 100 to 1024.  That means that we really need to
3700  *      make sure that the qdisc can buffer that much data.
3702  *    - 'buffer_hz' is an absurdly large number.  That means that the kernel
3703  *      has finely granular timers and there's no need to fudge additional room
3704  *      for buffers.  (There's no extra effort needed to implement that: the
3705  *      large 'buffer_hz' is used as a divisor, so practically any number will
3706  *      come out as 0 in the division.  Small integer results in the case of
3707  *      really high dividends won't have any real effect anyhow.)
3709 static unsigned int buffer_hz;
3711 /* Returns tc handle 'major':'minor'. */
3713 tc_make_handle(unsigned int major, unsigned int minor)
3715     return TC_H_MAKE(major << 16, minor);
3718 /* Returns the major number from 'handle'. */
3720 tc_get_major(unsigned int handle)
3722     return TC_H_MAJ(handle) >> 16;
3725 /* Returns the minor number from 'handle'. */
3727 tc_get_minor(unsigned int handle)
3729     return TC_H_MIN(handle);
/* Initializes 'request' as an rtnetlink tc message of the given 'type'
 * (e.g. RTM_NEWQDISC) with NLM_F_REQUEST plus 'flags', addressed to
 * 'netdev's ifindex, and returns a pointer to the embedded tcmsg for the
 * caller to finish filling in. */
3732 static struct tcmsg *
3733 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3734                 struct ofpbuf *request)
3736     struct tcmsg *tcmsg;
3740     error = get_ifindex(netdev, &ifindex);
3745     ofpbuf_init(request, 512);
3746     nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3747     tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3748     tcmsg->tcm_family = AF_UNSPEC;
3749     tcmsg->tcm_ifindex = ifindex;
3750     /* Caller should fill in tcmsg->tcm_handle. */
3751     /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' over NETLINK_ROUTE, optionally capturing the reply in
 * '*replyp', and releases the request buffer. */
3757 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3759     int error = nl_transact(NETLINK_ROUTE, request, replyp);
3760     ofpbuf_uninit(request);
3764 /* Adds or deletes a root ingress qdisc on 'netdev'.  We use this for
3765  * policing configuration.
3767  * This function is equivalent to running the following when 'add' is true:
3768  *     /sbin/tc qdisc add dev <devname> handle ffff: ingress
3770  * This function is equivalent to running the following when 'add' is false:
3771  *     /sbin/tc qdisc del dev <devname> handle ffff: ingress
3773  * The configuration and stats may be seen with the following command:
3774  *     /sbin/tc -s qdisc show dev <devname>
3776  * Returns 0 if successful, otherwise a positive errno value.
3779 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3781     struct ofpbuf request;
3782     struct tcmsg *tcmsg;
3784     int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3785     int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3787     tcmsg = tc_make_request(netdev, type, flags, &request);
3791     tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3792     tcmsg->tcm_parent = TC_H_INGRESS;
3793     nl_msg_put_string(&request, TCA_KIND, "ingress");
3794     nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3796     error = tc_transact(&request, NULL);
3798         /* If we're deleting the qdisc, don't worry about some of the
3799          * error conditions. */
3800         if (!add && (error == ENOENT || error == EINVAL)) {
3809 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
3812  * This function is equivalent to running:
3813  *     /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3814  *              basic police rate <kbits_rate>kbit burst <kbits_burst>k
3817  * The configuration and stats may be seen with the following command:
3818  *     /sbin/tc -s filter show <devname> eth0 parent ffff:
3820  * Returns 0 if successful, otherwise a positive errno value.
3823 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3825     struct tc_police tc_police;
3826     struct ofpbuf request;
3827     struct tcmsg *tcmsg;
3828     size_t basic_offset;
3829     size_t police_offset;
/* TC_POLICE_SHOT drops packets exceeding the configured rate/burst. */
3833     memset(&tc_police, 0, sizeof tc_police);
3834     tc_police.action = TC_POLICE_SHOT;
3835     tc_police.mtu = mtu;
3836     tc_fill_rate(&tc_police.rate, (kbits_rate * 1000)/8, mtu);
3837     tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3838                                         kbits_burst * 1024);
3840     tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3841                             NLM_F_EXCL | NLM_F_CREATE, &request);
/* Attach a "basic" classifier with a police action under the ingress qdisc
 * (parent ffff:), priority 49, matching all protocols. */
3845     tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
3846     tcmsg->tcm_info = tc_make_handle(49,
3847                                      (OVS_FORCE uint16_t) htons(ETH_P_ALL));
3849     nl_msg_put_string(&request, TCA_KIND, "basic");
3850     basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3851     police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
3852     nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
3853     tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
3854     nl_msg_end_nested(&request, police_offset);
3855     nl_msg_end_nested(&request, basic_offset);
3857     error = tc_transact(&request, NULL);
/* One-time reader of /proc/net/psched: derives 'ticks_per_s' (tc tick rate)
 * and 'buffer_hz' (jiffy rate) from the four hex parameters a b c d.
 * NOTE(review): the function's signature line and several statements (fclose,
 * sanity checks, assignments to buffer_hz) are elided in this excerpt. */
3868 /* The values in psched are not individually very meaningful, but they are
3869 * important. The tables below show some values seen in the wild.
3873 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3874 * (Before that, there are hints that it was 1000000000.)
3876 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3880 * -----------------------------------
3881 * [1] 000c8000 000f4240 000f4240 00000064
3882 * [2] 000003e8 00000400 000f4240 3b9aca00
3883 * [3] 000003e8 00000400 000f4240 3b9aca00
3884 * [4] 000003e8 00000400 000f4240 00000064
3885 * [5] 000003e8 00000040 000f4240 3b9aca00
3886 * [6] 000003e8 00000040 000f4240 000000f9
3888 * a b c d ticks_per_s buffer_hz
3889 * ------- --------- ---------- ------------- ----------- -------------
3890 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3891 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3892 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3893 * [4] 1,000 1,024 1,000,000 100 976,562 100
3894 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3895 * [6] 1,000 64 1,000,000 249 15,625,000 249
3897 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3898 * [2] 2.6.26-1-686-bigmem from Debian lenny
3899 * [3] 2.6.26-2-sparc64 from Debian lenny
3900 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3901 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3902 * [6] 2.6.34 from kernel.org on KVM
/* 'once' guarantees the /proc file is parsed at most once per process. */
3904 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
3905 static const char fn[] = "/proc/net/psched";
3906 unsigned int a, b, c, d;
3909 if (!ovsthread_once_start(&once)) {
3916 stream = fopen(fn, "r");
3918 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
/* The file contains exactly four hex words. */
3922 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3923 VLOG_WARN("%s: read failed", fn);
3927 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3931 VLOG_WARN("%s: invalid scheduler parameters", fn);
/* ticks_per_s = a * c / b, computed in floating point to avoid overflow. */
3935 ticks_per_s = (double) a * c / b;
3939 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3942 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3945 ovsthread_once_done(&once);
3948 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3949 * rate of 'rate' bytes per second. */
/* Inverse of tc_bytes_to_ticks(); relies on 'ticks_per_s' computed from
 * /proc/net/psched above. */
3951 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3954 return (rate * ticks) / ticks_per_s;
3957 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3958 * rate of 'rate' bytes per second. */
/* Guards against division by zero: a zero 'rate' yields 0 ticks.  The
 * multiplication is widened to unsigned long long before dividing. */
3960 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3963 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3966 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3967 * a transmission rate of 'rate' bytes per second. */
/* One jiffy's worth of traffic: rate divided by the kernel's 'buffer_hz'
 * (parsed from /proc/net/psched). */
3969 tc_buffer_per_jiffy(unsigned int rate)
3972 return rate / buffer_hz;
3975 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3976 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3977 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3978 * stores NULL into it if it is absent.
3980 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3983 * Returns 0 if successful, otherwise a positive errno value. */
3985 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3986 struct nlattr **options)
/* TCA_KIND is mandatory; TCA_OPTIONS may legitimately be absent. */
3988 static const struct nl_policy tca_policy[] = {
3989 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3990 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3992 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
/* Attributes start after the netlink header plus the fixed tcmsg. */
3994 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3995 tca_policy, ta, ARRAY_SIZE(ta))) {
3996 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
4001 *kind = nl_attr_get_string(ta[TCA_KIND]);
4005 *options = ta[TCA_OPTIONS];
4020 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
4021 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
4022 * into '*options', and its queue statistics into '*stats'. Any of the output
4023 * arguments may be null.
4025 * Returns 0 if successful, otherwise a positive errno value. */
4027 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
4028 struct nlattr **options, struct netdev_queue_stats *stats)
4030 static const struct nl_policy tca_policy[] = {
4031 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
4032 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
4034 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4036 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4037 tca_policy, ta, ARRAY_SIZE(ta))) {
4038 VLOG_WARN_RL(&rl, "failed to parse class message");
/* The class handle lives in the fixed-size tcmsg header, not an attribute. */
4043 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
4044 *handlep = tc->tcm_handle;
4048 *options = ta[TCA_OPTIONS];
4052 const struct gnet_stats_queue *gsq;
4053 struct gnet_stats_basic gsb;
4055 static const struct nl_policy stats_policy[] = {
4056 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
4057 .min_len = sizeof gsb },
4058 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
4059 .min_len = sizeof *gsq },
4061 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
/* TCA_STATS2 nests the basic (bytes/packets) and queue (drops) counters. */
4063 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
4064 sa, ARRAY_SIZE(sa))) {
4065 VLOG_WARN_RL(&rl, "failed to parse class stats");
4069 /* Alignment issues screw up the length of struct gnet_stats_basic on
4070 * some arch/bitsize combinations. Newer versions of Linux have a
4071 * struct gnet_stats_basic_packed, but we can't depend on that. The
4072 * easiest thing to do is just to make a copy. */
4073 memset(&gsb, 0, sizeof gsb);
4074 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
4075 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
4076 stats->tx_bytes = gsb.bytes;
4077 stats->tx_packets = gsb.packets;
4079 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
4080 stats->tx_errors = gsq->drops;
/* On the error path, zero the caller's stats rather than leaving garbage. */
4090 memset(stats, 0, sizeof *stats);
4095 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
/* Sends an RTM_GETTCLASS request; on success '*replyp' receives the kernel's
 * echoed reply (caller owns it).  Returns 0 or a positive errno. */
4098 tc_query_class(const struct netdev *netdev,
4099 unsigned int handle, unsigned int parent,
4100 struct ofpbuf **replyp)
4102 struct ofpbuf request;
4103 struct tcmsg *tcmsg;
/* NLM_F_ECHO asks the kernel to send the class description back. */
4106 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
4110 tcmsg->tcm_handle = handle;
4111 tcmsg->tcm_parent = parent;
4113 error = tc_transact(&request, replyp);
4115 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
4116 netdev_get_name(netdev),
4117 tc_get_major(handle), tc_get_minor(handle),
4118 tc_get_major(parent), tc_get_minor(parent),
4119 ovs_strerror(error));
4124 /* Equivalent to "tc class del dev <name> handle <handle>". */
/* Returns 0 if successful, otherwise a positive errno value; failures are
 * logged (rate-limited) but left for the caller to handle. */
4126 tc_delete_class(const struct netdev *netdev, unsigned int handle)
4128 struct ofpbuf request;
4129 struct tcmsg *tcmsg;
4132 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
4136 tcmsg->tcm_handle = handle;
/* Parent 0: let the kernel locate the class by its handle alone. */
4137 tcmsg->tcm_parent = 0;
4139 error = tc_transact(&request, NULL);
4141 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
4142 netdev_get_name(netdev),
4143 tc_get_major(handle), tc_get_minor(handle),
4144 ovs_strerror(error));
4149 /* Equivalent to "tc qdisc del dev <name> root". */
/* Also tears down any cached OVS-side tc state for the netdev once the
 * kernel-side delete succeeds (or is a harmless EINVAL). */
4151 tc_del_qdisc(struct netdev *netdev_)
4153 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4154 struct ofpbuf request;
4155 struct tcmsg *tcmsg;
4158 tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
/* 1:0 is the handle OVS assigns to qdiscs it creates; TC_H_ROOT targets the
 * device's root qdisc. */
4162 tcmsg->tcm_handle = tc_make_handle(1, 0);
4163 tcmsg->tcm_parent = TC_H_ROOT;
4165 error = tc_transact(&request, NULL);
4166 if (error == EINVAL) {
4167 /* EINVAL probably means that the default qdisc was in use, in which
4168 * case we've accomplished our purpose. */
/* Drop our cached tc state so the next query re-probes the kernel. */
4171 if (!error && netdev->tc) {
4172 if (netdev->tc->ops->tc_destroy) {
4173 netdev->tc->ops->tc_destroy(netdev->tc);
4180 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
4181 * kernel to determine what they are. Returns 0 if successful, otherwise a
4182 * positive errno value. */
4184 tc_query_qdisc(const struct netdev *netdev_)
4186 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4187 struct ofpbuf request, *qdisc;
4188 const struct tc_ops *ops;
4189 struct tcmsg *tcmsg;
4197 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4198 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4199 * 2.6.35 without that fix backported to it.
4201 * To avoid the OOPS, we must not make a request that would attempt to dump
4202 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4203 * few others. There are a few ways that I can see to do this, but most of
4204 * them seem to be racy (and if you lose the race the kernel OOPSes). The
4205 * technique chosen here is to assume that any non-default qdisc that we
4206 * create will have a class with handle 1:0. The built-in qdiscs only have
4207 * a class with handle 0:0.
4209 * We could check for Linux 2.6.35+ and use a more straightforward method
4211 tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
4215 tcmsg->tcm_handle = tc_make_handle(1, 0);
4216 tcmsg->tcm_parent = 0;
4218 /* Figure out what tc class to instantiate. */
4219 error = tc_transact(&request, &qdisc);
/* Success: parse the qdisc kind and map it to one of our tc_ops tables;
 * unknown kinds fall back to tc_ops_other (read-only support). */
4223 error = tc_parse_qdisc(qdisc, &kind, NULL);
4225 ops = &tc_ops_other;
4227 ops = tc_lookup_linux_name(kind);
/* Use a dedicated rate limiter so "unknown qdisc" noise is throttled
 * independently of the module-wide limiter. */
4229 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4230 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
4232 ops = &tc_ops_other;
4235 } else if (error == ENOENT) {
4236 /* Either it's a built-in qdisc, or it's a qdisc set up by some
4237 * other entity that doesn't have a handle 1:0. We will assume
4238 * that it's the system default qdisc. */
4239 ops = &tc_ops_default;
4242 /* Who knows? Maybe the device got deleted. */
4243 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
4244 netdev_get_name(netdev_), ovs_strerror(error));
4245 ops = &tc_ops_other;
4248 /* Instantiate it. */
4249 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
/* Invariant: tc_load succeeds iff it installed netdev->tc. */
4250 ovs_assert((load_error == 0) == (netdev->tc != NULL));
4251 ofpbuf_delete(qdisc);
4253 return error ? error : load_error;
4256 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4257 approximate the time to transmit packets of various lengths. For an MTU of
4258 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4259 represents two possible packet lengths; for a MTU of 513 through 1024, four
4260 possible lengths; and so on.
4262 Returns, for the specified 'mtu', the number of bits that packet lengths
4263 need to be shifted right to fit within such a 256-entry table. */
4265 tc_calc_cell_log(unsigned int mtu)
/* A zero/unset MTU defaults to the maximum Ethernet payload. */
4270 mtu = ETH_PAYLOAD_MAX;
/* Account for L2 framing on top of the payload MTU. */
4272 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
/* Smallest shift that brings 'mtu' under the 256-entry table size.
 * NOTE(review): the loop body and return are elided in this excerpt. */
4274 for (cell_log = 0; mtu >= 256; cell_log++) {
4281 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
/* Zero-fills the tc_ratespec then sets the cell log, minimum packet unit,
 * and rate fields that the kernel's rate tables require. */
4284 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4286 memset(rate, 0, sizeof *rate);
4287 rate->cell_log = tc_calc_cell_log(mtu);
4288 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4289 /* rate->cell_align = 0; */ /* distro headers. */
/* mpu: minimum packet unit; no frame is billed below Ethernet's minimum. */
4290 rate->mpu = ETH_TOTAL_MIN;
4294 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4295 * attribute of the specified "type".
4297 * See tc_calc_cell_log() above for a description of "rtab"s. */
4299 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
/* Reserve the attribute in place and fill each of its entries with the
 * tick cost of transmitting one "cell" of (i+1) << cell_log bytes. */
4304 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
4305 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
4306 unsigned packet_size = (i + 1) << rate->cell_log;
/* Clamp to the minimum packet unit so tiny frames are not under-billed. */
4307 if (packet_size < rate->mpu) {
4308 packet_size = rate->mpu;
4310 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
4314 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
4315 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
4316 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
4319 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
/* The burst must cover at least one jiffy of traffic plus one MTU, or HTB
 * cannot sustain the configured rate; result is in ticks. */
4321 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
4322 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
4325 /* Linux-only functions declared in netdev-linux.h */
4327 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4328 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
/* Three-step flow: read current flags (GFLAGS), write modified flags
 * (SFLAGS), then read back and warn if the device silently refused. */
4330 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4331 const char *flag_name, bool enable)
4333 const char *netdev_name = netdev_get_name(netdev);
4334 struct ethtool_value evalue;
4338 COVERAGE_INC(netdev_get_ethtool);
4339 memset(&evalue, 0, sizeof evalue);
4340 error = netdev_linux_do_ethtool(netdev_name,
4341 (struct ethtool_cmd *)&evalue,
4342 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4347 COVERAGE_INC(netdev_set_ethtool);
/* Clear the bit then OR it back in only when enabling. */
4348 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4349 error = netdev_linux_do_ethtool(netdev_name,
4350 (struct ethtool_cmd *)&evalue,
4351 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
/* Some drivers accept SFLAGS but ignore unsupported bits, so verify. */
4356 COVERAGE_INC(netdev_get_ethtool);
4357 memset(&evalue, 0, sizeof evalue);
4358 error = netdev_linux_do_ethtool(netdev_name,
4359 (struct ethtool_cmd *)&evalue,
4360 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4365 if (new_flags != evalue.data) {
4366 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4367 "device %s failed", enable ? "enable" : "disable",
4368 flag_name, netdev_name);
4375 /* Utility functions. */
4377 /* Copies 'src' into 'dst', performing format conversion in the process. */
/* Field-by-field widening copy from the kernel's 32-bit rtnl_link_stats
 * counters into OVS's netdev_stats. */
4379 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4380 const struct rtnl_link_stats *src)
4382 dst->rx_packets = src->rx_packets;
4383 dst->tx_packets = src->tx_packets;
4384 dst->rx_bytes = src->rx_bytes;
4385 dst->tx_bytes = src->tx_bytes;
4386 dst->rx_errors = src->rx_errors;
4387 dst->tx_errors = src->tx_errors;
4388 dst->rx_dropped = src->rx_dropped;
4389 dst->tx_dropped = src->tx_dropped;
4390 dst->multicast = src->multicast;
4391 dst->collisions = src->collisions;
4392 dst->rx_length_errors = src->rx_length_errors;
4393 dst->rx_over_errors = src->rx_over_errors;
4394 dst->rx_crc_errors = src->rx_crc_errors;
4395 dst->rx_frame_errors = src->rx_frame_errors;
4396 dst->rx_fifo_errors = src->rx_fifo_errors;
4397 dst->rx_missed_errors = src->rx_missed_errors;
4398 dst->tx_aborted_errors = src->tx_aborted_errors;
4399 dst->tx_carrier_errors = src->tx_carrier_errors;
4400 dst->tx_fifo_errors = src->tx_fifo_errors;
4401 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4402 dst->tx_window_errors = src->tx_window_errors;
/* Fetches interface statistics for 'ifindex' via an RTM_GETLINK netlink
 * request, converting the IFLA_STATS payload into '*stats'.
 * Returns 0 on success, otherwise a positive errno value. */
4406 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4408 /* Policy for RTNLGRP_LINK messages.
4410 * There are *many* more fields in these messages, but currently we only
4411 * care about these fields. */
4412 static const struct nl_policy rtnlgrp_link_policy[] = {
4413 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4414 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4415 .min_len = sizeof(struct rtnl_link_stats) },
4418 struct ofpbuf request;
4419 struct ofpbuf *reply;
4420 struct ifinfomsg *ifi;
4421 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
/* Build the RTM_GETLINK request addressed by ifindex. */
4424 ofpbuf_init(&request, 0);
4425 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4426 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4427 ifi->ifi_family = PF_UNSPEC;
4428 ifi->ifi_index = ifindex;
4429 error = nl_transact(NETLINK_ROUTE, &request, &reply);
4430 ofpbuf_uninit(&request);
4435 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4436 rtnlgrp_link_policy,
4437 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4438 ofpbuf_delete(reply);
/* IFLA_STATS is optional in the policy, so its absence must be handled. */
4442 if (!attrs[IFLA_STATS]) {
4443 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4444 ofpbuf_delete(reply);
4448 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4450 ofpbuf_delete(reply);
/* Fallback stats reader: parses the per-device counter line for
 * 'netdev_name' out of /proc/net/dev.
 * NOTE(review): several lines are elided in this excerpt (FILE/line-buffer
 * declarations, some sscanf output arguments, fclose, return paths). */
4456 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4458 static const char fn[] = "/proc/net/dev";
4463 stream = fopen(fn, "r");
4465 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
/* Scan every line; each device contributes one "<name>: <16 counters>" row. */
4470 while (fgets(line, sizeof line, stream)) {
4473 #define X64 "%"SCNu64
4476 X64 X64 X64 X64 X64 X64 X64 "%*u"
4477 X64 X64 X64 X64 X64 X64 X64 "%*u",
4483 &stats->rx_fifo_errors,
4484 &stats->rx_frame_errors,
4490 &stats->tx_fifo_errors,
4492 &stats->tx_carrier_errors) != 15) {
4493 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4494 } else if (!strcmp(devname, netdev_name)) {
/* /proc/net/dev does not break out these counters; mark them unknown. */
4495 stats->rx_length_errors = UINT64_MAX;
4496 stats->rx_over_errors = UINT64_MAX;
4497 stats->rx_crc_errors = UINT64_MAX;
4498 stats->rx_missed_errors = UINT64_MAX;
4499 stats->tx_aborted_errors = UINT64_MAX;
4500 stats->tx_heartbeat_errors = UINT64_MAX;
4501 stats->tx_window_errors = UINT64_MAX;
/* Reached only when no row matched 'netdev_name'. */
4507 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
/* Reads the device's IFF_* flags via SIOCGIFFLAGS into '*flags'.
 * Returns 0 on success, otherwise a positive errno value. */
4513 get_flags(const struct netdev *dev, unsigned int *flags)
4519 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
4521 *flags = ifr.ifr_flags;
/* Writes 'flags' to the device named 'name' via SIOCSIFFLAGS.
 * Returns 0 on success, otherwise a positive errno value. */
4527 set_flags(const char *name, unsigned int flags)
4531 ifr.ifr_flags = flags;
4532 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
/* Looks up the kernel ifindex for 'netdev_name' via SIOCGIFINDEX.
 * Returns the (positive) ifindex on success; on failure logs a warning and,
 * per the elided error path, presumably returns a negated errno — the
 * caller get_ifindex() negates a negative result back into an errno. */
4536 do_get_ifindex(const char *netdev_name)
4541 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4542 COVERAGE_INC(netdev_get_ifindex);
4544 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
4546 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4547 netdev_name, ovs_strerror(error));
4550 return ifr.ifr_ifindex;
/* Cached wrapper around do_get_ifindex(): stores both the ifindex and any
 * lookup error in the netdev, refreshing only when VALID_IFINDEX is unset.
 * Returns 0 and sets '*ifindexp' on success, else the cached errno. */
4554 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4556 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4558 if (!(netdev->cache_valid & VALID_IFINDEX)) {
4559 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
/* A negative result encodes -errno from do_get_ifindex(). */
4562 netdev->get_ifindex_error = -ifindex;
4563 netdev->ifindex = 0;
4565 netdev->get_ifindex_error = 0;
4566 netdev->ifindex = ifindex;
/* Cache both outcomes (success and failure) until invalidated. */
4568 netdev->cache_valid |= VALID_IFINDEX;
4571 *ifindexp = netdev->ifindex;
4572 return netdev->get_ifindex_error;
/* Reads the Ethernet hardware address of 'netdev_name' into 'ea' via
 * SIOCGIFHWADDR.  Returns 0 on success, otherwise a positive errno value. */
4576 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4582 memset(&ifr, 0, sizeof ifr);
4583 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4584 COVERAGE_INC(netdev_get_hwaddr);
4585 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
4587 /* ENODEV probably means that a vif disappeared asynchronously and
4588 * hasn't been removed from the database yet, so reduce the log level
4589 * to INFO for that case. */
4590 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
4591 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4592 netdev_name, ovs_strerror(error));
/* Only AF_UNSPEC and ARPHRD_ETHER address families are treated as usable. */
4595 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4596 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4597 VLOG_WARN("%s device has unknown hardware address family %d",
4598 netdev_name, hwaddr_family);
4600 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the Ethernet hardware address of 'netdev_name' to 'mac' via
 * SIOCSIFHWADDR.  Returns 0 on success, otherwise a positive errno value. */
4605 set_etheraddr(const char *netdev_name,
4606 const uint8_t mac[ETH_ADDR_LEN])
4611 memset(&ifr, 0, sizeof ifr);
4612 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
/* The kernel requires the address family to be declared alongside the MAC. */
4613 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
4614 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4615 COVERAGE_INC(netdev_set_hwaddr);
4616 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
4618 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4619 netdev_name, ovs_strerror(error));
/* Issues ethtool command 'cmd' (e.g. ETHTOOL_GFLAGS) for device 'name',
 * using 'ecmd' as the in/out command buffer; 'cmd_name' is used only for
 * logging.  Returns 0 on success, otherwise a positive errno value.
 * EOPNOTSUPP is deliberately not logged — see the comment below. */
4625 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4626 int cmd, const char *cmd_name)
4631 memset(&ifr, 0, sizeof ifr);
4632 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
/* SIOCETHTOOL passes the command struct through ifr_data. */
4633 ifr.ifr_data = (caddr_t) ecmd;
4636 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
4638 if (error != EOPNOTSUPP) {
4639 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4640 "failed: %s", cmd_name, name, ovs_strerror(error));
4642 /* The device doesn't support this operation. That's pretty
4643 * common, so there's no point in logging anything. */
/* Retrieves an IPv4 address of 'netdev' (which address depends on 'cmd',
 * e.g. SIOCGIFADDR vs. SIOCGIFDSTADDR) into '*ip'.  'cmd_name' is used for
 * logging by the ioctl helper.  Returns 0 or a positive errno value. */
4650 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4651 int cmd, const char *cmd_name)
4656 ifr.ifr_addr.sa_family = AF_INET;
4657 error = af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
/* ALIGNED_CAST avoids the strict-aliasing/alignment pitfalls of casting
 * the generic sockaddr to sockaddr_in directly. */
4659 const struct sockaddr_in *sin = ALIGNED_CAST(struct sockaddr_in *,
4661 *ip = sin->sin_addr;
4666 /* Returns an AF_PACKET raw socket or a negative errno value. */
4668 af_packet_sock(void)
4670 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4673 if (ovsthread_once_start(&once)) {
4674 sock = socket(AF_PACKET, SOCK_RAW, 0);
4676 int error = set_nonblocking(sock);
4683 VLOG_ERR("failed to create packet socket: %s",
4684 ovs_strerror(errno));
4686 ovsthread_once_done(&once);