2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
23 #include <arpa/inet.h>
25 #include <linux/filter.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <sys/types.h>
37 #include <sys/ioctl.h>
38 #include <sys/socket.h>
39 #include <netpacket/packet.h>
41 #include <net/if_arp.h>
42 #include <net/if_packet.h>
43 #include <net/route.h>
44 #include <netinet/in.h>
51 #include "dpif-linux.h"
52 #include "dpif-netdev.h"
53 #include "dynamic-string.h"
54 #include "fatal-signal.h"
57 #include "netdev-provider.h"
58 #include "netdev-vport.h"
59 #include "netlink-notifier.h"
60 #include "netlink-socket.h"
63 #include "openflow/openflow.h"
64 #include "ovs-atomic.h"
65 #include "packet-dpif.h"
67 #include "poll-loop.h"
68 #include "rtnetlink-link.h"
70 #include "socket-util.h"
73 #include "unaligned.h"
76 VLOG_DEFINE_THIS_MODULE(netdev_linux);
78 COVERAGE_DEFINE(netdev_set_policing);
79 COVERAGE_DEFINE(netdev_arp_lookup);
80 COVERAGE_DEFINE(netdev_get_ifindex);
81 COVERAGE_DEFINE(netdev_get_hwaddr);
82 COVERAGE_DEFINE(netdev_set_hwaddr);
83 COVERAGE_DEFINE(netdev_get_ethtool);
84 COVERAGE_DEFINE(netdev_set_ethtool);
87 /* These were introduced in Linux 2.6.14, so they might be missing if we have
89 #ifndef ADVERTISED_Pause
90 #define ADVERTISED_Pause (1 << 13)
92 #ifndef ADVERTISED_Asym_Pause
93 #define ADVERTISED_Asym_Pause (1 << 14)
96 /* These were introduced in Linux 2.6.24, so they might be missing if we
97 * have old headers. */
98 #ifndef ETHTOOL_GFLAGS
99 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
101 #ifndef ETHTOOL_SFLAGS
102 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
105 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
108 #define TC_RTAB_SIZE 1024
111 /* Linux 2.6.21 introduced struct tpacket_auxdata.
112 * Linux 2.6.27 added the tp_vlan_tci member.
113 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
114 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
115 * TP_STATUS_VLAN_TPID_VALID.
117 * With all this churn it's easiest to unconditionally define a replacement
118 * structure that has everything we want.
120 #ifndef PACKET_AUXDATA
121 #define PACKET_AUXDATA 8
123 #ifndef TP_STATUS_VLAN_VALID
124 #define TP_STATUS_VLAN_VALID (1 << 4)
126 #ifndef TP_STATUS_VLAN_TPID_VALID
127 #define TP_STATUS_VLAN_TPID_VALID (1 << 6)
129 #undef tpacket_auxdata
130 #define tpacket_auxdata rpl_tpacket_auxdata
131 struct tpacket_auxdata {
137 uint16_t tp_vlan_tci;
138 uint16_t tp_vlan_tpid;
142 VALID_IFINDEX = 1 << 0,
143 VALID_ETHERADDR = 1 << 1,
147 VALID_POLICING = 1 << 5,
148 VALID_VPORT_STAT_ERROR = 1 << 6,
149 VALID_DRVINFO = 1 << 7,
150 VALID_FEATURES = 1 << 8,
153 /* Traffic control. */
155 /* An instance of a traffic control class. Always associated with a particular
158 * Each TC implementation subclasses this with whatever additional data it
161 const struct tc_ops *ops;
162 struct hmap queues; /* Contains "struct tc_queue"s.
163 * Read by generic TC layer.
164 * Written only by TC implementation. */
167 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
169 /* One traffic control queue.
171 * Each TC implementation subclasses this with whatever additional data it
174 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
175 unsigned int queue_id; /* OpenFlow queue ID. */
176 long long int created; /* Time queue was created, in msecs. */
179 /* A particular kind of traffic control. Each implementation generally maps to
180 * one particular Linux qdisc class.
182 * The functions below return 0 if successful or a positive errno value on
183 * failure, except where otherwise noted. All of them must be provided, except
184 * where otherwise noted. */
186 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
187 * This is null for tc_ops_default and tc_ops_other, for which there are no
188 * appropriate values. */
189 const char *linux_name;
191 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
192 const char *ovs_name;
194 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
195 * queues. The queues are numbered 0 through n_queues - 1. */
196 unsigned int n_queues;
198 /* Called to install this TC class on 'netdev'. The implementation should
199 * make the Netlink calls required to set up 'netdev' with the right qdisc
200 * and configure it according to 'details'. The implementation may assume
201 * that the current qdisc is the default; that is, there is no need for it
202 * to delete the current qdisc before installing itself.
204 * The contents of 'details' should be documented as valid for 'ovs_name'
205 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
206 * (which is built as ovs-vswitchd.conf.db(8)).
208 * This function must return 0 if and only if it sets 'netdev->tc' to an
209 * initialized 'struct tc'.
211 * (This function is null for tc_ops_other, which cannot be installed. For
212 * other TC classes it should always be nonnull.) */
213 int (*tc_install)(struct netdev *netdev, const struct smap *details);
215 /* Called when the netdev code determines (through a Netlink query) that
216 * this TC class's qdisc is installed on 'netdev', but we didn't install
217 * it ourselves and so don't know any of the details.
219 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
220 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
221 * implementation should parse the other attributes of 'nlmsg' as
222 * necessary to determine its configuration. If necessary it should also
223 * use Netlink queries to determine the configuration of queues on
226 * This function must return 0 if and only if it sets 'netdev->tc' to an
227 * initialized 'struct tc'. */
228 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
230 /* Destroys the data structures allocated by the implementation as part of
231 * 'tc'. (This includes destroying 'tc->queues' by calling
234 * The implementation should not need to perform any Netlink calls. If
235 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
236 * (But it may not be desirable.)
238 * This function may be null if 'tc' is trivial. */
239 void (*tc_destroy)(struct tc *tc);
241 /* Retrieves details of 'netdev->tc' configuration into 'details'.
243 * The implementation should not need to perform any Netlink calls, because
244 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
245 * cached the configuration.
247 * The contents of 'details' should be documented as valid for 'ovs_name'
248 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
249 * (which is built as ovs-vswitchd.conf.db(8)).
251 * This function may be null if 'tc' is not configurable.
253 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
255 /* Reconfigures 'netdev->tc' according to 'details', performing any
256 * required Netlink calls to complete the reconfiguration.
258 * The contents of 'details' should be documented as valid for 'ovs_name'
259 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
260 * (which is built as ovs-vswitchd.conf.db(8)).
262 * This function may be null if 'tc' is not configurable.
264 int (*qdisc_set)(struct netdev *, const struct smap *details);
266 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
267 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
269 * The contents of 'details' should be documented as valid for 'ovs_name'
270 * in the "other_config" column in the "Queue" table in
271 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
273 * The implementation should not need to perform any Netlink calls, because
274 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
275 * cached the queue configuration.
277 * This function may be null if 'tc' does not have queues ('n_queues' is
279 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
280 struct smap *details);
282 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
283 * 'details', perfoming any required Netlink calls to complete the
284 * reconfiguration. The caller ensures that 'queue_id' is less than
287 * The contents of 'details' should be documented as valid for 'ovs_name'
288 * in the "other_config" column in the "Queue" table in
289 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
291 * This function may be null if 'tc' does not have queues or its queues are
292 * not configurable. */
293 int (*class_set)(struct netdev *, unsigned int queue_id,
294 const struct smap *details);
296 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
297 * tc_queue's within 'netdev->tc->queues'.
299 * This function may be null if 'tc' does not have queues or its queues
300 * cannot be deleted. */
301 int (*class_delete)(struct netdev *, struct tc_queue *queue);
303 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
304 * 'struct tc_queue's within 'netdev->tc->queues'.
306 * On success, initializes '*stats'.
308 * This function may be null if 'tc' does not have queues or if it cannot
309 * report queue statistics. */
310 int (*class_get_stats)(const struct netdev *netdev,
311 const struct tc_queue *queue,
312 struct netdev_queue_stats *stats);
314 /* Extracts queue stats from 'nlmsg', which is a response to a
315 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
317 * This function may be null if 'tc' does not have queues or if it cannot
318 * report queue statistics. */
319 int (*class_dump_stats)(const struct netdev *netdev,
320 const struct ofpbuf *nlmsg,
321 netdev_dump_queue_stats_cb *cb, void *aux);
/* Initializes the generic part of 'tc' for use with TC implementation 'ops'
 * and an empty queue map. */
325 tc_init(struct tc *tc, const struct tc_ops *ops)
328 hmap_init(&tc->queues);
/* Releases the generic part of 'tc' (the queues hmap).  Implementation-
 * specific teardown is done separately via the tc_ops->tc_destroy callback
 * (see netdev_linux_destruct below). */
332 tc_destroy(struct tc *tc)
334 hmap_destroy(&tc->queues);
337 static const struct tc_ops tc_ops_htb;
338 static const struct tc_ops tc_ops_hfsc;
339 static const struct tc_ops tc_ops_default;
340 static const struct tc_ops tc_ops_other;
342 static const struct tc_ops *const tcs[] = {
343 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
344 &tc_ops_hfsc, /* Hierarchical fair service curve. */
345 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
346 &tc_ops_other, /* Some other qdisc. */
350 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
351 static unsigned int tc_get_major(unsigned int handle);
352 static unsigned int tc_get_minor(unsigned int handle);
354 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
355 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
356 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
358 static struct tcmsg *tc_make_request(const struct netdev *, int type,
359 unsigned int flags, struct ofpbuf *);
360 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
361 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
362 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
365 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
366 struct nlattr **options);
367 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
368 struct nlattr **options,
369 struct netdev_queue_stats *);
370 static int tc_query_class(const struct netdev *,
371 unsigned int handle, unsigned int parent,
372 struct ofpbuf **replyp);
373 static int tc_delete_class(const struct netdev *, unsigned int handle);
375 static int tc_del_qdisc(struct netdev *netdev);
376 static int tc_query_qdisc(const struct netdev *netdev);
378 static int tc_calc_cell_log(unsigned int mtu);
379 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
380 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
381 const struct tc_ratespec *rate);
382 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
384 struct netdev_linux {
/* Linux-specific netdev state.  Embeds the generic 'struct netdev' as 'up'
 * (see the CONTAINER_OF casts below) plus lazily populated cached data. */
387 /* Protects all members below. */
388 struct ovs_mutex mutex;
/* Bitmap of VALID_* bits indicating which cached members are current. */
390 unsigned int cache_valid;
392 bool miimon; /* Link status of last poll. */
393 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
394 struct timer miimon_timer;
396 /* The following are figured out "on demand" only. They are only valid
397 * when the corresponding VALID_* bit in 'cache_valid' is set. */
399 uint8_t etheraddr[ETH_ADDR_LEN];
400 struct in_addr address, netmask;
403 unsigned int ifi_flags;
404 long long int carrier_resets;
405 uint32_t kbits_rate; /* Policing data. */
406 uint32_t kbits_burst;
407 int vport_stats_error; /* Cached error code from vport_get_stats().
408 0 or an errno value. */
409 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
410 int ether_addr_error; /* Cached error code from set/get etheraddr. */
411 int netdev_policing_error; /* Cached error code from set policing. */
412 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
413 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
415 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
416 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
417 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
419 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
422 /* For devices of class netdev_tap_class only. */
/* NOTE(review): the tap_fd member and the struct's closing brace fall outside
 * this excerpt (code below reads netdev->tap_fd). */
426 struct netdev_rxq_linux {
/* Per-queue receive state; embeds the generic 'struct netdev_rxq' as 'up'. */
427 struct netdev_rxq up;
/* NOTE(review): remaining members not visible here; rxq_construct below
 * assigns rx->is_tap and rx->fd. */
/* Rate limiter shared by most log messages in this file. */
432 /* This is set pretty low because we probably won't learn anything from the
433 * additional log messages. */
434 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
436 /* Polling miimon status for all ports causes performance degradation when
437 * handling a large number of ports. If there are no devices using miimon, then
438 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait(). */
/* Count of netdevs currently configured for miimon polling; incremented and
 * decremented (see netdev_linux_destruct) as devices enable/disable it. */
439 static atomic_int miimon_cnt = ATOMIC_VAR_INIT(0);
441 static void netdev_linux_run(void);
443 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
444 int cmd, const char *cmd_name);
445 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
446 int cmd, const char *cmd_name);
447 static int get_flags(const struct netdev *, unsigned int *flags);
448 static int set_flags(const char *, unsigned int flags);
449 static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
450 enum netdev_flags on, enum netdev_flags *old_flagsp)
451 OVS_REQUIRES(netdev->mutex);
452 static int do_get_ifindex(const char *netdev_name);
453 static int get_ifindex(const struct netdev *, int *ifindexp);
454 static int do_set_addr(struct netdev *netdev,
455 int ioctl_nr, const char *ioctl_name,
456 struct in_addr addr);
457 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
458 static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
459 static int get_stats_via_netlink(const struct netdev *, struct netdev_stats *);
460 static int af_packet_sock(void);
461 static bool netdev_linux_miimon_enabled(void);
462 static void netdev_linux_miimon_run(void);
463 static void netdev_linux_miimon_wait(void);
464 static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup);
/* Returns true if 'netdev_class' is one of the Linux classes implemented in
 * this file, identified by its 'run' callback. */
467 is_netdev_linux_class(const struct netdev_class *netdev_class)
469 return netdev_class->run == netdev_linux_run;
/* Returns true if 'netdev' is a tap device.  Tap devices share one fd per
 * device (see the comment above netdev_linux_construct_tap). */
473 is_tap_netdev(const struct netdev *netdev)
475 return netdev_get_class(netdev) == &netdev_tap_class;
478 static struct netdev_linux *
479 netdev_linux_cast(const struct netdev *netdev)
/* Downcast from the generic netdev; asserts the class is really ours. */
481 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
483 return CONTAINER_OF(netdev, struct netdev_linux, up);
486 static struct netdev_rxq_linux *
487 netdev_rxq_linux_cast(const struct netdev_rxq *rx)
/* Same downcast pattern for receive queues. */
489 ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
490 return CONTAINER_OF(rx, struct netdev_rxq_linux, up);
493 static void netdev_linux_update(struct netdev_linux *netdev,
494 const struct rtnetlink_link_change *)
495 OVS_REQUIRES(netdev->mutex);
496 static void netdev_linux_changed(struct netdev_linux *netdev,
497 unsigned int ifi_flags, unsigned int mask)
498 OVS_REQUIRES(netdev->mutex);
500 /* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK changes, or NULL
501 * if no such socket could be created. */
502 static struct nl_sock *
503 netdev_linux_notify_sock(void)
/* The socket is created once and shared by all callers; ovsthread_once makes
 * the lazy initialization thread-safe. */
505 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
506 static struct nl_sock *sock;
508 if (ovsthread_once_start(&once)) {
511 error = nl_sock_create(NETLINK_ROUTE, &sock);
513 error = nl_sock_join_mcgroup(sock, RTNLGRP_LINK);
/* Joining the multicast group failed: destroy the socket so later
 * callers see NULL rather than a half-initialized socket. */
515 nl_sock_destroy(sock);
519 ovsthread_once_done(&once);
/* Returns true if any netdev is currently using miimon polling, judged from
 * the global 'miimon_cnt' counter. */
526 netdev_linux_miimon_enabled(void)
530 atomic_read(&miimon_cnt, &miimon);
/* Periodic 'run' callback for the Linux netdev classes: polls miimon if any
 * device uses it, then drains pending rtnetlink link-change notifications
 * and applies them to the affected netdevs. */
535 netdev_linux_run(void)
537 struct nl_sock *sock;
540 if (netdev_linux_miimon_enabled()) {
541 netdev_linux_miimon_run();
544 sock = netdev_linux_notify_sock();
/* Local rate limit for netlink read/parse errors, separate from the
 * file-wide 'rl'. */
550 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
551 uint64_t buf_stub[4096 / 8];
/* Stack-allocated stub avoids a heap allocation for typical messages. */
554 ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
555 error = nl_sock_recv(sock, &buf, false);
557 struct rtnetlink_link_change change;
559 if (rtnetlink_link_parse(&buf, &change)) {
560 struct netdev *netdev_ = netdev_from_name(change.ifname);
561 if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
562 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
564 ovs_mutex_lock(&netdev->mutex);
565 netdev_linux_update(netdev, &change);
566 ovs_mutex_unlock(&netdev->mutex);
/* netdev_from_name() took a reference; drop it. */
568 netdev_close(netdev_);
570 } else if (error == ENOBUFS) {
/* The kernel dropped notifications because our receive buffer
 * overflowed.  We cannot know what changed, so refresh the flags of
 * every Linux netdev and invalidate all cached state. */
571 struct shash device_shash;
572 struct shash_node *node;
576 shash_init(&device_shash);
577 netdev_get_devices(&netdev_linux_class, &device_shash);
578 SHASH_FOR_EACH (node, &device_shash) {
579 struct netdev *netdev_ = node->data;
580 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
583 ovs_mutex_lock(&netdev->mutex);
584 get_flags(netdev_, &flags);
585 netdev_linux_changed(netdev, flags, 0);
586 ovs_mutex_unlock(&netdev->mutex);
588 netdev_close(netdev_);
590 shash_destroy(&device_shash);
591 } else if (error != EAGAIN) {
592 VLOG_WARN_RL(&rl, "error reading or parsing netlink (%s)",
593 ovs_strerror(error));
/* 'wait' callback: registers wakeups for the next miimon poll (if enabled)
 * and for readability of the shared rtnetlink notification socket. */
600 netdev_linux_wait(void)
602 struct nl_sock *sock;
604 if (netdev_linux_miimon_enabled()) {
605 netdev_linux_miimon_wait();
607 sock = netdev_linux_notify_sock();
609 nl_sock_wait(sock, POLLIN);
614 netdev_linux_changed(struct netdev_linux *dev,
615 unsigned int ifi_flags, unsigned int mask)
616 OVS_REQUIRES(dev->mutex)
/* Records new interface flags for 'dev' and invalidates cached state.
 * 'mask' names the VALID_* bits to KEEP; all other cached data is marked
 * stale.  Also bumps the netdev change sequence so watchers notice. */
618 netdev_change_seq_changed(&dev->up);
/* Any transition of IFF_RUNNING (up->down or down->up) counts as a
 * carrier reset. */
620 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
621 dev->carrier_resets++;
623 dev->ifi_flags = ifi_flags;
625 dev->cache_valid &= mask;
629 netdev_linux_update(struct netdev_linux *dev,
630 const struct rtnetlink_link_change *change)
631 OVS_REQUIRES(dev->mutex)
/* Applies a parsed rtnetlink link-change message to 'dev'. */
633 if (change->nlmsg_type == RTM_NEWLINK) {
/* Keep only the driver info cached, then refresh the fields that the
 * message carries directly (MTU, MAC address, ifindex). */
635 netdev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
637 /* Update netdev from rtnl-change msg. */
639 dev->mtu = change->mtu;
640 dev->cache_valid |= VALID_MTU;
641 dev->netdev_mtu_error = 0;
644 if (!eth_addr_is_zero(change->addr)) {
645 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
646 dev->cache_valid |= VALID_ETHERADDR;
647 dev->ether_addr_error = 0;
650 dev->ifindex = change->ifi_index;
651 dev->cache_valid |= VALID_IFINDEX;
652 dev->get_ifindex_error = 0;
/* Other message types (presumably RTM_DELLINK — the branch head is not
 * visible in this excerpt): invalidate the entire cache (mask of 0). */
655 netdev_linux_changed(dev, change->ifi_flags, 0);
659 static struct netdev *
/* 'alloc' callback: zero-allocates the Linux-specific netdev wrapper. */
660 netdev_linux_alloc(void)
662 struct netdev_linux *netdev = xzalloc(sizeof *netdev);
/* Construction steps shared by the system, internal, and tap classes:
 * currently just the mutex protecting the cached state. */
667 netdev_linux_common_construct(struct netdev_linux *netdev)
669 ovs_mutex_init(&netdev->mutex);
672 /* Creates system and internal devices. */
674 netdev_linux_construct(struct netdev *netdev_)
676 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
679 netdev_linux_common_construct(netdev);
/* Probe the kernel device by fetching its flags; this doubles as an
 * existence check. */
681 error = get_flags(&netdev->up, &netdev->ifi_flags);
682 if (error == ENODEV) {
683 if (netdev->up.netdev_class != &netdev_internal_class) {
684 /* The device does not exist, so don't allow it to be opened. */
687 /* "Internal" netdevs have to be created as netdev objects before
688 * they exist in the kernel, because creating them in the kernel
689 * happens by passing a netdev object to dpif_port_add().
690 * Therefore, ignore the error. */
697 /* For most types of netdevs we open the device for each call of
698 * netdev_open(). However, this is not the case with tap devices,
699 * since it is only possible to open the device once. In this
700 * situation we share a single file descriptor, and consequently
701 * buffers, across all readers. Therefore once data is read it will
702 * be unavailable to other reads for tap devices. */
704 netdev_linux_construct_tap(struct netdev *netdev_)
706 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
707 static const char tap_dev[] = "/dev/net/tun";
708 const char *name = netdev_->name;
712 netdev_linux_common_construct(netdev);
714 /* Open tap device. */
715 netdev->tap_fd = open(tap_dev, O_RDWR);
716 if (netdev->tap_fd < 0) {
718 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
722 /* Create tap device. */
/* IFF_NO_PI: no extra packet-information header on reads/writes. */
723 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
724 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
725 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
726 VLOG_WARN("%s: creating tap device failed: %s", name,
727 ovs_strerror(errno));
732 /* Make non-blocking. */
733 error = set_nonblocking(netdev->tap_fd);
/* Error path: close the tap fd before returning (label not visible in
 * this excerpt). */
741 close(netdev->tap_fd);
/* 'destruct' callback: tears down QoS state, the tap fd (if any), the miimon
 * accounting, and the mutex.  Mirrors netdev_linux_construct*(). */
746 netdev_linux_destruct(struct netdev *netdev_)
748 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
750 if (netdev->tc && netdev->tc->ops->tc_destroy) {
751 netdev->tc->ops->tc_destroy(netdev->tc);
754 if (netdev_get_class(netdev_) == &netdev_tap_class
755 && netdev->tap_fd >= 0)
757 close(netdev->tap_fd);
/* This device was counted in 'miimon_cnt'; remove it. */
760 if (netdev->miimon_interval > 0) {
762 atomic_sub(&miimon_cnt, 1, &junk);
765 ovs_mutex_destroy(&netdev->mutex);
/* 'dealloc' callback: frees the memory obtained in netdev_linux_alloc(). */
769 netdev_linux_dealloc(struct netdev *netdev_)
771 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
775 static struct netdev_rxq *
/* 'rxq_alloc' callback: zero-allocates the per-queue receive state. */
776 netdev_linux_rxq_alloc(void)
778 struct netdev_rxq_linux *rx = xzalloc(sizeof *rx);
/* 'rxq_construct' callback.  For tap devices, reuses the shared tap fd;
 * otherwise creates, configures, binds, and filters a raw AF_PACKET
 * socket for the device. */
783 netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
785 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
786 struct netdev *netdev_ = rx->up.netdev;
787 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
790 ovs_mutex_lock(&netdev->mutex);
791 rx->is_tap = is_tap_netdev(netdev_);
793 rx->fd = netdev->tap_fd;
795 struct sockaddr_ll sll;
797 /* Result of tcpdump -dd inbound */
/* Classic BPF program that accepts only inbound packets, to avoid
 * re-receiving packets we ourselves transmit on this socket. */
798 static const struct sock_filter filt[] = {
799 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
800 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
801 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
802 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
804 static const struct sock_fprog fprog = {
805 ARRAY_SIZE(filt), (struct sock_filter *) filt
808 /* Create file descriptor. */
809 rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
812 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
/* Request PACKET_AUXDATA so recvmsg() delivers VLAN tag info as a
 * control message (consumed in netdev_linux_rxq_recv_sock). */
817 if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) {
819 VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
820 netdev_get_name(netdev_), ovs_strerror(error));
824 /* Set non-blocking mode. */
825 error = set_nonblocking(rx->fd);
830 /* Get ethernet device index. */
831 error = get_ifindex(&netdev->up, &ifindex);
836 /* Bind to specific ethernet device. */
837 memset(&sll, 0, sizeof sll);
838 sll.sll_family = AF_PACKET;
839 sll.sll_ifindex = ifindex;
840 sll.sll_protocol = htons(ETH_P_ALL);
841 if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
843 VLOG_ERR("%s: failed to bind raw socket (%s)",
844 netdev_get_name(netdev_), ovs_strerror(error));
848 /* Filter for only inbound packets. */
849 error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
853 VLOG_ERR("%s: failed to attach filter (%s)",
854 netdev_get_name(netdev_), ovs_strerror(error));
858 ovs_mutex_unlock(&netdev->mutex);
/* Error path: unlock before returning (cleanup label not fully visible
 * in this excerpt). */
866 ovs_mutex_unlock(&netdev->mutex);
/* 'rxq_destruct' callback: releases per-queue resources (body partly outside
 * this excerpt). */
871 netdev_linux_rxq_destruct(struct netdev_rxq *rxq_)
873 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
/* 'rxq_dealloc' callback: frees the memory from netdev_linux_rxq_alloc(). */
881 netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_)
883 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
/* Returns the VLAN TPID reported by the kernel in 'aux', in network byte
 * order, defaulting to 802.1Q when the kernel predates tp_vlan_tpid. */
889 auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux)
891 if (aux->tp_status & TP_STATUS_VLAN_TPID_VALID) {
892 return htons(aux->tp_vlan_tpid);
894 return htons(ETH_TYPE_VLAN);
/* Returns true if 'aux' carries a VLAN TCI.  A nonzero TCI alone suffices on
 * kernels that predate TP_STATUS_VLAN_VALID. */
899 auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
901 return aux->tp_vlan_tci || aux->tp_status & TP_STATUS_VLAN_VALID;
/* Receives one packet from AF_PACKET socket 'fd' into 'buffer', re-inserting
 * any VLAN tag the kernel stripped (delivered via PACKET_AUXDATA).
 * Returns 0 on success, otherwise a positive errno value. */
905 netdev_linux_rxq_recv_sock(int fd, struct ofpbuf *buffer)
910 struct cmsghdr *cmsg;
/* Control-message buffer sized for one tpacket_auxdata payload. */
913 char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
917 /* Reserve headroom for a single VLAN tag */
918 ofpbuf_reserve(buffer, VLAN_HEADER_LEN);
919 size = ofpbuf_tailroom(buffer);
921 iov.iov_base = ofpbuf_data(buffer);
923 msgh.msg_name = NULL;
924 msgh.msg_namelen = 0;
927 msgh.msg_control = &cmsg_buffer;
928 msgh.msg_controllen = sizeof cmsg_buffer;
/* MSG_TRUNC makes recvmsg() return the full packet length even if it
 * did not fit, so oversize packets can be detected below; retry on
 * EINTR. */
932 retval = recvmsg(fd, &msgh, MSG_TRUNC);
933 } while (retval < 0 && errno == EINTR);
937 } else if (retval > size) {
941 ofpbuf_set_size(buffer, ofpbuf_size(buffer) + retval);
/* Walk the control messages looking for the PACKET_AUXDATA entry that
 * carries the stripped VLAN tag, if any. */
943 for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg; cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
944 const struct tpacket_auxdata *aux;
946 if (cmsg->cmsg_level != SOL_PACKET
947 || cmsg->cmsg_type != PACKET_AUXDATA
948 || cmsg->cmsg_len < CMSG_LEN(sizeof(struct tpacket_auxdata))) {
952 aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg));
953 if (auxdata_has_vlan_tci(aux)) {
/* A tagged frame must at least contain an Ethernet header. */
954 if (retval < ETH_HEADER_LEN) {
/* Push the tag back into the packet, using the headroom reserved
 * above. */
958 eth_push_vlan(buffer, auxdata_to_vlan_tpid(aux),
959 htons(aux->tp_vlan_tci));
/* Receives one packet from tap fd 'fd' into 'buffer' with a plain read(),
 * retrying on EINTR.  Returns 0 on success, otherwise a positive errno
 * value (the truncation check mirrors the sock variant above). */
968 netdev_linux_rxq_recv_tap(int fd, struct ofpbuf *buffer)
971 size_t size = ofpbuf_tailroom(buffer);
974 retval = read(fd, ofpbuf_data(buffer), size);
975 } while (retval < 0 && errno == EINTR);
979 } else if (retval > size) {
983 ofpbuf_set_size(buffer, ofpbuf_size(buffer) + retval);
988 netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dpif_packet **packets,
991 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
992 struct netdev *netdev = rx->up.netdev;
993 struct dpif_packet *packet;
994 struct ofpbuf *buffer;
998 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
999 mtu = ETH_PAYLOAD_MAX;
1002 packet = dpif_packet_new_with_headroom(VLAN_ETH_HEADER_LEN + mtu,
1003 DP_NETDEV_HEADROOM);
1004 buffer = &packet->ofpbuf;
1006 retval = (rx->is_tap
1007 ? netdev_linux_rxq_recv_tap(rx->fd, buffer)
1008 : netdev_linux_rxq_recv_sock(rx->fd, buffer));
1011 if (retval != EAGAIN && retval != EMSGSIZE) {
1012 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
1013 ovs_strerror(errno), netdev_rxq_get_name(rxq_));
1015 dpif_packet_delete(packet);
1017 dp_packet_pad(buffer);
1018 packets[0] = packet;
/* 'rxq_wait' callback: wakes the poll loop when the queue's fd is readable. */
1026 netdev_linux_rxq_wait(struct netdev_rxq *rxq_)
1028 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1029 poll_fd_wait(rx->fd, POLLIN);
/* 'rxq_drain' callback: discards packets queued on the receive fd.  For tap
 * devices this drains up to the interface's TX queue length of packets; for
 * sockets it flushes the kernel receive buffer. */
1033 netdev_linux_rxq_drain(struct netdev_rxq *rxq_)
1035 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1038 int error = af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_), &ifr,
1039 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
1043 drain_fd(rx->fd, ifr.ifr_qlen);
1046 return drain_rcvbuf(rx->fd);
1050 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
1051 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
1052 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
1053 * the packet is too big or too small to transmit on the device.
1055 * The caller retains ownership of 'buffer' in all cases.
1057 * The kernel maintains a packet transmission queue, so the caller is not
1058 * expected to do additional queuing of packets. */
/* 'send' callback: transmits the 'cnt' packets in 'pkts' one at a time.
 * (NOTE(review): the remaining parameter(s) of the signature are not visible
 * in this excerpt.) */
1060 netdev_linux_send(struct netdev *netdev_, struct dpif_packet **pkts, int cnt,
1066 /* 'i' is incremented only if there's no error */
1067 for (i = 0; i < cnt;) {
1068 const void *data = ofpbuf_data(&pkts[i]->ofpbuf);
1069 size_t size = ofpbuf_size(&pkts[i]->ofpbuf);
1072 if (!is_tap_netdev(netdev_)) {
1073 /* Use our AF_PACKET socket to send to this device. */
1074 struct sockaddr_ll sll;
1080 sock = af_packet_sock();
1085 ifindex = netdev_get_ifindex(netdev_);
1090 /* We don't bother setting most fields in sockaddr_ll because the
1091 * kernel ignores them for SOCK_RAW. */
1092 memset(&sll, 0, sizeof sll);
1093 sll.sll_family = AF_PACKET;
1094 sll.sll_ifindex = ifindex;
1096 iov.iov_base = CONST_CAST(void *, data);
1099 msg.msg_name = &sll;
1100 msg.msg_namelen = sizeof sll;
1103 msg.msg_control = NULL;
1104 msg.msg_controllen = 0;
1107 retval = sendmsg(sock, &msg, 0);
1109 /* Use the tap fd to send to this device. This is essential for
1110 * tap devices, because packets sent to a tap device with an
1111 * AF_PACKET socket will loop back to be *received* again on the
1112 * tap device. This doesn't occur on other interface types
1113 * because we attach a socket filter to the rx socket. */
1114 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1116 retval = write(netdev->tap_fd, data, size);
1120 /* The Linux AF_PACKET implementation never blocks waiting for room
1121 * for packets, instead returning ENOBUFS. Translate this into
1122 * EAGAIN for the caller. */
1123 error = errno == ENOBUFS ? EAGAIN : errno;
1124 if (error == EINTR) {
1125 /* continue without incrementing 'i', i.e. retry this packet */
1129 } else if (retval != size) {
/* A short write means the device transmitted a partial frame;
 * report it but keep going with the remaining packets. */
1130 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE" bytes"
1131 " of %"PRIuSIZE") on %s", retval, size,
1132 netdev_get_name(netdev_));
1137 /* Process the next packet in the batch */
/* Release the batch (NOTE(review): likely conditional on a 'may_steal'
 * flag in the non-visible part of the signature — confirm upstream). */
1142 for (i = 0; i < cnt; i++) {
1143 dpif_packet_delete(pkts[i]);
1147 if (error && error != EAGAIN) {
1148 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
1149 netdev_get_name(netdev_), ovs_strerror(error));
1156 /* Registers with the poll loop to wake up from the next call to poll_block()
1157 * when the packet transmission queue has sufficient room to transmit a packet
1158 * with netdev_send().
1160 * The kernel maintains a packet transmission queue, so the client is not
1161 * expected to do additional queuing of packets. Thus, this function is
1162 * unlikely to ever be used. It is included for completeness. */
1164 netdev_linux_send_wait(struct netdev *netdev)
1166 if (is_tap_netdev(netdev)) {
1167 /* TAP device always accepts packets.*/
1168 poll_immediate_wake();
1172 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1173 * otherwise a positive errno value. */
1175 netdev_linux_set_etheraddr(struct netdev *netdev_,
1176 const uint8_t mac[ETH_ADDR_LEN])
1178 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1179 enum netdev_flags old_flags = 0;
1182 ovs_mutex_lock(&netdev->mutex);
/* Fast path: if the cached address already matches (or a cached error is
 * pending), skip the ioctl entirely. */
1184 if (netdev->cache_valid & VALID_ETHERADDR) {
1185 error = netdev->ether_addr_error;
1186 if (error || eth_addr_equals(netdev->etheraddr, mac)) {
1189 netdev->cache_valid &= ~VALID_ETHERADDR;
1192 /* Tap devices must be brought down before setting the address. */
1193 if (is_tap_netdev(netdev_)) {
1194 update_flags(netdev, NETDEV_UP, 0, &old_flags);
1196 error = set_etheraddr(netdev_get_name(netdev_), mac);
/* ENODEV is cached alongside success so repeated calls on a vanished
 * device do not retry the ioctl. */
1197 if (!error || error == ENODEV) {
1198 netdev->ether_addr_error = error;
1199 netdev->cache_valid |= VALID_ETHERADDR;
1201 memcpy(netdev->etheraddr, mac, ETH_ADDR_LEN);
1205 if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
/* Restore the tap device to its prior UP state. */
1206 update_flags(netdev, 0, NETDEV_UP, &old_flags);
1210 ovs_mutex_unlock(&netdev->mutex);
1214 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1216 netdev_linux_get_etheraddr(const struct netdev *netdev_,
1217 uint8_t mac[ETH_ADDR_LEN])
1219 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1222 ovs_mutex_lock(&netdev->mutex);
1223 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1224 netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
1226 netdev->cache_valid |= VALID_ETHERADDR;
1229 error = netdev->ether_addr_error;
1231 memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN);
1233 ovs_mutex_unlock(&netdev->mutex);
1239 netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
1243 if (!(netdev->cache_valid & VALID_MTU)) {
1246 netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
1247 netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
1248 netdev->mtu = ifr.ifr_mtu;
1249 netdev->cache_valid |= VALID_MTU;
1252 error = netdev->netdev_mtu_error;
1254 *mtup = netdev->mtu;
1260 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1261 * in bytes, not including the hardware header; thus, this is typically 1500
1262 * bytes for Ethernet devices. */
1264 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1266 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1269 ovs_mutex_lock(&netdev->mutex);
1270 error = netdev_linux_get_mtu__(netdev, mtup);
1271 ovs_mutex_unlock(&netdev->mutex);
1276 /* Sets the maximum size of transmitted (MTU) for given device using linux
1277 * networking ioctl interface.
1280 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1282 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1286 ovs_mutex_lock(&netdev->mutex);
1287 if (netdev->cache_valid & VALID_MTU) {
1288 error = netdev->netdev_mtu_error;
1289 if (error || netdev->mtu == mtu) {
1292 netdev->cache_valid &= ~VALID_MTU;
1295 error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
1296 SIOCSIFMTU, "SIOCSIFMTU");
1297 if (!error || error == ENODEV) {
1298 netdev->netdev_mtu_error = error;
1299 netdev->mtu = ifr.ifr_mtu;
1300 netdev->cache_valid |= VALID_MTU;
1303 ovs_mutex_unlock(&netdev->mutex);
1307 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1308 * On failure, returns a negative errno value. */
1310 netdev_linux_get_ifindex(const struct netdev *netdev_)
1312 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1315 ovs_mutex_lock(&netdev->mutex);
1316 error = get_ifindex(netdev_, &ifindex);
1317 ovs_mutex_unlock(&netdev->mutex);
1319 return error ? -error : ifindex;
1323 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1325 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1327 ovs_mutex_lock(&netdev->mutex);
1328 if (netdev->miimon_interval > 0) {
1329 *carrier = netdev->miimon;
1331 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1333 ovs_mutex_unlock(&netdev->mutex);
1338 static long long int
1339 netdev_linux_get_carrier_resets(const struct netdev *netdev_)
1341 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1342 long long int carrier_resets;
1344 ovs_mutex_lock(&netdev->mutex);
1345 carrier_resets = netdev->carrier_resets;
1346 ovs_mutex_unlock(&netdev->mutex);
1348 return carrier_resets;
1352 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1353 struct mii_ioctl_data *data)
1358 memset(&ifr, 0, sizeof ifr);
1359 memcpy(&ifr.ifr_data, data, sizeof *data);
1360 error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
1361 memcpy(data, &ifr.ifr_data, sizeof *data);
1367 netdev_linux_get_miimon(const char *name, bool *miimon)
1369 struct mii_ioctl_data data;
1374 memset(&data, 0, sizeof data);
1375 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1377 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1378 data.reg_num = MII_BMSR;
1379 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1383 *miimon = !!(data.val_out & BMSR_LSTATUS);
1385 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1388 struct ethtool_cmd ecmd;
1390 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1393 COVERAGE_INC(netdev_get_ethtool);
1394 memset(&ecmd, 0, sizeof ecmd);
1395 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1398 struct ethtool_value eval;
1400 memcpy(&eval, &ecmd, sizeof eval);
1401 *miimon = !!eval.data;
1403 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1411 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1412 long long int interval)
1414 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1416 ovs_mutex_lock(&netdev->mutex);
1417 interval = interval > 0 ? MAX(interval, 100) : 0;
1418 if (netdev->miimon_interval != interval) {
1421 if (interval && !netdev->miimon_interval) {
1422 atomic_add(&miimon_cnt, 1, &junk);
1423 } else if (!interval && netdev->miimon_interval) {
1424 atomic_sub(&miimon_cnt, 1, &junk);
1427 netdev->miimon_interval = interval;
1428 timer_set_expired(&netdev->miimon_timer);
1430 ovs_mutex_unlock(&netdev->mutex);
1436 netdev_linux_miimon_run(void)
1438 struct shash device_shash;
1439 struct shash_node *node;
1441 shash_init(&device_shash);
1442 netdev_get_devices(&netdev_linux_class, &device_shash);
1443 SHASH_FOR_EACH (node, &device_shash) {
1444 struct netdev *netdev = node->data;
1445 struct netdev_linux *dev = netdev_linux_cast(netdev);
1448 ovs_mutex_lock(&dev->mutex);
1449 if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
1450 netdev_linux_get_miimon(dev->up.name, &miimon);
1451 if (miimon != dev->miimon) {
1452 dev->miimon = miimon;
1453 netdev_linux_changed(dev, dev->ifi_flags, 0);
1456 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1458 ovs_mutex_unlock(&dev->mutex);
1459 netdev_close(netdev);
1462 shash_destroy(&device_shash);
1466 netdev_linux_miimon_wait(void)
1468 struct shash device_shash;
1469 struct shash_node *node;
1471 shash_init(&device_shash);
1472 netdev_get_devices(&netdev_linux_class, &device_shash);
1473 SHASH_FOR_EACH (node, &device_shash) {
1474 struct netdev *netdev = node->data;
1475 struct netdev_linux *dev = netdev_linux_cast(netdev);
1477 ovs_mutex_lock(&dev->mutex);
1478 if (dev->miimon_interval > 0) {
1479 timer_wait(&dev->miimon_timer);
1481 ovs_mutex_unlock(&dev->mutex);
1482 netdev_close(netdev);
1484 shash_destroy(&device_shash);
1488 swap_uint64(uint64_t *a, uint64_t *b)
1495 /* Copies 'src' into 'dst', performing format conversion in the process.
1497 * 'src' is allowed to be misaligned. */
1499 netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1500 const struct ovs_vport_stats *src)
1502 dst->rx_packets = get_unaligned_u64(&src->rx_packets);
1503 dst->tx_packets = get_unaligned_u64(&src->tx_packets);
1504 dst->rx_bytes = get_unaligned_u64(&src->rx_bytes);
1505 dst->tx_bytes = get_unaligned_u64(&src->tx_bytes);
1506 dst->rx_errors = get_unaligned_u64(&src->rx_errors);
1507 dst->tx_errors = get_unaligned_u64(&src->tx_errors);
1508 dst->rx_dropped = get_unaligned_u64(&src->rx_dropped);
1509 dst->tx_dropped = get_unaligned_u64(&src->tx_dropped);
1511 dst->collisions = 0;
1512 dst->rx_length_errors = 0;
1513 dst->rx_over_errors = 0;
1514 dst->rx_crc_errors = 0;
1515 dst->rx_frame_errors = 0;
1516 dst->rx_fifo_errors = 0;
1517 dst->rx_missed_errors = 0;
1518 dst->tx_aborted_errors = 0;
1519 dst->tx_carrier_errors = 0;
1520 dst->tx_fifo_errors = 0;
1521 dst->tx_heartbeat_errors = 0;
1522 dst->tx_window_errors = 0;
1526 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1528 struct dpif_linux_vport reply;
1532 error = dpif_linux_vport_get(netdev_get_name(netdev), &reply, &buf);
1535 } else if (!reply.stats) {
1540 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
1548 get_stats_via_vport(const struct netdev *netdev_,
1549 struct netdev_stats *stats)
1551 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1553 if (!netdev->vport_stats_error ||
1554 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1557 error = get_stats_via_vport__(netdev_, stats);
1558 if (error && error != ENOENT) {
1559 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1561 netdev_get_name(netdev_), ovs_strerror(error));
1563 netdev->vport_stats_error = error;
1564 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
1568 /* Retrieves current device stats for 'netdev-linux'. */
1570 netdev_linux_get_stats(const struct netdev *netdev_,
1571 struct netdev_stats *stats)
1573 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1574 struct netdev_stats dev_stats;
1577 ovs_mutex_lock(&netdev->mutex);
1578 get_stats_via_vport(netdev_, stats);
1579 error = get_stats_via_netlink(netdev_, &dev_stats);
1581 if (!netdev->vport_stats_error) {
1584 } else if (netdev->vport_stats_error) {
1585 /* stats not available from OVS then use netdev stats. */
1588 /* Use kernel netdev's packet and byte counts since vport's counters
1589 * do not reflect packet counts on the wire when GSO, TSO or GRO are
1591 stats->rx_packets = dev_stats.rx_packets;
1592 stats->rx_bytes = dev_stats.rx_bytes;
1593 stats->tx_packets = dev_stats.tx_packets;
1594 stats->tx_bytes = dev_stats.tx_bytes;
1596 stats->rx_errors += dev_stats.rx_errors;
1597 stats->tx_errors += dev_stats.tx_errors;
1598 stats->rx_dropped += dev_stats.rx_dropped;
1599 stats->tx_dropped += dev_stats.tx_dropped;
1600 stats->multicast += dev_stats.multicast;
1601 stats->collisions += dev_stats.collisions;
1602 stats->rx_length_errors += dev_stats.rx_length_errors;
1603 stats->rx_over_errors += dev_stats.rx_over_errors;
1604 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1605 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1606 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1607 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1608 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1609 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1610 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1611 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1612 stats->tx_window_errors += dev_stats.tx_window_errors;
1614 ovs_mutex_unlock(&netdev->mutex);
1619 /* Retrieves current device stats for 'netdev-tap' netdev or
1620 * netdev-internal. */
1622 netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
1624 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1625 struct netdev_stats dev_stats;
1628 ovs_mutex_lock(&netdev->mutex);
1629 get_stats_via_vport(netdev_, stats);
1630 error = get_stats_via_netlink(netdev_, &dev_stats);
1632 if (!netdev->vport_stats_error) {
1635 } else if (netdev->vport_stats_error) {
1636 /* Transmit and receive stats will appear to be swapped relative to the
1637 * other ports since we are the one sending the data, not a remote
1638 * computer. For consistency, we swap them back here. This does not
1639 * apply if we are getting stats from the vport layer because it always
1640 * tracks stats from the perspective of the switch. */
1643 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1644 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1645 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1646 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1647 stats->rx_length_errors = 0;
1648 stats->rx_over_errors = 0;
1649 stats->rx_crc_errors = 0;
1650 stats->rx_frame_errors = 0;
1651 stats->rx_fifo_errors = 0;
1652 stats->rx_missed_errors = 0;
1653 stats->tx_aborted_errors = 0;
1654 stats->tx_carrier_errors = 0;
1655 stats->tx_fifo_errors = 0;
1656 stats->tx_heartbeat_errors = 0;
1657 stats->tx_window_errors = 0;
1659 /* Use kernel netdev's packet and byte counts since vport counters
1660 * do not reflect packet counts on the wire when GSO, TSO or GRO
1662 stats->rx_packets = dev_stats.tx_packets;
1663 stats->rx_bytes = dev_stats.tx_bytes;
1664 stats->tx_packets = dev_stats.rx_packets;
1665 stats->tx_bytes = dev_stats.rx_bytes;
1667 stats->rx_dropped += dev_stats.tx_dropped;
1668 stats->tx_dropped += dev_stats.rx_dropped;
1670 stats->rx_errors += dev_stats.tx_errors;
1671 stats->tx_errors += dev_stats.rx_errors;
1673 stats->multicast += dev_stats.multicast;
1674 stats->collisions += dev_stats.collisions;
1676 ovs_mutex_unlock(&netdev->mutex);
1682 netdev_internal_get_stats(const struct netdev *netdev_,
1683 struct netdev_stats *stats)
1685 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1688 ovs_mutex_lock(&netdev->mutex);
1689 get_stats_via_vport(netdev_, stats);
1690 error = netdev->vport_stats_error;
1691 ovs_mutex_unlock(&netdev->mutex);
1697 netdev_internal_set_stats(struct netdev *netdev,
1698 const struct netdev_stats *stats)
1700 struct ovs_vport_stats vport_stats;
1701 struct dpif_linux_vport vport;
1704 vport_stats.rx_packets = stats->rx_packets;
1705 vport_stats.tx_packets = stats->tx_packets;
1706 vport_stats.rx_bytes = stats->rx_bytes;
1707 vport_stats.tx_bytes = stats->tx_bytes;
1708 vport_stats.rx_errors = stats->rx_errors;
1709 vport_stats.tx_errors = stats->tx_errors;
1710 vport_stats.rx_dropped = stats->rx_dropped;
1711 vport_stats.tx_dropped = stats->tx_dropped;
1713 dpif_linux_vport_init(&vport);
1714 vport.cmd = OVS_VPORT_CMD_SET;
1715 vport.name = netdev_get_name(netdev);
1716 vport.stats = &vport_stats;
1718 err = dpif_linux_vport_transact(&vport, NULL, NULL);
1720 /* If the vport layer doesn't know about the device, that doesn't mean it
1721 * doesn't exist (after all were able to open it when netdev_open() was
1722 * called), it just means that it isn't attached and we'll be getting
1723 * stats a different way. */
1724 if (err == ENODEV) {
1732 netdev_linux_read_features(struct netdev_linux *netdev)
1734 struct ethtool_cmd ecmd;
1738 if (netdev->cache_valid & VALID_FEATURES) {
1742 COVERAGE_INC(netdev_get_ethtool);
1743 memset(&ecmd, 0, sizeof ecmd);
1744 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
1745 ETHTOOL_GSET, "ETHTOOL_GSET");
1750 /* Supported features. */
1751 netdev->supported = 0;
1752 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1753 netdev->supported |= NETDEV_F_10MB_HD;
1755 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1756 netdev->supported |= NETDEV_F_10MB_FD;
1758 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1759 netdev->supported |= NETDEV_F_100MB_HD;
1761 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1762 netdev->supported |= NETDEV_F_100MB_FD;
1764 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1765 netdev->supported |= NETDEV_F_1GB_HD;
1767 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1768 netdev->supported |= NETDEV_F_1GB_FD;
1770 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1771 netdev->supported |= NETDEV_F_10GB_FD;
1773 if (ecmd.supported & SUPPORTED_TP) {
1774 netdev->supported |= NETDEV_F_COPPER;
1776 if (ecmd.supported & SUPPORTED_FIBRE) {
1777 netdev->supported |= NETDEV_F_FIBER;
1779 if (ecmd.supported & SUPPORTED_Autoneg) {
1780 netdev->supported |= NETDEV_F_AUTONEG;
1782 if (ecmd.supported & SUPPORTED_Pause) {
1783 netdev->supported |= NETDEV_F_PAUSE;
1785 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1786 netdev->supported |= NETDEV_F_PAUSE_ASYM;
1789 /* Advertised features. */
1790 netdev->advertised = 0;
1791 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1792 netdev->advertised |= NETDEV_F_10MB_HD;
1794 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1795 netdev->advertised |= NETDEV_F_10MB_FD;
1797 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1798 netdev->advertised |= NETDEV_F_100MB_HD;
1800 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1801 netdev->advertised |= NETDEV_F_100MB_FD;
1803 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1804 netdev->advertised |= NETDEV_F_1GB_HD;
1806 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1807 netdev->advertised |= NETDEV_F_1GB_FD;
1809 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1810 netdev->advertised |= NETDEV_F_10GB_FD;
1812 if (ecmd.advertising & ADVERTISED_TP) {
1813 netdev->advertised |= NETDEV_F_COPPER;
1815 if (ecmd.advertising & ADVERTISED_FIBRE) {
1816 netdev->advertised |= NETDEV_F_FIBER;
1818 if (ecmd.advertising & ADVERTISED_Autoneg) {
1819 netdev->advertised |= NETDEV_F_AUTONEG;
1821 if (ecmd.advertising & ADVERTISED_Pause) {
1822 netdev->advertised |= NETDEV_F_PAUSE;
1824 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1825 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
1828 /* Current settings. */
1830 if (speed == SPEED_10) {
1831 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1832 } else if (speed == SPEED_100) {
1833 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1834 } else if (speed == SPEED_1000) {
1835 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1836 } else if (speed == SPEED_10000) {
1837 netdev->current = NETDEV_F_10GB_FD;
1838 } else if (speed == 40000) {
1839 netdev->current = NETDEV_F_40GB_FD;
1840 } else if (speed == 100000) {
1841 netdev->current = NETDEV_F_100GB_FD;
1842 } else if (speed == 1000000) {
1843 netdev->current = NETDEV_F_1TB_FD;
1845 netdev->current = 0;
1848 if (ecmd.port == PORT_TP) {
1849 netdev->current |= NETDEV_F_COPPER;
1850 } else if (ecmd.port == PORT_FIBRE) {
1851 netdev->current |= NETDEV_F_FIBER;
1855 netdev->current |= NETDEV_F_AUTONEG;
1859 netdev->cache_valid |= VALID_FEATURES;
1860 netdev->get_features_error = error;
1863 /* Stores the features supported by 'netdev' into of '*current', '*advertised',
1864 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
1865 * Returns 0 if successful, otherwise a positive errno value. */
1867 netdev_linux_get_features(const struct netdev *netdev_,
1868 enum netdev_features *current,
1869 enum netdev_features *advertised,
1870 enum netdev_features *supported,
1871 enum netdev_features *peer)
1873 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1876 ovs_mutex_lock(&netdev->mutex);
1877 netdev_linux_read_features(netdev);
1878 if (!netdev->get_features_error) {
1879 *current = netdev->current;
1880 *advertised = netdev->advertised;
1881 *supported = netdev->supported;
1882 *peer = 0; /* XXX */
1884 error = netdev->get_features_error;
1885 ovs_mutex_unlock(&netdev->mutex);
1890 /* Set the features advertised by 'netdev' to 'advertise'. */
1892 netdev_linux_set_advertisements(struct netdev *netdev_,
1893 enum netdev_features advertise)
1895 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1896 struct ethtool_cmd ecmd;
1899 ovs_mutex_lock(&netdev->mutex);
1901 COVERAGE_INC(netdev_get_ethtool);
1902 memset(&ecmd, 0, sizeof ecmd);
1903 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1904 ETHTOOL_GSET, "ETHTOOL_GSET");
1909 ecmd.advertising = 0;
1910 if (advertise & NETDEV_F_10MB_HD) {
1911 ecmd.advertising |= ADVERTISED_10baseT_Half;
1913 if (advertise & NETDEV_F_10MB_FD) {
1914 ecmd.advertising |= ADVERTISED_10baseT_Full;
1916 if (advertise & NETDEV_F_100MB_HD) {
1917 ecmd.advertising |= ADVERTISED_100baseT_Half;
1919 if (advertise & NETDEV_F_100MB_FD) {
1920 ecmd.advertising |= ADVERTISED_100baseT_Full;
1922 if (advertise & NETDEV_F_1GB_HD) {
1923 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1925 if (advertise & NETDEV_F_1GB_FD) {
1926 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1928 if (advertise & NETDEV_F_10GB_FD) {
1929 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1931 if (advertise & NETDEV_F_COPPER) {
1932 ecmd.advertising |= ADVERTISED_TP;
1934 if (advertise & NETDEV_F_FIBER) {
1935 ecmd.advertising |= ADVERTISED_FIBRE;
1937 if (advertise & NETDEV_F_AUTONEG) {
1938 ecmd.advertising |= ADVERTISED_Autoneg;
1940 if (advertise & NETDEV_F_PAUSE) {
1941 ecmd.advertising |= ADVERTISED_Pause;
1943 if (advertise & NETDEV_F_PAUSE_ASYM) {
1944 ecmd.advertising |= ADVERTISED_Asym_Pause;
1946 COVERAGE_INC(netdev_set_ethtool);
1947 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1948 ETHTOOL_SSET, "ETHTOOL_SSET");
1951 ovs_mutex_unlock(&netdev->mutex);
1955 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1956 * successful, otherwise a positive errno value. */
1958 netdev_linux_set_policing(struct netdev *netdev_,
1959 uint32_t kbits_rate, uint32_t kbits_burst)
1961 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1962 const char *netdev_name = netdev_get_name(netdev_);
1965 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1966 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1967 : kbits_burst); /* Stick with user-specified value. */
1969 ovs_mutex_lock(&netdev->mutex);
1970 if (netdev->cache_valid & VALID_POLICING) {
1971 error = netdev->netdev_policing_error;
1972 if (error || (netdev->kbits_rate == kbits_rate &&
1973 netdev->kbits_burst == kbits_burst)) {
1974 /* Assume that settings haven't changed since we last set them. */
1977 netdev->cache_valid &= ~VALID_POLICING;
1980 COVERAGE_INC(netdev_set_policing);
1981 /* Remove any existing ingress qdisc. */
1982 error = tc_add_del_ingress_qdisc(netdev_, false);
1984 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1985 netdev_name, ovs_strerror(error));
1990 error = tc_add_del_ingress_qdisc(netdev_, true);
1992 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1993 netdev_name, ovs_strerror(error));
1997 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
1999 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
2000 netdev_name, ovs_strerror(error));
2005 netdev->kbits_rate = kbits_rate;
2006 netdev->kbits_burst = kbits_burst;
2009 if (!error || error == ENODEV) {
2010 netdev->netdev_policing_error = error;
2011 netdev->cache_valid |= VALID_POLICING;
2013 ovs_mutex_unlock(&netdev->mutex);
2018 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
2021 const struct tc_ops *const *opsp;
2023 for (opsp = tcs; *opsp != NULL; opsp++) {
2024 const struct tc_ops *ops = *opsp;
2025 if (ops->tc_install && ops->ovs_name[0] != '\0') {
2026 sset_add(types, ops->ovs_name);
2032 static const struct tc_ops *
2033 tc_lookup_ovs_name(const char *name)
2035 const struct tc_ops *const *opsp;
2037 for (opsp = tcs; *opsp != NULL; opsp++) {
2038 const struct tc_ops *ops = *opsp;
2039 if (!strcmp(name, ops->ovs_name)) {
2046 static const struct tc_ops *
2047 tc_lookup_linux_name(const char *name)
2049 const struct tc_ops *const *opsp;
2051 for (opsp = tcs; *opsp != NULL; opsp++) {
2052 const struct tc_ops *ops = *opsp;
2053 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
2060 static struct tc_queue *
2061 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
2064 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2065 struct tc_queue *queue;
2067 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
2068 if (queue->queue_id == queue_id) {
/* Convenience wrapper: looks up queue 'queue_id' on 'netdev', hashing the id
 * itself.  Returns the queue or NULL. */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
}
2082 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
2084 struct netdev_qos_capabilities *caps)
2086 const struct tc_ops *ops = tc_lookup_ovs_name(type);
2090 caps->n_queues = ops->n_queues;
2095 netdev_linux_get_qos(const struct netdev *netdev_,
2096 const char **typep, struct smap *details)
2098 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2101 ovs_mutex_lock(&netdev->mutex);
2102 error = tc_query_qdisc(netdev_);
2104 *typep = netdev->tc->ops->ovs_name;
2105 error = (netdev->tc->ops->qdisc_get
2106 ? netdev->tc->ops->qdisc_get(netdev_, details)
2109 ovs_mutex_unlock(&netdev->mutex);
2115 netdev_linux_set_qos(struct netdev *netdev_,
2116 const char *type, const struct smap *details)
2118 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2119 const struct tc_ops *new_ops;
2122 new_ops = tc_lookup_ovs_name(type);
2123 if (!new_ops || !new_ops->tc_install) {
2127 ovs_mutex_lock(&netdev->mutex);
2128 error = tc_query_qdisc(netdev_);
2133 if (new_ops == netdev->tc->ops) {
2134 error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
2136 /* Delete existing qdisc. */
2137 error = tc_del_qdisc(netdev_);
2141 ovs_assert(netdev->tc == NULL);
2143 /* Install new qdisc. */
2144 error = new_ops->tc_install(netdev_, details);
2145 ovs_assert((error == 0) == (netdev->tc != NULL));
2149 ovs_mutex_unlock(&netdev->mutex);
2154 netdev_linux_get_queue(const struct netdev *netdev_,
2155 unsigned int queue_id, struct smap *details)
2157 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2160 ovs_mutex_lock(&netdev->mutex);
2161 error = tc_query_qdisc(netdev_);
2163 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2165 ? netdev->tc->ops->class_get(netdev_, queue, details)
2168 ovs_mutex_unlock(&netdev->mutex);
2174 netdev_linux_set_queue(struct netdev *netdev_,
2175 unsigned int queue_id, const struct smap *details)
2177 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2180 ovs_mutex_lock(&netdev->mutex);
2181 error = tc_query_qdisc(netdev_);
2183 error = (queue_id < netdev->tc->ops->n_queues
2184 && netdev->tc->ops->class_set
2185 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
2188 ovs_mutex_unlock(&netdev->mutex);
2194 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
2196 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2199 ovs_mutex_lock(&netdev->mutex);
2200 error = tc_query_qdisc(netdev_);
2202 if (netdev->tc->ops->class_delete) {
2203 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2205 ? netdev->tc->ops->class_delete(netdev_, queue)
2211 ovs_mutex_unlock(&netdev->mutex);
2217 netdev_linux_get_queue_stats(const struct netdev *netdev_,
2218 unsigned int queue_id,
2219 struct netdev_queue_stats *stats)
2221 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2224 ovs_mutex_lock(&netdev->mutex);
2225 error = tc_query_qdisc(netdev_);
2227 if (netdev->tc->ops->class_get_stats) {
2228 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2230 stats->created = queue->created;
2231 error = netdev->tc->ops->class_get_stats(netdev_, queue,
2240 ovs_mutex_unlock(&netdev->mutex);
2245 struct queue_dump_state {
2246 struct nl_dump dump;
2251 start_queue_dump(const struct netdev *netdev, struct queue_dump_state *state)
2253 struct ofpbuf request;
2254 struct tcmsg *tcmsg;
2256 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
2260 tcmsg->tcm_parent = 0;
2261 nl_dump_start(&state->dump, NETLINK_ROUTE, &request);
2262 ofpbuf_uninit(&request);
2264 ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
2269 finish_queue_dump(struct queue_dump_state *state)
2271 ofpbuf_uninit(&state->buf);
2272 return nl_dump_done(&state->dump);
/* Iterator state for netdev_linux_queue_dump_{start,next,done}(): a snapshot
 * of the queue ids taken at dump_start time. */
struct netdev_linux_queue_state {
    unsigned int *queues;   /* Array of queue ids, snapshotted at start. */
    size_t cur_queue;       /* Next index into 'queues' to visit. */
    size_t n_queues;        /* Number of elements in 'queues'. */
};
2282 netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
2284 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2287 ovs_mutex_lock(&netdev->mutex);
2288 error = tc_query_qdisc(netdev_);
2290 if (netdev->tc->ops->class_get) {
2291 struct netdev_linux_queue_state *state;
2292 struct tc_queue *queue;
2295 *statep = state = xmalloc(sizeof *state);
2296 state->n_queues = hmap_count(&netdev->tc->queues);
2297 state->cur_queue = 0;
2298 state->queues = xmalloc(state->n_queues * sizeof *state->queues);
2301 HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
2302 state->queues[i++] = queue->queue_id;
2308 ovs_mutex_unlock(&netdev->mutex);
2314 netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
2315 unsigned int *queue_idp, struct smap *details)
2317 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2318 struct netdev_linux_queue_state *state = state_;
2321 ovs_mutex_lock(&netdev->mutex);
2322 while (state->cur_queue < state->n_queues) {
2323 unsigned int queue_id = state->queues[state->cur_queue++];
2324 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2327 *queue_idp = queue_id;
2328 error = netdev->tc->ops->class_get(netdev_, queue, details);
2332 ovs_mutex_unlock(&netdev->mutex);
2338 netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
2341 struct netdev_linux_queue_state *state = state_;
2343 free(state->queues);
2349 netdev_linux_dump_queue_stats(const struct netdev *netdev_,
2350 netdev_dump_queue_stats_cb *cb, void *aux)
2352 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2355 ovs_mutex_lock(&netdev->mutex);
2356 error = tc_query_qdisc(netdev_);
2358 struct queue_dump_state state;
2360 if (!netdev->tc->ops->class_dump_stats) {
2362 } else if (!start_queue_dump(netdev_, &state)) {
2368 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
2369 retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
2376 retval = finish_queue_dump(&state);
2382 ovs_mutex_unlock(&netdev->mutex);
2388 netdev_linux_get_in4(const struct netdev *netdev_,
2389 struct in_addr *address, struct in_addr *netmask)
2391 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2394 ovs_mutex_lock(&netdev->mutex);
2395 if (!(netdev->cache_valid & VALID_IN4)) {
2396 error = netdev_linux_get_ipv4(netdev_, &netdev->address,
2397 SIOCGIFADDR, "SIOCGIFADDR");
2399 error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
2400 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2402 netdev->cache_valid |= VALID_IN4;
2410 if (netdev->address.s_addr != INADDR_ANY) {
2411 *address = netdev->address;
2412 *netmask = netdev->netmask;
2414 error = EADDRNOTAVAIL;
2417 ovs_mutex_unlock(&netdev->mutex);
2423 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2424 struct in_addr netmask)
2426 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2429 ovs_mutex_lock(&netdev->mutex);
2430 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2432 netdev->cache_valid |= VALID_IN4;
2433 netdev->address = address;
2434 netdev->netmask = netmask;
2435 if (address.s_addr != INADDR_ANY) {
2436 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2437 "SIOCSIFNETMASK", netmask);
2440 ovs_mutex_unlock(&netdev->mutex);
/* Parses one line of /proc/net/if_inet6 into the 16 address bytes of '*in6'
 * and the interface name into 'ifname'.  Returns true if the line matched
 * the expected format, false otherwise. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    uint8_t *s6 = in6->s6_addr;
#define X8 "%2"SCNx8
    return ovs_scan(line,
                    " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                    "%*x %*x %*x %*x %16s\n",
                    &s6[0], &s6[1], &s6[2], &s6[3],
                    &s6[4], &s6[5], &s6[6], &s6[7],
                    &s6[8], &s6[9], &s6[10], &s6[11],
                    &s6[12], &s6[13], &s6[14], &s6[15],
                    ifname);
#undef X8
}
2461 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2462 * 'in6' is non-null) and returns true. Otherwise, returns false. */
2464 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2466 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2468 ovs_mutex_lock(&netdev->mutex);
2469 if (!(netdev->cache_valid & VALID_IN6)) {
2473 netdev->in6 = in6addr_any;
2475 file = fopen("/proc/net/if_inet6", "r");
2477 const char *name = netdev_get_name(netdev_);
2478 while (fgets(line, sizeof line, file)) {
2479 struct in6_addr in6_tmp;
2480 char ifname[16 + 1];
2481 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2482 && !strcmp(name, ifname))
2484 netdev->in6 = in6_tmp;
2490 netdev->cache_valid |= VALID_IN6;
2493 ovs_mutex_unlock(&netdev->mutex);
/* Fills '*sa' with an AF_INET sockaddr holding 'addr' and port 0, zeroing
 * any trailing bytes of the generic sockaddr. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in sin;
    memset(&sin, 0, sizeof sin);
    sin.sin_family = AF_INET;
    sin.sin_addr = addr;
    sin.sin_port = 0;

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &sin, sizeof sin);
}
2512 do_set_addr(struct netdev *netdev,
2513 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2517 make_in4_sockaddr(&ifr.ifr_addr, addr);
2518 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2522 /* Adds 'router' as a default IP gateway. */
2524 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2526 struct in_addr any = { INADDR_ANY };
2530 memset(&rt, 0, sizeof rt);
2531 make_in4_sockaddr(&rt.rt_dst, any);
2532 make_in4_sockaddr(&rt.rt_gateway, router);
2533 make_in4_sockaddr(&rt.rt_genmask, any);
2534 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2535 error = af_inet_ioctl(SIOCADDRT, &rt);
2537 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
2543 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2546 static const char fn[] = "/proc/net/route";
2551 *netdev_name = NULL;
2552 stream = fopen(fn, "r");
2553 if (stream == NULL) {
2554 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
2559 while (fgets(line, sizeof line, stream)) {
2562 ovs_be32 dest, gateway, mask;
2563 int refcnt, metric, mtu;
2564 unsigned int flags, use, window, irtt;
2567 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2569 iface, &dest, &gateway, &flags, &refcnt,
2570 &use, &metric, &mask, &mtu, &window, &irtt)) {
2571 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2575 if (!(flags & RTF_UP)) {
2576 /* Skip routes that aren't up. */
2580 /* The output of 'dest', 'mask', and 'gateway' were given in
2581 * network byte order, so we don't need need any endian
2582 * conversions here. */
2583 if ((dest & mask) == (host->s_addr & mask)) {
2585 /* The host is directly reachable. */
2586 next_hop->s_addr = 0;
2588 /* To reach the host, we must go through a gateway. */
2589 next_hop->s_addr = gateway;
2591 *netdev_name = xstrdup(iface);
2603 netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
2605 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2608 ovs_mutex_lock(&netdev->mutex);
2609 if (!(netdev->cache_valid & VALID_DRVINFO)) {
2610 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
2612 COVERAGE_INC(netdev_get_ethtool);
2613 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2614 error = netdev_linux_do_ethtool(netdev->up.name,
2617 "ETHTOOL_GDRVINFO");
2619 netdev->cache_valid |= VALID_DRVINFO;
2624 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2625 smap_add(smap, "driver_version", netdev->drvinfo.version);
2626 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
2628 ovs_mutex_unlock(&netdev->mutex);
2634 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2637 smap_add(smap, "driver_name", "openvswitch");
2641 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2642 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2643 * returns 0. Otherwise, it returns a positive errno value; in particular,
2644 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2646 netdev_linux_arp_lookup(const struct netdev *netdev,
2647 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2650 struct sockaddr_in sin;
2653 memset(&r, 0, sizeof r);
2654 memset(&sin, 0, sizeof sin);
2655 sin.sin_family = AF_INET;
2656 sin.sin_addr.s_addr = ip;
2658 memcpy(&r.arp_pa, &sin, sizeof sin);
2659 r.arp_ha.sa_family = ARPHRD_ETHER;
2661 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2662 COVERAGE_INC(netdev_arp_lookup);
2663 retval = af_inet_ioctl(SIOCGARP, &r);
2665 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2666 } else if (retval != ENXIO) {
2667 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2668 netdev_get_name(netdev), IP_ARGS(ip),
2669 ovs_strerror(retval));
2675 nd_to_iff_flags(enum netdev_flags nd)
2678 if (nd & NETDEV_UP) {
2681 if (nd & NETDEV_PROMISC) {
2684 if (nd & NETDEV_LOOPBACK) {
2685 iff |= IFF_LOOPBACK;
2691 iff_to_nd_flags(int iff)
2693 enum netdev_flags nd = 0;
2697 if (iff & IFF_PROMISC) {
2698 nd |= NETDEV_PROMISC;
2700 if (iff & IFF_LOOPBACK) {
2701 nd |= NETDEV_LOOPBACK;
2707 update_flags(struct netdev_linux *netdev, enum netdev_flags off,
2708 enum netdev_flags on, enum netdev_flags *old_flagsp)
2709 OVS_REQUIRES(netdev->mutex)
2711 int old_flags, new_flags;
2714 old_flags = netdev->ifi_flags;
2715 *old_flagsp = iff_to_nd_flags(old_flags);
2716 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2717 if (new_flags != old_flags) {
2718 error = set_flags(netdev_get_name(&netdev->up), new_flags);
2719 get_flags(&netdev->up, &netdev->ifi_flags);
2726 netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2727 enum netdev_flags on, enum netdev_flags *old_flagsp)
2729 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2732 ovs_mutex_lock(&netdev->mutex);
2733 error = update_flags(netdev, off, on, old_flagsp);
2734 ovs_mutex_unlock(&netdev->mutex);
/* Expands to a 'struct netdev_class' initializer shared by the system, tap,
 * and internal device classes.  The varying members (name, construct, stats,
 * features, status) are supplied as macro arguments; everything else is the
 * common netdev-linux implementation.  The member order must match
 * 'struct netdev_class' in netdev-provider.h — confirm against that header
 * when rebasing. */
#define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS, SET_STATS,  \
                           GET_FEATURES, GET_STATUS)            \
{                                                               \
    NAME,                                                       \
                                                                \
    NULL,                       /* init */                      \
    netdev_linux_run,                                           \
    netdev_linux_wait,                                          \
                                                                \
    netdev_linux_alloc,                                         \
    CONSTRUCT,                                                  \
    netdev_linux_destruct,                                      \
    netdev_linux_dealloc,                                       \
    NULL,                       /* get_config */                \
    NULL,                       /* set_config */                \
    NULL,                       /* get_tunnel_config */         \
                                                                \
    netdev_linux_send,                                          \
    netdev_linux_send_wait,                                     \
                                                                \
    netdev_linux_set_etheraddr,                                 \
    netdev_linux_get_etheraddr,                                 \
    netdev_linux_get_mtu,                                       \
    netdev_linux_set_mtu,                                       \
    netdev_linux_get_ifindex,                                   \
    netdev_linux_get_carrier,                                   \
    netdev_linux_get_carrier_resets,                            \
    netdev_linux_set_miimon_interval,                           \
    GET_STATS,                                                  \
    SET_STATS,                                                  \
                                                                \
    GET_FEATURES,                                               \
    netdev_linux_set_advertisements,                            \
                                                                \
    netdev_linux_set_policing,                                  \
    netdev_linux_get_qos_types,                                 \
    netdev_linux_get_qos_capabilities,                          \
    netdev_linux_get_qos,                                       \
    netdev_linux_set_qos,                                       \
    netdev_linux_get_queue,                                     \
    netdev_linux_set_queue,                                     \
    netdev_linux_delete_queue,                                  \
    netdev_linux_get_queue_stats,                               \
    netdev_linux_queue_dump_start,                              \
    netdev_linux_queue_dump_next,                               \
    netdev_linux_queue_dump_done,                               \
    netdev_linux_dump_queue_stats,                              \
                                                                \
    netdev_linux_get_in4,                                       \
    netdev_linux_set_in4,                                       \
    netdev_linux_get_in6,                                       \
    netdev_linux_add_router,                                    \
    netdev_linux_get_next_hop,                                  \
    GET_STATUS,                                                 \
    netdev_linux_arp_lookup,                                    \
                                                                \
    netdev_linux_update_flags,                                  \
                                                                \
    netdev_linux_rxq_alloc,                                     \
    netdev_linux_rxq_construct,                                 \
    netdev_linux_rxq_destruct,                                  \
    netdev_linux_rxq_dealloc,                                   \
    netdev_linux_rxq_recv,                                      \
    netdev_linux_rxq_wait,                                      \
    netdev_linux_rxq_drain,                                     \
}
2806 const struct netdev_class netdev_linux_class =
2809 netdev_linux_construct,
2810 netdev_linux_get_stats,
2811 NULL, /* set_stats */
2812 netdev_linux_get_features,
2813 netdev_linux_get_status);
2815 const struct netdev_class netdev_tap_class =
2818 netdev_linux_construct_tap,
2819 netdev_tap_get_stats,
2820 NULL, /* set_stats */
2821 netdev_linux_get_features,
2822 netdev_linux_get_status);
2824 const struct netdev_class netdev_internal_class =
2827 netdev_linux_construct,
2828 netdev_internal_get_stats,
2829 netdev_internal_set_stats,
2830 NULL, /* get_features */
2831 netdev_internal_get_status);
2833 /* HTB traffic control class. */
2835 #define HTB_N_QUEUES 0xf000
2839 unsigned int max_rate; /* In bytes/s. */
2843 struct tc_queue tc_queue;
2844 unsigned int min_rate; /* In bytes/s. */
2845 unsigned int max_rate; /* In bytes/s. */
2846 unsigned int burst; /* In bytes. */
2847 unsigned int priority; /* Lower values are higher priorities. */
2851 htb_get__(const struct netdev *netdev_)
2853 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2854 return CONTAINER_OF(netdev->tc, struct htb, tc);
2858 htb_install__(struct netdev *netdev_, uint64_t max_rate)
2860 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2863 htb = xmalloc(sizeof *htb);
2864 tc_init(&htb->tc, &tc_ops_htb);
2865 htb->max_rate = max_rate;
2867 netdev->tc = &htb->tc;
2870 /* Create an HTB qdisc.
2872 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2874 htb_setup_qdisc__(struct netdev *netdev)
2877 struct tc_htb_glob opt;
2878 struct ofpbuf request;
2879 struct tcmsg *tcmsg;
2881 tc_del_qdisc(netdev);
2883 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2884 NLM_F_EXCL | NLM_F_CREATE, &request);
2888 tcmsg->tcm_handle = tc_make_handle(1, 0);
2889 tcmsg->tcm_parent = TC_H_ROOT;
2891 nl_msg_put_string(&request, TCA_KIND, "htb");
2893 memset(&opt, 0, sizeof opt);
2894 opt.rate2quantum = 10;
2898 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2899 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2900 nl_msg_end_nested(&request, opt_offset);
2902 return tc_transact(&request, NULL);
2905 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2906 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2908 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2909 unsigned int parent, struct htb_class *class)
2912 struct tc_htb_opt opt;
2913 struct ofpbuf request;
2914 struct tcmsg *tcmsg;
2918 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
2920 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2921 netdev_get_name(netdev));
2925 memset(&opt, 0, sizeof opt);
2926 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2927 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2928 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2929 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2930 opt.prio = class->priority;
2932 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2936 tcmsg->tcm_handle = handle;
2937 tcmsg->tcm_parent = parent;
2939 nl_msg_put_string(&request, TCA_KIND, "htb");
2940 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2941 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2942 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2943 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2944 nl_msg_end_nested(&request, opt_offset);
2946 error = tc_transact(&request, NULL);
2948 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2949 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2950 netdev_get_name(netdev),
2951 tc_get_major(handle), tc_get_minor(handle),
2952 tc_get_major(parent), tc_get_minor(parent),
2953 class->min_rate, class->max_rate,
2954 class->burst, class->priority, ovs_strerror(error));
2959 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2960 * description of them into 'details'. The description complies with the
2961 * specification given in the vswitch database documentation for linux-htb
2964 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2966 static const struct nl_policy tca_htb_policy[] = {
2967 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2968 .min_len = sizeof(struct tc_htb_opt) },
2971 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2972 const struct tc_htb_opt *htb;
2974 if (!nl_parse_nested(nl_options, tca_htb_policy,
2975 attrs, ARRAY_SIZE(tca_htb_policy))) {
2976 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2980 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2981 class->min_rate = htb->rate.rate;
2982 class->max_rate = htb->ceil.rate;
2983 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2984 class->priority = htb->prio;
2989 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2990 struct htb_class *options,
2991 struct netdev_queue_stats *stats)
2993 struct nlattr *nl_options;
2994 unsigned int handle;
2997 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2998 if (!error && queue_id) {
2999 unsigned int major = tc_get_major(handle);
3000 unsigned int minor = tc_get_minor(handle);
3001 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3002 *queue_id = minor - 1;
3007 if (!error && options) {
3008 error = htb_parse_tca_options__(nl_options, options);
3014 htb_parse_qdisc_details__(struct netdev *netdev_,
3015 const struct smap *details, struct htb_class *hc)
3017 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3018 const char *max_rate_s;
3020 max_rate_s = smap_get(details, "max-rate");
3021 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3022 if (!hc->max_rate) {
3023 enum netdev_features current;
3025 netdev_linux_read_features(netdev);
3026 current = !netdev->get_features_error ? netdev->current : 0;
3027 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3029 hc->min_rate = hc->max_rate;
3035 htb_parse_class_details__(struct netdev *netdev,
3036 const struct smap *details, struct htb_class *hc)
3038 const struct htb *htb = htb_get__(netdev);
3039 const char *min_rate_s = smap_get(details, "min-rate");
3040 const char *max_rate_s = smap_get(details, "max-rate");
3041 const char *burst_s = smap_get(details, "burst");
3042 const char *priority_s = smap_get(details, "priority");
3045 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3047 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
3048 netdev_get_name(netdev));
3052 /* HTB requires at least an mtu sized min-rate to send any traffic even
3053 * on uncongested links. */
3054 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3055 hc->min_rate = MAX(hc->min_rate, mtu);
3056 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
3059 hc->max_rate = (max_rate_s
3060 ? strtoull(max_rate_s, NULL, 10) / 8
3062 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
3063 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
3067 * According to hints in the documentation that I've read, it is important
3068 * that 'burst' be at least as big as the largest frame that might be
3069 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
3070 * but having it a bit too small is a problem. Since netdev_get_mtu()
3071 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
3072 * the MTU. We actually add 64, instead of 14, as a guard against
3073 * additional headers get tacked on somewhere that we're not aware of. */
3074 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
3075 hc->burst = MAX(hc->burst, mtu + 64);
3078 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
3084 htb_query_class__(const struct netdev *netdev, unsigned int handle,
3085 unsigned int parent, struct htb_class *options,
3086 struct netdev_queue_stats *stats)
3088 struct ofpbuf *reply;
3091 error = tc_query_class(netdev, handle, parent, &reply);
3093 error = htb_parse_tcmsg__(reply, NULL, options, stats);
3094 ofpbuf_delete(reply);
3100 htb_tc_install(struct netdev *netdev, const struct smap *details)
3104 error = htb_setup_qdisc__(netdev);
3106 struct htb_class hc;
3108 htb_parse_qdisc_details__(netdev, details, &hc);
3109 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3110 tc_make_handle(1, 0), &hc);
3112 htb_install__(netdev, hc.max_rate);
3118 static struct htb_class *
3119 htb_class_cast__(const struct tc_queue *queue)
3121 return CONTAINER_OF(queue, struct htb_class, tc_queue);
3125 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
3126 const struct htb_class *hc)
3128 struct htb *htb = htb_get__(netdev);
3129 size_t hash = hash_int(queue_id, 0);
3130 struct tc_queue *queue;
3131 struct htb_class *hcp;
3133 queue = tc_find_queue__(netdev, queue_id, hash);
3135 hcp = htb_class_cast__(queue);
3137 hcp = xmalloc(sizeof *hcp);
3138 queue = &hcp->tc_queue;
3139 queue->queue_id = queue_id;
3140 queue->created = time_msec();
3141 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
3144 hcp->min_rate = hc->min_rate;
3145 hcp->max_rate = hc->max_rate;
3146 hcp->burst = hc->burst;
3147 hcp->priority = hc->priority;
3151 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3154 struct queue_dump_state state;
3155 struct htb_class hc;
3157 /* Get qdisc options. */
3159 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3160 htb_install__(netdev, hc.max_rate);
3163 if (!start_queue_dump(netdev, &state)) {
3166 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
3167 unsigned int queue_id;
3169 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3170 htb_update_queue__(netdev, queue_id, &hc);
3173 finish_queue_dump(&state);
3179 htb_tc_destroy(struct tc *tc)
3181 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
3182 struct htb_class *hc, *next;
3184 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
3185 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
3193 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
3195 const struct htb *htb = htb_get__(netdev);
3196 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
3201 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
3203 struct htb_class hc;
3206 htb_parse_qdisc_details__(netdev, details, &hc);
3207 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3208 tc_make_handle(1, 0), &hc);
3210 htb_get__(netdev)->max_rate = hc.max_rate;
3216 htb_class_get(const struct netdev *netdev OVS_UNUSED,
3217 const struct tc_queue *queue, struct smap *details)
3219 const struct htb_class *hc = htb_class_cast__(queue);
3221 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3222 if (hc->min_rate != hc->max_rate) {
3223 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3225 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
3227 smap_add_format(details, "priority", "%u", hc->priority);
3233 htb_class_set(struct netdev *netdev, unsigned int queue_id,
3234 const struct smap *details)
3236 struct htb_class hc;
3239 error = htb_parse_class_details__(netdev, details, &hc);
3244 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3245 tc_make_handle(1, 0xfffe), &hc);
3250 htb_update_queue__(netdev, queue_id, &hc);
3255 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
3257 struct htb_class *hc = htb_class_cast__(queue);
3258 struct htb *htb = htb_get__(netdev);
3261 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3263 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
3270 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3271 struct netdev_queue_stats *stats)
3273 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3274 tc_make_handle(1, 0xfffe), NULL, stats);
3278 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3279 const struct ofpbuf *nlmsg,
3280 netdev_dump_queue_stats_cb *cb, void *aux)
3282 struct netdev_queue_stats stats;
3283 unsigned int handle, major, minor;
3286 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3291 major = tc_get_major(handle);
3292 minor = tc_get_minor(handle);
3293 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3294 (*cb)(minor - 1, &stats, aux);
3299 static const struct tc_ops tc_ops_htb = {
3300 "htb", /* linux_name */
3301 "linux-htb", /* ovs_name */
3302 HTB_N_QUEUES, /* n_queues */
3311 htb_class_get_stats,
3312 htb_class_dump_stats
3315 /* "linux-hfsc" traffic control class. */
3317 #define HFSC_N_QUEUES 0xf000
3325 struct tc_queue tc_queue;
3330 static struct hfsc *
3331 hfsc_get__(const struct netdev *netdev_)
3333 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3334 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
3337 static struct hfsc_class *
3338 hfsc_class_cast__(const struct tc_queue *queue)
3340 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
3344 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
3346 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3349 hfsc = xmalloc(sizeof *hfsc);
3350 tc_init(&hfsc->tc, &tc_ops_hfsc);
3351 hfsc->max_rate = max_rate;
3352 netdev->tc = &hfsc->tc;
3356 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
3357 const struct hfsc_class *hc)
3361 struct hfsc_class *hcp;
3362 struct tc_queue *queue;
3364 hfsc = hfsc_get__(netdev);
3365 hash = hash_int(queue_id, 0);
3367 queue = tc_find_queue__(netdev, queue_id, hash);
3369 hcp = hfsc_class_cast__(queue);
3371 hcp = xmalloc(sizeof *hcp);
3372 queue = &hcp->tc_queue;
3373 queue->queue_id = queue_id;
3374 queue->created = time_msec();
3375 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
3378 hcp->min_rate = hc->min_rate;
3379 hcp->max_rate = hc->max_rate;
3383 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
3385 const struct tc_service_curve *rsc, *fsc, *usc;
3386 static const struct nl_policy tca_hfsc_policy[] = {
3388 .type = NL_A_UNSPEC,
3390 .min_len = sizeof(struct tc_service_curve),
3393 .type = NL_A_UNSPEC,
3395 .min_len = sizeof(struct tc_service_curve),
3398 .type = NL_A_UNSPEC,
3400 .min_len = sizeof(struct tc_service_curve),
3403 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3405 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3406 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3407 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3411 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3412 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3413 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
3415 if (rsc->m1 != 0 || rsc->d != 0 ||
3416 fsc->m1 != 0 || fsc->d != 0 ||
3417 usc->m1 != 0 || usc->d != 0) {
3418 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3419 "Non-linear service curves are not supported.");
3423 if (rsc->m2 != fsc->m2) {
3424 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3425 "Real-time service curves are not supported ");
3429 if (rsc->m2 > usc->m2) {
3430 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3431 "Min-rate service curve is greater than "
3432 "the max-rate service curve.");
3436 class->min_rate = fsc->m2;
3437 class->max_rate = usc->m2;
3442 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3443 struct hfsc_class *options,
3444 struct netdev_queue_stats *stats)
3447 unsigned int handle;
3448 struct nlattr *nl_options;
3450 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3456 unsigned int major, minor;
3458 major = tc_get_major(handle);
3459 minor = tc_get_minor(handle);
3460 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3461 *queue_id = minor - 1;
3468 error = hfsc_parse_tca_options__(nl_options, options);
3475 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
3476 unsigned int parent, struct hfsc_class *options,
3477 struct netdev_queue_stats *stats)
3480 struct ofpbuf *reply;
3482 error = tc_query_class(netdev, handle, parent, &reply);
3487 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
3488 ofpbuf_delete(reply);
3493 hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
3494 struct hfsc_class *class)
3496 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3498 const char *max_rate_s;
3500 max_rate_s = smap_get(details, "max-rate");
3501 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3504 enum netdev_features current;
3506 netdev_linux_read_features(netdev);
3507 current = !netdev->get_features_error ? netdev->current : 0;
3508 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3511 class->min_rate = max_rate;
3512 class->max_rate = max_rate;
3516 hfsc_parse_class_details__(struct netdev *netdev,
3517 const struct smap *details,
3518 struct hfsc_class * class)
3520 const struct hfsc *hfsc;
3521 uint32_t min_rate, max_rate;
3522 const char *min_rate_s, *max_rate_s;
3524 hfsc = hfsc_get__(netdev);
3525 min_rate_s = smap_get(details, "min-rate");
3526 max_rate_s = smap_get(details, "max-rate");
3528 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3529 min_rate = MAX(min_rate, 1);
3530 min_rate = MIN(min_rate, hfsc->max_rate);
3532 max_rate = (max_rate_s
3533 ? strtoull(max_rate_s, NULL, 10) / 8
3535 max_rate = MAX(max_rate, min_rate);
3536 max_rate = MIN(max_rate, hfsc->max_rate);
3538 class->min_rate = min_rate;
3539 class->max_rate = max_rate;
3544 /* Create an HFSC qdisc.
3546 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3548 hfsc_setup_qdisc__(struct netdev * netdev)
3550 struct tcmsg *tcmsg;
3551 struct ofpbuf request;
3552 struct tc_hfsc_qopt opt;
3554 tc_del_qdisc(netdev);
3556 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3557 NLM_F_EXCL | NLM_F_CREATE, &request);
3563 tcmsg->tcm_handle = tc_make_handle(1, 0);
3564 tcmsg->tcm_parent = TC_H_ROOT;
3566 memset(&opt, 0, sizeof opt);
3569 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3570 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3572 return tc_transact(&request, NULL);
3575 /* Create an HFSC class.
3577 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3578 * sc rate <min_rate> ul rate <max_rate>" */
3580 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3581 unsigned int parent, struct hfsc_class *class)
3585 struct tcmsg *tcmsg;
3586 struct ofpbuf request;
3587 struct tc_service_curve min, max;
3589 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3595 tcmsg->tcm_handle = handle;
3596 tcmsg->tcm_parent = parent;
3600 min.m2 = class->min_rate;
3604 max.m2 = class->max_rate;
3606 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3607 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3608 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3609 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3610 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3611 nl_msg_end_nested(&request, opt_offset);
3613 error = tc_transact(&request, NULL);
3615 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3616 "min-rate %ubps, max-rate %ubps (%s)",
3617 netdev_get_name(netdev),
3618 tc_get_major(handle), tc_get_minor(handle),
3619 tc_get_major(parent), tc_get_minor(parent),
3620 class->min_rate, class->max_rate, ovs_strerror(error));
3627 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
3630 struct hfsc_class class;
3632 error = hfsc_setup_qdisc__(netdev);
3638 hfsc_parse_qdisc_details__(netdev, details, &class);
3639 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3640 tc_make_handle(1, 0), &class);
3646 hfsc_install__(netdev, class.max_rate);
3651 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3654 struct queue_dump_state state;
3655 struct hfsc_class hc;
3658 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3659 hfsc_install__(netdev, hc.max_rate);
3661 if (!start_queue_dump(netdev, &state)) {
3665 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
3666 unsigned int queue_id;
3668 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3669 hfsc_update_queue__(netdev, queue_id, &hc);
3673 finish_queue_dump(&state);
3678 hfsc_tc_destroy(struct tc *tc)
3681 struct hfsc_class *hc, *next;
3683 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3685 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3686 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3695 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
3697 const struct hfsc *hfsc;
3698 hfsc = hfsc_get__(netdev);
3699 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
3704 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
3707 struct hfsc_class class;
3709 hfsc_parse_qdisc_details__(netdev, details, &class);
3710 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3711 tc_make_handle(1, 0), &class);
3714 hfsc_get__(netdev)->max_rate = class.max_rate;
3721 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3722 const struct tc_queue *queue, struct smap *details)
3724 const struct hfsc_class *hc;
3726 hc = hfsc_class_cast__(queue);
3727 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3728 if (hc->min_rate != hc->max_rate) {
3729 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3735 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3736 const struct smap *details)
3739 struct hfsc_class class;
3741 error = hfsc_parse_class_details__(netdev, details, &class);
3746 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3747 tc_make_handle(1, 0xfffe), &class);
3752 hfsc_update_queue__(netdev, queue_id, &class);
3757 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3761 struct hfsc_class *hc;
3763 hc = hfsc_class_cast__(queue);
3764 hfsc = hfsc_get__(netdev);
3766 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3768 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3775 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3776 struct netdev_queue_stats *stats)
3778 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3779 tc_make_handle(1, 0xfffe), NULL, stats);
3783 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3784 const struct ofpbuf *nlmsg,
3785 netdev_dump_queue_stats_cb *cb, void *aux)
3787 struct netdev_queue_stats stats;
3788 unsigned int handle, major, minor;
3791 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3796 major = tc_get_major(handle);
3797 minor = tc_get_minor(handle);
3798 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3799 (*cb)(minor - 1, &stats, aux);
3804 static const struct tc_ops tc_ops_hfsc = {
3805 "hfsc", /* linux_name */
3806 "linux-hfsc", /* ovs_name */
3807 HFSC_N_QUEUES, /* n_queues */
3808 hfsc_tc_install, /* tc_install */
3809 hfsc_tc_load, /* tc_load */
3810 hfsc_tc_destroy, /* tc_destroy */
3811 hfsc_qdisc_get, /* qdisc_get */
3812 hfsc_qdisc_set, /* qdisc_set */
3813 hfsc_class_get, /* class_get */
3814 hfsc_class_set, /* class_set */
3815 hfsc_class_delete, /* class_delete */
3816 hfsc_class_get_stats, /* class_get_stats */
3817 hfsc_class_dump_stats /* class_dump_stats */
3820 /* "linux-default" traffic control class.
3822 * This class represents the default, unnamed Linux qdisc. It corresponds to
3823 * the "" (empty string) QoS type in the OVS database. */
3826 default_install__(struct netdev *netdev_)
3828 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3829 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
3831 /* Nothing but a tc class implementation is allowed to write to a tc. This
3832 * class never does that, so we can legitimately use a const tc object. */
3833 netdev->tc = CONST_CAST(struct tc *, &tc);
3837 default_tc_install(struct netdev *netdev,
3838 const struct smap *details OVS_UNUSED)
3840 default_install__(netdev);
3845 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3847 default_install__(netdev);
3851 static const struct tc_ops tc_ops_default = {
3852 NULL, /* linux_name */
3857 NULL, /* tc_destroy */
3858 NULL, /* qdisc_get */
3859 NULL, /* qdisc_set */
3860 NULL, /* class_get */
3861 NULL, /* class_set */
3862 NULL, /* class_delete */
3863 NULL, /* class_get_stats */
3864 NULL /* class_dump_stats */
3867 /* "linux-other" traffic control class.
3872 other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
3874 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3875 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
3877 /* Nothing but a tc class implementation is allowed to write to a tc. This
3878 * class never does that, so we can legitimately use a const tc object. */
3879 netdev->tc = CONST_CAST(struct tc *, &tc);
3883 static const struct tc_ops tc_ops_other = {
3884 NULL, /* linux_name */
3885 "linux-other", /* ovs_name */
3887 NULL, /* tc_install */
3889 NULL, /* tc_destroy */
3890 NULL, /* qdisc_get */
3891 NULL, /* qdisc_set */
3892 NULL, /* class_get */
3893 NULL, /* class_set */
3894 NULL, /* class_delete */
3895 NULL, /* class_get_stats */
3896 NULL /* class_dump_stats */
/* Traffic control. */

/* Number of kernel "tc" ticks per second. */
static double ticks_per_s;

/* Number of kernel "jiffies" per second.  This is used for the purpose of
 * computing buffer sizes.  Generally kernel qdiscs need to be able to buffer
 * one jiffy's worth of data.
 *
 * There are two possibilities here:
 *
 *    - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
 *      approximate range of 100 to 1024.  That means that we really need to
 *      make sure that the qdisc can buffer that much data.
 *
 *    - 'buffer_hz' is an absurdly large number.  That means that the kernel
 *      has finely granular timers and there's no need to fudge additional room
 *      for buffers.  (There's no extra effort needed to implement that: the
 *      large 'buffer_hz' is used as a divisor, so practically any number will
 *      come out as 0 in the division.  Small integer results in the case of
 *      really high dividends won't have any real effect anyhow.)
 */
static unsigned int buffer_hz;
/* Composes and returns the tc handle 'major':'minor'. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    unsigned int handle = TC_H_MAKE(major << 16, minor);
    return handle;
}
/* Extracts and returns the major number from tc handle 'handle'. */
static unsigned int
tc_get_major(unsigned int handle)
{
    unsigned int major_bits = TC_H_MAJ(handle);
    return major_bits >> 16;
}
/* Extracts and returns the minor number from tc handle 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    unsigned int minor_bits = TC_H_MIN(handle);
    return minor_bits;
}
/* Initializes 'request' as a Netlink tc message of the given 'type' (e.g.
 * RTM_NEWQDISC) with NLM_F_REQUEST plus 'flags', addressed to 'netdev' by
 * ifindex, and returns a pointer to the embedded tcmsg for the caller to
 * finish filling in.
 * NOTE(review): the get_ifindex() failure path is elided in this view —
 * presumably it returns NULL on error; confirm against the full source. */
3944 static struct tcmsg *
3945 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3946 struct ofpbuf *request)
3948 struct tcmsg *tcmsg;
3952 error = get_ifindex(netdev, &ifindex);
3957 ofpbuf_init(request, 512);
3958 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3959 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3960 tcmsg->tcm_family = AF_UNSPEC;
3961 tcmsg->tcm_ifindex = ifindex;
3962 /* Caller should fill in tcmsg->tcm_handle. */
3963 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' over the NETLINK_ROUTE socket and, if 'replyp' is nonnull,
 * stores the kernel's reply there.  Always uninitializes 'request', so the
 * caller need not clean it up regardless of outcome. */
3969 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3971 int error = nl_transact(NETLINK_ROUTE, request, replyp);
3972 ofpbuf_uninit(request);
3976 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3977 * policing configuration.
3979 * This function is equivalent to running the following when 'add' is true:
3980 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3982 * This function is equivalent to running the following when 'add' is false:
3983 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3985 * The configuration and stats may be seen with the following command:
3986 * /sbin/tc -s qdisc show dev <devname>
3988 * Returns 0 if successful, otherwise a positive errno value.
3991 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3993 struct ofpbuf request;
3994 struct tcmsg *tcmsg;
/* NLM_F_EXCL | NLM_F_CREATE: create the qdisc, failing if it already
 * exists; deletion takes no extra flags. */
3996 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3997 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3999 tcmsg = tc_make_request(netdev, type, flags, &request);
4003 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
4004 tcmsg->tcm_parent = TC_H_INGRESS;
4005 nl_msg_put_string(&request, TCA_KIND, "ingress");
4006 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
4008 error = tc_transact(&request, NULL);
4010 /* If we're deleting the qdisc, don't worry about some of the
4011 * error conditions. */
/* ENOENT/EINVAL on delete mean the ingress qdisc was already absent,
 * which is the state we wanted anyway. */
4012 if (!add && (error == ENOENT || error == EINVAL)) {
4021 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
4024 * This function is equivalent to running:
4025 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
4026 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
4029 * The configuration and stats may be seen with the following command:
4030 * /sbin/tc -s filter show <devname> eth0 parent ffff:
4032 * Returns 0 if successful, otherwise a positive errno value.
4035 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
4037 struct tc_police tc_police;
4038 struct ofpbuf request;
4039 struct tcmsg *tcmsg;
4040 size_t basic_offset;
4041 size_t police_offset;
/* Build the police action: TC_POLICE_SHOT drops packets that exceed the
 * configured rate. */
4045 memset(&tc_police, 0, sizeof tc_police);
4046 tc_police.action = TC_POLICE_SHOT;
4047 tc_police.mtu = mtu;
/* Convert kbits/s to the bytes/s that the kernel rate table expects. */
4048 tc_fill_rate(&tc_police.rate, (kbits_rate * 1000)/8, mtu);
4049 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
4050 kbits_burst * 1024);
4052 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
4053 NLM_F_EXCL | NLM_F_CREATE, &request);
/* Attach the filter to the ffff: ingress qdisc at priority 49 for all
 * Ethernet protocols. */
4057 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
4058 tcmsg->tcm_info = tc_make_handle(49,
4059 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
/* Nest the police parameters and rate table inside a "basic" classifier. */
4061 nl_msg_put_string(&request, TCA_KIND, "basic");
4062 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4063 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
4064 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
4065 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
4066 nl_msg_end_nested(&request, police_offset);
4067 nl_msg_end_nested(&request, basic_offset);
4069 error = tc_transact(&request, NULL);
/* NOTE(review): this is the body of the routine that reads /proc/net/psched
 * to initialize the file-scope 'ticks_per_s' and 'buffer_hz' globals; the
 * function header is not visible in this view — confirm against full source. */
4080 /* The values in psched are not individually very meaningful, but they are
4081 * important. The tables below show some values seen in the wild.
4085 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
4086 * (Before that, there are hints that it was 1000000000.)
4088 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
4092 * -----------------------------------
4093 * [1] 000c8000 000f4240 000f4240 00000064
4094 * [2] 000003e8 00000400 000f4240 3b9aca00
4095 * [3] 000003e8 00000400 000f4240 3b9aca00
4096 * [4] 000003e8 00000400 000f4240 00000064
4097 * [5] 000003e8 00000040 000f4240 3b9aca00
4098 * [6] 000003e8 00000040 000f4240 000000f9
4100 * a b c d ticks_per_s buffer_hz
4101 * ------- --------- ---------- ------------- ----------- -------------
4102 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
4103 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4104 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4105 * [4] 1,000 1,024 1,000,000 100 976,562 100
4106 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
4107 * [6] 1,000 64 1,000,000 249 15,625,000 249
4109 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
4110 * [2] 2.6.26-1-686-bigmem from Debian lenny
4111 * [3] 2.6.26-2-sparc64 from Debian lenny
4112 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
4113 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
4114 * [6] 2.6.34 from kernel.org on KVM
/* Parse /proc/net/psched at most once per process. */
4116 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4117 static const char fn[] = "/proc/net/psched";
4118 unsigned int a, b, c, d;
4121 if (!ovsthread_once_start(&once)) {
4128 stream = fopen(fn, "r");
4130 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
/* The file holds four hex words; see the tables above for examples. */
4134 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
4135 VLOG_WARN("%s: read failed", fn);
4139 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
4143 VLOG_WARN("%s: invalid scheduler parameters", fn);
/* a/b is the microsecond-to-tick ratio and c scales it to seconds. */
4147 ticks_per_s = (double) a * c / b;
4151 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
4154 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
4157 ovsthread_once_done(&once);
4160 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
4161 * rate of 'rate' bytes per second. */
4163 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
4166 return (rate * ticks) / ticks_per_s;
4169 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
4170 * rate of 'rate' bytes per second. */
4172 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
/* The ternary guards against division by zero when 'rate' is 0; the 64-bit
 * cast avoids overflow in the 'ticks_per_s * size' product. */
4175 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
4178 /* Returns the number of bytes that need to be reserved for qdisc buffering at
4179 * a transmission rate of 'rate' bytes per second. */
4181 tc_buffer_per_jiffy(unsigned int rate)
/* See the comment on 'buffer_hz': a huge buffer_hz yields 0, meaning no
 * extra buffering is needed on finely-timed kernels. */
4184 return rate / buffer_hz;
4187 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
4188 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
4189 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
4190 * stores NULL into it if it is absent.
4192 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
4195 * Returns 0 if successful, otherwise a positive errno value. */
4197 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
4198 struct nlattr **options)
/* TCA_KIND is mandatory; TCA_OPTIONS may legitimately be absent for
 * qdiscs that take no parameters. */
4200 static const struct nl_policy tca_policy[] = {
4201 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
4202 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
4204 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
/* Attributes start after the Netlink header and the fixed tcmsg. */
4206 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4207 tca_policy, ta, ARRAY_SIZE(ta))) {
4208 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
4213 *kind = nl_attr_get_string(ta[TCA_KIND]);
4217 *options = ta[TCA_OPTIONS];
4232 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
4233 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
4234 * into '*options', and its queue statistics into '*stats'. Any of the output
4235 * arguments may be null.
4237 * Returns 0 if successful, otherwise a positive errno value. */
4239 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
4240 struct nlattr **options, struct netdev_queue_stats *stats)
4242 static const struct nl_policy tca_policy[] = {
4243 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
4244 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
4246 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4248 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4249 tca_policy, ta, ARRAY_SIZE(ta))) {
4250 VLOG_WARN_RL(&rl, "failed to parse class message");
/* The class handle lives in the fixed tcmsg header, not an attribute. */
4255 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
4256 *handlep = tc->tcm_handle;
4260 *options = ta[TCA_OPTIONS];
4264 const struct gnet_stats_queue *gsq;
4265 struct gnet_stats_basic gsb;
4267 static const struct nl_policy stats_policy[] = {
4268 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
4269 .min_len = sizeof gsb },
4270 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
4271 .min_len = sizeof *gsq },
4273 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
4275 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
4276 sa, ARRAY_SIZE(sa))) {
4277 VLOG_WARN_RL(&rl, "failed to parse class stats");
4281 /* Alignment issues screw up the length of struct gnet_stats_basic on
4282 * some arch/bitsize combinations. Newer versions of Linux have a
4283 * struct gnet_stats_basic_packed, but we can't depend on that. The
4284 * easiest thing to do is just to make a copy. */
4285 memset(&gsb, 0, sizeof gsb);
4286 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
4287 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
4288 stats->tx_bytes = gsb.bytes;
4289 stats->tx_packets = gsb.packets;
/* The queue stats only contribute the drop counter, reported as errors. */
4291 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
4292 stats->tx_errors = gsq->drops;
/* NOTE(review): this memset appears to be an error-path fallback that
 * zeroes the caller's stats — confirm placement against full source. */
4302 memset(stats, 0, sizeof *stats);
4307 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
/* On success, stores the kernel's reply in '*replyp' (owned by caller);
 * NLM_F_ECHO asks the kernel to echo the class back in its reply. */
4310 tc_query_class(const struct netdev *netdev,
4311 unsigned int handle, unsigned int parent,
4312 struct ofpbuf **replyp)
4314 struct ofpbuf request;
4315 struct tcmsg *tcmsg;
4318 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
4322 tcmsg->tcm_handle = handle;
4323 tcmsg->tcm_parent = parent;
4325 error = tc_transact(&request, replyp);
4327 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
4328 netdev_get_name(netdev),
4329 tc_get_major(handle), tc_get_minor(handle),
4330 tc_get_major(parent), tc_get_minor(parent),
4331 ovs_strerror(error));
4336 /* Equivalent to "tc class del dev <name> handle <handle>". */
4338 tc_delete_class(const struct netdev *netdev, unsigned int handle)
4340 struct ofpbuf request;
4341 struct tcmsg *tcmsg;
4344 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
/* tcm_parent of 0 lets the kernel locate the class by handle alone. */
4348 tcmsg->tcm_handle = handle;
4349 tcmsg->tcm_parent = 0;
4351 error = tc_transact(&request, NULL);
4353 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
4354 netdev_get_name(netdev),
4355 tc_get_major(handle), tc_get_minor(handle),
4356 ovs_strerror(error));
4361 /* Equivalent to "tc qdisc del dev <name> root". */
4363 tc_del_qdisc(struct netdev *netdev_)
4365 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4366 struct ofpbuf request;
4367 struct tcmsg *tcmsg;
4370 tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
/* Handle 1:0 / parent TC_H_ROOT identifies the root qdisc that OVS
 * itself installs (see tc_query_qdisc below). */
4374 tcmsg->tcm_handle = tc_make_handle(1, 0);
4375 tcmsg->tcm_parent = TC_H_ROOT;
4377 error = tc_transact(&request, NULL);
4378 if (error == EINVAL) {
4379 /* EINVAL probably means that the default qdisc was in use, in which
4380 * case we've accomplished our purpose. */
/* Tear down any cached tc state now that the kernel qdisc is gone. */
4383 if (!error && netdev->tc) {
4384 if (netdev->tc->ops->tc_destroy) {
4385 netdev->tc->ops->tc_destroy(netdev->tc);
4392 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
4393 * kernel to determine what they are. Returns 0 if successful, otherwise a
4394 * positive errno value. */
4396 tc_query_qdisc(const struct netdev *netdev_)
4398 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4399 struct ofpbuf request, *qdisc;
4400 const struct tc_ops *ops;
4401 struct tcmsg *tcmsg;
4409 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4410 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4411 * 2.6.35 without that fix backported to it.
4413 * To avoid the OOPS, we must not make a request that would attempt to dump
4414 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4415 * few others. There are a few ways that I can see to do this, but most of
4416 * them seem to be racy (and if you lose the race the kernel OOPSes). The
4417 * technique chosen here is to assume that any non-default qdisc that we
4418 * create will have a class with handle 1:0. The built-in qdiscs only have
4419 * a class with handle 0:0.
4421 * We could check for Linux 2.6.35+ and use a more straightforward method
4423 tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
4427 tcmsg->tcm_handle = tc_make_handle(1, 0);
4428 tcmsg->tcm_parent = 0;
4430 /* Figure out what tc class to instantiate. */
4431 error = tc_transact(&request, &qdisc);
4435 error = tc_parse_qdisc(qdisc, &kind, NULL);
/* A qdisc we can't parse is treated as "linux-other". */
4437 ops = &tc_ops_other;
4439 ops = tc_lookup_linux_name(kind);
4441 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4442 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
4444 ops = &tc_ops_other;
4447 } else if (error == ENOENT) {
4448 /* Either it's a built-in qdisc, or it's a qdisc set up by some
4449 * other entity that doesn't have a handle 1:0. We will assume
4450 * that it's the system default qdisc. */
4451 ops = &tc_ops_default;
4454 /* Who knows? Maybe the device got deleted. */
4455 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
4456 netdev_get_name(netdev_), ovs_strerror(error));
4457 ops = &tc_ops_other;
4460 /* Instantiate it. */
/* tc_load must leave netdev->tc set exactly when it succeeds. */
4461 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
4462 ovs_assert((load_error == 0) == (netdev->tc != NULL));
4463 ofpbuf_delete(qdisc);
4465 return error ? error : load_error;
4468 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4469 approximate the time to transmit packets of various lengths. For an MTU of
4470 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4471 represents two possible packet lengths; for a MTU of 513 through 1024, four
4472 possible lengths; and so on.
4474 Returns, for the specified 'mtu', the number of bits that packet lengths
4475 need to be shifted right to fit within such a 256-entry table. */
4477 tc_calc_cell_log(unsigned int mtu)
/* An MTU of 0 means "unknown"; fall back to the standard Ethernet payload. */
4482 mtu = ETH_PAYLOAD_MAX;
/* Account for the L2 framing that the payload MTU excludes. */
4484 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
/* Halve mtu each iteration until it fits in 256 table slots; cell_log is
 * then the right-shift amount.  (Loop body elided in this view.) */
4486 for (cell_log = 0; mtu >= 256; cell_log++) {
4493 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
4496 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4498 memset(rate, 0, sizeof *rate);
4499 rate->cell_log = tc_calc_cell_log(mtu);
4500 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4501 /* rate->cell_align = 0; */ /* distro headers. */
/* mpu: minimum packet unit — no packet is billed below the Ethernet
 * minimum frame size. */
4502 rate->mpu = ETH_TOTAL_MIN;
4506 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4507 * attribute of the specified "type".
4509 * See tc_calc_cell_log() above for a description of "rtab"s. */
4511 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
4516 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
/* Entry i covers packets of size (i+1) << cell_log; each entry holds the
 * transmit time in ticks for that size, clamped below by mpu. */
4517 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
4518 unsigned packet_size = (i + 1) << rate->cell_log;
4519 if (packet_size < rate->mpu) {
4520 packet_size = rate->mpu;
4522 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
4526 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
4527 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
4528 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
4531 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
/* The qdisc must buffer at least one jiffy's worth of traffic plus one
 * full packet; honor a larger user-requested burst if given. */
4533 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
4534 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
4537 /* Linux-only functions declared in netdev-linux.h */
4539 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4540 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
4542 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4543 const char *flag_name, bool enable)
4545 const char *netdev_name = netdev_get_name(netdev);
4546 struct ethtool_value evalue;
/* Step 1: read the current ethtool flags. */
4550 COVERAGE_INC(netdev_get_ethtool);
4551 memset(&evalue, 0, sizeof evalue);
4552 error = netdev_linux_do_ethtool(netdev_name,
4553 (struct ethtool_cmd *)&evalue,
4554 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
/* Step 2: write them back with 'flag' set or cleared as requested. */
4559 COVERAGE_INC(netdev_set_ethtool);
4560 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4561 error = netdev_linux_do_ethtool(netdev_name,
4562 (struct ethtool_cmd *)&evalue,
4563 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
/* Step 3: read the flags again to verify the driver accepted the change,
 * since some drivers silently ignore ETHTOOL_SFLAGS. */
4568 COVERAGE_INC(netdev_get_ethtool);
4569 memset(&evalue, 0, sizeof evalue);
4570 error = netdev_linux_do_ethtool(netdev_name,
4571 (struct ethtool_cmd *)&evalue,
4572 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4577 if (new_flags != evalue.data) {
4578 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4579 "device %s failed", enable ? "enable" : "disable",
4580 flag_name, netdev_name);
4587 /* Utility functions. */
4589 /* Copies 'src' into 'dst', performing format conversion in the process. */
/* Field-by-field copy from the kernel's struct rtnl_link_stats (32-bit
 * counters) into OVS's struct netdev_stats. */
4591 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4592 const struct rtnl_link_stats *src)
4594 dst->rx_packets = src->rx_packets;
4595 dst->tx_packets = src->tx_packets;
4596 dst->rx_bytes = src->rx_bytes;
4597 dst->tx_bytes = src->tx_bytes;
4598 dst->rx_errors = src->rx_errors;
4599 dst->tx_errors = src->tx_errors;
4600 dst->rx_dropped = src->rx_dropped;
4601 dst->tx_dropped = src->tx_dropped;
4602 dst->multicast = src->multicast;
4603 dst->collisions = src->collisions;
4604 dst->rx_length_errors = src->rx_length_errors;
4605 dst->rx_over_errors = src->rx_over_errors;
4606 dst->rx_crc_errors = src->rx_crc_errors;
4607 dst->rx_frame_errors = src->rx_frame_errors;
4608 dst->rx_fifo_errors = src->rx_fifo_errors;
4609 dst->rx_missed_errors = src->rx_missed_errors;
4610 dst->tx_aborted_errors = src->tx_aborted_errors;
4611 dst->tx_carrier_errors = src->tx_carrier_errors;
4612 dst->tx_fifo_errors = src->tx_fifo_errors;
4613 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4614 dst->tx_window_errors = src->tx_window_errors;
/* Fetches interface statistics for 'netdev_' via an RTM_GETLINK Netlink
 * request and converts them into '*stats'.  Returns 0 on success or a
 * positive errno value (per the visible nl_transact error handling). */
4618 get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
4620 struct ofpbuf request;
4621 struct ofpbuf *reply;
4624 ofpbuf_init(&request, 0);
4625 nl_msg_put_nlmsghdr(&request,
4626 sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
4627 RTM_GETLINK, NLM_F_REQUEST);
4628 ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
4629 nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
4630 error = nl_transact(NETLINK_ROUTE, &request, &reply);
4631 ofpbuf_uninit(&request);
/* The reply must contain the fixed ifinfomsg followed by an IFLA_STATS
 * attribute large enough to hold struct rtnl_link_stats. */
4636 if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
4637 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS);
4638 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
4639 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
4642 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4646 VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");
4651 ofpbuf_delete(reply);
/* Retrieves the interface flags (IFF_*) of 'dev' into '*flags' via the
 * SIOCGIFFLAGS ioctl. */
4656 get_flags(const struct netdev *dev, unsigned int *flags)
4662 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
4664 *flags = ifr.ifr_flags;
/* Sets the interface flags of device 'name' to 'flags' via SIOCSIFFLAGS;
 * returns the ioctl helper's result (0 or a positive errno value). */
4670 set_flags(const char *name, unsigned int flags)
4674 ifr.ifr_flags = flags;
4675 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
/* Looks up the kernel ifindex of 'netdev_name' via SIOCGIFINDEX.
 * NOTE(review): the error path is elided here — presumably a negative errno
 * is returned on failure (see get_ifindex() below, which negates it). */
4679 do_get_ifindex(const char *netdev_name)
4684 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4685 COVERAGE_INC(netdev_get_ifindex);
4687 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
4689 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4690 netdev_name, ovs_strerror(error));
4693 return ifr.ifr_ifindex;
/* Returns 'netdev_''s ifindex in '*ifindexp', consulting and maintaining a
 * per-netdev cache so the ioctl is issued at most once. */
4697 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4699 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4701 if (!(netdev->cache_valid & VALID_IFINDEX)) {
4702 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
/* do_get_ifindex() encodes failure as a negative errno value. */
4705 netdev->get_ifindex_error = -ifindex;
4706 netdev->ifindex = 0;
4708 netdev->get_ifindex_error = 0;
4709 netdev->ifindex = ifindex;
4711 netdev->cache_valid |= VALID_IFINDEX;
/* Both the cached result and the cached error are replayed on hits. */
4714 *ifindexp = netdev->ifindex;
4715 return netdev->get_ifindex_error;
/* Reads the Ethernet hardware address of 'netdev_name' into 'ea' via the
 * SIOCGIFHWADDR ioctl. */
4719 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4725 memset(&ifr, 0, sizeof ifr);
4726 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4727 COVERAGE_INC(netdev_get_hwaddr);
4728 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
4730 /* ENODEV probably means that a vif disappeared asynchronously and
4731 * hasn't been removed from the database yet, so reduce the log level
4732 * to INFO for that case. */
4733 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
4734 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4735 netdev_name, ovs_strerror(error));
/* Only Ethernet-style addresses make sense for our 6-byte 'ea'. */
4738 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4739 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4740 VLOG_WARN("%s device has unknown hardware address family %d",
4741 netdev_name, hwaddr_family);
4743 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the Ethernet hardware address of 'netdev_name' to 'mac' via the
 * SIOCSIFHWADDR ioctl. */
4748 set_etheraddr(const char *netdev_name,
4749 const uint8_t mac[ETH_ADDR_LEN])
4754 memset(&ifr, 0, sizeof ifr);
4755 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4756 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
4757 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4758 COVERAGE_INC(netdev_set_hwaddr);
4759 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
4761 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4762 netdev_name, ovs_strerror(error));
/* Issues ethtool command 'cmd' (e.g. ETHTOOL_GFLAGS) for device 'name',
 * using 'ecmd' as the in/out command buffer.  'cmd_name' is used only for
 * log messages. */
4768 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4769 int cmd, const char *cmd_name)
4774 memset(&ifr, 0, sizeof ifr);
4775 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4776 ifr.ifr_data = (caddr_t) ecmd;
4779 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
4781 if (error != EOPNOTSUPP) {
4782 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4783 "failed: %s", cmd_name, name, ovs_strerror(error));
4785 /* The device doesn't support this operation. That's pretty
4786 * common, so there's no point in logging anything. */
/* Fetches an IPv4 address of 'netdev' (which one depends on 'cmd', e.g.
 * SIOCGIFADDR vs. SIOCGIFDSTADDR) into '*ip'. */
4793 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4794 int cmd, const char *cmd_name)
4799 ifr.ifr_addr.sa_family = AF_INET;
4800 error = af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
/* ALIGNED_CAST avoids undefined behavior from the misaligned sockaddr. */
4802 const struct sockaddr_in *sin = ALIGNED_CAST(struct sockaddr_in *,
4804 *ip = sin->sin_addr;
4809 /* Returns an AF_PACKET raw socket or a negative errno value. */
4811 af_packet_sock(void)
4813 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4816 if (ovsthread_once_start(&once)) {
4817 sock = socket(AF_PACKET, SOCK_RAW, 0);
4819 int error = set_nonblocking(sock);
4826 VLOG_ERR("failed to create packet socket: %s",
4827 ovs_strerror(errno));
4829 ovsthread_once_done(&once);