/*
 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014 Nicira, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
19 #include "netdev-linux.h"
23 #include <arpa/inet.h>
25 #include <linux/filter.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <sys/types.h>
37 #include <sys/ioctl.h>
38 #include <sys/socket.h>
39 #include <netpacket/packet.h>
41 #include <net/if_arp.h>
42 #include <net/if_packet.h>
43 #include <net/route.h>
44 #include <netinet/in.h>
51 #include "dpif-linux.h"
52 #include "dpif-netdev.h"
53 #include "dynamic-string.h"
54 #include "fatal-signal.h"
57 #include "netdev-provider.h"
58 #include "netdev-vport.h"
59 #include "netlink-notifier.h"
60 #include "netlink-socket.h"
63 #include "openflow/openflow.h"
64 #include "ovs-atomic.h"
65 #include "packet-dpif.h"
67 #include "poll-loop.h"
68 #include "rtnetlink-link.h"
70 #include "socket-util.h"
73 #include "unaligned.h"
76 VLOG_DEFINE_THIS_MODULE(netdev_linux);
78 COVERAGE_DEFINE(netdev_set_policing);
79 COVERAGE_DEFINE(netdev_arp_lookup);
80 COVERAGE_DEFINE(netdev_get_ifindex);
81 COVERAGE_DEFINE(netdev_get_hwaddr);
82 COVERAGE_DEFINE(netdev_set_hwaddr);
83 COVERAGE_DEFINE(netdev_get_ethtool);
84 COVERAGE_DEFINE(netdev_set_ethtool);
87 /* These were introduced in Linux 2.6.14, so they might be missing if we have
89 #ifndef ADVERTISED_Pause
90 #define ADVERTISED_Pause (1 << 13)
92 #ifndef ADVERTISED_Asym_Pause
93 #define ADVERTISED_Asym_Pause (1 << 14)
96 /* These were introduced in Linux 2.6.24, so they might be missing if we
97 * have old headers. */
98 #ifndef ETHTOOL_GFLAGS
99 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
101 #ifndef ETHTOOL_SFLAGS
102 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
105 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
108 #define TC_RTAB_SIZE 1024
111 /* Linux 2.6.21 introduced struct tpacket_auxdata.
112 * Linux 2.6.27 added the tp_vlan_tci member.
113 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
114 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
115 * TP_STATUS_VLAN_TPID_VALID.
117 * With all this churn it's easiest to unconditionally define a replacement
118 * structure that has everything we want.
120 #ifndef PACKET_AUXDATA
121 #define PACKET_AUXDATA 8
123 #ifndef TP_STATUS_VLAN_VALID
124 #define TP_STATUS_VLAN_VALID (1 << 4)
126 #ifndef TP_STATUS_VLAN_TPID_VALID
127 #define TP_STATUS_VLAN_TPID_VALID (1 << 6)
129 #undef tpacket_auxdata
130 #define tpacket_auxdata rpl_tpacket_auxdata
131 struct tpacket_auxdata {
137 uint16_t tp_vlan_tci;
138 uint16_t tp_vlan_tpid;
142 VALID_IFINDEX = 1 << 0,
143 VALID_ETHERADDR = 1 << 1,
147 VALID_POLICING = 1 << 5,
148 VALID_VPORT_STAT_ERROR = 1 << 6,
149 VALID_DRVINFO = 1 << 7,
150 VALID_FEATURES = 1 << 8,
153 /* Traffic control. */
155 /* An instance of a traffic control class. Always associated with a particular
158 * Each TC implementation subclasses this with whatever additional data it
161 const struct tc_ops *ops;
162 struct hmap queues; /* Contains "struct tc_queue"s.
163 * Read by generic TC layer.
164 * Written only by TC implementation. */
167 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
169 /* One traffic control queue.
171 * Each TC implementation subclasses this with whatever additional data it
174 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
175 unsigned int queue_id; /* OpenFlow queue ID. */
176 long long int created; /* Time queue was created, in msecs. */
179 /* A particular kind of traffic control. Each implementation generally maps to
180 * one particular Linux qdisc class.
182 * The functions below return 0 if successful or a positive errno value on
183 * failure, except where otherwise noted. All of them must be provided, except
184 * where otherwise noted. */
186 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
187 * This is null for tc_ops_default and tc_ops_other, for which there are no
188 * appropriate values. */
189 const char *linux_name;
191 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
192 const char *ovs_name;
194 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
195 * queues. The queues are numbered 0 through n_queues - 1. */
196 unsigned int n_queues;
198 /* Called to install this TC class on 'netdev'. The implementation should
199 * make the Netlink calls required to set up 'netdev' with the right qdisc
200 * and configure it according to 'details'. The implementation may assume
201 * that the current qdisc is the default; that is, there is no need for it
202 * to delete the current qdisc before installing itself.
204 * The contents of 'details' should be documented as valid for 'ovs_name'
205 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
206 * (which is built as ovs-vswitchd.conf.db(8)).
208 * This function must return 0 if and only if it sets 'netdev->tc' to an
209 * initialized 'struct tc'.
211 * (This function is null for tc_ops_other, which cannot be installed. For
212 * other TC classes it should always be nonnull.) */
213 int (*tc_install)(struct netdev *netdev, const struct smap *details);
215 /* Called when the netdev code determines (through a Netlink query) that
216 * this TC class's qdisc is installed on 'netdev', but we didn't install
217 * it ourselves and so don't know any of the details.
219 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
220 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
221 * implementation should parse the other attributes of 'nlmsg' as
222 * necessary to determine its configuration. If necessary it should also
223 * use Netlink queries to determine the configuration of queues on
226 * This function must return 0 if and only if it sets 'netdev->tc' to an
227 * initialized 'struct tc'. */
228 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
230 /* Destroys the data structures allocated by the implementation as part of
231 * 'tc'. (This includes destroying 'tc->queues' by calling
234 * The implementation should not need to perform any Netlink calls. If
235 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
236 * (But it may not be desirable.)
238 * This function may be null if 'tc' is trivial. */
239 void (*tc_destroy)(struct tc *tc);
241 /* Retrieves details of 'netdev->tc' configuration into 'details'.
243 * The implementation should not need to perform any Netlink calls, because
244 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
245 * cached the configuration.
247 * The contents of 'details' should be documented as valid for 'ovs_name'
248 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
249 * (which is built as ovs-vswitchd.conf.db(8)).
251 * This function may be null if 'tc' is not configurable.
253 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
255 /* Reconfigures 'netdev->tc' according to 'details', performing any
256 * required Netlink calls to complete the reconfiguration.
258 * The contents of 'details' should be documented as valid for 'ovs_name'
259 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
260 * (which is built as ovs-vswitchd.conf.db(8)).
262 * This function may be null if 'tc' is not configurable.
264 int (*qdisc_set)(struct netdev *, const struct smap *details);
266 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
267 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
269 * The contents of 'details' should be documented as valid for 'ovs_name'
270 * in the "other_config" column in the "Queue" table in
271 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
273 * The implementation should not need to perform any Netlink calls, because
274 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
275 * cached the queue configuration.
277 * This function may be null if 'tc' does not have queues ('n_queues' is
279 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
280 struct smap *details);
282 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
283 * 'details', perfoming any required Netlink calls to complete the
284 * reconfiguration. The caller ensures that 'queue_id' is less than
287 * The contents of 'details' should be documented as valid for 'ovs_name'
288 * in the "other_config" column in the "Queue" table in
289 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
291 * This function may be null if 'tc' does not have queues or its queues are
292 * not configurable. */
293 int (*class_set)(struct netdev *, unsigned int queue_id,
294 const struct smap *details);
296 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
297 * tc_queue's within 'netdev->tc->queues'.
299 * This function may be null if 'tc' does not have queues or its queues
300 * cannot be deleted. */
301 int (*class_delete)(struct netdev *, struct tc_queue *queue);
303 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
304 * 'struct tc_queue's within 'netdev->tc->queues'.
306 * On success, initializes '*stats'.
308 * This function may be null if 'tc' does not have queues or if it cannot
309 * report queue statistics. */
310 int (*class_get_stats)(const struct netdev *netdev,
311 const struct tc_queue *queue,
312 struct netdev_queue_stats *stats);
314 /* Extracts queue stats from 'nlmsg', which is a response to a
315 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
317 * This function may be null if 'tc' does not have queues or if it cannot
318 * report queue statistics. */
319 int (*class_dump_stats)(const struct netdev *netdev,
320 const struct ofpbuf *nlmsg,
321 netdev_dump_queue_stats_cb *cb, void *aux);
325 tc_init(struct tc *tc, const struct tc_ops *ops)
328 hmap_init(&tc->queues);
332 tc_destroy(struct tc *tc)
334 hmap_destroy(&tc->queues);
337 static const struct tc_ops tc_ops_htb;
338 static const struct tc_ops tc_ops_hfsc;
339 static const struct tc_ops tc_ops_default;
340 static const struct tc_ops tc_ops_other;
342 static const struct tc_ops *const tcs[] = {
343 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
344 &tc_ops_hfsc, /* Hierarchical fair service curve. */
345 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
346 &tc_ops_other, /* Some other qdisc. */
350 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
351 static unsigned int tc_get_major(unsigned int handle);
352 static unsigned int tc_get_minor(unsigned int handle);
354 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
355 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
356 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
358 static struct tcmsg *tc_make_request(const struct netdev *, int type,
359 unsigned int flags, struct ofpbuf *);
360 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
361 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
362 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
365 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
366 struct nlattr **options);
367 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
368 struct nlattr **options,
369 struct netdev_queue_stats *);
370 static int tc_query_class(const struct netdev *,
371 unsigned int handle, unsigned int parent,
372 struct ofpbuf **replyp);
373 static int tc_delete_class(const struct netdev *, unsigned int handle);
375 static int tc_del_qdisc(struct netdev *netdev);
376 static int tc_query_qdisc(const struct netdev *netdev);
378 static int tc_calc_cell_log(unsigned int mtu);
379 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
380 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
381 const struct tc_ratespec *rate);
382 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
384 struct netdev_linux {
387 /* Protects all members below. */
388 struct ovs_mutex mutex;
390 unsigned int cache_valid;
392 bool miimon; /* Link status of last poll. */
393 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
394 struct timer miimon_timer;
396 /* The following are figured out "on demand" only. They are only valid
397 * when the corresponding VALID_* bit in 'cache_valid' is set. */
399 uint8_t etheraddr[ETH_ADDR_LEN];
400 struct in_addr address, netmask;
403 unsigned int ifi_flags;
404 long long int carrier_resets;
405 uint32_t kbits_rate; /* Policing data. */
406 uint32_t kbits_burst;
407 int vport_stats_error; /* Cached error code from vport_get_stats().
408 0 or an errno value. */
409 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
410 int ether_addr_error; /* Cached error code from set/get etheraddr. */
411 int netdev_policing_error; /* Cached error code from set policing. */
412 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
413 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
415 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
416 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
417 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
419 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
422 /* For devices of class netdev_tap_class only. */
426 struct netdev_rxq_linux {
427 struct netdev_rxq up;
432 /* This is set pretty low because we probably won't learn anything from the
433 * additional log messages. */
434 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
436 /* Polling miimon status for all ports causes performance degradation when
437 * handling a large number of ports. If there are no devices using miimon, then
438 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait(). */
439 static atomic_int miimon_cnt = ATOMIC_VAR_INIT(0);
441 static void netdev_linux_run(void);
443 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
444 int cmd, const char *cmd_name);
445 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
446 int cmd, const char *cmd_name);
447 static int get_flags(const struct netdev *, unsigned int *flags);
448 static int set_flags(const char *, unsigned int flags);
449 static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
450 enum netdev_flags on, enum netdev_flags *old_flagsp)
451 OVS_REQUIRES(netdev->mutex);
452 static int do_get_ifindex(const char *netdev_name);
453 static int get_ifindex(const struct netdev *, int *ifindexp);
454 static int do_set_addr(struct netdev *netdev,
455 int ioctl_nr, const char *ioctl_name,
456 struct in_addr addr);
457 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
458 static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
459 static int get_stats_via_netlink(const struct netdev *, struct netdev_stats *);
460 static int af_packet_sock(void);
461 static bool netdev_linux_miimon_enabled(void);
462 static void netdev_linux_miimon_run(void);
463 static void netdev_linux_miimon_wait(void);
464 static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup);
467 is_netdev_linux_class(const struct netdev_class *netdev_class)
469 return netdev_class->run == netdev_linux_run;
473 is_tap_netdev(const struct netdev *netdev)
475 return netdev_get_class(netdev) == &netdev_tap_class;
478 static struct netdev_linux *
479 netdev_linux_cast(const struct netdev *netdev)
481 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
483 return CONTAINER_OF(netdev, struct netdev_linux, up);
486 static struct netdev_rxq_linux *
487 netdev_rxq_linux_cast(const struct netdev_rxq *rx)
489 ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
490 return CONTAINER_OF(rx, struct netdev_rxq_linux, up);
493 static void netdev_linux_update(struct netdev_linux *netdev,
494 const struct rtnetlink_link_change *)
495 OVS_REQUIRES(netdev->mutex);
496 static void netdev_linux_changed(struct netdev_linux *netdev,
497 unsigned int ifi_flags, unsigned int mask)
498 OVS_REQUIRES(netdev->mutex);
500 /* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK changes, or NULL
501 * if no such socket could be created. */
502 static struct nl_sock *
503 netdev_linux_notify_sock(void)
505 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
506 static struct nl_sock *sock;
508 if (ovsthread_once_start(&once)) {
511 error = nl_sock_create(NETLINK_ROUTE, &sock);
513 error = nl_sock_join_mcgroup(sock, RTNLGRP_LINK);
515 nl_sock_destroy(sock);
519 ovsthread_once_done(&once);
526 netdev_linux_miimon_enabled(void)
530 atomic_read(&miimon_cnt, &miimon);
535 netdev_linux_run(void)
537 struct nl_sock *sock;
540 if (netdev_linux_miimon_enabled()) {
541 netdev_linux_miimon_run();
544 sock = netdev_linux_notify_sock();
550 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
551 uint64_t buf_stub[4096 / 8];
554 ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
555 error = nl_sock_recv(sock, &buf, false);
557 struct rtnetlink_link_change change;
559 if (rtnetlink_link_parse(&buf, &change)) {
560 struct netdev *netdev_ = netdev_from_name(change.ifname);
561 if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
562 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
564 ovs_mutex_lock(&netdev->mutex);
565 netdev_linux_update(netdev, &change);
566 ovs_mutex_unlock(&netdev->mutex);
568 netdev_close(netdev_);
570 } else if (error == ENOBUFS) {
571 struct shash device_shash;
572 struct shash_node *node;
576 shash_init(&device_shash);
577 netdev_get_devices(&netdev_linux_class, &device_shash);
578 SHASH_FOR_EACH (node, &device_shash) {
579 struct netdev *netdev_ = node->data;
580 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
583 ovs_mutex_lock(&netdev->mutex);
584 get_flags(netdev_, &flags);
585 netdev_linux_changed(netdev, flags, 0);
586 ovs_mutex_unlock(&netdev->mutex);
588 netdev_close(netdev_);
590 shash_destroy(&device_shash);
591 } else if (error != EAGAIN) {
592 VLOG_WARN_RL(&rl, "error reading or parsing netlink (%s)",
593 ovs_strerror(error));
/* Arranges for poll_block() to wake up when netdev_linux_run() has work:
 * either the miimon timer fires or the rtnetlink socket becomes readable. */
static void
netdev_linux_wait(void)
{
    struct nl_sock *sock;

    if (netdev_linux_miimon_enabled()) {
        netdev_linux_miimon_wait();
    }
    sock = netdev_linux_notify_sock();
    if (sock) {
        nl_sock_wait(sock, POLLIN);
    }
}
614 netdev_linux_changed(struct netdev_linux *dev,
615 unsigned int ifi_flags, unsigned int mask)
616 OVS_REQUIRES(dev->mutex)
618 netdev_change_seq_changed(&dev->up);
620 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
621 dev->carrier_resets++;
623 dev->ifi_flags = ifi_flags;
625 dev->cache_valid &= mask;
629 netdev_linux_update(struct netdev_linux *dev,
630 const struct rtnetlink_link_change *change)
631 OVS_REQUIRES(dev->mutex)
633 if (change->nlmsg_type == RTM_NEWLINK) {
635 netdev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
637 /* Update netdev from rtnl-change msg. */
639 dev->mtu = change->mtu;
640 dev->cache_valid |= VALID_MTU;
641 dev->netdev_mtu_error = 0;
644 if (!eth_addr_is_zero(change->addr)) {
645 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
646 dev->cache_valid |= VALID_ETHERADDR;
647 dev->ether_addr_error = 0;
650 dev->ifindex = change->ifi_index;
651 dev->cache_valid |= VALID_IFINDEX;
652 dev->get_ifindex_error = 0;
655 netdev_linux_changed(dev, change->ifi_flags, 0);
659 static struct netdev *
660 netdev_linux_alloc(void)
662 struct netdev_linux *netdev = xzalloc(sizeof *netdev);
667 netdev_linux_common_construct(struct netdev_linux *netdev)
669 ovs_mutex_init(&netdev->mutex);
672 /* Creates system and internal devices. */
674 netdev_linux_construct(struct netdev *netdev_)
676 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
679 netdev_linux_common_construct(netdev);
681 error = get_flags(&netdev->up, &netdev->ifi_flags);
682 if (error == ENODEV) {
683 if (netdev->up.netdev_class != &netdev_internal_class) {
684 /* The device does not exist, so don't allow it to be opened. */
687 /* "Internal" netdevs have to be created as netdev objects before
688 * they exist in the kernel, because creating them in the kernel
689 * happens by passing a netdev object to dpif_port_add().
690 * Therefore, ignore the error. */
697 /* For most types of netdevs we open the device for each call of
698 * netdev_open(). However, this is not the case with tap devices,
699 * since it is only possible to open the device once. In this
700 * situation we share a single file descriptor, and consequently
701 * buffers, across all readers. Therefore once data is read it will
702 * be unavailable to other reads for tap devices. */
704 netdev_linux_construct_tap(struct netdev *netdev_)
706 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
707 static const char tap_dev[] = "/dev/net/tun";
708 const char *name = netdev_->name;
712 netdev_linux_common_construct(netdev);
714 /* Open tap device. */
715 netdev->tap_fd = open(tap_dev, O_RDWR);
716 if (netdev->tap_fd < 0) {
718 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
722 /* Create tap device. */
723 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
724 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
725 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
726 VLOG_WARN("%s: creating tap device failed: %s", name,
727 ovs_strerror(errno));
732 /* Make non-blocking. */
733 error = set_nonblocking(netdev->tap_fd);
741 close(netdev->tap_fd);
746 netdev_linux_destruct(struct netdev *netdev_)
748 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
750 if (netdev->tc && netdev->tc->ops->tc_destroy) {
751 netdev->tc->ops->tc_destroy(netdev->tc);
754 if (netdev_get_class(netdev_) == &netdev_tap_class
755 && netdev->tap_fd >= 0)
757 close(netdev->tap_fd);
760 if (netdev->miimon_interval > 0) {
762 atomic_sub(&miimon_cnt, 1, &junk);
765 ovs_mutex_destroy(&netdev->mutex);
/* netdev provider 'dealloc' callback: frees the container allocated by
 * netdev_linux_alloc(). */
static void
netdev_linux_dealloc(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    free(netdev);
}
775 static struct netdev_rxq *
776 netdev_linux_rxq_alloc(void)
778 struct netdev_rxq_linux *rx = xzalloc(sizeof *rx);
783 netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
785 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
786 struct netdev *netdev_ = rx->up.netdev;
787 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
790 ovs_mutex_lock(&netdev->mutex);
791 rx->is_tap = is_tap_netdev(netdev_);
793 rx->fd = netdev->tap_fd;
795 struct sockaddr_ll sll;
797 /* Result of tcpdump -dd inbound */
798 static const struct sock_filter filt[] = {
799 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
800 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
801 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
802 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
804 static const struct sock_fprog fprog = {
805 ARRAY_SIZE(filt), (struct sock_filter *) filt
808 /* Create file descriptor. */
809 rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
812 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
817 if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) {
819 VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
820 netdev_get_name(netdev_), ovs_strerror(error));
824 /* Set non-blocking mode. */
825 error = set_nonblocking(rx->fd);
830 /* Get ethernet device index. */
831 error = get_ifindex(&netdev->up, &ifindex);
836 /* Bind to specific ethernet device. */
837 memset(&sll, 0, sizeof sll);
838 sll.sll_family = AF_PACKET;
839 sll.sll_ifindex = ifindex;
840 sll.sll_protocol = htons(ETH_P_ALL);
841 if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
843 VLOG_ERR("%s: failed to bind raw socket (%s)",
844 netdev_get_name(netdev_), ovs_strerror(error));
848 /* Filter for only inbound packets. */
849 error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
853 VLOG_ERR("%s: failed to attach filter (%s)",
854 netdev_get_name(netdev_), ovs_strerror(error));
858 ovs_mutex_unlock(&netdev->mutex);
866 ovs_mutex_unlock(&netdev->mutex);
871 netdev_linux_rxq_destruct(struct netdev_rxq *rxq_)
873 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
/* rxq 'dealloc' callback: frees the container allocated by
 * netdev_linux_rxq_alloc(). */
static void
netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_)
{
    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);

    free(rx);
}
889 auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux)
891 if (aux->tp_status & TP_STATUS_VLAN_TPID_VALID) {
892 return htons(aux->tp_vlan_tpid);
894 return htons(ETH_TYPE_VLAN);
899 auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
901 return aux->tp_vlan_tci || aux->tp_status & TP_STATUS_VLAN_VALID;
905 netdev_linux_rxq_recv_sock(int fd, struct ofpbuf *buffer)
910 struct cmsghdr *cmsg;
913 char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
917 /* Reserve headroom for a single VLAN tag */
918 ofpbuf_reserve(buffer, VLAN_HEADER_LEN);
919 size = ofpbuf_tailroom(buffer);
921 iov.iov_base = ofpbuf_data(buffer);
923 msgh.msg_name = NULL;
924 msgh.msg_namelen = 0;
927 msgh.msg_control = &cmsg_buffer;
928 msgh.msg_controllen = sizeof cmsg_buffer;
932 retval = recvmsg(fd, &msgh, MSG_TRUNC);
933 } while (retval < 0 && errno == EINTR);
937 } else if (retval > size) {
941 ofpbuf_set_size(buffer, ofpbuf_size(buffer) + retval);
943 for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg; cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
944 const struct tpacket_auxdata *aux;
946 if (cmsg->cmsg_level != SOL_PACKET
947 || cmsg->cmsg_type != PACKET_AUXDATA
948 || cmsg->cmsg_len < CMSG_LEN(sizeof(struct tpacket_auxdata))) {
952 aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg));
953 if (auxdata_has_vlan_tci(aux)) {
954 if (retval < ETH_HEADER_LEN) {
958 eth_push_vlan(buffer, auxdata_to_vlan_tpid(aux),
959 htons(aux->tp_vlan_tci));
/* Receives one packet from tap fd 'fd' into 'buffer'.  Returns 0 on
 * success, otherwise a positive errno value (EMSGSIZE if the packet did not
 * fit in the buffer's tailroom). */
static int
netdev_linux_rxq_recv_tap(int fd, struct ofpbuf *buffer)
{
    ssize_t retval;
    size_t size = ofpbuf_tailroom(buffer);

    do {
        retval = read(fd, ofpbuf_data(buffer), size);
    } while (retval < 0 && errno == EINTR);

    if (retval < 0) {
        return errno;
    } else if (retval > size) {
        return EMSGSIZE;
    }

    ofpbuf_set_size(buffer, ofpbuf_size(buffer) + retval);
    return 0;
}
988 netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dpif_packet **packets,
991 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
992 struct netdev *netdev = rx->up.netdev;
993 struct dpif_packet *packet;
994 struct ofpbuf *buffer;
998 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
999 mtu = ETH_PAYLOAD_MAX;
1002 packet = dpif_packet_new_with_headroom(VLAN_ETH_HEADER_LEN + mtu,
1003 DP_NETDEV_HEADROOM);
1004 buffer = &packet->ofpbuf;
1006 retval = (rx->is_tap
1007 ? netdev_linux_rxq_recv_tap(rx->fd, buffer)
1008 : netdev_linux_rxq_recv_sock(rx->fd, buffer));
1011 if (retval != EAGAIN && retval != EMSGSIZE) {
1012 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
1013 ovs_strerror(errno), netdev_rxq_get_name(rxq_));
1015 ofpbuf_delete(buffer);
1017 dp_packet_pad(buffer);
1018 packets[0] = packet;
1026 netdev_linux_rxq_wait(struct netdev_rxq *rxq_)
1028 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1029 poll_fd_wait(rx->fd, POLLIN);
1033 netdev_linux_rxq_drain(struct netdev_rxq *rxq_)
1035 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1038 int error = af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_), &ifr,
1039 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
1043 drain_fd(rx->fd, ifr.ifr_qlen);
1046 return drain_rcvbuf(rx->fd);
1050 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
1051 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
1052 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
1053 * the packet is too big or too small to transmit on the device.
1055 * The caller retains ownership of 'buffer' in all cases.
1057 * The kernel maintains a packet transmission queue, so the caller is not
1058 * expected to do additional queuing of packets. */
1060 netdev_linux_send(struct netdev *netdev_, struct dpif_packet *pkt,
1063 const void *data = ofpbuf_data(&pkt->ofpbuf);
1064 size_t size = ofpbuf_size(&pkt->ofpbuf);
1069 if (!is_tap_netdev(netdev_)) {
1070 /* Use our AF_PACKET socket to send to this device. */
1071 struct sockaddr_ll sll;
1077 sock = af_packet_sock();
1082 ifindex = netdev_get_ifindex(netdev_);
1087 /* We don't bother setting most fields in sockaddr_ll because the
1088 * kernel ignores them for SOCK_RAW. */
1089 memset(&sll, 0, sizeof sll);
1090 sll.sll_family = AF_PACKET;
1091 sll.sll_ifindex = ifindex;
1093 iov.iov_base = CONST_CAST(void *, data);
1096 msg.msg_name = &sll;
1097 msg.msg_namelen = sizeof sll;
1100 msg.msg_control = NULL;
1101 msg.msg_controllen = 0;
1104 retval = sendmsg(sock, &msg, 0);
1106 /* Use the tap fd to send to this device. This is essential for
1107 * tap devices, because packets sent to a tap device with an
1108 * AF_PACKET socket will loop back to be *received* again on the
1109 * tap device. This doesn't occur on other interface types
1110 * because we attach a socket filter to the rx socket. */
1111 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1113 retval = write(netdev->tap_fd, data, size);
1117 dpif_packet_delete(pkt);
1121 /* The Linux AF_PACKET implementation never blocks waiting for room
1122 * for packets, instead returning ENOBUFS. Translate this into
1123 * EAGAIN for the caller. */
1124 if (errno == ENOBUFS) {
1126 } else if (errno == EINTR) {
1128 } else if (errno != EAGAIN) {
1129 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
1130 netdev_get_name(netdev_), ovs_strerror(errno));
1133 } else if (retval != size) {
1134 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE" bytes of "
1135 "%"PRIuSIZE") on %s", retval, size, netdev_get_name(netdev_));
/* Registers with the poll loop to wake up from the next call to poll_block()
 * when the packet transmission queue has sufficient room to transmit a packet
 * with netdev_send().
 *
 * The kernel maintains a packet transmission queue, so the client is not
 * expected to do additional queuing of packets.  Thus, this function is
 * unlikely to ever be used.  It is included for completeness. */
static void
netdev_linux_send_wait(struct netdev *netdev)
{
    if (is_tap_netdev(netdev)) {
        /* TAP device always accepts packets.*/
        poll_immediate_wake();
    }
}
1159 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1160 * otherwise a positive errno value. */
1162 netdev_linux_set_etheraddr(struct netdev *netdev_,
1163 const uint8_t mac[ETH_ADDR_LEN])
1165 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1166 enum netdev_flags old_flags = 0;
1169 ovs_mutex_lock(&netdev->mutex);
/* Fast path: if a cached address (or cached error) is available and it
 * already matches the requested MAC, skip the ioctl entirely. */
1171 if (netdev->cache_valid & VALID_ETHERADDR) {
1172 error = netdev->ether_addr_error;
1173 if (error || eth_addr_equals(netdev->etheraddr, mac)) {
/* Invalidate the cached address before attempting the change. */
1176 netdev->cache_valid &= ~VALID_ETHERADDR;
1179 /* Tap devices must be brought down before setting the address. */
1180 if (is_tap_netdev(netdev_)) {
1181 update_flags(netdev, NETDEV_UP, 0, &old_flags);
1183 error = set_etheraddr(netdev_get_name(netdev_), mac);
/* Cache the outcome, including ENODEV, so a vanished device does not
 * cause the ioctl to be retried on every call. */
1184 if (!error || error == ENODEV) {
1185 netdev->ether_addr_error = error;
1186 netdev->cache_valid |= VALID_ETHERADDR;
1188 memcpy(netdev->etheraddr, mac, ETH_ADDR_LEN);
/* Bring a tap device back up if we took it down above. */
1192 if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
1193 update_flags(netdev, 0, NETDEV_UP, &old_flags);
1197 ovs_mutex_unlock(&netdev->mutex);
1201 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1203 netdev_linux_get_etheraddr(const struct netdev *netdev_,
1204 uint8_t mac[ETH_ADDR_LEN])
1206 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1209 ovs_mutex_lock(&netdev->mutex);
/* Populate the cache on first use; later calls reuse the cached address
 * (or the cached errno from the failed lookup). */
1210 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1211 netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
1213 netdev->cache_valid |= VALID_ETHERADDR;
1216 error = netdev->ether_addr_error;
1218 memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN);
1220 ovs_mutex_unlock(&netdev->mutex);
/* Fetches 'netdev''s MTU via SIOCGIFMTU, caching both the value and the
 * resulting errno. Caller must hold 'netdev->mutex'. */
1226 netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
1230 if (!(netdev->cache_valid & VALID_MTU)) {
1233 netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
1234 netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
1235 netdev->mtu = ifr.ifr_mtu;
1236 netdev->cache_valid |= VALID_MTU;
1239 error = netdev->netdev_mtu_error;
1241 *mtup = netdev->mtu;
1247 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1248 * in bytes, not including the hardware header; thus, this is typically 1500
1249 * bytes for Ethernet devices. */
1251 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1253 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1256 ovs_mutex_lock(&netdev->mutex);
1257 error = netdev_linux_get_mtu__(netdev, mtup);
1258 ovs_mutex_unlock(&netdev->mutex);
1263 /* Sets the maximum size of transmitted (MTU) for given device using linux
1264 * networking ioctl interface.
1267 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1269 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1273 ovs_mutex_lock(&netdev->mutex);
/* Skip the ioctl when the cached MTU (or cached error) already matches
 * the requested value. */
1274 if (netdev->cache_valid & VALID_MTU) {
1275 error = netdev->netdev_mtu_error;
1276 if (error || netdev->mtu == mtu) {
/* Invalidate before attempting the change. */
1279 netdev->cache_valid &= ~VALID_MTU;
1282 error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
1283 SIOCSIFMTU, "SIOCSIFMTU");
/* Cache the result, including ENODEV, to avoid retrying on a device
 * that has disappeared. */
1284 if (!error || error == ENODEV) {
1285 netdev->netdev_mtu_error = error;
1286 netdev->mtu = ifr.ifr_mtu;
1287 netdev->cache_valid |= VALID_MTU;
1290 ovs_mutex_unlock(&netdev->mutex);
1294 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1295 * On failure, returns a negative errno value. */
1297 netdev_linux_get_ifindex(const struct netdev *netdev_)
1299 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1302 ovs_mutex_lock(&netdev->mutex);
1303 error = get_ifindex(netdev_, &ifindex);
1304 ovs_mutex_unlock(&netdev->mutex);
/* Negate the errno so callers can distinguish failure (negative) from a
 * valid, positive ifindex. */
1306 return error ? -error : ifindex;
/* Reports link state: MII polling result when miimon is enabled for this
 * device, otherwise the kernel's IFF_RUNNING interface flag. */
1310 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1312 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1314 ovs_mutex_lock(&netdev->mutex);
1315 if (netdev->miimon_interval > 0) {
1316 *carrier = netdev->miimon;
1318 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1320 ovs_mutex_unlock(&netdev->mutex);
/* Returns the number of carrier state changes recorded for 'netdev_'. */
1325 static long long int
1326 netdev_linux_get_carrier_resets(const struct netdev *netdev_)
1328 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1329 long long int carrier_resets;
1331 ovs_mutex_lock(&netdev->mutex);
1332 carrier_resets = netdev->carrier_resets;
1333 ovs_mutex_unlock(&netdev->mutex);
1335 return carrier_resets;
/* Issues the MII ioctl 'cmd' (e.g. SIOCGMIIPHY, SIOCGMIIREG) on interface
 * 'name', copying 'data' into and back out of the ifreq. */
1339 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1340 struct mii_ioctl_data *data)
1345 memset(&ifr, 0, sizeof ifr);
1346 memcpy(&ifr.ifr_data, data, sizeof *data);
1347 error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
1348 memcpy(data, &ifr.ifr_data, sizeof *data);
/* Determines link status for 'name', preferring the MII registers and
 * falling back to ETHTOOL_GLINK when MII queries fail. */
1354 netdev_linux_get_miimon(const char *name, bool *miimon)
1356 struct mii_ioctl_data data;
1361 memset(&data, 0, sizeof data);
1362 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1364 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1365 data.reg_num = MII_BMSR;
1366 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
/* BMSR_LSTATUS is the link-up bit in the MII basic mode status register. */
1370 *miimon = !!(data.val_out & BMSR_LSTATUS);
1372 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1375 struct ethtool_cmd ecmd;
1377 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1380 COVERAGE_INC(netdev_get_ethtool);
1381 memset(&ecmd, 0, sizeof ecmd);
1382 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
/* ETHTOOL_GLINK replies with a struct ethtool_value overlaid on the
 * ethtool_cmd buffer; copy it out to read the link bit. */
1385 struct ethtool_value eval;
1387 memcpy(&eval, &ecmd, sizeof eval);
1388 *miimon = !!eval.data;
1390 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
/* Enables (interval > 0) or disables MII polling for 'netdev_', clamping a
 * nonzero interval to at least 100 ms. */
1398 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1399 long long int interval)
1401 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1403 ovs_mutex_lock(&netdev->mutex);
1404 interval = interval > 0 ? MAX(interval, 100) : 0;
1405 if (netdev->miimon_interval != interval) {
/* Keep the global count of miimon users up to date so the polling loop
 * can cheaply skip work when nobody uses miimon. */
1408 if (interval && !netdev->miimon_interval) {
1409 atomic_add(&miimon_cnt, 1, &junk);
1410 } else if (!interval && netdev->miimon_interval) {
1411 atomic_sub(&miimon_cnt, 1, &junk);
1414 netdev->miimon_interval = interval;
/* Expire the timer immediately so the new interval takes effect on the
 * very next poll. */
1415 timer_set_expired(&netdev->miimon_timer);
1417 ovs_mutex_unlock(&netdev->mutex);
/* Polls MII link status for every netdev-linux device whose miimon timer has
 * expired, recording carrier changes and rearming each timer. */
1423 netdev_linux_miimon_run(void)
1425 struct shash device_shash;
1426 struct shash_node *node;
1428 shash_init(&device_shash);
1429 netdev_get_devices(&netdev_linux_class, &device_shash);
1430 SHASH_FOR_EACH (node, &device_shash) {
1431 struct netdev *netdev = node->data;
1432 struct netdev_linux *dev = netdev_linux_cast(netdev);
1435 ovs_mutex_lock(&dev->mutex);
1436 if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
1437 netdev_linux_get_miimon(dev->up.name, &miimon);
1438 if (miimon != dev->miimon) {
1439 dev->miimon = miimon;
/* Notify interested parties that carrier state changed. */
1440 netdev_linux_changed(dev, dev->ifi_flags, 0);
1443 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1445 ovs_mutex_unlock(&dev->mutex);
/* netdev_get_devices() took a reference on each device; drop it. */
1446 netdev_close(netdev);
1449 shash_destroy(&device_shash);
/* Registers every active miimon timer with the poll loop so poll_block()
 * wakes up when the next MII status check is due. */
1453 netdev_linux_miimon_wait(void)
1455 struct shash device_shash;
1456 struct shash_node *node;
1458 shash_init(&device_shash);
1459 netdev_get_devices(&netdev_linux_class, &device_shash);
1460 SHASH_FOR_EACH (node, &device_shash) {
1461 struct netdev *netdev = node->data;
1462 struct netdev_linux *dev = netdev_linux_cast(netdev);
1464 ovs_mutex_lock(&dev->mutex);
1465 if (dev->miimon_interval > 0) {
1466 timer_wait(&dev->miimon_timer);
1468 ovs_mutex_unlock(&dev->mutex);
1469 netdev_close(netdev);
1471 shash_destroy(&device_shash);
/* Exchanges the values of '*a' and '*b'. */
1475 swap_uint64(uint64_t *a, uint64_t *b)
1482 /* Copies 'src' into 'dst', performing format conversion in the process.
1484 * 'src' is allowed to be misaligned. */
1486 netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1487 const struct ovs_vport_stats *src)
1489 dst->rx_packets = get_unaligned_u64(&src->rx_packets);
1490 dst->tx_packets = get_unaligned_u64(&src->tx_packets);
1491 dst->rx_bytes = get_unaligned_u64(&src->rx_bytes);
1492 dst->tx_bytes = get_unaligned_u64(&src->tx_bytes);
1493 dst->rx_errors = get_unaligned_u64(&src->rx_errors);
1494 dst->tx_errors = get_unaligned_u64(&src->tx_errors);
1495 dst->rx_dropped = get_unaligned_u64(&src->rx_dropped);
1496 dst->tx_dropped = get_unaligned_u64(&src->tx_dropped);
/* struct ovs_vport_stats carries no equivalents for the remaining
 * netdev_stats fields, so zero them explicitly. */
1498 dst->collisions = 0;
1499 dst->rx_length_errors = 0;
1500 dst->rx_over_errors = 0;
1501 dst->rx_crc_errors = 0;
1502 dst->rx_frame_errors = 0;
1503 dst->rx_fifo_errors = 0;
1504 dst->rx_missed_errors = 0;
1505 dst->tx_aborted_errors = 0;
1506 dst->tx_carrier_errors = 0;
1507 dst->tx_fifo_errors = 0;
1508 dst->tx_heartbeat_errors = 0;
1509 dst->tx_window_errors = 0;
/* Fetches vport-layer statistics for 'netdev' from the datapath via a
 * Netlink vport get request, converting them into 'stats'. */
1513 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1515 struct dpif_linux_vport reply;
1519 error = dpif_linux_vport_get(netdev_get_name(netdev), &reply, &buf);
1522 } else if (!reply.stats) {
1527 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
/* Wrapper around get_stats_via_vport__() that caches the error status in
 * 'netdev->vport_stats_error' so a persistent failure is not retried and the
 * warning is not logged repeatedly. */
1535 get_stats_via_vport(const struct netdev *netdev_,
1536 struct netdev_stats *stats)
1538 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1540 if (!netdev->vport_stats_error ||
1541 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1544 error = get_stats_via_vport__(netdev_, stats);
/* ENOENT just means the device is not attached to the datapath; that is
 * not worth a warning. */
1545 if (error && error != ENOENT) {
1546 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1548 netdev_get_name(netdev_), ovs_strerror(error));
1550 netdev->vport_stats_error = error;
1551 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
1555 /* Retrieves current device stats for 'netdev-linux'. */
1557 netdev_linux_get_stats(const struct netdev *netdev_,
1558 struct netdev_stats *stats)
1560 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1561 struct netdev_stats dev_stats;
1564 ovs_mutex_lock(&netdev->mutex);
/* Gather stats from both the vport layer and the kernel netdev, then
 * merge them below. */
1565 get_stats_via_vport(netdev_, stats);
1566 error = get_stats_via_netlink(netdev_, &dev_stats);
1568 if (!netdev->vport_stats_error) {
1571 } else if (netdev->vport_stats_error) {
1572 /* stats not available from OVS then use netdev stats. */
1575 /* Use kernel netdev's packet and byte counts since vport's counters
1576 * do not reflect packet counts on the wire when GSO, TSO or GRO are
1578 stats->rx_packets = dev_stats.rx_packets;
1579 stats->rx_bytes = dev_stats.rx_bytes;
1580 stats->tx_packets = dev_stats.tx_packets;
1581 stats->tx_bytes = dev_stats.tx_bytes;
/* The error/drop counters accumulate contributions from both the vport
 * layer and the kernel netdev. */
1583 stats->rx_errors += dev_stats.rx_errors;
1584 stats->tx_errors += dev_stats.tx_errors;
1585 stats->rx_dropped += dev_stats.rx_dropped;
1586 stats->tx_dropped += dev_stats.tx_dropped;
1587 stats->multicast += dev_stats.multicast;
1588 stats->collisions += dev_stats.collisions;
1589 stats->rx_length_errors += dev_stats.rx_length_errors;
1590 stats->rx_over_errors += dev_stats.rx_over_errors;
1591 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1592 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1593 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1594 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1595 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1596 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1597 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1598 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1599 stats->tx_window_errors += dev_stats.tx_window_errors;
1601 ovs_mutex_unlock(&netdev->mutex);
1606 /* Retrieves current device stats for 'netdev-tap' netdev or
1607 * netdev-internal. */
1609 netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
1611 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1612 struct netdev_stats dev_stats;
1615 ovs_mutex_lock(&netdev->mutex);
1616 get_stats_via_vport(netdev_, stats);
1617 error = get_stats_via_netlink(netdev_, &dev_stats);
1619 if (!netdev->vport_stats_error) {
1622 } else if (netdev->vport_stats_error) {
1623 /* Transmit and receive stats will appear to be swapped relative to the
1624 * other ports since we are the one sending the data, not a remote
1625 * computer. For consistency, we swap them back here. This does not
1626 * apply if we are getting stats from the vport layer because it always
1627 * tracks stats from the perspective of the switch. */
1630 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1631 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1632 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1633 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
/* These detailed error counters are not meaningful for tap/internal
 * devices, so clear them. */
1634 stats->rx_length_errors = 0;
1635 stats->rx_over_errors = 0;
1636 stats->rx_crc_errors = 0;
1637 stats->rx_frame_errors = 0;
1638 stats->rx_fifo_errors = 0;
1639 stats->rx_missed_errors = 0;
1640 stats->tx_aborted_errors = 0;
1641 stats->tx_carrier_errors = 0;
1642 stats->tx_fifo_errors = 0;
1643 stats->tx_heartbeat_errors = 0;
1644 stats->tx_window_errors = 0;
1646 /* Use kernel netdev's packet and byte counts since vport counters
1647 * do not reflect packet counts on the wire when GSO, TSO or GRO
1649 stats->rx_packets = dev_stats.tx_packets;
1650 stats->rx_bytes = dev_stats.tx_bytes;
1651 stats->tx_packets = dev_stats.rx_packets;
1652 stats->tx_bytes = dev_stats.rx_bytes;
/* Note that rx and tx are intentionally crossed here: kernel-side tx on
 * a tap device is rx from the switch's perspective, and vice versa. */
1654 stats->rx_dropped += dev_stats.tx_dropped;
1655 stats->tx_dropped += dev_stats.rx_dropped;
1657 stats->rx_errors += dev_stats.tx_errors;
1658 stats->tx_errors += dev_stats.rx_errors;
1660 stats->multicast += dev_stats.multicast;
1661 stats->collisions += dev_stats.collisions;
1663 ovs_mutex_unlock(&netdev->mutex);
/* Retrieves stats for an internal device purely from the vport layer;
 * returns the cached vport stats error as the result. */
1669 netdev_internal_get_stats(const struct netdev *netdev_,
1670 struct netdev_stats *stats)
1672 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1675 ovs_mutex_lock(&netdev->mutex);
1676 get_stats_via_vport(netdev_, stats);
1677 error = netdev->vport_stats_error;
1678 ovs_mutex_unlock(&netdev->mutex);
/* Pushes 'stats' down into the datapath's vport counters via an
 * OVS_VPORT_CMD_SET Netlink transaction. */
1684 netdev_internal_set_stats(struct netdev *netdev,
1685 const struct netdev_stats *stats)
1687 struct ovs_vport_stats vport_stats;
1688 struct dpif_linux_vport vport;
/* Convert from netdev_stats to the subset that ovs_vport_stats carries. */
1691 vport_stats.rx_packets = stats->rx_packets;
1692 vport_stats.tx_packets = stats->tx_packets;
1693 vport_stats.rx_bytes = stats->rx_bytes;
1694 vport_stats.tx_bytes = stats->tx_bytes;
1695 vport_stats.rx_errors = stats->rx_errors;
1696 vport_stats.tx_errors = stats->tx_errors;
1697 vport_stats.rx_dropped = stats->rx_dropped;
1698 vport_stats.tx_dropped = stats->tx_dropped;
1700 dpif_linux_vport_init(&vport);
1701 vport.cmd = OVS_VPORT_CMD_SET;
1702 vport.name = netdev_get_name(netdev);
1703 vport.stats = &vport_stats;
1705 err = dpif_linux_vport_transact(&vport, NULL, NULL);
1707 /* If the vport layer doesn't know about the device, that doesn't mean it
1708 * doesn't exist (after all were able to open it when netdev_open() was
1709 * called), it just means that it isn't attached and we'll be getting
1710 * stats a different way. */
1711 if (err == ENODEV) {
/* Queries ETHTOOL_GSET once and translates the result into the cached
 * NETDEV_F_* bitmaps ('supported', 'advertised', 'current'), along with
 * 'get_features_error'. Results are cached under VALID_FEATURES. Caller
 * must hold 'netdev->mutex'. */
1719 netdev_linux_read_features(struct netdev_linux *netdev)
1721 struct ethtool_cmd ecmd;
1725 if (netdev->cache_valid & VALID_FEATURES) {
1729 COVERAGE_INC(netdev_get_ethtool);
1730 memset(&ecmd, 0, sizeof ecmd);
1731 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
1732 ETHTOOL_GSET, "ETHTOOL_GSET");
1737 /* Supported features. */
1738 netdev->supported = 0;
1739 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1740 netdev->supported |= NETDEV_F_10MB_HD;
1742 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1743 netdev->supported |= NETDEV_F_10MB_FD;
1745 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1746 netdev->supported |= NETDEV_F_100MB_HD;
1748 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1749 netdev->supported |= NETDEV_F_100MB_FD;
1751 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1752 netdev->supported |= NETDEV_F_1GB_HD;
1754 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1755 netdev->supported |= NETDEV_F_1GB_FD;
1757 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1758 netdev->supported |= NETDEV_F_10GB_FD;
1760 if (ecmd.supported & SUPPORTED_TP) {
1761 netdev->supported |= NETDEV_F_COPPER;
1763 if (ecmd.supported & SUPPORTED_FIBRE) {
1764 netdev->supported |= NETDEV_F_FIBER;
1766 if (ecmd.supported & SUPPORTED_Autoneg) {
1767 netdev->supported |= NETDEV_F_AUTONEG;
1769 if (ecmd.supported & SUPPORTED_Pause) {
1770 netdev->supported |= NETDEV_F_PAUSE;
1772 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1773 netdev->supported |= NETDEV_F_PAUSE_ASYM;
1776 /* Advertised features. */
1777 netdev->advertised = 0;
1778 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1779 netdev->advertised |= NETDEV_F_10MB_HD;
1781 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1782 netdev->advertised |= NETDEV_F_10MB_FD;
1784 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1785 netdev->advertised |= NETDEV_F_100MB_HD;
1787 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1788 netdev->advertised |= NETDEV_F_100MB_FD;
1790 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1791 netdev->advertised |= NETDEV_F_1GB_HD;
1793 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1794 netdev->advertised |= NETDEV_F_1GB_FD;
1796 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1797 netdev->advertised |= NETDEV_F_10GB_FD;
1799 if (ecmd.advertising & ADVERTISED_TP) {
1800 netdev->advertised |= NETDEV_F_COPPER;
1802 if (ecmd.advertising & ADVERTISED_FIBRE) {
1803 netdev->advertised |= NETDEV_F_FIBER;
1805 if (ecmd.advertising & ADVERTISED_Autoneg) {
1806 netdev->advertised |= NETDEV_F_AUTONEG;
1808 if (ecmd.advertising & ADVERTISED_Pause) {
1809 netdev->advertised |= NETDEV_F_PAUSE;
1811 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1812 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
1815 /* Current settings. */
1817 if (speed == SPEED_10) {
1818 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1819 } else if (speed == SPEED_100) {
1820 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1821 } else if (speed == SPEED_1000) {
1822 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1823 } else if (speed == SPEED_10000) {
1824 netdev->current = NETDEV_F_10GB_FD;
/* The raw numbers below cover speeds whose SPEED_* constants may not be
 * available in the oldest supported kernel headers. */
1825 } else if (speed == 40000) {
1826 netdev->current = NETDEV_F_40GB_FD;
1827 } else if (speed == 100000) {
1828 netdev->current = NETDEV_F_100GB_FD;
1829 } else if (speed == 1000000) {
1830 netdev->current = NETDEV_F_1TB_FD;
1832 netdev->current = 0;
1835 if (ecmd.port == PORT_TP) {
1836 netdev->current |= NETDEV_F_COPPER;
1837 } else if (ecmd.port == PORT_FIBRE) {
1838 netdev->current |= NETDEV_F_FIBER;
1842 netdev->current |= NETDEV_F_AUTONEG;
1846 netdev->cache_valid |= VALID_FEATURES;
1847 netdev->get_features_error = error;
1850 /* Stores the features supported by 'netdev' into of '*current', '*advertised',
1851 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
1852 * Returns 0 if successful, otherwise a positive errno value. */
1854 netdev_linux_get_features(const struct netdev *netdev_,
1855 enum netdev_features *current,
1856 enum netdev_features *advertised,
1857 enum netdev_features *supported,
1858 enum netdev_features *peer)
1860 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1863 ovs_mutex_lock(&netdev->mutex);
/* Populates the VALID_FEATURES cache (no-op if already valid). */
1864 netdev_linux_read_features(netdev);
1865 if (!netdev->get_features_error) {
1866 *current = netdev->current;
1867 *advertised = netdev->advertised;
1868 *supported = netdev->supported;
1869 *peer = 0; /* XXX */
1871 error = netdev->get_features_error;
1872 ovs_mutex_unlock(&netdev->mutex);
1877 /* Set the features advertised by 'netdev' to 'advertise'. */
1879 netdev_linux_set_advertisements(struct netdev *netdev_,
1880 enum netdev_features advertise)
1882 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1883 struct ethtool_cmd ecmd;
1886 ovs_mutex_lock(&netdev->mutex);
/* Read the current ethtool settings first so the ETHTOOL_SSET below only
 * changes the advertising mask. */
1888 COVERAGE_INC(netdev_get_ethtool);
1889 memset(&ecmd, 0, sizeof ecmd);
1890 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1891 ETHTOOL_GSET, "ETHTOOL_GSET");
/* Translate NETDEV_F_* bits into the kernel's ADVERTISED_* bits. */
1896 ecmd.advertising = 0;
1897 if (advertise & NETDEV_F_10MB_HD) {
1898 ecmd.advertising |= ADVERTISED_10baseT_Half;
1900 if (advertise & NETDEV_F_10MB_FD) {
1901 ecmd.advertising |= ADVERTISED_10baseT_Full;
1903 if (advertise & NETDEV_F_100MB_HD) {
1904 ecmd.advertising |= ADVERTISED_100baseT_Half;
1906 if (advertise & NETDEV_F_100MB_FD) {
1907 ecmd.advertising |= ADVERTISED_100baseT_Full;
1909 if (advertise & NETDEV_F_1GB_HD) {
1910 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1912 if (advertise & NETDEV_F_1GB_FD) {
1913 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1915 if (advertise & NETDEV_F_10GB_FD) {
1916 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1918 if (advertise & NETDEV_F_COPPER) {
1919 ecmd.advertising |= ADVERTISED_TP;
1921 if (advertise & NETDEV_F_FIBER) {
1922 ecmd.advertising |= ADVERTISED_FIBRE;
1924 if (advertise & NETDEV_F_AUTONEG) {
1925 ecmd.advertising |= ADVERTISED_Autoneg;
1927 if (advertise & NETDEV_F_PAUSE) {
1928 ecmd.advertising |= ADVERTISED_Pause;
1930 if (advertise & NETDEV_F_PAUSE_ASYM) {
1931 ecmd.advertising |= ADVERTISED_Asym_Pause;
1933 COVERAGE_INC(netdev_set_ethtool);
1934 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1935 ETHTOOL_SSET, "ETHTOOL_SSET");
1938 ovs_mutex_unlock(&netdev->mutex);
1942 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1943 * successful, otherwise a positive errno value. */
1945 netdev_linux_set_policing(struct netdev *netdev_,
1946 uint32_t kbits_rate, uint32_t kbits_burst)
1948 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1949 const char *netdev_name = netdev_get_name(netdev_);
1952 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1953 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1954 : kbits_burst); /* Stick with user-specified value. */
1956 ovs_mutex_lock(&netdev->mutex);
/* Skip reconfiguration when the cached settings (or cached error) already
 * match the request. */
1957 if (netdev->cache_valid & VALID_POLICING) {
1958 error = netdev->netdev_policing_error;
1959 if (error || (netdev->kbits_rate == kbits_rate &&
1960 netdev->kbits_burst == kbits_burst)) {
1961 /* Assume that settings haven't changed since we last set them. */
1964 netdev->cache_valid &= ~VALID_POLICING;
1967 COVERAGE_INC(netdev_set_policing);
1968 /* Remove any existing ingress qdisc. */
1969 error = tc_add_del_ingress_qdisc(netdev_, false);
1971 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1972 netdev_name, ovs_strerror(error));
/* A nonzero rate means policing is being enabled: add a fresh ingress
 * qdisc and attach a policer action to it. */
1977 error = tc_add_del_ingress_qdisc(netdev_, true);
1979 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1980 netdev_name, ovs_strerror(error));
1984 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
1986 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1987 netdev_name, ovs_strerror(error));
1992 netdev->kbits_rate = kbits_rate;
1993 netdev->kbits_burst = kbits_burst;
/* Cache the outcome (including ENODEV) so repeated calls with the same
 * arguments can short-circuit above. */
1996 if (!error || error == ENODEV) {
1997 netdev->netdev_policing_error = error;
1998 netdev->cache_valid |= VALID_POLICING;
2000 ovs_mutex_unlock(&netdev->mutex);
/* Adds to 'types' the OVS name of every tc implementation that can be
 * installed (i.e. has a tc_install hook and a non-empty OVS name). */
2005 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
2008 const struct tc_ops *const *opsp;
2010 for (opsp = tcs; *opsp != NULL; opsp++) {
2011 const struct tc_ops *ops = *opsp;
2012 if (ops->tc_install && ops->ovs_name[0] != '\0') {
2013 sset_add(types, ops->ovs_name);
/* Returns the tc_ops whose OVS-facing name is 'name', if any. */
2019 static const struct tc_ops *
2020 tc_lookup_ovs_name(const char *name)
2022 const struct tc_ops *const *opsp;
2024 for (opsp = tcs; *opsp != NULL; opsp++) {
2025 const struct tc_ops *ops = *opsp;
2026 if (!strcmp(name, ops->ovs_name)) {
/* Returns the tc_ops whose Linux qdisc name is 'name', if any. Some
 * entries have no linux_name, hence the NULL check. */
2033 static const struct tc_ops *
2034 tc_lookup_linux_name(const char *name)
2036 const struct tc_ops *const *opsp;
2038 for (opsp = tcs; *opsp != NULL; opsp++) {
2039 const struct tc_ops *ops = *opsp;
2040 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Looks up queue 'queue_id' in 'netdev_''s queue hmap using the caller-
 * supplied hash. */
2047 static struct tc_queue *
2048 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
2051 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2052 struct tc_queue *queue;
2054 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
2055 if (queue->queue_id == queue_id) {
/* Convenience wrapper that hashes 'queue_id' itself. */
2062 static struct tc_queue *
2063 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
2065 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Reports capabilities (queue count) for QoS type 'type'. */
2069 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
2071 struct netdev_qos_capabilities *caps)
2073 const struct tc_ops *ops = tc_lookup_ovs_name(type);
2077 caps->n_queues = ops->n_queues;
/* Reports the currently installed QoS type and its configuration details,
 * querying the kernel qdisc first via tc_query_qdisc(). */
2082 netdev_linux_get_qos(const struct netdev *netdev_,
2083 const char **typep, struct smap *details)
2085 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2088 ovs_mutex_lock(&netdev->mutex);
2089 error = tc_query_qdisc(netdev_);
2091 *typep = netdev->tc->ops->ovs_name;
/* qdisc_get is optional; a missing hook means there are no details. */
2092 error = (netdev->tc->ops->qdisc_get
2093 ? netdev->tc->ops->qdisc_get(netdev_, details)
2096 ovs_mutex_unlock(&netdev->mutex);
/* Installs QoS discipline 'type' with 'details' on 'netdev_', replacing any
 * existing qdisc of a different type. */
2102 netdev_linux_set_qos(struct netdev *netdev_,
2103 const char *type, const struct smap *details)
2105 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2106 const struct tc_ops *new_ops;
2109 new_ops = tc_lookup_ovs_name(type);
2110 if (!new_ops || !new_ops->tc_install) {
2114 ovs_mutex_lock(&netdev->mutex);
2115 error = tc_query_qdisc(netdev_);
/* If the requested type is already installed, just reconfigure it. */
2120 if (new_ops == netdev->tc->ops) {
2121 error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
2123 /* Delete existing qdisc. */
2124 error = tc_del_qdisc(netdev_);
2128 ovs_assert(netdev->tc == NULL);
2130 /* Install new qdisc. */
2131 error = new_ops->tc_install(netdev_, details);
2132 ovs_assert((error == 0) == (netdev->tc != NULL));
2136 ovs_mutex_unlock(&netdev->mutex);
/* Fetches configuration details for queue 'queue_id' through the installed
 * tc implementation's class_get hook. */
2141 netdev_linux_get_queue(const struct netdev *netdev_,
2142 unsigned int queue_id, struct smap *details)
2144 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2147 ovs_mutex_lock(&netdev->mutex);
2148 error = tc_query_qdisc(netdev_);
2150 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2152 ? netdev->tc->ops->class_get(netdev_, queue, details)
2155 ovs_mutex_unlock(&netdev->mutex);
/* Creates or modifies queue 'queue_id' with 'details', if the installed tc
 * implementation supports it and the id is within range. */
2161 netdev_linux_set_queue(struct netdev *netdev_,
2162 unsigned int queue_id, const struct smap *details)
2164 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2167 ovs_mutex_lock(&netdev->mutex);
2168 error = tc_query_qdisc(netdev_);
2170 error = (queue_id < netdev->tc->ops->n_queues
2171 && netdev->tc->ops->class_set
2172 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
2175 ovs_mutex_unlock(&netdev->mutex);
/* Deletes queue 'queue_id' through the class_delete hook, if present. */
2181 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
2183 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2186 ovs_mutex_lock(&netdev->mutex);
2187 error = tc_query_qdisc(netdev_);
2189 if (netdev->tc->ops->class_delete) {
2190 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2192 ? netdev->tc->ops->class_delete(netdev_, queue)
2198 ovs_mutex_unlock(&netdev->mutex);
/* Retrieves statistics for queue 'queue_id' via class_get_stats. */
2204 netdev_linux_get_queue_stats(const struct netdev *netdev_,
2205 unsigned int queue_id,
2206 struct netdev_queue_stats *stats)
2208 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2211 ovs_mutex_lock(&netdev->mutex);
2212 error = tc_query_qdisc(netdev_);
2214 if (netdev->tc->ops->class_get_stats) {
2215 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2217 stats->created = queue->created;
2218 error = netdev->tc->ops->class_get_stats(netdev_, queue,
2227 ovs_mutex_unlock(&netdev->mutex);
/* State for an in-progress RTM_GETTCLASS Netlink dump of tc classes. */
2232 struct queue_dump_state {
2233 struct nl_dump dump;
/* Begins a Netlink dump of 'netdev''s tc classes into 'state'. */
2238 start_queue_dump(const struct netdev *netdev, struct queue_dump_state *state)
2240 struct ofpbuf request;
2241 struct tcmsg *tcmsg;
2243 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
2247 tcmsg->tcm_parent = 0;
2248 nl_dump_start(&state->dump, NETLINK_ROUTE, &request);
2249 ofpbuf_uninit(&request);
2251 ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
/* Releases the dump buffer and finishes the Netlink dump, returning its
 * completion status. */
2256 finish_queue_dump(struct queue_dump_state *state)
2258 ofpbuf_uninit(&state->buf);
2259 return nl_dump_done(&state->dump);
/* Iteration state for netdev_linux_queue_dump_{start,next,done}: a snapshot
 * of the queue ids taken under the device mutex. */
2262 struct netdev_linux_queue_state {
2263 unsigned int *queues;
/* Snapshots the ids of all queues into a freshly allocated state object so
 * iteration does not need to hold the hmap stable. */
2269 netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
2271 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2274 ovs_mutex_lock(&netdev->mutex);
2275 error = tc_query_qdisc(netdev_);
2277 if (netdev->tc->ops->class_get) {
2278 struct netdev_linux_queue_state *state;
2279 struct tc_queue *queue;
2282 *statep = state = xmalloc(sizeof *state);
2283 state->n_queues = hmap_count(&netdev->tc->queues);
2284 state->cur_queue = 0;
2285 state->queues = xmalloc(state->n_queues * sizeof *state->queues);
2288 HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
2289 state->queues[i++] = queue->queue_id;
2295 ovs_mutex_unlock(&netdev->mutex);
/* Advances the dump, skipping queue ids that were deleted since the
 * snapshot was taken (tc_find_queue() returns NULL for those). */
2301 netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
2302 unsigned int *queue_idp, struct smap *details)
2304 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2305 struct netdev_linux_queue_state *state = state_;
2308 ovs_mutex_lock(&netdev->mutex);
2309 while (state->cur_queue < state->n_queues) {
2310 unsigned int queue_id = state->queues[state->cur_queue++];
2311 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2314 *queue_idp = queue_id;
2315 error = netdev->tc->ops->class_get(netdev_, queue, details);
2319 ovs_mutex_unlock(&netdev->mutex);
/* Frees the iteration state allocated by queue_dump_start(). */
2325 netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
2328 struct netdev_linux_queue_state *state = state_;
2330 free(state->queues);
/* Invokes 'cb' with statistics for each of 'netdev_''s queues, obtained by
 * walking the kernel's tc class dump and handing each message to the tc
 * implementation's class_dump_stats hook. */
2336 netdev_linux_dump_queue_stats(const struct netdev *netdev_,
2337 netdev_dump_queue_stats_cb *cb, void *aux)
2339 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2342 ovs_mutex_lock(&netdev->mutex);
2343 error = tc_query_qdisc(netdev_);
2345 struct queue_dump_state state;
2347 if (!netdev->tc->ops->class_dump_stats) {
2349 } else if (!start_queue_dump(netdev_, &state)) {
2355 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
2356 retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
/* finish_queue_dump() reports any Netlink-level dump failure. */
2363 retval = finish_queue_dump(&state);
2369 ovs_mutex_unlock(&netdev->mutex);
/* Retrieves the device's IPv4 address and netmask via SIOCGIFADDR /
 * SIOCGIFNETMASK, caching them under VALID_IN4. Returns EADDRNOTAVAIL
 * when no address is assigned. */
2375 netdev_linux_get_in4(const struct netdev *netdev_,
2376 struct in_addr *address, struct in_addr *netmask)
2378 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2381 ovs_mutex_lock(&netdev->mutex);
2382 if (!(netdev->cache_valid & VALID_IN4)) {
2383 error = netdev_linux_get_ipv4(netdev_, &netdev->address,
2384 SIOCGIFADDR, "SIOCGIFADDR");
2386 error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
2387 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2389 netdev->cache_valid |= VALID_IN4;
2397 if (netdev->address.s_addr != INADDR_ANY) {
2398 *address = netdev->address;
2399 *netmask = netdev->netmask;
2401 error = EADDRNOTAVAIL;
2404 ovs_mutex_unlock(&netdev->mutex);
/* Assigns IPv4 'address'/'netmask' to the device via SIOCSIFADDR and
 * SIOCSIFNETMASK, updating the VALID_IN4 cache on success. */
2410 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2411 struct in_addr netmask)
2413 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2416 ovs_mutex_lock(&netdev->mutex);
2417 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2419 netdev->cache_valid |= VALID_IN4;
2420 netdev->address = address;
2421 netdev->netmask = netmask;
/* Only set the netmask when a real address was assigned. */
2422 if (address.s_addr != INADDR_ANY) {
2423 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2424 "SIOCSIFNETMASK", netmask);
2427 ovs_mutex_unlock(&netdev->mutex);
/* Parses one line of /proc/net/if_inet6 into the 16-byte IPv6 address 'in6'
 * and the interface name 'ifname'. Returns the ovs_scan() result (true on a
 * full match). */
2433 parse_if_inet6_line(const char *line,
2434 struct in6_addr *in6, char ifname[16 + 1])
2436 uint8_t *s6 = in6->s6_addr;
2437 #define X8 "%2"SCNx8
2438 return ovs_scan(line,
2439 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
2440 "%*x %*x %*x %*x %16s\n",
2441 &s6[0], &s6[1], &s6[2], &s6[3],
2442 &s6[4], &s6[5], &s6[6], &s6[7],
2443 &s6[8], &s6[9], &s6[10], &s6[11],
2444 &s6[12], &s6[13], &s6[14], &s6[15],
2448 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2449 * 'in6' is non-null) and returns true. Otherwise, returns false. */
2451 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2453 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2455 ovs_mutex_lock(&netdev->mutex);
/* Populate the cache on first use by scanning /proc/net/if_inet6 for a
 * line whose interface name matches this device. */
2456 if (!(netdev->cache_valid & VALID_IN6)) {
2460 netdev->in6 = in6addr_any;
2462 file = fopen("/proc/net/if_inet6", "r");
2464 const char *name = netdev_get_name(netdev_);
2465 while (fgets(line, sizeof line, file)) {
2466 struct in6_addr in6_tmp;
2467 char ifname[16 + 1];
2468 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2469 && !strcmp(name, ifname))
2471 netdev->in6 = in6_tmp;
2477 netdev->cache_valid |= VALID_IN6;
2480 ovs_mutex_unlock(&netdev->mutex);
/* Fills '*sa' with an AF_INET sockaddr holding 'addr'. */
2486 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
2488 struct sockaddr_in sin;
2489 memset(&sin, 0, sizeof sin);
2490 sin.sin_family = AF_INET;
2491 sin.sin_addr = addr;
/* Zero the whole sockaddr first since sizeof sin may be smaller. */
2494 memset(sa, 0, sizeof *sa);
2495 memcpy(sa, &sin, sizeof sin);
/* Issues address-setting ioctl 'ioctl_nr' ("SIOCSIFADDR" etc.) on 'netdev'
 * with IPv4 address 'addr'. */
2499 do_set_addr(struct netdev *netdev,
2500 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2504 make_in4_sockaddr(&ifr.ifr_addr, addr);
2505 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2509 /* Adds 'router' as a default IP gateway. */
2511 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2513 struct in_addr any = { INADDR_ANY };
2517 memset(&rt, 0, sizeof rt);
/* Destination and genmask of 0.0.0.0 make this the default route. */
2518 make_in4_sockaddr(&rt.rt_dst, any);
2519 make_in4_sockaddr(&rt.rt_gateway, router);
2520 make_in4_sockaddr(&rt.rt_genmask, any);
2521 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2522 error = af_inet_ioctl(SIOCADDRT, &rt);
2524 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
/* Looks up the next hop toward '*host' by scanning the kernel routing table
 * in /proc/net/route.  On a match, stores the gateway (or 0 for a directly
 * reachable host) in '*next_hop' and the owning device name, xstrdup()'d, in
 * '*netdev_name'.  NOTE(review): interior lines elided; code verbatim. */
2530 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2533 static const char fn[] = "/proc/net/route";
2538 *netdev_name = NULL;
2539 stream = fopen(fn, "r");
2540 if (stream == NULL) {
2541 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
2546 while (fgets(line, sizeof line, stream)) {
2549 ovs_be32 dest, gateway, mask;
2550 int refcnt, metric, mtu;
2551 unsigned int flags, use, window, irtt;
/* Field order matches the /proc/net/route header: Iface Destination
 * Gateway Flags RefCnt Use Metric Mask MTU Window IRTT. */
2554 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2556 iface, &dest, &gateway, &flags, &refcnt,
2557 &use, &metric, &mask, &mtu, &window, &irtt)) {
2558 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2562 if (!(flags & RTF_UP)) {
2563 /* Skip routes that aren't up. */
2567 /* The output of 'dest', 'mask', and 'gateway' were given in
2568 * network byte order, so we don't need any endian
2569 * conversions here. */
2570 if ((dest & mask) == (host->s_addr & mask)) {
2572 /* The host is directly reachable. */
2573 next_hop->s_addr = 0;
2575 /* To reach the host, we must go through a gateway. */
2576 next_hop->s_addr = gateway;
2578 *netdev_name = xstrdup(iface);
/* Fills 'smap' with driver name/version/firmware strings obtained from an
 * ETHTOOL_GDRVINFO query, caching the result under VALID_DRVINFO.
 * NOTE(review): interior lines elided; code verbatim. */
2590 netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
2592 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2595 ovs_mutex_lock(&netdev->mutex);
2596 if (!(netdev->cache_valid & VALID_DRVINFO)) {
/* ethtool takes a struct ethtool_cmd; drvinfo is overlaid on it.
 * NOTE(review): this cast relies on drvinfo being at least as large as
 * ethtool_cmd's header -- confirm against the struct definitions. */
2597 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
2599 COVERAGE_INC(netdev_get_ethtool);
2600 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2601 error = netdev_linux_do_ethtool(netdev->up.name,
2604 "ETHTOOL_GDRVINFO");
2606 netdev->cache_valid |= VALID_DRVINFO;
2611 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2612 smap_add(smap, "driver_version", netdev->drvinfo.version);
2613 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
2615 ovs_mutex_unlock(&netdev->mutex);
/* Status for internal devices: no real driver, so report a fixed name. */
2621 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2624 smap_add(smap, "driver_name", "openvswitch");
2628 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2629 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2630 * returns 0. Otherwise, it returns a positive errno value; in particular,
2631 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2633 netdev_linux_arp_lookup(const struct netdev *netdev,
2634 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2637 struct sockaddr_in sin;
/* 'r' is the struct arpreq for the SIOCGARP ioctl (its declaration line is
 * elided from this extract). */
2640 memset(&r, 0, sizeof r);
2641 memset(&sin, 0, sizeof sin);
2642 sin.sin_family = AF_INET;
2643 sin.sin_addr.s_addr = ip;
2645 memcpy(&r.arp_pa, &sin, sizeof sin);
2646 r.arp_ha.sa_family = ARPHRD_ETHER;
2648 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2649 COVERAGE_INC(netdev_arp_lookup);
2650 retval = af_inet_ioctl(SIOCGARP, &r);
2652 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
/* ENXIO ("no such entry") is an expected outcome, so don't log it. */
2653 } else if (retval != ENXIO) {
2654 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2655 netdev_get_name(netdev), IP_ARGS(ip),
2656 ovs_strerror(retval));
/* Converts netdev flag bits (NETDEV_*) into kernel IFF_* interface flags.
 * NOTE(review): interior lines (IFF_UP/IFF_PROMISC assignments, 'iff'
 * declaration, returns, braces) are elided; code verbatim. */
2662 nd_to_iff_flags(enum netdev_flags nd)
2665 if (nd & NETDEV_UP) {
2668 if (nd & NETDEV_PROMISC) {
2671 if (nd & NETDEV_LOOPBACK) {
2672 iff |= IFF_LOOPBACK;
/* Inverse of nd_to_iff_flags(): kernel IFF_* flags to NETDEV_* bits. */
2678 iff_to_nd_flags(int iff)
2680 enum netdev_flags nd = 0;
2684 if (iff & IFF_PROMISC) {
2685 nd |= NETDEV_PROMISC;
2687 if (iff & IFF_LOOPBACK) {
2688 nd |= NETDEV_LOOPBACK;
/* Core flag-update helper: reports the pre-change flags in '*old_flagsp',
 * clears 'off' and sets 'on', and pushes the new flags to the kernel only
 * when they actually differ.  Caller must hold netdev->mutex
 * (OVS_REQUIRES below). */
2694 update_flags(struct netdev_linux *netdev, enum netdev_flags off,
2695 enum netdev_flags on, enum netdev_flags *old_flagsp)
2696 OVS_REQUIRES(netdev->mutex)
2698 int old_flags, new_flags;
2701 old_flags = netdev->ifi_flags;
2702 *old_flagsp = iff_to_nd_flags(old_flags);
2703 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2704 if (new_flags != old_flags) {
2705 error = set_flags(netdev_get_name(&netdev->up), new_flags);
/* Re-read flags so the cached ifi_flags reflects what the kernel kept. */
2706 get_flags(&netdev->up, &netdev->ifi_flags);
/* Locked public wrapper around update_flags(). */
2713 netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2714 enum netdev_flags on, enum netdev_flags *old_flagsp)
2716 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2719 ovs_mutex_lock(&netdev->mutex);
2720 error = update_flags(netdev, off, on, old_flagsp);
2721 ovs_mutex_unlock(&netdev->mutex);
/* Builds a struct netdev_class initializer.  The three system-device
 * variants (plain Linux device, tap, internal) share almost every function
 * pointer; only the constructor, stats, features, and status hooks vary, so
 * those are the macro parameters.  NOTE(review): several initializer lines
 * (run/wait, construct, get_in4 neighbors, etc.) are elided in this
 * extract; the rows below are verbatim. */
2726 #define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS, SET_STATS, \
2727 GET_FEATURES, GET_STATUS) \
2733 netdev_linux_wait, \
2735 netdev_linux_alloc, \
2737 netdev_linux_destruct, \
2738 netdev_linux_dealloc, \
2739 NULL, /* get_config */ \
2740 NULL, /* set_config */ \
2741 NULL, /* get_tunnel_config */ \
2743 netdev_linux_send, \
2744 netdev_linux_send_wait, \
2746 netdev_linux_set_etheraddr, \
2747 netdev_linux_get_etheraddr, \
2748 netdev_linux_get_mtu, \
2749 netdev_linux_set_mtu, \
2750 netdev_linux_get_ifindex, \
2751 netdev_linux_get_carrier, \
2752 netdev_linux_get_carrier_resets, \
2753 netdev_linux_set_miimon_interval, \
2758 netdev_linux_set_advertisements, \
2760 netdev_linux_set_policing, \
2761 netdev_linux_get_qos_types, \
2762 netdev_linux_get_qos_capabilities, \
2763 netdev_linux_get_qos, \
2764 netdev_linux_set_qos, \
2765 netdev_linux_get_queue, \
2766 netdev_linux_set_queue, \
2767 netdev_linux_delete_queue, \
2768 netdev_linux_get_queue_stats, \
2769 netdev_linux_queue_dump_start, \
2770 netdev_linux_queue_dump_next, \
2771 netdev_linux_queue_dump_done, \
2772 netdev_linux_dump_queue_stats, \
2774 netdev_linux_get_in4, \
2775 netdev_linux_set_in4, \
2776 netdev_linux_get_in6, \
2777 netdev_linux_add_router, \
2778 netdev_linux_get_next_hop, \
2780 netdev_linux_arp_lookup, \
2782 netdev_linux_update_flags, \
2784 netdev_linux_rxq_alloc, \
2785 netdev_linux_rxq_construct, \
2786 netdev_linux_rxq_destruct, \
2787 netdev_linux_rxq_dealloc, \
2788 netdev_linux_rxq_recv, \
2789 netdev_linux_rxq_wait, \
2790 netdev_linux_rxq_drain, \
/* "system" devices: ordinary kernel network interfaces. */
2793 const struct netdev_class netdev_linux_class =
2796 netdev_linux_construct,
2797 netdev_linux_get_stats,
2798 NULL, /* set_stats */
2799 netdev_linux_get_features,
2800 netdev_linux_get_status);
/* "tap" devices: use a tap-specific constructor and stats hook. */
2802 const struct netdev_class netdev_tap_class =
2805 netdev_linux_construct_tap,
2806 netdev_tap_get_stats,
2807 NULL, /* set_stats */
2808 netdev_linux_get_features,
2809 netdev_linux_get_status);
/* "internal" devices: OVS-internal ports; stats are settable and there is
 * no meaningful link-feature query. */
2811 const struct netdev_class netdev_internal_class =
2814 netdev_linux_construct,
2815 netdev_internal_get_stats,
2816 netdev_internal_set_stats,
2817 NULL, /* get_features */
2818 netdev_internal_get_status);
2820 /* HTB traffic control class. */
/* Handle space for HTB queues; queue_id N maps to tc minor N+1. */
2822 #define HTB_N_QUEUES 0xf000
/* struct htb (qdisc-level state); declaration line elided in this extract. */
2826 unsigned int max_rate; /* In bytes/s. */
/* struct htb_class (per-queue state), embedding the generic tc_queue. */
2830 struct tc_queue tc_queue;
2831 unsigned int min_rate; /* In bytes/s. */
2832 unsigned int max_rate; /* In bytes/s. */
2833 unsigned int burst; /* In bytes. */
2834 unsigned int priority; /* Lower values are higher priorities. */
/* Downcasts 'netdev_'s generic tc pointer to the containing struct htb.
 * Only valid when the installed qdisc is HTB. */
2838 htb_get__(const struct netdev *netdev_)
2840 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2841 return CONTAINER_OF(netdev->tc, struct htb, tc);
/* Records, in 'netdev', that an HTB qdisc with 'max_rate' is installed.
 * Allocates the struct htb; ownership passes to the netdev's tc pointer. */
2845 htb_install__(struct netdev *netdev_, uint64_t max_rate)
2847 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2850 htb = xmalloc(sizeof *htb);
2851 tc_init(&htb->tc, &tc_ops_htb);
2852 htb->max_rate = max_rate;
2854 netdev->tc = &htb->tc;
2857 /* Create an HTB qdisc.
2859 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2861 htb_setup_qdisc__(struct netdev *netdev)
2864 struct tc_htb_glob opt;
2865 struct ofpbuf request;
2866 struct tcmsg *tcmsg;
/* Remove any existing root qdisc first so the add cannot conflict. */
2868 tc_del_qdisc(netdev);
2870 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2871 NLM_F_EXCL | NLM_F_CREATE, &request);
2875 tcmsg->tcm_handle = tc_make_handle(1, 0);
2876 tcmsg->tcm_parent = TC_H_ROOT;
2878 nl_msg_put_string(&request, TCA_KIND, "htb");
2880 memset(&opt, 0, sizeof opt);
/* rate2quantum = 10 is HTB's conventional DRR quantum divisor. */
2881 opt.rate2quantum = 10;
2885 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2886 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2887 nl_msg_end_nested(&request, opt_offset);
2889 return tc_transact(&request, NULL);
2892 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2893 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2895 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2896 unsigned int parent, struct htb_class *class)
2899 struct tc_htb_opt opt;
2900 struct ofpbuf request;
2901 struct tcmsg *tcmsg;
/* HTB rate tables are MTU-dependent, so an MTU is mandatory. */
2905 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
2907 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2908 netdev_get_name(netdev));
2912 memset(&opt, 0, sizeof opt);
2913 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2914 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2915 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2916 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2917 opt.prio = class->priority;
2919 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2923 tcmsg->tcm_handle = handle;
2924 tcmsg->tcm_parent = parent;
2926 nl_msg_put_string(&request, TCA_KIND, "htb");
2927 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2928 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
/* RTAB/CTAB are the precomputed rate tables for rate and ceil. */
2929 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2930 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2931 nl_msg_end_nested(&request, opt_offset);
2933 error = tc_transact(&request, NULL);
2935 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2936 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2937 netdev_get_name(netdev),
2938 tc_get_major(handle), tc_get_minor(handle),
2939 tc_get_major(parent), tc_get_minor(parent),
2940 class->min_rate, class->max_rate,
2941 class->burst, class->priority, ovs_strerror(error));
2946 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2947 * description of them into 'details'. The description complies with the
2948 * specification given in the vswitch database documentation for linux-htb
/* (comment tail elided).  Returns 0 on success, a positive errno-style
 * value on parse failure -- judging from the visible error path; the
 * return statements themselves are elided in this extract. */
2951 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2953 static const struct nl_policy tca_htb_policy[] = {
2954 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2955 .min_len = sizeof(struct tc_htb_opt) },
2958 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2959 const struct tc_htb_opt *htb;
2961 if (!nl_parse_nested(nl_options, tca_htb_policy,
2962 attrs, ARRAY_SIZE(tca_htb_policy))) {
2963 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2967 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2968 class->min_rate = htb->rate.rate;
2969 class->max_rate = htb->ceil.rate;
/* 'buffer' is in ticks; convert back to bytes at the configured rate. */
2970 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2971 class->priority = htb->prio;
/* Parses a kernel tc class message: extracts the OVS queue id (minor - 1,
 * only for handles of the form 1:minor), class options, and stats, each
 * only when the corresponding output pointer is non-null. */
2976 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2977 struct htb_class *options,
2978 struct netdev_queue_stats *stats)
2980 struct nlattr *nl_options;
2981 unsigned int handle;
2984 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2985 if (!error && queue_id) {
2986 unsigned int major = tc_get_major(handle);
2987 unsigned int minor = tc_get_minor(handle);
2988 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2989 *queue_id = minor - 1;
2994 if (!error && options) {
2995 error = htb_parse_tca_options__(nl_options, options);
/* Derives qdisc-level HTB parameters from 'details' ("max-rate" in bit/s).
 * When absent or zero, falls back to the link's advertised speed (default
 * 100 Mb/s when features are unavailable).  Rates are stored in bytes/s
 * (hence the "/ 8").  NOTE(review): interior lines elided; code verbatim. */
3001 htb_parse_qdisc_details__(struct netdev *netdev_,
3002 const struct smap *details, struct htb_class *hc)
3004 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3005 const char *max_rate_s;
3007 max_rate_s = smap_get(details, "max-rate");
3008 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3009 if (!hc->max_rate) {
3010 enum netdev_features current;
3012 netdev_linux_read_features(netdev);
3013 current = !netdev->get_features_error ? netdev->current : 0;
3014 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
/* The root class gets min_rate == max_rate (link is never starved). */
3016 hc->min_rate = hc->max_rate;
/* Derives per-queue HTB parameters ("min-rate", "max-rate", "burst",
 * "priority", all in bit/s or bits) from 'details', clamping them to sane
 * ranges against the MTU and the qdisc's max_rate. */
3022 htb_parse_class_details__(struct netdev *netdev,
3023 const struct smap *details, struct htb_class *hc)
3025 const struct htb *htb = htb_get__(netdev);
3026 const char *min_rate_s = smap_get(details, "min-rate");
3027 const char *max_rate_s = smap_get(details, "max-rate");
3028 const char *burst_s = smap_get(details, "burst");
3029 const char *priority_s = smap_get(details, "priority");
3032 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3034 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
3035 netdev_get_name(netdev));
3039 /* HTB requires at least an mtu sized min-rate to send any traffic even
3040 * on uncongested links. */
3041 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3042 hc->min_rate = MAX(hc->min_rate, mtu);
3043 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
3046 hc->max_rate = (max_rate_s
3047 ? strtoull(max_rate_s, NULL, 10) / 8
3049 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
3050 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
3054 * According to hints in the documentation that I've read, it is important
3055 * that 'burst' be at least as big as the largest frame that might be
3056 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
3057 * but having it a bit too small is a problem. Since netdev_get_mtu()
3058 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
3059 * the MTU. We actually add 64, instead of 14, as a guard against
3060 * additional headers get tacked on somewhere that we're not aware of. */
3061 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
3062 hc->burst = MAX(hc->burst, mtu + 64);
3065 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Queries the kernel for one HTB class ('handle' under 'parent') and parses
 * it into 'options'/'stats'.  NOTE(review): interior lines elided in this
 * extract; code verbatim. */
3071 htb_query_class__(const struct netdev *netdev, unsigned int handle,
3072 unsigned int parent, struct htb_class *options,
3073 struct netdev_queue_stats *stats)
3075 struct ofpbuf *reply;
3078 error = tc_query_class(netdev, handle, parent, &reply);
3080 error = htb_parse_tcmsg__(reply, NULL, options, stats);
3081 ofpbuf_delete(reply);
/* tc_ops 'tc_install' hook: creates the root HTB qdisc plus the 1:fffe
 * default class, then records the installation in the netdev. */
3087 htb_tc_install(struct netdev *netdev, const struct smap *details)
3091 error = htb_setup_qdisc__(netdev);
3093 struct htb_class hc;
3095 htb_parse_qdisc_details__(netdev, details, &hc);
3096 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3097 tc_make_handle(1, 0), &hc);
3099 htb_install__(netdev, hc.max_rate);
/* Downcast from the generic queue to its containing htb_class. */
3105 static struct htb_class *
3106 htb_class_cast__(const struct tc_queue *queue)
3108 return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Creates or refreshes the in-memory record for 'queue_id' with the values
 * in 'hc'.  Inserts a new htb_class in the tc queue hmap if absent. */
3112 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
3113 const struct htb_class *hc)
3115 struct htb *htb = htb_get__(netdev);
3116 size_t hash = hash_int(queue_id, 0);
3117 struct tc_queue *queue;
3118 struct htb_class *hcp;
3120 queue = tc_find_queue__(netdev, queue_id, hash);
3122 hcp = htb_class_cast__(queue);
3124 hcp = xmalloc(sizeof *hcp);
3125 queue = &hcp->tc_queue;
3126 queue->queue_id = queue_id;
3127 queue->created = time_msec();
3128 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
3131 hcp->min_rate = hc->min_rate;
3132 hcp->max_rate = hc->max_rate;
3133 hcp->burst = hc->burst;
3134 hcp->priority = hc->priority;
/* tc_ops 'tc_load' hook: reconstructs in-memory HTB state from the kernel
 * by querying the default class and dumping all queue classes. */
3138 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3141 struct queue_dump_state state;
3142 struct htb_class hc;
3144 /* Get qdisc options. */
3146 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3147 htb_install__(netdev, hc.max_rate);
3150 if (!start_queue_dump(netdev, &state)) {
3153 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
3154 unsigned int queue_id;
3156 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3157 htb_update_queue__(netdev, queue_id, &hc);
3160 finish_queue_dump(&state);
/* tc_ops 'tc_destroy' hook: frees every queued htb_class and (on lines
 * elided from this extract, presumably) the struct htb itself. */
3166 htb_tc_destroy(struct tc *tc)
3168 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
3169 struct htb_class *hc, *next;
3171 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
3172 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* qdisc_get: reports max-rate in bit/s (internal value is bytes/s). */
3180 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
3182 const struct htb *htb = htb_get__(netdev);
3183 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
/* qdisc_set: reconfigures the 1:fffe default class and caches the rate. */
3188 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
3190 struct htb_class hc;
3193 htb_parse_qdisc_details__(netdev, details, &hc);
3194 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3195 tc_make_handle(1, 0), &hc);
3197 htb_get__(netdev)->max_rate = hc.max_rate;
/* class_get: emits min-rate, max-rate (only if it differs from min-rate),
 * burst, and priority, all converted from bytes to bits. */
3203 htb_class_get(const struct netdev *netdev OVS_UNUSED,
3204 const struct tc_queue *queue, struct smap *details)
3206 const struct htb_class *hc = htb_class_cast__(queue);
3208 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3209 if (hc->min_rate != hc->max_rate) {
3210 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3212 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
3214 smap_add_format(details, "priority", "%u", hc->priority);
/* class_set: validates 'details', programs class 1:(queue_id+1) under the
 * 1:fffe default class, then mirrors the change into local state. */
3220 htb_class_set(struct netdev *netdev, unsigned int queue_id,
3221 const struct smap *details)
3223 struct htb_class hc;
3226 error = htb_parse_class_details__(netdev, details, &hc);
3231 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3232 tc_make_handle(1, 0xfffe), &hc);
3237 htb_update_queue__(netdev, queue_id, &hc);
/* class_delete: removes the kernel class, then the in-memory record. */
3242 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
3244 struct htb_class *hc = htb_class_cast__(queue);
3245 struct htb *htb = htb_get__(netdev);
3248 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3250 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* class_get_stats: per-queue stats straight from a kernel query. */
3257 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3258 struct netdev_queue_stats *stats)
3260 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3261 tc_make_handle(1, 0xfffe), NULL, stats);
/* class_dump_stats: invoked per dumped class message; forwards stats for
 * handles in the 1:1..1:HTB_N_QUEUES range to the callback. */
3265 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3266 const struct ofpbuf *nlmsg,
3267 netdev_dump_queue_stats_cb *cb, void *aux)
3269 struct netdev_queue_stats stats;
3270 unsigned int handle, major, minor;
3273 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3278 major = tc_get_major(handle);
3279 minor = tc_get_minor(handle);
3280 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3281 (*cb)(minor - 1, &stats, aux);
/* vtable binding the HTB hooks to the "linux-htb" OVS QoS type. */
3286 static const struct tc_ops tc_ops_htb = {
3287 "htb", /* linux_name */
3288 "linux-htb", /* ovs_name */
3289 HTB_N_QUEUES, /* n_queues */
3298 htb_class_get_stats,
3299 htb_class_dump_stats
3302 /* "linux-hfsc" traffic control class. */
/* Handle space for HFSC queues; queue_id N maps to tc minor N+1. */
3304 #define HFSC_N_QUEUES 0xf000
/* struct hfsc / struct hfsc_class declarations are partially elided in
 * this extract; only the embedded tc_queue member is visible. */
3312 struct tc_queue tc_queue;
/* Downcasts 'netdev_'s generic tc pointer to the containing struct hfsc. */
3317 static struct hfsc *
3318 hfsc_get__(const struct netdev *netdev_)
3320 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3321 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
/* Downcast from the generic queue to its containing hfsc_class. */
3324 static struct hfsc_class *
3325 hfsc_class_cast__(const struct tc_queue *queue)
3327 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Records, in 'netdev', that an HFSC qdisc with 'max_rate' is installed. */
3331 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
3333 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3336 hfsc = xmalloc(sizeof *hfsc);
3337 tc_init(&hfsc->tc, &tc_ops_hfsc);
3338 hfsc->max_rate = max_rate;
3339 netdev->tc = &hfsc->tc;
/* Creates or refreshes the in-memory record for 'queue_id' with the rates
 * in 'hc' (mirrors htb_update_queue__). */
3343 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
3344 const struct hfsc_class *hc)
3348 struct hfsc_class *hcp;
3349 struct tc_queue *queue;
3351 hfsc = hfsc_get__(netdev);
3352 hash = hash_int(queue_id, 0);
3354 queue = tc_find_queue__(netdev, queue_id, hash);
3356 hcp = hfsc_class_cast__(queue);
3358 hcp = xmalloc(sizeof *hcp);
3359 queue = &hcp->tc_queue;
3360 queue->queue_id = queue_id;
3361 queue->created = time_msec();
3362 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
3365 hcp->min_rate = hc->min_rate;
3366 hcp->max_rate = hc->max_rate;
/* Parses the RSC/FSC/USC service-curve attributes of an HFSC class into
 * 'class'.  Only linear curves (m1 == 0, d == 0) with matching real-time
 * and link-share rates are accepted, since that is all OVS configures.
 * NOTE(review): interior lines (attribute tag names, returns, braces) are
 * elided in this extract; code verbatim. */
3370 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
3372 const struct tc_service_curve *rsc, *fsc, *usc;
3373 static const struct nl_policy tca_hfsc_policy[] = {
3375 .type = NL_A_UNSPEC,
3377 .min_len = sizeof(struct tc_service_curve),
3380 .type = NL_A_UNSPEC,
3382 .min_len = sizeof(struct tc_service_curve),
3385 .type = NL_A_UNSPEC,
3387 .min_len = sizeof(struct tc_service_curve),
3390 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3392 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3393 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3394 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3398 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3399 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3400 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
3402 if (rsc->m1 != 0 || rsc->d != 0 ||
3403 fsc->m1 != 0 || fsc->d != 0 ||
3404 usc->m1 != 0 || usc->d != 0) {
3405 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3406 "Non-linear service curves are not supported.");
3410 if (rsc->m2 != fsc->m2) {
3411 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3412 "Real-time service curves are not supported ");
3416 if (rsc->m2 > usc->m2) {
3417 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3418 "Min-rate service curve is greater than "
3419 "the max-rate service curve.");
/* m2 is the curve's steady-state slope: FSC gives min-rate, USC max. */
3423 class->min_rate = fsc->m2;
3424 class->max_rate = usc->m2;
/* Parses a kernel tc class message into an OVS queue id (minor - 1, for
 * handles 1:minor), class options, and stats, each only when the matching
 * output pointer is non-null (mirrors htb_parse_tcmsg__). */
3429 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3430 struct hfsc_class *options,
3431 struct netdev_queue_stats *stats)
3434 unsigned int handle;
3435 struct nlattr *nl_options;
3437 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3443 unsigned int major, minor;
3445 major = tc_get_major(handle);
3446 minor = tc_get_minor(handle);
3447 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3448 *queue_id = minor - 1;
3455 error = hfsc_parse_tca_options__(nl_options, options);
/* Queries the kernel for one HFSC class ('handle' under 'parent') and
 * parses it into 'options'/'stats'.  NOTE(review): interior lines elided
 * in this extract; code verbatim. */
3462 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
3463 unsigned int parent, struct hfsc_class *options,
3464 struct netdev_queue_stats *stats)
3467 struct ofpbuf *reply;
3469 error = tc_query_class(netdev, handle, parent, &reply);
3474 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
3475 ofpbuf_delete(reply);
/* Derives qdisc-level parameters from 'details' ("max-rate" in bit/s,
 * stored internally as bytes/s); falls back to the link speed (default
 * 100 Mb/s) when unset.  The root class gets min_rate == max_rate. */
3480 hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
3481 struct hfsc_class *class)
3483 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3485 const char *max_rate_s;
3487 max_rate_s = smap_get(details, "max-rate");
3488 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3491 enum netdev_features current;
3493 netdev_linux_read_features(netdev);
3494 current = !netdev->get_features_error ? netdev->current : 0;
3495 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3498 class->min_rate = max_rate;
3499 class->max_rate = max_rate;
/* Derives per-queue parameters from 'details', clamping min-rate to
 * [1, qdisc max] and max-rate to [min-rate, qdisc max]. */
3503 hfsc_parse_class_details__(struct netdev *netdev,
3504 const struct smap *details,
3505 struct hfsc_class * class)
3507 const struct hfsc *hfsc;
3508 uint32_t min_rate, max_rate;
3509 const char *min_rate_s, *max_rate_s;
3511 hfsc = hfsc_get__(netdev);
3512 min_rate_s = smap_get(details, "min-rate");
3513 max_rate_s = smap_get(details, "max-rate");
3515 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3516 min_rate = MAX(min_rate, 1);
3517 min_rate = MIN(min_rate, hfsc->max_rate);
3519 max_rate = (max_rate_s
3520 ? strtoull(max_rate_s, NULL, 10) / 8
3522 max_rate = MAX(max_rate, min_rate);
3523 max_rate = MIN(max_rate, hfsc->max_rate);
3525 class->min_rate = min_rate;
3526 class->max_rate = max_rate;
3531 /* Create an HFSC qdisc.
3533 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3535 hfsc_setup_qdisc__(struct netdev * netdev)
3537 struct tcmsg *tcmsg;
3538 struct ofpbuf request;
3539 struct tc_hfsc_qopt opt;
/* Remove any existing root qdisc first so the add cannot conflict. */
3541 tc_del_qdisc(netdev);
3543 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3544 NLM_F_EXCL | NLM_F_CREATE, &request);
3550 tcmsg->tcm_handle = tc_make_handle(1, 0);
3551 tcmsg->tcm_parent = TC_H_ROOT;
3553 memset(&opt, 0, sizeof opt);
3556 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3557 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3559 return tc_transact(&request, NULL);
3562 /* Create an HFSC class.
3564 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3565 * sc rate <min_rate> ul rate <max_rate>" */
3567 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3568 unsigned int parent, struct hfsc_class *class)
3572 struct tcmsg *tcmsg;
3573 struct ofpbuf request;
3574 struct tc_service_curve min, max;
3576 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3582 tcmsg->tcm_handle = handle;
3583 tcmsg->tcm_parent = parent;
/* Linear service curves: m1/d are zeroed on lines elided from this
 * extract; m2 carries the steady-state rate. */
3587 min.m2 = class->min_rate;
3591 max.m2 = class->max_rate;
3593 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3594 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
/* RSC and FSC both get the min curve; USC caps at the max curve. */
3595 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3596 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3597 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3598 nl_msg_end_nested(&request, opt_offset);
3600 error = tc_transact(&request, NULL);
3602 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3603 "min-rate %ubps, max-rate %ubps (%s)",
3604 netdev_get_name(netdev),
3605 tc_get_major(handle), tc_get_minor(handle),
3606 tc_get_major(parent), tc_get_minor(parent),
3607 class->min_rate, class->max_rate, ovs_strerror(error));
/* tc_ops 'tc_install' hook for HFSC: creates the root qdisc plus the
 * 1:fffe default class and records the installation (mirrors
 * htb_tc_install).  NOTE(review): interior lines elided; code verbatim. */
3614 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
3617 struct hfsc_class class;
3619 error = hfsc_setup_qdisc__(netdev);
3625 hfsc_parse_qdisc_details__(netdev, details, &class);
3626 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3627 tc_make_handle(1, 0), &class);
3633 hfsc_install__(netdev, class.max_rate);
/* tc_ops 'tc_load' hook: reconstructs in-memory HFSC state by querying
 * the default class and dumping all queue classes from the kernel. */
3638 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3641 struct queue_dump_state state;
3642 struct hfsc_class hc;
3645 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3646 hfsc_install__(netdev, hc.max_rate);
3648 if (!start_queue_dump(netdev, &state)) {
3652 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
3653 unsigned int queue_id;
3655 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3656 hfsc_update_queue__(netdev, queue_id, &hc);
3660 finish_queue_dump(&state);
/* tc_ops 'tc_destroy' hook: frees every queued hfsc_class. */
3665 hfsc_tc_destroy(struct tc *tc)
3668 struct hfsc_class *hc, *next;
3670 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3672 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3673 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* qdisc_get: reports max-rate in bit/s (internal value is bytes/s). */
3682 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
3684 const struct hfsc *hfsc;
3685 hfsc = hfsc_get__(netdev);
3686 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
/* qdisc_set: reconfigures the 1:fffe default class and caches the rate. */
3691 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
3694 struct hfsc_class class;
3696 hfsc_parse_qdisc_details__(netdev, details, &class);
3697 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3698 tc_make_handle(1, 0), &class);
3701 hfsc_get__(netdev)->max_rate = class.max_rate;
/* class_get: min-rate always; max-rate only when it differs. */
3708 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3709 const struct tc_queue *queue, struct smap *details)
3711 const struct hfsc_class *hc;
3713 hc = hfsc_class_cast__(queue);
3714 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3715 if (hc->min_rate != hc->max_rate) {
3716 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
/* class_set: programs class 1:(queue_id+1) then mirrors local state. */
3722 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3723 const struct smap *details)
3726 struct hfsc_class class;
3728 error = hfsc_parse_class_details__(netdev, details, &class);
3733 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3734 tc_make_handle(1, 0xfffe), &class);
3739 hfsc_update_queue__(netdev, queue_id, &class);
/* class_delete: removes the kernel class, then the in-memory record. */
3744 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3748 struct hfsc_class *hc;
3750 hc = hfsc_class_cast__(queue);
3751 hfsc = hfsc_get__(netdev);
3753 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3755 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* class_get_stats: per-queue stats straight from a kernel query. */
3762 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3763 struct netdev_queue_stats *stats)
3765 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3766 tc_make_handle(1, 0xfffe), NULL, stats);
/* class_dump_stats: forwards stats for handles 1:1..1:HFSC_N_QUEUES. */
3770 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3771 const struct ofpbuf *nlmsg,
3772 netdev_dump_queue_stats_cb *cb, void *aux)
3774 struct netdev_queue_stats stats;
3775 unsigned int handle, major, minor;
3778 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3783 major = tc_get_major(handle);
3784 minor = tc_get_minor(handle);
3785 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3786 (*cb)(minor - 1, &stats, aux);
/* vtable binding the HFSC hooks to the "linux-hfsc" OVS QoS type. */
3791 static const struct tc_ops tc_ops_hfsc = {
3792 "hfsc", /* linux_name */
3793 "linux-hfsc", /* ovs_name */
3794 HFSC_N_QUEUES, /* n_queues */
3795 hfsc_tc_install, /* tc_install */
3796 hfsc_tc_load, /* tc_load */
3797 hfsc_tc_destroy, /* tc_destroy */
3798 hfsc_qdisc_get, /* qdisc_get */
3799 hfsc_qdisc_set, /* qdisc_set */
3800 hfsc_class_get, /* class_get */
3801 hfsc_class_set, /* class_set */
3802 hfsc_class_delete, /* class_delete */
3803 hfsc_class_get_stats, /* class_get_stats */
3804 hfsc_class_dump_stats /* class_dump_stats */
3807 /* "linux-default" traffic control class.
3809 * This class represents the default, unnamed Linux qdisc. It corresponds to
3810 * the "" (empty string) QoS type in the OVS database. */
/* Points 'netdev_' at the shared, immutable default-tc singleton. */
3813 default_install__(struct netdev *netdev_)
3815 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3816 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
3818 /* Nothing but a tc class implementation is allowed to write to a tc. This
3819 * class never does that, so we can legitimately use a const tc object. */
3820 netdev->tc = CONST_CAST(struct tc *, &tc);
/* tc_install hook: 'details' carries no options for the default qdisc. */
3824 default_tc_install(struct netdev *netdev,
3825 const struct smap *details OVS_UNUSED)
3827 default_install__(netdev);
/* tc_load hook: nothing to parse from the kernel message. */
3832 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3834 default_install__(netdev);
/* vtable for the "" QoS type; no queues, so all class hooks are NULL. */
3838 static const struct tc_ops tc_ops_default = {
3839 NULL, /* linux_name */
3844 NULL, /* tc_destroy */
3845 NULL, /* qdisc_get */
3846 NULL, /* qdisc_set */
3847 NULL, /* class_get */
3848 NULL, /* class_set */
3849 NULL, /* class_delete */
3850 NULL, /* class_get_stats */
3851 NULL /* class_dump_stats */
3854 /* "linux-other" traffic control class.
/* Catch-all for qdiscs OVS does not model; read-only singleton as above. */
3859 other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
3861 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3862 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
3864 /* Nothing but a tc class implementation is allowed to write to a tc. This
3865 * class never does that, so we can legitimately use a const tc object. */
3866 netdev->tc = CONST_CAST(struct tc *, &tc);
/* vtable for unrecognized qdiscs; install is NULL (cannot be chosen). */
3870 static const struct tc_ops tc_ops_other = {
3871 NULL, /* linux_name */
3872 "linux-other", /* ovs_name */
3874 NULL, /* tc_install */
3876 NULL, /* tc_destroy */
3877 NULL, /* qdisc_get */
3878 NULL, /* qdisc_set */
3879 NULL, /* class_get */
3880 NULL, /* class_set */
3881 NULL, /* class_delete */
3882 NULL, /* class_get_stats */
3883 NULL /* class_dump_stats */
3886 /* Traffic control. */
3888 /* Number of kernel "tc" ticks per second. */
/* NOTE(review): presumably initialized lazily by the /proc/net/psched
 * reader further below -- confirm against the full file. */
3889 static double ticks_per_s;
3891 /* Number of kernel "jiffies" per second. This is used for the purpose of
3892 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3893 * one jiffy's worth of data.
3895 * There are two possibilities here:
3897 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3898 * approximate range of 100 to 1024. That means that we really need to
3899 * make sure that the qdisc can buffer that much data.
3901 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3902 * has finely granular timers and there's no need to fudge additional room
3903 * for buffers. (There's no extra effort needed to implement that: the
3904 * large 'buffer_hz' is used as a divisor, so practically any number will
3905 * come out as 0 in the division. Small integer results in the case of
3906 * really high dividends won't have any real effect anyhow.)
3908 static unsigned int buffer_hz;
3910 /* Returns tc handle 'major':'minor'. */
/* TC_H_MAKE expects the major number pre-shifted into the high 16 bits. */
3912 tc_make_handle(unsigned int major, unsigned int minor)
3914 return TC_H_MAKE(major << 16, minor);
3917 /* Returns the major number from 'handle'. */
/* Inverse of tc_make_handle() for the high 16 bits. */
3919 tc_get_major(unsigned int handle)
3921 return TC_H_MAJ(handle) >> 16;
3924 /* Returns the minor number from 'handle'. */
/* Inverse of tc_make_handle() for the low 16 bits. */
3926 tc_get_minor(unsigned int handle)
3928 return TC_H_MIN(handle);
/* Builds a rtnetlink tc request of 'type' (e.g. RTM_NEWQDISC) for 'netdev'
 * into 'request', returning a pointer to the embedded tcmsg for the caller
 * to finish filling in.  NOTE(review): the error path after get_ifindex()
 * is elided from this listing -- presumably returns NULL on failure. */
3931 static struct tcmsg *
3932 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3933 struct ofpbuf *request)
3935 struct tcmsg *tcmsg;
/* The kernel addresses devices by ifindex, not name. */
3939 error = get_ifindex(netdev, &ifindex);
3944 ofpbuf_init(request, 512);
3945 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3946 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3947 tcmsg->tcm_family = AF_UNSPEC;
3948 tcmsg->tcm_ifindex = ifindex;
3949 /* Caller should fill in tcmsg->tcm_handle. */
3950 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' on the NETLINK_ROUTE channel, optionally capturing the
 * reply in '*replyp', and releases 'request' in all cases.  Ownership of
 * 'request''s storage ends here; the caller must not reuse it. */
3956 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3958 int error = nl_transact(NETLINK_ROUTE, request, replyp);
3959 ofpbuf_uninit(request);
3963 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3964 * policing configuration.
3966 * This function is equivalent to running the following when 'add' is true:
3967 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3969 * This function is equivalent to running the following when 'add' is false:
3970 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3972 * The configuration and stats may be seen with the following command:
3973 * /sbin/tc -s qdisc show dev <devname>
3975 * Returns 0 if successful, otherwise a positive errno value.
3978 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3980 struct ofpbuf request;
3981 struct tcmsg *tcmsg;
3983 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
/* NLM_F_EXCL | NLM_F_CREATE: fail if the qdisc already exists on add. */
3984 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3986 tcmsg = tc_make_request(netdev, type, flags, &request);
/* ffff:0 is the conventional handle for the ingress qdisc. */
3990 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3991 tcmsg->tcm_parent = TC_H_INGRESS;
3992 nl_msg_put_string(&request, TCA_KIND, "ingress");
3993 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3995 error = tc_transact(&request, NULL);
3997 /* If we're deleting the qdisc, don't worry about some of the
3998 * error conditions. */
/* ENOENT/EINVAL on delete mean there was nothing to remove -- that is
 * the desired end state, so treat it as success. */
3999 if (!add && (error == ENOENT || error == EINVAL)) {
4008 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
4011 * This function is equivalent to running:
4012 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
4013 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
4016 * The configuration and stats may be seen with the following command:
4017 * /sbin/tc -s filter show <devname> eth0 parent ffff:
4019 * Returns 0 if successful, otherwise a positive errno value.
4022 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
4024 struct tc_police tc_police;
4025 struct ofpbuf request;
4026 struct tcmsg *tcmsg;
4027 size_t basic_offset;
4028 size_t police_offset;
/* TC_POLICE_SHOT: drop packets that exceed the configured rate. */
4032 memset(&tc_police, 0, sizeof tc_police);
4033 tc_police.action = TC_POLICE_SHOT;
4034 tc_police.mtu = mtu;
/* Convert kbits/s to bytes/s for the kernel rate table. */
4035 tc_fill_rate(&tc_police.rate, (kbits_rate * 1000)/8, mtu);
/* Burst is given in kilobits; convert to bytes (*1024/8 == *128)?
 * NOTE(review): *1024 here treats kbits_burst as kbytes-like units --
 * confirm intended unit against the tc command in the comment above. */
4036 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
4037 kbits_burst * 1024);
4039 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
4040 NLM_F_EXCL | NLM_F_CREATE, &request);
/* Attach the filter under the ingress qdisc (ffff:) at priority 49,
 * matching all protocols. */
4044 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
4045 tcmsg->tcm_info = tc_make_handle(49,
4046 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
4048 nl_msg_put_string(&request, TCA_KIND, "basic");
4049 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4050 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
4051 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
4052 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
4053 nl_msg_end_nested(&request, police_offset);
4054 nl_msg_end_nested(&request, basic_offset);
4056 error = tc_transact(&request, NULL);
/* NOTE(review): the enclosing function's signature is elided from this
 * listing.  This body reads /proc/net/psched once (guarded by
 * ovsthread_once) and derives 'ticks_per_s' and 'buffer_hz' from it. */
4067 /* The values in psched are not individually very meaningful, but they are
4068 * important. The tables below show some values seen in the wild.
4072 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
4073 * (Before that, there are hints that it was 1000000000.)
4075 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
4079 * -----------------------------------
4080 * [1] 000c8000 000f4240 000f4240 00000064
4081 * [2] 000003e8 00000400 000f4240 3b9aca00
4082 * [3] 000003e8 00000400 000f4240 3b9aca00
4083 * [4] 000003e8 00000400 000f4240 00000064
4084 * [5] 000003e8 00000040 000f4240 3b9aca00
4085 * [6] 000003e8 00000040 000f4240 000000f9
4087 * a b c d ticks_per_s buffer_hz
4088 * ------- --------- ---------- ------------- ----------- -------------
4089 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
4090 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4091 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4092 * [4] 1,000 1,024 1,000,000 100 976,562 100
4093 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
4094 * [6] 1,000 64 1,000,000 249 15,625,000 249
4096 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
4097 * [2] 2.6.26-1-686-bigmem from Debian lenny
4098 * [3] 2.6.26-2-sparc64 from Debian lenny
4099 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
4100 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
4101 * [6] 2.6.34 from kernel.org on KVM
/* Run-once guard: psched values are constant for the life of the kernel. */
4103 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4104 static const char fn[] = "/proc/net/psched";
4105 unsigned int a, b, c, d;
4108 if (!ovsthread_once_start(&once)) {
4115 stream = fopen(fn, "r");
/* On any failure below, presumably fall back to defaults set earlier
 * (elided from this listing). */
4117 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
/* psched exposes four hex words; all four are required. */
4121 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
4122 VLOG_WARN("%s: read failed", fn);
4126 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
4130 VLOG_WARN("%s: invalid scheduler parameters", fn);
/* ticks_per_s = a * c / b, per the table above. */
4134 ticks_per_s = (double) a * c / b;
4138 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
4141 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
4144 ovsthread_once_done(&once);
4147 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
4148 * rate of 'rate' bytes per second. */
/* Presumably calls the psched reader first to populate ticks_per_s
 * (call site elided from this listing). */
4150 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
4153 return (rate * ticks) / ticks_per_s;
4156 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
4157 * rate of 'rate' bytes per second. */
/* Guards against division by zero: a zero rate yields zero ticks.
 * The 64-bit intermediate avoids overflow in ticks_per_s * size. */
4159 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
4162 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
4165 /* Returns the number of bytes that need to be reserved for qdisc buffering at
4166 * a transmission rate of 'rate' bytes per second. */
/* With an absurdly large buffer_hz this rounds to 0, which is the desired
 * behavior (see the comment on 'buffer_hz' above). */
4168 tc_buffer_per_jiffy(unsigned int rate)
4171 return rate / buffer_hz;
4174 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
4175 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
4176 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
4177 * stores NULL into it if it is absent.
4179 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
4182 * Returns 0 if successful, otherwise a positive errno value. */
4184 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
4185 struct nlattr **options)
/* TCA_KIND is mandatory; TCA_OPTIONS may legitimately be absent. */
4187 static const struct nl_policy tca_policy[] = {
4188 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
4189 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
4191 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
/* Attributes start after the netlink header plus the fixed tcmsg. */
4193 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4194 tca_policy, ta, ARRAY_SIZE(ta))) {
4195 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
4200 *kind = nl_attr_get_string(ta[TCA_KIND]);
4204 *options = ta[TCA_OPTIONS];
4219 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
4220 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
4221 * into '*options', and its queue statistics into '*stats'. Any of the output
4222 * arguments may be null.
4224 * Returns 0 if successful, otherwise a positive errno value. */
4226 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
4227 struct nlattr **options, struct netdev_queue_stats *stats)
4229 static const struct nl_policy tca_policy[] = {
4230 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
4231 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
4233 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4235 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4236 tca_policy, ta, ARRAY_SIZE(ta))) {
4237 VLOG_WARN_RL(&rl, "failed to parse class message");
/* Pull the raw class handle straight from the tcmsg header. */
4242 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
4243 *handlep = tc->tcm_handle;
4247 *options = ta[TCA_OPTIONS];
/* Statistics live inside the nested TCA_STATS2 attribute. */
4251 const struct gnet_stats_queue *gsq;
4252 struct gnet_stats_basic gsb;
4254 static const struct nl_policy stats_policy[] = {
4255 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
4256 .min_len = sizeof gsb },
4257 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
4258 .min_len = sizeof *gsq },
4260 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
4262 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
4263 sa, ARRAY_SIZE(sa))) {
4264 VLOG_WARN_RL(&rl, "failed to parse class stats");
4268 /* Alignment issues screw up the length of struct gnet_stats_basic on
4269 * some arch/bitsize combinations. Newer versions of Linux have a
4270 * struct gnet_stats_basic_packed, but we can't depend on that. The
4271 * easiest thing to do is just to make a copy. */
4272 memset(&gsb, 0, sizeof gsb);
4273 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
4274 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
4275 stats->tx_bytes = gsb.bytes;
4276 stats->tx_packets = gsb.packets;
/* Report queue drops as tx_errors; the kernel has no closer match. */
4278 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
4279 stats->tx_errors = gsq->drops;
/* Error path (elided context): zero the caller's stats so they are
 * never left uninitialized. */
4289 memset(stats, 0, sizeof *stats);
4294 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
4297 tc_query_class(const struct netdev *netdev,
4298 unsigned int handle, unsigned int parent,
4299 struct ofpbuf **replyp)
4301 struct ofpbuf request;
4302 struct tcmsg *tcmsg;
/* NLM_F_ECHO asks the kernel to send the class description back. */
4305 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
4309 tcmsg->tcm_handle = handle;
4310 tcmsg->tcm_parent = parent;
4312 error = tc_transact(&request, replyp);
/* Log failures at a rate-limited level; queries can fail routinely
 * (e.g. if a class was deleted concurrently). */
4314 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
4315 netdev_get_name(netdev),
4316 tc_get_major(handle), tc_get_minor(handle),
4317 tc_get_major(parent), tc_get_minor(parent),
4318 ovs_strerror(error));
4323 /* Equivalent to "tc class del dev <name> handle <handle>". */
4325 tc_delete_class(const struct netdev *netdev, unsigned int handle)
4327 struct ofpbuf request;
4328 struct tcmsg *tcmsg;
4331 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
/* Parent 0 lets the kernel locate the class by handle alone. */
4335 tcmsg->tcm_handle = handle;
4336 tcmsg->tcm_parent = 0;
4338 error = tc_transact(&request, NULL);
4340 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
4341 netdev_get_name(netdev),
4342 tc_get_major(handle), tc_get_minor(handle),
4343 ovs_strerror(error));
4348 /* Equivalent to "tc qdisc del dev <name> root". */
4350 tc_del_qdisc(struct netdev *netdev_)
4352 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4353 struct ofpbuf request;
4354 struct tcmsg *tcmsg;
4357 tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
/* 1:0 is the handle OVS assigns to qdiscs it creates (see
 * tc_query_qdisc below). */
4361 tcmsg->tcm_handle = tc_make_handle(1, 0);
4362 tcmsg->tcm_parent = TC_H_ROOT;
4364 error = tc_transact(&request, NULL);
4365 if (error == EINVAL) {
4366 /* EINVAL probably means that the default qdisc was in use, in which
4367 * case we've accomplished our purpose. */
/* On success, release the cached tc state so it is re-queried later. */
4370 if (!error && netdev->tc) {
4371 if (netdev->tc->ops->tc_destroy) {
4372 netdev->tc->ops->tc_destroy(netdev->tc);
4379 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
4380 * kernel to determine what they are. Returns 0 if successful, otherwise a
4381 * positive errno value. */
4383 tc_query_qdisc(const struct netdev *netdev_)
4385 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4386 struct ofpbuf request, *qdisc;
4387 const struct tc_ops *ops;
4388 struct tcmsg *tcmsg;
4396 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4397 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4398 * 2.6.35 without that fix backported to it.
4400 * To avoid the OOPS, we must not make a request that would attempt to dump
4401 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4402 * few others. There are a few ways that I can see to do this, but most of
4403 * them seem to be racy (and if you lose the race the kernel OOPSes). The
4404 * technique chosen here is to assume that any non-default qdisc that we
4405 * create will have a class with handle 1:0. The built-in qdiscs only have
4406 * a class with handle 0:0.
4408 * We could check for Linux 2.6.35+ and use a more straightforward method
4410 tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
4414 tcmsg->tcm_handle = tc_make_handle(1, 0);
4415 tcmsg->tcm_parent = 0;
4417 /* Figure out what tc class to instantiate. */
4418 error = tc_transact(&request, &qdisc);
/* Success: parse the kind string and look up the matching tc_ops;
 * unknown kinds fall back to the "linux-other" class. */
4422 error = tc_parse_qdisc(qdisc, &kind, NULL);
4424 ops = &tc_ops_other;
4426 ops = tc_lookup_linux_name(kind);
4428 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4429 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
4431 ops = &tc_ops_other;
4434 } else if (error == ENOENT) {
4435 /* Either it's a built-in qdisc, or it's a qdisc set up by some
4436 * other entity that doesn't have a handle 1:0. We will assume
4437 * that it's the system default qdisc. */
4438 ops = &tc_ops_default;
4441 /* Who knows? Maybe the device got deleted. */
4442 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
4443 netdev_get_name(netdev_), ovs_strerror(error));
4444 ops = &tc_ops_other;
4447 /* Instantiate it. */
/* Invariant: tc_load succeeds iff it installed netdev->tc. */
4448 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
4449 ovs_assert((load_error == 0) == (netdev->tc != NULL));
4450 ofpbuf_delete(qdisc);
/* Report the query error in preference to the load error. */
4452 return error ? error : load_error;
4455 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4456 approximate the time to transmit packets of various lengths. For an MTU of
4457 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4458 represents two possible packet lengths; for a MTU of 513 through 1024, four
4459 possible lengths; and so on.
4461 Returns, for the specified 'mtu', the number of bits that packet lengths
4462 need to be shifted right to fit within such a 256-entry table. */
4464 tc_calc_cell_log(unsigned int mtu)
/* A zero MTU falls back to the standard Ethernet payload size. */
4469 mtu = ETH_PAYLOAD_MAX;
/* Account for the L2 header and a VLAN tag on top of the payload MTU. */
4471 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
/* Find the smallest shift that brings 'mtu' under 256 table entries. */
4473 for (cell_log = 0; mtu >= 256; cell_log++) {
4480 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
4483 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4485 memset(rate, 0, sizeof *rate);
4486 rate->cell_log = tc_calc_cell_log(mtu);
4487 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4488 /* rate->cell_align = 0; */ /* distro headers. */
/* mpu: minimum packet unit -- no frame is billed below Ethernet minimum. */
4489 rate->mpu = ETH_TOTAL_MIN;
4493 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4494 * attribute of the specified "type".
4496 * See tc_calc_cell_log() above for a description of "rtab"s. */
4498 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
4503 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
/* Entry i covers packet lengths up to (i+1) << cell_log bytes; each
 * entry stores the transmit time in ticks for that length at 'rate'. */
4504 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
4505 unsigned packet_size = (i + 1) << rate->cell_log;
/* Clamp to the minimum billable packet size. */
4506 if (packet_size < rate->mpu) {
4507 packet_size = rate->mpu;
4509 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
4513 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
4514 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
4515 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
4518 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
/* Never let the buffer drop below one jiffy's worth of data plus one MTU,
 * or the qdisc cannot sustain line rate. */
4520 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
4521 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
4524 /* Linux-only functions declared in netdev-linux.h */
4526 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4527 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
4529 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4530 const char *flag_name, bool enable)
4532 const char *netdev_name = netdev_get_name(netdev);
4533 struct ethtool_value evalue;
/* Step 1: read the current flag word. */
4537 COVERAGE_INC(netdev_get_ethtool);
4538 memset(&evalue, 0, sizeof evalue);
4539 error = netdev_linux_do_ethtool(netdev_name,
4540 (struct ethtool_cmd *)&evalue,
4541 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
/* Step 2: write the word back with 'flag' set or cleared. */
4546 COVERAGE_INC(netdev_set_ethtool);
4547 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4548 error = netdev_linux_do_ethtool(netdev_name,
4549 (struct ethtool_cmd *)&evalue,
4550 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
/* Step 3: read back and verify the driver actually applied the change;
 * some drivers silently ignore unsupported flags. */
4555 COVERAGE_INC(netdev_get_ethtool);
4556 memset(&evalue, 0, sizeof evalue);
4557 error = netdev_linux_do_ethtool(netdev_name,
4558 (struct ethtool_cmd *)&evalue,
4559 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4564 if (new_flags != evalue.data) {
4565 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4566 "device %s failed", enable ? "enable" : "disable",
4567 flag_name, netdev_name);
4574 /* Utility functions. */
4576 /* Copies 'src' into 'dst', performing format conversion in the process. */
/* Straight field-for-field widening copy from the kernel's 32-bit
 * rtnl_link_stats into OVS's netdev_stats. */
4578 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4579 const struct rtnl_link_stats *src)
4581 dst->rx_packets = src->rx_packets;
4582 dst->tx_packets = src->tx_packets;
4583 dst->rx_bytes = src->rx_bytes;
4584 dst->tx_bytes = src->tx_bytes;
4585 dst->rx_errors = src->rx_errors;
4586 dst->tx_errors = src->tx_errors;
4587 dst->rx_dropped = src->rx_dropped;
4588 dst->tx_dropped = src->tx_dropped;
4589 dst->multicast = src->multicast;
4590 dst->collisions = src->collisions;
4591 dst->rx_length_errors = src->rx_length_errors;
4592 dst->rx_over_errors = src->rx_over_errors;
4593 dst->rx_crc_errors = src->rx_crc_errors;
4594 dst->rx_frame_errors = src->rx_frame_errors;
4595 dst->rx_fifo_errors = src->rx_fifo_errors;
4596 dst->rx_missed_errors = src->rx_missed_errors;
4597 dst->tx_aborted_errors = src->tx_aborted_errors;
4598 dst->tx_carrier_errors = src->tx_carrier_errors;
4599 dst->tx_fifo_errors = src->tx_fifo_errors;
4600 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4601 dst->tx_window_errors = src->tx_window_errors;
/* Fetches interface statistics for 'netdev_' via an RTM_GETLINK request,
 * converting the kernel's rtnl_link_stats into 'stats'.  Returns 0 on
 * success, otherwise a positive errno value (error paths elided here). */
4605 get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
4607 struct ofpbuf request;
4608 struct ofpbuf *reply;
4611 ofpbuf_init(&request, 0);
4612 nl_msg_put_nlmsghdr(&request,
4613 sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
4614 RTM_GETLINK, NLM_F_REQUEST);
4615 ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
/* Address the device by name; the kernel resolves it to an ifindex. */
4616 nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
4617 error = nl_transact(NETLINK_ROUTE, &request, &reply);
4618 ofpbuf_uninit(&request);
/* Skip past the netlink + ifinfomsg headers to reach the attributes. */
4623 if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
4624 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS);
/* Require at least a full rtnl_link_stats payload before converting. */
4625 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
4626 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
4629 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4633 VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");
4638 ofpbuf_delete(reply);
/* Reads the interface flags (IFF_*) for 'dev' via SIOCGIFFLAGS into
 * '*flags'.  Returns the ioctl's errno result. */
4643 get_flags(const struct netdev *dev, unsigned int *flags)
4649 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
4651 *flags = ifr.ifr_flags;
/* Sets the interface flags (IFF_*) on device 'name' via SIOCSIFFLAGS.
 * Counterpart of get_flags() above. */
4657 set_flags(const char *name, unsigned int flags)
4661 ifr.ifr_flags = flags;
4662 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
/* Looks up the kernel ifindex for device 'netdev_name' via SIOCGIFINDEX.
 * Returns the ifindex on success; on failure presumably returns a
 * negated errno (negation elided from this listing). */
4666 do_get_ifindex(const char *netdev_name)
4671 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4672 COVERAGE_INC(netdev_get_ifindex);
4674 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
4676 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4677 netdev_name, ovs_strerror(error));
4680 return ifr.ifr_ifindex;
/* Cached wrapper around do_get_ifindex(): the ifindex (or the lookup
 * error) is stored in the netdev and only refreshed when the
 * VALID_IFINDEX cache bit is clear.  Returns 0 on success, otherwise
 * the saved positive errno. */
4684 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4686 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4688 if (!(netdev->cache_valid & VALID_IFINDEX)) {
4689 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
/* A negative result encodes -errno from do_get_ifindex(). */
4692 netdev->get_ifindex_error = -ifindex;
4693 netdev->ifindex = 0;
4695 netdev->get_ifindex_error = 0;
4696 netdev->ifindex = ifindex;
/* Mark the cache valid whether the lookup succeeded or failed, so the
 * same answer is returned until the cache is invalidated. */
4698 netdev->cache_valid |= VALID_IFINDEX;
4701 *ifindexp = netdev->ifindex;
4702 return netdev->get_ifindex_error;
/* Reads the hardware (MAC) address of device 'netdev_name' into 'ea'
 * via SIOCGIFHWADDR.  Returns 0 on success, otherwise a positive errno. */
4706 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4712 memset(&ifr, 0, sizeof ifr);
4713 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4714 COVERAGE_INC(netdev_get_hwaddr);
4715 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
4717 /* ENODEV probably means that a vif disappeared asynchronously and
4718 * hasn't been removed from the database yet, so reduce the log level
4719 * to INFO for that case. */
4720 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
4721 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4722 netdev_name, ovs_strerror(error));
/* Only Ethernet-style addresses are meaningful to copy out. */
4725 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4726 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4727 VLOG_WARN("%s device has unknown hardware address family %d",
4728 netdev_name, hwaddr_family);
4730 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the hardware (MAC) address of device 'netdev_name' to 'mac' via
 * SIOCSIFHWADDR.  Returns 0 on success, otherwise a positive errno. */
4735 set_etheraddr(const char *netdev_name,
4736 const uint8_t mac[ETH_ADDR_LEN])
4741 memset(&ifr, 0, sizeof ifr);
4742 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
/* The kernel requires the address family to be declared explicitly. */
4743 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
4744 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4745 COVERAGE_INC(netdev_set_hwaddr);
4746 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
4748 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4749 netdev_name, ovs_strerror(error));
/* Issues ethtool command 'cmd' (e.g. ETHTOOL_GFLAGS) for device 'name',
 * reading/writing through 'ecmd'.  'cmd_name' is used only for logging.
 * Returns the ioctl's errno result. */
4755 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4756 int cmd, const char *cmd_name)
4761 memset(&ifr, 0, sizeof ifr);
4762 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
/* SIOCETHTOOL passes the command struct via the ifr_data pointer. */
4763 ifr.ifr_data = (caddr_t) ecmd;
4766 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
4768 if (error != EOPNOTSUPP) {
4769 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4770 "failed: %s", cmd_name, name, ovs_strerror(error));
4772 /* The device doesn't support this operation. That's pretty
4773 * common, so there's no point in logging anything. */
/* Retrieves an IPv4 address of 'netdev' via ioctl 'cmd' (e.g. SIOCGIFADDR)
 * into '*ip'.  'cmd_name' is used only for logging. */
4780 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4781 int cmd, const char *cmd_name)
4786 ifr.ifr_addr.sa_family = AF_INET;
4787 error = af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
/* ALIGNED_CAST: sockaddr inside ifreq is safely aligned for sockaddr_in. */
4789 const struct sockaddr_in *sin = ALIGNED_CAST(struct sockaddr_in *,
4791 *ip = sin->sin_addr;
4796 /* Returns an AF_PACKET raw socket or a negative errno value. */
4798 af_packet_sock(void)
4800 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4803 if (ovsthread_once_start(&once)) {
4804 sock = socket(AF_PACKET, SOCK_RAW, 0);
4806 int error = set_nonblocking(sock);
4813 VLOG_ERR("failed to create packet socket: %s",
4814 ovs_strerror(errno));
4816 ovsthread_once_done(&once);