2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
23 #include <arpa/inet.h>
25 #include <linux/filter.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <sys/types.h>
37 #include <sys/ioctl.h>
38 #include <sys/socket.h>
39 #include <netpacket/packet.h>
41 #include <net/if_arp.h>
42 #include <net/if_packet.h>
43 #include <net/route.h>
44 #include <netinet/in.h>
51 #include "dp-packet.h"
52 #include "dpif-netlink.h"
53 #include "dpif-netdev.h"
54 #include "dynamic-string.h"
55 #include "fatal-signal.h"
58 #include "netdev-provider.h"
59 #include "netdev-vport.h"
60 #include "netlink-notifier.h"
61 #include "netlink-socket.h"
64 #include "openflow/openflow.h"
65 #include "ovs-atomic.h"
67 #include "poll-loop.h"
68 #include "rtnetlink-link.h"
70 #include "socket-util.h"
73 #include "unaligned.h"
74 #include "openvswitch/vlog.h"
76 VLOG_DEFINE_THIS_MODULE(netdev_linux);
78 COVERAGE_DEFINE(netdev_set_policing);
79 COVERAGE_DEFINE(netdev_arp_lookup);
80 COVERAGE_DEFINE(netdev_get_ifindex);
81 COVERAGE_DEFINE(netdev_get_hwaddr);
82 COVERAGE_DEFINE(netdev_set_hwaddr);
83 COVERAGE_DEFINE(netdev_get_ethtool);
84 COVERAGE_DEFINE(netdev_set_ethtool);
87 /* These were introduced in Linux 2.6.14, so they might be missing if we have
89 #ifndef ADVERTISED_Pause
90 #define ADVERTISED_Pause (1 << 13)
92 #ifndef ADVERTISED_Asym_Pause
93 #define ADVERTISED_Asym_Pause (1 << 14)
96 /* These were introduced in Linux 2.6.24, so they might be missing if we
97 * have old headers. */
98 #ifndef ETHTOOL_GFLAGS
99 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
101 #ifndef ETHTOOL_SFLAGS
102 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
105 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
108 #define TC_RTAB_SIZE 1024
111 /* Linux 2.6.21 introduced struct tpacket_auxdata.
112 * Linux 2.6.27 added the tp_vlan_tci member.
113 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
114 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
115 * TP_STATUS_VLAN_TPID_VALID.
117 * With all this churn it's easiest to unconditionally define a replacement
118 * structure that has everything we want.
120 #ifndef PACKET_AUXDATA
121 #define PACKET_AUXDATA 8
123 #ifndef TP_STATUS_VLAN_VALID
124 #define TP_STATUS_VLAN_VALID (1 << 4)
126 #ifndef TP_STATUS_VLAN_TPID_VALID
127 #define TP_STATUS_VLAN_TPID_VALID (1 << 6)
129 #undef tpacket_auxdata
130 #define tpacket_auxdata rpl_tpacket_auxdata
/* Replacement for the kernel's struct tpacket_auxdata that always contains
 * the VLAN members old headers lack (see the version history above).
 * NOTE(review): the leading members (e.g. tp_status) and the closing brace
 * are elided from this numbered listing. */
131 struct tpacket_auxdata {
137 uint16_t tp_vlan_tci; /* VLAN TCI; consulted by auxdata_has_vlan_tci(). */
138 uint16_t tp_vlan_tpid; /* VLAN TPID; valid per TP_STATUS_VLAN_TPID_VALID. */
141 /* Linux 2.6.35 introduced IFLA_STATS64 and rtnl_link_stats64.
143 * Tests for rtnl_link_stats64 don't seem to consistently work, e.g. on
144 * 2.6.32-431.29.2.el6.x86_64 (see report at
145 * http://openvswitch.org/pipermail/dev/2014-October/047978.html). Maybe
146 * if_link.h is not self-contained on those kernels. It is easiest to
147 * unconditionally define a replacement. */
149 #define IFLA_STATS64 23
151 #define rtnl_link_stats64 rpl_rtnl_link_stats64
/* Unconditional replacement for the kernel's 64-bit link statistics
 * structure, since testing for it is unreliable (see comment above).
 * NOTE(review): the basic rx/tx packet, byte, error, and drop counters
 * that precede these detailed error counters, and the closing brace, are
 * elided from this numbered listing. */
152 struct rtnl_link_stats64 {
164 uint64_t rx_length_errors;
165 uint64_t rx_over_errors;
166 uint64_t rx_crc_errors;
167 uint64_t rx_frame_errors;
168 uint64_t rx_fifo_errors;
169 uint64_t rx_missed_errors;
171 uint64_t tx_aborted_errors;
172 uint64_t tx_carrier_errors;
173 uint64_t tx_fifo_errors;
174 uint64_t tx_heartbeat_errors;
175 uint64_t tx_window_errors;
177 uint64_t rx_compressed;
178 uint64_t tx_compressed;
/* Bits for netdev_linux's 'cache_valid' member: each flag marks one group
 * of on-demand fields as up to date (see struct netdev_linux below).
 * NOTE(review): the enum's opening line and the bit-1..4 values between
 * VALID_ETHERADDR and VALID_POLICING are elided from this numbered
 * listing. */
182 VALID_IFINDEX = 1 << 0,
183 VALID_ETHERADDR = 1 << 1,
187 VALID_POLICING = 1 << 5,
188 VALID_VPORT_STAT_ERROR = 1 << 6,
189 VALID_DRVINFO = 1 << 7,
190 VALID_FEATURES = 1 << 8,
193 /* Traffic control. */
195 /* An instance of a traffic control class. Always associated with a particular
198 * Each TC implementation subclasses this with whatever additional data it
/* NOTE(review): the 'struct tc {' opening line is elided from this
 * numbered listing. */
201 const struct tc_ops *ops;
202 struct hmap queues; /* Contains "struct tc_queue"s.
203 * Read by generic TC layer.
204 * Written only by TC implementation. */
/* Static initializer for a struct tc: installs OPS and an initialized
 * 'queues' hmap. */
207 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
209 /* One traffic control queue.
211 * Each TC implementation subclasses this with whatever additional data it
/* NOTE(review): the 'struct tc_queue {' opening line is elided from this
 * numbered listing. */
214 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
215 unsigned int queue_id; /* OpenFlow queue ID. */
216 long long int created; /* Time queue was created, in msecs. */
219 /* A particular kind of traffic control. Each implementation generally maps to
220 * one particular Linux qdisc class.
222 * The functions below return 0 if successful or a positive errno value on
223 * failure, except where otherwise noted. All of them must be provided, except
224 * where otherwise noted. */
226 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
227 * This is null for tc_ops_default and tc_ops_other, for which there are no
228 * appropriate values. */
229 const char *linux_name;
231 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
232 const char *ovs_name;
234 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
235 * queues. The queues are numbered 0 through n_queues - 1. */
236 unsigned int n_queues;
238 /* Called to install this TC class on 'netdev'. The implementation should
239 * make the Netlink calls required to set up 'netdev' with the right qdisc
240 * and configure it according to 'details'. The implementation may assume
241 * that the current qdisc is the default; that is, there is no need for it
242 * to delete the current qdisc before installing itself.
244 * The contents of 'details' should be documented as valid for 'ovs_name'
245 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
246 * (which is built as ovs-vswitchd.conf.db(8)).
248 * This function must return 0 if and only if it sets 'netdev->tc' to an
249 * initialized 'struct tc'.
251 * (This function is null for tc_ops_other, which cannot be installed. For
252 * other TC classes it should always be nonnull.) */
253 int (*tc_install)(struct netdev *netdev, const struct smap *details);
255 /* Called when the netdev code determines (through a Netlink query) that
256 * this TC class's qdisc is installed on 'netdev', but we didn't install
257 * it ourselves and so don't know any of the details.
259 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
260 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
261 * implementation should parse the other attributes of 'nlmsg' as
262 * necessary to determine its configuration. If necessary it should also
263 * use Netlink queries to determine the configuration of queues on
266 * This function must return 0 if and only if it sets 'netdev->tc' to an
267 * initialized 'struct tc'. */
268 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
270 /* Destroys the data structures allocated by the implementation as part of
271 * 'tc'. (This includes destroying 'tc->queues' by calling
274 * The implementation should not need to perform any Netlink calls. If
275 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
276 * (But it may not be desirable.)
278 * This function may be null if 'tc' is trivial. */
279 void (*tc_destroy)(struct tc *tc);
281 /* Retrieves details of 'netdev->tc' configuration into 'details'.
283 * The implementation should not need to perform any Netlink calls, because
284 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
285 * cached the configuration.
287 * The contents of 'details' should be documented as valid for 'ovs_name'
288 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
289 * (which is built as ovs-vswitchd.conf.db(8)).
291 * This function may be null if 'tc' is not configurable.
293 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
295 /* Reconfigures 'netdev->tc' according to 'details', performing any
296 * required Netlink calls to complete the reconfiguration.
298 * The contents of 'details' should be documented as valid for 'ovs_name'
299 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
300 * (which is built as ovs-vswitchd.conf.db(8)).
302 * This function may be null if 'tc' is not configurable.
304 int (*qdisc_set)(struct netdev *, const struct smap *details);
306 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
307 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
309 * The contents of 'details' should be documented as valid for 'ovs_name'
310 * in the "other_config" column in the "Queue" table in
311 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
313 * The implementation should not need to perform any Netlink calls, because
314 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
315 * cached the queue configuration.
317 * This function may be null if 'tc' does not have queues ('n_queues' is
319 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
320 struct smap *details);
322 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
323 * 'details', perfoming any required Netlink calls to complete the
324 * reconfiguration. The caller ensures that 'queue_id' is less than
327 * The contents of 'details' should be documented as valid for 'ovs_name'
328 * in the "other_config" column in the "Queue" table in
329 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
331 * This function may be null if 'tc' does not have queues or its queues are
332 * not configurable. */
333 int (*class_set)(struct netdev *, unsigned int queue_id,
334 const struct smap *details);
336 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
337 * tc_queue's within 'netdev->tc->queues'.
339 * This function may be null if 'tc' does not have queues or its queues
340 * cannot be deleted. */
341 int (*class_delete)(struct netdev *, struct tc_queue *queue);
343 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
344 * 'struct tc_queue's within 'netdev->tc->queues'.
346 * On success, initializes '*stats'.
348 * This function may be null if 'tc' does not have queues or if it cannot
349 * report queue statistics. */
350 int (*class_get_stats)(const struct netdev *netdev,
351 const struct tc_queue *queue,
352 struct netdev_queue_stats *stats);
354 /* Extracts queue stats from 'nlmsg', which is a response to a
355 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
357 * This function may be null if 'tc' does not have queues or if it cannot
358 * report queue statistics. */
359 int (*class_dump_stats)(const struct netdev *netdev,
360 const struct ofpbuf *nlmsg,
361 netdev_dump_queue_stats_cb *cb, void *aux);
/* Initializes 'tc' with 'ops' and an empty queue map.  NOTE(review): the
 * return-type line and function braces are elided from this numbered
 * listing. */
365 tc_init(struct tc *tc, const struct tc_ops *ops)
368 hmap_init(&tc->queues);
/* Releases the queue hmap owned by 'tc'.  Performs no Netlink calls;
 * deconfiguring the kernel qdisc is the caller's responsibility (see the
 * tc_destroy member documentation in struct tc_ops). */
372 tc_destroy(struct tc *tc)
374 hmap_destroy(&tc->queues);
377 static const struct tc_ops tc_ops_htb;
378 static const struct tc_ops tc_ops_hfsc;
379 static const struct tc_ops tc_ops_default;
380 static const struct tc_ops tc_ops_other;
/* Registry of every TC implementation known to this file.  NOTE(review):
 * the closing '};' of the array is elided from this numbered listing. */
382 static const struct tc_ops *const tcs[] = {
383 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
384 &tc_ops_hfsc, /* Hierarchical fair service curve. */
385 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
386 &tc_ops_other, /* Some other qdisc. */
390 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
391 static unsigned int tc_get_major(unsigned int handle);
392 static unsigned int tc_get_minor(unsigned int handle);
394 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
395 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
396 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
398 static struct tcmsg *tc_make_request(const struct netdev *, int type,
399 unsigned int flags, struct ofpbuf *);
400 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
401 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
402 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
405 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
406 struct nlattr **options);
407 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
408 struct nlattr **options,
409 struct netdev_queue_stats *);
410 static int tc_query_class(const struct netdev *,
411 unsigned int handle, unsigned int parent,
412 struct ofpbuf **replyp);
413 static int tc_delete_class(const struct netdev *, unsigned int handle);
415 static int tc_del_qdisc(struct netdev *netdev);
416 static int tc_query_qdisc(const struct netdev *netdev);
418 static int tc_calc_cell_log(unsigned int mtu);
419 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
420 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
421 const struct tc_ratespec *rate);
422 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
/* Per-device state for the Linux system/internal/tap netdev classes.
 * NOTE(review): several members (the embedded 'struct netdev up', ifindex,
 * mtu, tc pointer, tap file descriptor, and the closing brace) are elided
 * from this numbered listing. */
424 struct netdev_linux {
427 /* Protects all members below. */
428 struct ovs_mutex mutex;
430 unsigned int cache_valid;
432 bool miimon; /* Link status of last poll. */
433 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
434 struct timer miimon_timer;
436 /* The following are figured out "on demand" only. They are only valid
437 * when the corresponding VALID_* bit in 'cache_valid' is set. */
439 uint8_t etheraddr[ETH_ADDR_LEN];
440 struct in_addr address, netmask;
443 unsigned int ifi_flags;
444 long long int carrier_resets;
445 uint32_t kbits_rate; /* Policing data. */
446 uint32_t kbits_burst;
447 int vport_stats_error; /* Cached error code from vport_get_stats().
448 0 or an errno value. */
449 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
450 int ether_addr_error; /* Cached error code from set/get etheraddr. */
451 int netdev_policing_error; /* Cached error code from set policing. */
452 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
453 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
455 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
456 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
457 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
459 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
462 /* For devices of class netdev_tap_class only. */
/* Per-rxq state; 'fd' and 'is_tap' members are elided from this numbered
 * listing (both are used below in rxq_construct/recv). */
466 struct netdev_rxq_linux {
467 struct netdev_rxq up;
472 /* This is set pretty low because we probably won't learn anything from the
473 * additional log messages. */
474 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
476 /* Polling miimon status for all ports causes performance degradation when
477 * handling a large number of ports. If there are no devices using miimon, then
478 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
480 * Readers do not depend on this variable synchronizing with the related
481 * changes in the device miimon status, so we can use atomic_count. */
482 static atomic_count miimon_cnt = ATOMIC_COUNT_INIT(0);
484 static void netdev_linux_run(void);
486 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
487 int cmd, const char *cmd_name);
488 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
489 int cmd, const char *cmd_name);
490 static int get_flags(const struct netdev *, unsigned int *flags);
491 static int set_flags(const char *, unsigned int flags);
492 static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
493 enum netdev_flags on, enum netdev_flags *old_flagsp)
494 OVS_REQUIRES(netdev->mutex);
495 static int do_get_ifindex(const char *netdev_name);
496 static int get_ifindex(const struct netdev *, int *ifindexp);
497 static int do_set_addr(struct netdev *netdev,
498 int ioctl_nr, const char *ioctl_name,
499 struct in_addr addr);
500 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
501 static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
502 static int get_stats_via_netlink(const struct netdev *, struct netdev_stats *);
503 static int af_packet_sock(void);
504 static bool netdev_linux_miimon_enabled(void);
505 static void netdev_linux_miimon_run(void);
506 static void netdev_linux_miimon_wait(void);
507 static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup);
/* Returns true if 'netdev_class' is one of this file's classes, identified
 * by its 'run' callback pointing at netdev_linux_run(). */
510 is_netdev_linux_class(const struct netdev_class *netdev_class)
512 return netdev_class->run == netdev_linux_run;
/* Returns true if 'netdev' is specifically a tap device. */
516 is_tap_netdev(const struct netdev *netdev)
518 return netdev_get_class(netdev) == &netdev_tap_class;
/* Downcasts 'netdev' to its netdev_linux container, asserting the class
 * first so a wrong-class pointer fails loudly. */
521 static struct netdev_linux *
522 netdev_linux_cast(const struct netdev *netdev)
524 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
526 return CONTAINER_OF(netdev, struct netdev_linux, up);
/* Downcasts 'rx' to its netdev_rxq_linux container, with the same class
 * assertion as netdev_linux_cast(). */
529 static struct netdev_rxq_linux *
530 netdev_rxq_linux_cast(const struct netdev_rxq *rx)
532 ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
533 return CONTAINER_OF(rx, struct netdev_rxq_linux, up);
536 static void netdev_linux_update(struct netdev_linux *netdev,
537 const struct rtnetlink_link_change *)
538 OVS_REQUIRES(netdev->mutex);
539 static void netdev_linux_changed(struct netdev_linux *netdev,
540 unsigned int ifi_flags, unsigned int mask)
541 OVS_REQUIRES(netdev->mutex);
543 /* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK changes, or NULL
544 * if no such socket could be created. */
545 static struct nl_sock *
546 netdev_linux_notify_sock(void)
/* Lazily created exactly once for the whole process; every later call
 * returns the same socket (or NULL if creation failed). */
548 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
549 static struct nl_sock *sock;
551 if (ovsthread_once_start(&once)) {
554 error = nl_sock_create(NETLINK_ROUTE, &sock);
/* Join the link-change multicast group; on failure the socket is torn
 * down (presumably 'sock' is also reset to NULL on an elided line --
 * TODO confirm against the full source). */
556 error = nl_sock_join_mcgroup(sock, RTNLGRP_LINK);
558 nl_sock_destroy(sock);
562 ovsthread_once_done(&once);
/* Returns true if at least one device currently uses miimon polling, so
 * the run/wait hooks can skip miimon work entirely otherwise (see the
 * comment on 'miimon_cnt' above). */
569 netdev_linux_miimon_enabled(void)
571 return atomic_count_get(&miimon_cnt) > 0;
/* Periodic work for all netdev_linux devices: polls miimon status (only if
 * any device uses it) and drains rtnetlink link-change notifications,
 * applying each to the matching open netdev. */
575 netdev_linux_run(void)
577 struct nl_sock *sock;
580 if (netdev_linux_miimon_enabled()) {
581 netdev_linux_miimon_run();
584 sock = netdev_linux_notify_sock();
590 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
591 uint64_t buf_stub[4096 / 8];
594 ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
595 error = nl_sock_recv(sock, &buf, false);
597 struct rtnetlink_link_change change;
599 if (rtnetlink_link_parse(&buf, &change)) {
/* Only netdevs belonging to this file's classes are updated; others
 * are ignored. */
600 struct netdev *netdev_ = netdev_from_name(change.ifname);
601 if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
602 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
604 ovs_mutex_lock(&netdev->mutex);
605 netdev_linux_update(netdev, &change);
606 ovs_mutex_unlock(&netdev->mutex);
608 netdev_close(netdev_);
/* ENOBUFS means the kernel dropped notifications; re-read the flags
 * of every device of this class so no change is missed. */
610 } else if (error == ENOBUFS) {
611 struct shash device_shash;
612 struct shash_node *node;
616 shash_init(&device_shash);
617 netdev_get_devices(&netdev_linux_class, &device_shash);
618 SHASH_FOR_EACH (node, &device_shash) {
619 struct netdev *netdev_ = node->data;
620 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
623 ovs_mutex_lock(&netdev->mutex);
/* get_flags() result is ignored here; netdev_linux_changed() is
 * called with whatever 'flags' holds, invalidating all caches
 * (mask 0). */
624 get_flags(netdev_, &flags);
625 netdev_linux_changed(netdev, flags, 0);
626 ovs_mutex_unlock(&netdev->mutex);
628 netdev_close(netdev_);
630 shash_destroy(&device_shash);
631 } else if (error != EAGAIN) {
632 VLOG_WARN_RL(&rl, "error reading or parsing netlink (%s)",
633 ovs_strerror(error));
/* Arranges for the next poll_block() to wake up when netdev_linux_run()
 * has work: miimon timers (if any device uses miimon) or readable
 * rtnetlink notifications. */
640 netdev_linux_wait(void)
642 struct nl_sock *sock;
644 if (netdev_linux_miimon_enabled()) {
645 netdev_linux_miimon_wait();
647 sock = netdev_linux_notify_sock();
649 nl_sock_wait(sock, POLLIN);
/* Records that 'dev' changed: bumps the netdev change sequence, counts a
 * carrier reset if IFF_RUNNING toggled, stores the new interface flags,
 * and keeps only the 'cache_valid' bits present in 'mask' (so mask 0
 * invalidates every cached field). */
654 netdev_linux_changed(struct netdev_linux *dev,
655 unsigned int ifi_flags, unsigned int mask)
656 OVS_REQUIRES(dev->mutex)
658 netdev_change_seq_changed(&dev->up);
660 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
661 dev->carrier_resets++;
663 dev->ifi_flags = ifi_flags;
665 dev->cache_valid &= mask;
/* Applies a parsed rtnetlink link-change message to 'dev'.  RTM_NEWLINK
 * refreshes the MTU, Ethernet address, and ifindex caches directly from
 * the message; any other message type just invalidates all caches via
 * netdev_linux_changed(). */
669 netdev_linux_update(struct netdev_linux *dev,
670 const struct rtnetlink_link_change *change)
671 OVS_REQUIRES(dev->mutex)
673 if (change->nlmsg_type == RTM_NEWLINK) {
/* Keep VALID_DRVINFO: the driver info cannot change from a link
 * message, so only that cache survives. */
675 netdev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
677 /* Update netdev from rtnl-change msg. */
679 dev->mtu = change->mtu;
680 dev->cache_valid |= VALID_MTU;
681 dev->netdev_mtu_error = 0;
/* An all-zero address in the message means "not reported"; keep the
 * old cached address in that case. */
684 if (!eth_addr_is_zero(change->addr)) {
685 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
686 dev->cache_valid |= VALID_ETHERADDR;
687 dev->ether_addr_error = 0;
690 dev->ifindex = change->ifi_index;
691 dev->cache_valid |= VALID_IFINDEX;
692 dev->get_ifindex_error = 0;
695 netdev_linux_changed(dev, change->ifi_flags, 0);
/* netdev provider 'alloc' hook: zero-initialized so all cached-error and
 * cache_valid fields start clean.  NOTE(review): the return statement is
 * elided from this numbered listing. */
699 static struct netdev *
700 netdev_linux_alloc(void)
702 struct netdev_linux *netdev = xzalloc(sizeof *netdev);
/* Construction shared by the system/internal and tap constructors: sets up
 * the per-device mutex. */
707 netdev_linux_common_construct(struct netdev_linux *netdev)
709 ovs_mutex_init(&netdev->mutex);
712 /* Creates system and internal devices. */
714 netdev_linux_construct(struct netdev *netdev_)
716 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
719 netdev_linux_common_construct(netdev);
/* Probe the kernel for the interface's flags; ENODEV means the interface
 * does not exist yet. */
721 error = get_flags(&netdev->up, &netdev->ifi_flags);
722 if (error == ENODEV) {
723 if (netdev->up.netdev_class != &netdev_internal_class) {
724 /* The device does not exist, so don't allow it to be opened. */
727 /* "Internal" netdevs have to be created as netdev objects before
728 * they exist in the kernel, because creating them in the kernel
729 * happens by passing a netdev object to dpif_port_add().
730 * Therefore, ignore the error. */
737 /* For most types of netdevs we open the device for each call of
738 * netdev_open(). However, this is not the case with tap devices,
739 * since it is only possible to open the device once. In this
740 * situation we share a single file descriptor, and consequently
741 * buffers, across all readers. Therefore once data is read it will
742 * be unavailable to other reads for tap devices. */
744 netdev_linux_construct_tap(struct netdev *netdev_)
746 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
747 static const char tap_dev[] = "/dev/net/tun";
748 const char *name = netdev_->name;
752 netdev_linux_common_construct(netdev);
754 /* Open tap device. */
755 netdev->tap_fd = open(tap_dev, O_RDWR);
756 if (netdev->tap_fd < 0) {
758 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
762 /* Create tap device. */
/* IFF_NO_PI: no extra packet-info header, so reads/writes carry raw
 * Ethernet frames. */
763 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
764 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
765 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
766 VLOG_WARN("%s: creating tap device failed: %s", name,
767 ovs_strerror(errno));
772 /* Make non-blocking. */
773 error = set_nonblocking(netdev->tap_fd);
/* Error path: close the tap fd so it does not leak.  NOTE(review): the
 * label and return statements around this line are elided from this
 * numbered listing. */
781 close(netdev->tap_fd);
/* netdev provider 'destruct' hook: tears down TC state, closes the shared
 * tap fd for tap devices, rebalances the miimon device count, and destroys
 * the per-device mutex. */
786 netdev_linux_destruct(struct netdev *netdev_)
788 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
790 if (netdev->tc && netdev->tc->ops->tc_destroy) {
791 netdev->tc->ops->tc_destroy(netdev->tc);
794 if (netdev_get_class(netdev_) == &netdev_tap_class
795 && netdev->tap_fd >= 0)
797 close(netdev->tap_fd);
/* This device was counted in miimon_cnt (see the miimon_interval
 * setter); undo that so run/wait can skip miimon when unused. */
800 if (netdev->miimon_interval > 0) {
801 atomic_count_dec(&miimon_cnt);
804 ovs_mutex_destroy(&netdev->mutex);
/* 'dealloc' hook: frees the structure allocated by netdev_linux_alloc().
 * NOTE(review): the free() call is elided from this numbered listing. */
808 netdev_linux_dealloc(struct netdev *netdev_)
810 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* rxq 'alloc' hook: zeroed so 'fd'/'is_tap' start in a known state.
 * NOTE(review): the return statement is elided from this listing. */
814 static struct netdev_rxq *
815 netdev_linux_rxq_alloc(void)
817 struct netdev_rxq_linux *rx = xzalloc(sizeof *rx);
/* rxq 'construct' hook.  For tap devices, reuses the device's shared tap
 * fd.  For everything else, opens an AF_PACKET raw socket, enables
 * PACKET_AUXDATA (for VLAN reconstruction in rxq_recv_sock()), makes it
 * non-blocking, binds it to the device's ifindex, and attaches a BPF
 * filter that accepts only inbound packets. */
822 netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
824 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
825 struct netdev *netdev_ = rx->up.netdev;
826 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
829 ovs_mutex_lock(&netdev->mutex);
830 rx->is_tap = is_tap_netdev(netdev_);
832 rx->fd = netdev->tap_fd;
834 struct sockaddr_ll sll;
836 /* Result of tcpdump -dd inbound */
837 static const struct sock_filter filt[] = {
838 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
839 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
840 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
841 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
843 static const struct sock_fprog fprog = {
844 ARRAY_SIZE(filt), (struct sock_filter *) filt
847 /* Create file descriptor. */
848 rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
851 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
/* Request tpacket_auxdata control messages so VLAN tags stripped by
 * the kernel can be re-inserted on receive. */
856 if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) {
858 VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
859 netdev_get_name(netdev_), ovs_strerror(error));
863 /* Set non-blocking mode. */
864 error = set_nonblocking(rx->fd);
869 /* Get ethernet device index. */
870 error = get_ifindex(&netdev->up, &ifindex);
875 /* Bind to specific ethernet device. */
876 memset(&sll, 0, sizeof sll);
877 sll.sll_family = AF_PACKET;
878 sll.sll_ifindex = ifindex;
879 sll.sll_protocol = htons(ETH_P_ALL);
880 if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
882 VLOG_ERR("%s: failed to bind raw socket (%s)",
883 netdev_get_name(netdev_), ovs_strerror(error));
887 /* Filter for only inbound packets. */
888 error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
892 VLOG_ERR("%s: failed to attach filter (%s)",
893 netdev_get_name(netdev_), ovs_strerror(error));
897 ovs_mutex_unlock(&netdev->mutex);
/* Error path also releases the mutex.  NOTE(review): the fd cleanup
 * between the two unlocks is elided from this numbered listing. */
905 ovs_mutex_unlock(&netdev->mutex);
/* rxq 'destruct' hook.  NOTE(review): the body that closes the socket fd
 * (but not a shared tap fd) is elided from this numbered listing. */
910 netdev_linux_rxq_destruct(struct netdev_rxq *rxq_)
912 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
/* rxq 'dealloc' hook; frees the structure.  NOTE(review): the free() call
 * is elided from this listing. */
920 netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_)
922 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
/* Returns the VLAN TPID for 'aux': the kernel-reported TPID when the
 * kernel marked it valid, otherwise the default 802.1Q ethertype. */
928 auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux)
930 if (aux->tp_status & TP_STATUS_VLAN_VALID) {
931 return htons(aux->tp_vlan_tpid);
933 return htons(ETH_TYPE_VLAN);
/* Returns true if 'aux' carries a VLAN TCI: either a nonzero TCI (works on
 * pre-3.0 kernels without TP_STATUS_VLAN_VALID) or the explicit status
 * bit (needed because TCI 0 is a valid tag). */
938 auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
940 return aux->tp_vlan_tci || aux->tp_status & TP_STATUS_VLAN_VALID;
/* Receives one packet from AF_PACKET socket 'fd' into 'buffer', restoring
 * any VLAN tag the kernel stripped (delivered via PACKET_AUXDATA control
 * messages).  NOTE(review): the return-type line, several declarations,
 * and the error/return statements are elided from this numbered
 * listing. */
944 netdev_linux_rxq_recv_sock(int fd, struct dp_packet *buffer)
949 struct cmsghdr *cmsg;
952 char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
956 /* Reserve headroom for a single VLAN tag */
957 dp_packet_reserve(buffer, VLAN_HEADER_LEN);
958 size = dp_packet_tailroom(buffer);
960 iov.iov_base = dp_packet_data(buffer);
962 msgh.msg_name = NULL;
963 msgh.msg_namelen = 0;
966 msgh.msg_control = &cmsg_buffer;
967 msgh.msg_controllen = sizeof cmsg_buffer;
/* MSG_TRUNC makes recvmsg report the full packet length even when it
 * did not fit, so oversize packets can be detected below. */
971 retval = recvmsg(fd, &msgh, MSG_TRUNC);
972 } while (retval < 0 && errno == EINTR);
976 } else if (retval > size) {
980 dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);
/* Walk the control messages looking for the kernel's auxdata. */
982 for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg; cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
983 const struct tpacket_auxdata *aux;
985 if (cmsg->cmsg_level != SOL_PACKET
986 || cmsg->cmsg_type != PACKET_AUXDATA
987 || cmsg->cmsg_len < CMSG_LEN(sizeof(struct tpacket_auxdata))) {
991 aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg));
992 if (auxdata_has_vlan_tci(aux)) {
/* A frame shorter than an Ethernet header cannot take a VLAN
 * tag; skip re-insertion in that case. */
993 if (retval < ETH_HEADER_LEN) {
997 eth_push_vlan(buffer, auxdata_to_vlan_tpid(aux),
998 htons(aux->tp_vlan_tci));
/* Receives one frame from tap fd 'fd' into 'buffer' with a plain read(),
 * retrying on EINTR.  NOTE(review): the return-type line and the
 * error/return statements (including the oversize case) are elided from
 * this numbered listing. */
1007 netdev_linux_rxq_recv_tap(int fd, struct dp_packet *buffer)
1010 size_t size = dp_packet_tailroom(buffer);
1013 retval = read(fd, dp_packet_data(buffer), size);
1014 } while (retval < 0 && errno == EINTR);
1018 } else if (retval > size) {
1022 dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);
1027 netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet **packets,
1030 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1031 struct netdev *netdev = rx->up.netdev;
1032 struct dp_packet *buffer;
1036 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
1037 mtu = ETH_PAYLOAD_MAX;
1040 buffer = dp_packet_new_with_headroom(VLAN_ETH_HEADER_LEN + mtu,
1041 DP_NETDEV_HEADROOM);
1042 retval = (rx->is_tap
1043 ? netdev_linux_rxq_recv_tap(rx->fd, buffer)
1044 : netdev_linux_rxq_recv_sock(rx->fd, buffer));
1047 if (retval != EAGAIN && retval != EMSGSIZE) {
1048 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
1049 ovs_strerror(errno), netdev_rxq_get_name(rxq_));
1051 dp_packet_delete(buffer);
1053 dp_packet_pad(buffer);
1054 dp_packet_set_dp_hash(buffer, 0);
1055 packets[0] = buffer;
/* rxq 'wait' hook: wakes the next poll_block() when 'rxq_''s fd is
 * readable. */
1063 netdev_linux_rxq_wait(struct netdev_rxq *rxq_)
1065 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1066 poll_fd_wait(rx->fd, POLLIN);
/* rxq 'drain' hook: discards queued packets.  For tap devices, reads and
 * drops up to the interface's tx queue length worth of packets; for
 * sockets, flushes the receive buffer directly.  NOTE(review): some
 * surrounding statements are elided from this numbered listing. */
1070 netdev_linux_rxq_drain(struct netdev_rxq *rxq_)
1072 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1075 int error = af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_), &ifr,
1076 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
1080 drain_fd(rx->fd, ifr.ifr_qlen);
1083 return drain_rcvbuf(rx->fd);
1087 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
1088 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
1089 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
1090 * the packet is too big or too small to transmit on the device.
1092 * The caller retains ownership of 'buffer' in all cases.
1094 * The kernel maintains a packet transmission queue, so the caller is not
1095 * expected to do additional queuing of packets. */
1097 netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED,
1098 struct dp_packet **pkts, int cnt, bool may_steal)
1103 /* 'i' is incremented only if there's no error */
1104 for (i = 0; i < cnt;) {
1105 const void *data = dp_packet_data(pkts[i]);
1106 size_t size = dp_packet_size(pkts[i]);
1109 if (!is_tap_netdev(netdev_)) {
1110 /* Use our AF_PACKET socket to send to this device. */
1111 struct sockaddr_ll sll;
1117 sock = af_packet_sock();
1122 ifindex = netdev_get_ifindex(netdev_);
1127 /* We don't bother setting most fields in sockaddr_ll because the
1128 * kernel ignores them for SOCK_RAW. */
1129 memset(&sll, 0, sizeof sll);
1130 sll.sll_family = AF_PACKET;
1131 sll.sll_ifindex = ifindex;
1133 iov.iov_base = CONST_CAST(void *, data);
1136 msg.msg_name = &sll;
1137 msg.msg_namelen = sizeof sll;
1140 msg.msg_control = NULL;
1141 msg.msg_controllen = 0;
1144 retval = sendmsg(sock, &msg, 0);
1146 /* Use the tap fd to send to this device. This is essential for
1147 * tap devices, because packets sent to a tap device with an
1148 * AF_PACKET socket will loop back to be *received* again on the
1149 * tap device. This doesn't occur on other interface types
1150 * because we attach a socket filter to the rx socket. */
1151 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1153 retval = write(netdev->tap_fd, data, size);
1157 /* The Linux AF_PACKET implementation never blocks waiting for room
1158 * for packets, instead returning ENOBUFS. Translate this into
1159 * EAGAIN for the caller. */
1160 error = errno == ENOBUFS ? EAGAIN : errno;
1161 if (error == EINTR) {
1162 /* continue without incrementing 'i', i.e. retry this packet */
/* Short write: log and continue.  NOTE(review): 'retval' is presumably
 * signed (ssize_t); comparing it to the unsigned 'size' and printing it
 * with %PRIuSIZE relies on retval being non-negative here, which holds
 * because the negative case was handled above -- confirm against the
 * full source. */
1166 } else if (retval != size) {
1167 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE" bytes"
1168 " of %"PRIuSIZE") on %s", retval, size,
1169 netdev_get_name(netdev_));
1174 /* Process the next packet in the batch */
/* With 'may_steal', ownership of the packets passed to us, so the whole
 * batch is freed here.  NOTE(review): the enclosing 'if (may_steal)' line
 * is elided from this numbered listing -- confirm. */
1179 for (i = 0; i < cnt; i++) {
1180 dp_packet_delete(pkts[i]);
/* EAGAIN is expected backpressure and not worth logging. */
1184 if (error && error != EAGAIN) {
1185 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
1186 netdev_get_name(netdev_), ovs_strerror(error));
1193 /* Registers with the poll loop to wake up from the next call to poll_block()
1194 * when the packet transmission queue has sufficient room to transmit a packet
1195 * with netdev_send().
1197 * The kernel maintains a packet transmission queue, so the client is not
1198 * expected to do additional queuing of packets. Thus, this function is
1199 * unlikely to ever be used. It is included for completeness. */
1201 netdev_linux_send_wait(struct netdev *netdev, int qid OVS_UNUSED)
1203 if (is_tap_netdev(netdev)) {
1204 /* TAP device always accepts packets.*/
1205 poll_immediate_wake();
/* NOTE(review): extraction dropped lines (braces, 'int error' declaration,
 * 'goto exit' paths and the exit label); consult the full source. */
1209 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1210 * otherwise a positive errno value. */
1212 netdev_linux_set_etheraddr(struct netdev *netdev_,
1213 const uint8_t mac[ETH_ADDR_LEN])
1215 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1216 enum netdev_flags old_flags = 0;
1219 ovs_mutex_lock(&netdev->mutex);
/* Fast path: if the cached address already matches (or the last attempt
 * failed), skip the ioctl entirely. */
1221 if (netdev->cache_valid & VALID_ETHERADDR) {
1222 error = netdev->ether_addr_error;
1223 if (error || eth_addr_equals(netdev->etheraddr, mac)) {
1226 netdev->cache_valid &= ~VALID_ETHERADDR;
1229 /* Tap devices must be brought down before setting the address. */
1230 if (is_tap_netdev(netdev_)) {
1231 update_flags(netdev, NETDEV_UP, 0, &old_flags);
1233 error = set_etheraddr(netdev_get_name(netdev_), mac);
/* ENODEV is cached too: a vanished device will keep failing, so remember
 * the error instead of retrying on every call. */
1234 if (!error || error == ENODEV) {
1235 netdev->ether_addr_error = error;
1236 netdev->cache_valid |= VALID_ETHERADDR;
1238 memcpy(netdev->etheraddr, mac, ETH_ADDR_LEN);
/* Bring a tap device back up only if we took it down above. */
1242 if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
1243 update_flags(netdev, 0, NETDEV_UP, &old_flags);
1247 ovs_mutex_unlock(&netdev->mutex);
/* NOTE(review): gappy extract — braces, 'int error', the second argument of
 * get_etheraddr() (netdev->etheraddr), and 'return error;' are missing. */
1251 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1253 netdev_linux_get_etheraddr(const struct netdev *netdev_,
1254 uint8_t mac[ETH_ADDR_LEN])
1256 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1259 ovs_mutex_lock(&netdev->mutex);
/* Populate the cache on first use; subsequent calls reuse it. */
1260 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1261 netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
1263 netdev->cache_valid |= VALID_ETHERADDR;
1266 error = netdev->ether_addr_error;
/* Presumably copied only when 'error' is zero in the full source — confirm. */
1268 memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN);
1270 ovs_mutex_unlock(&netdev->mutex);
/* Helper: reads the MTU via SIOCGIFMTU, caching both the value and the error.
 * NOTE(review): gappy extract — 'static int', braces, the 'struct ifreq ifr'
 * declaration and 'return error;' were dropped by extraction. Caller must
 * hold netdev->mutex (all callers in view lock it first). */
1276 netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
1280 if (!(netdev->cache_valid & VALID_MTU)) {
1283 netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
1284 netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
1285 netdev->mtu = ifr.ifr_mtu;
1286 netdev->cache_valid |= VALID_MTU;
1289 error = netdev->netdev_mtu_error;
1291 *mtup = netdev->mtu;
/* NOTE(review): gappy extract — 'static int', braces, 'int error;' and
 * 'return error;' are missing. Thin locking wrapper over
 * netdev_linux_get_mtu__(). */
1297 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1298 * in bytes, not including the hardware header; thus, this is typically 1500
1299 * bytes for Ethernet devices. */
1301 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1303 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1306 ovs_mutex_lock(&netdev->mutex);
1307 error = netdev_linux_get_mtu__(netdev, mtup);
1308 ovs_mutex_unlock(&netdev->mutex);
/* NOTE(review): gappy extract — braces, 'struct ifreq ifr', the line that
 * stores 'mtu' into ifr.ifr_mtu, goto/exit paths, and the return are missing. */
1313 /* Sets the maximum size of transmitted (MTU) for given device using linux
1314 * networking ioctl interface.
1317 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1319 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1323 ovs_mutex_lock(&netdev->mutex);
/* Skip the ioctl when the cached MTU already matches or the last attempt
 * failed, mirroring set_etheraddr()'s caching pattern. */
1324 if (netdev->cache_valid & VALID_MTU) {
1325 error = netdev->netdev_mtu_error;
1326 if (error || netdev->mtu == mtu) {
1329 netdev->cache_valid &= ~VALID_MTU;
1332 error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
1333 SIOCSIFMTU, "SIOCSIFMTU");
/* Cache the result, including ENODEV, like the other setters do. */
1334 if (!error || error == ENODEV) {
1335 netdev->netdev_mtu_error = error;
1336 netdev->mtu = ifr.ifr_mtu;
1337 netdev->cache_valid |= VALID_MTU;
1340 ovs_mutex_unlock(&netdev->mutex);
/* NOTE(review): gappy extract — 'static int', braces and the
 * 'int ifindex, error;' declaration are missing. */
1344 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1345 * On failure, returns a negative errno value. */
1347 netdev_linux_get_ifindex(const struct netdev *netdev_)
1349 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1352 ovs_mutex_lock(&netdev->mutex);
1353 error = get_ifindex(netdev_, &ifindex);
1354 ovs_mutex_unlock(&netdev->mutex);
/* Errno values are positive, so negating distinguishes them from ifindexes. */
1356 return error ? -error : ifindex;
/* Reports link state: from the cached miimon poll result when miimon is
 * enabled, otherwise from the kernel's IFF_RUNNING flag.
 * NOTE(review): gappy extract — return type, braces and the 'else' keyword
 * between the two assignments are missing. */
1360 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1362 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1364 ovs_mutex_lock(&netdev->mutex);
1365 if (netdev->miimon_interval > 0) {
1366 *carrier = netdev->miimon;
1368 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1370 ovs_mutex_unlock(&netdev->mutex);
1375 static long long int
1376 netdev_linux_get_carrier_resets(const struct netdev *netdev_)
1378 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1379 long long int carrier_resets;
1381 ovs_mutex_lock(&netdev->mutex);
1382 carrier_resets = netdev->carrier_resets;
1383 ovs_mutex_unlock(&netdev->mutex);
1385 return carrier_resets;
/* Issues MII ioctl 'cmd' (e.g. SIOCGMIIPHY/SIOCGMIIREG) for interface 'name',
 * shuttling 'data' in and out through ifr.ifr_data.
 * NOTE(review): gappy extract — 'static int', braces, 'struct ifreq ifr;',
 * 'int error;' and 'return error;' are missing. The memcpy into &ifr.ifr_data
 * copies the struct over the pointer field's storage (kernel MII convention),
 * not through the pointer. */
1389 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1390 struct mii_ioctl_data *data)
1395 memset(&ifr, 0, sizeof ifr);
1396 memcpy(&ifr.ifr_data, data, sizeof *data);
1397 error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
1398 memcpy(data, &ifr.ifr_data, sizeof *data);
/* Determines link status for 'name' via MII registers, falling back to
 * ETHTOOL_GLINK when the MII ioctls fail.
 * NOTE(review): gappy extract — return type, braces, several if/else framing
 * lines and the final 'return error;' are missing; read with the full source. */
1404 netdev_linux_get_miimon(const char *name, bool *miimon)
1406 struct mii_ioctl_data data;
1411 memset(&data, 0, sizeof data);
1412 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1414 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1415 data.reg_num = MII_BMSR;
1416 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
/* BMSR_LSTATUS is the basic-mode status register's link-up bit. */
1420 *miimon = !!(data.val_out & BMSR_LSTATUS);
1422 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1425 struct ethtool_cmd ecmd;
1427 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1430 COVERAGE_INC(netdev_get_ethtool);
1431 memset(&ecmd, 0, sizeof ecmd);
1432 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1435 struct ethtool_value eval;
/* ETHTOOL_GLINK answers in a struct ethtool_value overlaid on ecmd. */
1437 memcpy(&eval, &ecmd, sizeof eval);
1438 *miimon = !!eval.data;
1440 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
/* Enables (interval > 0) or disables (interval == 0) periodic MII polling.
 * NOTE(review): gappy extract — return type, braces and blank lines missing. */
1448 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1449 long long int interval)
1451 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1453 ovs_mutex_lock(&netdev->mutex);
/* Clamp any positive interval to at least 100 ms to bound ioctl load. */
1454 interval = interval > 0 ? MAX(interval, 100) : 0;
1455 if (netdev->miimon_interval != interval) {
/* miimon_cnt is a global count of devices with miimon enabled; keep it in
 * sync with the enable/disable transition. */
1456 if (interval && !netdev->miimon_interval) {
1457 atomic_count_inc(&miimon_cnt);
1458 } else if (!interval && netdev->miimon_interval) {
1459 atomic_count_dec(&miimon_cnt);
1462 netdev->miimon_interval = interval;
/* Force an immediate poll on the next miimon_run() pass. */
1463 timer_set_expired(&netdev->miimon_timer);
1465 ovs_mutex_unlock(&netdev->mutex);
/* Polls MII link state for every open netdev-linux device whose miimon timer
 * has expired, notifying netdev change listeners on transitions.
 * NOTE(review): gappy extract — 'static void', braces, the 'bool miimon;'
 * declaration and blank lines are missing. */
1471 netdev_linux_miimon_run(void)
1473 struct shash device_shash;
1474 struct shash_node *node;
1476 shash_init(&device_shash);
1477 netdev_get_devices(&netdev_linux_class, &device_shash);
1478 SHASH_FOR_EACH (node, &device_shash) {
1479 struct netdev *netdev = node->data;
1480 struct netdev_linux *dev = netdev_linux_cast(netdev);
1483 ovs_mutex_lock(&dev->mutex);
1484 if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
1485 netdev_linux_get_miimon(dev->up.name, &miimon);
1486 if (miimon != dev->miimon) {
1487 dev->miimon = miimon;
1488 netdev_linux_changed(dev, dev->ifi_flags, 0);
1491 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1493 ovs_mutex_unlock(&dev->mutex);
/* netdev_get_devices() takes a reference on each device; drop it here. */
1494 netdev_close(netdev);
1497 shash_destroy(&device_shash);
/* Registers wakeups with the poll loop for every device with miimon enabled,
 * so miimon_run() is called when the next poll interval elapses.
 * NOTE(review): gappy extract — 'static void', braces and blanks missing. */
1501 netdev_linux_miimon_wait(void)
1503 struct shash device_shash;
1504 struct shash_node *node;
1506 shash_init(&device_shash);
1507 netdev_get_devices(&netdev_linux_class, &device_shash);
1508 SHASH_FOR_EACH (node, &device_shash) {
1509 struct netdev *netdev = node->data;
1510 struct netdev_linux *dev = netdev_linux_cast(netdev);
1512 ovs_mutex_lock(&dev->mutex);
1513 if (dev->miimon_interval > 0) {
1514 timer_wait(&dev->miimon_timer);
1516 ovs_mutex_unlock(&dev->mutex);
/* Drop the reference taken by netdev_get_devices(). */
1517 netdev_close(netdev);
1519 shash_destroy(&device_shash);
/* Exchanges the values pointed to by 'a' and 'b'. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    uint64_t tmp = *a;
    *a = *b;
    *b = tmp;
}
/* NOTE(review): gappy extract — 'static void', braces and at least one line
 * between tx_dropped and collisions (presumably 'dst->multicast = 0;' or a
 * blank — confirm against the full source) are missing. */
1530 /* Copies 'src' into 'dst', performing format conversion in the process.
1532 * 'src' is allowed to be misaligned. */
1534 netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1535 const struct ovs_vport_stats *src)
/* get_32aligned_u64() reads 64-bit fields that may only be 32-bit aligned,
 * because Netlink guarantees only 32-bit alignment. */
1537 dst->rx_packets = get_32aligned_u64(&src->rx_packets);
1538 dst->tx_packets = get_32aligned_u64(&src->tx_packets);
1539 dst->rx_bytes = get_32aligned_u64(&src->rx_bytes);
1540 dst->tx_bytes = get_32aligned_u64(&src->tx_bytes);
1541 dst->rx_errors = get_32aligned_u64(&src->rx_errors);
1542 dst->tx_errors = get_32aligned_u64(&src->tx_errors);
1543 dst->rx_dropped = get_32aligned_u64(&src->rx_dropped);
1544 dst->tx_dropped = get_32aligned_u64(&src->tx_dropped);
/* The vport layer does not track these detail counters; zero them so the
 * destination struct is fully defined. */
1546 dst->collisions = 0;
1547 dst->rx_length_errors = 0;
1548 dst->rx_over_errors = 0;
1549 dst->rx_crc_errors = 0;
1550 dst->rx_frame_errors = 0;
1551 dst->rx_fifo_errors = 0;
1552 dst->rx_missed_errors = 0;
1553 dst->tx_aborted_errors = 0;
1554 dst->tx_carrier_errors = 0;
1555 dst->tx_fifo_errors = 0;
1556 dst->tx_heartbeat_errors = 0;
1557 dst->tx_window_errors = 0;
/* Fetches stats for 'netdev' from the OVS datapath vport layer.
 * NOTE(review): gappy extract — 'static int', braces, the 'struct ofpbuf *buf'
 * declaration, error/cleanup handling (ofpbuf_delete) and returns are missing. */
1561 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1563 struct dpif_netlink_vport reply;
1567 error = dpif_netlink_vport_get(netdev_get_name(netdev), &reply, &buf);
1570 } else if (!reply.stats) {
/* A vport can exist without stats; the full source returns EOPNOTSUPP-style
 * failure here — confirm exact errno in the original. */
1575 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
/* Fetches vport stats, caching the error result so a device that is not an
 * OVS vport is not re-queried on every stats call.
 * NOTE(review): gappy extract — 'static void', braces and 'int error;' are
 * missing; caller holds netdev->mutex. */
1583 get_stats_via_vport(const struct netdev *netdev_,
1584 struct netdev_stats *stats)
1586 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1588 if (!netdev->vport_stats_error ||
1589 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1592 error = get_stats_via_vport__(netdev_, stats);
/* ENOENT just means "not a vport"; only warn for other failures. */
1593 if (error && error != ENOENT) {
1594 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1596 netdev_get_name(netdev_), ovs_strerror(error));
1598 netdev->vport_stats_error = error;
1599 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
/* NOTE(review): heavily gappy extract — the original has a three-way
 * if/else-if/else on (error, vport_stats_error) whose framing lines were
 * dropped, which is why L860/L861 look redundant here; also missing are
 * braces, '*stats = dev_stats;' style copies, and 'return error;'. */
1603 /* Retrieves current device stats for 'netdev-linux'. */
1605 netdev_linux_get_stats(const struct netdev *netdev_,
1606 struct netdev_stats *stats)
1608 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1609 struct netdev_stats dev_stats;
1612 ovs_mutex_lock(&netdev->mutex);
1613 get_stats_via_vport(netdev_, stats);
1614 error = get_stats_via_netlink(netdev_, &dev_stats);
1616 if (!netdev->vport_stats_error) {
1619 } else if (netdev->vport_stats_error) {
1620 /* stats not available from OVS then use netdev stats. */
1623 /* Use kernel netdev's packet and byte counts since vport's counters
1624 * do not reflect packet counts on the wire when GSO, TSO or GRO are
1626 stats->rx_packets = dev_stats.rx_packets;
1627 stats->rx_bytes = dev_stats.rx_bytes;
1628 stats->tx_packets = dev_stats.tx_packets;
1629 stats->tx_bytes = dev_stats.tx_bytes;
/* Error/drop counters are additive: combine vport and kernel views. */
1631 stats->rx_errors += dev_stats.rx_errors;
1632 stats->tx_errors += dev_stats.tx_errors;
1633 stats->rx_dropped += dev_stats.rx_dropped;
1634 stats->tx_dropped += dev_stats.tx_dropped;
1635 stats->multicast += dev_stats.multicast;
1636 stats->collisions += dev_stats.collisions;
1637 stats->rx_length_errors += dev_stats.rx_length_errors;
1638 stats->rx_over_errors += dev_stats.rx_over_errors;
1639 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1640 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1641 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1642 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1643 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1644 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1645 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1646 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1647 stats->tx_window_errors += dev_stats.tx_window_errors;
1649 ovs_mutex_unlock(&netdev->mutex);
/* NOTE(review): heavily gappy extract — as with netdev_linux_get_stats, the
 * if/else-if/else framing lines are missing (hence the seemingly redundant
 * L895/L896 conditions), along with braces and the final return. */
1654 /* Retrieves current device stats for 'netdev-tap' netdev or
1655 * netdev-internal. */
1657 netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
1659 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1660 struct netdev_stats dev_stats;
1663 ovs_mutex_lock(&netdev->mutex);
1664 get_stats_via_vport(netdev_, stats);
1665 error = get_stats_via_netlink(netdev_, &dev_stats);
1667 if (!netdev->vport_stats_error) {
1670 } else if (netdev->vport_stats_error) {
1671 /* Transmit and receive stats will appear to be swapped relative to the
1672 * other ports since we are the one sending the data, not a remote
1673 * computer. For consistency, we swap them back here. This does not
1674 * apply if we are getting stats from the vport layer because it always
1675 * tracks stats from the perspective of the switch. */
1678 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1679 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1680 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1681 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
/* Detail counters are host-side noise for a tap; zero them out. */
1682 stats->rx_length_errors = 0;
1683 stats->rx_over_errors = 0;
1684 stats->rx_crc_errors = 0;
1685 stats->rx_frame_errors = 0;
1686 stats->rx_fifo_errors = 0;
1687 stats->rx_missed_errors = 0;
1688 stats->tx_aborted_errors = 0;
1689 stats->tx_carrier_errors = 0;
1690 stats->tx_fifo_errors = 0;
1691 stats->tx_heartbeat_errors = 0;
1692 stats->tx_window_errors = 0;
1694 /* Use kernel netdev's packet and byte counts since vport counters
1695 * do not reflect packet counts on the wire when GSO, TSO or GRO
/* rx/tx deliberately crossed: the kernel's view of a tap is the mirror of
 * the switch's view (see comment at original lines 1671-1675). */
1697 stats->rx_packets = dev_stats.tx_packets;
1698 stats->rx_bytes = dev_stats.tx_bytes;
1699 stats->tx_packets = dev_stats.rx_packets;
1700 stats->tx_bytes = dev_stats.rx_bytes;
1702 stats->rx_dropped += dev_stats.tx_dropped;
1703 stats->tx_dropped += dev_stats.rx_dropped;
1705 stats->rx_errors += dev_stats.tx_errors;
1706 stats->tx_errors += dev_stats.rx_errors;
1708 stats->multicast += dev_stats.multicast;
1709 stats->collisions += dev_stats.collisions;
1711 ovs_mutex_unlock(&netdev->mutex);
1717 netdev_internal_get_stats(const struct netdev *netdev_,
1718 struct netdev_stats *stats)
1720 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1723 ovs_mutex_lock(&netdev->mutex);
1724 get_stats_via_vport(netdev_, stats);
1725 error = netdev->vport_stats_error;
1726 ovs_mutex_unlock(&netdev->mutex);
/* Queries ethtool (ETHTOOL_GSET) and translates the result into OVS
 * NETDEV_F_* bitmaps cached on 'netdev' (supported / advertised / current).
 * NOTE(review): gappy extract — 'static void', braces, early-return/goto
 * lines, the 'uint32_t speed' declaration that feeds the checks at original
 * line 1830, and the autoneg condition before line 1855 are all missing. */
1732 netdev_linux_read_features(struct netdev_linux *netdev)
1734 struct ethtool_cmd ecmd;
1738 if (netdev->cache_valid & VALID_FEATURES) {
1742 COVERAGE_INC(netdev_get_ethtool);
1743 memset(&ecmd, 0, sizeof ecmd);
1744 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
1745 ETHTOOL_GSET, "ETHTOOL_GSET");
1750 /* Supported features. */
1751 netdev->supported = 0;
1752 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1753 netdev->supported |= NETDEV_F_10MB_HD;
1755 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1756 netdev->supported |= NETDEV_F_10MB_FD;
1758 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1759 netdev->supported |= NETDEV_F_100MB_HD;
1761 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1762 netdev->supported |= NETDEV_F_100MB_FD;
1764 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1765 netdev->supported |= NETDEV_F_1GB_HD;
1767 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1768 netdev->supported |= NETDEV_F_1GB_FD;
1770 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1771 netdev->supported |= NETDEV_F_10GB_FD;
1773 if (ecmd.supported & SUPPORTED_TP) {
1774 netdev->supported |= NETDEV_F_COPPER;
1776 if (ecmd.supported & SUPPORTED_FIBRE) {
1777 netdev->supported |= NETDEV_F_FIBER;
1779 if (ecmd.supported & SUPPORTED_Autoneg) {
1780 netdev->supported |= NETDEV_F_AUTONEG;
1782 if (ecmd.supported & SUPPORTED_Pause) {
1783 netdev->supported |= NETDEV_F_PAUSE;
1785 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1786 netdev->supported |= NETDEV_F_PAUSE_ASYM;
1789 /* Advertised features. */
1790 netdev->advertised = 0;
1791 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1792 netdev->advertised |= NETDEV_F_10MB_HD;
1794 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1795 netdev->advertised |= NETDEV_F_10MB_FD;
1797 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1798 netdev->advertised |= NETDEV_F_100MB_HD;
1800 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1801 netdev->advertised |= NETDEV_F_100MB_FD;
1803 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1804 netdev->advertised |= NETDEV_F_1GB_HD;
1806 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1807 netdev->advertised |= NETDEV_F_1GB_FD;
1809 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1810 netdev->advertised |= NETDEV_F_10GB_FD;
1812 if (ecmd.advertising & ADVERTISED_TP) {
1813 netdev->advertised |= NETDEV_F_COPPER;
1815 if (ecmd.advertising & ADVERTISED_FIBRE) {
1816 netdev->advertised |= NETDEV_F_FIBER;
1818 if (ecmd.advertising & ADVERTISED_Autoneg) {
1819 netdev->advertised |= NETDEV_F_AUTONEG;
1821 if (ecmd.advertising & ADVERTISED_Pause) {
1822 netdev->advertised |= NETDEV_F_PAUSE;
1824 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1825 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
1828 /* Current settings. */
/* NOTE(review): speeds above 10G are compared against raw numbers (40000,
 * 100000, 1000000) rather than SPEED_* constants — presumably because those
 * constants were not yet in the minimum supported kernel headers. */
1830 if (speed == SPEED_10) {
1831 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1832 } else if (speed == SPEED_100) {
1833 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1834 } else if (speed == SPEED_1000) {
1835 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1836 } else if (speed == SPEED_10000) {
1837 netdev->current = NETDEV_F_10GB_FD;
1838 } else if (speed == 40000) {
1839 netdev->current = NETDEV_F_40GB_FD;
1840 } else if (speed == 100000) {
1841 netdev->current = NETDEV_F_100GB_FD;
1842 } else if (speed == 1000000) {
1843 netdev->current = NETDEV_F_1TB_FD;
1845 netdev->current = 0;
1848 if (ecmd.port == PORT_TP) {
1849 netdev->current |= NETDEV_F_COPPER;
1850 } else if (ecmd.port == PORT_FIBRE) {
1851 netdev->current |= NETDEV_F_FIBER;
/* Guarded by an autoneg check in the full source (line 1854 missing here). */
1855 netdev->current |= NETDEV_F_AUTONEG;
1859 netdev->cache_valid |= VALID_FEATURES;
1860 netdev->get_features_error = error;
/* NOTE(review): gappy extract — 'static int', braces, 'int error;' and
 * 'return error;' are missing. */
1863 /* Stores the features supported by 'netdev' into of '*current', '*advertised',
1864 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
1865 * Returns 0 if successful, otherwise a positive errno value. */
1867 netdev_linux_get_features(const struct netdev *netdev_,
1868 enum netdev_features *current,
1869 enum netdev_features *advertised,
1870 enum netdev_features *supported,
1871 enum netdev_features *peer)
1873 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1876 ovs_mutex_lock(&netdev->mutex);
/* Populates/refreshes the cached feature bitmaps (no-op if already cached). */
1877 netdev_linux_read_features(netdev);
1878 if (!netdev->get_features_error) {
1879 *current = netdev->current;
1880 *advertised = netdev->advertised;
1881 *supported = netdev->supported;
1882 *peer = 0; /* XXX */
1884 error = netdev->get_features_error;
1885 ovs_mutex_unlock(&netdev->mutex);
/* NOTE(review): gappy extract — 'static int', braces, the error-check after
 * ETHTOOL_GSET (goto exit), the exit label and 'return error;' are missing.
 * Pattern: read current ethtool settings, rewrite only the 'advertising'
 * bitmap from the NETDEV_F_* flags, then write back with ETHTOOL_SSET. */
1890 /* Set the features advertised by 'netdev' to 'advertise'. */
1892 netdev_linux_set_advertisements(struct netdev *netdev_,
1893 enum netdev_features advertise)
1895 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1896 struct ethtool_cmd ecmd;
1899 ovs_mutex_lock(&netdev->mutex);
1901 COVERAGE_INC(netdev_get_ethtool);
1902 memset(&ecmd, 0, sizeof ecmd);
1903 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1904 ETHTOOL_GSET, "ETHTOOL_GSET");
1909 ecmd.advertising = 0;
1910 if (advertise & NETDEV_F_10MB_HD) {
1911 ecmd.advertising |= ADVERTISED_10baseT_Half;
1913 if (advertise & NETDEV_F_10MB_FD) {
1914 ecmd.advertising |= ADVERTISED_10baseT_Full;
1916 if (advertise & NETDEV_F_100MB_HD) {
1917 ecmd.advertising |= ADVERTISED_100baseT_Half;
1919 if (advertise & NETDEV_F_100MB_FD) {
1920 ecmd.advertising |= ADVERTISED_100baseT_Full;
1922 if (advertise & NETDEV_F_1GB_HD) {
1923 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1925 if (advertise & NETDEV_F_1GB_FD) {
1926 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1928 if (advertise & NETDEV_F_10GB_FD) {
1929 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1931 if (advertise & NETDEV_F_COPPER) {
1932 ecmd.advertising |= ADVERTISED_TP;
1934 if (advertise & NETDEV_F_FIBER) {
1935 ecmd.advertising |= ADVERTISED_FIBRE;
1937 if (advertise & NETDEV_F_AUTONEG) {
1938 ecmd.advertising |= ADVERTISED_Autoneg;
1940 if (advertise & NETDEV_F_PAUSE) {
1941 ecmd.advertising |= ADVERTISED_Pause;
1943 if (advertise & NETDEV_F_PAUSE_ASYM) {
1944 ecmd.advertising |= ADVERTISED_Asym_Pause;
1946 COVERAGE_INC(netdev_set_ethtool);
1947 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1948 ETHTOOL_SSET, "ETHTOOL_SSET");
1951 ovs_mutex_unlock(&netdev->mutex);
/* NOTE(review): gappy extract — braces, 'goto out' lines after each failure,
 * the kbits_rate!=0 guard around installing the qdisc/policer, the 'out'
 * label and 'return error;' are missing. */
1955 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1956 * successful, otherwise a positive errno value. */
1958 netdev_linux_set_policing(struct netdev *netdev_,
1959 uint32_t kbits_rate, uint32_t kbits_burst)
1961 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1962 const char *netdev_name = netdev_get_name(netdev_);
1965 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1966 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1967 : kbits_burst); /* Stick with user-specified value. */
1969 ovs_mutex_lock(&netdev->mutex);
1970 if (netdev->cache_valid & VALID_POLICING) {
1971 error = netdev->netdev_policing_error;
1972 if (error || (netdev->kbits_rate == kbits_rate &&
1973 netdev->kbits_burst == kbits_burst)) {
1974 /* Assume that settings haven't changed since we last set them. */
1977 netdev->cache_valid &= ~VALID_POLICING;
1980 COVERAGE_INC(netdev_set_policing);
1981 /* Remove any existing ingress qdisc. */
1982 error = tc_add_del_ingress_qdisc(netdev_, false);
1984 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1985 netdev_name, ovs_strerror(error));
/* Re-add an ingress qdisc and attach the policer only when a non-zero rate
 * is requested (the guard line is missing from this extract). */
1990 error = tc_add_del_ingress_qdisc(netdev_, true);
1992 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1993 netdev_name, ovs_strerror(error));
1997 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
1999 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
2000 netdev_name, ovs_strerror(error));
2005 netdev->kbits_rate = kbits_rate;
2006 netdev->kbits_burst = kbits_burst;
/* Cache success and ENODEV so a dead device is not hammered with retries. */
2009 if (!error || error == ENODEV) {
2010 netdev->netdev_policing_error = error;
2011 netdev->cache_valid |= VALID_POLICING;
2013 ovs_mutex_unlock(&netdev->mutex);
/* Adds the OVS name of every installable tc discipline to 'types'.
 * NOTE(review): gappy extract — the 'struct sset *types' parameter line,
 * return type, braces and 'return 0;' are missing. */
2018 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
2021 const struct tc_ops *const *opsp;
/* 'tcs' is the file-level NULL-terminated table of tc implementations. */
2023 for (opsp = tcs; *opsp != NULL; opsp++) {
2024 const struct tc_ops *ops = *opsp;
/* Skip internal disciplines: those without an install hook or with an
 * empty OVS-visible name. */
2025 if (ops->tc_install && ops->ovs_name[0] != '\0') {
2026 sset_add(types, ops->ovs_name);
2032 static const struct tc_ops *
2033 tc_lookup_ovs_name(const char *name)
2035 const struct tc_ops *const *opsp;
2037 for (opsp = tcs; *opsp != NULL; opsp++) {
2038 const struct tc_ops *ops = *opsp;
2039 if (!strcmp(name, ops->ovs_name)) {
2046 static const struct tc_ops *
2047 tc_lookup_linux_name(const char *name)
2049 const struct tc_ops *const *opsp;
2051 for (opsp = tcs; *opsp != NULL; opsp++) {
2052 const struct tc_ops *ops = *opsp;
2053 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
2060 static struct tc_queue *
2061 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
2064 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2065 struct tc_queue *queue;
2067 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
2068 if (queue->queue_id == queue_id) {
/* Convenience wrapper: finds the queue with 'queue_id' on 'netdev', hashing
 * the id itself before delegating to tc_find_queue__(). */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
}
2082 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
2084 struct netdev_qos_capabilities *caps)
2086 const struct tc_ops *ops = tc_lookup_ovs_name(type);
2090 caps->n_queues = ops->n_queues;
/* Reports the currently installed QoS discipline name and its details.
 * NOTE(review): gappy extract — 'static int', braces, the success check on
 * tc_query_qdisc()'s result, the ': 0' arm of the ternary, and 'return error;'
 * are missing. */
2095 netdev_linux_get_qos(const struct netdev *netdev_,
2096 const char **typep, struct smap *details)
2098 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2101 ovs_mutex_lock(&netdev->mutex);
/* Ensures netdev->tc is populated from the kernel before reading it. */
2102 error = tc_query_qdisc(netdev_);
2104 *typep = netdev->tc->ops->ovs_name;
2105 error = (netdev->tc->ops->qdisc_get
2106 ? netdev->tc->ops->qdisc_get(netdev_, details)
2109 ovs_mutex_unlock(&netdev->mutex);
/* Installs QoS discipline 'type' (with 'details') on 'netdev_', replacing any
 * existing discipline of a different type.
 * NOTE(review): gappy extract — 'static int', braces, the EOPNOTSUPP return
 * for unknown types, error checks between steps, the exit label and final
 * return are missing. */
2115 netdev_linux_set_qos(struct netdev *netdev_,
2116 const char *type, const struct smap *details)
2118 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2119 const struct tc_ops *new_ops;
2122 new_ops = tc_lookup_ovs_name(type);
2123 if (!new_ops || !new_ops->tc_install) {
2127 ovs_mutex_lock(&netdev->mutex);
2128 error = tc_query_qdisc(netdev_);
/* Same discipline already installed: just update its parameters. */
2133 if (new_ops == netdev->tc->ops) {
2134 error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
2136 /* Delete existing qdisc. */
2137 error = tc_del_qdisc(netdev_);
2141 ovs_assert(netdev->tc == NULL);
2143 /* Install new qdisc. */
2144 error = new_ops->tc_install(netdev_, details);
/* Invariant: install success <=> netdev->tc populated. */
2145 ovs_assert((error == 0) == (netdev->tc != NULL));
2149 ovs_mutex_unlock(&netdev->mutex);
/* Fetches configuration details of queue 'queue_id' into 'details'.
 * NOTE(review): gappy extract — 'static int', braces, the 'queue ?' part of
 * the ternary (ENOENT arm at original 2166), and 'return error;' are missing. */
2154 netdev_linux_get_queue(const struct netdev *netdev_,
2155 unsigned int queue_id, struct smap *details)
2157 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2160 ovs_mutex_lock(&netdev->mutex);
2161 error = tc_query_qdisc(netdev_);
2163 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2165 ? netdev->tc->ops->class_get(netdev_, queue, details)
2168 ovs_mutex_unlock(&netdev->mutex);
/* Configures queue 'queue_id' with 'details'; fails with EINVAL-style error
 * (the ': ...' arm is missing from this extract) when the id is out of range
 * or the discipline has no class_set hook.
 * NOTE(review): gappy extract — 'static int', braces and 'return error;'
 * are missing. */
2174 netdev_linux_set_queue(struct netdev *netdev_,
2175 unsigned int queue_id, const struct smap *details)
2177 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2180 ovs_mutex_lock(&netdev->mutex);
2181 error = tc_query_qdisc(netdev_);
2183 error = (queue_id < netdev->tc->ops->n_queues
2184 && netdev->tc->ops->class_set
2185 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
2188 ovs_mutex_unlock(&netdev->mutex);
/* Deletes queue 'queue_id' if the installed discipline supports deletion.
 * NOTE(review): gappy extract — 'static int', braces, the queue-NULL/ENOENT
 * arm, the EOPNOTSUPP else-branch and 'return error;' are missing. */
2194 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
2196 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2199 ovs_mutex_lock(&netdev->mutex);
2200 error = tc_query_qdisc(netdev_);
2202 if (netdev->tc->ops->class_delete) {
2203 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2205 ? netdev->tc->ops->class_delete(netdev_, queue)
2211 ovs_mutex_unlock(&netdev->mutex);
/* Fetches statistics for queue 'queue_id' into '*stats'.
 * NOTE(review): gappy extract — 'static int', braces, the queue-NULL/ENOENT
 * handling, the EOPNOTSUPP else-branch, the trailing 'stats' argument of the
 * class_get_stats call, and 'return error;' are missing. */
2217 netdev_linux_get_queue_stats(const struct netdev *netdev_,
2218 unsigned int queue_id,
2219 struct netdev_queue_stats *stats)
2221 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2224 ovs_mutex_lock(&netdev->mutex);
2225 error = tc_query_qdisc(netdev_);
2227 if (netdev->tc->ops->class_get_stats) {
2228 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
/* Creation time comes from our bookkeeping, not from the kernel. */
2230 stats->created = queue->created;
2231 error = netdev->tc->ops->class_get_stats(netdev_, queue,
2240 ovs_mutex_unlock(&netdev->mutex);
/* State carried across a netlink class dump (see start_queue_dump()).
 * NOTE(review): the closing members are missing from this extract — the code
 * below also uses a 'struct ofpbuf buf' member (see L1211/L1213); confirm
 * the full definition in the original source. */
2245 struct queue_dump_state {
2246 struct nl_dump dump;
/* Begins a RTM_GETTCLASS netlink dump of 'netdev''s traffic classes into
 * '*state'.  NOTE(review): gappy extract — the 'static bool' return type,
 * braces, the NULL-check/false-return after tc_make_request, and the final
 * 'return true;' are missing. */
2251 start_queue_dump(const struct netdev *netdev, struct queue_dump_state *state)
2253 struct ofpbuf request;
2254 struct tcmsg *tcmsg;
2256 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
/* tcm_parent == 0 asks the kernel for classes of every parent. */
2260 tcmsg->tcm_parent = 0;
2261 nl_dump_start(&state->dump, NETLINK_ROUTE, &request);
2262 ofpbuf_uninit(&request);
2264 ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
2269 finish_queue_dump(struct queue_dump_state *state)
2271 ofpbuf_uninit(&state->buf);
2272 return nl_dump_done(&state->dump);
/* Iterator state for the queue-dump API (queue_dump_start/next/done).
 * NOTE(review): remaining members are missing from this extract — the code
 * below also reads 'n_queues' and 'cur_queue' (see L1225-L1226, L1236-L1237);
 * confirm the full definition in the original source. */
2275 struct netdev_linux_queue_state {
2276 unsigned int *queues;
/* Snapshots the ids of all queues on 'netdev_' into a freshly allocated
 * iterator state stored in '*statep'.
 * NOTE(review): gappy extract — 'static int', braces, 'int error;', the
 * 'size_t i = 0;' style index declaration, the EOPNOTSUPP else-branch and
 * 'return error;' are missing. */
2282 netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
2284 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2287 ovs_mutex_lock(&netdev->mutex);
2288 error = tc_query_qdisc(netdev_);
2290 if (netdev->tc->ops->class_get) {
2291 struct netdev_linux_queue_state *state;
2292 struct tc_queue *queue;
2295 *statep = state = xmalloc(sizeof *state);
2296 state->n_queues = hmap_count(&netdev->tc->queues);
2297 state->cur_queue = 0;
2298 state->queues = xmalloc(state->n_queues * sizeof *state->queues);
/* Copy ids rather than pointers so the dump survives queue churn. */
2301 HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
2302 state->queues[i++] = queue->queue_id;
2308 ovs_mutex_unlock(&netdev->mutex);
/* Advances the iterator: emits the next queue id that still exists and its
 * details.  NOTE(review): gappy extract — 'static int', braces, the
 * 'int error = EOF;' style initialization, the 'if (queue)' guard, the break
 * after a successful fetch, and 'return error;' are missing. */
2314 netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
2315 unsigned int *queue_idp, struct smap *details)
2317 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2318 struct netdev_linux_queue_state *state = state_;
2321 ovs_mutex_lock(&netdev->mutex);
/* Skip ids whose queues were deleted since dump_start() snapshotted them. */
2322 while (state->cur_queue < state->n_queues) {
2323 unsigned int queue_id = state->queues[state->cur_queue++];
2324 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2327 *queue_idp = queue_id;
2328 error = netdev->tc->ops->class_get(netdev_, queue, details);
2332 ovs_mutex_unlock(&netdev->mutex);
2338 netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
2341 struct netdev_linux_queue_state *state = state_;
2343 free(state->queues);
/* Invokes 'cb' with stats for each queue, driven by a netlink class dump.
 * NOTE(review): heavily gappy extract — 'static int', braces, the
 * 'struct ofpbuf msg' / 'int retval' declarations, EOPNOTSUPP assignment,
 * the error-folding after the loop, and 'return error;' are missing. */
2349 netdev_linux_dump_queue_stats(const struct netdev *netdev_,
2350 netdev_dump_queue_stats_cb *cb, void *aux)
2352 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2355 ovs_mutex_lock(&netdev->mutex);
2356 error = tc_query_qdisc(netdev_);
2358 struct queue_dump_state state;
2360 if (!netdev->tc->ops->class_dump_stats) {
2362 } else if (!start_queue_dump(netdev_, &state)) {
2368 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
2369 retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
2376 retval = finish_queue_dump(&state);
2382 ovs_mutex_unlock(&netdev->mutex);
/* Reports the cached IPv4 address/netmask, querying the kernel on first use.
 * Returns EADDRNOTAVAIL when no address is assigned.
 * NOTE(review): gappy extract — 'static int', braces, the success check
 * between the two ioctls, error propagation into the cache, and
 * 'return error;' are missing. */
2388 netdev_linux_get_in4(const struct netdev *netdev_,
2389 struct in_addr *address, struct in_addr *netmask)
2391 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2394 ovs_mutex_lock(&netdev->mutex);
2395 if (!(netdev->cache_valid & VALID_IN4)) {
2396 error = netdev_linux_get_ipv4(netdev_, &netdev->address,
2397 SIOCGIFADDR, "SIOCGIFADDR");
2399 error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
2400 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2402 netdev->cache_valid |= VALID_IN4;
/* INADDR_ANY in the cache means "no address assigned". */
2410 if (netdev->address.s_addr != INADDR_ANY) {
2411 *address = netdev->address;
2412 *netmask = netdev->netmask;
2414 error = EADDRNOTAVAIL;
2417 ovs_mutex_unlock(&netdev->mutex);
/* Assigns IPv4 'address'/'netmask' to 'netdev_' via SIOCSIFADDR and
 * SIOCSIFNETMASK, updating the in-memory cache on success.
 * NOTE(review): gappy extract — 'static int', braces, the '!error' guard
 * around the cache update, and 'return error;' are missing. */
2423 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2424 struct in_addr netmask)
2426 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2429 ovs_mutex_lock(&netdev->mutex);
2430 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2432 netdev->cache_valid |= VALID_IN4;
2433 netdev->address = address;
2434 netdev->netmask = netmask;
/* The netmask ioctl only makes sense once a real address is set. */
2435 if (address.s_addr != INADDR_ANY) {
2436 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2437 "SIOCSIFNETMASK", netmask);
2440 ovs_mutex_unlock(&netdev->mutex);
/* Parses one line of /proc/net/if_inet6 (32 hex digits of address, four
 * hex metadata columns we ignore, then the interface name) into '*in6' and
 * 'ifname'.  Returns the ovs_scan() success result.
 * NOTE(review): gappy extract — the 'static bool' return type, braces, and
 * the final 'ifname);' argument line of the ovs_scan call are missing. */
2446 parse_if_inet6_line(const char *line,
2447 struct in6_addr *in6, char ifname[16 + 1])
2449 uint8_t *s6 = in6->s6_addr;
2450 #define X8 "%2"SCNx8
2451 return ovs_scan(line,
2452 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
2453 "%*x %*x %*x %*x %16s\n",
2454 &s6[0], &s6[1], &s6[2], &s6[3],
2455 &s6[4], &s6[5], &s6[6], &s6[7],
2456 &s6[8], &s6[9], &s6[10], &s6[11],
2457 &s6[12], &s6[13], &s6[14], &s6[15],
/* NOTE(review): gappy extract — 'static bool', braces, the 'char line[128]'
 * and 'FILE *file' declarations, the fclose() call, the copy into '*in6',
 * and the boolean return are missing. */
2461 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2462 * 'in6' is non-null) and returns true. Otherwise, returns false. */
2464 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2466 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2468 ovs_mutex_lock(&netdev->mutex);
2469 if (!(netdev->cache_valid & VALID_IN6)) {
/* Default to "no address"; overwritten if a matching line is found. */
2473 netdev->in6 = in6addr_any;
2475 file = fopen("/proc/net/if_inet6", "r");
2477 const char *name = netdev_get_name(netdev_);
2478 while (fgets(line, sizeof line, file)) {
2479 struct in6_addr in6_tmp;
2480 char ifname[16 + 1];
2481 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2482 && !strcmp(name, ifname))
2484 netdev->in6 = in6_tmp;
2490 netdev->cache_valid |= VALID_IN6;
2493 ovs_mutex_unlock(&netdev->mutex);
/* Writes an AF_INET sockaddr for 'addr' (port 0) into '*sa', zeroing any
 * trailing bytes of the generic sockaddr first. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in sin;

    memset(&sin, 0, sizeof sin);
    sin.sin_family = AF_INET;
    sin.sin_addr = addr;
    sin.sin_port = 0;

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &sin, sizeof sin);
}
/* Issues address-setting ioctl 'ioctl_nr' (e.g. SIOCSIFADDR) for 'netdev',
 * packing 'addr' into ifr.ifr_addr as an AF_INET sockaddr.
 * NOTE(review): gappy extract — 'static int', braces, the 'struct ifreq ifr'
 * declaration and the trailing 'ioctl_name);' argument line are missing. */
2512 do_set_addr(struct netdev *netdev,
2513 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2517 make_in4_sockaddr(&ifr.ifr_addr, addr);
2518 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2522 /* Adds 'router' as a default IP gateway. */
2524 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2526 struct in_addr any = { INADDR_ANY };
2530 memset(&rt, 0, sizeof rt);
2531 make_in4_sockaddr(&rt.rt_dst, any);
2532 make_in4_sockaddr(&rt.rt_gateway, router);
2533 make_in4_sockaddr(&rt.rt_genmask, any);
2534 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2535 error = af_inet_ioctl(SIOCADDRT, &rt);
2537 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
2543 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2546 static const char fn[] = "/proc/net/route";
2551 *netdev_name = NULL;
2552 stream = fopen(fn, "r");
2553 if (stream == NULL) {
2554 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
2559 while (fgets(line, sizeof line, stream)) {
2562 ovs_be32 dest, gateway, mask;
2563 int refcnt, metric, mtu;
2564 unsigned int flags, use, window, irtt;
2567 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2569 iface, &dest, &gateway, &flags, &refcnt,
2570 &use, &metric, &mask, &mtu, &window, &irtt)) {
2571 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2575 if (!(flags & RTF_UP)) {
2576 /* Skip routes that aren't up. */
2580 /* The output of 'dest', 'mask', and 'gateway' were given in
2581 * network byte order, so we don't need need any endian
2582 * conversions here. */
2583 if ((dest & mask) == (host->s_addr & mask)) {
2585 /* The host is directly reachable. */
2586 next_hop->s_addr = 0;
2588 /* To reach the host, we must go through a gateway. */
2589 next_hop->s_addr = gateway;
2591 *netdev_name = xstrdup(iface);
2603 netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
2605 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2608 ovs_mutex_lock(&netdev->mutex);
2609 if (!(netdev->cache_valid & VALID_DRVINFO)) {
2610 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
2612 COVERAGE_INC(netdev_get_ethtool);
2613 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2614 error = netdev_linux_do_ethtool(netdev->up.name,
2617 "ETHTOOL_GDRVINFO");
2619 netdev->cache_valid |= VALID_DRVINFO;
2624 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2625 smap_add(smap, "driver_version", netdev->drvinfo.version);
2626 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
2628 ovs_mutex_unlock(&netdev->mutex);
2634 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2637 smap_add(smap, "driver_name", "openvswitch");
2641 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2642 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2643 * returns 0. Otherwise, it returns a positive errno value; in particular,
2644 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2646 netdev_linux_arp_lookup(const struct netdev *netdev,
2647 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2650 struct sockaddr_in sin;
2653 memset(&r, 0, sizeof r);
2654 memset(&sin, 0, sizeof sin);
2655 sin.sin_family = AF_INET;
2656 sin.sin_addr.s_addr = ip;
2658 memcpy(&r.arp_pa, &sin, sizeof sin);
2659 r.arp_ha.sa_family = ARPHRD_ETHER;
2661 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2662 COVERAGE_INC(netdev_arp_lookup);
2663 retval = af_inet_ioctl(SIOCGARP, &r);
2665 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2666 } else if (retval != ENXIO) {
2667 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2668 netdev_get_name(netdev), IP_ARGS(ip),
2669 ovs_strerror(retval));
2675 nd_to_iff_flags(enum netdev_flags nd)
2678 if (nd & NETDEV_UP) {
2681 if (nd & NETDEV_PROMISC) {
2684 if (nd & NETDEV_LOOPBACK) {
2685 iff |= IFF_LOOPBACK;
2691 iff_to_nd_flags(int iff)
2693 enum netdev_flags nd = 0;
2697 if (iff & IFF_PROMISC) {
2698 nd |= NETDEV_PROMISC;
2700 if (iff & IFF_LOOPBACK) {
2701 nd |= NETDEV_LOOPBACK;
2707 update_flags(struct netdev_linux *netdev, enum netdev_flags off,
2708 enum netdev_flags on, enum netdev_flags *old_flagsp)
2709 OVS_REQUIRES(netdev->mutex)
2711 int old_flags, new_flags;
2714 old_flags = netdev->ifi_flags;
2715 *old_flagsp = iff_to_nd_flags(old_flags);
2716 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2717 if (new_flags != old_flags) {
2718 error = set_flags(netdev_get_name(&netdev->up), new_flags);
2719 get_flags(&netdev->up, &netdev->ifi_flags);
2726 netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2727 enum netdev_flags on, enum netdev_flags *old_flagsp)
2729 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2732 ovs_mutex_lock(&netdev->mutex);
2733 error = update_flags(netdev, off, on, old_flagsp);
2734 ovs_mutex_unlock(&netdev->mutex);
2739 #define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS, \
2740 GET_FEATURES, GET_STATUS) \
2746 netdev_linux_wait, \
2748 netdev_linux_alloc, \
2750 netdev_linux_destruct, \
2751 netdev_linux_dealloc, \
2752 NULL, /* get_config */ \
2753 NULL, /* set_config */ \
2754 NULL, /* get_tunnel_config */ \
2755 NULL, /* build header */ \
2756 NULL, /* push header */ \
2757 NULL, /* pop header */ \
2758 NULL, /* get_numa_id */ \
2759 NULL, /* set_multiq */ \
2761 netdev_linux_send, \
2762 netdev_linux_send_wait, \
2764 netdev_linux_set_etheraddr, \
2765 netdev_linux_get_etheraddr, \
2766 netdev_linux_get_mtu, \
2767 netdev_linux_set_mtu, \
2768 netdev_linux_get_ifindex, \
2769 netdev_linux_get_carrier, \
2770 netdev_linux_get_carrier_resets, \
2771 netdev_linux_set_miimon_interval, \
2775 netdev_linux_set_advertisements, \
2777 netdev_linux_set_policing, \
2778 netdev_linux_get_qos_types, \
2779 netdev_linux_get_qos_capabilities, \
2780 netdev_linux_get_qos, \
2781 netdev_linux_set_qos, \
2782 netdev_linux_get_queue, \
2783 netdev_linux_set_queue, \
2784 netdev_linux_delete_queue, \
2785 netdev_linux_get_queue_stats, \
2786 netdev_linux_queue_dump_start, \
2787 netdev_linux_queue_dump_next, \
2788 netdev_linux_queue_dump_done, \
2789 netdev_linux_dump_queue_stats, \
2791 netdev_linux_get_in4, \
2792 netdev_linux_set_in4, \
2793 netdev_linux_get_in6, \
2794 netdev_linux_add_router, \
2795 netdev_linux_get_next_hop, \
2797 netdev_linux_arp_lookup, \
2799 netdev_linux_update_flags, \
2801 netdev_linux_rxq_alloc, \
2802 netdev_linux_rxq_construct, \
2803 netdev_linux_rxq_destruct, \
2804 netdev_linux_rxq_dealloc, \
2805 netdev_linux_rxq_recv, \
2806 netdev_linux_rxq_wait, \
2807 netdev_linux_rxq_drain, \
2810 const struct netdev_class netdev_linux_class =
2813 netdev_linux_construct,
2814 netdev_linux_get_stats,
2815 netdev_linux_get_features,
2816 netdev_linux_get_status);
2818 const struct netdev_class netdev_tap_class =
2821 netdev_linux_construct_tap,
2822 netdev_tap_get_stats,
2823 netdev_linux_get_features,
2824 netdev_linux_get_status);
2826 const struct netdev_class netdev_internal_class =
2829 netdev_linux_construct,
2830 netdev_internal_get_stats,
2831 NULL, /* get_features */
2832 netdev_internal_get_status);
2834 /* HTB traffic control class. */
2836 #define HTB_N_QUEUES 0xf000
2840 unsigned int max_rate; /* In bytes/s. */
2844 struct tc_queue tc_queue;
2845 unsigned int min_rate; /* In bytes/s. */
2846 unsigned int max_rate; /* In bytes/s. */
2847 unsigned int burst; /* In bytes. */
2848 unsigned int priority; /* Lower values are higher priorities. */
2852 htb_get__(const struct netdev *netdev_)
2854 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2855 return CONTAINER_OF(netdev->tc, struct htb, tc);
2859 htb_install__(struct netdev *netdev_, uint64_t max_rate)
2861 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2864 htb = xmalloc(sizeof *htb);
2865 tc_init(&htb->tc, &tc_ops_htb);
2866 htb->max_rate = max_rate;
2868 netdev->tc = &htb->tc;
2871 /* Create an HTB qdisc.
2873 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2875 htb_setup_qdisc__(struct netdev *netdev)
2878 struct tc_htb_glob opt;
2879 struct ofpbuf request;
2880 struct tcmsg *tcmsg;
2882 tc_del_qdisc(netdev);
2884 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2885 NLM_F_EXCL | NLM_F_CREATE, &request);
2889 tcmsg->tcm_handle = tc_make_handle(1, 0);
2890 tcmsg->tcm_parent = TC_H_ROOT;
2892 nl_msg_put_string(&request, TCA_KIND, "htb");
2894 memset(&opt, 0, sizeof opt);
2895 opt.rate2quantum = 10;
2899 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2900 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2901 nl_msg_end_nested(&request, opt_offset);
2903 return tc_transact(&request, NULL);
2906 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2907 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2909 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2910 unsigned int parent, struct htb_class *class)
2913 struct tc_htb_opt opt;
2914 struct ofpbuf request;
2915 struct tcmsg *tcmsg;
2919 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
2921 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2922 netdev_get_name(netdev));
2926 memset(&opt, 0, sizeof opt);
2927 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2928 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2929 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2930 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2931 opt.prio = class->priority;
2933 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2937 tcmsg->tcm_handle = handle;
2938 tcmsg->tcm_parent = parent;
2940 nl_msg_put_string(&request, TCA_KIND, "htb");
2941 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2942 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2943 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2944 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2945 nl_msg_end_nested(&request, opt_offset);
2947 error = tc_transact(&request, NULL);
2949 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2950 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2951 netdev_get_name(netdev),
2952 tc_get_major(handle), tc_get_minor(handle),
2953 tc_get_major(parent), tc_get_minor(parent),
2954 class->min_rate, class->max_rate,
2955 class->burst, class->priority, ovs_strerror(error));
2960 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2961 * description of them into 'details'. The description complies with the
2962 * specification given in the vswitch database documentation for linux-htb
2965 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2967 static const struct nl_policy tca_htb_policy[] = {
2968 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2969 .min_len = sizeof(struct tc_htb_opt) },
2972 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2973 const struct tc_htb_opt *htb;
2975 if (!nl_parse_nested(nl_options, tca_htb_policy,
2976 attrs, ARRAY_SIZE(tca_htb_policy))) {
2977 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2981 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2982 class->min_rate = htb->rate.rate;
2983 class->max_rate = htb->ceil.rate;
2984 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2985 class->priority = htb->prio;
2990 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2991 struct htb_class *options,
2992 struct netdev_queue_stats *stats)
2994 struct nlattr *nl_options;
2995 unsigned int handle;
2998 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2999 if (!error && queue_id) {
3000 unsigned int major = tc_get_major(handle);
3001 unsigned int minor = tc_get_minor(handle);
3002 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3003 *queue_id = minor - 1;
3008 if (!error && options) {
3009 error = htb_parse_tca_options__(nl_options, options);
3015 htb_parse_qdisc_details__(struct netdev *netdev_,
3016 const struct smap *details, struct htb_class *hc)
3018 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3019 const char *max_rate_s;
3021 max_rate_s = smap_get(details, "max-rate");
3022 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3023 if (!hc->max_rate) {
3024 enum netdev_features current;
3026 netdev_linux_read_features(netdev);
3027 current = !netdev->get_features_error ? netdev->current : 0;
3028 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3030 hc->min_rate = hc->max_rate;
3036 htb_parse_class_details__(struct netdev *netdev,
3037 const struct smap *details, struct htb_class *hc)
3039 const struct htb *htb = htb_get__(netdev);
3040 const char *min_rate_s = smap_get(details, "min-rate");
3041 const char *max_rate_s = smap_get(details, "max-rate");
3042 const char *burst_s = smap_get(details, "burst");
3043 const char *priority_s = smap_get(details, "priority");
3046 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3048 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
3049 netdev_get_name(netdev));
3053 /* HTB requires at least an mtu sized min-rate to send any traffic even
3054 * on uncongested links. */
3055 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3056 hc->min_rate = MAX(hc->min_rate, mtu);
3057 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
3060 hc->max_rate = (max_rate_s
3061 ? strtoull(max_rate_s, NULL, 10) / 8
3063 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
3064 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
3068 * According to hints in the documentation that I've read, it is important
3069 * that 'burst' be at least as big as the largest frame that might be
3070 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
3071 * but having it a bit too small is a problem. Since netdev_get_mtu()
3072 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
3073 * the MTU. We actually add 64, instead of 14, as a guard against
3074 * additional headers get tacked on somewhere that we're not aware of. */
3075 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
3076 hc->burst = MAX(hc->burst, mtu + 64);
3079 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
3085 htb_query_class__(const struct netdev *netdev, unsigned int handle,
3086 unsigned int parent, struct htb_class *options,
3087 struct netdev_queue_stats *stats)
3089 struct ofpbuf *reply;
3092 error = tc_query_class(netdev, handle, parent, &reply);
3094 error = htb_parse_tcmsg__(reply, NULL, options, stats);
3095 ofpbuf_delete(reply);
3101 htb_tc_install(struct netdev *netdev, const struct smap *details)
3105 error = htb_setup_qdisc__(netdev);
3107 struct htb_class hc;
3109 htb_parse_qdisc_details__(netdev, details, &hc);
3110 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3111 tc_make_handle(1, 0), &hc);
3113 htb_install__(netdev, hc.max_rate);
3119 static struct htb_class *
3120 htb_class_cast__(const struct tc_queue *queue)
3122 return CONTAINER_OF(queue, struct htb_class, tc_queue);
3126 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
3127 const struct htb_class *hc)
3129 struct htb *htb = htb_get__(netdev);
3130 size_t hash = hash_int(queue_id, 0);
3131 struct tc_queue *queue;
3132 struct htb_class *hcp;
3134 queue = tc_find_queue__(netdev, queue_id, hash);
3136 hcp = htb_class_cast__(queue);
3138 hcp = xmalloc(sizeof *hcp);
3139 queue = &hcp->tc_queue;
3140 queue->queue_id = queue_id;
3141 queue->created = time_msec();
3142 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
3145 hcp->min_rate = hc->min_rate;
3146 hcp->max_rate = hc->max_rate;
3147 hcp->burst = hc->burst;
3148 hcp->priority = hc->priority;
3152 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3155 struct queue_dump_state state;
3156 struct htb_class hc;
3158 /* Get qdisc options. */
3160 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3161 htb_install__(netdev, hc.max_rate);
3164 if (!start_queue_dump(netdev, &state)) {
3167 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
3168 unsigned int queue_id;
3170 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3171 htb_update_queue__(netdev, queue_id, &hc);
3174 finish_queue_dump(&state);
3180 htb_tc_destroy(struct tc *tc)
3182 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
3183 struct htb_class *hc, *next;
3185 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
3186 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
3194 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
3196 const struct htb *htb = htb_get__(netdev);
3197 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
3202 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
3204 struct htb_class hc;
3207 htb_parse_qdisc_details__(netdev, details, &hc);
3208 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3209 tc_make_handle(1, 0), &hc);
3211 htb_get__(netdev)->max_rate = hc.max_rate;
3217 htb_class_get(const struct netdev *netdev OVS_UNUSED,
3218 const struct tc_queue *queue, struct smap *details)
3220 const struct htb_class *hc = htb_class_cast__(queue);
3222 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3223 if (hc->min_rate != hc->max_rate) {
3224 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3226 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
3228 smap_add_format(details, "priority", "%u", hc->priority);
3234 htb_class_set(struct netdev *netdev, unsigned int queue_id,
3235 const struct smap *details)
3237 struct htb_class hc;
3240 error = htb_parse_class_details__(netdev, details, &hc);
3245 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3246 tc_make_handle(1, 0xfffe), &hc);
3251 htb_update_queue__(netdev, queue_id, &hc);
3256 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
3258 struct htb_class *hc = htb_class_cast__(queue);
3259 struct htb *htb = htb_get__(netdev);
3262 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3264 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
3271 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3272 struct netdev_queue_stats *stats)
3274 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3275 tc_make_handle(1, 0xfffe), NULL, stats);
3279 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3280 const struct ofpbuf *nlmsg,
3281 netdev_dump_queue_stats_cb *cb, void *aux)
3283 struct netdev_queue_stats stats;
3284 unsigned int handle, major, minor;
3287 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3292 major = tc_get_major(handle);
3293 minor = tc_get_minor(handle);
3294 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3295 (*cb)(minor - 1, &stats, aux);
3300 static const struct tc_ops tc_ops_htb = {
3301 "htb", /* linux_name */
3302 "linux-htb", /* ovs_name */
3303 HTB_N_QUEUES, /* n_queues */
3312 htb_class_get_stats,
3313 htb_class_dump_stats
3316 /* "linux-hfsc" traffic control class. */
3318 #define HFSC_N_QUEUES 0xf000
3326 struct tc_queue tc_queue;
3331 static struct hfsc *
3332 hfsc_get__(const struct netdev *netdev_)
3334 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3335 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
3338 static struct hfsc_class *
3339 hfsc_class_cast__(const struct tc_queue *queue)
3341 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
3345 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
3347 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3350 hfsc = xmalloc(sizeof *hfsc);
3351 tc_init(&hfsc->tc, &tc_ops_hfsc);
3352 hfsc->max_rate = max_rate;
3353 netdev->tc = &hfsc->tc;
3357 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
3358 const struct hfsc_class *hc)
3362 struct hfsc_class *hcp;
3363 struct tc_queue *queue;
3365 hfsc = hfsc_get__(netdev);
3366 hash = hash_int(queue_id, 0);
3368 queue = tc_find_queue__(netdev, queue_id, hash);
3370 hcp = hfsc_class_cast__(queue);
3372 hcp = xmalloc(sizeof *hcp);
3373 queue = &hcp->tc_queue;
3374 queue->queue_id = queue_id;
3375 queue->created = time_msec();
3376 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
3379 hcp->min_rate = hc->min_rate;
3380 hcp->max_rate = hc->max_rate;
3384 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
3386 const struct tc_service_curve *rsc, *fsc, *usc;
3387 static const struct nl_policy tca_hfsc_policy[] = {
3389 .type = NL_A_UNSPEC,
3391 .min_len = sizeof(struct tc_service_curve),
3394 .type = NL_A_UNSPEC,
3396 .min_len = sizeof(struct tc_service_curve),
3399 .type = NL_A_UNSPEC,
3401 .min_len = sizeof(struct tc_service_curve),
3404 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3406 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3407 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3408 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3412 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3413 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3414 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
3416 if (rsc->m1 != 0 || rsc->d != 0 ||
3417 fsc->m1 != 0 || fsc->d != 0 ||
3418 usc->m1 != 0 || usc->d != 0) {
3419 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3420 "Non-linear service curves are not supported.");
3424 if (rsc->m2 != fsc->m2) {
3425 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3426 "Real-time service curves are not supported ");
3430 if (rsc->m2 > usc->m2) {
3431 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3432 "Min-rate service curve is greater than "
3433 "the max-rate service curve.");
3437 class->min_rate = fsc->m2;
3438 class->max_rate = usc->m2;
3443 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3444 struct hfsc_class *options,
3445 struct netdev_queue_stats *stats)
3448 unsigned int handle;
3449 struct nlattr *nl_options;
3451 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3457 unsigned int major, minor;
3459 major = tc_get_major(handle);
3460 minor = tc_get_minor(handle);
3461 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3462 *queue_id = minor - 1;
3469 error = hfsc_parse_tca_options__(nl_options, options);
3476 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
3477 unsigned int parent, struct hfsc_class *options,
3478 struct netdev_queue_stats *stats)
3481 struct ofpbuf *reply;
3483 error = tc_query_class(netdev, handle, parent, &reply);
3488 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
3489 ofpbuf_delete(reply);
3494 hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
3495 struct hfsc_class *class)
3497 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3499 const char *max_rate_s;
3501 max_rate_s = smap_get(details, "max-rate");
3502 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3505 enum netdev_features current;
3507 netdev_linux_read_features(netdev);
3508 current = !netdev->get_features_error ? netdev->current : 0;
3509 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3512 class->min_rate = max_rate;
3513 class->max_rate = max_rate;
3517 hfsc_parse_class_details__(struct netdev *netdev,
3518 const struct smap *details,
3519 struct hfsc_class * class)
3521 const struct hfsc *hfsc;
3522 uint32_t min_rate, max_rate;
3523 const char *min_rate_s, *max_rate_s;
3525 hfsc = hfsc_get__(netdev);
3526 min_rate_s = smap_get(details, "min-rate");
3527 max_rate_s = smap_get(details, "max-rate");
3529 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3530 min_rate = MAX(min_rate, 1);
3531 min_rate = MIN(min_rate, hfsc->max_rate);
3533 max_rate = (max_rate_s
3534 ? strtoull(max_rate_s, NULL, 10) / 8
3536 max_rate = MAX(max_rate, min_rate);
3537 max_rate = MIN(max_rate, hfsc->max_rate);
3539 class->min_rate = min_rate;
3540 class->max_rate = max_rate;
3545 /* Create an HFSC qdisc.
3547 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3549 hfsc_setup_qdisc__(struct netdev * netdev)
3551 struct tcmsg *tcmsg;
3552 struct ofpbuf request;
3553 struct tc_hfsc_qopt opt;
3555 tc_del_qdisc(netdev);
3557 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3558 NLM_F_EXCL | NLM_F_CREATE, &request);
3564 tcmsg->tcm_handle = tc_make_handle(1, 0);
3565 tcmsg->tcm_parent = TC_H_ROOT;
3567 memset(&opt, 0, sizeof opt);
3570 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3571 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3573 return tc_transact(&request, NULL);
3576 /* Create an HFSC class.
3578 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3579 * sc rate <min_rate> ul rate <max_rate>" */
3581 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3582 unsigned int parent, struct hfsc_class *class)
3586 struct tcmsg *tcmsg;
3587 struct ofpbuf request;
3588 struct tc_service_curve min, max;
3590 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3596 tcmsg->tcm_handle = handle;
3597 tcmsg->tcm_parent = parent;
3601 min.m2 = class->min_rate;
3605 max.m2 = class->max_rate;
3607 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3608 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3609 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3610 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3611 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3612 nl_msg_end_nested(&request, opt_offset);
3614 error = tc_transact(&request, NULL);
3616 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3617 "min-rate %ubps, max-rate %ubps (%s)",
3618 netdev_get_name(netdev),
3619 tc_get_major(handle), tc_get_minor(handle),
3620 tc_get_major(parent), tc_get_minor(parent),
3621 class->min_rate, class->max_rate, ovs_strerror(error));
3628 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
3631 struct hfsc_class class;
3633 error = hfsc_setup_qdisc__(netdev);
3639 hfsc_parse_qdisc_details__(netdev, details, &class);
3640 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3641 tc_make_handle(1, 0), &class);
3647 hfsc_install__(netdev, class.max_rate);
3652 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3655 struct queue_dump_state state;
3656 struct hfsc_class hc;
3659 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3660 hfsc_install__(netdev, hc.max_rate);
3662 if (!start_queue_dump(netdev, &state)) {
3666 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
3667 unsigned int queue_id;
3669 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3670 hfsc_update_queue__(netdev, queue_id, &hc);
3674 finish_queue_dump(&state);
3679 hfsc_tc_destroy(struct tc *tc)
3682 struct hfsc_class *hc, *next;
3684 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3686 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3687 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3696 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
3698 const struct hfsc *hfsc;
3699 hfsc = hfsc_get__(netdev);
3700 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
3705 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
3708 struct hfsc_class class;
3710 hfsc_parse_qdisc_details__(netdev, details, &class);
3711 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3712 tc_make_handle(1, 0), &class);
3715 hfsc_get__(netdev)->max_rate = class.max_rate;
3722 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3723 const struct tc_queue *queue, struct smap *details)
3725 const struct hfsc_class *hc;
3727 hc = hfsc_class_cast__(queue);
3728 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3729 if (hc->min_rate != hc->max_rate) {
3730 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3736 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3737 const struct smap *details)
3740 struct hfsc_class class;
3742 error = hfsc_parse_class_details__(netdev, details, &class);
3747 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3748 tc_make_handle(1, 0xfffe), &class);
3753 hfsc_update_queue__(netdev, queue_id, &class);
3758 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3762 struct hfsc_class *hc;
3764 hc = hfsc_class_cast__(queue);
3765 hfsc = hfsc_get__(netdev);
3767 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3769 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3776 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3777 struct netdev_queue_stats *stats)
3779 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3780 tc_make_handle(1, 0xfffe), NULL, stats);
3784 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3785 const struct ofpbuf *nlmsg,
3786 netdev_dump_queue_stats_cb *cb, void *aux)
3788 struct netdev_queue_stats stats;
3789 unsigned int handle, major, minor;
3792 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3797 major = tc_get_major(handle);
3798 minor = tc_get_minor(handle);
3799 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3800 (*cb)(minor - 1, &stats, aux);
3805 static const struct tc_ops tc_ops_hfsc = {
3806 "hfsc", /* linux_name */
3807 "linux-hfsc", /* ovs_name */
3808 HFSC_N_QUEUES, /* n_queues */
3809 hfsc_tc_install, /* tc_install */
3810 hfsc_tc_load, /* tc_load */
3811 hfsc_tc_destroy, /* tc_destroy */
3812 hfsc_qdisc_get, /* qdisc_get */
3813 hfsc_qdisc_set, /* qdisc_set */
3814 hfsc_class_get, /* class_get */
3815 hfsc_class_set, /* class_set */
3816 hfsc_class_delete, /* class_delete */
3817 hfsc_class_get_stats, /* class_get_stats */
3818 hfsc_class_dump_stats /* class_dump_stats */
3821 /* "linux-default" traffic control class.
3823 * This class represents the default, unnamed Linux qdisc. It corresponds to
3824 * the "" (empty string) QoS type in the OVS database. */
3827 default_install__(struct netdev *netdev_)
3829 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3830 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
3832 /* Nothing but a tc class implementation is allowed to write to a tc. This
3833 * class never does that, so we can legitimately use a const tc object. */
3834 netdev->tc = CONST_CAST(struct tc *, &tc);
3838 default_tc_install(struct netdev *netdev,
3839 const struct smap *details OVS_UNUSED)
3841 default_install__(netdev);
3846 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3848 default_install__(netdev);
3852 static const struct tc_ops tc_ops_default = {
3853 NULL, /* linux_name */
3858 NULL, /* tc_destroy */
3859 NULL, /* qdisc_get */
3860 NULL, /* qdisc_set */
3861 NULL, /* class_get */
3862 NULL, /* class_set */
3863 NULL, /* class_delete */
3864 NULL, /* class_get_stats */
3865 NULL /* class_dump_stats */
3868 /* "linux-other" traffic control class.
3873 other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
3875 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3876 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
3878 /* Nothing but a tc class implementation is allowed to write to a tc. This
3879 * class never does that, so we can legitimately use a const tc object. */
3880 netdev->tc = CONST_CAST(struct tc *, &tc);
3884 static const struct tc_ops tc_ops_other = {
3885 NULL, /* linux_name */
3886 "linux-other", /* ovs_name */
3888 NULL, /* tc_install */
3890 NULL, /* tc_destroy */
3891 NULL, /* qdisc_get */
3892 NULL, /* qdisc_set */
3893 NULL, /* class_get */
3894 NULL, /* class_set */
3895 NULL, /* class_delete */
3896 NULL, /* class_get_stats */
3897 NULL /* class_dump_stats */
/* Traffic control. */

/* Number of kernel "tc" ticks per second.  Filled in from /proc/net/psched;
 * see the psched table further below. */
static double ticks_per_s;

/* Number of kernel "jiffies" per second.  This is used for the purpose of
 * computing buffer sizes.  Generally kernel qdiscs need to be able to buffer
 * one jiffy's worth of data.
 *
 * There are two possibilities here:
 *
 *    - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
 *      approximate range of 100 to 1024.  That means that we really need to
 *      make sure that the qdisc can buffer that much data.
 *
 *    - 'buffer_hz' is an absurdly large number.  That means that the kernel
 *      has finely granular timers and there's no need to fudge additional room
 *      for buffers.  (There's no extra effort needed to implement that: the
 *      large 'buffer_hz' is used as a divisor, so practically any number will
 *      come out as 0 in the division.  Small integer results in the case of
 *      really high dividends won't have any real effect anyhow.)
 */
static unsigned int buffer_hz;
/* Builds and returns the tc handle 'major':'minor' in the kernel's packed
 * 32-bit encoding (major in the upper 16 bits, minor in the lower 16). */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    unsigned int shifted_major = major << 16;

    return TC_H_MAKE(shifted_major, minor);
}
/* Extracts and returns the major number (upper 16 bits) of tc 'handle'. */
static unsigned int
tc_get_major(unsigned int handle)
{
    return (handle & TC_H_MAJ_MASK) >> 16;
}
/* Extracts and returns the minor number (lower 16 bits) of tc 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    return handle & TC_H_MIN_MASK;
}
/* Composes the common prefix of a tc Netlink request aimed at 'netdev': a
 * netlink header of message 'type' with NLM_F_REQUEST plus 'flags', followed
 * by a zeroed tcmsg addressed to the device's ifindex.  Stores the message in
 * 'request' and returns the embedded tcmsg for the caller to complete. */
static struct tcmsg *
tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
                struct ofpbuf *request)
    struct tcmsg *tcmsg;
    /* tc messages identify the device by kernel ifindex, not by name. */
    error = get_ifindex(netdev, &ifindex);
    ofpbuf_init(request, 512);
    nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
    tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
    tcmsg->tcm_family = AF_UNSPEC;
    tcmsg->tcm_ifindex = ifindex;
    /* Caller should fill in tcmsg->tcm_handle. */
    /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' over the NETLINK_ROUTE socket and, if 'replyp' is nonnull,
 * stores the kernel's reply there.  Always uninitializes 'request', so the
 * caller need not clean it up regardless of the outcome. */
tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
    int error = nl_transact(NETLINK_ROUTE, request, replyp);
    ofpbuf_uninit(request);
/* Adds or deletes a root ingress qdisc on 'netdev'.  We use this for
 * policing configuration.
 *
 * This function is equivalent to running the following when 'add' is true:
 *     /sbin/tc qdisc add dev <devname> handle ffff: ingress
 *
 * This function is equivalent to running the following when 'add' is false:
 *     /sbin/tc qdisc del dev <devname> handle ffff: ingress
 *
 * The configuration and stats may be seen with the following command:
 *     /sbin/tc -s qdisc show dev <devname>
 *
 * Returns 0 if successful, otherwise a positive errno value.
 */
tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
    /* On add, NLM_F_EXCL | NLM_F_CREATE makes the kernel create the qdisc and
     * fail if one already exists; deletion needs no extra flags. */
    int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;

    tcmsg = tc_make_request(netdev, type, flags, &request);
    tcmsg->tcm_handle = tc_make_handle(0xffff, 0);  /* "ffff:" per the docs above. */
    tcmsg->tcm_parent = TC_H_INGRESS;
    nl_msg_put_string(&request, TCA_KIND, "ingress");
    nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);

    error = tc_transact(&request, NULL);
    /* If we're deleting the qdisc, don't worry about some of the
     * error conditions. */
    if (!add && (error == ENOENT || error == EINVAL)) {
/* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
 * of 'kbits_burst'.
 *
 * This function is equivalent to running:
 *     /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
 *              basic police rate <kbits_rate>kbit burst <kbits_burst>k
 *
 * The configuration and stats may be seen with the following command:
 *     /sbin/tc -s filter show <devname> eth0 parent ffff:
 *
 * Returns 0 if successful, otherwise a positive errno value.
 */
tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
    struct tc_police tc_police;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    size_t basic_offset;
    size_t police_offset;

    memset(&tc_police, 0, sizeof tc_police);
    tc_police.action = TC_POLICE_SHOT;      /* Drop packets exceeding the rate. */
    tc_police.mtu = mtu;
    /* 'kbits_rate' is kilobits/s; the kernel rate is in bytes/s. */
    tc_fill_rate(&tc_police.rate, ((uint64_t) kbits_rate * 1000)/8, mtu);
    /* NOTE(review): burst is scaled by 1024 while rate uses 1000 --
     * presumably to match the kernel tc's "k" burst unit; confirm against
     * tc(8) before changing. */
    tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
                                        kbits_burst * 1024);

    tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
                            NLM_F_EXCL | NLM_F_CREATE, &request);
    tcmsg->tcm_parent = tc_make_handle(0xffff, 0);  /* Attach under ingress "ffff:". */
    /* Priority 49, matching all protocols, per the command shown above. */
    tcmsg->tcm_info = tc_make_handle(49,
                                     (OVS_FORCE uint16_t) htons(ETH_P_ALL));

    nl_msg_put_string(&request, TCA_KIND, "basic");
    basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
    nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
    tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
    nl_msg_end_nested(&request, police_offset);
    nl_msg_end_nested(&request, basic_offset);

    error = tc_transact(&request, NULL);
/* The values in psched are not individually very meaningful, but they are
 * important.  The tables below show some values seen in the wild.
 *
 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
 *   (Before that, there are hints that it was 1000000000.)
 *
 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
 *
 *            a        b        c        d
 * -----------------------------------
 * [1] 000c8000 000f4240 000f4240 00000064
 * [2] 000003e8 00000400 000f4240 3b9aca00
 * [3] 000003e8 00000400 000f4240 3b9aca00
 * [4] 000003e8 00000400 000f4240 00000064
 * [5] 000003e8 00000040 000f4240 3b9aca00
 * [6] 000003e8 00000040 000f4240 000000f9
 *
 *           a         b          c             d  ticks_per_s     buffer_hz
 *     ------- --------- ---------- ------------- ----------- -------------
 * [1] 819,200 1,000,000  1,000,000           100     819,200           100
 * [2]   1,000     1,024  1,000,000 1,000,000,000     976,562 1,000,000,000
 * [3]   1,000     1,024  1,000,000 1,000,000,000     976,562 1,000,000,000
 * [4]   1,000     1,024  1,000,000           100     976,562           100
 * [5]   1,000        64  1,000,000 1,000,000,000  15,625,000 1,000,000,000
 * [6]   1,000        64  1,000,000           249  15,625,000           249
 *
 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
 * [2] 2.6.26-1-686-bigmem from Debian lenny
 * [3] 2.6.26-2-sparc64 from Debian lenny
 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
 * [6] 2.6.34 from kernel.org on KVM
 */
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
    static const char fn[] = "/proc/net/psched";
    unsigned int a, b, c, d;

    /* Read /proc/net/psched only once per process. */
    if (!ovsthread_once_start(&once)) {
    stream = fopen(fn, "r");
        VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
    /* The psched file holds the four hex words "a b c d" shown above. */
    if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
        VLOG_WARN("%s: read failed", fn);
    VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
        VLOG_WARN("%s: invalid scheduler parameters", fn);
    /* Per the table above, ticks_per_s = a * c / b. */
    ticks_per_s = (double) a * c / b;
        VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
    VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);

    ovsthread_once_done(&once);
/* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
 * rate of 'rate' bytes per second. */
tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
    /* NOTE(review): 'rate * ticks' is evaluated in unsigned int before the
     * floating-point division, so the product can wrap for very high rates --
     * confirm callers keep rate * ticks below UINT_MAX. */
    return (rate * ticks) / ticks_per_s;
/* Returns the number of ticks that it would take to transmit 'size' bytes at a
 * rate of 'rate' bytes per second. */
tc_bytes_to_ticks(unsigned int rate, unsigned int size)
    /* A zero 'rate' yields 0 rather than dividing by zero; the product is
     * widened to unsigned long long to avoid overflow. */
    return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
/* Returns the number of bytes that need to be reserved for qdisc buffering at
 * a transmission rate of 'rate' bytes per second.  One jiffy lasts
 * 1/'buffer_hz' of a second; see the comment on 'buffer_hz' above. */
tc_buffer_per_jiffy(unsigned int rate)
    return rate / buffer_hz;
/* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
 * e.g. "htb", into '*kind' (if it is nonnull).  If 'options' is nonnull,
 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
 * stores NULL into it if it is absent.
 *
 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
 * 'msg' and remain valid only as long as 'msg' does.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
               struct nlattr **options)
    static const struct nl_policy tca_policy[] = {
        [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
        [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
    struct nlattr *ta[ARRAY_SIZE(tca_policy)];

    /* The TCA_* attributes follow the netlink and tcmsg headers. */
    if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
                         tca_policy, ta, ARRAY_SIZE(ta))) {
        VLOG_WARN_RL(&rl, "failed to parse qdisc message");

    *kind = nl_attr_get_string(ta[TCA_KIND]);
        *options = ta[TCA_OPTIONS];
/* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
 * into '*options', and its queue statistics into '*stats'.  Any of the output
 * arguments may be null.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
               struct nlattr **options, struct netdev_queue_stats *stats)
    static const struct nl_policy tca_policy[] = {
        [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
        [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
    struct nlattr *ta[ARRAY_SIZE(tca_policy)];

    if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
                         tca_policy, ta, ARRAY_SIZE(ta))) {
        VLOG_WARN_RL(&rl, "failed to parse class message");

    /* The class handle lives in the tcmsg header, not in an attribute. */
    struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
    *handlep = tc->tcm_handle;

        *options = ta[TCA_OPTIONS];

        const struct gnet_stats_queue *gsq;
        struct gnet_stats_basic gsb;

        static const struct nl_policy stats_policy[] = {
            [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
                                  .min_len = sizeof gsb },
            [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
                                  .min_len = sizeof *gsq },
        struct nlattr *sa[ARRAY_SIZE(stats_policy)];

        if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
                             sa, ARRAY_SIZE(sa))) {
            VLOG_WARN_RL(&rl, "failed to parse class stats");

        /* Alignment issues screw up the length of struct gnet_stats_basic on
         * some arch/bitsize combinations.  Newer versions of Linux have a
         * struct gnet_stats_basic_packed, but we can't depend on that.  The
         * easiest thing to do is just to make a copy. */
        memset(&gsb, 0, sizeof gsb);
        memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
               MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
        stats->tx_bytes = gsb.bytes;
        stats->tx_packets = gsb.packets;

        gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
        stats->tx_errors = gsq->drops;

        /* No stats available: report all zeros rather than garbage. */
        memset(stats, 0, sizeof *stats);
/* Queries the kernel for class with identifier 'handle' and parent 'parent'
 * on 'netdev', storing the kernel's reply in '*replyp'. */
tc_query_class(const struct netdev *netdev,
               unsigned int handle, unsigned int parent,
               struct ofpbuf **replyp)
    struct ofpbuf request;
    struct tcmsg *tcmsg;

    /* NLM_F_ECHO asks the kernel to echo the class back in its reply. */
    tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
    tcmsg->tcm_handle = handle;
    tcmsg->tcm_parent = parent;

    error = tc_transact(&request, replyp);
        VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
                     netdev_get_name(netdev),
                     tc_get_major(handle), tc_get_minor(handle),
                     tc_get_major(parent), tc_get_minor(parent),
                     ovs_strerror(error));
/* Equivalent to "tc class del dev <name> handle <handle>".
 * Logs (rate-limited) and propagates any Netlink error. */
tc_delete_class(const struct netdev *netdev, unsigned int handle)
    struct ofpbuf request;
    struct tcmsg *tcmsg;

    tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
    tcmsg->tcm_handle = handle;
    tcmsg->tcm_parent = 0;      /* The kernel locates the class by handle. */

    error = tc_transact(&request, NULL);
        VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
                     netdev_get_name(netdev),
                     tc_get_major(handle), tc_get_minor(handle),
                     ovs_strerror(error));
/* Equivalent to "tc qdisc del dev <name> root". */
tc_del_qdisc(struct netdev *netdev_)
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct ofpbuf request;
    struct tcmsg *tcmsg;

    tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
    /* 1:0 is the handle used for qdiscs that OVS itself creates (see the
     * comment in the qdisc-query code below). */
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    error = tc_transact(&request, NULL);
    if (error == EINVAL) {
        /* EINVAL probably means that the default qdisc was in use, in which
         * case we've accomplished our purpose. */
    if (!error && netdev->tc) {
        /* Give the tc implementation a chance to release its state. */
        if (netdev->tc->ops->tc_destroy) {
            netdev->tc->ops->tc_destroy(netdev->tc);
/* If 'netdev''s qdisc type and parameters are not yet known, queries the
 * kernel to determine what they are.  Returns 0 if successful, otherwise a
 * positive errno value. */
tc_query_qdisc(const struct netdev *netdev_)
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct ofpbuf request, *qdisc;
    const struct tc_ops *ops;
    struct tcmsg *tcmsg;

    /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
     * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
     * 2.6.35 without that fix backported to it.
     *
     * To avoid the OOPS, we must not make a request that would attempt to dump
     * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
     * few others.  There are a few ways that I can see to do this, but most of
     * them seem to be racy (and if you lose the race the kernel OOPSes).  The
     * technique chosen here is to assume that any non-default qdisc that we
     * create will have a class with handle 1:0.  The built-in qdiscs only have
     * a class with handle 0:0.
     *
     * We could check for Linux 2.6.35+ and use a more straightforward method
     * there. */
    tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = 0;

    /* Figure out what tc class to instantiate. */
    error = tc_transact(&request, &qdisc);
        error = tc_parse_qdisc(qdisc, &kind, NULL);
            /* Unparseable qdisc message: treat it as unmanageable. */
            ops = &tc_ops_other;
        ops = tc_lookup_linux_name(kind);
            static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
            VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
            /* A qdisc type OVS does not implement: read-only placeholder. */
            ops = &tc_ops_other;
    } else if (error == ENOENT) {
        /* Either it's a built-in qdisc, or it's a qdisc set up by some
         * other entity that doesn't have a handle 1:0.  We will assume
         * that it's the system default qdisc. */
        ops = &tc_ops_default;
        /* Who knows?  Maybe the device got deleted. */
        VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
        ops = &tc_ops_other;

    /* Instantiate it. */
    load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
    /* tc_load must set netdev->tc exactly when it succeeds. */
    ovs_assert((load_error == 0) == (netdev->tc != NULL));
    ofpbuf_delete(qdisc);

    return error ? error : load_error;
/* Linux traffic control uses tables with 256 entries ("rtab" tables) to
   approximate the time to transmit packets of various lengths.  For an MTU of
   256 or less, each entry is exact; for an MTU of 257 through 512, each entry
   represents two possible packet lengths; for a MTU of 513 through 1024, four
   possible lengths; and so on.

   Returns, for the specified 'mtu', the number of bits that packet lengths
   need to be shifted right to fit within such a 256-entry table. */
tc_calc_cell_log(unsigned int mtu)
        mtu = ETH_PAYLOAD_MAX;      /* Fall back to the Ethernet default. */
    /* Account for L2 framing overhead on top of the payload MTU. */
    mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;

    /* Find the smallest shift that brings 'mtu' under 256. */
    for (cell_log = 0; mtu >= 256; cell_log++) {
/* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
 * of 'mtu'. */
tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
    memset(rate, 0, sizeof *rate);
    rate->cell_log = tc_calc_cell_log(mtu);
    /* rate->overhead = 0; */           /* New in 2.6.24, not yet in some */
    /* rate->cell_align = 0; */         /* distro headers. */
    rate->mpu = ETH_TOTAL_MIN;          /* Minimum billable packet size. */
/* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
 * attribute of the specified "type".
 *
 * See tc_calc_cell_log() above for a description of "rtab"s. */
tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
    rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
    for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
        /* Entry i covers packets up to (i + 1) << cell_log bytes long. */
        unsigned packet_size = (i + 1) << rate->cell_log;
        if (packet_size < rate->mpu) {
            packet_size = rate->mpu;    /* Never bill below the minimum unit. */
        /* Each entry holds the transmission time, in ticks, for that size. */
        rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * zero should be passed.) */
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
    /* Must buffer at least a jiffy's worth of data plus one full packet. */
    unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
    return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
4538 /* Linux-only functions declared in netdev-linux.h */
/* Modifies the 'flag' bit in ethtool's flags field for 'netdev'.  If
 * 'enable' is true, the bit is set.  Otherwise, it is cleared. */
netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
                              const char *flag_name, bool enable)
    const char *netdev_name = netdev_get_name(netdev);
    struct ethtool_value evalue;

    /* Read the current flags. */
    COVERAGE_INC(netdev_get_ethtool);
    memset(&evalue, 0, sizeof evalue);
    error = netdev_linux_do_ethtool(netdev_name,
                                    (struct ethtool_cmd *)&evalue,
                                    ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");

    /* Set or clear 'flag' and write the flags back. */
    COVERAGE_INC(netdev_set_ethtool);
    evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
    error = netdev_linux_do_ethtool(netdev_name,
                                    (struct ethtool_cmd *)&evalue,
                                    ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");

    /* Re-read the flags to verify that the change actually took effect. */
    COVERAGE_INC(netdev_get_ethtool);
    memset(&evalue, 0, sizeof evalue);
    error = netdev_linux_do_ethtool(netdev_name,
                                    (struct ethtool_cmd *)&evalue,
                                    ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");

    if (new_flags != evalue.data) {
        VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
                     "device %s failed", enable ? "enable" : "disable",
                     flag_name, netdev_name);
4588 /* Utility functions. */
/* Copies 'src' into 'dst', performing format conversion in the process.
 * This variant converts from the kernel's 32-bit rtnl_link_stats counters. */
netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
                                  const struct rtnl_link_stats *src)
    dst->rx_packets = src->rx_packets;
    dst->tx_packets = src->tx_packets;
    dst->rx_bytes = src->rx_bytes;
    dst->tx_bytes = src->tx_bytes;
    dst->rx_errors = src->rx_errors;
    dst->tx_errors = src->tx_errors;
    dst->rx_dropped = src->rx_dropped;
    dst->tx_dropped = src->tx_dropped;
    dst->multicast = src->multicast;
    dst->collisions = src->collisions;
    dst->rx_length_errors = src->rx_length_errors;
    dst->rx_over_errors = src->rx_over_errors;
    dst->rx_crc_errors = src->rx_crc_errors;
    dst->rx_frame_errors = src->rx_frame_errors;
    dst->rx_fifo_errors = src->rx_fifo_errors;
    dst->rx_missed_errors = src->rx_missed_errors;
    dst->tx_aborted_errors = src->tx_aborted_errors;
    dst->tx_carrier_errors = src->tx_carrier_errors;
    dst->tx_fifo_errors = src->tx_fifo_errors;
    dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
    dst->tx_window_errors = src->tx_window_errors;
/* Copies 'src' into 'dst', performing format conversion in the process.
 * This variant converts from the kernel's 64-bit rtnl_link_stats64 counters,
 * which do not wrap as quickly as the 32-bit ones above. */
netdev_stats_from_rtnl_link_stats64(struct netdev_stats *dst,
                                    const struct rtnl_link_stats64 *src)
    dst->rx_packets = src->rx_packets;
    dst->tx_packets = src->tx_packets;
    dst->rx_bytes = src->rx_bytes;
    dst->tx_bytes = src->tx_bytes;
    dst->rx_errors = src->rx_errors;
    dst->tx_errors = src->tx_errors;
    dst->rx_dropped = src->rx_dropped;
    dst->tx_dropped = src->tx_dropped;
    dst->multicast = src->multicast;
    dst->collisions = src->collisions;
    dst->rx_length_errors = src->rx_length_errors;
    dst->rx_over_errors = src->rx_over_errors;
    dst->rx_crc_errors = src->rx_crc_errors;
    dst->rx_frame_errors = src->rx_frame_errors;
    dst->rx_fifo_errors = src->rx_fifo_errors;
    dst->rx_missed_errors = src->rx_missed_errors;
    dst->tx_aborted_errors = src->tx_aborted_errors;
    dst->tx_carrier_errors = src->tx_carrier_errors;
    dst->tx_fifo_errors = src->tx_fifo_errors;
    dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
    dst->tx_window_errors = src->tx_window_errors;
/* Retrieves 'netdev_''s statistics with an RTM_GETLINK request, preferring
 * the 64-bit IFLA_STATS64 counters and falling back to 32-bit IFLA_STATS. */
get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
    struct ofpbuf request;
    struct ofpbuf *reply;

    ofpbuf_init(&request, 0);
    nl_msg_put_nlmsghdr(&request,
                        sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
                        RTM_GETLINK, NLM_F_REQUEST);
    ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
    /* The device is selected by name via IFLA_IFNAME. */
    nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
    error = nl_transact(NETLINK_ROUTE, &request, &reply);
    ofpbuf_uninit(&request);

    /* Skip the netlink and ifinfomsg headers to reach the attributes. */
    if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
        const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS64);
        if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats64)) {
            netdev_stats_from_rtnl_link_stats64(stats, nl_attr_get(a));
            const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS);
            if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
                netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
                VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
        VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");

    ofpbuf_delete(reply);
/* Reads 'dev''s interface flags (IFF_*) into '*flags' via SIOCGIFFLAGS. */
get_flags(const struct netdev *dev, unsigned int *flags)
    error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
    *flags = ifr.ifr_flags;
/* Sets the interface flags of the device named 'name' via SIOCSIFFLAGS. */
set_flags(const char *name, unsigned int flags)
    ifr.ifr_flags = flags;
    return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
/* Returns the kernel ifindex for 'netdev_name' via SIOCGIFINDEX, or a
 * negative errno value on failure (see the negation in the caller below). */
do_get_ifindex(const char *netdev_name)
    ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
    COVERAGE_INC(netdev_get_ifindex);

    error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
        VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
                     netdev_name, ovs_strerror(error));
    return ifr.ifr_ifindex;
/* Looks up 'netdev_''s ifindex, caching either the value or the error so the
 * ioctl runs at most once per device.  Stores the ifindex in '*ifindexp' and
 * returns 0 on success, otherwise a positive errno value. */
get_ifindex(const struct netdev *netdev_, int *ifindexp)
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    if (!(netdev->cache_valid & VALID_IFINDEX)) {
        int ifindex = do_get_ifindex(netdev_get_name(netdev_));
            /* A negative return encodes the errno; remember it. */
            netdev->get_ifindex_error = -ifindex;
            netdev->ifindex = 0;
            netdev->get_ifindex_error = 0;
            netdev->ifindex = ifindex;
        netdev->cache_valid |= VALID_IFINDEX;

    *ifindexp = netdev->ifindex;
    return netdev->get_ifindex_error;
/* Reads the hardware (Ethernet) address of 'netdev_name' into 'ea' via
 * SIOCGIFHWADDR. */
get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
    memset(&ifr, 0, sizeof ifr);
    ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
    COVERAGE_INC(netdev_get_hwaddr);
    error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
        /* ENODEV probably means that a vif disappeared asynchronously and
         * hasn't been removed from the database yet, so reduce the log level
         * to INFO for that case. */
        VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
             "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
             netdev_name, ovs_strerror(error));
    /* Sanity-check that the kernel reported an Ethernet-style address. */
    hwaddr_family = ifr.ifr_hwaddr.sa_family;
    if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
        VLOG_WARN("%s device has unknown hardware address family %d",
                  netdev_name, hwaddr_family);
    memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the hardware (Ethernet) address of 'netdev_name' to 'mac' via
 * SIOCSIFHWADDR. */
set_etheraddr(const char *netdev_name,
              const uint8_t mac[ETH_ADDR_LEN])
    memset(&ifr, 0, sizeof ifr);
    ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
    ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
    memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
    COVERAGE_INC(netdev_set_hwaddr);
    error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
        VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
                 netdev_name, ovs_strerror(error));
/* Issues the ethtool ioctl 'cmd' (named 'cmd_name' for log messages) on the
 * device called 'name', using 'ecmd' as the in/out ethtool argument. */
netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
                        int cmd, const char *cmd_name)
    memset(&ifr, 0, sizeof ifr);
    ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
    /* SIOCETHTOOL passes the ethtool command struct via ifr_data. */
    ifr.ifr_data = (caddr_t) ecmd;

    error = af_inet_ioctl(SIOCETHTOOL, &ifr);
        if (error != EOPNOTSUPP) {
            VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
                         "failed: %s", cmd_name, name, ovs_strerror(error));
            /* The device doesn't support this operation.  That's pretty
             * common, so there's no point in logging anything. */
/* Retrieves an IPv4 address of 'netdev' into '*ip' using the interface ioctl
 * 'cmd' (named 'cmd_name' for log messages). */
netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
                      int cmd, const char *cmd_name)
    ifr.ifr_addr.sa_family = AF_INET;
    error = af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
        /* ifr_addr is a generic sockaddr; reinterpret it as sockaddr_in. */
        const struct sockaddr_in *sin = ALIGNED_CAST(struct sockaddr_in *,
        *ip = sin->sin_addr;
4844 /* Returns an AF_PACKET raw socket or a negative errno value. */
4846 af_packet_sock(void)
4848 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4851 if (ovsthread_once_start(&once)) {
4852 sock = socket(AF_PACKET, SOCK_RAW, 0);
4854 int error = set_nonblocking(sock);
4861 VLOG_ERR("failed to create packet socket: %s",
4862 ovs_strerror(errno));
4864 ovsthread_once_done(&once);