/*
 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2015 Nicira, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
19 #include "netdev-linux.h"
23 #include <arpa/inet.h>
25 #include <linux/filter.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <sys/types.h>
37 #include <sys/ioctl.h>
38 #include <sys/socket.h>
39 #include <sys/utsname.h>
40 #include <netpacket/packet.h>
42 #include <net/if_arp.h>
43 #include <net/if_packet.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
52 #include "dp-packet.h"
53 #include "dpif-netlink.h"
54 #include "dpif-netdev.h"
55 #include "dynamic-string.h"
56 #include "fatal-signal.h"
59 #include "netdev-provider.h"
60 #include "netdev-vport.h"
61 #include "netlink-notifier.h"
62 #include "netlink-socket.h"
65 #include "openflow/openflow.h"
66 #include "ovs-atomic.h"
68 #include "poll-loop.h"
69 #include "rtnetlink-link.h"
71 #include "socket-util.h"
74 #include "unaligned.h"
75 #include "openvswitch/vlog.h"
77 VLOG_DEFINE_THIS_MODULE(netdev_linux);
79 COVERAGE_DEFINE(netdev_set_policing);
80 COVERAGE_DEFINE(netdev_arp_lookup);
81 COVERAGE_DEFINE(netdev_get_ifindex);
82 COVERAGE_DEFINE(netdev_get_hwaddr);
83 COVERAGE_DEFINE(netdev_set_hwaddr);
84 COVERAGE_DEFINE(netdev_get_ethtool);
85 COVERAGE_DEFINE(netdev_set_ethtool);
88 /* These were introduced in Linux 2.6.14, so they might be missing if we have
90 #ifndef ADVERTISED_Pause
91 #define ADVERTISED_Pause (1 << 13)
93 #ifndef ADVERTISED_Asym_Pause
94 #define ADVERTISED_Asym_Pause (1 << 14)
97 /* These were introduced in Linux 2.6.24, so they might be missing if we
98 * have old headers. */
99 #ifndef ETHTOOL_GFLAGS
100 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
102 #ifndef ETHTOOL_SFLAGS
103 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
106 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
109 #define TC_RTAB_SIZE 1024
112 /* Linux 2.6.21 introduced struct tpacket_auxdata.
113 * Linux 2.6.27 added the tp_vlan_tci member.
114 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
115 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
116 * TP_STATUS_VLAN_TPID_VALID.
118 * With all this churn it's easiest to unconditionally define a replacement
119 * structure that has everything we want.
121 #ifndef PACKET_AUXDATA
122 #define PACKET_AUXDATA 8
124 #ifndef TP_STATUS_VLAN_VALID
125 #define TP_STATUS_VLAN_VALID (1 << 4)
127 #ifndef TP_STATUS_VLAN_TPID_VALID
128 #define TP_STATUS_VLAN_TPID_VALID (1 << 6)
130 #undef tpacket_auxdata
131 #define tpacket_auxdata rpl_tpacket_auxdata
132 struct tpacket_auxdata {
138 uint16_t tp_vlan_tci;
139 uint16_t tp_vlan_tpid;
142 /* Linux 2.6.35 introduced IFLA_STATS64 and rtnl_link_stats64.
144 * Tests for rtnl_link_stats64 don't seem to consistently work, e.g. on
145 * 2.6.32-431.29.2.el6.x86_64 (see report at
146 * http://openvswitch.org/pipermail/dev/2014-October/047978.html). Maybe
147 * if_link.h is not self-contained on those kernels. It is easiest to
148 * unconditionally define a replacement. */
150 #define IFLA_STATS64 23
152 #define rtnl_link_stats64 rpl_rtnl_link_stats64
153 struct rtnl_link_stats64 {
165 uint64_t rx_length_errors;
166 uint64_t rx_over_errors;
167 uint64_t rx_crc_errors;
168 uint64_t rx_frame_errors;
169 uint64_t rx_fifo_errors;
170 uint64_t rx_missed_errors;
172 uint64_t tx_aborted_errors;
173 uint64_t tx_carrier_errors;
174 uint64_t tx_fifo_errors;
175 uint64_t tx_heartbeat_errors;
176 uint64_t tx_window_errors;
178 uint64_t rx_compressed;
179 uint64_t tx_compressed;
183 VALID_IFINDEX = 1 << 0,
184 VALID_ETHERADDR = 1 << 1,
188 VALID_POLICING = 1 << 5,
189 VALID_VPORT_STAT_ERROR = 1 << 6,
190 VALID_DRVINFO = 1 << 7,
191 VALID_FEATURES = 1 << 8,
194 /* Traffic control. */
196 /* An instance of a traffic control class. Always associated with a particular
199 * Each TC implementation subclasses this with whatever additional data it
202 const struct tc_ops *ops;
203 struct hmap queues; /* Contains "struct tc_queue"s.
204 * Read by generic TC layer.
205 * Written only by TC implementation. */
208 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
210 /* One traffic control queue.
212 * Each TC implementation subclasses this with whatever additional data it
215 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
216 unsigned int queue_id; /* OpenFlow queue ID. */
217 long long int created; /* Time queue was created, in msecs. */
220 /* A particular kind of traffic control. Each implementation generally maps to
221 * one particular Linux qdisc class.
223 * The functions below return 0 if successful or a positive errno value on
224 * failure, except where otherwise noted. All of them must be provided, except
225 * where otherwise noted. */
227 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
228 * This is null for tc_ops_default and tc_ops_other, for which there are no
229 * appropriate values. */
230 const char *linux_name;
232 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
233 const char *ovs_name;
235 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
236 * queues. The queues are numbered 0 through n_queues - 1. */
237 unsigned int n_queues;
239 /* Called to install this TC class on 'netdev'. The implementation should
240 * make the Netlink calls required to set up 'netdev' with the right qdisc
241 * and configure it according to 'details'. The implementation may assume
242 * that the current qdisc is the default; that is, there is no need for it
243 * to delete the current qdisc before installing itself.
245 * The contents of 'details' should be documented as valid for 'ovs_name'
246 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
247 * (which is built as ovs-vswitchd.conf.db(8)).
249 * This function must return 0 if and only if it sets 'netdev->tc' to an
250 * initialized 'struct tc'.
252 * (This function is null for tc_ops_other, which cannot be installed. For
253 * other TC classes it should always be nonnull.) */
254 int (*tc_install)(struct netdev *netdev, const struct smap *details);
256 /* Called when the netdev code determines (through a Netlink query) that
257 * this TC class's qdisc is installed on 'netdev', but we didn't install
258 * it ourselves and so don't know any of the details.
260 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
261 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
262 * implementation should parse the other attributes of 'nlmsg' as
263 * necessary to determine its configuration. If necessary it should also
264 * use Netlink queries to determine the configuration of queues on
267 * This function must return 0 if and only if it sets 'netdev->tc' to an
268 * initialized 'struct tc'. */
269 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
271 /* Destroys the data structures allocated by the implementation as part of
272 * 'tc'. (This includes destroying 'tc->queues' by calling
275 * The implementation should not need to perform any Netlink calls. If
276 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
277 * (But it may not be desirable.)
279 * This function may be null if 'tc' is trivial. */
280 void (*tc_destroy)(struct tc *tc);
282 /* Retrieves details of 'netdev->tc' configuration into 'details'.
284 * The implementation should not need to perform any Netlink calls, because
285 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
286 * cached the configuration.
288 * The contents of 'details' should be documented as valid for 'ovs_name'
289 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
290 * (which is built as ovs-vswitchd.conf.db(8)).
292 * This function may be null if 'tc' is not configurable.
294 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
296 /* Reconfigures 'netdev->tc' according to 'details', performing any
297 * required Netlink calls to complete the reconfiguration.
299 * The contents of 'details' should be documented as valid for 'ovs_name'
300 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
301 * (which is built as ovs-vswitchd.conf.db(8)).
303 * This function may be null if 'tc' is not configurable.
305 int (*qdisc_set)(struct netdev *, const struct smap *details);
307 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
308 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
310 * The contents of 'details' should be documented as valid for 'ovs_name'
311 * in the "other_config" column in the "Queue" table in
312 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
314 * The implementation should not need to perform any Netlink calls, because
315 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
316 * cached the queue configuration.
318 * This function may be null if 'tc' does not have queues ('n_queues' is
320 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
321 struct smap *details);
323 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
324 * 'details', perfoming any required Netlink calls to complete the
325 * reconfiguration. The caller ensures that 'queue_id' is less than
328 * The contents of 'details' should be documented as valid for 'ovs_name'
329 * in the "other_config" column in the "Queue" table in
330 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
332 * This function may be null if 'tc' does not have queues or its queues are
333 * not configurable. */
334 int (*class_set)(struct netdev *, unsigned int queue_id,
335 const struct smap *details);
337 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
338 * tc_queue's within 'netdev->tc->queues'.
340 * This function may be null if 'tc' does not have queues or its queues
341 * cannot be deleted. */
342 int (*class_delete)(struct netdev *, struct tc_queue *queue);
344 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
345 * 'struct tc_queue's within 'netdev->tc->queues'.
347 * On success, initializes '*stats'.
349 * This function may be null if 'tc' does not have queues or if it cannot
350 * report queue statistics. */
351 int (*class_get_stats)(const struct netdev *netdev,
352 const struct tc_queue *queue,
353 struct netdev_queue_stats *stats);
355 /* Extracts queue stats from 'nlmsg', which is a response to a
356 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
358 * This function may be null if 'tc' does not have queues or if it cannot
359 * report queue statistics. */
360 int (*class_dump_stats)(const struct netdev *netdev,
361 const struct ofpbuf *nlmsg,
362 netdev_dump_queue_stats_cb *cb, void *aux);
366 tc_init(struct tc *tc, const struct tc_ops *ops)
369 hmap_init(&tc->queues);
373 tc_destroy(struct tc *tc)
375 hmap_destroy(&tc->queues);
378 static const struct tc_ops tc_ops_htb;
379 static const struct tc_ops tc_ops_hfsc;
380 static const struct tc_ops tc_ops_codel;
381 static const struct tc_ops tc_ops_fqcodel;
382 static const struct tc_ops tc_ops_sfq;
383 static const struct tc_ops tc_ops_default;
384 static const struct tc_ops tc_ops_other;
386 static const struct tc_ops *const tcs[] = {
387 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
388 &tc_ops_hfsc, /* Hierarchical fair service curve. */
389 &tc_ops_codel, /* Controlled delay */
390 &tc_ops_fqcodel, /* Fair queue controlled delay */
391 &tc_ops_sfq, /* Stochastic fair queueing */
392 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
393 &tc_ops_other, /* Some other qdisc. */
397 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
398 static unsigned int tc_get_major(unsigned int handle);
399 static unsigned int tc_get_minor(unsigned int handle);
401 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
402 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
403 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
405 static struct tcmsg *tc_make_request(const struct netdev *, int type,
406 unsigned int flags, struct ofpbuf *);
407 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
408 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
409 static int tc_add_policer(struct netdev *,
410 uint32_t kbits_rate, uint32_t kbits_burst);
412 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
413 struct nlattr **options);
414 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
415 struct nlattr **options,
416 struct netdev_queue_stats *);
417 static int tc_query_class(const struct netdev *,
418 unsigned int handle, unsigned int parent,
419 struct ofpbuf **replyp);
420 static int tc_delete_class(const struct netdev *, unsigned int handle);
422 static int tc_del_qdisc(struct netdev *netdev);
423 static int tc_query_qdisc(const struct netdev *netdev);
425 static int tc_calc_cell_log(unsigned int mtu);
426 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
427 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
428 const struct tc_ratespec *rate);
429 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
431 struct netdev_linux {
434 /* Protects all members below. */
435 struct ovs_mutex mutex;
437 unsigned int cache_valid;
439 bool miimon; /* Link status of last poll. */
440 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
441 struct timer miimon_timer;
443 /* The following are figured out "on demand" only. They are only valid
444 * when the corresponding VALID_* bit in 'cache_valid' is set. */
446 uint8_t etheraddr[ETH_ADDR_LEN];
447 struct in_addr address, netmask;
450 unsigned int ifi_flags;
451 long long int carrier_resets;
452 uint32_t kbits_rate; /* Policing data. */
453 uint32_t kbits_burst;
454 int vport_stats_error; /* Cached error code from vport_get_stats().
455 0 or an errno value. */
456 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
457 int ether_addr_error; /* Cached error code from set/get etheraddr. */
458 int netdev_policing_error; /* Cached error code from set policing. */
459 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
460 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
462 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
463 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
464 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
466 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
469 /* For devices of class netdev_tap_class only. */
473 struct netdev_rxq_linux {
474 struct netdev_rxq up;
479 /* This is set pretty low because we probably won't learn anything from the
480 * additional log messages. */
481 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
483 /* Polling miimon status for all ports causes performance degradation when
484 * handling a large number of ports. If there are no devices using miimon, then
485 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
487 * Readers do not depend on this variable synchronizing with the related
488 * changes in the device miimon status, so we can use atomic_count. */
489 static atomic_count miimon_cnt = ATOMIC_COUNT_INIT(0);
491 static void netdev_linux_run(void);
493 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
494 int cmd, const char *cmd_name);
495 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
496 int cmd, const char *cmd_name);
497 static int get_flags(const struct netdev *, unsigned int *flags);
498 static int set_flags(const char *, unsigned int flags);
499 static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
500 enum netdev_flags on, enum netdev_flags *old_flagsp)
501 OVS_REQUIRES(netdev->mutex);
502 static int do_get_ifindex(const char *netdev_name);
503 static int get_ifindex(const struct netdev *, int *ifindexp);
504 static int do_set_addr(struct netdev *netdev,
505 int ioctl_nr, const char *ioctl_name,
506 struct in_addr addr);
507 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
508 static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
509 static int get_stats_via_netlink(const struct netdev *, struct netdev_stats *);
510 static int af_packet_sock(void);
511 static bool netdev_linux_miimon_enabled(void);
512 static void netdev_linux_miimon_run(void);
513 static void netdev_linux_miimon_wait(void);
514 static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup);
517 is_netdev_linux_class(const struct netdev_class *netdev_class)
519 return netdev_class->run == netdev_linux_run;
523 is_tap_netdev(const struct netdev *netdev)
525 return netdev_get_class(netdev) == &netdev_tap_class;
528 static struct netdev_linux *
529 netdev_linux_cast(const struct netdev *netdev)
531 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
533 return CONTAINER_OF(netdev, struct netdev_linux, up);
536 static struct netdev_rxq_linux *
537 netdev_rxq_linux_cast(const struct netdev_rxq *rx)
539 ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
540 return CONTAINER_OF(rx, struct netdev_rxq_linux, up);
543 static void netdev_linux_update(struct netdev_linux *netdev,
544 const struct rtnetlink_link_change *)
545 OVS_REQUIRES(netdev->mutex);
546 static void netdev_linux_changed(struct netdev_linux *netdev,
547 unsigned int ifi_flags, unsigned int mask)
548 OVS_REQUIRES(netdev->mutex);
550 /* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK changes, or NULL
551 * if no such socket could be created. */
552 static struct nl_sock *
553 netdev_linux_notify_sock(void)
555 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
556 static struct nl_sock *sock;
558 if (ovsthread_once_start(&once)) {
561 error = nl_sock_create(NETLINK_ROUTE, &sock);
563 error = nl_sock_join_mcgroup(sock, RTNLGRP_LINK);
565 nl_sock_destroy(sock);
569 ovsthread_once_done(&once);
576 netdev_linux_miimon_enabled(void)
578 return atomic_count_get(&miimon_cnt) > 0;
582 netdev_linux_run(void)
584 struct nl_sock *sock;
587 if (netdev_linux_miimon_enabled()) {
588 netdev_linux_miimon_run();
591 sock = netdev_linux_notify_sock();
597 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
598 uint64_t buf_stub[4096 / 8];
601 ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
602 error = nl_sock_recv(sock, &buf, false);
604 struct rtnetlink_link_change change;
606 if (rtnetlink_link_parse(&buf, &change)) {
607 struct netdev *netdev_ = netdev_from_name(change.ifname);
608 if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
609 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
611 ovs_mutex_lock(&netdev->mutex);
612 netdev_linux_update(netdev, &change);
613 ovs_mutex_unlock(&netdev->mutex);
615 netdev_close(netdev_);
617 } else if (error == ENOBUFS) {
618 struct shash device_shash;
619 struct shash_node *node;
623 shash_init(&device_shash);
624 netdev_get_devices(&netdev_linux_class, &device_shash);
625 SHASH_FOR_EACH (node, &device_shash) {
626 struct netdev *netdev_ = node->data;
627 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
630 ovs_mutex_lock(&netdev->mutex);
631 get_flags(netdev_, &flags);
632 netdev_linux_changed(netdev, flags, 0);
633 ovs_mutex_unlock(&netdev->mutex);
635 netdev_close(netdev_);
637 shash_destroy(&device_shash);
638 } else if (error != EAGAIN) {
639 VLOG_WARN_RL(&rl, "error reading or parsing netlink (%s)",
640 ovs_strerror(error));
647 netdev_linux_wait(void)
649 struct nl_sock *sock;
651 if (netdev_linux_miimon_enabled()) {
652 netdev_linux_miimon_wait();
654 sock = netdev_linux_notify_sock();
656 nl_sock_wait(sock, POLLIN);
661 netdev_linux_changed(struct netdev_linux *dev,
662 unsigned int ifi_flags, unsigned int mask)
663 OVS_REQUIRES(dev->mutex)
665 netdev_change_seq_changed(&dev->up);
667 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
668 dev->carrier_resets++;
670 dev->ifi_flags = ifi_flags;
672 dev->cache_valid &= mask;
676 netdev_linux_update(struct netdev_linux *dev,
677 const struct rtnetlink_link_change *change)
678 OVS_REQUIRES(dev->mutex)
680 if (change->nlmsg_type == RTM_NEWLINK) {
682 netdev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
684 /* Update netdev from rtnl-change msg. */
686 dev->mtu = change->mtu;
687 dev->cache_valid |= VALID_MTU;
688 dev->netdev_mtu_error = 0;
691 if (!eth_addr_is_zero(change->addr)) {
692 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
693 dev->cache_valid |= VALID_ETHERADDR;
694 dev->ether_addr_error = 0;
697 dev->ifindex = change->ifi_index;
698 dev->cache_valid |= VALID_IFINDEX;
699 dev->get_ifindex_error = 0;
702 netdev_linux_changed(dev, change->ifi_flags, 0);
706 static struct netdev *
707 netdev_linux_alloc(void)
709 struct netdev_linux *netdev = xzalloc(sizeof *netdev);
714 netdev_linux_common_construct(struct netdev_linux *netdev)
716 ovs_mutex_init(&netdev->mutex);
719 /* Creates system and internal devices. */
721 netdev_linux_construct(struct netdev *netdev_)
723 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
726 netdev_linux_common_construct(netdev);
728 error = get_flags(&netdev->up, &netdev->ifi_flags);
729 if (error == ENODEV) {
730 if (netdev->up.netdev_class != &netdev_internal_class) {
731 /* The device does not exist, so don't allow it to be opened. */
734 /* "Internal" netdevs have to be created as netdev objects before
735 * they exist in the kernel, because creating them in the kernel
736 * happens by passing a netdev object to dpif_port_add().
737 * Therefore, ignore the error. */
744 /* For most types of netdevs we open the device for each call of
745 * netdev_open(). However, this is not the case with tap devices,
746 * since it is only possible to open the device once. In this
747 * situation we share a single file descriptor, and consequently
748 * buffers, across all readers. Therefore once data is read it will
749 * be unavailable to other reads for tap devices. */
751 netdev_linux_construct_tap(struct netdev *netdev_)
753 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
754 static const char tap_dev[] = "/dev/net/tun";
755 const char *name = netdev_->name;
759 netdev_linux_common_construct(netdev);
761 /* Open tap device. */
762 netdev->tap_fd = open(tap_dev, O_RDWR);
763 if (netdev->tap_fd < 0) {
765 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
769 /* Create tap device. */
770 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
771 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
772 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
773 VLOG_WARN("%s: creating tap device failed: %s", name,
774 ovs_strerror(errno));
779 /* Make non-blocking. */
780 error = set_nonblocking(netdev->tap_fd);
788 close(netdev->tap_fd);
793 netdev_linux_destruct(struct netdev *netdev_)
795 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
797 if (netdev->tc && netdev->tc->ops->tc_destroy) {
798 netdev->tc->ops->tc_destroy(netdev->tc);
801 if (netdev_get_class(netdev_) == &netdev_tap_class
802 && netdev->tap_fd >= 0)
804 close(netdev->tap_fd);
807 if (netdev->miimon_interval > 0) {
808 atomic_count_dec(&miimon_cnt);
811 ovs_mutex_destroy(&netdev->mutex);
815 netdev_linux_dealloc(struct netdev *netdev_)
817 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
821 static struct netdev_rxq *
822 netdev_linux_rxq_alloc(void)
824 struct netdev_rxq_linux *rx = xzalloc(sizeof *rx);
829 netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
831 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
832 struct netdev *netdev_ = rx->up.netdev;
833 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
836 ovs_mutex_lock(&netdev->mutex);
837 rx->is_tap = is_tap_netdev(netdev_);
839 rx->fd = netdev->tap_fd;
841 struct sockaddr_ll sll;
843 /* Result of tcpdump -dd inbound */
844 static const struct sock_filter filt[] = {
845 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
846 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
847 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
848 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
850 static const struct sock_fprog fprog = {
851 ARRAY_SIZE(filt), (struct sock_filter *) filt
854 /* Create file descriptor. */
855 rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
858 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
863 if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) {
865 VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
866 netdev_get_name(netdev_), ovs_strerror(error));
870 /* Set non-blocking mode. */
871 error = set_nonblocking(rx->fd);
876 /* Get ethernet device index. */
877 error = get_ifindex(&netdev->up, &ifindex);
882 /* Bind to specific ethernet device. */
883 memset(&sll, 0, sizeof sll);
884 sll.sll_family = AF_PACKET;
885 sll.sll_ifindex = ifindex;
886 sll.sll_protocol = htons(ETH_P_ALL);
887 if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
889 VLOG_ERR("%s: failed to bind raw socket (%s)",
890 netdev_get_name(netdev_), ovs_strerror(error));
894 /* Filter for only inbound packets. */
895 error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
899 VLOG_ERR("%s: failed to attach filter (%s)",
900 netdev_get_name(netdev_), ovs_strerror(error));
904 ovs_mutex_unlock(&netdev->mutex);
912 ovs_mutex_unlock(&netdev->mutex);
917 netdev_linux_rxq_destruct(struct netdev_rxq *rxq_)
919 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
927 netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_)
929 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
935 auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux)
937 if (aux->tp_status & TP_STATUS_VLAN_TPID_VALID) {
938 return htons(aux->tp_vlan_tpid);
940 return htons(ETH_TYPE_VLAN);
945 auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
947 return aux->tp_vlan_tci || aux->tp_status & TP_STATUS_VLAN_VALID;
951 netdev_linux_rxq_recv_sock(int fd, struct dp_packet *buffer)
956 struct cmsghdr *cmsg;
959 char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
963 /* Reserve headroom for a single VLAN tag */
964 dp_packet_reserve(buffer, VLAN_HEADER_LEN);
965 size = dp_packet_tailroom(buffer);
967 iov.iov_base = dp_packet_data(buffer);
969 msgh.msg_name = NULL;
970 msgh.msg_namelen = 0;
973 msgh.msg_control = &cmsg_buffer;
974 msgh.msg_controllen = sizeof cmsg_buffer;
978 retval = recvmsg(fd, &msgh, MSG_TRUNC);
979 } while (retval < 0 && errno == EINTR);
983 } else if (retval > size) {
987 dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);
989 for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg; cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
990 const struct tpacket_auxdata *aux;
992 if (cmsg->cmsg_level != SOL_PACKET
993 || cmsg->cmsg_type != PACKET_AUXDATA
994 || cmsg->cmsg_len < CMSG_LEN(sizeof(struct tpacket_auxdata))) {
998 aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg));
999 if (auxdata_has_vlan_tci(aux)) {
1000 if (retval < ETH_HEADER_LEN) {
1004 eth_push_vlan(buffer, auxdata_to_vlan_tpid(aux),
1005 htons(aux->tp_vlan_tci));
1014 netdev_linux_rxq_recv_tap(int fd, struct dp_packet *buffer)
1017 size_t size = dp_packet_tailroom(buffer);
1020 retval = read(fd, dp_packet_data(buffer), size);
1021 } while (retval < 0 && errno == EINTR);
1025 } else if (retval > size) {
1029 dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);
1034 netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet **packets,
1037 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1038 struct netdev *netdev = rx->up.netdev;
1039 struct dp_packet *buffer;
1043 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
1044 mtu = ETH_PAYLOAD_MAX;
1047 buffer = dp_packet_new_with_headroom(VLAN_ETH_HEADER_LEN + mtu,
1048 DP_NETDEV_HEADROOM);
1049 retval = (rx->is_tap
1050 ? netdev_linux_rxq_recv_tap(rx->fd, buffer)
1051 : netdev_linux_rxq_recv_sock(rx->fd, buffer));
1054 if (retval != EAGAIN && retval != EMSGSIZE) {
1055 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
1056 ovs_strerror(errno), netdev_rxq_get_name(rxq_));
1058 dp_packet_delete(buffer);
1060 dp_packet_pad(buffer);
1061 dp_packet_set_dp_hash(buffer, 0);
1062 packets[0] = buffer;
1070 netdev_linux_rxq_wait(struct netdev_rxq *rxq_)
1072 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1073 poll_fd_wait(rx->fd, POLLIN);
1077 netdev_linux_rxq_drain(struct netdev_rxq *rxq_)
1079 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1082 int error = af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_), &ifr,
1083 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
1087 drain_fd(rx->fd, ifr.ifr_qlen);
1090 return drain_rcvbuf(rx->fd);
1094 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
1095 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
1096 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
1097 * the packet is too big or too small to transmit on the device.
1099 * The caller retains ownership of 'buffer' in all cases.
1101 * The kernel maintains a packet transmission queue, so the caller is not
1102 * expected to do additional queuing of packets. */
/* NOTE(review): this chunk looks truncated — declarations of 'i', 'error',
 * 'retval', 'sock', 'ifindex', 'iov', 'msg' and several braces appear to be
 * elided.  Comments below describe only the visible logic. */
1104 netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED,
1105 struct dp_packet **pkts, int cnt, bool may_steal)
1110 /* 'i' is incremented only if there's no error */
1111 for (i = 0; i < cnt;) {
1112 const void *data = dp_packet_data(pkts[i]);
1113 size_t size = dp_packet_size(pkts[i]);
1116 if (!is_tap_netdev(netdev_)) {
1117 /* Use our AF_PACKET socket to send to this device. */
1118 struct sockaddr_ll sll;
1124 sock = af_packet_sock();
1129 ifindex = netdev_get_ifindex(netdev_);
1134 /* We don't bother setting most fields in sockaddr_ll because the
1135 * kernel ignores them for SOCK_RAW. */
1136 memset(&sll, 0, sizeof sll);
1137 sll.sll_family = AF_PACKET;
1138 sll.sll_ifindex = ifindex;
/* Point the scatter/gather entry at the packet payload; sendmsg() does
 * not modify the buffer, hence the CONST_CAST. */
1140 iov.iov_base = CONST_CAST(void *, data);
1143 msg.msg_name = &sll;
1144 msg.msg_namelen = sizeof sll;
1147 msg.msg_control = NULL;
1148 msg.msg_controllen = 0;
1151 retval = sendmsg(sock, &msg, 0);
1153 /* Use the tap fd to send to this device. This is essential for
1154 * tap devices, because packets sent to a tap device with an
1155 * AF_PACKET socket will loop back to be *received* again on the
1156 * tap device. This doesn't occur on other interface types
1157 * because we attach a socket filter to the rx socket. */
1158 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1160 retval = write(netdev->tap_fd, data, size);
1164 /* The Linux AF_PACKET implementation never blocks waiting for room
1165 * for packets, instead returning ENOBUFS. Translate this into
1166 * EAGAIN for the caller. */
1167 error = errno == ENOBUFS ? EAGAIN : errno;
1168 if (error == EINTR) {
1169 /* continue without incrementing 'i', i.e. retry this packet */
/* Short write: warn, but fall through and move on to the next packet. */
1173 } else if (retval != size) {
1174 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE" bytes"
1175 " of %"PRIuSIZE") on %s", retval, size,
1176 netdev_get_name(netdev_));
1181 /* Process the next packet in the batch */
/* Every packet in the batch is freed here, whether or not its transmit
 * succeeded. */
1186 for (i = 0; i < cnt; i++) {
1187 dp_packet_delete(pkts[i]);
1191 if (error && error != EAGAIN) {
1192 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
1193 netdev_get_name(netdev_), ovs_strerror(error));
1200 /* Registers with the poll loop to wake up from the next call to poll_block()
1201 * when the packet transmission queue has sufficient room to transmit a packet
1202 * with netdev_send().
1204 * The kernel maintains a packet transmission queue, so the client is not
1205 * expected to do additional queuing of packets. Thus, this function is
1206 * unlikely to ever be used. It is included for completeness. */
1208 netdev_linux_send_wait(struct netdev *netdev, int qid OVS_UNUSED)
1210 if (is_tap_netdev(netdev)) {
1211 /* TAP device always accepts packets.*/
1212 poll_immediate_wake();
1216 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1217 * otherwise a positive errno value. */
1219 netdev_linux_set_etheraddr(struct netdev *netdev_,
1220 const uint8_t mac[ETH_ADDR_LEN])
1222 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1223 enum netdev_flags old_flags = 0;
1226 ovs_mutex_lock(&netdev->mutex);
/* Fast path: if we have a cached MAC (or a cached error), avoid the ioctl
 * when the address is already 'mac'; otherwise invalidate the cache entry. */
1228 if (netdev->cache_valid & VALID_ETHERADDR) {
1229 error = netdev->ether_addr_error;
1230 if (error || eth_addr_equals(netdev->etheraddr, mac)) {
1233 netdev->cache_valid &= ~VALID_ETHERADDR;
1236 /* Tap devices must be brought down before setting the address. */
1237 if (is_tap_netdev(netdev_)) {
1238 update_flags(netdev, NETDEV_UP, 0, &old_flags);
1240 error = set_etheraddr(netdev_get_name(netdev_), mac);
/* Cache the result even on ENODEV so we don't retry a missing device. */
1241 if (!error || error == ENODEV) {
1242 netdev->ether_addr_error = error;
1243 netdev->cache_valid |= VALID_ETHERADDR;
1245 memcpy(netdev->etheraddr, mac, ETH_ADDR_LEN);
/* Restore the tap device's previous up state if we brought it down above. */
1249 if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
1250 update_flags(netdev, 0, NETDEV_UP, &old_flags);
1254 ovs_mutex_unlock(&netdev->mutex);
1258 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1260 netdev_linux_get_etheraddr(const struct netdev *netdev_,
1261 uint8_t mac[ETH_ADDR_LEN])
1263 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1266 ovs_mutex_lock(&netdev->mutex);
/* Populate the cache on first use; subsequent calls return cached data. */
1267 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1268 netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
1270 netdev->cache_valid |= VALID_ETHERADDR;
1273 error = netdev->ether_addr_error;
1275 memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN);
1277 ovs_mutex_unlock(&netdev->mutex);
/* Internal helper: fetches the MTU via SIOCGIFMTU, caching both the value
 * and any error under VALID_MTU.  Caller must hold 'netdev->mutex'. */
1283 netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
1287 if (!(netdev->cache_valid & VALID_MTU)) {
1290 netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
1291 netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
1292 netdev->mtu = ifr.ifr_mtu;
1293 netdev->cache_valid |= VALID_MTU;
1296 error = netdev->netdev_mtu_error;
1298 *mtup = netdev->mtu;
1304 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1305 * in bytes, not including the hardware header; thus, this is typically 1500
1306 * bytes for Ethernet devices. */
1308 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1310 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1313 ovs_mutex_lock(&netdev->mutex);
1314 error = netdev_linux_get_mtu__(netdev, mtup);
1315 ovs_mutex_unlock(&netdev->mutex);
1320 /* Sets the maximum size of transmitted (MTU) for given device using linux
1321 * networking ioctl interface.
1324 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1326 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1330 ovs_mutex_lock(&netdev->mutex);
/* Skip the ioctl if the cached MTU already matches (or a cached error
 * makes retrying pointless); otherwise drop the stale cache entry. */
1331 if (netdev->cache_valid & VALID_MTU) {
1332 error = netdev->netdev_mtu_error;
1333 if (error || netdev->mtu == mtu) {
1336 netdev->cache_valid &= ~VALID_MTU;
1339 error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
1340 SIOCSIFMTU, "SIOCSIFMTU");
/* Cache the outcome even on ENODEV, as elsewhere in this file. */
1341 if (!error || error == ENODEV) {
1342 netdev->netdev_mtu_error = error;
1343 netdev->mtu = ifr.ifr_mtu;
1344 netdev->cache_valid |= VALID_MTU;
1347 ovs_mutex_unlock(&netdev->mutex);
1351 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1352 * On failure, returns a negative errno value. */
1354 netdev_linux_get_ifindex(const struct netdev *netdev_)
1356 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1359 ovs_mutex_lock(&netdev->mutex);
1360 error = get_ifindex(netdev_, &ifindex);
1361 ovs_mutex_unlock(&netdev->mutex);
1363 return error ? -error : ifindex;
/* Reports link state: MII status when miimon polling is enabled, otherwise
 * the cached IFF_RUNNING interface flag. */
1367 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1369 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1371 ovs_mutex_lock(&netdev->mutex);
1372 if (netdev->miimon_interval > 0) {
1373 *carrier = netdev->miimon;
1375 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1377 ovs_mutex_unlock(&netdev->mutex);
/* Returns the number of carrier (link) transitions observed on 'netdev_'. */
1382 static long long int
1383 netdev_linux_get_carrier_resets(const struct netdev *netdev_)
1385 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1386 long long int carrier_resets;
1388 ovs_mutex_lock(&netdev->mutex);
1389 carrier_resets = netdev->carrier_resets;
1390 ovs_mutex_unlock(&netdev->mutex);
1392 return carrier_resets;
/* Issues MII ioctl 'cmd' ('cmd_name' is for error messages) on device 'name',
 * marshalling 'data' in and out through ifr.ifr_data. */
1396 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1397 struct mii_ioctl_data *data)
1402 memset(&ifr, 0, sizeof ifr);
1403 memcpy(&ifr.ifr_data, data, sizeof *data)
1404 error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
1405 memcpy(data, &ifr.ifr_data, sizeof *data);
/* Determines link status for 'name' via MII registers, falling back to
 * ETHTOOL_GLINK when the MII ioctls fail.  Stores the result in '*miimon'. */
1411 netdev_linux_get_miimon(const char *name, bool *miimon)
1413 struct mii_ioctl_data data;
1418 memset(&data, 0, sizeof data);
1419 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1421 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1422 data.reg_num = MII_BMSR;
1423 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
/* BMSR_LSTATUS bit of the Basic Mode Status Register is the link state. */
1427 *miimon = !!(data.val_out & BMSR_LSTATUS);
1429 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1432 struct ethtool_cmd ecmd;
1434 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1437 COVERAGE_INC(netdev_get_ethtool);
1438 memset(&ecmd, 0, sizeof ecmd);
1439 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
/* ETHTOOL_GLINK replies with a struct ethtool_value overlaid on 'ecmd'. */
1442 struct ethtool_value eval;
1444 memcpy(&eval, &ecmd, sizeof eval);
1445 *miimon = !!eval.data;
1447 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
/* Sets the miimon polling interval (ms) for 'netdev_'; 0 disables polling.
 * Nonzero intervals are clamped to a minimum of 100 ms.  The global
 * 'miimon_cnt' tracks how many devices currently have polling enabled. */
1455 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1456 long long int interval)
1458 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1460 ovs_mutex_lock(&netdev->mutex);
1461 interval = interval > 0 ? MAX(interval, 100) : 0;
1462 if (netdev->miimon_interval != interval) {
1463 if (interval && !netdev->miimon_interval) {
1464 atomic_count_inc(&miimon_cnt);
1465 } else if (!interval && netdev->miimon_interval) {
1466 atomic_count_dec(&miimon_cnt);
1469 netdev->miimon_interval = interval;
/* Expire the timer immediately so the new interval takes effect on the
 * next netdev_linux_miimon_run() pass. */
1470 timer_set_expired(&netdev->miimon_timer);
1472 ovs_mutex_unlock(&netdev->mutex);
/* Periodic callback: re-polls MII status for every netdev-linux device whose
 * miimon timer has expired, recording link changes. */
1478 netdev_linux_miimon_run(void)
1480 struct shash device_shash;
1481 struct shash_node *node;
1483 shash_init(&device_shash);
1484 netdev_get_devices(&netdev_linux_class, &device_shash);
1485 SHASH_FOR_EACH (node, &device_shash) {
1486 struct netdev *netdev = node->data;
1487 struct netdev_linux *dev = netdev_linux_cast(netdev);
1490 ovs_mutex_lock(&dev->mutex);
1491 if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
1492 netdev_linux_get_miimon(dev->up.name, &miimon);
1493 if (miimon != dev->miimon) {
1494 dev->miimon = miimon;
1495 netdev_linux_changed(dev, dev->ifi_flags, 0);
1498 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1500 ovs_mutex_unlock(&dev->mutex);
/* netdev_get_devices() took a reference on each device; drop it. */
1501 netdev_close(netdev);
1504 shash_destroy(&device_shash);
/* Registers each enabled miimon timer with the poll loop so the next
 * poll_block() wakes up in time for netdev_linux_miimon_run(). */
1508 netdev_linux_miimon_wait(void)
1510 struct shash device_shash;
1511 struct shash_node *node;
1513 shash_init(&device_shash);
1514 netdev_get_devices(&netdev_linux_class, &device_shash);
1515 SHASH_FOR_EACH (node, &device_shash) {
1516 struct netdev *netdev = node->data;
1517 struct netdev_linux *dev = netdev_linux_cast(netdev);
1519 ovs_mutex_lock(&dev->mutex);
1520 if (dev->miimon_interval > 0) {
1521 timer_wait(&dev->miimon_timer);
1523 ovs_mutex_unlock(&dev->mutex);
1524 netdev_close(netdev);
1526 shash_destroy(&device_shash);
/* Exchanges the values of '*a' and '*b'.
 * NOTE(review): the body appears to be elided from this chunk. */
1530 swap_uint64(uint64_t *a, uint64_t *b)
1537 /* Copies 'src' into 'dst', performing format conversion in the process.
1539 * 'src' is allowed to be misaligned. */
1541 netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1542 const struct ovs_vport_stats *src)
1544 dst->rx_packets = get_32aligned_u64(&src->rx_packets);
1545 dst->tx_packets = get_32aligned_u64(&src->tx_packets);
1546 dst->rx_bytes = get_32aligned_u64(&src->rx_bytes);
1547 dst->tx_bytes = get_32aligned_u64(&src->tx_bytes);
1548 dst->rx_errors = get_32aligned_u64(&src->rx_errors);
1549 dst->tx_errors = get_32aligned_u64(&src->tx_errors);
1550 dst->rx_dropped = get_32aligned_u64(&src->rx_dropped);
1551 dst->tx_dropped = get_32aligned_u64(&src->tx_dropped);
/* The vport layer does not track the detailed error counters below, so
 * zero them rather than leaving them uninitialized. */
1553 dst->collisions = 0;
1554 dst->rx_length_errors = 0;
1555 dst->rx_over_errors = 0;
1556 dst->rx_crc_errors = 0;
1557 dst->rx_frame_errors = 0;
1558 dst->rx_fifo_errors = 0;
1559 dst->rx_missed_errors = 0;
1560 dst->tx_aborted_errors = 0;
1561 dst->tx_carrier_errors = 0;
1562 dst->tx_fifo_errors = 0;
1563 dst->tx_heartbeat_errors = 0;
1564 dst->tx_window_errors = 0;
/* Queries the datapath vport layer for 'netdev''s stats.  Returns 0 on
 * success; otherwise a positive errno value (and 'stats' is not filled). */
1568 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1570 struct dpif_netlink_vport reply;
1574 error = dpif_netlink_vport_get(netdev_get_name(netdev), &reply, &buf);
1577 } else if (!reply.stats) {
1582 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
/* Wrapper that records the vport-stats outcome in 'netdev->vport_stats_error'
 * (cached under VALID_VPORT_STAT_ERROR) so repeated failures are not
 * re-queried or re-logged on every stats request. */
1590 get_stats_via_vport(const struct netdev *netdev_,
1591 struct netdev_stats *stats)
1593 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1595 if (!netdev->vport_stats_error ||
1596 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1599 error = get_stats_via_vport__(netdev_, stats);
/* ENOENT just means this netdev is not an OVS vport; stay quiet. */
1600 if (error && error != ENOENT) {
1601 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1603 netdev_get_name(netdev_), ovs_strerror(error));
1605 netdev->vport_stats_error = error;
1606 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
1610 /* Retrieves current device stats for 'netdev-linux'. */
1612 netdev_linux_get_stats(const struct netdev *netdev_,
1613 struct netdev_stats *stats)
1615 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1616 struct netdev_stats dev_stats;
1619 ovs_mutex_lock(&netdev->mutex);
/* Prefer vport stats, supplemented with kernel netdev counters below. */
1620 get_stats_via_vport(netdev_, stats);
1621 error = get_stats_via_netlink(netdev_, &dev_stats);
1623 if (!netdev->vport_stats_error) {
1626 } else if (netdev->vport_stats_error) {
1627 /* stats not available from OVS then use netdev stats. */
1630 /* Use kernel netdev's packet and byte counts since vport's counters
1631 * do not reflect packet counts on the wire when GSO, TSO or GRO are
1633 stats->rx_packets = dev_stats.rx_packets;
1634 stats->rx_bytes = dev_stats.rx_bytes;
1635 stats->tx_packets = dev_stats.tx_packets;
1636 stats->tx_bytes = dev_stats.tx_bytes;
/* Error/drop counters accumulate from both sources. */
1638 stats->rx_errors += dev_stats.rx_errors;
1639 stats->tx_errors += dev_stats.tx_errors;
1640 stats->rx_dropped += dev_stats.rx_dropped;
1641 stats->tx_dropped += dev_stats.tx_dropped;
1642 stats->multicast += dev_stats.multicast;
1643 stats->collisions += dev_stats.collisions;
1644 stats->rx_length_errors += dev_stats.rx_length_errors;
1645 stats->rx_over_errors += dev_stats.rx_over_errors;
1646 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1647 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1648 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1649 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1650 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1651 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1652 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1653 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1654 stats->tx_window_errors += dev_stats.tx_window_errors;
1656 ovs_mutex_unlock(&netdev->mutex);
1661 /* Retrieves current device stats for 'netdev-tap' netdev or
1662 * netdev-internal. */
1664 netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
1666 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1667 struct netdev_stats dev_stats;
1670 ovs_mutex_lock(&netdev->mutex);
1671 get_stats_via_vport(netdev_, stats);
1672 error = get_stats_via_netlink(netdev_, &dev_stats);
1674 if (!netdev->vport_stats_error) {
1677 } else if (netdev->vport_stats_error) {
1678 /* Transmit and receive stats will appear to be swapped relative to the
1679 * other ports since we are the one sending the data, not a remote
1680 * computer. For consistency, we swap them back here. This does not
1681 * apply if we are getting stats from the vport layer because it always
1682 * tracks stats from the perspective of the switch. */
1685 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1686 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1687 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1688 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
/* Detailed error counters are meaningless for a tap/internal port. */
1689 stats->rx_length_errors = 0;
1690 stats->rx_over_errors = 0;
1691 stats->rx_crc_errors = 0;
1692 stats->rx_frame_errors = 0;
1693 stats->rx_fifo_errors = 0;
1694 stats->rx_missed_errors = 0;
1695 stats->tx_aborted_errors = 0;
1696 stats->tx_carrier_errors = 0;
1697 stats->tx_fifo_errors = 0;
1698 stats->tx_heartbeat_errors = 0;
1699 stats->tx_window_errors = 0;
1701 /* Use kernel netdev's packet and byte counts since vport counters
1702 * do not reflect packet counts on the wire when GSO, TSO or GRO
/* Note the rx/tx cross-assignment: kernel counters are from the host's
 * perspective, the reported stats from the switch's perspective. */
1704 stats->rx_packets = dev_stats.tx_packets;
1705 stats->rx_bytes = dev_stats.tx_bytes;
1706 stats->tx_packets = dev_stats.rx_packets;
1707 stats->tx_bytes = dev_stats.rx_bytes;
1709 stats->rx_dropped += dev_stats.tx_dropped;
1710 stats->tx_dropped += dev_stats.rx_dropped;
1712 stats->rx_errors += dev_stats.tx_errors;
1713 stats->tx_errors += dev_stats.rx_errors;
1715 stats->multicast += dev_stats.multicast;
1716 stats->collisions += dev_stats.collisions;
1718 ovs_mutex_unlock(&netdev->mutex);
/* Internal devices report vport-layer stats only. */
1724 netdev_internal_get_stats(const struct netdev *netdev_,
1725 struct netdev_stats *stats)
1727 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1730 ovs_mutex_lock(&netdev->mutex);
1731 get_stats_via_vport(netdev_, stats);
1732 error = netdev->vport_stats_error;
1733 ovs_mutex_unlock(&netdev->mutex);
/* Queries ETHTOOL_GSET once and caches the device's supported, advertised,
 * and current link features (as NETDEV_F_* bitmaps) under VALID_FEATURES.
 * Caller must hold 'netdev->mutex'. */
1739 netdev_linux_read_features(struct netdev_linux *netdev)
1741 struct ethtool_cmd ecmd;
1745 if (netdev->cache_valid & VALID_FEATURES) {
1749 COVERAGE_INC(netdev_get_ethtool);
1750 memset(&ecmd, 0, sizeof ecmd);
1751 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
1752 ETHTOOL_GSET, "ETHTOOL_GSET");
1757 /* Supported features. */
1758 netdev->supported = 0;
1759 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1760 netdev->supported |= NETDEV_F_10MB_HD;
1762 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1763 netdev->supported |= NETDEV_F_10MB_FD;
1765 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1766 netdev->supported |= NETDEV_F_100MB_HD;
1768 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1769 netdev->supported |= NETDEV_F_100MB_FD;
1771 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1772 netdev->supported |= NETDEV_F_1GB_HD;
1774 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1775 netdev->supported |= NETDEV_F_1GB_FD;
1777 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1778 netdev->supported |= NETDEV_F_10GB_FD;
1780 if (ecmd.supported & SUPPORTED_TP) {
1781 netdev->supported |= NETDEV_F_COPPER;
1783 if (ecmd.supported & SUPPORTED_FIBRE) {
1784 netdev->supported |= NETDEV_F_FIBER;
1786 if (ecmd.supported & SUPPORTED_Autoneg) {
1787 netdev->supported |= NETDEV_F_AUTONEG;
1789 if (ecmd.supported & SUPPORTED_Pause) {
1790 netdev->supported |= NETDEV_F_PAUSE;
1792 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1793 netdev->supported |= NETDEV_F_PAUSE_ASYM;
1796 /* Advertised features. */
1797 netdev->advertised = 0;
1798 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1799 netdev->advertised |= NETDEV_F_10MB_HD;
1801 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1802 netdev->advertised |= NETDEV_F_10MB_FD;
1804 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1805 netdev->advertised |= NETDEV_F_100MB_HD;
1807 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1808 netdev->advertised |= NETDEV_F_100MB_FD;
1810 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1811 netdev->advertised |= NETDEV_F_1GB_HD;
1813 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1814 netdev->advertised |= NETDEV_F_1GB_FD;
1816 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1817 netdev->advertised |= NETDEV_F_10GB_FD;
1819 if (ecmd.advertising & ADVERTISED_TP) {
1820 netdev->advertised |= NETDEV_F_COPPER;
1822 if (ecmd.advertising & ADVERTISED_FIBRE) {
1823 netdev->advertised |= NETDEV_F_FIBER;
1825 if (ecmd.advertising & ADVERTISED_Autoneg) {
1826 netdev->advertised |= NETDEV_F_AUTONEG;
1828 if (ecmd.advertising & ADVERTISED_Pause) {
1829 netdev->advertised |= NETDEV_F_PAUSE;
1831 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1832 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
1835 /* Current settings. */
/* NOTE(review): 40000/100000/1000000 below are raw speed values in Mbps;
 * presumably the SPEED_40000 etc. constants were unavailable when this
 * was written — confirm against the kernel's ethtool.h. */
1837 if (speed == SPEED_10) {
1838 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1839 } else if (speed == SPEED_100) {
1840 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1841 } else if (speed == SPEED_1000) {
1842 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1843 } else if (speed == SPEED_10000) {
1844 netdev->current = NETDEV_F_10GB_FD;
1845 } else if (speed == 40000) {
1846 netdev->current = NETDEV_F_40GB_FD;
1847 } else if (speed == 100000) {
1848 netdev->current = NETDEV_F_100GB_FD;
1849 } else if (speed == 1000000) {
1850 netdev->current = NETDEV_F_1TB_FD;
1852 netdev->current = 0;
1855 if (ecmd.port == PORT_TP) {
1856 netdev->current |= NETDEV_F_COPPER;
1857 } else if (ecmd.port == PORT_FIBRE) {
1858 netdev->current |= NETDEV_F_FIBER;
1862 netdev->current |= NETDEV_F_AUTONEG;
1866 netdev->cache_valid |= VALID_FEATURES;
1867 netdev->get_features_error = error;
1870 /* Stores the features supported by 'netdev' into of '*current', '*advertised',
1871 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
1872 * Returns 0 if successful, otherwise a positive errno value. */
1874 netdev_linux_get_features(const struct netdev *netdev_,
1875 enum netdev_features *current,
1876 enum netdev_features *advertised,
1877 enum netdev_features *supported,
1878 enum netdev_features *peer)
1880 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1883 ovs_mutex_lock(&netdev->mutex);
/* Populates the feature cache if needed; results come from the cache. */
1884 netdev_linux_read_features(netdev);
1885 if (!netdev->get_features_error) {
1886 *current = netdev->current;
1887 *advertised = netdev->advertised;
1888 *supported = netdev->supported;
1889 *peer = 0; /* XXX */
1891 error = netdev->get_features_error;
1892 ovs_mutex_unlock(&netdev->mutex);
1897 /* Set the features advertised by 'netdev' to 'advertise'. */
1899 netdev_linux_set_advertisements(struct netdev *netdev_,
1900 enum netdev_features advertise)
1902 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1903 struct ethtool_cmd ecmd;
1906 ovs_mutex_lock(&netdev->mutex);
/* Read-modify-write: fetch current settings with ETHTOOL_GSET, replace
 * only the advertising mask, then write back with ETHTOOL_SSET. */
1908 COVERAGE_INC(netdev_get_ethtool);
1909 memset(&ecmd, 0, sizeof ecmd);
1910 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1911 ETHTOOL_GSET, "ETHTOOL_GSET");
1916 ecmd.advertising = 0;
1917 if (advertise & NETDEV_F_10MB_HD) {
1918 ecmd.advertising |= ADVERTISED_10baseT_Half;
1920 if (advertise & NETDEV_F_10MB_FD) {
1921 ecmd.advertising |= ADVERTISED_10baseT_Full;
1923 if (advertise & NETDEV_F_100MB_HD) {
1924 ecmd.advertising |= ADVERTISED_100baseT_Half;
1926 if (advertise & NETDEV_F_100MB_FD) {
1927 ecmd.advertising |= ADVERTISED_100baseT_Full;
1929 if (advertise & NETDEV_F_1GB_HD) {
1930 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1932 if (advertise & NETDEV_F_1GB_FD) {
1933 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1935 if (advertise & NETDEV_F_10GB_FD) {
1936 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1938 if (advertise & NETDEV_F_COPPER) {
1939 ecmd.advertising |= ADVERTISED_TP;
1941 if (advertise & NETDEV_F_FIBER) {
1942 ecmd.advertising |= ADVERTISED_FIBRE;
1944 if (advertise & NETDEV_F_AUTONEG) {
1945 ecmd.advertising |= ADVERTISED_Autoneg;
1947 if (advertise & NETDEV_F_PAUSE) {
1948 ecmd.advertising |= ADVERTISED_Pause;
1950 if (advertise & NETDEV_F_PAUSE_ASYM) {
1951 ecmd.advertising |= ADVERTISED_Asym_Pause;
1953 COVERAGE_INC(netdev_set_ethtool);
1954 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1955 ETHTOOL_SSET, "ETHTOOL_SSET");
1958 ovs_mutex_unlock(&netdev->mutex);
1962 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1963 * successful, otherwise a positive errno value. */
1965 netdev_linux_set_policing(struct netdev *netdev_,
1966 uint32_t kbits_rate, uint32_t kbits_burst)
1968 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1969 const char *netdev_name = netdev_get_name(netdev_);
1972 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1973 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1974 : kbits_burst); /* Stick with user-specified value. */
1976 ovs_mutex_lock(&netdev->mutex);
/* Skip all tc work if the cached rate/burst already match. */
1977 if (netdev->cache_valid & VALID_POLICING) {
1978 error = netdev->netdev_policing_error;
1979 if (error || (netdev->kbits_rate == kbits_rate &&
1980 netdev->kbits_burst == kbits_burst)) {
1981 /* Assume that settings haven't changed since we last set them. */
1984 netdev->cache_valid &= ~VALID_POLICING;
1987 COVERAGE_INC(netdev_set_policing);
1988 /* Remove any existing ingress qdisc. */
1989 error = tc_add_del_ingress_qdisc(netdev_, false);
1991 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1992 netdev_name, ovs_strerror(error));
/* A nonzero rate means: re-add the ingress qdisc, then attach a policer
 * action to it with the requested rate and burst. */
1997 error = tc_add_del_ingress_qdisc(netdev_, true);
1999 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
2000 netdev_name, ovs_strerror(error));
2004 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
2006 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
2007 netdev_name, ovs_strerror(error));
2012 netdev->kbits_rate = kbits_rate;
2013 netdev->kbits_burst = kbits_burst;
/* Cache the outcome even on ENODEV, as elsewhere in this file. */
2016 if (!error || error == ENODEV) {
2017 netdev->netdev_policing_error = error;
2018 netdev->cache_valid |= VALID_POLICING;
2020 ovs_mutex_unlock(&netdev->mutex);
/* Adds the OVS name of every installable tc qdisc implementation to 'types'.
 * Entries with an empty ovs_name (e.g. internal defaults) are skipped. */
2025 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
2028 const struct tc_ops *const *opsp;
2030 for (opsp = tcs; *opsp != NULL; opsp++) {
2031 const struct tc_ops *ops = *opsp;
2032 if (ops->tc_install && ops->ovs_name[0] != '\0') {
2033 sset_add(types, ops->ovs_name);
/* Finds the tc_ops whose OVS-facing name is 'name', or NULL (see elided
 * return paths) if there is none. */
2039 static const struct tc_ops *
2040 tc_lookup_ovs_name(const char *name)
2042 const struct tc_ops *const *opsp;
2044 for (opsp = tcs; *opsp != NULL; opsp++) {
2045 const struct tc_ops *ops = *opsp;
2046 if (!strcmp(name, ops->ovs_name)) {
/* Finds the tc_ops whose Linux qdisc name is 'name'; implementations with a
 * NULL linux_name are skipped. */
2053 static const struct tc_ops *
2054 tc_lookup_linux_name(const char *name)
2056 const struct tc_ops *const *opsp;
2058 for (opsp = tcs; *opsp != NULL; opsp++) {
2059 const struct tc_ops *ops = *opsp;
2060 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Looks up queue 'queue_id' in 'netdev_''s queue hmap using precomputed
 * 'hash'.  Caller must hold the device mutex (hmap access is unlocked). */
2067 static struct tc_queue *
2068 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
2071 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2072 struct tc_queue *queue;
2074 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
2075 if (queue->queue_id == queue_id) {
/* Convenience wrapper: hashes 'queue_id' and delegates to tc_find_queue__. */
2082 static struct tc_queue *
2083 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
2085 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Reports the capabilities (queue count) of QoS type 'type'. */
2089 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
2091 struct netdev_qos_capabilities *caps)
2093 const struct tc_ops *ops = tc_lookup_ovs_name(type);
2097 caps->n_queues = ops->n_queues;
/* Reports the currently installed QoS type ('*typep') and its configuration
 * ('details') after syncing our view of the kernel qdisc. */
2102 netdev_linux_get_qos(const struct netdev *netdev_,
2103 const char **typep, struct smap *details)
2105 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2108 ovs_mutex_lock(&netdev->mutex);
2109 error = tc_query_qdisc(netdev_);
2111 *typep = netdev->tc->ops->ovs_name;
2112 error = (netdev->tc->ops->qdisc_get
2113 ? netdev->tc->ops->qdisc_get(netdev_, details)
2116 ovs_mutex_unlock(&netdev->mutex);
/* Installs QoS discipline 'type' with configuration 'details'.  If the same
 * type is already installed it is reconfigured in place; otherwise the old
 * qdisc is deleted and a new one installed. */
2122 netdev_linux_set_qos(struct netdev *netdev_,
2123 const char *type, const struct smap *details)
2125 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2126 const struct tc_ops *new_ops;
2129 new_ops = tc_lookup_ovs_name(type);
2130 if (!new_ops || !new_ops->tc_install) {
2134 ovs_mutex_lock(&netdev->mutex);
2135 error = tc_query_qdisc(netdev_);
2140 if (new_ops == netdev->tc->ops) {
2141 error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
2143 /* Delete existing qdisc. */
2144 error = tc_del_qdisc(netdev_);
2148 ovs_assert(netdev->tc == NULL);
2150 /* Install new qdisc. */
2151 error = new_ops->tc_install(netdev_, details);
2152 ovs_assert((error == 0) == (netdev->tc != NULL));
2156 ovs_mutex_unlock(&netdev->mutex);
/* Retrieves the configuration of queue 'queue_id' into 'details'. */
2161 netdev_linux_get_queue(const struct netdev *netdev_,
2162 unsigned int queue_id, struct smap *details)
2164 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2167 ovs_mutex_lock(&netdev->mutex);
2168 error = tc_query_qdisc(netdev_);
2170 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2172 ? netdev->tc->ops->class_get(netdev_, queue, details)
2175 ovs_mutex_unlock(&netdev->mutex);
/* Creates or reconfigures queue 'queue_id' with 'details'; fails if the
 * qdisc does not support per-class configuration or the id is out of range. */
2181 netdev_linux_set_queue(struct netdev *netdev_,
2182 unsigned int queue_id, const struct smap *details)
2184 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2187 ovs_mutex_lock(&netdev->mutex);
2188 error = tc_query_qdisc(netdev_);
2190 error = (queue_id < netdev->tc->ops->n_queues
2191 && netdev->tc->ops->class_set
2192 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
2195 ovs_mutex_unlock(&netdev->mutex);
/* Deletes queue 'queue_id' if the installed qdisc supports class deletion. */
2201 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
2203 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2206 ovs_mutex_lock(&netdev->mutex);
2207 error = tc_query_qdisc(netdev_);
2209 if (netdev->tc->ops->class_delete) {
2210 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2212 ? netdev->tc->ops->class_delete(netdev_, queue)
2218 ovs_mutex_unlock(&netdev->mutex);
/* Retrieves statistics for queue 'queue_id' into 'stats'. */
2224 netdev_linux_get_queue_stats(const struct netdev *netdev_,
2225 unsigned int queue_id,
2226 struct netdev_queue_stats *stats)
2228 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2231 ovs_mutex_lock(&netdev->mutex);
2232 error = tc_query_qdisc(netdev_);
2234 if (netdev->tc->ops->class_get_stats) {
2235 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2237 stats->created = queue->created;
2238 error = netdev->tc->ops->class_get_stats(netdev_, queue,
2247 ovs_mutex_unlock(&netdev->mutex);
/* State carried across a netlink RTM_GETTCLASS dump of a device's queues. */
2252 struct queue_dump_state {
2253 struct nl_dump dump;
/* Begins a netlink dump of all traffic classes on 'netdev', initializing
 * '*state'.  Pair with finish_queue_dump(). */
2258 start_queue_dump(const struct netdev *netdev, struct queue_dump_state *state)
2260 struct ofpbuf request;
2261 struct tcmsg *tcmsg;
2263 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
2267 tcmsg->tcm_parent = 0;
2268 nl_dump_start(&state->dump, NETLINK_ROUTE, &request);
2269 ofpbuf_uninit(&request);
2271 ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
/* Releases the dump buffer and completes the netlink dump, returning its
 * final status. */
2276 finish_queue_dump(struct queue_dump_state *state)
2278 ofpbuf_uninit(&state->buf);
2279 return nl_dump_done(&state->dump);
/* Iterator state for netdev_linux_queue_dump_{start,next,done}: a snapshot
 * of queue ids taken under the device mutex. */
2282 struct netdev_linux_queue_state {
2283 unsigned int *queues;
/* Starts a queue dump by snapshotting all queue ids into '*statep'. */
2289 netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
2291 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2294 ovs_mutex_lock(&netdev->mutex);
2295 error = tc_query_qdisc(netdev_);
2297 if (netdev->tc->ops->class_get) {
2298 struct netdev_linux_queue_state *state;
2299 struct tc_queue *queue;
2302 *statep = state = xmalloc(sizeof *state);
2303 state->n_queues = hmap_count(&netdev->tc->queues);
2304 state->cur_queue = 0;
2305 state->queues = xmalloc(state->n_queues * sizeof *state->queues);
2308 HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
2309 state->queues[i++] = queue->queue_id;
2315 ovs_mutex_unlock(&netdev->mutex);
/* Yields the next snapshotted queue that still exists, filling '*queue_idp'
 * and 'details'.  Queues deleted since the snapshot are silently skipped. */
2321 netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
2322 unsigned int *queue_idp, struct smap *details)
2324 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2325 struct netdev_linux_queue_state *state = state_;
2328 ovs_mutex_lock(&netdev->mutex);
2329 while (state->cur_queue < state->n_queues) {
2330 unsigned int queue_id = state->queues[state->cur_queue++];
2331 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2334 *queue_idp = queue_id;
2335 error = netdev->tc->ops->class_get(netdev_, queue, details);
2339 ovs_mutex_unlock(&netdev->mutex);
/* Frees the iterator state allocated by netdev_linux_queue_dump_start(). */
2345 netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
2348 struct netdev_linux_queue_state *state = state_;
2350 free(state->queues);
/* Invokes 'cb' with statistics for each queue, driven by a fresh netlink
 * class dump rather than the cached queue map. */
2356 netdev_linux_dump_queue_stats(const struct netdev *netdev_,
2357 netdev_dump_queue_stats_cb *cb, void *aux)
2359 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2362 ovs_mutex_lock(&netdev->mutex);
2363 error = tc_query_qdisc(netdev_);
2365 struct queue_dump_state state;
2367 if (!netdev->tc->ops->class_dump_stats) {
2369 } else if (!start_queue_dump(netdev_, &state)) {
2375 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
2376 retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
2383 retval = finish_queue_dump(&state);
2389 ovs_mutex_unlock(&netdev->mutex);
2395 netdev_linux_get_in4(const struct netdev *netdev_,
2396 struct in_addr *address, struct in_addr *netmask)
2398 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2401 ovs_mutex_lock(&netdev->mutex);
2402 if (!(netdev->cache_valid & VALID_IN4)) {
2403 error = netdev_linux_get_ipv4(netdev_, &netdev->address,
2404 SIOCGIFADDR, "SIOCGIFADDR");
2406 error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
2407 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2409 netdev->cache_valid |= VALID_IN4;
2417 if (netdev->address.s_addr != INADDR_ANY) {
2418 *address = netdev->address;
2419 *netmask = netdev->netmask;
2421 error = EADDRNOTAVAIL;
2424 ovs_mutex_unlock(&netdev->mutex);
2430 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2431 struct in_addr netmask)
2433 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2436 ovs_mutex_lock(&netdev->mutex);
2437 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2439 netdev->cache_valid |= VALID_IN4;
2440 netdev->address = address;
2441 netdev->netmask = netmask;
2442 if (address.s_addr != INADDR_ANY) {
2443 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2444 "SIOCSIFNETMASK", netmask);
2447 ovs_mutex_unlock(&netdev->mutex);
2453 parse_if_inet6_line(const char *line,
2454 struct in6_addr *in6, char ifname[16 + 1])
2456 uint8_t *s6 = in6->s6_addr;
2457 #define X8 "%2"SCNx8
2458 return ovs_scan(line,
2459 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
2460 "%*x %*x %*x %*x %16s\n",
2461 &s6[0], &s6[1], &s6[2], &s6[3],
2462 &s6[4], &s6[5], &s6[6], &s6[7],
2463 &s6[8], &s6[9], &s6[10], &s6[11],
2464 &s6[12], &s6[13], &s6[14], &s6[15],
2468 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2469 * 'in6' is non-null) and returns true. Otherwise, returns false. */
2471 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2473 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2475 ovs_mutex_lock(&netdev->mutex);
2476 if (!(netdev->cache_valid & VALID_IN6)) {
2480 netdev->in6 = in6addr_any;
2482 file = fopen("/proc/net/if_inet6", "r");
2484 const char *name = netdev_get_name(netdev_);
2485 while (fgets(line, sizeof line, file)) {
2486 struct in6_addr in6_tmp;
2487 char ifname[16 + 1];
2488 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2489 && !strcmp(name, ifname))
2491 netdev->in6 = in6_tmp;
2497 netdev->cache_valid |= VALID_IN6;
2500 ovs_mutex_unlock(&netdev->mutex);
/* Writes an AF_INET socket address carrying 'addr' (port 0) into '*sa',
 * zeroing any trailing bytes of the generic sockaddr first. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in sin;

    memset(sa, 0, sizeof *sa);

    memset(&sin, 0, sizeof sin);
    sin.sin_family = AF_INET;
    sin.sin_port = 0;
    sin.sin_addr = addr;
    memcpy(sa, &sin, sizeof sin);
}
/* Applies 'ioctl_nr' (e.g. SIOCSIFADDR) to 'netdev' with 'addr' as the
 * address argument; 'ioctl_name' is used only for error reporting. */
do_set_addr(struct netdev *netdev,
            int ioctl_nr, const char *ioctl_name, struct in_addr addr)
    make_in4_sockaddr(&ifr.ifr_addr, addr);
    return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
/* Adds 'router' as a default IP gateway. */
netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
    struct in_addr any = { INADDR_ANY };

    memset(&rt, 0, sizeof rt);
    /* Destination 0.0.0.0/0 with RTF_GATEWAY == a default route. */
    make_in4_sockaddr(&rt.rt_dst, any);
    make_in4_sockaddr(&rt.rt_gateway, router);
    make_in4_sockaddr(&rt.rt_genmask, any);
    rt.rt_flags = RTF_UP | RTF_GATEWAY;
    error = af_inet_ioctl(SIOCADDRT, &rt);
        VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
/* Looks up the route toward '*host' in /proc/net/route and reports the next
 * hop in '*next_hop' (0.0.0.0 when directly reachable) and the outgoing
 * device name in '*netdev_name' (xstrdup'd; caller frees). */
netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
    static const char fn[] = "/proc/net/route";

    *netdev_name = NULL;
    stream = fopen(fn, "r");
    if (stream == NULL) {
        VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));

    /* Each line of /proc/net/route describes one route. */
    while (fgets(line, sizeof line, stream)) {
        ovs_be32 dest, gateway, mask;
        int refcnt, metric, mtu;
        unsigned int flags, use, window, irtt;
            "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
            iface, &dest, &gateway, &flags, &refcnt,
            &use, &metric, &mask, &mtu, &window, &irtt)) {
            VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
        if (!(flags & RTF_UP)) {
            /* Skip routes that aren't up. */
        /* The output of 'dest', 'mask', and 'gateway' were given in
         * network byte order, so we don't need any endian
         * conversions here. */
        if ((dest & mask) == (host->s_addr & mask)) {
            /* The host is directly reachable. */
            next_hop->s_addr = 0;
            /* To reach the host, we must go through a gateway. */
            next_hop->s_addr = gateway;
            *netdev_name = xstrdup(iface);
/* Fills 'smap' with driver name/version/firmware strings for 'netdev_',
 * querying ETHTOOL_GDRVINFO once and caching the result. */
netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    ovs_mutex_lock(&netdev->mutex);
    if (!(netdev->cache_valid & VALID_DRVINFO)) {
        struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;

        COVERAGE_INC(netdev_get_ethtool);
        memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
        error = netdev_linux_do_ethtool(netdev->up.name,
                                        "ETHTOOL_GDRVINFO");
            /* Only mark the cache valid on a successful query. */
            netdev->cache_valid |= VALID_DRVINFO;

    smap_add(smap, "driver_name", netdev->drvinfo.driver);
    smap_add(smap, "driver_version", netdev->drvinfo.version);
    smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
    ovs_mutex_unlock(&netdev->mutex);
2641 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2644 smap_add(smap, "driver_name", "openvswitch");
/* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
 * returns 0. Otherwise, it returns a positive errno value; in particular,
 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
netdev_linux_arp_lookup(const struct netdev *netdev,
                        ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
    struct sockaddr_in sin;

    memset(&r, 0, sizeof r);
    memset(&sin, 0, sizeof sin);
    sin.sin_family = AF_INET;
    sin.sin_addr.s_addr = ip;
    memcpy(&r.arp_pa, &sin, sizeof sin);
    r.arp_ha.sa_family = ARPHRD_ETHER;
    ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
    COVERAGE_INC(netdev_arp_lookup);
    retval = af_inet_ioctl(SIOCGARP, &r);
        memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
    } else if (retval != ENXIO) {
        /* ENXIO ("no such entry") is an expected outcome; anything else is
         * worth logging. */
        VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
                     netdev_get_name(netdev), IP_ARGS(ip),
                     ovs_strerror(retval));
/* Converts netdev NETDEV_* flag bits to the corresponding Linux IFF_*
 * bits. */
nd_to_iff_flags(enum netdev_flags nd)
    if (nd & NETDEV_UP) {
    if (nd & NETDEV_PROMISC) {
    if (nd & NETDEV_LOOPBACK) {
        iff |= IFF_LOOPBACK;
/* Converts Linux IFF_* flag bits to the corresponding netdev NETDEV_*
 * bits. */
iff_to_nd_flags(int iff)
    enum netdev_flags nd = 0;
    if (iff & IFF_PROMISC) {
        nd |= NETDEV_PROMISC;
    if (iff & IFF_LOOPBACK) {
        nd |= NETDEV_LOOPBACK;
/* Clears 'off' and sets 'on' in 'netdev''s kernel interface flags, storing
 * the previous flags (in netdev form) in '*old_flagsp'.  Caller must hold
 * the device mutex (enforced by OVS_REQUIRES). */
update_flags(struct netdev_linux *netdev, enum netdev_flags off,
             enum netdev_flags on, enum netdev_flags *old_flagsp)
    OVS_REQUIRES(netdev->mutex)
    int old_flags, new_flags;

    old_flags = netdev->ifi_flags;
    *old_flagsp = iff_to_nd_flags(old_flags);
    new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
    if (new_flags != old_flags) {
        error = set_flags(netdev_get_name(&netdev->up), new_flags);
        /* Re-read so the cache reflects what the kernel actually set. */
        get_flags(&netdev->up, &netdev->ifi_flags);
2733 netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2734 enum netdev_flags on, enum netdev_flags *old_flagsp)
2736 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2739 ovs_mutex_lock(&netdev->mutex);
2740 error = update_flags(netdev, off, on, old_flagsp);
2741 ovs_mutex_unlock(&netdev->mutex);
2746 #define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS, \
2747 GET_FEATURES, GET_STATUS) \
2753 netdev_linux_wait, \
2755 netdev_linux_alloc, \
2757 netdev_linux_destruct, \
2758 netdev_linux_dealloc, \
2759 NULL, /* get_config */ \
2760 NULL, /* set_config */ \
2761 NULL, /* get_tunnel_config */ \
2762 NULL, /* build header */ \
2763 NULL, /* push header */ \
2764 NULL, /* pop header */ \
2765 NULL, /* get_numa_id */ \
2766 NULL, /* set_multiq */ \
2768 netdev_linux_send, \
2769 netdev_linux_send_wait, \
2771 netdev_linux_set_etheraddr, \
2772 netdev_linux_get_etheraddr, \
2773 netdev_linux_get_mtu, \
2774 netdev_linux_set_mtu, \
2775 netdev_linux_get_ifindex, \
2776 netdev_linux_get_carrier, \
2777 netdev_linux_get_carrier_resets, \
2778 netdev_linux_set_miimon_interval, \
2782 netdev_linux_set_advertisements, \
2784 netdev_linux_set_policing, \
2785 netdev_linux_get_qos_types, \
2786 netdev_linux_get_qos_capabilities, \
2787 netdev_linux_get_qos, \
2788 netdev_linux_set_qos, \
2789 netdev_linux_get_queue, \
2790 netdev_linux_set_queue, \
2791 netdev_linux_delete_queue, \
2792 netdev_linux_get_queue_stats, \
2793 netdev_linux_queue_dump_start, \
2794 netdev_linux_queue_dump_next, \
2795 netdev_linux_queue_dump_done, \
2796 netdev_linux_dump_queue_stats, \
2798 netdev_linux_get_in4, \
2799 netdev_linux_set_in4, \
2800 netdev_linux_get_in6, \
2801 netdev_linux_add_router, \
2802 netdev_linux_get_next_hop, \
2804 netdev_linux_arp_lookup, \
2806 netdev_linux_update_flags, \
2808 netdev_linux_rxq_alloc, \
2809 netdev_linux_rxq_construct, \
2810 netdev_linux_rxq_destruct, \
2811 netdev_linux_rxq_dealloc, \
2812 netdev_linux_rxq_recv, \
2813 netdev_linux_rxq_wait, \
2814 netdev_linux_rxq_drain, \
2817 const struct netdev_class netdev_linux_class =
2820 netdev_linux_construct,
2821 netdev_linux_get_stats,
2822 netdev_linux_get_features,
2823 netdev_linux_get_status);
2825 const struct netdev_class netdev_tap_class =
2828 netdev_linux_construct_tap,
2829 netdev_tap_get_stats,
2830 netdev_linux_get_features,
2831 netdev_linux_get_status);
2833 const struct netdev_class netdev_internal_class =
2836 netdev_linux_construct,
2837 netdev_internal_get_stats,
2838 NULL, /* get_features */
2839 netdev_internal_get_status);
2842 #define CODEL_N_QUEUES 0x0000
2851 static struct codel *
2852 codel_get__(const struct netdev *netdev_)
2854 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2855 return CONTAINER_OF(netdev->tc, struct codel, tc);
2859 codel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
2862 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2863 struct codel *codel;
2865 codel = xmalloc(sizeof *codel);
2866 tc_init(&codel->tc, &tc_ops_codel);
2867 codel->target = target;
2868 codel->limit = limit;
2869 codel->interval = interval;
2871 netdev->tc = &codel->tc;
/* Replaces 'netdev''s root qdisc with a codel qdisc configured with
 * 'target', 'limit' and 'interval'; a zero value selects the defaults
 * below (units presumably microseconds for target/interval -- TODO confirm
 * against tc-codel). */
codel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    uint32_t otarget, olimit, ointerval;

    tc_del_qdisc(netdev);

    tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
                            NLM_F_EXCL | NLM_F_CREATE, &request);
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    otarget = target ? target : 5000;
    olimit = limit ? limit : 10240;
    ointerval = interval ? interval : 100000;

    nl_msg_put_string(&request, TCA_KIND, "codel");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_u32(&request, TCA_CODEL_TARGET, otarget);
    nl_msg_put_u32(&request, TCA_CODEL_LIMIT, olimit);
    nl_msg_put_u32(&request, TCA_CODEL_INTERVAL, ointerval);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
        VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
                     "target %u, limit %u, interval %u error %d(%s)",
                     netdev_get_name(netdev),
                     otarget, olimit, ointerval,
                     error, ovs_strerror(error));
/* Reads codel parameters from 'details' into '*codel'.  Absent or
 * unparsable values become 0 and then take the same defaults that
 * codel_setup_qdisc__() uses. */
codel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
                            const struct smap *details, struct codel *codel)
    const char *target_s;
    const char *limit_s;
    const char *interval_s;

    target_s = smap_get(details, "target");
    limit_s = smap_get(details, "limit");
    interval_s = smap_get(details, "interval");

    codel->target = target_s ? strtoull(target_s, NULL, 10) : 0;
    codel->limit = limit_s ? strtoull(limit_s, NULL, 10) : 0;
    codel->interval = interval_s ? strtoull(interval_s, NULL, 10) : 0;

    if (!codel->target) {
        codel->target = 5000;
    if (!codel->limit) {
        codel->limit = 10240;
    if (!codel->interval) {
        codel->interval = 100000;
2944 codel_tc_install(struct netdev *netdev, const struct smap *details)
2949 codel_parse_qdisc_details__(netdev, details, &codel);
2950 error = codel_setup_qdisc__(netdev, codel.target, codel.limit,
2953 codel_install__(netdev, codel.target, codel.limit, codel.interval);
/* Extracts target/limit/interval from the TCA_OPTIONS attribute of a codel
 * qdisc netlink reply into '*codel'. */
codel_parse_tca_options__(struct nlattr *nl_options, struct codel *codel)
    static const struct nl_policy tca_codel_policy[] = {
        [TCA_CODEL_TARGET] = { .type = NL_A_U32 },
        [TCA_CODEL_LIMIT] = { .type = NL_A_U32 },
        [TCA_CODEL_INTERVAL] = { .type = NL_A_U32 }

    struct nlattr *attrs[ARRAY_SIZE(tca_codel_policy)];

    if (!nl_parse_nested(nl_options, tca_codel_policy,
                         attrs, ARRAY_SIZE(tca_codel_policy))) {
        VLOG_WARN_RL(&rl, "failed to parse CoDel class options");

    codel->target = nl_attr_get_u32(attrs[TCA_CODEL_TARGET]);
    codel->limit = nl_attr_get_u32(attrs[TCA_CODEL_LIMIT]);
    codel->interval = nl_attr_get_u32(attrs[TCA_CODEL_INTERVAL]);
2982 codel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
2984 struct nlattr *nlattr;
2989 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
2994 error = codel_parse_tca_options__(nlattr, &codel);
2999 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3005 codel_tc_destroy(struct tc *tc)
3007 struct codel *codel = CONTAINER_OF(tc, struct codel, tc);
3013 codel_qdisc_get(const struct netdev *netdev, struct smap *details)
3015 const struct codel *codel = codel_get__(netdev);
3016 smap_add_format(details, "target", "%u", codel->target);
3017 smap_add_format(details, "limit", "%u", codel->limit);
3018 smap_add_format(details, "interval", "%u", codel->interval);
3023 codel_qdisc_set(struct netdev *netdev, const struct smap *details)
3027 codel_parse_qdisc_details__(netdev, details, &codel);
3028 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3029 codel_get__(netdev)->target = codel.target;
3030 codel_get__(netdev)->limit = codel.limit;
3031 codel_get__(netdev)->interval = codel.interval;
3035 static const struct tc_ops tc_ops_codel = {
3036 "codel", /* linux_name */
3037 "linux-codel", /* ovs_name */
3038 CODEL_N_QUEUES, /* n_queues */
3051 /* FQ-CoDel traffic control class. */
3053 #define FQCODEL_N_QUEUES 0x0000
3064 static struct fqcodel *
3065 fqcodel_get__(const struct netdev *netdev_)
3067 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3068 return CONTAINER_OF(netdev->tc, struct fqcodel, tc);
3072 fqcodel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
3073 uint32_t interval, uint32_t flows, uint32_t quantum)
3075 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3076 struct fqcodel *fqcodel;
3078 fqcodel = xmalloc(sizeof *fqcodel);
3079 tc_init(&fqcodel->tc, &tc_ops_fqcodel);
3080 fqcodel->target = target;
3081 fqcodel->limit = limit;
3082 fqcodel->interval = interval;
3083 fqcodel->flows = flows;
3084 fqcodel->quantum = quantum;
3086 netdev->tc = &fqcodel->tc;
/* Replaces 'netdev''s root qdisc with an fq_codel qdisc; a zero parameter
 * selects the default used below. */
fqcodel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
                      uint32_t interval, uint32_t flows, uint32_t quantum)
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    uint32_t otarget, olimit, ointerval, oflows, oquantum;

    tc_del_qdisc(netdev);

    tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
                            NLM_F_EXCL | NLM_F_CREATE, &request);
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    otarget = target ? target : 5000;
    olimit = limit ? limit : 10240;
    ointerval = interval ? interval : 100000;
    oflows = flows ? flows : 1024;
    oquantum = quantum ? quantum : 1514; /* fq_codel default quantum is 1514

    nl_msg_put_string(&request, TCA_KIND, "fq_codel");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_TARGET, otarget);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_LIMIT, olimit);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_INTERVAL, ointerval);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_FLOWS, oflows);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_QUANTUM, oquantum);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
        VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
                     "target %u, limit %u, interval %u, flows %u, quantum %u error %d(%s)",
                     netdev_get_name(netdev),
                     otarget, olimit, ointerval, oflows, oquantum,
                     error, ovs_strerror(error));
3137 fqcodel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
3138 const struct smap *details, struct fqcodel *fqcodel)
3140 const char *target_s;
3141 const char *limit_s;
3142 const char *interval_s;
3143 const char *flows_s;
3144 const char *quantum_s;
3146 target_s = smap_get(details, "target");
3147 limit_s = smap_get(details, "limit");
3148 interval_s = smap_get(details, "interval");
3149 flows_s = smap_get(details, "flows");
3150 quantum_s = smap_get(details, "quantum");
3151 fqcodel->target = target_s ? strtoull(target_s, NULL, 10) : 0;
3152 fqcodel->limit = limit_s ? strtoull(limit_s, NULL, 10) : 0;
3153 fqcodel->interval = interval_s ? strtoull(interval_s, NULL, 10) : 0;
3154 fqcodel->flows = flows_s ? strtoull(flows_s, NULL, 10) : 0;
3155 fqcodel->quantum = quantum_s ? strtoull(quantum_s, NULL, 10) : 0;
3156 if (!fqcodel->target) {
3157 fqcodel->target = 5000;
3159 if (!fqcodel->limit) {
3160 fqcodel->limit = 10240;
3162 if (!fqcodel->interval) {
3163 fqcodel->interval = 1000000;
3165 if (!fqcodel->flows) {
3166 fqcodel->flows = 1024;
3168 if (!fqcodel->quantum) {
3169 fqcodel->quantum = 1514;
3174 fqcodel_tc_install(struct netdev *netdev, const struct smap *details)
3177 struct fqcodel fqcodel;
3179 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3180 error = fqcodel_setup_qdisc__(netdev, fqcodel.target, fqcodel.limit,
3181 fqcodel.interval, fqcodel.flows,
3184 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit,
3185 fqcodel.interval, fqcodel.flows, fqcodel.quantum);
/* Extracts target/limit/interval/flows/quantum from the TCA_OPTIONS
 * attribute of an fq_codel qdisc netlink reply into '*fqcodel'. */
fqcodel_parse_tca_options__(struct nlattr *nl_options, struct fqcodel *fqcodel)
    static const struct nl_policy tca_fqcodel_policy[] = {
        [TCA_FQ_CODEL_TARGET] = { .type = NL_A_U32 },
        [TCA_FQ_CODEL_LIMIT] = { .type = NL_A_U32 },
        [TCA_FQ_CODEL_INTERVAL] = { .type = NL_A_U32 },
        [TCA_FQ_CODEL_FLOWS] = { .type = NL_A_U32 },
        [TCA_FQ_CODEL_QUANTUM] = { .type = NL_A_U32 }

    struct nlattr *attrs[ARRAY_SIZE(tca_fqcodel_policy)];

    if (!nl_parse_nested(nl_options, tca_fqcodel_policy,
                         attrs, ARRAY_SIZE(tca_fqcodel_policy))) {
        VLOG_WARN_RL(&rl, "failed to parse FQ_CoDel class options");

    fqcodel->target = nl_attr_get_u32(attrs[TCA_FQ_CODEL_TARGET]);
    fqcodel->limit = nl_attr_get_u32(attrs[TCA_FQ_CODEL_LIMIT]);
    fqcodel->interval =nl_attr_get_u32(attrs[TCA_FQ_CODEL_INTERVAL]);
    fqcodel->flows = nl_attr_get_u32(attrs[TCA_FQ_CODEL_FLOWS]);
    fqcodel->quantum = nl_attr_get_u32(attrs[TCA_FQ_CODEL_QUANTUM]);
3218 fqcodel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3220 struct nlattr *nlattr;
3223 struct fqcodel fqcodel;
3225 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3230 error = fqcodel_parse_tca_options__(nlattr, &fqcodel);
3235 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3236 fqcodel.flows, fqcodel.quantum);
3241 fqcodel_tc_destroy(struct tc *tc)
3243 struct fqcodel *fqcodel = CONTAINER_OF(tc, struct fqcodel, tc);
3249 fqcodel_qdisc_get(const struct netdev *netdev, struct smap *details)
3251 const struct fqcodel *fqcodel = fqcodel_get__(netdev);
3252 smap_add_format(details, "target", "%u", fqcodel->target);
3253 smap_add_format(details, "limit", "%u", fqcodel->limit);
3254 smap_add_format(details, "interval", "%u", fqcodel->interval);
3255 smap_add_format(details, "flows", "%u", fqcodel->flows);
3256 smap_add_format(details, "quantum", "%u", fqcodel->quantum);
3261 fqcodel_qdisc_set(struct netdev *netdev, const struct smap *details)
3263 struct fqcodel fqcodel;
3265 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3266 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3267 fqcodel.flows, fqcodel.quantum);
3268 fqcodel_get__(netdev)->target = fqcodel.target;
3269 fqcodel_get__(netdev)->limit = fqcodel.limit;
3270 fqcodel_get__(netdev)->interval = fqcodel.interval;
3271 fqcodel_get__(netdev)->flows = fqcodel.flows;
3272 fqcodel_get__(netdev)->quantum = fqcodel.quantum;
3276 static const struct tc_ops tc_ops_fqcodel = {
3277 "fq_codel", /* linux_name */
3278 "linux-fq_codel", /* ovs_name */
3279 FQCODEL_N_QUEUES, /* n_queues */
3292 /* SFQ traffic control class. */
3294 #define SFQ_N_QUEUES 0x0000
3303 sfq_get__(const struct netdev *netdev_)
3305 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3306 return CONTAINER_OF(netdev->tc, struct sfq, tc);
3310 sfq_install__(struct netdev *netdev_, uint32_t quantum, uint32_t perturb)
3312 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3315 sfq = xmalloc(sizeof *sfq);
3316 tc_init(&sfq->tc, &tc_ops_sfq);
3317 sfq->perturb = perturb;
3318 sfq->quantum = quantum;
3320 netdev->tc = &sfq->tc;
/* Replaces 'netdev''s root qdisc with an sfq qdisc using 'quantum' and
 * 'perturb'; zero values fall back to the device MTU and a 10-second
 * perturb period respectively. */
sfq_setup_qdisc__(struct netdev *netdev, uint32_t quantum, uint32_t perturb)
    struct tc_sfq_qopt opt;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int mtu_error, error;
    mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);

    tc_del_qdisc(netdev);

    tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
                            NLM_F_EXCL | NLM_F_CREATE, &request);
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    memset(&opt, 0, sizeof opt);
        opt.quantum = mtu; /* if we cannot find mtu, use default */
        opt.quantum = quantum;
        opt.perturb_period = 10;
        opt.perturb_period = perturb;

    nl_msg_put_string(&request, TCA_KIND, "sfq");
    nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);

    error = tc_transact(&request, NULL);
        VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
                     "quantum %u, perturb %u error %d(%s)",
                     netdev_get_name(netdev),
                     opt.quantum, opt.perturb_period,
                     error, ovs_strerror(error));
/* Reads "quantum" and "perturb" from 'details' into '*sfq'.  A zero
 * quantum falls back to the device MTU; if the MTU is also unavailable the
 * user must supply quantum explicitly. */
sfq_parse_qdisc_details__(struct netdev *netdev,
                          const struct smap *details, struct sfq *sfq)
    const char *perturb_s;
    const char *quantum_s;

    perturb_s = smap_get(details, "perturb");
    quantum_s = smap_get(details, "quantum");
    sfq->perturb = perturb_s ? strtoull(perturb_s, NULL, 10) : 0;
    sfq->quantum = quantum_s ? strtoull(quantum_s, NULL, 10) : 0;
    if (!sfq->perturb) {
    if (!sfq->quantum) {
        /* Use the MTU as the quantum when none was given. */
        mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
            VLOG_WARN_RL(&rl, "when using SFQ, you must specify quantum on a "
                         "device without mtu");
3402 sfq_tc_install(struct netdev *netdev, const struct smap *details)
3407 sfq_parse_qdisc_details__(netdev, details, &sfq);
3408 error = sfq_setup_qdisc__(netdev, sfq.quantum, sfq.perturb);
3410 sfq_install__(netdev, sfq.quantum, sfq.perturb);
/* Loads the kernel's current sfq configuration for 'netdev' into OVS's
 * in-memory representation.  Returns 0 on success, otherwise a positive
 * errno value from parsing the qdisc reply.
 *
 * Fix: sfq_install__() is declared as (netdev, quantum, perturb), but this
 * function passed (sfq->perturb_period, sfq->quantum), swapping the two
 * values so the cached qdisc state was wrong.  Pass them in the declared
 * order. */
static int
sfq_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
{
    const struct tc_sfq_qopt *sfq;
    struct nlattr *nlattr;
    const char *kind;
    int error;

    error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
    if (error) {
        return error;
    }

    sfq = nl_attr_get(nlattr);
    sfq_install__(netdev, sfq->quantum, sfq->perturb_period);
    return 0;
}
3434 sfq_tc_destroy(struct tc *tc)
3436 struct sfq *sfq = CONTAINER_OF(tc, struct sfq, tc);
3442 sfq_qdisc_get(const struct netdev *netdev, struct smap *details)
3444 const struct sfq *sfq = sfq_get__(netdev);
3445 smap_add_format(details, "quantum", "%u", sfq->quantum);
3446 smap_add_format(details, "perturb", "%u", sfq->perturb);
3451 sfq_qdisc_set(struct netdev *netdev, const struct smap *details)
3455 sfq_parse_qdisc_details__(netdev, details, &sfq);
3456 sfq_install__(netdev, sfq.quantum, sfq.perturb);
3457 sfq_get__(netdev)->quantum = sfq.quantum;
3458 sfq_get__(netdev)->perturb = sfq.perturb;
3462 static const struct tc_ops tc_ops_sfq = {
3463 "sfq", /* linux_name */
3464 "linux-sfq", /* ovs_name */
3465 SFQ_N_QUEUES, /* n_queues */
3478 /* HTB traffic control class. */
3480 #define HTB_N_QUEUES 0xf000
3484 unsigned int max_rate; /* In bytes/s. */
3488 struct tc_queue tc_queue;
3489 unsigned int min_rate; /* In bytes/s. */
3490 unsigned int max_rate; /* In bytes/s. */
3491 unsigned int burst; /* In bytes. */
3492 unsigned int priority; /* Lower values are higher priorities. */
3496 htb_get__(const struct netdev *netdev_)
3498 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3499 return CONTAINER_OF(netdev->tc, struct htb, tc);
3503 htb_install__(struct netdev *netdev_, uint64_t max_rate)
3505 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3508 htb = xmalloc(sizeof *htb);
3509 tc_init(&htb->tc, &tc_ops_htb);
3510 htb->max_rate = max_rate;
3512 netdev->tc = &htb->tc;
3515 /* Create an HTB qdisc.
3517 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
3519 htb_setup_qdisc__(struct netdev *netdev)
3522 struct tc_htb_glob opt;
3523 struct ofpbuf request;
3524 struct tcmsg *tcmsg;
3526 tc_del_qdisc(netdev);
3528 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3529 NLM_F_EXCL | NLM_F_CREATE, &request);
3533 tcmsg->tcm_handle = tc_make_handle(1, 0);
3534 tcmsg->tcm_parent = TC_H_ROOT;
3536 nl_msg_put_string(&request, TCA_KIND, "htb");
3538 memset(&opt, 0, sizeof opt);
3539 opt.rate2quantum = 10;
3543 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3544 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
3545 nl_msg_end_nested(&request, opt_offset);
3547 return tc_transact(&request, NULL);
/* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>".
 *
 * Requires the device MTU: tc_fill_rate()/tc_calc_buffer() take it as an
 * input, so the class cannot be configured without one. */
htb_setup_class__(struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *class)
    struct tc_htb_opt opt;
    struct ofpbuf request;
    struct tcmsg *tcmsg;

    error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
        VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
                     netdev_get_name(netdev));

    memset(&opt, 0, sizeof opt);
    tc_fill_rate(&opt.rate, class->min_rate, mtu);
    tc_fill_rate(&opt.ceil, class->max_rate, mtu);
    opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
    opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
    opt.prio = class->priority;

    tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
    tcmsg->tcm_handle = handle;
    tcmsg->tcm_parent = parent;

    nl_msg_put_string(&request, TCA_KIND, "htb");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
    tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
    tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
        VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
                     "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
                     netdev_get_name(netdev),
                     tc_get_major(handle), tc_get_minor(handle),
                     tc_get_major(parent), tc_get_minor(parent),
                     class->min_rate, class->max_rate,
                     class->burst, class->priority, ovs_strerror(error));
3604 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
3605 * description of them into 'details'. The description complies with the
3606 * specification given in the vswitch database documentation for linux-htb
3609 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
3611 static const struct nl_policy tca_htb_policy[] = {
3612 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
3613 .min_len = sizeof(struct tc_htb_opt) },
3616 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
3617 const struct tc_htb_opt *htb;
3619 if (!nl_parse_nested(nl_options, tca_htb_policy,
3620 attrs, ARRAY_SIZE(tca_htb_policy))) {
3621 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
3625 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
3626 class->min_rate = htb->rate.rate;
3627 class->max_rate = htb->ceil.rate;
3628 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
3629 class->priority = htb->prio;
3634 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3635 struct htb_class *options,
3636 struct netdev_queue_stats *stats)
3638 struct nlattr *nl_options;
3639 unsigned int handle;
3642 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3643 if (!error && queue_id) {
3644 unsigned int major = tc_get_major(handle);
3645 unsigned int minor = tc_get_minor(handle);
3646 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3647 *queue_id = minor - 1;
3652 if (!error && options) {
3653 error = htb_parse_tca_options__(nl_options, options);
3659 htb_parse_qdisc_details__(struct netdev *netdev_,
3660 const struct smap *details, struct htb_class *hc)
3662 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3663 const char *max_rate_s;
3665 max_rate_s = smap_get(details, "max-rate");
3666 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3667 if (!hc->max_rate) {
3668 enum netdev_features current;
3670 netdev_linux_read_features(netdev);
3671 current = !netdev->get_features_error ? netdev->current : 0;
3672 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3674 hc->min_rate = hc->max_rate;
/* Parses "min-rate", "max-rate", "burst" and "priority" from 'details'
 * (rates in bits/s, burst in bits) into the byte-based fields of '*hc',
 * clamping rates to the enclosing qdisc's max_rate. */
htb_parse_class_details__(struct netdev *netdev,
                          const struct smap *details, struct htb_class *hc)
    const struct htb *htb = htb_get__(netdev);
    const char *min_rate_s = smap_get(details, "min-rate");
    const char *max_rate_s = smap_get(details, "max-rate");
    const char *burst_s = smap_get(details, "burst");
    const char *priority_s = smap_get(details, "priority");

    error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
        VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
                     netdev_get_name(netdev));

    /* HTB requires at least an mtu sized min-rate to send any traffic even
     * on uncongested links. */
    hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
    hc->min_rate = MAX(hc->min_rate, mtu);
    hc->min_rate = MIN(hc->min_rate, htb->max_rate);

    /* max-rate defaults (on the elided line) and is clamped between
     * min_rate and the qdisc max_rate. */
    hc->max_rate = (max_rate_s
                    ? strtoull(max_rate_s, NULL, 10) / 8
    hc->max_rate = MAX(hc->max_rate, hc->min_rate);
    hc->max_rate = MIN(hc->max_rate, htb->max_rate);

    /*
     * According to hints in the documentation that I've read, it is important
     * that 'burst' be at least as big as the largest frame that might be
     * transmitted.  Also, making 'burst' a bit bigger than necessary is OK,
     * but having it a bit too small is a problem.  Since netdev_get_mtu()
     * doesn't include the Ethernet header, we need to add at least 14 (18?) to
     * the MTU.  We actually add 64, instead of 14, as a guard against
     * additional headers get tacked on somewhere that we're not aware of. */
    hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
    hc->burst = MAX(hc->burst, mtu + 64);

    hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
3729 htb_query_class__(const struct netdev *netdev, unsigned int handle,
3730 unsigned int parent, struct htb_class *options,
3731 struct netdev_queue_stats *stats)
3733 struct ofpbuf *reply;
3736 error = tc_query_class(netdev, handle, parent, &reply);
3738 error = htb_parse_tcmsg__(reply, NULL, options, stats);
3739 ofpbuf_delete(reply);
3745 htb_tc_install(struct netdev *netdev, const struct smap *details)
3749 error = htb_setup_qdisc__(netdev);
3751 struct htb_class hc;
3753 htb_parse_qdisc_details__(netdev, details, &hc);
3754 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3755 tc_make_handle(1, 0), &hc);
3757 htb_install__(netdev, hc.max_rate);
3763 static struct htb_class *
3764 htb_class_cast__(const struct tc_queue *queue)
3766 return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Creates (or finds) the cached htb_class for 'queue_id' in 'netdev''s htb
 * queue map and copies 'hc''s parameters into it. */
htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
                   const struct htb_class *hc)
    struct htb *htb = htb_get__(netdev);
    size_t hash = hash_int(queue_id, 0);
    struct tc_queue *queue;
    struct htb_class *hcp;

    queue = tc_find_queue__(netdev, queue_id, hash);
        /* Existing queue: reuse its htb_class. */
        hcp = htb_class_cast__(queue);
        /* New queue: allocate and insert into the qdisc's queue hmap. */
        hcp = xmalloc(sizeof *hcp);
        queue = &hcp->tc_queue;
        queue->queue_id = queue_id;
        queue->created = time_msec();
        hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);

    hcp->min_rate = hc->min_rate;
    hcp->max_rate = hc->max_rate;
    hcp->burst = hc->burst;
    hcp->priority = hc->priority;
3796 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3799 struct queue_dump_state state;
3800 struct htb_class hc;
3802 /* Get qdisc options. */
3804 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3805 htb_install__(netdev, hc.max_rate);
3808 if (!start_queue_dump(netdev, &state)) {
3811 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
3812 unsigned int queue_id;
3814 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3815 htb_update_queue__(netdev, queue_id, &hc);
3818 finish_queue_dump(&state);
3824 htb_tc_destroy(struct tc *tc)
3826 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
3827 struct htb_class *hc, *next;
3829 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
3830 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
3838 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
3840 const struct htb *htb = htb_get__(netdev);
3841 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
3846 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
3848 struct htb_class hc;
3851 htb_parse_qdisc_details__(netdev, details, &hc);
3852 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3853 tc_make_handle(1, 0), &hc);
3855 htb_get__(netdev)->max_rate = hc.max_rate;
/* Exposes one HTB class's settings as OVSDB-style details (bits/s for the
 * rate keys).  "max-rate" is only reported when it differs from "min-rate",
 * matching how OVS treats an unset max-rate. */
3861 htb_class_get(const struct netdev *netdev OVS_UNUSED,
3862 const struct tc_queue *queue, struct smap *details)
3864 const struct htb_class *hc = htb_class_cast__(queue);
3866 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3867 if (hc->min_rate != hc->max_rate) {
3868 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3870 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
3872 smap_add_format(details, "priority", "%u", hc->priority);
/* Creates or modifies one HTB class.  Queue N maps to kernel class handle
 * 1:(N+1) parented on 1:fffe; on success the local mirror is updated too. */
3878 htb_class_set(struct netdev *netdev, unsigned int queue_id,
3879 const struct smap *details)
3881 struct htb_class hc;
3884 error = htb_parse_class_details__(netdev, details, &hc);
3889 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3890 tc_make_handle(1, 0xfffe), &hc);
3895 htb_update_queue__(netdev, queue_id, &hc);
/* Deletes the kernel class for 'queue' (handle 1:(queue_id+1)) and, on
 * success, drops the matching entry from the local queue map. */
3900 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
3902 struct htb_class *hc = htb_class_cast__(queue);
3903 struct htb *htb = htb_get__(netdev);
3906 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3908 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches kernel statistics for one HTB queue; thin wrapper that maps the
 * queue id to its class handle 1:(id+1) under parent 1:fffe. */
3915 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3916 struct netdev_queue_stats *stats)
3918 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3919 tc_make_handle(1, 0xfffe), NULL, stats);
/* Callback used while dumping all classes: parses one netlink class message
 * and, when the handle is a queue we own (major 1, minor in 1..HTB_N_QUEUES),
 * reports its stats to 'cb' with the 0-based queue id. */
3923 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3924 const struct ofpbuf *nlmsg,
3925 netdev_dump_queue_stats_cb *cb, void *aux)
3927 struct netdev_queue_stats stats;
3928 unsigned int handle, major, minor;
3931 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3936 major = tc_get_major(handle);
3937 minor = tc_get_minor(handle);
3938 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3939 (*cb)(minor - 1, &stats, aux);
/* Virtual-function table binding the kernel "htb" qdisc to the OVSDB QoS
 * type "linux-htb".  (Some member initializers are elided in this view.) */
3944 static const struct tc_ops tc_ops_htb = {
3945 "htb", /* linux_name */
3946 "linux-htb", /* ovs_name */
3947 HTB_N_QUEUES, /* n_queues */
3956 htb_class_get_stats,
3957 htb_class_dump_stats
3960 /* "linux-hfsc" traffic control class. */
3962 #define HFSC_N_QUEUES 0xf000
3970 struct tc_queue tc_queue;
/* Returns 'netdev''s tc state downcast to the HFSC implementation.  Only
 * valid when the installed tc is in fact an hfsc (tc_ops_hfsc). */
3975 static struct hfsc *
3976 hfsc_get__(const struct netdev *netdev_)
3978 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3979 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
/* Downcasts a generic tc_queue to its enclosing hfsc_class. */
3982 static struct hfsc_class *
3983 hfsc_class_cast__(const struct tc_queue *queue)
3985 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Allocates and installs a fresh local HFSC tc object on 'netdev_' with the
 * given qdisc-level max_rate (bytes/s).  Does not talk to the kernel. */
3989 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
3991 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3994 hfsc = xmalloc(sizeof *hfsc);
3995 tc_init(&hfsc->tc, &tc_ops_hfsc);
3996 hfsc->max_rate = max_rate;
3997 netdev->tc = &hfsc->tc;
/* Records queue 'queue_id' with settings 'hc' in the local HFSC queue map,
 * reusing the existing entry if present or allocating a new one otherwise. */
4001 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
4002 const struct hfsc_class *hc)
4006 struct hfsc_class *hcp;
4007 struct tc_queue *queue;
4009 hfsc = hfsc_get__(netdev);
4010 hash = hash_int(queue_id, 0);
4012 queue = tc_find_queue__(netdev, queue_id, hash);
4014 hcp = hfsc_class_cast__(queue);
/* Not found: create a new entry keyed by the queue id's hash. */
4016 hcp = xmalloc(sizeof *hcp);
4017 queue = &hcp->tc_queue;
4018 queue->queue_id = queue_id;
4019 queue->created = time_msec();
4020 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
/* Either way, refresh the cached rates. */
4023 hcp->min_rate = hc->min_rate;
4024 hcp->max_rate = hc->max_rate;
/* Parses the TCA_OPTIONS of an HFSC class into 'class'.  OVS only supports
 * linear service curves (m1 == 0, d == 0) with identical real-time and
 * link-share curves; anything else is rejected with a rate-limited warning.
 * On success, min_rate comes from the link-share curve's m2 and max_rate
 * from the upper-limit curve's m2. */
4028 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
4030 const struct tc_service_curve *rsc, *fsc, *usc;
4031 static const struct nl_policy tca_hfsc_policy[] = {
4033 .type = NL_A_UNSPEC,
4035 .min_len = sizeof(struct tc_service_curve),
4038 .type = NL_A_UNSPEC,
4040 .min_len = sizeof(struct tc_service_curve),
4043 .type = NL_A_UNSPEC,
4045 .min_len = sizeof(struct tc_service_curve),
4048 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
4050 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
4051 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
4052 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
4056 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
4057 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
4058 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
/* Reject any non-linear curve (nonzero initial slope m1 or delay d). */
4060 if (rsc->m1 != 0 || rsc->d != 0 ||
4061 fsc->m1 != 0 || fsc->d != 0 ||
4062 usc->m1 != 0 || usc->d != 0) {
4063 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4064 "Non-linear service curves are not supported.");
/* The real-time curve must match the link-share curve. */
4068 if (rsc->m2 != fsc->m2) {
4069 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4070 "Real-time service curves are not supported ");
/* min-rate must not exceed max-rate. */
4074 if (rsc->m2 > usc->m2) {
4075 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4076 "Min-rate service curve is greater than "
4077 "the max-rate service curve.");
4081 class->min_rate = fsc->m2;
4082 class->max_rate = usc->m2;
/* Parses a netlink class message for an HFSC queue.  Any of the output
 * arguments may be NULL to skip that piece: 'queue_id' becomes the 0-based
 * queue number derived from the class handle, 'options' is filled from the
 * TCA_OPTIONS curves, and 'stats' from the class statistics. */
4087 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
4088 struct hfsc_class *options,
4089 struct netdev_queue_stats *stats)
4092 unsigned int handle;
4093 struct nlattr *nl_options;
4095 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
4101 unsigned int major, minor;
4103 major = tc_get_major(handle);
4104 minor = tc_get_minor(handle);
/* Only handles of form 1:N with 1 <= N <= HFSC_N_QUEUES are queues. */
4105 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
4106 *queue_id = minor - 1;
4113 error = hfsc_parse_tca_options__(nl_options, options);
/* Queries the kernel for one HFSC class (by handle/parent) and parses the
 * reply into 'options' and/or 'stats' (either may be NULL). */
4120 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
4121 unsigned int parent, struct hfsc_class *options,
4122 struct netdev_queue_stats *stats)
4125 struct ofpbuf *reply;
4127 error = tc_query_class(netdev, handle, parent, &reply);
4132 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
4133 ofpbuf_delete(reply);
/* Extracts qdisc-level settings from OVSDB 'details'.  "max-rate" is given
 * in bits/s and stored in bytes/s; if absent (or zero — the elided branch
 * presumably checks this), the link's current feature-derived speed is used,
 * defaulting to 100 Mbps when features are unknown. */
4138 hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
4139 struct hfsc_class *class)
4141 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4143 const char *max_rate_s;
4145 max_rate_s = smap_get(details, "max-rate");
4146 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
4149 enum netdev_features current;
4151 netdev_linux_read_features(netdev);
4152 current = !netdev->get_features_error ? netdev->current : 0;
4153 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
/* The root class uses max_rate for both curves. */
4156 class->min_rate = max_rate;
4157 class->max_rate = max_rate;
/* Extracts per-queue settings from OVSDB 'details' (rates in bits/s,
 * stored as bytes/s).  min-rate is clamped to [1, qdisc max_rate]; max-rate
 * is clamped to [min_rate, qdisc max_rate] and defaults to the qdisc
 * max_rate when unset (default branch elided in this view — confirm). */
4161 hfsc_parse_class_details__(struct netdev *netdev,
4162 const struct smap *details,
4163 struct hfsc_class * class)
4165 const struct hfsc *hfsc;
4166 uint32_t min_rate, max_rate;
4167 const char *min_rate_s, *max_rate_s;
4169 hfsc = hfsc_get__(netdev);
4170 min_rate_s = smap_get(details, "min-rate");
4171 max_rate_s = smap_get(details, "max-rate");
4173 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
4174 min_rate = MAX(min_rate, 1);
4175 min_rate = MIN(min_rate, hfsc->max_rate);
4177 max_rate = (max_rate_s
4178 ? strtoull(max_rate_s, NULL, 10) / 8
4180 max_rate = MAX(max_rate, min_rate);
4181 max_rate = MIN(max_rate, hfsc->max_rate);
4183 class->min_rate = min_rate;
4184 class->max_rate = max_rate;
4189 /* Create an HFSC qdisc.
4191 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
/* Replaces any existing root qdisc with a fresh HFSC qdisc at handle 1:0
 * (see the "tc qdisc add ... hfsc default 1" equivalence above). */
4193 hfsc_setup_qdisc__(struct netdev * netdev)
4195 struct tcmsg *tcmsg;
4196 struct ofpbuf request;
4197 struct tc_hfsc_qopt opt;
/* Remove the old root first; HFSC must be installed from scratch. */
4199 tc_del_qdisc(netdev);
4201 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
4202 NLM_F_EXCL | NLM_F_CREATE, &request);
4208 tcmsg->tcm_handle = tc_make_handle(1, 0);
4209 tcmsg->tcm_parent = TC_H_ROOT;
4211 memset(&opt, 0, sizeof opt);
4214 nl_msg_put_string(&request, TCA_KIND, "hfsc");
4215 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
4217 return tc_transact(&request, NULL);
4220 /* Create an HFSC class.
4222 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
4223 * sc rate <min_rate> ul rate <max_rate>" */
/* Creates or replaces one HFSC class.  Both the real-time (RSC) and
 * link-share (FSC) curves are set to min_rate, the upper-limit (USC) curve
 * to max_rate; all linear (m1/d zeroing is elided in this view).  Failures
 * are logged (rate-limited) with the full class identity. */
4225 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
4226 unsigned int parent, struct hfsc_class *class)
4230 struct tcmsg *tcmsg;
4231 struct ofpbuf request;
4232 struct tc_service_curve min, max;
4234 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
4240 tcmsg->tcm_handle = handle;
4241 tcmsg->tcm_parent = parent;
4245 min.m2 = class->min_rate;
4249 max.m2 = class->max_rate;
4251 nl_msg_put_string(&request, TCA_KIND, "hfsc");
4252 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4253 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
4254 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
4255 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
4256 nl_msg_end_nested(&request, opt_offset);
4258 error = tc_transact(&request, NULL);
4260 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
4261 "min-rate %ubps, max-rate %ubps (%s)",
4262 netdev_get_name(netdev),
4263 tc_get_major(handle), tc_get_minor(handle),
4264 tc_get_major(parent), tc_get_minor(parent),
4265 class->min_rate, class->max_rate, ovs_strerror(error));
/* Installs a new HFSC configuration: creates the qdisc, configures the root
 * class 1:fffe from 'details', and sets up the local hfsc state. */
4272 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
4275 struct hfsc_class class;
4277 error = hfsc_setup_qdisc__(netdev);
4283 hfsc_parse_qdisc_details__(netdev, details, &class);
4284 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4285 tc_make_handle(1, 0), &class);
4291 hfsc_install__(netdev, class.max_rate);
/* Mirrors an existing kernel HFSC qdisc into local state: reads max-rate
 * from the root class 1:fffe, then dumps and records every queue class.
 * Parallel to htb_tc_load(). */
4296 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4299 struct queue_dump_state state;
4300 struct hfsc_class hc;
4303 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
4304 hfsc_install__(netdev, hc.max_rate);
4306 if (!start_queue_dump(netdev, &state)) {
4310 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
4311 unsigned int queue_id;
4313 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
4314 hfsc_update_queue__(netdev, queue_id, &hc);
4318 finish_queue_dump(&state);
/* Releases all local HFSC state; each queue entry is unlinked (and
 * presumably freed — freeing lines elided) before the container goes. */
4323 hfsc_tc_destroy(struct tc *tc)
4326 struct hfsc_class *hc, *next;
4328 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
4330 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
4331 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Reports the qdisc-level HFSC max-rate in bits/s (stored as bytes/s). */
4340 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
4342 const struct hfsc *hfsc;
4343 hfsc = hfsc_get__(netdev);
4344 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
/* Applies new qdisc-level settings by reconfiguring root class 1:fffe, then
 * caches the new max_rate in the local hfsc state. */
4349 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
4352 struct hfsc_class class;
4354 hfsc_parse_qdisc_details__(netdev, details, &class);
4355 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4356 tc_make_handle(1, 0), &class);
4359 hfsc_get__(netdev)->max_rate = class.max_rate;
/* Reports one queue's min-rate (and max-rate if distinct) in bits/s,
 * mirroring htb_class_get()'s convention of omitting an implied max-rate. */
4366 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
4367 const struct tc_queue *queue, struct smap *details)
4369 const struct hfsc_class *hc;
4371 hc = hfsc_class_cast__(queue);
4372 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
4373 if (hc->min_rate != hc->max_rate) {
4374 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
/* Creates or modifies queue 'queue_id' (kernel class 1:(id+1), parent
 * 1:fffe) and records the result locally on success. */
4380 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
4381 const struct smap *details)
4384 struct hfsc_class class;
4386 error = hfsc_parse_class_details__(netdev, details, &class);
4391 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
4392 tc_make_handle(1, 0xfffe), &class);
4397 hfsc_update_queue__(netdev, queue_id, &class);
/* Deletes the kernel class backing 'queue' and, on success, removes its
 * local mirror from the queue map. */
4402 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
4406 struct hfsc_class *hc;
4408 hc = hfsc_class_cast__(queue);
4409 hfsc = hfsc_get__(netdev);
4411 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
4413 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches kernel statistics for one HFSC queue (class 1:(id+1)). */
4420 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
4421 struct netdev_queue_stats *stats)
4423 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
4424 tc_make_handle(1, 0xfffe), NULL, stats);
/* Class-dump callback: reports stats for handles of form 1:N with
 * 1 <= N <= HFSC_N_QUEUES, translating to the 0-based queue id.
 * Parallel to htb_class_dump_stats(). */
4428 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
4429 const struct ofpbuf *nlmsg,
4430 netdev_dump_queue_stats_cb *cb, void *aux)
4432 struct netdev_queue_stats stats;
4433 unsigned int handle, major, minor;
4436 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
4441 major = tc_get_major(handle);
4442 minor = tc_get_minor(handle);
4443 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
4444 (*cb)(minor - 1, &stats, aux);
/* Virtual-function table binding the kernel "hfsc" qdisc to the OVSDB QoS
 * type "linux-hfsc". */
4449 static const struct tc_ops tc_ops_hfsc = {
4450 "hfsc", /* linux_name */
4451 "linux-hfsc", /* ovs_name */
4452 HFSC_N_QUEUES, /* n_queues */
4453 hfsc_tc_install, /* tc_install */
4454 hfsc_tc_load, /* tc_load */
4455 hfsc_tc_destroy, /* tc_destroy */
4456 hfsc_qdisc_get, /* qdisc_get */
4457 hfsc_qdisc_set, /* qdisc_set */
4458 hfsc_class_get, /* class_get */
4459 hfsc_class_set, /* class_set */
4460 hfsc_class_delete, /* class_delete */
4461 hfsc_class_get_stats, /* class_get_stats */
4462 hfsc_class_dump_stats /* class_dump_stats */
4465 /* "linux-default" traffic control class.
4467 * This class represents the default, unnamed Linux qdisc. It corresponds to
4468 * the "" (empty string) QoS type in the OVS database. */
/* Points 'netdev_' at the shared, immutable "default qdisc" tc singleton.
 * No kernel interaction; the static tc lives for the process lifetime. */
4471 default_install__(struct netdev *netdev_)
4473 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4474 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
4476 /* Nothing but a tc class implementation is allowed to write to a tc. This
4477 * class never does that, so we can legitimately use a const tc object. */
4478 netdev->tc = CONST_CAST(struct tc *, &tc);
/* tc_install hook for the default qdisc: just adopt the shared singleton. */
4482 default_tc_install(struct netdev *netdev,
4483 const struct smap *details OVS_UNUSED)
4485 default_install__(netdev);
/* tc_load hook for the default qdisc: identical to install. */
4490 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4492 default_install__(netdev);
/* Ops table for the unnamed default Linux qdisc (OVSDB QoS type "").
 * All optional hooks are NULL: nothing to configure, destroy, or query. */
4496 static const struct tc_ops tc_ops_default = {
4497 NULL, /* linux_name */
4502 NULL, /* tc_destroy */
4503 NULL, /* qdisc_get */
4504 NULL, /* qdisc_set */
4505 NULL, /* class_get */
4506 NULL, /* class_set */
4507 NULL, /* class_delete */
4508 NULL, /* class_get_stats */
4509 NULL /* class_dump_stats */
4512 /* "linux-other" traffic control class.
/* tc_load hook for qdiscs OVS does not recognize ("linux-other"): adopts a
 * shared immutable tc singleton, same pattern as default_install__(). */
4517 other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
4519 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4520 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
4522 /* Nothing but a tc class implementation is allowed to write to a tc. This
4523 * class never does that, so we can legitimately use a const tc object. */
4524 netdev->tc = CONST_CAST(struct tc *, &tc);
/* Ops table for unrecognized qdiscs: read-only placeholder, no hooks. */
4528 static const struct tc_ops tc_ops_other = {
4529 NULL, /* linux_name */
4530 "linux-other", /* ovs_name */
4532 NULL, /* tc_install */
4534 NULL, /* tc_destroy */
4535 NULL, /* qdisc_get */
4536 NULL, /* qdisc_set */
4537 NULL, /* class_get */
4538 NULL, /* class_set */
4539 NULL, /* class_delete */
4540 NULL, /* class_get_stats */
4541 NULL /* class_dump_stats */
4544 /* Traffic control. */
4546 /* Number of kernel "tc" ticks per second. */
4547 static double ticks_per_s;
4549 /* Number of kernel "jiffies" per second. This is used for the purpose of
4550 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
4551 * one jiffy's worth of data.
4553 * There are two possibilities here:
4555 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
4556 * approximate range of 100 to 1024. That means that we really need to
4557 * make sure that the qdisc can buffer that much data.
4559 * - 'buffer_hz' is an absurdly large number. That means that the kernel
4560 * has finely granular timers and there's no need to fudge additional room
4561 * for buffers. (There's no extra effort needed to implement that: the
4562 * large 'buffer_hz' is used as a divisor, so practically any number will
4563 * come out as 0 in the division. Small integer results in the case of
4564 * really high dividends won't have any real effect anyhow.)
4566 static unsigned int buffer_hz;
/* Returns tc handle 'major':'minor'.  The major number occupies the high
 * 16 bits of a tc handle, the minor the low 16 (see TC_H_MAKE). */
4570 tc_make_handle(unsigned int major, unsigned int minor)
4572 return TC_H_MAKE(major << 16, minor);
/* Returns the major number from 'handle' (high 16 bits). */
4577 tc_get_major(unsigned int handle)
4579 return TC_H_MAJ(handle) >> 16;
/* Returns the minor number from 'handle' (low 16 bits). */
4584 tc_get_minor(unsigned int handle)
4586 return TC_H_MIN(handle);
/* Initializes 'request' as an rtnetlink tc message of the given 'type'
 * (RTM_NEWQDISC etc.) and 'flags' for 'netdev', returning a pointer to the
 * embedded tcmsg with family/ifindex filled in.  NOTE(review): the error
 * path when get_ifindex() fails is elided here — presumably returns NULL. */
4590 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
4591 struct ofpbuf *request)
4593 struct tcmsg *tcmsg;
4597 error = get_ifindex(netdev, &ifindex);
4602 ofpbuf_init(request, 512);
4603 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
4604 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
4605 tcmsg->tcm_family = AF_UNSPEC;
4606 tcmsg->tcm_ifindex = ifindex;
4607 /* Caller should fill in tcmsg->tcm_handle. */
4608 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' on the NETLINK_ROUTE socket (optionally collecting a
 * reply in '*replyp') and releases the request buffer.  Returns the
 * nl_transact() errno result. */
4614 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
4616 int error = nl_transact(NETLINK_ROUTE, request, replyp);
4617 ofpbuf_uninit(request);
4621 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
4622 * policing configuration.
4624 * This function is equivalent to running the following when 'add' is true:
4625 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
4627 * This function is equivalent to running the following when 'add' is false:
4628 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
4630 * The configuration and stats may be seen with the following command:
4631 * /sbin/tc -s qdisc show dev <devname>
4633 * Returns 0 if successful, otherwise a positive errno value.
/* Adds ('add' true) or removes ('add' false) the ffff: ingress qdisc used
 * for policing.  On deletion, ENOENT/EINVAL are tolerated: the qdisc being
 * absent already satisfies the caller's intent. */
4636 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
4638 struct ofpbuf request;
4639 struct tcmsg *tcmsg;
4641 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
4642 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
4644 tcmsg = tc_make_request(netdev, type, flags, &request);
4648 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
4649 tcmsg->tcm_parent = TC_H_INGRESS;
4650 nl_msg_put_string(&request, TCA_KIND, "ingress");
4651 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
4653 error = tc_transact(&request, NULL);
4655 /* If we're deleting the qdisc, don't worry about some of the
4656 * error conditions. */
4657 if (!add && (error == ENOENT || error == EINVAL)) {
4666 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
4669 * This function is equivalent to running:
4670 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
4671 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
4674 * The configuration and stats may be seen with the following command:
4675 * /sbin/tc -s filter show dev <devname> parent ffff:
4677 * Returns 0 if successful, otherwise a positive errno value.
/* Attaches a "basic" filter with a TBF policer to the ffff: ingress qdisc,
 * dropping (TC_POLICE_SHOT) traffic above 'kbits_rate' with burst
 * 'kbits_burst'.  See the command equivalence in the comment above; the
 * burst computation deliberately mixes units — see the inline note. */
4680 tc_add_policer(struct netdev *netdev,
4681 uint32_t kbits_rate, uint32_t kbits_burst)
4683 struct tc_police tc_police;
4684 struct ofpbuf request;
4685 struct tcmsg *tcmsg;
4686 size_t basic_offset;
4687 size_t police_offset;
4691 memset(&tc_police, 0, sizeof tc_police);
4692 tc_police.action = TC_POLICE_SHOT;
4693 tc_police.mtu = mtu;
4694 tc_fill_rate(&tc_police.rate, ((uint64_t) kbits_rate * 1000)/8, mtu);
4696 /* The following appears wrong in two ways:
4698 * - tc_bytes_to_ticks() should take "bytes" as quantity for both of its
4699 * arguments (or at least consistently "bytes" as both or "bits" as
4700 * both), but this supplies bytes for the first argument and bits for the
4703 * - In networking a kilobit is usually 1000 bits but this uses 1024 bits.
4705 * However if you "fix" those problems then "tc filter show ..." shows
4706 * "125000b", meaning 125,000 bits, when OVS configures it for 1000 kbit ==
4707 * 1,000,000 bits, whereas this actually ends up doing the right thing from
4708 * tc's point of view. Whatever. */
4709 tc_police.burst = tc_bytes_to_ticks(
4710 tc_police.rate.rate, MIN(UINT32_MAX / 1024, kbits_burst) * 1024);
4712 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
4713 NLM_F_EXCL | NLM_F_CREATE, &request);
/* Filter priority 49, protocol ETH_P_ALL (byte-swapped into tcm_info). */
4717 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
4718 tcmsg->tcm_info = tc_make_handle(49,
4719 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
4721 nl_msg_put_string(&request, TCA_KIND, "basic");
4722 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4723 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
4724 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
4725 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
4726 nl_msg_end_nested(&request, police_offset);
4727 nl_msg_end_nested(&request, basic_offset);
4729 error = tc_transact(&request, NULL);
4740 /* The values in psched are not individually very meaningful, but they are
4741 * important. The tables below show some values seen in the wild.
4745 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
4746 * (Before that, there are hints that it was 1000000000.)
4748 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
4752 * -----------------------------------
4753 * [1] 000c8000 000f4240 000f4240 00000064
4754 * [2] 000003e8 00000400 000f4240 3b9aca00
4755 * [3] 000003e8 00000400 000f4240 3b9aca00
4756 * [4] 000003e8 00000400 000f4240 00000064
4757 * [5] 000003e8 00000040 000f4240 3b9aca00
4758 * [6] 000003e8 00000040 000f4240 000000f9
4760 * a b c d ticks_per_s buffer_hz
4761 * ------- --------- ---------- ------------- ----------- -------------
4762 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
4763 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4764 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4765 * [4] 1,000 1,024 1,000,000 100 976,562 100
4766 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
4767 * [6] 1,000 64 1,000,000 249 15,625,000 249
4769 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
4770 * [2] 2.6.26-1-686-bigmem from Debian lenny
4771 * [3] 2.6.26-2-sparc64 from Debian lenny
4772 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
4773 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
4774 * [6] 2.6.34 from kernel.org on KVM
/* Body fragment of the psched reader (function header elided in this view):
 * once per process, parses /proc/net/psched's four hex fields and derives
 * ticks_per_s and buffer_hz from them, per the tables documented above. */
4776 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4777 static const char fn[] = "/proc/net/psched";
4778 unsigned int a, b, c, d;
/* ovsthread_once guards this whole computation; done only once. */
4781 if (!ovsthread_once_start(&once)) {
4788 stream = fopen(fn, "r");
4790 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
4794 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
4795 VLOG_WARN("%s: read failed", fn);
4799 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
4803 VLOG_WARN("%s: invalid scheduler parameters", fn);
/* ticks_per_s = a * c / b (see the worked table in the comment above). */
4807 ticks_per_s = (double) a * c / b;
4811 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
4814 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
4817 ovsthread_once_done(&once);
/* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
 * rate of 'rate' bytes per second.  (ticks_per_s is lazily initialized from
 * /proc/net/psched — initialization call is elided in this view.) */
4823 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
4826 return (rate * ticks) / ticks_per_s;
/* Returns the number of ticks that it would take to transmit 'size' bytes at
 * a rate of 'rate' bytes per second.  A zero rate yields zero ticks rather
 * than dividing by zero. */
4832 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
4835 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
/* Returns the number of bytes that need to be reserved for qdisc buffering
 * at 'rate' bytes per second: one jiffy's worth of traffic.  When buffer_hz
 * is huge (fine-grained kernel timers) this is effectively zero — see the
 * 'buffer_hz' comment above. */
4841 tc_buffer_per_jiffy(unsigned int rate)
4844 return rate / buffer_hz;
4847 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
4848 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
4849 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
4850 * stores NULL into it if it is absent.
4852 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
4855 * Returns 0 if successful, otherwise a positive errno value. */
/* Extracts the qdisc kind (e.g. "htb") and optional TCA_OPTIONS attribute
 * from netlink 'msg'.  Outputs point into 'msg' (caller owns lifetime); see
 * the contract in the comment block above. */
4857 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
4858 struct nlattr **options)
4860 static const struct nl_policy tca_policy[] = {
4861 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
4862 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
4864 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
/* Attributes start after the netlink header plus the fixed tcmsg. */
4866 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4867 tca_policy, ta, ARRAY_SIZE(ta))) {
4868 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
4873 *kind = nl_attr_get_string(ta[TCA_KIND]);
4877 *options = ta[TCA_OPTIONS];
4892 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
4893 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
4894 * into '*options', and its queue statistics into '*stats'. Any of the output
4895 * arguments may be null.
4897 * Returns 0 if successful, otherwise a positive errno value. */
/* Parses a netlink class message: the raw class handle into '*handlep',
 * TCA_OPTIONS into '*options', and TCA_STATS2 basic/queue counters into
 * '*stats'.  Any output argument may be NULL.  On stats-parse failure the
 * stats struct is zeroed rather than left uninitialized. */
4899 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
4900 struct nlattr **options, struct netdev_queue_stats *stats)
4902 static const struct nl_policy tca_policy[] = {
4903 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
4904 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
4906 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4908 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4909 tca_policy, ta, ARRAY_SIZE(ta))) {
4910 VLOG_WARN_RL(&rl, "failed to parse class message");
/* The class handle lives in the fixed tcmsg header, not an attribute. */
4915 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
4916 *handlep = tc->tcm_handle;
4920 *options = ta[TCA_OPTIONS];
4924 const struct gnet_stats_queue *gsq;
4925 struct gnet_stats_basic gsb;
4927 static const struct nl_policy stats_policy[] = {
4928 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
4929 .min_len = sizeof gsb },
4930 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
4931 .min_len = sizeof *gsq },
4933 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
4935 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
4936 sa, ARRAY_SIZE(sa))) {
4937 VLOG_WARN_RL(&rl, "failed to parse class stats");
4941 /* Alignment issues screw up the length of struct gnet_stats_basic on
4942 * some arch/bitsize combinations. Newer versions of Linux have a
4943 * struct gnet_stats_basic_packed, but we can't depend on that. The
4944 * easiest thing to do is just to make a copy. */
4945 memset(&gsb, 0, sizeof gsb);
4946 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
4947 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
4948 stats->tx_bytes = gsb.bytes;
4949 stats->tx_packets = gsb.packets;
/* Queue drops are surfaced as tx_errors. */
4951 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
4952 stats->tx_errors = gsq->drops;
/* Error path: zero the caller's stats so they are never garbage. */
4962 memset(stats, 0, sizeof *stats);
/* Queries the kernel for class with identifier 'handle' and parent 'parent',
 * returning the raw netlink reply in '*replyp' (caller frees).  NLM_F_ECHO
 * makes the kernel echo the class back in the reply.  Failures are logged
 * rate-limited with the full class identity. */
4970 tc_query_class(const struct netdev *netdev,
4971 unsigned int handle, unsigned int parent,
4972 struct ofpbuf **replyp)
4974 struct ofpbuf request;
4975 struct tcmsg *tcmsg;
4978 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
4982 tcmsg->tcm_handle = handle;
4983 tcmsg->tcm_parent = parent;
4985 error = tc_transact(&request, replyp);
4987 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
4988 netdev_get_name(netdev),
4989 tc_get_major(handle), tc_get_minor(handle),
4990 tc_get_major(parent), tc_get_minor(parent),
4991 ovs_strerror(error));
/* Equivalent to "tc class del dev <name> handle <handle>".  Logs failures
 * (rate-limited) and returns the errno from the netlink transaction. */
4998 tc_delete_class(const struct netdev *netdev, unsigned int handle)
5000 struct ofpbuf request;
5001 struct tcmsg *tcmsg;
5004 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
5008 tcmsg->tcm_handle = handle;
5009 tcmsg->tcm_parent = 0;
5011 error = tc_transact(&request, NULL);
5013 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
5014 netdev_get_name(netdev),
5015 tc_get_major(handle), tc_get_minor(handle),
5016 ovs_strerror(error));
/* Equivalent to "tc qdisc del dev <name> root".  EINVAL is treated as
 * success (the built-in default qdisc cannot be deleted, which already
 * achieves the goal).  On success, also tears down the cached local tc
 * state so it will be re-queried later. */
5023 tc_del_qdisc(struct netdev *netdev_)
5025 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5026 struct ofpbuf request;
5027 struct tcmsg *tcmsg;
5030 tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
5034 tcmsg->tcm_handle = tc_make_handle(1, 0);
5035 tcmsg->tcm_parent = TC_H_ROOT;
5037 error = tc_transact(&request, NULL);
5038 if (error == EINVAL) {
5039 /* EINVAL probably means that the default qdisc was in use, in which
5040 * case we've accomplished our purpose. */
/* Drop the local mirror so the next tc_query_qdisc() re-reads the kernel. */
5043 if (!error && netdev->tc) {
5044 if (netdev->tc->ops->tc_destroy) {
5045 netdev->tc->ops->tc_destroy(netdev->tc);
/* Returns whether RTM_GETQDISC can be used without risking a kernel OOPS:
 * true on Linux >= 2.6.35 (which has the qdisc_notify fix — see the long
 * comment in tc_query_qdisc()).  Computed once via uname(); parse failures
 * conservatively leave 'safe' false. */
5055 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5056 static bool safe = false;
5058 if (ovsthread_once_start(&once)) {
5059 struct utsname utsname;
5062 if (uname(&utsname) == -1) {
5063 VLOG_WARN("uname failed (%s)", ovs_strerror(errno));
5064 } else if (!ovs_scan(utsname.release, "%d.%d", &major, &minor)) {
5065 VLOG_WARN("uname reported bad OS release (%s)", utsname.release);
5066 } else if (major < 2 || (major == 2 && minor < 35)) {
5067 VLOG_INFO("disabling unsafe RTM_GETQDISC in Linux kernel %s",
5072 ovsthread_once_done(&once);
/* If 'netdev''s qdisc type and parameters are not yet known, queries the
 * kernel to determine what they are.  Returns 0 if successful, otherwise a
 * positive errno value.  Picks the matching tc_ops (htb/hfsc/other/default)
 * by the qdisc's TCA_KIND and instantiates local state via ops->tc_load. */
5081 tc_query_qdisc(const struct netdev *netdev_)
5083 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5084 struct ofpbuf request, *qdisc;
5085 const struct tc_ops *ops;
5086 struct tcmsg *tcmsg;
5094 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
5095 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
5096 * 2.6.35 without that fix backported to it.
5098 * To avoid the OOPS, we must not make a request that would attempt to dump
5099 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
5100 * few others. There are a few ways that I can see to do this, but most of
5101 * them seem to be racy (and if you lose the race the kernel OOPSes). The
5102 * technique chosen here is to assume that any non-default qdisc that we
5103 * create will have a class with handle 1:0. The built-in qdiscs only have
5104 * a class with handle 0:0.
5106 * On Linux 2.6.35+ we use the straightforward method because it allows us
5107 * to handle non-builtin qdiscs without handle 1:0 (e.g. codel). However,
5108 * in such a case we get no response at all from the kernel (!) if a
5109 * builtin qdisc is in use (which is later caught by "!error &&
5110 * !qdisc->size"). */
5111 tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
5115 tcmsg->tcm_handle = tc_make_handle(getqdisc_is_safe() ? 0 : 1, 0);
5116 tcmsg->tcm_parent = getqdisc_is_safe() ? TC_H_ROOT : 0;
5118 /* Figure out what tc class to instantiate. */
5119 error = tc_transact(&request, &qdisc);
5120 if (!error && qdisc->size) {
5123 error = tc_parse_qdisc(qdisc, &kind, NULL);
5125 ops = &tc_ops_other;
5127 ops = tc_lookup_linux_name(kind);
/* Recognized reply but unknown qdisc kind: treat as "linux-other". */
5129 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
5130 VLOG_DBG_RL(&rl2, "unknown qdisc \"%s\"", kind);
5132 ops = &tc_ops_other;
5135 } else if ((!error && !qdisc->size) || error == ENOENT) {
5136 /* Either it's a built-in qdisc, or (on Linux pre-2.6.35) it's a qdisc
5137 * set up by some other entity that doesn't have a handle 1:0. We will
5138 * assume that it's the system default qdisc. */
5139 ops = &tc_ops_default;
5142 /* Who knows? Maybe the device got deleted. */
5143 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
5144 netdev_get_name(netdev_), ovs_strerror(error));
5145 ops = &tc_ops_other;
5148 /* Instantiate it. */
5149 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
5150 ovs_assert((load_error == 0) == (netdev->tc != NULL));
5151 ofpbuf_delete(qdisc);
5153 return error ? error : load_error;
5156 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
5157 approximate the time to transmit packets of various lengths. For an MTU of
5158 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
5159 represents two possible packet lengths; for a MTU of 513 through 1024, four
5160 possible lengths; and so on.
5162 Returns, for the specified 'mtu', the number of bits that packet lengths
5163 need to be shifted right to fit within such a 256-entry table. */
/* Returns the "cell log" for 'mtu': the right-shift count that makes packet
 * lengths up to the (header-padded) MTU fit a 256-entry rtab table.  See
 * the explanation in the comment block above.  A zero/invalid mtu falls
 * back to ETH_PAYLOAD_MAX. */
5165 tc_calc_cell_log(unsigned int mtu)
5170 mtu = ETH_PAYLOAD_MAX;
/* Account for Ethernet + VLAN header bytes on top of the payload MTU. */
5172 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
5174 for (cell_log = 0; mtu >= 256; cell_log++) {
/* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an
 * MTU of 'mtu'.  overhead/cell_align are left zero because some distro
 * headers predate those 2.6.24 fields (hence the commented assignments). */
5184 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
5186 memset(rate, 0, sizeof *rate);
5187 rate->cell_log = tc_calc_cell_log(mtu);
5188 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
5189 /* rate->cell_align = 0; */ /* distro headers. */
5190 rate->mpu = ETH_TOTAL_MIN;
/* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
 * attribute of the specified "type".
 *
 * See tc_calc_cell_log() above for a description of "rtab"s. */
static void
tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
{
    uint32_t *rtab;
    unsigned int i;

    rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
    for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
        /* Entry 'i' covers frames up to (i + 1) << cell_log bytes, but a
         * frame is never billed for less than the minimum packet unit. */
        unsigned packet_size = (i + 1) << rate->cell_log;
        if (packet_size < rate->mpu) {
            packet_size = rate->mpu;
        }
        rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
    }
}
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * 0 is fine.) */
static unsigned int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    /* The burst must at least cover one jiffy's worth of traffic plus one
     * full packet; honor a larger user request if one was given. */
    unsigned int floor_burst = tc_buffer_per_jiffy(Bps) + mtu;
    uint64_t burst = burst_bytes > floor_burst ? burst_bytes : floor_burst;

    return tc_bytes_to_ticks(Bps, burst);
}
5225 /* Linux-only functions declared in netdev-linux.h */
5227 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
5228 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
5230 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
5231 const char *flag_name, bool enable)
5233 const char *netdev_name = netdev_get_name(netdev);
5234 struct ethtool_value evalue;
5238 COVERAGE_INC(netdev_get_ethtool);
5239 memset(&evalue, 0, sizeof evalue);
5240 error = netdev_linux_do_ethtool(netdev_name,
5241 (struct ethtool_cmd *)&evalue,
5242 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5247 COVERAGE_INC(netdev_set_ethtool);
5248 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
5249 error = netdev_linux_do_ethtool(netdev_name,
5250 (struct ethtool_cmd *)&evalue,
5251 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
5256 COVERAGE_INC(netdev_get_ethtool);
5257 memset(&evalue, 0, sizeof evalue);
5258 error = netdev_linux_do_ethtool(netdev_name,
5259 (struct ethtool_cmd *)&evalue,
5260 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5265 if (new_flags != evalue.data) {
5266 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
5267 "device %s failed", enable ? "enable" : "disable",
5268 flag_name, netdev_name);
5275 /* Utility functions. */
5277 /* Copies 'src' into 'dst', performing format conversion in the process. */
5279 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
5280 const struct rtnl_link_stats *src)
5282 dst->rx_packets = src->rx_packets;
5283 dst->tx_packets = src->tx_packets;
5284 dst->rx_bytes = src->rx_bytes;
5285 dst->tx_bytes = src->tx_bytes;
5286 dst->rx_errors = src->rx_errors;
5287 dst->tx_errors = src->tx_errors;
5288 dst->rx_dropped = src->rx_dropped;
5289 dst->tx_dropped = src->tx_dropped;
5290 dst->multicast = src->multicast;
5291 dst->collisions = src->collisions;
5292 dst->rx_length_errors = src->rx_length_errors;
5293 dst->rx_over_errors = src->rx_over_errors;
5294 dst->rx_crc_errors = src->rx_crc_errors;
5295 dst->rx_frame_errors = src->rx_frame_errors;
5296 dst->rx_fifo_errors = src->rx_fifo_errors;
5297 dst->rx_missed_errors = src->rx_missed_errors;
5298 dst->tx_aborted_errors = src->tx_aborted_errors;
5299 dst->tx_carrier_errors = src->tx_carrier_errors;
5300 dst->tx_fifo_errors = src->tx_fifo_errors;
5301 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
5302 dst->tx_window_errors = src->tx_window_errors;
5305 /* Copies 'src' into 'dst', performing format conversion in the process. */
5307 netdev_stats_from_rtnl_link_stats64(struct netdev_stats *dst,
5308 const struct rtnl_link_stats64 *src)
5310 dst->rx_packets = src->rx_packets;
5311 dst->tx_packets = src->tx_packets;
5312 dst->rx_bytes = src->rx_bytes;
5313 dst->tx_bytes = src->tx_bytes;
5314 dst->rx_errors = src->rx_errors;
5315 dst->tx_errors = src->tx_errors;
5316 dst->rx_dropped = src->rx_dropped;
5317 dst->tx_dropped = src->tx_dropped;
5318 dst->multicast = src->multicast;
5319 dst->collisions = src->collisions;
5320 dst->rx_length_errors = src->rx_length_errors;
5321 dst->rx_over_errors = src->rx_over_errors;
5322 dst->rx_crc_errors = src->rx_crc_errors;
5323 dst->rx_frame_errors = src->rx_frame_errors;
5324 dst->rx_fifo_errors = src->rx_fifo_errors;
5325 dst->rx_missed_errors = src->rx_missed_errors;
5326 dst->tx_aborted_errors = src->tx_aborted_errors;
5327 dst->tx_carrier_errors = src->tx_carrier_errors;
5328 dst->tx_fifo_errors = src->tx_fifo_errors;
5329 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
5330 dst->tx_window_errors = src->tx_window_errors;
5334 get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
5336 struct ofpbuf request;
5337 struct ofpbuf *reply;
5340 ofpbuf_init(&request, 0);
5341 nl_msg_put_nlmsghdr(&request,
5342 sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
5343 RTM_GETLINK, NLM_F_REQUEST);
5344 ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
5345 nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
5346 error = nl_transact(NETLINK_ROUTE, &request, &reply);
5347 ofpbuf_uninit(&request);
5352 if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
5353 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS64);
5354 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats64)) {
5355 netdev_stats_from_rtnl_link_stats64(stats, nl_attr_get(a));
5358 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS);
5359 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
5360 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
5363 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
5368 VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");
5373 ofpbuf_delete(reply);
5378 get_flags(const struct netdev *dev, unsigned int *flags)
5384 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
5386 *flags = ifr.ifr_flags;
5392 set_flags(const char *name, unsigned int flags)
5396 ifr.ifr_flags = flags;
5397 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
5401 do_get_ifindex(const char *netdev_name)
5406 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5407 COVERAGE_INC(netdev_get_ifindex);
5409 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
5411 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
5412 netdev_name, ovs_strerror(error));
5415 return ifr.ifr_ifindex;
5419 get_ifindex(const struct netdev *netdev_, int *ifindexp)
5421 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5423 if (!(netdev->cache_valid & VALID_IFINDEX)) {
5424 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
5427 netdev->get_ifindex_error = -ifindex;
5428 netdev->ifindex = 0;
5430 netdev->get_ifindex_error = 0;
5431 netdev->ifindex = ifindex;
5433 netdev->cache_valid |= VALID_IFINDEX;
5436 *ifindexp = netdev->ifindex;
5437 return netdev->get_ifindex_error;
5441 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
5447 memset(&ifr, 0, sizeof ifr);
5448 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5449 COVERAGE_INC(netdev_get_hwaddr);
5450 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
5452 /* ENODEV probably means that a vif disappeared asynchronously and
5453 * hasn't been removed from the database yet, so reduce the log level
5454 * to INFO for that case. */
5455 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
5456 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
5457 netdev_name, ovs_strerror(error));
5460 hwaddr_family = ifr.ifr_hwaddr.sa_family;
5461 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
5462 VLOG_WARN("%s device has unknown hardware address family %d",
5463 netdev_name, hwaddr_family);
5465 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
5470 set_etheraddr(const char *netdev_name,
5471 const uint8_t mac[ETH_ADDR_LEN])
5476 memset(&ifr, 0, sizeof ifr);
5477 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5478 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
5479 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
5480 COVERAGE_INC(netdev_set_hwaddr);
5481 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
5483 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
5484 netdev_name, ovs_strerror(error));
5490 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
5491 int cmd, const char *cmd_name)
5496 memset(&ifr, 0, sizeof ifr);
5497 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
5498 ifr.ifr_data = (caddr_t) ecmd;
5501 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
5503 if (error != EOPNOTSUPP) {
5504 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
5505 "failed: %s", cmd_name, name, ovs_strerror(error));
5507 /* The device doesn't support this operation. That's pretty
5508 * common, so there's no point in logging anything. */
5515 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
5516 int cmd, const char *cmd_name)
5521 ifr.ifr_addr.sa_family = AF_INET;
5522 error = af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
5524 const struct sockaddr_in *sin = ALIGNED_CAST(struct sockaddr_in *,
5526 *ip = sin->sin_addr;
5531 /* Returns an AF_PACKET raw socket or a negative errno value. */
5533 af_packet_sock(void)
5535 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5538 if (ovsthread_once_start(&once)) {
5539 sock = socket(AF_PACKET, SOCK_RAW, 0);
5541 int error = set_nonblocking(sock);
5548 VLOG_ERR("failed to create packet socket: %s",
5549 ovs_strerror(errno));
5551 ovsthread_once_done(&once);