2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2015 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
23 #include <arpa/inet.h>
25 #include <linux/filter.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <sys/types.h>
37 #include <sys/ioctl.h>
38 #include <sys/socket.h>
39 #include <sys/utsname.h>
40 #include <netpacket/packet.h>
42 #include <net/if_arp.h>
43 #include <net/if_packet.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
52 #include "dp-packet.h"
53 #include "dpif-netlink.h"
54 #include "dpif-netdev.h"
55 #include "dynamic-string.h"
56 #include "fatal-signal.h"
59 #include "netdev-provider.h"
60 #include "netdev-vport.h"
61 #include "netlink-notifier.h"
62 #include "netlink-socket.h"
65 #include "openflow/openflow.h"
66 #include "ovs-atomic.h"
68 #include "poll-loop.h"
69 #include "rtnetlink.h"
71 #include "socket-util.h"
74 #include "unaligned.h"
75 #include "openvswitch/vlog.h"
77 VLOG_DEFINE_THIS_MODULE(netdev_linux);
79 COVERAGE_DEFINE(netdev_set_policing);
80 COVERAGE_DEFINE(netdev_arp_lookup);
81 COVERAGE_DEFINE(netdev_get_ifindex);
82 COVERAGE_DEFINE(netdev_get_hwaddr);
83 COVERAGE_DEFINE(netdev_set_hwaddr);
84 COVERAGE_DEFINE(netdev_get_ethtool);
85 COVERAGE_DEFINE(netdev_set_ethtool);
88 /* These were introduced in Linux 2.6.14, so they might be missing if we have
90 #ifndef ADVERTISED_Pause
91 #define ADVERTISED_Pause (1 << 13)
93 #ifndef ADVERTISED_Asym_Pause
94 #define ADVERTISED_Asym_Pause (1 << 14)
97 /* These were introduced in Linux 2.6.24, so they might be missing if we
98 * have old headers. */
99 #ifndef ETHTOOL_GFLAGS
100 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
102 #ifndef ETHTOOL_SFLAGS
103 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
106 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
109 #define TC_RTAB_SIZE 1024
112 /* Linux 2.6.21 introduced struct tpacket_auxdata.
113 * Linux 2.6.27 added the tp_vlan_tci member.
114 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
115 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
116 * TP_STATUS_VLAN_TPID_VALID.
118 * With all this churn it's easiest to unconditionally define a replacement
119 * structure that has everything we want.
121 #ifndef PACKET_AUXDATA
122 #define PACKET_AUXDATA 8
124 #ifndef TP_STATUS_VLAN_VALID
125 #define TP_STATUS_VLAN_VALID (1 << 4)
127 #ifndef TP_STATUS_VLAN_TPID_VALID
128 #define TP_STATUS_VLAN_TPID_VALID (1 << 6)
130 #undef tpacket_auxdata
131 #define tpacket_auxdata rpl_tpacket_auxdata
struct tpacket_auxdata {
    uint32_t tp_status;     /* TP_STATUS_* flags, incl. VLAN validity bits. */
    uint32_t tp_len;        /* Original packet length on the wire. */
    uint32_t tp_snaplen;    /* Captured length. */
    uint16_t tp_mac;        /* Offset of link-layer header. */
    uint16_t tp_net;        /* Offset of network-layer header. */
    uint16_t tp_vlan_tci;   /* VLAN TCI, valid per TP_STATUS_VLAN_VALID. */
    uint16_t tp_vlan_tpid;  /* VLAN TPID, valid per TP_STATUS_VLAN_TPID_VALID. */
};
142 /* Linux 2.6.35 introduced IFLA_STATS64 and rtnl_link_stats64.
144 * Tests for rtnl_link_stats64 don't seem to consistently work, e.g. on
145 * 2.6.32-431.29.2.el6.x86_64 (see report at
146 * http://openvswitch.org/pipermail/dev/2014-October/047978.html). Maybe
147 * if_link.h is not self-contained on those kernels. It is easiest to
148 * unconditionally define a replacement. */
150 #define IFLA_STATS64 23
152 #define rtnl_link_stats64 rpl_rtnl_link_stats64
/* Replacement for the kernel's struct rtnl_link_stats64 (see
 * <linux/if_link.h>): 23 consecutive 64-bit counters, in the exact order the
 * kernel emits them in the IFLA_STATS64 attribute. */
struct rtnl_link_stats64 {
    uint64_t rx_packets;
    uint64_t tx_packets;
    uint64_t rx_bytes;
    uint64_t tx_bytes;
    uint64_t rx_errors;
    uint64_t tx_errors;
    uint64_t rx_dropped;
    uint64_t tx_dropped;
    uint64_t multicast;
    uint64_t collisions;

    /* Detailed rx_errors. */
    uint64_t rx_length_errors;
    uint64_t rx_over_errors;
    uint64_t rx_crc_errors;
    uint64_t rx_frame_errors;
    uint64_t rx_fifo_errors;
    uint64_t rx_missed_errors;

    /* Detailed tx_errors. */
    uint64_t tx_aborted_errors;
    uint64_t tx_carrier_errors;
    uint64_t tx_fifo_errors;
    uint64_t tx_heartbeat_errors;
    uint64_t tx_window_errors;

    /* For cslip etc. */
    uint64_t rx_compressed;
    uint64_t tx_compressed;
};
183 VALID_IFINDEX = 1 << 0,
184 VALID_ETHERADDR = 1 << 1,
188 VALID_POLICING = 1 << 5,
189 VALID_VPORT_STAT_ERROR = 1 << 6,
190 VALID_DRVINFO = 1 << 7,
191 VALID_FEATURES = 1 << 8,
194 /* Traffic control. */
196 /* An instance of a traffic control class. Always associated with a particular
199 * Each TC implementation subclasses this with whatever additional data it
202 const struct tc_ops *ops;
203 struct hmap queues; /* Contains "struct tc_queue"s.
204 * Read by generic TC layer.
205 * Written only by TC implementation. */
208 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
210 /* One traffic control queue.
212 * Each TC implementation subclasses this with whatever additional data it
215 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
216 unsigned int queue_id; /* OpenFlow queue ID. */
217 long long int created; /* Time queue was created, in msecs. */
220 /* A particular kind of traffic control. Each implementation generally maps to
221 * one particular Linux qdisc class.
223 * The functions below return 0 if successful or a positive errno value on
224 * failure, except where otherwise noted. All of them must be provided, except
225 * where otherwise noted. */
227 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
228 * This is null for tc_ops_default and tc_ops_other, for which there are no
229 * appropriate values. */
230 const char *linux_name;
232 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
233 const char *ovs_name;
235 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
236 * queues. The queues are numbered 0 through n_queues - 1. */
237 unsigned int n_queues;
239 /* Called to install this TC class on 'netdev'. The implementation should
240 * make the Netlink calls required to set up 'netdev' with the right qdisc
241 * and configure it according to 'details'. The implementation may assume
242 * that the current qdisc is the default; that is, there is no need for it
243 * to delete the current qdisc before installing itself.
245 * The contents of 'details' should be documented as valid for 'ovs_name'
246 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
247 * (which is built as ovs-vswitchd.conf.db(8)).
249 * This function must return 0 if and only if it sets 'netdev->tc' to an
250 * initialized 'struct tc'.
252 * (This function is null for tc_ops_other, which cannot be installed. For
253 * other TC classes it should always be nonnull.) */
254 int (*tc_install)(struct netdev *netdev, const struct smap *details);
256 /* Called when the netdev code determines (through a Netlink query) that
257 * this TC class's qdisc is installed on 'netdev', but we didn't install
258 * it ourselves and so don't know any of the details.
260 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
261 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
262 * implementation should parse the other attributes of 'nlmsg' as
263 * necessary to determine its configuration. If necessary it should also
264 * use Netlink queries to determine the configuration of queues on
267 * This function must return 0 if and only if it sets 'netdev->tc' to an
268 * initialized 'struct tc'. */
269 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
271 /* Destroys the data structures allocated by the implementation as part of
272 * 'tc'. (This includes destroying 'tc->queues' by calling
275 * The implementation should not need to perform any Netlink calls. If
276 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
277 * (But it may not be desirable.)
279 * This function may be null if 'tc' is trivial. */
280 void (*tc_destroy)(struct tc *tc);
282 /* Retrieves details of 'netdev->tc' configuration into 'details'.
284 * The implementation should not need to perform any Netlink calls, because
285 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
286 * cached the configuration.
288 * The contents of 'details' should be documented as valid for 'ovs_name'
289 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
290 * (which is built as ovs-vswitchd.conf.db(8)).
292 * This function may be null if 'tc' is not configurable.
294 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
296 /* Reconfigures 'netdev->tc' according to 'details', performing any
297 * required Netlink calls to complete the reconfiguration.
299 * The contents of 'details' should be documented as valid for 'ovs_name'
300 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
301 * (which is built as ovs-vswitchd.conf.db(8)).
303 * This function may be null if 'tc' is not configurable.
305 int (*qdisc_set)(struct netdev *, const struct smap *details);
307 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
308 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
310 * The contents of 'details' should be documented as valid for 'ovs_name'
311 * in the "other_config" column in the "Queue" table in
312 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
314 * The implementation should not need to perform any Netlink calls, because
315 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
316 * cached the queue configuration.
318 * This function may be null if 'tc' does not have queues ('n_queues' is
320 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
321 struct smap *details);
323 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
324 * 'details', perfoming any required Netlink calls to complete the
325 * reconfiguration. The caller ensures that 'queue_id' is less than
328 * The contents of 'details' should be documented as valid for 'ovs_name'
329 * in the "other_config" column in the "Queue" table in
330 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
332 * This function may be null if 'tc' does not have queues or its queues are
333 * not configurable. */
334 int (*class_set)(struct netdev *, unsigned int queue_id,
335 const struct smap *details);
337 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
338 * tc_queue's within 'netdev->tc->queues'.
340 * This function may be null if 'tc' does not have queues or its queues
341 * cannot be deleted. */
342 int (*class_delete)(struct netdev *, struct tc_queue *queue);
344 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
345 * 'struct tc_queue's within 'netdev->tc->queues'.
347 * On success, initializes '*stats'.
349 * This function may be null if 'tc' does not have queues or if it cannot
350 * report queue statistics. */
351 int (*class_get_stats)(const struct netdev *netdev,
352 const struct tc_queue *queue,
353 struct netdev_queue_stats *stats);
355 /* Extracts queue stats from 'nlmsg', which is a response to a
356 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
358 * This function may be null if 'tc' does not have queues or if it cannot
359 * report queue statistics. */
360 int (*class_dump_stats)(const struct netdev *netdev,
361 const struct ofpbuf *nlmsg,
362 netdev_dump_queue_stats_cb *cb, void *aux);
366 tc_init(struct tc *tc, const struct tc_ops *ops)
369 hmap_init(&tc->queues);
373 tc_destroy(struct tc *tc)
375 hmap_destroy(&tc->queues);
378 static const struct tc_ops tc_ops_htb;
379 static const struct tc_ops tc_ops_hfsc;
380 static const struct tc_ops tc_ops_codel;
381 static const struct tc_ops tc_ops_fqcodel;
382 static const struct tc_ops tc_ops_sfq;
383 static const struct tc_ops tc_ops_default;
384 static const struct tc_ops tc_ops_other;
386 static const struct tc_ops *const tcs[] = {
387 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
388 &tc_ops_hfsc, /* Hierarchical fair service curve. */
389 &tc_ops_codel, /* Controlled delay */
390 &tc_ops_fqcodel, /* Fair queue controlled delay */
391 &tc_ops_sfq, /* Stochastic fair queueing */
392 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
393 &tc_ops_other, /* Some other qdisc. */
397 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
398 static unsigned int tc_get_major(unsigned int handle);
399 static unsigned int tc_get_minor(unsigned int handle);
401 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
402 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
403 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
405 static struct tcmsg *tc_make_request(const struct netdev *, int type,
406 unsigned int flags, struct ofpbuf *);
407 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
408 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
409 static int tc_add_policer(struct netdev *,
410 uint32_t kbits_rate, uint32_t kbits_burst);
412 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
413 struct nlattr **options);
414 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
415 struct nlattr **options,
416 struct netdev_queue_stats *);
417 static int tc_query_class(const struct netdev *,
418 unsigned int handle, unsigned int parent,
419 struct ofpbuf **replyp);
420 static int tc_delete_class(const struct netdev *, unsigned int handle);
422 static int tc_del_qdisc(struct netdev *netdev);
423 static int tc_query_qdisc(const struct netdev *netdev);
425 static int tc_calc_cell_log(unsigned int mtu);
426 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
427 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
428 const struct tc_ratespec *rate);
429 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
431 struct netdev_linux {
434 /* Protects all members below. */
435 struct ovs_mutex mutex;
437 unsigned int cache_valid;
439 bool miimon; /* Link status of last poll. */
440 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
441 struct timer miimon_timer;
443 /* The following are figured out "on demand" only. They are only valid
444 * when the corresponding VALID_* bit in 'cache_valid' is set. */
446 uint8_t etheraddr[ETH_ADDR_LEN];
447 struct in_addr address, netmask;
450 unsigned int ifi_flags;
451 long long int carrier_resets;
452 uint32_t kbits_rate; /* Policing data. */
453 uint32_t kbits_burst;
454 int vport_stats_error; /* Cached error code from vport_get_stats().
455 0 or an errno value. */
456 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
457 int ether_addr_error; /* Cached error code from set/get etheraddr. */
458 int netdev_policing_error; /* Cached error code from set policing. */
459 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
460 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
462 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
463 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
464 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
466 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
469 /* For devices of class netdev_tap_class only. */
473 struct netdev_rxq_linux {
474 struct netdev_rxq up;
479 /* This is set pretty low because we probably won't learn anything from the
480 * additional log messages. */
481 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
483 /* Polling miimon status for all ports causes performance degradation when
484 * handling a large number of ports. If there are no devices using miimon, then
485 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
487 * Readers do not depend on this variable synchronizing with the related
488 * changes in the device miimon status, so we can use atomic_count. */
489 static atomic_count miimon_cnt = ATOMIC_COUNT_INIT(0);
491 static void netdev_linux_run(void);
493 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
494 int cmd, const char *cmd_name);
495 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
496 int cmd, const char *cmd_name);
497 static int get_flags(const struct netdev *, unsigned int *flags);
498 static int set_flags(const char *, unsigned int flags);
499 static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
500 enum netdev_flags on, enum netdev_flags *old_flagsp)
501 OVS_REQUIRES(netdev->mutex);
502 static int do_get_ifindex(const char *netdev_name);
503 static int get_ifindex(const struct netdev *, int *ifindexp);
504 static int do_set_addr(struct netdev *netdev,
505 int ioctl_nr, const char *ioctl_name,
506 struct in_addr addr);
507 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
508 static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
509 static int get_stats_via_netlink(const struct netdev *, struct netdev_stats *);
510 static int af_packet_sock(void);
511 static bool netdev_linux_miimon_enabled(void);
512 static void netdev_linux_miimon_run(void);
513 static void netdev_linux_miimon_wait(void);
514 static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup);
517 is_netdev_linux_class(const struct netdev_class *netdev_class)
519 return netdev_class->run == netdev_linux_run;
523 is_tap_netdev(const struct netdev *netdev)
525 return netdev_get_class(netdev) == &netdev_tap_class;
528 static struct netdev_linux *
529 netdev_linux_cast(const struct netdev *netdev)
531 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
533 return CONTAINER_OF(netdev, struct netdev_linux, up);
536 static struct netdev_rxq_linux *
537 netdev_rxq_linux_cast(const struct netdev_rxq *rx)
539 ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
540 return CONTAINER_OF(rx, struct netdev_rxq_linux, up);
543 static void netdev_linux_update(struct netdev_linux *netdev,
544 const struct rtnetlink_change *)
545 OVS_REQUIRES(netdev->mutex);
546 static void netdev_linux_changed(struct netdev_linux *netdev,
547 unsigned int ifi_flags, unsigned int mask)
548 OVS_REQUIRES(netdev->mutex);
550 /* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK changes, or NULL
551 * if no such socket could be created. */
552 static struct nl_sock *
553 netdev_linux_notify_sock(void)
555 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
556 static struct nl_sock *sock;
558 if (ovsthread_once_start(&once)) {
561 error = nl_sock_create(NETLINK_ROUTE, &sock);
563 error = nl_sock_join_mcgroup(sock, RTNLGRP_LINK);
565 nl_sock_destroy(sock);
569 ovsthread_once_done(&once);
576 netdev_linux_miimon_enabled(void)
578 return atomic_count_get(&miimon_cnt) > 0;
582 netdev_linux_run(void)
584 struct nl_sock *sock;
587 if (netdev_linux_miimon_enabled()) {
588 netdev_linux_miimon_run();
591 sock = netdev_linux_notify_sock();
597 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
598 uint64_t buf_stub[4096 / 8];
601 ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
602 error = nl_sock_recv(sock, &buf, false);
604 struct rtnetlink_change change;
606 if (rtnetlink_parse(&buf, &change)) {
607 struct netdev *netdev_ = netdev_from_name(change.ifname);
608 if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
609 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
611 ovs_mutex_lock(&netdev->mutex);
612 netdev_linux_update(netdev, &change);
613 ovs_mutex_unlock(&netdev->mutex);
615 netdev_close(netdev_);
617 } else if (error == ENOBUFS) {
618 struct shash device_shash;
619 struct shash_node *node;
623 shash_init(&device_shash);
624 netdev_get_devices(&netdev_linux_class, &device_shash);
625 SHASH_FOR_EACH (node, &device_shash) {
626 struct netdev *netdev_ = node->data;
627 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
630 ovs_mutex_lock(&netdev->mutex);
631 get_flags(netdev_, &flags);
632 netdev_linux_changed(netdev, flags, 0);
633 ovs_mutex_unlock(&netdev->mutex);
635 netdev_close(netdev_);
637 shash_destroy(&device_shash);
638 } else if (error != EAGAIN) {
639 VLOG_WARN_RL(&rl, "error reading or parsing netlink (%s)",
640 ovs_strerror(error));
/* Arranges for the next poll_block() to wake when netdev_linux_run() has work
 * to do: a miimon timer firing or the rtnetlink socket becoming readable. */
static void
netdev_linux_wait(void)
{
    struct nl_sock *sock;

    if (netdev_linux_miimon_enabled()) {
        netdev_linux_miimon_wait();
    }
    sock = netdev_linux_notify_sock();
    if (sock) {
        nl_sock_wait(sock, POLLIN);
    }
}
661 netdev_linux_changed(struct netdev_linux *dev,
662 unsigned int ifi_flags, unsigned int mask)
663 OVS_REQUIRES(dev->mutex)
665 netdev_change_seq_changed(&dev->up);
667 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
668 dev->carrier_resets++;
670 dev->ifi_flags = ifi_flags;
672 dev->cache_valid &= mask;
676 netdev_linux_update(struct netdev_linux *dev,
677 const struct rtnetlink_change *change)
678 OVS_REQUIRES(dev->mutex)
680 if (change->nlmsg_type == RTM_NEWLINK) {
682 netdev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
684 /* Update netdev from rtnl-change msg. */
686 dev->mtu = change->mtu;
687 dev->cache_valid |= VALID_MTU;
688 dev->netdev_mtu_error = 0;
691 if (!eth_addr_is_zero(change->addr)) {
692 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
693 dev->cache_valid |= VALID_ETHERADDR;
694 dev->ether_addr_error = 0;
697 dev->ifindex = change->if_index;
698 dev->cache_valid |= VALID_IFINDEX;
699 dev->get_ifindex_error = 0;
701 netdev_linux_changed(dev, change->ifi_flags, 0);
705 static struct netdev *
706 netdev_linux_alloc(void)
708 struct netdev_linux *netdev = xzalloc(sizeof *netdev);
713 netdev_linux_common_construct(struct netdev_linux *netdev)
715 ovs_mutex_init(&netdev->mutex);
718 /* Creates system and internal devices. */
720 netdev_linux_construct(struct netdev *netdev_)
722 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
725 netdev_linux_common_construct(netdev);
727 error = get_flags(&netdev->up, &netdev->ifi_flags);
728 if (error == ENODEV) {
729 if (netdev->up.netdev_class != &netdev_internal_class) {
730 /* The device does not exist, so don't allow it to be opened. */
733 /* "Internal" netdevs have to be created as netdev objects before
734 * they exist in the kernel, because creating them in the kernel
735 * happens by passing a netdev object to dpif_port_add().
736 * Therefore, ignore the error. */
743 /* For most types of netdevs we open the device for each call of
744 * netdev_open(). However, this is not the case with tap devices,
745 * since it is only possible to open the device once. In this
746 * situation we share a single file descriptor, and consequently
747 * buffers, across all readers. Therefore once data is read it will
748 * be unavailable to other reads for tap devices. */
750 netdev_linux_construct_tap(struct netdev *netdev_)
752 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
753 static const char tap_dev[] = "/dev/net/tun";
754 const char *name = netdev_->name;
758 netdev_linux_common_construct(netdev);
760 /* Open tap device. */
761 netdev->tap_fd = open(tap_dev, O_RDWR);
762 if (netdev->tap_fd < 0) {
764 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
768 /* Create tap device. */
769 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
770 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
771 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
772 VLOG_WARN("%s: creating tap device failed: %s", name,
773 ovs_strerror(errno));
778 /* Make non-blocking. */
779 error = set_nonblocking(netdev->tap_fd);
787 close(netdev->tap_fd);
792 netdev_linux_destruct(struct netdev *netdev_)
794 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
796 if (netdev->tc && netdev->tc->ops->tc_destroy) {
797 netdev->tc->ops->tc_destroy(netdev->tc);
800 if (netdev_get_class(netdev_) == &netdev_tap_class
801 && netdev->tap_fd >= 0)
803 close(netdev->tap_fd);
806 if (netdev->miimon_interval > 0) {
807 atomic_count_dec(&miimon_cnt);
810 ovs_mutex_destroy(&netdev->mutex);
/* Frees the memory allocated by netdev_linux_alloc(). */
static void
netdev_linux_dealloc(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    free(netdev);
}
820 static struct netdev_rxq *
821 netdev_linux_rxq_alloc(void)
823 struct netdev_rxq_linux *rx = xzalloc(sizeof *rx);
828 netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
830 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
831 struct netdev *netdev_ = rx->up.netdev;
832 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
835 ovs_mutex_lock(&netdev->mutex);
836 rx->is_tap = is_tap_netdev(netdev_);
838 rx->fd = netdev->tap_fd;
840 struct sockaddr_ll sll;
842 /* Result of tcpdump -dd inbound */
843 static const struct sock_filter filt[] = {
844 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
845 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
846 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
847 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
849 static const struct sock_fprog fprog = {
850 ARRAY_SIZE(filt), (struct sock_filter *) filt
853 /* Create file descriptor. */
854 rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
857 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
862 if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) {
864 VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
865 netdev_get_name(netdev_), ovs_strerror(error));
869 /* Set non-blocking mode. */
870 error = set_nonblocking(rx->fd);
875 /* Get ethernet device index. */
876 error = get_ifindex(&netdev->up, &ifindex);
881 /* Bind to specific ethernet device. */
882 memset(&sll, 0, sizeof sll);
883 sll.sll_family = AF_PACKET;
884 sll.sll_ifindex = ifindex;
885 sll.sll_protocol = htons(ETH_P_ALL);
886 if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
888 VLOG_ERR("%s: failed to bind raw socket (%s)",
889 netdev_get_name(netdev_), ovs_strerror(error));
893 /* Filter for only inbound packets. */
894 error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
898 VLOG_ERR("%s: failed to attach filter (%s)",
899 netdev_get_name(netdev_), ovs_strerror(error));
903 ovs_mutex_unlock(&netdev->mutex);
911 ovs_mutex_unlock(&netdev->mutex);
916 netdev_linux_rxq_destruct(struct netdev_rxq *rxq_)
918 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
/* Frees the memory allocated by netdev_linux_rxq_alloc(). */
static void
netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_)
{
    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);

    free(rx);
}
934 auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux)
936 if (aux->tp_status & TP_STATUS_VLAN_TPID_VALID) {
937 return htons(aux->tp_vlan_tpid);
939 return htons(ETH_TYPE_VLAN);
944 auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
946 return aux->tp_vlan_tci || aux->tp_status & TP_STATUS_VLAN_VALID;
950 netdev_linux_rxq_recv_sock(int fd, struct dp_packet *buffer)
955 struct cmsghdr *cmsg;
958 char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
962 /* Reserve headroom for a single VLAN tag */
963 dp_packet_reserve(buffer, VLAN_HEADER_LEN);
964 size = dp_packet_tailroom(buffer);
966 iov.iov_base = dp_packet_data(buffer);
968 msgh.msg_name = NULL;
969 msgh.msg_namelen = 0;
972 msgh.msg_control = &cmsg_buffer;
973 msgh.msg_controllen = sizeof cmsg_buffer;
977 retval = recvmsg(fd, &msgh, MSG_TRUNC);
978 } while (retval < 0 && errno == EINTR);
982 } else if (retval > size) {
986 dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);
988 for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg; cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
989 const struct tpacket_auxdata *aux;
991 if (cmsg->cmsg_level != SOL_PACKET
992 || cmsg->cmsg_type != PACKET_AUXDATA
993 || cmsg->cmsg_len < CMSG_LEN(sizeof(struct tpacket_auxdata))) {
997 aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg));
998 if (auxdata_has_vlan_tci(aux)) {
999 if (retval < ETH_HEADER_LEN) {
1003 eth_push_vlan(buffer, auxdata_to_vlan_tpid(aux),
1004 htons(aux->tp_vlan_tci));
/* Receives one packet from tap fd 'fd' into 'buffer'.  Returns 0 on success,
 * EMSGSIZE if the packet exceeded the buffer's tailroom, or another positive
 * errno value. */
static int
netdev_linux_rxq_recv_tap(int fd, struct dp_packet *buffer)
{
    ssize_t retval;
    size_t size = dp_packet_tailroom(buffer);

    do {
        retval = read(fd, dp_packet_data(buffer), size);
    } while (retval < 0 && errno == EINTR);

    if (retval < 0) {
        return errno;
    } else if (retval > size) {
        return EMSGSIZE;
    }

    dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);
    return 0;
}
1033 netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet **packets,
1036 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1037 struct netdev *netdev = rx->up.netdev;
1038 struct dp_packet *buffer;
1042 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
1043 mtu = ETH_PAYLOAD_MAX;
1046 buffer = dp_packet_new_with_headroom(VLAN_ETH_HEADER_LEN + mtu,
1047 DP_NETDEV_HEADROOM);
1048 retval = (rx->is_tap
1049 ? netdev_linux_rxq_recv_tap(rx->fd, buffer)
1050 : netdev_linux_rxq_recv_sock(rx->fd, buffer));
1053 if (retval != EAGAIN && retval != EMSGSIZE) {
1054 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
1055 ovs_strerror(errno), netdev_rxq_get_name(rxq_));
1057 dp_packet_delete(buffer);
1059 dp_packet_pad(buffer);
1060 dp_packet_set_rss_hash(buffer, 0);
1061 packets[0] = buffer;
1069 netdev_linux_rxq_wait(struct netdev_rxq *rxq_)
1071 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1072 poll_fd_wait(rx->fd, POLLIN);
1076 netdev_linux_rxq_drain(struct netdev_rxq *rxq_)
1078 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1081 int error = af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_), &ifr,
1082 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
1086 drain_fd(rx->fd, ifr.ifr_qlen);
1089 return drain_rcvbuf(rx->fd);
1093 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
1094 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
1095 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
1096 * the packet is too big or too small to transmit on the device.
1098 * The caller retains ownership of 'buffer' in all cases.
1100 * The kernel maintains a packet transmission queue, so the caller is not
1101 * expected to do additional queuing of packets. */
/* Sends the 'cnt' packets in 'pkts' on 'netdev_'.  Non-tap devices go out
 * through a shared AF_PACKET raw socket (sendmsg); tap devices are written
 * directly through the device's tap fd.
 * NOTE(review): this extract is missing several original lines (the
 * declarations of 'i'/'error'/'sock'/'ifindex', the iovec/msghdr setup, the
 * error checks after af_packet_sock()/netdev_get_ifindex(), and closing
 * braces).  Visible code kept byte-identical; confirm against upstream. */
1103 netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED,
1104 struct dp_packet **pkts, int cnt, bool may_steal)
1109 /* 'i' is incremented only if there's no error */
1110 for (i = 0; i < cnt;) {
1111 const void *data = dp_packet_data(pkts[i]);
1112 size_t size = dp_packet_size(pkts[i]);
1115 if (!is_tap_netdev(netdev_)) {
1116 /* Use our AF_PACKET socket to send to this device. */
1117 struct sockaddr_ll sll;
1123 sock = af_packet_sock();
1128 ifindex = netdev_get_ifindex(netdev_);
1133 /* We don't bother setting most fields in sockaddr_ll because the
1134 * kernel ignores them for SOCK_RAW. */
1135 memset(&sll, 0, sizeof sll);
1136 sll.sll_family = AF_PACKET;
1137 sll.sll_ifindex = ifindex;
1139 iov.iov_base = CONST_CAST(void *, data);
1142 msg.msg_name = &sll;
1143 msg.msg_namelen = sizeof sll;
1146 msg.msg_control = NULL;
1147 msg.msg_controllen = 0;
1150 retval = sendmsg(sock, &msg, 0);
/* Tap path: write() to the tap fd.  Sending via AF_PACKET would loop the
 * packet back as a receive on the same tap device. */
1152 /* Use the tap fd to send to this device. This is essential for
1153 * tap devices, because packets sent to a tap device with an
1154 * AF_PACKET socket will loop back to be *received* again on the
1155 * tap device. This doesn't occur on other interface types
1156 * because we attach a socket filter to the rx socket. */
1157 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1159 retval = write(netdev->tap_fd, data, size);
1163 /* The Linux AF_PACKET implementation never blocks waiting for room
1164 * for packets, instead returning ENOBUFS. Translate this into
1165 * EAGAIN for the caller. */
1166 error = errno == ENOBUFS ? EAGAIN : errno;
1167 if (error == EINTR) {
1168 /* continue without incrementing 'i', i.e. retry this packet */
/* A short write is logged but the packet is not retried. */
1172 } else if (retval != size) {
1173 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE" bytes"
1174 " of %"PRIuSIZE") on %s", retval, size,
1175 netdev_get_name(netdev_));
1180 /* Process the next packet in the batch */
/* With 'may_steal', ownership of the packets passed to us: free them all. */
1185 for (i = 0; i < cnt; i++) {
1186 dp_packet_delete(pkts[i]);
1190 if (error && error != EAGAIN) {
1191 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
1192 netdev_get_name(netdev_), ovs_strerror(error));
1199 /* Registers with the poll loop to wake up from the next call to poll_block()
1200 * when the packet transmission queue has sufficient room to transmit a packet
1201 * with netdev_send().
1203 * The kernel maintains a packet transmission queue, so the client is not
1204 * expected to do additional queuing of packets. Thus, this function is
1205 * unlikely to ever be used. It is included for completeness. */
1207 netdev_linux_send_wait(struct netdev *netdev, int qid OVS_UNUSED)
1209 if (is_tap_netdev(netdev)) {
1210 /* TAP device always accepts packets.*/
1211 poll_immediate_wake();
1215 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1216 * otherwise a positive errno value. */
1218 netdev_linux_set_etheraddr(struct netdev *netdev_,
1219 const uint8_t mac[ETH_ADDR_LEN])
1221 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1222 enum netdev_flags old_flags = 0;
1225 ovs_mutex_lock(&netdev->mutex);
1227 if (netdev->cache_valid & VALID_ETHERADDR) {
1228 error = netdev->ether_addr_error;
1229 if (error || eth_addr_equals(netdev->etheraddr, mac)) {
1232 netdev->cache_valid &= ~VALID_ETHERADDR;
1235 /* Tap devices must be brought down before setting the address. */
1236 if (is_tap_netdev(netdev_)) {
1237 update_flags(netdev, NETDEV_UP, 0, &old_flags);
1239 error = set_etheraddr(netdev_get_name(netdev_), mac);
1240 if (!error || error == ENODEV) {
1241 netdev->ether_addr_error = error;
1242 netdev->cache_valid |= VALID_ETHERADDR;
1244 memcpy(netdev->etheraddr, mac, ETH_ADDR_LEN);
1248 if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
1249 update_flags(netdev, 0, NETDEV_UP, &old_flags);
1253 ovs_mutex_unlock(&netdev->mutex);
1257 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1259 netdev_linux_get_etheraddr(const struct netdev *netdev_,
1260 uint8_t mac[ETH_ADDR_LEN])
1262 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1265 ovs_mutex_lock(&netdev->mutex);
1266 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1267 netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
1269 netdev->cache_valid |= VALID_ETHERADDR;
1272 error = netdev->ether_addr_error;
1274 memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN);
1276 ovs_mutex_unlock(&netdev->mutex);
1282 netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
1286 if (!(netdev->cache_valid & VALID_MTU)) {
1289 netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
1290 netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
1291 netdev->mtu = ifr.ifr_mtu;
1292 netdev->cache_valid |= VALID_MTU;
1295 error = netdev->netdev_mtu_error;
1297 *mtup = netdev->mtu;
1303 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1304 * in bytes, not including the hardware header; thus, this is typically 1500
1305 * bytes for Ethernet devices. */
1307 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1309 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1312 ovs_mutex_lock(&netdev->mutex);
1313 error = netdev_linux_get_mtu__(netdev, mtup);
1314 ovs_mutex_unlock(&netdev->mutex);
1319 /* Sets the maximum size of transmitted (MTU) for given device using linux
1320 * networking ioctl interface.
1323 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1325 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1329 ovs_mutex_lock(&netdev->mutex);
1330 if (netdev->cache_valid & VALID_MTU) {
1331 error = netdev->netdev_mtu_error;
1332 if (error || netdev->mtu == mtu) {
1335 netdev->cache_valid &= ~VALID_MTU;
1338 error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
1339 SIOCSIFMTU, "SIOCSIFMTU");
1340 if (!error || error == ENODEV) {
1341 netdev->netdev_mtu_error = error;
1342 netdev->mtu = ifr.ifr_mtu;
1343 netdev->cache_valid |= VALID_MTU;
1346 ovs_mutex_unlock(&netdev->mutex);
1350 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1351 * On failure, returns a negative errno value. */
1353 netdev_linux_get_ifindex(const struct netdev *netdev_)
1355 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1358 ovs_mutex_lock(&netdev->mutex);
1359 error = get_ifindex(netdev_, &ifindex);
1360 ovs_mutex_unlock(&netdev->mutex);
1362 return error ? -error : ifindex;
1366 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1368 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1370 ovs_mutex_lock(&netdev->mutex);
1371 if (netdev->miimon_interval > 0) {
1372 *carrier = netdev->miimon;
1374 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1376 ovs_mutex_unlock(&netdev->mutex);
1381 static long long int
1382 netdev_linux_get_carrier_resets(const struct netdev *netdev_)
1384 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1385 long long int carrier_resets;
1387 ovs_mutex_lock(&netdev->mutex);
1388 carrier_resets = netdev->carrier_resets;
1389 ovs_mutex_unlock(&netdev->mutex);
1391 return carrier_resets;
1395 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1396 struct mii_ioctl_data *data)
1401 memset(&ifr, 0, sizeof ifr);
1402 memcpy(&ifr.ifr_data, data, sizeof *data);
1403 error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
1404 memcpy(data, &ifr.ifr_data, sizeof *data);
/* Queries link status for device 'name' into '*miimon', first via the MII
 * registers (SIOCGMIIPHY + SIOCGMIIREG/BMSR) and, if that fails, via
 * ETHTOOL_GLINK.
 * NOTE(review): several original lines are missing from this extract (the
 * 'int error' declaration, the if/else structure joining the two probe
 * paths, and the final return).  Visible code kept byte-identical. */
1410 netdev_linux_get_miimon(const char *name, bool *miimon)
1412 struct mii_ioctl_data data;
1417 memset(&data, 0, sizeof data);
/* Step 1: find the PHY address. */
1418 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1420 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1421 data.reg_num = MII_BMSR;
1422 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
/* BMSR_LSTATUS is the link-up bit of the basic mode status register. */
1426 *miimon = !!(data.val_out & BMSR_LSTATUS);
1428 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
/* Fallback path: drivers without MII support may still answer ethtool. */
1431 struct ethtool_cmd ecmd;
1433 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1436 COVERAGE_INC(netdev_get_ethtool);
1437 memset(&ecmd, 0, sizeof ecmd);
1438 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
/* ETHTOOL_GLINK answers in a struct ethtool_value overlaid on ecmd. */
1441 struct ethtool_value eval;
1443 memcpy(&eval, &ecmd, sizeof eval);
1444 *miimon = !!eval.data;
1446 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1454 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1455 long long int interval)
1457 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1459 ovs_mutex_lock(&netdev->mutex);
1460 interval = interval > 0 ? MAX(interval, 100) : 0;
1461 if (netdev->miimon_interval != interval) {
1462 if (interval && !netdev->miimon_interval) {
1463 atomic_count_inc(&miimon_cnt);
1464 } else if (!interval && netdev->miimon_interval) {
1465 atomic_count_dec(&miimon_cnt);
1468 netdev->miimon_interval = interval;
1469 timer_set_expired(&netdev->miimon_timer);
1471 ovs_mutex_unlock(&netdev->mutex);
1477 netdev_linux_miimon_run(void)
1479 struct shash device_shash;
1480 struct shash_node *node;
1482 shash_init(&device_shash);
1483 netdev_get_devices(&netdev_linux_class, &device_shash);
1484 SHASH_FOR_EACH (node, &device_shash) {
1485 struct netdev *netdev = node->data;
1486 struct netdev_linux *dev = netdev_linux_cast(netdev);
1489 ovs_mutex_lock(&dev->mutex);
1490 if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
1491 netdev_linux_get_miimon(dev->up.name, &miimon);
1492 if (miimon != dev->miimon) {
1493 dev->miimon = miimon;
1494 netdev_linux_changed(dev, dev->ifi_flags, 0);
1497 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1499 ovs_mutex_unlock(&dev->mutex);
1500 netdev_close(netdev);
1503 shash_destroy(&device_shash);
1507 netdev_linux_miimon_wait(void)
1509 struct shash device_shash;
1510 struct shash_node *node;
1512 shash_init(&device_shash);
1513 netdev_get_devices(&netdev_linux_class, &device_shash);
1514 SHASH_FOR_EACH (node, &device_shash) {
1515 struct netdev *netdev = node->data;
1516 struct netdev_linux *dev = netdev_linux_cast(netdev);
1518 ovs_mutex_lock(&dev->mutex);
1519 if (dev->miimon_interval > 0) {
1520 timer_wait(&dev->miimon_timer);
1522 ovs_mutex_unlock(&dev->mutex);
1523 netdev_close(netdev);
1525 shash_destroy(&device_shash);
/* Exchanges the values of '*a' and '*b'. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    uint64_t tmp = *a;
    *a = *b;
    *b = tmp;
}
1536 /* Copies 'src' into 'dst', performing format conversion in the process.
1538 * 'src' is allowed to be misaligned. */
/* ovs_vport_stats carries only the eight basic rx/tx counters, read here
 * with get_32aligned_u64 because 'src' may be only 32-bit aligned; every
 * counter netdev_stats has beyond those is zeroed.
 * NOTE(review): a line between the dropped-counter reads and the zeroing
 * block is missing from this extract — presumably "dst->multicast = 0;".
 * Confirm against upstream. */
1540 netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1541 const struct ovs_vport_stats *src)
1543 dst->rx_packets = get_32aligned_u64(&src->rx_packets);
1544 dst->tx_packets = get_32aligned_u64(&src->tx_packets);
1545 dst->rx_bytes = get_32aligned_u64(&src->rx_bytes);
1546 dst->tx_bytes = get_32aligned_u64(&src->tx_bytes);
1547 dst->rx_errors = get_32aligned_u64(&src->rx_errors);
1548 dst->tx_errors = get_32aligned_u64(&src->tx_errors);
1549 dst->rx_dropped = get_32aligned_u64(&src->rx_dropped);
1550 dst->tx_dropped = get_32aligned_u64(&src->tx_dropped);
1552 dst->collisions = 0;
1553 dst->rx_length_errors = 0;
1554 dst->rx_over_errors = 0;
1555 dst->rx_crc_errors = 0;
1556 dst->rx_frame_errors = 0;
1557 dst->rx_fifo_errors = 0;
1558 dst->rx_missed_errors = 0;
1559 dst->tx_aborted_errors = 0;
1560 dst->tx_carrier_errors = 0;
1561 dst->tx_fifo_errors = 0;
1562 dst->tx_heartbeat_errors = 0;
1563 dst->tx_window_errors = 0;
/* Fetches datapath vport stats for 'netdev' into '*stats' via a netlink
 * vport-get request.
 * NOTE(review): the error returns (on lookup failure and on a reply without
 * stats), the ofpbuf declaration/free, and the final return are missing from
 * this extract.  Visible code kept byte-identical. */
1567 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1569 struct dpif_netlink_vport reply;
1573 error = dpif_netlink_vport_get(netdev_get_name(netdev), &reply, &buf);
/* A vport can exist yet report no stats; treat that as an error case. */
1576 } else if (!reply.stats) {
1581 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
/* Best-effort wrapper around get_stats_via_vport__(): retries until a
 * definitive (cached) error is known, caching the outcome in
 * 'netdev->vport_stats_error'.  ENOENT/ENODEV are expected for devices that
 * are not datapath vports and are not logged. */
1589 get_stats_via_vport(const struct netdev *netdev_,
1590 struct netdev_stats *stats)
1592 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* Re-query when the last attempt succeeded or no attempt is cached yet. */
1594 if (!netdev->vport_stats_error ||
1595 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1598 error = get_stats_via_vport__(netdev_, stats);
1599 if (error && error != ENOENT && error != ENODEV) {
1600 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1602 netdev_get_name(netdev_), ovs_strerror(error));
1604 netdev->vport_stats_error = error;
1605 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
1609 /* Retrieves current device stats for 'netdev-linux'. */
/* Merges two stat sources: datapath vport counters (into '*stats') and
 * kernel netdev counters via netlink ('dev_stats').  When both are
 * available, kernel packet/byte counts replace the vport ones (they reflect
 * on-the-wire counts under GSO/TSO/GRO) and error/drop counters are summed.
 * NOTE(review): this extract is missing lines inside the if/else chain
 * (e.g. the 'goto out' / memcpy on the netlink-error path and the 'out:'
 * label and return).  Visible code kept byte-identical. */
1611 netdev_linux_get_stats(const struct netdev *netdev_,
1612 struct netdev_stats *stats)
1614 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1615 struct netdev_stats dev_stats;
1618 ovs_mutex_lock(&netdev->mutex);
1619 get_stats_via_vport(netdev_, stats);
1620 error = get_stats_via_netlink(netdev_, &dev_stats);
/* Netlink query failed: fall back to whatever the vport gave us. */
1622 if (!netdev->vport_stats_error) {
1625 } else if (netdev->vport_stats_error) {
1626 /* stats not available from OVS then use netdev stats. */
1629 /* Use kernel netdev's packet and byte counts since vport's counters
1630 * do not reflect packet counts on the wire when GSO, TSO or GRO are
1632 stats->rx_packets = dev_stats.rx_packets;
1633 stats->rx_bytes = dev_stats.rx_bytes;
1634 stats->tx_packets = dev_stats.tx_packets;
1635 stats->tx_bytes = dev_stats.tx_bytes;
/* Error and drop counters accumulate across both sources. */
1637 stats->rx_errors += dev_stats.rx_errors;
1638 stats->tx_errors += dev_stats.tx_errors;
1639 stats->rx_dropped += dev_stats.rx_dropped;
1640 stats->tx_dropped += dev_stats.tx_dropped;
1641 stats->multicast += dev_stats.multicast;
1642 stats->collisions += dev_stats.collisions;
1643 stats->rx_length_errors += dev_stats.rx_length_errors;
1644 stats->rx_over_errors += dev_stats.rx_over_errors;
1645 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1646 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1647 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1648 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1649 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1650 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1651 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1652 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1653 stats->tx_window_errors += dev_stats.tx_window_errors;
1655 ovs_mutex_unlock(&netdev->mutex);
1660 /* Retrieves current device stats for 'netdev-tap' netdev or
1661 * netdev-internal. */
/* Like netdev_linux_get_stats() but, because a tap/internal device's kernel
 * counters are from the host's perspective while the switch sends the
 * traffic, rx and tx are swapped when kernel stats are used.
 * NOTE(review): lines inside the if/else chain (the netlink-error fallback
 * and the 'out:' label / return) are missing from this extract.  Visible
 * code kept byte-identical. */
1663 netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
1665 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1666 struct netdev_stats dev_stats;
1669 ovs_mutex_lock(&netdev->mutex);
1670 get_stats_via_vport(netdev_, stats);
1671 error = get_stats_via_netlink(netdev_, &dev_stats);
1673 if (!netdev->vport_stats_error) {
1676 } else if (netdev->vport_stats_error) {
1677 /* Transmit and receive stats will appear to be swapped relative to the
1678 * other ports since we are the one sending the data, not a remote
1679 * computer. For consistency, we swap them back here. This does not
1680 * apply if we are getting stats from the vport layer because it always
1681 * tracks stats from the perspective of the switch. */
1684 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1685 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1686 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1687 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
/* Fine-grained error counters are meaningless after the swap: zero them. */
1688 stats->rx_length_errors = 0;
1689 stats->rx_over_errors = 0;
1690 stats->rx_crc_errors = 0;
1691 stats->rx_frame_errors = 0;
1692 stats->rx_fifo_errors = 0;
1693 stats->rx_missed_errors = 0;
1694 stats->tx_aborted_errors = 0;
1695 stats->tx_carrier_errors = 0;
1696 stats->tx_fifo_errors = 0;
1697 stats->tx_heartbeat_errors = 0;
1698 stats->tx_window_errors = 0;
1700 /* Use kernel netdev's packet and byte counts since vport counters
1701 * do not reflect packet counts on the wire when GSO, TSO or GRO
/* Note the deliberate rx<->tx crossover when copying kernel counters. */
1703 stats->rx_packets = dev_stats.tx_packets;
1704 stats->rx_bytes = dev_stats.tx_bytes;
1705 stats->tx_packets = dev_stats.rx_packets;
1706 stats->tx_bytes = dev_stats.rx_bytes;
1708 stats->rx_dropped += dev_stats.tx_dropped;
1709 stats->tx_dropped += dev_stats.rx_dropped;
1711 stats->rx_errors += dev_stats.tx_errors;
1712 stats->tx_errors += dev_stats.rx_errors;
1714 stats->multicast += dev_stats.multicast;
1715 stats->collisions += dev_stats.collisions;
1717 ovs_mutex_unlock(&netdev->mutex);
1723 netdev_internal_get_stats(const struct netdev *netdev_,
1724 struct netdev_stats *stats)
1726 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1729 ovs_mutex_lock(&netdev->mutex);
1730 get_stats_via_vport(netdev_, stats);
1731 error = netdev->vport_stats_error;
1732 ovs_mutex_unlock(&netdev->mutex);
/* Queries 'netdev''s link features via ETHTOOL_GSET and caches the
 * supported, advertised, and current NETDEV_F_* bitmaps.  No-op if the
 * cache is already valid; stores any error in 'get_features_error'.
 * NOTE(review): this extract is missing a few lines — the early return on
 * ethtool failure, the line reading the link speed into 'speed' (presumably
 * via ecmd.speed / ethtool_cmd_speed), and the autoneg test before line
 * "1861".  Visible code kept byte-identical. */
1738 netdev_linux_read_features(struct netdev_linux *netdev)
1740 struct ethtool_cmd ecmd;
1744 if (netdev->cache_valid & VALID_FEATURES) {
1748 COVERAGE_INC(netdev_get_ethtool);
1749 memset(&ecmd, 0, sizeof ecmd);
1750 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
1751 ETHTOOL_GSET, "ETHTOOL_GSET");
1756 /* Supported features. */
1757 netdev->supported = 0;
1758 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1759 netdev->supported |= NETDEV_F_10MB_HD;
1761 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1762 netdev->supported |= NETDEV_F_10MB_FD;
1764 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1765 netdev->supported |= NETDEV_F_100MB_HD;
1767 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1768 netdev->supported |= NETDEV_F_100MB_FD;
1770 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1771 netdev->supported |= NETDEV_F_1GB_HD;
1773 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1774 netdev->supported |= NETDEV_F_1GB_FD;
1776 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1777 netdev->supported |= NETDEV_F_10GB_FD;
1779 if (ecmd.supported & SUPPORTED_TP) {
1780 netdev->supported |= NETDEV_F_COPPER;
1782 if (ecmd.supported & SUPPORTED_FIBRE) {
1783 netdev->supported |= NETDEV_F_FIBER;
1785 if (ecmd.supported & SUPPORTED_Autoneg) {
1786 netdev->supported |= NETDEV_F_AUTONEG;
1788 if (ecmd.supported & SUPPORTED_Pause) {
1789 netdev->supported |= NETDEV_F_PAUSE;
1791 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1792 netdev->supported |= NETDEV_F_PAUSE_ASYM;
1795 /* Advertised features. */
1796 netdev->advertised = 0;
1797 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1798 netdev->advertised |= NETDEV_F_10MB_HD;
1800 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1801 netdev->advertised |= NETDEV_F_10MB_FD;
1803 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1804 netdev->advertised |= NETDEV_F_100MB_HD;
1806 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1807 netdev->advertised |= NETDEV_F_100MB_FD;
1809 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1810 netdev->advertised |= NETDEV_F_1GB_HD;
1812 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1813 netdev->advertised |= NETDEV_F_1GB_FD;
1815 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1816 netdev->advertised |= NETDEV_F_10GB_FD;
1818 if (ecmd.advertising & ADVERTISED_TP) {
1819 netdev->advertised |= NETDEV_F_COPPER;
1821 if (ecmd.advertising & ADVERTISED_FIBRE) {
1822 netdev->advertised |= NETDEV_F_FIBER;
1824 if (ecmd.advertising & ADVERTISED_Autoneg) {
1825 netdev->advertised |= NETDEV_F_AUTONEG;
1827 if (ecmd.advertising & ADVERTISED_Pause) {
1828 netdev->advertised |= NETDEV_F_PAUSE;
1830 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1831 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
1834 /* Current settings. */
1836 if (speed == SPEED_10) {
1837 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1838 } else if (speed == SPEED_100) {
1839 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1840 } else if (speed == SPEED_1000) {
1841 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1842 } else if (speed == SPEED_10000) {
1843 netdev->current = NETDEV_F_10GB_FD;
/* 40G/100G/1T speeds predate SPEED_* macros in older kernel headers,
 * hence the raw Mb/s literals. */
1844 } else if (speed == 40000) {
1845 netdev->current = NETDEV_F_40GB_FD;
1846 } else if (speed == 100000) {
1847 netdev->current = NETDEV_F_100GB_FD;
1848 } else if (speed == 1000000) {
1849 netdev->current = NETDEV_F_1TB_FD;
1851 netdev->current = 0;
1854 if (ecmd.port == PORT_TP) {
1855 netdev->current |= NETDEV_F_COPPER;
1856 } else if (ecmd.port == PORT_FIBRE) {
1857 netdev->current |= NETDEV_F_FIBER;
1861 netdev->current |= NETDEV_F_AUTONEG;
/* Cache outcome even on error so repeat calls are cheap. */
1865 netdev->cache_valid |= VALID_FEATURES;
1866 netdev->get_features_error = error;
1869 /* Stores the features supported by 'netdev' into of '*current', '*advertised',
1870 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
1871 * Returns 0 if successful, otherwise a positive errno value. */
1873 netdev_linux_get_features(const struct netdev *netdev_,
1874 enum netdev_features *current,
1875 enum netdev_features *advertised,
1876 enum netdev_features *supported,
1877 enum netdev_features *peer)
1879 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1882 ovs_mutex_lock(&netdev->mutex);
1883 netdev_linux_read_features(netdev);
1884 if (!netdev->get_features_error) {
1885 *current = netdev->current;
1886 *advertised = netdev->advertised;
1887 *supported = netdev->supported;
1888 *peer = 0; /* XXX */
1890 error = netdev->get_features_error;
1891 ovs_mutex_unlock(&netdev->mutex);
1896 /* Set the features advertised by 'netdev' to 'advertise'. */
/* Reads the current ethtool settings, rewrites only the 'advertising'
 * bitmap from the NETDEV_F_* flags in 'advertise', and writes the settings
 * back with ETHTOOL_SSET.
 * NOTE(review): the early exit after a failed ETHTOOL_GSET (presumably a
 * 'goto exit' plus 'exit:' label) and the final return are missing from
 * this extract.  Visible code kept byte-identical. */
1898 netdev_linux_set_advertisements(struct netdev *netdev_,
1899 enum netdev_features advertise)
1901 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1902 struct ethtool_cmd ecmd;
1905 ovs_mutex_lock(&netdev->mutex);
1907 COVERAGE_INC(netdev_get_ethtool);
1908 memset(&ecmd, 0, sizeof ecmd);
/* Fetch current settings first so fields we don't manage are preserved. */
1909 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1910 ETHTOOL_GSET, "ETHTOOL_GSET");
1915 ecmd.advertising = 0;
1916 if (advertise & NETDEV_F_10MB_HD) {
1917 ecmd.advertising |= ADVERTISED_10baseT_Half;
1919 if (advertise & NETDEV_F_10MB_FD) {
1920 ecmd.advertising |= ADVERTISED_10baseT_Full;
1922 if (advertise & NETDEV_F_100MB_HD) {
1923 ecmd.advertising |= ADVERTISED_100baseT_Half;
1925 if (advertise & NETDEV_F_100MB_FD) {
1926 ecmd.advertising |= ADVERTISED_100baseT_Full;
1928 if (advertise & NETDEV_F_1GB_HD) {
1929 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1931 if (advertise & NETDEV_F_1GB_FD) {
1932 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1934 if (advertise & NETDEV_F_10GB_FD) {
1935 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1937 if (advertise & NETDEV_F_COPPER) {
1938 ecmd.advertising |= ADVERTISED_TP;
1940 if (advertise & NETDEV_F_FIBER) {
1941 ecmd.advertising |= ADVERTISED_FIBRE;
1943 if (advertise & NETDEV_F_AUTONEG) {
1944 ecmd.advertising |= ADVERTISED_Autoneg;
1946 if (advertise & NETDEV_F_PAUSE) {
1947 ecmd.advertising |= ADVERTISED_Pause;
1949 if (advertise & NETDEV_F_PAUSE_ASYM) {
1950 ecmd.advertising |= ADVERTISED_Asym_Pause;
1952 COVERAGE_INC(netdev_set_ethtool);
1953 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1954 ETHTOOL_SSET, "ETHTOOL_SSET");
1957 ovs_mutex_unlock(&netdev->mutex);
1961 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1962 * successful, otherwise a positive errno value. */
/* Implements ingress policing by removing any existing ingress qdisc, then
 * (if a rate was requested) re-adding the ingress qdisc plus a tc policer.
 * The last applied (rate, burst) pair and error are cached so a repeat call
 * with identical parameters is a no-op.
 * NOTE(review): this extract is missing the 'goto out' statements after
 * each failure log, the kbits_rate != 0 guard around the add path, and the
 * 'out:' label / final return.  Visible code kept byte-identical. */
1964 netdev_linux_set_policing(struct netdev *netdev_,
1965 uint32_t kbits_rate, uint32_t kbits_burst)
1967 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1968 const char *netdev_name = netdev_get_name(netdev_);
1971 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1972 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1973 : kbits_burst); /* Stick with user-specified value. */
1975 ovs_mutex_lock(&netdev->mutex);
1976 if (netdev->cache_valid & VALID_POLICING) {
1977 error = netdev->netdev_policing_error;
1978 if (error || (netdev->kbits_rate == kbits_rate &&
1979 netdev->kbits_burst == kbits_burst)) {
1980 /* Assume that settings haven't changed since we last set them. */
1983 netdev->cache_valid &= ~VALID_POLICING;
1986 COVERAGE_INC(netdev_set_policing);
1987 /* Remove any existing ingress qdisc. */
1988 error = tc_add_del_ingress_qdisc(netdev_, false);
1990 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1991 netdev_name, ovs_strerror(error));
/* Re-add the ingress qdisc, then attach the policer to it. */
1996 error = tc_add_del_ingress_qdisc(netdev_, true);
1998 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1999 netdev_name, ovs_strerror(error));
2003 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
2005 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
2006 netdev_name, ovs_strerror(error));
2011 netdev->kbits_rate = kbits_rate;
2012 netdev->kbits_burst = kbits_burst;
/* Cache outcome (including a persistent ENODEV failure). */
2015 if (!error || error == ENODEV) {
2016 netdev->netdev_policing_error = error;
2017 netdev->cache_valid |= VALID_POLICING;
2019 ovs_mutex_unlock(&netdev->mutex);
2024 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
2027 const struct tc_ops *const *opsp;
2029 for (opsp = tcs; *opsp != NULL; opsp++) {
2030 const struct tc_ops *ops = *opsp;
2031 if (ops->tc_install && ops->ovs_name[0] != '\0') {
2032 sset_add(types, ops->ovs_name);
2038 static const struct tc_ops *
2039 tc_lookup_ovs_name(const char *name)
2041 const struct tc_ops *const *opsp;
2043 for (opsp = tcs; *opsp != NULL; opsp++) {
2044 const struct tc_ops *ops = *opsp;
2045 if (!strcmp(name, ops->ovs_name)) {
2052 static const struct tc_ops *
2053 tc_lookup_linux_name(const char *name)
2055 const struct tc_ops *const *opsp;
2057 for (opsp = tcs; *opsp != NULL; opsp++) {
2058 const struct tc_ops *ops = *opsp;
2059 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
2066 static struct tc_queue *
2067 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
2070 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2071 struct tc_queue *queue;
2073 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
2074 if (queue->queue_id == queue_id) {
/* Convenience wrapper around tc_find_queue__() that hashes 'queue_id'. */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
}
2088 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
2090 struct netdev_qos_capabilities *caps)
2092 const struct tc_ops *ops = tc_lookup_ovs_name(type);
2096 caps->n_queues = ops->n_queues;
2101 netdev_linux_get_qos(const struct netdev *netdev_,
2102 const char **typep, struct smap *details)
2104 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2107 ovs_mutex_lock(&netdev->mutex);
2108 error = tc_query_qdisc(netdev_);
2110 *typep = netdev->tc->ops->ovs_name;
2111 error = (netdev->tc->ops->qdisc_get
2112 ? netdev->tc->ops->qdisc_get(netdev_, details)
2115 ovs_mutex_unlock(&netdev->mutex);
/* Replaces 'netdev_''s QoS configuration with discipline 'type' configured
 * from 'details'.  If the same discipline is already installed it is simply
 * reconfigured; otherwise the old qdisc is deleted and the new one
 * installed.
 * NOTE(review): missing from this extract are the error checks between the
 * steps (return after a failed tc_query_qdisc / tc_del_qdisc) and the final
 * return.  Visible code kept byte-identical. */
2121 netdev_linux_set_qos(struct netdev *netdev_,
2122 const char *type, const struct smap *details)
2124 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2125 const struct tc_ops *new_ops;
2128 new_ops = tc_lookup_ovs_name(type);
2129 if (!new_ops || !new_ops->tc_install) {
2133 ovs_mutex_lock(&netdev->mutex);
2134 error = tc_query_qdisc(netdev_);
/* Same discipline: just reconfigure in place if supported. */
2139 if (new_ops == netdev->tc->ops) {
2140 error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
2142 /* Delete existing qdisc. */
2143 error = tc_del_qdisc(netdev_);
2147 ovs_assert(netdev->tc == NULL);
2149 /* Install new qdisc. */
2150 error = new_ops->tc_install(netdev_, details);
2151 ovs_assert((error == 0) == (netdev->tc != NULL));
2155 ovs_mutex_unlock(&netdev->mutex);
2160 netdev_linux_get_queue(const struct netdev *netdev_,
2161 unsigned int queue_id, struct smap *details)
2163 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2166 ovs_mutex_lock(&netdev->mutex);
2167 error = tc_query_qdisc(netdev_);
2169 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2171 ? netdev->tc->ops->class_get(netdev_, queue, details)
2174 ovs_mutex_unlock(&netdev->mutex);
2180 netdev_linux_set_queue(struct netdev *netdev_,
2181 unsigned int queue_id, const struct smap *details)
2183 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2186 ovs_mutex_lock(&netdev->mutex);
2187 error = tc_query_qdisc(netdev_);
2189 error = (queue_id < netdev->tc->ops->n_queues
2190 && netdev->tc->ops->class_set
2191 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
2194 ovs_mutex_unlock(&netdev->mutex);
2200 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
2202 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2205 ovs_mutex_lock(&netdev->mutex);
2206 error = tc_query_qdisc(netdev_);
2208 if (netdev->tc->ops->class_delete) {
2209 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2211 ? netdev->tc->ops->class_delete(netdev_, queue)
2217 ovs_mutex_unlock(&netdev->mutex);
/* Retrieves statistics for queue 'queue_id' on 'netdev_' into '*stats'.
 * NOTE(review): the else branches (queue not found -> presumably ENOENT,
 * discipline without class_get_stats -> presumably EOPNOTSUPP) and the
 * final return are missing from this extract.  Visible code kept
 * byte-identical. */
2223 netdev_linux_get_queue_stats(const struct netdev *netdev_,
2224 unsigned int queue_id,
2225 struct netdev_queue_stats *stats)
2227 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2230 ovs_mutex_lock(&netdev->mutex);
2231 error = tc_query_qdisc(netdev_);
2233 if (netdev->tc->ops->class_get_stats) {
2234 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
/* Creation time comes from our bookkeeping, the rest from the kernel. */
2236 stats->created = queue->created;
2237 error = netdev->tc->ops->class_get_stats(netdev_, queue,
2246 ovs_mutex_unlock(&netdev->mutex);
/* State for iterating the kernel's traffic classes with RTM_GETTCLASS.
 * NOTE(review): the struct's 'buf' member (an ofpbuf, judging by its use in
 * start/finish_queue_dump) is missing from this extract. */
2251 struct queue_dump_state {
2252 struct nl_dump dump;
/* Begins an RTM_GETTCLASS dump of 'netdev''s traffic classes into '*state'.
 * NOTE(review): the failure return when tc_make_request() fails and the
 * final 'return true' are missing from this extract.  Visible code kept
 * byte-identical. */
2257 start_queue_dump(const struct netdev *netdev, struct queue_dump_state *state)
2259 struct ofpbuf request;
2260 struct tcmsg *tcmsg;
2262 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
/* Parent 0 asks the kernel for classes of every qdisc on the device. */
2266 tcmsg->tcm_parent = 0;
2267 nl_dump_start(&state->dump, NETLINK_ROUTE, &request);
2268 ofpbuf_uninit(&request);
2270 ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
2275 finish_queue_dump(struct queue_dump_state *state)
2277 ofpbuf_uninit(&state->buf);
2278 return nl_dump_done(&state->dump);
/* Iterator state for the queue-dump API: a snapshot of queue ids.
 * NOTE(review): the 'n_queues' and 'cur_queue' members (used by
 * queue_dump_start/next below) are missing from this extract. */
2281 struct netdev_linux_queue_state {
2282 unsigned int *queues;
/* Starts a queue dump on 'netdev_' by snapshotting every queue id into a
 * freshly allocated state stored in '*statep'.
 * NOTE(review): the declaration of 'i', the EOPNOTSUPP branch for
 * disciplines without class_get, and the final return are missing from this
 * extract.  Visible code kept byte-identical. */
2288 netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
2290 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2293 ovs_mutex_lock(&netdev->mutex);
2294 error = tc_query_qdisc(netdev_);
2296 if (netdev->tc->ops->class_get) {
2297 struct netdev_linux_queue_state *state;
2298 struct tc_queue *queue;
2301 *statep = state = xmalloc(sizeof *state);
2302 state->n_queues = hmap_count(&netdev->tc->queues);
2303 state->cur_queue = 0;
2304 state->queues = xmalloc(state->n_queues * sizeof *state->queues);
/* Snapshot ids now so iteration is safe against later changes. */
2307 HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
2308 state->queues[i++] = queue->queue_id;
2314 ovs_mutex_unlock(&netdev->mutex);
/* Advances the dump: finds the next snapshotted queue id that still exists,
 * stores it in '*queue_idp' and its configuration in 'details'.
 * NOTE(review): the initialization of 'error' (presumably EOF for
 * end-of-dump), the NULL-queue skip inside the loop, the loop break, and
 * the final return are missing from this extract.  Visible code kept
 * byte-identical. */
2320 netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
2321 unsigned int *queue_idp, struct smap *details)
2323 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2324 struct netdev_linux_queue_state *state = state_;
2327 ovs_mutex_lock(&netdev->mutex);
2328 while (state->cur_queue < state->n_queues) {
2329 unsigned int queue_id = state->queues[state->cur_queue++];
/* Queues may have been deleted since the snapshot: re-resolve each id. */
2330 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2333 *queue_idp = queue_id;
2334 error = netdev->tc->ops->class_get(netdev_, queue, details);
2338 ovs_mutex_unlock(&netdev->mutex);
2344 netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
2347 struct netdev_linux_queue_state *state = state_;
2349 free(state->queues);
/* Dumps statistics for every queue on 'netdev_', invoking 'cb' with 'aux'
 * per queue via the discipline's class_dump_stats callback over an
 * RTM_GETTCLASS netlink dump.
 * NOTE(review): missing from this extract are the declarations of
 * 'last_error'/'retval'/'msg', the EOPNOTSUPP assignment when
 * class_dump_stats is absent, the error-accumulation statements, and the
 * final return.  Visible code kept byte-identical. */
2355 netdev_linux_dump_queue_stats(const struct netdev *netdev_,
2356 netdev_dump_queue_stats_cb *cb, void *aux)
2358 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2361 ovs_mutex_lock(&netdev->mutex);
2362 error = tc_query_qdisc(netdev_);
2364 struct queue_dump_state state;
2366 if (!netdev->tc->ops->class_dump_stats) {
2368 } else if (!start_queue_dump(netdev_, &state)) {
/* One callback invocation per netlink class message. */
2374 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
2375 retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
2382 retval = finish_queue_dump(&state);
2388 ovs_mutex_unlock(&netdev->mutex);
/* Stores 'netdev_''s IPv4 address and netmask in '*address' and '*netmask',
 * querying them via SIOCGIFADDR/SIOCGIFNETMASK on first use and caching the
 * result.  An unset address yields EADDRNOTAVAIL.
 * NOTE(review): the EADDRNOTAVAIL-tolerant handling between the two ioctls
 * (lines 2404, 2407, 2409-2415) and the final return are missing from this
 * extract.  Visible code kept byte-identical. */
2394 netdev_linux_get_in4(const struct netdev *netdev_,
2395 struct in_addr *address, struct in_addr *netmask)
2397 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2400 ovs_mutex_lock(&netdev->mutex);
2401 if (!(netdev->cache_valid & VALID_IN4)) {
2402 error = netdev_linux_get_ipv4(netdev_, &netdev->address,
2403 SIOCGIFADDR, "SIOCGIFADDR");
2405 error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
2406 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2408 netdev->cache_valid |= VALID_IN4;
/* INADDR_ANY in the cache means "no address assigned". */
2416 if (netdev->address.s_addr != INADDR_ANY) {
2417 *address = netdev->address;
2418 *netmask = netdev->netmask;
2420 error = EADDRNOTAVAIL;
2423 ovs_mutex_unlock(&netdev->mutex);
2429 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2430 struct in_addr netmask)
2432 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2435 ovs_mutex_lock(&netdev->mutex);
2436 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2438 netdev->cache_valid |= VALID_IN4;
2439 netdev->address = address;
2440 netdev->netmask = netmask;
2441 if (address.s_addr != INADDR_ANY) {
2442 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2443 "SIOCSIFNETMASK", netmask);
2446 ovs_mutex_unlock(&netdev->mutex);
/* Parses one line of /proc/net/if_inet6: 32 hex digits of address, four
 * ignored hex fields, then the interface name.  On success stores the
 * address in '*in6' and the name (at most 16 bytes plus NUL) in 'ifname'
 * and returns true; returns false if the line does not match. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    uint8_t *s6 = in6->s6_addr;
#define X8 "%2"SCNx8
    return ovs_scan(line,
                    " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                    "%*x %*x %*x %*x %16s\n",
                    &s6[0], &s6[1], &s6[2], &s6[3],
                    &s6[4], &s6[5], &s6[6], &s6[7],
                    &s6[8], &s6[9], &s6[10], &s6[11],
                    &s6[12], &s6[13], &s6[14], &s6[15],
                    ifname);
#undef X8
}
2467 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2468 * 'in6' is non-null) and returns true. Otherwise, returns false. */
/* Scans /proc/net/if_inet6 for a line whose interface name matches
 * 'netdev_', caching the found address (or in6addr_any).
 * NOTE(review): the FILE*/line declarations, the loop break after a match,
 * the fclose(), the copy into '*in6', and the final return are missing from
 * this extract.  Visible code kept byte-identical. */
2470 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2472 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2474 ovs_mutex_lock(&netdev->mutex);
2475 if (!(netdev->cache_valid & VALID_IN6)) {
/* Default to "no address" before scanning. */
2479 netdev->in6 = in6addr_any;
2481 file = fopen("/proc/net/if_inet6", "r");
2483 const char *name = netdev_get_name(netdev_);
2484 while (fgets(line, sizeof line, file)) {
2485 struct in6_addr in6_tmp;
2486 char ifname[16 + 1];
2487 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2488 && !strcmp(name, ifname))
2490 netdev->in6 = in6_tmp;
2496 netdev->cache_valid |= VALID_IN6;
2499 ovs_mutex_unlock(&netdev->mutex);
/* Fills '*sa' with an AF_INET sockaddr for 'addr' with port 0.  Built in a
 * properly-typed sockaddr_in and then copied to avoid aliasing issues. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in sin;
    memset(&sin, 0, sizeof sin);
    sin.sin_family = AF_INET;
    sin.sin_addr = addr;
    sin.sin_port = 0;

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &sin, sizeof sin);
}
2518 do_set_addr(struct netdev *netdev,
2519 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2523 make_in4_sockaddr(&ifr.ifr_addr, addr);
2524 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2528 /* Adds 'router' as a default IP gateway. */
2530 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2532 struct in_addr any = { INADDR_ANY };
2536 memset(&rt, 0, sizeof rt);
/* Default route: destination and genmask are 0.0.0.0, gateway is 'router'. */
2537 make_in4_sockaddr(&rt.rt_dst, any);
2538 make_in4_sockaddr(&rt.rt_gateway, router);
2539 make_in4_sockaddr(&rt.rt_genmask, any);
2540 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2541 error = af_inet_ioctl(SIOCADDRT, &rt);
2543 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
/* Looks up the IPv4 next hop toward 'host' by scanning /proc/net/route.
 * On a match, stores the gateway (or 0 if directly reachable) in
 * '*next_hop' and a malloc'd device name in '*netdev_name'.  The caller
 * owns and must free '*netdev_name'. */
2549 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2552 static const char fn[] = "/proc/net/route";
2557 *netdev_name = NULL;
2558 stream = fopen(fn, "r");
2559 if (stream == NULL) {
2560 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
2565 while (fgets(line, sizeof line, stream)) {
2568 ovs_be32 dest, gateway, mask;
2569 int refcnt, metric, mtu;
2570 unsigned int flags, use, window, irtt;
/* One /proc/net/route row: Iface Dest Gateway Flags RefCnt Use Metric
 * Mask MTU Window IRTT. */
2573 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2575 iface, &dest, &gateway, &flags, &refcnt,
2576 &use, &metric, &mask, &mtu, &window, &irtt)) {
2577 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2581 if (!(flags & RTF_UP)) {
2582 /* Skip routes that aren't up. */
2586 /* The output of 'dest', 'mask', and 'gateway' were given in
2587 * network byte order, so we don't need need any endian
2588 * conversions here. */
2589 if ((dest & mask) == (host->s_addr & mask)) {
2591 /* The host is directly reachable. */
2592 next_hop->s_addr = 0;
2594 /* To reach the host, we must go through a gateway. */
2595 next_hop->s_addr = gateway;
2597 *netdev_name = xstrdup(iface);
/* Fills 'smap' with driver name/version and firmware version obtained
 * via ETHTOOL_GDRVINFO, caching the result under VALID_DRVINFO. */
2609 netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
2611 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2614 ovs_mutex_lock(&netdev->mutex);
2615 if (!(netdev->cache_valid & VALID_DRVINFO)) {
/* The ethtool helper takes a 'struct ethtool_cmd *', so alias drvinfo. */
2616 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
2618 COVERAGE_INC(netdev_get_ethtool);
2619 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2620 error = netdev_linux_do_ethtool(netdev->up.name,
2623 "ETHTOOL_GDRVINFO");
2625 netdev->cache_valid |= VALID_DRVINFO;
2630 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2631 smap_add(smap, "driver_version", netdev->drvinfo.version);
2632 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
2634 ovs_mutex_unlock(&netdev->mutex);
/* Status for "internal" devices: a fixed driver name, nothing else. */
2640 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2643 smap_add(smap, "driver_name", "openvswitch");
2647 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2648 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2649 * returns 0. Otherwise, it returns a positive errno value; in particular,
2650 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2652 netdev_linux_arp_lookup(const struct netdev *netdev,
2653 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2656 struct sockaddr_in sin;
2659 memset(&r, 0, sizeof r);
2660 memset(&sin, 0, sizeof sin);
2661 sin.sin_family = AF_INET;
2662 sin.sin_addr.s_addr = ip;
/* arp_pa is a generic sockaddr; copy the AF_INET form into it. */
2664 memcpy(&r.arp_pa, &sin, sizeof sin);
2665 r.arp_ha.sa_family = ARPHRD_ETHER;
2667 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2668 COVERAGE_INC(netdev_arp_lookup);
2669 retval = af_inet_ioctl(SIOCGARP, &r);
2671 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
/* ENXIO just means "no entry", which the caller handles; don't log it. */
2672 } else if (retval != ENXIO) {
2673 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2674 netdev_get_name(netdev), IP_ARGS(ip),
2675 ovs_strerror(retval));
/* Converts netdev flag bits (NETDEV_*) to the kernel's IFF_* bits. */
2681 nd_to_iff_flags(enum netdev_flags nd)
2684 if (nd & NETDEV_UP) {
2687 if (nd & NETDEV_PROMISC) {
2690 if (nd & NETDEV_LOOPBACK) {
2691 iff |= IFF_LOOPBACK;
/* Converts kernel IFF_* bits back to netdev flag bits (NETDEV_*). */
2697 iff_to_nd_flags(int iff)
2699 enum netdev_flags nd = 0;
2703 if (iff & IFF_PROMISC) {
2704 nd |= NETDEV_PROMISC;
2706 if (iff & IFF_LOOPBACK) {
2707 nd |= NETDEV_LOOPBACK;
/* Clears 'off' and sets 'on' in 'netdev''s kernel interface flags, storing
 * the previous flags in '*old_flagsp'.  Must be called with the device
 * mutex held.  Re-reads the flags afterward to refresh the cached copy. */
2713 update_flags(struct netdev_linux *netdev, enum netdev_flags off,
2714 enum netdev_flags on, enum netdev_flags *old_flagsp)
2715 OVS_REQUIRES(netdev->mutex)
2717 int old_flags, new_flags;
2720 old_flags = netdev->ifi_flags;
2721 *old_flagsp = iff_to_nd_flags(old_flags);
2722 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
/* Only touch the kernel when the flags actually change. */
2723 if (new_flags != old_flags) {
2724 error = set_flags(netdev_get_name(&netdev->up), new_flags);
2725 get_flags(&netdev->up, &netdev->ifi_flags);
/* netdev_class 'update_flags' hook: locks the mutex and delegates to
 * update_flags() above. */
2732 netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2733 enum netdev_flags on, enum netdev_flags *old_flagsp)
2735 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2738 ovs_mutex_lock(&netdev->mutex);
2739 error = update_flags(netdev, off, on, old_flagsp);
2740 ovs_mutex_unlock(&netdev->mutex);
/* Expands to a 'struct netdev_class' initializer shared by the Linux system,
 * tap, and internal device classes.  Only the construct, get_stats,
 * get_features, and get_status hooks vary per class; everything else uses
 * the common netdev_linux_* implementations, with NULL for unsupported
 * operations (tunnel config, header push/pop, multiqueue, ...). */
2745 #define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS, \
2746 GET_FEATURES, GET_STATUS) \
2752 netdev_linux_wait, \
2754 netdev_linux_alloc, \
2756 netdev_linux_destruct, \
2757 netdev_linux_dealloc, \
2758 NULL, /* get_config */ \
2759 NULL, /* set_config */ \
2760 NULL, /* get_tunnel_config */ \
2761 NULL, /* build header */ \
2762 NULL, /* push header */ \
2763 NULL, /* pop header */ \
2764 NULL, /* get_numa_id */ \
2765 NULL, /* set_multiq */ \
2767 netdev_linux_send, \
2768 netdev_linux_send_wait, \
2770 netdev_linux_set_etheraddr, \
2771 netdev_linux_get_etheraddr, \
2772 netdev_linux_get_mtu, \
2773 netdev_linux_set_mtu, \
2774 netdev_linux_get_ifindex, \
2775 netdev_linux_get_carrier, \
2776 netdev_linux_get_carrier_resets, \
2777 netdev_linux_set_miimon_interval, \
2781 netdev_linux_set_advertisements, \
2783 netdev_linux_set_policing, \
2784 netdev_linux_get_qos_types, \
2785 netdev_linux_get_qos_capabilities, \
2786 netdev_linux_get_qos, \
2787 netdev_linux_set_qos, \
2788 netdev_linux_get_queue, \
2789 netdev_linux_set_queue, \
2790 netdev_linux_delete_queue, \
2791 netdev_linux_get_queue_stats, \
2792 netdev_linux_queue_dump_start, \
2793 netdev_linux_queue_dump_next, \
2794 netdev_linux_queue_dump_done, \
2795 netdev_linux_dump_queue_stats, \
2797 netdev_linux_get_in4, \
2798 netdev_linux_set_in4, \
2799 netdev_linux_get_in6, \
2800 netdev_linux_add_router, \
2801 netdev_linux_get_next_hop, \
2803 netdev_linux_arp_lookup, \
2805 netdev_linux_update_flags, \
2807 netdev_linux_rxq_alloc, \
2808 netdev_linux_rxq_construct, \
2809 netdev_linux_rxq_destruct, \
2810 netdev_linux_rxq_dealloc, \
2811 netdev_linux_rxq_recv, \
2812 netdev_linux_rxq_wait, \
2813 netdev_linux_rxq_drain, \
/* The "system" class: ordinary kernel network devices. */
2816 const struct netdev_class netdev_linux_class =
2819 netdev_linux_construct,
2820 netdev_linux_get_stats,
2821 netdev_linux_get_features,
2822 netdev_linux_get_status);
/* The "tap" class: userspace tun/tap devices; stats direction differs
 * from the system class, hence netdev_tap_get_stats. */
2824 const struct netdev_class netdev_tap_class =
2827 netdev_linux_construct_tap,
2828 netdev_tap_get_stats,
2829 netdev_linux_get_features,
2830 netdev_linux_get_status);
/* The "internal" class: OVS internal ports; no meaningful link features. */
2832 const struct netdev_class netdev_internal_class =
2835 netdev_linux_construct,
2836 netdev_internal_get_stats,
2837 NULL, /* get_features */
2838 netdev_internal_get_status);
2841 #define CODEL_N_QUEUES 0x0000
2843 /* In sufficiently new kernel headers these are defined as enums in
2844 * <linux/pkt_sched.h>. Define them here as macros to help out with older
2845 * kernels. (This overrides any enum definition in the header file but that's
2847 #define TCA_CODEL_TARGET 1
2848 #define TCA_CODEL_LIMIT 2
2849 #define TCA_CODEL_INTERVAL 3
/* Returns the 'struct codel' embedded in 'netdev_''s tc state. */
2858 static struct codel *
2859 codel_get__(const struct netdev *netdev_)
2861 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2862 return CONTAINER_OF(netdev->tc, struct codel, tc);
/* Records CoDel parameters in a freshly allocated in-memory tc state for
 * 'netdev_'.  Does not talk to the kernel; see codel_setup_qdisc__(). */
2866 codel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
2869 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2870 struct codel *codel;
2872 codel = xmalloc(sizeof *codel);
2873 tc_init(&codel->tc, &tc_ops_codel);
2874 codel->target = target;
2875 codel->limit = limit;
2876 codel->interval = interval;
2878 netdev->tc = &codel->tc;
/* Replaces 'netdev''s root qdisc with a "codel" qdisc configured with
 * 'target'/'limit'/'interval', substituting defaults (5000/10240/100000)
 * for zero values.  Returns 0 or a positive errno from the tc exchange. */
2882 codel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
2886 struct ofpbuf request;
2887 struct tcmsg *tcmsg;
2888 uint32_t otarget, olimit, ointerval;
2891 tc_del_qdisc(netdev);
2893 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2894 NLM_F_EXCL | NLM_F_CREATE, &request);
2898 tcmsg->tcm_handle = tc_make_handle(1, 0);
2899 tcmsg->tcm_parent = TC_H_ROOT;
/* Zero means "use default"; these match codel_parse_qdisc_details__(). */
2901 otarget = target ? target : 5000;
2902 olimit = limit ? limit : 10240;
2903 ointerval = interval ? interval : 100000;
2905 nl_msg_put_string(&request, TCA_KIND, "codel");
2906 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2907 nl_msg_put_u32(&request, TCA_CODEL_TARGET, otarget);
2908 nl_msg_put_u32(&request, TCA_CODEL_LIMIT, olimit);
2909 nl_msg_put_u32(&request, TCA_CODEL_INTERVAL, ointerval);
2910 nl_msg_end_nested(&request, opt_offset);
2912 error = tc_transact(&request, NULL);
2914 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
2915 "target %u, limit %u, interval %u error %d(%s)",
2916 netdev_get_name(netdev),
2917 otarget, olimit, ointerval,
2918 error, ovs_strerror(error));
/* Parses "target", "limit", and "interval" from 'details' into 'codel',
 * applying the same defaults as codel_setup_qdisc__() for missing or
 * zero values. */
2924 codel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
2925 const struct smap *details, struct codel *codel)
2927 const char *target_s;
2928 const char *limit_s;
2929 const char *interval_s;
2931 target_s = smap_get(details, "target");
2932 limit_s = smap_get(details, "limit");
2933 interval_s = smap_get(details, "interval");
2935 codel->target = target_s ? strtoull(target_s, NULL, 10) : 0;
2936 codel->limit = limit_s ? strtoull(limit_s, NULL, 10) : 0;
2937 codel->interval = interval_s ? strtoull(interval_s, NULL, 10) : 0;
2939 if (!codel->target) {
2940 codel->target = 5000;
2942 if (!codel->limit) {
2943 codel->limit = 10240;
2945 if (!codel->interval) {
2946 codel->interval = 100000;
/* tc_ops 'tc_install' hook: configures the kernel qdisc and, on success,
 * mirrors the parameters into the in-memory tc state. */
2951 codel_tc_install(struct netdev *netdev, const struct smap *details)
2956 codel_parse_qdisc_details__(netdev, details, &codel);
2957 error = codel_setup_qdisc__(netdev, codel.target, codel.limit,
2960 codel_install__(netdev, codel.target, codel.limit, codel.interval);
/* Extracts TCA_CODEL_{TARGET,LIMIT,INTERVAL} u32 attributes from
 * 'nl_options' into 'codel'.  All three are required by the policy. */
2966 codel_parse_tca_options__(struct nlattr *nl_options, struct codel *codel)
2968 static const struct nl_policy tca_codel_policy[] = {
2969 [TCA_CODEL_TARGET] = { .type = NL_A_U32 },
2970 [TCA_CODEL_LIMIT] = { .type = NL_A_U32 },
2971 [TCA_CODEL_INTERVAL] = { .type = NL_A_U32 }
2974 struct nlattr *attrs[ARRAY_SIZE(tca_codel_policy)];
2976 if (!nl_parse_nested(nl_options, tca_codel_policy,
2977 attrs, ARRAY_SIZE(tca_codel_policy))) {
2978 VLOG_WARN_RL(&rl, "failed to parse CoDel class options");
2982 codel->target = nl_attr_get_u32(attrs[TCA_CODEL_TARGET]);
2983 codel->limit = nl_attr_get_u32(attrs[TCA_CODEL_LIMIT]);
2984 codel->interval = nl_attr_get_u32(attrs[TCA_CODEL_INTERVAL]);
/* tc_ops 'tc_load' hook: reconstructs in-memory state from a kernel
 * RTM_NEWQDISC message for an existing codel qdisc. */
2989 codel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
2991 struct nlattr *nlattr;
2996 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3001 error = codel_parse_tca_options__(nlattr, &codel);
3006 codel_install__(netdev, codel.target, codel.limit, codel.interval);
/* tc_ops 'tc_destroy' hook: frees the codel state allocated by
 * codel_install__(). */
3012 codel_tc_destroy(struct tc *tc)
3014 struct codel *codel = CONTAINER_OF(tc, struct codel, tc);
/* tc_ops 'qdisc_get' hook: reports the current parameters in 'details'. */
3020 codel_qdisc_get(const struct netdev *netdev, struct smap *details)
3022 const struct codel *codel = codel_get__(netdev);
3023 smap_add_format(details, "target", "%u", codel->target);
3024 smap_add_format(details, "limit", "%u", codel->limit);
3025 smap_add_format(details, "interval", "%u", codel->interval);
/* tc_ops 'qdisc_set' hook: updates the in-memory parameters from
 * 'details' (the kernel qdisc is reconfigured elsewhere). */
3030 codel_qdisc_set(struct netdev *netdev, const struct smap *details)
3034 codel_parse_qdisc_details__(netdev, details, &codel);
3035 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3036 codel_get__(netdev)->target = codel.target;
3037 codel_get__(netdev)->limit = codel.limit;
3038 codel_get__(netdev)->interval = codel.interval;
/* tc_ops vtable binding the codel hooks to qdisc kind "codel" /
 * OVS QoS type "linux-codel". */
3042 static const struct tc_ops tc_ops_codel = {
3043 "codel", /* linux_name */
3044 "linux-codel", /* ovs_name */
3045 CODEL_N_QUEUES, /* n_queues */
3058 /* FQ-CoDel traffic control class. */
3060 #define FQCODEL_N_QUEUES 0x0000
3062 /* In sufficiently new kernel headers these are defined as enums in
3063 * <linux/pkt_sched.h>. Define them here as macros to help out with older
3064 * kernels. (This overrides any enum definition in the header file but that's
3066 #define TCA_FQ_CODEL_TARGET 1
3067 #define TCA_FQ_CODEL_LIMIT 2
3068 #define TCA_FQ_CODEL_INTERVAL 3
3069 #define TCA_FQ_CODEL_ECN 4
3070 #define TCA_FQ_CODEL_FLOWS 5
3071 #define TCA_FQ_CODEL_QUANTUM 6
/* Returns the 'struct fqcodel' embedded in 'netdev_''s tc state. */
3082 static struct fqcodel *
3083 fqcodel_get__(const struct netdev *netdev_)
3085 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3086 return CONTAINER_OF(netdev->tc, struct fqcodel, tc);
/* Records FQ-CoDel parameters in a freshly allocated in-memory tc state
 * for 'netdev_'.  Does not talk to the kernel. */
3090 fqcodel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
3091 uint32_t interval, uint32_t flows, uint32_t quantum)
3093 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3094 struct fqcodel *fqcodel;
3096 fqcodel = xmalloc(sizeof *fqcodel);
3097 tc_init(&fqcodel->tc, &tc_ops_fqcodel);
3098 fqcodel->target = target;
3099 fqcodel->limit = limit;
3100 fqcodel->interval = interval;
3101 fqcodel->flows = flows;
3102 fqcodel->quantum = quantum;
3104 netdev->tc = &fqcodel->tc;
/* Replaces 'netdev''s root qdisc with an "fq_codel" qdisc, substituting
 * defaults (target 5000, limit 10240, interval 100000, 1024 flows,
 * quantum 1514) for zero parameters.  Returns 0 or a positive errno. */
3108 fqcodel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
3109 uint32_t interval, uint32_t flows, uint32_t quantum)
3112 struct ofpbuf request;
3113 struct tcmsg *tcmsg;
3114 uint32_t otarget, olimit, ointerval, oflows, oquantum;
3117 tc_del_qdisc(netdev);
3119 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3120 NLM_F_EXCL | NLM_F_CREATE, &request);
3124 tcmsg->tcm_handle = tc_make_handle(1, 0);
3125 tcmsg->tcm_parent = TC_H_ROOT;
3127 otarget = target ? target : 5000;
3128 olimit = limit ? limit : 10240;
3129 ointerval = interval ? interval : 100000;
3130 oflows = flows ? flows : 1024;
3131 oquantum = quantum ? quantum : 1514; /* fq_codel default quantum is 1514
3134 nl_msg_put_string(&request, TCA_KIND, "fq_codel");
3135 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3136 nl_msg_put_u32(&request, TCA_FQ_CODEL_TARGET, otarget);
3137 nl_msg_put_u32(&request, TCA_FQ_CODEL_LIMIT, olimit);
3138 nl_msg_put_u32(&request, TCA_FQ_CODEL_INTERVAL, ointerval);
3139 nl_msg_put_u32(&request, TCA_FQ_CODEL_FLOWS, oflows);
3140 nl_msg_put_u32(&request, TCA_FQ_CODEL_QUANTUM, oquantum);
3141 nl_msg_end_nested(&request, opt_offset);
3143 error = tc_transact(&request, NULL);
3145 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3146 "target %u, limit %u, interval %u, flows %u, quantum %u error %d(%s)",
3147 netdev_get_name(netdev),
3148 otarget, olimit, ointerval, oflows, oquantum,
3149 error, ovs_strerror(error));
3155 fqcodel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
3156 const struct smap *details, struct fqcodel *fqcodel)
3158 const char *target_s;
3159 const char *limit_s;
3160 const char *interval_s;
3161 const char *flows_s;
3162 const char *quantum_s;
3164 target_s = smap_get(details, "target");
3165 limit_s = smap_get(details, "limit");
3166 interval_s = smap_get(details, "interval");
3167 flows_s = smap_get(details, "flows");
3168 quantum_s = smap_get(details, "quantum");
3169 fqcodel->target = target_s ? strtoull(target_s, NULL, 10) : 0;
3170 fqcodel->limit = limit_s ? strtoull(limit_s, NULL, 10) : 0;
3171 fqcodel->interval = interval_s ? strtoull(interval_s, NULL, 10) : 0;
3172 fqcodel->flows = flows_s ? strtoull(flows_s, NULL, 10) : 0;
3173 fqcodel->quantum = quantum_s ? strtoull(quantum_s, NULL, 10) : 0;
3174 if (!fqcodel->target) {
3175 fqcodel->target = 5000;
3177 if (!fqcodel->limit) {
3178 fqcodel->limit = 10240;
3180 if (!fqcodel->interval) {
3181 fqcodel->interval = 1000000;
3183 if (!fqcodel->flows) {
3184 fqcodel->flows = 1024;
3186 if (!fqcodel->quantum) {
3187 fqcodel->quantum = 1514;
/* tc_ops 'tc_install' hook: configures the kernel fq_codel qdisc and,
 * on success, mirrors the parameters into the in-memory tc state. */
3192 fqcodel_tc_install(struct netdev *netdev, const struct smap *details)
3195 struct fqcodel fqcodel;
3197 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3198 error = fqcodel_setup_qdisc__(netdev, fqcodel.target, fqcodel.limit,
3199 fqcodel.interval, fqcodel.flows,
3202 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit,
3203 fqcodel.interval, fqcodel.flows, fqcodel.quantum);
/* Extracts TCA_FQ_CODEL_* u32 attributes from 'nl_options' into
 * 'fqcodel'.  All five are required by the policy. */
3209 fqcodel_parse_tca_options__(struct nlattr *nl_options, struct fqcodel *fqcodel)
3211 static const struct nl_policy tca_fqcodel_policy[] = {
3212 [TCA_FQ_CODEL_TARGET] = { .type = NL_A_U32 },
3213 [TCA_FQ_CODEL_LIMIT] = { .type = NL_A_U32 },
3214 [TCA_FQ_CODEL_INTERVAL] = { .type = NL_A_U32 },
3215 [TCA_FQ_CODEL_FLOWS] = { .type = NL_A_U32 },
3216 [TCA_FQ_CODEL_QUANTUM] = { .type = NL_A_U32 }
3219 struct nlattr *attrs[ARRAY_SIZE(tca_fqcodel_policy)];
3221 if (!nl_parse_nested(nl_options, tca_fqcodel_policy,
3222 attrs, ARRAY_SIZE(tca_fqcodel_policy))) {
3223 VLOG_WARN_RL(&rl, "failed to parse FQ_CoDel class options");
3227 fqcodel->target = nl_attr_get_u32(attrs[TCA_FQ_CODEL_TARGET]);
3228 fqcodel->limit = nl_attr_get_u32(attrs[TCA_FQ_CODEL_LIMIT]);
3229 fqcodel->interval =nl_attr_get_u32(attrs[TCA_FQ_CODEL_INTERVAL]);
3230 fqcodel->flows = nl_attr_get_u32(attrs[TCA_FQ_CODEL_FLOWS]);
3231 fqcodel->quantum = nl_attr_get_u32(attrs[TCA_FQ_CODEL_QUANTUM]);
/* tc_ops 'tc_load' hook: reconstructs in-memory state from a kernel
 * RTM_NEWQDISC message for an existing fq_codel qdisc. */
3236 fqcodel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3238 struct nlattr *nlattr;
3241 struct fqcodel fqcodel;
3243 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3248 error = fqcodel_parse_tca_options__(nlattr, &fqcodel);
3253 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3254 fqcodel.flows, fqcodel.quantum);
/* tc_ops 'tc_destroy' hook: frees the fqcodel state allocated by
 * fqcodel_install__(). */
3259 fqcodel_tc_destroy(struct tc *tc)
3261 struct fqcodel *fqcodel = CONTAINER_OF(tc, struct fqcodel, tc);
/* tc_ops 'qdisc_get' hook: reports the current parameters in 'details'. */
3267 fqcodel_qdisc_get(const struct netdev *netdev, struct smap *details)
3269 const struct fqcodel *fqcodel = fqcodel_get__(netdev);
3270 smap_add_format(details, "target", "%u", fqcodel->target);
3271 smap_add_format(details, "limit", "%u", fqcodel->limit);
3272 smap_add_format(details, "interval", "%u", fqcodel->interval);
3273 smap_add_format(details, "flows", "%u", fqcodel->flows);
3274 smap_add_format(details, "quantum", "%u", fqcodel->quantum);
/* tc_ops 'qdisc_set' hook: updates the in-memory parameters from
 * 'details'. */
3279 fqcodel_qdisc_set(struct netdev *netdev, const struct smap *details)
3281 struct fqcodel fqcodel;
3283 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3284 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3285 fqcodel.flows, fqcodel.quantum);
3286 fqcodel_get__(netdev)->target = fqcodel.target;
3287 fqcodel_get__(netdev)->limit = fqcodel.limit;
3288 fqcodel_get__(netdev)->interval = fqcodel.interval;
3289 fqcodel_get__(netdev)->flows = fqcodel.flows;
3290 fqcodel_get__(netdev)->quantum = fqcodel.quantum;
/* tc_ops vtable binding the fq_codel hooks to qdisc kind "fq_codel" /
 * OVS QoS type "linux-fq_codel". */
3294 static const struct tc_ops tc_ops_fqcodel = {
3295 "fq_codel", /* linux_name */
3296 "linux-fq_codel", /* ovs_name */
3297 FQCODEL_N_QUEUES, /* n_queues */
3310 /* SFQ traffic control class. */
3312 #define SFQ_N_QUEUES 0x0000
/* Returns the 'struct sfq' embedded in 'netdev_''s tc state. */
3321 sfq_get__(const struct netdev *netdev_)
3323 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3324 return CONTAINER_OF(netdev->tc, struct sfq, tc);
/* Records SFQ parameters -- note the order: 'quantum' then 'perturb' --
 * in a freshly allocated in-memory tc state for 'netdev_'. */
3328 sfq_install__(struct netdev *netdev_, uint32_t quantum, uint32_t perturb)
3330 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3333 sfq = xmalloc(sizeof *sfq);
3334 tc_init(&sfq->tc, &tc_ops_sfq);
3335 sfq->perturb = perturb;
3336 sfq->quantum = quantum;
3338 netdev->tc = &sfq->tc;
/* Replaces 'netdev''s root qdisc with an "sfq" qdisc.  A zero 'quantum'
 * falls back to the device MTU (when it could be read); a zero 'perturb'
 * falls back to 10 seconds.  Returns 0 or a positive errno. */
3342 sfq_setup_qdisc__(struct netdev *netdev, uint32_t quantum, uint32_t perturb)
3344 struct tc_sfq_qopt opt;
3345 struct ofpbuf request;
3346 struct tcmsg *tcmsg;
3348 int mtu_error, error;
3349 mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3351 tc_del_qdisc(netdev);
3353 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3354 NLM_F_EXCL | NLM_F_CREATE, &request);
3358 tcmsg->tcm_handle = tc_make_handle(1, 0);
3359 tcmsg->tcm_parent = TC_H_ROOT;
3361 memset(&opt, 0, sizeof opt);
3364 opt.quantum = mtu; /* if we cannot find mtu, use default */
3367 opt.quantum = quantum;
3371 opt.perturb_period = 10;
3373 opt.perturb_period = perturb;
/* SFQ options go as a raw tc_sfq_qopt blob, not nested attributes. */
3376 nl_msg_put_string(&request, TCA_KIND, "sfq");
3377 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3379 error = tc_transact(&request, NULL);
3381 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3382 "quantum %u, perturb %u error %d(%s)",
3383 netdev_get_name(netdev),
3384 opt.quantum, opt.perturb_period,
3385 error, ovs_strerror(error));
/* Parses "perturb" and "quantum" from 'details' into 'sfq'.  A missing
 * quantum falls back to the device MTU; without an MTU a warning is
 * logged since SFQ then has no usable quantum. */
3391 sfq_parse_qdisc_details__(struct netdev *netdev,
3392 const struct smap *details, struct sfq *sfq)
3394 const char *perturb_s;
3395 const char *quantum_s;
3399 perturb_s = smap_get(details, "perturb");
3400 quantum_s = smap_get(details, "quantum");
3401 sfq->perturb = perturb_s ? strtoull(perturb_s, NULL, 10) : 0;
3402 sfq->quantum = quantum_s ? strtoull(quantum_s, NULL, 10) : 0;
3403 if (!sfq->perturb) {
3407 if (!sfq->quantum) {
3408 mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3412 VLOG_WARN_RL(&rl, "when using SFQ, you must specify quantum on a "
3413 "device without mtu");
/* tc_ops 'tc_install' hook: configures the kernel sfq qdisc and, on
 * success, mirrors the parameters into the in-memory tc state. */
3420 sfq_tc_install(struct netdev *netdev, const struct smap *details)
3425 sfq_parse_qdisc_details__(netdev, details, &sfq);
3426 error = sfq_setup_qdisc__(netdev, sfq.quantum, sfq.perturb);
3428 sfq_install__(netdev, sfq.quantum, sfq.perturb);
3434 sfq_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3436 const struct tc_sfq_qopt *sfq;
3437 struct nlattr *nlattr;
3441 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3443 sfq = nl_attr_get(nlattr);
3444 sfq_install__(netdev, sfq->perturb_period, sfq->quantum);
/* tc_ops 'tc_destroy' hook: frees the sfq state allocated by
 * sfq_install__(). */
3452 sfq_tc_destroy(struct tc *tc)
3454 struct sfq *sfq = CONTAINER_OF(tc, struct sfq, tc);
/* tc_ops 'qdisc_get' hook: reports quantum and perturb in 'details'. */
3460 sfq_qdisc_get(const struct netdev *netdev, struct smap *details)
3462 const struct sfq *sfq = sfq_get__(netdev);
3463 smap_add_format(details, "quantum", "%u", sfq->quantum);
3464 smap_add_format(details, "perturb", "%u", sfq->perturb);
/* tc_ops 'qdisc_set' hook: updates the in-memory parameters from
 * 'details'. */
3469 sfq_qdisc_set(struct netdev *netdev, const struct smap *details)
3473 sfq_parse_qdisc_details__(netdev, details, &sfq);
3474 sfq_install__(netdev, sfq.quantum, sfq.perturb);
3475 sfq_get__(netdev)->quantum = sfq.quantum;
3476 sfq_get__(netdev)->perturb = sfq.perturb;
/* tc_ops vtable binding the SFQ hooks to qdisc kind "sfq" /
 * OVS QoS type "linux-sfq". */
3480 static const struct tc_ops tc_ops_sfq = {
3481 "sfq", /* linux_name */
3482 "linux-sfq", /* ovs_name */
3483 SFQ_N_QUEUES, /* n_queues */
3496 /* HTB traffic control class. */
3498 #define HTB_N_QUEUES 0xf000
3499 #define HTB_RATE2QUANTUM 10
3503 unsigned int max_rate; /* In bytes/s. */
3507 struct tc_queue tc_queue;
3508 unsigned int min_rate; /* In bytes/s. */
3509 unsigned int max_rate; /* In bytes/s. */
3510 unsigned int burst; /* In bytes. */
3511 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the 'struct htb' embedded in 'netdev_''s tc state. */
3515 htb_get__(const struct netdev *netdev_)
3517 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3518 return CONTAINER_OF(netdev->tc, struct htb, tc);
/* Records 'max_rate' (bytes/s) in a freshly allocated in-memory HTB tc
 * state for 'netdev_'.  Does not talk to the kernel. */
3522 htb_install__(struct netdev *netdev_, uint64_t max_rate)
3524 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3527 htb = xmalloc(sizeof *htb);
3528 tc_init(&htb->tc, &tc_ops_htb);
3529 htb->max_rate = max_rate;
3531 netdev->tc = &htb->tc;
3534 /* Create an HTB qdisc.
3536 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
3538 htb_setup_qdisc__(struct netdev *netdev)
3541 struct tc_htb_glob opt;
3542 struct ofpbuf request;
3543 struct tcmsg *tcmsg;
3545 tc_del_qdisc(netdev);
3547 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3548 NLM_F_EXCL | NLM_F_CREATE, &request);
3552 tcmsg->tcm_handle = tc_make_handle(1, 0);
3553 tcmsg->tcm_parent = TC_H_ROOT;
3555 nl_msg_put_string(&request, TCA_KIND, "htb");
3557 memset(&opt, 0, sizeof opt);
/* r2q controls the rate-to-quantum conversion for classes that don't
 * set an explicit quantum. */
3558 opt.rate2quantum = HTB_RATE2QUANTUM;
3562 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3563 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
3564 nl_msg_end_nested(&request, opt_offset);
3566 return tc_transact(&request, NULL);
3569 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
3570 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
3572 htb_setup_class__(struct netdev *netdev, unsigned int handle,
3573 unsigned int parent, struct htb_class *class)
3576 struct tc_htb_opt opt;
3577 struct ofpbuf request;
3578 struct tcmsg *tcmsg;
/* The MTU is needed to build the rate tables; fail early without it. */
3582 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3584 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
3585 netdev_get_name(netdev));
3589 memset(&opt, 0, sizeof opt);
3590 tc_fill_rate(&opt.rate, class->min_rate, mtu);
3591 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
3592 /* Makes sure the quantum is at least MTU. Setting quantum will
3593 * make htb ignore the r2q for this class. */
3594 if ((class->min_rate / HTB_RATE2QUANTUM) < mtu) {
3597 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
3598 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
3599 opt.prio = class->priority;
3601 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3605 tcmsg->tcm_handle = handle;
3606 tcmsg->tcm_parent = parent;
3608 nl_msg_put_string(&request, TCA_KIND, "htb");
3609 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3610 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
/* The kernel requires explicit rate tables alongside the parameters. */
3611 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
3612 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
3613 nl_msg_end_nested(&request, opt_offset);
3615 error = tc_transact(&request, NULL);
3617 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3618 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
3619 netdev_get_name(netdev),
3620 tc_get_major(handle), tc_get_minor(handle),
3621 tc_get_major(parent), tc_get_minor(parent),
3622 class->min_rate, class->max_rate,
3623 class->burst, class->priority, ovs_strerror(error));
3628 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
3629 * description of them into 'details'. The description complies with the
3630 * specification given in the vswitch database documentation for linux-htb
3633 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
3635 static const struct nl_policy tca_htb_policy[] = {
3636 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
3637 .min_len = sizeof(struct tc_htb_opt) },
3640 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
3641 const struct tc_htb_opt *htb;
3643 if (!nl_parse_nested(nl_options, tca_htb_policy,
3644 attrs, ARRAY_SIZE(tca_htb_policy))) {
3645 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
3649 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
3650 class->min_rate = htb->rate.rate;
3651 class->max_rate = htb->ceil.rate;
/* 'buffer' is stored in ticks; convert back to bytes for reporting. */
3652 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
3653 class->priority = htb->prio;
/* Parses an RTM_NEWTCLASS message: derives the 0-based queue id from the
 * class handle's minor number (when 'queue_id' is nonnull) and fills
 * 'options' and 'stats' when requested. */
3658 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3659 struct htb_class *options,
3660 struct netdev_queue_stats *stats)
3662 struct nlattr *nl_options;
3663 unsigned int handle;
3666 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3667 if (!error && queue_id) {
3668 unsigned int major = tc_get_major(handle);
3669 unsigned int minor = tc_get_minor(handle);
/* Queue N is stored as class handle 1:N+1. */
3670 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3671 *queue_id = minor - 1;
3676 if (!error && options) {
3677 error = htb_parse_tca_options__(nl_options, options);
/* Parses qdisc-level HTB configuration from 'details': "max-rate" in
 * bits/s (stored as bytes/s).  When absent, falls back to the link speed
 * reported by ethtool, defaulting to 100 Mbps.  min_rate mirrors
 * max_rate for the default class. */
3683 htb_parse_qdisc_details__(struct netdev *netdev_,
3684 const struct smap *details, struct htb_class *hc)
3686 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3687 const char *max_rate_s;
3689 max_rate_s = smap_get(details, "max-rate");
3690 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3691 if (!hc->max_rate) {
3692 enum netdev_features current;
3694 netdev_linux_read_features(netdev);
3695 current = !netdev->get_features_error ? netdev->current : 0;
3696 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3698 hc->min_rate = hc->max_rate;
/* Parses per-class HTB configuration ("min-rate", "max-rate", "burst",
 * "priority") from 'details' into 'hc', clamping rates into sane ranges
 * relative to the MTU and the qdisc-wide max_rate. */
3704 htb_parse_class_details__(struct netdev *netdev,
3705 const struct smap *details, struct htb_class *hc)
3707 const struct htb *htb = htb_get__(netdev);
3708 const char *min_rate_s = smap_get(details, "min-rate");
3709 const char *max_rate_s = smap_get(details, "max-rate");
3710 const char *burst_s = smap_get(details, "burst");
3711 const char *priority_s = smap_get(details, "priority");
3714 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3716 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
3717 netdev_get_name(netdev));
3721 /* HTB requires at least an mtu sized min-rate to send any traffic even
3722 * on uncongested links. */
3723 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3724 hc->min_rate = MAX(hc->min_rate, mtu);
3725 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
3728 hc->max_rate = (max_rate_s
3729 ? strtoull(max_rate_s, NULL, 10) / 8
3731 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
3732 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
3736 * According to hints in the documentation that I've read, it is important
3737 * that 'burst' be at least as big as the largest frame that might be
3738 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
3739 * but having it a bit too small is a problem. Since netdev_get_mtu()
3740 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
3741 * the MTU. We actually add 64, instead of 14, as a guard against
3742 * additional headers get tacked on somewhere that we're not aware of. */
3743 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
3744 hc->burst = MAX(hc->burst, mtu + 64);
3747 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Queries the kernel for HTB class 'handle' under 'parent' on 'netdev'
 * and parses the reply into 'options' and/or 'stats'.  Returns 0 or a
 * positive errno. */
3753 htb_query_class__(const struct netdev *netdev, unsigned int handle,
3754 unsigned int parent, struct htb_class *options,
3755 struct netdev_queue_stats *stats)
3757 struct ofpbuf *reply;
3760 error = tc_query_class(netdev, handle, parent, &reply);
3762 error = htb_parse_tcmsg__(reply, NULL, options, stats);
3763 ofpbuf_delete(reply);
/* tc_ops 'tc_install' hook: creates the root HTB qdisc and its default
 * class 1:0xfffe, then records the state in memory. */
3769 htb_tc_install(struct netdev *netdev, const struct smap *details)
3773 error = htb_setup_qdisc__(netdev);
3775 struct htb_class hc;
3777 htb_parse_qdisc_details__(netdev, details, &hc);
3778 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3779 tc_make_handle(1, 0), &hc);
3781 htb_install__(netdev, hc.max_rate);
/* Returns the 'struct htb_class' embedding 'queue'. */
3787 static struct htb_class *
3788 htb_class_cast__(const struct tc_queue *queue)
3790 return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Inserts or updates the in-memory record for 'queue_id' with the
 * parameters in 'hc', allocating a new entry in the tc queue hmap when
 * the queue is seen for the first time. */
3794 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
3795 const struct htb_class *hc)
3797 struct htb *htb = htb_get__(netdev);
3798 size_t hash = hash_int(queue_id, 0);
3799 struct tc_queue *queue;
3800 struct htb_class *hcp;
3802 queue = tc_find_queue__(netdev, queue_id, hash);
3804 hcp = htb_class_cast__(queue);
3806 hcp = xmalloc(sizeof *hcp);
3807 queue = &hcp->tc_queue;
3808 queue->queue_id = queue_id;
3809 queue->created = time_msec();
3810 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
3813 hcp->min_rate = hc->min_rate;
3814 hcp->max_rate = hc->max_rate;
3815 hcp->burst = hc->burst;
3816 hcp->priority = hc->priority;
/* tc_ops 'tc_load' hook: queries the default class for the qdisc-level
 * max-rate, then dumps every class to rebuild the per-queue records. */
3820 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3823 struct queue_dump_state state;
3824 struct htb_class hc;
3826 /* Get qdisc options. */
3828 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3829 htb_install__(netdev, hc.max_rate);
3832 if (!start_queue_dump(netdev, &state)) {
3835 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
3836 unsigned int queue_id;
3838 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3839 htb_update_queue__(netdev, queue_id, &hc);
3842 finish_queue_dump(&state);
/* tc_ops 'tc_destroy' hook: frees every queued htb_class and the htb
 * state itself. */
3848 htb_tc_destroy(struct tc *tc)
3850 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
3851 struct htb_class *hc, *next;
3853 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
3854 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* tc_ops 'qdisc_get' hook: reports max-rate (converted back to bits/s). */
3862 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
3864 const struct htb *htb = htb_get__(netdev);
3865 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
/* tc_ops 'qdisc_set' hook: reconfigures the default class 1:0xfffe and,
 * on success, updates the cached max_rate. */
3870 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
3872 struct htb_class hc;
3875 htb_parse_qdisc_details__(netdev, details, &hc);
3876 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3877 tc_make_handle(1, 0), &hc);
3879 htb_get__(netdev)->max_rate = hc.max_rate;
/* tc_ops 'class_get' hook: reports a queue's parameters in bits/s. */
3885 htb_class_get(const struct netdev *netdev OVS_UNUSED,
3886 const struct tc_queue *queue, struct smap *details)
3888 const struct htb_class *hc = htb_class_cast__(queue);
3890 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
/* max-rate is omitted when it equals min-rate (the default). */
3891 if (hc->min_rate != hc->max_rate) {
3892 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3894 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
3896 smap_add_format(details, "priority", "%u", hc->priority);
3902 htb_class_set(struct netdev *netdev, unsigned int queue_id,
3903 const struct smap *details)
3905 struct htb_class hc;
3908 error = htb_parse_class_details__(netdev, details, &hc);
3913 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3914 tc_make_handle(1, 0xfffe), &hc);
3919 htb_update_queue__(netdev, queue_id, &hc);
3924 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
3926 struct htb_class *hc = htb_class_cast__(queue);
3927 struct htb *htb = htb_get__(netdev);
3930 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3932 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
3939 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3940 struct netdev_queue_stats *stats)
3942 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3943 tc_make_handle(1, 0xfffe), NULL, stats);
3947 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3948 const struct ofpbuf *nlmsg,
3949 netdev_dump_queue_stats_cb *cb, void *aux)
3951 struct netdev_queue_stats stats;
3952 unsigned int handle, major, minor;
3955 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3960 major = tc_get_major(handle);
3961 minor = tc_get_minor(handle);
3962 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3963 (*cb)(minor - 1, &stats, aux);
3968 static const struct tc_ops tc_ops_htb = {
3969 "htb", /* linux_name */
3970 "linux-htb", /* ovs_name */
3971 HTB_N_QUEUES, /* n_queues */
3980 htb_class_get_stats,
3981 htb_class_dump_stats
3984 /* "linux-hfsc" traffic control class. */
3986 #define HFSC_N_QUEUES 0xf000
3994 struct tc_queue tc_queue;
3999 static struct hfsc *
4000 hfsc_get__(const struct netdev *netdev_)
4002 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4003 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
4006 static struct hfsc_class *
4007 hfsc_class_cast__(const struct tc_queue *queue)
4009 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
4013 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
4015 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4018 hfsc = xmalloc(sizeof *hfsc);
4019 tc_init(&hfsc->tc, &tc_ops_hfsc);
4020 hfsc->max_rate = max_rate;
4021 netdev->tc = &hfsc->tc;
4025 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
4026 const struct hfsc_class *hc)
4030 struct hfsc_class *hcp;
4031 struct tc_queue *queue;
4033 hfsc = hfsc_get__(netdev);
4034 hash = hash_int(queue_id, 0);
4036 queue = tc_find_queue__(netdev, queue_id, hash);
4038 hcp = hfsc_class_cast__(queue);
4040 hcp = xmalloc(sizeof *hcp);
4041 queue = &hcp->tc_queue;
4042 queue->queue_id = queue_id;
4043 queue->created = time_msec();
4044 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
4047 hcp->min_rate = hc->min_rate;
4048 hcp->max_rate = hc->max_rate;
4052 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
4054 const struct tc_service_curve *rsc, *fsc, *usc;
4055 static const struct nl_policy tca_hfsc_policy[] = {
4057 .type = NL_A_UNSPEC,
4059 .min_len = sizeof(struct tc_service_curve),
4062 .type = NL_A_UNSPEC,
4064 .min_len = sizeof(struct tc_service_curve),
4067 .type = NL_A_UNSPEC,
4069 .min_len = sizeof(struct tc_service_curve),
4072 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
4074 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
4075 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
4076 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
4080 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
4081 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
4082 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
4084 if (rsc->m1 != 0 || rsc->d != 0 ||
4085 fsc->m1 != 0 || fsc->d != 0 ||
4086 usc->m1 != 0 || usc->d != 0) {
4087 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4088 "Non-linear service curves are not supported.");
4092 if (rsc->m2 != fsc->m2) {
4093 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4094 "Real-time service curves are not supported ");
4098 if (rsc->m2 > usc->m2) {
4099 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4100 "Min-rate service curve is greater than "
4101 "the max-rate service curve.");
4105 class->min_rate = fsc->m2;
4106 class->max_rate = usc->m2;
4111 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
4112 struct hfsc_class *options,
4113 struct netdev_queue_stats *stats)
4116 unsigned int handle;
4117 struct nlattr *nl_options;
4119 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
4125 unsigned int major, minor;
4127 major = tc_get_major(handle);
4128 minor = tc_get_minor(handle);
4129 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
4130 *queue_id = minor - 1;
4137 error = hfsc_parse_tca_options__(nl_options, options);
4144 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
4145 unsigned int parent, struct hfsc_class *options,
4146 struct netdev_queue_stats *stats)
4149 struct ofpbuf *reply;
4151 error = tc_query_class(netdev, handle, parent, &reply);
4156 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
4157 ofpbuf_delete(reply);
4162 hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
4163 struct hfsc_class *class)
4165 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4167 const char *max_rate_s;
4169 max_rate_s = smap_get(details, "max-rate");
4170 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
4173 enum netdev_features current;
4175 netdev_linux_read_features(netdev);
4176 current = !netdev->get_features_error ? netdev->current : 0;
4177 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
4180 class->min_rate = max_rate;
4181 class->max_rate = max_rate;
4185 hfsc_parse_class_details__(struct netdev *netdev,
4186 const struct smap *details,
4187 struct hfsc_class * class)
4189 const struct hfsc *hfsc;
4190 uint32_t min_rate, max_rate;
4191 const char *min_rate_s, *max_rate_s;
4193 hfsc = hfsc_get__(netdev);
4194 min_rate_s = smap_get(details, "min-rate");
4195 max_rate_s = smap_get(details, "max-rate");
4197 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
4198 min_rate = MAX(min_rate, 1);
4199 min_rate = MIN(min_rate, hfsc->max_rate);
4201 max_rate = (max_rate_s
4202 ? strtoull(max_rate_s, NULL, 10) / 8
4204 max_rate = MAX(max_rate, min_rate);
4205 max_rate = MIN(max_rate, hfsc->max_rate);
4207 class->min_rate = min_rate;
4208 class->max_rate = max_rate;
4213 /* Create an HFSC qdisc.
4215 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
4217 hfsc_setup_qdisc__(struct netdev * netdev)
4219 struct tcmsg *tcmsg;
4220 struct ofpbuf request;
4221 struct tc_hfsc_qopt opt;
4223 tc_del_qdisc(netdev);
4225 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
4226 NLM_F_EXCL | NLM_F_CREATE, &request);
4232 tcmsg->tcm_handle = tc_make_handle(1, 0);
4233 tcmsg->tcm_parent = TC_H_ROOT;
4235 memset(&opt, 0, sizeof opt);
4238 nl_msg_put_string(&request, TCA_KIND, "hfsc");
4239 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
4241 return tc_transact(&request, NULL);
4244 /* Create an HFSC class.
4246 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
4247 * sc rate <min_rate> ul rate <max_rate>" */
4249 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
4250 unsigned int parent, struct hfsc_class *class)
4254 struct tcmsg *tcmsg;
4255 struct ofpbuf request;
4256 struct tc_service_curve min, max;
4258 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
4264 tcmsg->tcm_handle = handle;
4265 tcmsg->tcm_parent = parent;
4269 min.m2 = class->min_rate;
4273 max.m2 = class->max_rate;
4275 nl_msg_put_string(&request, TCA_KIND, "hfsc");
4276 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4277 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
4278 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
4279 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
4280 nl_msg_end_nested(&request, opt_offset);
4282 error = tc_transact(&request, NULL);
4284 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
4285 "min-rate %ubps, max-rate %ubps (%s)",
4286 netdev_get_name(netdev),
4287 tc_get_major(handle), tc_get_minor(handle),
4288 tc_get_major(parent), tc_get_minor(parent),
4289 class->min_rate, class->max_rate, ovs_strerror(error));
4296 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
4299 struct hfsc_class class;
4301 error = hfsc_setup_qdisc__(netdev);
4307 hfsc_parse_qdisc_details__(netdev, details, &class);
4308 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4309 tc_make_handle(1, 0), &class);
4315 hfsc_install__(netdev, class.max_rate);
4320 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4323 struct queue_dump_state state;
4324 struct hfsc_class hc;
4327 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
4328 hfsc_install__(netdev, hc.max_rate);
4330 if (!start_queue_dump(netdev, &state)) {
4334 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
4335 unsigned int queue_id;
4337 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
4338 hfsc_update_queue__(netdev, queue_id, &hc);
4342 finish_queue_dump(&state);
4347 hfsc_tc_destroy(struct tc *tc)
4350 struct hfsc_class *hc, *next;
4352 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
4354 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
4355 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
4364 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
4366 const struct hfsc *hfsc;
4367 hfsc = hfsc_get__(netdev);
4368 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
4373 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
4376 struct hfsc_class class;
4378 hfsc_parse_qdisc_details__(netdev, details, &class);
4379 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4380 tc_make_handle(1, 0), &class);
4383 hfsc_get__(netdev)->max_rate = class.max_rate;
4390 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
4391 const struct tc_queue *queue, struct smap *details)
4393 const struct hfsc_class *hc;
4395 hc = hfsc_class_cast__(queue);
4396 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
4397 if (hc->min_rate != hc->max_rate) {
4398 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
4404 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
4405 const struct smap *details)
4408 struct hfsc_class class;
4410 error = hfsc_parse_class_details__(netdev, details, &class);
4415 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
4416 tc_make_handle(1, 0xfffe), &class);
4421 hfsc_update_queue__(netdev, queue_id, &class);
4426 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
4430 struct hfsc_class *hc;
4432 hc = hfsc_class_cast__(queue);
4433 hfsc = hfsc_get__(netdev);
4435 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
4437 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
4444 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
4445 struct netdev_queue_stats *stats)
4447 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
4448 tc_make_handle(1, 0xfffe), NULL, stats);
4452 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
4453 const struct ofpbuf *nlmsg,
4454 netdev_dump_queue_stats_cb *cb, void *aux)
4456 struct netdev_queue_stats stats;
4457 unsigned int handle, major, minor;
4460 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
4465 major = tc_get_major(handle);
4466 minor = tc_get_minor(handle);
4467 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
4468 (*cb)(minor - 1, &stats, aux);
4473 static const struct tc_ops tc_ops_hfsc = {
4474 "hfsc", /* linux_name */
4475 "linux-hfsc", /* ovs_name */
4476 HFSC_N_QUEUES, /* n_queues */
4477 hfsc_tc_install, /* tc_install */
4478 hfsc_tc_load, /* tc_load */
4479 hfsc_tc_destroy, /* tc_destroy */
4480 hfsc_qdisc_get, /* qdisc_get */
4481 hfsc_qdisc_set, /* qdisc_set */
4482 hfsc_class_get, /* class_get */
4483 hfsc_class_set, /* class_set */
4484 hfsc_class_delete, /* class_delete */
4485 hfsc_class_get_stats, /* class_get_stats */
4486 hfsc_class_dump_stats /* class_dump_stats */
4489 /* "linux-default" traffic control class.
4491 * This class represents the default, unnamed Linux qdisc. It corresponds to
4492 * the "" (empty string) QoS type in the OVS database. */
4495 default_install__(struct netdev *netdev_)
4497 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4498 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
4500 /* Nothing but a tc class implementation is allowed to write to a tc. This
4501 * class never does that, so we can legitimately use a const tc object. */
4502 netdev->tc = CONST_CAST(struct tc *, &tc);
4506 default_tc_install(struct netdev *netdev,
4507 const struct smap *details OVS_UNUSED)
4509 default_install__(netdev);
4514 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4516 default_install__(netdev);
4520 static const struct tc_ops tc_ops_default = {
4521 NULL, /* linux_name */
4526 NULL, /* tc_destroy */
4527 NULL, /* qdisc_get */
4528 NULL, /* qdisc_set */
4529 NULL, /* class_get */
4530 NULL, /* class_set */
4531 NULL, /* class_delete */
4532 NULL, /* class_get_stats */
4533 NULL /* class_dump_stats */
4536 /* "linux-other" traffic control class.
4541 other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
4543 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4544 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
4546 /* Nothing but a tc class implementation is allowed to write to a tc. This
4547 * class never does that, so we can legitimately use a const tc object. */
4548 netdev->tc = CONST_CAST(struct tc *, &tc);
4552 static const struct tc_ops tc_ops_other = {
4553 NULL, /* linux_name */
4554 "linux-other", /* ovs_name */
4556 NULL, /* tc_install */
4558 NULL, /* tc_destroy */
4559 NULL, /* qdisc_get */
4560 NULL, /* qdisc_set */
4561 NULL, /* class_get */
4562 NULL, /* class_set */
4563 NULL, /* class_delete */
4564 NULL, /* class_get_stats */
4565 NULL /* class_dump_stats */
4568 /* Traffic control. */
4570 /* Number of kernel "tc" ticks per second. */
4571 static double ticks_per_s;
4573 /* Number of kernel "jiffies" per second. This is used for the purpose of
4574 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
4575 * one jiffy's worth of data.
4577 * There are two possibilities here:
4579 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
4580 * approximate range of 100 to 1024. That means that we really need to
4581 * make sure that the qdisc can buffer that much data.
4583 * - 'buffer_hz' is an absurdly large number. That means that the kernel
4584 * has finely granular timers and there's no need to fudge additional room
4585 * for buffers. (There's no extra effort needed to implement that: the
4586 * large 'buffer_hz' is used as a divisor, so practically any number will
4587 * come out as 0 in the division. Small integer results in the case of
4588 * really high dividends won't have any real effect anyhow.)
4590 static unsigned int buffer_hz;
/* Returns tc handle 'major':'minor'. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    return TC_H_MAKE(major << 16, minor);
}

/* Returns the major number from 'handle'. */
static unsigned int
tc_get_major(unsigned int handle)
{
    return TC_H_MAJ(handle) >> 16;
}

/* Returns the minor number from 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    return TC_H_MIN(handle);
}
4613 static struct tcmsg *
4614 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
4615 struct ofpbuf *request)
4617 struct tcmsg *tcmsg;
4621 error = get_ifindex(netdev, &ifindex);
4626 ofpbuf_init(request, 512);
4627 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
4628 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
4629 tcmsg->tcm_family = AF_UNSPEC;
4630 tcmsg->tcm_ifindex = ifindex;
4631 /* Caller should fill in tcmsg->tcm_handle. */
4632 /* Caller should fill in tcmsg->tcm_parent. */
4638 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
4640 int error = nl_transact(NETLINK_ROUTE, request, replyp);
4641 ofpbuf_uninit(request);
4645 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
4646 * policing configuration.
4648 * This function is equivalent to running the following when 'add' is true:
4649 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
4651 * This function is equivalent to running the following when 'add' is false:
4652 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
4654 * The configuration and stats may be seen with the following command:
4655 * /sbin/tc -s qdisc show dev <devname>
4657 * Returns 0 if successful, otherwise a positive errno value.
4660 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
4662 struct ofpbuf request;
4663 struct tcmsg *tcmsg;
4665 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
4666 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
4668 tcmsg = tc_make_request(netdev, type, flags, &request);
4672 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
4673 tcmsg->tcm_parent = TC_H_INGRESS;
4674 nl_msg_put_string(&request, TCA_KIND, "ingress");
4675 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
4677 error = tc_transact(&request, NULL);
4679 /* If we're deleting the qdisc, don't worry about some of the
4680 * error conditions. */
4681 if (!add && (error == ENOENT || error == EINVAL)) {
4690 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
4693 * This function is equivalent to running:
4694 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
4695 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
4698 * The configuration and stats may be seen with the following command:
4699 * /sbin/tc -s filter show dev <devname> parent ffff:
4701 * Returns 0 if successful, otherwise a positive errno value.
4704 tc_add_policer(struct netdev *netdev,
4705 uint32_t kbits_rate, uint32_t kbits_burst)
4707 struct tc_police tc_police;
4708 struct ofpbuf request;
4709 struct tcmsg *tcmsg;
4710 size_t basic_offset;
4711 size_t police_offset;
4715 memset(&tc_police, 0, sizeof tc_police);
4716 tc_police.action = TC_POLICE_SHOT;
4717 tc_police.mtu = mtu;
4718 tc_fill_rate(&tc_police.rate, ((uint64_t) kbits_rate * 1000)/8, mtu);
4720 /* The following appears wrong in two ways:
4722 * - tc_bytes_to_ticks() should take "bytes" as quantity for both of its
4723 * arguments (or at least consistently "bytes" as both or "bits" as
4724 * both), but this supplies bytes for the first argument and bits for the
4727 * - In networking a kilobit is usually 1000 bits but this uses 1024 bits.
4729 * However if you "fix" those problems then "tc filter show ..." shows
4730 * "125000b", meaning 125,000 bits, when OVS configures it for 1000 kbit ==
4731 * 1,000,000 bits, whereas this actually ends up doing the right thing from
4732 * tc's point of view. Whatever. */
4733 tc_police.burst = tc_bytes_to_ticks(
4734 tc_police.rate.rate, MIN(UINT32_MAX / 1024, kbits_burst) * 1024);
4736 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
4737 NLM_F_EXCL | NLM_F_CREATE, &request);
4741 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
4742 tcmsg->tcm_info = tc_make_handle(49,
4743 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
4745 nl_msg_put_string(&request, TCA_KIND, "basic");
4746 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4747 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
4748 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
4749 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
4750 nl_msg_end_nested(&request, police_offset);
4751 nl_msg_end_nested(&request, basic_offset);
4753 error = tc_transact(&request, NULL);
4764 /* The values in psched are not individually very meaningful, but they are
4765 * important. The tables below show some values seen in the wild.
4769 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
4770 * (Before that, there are hints that it was 1000000000.)
4772 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
4776 * -----------------------------------
4777 * [1] 000c8000 000f4240 000f4240 00000064
4778 * [2] 000003e8 00000400 000f4240 3b9aca00
4779 * [3] 000003e8 00000400 000f4240 3b9aca00
4780 * [4] 000003e8 00000400 000f4240 00000064
4781 * [5] 000003e8 00000040 000f4240 3b9aca00
4782 * [6] 000003e8 00000040 000f4240 000000f9
4784 * a b c d ticks_per_s buffer_hz
4785 * ------- --------- ---------- ------------- ----------- -------------
4786 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
4787 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4788 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4789 * [4] 1,000 1,024 1,000,000 100 976,562 100
4790 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
4791 * [6] 1,000 64 1,000,000 249 15,625,000 249
4793 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
4794 * [2] 2.6.26-1-686-bigmem from Debian lenny
4795 * [3] 2.6.26-2-sparc64 from Debian lenny
4796 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
4797 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
4798 * [6] 2.6.34 from kernel.org on KVM
4800 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4801 static const char fn[] = "/proc/net/psched";
4802 unsigned int a, b, c, d;
4805 if (!ovsthread_once_start(&once)) {
4812 stream = fopen(fn, "r");
4814 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
4818 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
4819 VLOG_WARN("%s: read failed", fn);
4823 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
4827 VLOG_WARN("%s: invalid scheduler parameters", fn);
4831 ticks_per_s = (double) a * c / b;
4835 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
4838 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
4841 ovsthread_once_done(&once);
4844 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
4845 * rate of 'rate' bytes per second. */
4847 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
4850 return (rate * ticks) / ticks_per_s;
4853 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
4854 * rate of 'rate' bytes per second. */
4856 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
4859 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
4862 /* Returns the number of bytes that need to be reserved for qdisc buffering at
4863 * a transmission rate of 'rate' bytes per second. */
4865 tc_buffer_per_jiffy(unsigned int rate)
4868 return rate / buffer_hz;
4871 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
4872 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
4873 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
4874 * stores NULL into it if it is absent.
4876 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
4879 * Returns 0 if successful, otherwise a positive errno value. */
4881 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
4882 struct nlattr **options)
4884 static const struct nl_policy tca_policy[] = {
4885 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
4886 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
4888 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4890 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4891 tca_policy, ta, ARRAY_SIZE(ta))) {
4892 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
4897 *kind = nl_attr_get_string(ta[TCA_KIND]);
4901 *options = ta[TCA_OPTIONS];
4916 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
4917 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
4918 * into '*options', and its queue statistics into '*stats'. Any of the output
4919 * arguments may be null.
4921 * Returns 0 if successful, otherwise a positive errno value. */
4923 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
4924 struct nlattr **options, struct netdev_queue_stats *stats)
4926 static const struct nl_policy tca_policy[] = {
4927 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
4928 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
4930 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4932 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4933 tca_policy, ta, ARRAY_SIZE(ta))) {
4934 VLOG_WARN_RL(&rl, "failed to parse class message");
4939 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
4940 *handlep = tc->tcm_handle;
4944 *options = ta[TCA_OPTIONS];
4948 const struct gnet_stats_queue *gsq;
4949 struct gnet_stats_basic gsb;
4951 static const struct nl_policy stats_policy[] = {
4952 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
4953 .min_len = sizeof gsb },
4954 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
4955 .min_len = sizeof *gsq },
4957 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
4959 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
4960 sa, ARRAY_SIZE(sa))) {
4961 VLOG_WARN_RL(&rl, "failed to parse class stats");
4965 /* Alignment issues screw up the length of struct gnet_stats_basic on
4966 * some arch/bitsize combinations. Newer versions of Linux have a
4967 * struct gnet_stats_basic_packed, but we can't depend on that. The
4968 * easiest thing to do is just to make a copy. */
4969 memset(&gsb, 0, sizeof gsb);
4970 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
4971 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
4972 stats->tx_bytes = gsb.bytes;
4973 stats->tx_packets = gsb.packets;
4975 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
4976 stats->tx_errors = gsq->drops;
4986 memset(stats, 0, sizeof *stats);
4991 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
4994 tc_query_class(const struct netdev *netdev,
4995 unsigned int handle, unsigned int parent,
4996 struct ofpbuf **replyp)
4998 struct ofpbuf request;
4999 struct tcmsg *tcmsg;
5002 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
5006 tcmsg->tcm_handle = handle;
5007 tcmsg->tcm_parent = parent;
5009 error = tc_transact(&request, replyp);
5011 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
5012 netdev_get_name(netdev),
5013 tc_get_major(handle), tc_get_minor(handle),
5014 tc_get_major(parent), tc_get_minor(parent),
5015 ovs_strerror(error));
5020 /* Equivalent to "tc class del dev <name> handle <handle>". */
5022 tc_delete_class(const struct netdev *netdev, unsigned int handle)
5024 struct ofpbuf request;
5025 struct tcmsg *tcmsg;
5028 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
5032 tcmsg->tcm_handle = handle;
5033 tcmsg->tcm_parent = 0;
5035 error = tc_transact(&request, NULL);
5037 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
5038 netdev_get_name(netdev),
5039 tc_get_major(handle), tc_get_minor(handle),
5040 ovs_strerror(error));
5045 /* Equivalent to "tc qdisc del dev <name> root". */
5047 tc_del_qdisc(struct netdev *netdev_)
5049 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5050 struct ofpbuf request;
5051 struct tcmsg *tcmsg;
5054 tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
5058 tcmsg->tcm_handle = tc_make_handle(1, 0);
5059 tcmsg->tcm_parent = TC_H_ROOT;
5061 error = tc_transact(&request, NULL);
5062 if (error == EINVAL) {
5063 /* EINVAL probably means that the default qdisc was in use, in which
5064 * case we've accomplished our purpose. */
5067 if (!error && netdev->tc) {
5068 if (netdev->tc->ops->tc_destroy) {
5069 netdev->tc->ops->tc_destroy(netdev->tc);
5077 getqdisc_is_safe(void)
5079 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5080 static bool safe = false;
5082 if (ovsthread_once_start(&once)) {
5083 struct utsname utsname;
5086 if (uname(&utsname) == -1) {
5087 VLOG_WARN("uname failed (%s)", ovs_strerror(errno));
5088 } else if (!ovs_scan(utsname.release, "%d.%d", &major, &minor)) {
5089 VLOG_WARN("uname reported bad OS release (%s)", utsname.release);
5090 } else if (major < 2 || (major == 2 && minor < 35)) {
5091 VLOG_INFO("disabling unsafe RTM_GETQDISC in Linux kernel %s",
5096 ovsthread_once_done(&once);
5101 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
5102 * kernel to determine what they are. Returns 0 if successful, otherwise a
5103 * positive errno value. */
5105 tc_query_qdisc(const struct netdev *netdev_)
5107 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5108 struct ofpbuf request, *qdisc;
5109 const struct tc_ops *ops;
5110 struct tcmsg *tcmsg;
5118 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
5119 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
5120 * 2.6.35 without that fix backported to it.
5122 * To avoid the OOPS, we must not make a request that would attempt to dump
5123 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
5124 * few others. There are a few ways that I can see to do this, but most of
5125 * them seem to be racy (and if you lose the race the kernel OOPSes). The
5126 * technique chosen here is to assume that any non-default qdisc that we
5127 * create will have a class with handle 1:0. The built-in qdiscs only have
5128 * a class with handle 0:0.
5130 * On Linux 2.6.35+ we use the straightforward method because it allows us
5131 * to handle non-builtin qdiscs without handle 1:0 (e.g. codel). However,
5132 * in such a case we get no response at all from the kernel (!) if a
5133 * builtin qdisc is in use (which is later caught by "!error &&
5134 * !qdisc->size"). */
5135 tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
5139 tcmsg->tcm_handle = tc_make_handle(getqdisc_is_safe() ? 0 : 1, 0);
5140 tcmsg->tcm_parent = getqdisc_is_safe() ? TC_H_ROOT : 0;
5142 /* Figure out what tc class to instantiate. */
5143 error = tc_transact(&request, &qdisc);
5144 if (!error && qdisc->size) {
5147 error = tc_parse_qdisc(qdisc, &kind, NULL);
5149 ops = &tc_ops_other;
5151 ops = tc_lookup_linux_name(kind);
5153 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
5154 VLOG_DBG_RL(&rl2, "unknown qdisc \"%s\"", kind);
5156 ops = &tc_ops_other;
5159 } else if ((!error && !qdisc->size) || error == ENOENT) {
5160 /* Either it's a built-in qdisc, or (on Linux pre-2.6.35) it's a qdisc
5161 * set up by some other entity that doesn't have a handle 1:0. We will
5162 * assume that it's the system default qdisc. */
5163 ops = &tc_ops_default;
5166 /* Who knows? Maybe the device got deleted. */
5167 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
5168 netdev_get_name(netdev_), ovs_strerror(error));
5169 ops = &tc_ops_other;
5172 /* Instantiate it. */
5173 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
5174 ovs_assert((load_error == 0) == (netdev->tc != NULL));
5175 ofpbuf_delete(qdisc);
5177 return error ? error : load_error;
5180 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
5181 approximate the time to transmit packets of various lengths. For an MTU of
5182 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
5183 represents two possible packet lengths; for a MTU of 513 through 1024, four
5184 possible lengths; and so on.
5186 Returns, for the specified 'mtu', the number of bits that packet lengths
5187 need to be shifted right to fit within such a 256-entry table. */
5189 tc_calc_cell_log(unsigned int mtu)
5194 mtu = ETH_PAYLOAD_MAX;
5196 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
5198 for (cell_log = 0; mtu >= 256; cell_log++) {
5205 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
5208 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
5210 memset(rate, 0, sizeof *rate);
5211 rate->cell_log = tc_calc_cell_log(mtu);
5212 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
5213 /* rate->cell_align = 0; */ /* distro headers. */
5214 rate->mpu = ETH_TOTAL_MIN;
5218 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
5219 * attribute of the specified "type".
5221 * See tc_calc_cell_log() above for a description of "rtab"s. */
5223 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
5228 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
5229 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
5230 unsigned packet_size = (i + 1) << rate->cell_log;
5231 if (packet_size < rate->mpu) {
5232 packet_size = rate->mpu;
5234 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * 0 is fine.) */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    /* The burst must cover at least one jiffy of transmission plus one MTU,
     * or the kernel cannot sustain the configured rate. */
    unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
    return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
}
5249 /* Linux-only functions declared in netdev-linux.h */
5251 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
5252 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
5254 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
5255 const char *flag_name, bool enable)
5257 const char *netdev_name = netdev_get_name(netdev);
5258 struct ethtool_value evalue;
5262 COVERAGE_INC(netdev_get_ethtool);
5263 memset(&evalue, 0, sizeof evalue);
5264 error = netdev_linux_do_ethtool(netdev_name,
5265 (struct ethtool_cmd *)&evalue,
5266 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5271 COVERAGE_INC(netdev_set_ethtool);
5272 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
5273 error = netdev_linux_do_ethtool(netdev_name,
5274 (struct ethtool_cmd *)&evalue,
5275 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
5280 COVERAGE_INC(netdev_get_ethtool);
5281 memset(&evalue, 0, sizeof evalue);
5282 error = netdev_linux_do_ethtool(netdev_name,
5283 (struct ethtool_cmd *)&evalue,
5284 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5289 if (new_flags != evalue.data) {
5290 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
5291 "device %s failed", enable ? "enable" : "disable",
5292 flag_name, netdev_name);
5299 /* Utility functions. */
5301 /* Copies 'src' into 'dst', performing format conversion in the process. */
5303 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
5304 const struct rtnl_link_stats *src)
5306 dst->rx_packets = src->rx_packets;
5307 dst->tx_packets = src->tx_packets;
5308 dst->rx_bytes = src->rx_bytes;
5309 dst->tx_bytes = src->tx_bytes;
5310 dst->rx_errors = src->rx_errors;
5311 dst->tx_errors = src->tx_errors;
5312 dst->rx_dropped = src->rx_dropped;
5313 dst->tx_dropped = src->tx_dropped;
5314 dst->multicast = src->multicast;
5315 dst->collisions = src->collisions;
5316 dst->rx_length_errors = src->rx_length_errors;
5317 dst->rx_over_errors = src->rx_over_errors;
5318 dst->rx_crc_errors = src->rx_crc_errors;
5319 dst->rx_frame_errors = src->rx_frame_errors;
5320 dst->rx_fifo_errors = src->rx_fifo_errors;
5321 dst->rx_missed_errors = src->rx_missed_errors;
5322 dst->tx_aborted_errors = src->tx_aborted_errors;
5323 dst->tx_carrier_errors = src->tx_carrier_errors;
5324 dst->tx_fifo_errors = src->tx_fifo_errors;
5325 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
5326 dst->tx_window_errors = src->tx_window_errors;
5329 /* Copies 'src' into 'dst', performing format conversion in the process. */
5331 netdev_stats_from_rtnl_link_stats64(struct netdev_stats *dst,
5332 const struct rtnl_link_stats64 *src)
5334 dst->rx_packets = src->rx_packets;
5335 dst->tx_packets = src->tx_packets;
5336 dst->rx_bytes = src->rx_bytes;
5337 dst->tx_bytes = src->tx_bytes;
5338 dst->rx_errors = src->rx_errors;
5339 dst->tx_errors = src->tx_errors;
5340 dst->rx_dropped = src->rx_dropped;
5341 dst->tx_dropped = src->tx_dropped;
5342 dst->multicast = src->multicast;
5343 dst->collisions = src->collisions;
5344 dst->rx_length_errors = src->rx_length_errors;
5345 dst->rx_over_errors = src->rx_over_errors;
5346 dst->rx_crc_errors = src->rx_crc_errors;
5347 dst->rx_frame_errors = src->rx_frame_errors;
5348 dst->rx_fifo_errors = src->rx_fifo_errors;
5349 dst->rx_missed_errors = src->rx_missed_errors;
5350 dst->tx_aborted_errors = src->tx_aborted_errors;
5351 dst->tx_carrier_errors = src->tx_carrier_errors;
5352 dst->tx_fifo_errors = src->tx_fifo_errors;
5353 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
5354 dst->tx_window_errors = src->tx_window_errors;
5358 get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
5360 struct ofpbuf request;
5361 struct ofpbuf *reply;
5364 ofpbuf_init(&request, 0);
5365 nl_msg_put_nlmsghdr(&request,
5366 sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
5367 RTM_GETLINK, NLM_F_REQUEST);
5368 ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
5369 nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
5370 error = nl_transact(NETLINK_ROUTE, &request, &reply);
5371 ofpbuf_uninit(&request);
5376 if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
5377 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS64);
5378 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats64)) {
5379 netdev_stats_from_rtnl_link_stats64(stats, nl_attr_get(a));
5382 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS);
5383 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
5384 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
5387 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
5392 VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");
5397 ofpbuf_delete(reply);
5402 get_flags(const struct netdev *dev, unsigned int *flags)
5408 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
5410 *flags = ifr.ifr_flags;
5416 set_flags(const char *name, unsigned int flags)
5420 ifr.ifr_flags = flags;
5421 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
5425 do_get_ifindex(const char *netdev_name)
5430 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5431 COVERAGE_INC(netdev_get_ifindex);
5433 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
5435 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
5436 netdev_name, ovs_strerror(error));
5439 return ifr.ifr_ifindex;
5443 get_ifindex(const struct netdev *netdev_, int *ifindexp)
5445 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5447 if (!(netdev->cache_valid & VALID_IFINDEX)) {
5448 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
5451 netdev->get_ifindex_error = -ifindex;
5452 netdev->ifindex = 0;
5454 netdev->get_ifindex_error = 0;
5455 netdev->ifindex = ifindex;
5457 netdev->cache_valid |= VALID_IFINDEX;
5460 *ifindexp = netdev->ifindex;
5461 return netdev->get_ifindex_error;
5465 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
5471 memset(&ifr, 0, sizeof ifr);
5472 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5473 COVERAGE_INC(netdev_get_hwaddr);
5474 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
5476 /* ENODEV probably means that a vif disappeared asynchronously and
5477 * hasn't been removed from the database yet, so reduce the log level
5478 * to INFO for that case. */
5479 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
5480 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
5481 netdev_name, ovs_strerror(error));
5484 hwaddr_family = ifr.ifr_hwaddr.sa_family;
5485 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
5486 VLOG_WARN("%s device has unknown hardware address family %d",
5487 netdev_name, hwaddr_family);
5489 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
5494 set_etheraddr(const char *netdev_name,
5495 const uint8_t mac[ETH_ADDR_LEN])
5500 memset(&ifr, 0, sizeof ifr);
5501 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5502 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
5503 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
5504 COVERAGE_INC(netdev_set_hwaddr);
5505 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
5507 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
5508 netdev_name, ovs_strerror(error));
5514 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
5515 int cmd, const char *cmd_name)
5520 memset(&ifr, 0, sizeof ifr);
5521 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
5522 ifr.ifr_data = (caddr_t) ecmd;
5525 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
5527 if (error != EOPNOTSUPP) {
5528 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
5529 "failed: %s", cmd_name, name, ovs_strerror(error));
5531 /* The device doesn't support this operation. That's pretty
5532 * common, so there's no point in logging anything. */
5539 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
5540 int cmd, const char *cmd_name)
5545 ifr.ifr_addr.sa_family = AF_INET;
5546 error = af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
5548 const struct sockaddr_in *sin = ALIGNED_CAST(struct sockaddr_in *,
5550 *ip = sin->sin_addr;
5555 /* Returns an AF_PACKET raw socket or a negative errno value. */
5557 af_packet_sock(void)
5559 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5562 if (ovsthread_once_start(&once)) {
5563 sock = socket(AF_PACKET, SOCK_RAW, 0);
5565 int error = set_nonblocking(sock);
5572 VLOG_ERR("failed to create packet socket: %s",
5573 ovs_strerror(errno));
5575 ovsthread_once_done(&once);