/*
 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2015 Nicira, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
19 #include "netdev-linux.h"
23 #include <arpa/inet.h>
25 #include <linux/filter.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <sys/types.h>
37 #include <sys/ioctl.h>
38 #include <sys/socket.h>
39 #include <sys/utsname.h>
40 #include <netpacket/packet.h>
42 #include <net/if_arp.h>
43 #include <net/if_packet.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
52 #include "dp-packet.h"
53 #include "dpif-netlink.h"
54 #include "dpif-netdev.h"
55 #include "dynamic-string.h"
56 #include "fatal-signal.h"
59 #include "netdev-provider.h"
60 #include "netdev-vport.h"
61 #include "netlink-notifier.h"
62 #include "netlink-socket.h"
65 #include "openflow/openflow.h"
66 #include "ovs-atomic.h"
68 #include "poll-loop.h"
69 #include "rtnetlink.h"
71 #include "socket-util.h"
74 #include "unaligned.h"
75 #include "openvswitch/vlog.h"
77 VLOG_DEFINE_THIS_MODULE(netdev_linux);
79 COVERAGE_DEFINE(netdev_set_policing);
80 COVERAGE_DEFINE(netdev_arp_lookup);
81 COVERAGE_DEFINE(netdev_get_ifindex);
82 COVERAGE_DEFINE(netdev_get_hwaddr);
83 COVERAGE_DEFINE(netdev_set_hwaddr);
84 COVERAGE_DEFINE(netdev_get_ethtool);
85 COVERAGE_DEFINE(netdev_set_ethtool);
/* These were introduced in Linux 2.6.14, so they might be missing if we have
 * old headers. */
#ifndef ADVERTISED_Pause
#define ADVERTISED_Pause (1 << 13)
#endif
#ifndef ADVERTISED_Asym_Pause
#define ADVERTISED_Asym_Pause (1 << 14)
#endif

/* These were introduced in Linux 2.6.24, so they might be missing if we
 * have old headers. */
#ifndef ETHTOOL_GFLAGS
#define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
#endif
#ifndef ETHTOOL_SFLAGS
#define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
#endif
/* This was introduced in Linux 2.6.25, so it might be missing if we have old
 * headers. */
#ifndef TC_RTAB_SIZE
#define TC_RTAB_SIZE 1024
#endif
112 /* Linux 2.6.21 introduced struct tpacket_auxdata.
113 * Linux 2.6.27 added the tp_vlan_tci member.
114 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
115 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
116 * TP_STATUS_VLAN_TPID_VALID.
118 * With all this churn it's easiest to unconditionally define a replacement
119 * structure that has everything we want.
#ifndef PACKET_AUXDATA
#define PACKET_AUXDATA 8
#endif
#ifndef TP_STATUS_VLAN_VALID
#define TP_STATUS_VLAN_VALID (1 << 4)
#endif
#ifndef TP_STATUS_VLAN_TPID_VALID
#define TP_STATUS_VLAN_TPID_VALID (1 << 6)
#endif
#undef tpacket_auxdata
#define tpacket_auxdata rpl_tpacket_auxdata
/* Replacement for the kernel's struct tpacket_auxdata that unconditionally
 * carries the VLAN TCI and TPID members (see the history comment above).
 * NOTE(review): the leading members (tp_status..tp_net) are reconstructed to
 * match the kernel's layout; the extraction only showed the VLAN fields —
 * confirm against <linux/if_packet.h>. */
struct tpacket_auxdata {
    uint32_t tp_status;         /* TP_STATUS_* flags, tested below. */
    uint32_t tp_len;
    uint32_t tp_snaplen;
    uint16_t tp_mac;
    uint16_t tp_net;
    uint16_t tp_vlan_tci;       /* Host-order TCI; read in recv path. */
    uint16_t tp_vlan_tpid;      /* Host-order TPID (Linux 3.13+). */
};
/* Linux 2.6.35 introduced IFLA_STATS64 and rtnl_link_stats64.
 *
 * Tests for rtnl_link_stats64 don't seem to consistently work, e.g. on
 * 2.6.32-431.29.2.el6.x86_64 (see report at
 * http://openvswitch.org/pipermail/dev/2014-October/047978.html).  Maybe
 * if_link.h is not self-contained on those kernels.  It is easiest to
 * unconditionally define a replacement. */
#ifndef IFLA_STATS64
#define IFLA_STATS64 23
#endif
#define rtnl_link_stats64 rpl_rtnl_link_stats64
/* NOTE(review): the leading counter members (rx_packets..collisions) are
 * reconstructed to match the kernel's struct layout; the extraction only
 * showed the error/compressed counters — confirm against <linux/if_link.h>. */
struct rtnl_link_stats64 {
    uint64_t rx_packets;
    uint64_t tx_packets;
    uint64_t rx_bytes;
    uint64_t tx_bytes;
    uint64_t rx_errors;
    uint64_t tx_errors;
    uint64_t rx_dropped;
    uint64_t tx_dropped;
    uint64_t multicast;
    uint64_t collisions;

    uint64_t rx_length_errors;
    uint64_t rx_over_errors;
    uint64_t rx_crc_errors;
    uint64_t rx_frame_errors;
    uint64_t rx_fifo_errors;
    uint64_t rx_missed_errors;

    uint64_t tx_aborted_errors;
    uint64_t tx_carrier_errors;
    uint64_t tx_fifo_errors;
    uint64_t tx_heartbeat_errors;
    uint64_t tx_window_errors;

    uint64_t rx_compressed;
    uint64_t tx_compressed;
};
/* Bits of netdev_linux's 'cache_valid' member: which on-demand fields are
 * currently valid.  NOTE(review): bits 2-4 (IN4/IN6/MTU) fell in an
 * extraction gap and are reconstructed from their uses elsewhere in this
 * file — confirm against upstream. */
enum {
    VALID_IFINDEX           = 1 << 0,
    VALID_ETHERADDR         = 1 << 1,
    VALID_IN4               = 1 << 2,
    VALID_IN6               = 1 << 3,
    VALID_MTU               = 1 << 4,
    VALID_POLICING          = 1 << 5,
    VALID_VPORT_STAT_ERROR  = 1 << 6,
    VALID_DRVINFO           = 1 << 7,
    VALID_FEATURES          = 1 << 8,
};
194 /* Traffic control. */
196 /* An instance of a traffic control class. Always associated with a particular
199 * Each TC implementation subclasses this with whatever additional data it
202 const struct tc_ops *ops;
203 struct hmap queues; /* Contains "struct tc_queue"s.
204 * Read by generic TC layer.
205 * Written only by TC implementation. */
208 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
210 /* One traffic control queue.
212 * Each TC implementation subclasses this with whatever additional data it
215 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
216 unsigned int queue_id; /* OpenFlow queue ID. */
217 long long int created; /* Time queue was created, in msecs. */
220 /* A particular kind of traffic control. Each implementation generally maps to
221 * one particular Linux qdisc class.
223 * The functions below return 0 if successful or a positive errno value on
224 * failure, except where otherwise noted. All of them must be provided, except
225 * where otherwise noted. */
227 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
228 * This is null for tc_ops_default and tc_ops_other, for which there are no
229 * appropriate values. */
230 const char *linux_name;
232 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
233 const char *ovs_name;
235 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
236 * queues. The queues are numbered 0 through n_queues - 1. */
237 unsigned int n_queues;
239 /* Called to install this TC class on 'netdev'. The implementation should
240 * make the Netlink calls required to set up 'netdev' with the right qdisc
241 * and configure it according to 'details'. The implementation may assume
242 * that the current qdisc is the default; that is, there is no need for it
243 * to delete the current qdisc before installing itself.
245 * The contents of 'details' should be documented as valid for 'ovs_name'
246 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
247 * (which is built as ovs-vswitchd.conf.db(8)).
249 * This function must return 0 if and only if it sets 'netdev->tc' to an
250 * initialized 'struct tc'.
252 * (This function is null for tc_ops_other, which cannot be installed. For
253 * other TC classes it should always be nonnull.) */
254 int (*tc_install)(struct netdev *netdev, const struct smap *details);
256 /* Called when the netdev code determines (through a Netlink query) that
257 * this TC class's qdisc is installed on 'netdev', but we didn't install
258 * it ourselves and so don't know any of the details.
260 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
261 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
262 * implementation should parse the other attributes of 'nlmsg' as
263 * necessary to determine its configuration. If necessary it should also
264 * use Netlink queries to determine the configuration of queues on
267 * This function must return 0 if and only if it sets 'netdev->tc' to an
268 * initialized 'struct tc'. */
269 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
271 /* Destroys the data structures allocated by the implementation as part of
272 * 'tc'. (This includes destroying 'tc->queues' by calling
275 * The implementation should not need to perform any Netlink calls. If
276 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
277 * (But it may not be desirable.)
279 * This function may be null if 'tc' is trivial. */
280 void (*tc_destroy)(struct tc *tc);
282 /* Retrieves details of 'netdev->tc' configuration into 'details'.
284 * The implementation should not need to perform any Netlink calls, because
285 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
286 * cached the configuration.
288 * The contents of 'details' should be documented as valid for 'ovs_name'
289 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
290 * (which is built as ovs-vswitchd.conf.db(8)).
292 * This function may be null if 'tc' is not configurable.
294 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
296 /* Reconfigures 'netdev->tc' according to 'details', performing any
297 * required Netlink calls to complete the reconfiguration.
299 * The contents of 'details' should be documented as valid for 'ovs_name'
300 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
301 * (which is built as ovs-vswitchd.conf.db(8)).
303 * This function may be null if 'tc' is not configurable.
305 int (*qdisc_set)(struct netdev *, const struct smap *details);
307 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
308 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
310 * The contents of 'details' should be documented as valid for 'ovs_name'
311 * in the "other_config" column in the "Queue" table in
312 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
314 * The implementation should not need to perform any Netlink calls, because
315 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
316 * cached the queue configuration.
318 * This function may be null if 'tc' does not have queues ('n_queues' is
320 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
321 struct smap *details);
323 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
324 * 'details', perfoming any required Netlink calls to complete the
325 * reconfiguration. The caller ensures that 'queue_id' is less than
328 * The contents of 'details' should be documented as valid for 'ovs_name'
329 * in the "other_config" column in the "Queue" table in
330 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
332 * This function may be null if 'tc' does not have queues or its queues are
333 * not configurable. */
334 int (*class_set)(struct netdev *, unsigned int queue_id,
335 const struct smap *details);
337 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
338 * tc_queue's within 'netdev->tc->queues'.
340 * This function may be null if 'tc' does not have queues or its queues
341 * cannot be deleted. */
342 int (*class_delete)(struct netdev *, struct tc_queue *queue);
344 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
345 * 'struct tc_queue's within 'netdev->tc->queues'.
347 * On success, initializes '*stats'.
349 * This function may be null if 'tc' does not have queues or if it cannot
350 * report queue statistics. */
351 int (*class_get_stats)(const struct netdev *netdev,
352 const struct tc_queue *queue,
353 struct netdev_queue_stats *stats);
355 /* Extracts queue stats from 'nlmsg', which is a response to a
356 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
358 * This function may be null if 'tc' does not have queues or if it cannot
359 * report queue statistics. */
360 int (*class_dump_stats)(const struct netdev *netdev,
361 const struct ofpbuf *nlmsg,
362 netdev_dump_queue_stats_cb *cb, void *aux);
366 tc_init(struct tc *tc, const struct tc_ops *ops)
369 hmap_init(&tc->queues);
373 tc_destroy(struct tc *tc)
375 hmap_destroy(&tc->queues);
378 static const struct tc_ops tc_ops_htb;
379 static const struct tc_ops tc_ops_hfsc;
380 static const struct tc_ops tc_ops_codel;
381 static const struct tc_ops tc_ops_fqcodel;
382 static const struct tc_ops tc_ops_sfq;
383 static const struct tc_ops tc_ops_default;
384 static const struct tc_ops tc_ops_other;
386 static const struct tc_ops *const tcs[] = {
387 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
388 &tc_ops_hfsc, /* Hierarchical fair service curve. */
389 &tc_ops_codel, /* Controlled delay */
390 &tc_ops_fqcodel, /* Fair queue controlled delay */
391 &tc_ops_sfq, /* Stochastic fair queueing */
392 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
393 &tc_ops_other, /* Some other qdisc. */
397 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
398 static unsigned int tc_get_major(unsigned int handle);
399 static unsigned int tc_get_minor(unsigned int handle);
401 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
402 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
403 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
405 static struct tcmsg *tc_make_request(const struct netdev *, int type,
406 unsigned int flags, struct ofpbuf *);
407 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
408 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
409 static int tc_add_policer(struct netdev *,
410 uint32_t kbits_rate, uint32_t kbits_burst);
412 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
413 struct nlattr **options);
414 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
415 struct nlattr **options,
416 struct netdev_queue_stats *);
417 static int tc_query_class(const struct netdev *,
418 unsigned int handle, unsigned int parent,
419 struct ofpbuf **replyp);
420 static int tc_delete_class(const struct netdev *, unsigned int handle);
422 static int tc_del_qdisc(struct netdev *netdev);
423 static int tc_query_qdisc(const struct netdev *netdev);
425 static int tc_calc_cell_log(unsigned int mtu);
426 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
427 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
428 const struct tc_ratespec *rate);
429 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
431 struct netdev_linux {
434 /* Protects all members below. */
435 struct ovs_mutex mutex;
437 unsigned int cache_valid;
439 bool miimon; /* Link status of last poll. */
440 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
441 struct timer miimon_timer;
443 /* The following are figured out "on demand" only. They are only valid
444 * when the corresponding VALID_* bit in 'cache_valid' is set. */
446 struct eth_addr etheraddr;
447 struct in_addr address, netmask;
450 unsigned int ifi_flags;
451 long long int carrier_resets;
452 uint32_t kbits_rate; /* Policing data. */
453 uint32_t kbits_burst;
454 int vport_stats_error; /* Cached error code from vport_get_stats().
455 0 or an errno value. */
456 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
457 int ether_addr_error; /* Cached error code from set/get etheraddr. */
458 int netdev_policing_error; /* Cached error code from set policing. */
459 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
460 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
461 int in4_error; /* Cached error code from reading in4 addr. */
462 int in6_error; /* Cached error code from reading in6 addr. */
464 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
465 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
466 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
468 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
471 /* For devices of class netdev_tap_class only. */
475 struct netdev_rxq_linux {
476 struct netdev_rxq up;
481 /* This is set pretty low because we probably won't learn anything from the
482 * additional log messages. */
483 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
485 /* Polling miimon status for all ports causes performance degradation when
486 * handling a large number of ports. If there are no devices using miimon, then
487 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
489 * Readers do not depend on this variable synchronizing with the related
490 * changes in the device miimon status, so we can use atomic_count. */
491 static atomic_count miimon_cnt = ATOMIC_COUNT_INIT(0);
493 static void netdev_linux_run(void);
495 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
496 int cmd, const char *cmd_name);
497 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
498 int cmd, const char *cmd_name);
499 static int get_flags(const struct netdev *, unsigned int *flags);
500 static int set_flags(const char *, unsigned int flags);
501 static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
502 enum netdev_flags on, enum netdev_flags *old_flagsp)
503 OVS_REQUIRES(netdev->mutex);
504 static int do_get_ifindex(const char *netdev_name);
505 static int get_ifindex(const struct netdev *, int *ifindexp);
506 static int do_set_addr(struct netdev *netdev,
507 int ioctl_nr, const char *ioctl_name,
508 struct in_addr addr);
509 static int get_etheraddr(const char *netdev_name, struct eth_addr *ea);
510 static int set_etheraddr(const char *netdev_name, const struct eth_addr);
511 static int get_stats_via_netlink(const struct netdev *, struct netdev_stats *);
512 static int af_packet_sock(void);
513 static bool netdev_linux_miimon_enabled(void);
514 static void netdev_linux_miimon_run(void);
515 static void netdev_linux_miimon_wait(void);
516 static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup);
519 is_netdev_linux_class(const struct netdev_class *netdev_class)
521 return netdev_class->run == netdev_linux_run;
525 is_tap_netdev(const struct netdev *netdev)
527 return netdev_get_class(netdev) == &netdev_tap_class;
530 static struct netdev_linux *
531 netdev_linux_cast(const struct netdev *netdev)
533 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
535 return CONTAINER_OF(netdev, struct netdev_linux, up);
538 static struct netdev_rxq_linux *
539 netdev_rxq_linux_cast(const struct netdev_rxq *rx)
541 ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
542 return CONTAINER_OF(rx, struct netdev_rxq_linux, up);
545 static void netdev_linux_update(struct netdev_linux *netdev,
546 const struct rtnetlink_change *)
547 OVS_REQUIRES(netdev->mutex);
548 static void netdev_linux_changed(struct netdev_linux *netdev,
549 unsigned int ifi_flags, unsigned int mask)
550 OVS_REQUIRES(netdev->mutex);
552 /* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK,
553 * RTNLGRP_IPV4_IFADDR and RTNLGRP_IPV6_IFADDR changes, or NULL
554 * if no such socket could be created. */
555 static struct nl_sock *
556 netdev_linux_notify_sock(void)
558 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
559 static struct nl_sock *sock;
560 unsigned int mcgroups[3] = {RTNLGRP_LINK, RTNLGRP_IPV4_IFADDR,
561 RTNLGRP_IPV6_IFADDR};
563 if (ovsthread_once_start(&once)) {
566 error = nl_sock_create(NETLINK_ROUTE, &sock);
570 for (i = 0; i < ARRAY_SIZE(mcgroups); i++) {
571 error = nl_sock_join_mcgroup(sock, mcgroups[i]);
573 nl_sock_destroy(sock);
579 ovsthread_once_done(&once);
586 netdev_linux_miimon_enabled(void)
588 return atomic_count_get(&miimon_cnt) > 0;
592 netdev_linux_run(void)
594 struct nl_sock *sock;
597 if (netdev_linux_miimon_enabled()) {
598 netdev_linux_miimon_run();
601 sock = netdev_linux_notify_sock();
607 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
608 uint64_t buf_stub[4096 / 8];
611 ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
612 error = nl_sock_recv(sock, &buf, false);
614 struct rtnetlink_change change;
616 if (rtnetlink_parse(&buf, &change)) {
617 struct netdev *netdev_ = netdev_from_name(change.ifname);
618 if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
619 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
621 ovs_mutex_lock(&netdev->mutex);
622 netdev_linux_update(netdev, &change);
623 ovs_mutex_unlock(&netdev->mutex);
625 netdev_close(netdev_);
627 } else if (error == ENOBUFS) {
628 struct shash device_shash;
629 struct shash_node *node;
633 shash_init(&device_shash);
634 netdev_get_devices(&netdev_linux_class, &device_shash);
635 SHASH_FOR_EACH (node, &device_shash) {
636 struct netdev *netdev_ = node->data;
637 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
640 ovs_mutex_lock(&netdev->mutex);
641 get_flags(netdev_, &flags);
642 netdev_linux_changed(netdev, flags, 0);
643 ovs_mutex_unlock(&netdev->mutex);
645 netdev_close(netdev_);
647 shash_destroy(&device_shash);
648 } else if (error != EAGAIN) {
649 VLOG_WARN_RL(&rl, "error reading or parsing netlink (%s)",
650 ovs_strerror(error));
/* Registers wakeups for the next netdev_linux_run(): miimon timer (if any
 * device uses miimon) and readability of the rtnetlink socket. */
static void
netdev_linux_wait(void)
{
    struct nl_sock *sock;

    if (netdev_linux_miimon_enabled()) {
        netdev_linux_miimon_wait();
    }
    sock = netdev_linux_notify_sock();
    if (sock) {
        nl_sock_wait(sock, POLLIN);
    }
}
671 netdev_linux_changed(struct netdev_linux *dev,
672 unsigned int ifi_flags, unsigned int mask)
673 OVS_REQUIRES(dev->mutex)
675 netdev_change_seq_changed(&dev->up);
677 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
678 dev->carrier_resets++;
680 dev->ifi_flags = ifi_flags;
682 dev->cache_valid &= mask;
686 netdev_linux_update(struct netdev_linux *dev,
687 const struct rtnetlink_change *change)
688 OVS_REQUIRES(dev->mutex)
690 if (rtnetlink_type_is_rtnlgrp_link(change->nlmsg_type)){
691 if (change->nlmsg_type == RTM_NEWLINK) {
692 /* Keep drv-info, in4, in6. */
693 netdev_linux_changed(dev, change->ifi_flags,
694 VALID_DRVINFO | VALID_IN4 | VALID_IN6);
696 /* Update netdev from rtnl-change msg. */
698 dev->mtu = change->mtu;
699 dev->cache_valid |= VALID_MTU;
700 dev->netdev_mtu_error = 0;
703 if (!eth_addr_is_zero(change->mac)) {
704 dev->etheraddr = change->mac;
705 dev->cache_valid |= VALID_ETHERADDR;
706 dev->ether_addr_error = 0;
709 dev->ifindex = change->if_index;
710 dev->cache_valid |= VALID_IFINDEX;
711 dev->get_ifindex_error = 0;
713 netdev_linux_changed(dev, change->ifi_flags, 0);
715 } else if (rtnetlink_type_is_rtnlgrp_addr(change->nlmsg_type)) {
716 /* Invalidates in4, in6. */
717 netdev_linux_changed(dev, dev->ifi_flags,
718 ~(VALID_IN4 | VALID_IN6));
724 static struct netdev *
725 netdev_linux_alloc(void)
727 struct netdev_linux *netdev = xzalloc(sizeof *netdev);
732 netdev_linux_common_construct(struct netdev_linux *netdev)
734 ovs_mutex_init(&netdev->mutex);
737 /* Creates system and internal devices. */
739 netdev_linux_construct(struct netdev *netdev_)
741 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
744 netdev_linux_common_construct(netdev);
746 error = get_flags(&netdev->up, &netdev->ifi_flags);
747 if (error == ENODEV) {
748 if (netdev->up.netdev_class != &netdev_internal_class) {
749 /* The device does not exist, so don't allow it to be opened. */
752 /* "Internal" netdevs have to be created as netdev objects before
753 * they exist in the kernel, because creating them in the kernel
754 * happens by passing a netdev object to dpif_port_add().
755 * Therefore, ignore the error. */
762 /* For most types of netdevs we open the device for each call of
763 * netdev_open(). However, this is not the case with tap devices,
764 * since it is only possible to open the device once. In this
765 * situation we share a single file descriptor, and consequently
766 * buffers, across all readers. Therefore once data is read it will
767 * be unavailable to other reads for tap devices. */
769 netdev_linux_construct_tap(struct netdev *netdev_)
771 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
772 static const char tap_dev[] = "/dev/net/tun";
773 const char *name = netdev_->name;
777 netdev_linux_common_construct(netdev);
779 /* Open tap device. */
780 netdev->tap_fd = open(tap_dev, O_RDWR);
781 if (netdev->tap_fd < 0) {
783 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
787 /* Create tap device. */
788 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
789 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
790 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
791 VLOG_WARN("%s: creating tap device failed: %s", name,
792 ovs_strerror(errno));
797 /* Make non-blocking. */
798 error = set_nonblocking(netdev->tap_fd);
806 close(netdev->tap_fd);
811 netdev_linux_destruct(struct netdev *netdev_)
813 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
815 if (netdev->tc && netdev->tc->ops->tc_destroy) {
816 netdev->tc->ops->tc_destroy(netdev->tc);
819 if (netdev_get_class(netdev_) == &netdev_tap_class
820 && netdev->tap_fd >= 0)
822 close(netdev->tap_fd);
825 if (netdev->miimon_interval > 0) {
826 atomic_count_dec(&miimon_cnt);
829 ovs_mutex_destroy(&netdev->mutex);
/* netdev_class 'dealloc' callback: frees the container allocated by
 * netdev_linux_alloc(). */
static void
netdev_linux_dealloc(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    free(netdev);
}
839 static struct netdev_rxq *
840 netdev_linux_rxq_alloc(void)
842 struct netdev_rxq_linux *rx = xzalloc(sizeof *rx);
847 netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
849 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
850 struct netdev *netdev_ = rx->up.netdev;
851 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
854 ovs_mutex_lock(&netdev->mutex);
855 rx->is_tap = is_tap_netdev(netdev_);
857 rx->fd = netdev->tap_fd;
859 struct sockaddr_ll sll;
861 /* Result of tcpdump -dd inbound */
862 static const struct sock_filter filt[] = {
863 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
864 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
865 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
866 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
868 static const struct sock_fprog fprog = {
869 ARRAY_SIZE(filt), (struct sock_filter *) filt
872 /* Create file descriptor. */
873 rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
876 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
881 if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) {
883 VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
884 netdev_get_name(netdev_), ovs_strerror(error));
888 /* Set non-blocking mode. */
889 error = set_nonblocking(rx->fd);
894 /* Get ethernet device index. */
895 error = get_ifindex(&netdev->up, &ifindex);
900 /* Bind to specific ethernet device. */
901 memset(&sll, 0, sizeof sll);
902 sll.sll_family = AF_PACKET;
903 sll.sll_ifindex = ifindex;
904 sll.sll_protocol = htons(ETH_P_ALL);
905 if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
907 VLOG_ERR("%s: failed to bind raw socket (%s)",
908 netdev_get_name(netdev_), ovs_strerror(error));
912 /* Filter for only inbound packets. */
913 error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
917 VLOG_ERR("%s: failed to attach filter (%s)",
918 netdev_get_name(netdev_), ovs_strerror(error));
922 ovs_mutex_unlock(&netdev->mutex);
930 ovs_mutex_unlock(&netdev->mutex);
935 netdev_linux_rxq_destruct(struct netdev_rxq *rxq_)
937 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
/* rxq 'dealloc' callback: frees the container allocated by
 * netdev_linux_rxq_alloc(). */
static void
netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_)
{
    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
    free(rx);
}
953 auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux)
955 if (aux->tp_status & TP_STATUS_VLAN_TPID_VALID) {
956 return htons(aux->tp_vlan_tpid);
958 return htons(ETH_TYPE_VLAN);
963 auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
965 return aux->tp_vlan_tci || aux->tp_status & TP_STATUS_VLAN_VALID;
969 netdev_linux_rxq_recv_sock(int fd, struct dp_packet *buffer)
974 struct cmsghdr *cmsg;
977 char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
981 /* Reserve headroom for a single VLAN tag */
982 dp_packet_reserve(buffer, VLAN_HEADER_LEN);
983 size = dp_packet_tailroom(buffer);
985 iov.iov_base = dp_packet_data(buffer);
987 msgh.msg_name = NULL;
988 msgh.msg_namelen = 0;
991 msgh.msg_control = &cmsg_buffer;
992 msgh.msg_controllen = sizeof cmsg_buffer;
996 retval = recvmsg(fd, &msgh, MSG_TRUNC);
997 } while (retval < 0 && errno == EINTR);
1001 } else if (retval > size) {
1005 dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);
1007 for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg; cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
1008 const struct tpacket_auxdata *aux;
1010 if (cmsg->cmsg_level != SOL_PACKET
1011 || cmsg->cmsg_type != PACKET_AUXDATA
1012 || cmsg->cmsg_len < CMSG_LEN(sizeof(struct tpacket_auxdata))) {
1016 aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg));
1017 if (auxdata_has_vlan_tci(aux)) {
1018 if (retval < ETH_HEADER_LEN) {
1022 eth_push_vlan(buffer, auxdata_to_vlan_tpid(aux),
1023 htons(aux->tp_vlan_tci));
/* Receives one packet from tap fd 'fd' into 'buffer'.  Returns 0 on success,
 * EMSGSIZE if the packet was larger than the available tailroom, or another
 * positive errno value. */
static int
netdev_linux_rxq_recv_tap(int fd, struct dp_packet *buffer)
{
    ssize_t retval;
    size_t size = dp_packet_tailroom(buffer);

    do {
        retval = read(fd, dp_packet_data(buffer), size);
    } while (retval < 0 && errno == EINTR);

    if (retval < 0) {
        return errno;
    } else if (retval > size) {
        return EMSGSIZE;
    }

    dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);
    return 0;
}
1052 netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet **packets,
1055 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1056 struct netdev *netdev = rx->up.netdev;
1057 struct dp_packet *buffer;
1061 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
1062 mtu = ETH_PAYLOAD_MAX;
1065 buffer = dp_packet_new_with_headroom(VLAN_ETH_HEADER_LEN + mtu,
1066 DP_NETDEV_HEADROOM);
1067 retval = (rx->is_tap
1068 ? netdev_linux_rxq_recv_tap(rx->fd, buffer)
1069 : netdev_linux_rxq_recv_sock(rx->fd, buffer));
1072 if (retval != EAGAIN && retval != EMSGSIZE) {
1073 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
1074 ovs_strerror(errno), netdev_rxq_get_name(rxq_));
1076 dp_packet_delete(buffer);
1078 dp_packet_pad(buffer);
1079 dp_packet_rss_invalidate(buffer);
1080 packets[0] = buffer;
1088 netdev_linux_rxq_wait(struct netdev_rxq *rxq_)
1090 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1091 poll_fd_wait(rx->fd, POLLIN);
1095 netdev_linux_rxq_drain(struct netdev_rxq *rxq_)
1097 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1100 int error = af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_), &ifr,
1101 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
1105 drain_fd(rx->fd, ifr.ifr_qlen);
1108 return drain_rcvbuf(rx->fd);
1112 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
1113 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
1114 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
1115 * the packet is too big or too small to transmit on the device.
 *
1117 * The caller retains ownership of 'buffer' in all cases.
 *
1119 * The kernel maintains a packet transmission queue, so the caller is not
1120 * expected to do additional queuing of packets. */
/* NOTE(review): this excerpt elides interior lines (return type, local
 * declarations such as 'i'/'error'/'iov'/'msg', and some control-flow braces);
 * the code lines below are kept byte-identical to the original. */
1122 netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED,
1123                   struct dp_packet **pkts, int cnt, bool may_steal)
1128     /* 'i' is incremented only if there's no error, so an EINTR retry
1129      * re-sends the same packet (see the EINTR branch below). */
1129     for (i = 0; i < cnt;) {
1130         const void *data = dp_packet_data(pkts[i]);
1131         size_t size = dp_packet_size(pkts[i]);
1134         if (!is_tap_netdev(netdev_)) {
1135             /* Use our AF_PACKET socket to send to this device. */
1136             struct sockaddr_ll sll;
1142             sock = af_packet_sock();
1147             ifindex = netdev_get_ifindex(netdev_);
1152             /* We don't bother setting most fields in sockaddr_ll because the
1153              * kernel ignores them for SOCK_RAW. */
1154             memset(&sll, 0, sizeof sll);
1155             sll.sll_family = AF_PACKET;
1156             sll.sll_ifindex = ifindex;
1158             iov.iov_base = CONST_CAST(void *, data);
1161             msg.msg_name = &sll;
1162             msg.msg_namelen = sizeof sll;
1165             msg.msg_control = NULL;
1166             msg.msg_controllen = 0;
1169             retval = sendmsg(sock, &msg, 0);
1171             /* Use the tap fd to send to this device. This is essential for
1172              * tap devices, because packets sent to a tap device with an
1173              * AF_PACKET socket will loop back to be *received* again on the
1174              * tap device. This doesn't occur on other interface types
1175              * because we attach a socket filter to the rx socket. */
1176             struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1178             retval = write(netdev->tap_fd, data, size);
1182             /* The Linux AF_PACKET implementation never blocks waiting for room
1183              * for packets, instead returning ENOBUFS. Translate this into
1184              * EAGAIN for the caller. */
1185             error = errno == ENOBUFS ? EAGAIN : errno;
1186             if (error == EINTR) {
1187                 /* Interrupted by a signal: retry this packet without
                  * incrementing 'i'. */
1191         } else if (retval != size) {
1192             VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE" bytes"
1193                          " of %"PRIuSIZE") on %s", retval, size,
1194                          netdev_get_name(netdev_));
1199         /* Process the next packet in the batch */
/* NOTE(review): the loop below frees all packets; presumably it only runs when
 * 'may_steal' is true -- the guarding condition is elided from this excerpt. */
1204     for (i = 0; i < cnt; i++) {
1205         dp_packet_delete(pkts[i]);
1209     if (error && error != EAGAIN) {
1210         VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
1211                      netdev_get_name(netdev_), ovs_strerror(error));
1218 /* Registers with the poll loop to wake up from the next call to poll_block()
1219 * when the packet transmission queue has sufficient room to transmit a packet
1220 * with netdev_send().
 *
1222 * The kernel maintains a packet transmission queue, so the client is not
1223 * expected to do additional queuing of packets. Thus, this function is
1224 * unlikely to ever be used. It is included for completeness. */
/* NOTE(review): the return type and the non-tap branch are elided from this
 * excerpt; only the tap case is visible. */
1226 netdev_linux_send_wait(struct netdev *netdev, int qid OVS_UNUSED)
1228     if (is_tap_netdev(netdev)) {
1229         /* A TAP device always accepts packets, so wake up immediately. */
1230         poll_immediate_wake();
1234 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1235 * otherwise a positive errno value.
 *
 * Serialized on 'netdev->mutex'.  Results (including failures) are cached
 * under the VALID_ETHERADDR flag so repeated calls with the same address
 * are no-ops. */
1237 netdev_linux_set_etheraddr(struct netdev *netdev_, const struct eth_addr mac)
1239     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1240     enum netdev_flags old_flags = 0;
1243     ovs_mutex_lock(&netdev->mutex);
1245     if (netdev->cache_valid & VALID_ETHERADDR) {
1246         error = netdev->ether_addr_error;
1247         if (error || eth_addr_equals(netdev->etheraddr, mac)) {
             /* Cached failure or already set to 'mac': nothing to do
              * (early exit elided from this excerpt). */
1250         netdev->cache_valid &= ~VALID_ETHERADDR;
1253     /* Tap devices must be brought down before setting the address. */
1254     if (is_tap_netdev(netdev_)) {
1255         update_flags(netdev, NETDEV_UP, 0, &old_flags);
1257     error = set_etheraddr(netdev_get_name(netdev_), mac);
     /* Cache the outcome; ENODEV is cached too so a vanished device is not
      * re-probed on every call. */
1258     if (!error || error == ENODEV) {
1259         netdev->ether_addr_error = error;
1260         netdev->cache_valid |= VALID_ETHERADDR;
1262         netdev->etheraddr = mac;
     /* Bring the tap device back up only if it was up before we started. */
1266     if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
1267         update_flags(netdev, 0, NETDEV_UP, &old_flags);
1271     ovs_mutex_unlock(&netdev->mutex);
1275 /* Copies 'netdev''s MAC address to 'mac' which is passed as param.
 * The address (or the error from fetching it) is cached under
 * VALID_ETHERADDR; the mutex serializes access to the cache. */
1277 netdev_linux_get_etheraddr(const struct netdev *netdev_, struct eth_addr *mac)
1279     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1282     ovs_mutex_lock(&netdev->mutex);
1283     if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1284         netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
1285                                                  &netdev->etheraddr);
1286         netdev->cache_valid |= VALID_ETHERADDR;
1289     error = netdev->ether_addr_error;
     /* NOTE(review): '*mac' is presumably only written when !error; the guard
      * is elided from this excerpt. */
1291     *mac = netdev->etheraddr;
1293     ovs_mutex_unlock(&netdev->mutex);
/* Fetches 'netdev''s MTU via SIOCGIFMTU into '*mtup', caching both the value
 * and any error under VALID_MTU.  Caller must hold 'netdev->mutex'.
 * NOTE(review): the 'struct ifreq ifr' declaration is elided here. */
1299 netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
1303     if (!(netdev->cache_valid & VALID_MTU)) {
1306         netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
1307             netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
1308         netdev->mtu = ifr.ifr_mtu;
1309         netdev->cache_valid |= VALID_MTU;
1312     error = netdev->netdev_mtu_error;
1314     *mtup = netdev->mtu;
1320 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1321 * in bytes, not including the hardware header; thus, this is typically 1500
1322 * bytes for Ethernet devices.
 *
 * Locking wrapper around netdev_linux_get_mtu__(). */
1324 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1326     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1329     ovs_mutex_lock(&netdev->mutex);
1330     error = netdev_linux_get_mtu__(netdev, mtup);
1331     ovs_mutex_unlock(&netdev->mutex);
1336 /* Sets the maximum size of transmitted (MTU) for given device using linux
1337 * networking ioctl interface (SIOCSIFMTU).  The result, including ENODEV,
 * is cached under VALID_MTU so unchanged settings are not re-applied. */
1340 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1342     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1346     ovs_mutex_lock(&netdev->mutex);
1347     if (netdev->cache_valid & VALID_MTU) {
1348         error = netdev->netdev_mtu_error;
1349         if (error || netdev->mtu == mtu) {
             /* Cached failure or MTU already set: skip the ioctl
              * (early exit elided from this excerpt). */
1352         netdev->cache_valid &= ~VALID_MTU;
     /* NOTE(review): the line assigning 'mtu' into 'ifr.ifr_mtu' before the
      * ioctl is elided from this excerpt. */
1355     error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
1356                                 SIOCSIFMTU, "SIOCSIFMTU");
1357     if (!error || error == ENODEV) {
1358         netdev->netdev_mtu_error = error;
1359         netdev->mtu = ifr.ifr_mtu;
1360         netdev->cache_valid |= VALID_MTU;
1363     ovs_mutex_unlock(&netdev->mutex);
1367 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1368 * On failure, returns a negative errno value (note: negative, unlike most
 * functions in this file which return positive errno values). */
1370 netdev_linux_get_ifindex(const struct netdev *netdev_)
1372     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1375     ovs_mutex_lock(&netdev->mutex);
1376     error = get_ifindex(netdev_, &ifindex);
1377     ovs_mutex_unlock(&netdev->mutex);
1379     return error ? -error : ifindex;
/* Stores 'netdev''s carrier (link) status in '*carrier'.  Uses the cached
 * MII miimon result when miimon polling is enabled for this device,
 * otherwise falls back to the interface's IFF_RUNNING flag. */
1383 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1385     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1387     ovs_mutex_lock(&netdev->mutex);
1388     if (netdev->miimon_interval > 0) {
1389         *carrier = netdev->miimon;
1391         *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1393     ovs_mutex_unlock(&netdev->mutex);
/* Returns the number of carrier (link-state) changes recorded for 'netdev'. */
1398 static long long int
1399 netdev_linux_get_carrier_resets(const struct netdev *netdev_)
1401     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1402     long long int carrier_resets;
1404     ovs_mutex_lock(&netdev->mutex);
1405     carrier_resets = netdev->carrier_resets;
1406     ovs_mutex_unlock(&netdev->mutex);
1408     return carrier_resets;
/* Issues MII ioctl 'cmd' ("SIOCGMIIPHY"/"SIOCGMIIREG") for device 'name',
 * copying '*data' in and out through ifr.ifr_data.  Returns 0 on success,
 * otherwise a positive errno value.
 * NOTE(review): 'data' is copied by value into the ifr_data field rather
 * than passed as a pointer -- this matches the historical OVS idiom for
 * SIOCGMIIREG; verify against the full source before changing. */
1412 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1413                        struct mii_ioctl_data *data)
1418     memset(&ifr, 0, sizeof ifr);
1419     memcpy(&ifr.ifr_data, data, sizeof *data);
1420     error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
1421     memcpy(data, &ifr.ifr_data, sizeof *data);
/* Queries link status for device 'name' into '*miimon'.  First tries the MII
 * registers (SIOCGMIIPHY then SIOCGMIIREG/BMSR); if MII is unsupported,
 * falls back to ethtool ETHTOOL_GLINK. */
1427 netdev_linux_get_miimon(const char *name, bool *miimon)
1429     struct mii_ioctl_data data;
1434     memset(&data, 0, sizeof data);
1435     error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1437         /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1438         data.reg_num = MII_BMSR;
1439         error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
             /* BMSR_LSTATUS is the link-up bit of the MII basic status reg. */
1443             *miimon = !!(data.val_out & BMSR_LSTATUS);
1445             VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1448         struct ethtool_cmd ecmd;
1450         VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1453         COVERAGE_INC(netdev_get_ethtool);
1454         memset(&ecmd, 0, sizeof ecmd);
1455         error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1458             struct ethtool_value eval;
             /* ETHTOOL_GLINK replies with an ethtool_value in the same
              * buffer, so reinterpret the ethtool_cmd bytes. */
1460             memcpy(&eval, &ecmd, sizeof eval);
1461             *miimon = !!eval.data;
1463             VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
/* Sets the MII monitoring interval for 'netdev' to 'interval' ms.  Non-zero
 * intervals are clamped to at least 100 ms; 0 disables miimon polling.
 * Maintains the global 'miimon_cnt' count of devices with miimon enabled
 * and forces the timer to fire so the change takes effect immediately. */
1471 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1472                                  long long int interval)
1474     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1476     ovs_mutex_lock(&netdev->mutex);
1477     interval = interval > 0 ? MAX(interval, 100) : 0;
1478     if (netdev->miimon_interval != interval) {
1479         if (interval && !netdev->miimon_interval) {
1480             atomic_count_inc(&miimon_cnt);
1481         } else if (!interval && netdev->miimon_interval) {
1482             atomic_count_dec(&miimon_cnt);
1485         netdev->miimon_interval = interval;
         /* Expire the timer now so the next miimon_run() polls right away. */
1486         timer_set_expired(&netdev->miimon_timer);
1488     ovs_mutex_unlock(&netdev->mutex);
/* Polls MII link status for every netdev-linux device whose miimon timer has
 * expired, recording changes via netdev_linux_changed() and re-arming each
 * device's timer.  Intended to be called from the provider's run() hook. */
1494 netdev_linux_miimon_run(void)
1496     struct shash device_shash;
1497     struct shash_node *node;
1499     shash_init(&device_shash);
1500     netdev_get_devices(&netdev_linux_class, &device_shash);
1501     SHASH_FOR_EACH (node, &device_shash) {
1502         struct netdev *netdev = node->data;
1503         struct netdev_linux *dev = netdev_linux_cast(netdev);
1506         ovs_mutex_lock(&dev->mutex);
1507         if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
1508             netdev_linux_get_miimon(dev->up.name, &miimon);
1509             if (miimon != dev->miimon) {
1510                 dev->miimon = miimon;
1511                 netdev_linux_changed(dev, dev->ifi_flags, 0);
1514             timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1516         ovs_mutex_unlock(&dev->mutex);
         /* netdev_get_devices() took a reference on each device; drop it. */
1517         netdev_close(netdev);
1520     shash_destroy(&device_shash);
/* Registers the earliest miimon timer across all netdev-linux devices with
 * the poll loop, so poll_block() wakes in time for the next MII poll. */
1524 netdev_linux_miimon_wait(void)
1526     struct shash device_shash;
1527     struct shash_node *node;
1529     shash_init(&device_shash);
1530     netdev_get_devices(&netdev_linux_class, &device_shash);
1531     SHASH_FOR_EACH (node, &device_shash) {
1532         struct netdev *netdev = node->data;
1533         struct netdev_linux *dev = netdev_linux_cast(netdev);
1535         ovs_mutex_lock(&dev->mutex);
1536         if (dev->miimon_interval > 0) {
1537             timer_wait(&dev->miimon_timer);
1539         ovs_mutex_unlock(&dev->mutex);
         /* Drop the reference taken by netdev_get_devices(). */
1540         netdev_close(netdev);
1542     shash_destroy(&device_shash);
/* Exchanges the values of '*a' and '*b'.
 * NOTE(review): the function body is elided from this excerpt. */
1546 swap_uint64(uint64_t *a, uint64_t *b)
1553 /* Copies 'src' into 'dst', performing format conversion in the process.
 *
1555 * 'src' is allowed to be misaligned (hence get_32aligned_u64()).
 * Counters that ovs_vport_stats does not track are zeroed rather than
 * left uninitialized. */
1557 netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1558                                   const struct ovs_vport_stats *src)
1560     dst->rx_packets = get_32aligned_u64(&src->rx_packets);
1561     dst->tx_packets = get_32aligned_u64(&src->tx_packets);
1562     dst->rx_bytes = get_32aligned_u64(&src->rx_bytes);
1563     dst->tx_bytes = get_32aligned_u64(&src->tx_bytes);
1564     dst->rx_errors = get_32aligned_u64(&src->rx_errors);
1565     dst->tx_errors = get_32aligned_u64(&src->tx_errors);
1566     dst->rx_dropped = get_32aligned_u64(&src->rx_dropped);
1567     dst->tx_dropped = get_32aligned_u64(&src->tx_dropped);
1569     dst->collisions = 0;
1570     dst->rx_length_errors = 0;
1571     dst->rx_over_errors = 0;
1572     dst->rx_crc_errors = 0;
1573     dst->rx_frame_errors = 0;
1574     dst->rx_fifo_errors = 0;
1575     dst->rx_missed_errors = 0;
1576     dst->tx_aborted_errors = 0;
1577     dst->tx_carrier_errors = 0;
1578     dst->tx_fifo_errors = 0;
1579     dst->tx_heartbeat_errors = 0;
1580     dst->tx_window_errors = 0;
/* Fetches 'netdev''s stats from the OVS datapath vport layer into '*stats'.
 * Returns 0 on success, otherwise a positive errno value (error returns and
 * the reply-buffer cleanup are elided from this excerpt). */
1584 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1586     struct dpif_netlink_vport reply;
1590     error = dpif_netlink_vport_get(netdev_get_name(netdev), &reply, &buf);
1593     } else if (!reply.stats) {
         /* Vport exists but reported no stats attribute. */
1598     netdev_stats_from_ovs_vport_stats(stats, reply.stats);
/* Wrapper around get_stats_via_vport__() that caches whether vport stats are
 * available for 'netdev' (VALID_VPORT_STAT_ERROR), avoiding repeated failed
 * lookups.  ENOENT/ENODEV are expected for non-vport devices and are not
 * logged.  Caller must hold 'netdev->mutex' (per the callers in this file). */
1606 get_stats_via_vport(const struct netdev *netdev_,
1607                     struct netdev_stats *stats)
1609     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1611     if (!netdev->vport_stats_error ||
1612         !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1615         error = get_stats_via_vport__(netdev_, stats);
1616         if (error && error != ENOENT && error != ENODEV) {
1617             VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1619                          netdev_get_name(netdev_), ovs_strerror(error));
1621         netdev->vport_stats_error = error;
1622         netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
1626 /* Retrieves current device stats for 'netdev-linux'.  Combines vport-layer
 * stats with kernel netdev (netlink) stats: kernel packet/byte counts are
 * preferred because vport counters miss on-the-wire sizes under GSO/TSO/GRO,
 * while error/drop counters from both sources are accumulated. */
1628 netdev_linux_get_stats(const struct netdev *netdev_,
1629                        struct netdev_stats *stats)
1631     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1632     struct netdev_stats dev_stats;
1635     ovs_mutex_lock(&netdev->mutex);
1636     get_stats_via_vport(netdev_, stats);
1637     error = get_stats_via_netlink(netdev_, &dev_stats);
/* NOTE(review): branches below are partially elided; the visible structure is
 * (netlink failed && vport ok) -> keep vport stats, (vport failed) -> use
 * netlink stats, else merge as commented. */
1639         if (!netdev->vport_stats_error) {
1642     } else if (netdev->vport_stats_error) {
1643         /* stats not available from OVS then use netdev stats. */
1646         /* Use kernel netdev's packet and byte counts since vport's counters
1647          * do not reflect packet counts on the wire when GSO, TSO or GRO are
1649         stats->rx_packets = dev_stats.rx_packets;
1650         stats->rx_bytes = dev_stats.rx_bytes;
1651         stats->tx_packets = dev_stats.tx_packets;
1652         stats->tx_bytes = dev_stats.tx_bytes;
1654         stats->rx_errors += dev_stats.rx_errors;
1655         stats->tx_errors += dev_stats.tx_errors;
1656         stats->rx_dropped += dev_stats.rx_dropped;
1657         stats->tx_dropped += dev_stats.tx_dropped;
1658         stats->multicast += dev_stats.multicast;
1659         stats->collisions += dev_stats.collisions;
1660         stats->rx_length_errors += dev_stats.rx_length_errors;
1661         stats->rx_over_errors += dev_stats.rx_over_errors;
1662         stats->rx_crc_errors += dev_stats.rx_crc_errors;
1663         stats->rx_frame_errors += dev_stats.rx_frame_errors;
1664         stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1665         stats->rx_missed_errors += dev_stats.rx_missed_errors;
1666         stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1667         stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1668         stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1669         stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1670         stats->tx_window_errors += dev_stats.tx_window_errors;
1672     ovs_mutex_unlock(&netdev->mutex);
1677 /* Retrieves current device stats for 'netdev-tap' netdev or
1678 * netdev-internal.  Like netdev_linux_get_stats(), but rx/tx are swapped:
 * for tap/internal devices the kernel counts from the host's perspective
 * while OVS reports from the switch's perspective. */
1680 netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
1682     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1683     struct netdev_stats dev_stats;
1686     ovs_mutex_lock(&netdev->mutex);
1687     get_stats_via_vport(netdev_, stats);
1688     error = get_stats_via_netlink(netdev_, &dev_stats);
1690         if (!netdev->vport_stats_error) {
1693     } else if (netdev->vport_stats_error) {
1694         /* Transmit and receive stats will appear to be swapped relative to the
1695          * other ports since we are the one sending the data, not a remote
1696          * computer. For consistency, we swap them back here. This does not
1697          * apply if we are getting stats from the vport layer because it always
1698          * tracks stats from the perspective of the switch. */
1701         swap_uint64(&stats->rx_packets, &stats->tx_packets);
1702         swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1703         swap_uint64(&stats->rx_errors, &stats->tx_errors);
1704         swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
         /* Counters not meaningful for a virtual device: zero them. */
1705         stats->rx_length_errors = 0;
1706         stats->rx_over_errors = 0;
1707         stats->rx_crc_errors = 0;
1708         stats->rx_frame_errors = 0;
1709         stats->rx_fifo_errors = 0;
1710         stats->rx_missed_errors = 0;
1711         stats->tx_aborted_errors = 0;
1712         stats->tx_carrier_errors = 0;
1713         stats->tx_fifo_errors = 0;
1714         stats->tx_heartbeat_errors = 0;
1715         stats->tx_window_errors = 0;
1717         /* Use kernel netdev's packet and byte counts since vport counters
1718          * do not reflect packet counts on the wire when GSO, TSO or GRO
         * are enabled.  rx/tx are crossed here for the perspective swap
         * described above. */
1720         stats->rx_packets = dev_stats.tx_packets;
1721         stats->rx_bytes = dev_stats.tx_bytes;
1722         stats->tx_packets = dev_stats.rx_packets;
1723         stats->tx_bytes = dev_stats.rx_bytes;
1725         stats->rx_dropped += dev_stats.tx_dropped;
1726         stats->tx_dropped += dev_stats.rx_dropped;
1728         stats->rx_errors += dev_stats.tx_errors;
1729         stats->tx_errors += dev_stats.rx_errors;
1731         stats->multicast += dev_stats.multicast;
1732         stats->collisions += dev_stats.collisions;
1734     ovs_mutex_unlock(&netdev->mutex);
/* Retrieves stats for an internal device purely from the vport layer; unlike
 * netdev_linux_get_stats() there is no kernel-netdev fallback here. */
1740 netdev_internal_get_stats(const struct netdev *netdev_,
1741                           struct netdev_stats *stats)
1743     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1746     ovs_mutex_lock(&netdev->mutex);
1747     get_stats_via_vport(netdev_, stats);
1748     error = netdev->vport_stats_error;
1749     ovs_mutex_unlock(&netdev->mutex);
/* Queries 'netdev''s link features via ethtool ETHTOOL_GSET and translates
 * them into NETDEV_F_* bitmaps: 'supported', 'advertised' and 'current'.
 * Results (and any error) are cached under VALID_FEATURES.  Caller must hold
 * 'netdev->mutex' (per the callers in this file). */
1755 netdev_linux_read_features(struct netdev_linux *netdev)
1757     struct ethtool_cmd ecmd;
1761     if (netdev->cache_valid & VALID_FEATURES) {
         /* Already cached; early return elided from this excerpt. */
1765     COVERAGE_INC(netdev_get_ethtool);
1766     memset(&ecmd, 0, sizeof ecmd);
1767     error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
1768                                     ETHTOOL_GSET, "ETHTOOL_GSET");
1773     /* Supported features. */
1774     netdev->supported = 0;
1775     if (ecmd.supported & SUPPORTED_10baseT_Half) {
1776         netdev->supported |= NETDEV_F_10MB_HD;
1778     if (ecmd.supported & SUPPORTED_10baseT_Full) {
1779         netdev->supported |= NETDEV_F_10MB_FD;
1781     if (ecmd.supported & SUPPORTED_100baseT_Half) {
1782         netdev->supported |= NETDEV_F_100MB_HD;
1784     if (ecmd.supported & SUPPORTED_100baseT_Full) {
1785         netdev->supported |= NETDEV_F_100MB_FD;
1787     if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1788         netdev->supported |= NETDEV_F_1GB_HD;
1790     if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1791         netdev->supported |= NETDEV_F_1GB_FD;
1793     if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1794         netdev->supported |= NETDEV_F_10GB_FD;
1796     if (ecmd.supported & SUPPORTED_TP) {
1797         netdev->supported |= NETDEV_F_COPPER;
1799     if (ecmd.supported & SUPPORTED_FIBRE) {
1800         netdev->supported |= NETDEV_F_FIBER;
1802     if (ecmd.supported & SUPPORTED_Autoneg) {
1803         netdev->supported |= NETDEV_F_AUTONEG;
1805     if (ecmd.supported & SUPPORTED_Pause) {
1806         netdev->supported |= NETDEV_F_PAUSE;
1808     if (ecmd.supported & SUPPORTED_Asym_Pause) {
1809         netdev->supported |= NETDEV_F_PAUSE_ASYM;
1812     /* Advertised features. */
1813     netdev->advertised = 0;
1814     if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1815         netdev->advertised |= NETDEV_F_10MB_HD;
1817     if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1818         netdev->advertised |= NETDEV_F_10MB_FD;
1820     if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1821         netdev->advertised |= NETDEV_F_100MB_HD;
1823     if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1824         netdev->advertised |= NETDEV_F_100MB_FD;
1826     if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1827         netdev->advertised |= NETDEV_F_1GB_HD;
1829     if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1830         netdev->advertised |= NETDEV_F_1GB_FD;
1832     if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1833         netdev->advertised |= NETDEV_F_10GB_FD;
1835     if (ecmd.advertising & ADVERTISED_TP) {
1836         netdev->advertised |= NETDEV_F_COPPER;
1838     if (ecmd.advertising & ADVERTISED_FIBRE) {
1839         netdev->advertised |= NETDEV_F_FIBER;
1841     if (ecmd.advertising & ADVERTISED_Autoneg) {
1842         netdev->advertised |= NETDEV_F_AUTONEG;
1844     if (ecmd.advertising & ADVERTISED_Pause) {
1845         netdev->advertised |= NETDEV_F_PAUSE;
1847     if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1848         netdev->advertised |= NETDEV_F_PAUSE_ASYM;
1851     /* Current settings.
      * NOTE(review): the line assigning 'speed' (presumably from
      * ethtool_cmd_speed(&ecmd)) is elided from this excerpt.  The 40G/100G/
      * 1T cases compare against raw numbers rather than SPEED_* macros,
      * presumably because those macros are absent from older kernel headers
      * -- confirm before "cleaning up". */
1853     if (speed == SPEED_10) {
1854         netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1855     } else if (speed == SPEED_100) {
1856         netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1857     } else if (speed == SPEED_1000) {
1858         netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1859     } else if (speed == SPEED_10000) {
1860         netdev->current = NETDEV_F_10GB_FD;
1861     } else if (speed == 40000) {
1862         netdev->current = NETDEV_F_40GB_FD;
1863     } else if (speed == 100000) {
1864         netdev->current = NETDEV_F_100GB_FD;
1865     } else if (speed == 1000000) {
1866         netdev->current = NETDEV_F_1TB_FD;
1868         netdev->current = 0;
1871     if (ecmd.port == PORT_TP) {
1872         netdev->current |= NETDEV_F_COPPER;
1873     } else if (ecmd.port == PORT_FIBRE) {
1874         netdev->current |= NETDEV_F_FIBER;
     /* NOTE(review): the autoneg condition guarding this line is elided. */
1878         netdev->current |= NETDEV_F_AUTONEG;
1882     netdev->cache_valid |= VALID_FEATURES;
1883     netdev->get_features_error = error;
1886 /* Stores the features supported by 'netdev' into of '*current', '*advertised',
1887 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
1888 * Returns 0 if successful, otherwise a positive errno value.
 *
 * Thin locking wrapper over netdev_linux_read_features(), which does the
 * actual ethtool query and caching.  '*peer' is always reported as 0. */
1890 netdev_linux_get_features(const struct netdev *netdev_,
1891                           enum netdev_features *current,
1892                           enum netdev_features *advertised,
1893                           enum netdev_features *supported,
1894                           enum netdev_features *peer)
1896     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1899     ovs_mutex_lock(&netdev->mutex);
1900     netdev_linux_read_features(netdev);
1901     if (!netdev->get_features_error) {
1902         *current = netdev->current;
1903         *advertised = netdev->advertised;
1904         *supported = netdev->supported;
1905         *peer = 0; /* XXX peer features not obtainable via ethtool. */
1907     error = netdev->get_features_error;
1908     ovs_mutex_unlock(&netdev->mutex);
1913 /* Set the features advertised by 'netdev' to 'advertise'.
 * Performs a read-modify-write of the ethtool settings: ETHTOOL_GSET to
 * fetch current settings, then rewrites 'advertising' from the NETDEV_F_*
 * bitmap and applies it with ETHTOOL_SSET. */
1915 netdev_linux_set_advertisements(struct netdev *netdev_,
1916                                 enum netdev_features advertise)
1918     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1919     struct ethtool_cmd ecmd;
1922     ovs_mutex_lock(&netdev->mutex);
1924     COVERAGE_INC(netdev_get_ethtool);
1925     memset(&ecmd, 0, sizeof ecmd);
1926     error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1927                                     ETHTOOL_GSET, "ETHTOOL_GSET");
     /* NOTE(review): the error check/goto after GSET is elided here. */
1932     ecmd.advertising = 0;
1933     if (advertise & NETDEV_F_10MB_HD) {
1934         ecmd.advertising |= ADVERTISED_10baseT_Half;
1936     if (advertise & NETDEV_F_10MB_FD) {
1937         ecmd.advertising |= ADVERTISED_10baseT_Full;
1939     if (advertise & NETDEV_F_100MB_HD) {
1940         ecmd.advertising |= ADVERTISED_100baseT_Half;
1942     if (advertise & NETDEV_F_100MB_FD) {
1943         ecmd.advertising |= ADVERTISED_100baseT_Full;
1945     if (advertise & NETDEV_F_1GB_HD) {
1946         ecmd.advertising |= ADVERTISED_1000baseT_Half;
1948     if (advertise & NETDEV_F_1GB_FD) {
1949         ecmd.advertising |= ADVERTISED_1000baseT_Full;
1951     if (advertise & NETDEV_F_10GB_FD) {
1952         ecmd.advertising |= ADVERTISED_10000baseT_Full;
1954     if (advertise & NETDEV_F_COPPER) {
1955         ecmd.advertising |= ADVERTISED_TP;
1957     if (advertise & NETDEV_F_FIBER) {
1958         ecmd.advertising |= ADVERTISED_FIBRE;
1960     if (advertise & NETDEV_F_AUTONEG) {
1961         ecmd.advertising |= ADVERTISED_Autoneg;
1963     if (advertise & NETDEV_F_PAUSE) {
1964         ecmd.advertising |= ADVERTISED_Pause;
1966     if (advertise & NETDEV_F_PAUSE_ASYM) {
1967         ecmd.advertising |= ADVERTISED_Asym_Pause;
1969     COVERAGE_INC(netdev_set_ethtool);
1970     error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1971                                     ETHTOOL_SSET, "ETHTOOL_SSET");
1974     ovs_mutex_unlock(&netdev->mutex);
1978 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1979 * successful, otherwise a positive errno value.
 *
 * Implemented with tc: the ingress qdisc is always removed first, then, for
 * a non-zero rate, re-added with a policer action.  The result (including
 * ENODEV) is cached under VALID_POLICING so unchanged settings are no-ops. */
1981 netdev_linux_set_policing(struct netdev *netdev_,
1982                           uint32_t kbits_rate, uint32_t kbits_burst)
1984     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1985     const char *netdev_name = netdev_get_name(netdev_);
1988     kbits_burst = (!kbits_rate ? 0       /* Force to 0 if no rate specified. */
1989                    : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1990                    : kbits_burst);       /* Stick with user-specified value. */
1992     ovs_mutex_lock(&netdev->mutex);
1993     if (netdev->cache_valid & VALID_POLICING) {
1994         error = netdev->netdev_policing_error;
1995         if (error || (netdev->kbits_rate == kbits_rate &&
1996                       netdev->kbits_burst == kbits_burst)) {
1997             /* Assume that settings haven't changed since we last set them. */
2000         netdev->cache_valid &= ~VALID_POLICING;
2003     COVERAGE_INC(netdev_set_policing);
2004     /* Remove any existing ingress qdisc. */
2005     error = tc_add_del_ingress_qdisc(netdev_, false);
2007         VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
2008                      netdev_name, ovs_strerror(error));
     /* NOTE(review): the 'if (kbits_rate)' guard around adding the qdisc and
      * policer is elided from this excerpt. */
2013         error = tc_add_del_ingress_qdisc(netdev_, true);
2015             VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
2016                          netdev_name, ovs_strerror(error));
2020         error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
2022             VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
2023                          netdev_name, ovs_strerror(error));
2028     netdev->kbits_rate = kbits_rate;
2029     netdev->kbits_burst = kbits_burst;
2032     if (!error || error == ENODEV) {
2033         netdev->netdev_policing_error = error;
2034         netdev->cache_valid |= VALID_POLICING;
2036     ovs_mutex_unlock(&netdev->mutex);
/* Adds to 'types' the OVS names of every installable QoS discipline in the
 * global 'tcs' table.  Entries with an empty ovs_name (the default qdisc
 * placeholder) are not user-selectable and are skipped. */
2041 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
2044     const struct tc_ops *const *opsp;
2046     for (opsp = tcs; *opsp != NULL; opsp++) {
2047         const struct tc_ops *ops = *opsp;
2048         if (ops->tc_install && ops->ovs_name[0] != '\0') {
2049             sset_add(types, ops->ovs_name);
/* Returns the tc_ops whose OVS-facing name is 'name', or (per the elided
 * fall-through) NULL if none matches. */
2055 static const struct tc_ops *
2056 tc_lookup_ovs_name(const char *name)
2058     const struct tc_ops *const *opsp;
2060     for (opsp = tcs; *opsp != NULL; opsp++) {
2061         const struct tc_ops *ops = *opsp;
2062         if (!strcmp(name, ops->ovs_name)) {
/* Returns the tc_ops whose Linux qdisc name is 'name', or (per the elided
 * fall-through) NULL if none matches.  'linux_name' may be null for
 * pseudo-disciplines, hence the extra check. */
2069 static const struct tc_ops *
2070 tc_lookup_linux_name(const char *name)
2072     const struct tc_ops *const *opsp;
2074     for (opsp = tcs; *opsp != NULL; opsp++) {
2075         const struct tc_ops *ops = *opsp;
2076         if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Looks up the queue with 'queue_id' in 'netdev''s tc queue hmap, using the
 * caller-computed 'hash' bucket.  Returns NULL (elided) if absent. */
2083 static struct tc_queue *
2084 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
2087     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2088     struct tc_queue *queue;
2090     HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
2091         if (queue->queue_id == queue_id) {
/* Convenience wrapper: looks up 'queue_id' hashing it with hash_int(). */
2098 static struct tc_queue *
2099 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
2101     return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Reports the capabilities (queue count) of QoS discipline 'type' in '*caps'.
 * NOTE(review): the failure path for an unknown 'type' is elided here. */
2105 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
2107                                   struct netdev_qos_capabilities *caps)
2109     const struct tc_ops *ops = tc_lookup_ovs_name(type);
2113     caps->n_queues = ops->n_queues;
/* Queries 'netdev''s installed qdisc: stores its OVS type name in '*typep'
 * and, if the discipline implements qdisc_get, its configuration in
 * 'details'.  Returns 0 on success, otherwise a positive errno value. */
2118 netdev_linux_get_qos(const struct netdev *netdev_,
2119                      const char **typep, struct smap *details)
2121     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2124     ovs_mutex_lock(&netdev->mutex);
2125     error = tc_query_qdisc(netdev_);
2127         *typep = netdev->tc->ops->ovs_name;
2128         error = (netdev->tc->ops->qdisc_get
2129                  ? netdev->tc->ops->qdisc_get(netdev_, details)
                  : 0);
2132     ovs_mutex_unlock(&netdev->mutex);
/* Installs QoS discipline 'type' with configuration 'details' on 'netdev'.
 * If 'type' is already installed, delegates to its qdisc_set; otherwise the
 * existing qdisc is deleted and the new one installed from scratch. */
2138 netdev_linux_set_qos(struct netdev *netdev_,
2139                      const char *type, const struct smap *details)
2141     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2142     const struct tc_ops *new_ops;
2145     new_ops = tc_lookup_ovs_name(type);
2146     if (!new_ops || !new_ops->tc_install) {
         /* Unknown or non-installable type; error return elided. */
2150     ovs_mutex_lock(&netdev->mutex);
2151     error = tc_query_qdisc(netdev_);
2156     if (new_ops == netdev->tc->ops) {
2157         error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
2159         /* Delete existing qdisc. */
2160         error = tc_del_qdisc(netdev_);
2164         ovs_assert(netdev->tc == NULL);
2166         /* Install new qdisc. */
2167         error = new_ops->tc_install(netdev_, details);
2168         ovs_assert((error == 0) == (netdev->tc != NULL));
2172     ovs_mutex_unlock(&netdev->mutex);
/* Retrieves the configuration of queue 'queue_id' on 'netdev' into 'details'
 * via the installed discipline's class_get hook. */
2177 netdev_linux_get_queue(const struct netdev *netdev_,
2178                        unsigned int queue_id, struct smap *details)
2180     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2183     ovs_mutex_lock(&netdev->mutex);
2184     error = tc_query_qdisc(netdev_);
2186         struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
         /* NOTE(review): the 'queue ? ... :' condition line is elided. */
2188                  ? netdev->tc->ops->class_get(netdev_, queue, details)
                  : ENOENT);
2191     ovs_mutex_unlock(&netdev->mutex);
/* Configures queue 'queue_id' on 'netdev' from 'details', provided the id is
 * within the discipline's queue range and it implements class_set. */
2197 netdev_linux_set_queue(struct netdev *netdev_,
2198                        unsigned int queue_id, const struct smap *details)
2200     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2203     ovs_mutex_lock(&netdev->mutex);
2204     error = tc_query_qdisc(netdev_);
2206         error = (queue_id < netdev->tc->ops->n_queues
2207                  && netdev->tc->ops->class_set
2208                  ? netdev->tc->ops->class_set(netdev_, queue_id, details)
                  : EINVAL);
2211     ovs_mutex_unlock(&netdev->mutex);
/* Deletes queue 'queue_id' from 'netdev' via the installed discipline's
 * class_delete hook (error paths for missing hook/queue are elided). */
2217 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
2219     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2222     ovs_mutex_lock(&netdev->mutex);
2223     error = tc_query_qdisc(netdev_);
2225         if (netdev->tc->ops->class_delete) {
2226             struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2228                      ? netdev->tc->ops->class_delete(netdev_, queue)
                      : ENOENT);
2234     ovs_mutex_unlock(&netdev->mutex);
/* Retrieves statistics for queue 'queue_id' on 'netdev' into '*stats' via the
 * discipline's class_get_stats hook; also fills 'stats->created' from the
 * cached queue record. */
2240 netdev_linux_get_queue_stats(const struct netdev *netdev_,
2241                              unsigned int queue_id,
2242                              struct netdev_queue_stats *stats)
2244     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2247     ovs_mutex_lock(&netdev->mutex);
2248     error = tc_query_qdisc(netdev_);
2250         if (netdev->tc->ops->class_get_stats) {
2251             const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
             /* NOTE(review): the 'if (queue)' guard is elided here. */
2253                 stats->created = queue->created;
2254                 error = netdev->tc->ops->class_get_stats(netdev_, queue,
2263     ovs_mutex_unlock(&netdev->mutex);
/* State carried across a netlink dump of tc classes: the in-progress dump
 * handle plus (elided here) the reply buffer 'buf'. */
2268 struct queue_dump_state {
2269     struct nl_dump dump;
/* Begins a RTM_GETTCLASS netlink dump of 'netdev''s tc classes into 'state'.
 * Returns (per the elided paths) whether the dump was successfully started;
 * on success the caller must eventually call finish_queue_dump(). */
2274 start_queue_dump(const struct netdev *netdev, struct queue_dump_state *state)
2276     struct ofpbuf request;
2277     struct tcmsg *tcmsg;
2279     tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
     /* tcm_parent == 0 dumps classes of the root qdisc. */
2283     tcmsg->tcm_parent = 0;
2284     nl_dump_start(&state->dump, NETLINK_ROUTE, &request);
2285     ofpbuf_uninit(&request);
2287     ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
/* Releases 'state''s buffer and completes the netlink dump, returning the
 * dump's final status (0 on success, positive errno on failure). */
2292 finish_queue_dump(struct queue_dump_state *state)
2294     ofpbuf_uninit(&state->buf);
2295     return nl_dump_done(&state->dump);
/* Iterator state for netdev_linux_queue_dump_{start,next,done}: a snapshot
 * array of queue ids plus (elided here) 'n_queues' and 'cur_queue'. */
2298 struct netdev_linux_queue_state {
2299     unsigned int *queues;
/* Starts a queue dump for 'netdev': snapshots the ids of all queues known to
 * the installed discipline into a freshly allocated state stored in
 * '*statep'.  Snapshotting up-front lets iteration proceed without holding
 * the queue hmap stable. */
2305 netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
2307     const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2310     ovs_mutex_lock(&netdev->mutex);
2311     error = tc_query_qdisc(netdev_);
2313         if (netdev->tc->ops->class_get) {
2314             struct netdev_linux_queue_state *state;
2315             struct tc_queue *queue;
2318             *statep = state = xmalloc(sizeof *state);
2319             state->n_queues = hmap_count(&netdev->tc->queues);
2320             state->cur_queue = 0;
2321             state->queues = xmalloc(state->n_queues * sizeof *state->queues);
2324             HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
2325                 state->queues[i++] = queue->queue_id;
2331     ovs_mutex_unlock(&netdev->mutex);
/* Advances the queue dump: returns the next snapshotted queue id in
 * '*queue_idp' with its configuration in 'details'.  Queues deleted since
 * the snapshot are skipped (tc_find_queue() returning NULL). */
2337 netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
2338                              unsigned int *queue_idp, struct smap *details)
2340     const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2341     struct netdev_linux_queue_state *state = state_;
2344     ovs_mutex_lock(&netdev->mutex);
2345     while (state->cur_queue < state->n_queues) {
2346         unsigned int queue_id = state->queues[state->cur_queue++];
2347         struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
         /* NOTE(review): the 'if (queue)' guard and loop break are elided. */
2350             *queue_idp = queue_id;
2351             error = netdev->tc->ops->class_get(netdev_, queue, details);
2355     ovs_mutex_unlock(&netdev->mutex);
/* Frees the iterator state allocated by netdev_linux_queue_dump_start()
 * (the free of 'state' itself is elided from this excerpt). */
2361 netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
2364     struct netdev_linux_queue_state *state = state_;
2366     free(state->queues);
/* Invokes 'cb' with per-queue statistics for every class reported by a
 * netlink RTM_GETTCLASS dump of 'netdev', translating each reply through the
 * discipline's class_dump_stats hook. */
2372 netdev_linux_dump_queue_stats(const struct netdev *netdev_,
2373                               netdev_dump_queue_stats_cb *cb, void *aux)
2375     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2378     ovs_mutex_lock(&netdev->mutex);
2379     error = tc_query_qdisc(netdev_);
2381         struct queue_dump_state state;
2383         if (!netdev->tc->ops->class_dump_stats) {
             /* Discipline cannot report stats (EOPNOTSUPP path elided). */
2385         } else if (!start_queue_dump(netdev_, &state)) {
             /* Dump failed to start (error path elided). */
2391             while (nl_dump_next(&state.dump, &msg, &state.buf)) {
2392                 retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
2399             retval = finish_queue_dump(&state);
2405     ovs_mutex_unlock(&netdev->mutex);
/* Retrieves 'netdev''s IPv4 address and netmask via SIOCGIFADDR /
 * SIOCGIFNETMASK, caching the result under VALID_IN4.  Returns
 * EADDRNOTAVAIL when the device has no address assigned. */
2411 netdev_linux_get_in4(const struct netdev *netdev_,
2412                      struct in_addr *address, struct in_addr *netmask)
2414     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2417     ovs_mutex_lock(&netdev->mutex);
2418     if (!(netdev->cache_valid & VALID_IN4)) {
2419         error = netdev_linux_get_ipv4(netdev_, &netdev->address,
2420                                       SIOCGIFADDR, "SIOCGIFADDR");
2422             error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
2423                                           SIOCGIFNETMASK, "SIOCGIFNETMASK");
2425         netdev->in4_error = error;
2426         netdev->cache_valid |= VALID_IN4;
2428         error = netdev->in4_error;
     /* NOTE(review): the '!error' guard around the block below is elided. */
2432         if (netdev->address.s_addr != INADDR_ANY) {
2433             *address = netdev->address;
2434             *netmask = netdev->netmask;
2436             error = EADDRNOTAVAIL;
2439     ovs_mutex_unlock(&netdev->mutex);
/* Assigns IPv4 'address'/'netmask' to 'netdev' via SIOCSIFADDR /
 * SIOCSIFNETMASK.  On success the new values are cached under VALID_IN4;
 * on failure the IPv4 cache is invalidated instead. */
2445 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2446                      struct in_addr netmask)
2448     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2451     ovs_mutex_lock(&netdev->mutex);
2452     error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2454         netdev->address = address;
2455         netdev->netmask = netmask;
         /* Setting a netmask only makes sense with a real address. */
2456         if (address.s_addr != INADDR_ANY) {
2457             error = do_set_addr(netdev_, SIOCSIFNETMASK,
2458                                 "SIOCSIFNETMASK", netmask);
2463         netdev->cache_valid |= VALID_IN4;
2464         netdev->in4_error = 0;
2466         netdev->cache_valid &= ~VALID_IN4;
2468     ovs_mutex_unlock(&netdev->mutex);
/* Parses one line of /proc/net/if_inet6 into '*in6' (16 hex byte pairs) and
 * 'ifname'.  Returns the (elided) result of ovs_scan(), i.e. whether the
 * line matched.  The %*x fields skip ifindex, prefix length, scope and
 * flags columns of the proc file format. */
2474 parse_if_inet6_line(const char *line,
2475                     struct in6_addr *in6, char ifname[16 + 1])
2477     uint8_t *s6 = in6->s6_addr;
2478 #define X8 "%2"SCNx8
2479     return ovs_scan(line,
2480                     " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
2481                     "%*x %*x %*x %*x %16s\n",
2482                     &s6[0], &s6[1], &s6[2], &s6[3],
2483                     &s6[4], &s6[5], &s6[6], &s6[7],
2484                     &s6[8], &s6[9], &s6[10], &s6[11],
2485                     &s6[12], &s6[13], &s6[14], &s6[15],
2489 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address.
2490 * Otherwise, sets '*in6' to 'in6addr_any' and returns the corresponding
 * errno value.  Reads /proc/net/if_inet6, so reports EOPNOTSUPP when that
 * file cannot be opened (e.g. IPv6 disabled).  Caches under VALID_IN6. */
2493 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2495     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2498     ovs_mutex_lock(&netdev->mutex);
2499     if (!(netdev->cache_valid & VALID_IN6)) {
         /* Default to "no address" until a matching line is found. */
2503         netdev->in6 = in6addr_any;
2504         netdev->in6_error = EADDRNOTAVAIL;
2506         file = fopen("/proc/net/if_inet6", "r");
2508             const char *name = netdev_get_name(netdev_);
2509             while (fgets(line, sizeof line, file)) {
2510                 struct in6_addr in6_tmp;
2511                 char ifname[16 + 1];
2512                 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2513                     && !strcmp(name, ifname))
2515                     netdev->in6 = in6_tmp;
2516                     netdev->in6_error = 0;
             /* fopen() failed: /proc IPv6 support unavailable. */
2522             netdev->in6_error = EOPNOTSUPP;
2524         netdev->cache_valid |= VALID_IN6;
2527     error = netdev->in6_error;
2528     ovs_mutex_unlock(&netdev->mutex);
/* Fills 'sa' with an AF_INET sockaddr for 'addr' (all other fields zero). */
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
    struct sockaddr_in sin;
    memset(&sin, 0, sizeof sin);
    sin.sin_family = AF_INET;
    sin.sin_addr = addr;

    /* Zero the whole sockaddr first; sizeof sin may be smaller. */
    memset(sa, 0, sizeof *sa);
    memcpy(sa, &sin, sizeof sin);
/* Issues address-setting ioctl 'ioctl_nr' ('ioctl_name' used for logging)
 * on 'netdev', passing 'addr' as an AF_INET sockaddr in the ifreq. */
do_set_addr(struct netdev *netdev,
            int ioctl_nr, const char *ioctl_name, struct in_addr addr)
    make_in4_sockaddr(&ifr.ifr_addr, addr);
    return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
/* Adds 'router' as a default IP gateway (route 0.0.0.0/0 via 'router')
 * using the SIOCADDRT ioctl.  'netdev' is unused. */
netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
    struct in_addr any = { INADDR_ANY };

    memset(&rt, 0, sizeof rt);
    make_in4_sockaddr(&rt.rt_dst, any);         /* destination 0.0.0.0 */
    make_in4_sockaddr(&rt.rt_gateway, router);
    make_in4_sockaddr(&rt.rt_genmask, any);     /* netmask 0.0.0.0 */
    rt.rt_flags = RTF_UP | RTF_GATEWAY;
    error = af_inet_ioctl(SIOCADDRT, &rt);
    VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
/* Looks up the next hop for 'host' by scanning the kernel routing table in
 * /proc/net/route.  On a match, stores the gateway address (or 0 when the
 * host is directly reachable) in '*next_hop' and the egress interface name,
 * malloc'd, in '*netdev_name'. */
netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
    static const char fn[] = "/proc/net/route";

    *netdev_name = NULL;
    stream = fopen(fn, "r");
    if (stream == NULL) {
        VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
    while (fgets(line, sizeof line, stream)) {
        ovs_be32 dest, gateway, mask;
        int refcnt, metric, mtu;
        unsigned int flags, use, window, irtt;
                     /* /proc/net/route fields: Iface Destination Gateway
                      * Flags RefCnt Use Metric Mask MTU Window IRTT. */
                     "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
                     iface, &dest, &gateway, &flags, &refcnt,
                     &use, &metric, &mask, &mtu, &window, &irtt)) {
            VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
        if (!(flags & RTF_UP)) {
            /* Skip routes that aren't up. */
        /* The output of 'dest', 'mask', and 'gateway' were given in
         * network byte order, so we don't need any endian
         * conversions here. */
        if ((dest & mask) == (host->s_addr & mask)) {
            /* The host is directly reachable. */
            next_hop->s_addr = 0;
            /* To reach the host, we must go through a gateway. */
            next_hop->s_addr = gateway;
        *netdev_name = xstrdup(iface);
/* Fills 'smap' with "driver_name", "driver_version", and "firmware_version"
 * obtained via the ETHTOOL_GDRVINFO ioctl, caching under VALID_DRVINFO. */
netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    ovs_mutex_lock(&netdev->mutex);
    if (!(netdev->cache_valid & VALID_DRVINFO)) {
        /* 'drvinfo' doubles as the ethtool command buffer. */
        struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
        COVERAGE_INC(netdev_get_ethtool);
        memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
        error = netdev_linux_do_ethtool(netdev->up.name,
                                        "ETHTOOL_GDRVINFO");
        netdev->cache_valid |= VALID_DRVINFO;
    smap_add(smap, "driver_name", netdev->drvinfo.driver);
    smap_add(smap, "driver_version", netdev->drvinfo.version);
    smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
    ovs_mutex_unlock(&netdev->mutex);
/* Status callback for internal devices: only the driver name applies. */
netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
    smap_add(smap, "driver_name", "openvswitch");
/* Looks up the ARP table entry for 'ip' on 'netdev'.  If one exists and can be
 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
 * returns 0.  Otherwise, it returns a positive errno value; in particular,
 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
netdev_linux_arp_lookup(const struct netdev *netdev,
                        ovs_be32 ip, struct eth_addr *mac)
    struct sockaddr_in sin;

    memset(&r, 0, sizeof r);
    memset(&sin, 0, sizeof sin);
    sin.sin_family = AF_INET;
    sin.sin_addr.s_addr = ip;
    /* arp_pa is a generic 'struct sockaddr': overlay the AF_INET form. */
    memcpy(&r.arp_pa, &sin, sizeof sin);
    r.arp_ha.sa_family = ARPHRD_ETHER;
    ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
    COVERAGE_INC(netdev_arp_lookup);
    retval = af_inet_ioctl(SIOCGARP, &r);
        memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
    } else if (retval != ENXIO) {
        /* ENXIO just means "no entry"; anything else deserves a warning. */
        VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
                     netdev_get_name(netdev), IP_ARGS(ip),
                     ovs_strerror(retval));
/* Translates netdev_flags bits in 'nd' to the corresponding IFF_* bits. */
nd_to_iff_flags(enum netdev_flags nd)
    if (nd & NETDEV_UP) {
    if (nd & NETDEV_PROMISC) {
    if (nd & NETDEV_LOOPBACK) {
        iff |= IFF_LOOPBACK;
/* Translates IFF_* bits in 'iff' to the corresponding netdev_flags bits. */
iff_to_nd_flags(int iff)
    enum netdev_flags nd = 0;
    if (iff & IFF_PROMISC) {
        nd |= NETDEV_PROMISC;
    if (iff & IFF_LOOPBACK) {
        nd |= NETDEV_LOOPBACK;
/* Clears the device flags in 'off' and sets those in 'on', storing the
 * previous flags in '*old_flagsp'.  Caller holds 'netdev->mutex'. */
update_flags(struct netdev_linux *netdev, enum netdev_flags off,
             enum netdev_flags on, enum netdev_flags *old_flagsp)
    OVS_REQUIRES(netdev->mutex)
    int old_flags, new_flags;

    old_flags = netdev->ifi_flags;
    *old_flagsp = iff_to_nd_flags(old_flags);
    new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
    if (new_flags != old_flags) {
        error = set_flags(netdev_get_name(&netdev->up), new_flags);
        /* Re-read so the cache reflects what the kernel actually applied. */
        get_flags(&netdev->up, &netdev->ifi_flags);
/* netdev_class callback: mutex-protected wrapper around update_flags(). */
netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
                          enum netdev_flags on, enum netdev_flags *old_flagsp)
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    ovs_mutex_lock(&netdev->mutex);
    error = update_flags(netdev, off, on, old_flagsp);
    ovs_mutex_unlock(&netdev->mutex);
/* Expands to a 'struct netdev_class' initializer for a Linux network device
 * type named NAME, parameterized by its construct, stats, features, and
 * status callbacks.  All other callbacks are the shared netdev_linux_*()
 * implementations, or NULL where the operation is unsupported. */
#define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS,          \
                           GET_FEATURES, GET_STATUS)            \
    netdev_linux_wait,                                          \
    netdev_linux_alloc,                                         \
    netdev_linux_destruct,                                      \
    netdev_linux_dealloc,                                       \
    NULL,                       /* get_config */                \
    NULL,                       /* set_config */                \
    NULL,                       /* get_tunnel_config */         \
    NULL,                       /* build header */              \
    NULL,                       /* push header */               \
    NULL,                       /* pop header */                \
    NULL,                       /* get_numa_id */               \
    NULL,                       /* set_multiq */                \
    netdev_linux_send,                                          \
    netdev_linux_send_wait,                                     \
    netdev_linux_set_etheraddr,                                 \
    netdev_linux_get_etheraddr,                                 \
    netdev_linux_get_mtu,                                       \
    netdev_linux_set_mtu,                                       \
    netdev_linux_get_ifindex,                                   \
    netdev_linux_get_carrier,                                   \
    netdev_linux_get_carrier_resets,                            \
    netdev_linux_set_miimon_interval,                           \
    netdev_linux_set_advertisements,                            \
    netdev_linux_set_policing,                                  \
    netdev_linux_get_qos_types,                                 \
    netdev_linux_get_qos_capabilities,                          \
    netdev_linux_get_qos,                                       \
    netdev_linux_set_qos,                                       \
    netdev_linux_get_queue,                                     \
    netdev_linux_set_queue,                                     \
    netdev_linux_delete_queue,                                  \
    netdev_linux_get_queue_stats,                               \
    netdev_linux_queue_dump_start,                              \
    netdev_linux_queue_dump_next,                               \
    netdev_linux_queue_dump_done,                               \
    netdev_linux_dump_queue_stats,                              \
    netdev_linux_get_in4,                                       \
    netdev_linux_set_in4,                                       \
    netdev_linux_get_in6,                                       \
    netdev_linux_add_router,                                    \
    netdev_linux_get_next_hop,                                  \
    netdev_linux_arp_lookup,                                    \
    netdev_linux_update_flags,                                  \
    netdev_linux_rxq_alloc,                                     \
    netdev_linux_rxq_construct,                                 \
    netdev_linux_rxq_destruct,                                  \
    netdev_linux_rxq_dealloc,                                   \
    netdev_linux_rxq_recv,                                      \
    netdev_linux_rxq_wait,                                      \
    netdev_linux_rxq_drain,                                     \
/* The "system" class: ordinary Linux network devices. */
const struct netdev_class netdev_linux_class =
        netdev_linux_construct,
        netdev_linux_get_stats,
        netdev_linux_get_features,
        netdev_linux_get_status);
/* The "tap" class: TAP devices, with tap-specific construction and stats. */
const struct netdev_class netdev_tap_class =
        netdev_linux_construct_tap,
        netdev_tap_get_stats,
        netdev_linux_get_features,
        netdev_linux_get_status);
/* The "internal" class: OVS internal ports (no meaningful link features). */
const struct netdev_class netdev_internal_class =
        netdev_linux_construct,
        netdev_internal_get_stats,
        NULL,                       /* get_features */
        netdev_internal_get_status);
#define CODEL_N_QUEUES 0x0000   /* CoDel is classless: it has no queues. */

/* In sufficiently new kernel headers these are defined as enums in
 * <linux/pkt_sched.h>.  Define them here as macros to help out with older
 * kernels.  (This overrides any enum definition in the header file but that's
 * harmless.) */
#define TCA_CODEL_TARGET 1
#define TCA_CODEL_LIMIT 2
#define TCA_CODEL_INTERVAL 3
/* Returns the 'struct codel' embedded in 'netdev_''s active tc state. */
static struct codel *
codel_get__(const struct netdev *netdev_)
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    return CONTAINER_OF(netdev->tc, struct codel, tc);
/* Records freshly allocated CoDel tc state with the given parameters on
 * 'netdev_'.  Local bookkeeping only; does not talk to the kernel. */
codel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct codel *codel;

    codel = xmalloc(sizeof *codel);
    tc_init(&codel->tc, &tc_ops_codel);
    codel->target = target;
    codel->limit = limit;
    codel->interval = interval;

    netdev->tc = &codel->tc;
/* Replaces 'netdev''s root qdisc with a "codel" qdisc configured with the
 * given parameters; zero values fall back to the defaults below. */
codel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    uint32_t otarget, olimit, ointerval;

    tc_del_qdisc(netdev);

    tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
                            NLM_F_EXCL | NLM_F_CREATE, &request);
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    /* Fall back to defaults for any unspecified (zero) parameter. */
    otarget = target ? target : 5000;
    olimit = limit ? limit : 10240;
    ointerval = interval ? interval : 100000;

    nl_msg_put_string(&request, TCA_KIND, "codel");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_u32(&request, TCA_CODEL_TARGET, otarget);
    nl_msg_put_u32(&request, TCA_CODEL_LIMIT, olimit);
    nl_msg_put_u32(&request, TCA_CODEL_INTERVAL, ointerval);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
        VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
                     "target %u, limit %u, interval %u error %d(%s)",
                     netdev_get_name(netdev),
                     otarget, olimit, ointerval,
                     error, ovs_strerror(error));
/* Extracts "target", "limit", and "interval" from 'details' into 'codel',
 * substituting defaults for missing or zero values. */
codel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
                            const struct smap *details, struct codel *codel)
    const char *target_s;
    const char *limit_s;
    const char *interval_s;

    target_s = smap_get(details, "target");
    limit_s = smap_get(details, "limit");
    interval_s = smap_get(details, "interval");

    codel->target = target_s ? strtoull(target_s, NULL, 10) : 0;
    codel->limit = limit_s ? strtoull(limit_s, NULL, 10) : 0;
    codel->interval = interval_s ? strtoull(interval_s, NULL, 10) : 0;

    if (!codel->target) {
        codel->target = 5000;
    if (!codel->limit) {
        codel->limit = 10240;
    if (!codel->interval) {
        codel->interval = 100000;
/* tc_ops "install" callback: configures the kernel codel qdisc and, on
 * success, mirrors the parameters into local tc state. */
codel_tc_install(struct netdev *netdev, const struct smap *details)
    codel_parse_qdisc_details__(netdev, details, &codel);
    error = codel_setup_qdisc__(netdev, codel.target, codel.limit,
    codel_install__(netdev, codel.target, codel.limit, codel.interval);
/* Parses nested TCA_CODEL_* Netlink attributes from 'nl_options' into
 * 'codel', warning on malformed options. */
codel_parse_tca_options__(struct nlattr *nl_options, struct codel *codel)
    static const struct nl_policy tca_codel_policy[] = {
        [TCA_CODEL_TARGET] = { .type = NL_A_U32 },
        [TCA_CODEL_LIMIT] = { .type = NL_A_U32 },
        [TCA_CODEL_INTERVAL] = { .type = NL_A_U32 }

    struct nlattr *attrs[ARRAY_SIZE(tca_codel_policy)];

    if (!nl_parse_nested(nl_options, tca_codel_policy,
                         attrs, ARRAY_SIZE(tca_codel_policy))) {
        VLOG_WARN_RL(&rl, "failed to parse CoDel class options");

    codel->target = nl_attr_get_u32(attrs[TCA_CODEL_TARGET]);
    codel->limit = nl_attr_get_u32(attrs[TCA_CODEL_LIMIT]);
    codel->interval = nl_attr_get_u32(attrs[TCA_CODEL_INTERVAL]);
/* tc_ops "load" callback: reconstructs CoDel state from a kernel qdisc
 * reply message in 'nlmsg'. */
codel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
    struct nlattr *nlattr;
    error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
    error = codel_parse_tca_options__(nlattr, &codel);
    codel_install__(netdev, codel.target, codel.limit, codel.interval);
/* tc_ops "destroy" callback: frees the CoDel tc state. */
codel_tc_destroy(struct tc *tc)
    struct codel *codel = CONTAINER_OF(tc, struct codel, tc);
/* tc_ops "qdisc_get" callback: reports current CoDel parameters. */
codel_qdisc_get(const struct netdev *netdev, struct smap *details)
    const struct codel *codel = codel_get__(netdev);
    smap_add_format(details, "target", "%u", codel->target);
    smap_add_format(details, "limit", "%u", codel->limit);
    smap_add_format(details, "interval", "%u", codel->interval);
/* tc_ops "qdisc_set" callback: applies new CoDel parameters from 'details'
 * and mirrors them into the cached tc state. */
codel_qdisc_set(struct netdev *netdev, const struct smap *details)
    codel_parse_qdisc_details__(netdev, details, &codel);
    codel_install__(netdev, codel.target, codel.limit, codel.interval);
    codel_get__(netdev)->target = codel.target;
    codel_get__(netdev)->limit = codel.limit;
    codel_get__(netdev)->interval = codel.interval;
/* CoDel qdisc operations vtable. */
static const struct tc_ops tc_ops_codel = {
    "codel",                    /* linux_name */
    "linux-codel",              /* ovs_name */
    CODEL_N_QUEUES,             /* n_queues */
/* FQ-CoDel traffic control class. */

#define FQCODEL_N_QUEUES 0x0000   /* FQ-CoDel is classless: no queues. */

/* In sufficiently new kernel headers these are defined as enums in
 * <linux/pkt_sched.h>.  Define them here as macros to help out with older
 * kernels.  (This overrides any enum definition in the header file but that's
 * harmless.) */
#define TCA_FQ_CODEL_TARGET 1
#define TCA_FQ_CODEL_LIMIT 2
#define TCA_FQ_CODEL_INTERVAL 3
#define TCA_FQ_CODEL_ECN 4
#define TCA_FQ_CODEL_FLOWS 5
#define TCA_FQ_CODEL_QUANTUM 6
/* Returns the 'struct fqcodel' embedded in 'netdev_''s active tc state. */
static struct fqcodel *
fqcodel_get__(const struct netdev *netdev_)
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    return CONTAINER_OF(netdev->tc, struct fqcodel, tc);
/* Records freshly allocated FQ-CoDel tc state with the given parameters on
 * 'netdev_'.  Local bookkeeping only; does not talk to the kernel. */
fqcodel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
                  uint32_t interval, uint32_t flows, uint32_t quantum)
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct fqcodel *fqcodel;

    fqcodel = xmalloc(sizeof *fqcodel);
    tc_init(&fqcodel->tc, &tc_ops_fqcodel);
    fqcodel->target = target;
    fqcodel->limit = limit;
    fqcodel->interval = interval;
    fqcodel->flows = flows;
    fqcodel->quantum = quantum;

    netdev->tc = &fqcodel->tc;
/* Replaces 'netdev''s root qdisc with an "fq_codel" qdisc configured with
 * the given parameters; zero values fall back to the defaults below. */
fqcodel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
                      uint32_t interval, uint32_t flows, uint32_t quantum)
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    uint32_t otarget, olimit, ointerval, oflows, oquantum;

    tc_del_qdisc(netdev);

    tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
                            NLM_F_EXCL | NLM_F_CREATE, &request);
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    /* Fall back to defaults for any unspecified (zero) parameter. */
    otarget = target ? target : 5000;
    olimit = limit ? limit : 10240;
    ointerval = interval ? interval : 100000;
    oflows = flows ? flows : 1024;
    oquantum = quantum ? quantum : 1514; /* fq_codel default quantum is 1514
                                          * (Ethernet MTU plus header). */

    nl_msg_put_string(&request, TCA_KIND, "fq_codel");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_TARGET, otarget);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_LIMIT, olimit);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_INTERVAL, ointerval);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_FLOWS, oflows);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_QUANTUM, oquantum);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
        VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
                     "target %u, limit %u, interval %u, flows %u, quantum %u error %d(%s)",
                     netdev_get_name(netdev),
                     otarget, olimit, ointerval, oflows, oquantum,
                     error, ovs_strerror(error));
/* Extracts "target", "limit", "interval", "flows", and "quantum" from
 * 'details' into 'fqcodel', substituting defaults for missing/zero values. */
fqcodel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
                              const struct smap *details, struct fqcodel *fqcodel)
    const char *target_s;
    const char *limit_s;
    const char *interval_s;
    const char *flows_s;
    const char *quantum_s;

    target_s = smap_get(details, "target");
    limit_s = smap_get(details, "limit");
    interval_s = smap_get(details, "interval");
    flows_s = smap_get(details, "flows");
    quantum_s = smap_get(details, "quantum");
    fqcodel->target = target_s ? strtoull(target_s, NULL, 10) : 0;
    fqcodel->limit = limit_s ? strtoull(limit_s, NULL, 10) : 0;
    fqcodel->interval = interval_s ? strtoull(interval_s, NULL, 10) : 0;
    fqcodel->flows = flows_s ? strtoull(flows_s, NULL, 10) : 0;
    fqcodel->quantum = quantum_s ? strtoull(quantum_s, NULL, 10) : 0;
    if (!fqcodel->target) {
        fqcodel->target = 5000;
    if (!fqcodel->limit) {
        fqcodel->limit = 10240;
    if (!fqcodel->interval) {
        /* NOTE(review): 1000000 here, but fqcodel_setup_qdisc__() defaults
         * the interval to 100000 — confirm which value is intended. */
        fqcodel->interval = 1000000;
    if (!fqcodel->flows) {
        fqcodel->flows = 1024;
    if (!fqcodel->quantum) {
        fqcodel->quantum = 1514;
/* tc_ops "install" callback: configures the kernel fq_codel qdisc and, on
 * success, mirrors the parameters into local tc state. */
fqcodel_tc_install(struct netdev *netdev, const struct smap *details)
    struct fqcodel fqcodel;

    fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
    error = fqcodel_setup_qdisc__(netdev, fqcodel.target, fqcodel.limit,
                                  fqcodel.interval, fqcodel.flows,
    fqcodel_install__(netdev, fqcodel.target, fqcodel.limit,
                      fqcodel.interval, fqcodel.flows, fqcodel.quantum);
/* Parses nested TCA_FQ_CODEL_* Netlink attributes from 'nl_options' into
 * 'fqcodel', warning on malformed options. */
fqcodel_parse_tca_options__(struct nlattr *nl_options, struct fqcodel *fqcodel)
    static const struct nl_policy tca_fqcodel_policy[] = {
        [TCA_FQ_CODEL_TARGET] = { .type = NL_A_U32 },
        [TCA_FQ_CODEL_LIMIT] = { .type = NL_A_U32 },
        [TCA_FQ_CODEL_INTERVAL] = { .type = NL_A_U32 },
        [TCA_FQ_CODEL_FLOWS] = { .type = NL_A_U32 },
        [TCA_FQ_CODEL_QUANTUM] = { .type = NL_A_U32 }

    struct nlattr *attrs[ARRAY_SIZE(tca_fqcodel_policy)];

    if (!nl_parse_nested(nl_options, tca_fqcodel_policy,
                         attrs, ARRAY_SIZE(tca_fqcodel_policy))) {
        VLOG_WARN_RL(&rl, "failed to parse FQ_CoDel class options");

    fqcodel->target = nl_attr_get_u32(attrs[TCA_FQ_CODEL_TARGET]);
    fqcodel->limit = nl_attr_get_u32(attrs[TCA_FQ_CODEL_LIMIT]);
    fqcodel->interval = nl_attr_get_u32(attrs[TCA_FQ_CODEL_INTERVAL]);
    fqcodel->flows = nl_attr_get_u32(attrs[TCA_FQ_CODEL_FLOWS]);
    fqcodel->quantum = nl_attr_get_u32(attrs[TCA_FQ_CODEL_QUANTUM]);
/* tc_ops "load" callback: reconstructs FQ-CoDel state from a kernel qdisc
 * reply message in 'nlmsg'. */
fqcodel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
    struct nlattr *nlattr;
    struct fqcodel fqcodel;

    error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
    error = fqcodel_parse_tca_options__(nlattr, &fqcodel);
    fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
                      fqcodel.flows, fqcodel.quantum);
/* tc_ops "destroy" callback: frees the FQ-CoDel tc state. */
fqcodel_tc_destroy(struct tc *tc)
    struct fqcodel *fqcodel = CONTAINER_OF(tc, struct fqcodel, tc);
/* tc_ops "qdisc_get" callback: reports current FQ-CoDel parameters. */
fqcodel_qdisc_get(const struct netdev *netdev, struct smap *details)
    const struct fqcodel *fqcodel = fqcodel_get__(netdev);
    smap_add_format(details, "target", "%u", fqcodel->target);
    smap_add_format(details, "limit", "%u", fqcodel->limit);
    smap_add_format(details, "interval", "%u", fqcodel->interval);
    smap_add_format(details, "flows", "%u", fqcodel->flows);
    smap_add_format(details, "quantum", "%u", fqcodel->quantum);
/* tc_ops "qdisc_set" callback: applies new FQ-CoDel parameters from
 * 'details' and mirrors them into the cached tc state. */
fqcodel_qdisc_set(struct netdev *netdev, const struct smap *details)
    struct fqcodel fqcodel;

    fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
    fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
                      fqcodel.flows, fqcodel.quantum);
    fqcodel_get__(netdev)->target = fqcodel.target;
    fqcodel_get__(netdev)->limit = fqcodel.limit;
    fqcodel_get__(netdev)->interval = fqcodel.interval;
    fqcodel_get__(netdev)->flows = fqcodel.flows;
    fqcodel_get__(netdev)->quantum = fqcodel.quantum;
/* FQ-CoDel qdisc operations vtable. */
static const struct tc_ops tc_ops_fqcodel = {
    "fq_codel",                 /* linux_name */
    "linux-fq_codel",           /* ovs_name */
    FQCODEL_N_QUEUES,           /* n_queues */
/* SFQ traffic control class. */

#define SFQ_N_QUEUES 0x0000     /* SFQ is classless: it has no queues. */
/* Returns the 'struct sfq' embedded in 'netdev_''s active tc state. */
sfq_get__(const struct netdev *netdev_)
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    return CONTAINER_OF(netdev->tc, struct sfq, tc);
/* Records freshly allocated SFQ tc state (quantum, perturb) on 'netdev_'.
 * Local bookkeeping only; does not talk to the kernel. */
sfq_install__(struct netdev *netdev_, uint32_t quantum, uint32_t perturb)
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    sfq = xmalloc(sizeof *sfq);
    tc_init(&sfq->tc, &tc_ops_sfq);
    sfq->perturb = perturb;
    sfq->quantum = quantum;

    netdev->tc = &sfq->tc;
/* Replaces 'netdev''s root qdisc with an "sfq" qdisc using 'quantum' and
 * 'perturb'; zero values fall back to MTU-based / fixed defaults. */
sfq_setup_qdisc__(struct netdev *netdev, uint32_t quantum, uint32_t perturb)
    struct tc_sfq_qopt opt;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int mtu_error, error;
    mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);

    tc_del_qdisc(netdev);

    tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
                            NLM_F_EXCL | NLM_F_CREATE, &request);
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    memset(&opt, 0, sizeof opt);
        opt.quantum = mtu; /* if we cannot find mtu, use default */
        opt.quantum = quantum;
        /* Default perturbation period: 10 seconds. */
        opt.perturb_period = 10;
        opt.perturb_period = perturb;

    nl_msg_put_string(&request, TCA_KIND, "sfq");
    nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);

    error = tc_transact(&request, NULL);
        VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
                     "quantum %u, perturb %u error %d(%s)",
                     netdev_get_name(netdev),
                     opt.quantum, opt.perturb_period,
                     error, ovs_strerror(error));
/* Extracts "quantum" and "perturb" from 'details' into 'sfq'; when unset,
 * perturb gets a fixed default and quantum falls back to the device MTU
 * (with a warning if the MTU is unavailable). */
sfq_parse_qdisc_details__(struct netdev *netdev,
                          const struct smap *details, struct sfq *sfq)
    const char *perturb_s;
    const char *quantum_s;

    perturb_s = smap_get(details, "perturb");
    quantum_s = smap_get(details, "quantum");
    sfq->perturb = perturb_s ? strtoull(perturb_s, NULL, 10) : 0;
    sfq->quantum = quantum_s ? strtoull(quantum_s, NULL, 10) : 0;
    if (!sfq->perturb) {
    if (!sfq->quantum) {
        mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
            VLOG_WARN_RL(&rl, "when using SFQ, you must specify quantum on a "
                         "device without mtu");
/* tc_ops "install" callback: configures the kernel sfq qdisc and, on
 * success, records the parameters in local tc state. */
sfq_tc_install(struct netdev *netdev, const struct smap *details)
    sfq_parse_qdisc_details__(netdev, details, &sfq);
    error = sfq_setup_qdisc__(netdev, sfq.quantum, sfq.perturb);
    sfq_install__(netdev, sfq.quantum, sfq.perturb);
3463 sfq_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3465 const struct tc_sfq_qopt *sfq;
3466 struct nlattr *nlattr;
3470 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3472 sfq = nl_attr_get(nlattr);
3473 sfq_install__(netdev, sfq->perturb_period, sfq->quantum);
/* tc_ops "destroy" callback: frees the SFQ tc state. */
sfq_tc_destroy(struct tc *tc)
    struct sfq *sfq = CONTAINER_OF(tc, struct sfq, tc);
/* tc_ops "qdisc_get" callback: reports current SFQ parameters. */
sfq_qdisc_get(const struct netdev *netdev, struct smap *details)
    const struct sfq *sfq = sfq_get__(netdev);
    smap_add_format(details, "quantum", "%u", sfq->quantum);
    smap_add_format(details, "perturb", "%u", sfq->perturb);
/* tc_ops "qdisc_set" callback: applies new SFQ parameters from 'details'
 * and mirrors them into the cached tc state. */
sfq_qdisc_set(struct netdev *netdev, const struct smap *details)
    sfq_parse_qdisc_details__(netdev, details, &sfq);
    sfq_install__(netdev, sfq.quantum, sfq.perturb);
    sfq_get__(netdev)->quantum = sfq.quantum;
    sfq_get__(netdev)->perturb = sfq.perturb;
/* SFQ qdisc operations vtable. */
static const struct tc_ops tc_ops_sfq = {
    "sfq",                      /* linux_name */
    "linux-sfq",                /* ovs_name */
    SFQ_N_QUEUES,               /* n_queues */
/* HTB traffic control class. */

#define HTB_N_QUEUES 0xf000     /* Maximum number of HTB classes (queues). */
#define HTB_RATE2QUANTUM 10     /* r2q divisor handed to the kernel. */

    unsigned int max_rate;      /* In bytes/s. */

    /* Per-queue (per-class) HTB configuration. */
    struct tc_queue tc_queue;
    unsigned int min_rate;      /* In bytes/s. */
    unsigned int max_rate;      /* In bytes/s. */
    unsigned int burst;         /* In bytes. */
    unsigned int priority;      /* Lower values are higher priorities. */
/* Returns the 'struct htb' embedded in 'netdev_''s active tc state. */
htb_get__(const struct netdev *netdev_)
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    return CONTAINER_OF(netdev->tc, struct htb, tc);
/* Records freshly allocated HTB tc state with 'max_rate' on 'netdev_'.
 * Local bookkeeping only; does not talk to the kernel. */
htb_install__(struct netdev *netdev_, uint64_t max_rate)
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    htb = xmalloc(sizeof *htb);
    tc_init(&htb->tc, &tc_ops_htb);
    htb->max_rate = max_rate;

    netdev->tc = &htb->tc;
/* Create an HTB qdisc.
 *
 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
htb_setup_qdisc__(struct netdev *netdev)
    struct tc_htb_glob opt;
    struct ofpbuf request;
    struct tcmsg *tcmsg;

    tc_del_qdisc(netdev);

    tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
                            NLM_F_EXCL | NLM_F_CREATE, &request);
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    nl_msg_put_string(&request, TCA_KIND, "htb");

    memset(&opt, 0, sizeof opt);
    opt.rate2quantum = HTB_RATE2QUANTUM;

    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
    nl_msg_end_nested(&request, opt_offset);

    return tc_transact(&request, NULL);
/* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
htb_setup_class__(struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *class)
    struct tc_htb_opt opt;
    struct ofpbuf request;
    struct tcmsg *tcmsg;

    error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
        VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
                     netdev_get_name(netdev));

    memset(&opt, 0, sizeof opt);
    tc_fill_rate(&opt.rate, class->min_rate, mtu);
    tc_fill_rate(&opt.ceil, class->max_rate, mtu);
    /* Makes sure the quantum is at least MTU.  Setting quantum will
     * make htb ignore the r2q for this class. */
    if ((class->min_rate / HTB_RATE2QUANTUM) < mtu) {
    opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
    opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
    opt.prio = class->priority;

    tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
    tcmsg->tcm_handle = handle;
    tcmsg->tcm_parent = parent;

    nl_msg_put_string(&request, TCA_KIND, "htb");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
    /* Rate tables for both the guaranteed rate and the ceiling. */
    tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
    tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
        VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
                     "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
                     netdev_get_name(netdev),
                     tc_get_major(handle), tc_get_minor(handle),
                     tc_get_major(parent), tc_get_minor(parent),
                     class->min_rate, class->max_rate,
                     class->burst, class->priority, ovs_strerror(error));
/* Parses Netlink attributes in 'options' for HTB parameters and stores a
 * description of them into 'details'.  The description complies with the
 * specification given in the vswitch database documentation for linux-htb
 * queue details. */
htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
    static const struct nl_policy tca_htb_policy[] = {
        [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
                            .min_len = sizeof(struct tc_htb_opt) },

    struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
    const struct tc_htb_opt *htb;

    if (!nl_parse_nested(nl_options, tca_htb_policy,
                         attrs, ARRAY_SIZE(tca_htb_policy))) {
        VLOG_WARN_RL(&rl, "failed to parse HTB class options");

    htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
    class->min_rate = htb->rate.rate;
    class->max_rate = htb->ceil.rate;
    /* 'buffer' is in ticks; convert back to bytes at the class's rate. */
    class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
    class->priority = htb->prio;
/* Parses a tc class message into a queue id, class 'options', and 'stats';
 * each output pointer may be NULL if the caller does not want it. */
htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
                  struct htb_class *options,
                  struct netdev_queue_stats *stats)
    struct nlattr *nl_options;
    unsigned int handle;

    error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
    if (!error && queue_id) {
        unsigned int major = tc_get_major(handle);
        unsigned int minor = tc_get_minor(handle);
        /* Queue ids are HTB class minor numbers shifted down by one. */
        if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
            *queue_id = minor - 1;

    if (!error && options) {
        error = htb_parse_tca_options__(nl_options, options);
/* Extracts "max-rate" (bits/s in the database; stored here as bytes/s) from
 * 'details' into 'hc', defaulting to the detected link speed (or 100 Mbps
 * when unknown) when unset. */
htb_parse_qdisc_details__(struct netdev *netdev_,
                          const struct smap *details, struct htb_class *hc)
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    const char *max_rate_s;

    max_rate_s = smap_get(details, "max-rate");
    hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
    if (!hc->max_rate) {
        enum netdev_features current;

        netdev_linux_read_features(netdev);
        current = !netdev->get_features_error ? netdev->current : 0;
        hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
    hc->min_rate = hc->max_rate;
/* Extracts per-class HTB settings ("min-rate", "max-rate", "burst",
 * "priority") from 'details' into 'hc', clamping them against the MTU and
 * the qdisc-wide maximum rate. */
htb_parse_class_details__(struct netdev *netdev,
                          const struct smap *details, struct htb_class *hc)
    const struct htb *htb = htb_get__(netdev);
    const char *min_rate_s = smap_get(details, "min-rate");
    const char *max_rate_s = smap_get(details, "max-rate");
    const char *burst_s = smap_get(details, "burst");
    const char *priority_s = smap_get(details, "priority");

    error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
        VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
                     netdev_get_name(netdev));

    /* HTB requires at least an mtu sized min-rate to send any traffic even
     * on uncongested links. */
    hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
    hc->min_rate = MAX(hc->min_rate, mtu);
    hc->min_rate = MIN(hc->min_rate, htb->max_rate);

    hc->max_rate = (max_rate_s
                    ? strtoull(max_rate_s, NULL, 10) / 8
    hc->max_rate = MAX(hc->max_rate, hc->min_rate);
    hc->max_rate = MIN(hc->max_rate, htb->max_rate);

    /* According to hints in the documentation that I've read, it is important
     * that 'burst' be at least as big as the largest frame that might be
     * transmitted.  Also, making 'burst' a bit bigger than necessary is OK,
     * but having it a bit too small is a problem.  Since netdev_get_mtu()
     * doesn't include the Ethernet header, we need to add at least 14 (18?) to
     * the MTU.  We actually add 64, instead of 14, as a guard against
     * additional headers get tacked on somewhere that we're not aware of. */
    hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
    hc->burst = MAX(hc->burst, mtu + 64);

    hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Queries the kernel for HTB class 'handle' under 'parent' on 'netdev' and
 * parses the reply into 'options' and 'stats'. */
htb_query_class__(const struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *options,
                  struct netdev_queue_stats *stats)
    struct ofpbuf *reply;

    error = tc_query_class(netdev, handle, parent, &reply);
        error = htb_parse_tcmsg__(reply, NULL, options, stats);
        ofpbuf_delete(reply);
/* tc_ops "install" callback: creates the root HTB qdisc plus the default
 * class 1:fffe, then records local state on success. */
htb_tc_install(struct netdev *netdev, const struct smap *details)
    error = htb_setup_qdisc__(netdev);
        struct htb_class hc;

        htb_parse_qdisc_details__(netdev, details, &hc);
        error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
                                  tc_make_handle(1, 0), &hc);
        htb_install__(netdev, hc.max_rate);
/* Converts a generic tc_queue pointer back to its htb_class container. */
static struct htb_class *
htb_class_cast__(const struct tc_queue *queue)
    return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Creates (if absent) or updates the cached htb_class record for
 * 'queue_id' on 'netdev' with the settings in 'hc'. */
htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
                   const struct htb_class *hc)
    struct htb *htb = htb_get__(netdev);
    size_t hash = hash_int(queue_id, 0);
    struct tc_queue *queue;
    struct htb_class *hcp;

    queue = tc_find_queue__(netdev, queue_id, hash);
        hcp = htb_class_cast__(queue);
        /* Not seen before: allocate and insert a new queue record. */
        hcp = xmalloc(sizeof *hcp);
        queue = &hcp->tc_queue;
        queue->queue_id = queue_id;
        queue->created = time_msec();
        hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);

    hcp->min_rate = hc->min_rate;
    hcp->max_rate = hc->max_rate;
    hcp->burst = hc->burst;
    hcp->priority = hc->priority;
/* tc_load callback for "linux-htb": reconstructs OVS's view of an existing
 * kernel HTB configuration.  Queries the default class 1:fffe for the link
 * max-rate, then dumps every class and caches each one via
 * htb_update_queue__().  NOTE(review): return paths are elided. */
3849 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3852     struct queue_dump_state state;
3853     struct htb_class hc;
3855     /* Get qdisc options. */
3857     htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3858     htb_install__(netdev, hc.max_rate);
     /* Walk all kernel classes and mirror them into the queue cache. */
3861     if (!start_queue_dump(netdev, &state)) {
3864     while (nl_dump_next(&state.dump, &msg, &state.buf)) {
3865         unsigned int queue_id;
3867         if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3868             htb_update_queue__(netdev, queue_id, &hc);
3871     finish_queue_dump(&state);
/* tc_destroy callback for "linux-htb": frees every cached htb_class and the
 * htb wrapper itself.  Uses the _SAFE iterator because entries are removed
 * while iterating. */
3877 htb_tc_destroy(struct tc *tc)
3879     struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
3880     struct htb_class *hc, *next;
3882     HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
3883         hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* qdisc_get callback: reports the qdisc's "max-rate" in bits per second
 * (internal rates are stored in bytes/s, hence the *8). */
3891 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
3893     const struct htb *htb = htb_get__(netdev);
3894     smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
/* qdisc_set callback: re-creates the default class 1:fffe from 'details' and,
 * on success, updates the cached qdisc max-rate.  NOTE(review): the success
 * check guarding the cache update is elided from this listing. */
3899 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
3901     struct htb_class hc;
3904     htb_parse_qdisc_details__(netdev, details, &hc);
3905     error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3906                               tc_make_handle(1, 0), &hc);
3908         htb_get__(netdev)->max_rate = hc.max_rate;
/* class_get callback: exposes a queue's cached HTB parameters in 'details',
 * converting byte rates to bits.  "max-rate" is omitted when it equals
 * "min-rate" (the default produced when no max was configured). */
3914 htb_class_get(const struct netdev *netdev OVS_UNUSED,
3915               const struct tc_queue *queue, struct smap *details)
3917     const struct htb_class *hc = htb_class_cast__(queue);
3919     smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3920     if (hc->min_rate != hc->max_rate) {
3921         smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3923     smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
3925     smap_add_format(details, "priority", "%u", hc->priority);
/* class_set callback: validates/parses 'details', installs kernel class
 * 1:(queue_id+1) under the default class 1:fffe, and refreshes the local
 * cache.  NOTE(review): early-return error paths are elided here. */
3931 htb_class_set(struct netdev *netdev, unsigned int queue_id,
3932               const struct smap *details)
3934     struct htb_class hc;
3937     error = htb_parse_class_details__(netdev, details, &hc);
     /* Queue ids are 0-based in OVS but class minors start at 1. */
3942     error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3943                               tc_make_handle(1, 0xfffe), &hc);
3948     htb_update_queue__(netdev, queue_id, &hc);
/* class_delete callback: deletes kernel class 1:(queue_id+1) and, on success,
 * drops the cached entry.  NOTE(review): the free of 'hc' and the return are
 * elided from this listing. */
3953 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
3955     struct htb_class *hc = htb_class_cast__(queue);
3956     struct htb *htb = htb_get__(netdev);
3959     error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3961         hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* class_get_stats callback: thin wrapper that queries the kernel for the
 * class's statistics only (options pointer is NULL). */
3968 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3969                     struct netdev_queue_stats *stats)
3971     return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3972                              tc_make_handle(1, 0xfffe), NULL, stats);
/* class_dump_stats callback: parses one RTM_*TCLASS dump message, and if the
 * handle names one of our queues (major 1, minor in [1, HTB_N_QUEUES]),
 * invokes 'cb' with the 0-based queue id and its stats. */
3976 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3977                      const struct ofpbuf *nlmsg,
3978                      netdev_dump_queue_stats_cb *cb, void *aux)
3980     struct netdev_queue_stats stats;
3981     unsigned int handle, major, minor;
3984     error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3989     major = tc_get_major(handle);
3990     minor = tc_get_minor(handle);
3991     if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3992         (*cb)(minor - 1, &stats, aux);
/* tc_ops vtable for the "linux-htb" QoS type.  NOTE(review): the middle
 * initializers (tc_install .. class_delete) are elided from this listing. */
3997 static const struct tc_ops tc_ops_htb = {
3998     "htb",                      /* linux_name */
3999     "linux-htb",                /* ovs_name */
4000     HTB_N_QUEUES,               /* n_queues */
4009     htb_class_get_stats,
4010     htb_class_dump_stats
4013 /* "linux-hfsc" traffic control class. */
4015 #define HFSC_N_QUEUES 0xf000
4023 struct tc_queue tc_queue;
/* Returns 'netdev_''s tc state downcast to its containing 'struct hfsc'.
 * Valid only while the netdev's qdisc is linux-hfsc. */
4028 static struct hfsc *
4029 hfsc_get__(const struct netdev *netdev_)
4031     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4032     return CONTAINER_OF(netdev->tc, struct hfsc, tc);
/* Downcasts generic 'queue' to its containing 'struct hfsc_class'. */
4035 static struct hfsc_class *
4036 hfsc_class_cast__(const struct tc_queue *queue)
4038     return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Allocates and attaches a fresh hfsc tc state to 'netdev_', recording
 * 'max_rate' (bytes/s) as the link ceiling. */
4042 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
4044     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4047     hfsc = xmalloc(sizeof *hfsc);
4048     tc_init(&hfsc->tc, &tc_ops_hfsc);
4049     hfsc->max_rate = max_rate;
4050     netdev->tc = &hfsc->tc;
/* Creates or updates the cached hfsc_class for 'queue_id' on 'netdev',
 * mirroring the parameters in 'hc'.  Mirrors htb_update_queue__() for the
 * HFSC qdisc. */
4054 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
4055                     const struct hfsc_class *hc)
4059     struct hfsc_class *hcp;
4060     struct tc_queue *queue;
4062     hfsc = hfsc_get__(netdev);
4063     hash = hash_int(queue_id, 0);
4065     queue = tc_find_queue__(netdev, queue_id, hash);
4067         hcp = hfsc_class_cast__(queue);
     /* No cached entry yet: allocate and register one. */
4069         hcp = xmalloc(sizeof *hcp);
4070         queue = &hcp->tc_queue;
4071         queue->queue_id = queue_id;
4072         queue->created = time_msec();
4073         hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
4076     hcp->min_rate = hc->min_rate;
4077     hcp->max_rate = hc->max_rate;
/* Parses the nested TCA_OPTIONS attribute of an HFSC class into 'class'.
 * Only linear service curves (m1 == 0, d == 0) are supported; the real-time
 * curve must match the link-share curve, and min-rate must not exceed
 * max-rate.  NOTE(review): the error returns after each warning are elided
 * from this listing. */
4081 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
4083     const struct tc_service_curve *rsc, *fsc, *usc;
4084     static const struct nl_policy tca_hfsc_policy[] = {
4086             .type = NL_A_UNSPEC,
4088             .min_len = sizeof(struct tc_service_curve),
4091             .type = NL_A_UNSPEC,
4093             .min_len = sizeof(struct tc_service_curve),
4096             .type = NL_A_UNSPEC,
4098             .min_len = sizeof(struct tc_service_curve),
4101     struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
4103     if (!nl_parse_nested(nl_options, tca_hfsc_policy,
4104                          attrs, ARRAY_SIZE(tca_hfsc_policy))) {
4105         VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
     /* RSC = real-time, FSC = link-share, USC = upper-limit service curve. */
4109     rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
4110     fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
4111     usc = nl_attr_get(attrs[TCA_HFSC_USC]);
4113     if (rsc->m1 != 0 || rsc->d != 0 ||
4114         fsc->m1 != 0 || fsc->d != 0 ||
4115         usc->m1 != 0 || usc->d != 0) {
4116         VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4117                      "Non-linear service curves are not supported.");
4121     if (rsc->m2 != fsc->m2) {
4122         VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4123                      "Real-time service curves are not supported ");
4127     if (rsc->m2 > usc->m2) {
4128         VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4129                      "Min-rate service curve is greater than "
4130                      "the max-rate service curve.");
     /* Steady-state slopes (m2) give the min/max rates in bytes/s. */
4134     class->min_rate = fsc->m2;
4135     class->max_rate = usc->m2;
/* Parses a single HFSC class netlink message: extracts the 0-based queue id
 * (from the handle's minor number), the class options, and/or stats; each
 * output pointer may be null.  NOTE(review): the "unexpected handle" error
 * path and the returns are elided from this listing. */
4140 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
4141                    struct hfsc_class *options,
4142                    struct netdev_queue_stats *stats)
4145     unsigned int handle;
4146     struct nlattr *nl_options;
4148     error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
4154         unsigned int major, minor;
4156         major = tc_get_major(handle);
4157         minor = tc_get_minor(handle);
4158         if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
4159             *queue_id = minor - 1;
4166         error = hfsc_parse_tca_options__(nl_options, options);
/* Queries the kernel for the HFSC class 'handle' under 'parent' on 'netdev'
 * and parses the reply into 'options'/'stats'.  NOTE(review): the error check
 * between query and parse, and the return, are elided here. */
4173 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
4174                    unsigned int parent, struct hfsc_class *options,
4175                    struct netdev_queue_stats *stats)
4178     struct ofpbuf *reply;
4180     error = tc_query_class(netdev, handle, parent, &reply);
4185     error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
4186     ofpbuf_delete(reply);
/* Fills 'class' from qdisc-level 'details': "max-rate" is given in bits/s and
 * stored in bytes/s.  When absent (the elided branch), falls back to the
 * link's current feature-derived speed, defaulting to 100 Mbps.  Both curves
 * are set to the same rate for the default class. */
4191 hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
4192                            struct hfsc_class *class)
4194     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4196     const char *max_rate_s;
4198     max_rate_s = smap_get(details, "max-rate");
4199     max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
4202         enum netdev_features current;
4204         netdev_linux_read_features(netdev);
4205         current = !netdev->get_features_error ? netdev->current : 0;
4206         max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
4209     class->min_rate = max_rate;
4210     class->max_rate = max_rate;
/* Fills 'class' from per-queue 'details', clamping: min-rate to
 * [1, qdisc max_rate] and max-rate to [min-rate, qdisc max_rate].  Rates in
 * 'details' are bits/s; stored values are bytes/s. */
4214 hfsc_parse_class_details__(struct netdev *netdev,
4215                            const struct smap *details,
4216                            struct hfsc_class * class)
4218     const struct hfsc *hfsc;
4219     uint32_t min_rate, max_rate;
4220     const char *min_rate_s, *max_rate_s;
4222     hfsc = hfsc_get__(netdev);
4223     min_rate_s = smap_get(details, "min-rate");
4224     max_rate_s = smap_get(details, "max-rate");
4226     min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
4227     min_rate = MAX(min_rate, 1);
4228     min_rate = MIN(min_rate, hfsc->max_rate);
4230     max_rate = (max_rate_s
4231                 ? strtoull(max_rate_s, NULL, 10) / 8
4233     max_rate = MAX(max_rate, min_rate);
4234     max_rate = MIN(max_rate, hfsc->max_rate);
4236     class->min_rate = min_rate;
4237     class->max_rate = max_rate;
4242 /* Create an HFSC qdisc.
4244  * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1".
 *
 * Deletes any existing root qdisc first, then sends RTM_NEWQDISC with a
 * zeroed tc_hfsc_qopt.  NOTE(review): the tc_make_request() failure check is
 * elided from this listing. */
4246 hfsc_setup_qdisc__(struct netdev * netdev)
4248     struct tcmsg *tcmsg;
4249     struct ofpbuf request;
4250     struct tc_hfsc_qopt opt;
4252     tc_del_qdisc(netdev);
4254     tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
4255                             NLM_F_EXCL | NLM_F_CREATE, &request);
4261     tcmsg->tcm_handle = tc_make_handle(1, 0);
4262     tcmsg->tcm_parent = TC_H_ROOT;
4264     memset(&opt, 0, sizeof opt);
4267     nl_msg_put_string(&request, TCA_KIND, "hfsc");
4268     nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
4270     return tc_transact(&request, NULL);
4273 /* Create an HFSC class.
4275  * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
4276  * sc rate <min_rate> ul rate <max_rate>"
 *
 * Builds linear service curves: 'min' (used for both RSC and FSC) from
 * class->min_rate and 'max' (USC) from class->max_rate; the m1/d members are
 * presumably zeroed on the elided lines (4296-4301) — confirm against the
 * full source.  Logs a rate-limited warning on failure. */
4278 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
4279                    unsigned int parent, struct hfsc_class *class)
4283     struct tcmsg *tcmsg;
4284     struct ofpbuf request;
4285     struct tc_service_curve min, max;
4287     tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
4293     tcmsg->tcm_handle = handle;
4294     tcmsg->tcm_parent = parent;
4298     min.m2 = class->min_rate;
4302     max.m2 = class->max_rate;
4304     nl_msg_put_string(&request, TCA_KIND, "hfsc");
4305     opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4306     nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
4307     nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
4308     nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
4309     nl_msg_end_nested(&request, opt_offset);
4311     error = tc_transact(&request, NULL);
4313         VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
4314                      "min-rate %ubps, max-rate %ubps (%s)",
4315                      netdev_get_name(netdev),
4316                      tc_get_major(handle), tc_get_minor(handle),
4317                      tc_get_major(parent), tc_get_minor(parent),
4318                      class->min_rate, class->max_rate, ovs_strerror(error));
/* tc_install callback for "linux-hfsc": creates the root HFSC qdisc and its
 * default class 1:fffe, then records the new tc state.  NOTE(review): error
 * checks between steps are elided from this listing. */
4325 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
4328     struct hfsc_class class;
4330     error = hfsc_setup_qdisc__(netdev);
4336     hfsc_parse_qdisc_details__(netdev, details, &class);
4337     error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4338                                tc_make_handle(1, 0), &class);
4344     hfsc_install__(netdev, class.max_rate);
/* tc_load callback for "linux-hfsc": reconstructs OVS's view of an existing
 * kernel HFSC configuration by querying the default class for the max rate
 * and then dumping and caching every class.  NOTE(review): return paths are
 * elided. */
4349 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4352     struct queue_dump_state state;
4353     struct hfsc_class hc;
4356     hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
4357     hfsc_install__(netdev, hc.max_rate);
4359     if (!start_queue_dump(netdev, &state)) {
4363     while (nl_dump_next(&state.dump, &msg, &state.buf)) {
4364         unsigned int queue_id;
4366         if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
4367             hfsc_update_queue__(netdev, queue_id, &hc);
4371     finish_queue_dump(&state);
/* tc_destroy callback for "linux-hfsc": frees every cached hfsc_class and the
 * hfsc wrapper itself (frees happen on the elided lines). */
4376 hfsc_tc_destroy(struct tc *tc)
4379     struct hfsc_class *hc, *next;
4381     hfsc = CONTAINER_OF(tc, struct hfsc, tc);
4383     HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
4384         hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* qdisc_get callback: reports the HFSC qdisc's "max-rate" in bits/s
 * (stored internally in bytes/s). */
4393 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
4395     const struct hfsc *hfsc;
4396     hfsc = hfsc_get__(netdev);
4397     smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
/* qdisc_set callback: re-creates default class 1:fffe from 'details' and, on
 * success (check elided), refreshes the cached max rate. */
4402 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
4405     struct hfsc_class class;
4407     hfsc_parse_qdisc_details__(netdev, details, &class);
4408     error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4409                                tc_make_handle(1, 0), &class);
4412         hfsc_get__(netdev)->max_rate = class.max_rate;
/* class_get callback: exposes a queue's cached min/max rates in bits/s;
 * "max-rate" is omitted when equal to "min-rate". */
4419 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
4420                const struct tc_queue *queue, struct smap *details)
4422     const struct hfsc_class *hc;
4424     hc = hfsc_class_cast__(queue);
4425     smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
4426     if (hc->min_rate != hc->max_rate) {
4427         smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
/* class_set callback: parses and clamps 'details', installs kernel class
 * 1:(queue_id+1) under 1:fffe, and refreshes the cache.  NOTE(review):
 * early-return error paths are elided from this listing. */
4433 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
4434                const struct smap *details)
4437     struct hfsc_class class;
4439     error = hfsc_parse_class_details__(netdev, details, &class);
4444     error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
4445                                tc_make_handle(1, 0xfffe), &class);
4450     hfsc_update_queue__(netdev, queue_id, &class);
/* class_delete callback: removes kernel class 1:(queue_id+1) and, on success,
 * drops (and, on elided lines, frees) the cached entry. */
4455 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
4459     struct hfsc_class *hc;
4461     hc = hfsc_class_cast__(queue);
4462     hfsc = hfsc_get__(netdev);
4464     error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
4466         hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* class_get_stats callback: queries only the statistics for the class
 * (options pointer is NULL). */
4473 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
4474                      struct netdev_queue_stats *stats)
4476     return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
4477                               tc_make_handle(1, 0xfffe), NULL, stats);
/* class_dump_stats callback: parses one class dump message and invokes 'cb'
 * with the 0-based queue id when the handle matches one of our HFSC queues
 * (major 1, minor in [1, HFSC_N_QUEUES]). */
4481 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
4482                       const struct ofpbuf *nlmsg,
4483                       netdev_dump_queue_stats_cb *cb, void *aux)
4485     struct netdev_queue_stats stats;
4486     unsigned int handle, major, minor;
4489     error = tc_parse_class(nlmsg, &handle, NULL, &stats);
4494     major = tc_get_major(handle);
4495     minor = tc_get_minor(handle);
4496     if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
4497         (*cb)(minor - 1, &stats, aux);
/* tc_ops vtable for the "linux-hfsc" QoS type. */
4502 static const struct tc_ops tc_ops_hfsc = {
4503     "hfsc",                     /* linux_name */
4504     "linux-hfsc",               /* ovs_name */
4505     HFSC_N_QUEUES,              /* n_queues */
4506     hfsc_tc_install,            /* tc_install */
4507     hfsc_tc_load,               /* tc_load */
4508     hfsc_tc_destroy,            /* tc_destroy */
4509     hfsc_qdisc_get,             /* qdisc_get */
4510     hfsc_qdisc_set,             /* qdisc_set */
4511     hfsc_class_get,             /* class_get */
4512     hfsc_class_set,             /* class_set */
4513     hfsc_class_delete,          /* class_delete */
4514     hfsc_class_get_stats,       /* class_get_stats */
4515     hfsc_class_dump_stats       /* class_dump_stats */
4518 /* "linux-default" traffic control class.
4520  * This class represents the default, unnamed Linux qdisc.  It corresponds to
4521  * the "" (empty string) QoS type in the OVS database. */
 
/* Points 'netdev_''s tc state at a shared, immutable singleton.  No kernel
 * interaction is needed for the default qdisc. */
4524 default_install__(struct netdev *netdev_)
4526     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4527     static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
4529     /* Nothing but a tc class implementation is allowed to write to a tc.  This
4530      * class never does that, so we can legitimately use a const tc object. */
4531     netdev->tc = CONST_CAST(struct tc *, &tc);
/* tc_install callback for "": installing the default qdisc is a no-op apart
 * from recording the singleton state ('details' is ignored). */
4535 default_tc_install(struct netdev *netdev,
4536                    const struct smap *details OVS_UNUSED)
4538     default_install__(netdev);
/* tc_load callback for "": simply records the shared default tc state. */
4543 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4545     default_install__(netdev);
/* tc_ops vtable for the default qdisc ("" QoS type).  Queue operations are
 * all NULL: the default qdisc exposes no configurable queues.  NOTE(review):
 * the ovs_name/n_queues/tc_install/tc_load initializers are elided from this
 * listing. */
4549 static const struct tc_ops tc_ops_default = {
4550     NULL,                       /* linux_name */
4555     NULL,                       /* tc_destroy */
4556     NULL,                       /* qdisc_get */
4557     NULL,                       /* qdisc_set */
4558     NULL,                       /* class_get */
4559     NULL,                       /* class_set */
4560     NULL,                       /* class_delete */
4561     NULL,                       /* class_get_stats */
4562     NULL                        /* class_dump_stats */
4565 /* "linux-other" traffic control class. */
 
/* tc_load callback for "linux-other": used when the kernel qdisc is one OVS
 * does not model.  Records a shared, immutable singleton tc state. */
4570 other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
4572     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4573     static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
4575     /* Nothing but a tc class implementation is allowed to write to a tc.  This
4576      * class never does that, so we can legitimately use a const tc object. */
4577     netdev->tc = CONST_CAST(struct tc *, &tc);
/* tc_ops vtable for "linux-other" (unrecognized kernel qdiscs): read-only,
 * so nearly every operation is NULL.  NOTE(review): the n_queues and tc_load
 * initializers are elided from this listing. */
4581 static const struct tc_ops tc_ops_other = {
4582     NULL,                       /* linux_name */
4583     "linux-other",              /* ovs_name */
4585     NULL,                       /* tc_install */
4587     NULL,                       /* tc_destroy */
4588     NULL,                       /* qdisc_get */
4589     NULL,                       /* qdisc_set */
4590     NULL,                       /* class_get */
4591     NULL,                       /* class_set */
4592     NULL,                       /* class_delete */
4593     NULL,                       /* class_get_stats */
4594     NULL                        /* class_dump_stats */
4597 /* Traffic control. */
4599 /* Number of kernel "tc" ticks per second.  Initialized lazily from
 * /proc/net/psched (see the psched-reading code below). */
4600 static double ticks_per_s;
4602 /* Number of kernel "jiffies" per second.  This is used for the purpose of
4603  * computing buffer sizes.  Generally kernel qdiscs need to be able to buffer
4604  * one jiffy's worth of data.
4606  * There are two possibilities here:
4608  *    - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
4609  *      approximate range of 100 to 1024.  That means that we really need to
4610  *      make sure that the qdisc can buffer that much data.
4612  *    - 'buffer_hz' is an absurdly large number.  That means that the kernel
4613  *      has finely granular timers and there's no need to fudge additional room
4614  *      for buffers.  (There's no extra effort needed to implement that: the
4615  *      large 'buffer_hz' is used as a divisor, so practically any number will
4616  *      come out as 0 in the division.  Small integer results in the case of
4617  *      really high dividends won't have any real effect anyhow.)
4619 static unsigned int buffer_hz;
4621 /* Returns tc handle 'major':'minor'.  TC_H_MAKE masks each part into the
 * appropriate 16-bit field of the 32-bit handle. */
4623 tc_make_handle(unsigned int major, unsigned int minor)
4625     return TC_H_MAKE(major << 16, minor);
4628 /* Returns the major number from 'handle' (the upper 16 bits). */
4630 tc_get_major(unsigned int handle)
4632     return TC_H_MAJ(handle) >> 16;
4635 /* Returns the minor number from 'handle' (the lower 16 bits). */
4637 tc_get_minor(unsigned int handle)
4639     return TC_H_MIN(handle);
/* Initializes 'request' as an rtnetlink tc message of the given 'type'
 * (e.g. RTM_NEWQDISC) and 'flags' for 'netdev', and returns a pointer to the
 * embedded tcmsg for the caller to finish filling in.  NOTE(review): the
 * get_ifindex() failure branch (presumably returning NULL) is elided here —
 * confirm against the full source. */
4642 static struct tcmsg *
4643 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
4644                 struct ofpbuf *request)
4646     struct tcmsg *tcmsg;
4650     error = get_ifindex(netdev, &ifindex);
4655     ofpbuf_init(request, 512);
4656     nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
4657     tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
4658     tcmsg->tcm_family = AF_UNSPEC;
4659     tcmsg->tcm_ifindex = ifindex;
4660     /* Caller should fill in tcmsg->tcm_handle. */
4661     /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' on the NETLINK_ROUTE socket, optionally capturing the reply
 * in '*replyp', and always releases the request buffer.  Returns 0 or a
 * positive errno value. */
4667 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
4669     int error = nl_transact(NETLINK_ROUTE, request, replyp);
4670     ofpbuf_uninit(request);
4674 /* Adds or deletes a root ingress qdisc on 'netdev'.  We use this for
4675  * policing configuration.
4677  * This function is equivalent to running the following when 'add' is true:
4678  *     /sbin/tc qdisc add dev <devname> handle ffff: ingress
4680  * This function is equivalent to running the following when 'add' is false:
4681  *     /sbin/tc qdisc del dev <devname> handle ffff: ingress
4683  * The configuration and stats may be seen with the following command:
4684  *     /sbin/tc -s qdisc show dev <devname>
4686  * Returns 0 if successful, otherwise a positive errno value.
 */
4689 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
4691     struct ofpbuf request;
4692     struct tcmsg *tcmsg;
4694     int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
4695     int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
4697     tcmsg = tc_make_request(netdev, type, flags, &request);
     /* Ingress qdiscs always live at handle ffff: under TC_H_INGRESS. */
4701     tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
4702     tcmsg->tcm_parent = TC_H_INGRESS;
4703     nl_msg_put_string(&request, TCA_KIND, "ingress");
4704     nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
4706     error = tc_transact(&request, NULL);
4708         /* If we're deleting the qdisc, don't worry about some of the
4709          * error conditions. */
4710         if (!add && (error == ENOENT || error == EINVAL)) {
4719 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
4722  * This function is equivalent to running:
4723  *     /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
4724  *              basic police rate <kbits_rate>kbit burst <kbits_burst>k
4727  * The configuration and stats may be seen with the following command:
4728  *     /sbin/tc -s filter show dev <devname> parent ffff:
4730  * Returns 0 if successful, otherwise a positive errno value.
 *
 * NOTE(review): the declaration/initialization of 'mtu' and 'error' are on
 * elided lines — confirm the mtu default against the full source. */
4733 tc_add_policer(struct netdev *netdev,
4734                uint32_t kbits_rate, uint32_t kbits_burst)
4736     struct tc_police tc_police;
4737     struct ofpbuf request;
4738     struct tcmsg *tcmsg;
4739     size_t basic_offset;
4740     size_t police_offset;
4744     memset(&tc_police, 0, sizeof tc_police);
4745     tc_police.action = TC_POLICE_SHOT;
4746     tc_police.mtu = mtu;
4747     tc_fill_rate(&tc_police.rate, ((uint64_t) kbits_rate * 1000)/8, mtu);
4749     /* The following appears wrong in two ways:
4751      * - tc_bytes_to_ticks() should take "bytes" as quantity for both of its
4752      *   arguments (or at least consistently "bytes" as both or "bits" as
4753      *   both), but this supplies bytes for the first argument and bits for the
4756      * - In networking a kilobit is usually 1000 bits but this uses 1024 bits.
4758      * However if you "fix" those problems then "tc filter show ..." shows
4759      * "125000b", meaning 125,000 bits, when OVS configures it for 1000 kbit ==
4760      * 1,000,000 bits, whereas this actually ends up doing the right thing from
4761      * tc's point of view.  Whatever. */
4762     tc_police.burst = tc_bytes_to_ticks(
4763         tc_police.rate.rate, MIN(UINT32_MAX / 1024, kbits_burst) * 1024);
4765     tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
4766                             NLM_F_EXCL | NLM_F_CREATE, &request);
     /* Attach the filter under the ingress qdisc (ffff:) at priority 49,
      * matching all protocols. */
4770     tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
4771     tcmsg->tcm_info = tc_make_handle(49,
4772                                      (OVS_FORCE uint16_t) htons(ETH_P_ALL));
4774     nl_msg_put_string(&request, TCA_KIND, "basic");
4775     basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4776     police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
4777     nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
4778     tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
4779     nl_msg_end_nested(&request, police_offset);
4780     nl_msg_end_nested(&request, basic_offset);
4782     error = tc_transact(&request, NULL);
/* NOTE(review): the enclosing function's signature is on elided lines; this
 * appears to be the once-only reader of /proc/net/psched that initializes
 * the file-scope 'ticks_per_s' and 'buffer_hz' — confirm against the full
 * source. */
4793     /* The values in psched are not individually very meaningful, but they are
4794      * important.  The tables below show some values seen in the wild.
4798      *   - "c" has always been a constant 1000000 since at least Linux 2.4.14.
4799      *     (Before that, there are hints that it was 1000000000.)
4801      *   - "d" can be unrealistically large, see the comment on 'buffer_hz'
4805      * -----------------------------------
4806      * [1] 000c8000 000f4240 000f4240 00000064
4807      * [2] 000003e8 00000400 000f4240 3b9aca00
4808      * [3] 000003e8 00000400 000f4240 3b9aca00
4809      * [4] 000003e8 00000400 000f4240 00000064
4810      * [5] 000003e8 00000040 000f4240 3b9aca00
4811      * [6] 000003e8 00000040 000f4240 000000f9
4813      *           a         b          c             d ticks_per_s     buffer_hz
4814      *     ------- --------- ---------- ------------- ----------- -------------
4815      * [1] 819,200 1,000,000  1,000,000           100     819,200           100
4816      * [2]   1,000     1,024  1,000,000 1,000,000,000     976,562 1,000,000,000
4817      * [3]   1,000     1,024  1,000,000 1,000,000,000     976,562 1,000,000,000
4818      * [4]   1,000     1,024  1,000,000           100     976,562           100
4819      * [5]   1,000        64  1,000,000 1,000,000,000  15,625,000 1,000,000,000
4820      * [6]   1,000        64  1,000,000           249  15,625,000           249
4822      * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
4823      * [2] 2.6.26-1-686-bigmem from Debian lenny
4824      * [3] 2.6.26-2-sparc64 from Debian lenny
4825      * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
4826      * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
4827      * [6] 2.6.34 from kernel.org on KVM
 */
4829     static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4830     static const char fn[] = "/proc/net/psched";
4831     unsigned int a, b, c, d;
     /* Only the first caller does the work; everyone else skips ahead. */
4834     if (!ovsthread_once_start(&once)) {
4841     stream = fopen(fn, "r");
4843         VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
4847     if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
4848         VLOG_WARN("%s: read failed", fn);
4852     VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
4856         VLOG_WARN("%s: invalid scheduler parameters", fn);
     /* ticks_per_s = a * c / b; 'd' determines buffer_hz (elided branch). */
4860     ticks_per_s = (double) a * c / b;
4864         VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
4867     VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
4870     ovsthread_once_done(&once);
4873 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
4874  * rate of 'rate' bytes per second.  Depends on 'ticks_per_s' having been
 * initialized from /proc/net/psched. */
4876 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
4879     return (rate * ticks) / ticks_per_s;
4882 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
4883  * rate of 'rate' bytes per second.  Returns 0 for a zero rate to avoid
 * dividing by zero. */
4885 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
4888     return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
4891 /* Returns the number of bytes that need to be reserved for qdisc buffering at
4892  * a transmission rate of 'rate' bytes per second: one jiffy's worth of data
 * (see the comment on 'buffer_hz' above). */
4894 tc_buffer_per_jiffy(unsigned int rate)
4897     return rate / buffer_hz;
4900 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
4901  * e.g. "htb", into '*kind' (if it is nonnull).  If 'options' is nonnull,
4902  * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
4903  * stores NULL into it if it is absent.
4905  * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
4908  * Returns 0 if successful, otherwise a positive errno value.
 *
 * NOTE(review): the parse-failure return value and the null checks around
 * 'kind'/'options' are on elided lines. */
4910 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
4911                struct nlattr **options)
4913     static const struct nl_policy tca_policy[] = {
4914         [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
4915         [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
4917     struct nlattr *ta[ARRAY_SIZE(tca_policy)];
     /* Attributes start after the netlink header plus the tcmsg. */
4919     if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4920                          tca_policy, ta, ARRAY_SIZE(ta))) {
4921         VLOG_WARN_RL(&rl, "failed to parse qdisc message");
4926         *kind = nl_attr_get_string(ta[TCA_KIND]);
4930         *options = ta[TCA_OPTIONS];
4945 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
4946  * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
4947  * into '*options', and its queue statistics into '*stats'.  Any of the output
4948  * arguments may be null.
4950  * Returns 0 if successful, otherwise a positive errno value. */
4952 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
4953                struct nlattr **options, struct netdev_queue_stats *stats)
4955     static const struct nl_policy tca_policy[] = {
4956         [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
4957         [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
4959     struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4961     if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4962                          tca_policy, ta, ARRAY_SIZE(ta))) {
4963         VLOG_WARN_RL(&rl, "failed to parse class message");
     /* The class handle lives in the fixed tcmsg header, not an attribute. */
4968         struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
4969         *handlep = tc->tcm_handle;
4973         *options = ta[TCA_OPTIONS];
4977         const struct gnet_stats_queue *gsq;
4978         struct gnet_stats_basic gsb;
4980         static const struct nl_policy stats_policy[] = {
4981             [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
4982                                   .min_len = sizeof gsb },
4983             [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
4984                                   .min_len = sizeof *gsq },
4986         struct nlattr *sa[ARRAY_SIZE(stats_policy)];
4988         if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
4989                              sa, ARRAY_SIZE(sa))) {
4990             VLOG_WARN_RL(&rl, "failed to parse class stats");
4994         /* Alignment issues screw up the length of struct gnet_stats_basic on
4995          * some arch/bitsize combinations.  Newer versions of Linux have a
4996          * struct gnet_stats_basic_packed, but we can't depend on that.  The
4997          * easiest thing to do is just to make a copy. */
4998         memset(&gsb, 0, sizeof gsb);
4999         memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
5000                MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
5001         stats->tx_bytes = gsb.bytes;
5002         stats->tx_packets = gsb.packets;
5004         gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
5005         stats->tx_errors = gsq->drops;
     /* Error path (elided context): zero the caller's stats so they are
      * never left uninitialized. */
5015         memset(stats, 0, sizeof *stats);
5020 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
 * on 'netdev', storing the reply (owned by the caller) in '*replyp'.
 * NLM_F_ECHO makes the kernel echo the class back in its reply.  Logs a
 * rate-limited warning and (on elided lines) returns the errno on failure. */
5023 tc_query_class(const struct netdev *netdev,
5024                unsigned int handle, unsigned int parent,
5025                struct ofpbuf **replyp)
5027     struct ofpbuf request;
5028     struct tcmsg *tcmsg;
5031     tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
5035     tcmsg->tcm_handle = handle;
5036     tcmsg->tcm_parent = parent;
5038     error = tc_transact(&request, replyp);
5040         VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
5041                      netdev_get_name(netdev),
5042                      tc_get_major(handle), tc_get_minor(handle),
5043                      tc_get_major(parent), tc_get_minor(parent),
5044                      ovs_strerror(error));
5049 /* Equivalent to "tc class del dev <name> handle <handle>".
 * Logs a rate-limited warning on failure; the return is on elided lines. */
5051 tc_delete_class(const struct netdev *netdev, unsigned int handle)
5053     struct ofpbuf request;
5054     struct tcmsg *tcmsg;
5057     tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
5061     tcmsg->tcm_handle = handle;
5062     tcmsg->tcm_parent = 0;
5064     error = tc_transact(&request, NULL);
5066         VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
5067                      netdev_get_name(netdev),
5068                      tc_get_major(handle), tc_get_minor(handle),
5069                      ovs_strerror(error));
5074 /* Equivalent to "tc qdisc del dev <name> root".
 * Treats EINVAL as success (the default qdisc was already in place) and, on
 * success, tears down any cached OVS tc state for the netdev. */
5076 tc_del_qdisc(struct netdev *netdev_)
5078     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5079     struct ofpbuf request;
5080     struct tcmsg *tcmsg;
5083     tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
5087     tcmsg->tcm_handle = tc_make_handle(1, 0);
5088     tcmsg->tcm_parent = TC_H_ROOT;
5090     error = tc_transact(&request, NULL);
5091     if (error == EINVAL) {
5092         /* EINVAL probably means that the default qdisc was in use, in which
5093          * case we've accomplished our purpose. */
5096     if (!error && netdev->tc) {
5097         if (netdev->tc->ops->tc_destroy) {
5098             netdev->tc->ops->tc_destroy(netdev->tc);
/* Returns true if issuing RTM_GETQDISC with TC_H_ROOT is safe on this kernel.
 * Kernels before 2.6.35 can OOPS on that request (see the comment in
 * tc_query_qdisc()); detection is by parsing the uname release string once.
 * NOTE(review): the line setting 'safe = true' for new-enough kernels is
 * elided from this listing. */
5106 getqdisc_is_safe(void)
5108     static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5109     static bool safe = false;
5111     if (ovsthread_once_start(&once)) {
5112         struct utsname utsname;
5115         if (uname(&utsname) == -1) {
5116             VLOG_WARN("uname failed (%s)", ovs_strerror(errno));
5117         } else if (!ovs_scan(utsname.release, "%d.%d", &major, &minor)) {
5118             VLOG_WARN("uname reported bad OS release (%s)", utsname.release);
5119         } else if (major < 2 || (major == 2 && minor < 35)) {
5120             VLOG_INFO("disabling unsafe RTM_GETQDISC in Linux kernel %s",
5125         ovsthread_once_done(&once);
5130 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
5131  * kernel to determine what they are.  Returns 0 if successful, otherwise a
5132  * positive errno value.
 *
 * Selects one of the tc_ops implementations (htb/hfsc/default/other) based on
 * the kernel's reply, then calls its tc_load() to populate netdev->tc. */
5134 tc_query_qdisc(const struct netdev *netdev_)
5136     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5137     struct ofpbuf request, *qdisc;
5138     const struct tc_ops *ops;
5139     struct tcmsg *tcmsg;
5147     /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
5148      * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
5149      * 2.6.35 without that fix backported to it.
5151      * To avoid the OOPS, we must not make a request that would attempt to dump
5152      * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
5153      * few others.  There are a few ways that I can see to do this, but most of
5154      * them seem to be racy (and if you lose the race the kernel OOPSes).  The
5155      * technique chosen here is to assume that any non-default qdisc that we
5156      * create will have a class with handle 1:0.  The built-in qdiscs only have
5157      * a class with handle 0:0.
5159      * On Linux 2.6.35+ we use the straightforward method because it allows us
5160      * to handle non-builtin qdiscs without handle 1:0 (e.g. codel).  However,
5161      * in such a case we get no response at all from the kernel (!) if a
5162      * builtin qdisc is in use (which is later caught by "!error &&
5163      * !qdisc->size"). */
5164     tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
5168     tcmsg->tcm_handle = tc_make_handle(getqdisc_is_safe() ? 0 : 1, 0);
5169     tcmsg->tcm_parent = getqdisc_is_safe() ? TC_H_ROOT : 0;
5171     /* Figure out what tc class to instantiate. */
5172     error = tc_transact(&request, &qdisc);
5173     if (!error && qdisc->size) {
5176         error = tc_parse_qdisc(qdisc, &kind, NULL);
5178             ops = &tc_ops_other;
5180             ops = tc_lookup_linux_name(kind);
     /* Known to the kernel but not modeled by OVS: treat as "linux-other". */
5182                 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
5183                 VLOG_DBG_RL(&rl2, "unknown qdisc \"%s\"", kind);
5185                 ops = &tc_ops_other;
5188     } else if ((!error && !qdisc->size) || error == ENOENT) {
5189         /* Either it's a built-in qdisc, or (on Linux pre-2.6.35) it's a qdisc
5190          * set up by some other entity that doesn't have a handle 1:0.  We will
5191          * assume that it's the system default qdisc. */
5192         ops = &tc_ops_default;
5195         /* Who knows?  Maybe the device got deleted. */
5196         VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
5197                      netdev_get_name(netdev_), ovs_strerror(error));
5198         ops = &tc_ops_other;
5201     /* Instantiate it. */
5202     load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
5203     ovs_assert((load_error == 0) == (netdev->tc != NULL));
5204     ofpbuf_delete(qdisc);
5206     return error ? error : load_error;
5209 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
5210 approximate the time to transmit packets of various lengths. For an MTU of
5211 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
5212 represents two possible packet lengths; for a MTU of 513 through 1024, four
5213 possible lengths; and so on.
5215 Returns, for the specified 'mtu', the number of bits that packet lengths
5216 need to be shifted right to fit within such a 256-entry table. */
5218 tc_calc_cell_log(unsigned int mtu)
5223 mtu = ETH_PAYLOAD_MAX;
5225 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
5227 for (cell_log = 0; mtu >= 256; cell_log++) {
5234 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
5237 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
5239 memset(rate, 0, sizeof *rate);
5240 rate->cell_log = tc_calc_cell_log(mtu);
5241 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
5242 /* rate->cell_align = 0; */ /* distro headers. */
5243 rate->mpu = ETH_TOTAL_MIN;
5247 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
5248 * attribute of the specified "type".
5250 * See tc_calc_cell_log() above for a description of "rtab"s. */
5252 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
5257 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
5258 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
5259 unsigned packet_size = (i + 1) << rate->cell_log;
5260 if (packet_size < rate->mpu) {
5261 packet_size = rate->mpu;
5263 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * 0 is fine.) */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    /* The burst must at least cover one jiffy's worth of traffic plus a
     * full-size packet, or HTB cannot sustain the configured rate. */
    unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;

    return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
}
5278 /* Linux-only functions declared in netdev-linux.h */
5280 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
5281 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
5283 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
5284 const char *flag_name, bool enable)
5286 const char *netdev_name = netdev_get_name(netdev);
5287 struct ethtool_value evalue;
5291 COVERAGE_INC(netdev_get_ethtool);
5292 memset(&evalue, 0, sizeof evalue);
5293 error = netdev_linux_do_ethtool(netdev_name,
5294 (struct ethtool_cmd *)&evalue,
5295 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5300 COVERAGE_INC(netdev_set_ethtool);
5301 new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
5302 if (new_flags == evalue.data) {
5305 evalue.data = new_flags;
5306 error = netdev_linux_do_ethtool(netdev_name,
5307 (struct ethtool_cmd *)&evalue,
5308 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
5313 COVERAGE_INC(netdev_get_ethtool);
5314 memset(&evalue, 0, sizeof evalue);
5315 error = netdev_linux_do_ethtool(netdev_name,
5316 (struct ethtool_cmd *)&evalue,
5317 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5322 if (new_flags != evalue.data) {
5323 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
5324 "device %s failed", enable ? "enable" : "disable",
5325 flag_name, netdev_name);
5332 /* Utility functions. */
5334 /* Copies 'src' into 'dst', performing format conversion in the process. */
5336 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
5337 const struct rtnl_link_stats *src)
5339 dst->rx_packets = src->rx_packets;
5340 dst->tx_packets = src->tx_packets;
5341 dst->rx_bytes = src->rx_bytes;
5342 dst->tx_bytes = src->tx_bytes;
5343 dst->rx_errors = src->rx_errors;
5344 dst->tx_errors = src->tx_errors;
5345 dst->rx_dropped = src->rx_dropped;
5346 dst->tx_dropped = src->tx_dropped;
5347 dst->multicast = src->multicast;
5348 dst->collisions = src->collisions;
5349 dst->rx_length_errors = src->rx_length_errors;
5350 dst->rx_over_errors = src->rx_over_errors;
5351 dst->rx_crc_errors = src->rx_crc_errors;
5352 dst->rx_frame_errors = src->rx_frame_errors;
5353 dst->rx_fifo_errors = src->rx_fifo_errors;
5354 dst->rx_missed_errors = src->rx_missed_errors;
5355 dst->tx_aborted_errors = src->tx_aborted_errors;
5356 dst->tx_carrier_errors = src->tx_carrier_errors;
5357 dst->tx_fifo_errors = src->tx_fifo_errors;
5358 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
5359 dst->tx_window_errors = src->tx_window_errors;
5362 /* Copies 'src' into 'dst', performing format conversion in the process. */
5364 netdev_stats_from_rtnl_link_stats64(struct netdev_stats *dst,
5365 const struct rtnl_link_stats64 *src)
5367 dst->rx_packets = src->rx_packets;
5368 dst->tx_packets = src->tx_packets;
5369 dst->rx_bytes = src->rx_bytes;
5370 dst->tx_bytes = src->tx_bytes;
5371 dst->rx_errors = src->rx_errors;
5372 dst->tx_errors = src->tx_errors;
5373 dst->rx_dropped = src->rx_dropped;
5374 dst->tx_dropped = src->tx_dropped;
5375 dst->multicast = src->multicast;
5376 dst->collisions = src->collisions;
5377 dst->rx_length_errors = src->rx_length_errors;
5378 dst->rx_over_errors = src->rx_over_errors;
5379 dst->rx_crc_errors = src->rx_crc_errors;
5380 dst->rx_frame_errors = src->rx_frame_errors;
5381 dst->rx_fifo_errors = src->rx_fifo_errors;
5382 dst->rx_missed_errors = src->rx_missed_errors;
5383 dst->tx_aborted_errors = src->tx_aborted_errors;
5384 dst->tx_carrier_errors = src->tx_carrier_errors;
5385 dst->tx_fifo_errors = src->tx_fifo_errors;
5386 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
5387 dst->tx_window_errors = src->tx_window_errors;
5391 get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
5393 struct ofpbuf request;
5394 struct ofpbuf *reply;
5397 ofpbuf_init(&request, 0);
5398 nl_msg_put_nlmsghdr(&request,
5399 sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
5400 RTM_GETLINK, NLM_F_REQUEST);
5401 ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
5402 nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
5403 error = nl_transact(NETLINK_ROUTE, &request, &reply);
5404 ofpbuf_uninit(&request);
5409 if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
5410 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS64);
5411 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats64)) {
5412 netdev_stats_from_rtnl_link_stats64(stats, nl_attr_get(a));
5415 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS);
5416 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
5417 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
5420 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
5425 VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");
5430 ofpbuf_delete(reply);
5435 get_flags(const struct netdev *dev, unsigned int *flags)
5441 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
5443 *flags = ifr.ifr_flags;
5449 set_flags(const char *name, unsigned int flags)
5453 ifr.ifr_flags = flags;
5454 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
5458 do_get_ifindex(const char *netdev_name)
5463 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5464 COVERAGE_INC(netdev_get_ifindex);
5466 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
5468 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
5469 netdev_name, ovs_strerror(error));
5472 return ifr.ifr_ifindex;
5476 get_ifindex(const struct netdev *netdev_, int *ifindexp)
5478 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5480 if (!(netdev->cache_valid & VALID_IFINDEX)) {
5481 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
5484 netdev->get_ifindex_error = -ifindex;
5485 netdev->ifindex = 0;
5487 netdev->get_ifindex_error = 0;
5488 netdev->ifindex = ifindex;
5490 netdev->cache_valid |= VALID_IFINDEX;
5493 *ifindexp = netdev->ifindex;
5494 return netdev->get_ifindex_error;
5498 get_etheraddr(const char *netdev_name, struct eth_addr *ea)
5504 memset(&ifr, 0, sizeof ifr);
5505 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5506 COVERAGE_INC(netdev_get_hwaddr);
5507 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
5509 /* ENODEV probably means that a vif disappeared asynchronously and
5510 * hasn't been removed from the database yet, so reduce the log level
5511 * to INFO for that case. */
5512 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
5513 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
5514 netdev_name, ovs_strerror(error));
5517 hwaddr_family = ifr.ifr_hwaddr.sa_family;
5518 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
5519 VLOG_WARN("%s device has unknown hardware address family %d",
5520 netdev_name, hwaddr_family);
5522 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
5527 set_etheraddr(const char *netdev_name, const struct eth_addr mac)
5532 memset(&ifr, 0, sizeof ifr);
5533 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5534 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
5535 memcpy(ifr.ifr_hwaddr.sa_data, &mac, ETH_ADDR_LEN);
5536 COVERAGE_INC(netdev_set_hwaddr);
5537 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
5539 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
5540 netdev_name, ovs_strerror(error));
5546 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
5547 int cmd, const char *cmd_name)
5552 memset(&ifr, 0, sizeof ifr);
5553 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
5554 ifr.ifr_data = (caddr_t) ecmd;
5557 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
5559 if (error != EOPNOTSUPP) {
5560 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
5561 "failed: %s", cmd_name, name, ovs_strerror(error));
5563 /* The device doesn't support this operation. That's pretty
5564 * common, so there's no point in logging anything. */
5571 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
5572 int cmd, const char *cmd_name)
5577 ifr.ifr_addr.sa_family = AF_INET;
5578 error = af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
5580 const struct sockaddr_in *sin = ALIGNED_CAST(struct sockaddr_in *,
5582 *ip = sin->sin_addr;
5587 /* Returns an AF_PACKET raw socket or a negative errno value. */
5589 af_packet_sock(void)
5591 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5594 if (ovsthread_once_start(&once)) {
5595 sock = socket(AF_PACKET, SOCK_RAW, 0);
5597 int error = set_nonblocking(sock);
5604 VLOG_ERR("failed to create packet socket: %s",
5605 ovs_strerror(errno));
5607 ovsthread_once_done(&once);