2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2015 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
23 #include <arpa/inet.h>
25 #include <linux/filter.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <sys/types.h>
37 #include <sys/ioctl.h>
38 #include <sys/socket.h>
39 #include <sys/utsname.h>
40 #include <netpacket/packet.h>
42 #include <net/if_arp.h>
43 #include <net/if_packet.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
52 #include "dp-packet.h"
53 #include "dpif-netlink.h"
54 #include "dpif-netdev.h"
55 #include "dynamic-string.h"
56 #include "fatal-signal.h"
59 #include "netdev-provider.h"
60 #include "netdev-vport.h"
61 #include "netlink-notifier.h"
62 #include "netlink-socket.h"
65 #include "openflow/openflow.h"
66 #include "ovs-atomic.h"
68 #include "poll-loop.h"
69 #include "rtnetlink.h"
71 #include "socket-util.h"
74 #include "unaligned.h"
75 #include "openvswitch/vlog.h"
/* Logging module and coverage counters for the Linux netdev implementation.
 * Each COVERAGE_DEFINE creates a counter bumped on the corresponding
 * operation (policing, ARP lookup, ifindex/hwaddr/ethtool get/set). */
77 VLOG_DEFINE_THIS_MODULE(netdev_linux);
79 COVERAGE_DEFINE(netdev_set_policing);
80 COVERAGE_DEFINE(netdev_arp_lookup);
81 COVERAGE_DEFINE(netdev_get_ifindex);
82 COVERAGE_DEFINE(netdev_get_hwaddr);
83 COVERAGE_DEFINE(netdev_set_hwaddr);
84 COVERAGE_DEFINE(netdev_get_ethtool);
85 COVERAGE_DEFINE(netdev_set_ethtool);
/* Kernel-header compatibility shims: definitions that may be missing from
 * old Linux headers, plus unconditional replacement structures for
 * tpacket_auxdata and rtnl_link_stats64 whose layouts changed across
 * kernel versions.
 * NOTE(review): this extract is missing lines (e.g. the matching #endif
 * directives and several struct members are elided) — confirm against the
 * full file before editing. */
88 /* These were introduced in Linux 2.6.14, so they might be missing if we have
90 #ifndef ADVERTISED_Pause
91 #define ADVERTISED_Pause (1 << 13)
93 #ifndef ADVERTISED_Asym_Pause
94 #define ADVERTISED_Asym_Pause (1 << 14)
97 /* These were introduced in Linux 2.6.24, so they might be missing if we
98 * have old headers. */
99 #ifndef ETHTOOL_GFLAGS
100 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
102 #ifndef ETHTOOL_SFLAGS
103 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
106 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
109 #define TC_RTAB_SIZE 1024
112 /* Linux 2.6.21 introduced struct tpacket_auxdata.
113 * Linux 2.6.27 added the tp_vlan_tci member.
114 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
115 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
116 * TP_STATUS_VLAN_TPID_VALID.
118 * With all this churn it's easiest to unconditionally define a replacement
119 * structure that has everything we want.
121 #ifndef PACKET_AUXDATA
122 #define PACKET_AUXDATA 8
124 #ifndef TP_STATUS_VLAN_VALID
125 #define TP_STATUS_VLAN_VALID (1 << 4)
127 #ifndef TP_STATUS_VLAN_TPID_VALID
128 #define TP_STATUS_VLAN_TPID_VALID (1 << 6)
130 #undef tpacket_auxdata
131 #define tpacket_auxdata rpl_tpacket_auxdata
132 struct tpacket_auxdata {
138 uint16_t tp_vlan_tci;
139 uint16_t tp_vlan_tpid;
142 /* Linux 2.6.35 introduced IFLA_STATS64 and rtnl_link_stats64.
144 * Tests for rtnl_link_stats64 don't seem to consistently work, e.g. on
145 * 2.6.32-431.29.2.el6.x86_64 (see report at
146 * http://openvswitch.org/pipermail/dev/2014-October/047978.html). Maybe
147 * if_link.h is not self-contained on those kernels. It is easiest to
148 * unconditionally define a replacement. */
150 #define IFLA_STATS64 23
152 #define rtnl_link_stats64 rpl_rtnl_link_stats64
153 struct rtnl_link_stats64 {
165 uint64_t rx_length_errors;
166 uint64_t rx_over_errors;
167 uint64_t rx_crc_errors;
168 uint64_t rx_frame_errors;
169 uint64_t rx_fifo_errors;
170 uint64_t rx_missed_errors;
172 uint64_t tx_aborted_errors;
173 uint64_t tx_carrier_errors;
174 uint64_t tx_fifo_errors;
175 uint64_t tx_heartbeat_errors;
176 uint64_t tx_window_errors;
178 uint64_t rx_compressed;
179 uint64_t tx_compressed;
/* Bits for netdev_linux's 'cache_valid' bitmap: each VALID_* flag records
 * that the corresponding cached field in struct netdev_linux is current.
 * NOTE(review): the enum header and bits 2-4 are elided in this extract. */
183 VALID_IFINDEX = 1 << 0,
184 VALID_ETHERADDR = 1 << 1,
188 VALID_POLICING = 1 << 5,
189 VALID_VPORT_STAT_ERROR = 1 << 6,
190 VALID_DRVINFO = 1 << 7,
191 VALID_FEATURES = 1 << 8,
194 /* Traffic control. */
196 /* An instance of a traffic control class. Always associated with a particular
199 * Each TC implementation subclasses this with whatever additional data it
202 const struct tc_ops *ops;
203 struct hmap queues; /* Contains "struct tc_queue"s.
204 * Read by generic TC layer.
205 * Written only by TC implementation. */
208 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
210 /* One traffic control queue.
212 * Each TC implementation subclasses this with whatever additional data it
215 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
216 unsigned int queue_id; /* OpenFlow queue ID. */
217 long long int created; /* Time queue was created, in msecs. */
/* The tc_ops vtable below defines the interface each qdisc implementation
 * (htb, hfsc, codel, ...) provides to the generic TC layer. */
220 /* A particular kind of traffic control. Each implementation generally maps to
221 * one particular Linux qdisc class.
223 * The functions below return 0 if successful or a positive errno value on
224 * failure, except where otherwise noted. All of them must be provided, except
225 * where otherwise noted. */
227 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
228 * This is null for tc_ops_default and tc_ops_other, for which there are no
229 * appropriate values. */
230 const char *linux_name;
232 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
233 const char *ovs_name;
235 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
236 * queues. The queues are numbered 0 through n_queues - 1. */
237 unsigned int n_queues;
239 /* Called to install this TC class on 'netdev'. The implementation should
240 * make the Netlink calls required to set up 'netdev' with the right qdisc
241 * and configure it according to 'details'. The implementation may assume
242 * that the current qdisc is the default; that is, there is no need for it
243 * to delete the current qdisc before installing itself.
245 * The contents of 'details' should be documented as valid for 'ovs_name'
246 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
247 * (which is built as ovs-vswitchd.conf.db(8)).
249 * This function must return 0 if and only if it sets 'netdev->tc' to an
250 * initialized 'struct tc'.
252 * (This function is null for tc_ops_other, which cannot be installed. For
253 * other TC classes it should always be nonnull.) */
254 int (*tc_install)(struct netdev *netdev, const struct smap *details);
256 /* Called when the netdev code determines (through a Netlink query) that
257 * this TC class's qdisc is installed on 'netdev', but we didn't install
258 * it ourselves and so don't know any of the details.
260 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
261 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
262 * implementation should parse the other attributes of 'nlmsg' as
263 * necessary to determine its configuration. If necessary it should also
264 * use Netlink queries to determine the configuration of queues on
267 * This function must return 0 if and only if it sets 'netdev->tc' to an
268 * initialized 'struct tc'. */
269 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
271 /* Destroys the data structures allocated by the implementation as part of
272 * 'tc'. (This includes destroying 'tc->queues' by calling
275 * The implementation should not need to perform any Netlink calls. If
276 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
277 * (But it may not be desirable.)
279 * This function may be null if 'tc' is trivial. */
280 void (*tc_destroy)(struct tc *tc);
282 /* Retrieves details of 'netdev->tc' configuration into 'details'.
284 * The implementation should not need to perform any Netlink calls, because
285 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
286 * cached the configuration.
288 * The contents of 'details' should be documented as valid for 'ovs_name'
289 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
290 * (which is built as ovs-vswitchd.conf.db(8)).
292 * This function may be null if 'tc' is not configurable.
294 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
296 /* Reconfigures 'netdev->tc' according to 'details', performing any
297 * required Netlink calls to complete the reconfiguration.
299 * The contents of 'details' should be documented as valid for 'ovs_name'
300 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
301 * (which is built as ovs-vswitchd.conf.db(8)).
303 * This function may be null if 'tc' is not configurable.
305 int (*qdisc_set)(struct netdev *, const struct smap *details);
307 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
308 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
310 * The contents of 'details' should be documented as valid for 'ovs_name'
311 * in the "other_config" column in the "Queue" table in
312 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
314 * The implementation should not need to perform any Netlink calls, because
315 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
316 * cached the queue configuration.
318 * This function may be null if 'tc' does not have queues ('n_queues' is
320 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
321 struct smap *details);
323 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
324 * 'details', perfoming any required Netlink calls to complete the
325 * reconfiguration. The caller ensures that 'queue_id' is less than
328 * The contents of 'details' should be documented as valid for 'ovs_name'
329 * in the "other_config" column in the "Queue" table in
330 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
332 * This function may be null if 'tc' does not have queues or its queues are
333 * not configurable. */
334 int (*class_set)(struct netdev *, unsigned int queue_id,
335 const struct smap *details);
337 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
338 * tc_queue's within 'netdev->tc->queues'.
340 * This function may be null if 'tc' does not have queues or its queues
341 * cannot be deleted. */
342 int (*class_delete)(struct netdev *, struct tc_queue *queue);
344 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
345 * 'struct tc_queue's within 'netdev->tc->queues'.
347 * On success, initializes '*stats'.
349 * This function may be null if 'tc' does not have queues or if it cannot
350 * report queue statistics. */
351 int (*class_get_stats)(const struct netdev *netdev,
352 const struct tc_queue *queue,
353 struct netdev_queue_stats *stats);
355 /* Extracts queue stats from 'nlmsg', which is a response to a
356 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
358 * This function may be null if 'tc' does not have queues or if it cannot
359 * report queue statistics. */
360 int (*class_dump_stats)(const struct netdev *netdev,
361 const struct ofpbuf *nlmsg,
362 netdev_dump_queue_stats_cb *cb, void *aux);
/* tc_init() initializes a 'struct tc' (its queues hmap); tc_destroy()
 * releases it. NOTE(review): return-type lines, the 'tc->ops = ops'
 * assignment (presumably) and braces are elided in this extract. */
366 tc_init(struct tc *tc, const struct tc_ops *ops)
369 hmap_init(&tc->queues);
373 tc_destroy(struct tc *tc)
375 hmap_destroy(&tc->queues);
/* The registry of TC implementations: 'tcs' is searched in order, so the
 * catch-alls (tc_ops_default, tc_ops_other) come last.  The forward
 * declarations below cover the shared tc helper routines (handle
 * construction, rate/tick conversion, Netlink request plumbing, qdisc and
 * class query/delete). */
378 static const struct tc_ops tc_ops_htb;
379 static const struct tc_ops tc_ops_hfsc;
380 static const struct tc_ops tc_ops_codel;
381 static const struct tc_ops tc_ops_fqcodel;
382 static const struct tc_ops tc_ops_sfq;
383 static const struct tc_ops tc_ops_default;
384 static const struct tc_ops tc_ops_other;
386 static const struct tc_ops *const tcs[] = {
387 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
388 &tc_ops_hfsc, /* Hierarchical fair service curve. */
389 &tc_ops_codel, /* Controlled delay */
390 &tc_ops_fqcodel, /* Fair queue controlled delay */
391 &tc_ops_sfq, /* Stochastic fair queueing */
392 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
393 &tc_ops_other, /* Some other qdisc. */
397 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
398 static unsigned int tc_get_major(unsigned int handle);
399 static unsigned int tc_get_minor(unsigned int handle);
401 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
402 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
403 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
405 static struct tcmsg *tc_make_request(const struct netdev *, int type,
406 unsigned int flags, struct ofpbuf *);
407 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
408 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
409 static int tc_add_policer(struct netdev *,
410 uint32_t kbits_rate, uint32_t kbits_burst);
412 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
413 struct nlattr **options);
414 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
415 struct nlattr **options,
416 struct netdev_queue_stats *);
417 static int tc_query_class(const struct netdev *,
418 unsigned int handle, unsigned int parent,
419 struct ofpbuf **replyp);
420 static int tc_delete_class(const struct netdev *, unsigned int handle);
422 static int tc_del_qdisc(struct netdev *netdev);
423 static int tc_query_qdisc(const struct netdev *netdev);
425 static int tc_calc_cell_log(unsigned int mtu);
426 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
427 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
428 const struct tc_ratespec *rate);
429 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
/* Per-device state for the Linux netdev classes ("system", "internal",
 * "tap").  All fields below 'mutex' are protected by it; the VALID_* bits
 * in 'cache_valid' say which on-demand fields are currently trustworthy.
 * NOTE(review): several members (mtu, ifindex, tc, tap_fd, ...) are elided
 * from this extract. */
431 struct netdev_linux {
434 /* Protects all members below. */
435 struct ovs_mutex mutex;
437 unsigned int cache_valid;
439 bool miimon; /* Link status of last poll. */
440 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
441 struct timer miimon_timer;
443 /* The following are figured out "on demand" only. They are only valid
444 * when the corresponding VALID_* bit in 'cache_valid' is set. */
446 struct eth_addr etheraddr;
447 struct in_addr address, netmask;
450 unsigned int ifi_flags;
451 long long int carrier_resets;
452 uint32_t kbits_rate; /* Policing data. */
453 uint32_t kbits_burst;
454 int vport_stats_error; /* Cached error code from vport_get_stats().
455 0 or an errno value. */
456 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
457 int ether_addr_error; /* Cached error code from set/get etheraddr. */
458 int netdev_policing_error; /* Cached error code from set policing. */
459 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
460 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
461 int in4_error; /* Cached error code from reading in4 addr. */
462 int in6_error; /* Cached error code from reading in6 addr. */
464 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
465 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
466 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
468 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
471 /* For devices of class netdev_tap_class only. */
/* Receive-queue state: wraps the rx file descriptor (tap fd or AF_PACKET
 * socket). NOTE(review): 'is_tap' and 'fd' members elided here. */
475 struct netdev_rxq_linux {
476 struct netdev_rxq up;
481 /* This is set pretty low because we probably won't learn anything from the
482 * additional log messages. */
483 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
485 /* Polling miimon status for all ports causes performance degradation when
486 * handling a large number of ports. If there are no devices using miimon, then
487 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
489 * Readers do not depend on this variable synchronizing with the related
490 * changes in the device miimon status, so we can use atomic_count. */
491 static atomic_count miimon_cnt = ATOMIC_COUNT_INIT(0);
/* Forward declarations for ioctl/ethtool helpers, followed by small
 * predicates and downcast helpers.  is_netdev_linux_class() identifies a
 * netdev class by its 'run' callback; the *_cast() helpers assert the class
 * before CONTAINER_OF-ing to the derived struct. */
493 static void netdev_linux_run(void);
495 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
496 int cmd, const char *cmd_name);
497 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
498 int cmd, const char *cmd_name);
499 static int get_flags(const struct netdev *, unsigned int *flags);
500 static int set_flags(const char *, unsigned int flags);
501 static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
502 enum netdev_flags on, enum netdev_flags *old_flagsp)
503 OVS_REQUIRES(netdev->mutex);
504 static int do_get_ifindex(const char *netdev_name);
505 static int get_ifindex(const struct netdev *, int *ifindexp);
506 static int do_set_addr(struct netdev *netdev,
507 int ioctl_nr, const char *ioctl_name,
508 struct in_addr addr);
509 static int get_etheraddr(const char *netdev_name, struct eth_addr *ea);
510 static int set_etheraddr(const char *netdev_name, const struct eth_addr);
511 static int get_stats_via_netlink(const struct netdev *, struct netdev_stats *);
512 static int af_packet_sock(void);
513 static bool netdev_linux_miimon_enabled(void);
514 static void netdev_linux_miimon_run(void);
515 static void netdev_linux_miimon_wait(void);
516 static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup);
519 is_netdev_linux_class(const struct netdev_class *netdev_class)
521 return netdev_class->run == netdev_linux_run;
525 is_tap_netdev(const struct netdev *netdev)
527 return netdev_get_class(netdev) == &netdev_tap_class;
530 static struct netdev_linux *
531 netdev_linux_cast(const struct netdev *netdev)
533 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
535 return CONTAINER_OF(netdev, struct netdev_linux, up);
538 static struct netdev_rxq_linux *
539 netdev_rxq_linux_cast(const struct netdev_rxq *rx)
541 ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
542 return CONTAINER_OF(rx, struct netdev_rxq_linux, up);
545 static void netdev_linux_update(struct netdev_linux *netdev,
546 const struct rtnetlink_change *)
547 OVS_REQUIRES(netdev->mutex);
548 static void netdev_linux_changed(struct netdev_linux *netdev,
549 unsigned int ifi_flags, unsigned int mask)
550 OVS_REQUIRES(netdev->mutex);
552 /* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK,
553 * RTNLGRP_IPV4_IFADDR and RTNLGRP_IPV6_IFADDR changes, or NULL
554 * if no such socket could be created. */
555 static struct nl_sock *
556 netdev_linux_notify_sock(void)
/* The socket is created once (ovsthread_once) and shared by all callers;
 * on mcgroup-join failure the socket is destroyed.
 * NOTE(review): 'error'/'i' declarations, the success path, and the return
 * statement are elided in this extract. */
558 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
559 static struct nl_sock *sock;
560 unsigned int mcgroups[3] = {RTNLGRP_LINK, RTNLGRP_IPV4_IFADDR,
561 RTNLGRP_IPV6_IFADDR};
563 if (ovsthread_once_start(&once)) {
566 error = nl_sock_create(NETLINK_ROUTE, &sock);
570 for (i = 0; i < ARRAY_SIZE(mcgroups); i++) {
571 error = nl_sock_join_mcgroup(sock, mcgroups[i]);
573 nl_sock_destroy(sock);
579 ovsthread_once_done(&once);
/* netdev_linux_miimon_enabled(): true when at least one device polls miimon
 * (cheap atomic read, see miimon_cnt above). */
586 netdev_linux_miimon_enabled(void)
588 return atomic_count_get(&miimon_cnt) > 0;
/* Periodic work for all Linux netdevs: runs miimon polling if enabled, then
 * drains the shared rtnetlink notification socket.  Each parsed change is
 * routed to the matching open netdev under its mutex.  On ENOBUFS (netlink
 * receive-buffer overrun, notifications lost) every Linux-class device is
 * refreshed via get_flags() so that no state change is missed. */
592 netdev_linux_run(void)
594 struct nl_sock *sock;
597 if (netdev_linux_miimon_enabled()) {
598 netdev_linux_miimon_run();
601 sock = netdev_linux_notify_sock();
607 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
608 uint64_t buf_stub[4096 / 8];
611 ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
612 error = nl_sock_recv(sock, &buf, false);
614 struct rtnetlink_change change;
616 if (rtnetlink_parse(&buf, &change)) {
617 struct netdev *netdev_ = netdev_from_name(change.ifname);
618 if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
619 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
621 ovs_mutex_lock(&netdev->mutex);
622 netdev_linux_update(netdev, &change);
623 ovs_mutex_unlock(&netdev->mutex);
625 netdev_close(netdev_);
627 } else if (error == ENOBUFS) {
628 struct shash device_shash;
629 struct shash_node *node;
633 shash_init(&device_shash);
634 netdev_get_devices(&netdev_linux_class, &device_shash);
635 SHASH_FOR_EACH (node, &device_shash) {
636 struct netdev *netdev_ = node->data;
637 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
640 ovs_mutex_lock(&netdev->mutex);
641 get_flags(netdev_, &flags);
642 netdev_linux_changed(netdev, flags, 0);
643 ovs_mutex_unlock(&netdev->mutex);
645 netdev_close(netdev_);
647 shash_destroy(&device_shash);
648 } else if (error != EAGAIN) {
649 VLOG_WARN_RL(&rl, "error reading or parsing netlink (%s)",
650 ovs_strerror(error));
/* Arranges for poll_block() to wake when there is miimon work or incoming
 * rtnetlink notifications to process. */
657 netdev_linux_wait(void)
659 struct nl_sock *sock;
661 if (netdev_linux_miimon_enabled()) {
662 netdev_linux_miimon_wait();
664 sock = netdev_linux_notify_sock();
666 nl_sock_wait(sock, POLLIN);
/* Records that 'dev' changed: bumps the change sequence, counts a carrier
 * reset when IFF_RUNNING toggled, stores the new ifi_flags, and keeps only
 * the cache_valid bits listed in 'mask' (pass 0 to invalidate everything). */
671 netdev_linux_changed(struct netdev_linux *dev,
672 unsigned int ifi_flags, unsigned int mask)
673 OVS_REQUIRES(dev->mutex)
675 netdev_change_seq_changed(&dev->up);
677 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
678 dev->carrier_resets++;
680 dev->ifi_flags = ifi_flags;
682 dev->cache_valid &= mask;
/* Applies a parsed rtnetlink change to 'dev'.  RTM_NEWLINK refreshes the
 * cached MTU, MAC (if nonzero), and ifindex from the message and clears
 * their error codes; other link messages invalidate all cached state; an
 * address-group message invalidates only the cached in4/in6 addresses. */
686 netdev_linux_update(struct netdev_linux *dev,
687 const struct rtnetlink_change *change)
688 OVS_REQUIRES(dev->mutex)
690 if (rtnetlink_type_is_rtnlgrp_link(change->nlmsg_type)){
691 if (change->nlmsg_type == RTM_NEWLINK) {
692 /* Keep drv-info, in4, in6. */
693 netdev_linux_changed(dev, change->ifi_flags,
694 VALID_DRVINFO | VALID_IN4 | VALID_IN6);
696 /* Update netdev from rtnl-change msg. */
698 dev->mtu = change->mtu;
699 dev->cache_valid |= VALID_MTU;
700 dev->netdev_mtu_error = 0;
703 if (!eth_addr_is_zero(change->mac)) {
704 dev->etheraddr = change->mac;
705 dev->cache_valid |= VALID_ETHERADDR;
706 dev->ether_addr_error = 0;
709 dev->ifindex = change->if_index;
710 dev->cache_valid |= VALID_IFINDEX;
711 dev->get_ifindex_error = 0;
713 netdev_linux_changed(dev, change->ifi_flags, 0);
715 } else if (rtnetlink_type_is_rtnlgrp_addr(change->nlmsg_type)) {
716 /* Invalidates in4, in6. */
717 netdev_linux_changed(dev, dev->ifi_flags,
718 ~(VALID_IN4 | VALID_IN6));
/* Allocation and construction for "system" and "internal" devices.
 * NOTE(review): return statements and part of the error handling are
 * elided in this extract. */
724 static struct netdev *
725 netdev_linux_alloc(void)
727 struct netdev_linux *netdev = xzalloc(sizeof *netdev);
732 netdev_linux_common_construct(struct netdev_linux *netdev)
734 ovs_mutex_init(&netdev->mutex);
737 /* Creates system and internal devices. */
739 netdev_linux_construct(struct netdev *netdev_)
741 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
744 netdev_linux_common_construct(netdev);
746 error = get_flags(&netdev->up, &netdev->ifi_flags);
747 if (error == ENODEV) {
748 if (netdev->up.netdev_class != &netdev_internal_class) {
749 /* The device does not exist, so don't allow it to be opened. */
752 /* "Internal" netdevs have to be created as netdev objects before
753 * they exist in the kernel, because creating them in the kernel
754 * happens by passing a netdev object to dpif_port_add().
755 * Therefore, ignore the error. */
762 /* For most types of netdevs we open the device for each call of
763 * netdev_open(). However, this is not the case with tap devices,
764 * since it is only possible to open the device once. In this
765 * situation we share a single file descriptor, and consequently
766 * buffers, across all readers. Therefore once data is read it will
767 * be unavailable to other reads for tap devices. */
/* Opens /dev/net/tun, creates the named TAP interface (IFF_TAP |
 * IFF_NO_PI), and makes the fd non-blocking; closes the fd on any failure.
 * NOTE(review): 'ifr'/'error' declarations and return paths are elided. */
769 netdev_linux_construct_tap(struct netdev *netdev_)
771 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
772 static const char tap_dev[] = "/dev/net/tun";
773 const char *name = netdev_->name;
777 netdev_linux_common_construct(netdev);
779 /* Open tap device. */
780 netdev->tap_fd = open(tap_dev, O_RDWR);
781 if (netdev->tap_fd < 0) {
783 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
787 /* Create tap device. */
788 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
789 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
790 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
791 VLOG_WARN("%s: creating tap device failed: %s", name,
792 ovs_strerror(errno));
797 /* Make non-blocking. */
798 error = set_nonblocking(netdev->tap_fd);
806 close(netdev->tap_fd);
/* Destruction: tears down any installed TC state, closes the tap fd for
 * tap-class devices, and drops the miimon refcount if polling was on. */
811 netdev_linux_destruct(struct netdev *netdev_)
813 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
815 if (netdev->tc && netdev->tc->ops->tc_destroy) {
816 netdev->tc->ops->tc_destroy(netdev->tc);
819 if (netdev_get_class(netdev_) == &netdev_tap_class
820 && netdev->tap_fd >= 0)
822 close(netdev->tap_fd);
825 if (netdev->miimon_interval > 0) {
826 atomic_count_dec(&miimon_cnt);
829 ovs_mutex_destroy(&netdev->mutex);
/* Deallocation of the device struct, plus rxq lifecycle.  For tap devices
 * the rxq reuses the shared tap fd; otherwise an AF_PACKET SOCK_RAW socket
 * is created, marked for PACKET_AUXDATA (to recover VLAN tags), made
 * non-blocking, bound to the device's ifindex, and given a BPF filter that
 * accepts only inbound packets (the tcpdump-generated program below).
 * NOTE(review): several declarations, 'goto error'-style cleanup lines, and
 * returns are elided in this extract. */
833 netdev_linux_dealloc(struct netdev *netdev_)
835 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
839 static struct netdev_rxq *
840 netdev_linux_rxq_alloc(void)
842 struct netdev_rxq_linux *rx = xzalloc(sizeof *rx);
847 netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
849 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
850 struct netdev *netdev_ = rx->up.netdev;
851 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
854 ovs_mutex_lock(&netdev->mutex);
855 rx->is_tap = is_tap_netdev(netdev_);
857 rx->fd = netdev->tap_fd;
859 struct sockaddr_ll sll;
861 /* Result of tcpdump -dd inbound */
862 static const struct sock_filter filt[] = {
863 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
864 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
865 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
866 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
868 static const struct sock_fprog fprog = {
869 ARRAY_SIZE(filt), (struct sock_filter *) filt
872 /* Create file descriptor. */
873 rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
876 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
881 if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) {
883 VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
884 netdev_get_name(netdev_), ovs_strerror(error));
888 /* Set non-blocking mode. */
889 error = set_nonblocking(rx->fd);
894 /* Get ethernet device index. */
895 error = get_ifindex(&netdev->up, &ifindex);
900 /* Bind to specific ethernet device. */
901 memset(&sll, 0, sizeof sll);
902 sll.sll_family = AF_PACKET;
903 sll.sll_ifindex = ifindex;
904 sll.sll_protocol = htons(ETH_P_ALL);
905 if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
907 VLOG_ERR("%s: failed to bind raw socket (%s)",
908 netdev_get_name(netdev_), ovs_strerror(error));
912 /* Filter for only inbound packets. */
913 error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
917 VLOG_ERR("%s: failed to attach filter (%s)",
918 netdev_get_name(netdev_), ovs_strerror(error));
922 ovs_mutex_unlock(&netdev->mutex);
930 ovs_mutex_unlock(&netdev->mutex);
/* rxq destruct/dealloc counterparts; bodies largely elided in this
 * extract (presumably close the fd for non-tap rxqs and free 'rx'). */
935 netdev_linux_rxq_destruct(struct netdev_rxq *rxq_)
937 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
945 netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_)
947 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
/* Helpers for the AF_PACKET receive path: auxdata_to_vlan_tpid() returns
 * the TPID from auxdata when the kernel provided one (TP_STATUS_VLAN_TPID_VALID),
 * else the default 802.1Q ethertype; auxdata_has_vlan_tci() says whether a
 * stripped VLAN tag must be re-inserted. */
953 auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux)
955 if (aux->tp_status & TP_STATUS_VLAN_TPID_VALID) {
956 return htons(aux->tp_vlan_tpid);
958 return htons(ETH_TYPE_VLAN);
963 auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
965 return aux->tp_vlan_tci || aux->tp_status & TP_STATUS_VLAN_VALID;
/* Receives one packet from the AF_PACKET socket 'fd' into 'buffer' via
 * recvmsg() (retrying on EINTR), reserving headroom so a kernel-stripped
 * VLAN tag found in the PACKET_AUXDATA cmsg can be pushed back into the
 * frame.  MSG_TRUNC lets retval > size detect truncation.
 * NOTE(review): msghdr/iov setup lines and the error returns are partly
 * elided in this extract. */
969 netdev_linux_rxq_recv_sock(int fd, struct dp_packet *buffer)
974 struct cmsghdr *cmsg;
977 char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
981 /* Reserve headroom for a single VLAN tag */
982 dp_packet_reserve(buffer, VLAN_HEADER_LEN);
983 size = dp_packet_tailroom(buffer);
985 iov.iov_base = dp_packet_data(buffer);
987 msgh.msg_name = NULL;
988 msgh.msg_namelen = 0;
991 msgh.msg_control = &cmsg_buffer;
992 msgh.msg_controllen = sizeof cmsg_buffer;
996 retval = recvmsg(fd, &msgh, MSG_TRUNC);
997 } while (retval < 0 && errno == EINTR);
1001 } else if (retval > size) {
1005 dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);
1007 for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg; cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
1008 const struct tpacket_auxdata *aux;
1010 if (cmsg->cmsg_level != SOL_PACKET
1011 || cmsg->cmsg_type != PACKET_AUXDATA
1012 || cmsg->cmsg_len < CMSG_LEN(sizeof(struct tpacket_auxdata))) {
1016 aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg));
1017 if (auxdata_has_vlan_tci(aux)) {
1018 if (retval < ETH_HEADER_LEN) {
1022 eth_push_vlan(buffer, auxdata_to_vlan_tpid(aux),
1023 htons(aux->tp_vlan_tci));
/* Tap receive path: plain read() on the shared tap fd, retried on EINTR;
 * on success extends 'buffer' by the number of bytes read. */
1032 netdev_linux_rxq_recv_tap(int fd, struct dp_packet *buffer)
1035 size_t size = dp_packet_tailroom(buffer);
1038 retval = read(fd, dp_packet_data(buffer), size);
1039 } while (retval < 0 && errno == EINTR);
1045 dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);
/* Generic rxq receive: allocates a packet sized to the device MTU (falling
 * back to ETH_PAYLOAD_MAX if the MTU is unknown) plus VLAN+Ethernet header
 * headroom, dispatches to the tap or socket receive helper, and on success
 * pads the frame and hands it back as a single-packet batch.  Errors other
 * than EAGAIN/EMSGSIZE are rate-limit logged.
 * NOTE(review): the VLOG_WARN_RL call below passes ovs_strerror(errno)
 * where the name goes and netdev_rxq_get_name(rxq_) where the error string
 * goes — the arguments look swapped relative to the format string; worth
 * confirming against the full file. */
1050 netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet **packets,
1053 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1054 struct netdev *netdev = rx->up.netdev;
1055 struct dp_packet *buffer;
1059 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
1060 mtu = ETH_PAYLOAD_MAX;
1063 buffer = dp_packet_new_with_headroom(VLAN_ETH_HEADER_LEN + mtu,
1064 DP_NETDEV_HEADROOM);
1065 retval = (rx->is_tap
1066 ? netdev_linux_rxq_recv_tap(rx->fd, buffer)
1067 : netdev_linux_rxq_recv_sock(rx->fd, buffer));
1070 if (retval != EAGAIN && retval != EMSGSIZE) {
1071 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
1072 ovs_strerror(errno), netdev_rxq_get_name(rxq_));
1074 dp_packet_delete(buffer);
1076 dp_packet_pad(buffer);
1077 dp_packet_rss_invalidate(buffer);
1078 packets[0] = buffer;
/* Waits for the rxq fd to become readable. */
1086 netdev_linux_rxq_wait(struct netdev_rxq *rxq_)
1088 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1089 poll_fd_wait(rx->fd, POLLIN);
/* Drains pending packets: for tap fds reads SIOCGIFTXQLEN worth of
 * packets off the fd; for sockets flushes the receive buffer. */
1093 netdev_linux_rxq_drain(struct netdev_rxq *rxq_)
1095 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1098 int error = af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_), &ifr,
1099 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
1103 drain_fd(rx->fd, ifr.ifr_qlen);
1106 return drain_rcvbuf(rx->fd);
1110 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
1111 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
1112 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
1113 * the packet is too big or too small to transmit on the device.
1115 * The caller retains ownership of 'buffer' in all cases.
1117 * The kernel maintains a packet transmission queue, so the caller is not
1118 * expected to do additional queuing of packets. */
/* NOTE(review): this excerpt elides several original lines (local declarations
 * of 'i', 'error', 'sock', 'ifindex', 'iov', 'msg', 'retval' and some braces),
 * so the control flow shown below is partial. */
1120 netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED,
1121 struct dp_packet **pkts, int cnt, bool may_steal)
1126 /* 'i' is incremented only if there's no error */
1127 for (i = 0; i < cnt;) {
1128 const void *data = dp_packet_data(pkts[i]);
1129 size_t size = dp_packet_size(pkts[i]);
/* Non-tap devices transmit through a shared AF_PACKET raw socket. */
1132 if (!is_tap_netdev(netdev_)) {
1133 /* Use our AF_PACKET socket to send to this device. */
1134 struct sockaddr_ll sll;
1140 sock = af_packet_sock();
1145 ifindex = netdev_get_ifindex(netdev_);
1150 /* We don't bother setting most fields in sockaddr_ll because the
1151 * kernel ignores them for SOCK_RAW. */
1152 memset(&sll, 0, sizeof sll);
1153 sll.sll_family = AF_PACKET;
1154 sll.sll_ifindex = ifindex;
1156 iov.iov_base = CONST_CAST(void *, data);
1159 msg.msg_name = &sll;
1160 msg.msg_namelen = sizeof sll;
1163 msg.msg_control = NULL;
1164 msg.msg_controllen = 0;
1167 retval = sendmsg(sock, &msg, 0);
1169 /* Use the tap fd to send to this device. This is essential for
1170 * tap devices, because packets sent to a tap device with an
1171 * AF_PACKET socket will loop back to be *received* again on the
1172 * tap device. This doesn't occur on other interface types
1173 * because we attach a socket filter to the rx socket. */
1174 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1176 retval = write(netdev->tap_fd, data, size);
1180 /* The Linux AF_PACKET implementation never blocks waiting for room
1181 * for packets, instead returning ENOBUFS. Translate this into
1182 * EAGAIN for the caller. */
1183 error = errno == ENOBUFS ? EAGAIN : errno;
1184 if (error == EINTR) {
1185 /* continue without incrementing 'i', i.e. retry this packet */
/* A short write means only part of the frame went out; warn but move on
 * to the next packet rather than aborting the whole batch. */
1189 } else if (retval != size) {
1190 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE" bytes"
1191 " of %"PRIuSIZE") on %s", retval, size,
1192 netdev_get_name(netdev_));
1197 /* Process the next packet in the batch */
/* Free the batch after transmission; presumably guarded by 'may_steal'
 * (the guard line is elided in this excerpt) -- TODO confirm. */
1202 for (i = 0; i < cnt; i++) {
1203 dp_packet_delete(pkts[i]);
1207 if (error && error != EAGAIN) {
1208 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
1209 netdev_get_name(netdev_), ovs_strerror(error));
1216 /* Registers with the poll loop to wake up from the next call to poll_block()
1217 * when the packet transmission queue has sufficient room to transmit a packet
1218 * with netdev_send().
1220 * The kernel maintains a packet transmission queue, so the client is not
1221 * expected to do additional queuing of packets. Thus, this function is
1222 * unlikely to ever be used. It is included for completeness. */
1224 netdev_linux_send_wait(struct netdev *netdev, int qid OVS_UNUSED)
1226 if (is_tap_netdev(netdev)) {
1227 /* TAP device always accepts packets.*/
1228 poll_immediate_wake();
/* NOTE(review): the non-tap branch of this function is elided here. */
1232 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1233 * otherwise a positive errno value. */
1235 netdev_linux_set_etheraddr(struct netdev *netdev_, const struct eth_addr mac)
1237 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1238 enum netdev_flags old_flags = 0;
1241 ovs_mutex_lock(&netdev->mutex);
/* Fast path: if the cached MAC (or cached error) already matches, skip the
 * ioctl entirely; otherwise invalidate the cache entry before retrying. */
1243 if (netdev->cache_valid & VALID_ETHERADDR) {
1244 error = netdev->ether_addr_error;
1245 if (error || eth_addr_equals(netdev->etheraddr, mac)) {
1248 netdev->cache_valid &= ~VALID_ETHERADDR;
1251 /* Tap devices must be brought down before setting the address. */
1252 if (is_tap_netdev(netdev_)) {
1253 update_flags(netdev, NETDEV_UP, 0, &old_flags);
1255 error = set_etheraddr(netdev_get_name(netdev_), mac);
/* ENODEV is cached too so repeated calls on a vanished device stay cheap. */
1256 if (!error || error == ENODEV) {
1257 netdev->ether_addr_error = error;
1258 netdev->cache_valid |= VALID_ETHERADDR;
1260 netdev->etheraddr = mac;
/* Restore the up state saved above if we downed a tap device. */
1264 if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
1265 update_flags(netdev, 0, NETDEV_UP, &old_flags);
1269 ovs_mutex_unlock(&netdev->mutex);
1273 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1275 netdev_linux_get_etheraddr(const struct netdev *netdev_, struct eth_addr *mac)
1277 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1280 ovs_mutex_lock(&netdev->mutex);
/* Populate the MAC cache on first use; subsequent calls are served from it. */
1281 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1282 netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
1283 &netdev->etheraddr);
1284 netdev->cache_valid |= VALID_ETHERADDR;
1287 error = netdev->ether_addr_error;
1289 *mac = netdev->etheraddr;
1291 ovs_mutex_unlock(&netdev->mutex);
/* Reads 'netdev''s MTU into '*mtup' via SIOCGIFMTU, caching both the value and
 * any error under VALID_MTU.  Caller holds netdev->mutex. */
1297 netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
1301 if (!(netdev->cache_valid & VALID_MTU)) {
1304 netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
1305 netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
1306 netdev->mtu = ifr.ifr_mtu;
1307 netdev->cache_valid |= VALID_MTU;
1310 error = netdev->netdev_mtu_error;
1312 *mtup = netdev->mtu;
1318 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1319 * in bytes, not including the hardware header; thus, this is typically 1500
1320 * bytes for Ethernet devices. */
1322 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1324 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* Thin locking wrapper around netdev_linux_get_mtu__(). */
1327 ovs_mutex_lock(&netdev->mutex);
1328 error = netdev_linux_get_mtu__(netdev, mtup);
1329 ovs_mutex_unlock(&netdev->mutex);
1334 /* Sets the maximum size of transmitted (MTU) for given device using linux
1335 * networking ioctl interface.
1338 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1340 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1344 ovs_mutex_lock(&netdev->mutex);
/* Skip the ioctl if the cached MTU (or cached error) already matches;
 * otherwise drop the stale cache entry before issuing SIOCSIFMTU. */
1345 if (netdev->cache_valid & VALID_MTU) {
1346 error = netdev->netdev_mtu_error;
1347 if (error || netdev->mtu == mtu) {
1350 netdev->cache_valid &= ~VALID_MTU;
1353 error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
1354 SIOCSIFMTU, "SIOCSIFMTU");
/* Cache success and ENODEV; other errors are treated as transient. */
1355 if (!error || error == ENODEV) {
1356 netdev->netdev_mtu_error = error;
1357 netdev->mtu = ifr.ifr_mtu;
1358 netdev->cache_valid |= VALID_MTU;
1361 ovs_mutex_unlock(&netdev->mutex);
1365 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1366 * On failure, returns a negative errno value. */
1368 netdev_linux_get_ifindex(const struct netdev *netdev_)
1370 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1373 ovs_mutex_lock(&netdev->mutex);
1374 error = get_ifindex(netdev_, &ifindex);
1375 ovs_mutex_unlock(&netdev->mutex);
/* Encode failure as a negative errno so a single int can carry both cases. */
1377 return error ? -error : ifindex;
/* Reports link state: miimon result when miimon polling is enabled, otherwise
 * the kernel's IFF_RUNNING flag. */
1381 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1383 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1385 ovs_mutex_lock(&netdev->mutex);
1386 if (netdev->miimon_interval > 0) {
1387 *carrier = netdev->miimon;
1389 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1391 ovs_mutex_unlock(&netdev->mutex);
/* Returns the number of carrier (link state) transitions seen on 'netdev_'. */
1396 static long long int
1397 netdev_linux_get_carrier_resets(const struct netdev *netdev_)
1399 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1400 long long int carrier_resets;
1402 ovs_mutex_lock(&netdev->mutex);
1403 carrier_resets = netdev->carrier_resets;
1404 ovs_mutex_unlock(&netdev->mutex);
1406 return carrier_resets;
/* Issues the MII ioctl 'cmd' ('cmd_name' is for logging) on device 'name',
 * copying 'data' in and out through ifr.ifr_data. */
1410 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1411 struct mii_ioctl_data *data)
1416 memset(&ifr, 0, sizeof ifr);
1417 memcpy(&ifr.ifr_data, data, sizeof *data);
1418 error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
1419 memcpy(data, &ifr.ifr_data, sizeof *data);
/* Queries link state for 'name' into '*miimon': first via MII registers
 * (SIOCGMIIPHY + SIOCGMIIREG reading BMSR), falling back to ethtool
 * ETHTOOL_GLINK if MII is unsupported. */
1425 netdev_linux_get_miimon(const char *name, bool *miimon)
1427 struct mii_ioctl_data data;
1432 memset(&data, 0, sizeof data);
1433 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1435 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1436 data.reg_num = MII_BMSR;
1437 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1441 *miimon = !!(data.val_out & BMSR_LSTATUS);
1443 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1446 struct ethtool_cmd ecmd;
1448 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1451 COVERAGE_INC(netdev_get_ethtool);
1452 memset(&ecmd, 0, sizeof ecmd);
1453 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1456 struct ethtool_value eval;
/* ETHTOOL_GLINK returns a struct ethtool_value in the same buffer. */
1458 memcpy(&eval, &ecmd, sizeof eval);
1459 *miimon = !!eval.data;
1461 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
/* Enables (interval > 0) or disables (interval <= 0) miimon polling on
 * 'netdev_', clamping the polling period to at least 100 ms. */
1469 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1470 long long int interval)
1472 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1474 ovs_mutex_lock(&netdev->mutex);
1475 interval = interval > 0 ? MAX(interval, 100) : 0;
1476 if (netdev->miimon_interval != interval) {
/* Keep the global count of miimon-enabled devices in sync on 0<->nonzero
 * transitions so the run/wait loops know whether any work exists. */
1477 if (interval && !netdev->miimon_interval) {
1478 atomic_count_inc(&miimon_cnt);
1479 } else if (!interval && netdev->miimon_interval) {
1480 atomic_count_dec(&miimon_cnt);
1483 netdev->miimon_interval = interval;
/* Force an immediate poll under the new setting. */
1484 timer_set_expired(&netdev->miimon_timer);
1486 ovs_mutex_unlock(&netdev->mutex);
/* Periodic work: polls link state on every miimon-enabled netdev whose timer
 * has expired, and signals a change notification when the state flips. */
1492 netdev_linux_miimon_run(void)
1494 struct shash device_shash;
1495 struct shash_node *node;
1497 shash_init(&device_shash);
1498 netdev_get_devices(&netdev_linux_class, &device_shash);
1499 SHASH_FOR_EACH (node, &device_shash) {
1500 struct netdev *netdev = node->data;
1501 struct netdev_linux *dev = netdev_linux_cast(netdev);
1504 ovs_mutex_lock(&dev->mutex);
1505 if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
1506 netdev_linux_get_miimon(dev->up.name, &miimon);
1507 if (miimon != dev->miimon) {
1508 dev->miimon = miimon;
1509 netdev_linux_changed(dev, dev->ifi_flags, 0);
1512 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1514 ovs_mutex_unlock(&dev->mutex);
/* netdev_get_devices() took a reference on each device; drop it. */
1515 netdev_close(netdev);
1518 shash_destroy(&device_shash);
/* Registers every active miimon timer with the poll loop so the next
 * poll_block() wakes up in time for netdev_linux_miimon_run(). */
1522 netdev_linux_miimon_wait(void)
1524 struct shash device_shash;
1525 struct shash_node *node;
1527 shash_init(&device_shash);
1528 netdev_get_devices(&netdev_linux_class, &device_shash);
1529 SHASH_FOR_EACH (node, &device_shash) {
1530 struct netdev *netdev = node->data;
1531 struct netdev_linux *dev = netdev_linux_cast(netdev);
1533 ovs_mutex_lock(&dev->mutex);
1534 if (dev->miimon_interval > 0) {
1535 timer_wait(&dev->miimon_timer);
1537 ovs_mutex_unlock(&dev->mutex);
1538 netdev_close(netdev);
1540 shash_destroy(&device_shash);
1544 swap_uint64(uint64_t *a, uint64_t *b)
/* NOTE(review): swap_uint64's body is elided in this excerpt; judging by its
 * name and its callers below it exchanges *a and *b -- TODO confirm. */
1551 /* Copies 'src' into 'dst', performing format conversion in the process.
1553 * 'src' is allowed to be misaligned. */
1555 netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1556 const struct ovs_vport_stats *src)
/* get_32aligned_u64() safely reads 64-bit fields that may only be 32-bit
 * aligned, per the comment above. */
1558 dst->rx_packets = get_32aligned_u64(&src->rx_packets);
1559 dst->tx_packets = get_32aligned_u64(&src->tx_packets);
1560 dst->rx_bytes = get_32aligned_u64(&src->rx_bytes);
1561 dst->tx_bytes = get_32aligned_u64(&src->tx_bytes);
1562 dst->rx_errors = get_32aligned_u64(&src->rx_errors);
1563 dst->tx_errors = get_32aligned_u64(&src->tx_errors);
1564 dst->rx_dropped = get_32aligned_u64(&src->rx_dropped);
1565 dst->tx_dropped = get_32aligned_u64(&src->tx_dropped);
/* ovs_vport_stats carries no detailed error breakdown, so zero the rest. */
1567 dst->collisions = 0;
1568 dst->rx_length_errors = 0;
1569 dst->rx_over_errors = 0;
1570 dst->rx_crc_errors = 0;
1571 dst->rx_frame_errors = 0;
1572 dst->rx_fifo_errors = 0;
1573 dst->rx_missed_errors = 0;
1574 dst->tx_aborted_errors = 0;
1575 dst->tx_carrier_errors = 0;
1576 dst->tx_fifo_errors = 0;
1577 dst->tx_heartbeat_errors = 0;
1578 dst->tx_window_errors = 0;
/* Fetches vport-layer stats for 'netdev' from the datapath via netlink and
 * converts them into '*stats'.  Returns 0 on success, a positive errno
 * otherwise (including when the vport reply carries no stats). */
1582 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1584 struct dpif_netlink_vport reply;
1588 error = dpif_netlink_vport_get(netdev_get_name(netdev), &reply, &buf);
1591 } else if (!reply.stats) {
1596 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
/* Wrapper that caches the vport-stats error status: after the first failure
 * with a cached error, further netlink queries are skipped. */
1604 get_stats_via_vport(const struct netdev *netdev_,
1605 struct netdev_stats *stats)
1607 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1609 if (!netdev->vport_stats_error ||
1610 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1613 error = get_stats_via_vport__(netdev_, stats);
/* ENOENT/ENODEV just mean the port isn't a datapath vport; not worth a log. */
1614 if (error && error != ENOENT && error != ENODEV) {
1615 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1617 netdev_get_name(netdev_), ovs_strerror(error));
1619 netdev->vport_stats_error = error;
1620 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
1624 /* Retrieves current device stats for 'netdev-linux'. */
1626 netdev_linux_get_stats(const struct netdev *netdev_,
1627 struct netdev_stats *stats)
1629 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1630 struct netdev_stats dev_stats;
1633 ovs_mutex_lock(&netdev->mutex);
/* Gather both vport-layer and kernel (netlink) stats, then merge. */
1634 get_stats_via_vport(netdev_, stats);
1635 error = get_stats_via_netlink(netdev_, &dev_stats);
1637 if (!netdev->vport_stats_error) {
1640 } else if (netdev->vport_stats_error) {
1641 /* stats not available from OVS then use netdev stats. */
1644 /* Use kernel netdev's packet and byte counts since vport's counters
1645 * do not reflect packet counts on the wire when GSO, TSO or GRO are
1647 stats->rx_packets = dev_stats.rx_packets;
1648 stats->rx_bytes = dev_stats.rx_bytes;
1649 stats->tx_packets = dev_stats.tx_packets;
1650 stats->tx_bytes = dev_stats.tx_bytes;
/* Error/drop counters are additive across the two sources. */
1652 stats->rx_errors += dev_stats.rx_errors;
1653 stats->tx_errors += dev_stats.tx_errors;
1654 stats->rx_dropped += dev_stats.rx_dropped;
1655 stats->tx_dropped += dev_stats.tx_dropped;
1656 stats->multicast += dev_stats.multicast;
1657 stats->collisions += dev_stats.collisions;
1658 stats->rx_length_errors += dev_stats.rx_length_errors;
1659 stats->rx_over_errors += dev_stats.rx_over_errors;
1660 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1661 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1662 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1663 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1664 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1665 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1666 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1667 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1668 stats->tx_window_errors += dev_stats.tx_window_errors;
1670 ovs_mutex_unlock(&netdev->mutex);
1675 /* Retrieves current device stats for 'netdev-tap' netdev or
1676 * netdev-internal. */
1678 netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
1680 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1681 struct netdev_stats dev_stats;
1684 ovs_mutex_lock(&netdev->mutex);
1685 get_stats_via_vport(netdev_, stats);
1686 error = get_stats_via_netlink(netdev_, &dev_stats);
1688 if (!netdev->vport_stats_error) {
1691 } else if (netdev->vport_stats_error) {
1692 /* Transmit and receive stats will appear to be swapped relative to the
1693 * other ports since we are the one sending the data, not a remote
1694 * computer. For consistency, we swap them back here. This does not
1695 * apply if we are getting stats from the vport layer because it always
1696 * tracks stats from the perspective of the switch. */
1699 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1700 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1701 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1702 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
/* Detailed error counters are host-side artifacts for a tap device;
 * clear them so they don't mislead after the swap. */
1703 stats->rx_length_errors = 0;
1704 stats->rx_over_errors = 0;
1705 stats->rx_crc_errors = 0;
1706 stats->rx_frame_errors = 0;
1707 stats->rx_fifo_errors = 0;
1708 stats->rx_missed_errors = 0;
1709 stats->tx_aborted_errors = 0;
1710 stats->tx_carrier_errors = 0;
1711 stats->tx_fifo_errors = 0;
1712 stats->tx_heartbeat_errors = 0;
1713 stats->tx_window_errors = 0;
1715 /* Use kernel netdev's packet and byte counts since vport counters
1716 * do not reflect packet counts on the wire when GSO, TSO or GRO
1718 stats->rx_packets = dev_stats.tx_packets;
1719 stats->rx_bytes = dev_stats.tx_bytes;
1720 stats->tx_packets = dev_stats.rx_packets;
1721 stats->tx_bytes = dev_stats.rx_bytes;
/* Note rx/tx are crossed when merging kernel stats, matching the
 * perspective swap described above. */
1723 stats->rx_dropped += dev_stats.tx_dropped;
1724 stats->tx_dropped += dev_stats.rx_dropped;
1726 stats->rx_errors += dev_stats.tx_errors;
1727 stats->tx_errors += dev_stats.rx_errors;
1729 stats->multicast += dev_stats.multicast;
1730 stats->collisions += dev_stats.collisions;
1732 ovs_mutex_unlock(&netdev->mutex);
/* Internal devices report vport-layer stats only; the cached vport error
 * (if any) is the result. */
1738 netdev_internal_get_stats(const struct netdev *netdev_,
1739 struct netdev_stats *stats)
1741 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1744 ovs_mutex_lock(&netdev->mutex);
1745 get_stats_via_vport(netdev_, stats);
1746 error = netdev->vport_stats_error;
1747 ovs_mutex_unlock(&netdev->mutex);
/* Queries 'netdev''s link features via ethtool ETHTOOL_GSET and caches the
 * supported/advertised/current NETDEV_F_* bitmaps (and any error) under
 * VALID_FEATURES.  No-op when the cache is already valid. */
1753 netdev_linux_read_features(struct netdev_linux *netdev)
1755 struct ethtool_cmd ecmd;
1759 if (netdev->cache_valid & VALID_FEATURES) {
1763 COVERAGE_INC(netdev_get_ethtool);
1764 memset(&ecmd, 0, sizeof ecmd);
1765 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
1766 ETHTOOL_GSET, "ETHTOOL_GSET");
1771 /* Supported features. */
1772 netdev->supported = 0;
1773 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1774 netdev->supported |= NETDEV_F_10MB_HD;
1776 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1777 netdev->supported |= NETDEV_F_10MB_FD;
1779 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1780 netdev->supported |= NETDEV_F_100MB_HD;
1782 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1783 netdev->supported |= NETDEV_F_100MB_FD;
1785 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1786 netdev->supported |= NETDEV_F_1GB_HD;
1788 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1789 netdev->supported |= NETDEV_F_1GB_FD;
1791 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1792 netdev->supported |= NETDEV_F_10GB_FD;
1794 if (ecmd.supported & SUPPORTED_TP) {
1795 netdev->supported |= NETDEV_F_COPPER;
1797 if (ecmd.supported & SUPPORTED_FIBRE) {
1798 netdev->supported |= NETDEV_F_FIBER;
1800 if (ecmd.supported & SUPPORTED_Autoneg) {
1801 netdev->supported |= NETDEV_F_AUTONEG;
1803 if (ecmd.supported & SUPPORTED_Pause) {
1804 netdev->supported |= NETDEV_F_PAUSE;
1806 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1807 netdev->supported |= NETDEV_F_PAUSE_ASYM;
1810 /* Advertised features. */
1811 netdev->advertised = 0;
1812 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1813 netdev->advertised |= NETDEV_F_10MB_HD;
1815 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1816 netdev->advertised |= NETDEV_F_10MB_FD;
1818 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1819 netdev->advertised |= NETDEV_F_100MB_HD;
1821 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1822 netdev->advertised |= NETDEV_F_100MB_FD;
1824 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1825 netdev->advertised |= NETDEV_F_1GB_HD;
1827 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1828 netdev->advertised |= NETDEV_F_1GB_FD;
1830 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1831 netdev->advertised |= NETDEV_F_10GB_FD;
1833 if (ecmd.advertising & ADVERTISED_TP) {
1834 netdev->advertised |= NETDEV_F_COPPER;
1836 if (ecmd.advertising & ADVERTISED_FIBRE) {
1837 netdev->advertised |= NETDEV_F_FIBER;
1839 if (ecmd.advertising & ADVERTISED_Autoneg) {
1840 netdev->advertised |= NETDEV_F_AUTONEG;
1842 if (ecmd.advertising & ADVERTISED_Pause) {
1843 netdev->advertised |= NETDEV_F_PAUSE;
1845 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1846 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
1849 /* Current settings. */
1851 if (speed == SPEED_10) {
1852 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1853 } else if (speed == SPEED_100) {
1854 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1855 } else if (speed == SPEED_1000) {
1856 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1857 } else if (speed == SPEED_10000) {
1858 netdev->current = NETDEV_F_10GB_FD;
/* Raw numeric speeds used where SPEED_40000/SPEED_100000 style macros
 * were presumably unavailable in the targeted kernel headers. */
1859 } else if (speed == 40000) {
1860 netdev->current = NETDEV_F_40GB_FD;
1861 } else if (speed == 100000) {
1862 netdev->current = NETDEV_F_100GB_FD;
1863 } else if (speed == 1000000) {
1864 netdev->current = NETDEV_F_1TB_FD;
1866 netdev->current = 0;
1869 if (ecmd.port == PORT_TP) {
1870 netdev->current |= NETDEV_F_COPPER;
1871 } else if (ecmd.port == PORT_FIBRE) {
1872 netdev->current |= NETDEV_F_FIBER;
1876 netdev->current |= NETDEV_F_AUTONEG;
/* Cache the outcome, including a failed ethtool query. */
1880 netdev->cache_valid |= VALID_FEATURES;
1881 netdev->get_features_error = error;
1884 /* Stores the features supported by 'netdev' into of '*current', '*advertised',
1885 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
1886 * Returns 0 if successful, otherwise a positive errno value. */
1888 netdev_linux_get_features(const struct netdev *netdev_,
1889 enum netdev_features *current,
1890 enum netdev_features *advertised,
1891 enum netdev_features *supported,
1892 enum netdev_features *peer)
1894 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1897 ovs_mutex_lock(&netdev->mutex);
/* Refresh (or reuse) the cached ethtool feature bitmaps. */
1898 netdev_linux_read_features(netdev);
1899 if (!netdev->get_features_error) {
1900 *current = netdev->current;
1901 *advertised = netdev->advertised;
1902 *supported = netdev->supported;
1903 *peer = 0; /* XXX */
1905 error = netdev->get_features_error;
1906 ovs_mutex_unlock(&netdev->mutex);
1911 /* Set the features advertised by 'netdev' to 'advertise'. */
1913 netdev_linux_set_advertisements(struct netdev *netdev_,
1914 enum netdev_features advertise)
1916 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1917 struct ethtool_cmd ecmd;
1920 ovs_mutex_lock(&netdev->mutex);
/* Read-modify-write: fetch current ethtool settings, rewrite only the
 * advertising mask, then push the result back with ETHTOOL_SSET. */
1922 COVERAGE_INC(netdev_get_ethtool);
1923 memset(&ecmd, 0, sizeof ecmd);
1924 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1925 ETHTOOL_GSET, "ETHTOOL_GSET");
1930 ecmd.advertising = 0;
1931 if (advertise & NETDEV_F_10MB_HD) {
1932 ecmd.advertising |= ADVERTISED_10baseT_Half;
1934 if (advertise & NETDEV_F_10MB_FD) {
1935 ecmd.advertising |= ADVERTISED_10baseT_Full;
1937 if (advertise & NETDEV_F_100MB_HD) {
1938 ecmd.advertising |= ADVERTISED_100baseT_Half;
1940 if (advertise & NETDEV_F_100MB_FD) {
1941 ecmd.advertising |= ADVERTISED_100baseT_Full;
1943 if (advertise & NETDEV_F_1GB_HD) {
1944 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1946 if (advertise & NETDEV_F_1GB_FD) {
1947 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1949 if (advertise & NETDEV_F_10GB_FD) {
1950 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1952 if (advertise & NETDEV_F_COPPER) {
1953 ecmd.advertising |= ADVERTISED_TP;
1955 if (advertise & NETDEV_F_FIBER) {
1956 ecmd.advertising |= ADVERTISED_FIBRE;
1958 if (advertise & NETDEV_F_AUTONEG) {
1959 ecmd.advertising |= ADVERTISED_Autoneg;
1961 if (advertise & NETDEV_F_PAUSE) {
1962 ecmd.advertising |= ADVERTISED_Pause;
1964 if (advertise & NETDEV_F_PAUSE_ASYM) {
1965 ecmd.advertising |= ADVERTISED_Asym_Pause;
1967 COVERAGE_INC(netdev_set_ethtool);
1968 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1969 ETHTOOL_SSET, "ETHTOOL_SSET");
1972 ovs_mutex_unlock(&netdev->mutex);
1976 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1977 * successful, otherwise a positive errno value. */
1979 netdev_linux_set_policing(struct netdev *netdev_,
1980 uint32_t kbits_rate, uint32_t kbits_burst)
1982 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1983 const char *netdev_name = netdev_get_name(netdev_);
/* Normalize the burst: 0 when unpoliced, default 1000 kbits when a rate is
 * given without a burst, otherwise the caller's value. */
1986 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1987 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1988 : kbits_burst); /* Stick with user-specified value. */
1990 ovs_mutex_lock(&netdev->mutex);
1991 if (netdev->cache_valid & VALID_POLICING) {
1992 error = netdev->netdev_policing_error;
1993 if (error || (netdev->kbits_rate == kbits_rate &&
1994 netdev->kbits_burst == kbits_burst)) {
1995 /* Assume that settings haven't changed since we last set them. */
1998 netdev->cache_valid &= ~VALID_POLICING;
2001 COVERAGE_INC(netdev_set_policing);
2002 /* Remove any existing ingress qdisc. */
2003 error = tc_add_del_ingress_qdisc(netdev_, false);
2005 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
2006 netdev_name, ovs_strerror(error));
/* A nonzero rate installs a fresh ingress qdisc plus a policer action;
 * (elided lines presumably skip this when kbits_rate is 0 -- confirm). */
2011 error = tc_add_del_ingress_qdisc(netdev_, true);
2013 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
2014 netdev_name, ovs_strerror(error));
2018 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
2020 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
2021 netdev_name, ovs_strerror(error));
2026 netdev->kbits_rate = kbits_rate;
2027 netdev->kbits_burst = kbits_burst;
2030 if (!error || error == ENODEV) {
2031 netdev->netdev_policing_error = error;
2032 netdev->cache_valid |= VALID_POLICING;
2034 ovs_mutex_unlock(&netdev->mutex);
/* Adds to 'types' the OVS name of every installable QoS discipline in the
 * global 'tcs' table (the default, unnamed qdisc is excluded). */
2039 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
2042 const struct tc_ops *const *opsp;
2044 for (opsp = tcs; *opsp != NULL; opsp++) {
2045 const struct tc_ops *ops = *opsp;
2046 if (ops->tc_install && ops->ovs_name[0] != '\0') {
2047 sset_add(types, ops->ovs_name);
/* Finds the tc_ops whose OVS-facing name is 'name', or NULL (the NULL return
 * path is elided in this excerpt). */
2053 static const struct tc_ops *
2054 tc_lookup_ovs_name(const char *name)
2056 const struct tc_ops *const *opsp;
2058 for (opsp = tcs; *opsp != NULL; opsp++) {
2059 const struct tc_ops *ops = *opsp;
2060 if (!strcmp(name, ops->ovs_name)) {
/* Same lookup keyed by the Linux qdisc name; linux_name may be NULL for
 * pseudo-disciplines, hence the extra guard. */
2067 static const struct tc_ops *
2068 tc_lookup_linux_name(const char *name)
2070 const struct tc_ops *const *opsp;
2072 for (opsp = tcs; *opsp != NULL; opsp++) {
2073 const struct tc_ops *ops = *opsp;
2074 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Looks up queue 'queue_id' in 'netdev''s queue hash using precomputed
 * 'hash'; returns NULL when absent (elided here). */
2081 static struct tc_queue *
2082 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
2085 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2086 struct tc_queue *queue;
2088 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
2089 if (queue->queue_id == queue_id) {
/* Convenience wrapper that hashes 'queue_id' itself. */
2096 static struct tc_queue *
2097 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
2099 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Reports the queue capacity of QoS type 'type', failing when the type is
 * unknown (error path elided here). */
2103 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
2105 struct netdev_qos_capabilities *caps)
2107 const struct tc_ops *ops = tc_lookup_ovs_name(type);
2111 caps->n_queues = ops->n_queues;
/* Reports the currently installed QoS type name in '*typep' and its
 * configuration in 'details', querying the kernel qdisc first. */
2116 netdev_linux_get_qos(const struct netdev *netdev_,
2117 const char **typep, struct smap *details)
2119 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2122 ovs_mutex_lock(&netdev->mutex);
2123 error = tc_query_qdisc(netdev_);
2125 *typep = netdev->tc->ops->ovs_name;
2126 error = (netdev->tc->ops->qdisc_get
2127 ? netdev->tc->ops->qdisc_get(netdev_, details)
2130 ovs_mutex_unlock(&netdev->mutex);
/* Installs QoS discipline 'type' with 'details' on 'netdev_'.  Reconfigures in
 * place when the type is unchanged; otherwise deletes the old qdisc and
 * installs the new one. */
2136 netdev_linux_set_qos(struct netdev *netdev_,
2137 const char *type, const struct smap *details)
2139 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2140 const struct tc_ops *new_ops;
2143 new_ops = tc_lookup_ovs_name(type);
2144 if (!new_ops || !new_ops->tc_install) {
2148 ovs_mutex_lock(&netdev->mutex);
2149 error = tc_query_qdisc(netdev_);
2154 if (new_ops == netdev->tc->ops) {
2155 error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
2157 /* Delete existing qdisc. */
2158 error = tc_del_qdisc(netdev_);
2162 ovs_assert(netdev->tc == NULL);
2164 /* Install new qdisc. */
2165 error = new_ops->tc_install(netdev_, details);
2166 ovs_assert((error == 0) == (netdev->tc != NULL));
2170 ovs_mutex_unlock(&netdev->mutex);
/* Retrieves configuration of queue 'queue_id' into 'details' via the
 * discipline's class_get hook. */
2175 netdev_linux_get_queue(const struct netdev *netdev_,
2176 unsigned int queue_id, struct smap *details)
2178 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2181 ovs_mutex_lock(&netdev->mutex);
2182 error = tc_query_qdisc(netdev_);
2184 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2186 ? netdev->tc->ops->class_get(netdev_, queue, details)
2189 ovs_mutex_unlock(&netdev->mutex);
/* Configures queue 'queue_id' from 'details'; rejects ids beyond the
 * discipline's capacity or disciplines without a class_set hook. */
2195 netdev_linux_set_queue(struct netdev *netdev_,
2196 unsigned int queue_id, const struct smap *details)
2198 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2201 ovs_mutex_lock(&netdev->mutex);
2202 error = tc_query_qdisc(netdev_);
2204 error = (queue_id < netdev->tc->ops->n_queues
2205 && netdev->tc->ops->class_set
2206 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
2209 ovs_mutex_unlock(&netdev->mutex);
/* Deletes queue 'queue_id' through the discipline's class_delete hook when
 * the queue exists and deletion is supported. */
2215 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
2217 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2220 ovs_mutex_lock(&netdev->mutex);
2221 error = tc_query_qdisc(netdev_);
2223 if (netdev->tc->ops->class_delete) {
2224 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2226 ? netdev->tc->ops->class_delete(netdev_, queue)
2232 ovs_mutex_unlock(&netdev->mutex);
/* Fills '*stats' for queue 'queue_id', including its creation time, through
 * the discipline's class_get_stats hook. */
2238 netdev_linux_get_queue_stats(const struct netdev *netdev_,
2239 unsigned int queue_id,
2240 struct netdev_queue_stats *stats)
2242 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2245 ovs_mutex_lock(&netdev->mutex);
2246 error = tc_query_qdisc(netdev_);
2248 if (netdev->tc->ops->class_get_stats) {
2249 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2251 stats->created = queue->created;
2252 error = netdev->tc->ops->class_get_stats(netdev_, queue,
2261 ovs_mutex_unlock(&netdev->mutex);
/* State carried across an RTM_GETTCLASS netlink dump of a device's traffic
 * classes (the 'buf' member declaration is elided in this excerpt). */
2266 struct queue_dump_state {
2267 struct nl_dump dump;
/* Begins a netlink dump of all tc classes on 'netdev' into 'state'; the
 * failure path when the request cannot be built is elided here. */
2272 start_queue_dump(const struct netdev *netdev, struct queue_dump_state *state)
2274 struct ofpbuf request;
2275 struct tcmsg *tcmsg;
2277 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
2281 tcmsg->tcm_parent = 0;
2282 nl_dump_start(&state->dump, NETLINK_ROUTE, &request);
2283 ofpbuf_uninit(&request);
2285 ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
/* Releases dump resources and returns the dump's final status. */
2290 finish_queue_dump(struct queue_dump_state *state)
2292 ofpbuf_uninit(&state->buf);
2293 return nl_dump_done(&state->dump);
/* Iterator state for netdev_linux_queue_dump_{start,next,done}: a snapshot of
 * queue ids plus a cursor (remaining members elided in this excerpt). */
2296 struct netdev_linux_queue_state {
2297 unsigned int *queues;
/* Snapshots the ids of all queues on 'netdev_' into a freshly allocated
 * iterator state stored in '*statep'. */
2303 netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
2305 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2308 ovs_mutex_lock(&netdev->mutex);
2309 error = tc_query_qdisc(netdev_);
2311 if (netdev->tc->ops->class_get) {
2312 struct netdev_linux_queue_state *state;
2313 struct tc_queue *queue;
2316 *statep = state = xmalloc(sizeof *state);
2317 state->n_queues = hmap_count(&netdev->tc->queues);
2318 state->cur_queue = 0;
2319 state->queues = xmalloc(state->n_queues * sizeof *state->queues);
2322 HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
2323 state->queues[i++] = queue->queue_id;
2329 ovs_mutex_unlock(&netdev->mutex);
/* Advances the iterator: yields the next snapshotted queue id that still
 * exists, fetching its details via class_get.  Queues deleted since the
 * snapshot are silently skipped. */
2335 netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
2336 unsigned int *queue_idp, struct smap *details)
2338 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2339 struct netdev_linux_queue_state *state = state_;
2342 ovs_mutex_lock(&netdev->mutex);
2343 while (state->cur_queue < state->n_queues) {
2344 unsigned int queue_id = state->queues[state->cur_queue++];
2345 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2348 *queue_idp = queue_id;
2349 error = netdev->tc->ops->class_get(netdev_, queue, details);
2353 ovs_mutex_unlock(&netdev->mutex);
/* Frees the iterator state allocated by queue_dump_start. */
2359 netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
2362 struct netdev_linux_queue_state *state = state_;
2364 free(state->queues);
/* Invokes 'cb' on the stats of every queue, streaming RTM_GETTCLASS replies
 * through the discipline's class_dump_stats hook. */
2370 netdev_linux_dump_queue_stats(const struct netdev *netdev_,
2371 netdev_dump_queue_stats_cb *cb, void *aux)
2373 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2376 ovs_mutex_lock(&netdev->mutex);
2377 error = tc_query_qdisc(netdev_);
2379 struct queue_dump_state state;
2381 if (!netdev->tc->ops->class_dump_stats) {
2383 } else if (!start_queue_dump(netdev_, &state)) {
2389 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
2390 retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
2397 retval = finish_queue_dump(&state);
2403 ovs_mutex_unlock(&netdev->mutex);
/* Reports 'netdev''s IPv4 address and netmask, caching both (and any error)
 * under VALID_IN4.  EADDRNOTAVAIL when no address is assigned. */
2409 netdev_linux_get_in4(const struct netdev *netdev_,
2410 struct in_addr *address, struct in_addr *netmask)
2412 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2415 ovs_mutex_lock(&netdev->mutex);
2416 if (!(netdev->cache_valid & VALID_IN4)) {
2417 error = netdev_linux_get_ipv4(netdev_, &netdev->address,
2418 SIOCGIFADDR, "SIOCGIFADDR");
2420 error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
2421 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2423 netdev->in4_error = error;
2424 netdev->cache_valid |= VALID_IN4;
2426 error = netdev->in4_error;
/* INADDR_ANY in the cache means "no address assigned". */
2430 if (netdev->address.s_addr != INADDR_ANY) {
2431 *address = netdev->address;
2432 *netmask = netdev->netmask;
2434 error = EADDRNOTAVAIL;
2437 ovs_mutex_unlock(&netdev->mutex);
/* Assigns IPv4 'address'/'netmask' to 'netdev_' via SIOCSIFADDR and (when the
 * address is nonzero) SIOCSIFNETMASK, updating the in4 cache on success and
 * invalidating it on failure. */
2443 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2444 struct in_addr netmask)
2446 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2449 ovs_mutex_lock(&netdev->mutex);
2450 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2452 netdev->address = address;
2453 netdev->netmask = netmask;
2454 if (address.s_addr != INADDR_ANY) {
2455 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2456 "SIOCSIFNETMASK", netmask);
2461 netdev->cache_valid |= VALID_IN4;
2462 netdev->in4_error = 0;
2464 netdev->cache_valid &= ~VALID_IN4;
2466 ovs_mutex_unlock(&netdev->mutex);
/* Parses one /proc/net/if_inet6 line: 32 hex digits of address, four skipped
 * hex fields, then the interface name.  Returns the ovs_scan() result. */
2472 parse_if_inet6_line(const char *line,
2473 struct in6_addr *in6, char ifname[16 + 1])
2475 uint8_t *s6 = in6->s6_addr;
2476 #define X8 "%2"SCNx8
2477 return ovs_scan(line,
2478 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
2479 "%*x %*x %*x %*x %16s\n",
2480 &s6[0], &s6[1], &s6[2], &s6[3],
2481 &s6[4], &s6[5], &s6[6], &s6[7],
2482 &s6[8], &s6[9], &s6[10], &s6[11],
2483 &s6[12], &s6[13], &s6[14], &s6[15],
2487 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address.
2488 * Otherwise, sets '*in6' to 'in6addr_any' and returns the corresponding
2491 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2493 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2496 ovs_mutex_lock(&netdev->mutex);
2497 if (!(netdev->cache_valid & VALID_IN6)) {
/* Default to "no address" until a matching if_inet6 line is found. */
2501 netdev->in6 = in6addr_any;
2502 netdev->in6_error = EADDRNOTAVAIL;
2504 file = fopen("/proc/net/if_inet6", "r");
2506 const char *name = netdev_get_name(netdev_);
2507 while (fgets(line, sizeof line, file)) {
2508 struct in6_addr in6_tmp;
2509 char ifname[16 + 1];
2510 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2511 && !strcmp(name, ifname))
2513 netdev->in6 = in6_tmp;
2514 netdev->in6_error = 0;
/* No /proc/net/if_inet6 at all: IPv6 unsupported on this host. */
2520 netdev->in6_error = EOPNOTSUPP;
2522 netdev->cache_valid |= VALID_IN6;
2525 error = netdev->in6_error;
2526 ovs_mutex_unlock(&netdev->mutex);
/* Fills 'sa' with an AF_INET sockaddr holding 'addr'.  A sockaddr_in is
 * built separately and memcpy()'d into place rather than casting 'sa',
 * which avoids alignment/aliasing problems. */
2532 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
2534 struct sockaddr_in sin;
2535 memset(&sin, 0, sizeof sin);
2536 sin.sin_family = AF_INET;
2537 sin.sin_addr = addr;
2540 memset(sa, 0, sizeof *sa);
2541 memcpy(sa, &sin, sizeof sin);
/* Issues address-setting ioctl 'ioctl_nr' ('ioctl_name' is used for error
 * reporting) on 'netdev', passing 'addr' in the request's ifr_addr field. */
2545 do_set_addr(struct netdev *netdev,
2546 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2550 make_in4_sockaddr(&ifr.ifr_addr, addr);
2551 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2555 /* Adds 'router' as a default IP gateway. */
2557 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2559 struct in_addr any = { INADDR_ANY };
2563 memset(&rt, 0, sizeof rt);
/* Destination 0.0.0.0/0 with RTF_GATEWAY == a default route via 'router'. */
2564 make_in4_sockaddr(&rt.rt_dst, any);
2565 make_in4_sockaddr(&rt.rt_gateway, router);
2566 make_in4_sockaddr(&rt.rt_genmask, any);
2567 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2568 error = af_inet_ioctl(SIOCADDRT, &rt);
2570 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
/* Scans /proc/net/route for a route matching '*host'.  On a match, sets
 * '*next_hop' to 0 if the host is directly reachable or to the gateway
 * address otherwise, and '*netdev_name' to a malloc'd copy of the egress
 * interface name (caller owns and must free it). */
2576 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2579 static const char fn[] = "/proc/net/route";
2584 *netdev_name = NULL;
2585 stream = fopen(fn, "r");
2586 if (stream == NULL) {
2587 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
2592 while (fgets(line, sizeof line, stream)) {
2595 ovs_be32 dest, gateway, mask;
2596 int refcnt, metric, mtu;
2597 unsigned int flags, use, window, irtt;
/* Field layout mirrors one /proc/net/route row: Iface Destination
 * Gateway Flags RefCnt Use Metric Mask MTU Window IRTT. */
2600 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2602 iface, &dest, &gateway, &flags, &refcnt,
2603 &use, &metric, &mask, &mtu, &window, &irtt)) {
2604 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2608 if (!(flags & RTF_UP)) {
2609 /* Skip routes that aren't up. */
2613 /* The output of 'dest', 'mask', and 'gateway' were given in
2614  * network byte order, so we don't need need any endian
2615  * conversions here. */
2616 if ((dest & mask) == (host->s_addr & mask)) {
2618 /* The host is directly reachable. */
2619 next_hop->s_addr = 0;
2621 /* To reach the host, we must go through a gateway. */
2622 next_hop->s_addr = gateway;
2624 *netdev_name = xstrdup(iface);
/* Populates 'smap' with driver name, driver version and firmware version
 * obtained from ETHTOOL_GDRVINFO.  The ethtool result is cached under
 * VALID_DRVINFO so the ioctl runs at most once per cache cycle. */
2636 netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
2638 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2641 ovs_mutex_lock(&netdev->mutex);
2642 if (!(netdev->cache_valid & VALID_DRVINFO)) {
/* ethtool_drvinfo is queried through the generic ethtool_cmd entry
 * point, hence the cast. */
2643 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
2645 COVERAGE_INC(netdev_get_ethtool);
2646 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2647 error = netdev_linux_do_ethtool(netdev->up.name,
2650 "ETHTOOL_GDRVINFO");
2652 netdev->cache_valid |= VALID_DRVINFO;
2657 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2658 smap_add(smap, "driver_version", netdev->drvinfo.version);
2659 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
2661 ovs_mutex_unlock(&netdev->mutex);
/* Status for internal devices: no hardware behind them, so just report a
 * fixed driver name. */
2667 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2670 smap_add(smap, "driver_name", "openvswitch")
2674 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2675  * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2676  * returns 0. Otherwise, it returns a positive errno value; in particular,
2677  * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2679 netdev_linux_arp_lookup(const struct netdev *netdev,
2680 ovs_be32 ip, struct eth_addr *mac)
2683 struct sockaddr_in sin;
2686 memset(&r, 0, sizeof r);
2687 memset(&sin, 0, sizeof sin);
2688 sin.sin_family = AF_INET;
2689 sin.sin_addr.s_addr = ip;
/* arp_pa is a generic sockaddr; copy the sockaddr_in into it. */
2691 memcpy(&r.arp_pa, &sin, sizeof sin);
2692 r.arp_ha.sa_family = ARPHRD_ETHER;
2694 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2695 COVERAGE_INC(netdev_arp_lookup);
2696 retval = af_inet_ioctl(SIOCGARP, &r);
2698 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
/* ENXIO just means "no entry", which callers handle; only log other
 * failures. */
2699 } else if (retval != ENXIO) {
2700 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2701 netdev_get_name(netdev), IP_ARGS(ip),
2702 ovs_strerror(retval));
/* Converts netdev flag bits 'nd' to the corresponding Linux IFF_* bits.
 * The visible mappings are NETDEV_UP, NETDEV_PROMISC, and NETDEV_LOOPBACK. */
2708 nd_to_iff_flags(enum netdev_flags nd)
2711 if (nd & NETDEV_UP) {
2714 if (nd & NETDEV_PROMISC) {
2717 if (nd & NETDEV_LOOPBACK) {
2718 iff |= IFF_LOOPBACK;
/* Inverse of nd_to_iff_flags(): maps Linux IFF_* bits back to netdev
 * flag bits. */
2724 iff_to_nd_flags(int iff)
2726 enum netdev_flags nd = 0;
2730 if (iff & IFF_PROMISC) {
2731 nd |= NETDEV_PROMISC;
2733 if (iff & IFF_LOOPBACK) {
2734 nd |= NETDEV_LOOPBACK;
/* Turns off the flags in 'off' and turns on the flags in 'on' for 'netdev',
 * reporting the previous flag state in '*old_flagsp'.  Only calls into the
 * kernel (set_flags) when the computed flag word actually changes, and
 * re-reads the kernel's view afterwards.  Caller holds netdev->mutex. */
2740 update_flags(struct netdev_linux *netdev, enum netdev_flags off,
2741 enum netdev_flags on, enum netdev_flags *old_flagsp)
2742 OVS_REQUIRES(netdev->mutex)
2744 int old_flags, new_flags;
2747 old_flags = netdev->ifi_flags;
2748 *old_flagsp = iff_to_nd_flags(old_flags);
2749 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2750 if (new_flags != old_flags) {
2751 error = set_flags(netdev_get_name(&netdev->up), new_flags);
2752 get_flags(&netdev->up, &netdev->ifi_flags);
/* Public netdev_class entry point: locks the device and delegates to
 * update_flags(). */
2759 netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2760 enum netdev_flags on, enum netdev_flags *old_flagsp)
2762 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2765 ovs_mutex_lock(&netdev->mutex);
2766 error = update_flags(netdev, off, on, old_flagsp);
2767 ovs_mutex_unlock(&netdev->mutex);
/* Expands to a struct netdev_class initializer.  All Linux-backed classes
 * ("system", "tap", "internal") share the common implementations below;
 * only CONSTRUCT, GET_STATS, GET_FEATURES, and GET_STATUS vary per class
 * and are supplied by the macro arguments. */
2772 #define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS,          \
2773 GET_FEATURES, GET_STATUS) \
2779 netdev_linux_wait, \
2781 netdev_linux_alloc, \
2783 netdev_linux_destruct, \
2784 netdev_linux_dealloc, \
2785 NULL, /* get_config */ \
2786 NULL, /* set_config */ \
2787 NULL, /* get_tunnel_config */ \
2788 NULL, /* build header */ \
2789 NULL, /* push header */ \
2790 NULL, /* pop header */ \
2791 NULL, /* get_numa_id */ \
2792 NULL, /* set_multiq */ \
2794 netdev_linux_send, \
2795 netdev_linux_send_wait, \
2797 netdev_linux_set_etheraddr, \
2798 netdev_linux_get_etheraddr, \
2799 netdev_linux_get_mtu, \
2800 netdev_linux_set_mtu, \
2801 netdev_linux_get_ifindex, \
2802 netdev_linux_get_carrier, \
2803 netdev_linux_get_carrier_resets, \
2804 netdev_linux_set_miimon_interval, \
2808 netdev_linux_set_advertisements, \
2810 netdev_linux_set_policing, \
2811 netdev_linux_get_qos_types, \
2812 netdev_linux_get_qos_capabilities, \
2813 netdev_linux_get_qos, \
2814 netdev_linux_set_qos, \
2815 netdev_linux_get_queue, \
2816 netdev_linux_set_queue, \
2817 netdev_linux_delete_queue, \
2818 netdev_linux_get_queue_stats, \
2819 netdev_linux_queue_dump_start, \
2820 netdev_linux_queue_dump_next, \
2821 netdev_linux_queue_dump_done, \
2822 netdev_linux_dump_queue_stats, \
2824 netdev_linux_get_in4, \
2825 netdev_linux_set_in4, \
2826 netdev_linux_get_in6, \
2827 netdev_linux_add_router, \
2828 netdev_linux_get_next_hop, \
2830 netdev_linux_arp_lookup, \
2832 netdev_linux_update_flags, \
2834 netdev_linux_rxq_alloc, \
2835 netdev_linux_rxq_construct, \
2836 netdev_linux_rxq_destruct, \
2837 netdev_linux_rxq_dealloc, \
2838 netdev_linux_rxq_recv, \
2839 netdev_linux_rxq_wait, \
2840 netdev_linux_rxq_drain, \
/* The "system" class: ordinary kernel network devices. */
2843 const struct netdev_class netdev_linux_class =
2846 netdev_linux_construct,
2847 netdev_linux_get_stats,
2848 netdev_linux_get_features,
2849 netdev_linux_get_status);
/* The "tap" class: tap devices created by OVS itself; stats differ because
 * rx/tx are reversed from the guest's point of view. */
2851 const struct netdev_class netdev_tap_class =
2854 netdev_linux_construct_tap,
2855 netdev_tap_get_stats,
2856 netdev_linux_get_features,
2857 netdev_linux_get_status);
/* The "internal" class: datapath-internal ports; no meaningful link
 * features, and a distinct stats/status implementation. */
2859 const struct netdev_class netdev_internal_class =
2862 netdev_linux_construct,
2863 netdev_internal_get_stats,
2864 NULL, /* get_features */
2865 netdev_internal_get_status);
/* CoDel traffic-control class.  A classless qdisc, so it exposes no queues. */
2868 #define CODEL_N_QUEUES 0x0000
2870 /* In sufficiently new kernel headers these are defined as enums in
2871  * <linux/pkt_sched.h>.  Define them here as macros to help out with older
2872  * kernels.  (This overrides any enum definition in the header file but that's
2874 #define TCA_CODEL_TARGET 1
2875 #define TCA_CODEL_LIMIT 2
2876 #define TCA_CODEL_INTERVAL 3
/* Returns the struct codel embedded in 'netdev_''s currently installed tc. */
2885 static struct codel *
2886 codel_get__(const struct netdev *netdev_)
2888 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2889 return CONTAINER_OF(netdev->tc, struct codel, tc);
/* Records a codel tc object with the given parameters as 'netdev_''s active
 * tc.  Purely a userspace bookkeeping step; the kernel qdisc is set up
 * separately by codel_setup_qdisc__(). */
2893 codel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
2896 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2897 struct codel *codel;
2899 codel = xmalloc(sizeof *codel);
2900 tc_init(&codel->tc, &tc_ops_codel);
2901 codel->target = target;
2902 codel->limit = limit;
2903 codel->interval = interval;
2905 netdev->tc = &codel->tc;
/* Replaces 'netdev''s root qdisc with a codel qdisc via RTM_NEWQDISC.
 * Zero-valued parameters fall back to defaults: target 5000 (us), limit
 * 10240 (packets), interval 100000 (us). */
2909 codel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
2913 struct ofpbuf request;
2914 struct tcmsg *tcmsg;
2915 uint32_t otarget, olimit, ointerval;
/* Remove any existing root qdisc before installing ours. */
2918 tc_del_qdisc(netdev);
2920 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2921 NLM_F_EXCL | NLM_F_CREATE, &request);
2925 tcmsg->tcm_handle = tc_make_handle(1, 0);
2926 tcmsg->tcm_parent = TC_H_ROOT;
2928 otarget = target ? target : 5000;
2929 olimit = limit ? limit : 10240;
2930 ointerval = interval ? interval : 100000;
2932 nl_msg_put_string(&request, TCA_KIND, "codel");
2933 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2934 nl_msg_put_u32(&request, TCA_CODEL_TARGET, otarget);
2935 nl_msg_put_u32(&request, TCA_CODEL_LIMIT, olimit);
2936 nl_msg_put_u32(&request, TCA_CODEL_INTERVAL, ointerval);
2937 nl_msg_end_nested(&request, opt_offset);
2939 error = tc_transact(&request, NULL);
2941 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
2942 "target %u, limit %u, interval %u error %d(%s)",
2943 netdev_get_name(netdev),
2944 otarget, olimit, ointerval,
2945 error, ovs_strerror(error));
/* Extracts "target", "limit" and "interval" from 'details' into 'codel',
 * substituting the same defaults that codel_setup_qdisc__() uses (5000 /
 * 10240 / 100000) when a key is missing or zero. */
2951 codel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
2952 const struct smap *details, struct codel *codel)
2954 const char *target_s;
2955 const char *limit_s;
2956 const char *interval_s;
2958 target_s = smap_get(details, "target");
2959 limit_s = smap_get(details, "limit");
2960 interval_s = smap_get(details, "interval");
2962 codel->target = target_s ? strtoull(target_s, NULL, 10) : 0;
2963 codel->limit = limit_s ? strtoull(limit_s, NULL, 10) : 0;
2964 codel->interval = interval_s ? strtoull(interval_s, NULL, 10) : 0;
2966 if (!codel->target) {
2967 codel->target = 5000;
2969 if (!codel->limit) {
2970 codel->limit = 10240;
2972 if (!codel->interval) {
2973 codel->interval = 100000;
/* tc_ops "install" hook: parses 'details', programs the kernel qdisc, and on
 * success records the userspace tc state via codel_install__(). */
2978 codel_tc_install(struct netdev *netdev, const struct smap *details)
2983 codel_parse_qdisc_details__(netdev, details, &codel);
2984 error = codel_setup_qdisc__(netdev, codel.target, codel.limit,
2987 codel_install__(netdev, codel.target, codel.limit, codel.interval);
/* Parses the nested TCA_OPTIONS attributes of a codel qdisc dumped by the
 * kernel into 'codel'. */
2993 codel_parse_tca_options__(struct nlattr *nl_options, struct codel *codel)
2995 static const struct nl_policy tca_codel_policy[] = {
2996 [TCA_CODEL_TARGET] = { .type = NL_A_U32 },
2997 [TCA_CODEL_LIMIT] = { .type = NL_A_U32 },
2998 [TCA_CODEL_INTERVAL] = { .type = NL_A_U32 }
3001 struct nlattr *attrs[ARRAY_SIZE(tca_codel_policy)];
3003 if (!nl_parse_nested(nl_options, tca_codel_policy,
3004 attrs, ARRAY_SIZE(tca_codel_policy))) {
3005 VLOG_WARN_RL(&rl, "failed to parse CoDel class options");
3009 codel->target = nl_attr_get_u32(attrs[TCA_CODEL_TARGET]);
3010 codel->limit = nl_attr_get_u32(attrs[TCA_CODEL_LIMIT]);
3011 codel->interval = nl_attr_get_u32(attrs[TCA_CODEL_INTERVAL]);
/* tc_ops "load" hook: reconstructs userspace codel state from a kernel qdisc
 * dump in 'nlmsg'. */
3016 codel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3018 struct nlattr *nlattr;
3023 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3028 error = codel_parse_tca_options__(nlattr, &codel);
3033 codel_install__(netdev, codel.target, codel.limit, codel.interval);
/* tc_ops "destroy" hook: frees the codel state allocated in
 * codel_install__(). */
3039 codel_tc_destroy(struct tc *tc)
3041 struct codel *codel = CONTAINER_OF(tc, struct codel, tc);
/* tc_ops "qdisc_get" hook: reports the current parameters into 'details'. */
3047 codel_qdisc_get(const struct netdev *netdev, struct smap *details)
3049 const struct codel *codel = codel_get__(netdev);
3050 smap_add_format(details, "target", "%u", codel->target);
3051 smap_add_format(details, "limit", "%u", codel->limit);
3052 smap_add_format(details, "interval", "%u", codel->interval);
/* tc_ops "qdisc_set" hook: updates the cached parameters from 'details'. */
3057 codel_qdisc_set(struct netdev *netdev, const struct smap *details)
3061 codel_parse_qdisc_details__(netdev, details, &codel);
3062 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3063 codel_get__(netdev)->target = codel.target;
3064 codel_get__(netdev)->limit = codel.limit;
3065 codel_get__(netdev)->interval = codel.interval;
/* vtable binding the codel hooks above to the "linux-codel" QoS type. */
3069 static const struct tc_ops tc_ops_codel = {
3070 "codel", /* linux_name */
3071 "linux-codel", /* ovs_name */
3072 CODEL_N_QUEUES, /* n_queues */
3085 /* FQ-CoDel traffic control class. */
3087 #define FQCODEL_N_QUEUES 0x0000
3089 /* In sufficiently new kernel headers these are defined as enums in
3090  * <linux/pkt_sched.h>.  Define them here as macros to help out with older
3091  * kernels.  (This overrides any enum definition in the header file but that's
3093 #define TCA_FQ_CODEL_TARGET 1
3094 #define TCA_FQ_CODEL_LIMIT 2
3095 #define TCA_FQ_CODEL_INTERVAL 3
3096 #define TCA_FQ_CODEL_ECN 4
3097 #define TCA_FQ_CODEL_FLOWS 5
3098 #define TCA_FQ_CODEL_QUANTUM 6
/* Returns the struct fqcodel embedded in 'netdev_''s currently installed
 * tc. */
3109 static struct fqcodel *
3110 fqcodel_get__(const struct netdev *netdev_)
3112 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3113 return CONTAINER_OF(netdev->tc, struct fqcodel, tc);
/* Records an fq_codel tc object with the given parameters as 'netdev_''s
 * active tc.  Userspace bookkeeping only; the kernel qdisc is programmed by
 * fqcodel_setup_qdisc__(). */
3117 fqcodel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
3118 uint32_t interval, uint32_t flows, uint32_t quantum)
3120 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3121 struct fqcodel *fqcodel;
3123 fqcodel = xmalloc(sizeof *fqcodel);
3124 tc_init(&fqcodel->tc, &tc_ops_fqcodel);
3125 fqcodel->target = target;
3126 fqcodel->limit = limit;
3127 fqcodel->interval = interval;
3128 fqcodel->flows = flows;
3129 fqcodel->quantum = quantum;
3131 netdev->tc = &fqcodel->tc;
/* Replaces 'netdev''s root qdisc with an fq_codel qdisc via RTM_NEWQDISC.
 * Zero-valued parameters fall back to defaults: target 5000 (us), limit
 * 10240 (packets), interval 100000 (us), flows 1024, quantum 1514 (bytes). */
3135 fqcodel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
3136 uint32_t interval, uint32_t flows, uint32_t quantum)
3139 struct ofpbuf request;
3140 struct tcmsg *tcmsg;
3141 uint32_t otarget, olimit, ointerval, oflows, oquantum;
3144 tc_del_qdisc(netdev);
3146 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3147 NLM_F_EXCL | NLM_F_CREATE, &request);
3151 tcmsg->tcm_handle = tc_make_handle(1, 0);
3152 tcmsg->tcm_parent = TC_H_ROOT;
3154 otarget = target ? target : 5000;
3155 olimit = limit ? limit : 10240;
3156 ointerval = interval ? interval : 100000;
3157 oflows = flows ? flows : 1024;
3158 oquantum = quantum ? quantum : 1514; /* fq_codel default quantum is 1514
3161 nl_msg_put_string(&request, TCA_KIND, "fq_codel");
3162 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3163 nl_msg_put_u32(&request, TCA_FQ_CODEL_TARGET, otarget);
3164 nl_msg_put_u32(&request, TCA_FQ_CODEL_LIMIT, olimit);
3165 nl_msg_put_u32(&request, TCA_FQ_CODEL_INTERVAL, ointerval);
3166 nl_msg_put_u32(&request, TCA_FQ_CODEL_FLOWS, oflows);
3167 nl_msg_put_u32(&request, TCA_FQ_CODEL_QUANTUM, oquantum);
3168 nl_msg_end_nested(&request, opt_offset);
3170 error = tc_transact(&request, NULL);
3172 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3173 "target %u, limit %u, interval %u, flows %u, quantum %u error %d(%s)",
3174 netdev_get_name(netdev),
3175 otarget, olimit, ointerval, oflows, oquantum,
3176 error, ovs_strerror(error));
/* Extracts "target", "limit", "interval", "flows" and "quantum" from
 * 'details' into 'fqcodel', substituting defaults for missing/zero keys. */
3182 fqcodel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
3183 const struct smap *details, struct fqcodel *fqcodel)
3185 const char *target_s;
3186 const char *limit_s;
3187 const char *interval_s;
3188 const char *flows_s;
3189 const char *quantum_s;
3191 target_s = smap_get(details, "target");
3192 limit_s = smap_get(details, "limit");
3193 interval_s = smap_get(details, "interval");
3194 flows_s = smap_get(details, "flows");
3195 quantum_s = smap_get(details, "quantum");
3196 fqcodel->target = target_s ? strtoull(target_s, NULL, 10) : 0;
3197 fqcodel->limit = limit_s ? strtoull(limit_s, NULL, 10) : 0;
3198 fqcodel->interval = interval_s ? strtoull(interval_s, NULL, 10) : 0;
3199 fqcodel->flows = flows_s ? strtoull(flows_s, NULL, 10) : 0;
3200 fqcodel->quantum = quantum_s ? strtoull(quantum_s, NULL, 10) : 0;
3201 if (!fqcodel->target) {
3202 fqcodel->target = 5000;
3204 if (!fqcodel->limit) {
3205 fqcodel->limit = 10240;
3207 if (!fqcodel->interval) {
/* NOTE(review): this default (1000000) disagrees with the 100000
 * fallback in fqcodel_setup_qdisc__() and with codel's interval default;
 * the kernel's fq_codel default interval is 100ms (100000 us).  Confirm
 * whether 1000000 is intentional. */
3208 fqcodel->interval = 1000000;
3210 if (!fqcodel->flows) {
3211 fqcodel->flows = 1024;
3213 if (!fqcodel->quantum) {
3214 fqcodel->quantum = 1514;
/* tc_ops "install" hook: parses 'details', programs the kernel fq_codel
 * qdisc, and on success records the userspace tc state. */
3219 fqcodel_tc_install(struct netdev *netdev, const struct smap *details)
3222 struct fqcodel fqcodel;
3224 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3225 error = fqcodel_setup_qdisc__(netdev, fqcodel.target, fqcodel.limit,
3226 fqcodel.interval, fqcodel.flows,
3229 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit,
3230 fqcodel.interval, fqcodel.flows, fqcodel.quantum);
/* Parses the nested TCA_OPTIONS attributes of an fq_codel qdisc dumped by
 * the kernel into 'fqcodel'. */
3236 fqcodel_parse_tca_options__(struct nlattr *nl_options, struct fqcodel *fqcodel)
3238 static const struct nl_policy tca_fqcodel_policy[] = {
3239 [TCA_FQ_CODEL_TARGET] = { .type = NL_A_U32 },
3240 [TCA_FQ_CODEL_LIMIT] = { .type = NL_A_U32 },
3241 [TCA_FQ_CODEL_INTERVAL] = { .type = NL_A_U32 },
3242 [TCA_FQ_CODEL_FLOWS] = { .type = NL_A_U32 },
3243 [TCA_FQ_CODEL_QUANTUM] = { .type = NL_A_U32 }
3246 struct nlattr *attrs[ARRAY_SIZE(tca_fqcodel_policy)];
3248 if (!nl_parse_nested(nl_options, tca_fqcodel_policy,
3249 attrs, ARRAY_SIZE(tca_fqcodel_policy))) {
3250 VLOG_WARN_RL(&rl, "failed to parse FQ_CoDel class options");
3254 fqcodel->target = nl_attr_get_u32(attrs[TCA_FQ_CODEL_TARGET]);
3255 fqcodel->limit = nl_attr_get_u32(attrs[TCA_FQ_CODEL_LIMIT]);
3256 fqcodel->interval =nl_attr_get_u32(attrs[TCA_FQ_CODEL_INTERVAL]);
3257 fqcodel->flows = nl_attr_get_u32(attrs[TCA_FQ_CODEL_FLOWS]);
3258 fqcodel->quantum = nl_attr_get_u32(attrs[TCA_FQ_CODEL_QUANTUM]);
/* tc_ops "load" hook: reconstructs userspace fq_codel state from a kernel
 * qdisc dump in 'nlmsg'. */
3263 fqcodel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3265 struct nlattr *nlattr;
3268 struct fqcodel fqcodel;
3270 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3275 error = fqcodel_parse_tca_options__(nlattr, &fqcodel);
3280 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3281 fqcodel.flows, fqcodel.quantum);
/* tc_ops "destroy" hook: frees the fqcodel state allocated in
 * fqcodel_install__(). */
3286 fqcodel_tc_destroy(struct tc *tc)
3288 struct fqcodel *fqcodel = CONTAINER_OF(tc, struct fqcodel, tc);
/* tc_ops "qdisc_get" hook: reports the current fq_codel parameters. */
3294 fqcodel_qdisc_get(const struct netdev *netdev, struct smap *details)
3296 const struct fqcodel *fqcodel = fqcodel_get__(netdev);
3297 smap_add_format(details, "target", "%u", fqcodel->target);
3298 smap_add_format(details, "limit", "%u", fqcodel->limit);
3299 smap_add_format(details, "interval", "%u", fqcodel->interval);
3300 smap_add_format(details, "flows", "%u", fqcodel->flows);
3301 smap_add_format(details, "quantum", "%u", fqcodel->quantum);
/* tc_ops "qdisc_set" hook: updates the cached parameters from 'details'. */
3306 fqcodel_qdisc_set(struct netdev *netdev, const struct smap *details)
3308 struct fqcodel fqcodel;
3310 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3311 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3312 fqcodel.flows, fqcodel.quantum);
3313 fqcodel_get__(netdev)->target = fqcodel.target;
3314 fqcodel_get__(netdev)->limit = fqcodel.limit;
3315 fqcodel_get__(netdev)->interval = fqcodel.interval;
3316 fqcodel_get__(netdev)->flows = fqcodel.flows;
3317 fqcodel_get__(netdev)->quantum = fqcodel.quantum;
/* vtable binding the fq_codel hooks above to the "linux-fq_codel" QoS
 * type. */
3321 static const struct tc_ops tc_ops_fqcodel = {
3322 "fq_codel", /* linux_name */
3323 "linux-fq_codel", /* ovs_name */
3324 FQCODEL_N_QUEUES, /* n_queues */
3337 /* SFQ traffic control class. */
3339 #define SFQ_N_QUEUES 0x0000
/* Returns the struct sfq embedded in 'netdev_''s currently installed tc. */
3348 sfq_get__(const struct netdev *netdev_)
3350 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3351 return CONTAINER_OF(netdev->tc, struct sfq, tc);
/* Records an sfq tc object as 'netdev_''s active tc.  Note the parameter
 * order: quantum first, then perturb. */
3355 sfq_install__(struct netdev *netdev_, uint32_t quantum, uint32_t perturb)
3357 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3360 sfq = xmalloc(sizeof *sfq);
3361 tc_init(&sfq->tc, &tc_ops_sfq);
3362 sfq->perturb = perturb;
3363 sfq->quantum = quantum;
3365 netdev->tc = &sfq->tc;
/* Replaces 'netdev''s root qdisc with an sfq qdisc.  A zero 'quantum' falls
 * back to the device MTU (when available); a zero 'perturb' falls back to a
 * 10-second perturbation period. */
3369 sfq_setup_qdisc__(struct netdev *netdev, uint32_t quantum, uint32_t perturb)
3371 struct tc_sfq_qopt opt;
3372 struct ofpbuf request;
3373 struct tcmsg *tcmsg;
3375 int mtu_error, error;
3376 mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3378 tc_del_qdisc(netdev);
3380 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3381 NLM_F_EXCL | NLM_F_CREATE, &request);
3385 tcmsg->tcm_handle = tc_make_handle(1, 0);
3386 tcmsg->tcm_parent = TC_H_ROOT;
3388 memset(&opt, 0, sizeof opt);
3391 opt.quantum = mtu; /* if we cannot find mtu, use default */
3394 opt.quantum = quantum;
3398 opt.perturb_period = 10;
3400 opt.perturb_period = perturb;
/* SFQ takes its options as a raw struct tc_sfq_qopt, not nested attrs. */
3403 nl_msg_put_string(&request, TCA_KIND, "sfq");
3404 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3406 error = tc_transact(&request, NULL);
3408 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3409 "quantum %u, perturb %u error %d(%s)",
3410 netdev_get_name(netdev),
3411 opt.quantum, opt.perturb_period,
3412 error, ovs_strerror(error));
/* Extracts "perturb" and "quantum" from 'details' into 'sfq'.  A missing
 * quantum falls back to the device MTU; a device without an MTU cannot use
 * SFQ without an explicit quantum. */
3418 sfq_parse_qdisc_details__(struct netdev *netdev,
3419 const struct smap *details, struct sfq *sfq)
3421 const char *perturb_s;
3422 const char *quantum_s;
3426 perturb_s = smap_get(details, "perturb");
3427 quantum_s = smap_get(details, "quantum");
3428 sfq->perturb = perturb_s ? strtoull(perturb_s, NULL, 10) : 0;
3429 sfq->quantum = quantum_s ? strtoull(quantum_s, NULL, 10) : 0;
3430 if (!sfq->perturb) {
3434 if (!sfq->quantum) {
3435 mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3439 VLOG_WARN_RL(&rl, "when using SFQ, you must specify quantum on a "
3440 "device without mtu");
/* tc_ops "install" hook: parses 'details', programs the kernel sfq qdisc,
 * and on success records the userspace tc state. */
3447 sfq_tc_install(struct netdev *netdev, const struct smap *details)
3452 sfq_parse_qdisc_details__(netdev, details, &sfq);
3453 error = sfq_setup_qdisc__(netdev, sfq.quantum, sfq.perturb);
3455 sfq_install__(netdev, sfq.quantum, sfq.perturb);
/* tc_ops "load" hook: reconstructs userspace sfq state from a kernel qdisc
 * dump in 'nlmsg'.  The kernel supplies SFQ options as a raw struct
 * tc_sfq_qopt in TCA_OPTIONS. */
3461 sfq_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3463 const struct tc_sfq_qopt *sfq;
3464 struct nlattr *nlattr;
3468 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3470 sfq = nl_attr_get(nlattr);
/* Bug fix: sfq_install__() takes (netdev, quantum, perturb); the previous
 * code passed perturb_period and quantum swapped, so a loaded qdisc's
 * parameters were reported and re-applied in the wrong fields. */
3471 sfq_install__(netdev, sfq->quantum, sfq->perturb_period);
/* tc_ops "destroy" hook: frees the sfq state allocated in sfq_install__(). */
3479 sfq_tc_destroy(struct tc *tc)
3481 struct sfq *sfq = CONTAINER_OF(tc, struct sfq, tc);
/* tc_ops "qdisc_get" hook: reports the current quantum/perturb values. */
3487 sfq_qdisc_get(const struct netdev *netdev, struct smap *details)
3489 const struct sfq *sfq = sfq_get__(netdev);
3490 smap_add_format(details, "quantum", "%u", sfq->quantum);
3491 smap_add_format(details, "perturb", "%u", sfq->perturb);
/* tc_ops "qdisc_set" hook: updates the cached parameters from 'details'. */
3496 sfq_qdisc_set(struct netdev *netdev, const struct smap *details)
3500 sfq_parse_qdisc_details__(netdev, details, &sfq);
3501 sfq_install__(netdev, sfq.quantum, sfq.perturb);
3502 sfq_get__(netdev)->quantum = sfq.quantum;
3503 sfq_get__(netdev)->perturb = sfq.perturb;
/* vtable binding the sfq hooks above to the "linux-sfq" QoS type. */
3507 static const struct tc_ops tc_ops_sfq = {
3508 "sfq", /* linux_name */
3509 "linux-sfq", /* ovs_name */
3510 SFQ_N_QUEUES, /* n_queues */
3523 /* HTB traffic control class. */
3525 #define HTB_N_QUEUES 0xf000
/* r2q value passed to the kernel; classes whose quantum would fall below the
 * MTU get an explicit quantum instead (see htb_setup_class__). */
3526 #define HTB_RATE2QUANTUM 10
3530 unsigned int max_rate; /* In bytes/s. */
/* Per-queue HTB class state, hung off tc.queues via 'tc_queue'. */
3534 struct tc_queue tc_queue;
3535 unsigned int min_rate; /* In bytes/s. */
3536 unsigned int max_rate; /* In bytes/s. */
3537 unsigned int burst; /* In bytes. */
3538 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the struct htb embedded in 'netdev_''s currently installed tc. */
3542 htb_get__(const struct netdev *netdev_)
3544 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3545 return CONTAINER_OF(netdev->tc, struct htb, tc);
/* Records an htb tc object with 'max_rate' as 'netdev_''s active tc.
 * Userspace bookkeeping only. */
3549 htb_install__(struct netdev *netdev_, uint64_t max_rate)
3551 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3554 htb = xmalloc(sizeof *htb);
3555 tc_init(&htb->tc, &tc_ops_htb);
3556 htb->max_rate = max_rate;
3558 netdev->tc = &htb->tc;
3561 /* Create an HTB qdisc.
3563  * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
3565 htb_setup_qdisc__(struct netdev *netdev)
3568 struct tc_htb_glob opt;
3569 struct ofpbuf request;
3570 struct tcmsg *tcmsg;
3572 tc_del_qdisc(netdev);
3574 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3575 NLM_F_EXCL | NLM_F_CREATE, &request);
3579 tcmsg->tcm_handle = tc_make_handle(1, 0);
3580 tcmsg->tcm_parent = TC_H_ROOT;
3582 nl_msg_put_string(&request, TCA_KIND, "htb");
3584 memset(&opt, 0, sizeof opt);
3585 opt.rate2quantum = HTB_RATE2QUANTUM;
3589 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3590 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
3591 nl_msg_end_nested(&request, opt_offset);
3593 return tc_transact(&request, NULL);
3596 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
3597  * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
3599 htb_setup_class__(struct netdev *netdev, unsigned int handle,
3600 unsigned int parent, struct htb_class *class)
3603 struct tc_htb_opt opt;
3604 struct ofpbuf request;
3605 struct tcmsg *tcmsg;
/* HTB rate tables are computed from the MTU, so the MTU is required. */
3609 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3611 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
3612 netdev_get_name(netdev));
3616 memset(&opt, 0, sizeof opt);
3617 tc_fill_rate(&opt.rate, class->min_rate, mtu);
3618 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
3619 /* Makes sure the quantum is at least MTU.  Setting quantum will
3620  * make htb ignore the r2q for this class. */
3621 if ((class->min_rate / HTB_RATE2QUANTUM) < mtu) {
3624 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
3625 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
3626 opt.prio = class->priority;
3628 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3632 tcmsg->tcm_handle = handle;
3633 tcmsg->tcm_parent = parent;
3635 nl_msg_put_string(&request, TCA_KIND, "htb");
3636 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3637 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
/* The kernel needs explicit rate tables for both rate and ceil. */
3638 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
3639 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
3640 nl_msg_end_nested(&request, opt_offset);
3642 error = tc_transact(&request, NULL);
3644 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3645 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
3646 netdev_get_name(netdev),
3647 tc_get_major(handle), tc_get_minor(handle),
3648 tc_get_major(parent), tc_get_minor(parent),
3649 class->min_rate, class->max_rate,
3650 class->burst, class->priority, ovs_strerror(error));
3655 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
3656  * description of them into 'details'.  The description complies with the
3657  * specification given in the vswitch database documentation for linux-htb
3660 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
3662 static const struct nl_policy tca_htb_policy[] = {
3663 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
3664 .min_len = sizeof(struct tc_htb_opt) },
3667 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
3668 const struct tc_htb_opt *htb;
3670 if (!nl_parse_nested(nl_options, tca_htb_policy,
3671 attrs, ARRAY_SIZE(tca_htb_policy))) {
3672 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
3676 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
3677 class->min_rate = htb->rate.rate;
3678 class->max_rate = htb->ceil.rate;
/* The kernel reports burst as a time ('buffer'); convert back to bytes. */
3679 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
3680 class->priority = htb->prio;
/* Parses an RTM_NEWTCLASS-style message in 'tcmsg'.  Optionally extracts the
 * OVS queue id (classes are 1:<queue+1>, so only minor numbers 1..HTB_N_QUEUES
 * under major 1 map to queues), the class options, and queue stats. */
3685 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3686 struct htb_class *options,
3687 struct netdev_queue_stats *stats)
3689 struct nlattr *nl_options;
3690 unsigned int handle;
3693 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3694 if (!error && queue_id) {
3695 unsigned int major = tc_get_major(handle);
3696 unsigned int minor = tc_get_minor(handle);
3697 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3698 *queue_id = minor - 1;
3703 if (!error && options) {
3704 error = htb_parse_tca_options__(nl_options, options);
/* Extracts "max-rate" (bits/s in the database, stored as bytes/s) from
 * 'details' into 'hc'.  When absent, falls back to the link's current speed
 * (or 100 Mbps if that is unknown).  min_rate mirrors max_rate for the
 * default class. */
3710 htb_parse_qdisc_details__(struct netdev *netdev_,
3711 const struct smap *details, struct htb_class *hc)
3713 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3714 const char *max_rate_s;
3716 max_rate_s = smap_get(details, "max-rate");
3717 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3718 if (!hc->max_rate) {
3719 enum netdev_features current;
3721 netdev_linux_read_features(netdev);
3722 current = !netdev->get_features_error ? netdev->current : 0;
3723 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3725 hc->min_rate = hc->max_rate;
/* Extracts per-class "min-rate", "max-rate", "burst" and "priority" from
 * 'details' into 'hc', clamping rates into [mtu, qdisc max_rate] and burst
 * to at least MTU + 64 bytes.  Database rates are bits/s; stored bytes/s. */
3731 htb_parse_class_details__(struct netdev *netdev,
3732 const struct smap *details, struct htb_class *hc)
3734 const struct htb *htb = htb_get__(netdev);
3735 const char *min_rate_s = smap_get(details, "min-rate");
3736 const char *max_rate_s = smap_get(details, "max-rate");
3737 const char *burst_s = smap_get(details, "burst");
3738 const char *priority_s = smap_get(details, "priority");
3741 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3743 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
3744 netdev_get_name(netdev));
3748 /* HTB requires at least an mtu sized min-rate to send any traffic even
3749  * on uncongested links. */
3750 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3751 hc->min_rate = MAX(hc->min_rate, mtu);
3752 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
3755 hc->max_rate = (max_rate_s
3756 ? strtoull(max_rate_s, NULL, 10) / 8
3758 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
3759 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
3763  * According to hints in the documentation that I've read, it is important
3764  * that 'burst' be at least as big as the largest frame that might be
3765  * transmitted.  Also, making 'burst' a bit bigger than necessary is OK,
3766  * but having it a bit too small is a problem.  Since netdev_get_mtu()
3767  * doesn't include the Ethernet header, we need to add at least 14 (18?) to
3768  * the MTU.  We actually add 64, instead of 14, as a guard against
3769  * additional headers get tacked on somewhere that we're not aware of. */
3770 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
3771 hc->burst = MAX(hc->burst, mtu + 64);
3774 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Queries the kernel for HTB class 'handle' under 'parent' on 'netdev' and
 * parses its options/stats via htb_parse_tcmsg__(). */
3780 htb_query_class__(const struct netdev *netdev, unsigned int handle,
3781 unsigned int parent, struct htb_class *options,
3782 struct netdev_queue_stats *stats)
3784 struct ofpbuf *reply;
3787 error = tc_query_class(netdev, handle, parent, &reply);
3789 error = htb_parse_tcmsg__(reply, NULL, options, stats);
3790 ofpbuf_delete(reply);
/* tc_ops "install" hook: creates the root HTB qdisc plus the default class
 * 1:fffe, then records the userspace tc state. */
3796 htb_tc_install(struct netdev *netdev, const struct smap *details)
3800 error = htb_setup_qdisc__(netdev);
3802 struct htb_class hc;
3804 htb_parse_qdisc_details__(netdev, details, &hc);
3805 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3806 tc_make_handle(1, 0), &hc);
3808 htb_install__(netdev, hc.max_rate);
/* Downcasts a generic tc_queue to its enclosing htb_class. */
3814 static struct htb_class *
3815 htb_class_cast__(const struct tc_queue *queue)
3817 return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Creates or updates the cached htb_class for 'queue_id' with the values in
 * 'hc'.  New entries are inserted into the tc's queue hmap, keyed by
 * hash_int(queue_id, 0). */
3821 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
3822 const struct htb_class *hc)
3824 struct htb *htb = htb_get__(netdev);
3825 size_t hash = hash_int(queue_id, 0);
3826 struct tc_queue *queue;
3827 struct htb_class *hcp;
3829 queue = tc_find_queue__(netdev, queue_id, hash);
3831 hcp = htb_class_cast__(queue);
3833 hcp = xmalloc(sizeof *hcp);
3834 queue = &hcp->tc_queue;
3835 queue->queue_id = queue_id;
3836 queue->created = time_msec();
3837 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
3840 hcp->min_rate = hc->min_rate;
3841 hcp->max_rate = hc->max_rate;
3842 hcp->burst = hc->burst;
3843 hcp->priority = hc->priority;
/* tc_load callback for "linux-htb": reconstructs OVS's view of an HTB qdisc
 * that already exists in the kernel.  Queries the default class (1:fffe) for
 * the qdisc-level max-rate, then dumps all classes to populate the queues. */
3847 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3850 struct queue_dump_state state;
3851 struct htb_class hc;
3853 /* Get qdisc options. */
3855 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3856 htb_install__(netdev, hc.max_rate);
     /* Walk every class the kernel reports and mirror it in memory. */
3859 if (!start_queue_dump(netdev, &state)) {
3862 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
3863 unsigned int queue_id;
3865 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3866 htb_update_queue__(netdev, queue_id, &hc);
3869 finish_queue_dump(&state);
/* tc_destroy callback for "linux-htb": frees every queued htb_class record.
 * Uses the _SAFE iterator because each node is removed while iterating. */
3875 htb_tc_destroy(struct tc *tc)
3877 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
3878 struct htb_class *hc, *next;
3880 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
3881 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* qdisc_get callback: reports the HTB qdisc max-rate in bits per second
 * (internal storage is bytes/s, hence the multiplication by 8). */
3889 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
3891 const struct htb *htb = htb_get__(netdev);
3892 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
/* qdisc_set callback: reconfigures the HTB default class (1:fffe) from
 * 'details' and, on success, updates the cached qdisc max-rate. */
3897 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
3899 struct htb_class hc;
3902 htb_parse_qdisc_details__(netdev, details, &hc);
3903 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3904 tc_make_handle(1, 0), &hc);
3906 htb_get__(netdev)->max_rate = hc.max_rate;
/* class_get callback: exports one HTB queue's configuration into 'details'.
 * Rates and burst are stored internally in bytes and reported in bits
 * (hence "* 8").  max-rate is omitted when it equals min-rate. */
3912 htb_class_get(const struct netdev *netdev OVS_UNUSED,
3913 const struct tc_queue *queue, struct smap *details)
3915 const struct htb_class *hc = htb_class_cast__(queue);
3917 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3918 if (hc->min_rate != hc->max_rate) {
3919 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3921 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
3923 smap_add_format(details, "priority", "%u", hc->priority);
/* class_set callback: validates 'details', installs kernel class
 * 1:(queue_id+1) under the default class 1:fffe, then mirrors the change
 * in OVS's in-memory queue table. */
3929 htb_class_set(struct netdev *netdev, unsigned int queue_id,
3930 const struct smap *details)
3932 struct htb_class hc;
3935 error = htb_parse_class_details__(netdev, details, &hc);
3940 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3941 tc_make_handle(1, 0xfffe), &hc);
3946 htb_update_queue__(netdev, queue_id, &hc);
/* class_delete callback: removes kernel class 1:(queue_id+1) and, on
 * success, drops the corresponding in-memory record.  Note the same
 * queue_id -> classid+1 mapping used by htb_class_set(). */
3951 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
3953 struct htb_class *hc = htb_class_cast__(queue);
3954 struct htb *htb = htb_get__(netdev);
3957 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3959 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* class_get_stats callback: fetches kernel statistics for one queue by
 * querying class 1:(queue_id+1) under parent 1:fffe. */
3966 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3967 struct netdev_queue_stats *stats)
3969 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3970 tc_make_handle(1, 0xfffe), NULL, stats);
/* class_dump_stats callback: parses one netlink class message and, if it
 * names a valid HTB queue (major 1, minor in [1, HTB_N_QUEUES]), invokes
 * 'cb' with the queue ID (minor - 1) and its statistics. */
3974 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3975 const struct ofpbuf *nlmsg,
3976 netdev_dump_queue_stats_cb *cb, void *aux)
3978 struct netdev_queue_stats stats;
3979 unsigned int handle, major, minor;
3982 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3987 major = tc_get_major(handle);
3988 minor = tc_get_minor(handle);
3989 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3990 (*cb)(minor - 1, &stats, aux);
/* Virtual-function table for the "linux-htb" QoS type.  Some member
 * initializers fall outside this excerpt; the visible ones are the
 * identifying names, queue limit, and stats callbacks. */
3995 static const struct tc_ops tc_ops_htb = {
3996 "htb", /* linux_name */
3997 "linux-htb", /* ovs_name */
3998 HTB_N_QUEUES, /* n_queues */
4007 htb_class_get_stats,
4008 htb_class_dump_stats
4011 /* "linux-hfsc" traffic control class. */
4013 #define HFSC_N_QUEUES 0xf000
4021 struct tc_queue tc_queue;
/* Returns the struct hfsc embedding 'netdev_'s current tc state.  Valid
 * only while the netdev's qdisc is the HFSC implementation. */
4026 static struct hfsc *
4027 hfsc_get__(const struct netdev *netdev_)
4029 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4030 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
/* Downcasts a generic 'tc_queue' to its containing HFSC class. */
4033 static struct hfsc_class *
4034 hfsc_class_cast__(const struct tc_queue *queue)
4036 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Allocates and initializes a fresh HFSC tc object with qdisc-level
 * 'max_rate' (bytes/s) and attaches it to 'netdev_'.  Does not touch the
 * kernel; callers configure the kernel qdisc separately. */
4040 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
4042 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4045 hfsc = xmalloc(sizeof *hfsc);
4046 tc_init(&hfsc->tc, &tc_ops_hfsc);
4047 hfsc->max_rate = max_rate;
4048 netdev->tc = &hfsc->tc;
/* Adds or updates the in-memory record for HFSC queue 'queue_id' on
 * 'netdev', copying min/max rates from 'hc'.  Mirrors htb_update_queue__()
 * but HFSC classes carry no burst/priority fields. */
4052 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
4053 const struct hfsc_class *hc)
4057 struct hfsc_class *hcp;
4058 struct tc_queue *queue;
4060 hfsc = hfsc_get__(netdev);
4061 hash = hash_int(queue_id, 0);
4063 queue = tc_find_queue__(netdev, queue_id, hash);
4065 hcp = hfsc_class_cast__(queue);
     /* Queue not found: create a fresh record and insert it. */
4067 hcp = xmalloc(sizeof *hcp);
4068 queue = &hcp->tc_queue;
4069 queue->queue_id = queue_id;
4070 queue->created = time_msec();
4071 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
4074 hcp->min_rate = hc->min_rate;
4075 hcp->max_rate = hc->max_rate;
/* Parses the TCA_OPTIONS of an HFSC class into 'class'.  OVS only supports
 * linear service curves (m1 == 0, d == 0) with identical real-time and
 * link-share curves; anything else is rejected with a warning.  On success,
 * min_rate comes from the link-share curve (FSC) and max_rate from the
 * upper-limit curve (USC). */
4079 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
4081 const struct tc_service_curve *rsc, *fsc, *usc;
4082 static const struct nl_policy tca_hfsc_policy[] = {
4084 .type = NL_A_UNSPEC,
4086 .min_len = sizeof(struct tc_service_curve),
4089 .type = NL_A_UNSPEC,
4091 .min_len = sizeof(struct tc_service_curve),
4094 .type = NL_A_UNSPEC,
4096 .min_len = sizeof(struct tc_service_curve),
4099 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
4101 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
4102 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
4103 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
4107 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
4108 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
4109 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
     /* Non-linear curves (nonzero m1 slope or delay d) are unsupported. */
4111 if (rsc->m1 != 0 || rsc->d != 0 ||
4112 fsc->m1 != 0 || fsc->d != 0 ||
4113 usc->m1 != 0 || usc->d != 0) {
4114 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4115 "Non-linear service curves are not supported.");
4119 if (rsc->m2 != fsc->m2) {
4120 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4121 "Real-time service curves are not supported ");
4125 if (rsc->m2 > usc->m2) {
4126 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4127 "Min-rate service curve is greater than "
4128 "the max-rate service curve.");
     /* Linear and consistent: extract the steady-state slopes (m2). */
4132 class->min_rate = fsc->m2;
4133 class->max_rate = usc->m2;
/* Parses a netlink class message 'tcmsg' into the HFSC queue ID
 * '*queue_id', configuration '*options', and '*stats' (any output pointer
 * may be NULL).  Queue IDs map from classid minors: minor N is queue N-1,
 * valid for minors in [1, HFSC_N_QUEUES] under major 1. */
4138 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
4139 struct hfsc_class *options,
4140 struct netdev_queue_stats *stats)
4143 unsigned int handle;
4144 struct nlattr *nl_options;
4146 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
4152 unsigned int major, minor;
4154 major = tc_get_major(handle);
4155 minor = tc_get_minor(handle);
4156 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
4157 *queue_id = minor - 1;
4164 error = hfsc_parse_tca_options__(nl_options, options);
/* Queries the kernel for the HFSC class 'handle'/'parent' on 'netdev' and
 * parses the reply into '*options' and '*stats' (either may be NULL).
 * The netlink reply buffer is freed before returning. */
4171 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
4172 unsigned int parent, struct hfsc_class *options,
4173 struct netdev_queue_stats *stats)
4176 struct ofpbuf *reply;
4178 error = tc_query_class(netdev, handle, parent, &reply);
4183 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
4184 ofpbuf_delete(reply);
/* Fills 'class' from qdisc-level 'details'.  "max-rate" is given in bits/s
 * and stored in bytes/s; when absent, the link speed reported by ethtool is
 * used (falling back to 100 Mbps if features are unavailable).  The default
 * class uses the same value for min and max rate. */
4189 hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
4190 struct hfsc_class *class)
4192 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4194 const char *max_rate_s;
4196 max_rate_s = smap_get(details, "max-rate");
4197 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
4200 enum netdev_features current;
4202 netdev_linux_read_features(netdev);
4203 current = !netdev->get_features_error ? netdev->current : 0;
4204 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
4207 class->min_rate = max_rate;
4208 class->max_rate = max_rate;
/* Fills 'class' from per-queue 'details'.  Rates are given in bits/s and
 * stored in bytes/s.  min-rate is clamped to [1, qdisc max_rate]; max-rate
 * defaults to the qdisc max_rate and is clamped to [min_rate, qdisc
 * max_rate], so min <= max always holds on return. */
4212 hfsc_parse_class_details__(struct netdev *netdev,
4213 const struct smap *details,
4214 struct hfsc_class * class)
4216 const struct hfsc *hfsc;
4217 uint32_t min_rate, max_rate;
4218 const char *min_rate_s, *max_rate_s;
4220 hfsc = hfsc_get__(netdev);
4221 min_rate_s = smap_get(details, "min-rate");
4222 max_rate_s = smap_get(details, "max-rate");
4224 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
4225 min_rate = MAX(min_rate, 1);
4226 min_rate = MIN(min_rate, hfsc->max_rate);
4228 max_rate = (max_rate_s
4229 ? strtoull(max_rate_s, NULL, 10) / 8
4231 max_rate = MAX(max_rate, min_rate);
4232 max_rate = MIN(max_rate, hfsc->max_rate);
4234 class->min_rate = min_rate;
4235 class->max_rate = max_rate;
4240 /* Create an HFSC qdisc.
4242 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
4244 hfsc_setup_qdisc__(struct netdev * netdev)
4246 struct tcmsg *tcmsg;
4247 struct ofpbuf request;
4248 struct tc_hfsc_qopt opt;
     /* Remove any existing root qdisc first so the add cannot conflict. */
4250 tc_del_qdisc(netdev);
4252 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
4253 NLM_F_EXCL | NLM_F_CREATE, &request);
4259 tcmsg->tcm_handle = tc_make_handle(1, 0);
4260 tcmsg->tcm_parent = TC_H_ROOT;
4262 memset(&opt, 0, sizeof opt);
4265 nl_msg_put_string(&request, TCA_KIND, "hfsc");
4266 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
     /* tc_transact() uninitializes 'request' for us. */
4268 return tc_transact(&request, NULL);
4271 /* Create an HFSC class.
4273 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
4274 * sc rate <min_rate> ul rate <max_rate>" */
4276 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
4277 unsigned int parent, struct hfsc_class *class)
4281 struct tcmsg *tcmsg;
4282 struct ofpbuf request;
4283 struct tc_service_curve min, max;
4285 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
4291 tcmsg->tcm_handle = handle;
4292 tcmsg->tcm_parent = parent;
     /* Linear service curves: only the steady-state slope m2 is nonzero. */
4296 min.m2 = class->min_rate;
4300 max.m2 = class->max_rate;
4302 nl_msg_put_string(&request, TCA_KIND, "hfsc");
4303 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
     /* The min curve serves as both real-time (RSC) and link-share (FSC);
      * the max curve is the upper limit (USC). */
4304 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
4305 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
4306 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
4307 nl_msg_end_nested(&request, opt_offset);
4309 error = tc_transact(&request, NULL);
4311 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
4312 "min-rate %ubps, max-rate %ubps (%s)",
4313 netdev_get_name(netdev),
4314 tc_get_major(handle), tc_get_minor(handle),
4315 tc_get_major(parent), tc_get_minor(parent),
4316 class->min_rate, class->max_rate, ovs_strerror(error));
/* tc_install callback for "linux-hfsc": creates the root HFSC qdisc, then
 * the default class (1:fffe) under root (1:0) from 'details', and records
 * the new tc state via hfsc_install__(). */
4323 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
4326 struct hfsc_class class;
4328 error = hfsc_setup_qdisc__(netdev);
4334 hfsc_parse_qdisc_details__(netdev, details, &class);
4335 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4336 tc_make_handle(1, 0), &class);
4342 hfsc_install__(netdev, class.max_rate);
/* tc_load callback for "linux-hfsc": reconstructs OVS's view of an existing
 * kernel HFSC qdisc.  Queries the default class (1:fffe) for the qdisc
 * max-rate, then dumps all classes to populate the in-memory queues. */
4347 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4350 struct queue_dump_state state;
4351 struct hfsc_class hc;
4354 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
4355 hfsc_install__(netdev, hc.max_rate);
4357 if (!start_queue_dump(netdev, &state)) {
4361 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
4362 unsigned int queue_id;
4364 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
4365 hfsc_update_queue__(netdev, queue_id, &hc);
4369 finish_queue_dump(&state);
/* tc_destroy callback for "linux-hfsc": frees every queued hfsc_class
 * record; _SAFE iteration because nodes are removed while iterating. */
4374 hfsc_tc_destroy(struct tc *tc)
4377 struct hfsc_class *hc, *next;
4379 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
4381 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
4382 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* qdisc_get callback: reports the HFSC qdisc max-rate in bits per second
 * (internal storage is bytes/s, hence "* 8"). */
4391 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
4393 const struct hfsc *hfsc;
4394 hfsc = hfsc_get__(netdev);
4395 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
/* qdisc_set callback: reconfigures the HFSC default class (1:fffe) from
 * 'details' and, on success, updates the cached qdisc max-rate. */
4400 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
4403 struct hfsc_class class;
4405 hfsc_parse_qdisc_details__(netdev, details, &class);
4406 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4407 tc_make_handle(1, 0), &class);
4410 hfsc_get__(netdev)->max_rate = class.max_rate;
/* class_get callback: exports one HFSC queue's configuration (bits/s) into
 * 'details'; max-rate is omitted when equal to min-rate. */
4417 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
4418 const struct tc_queue *queue, struct smap *details)
4420 const struct hfsc_class *hc;
4422 hc = hfsc_class_cast__(queue);
4423 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
4424 if (hc->min_rate != hc->max_rate) {
4425 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
/* class_set callback: validates 'details', installs kernel class
 * 1:(queue_id+1) under the default class 1:fffe, then mirrors the change
 * in OVS's in-memory queue table. */
4431 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
4432 const struct smap *details)
4435 struct hfsc_class class;
4437 error = hfsc_parse_class_details__(netdev, details, &class);
4442 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
4443 tc_make_handle(1, 0xfffe), &class);
4448 hfsc_update_queue__(netdev, queue_id, &class);
/* class_delete callback: removes kernel class 1:(queue_id+1) and, on
 * success, drops the corresponding in-memory record. */
4453 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
4457 struct hfsc_class *hc;
4459 hc = hfsc_class_cast__(queue);
4460 hfsc = hfsc_get__(netdev);
4462 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
4464 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* class_get_stats callback: fetches kernel statistics for one queue by
 * querying class 1:(queue_id+1) under parent 1:fffe. */
4471 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
4472 struct netdev_queue_stats *stats)
4474 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
4475 tc_make_handle(1, 0xfffe), NULL, stats);
/* class_dump_stats callback: parses one netlink class message and, if it
 * names a valid HFSC queue (major 1, minor in [1, HFSC_N_QUEUES]), invokes
 * 'cb' with the queue ID (minor - 1) and its statistics. */
4479 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
4480 const struct ofpbuf *nlmsg,
4481 netdev_dump_queue_stats_cb *cb, void *aux)
4483 struct netdev_queue_stats stats;
4484 unsigned int handle, major, minor;
4487 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
4492 major = tc_get_major(handle);
4493 minor = tc_get_minor(handle);
4494 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
4495 (*cb)(minor - 1, &stats, aux);
/* Virtual-function table for the "linux-hfsc" QoS type. */
4500 static const struct tc_ops tc_ops_hfsc = {
4501 "hfsc", /* linux_name */
4502 "linux-hfsc", /* ovs_name */
4503 HFSC_N_QUEUES, /* n_queues */
4504 hfsc_tc_install, /* tc_install */
4505 hfsc_tc_load, /* tc_load */
4506 hfsc_tc_destroy, /* tc_destroy */
4507 hfsc_qdisc_get, /* qdisc_get */
4508 hfsc_qdisc_set, /* qdisc_set */
4509 hfsc_class_get, /* class_get */
4510 hfsc_class_set, /* class_set */
4511 hfsc_class_delete, /* class_delete */
4512 hfsc_class_get_stats, /* class_get_stats */
4513 hfsc_class_dump_stats /* class_dump_stats */
4516 /* "linux-default" traffic control class.
4518 * This class represents the default, unnamed Linux qdisc. It corresponds to
4519 * the "" (empty string) QoS type in the OVS database. */
/* Points 'netdev_'s tc at the shared, immutable default-tc singleton. */
4522 default_install__(struct netdev *netdev_)
4524 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4525 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
4527 /* Nothing but a tc class implementation is allowed to write to a tc. This
4528 * class never does that, so we can legitimately use a const tc object. */
4529 netdev->tc = CONST_CAST(struct tc *, &tc);
/* tc_install callback for "": nothing to configure in the kernel; just
 * adopt the default-tc singleton. */
4533 default_tc_install(struct netdev *netdev,
4534 const struct smap *details OVS_UNUSED)
4536 default_install__(netdev);
/* tc_load callback for "": adopt the default-tc singleton. */
4541 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4543 default_install__(netdev);
/* Virtual-function table for the default ("") QoS type.  Most operations
 * are NULL because there is nothing to configure or query. */
4547 static const struct tc_ops tc_ops_default = {
4548 NULL, /* linux_name */
4553 NULL, /* tc_destroy */
4554 NULL, /* qdisc_get */
4555 NULL, /* qdisc_set */
4556 NULL, /* class_get */
4557 NULL, /* class_set */
4558 NULL, /* class_delete */
4559 NULL, /* class_get_stats */
4560 NULL /* class_dump_stats */
4563 /* "linux-other" traffic control class.
/* tc_load callback for "linux-other": a qdisc OVS does not understand is in
 * place; adopt the shared, immutable "other" singleton so OVS leaves it
 * alone. */
4568 other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
4570 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4571 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
4573 /* Nothing but a tc class implementation is allowed to write to a tc. This
4574 * class never does that, so we can legitimately use a const tc object. */
4575 netdev->tc = CONST_CAST(struct tc *, &tc);
/* Virtual-function table for "linux-other": an unrecognized kernel qdisc.
 * OVS can neither configure nor inspect it, so the operations are NULL. */
4579 static const struct tc_ops tc_ops_other = {
4580 NULL, /* linux_name */
4581 "linux-other", /* ovs_name */
4583 NULL, /* tc_install */
4585 NULL, /* tc_destroy */
4586 NULL, /* qdisc_get */
4587 NULL, /* qdisc_set */
4588 NULL, /* class_get */
4589 NULL, /* class_set */
4590 NULL, /* class_delete */
4591 NULL, /* class_get_stats */
4592 NULL /* class_dump_stats */
4595 /* Traffic control. */
4597 /* Number of kernel "tc" ticks per second. */
4598 static double ticks_per_s;
4600 /* Number of kernel "jiffies" per second. This is used for the purpose of
4601 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
4602 * one jiffy's worth of data.
4604 * There are two possibilities here:
4606 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
4607 * approximate range of 100 to 1024. That means that we really need to
4608 * make sure that the qdisc can buffer that much data.
4610 * - 'buffer_hz' is an absurdly large number. That means that the kernel
4611 * has finely granular timers and there's no need to fudge additional room
4612 * for buffers. (There's no extra effort needed to implement that: the
4613 * large 'buffer_hz' is used as a divisor, so practically any number will
4614 * come out as 0 in the division. Small integer results in the case of
4615 * really high dividends won't have any real effect anyhow.)
4617 static unsigned int buffer_hz;
4619 /* Returns tc handle 'major':'minor'. */
4621 tc_make_handle(unsigned int major, unsigned int minor)
     /* TC handles pack the major number in the upper 16 bits. */
4623 return TC_H_MAKE(major << 16, minor);
4626 /* Returns the major number from 'handle' (the upper 16 bits). */
4628 tc_get_major(unsigned int handle)
4630 return TC_H_MAJ(handle) >> 16;
4633 /* Returns the minor number from 'handle' (the lower 16 bits). */
4635 tc_get_minor(unsigned int handle)
4637 return TC_H_MIN(handle);
/* Initializes 'request' as an rtnetlink message of 'type' (e.g.
 * RTM_NEWQDISC) for 'netdev' and returns a pointer to the embedded tcmsg
 * header with tcm_ifindex already filled in.  The caller must set
 * tcm_handle and tcm_parent.  NOTE(review): on get_ifindex() failure the
 * visible lines suggest an early exit; the return value in that case is
 * outside this excerpt — presumably NULL. */
4640 static struct tcmsg *
4641 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
4642 struct ofpbuf *request)
4644 struct tcmsg *tcmsg;
4648 error = get_ifindex(netdev, &ifindex);
4653 ofpbuf_init(request, 512);
4654 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
4655 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
4656 tcmsg->tcm_family = AF_UNSPEC;
4657 tcmsg->tcm_ifindex = ifindex;
4658 /* Caller should fill in tcmsg->tcm_handle. */
4659 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' on the NETLINK_ROUTE socket, optionally collecting the
 * reply in '*replyp', and uninitializes 'request' regardless of outcome.
 * Returns 0 on success or a positive errno value. */
4665 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
4667 int error = nl_transact(NETLINK_ROUTE, request, replyp);
4668 ofpbuf_uninit(request);
4672 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
4673 * policing configuration.
4675 * This function is equivalent to running the following when 'add' is true:
4676 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
4678 * This function is equivalent to running the following when 'add' is false:
4679 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
4681 * The configuration and stats may be seen with the following command:
4682 * /sbin/tc -s qdisc show dev <devname>
4684 * Returns 0 if successful, otherwise a positive errno value.
4687 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
4689 struct ofpbuf request;
4690 struct tcmsg *tcmsg;
4692 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
4693 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
4695 tcmsg = tc_make_request(netdev, type, flags, &request);
     /* Ingress qdiscs always use the fixed handle ffff:. */
4699 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
4700 tcmsg->tcm_parent = TC_H_INGRESS;
4701 nl_msg_put_string(&request, TCA_KIND, "ingress");
4702 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
4704 error = tc_transact(&request, NULL);
4706 /* If we're deleting the qdisc, don't worry about some of the
4707 * error conditions.  ENOENT/EINVAL just mean it was already gone. */
4708 if (!add && (error == ENOENT || error == EINVAL)) {
4717 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
4720 * This function is equivalent to running:
4721 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
4722 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
4725 * The configuration and stats may be seen with the following command:
4726 * /sbin/tc -s filter show dev <devname> parent ffff:
4728 * Returns 0 if successful, otherwise a positive errno value.
4731 tc_add_policer(struct netdev *netdev,
4732 uint32_t kbits_rate, uint32_t kbits_burst)
4734 struct tc_police tc_police;
4735 struct ofpbuf request;
4736 struct tcmsg *tcmsg;
4737 size_t basic_offset;
4738 size_t police_offset;
     /* Packets exceeding the policer are dropped (TC_POLICE_SHOT). */
4742 memset(&tc_police, 0, sizeof tc_police);
4743 tc_police.action = TC_POLICE_SHOT;
4744 tc_police.mtu = mtu;
4745 tc_fill_rate(&tc_police.rate, ((uint64_t) kbits_rate * 1000)/8, mtu);
4747 /* The following appears wrong in two ways:
4749 * - tc_bytes_to_ticks() should take "bytes" as quantity for both of its
4750 * arguments (or at least consistently "bytes" as both or "bits" as
4751 * both), but this supplies bytes for the first argument and bits for the
4754 * - In networking a kilobit is usually 1000 bits but this uses 1024 bits.
4756 * However if you "fix" those problems then "tc filter show ..." shows
4757 * "125000b", meaning 125,000 bits, when OVS configures it for 1000 kbit ==
4758 * 1,000,000 bits, whereas this actually ends up doing the right thing from
4759 * tc's point of view. Whatever. */
4760 tc_police.burst = tc_bytes_to_ticks(
4761 tc_police.rate.rate, MIN(UINT32_MAX / 1024, kbits_burst) * 1024);
4763 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
4764 NLM_F_EXCL | NLM_F_CREATE, &request);
     /* Attach under the ingress qdisc (ffff:) at priority 49, matching all
      * protocols (ETH_P_ALL, byte-swapped into the tcm_info format). */
4768 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
4769 tcmsg->tcm_info = tc_make_handle(49,
4770 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
4772 nl_msg_put_string(&request, TCA_KIND, "basic");
4773 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4774 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
4775 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
4776 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
4777 nl_msg_end_nested(&request, police_offset);
4778 nl_msg_end_nested(&request, basic_offset);
4780 error = tc_transact(&request, NULL);
4791 /* The values in psched are not individually very meaningful, but they are
4792 * important. The tables below show some values seen in the wild.
4796 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
4797 * (Before that, there are hints that it was 1000000000.)
4799 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
4803 * -----------------------------------
4804 * [1] 000c8000 000f4240 000f4240 00000064
4805 * [2] 000003e8 00000400 000f4240 3b9aca00
4806 * [3] 000003e8 00000400 000f4240 3b9aca00
4807 * [4] 000003e8 00000400 000f4240 00000064
4808 * [5] 000003e8 00000040 000f4240 3b9aca00
4809 * [6] 000003e8 00000040 000f4240 000000f9
4811 * a b c d ticks_per_s buffer_hz
4812 * ------- --------- ---------- ------------- ----------- -------------
4813 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
4814 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4815 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4816 * [4] 1,000 1,024 1,000,000 100 976,562 100
4817 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
4818 * [6] 1,000 64 1,000,000 249 15,625,000 249
4820 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
4821 * [2] 2.6.26-1-686-bigmem from Debian lenny
4822 * [3] 2.6.26-2-sparc64 from Debian lenny
4823 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
4824 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
4825 * [6] 2.6.34 from kernel.org on KVM
     /* Computes 'ticks_per_s' and 'buffer_hz' from /proc/net/psched, exactly
      * once per process (ovsthread_once).  Failures leave the defaults in
      * place with a warning. */
4827 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4828 static const char fn[] = "/proc/net/psched";
4829 unsigned int a, b, c, d;
4832 if (!ovsthread_once_start(&once)) {
4839 stream = fopen(fn, "r");
4841 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
4845 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
4846 VLOG_WARN("%s: read failed", fn);
4850 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
4854 VLOG_WARN("%s: invalid scheduler parameters", fn);
     /* See the table above: ticks_per_s = a * c / b. */
4858 ticks_per_s = (double) a * c / b;
4862 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
4865 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
4868 ovsthread_once_done(&once);
4871 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
4872 * rate of 'rate' bytes per second. */
4874 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
     /* ticks_per_s is initialized lazily elsewhere (from /proc/net/psched). */
4877 return (rate * ticks) / ticks_per_s;
4880 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
4881 * rate of 'rate' bytes per second.  Returns 0 for a zero rate rather than
4882 * dividing by zero. */
4883 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
4886 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
4889 /* Returns the number of bytes that need to be reserved for qdisc buffering at
4890 * a transmission rate of 'rate' bytes per second.  See the comment on
4891 * 'buffer_hz' above for why this division is usually tiny or zero. */
4892 tc_buffer_per_jiffy(unsigned int rate)
4895 return rate / buffer_hz;
4898 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
4899 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
4900 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
4901 * stores NULL into it if it is absent.
4903 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
4906 * Returns 0 if successful, otherwise a positive errno value. */
4908 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
4909 struct nlattr **options)
4911 static const struct nl_policy tca_policy[] = {
4912 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
4913 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
4915 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
     /* Attributes start after the netlink header plus the tcmsg header. */
4917 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4918 tca_policy, ta, ARRAY_SIZE(ta))) {
4919 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
4924 *kind = nl_attr_get_string(ta[TCA_KIND]);
4928 *options = ta[TCA_OPTIONS];
4943 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
4944 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
4945 * into '*options', and its queue statistics into '*stats'. Any of the output
4946 * arguments may be null.
4948 * Returns 0 if successful, otherwise a positive errno value. */
4950 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
4951 struct nlattr **options, struct netdev_queue_stats *stats)
4953 static const struct nl_policy tca_policy[] = {
4954 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
4955 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
4957 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4959 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4960 tca_policy, ta, ARRAY_SIZE(ta))) {
4961 VLOG_WARN_RL(&rl, "failed to parse class message");
     /* The class handle lives in the fixed tcmsg header, not an attribute. */
4966 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
4967 *handlep = tc->tcm_handle;
4971 *options = ta[TCA_OPTIONS];
4975 const struct gnet_stats_queue *gsq;
4976 struct gnet_stats_basic gsb;
4978 static const struct nl_policy stats_policy[] = {
4979 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
4980 .min_len = sizeof gsb },
4981 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
4982 .min_len = sizeof *gsq },
4984 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
4986 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
4987 sa, ARRAY_SIZE(sa))) {
4988 VLOG_WARN_RL(&rl, "failed to parse class stats");
4992 /* Alignment issues screw up the length of struct gnet_stats_basic on
4993 * some arch/bitsize combinations. Newer versions of Linux have a
4994 * struct gnet_stats_basic_packed, but we can't depend on that. The
4995 * easiest thing to do is just to make a copy. */
4996 memset(&gsb, 0, sizeof gsb);
4997 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
4998 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
4999 stats->tx_bytes = gsb.bytes;
5000 stats->tx_packets = gsb.packets;
     /* Queue drops are the closest thing tc reports to "tx errors". */
5002 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
5003 stats->tx_errors = gsq->drops;
     /* On parse failure, report all-zero stats rather than garbage. */
5013 memset(stats, 0, sizeof *stats);
5018 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
 * on 'netdev', storing the netlink reply in '*replyp' (owned by the caller
 * on success).  Failures are logged (rate-limited) and returned as a
 * positive errno value. */
5021 tc_query_class(const struct netdev *netdev,
5022 unsigned int handle, unsigned int parent,
5023 struct ofpbuf **replyp)
5025 struct ofpbuf request;
5026 struct tcmsg *tcmsg;
     /* NLM_F_ECHO asks the kernel to send the class back in the reply. */
5029 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
5033 tcmsg->tcm_handle = handle;
5034 tcmsg->tcm_parent = parent;
5036 error = tc_transact(&request, replyp);
5038 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
5039 netdev_get_name(netdev),
5040 tc_get_major(handle), tc_get_minor(handle),
5041 tc_get_major(parent), tc_get_minor(parent),
5042 ovs_strerror(error));
5047 /* Equivalent to "tc class del dev <name> handle <handle>".  Returns 0 on
 * success, else a positive errno value; failures are logged rate-limited. */
5049 tc_delete_class(const struct netdev *netdev, unsigned int handle)
5051 struct ofpbuf request;
5052 struct tcmsg *tcmsg;
5055 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
5059 tcmsg->tcm_handle = handle;
5060 tcmsg->tcm_parent = 0;
5062 error = tc_transact(&request, NULL);
5064 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
5065 netdev_get_name(netdev),
5066 tc_get_major(handle), tc_get_minor(handle),
5067 ovs_strerror(error));
5072 /* Equivalent to "tc qdisc del dev <name> root".  Also tears down the
 * in-memory tc state so the next use re-queries the kernel. */
5074 tc_del_qdisc(struct netdev *netdev_)
5076 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5077 struct ofpbuf request;
5078 struct tcmsg *tcmsg;
5081 tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
5085 tcmsg->tcm_handle = tc_make_handle(1, 0);
5086 tcmsg->tcm_parent = TC_H_ROOT;
5088 error = tc_transact(&request, NULL);
5089 if (error == EINVAL) {
5090 /* EINVAL probably means that the default qdisc was in use, in which
5091 * case we've accomplished our purpose. */
     /* Drop the cached tc object so it matches the (now default) kernel
      * state. */
5094 if (!error && netdev->tc) {
5095 if (netdev->tc->ops->tc_destroy) {
5096 netdev->tc->ops->tc_destroy(netdev->tc);
/* Returns whether it is safe to issue RTM_GETQDISC on this kernel: Linux
 * before 2.6.35 can OOPS on it (see the comment in tc_query_qdisc()).  The
 * result is computed once from uname() and cached.  NOTE(review): the line
 * that sets 'safe' to true for new kernels is outside this excerpt —
 * presumably in the else branch. */
5104 getqdisc_is_safe(void)
5106 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5107 static bool safe = false;
5109 if (ovsthread_once_start(&once)) {
5110 struct utsname utsname;
5113 if (uname(&utsname) == -1) {
5114 VLOG_WARN("uname failed (%s)", ovs_strerror(errno));
5115 } else if (!ovs_scan(utsname.release, "%d.%d", &major, &minor)) {
5116 VLOG_WARN("uname reported bad OS release (%s)", utsname.release);
5117 } else if (major < 2 || (major == 2 && minor < 35)) {
5118 VLOG_INFO("disabling unsafe RTM_GETQDISC in Linux kernel %s",
5123 ovsthread_once_done(&once);
5128 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
5129 * kernel to determine what they are. Returns 0 if successful, otherwise a
5130 * positive errno value. */
5132 tc_query_qdisc(const struct netdev *netdev_)
5134 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5135 struct ofpbuf request, *qdisc;
5136 const struct tc_ops *ops;
5137 struct tcmsg *tcmsg;
5145 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
5146 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
5147 * 2.6.35 without that fix backported to it.
5149 * To avoid the OOPS, we must not make a request that would attempt to dump
5150 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
5151 * few others. There are a few ways that I can see to do this, but most of
5152 * them seem to be racy (and if you lose the race the kernel OOPSes). The
5153 * technique chosen here is to assume that any non-default qdisc that we
5154 * create will have a class with handle 1:0. The built-in qdiscs only have
5155 * a class with handle 0:0.
5157 * On Linux 2.6.35+ we use the straightforward method because it allows us
5158 * to handle non-builtin qdiscs without handle 1:0 (e.g. codel). However,
5159 * in such a case we get no response at all from the kernel (!) if a
5160 * builtin qdisc is in use (which is later caught by "!error &&
5161 * !qdisc->size"). */
5162 tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
5166 tcmsg->tcm_handle = tc_make_handle(getqdisc_is_safe() ? 0 : 1, 0);
5167 tcmsg->tcm_parent = getqdisc_is_safe() ? TC_H_ROOT : 0;
5169 /* Figure out what tc class to instantiate. */
5170 error = tc_transact(&request, &qdisc);
5171 if (!error && qdisc->size) {
5174 error = tc_parse_qdisc(qdisc, &kind, NULL);
5176 ops = &tc_ops_other;
     /* Map the kernel qdisc name to an OVS tc implementation; unknown
      * names fall back to the opaque "linux-other" class. */
5178 ops = tc_lookup_linux_name(kind);
5180 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
5181 VLOG_DBG_RL(&rl2, "unknown qdisc \"%s\"", kind);
5183 ops = &tc_ops_other;
5186 } else if ((!error && !qdisc->size) || error == ENOENT) {
5187 /* Either it's a built-in qdisc, or (on Linux pre-2.6.35) it's a qdisc
5188 * set up by some other entity that doesn't have a handle 1:0. We will
5189 * assume that it's the system default qdisc. */
5190 ops = &tc_ops_default;
5193 /* Who knows? Maybe the device got deleted. */
5194 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
5195 netdev_get_name(netdev_), ovs_strerror(error))
5196 ops = &tc_ops_other;
5199 /* Instantiate it. */
5200 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
5201 ovs_assert((load_error == 0) == (netdev->tc != NULL));
5202 ofpbuf_delete(qdisc);
5204 return error ? error : load_error;
5207 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
5208 approximate the time to transmit packets of various lengths. For an MTU of
5209 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
5210 represents two possible packet lengths; for a MTU of 513 through 1024, four
5211 possible lengths; and so on.
5213 Returns, for the specified 'mtu', the number of bits that packet lengths
5214 need to be shifted right to fit within such a 256-entry table. */
/* NOTE(review): the return type, braces, and some lines of this function are
 * elided in this extract; visible code kept verbatim. */
5216 tc_calc_cell_log(unsigned int mtu)
/* Fall back to the default Ethernet payload size; the guarding condition is
 * elided here but presumably tests for mtu == 0 -- confirm upstream. */
5221 mtu = ETH_PAYLOAD_MAX;
/* Include L2 framing overhead (Ethernet header plus one VLAN tag) in the
 * largest frame length the table must represent. */
5223 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
/* Count how many right-shifts squeeze 'mtu' below the 256-entry table size;
 * the loop body (presumably "mtu >>= 1;") is elided in this extract. */
5225 for (cell_log = 0; mtu >= 256; cell_log++) {
5232 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
/* ...of 'mtu' (the rest of this header comment is elided in this extract). */
5235 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
/* Start from all-zero so fields not set below (overhead, cell_align, ...)
 * default to 0 regardless of kernel header version. */
5237 memset(rate, 0, sizeof *rate);
5238 rate->cell_log = tc_calc_cell_log(mtu);
5239 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
5240 /* rate->cell_align = 0; */ /* distro headers. */
/* Minimum packet unit: no packet is billed below a minimum Ethernet frame. */
5241 rate->mpu = ETH_TOTAL_MIN;
5245 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
5246 * attribute of the specified "type".
5248 * See tc_calc_cell_log() above for a description of "rtab"s. */
/* NOTE(review): return type, braces, and the declarations of 'rtab'/'i' are
 * elided in this extract; visible code kept verbatim. */
5250 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
/* Reserve the attribute payload in place and fill it entry by entry. */
5255 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
5256 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
/* Entry i covers packets of length up to (i + 1) << cell_log. */
5257 unsigned packet_size = (i + 1) << rate->cell_log;
/* Never bill below the minimum packet unit configured in 'rate'. */
5258 if (packet_size < rate->mpu) {
5259 packet_size = rate->mpu;
/* Each entry is the transmit time, in kernel ticks, at 'rate'. */
5261 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
5265 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
5266 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
5267 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
/* ...0 is used -- the tail of this comment and the return type are elided. */
5270 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
/* The burst must at least cover one jiffy's worth of traffic plus one MTU,
 * or the shaper cannot sustain 'Bps'. */
5272 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
5273 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
5276 /* Linux-only functions declared in netdev-linux.h */
5278 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
5279 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
/* NOTE(review): return type, braces, the declarations of 'new_flags'/'error',
 * and the early-return paths are elided in this extract; code kept verbatim.
 * Flow: read flags (GFLAGS), write the changed value (SFLAGS), then read back
 * and warn if the device silently refused the change. */
5281 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
5282 const char *flag_name, bool enable)
5284 const char *netdev_name = netdev_get_name(netdev);
5285 struct ethtool_value evalue;
/* Step 1: fetch the current ethtool flags word. */
5289 COVERAGE_INC(netdev_get_ethtool);
5290 memset(&evalue, 0, sizeof evalue);
5291 error = netdev_linux_do_ethtool(netdev_name,
5292 (struct ethtool_cmd *)&evalue,
5293 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
/* Step 2: compute the desired flags; skip the write if nothing changes. */
5298 COVERAGE_INC(netdev_set_ethtool);
5299 new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
5300 if (new_flags == evalue.data) {
5303 evalue.data = new_flags;
5304 error = netdev_linux_do_ethtool(netdev_name,
5305 (struct ethtool_cmd *)&evalue,
5306 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
/* Step 3: re-read to verify the driver actually applied the change; some
 * drivers accept ETHTOOL_SFLAGS without honoring it. */
5311 COVERAGE_INC(netdev_get_ethtool);
5312 memset(&evalue, 0, sizeof evalue);
5313 error = netdev_linux_do_ethtool(netdev_name,
5314 (struct ethtool_cmd *)&evalue,
5315 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5320 if (new_flags != evalue.data) {
5321 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
5322 "device %s failed", enable ? "enable" : "disable",
5323 flag_name, netdev_name);
5330 /* Utility functions. */
5332 /* Copies 'src' into 'dst', performing format conversion in the process. */
/* Field-by-field copy from the kernel's 32-bit rtnl_link_stats layout into
 * OVS's netdev_stats. Fields netdev_stats has but rtnl_link_stats lacks are
 * not touched here -- presumably the caller initializes 'dst' first; confirm
 * at the call sites. */
5334 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
5335 const struct rtnl_link_stats *src)
5337 dst->rx_packets = src->rx_packets;
5338 dst->tx_packets = src->tx_packets;
5339 dst->rx_bytes = src->rx_bytes;
5340 dst->tx_bytes = src->tx_bytes;
5341 dst->rx_errors = src->rx_errors;
5342 dst->tx_errors = src->tx_errors;
5343 dst->rx_dropped = src->rx_dropped;
5344 dst->tx_dropped = src->tx_dropped;
5345 dst->multicast = src->multicast;
5346 dst->collisions = src->collisions;
5347 dst->rx_length_errors = src->rx_length_errors;
5348 dst->rx_over_errors = src->rx_over_errors;
5349 dst->rx_crc_errors = src->rx_crc_errors;
5350 dst->rx_frame_errors = src->rx_frame_errors;
5351 dst->rx_fifo_errors = src->rx_fifo_errors;
5352 dst->rx_missed_errors = src->rx_missed_errors;
5353 dst->tx_aborted_errors = src->tx_aborted_errors;
5354 dst->tx_carrier_errors = src->tx_carrier_errors;
5355 dst->tx_fifo_errors = src->tx_fifo_errors;
5356 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
5357 dst->tx_window_errors = src->tx_window_errors;
5360 /* Copies 'src' into 'dst', performing format conversion in the process. */
/* Same field-by-field copy as netdev_stats_from_rtnl_link_stats() above, but
 * from the kernel's 64-bit rtnl_link_stats64 layout, so no counter
 * truncation/wrapping at 32 bits. Keep the two functions in sync. */
5362 netdev_stats_from_rtnl_link_stats64(struct netdev_stats *dst,
5363 const struct rtnl_link_stats64 *src)
5365 dst->rx_packets = src->rx_packets;
5366 dst->tx_packets = src->tx_packets;
5367 dst->rx_bytes = src->rx_bytes;
5368 dst->tx_bytes = src->tx_bytes;
5369 dst->rx_errors = src->rx_errors;
5370 dst->tx_errors = src->tx_errors;
5371 dst->rx_dropped = src->rx_dropped;
5372 dst->tx_dropped = src->tx_dropped;
5373 dst->multicast = src->multicast;
5374 dst->collisions = src->collisions;
5375 dst->rx_length_errors = src->rx_length_errors;
5376 dst->rx_over_errors = src->rx_over_errors;
5377 dst->rx_crc_errors = src->rx_crc_errors;
5378 dst->rx_frame_errors = src->rx_frame_errors;
5379 dst->rx_fifo_errors = src->rx_fifo_errors;
5380 dst->rx_missed_errors = src->rx_missed_errors;
5381 dst->tx_aborted_errors = src->tx_aborted_errors;
5382 dst->tx_carrier_errors = src->tx_carrier_errors;
5383 dst->tx_fifo_errors = src->tx_fifo_errors;
5384 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
5385 dst->tx_window_errors = src->tx_window_errors;
/* Fetches interface statistics for 'netdev_' with an RTM_GETLINK request over
 * NETLINK_ROUTE, preferring the 64-bit IFLA_STATS64 attribute and falling
 * back to the 32-bit IFLA_STATS. NOTE(review): return type, braces, 'error'
 * declaration, and the error-path returns are elided in this extract. */
5389 get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
5391 struct ofpbuf request;
5392 struct ofpbuf *reply;
/* Build an RTM_GETLINK request that identifies the device by name. */
5395 ofpbuf_init(&request, 0);
5396 nl_msg_put_nlmsghdr(&request,
5397 sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
5398 RTM_GETLINK, NLM_F_REQUEST);
5399 ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
5400 nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
5401 error = nl_transact(NETLINK_ROUTE, &request, &reply);
5402 ofpbuf_uninit(&request);
/* Skip the netlink and ifinfomsg headers so attribute parsing starts at
 * offset 0; a reply too short for that is malformed. */
5407 if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
/* Prefer 64-bit counters when the kernel provides them... */
5408 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS64);
5409 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats64)) {
5410 netdev_stats_from_rtnl_link_stats64(stats, nl_attr_get(a));
/* ...otherwise fall back to the legacy 32-bit counters. */
5413 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS);
5414 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
5415 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
5418 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
5423 VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");
5428 ofpbuf_delete(reply);
/* Reads the IFF_* interface flags of 'dev' via SIOCGIFFLAGS into '*flags'.
 * NOTE(review): return type, braces, 'ifr'/'error' declarations, and the
 * guarding of the store on success are elided in this extract. */
5433 get_flags(const struct netdev *dev, unsigned int *flags)
5439 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
5441 *flags = ifr.ifr_flags;
/* Sets the IFF_* interface flags of device 'name' via SIOCSIFFLAGS; returns
 * the ioctl helper's result (presumably 0 or a positive errno -- matches the
 * convention of the other ioctl wrappers here). */
5447 set_flags(const char *name, unsigned int flags)
5451 ifr.ifr_flags = flags;
5452 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
/* Looks up the kernel ifindex for 'netdev_name' via SIOCGIFINDEX. Returns
 * the ifindex on success; the elided error path apparently returns a negative
 * errno (see the "-ifindex" handling in get_ifindex() below). */
5456 do_get_ifindex(const char *netdev_name)
5461 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5462 COVERAGE_INC(netdev_get_ifindex);
5464 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
5466 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
5467 netdev_name, ovs_strerror(error));
5470 return ifr.ifr_ifindex;
/* Cached wrapper around do_get_ifindex(): resolves the ifindex once, stores
 * both the value and the error in the netdev, and replays them on later
 * calls. Returns 0 on success or the cached positive errno. */
5474 get_ifindex(const struct netdev *netdev_, int *ifindexp)
5476 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5478 if (!(netdev->cache_valid & VALID_IFINDEX)) {
5479 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
/* A negative result encodes -errno (see do_get_ifindex()); cache the
 * positive errno and a zero ifindex. */
5482 netdev->get_ifindex_error = -ifindex;
5483 netdev->ifindex = 0;
5485 netdev->get_ifindex_error = 0;
5486 netdev->ifindex = ifindex;
/* Mark the cache valid whether the lookup succeeded or failed, so we
 * do not retry the ioctl on every call. */
5488 netdev->cache_valid |= VALID_IFINDEX;
5491 *ifindexp = netdev->ifindex;
5492 return netdev->get_ifindex_error;
/* Reads the Ethernet hardware address of 'netdev_name' via SIOCGIFHWADDR
 * into '*ea'. Rejects devices whose hardware address family is neither
 * AF_UNSPEC nor ARPHRD_ETHER. NOTE(review): return type, braces, and the
 * error-path returns are elided in this extract. */
5496 get_etheraddr(const char *netdev_name, struct eth_addr *ea)
5502 memset(&ifr, 0, sizeof ifr);
5503 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5504 COVERAGE_INC(netdev_get_hwaddr);
5505 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
5507 /* ENODEV probably means that a vif disappeared asynchronously and
5508 * hasn't been removed from the database yet, so reduce the log level
5509 * to INFO for that case. */
5510 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
5511 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
5512 netdev_name, ovs_strerror(error));
5515 hwaddr_family = ifr.ifr_hwaddr.sa_family;
5516 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
5517 VLOG_INFO("%s device has unknown hardware address family %d",
5518 netdev_name, hwaddr_family);
/* Ethernet-family address confirmed: copy out the 6-byte MAC. */
5521 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the Ethernet hardware address of 'netdev_name' to 'mac' via
 * SIOCSIFHWADDR, logging on failure. NOTE(review): return type, braces, and
 * the final return are elided in this extract. */
5526 set_etheraddr(const char *netdev_name, const struct eth_addr mac)
5531 memset(&ifr, 0, sizeof ifr);
5532 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
/* The kernel requires the address family to be filled in alongside the
 * new MAC bytes. */
5533 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
5534 memcpy(ifr.ifr_hwaddr.sa_data, &mac, ETH_ADDR_LEN);
5535 COVERAGE_INC(netdev_set_hwaddr);
5536 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
5538 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
5539 netdev_name, ovs_strerror(error));
/* Issues ethtool command 'cmd' (named 'cmd_name' for logging) on device
 * 'name', with 'ecmd' as the in/out command buffer. EOPNOTSUPP is expected
 * and not logged; other failures are logged rate-limited. NOTE(review):
 * return type, braces, and the final return are elided in this extract. */
5545 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
5546 int cmd, const char *cmd_name)
5551 memset(&ifr, 0, sizeof ifr);
5552 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
/* SIOCETHTOOL passes the ethtool command structure via ifr_data. */
5553 ifr.ifr_data = (caddr_t) ecmd;
5556 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
5558 if (error != EOPNOTSUPP) {
5559 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
5560 "failed: %s", cmd_name, name, ovs_strerror(error));
5562 /* The device doesn't support this operation. That's pretty
5563 * common, so there's no point in logging anything. */
/* Retrieves an IPv4 address of 'netdev' into '*ip' using the address-query
 * ioctl 'cmd' (named 'cmd_name' for logging), e.g. presumably SIOCGIFADDR or
 * SIOCGIFDSTADDR -- confirm at call sites. NOTE(review): return type,
 * braces, and 'ifr' declaration are elided in this extract. */
5570 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
5571 int cmd, const char *cmd_name)
5576 ifr.ifr_addr.sa_family = AF_INET;
5577 error = af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
/* ALIGNED_CAST avoids the strict-alignment warning when reinterpreting
 * the generic sockaddr as sockaddr_in. */
5579 const struct sockaddr_in *sin = ALIGNED_CAST(struct sockaddr_in *,
5581 *ip = sin->sin_addr;
5586 /* Returns an AF_PACKET raw socket or a negative errno value. */
5588 af_packet_sock(void)
5590 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5593 if (ovsthread_once_start(&once)) {
5594 sock = socket(AF_PACKET, SOCK_RAW, 0);
5596 int error = set_nonblocking(sock);
5603 VLOG_ERR("failed to create packet socket: %s",
5604 ovs_strerror(errno));
5606 ovsthread_once_done(&once);