2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2015 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
23 #include <arpa/inet.h>
25 #include <linux/filter.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <sys/types.h>
37 #include <sys/ioctl.h>
38 #include <sys/socket.h>
39 #include <sys/utsname.h>
40 #include <netpacket/packet.h>
42 #include <net/if_arp.h>
43 #include <net/if_packet.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
52 #include "dp-packet.h"
53 #include "dpif-netlink.h"
54 #include "dpif-netdev.h"
55 #include "dynamic-string.h"
56 #include "fatal-signal.h"
59 #include "netdev-provider.h"
60 #include "netdev-vport.h"
61 #include "netlink-notifier.h"
62 #include "netlink-socket.h"
65 #include "openflow/openflow.h"
66 #include "ovs-atomic.h"
68 #include "poll-loop.h"
69 #include "rtnetlink.h"
71 #include "socket-util.h"
74 #include "unaligned.h"
75 #include "openvswitch/vlog.h"
77 VLOG_DEFINE_THIS_MODULE(netdev_linux);
79 COVERAGE_DEFINE(netdev_set_policing);
80 COVERAGE_DEFINE(netdev_arp_lookup);
81 COVERAGE_DEFINE(netdev_get_ifindex);
82 COVERAGE_DEFINE(netdev_get_hwaddr);
83 COVERAGE_DEFINE(netdev_set_hwaddr);
84 COVERAGE_DEFINE(netdev_get_ethtool);
85 COVERAGE_DEFINE(netdev_set_ethtool);
88 /* These were introduced in Linux 2.6.14, so they might be missing if we have
90 #ifndef ADVERTISED_Pause
91 #define ADVERTISED_Pause (1 << 13)
93 #ifndef ADVERTISED_Asym_Pause
94 #define ADVERTISED_Asym_Pause (1 << 14)
97 /* These were introduced in Linux 2.6.24, so they might be missing if we
98 * have old headers. */
99 #ifndef ETHTOOL_GFLAGS
100 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
102 #ifndef ETHTOOL_SFLAGS
103 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
106 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
109 #define TC_RTAB_SIZE 1024
112 /* Linux 2.6.21 introduced struct tpacket_auxdata.
113 * Linux 2.6.27 added the tp_vlan_tci member.
114 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
115 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
116 * TP_STATUS_VLAN_TPID_VALID.
118 * With all this churn it's easiest to unconditionally define a replacement
119 * structure that has everything we want.
121 #ifndef PACKET_AUXDATA
122 #define PACKET_AUXDATA 8
124 #ifndef TP_STATUS_VLAN_VALID
125 #define TP_STATUS_VLAN_VALID (1 << 4)
127 #ifndef TP_STATUS_VLAN_TPID_VALID
128 #define TP_STATUS_VLAN_TPID_VALID (1 << 6)
130 #undef tpacket_auxdata
131 #define tpacket_auxdata rpl_tpacket_auxdata
132 struct tpacket_auxdata {
138 uint16_t tp_vlan_tci;
139 uint16_t tp_vlan_tpid;
142 /* Linux 2.6.35 introduced IFLA_STATS64 and rtnl_link_stats64.
144 * Tests for rtnl_link_stats64 don't seem to consistently work, e.g. on
145 * 2.6.32-431.29.2.el6.x86_64 (see report at
146 * http://openvswitch.org/pipermail/dev/2014-October/047978.html). Maybe
147 * if_link.h is not self-contained on those kernels. It is easiest to
148 * unconditionally define a replacement. */
150 #define IFLA_STATS64 23
152 #define rtnl_link_stats64 rpl_rtnl_link_stats64
153 struct rtnl_link_stats64 {
165 uint64_t rx_length_errors;
166 uint64_t rx_over_errors;
167 uint64_t rx_crc_errors;
168 uint64_t rx_frame_errors;
169 uint64_t rx_fifo_errors;
170 uint64_t rx_missed_errors;
172 uint64_t tx_aborted_errors;
173 uint64_t tx_carrier_errors;
174 uint64_t tx_fifo_errors;
175 uint64_t tx_heartbeat_errors;
176 uint64_t tx_window_errors;
178 uint64_t rx_compressed;
179 uint64_t tx_compressed;
183 VALID_IFINDEX = 1 << 0,
184 VALID_ETHERADDR = 1 << 1,
188 VALID_POLICING = 1 << 5,
189 VALID_VPORT_STAT_ERROR = 1 << 6,
190 VALID_DRVINFO = 1 << 7,
191 VALID_FEATURES = 1 << 8,
194 /* Traffic control. */
196 /* An instance of a traffic control class. Always associated with a particular
199 * Each TC implementation subclasses this with whatever additional data it
202 const struct tc_ops *ops;
203 struct hmap queues; /* Contains "struct tc_queue"s.
204 * Read by generic TC layer.
205 * Written only by TC implementation. */
208 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
210 /* One traffic control queue.
212 * Each TC implementation subclasses this with whatever additional data it
215 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
216 unsigned int queue_id; /* OpenFlow queue ID. */
217 long long int created; /* Time queue was created, in msecs. */
220 /* A particular kind of traffic control. Each implementation generally maps to
221 * one particular Linux qdisc class.
223 * The functions below return 0 if successful or a positive errno value on
224 * failure, except where otherwise noted. All of them must be provided, except
225 * where otherwise noted. */
227 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
228 * This is null for tc_ops_default and tc_ops_other, for which there are no
229 * appropriate values. */
230 const char *linux_name;
232 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
233 const char *ovs_name;
235 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
236 * queues. The queues are numbered 0 through n_queues - 1. */
237 unsigned int n_queues;
239 /* Called to install this TC class on 'netdev'. The implementation should
240 * make the Netlink calls required to set up 'netdev' with the right qdisc
241 * and configure it according to 'details'. The implementation may assume
242 * that the current qdisc is the default; that is, there is no need for it
243 * to delete the current qdisc before installing itself.
245 * The contents of 'details' should be documented as valid for 'ovs_name'
246 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
247 * (which is built as ovs-vswitchd.conf.db(8)).
249 * This function must return 0 if and only if it sets 'netdev->tc' to an
250 * initialized 'struct tc'.
252 * (This function is null for tc_ops_other, which cannot be installed. For
253 * other TC classes it should always be nonnull.) */
254 int (*tc_install)(struct netdev *netdev, const struct smap *details);
256 /* Called when the netdev code determines (through a Netlink query) that
257 * this TC class's qdisc is installed on 'netdev', but we didn't install
258 * it ourselves and so don't know any of the details.
260 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
261 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
262 * implementation should parse the other attributes of 'nlmsg' as
263 * necessary to determine its configuration. If necessary it should also
264 * use Netlink queries to determine the configuration of queues on
267 * This function must return 0 if and only if it sets 'netdev->tc' to an
268 * initialized 'struct tc'. */
269 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
271 /* Destroys the data structures allocated by the implementation as part of
272 * 'tc'. (This includes destroying 'tc->queues' by calling
275 * The implementation should not need to perform any Netlink calls. If
276 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
277 * (But it may not be desirable.)
279 * This function may be null if 'tc' is trivial. */
280 void (*tc_destroy)(struct tc *tc);
282 /* Retrieves details of 'netdev->tc' configuration into 'details'.
284 * The implementation should not need to perform any Netlink calls, because
285 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
286 * cached the configuration.
288 * The contents of 'details' should be documented as valid for 'ovs_name'
289 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
290 * (which is built as ovs-vswitchd.conf.db(8)).
292 * This function may be null if 'tc' is not configurable.
294 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
296 /* Reconfigures 'netdev->tc' according to 'details', performing any
297 * required Netlink calls to complete the reconfiguration.
299 * The contents of 'details' should be documented as valid for 'ovs_name'
300 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
301 * (which is built as ovs-vswitchd.conf.db(8)).
303 * This function may be null if 'tc' is not configurable.
305 int (*qdisc_set)(struct netdev *, const struct smap *details);
307 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
308 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
310 * The contents of 'details' should be documented as valid for 'ovs_name'
311 * in the "other_config" column in the "Queue" table in
312 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
314 * The implementation should not need to perform any Netlink calls, because
315 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
316 * cached the queue configuration.
318 * This function may be null if 'tc' does not have queues ('n_queues' is
320 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
321 struct smap *details);
323 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
324 * 'details', perfoming any required Netlink calls to complete the
325 * reconfiguration. The caller ensures that 'queue_id' is less than
328 * The contents of 'details' should be documented as valid for 'ovs_name'
329 * in the "other_config" column in the "Queue" table in
330 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
332 * This function may be null if 'tc' does not have queues or its queues are
333 * not configurable. */
334 int (*class_set)(struct netdev *, unsigned int queue_id,
335 const struct smap *details);
337 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
338 * tc_queue's within 'netdev->tc->queues'.
340 * This function may be null if 'tc' does not have queues or its queues
341 * cannot be deleted. */
342 int (*class_delete)(struct netdev *, struct tc_queue *queue);
344 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
345 * 'struct tc_queue's within 'netdev->tc->queues'.
347 * On success, initializes '*stats'.
349 * This function may be null if 'tc' does not have queues or if it cannot
350 * report queue statistics. */
351 int (*class_get_stats)(const struct netdev *netdev,
352 const struct tc_queue *queue,
353 struct netdev_queue_stats *stats);
355 /* Extracts queue stats from 'nlmsg', which is a response to a
356 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
358 * This function may be null if 'tc' does not have queues or if it cannot
359 * report queue statistics. */
360 int (*class_dump_stats)(const struct netdev *netdev,
361 const struct ofpbuf *nlmsg,
362 netdev_dump_queue_stats_cb *cb, void *aux);
/* Initializes the generic part of a traffic-control instance: records the
 * implementation's 'ops' and creates the (initially empty) queue map.
 * NOTE(review): this extract is missing interior lines (braces and the
 * assignment of 'ops' into 'tc') -- verify against the full source. */
366 tc_init(struct tc *tc, const struct tc_ops *ops)
369 hmap_init(&tc->queues);
/* Counterpart of tc_init(): destroys the queue hmap owned by 'tc'. */
373 tc_destroy(struct tc *tc)
375 hmap_destroy(&tc->queues);
378 static const struct tc_ops tc_ops_htb;
379 static const struct tc_ops tc_ops_hfsc;
380 static const struct tc_ops tc_ops_codel;
381 static const struct tc_ops tc_ops_fqcodel;
382 static const struct tc_ops tc_ops_sfq;
383 static const struct tc_ops tc_ops_default;
384 static const struct tc_ops tc_ops_other;
386 static const struct tc_ops *const tcs[] = {
387 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
388 &tc_ops_hfsc, /* Hierarchical fair service curve. */
389 &tc_ops_codel, /* Controlled delay */
390 &tc_ops_fqcodel, /* Fair queue controlled delay */
391 &tc_ops_sfq, /* Stochastic fair queueing */
392 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
393 &tc_ops_other, /* Some other qdisc. */
397 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
398 static unsigned int tc_get_major(unsigned int handle);
399 static unsigned int tc_get_minor(unsigned int handle);
401 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
402 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
403 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
405 static struct tcmsg *tc_make_request(const struct netdev *, int type,
406 unsigned int flags, struct ofpbuf *);
407 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
408 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
409 static int tc_add_policer(struct netdev *,
410 uint32_t kbits_rate, uint32_t kbits_burst);
412 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
413 struct nlattr **options);
414 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
415 struct nlattr **options,
416 struct netdev_queue_stats *);
417 static int tc_query_class(const struct netdev *,
418 unsigned int handle, unsigned int parent,
419 struct ofpbuf **replyp);
420 static int tc_delete_class(const struct netdev *, unsigned int handle);
422 static int tc_del_qdisc(struct netdev *netdev);
423 static int tc_query_qdisc(const struct netdev *netdev);
425 static int tc_calc_cell_log(unsigned int mtu);
426 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
427 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
428 const struct tc_ratespec *rate);
429 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
431 struct netdev_linux {
434 /* Protects all members below. */
435 struct ovs_mutex mutex;
437 unsigned int cache_valid;
439 bool miimon; /* Link status of last poll. */
440 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
441 struct timer miimon_timer;
443 /* The following are figured out "on demand" only. They are only valid
444 * when the corresponding VALID_* bit in 'cache_valid' is set. */
446 uint8_t etheraddr[ETH_ADDR_LEN];
447 struct in_addr address, netmask;
450 unsigned int ifi_flags;
451 long long int carrier_resets;
452 uint32_t kbits_rate; /* Policing data. */
453 uint32_t kbits_burst;
454 int vport_stats_error; /* Cached error code from vport_get_stats().
455 0 or an errno value. */
456 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
457 int ether_addr_error; /* Cached error code from set/get etheraddr. */
458 int netdev_policing_error; /* Cached error code from set policing. */
459 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
460 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
461 int in6_error; /* Cached error code from reading in6 addr. */
463 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
464 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
465 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
467 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
470 /* For devices of class netdev_tap_class only. */
474 struct netdev_rxq_linux {
475 struct netdev_rxq up;
480 /* This is set pretty low because we probably won't learn anything from the
481 * additional log messages. */
482 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
484 /* Polling miimon status for all ports causes performance degradation when
485 * handling a large number of ports. If there are no devices using miimon, then
486 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
488 * Readers do not depend on this variable synchronizing with the related
489 * changes in the device miimon status, so we can use atomic_count. */
490 static atomic_count miimon_cnt = ATOMIC_COUNT_INIT(0);
492 static void netdev_linux_run(void);
494 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
495 int cmd, const char *cmd_name);
496 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
497 int cmd, const char *cmd_name);
498 static int get_flags(const struct netdev *, unsigned int *flags);
499 static int set_flags(const char *, unsigned int flags);
500 static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
501 enum netdev_flags on, enum netdev_flags *old_flagsp)
502 OVS_REQUIRES(netdev->mutex);
503 static int do_get_ifindex(const char *netdev_name);
504 static int get_ifindex(const struct netdev *, int *ifindexp);
505 static int do_set_addr(struct netdev *netdev,
506 int ioctl_nr, const char *ioctl_name,
507 struct in_addr addr);
508 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
509 static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
510 static int get_stats_via_netlink(const struct netdev *, struct netdev_stats *);
511 static int af_packet_sock(void);
512 static bool netdev_linux_miimon_enabled(void);
513 static void netdev_linux_miimon_run(void);
514 static void netdev_linux_miimon_wait(void);
515 static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup);
/* Returns true if 'netdev_class' is one of the Linux classes implemented in
 * this file, identified by it sharing netdev_linux_run() as its run hook. */
518 is_netdev_linux_class(const struct netdev_class *netdev_class)
520 return netdev_class->run == netdev_linux_run;
/* Returns true if 'netdev' is a tap device (netdev_tap_class). */
524 is_tap_netdev(const struct netdev *netdev)
526 return netdev_get_class(netdev) == &netdev_tap_class;
/* Downcasts 'netdev' to its containing netdev_linux.  Asserts that the
 * class really is one of ours before trusting the layout. */
529 static struct netdev_linux *
530 netdev_linux_cast(const struct netdev *netdev)
532 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
534 return CONTAINER_OF(netdev, struct netdev_linux, up);
/* Downcasts 'rx' to its containing netdev_rxq_linux, asserting that the
 * owning netdev belongs to a class from this file. */
537 static struct netdev_rxq_linux *
538 netdev_rxq_linux_cast(const struct netdev_rxq *rx)
540 ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
541 return CONTAINER_OF(rx, struct netdev_rxq_linux, up);
544 static void netdev_linux_update(struct netdev_linux *netdev,
545 const struct rtnetlink_change *)
546 OVS_REQUIRES(netdev->mutex);
547 static void netdev_linux_changed(struct netdev_linux *netdev,
548 unsigned int ifi_flags, unsigned int mask)
549 OVS_REQUIRES(netdev->mutex);
551 /* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK,
552 * RTNLGRP_IPV4_IFADDR and RTNLGRP_IPV6_IFADDR changes, or NULL
553 * if no such socket could be created. */
554 static struct nl_sock *
555 netdev_linux_notify_sock(void)
/* Process-wide singleton, created on first call under ovsthread_once. */
557 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
558 static struct nl_sock *sock;
559 unsigned int mcgroups[3] = {RTNLGRP_LINK, RTNLGRP_IPV4_IFADDR,
560 RTNLGRP_IPV6_IFADDR};
562 if (ovsthread_once_start(&once)) {
565 error = nl_sock_create(NETLINK_ROUTE, &sock);
/* Join each multicast group; on any join failure the socket is torn
 * down (presumably leaving 'sock' NULL -- extract is missing the
 * error-path lines, verify against full source). */
569 for (i = 0; i < ARRAY_SIZE(mcgroups); i++) {
570 error = nl_sock_join_mcgroup(sock, mcgroups[i]);
572 nl_sock_destroy(sock);
578 ovsthread_once_done(&once);
/* Returns true if at least one device currently uses miimon polling, so the
 * callers can skip the miimon run/wait paths entirely when idle. */
585 netdev_linux_miimon_enabled(void)
587 return atomic_count_get(&miimon_cnt) > 0;
/* Periodic run hook for all Linux netdev classes: polls miimon status (if
 * enabled) and drains the shared rtnetlink notification socket, pushing each
 * parsed change into the matching netdev_linux instance. */
591 netdev_linux_run(void)
593 struct nl_sock *sock;
596 if (netdev_linux_miimon_enabled()) {
597 netdev_linux_miimon_run();
600 sock = netdev_linux_notify_sock();
606 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
607 uint64_t buf_stub[4096 / 8];
/* Stack-backed buffer avoids a heap allocation per received message. */
610 ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
611 error = nl_sock_recv(sock, &buf, false);
613 struct rtnetlink_change change;
615 if (rtnetlink_parse(&buf, &change)) {
616 struct netdev *netdev_ = netdev_from_name(change.ifname);
617 if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
618 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
620 ovs_mutex_lock(&netdev->mutex);
621 netdev_linux_update(netdev, &change);
622 ovs_mutex_unlock(&netdev->mutex);
/* netdev_from_name() took a reference; drop it. */
624 netdev_close(netdev_);
626 } else if (error == ENOBUFS) {
/* The kernel dropped notifications; we can no longer trust any
 * cached state, so refresh the flags of every known device. */
627 struct shash device_shash;
628 struct shash_node *node;
632 shash_init(&device_shash);
633 netdev_get_devices(&netdev_linux_class, &device_shash);
634 SHASH_FOR_EACH (node, &device_shash) {
635 struct netdev *netdev_ = node->data;
636 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
639 ovs_mutex_lock(&netdev->mutex);
640 get_flags(netdev_, &flags);
641 netdev_linux_changed(netdev, flags, 0);
642 ovs_mutex_unlock(&netdev->mutex);
644 netdev_close(netdev_);
646 shash_destroy(&device_shash);
647 } else if (error != EAGAIN) {
648 VLOG_WARN_RL(&rl, "error reading or parsing netlink (%s)",
649 ovs_strerror(error));
/* Wait hook paired with netdev_linux_run(): arms poll_block() wakeups for
 * miimon timers (when in use) and for readability of the notify socket. */
656 netdev_linux_wait(void)
658 struct nl_sock *sock;
660 if (netdev_linux_miimon_enabled()) {
661 netdev_linux_miimon_wait();
663 sock = netdev_linux_notify_sock();
665 nl_sock_wait(sock, POLLIN);
/* Records that 'dev' changed: bumps the netdev change sequence, counts a
 * carrier reset when IFF_RUNNING toggled, stores the new ifi_flags, and
 * keeps only the cache-validity bits listed in 'mask' (pass 0 to invalidate
 * everything).  Caller must hold dev->mutex. */
670 netdev_linux_changed(struct netdev_linux *dev,
671 unsigned int ifi_flags, unsigned int mask)
672 OVS_REQUIRES(dev->mutex)
674 netdev_change_seq_changed(&dev->up);
676 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
677 dev->carrier_resets++;
679 dev->ifi_flags = ifi_flags;
681 dev->cache_valid &= mask;
/* Applies a parsed rtnetlink 'change' to 'dev'.  RTM_NEWLINK refreshes the
 * cached MTU, MAC address and ifindex directly from the message (preserving
 * drv-info and IPv4/IPv6 address caches); other link messages just invalidate
 * everything; address-group messages invalidate only the in4/in6 caches.
 * Caller must hold dev->mutex. */
685 netdev_linux_update(struct netdev_linux *dev,
686 const struct rtnetlink_change *change)
687 OVS_REQUIRES(dev->mutex)
689 if (rtnetlink_type_is_rtnlgrp_link(change->nlmsg_type)){
690 if (change->nlmsg_type == RTM_NEWLINK) {
691 /* Keep drv-info, in4, in6. */
692 netdev_linux_changed(dev, change->ifi_flags,
693 VALID_DRVINFO | VALID_IN4 | VALID_IN6);
695 /* Update netdev from rtnl-change msg. */
697 dev->mtu = change->mtu;
698 dev->cache_valid |= VALID_MTU;
699 dev->netdev_mtu_error = 0;
/* An all-zero address in the message means "not reported". */
702 if (!eth_addr_is_zero(change->addr)) {
703 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
704 dev->cache_valid |= VALID_ETHERADDR;
705 dev->ether_addr_error = 0;
708 dev->ifindex = change->if_index;
709 dev->cache_valid |= VALID_IFINDEX;
710 dev->get_ifindex_error = 0;
/* Non-NEWLINK link message: invalidate all cached state (mask 0). */
712 netdev_linux_changed(dev, change->ifi_flags, 0);
714 } else if (rtnetlink_type_is_rtnlgrp_addr(change->nlmsg_type)) {
715 /* Invalidates in4, in6. */
716 netdev_linux_changed(dev, dev->ifi_flags,
717 ~(VALID_IN4 | VALID_IN6));
/* Allocates a zero-initialized netdev_linux and returns its embedded
 * 'struct netdev' (return statement missing from this extract). */
723 static struct netdev *
724 netdev_linux_alloc(void)
726 struct netdev_linux *netdev = xzalloc(sizeof *netdev);
/* Construction steps shared by system, internal, and tap devices:
 * initializes the per-device mutex. */
731 netdev_linux_common_construct(struct netdev_linux *netdev)
733 ovs_mutex_init(&netdev->mutex);
736 /* Creates system and internal devices. */
738 netdev_linux_construct(struct netdev *netdev_)
740 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
743 netdev_linux_common_construct(netdev);
/* Prime the cached interface flags; ENODEV means the kernel device is
 * absent, which is fatal except for "internal" devices (see below). */
745 error = get_flags(&netdev->up, &netdev->ifi_flags);
746 if (error == ENODEV) {
747 if (netdev->up.netdev_class != &netdev_internal_class) {
748 /* The device does not exist, so don't allow it to be opened. */
751 /* "Internal" netdevs have to be created as netdev objects before
752 * they exist in the kernel, because creating them in the kernel
753 * happens by passing a netdev object to dpif_port_add().
754 * Therefore, ignore the error. */
761 /* For most types of netdevs we open the device for each call of
762 * netdev_open(). However, this is not the case with tap devices,
763 * since it is only possible to open the device once. In this
764 * situation we share a single file descriptor, and consequently
765 * buffers, across all readers. Therefore once data is read it will
766 * be unavailable to other reads for tap devices. */
/* Constructor for tap devices: opens /dev/net/tun, creates the named tap
 * interface via TUNSETIFF (IFF_NO_PI suppresses the protocol-info header),
 * and puts the fd into non-blocking mode.  On failure the fd is closed
 * (error-path lines are missing from this extract). */
768 netdev_linux_construct_tap(struct netdev *netdev_)
770 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
771 static const char tap_dev[] = "/dev/net/tun";
772 const char *name = netdev_->name;
776 netdev_linux_common_construct(netdev);
778 /* Open tap device. */
779 netdev->tap_fd = open(tap_dev, O_RDWR);
780 if (netdev->tap_fd < 0) {
782 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
786 /* Create tap device. */
787 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
788 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
789 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
790 VLOG_WARN("%s: creating tap device failed: %s", name,
791 ovs_strerror(errno));
796 /* Make non-blocking. */
797 error = set_nonblocking(netdev->tap_fd);
805 close(netdev->tap_fd);
/* Destructor: tears down any installed TC state, closes the tap fd for tap
 * devices, decrements the global miimon user count if this device was
 * polling, and destroys the mutex. */
810 netdev_linux_destruct(struct netdev *netdev_)
812 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
814 if (netdev->tc && netdev->tc->ops->tc_destroy) {
815 netdev->tc->ops->tc_destroy(netdev->tc);
818 if (netdev_get_class(netdev_) == &netdev_tap_class
819 && netdev->tap_fd >= 0)
821 close(netdev->tap_fd);
824 if (netdev->miimon_interval > 0) {
825 atomic_count_dec(&miimon_cnt);
828 ovs_mutex_destroy(&netdev->mutex);
/* Frees the storage allocated by netdev_linux_alloc() (the free() call is
 * missing from this extract). */
832 netdev_linux_dealloc(struct netdev *netdev_)
834 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* Allocates a zero-initialized rx-queue object and returns its embedded
 * 'struct netdev_rxq' (return statement missing from this extract). */
838 static struct netdev_rxq *
839 netdev_linux_rxq_alloc(void)
841 struct netdev_rxq_linux *rx = xzalloc(sizeof *rx);
/* Sets up the receive path for one rx queue.  Tap devices reuse the shared
 * tap fd; all other devices get a dedicated AF_PACKET SOCK_RAW socket that
 * is marked for PACKET_AUXDATA (to recover stripped VLAN tags), made
 * non-blocking, bound to the device's ifindex, and fitted with a BPF filter
 * that accepts inbound traffic only (so locally sent packets are not looped
 * back into the rx path). */
846 netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
848 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
849 struct netdev *netdev_ = rx->up.netdev;
850 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
853 ovs_mutex_lock(&netdev->mutex);
854 rx->is_tap = is_tap_netdev(netdev_);
856 rx->fd = netdev->tap_fd;
858 struct sockaddr_ll sll;
860 /* Result of tcpdump -dd inbound */
861 static const struct sock_filter filt[] = {
862 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
863 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
864 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
865 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
867 static const struct sock_fprog fprog = {
868 ARRAY_SIZE(filt), (struct sock_filter *) filt
871 /* Create file descriptor. */
872 rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
875 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
/* Ask the kernel to deliver tpacket_auxdata (VLAN TCI/TPID) as a
 * control message with each received packet. */
880 if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) {
882 VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
883 netdev_get_name(netdev_), ovs_strerror(error));
887 /* Set non-blocking mode. */
888 error = set_nonblocking(rx->fd);
893 /* Get ethernet device index. */
894 error = get_ifindex(&netdev->up, &ifindex);
899 /* Bind to specific ethernet device. */
900 memset(&sll, 0, sizeof sll);
901 sll.sll_family = AF_PACKET;
902 sll.sll_ifindex = ifindex;
903 sll.sll_protocol = htons(ETH_P_ALL);
904 if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
906 VLOG_ERR("%s: failed to bind raw socket (%s)",
907 netdev_get_name(netdev_), ovs_strerror(error));
911 /* Filter for only inbound packets. */
912 error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
916 VLOG_ERR("%s: failed to attach filter (%s)",
917 netdev_get_name(netdev_), ovs_strerror(error));
/* Success path unlock; a second unlock below serves the error path
 * (intervening cleanup lines are missing from this extract). */
921 ovs_mutex_unlock(&netdev->mutex);
929 ovs_mutex_unlock(&netdev->mutex);
/* Tears down an rx queue; presumably closes rx->fd for non-tap devices
 * (the remaining lines are missing from this extract -- verify). */
934 netdev_linux_rxq_destruct(struct netdev_rxq *rxq_)
936 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
/* Frees the rx-queue object allocated by netdev_linux_rxq_alloc() (the
 * free() call is missing from this extract). */
944 netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_)
946 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
/* Returns the VLAN TPID to re-insert for a packet whose tag the kernel
 * stripped: the kernel-reported TPID when TP_STATUS_VLAN_TPID_VALID is set
 * (Linux >= 3.13), otherwise the default 802.1Q ethertype. */
952 auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux)
954 if (aux->tp_status & TP_STATUS_VLAN_TPID_VALID) {
955 return htons(aux->tp_vlan_tpid);
957 return htons(ETH_TYPE_VLAN);
/* Returns true if 'aux' carries a VLAN tag.  A nonzero TCI alone suffices
 * because pre-3.0 kernels lack TP_STATUS_VLAN_VALID; the flag additionally
 * catches the legitimate all-zero TCI on newer kernels. */
962 auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
964 return aux->tp_vlan_tci || aux->tp_status & TP_STATUS_VLAN_VALID;
/* Receives one packet from the AF_PACKET socket 'fd' into 'buffer',
 * restoring any VLAN tag the kernel stripped (reported via the
 * PACKET_AUXDATA control message) by pushing it back onto the frame.
 * Returns 0 on success or a positive errno value (error-path lines are
 * missing from this extract). */
968 netdev_linux_rxq_recv_sock(int fd, struct dp_packet *buffer)
973 struct cmsghdr *cmsg;
/* NOTE(review): this union member shadows the 'buffer' parameter;
 * harmless here but confusing -- consider renaming in the full source. */
976 char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
980 /* Reserve headroom for a single VLAN tag */
981 dp_packet_reserve(buffer, VLAN_HEADER_LEN);
982 size = dp_packet_tailroom(buffer);
984 iov.iov_base = dp_packet_data(buffer);
986 msgh.msg_name = NULL;
987 msgh.msg_namelen = 0;
990 msgh.msg_control = &cmsg_buffer;
991 msgh.msg_controllen = sizeof cmsg_buffer;
/* MSG_TRUNC makes recvmsg() report the full packet length even when it
 * exceeds 'size', so oversize frames can be detected below. */
995 retval = recvmsg(fd, &msgh, MSG_TRUNC);
996 } while (retval < 0 && errno == EINTR);
1000 } else if (retval > size) {
1004 dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);
/* Scan control messages for the kernel's VLAN auxdata. */
1006 for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg; cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
1007 const struct tpacket_auxdata *aux;
1009 if (cmsg->cmsg_level != SOL_PACKET
1010 || cmsg->cmsg_type != PACKET_AUXDATA
1011 || cmsg->cmsg_len < CMSG_LEN(sizeof(struct tpacket_auxdata))) {
1015 aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg));
1016 if (auxdata_has_vlan_tci(aux)) {
1017 if (retval < ETH_HEADER_LEN) {
1021 eth_push_vlan(buffer, auxdata_to_vlan_tpid(aux),
1022 htons(aux->tp_vlan_tci));
/* Receives one packet from tap fd 'fd' into 'buffer' with a plain read(),
 * retrying on EINTR.  Returns 0 on success or a positive errno value
 * (error-path lines are missing from this extract). */
1031 netdev_linux_rxq_recv_tap(int fd, struct dp_packet *buffer)
1034 size_t size = dp_packet_tailroom(buffer);
1037 retval = read(fd, dp_packet_data(buffer), size);
1038 } while (retval < 0 && errno == EINTR);
1042 } else if (retval > size) {
1046 dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);
/* Top-level receive: allocates a packet sized for the device MTU (falling
 * back to ETH_PAYLOAD_MAX) plus VLAN/Ethernet header room, dispatches to
 * the tap or raw-socket receive helper, and on success pads the frame to
 * minimum length and hands it back in packets[0]. */
1051 netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet **packets,
1054 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1055 struct netdev *netdev = rx->up.netdev;
1056 struct dp_packet *buffer;
1060 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
1061 mtu = ETH_PAYLOAD_MAX;
1064 buffer = dp_packet_new_with_headroom(VLAN_ETH_HEADER_LEN + mtu,
1065 DP_NETDEV_HEADROOM);
1066 retval = (rx->is_tap
1067 ? netdev_linux_rxq_recv_tap(rx->fd, buffer)
1068 : netdev_linux_rxq_recv_sock(rx->fd, buffer));
1071 if (retval != EAGAIN && retval != EMSGSIZE) {
/* NOTE(review): BUG -- the arguments are swapped relative to the
 * format string ("on %s: %s" expects the device name first, then the
 * error text), and 'errno' may be stale here: the helpers return the
 * error as a positive errno in 'retval'.  Should read
 * netdev_rxq_get_name(rxq_), ovs_strerror(retval). */
1072 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
1073 ovs_strerror(errno), netdev_rxq_get_name(rxq_));
1075 dp_packet_delete(buffer);
1077 dp_packet_pad(buffer);
1078 dp_packet_set_rss_hash(buffer, 0);
1079 packets[0] = buffer;
/* Arms poll_block() to wake when the rx fd becomes readable. */
1087 netdev_linux_rxq_wait(struct netdev_rxq *rxq_)
1089 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1090 poll_fd_wait(rx->fd, POLLIN);
/* Discards all packets waiting on the rx queue.  For tap devices the queue
 * length is read via SIOCGIFTXQLEN and that many reads are drained from the
 * fd; for raw sockets the socket receive buffer is flushed instead. */
1094 netdev_linux_rxq_drain(struct netdev_rxq *rxq_)
1096 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1099 int error = af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_), &ifr,
1100 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
1104 drain_fd(rx->fd, ifr.ifr_qlen);
1107 return drain_rcvbuf(rx->fd);
1111 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
1112 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
1113 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
1114 * the packet is too big or too small to transmit on the device.
1116 * The caller retains ownership of 'buffer' in all cases.
1118 * The kernel maintains a packet transmission queue, so the caller is not
1119 * expected to do additional queuing of packets. */
/* Transmits the 'cnt' packets in 'pkts'.  Non-tap devices send through the
 * shared AF_PACKET socket addressed by ifindex; tap devices write directly
 * to the tap fd (sending via AF_PACKET would loop the packet back into our
 * own rx path).  EINTR retries the same packet; ENOBUFS is translated to
 * EAGAIN for the caller. */
1121 netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED,
1122 struct dp_packet **pkts, int cnt, bool may_steal)
1127 /* 'i' is incremented only if there's no error */
1128 for (i = 0; i < cnt;) {
1129 const void *data = dp_packet_data(pkts[i]);
1130 size_t size = dp_packet_size(pkts[i]);
1133 if (!is_tap_netdev(netdev_)) {
1134 /* Use our AF_PACKET socket to send to this device. */
1135 struct sockaddr_ll sll;
1141 sock = af_packet_sock();
1146 ifindex = netdev_get_ifindex(netdev_);
1151 /* We don't bother setting most fields in sockaddr_ll because the
1152 * kernel ignores them for SOCK_RAW. */
1153 memset(&sll, 0, sizeof sll);
1154 sll.sll_family = AF_PACKET;
1155 sll.sll_ifindex = ifindex;
1157 iov.iov_base = CONST_CAST(void *, data);
1160 msg.msg_name = &sll;
1161 msg.msg_namelen = sizeof sll;
1164 msg.msg_control = NULL;
1165 msg.msg_controllen = 0;
1168 retval = sendmsg(sock, &msg, 0);
1170 /* Use the tap fd to send to this device. This is essential for
1171 * tap devices, because packets sent to a tap device with an
1172 * AF_PACKET socket will loop back to be *received* again on the
1173 * tap device. This doesn't occur on other interface types
1174 * because we attach a socket filter to the rx socket. */
1175 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1177 retval = write(netdev->tap_fd, data, size);
1181 /* The Linux AF_PACKET implementation never blocks waiting for room
1182 * for packets, instead returning ENOBUFS. Translate this into
1183 * EAGAIN for the caller. */
1184 error = errno == ENOBUFS ? EAGAIN : errno;
1185 if (error == EINTR) {
1186 /* continue without incrementing 'i', i.e. retry this packet */
/* NOTE(review): 'retval' is presumably a signed ssize_t here, so
 * printing it with %"PRIuSIZE" (unsigned) is a format mismatch, and
 * the signed/unsigned compare with 'size' below deserves a check --
 * verify against the full source. */
1190 } else if (retval != size) {
1191 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE" bytes"
1192 " of %"PRIuSIZE") on %s", retval, size,
1193 netdev_get_name(netdev_));
1198 /* Process the next packet in the batch */
/* When 'may_steal' permits, ownership of the packets passed to us is
 * taken and they are freed here. */
1203 for (i = 0; i < cnt; i++) {
1204 dp_packet_delete(pkts[i]);
1208 if (error && error != EAGAIN) {
1209 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
1210 netdev_get_name(netdev_), ovs_strerror(error));
1217 /* Registers with the poll loop to wake up from the next call to poll_block()
1218 * when the packet transmission queue has sufficient room to transmit a packet
1219 * with netdev_send().
1221 * The kernel maintains a packet transmission queue, so the client is not
1222 * expected to do additional queuing of packets. Thus, this function is
1223 * unlikely to ever be used. It is included for completeness. */
/* NOTE(review): the non-tap branch (if any) is not visible in this listing. */
1225 netdev_linux_send_wait(struct netdev *netdev, int qid OVS_UNUSED)
1227 if (is_tap_netdev(netdev)) {
1228 /* TAP device always accepts packets.*/
1229 poll_immediate_wake();
1233 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1234 * otherwise a positive errno value. */
1236 netdev_linux_set_etheraddr(struct netdev *netdev_,
1237 const uint8_t mac[ETH_ADDR_LEN])
1239 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1240 enum netdev_flags old_flags = 0;
1243 ovs_mutex_lock(&netdev->mutex);
/* Short-circuit if the cached address is already 'mac' or if the last
 * attempt cached a hard error; otherwise invalidate the cache entry. */
1245 if (netdev->cache_valid & VALID_ETHERADDR) {
1246 error = netdev->ether_addr_error;
1247 if (error || eth_addr_equals(netdev->etheraddr, mac)) {
1250 netdev->cache_valid &= ~VALID_ETHERADDR;
1253 /* Tap devices must be brought down before setting the address. */
1254 if (is_tap_netdev(netdev_)) {
1255 update_flags(netdev, NETDEV_UP, 0, &old_flags);
1257 error = set_etheraddr(netdev_get_name(netdev_), mac);
/* ENODEV is cached like success so a vanished device is not re-probed. */
1258 if (!error || error == ENODEV) {
1259 netdev->ether_addr_error = error;
1260 netdev->cache_valid |= VALID_ETHERADDR;
1262 memcpy(netdev->etheraddr, mac, ETH_ADDR_LEN);
/* Restore the tap device's UP flag if we cleared it above. */
1266 if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
1267 update_flags(netdev, 0, NETDEV_UP, &old_flags);
1271 ovs_mutex_unlock(&netdev->mutex);
1275 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1277 netdev_linux_get_etheraddr(const struct netdev *netdev_,
1278 uint8_t mac[ETH_ADDR_LEN])
1280 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1283 ovs_mutex_lock(&netdev->mutex);
/* Populate the cache on first use; afterwards serve from cache. */
1284 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1285 netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
1287 netdev->cache_valid |= VALID_ETHERADDR;
1290 error = netdev->ether_addr_error;
1292 memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN);
1294 ovs_mutex_unlock(&netdev->mutex);
/* Helper: fetches the MTU via SIOCGIFMTU, caching both the value and any
 * error. Caller must hold netdev->mutex (implied by callers shown below). */
1300 netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
1304 if (!(netdev->cache_valid & VALID_MTU)) {
1307 netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
1308 netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
1309 netdev->mtu = ifr.ifr_mtu;
1310 netdev->cache_valid |= VALID_MTU;
1313 error = netdev->netdev_mtu_error;
1315 *mtup = netdev->mtu;
1321 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1322 * in bytes, not including the hardware header; thus, this is typically 1500
1323 * bytes for Ethernet devices. */
1325 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1327 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1330 ovs_mutex_lock(&netdev->mutex);
1331 error = netdev_linux_get_mtu__(netdev, mtup);
1332 ovs_mutex_unlock(&netdev->mutex);
1337 /* Sets the maximum size of transmitted (MTU) for given device using linux
1338 * networking ioctl interface.
1341 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1343 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1347 ovs_mutex_lock(&netdev->mutex);
/* If the cached MTU already equals 'mtu' (or the cache holds an error),
 * skip the ioctl; otherwise invalidate and re-set. */
1348 if (netdev->cache_valid & VALID_MTU) {
1349 error = netdev->netdev_mtu_error;
1350 if (error || netdev->mtu == mtu) {
1353 netdev->cache_valid &= ~VALID_MTU;
1356 error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
1357 SIOCSIFMTU, "SIOCSIFMTU");
/* Cache the result; ENODEV is cached so a vanished device is not retried. */
1358 if (!error || error == ENODEV) {
1359 netdev->netdev_mtu_error = error;
1360 netdev->mtu = ifr.ifr_mtu;
1361 netdev->cache_valid |= VALID_MTU;
1364 ovs_mutex_unlock(&netdev->mutex);
1368 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1369 * On failure, returns a negative errno value. */
1371 netdev_linux_get_ifindex(const struct netdev *netdev_)
1373 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1376 ovs_mutex_lock(&netdev->mutex);
1377 error = get_ifindex(netdev_, &ifindex);
1378 ovs_mutex_unlock(&netdev->mutex);
/* Negate the errno so callers can distinguish errors from valid indexes. */
1380 return error ? -error : ifindex;
/* Reports link state in '*carrier': the MII-monitor result when miimon
 * polling is enabled, otherwise the kernel's IFF_RUNNING flag. */
1384 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1386 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1388 ovs_mutex_lock(&netdev->mutex);
1389 if (netdev->miimon_interval > 0) {
1390 *carrier = netdev->miimon;
1392 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
1394 ovs_mutex_unlock(&netdev->mutex);
/* Returns the number of carrier transitions recorded for 'netdev_'. */
1399 static long long int
1400 netdev_linux_get_carrier_resets(const struct netdev *netdev_)
1402 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1403 long long int carrier_resets;
1405 ovs_mutex_lock(&netdev->mutex);
1406 carrier_resets = netdev->carrier_resets;
1407 ovs_mutex_unlock(&netdev->mutex);
1409 return carrier_resets;
/* Issues MII ioctl 'cmd' on interface 'name', copying 'data' in and out of
 * the ifreq. The ifr_data field is used as in-place storage for the
 * mii_ioctl_data structure, not as a pointer. */
1413 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1414 struct mii_ioctl_data *data)
1419 memset(&ifr, 0, sizeof ifr);
1420 memcpy(&ifr.ifr_data, data, sizeof *data);
1421 error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
1422 memcpy(data, &ifr.ifr_data, sizeof *data);
/* Determines link status for 'name' into '*miimon': first via MII registers
 * (SIOCGMIIPHY + SIOCGMIIREG, BMSR link-status bit), falling back to
 * ethtool ETHTOOL_GLINK if MII is unsupported. */
1428 netdev_linux_get_miimon(const char *name, bool *miimon)
1430 struct mii_ioctl_data data;
1435 memset(&data, 0, sizeof data);
1436 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1438 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1439 data.reg_num = MII_BMSR;
1440 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1444 *miimon = !!(data.val_out & BMSR_LSTATUS);
1446 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1449 struct ethtool_cmd ecmd;
1451 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1454 COVERAGE_INC(netdev_get_ethtool);
1455 memset(&ecmd, 0, sizeof ecmd);
1456 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
/* ETHTOOL_GLINK returns an ethtool_value; reinterpret the ecmd buffer. */
1459 struct ethtool_value eval;
1461 memcpy(&eval, &ecmd, sizeof eval);
1462 *miimon = !!eval.data;
1464 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
/* Enables (interval > 0, clamped to at least 100 ms) or disables
 * (interval <= 0) periodic MII link monitoring on 'netdev_', keeping the
 * global miimon device count in sync. */
1472 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1473 long long int interval)
1475 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1477 ovs_mutex_lock(&netdev->mutex);
1478 interval = interval > 0 ? MAX(interval, 100) : 0;
1479 if (netdev->miimon_interval != interval) {
/* Track how many devices have miimon enabled, for the run/wait loops. */
1480 if (interval && !netdev->miimon_interval) {
1481 atomic_count_inc(&miimon_cnt);
1482 } else if (!interval && netdev->miimon_interval) {
1483 atomic_count_dec(&miimon_cnt);
1486 netdev->miimon_interval = interval;
/* Expire the timer so the new interval takes effect immediately. */
1487 timer_set_expired(&netdev->miimon_timer);
1489 ovs_mutex_unlock(&netdev->mutex);
/* Polls MII link status for every open netdev-linux device whose miimon
 * timer has expired, signalling a change notification when status flips. */
1495 netdev_linux_miimon_run(void)
1497 struct shash device_shash;
1498 struct shash_node *node;
1500 shash_init(&device_shash);
1501 netdev_get_devices(&netdev_linux_class, &device_shash);
1502 SHASH_FOR_EACH (node, &device_shash) {
1503 struct netdev *netdev = node->data;
1504 struct netdev_linux *dev = netdev_linux_cast(netdev);
1507 ovs_mutex_lock(&dev->mutex);
1508 if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
1509 netdev_linux_get_miimon(dev->up.name, &miimon);
1510 if (miimon != dev->miimon) {
1511 dev->miimon = miimon;
1512 netdev_linux_changed(dev, dev->ifi_flags, 0);
1515 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1517 ovs_mutex_unlock(&dev->mutex);
/* netdev_get_devices() took a reference on each device; release it. */
1518 netdev_close(netdev);
1521 shash_destroy(&device_shash);
/* Arranges for the poll loop to wake when any device's miimon timer is due. */
1525 netdev_linux_miimon_wait(void)
1527 struct shash device_shash;
1528 struct shash_node *node;
1530 shash_init(&device_shash);
1531 netdev_get_devices(&netdev_linux_class, &device_shash);
1532 SHASH_FOR_EACH (node, &device_shash) {
1533 struct netdev *netdev = node->data;
1534 struct netdev_linux *dev = netdev_linux_cast(netdev);
1536 ovs_mutex_lock(&dev->mutex);
1537 if (dev->miimon_interval > 0) {
1538 timer_wait(&dev->miimon_timer);
1540 ovs_mutex_unlock(&dev->mutex);
1541 netdev_close(netdev);
1543 shash_destroy(&device_shash);
/* Exchanges the values pointed to by 'a' and 'b'.
 * (Function body is not visible in this listing.) */
1547 swap_uint64(uint64_t *a, uint64_t *b)
1554 /* Copies 'src' into 'dst', performing format conversion in the process.
1556 * 'src' is allowed to be misaligned. */
1558 netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1559 const struct ovs_vport_stats *src)
/* get_32aligned_u64() safely reads 64-bit fields that may only be
 * 32-bit aligned in the netlink-supplied structure. */
1561 dst->rx_packets = get_32aligned_u64(&src->rx_packets);
1562 dst->tx_packets = get_32aligned_u64(&src->tx_packets);
1563 dst->rx_bytes = get_32aligned_u64(&src->rx_bytes);
1564 dst->tx_bytes = get_32aligned_u64(&src->tx_bytes);
1565 dst->rx_errors = get_32aligned_u64(&src->rx_errors);
1566 dst->tx_errors = get_32aligned_u64(&src->tx_errors);
1567 dst->rx_dropped = get_32aligned_u64(&src->rx_dropped);
1568 dst->tx_dropped = get_32aligned_u64(&src->tx_dropped);
/* The vport stats structure has no counterpart for the detailed error
 * counters, so they are explicitly zeroed rather than left undefined. */
1570 dst->collisions = 0;
1571 dst->rx_length_errors = 0;
1572 dst->rx_over_errors = 0;
1573 dst->rx_crc_errors = 0;
1574 dst->rx_frame_errors = 0;
1575 dst->rx_fifo_errors = 0;
1576 dst->rx_missed_errors = 0;
1577 dst->tx_aborted_errors = 0;
1578 dst->tx_carrier_errors = 0;
1579 dst->tx_fifo_errors = 0;
1580 dst->tx_heartbeat_errors = 0;
1581 dst->tx_window_errors = 0;
/* Fetches 'netdev''s stats from the OVS datapath vport layer via netlink.
 * (Error-return lines between the visible statements are not shown.) */
1585 get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1587 struct dpif_netlink_vport reply;
1591 error = dpif_netlink_vport_get(netdev_get_name(netdev), &reply, &buf);
1594 } else if (!reply.stats) {
1599 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
/* Wrapper around get_stats_via_vport__() that caches whether vport stats
 * are available, so repeated failures are not re-attempted every call. */
1607 get_stats_via_vport(const struct netdev *netdev_,
1608 struct netdev_stats *stats)
1610 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1612 if (!netdev->vport_stats_error ||
1613 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1616 error = get_stats_via_vport__(netdev_, stats);
/* ENOENT/ENODEV simply mean the netdev is not an OVS vport; only log
 * unexpected errors. */
1617 if (error && error != ENOENT && error != ENODEV) {
1618 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1620 netdev_get_name(netdev_), ovs_strerror(error));
1622 netdev->vport_stats_error = error;
1623 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
1627 /* Retrieves current device stats for 'netdev-linux'. */
1629 netdev_linux_get_stats(const struct netdev *netdev_,
1630 struct netdev_stats *stats)
1632 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1633 struct netdev_stats dev_stats;
1636 ovs_mutex_lock(&netdev->mutex);
1637 get_stats_via_vport(netdev_, stats);
1638 error = get_stats_via_netlink(netdev_, &dev_stats);
/* Branch on whether vport stats were available. NOTE(review): the
 * 'else if (netdev->vport_stats_error)' condition is the exact negation of
 * the first test, so it is effectively a plain 'else' — worth confirming
 * against the hidden lines between these branches. */
1640 if (!netdev->vport_stats_error) {
1643 } else if (netdev->vport_stats_error) {
1644 /* stats not available from OVS then use netdev stats. */
1647 /* Use kernel netdev's packet and byte counts since vport's counters
1648 * do not reflect packet counts on the wire when GSO, TSO or GRO are
1650 stats->rx_packets = dev_stats.rx_packets;
1651 stats->rx_bytes = dev_stats.rx_bytes;
1652 stats->tx_packets = dev_stats.tx_packets;
1653 stats->tx_bytes = dev_stats.tx_bytes;
/* The remaining counters are accumulated on top of the vport values. */
1655 stats->rx_errors += dev_stats.rx_errors;
1656 stats->tx_errors += dev_stats.tx_errors;
1657 stats->rx_dropped += dev_stats.rx_dropped;
1658 stats->tx_dropped += dev_stats.tx_dropped;
1659 stats->multicast += dev_stats.multicast;
1660 stats->collisions += dev_stats.collisions;
1661 stats->rx_length_errors += dev_stats.rx_length_errors;
1662 stats->rx_over_errors += dev_stats.rx_over_errors;
1663 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1664 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1665 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1666 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1667 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1668 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1669 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1670 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1671 stats->tx_window_errors += dev_stats.tx_window_errors;
1673 ovs_mutex_unlock(&netdev->mutex);
1678 /* Retrieves current device stats for 'netdev-tap' netdev or
1679 * netdev-internal. */
1681 netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
1683 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1684 struct netdev_stats dev_stats;
1687 ovs_mutex_lock(&netdev->mutex);
1688 get_stats_via_vport(netdev_, stats);
1689 error = get_stats_via_netlink(netdev_, &dev_stats);
1691 if (!netdev->vport_stats_error) {
1694 } else if (netdev->vport_stats_error) {
1695 /* Transmit and receive stats will appear to be swapped relative to the
1696 * other ports since we are the one sending the data, not a remote
1697 * computer. For consistency, we swap them back here. This does not
1698 * apply if we are getting stats from the vport layer because it always
1699 * tracks stats from the perspective of the switch. */
1702 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1703 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1704 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1705 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
/* Detailed error counters are not meaningful for tap/internal devices. */
1706 stats->rx_length_errors = 0;
1707 stats->rx_over_errors = 0;
1708 stats->rx_crc_errors = 0;
1709 stats->rx_frame_errors = 0;
1710 stats->rx_fifo_errors = 0;
1711 stats->rx_missed_errors = 0;
1712 stats->tx_aborted_errors = 0;
1713 stats->tx_carrier_errors = 0;
1714 stats->tx_fifo_errors = 0;
1715 stats->tx_heartbeat_errors = 0;
1716 stats->tx_window_errors = 0;
1718 /* Use kernel netdev's packet and byte counts since vport counters
1719 * do not reflect packet counts on the wire when GSO, TSO or GRO
/* rx/tx are crossed deliberately here — see the perspective-swap
 * comment above. */
1721 stats->rx_packets = dev_stats.tx_packets;
1722 stats->rx_bytes = dev_stats.tx_bytes;
1723 stats->tx_packets = dev_stats.rx_packets;
1724 stats->tx_bytes = dev_stats.rx_bytes;
1726 stats->rx_dropped += dev_stats.tx_dropped;
1727 stats->tx_dropped += dev_stats.rx_dropped;
1729 stats->rx_errors += dev_stats.tx_errors;
1730 stats->tx_errors += dev_stats.rx_errors;
1732 stats->multicast += dev_stats.multicast;
1733 stats->collisions += dev_stats.collisions;
1735 ovs_mutex_unlock(&netdev->mutex);
/* Stats for internal devices come solely from the vport layer. */
1741 netdev_internal_get_stats(const struct netdev *netdev_,
1742 struct netdev_stats *stats)
1744 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1747 ovs_mutex_lock(&netdev->mutex);
1748 get_stats_via_vport(netdev_, stats);
1749 error = netdev->vport_stats_error;
1750 ovs_mutex_unlock(&netdev->mutex);
/* Queries link features via ethtool ETHTOOL_GSET and translates them into
 * the netdev-provider NETDEV_F_* bitmaps (supported / advertised / current),
 * caching the result under VALID_FEATURES. Caller holds netdev->mutex. */
1756 netdev_linux_read_features(struct netdev_linux *netdev)
1758 struct ethtool_cmd ecmd;
1762 if (netdev->cache_valid & VALID_FEATURES) {
1766 COVERAGE_INC(netdev_get_ethtool);
1767 memset(&ecmd, 0, sizeof ecmd);
1768 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
1769 ETHTOOL_GSET, "ETHTOOL_GSET");
1774 /* Supported features. */
1775 netdev->supported = 0;
1776 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1777 netdev->supported |= NETDEV_F_10MB_HD;
1779 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1780 netdev->supported |= NETDEV_F_10MB_FD;
1782 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1783 netdev->supported |= NETDEV_F_100MB_HD;
1785 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1786 netdev->supported |= NETDEV_F_100MB_FD;
1788 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1789 netdev->supported |= NETDEV_F_1GB_HD;
1791 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1792 netdev->supported |= NETDEV_F_1GB_FD;
1794 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1795 netdev->supported |= NETDEV_F_10GB_FD;
1797 if (ecmd.supported & SUPPORTED_TP) {
1798 netdev->supported |= NETDEV_F_COPPER;
1800 if (ecmd.supported & SUPPORTED_FIBRE) {
1801 netdev->supported |= NETDEV_F_FIBER;
1803 if (ecmd.supported & SUPPORTED_Autoneg) {
1804 netdev->supported |= NETDEV_F_AUTONEG;
1806 if (ecmd.supported & SUPPORTED_Pause) {
1807 netdev->supported |= NETDEV_F_PAUSE;
1809 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1810 netdev->supported |= NETDEV_F_PAUSE_ASYM;
1813 /* Advertised features. */
1814 netdev->advertised = 0;
1815 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1816 netdev->advertised |= NETDEV_F_10MB_HD;
1818 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1819 netdev->advertised |= NETDEV_F_10MB_FD;
1821 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1822 netdev->advertised |= NETDEV_F_100MB_HD;
1824 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1825 netdev->advertised |= NETDEV_F_100MB_FD;
1827 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1828 netdev->advertised |= NETDEV_F_1GB_HD;
1830 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1831 netdev->advertised |= NETDEV_F_1GB_FD;
1833 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1834 netdev->advertised |= NETDEV_F_10GB_FD;
1836 if (ecmd.advertising & ADVERTISED_TP) {
1837 netdev->advertised |= NETDEV_F_COPPER;
1839 if (ecmd.advertising & ADVERTISED_FIBRE) {
1840 netdev->advertised |= NETDEV_F_FIBER;
1842 if (ecmd.advertising & ADVERTISED_Autoneg) {
1843 netdev->advertised |= NETDEV_F_AUTONEG;
1845 if (ecmd.advertising & ADVERTISED_Pause) {
1846 netdev->advertised |= NETDEV_F_PAUSE;
1848 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1849 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
1852 /* Current settings. */
/* NOTE(review): 'speed' appears to be in Mbps (matching SPEED_* macros);
 * the bare 40000/100000/1000000 literals cover speeds with no SPEED_*
 * constant available at the time — confirm against linux/ethtool.h. */
1854 if (speed == SPEED_10) {
1855 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1856 } else if (speed == SPEED_100) {
1857 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1858 } else if (speed == SPEED_1000) {
1859 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1860 } else if (speed == SPEED_10000) {
1861 netdev->current = NETDEV_F_10GB_FD;
1862 } else if (speed == 40000) {
1863 netdev->current = NETDEV_F_40GB_FD;
1864 } else if (speed == 100000) {
1865 netdev->current = NETDEV_F_100GB_FD;
1866 } else if (speed == 1000000) {
1867 netdev->current = NETDEV_F_1TB_FD;
1869 netdev->current = 0;
1872 if (ecmd.port == PORT_TP) {
1873 netdev->current |= NETDEV_F_COPPER;
1874 } else if (ecmd.port == PORT_FIBRE) {
1875 netdev->current |= NETDEV_F_FIBER;
1879 netdev->current |= NETDEV_F_AUTONEG;
1883 netdev->cache_valid |= VALID_FEATURES;
1884 netdev->get_features_error = error;
1887 /* Stores the features supported by 'netdev' into of '*current', '*advertised',
1888 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
1889 * Returns 0 if successful, otherwise a positive errno value. */
1891 netdev_linux_get_features(const struct netdev *netdev_,
1892 enum netdev_features *current,
1893 enum netdev_features *advertised,
1894 enum netdev_features *supported,
1895 enum netdev_features *peer)
1897 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1900 ovs_mutex_lock(&netdev->mutex);
/* Populates the feature cache if needed; results are copied out only on
 * success. */
1901 netdev_linux_read_features(netdev);
1902 if (!netdev->get_features_error) {
1903 *current = netdev->current;
1904 *advertised = netdev->advertised;
1905 *supported = netdev->supported;
1906 *peer = 0; /* XXX */
1908 error = netdev->get_features_error;
1909 ovs_mutex_unlock(&netdev->mutex);
1914 /* Set the features advertised by 'netdev' to 'advertise'. */
1916 netdev_linux_set_advertisements(struct netdev *netdev_,
1917 enum netdev_features advertise)
1919 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1920 struct ethtool_cmd ecmd;
1923 ovs_mutex_lock(&netdev->mutex);
/* Read-modify-write: fetch current ethtool settings, replace only the
 * 'advertising' bitmap, then write them back with ETHTOOL_SSET. */
1925 COVERAGE_INC(netdev_get_ethtool);
1926 memset(&ecmd, 0, sizeof ecmd);
1927 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1928 ETHTOOL_GSET, "ETHTOOL_GSET");
1933 ecmd.advertising = 0;
1934 if (advertise & NETDEV_F_10MB_HD) {
1935 ecmd.advertising |= ADVERTISED_10baseT_Half;
1937 if (advertise & NETDEV_F_10MB_FD) {
1938 ecmd.advertising |= ADVERTISED_10baseT_Full;
1940 if (advertise & NETDEV_F_100MB_HD) {
1941 ecmd.advertising |= ADVERTISED_100baseT_Half;
1943 if (advertise & NETDEV_F_100MB_FD) {
1944 ecmd.advertising |= ADVERTISED_100baseT_Full;
1946 if (advertise & NETDEV_F_1GB_HD) {
1947 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1949 if (advertise & NETDEV_F_1GB_FD) {
1950 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1952 if (advertise & NETDEV_F_10GB_FD) {
1953 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1955 if (advertise & NETDEV_F_COPPER) {
1956 ecmd.advertising |= ADVERTISED_TP;
1958 if (advertise & NETDEV_F_FIBER) {
1959 ecmd.advertising |= ADVERTISED_FIBRE;
1961 if (advertise & NETDEV_F_AUTONEG) {
1962 ecmd.advertising |= ADVERTISED_Autoneg;
1964 if (advertise & NETDEV_F_PAUSE) {
1965 ecmd.advertising |= ADVERTISED_Pause;
1967 if (advertise & NETDEV_F_PAUSE_ASYM) {
1968 ecmd.advertising |= ADVERTISED_Asym_Pause;
1970 COVERAGE_INC(netdev_set_ethtool);
1971 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1972 ETHTOOL_SSET, "ETHTOOL_SSET");
1975 ovs_mutex_unlock(&netdev->mutex);
1979 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1980 * successful, otherwise a positive errno value. */
1982 netdev_linux_set_policing(struct netdev *netdev_,
1983 uint32_t kbits_rate, uint32_t kbits_burst)
1985 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1986 const char *netdev_name = netdev_get_name(netdev_);
/* Normalize the burst parameter before comparing with the cached values. */
1989 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1990 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1991 : kbits_burst); /* Stick with user-specified value. */
1993 ovs_mutex_lock(&netdev->mutex);
1994 if (netdev->cache_valid & VALID_POLICING) {
1995 error = netdev->netdev_policing_error;
1996 if (error || (netdev->kbits_rate == kbits_rate &&
1997 netdev->kbits_burst == kbits_burst)) {
1998 /* Assume that settings haven't changed since we last set them. */
2001 netdev->cache_valid &= ~VALID_POLICING;
2004 COVERAGE_INC(netdev_set_policing);
2005 /* Remove any existing ingress qdisc. */
2006 error = tc_add_del_ingress_qdisc(netdev_, false);
2008 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
2009 netdev_name, ovs_strerror(error));
/* With a nonzero rate (gating code not visible here), install a fresh
 * ingress qdisc and attach the policer action to it. */
2014 error = tc_add_del_ingress_qdisc(netdev_, true);
2016 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
2017 netdev_name, ovs_strerror(error));
2021 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
2023 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
2024 netdev_name, ovs_strerror(error));
2029 netdev->kbits_rate = kbits_rate;
2030 netdev->kbits_burst = kbits_burst;
2033 if (!error || error == ENODEV) {
2034 netdev->netdev_policing_error = error;
2035 netdev->cache_valid |= VALID_POLICING;
2037 ovs_mutex_unlock(&netdev->mutex);
/* Adds to 'types' the OVS name of every installable QoS discipline. */
2042 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
2045 const struct tc_ops *const *opsp;
2047 for (opsp = tcs; *opsp != NULL; opsp++) {
2048 const struct tc_ops *ops = *opsp;
/* Skip internal/placeholder entries: ones without tc_install or with an
 * empty OVS name. */
2049 if (ops->tc_install && ops->ovs_name[0] != '\0') {
2050 sset_add(types, ops->ovs_name);
/* Returns the tc_ops whose OVS-facing name is 'name'.
 * (The not-found return path is not visible in this listing.) */
2056 static const struct tc_ops *
2057 tc_lookup_ovs_name(const char *name)
2059 const struct tc_ops *const *opsp;
2061 for (opsp = tcs; *opsp != NULL; opsp++) {
2062 const struct tc_ops *ops = *opsp;
2063 if (!strcmp(name, ops->ovs_name)) {
/* Returns the tc_ops whose Linux qdisc name is 'name'; entries with a NULL
 * linux_name are skipped. (Not-found path not visible in this listing.) */
2070 static const struct tc_ops *
2071 tc_lookup_linux_name(const char *name)
2073 const struct tc_ops *const *opsp;
2075 for (opsp = tcs; *opsp != NULL; opsp++) {
2076 const struct tc_ops *ops = *opsp;
2077 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Looks up queue 'queue_id' in 'netdev_''s installed-qdisc queue hash,
 * where 'hash' is the caller-computed bucket hash for 'queue_id'. */
2084 static struct tc_queue *
2085 tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
2088 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2089 struct tc_queue *queue;
2091 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
2092 if (queue->queue_id == queue_id) {
/* Convenience wrapper that hashes 'queue_id' for tc_find_queue__(). */
2099 static struct tc_queue *
2100 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
2102 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Reports how many queues QoS type 'type' supports in '*caps'.
 * (Unknown-type error path is not visible in this listing.) */
2106 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
2108 struct netdev_qos_capabilities *caps)
2110 const struct tc_ops *ops = tc_lookup_ovs_name(type);
2114 caps->n_queues = ops->n_queues;
/* Reports the installed QoS type in '*typep' and, if the discipline
 * supports it, its configuration in 'details'. */
2119 netdev_linux_get_qos(const struct netdev *netdev_,
2120 const char **typep, struct smap *details)
2122 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2125 ovs_mutex_lock(&netdev->mutex);
2126 error = tc_query_qdisc(netdev_);
2128 *typep = netdev->tc->ops->ovs_name;
/* qdisc_get is optional; disciplines without per-qdisc config return 0. */
2129 error = (netdev->tc->ops->qdisc_get
2130 ? netdev->tc->ops->qdisc_get(netdev_, details)
2133 ovs_mutex_unlock(&netdev->mutex);
/* Replaces 'netdev_''s QoS configuration with discipline 'type' configured
 * from 'details'. If 'type' matches the installed discipline, only its
 * settings are updated; otherwise the old qdisc is deleted first. */
2139 netdev_linux_set_qos(struct netdev *netdev_,
2140 const char *type, const struct smap *details)
2142 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2143 const struct tc_ops *new_ops;
2146 new_ops = tc_lookup_ovs_name(type);
2147 if (!new_ops || !new_ops->tc_install) {
2151 ovs_mutex_lock(&netdev->mutex);
2152 error = tc_query_qdisc(netdev_);
2157 if (new_ops == netdev->tc->ops) {
2158 error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
2160 /* Delete existing qdisc. */
2161 error = tc_del_qdisc(netdev_);
2165 ovs_assert(netdev->tc == NULL);
2167 /* Install new qdisc. */
2168 error = new_ops->tc_install(netdev_, details);
/* Invariant: tc_install either fully succeeds (tc set) or fails (tc NULL). */
2169 ovs_assert((error == 0) == (netdev->tc != NULL));
2173 ovs_mutex_unlock(&netdev->mutex);
/* Fills 'details' with the configuration of queue 'queue_id', delegating to
 * the installed discipline's class_get hook. */
2178 netdev_linux_get_queue(const struct netdev *netdev_,
2179 unsigned int queue_id, struct smap *details)
2181 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2184 ovs_mutex_lock(&netdev->mutex);
2185 error = tc_query_qdisc(netdev_);
2187 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2189 ? netdev->tc->ops->class_get(netdev_, queue, details)
2192 ovs_mutex_unlock(&netdev->mutex);
/* Configures queue 'queue_id' from 'details' via the installed discipline's
 * class_set hook, after validating the id against the queue limit. */
2198 netdev_linux_set_queue(struct netdev *netdev_,
2199 unsigned int queue_id, const struct smap *details)
2201 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2204 ovs_mutex_lock(&netdev->mutex);
2205 error = tc_query_qdisc(netdev_);
2207 error = (queue_id < netdev->tc->ops->n_queues
2208 && netdev->tc->ops->class_set
2209 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
2212 ovs_mutex_unlock(&netdev->mutex);
/* Deletes queue 'queue_id' via the discipline's class_delete hook, if the
 * discipline supports deletion and the queue exists. */
2218 netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
2220 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2223 ovs_mutex_lock(&netdev->mutex);
2224 error = tc_query_qdisc(netdev_);
2226 if (netdev->tc->ops->class_delete) {
2227 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2229 ? netdev->tc->ops->class_delete(netdev_, queue)
2235 ovs_mutex_unlock(&netdev->mutex);
/* Retrieves per-queue statistics for 'queue_id' into '*stats' via the
 * discipline's class_get_stats hook; also reports the queue creation time. */
2241 netdev_linux_get_queue_stats(const struct netdev *netdev_,
2242 unsigned int queue_id,
2243 struct netdev_queue_stats *stats)
2245 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2248 ovs_mutex_lock(&netdev->mutex);
2249 error = tc_query_qdisc(netdev_);
2251 if (netdev->tc->ops->class_get_stats) {
2252 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2254 stats->created = queue->created;
2255 error = netdev->tc->ops->class_get_stats(netdev_, queue,
2264 ovs_mutex_unlock(&netdev->mutex);
/* State for iterating a netlink RTM_GETTCLASS dump; see start_queue_dump().
 * (Remaining members, e.g. the reply buffer, are not visible here.) */
2269 struct queue_dump_state {
2270 struct nl_dump dump;
/* Begins a netlink dump of 'netdev''s traffic classes into 'state'.
 * (Returns a success indicator; the failure path after tc_make_request is
 * not visible in this listing.) */
2275 start_queue_dump(const struct netdev *netdev, struct queue_dump_state *state)
2277 struct ofpbuf request;
2278 struct tcmsg *tcmsg;
2280 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
2284 tcmsg->tcm_parent = 0;
2285 nl_dump_start(&state->dump, NETLINK_ROUTE, &request);
2286 ofpbuf_uninit(&request);
2288 ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
/* Releases 'state''s buffer and completes the netlink dump, returning the
 * dump's final status. */
2293 finish_queue_dump(struct queue_dump_state *state)
2295 ofpbuf_uninit(&state->buf);
2296 return nl_dump_done(&state->dump);
/* Snapshot of queue ids taken for queue-dump iteration; see
 * netdev_linux_queue_dump_start(). (Count/cursor members not visible.) */
2299 struct netdev_linux_queue_state {
2300 unsigned int *queues;
/* Starts a queue dump by snapshotting the ids of all queues under the
 * mutex, so iteration does not race with concurrent queue changes. */
2306 netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
2308 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2311 ovs_mutex_lock(&netdev->mutex);
2312 error = tc_query_qdisc(netdev_);
2314 if (netdev->tc->ops->class_get) {
2315 struct netdev_linux_queue_state *state;
2316 struct tc_queue *queue;
2319 *statep = state = xmalloc(sizeof *state);
2320 state->n_queues = hmap_count(&netdev->tc->queues);
2321 state->cur_queue = 0;
2322 state->queues = xmalloc(state->n_queues * sizeof *state->queues);
2325 HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
2326 state->queues[i++] = queue->queue_id;
2332 ovs_mutex_unlock(&netdev->mutex);
/* Advances the dump: returns details for the next snapshotted queue id that
 * still exists, skipping ids whose queues were deleted since the snapshot. */
2338 netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
2339 unsigned int *queue_idp, struct smap *details)
2341 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2342 struct netdev_linux_queue_state *state = state_;
2345 ovs_mutex_lock(&netdev->mutex);
2346 while (state->cur_queue < state->n_queues) {
2347 unsigned int queue_id = state->queues[state->cur_queue++];
2348 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2351 *queue_idp = queue_id;
2352 error = netdev->tc->ops->class_get(netdev_, queue, details);
2356 ovs_mutex_unlock(&netdev->mutex);
/* Frees the dump state allocated by netdev_linux_queue_dump_start(). */
2362 netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
2365 struct netdev_linux_queue_state *state = state_;
2367 free(state->queues);
/* Streams per-queue statistics to callback 'cb' by walking a netlink
 * RTM_GETTCLASS dump and handing each message to class_dump_stats. */
2373 netdev_linux_dump_queue_stats(const struct netdev *netdev_,
2374 netdev_dump_queue_stats_cb *cb, void *aux)
2376 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2379 ovs_mutex_lock(&netdev->mutex);
2380 error = tc_query_qdisc(netdev_);
2382 struct queue_dump_state state;
2384 if (!netdev->tc->ops->class_dump_stats) {
2386 } else if (!start_queue_dump(netdev_, &state)) {
2392 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
2393 retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
2400 retval = finish_queue_dump(&state);
2406 ovs_mutex_unlock(&netdev->mutex);
/* Retrieves the device's IPv4 address and netmask (SIOCGIFADDR /
 * SIOCGIFNETMASK), caching under VALID_IN4. Returns EADDRNOTAVAIL when no
 * address is assigned. */
2412 netdev_linux_get_in4(const struct netdev *netdev_,
2413 struct in_addr *address, struct in_addr *netmask)
2415 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2418 ovs_mutex_lock(&netdev->mutex);
2419 if (!(netdev->cache_valid & VALID_IN4)) {
2420 error = netdev_linux_get_ipv4(netdev_, &netdev->address,
2421 SIOCGIFADDR, "SIOCGIFADDR");
2423 error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
2424 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2426 netdev->cache_valid |= VALID_IN4;
2434 if (netdev->address.s_addr != INADDR_ANY) {
2435 *address = netdev->address;
2436 *netmask = netdev->netmask;
2438 error = EADDRNOTAVAIL;
2441 ovs_mutex_unlock(&netdev->mutex);
/* Assigns IPv4 'address'/'netmask' to the device (SIOCSIFADDR, then
 * SIOCSIFNETMASK when the address is non-zero), updating the IN4 cache. */
2447 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2448 struct in_addr netmask)
2450 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2453 ovs_mutex_lock(&netdev->mutex);
2454 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2456 netdev->cache_valid |= VALID_IN4;
2457 netdev->address = address;
2458 netdev->netmask = netmask;
2459 if (address.s_addr != INADDR_ANY) {
2460 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2461 "SIOCSIFNETMASK", netmask);
2464 ovs_mutex_unlock(&netdev->mutex);
/* Parses one /proc/net/if_inet6 line: 32 hex digits of address (read as 16
 * bytes), four fields that are skipped, then the interface name. Returns
 * the ovs_scan() success result. */
2470 parse_if_inet6_line(const char *line,
2471 struct in6_addr *in6, char ifname[16 + 1])
2473 uint8_t *s6 = in6->s6_addr;
2474 #define X8 "%2"SCNx8
2475 return ovs_scan(line,
2476 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
2477 "%*x %*x %*x %*x %16s\n",
2478 &s6[0], &s6[1], &s6[2], &s6[3],
2479 &s6[4], &s6[5], &s6[6], &s6[7],
2480 &s6[8], &s6[9], &s6[10], &s6[11],
2481 &s6[12], &s6[13], &s6[14], &s6[15],
2485 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address.
2486 * Otherwise, sets '*in6' to 'in6addr_any' and returns the corresponding
2489 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2491 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2494 ovs_mutex_lock(&netdev->mutex);
2495 if (!(netdev->cache_valid & VALID_IN6)) {
/* Default: no address; scan /proc/net/if_inet6 for an entry matching this
 * interface name. EOPNOTSUPP is cached if the file cannot be opened. */
2499 netdev->in6 = in6addr_any;
2500 netdev->in6_error = EADDRNOTAVAIL;
2502 file = fopen("/proc/net/if_inet6", "r");
2504 const char *name = netdev_get_name(netdev_);
2505 while (fgets(line, sizeof line, file)) {
2506 struct in6_addr in6_tmp;
2507 char ifname[16 + 1];
2508 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2509 && !strcmp(name, ifname))
2511 netdev->in6 = in6_tmp;
2512 netdev->in6_error = 0;
2518 netdev->in6_error = EOPNOTSUPP;
2520 netdev->cache_valid |= VALID_IN6;
2523 error = netdev->in6_error;
2524 ovs_mutex_unlock(&netdev->mutex);
/* Builds an AF_INET sockaddr_in holding 'addr' and copies it into the generic
 * 'sa', zero-filling the remainder so no stale bytes leak into the ioctl. */
2530 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
2532 struct sockaddr_in sin;
2533 memset(&sin, 0, sizeof sin);
2534 sin.sin_family = AF_INET;
2535 sin.sin_addr = addr;
2538 memset(sa, 0, sizeof *sa);
2539 memcpy(sa, &sin, sizeof sin);
/* Helper for the address-setting ioctls: fills an ifreq's address field with
 * 'addr' and issues 'ioctl_nr' on 'netdev' through the shared AF_INET
 * socket.  'ioctl_name' is used only for logging. */
2543 do_set_addr(struct netdev *netdev,
2544 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2548 make_in4_sockaddr(&ifr.ifr_addr, addr);
2549 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2553 /* Adds 'router' as a default IP gateway. */
2555 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2557 struct in_addr any = { INADDR_ANY }; /* 0.0.0.0 for dst and mask. */
2561 memset(&rt, 0, sizeof rt);
2562 make_in4_sockaddr(&rt.rt_dst, any);
2563 make_in4_sockaddr(&rt.rt_gateway, router);
2564 make_in4_sockaddr(&rt.rt_genmask, any);
2565 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2566 error = af_inet_ioctl(SIOCADDRT, &rt);
2568 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
/* Looks up the route to 'host' by parsing /proc/net/route.  On a match, sets
 * '*next_hop' (0.0.0.0 if directly reachable, otherwise the gateway) and
 * '*netdev_name' (caller frees).  Routes whose RTF_UP flag is clear are
 * skipped. */
2574 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2577 static const char fn[] = "/proc/net/route";
2582 *netdev_name = NULL;
2583 stream = fopen(fn, "r");
2584 if (stream == NULL) {
2585 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
2590 while (fgets(line, sizeof line, stream)) {
2593 ovs_be32 dest, gateway, mask;
2594 int refcnt, metric, mtu;
2595 unsigned int flags, use, window, irtt;
2598 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2600 iface, &dest, &gateway, &flags, &refcnt,
2601 &use, &metric, &mask, &mtu, &window, &irtt)) {
2602 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2606 if (!(flags & RTF_UP)) {
2607 /* Skip routes that aren't up. */
2611 /* The output of 'dest', 'mask', and 'gateway' were given in
2612 * network byte order, so we don't need need any endian
2613 * conversions here. */
2614 if ((dest & mask) == (host->s_addr & mask)) {
2616 /* The host is directly reachable. */
2617 next_hop->s_addr = 0;
2619 /* To reach the host, we must go through a gateway. */
2620 next_hop->s_addr = gateway;
2622 *netdev_name = xstrdup(iface);
/* Fills 'smap' with driver name/version and firmware version obtained from
 * ETHTOOL_GDRVINFO, caching the result under VALID_DRVINFO so the ioctl runs
 * at most once per device. */
2634 netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
2636 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2639 ovs_mutex_lock(&netdev->mutex);
2640 if (!(netdev->cache_valid & VALID_DRVINFO)) {
2641 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
2643 COVERAGE_INC(netdev_get_ethtool);
2644 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2645 error = netdev_linux_do_ethtool(netdev->up.name,
2648 "ETHTOOL_GDRVINFO");
2650 netdev->cache_valid |= VALID_DRVINFO;
2655 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2656 smap_add(smap, "driver_version", netdev->drvinfo.version);
2657 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
2659 ovs_mutex_unlock(&netdev->mutex);
/* Status for internal devices: only a fixed driver name, no ethtool query. */
2665 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2668 smap_add(smap, "driver_name", "openvswitch")
2672 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2673 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2674 * returns 0. Otherwise, it returns a positive errno value; in particular,
2675 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2677 netdev_linux_arp_lookup(const struct netdev *netdev,
2678 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2681 struct sockaddr_in sin;
2684 memset(&r, 0, sizeof r);
2685 memset(&sin, 0, sizeof sin);
2686 sin.sin_family = AF_INET;
2687 sin.sin_addr.s_addr = ip;
2689 memcpy(&r.arp_pa, &sin, sizeof sin);
2690 r.arp_ha.sa_family = ARPHRD_ETHER;
2692 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev)
2693 COVERAGE_INC(netdev_arp_lookup);
2694 retval = af_inet_ioctl(SIOCGARP, &r);
2696 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2697 } else if (retval != ENXIO) {
2698 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2699 netdev_get_name(netdev), IP_ARGS(ip),
2700 ovs_strerror(retval));
/* Translates OVS netdev_flags bits into the kernel's IFF_* bits. */
2706 nd_to_iff_flags(enum netdev_flags nd)
2709 if (nd & NETDEV_UP) {
2712 if (nd & NETDEV_PROMISC) {
2715 if (nd & NETDEV_LOOPBACK) {
2716 iff |= IFF_LOOPBACK;
/* Inverse of nd_to_iff_flags(): maps kernel IFF_* bits to netdev_flags. */
2722 iff_to_nd_flags(int iff)
2724 enum netdev_flags nd = 0;
2728 if (iff & IFF_PROMISC) {
2729 nd |= NETDEV_PROMISC;
2731 if (iff & IFF_LOOPBACK) {
2732 nd |= NETDEV_LOOPBACK;
/* Read-modify-write of the device's IFF flags with 'netdev->mutex' held:
 * reports the old flags through '*old_flagsp', clears 'off', sets 'on', and
 * re-reads the kernel state into the cached ifi_flags after the change. */
2738 update_flags(struct netdev_linux *netdev, enum netdev_flags off,
2739 enum netdev_flags on, enum netdev_flags *old_flagsp)
2740 OVS_REQUIRES(netdev->mutex)
2742 int old_flags, new_flags;
2745 old_flags = netdev->ifi_flags;
2746 *old_flagsp = iff_to_nd_flags(old_flags);
2747 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2748 if (new_flags != old_flags) {
2749 error = set_flags(netdev_get_name(&netdev->up), new_flags);
2750 get_flags(&netdev->up, &netdev->ifi_flags);
/* netdev_class entry point: locked wrapper around update_flags(). */
2757 netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2758 enum netdev_flags on, enum netdev_flags *old_flagsp)
2760 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2763 ovs_mutex_lock(&netdev->mutex);
2764 error = update_flags(netdev, off, on, old_flagsp);
2765 ovs_mutex_unlock(&netdev->mutex);
/* Template that expands to a netdev_class initializer; the four parameters
 * are the only members that differ among the Linux system, tap, and internal
 * device classes instantiated below. */
2770 #define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS, \
2771 GET_FEATURES, GET_STATUS) \
2777 netdev_linux_wait, \
2779 netdev_linux_alloc, \
2781 netdev_linux_destruct, \
2782 netdev_linux_dealloc, \
2783 NULL, /* get_config */ \
2784 NULL, /* set_config */ \
2785 NULL, /* get_tunnel_config */ \
2786 NULL, /* build header */ \
2787 NULL, /* push header */ \
2788 NULL, /* pop header */ \
2789 NULL, /* get_numa_id */ \
2790 NULL, /* set_multiq */ \
2792 netdev_linux_send, \
2793 netdev_linux_send_wait, \
2795 netdev_linux_set_etheraddr, \
2796 netdev_linux_get_etheraddr, \
2797 netdev_linux_get_mtu, \
2798 netdev_linux_set_mtu, \
2799 netdev_linux_get_ifindex, \
2800 netdev_linux_get_carrier, \
2801 netdev_linux_get_carrier_resets, \
2802 netdev_linux_set_miimon_interval, \
2806 netdev_linux_set_advertisements, \
2808 netdev_linux_set_policing, \
2809 netdev_linux_get_qos_types, \
2810 netdev_linux_get_qos_capabilities, \
2811 netdev_linux_get_qos, \
2812 netdev_linux_set_qos, \
2813 netdev_linux_get_queue, \
2814 netdev_linux_set_queue, \
2815 netdev_linux_delete_queue, \
2816 netdev_linux_get_queue_stats, \
2817 netdev_linux_queue_dump_start, \
2818 netdev_linux_queue_dump_next, \
2819 netdev_linux_queue_dump_done, \
2820 netdev_linux_dump_queue_stats, \
2822 netdev_linux_get_in4, \
2823 netdev_linux_set_in4, \
2824 netdev_linux_get_in6, \
2825 netdev_linux_add_router, \
2826 netdev_linux_get_next_hop, \
2828 netdev_linux_arp_lookup, \
2830 netdev_linux_update_flags, \
2832 netdev_linux_rxq_alloc, \
2833 netdev_linux_rxq_construct, \
2834 netdev_linux_rxq_destruct, \
2835 netdev_linux_rxq_dealloc, \
2836 netdev_linux_rxq_recv, \
2837 netdev_linux_rxq_wait, \
2838 netdev_linux_rxq_drain, \
/* Class for ordinary Linux network devices ("system"). */
2841 const struct netdev_class netdev_linux_class =
2844 netdev_linux_construct,
2845 netdev_linux_get_stats,
2846 netdev_linux_get_features,
2847 netdev_linux_get_status);
/* Class for tap devices: same template, but a tap-specific constructor and
 * stats implementation. */
2849 const struct netdev_class netdev_tap_class =
2852 netdev_linux_construct_tap,
2853 netdev_tap_get_stats,
2854 netdev_linux_get_features,
2855 netdev_linux_get_status);
/* Class for OVS internal ports: no feature reporting, internal status. */
2857 const struct netdev_class netdev_internal_class =
2860 netdev_linux_construct,
2861 netdev_internal_get_stats,
2862 NULL, /* get_features */
2863 netdev_internal_get_status);
2866 #define CODEL_N_QUEUES 0x0000
2868 /* In sufficiently new kernel headers these are defined as enums in
2869 * <linux/pkt_sched.h>. Define them here as macros to help out with older
2870 * kernels. (This overrides any enum definition in the header file but that's
2872 #define TCA_CODEL_TARGET 1
2873 #define TCA_CODEL_LIMIT 2
2874 #define TCA_CODEL_INTERVAL 3
/* Downcasts 'netdev_''s active tc state to the CoDel-specific struct. */
2883 static struct codel *
2884 codel_get__(const struct netdev *netdev_)
2886 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2887 return CONTAINER_OF(netdev->tc, struct codel, tc);
/* Allocates in-memory CoDel state with the given parameters and installs it
 * as 'netdev_''s current tc.  Does not touch the kernel qdisc. */
2891 codel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
2894 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2895 struct codel *codel;
2897 codel = xmalloc(sizeof *codel);
2898 tc_init(&codel->tc, &tc_ops_codel);
2899 codel->target = target;
2900 codel->limit = limit;
2901 codel->interval = interval;
2903 netdev->tc = &codel->tc;
/* Replaces 'netdev''s root qdisc with a "codel" qdisc via RTM_NEWQDISC.
 * Zero-valued parameters fall back to defaults: target 5000, limit 10240,
 * interval 100000 (these match codel_parse_qdisc_details__()). */
2907 codel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
2911 struct ofpbuf request;
2912 struct tcmsg *tcmsg;
2913 uint32_t otarget, olimit, ointerval;
2916 tc_del_qdisc(netdev);
2918 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2919 NLM_F_EXCL | NLM_F_CREATE, &request);
2923 tcmsg->tcm_handle = tc_make_handle(1, 0);
2924 tcmsg->tcm_parent = TC_H_ROOT;
2926 otarget = target ? target : 5000;
2927 olimit = limit ? limit : 10240;
2928 ointerval = interval ? interval : 100000;
2930 nl_msg_put_string(&request, TCA_KIND, "codel");
2931 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2932 nl_msg_put_u32(&request, TCA_CODEL_TARGET, otarget);
2933 nl_msg_put_u32(&request, TCA_CODEL_LIMIT, olimit);
2934 nl_msg_put_u32(&request, TCA_CODEL_INTERVAL, ointerval);
2935 nl_msg_end_nested(&request, opt_offset);
2937 error = tc_transact(&request, NULL);
2939 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
2940 "target %u, limit %u, interval %u error %d(%s)",
2941 netdev_get_name(netdev),
2942 otarget, olimit, ointerval,
2943 error, ovs_strerror(error));
/* Extracts "target", "limit", and "interval" from 'details' into 'codel',
 * substituting the CoDel defaults (5000 / 10240 / 100000) for missing or
 * zero values. */
2949 codel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
2950 const struct smap *details, struct codel *codel)
2952 const char *target_s;
2953 const char *limit_s;
2954 const char *interval_s;
2956 target_s = smap_get(details, "target");
2957 limit_s = smap_get(details, "limit");
2958 interval_s = smap_get(details, "interval");
2960 codel->target = target_s ? strtoull(target_s, NULL, 10) : 0;
2961 codel->limit = limit_s ? strtoull(limit_s, NULL, 10) : 0;
2962 codel->interval = interval_s ? strtoull(interval_s, NULL, 10) : 0;
2964 if (!codel->target) {
2965 codel->target = 5000;
2967 if (!codel->limit) {
2968 codel->limit = 10240;
2970 if (!codel->interval) {
2971 codel->interval = 100000;
/* tc_ops 'tc_install' hook: parses 'details', programs the kernel qdisc, and
 * on success records matching in-memory state. */
2976 codel_tc_install(struct netdev *netdev, const struct smap *details)
2981 codel_parse_qdisc_details__(netdev, details, &codel);
2982 error = codel_setup_qdisc__(netdev, codel.target, codel.limit,
2985 codel_install__(netdev, codel.target, codel.limit, codel.interval);
/* Decodes the nested TCA_OPTIONS attribute of a kernel "codel" qdisc reply
 * into 'codel'.  All three attributes are required by the policy. */
2991 codel_parse_tca_options__(struct nlattr *nl_options, struct codel *codel)
2993 static const struct nl_policy tca_codel_policy[] = {
2994 [TCA_CODEL_TARGET] = { .type = NL_A_U32 },
2995 [TCA_CODEL_LIMIT] = { .type = NL_A_U32 },
2996 [TCA_CODEL_INTERVAL] = { .type = NL_A_U32 }
2999 struct nlattr *attrs[ARRAY_SIZE(tca_codel_policy)];
3001 if (!nl_parse_nested(nl_options, tca_codel_policy,
3002 attrs, ARRAY_SIZE(tca_codel_policy))) {
3003 VLOG_WARN_RL(&rl, "failed to parse CoDel class options");
3007 codel->target = nl_attr_get_u32(attrs[TCA_CODEL_TARGET]);
3008 codel->limit = nl_attr_get_u32(attrs[TCA_CODEL_LIMIT]);
3009 codel->interval = nl_attr_get_u32(attrs[TCA_CODEL_INTERVAL]);
/* tc_ops 'tc_load' hook: reconstructs in-memory CoDel state from the qdisc
 * description the kernel returned in 'nlmsg'. */
3014 codel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3016 struct nlattr *nlattr;
3021 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3026 error = codel_parse_tca_options__(nlattr, &codel);
3031 codel_install__(netdev, codel.target, codel.limit, codel.interval);
/* tc_ops 'tc_destroy' hook: frees the CoDel state embedding 'tc'. */
3037 codel_tc_destroy(struct tc *tc)
3039 struct codel *codel = CONTAINER_OF(tc, struct codel, tc);
/* tc_ops 'qdisc_get' hook: reports the cached CoDel parameters. */
3045 codel_qdisc_get(const struct netdev *netdev, struct smap *details)
3047 const struct codel *codel = codel_get__(netdev);
3048 smap_add_format(details, "target", "%u", codel->target);
3049 smap_add_format(details, "limit", "%u", codel->limit);
3050 smap_add_format(details, "interval", "%u", codel->interval);
/* tc_ops 'qdisc_set' hook: re-parses 'details' and refreshes both the
 * installed tc object and the existing cached parameters. */
3055 codel_qdisc_set(struct netdev *netdev, const struct smap *details)
3059 codel_parse_qdisc_details__(netdev, details, &codel);
3060 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3061 codel_get__(netdev)->target = codel.target;
3062 codel_get__(netdev)->limit = codel.limit;
3063 codel_get__(netdev)->interval = codel.interval;
/* tc_ops vtable binding the CoDel hooks above to the "linux-codel" QoS type. */
3067 static const struct tc_ops tc_ops_codel = {
3068 "codel", /* linux_name */
3069 "linux-codel", /* ovs_name */
3070 CODEL_N_QUEUES, /* n_queues */
3083 /* FQ-CoDel traffic control class. */
3085 #define FQCODEL_N_QUEUES 0x0000
3087 /* In sufficiently new kernel headers these are defined as enums in
3088 * <linux/pkt_sched.h>. Define them here as macros to help out with older
3089 * kernels. (This overrides any enum definition in the header file but that's
3091 #define TCA_FQ_CODEL_TARGET 1
3092 #define TCA_FQ_CODEL_LIMIT 2
3093 #define TCA_FQ_CODEL_INTERVAL 3
3094 #define TCA_FQ_CODEL_ECN 4
3095 #define TCA_FQ_CODEL_FLOWS 5
3096 #define TCA_FQ_CODEL_QUANTUM 6
/* Downcasts 'netdev_''s active tc state to the FQ-CoDel-specific struct. */
3107 static struct fqcodel *
3108 fqcodel_get__(const struct netdev *netdev_)
3110 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3111 return CONTAINER_OF(netdev->tc, struct fqcodel, tc);
/* Allocates in-memory FQ-CoDel state with the given parameters and installs
 * it as 'netdev_''s current tc.  Does not touch the kernel qdisc. */
3115 fqcodel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
3116 uint32_t interval, uint32_t flows, uint32_t quantum)
3118 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3119 struct fqcodel *fqcodel;
3121 fqcodel = xmalloc(sizeof *fqcodel);
3122 tc_init(&fqcodel->tc, &tc_ops_fqcodel);
3123 fqcodel->target = target;
3124 fqcodel->limit = limit;
3125 fqcodel->interval = interval;
3126 fqcodel->flows = flows;
3127 fqcodel->quantum = quantum;
3129 netdev->tc = &fqcodel->tc;
/* Replaces 'netdev''s root qdisc with an "fq_codel" qdisc via RTM_NEWQDISC.
 * Zero-valued parameters fall back to defaults: target 5000, limit 10240,
 * interval 100000, flows 1024, quantum 1514. */
3133 fqcodel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
3134 uint32_t interval, uint32_t flows, uint32_t quantum)
3137 struct ofpbuf request;
3138 struct tcmsg *tcmsg;
3139 uint32_t otarget, olimit, ointerval, oflows, oquantum;
3142 tc_del_qdisc(netdev);
3144 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3145 NLM_F_EXCL | NLM_F_CREATE, &request);
3149 tcmsg->tcm_handle = tc_make_handle(1, 0);
3150 tcmsg->tcm_parent = TC_H_ROOT;
3152 otarget = target ? target : 5000;
3153 olimit = limit ? limit : 10240;
3154 ointerval = interval ? interval : 100000;
3155 oflows = flows ? flows : 1024;
3156 oquantum = quantum ? quantum : 1514; /* fq_codel default quantum is 1514
3159 nl_msg_put_string(&request, TCA_KIND, "fq_codel");
3160 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3161 nl_msg_put_u32(&request, TCA_FQ_CODEL_TARGET, otarget);
3162 nl_msg_put_u32(&request, TCA_FQ_CODEL_LIMIT, olimit);
3163 nl_msg_put_u32(&request, TCA_FQ_CODEL_INTERVAL, ointerval);
3164 nl_msg_put_u32(&request, TCA_FQ_CODEL_FLOWS, oflows);
3165 nl_msg_put_u32(&request, TCA_FQ_CODEL_QUANTUM, oquantum);
3166 nl_msg_end_nested(&request, opt_offset);
3168 error = tc_transact(&request, NULL);
3170 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3171 "target %u, limit %u, interval %u, flows %u, quantum %u error %d(%s)",
3172 netdev_get_name(netdev),
3173 otarget, olimit, ointerval, oflows, oquantum,
3174 error, ovs_strerror(error));
/* Extracts the FQ-CoDel knobs from 'details' into 'fqcodel', substituting
 * defaults for missing or zero values.
 *
 * NOTE(review): the interval default here is 1000000, but
 * fqcodel_setup_qdisc__() falls back to 100000 when passed interval == 0.
 * The two default paths therefore disagree by a factor of ten -- confirm
 * which value is intended. */
3180 fqcodel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
3181 const struct smap *details, struct fqcodel *fqcodel)
3183 const char *target_s;
3184 const char *limit_s;
3185 const char *interval_s;
3186 const char *flows_s;
3187 const char *quantum_s;
3189 target_s = smap_get(details, "target");
3190 limit_s = smap_get(details, "limit");
3191 interval_s = smap_get(details, "interval");
3192 flows_s = smap_get(details, "flows");
3193 quantum_s = smap_get(details, "quantum");
3194 fqcodel->target = target_s ? strtoull(target_s, NULL, 10) : 0;
3195 fqcodel->limit = limit_s ? strtoull(limit_s, NULL, 10) : 0;
3196 fqcodel->interval = interval_s ? strtoull(interval_s, NULL, 10) : 0;
3197 fqcodel->flows = flows_s ? strtoull(flows_s, NULL, 10) : 0;
3198 fqcodel->quantum = quantum_s ? strtoull(quantum_s, NULL, 10) : 0;
3199 if (!fqcodel->target) {
3200 fqcodel->target = 5000;
3202 if (!fqcodel->limit) {
3203 fqcodel->limit = 10240;
3205 if (!fqcodel->interval) {
3206 fqcodel->interval = 1000000;
3208 if (!fqcodel->flows) {
3209 fqcodel->flows = 1024;
3211 if (!fqcodel->quantum) {
3212 fqcodel->quantum = 1514;
/* tc_ops 'tc_install' hook: parses 'details', programs the kernel qdisc, and
 * on success records matching in-memory state. */
3217 fqcodel_tc_install(struct netdev *netdev, const struct smap *details)
3220 struct fqcodel fqcodel;
3222 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3223 error = fqcodel_setup_qdisc__(netdev, fqcodel.target, fqcodel.limit,
3224 fqcodel.interval, fqcodel.flows,
3227 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit,
3228 fqcodel.interval, fqcodel.flows, fqcodel.quantum);
/* Decodes the nested TCA_OPTIONS of a kernel "fq_codel" qdisc reply into
 * 'fqcodel'.  All five attributes are required by the policy. */
3234 fqcodel_parse_tca_options__(struct nlattr *nl_options, struct fqcodel *fqcodel)
3236 static const struct nl_policy tca_fqcodel_policy[] = {
3237 [TCA_FQ_CODEL_TARGET] = { .type = NL_A_U32 },
3238 [TCA_FQ_CODEL_LIMIT] = { .type = NL_A_U32 },
3239 [TCA_FQ_CODEL_INTERVAL] = { .type = NL_A_U32 },
3240 [TCA_FQ_CODEL_FLOWS] = { .type = NL_A_U32 },
3241 [TCA_FQ_CODEL_QUANTUM] = { .type = NL_A_U32 }
3244 struct nlattr *attrs[ARRAY_SIZE(tca_fqcodel_policy)];
3246 if (!nl_parse_nested(nl_options, tca_fqcodel_policy,
3247 attrs, ARRAY_SIZE(tca_fqcodel_policy))) {
3248 VLOG_WARN_RL(&rl, "failed to parse FQ_CoDel class options");
3252 fqcodel->target = nl_attr_get_u32(attrs[TCA_FQ_CODEL_TARGET]);
3253 fqcodel->limit = nl_attr_get_u32(attrs[TCA_FQ_CODEL_LIMIT]);
3254 fqcodel->interval =nl_attr_get_u32(attrs[TCA_FQ_CODEL_INTERVAL]);
3255 fqcodel->flows = nl_attr_get_u32(attrs[TCA_FQ_CODEL_FLOWS]);
3256 fqcodel->quantum = nl_attr_get_u32(attrs[TCA_FQ_CODEL_QUANTUM]);
/* tc_ops 'tc_load' hook: reconstructs in-memory FQ-CoDel state from the
 * qdisc description the kernel returned in 'nlmsg'. */
3261 fqcodel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3263 struct nlattr *nlattr;
3266 struct fqcodel fqcodel;
3268 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3273 error = fqcodel_parse_tca_options__(nlattr, &fqcodel);
3278 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3279 fqcodel.flows, fqcodel.quantum);
/* tc_ops 'tc_destroy' hook: frees the FQ-CoDel state embedding 'tc'. */
3284 fqcodel_tc_destroy(struct tc *tc)
3286 struct fqcodel *fqcodel = CONTAINER_OF(tc, struct fqcodel, tc);
/* tc_ops 'qdisc_get' hook: reports the cached FQ-CoDel parameters. */
3292 fqcodel_qdisc_get(const struct netdev *netdev, struct smap *details)
3294 const struct fqcodel *fqcodel = fqcodel_get__(netdev);
3295 smap_add_format(details, "target", "%u", fqcodel->target);
3296 smap_add_format(details, "limit", "%u", fqcodel->limit);
3297 smap_add_format(details, "interval", "%u", fqcodel->interval);
3298 smap_add_format(details, "flows", "%u", fqcodel->flows);
3299 smap_add_format(details, "quantum", "%u", fqcodel->quantum);
/* tc_ops 'qdisc_set' hook: re-parses 'details' and refreshes both the
 * installed tc object and the existing cached parameters. */
3304 fqcodel_qdisc_set(struct netdev *netdev, const struct smap *details)
3306 struct fqcodel fqcodel;
3308 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3309 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3310 fqcodel.flows, fqcodel.quantum);
3311 fqcodel_get__(netdev)->target = fqcodel.target;
3312 fqcodel_get__(netdev)->limit = fqcodel.limit;
3313 fqcodel_get__(netdev)->interval = fqcodel.interval;
3314 fqcodel_get__(netdev)->flows = fqcodel.flows;
3315 fqcodel_get__(netdev)->quantum = fqcodel.quantum;
/* tc_ops vtable binding the FQ-CoDel hooks to the "linux-fq_codel" QoS type. */
3319 static const struct tc_ops tc_ops_fqcodel = {
3320 "fq_codel", /* linux_name */
3321 "linux-fq_codel", /* ovs_name */
3322 FQCODEL_N_QUEUES, /* n_queues */
3335 /* SFQ traffic control class. */
3337 #define SFQ_N_QUEUES 0x0000
/* Downcasts 'netdev_''s active tc state to the SFQ-specific struct. */
3346 sfq_get__(const struct netdev *netdev_)
3348 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3349 return CONTAINER_OF(netdev->tc, struct sfq, tc);
/* Allocates in-memory SFQ state -- note the parameter order is quantum, then
 * perturb -- and installs it as 'netdev_''s current tc.  Does not touch the
 * kernel qdisc. */
3353 sfq_install__(struct netdev *netdev_, uint32_t quantum, uint32_t perturb)
3355 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3358 sfq = xmalloc(sizeof *sfq);
3359 tc_init(&sfq->tc, &tc_ops_sfq);
3360 sfq->perturb = perturb;
3361 sfq->quantum = quantum;
3363 netdev->tc = &sfq->tc;
/* Replaces 'netdev''s root qdisc with an "sfq" qdisc via RTM_NEWQDISC.  When
 * 'quantum' is zero the device MTU is used instead; when 'perturb' is zero a
 * 10-second perturbation period is used. */
3367 sfq_setup_qdisc__(struct netdev *netdev, uint32_t quantum, uint32_t perturb)
3369 struct tc_sfq_qopt opt;
3370 struct ofpbuf request;
3371 struct tcmsg *tcmsg;
3373 int mtu_error, error;
3374 mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3376 tc_del_qdisc(netdev);
3378 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3379 NLM_F_EXCL | NLM_F_CREATE, &request);
3383 tcmsg->tcm_handle = tc_make_handle(1, 0);
3384 tcmsg->tcm_parent = TC_H_ROOT;
3386 memset(&opt, 0, sizeof opt);
3389 opt.quantum = mtu; /* if we cannot find mtu, use default */
3392 opt.quantum = quantum;
3396 opt.perturb_period = 10;
3398 opt.perturb_period = perturb;
3401 nl_msg_put_string(&request, TCA_KIND, "sfq");
3402 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3404 error = tc_transact(&request, NULL);
3406 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3407 "quantum %u, perturb %u error %d(%s)",
3408 netdev_get_name(netdev),
3409 opt.quantum, opt.perturb_period,
3410 error, ovs_strerror(error));
/* Extracts "perturb" and "quantum" from 'details' into 'sfq'.  A missing
 * quantum falls back to the device MTU; if the MTU is also unavailable the
 * caller must have supplied a quantum explicitly (warned below). */
3416 sfq_parse_qdisc_details__(struct netdev *netdev,
3417 const struct smap *details, struct sfq *sfq)
3419 const char *perturb_s;
3420 const char *quantum_s;
3424 perturb_s = smap_get(details, "perturb");
3425 quantum_s = smap_get(details, "quantum");
3426 sfq->perturb = perturb_s ? strtoull(perturb_s, NULL, 10) : 0;
3427 sfq->quantum = quantum_s ? strtoull(quantum_s, NULL, 10) : 0;
3428 if (!sfq->perturb) {
3432 if (!sfq->quantum) {
3433 mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3437 VLOG_WARN_RL(&rl, "when using SFQ, you must specify quantum on a "
3438 "device without mtu");
/* tc_ops 'tc_install' hook: parses 'details', programs the kernel qdisc, and
 * on success records matching in-memory state. */
3445 sfq_tc_install(struct netdev *netdev, const struct smap *details)
3450 sfq_parse_qdisc_details__(netdev, details, &sfq);
3451 error = sfq_setup_qdisc__(netdev, sfq.quantum, sfq.perturb);
3453 sfq_install__(netdev, sfq.quantum, sfq.perturb);
/* tc_ops 'tc_load' hook: reconstructs in-memory SFQ state from the qdisc
 * description the kernel returned in 'nlmsg'.
 *
 * Bug fix: sfq_install__() takes (netdev, quantum, perturb), in that order;
 * the previous call passed sfq->perturb_period as the quantum and
 * sfq->quantum as the perturb period, swapping the two cached values. */
3459 sfq_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3461 const struct tc_sfq_qopt *sfq;
3462 struct nlattr *nlattr;
3466 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3468 sfq = nl_attr_get(nlattr);
3469 sfq_install__(netdev, sfq->quantum, sfq->perturb_period);
/* tc_ops 'tc_destroy' hook: frees the SFQ state embedding 'tc'. */
3477 sfq_tc_destroy(struct tc *tc)
3479 struct sfq *sfq = CONTAINER_OF(tc, struct sfq, tc);
/* tc_ops 'qdisc_get' hook: reports the cached SFQ parameters. */
3485 sfq_qdisc_get(const struct netdev *netdev, struct smap *details)
3487 const struct sfq *sfq = sfq_get__(netdev);
3488 smap_add_format(details, "quantum", "%u", sfq->quantum);
3489 smap_add_format(details, "perturb", "%u", sfq->perturb);
/* tc_ops 'qdisc_set' hook: re-parses 'details' and refreshes both the
 * installed tc object and the existing cached parameters. */
3494 sfq_qdisc_set(struct netdev *netdev, const struct smap *details)
3498 sfq_parse_qdisc_details__(netdev, details, &sfq);
3499 sfq_install__(netdev, sfq.quantum, sfq.perturb);
3500 sfq_get__(netdev)->quantum = sfq.quantum;
3501 sfq_get__(netdev)->perturb = sfq.perturb;
/* tc_ops vtable binding the SFQ hooks above to the "linux-sfq" QoS type. */
3505 static const struct tc_ops tc_ops_sfq = {
3506 "sfq", /* linux_name */
3507 "linux-sfq", /* ovs_name */
3508 SFQ_N_QUEUES, /* n_queues */
3521 /* HTB traffic control class. */
3523 #define HTB_N_QUEUES 0xf000
3524 #define HTB_RATE2QUANTUM 10
3528 unsigned int max_rate; /* In bytes/s. */
3532 struct tc_queue tc_queue;
3533 unsigned int min_rate; /* In bytes/s. */
3534 unsigned int max_rate; /* In bytes/s. */
3535 unsigned int burst; /* In bytes. */
3536 unsigned int priority; /* Lower values are higher priorities. */
/* Downcasts 'netdev_''s active tc state to the HTB-specific struct. */
3540 htb_get__(const struct netdev *netdev_)
3542 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3543 return CONTAINER_OF(netdev->tc, struct htb, tc);
/* Allocates in-memory HTB state with 'max_rate' and installs it as
 * 'netdev_''s current tc.  Does not touch the kernel qdisc. */
3547 htb_install__(struct netdev *netdev_, uint64_t max_rate)
3549 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3552 htb = xmalloc(sizeof *htb);
3553 tc_init(&htb->tc, &tc_ops_htb);
3554 htb->max_rate = max_rate;
3556 netdev->tc = &htb->tc;
3559 /* Create an HTB qdisc.
3561 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
3563 htb_setup_qdisc__(struct netdev *netdev)
3566 struct tc_htb_glob opt;
3567 struct ofpbuf request;
3568 struct tcmsg *tcmsg;
3570 tc_del_qdisc(netdev);
3572 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3573 NLM_F_EXCL | NLM_F_CREATE, &request);
3577 tcmsg->tcm_handle = tc_make_handle(1, 0);
3578 tcmsg->tcm_parent = TC_H_ROOT;
3580 nl_msg_put_string(&request, TCA_KIND, "htb");
3582 memset(&opt, 0, sizeof opt);
3583 opt.rate2quantum = HTB_RATE2QUANTUM;
3587 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3588 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
3589 nl_msg_end_nested(&request, opt_offset);
3591 return tc_transact(&request, NULL);
3594 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
3595 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
3597 htb_setup_class__(struct netdev *netdev, unsigned int handle,
3598 unsigned int parent, struct htb_class *class)
3601 struct tc_htb_opt opt;
3602 struct ofpbuf request;
3603 struct tcmsg *tcmsg;
3607 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3609 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
3610 netdev_get_name(netdev));
3614 memset(&opt, 0, sizeof opt);
3615 tc_fill_rate(&opt.rate, class->min_rate, mtu);
3616 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
3617 /* Makes sure the quantum is at least MTU. Setting quantum will
3618 * make htb ignore the r2q for this class. */
3619 if ((class->min_rate / HTB_RATE2QUANTUM) < mtu) {
3622 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
3623 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
3624 opt.prio = class->priority;
3626 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3630 tcmsg->tcm_handle = handle;
3631 tcmsg->tcm_parent = parent;
3633 nl_msg_put_string(&request, TCA_KIND, "htb");
3634 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3635 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
3636 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
3637 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
3638 nl_msg_end_nested(&request, opt_offset);
3640 error = tc_transact(&request, NULL);
3642 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3643 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
3644 netdev_get_name(netdev),
3645 tc_get_major(handle), tc_get_minor(handle),
3646 tc_get_major(parent), tc_get_minor(parent),
3647 class->min_rate, class->max_rate,
3648 class->burst, class->priority, ovs_strerror(error));
3653 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
3654 * description of them into 'details'. The description complies with the
3655 * specification given in the vswitch database documentation for linux-htb
/* Decodes the nested TCA_HTB_PARMS attribute of a kernel HTB class reply
 * into 'class' (rates in bytes/s, burst converted from ticks to bytes). */
3658 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
3660 static const struct nl_policy tca_htb_policy[] = {
3661 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
3662 .min_len = sizeof(struct tc_htb_opt) },
3665 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
3666 const struct tc_htb_opt *htb;
3668 if (!nl_parse_nested(nl_options, tca_htb_policy,
3669 attrs, ARRAY_SIZE(tca_htb_policy))) {
3670 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
3674 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
3675 class->min_rate = htb->rate.rate;
3676 class->max_rate = htb->ceil.rate;
3677 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
3678 class->priority = htb->prio;
/* Parses a kernel class message: optionally maps the tc handle 1:<minor> to
 * a 0-based queue id in '*queue_id' (only minors 1..HTB_N_QUEUES under major
 * 1 are valid), and optionally decodes the HTB options into '*options'. */
3683 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3684 struct htb_class *options,
3685 struct netdev_queue_stats *stats)
3687 struct nlattr *nl_options;
3688 unsigned int handle;
3691 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3692 if (!error && queue_id) {
3693 unsigned int major = tc_get_major(handle);
3694 unsigned int minor = tc_get_minor(handle);
3695 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3696 *queue_id = minor - 1;
3701 if (!error && options) {
3702 error = htb_parse_tca_options__(nl_options, options);
/* Extracts "max-rate" (bits/s in the database, stored here as bytes/s) from
 * 'details'.  When absent, falls back to the link speed from the device's
 * advertised features, defaulting to 100 Mbps; min_rate is set equal to
 * max_rate for the qdisc-wide default class. */
3708 htb_parse_qdisc_details__(struct netdev *netdev_,
3709 const struct smap *details, struct htb_class *hc)
3711 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3712 const char *max_rate_s;
3714 max_rate_s = smap_get(details, "max-rate");
3715 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3716 if (!hc->max_rate) {
3717 enum netdev_features current;
3719 netdev_linux_read_features(netdev);
3720 current = !netdev->get_features_error ? netdev->current : 0;
3721 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3723 hc->min_rate = hc->max_rate;
/* Extracts per-queue HTB parameters from 'details', clamping min-rate to
 * [mtu, qdisc max_rate], max-rate to [min_rate, qdisc max_rate], and burst to
 * at least mtu + 64 (see rationale comment below).  Rates arrive in bits/s
 * and are stored in bytes/s.  Fails when the device has no MTU. */
3729 htb_parse_class_details__(struct netdev *netdev,
3730 const struct smap *details, struct htb_class *hc)
3732 const struct htb *htb = htb_get__(netdev);
3733 const char *min_rate_s = smap_get(details, "min-rate");
3734 const char *max_rate_s = smap_get(details, "max-rate");
3735 const char *burst_s = smap_get(details, "burst");
3736 const char *priority_s = smap_get(details, "priority");
3739 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3741 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
3742 netdev_get_name(netdev));
3746 /* HTB requires at least an mtu sized min-rate to send any traffic even
3747 * on uncongested links. */
3748 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3749 hc->min_rate = MAX(hc->min_rate, mtu);
3750 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
3753 hc->max_rate = (max_rate_s
3754 ? strtoull(max_rate_s, NULL, 10) / 8
3756 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
3757 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
3761 * According to hints in the documentation that I've read, it is important
3762 * that 'burst' be at least as big as the largest frame that might be
3763 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
3764 * but having it a bit too small is a problem. Since netdev_get_mtu()
3765 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
3766 * the MTU. We actually add 64, instead of 14, as a guard against
3767 * additional headers get tacked on somewhere that we're not aware of. */
3768 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
3769 hc->burst = MAX(hc->burst, mtu + 64);
3772 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Queries the kernel for the HTB class 'handle' under 'parent' and decodes
 * the reply into '*options' and '*stats'. */
3778 htb_query_class__(const struct netdev *netdev, unsigned int handle,
3779 unsigned int parent, struct htb_class *options,
3780 struct netdev_queue_stats *stats)
3782 struct ofpbuf *reply;
3785 error = tc_query_class(netdev, handle, parent, &reply);
3787 error = htb_parse_tcmsg__(reply, NULL, options, stats);
3788 ofpbuf_delete(reply);
/* tc_ops 'tc_install' hook: creates the root HTB qdisc, then the default
 * class 1:0xfffe sized from 'details', and records in-memory state. */
3794 htb_tc_install(struct netdev *netdev, const struct smap *details)
3798 error = htb_setup_qdisc__(netdev);
3800 struct htb_class hc;
3802 htb_parse_qdisc_details__(netdev, details, &hc);
3803 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3804 tc_make_handle(1, 0), &hc);
3806 htb_install__(netdev, hc.max_rate);
/* Downcasts a generic tc_queue to its containing htb_class. */
3812 static struct htb_class *
3813 htb_class_cast__(const struct tc_queue *queue)
3815 return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Creates or updates the in-memory record for 'queue_id', inserting a new
 * htb_class into the tc queue hmap when none exists, then copies the four
 * class parameters from 'hc'. */
3819 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
3820 const struct htb_class *hc)
3822 struct htb *htb = htb_get__(netdev);
3823 size_t hash = hash_int(queue_id, 0);
3824 struct tc_queue *queue;
3825 struct htb_class *hcp;
3827 queue = tc_find_queue__(netdev, queue_id, hash);
3829 hcp = htb_class_cast__(queue);
3831 hcp = xmalloc(sizeof *hcp);
3832 queue = &hcp->tc_queue;
3833 queue->queue_id = queue_id;
3834 queue->created = time_msec();
3835 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
3838 hcp->min_rate = hc->min_rate;
3839 hcp->max_rate = hc->max_rate;
3840 hcp->burst = hc->burst;
3841 hcp->priority = hc->priority;
/* tc_ops "tc_load" callback for HTB: reconstructs local state for an HTB
 * qdisc already present in the kernel.  Queries the default class 1:fffe for
 * the overall max-rate, then dumps all classes and records each one as a
 * queue via htb_update_queue__(). */
3845 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3848 struct queue_dump_state state;
3849 struct htb_class hc;
3851 /* Get qdisc options. */
3853 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3854 htb_install__(netdev, hc.max_rate);
/* Get queues: one kernel class per OVS queue. */
3857 if (!start_queue_dump(netdev, &state)) {
3860 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
3861 unsigned int queue_id;
3863 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3864 htb_update_queue__(netdev, queue_id, &hc);
3867 finish_queue_dump(&state);
/* tc_ops "tc_destroy" callback for HTB: removes every queue entry from the
 * hmap.  (The per-entry and per-qdisc frees are presumably done after the
 * removal — those lines are not visible here; confirm against the full
 * source.) */
3873 htb_tc_destroy(struct tc *tc)
3875 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
3876 struct htb_class *hc, *next;
3878 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
3879 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* tc_ops "qdisc_get" callback: reports the qdisc's "max-rate" in bits per
 * second (max_rate is stored internally in bytes/s, hence the * 8). */
3887 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
3889 const struct htb *htb = htb_get__(netdev);
3890 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
/* tc_ops "qdisc_set" callback: reconfigures the default class 1:fffe from
 * 'details' and, on success, updates the cached max_rate. */
3895 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
3897 struct htb_class hc;
3900 htb_parse_qdisc_details__(netdev, details, &hc);
3901 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3902 tc_make_handle(1, 0), &hc);
3904 htb_get__(netdev)->max_rate = hc.max_rate;
/* tc_ops "class_get" callback: exposes a queue's settings as smap details.
 * Rates and burst are converted from internal bytes to bits (* 8);
 * "max-rate" is reported only when it differs from "min-rate". */
3910 htb_class_get(const struct netdev *netdev OVS_UNUSED,
3911 const struct tc_queue *queue, struct smap *details)
3913 const struct htb_class *hc = htb_class_cast__(queue);
3915 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3916 if (hc->min_rate != hc->max_rate) {
3917 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3919 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
3921 smap_add_format(details, "priority", "%u", hc->priority);
/* tc_ops "class_set" callback: parses 'details', installs kernel class
 * 1:(queue_id+1) under parent 1:fffe, and mirrors the result into local
 * state.  Queue N maps to kernel minor N+1 because minor 0 is reserved. */
3927 htb_class_set(struct netdev *netdev, unsigned int queue_id,
3928 const struct smap *details)
3930 struct htb_class hc;
3933 error = htb_parse_class_details__(netdev, details, &hc);
3938 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3939 tc_make_handle(1, 0xfffe), &hc);
3944 htb_update_queue__(netdev, queue_id, &hc);
/* tc_ops "class_delete" callback: deletes the kernel class for 'queue' and,
 * on success, removes the corresponding local hmap entry. */
3949 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
3951 struct htb_class *hc = htb_class_cast__(queue);
3952 struct htb *htb = htb_get__(netdev);
3955 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3957 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* tc_ops "class_get_stats" callback: fetches only the kernel statistics for
 * 'queue' (options pointer is null). */
3964 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3965 struct netdev_queue_stats *stats)
3967 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3968 tc_make_handle(1, 0xfffe), NULL, stats);
/* tc_ops "class_dump_stats" callback: parses one dumped class message and,
 * if its handle is a valid OVS queue (major 1, minor in 1..HTB_N_QUEUES),
 * invokes 'cb' with the zero-based queue id and its stats. */
3972 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3973 const struct ofpbuf *nlmsg,
3974 netdev_dump_queue_stats_cb *cb, void *aux)
3976 struct netdev_queue_stats stats;
3977 unsigned int handle, major, minor;
3980 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3985 major = tc_get_major(handle);
3986 minor = tc_get_minor(handle);
3987 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3988 (*cb)(minor - 1, &stats, aux);
/* Callback table binding the HTB implementation into the generic tc layer.
 * The OVSDB QoS type "linux-htb" maps to the kernel qdisc "htb". */
3993 static const struct tc_ops tc_ops_htb = {
3994 "htb", /* linux_name */
3995 "linux-htb", /* ovs_name */
3996 HTB_N_QUEUES, /* n_queues */
4005 htb_class_get_stats,
4006 htb_class_dump_stats
4009 /* "linux-hfsc" traffic control class. */
4011 #define HFSC_N_QUEUES 0xf000
4019 struct tc_queue tc_queue;
/* Returns 'netdev_''s tc state downcast to the HFSC-specific struct.  Only
 * valid while the installed qdisc is HFSC. */
4024 static struct hfsc *
4025 hfsc_get__(const struct netdev *netdev_)
4027 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4028 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
/* Converts the embedded generic 'tc_queue' back into its enclosing
 * hfsc_class. */
4031 static struct hfsc_class *
4032 hfsc_class_cast__(const struct tc_queue *queue)
4034 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Allocates and registers local HFSC state on 'netdev_' with the given
 * overall 'max_rate' (bytes/s).  Ownership of the allocation passes to the
 * netdev; it is released via the tc_destroy callback. */
4038 hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
4040 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4043 hfsc = xmalloc(sizeof *hfsc);
4044 tc_init(&hfsc->tc, &tc_ops_hfsc);
4045 hfsc->max_rate = max_rate;
4046 netdev->tc = &hfsc->tc;
/* Records configuration 'hc' for queue 'queue_id' in 'netdev''s local HFSC
 * state, creating a new hmap entry when the queue is not yet known.
 * Mirrors htb_update_queue__() but HFSC tracks only min/max rates. */
4050 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
4051 const struct hfsc_class *hc)
4055 struct hfsc_class *hcp;
4056 struct tc_queue *queue;
4058 hfsc = hfsc_get__(netdev);
4059 hash = hash_int(queue_id, 0);
4061 queue = tc_find_queue__(netdev, queue_id, hash);
4063 hcp = hfsc_class_cast__(queue);
4065 hcp = xmalloc(sizeof *hcp);
4066 queue = &hcp->tc_queue;
4067 queue->queue_id = queue_id;
4068 queue->created = time_msec();
4069 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
4072 hcp->min_rate = hc->min_rate;
4073 hcp->max_rate = hc->max_rate;
/* Parses the nested TCA_OPTIONS attributes of an HFSC class into 'class'.
 * OVS only supports the subset it writes itself: all three service curves
 * (RSC, FSC, USC) must be linear (m1 == 0, d == 0), the real-time curve must
 * equal the link-share curve, and min-rate must not exceed max-rate.  On
 * success min_rate comes from fsc->m2 and max_rate from usc->m2. */
4077 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
4079 const struct tc_service_curve *rsc, *fsc, *usc;
4080 static const struct nl_policy tca_hfsc_policy[] = {
4082 .type = NL_A_UNSPEC,
4084 .min_len = sizeof(struct tc_service_curve),
4087 .type = NL_A_UNSPEC,
4089 .min_len = sizeof(struct tc_service_curve),
4092 .type = NL_A_UNSPEC,
4094 .min_len = sizeof(struct tc_service_curve),
4097 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
4099 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
4100 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
4101 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
4105 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
4106 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
4107 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
/* Reject two-slope (non-linear) curves: m1/d describe the initial slope. */
4109 if (rsc->m1 != 0 || rsc->d != 0 ||
4110 fsc->m1 != 0 || fsc->d != 0 ||
4111 usc->m1 != 0 || usc->d != 0) {
4112 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4113 "Non-linear service curves are not supported.");
4117 if (rsc->m2 != fsc->m2) {
4118 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4119 "Real-time service curves are not supported ");
4123 if (rsc->m2 > usc->m2) {
4124 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4125 "Min-rate service curve is greater than "
4126 "the max-rate service curve.");
4130 class->min_rate = fsc->m2;
4131 class->max_rate = usc->m2;
/* Parses a netlink class message 'tcmsg' describing an HFSC class.  Stores
 * the zero-based queue id in '*queue_id' (when the handle is major 1 with a
 * minor in range), the parsed curves in '*options', and statistics in
 * '*stats'.  Output pointers may be null. */
4136 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
4137 struct hfsc_class *options,
4138 struct netdev_queue_stats *stats)
4141 unsigned int handle;
4142 struct nlattr *nl_options;
4144 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
4150 unsigned int major, minor;
4152 major = tc_get_major(handle);
4153 minor = tc_get_minor(handle);
4154 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
4155 *queue_id = minor - 1;
4162 error = hfsc_parse_tca_options__(nl_options, options);
/* Queries the kernel for the HFSC class with 'handle' under 'parent' on
 * 'netdev', filling '*options' and '*stats' (either may be null) and freeing
 * the netlink reply.  HFSC counterpart of htb_query_class__(). */
4169 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
4170 unsigned int parent, struct hfsc_class *options,
4171 struct netdev_queue_stats *stats)
4174 struct ofpbuf *reply;
4176 error = tc_query_class(netdev, handle, parent, &reply);
4181 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
4182 ofpbuf_delete(reply);
/* Extracts the qdisc-level "max-rate" from 'details' (bits/s, converted to
 * bytes/s).  When absent, falls back to the link's advertised speed (or
 * 100 Mbps if feature detection failed).  The default class uses the same
 * value for both min and max rate. */
4187 hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
4188 struct hfsc_class *class)
4190 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4192 const char *max_rate_s;
4194 max_rate_s = smap_get(details, "max-rate");
4195 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
4198 enum netdev_features current;
4200 netdev_linux_read_features(netdev);
4201 current = !netdev->get_features_error ? netdev->current : 0;
4202 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
4205 class->min_rate = max_rate;
4206 class->max_rate = max_rate;
/* Parses per-queue "min-rate"/"max-rate" from 'details' (bits/s in the
 * database, stored internally as bytes/s).  min-rate is clamped to
 * [1, qdisc max]; max-rate is clamped to [min-rate, qdisc max] and defaults
 * to min-rate when unspecified (via the MAX below). */
4210 hfsc_parse_class_details__(struct netdev *netdev,
4211 const struct smap *details,
4212 struct hfsc_class * class)
4214 const struct hfsc *hfsc;
4215 uint32_t min_rate, max_rate;
4216 const char *min_rate_s, *max_rate_s;
4218 hfsc = hfsc_get__(netdev);
4219 min_rate_s = smap_get(details, "min-rate");
4220 max_rate_s = smap_get(details, "max-rate");
4222 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
4223 min_rate = MAX(min_rate, 1);
4224 min_rate = MIN(min_rate, hfsc->max_rate);
4226 max_rate = (max_rate_s
4227 ? strtoull(max_rate_s, NULL, 10) / 8
4229 max_rate = MAX(max_rate, min_rate);
4230 max_rate = MIN(max_rate, hfsc->max_rate);
4232 class->min_rate = min_rate;
4233 class->max_rate = max_rate;
4238 /* Create an HFSC qdisc.
4240 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1".
 *
 * Deletes any existing root qdisc first, then sends an RTM_NEWQDISC request
 * with a zeroed tc_hfsc_qopt.  Returns 0 on success, a positive errno
 * otherwise (via tc_transact). */
4242 hfsc_setup_qdisc__(struct netdev * netdev)
4244 struct tcmsg *tcmsg;
4245 struct ofpbuf request;
4246 struct tc_hfsc_qopt opt;
4248 tc_del_qdisc(netdev);
4250 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
4251 NLM_F_EXCL | NLM_F_CREATE, &request);
4257 tcmsg->tcm_handle = tc_make_handle(1, 0);
4258 tcmsg->tcm_parent = TC_H_ROOT;
4260 memset(&opt, 0, sizeof opt);
4263 nl_msg_put_string(&request, TCA_KIND, "hfsc");
4264 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
4266 return tc_transact(&request, NULL);
4269 /* Create an HFSC class.
4271 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
4272 * sc rate <min_rate> ul rate <max_rate>"
 *
 * Builds linear service curves: 'min' is written as both the real-time (RSC)
 * and link-share (FSC) curve, 'max' as the upper-limit (USC) curve.  Logs a
 * rate-limited warning with rates in bytes/s on failure. */
4274 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
4275 unsigned int parent, struct hfsc_class *class)
4279 struct tcmsg *tcmsg;
4280 struct ofpbuf request;
4281 struct tc_service_curve min, max;
4283 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
4289 tcmsg->tcm_handle = handle;
4290 tcmsg->tcm_parent = parent;
4294 min.m2 = class->min_rate;
4298 max.m2 = class->max_rate;
4300 nl_msg_put_string(&request, TCA_KIND, "hfsc");
4301 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4302 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
4303 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
4304 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
4305 nl_msg_end_nested(&request, opt_offset);
4307 error = tc_transact(&request, NULL);
4309 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
4310 "min-rate %ubps, max-rate %ubps (%s)",
4311 netdev_get_name(netdev),
4312 tc_get_major(handle), tc_get_minor(handle),
4313 tc_get_major(parent), tc_get_minor(parent),
4314 class->min_rate, class->max_rate, ovs_strerror(error));
/* tc_ops "tc_install" callback for HFSC: creates the root HFSC qdisc,
 * configures the default class 1:fffe from 'details', and records the
 * resulting max_rate locally. */
4321 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
4324 struct hfsc_class class;
4326 error = hfsc_setup_qdisc__(netdev);
4332 hfsc_parse_qdisc_details__(netdev, details, &class);
4333 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4334 tc_make_handle(1, 0), &class);
4340 hfsc_install__(netdev, class.max_rate);
/* tc_ops "tc_load" callback for HFSC: rebuilds local state from a qdisc that
 * already exists in the kernel, mirroring htb_tc_load(): query the default
 * class for max_rate, then dump and record every class as a queue. */
4345 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4348 struct queue_dump_state state;
4349 struct hfsc_class hc;
4352 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
4353 hfsc_install__(netdev, hc.max_rate);
4355 if (!start_queue_dump(netdev, &state)) {
4359 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
4360 unsigned int queue_id;
4362 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
4363 hfsc_update_queue__(netdev, queue_id, &hc);
4367 finish_queue_dump(&state);
/* tc_ops "tc_destroy" callback for HFSC: empties the queue hmap.  (The
 * per-entry frees presumably follow each removal — not visible in this
 * excerpt; confirm against the full source.) */
4372 hfsc_tc_destroy(struct tc *tc)
4375 struct hfsc_class *hc, *next;
4377 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
4379 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
4380 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* tc_ops "qdisc_get" callback: reports "max-rate" in bits/s (internal value
 * is bytes/s, hence * 8). */
4389 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
4391 const struct hfsc *hfsc;
4392 hfsc = hfsc_get__(netdev);
4393 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
/* tc_ops "qdisc_set" callback: reconfigures the default class 1:fffe from
 * 'details' and, on success, updates the cached max_rate. */
4398 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
4401 struct hfsc_class class;
4403 hfsc_parse_qdisc_details__(netdev, details, &class);
4404 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4405 tc_make_handle(1, 0), &class);
4408 hfsc_get__(netdev)->max_rate = class.max_rate;
/* tc_ops "class_get" callback: exposes a queue's min/max rates in bits/s;
 * "max-rate" is emitted only when it differs from "min-rate". */
4415 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
4416 const struct tc_queue *queue, struct smap *details)
4418 const struct hfsc_class *hc;
4420 hc = hfsc_class_cast__(queue);
4421 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
4422 if (hc->min_rate != hc->max_rate) {
4423 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
/* tc_ops "class_set" callback: parses 'details', installs kernel class
 * 1:(queue_id+1) under 1:fffe, and mirrors it into local state. */
4429 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
4430 const struct smap *details)
4433 struct hfsc_class class;
4435 error = hfsc_parse_class_details__(netdev, details, &class);
4440 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
4441 tc_make_handle(1, 0xfffe), &class);
4446 hfsc_update_queue__(netdev, queue_id, &class);
/* tc_ops "class_delete" callback: removes the kernel class for 'queue' and,
 * on success, drops the local hmap entry. */
4451 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
4455 struct hfsc_class *hc;
4457 hc = hfsc_class_cast__(queue);
4458 hfsc = hfsc_get__(netdev);
4460 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
4462 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* tc_ops "class_get_stats" callback: fetches kernel statistics only (null
 * options pointer). */
4469 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
4470 struct netdev_queue_stats *stats)
4472 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
4473 tc_make_handle(1, 0xfffe), NULL, stats);
/* tc_ops "class_dump_stats" callback: parses one dumped class message and
 * invokes 'cb' for handles that map to OVS queues (major 1, minor in
 * 1..HFSC_N_QUEUES; queue id is minor - 1). */
4477 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
4478 const struct ofpbuf *nlmsg,
4479 netdev_dump_queue_stats_cb *cb, void *aux)
4481 struct netdev_queue_stats stats;
4482 unsigned int handle, major, minor;
4485 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
4490 major = tc_get_major(handle);
4491 minor = tc_get_minor(handle);
4492 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
4493 (*cb)(minor - 1, &stats, aux);
/* Callback table binding the HFSC implementation into the generic tc layer.
 * The OVSDB QoS type "linux-hfsc" maps to the kernel qdisc "hfsc". */
4498 static const struct tc_ops tc_ops_hfsc = {
4499 "hfsc", /* linux_name */
4500 "linux-hfsc", /* ovs_name */
4501 HFSC_N_QUEUES, /* n_queues */
4502 hfsc_tc_install, /* tc_install */
4503 hfsc_tc_load, /* tc_load */
4504 hfsc_tc_destroy, /* tc_destroy */
4505 hfsc_qdisc_get, /* qdisc_get */
4506 hfsc_qdisc_set, /* qdisc_set */
4507 hfsc_class_get, /* class_get */
4508 hfsc_class_set, /* class_set */
4509 hfsc_class_delete, /* class_delete */
4510 hfsc_class_get_stats, /* class_get_stats */
4511 hfsc_class_dump_stats /* class_dump_stats */
4514 /* "linux-default" traffic control class.
4516 * This class represents the default, unnamed Linux qdisc. It corresponds to
4517 * the "" (empty string) QoS type in the OVS database. */
/* Points 'netdev_''s tc state at a shared, immutable singleton representing
 * the default kernel qdisc; no allocation and nothing to free. */
4520 default_install__(struct netdev *netdev_)
4522 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4523 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
4525 /* Nothing but a tc class implementation is allowed to write to a tc. This
4526 * class never does that, so we can legitimately use a const tc object. */
4527 netdev->tc = CONST_CAST(struct tc *, &tc);
/* tc_ops "tc_install" callback: "installing" the default qdisc just binds
 * the shared singleton; 'details' carries no settings for this class. */
4531 default_tc_install(struct netdev *netdev,
4532 const struct smap *details OVS_UNUSED)
4534 default_install__(netdev);
/* tc_ops "tc_load" callback: same as install — bind the shared singleton. */
4539 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4541 default_install__(netdev);
/* Callback table for the default qdisc.  linux_name is NULL because this
 * entry is chosen when no named qdisc is configured; all optional callbacks
 * are NULL since the default qdisc has no queues or settings to manage. */
4545 static const struct tc_ops tc_ops_default = {
4546 NULL, /* linux_name */
4551 NULL, /* tc_destroy */
4552 NULL, /* qdisc_get */
4553 NULL, /* qdisc_set */
4554 NULL, /* class_get */
4555 NULL, /* class_set */
4556 NULL, /* class_delete */
4557 NULL, /* class_get_stats */
4558 NULL /* class_dump_stats */
4561 /* "linux-other" traffic control class.
4566 other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
4568 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4569 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
4571 /* Nothing but a tc class implementation is allowed to write to a tc. This
4572 * class never does that, so we can legitimately use a const tc object. */
4573 netdev->tc = CONST_CAST(struct tc *, &tc);
/* Callback table for unrecognized qdiscs: OVS reports them as "linux-other"
 * but cannot install, configure, or enumerate queues on them. */
4577 static const struct tc_ops tc_ops_other = {
4578 NULL, /* linux_name */
4579 "linux-other", /* ovs_name */
4581 NULL, /* tc_install */
4583 NULL, /* tc_destroy */
4584 NULL, /* qdisc_get */
4585 NULL, /* qdisc_set */
4586 NULL, /* class_get */
4587 NULL, /* class_set */
4588 NULL, /* class_delete */
4589 NULL, /* class_get_stats */
4590 NULL /* class_dump_stats */
4593 /* Traffic control. */
4595 /* Number of kernel "tc" ticks per second. */
/* NOTE(review): populated from /proc/net/psched by the psched-reading code
 * below (see the 'ticks_per_s = (double) a * c / b' assignment); zero until
 * that runs. */
4596 static double ticks_per_s;
4598 /* Number of kernel "jiffies" per second. This is used for the purpose of
4599 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
4600 * one jiffy's worth of data.
4602 * There are two possibilities here:
4604 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
4605 * approximate range of 100 to 1024. That means that we really need to
4606 * make sure that the qdisc can buffer that much data.
4608 * - 'buffer_hz' is an absurdly large number. That means that the kernel
4609 * has finely granular timers and there's no need to fudge additional room
4610 * for buffers. (There's no extra effort needed to implement that: the
4611 * large 'buffer_hz' is used as a divisor, so practically any number will
4612 * come out as 0 in the division. Small integer results in the case of
4613 * really high dividends won't have any real effect anyhow.)
4615 static unsigned int buffer_hz;
/* Composes the tc handle "major:minor" as a single 32-bit value. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    unsigned int maj_bits = major << 16;    /* tc majors live in the top 16 bits. */

    return TC_H_MAKE(maj_bits, minor);
}
/* Extracts the major number (upper 16 bits) of tc handle 'handle'. */
static unsigned int
tc_get_major(unsigned int handle)
{
    unsigned int maj_bits = TC_H_MAJ(handle);   /* handle & 0xffff0000 */

    return maj_bits >> 16;
}
/* Extracts the minor number (lower 16 bits) of tc handle 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    unsigned int min_bits = TC_H_MIN(handle);   /* handle & 0x0000ffff */

    return min_bits;
}
/* Initializes '*request' as an rtnetlink tc message of 'type' (e.g.
 * RTM_NEWQDISC) with NLM_F_REQUEST plus 'flags', addressed to 'netdev' by
 * ifindex.  Returns a pointer into the buffer at the zeroed tcmsg header so
 * the caller can fill tcm_handle/tcm_parent.  (Error handling for a failed
 * ifindex lookup is not visible in this excerpt.) */
4638 static struct tcmsg *
4639 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
4640 struct ofpbuf *request)
4642 struct tcmsg *tcmsg;
4646 error = get_ifindex(netdev, &ifindex);
4651 ofpbuf_init(request, 512);
4652 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
4653 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
4654 tcmsg->tcm_family = AF_UNSPEC;
4655 tcmsg->tcm_ifindex = ifindex;
4656 /* Caller should fill in tcmsg->tcm_handle. */
4657 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' on the NETLINK_ROUTE channel, optionally capturing the
 * reply in '*replyp', and uninitializes the request buffer.  Returns 0 on
 * success or a positive errno value. */
4663 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
4665 int error = nl_transact(NETLINK_ROUTE, request, replyp);
4666 ofpbuf_uninit(request);
4670 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
4671 * policing configuration.
4673 * This function is equivalent to running the following when 'add' is true:
4674 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
4676 * This function is equivalent to running the following when 'add' is false:
4677 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
4679 * The configuration and stats may be seen with the following command:
4680 * /sbin/tc -s qdisc show dev <devname>
4682 * Returns 0 if successful, otherwise a positive errno value.
/* See the function comment above.  Builds RTM_NEWQDISC (exclusive create)
 * or RTM_DELQDISC for handle ffff: parent "ingress"; on delete, ENOENT and
 * EINVAL are tolerated because they mean no ingress qdisc was present. */
4685 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
4687 struct ofpbuf request;
4688 struct tcmsg *tcmsg;
4690 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
4691 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
4693 tcmsg = tc_make_request(netdev, type, flags, &request);
4697 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
4698 tcmsg->tcm_parent = TC_H_INGRESS;
4699 nl_msg_put_string(&request, TCA_KIND, "ingress");
4700 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
4702 error = tc_transact(&request, NULL);
4704 /* If we're deleting the qdisc, don't worry about some of the
4705 * error conditions. */
4706 if (!add && (error == ENOENT || error == EINVAL)) {
4715 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
4718 * This function is equivalent to running:
4719 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
4720 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
4723 * The configuration and stats may be seen with the following command:
4724 * /sbin/tc -s filter show dev <devname> parent ffff:
4726 * Returns 0 if successful, otherwise a positive errno value.
4729 tc_add_policer(struct netdev *netdev,
4730 uint32_t kbits_rate, uint32_t kbits_burst)
4732 struct tc_police tc_police;
4733 struct ofpbuf request;
4734 struct tcmsg *tcmsg;
4735 size_t basic_offset;
4736 size_t police_offset;
4740 memset(&tc_police, 0, sizeof tc_police);
4741 tc_police.action = TC_POLICE_SHOT;
4742 tc_police.mtu = mtu;
4743 tc_fill_rate(&tc_police.rate, ((uint64_t) kbits_rate * 1000)/8, mtu);
4745 /* The following appears wrong in two ways:
4747 * - tc_bytes_to_ticks() should take "bytes" as quantity for both of its
4748 * arguments (or at least consistently "bytes" as both or "bits" as
4749 * both), but this supplies bytes for the first argument and bits for the
4752 * - In networking a kilobit is usually 1000 bits but this uses 1024 bits.
4754 * However if you "fix" those problems then "tc filter show ..." shows
4755 * "125000b", meaning 125,000 bits, when OVS configures it for 1000 kbit ==
4756 * 1,000,000 bits, whereas this actually ends up doing the right thing from
4757 * tc's point of view. Whatever. */
4758 tc_police.burst = tc_bytes_to_ticks(
4759 tc_police.rate.rate, MIN(UINT32_MAX / 1024, kbits_burst) * 1024);
4761 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
4762 NLM_F_EXCL | NLM_F_CREATE, &request);
4766 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
4767 tcmsg->tcm_info = tc_make_handle(49,
4768 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
4770 nl_msg_put_string(&request, TCA_KIND, "basic");
4771 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4772 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
4773 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
4774 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
4775 nl_msg_end_nested(&request, police_offset);
4776 nl_msg_end_nested(&request, basic_offset);
4778 error = tc_transact(&request, NULL);
/* NOTE(review): this is the body of the once-per-process /proc/net/psched
 * reader (the function header is not visible in this excerpt — confirm the
 * name against the full source).  It derives 'ticks_per_s' and 'buffer_hz'
 * from the four hex psched parameters, with conservative fallbacks and
 * warnings when the file is missing or malformed. */
4789 /* The values in psched are not individually very meaningful, but they are
4790 * important. The tables below show some values seen in the wild.
4794 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
4795 * (Before that, there are hints that it was 1000000000.)
4797 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
4801 * -----------------------------------
4802 * [1] 000c8000 000f4240 000f4240 00000064
4803 * [2] 000003e8 00000400 000f4240 3b9aca00
4804 * [3] 000003e8 00000400 000f4240 3b9aca00
4805 * [4] 000003e8 00000400 000f4240 00000064
4806 * [5] 000003e8 00000040 000f4240 3b9aca00
4807 * [6] 000003e8 00000040 000f4240 000000f9
4809 * a b c d ticks_per_s buffer_hz
4810 * ------- --------- ---------- ------------- ----------- -------------
4811 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
4812 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4813 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4814 * [4] 1,000 1,024 1,000,000 100 976,562 100
4815 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
4816 * [6] 1,000 64 1,000,000 249 15,625,000 249
4818 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
4819 * [2] 2.6.26-1-686-bigmem from Debian lenny
4820 * [3] 2.6.26-2-sparc64 from Debian lenny
4821 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
4822 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
4823 * [6] 2.6.34 from kernel.org on KVM
4825 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4826 static const char fn[] = "/proc/net/psched";
4827 unsigned int a, b, c, d;
/* Run the parse exactly once per process. */
4830 if (!ovsthread_once_start(&once)) {
4837 stream = fopen(fn, "r");
4839 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
4843 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
4844 VLOG_WARN("%s: read failed", fn);
4848 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
4852 VLOG_WARN("%s: invalid scheduler parameters", fn);
4856 ticks_per_s = (double) a * c / b;
4860 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
4863 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
4866 ovsthread_once_done(&once);
4869 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
4870 * rate of 'rate' bytes per second. */
/* Depends on 'ticks_per_s' having been initialized from /proc/net/psched. */
4872 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
4875 return (rate * ticks) / ticks_per_s;
4878 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
4879 * rate of 'rate' bytes per second. */
/* Returns 0 when 'rate' is 0 to avoid dividing by zero. */
4881 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
4884 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
4887 /* Returns the number of bytes that need to be reserved for qdisc buffering at
4888 * a transmission rate of 'rate' bytes per second. */
/* See the comment on 'buffer_hz' above for why a huge divisor yields 0. */
4890 tc_buffer_per_jiffy(unsigned int rate)
4893 return rate / buffer_hz;
4896 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
4897 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
4898 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
4899 * stores NULL into it if it is absent.
4901 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
4904 * Returns 0 if successful, otherwise a positive errno value. */
4906 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
4907 struct nlattr **options)
4909 static const struct nl_policy tca_policy[] = {
4910 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
4911 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
4913 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
/* Attributes start after the netlink header plus the fixed tcmsg. */
4915 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4916 tca_policy, ta, ARRAY_SIZE(ta))) {
4917 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
4922 *kind = nl_attr_get_string(ta[TCA_KIND]);
4926 *options = ta[TCA_OPTIONS];
4941 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
4942 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
4943 * into '*options', and its queue statistics into '*stats'. Any of the output
4944 * arguments may be null.
4946 * Returns 0 if successful, otherwise a positive errno value. */
4948 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
4949 struct nlattr **options, struct netdev_queue_stats *stats)
4951 static const struct nl_policy tca_policy[] = {
4952 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
4953 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
4955 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4957 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4958 tca_policy, ta, ARRAY_SIZE(ta))) {
4959 VLOG_WARN_RL(&rl, "failed to parse class message");
/* The class handle lives in the fixed tcmsg header, not an attribute. */
4964 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
4965 *handlep = tc->tcm_handle;
4969 *options = ta[TCA_OPTIONS];
4973 const struct gnet_stats_queue *gsq;
4974 struct gnet_stats_basic gsb;
4976 static const struct nl_policy stats_policy[] = {
4977 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
4978 .min_len = sizeof gsb },
4979 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
4980 .min_len = sizeof *gsq },
4982 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
4984 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
4985 sa, ARRAY_SIZE(sa))) {
4986 VLOG_WARN_RL(&rl, "failed to parse class stats");
4990 /* Alignment issues screw up the length of struct gnet_stats_basic on
4991 * some arch/bitsize combinations. Newer versions of Linux have a
4992 * struct gnet_stats_basic_packed, but we can't depend on that. The
4993 * easiest thing to do is just to make a copy. */
4994 memset(&gsb, 0, sizeof gsb);
4995 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
4996 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
4997 stats->tx_bytes = gsb.bytes;
4998 stats->tx_packets = gsb.packets;
/* The kernel reports drops; OVS surfaces them as tx_errors. */
5000 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
5001 stats->tx_errors = gsq->drops;
5011 memset(stats, 0, sizeof *stats);
5016 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
 *
 * Sends RTM_GETTCLASS with NLM_F_ECHO so the kernel echoes the class back in
 * '*replyp'.  Logs a rate-limited warning and returns a positive errno on
 * failure. */
5019 tc_query_class(const struct netdev *netdev,
5020 unsigned int handle, unsigned int parent,
5021 struct ofpbuf **replyp)
5023 struct ofpbuf request;
5024 struct tcmsg *tcmsg;
5027 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
5031 tcmsg->tcm_handle = handle;
5032 tcmsg->tcm_parent = parent;
5034 error = tc_transact(&request, replyp);
5036 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
5037 netdev_get_name(netdev),
5038 tc_get_major(handle), tc_get_minor(handle),
5039 tc_get_major(parent), tc_get_minor(parent),
5040 ovs_strerror(error));
5045 /* Equivalent to "tc class del dev <name> handle <handle>". */
/* Sends RTM_DELTCLASS; logs a rate-limited warning and returns a positive
 * errno on failure. */
5047 tc_delete_class(const struct netdev *netdev, unsigned int handle)
5049 struct ofpbuf request;
5050 struct tcmsg *tcmsg;
5053 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
5057 tcmsg->tcm_handle = handle;
5058 tcmsg->tcm_parent = 0;
5060 error = tc_transact(&request, NULL);
5062 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
5063 netdev_get_name(netdev),
5064 tc_get_major(handle), tc_get_minor(handle),
5065 ovs_strerror(error));
5070 /* Equivalent to "tc qdisc del dev <name> root". */
/* Deletes the root qdisc 1:0 and, on success, also destroys and clears any
 * cached local tc state so the next query re-probes the kernel.  EINVAL is
 * treated as success because it indicates the default (undeletable) qdisc
 * was already installed. */
5072 tc_del_qdisc(struct netdev *netdev_)
5074 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5075 struct ofpbuf request;
5076 struct tcmsg *tcmsg;
5079 tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
5083 tcmsg->tcm_handle = tc_make_handle(1, 0);
5084 tcmsg->tcm_parent = TC_H_ROOT;
5086 error = tc_transact(&request, NULL);
5087 if (error == EINVAL) {
5088 /* EINVAL probably means that the default qdisc was in use, in which
5089 * case we've accomplished our purpose. */
5092 if (!error && netdev->tc) {
5093 if (netdev->tc->ops->tc_destroy) {
5094 netdev->tc->ops->tc_destroy(netdev->tc);
/* Returns true if RTM_GETQDISC is safe to use on this kernel.  Kernels older
 * than 2.6.35 can OOPS on some RTM_GETQDISC requests (see the comment in
 * tc_query_qdisc()), so this parses uname's release string once and caches
 * the verdict.  Defaults to false when the version cannot be determined. */
5102 getqdisc_is_safe(void)
5104 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5105 static bool safe = false;
5107 if (ovsthread_once_start(&once)) {
5108 struct utsname utsname;
5111 if (uname(&utsname) == -1) {
5112 VLOG_WARN("uname failed (%s)", ovs_strerror(errno));
5113 } else if (!ovs_scan(utsname.release, "%d.%d", &major, &minor)) {
5114 VLOG_WARN("uname reported bad OS release (%s)", utsname.release);
5115 } else if (major < 2 || (major == 2 && minor < 35)) {
5116 VLOG_INFO("disabling unsafe RTM_GETQDISC in Linux kernel %s",
5121 ovsthread_once_done(&once);
5126 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
5127 * kernel to determine what they are. Returns 0 if successful, otherwise a
5128 * positive errno value. */
5130 tc_query_qdisc(const struct netdev *netdev_)
5132 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5133 struct ofpbuf request, *qdisc;
5134 const struct tc_ops *ops;
5135 struct tcmsg *tcmsg;
5143 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
5144 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
5145 * 2.6.35 without that fix backported to it.
5147 * To avoid the OOPS, we must not make a request that would attempt to dump
5148 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
5149 * few others. There are a few ways that I can see to do this, but most of
5150 * them seem to be racy (and if you lose the race the kernel OOPSes). The
5151 * technique chosen here is to assume that any non-default qdisc that we
5152 * create will have a class with handle 1:0. The built-in qdiscs only have
5153 * a class with handle 0:0.
5155 * On Linux 2.6.35+ we use the straightforward method because it allows us
5156 * to handle non-builtin qdiscs without handle 1:0 (e.g. codel). However,
5157 * in such a case we get no response at all from the kernel (!) if a
5158 * builtin qdisc is in use (which is later caught by "!error &&
5159 * !qdisc->size"). */
5160 tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
5164 tcmsg->tcm_handle = tc_make_handle(getqdisc_is_safe() ? 0 : 1, 0);
5165 tcmsg->tcm_parent = getqdisc_is_safe() ? TC_H_ROOT : 0;
5167 /* Figure out what tc class to instantiate. */
5168 error = tc_transact(&request, &qdisc);
5169 if (!error && qdisc->size) {
5172 error = tc_parse_qdisc(qdisc, &kind, NULL);
5174 ops = &tc_ops_other;
5176 ops = tc_lookup_linux_name(kind);
5178 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
5179 VLOG_DBG_RL(&rl2, "unknown qdisc \"%s\"", kind);
/* Unrecognized qdisc name: treat as unmanaged "linux-other". */
5181 ops = &tc_ops_other;
5184 } else if ((!error && !qdisc->size) || error == ENOENT) {
5185 /* Either it's a built-in qdisc, or (on Linux pre-2.6.35) it's a qdisc
5186 * set up by some other entity that doesn't have a handle 1:0. We will
5187 * assume that it's the system default qdisc. */
5188 ops = &tc_ops_default;
5191 /* Who knows? Maybe the device got deleted. */
5192 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
5193 netdev_get_name(netdev_), ovs_strerror(error));
5194 ops = &tc_ops_other;
5197 /* Instantiate it. */
5198 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
5199 ovs_assert((load_error == 0) == (netdev->tc != NULL));
5200 ofpbuf_delete(qdisc);
5202 return error ? error : load_error;
5205 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
5206 approximate the time to transmit packets of various lengths. For an MTU of
5207 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
5208 represents two possible packet lengths; for a MTU of 513 through 1024, four
5209 possible lengths; and so on.
5211 Returns, for the specified 'mtu', the number of bits that packet lengths
5212 need to be shifted right to fit within such a 256-entry table. */
5214 tc_calc_cell_log(unsigned int mtu)
5219 mtu = ETH_PAYLOAD_MAX;
5221 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
5223 for (cell_log = 0; mtu >= 256; cell_log++) {
5230 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
5233 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
5235 memset(rate, 0, sizeof *rate);
5236 rate->cell_log = tc_calc_cell_log(mtu);
5237 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
5238 /* rate->cell_align = 0; */ /* distro headers. */
5239 rate->mpu = ETH_TOTAL_MIN;
/* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
 * attribute of the specified "type".
 *
 * See tc_calc_cell_log() above for a description of "rtab"s. */
static void
tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
{
    uint32_t *rtab;
    unsigned int i;

    rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
    for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
        /* Entry i covers packets of up to (i + 1) << cell_log bytes. */
        unsigned packet_size = (i + 1) << rate->cell_log;
        if (packet_size < rate->mpu) {
            /* Never charge less than the minimum packet unit. */
            packet_size = rate->mpu;
        }
        rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
    }
}
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes'
 * of 0 is fine.) */
static unsigned int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    /* The burst must at least cover one jiffy's worth of traffic plus one
     * full-size packet, or HTB cannot sustain the configured rate. */
    unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
    return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
}
5274 /* Linux-only functions declared in netdev-linux.h */
5276 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
5277 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
5279 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
5280 const char *flag_name, bool enable)
5282 const char *netdev_name = netdev_get_name(netdev);
5283 struct ethtool_value evalue;
5287 COVERAGE_INC(netdev_get_ethtool);
5288 memset(&evalue, 0, sizeof evalue);
5289 error = netdev_linux_do_ethtool(netdev_name,
5290 (struct ethtool_cmd *)&evalue,
5291 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5296 COVERAGE_INC(netdev_set_ethtool);
5297 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
5298 error = netdev_linux_do_ethtool(netdev_name,
5299 (struct ethtool_cmd *)&evalue,
5300 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
5305 COVERAGE_INC(netdev_get_ethtool);
5306 memset(&evalue, 0, sizeof evalue);
5307 error = netdev_linux_do_ethtool(netdev_name,
5308 (struct ethtool_cmd *)&evalue,
5309 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5314 if (new_flags != evalue.data) {
5315 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
5316 "device %s failed", enable ? "enable" : "disable",
5317 flag_name, netdev_name);
5324 /* Utility functions. */
5326 /* Copies 'src' into 'dst', performing format conversion in the process. */
5328 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
5329 const struct rtnl_link_stats *src)
5331 dst->rx_packets = src->rx_packets;
5332 dst->tx_packets = src->tx_packets;
5333 dst->rx_bytes = src->rx_bytes;
5334 dst->tx_bytes = src->tx_bytes;
5335 dst->rx_errors = src->rx_errors;
5336 dst->tx_errors = src->tx_errors;
5337 dst->rx_dropped = src->rx_dropped;
5338 dst->tx_dropped = src->tx_dropped;
5339 dst->multicast = src->multicast;
5340 dst->collisions = src->collisions;
5341 dst->rx_length_errors = src->rx_length_errors;
5342 dst->rx_over_errors = src->rx_over_errors;
5343 dst->rx_crc_errors = src->rx_crc_errors;
5344 dst->rx_frame_errors = src->rx_frame_errors;
5345 dst->rx_fifo_errors = src->rx_fifo_errors;
5346 dst->rx_missed_errors = src->rx_missed_errors;
5347 dst->tx_aborted_errors = src->tx_aborted_errors;
5348 dst->tx_carrier_errors = src->tx_carrier_errors;
5349 dst->tx_fifo_errors = src->tx_fifo_errors;
5350 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
5351 dst->tx_window_errors = src->tx_window_errors;
5354 /* Copies 'src' into 'dst', performing format conversion in the process. */
5356 netdev_stats_from_rtnl_link_stats64(struct netdev_stats *dst,
5357 const struct rtnl_link_stats64 *src)
5359 dst->rx_packets = src->rx_packets;
5360 dst->tx_packets = src->tx_packets;
5361 dst->rx_bytes = src->rx_bytes;
5362 dst->tx_bytes = src->tx_bytes;
5363 dst->rx_errors = src->rx_errors;
5364 dst->tx_errors = src->tx_errors;
5365 dst->rx_dropped = src->rx_dropped;
5366 dst->tx_dropped = src->tx_dropped;
5367 dst->multicast = src->multicast;
5368 dst->collisions = src->collisions;
5369 dst->rx_length_errors = src->rx_length_errors;
5370 dst->rx_over_errors = src->rx_over_errors;
5371 dst->rx_crc_errors = src->rx_crc_errors;
5372 dst->rx_frame_errors = src->rx_frame_errors;
5373 dst->rx_fifo_errors = src->rx_fifo_errors;
5374 dst->rx_missed_errors = src->rx_missed_errors;
5375 dst->tx_aborted_errors = src->tx_aborted_errors;
5376 dst->tx_carrier_errors = src->tx_carrier_errors;
5377 dst->tx_fifo_errors = src->tx_fifo_errors;
5378 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
5379 dst->tx_window_errors = src->tx_window_errors;
5383 get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
5385 struct ofpbuf request;
5386 struct ofpbuf *reply;
5389 ofpbuf_init(&request, 0);
5390 nl_msg_put_nlmsghdr(&request,
5391 sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
5392 RTM_GETLINK, NLM_F_REQUEST);
5393 ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
5394 nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
5395 error = nl_transact(NETLINK_ROUTE, &request, &reply);
5396 ofpbuf_uninit(&request);
5401 if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
5402 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS64);
5403 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats64)) {
5404 netdev_stats_from_rtnl_link_stats64(stats, nl_attr_get(a));
5407 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS);
5408 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
5409 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
5412 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
5417 VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");
5422 ofpbuf_delete(reply);
5427 get_flags(const struct netdev *dev, unsigned int *flags)
5433 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
5435 *flags = ifr.ifr_flags;
5441 set_flags(const char *name, unsigned int flags)
5445 ifr.ifr_flags = flags;
5446 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
5450 do_get_ifindex(const char *netdev_name)
5455 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5456 COVERAGE_INC(netdev_get_ifindex);
5458 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
5460 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
5461 netdev_name, ovs_strerror(error));
5464 return ifr.ifr_ifindex;
5468 get_ifindex(const struct netdev *netdev_, int *ifindexp)
5470 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5472 if (!(netdev->cache_valid & VALID_IFINDEX)) {
5473 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
5476 netdev->get_ifindex_error = -ifindex;
5477 netdev->ifindex = 0;
5479 netdev->get_ifindex_error = 0;
5480 netdev->ifindex = ifindex;
5482 netdev->cache_valid |= VALID_IFINDEX;
5485 *ifindexp = netdev->ifindex;
5486 return netdev->get_ifindex_error;
5490 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
5496 memset(&ifr, 0, sizeof ifr);
5497 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5498 COVERAGE_INC(netdev_get_hwaddr);
5499 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
5501 /* ENODEV probably means that a vif disappeared asynchronously and
5502 * hasn't been removed from the database yet, so reduce the log level
5503 * to INFO for that case. */
5504 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
5505 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
5506 netdev_name, ovs_strerror(error));
5509 hwaddr_family = ifr.ifr_hwaddr.sa_family;
5510 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
5511 VLOG_WARN("%s device has unknown hardware address family %d",
5512 netdev_name, hwaddr_family);
5514 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
5519 set_etheraddr(const char *netdev_name,
5520 const uint8_t mac[ETH_ADDR_LEN])
5525 memset(&ifr, 0, sizeof ifr);
5526 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
5527 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
5528 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
5529 COVERAGE_INC(netdev_set_hwaddr);
5530 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
5532 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
5533 netdev_name, ovs_strerror(error));
5539 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
5540 int cmd, const char *cmd_name)
5545 memset(&ifr, 0, sizeof ifr);
5546 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
5547 ifr.ifr_data = (caddr_t) ecmd;
5550 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
5552 if (error != EOPNOTSUPP) {
5553 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
5554 "failed: %s", cmd_name, name, ovs_strerror(error));
5556 /* The device doesn't support this operation. That's pretty
5557 * common, so there's no point in logging anything. */
5564 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
5565 int cmd, const char *cmd_name)
5570 ifr.ifr_addr.sa_family = AF_INET;
5571 error = af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
5573 const struct sockaddr_in *sin = ALIGNED_CAST(struct sockaddr_in *,
5575 *ip = sin->sin_addr;
5580 /* Returns an AF_PACKET raw socket or a negative errno value. */
5582 af_packet_sock(void)
5584 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5587 if (ovsthread_once_start(&once)) {
5588 sock = socket(AF_PACKET, SOCK_RAW, 0);
5590 int error = set_nonblocking(sock);
5597 VLOG_ERR("failed to create packet socket: %s",
5598 ovs_strerror(errno));
5600 ovsthread_once_done(&once);