#include <string.h>
#include <unistd.h>
-#include "connectivity.h"
#include "coverage.h"
-#include "dpif-linux.h"
+#include "dpif-netlink.h"
+#include "dpif-netdev.h"
#include "dynamic-string.h"
#include "fatal-signal.h"
#include "hash.h"
#include "ofpbuf.h"
#include "openflow/openflow.h"
#include "ovs-atomic.h"
+#include "packet-dpif.h"
#include "packets.h"
#include "poll-loop.h"
#include "rtnetlink-link.h"
-#include "seq.h"
#include "shash.h"
#include "socket-util.h"
#include "sset.h"
uint16_t tp_vlan_tpid;
};
+/* Linux 2.6.35 introduced IFLA_STATS64 and rtnl_link_stats64.
+ *
+ * Tests for rtnl_link_stats64 don't seem to consistently work, e.g. on
+ * 2.6.32-431.29.2.el6.x86_64 (see report at
+ * http://openvswitch.org/pipermail/dev/2014-October/047978.html). Maybe
+ * if_link.h is not self-contained on those kernels. It is easiest to
+ * unconditionally define a replacement.
+ *
+ * IFLA_STATS64 is defined here only when the headers included so far did not
+ * already provide it. The struct, by contrast, is always declared locally:
+ * the 'rtnl_link_stats64' macro below rewrites the tag to
+ * 'rpl_rtnl_link_stats64', so code in this file can keep writing
+ * 'struct rtnl_link_stats64' while actually naming the local declaration.
+ *
+ * NOTE(review): get_stats_via_netlink() interprets the payload of an
+ * IFLA_STATS64 attribute through this type, so the member order and 64-bit
+ * widths presumably have to match the kernel's own struct exactly -- confirm
+ * against <linux/if_link.h> before reordering or adding members. */
+#ifndef IFLA_STATS64
+#define IFLA_STATS64 23
+#endif
+#define rtnl_link_stats64 rpl_rtnl_link_stats64
+struct rtnl_link_stats64 {
+ uint64_t rx_packets;
+ uint64_t tx_packets;
+ uint64_t rx_bytes;
+ uint64_t tx_bytes;
+ uint64_t rx_errors;
+ uint64_t tx_errors;
+ uint64_t rx_dropped;
+ uint64_t tx_dropped;
+ uint64_t multicast;
+ uint64_t collisions;
+
+ uint64_t rx_length_errors;
+ uint64_t rx_over_errors;
+ uint64_t rx_crc_errors;
+ uint64_t rx_frame_errors;
+ uint64_t rx_fifo_errors;
+ uint64_t rx_missed_errors;
+
+ uint64_t tx_aborted_errors;
+ uint64_t tx_carrier_errors;
+ uint64_t tx_fifo_errors;
+ uint64_t tx_heartbeat_errors;
+ uint64_t tx_window_errors;
+
+ uint64_t rx_compressed; /* Not copied by netdev_stats_from_rtnl_link_stats64(). */
+ uint64_t tx_compressed; /* Not copied by netdev_stats_from_rtnl_link_stats64(). */
+};
+
enum {
VALID_IFINDEX = 1 << 0,
VALID_ETHERADDR = 1 << 1,
int tap_fd;
};
-struct netdev_rx_linux {
- struct netdev_rx up;
+struct netdev_rxq_linux {
+ struct netdev_rxq up; /* Generic part; netdev_rxq_linux_cast() maps it back. */
bool is_tap; /* Selects the tap read path instead of the socket path on receive. */
int fd; /* Read from and polled for POLLIN; closed on destruct only if !is_tap. */
};
/* Polling miimon status for all ports causes performance degradation when
* handling a large number of ports. If there are no devices using miimon, then
- * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait(). */
-static atomic_int miimon_cnt = ATOMIC_VAR_INIT(0);
+ * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
+ *
+ * Readers do not depend on this variable synchronizing with the related
+ * changes in the device miimon status, so we can use atomic_count. */
+static atomic_count miimon_cnt = ATOMIC_COUNT_INIT(0); /* Number of devices
+ * whose miimon_interval is currently nonzero: incremented when a device's
+ * interval goes from zero to nonzero, decremented on the reverse change and
+ * when a device with a positive interval is freed. */
static void netdev_linux_run(void);
static bool netdev_linux_miimon_enabled(void);
static void netdev_linux_miimon_run(void);
static void netdev_linux_miimon_wait(void);
+static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup);
static bool
is_netdev_linux_class(const struct netdev_class *netdev_class)
return CONTAINER_OF(netdev, struct netdev_linux, up);
}
/* Converts the generic receive-queue handle 'rx' into the Linux-specific
 * structure that embeds it as its 'up' member.
 *
 * Asserts that the class of rx->netdev satisfies is_netdev_linux_class()
 * before performing the CONTAINER_OF conversion. */
-static struct netdev_rx_linux *
-netdev_rx_linux_cast(const struct netdev_rx *rx)
+static struct netdev_rxq_linux *
+netdev_rxq_linux_cast(const struct netdev_rxq *rx)
{
ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
- return CONTAINER_OF(rx, struct netdev_rx_linux, up);
+ return CONTAINER_OF(rx, struct netdev_rxq_linux, up);
}
\f
static void netdev_linux_update(struct netdev_linux *netdev,
/* Returns true if the global 'miimon_cnt' counter is greater than zero, that
 * is, if at least one device is currently counted as using miimon. */
static bool
netdev_linux_miimon_enabled(void)
{
- int miimon;
-
- atomic_read(&miimon_cnt, &miimon);
- return miimon > 0;
+ return atomic_count_get(&miimon_cnt) > 0;
}
static void
unsigned int ifi_flags, unsigned int mask)
OVS_REQUIRES(dev->mutex)
{
- seq_change(connectivity_seq_get());
+ netdev_change_seq_changed(&dev->up);
if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
dev->carrier_resets++;
}
if (netdev->miimon_interval > 0) {
- int junk;
- atomic_sub(&miimon_cnt, 1, &junk);
+ atomic_count_dec(&miimon_cnt);
}
ovs_mutex_destroy(&netdev->mutex);
free(netdev);
}
/* Allocates a Linux receive-queue structure with xzalloc() and returns a
 * pointer to its embedded generic 'up' member. The matching release is
 * netdev_linux_rxq_dealloc(). */
-static struct netdev_rx *
-netdev_linux_rx_alloc(void)
+static struct netdev_rxq *
+netdev_linux_rxq_alloc(void)
{
- struct netdev_rx_linux *rx = xzalloc(sizeof *rx);
+ struct netdev_rxq_linux *rx = xzalloc(sizeof *rx);
return &rx->up;
}
static int
-netdev_linux_rx_construct(struct netdev_rx *rx_)
+netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
{
- struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
+ struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
struct netdev *netdev_ = rx->up.netdev;
struct netdev_linux *netdev = netdev_linux_cast(netdev_);
int error;
}
static void
-netdev_linux_rx_destruct(struct netdev_rx *rx_)
+netdev_linux_rxq_destruct(struct netdev_rxq *rxq_)
{
- struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
+ struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
if (!rx->is_tap) {
close(rx->fd);
}
/* Frees the Linux receive-queue structure that contains 'rxq_'. Only the
 * memory is released here; the descriptor is handled by the destruct step. */
static void
-netdev_linux_rx_dealloc(struct netdev_rx *rx_)
+netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_)
{
- struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
+ struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
free(rx);
}
}
static int
-netdev_linux_rx_recv_sock(int fd, struct ofpbuf *buffer)
+netdev_linux_rxq_recv_sock(int fd, struct ofpbuf *buffer)
{
size_t size;
ssize_t retval;
ofpbuf_reserve(buffer, VLAN_HEADER_LEN);
size = ofpbuf_tailroom(buffer);
- iov.iov_base = buffer->data;
+ iov.iov_base = ofpbuf_data(buffer);
iov.iov_len = size;
msgh.msg_name = NULL;
msgh.msg_namelen = 0;
return EMSGSIZE;
}
- buffer->size += retval;
+ ofpbuf_set_size(buffer, ofpbuf_size(buffer) + retval);
for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg; cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
const struct tpacket_auxdata *aux;
}
static int
-netdev_linux_rx_recv_tap(int fd, struct ofpbuf *buffer)
+netdev_linux_rxq_recv_tap(int fd, struct ofpbuf *buffer)
{
ssize_t retval;
size_t size = ofpbuf_tailroom(buffer);
do {
- retval = read(fd, buffer->data, size);
+ retval = read(fd, ofpbuf_data(buffer), size);
} while (retval < 0 && errno == EINTR);
if (retval < 0) {
return EMSGSIZE;
}
- buffer->size += retval;
+ ofpbuf_set_size(buffer, ofpbuf_size(buffer) + retval);
return 0;
}
+/* Attempts to receive a single packet from 'rxq_'.
+ *
+ * A packet buffer is allocated with room for a VLAN Ethernet header plus the
+ * device MTU (ETH_PAYLOAD_MAX when the MTU cannot be obtained) and with
+ * DP_NETDEV_HEADROOM of headroom. It is then filled from the tap descriptor
+ * or from the packet socket, depending on rx->is_tap.
+ *
+ * On success, pads the packet, clears its dp_hash, stores it in 'packets[0]',
+ * sets '*c' to 1 and returns 0. On failure, deletes the buffer and returns
+ * the nonzero value reported by the receive helper without touching
+ * 'packets' or '*c'; values other than EAGAIN and EMSGSIZE are also logged
+ * through VLOG_WARN_RL. */
static int
-netdev_linux_rx_recv(struct netdev_rx *rx_, struct ofpbuf *buffer)
+netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dpif_packet **packets,
+ int *c)
{
- struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
- int retval;
+ struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
+ struct netdev *netdev = rx->up.netdev;
+ struct dpif_packet *packet;
+ struct ofpbuf *buffer;
+ ssize_t retval;
+ int mtu;
+
+ /* Fall back to the largest Ethernet payload if the MTU is unavailable. */
+ if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
+ mtu = ETH_PAYLOAD_MAX;
+ }
+
+ packet = dpif_packet_new_with_headroom(VLAN_ETH_HEADER_LEN + mtu,
+ DP_NETDEV_HEADROOM);
+ buffer = &packet->ofpbuf;
retval = (rx->is_tap
- ? netdev_linux_rx_recv_tap(rx->fd, buffer)
- : netdev_linux_rx_recv_sock(rx->fd, buffer));
- if (retval && retval != EAGAIN && retval != EMSGSIZE) {
- VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
- ovs_strerror(errno), netdev_rx_get_name(rx_));
+ ? netdev_linux_rxq_recv_tap(rx->fd, buffer)
+ : netdev_linux_rxq_recv_sock(rx->fd, buffer));
+
+ if (retval) {
+ if (retval != EAGAIN && retval != EMSGSIZE) {
+ /* Argument order follows the format string: device name first,
+ * then the error text. */
+ VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
+ netdev_rxq_get_name(rxq_), ovs_strerror(errno));
+ }
+ /* Nothing was handed to the caller, so release the buffer here. */
+ dpif_packet_delete(packet);
+ } else {
+ dp_packet_pad(buffer);
+ dpif_packet_set_dp_hash(packet, 0);
+ packets[0] = packet;
+ *c = 1;
}
return retval;
}
/* Calls poll_fd_wait() on the receive queue's descriptor for POLLIN, i.e. asks
 * to be woken when 'rxq_' has data to read. */
static void
-netdev_linux_rx_wait(struct netdev_rx *rx_)
+netdev_linux_rxq_wait(struct netdev_rxq *rxq_)
{
- struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
+ struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
poll_fd_wait(rx->fd, POLLIN);
}
static int
-netdev_linux_rx_drain(struct netdev_rx *rx_)
+netdev_linux_rxq_drain(struct netdev_rxq *rxq_)
{
- struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
+ struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
if (rx->is_tap) {
struct ifreq ifr;
- int error = af_inet_ifreq_ioctl(netdev_rx_get_name(rx_), &ifr,
+ int error = af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_), &ifr,
SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
if (error) {
return error;
* The kernel maintains a packet transmission queue, so the caller is not
* expected to do additional queuing of packets. */
static int
-netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
+netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED,
+ struct dpif_packet **pkts, int cnt, bool may_steal)
{
- for (;;) {
+ int i;
+ int error = 0;
+
+ /* 'i' is incremented only if there's no error */
+ for (i = 0; i < cnt;) {
+ const void *data = ofpbuf_data(&pkts[i]->ofpbuf);
+ size_t size = ofpbuf_size(&pkts[i]->ofpbuf);
ssize_t retval;
if (!is_tap_netdev(netdev_)) {
/* The Linux AF_PACKET implementation never blocks waiting for room
* for packets, instead returning ENOBUFS. Translate this into
* EAGAIN for the caller. */
- if (errno == ENOBUFS) {
- return EAGAIN;
- } else if (errno == EINTR) {
+ error = errno == ENOBUFS ? EAGAIN : errno;
+ if (error == EINTR) {
+ /* continue without incrementing 'i', i.e. retry this packet */
continue;
- } else if (errno != EAGAIN) {
- VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
- netdev_get_name(netdev_), ovs_strerror(errno));
}
- return errno;
+ break;
} else if (retval != size) {
- VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE"d bytes of "
- "%"PRIuSIZE") on %s", retval, size, netdev_get_name(netdev_));
- return EMSGSIZE;
- } else {
- return 0;
+ VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE" bytes"
+ " of %"PRIuSIZE") on %s", retval, size,
+ netdev_get_name(netdev_));
+ error = EMSGSIZE;
+ break;
}
+
+ /* Process the next packet in the batch */
+ i++;
+ }
+
+ if (may_steal) {
+ for (i = 0; i < cnt; i++) {
+ dpif_packet_delete(pkts[i]);
+ }
+ }
+
+ if (error && error != EAGAIN) {
+ VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
+ netdev_get_name(netdev_), ovs_strerror(error));
}
+
+ return error;
+
}
/* Registers with the poll loop to wake up from the next call to poll_block()
* expected to do additional queuing of packets. Thus, this function is
* unlikely to ever be used. It is included for completeness. */
static void
-netdev_linux_send_wait(struct netdev *netdev)
+netdev_linux_send_wait(struct netdev *netdev, int qid OVS_UNUSED)
{
if (is_tap_netdev(netdev)) {
/* TAP device always accepts packets.*/
ovs_mutex_lock(&netdev->mutex);
interval = interval > 0 ? MAX(interval, 100) : 0;
if (netdev->miimon_interval != interval) {
- int junk;
-
if (interval && !netdev->miimon_interval) {
- atomic_add(&miimon_cnt, 1, &junk);
+ atomic_count_inc(&miimon_cnt);
} else if (!interval && netdev->miimon_interval) {
- atomic_sub(&miimon_cnt, 1, &junk);
+ atomic_count_dec(&miimon_cnt);
}
netdev->miimon_interval = interval;
netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
const struct ovs_vport_stats *src)
{
- dst->rx_packets = get_unaligned_u64(&src->rx_packets);
- dst->tx_packets = get_unaligned_u64(&src->tx_packets);
- dst->rx_bytes = get_unaligned_u64(&src->rx_bytes);
- dst->tx_bytes = get_unaligned_u64(&src->tx_bytes);
- dst->rx_errors = get_unaligned_u64(&src->rx_errors);
- dst->tx_errors = get_unaligned_u64(&src->tx_errors);
- dst->rx_dropped = get_unaligned_u64(&src->rx_dropped);
- dst->tx_dropped = get_unaligned_u64(&src->tx_dropped);
+ dst->rx_packets = get_32aligned_u64(&src->rx_packets);
+ dst->tx_packets = get_32aligned_u64(&src->tx_packets);
+ dst->rx_bytes = get_32aligned_u64(&src->rx_bytes);
+ dst->tx_bytes = get_32aligned_u64(&src->tx_bytes);
+ dst->rx_errors = get_32aligned_u64(&src->rx_errors);
+ dst->tx_errors = get_32aligned_u64(&src->tx_errors);
+ dst->rx_dropped = get_32aligned_u64(&src->rx_dropped);
+ dst->tx_dropped = get_32aligned_u64(&src->tx_dropped);
dst->multicast = 0;
dst->collisions = 0;
dst->rx_length_errors = 0;
static int
get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
{
- struct dpif_linux_vport reply;
+ struct dpif_netlink_vport reply;
struct ofpbuf *buf;
int error;
- error = dpif_linux_vport_get(netdev_get_name(netdev), &reply, &buf);
+ error = dpif_netlink_vport_get(netdev_get_name(netdev), &reply, &buf);
if (error) {
return error;
} else if (!reply.stats) {
error = 0;
}
} else if (netdev->vport_stats_error) {
- /* stats not available from OVS then use ioctl stats. */
+ /* Stats are not available from OVS, so use the netdev stats. */
*stats = dev_stats;
} else {
+ /* Use kernel netdev's packet and byte counts since vport's counters
+ * do not reflect packet counts on the wire when GSO, TSO or GRO are
+ * enabled. */
+ stats->rx_packets = dev_stats.rx_packets;
+ stats->rx_bytes = dev_stats.rx_bytes;
+ stats->tx_packets = dev_stats.tx_packets;
+ stats->tx_bytes = dev_stats.tx_bytes;
+
stats->rx_errors += dev_stats.rx_errors;
stats->tx_errors += dev_stats.tx_errors;
stats->rx_dropped += dev_stats.rx_dropped;
stats->tx_heartbeat_errors = 0;
stats->tx_window_errors = 0;
} else {
+ /* Use kernel netdev's packet and byte counts since vport counters
+ * do not reflect packet counts on the wire when GSO, TSO or GRO
+ * are enabled. */
+ stats->rx_packets = dev_stats.tx_packets;
+ stats->rx_bytes = dev_stats.tx_bytes;
+ stats->tx_packets = dev_stats.rx_packets;
+ stats->tx_bytes = dev_stats.rx_bytes;
+
stats->rx_dropped += dev_stats.tx_dropped;
stats->tx_dropped += dev_stats.rx_dropped;
return error;
}
-static int
-netdev_internal_set_stats(struct netdev *netdev,
- const struct netdev_stats *stats)
-{
- struct ovs_vport_stats vport_stats;
- struct dpif_linux_vport vport;
- int err;
-
- vport_stats.rx_packets = stats->rx_packets;
- vport_stats.tx_packets = stats->tx_packets;
- vport_stats.rx_bytes = stats->rx_bytes;
- vport_stats.tx_bytes = stats->tx_bytes;
- vport_stats.rx_errors = stats->rx_errors;
- vport_stats.tx_errors = stats->tx_errors;
- vport_stats.rx_dropped = stats->rx_dropped;
- vport_stats.tx_dropped = stats->tx_dropped;
-
- dpif_linux_vport_init(&vport);
- vport.cmd = OVS_VPORT_CMD_SET;
- vport.name = netdev_get_name(netdev);
- vport.stats = &vport_stats;
-
- err = dpif_linux_vport_transact(&vport, NULL, NULL);
-
- /* If the vport layer doesn't know about the device, that doesn't mean it
- * doesn't exist (after all were able to open it when netdev_open() was
- * called), it just means that it isn't attached and we'll be getting
- * stats a different way. */
- if (err == ENODEV) {
- err = EOPNOTSUPP;
- }
-
- return err;
-}
-
static void
netdev_linux_read_features(struct netdev_linux *netdev)
{
return error;
}
-#define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS, SET_STATS, \
+#define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS, \
GET_FEATURES, GET_STATUS) \
{ \
NAME, \
NULL, /* get_config */ \
NULL, /* set_config */ \
NULL, /* get_tunnel_config */ \
+ NULL, /* build header */ \
+ NULL, /* push header */ \
+ NULL, /* pop header */ \
+ NULL, /* get_numa_id */ \
+ NULL, /* set_multiq */ \
\
netdev_linux_send, \
netdev_linux_send_wait, \
netdev_linux_get_carrier_resets, \
netdev_linux_set_miimon_interval, \
GET_STATS, \
- SET_STATS, \
\
GET_FEATURES, \
netdev_linux_set_advertisements, \
\
netdev_linux_update_flags, \
\
- netdev_linux_rx_alloc, \
- netdev_linux_rx_construct, \
- netdev_linux_rx_destruct, \
- netdev_linux_rx_dealloc, \
- netdev_linux_rx_recv, \
- netdev_linux_rx_wait, \
- netdev_linux_rx_drain, \
+ netdev_linux_rxq_alloc, \
+ netdev_linux_rxq_construct, \
+ netdev_linux_rxq_destruct, \
+ netdev_linux_rxq_dealloc, \
+ netdev_linux_rxq_recv, \
+ netdev_linux_rxq_wait, \
+ netdev_linux_rxq_drain, \
}
const struct netdev_class netdev_linux_class =
"system",
netdev_linux_construct,
netdev_linux_get_stats,
- NULL, /* set_stats */
netdev_linux_get_features,
netdev_linux_get_status);
"tap",
netdev_linux_construct_tap,
netdev_tap_get_stats,
- NULL, /* set_stats */
netdev_linux_get_features,
netdev_linux_get_status);
"internal",
netdev_linux_construct,
netdev_internal_get_stats,
- netdev_internal_set_stats,
NULL, /* get_features */
netdev_internal_get_status);
\f
memset(&tc_police, 0, sizeof tc_police);
tc_police.action = TC_POLICE_SHOT;
tc_police.mtu = mtu;
- tc_fill_rate(&tc_police.rate, (kbits_rate * 1000)/8, mtu);
+ tc_fill_rate(&tc_police.rate, ((uint64_t) kbits_rate * 1000)/8, mtu);
tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
kbits_burst * 1024);
dst->tx_window_errors = src->tx_window_errors;
}
+/* Copies 'src' into 'dst', performing format conversion in the process.
+ *
+ * Every counter assigned below is taken from the same-named member of 'src'.
+ * The 'rx_compressed' and 'tx_compressed' members of 'src' are not copied,
+ * and any member of 'dst' that is not assigned here is left as the caller
+ * provided it. */
+static void
+netdev_stats_from_rtnl_link_stats64(struct netdev_stats *dst,
+ const struct rtnl_link_stats64 *src)
+{
+ dst->rx_packets = src->rx_packets;
+ dst->tx_packets = src->tx_packets;
+ dst->rx_bytes = src->rx_bytes;
+ dst->tx_bytes = src->tx_bytes;
+ dst->rx_errors = src->rx_errors;
+ dst->tx_errors = src->tx_errors;
+ dst->rx_dropped = src->rx_dropped;
+ dst->tx_dropped = src->tx_dropped;
+ dst->multicast = src->multicast;
+ dst->collisions = src->collisions;
+ dst->rx_length_errors = src->rx_length_errors;
+ dst->rx_over_errors = src->rx_over_errors;
+ dst->rx_crc_errors = src->rx_crc_errors;
+ dst->rx_frame_errors = src->rx_frame_errors;
+ dst->rx_fifo_errors = src->rx_fifo_errors;
+ dst->rx_missed_errors = src->rx_missed_errors;
+ dst->tx_aborted_errors = src->tx_aborted_errors;
+ dst->tx_carrier_errors = src->tx_carrier_errors;
+ dst->tx_fifo_errors = src->tx_fifo_errors;
+ dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
+ dst->tx_window_errors = src->tx_window_errors;
+}
+
static int
get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
{
}
if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
- const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS);
- if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
- netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
+ const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS64);
+ if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats64)) {
+ netdev_stats_from_rtnl_link_stats64(stats, nl_attr_get(a));
error = 0;
} else {
- VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
- error = EPROTO;
+ const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS);
+ if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
+ netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
+ error = 0;
+ } else {
+ VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
+ error = EPROTO;
+ }
}
} else {
VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");