#include <net/sock.h>
#include <linux/seq_file.h>
#include <linux/uio.h>
+#include <linux/skb_array.h>
#include <asm/uaccess.h>
#define TUN_FLOW_EXPIRE (3 * HZ)
+struct tun_pcpu_stats {
+ u64 rx_packets;
+ u64 rx_bytes;
+ u64 tx_packets;
+ u64 tx_bytes;
+ struct u64_stats_sync syncp;
+ u32 rx_dropped;
+ u32 tx_dropped;
+ u32 rx_frame_errors;
+};
+
/* A tun_file connects an open character device to a tuntap netdevice. It
* also contains all socket related structures (except sock_fprog and tap_filter)
* to serve as one transmit queue for tuntap device. The sock_fprog and
};
struct list_head next;
struct tun_struct *detached;
+ struct skb_array tx_array;
};
struct tun_flow_entry {
struct list_head disabled;
void *security;
u32 flow_count;
+ struct tun_pcpu_stats __percpu *pcpu_stats;
};
#ifdef CONFIG_TUN_VNET_CROSS_LE
static void tun_queue_purge(struct tun_file *tfile)
{
- skb_queue_purge(&tfile->sk.sk_receive_queue);
+ struct sk_buff *skb;
+
+ while ((skb = skb_array_consume(&tfile->tx_array)) != NULL)
+ kfree_skb(skb);
+
skb_queue_purge(&tfile->sk.sk_error_queue);
}
tun->dev->reg_state == NETREG_REGISTERED)
unregister_netdevice(tun->dev);
}
+ if (tun)
+ skb_array_cleanup(&tfile->tx_array);
sock_put(&tfile->sk);
}
}
for (i = 0; i < n; i++) {
tfile = rtnl_dereference(tun->tfiles[i]);
BUG_ON(!tfile);
+ tfile->socket.sk->sk_shutdown = RCV_SHUTDOWN;
tfile->socket.sk->sk_data_ready(tfile->socket.sk);
RCU_INIT_POINTER(tfile->tun, NULL);
--tun->numqueues;
}
list_for_each_entry(tfile, &tun->disabled, next) {
+ tfile->socket.sk->sk_shutdown = RCV_SHUTDOWN;
tfile->socket.sk->sk_data_ready(tfile->socket.sk);
RCU_INIT_POINTER(tfile->tun, NULL);
}
static int tun_attach(struct tun_struct *tun, struct file *file, bool skip_filter)
{
struct tun_file *tfile = file->private_data;
+ struct net_device *dev = tun->dev;
int err;
err = security_tun_dev_attach(tfile->socket.sk, tun->security);
/* Re-attach the filter to persist device */
if (!skip_filter && (tun->filter_attached == true)) {
- err = __sk_attach_filter(&tun->fprog, tfile->socket.sk,
- lockdep_rtnl_is_held());
+ lock_sock(tfile->socket.sk);
+ err = sk_attach_filter(&tun->fprog, tfile->socket.sk);
+ release_sock(tfile->socket.sk);
if (!err)
goto out;
}
+
+ if (!tfile->detached &&
+ skb_array_init(&tfile->tx_array, dev->tx_queue_len, GFP_KERNEL)) {
+ err = -ENOMEM;
+ goto out;
+ }
+
tfile->queue_index = tun->numqueues;
+ tfile->socket.sk->sk_shutdown &= ~RCV_SHUTDOWN;
rcu_assign_pointer(tfile->tun, tun);
rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile);
tun->numqueues++;
if (txq >= numqueues)
goto drop;
- if (numqueues == 1) {
+#ifdef CONFIG_RPS
+ if (numqueues == 1 && static_key_false(&rps_needed)) {
/* Select queue was not called for the skbuff, so we extract the
* RPS hash and save it into the flow_table here.
*/
tun_flow_save_rps_rxhash(e, rxhash);
}
}
+#endif
tun_debug(KERN_INFO, tun, "tun_net_xmit %d\n", skb->len);
goto drop;
if (skb->sk && sk_fullsock(skb->sk)) {
- sock_tx_timestamp(skb->sk, &skb_shinfo(skb)->tx_flags);
+ sock_tx_timestamp(skb->sk, skb->sk->sk_tsflags,
+ &skb_shinfo(skb)->tx_flags);
sw_tx_timestamp(skb);
}
nf_reset(skb);
- /* Enqueue packet */
- skb_queue_tail(&tfile->socket.sk->sk_receive_queue, skb);
+ if (skb_array_produce(&tfile->tx_array, skb))
+ goto drop;
/* Notify and wake up reader process */
if (tfile->flags & TUN_FASYNC)
return NETDEV_TX_OK;
drop:
- dev->stats.tx_dropped++;
+ this_cpu_inc(tun->pcpu_stats->tx_dropped);
skb_tx_error(skb);
kfree_skb(skb);
rcu_read_unlock();
tun->align = new_hr;
}
+static struct rtnl_link_stats64 *
+tun_net_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
+{
+ u32 rx_dropped = 0, tx_dropped = 0, rx_frame_errors = 0;
+ struct tun_struct *tun = netdev_priv(dev);
+ struct tun_pcpu_stats *p;
+ int i;
+
+ for_each_possible_cpu(i) {
+ u64 rxpackets, rxbytes, txpackets, txbytes;
+ unsigned int start;
+
+ p = per_cpu_ptr(tun->pcpu_stats, i);
+ do {
+ start = u64_stats_fetch_begin(&p->syncp);
+ rxpackets = p->rx_packets;
+ rxbytes = p->rx_bytes;
+ txpackets = p->tx_packets;
+ txbytes = p->tx_bytes;
+ } while (u64_stats_fetch_retry(&p->syncp, start));
+
+ stats->rx_packets += rxpackets;
+ stats->rx_bytes += rxbytes;
+ stats->tx_packets += txpackets;
+ stats->tx_bytes += txbytes;
+
+ /* u32 counters */
+ rx_dropped += p->rx_dropped;
+ rx_frame_errors += p->rx_frame_errors;
+ tx_dropped += p->tx_dropped;
+ }
+ stats->rx_dropped = rx_dropped;
+ stats->rx_frame_errors = rx_frame_errors;
+ stats->tx_dropped = tx_dropped;
+ return stats;
+}
+
static const struct net_device_ops tun_netdev_ops = {
.ndo_uninit = tun_net_uninit,
.ndo_open = tun_net_open,
.ndo_poll_controller = tun_poll_controller,
#endif
.ndo_set_rx_headroom = tun_set_headroom,
+ .ndo_get_stats64 = tun_net_get_stats64,
};
static const struct net_device_ops tap_netdev_ops = {
#endif
.ndo_features_check = passthru_features_check,
.ndo_set_rx_headroom = tun_set_headroom,
+ .ndo_get_stats64 = tun_net_get_stats64,
};
static void tun_flow_init(struct tun_struct *tun)
poll_wait(file, sk_sleep(sk), wait);
- if (!skb_queue_empty(&sk->sk_receive_queue))
+ if (!skb_array_empty(&tfile->tx_array))
mask |= POLLIN | POLLRDNORM;
if (sock_writeable(sk) ||
size_t total_len = iov_iter_count(from);
size_t len = total_len, align = tun->align, linear;
struct virtio_net_hdr gso = { 0 };
+ struct tun_pcpu_stats *stats;
int good_linear;
int copylen;
bool zerocopy = false;
skb = tun_alloc_skb(tfile, align, copylen, linear, noblock);
if (IS_ERR(skb)) {
if (PTR_ERR(skb) != -EAGAIN)
- tun->dev->stats.rx_dropped++;
+ this_cpu_inc(tun->pcpu_stats->rx_dropped);
return PTR_ERR(skb);
}
}
if (err) {
- tun->dev->stats.rx_dropped++;
+ this_cpu_inc(tun->pcpu_stats->rx_dropped);
kfree_skb(skb);
return -EFAULT;
}
- if (gso.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
- if (!skb_partial_csum_set(skb, tun16_to_cpu(tun, gso.csum_start),
- tun16_to_cpu(tun, gso.csum_offset))) {
- tun->dev->stats.rx_frame_errors++;
- kfree_skb(skb);
- return -EINVAL;
- }
+ err = virtio_net_hdr_to_skb(skb, &gso, tun_is_little_endian(tun));
+ if (err) {
+ this_cpu_inc(tun->pcpu_stats->rx_frame_errors);
+ kfree_skb(skb);
+ return -EINVAL;
}
switch (tun->flags & TUN_TYPE_MASK) {
pi.proto = htons(ETH_P_IPV6);
break;
default:
- tun->dev->stats.rx_dropped++;
+ this_cpu_inc(tun->pcpu_stats->rx_dropped);
kfree_skb(skb);
return -EINVAL;
}
break;
}
- if (gso.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
- pr_debug("GSO!\n");
- switch (gso.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
- case VIRTIO_NET_HDR_GSO_TCPV4:
- skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
- break;
- case VIRTIO_NET_HDR_GSO_TCPV6:
- skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6;
- break;
- case VIRTIO_NET_HDR_GSO_UDP:
- skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
- break;
- default:
- tun->dev->stats.rx_frame_errors++;
- kfree_skb(skb);
- return -EINVAL;
- }
-
- if (gso.gso_type & VIRTIO_NET_HDR_GSO_ECN)
- skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
-
- skb_shinfo(skb)->gso_size = tun16_to_cpu(tun, gso.gso_size);
- if (skb_shinfo(skb)->gso_size == 0) {
- tun->dev->stats.rx_frame_errors++;
- kfree_skb(skb);
- return -EINVAL;
- }
-
- /* Header must be checked, and gso_segs computed. */
- skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
- skb_shinfo(skb)->gso_segs = 0;
- }
-
/* copy skb_ubuf_info for callback when skb has no error */
if (zerocopy) {
skb_shinfo(skb)->destructor_arg = msg_control;
rxhash = skb_get_hash(skb);
netif_rx_ni(skb);
- tun->dev->stats.rx_packets++;
- tun->dev->stats.rx_bytes += len;
+ stats = get_cpu_ptr(tun->pcpu_stats);
+ u64_stats_update_begin(&stats->syncp);
+ stats->rx_packets++;
+ stats->rx_bytes += len;
+ u64_stats_update_end(&stats->syncp);
+ put_cpu_ptr(stats);
tun_flow_update(tun, rxhash, tfile);
return total_len;
struct iov_iter *iter)
{
struct tun_pi pi = { 0, skb->protocol };
+ struct tun_pcpu_stats *stats;
ssize_t total;
int vlan_offset = 0;
int vlan_hlen = 0;
if (vnet_hdr_sz) {
struct virtio_net_hdr gso = { 0 }; /* no info leak */
+ int ret;
+
if (iov_iter_count(iter) < vnet_hdr_sz)
return -EINVAL;
- if (skb_is_gso(skb)) {
+ ret = virtio_net_hdr_from_skb(skb, &gso,
+ tun_is_little_endian(tun));
+ if (ret) {
struct skb_shared_info *sinfo = skb_shinfo(skb);
-
- /* This is a hint as to how much should be linear. */
- gso.hdr_len = cpu_to_tun16(tun, skb_headlen(skb));
- gso.gso_size = cpu_to_tun16(tun, sinfo->gso_size);
- if (sinfo->gso_type & SKB_GSO_TCPV4)
- gso.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
- else if (sinfo->gso_type & SKB_GSO_TCPV6)
- gso.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
- else if (sinfo->gso_type & SKB_GSO_UDP)
- gso.gso_type = VIRTIO_NET_HDR_GSO_UDP;
- else {
- pr_err("unexpected GSO type: "
- "0x%x, gso_size %d, hdr_len %d\n",
- sinfo->gso_type, tun16_to_cpu(tun, gso.gso_size),
- tun16_to_cpu(tun, gso.hdr_len));
- print_hex_dump(KERN_ERR, "tun: ",
- DUMP_PREFIX_NONE,
- 16, 1, skb->head,
- min((int)tun16_to_cpu(tun, gso.hdr_len), 64), true);
- WARN_ON_ONCE(1);
- return -EINVAL;
- }
- if (sinfo->gso_type & SKB_GSO_TCP_ECN)
- gso.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
- } else
- gso.gso_type = VIRTIO_NET_HDR_GSO_NONE;
-
- if (skb->ip_summed == CHECKSUM_PARTIAL) {
- gso.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
- gso.csum_start = cpu_to_tun16(tun, skb_checksum_start_offset(skb) +
- vlan_hlen);
- gso.csum_offset = cpu_to_tun16(tun, skb->csum_offset);
- } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
- gso.flags = VIRTIO_NET_HDR_F_DATA_VALID;
- } /* else everything is zero */
+ pr_err("unexpected GSO type: "
+ "0x%x, gso_size %d, hdr_len %d\n",
+ sinfo->gso_type, tun16_to_cpu(tun, gso.gso_size),
+ tun16_to_cpu(tun, gso.hdr_len));
+ print_hex_dump(KERN_ERR, "tun: ",
+ DUMP_PREFIX_NONE,
+ 16, 1, skb->head,
+ min((int)tun16_to_cpu(tun, gso.hdr_len), 64), true);
+ WARN_ON_ONCE(1);
+ return -EINVAL;
+ }
if (copy_to_iter(&gso, sizeof(gso), iter) != sizeof(gso))
return -EFAULT;
skb_copy_datagram_iter(skb, vlan_offset, iter, skb->len - vlan_offset);
done:
- tun->dev->stats.tx_packets++;
- tun->dev->stats.tx_bytes += skb->len + vlan_hlen;
+ /* caller is in process context, */
+ stats = get_cpu_ptr(tun->pcpu_stats);
+ u64_stats_update_begin(&stats->syncp);
+ stats->tx_packets++;
+ stats->tx_bytes += skb->len + vlan_hlen;
+ u64_stats_update_end(&stats->syncp);
+ put_cpu_ptr(tun->pcpu_stats);
return total;
}
+static struct sk_buff *tun_ring_recv(struct tun_file *tfile, int noblock,
+ int *err)
+{
+ DECLARE_WAITQUEUE(wait, current);
+ struct sk_buff *skb = NULL;
+ int error = 0;
+
+ skb = skb_array_consume(&tfile->tx_array);
+ if (skb)
+ goto out;
+ if (noblock) {
+ error = -EAGAIN;
+ goto out;
+ }
+
+ add_wait_queue(&tfile->wq.wait, &wait);
+ current->state = TASK_INTERRUPTIBLE;
+
+ while (1) {
+ skb = skb_array_consume(&tfile->tx_array);
+ if (skb)
+ break;
+ if (signal_pending(current)) {
+ error = -ERESTARTSYS;
+ break;
+ }
+ if (tfile->socket.sk->sk_shutdown & RCV_SHUTDOWN) {
+ error = -EFAULT;
+ break;
+ }
+
+ schedule();
+ }
+
+ current->state = TASK_RUNNING;
+ remove_wait_queue(&tfile->wq.wait, &wait);
+
+out:
+ *err = error;
+ return skb;
+}
+
static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
struct iov_iter *to,
int noblock)
{
struct sk_buff *skb;
ssize_t ret;
- int peeked, err, off = 0;
+ int err;
tun_debug(KERN_INFO, tun, "tun_do_read\n");
if (!iov_iter_count(to))
return 0;
- if (tun->dev->reg_state != NETREG_REGISTERED)
- return -EIO;
-
- /* Read frames from queue */
- skb = __skb_recv_datagram(tfile->socket.sk, noblock ? MSG_DONTWAIT : 0,
- &peeked, &off, &err);
+ /* Read frames from ring */
+ skb = tun_ring_recv(tfile, noblock, &err);
if (!skb)
return err;
struct tun_struct *tun = netdev_priv(dev);
BUG_ON(!(list_empty(&tun->disabled)));
+ free_percpu(tun->pcpu_stats);
tun_flow_uninit(tun);
security_tun_dev_free_security(tun->security);
free_netdev(dev);
return ret;
}
+static int tun_peek_len(struct socket *sock)
+{
+ struct tun_file *tfile = container_of(sock, struct tun_file, socket);
+ struct tun_struct *tun;
+ int ret = 0;
+
+ tun = __tun_get(tfile);
+ if (!tun)
+ return 0;
+
+ ret = skb_array_peek_len(&tfile->tx_array);
+ tun_put(tun);
+
+ return ret;
+}
+
/* Ops structure to mimic raw sockets with tun */
static const struct proto_ops tun_socket_ops = {
+ .peek_len = tun_peek_len,
.sendmsg = tun_sendmsg,
.recvmsg = tun_recvmsg,
};
tun->filter_attached = false;
tun->sndbuf = tfile->socket.sk->sk_sndbuf;
+ tun->pcpu_stats = netdev_alloc_pcpu_stats(struct tun_pcpu_stats);
+ if (!tun->pcpu_stats) {
+ err = -ENOMEM;
+ goto err_free_dev;
+ }
+
spin_lock_init(&tun->lock);
err = security_tun_dev_alloc_security(&tun->security);
if (err < 0)
- goto err_free_dev;
+ goto err_free_stat;
tun_net_init(dev);
tun_flow_init(tun);
dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST |
TUN_USER_FEATURES | NETIF_F_HW_VLAN_CTAG_TX |
NETIF_F_HW_VLAN_STAG_TX;
- dev->features = dev->hw_features;
+ dev->features = dev->hw_features | NETIF_F_LLTX;
dev->vlan_features = dev->features &
~(NETIF_F_HW_VLAN_CTAG_TX |
NETIF_F_HW_VLAN_STAG_TX);
err_free_flow:
tun_flow_uninit(tun);
security_tun_dev_free_security(tun->security);
+err_free_stat:
+ free_percpu(tun->pcpu_stats);
err_free_dev:
free_netdev(dev);
return err;
for (i = 0; i < n; i++) {
tfile = rtnl_dereference(tun->tfiles[i]);
- __sk_detach_filter(tfile->socket.sk, lockdep_rtnl_is_held());
+ lock_sock(tfile->socket.sk);
+ sk_detach_filter(tfile->socket.sk);
+ release_sock(tfile->socket.sk);
}
tun->filter_attached = false;
for (i = 0; i < tun->numqueues; i++) {
tfile = rtnl_dereference(tun->tfiles[i]);
- ret = __sk_attach_filter(&tun->fprog, tfile->socket.sk,
- lockdep_rtnl_is_held());
+ lock_sock(tfile->socket.sk);
+ ret = sk_attach_filter(&tun->fprog, tfile->socket.sk);
+ release_sock(tfile->socket.sk);
if (ret) {
tun_detach_filter(tun, i);
return ret;
.get_ts_info = ethtool_op_get_ts_info,
};
+static int tun_queue_resize(struct tun_struct *tun)
+{
+ struct net_device *dev = tun->dev;
+ struct tun_file *tfile;
+ struct skb_array **arrays;
+ int n = tun->numqueues + tun->numdisabled;
+ int ret, i;
+
+ arrays = kmalloc(sizeof *arrays * n, GFP_KERNEL);
+ if (!arrays)
+ return -ENOMEM;
+
+ for (i = 0; i < tun->numqueues; i++) {
+ tfile = rtnl_dereference(tun->tfiles[i]);
+ arrays[i] = &tfile->tx_array;
+ }
+ list_for_each_entry(tfile, &tun->disabled, next)
+ arrays[i++] = &tfile->tx_array;
+
+ ret = skb_array_resize_multiple(arrays, n,
+ dev->tx_queue_len, GFP_KERNEL);
+
+ kfree(arrays);
+ return ret;
+}
+
+static int tun_device_event(struct notifier_block *unused,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct tun_struct *tun = netdev_priv(dev);
+
+ if (dev->rtnl_link_ops != &tun_link_ops)
+ return NOTIFY_DONE;
+
+ switch (event) {
+ case NETDEV_CHANGE_TX_QUEUE_LEN:
+ if (tun_queue_resize(tun))
+ return NOTIFY_BAD;
+ break;
+ default:
+ break;
+ }
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block tun_notifier_block __read_mostly = {
+ .notifier_call = tun_device_event,
+};
static int __init tun_init(void)
{
pr_err("Can't register misc device %d\n", TUN_MINOR);
goto err_misc;
}
+
+ register_netdevice_notifier(&tun_notifier_block);
return 0;
err_misc:
rtnl_link_unregister(&tun_link_ops);
{
misc_deregister(&tun_miscdev);
rtnl_link_unregister(&tun_link_ops);
+ unregister_netdevice_notifier(&tun_notifier_block);
}
/* Get an underlying socket object from tun file. Returns error unless file is