packet: introduce PACKET_QDISC_BYPASS socket option
[cascardo/linux.git] / net / packet / af_packet.c
index e4171dd..9d70f13 100644 (file)
@@ -237,6 +237,48 @@ struct packet_skb_cb {
 static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
 static void __fanout_link(struct sock *sk, struct packet_sock *po);
 
+/*
+ * Transmit @skb on skb->dev by calling the driver's ndo_start_xmit()
+ * directly, rather than going through dev_queue_xmit().
+ *
+ * The tx queue is looked up from the skb's queue mapping, so callers are
+ * expected to have set it beforehand.
+ *
+ * Returns NET_XMIT_DROP if the device is not running, has no carrier, or
+ * the skb needed linearizing and that failed; NETDEV_TX_BUSY if the queue
+ * is frozen or stopped; otherwise whatever ndo_start_xmit() returned.
+ * The skb is freed here on every path where dev_xmit_complete() does not
+ * report the transmit as completed.
+ */
+static int packet_direct_xmit(struct sk_buff *skb)
+{
+       struct net_device *dev = skb->dev;
+       const struct net_device_ops *ops = dev->netdev_ops;
+       netdev_features_t features;
+       struct netdev_queue *txq;
+       int ret;
+
+       /* Nothing can be sent on a downed or carrier-less device. */
+       if (unlikely(!netif_running(dev) || !netif_carrier_ok(dev)))
+               goto drop;
+
+       features = netif_skb_features(skb);
+       if (skb_needs_linearize(skb, features) && __skb_linearize(skb))
+               goto drop;
+
+       txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));
+
+       __netif_tx_lock_bh(txq);
+       if (unlikely(netif_xmit_frozen_or_stopped(txq))) {
+               /* Queue cannot accept packets right now. */
+               ret = NETDEV_TX_BUSY;
+               kfree_skb(skb);
+       } else {
+               ret = ops->ndo_start_xmit(skb, dev);
+               if (likely(dev_xmit_complete(ret)))
+                       txq_trans_update(txq);
+               else
+                       kfree_skb(skb);
+       }
+       __netif_tx_unlock_bh(txq);
+
+       return ret;
+
+drop:
+       kfree_skb(skb);
+       return NET_XMIT_DROP;
+}
+
 static struct net_device *packet_cached_dev_get(struct packet_sock *po)
 {
        struct net_device *dev;
@@ -261,6 +303,16 @@ static void packet_cached_dev_reset(struct packet_sock *po)
        RCU_INIT_POINTER(po->cached_dev, NULL);
 }
 
+/*
+ * Tell whether @po currently has packet_direct_xmit() installed as its
+ * transmit hook (as opposed to any other function stored in po->xmit).
+ */
+static bool packet_use_direct_xmit(const struct packet_sock *po)
+{
+       const bool direct = (po->xmit == packet_direct_xmit);
+
+       return direct;
+}
+
+/*
+ * Pick a tx queue index for @dev from the current CPU number, reduced
+ * modulo dev->real_num_tx_queues.
+ *
+ * raw_smp_processor_id() is used on purpose: the value is only a hint for
+ * spreading senders across queues, so it does not matter if the task
+ * migrates to another CPU right afterwards. The send paths appear to call
+ * this from process context with preemption enabled, where the checked
+ * smp_processor_id() would trigger the CONFIG_DEBUG_PREEMPT warning.
+ */
+static u16 packet_pick_tx_queue(struct net_device *dev)
+{
+       return (u16) raw_smp_processor_id() % dev->real_num_tx_queues;
+}
+
 /* register_prot_hook must be invoked with the po->bind_lock held,
  * or from a context in which asynchronous accesses to the packet
  * socket is not possible (packet_create()).
@@ -1994,9 +2046,10 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
 
        skb_reserve(skb, hlen);
        skb_reset_network_header(skb);
-       skb_probe_transport_header(skb, 0);
 
-       if (po->tp_tx_has_off) {
+       if (!packet_use_direct_xmit(po))
+               skb_probe_transport_header(skb, 0);
+       if (unlikely(po->tp_tx_has_off)) {
                int off_min, off_max, off;
                off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
                off_max = po->tx_ring.frame_size - tp_len;
@@ -2166,12 +2219,13 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
                        }
                }
 
+               skb_set_queue_mapping(skb, packet_pick_tx_queue(dev));
                skb->destructor = tpacket_destruct_skb;
                __packet_set_status(po, ph, TP_STATUS_SENDING);
                atomic_inc(&po->tx_ring.pending);
 
                status = TP_STATUS_SEND_REQUEST;
-               err = dev_queue_xmit(skb);
+               err = po->xmit(skb);
                if (unlikely(err > 0)) {
                        err = net_xmit_errno(err);
                        if (err && __packet_get_status(po, ph) ==
@@ -2230,8 +2284,7 @@ static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
        return skb;
 }
 
-static int packet_snd(struct socket *sock,
-                         struct msghdr *msg, size_t len)
+static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
 {
        struct sock *sk = sock->sk;
        struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
@@ -2376,6 +2429,7 @@ static int packet_snd(struct socket *sock,
        skb->dev = dev;
        skb->priority = sk->sk_priority;
        skb->mark = sk->sk_mark;
+       skb_set_queue_mapping(skb, packet_pick_tx_queue(dev));
 
        if (po->has_vnet_hdr) {
                if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
@@ -2396,16 +2450,12 @@ static int packet_snd(struct socket *sock,
                len += vnet_hdr_len;
        }
 
-       skb_probe_transport_header(skb, reserve);
-
+       if (!packet_use_direct_xmit(po))
+               skb_probe_transport_header(skb, reserve);
        if (unlikely(extra_len == 4))
                skb->no_fcs = 1;
 
-       /*
-        *      Now send it
-        */
-
-       err = dev_queue_xmit(skb);
+       err = po->xmit(skb);
        if (err > 0 && (err = net_xmit_errno(err)) != 0)
                goto out_unlock;
 
@@ -2427,6 +2477,7 @@ static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
 {
        struct sock *sk = sock->sk;
        struct packet_sock *po = pkt_sk(sk);
+
        if (po->tx_ring.pg_vec)
                return tpacket_snd(po, msg);
        else
@@ -2641,6 +2692,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
        po = pkt_sk(sk);
        sk->sk_family = PF_PACKET;
        po->num = proto;
+       po->xmit = dev_queue_xmit;
 
        packet_cached_dev_reset(po);
 
@@ -3220,6 +3272,18 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
                po->tp_tx_has_off = !!val;
                return 0;
        }
+       case PACKET_QDISC_BYPASS:
+       {
+               int val;
+
+               if (optlen != sizeof(val))
+                       return -EINVAL;
+               if (copy_from_user(&val, optval, sizeof(val)))
+                       return -EFAULT;
+
+               po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
+               return 0;
+       }
        default:
                return -ENOPROTOOPT;
        }
@@ -3312,6 +3376,9 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
        case PACKET_TX_HAS_OFF:
                val = po->tp_tx_has_off;
                break;
+       case PACKET_QDISC_BYPASS:
+               val = packet_use_direct_xmit(po);
+               break;
        default:
                return -ENOPROTOOPT;
        }