packet: introduce PACKET_QDISC_BYPASS socket option
[cascardo/linux.git] / net / packet / af_packet.c
index 2e8286b..9d70f13 100644 (file)
@@ -237,6 +237,82 @@ struct packet_skb_cb {
 static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
 static void __fanout_link(struct sock *sk, struct packet_sock *po);
 
+/*
+ * packet_direct_xmit - hand an skb straight to the driver, bypassing qdisc
+ * @skb: buffer to send; skb->dev and the skb's queue mapping must be set
+ *
+ * Installed as po->xmit when PACKET_QDISC_BYPASS is enabled. The skb is
+ * always consumed: it is either accepted by ndo_start_xmit() or freed here.
+ *
+ * Returns NET_XMIT_DROP if the device is not running, has no carrier, or
+ * the skb could not be linearized; NETDEV_TX_BUSY if the selected queue is
+ * frozen or stopped; otherwise the value returned by ndo_start_xmit().
+ */
+static int packet_direct_xmit(struct sk_buff *skb)
+{
+       struct net_device *dev = skb->dev;
+       const struct net_device_ops *ops = dev->netdev_ops;
+       netdev_features_t features;
+       struct netdev_queue *txq;
+       int ret = NETDEV_TX_BUSY;
+
+       if (unlikely(!netif_running(dev) ||
+                    !netif_carrier_ok(dev)))
+               goto drop;
+
+       features = netif_skb_features(skb);
+       if (skb_needs_linearize(skb, features) &&
+           __skb_linearize(skb))
+               goto drop;
+
+       txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));
+
+       /* HARD_TX_LOCK() skips the per-queue lock for NETIF_F_LLTX devices,
+        * which do their own locking; it expects BHs to be disabled already.
+        */
+       local_bh_disable();
+
+       HARD_TX_LOCK(dev, txq, smp_processor_id());
+       if (likely(!netif_xmit_frozen_or_stopped(txq))) {
+               ret = ops->ndo_start_xmit(skb, dev);
+               if (likely(dev_xmit_complete(ret)))
+                       txq_trans_update(txq);
+       }
+       HARD_TX_UNLOCK(dev, txq);
+
+       local_bh_enable();
+
+       /* Still ours if the queue was busy or the driver rejected it. */
+       if (unlikely(!dev_xmit_complete(ret)))
+               kfree_skb(skb);
+
+       return ret;
+
+drop:
+       kfree_skb(skb);
+       return NET_XMIT_DROP;
+}
+
+/*
+ * Return the device cached on the socket with a reference held, or NULL
+ * if none is cached. The lookup and dev_hold() are done inside an RCU
+ * read-side section; the caller must dev_put() a non-NULL result.
+ */
+static struct net_device *packet_cached_dev_get(struct packet_sock *po)
+{
+       struct net_device *cached;
+
+       rcu_read_lock();
+       cached = rcu_dereference(po->cached_dev);
+       if (likely(cached != NULL))
+               dev_hold(cached);
+       rcu_read_unlock();
+
+       return cached;
+}
+
+/*
+ * Publish @dev as the socket's cached device via rcu_assign_pointer().
+ * @dev may be NULL (packet_do_bind() passes its dev argument through
+ * unchanged).
+ */
+static void packet_cached_dev_assign(struct packet_sock *po,
+                                    struct net_device *dev)
+{
+       rcu_assign_pointer(po->cached_dev, dev);
+}
+
+/*
+ * Clear the socket's cached device pointer. Called when the socket is
+ * created or released and when the bound device is unregistered.
+ */
+static void packet_cached_dev_reset(struct packet_sock *po)
+{
+       RCU_INIT_POINTER(po->cached_dev, NULL);
+}
+
+/*
+ * True when the socket's transmit hook is packet_direct_xmit(), i.e. the
+ * PACKET_QDISC_BYPASS socket option is currently enabled.
+ */
+static bool packet_use_direct_xmit(const struct packet_sock *po)
+{
+       return po->xmit == packet_direct_xmit;
+}
+
+/*
+ * Pick a TX queue index for an outgoing skb by spreading on the current
+ * CPU id modulo the device's real_num_tx_queues.
+ *
+ * The send paths call this from preemptible process context, where
+ * smp_processor_id() trips the CONFIG_DEBUG_PREEMPT check. The CPU id is
+ * only a distribution hint, so a stale value after migration is harmless
+ * and raw_smp_processor_id() is sufficient.
+ */
+static u16 packet_pick_tx_queue(struct net_device *dev)
+{
+       return (u16) raw_smp_processor_id() % dev->real_num_tx_queues;
+}
+
 /* register_prot_hook must be invoked with the po->bind_lock held,
  * or from a context in which asynchronous accesses to the packet
  * socket is not possible (packet_create()).
@@ -244,11 +320,13 @@ static void __fanout_link(struct sock *sk, struct packet_sock *po);
 static void register_prot_hook(struct sock *sk)
 {
        struct packet_sock *po = pkt_sk(sk);
+
        if (!po->running) {
                if (po->fanout)
                        __fanout_link(sk, po);
                else
                        dev_add_pack(&po->prot_hook);
+
                sock_hold(sk);
                po->running = 1;
        }
@@ -266,10 +344,12 @@ static void __unregister_prot_hook(struct sock *sk, bool sync)
        struct packet_sock *po = pkt_sk(sk);
 
        po->running = 0;
+
        if (po->fanout)
                __fanout_unlink(sk, po);
        else
                __dev_remove_pack(&po->prot_hook);
+
        __sock_put(sk);
 
        if (sync) {
@@ -430,11 +510,12 @@ static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
 {
        struct tpacket_kbdq_core *pkc;
 
-       pkc = tx_ring ? &po->tx_ring.prb_bdqc : &po->rx_ring.prb_bdqc;
+       pkc = tx_ring ? GET_PBDQC_FROM_RB(&po->tx_ring) :
+                       GET_PBDQC_FROM_RB(&po->rx_ring);
 
-       spin_lock(&rb_queue->lock);
+       spin_lock_bh(&rb_queue->lock);
        pkc->delete_blk_timer = 1;
-       spin_unlock(&rb_queue->lock);
+       spin_unlock_bh(&rb_queue->lock);
 
        prb_del_retire_blk_timer(pkc);
 }
@@ -456,7 +537,8 @@ static void prb_setup_retire_blk_timer(struct packet_sock *po, int tx_ring)
        if (tx_ring)
                BUG();
 
-       pkc = tx_ring ? &po->tx_ring.prb_bdqc : &po->rx_ring.prb_bdqc;
+       pkc = tx_ring ? GET_PBDQC_FROM_RB(&po->tx_ring) :
+                       GET_PBDQC_FROM_RB(&po->rx_ring);
        prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired);
 }
 
@@ -514,7 +596,7 @@ static void init_prb_bdqc(struct packet_sock *po,
                        struct pgv *pg_vec,
                        union tpacket_req_u *req_u, int tx_ring)
 {
-       struct tpacket_kbdq_core *p1 = &rb->prb_bdqc;
+       struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
        struct tpacket_block_desc *pbd;
 
        memset(p1, 0x0, sizeof(*p1));
@@ -578,7 +660,7 @@ static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
 static void prb_retire_rx_blk_timer_expired(unsigned long data)
 {
        struct packet_sock *po = (struct packet_sock *)data;
-       struct tpacket_kbdq_core *pkc = &po->rx_ring.prb_bdqc;
+       struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
        unsigned int frozen;
        struct tpacket_block_desc *pbd;
 
@@ -1964,9 +2046,10 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
 
        skb_reserve(skb, hlen);
        skb_reset_network_header(skb);
-       skb_probe_transport_header(skb, 0);
 
-       if (po->tp_tx_has_off) {
+       if (!packet_use_direct_xmit(po))
+               skb_probe_transport_header(skb, 0);
+       if (unlikely(po->tp_tx_has_off)) {
                int off_min, off_max, off;
                off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
                off_max = po->tx_ring.frame_size - tp_len;
@@ -2057,7 +2140,6 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
        struct sk_buff *skb;
        struct net_device *dev;
        __be16 proto;
-       bool need_rls_dev = false;
        int err, reserve = 0;
        void *ph;
        struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
@@ -2069,8 +2151,8 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 
        mutex_lock(&po->pg_vec_lock);
 
-       if (saddr == NULL) {
-               dev = po->prot_hook.dev;
+       if (likely(saddr == NULL)) {
+               dev     = packet_cached_dev_get(po);
                proto   = po->num;
                addr    = NULL;
        } else {
@@ -2084,19 +2166,17 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
                proto   = saddr->sll_protocol;
                addr    = saddr->sll_addr;
                dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
-               need_rls_dev = true;
        }
 
        err = -ENXIO;
        if (unlikely(dev == NULL))
                goto out;
-
-       reserve = dev->hard_header_len;
-
        err = -ENETDOWN;
        if (unlikely(!(dev->flags & IFF_UP)))
                goto out_put;
 
+       reserve = dev->hard_header_len;
+
        size_max = po->tx_ring.frame_size
                - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
 
@@ -2139,12 +2219,13 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
                        }
                }
 
+               skb_set_queue_mapping(skb, packet_pick_tx_queue(dev));
                skb->destructor = tpacket_destruct_skb;
                __packet_set_status(po, ph, TP_STATUS_SENDING);
                atomic_inc(&po->tx_ring.pending);
 
                status = TP_STATUS_SEND_REQUEST;
-               err = dev_queue_xmit(skb);
+               err = po->xmit(skb);
                if (unlikely(err > 0)) {
                        err = net_xmit_errno(err);
                        if (err && __packet_get_status(po, ph) ==
@@ -2173,8 +2254,7 @@ out_status:
        __packet_set_status(po, ph, status);
        kfree_skb(skb);
 out_put:
-       if (need_rls_dev)
-               dev_put(dev);
+       dev_put(dev);
 out:
        mutex_unlock(&po->pg_vec_lock);
        return err;
@@ -2204,15 +2284,13 @@ static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
        return skb;
 }
 
-static int packet_snd(struct socket *sock,
-                         struct msghdr *msg, size_t len)
+static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
 {
        struct sock *sk = sock->sk;
        struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
        struct sk_buff *skb;
        struct net_device *dev;
        __be16 proto;
-       bool need_rls_dev = false;
        unsigned char *addr;
        int err, reserve = 0;
        struct virtio_net_hdr vnet_hdr = { 0 };
@@ -2227,8 +2305,8 @@ static int packet_snd(struct socket *sock,
         *      Get and verify the address.
         */
 
-       if (saddr == NULL) {
-               dev = po->prot_hook.dev;
+       if (likely(saddr == NULL)) {
+               dev     = packet_cached_dev_get(po);
                proto   = po->num;
                addr    = NULL;
        } else {
@@ -2240,19 +2318,17 @@ static int packet_snd(struct socket *sock,
                proto   = saddr->sll_protocol;
                addr    = saddr->sll_addr;
                dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
-               need_rls_dev = true;
        }
 
        err = -ENXIO;
-       if (dev == NULL)
+       if (unlikely(dev == NULL))
                goto out_unlock;
-       if (sock->type == SOCK_RAW)
-               reserve = dev->hard_header_len;
-
        err = -ENETDOWN;
-       if (!(dev->flags & IFF_UP))
+       if (unlikely(!(dev->flags & IFF_UP)))
                goto out_unlock;
 
+       if (sock->type == SOCK_RAW)
+               reserve = dev->hard_header_len;
        if (po->has_vnet_hdr) {
                vnet_hdr_len = sizeof(vnet_hdr);
 
@@ -2353,6 +2429,7 @@ static int packet_snd(struct socket *sock,
        skb->dev = dev;
        skb->priority = sk->sk_priority;
        skb->mark = sk->sk_mark;
+       skb_set_queue_mapping(skb, packet_pick_tx_queue(dev));
 
        if (po->has_vnet_hdr) {
                if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
@@ -2373,28 +2450,23 @@ static int packet_snd(struct socket *sock,
                len += vnet_hdr_len;
        }
 
-       skb_probe_transport_header(skb, reserve);
-
+       if (!packet_use_direct_xmit(po))
+               skb_probe_transport_header(skb, reserve);
        if (unlikely(extra_len == 4))
                skb->no_fcs = 1;
 
-       /*
-        *      Now send it
-        */
-
-       err = dev_queue_xmit(skb);
+       err = po->xmit(skb);
        if (err > 0 && (err = net_xmit_errno(err)) != 0)
                goto out_unlock;
 
-       if (need_rls_dev)
-               dev_put(dev);
+       dev_put(dev);
 
        return len;
 
 out_free:
        kfree_skb(skb);
 out_unlock:
-       if (dev && need_rls_dev)
+       if (dev)
                dev_put(dev);
 out:
        return err;
@@ -2405,6 +2477,7 @@ static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
 {
        struct sock *sk = sock->sk;
        struct packet_sock *po = pkt_sk(sk);
+
        if (po->tx_ring.pg_vec)
                return tpacket_snd(po, msg);
        else
@@ -2439,6 +2512,8 @@ static int packet_release(struct socket *sock)
 
        spin_lock(&po->bind_lock);
        unregister_prot_hook(sk, false);
+       packet_cached_dev_reset(po);
+
        if (po->prot_hook.dev) {
                dev_put(po->prot_hook.dev);
                po->prot_hook.dev = NULL;
@@ -2494,14 +2569,17 @@ static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protoc
 
        spin_lock(&po->bind_lock);
        unregister_prot_hook(sk, true);
+
        po->num = protocol;
        po->prot_hook.type = protocol;
        if (po->prot_hook.dev)
                dev_put(po->prot_hook.dev);
-       po->prot_hook.dev = dev;
 
+       po->prot_hook.dev = dev;
        po->ifindex = dev ? dev->ifindex : 0;
 
+       packet_cached_dev_assign(po, dev);
+
        if (protocol == 0)
                goto out_unlock;
 
@@ -2614,6 +2692,9 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
        po = pkt_sk(sk);
        sk->sk_family = PF_PACKET;
        po->num = proto;
+       po->xmit = dev_queue_xmit;
+
+       packet_cached_dev_reset(po);
 
        sk->sk_destruct = packet_sock_destruct;
        sk_refcnt_debug_inc(sk);
@@ -2660,7 +2741,6 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
        struct sock *sk = sock->sk;
        struct sk_buff *skb;
        int copied, err;
-       struct sockaddr_ll *sll;
        int vnet_hdr_len = 0;
 
        err = -EINVAL;
@@ -2744,22 +2824,10 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
                        goto out_free;
        }
 
-       /*
-        *      If the address length field is there to be filled in, we fill
-        *      it in now.
-        */
-
-       sll = &PACKET_SKB_CB(skb)->sa.ll;
-       if (sock->type == SOCK_PACKET)
-               msg->msg_namelen = sizeof(struct sockaddr_pkt);
-       else
-               msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);
-
-       /*
-        *      You lose any data beyond the buffer you gave. If it worries a
-        *      user program they can ask the device for its MTU anyway.
+       /* You lose any data beyond the buffer you gave. If it worries
+        * a user program they can ask the device for its MTU
+        * anyway.
         */
-
        copied = skb->len;
        if (copied > len) {
                copied = len;
@@ -2772,9 +2840,20 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
 
        sock_recv_ts_and_drops(msg, sk, skb);
 
-       if (msg->msg_name)
+       if (msg->msg_name) {
+               /* If the address length field is there to be filled
+                * in, we fill it in now.
+                */
+               if (sock->type == SOCK_PACKET) {
+                       msg->msg_namelen = sizeof(struct sockaddr_pkt);
+               } else {
+                       struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
+                       msg->msg_namelen = sll->sll_halen +
+                               offsetof(struct sockaddr_ll, sll_addr);
+               }
                memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
                       msg->msg_namelen);
+       }
 
        if (pkt_sk(sk)->auxdata) {
                struct tpacket_auxdata aux;
@@ -3193,6 +3272,18 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
                po->tp_tx_has_off = !!val;
                return 0;
        }
+       case PACKET_QDISC_BYPASS:
+       {
+               int val;
+
+               if (optlen != sizeof(val))
+                       return -EINVAL;
+               if (copy_from_user(&val, optval, sizeof(val)))
+                       return -EFAULT;
+
+               po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
+               return 0;
+       }
        default:
                return -ENOPROTOOPT;
        }
@@ -3285,6 +3376,9 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
        case PACKET_TX_HAS_OFF:
                val = po->tp_tx_has_off;
                break;
+       case PACKET_QDISC_BYPASS:
+               val = packet_use_direct_xmit(po);
+               break;
        default:
                return -ENOPROTOOPT;
        }
@@ -3326,6 +3420,7 @@ static int packet_notifier(struct notifier_block *this,
                                                sk->sk_error_report(sk);
                                }
                                if (msg == NETDEV_UNREGISTER) {
+                                       packet_cached_dev_reset(po);
                                        po->ifindex = -1;
                                        if (po->prot_hook.dev)
                                                dev_put(po->prot_hook.dev);