Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/davem/net...
author Pablo Neira Ayuso <pablo@netfilter.org>
Sun, 25 Sep 2016 21:23:57 +0000 (23:23 +0200)
committer Pablo Neira Ayuso <pablo@netfilter.org>
Sun, 25 Sep 2016 21:34:19 +0000 (23:34 +0200)
Conflicts:
net/netfilter/core.c
net/netfilter/nf_tables_netdev.c

Resolve two conflicts before pull request for David's net-next tree:

1) Between c73c24849011 ("netfilter: nf_tables_netdev: remove redundant
   ip_hdr assignment") from the net tree and commit ddc8b6027ad0
   ("netfilter: introduce nft_set_pktinfo_{ipv4, ipv6}_validate()").

2) Between e8bffe0cf964 ("net: Add _nf_(un)register_hooks symbols") and
   Aaron Conole's patches to replace list_head with single linked list.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
12 files changed:
1  2 
include/linux/netdevice.h
include/linux/netfilter.h
include/uapi/linux/if_tunnel.h
net/bridge/netfilter/ebtables.c
net/core/dev.c
net/ipv4/tcp_input.c
net/ipv4/tcp_ipv4.c
net/ipv6/netfilter/nft_chain_route_ipv6.c
net/netfilter/core.c
net/netfilter/nf_conntrack_core.c
net/netfilter/nf_tables_trace.c
net/netfilter/nft_meta.c

@@@ -789,6 -789,7 +789,7 @@@ enum 
        TC_SETUP_CLSU32,
        TC_SETUP_CLSFLOWER,
        TC_SETUP_MATCHALL,
+       TC_SETUP_CLSBPF,
  };
  
  struct tc_cls_u32_offload;
@@@ -800,6 -801,7 +801,7 @@@ struct tc_to_netdev 
                struct tc_cls_u32_offload *cls_u32;
                struct tc_cls_flower_offload *cls_flower;
                struct tc_cls_matchall_offload *cls_mall;
+               struct tc_cls_bpf_offload *cls_bpf;
        };
  };
  
@@@ -924,6 -926,14 +926,14 @@@ struct netdev_xdp 
   *    3. Update dev->stats asynchronously and atomically, and define
   *       neither operation.
   *
+  * bool (*ndo_has_offload_stats)(int attr_id)
+  *    Return true if this device supports offload stats of this attr_id.
+  *
+  * int (*ndo_get_offload_stats)(int attr_id, const struct net_device *dev,
+  *    void *attr_data)
+  *    Get statistics for offload operations by attr_id. Write it into the
+  *    attr_data pointer.
+  *
   * int (*ndo_vlan_rx_add_vid)(struct net_device *dev, __be16 proto, u16 vid);
   *    If device supports VLAN filtering this function is called when a
   *    VLAN id is registered.
   *
   *    SR-IOV management functions.
   * int (*ndo_set_vf_mac)(struct net_device *dev, int vf, u8* mac);
-  * int (*ndo_set_vf_vlan)(struct net_device *dev, int vf, u16 vlan, u8 qos);
+  * int (*ndo_set_vf_vlan)(struct net_device *dev, int vf, u16 vlan,
+  *                      u8 qos, __be16 proto);
   * int (*ndo_set_vf_rate)(struct net_device *dev, int vf, int min_tx_rate,
   *                      int max_tx_rate);
   * int (*ndo_set_vf_spoofchk)(struct net_device *dev, int vf, bool setting);
@@@ -1155,6 -1166,10 +1166,10 @@@ struct net_device_ops 
  
        struct rtnl_link_stats64* (*ndo_get_stats64)(struct net_device *dev,
                                                     struct rtnl_link_stats64 *storage);
+       bool                    (*ndo_has_offload_stats)(int attr_id);
+       int                     (*ndo_get_offload_stats)(int attr_id,
+                                                        const struct net_device *dev,
+                                                        void *attr_data);
        struct net_device_stats* (*ndo_get_stats)(struct net_device *dev);
  
        int                     (*ndo_vlan_rx_add_vid)(struct net_device *dev,
        int                     (*ndo_set_vf_mac)(struct net_device *dev,
                                                  int queue, u8 *mac);
        int                     (*ndo_set_vf_vlan)(struct net_device *dev,
-                                                  int queue, u16 vlan, u8 qos);
+                                                  int queue, u16 vlan,
+                                                  u8 qos, __be16 proto);
        int                     (*ndo_set_vf_rate)(struct net_device *dev,
                                                   int vf, int min_tx_rate,
                                                   int max_tx_rate);
@@@ -1783,7 -1799,7 +1799,7 @@@ struct net_device 
  #endif
        struct netdev_queue __rcu *ingress_queue;
  #ifdef CONFIG_NETFILTER_INGRESS
 -      struct list_head        nf_hooks_ingress;
 +      struct nf_hook_entry __rcu *nf_hooks_ingress;
  #endif
  
        unsigned char           broadcast[MAX_ADDR_LEN];
@@@ -3266,6 -3282,7 +3282,7 @@@ static inline void napi_free_frags(stru
        napi->skb = NULL;
  }
  
+ bool netdev_is_rx_handler_busy(struct net_device *dev);
  int netdev_rx_handler_register(struct net_device *dev,
                               rx_handler_func_t *rx_handler,
                               void *rx_handler_data);
@@@ -55,34 -55,12 +55,34 @@@ struct nf_hook_state 
        struct net_device *out;
        struct sock *sk;
        struct net *net;
 -      struct list_head *hook_list;
 +      struct nf_hook_entry __rcu *hook_entries;
        int (*okfn)(struct net *, struct sock *, struct sk_buff *);
  };
  
 +typedef unsigned int nf_hookfn(void *priv,
 +                             struct sk_buff *skb,
 +                             const struct nf_hook_state *state);
 +struct nf_hook_ops {
 +      struct list_head        list;
 +
 +      /* User fills in from here down. */
 +      nf_hookfn               *hook;
 +      struct net_device       *dev;
 +      void                    *priv;
 +      u_int8_t                pf;
 +      unsigned int            hooknum;
 +      /* Hooks are ordered in ascending priority. */
 +      int                     priority;
 +};
 +
 +struct nf_hook_entry {
 +      struct nf_hook_entry __rcu      *next;
 +      struct nf_hook_ops              ops;
 +      const struct nf_hook_ops        *orig_ops;
 +};
 +
  static inline void nf_hook_state_init(struct nf_hook_state *p,
 -                                    struct list_head *hook_list,
 +                                    struct nf_hook_entry *hook_entry,
                                      unsigned int hook,
                                      int thresh, u_int8_t pf,
                                      struct net_device *indev,
        p->out = outdev;
        p->sk = sk;
        p->net = net;
 -      p->hook_list = hook_list;
 +      RCU_INIT_POINTER(p->hook_entries, hook_entry);
        p->okfn = okfn;
  }
  
 -typedef unsigned int nf_hookfn(void *priv,
 -                             struct sk_buff *skb,
 -                             const struct nf_hook_state *state);
  
 -struct nf_hook_ops {
 -      struct list_head        list;
 -
 -      /* User fills in from here down. */
 -      nf_hookfn               *hook;
 -      struct net_device       *dev;
 -      void                    *priv;
 -      u_int8_t                pf;
 -      unsigned int            hooknum;
 -      /* Hooks are ordered in ascending priority. */
 -      int                     priority;
 -};
  
  struct nf_sockopt_ops {
        struct list_head list;
@@@ -140,6 -133,8 +140,8 @@@ int nf_register_hook(struct nf_hook_op
  void nf_unregister_hook(struct nf_hook_ops *reg);
  int nf_register_hooks(struct nf_hook_ops *reg, unsigned int n);
  void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n);
+ int _nf_register_hooks(struct nf_hook_ops *reg, unsigned int n);
+ void _nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n);
  
  /* Functions to register get/setsockopt ranges (non-inclusive).  You
     need to check permissions yourself! */
@@@ -168,8 -163,7 +170,8 @@@ static inline int nf_hook_thresh(u_int8
                                 int (*okfn)(struct net *, struct sock *, struct sk_buff *),
                                 int thresh)
  {
 -      struct list_head *hook_list;
 +      struct nf_hook_entry *hook_head;
 +      int ret = 1;
  
  #ifdef HAVE_JUMP_LABEL
        if (__builtin_constant_p(pf) &&
                return 1;
  #endif
  
 -      hook_list = &net->nf.hooks[pf][hook];
 -
 -      if (!list_empty(hook_list)) {
 +      rcu_read_lock();
 +      hook_head = rcu_dereference(net->nf.hooks[pf][hook]);
 +      if (hook_head) {
                struct nf_hook_state state;
  
 -              nf_hook_state_init(&state, hook_list, hook, thresh,
 +              nf_hook_state_init(&state, hook_head, hook, thresh,
                                   pf, indev, outdev, sk, net, okfn);
 -              return nf_hook_slow(skb, &state);
 +
 +              ret = nf_hook_slow(skb, &state);
        }
 -      return 1;
 +      rcu_read_unlock();
 +
 +      return ret;
  }
  
  static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net,
@@@ -39,7 -39,6 +39,7 @@@
  #define GRE_IS_REC(f)         ((f) & GRE_REC)
  #define GRE_IS_ACK(f)         ((f) & GRE_ACK)
  
 +#define GRE_VERSION_0         __cpu_to_be16(0x0000)
  #define GRE_VERSION_1         __cpu_to_be16(0x0001)
  #define GRE_PROTO_PPP         __cpu_to_be16(0x880b)
  #define GRE_PPTP_KEY_MASK     __cpu_to_be32(0xffff)
@@@ -74,6 -73,7 +74,7 @@@ enum 
        IFLA_IPTUN_ENCAP_FLAGS,
        IFLA_IPTUN_ENCAP_SPORT,
        IFLA_IPTUN_ENCAP_DPORT,
+       IFLA_IPTUN_COLLECT_METADATA,
        __IFLA_IPTUN_MAX,
  };
  #define IFLA_IPTUN_MAX        (__IFLA_IPTUN_MAX - 1)
@@@ -146,7 -146,7 +146,7 @@@ ebt_basic_match(const struct ebt_entry 
                return 1;
        if (NF_INVF(e, EBT_IOUT, ebt_dev_check(e->out, out)))
                return 1;
 -      /* rcu_read_lock()ed by nf_hook_slow */
 +      /* rcu_read_lock()ed by nf_hook_thresh */
        if (in && (p = br_port_get_rcu(in)) != NULL &&
            NF_INVF(e, EBT_ILOGICALIN,
                    ebt_dev_check(e->logical_in, p->br->dev)))
@@@ -368,6 -368,8 +368,8 @@@ ebt_check_match(struct ebt_entry_match 
  
        match = xt_find_match(NFPROTO_BRIDGE, m->u.name, 0);
        if (IS_ERR(match) || match->family != NFPROTO_BRIDGE) {
+               if (!IS_ERR(match))
+                       module_put(match->me);
                request_module("ebt_%s", m->u.name);
                match = xt_find_match(NFPROTO_BRIDGE, m->u.name, 0);
        }
diff --combined net/core/dev.c
@@@ -3904,8 -3904,7 +3904,7 @@@ static void net_tx_action(struct softir
        }
  }
  
- #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
-     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
+ #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
  /* This hook is defined here for ATM LANE */
  int (*br_fdb_test_addr_hook)(struct net_device *dev,
                             unsigned char *addr) __read_mostly;
@@@ -3964,6 -3963,22 +3963,22 @@@ sch_handle_ingress(struct sk_buff *skb
        return skb;
  }
  
+ /**
+  *    netdev_is_rx_handler_busy - check if receive handler is registered
+  *    @dev: device to check
+  *
+  *    Check if a receive handler is already registered for a given device.
+  *    Return true if there is one.
+  *
+  *    The caller must hold the rtnl_mutex.
+  */
+ bool netdev_is_rx_handler_busy(struct net_device *dev)
+ {
+       ASSERT_RTNL();
+       return dev && rtnl_dereference(dev->rx_handler);
+ }
+ EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
  /**
   *    netdev_rx_handler_register - register receive handler
   *    @dev: device to register a handler for
@@@ -4040,17 -4055,12 +4055,17 @@@ static inline int nf_ingress(struct sk_
  {
  #ifdef CONFIG_NETFILTER_INGRESS
        if (nf_hook_ingress_active(skb)) {
 +              int ingress_retval;
 +
                if (*pt_prev) {
                        *ret = deliver_skb(skb, *pt_prev, orig_dev);
                        *pt_prev = NULL;
                }
  
 -              return nf_hook_ingress(skb);
 +              rcu_read_lock();
 +              ingress_retval = nf_hook_ingress(skb);
 +              rcu_read_unlock();
 +              return ingress_retval;
        }
  #endif /* CONFIG_NETFILTER_INGRESS */
        return 0;
diff --combined net/ipv4/tcp_input.c
@@@ -289,6 -289,7 +289,7 @@@ static bool tcp_ecn_rcv_ecn_echo(const 
  static void tcp_sndbuf_expand(struct sock *sk)
  {
        const struct tcp_sock *tp = tcp_sk(sk);
+       const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
        int sndmem, per_mss;
        u32 nr_segs;
  
         * Cubic needs 1.7 factor, rounded to 2 to include
         * extra cushion (application might react slowly to POLLOUT)
         */
-       sndmem = 2 * nr_segs * per_mss;
+       sndmem = ca_ops->sndbuf_expand ? ca_ops->sndbuf_expand(sk) : 2;
+       sndmem *= nr_segs * per_mss;
  
        if (sk->sk_sndbuf < sndmem)
                sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
@@@ -899,12 -901,29 +901,29 @@@ static void tcp_verify_retransmit_hint(
                tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
  }
  
+ /* Sum the number of packets on the wire we have marked as lost.
+  * There are two cases we care about here:
+  * a) Packet hasn't been marked lost (nor retransmitted),
+  *    and this is the first loss.
+  * b) Packet has been marked both lost and retransmitted,
+  *    and this means we think it was lost again.
+  */
+ static void tcp_sum_lost(struct tcp_sock *tp, struct sk_buff *skb)
+ {
+       __u8 sacked = TCP_SKB_CB(skb)->sacked;
+       if (!(sacked & TCPCB_LOST) ||
+           ((sacked & TCPCB_LOST) && (sacked & TCPCB_SACKED_RETRANS)))
+               tp->lost += tcp_skb_pcount(skb);
+ }
  static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb)
  {
        if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
                tcp_verify_retransmit_hint(tp, skb);
  
                tp->lost_out += tcp_skb_pcount(skb);
+               tcp_sum_lost(tp, skb);
                TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
        }
  }
@@@ -913,6 -932,7 +932,7 @@@ void tcp_skb_mark_lost_uncond_verify(st
  {
        tcp_verify_retransmit_hint(tp, skb);
  
+       tcp_sum_lost(tp, skb);
        if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
                tp->lost_out += tcp_skb_pcount(skb);
                TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
@@@ -1094,6 -1114,7 +1114,7 @@@ struct tcp_sacktag_state 
         */
        struct skb_mstamp first_sackt;
        struct skb_mstamp last_sackt;
+       struct rate_sample *rate;
        int     flag;
  };
  
@@@ -1261,6 -1282,7 +1282,7 @@@ static bool tcp_shifted_skb(struct soc
        tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
                        start_seq, end_seq, dup_sack, pcount,
                        &skb->skb_mstamp);
+       tcp_rate_skb_delivered(sk, skb, state->rate);
  
        if (skb == tp->lost_skb_hint)
                tp->lost_cnt_hint += pcount;
                tcp_advance_highest_sack(sk, skb);
  
        tcp_skb_collapse_tstamp(prev, skb);
+       if (unlikely(TCP_SKB_CB(prev)->tx.delivered_mstamp.v64))
+               TCP_SKB_CB(prev)->tx.delivered_mstamp.v64 = 0;
        tcp_unlink_write_queue(skb, sk);
        sk_wmem_free_skb(sk, skb);
  
@@@ -1540,6 -1565,7 +1565,7 @@@ static struct sk_buff *tcp_sacktag_walk
                                                dup_sack,
                                                tcp_skb_pcount(skb),
                                                &skb->skb_mstamp);
+                       tcp_rate_skb_delivered(sk, skb, state->rate);
  
                        if (!before(TCP_SKB_CB(skb)->seq,
                                    tcp_highest_sack_seq(tp)))
@@@ -1622,8 -1648,10 +1648,10 @@@ tcp_sacktag_write_queue(struct sock *sk
  
        found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire,
                                         num_sacks, prior_snd_una);
-       if (found_dup_sack)
+       if (found_dup_sack) {
                state->flag |= FLAG_DSACKING_ACK;
+               tp->delivered++; /* A spurious retransmission is delivered */
+       }
  
        /* Eliminate too old ACKs, but take into
         * account more or less fresh ones, they can
@@@ -1890,6 -1918,7 +1918,7 @@@ void tcp_enter_loss(struct sock *sk
        struct sk_buff *skb;
        bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
        bool is_reneg;                  /* is receiver reneging on SACKs? */
+       bool mark_lost;
  
        /* Reduce ssthresh if it has not yet been made inside this window. */
        if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
                if (skb == tcp_send_head(sk))
                        break;
  
+               mark_lost = (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) ||
+                            is_reneg);
+               if (mark_lost)
+                       tcp_sum_lost(tp, skb);
                TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED;
-               if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || is_reneg) {
+               if (mark_lost) {
                        TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
                        TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
                        tp->lost_out += tcp_skb_pcount(skb);
@@@ -2503,6 -2536,9 +2536,9 @@@ static inline void tcp_end_cwnd_reducti
  {
        struct tcp_sock *tp = tcp_sk(sk);
  
+       if (inet_csk(sk)->icsk_ca_ops->cong_control)
+               return;
        /* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */
        if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR ||
            (tp->undo_marker && tp->snd_ssthresh < TCP_INFINITE_SSTHRESH)) {
@@@ -2879,67 -2915,13 +2915,13 @@@ static void tcp_fastretrans_alert(struc
        *rexmit = REXMIT_LOST;
  }
  
- /* Kathleen Nichols' algorithm for tracking the minimum value of
-  * a data stream over some fixed time interval. (E.g., the minimum
-  * RTT over the past five minutes.) It uses constant space and constant
-  * time per update yet almost always delivers the same minimum as an
-  * implementation that has to keep all the data in the window.
-  *
-  * The algorithm keeps track of the best, 2nd best & 3rd best min
-  * values, maintaining an invariant that the measurement time of the
-  * n'th best >= n-1'th best. It also makes sure that the three values
-  * are widely separated in the time window since that bounds the worse
-  * case error when that data is monotonically increasing over the window.
-  *
-  * Upon getting a new min, we can forget everything earlier because it
-  * has no value - the new min is <= everything else in the window by
-  * definition and it's the most recent. So we restart fresh on every new min
-  * and overwrites 2nd & 3rd choices. The same property holds for 2nd & 3rd
-  * best.
-  */
  static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us)
  {
-       const u32 now = tcp_time_stamp, wlen = sysctl_tcp_min_rtt_wlen * HZ;
-       struct rtt_meas *m = tcp_sk(sk)->rtt_min;
-       struct rtt_meas rttm = {
-               .rtt = likely(rtt_us) ? rtt_us : jiffies_to_usecs(1),
-               .ts = now,
-       };
-       u32 elapsed;
-       /* Check if the new measurement updates the 1st, 2nd, or 3rd choices */
-       if (unlikely(rttm.rtt <= m[0].rtt))
-               m[0] = m[1] = m[2] = rttm;
-       else if (rttm.rtt <= m[1].rtt)
-               m[1] = m[2] = rttm;
-       else if (rttm.rtt <= m[2].rtt)
-               m[2] = rttm;
-       elapsed = now - m[0].ts;
-       if (unlikely(elapsed > wlen)) {
-               /* Passed entire window without a new min so make 2nd choice
-                * the new min & 3rd choice the new 2nd. So forth and so on.
-                */
-               m[0] = m[1];
-               m[1] = m[2];
-               m[2] = rttm;
-               if (now - m[0].ts > wlen) {
-                       m[0] = m[1];
-                       m[1] = rttm;
-                       if (now - m[0].ts > wlen)
-                               m[0] = rttm;
-               }
-       } else if (m[1].ts == m[0].ts && elapsed > wlen / 4) {
-               /* Passed a quarter of the window without a new min so
-                * take 2nd choice from the 2nd quarter of the window.
-                */
-               m[2] = m[1] = rttm;
-       } else if (m[2].ts == m[1].ts && elapsed > wlen / 2) {
-               /* Passed half the window without a new min so take the 3rd
-                * choice from the last half of the window.
-                */
-               m[2] = rttm;
-       }
+       struct tcp_sock *tp = tcp_sk(sk);
+       u32 wlen = sysctl_tcp_min_rtt_wlen * HZ;
+       minmax_running_min(&tp->rtt_min, wlen, tcp_time_stamp,
+                          rtt_us ? : jiffies_to_usecs(1));
  }
  
  static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
@@@ -3102,10 -3084,11 +3084,11 @@@ static void tcp_ack_tstamp(struct sock 
   */
  static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
                               u32 prior_snd_una, int *acked,
-                              struct tcp_sacktag_state *sack)
+                              struct tcp_sacktag_state *sack,
+                              struct skb_mstamp *now)
  {
        const struct inet_connection_sock *icsk = inet_csk(sk);
-       struct skb_mstamp first_ackt, last_ackt, now;
+       struct skb_mstamp first_ackt, last_ackt;
        struct tcp_sock *tp = tcp_sk(sk);
        u32 prior_sacked = tp->sacked_out;
        u32 reord = tp->packets_out;
                        acked_pcount = tcp_tso_acked(sk, skb);
                        if (!acked_pcount)
                                break;
                        fully_acked = false;
                } else {
                        /* Speedup tcp_unlink_write_queue() and next loop */
  
                tp->packets_out -= acked_pcount;
                pkts_acked += acked_pcount;
+               tcp_rate_skb_delivered(sk, skb, sack->rate);
  
                /* Initial outgoing SYN's get put onto the write_queue
                 * just like anything else we transmit.  It is not
        if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
                flag |= FLAG_SACK_RENEGING;
  
-       skb_mstamp_get(&now);
        if (likely(first_ackt.v64) && !(flag & FLAG_RETRANS_DATA_ACKED)) {
-               seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt);
-               ca_rtt_us = skb_mstamp_us_delta(&now, &last_ackt);
+               seq_rtt_us = skb_mstamp_us_delta(now, &first_ackt);
+               ca_rtt_us = skb_mstamp_us_delta(now, &last_ackt);
        }
        if (sack->first_sackt.v64) {
-               sack_rtt_us = skb_mstamp_us_delta(&now, &sack->first_sackt);
-               ca_rtt_us = skb_mstamp_us_delta(&now, &sack->last_sackt);
+               sack_rtt_us = skb_mstamp_us_delta(now, &sack->first_sackt);
+               ca_rtt_us = skb_mstamp_us_delta(now, &sack->last_sackt);
        }
+       sack->rate->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet, or -1 */
        rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us,
                                        ca_rtt_us);
  
                tp->fackets_out -= min(pkts_acked, tp->fackets_out);
  
        } else if (skb && rtt_update && sack_rtt_us >= 0 &&
-                  sack_rtt_us > skb_mstamp_us_delta(&now, &skb->skb_mstamp)) {
+                  sack_rtt_us > skb_mstamp_us_delta(now, &skb->skb_mstamp)) {
                /* Do not re-arm RTO if the sack RTT is measured from data sent
                 * after when the head was last (re)transmitted. Otherwise the
                 * timeout may continue to extend in loss recovery.
@@@ -3333,8 -3315,15 +3315,15 @@@ static inline bool tcp_may_raise_cwnd(c
   * information. All transmission or retransmission are delayed afterwards.
   */
  static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked,
-                            int flag)
+                            int flag, const struct rate_sample *rs)
  {
+       const struct inet_connection_sock *icsk = inet_csk(sk);
+       if (icsk->icsk_ca_ops->cong_control) {
+               icsk->icsk_ca_ops->cong_control(sk, rs);
+               return;
+       }
        if (tcp_in_cwnd_reduction(sk)) {
                /* Reduce cwnd if state mandates */
                tcp_cwnd_reduction(sk, acked_sacked, flag);
@@@ -3579,17 -3568,21 +3568,21 @@@ static int tcp_ack(struct sock *sk, con
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_sacktag_state sack_state;
+       struct rate_sample rs = { .prior_delivered = 0 };
        u32 prior_snd_una = tp->snd_una;
        u32 ack_seq = TCP_SKB_CB(skb)->seq;
        u32 ack = TCP_SKB_CB(skb)->ack_seq;
        bool is_dupack = false;
        u32 prior_fackets;
        int prior_packets = tp->packets_out;
-       u32 prior_delivered = tp->delivered;
+       u32 delivered = tp->delivered;
+       u32 lost = tp->lost;
        int acked = 0; /* Number of packets newly acked */
        int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */
+       struct skb_mstamp now;
  
        sack_state.first_sackt.v64 = 0;
+       sack_state.rate = &rs;
  
        /* We very likely will need to access write queue head. */
        prefetchw(sk->sk_write_queue.next);
        if (after(ack, tp->snd_nxt))
                goto invalid_ack;
  
+       skb_mstamp_get(&now);
        if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
            icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
                tcp_rearm_rto(sk);
        }
  
        prior_fackets = tp->fackets_out;
+       rs.prior_in_flight = tcp_packets_in_flight(tp);
  
        /* ts_recent update must be made after we are sure that the packet
         * is in window.
  
        /* See if we can take anything off of the retransmit queue. */
        flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked,
-                                   &sack_state);
+                                   &sack_state, &now);
  
        if (tcp_ack_is_dubious(sk, flag)) {
                is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
  
        if (icsk->icsk_pending == ICSK_TIME_RETRANS)
                tcp_schedule_loss_probe(sk);
-       tcp_cong_control(sk, ack, tp->delivered - prior_delivered, flag);
+       delivered = tp->delivered - delivered;  /* freshly ACKed or SACKed */
+       lost = tp->lost - lost;                 /* freshly marked lost */
+       tcp_rate_gen(sk, delivered, lost, &now, &rs);
+       tcp_cong_control(sk, ack, delivered, flag, &rs);
        tcp_xmit_recovery(sk, rexmit);
        return 1;
  
@@@ -4108,7 -4107,7 +4107,7 @@@ void tcp_fin(struct sock *sk
        /* It _is_ possible, that we have something out-of-order _after_ FIN.
         * Probably, we should reset in this case. For now drop them.
         */
-       __skb_queue_purge(&tp->out_of_order_queue);
+       skb_rbtree_purge(&tp->out_of_order_queue);
        if (tcp_is_sack(tp))
                tcp_sack_reset(&tp->rx_opt);
        sk_mem_reclaim(sk);
@@@ -4268,7 -4267,7 +4267,7 @@@ static void tcp_sack_remove(struct tcp_
        int this_sack;
  
        /* Empty ofo queue, hence, all the SACKs are eaten. Clear. */
-       if (skb_queue_empty(&tp->out_of_order_queue)) {
+       if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
                tp->rx_opt.num_sacks = 0;
                return;
        }
@@@ -4344,10 -4343,13 +4343,13 @@@ static void tcp_ofo_queue(struct sock *
  {
        struct tcp_sock *tp = tcp_sk(sk);
        __u32 dsack_high = tp->rcv_nxt;
+       bool fin, fragstolen, eaten;
        struct sk_buff *skb, *tail;
-       bool fragstolen, eaten;
+       struct rb_node *p;
  
-       while ((skb = skb_peek(&tp->out_of_order_queue)) != NULL) {
+       p = rb_first(&tp->out_of_order_queue);
+       while (p) {
+               skb = rb_entry(p, struct sk_buff, rbnode);
                if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
                        break;
  
                                dsack_high = TCP_SKB_CB(skb)->end_seq;
                        tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack);
                }
+               p = rb_next(p);
+               rb_erase(&skb->rbnode, &tp->out_of_order_queue);
  
-               __skb_unlink(skb, &tp->out_of_order_queue);
-               if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
+               if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) {
                        SOCK_DEBUG(sk, "ofo packet was already received\n");
                        tcp_drop(sk, skb);
                        continue;
                tail = skb_peek_tail(&sk->sk_receive_queue);
                eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen);
                tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
+               fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
                if (!eaten)
                        __skb_queue_tail(&sk->sk_receive_queue, skb);
-               if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
-                       tcp_fin(sk);
-               if (eaten)
+               else
                        kfree_skb_partial(skb, fragstolen);
+               if (unlikely(fin)) {
+                       tcp_fin(sk);
+                       /* tcp_fin() purges tp->out_of_order_queue,
+                        * so we must end this loop right now.
+                        */
+                       break;
+               }
        }
  }
  
@@@ -4403,8 -4413,10 +4413,10 @@@ static int tcp_try_rmem_schedule(struc
  static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
  {
        struct tcp_sock *tp = tcp_sk(sk);
+       struct rb_node **p, *q, *parent;
        struct sk_buff *skb1;
        u32 seq, end_seq;
+       bool fragstolen;
  
        tcp_ecn_check_ce(tp, skb);
  
        inet_csk_schedule_ack(sk);
  
        NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
+       seq = TCP_SKB_CB(skb)->seq;
+       end_seq = TCP_SKB_CB(skb)->end_seq;
        SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
-                  tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
+                  tp->rcv_nxt, seq, end_seq);
  
-       skb1 = skb_peek_tail(&tp->out_of_order_queue);
-       if (!skb1) {
+       p = &tp->out_of_order_queue.rb_node;
+       if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
                /* Initial out of order segment, build 1 SACK. */
                if (tcp_is_sack(tp)) {
                        tp->rx_opt.num_sacks = 1;
-                       tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq;
-                       tp->selective_acks[0].end_seq =
-                                               TCP_SKB_CB(skb)->end_seq;
+                       tp->selective_acks[0].start_seq = seq;
+                       tp->selective_acks[0].end_seq = end_seq;
                }
-               __skb_queue_head(&tp->out_of_order_queue, skb);
+               rb_link_node(&skb->rbnode, NULL, p);
+               rb_insert_color(&skb->rbnode, &tp->out_of_order_queue);
+               tp->ooo_last_skb = skb;
                goto end;
        }
  
-       seq = TCP_SKB_CB(skb)->seq;
-       end_seq = TCP_SKB_CB(skb)->end_seq;
-       if (seq == TCP_SKB_CB(skb1)->end_seq) {
-               bool fragstolen;
-               if (!tcp_try_coalesce(sk, skb1, skb, &fragstolen)) {
-                       __skb_queue_after(&tp->out_of_order_queue, skb1, skb);
-               } else {
-                       tcp_grow_window(sk, skb);
-                       kfree_skb_partial(skb, fragstolen);
-                       skb = NULL;
-               }
-               if (!tp->rx_opt.num_sacks ||
-                   tp->selective_acks[0].end_seq != seq)
-                       goto add_sack;
-               /* Common case: data arrive in order after hole. */
-               tp->selective_acks[0].end_seq = end_seq;
-               goto end;
-       }
-       /* Find place to insert this segment. */
-       while (1) {
-               if (!after(TCP_SKB_CB(skb1)->seq, seq))
-                       break;
-               if (skb_queue_is_first(&tp->out_of_order_queue, skb1)) {
-                       skb1 = NULL;
-                       break;
-               }
-               skb1 = skb_queue_prev(&tp->out_of_order_queue, skb1);
-       }
-       /* Do skb overlap to previous one? */
-       if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
-               if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
-                       /* All the bits are present. Drop. */
-                       NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
-                       tcp_drop(sk, skb);
-                       skb = NULL;
-                       tcp_dsack_set(sk, seq, end_seq);
-                       goto add_sack;
+       /* In the typical case, we are adding an skb to the end of the list.
+        * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup.
+        */
+       if (tcp_try_coalesce(sk, tp->ooo_last_skb, skb, &fragstolen)) {
+ coalesce_done:
+               tcp_grow_window(sk, skb);
+               kfree_skb_partial(skb, fragstolen);
+               skb = NULL;
+               goto add_sack;
+       }
+       /* Can avoid an rbtree lookup if we are adding skb after ooo_last_skb */
+       if (!before(seq, TCP_SKB_CB(tp->ooo_last_skb)->end_seq)) {
+               parent = &tp->ooo_last_skb->rbnode;
+               p = &parent->rb_right;
+               goto insert;
+       }
+       /* Find place to insert this segment. Handle overlaps on the way. */
+       parent = NULL;
+       while (*p) {
+               parent = *p;
+               skb1 = rb_entry(parent, struct sk_buff, rbnode);
+               if (before(seq, TCP_SKB_CB(skb1)->seq)) {
+                       p = &parent->rb_left;
+                       continue;
                }
-               if (after(seq, TCP_SKB_CB(skb1)->seq)) {
-                       /* Partial overlap. */
-                       tcp_dsack_set(sk, seq,
-                                     TCP_SKB_CB(skb1)->end_seq);
-               } else {
-                       if (skb_queue_is_first(&tp->out_of_order_queue,
-                                              skb1))
-                               skb1 = NULL;
-                       else
-                               skb1 = skb_queue_prev(
-                                       &tp->out_of_order_queue,
-                                       skb1);
+               if (before(seq, TCP_SKB_CB(skb1)->end_seq)) {
+                       if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
+                               /* All the bits are present. Drop. */
+                               NET_INC_STATS(sock_net(sk),
+                                             LINUX_MIB_TCPOFOMERGE);
+                               __kfree_skb(skb);
+                               skb = NULL;
+                               tcp_dsack_set(sk, seq, end_seq);
+                               goto add_sack;
+                       }
+                       if (after(seq, TCP_SKB_CB(skb1)->seq)) {
+                               /* Partial overlap. */
+                               tcp_dsack_set(sk, seq, TCP_SKB_CB(skb1)->end_seq);
+                       } else {
+                               /* skb's seq == skb1's seq and skb covers skb1.
+                                * Replace skb1 with skb.
+                                */
+                               rb_replace_node(&skb1->rbnode, &skb->rbnode,
+                                               &tp->out_of_order_queue);
+                               tcp_dsack_extend(sk,
+                                                TCP_SKB_CB(skb1)->seq,
+                                                TCP_SKB_CB(skb1)->end_seq);
+                               NET_INC_STATS(sock_net(sk),
+                                             LINUX_MIB_TCPOFOMERGE);
+                               __kfree_skb(skb1);
+                               goto merge_right;
+                       }
+               } else if (tcp_try_coalesce(sk, skb1, skb, &fragstolen)) {
+                       goto coalesce_done;
                }
+               p = &parent->rb_right;
        }
-       if (!skb1)
-               __skb_queue_head(&tp->out_of_order_queue, skb);
-       else
-               __skb_queue_after(&tp->out_of_order_queue, skb1, skb);
+ insert:
+       /* Insert segment into RB tree. */
+       rb_link_node(&skb->rbnode, parent, p);
+       rb_insert_color(&skb->rbnode, &tp->out_of_order_queue);
  
-       /* And clean segments covered by new one as whole. */
-       while (!skb_queue_is_last(&tp->out_of_order_queue, skb)) {
-               skb1 = skb_queue_next(&tp->out_of_order_queue, skb);
+ merge_right:
+       /* Remove other segments covered by skb. */
+       while ((q = rb_next(&skb->rbnode)) != NULL) {
+               skb1 = rb_entry(q, struct sk_buff, rbnode);
  
                if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
                        break;
                                         end_seq);
                        break;
                }
-               __skb_unlink(skb1, &tp->out_of_order_queue);
+               rb_erase(&skb1->rbnode, &tp->out_of_order_queue);
                tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
                                 TCP_SKB_CB(skb1)->end_seq);
                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
                tcp_drop(sk, skb1);
        }
+       /* If there is no skb after us, we are the last_skb ! */
+       if (!q)
+               tp->ooo_last_skb = skb;
  
  add_sack:
        if (tcp_is_sack(tp))
@@@ -4651,13 -4670,13 +4670,13 @@@ queue_and_out
                if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
                        tcp_fin(sk);
  
-               if (!skb_queue_empty(&tp->out_of_order_queue)) {
+               if (!RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
                        tcp_ofo_queue(sk);
  
                        /* RFC2581. 4.2. SHOULD send immediate ACK, when
                         * gap in queue is filled.
                         */
-                       if (skb_queue_empty(&tp->out_of_order_queue))
+                       if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
                                inet_csk(sk)->icsk_ack.pingpong = 0;
                }
  
@@@ -4711,48 -4730,76 +4730,76 @@@ drop
        tcp_data_queue_ofo(sk, skb);
  }
  
+ static struct sk_buff *tcp_skb_next(struct sk_buff *skb, struct sk_buff_head *list)
+ {
+       if (list)
+               return !skb_queue_is_last(list, skb) ? skb->next : NULL;
+       return rb_entry_safe(rb_next(&skb->rbnode), struct sk_buff, rbnode);
+ }
  static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
-                                       struct sk_buff_head *list)
+                                       struct sk_buff_head *list,
+                                       struct rb_root *root)
  {
-       struct sk_buff *next = NULL;
+       struct sk_buff *next = tcp_skb_next(skb, list);
  
-       if (!skb_queue_is_last(list, skb))
-               next = skb_queue_next(list, skb);
+       if (list)
+               __skb_unlink(skb, list);
+       else
+               rb_erase(&skb->rbnode, root);
  
-       __skb_unlink(skb, list);
        __kfree_skb(skb);
        NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED);
  
        return next;
  }
  
+ /* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */
+ static void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb)
+ {
+       struct rb_node **p = &root->rb_node;
+       struct rb_node *parent = NULL;
+       struct sk_buff *skb1;
+       while (*p) {
+               parent = *p;
+               skb1 = rb_entry(parent, struct sk_buff, rbnode);
+               if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq))
+                       p = &parent->rb_left;
+               else
+                       p = &parent->rb_right;
+       }
+       rb_link_node(&skb->rbnode, parent, p);
+       rb_insert_color(&skb->rbnode, root);
+ }
  /* Collapse contiguous sequence of skbs head..tail with
   * sequence numbers start..end.
   *
-  * If tail is NULL, this means until the end of the list.
+  * If tail is NULL, this means until the end of the queue.
   *
   * Segments with FIN/SYN are not collapsed (only because this
   * simplifies code)
   */
  static void
- tcp_collapse(struct sock *sk, struct sk_buff_head *list,
-            struct sk_buff *head, struct sk_buff *tail,
-            u32 start, u32 end)
+ tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root,
+            struct sk_buff *head, struct sk_buff *tail, u32 start, u32 end)
  {
-       struct sk_buff *skb, *n;
+       struct sk_buff *skb = head, *n;
+       struct sk_buff_head tmp;
        bool end_of_skbs;
  
        /* First, check that queue is collapsible and find
-        * the point where collapsing can be useful. */
-       skb = head;
+        * the point where collapsing can be useful.
+        */
  restart:
-       end_of_skbs = true;
-       skb_queue_walk_from_safe(list, skb, n) {
-               if (skb == tail)
-                       break;
+       for (end_of_skbs = true; skb != NULL && skb != tail; skb = n) {
+               n = tcp_skb_next(skb, list);
                /* No new bits? It is possible on ofo queue. */
                if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
-                       skb = tcp_collapse_one(sk, skb, list);
+                       skb = tcp_collapse_one(sk, skb, list, root);
                        if (!skb)
                                break;
                        goto restart;
                        break;
                }
  
-               if (!skb_queue_is_last(list, skb)) {
-                       struct sk_buff *next = skb_queue_next(list, skb);
-                       if (next != tail &&
-                           TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(next)->seq) {
-                               end_of_skbs = false;
-                               break;
-                       }
+               if (n && n != tail &&
+                   TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) {
+                       end_of_skbs = false;
+                       break;
                }
  
                /* Decided to skip this, advance start seq. */
            (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
                return;
  
+       __skb_queue_head_init(&tmp);
        while (before(start, end)) {
                int copy = min_t(int, SKB_MAX_ORDER(0, 0), end - start);
                struct sk_buff *nskb;
  
                nskb = alloc_skb(copy, GFP_ATOMIC);
                if (!nskb)
-                       return;
+                       break;
  
                memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
                TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
-               __skb_queue_before(list, skb, nskb);
+               if (list)
+                       __skb_queue_before(list, skb, nskb);
+               else
+                       __skb_queue_tail(&tmp, nskb); /* defer rbtree insertion */
                skb_set_owner_r(nskb, sk);
  
                /* Copy data, releasing collapsed skbs. */
                                start += size;
                        }
                        if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
-                               skb = tcp_collapse_one(sk, skb, list);
+                               skb = tcp_collapse_one(sk, skb, list, root);
                                if (!skb ||
                                    skb == tail ||
                                    (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
-                                       return;
+                                       goto end;
                        }
                }
        }
+ end:
+       skb_queue_walk_safe(&tmp, skb, n)
+               tcp_rbtree_insert(root, skb);
  }
  
  /* Collapse ofo queue. Algorithm: select contiguous sequence of skbs
  static void tcp_collapse_ofo_queue(struct sock *sk)
  {
        struct tcp_sock *tp = tcp_sk(sk);
-       struct sk_buff *skb = skb_peek(&tp->out_of_order_queue);
-       struct sk_buff *head;
+       struct sk_buff *skb, *head;
+       struct rb_node *p;
        u32 start, end;
  
-       if (!skb)
+       p = rb_first(&tp->out_of_order_queue);
+       skb = rb_entry_safe(p, struct sk_buff, rbnode);
+ new_range:
+       if (!skb) {
+               p = rb_last(&tp->out_of_order_queue);
+               /* Note: This is possible p is NULL here. We do not
+                * use rb_entry_safe(), as ooo_last_skb is valid only
+                * if rbtree is not empty.
+                */
+               tp->ooo_last_skb = rb_entry(p, struct sk_buff, rbnode);
                return;
+       }
        start = TCP_SKB_CB(skb)->seq;
        end = TCP_SKB_CB(skb)->end_seq;
-       head = skb;
-       for (;;) {
-               struct sk_buff *next = NULL;
  
-               if (!skb_queue_is_last(&tp->out_of_order_queue, skb))
-                       next = skb_queue_next(&tp->out_of_order_queue, skb);
-               skb = next;
+       for (head = skb;;) {
+               skb = tcp_skb_next(skb, NULL);
  
-               /* Segment is terminated when we see gap or when
-                * we are at the end of all the queue. */
+               /* Range is terminated when we see a gap or when
+                * we are at the queue end.
+                */
                if (!skb ||
                    after(TCP_SKB_CB(skb)->seq, end) ||
                    before(TCP_SKB_CB(skb)->end_seq, start)) {
-                       tcp_collapse(sk, &tp->out_of_order_queue,
+                       tcp_collapse(sk, NULL, &tp->out_of_order_queue,
                                     head, skb, start, end);
-                       head = skb;
-                       if (!skb)
-                               break;
-                       /* Start new segment */
+                       goto new_range;
+               }
+               if (unlikely(before(TCP_SKB_CB(skb)->seq, start)))
                        start = TCP_SKB_CB(skb)->seq;
+               if (after(TCP_SKB_CB(skb)->end_seq, end))
                        end = TCP_SKB_CB(skb)->end_seq;
-               } else {
-                       if (before(TCP_SKB_CB(skb)->seq, start))
-                               start = TCP_SKB_CB(skb)->seq;
-                       if (after(TCP_SKB_CB(skb)->end_seq, end))
-                               end = TCP_SKB_CB(skb)->end_seq;
-               }
        }
  }
  
  static bool tcp_prune_ofo_queue(struct sock *sk)
  {
        struct tcp_sock *tp = tcp_sk(sk);
-       struct sk_buff *skb;
+       struct rb_node *node, *prev;
  
-       if (skb_queue_empty(&tp->out_of_order_queue))
+       if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
                return false;
  
        NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
-       while ((skb = __skb_dequeue_tail(&tp->out_of_order_queue)) != NULL) {
-               tcp_drop(sk, skb);
+       node = &tp->ooo_last_skb->rbnode;
+       do {
+               prev = rb_prev(node);
+               rb_erase(node, &tp->out_of_order_queue);
+               tcp_drop(sk, rb_entry(node, struct sk_buff, rbnode));
                sk_mem_reclaim(sk);
                if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
                    !tcp_under_memory_pressure(sk))
                        break;
-       }
+               node = prev;
+       } while (node);
+       tp->ooo_last_skb = rb_entry(prev, struct sk_buff, rbnode);
  
        /* Reset SACK state.  A conforming SACK implementation will
         * do the same at a timeout based retransmit.  When a connection
@@@ -4930,7 -4986,7 +4986,7 @@@ static int tcp_prune_queue(struct sock 
  
        tcp_collapse_ofo_queue(sk);
        if (!skb_queue_empty(&sk->sk_receive_queue))
-               tcp_collapse(sk, &sk->sk_receive_queue,
+               tcp_collapse(sk, &sk->sk_receive_queue, NULL,
                             skb_peek(&sk->sk_receive_queue),
                             NULL,
                             tp->copied_seq, tp->rcv_nxt);
@@@ -5035,7 -5091,7 +5091,7 @@@ static void __tcp_ack_snd_check(struct 
            /* We ACK each frame or... */
            tcp_in_quickack_mode(sk) ||
            /* We have out of order data. */
-           (ofo_possible && skb_peek(&tp->out_of_order_queue))) {
+           (ofo_possible && !RB_EMPTY_ROOT(&tp->out_of_order_queue))) {
                /* Then ack it now */
                tcp_send_ack(sk);
        } else {
@@@ -5894,7 -5950,7 +5950,7 @@@ int tcp_rcv_state_process(struct sock *
                 * so release it.
                 */
                if (req) {
-                       tp->total_retrans = req->num_retrans;
+                       inet_csk(sk)->icsk_retransmits = 0;
                        reqsk_fastopen_remove(sk, req, false);
                } else {
                        /* Make sure socket is routed, for correct metrics. */
                } else
                        tcp_init_metrics(sk);
  
-               tcp_update_pacing_rate(sk);
+               if (!inet_csk(sk)->icsk_ca_ops->cong_control)
+                       tcp_update_pacing_rate(sk);
  
                /* Prevent spurious tcp_cwnd_restart() on first data packet */
                tp->lsndtime = tcp_time_stamp;
@@@ -6269,7 -6326,6 +6326,7 @@@ int tcp_conn_request(struct request_soc
  
        tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
        tcp_openreq_init(req, &tmp_opt, skb, sk);
 +      inet_rsk(req)->no_srccheck = inet_sk(sk)->transparent;
  
        /* Note: tcp_v6_init_req() might override ir_iif for link locals */
        inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb);
diff --combined net/ipv4/tcp_ipv4.c
@@@ -1196,6 -1196,7 +1196,6 @@@ static void tcp_v4_init_req(struct requ
  
        sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
        sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
 -      ireq->no_srccheck = inet_sk(sk_listener)->transparent;
        ireq->opt = tcp_v4_save_options(skb);
  }
  
@@@ -1844,7 -1845,7 +1844,7 @@@ void tcp_v4_destroy_sock(struct sock *s
        tcp_write_queue_purge(sk);
  
        /* Cleans up our, hopefully empty, out_of_order_queue. */
-       __skb_queue_purge(&tp->out_of_order_queue);
+       skb_rbtree_purge(&tp->out_of_order_queue);
  
  #ifdef CONFIG_TCP_MD5SIG
        /* Clean up the MD5 key list, if any */
@@@ -31,8 -31,11 +31,9 @@@ static unsigned int nf_route_table_hook
        struct in6_addr saddr, daddr;
        u_int8_t hop_limit;
        u32 mark, flowlabel;
+       int err;
  
 -      /* malformed packet, drop it */
 -      if (nft_set_pktinfo_ipv6(&pkt, skb, state) < 0)
 -              return NF_DROP;
 +      nft_set_pktinfo_ipv6(&pkt, skb, state);
  
        /* save source/dest address, mark, hoplimit, flowlabel, priority */
        memcpy(&saddr, &ipv6_hdr(skb)->saddr, sizeof(saddr));
        flowlabel = *((u32 *)ipv6_hdr(skb));
  
        ret = nft_do_chain(&pkt, priv);
-       if (ret != NF_DROP && ret != NF_QUEUE &&
+       if (ret != NF_DROP && ret != NF_STOLEN &&
            (memcmp(&ipv6_hdr(skb)->saddr, &saddr, sizeof(saddr)) ||
             memcmp(&ipv6_hdr(skb)->daddr, &daddr, sizeof(daddr)) ||
             skb->mark != mark ||
             ipv6_hdr(skb)->hop_limit != hop_limit ||
-            flowlabel != *((u_int32_t *)ipv6_hdr(skb))))
-               return ip6_route_me_harder(state->net, skb) == 0 ? ret : NF_DROP;
+            flowlabel != *((u_int32_t *)ipv6_hdr(skb)))) {
+               err = ip6_route_me_harder(state->net, skb);
+               if (err < 0)
+                       ret = NF_DROP_ERR(err);
+       }
  
        return ret;
  }
diff --combined net/netfilter/core.c
@@@ -22,7 -22,6 +22,7 @@@
  #include <linux/proc_fs.h>
  #include <linux/mutex.h>
  #include <linux/slab.h>
 +#include <linux/rcupdate.h>
  #include <net/net_namespace.h>
  #include <net/sock.h>
  
@@@ -62,55 -61,33 +62,55 @@@ EXPORT_SYMBOL(nf_hooks_needed)
  #endif
  
  static DEFINE_MUTEX(nf_hook_mutex);
 +#define nf_entry_dereference(e) \
 +      rcu_dereference_protected(e, lockdep_is_held(&nf_hook_mutex))
  
 -static struct list_head *nf_find_hook_list(struct net *net,
 -                                         const struct nf_hook_ops *reg)
 +static struct nf_hook_entry *nf_hook_entry_head(struct net *net,
 +                                              const struct nf_hook_ops *reg)
  {
 -      struct list_head *hook_list = NULL;
 +      struct nf_hook_entry *hook_head = NULL;
  
        if (reg->pf != NFPROTO_NETDEV)
 -              hook_list = &net->nf.hooks[reg->pf][reg->hooknum];
 +              hook_head = nf_entry_dereference(net->nf.hooks[reg->pf]
 +                                               [reg->hooknum]);
        else if (reg->hooknum == NF_NETDEV_INGRESS) {
  #ifdef CONFIG_NETFILTER_INGRESS
                if (reg->dev && dev_net(reg->dev) == net)
 -                      hook_list = &reg->dev->nf_hooks_ingress;
 +                      hook_head =
 +                              nf_entry_dereference(
 +                                      reg->dev->nf_hooks_ingress);
  #endif
        }
 -      return hook_list;
 +      return hook_head;
  }
  
 -struct nf_hook_entry {
 -      const struct nf_hook_ops        *orig_ops;
 -      struct nf_hook_ops              ops;
 -};
 +/* must hold nf_hook_mutex */
 +static void nf_set_hooks_head(struct net *net, const struct nf_hook_ops *reg,
 +                            struct nf_hook_entry *entry)
 +{
 +      switch (reg->pf) {
 +      case NFPROTO_NETDEV:
 +              /* We already checked in nf_register_net_hook() that this is
 +               * used from ingress.
 +               */
 +              rcu_assign_pointer(reg->dev->nf_hooks_ingress, entry);
 +              break;
 +      default:
 +              rcu_assign_pointer(net->nf.hooks[reg->pf][reg->hooknum],
 +                                 entry);
 +              break;
 +      }
 +}
  
  int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg)
  {
 -      struct list_head *hook_list;
 +      struct nf_hook_entry *hooks_entry;
        struct nf_hook_entry *entry;
 -      struct nf_hook_ops *elem;
 +
 +      if (reg->pf == NFPROTO_NETDEV &&
 +          (reg->hooknum != NF_NETDEV_INGRESS ||
 +           !reg->dev || dev_net(reg->dev) != net))
 +              return -EINVAL;
  
        entry = kmalloc(sizeof(*entry), GFP_KERNEL);
        if (!entry)
  
        entry->orig_ops = reg;
        entry->ops      = *reg;
 +      entry->next     = NULL;
 +
 +      mutex_lock(&nf_hook_mutex);
 +      hooks_entry = nf_hook_entry_head(net, reg);
  
 -      hook_list = nf_find_hook_list(net, reg);
 -      if (!hook_list) {
 -              kfree(entry);
 -              return -ENOENT;
 +      if (hooks_entry && hooks_entry->orig_ops->priority > reg->priority) {
 +              /* This is the case where we need to insert at the head */
 +              entry->next = hooks_entry;
 +              hooks_entry = NULL;
        }
  
 -      mutex_lock(&nf_hook_mutex);
 -      list_for_each_entry(elem, hook_list, list) {
 -              if (reg->priority < elem->priority)
 -                      break;
 +      while (hooks_entry &&
 +              reg->priority >= hooks_entry->orig_ops->priority &&
 +              nf_entry_dereference(hooks_entry->next)) {
 +              hooks_entry = nf_entry_dereference(hooks_entry->next);
 +      }
 +
 +      if (hooks_entry) {
 +              entry->next = nf_entry_dereference(hooks_entry->next);
 +              rcu_assign_pointer(hooks_entry->next, entry);
 +      } else {
 +              nf_set_hooks_head(net, reg, entry);
        }
 -      list_add_rcu(&entry->ops.list, elem->list.prev);
 +
        mutex_unlock(&nf_hook_mutex);
  #ifdef CONFIG_NETFILTER_INGRESS
        if (reg->pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS)
@@@ -156,33 -122,24 +156,33 @@@ EXPORT_SYMBOL(nf_register_net_hook)
  
  void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg)
  {
 -      struct list_head *hook_list;
 -      struct nf_hook_entry *entry;
 -      struct nf_hook_ops *elem;
 -
 -      hook_list = nf_find_hook_list(net, reg);
 -      if (!hook_list)
 -              return;
 +      struct nf_hook_entry *hooks_entry;
  
        mutex_lock(&nf_hook_mutex);
 -      list_for_each_entry(elem, hook_list, list) {
 -              entry = container_of(elem, struct nf_hook_entry, ops);
 -              if (entry->orig_ops == reg) {
 -                      list_del_rcu(&entry->ops.list);
 -                      break;
 +      hooks_entry = nf_hook_entry_head(net, reg);
 +      if (hooks_entry->orig_ops == reg) {
 +              nf_set_hooks_head(net, reg,
 +                                nf_entry_dereference(hooks_entry->next));
 +              goto unlock;
 +      }
 +      while (hooks_entry && nf_entry_dereference(hooks_entry->next)) {
 +              struct nf_hook_entry *next =
 +                      nf_entry_dereference(hooks_entry->next);
 +              struct nf_hook_entry *nnext;
 +
 +              if (next->orig_ops != reg) {
 +                      hooks_entry = next;
 +                      continue;
                }
 +              nnext = nf_entry_dereference(next->next);
 +              rcu_assign_pointer(hooks_entry->next, nnext);
 +              hooks_entry = next;
 +              break;
        }
 +
 +unlock:
        mutex_unlock(&nf_hook_mutex);
 -      if (&elem->list == hook_list) {
 +      if (!hooks_entry) {
                WARN(1, "nf_unregister_net_hook: hook not found!\n");
                return;
        }
        static_key_slow_dec(&nf_hooks_needed[reg->pf][reg->hooknum]);
  #endif
        synchronize_net();
 -      nf_queue_nf_hook_drop(net, &entry->ops);
 +      nf_queue_nf_hook_drop(net, hooks_entry);
        /* other cpu might still process nfqueue verdict that used reg */
        synchronize_net();
 -      kfree(entry);
 +      kfree(hooks_entry);
  }
  EXPORT_SYMBOL(nf_unregister_net_hook);
  
@@@ -231,19 -188,17 +231,17 @@@ EXPORT_SYMBOL(nf_unregister_net_hooks)
  
  static LIST_HEAD(nf_hook_list);
  
int nf_register_hook(struct nf_hook_ops *reg)
static int _nf_register_hook(struct nf_hook_ops *reg)
  {
        struct net *net, *last;
        int ret;
  
-       rtnl_lock();
        for_each_net(net) {
                ret = nf_register_net_hook(net, reg);
                if (ret && ret != -ENOENT)
                        goto rollback;
        }
        list_add_tail(&reg->list, &nf_hook_list);
-       rtnl_unlock();
  
        return 0;
  rollback:
                        break;
                nf_unregister_net_hook(net, reg);
        }
+       return ret;
+ }
+ int nf_register_hook(struct nf_hook_ops *reg)
+ {
+       int ret;
+       rtnl_lock();
+       ret = _nf_register_hook(reg);
        rtnl_unlock();
        return ret;
  }
  EXPORT_SYMBOL(nf_register_hook);
  
void nf_unregister_hook(struct nf_hook_ops *reg)
static void _nf_unregister_hook(struct nf_hook_ops *reg)
  {
        struct net *net;
  
-       rtnl_lock();
        list_del(&reg->list);
        for_each_net(net)
                nf_unregister_net_hook(net, reg);
+ }
+ void nf_unregister_hook(struct nf_hook_ops *reg)
+ {
+       rtnl_lock();
+       _nf_unregister_hook(reg);
        rtnl_unlock();
  }
  EXPORT_SYMBOL(nf_unregister_hook);
@@@ -289,6 -259,26 +302,26 @@@ err
  }
  EXPORT_SYMBOL(nf_register_hooks);
  
+ /* Caller MUST take rtnl_lock() */
+ int _nf_register_hooks(struct nf_hook_ops *reg, unsigned int n)
+ {
+       unsigned int i;
+       int err = 0;
+       for (i = 0; i < n; i++) {
+               err = _nf_register_hook(&reg[i]);
+               if (err)
+                       goto err;
+       }
+       return err;
+ err:
+       if (i > 0)
+               _nf_unregister_hooks(reg, i);
+       return err;
+ }
+ EXPORT_SYMBOL(_nf_register_hooks);
  void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n)
  {
        while (n-- > 0)
  }
  EXPORT_SYMBOL(nf_unregister_hooks);
  
 -unsigned int nf_iterate(struct list_head *head,
 -                      struct sk_buff *skb,
+ /* Caller MUST take rtnl_lock */
+ void _nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n)
+ {
+       while (n-- > 0)
+               _nf_unregister_hook(&reg[n]);
+ }
+ EXPORT_SYMBOL(_nf_unregister_hooks);
 +unsigned int nf_iterate(struct sk_buff *skb,
                        struct nf_hook_state *state,
 -                      struct nf_hook_ops **elemp)
 +                      struct nf_hook_entry **entryp)
  {
        unsigned int verdict;
  
         * The caller must not block between calls to this
         * function because of risk of continuing from deleted element.
         */
 -      list_for_each_entry_continue_rcu((*elemp), head, list) {
 -              if (state->thresh > (*elemp)->priority)
 +      while (*entryp) {
 +              if (state->thresh > (*entryp)->ops.priority) {
 +                      *entryp = rcu_dereference((*entryp)->next);
                        continue;
 +              }
  
                /* Optimization: we don't need to hold module
                   reference here, since function can't sleep. --RR */
  repeat:
 -              verdict = (*elemp)->hook((*elemp)->priv, skb, state);
 +              verdict = (*entryp)->ops.hook((*entryp)->ops.priv, skb, state);
                if (verdict != NF_ACCEPT) {
  #ifdef CONFIG_NETFILTER_DEBUG
                        if (unlikely((verdict & NF_VERDICT_MASK)
                                                        > NF_MAX_VERDICT)) {
                                NFDEBUG("Evil return from %p(%u).\n",
 -                                      (*elemp)->hook, state->hook);
 +                                      (*entryp)->ops.hook, state->hook);
 +                              *entryp = rcu_dereference((*entryp)->next);
                                continue;
                        }
  #endif
                                return verdict;
                        goto repeat;
                }
 +              *entryp = rcu_dereference((*entryp)->next);
        }
        return NF_ACCEPT;
  }
  
  
  /* Returns 1 if okfn() needs to be executed by the caller,
 - * -EPERM for NF_DROP, 0 otherwise. */
 + * -EPERM for NF_DROP, 0 otherwise.  Caller must hold rcu_read_lock. */
  int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state)
  {
 -      struct nf_hook_ops *elem;
 +      struct nf_hook_entry *entry;
        unsigned int verdict;
        int ret = 0;
  
 -      /* We may already have this, but read-locks nest anyway */
 -      rcu_read_lock();
 -
 -      elem = list_entry_rcu(state->hook_list, struct nf_hook_ops, list);
 +      entry = rcu_dereference(state->hook_entries);
  next_hook:
 -      verdict = nf_iterate(state->hook_list, skb, state, &elem);
 +      verdict = nf_iterate(skb, state, &entry);
        if (verdict == NF_ACCEPT || verdict == NF_STOP) {
                ret = 1;
        } else if ((verdict & NF_VERDICT_MASK) == NF_DROP) {
                if (ret == 0)
                        ret = -EPERM;
        } else if ((verdict & NF_VERDICT_MASK) == NF_QUEUE) {
 -              int err = nf_queue(skb, elem, state,
 -                                 verdict >> NF_VERDICT_QBITS);
 +              int err;
 +
 +              RCU_INIT_POINTER(state->hook_entries, entry);
 +              err = nf_queue(skb, state, verdict >> NF_VERDICT_QBITS);
                if (err < 0) {
                        if (err == -ESRCH &&
                           (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS))
                        kfree_skb(skb);
                }
        }
 -      rcu_read_unlock();
        return ret;
  }
  EXPORT_SYMBOL(nf_hook_slow);
@@@ -485,7 -482,7 +526,7 @@@ static int __net_init netfilter_net_ini
  
        for (i = 0; i < ARRAY_SIZE(net->nf.hooks); i++) {
                for (h = 0; h < NF_MAX_HOOKS; h++)
 -                      INIT_LIST_HEAD(&net->nf.hooks[i][h]);
 +                      RCU_INIT_POINTER(net->nf.hooks[i][h], NULL);
        }
  
  #ifdef CONFIG_PROC_FS
@@@ -379,6 -379,7 +379,6 @@@ static voi
  destroy_conntrack(struct nf_conntrack *nfct)
  {
        struct nf_conn *ct = (struct nf_conn *)nfct;
 -      struct net *net = nf_ct_net(ct);
        struct nf_conntrack_l4proto *l4proto;
  
        pr_debug("destroy_conntrack(%p)\n", ct);
  
        nf_ct_del_from_dying_or_unconfirmed_list(ct);
  
 -      NF_CT_STAT_INC(net, delete);
        local_bh_enable();
  
        if (ct->master)
@@@ -436,6 -438,7 +436,6 @@@ static void nf_ct_delete_from_lists(str
  
        nf_ct_add_to_dying_list(ct);
  
 -      NF_CT_STAT_INC(net, delete_list);
        local_bh_enable();
  }
  
@@@ -526,8 -529,11 +526,8 @@@ begin
                if (nf_ct_is_dying(ct))
                        continue;
  
 -              if (nf_ct_key_equal(h, tuple, zone, net)) {
 -                      NF_CT_STAT_INC_ATOMIC(net, found);
 +              if (nf_ct_key_equal(h, tuple, zone, net))
                        return h;
 -              }
 -              NF_CT_STAT_INC_ATOMIC(net, searched);
        }
        /*
         * if the nulls value we got at the end of this lookup is
@@@ -792,6 -798,7 +792,6 @@@ __nf_conntrack_confirm(struct sk_buff *
         */
        __nf_conntrack_hash_insert(ct, hash, reply_hash);
        nf_conntrack_double_unlock(hash, reply_hash);
 -      NF_CT_STAT_INC(net, insert);
        local_bh_enable();
  
        help = nfct_help(ct);
@@@ -850,6 -857,7 +850,6 @@@ nf_conntrack_tuple_taken(const struct n
                        rcu_read_unlock();
                        return 1;
                }
 -              NF_CT_STAT_INC_ATOMIC(net, searched);
        }
  
        if (get_nulls_value(n) != hash) {
@@@ -1108,9 -1116,9 +1108,9 @@@ init_conntrack(struct net *net, struct 
        if (IS_ERR(ct))
                return (struct nf_conntrack_tuple_hash *)ct;
  
-       if (tmpl && nfct_synproxy(tmpl)) {
-               nfct_seqadj_ext_add(ct);
-               nfct_synproxy_ext_add(ct);
+       if (!nf_ct_add_synproxy(ct, tmpl)) {
+               nf_conntrack_free(ct);
+               return ERR_PTR(-ENOMEM);
        }
  
        timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL;
                }
                spin_unlock(&nf_conntrack_expect_lock);
        }
 -      if (!exp) {
 +      if (!exp)
                __nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC);
 -              NF_CT_STAT_INC(net, new);
 -      }
  
        /* Now it is inserted into the unconfirmed list, bump refcount */
        nf_conntrack_get(&ct->ct_general);
@@@ -1275,7 -1285,7 +1275,7 @@@ nf_conntrack_in(struct net *net, u_int8
                skb->nfct = NULL;
        }
  
 -      /* rcu_read_lock()ed by nf_hook_slow */
 +      /* rcu_read_lock()ed by nf_hook_thresh */
        l3proto = __nf_ct_l3proto_find(pf);
        ret = l3proto->get_l4proto(skb, skb_network_offset(skb),
                                   &dataoff, &protonum);
@@@ -113,22 -113,20 +113,22 @@@ static int nf_trace_fill_pkt_info(struc
                                  const struct nft_pktinfo *pkt)
  {
        const struct sk_buff *skb = pkt->skb;
 -      unsigned int len = min_t(unsigned int,
 -                               pkt->xt.thoff - skb_network_offset(skb),
 -                               NFT_TRACETYPE_NETWORK_HSIZE);
        int off = skb_network_offset(skb);
 +      unsigned int len, nh_end;
  
 +      nh_end = pkt->tprot_set ? pkt->xt.thoff : skb->len;
 +      len = min_t(unsigned int, nh_end - skb_network_offset(skb),
 +                  NFT_TRACETYPE_NETWORK_HSIZE);
        if (trace_fill_header(nlskb, NFTA_TRACE_NETWORK_HEADER, skb, off, len))
                return -1;
  
 -      len = min_t(unsigned int, skb->len - pkt->xt.thoff,
 -                  NFT_TRACETYPE_TRANSPORT_HSIZE);
 -
 -      if (trace_fill_header(nlskb, NFTA_TRACE_TRANSPORT_HEADER, skb,
 -                            pkt->xt.thoff, len))
 -              return -1;
 +      if (pkt->tprot_set) {
 +              len = min_t(unsigned int, skb->len - pkt->xt.thoff,
 +                          NFT_TRACETYPE_TRANSPORT_HSIZE);
 +              if (trace_fill_header(nlskb, NFTA_TRACE_TRANSPORT_HEADER, skb,
 +                                    pkt->xt.thoff, len))
 +                      return -1;
 +      }
  
        if (!skb_mac_header_was_set(skb))
                return 0;
@@@ -239,7 -237,7 +239,7 @@@ void nft_trace_notify(struct nft_tracei
                break;
        case NFT_TRACETYPE_POLICY:
                if (nla_put_be32(skb, NFTA_TRACE_POLICY,
-                                info->basechain->policy))
+                                htonl(info->basechain->policy)))
                        goto nla_put_failure;
                break;
        }
diff --combined net/netfilter/nft_meta.c
@@@ -52,8 -52,6 +52,8 @@@ void nft_meta_get_eval(const struct nft
                *dest = pkt->pf;
                break;
        case NFT_META_L4PROTO:
 +              if (!pkt->tprot_set)
 +                      goto err;
                *dest = pkt->tprot;
                break;
        case NFT_META_PRIORITY:
@@@ -293,10 -291,16 +293,16 @@@ int nft_meta_get_init(const struct nft_
  }
  EXPORT_SYMBOL_GPL(nft_meta_get_init);
  
- static int nft_meta_set_init_pkttype(const struct nft_ctx *ctx)
+ int nft_meta_set_validate(const struct nft_ctx *ctx,
+                         const struct nft_expr *expr,
+                         const struct nft_data **data)
  {
+       struct nft_meta *priv = nft_expr_priv(expr);
        unsigned int hooks;
  
+       if (priv->key != NFT_META_PKTTYPE)
+               return 0;
        switch (ctx->afi->family) {
        case NFPROTO_BRIDGE:
                hooks = 1 << NF_BR_PRE_ROUTING;
  
        return nft_chain_validate_hooks(ctx->chain, hooks);
  }
+ EXPORT_SYMBOL_GPL(nft_meta_set_validate);
  
  int nft_meta_set_init(const struct nft_ctx *ctx,
                      const struct nft_expr *expr,
                len = sizeof(u8);
                break;
        case NFT_META_PKTTYPE:
-               err = nft_meta_set_init_pkttype(ctx);
-               if (err)
-                       return err;
                len = sizeof(u8);
                break;
        default:
                return -EOPNOTSUPP;
        }
  
+       err = nft_meta_set_validate(ctx, expr, NULL);
+       if (err < 0)
+               return err;
        priv->sreg = nft_parse_register(tb[NFTA_META_SREG]);
        err = nft_validate_register_load(priv->sreg, len);
        if (err < 0)
@@@ -409,6 -415,7 +417,7 @@@ static const struct nft_expr_ops nft_me
        .init           = nft_meta_set_init,
        .destroy        = nft_meta_set_destroy,
        .dump           = nft_meta_set_dump,
+       .validate       = nft_meta_set_validate,
  };
  
  static const struct nft_expr_ops *