Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net

[cascardo/linux.git] / net / ipv4 / tcp_input.c
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c

index dad3e7e..a27b9c0 100644 (file)
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -289,6 +289,7 @@ static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr
  static void tcp_sndbuf_expand(struct sock *sk)
  {
         const struct tcp_sock *tp = tcp_sk(sk);
+       const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
         int sndmem, per_mss;
         u32 nr_segs;
  
@@ -309,7 +310,8 @@ static void tcp_sndbuf_expand(struct sock *sk)
          * Cubic needs 1.7 factor, rounded to 2 to include
          * extra cushion (application might react slowly to POLLOUT)
          */
-       sndmem = 2 * nr_segs * per_mss;
+       sndmem = ca_ops->sndbuf_expand ? ca_ops->sndbuf_expand(sk) : 2;
+       sndmem *= nr_segs * per_mss;
  
         if (sk->sk_sndbuf < sndmem)
                 sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
@@ -899,12 +901,29 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
                 tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
  }
  
+/* Account newly lost packets in tp->lost.
+ * An skb adds its packet count in exactly two situations:
+ * a) it carries no TCPCB_LOST mark yet, so this is the first
+ *    time it is declared lost;
+ * b) it is already marked TCPCB_LOST and was retransmitted,
+ *    so it is now considered lost once more.
+ */
+static void tcp_sum_lost(struct tcp_sock *tp, struct sk_buff *skb)
+{
+       const __u8 flags = TCP_SKB_CB(skb)->sacked;
+       const bool first_loss = !(flags & TCPCB_LOST);
+
+       if (first_loss || (flags & TCPCB_SACKED_RETRANS))
+               tp->lost += tcp_skb_pcount(skb);
+}
+
  static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb)
  {
         if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
                 tcp_verify_retransmit_hint(tp, skb);
  
                 tp->lost_out += tcp_skb_pcount(skb);
+               tcp_sum_lost(tp, skb); /* before TCPCB_LOST is set */
                 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
         }
  }
@@ -913,6 +932,7 @@ void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb)
  {
         tcp_verify_retransmit_hint(tp, skb);
  
+       tcp_sum_lost(tp, skb);
         if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
                 tp->lost_out += tcp_skb_pcount(skb);
                 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
@@ -1094,6 +1114,7 @@ struct tcp_sacktag_state {
          */
         struct skb_mstamp first_sackt;
         struct skb_mstamp last_sackt;
+       struct rate_sample *rate;
         int     flag;
  };
  
@@ -1261,6 +1282,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
         tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
                         start_seq, end_seq, dup_sack, pcount,
                         &skb->skb_mstamp);
+       tcp_rate_skb_delivered(sk, skb, state->rate);
  
         if (skb == tp->lost_skb_hint)
                 tp->lost_cnt_hint += pcount;
@@ -1311,6 +1333,9 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
                 tcp_advance_highest_sack(sk, skb);
  
         tcp_skb_collapse_tstamp(prev, skb);
+       if (unlikely(TCP_SKB_CB(prev)->tx.delivered_mstamp.v64))
+               TCP_SKB_CB(prev)->tx.delivered_mstamp.v64 = 0;
+
         tcp_unlink_write_queue(skb, sk);
         sk_wmem_free_skb(sk, skb);
  
@@ -1540,6 +1565,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
                                                 dup_sack,
                                                 tcp_skb_pcount(skb),
                                                 &skb->skb_mstamp);
+                       tcp_rate_skb_delivered(sk, skb, state->rate);
  
                         if (!before(TCP_SKB_CB(skb)->seq,
                                     tcp_highest_sack_seq(tp)))
@@ -1622,8 +1648,10 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
  
         found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire,
                                          num_sacks, prior_snd_una);
-       if (found_dup_sack)
+       if (found_dup_sack) {
                 state->flag |= FLAG_DSACKING_ACK;
+               tp->delivered++; /* A spurious retransmission is delivered */
+       }
  
         /* Eliminate too old ACKs, but take into
          * account more or less fresh ones, they can
@@ -1890,6 +1918,7 @@ void tcp_enter_loss(struct sock *sk)
         struct sk_buff *skb;
         bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
         bool is_reneg;                  /* is receiver reneging on SACKs? */
+       bool mark_lost;
  
         /* Reduce ssthresh if it has not yet been made inside this window. */
         if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
@@ -1923,8 +1952,12 @@ void tcp_enter_loss(struct sock *sk)
                 if (skb == tcp_send_head(sk))
                         break;
  
+               mark_lost = (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) ||
+                            is_reneg);
+               if (mark_lost)
+                       tcp_sum_lost(tp, skb);
                 TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED;
-               if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || is_reneg) {
+               if (mark_lost) {
                         TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
                         TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
                         tp->lost_out += tcp_skb_pcount(skb);
@@ -2329,10 +2362,9 @@ static void DBGUNDO(struct sock *sk, const char *msg)
         }
  #if IS_ENABLED(CONFIG_IPV6)
         else if (sk->sk_family == AF_INET6) {
-               struct ipv6_pinfo *np = inet6_sk(sk);
                 pr_debug("Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n",
                          msg,
-                        &np->daddr, ntohs(inet->inet_dport),
+                        &sk->sk_v6_daddr, ntohs(inet->inet_dport),
                          tp->snd_cwnd, tcp_left_out(tp),
                          tp->snd_ssthresh, tp->prior_ssthresh,
                          tp->packets_out);
@@ -2503,6 +2535,9 @@ static inline void tcp_end_cwnd_reduction(struct sock *sk)
  {
         struct tcp_sock *tp = tcp_sk(sk);
  
+       if (inet_csk(sk)->icsk_ca_ops->cong_control)
+               return;
+
         /* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */
         if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR ||
             (tp->undo_marker && tp->snd_ssthresh < TCP_INFINITE_SSTHRESH)) {
@@ -2879,67 +2914,13 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
         *rexmit = REXMIT_LOST;
  }
  
-/* Kathleen Nichols' algorithm for tracking the minimum value of
- * a data stream over some fixed time interval. (E.g., the minimum
- * RTT over the past five minutes.) It uses constant space and constant
- * time per update yet almost always delivers the same minimum as an
- * implementation that has to keep all the data in the window.
- *
- * The algorithm keeps track of the best, 2nd best & 3rd best min
- * values, maintaining an invariant that the measurement time of the
- * n'th best >= n-1'th best. It also makes sure that the three values
- * are widely separated in the time window since that bounds the worse
- * case error when that data is monotonically increasing over the window.
- *
- * Upon getting a new min, we can forget everything earlier because it
- * has no value - the new min is <= everything else in the window by
- * definition and it's the most recent. So we restart fresh on every new min
- * and overwrites 2nd & 3rd choices. The same property holds for 2nd & 3rd
- * best.
- */
  static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us)
  {
-       const u32 now = tcp_time_stamp, wlen = sysctl_tcp_min_rtt_wlen * HZ;
-       struct rtt_meas *m = tcp_sk(sk)->rtt_min;
-       struct rtt_meas rttm = {
-               .rtt = likely(rtt_us) ? rtt_us : jiffies_to_usecs(1),
-               .ts = now,
-       };
-       u32 elapsed;
-
-       /* Check if the new measurement updates the 1st, 2nd, or 3rd choices */
-       if (unlikely(rttm.rtt <= m[0].rtt))
-               m[0] = m[1] = m[2] = rttm;
-       else if (rttm.rtt <= m[1].rtt)
-               m[1] = m[2] = rttm;
-       else if (rttm.rtt <= m[2].rtt)
-               m[2] = rttm;
-
-       elapsed = now - m[0].ts;
-       if (unlikely(elapsed > wlen)) {
-               /* Passed entire window without a new min so make 2nd choice
-                * the new min & 3rd choice the new 2nd. So forth and so on.
-                */
-               m[0] = m[1];
-               m[1] = m[2];
-               m[2] = rttm;
-               if (now - m[0].ts > wlen) {
-                       m[0] = m[1];
-                       m[1] = rttm;
-                       if (now - m[0].ts > wlen)
-                               m[0] = rttm;
-               }
-       } else if (m[1].ts == m[0].ts && elapsed > wlen / 4) {
-               /* Passed a quarter of the window without a new min so
-                * take 2nd choice from the 2nd quarter of the window.
-                */
-               m[2] = m[1] = rttm;
-       } else if (m[2].ts == m[1].ts && elapsed > wlen / 2) {
-               /* Passed half the window without a new min so take the 3rd
-                * choice from the last half of the window.
-                */
-               m[2] = rttm;
-       }
+       struct tcp_sock *tp = tcp_sk(sk);
+       u32 wlen = sysctl_tcp_min_rtt_wlen * HZ;
+
+       minmax_running_min(&tp->rtt_min, wlen, tcp_time_stamp,
+                          rtt_us ? : jiffies_to_usecs(1));
  }
  
  static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
@@ -3102,10 +3083,11 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
   */
  static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
                                u32 prior_snd_una, int *acked,
-                              struct tcp_sacktag_state *sack)
+                              struct tcp_sacktag_state *sack,
+                              struct skb_mstamp *now)
  {
         const struct inet_connection_sock *icsk = inet_csk(sk);
-       struct skb_mstamp first_ackt, last_ackt, now;
+       struct skb_mstamp first_ackt, last_ackt;
         struct tcp_sock *tp = tcp_sk(sk);
         u32 prior_sacked = tp->sacked_out;
         u32 reord = tp->packets_out;
@@ -3137,7 +3119,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
                         acked_pcount = tcp_tso_acked(sk, skb);
                         if (!acked_pcount)
                                 break;
-
                         fully_acked = false;
                 } else {
                         /* Speedup tcp_unlink_write_queue() and next loop */
@@ -3173,6 +3154,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
  
                 tp->packets_out -= acked_pcount;
                 pkts_acked += acked_pcount;
+               tcp_rate_skb_delivered(sk, skb, sack->rate);
  
                 /* Initial outgoing SYN's get put onto the write_queue
                  * just like anything else we transmit.  It is not
@@ -3205,16 +3187,15 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
         if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
                 flag |= FLAG_SACK_RENEGING;
  
-       skb_mstamp_get(&now);
         if (likely(first_ackt.v64) && !(flag & FLAG_RETRANS_DATA_ACKED)) {
-               seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt);
-               ca_rtt_us = skb_mstamp_us_delta(&now, &last_ackt);
+               seq_rtt_us = skb_mstamp_us_delta(now, &first_ackt);
+               ca_rtt_us = skb_mstamp_us_delta(now, &last_ackt);
         }
         if (sack->first_sackt.v64) {
-               sack_rtt_us = skb_mstamp_us_delta(&now, &sack->first_sackt);
-               ca_rtt_us = skb_mstamp_us_delta(&now, &sack->last_sackt);
+               sack_rtt_us = skb_mstamp_us_delta(now, &sack->first_sackt);
+               ca_rtt_us = skb_mstamp_us_delta(now, &sack->last_sackt);
         }
-
+       sack->rate->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet, or -1 */
         rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us,
                                         ca_rtt_us);
  
@@ -3242,7 +3223,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
                 tp->fackets_out -= min(pkts_acked, tp->fackets_out);
  
         } else if (skb && rtt_update && sack_rtt_us >= 0 &&
-                  sack_rtt_us > skb_mstamp_us_delta(&now, &skb->skb_mstamp)) {
+                  sack_rtt_us > skb_mstamp_us_delta(now, &skb->skb_mstamp)) {
                 /* Do not re-arm RTO if the sack RTT is measured from data sent
                  * after when the head was last (re)transmitted. Otherwise the
                  * timeout may continue to extend in loss recovery.
@@ -3333,8 +3314,15 @@ static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
   * information. All transmission or retransmission are delayed afterwards.
   */
  static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked,
-                            int flag)
+                            int flag, const struct rate_sample *rs)
  {
+       const struct inet_connection_sock *icsk = inet_csk(sk);
+
+       if (icsk->icsk_ca_ops->cong_control) {
+               icsk->icsk_ca_ops->cong_control(sk, rs);
+               return;
+       }
+
         if (tcp_in_cwnd_reduction(sk)) {
                 /* Reduce cwnd if state mandates */
                 tcp_cwnd_reduction(sk, acked_sacked, flag);
@@ -3579,17 +3567,21 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
         struct inet_connection_sock *icsk = inet_csk(sk);
         struct tcp_sock *tp = tcp_sk(sk);
         struct tcp_sacktag_state sack_state;
+       struct rate_sample rs = { .prior_delivered = 0 };
         u32 prior_snd_una = tp->snd_una;
         u32 ack_seq = TCP_SKB_CB(skb)->seq;
         u32 ack = TCP_SKB_CB(skb)->ack_seq;
         bool is_dupack = false;
         u32 prior_fackets;
         int prior_packets = tp->packets_out;
-       u32 prior_delivered = tp->delivered;
+       u32 delivered = tp->delivered;
+       u32 lost = tp->lost;
         int acked = 0; /* Number of packets newly acked */
         int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */
+       struct skb_mstamp now;
  
         sack_state.first_sackt.v64 = 0;
+       sack_state.rate = &rs;
  
         /* We very likely will need to access write queue head. */
         prefetchw(sk->sk_write_queue.next);
@@ -3612,6 +3604,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
         if (after(ack, tp->snd_nxt))
                 goto invalid_ack;
  
+       skb_mstamp_get(&now);
+
         if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
                 tcp_rearm_rto(sk);
@@ -3622,6 +3616,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
         }
  
         prior_fackets = tp->fackets_out;
+       rs.prior_in_flight = tcp_packets_in_flight(tp);
  
         /* ts_recent update must be made after we are sure that the packet
          * is in window.
@@ -3677,7 +3672,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
  
         /* See if we can take anything off of the retransmit queue. */
         flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked,
-                                   &sack_state);
+                                   &sack_state, &now);
  
         if (tcp_ack_is_dubious(sk, flag)) {
                 is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
@@ -3694,7 +3689,10 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
  
         if (icsk->icsk_pending == ICSK_TIME_RETRANS)
                 tcp_schedule_loss_probe(sk);
-       tcp_cong_control(sk, ack, tp->delivered - prior_delivered, flag);
+       delivered = tp->delivered - delivered;  /* freshly ACKed or SACKed */
+       lost = tp->lost - lost;                 /* freshly marked lost */
+       tcp_rate_gen(sk, delivered, lost, &now, &rs);
+       tcp_cong_control(sk, ack, delivered, flag, &rs);
         tcp_xmit_recovery(sk, rexmit);
         return 1;
  
@@ -5951,7 +5949,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
                  * so release it.
                  */
                 if (req) {
-                       tp->total_retrans = req->num_retrans;
+                       inet_csk(sk)->icsk_retransmits = 0;
                         reqsk_fastopen_remove(sk, req, false);
                 } else {
                         /* Make sure socket is routed, for correct metrics. */
@@ -5993,7 +5991,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
                 } else
                         tcp_init_metrics(sk);
  
-               tcp_update_pacing_rate(sk);
+               if (!inet_csk(sk)->icsk_ca_ops->cong_control)
+                       tcp_update_pacing_rate(sk);
  
                 /* Prevent spurious tcp_cwnd_restart() on first data packet */
                 tp->lsndtime = tcp_time_stamp;
@@ -6326,6 +6325,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
  
         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
         tcp_openreq_init(req, &tmp_opt, skb, sk);
+       inet_rsk(req)->no_srccheck = inet_sk(sk)->transparent;
  
         /* Note: tcp_v6_init_req() might override ir_iif for link locals */
         inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb);