tcp: mitigate ACK loops for connections as tcp_sock
[cascardo/linux.git] / net / ipv4 / tcp_input.c
index 075ab4d..8fdd27b 100644 (file)
@@ -100,6 +100,7 @@ int sysctl_tcp_thin_dupack __read_mostly;
 
 int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
 int sysctl_tcp_early_retrans __read_mostly = 3;
+int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
 
 #define FLAG_DATA              0x01 /* Incoming frame contained data.          */
 #define FLAG_WIN_UPDATE                0x02 /* Incoming ACK was a window update.       */
@@ -3183,8 +3184,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 
                tp->fackets_out -= min(pkts_acked, tp->fackets_out);
 
-               if (ca_ops->pkts_acked)
-                       ca_ops->pkts_acked(sk, pkts_acked, ca_seq_rtt_us);
+               if (ca_ops->pkts_acked) {
+                       long rtt_us = min_t(ulong, ca_seq_rtt_us, sack_rtt_us);
+                       ca_ops->pkts_acked(sk, pkts_acked, rtt_us);
+               }
 
        } else if (skb && rtt_update && sack_rtt_us >= 0 &&
                   sack_rtt_us > skb_mstamp_us_delta(&now, &skb->skb_mstamp)) {
@@ -3319,13 +3322,22 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32
 }
 
 /* RFC 5961 7 [ACK Throttling] */
-static void tcp_send_challenge_ack(struct sock *sk)
+static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb)
 {
        /* unprotected vars, we dont care of overwrites */
        static u32 challenge_timestamp;
        static unsigned int challenge_count;
-       u32 now = jiffies / HZ;
+       struct tcp_sock *tp = tcp_sk(sk);
+       u32 now;
+
+       /* First check our per-socket dupack rate limit. */
+       if (tcp_oow_rate_limited(sock_net(sk), skb,
+                                LINUX_MIB_TCPACKSKIPPEDCHALLENGE,
+                                &tp->last_oow_ack_time))
+               return;
 
+       /* Then check the host-wide RFC 5961 rate limit. */
+       now = jiffies / HZ;
        if (now != challenge_timestamp) {
                challenge_timestamp = now;
                challenge_count = 0;
@@ -3358,34 +3370,34 @@ static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
 }
 
 /* This routine deals with acks during a TLP episode.
+ * We mark the end of a TLP episode on receiving TLP dupack or when
+ * ack is after tlp_high_seq.
  * Ref: loss detection algorithm in draft-dukkipati-tcpm-tcp-loss-probe.
  */
 static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
 {
        struct tcp_sock *tp = tcp_sk(sk);
-       bool is_tlp_dupack = (ack == tp->tlp_high_seq) &&
-                            !(flag & (FLAG_SND_UNA_ADVANCED |
-                                      FLAG_NOT_DUP | FLAG_DATA_SACKED));
 
-       /* Mark the end of TLP episode on receiving TLP dupack or when
-        * ack is after tlp_high_seq.
-        */
-       if (is_tlp_dupack) {
-               tp->tlp_high_seq = 0;
+       if (before(ack, tp->tlp_high_seq))
                return;
-       }
 
-       if (after(ack, tp->tlp_high_seq)) {
+       if (flag & FLAG_DSACKING_ACK) {
+               /* This DSACK means original and TLP probe arrived; no loss */
+               tp->tlp_high_seq = 0;
+       } else if (after(ack, tp->tlp_high_seq)) {
+               /* ACK advances: there was a loss, so reduce cwnd. Reset
+                * tlp_high_seq in tcp_init_cwnd_reduction()
+                */
+               tcp_init_cwnd_reduction(sk);
+               tcp_set_ca_state(sk, TCP_CA_CWR);
+               tcp_end_cwnd_reduction(sk);
+               tcp_try_keep_open(sk);
+               NET_INC_STATS_BH(sock_net(sk),
+                                LINUX_MIB_TCPLOSSPROBERECOVERY);
+       } else if (!(flag & (FLAG_SND_UNA_ADVANCED |
+                            FLAG_NOT_DUP | FLAG_DATA_SACKED))) {
+               /* Pure dupack: original and TLP probe arrived; no loss */
                tp->tlp_high_seq = 0;
-               /* Don't reduce cwnd if DSACK arrives for TLP retrans. */
-               if (!(flag & FLAG_DSACKING_ACK)) {
-                       tcp_init_cwnd_reduction(sk);
-                       tcp_set_ca_state(sk, TCP_CA_CWR);
-                       tcp_end_cwnd_reduction(sk);
-                       tcp_try_keep_open(sk);
-                       NET_INC_STATS_BH(sock_net(sk),
-                                        LINUX_MIB_TCPLOSSPROBERECOVERY);
-               }
        }
 }
 
@@ -3421,7 +3433,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
        if (before(ack, prior_snd_una)) {
                /* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */
                if (before(ack, prior_snd_una - tp->max_window)) {
-                       tcp_send_challenge_ack(sk);
+                       tcp_send_challenge_ack(sk, skb);
                        return -1;
                }
                goto old_ack;
@@ -4990,7 +5002,10 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
            tcp_paws_discard(sk, skb)) {
                if (!th->rst) {
                        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
-                       tcp_send_dupack(sk, skb);
+                       if (!tcp_oow_rate_limited(sock_net(sk), skb,
+                                                 LINUX_MIB_TCPACKSKIPPEDPAWS,
+                                                 &tp->last_oow_ack_time))
+                               tcp_send_dupack(sk, skb);
                        goto discard;
                }
                /* Reset is accepted even if it did not pass PAWS. */
@@ -5007,7 +5022,10 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
                if (!th->rst) {
                        if (th->syn)
                                goto syn_challenge;
-                       tcp_send_dupack(sk, skb);
+                       if (!tcp_oow_rate_limited(sock_net(sk), skb,
+                                                 LINUX_MIB_TCPACKSKIPPEDSEQ,
+                                                 &tp->last_oow_ack_time))
+                               tcp_send_dupack(sk, skb);
                }
                goto discard;
        }
@@ -5023,7 +5041,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
                if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt)
                        tcp_reset(sk);
                else
-                       tcp_send_challenge_ack(sk);
+                       tcp_send_challenge_ack(sk, skb);
                goto discard;
        }
 
@@ -5037,7 +5055,7 @@ syn_challenge:
                if (syn_inerr)
                        TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE);
-               tcp_send_challenge_ack(sk);
+               tcp_send_challenge_ack(sk, skb);
                goto discard;
        }
 
@@ -5870,10 +5888,9 @@ static inline void pr_drop_req(struct request_sock *req, __u16 port, int family)
  * TCP ECN negotiation.
  *
  * Exception: tcp_ca wants ECN. This is required for DCTCP
- * congestion control; it requires setting ECT on all packets,
- * including SYN. We inverse the test in this case: If our
- * local socket wants ECN, but peer only set ece/cwr (but not
- * ECT in IP header) its probably a non-DCTCP aware sender.
+ * congestion control: Linux DCTCP asserts ECT on all packets,
+ * including SYN, which is the optimal solution; however,
+ * others, such as FreeBSD, do not.
  */
 static void tcp_ecn_create_request(struct request_sock *req,
                                   const struct sk_buff *skb,
@@ -5883,18 +5900,15 @@ static void tcp_ecn_create_request(struct request_sock *req,
        const struct tcphdr *th = tcp_hdr(skb);
        const struct net *net = sock_net(listen_sk);
        bool th_ecn = th->ece && th->cwr;
-       bool ect, need_ecn, ecn_ok;
+       bool ect, ecn_ok;
 
        if (!th_ecn)
                return;
 
        ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield);
-       need_ecn = tcp_ca_needs_ecn(listen_sk);
        ecn_ok = net->ipv4.sysctl_tcp_ecn || dst_feature(dst, RTAX_FEATURE_ECN);
 
-       if (!ect && !need_ecn && ecn_ok)
-               inet_rsk(req)->ecn_ok = 1;
-       else if (ect && need_ecn)
+       if ((!ect && ecn_ok) || tcp_ca_needs_ecn(listen_sk))
                inet_rsk(req)->ecn_ok = 1;
 }