/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol (TCP).
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/netdma.h>
#include <net/secure_seq.h>
#include <net/tcp_memcontrol.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

#include <linux/crypto.h>
#include <linux/scatterlist.h>

int sysctl_tcp_tw_reuse __read_mostly;
int sysctl_tcp_low_latency __read_mostly;
EXPORT_SYMBOL(sysctl_tcp_low_latency);

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

static inline __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
{
	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
					  ip_hdr(skb)->saddr,
					  tcp_hdr(skb)->dest,
					  tcp_hdr(skb)->source);
}

int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only the timestamp cache is
	   held not per host, but per port pair, and the TW bucket is used as
	   state holder.

	   If the TW bucket has already been destroyed we fall back to VJ's
	   scheme and use the initial timestamp retrieved from the peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (twp == NULL || (sysctl_tcp_tw_reuse &&
			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
		if (tp->write_seq == 0)
			tp->write_seq = 1;
		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		sock_hold(sktw);
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);
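
/* Usage note (illustrative, not part of the original file): the reuse
 * path above is gated by the net.ipv4.tcp_tw_reuse sysctl exported at
 * the top of this file, e.g.:
 *
 *	# echo 1 > /proc/sys/net/ipv4/tcp_tw_reuse
 *
 * With it set, an outgoing connect() that collides with a TIME_WAIT
 * bucket may reuse the port pair once the timestamp check above passes.
 */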

/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	struct ip_options_rcu *inet_opt;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     sock_owned_by_user(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			      IPPROTO_TCP,
			      orig_sport, orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	inet->inet_rcv_saddr = inet->inet_saddr;

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			tp->write_seq	   = 0;
	}

	if (tcp_death_row.sysctl_tw_recycle &&
	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
		tcp_fetch_timewait_stamp(sk, &rt->dst);

	inet->inet_dport = usin->sin_port;
	inet->inet_daddr = daddr;

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the socket
	 * lock, select a source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(&tcp_death_row, sk);
	if (err)
		goto failure;

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket. */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);

	if (!tp->write_seq && likely(!tp->repair))
		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
							   inet->inet_daddr,
							   inet->inet_sport,
							   usin->sin_port);

	inet->inet_id = tp->write_seq ^ jiffies;

	err = tcp_connect(sk);

	rt = NULL;
	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);
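
/* For reference, a minimal userspace sequence that ends up in
 * tcp_v4_connect() via inet_stream_connect() (illustrative sketch,
 * not part of the original file):
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),
 *	};
 *	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *	connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 */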

/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC 1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
static void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct dst_entry *dst;
	struct inet_sock *inet = inet_sk(sk);
	u32 mtu = tcp_sk(sk)->mtu_info;

	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to be wrong... Remember soft error
	 * in case this connection will not be able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}

static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 */

void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
	struct inet_connection_sock *icsk;
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(icmp_skb)->type;
	const int code = icmp_hdr(icmp_skb)->code;
	struct sock *sk;
	struct sk_buff *skb;
	struct request_sock *req;
	__u32 seq;
	__u32 remaining;
	int err;
	struct net *net = dev_net(icmp_skb->dev);

	if (icmp_skb->len < (iph->ihl << 2) + 8) {
		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
		return;
	}

	sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
			iph->saddr, th->source, inet_iif(icmp_skb));
	if (!sk) {
		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
		return;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC 1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
		goto out;
	}

	icsk = inet_csk(sk);
	tp = tcp_sk(sk);
	req = tp->fastopen_rsk;
	seq = ntohl(th->seq);
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, tp->snd_una, tp->snd_nxt) &&
	    (req == NULL || seq != tcp_rsk(req)->snt_isn)) {
		/* For a Fast Open socket, allow seq to be snt_isn. */
		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		do_redirect(icmp_skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC 1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs sent out by Linux are always <576 bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			tp->mtu_info = info;
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if icmp_skb allows revert of backoff
		 * (see draft-zimmermann-tcp-lcd) */
		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
			break;
		if (seq != tp->snd_una || !icsk->icsk_retransmits ||
		    !icsk->icsk_backoff)
			break;

		/* XXX (TFO) - revisit the following logic for TFO */

		if (sock_owned_by_user(sk))
			break;

		icsk->icsk_backoff--;
		inet_csk(sk)->icsk_rto = (tp->srtt_us ? __tcp_set_rto(tp) :
			TCP_TIMEOUT_INIT) << icsk->icsk_backoff;
		tcp_bound_rto(sk);

		skb = tcp_write_queue_head(sk);
		BUG_ON(!skb);

		remaining = icsk->icsk_rto - min(icsk->icsk_rto,
				tcp_time_stamp - TCP_SKB_CB(skb)->when);

		if (remaining) {
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
						  remaining, TCP_RTO_MAX);
		} else {
			/* RTO revert clocked out retransmission.
			 * Will retransmit now */
			tcp_retransmit_timer(sk);
		}

		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	/* XXX (TFO) - if it's a TFO socket and has been accepted, rather
	 * than following the TCP_SYN_RECV case and closing the socket,
	 * we ignore the ICMP error and keep trying like a fully established
	 * socket. Is this the right thing to do?
	 */
	if (req && req->sk == NULL)
		goto out;

	switch (sk->sk_state) {
		struct request_sock *req, **prev;
	case TCP_LISTEN:
		if (sock_owned_by_user(sk))
			goto out;

		req = inet_csk_search_req(sk, &prev, th->dest,
					  iph->daddr, iph->saddr);
		if (!req)
			goto out;

		/* ICMPs are not backlogged, hence we cannot get
		   an established socket here.
		 */
		WARN_ON(req->sk);

		if (seq != tcp_rsk(req)->snt_isn) {
			NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
			goto out;
		}

		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(sk, req, prev);
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
		goto out;

	case TCP_SYN_SENT:
	case TCP_SYN_RECV:  /* Cannot happen.
			       It can f.e. if SYNs crossed,
			       or Fast Open.
			     */
		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even these two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}
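
/* The counters bumped above (ICMP_MIB_*, LINUX_MIB_*) are visible from
 * userspace; a quick way to watch this path while debugging
 * (illustrative shell commands, not part of the original file):
 *
 *	grep -E '^(Tcp|Icmp):' /proc/net/snmp
 *	grep TCPMinTTLDrop /proc/net/netstat
 *	netstat -s
 */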

void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
		skb->csum_start = skb_transport_header(skb) - skb->head;
		skb->csum_offset = offsetof(struct tcphdr, check);
	} else {
		th->check = tcp_v4_check(skb->len, saddr, daddr,
					 csum_partial(th,
						      th->doff << 2,
						      skb->csum));
	}
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);

/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just a duplicate segment or a bug in the other side's TCP.
 *		So we build the reply based only on parameters that
 *		arrived with the segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
	const __u8 *hash_location = NULL;
	unsigned char newhash[16];
	int genhash;
	struct sock *sk1 = NULL;
#endif
	struct net *net;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	if (skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

#ifdef CONFIG_TCP_MD5SIG
	hash_location = tcp_parse_md5sig_option(th);
	if (!sk && hash_location) {
		/*
		 * The active side is lost. Try to find the listening socket
		 * through the source port, and then find the md5 key through
		 * the listening socket. We do not lose security here:
		 * the incoming packet is checked against the md5 hash of the
		 * key we find; no RST is generated if the md5 hash doesn't
		 * match.
		 */
		sk1 = __inet_lookup_listener(dev_net(skb_dst(skb)->dev),
					     &tcp_hashinfo, ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), inet_iif(skb));
		/* don't send rst if it can't find key */
		if (!sk1)
			return;
		rcu_read_lock();
		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
					&ip_hdr(skb)->saddr, AF_INET);
		if (!key)
			goto release_sk1;

		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, NULL, skb);
		if (genhash || memcmp(hash_location, newhash, 16) != 0)
			goto release_sk1;
	} else {
		key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
					     &ip_hdr(skb)->saddr,
					     AF_INET) : NULL;
	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
	/* When the socket is gone, all binding information is lost.
	 * Routing might fail in this case. No choice here: if we choose to
	 * force the input interface, we will misroute in case of an
	 * asymmetric route.
	 */
	if (sk)
		arg.bound_dev_if = sk->sk_bound_dev_if;

	net = dev_net(skb_dst(skb)->dev);
	arg.tos = ip_hdr(skb)->tos;
	ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
			      ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
	TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);

#ifdef CONFIG_TCP_MD5SIG
release_sk1:
	if (sk1) {
		rcu_read_unlock();
		sock_put(sk1);
	}
#endif
}

/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
   outside of socket context, is certainly ugly. What can I do?
 */

static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct ip_reply_arg arg;
	struct net *net = dev_net(skb_dst(skb)->dev);

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
			      ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
}

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_time_stamp + tcptw->tw_ts_offset,
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if,
			tcp_twsk_md5_key(tcptw),
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos
			);

	inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
			tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
			tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
			tcp_time_stamp,
			req->ts_recent,
			0,
			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
					  AF_INET),
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos);
}

/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
			      struct request_sock *req,
			      u16 queue_mapping)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, NULL);

	if (skb) {
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

		skb_set_queue_mapping(skb, queue_mapping);
		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    ireq->ir_rmt_addr,
					    ireq->opt);
		err = net_xmit_eval(err);
		if (!tcp_rsk(req)->snt_synack && !err)
			tcp_rsk(req)->snt_synack = tcp_time_stamp;
	}

	return err;
}

static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req)
{
	int res = tcp_v4_send_synack(sk, NULL, req, 0);

	if (!res) {
		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
	}
	return res;
}

/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(inet_rsk(req)->opt);
}

/*
 * Return true if a syncookie should be sent
 */
bool tcp_syn_flood_action(struct sock *sk,
			  const struct sk_buff *skb,
			  const char *proto)
{
	const char *msg = "Dropping request";
	bool want_cookie = false;
	struct listen_sock *lopt;

#ifdef CONFIG_SYN_COOKIES
	if (sysctl_tcp_syncookies) {
		msg = "Sending cookies";
		want_cookie = true;
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
	} else
#endif
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);

	lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
	if (!lopt->synflood_warned && sysctl_tcp_syncookies != 2) {
		lopt->synflood_warned = 1;
		pr_info("%s: Possible SYN flooding on port %d. %s.  Check SNMP counters.\n",
			proto, ntohs(tcp_hdr(skb)->dest), msg);
	}
	return want_cookie;
}
EXPORT_SYMBOL(tcp_syn_flood_action);
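
/* Administration note (illustrative, not part of the original file):
 * the behaviour tested above is driven by net.ipv4.tcp_syncookies:
 *
 *	# echo 0 > /proc/sys/net/ipv4/tcp_syncookies	(never send cookies)
 *	# echo 1 > /proc/sys/net/ipv4/tcp_syncookies	(cookies when queue is full)
 *	# echo 2 > /proc/sys/net/ipv4/tcp_syncookies	(always; suppresses the
 *							 synflood warning above)
 */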

/*
 * Save and compile IPv4 options into the request_sock if needed.
 */
static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb)
{
	const struct ip_options *opt = &(IPCB(skb)->opt);
	struct ip_options_rcu *dopt = NULL;

	if (opt && opt->optlen) {
		int opt_size = sizeof(*dopt) + opt->optlen;

		dopt = kmalloc(opt_size, GFP_ATOMIC);
		if (dopt) {
			if (ip_options_echo(&dopt->opt, skb)) {
				kfree(dopt);
				dopt = NULL;
			}
		}
	}
	return dopt;
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC 2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

/* Find the Key structure for an address.  */
struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
					 const union tcp_md5_addr *addr,
					 int family)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       sock_owned_by_user(sk) ||
				       lockdep_is_held(&sk->sk_lock.slock));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
		if (key->family != family)
			continue;
		if (!memcmp(&key->addr, addr, size))
			return key;
	}
	return NULL;
}
EXPORT_SYMBOL(tcp_md5_do_lookup);

struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
					 struct sock *addr_sk)
{
	union tcp_md5_addr *addr;

	addr = (union tcp_md5_addr *)&inet_sk(addr_sk)->inet_daddr;
	return tcp_md5_do_lookup(sk, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
						      struct request_sock *req)
{
	union tcp_md5_addr *addr;

	addr = (union tcp_md5_addr *)&inet_rsk(req)->ir_rmt_addr;
	return tcp_md5_do_lookup(sk, addr, AF_INET);
}

/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup(sk, addr, family);
	if (key) {
		/* Pre-existing entry - just update that one. */
		memcpy(key->key, newkey, newkeylen);
		key->keylen = newkeylen;
		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   sock_owned_by_user(sk));
	if (!md5sig) {
		md5sig = kmalloc(sizeof(*md5sig), gfp);
		if (!md5sig)
			return -ENOMEM;

		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
		INIT_HLIST_HEAD(&md5sig->head);
		rcu_assign_pointer(tp->md5sig_info, md5sig);
	}

	key = sock_kmalloc(sk, sizeof(*key), gfp);
	if (!key)
		return -ENOMEM;
	if (!tcp_alloc_md5sig_pool()) {
		sock_kfree_s(sk, key, sizeof(*key));
		return -ENOMEM;
	}

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	memcpy(&key->addr, addr,
	       (family == AF_INET6) ? sizeof(struct in6_addr) :
				      sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);

int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
{
	struct tcp_md5sig_key *key;

	key = tcp_md5_do_lookup(sk, addr, family);
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);

static void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del_rcu(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree_rcu(key, rcu);
	}
}

static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
				 int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_user(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	if (!cmd.tcpm_key || !cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
				      AF_INET);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
			      AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
			      GFP_KERNEL);
}
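
/* A minimal userspace sketch of keying a socket for RFC 2385, which is
 * what tcp_v4_parse_md5_keys() above consumes (illustrative, not part
 * of the original file; 'peer' is assumed to be a struct sockaddr_in
 * for the remote address, already filled in):
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *
 *	memcpy(&md5.tcpm_addr, &peer, sizeof(peer));
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * Passing tcpm_keylen == 0 instead deletes the key for that address,
 * via the tcp_md5_do_del() path above.
 */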

static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
					__be32 daddr, __be32 saddr, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;

	bp = &hp->md5_blk.ip4;

	/*
	 * 1. the TCP pseudo-header (in the order: source IP address,
	 * destination IP address, zero-padded protocol number, and
	 * segment length)
	 */
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	sg_init_one(&sg, bp, sizeof(*bp));
	return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
}

static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp;
	struct hash_desc *desc;

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	desc = &hp->md5_desc;

	if (crypto_hash_init(desc))
		goto clear_hash;
	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_header(hp, th))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	if (crypto_hash_final(desc, md5_hash))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}

int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
			const struct sock *sk, const struct request_sock *req,
			const struct sk_buff *skb)
{
	struct tcp_md5sig_pool *hp;
	struct hash_desc *desc;
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;

	if (sk) {
		saddr = inet_sk(sk)->inet_saddr;
		daddr = inet_sk(sk)->inet_daddr;
	} else if (req) {
		saddr = inet_rsk(req)->ir_loc_addr;
		daddr = inet_rsk(req)->ir_rmt_addr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);
		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	desc = &hp->md5_desc;

	if (crypto_hash_init(desc))
		goto clear_hash;

	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
		goto clear_hash;
	if (tcp_md5_hash_header(hp, th))
		goto clear_hash;
	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	if (crypto_hash_final(desc, md5_hash))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);

static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
{
	/*
	 * This gets called for each TCP segment that arrives
	 * so we want to be efficient.
	 * We have 3 drop cases:
	 * o No MD5 hash and one expected.
	 * o MD5 hash and we're not expecting one.
	 * o MD5 hash and it's wrong.
	 */
	const __u8 *hash_location = NULL;
	struct tcp_md5sig_key *hash_expected;
	const struct iphdr *iph = ip_hdr(skb);
	const struct tcphdr *th = tcp_hdr(skb);
	int genhash;
	unsigned char newhash[16];

	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
					  AF_INET);
	hash_location = tcp_parse_md5sig_option(th);

	/* We've parsed the options - do we have a hash? */
	if (!hash_expected && !hash_location)
		return false;

	if (hash_expected && !hash_location) {
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
		return true;
	}

	if (!hash_expected && hash_location) {
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
		return true;
	}

	/* Okay, so this is hash_expected and hash_location -
	 * so we need to calculate the checksum.
	 */
	genhash = tcp_v4_md5_hash_skb(newhash,
				      hash_expected,
				      NULL, NULL, skb);

	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
				     &iph->saddr, ntohs(th->source),
				     &iph->daddr, ntohs(th->dest),
				     genhash ? " tcp_v4_calc_md5_hash failed"
				     : "");
		return true;
	}
	return false;
}

#endif

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_v4_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};

#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.md5_lookup	=	tcp_v4_reqsk_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
};
#endif

static bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb,
			       struct request_sock *req,
			       struct tcp_fastopen_cookie *foc,
			       struct tcp_fastopen_cookie *valid_foc)
{
	bool skip_cookie = false;
	struct fastopen_queue *fastopenq;

	if (likely(!fastopen_cookie_present(foc))) {
		/* See include/net/tcp.h for the meaning of these knobs */
		if ((sysctl_tcp_fastopen & TFO_SERVER_ALWAYS) ||
		    ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD) &&
		    (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1)))
			skip_cookie = true; /* no cookie to validate */
		else
			return false;
	}
	fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq;
	/* A FO option is present; bump the counter. */
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVE);

	/* Make sure the listener has enabled fastopen, and we don't
	 * exceed the max # of pending TFO requests allowed before trying
	 * to validate the cookie, in order to avoid burning CPU cycles
	 * unnecessarily.
	 *
	 * XXX (TFO) - The implication of checking the max_qlen before
	 * processing a cookie request is that clients can't differentiate
	 * between qlen overflow causing Fast Open to be disabled
	 * temporarily vs a server not supporting Fast Open at all.
	 */
	if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) == 0 ||
	    fastopenq == NULL || fastopenq->max_qlen == 0)
		return false;

	if (fastopenq->qlen >= fastopenq->max_qlen) {
		struct request_sock *req1;
		spin_lock(&fastopenq->lock);
		req1 = fastopenq->rskq_rst_head;
		if ((req1 == NULL) || time_after(req1->expires, jiffies)) {
			spin_unlock(&fastopenq->lock);
			NET_INC_STATS_BH(sock_net(sk),
			    LINUX_MIB_TCPFASTOPENLISTENOVERFLOW);
			/* Avoid bumping LINUX_MIB_TCPFASTOPENPASSIVEFAIL*/
			foc->len = -1;
			return false;
		}
		fastopenq->rskq_rst_head = req1->dl_next;
		fastopenq->qlen--;
		spin_unlock(&fastopenq->lock);
		reqsk_free(req1);
	}
	if (skip_cookie) {
		tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
		return true;
	}

	if (foc->len == TCP_FASTOPEN_COOKIE_SIZE) {
		if ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_CHKED) == 0) {
			tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr,
						ip_hdr(skb)->daddr, valid_foc);
			if ((valid_foc->len != TCP_FASTOPEN_COOKIE_SIZE) ||
			    memcmp(&foc->val[0], &valid_foc->val[0],
			    TCP_FASTOPEN_COOKIE_SIZE) != 0)
				return false;
			valid_foc->len = -1;
		}
		/* Acknowledge the data received from the peer. */
		tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
		return true;
	} else if (foc->len == 0) { /* Client requesting a cookie */
		tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr,
					ip_hdr(skb)->daddr, valid_foc);
		NET_INC_STATS_BH(sock_net(sk),
		    LINUX_MIB_TCPFASTOPENCOOKIEREQD);
	} else {
		/* Client sent a cookie with wrong size. Treat it
		 * the same as invalid and return a valid one.
		 */
		tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr,
					ip_hdr(skb)->daddr, valid_foc);
	}
	return false;
}
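
/* Server-side Fast Open is typically enabled like this (illustrative
 * sketch, not part of the original file). Bit 0x2 of the sysctl is
 * TFO_SERVER_ENABLE, checked above; the setsockopt value becomes
 * fastopenq->max_qlen:
 *
 *	# echo 2 > /proc/sys/net/ipv4/tcp_fastopen
 *
 *	int qlen = 5;
 *	setsockopt(listen_fd, IPPROTO_TCP, TCP_FASTOPEN, &qlen, sizeof(qlen));
 */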

static int tcp_v4_conn_req_fastopen(struct sock *sk,
				    struct sk_buff *skb,
				    struct sk_buff *skb_synack,
				    struct request_sock *req)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct sock *child;
	int err;

	req->num_retrans = 0;
	req->num_timeout = 0;
	req->sk = NULL;

	child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
	if (child == NULL) {
		NET_INC_STATS_BH(sock_net(sk),
				 LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
		kfree_skb(skb_synack);
		return -1;
	}
	err = ip_build_and_send_pkt(skb_synack, sk, ireq->ir_loc_addr,
				    ireq->ir_rmt_addr, ireq->opt);
	err = net_xmit_eval(err);
	if (!err)
		tcp_rsk(req)->snt_synack = tcp_time_stamp;
	/* XXX (TFO) - is it ok to ignore error and continue? */

	spin_lock(&queue->fastopenq->lock);
	queue->fastopenq->qlen++;
	spin_unlock(&queue->fastopenq->lock);

	/* Initialize the child socket. Have to fix some values to take
	 * into account the child is a Fast Open socket and is created
	 * only out of the bits carried in the SYN packet.
	 */
	tp = tcp_sk(child);

	tp->fastopen_rsk = req;
	/* Do a hold on the listener sk so that if the listener is being
	 * closed, the child that has been accepted can live on and still
	 * access listen_lock.
	 */
	sock_hold(sk);
	tcp_rsk(req)->listener = sk;

	/* RFC 1323: The window in SYN & SYN/ACK segments is never
	 * scaled. So correct it appropriately.
	 */
	tp->snd_wnd = ntohs(tcp_hdr(skb)->window);

	/* Activate the retrans timer so that SYNACK can be retransmitted.
	 * The request socket is not added to the SYN table of the parent
	 * because it's been added to the accept queue directly.
	 */
	inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS,
	    TCP_TIMEOUT_INIT, TCP_RTO_MAX);

	/* Add the child socket directly into the accept queue */
	inet_csk_reqsk_queue_add(sk, req, child);

	/* Now finish processing the fastopen child socket. */
	inet_csk(child)->icsk_af_ops->rebuild_header(child);
	tcp_init_congestion_control(child);
	tcp_mtup_init(child);
	tcp_init_metrics(child);
	tcp_init_buffer_space(child);

	/* Queue the data carried in the SYN packet. We need to first
	 * bump skb's refcnt because the caller will attempt to free it.
	 *
	 * XXX (TFO) - we honor a zero-payload TFO request for now.
	 * (Any reason not to?)
	 */
	if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq + 1) {
		/* Don't queue the skb if there is no payload in SYN.
		 * XXX (TFO) - How about SYN+FIN?
		 */
		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
	} else {
		skb = skb_get(skb);
		skb_dst_drop(skb);
		__skb_pull(skb, tcp_hdr(skb)->doff * 4);
		skb_set_owner_r(skb, child);
		__skb_queue_tail(&child->sk_receive_queue, skb);
		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
		tp->syn_data_acked = 1;
	}
	sk->sk_data_ready(sk, 0);
	bh_unlock_sock(child);
	sock_put(child);
	WARN_ON(req->sk == NULL);
	return 0;
}

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_options_received tmp_opt;
	struct request_sock *req;
	struct inet_request_sock *ireq;
	struct tcp_sock *tp = tcp_sk(sk);
	struct dst_entry *dst = NULL;
	__be32 saddr = ip_hdr(skb)->saddr;
	__be32 daddr = ip_hdr(skb)->daddr;
	__u32 isn = TCP_SKB_CB(skb)->when;
	bool want_cookie = false;
	struct flowi4 fl4;
	struct tcp_fastopen_cookie foc = { .len = -1 };
	struct tcp_fastopen_cookie valid_foc = { .len = -1 };
	struct sk_buff *skb_synack;
	int do_fastopen;

	/* Never answer SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	/* TW buckets are converted to open requests without
	 * limitations, they conserve resources and peer is
	 * evidently real one.
	 */
	if ((sysctl_tcp_syncookies == 2 ||
	     inet_csk_reqsk_queue_is_full(sk)) && !isn) {
		want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
		if (!want_cookie)
			goto drop;
	}

	/* Accept backlog is full. If we have already queued enough
	 * of warm entries in syn queue, drop request. It is better than
	 * clogging syn queue with openreqs with exponentially increasing
	 * timeout.
	 */
	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
		goto drop;
	}

	req = inet_reqsk_alloc(&tcp_request_sock_ops);
	if (!req)
		goto drop;

#ifdef CONFIG_TCP_MD5SIG
	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
#endif

	tcp_clear_options(&tmp_opt);
	tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
	tmp_opt.user_mss  = tp->rx_opt.user_mss;
	tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc);

	if (want_cookie && !tmp_opt.saw_tstamp)
		tcp_clear_options(&tmp_opt);

	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
	tcp_openreq_init(req, &tmp_opt, skb);

	ireq = inet_rsk(req);
	ireq->ir_loc_addr = daddr;
	ireq->ir_rmt_addr = saddr;
	ireq->no_srccheck = inet_sk(sk)->transparent;
	ireq->opt = tcp_v4_save_options(skb);

	if (security_inet_conn_request(sk, skb, req))
		goto drop_and_free;

	if (!want_cookie || tmp_opt.tstamp_ok)
		TCP_ECN_create_request(req, skb, sock_net(sk));

	if (want_cookie) {
		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
		req->cookie_ts = tmp_opt.tstamp_ok;
	} else if (!isn) {
		/* VJ's idea. We save the last timestamp seen
		 * from the destination in the peer table, when entering
		 * state TIME-WAIT, and check against it before
		 * accepting a new connection request.
		 *
		 * If "isn" is not zero, this request hit an alive
		 * timewait bucket, so that all the necessary checks
		 * are made in the function processing timewait state.
		 */
		if (tmp_opt.saw_tstamp &&
		    tcp_death_row.sysctl_tw_recycle &&
		    (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
		    fl4.daddr == saddr) {
			if (!tcp_peer_is_proven(req, dst, true)) {
				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
				goto drop_and_release;
			}
		}
		/* Kill the following clause, if you dislike this way. */
		else if (!sysctl_tcp_syncookies &&
			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
			  (sysctl_max_syn_backlog >> 2)) &&
			 !tcp_peer_is_proven(req, dst, false)) {
			/* Without syncookies the last quarter of the
			 * backlog is filled with destinations
			 * proven to be alive.
			 * It means that we continue to communicate
			 * with destinations already remembered
			 * at the moment of synflood.
			 */
			LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
				       &saddr, ntohs(tcp_hdr(skb)->source));
			goto drop_and_release;
		}

		isn = tcp_v4_init_sequence(skb);
	}
	tcp_rsk(req)->snt_isn = isn;

	if (dst == NULL) {
		dst = inet_csk_route_req(sk, &fl4, req);
		if (dst == NULL)
			goto drop_and_free;
	}
	do_fastopen = tcp_fastopen_check(sk, skb, req, &foc, &valid_foc);

	/* We don't call tcp_v4_send_synack() directly because we need
	 * to make sure a child socket can be created successfully before
	 * sending back synack!
	 *
	 * XXX (TFO) - Ideally one would simply call tcp_v4_send_synack()
	 * (or better yet, call tcp_send_synack() in the child context
	 * directly, but will have to fix bunch of other code first)
	 * after syn_recv_sock() except one will need to first fix the
	 * latter to remove its dependency on the current implementation
	 * of tcp_v4_send_synack()->tcp_select_initial_window().
	 */
	skb_synack = tcp_make_synack(sk, dst, req,
	    fastopen_cookie_present(&valid_foc) ? &valid_foc : NULL);

	if (skb_synack) {
		__tcp_v4_send_check(skb_synack, ireq->ir_loc_addr, ireq->ir_rmt_addr);
		skb_set_queue_mapping(skb_synack, skb_get_queue_mapping(skb));
	} else
		goto drop_and_free;

	if (likely(!do_fastopen)) {
		int err;
		err = ip_build_and_send_pkt(skb_synack, sk, ireq->ir_loc_addr,
		     ireq->ir_rmt_addr, ireq->opt);
		err = net_xmit_eval(err);
		if (err || want_cookie)
			goto drop_and_free;

		tcp_rsk(req)->snt_synack = tcp_time_stamp;
		tcp_rsk(req)->listener = NULL;
		/* Add the request_sock to the SYN table */
		inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
		if (fastopen_cookie_present(&foc) && foc.len != 0)
			NET_INC_STATS_BH(sock_net(sk),
			    LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
	} else if (tcp_v4_conn_req_fastopen(sk, skb, skb_synack, req))
		goto drop_and_free;

	return 0;

drop_and_release:
	dst_release(dst);
drop_and_free:
	reqsk_free(req);
drop:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
	return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);

/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst)
{
	struct inet_request_sock *ireq;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	newinet->inet_daddr   = ireq->ir_rmt_addr;
	newinet->inet_rcv_saddr = ireq->ir_loc_addr;
	newinet->inet_saddr   = ireq->ir_loc_addr;
	inet_opt	      = ireq->opt;
	rcu_assign_pointer(newinet->inet_opt, inet_opt);
	ireq->opt	      = NULL;
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	newinet->rcv_tos      = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	newinet->inet_id = newtp->write_seq ^ jiffies;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = dst_metric_advmss(dst);
	if (tcp_sk(sk)->rx_opt.user_mss &&
	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	/* Copy over the MD5 key from the original socket */
	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
				AF_INET);
	if (key != NULL) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
			       AF_INET, key->key, key->keylen, GFP_ATOMIC);
		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
	}
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	__inet_hash_nolisten(newsk, NULL);

	return newsk;

exit_overflow:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
	return NULL;
put_and_exit:
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = tcp_hdr(skb);
	const struct iphdr *iph = ip_hdr(skb);
	struct sock *nsk;
	struct request_sock **prev;
	/* Find possible connection requests. */
	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
						       iph->saddr, iph->daddr);
	if (req)
		return tcp_check_req(sk, skb, req, prev, false);

	nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
			th->source, iph->daddr, th->dest, inet_iif(skb));

	if (nsk) {
		if (nsk->sk_state != TCP_TIME_WAIT) {
			bh_lock_sock(nsk);
			return nsk;
		}
		inet_twsk_put(inet_twsk(nsk));
		return NULL;
	}

#ifdef CONFIG_SYN_COOKIES
	if (!th->syn)
		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
#endif
	return sk;
}

static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
{
	const struct iphdr *iph = ip_hdr(skb);

	if (skb->ip_summed == CHECKSUM_COMPLETE) {
		if (!tcp_v4_check(skb->len, iph->saddr,
				  iph->daddr, skb->csum)) {
			skb->ip_summed = CHECKSUM_UNNECESSARY;
			return 0;
		}
	}

	skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
				       skb->len, IPPROTO_TCP, 0);

	if (skb->len <= 76) {
		return __skb_checksum_complete(skb);
	}
	return 0;
}

/* The socket must have its spinlock held when we get
 * here.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct sock *rsk;
#ifdef CONFIG_TCP_MD5SIG
	/*
	 * We really want to reject the packet as early as possible
	 * if:
	 *  o We're expecting an MD5'd packet and this is no MD5 tcp option
	 *  o There is an MD5 option and we're not expecting one
	 */
	if (tcp_v4_inbound_md5_hash(sk, skb))
		goto discard;
#endif

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst = sk->sk_rx_dst;

		sock_rps_save_rxhash(sk, skb);
		if (dst) {
			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
			    dst->ops->check(dst, 0) == NULL) {
				dst_release(dst);
				sk->sk_rx_dst = NULL;
			}
		}
		tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
		return 0;
	}

	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
		if (!nsk)
			goto discard;

		if (nsk != sk) {
			sock_rps_save_rxhash(nsk, skb);
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);

void tcp_v4_early_demux(struct sk_buff *skb)
{
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;

	if (skb->pkt_type != PACKET_HOST)
		return;

	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
		return;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	if (th->doff < sizeof(struct tcphdr) / 4)
		return;

	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
				       iph->saddr, th->source,
				       iph->daddr, ntohs(th->dest),
				       skb->skb_iif);
	if (sk) {
		skb->sk = sk;
		skb->destructor = sock_edemux;
		if (sk->sk_state != TCP_TIME_WAIT) {
			struct dst_entry *dst = sk->sk_rx_dst;

			if (dst)
				dst = dst_check(dst, 0);
			if (dst &&
			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
				skb_dst_set_noref(skb, dst);
		}
	}
}

/* Packet is added to VJ-style prequeue for processing in process
 * context, if a reader task is waiting. Apparently, this exciting
 * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
 * failed somewhere. Latency? Burstiness? Well, at least now we will
 * see, why it failed. 8)8)				  --ANK
 *
 */
bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (sysctl_tcp_low_latency || !tp->ucopy.task)
		return false;

	if (skb->len <= tcp_hdrlen(skb) &&
	    skb_queue_len(&tp->ucopy.prequeue) == 0)
		return false;

	skb_dst_force(skb);
	__skb_queue_tail(&tp->ucopy.prequeue, skb);
	tp->ucopy.memory += skb->truesize;
	if (tp->ucopy.memory > sk->sk_rcvbuf) {
		struct sk_buff *skb1;

		BUG_ON(sock_owned_by_user(sk));

		while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
			sk_backlog_rcv(sk, skb1);
			NET_INC_STATS_BH(sock_net(sk),
					 LINUX_MIB_TCPPREQUEUEDROPPED);
		}

		tp->ucopy.memory = 0;
	} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
		wake_up_interruptible_sync_poll(sk_sleep(sk),
					   POLLIN | POLLRDNORM | POLLRDBAND);
		if (!inet_csk_ack_scheduled(sk))
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
						  (3 * tcp_rto_min(sk)) / 4,
						  TCP_RTO_MAX);
	}
	return true;
}
EXPORT_SYMBOL(tcp_prequeue);
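
/* Tuning note (illustrative, not part of the original file): the
 * prequeue can be bypassed entirely via the sysctl exported at the top
 * of this file:
 *
 *	# echo 1 > /proc/sys/net/ipv4/tcp_low_latency
 *
 * which trades the deferred, process-context processing above for
 * lower per-segment latency in softirq context.
 */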

/*
 *	From tcp_input.c
 */

int tcp_v4_rcv(struct sk_buff *skb)
{
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;
	int ret;
	struct net *net = dev_net(skb->dev);

	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = tcp_hdr(skb);

	if (th->doff < sizeof(struct tcphdr) / 4)
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */
	if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
		goto csum_error;

	th = tcp_hdr(skb);
	iph = ip_hdr(skb);
	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->when	 = 0;
	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
	TCP_SKB_CB(skb)->sacked	 = 0;

	sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
		goto discard_and_relse;
	}

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;
	nf_reset(skb);

	if (sk_filter(sk, skb))
		goto discard_and_relse;

	sk_mark_napi_id(sk, skb);
	skb->dev = NULL;

	bh_lock_sock_nested(sk);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
#ifdef CONFIG_NET_DMA
		struct tcp_sock *tp = tcp_sk(sk);
		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
			tp->ucopy.dma_chan = net_dma_find_channel();
		if (tp->ucopy.dma_chan)
			ret = tcp_v4_do_rcv(sk, skb);
		else
#endif
		{
			if (!tcp_prequeue(sk, skb))
				ret = tcp_v4_do_rcv(sk, skb);
		}
	} else if (unlikely(sk_add_backlog(sk, skb,
					   sk->sk_rcvbuf + sk->sk_sndbuf))) {
		bh_unlock_sock(sk);
		NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
		goto discard_and_relse;
	}
	bh_unlock_sock(sk);

	sock_put(sk);

	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
csum_error:
		TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
bad_packet:
		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(NULL, skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	if (skb->len < (th->doff << 2)) {
		inet_twsk_put(inet_twsk(sk));
		goto bad_packet;
	}
	if (tcp_checksum_complete(skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto csum_error;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN: {
		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
							&tcp_hashinfo,
							iph->saddr, th->source,
							iph->daddr, th->dest,
							inet_iif(skb));
		if (sk2) {
			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
			inet_twsk_put(inet_twsk(sk));
			sk = sk2;
			goto process;
		}
		/* Fall through to ACK */
	}
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		goto no_tcp_socket;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}

static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
	.twsk_unique	= tcp_twsk_unique,
	.twsk_destructor= tcp_twsk_destructor,
};

void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	dst_hold(dst);
	sk->sk_rx_dst = dst;
	inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
}
EXPORT_SYMBOL(inet_sk_rx_dst_set);

const struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,
	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
	.net_header_len	   = sizeof(struct iphdr),
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
	.addr2sockaddr	   = inet_csk_addr2sockaddr,
	.sockaddr_len	   = sizeof(struct sockaddr_in),
	.bind_conflict	   = inet_csk_bind_conflict,
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_ip_setsockopt,
	.compat_getsockopt = compat_ip_getsockopt,
#endif
};
EXPORT_SYMBOL(ipv4_specific);

#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup	= tcp_v4_md5_lookup,
	.calc_md5_hash	= tcp_v4_md5_hash_skb,
	.md5_parse	= tcp_v4_parse_md5_keys,
};
#endif

/* NOTE: A lot of things set to zero explicitly by call to
 *       sk_alloc() so need not be done here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	tcp_init_sock(sk);

	icsk->icsk_af_ops = &ipv4_specific;

#ifdef CONFIG_TCP_MD5SIG
	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
#endif

	return 0;
}

void tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	/* Clean up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	__skb_queue_purge(&tp->out_of_order_queue);

#ifdef CONFIG_TCP_MD5SIG
	/* Clean up the MD5 key list, if any */
	if (tp->md5sig_info) {
		tcp_clear_md5_list(sk);
		kfree_rcu(tp->md5sig_info, rcu);
		tp->md5sig_info = NULL;
	}
#endif

#ifdef CONFIG_NET_DMA
	/* Cleans up our sk_async_wait_queue */
	__skb_queue_purge(&sk->sk_async_wait_queue);
#endif

	/* Clean prequeue, it must be empty really */
	__skb_queue_purge(&tp->ucopy.prequeue);

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	BUG_ON(tp->fastopen_rsk != NULL);

	/* If socket is aborted during connect operation */
	tcp_free_fastopen_req(tp);

	sk_sockets_allocated_dec(sk);
	sock_release_memcg(sk);
}
EXPORT_SYMBOL(tcp_v4_destroy_sock);

#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */

/*
 * Get the next listener socket following cur.  If cur is NULL, get the first
 * socket starting from the bucket given in st->bucket; when st->bucket is
 * zero the very first socket in the hash table is returned.
 */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct inet_connection_sock *icsk;
	struct hlist_nulls_node *node;
	struct sock *sk = cur;
	struct inet_listen_hashbucket *ilb;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	if (!sk) {
		ilb = &tcp_hashinfo.listening_hash[st->bucket];
		spin_lock_bh(&ilb->lock);
		sk = sk_nulls_head(&ilb->head);
		st->offset = 0;
		goto get_sk;
	}
	ilb = &tcp_hashinfo.listening_hash[st->bucket];
	++st->num;
	++st->offset;

	if (st->state == TCP_SEQ_STATE_OPENREQ) {
		struct request_sock *req = cur;

		icsk = inet_csk(st->syn_wait_sk);
		req = req->dl_next;
		while (1) {
			while (req) {
				if (req->rsk_ops->family == st->family) {
					cur = req;
					goto out;
				}
				req = req->dl_next;
			}
			if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
				break;
get_req:
			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
		}
		sk	  = sk_nulls_next(st->syn_wait_sk);
		st->state = TCP_SEQ_STATE_LISTENING;
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
	} else {
		icsk = inet_csk(sk);
		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		if (reqsk_queue_len(&icsk->icsk_accept_queue))
			goto start_req;
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		sk = sk_nulls_next(sk);
	}
get_sk:
	sk_nulls_for_each_from(sk, node) {
		if (!net_eq(sock_net(sk), net))
			continue;
		if (sk->sk_family == st->family) {
			cur = sk;
			goto out;
		}
		icsk = inet_csk(sk);
		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
start_req:
			st->uid		= sock_i_uid(sk);
			st->syn_wait_sk = sk;
			st->state	= TCP_SEQ_STATE_OPENREQ;
			st->sbucket	= 0;
			goto get_req;
		}
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
	}
	spin_unlock_bh(&ilb->lock);
	st->offset = 0;
	if (++st->bucket < INET_LHTABLE_SIZE) {
		ilb = &tcp_hashinfo.listening_hash[st->bucket];
		spin_lock_bh(&ilb->lock);
		sk = sk_nulls_head(&ilb->head);
		goto get_sk;
	}
	cur = NULL;
out:
	return cur;
}

static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	st->offset = 0;
	rc = listening_get_next(seq, NULL);

	while (rc && *pos) {
		rc = listening_get_next(seq, rc);
		--*pos;
	}
	return rc;
}

static inline bool empty_bucket(const struct tcp_iter_state *st)
{
	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
}

/*
 * Get first established socket starting from bucket given in st->bucket.
 * If st->bucket is zero, the very first socket in the hash is returned.
 */
static void *established_get_first(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	void *rc = NULL;

	st->offset = 0;
	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
		struct sock *sk;
		struct hlist_nulls_node *node;
		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);

		/* Lockless fast path for the common case of empty buckets */
		if (empty_bucket(st))
			continue;

		spin_lock_bh(lock);
		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
			if (sk->sk_family != st->family ||
			    !net_eq(sock_net(sk), net)) {
				continue;
			}
			rc = sk;
			goto out;
		}
		spin_unlock_bh(lock);
	}
out:
	return rc;
}
static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct sock *sk = cur;
	struct hlist_nulls_node *node;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);
	sk_nulls_for_each_from(sk, node) {
		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
			return sk;
	}

	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
	++st->bucket;
	return established_get_first(seq);
}
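/*
 * Editorial note: established_get_first() returns with the ehash bucket's
 * spinlock held; established_get_next() keeps it while walking entries in
 * the same bucket and only drops it when advancing to the next bucket. The
 * final unlock again falls to tcp_seq_stop().
 */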
static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	rc = established_get_first(seq);
	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}
static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state *st = seq->private;

	st->state = TCP_SEQ_STATE_LISTENING;
	rc = listening_get_idx(seq, &pos);
	if (!rc) {
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc = established_get_idx(seq, pos);
	}
	return rc;
}
static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket >= INET_LHTABLE_SIZE)
			break;
		st->state = TCP_SEQ_STATE_LISTENING;
		rc = listening_get_next(seq, NULL);
		while (offset-- && rc)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		/* Fallthrough */
	case TCP_SEQ_STATE_ESTABLISHED:
		if (st->bucket > tcp_hashinfo.ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc)
			rc = established_get_next(seq, rc);
	}

	st->num = orig_num;
	return rc;
}
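/*
 * Editorial note: tcp_seek_last_pos() exists because seq_file restarts
 * every read(2) by seeking to *pos. Replaying the cached st->bucket and
 * st->offset instead of walking the whole table from entry 0 keeps large
 * /proc/net/tcp reads from degenerating into quadratic rescans.
 */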
static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	if (*pos && *pos == st->last_pos) {
		rc = tcp_seek_last_pos(seq);
		if (rc)
			goto out;
	}

	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	st->bucket = 0;
	st->offset = 0;
	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
out:
	st->last_pos = *pos;
	return rc;
}
static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
			st->offset = 0;
			rc = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	st->last_pos = *pos;
	return rc;
}
static void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
		if (v) {
			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		}
		/* Fallthrough: the listening bucket lock is held too. */
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		break;
	}
}
int tcp_seq_open(struct inode *inode, struct file *file)
{
	struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
	struct tcp_iter_state *s;
	int err;

	err = seq_open_net(inode, file, &afinfo->seq_ops,
			   sizeof(struct tcp_iter_state));
	if (err < 0)
		return err;

	s = ((struct seq_file *)file->private_data)->private;
	s->family = afinfo->family;
	s->last_pos = 0;
	return 0;
}
EXPORT_SYMBOL(tcp_seq_open);
int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
{
	int rc = 0;
	struct proc_dir_entry *p;

	afinfo->seq_ops.start = tcp_seq_start;
	afinfo->seq_ops.next = tcp_seq_next;
	afinfo->seq_ops.stop = tcp_seq_stop;

	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
			     afinfo->seq_fops, afinfo);
	if (!p)
		rc = -ENOMEM;
	return rc;
}
EXPORT_SYMBOL(tcp_proc_register);

void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
{
	remove_proc_entry(afinfo->name, net->proc_net);
}
EXPORT_SYMBOL(tcp_proc_unregister);
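/*
 * Editorial note: each address family supplies its own tcp_seq_afinfo and
 * calls tcp_proc_register() from a pernet init hook. The AF_INET instance
 * (tcp4_seq_afinfo, registered by tcp4_proc_init_net()) appears below, and
 * tcp_ipv6.c registers an AF_INET6 counterpart the same way.
 */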
static void get_openreq4(const struct sock *sk, const struct request_sock *req,
			 struct seq_file *f, int i, kuid_t uid)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	long delta = req->expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
		i,
		ireq->ir_loc_addr,
		ntohs(inet_sk(sk)->inet_sport),
		ireq->ir_rmt_addr,
		ntohs(ireq->ir_rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_delta_to_clock_t(delta),
		req->num_timeout,
		from_kuid_munged(seq_user_ns(f), uid),
		0,  /* non standard timer */
		0,  /* open_requests have no inode */
		atomic_read(&sk->sk_refcnt),
		req);
}
static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
{
	int timer_active;
	unsigned long timer_expires;
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct inet_sock *inet = inet_sk(sk);
	struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq;
	__be32 dest = inet->inet_daddr;
	__be32 src = inet->inet_rcv_saddr;
	__u16 destp = ntohs(inet->inet_dport);
	__u16 srcp = ntohs(inet->inet_sport);
	int rx_queue;

	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
		timer_active	= 1;
		timer_expires	= icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= icsk->icsk_timeout;
	} else if (timer_pending(&sk->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sk->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires	= jiffies;
	}

	if (sk->sk_state == TCP_LISTEN)
		rx_queue = sk->sk_ack_backlog;
	else
		/*
		 * Because we don't lock the socket,
		 * we might find a transient negative value.
		 */
		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
		i, src, srcp, dest, destp, sk->sk_state,
		tp->write_seq - tp->snd_una,
		rx_queue,
		timer_active,
		jiffies_delta_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
		icsk->icsk_probes_out,
		sock_i_ino(sk),
		atomic_read(&sk->sk_refcnt), sk,
		jiffies_to_clock_t(icsk->icsk_rto),
		jiffies_to_clock_t(icsk->icsk_ack.ato),
		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
		tp->snd_cwnd,
		sk->sk_state == TCP_LISTEN ?
		    (fastopenq ? fastopenq->max_qlen : 0) :
		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
}
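/*
 * Editorial note: the timer codes emitted above follow the historical
 * /proc/net/tcp convention: 0 = no timer, 1 = retransmit (including early
 * retransmit and loss probe), 2 = the generic sk_timer (keepalive),
 * 3 = TIME_WAIT (see get_timewait4_sock() below), 4 = zero-window probe.
 */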
static void get_timewait4_sock(const struct inet_timewait_sock *tw,
			       struct seq_file *f, int i)
{
	__be32 dest, src;
	__u16 destp, srcp;
	long delta = tw->tw_ttd - jiffies;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
		atomic_read(&tw->tw_refcnt), tw);
}
static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	struct sock *sk = v;

	seq_setwidth(seq, TMPSZ - 1);
	if (v == SEQ_START_TOKEN) {
		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
	case TCP_SEQ_STATE_ESTABLISHED:
		if (sk->sk_state == TCP_TIME_WAIT)
			get_timewait4_sock(v, seq, st->num);
		else
			get_tcp4_sock(v, seq, st->num);
		break;
	case TCP_SEQ_STATE_OPENREQ:
		get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid);
		break;
	}
out:
	seq_pad(seq, '\n');
	return 0;
}
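/*
 * Illustrative only (inode and pointer values are made up): a listener on
 * 127.0.0.1:22 would be rendered by tcp4_seq_show() roughly as
 *
 *   0: 0100007F:0016 00000000:0000 0A 00000000:00000000 00:00000000 00000000     0        0 12345 1 ffff88003d3af3c0 100 0 0 10 0
 *
 * i.e. hex address:port pairs, the TCP state (0A == TCP_LISTEN), queue
 * sizes, timer info, uid, inode and the socket pointer, in the column
 * order printed by get_tcp4_sock() above.
 */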
static const struct file_operations tcp_afinfo_seq_fops = {
	.owner   = THIS_MODULE,
	.open    = tcp_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_net
};

static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.name		= "tcp",
	.family		= AF_INET,
	.seq_fops	= &tcp_afinfo_seq_fops,
	.seq_ops	= {
		.show		= tcp4_seq_show,
	},
};
static int __net_init tcp4_proc_init_net(struct net *net)
{
	return tcp_proc_register(net, &tcp4_seq_afinfo);
}

static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	tcp_proc_unregister(net, &tcp4_seq_afinfo);
}

static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};

int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}

void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
#endif /* CONFIG_PROC_FS */
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.sendpage		= tcp_sendpage,
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	.mtu_reduced		= tcp_v4_mtu_reduced,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.stream_memory_free	= tcp_stream_memory_free,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem		= sysctl_tcp_wmem,
	.sysctl_rmem		= sysctl_tcp_rmem,
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_DESTROY_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
	.no_autobind		= true,
#ifdef CONFIG_COMPAT
	.compat_setsockopt	= compat_tcp_setsockopt,
	.compat_getsockopt	= compat_tcp_getsockopt,
#endif
#ifdef CONFIG_MEMCG_KMEM
	.init_cgroup		= tcp_init_cgroup,
	.destroy_cgroup		= tcp_destroy_cgroup,
	.proto_cgroup		= tcp_proto_cgroup,
#endif
};
EXPORT_SYMBOL(tcp_prot);
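/*
 * Editorial note: tcp_prot is not used directly from this file; it is
 * passed to proto_register() and hooked into the AF_INET socket switch
 * during inet_init() in net/ipv4/af_inet.c, which is how
 * SOCK_STREAM/IPPROTO_TCP sockets end up calling the methods above.
 */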
static int __net_init tcp_sk_init(struct net *net)
{
	net->ipv4.sysctl_tcp_ecn = 2;
	return 0;
}

static void __net_exit tcp_sk_exit(struct net *net)
{
}

static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
	inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
}

static struct pernet_operations __net_initdata tcp_sk_ops = {
	.init	    = tcp_sk_init,
	.exit	    = tcp_sk_exit,
	.exit_batch = tcp_sk_exit_batch,
};

void __init tcp_v4_init(void)
{
	inet_hashinfo_init(&tcp_hashinfo);
	if (register_pernet_subsys(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");
}
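/*
 * Editorial note: the per-netns default of sysctl_tcp_ecn = 2 means "accept
 * ECN when the peer requests it, but do not request it on outgoing
 * connections" (see Documentation/networking/ip-sysctl.txt).
 */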