net/ipv4/tcp_ipv4.c  [cascardo/linux.git]
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  *              IPv4 specific functions
9  *
10  *
11  *              code split from:
12  *              linux/ipv4/tcp.c
13  *              linux/ipv4/tcp_input.c
14  *              linux/ipv4/tcp_output.c
15  *
16  *              See tcp.c for author information
17  *
18  *      This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23
24 /*
25  * Changes:
26  *              David S. Miller :       New socket lookup architecture.
27  *                                      This code is dedicated to John Dyson.
28  *              David S. Miller :       Change semantics of established hash,
29  *                                      half is devoted to TIME_WAIT sockets
30  *                                      and the rest go in the other half.
31  *              Andi Kleen :            Add support for syncookies and fixed
32  *                                      some bugs: ip options weren't passed to
33  *                                      the TCP layer, missed a check for an
34  *                                      ACK bit.
35  *              Andi Kleen :            Implemented fast path mtu discovery.
36  *                                      Fixed many serious bugs in the
37  *                                      request_sock handling and moved
38  *                                      most of it into the af independent code.
39  *                                      Added tail drop and some other bugfixes.
40  *                                      Added new listen semantics.
41  *              Mike McLagan    :       Routing by source
42  *      Juan Jose Ciarlante:            ip_dynaddr bits
43  *              Andi Kleen:             various fixes.
44  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
45  *                                      coma.
46  *      Andi Kleen              :       Fix new listen.
47  *      Andi Kleen              :       Fix accept error reporting.
48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
49  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
50  *                                      a single port at the same time.
51  */
52
53 #define pr_fmt(fmt) "TCP: " fmt
54
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/secure_seq.h>
76 #include <net/tcp_memcontrol.h>
77 #include <net/busy_poll.h>
78
79 #include <linux/inet.h>
80 #include <linux/ipv6.h>
81 #include <linux/stddef.h>
82 #include <linux/proc_fs.h>
83 #include <linux/seq_file.h>
84
85 #include <linux/crypto.h>
86 #include <linux/scatterlist.h>
87
88 int sysctl_tcp_tw_reuse __read_mostly;
89 int sysctl_tcp_low_latency __read_mostly;
90 EXPORT_SYMBOL(sysctl_tcp_low_latency);
91
92 #ifdef CONFIG_TCP_MD5SIG
93 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
94                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
95 #endif
96
97 struct inet_hashinfo tcp_hashinfo;
98 EXPORT_SYMBOL(tcp_hashinfo);
99
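/* Derive the initial sequence number for an outgoing connection from the
 * connection 4-tuple via secure_tcp_sequence_number(), so it is hard for
 * off-path peers to predict.
 */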
100 static  __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
101 {
102         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
103                                           ip_hdr(skb)->saddr,
104                                           tcp_hdr(skb)->dest,
105                                           tcp_hdr(skb)->source);
106 }
107
108 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
109 {
110         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
111         struct tcp_sock *tp = tcp_sk(sk);
112
113         /* With PAWS, it is safe from the viewpoint
114            of data integrity. Even without PAWS it is safe provided sequence
115            spaces do not overlap, i.e. at data rates <= 80Mbit/sec.
116 
117            Actually, the idea is close to VJ's, only the timestamp cache is
118            held not per host but per port pair, and the TW bucket is used as
119            the state holder.
120 
121            If the TW bucket has already been destroyed we fall back to VJ's
122            scheme and use the initial timestamp retrieved from the peer table.
123          */
124         if (tcptw->tw_ts_recent_stamp &&
125             (!twp || (sysctl_tcp_tw_reuse &&
126                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
127                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
128                 if (tp->write_seq == 0)
129                         tp->write_seq = 1;
130                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
131                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
132                 sock_hold(sktw);
133                 return 1;
134         }
135
136         return 0;
137 }
138 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
139
140 /* This will initiate an outgoing connection. */
141 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
142 {
143         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
144         struct inet_sock *inet = inet_sk(sk);
145         struct tcp_sock *tp = tcp_sk(sk);
146         __be16 orig_sport, orig_dport;
147         __be32 daddr, nexthop;
148         struct flowi4 *fl4;
149         struct rtable *rt;
150         int err;
151         struct ip_options_rcu *inet_opt;
152
153         if (addr_len < sizeof(struct sockaddr_in))
154                 return -EINVAL;
155
156         if (usin->sin_family != AF_INET)
157                 return -EAFNOSUPPORT;
158
159         nexthop = daddr = usin->sin_addr.s_addr;
160         inet_opt = rcu_dereference_protected(inet->inet_opt,
161                                              sock_owned_by_user(sk));
162         if (inet_opt && inet_opt->opt.srr) {
163                 if (!daddr)
164                         return -EINVAL;
165                 nexthop = inet_opt->opt.faddr;
166         }
167
168         orig_sport = inet->inet_sport;
169         orig_dport = usin->sin_port;
170         fl4 = &inet->cork.fl.u.ip4;
171         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
172                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
173                               IPPROTO_TCP,
174                               orig_sport, orig_dport, sk);
175         if (IS_ERR(rt)) {
176                 err = PTR_ERR(rt);
177                 if (err == -ENETUNREACH)
178                         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
179                 return err;
180         }
181
182         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
183                 ip_rt_put(rt);
184                 return -ENETUNREACH;
185         }
186
187         if (!inet_opt || !inet_opt->opt.srr)
188                 daddr = fl4->daddr;
189
190         if (!inet->inet_saddr)
191                 inet->inet_saddr = fl4->saddr;
192         sk_rcv_saddr_set(sk, inet->inet_saddr);
193
194         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
195                 /* Reset inherited state */
196                 tp->rx_opt.ts_recent       = 0;
197                 tp->rx_opt.ts_recent_stamp = 0;
198                 if (likely(!tp->repair))
199                         tp->write_seq      = 0;
200         }
201
202         if (tcp_death_row.sysctl_tw_recycle &&
203             !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
204                 tcp_fetch_timewait_stamp(sk, &rt->dst);
205
206         inet->inet_dport = usin->sin_port;
207         sk_daddr_set(sk, daddr);
208
209         inet_csk(sk)->icsk_ext_hdr_len = 0;
210         if (inet_opt)
211                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
212
213         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
214
215         /* Socket identity is still unknown (sport may be zero).
216          * However, we set the state to SYN-SENT and, without releasing the
217          * socket lock, select a source port, enter ourselves into the hash
218          * tables and complete initialization after this.
219          */
220         tcp_set_state(sk, TCP_SYN_SENT);
221         err = inet_hash_connect(&tcp_death_row, sk);
222         if (err)
223                 goto failure;
224
225         sk_set_txhash(sk);
226
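        /* inet_hash_connect() may have picked a new source port; re-validate
         * the route against the final 4-tuple before committing to it.
         */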
227         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
228                                inet->inet_sport, inet->inet_dport, sk);
229         if (IS_ERR(rt)) {
230                 err = PTR_ERR(rt);
231                 rt = NULL;
232                 goto failure;
233         }
234         /* OK, now commit destination to socket.  */
235         sk->sk_gso_type = SKB_GSO_TCPV4;
236         sk_setup_caps(sk, &rt->dst);
237
238         if (!tp->write_seq && likely(!tp->repair))
239                 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
240                                                            inet->inet_daddr,
241                                                            inet->inet_sport,
242                                                            usin->sin_port);
243
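        /* Seed the IP ID counter from the initial sequence number and jiffies. */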
244         inet->inet_id = tp->write_seq ^ jiffies;
245
246         err = tcp_connect(sk);
247
248         rt = NULL;
249         if (err)
250                 goto failure;
251
252         return 0;
253
254 failure:
255         /*
256          * This unhashes the socket and releases the local port,
257          * if necessary.
258          */
259         tcp_set_state(sk, TCP_CLOSE);
260         ip_rt_put(rt);
261         sk->sk_route_caps = 0;
262         inet->inet_dport = 0;
263         return err;
264 }
265 EXPORT_SYMBOL(tcp_v4_connect);
266
267 /*
268  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
269  * It can be called through tcp_release_cb() if socket was owned by user
270  * at the time tcp_v4_err() was called to handle ICMP message.
271  */
272 void tcp_v4_mtu_reduced(struct sock *sk)
273 {
274         struct dst_entry *dst;
275         struct inet_sock *inet = inet_sk(sk);
276         u32 mtu = tcp_sk(sk)->mtu_info;
277
278         dst = inet_csk_update_pmtu(sk, mtu);
279         if (!dst)
280                 return;
281
282         /* Something is about to go wrong... Remember the soft error
283          * in case this connection is not able to recover.
284          */
285         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
286                 sk->sk_err_soft = EMSGSIZE;
287
288         mtu = dst_mtu(dst);
289
290         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
291             ip_sk_accept_pmtu(sk) &&
292             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
293                 tcp_sync_mss(sk, mtu);
294
295                 /* Resend the TCP packet because it's
296                  * clear that the old packet has been
297                  * dropped. This is the new "fast" path mtu
298                  * discovery.
299                  */
300                 tcp_simple_retransmit(sk);
301         } /* else let the usual retransmit timer handle it */
302 }
303 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
304
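/* Apply an ICMP redirect to this socket's cached route, if one is attached. */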
305 static void do_redirect(struct sk_buff *skb, struct sock *sk)
306 {
307         struct dst_entry *dst = __sk_dst_check(sk, 0);
308
309         if (dst)
310                 dst->ops->redirect(dst, sk, skb);
311 }
312
313
314 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
315 void tcp_req_err(struct sock *sk, u32 seq)
316 {
317         struct request_sock *req = inet_reqsk(sk);
318         struct net *net = sock_net(sk);
319
320         /* ICMPs are not backlogged, hence we cannot get
321          * an established socket here.
322          */
323         WARN_ON(req->sk);
324
325         if (seq != tcp_rsk(req)->snt_isn) {
326                 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
327         } else {
328                 /*
329                  * Still in SYN_RECV, just remove it silently.
330                  * There is no good way to pass the error to the newly
331                  * created socket, and POSIX does not want network
332                  * errors returned from accept().
333                  */
334                 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
335                 NET_INC_STATS_BH(net, LINUX_MIB_LISTENDROPS);
336         }
337         reqsk_put(req);
338 }
339 EXPORT_SYMBOL(tcp_req_err);
340
341 /*
342  * This routine is called by the ICMP module when it gets some
343  * sort of error condition.  If err < 0 then the socket should
344  * be closed and the error returned to the user.  If err > 0
345  * it's just the icmp type << 8 | icmp code.  After adjustment,
346  * header points to the first 8 bytes of the tcp header.  We need
347  * to find the appropriate port.
348  *
349  * The locking strategy used here is very "optimistic". When
350  * someone else accesses the socket the ICMP is just dropped
351  * and for some paths there is no check at all.
352  * A more general error queue to queue errors for later handling
353  * is probably better.
354  *
355  */
356
357 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
358 {
359         const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
360         struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
361         struct inet_connection_sock *icsk;
362         struct tcp_sock *tp;
363         struct inet_sock *inet;
364         const int type = icmp_hdr(icmp_skb)->type;
365         const int code = icmp_hdr(icmp_skb)->code;
366         struct sock *sk;
367         struct sk_buff *skb;
368         struct request_sock *fastopen;
369         __u32 seq, snd_una;
370         __u32 remaining;
371         int err;
372         struct net *net = dev_net(icmp_skb->dev);
373
374         sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
375                                        th->dest, iph->saddr, ntohs(th->source),
376                                        inet_iif(icmp_skb));
377         if (!sk) {
378                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
379                 return;
380         }
381         if (sk->sk_state == TCP_TIME_WAIT) {
382                 inet_twsk_put(inet_twsk(sk));
383                 return;
384         }
385         seq = ntohl(th->seq);
386         if (sk->sk_state == TCP_NEW_SYN_RECV)
387                 return tcp_req_err(sk, seq);
388
389         bh_lock_sock(sk);
390         /* If too many ICMPs get dropped on busy
391          * servers this needs to be solved differently.
392          * We do take care of the PMTU discovery (RFC 1191) special case:
393          * we can receive locally generated ICMP messages while the socket is held.
394          */
395         if (sock_owned_by_user(sk)) {
396                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
397                         NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
398         }
399         if (sk->sk_state == TCP_CLOSE)
400                 goto out;
401
402         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
403                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
404                 goto out;
405         }
406
407         icsk = inet_csk(sk);
408         tp = tcp_sk(sk);
409         /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
410         fastopen = tp->fastopen_rsk;
411         snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
412         if (sk->sk_state != TCP_LISTEN &&
413             !between(seq, snd_una, tp->snd_nxt)) {
414                 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
415                 goto out;
416         }
417
418         switch (type) {
419         case ICMP_REDIRECT:
420                 do_redirect(icmp_skb, sk);
421                 goto out;
422         case ICMP_SOURCE_QUENCH:
423                 /* Just silently ignore these. */
424                 goto out;
425         case ICMP_PARAMETERPROB:
426                 err = EPROTO;
427                 break;
428         case ICMP_DEST_UNREACH:
429                 if (code > NR_ICMP_UNREACH)
430                         goto out;
431
432                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
433                         /* We are not interested in TCP_LISTEN and open_requests
434                          * (SYN-ACKs sent out by Linux are always < 576 bytes, so
435                          * they should go through unfragmented).
436                          */
437                         if (sk->sk_state == TCP_LISTEN)
438                                 goto out;
439
440                         tp->mtu_info = info;
441                         if (!sock_owned_by_user(sk)) {
442                                 tcp_v4_mtu_reduced(sk);
443                         } else {
444                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
445                                         sock_hold(sk);
446                         }
447                         goto out;
448                 }
449
450                 err = icmp_err_convert[code].errno;
451                 /* check if icmp_skb allows revert of backoff
452                  * (see draft-zimmermann-tcp-lcd) */
453                 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
454                         break;
455                 if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
456                     !icsk->icsk_backoff || fastopen)
457                         break;
458
459                 if (sock_owned_by_user(sk))
460                         break;
461
462                 icsk->icsk_backoff--;
463                 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
464                                                TCP_TIMEOUT_INIT;
465                 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
466
467                 skb = tcp_write_queue_head(sk);
468                 BUG_ON(!skb);
469
470                 remaining = icsk->icsk_rto -
471                             min(icsk->icsk_rto,
472                                 tcp_time_stamp - tcp_skb_timestamp(skb));
473
474                 if (remaining) {
475                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
476                                                   remaining, TCP_RTO_MAX);
477                 } else {
478                         /* RTO revert clocked out retransmission.
479                          * Will retransmit now */
480                         tcp_retransmit_timer(sk);
481                 }
482
483                 break;
484         case ICMP_TIME_EXCEEDED:
485                 err = EHOSTUNREACH;
486                 break;
487         default:
488                 goto out;
489         }
490
491         switch (sk->sk_state) {
492         case TCP_SYN_SENT:
493         case TCP_SYN_RECV:
494                 /* Only in fast or simultaneous open. If a fast open socket is
495                  * already accepted it is treated as a connected one below.
496                  */
497                 if (fastopen && !fastopen->sk)
498                         break;
499
500                 if (!sock_owned_by_user(sk)) {
501                         sk->sk_err = err;
502
503                         sk->sk_error_report(sk);
504
505                         tcp_done(sk);
506                 } else {
507                         sk->sk_err_soft = err;
508                 }
509                 goto out;
510         }
511
512         /* If we've already connected we will keep trying
513          * until we time out, or the user gives up.
514          *
515          * rfc1122 4.2.3.9 allows us to consider as hard errors
516          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
517          * but it is obsoleted by pmtu discovery).
518          *
519          * Note that in the modern internet, where routing is unreliable
520          * and broken firewalls sit in every dark corner sending random
521          * errors ordered by their masters, even these two messages finally
522          * lose their original sense (even Linux sends invalid PORT_UNREACHs).
523          *
524          * Now we are in compliance with RFCs.
525          *                                                      --ANK (980905)
526          */
527
528         inet = inet_sk(sk);
529         if (!sock_owned_by_user(sk) && inet->recverr) {
530                 sk->sk_err = err;
531                 sk->sk_error_report(sk);
532         } else  { /* Only an error on timeout */
533                 sk->sk_err_soft = err;
534         }
535
536 out:
537         bh_unlock_sock(sk);
538         sock_put(sk);
539 }
540
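/* Fill in the TCP checksum field: leave a partial checksum for the device to
 * finish when CHECKSUM_PARTIAL is set, otherwise compute it in software over
 * the header and payload.
 */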
541 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
542 {
543         struct tcphdr *th = tcp_hdr(skb);
544
545         if (skb->ip_summed == CHECKSUM_PARTIAL) {
546                 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
547                 skb->csum_start = skb_transport_header(skb) - skb->head;
548                 skb->csum_offset = offsetof(struct tcphdr, check);
549         } else {
550                 th->check = tcp_v4_check(skb->len, saddr, daddr,
551                                          csum_partial(th,
552                                                       th->doff << 2,
553                                                       skb->csum));
554         }
555 }
556
557 /* This routine computes an IPv4 TCP checksum. */
558 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
559 {
560         const struct inet_sock *inet = inet_sk(sk);
561
562         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
563 }
564 EXPORT_SYMBOL(tcp_v4_send_check);
565
566 /*
567  *      This routine will send an RST to the other tcp.
568  *
569  *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
570  *                    for the reset?
571  *      Answer: if a packet caused an RST, it is not for a socket
572  *              existing in our system; if it is matched to a socket,
573  *              it is just a duplicate segment or a bug in the other side's TCP.
574  *              So we build the reply based only on parameters that
575  *              arrived with the segment.
576  *      Exception: precedence violation. We do not implement it in any case.
577  */
578
579 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
580 {
581         const struct tcphdr *th = tcp_hdr(skb);
582         struct {
583                 struct tcphdr th;
584 #ifdef CONFIG_TCP_MD5SIG
585                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
586 #endif
587         } rep;
588         struct ip_reply_arg arg;
589 #ifdef CONFIG_TCP_MD5SIG
590         struct tcp_md5sig_key *key = NULL;
591         const __u8 *hash_location = NULL;
592         unsigned char newhash[16];
593         int genhash;
594         struct sock *sk1 = NULL;
595 #endif
596         struct net *net;
597
598         /* Never send a reset in response to a reset. */
599         if (th->rst)
600                 return;
601
602         /* If sk is not NULL, it means we did a successful lookup and the
603          * incoming route had to be correct. prequeue might have dropped our dst.
604          */
605         if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
606                 return;
607
608         /* Swap the send and the receive. */
609         memset(&rep, 0, sizeof(rep));
610         rep.th.dest   = th->source;
611         rep.th.source = th->dest;
612         rep.th.doff   = sizeof(struct tcphdr) / 4;
613         rep.th.rst    = 1;
614
615         if (th->ack) {
616                 rep.th.seq = th->ack_seq;
617         } else {
618                 rep.th.ack = 1;
619                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
620                                        skb->len - (th->doff << 2));
621         }
622
623         memset(&arg, 0, sizeof(arg));
624         arg.iov[0].iov_base = (unsigned char *)&rep;
625         arg.iov[0].iov_len  = sizeof(rep.th);
626
627         net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
628 #ifdef CONFIG_TCP_MD5SIG
629         hash_location = tcp_parse_md5sig_option(th);
630         if (sk && sk_fullsock(sk)) {
631                 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
632                                         &ip_hdr(skb)->saddr, AF_INET);
633         } else if (hash_location) {
634                 /*
635                  * The active side is lost. Try to find the listening socket
636                  * through the source port, and then find the md5 key through
637                  * the listening socket. We do not lose security here:
638                  * the incoming packet is checked with the md5 hash of the found
639                  * key, and no RST is generated if the md5 hash doesn't match.
640                  */
641                 sk1 = __inet_lookup_listener(net,
642                                              &tcp_hashinfo, ip_hdr(skb)->saddr,
643                                              th->source, ip_hdr(skb)->daddr,
644                                              ntohs(th->source), inet_iif(skb));
645                 /* don't send rst if it can't find key */
646                 if (!sk1)
647                         return;
648                 rcu_read_lock();
649                 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
650                                         &ip_hdr(skb)->saddr, AF_INET);
651                 if (!key)
652                         goto release_sk1;
653
654                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
655                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
656                         goto release_sk1;
657         }
658
659         if (key) {
660                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
661                                    (TCPOPT_NOP << 16) |
662                                    (TCPOPT_MD5SIG << 8) |
663                                    TCPOLEN_MD5SIG);
664                 /* Update length and the length the header thinks exists */
665                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
666                 rep.th.doff = arg.iov[0].iov_len / 4;
667
668                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
669                                      key, ip_hdr(skb)->saddr,
670                                      ip_hdr(skb)->daddr, &rep.th);
671         }
672 #endif
673         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
674                                       ip_hdr(skb)->saddr, /* XXX */
675                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
676         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
677         arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
678
679         /* When the socket is gone, all binding information is lost and
680          * routing might fail in this case. No choice here: if we choose to force
681          * the input interface, we will misroute in case of an asymmetric route.
682          */
683         if (sk)
684                 arg.bound_dev_if = sk->sk_bound_dev_if;
685
686         BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
687                      offsetof(struct inet_timewait_sock, tw_bound_dev_if));
688
689         arg.tos = ip_hdr(skb)->tos;
690         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
691                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
692                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
693                               &arg, arg.iov[0].iov_len);
694
695         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
696         TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
697
698 #ifdef CONFIG_TCP_MD5SIG
699 release_sk1:
700         if (sk1) {
701                 rcu_read_unlock();
702                 sock_put(sk1);
703         }
704 #endif
705 }
706
707 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
708    outside socket context, is certainly ugly. What can I do?
709  */
710
711 static void tcp_v4_send_ack(struct net *net,
712                             struct sk_buff *skb, u32 seq, u32 ack,
713                             u32 win, u32 tsval, u32 tsecr, int oif,
714                             struct tcp_md5sig_key *key,
715                             int reply_flags, u8 tos)
716 {
717         const struct tcphdr *th = tcp_hdr(skb);
718         struct {
719                 struct tcphdr th;
720                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
721 #ifdef CONFIG_TCP_MD5SIG
722                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
723 #endif
724                         ];
725         } rep;
726         struct ip_reply_arg arg;
727
728         memset(&rep.th, 0, sizeof(struct tcphdr));
729         memset(&arg, 0, sizeof(arg));
730
731         arg.iov[0].iov_base = (unsigned char *)&rep;
732         arg.iov[0].iov_len  = sizeof(rep.th);
733         if (tsecr) {
734                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
735                                    (TCPOPT_TIMESTAMP << 8) |
736                                    TCPOLEN_TIMESTAMP);
737                 rep.opt[1] = htonl(tsval);
738                 rep.opt[2] = htonl(tsecr);
739                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
740         }
741
742         /* Swap the send and the receive. */
743         rep.th.dest    = th->source;
744         rep.th.source  = th->dest;
745         rep.th.doff    = arg.iov[0].iov_len / 4;
746         rep.th.seq     = htonl(seq);
747         rep.th.ack_seq = htonl(ack);
748         rep.th.ack     = 1;
749         rep.th.window  = htons(win);
750
751 #ifdef CONFIG_TCP_MD5SIG
752         if (key) {
753                 int offset = (tsecr) ? 3 : 0;
754
755                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
756                                           (TCPOPT_NOP << 16) |
757                                           (TCPOPT_MD5SIG << 8) |
758                                           TCPOLEN_MD5SIG);
759                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
760                 rep.th.doff = arg.iov[0].iov_len/4;
761
762                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
763                                     key, ip_hdr(skb)->saddr,
764                                     ip_hdr(skb)->daddr, &rep.th);
765         }
766 #endif
767         arg.flags = reply_flags;
768         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
769                                       ip_hdr(skb)->saddr, /* XXX */
770                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
771         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
772         if (oif)
773                 arg.bound_dev_if = oif;
774         arg.tos = tos;
775         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
776                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
777                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
778                               &arg, arg.iov[0].iov_len);
779
780         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
781 }
782
783 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
784 {
785         struct inet_timewait_sock *tw = inet_twsk(sk);
786         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
787
788         tcp_v4_send_ack(sock_net(sk), skb,
789                         tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
790                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
791                         tcp_time_stamp + tcptw->tw_ts_offset,
792                         tcptw->tw_ts_recent,
793                         tw->tw_bound_dev_if,
794                         tcp_twsk_md5_key(tcptw),
795                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
796                         tw->tw_tos
797                         );
798
799         inet_twsk_put(tw);
800 }
801
802 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
803                                   struct request_sock *req)
804 {
805         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
806          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
807          */
808         u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
809                                              tcp_sk(sk)->snd_nxt;
810
811         tcp_v4_send_ack(sock_net(sk), skb, seq,
812                         tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd,
813                         tcp_time_stamp,
814                         req->ts_recent,
815                         0,
816                         tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
817                                           AF_INET),
818                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
819                         ip_hdr(skb)->tos);
820 }
821
822 /*
823  *      Send a SYN-ACK after having received a SYN.
824  *      This still operates on a request_sock only, not on a big
825  *      socket.
826  */
827 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
828                               struct flowi *fl,
829                               struct request_sock *req,
830                               struct tcp_fastopen_cookie *foc,
831                               bool attach_req)
832 {
833         const struct inet_request_sock *ireq = inet_rsk(req);
834         struct flowi4 fl4;
835         int err = -1;
836         struct sk_buff *skb;
837
838         /* First, grab a route. */
839         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
840                 return -1;
841
842         skb = tcp_make_synack(sk, dst, req, foc, attach_req);
843
844         if (skb) {
845                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
846
847                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
848                                             ireq->ir_rmt_addr,
849                                             ireq->opt);
850                 err = net_xmit_eval(err);
851         }
852
853         return err;
854 }
855
856 /*
857  *      IPv4 request_sock destructor.
858  */
859 static void tcp_v4_reqsk_destructor(struct request_sock *req)
860 {
861         kfree(inet_rsk(req)->opt);
862 }
863
864
865 #ifdef CONFIG_TCP_MD5SIG
866 /*
867  * RFC2385 MD5 checksumming requires a mapping of
868  * IP address->MD5 Key.
869  * We need to maintain these in the sk structure.
870  */
871
872 /* Find the Key structure for an address.  */
873 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
874                                          const union tcp_md5_addr *addr,
875                                          int family)
876 {
877         const struct tcp_sock *tp = tcp_sk(sk);
878         struct tcp_md5sig_key *key;
879         unsigned int size = sizeof(struct in_addr);
880         const struct tcp_md5sig_info *md5sig;
881
882         /* caller either holds rcu_read_lock() or socket lock */
883         md5sig = rcu_dereference_check(tp->md5sig_info,
884                                        sock_owned_by_user(sk) ||
885                                        lockdep_is_held((spinlock_t *)&sk->sk_lock.slock));
886         if (!md5sig)
887                 return NULL;
888 #if IS_ENABLED(CONFIG_IPV6)
889         if (family == AF_INET6)
890                 size = sizeof(struct in6_addr);
891 #endif
892         hlist_for_each_entry_rcu(key, &md5sig->head, node) {
893                 if (key->family != family)
894                         continue;
895                 if (!memcmp(&key->addr, addr, size))
896                         return key;
897         }
898         return NULL;
899 }
900 EXPORT_SYMBOL(tcp_md5_do_lookup);
901
902 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
903                                          const struct sock *addr_sk)
904 {
905         const union tcp_md5_addr *addr;
906
907         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
908         return tcp_md5_do_lookup(sk, addr, AF_INET);
909 }
910 EXPORT_SYMBOL(tcp_v4_md5_lookup);
911
912 /* This can be called on a newly created socket, from other files */
913 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
914                    int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
915 {
916         /* Add Key to the list */
917         struct tcp_md5sig_key *key;
918         struct tcp_sock *tp = tcp_sk(sk);
919         struct tcp_md5sig_info *md5sig;
920
921         key = tcp_md5_do_lookup(sk, addr, family);
922         if (key) {
923                 /* Pre-existing entry - just update that one. */
924                 memcpy(key->key, newkey, newkeylen);
925                 key->keylen = newkeylen;
926                 return 0;
927         }
928
929         md5sig = rcu_dereference_protected(tp->md5sig_info,
930                                            sock_owned_by_user(sk) ||
931                                            lockdep_is_held(&sk->sk_lock.slock));
932         if (!md5sig) {
933                 md5sig = kmalloc(sizeof(*md5sig), gfp);
934                 if (!md5sig)
935                         return -ENOMEM;
936
937                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
938                 INIT_HLIST_HEAD(&md5sig->head);
939                 rcu_assign_pointer(tp->md5sig_info, md5sig);
940         }
941
942         key = sock_kmalloc(sk, sizeof(*key), gfp);
943         if (!key)
944                 return -ENOMEM;
945         if (!tcp_alloc_md5sig_pool()) {
946                 sock_kfree_s(sk, key, sizeof(*key));
947                 return -ENOMEM;
948         }
949
950         memcpy(key->key, newkey, newkeylen);
951         key->keylen = newkeylen;
952         key->family = family;
953         memcpy(&key->addr, addr,
954                (family == AF_INET6) ? sizeof(struct in6_addr) :
955                                       sizeof(struct in_addr));
956         hlist_add_head_rcu(&key->node, &md5sig->head);
957         return 0;
958 }
959 EXPORT_SYMBOL(tcp_md5_do_add);
960
961 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
962 {
963         struct tcp_md5sig_key *key;
964
965         key = tcp_md5_do_lookup(sk, addr, family);
966         if (!key)
967                 return -ENOENT;
968         hlist_del_rcu(&key->node);
969         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
970         kfree_rcu(key, rcu);
971         return 0;
972 }
973 EXPORT_SYMBOL(tcp_md5_do_del);
974
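/* Remove and free every MD5 key attached to this socket. */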
975 static void tcp_clear_md5_list(struct sock *sk)
976 {
977         struct tcp_sock *tp = tcp_sk(sk);
978         struct tcp_md5sig_key *key;
979         struct hlist_node *n;
980         struct tcp_md5sig_info *md5sig;
981
982         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
983
984         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
985                 hlist_del_rcu(&key->node);
986                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
987                 kfree_rcu(key, rcu);
988         }
989 }
990
991 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
992                                  int optlen)
993 {
994         struct tcp_md5sig cmd;
995         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
996
997         if (optlen < sizeof(cmd))
998                 return -EINVAL;
999
1000         if (copy_from_user(&cmd, optval, sizeof(cmd)))
1001                 return -EFAULT;
1002
1003         if (sin->sin_family != AF_INET)
1004                 return -EINVAL;
1005
1006         if (!cmd.tcpm_keylen)
1007                 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1008                                       AF_INET);
1009
1010         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1011                 return -EINVAL;
1012
1013         return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1014                               AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1015                               GFP_KERNEL);
1016 }
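/* Userspace sketch (illustrative, not part of this file): the option parsed
 * above is normally driven with setsockopt(TCP_MD5SIG) on an IPv4 socket.
 * peer_addr/key/keylen below are placeholders; keylen must not exceed
 * TCP_MD5SIG_MAXKEYLEN.
 *
 *	struct tcp_md5sig md5 = { };
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	sin->sin_family = AF_INET;
 *	sin->sin_addr.s_addr = peer_addr;
 *	md5.tcpm_keylen = keylen;
 *	memcpy(md5.tcpm_key, key, keylen);
 *	if (setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5)) < 0)
 *		perror("setsockopt(TCP_MD5SIG)");
 */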
1017
1018 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1019                                         __be32 daddr, __be32 saddr, int nbytes)
1020 {
1021         struct tcp4_pseudohdr *bp;
1022         struct scatterlist sg;
1023
1024         bp = &hp->md5_blk.ip4;
1025
1026         /*
1027          * 1. the TCP pseudo-header (in the order: source IP address,
1028          * destination IP address, zero-padded protocol number, and
1029          * segment length)
1030          */
1031         bp->saddr = saddr;
1032         bp->daddr = daddr;
1033         bp->pad = 0;
1034         bp->protocol = IPPROTO_TCP;
1035         bp->len = cpu_to_be16(nbytes);
1036
1037         sg_init_one(&sg, bp, sizeof(*bp));
1038         return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1039 }
1040
1041 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1042                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1043 {
1044         struct tcp_md5sig_pool *hp;
1045         struct hash_desc *desc;
1046
1047         hp = tcp_get_md5sig_pool();
1048         if (!hp)
1049                 goto clear_hash_noput;
1050         desc = &hp->md5_desc;
1051
1052         if (crypto_hash_init(desc))
1053                 goto clear_hash;
1054         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1055                 goto clear_hash;
1056         if (tcp_md5_hash_header(hp, th))
1057                 goto clear_hash;
1058         if (tcp_md5_hash_key(hp, key))
1059                 goto clear_hash;
1060         if (crypto_hash_final(desc, md5_hash))
1061                 goto clear_hash;
1062
1063         tcp_put_md5sig_pool();
1064         return 0;
1065
1066 clear_hash:
1067         tcp_put_md5sig_pool();
1068 clear_hash_noput:
1069         memset(md5_hash, 0, 16);
1070         return 1;
1071 }
1072
1073 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1074                         const struct sock *sk,
1075                         const struct sk_buff *skb)
1076 {
1077         struct tcp_md5sig_pool *hp;
1078         struct hash_desc *desc;
1079         const struct tcphdr *th = tcp_hdr(skb);
1080         __be32 saddr, daddr;
1081
1082         if (sk) { /* valid for establish/request sockets */
1083                 saddr = sk->sk_rcv_saddr;
1084                 daddr = sk->sk_daddr;
1085         } else {
1086                 const struct iphdr *iph = ip_hdr(skb);
1087                 saddr = iph->saddr;
1088                 daddr = iph->daddr;
1089         }
1090
1091         hp = tcp_get_md5sig_pool();
1092         if (!hp)
1093                 goto clear_hash_noput;
1094         desc = &hp->md5_desc;
1095
1096         if (crypto_hash_init(desc))
1097                 goto clear_hash;
1098
1099         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1100                 goto clear_hash;
1101         if (tcp_md5_hash_header(hp, th))
1102                 goto clear_hash;
1103         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1104                 goto clear_hash;
1105         if (tcp_md5_hash_key(hp, key))
1106                 goto clear_hash;
1107         if (crypto_hash_final(desc, md5_hash))
1108                 goto clear_hash;
1109
1110         tcp_put_md5sig_pool();
1111         return 0;
1112
1113 clear_hash:
1114         tcp_put_md5sig_pool();
1115 clear_hash_noput:
1116         memset(md5_hash, 0, 16);
1117         return 1;
1118 }
1119 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1120
1121 #endif
1122
1123 /* Called with rcu_read_lock() */
1124 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1125                                     const struct sk_buff *skb)
1126 {
1127 #ifdef CONFIG_TCP_MD5SIG
1128         /*
1129          * This gets called for each TCP segment that arrives
1130          * so we want to be efficient.
1131          * We have 3 drop cases:
1132          * o No MD5 hash and one expected.
1133          * o MD5 hash and we're not expecting one.
1134          * o MD5 hash and it's wrong.
1135          */
1136         const __u8 *hash_location = NULL;
1137         struct tcp_md5sig_key *hash_expected;
1138         const struct iphdr *iph = ip_hdr(skb);
1139         const struct tcphdr *th = tcp_hdr(skb);
1140         int genhash;
1141         unsigned char newhash[16];
1142
1143         hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1144                                           AF_INET);
1145         hash_location = tcp_parse_md5sig_option(th);
1146
1147         /* We've parsed the options - do we have a hash? */
1148         if (!hash_expected && !hash_location)
1149                 return false;
1150
1151         if (hash_expected && !hash_location) {
1152                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1153                 return true;
1154         }
1155
1156         if (!hash_expected && hash_location) {
1157                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1158                 return true;
1159         }
1160
1161         /* Okay, so we have both hash_expected and hash_location -
1162          * we need to calculate the checksum.
1163          */
1164         genhash = tcp_v4_md5_hash_skb(newhash,
1165                                       hash_expected,
1166                                       NULL, skb);
1167
1168         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1169                 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1170                                      &iph->saddr, ntohs(th->source),
1171                                      &iph->daddr, ntohs(th->dest),
1172                                      genhash ? " tcp_v4_calc_md5_hash failed"
1173                                      : "");
1174                 return true;
1175         }
1176         return false;
1177 #endif
1178         return false;
1179 }
1180
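/* Initialize the IPv4-specific fields of a request sock from the incoming SYN:
 * addresses, the transparent-proxy flag and the IP options saved for the reply.
 */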
1181 static void tcp_v4_init_req(struct request_sock *req,
1182                             const struct sock *sk_listener,
1183                             struct sk_buff *skb)
1184 {
1185         struct inet_request_sock *ireq = inet_rsk(req);
1186
1187         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1188         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1189         ireq->no_srccheck = inet_sk(sk_listener)->transparent;
1190         ireq->opt = tcp_v4_save_options(skb);
1191 }
1192
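/* Route the SYN-ACK for this request. When "strict" is requested, report
 * whether the chosen route's destination still matches the peer address.
 */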
1193 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1194                                           struct flowi *fl,
1195                                           const struct request_sock *req,
1196                                           bool *strict)
1197 {
1198         struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);
1199
1200         if (strict) {
1201                 if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
1202                         *strict = true;
1203                 else
1204                         *strict = false;
1205         }
1206
1207         return dst;
1208 }
1209
1210 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1211         .family         =       PF_INET,
1212         .obj_size       =       sizeof(struct tcp_request_sock),
1213         .rtx_syn_ack    =       tcp_rtx_synack,
1214         .send_ack       =       tcp_v4_reqsk_send_ack,
1215         .destructor     =       tcp_v4_reqsk_destructor,
1216         .send_reset     =       tcp_v4_send_reset,
1217         .syn_ack_timeout =      tcp_syn_ack_timeout,
1218 };
1219
1220 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1221         .mss_clamp      =       TCP_MSS_DEFAULT,
1222 #ifdef CONFIG_TCP_MD5SIG
1223         .req_md5_lookup =       tcp_v4_md5_lookup,
1224         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1225 #endif
1226         .init_req       =       tcp_v4_init_req,
1227 #ifdef CONFIG_SYN_COOKIES
1228         .cookie_init_seq =      cookie_v4_init_sequence,
1229 #endif
1230         .route_req      =       tcp_v4_route_req,
1231         .init_seq       =       tcp_v4_init_sequence,
1232         .send_synack    =       tcp_v4_send_synack,
1233 };
1234
1235 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1236 {
1237         /* Never answer SYNs sent to broadcast or multicast */
1238         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1239                 goto drop;
1240
1241         return tcp_conn_request(&tcp_request_sock_ops,
1242                                 &tcp_request_sock_ipv4_ops, sk, skb);
1243
1244 drop:
1245         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1246         return 0;
1247 }
1248 EXPORT_SYMBOL(tcp_v4_conn_request);
1249
1250
1251 /*
1252  * The three way handshake has completed - we got a valid ACK -
1253  * now create the new socket.
1254  */
1255 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1256                                   struct request_sock *req,
1257                                   struct dst_entry *dst,
1258                                   struct request_sock *req_unhash,
1259                                   bool *own_req)
1260 {
1261         struct inet_request_sock *ireq;
1262         struct inet_sock *newinet;
1263         struct tcp_sock *newtp;
1264         struct sock *newsk;
1265 #ifdef CONFIG_TCP_MD5SIG
1266         struct tcp_md5sig_key *key;
1267 #endif
1268         struct ip_options_rcu *inet_opt;
1269
1270         if (sk_acceptq_is_full(sk))
1271                 goto exit_overflow;
1272
1273         newsk = tcp_create_openreq_child(sk, req, skb);
1274         if (!newsk)
1275                 goto exit_nonewsk;
1276
1277         newsk->sk_gso_type = SKB_GSO_TCPV4;
1278         inet_sk_rx_dst_set(newsk, skb);
1279
1280         newtp                 = tcp_sk(newsk);
1281         newinet               = inet_sk(newsk);
1282         ireq                  = inet_rsk(req);
1283         sk_daddr_set(newsk, ireq->ir_rmt_addr);
1284         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1285         newsk->sk_bound_dev_if = ireq->ir_iif;
1286         newinet->inet_saddr           = ireq->ir_loc_addr;
1287         inet_opt              = ireq->opt;
1288         rcu_assign_pointer(newinet->inet_opt, inet_opt);
1289         ireq->opt             = NULL;
1290         newinet->mc_index     = inet_iif(skb);
1291         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1292         newinet->rcv_tos      = ip_hdr(skb)->tos;
1293         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1294         if (inet_opt)
1295                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1296         newinet->inet_id = newtp->write_seq ^ jiffies;
1297
1298         if (!dst) {
1299                 dst = inet_csk_route_child_sock(sk, newsk, req);
1300                 if (!dst)
1301                         goto put_and_exit;
1302         } else {
1303                 /* syncookie case : see end of cookie_v4_check() */
1304         }
1305         sk_setup_caps(newsk, dst);
1306
1307         tcp_ca_openreq_child(newsk, dst);
1308
1309         tcp_sync_mss(newsk, dst_mtu(dst));
1310         newtp->advmss = dst_metric_advmss(dst);
1311         if (tcp_sk(sk)->rx_opt.user_mss &&
1312             tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1313                 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1314
1315         tcp_initialize_rcv_mss(newsk);
1316
1317 #ifdef CONFIG_TCP_MD5SIG
1318         /* Copy over the MD5 key from the original socket */
1319         key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1320                                 AF_INET);
1321         if (key) {
1322                 /*
1323                  * We're using one, so create a matching key
1324                  * on the newsk structure. If we fail to get
1325                  * memory, then we end up not copying the key
1326                  * across. Shucks.
1327                  */
1328                 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1329                                AF_INET, key->key, key->keylen, GFP_ATOMIC);
1330                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1331         }
1332 #endif
1333
1334         if (__inet_inherit_port(sk, newsk) < 0)
1335                 goto put_and_exit;
1336         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1337         if (*own_req)
1338                 tcp_move_syn(newtp, req);
1339
1340         return newsk;
1341
1342 exit_overflow:
1343         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1344 exit_nonewsk:
1345         dst_release(dst);
1346 exit:
1347         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1348         return NULL;
1349 put_and_exit:
1350         inet_csk_prepare_forced_close(newsk);
1351         tcp_done(newsk);
1352         goto exit;
1353 }
1354 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1355
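/* With CONFIG_SYN_COOKIES, a non-SYN segment reaching a listener may be the
 * ACK of a syncookie; cookie_v4_check() validates it and, if it is good,
 * creates the child socket.
 */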
1356 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1357 {
1358 #ifdef CONFIG_SYN_COOKIES
1359         const struct tcphdr *th = tcp_hdr(skb);
1360
1361         if (!th->syn)
1362                 sk = cookie_v4_check(sk, skb);
1363 #endif
1364         return sk;
1365 }
1366
1367 /* The socket must have its spinlock held when we get
1368  * here, unless it is a TCP_LISTEN socket.
1369  *
1370  * We have a potential double-lock case here, so even when
1371  * doing backlog processing we use the BH locking scheme.
1372  * This is because we cannot sleep with the original spinlock
1373  * held.
1374  */
1375 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1376 {
1377         struct sock *rsk;
1378
1379         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1380                 struct dst_entry *dst = sk->sk_rx_dst;
1381
1382                 sock_rps_save_rxhash(sk, skb);
1383                 sk_mark_napi_id(sk, skb);
1384                 if (dst) {
1385                         if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1386                             !dst->ops->check(dst, 0)) {
1387                                 dst_release(dst);
1388                                 sk->sk_rx_dst = NULL;
1389                         }
1390                 }
1391                 tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
1392                 return 0;
1393         }
1394
1395         if (tcp_checksum_complete(skb))
1396                 goto csum_err;
1397
1398         if (sk->sk_state == TCP_LISTEN) {
1399                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1400
1401                 if (!nsk)
1402                         goto discard;
1403                 if (nsk != sk) {
1404                         sock_rps_save_rxhash(nsk, skb);
1405                         sk_mark_napi_id(nsk, skb);
1406                         if (tcp_child_process(sk, nsk, skb)) {
1407                                 rsk = nsk;
1408                                 goto reset;
1409                         }
1410                         return 0;
1411                 }
1412         } else
1413                 sock_rps_save_rxhash(sk, skb);
1414
1415         if (tcp_rcv_state_process(sk, skb)) {
1416                 rsk = sk;
1417                 goto reset;
1418         }
1419         return 0;
1420
1421 reset:
1422         tcp_v4_send_reset(rsk, skb);
1423 discard:
1424         kfree_skb(skb);
1425         /* Be careful here. If this function gets more complicated and
1426          * gcc suffers from register pressure on the x86, sk (in %ebx)
1427          * might be destroyed here. This current version compiles correctly,
1428          * but you have been warned.
1429          */
1430         return 0;
1431
1432 csum_err:
1433         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
1434         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1435         goto discard;
1436 }
1437 EXPORT_SYMBOL(tcp_v4_do_rcv);
1438
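     /* Called from the IP early-demux hook: look up the established socket
      * before routing so the dst cached in sk->sk_rx_dst can be attached to
      * the skb (noref) and the per-packet route lookup skipped.
      */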
1439 void tcp_v4_early_demux(struct sk_buff *skb)
1440 {
1441         const struct iphdr *iph;
1442         const struct tcphdr *th;
1443         struct sock *sk;
1444
1445         if (skb->pkt_type != PACKET_HOST)
1446                 return;
1447
1448         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1449                 return;
1450
1451         iph = ip_hdr(skb);
1452         th = tcp_hdr(skb);
1453
1454         if (th->doff < sizeof(struct tcphdr) / 4)
1455                 return;
1456
1457         sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1458                                        iph->saddr, th->source,
1459                                        iph->daddr, ntohs(th->dest),
1460                                        skb->skb_iif);
1461         if (sk) {
1462                 skb->sk = sk;
1463                 skb->destructor = sock_edemux;
1464                 if (sk_fullsock(sk)) {
1465                         struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1466
1467                         if (dst)
1468                                 dst = dst_check(dst, 0);
1469                         if (dst &&
1470                             inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1471                                 skb_dst_set_noref(skb, dst);
1472                 }
1473         }
1474 }
1475
1476 /* Packet is added to VJ-style prequeue for processing in process
1477  * context, if a reader task is waiting. Apparently, this exciting
1478  * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
1479  * failed somewhere. Latency? Burstiness? Well, at least now we will
1480  * see why it failed. 8)8)                               --ANK
1481  *
1482  */
1483 bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1484 {
1485         struct tcp_sock *tp = tcp_sk(sk);
1486
1487         if (sysctl_tcp_low_latency || !tp->ucopy.task)
1488                 return false;
1489
1490         if (skb->len <= tcp_hdrlen(skb) &&
1491             skb_queue_len(&tp->ucopy.prequeue) == 0)
1492                 return false;
1493
1494         /* Before escaping RCU protected region, we need to take care of skb
1495          * dst. Prequeue is only enabled for established sockets.
1496          * For such sockets, we might need the skb dst only to set sk->sk_rx_dst.
1497          * Instead of doing a full sk_rx_dst validity check here, let's perform
1498          * an optimistic one.
1499          */
1500         if (likely(sk->sk_rx_dst))
1501                 skb_dst_drop(skb);
1502         else
1503                 skb_dst_force_safe(skb);
1504
1505         __skb_queue_tail(&tp->ucopy.prequeue, skb);
1506         tp->ucopy.memory += skb->truesize;
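             /* If the prequeue outgrew the receive buffer, process it right
              * here in softirq context via the backlog handler instead of
              * waiting for the reader, and account each such segment.
              */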
1507         if (tp->ucopy.memory > sk->sk_rcvbuf) {
1508                 struct sk_buff *skb1;
1509
1510                 BUG_ON(sock_owned_by_user(sk));
1511
1512                 while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
1513                         sk_backlog_rcv(sk, skb1);
1514                         NET_INC_STATS_BH(sock_net(sk),
1515                                          LINUX_MIB_TCPPREQUEUEDROPPED);
1516                 }
1517
1518                 tp->ucopy.memory = 0;
1519         } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
1520                 wake_up_interruptible_sync_poll(sk_sleep(sk),
1521                                            POLLIN | POLLRDNORM | POLLRDBAND);
1522                 if (!inet_csk_ack_scheduled(sk))
1523                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1524                                                   (3 * tcp_rto_min(sk)) / 4,
1525                                                   TCP_RTO_MAX);
1526         }
1527         return true;
1528 }
1529 EXPORT_SYMBOL(tcp_prequeue);
1530
1531 /*
1532  *      From tcp_input.c
1533  */
1534
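     /* Main IPv4 receive entry point: validate the header, set up the
      * checksum, stash TCP state into TCP_SKB_CB(), look up the owning
      * socket and dispatch (request sock, TIME_WAIT or full socket).
      */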
1535 int tcp_v4_rcv(struct sk_buff *skb)
1536 {
1537         const struct iphdr *iph;
1538         const struct tcphdr *th;
1539         struct sock *sk;
1540         int ret;
1541         struct net *net = dev_net(skb->dev);
1542
1543         if (skb->pkt_type != PACKET_HOST)
1544                 goto discard_it;
1545
1546         /* Count it even if it's bad */
1547         TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1548
1549         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1550                 goto discard_it;
1551
1552         th = tcp_hdr(skb);
1553
1554         if (th->doff < sizeof(struct tcphdr) / 4)
1555                 goto bad_packet;
1556         if (!pskb_may_pull(skb, th->doff * 4))
1557                 goto discard_it;
1558
1559         /* An explanation is required here, I think.
1560          * Packet length and doff are validated by header prediction,
1561          * provided the case of th->doff == 0 is eliminated.
1562          * So, we defer the checks. */
1563
1564         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1565                 goto csum_error;
1566
1567         th = tcp_hdr(skb);
1568         iph = ip_hdr(skb);
1569         /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB();
1570          * barrier() makes sure the compiler won't play fool^Waliasing games.
1571          */
1572         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1573                 sizeof(struct inet_skb_parm));
1574         barrier();
1575
1576         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1577         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1578                                     skb->len - th->doff * 4);
1579         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1580         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1581         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1582         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1583         TCP_SKB_CB(skb)->sacked  = 0;
1584
1585 lookup:
1586         sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1587         if (!sk)
1588                 goto no_tcp_socket;
1589
1590 process:
1591         if (sk->sk_state == TCP_TIME_WAIT)
1592                 goto do_time_wait;
1593
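             /* Request sockets are hashed in the ehash table themselves.
              * Recover the listener from req->rsk_listener, check the MD5
              * signature against it, and let tcp_check_req() complete the
              * handshake while the listener is still in TCP_LISTEN;
              * otherwise drop the request and redo the lookup.
              */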
1594         if (sk->sk_state == TCP_NEW_SYN_RECV) {
1595                 struct request_sock *req = inet_reqsk(sk);
1596                 struct sock *nsk = NULL;
1597
1598                 sk = req->rsk_listener;
1599                 if (tcp_v4_inbound_md5_hash(sk, skb))
1600                         goto discard_and_relse;
1601                 if (likely(sk->sk_state == TCP_LISTEN)) {
1602                         nsk = tcp_check_req(sk, skb, req, false);
1603                 } else {
1604                         inet_csk_reqsk_queue_drop_and_put(sk, req);
1605                         goto lookup;
1606                 }
1607                 if (!nsk) {
1608                         reqsk_put(req);
1609                         goto discard_it;
1610                 }
1611                 if (nsk == sk) {
1612                         sock_hold(sk);
1613                         reqsk_put(req);
1614                 } else if (tcp_child_process(sk, nsk, skb)) {
1615                         tcp_v4_send_reset(nsk, skb);
1616                         goto discard_it;
1617                 } else {
1618                         return 0;
1619                 }
1620         }
1621         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1622                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1623                 goto discard_and_relse;
1624         }
1625
1626         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1627                 goto discard_and_relse;
1628
1629         if (tcp_v4_inbound_md5_hash(sk, skb))
1630                 goto discard_and_relse;
1631
1632         nf_reset(skb);
1633
1634         if (sk_filter(sk, skb))
1635                 goto discard_and_relse;
1636
1637         skb->dev = NULL;
1638
1639         if (sk->sk_state == TCP_LISTEN) {
1640                 ret = tcp_v4_do_rcv(sk, skb);
1641                 goto put_and_return;
1642         }
1643
1644         sk_incoming_cpu_update(sk);
1645
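             /* With the socket lock held in BH context: process the segment
              * directly (possibly via the prequeue) when the owner is not
              * holding the lock, otherwise append it to the backlog, which
              * is bounded by sk_rcvbuf + sk_sndbuf.
              */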
1646         bh_lock_sock_nested(sk);
1647         tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
1648         ret = 0;
1649         if (!sock_owned_by_user(sk)) {
1650                 if (!tcp_prequeue(sk, skb))
1651                         ret = tcp_v4_do_rcv(sk, skb);
1652         } else if (unlikely(sk_add_backlog(sk, skb,
1653                                            sk->sk_rcvbuf + sk->sk_sndbuf))) {
1654                 bh_unlock_sock(sk);
1655                 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1656                 goto discard_and_relse;
1657         }
1658         bh_unlock_sock(sk);
1659
1660 put_and_return:
1661         sock_put(sk);
1662
1663         return ret;
1664
1665 no_tcp_socket:
1666         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1667                 goto discard_it;
1668
1669         if (tcp_checksum_complete(skb)) {
1670 csum_error:
1671                 TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
1672 bad_packet:
1673                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1674         } else {
1675                 tcp_v4_send_reset(NULL, skb);
1676         }
1677
1678 discard_it:
1679         /* Discard frame. */
1680         kfree_skb(skb);
1681         return 0;
1682
1683 discard_and_relse:
1684         sock_put(sk);
1685         goto discard_it;
1686
1687 do_time_wait:
1688         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1689                 inet_twsk_put(inet_twsk(sk));
1690                 goto discard_it;
1691         }
1692
1693         if (tcp_checksum_complete(skb)) {
1694                 inet_twsk_put(inet_twsk(sk));
1695                 goto csum_error;
1696         }
1697         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1698         case TCP_TW_SYN: {
1699                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1700                                                         &tcp_hashinfo,
1701                                                         iph->saddr, th->source,
1702                                                         iph->daddr, th->dest,
1703                                                         inet_iif(skb));
1704                 if (sk2) {
1705                         inet_twsk_deschedule_put(inet_twsk(sk));
1706                         sk = sk2;
1707                         goto process;
1708                 }
1709                 /* Fall through to ACK */
1710         }
1711         case TCP_TW_ACK:
1712                 tcp_v4_timewait_ack(sk, skb);
1713                 break;
1714         case TCP_TW_RST:
1715                 tcp_v4_send_reset(sk, skb);
1716                 inet_twsk_deschedule_put(inet_twsk(sk));
1717                 goto discard_it;
1718         case TCP_TW_SUCCESS:;
1719         }
1720         goto discard_it;
1721 }
1722
1723 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1724         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1725         .twsk_unique    = tcp_twsk_unique,
1726         .twsk_destructor= tcp_twsk_destructor,
1727 };
1728
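     /* Cache the validated input route on the socket so the established
      * fast path and early demux can skip the per-packet route lookup.
      */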
1729 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1730 {
1731         struct dst_entry *dst = skb_dst(skb);
1732
1733         if (dst && dst_hold_safe(dst)) {
1734                 sk->sk_rx_dst = dst;
1735                 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1736         }
1737 }
1738 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1739
1740 const struct inet_connection_sock_af_ops ipv4_specific = {
1741         .queue_xmit        = ip_queue_xmit,
1742         .send_check        = tcp_v4_send_check,
1743         .rebuild_header    = inet_sk_rebuild_header,
1744         .sk_rx_dst_set     = inet_sk_rx_dst_set,
1745         .conn_request      = tcp_v4_conn_request,
1746         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1747         .net_header_len    = sizeof(struct iphdr),
1748         .setsockopt        = ip_setsockopt,
1749         .getsockopt        = ip_getsockopt,
1750         .addr2sockaddr     = inet_csk_addr2sockaddr,
1751         .sockaddr_len      = sizeof(struct sockaddr_in),
1752         .bind_conflict     = inet_csk_bind_conflict,
1753 #ifdef CONFIG_COMPAT
1754         .compat_setsockopt = compat_ip_setsockopt,
1755         .compat_getsockopt = compat_ip_getsockopt,
1756 #endif
1757         .mtu_reduced       = tcp_v4_mtu_reduced,
1758 };
1759 EXPORT_SYMBOL(ipv4_specific);
1760
1761 #ifdef CONFIG_TCP_MD5SIG
1762 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1763         .md5_lookup             = tcp_v4_md5_lookup,
1764         .calc_md5_hash          = tcp_v4_md5_hash_skb,
1765         .md5_parse              = tcp_v4_parse_md5_keys,
1766 };
1767 #endif
1768
1769 /* NOTE: A lot of fields are set to zero explicitly by the call to
1770  *       sk_alloc(), so they need not be initialized here.
1771  */
1772 static int tcp_v4_init_sock(struct sock *sk)
1773 {
1774         struct inet_connection_sock *icsk = inet_csk(sk);
1775
1776         tcp_init_sock(sk);
1777
1778         icsk->icsk_af_ops = &ipv4_specific;
1779
1780 #ifdef CONFIG_TCP_MD5SIG
1781         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1782 #endif
1783
1784         return 0;
1785 }
1786
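     /* Release everything the socket accumulated: timers, congestion control
      * state, queued skbs, MD5 keys, the bound port and any saved SYN or
      * Fast Open request.
      */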
1787 void tcp_v4_destroy_sock(struct sock *sk)
1788 {
1789         struct tcp_sock *tp = tcp_sk(sk);
1790
1791         tcp_clear_xmit_timers(sk);
1792
1793         tcp_cleanup_congestion_control(sk);
1794
1795         /* Clean up the write buffer. */
1796         tcp_write_queue_purge(sk);
1797
1798         /* Cleans up our, hopefully empty, out_of_order_queue. */
1799         __skb_queue_purge(&tp->out_of_order_queue);
1800
1801 #ifdef CONFIG_TCP_MD5SIG
1802         /* Clean up the MD5 key list, if any */
1803         if (tp->md5sig_info) {
1804                 tcp_clear_md5_list(sk);
1805                 kfree_rcu(tp->md5sig_info, rcu);
1806                 tp->md5sig_info = NULL;
1807         }
1808 #endif
1809
1810         /* Clean up the prequeue; it really should be empty. */
1811         __skb_queue_purge(&tp->ucopy.prequeue);
1812
1813         /* Clean up a referenced TCP bind bucket. */
1814         if (inet_csk(sk)->icsk_bind_hash)
1815                 inet_put_port(sk);
1816
1817         BUG_ON(tp->fastopen_rsk);
1818
1819         /* If the socket was aborted during the connect operation */
1820         tcp_free_fastopen_req(tp);
1821         tcp_saved_syn_free(tp);
1822
1823         sk_sockets_allocated_dec(sk);
1824
1825         if (mem_cgroup_sockets_enabled && sk->sk_memcg)
1826                 sock_release_memcg(sk);
1827 }
1828 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1829
1830 #ifdef CONFIG_PROC_FS
1831 /* Proc filesystem TCP sock list dumping. */
1832
1833 /*
1834  * Get the next listener socket after cur.  If cur is NULL, get the first
1835  * socket starting from the bucket given in st->bucket; when st->bucket is
1836  * zero, the very first socket in the hash table is returned.
1837  */
1838 static void *listening_get_next(struct seq_file *seq, void *cur)
1839 {
1840         struct inet_connection_sock *icsk;
1841         struct hlist_nulls_node *node;
1842         struct sock *sk = cur;
1843         struct inet_listen_hashbucket *ilb;
1844         struct tcp_iter_state *st = seq->private;
1845         struct net *net = seq_file_net(seq);
1846
1847         if (!sk) {
1848                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1849                 spin_lock_bh(&ilb->lock);
1850                 sk = sk_nulls_head(&ilb->head);
1851                 st->offset = 0;
1852                 goto get_sk;
1853         }
1854         ilb = &tcp_hashinfo.listening_hash[st->bucket];
1855         ++st->num;
1856         ++st->offset;
1857
1858         sk = sk_nulls_next(sk);
1859 get_sk:
1860         sk_nulls_for_each_from(sk, node) {
1861                 if (!net_eq(sock_net(sk), net))
1862                         continue;
1863                 if (sk->sk_family == st->family) {
1864                         cur = sk;
1865                         goto out;
1866                 }
1867                 icsk = inet_csk(sk);
1868         }
1869         spin_unlock_bh(&ilb->lock);
1870         st->offset = 0;
1871         if (++st->bucket < INET_LHTABLE_SIZE) {
1872                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1873                 spin_lock_bh(&ilb->lock);
1874                 sk = sk_nulls_head(&ilb->head);
1875                 goto get_sk;
1876         }
1877         cur = NULL;
1878 out:
1879         return cur;
1880 }
1881
1882 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1883 {
1884         struct tcp_iter_state *st = seq->private;
1885         void *rc;
1886
1887         st->bucket = 0;
1888         st->offset = 0;
1889         rc = listening_get_next(seq, NULL);
1890
1891         while (rc && *pos) {
1892                 rc = listening_get_next(seq, rc);
1893                 --*pos;
1894         }
1895         return rc;
1896 }
1897
1898 static inline bool empty_bucket(const struct tcp_iter_state *st)
1899 {
1900         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1901 }
1902
1903 /*
1904  * Get the first established socket, starting from the bucket given in st->bucket.
1905  * If st->bucket is zero, the very first socket in the hash is returned.
1906  */
1907 static void *established_get_first(struct seq_file *seq)
1908 {
1909         struct tcp_iter_state *st = seq->private;
1910         struct net *net = seq_file_net(seq);
1911         void *rc = NULL;
1912
1913         st->offset = 0;
1914         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1915                 struct sock *sk;
1916                 struct hlist_nulls_node *node;
1917                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1918
1919                 /* Lockless fast path for the common case of empty buckets */
1920                 if (empty_bucket(st))
1921                         continue;
1922
1923                 spin_lock_bh(lock);
1924                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1925                         if (sk->sk_family != st->family ||
1926                             !net_eq(sock_net(sk), net)) {
1927                                 continue;
1928                         }
1929                         rc = sk;
1930                         goto out;
1931                 }
1932                 spin_unlock_bh(lock);
1933         }
1934 out:
1935         return rc;
1936 }
1937
1938 static void *established_get_next(struct seq_file *seq, void *cur)
1939 {
1940         struct sock *sk = cur;
1941         struct hlist_nulls_node *node;
1942         struct tcp_iter_state *st = seq->private;
1943         struct net *net = seq_file_net(seq);
1944
1945         ++st->num;
1946         ++st->offset;
1947
1948         sk = sk_nulls_next(sk);
1949
1950         sk_nulls_for_each_from(sk, node) {
1951                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
1952                         return sk;
1953         }
1954
1955         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1956         ++st->bucket;
1957         return established_get_first(seq);
1958 }
1959
1960 static void *established_get_idx(struct seq_file *seq, loff_t pos)
1961 {
1962         struct tcp_iter_state *st = seq->private;
1963         void *rc;
1964
1965         st->bucket = 0;
1966         rc = established_get_first(seq);
1967
1968         while (rc && pos) {
1969                 rc = established_get_next(seq, rc);
1970                 --pos;
1971         }
1972         return rc;
1973 }
1974
1975 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
1976 {
1977         void *rc;
1978         struct tcp_iter_state *st = seq->private;
1979
1980         st->state = TCP_SEQ_STATE_LISTENING;
1981         rc        = listening_get_idx(seq, &pos);
1982
1983         if (!rc) {
1984                 st->state = TCP_SEQ_STATE_ESTABLISHED;
1985                 rc        = established_get_idx(seq, pos);
1986         }
1987
1988         return rc;
1989 }
1990
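     /* Resume the dump at the bucket/offset remembered from the previous
      * read instead of rescanning the hash tables from the beginning.
      */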
1991 static void *tcp_seek_last_pos(struct seq_file *seq)
1992 {
1993         struct tcp_iter_state *st = seq->private;
1994         int offset = st->offset;
1995         int orig_num = st->num;
1996         void *rc = NULL;
1997
1998         switch (st->state) {
1999         case TCP_SEQ_STATE_LISTENING:
2000                 if (st->bucket >= INET_LHTABLE_SIZE)
2001                         break;
2002                 st->state = TCP_SEQ_STATE_LISTENING;
2003                 rc = listening_get_next(seq, NULL);
2004                 while (offset-- && rc)
2005                         rc = listening_get_next(seq, rc);
2006                 if (rc)
2007                         break;
2008                 st->bucket = 0;
2009                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2010                 /* Fallthrough */
2011         case TCP_SEQ_STATE_ESTABLISHED:
2012                 if (st->bucket > tcp_hashinfo.ehash_mask)
2013                         break;
2014                 rc = established_get_first(seq);
2015                 while (offset-- && rc)
2016                         rc = established_get_next(seq, rc);
2017         }
2018
2019         st->num = orig_num;
2020
2021         return rc;
2022 }
2023
2024 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2025 {
2026         struct tcp_iter_state *st = seq->private;
2027         void *rc;
2028
2029         if (*pos && *pos == st->last_pos) {
2030                 rc = tcp_seek_last_pos(seq);
2031                 if (rc)
2032                         goto out;
2033         }
2034
2035         st->state = TCP_SEQ_STATE_LISTENING;
2036         st->num = 0;
2037         st->bucket = 0;
2038         st->offset = 0;
2039         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2040
2041 out:
2042         st->last_pos = *pos;
2043         return rc;
2044 }
2045
2046 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2047 {
2048         struct tcp_iter_state *st = seq->private;
2049         void *rc = NULL;
2050
2051         if (v == SEQ_START_TOKEN) {
2052                 rc = tcp_get_idx(seq, 0);
2053                 goto out;
2054         }
2055
2056         switch (st->state) {
2057         case TCP_SEQ_STATE_LISTENING:
2058                 rc = listening_get_next(seq, v);
2059                 if (!rc) {
2060                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2061                         st->bucket = 0;
2062                         st->offset = 0;
2063                         rc        = established_get_first(seq);
2064                 }
2065                 break;
2066         case TCP_SEQ_STATE_ESTABLISHED:
2067                 rc = established_get_next(seq, v);
2068                 break;
2069         }
2070 out:
2071         ++*pos;
2072         st->last_pos = *pos;
2073         return rc;
2074 }
2075
2076 static void tcp_seq_stop(struct seq_file *seq, void *v)
2077 {
2078         struct tcp_iter_state *st = seq->private;
2079
2080         switch (st->state) {
2081         case TCP_SEQ_STATE_LISTENING:
2082                 if (v != SEQ_START_TOKEN)
2083                         spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2084                 break;
2085         case TCP_SEQ_STATE_ESTABLISHED:
2086                 if (v)
2087                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2088                 break;
2089         }
2090 }
2091
2092 int tcp_seq_open(struct inode *inode, struct file *file)
2093 {
2094         struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2095         struct tcp_iter_state *s;
2096         int err;
2097
2098         err = seq_open_net(inode, file, &afinfo->seq_ops,
2099                           sizeof(struct tcp_iter_state));
2100         if (err < 0)
2101                 return err;
2102
2103         s = ((struct seq_file *)file->private_data)->private;
2104         s->family               = afinfo->family;
2105         s->last_pos             = 0;
2106         return 0;
2107 }
2108 EXPORT_SYMBOL(tcp_seq_open);
2109
2110 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2111 {
2112         int rc = 0;
2113         struct proc_dir_entry *p;
2114
2115         afinfo->seq_ops.start           = tcp_seq_start;
2116         afinfo->seq_ops.next            = tcp_seq_next;
2117         afinfo->seq_ops.stop            = tcp_seq_stop;
2118
2119         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2120                              afinfo->seq_fops, afinfo);
2121         if (!p)
2122                 rc = -ENOMEM;
2123         return rc;
2124 }
2125 EXPORT_SYMBOL(tcp_proc_register);
2126
2127 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2128 {
2129         remove_proc_entry(afinfo->name, net->proc_net);
2130 }
2131 EXPORT_SYMBOL(tcp_proc_unregister);
2132
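     /* Format a pending request sock (shown as SYN_RECV) as one
      * /proc/net/tcp line.
      */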
2133 static void get_openreq4(const struct request_sock *req,
2134                          struct seq_file *f, int i)
2135 {
2136         const struct inet_request_sock *ireq = inet_rsk(req);
2137         long delta = req->rsk_timer.expires - jiffies;
2138
2139         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2140                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2141                 i,
2142                 ireq->ir_loc_addr,
2143                 ireq->ir_num,
2144                 ireq->ir_rmt_addr,
2145                 ntohs(ireq->ir_rmt_port),
2146                 TCP_SYN_RECV,
2147                 0, 0, /* could print option size, but that is af dependent. */
2148                 1,    /* timers active (only the expire timer) */
2149                 jiffies_delta_to_clock_t(delta),
2150                 req->num_timeout,
2151                 from_kuid_munged(seq_user_ns(f),
2152                                  sock_i_uid(req->rsk_listener)),
2153                 0,  /* non standard timer */
2154                 0, /* open_requests have no inode */
2155                 0,
2156                 req);
2157 }
2158
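     /* Format a full socket as one /proc/net/tcp line.  The "tr" field uses
      * the traditional encoding: 0 none, 1 retransmit/probe timer,
      * 2 keepalive timer, 4 zero-window probe timer.
      */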
2159 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2160 {
2161         int timer_active;
2162         unsigned long timer_expires;
2163         const struct tcp_sock *tp = tcp_sk(sk);
2164         const struct inet_connection_sock *icsk = inet_csk(sk);
2165         const struct inet_sock *inet = inet_sk(sk);
2166         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2167         __be32 dest = inet->inet_daddr;
2168         __be32 src = inet->inet_rcv_saddr;
2169         __u16 destp = ntohs(inet->inet_dport);
2170         __u16 srcp = ntohs(inet->inet_sport);
2171         int rx_queue;
2172         int state;
2173
2174         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2175             icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
2176             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2177                 timer_active    = 1;
2178                 timer_expires   = icsk->icsk_timeout;
2179         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2180                 timer_active    = 4;
2181                 timer_expires   = icsk->icsk_timeout;
2182         } else if (timer_pending(&sk->sk_timer)) {
2183                 timer_active    = 2;
2184                 timer_expires   = sk->sk_timer.expires;
2185         } else {
2186                 timer_active    = 0;
2187                 timer_expires = jiffies;
2188         }
2189
2190         state = sk_state_load(sk);
2191         if (state == TCP_LISTEN)
2192                 rx_queue = sk->sk_ack_backlog;
2193         else
2194                 /* Because we don't lock the socket,
2195                  * we might find a transient negative value.
2196                  */
2197                 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2198
2199         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2200                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2201                 i, src, srcp, dest, destp, state,
2202                 tp->write_seq - tp->snd_una,
2203                 rx_queue,
2204                 timer_active,
2205                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2206                 icsk->icsk_retransmits,
2207                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2208                 icsk->icsk_probes_out,
2209                 sock_i_ino(sk),
2210                 atomic_read(&sk->sk_refcnt), sk,
2211                 jiffies_to_clock_t(icsk->icsk_rto),
2212                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2213                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2214                 tp->snd_cwnd,
2215                 state == TCP_LISTEN ?
2216                     fastopenq->max_qlen :
2217                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2218 }
2219
2220 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2221                                struct seq_file *f, int i)
2222 {
2223         long delta = tw->tw_timer.expires - jiffies;
2224         __be32 dest, src;
2225         __u16 destp, srcp;
2226
2227         dest  = tw->tw_daddr;
2228         src   = tw->tw_rcv_saddr;
2229         destp = ntohs(tw->tw_dport);
2230         srcp  = ntohs(tw->tw_sport);
2231
2232         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2233                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2234                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2235                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2236                 atomic_read(&tw->tw_refcnt), tw);
2237 }
2238
2239 #define TMPSZ 150
2240
2241 static int tcp4_seq_show(struct seq_file *seq, void *v)
2242 {
2243         struct tcp_iter_state *st;
2244         struct sock *sk = v;
2245
2246         seq_setwidth(seq, TMPSZ - 1);
2247         if (v == SEQ_START_TOKEN) {
2248                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2249                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2250                            "inode");
2251                 goto out;
2252         }
2253         st = seq->private;
2254
2255         if (sk->sk_state == TCP_TIME_WAIT)
2256                 get_timewait4_sock(v, seq, st->num);
2257         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2258                 get_openreq4(v, seq, st->num);
2259         else
2260                 get_tcp4_sock(v, seq, st->num);
2261 out:
2262         seq_pad(seq, '\n');
2263         return 0;
2264 }
2265
2266 static const struct file_operations tcp_afinfo_seq_fops = {
2267         .owner   = THIS_MODULE,
2268         .open    = tcp_seq_open,
2269         .read    = seq_read,
2270         .llseek  = seq_lseek,
2271         .release = seq_release_net
2272 };
2273
2274 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2275         .name           = "tcp",
2276         .family         = AF_INET,
2277         .seq_fops       = &tcp_afinfo_seq_fops,
2278         .seq_ops        = {
2279                 .show           = tcp4_seq_show,
2280         },
2281 };
2282
2283 static int __net_init tcp4_proc_init_net(struct net *net)
2284 {
2285         return tcp_proc_register(net, &tcp4_seq_afinfo);
2286 }
2287
2288 static void __net_exit tcp4_proc_exit_net(struct net *net)
2289 {
2290         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2291 }
2292
2293 static struct pernet_operations tcp4_net_ops = {
2294         .init = tcp4_proc_init_net,
2295         .exit = tcp4_proc_exit_net,
2296 };
2297
2298 int __init tcp4_proc_init(void)
2299 {
2300         return register_pernet_subsys(&tcp4_net_ops);
2301 }
2302
2303 void tcp4_proc_exit(void)
2304 {
2305         unregister_pernet_subsys(&tcp4_net_ops);
2306 }
2307 #endif /* CONFIG_PROC_FS */
2308
2309 struct proto tcp_prot = {
2310         .name                   = "TCP",
2311         .owner                  = THIS_MODULE,
2312         .close                  = tcp_close,
2313         .connect                = tcp_v4_connect,
2314         .disconnect             = tcp_disconnect,
2315         .accept                 = inet_csk_accept,
2316         .ioctl                  = tcp_ioctl,
2317         .init                   = tcp_v4_init_sock,
2318         .destroy                = tcp_v4_destroy_sock,
2319         .shutdown               = tcp_shutdown,
2320         .setsockopt             = tcp_setsockopt,
2321         .getsockopt             = tcp_getsockopt,
2322         .recvmsg                = tcp_recvmsg,
2323         .sendmsg                = tcp_sendmsg,
2324         .sendpage               = tcp_sendpage,
2325         .backlog_rcv            = tcp_v4_do_rcv,
2326         .release_cb             = tcp_release_cb,
2327         .hash                   = inet_hash,
2328         .unhash                 = inet_unhash,
2329         .get_port               = inet_csk_get_port,
2330         .enter_memory_pressure  = tcp_enter_memory_pressure,
2331         .stream_memory_free     = tcp_stream_memory_free,
2332         .sockets_allocated      = &tcp_sockets_allocated,
2333         .orphan_count           = &tcp_orphan_count,
2334         .memory_allocated       = &tcp_memory_allocated,
2335         .memory_pressure        = &tcp_memory_pressure,
2336         .sysctl_mem             = sysctl_tcp_mem,
2337         .sysctl_wmem            = sysctl_tcp_wmem,
2338         .sysctl_rmem            = sysctl_tcp_rmem,
2339         .max_header             = MAX_TCP_HEADER,
2340         .obj_size               = sizeof(struct tcp_sock),
2341         .slab_flags             = SLAB_DESTROY_BY_RCU,
2342         .twsk_prot              = &tcp_timewait_sock_ops,
2343         .rsk_prot               = &tcp_request_sock_ops,
2344         .h.hashinfo             = &tcp_hashinfo,
2345         .no_autobind            = true,
2346 #ifdef CONFIG_COMPAT
2347         .compat_setsockopt      = compat_tcp_setsockopt,
2348         .compat_getsockopt      = compat_tcp_getsockopt,
2349 #endif
2350         .diag_destroy           = tcp_abort,
2351 };
2352 EXPORT_SYMBOL(tcp_prot);
2353
2354 static void __net_exit tcp_sk_exit(struct net *net)
2355 {
2356         int cpu;
2357
2358         for_each_possible_cpu(cpu)
2359                 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2360         free_percpu(net->ipv4.tcp_sk);
2361 }
2362
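     /* Per-namespace setup: one control socket per possible CPU (used to
      * send RSTs and ACKs that are not tied to a particular socket), plus
      * the namespace's TCP sysctl defaults.
      */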
2363 static int __net_init tcp_sk_init(struct net *net)
2364 {
2365         int res, cpu;
2366
2367         net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2368         if (!net->ipv4.tcp_sk)
2369                 return -ENOMEM;
2370
2371         for_each_possible_cpu(cpu) {
2372                 struct sock *sk;
2373
2374                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2375                                            IPPROTO_TCP, net);
2376                 if (res)
2377                         goto fail;
2378                 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2379         }
2380
2381         net->ipv4.sysctl_tcp_ecn = 2;
2382         net->ipv4.sysctl_tcp_ecn_fallback = 1;
2383
2384         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2385         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2386         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2387
2388         net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2389         net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2390         net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2391
2392         return 0;
2393 fail:
2394         tcp_sk_exit(net);
2395
2396         return res;
2397 }
2398
2399 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2400 {
2401         inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2402 }
2403
2404 static struct pernet_operations __net_initdata tcp_sk_ops = {
2405        .init       = tcp_sk_init,
2406        .exit       = tcp_sk_exit,
2407        .exit_batch = tcp_sk_exit_batch,
2408 };
2409
2410 void __init tcp_v4_init(void)
2411 {
2412         inet_hashinfo_init(&tcp_hashinfo);
2413         if (register_pernet_subsys(&tcp_sk_ops))
2414                 panic("Failed to create the TCP control socket.\n");
2415 }